Open
Description
The second taco lacks one of the optimizations of the first taco, one that is important for QCD-style expressions. The optimization pushes each sum that pertains to only one factor down, so that it wraps just that factor instead of the whole product.
Given the expression tau = z(i) * z(j) * z(k) * theta(i,j) * theta(i,k) (where i, j, and k are all summed over, so tau is a scalar), the unoptimized code generated by the second taco is:
// Unoptimized kernel: computes the scalar tau by evaluating the full
// five-factor product inside the innermost of three nested loops (i, j, k)
// and then adding the partial sums up one loop level at a time.
// NOTE(review): tau_vals, z_vals, theta_vals and the *_dimension values are
// not declared in this snippet — presumably unpacked from the Tensor
// arguments by code that was elided; confirm against the full generated output.
void compute(Tensor tau, Tensor z, Tensor theta) {
double tau_val = 0.0;
double ti_val = 0.0;
for (int32_t i = 0; i < theta1_dimension; i++) {
// Positions for index i. The "0 * dim + i" form reduces to plain i.
int32_t pz1 = 0 * z1_dimension + i;
// ptheta1 and ptheta10 hold the same value (i); one is used for the
// theta(i,j) access and the other for the theta(i,k) access.
int32_t ptheta1 = 0 * theta1_dimension + i;
int32_t ptheta10 = 0 * theta1_dimension + i;
double tj_val = 0.0;
for (int32_t j = 0; j < theta2_dimension; j++) {
int32_t pz10 = 0 * z1_dimension + j;
// Position of theta(i,j): i * theta2_dimension + j.
int32_t ptheta2 = ptheta1 * theta2_dimension + j;
double tk_val = 0.0;
for (int32_t k = 0; k < theta2_dimension; k++) {
int32_t pz11 = 0 * z1_dimension + k;
// Position of theta(i,k): i * theta2_dimension + k.
int32_t ptheta20 = ptheta10 * theta2_dimension + k;
// All five factors are multiplied here, once per (i, j, k) iteration, even
// though z_vals[pz1] depends only on i and z_vals[pz10] * theta_vals[ptheta2]
// depends only on i and j. This is the redundant work the issue is about.
tk_val = tk_val + z_vals[pz1] * z_vals[pz10] * z_vals[pz11] * theta_vals[ptheta2] * theta_vals[ptheta20];
}
// Fold the k-sum into the j-sum.
tj_val = tj_val + tk_val;
}
// Fold the j-sum into the i-sum.
ti_val = ti_val + tj_val;
}
// Store the fully reduced value in the single output slot.
tau_val = ti_val;
tau_vals[0] = tau_val;
}
By contrast, the optimized code generated by the first taco is:
// Optimized kernel: computes the same scalar tau, but the j and k sums are
// two separate sibling loops inside the i loop rather than being nested in
// each other. Each inner loop reduces only the factors that depend on its
// own index, and the two results are multiplied once per i.
// NOTE(review): tau_vals, z_vals, theta_vals and the *_dimension values are
// not declared in this snippet — presumably unpacked from the Tensor
// arguments by code that was elided; confirm against the full generated output.
void compute(Tensor tau, Tensor z, Tensor theta) {
// The output slot is used directly as the accumulator for the i-sum.
tau_vals[0] = 0.0;
// NOTE(review): all three loops are bounded by z1_dimension while the theta
// row stride is theta2_dimension — this looks like it assumes the dimensions
// of z and theta are equal; confirm.
for (int32_t iz = 0; iz < z1_dimension; iz++) {
int32_t pz1 = iz;
// ptheta1 and ptheta10 hold the same value (iz); one is used for the
// theta(i,j) access and the other for the theta(i,k) access.
int32_t ptheta1 = iz;
int32_t ptheta10 = iz;
// tj = sum over j of z(j) * theta(i,j), for the current i.
double tj = 0.0;
for (int32_t jz = 0; jz < z1_dimension; jz++) {
int32_t pz10 = jz;
int32_t ptheta2 = ptheta1 * theta2_dimension + jz;
tj = tj + z_vals[pz10] * theta_vals[ptheta2];
}
// tk = sum over k of z(k) * theta(i,k), for the current i.
double tk = 0.0;
for (int32_t kz = 0; kz < z1_dimension; kz++) {
int32_t pz11 = kz;
int32_t ptheta20 = ptheta10 * theta2_dimension + kz;
tk = tk + z_vals[pz11] * theta_vals[ptheta20];
}
// Combine the i-only factor with the two independent inner sums.
tau_vals[0] = tau_vals[0] + z_vals[pz1] * tj * tk;
}
}