Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
225 changes: 217 additions & 8 deletions src/apps/EDGE3D.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -20,13 +20,13 @@
/// double z[NB] = {z0[i],z1[i],z2[i],z3[i],z4[i],z5[i],z6[i],z7[i]};
/// double edge_matrix[EB][EB];
///
// Get integration points and weights
/// // Get integration points and weights
/// double qpts_1d[MAX_QUAD_ORDER];
/// double wgts_1d[MAX_QUAD_ORDER];
///
/// get_quadrature_rule(quad_type, quad_order, qpts_1d, wgts_1d);
///
// Compute cell centered Jacobian
/// // Compute cell centered Jacobian
/// const double jxx_cc = Jxx(x, y, z, 0.25, 0.25, 0.25, 0.25);
/// const double jxy_cc = Jxy(x, y, z, 0.25, 0.25, 0.25, 0.25);
/// const double jxz_cc = Jxz(x, y, z, 0.25, 0.25, 0.25, 0.25);
Expand All @@ -37,20 +37,20 @@
/// const double jzy_cc = Jzy(x, y, z, 0.25, 0.25, 0.25, 0.25);
/// const double jzz_cc = Jzz(x, y, z, 0.25, 0.25, 0.25, 0.25);
///
// Compute cell centered Jacobian determinant
/// // Compute cell centered Jacobian determinant
/// const double detj_cc = compute_detj(
/// jxx_cc, jxy_cc, jxz_cc,
/// jyx_cc, jyy_cc, jyz_cc,
/// jzx_cc, jzy_cc, jzz_cc);
///
// Initialize the stiffness matrix
/// // Initialize the stiffness matrix
/// for (int m = 0; m < EB; m++) {
/// for (int p = m; p < EB; p++) {
/// matrix[m][p] = 0.0;
/// }
/// }
///
// Compute values at each quadrature point
/// // Compute values at each quadrature point
/// for ( int i = 0; i < quad_order; i++ ) {
///
/// const double xloc = qpts_1d[i];
Expand Down Expand Up @@ -80,7 +80,7 @@
/// double dbasisy[EB] = {0};
/// curl_edgebasis_y(dbasisy, tmpy, yloc);
///
// Differeniate basis with respect to z at this quadrature point
/// // Differeniate basis with respect to z at this quadrature point
///
/// for ( int k = 0; k < quad_order; k++ ) {
///
Expand Down Expand Up @@ -158,14 +158,14 @@
/// dbasisx, dbasisy, dbasisz,
/// tdbasisx, tdbasisy, tdbasisz);
///
// the inner product: alpha*<w_i, w_j>
/// // the inner product: alpha*<w_i, w_j>
/// inner_product(
/// detjwgts*alpha,
/// tebasisx, tebasisy, tebasisz,
/// tebasisx, tebasisy, tebasisz,
/// matrix, true);
///
// the inner product: beta*<Curl(w_i), Curl(w_j)>
/// // the inner product: beta*<Curl(w_i), Curl(w_j)>
/// inner_product(
/// detjwgts*beta,
/// tdbasisx, tdbasisy, tdbasisz,
Expand Down Expand Up @@ -360,6 +360,196 @@ RAJA_INLINE void edge_MpSmatrix(
}
}

RAJA_HOST_DEVICE
RAJA_INLINE void symmetric_edge_MpSmatrix(
const rajaperf::Real_type (&x)[NB],
const rajaperf::Real_type (&y)[NB],
const rajaperf::Real_type (&z)[NB],
rajaperf::Real_type alpha,
rajaperf::Real_type beta,
const rajaperf::Real_type detj_tol,
const rajaperf::Int_type quad_type,
const rajaperf::Int_type quad_order,
rajaperf::Real_type (&matrix)[EB][EB])
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

There is still a full matrix?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The interface that this is used for requires having a full matrix. However, when we we are computing the work at each quadrature point, we are using a symmetric matrix.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I can follow up with a study on how beneficial it would be to never create the full matrix. If that is a major impact, that may be enough incentive to rewrite how things are done in the ultimate use case.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Did a quick study measuring the performance of an impl that has no 12x12 matrices. Its indistinguishable from the performance seen in this MR.

{
// Get integration points and weights
rajaperf::Real_type qpts_1d[MAX_QUAD_ORDER];
rajaperf::Real_type wgts_1d[MAX_QUAD_ORDER];

get_quadrature_rule(quad_type, quad_order, qpts_1d, wgts_1d);

// Compute cell centered Jacobian
const rajaperf::Real_type jxx_cc = Jxx(x, y, z, 0.25, 0.25, 0.25, 0.25);
const rajaperf::Real_type jxy_cc = Jxy(x, y, z, 0.25, 0.25, 0.25, 0.25);
const rajaperf::Real_type jxz_cc = Jxz(x, y, z, 0.25, 0.25, 0.25, 0.25);
const rajaperf::Real_type jyx_cc = Jyx(x, y, z, 0.25, 0.25, 0.25, 0.25);
const rajaperf::Real_type jyy_cc = Jyy(x, y, z, 0.25, 0.25, 0.25, 0.25);
const rajaperf::Real_type jyz_cc = Jyz(x, y, z, 0.25, 0.25, 0.25, 0.25);
const rajaperf::Real_type jzx_cc = Jzx(x, y, z, 0.25, 0.25, 0.25, 0.25);
const rajaperf::Real_type jzy_cc = Jzy(x, y, z, 0.25, 0.25, 0.25, 0.25);
const rajaperf::Real_type jzz_cc = Jzz(x, y, z, 0.25, 0.25, 0.25, 0.25);

// Compute cell centered Jacobian determinant
const rajaperf::Real_type detj_cc = compute_detj(
jxx_cc, jxy_cc, jxz_cc,
jyx_cc, jyy_cc, jyz_cc,
jzx_cc, jzy_cc, jzz_cc);

// Initialize the stiffness matrix
for (rajaperf::Int_type m = 0; m < EB; m++) {
for (rajaperf::Int_type p = m; p < EB; p++) {
matrix[m][p] = 0.0;
}
}

constexpr rajaperf::Int_type symmetric_size = (EB*(EB+1))/2;
rajaperf::Real_type symmetric_matrix[symmetric_size];

// Initialize the symmetric stiffness matrix
for (rajaperf::Int_type m = 0; m < symmetric_size; m++) {
symmetric_matrix[m] = 0.0;
}

// Compute values at each quadrature point
for ( rajaperf::Int_type i = 0; i < quad_order; i++ ) {

const rajaperf::Real_type xloc = qpts_1d[i];
const rajaperf::Real_type tmpx = 1. - xloc;

rajaperf::Real_type dbasisx[EB] = {0};
curl_edgebasis_x(dbasisx, tmpx, xloc);

for ( rajaperf::Int_type j = 0; j < quad_order; j++ ) {

const rajaperf::Real_type yloc = qpts_1d[j];
const rajaperf::Real_type wgtxy = wgts_1d[i]*wgts_1d[j];
const rajaperf::Real_type tmpy = 1. - yloc;

rajaperf::Real_type tmpxy = tmpx*tmpy;
rajaperf::Real_type xyloc = xloc*yloc;
rajaperf::Real_type tmpxyloc = tmpx*yloc;
rajaperf::Real_type xloctmpy = xloc*tmpy;

const rajaperf::Real_type jzx = Jzx(x, y, z, tmpxy, xloctmpy, xyloc, tmpxyloc);
const rajaperf::Real_type jzy = Jzy(x, y, z, tmpxy, xloctmpy, xyloc, tmpxyloc);
const rajaperf::Real_type jzz = Jzz(x, y, z, tmpxy, xloctmpy, xyloc, tmpxyloc);

rajaperf::Real_type ebasisz[EB] = {0};
edgebasis_z(ebasisz, tmpxy, xloctmpy, xyloc, tmpxyloc);

rajaperf::Real_type dbasisy[EB] = {0};
curl_edgebasis_y(dbasisy, tmpy, yloc);

// Differeniate basis with respect to z at this quadrature point

for ( rajaperf::Int_type k = 0; k < quad_order; k++ ) {

const rajaperf::Real_type zloc = qpts_1d[k];
const rajaperf::Real_type wgts = wgtxy*wgts_1d[k];
const rajaperf::Real_type tmpz = 1. - zloc;

const rajaperf::Real_type tmpxz = tmpx*tmpz;
const rajaperf::Real_type tmpyz = tmpy*tmpz;

const rajaperf::Real_type xzloc = xloc*zloc;
const rajaperf::Real_type yzloc = yloc*zloc;

const rajaperf::Real_type tmpyzloc = tmpy*zloc;
const rajaperf::Real_type tmpxzloc = tmpx*zloc;

const rajaperf::Real_type yloctmpz = yloc*tmpz;
const rajaperf::Real_type xloctmpz = xloc*tmpz;

const rajaperf::Real_type jxx = Jxx(x, y, z, tmpyz, yloctmpz, tmpyzloc, yzloc);
const rajaperf::Real_type jxy = Jxy(x, y, z, tmpyz, yloctmpz, tmpyzloc, yzloc);
const rajaperf::Real_type jxz = Jxz(x, y, z, tmpyz, yloctmpz, tmpyzloc, yzloc);
const rajaperf::Real_type jyx = Jyx(x, y, z, tmpxz, xloctmpz, tmpxzloc, xzloc);
const rajaperf::Real_type jyy = Jyy(x, y, z, tmpxz, xloctmpz, tmpxzloc, xzloc);
const rajaperf::Real_type jyz = Jyz(x, y, z, tmpxz, xloctmpz, tmpxzloc, xzloc);

rajaperf::Real_type jinvxx, jinvxy, jinvxz,
jinvyx, jinvyy, jinvyz,
jinvzx, jinvzy, jinvzz,
detj_unfixed, detj, abs_detj, invdetj;

jacobian_inv(
jxx, jxy, jxz,
jyx, jyy, jyz,
jzx, jzy, jzz,
detj_cc, detj_tol,
jinvxx, jinvxy, jinvxz,
jinvyx, jinvyy, jinvyz,
jinvzx, jinvzy, jinvzz,
detj_unfixed, detj, abs_detj, invdetj);

const rajaperf::Real_type detjwgts = wgts*abs_detj;

rajaperf::Real_type ebasisx[EB] = {0};
edgebasis_x(ebasisx, tmpyz, yloctmpz, tmpyzloc, yzloc);

rajaperf::Real_type ebasisy[EB] = {0};
edgebasis_y(ebasisy, tmpxz, xloctmpz, tmpxzloc, xzloc);

rajaperf::Real_type dbasisz[EB] = {0};
curl_edgebasis_z(dbasisz, tmpz, zloc);

const rajaperf::Real_type inv_abs_detj = 1./(abs_detj+ptiny);

rajaperf::Real_type tebasisx[EB] = {0};
rajaperf::Real_type tebasisy[EB] = {0};
rajaperf::Real_type tebasisz[EB] = {0};

transform_edge_basis(
jinvxx, jinvxy, jinvxz,
jinvyx, jinvyy, jinvyz,
jinvzx, jinvzy, jinvzz,
ebasisx, ebasisy, ebasisz,
tebasisx, tebasisy, tebasisz);

rajaperf::Real_type tdbasisx[EB] = {0};
rajaperf::Real_type tdbasisy[EB] = {0};
rajaperf::Real_type tdbasisz[EB] = {0};

transform_curl_edge_basis(
jxx, jxy, jxz,
jyx, jyy, jyz,
jzx, jzy, jzz,
inv_abs_detj,
dbasisx, dbasisy, dbasisz,
tdbasisx, tdbasisy, tdbasisz);

// the inner product: alpha*<w_i, w_j>
inner_product(
detjwgts*alpha,
tebasisx, tebasisy, tebasisz,
tebasisx, tebasisy, tebasisz,
symmetric_matrix);

// the inner product: beta*<Curl(w_i), Curl(w_j)>
inner_product(
detjwgts*beta,
tdbasisx, tdbasisy, tdbasisz,
tdbasisx, tdbasisy, tdbasisz,
symmetric_matrix);
}
}
}

// write back to matrix
rajaperf::Int_type offset = 0;
for (rajaperf::Int_type p = 0; p < EB; p++) {
// off-diagonal values
for (rajaperf::Int_type m = p + 1; m < EB; m++) {
auto value = symmetric_matrix[offset + m - p];
matrix[p][m] = value;
matrix[m][p] = value;
}
// diagonal value
matrix[p][p] = symmetric_matrix[offset];
offset += (EB-p);
}
}

#define EDGE3D_DATA_SETUP \
Real_ptr x = m_x; \
Real_ptr y = m_y; \
Expand All @@ -374,7 +564,24 @@ RAJA_INLINE void edge_MpSmatrix(
NDPTRSET(m_domain->jp, m_domain->kp, y,y0,y1,y2,y3,y4,y5,y6,y7) ; \
NDPTRSET(m_domain->jp, m_domain->kp, z,z0,z1,z2,z3,z4,z5,z6,z7) ;

#if 1 // Symmetric implementation
#define EDGE3D_BODY \
rajaperf::Real_type X[NB] = {x0[i],x1[i],x2[i],x3[i],x4[i],x5[i],x6[i],x7[i]};\
rajaperf::Real_type Y[NB] = {y0[i],y1[i],y2[i],y3[i],y4[i],y5[i],y6[i],y7[i]};\
rajaperf::Real_type Z[NB] = {z0[i],z1[i],z2[i],z3[i],z4[i],z5[i],z6[i],z7[i]};\
rajaperf::Real_type edge_matrix[EB][EB];\
symmetric_edge_MpSmatrix(X, Y, Z, 1.0, 1.0, 0.0, 1.0, NQ_1D, edge_matrix);\
rajaperf::Real_type local_sum = 0.0;\
for (rajaperf::Int_type m = 0; m < EB; m++) {\
rajaperf::Real_type check = 0.0;\
for (rajaperf::Int_type p = 0; p < EB; p++) {\
check += edge_matrix[m][p];\
}\
local_sum += check;\
}\
sum[i] = local_sum;\

#else // Non-symmetic implementation
rajaperf::Real_type X[NB] = {x0[i],x1[i],x2[i],x3[i],x4[i],x5[i],x6[i],x7[i]};\
rajaperf::Real_type Y[NB] = {y0[i],y1[i],y2[i],y3[i],y4[i],y5[i],y6[i],y7[i]};\
rajaperf::Real_type Z[NB] = {z0[i],z1[i],z2[i],z3[i],z4[i],z5[i],z6[i],z7[i]};\
Expand All @@ -390,6 +597,8 @@ RAJA_INLINE void edge_MpSmatrix(
}\
sum[i] = local_sum;\

#endif

#include "common/KernelBase.hpp"

namespace rajaperf
Expand Down
33 changes: 33 additions & 0 deletions src/apps/mixed_fem_helper.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -309,6 +309,39 @@ constexpr void inner_product(
}
}

template<rajaperf::Int_type M>
RAJA_HOST_DEVICE
constexpr void inner_product(
const rajaperf::Real_type weight,
const rajaperf::Real_type (&basis_1_x)[M],
const rajaperf::Real_type (&basis_1_y)[M],
const rajaperf::Real_type (&basis_1_z)[M],
const rajaperf::Real_type (&basis_2_x)[M],
const rajaperf::Real_type (&basis_2_y)[M],
const rajaperf::Real_type (&basis_2_z)[M],
rajaperf::Real_type (&matrix)[(M*(M+1))/2])
{
// inner product is <basis_2, basis_1>
rajaperf::Int_type offset = 0;

for (rajaperf::Int_type p = 0; p < M; p++) {

const rajaperf::Real_type txi = basis_2_x[p];
const rajaperf::Real_type tyi = basis_2_y[p];
const rajaperf::Real_type tzi = basis_2_z[p];

for (rajaperf::Int_type m = 0; m < (M-p); m++) {

const rajaperf::Real_type txj = basis_1_x[p+m];
const rajaperf::Real_type tyj = basis_1_y[p+m];
const rajaperf::Real_type tzj = basis_1_z[p+m];
matrix[offset + m] += weight*(txi*txj + tyi*tyj + tzi*tzj);
}

offset += (M-p);
}
}

constexpr rajaperf::Int_type flops_bad_zone_algorithm()
{
return 5;
Expand Down
Loading