24 static constexpr int nv = 4;
25 static constexpr int dim = 2;
26 static constexpr int ddm2 = (
dim*(
dim+1))/2;
27 static constexpr int nd1d = ORDER + 1;
28 static constexpr int ndof_per_el = nd1d*nd1d;
29 static constexpr int nnz_per_row = 9;
30 static constexpr int sz_local_mat = nv*nv;
32 const bool const_mq =
c1.
Size() == 1;
33 const auto MQ = const_mq
36 const bool const_dq =
c2.
Size() == 1;
37 const auto DQ = const_dq
52 MFEM_FOREACH_THREAD(iy,y,nd1d)
54 MFEM_FOREACH_THREAD(ix,x,nd1d)
56 for (
int j=0; j<nnz_per_row; ++j)
58 V(j,ix,iy,iel_ho) = 0.0;
65 MFEM_FOREACH_THREAD(ky,y,ORDER)
67 MFEM_FOREACH_THREAD(kx,x,ORDER)
70 real_t local_mat_[sz_local_mat];
74 for (
int i=0; i<sz_local_mat; ++i) { local_mat[i] = 0.0; }
78 for (
int iqx=0; iqx<2; ++iqx)
80 for (
int iqy=0; iqy<2; ++iqy)
82 const real_t mq = const_mq ? MQ(0,0,0) : MQ(kx+iqx, ky+iqy, iel_ho);
83 const real_t dq = const_dq ? DQ(0,0,0) : DQ(kx+iqx, ky+iqy, iel_ho);
84 for (
int jy=0; jy<2; ++jy)
86 const real_t bjy = (jy == iqy) ? 1.0 : 0.0;
87 const real_t gjy = (jy == 0) ? -1.0 : 1.0;
88 for (
int jx=0; jx<2; ++jx)
90 const real_t bjx = (jx == iqx) ? 1.0 : 0.0;
91 const real_t gjx = (jx == 0) ? -1.0 : 1.0;
93 const real_t djx = gjx*bjy;
94 const real_t djy = bjx*gjy;
96 int jj_loc = jx + 2*jy;
98 for (
int iy=0; iy<2; ++iy)
100 const real_t biy = (iy == iqy) ? 1.0 : 0.0;
101 const real_t giy = (iy == 0) ? -1.0 : 1.0;
102 for (
int ix=0; ix<2; ++ix)
104 const real_t bix = (ix == iqx) ? 1.0 : 0.0;
105 const real_t gix = (ix == 0) ? -1.0 : 1.0;
107 const real_t dix = gix*biy;
108 const real_t diy = bix*giy;
110 int ii_loc = ix + 2*iy;
114 if (jj_loc > ii_loc) {
continue; }
117 val += dix*djx*Q(0,iqy,iqx);
118 val += (dix*djy + diy*djx)*Q(1,iqy,iqx);
119 val += diy*djy*Q(2,iqy,iqx);
122 val += mq*bix*biy*bjx*bjy*Q(3,iqy,iqx);
124 local_mat(ii_loc, jj_loc) += val;
134 for (
int ii_loc=0; ii_loc<nv; ++ii_loc)
136 const int ix = ii_loc%2;
137 const int iy = ii_loc/2;
138 for (
int jj_loc=0; jj_loc<nv; ++jj_loc)
140 const int jx = jj_loc%2;
141 const int jy = jj_loc/2;
142 const int jj_off = (jx-ix+1) + 3*(jy-iy+1);
145 if (jj_loc <= ii_loc)
147 AtomicAdd(V(jj_off, ix+kx, iy+ky, iel_ho), local_mat(ii_loc, jj_loc));
151 AtomicAdd(V(jj_off, ix+kx, iy+ky, iel_ho), local_mat(jj_loc, ii_loc));
162 for (
int iy=0; iy<nd1d; ++iy)
164 const int jy_begin = (iy > 0) ? iy - 1 : 0;
165 const int jy_end = (iy < ORDER) ? iy + 1 : ORDER;
166 for (
int ix=0; ix<nd1d; ++ix)
168 const int jx_begin = (ix > 0) ? ix - 1 : 0;
169 const int jx_end = (ix < ORDER) ? ix + 1 : ORDER;
170 const int ii_el = ix + nd1d*iy;
171 for (
int jy=jy_begin; jy<=jy_end; ++jy)
173 for (
int jx=jx_begin; jx<=jx_end; ++jx)
175 const int jj_off = (jx-ix+1) + 3*(jy-iy+1);
176 const int jj_el = jx + nd1d*jy;
177 map(jj_off, ii_el) = jj_el;
188 static constexpr int nv = 8;
189 static constexpr int dim = 3;
190 static constexpr int ddm2 = (
dim*(
dim+1))/2;
191 static constexpr int nd1d = ORDER + 1;
192 static constexpr int ndof_per_el = nd1d*nd1d*nd1d;
193 static constexpr int nnz_per_row = 27;
194 static constexpr int sz_grad_A = 3*3*2*2*2*2;
195 static constexpr int sz_grad_B = sz_grad_A*2;
196 static constexpr int sz_mass_A = 2*2*2*2;
197 static constexpr int sz_mass_B = sz_mass_A*2;
198 static constexpr int sz_local_mat = nv*nv;
200 const bool const_mq =
c1.
Size() == 1;
201 const auto MQ = const_mq
204 const bool const_dq =
c2.
Size() == 1;
205 const auto DQ = const_dq
216 [=] MFEM_HOST_DEVICE (
int iel_ho)
221 MFEM_FOREACH_THREAD(iz,z,nd1d)
223 MFEM_FOREACH_THREAD(iy,y,nd1d)
225 MFEM_FOREACH_THREAD(ix,x,nd1d)
227 MFEM_UNROLL(nnz_per_row)
228 for (
int j=0; j<nnz_per_row; ++j)
230 V(j,ix,iy,iz,iel_ho) = 0.0;
238 MFEM_FOREACH_THREAD(kz,z,ORDER)
240 MFEM_FOREACH_THREAD(ky,y,ORDER)
242 MFEM_FOREACH_THREAD(kx,x,ORDER)
245 real_t grad_A_[sz_grad_A];
246 real_t grad_B_[sz_grad_B];
247 real_t mass_A_[sz_mass_A];
248 real_t mass_B_[sz_mass_B];
249 real_t local_mat_[sz_local_mat];
259 for (
int i=0; i<sz_local_mat; ++i) { local_mat[i] = 0.0; }
263 for (
int i=0; i<sz_grad_A; ++i) { grad_A[i] = 0.0; }
264 for (
int i=0; i<sz_grad_B; ++i) { grad_B[i] = 0.0; }
266 for (
int i=0; i<sz_mass_A; ++i) { mass_A[i] = 0.0; }
267 for (
int i=0; i<sz_mass_B; ++i) { mass_B[i] = 0.0; }
269 real_t vx[8], vy[8], vz[8];
273 for (
int iqz=0; iqz<2; ++iqz)
276 for (
int iqy=0; iqy<2; ++iqy)
279 for (
int iqx=0; iqx<2; ++iqx)
292 const real_t w_detJ = w/detJ;
299 Q(0,iqz,iqy,iqx) = w_detJ*(A(0,0)*A(0,0)+A(0,1)*A(0,1)+A(0,2)*A(0,2));
300 Q(1,iqz,iqy,iqx) = w_detJ*(A(0,0)*A(1,0)+A(0,1)*A(1,1)+A(0,2)*A(1,2));
301 Q(2,iqz,iqy,iqx) = w_detJ*(A(0,0)*A(2,0)+A(0,1)*A(2,1)+A(0,2)*A(2,2));
302 Q(3,iqz,iqy,iqx) = w_detJ*(A(1,0)*A(1,0)+A(1,1)*A(1,1)+A(1,2)*A(1,2));
303 Q(4,iqz,iqy,iqx) = w_detJ*(A(1,0)*A(2,0)+A(1,1)*A(2,1)+A(1,2)*A(2,2));
304 Q(5,iqz,iqy,iqx) = w_detJ*(A(2,0)*A(2,0)+A(2,1)*A(2,1)+A(2,2)*A(2,2));
305 Q(6,iqz,iqy,iqx) = w*detJ;
311 for (
int iqx=0; iqx<2; ++iqx)
314 for (
int jz=0; jz<2; ++jz)
319 for (
int iz=jz; iz<2; ++iz)
322 for (
int iqy=0; iqy<2; ++iqy)
325 for (
int iqz=0; iqz<2; ++iqz)
327 const real_t mq = const_mq ? MQ(0,0,0,0) : MQ(kx+iqx, ky+iqy, kz+iqz, iel_ho);
328 const real_t dq = const_dq ? DQ(0,0,0,0) : DQ(kx+iqx, ky+iqy, kz+iqz, iel_ho);
330 const real_t biz = (iz == iqz) ? 1.0 : 0.0;
331 const real_t giz = (iz == 0) ? -1.0 : 1.0;
333 const real_t bjz = (jz == iqz) ? 1.0 : 0.0;
334 const real_t gjz = (jz == 0) ? -1.0 : 1.0;
336 const real_t J11 = Q(0,iqz,iqy,iqx);
337 const real_t J21 = Q(1,iqz,iqy,iqx);
338 const real_t J31 = Q(2,iqz,iqy,iqx);
340 const real_t J22 = Q(3,iqz,iqy,iqx);
341 const real_t J32 = Q(4,iqz,iqy,iqx);
344 const real_t J33 = Q(5,iqz,iqy,iqx);
346 grad_A(0,0,iqy,iz,jz,iqx) += dq*J11*biz*bjz;
347 grad_A(1,0,iqy,iz,jz,iqx) += dq*J21*biz*bjz;
348 grad_A(2,0,iqy,iz,jz,iqx) += dq*J31*giz*bjz;
349 grad_A(0,1,iqy,iz,jz,iqx) += dq*J12*biz*bjz;
350 grad_A(1,1,iqy,iz,jz,iqx) += dq*J22*biz*bjz;
351 grad_A(2,1,iqy,iz,jz,iqx) += dq*J32*giz*bjz;
352 grad_A(0,2,iqy,iz,jz,iqx) += dq*J13*biz*gjz;
353 grad_A(1,2,iqy,iz,jz,iqx) += dq*J23*biz*gjz;
354 grad_A(2,2,iqy,iz,jz,iqx) += dq*J33*giz*gjz;
356 real_t wdetJ = Q(6,iqz,iqy,iqx);
357 mass_A(iqy,iz,jz,iqx) += mq*wdetJ*biz*bjz;
360 for (
int jy=0; jy<2; ++jy)
363 for (
int iy=0; iy<2; ++iy)
365 const real_t biy = (iy == iqy) ? 1.0 : 0.0;
366 const real_t giy = (iy == 0) ? -1.0 : 1.0;
368 const real_t bjy = (jy == iqy) ? 1.0 : 0.0;
369 const real_t gjy = (jy == 0) ? -1.0 : 1.0;
371 grad_B(0,0,iy,jy,iz,jz,iqx) += biy*bjy*grad_A(0,0,iqy,iz,jz,iqx);
372 grad_B(1,0,iy,jy,iz,jz,iqx) += giy*bjy*grad_A(1,0,iqy,iz,jz,iqx);
373 grad_B(2,0,iy,jy,iz,jz,iqx) += biy*bjy*grad_A(2,0,iqy,iz,jz,iqx);
374 grad_B(0,1,iy,jy,iz,jz,iqx) += biy*gjy*grad_A(0,1,iqy,iz,jz,iqx);
375 grad_B(1,1,iy,jy,iz,jz,iqx) += giy*gjy*grad_A(1,1,iqy,iz,jz,iqx);
376 grad_B(2,1,iy,jy,iz,jz,iqx) += biy*gjy*grad_A(2,1,iqy,iz,jz,iqx);
377 grad_B(0,2,iy,jy,iz,jz,iqx) += biy*bjy*grad_A(0,2,iqy,iz,jz,iqx);
378 grad_B(1,2,iy,jy,iz,jz,iqx) += giy*bjy*grad_A(1,2,iqy,iz,jz,iqx);
379 grad_B(2,2,iy,jy,iz,jz,iqx) += biy*bjy*grad_A(2,2,iqy,iz,jz,iqx);
381 mass_B(iy,jy,iz,jz,iqx) += biy*bjy*mass_A(iqy,iz,jz,iqx);
386 for (
int jy=0; jy<2; ++jy)
389 for (
int jx=0; jx<2; ++jx)
392 for (
int iy=0; iy<2; ++iy)
395 for (
int ix=0; ix<2; ++ix)
397 const real_t bix = (ix == iqx) ? 1.0 : 0.0;
398 const real_t gix = (ix == 0) ? -1.0 : 1.0;
400 const real_t bjx = (jx == iqx) ? 1.0 : 0.0;
401 const real_t gjx = (jx == 0) ? -1.0 : 1.0;
403 int ii_loc = ix + 2*iy + 4*iz;
404 int jj_loc = jx + 2*jy + 4*jz;
408 if (jj_loc > ii_loc) {
continue; }
411 val += gix*gjx*grad_B(0,0,iy,jy,iz,jz,iqx);
412 val += bix*gjx*grad_B(1,0,iy,jy,iz,jz,iqx);
413 val += bix*gjx*grad_B(2,0,iy,jy,iz,jz,iqx);
414 val += gix*bjx*grad_B(0,1,iy,jy,iz,jz,iqx);
415 val += bix*bjx*grad_B(1,1,iy,jy,iz,jz,iqx);
416 val += bix*bjx*grad_B(2,1,iy,jy,iz,jz,iqx);
417 val += gix*bjx*grad_B(0,2,iy,jy,iz,jz,iqx);
418 val += bix*bjx*grad_B(2,2,iy,jy,iz,jz,iqx);
419 val += bix*bjx*grad_B(1,2,iy,jy,iz,jz,iqx);
421 val += bix*bjx*mass_B(iy,jy,iz,jz,iqx);
423 local_mat(ii_loc, jj_loc) += val;
435 for (
int ii_loc=0; ii_loc<nv; ++ii_loc)
437 const int ix = ii_loc%2;
438 const int iy = (ii_loc/2)%2;
439 const int iz = ii_loc/2/2;
441 for (
int jj_loc=0; jj_loc<nv; ++jj_loc)
443 const int jx = jj_loc%2;
444 const int jy = (jj_loc/2)%2;
445 const int jz = jj_loc/2/2;
446 const int jj_off = (jx-ix+1) + 3*(jy-iy+1) + 9*(jz-iz+1);
448 if (jj_loc <= ii_loc)
450 AtomicAdd(V(jj_off, ix+kx, iy+ky, iz+kz, iel_ho), local_mat(ii_loc, jj_loc));
454 AtomicAdd(V(jj_off, ix+kx, iy+ky, iz+kz, iel_ho), local_mat(jj_loc, ii_loc));
466 for (
int iz=0; iz<nd1d; ++iz)
468 const int jz_begin = (iz > 0) ? iz - 1 : 0;
469 const int jz_end = (iz < ORDER) ? iz + 1 : ORDER;
470 for (
int iy=0; iy<nd1d; ++iy)
472 const int jy_begin = (iy > 0) ? iy - 1 : 0;
473 const int jy_end = (iy < ORDER) ? iy + 1 : ORDER;
474 for (
int ix=0; ix<nd1d; ++ix)
476 const int jx_begin = (ix > 0) ? ix - 1 : 0;
477 const int jx_end = (ix < ORDER) ? ix + 1 : ORDER;
479 const int ii_el = ix + nd1d*(iy + nd1d*iz);
481 for (
int jz=jz_begin; jz<=jz_end; ++jz)
483 for (
int jy=jy_begin; jy<=jy_end; ++jy)
485 for (
int jx=jx_begin; jx<=jx_end; ++jx)
487 const int jj_off = (jx-ix+1) + 3*(jy-iy+1) + 9*(jz-iz+1);
488 const int jj_el = jx + nd1d*(jy + nd1d*jz);
489 map(jj_off, ii_el) = jj_el;