12 #include "../general/forall.hpp"
25 static void OccaPADiffusionSetup2D(
const int D1D,
28 const Array<double> &W,
33 occa::properties props;
34 props[
"defines/D1D"] = D1D;
35 props[
"defines/Q1D"] = Q1D;
39 const occa_id_t id = std::make_pair(D1D,Q1D);
41 if (OccaDiffSetup2D_ker.find(
id) == OccaDiffSetup2D_ker.end())
43 const occa::kernel DiffusionSetup2D =
45 "DiffusionSetup2D", props);
46 OccaDiffSetup2D_ker.emplace(
id, DiffusionSetup2D);
48 OccaDiffSetup2D_ker.at(
id)(NE, o_W, o_J, COEFF, o_op);
51 static void OccaPADiffusionSetup3D(
const int D1D,
54 const Array<double> &W,
59 occa::properties props;
60 props[
"defines/D1D"] = D1D;
61 props[
"defines/Q1D"] = Q1D;
65 const occa_id_t id = std::make_pair(D1D,Q1D);
67 if (OccaDiffSetup3D_ker.find(
id) == OccaDiffSetup3D_ker.end())
69 const occa::kernel DiffusionSetup3D =
71 "DiffusionSetup3D", props);
72 OccaDiffSetup3D_ker.emplace(
id, DiffusionSetup3D);
74 OccaDiffSetup3D_ker.at(
id)(NE, o_W, o_J, COEFF, o_op);
76 #endif // MFEM_USE_OCCA
79 static void PADiffusionSetup2D(
const int Q1D,
81 const Array<double> &w,
86 const int NQ = Q1D*Q1D;
89 auto J =
Reshape(j.Read(), NQ, 2, 2, NE);
90 auto y =
Reshape(op.Write(), NQ, 3, NE);
94 for (
int q = 0; q < NQ; ++q)
96 const double J11 = J(q,0,0,e);
97 const double J21 = J(q,1,0,e);
98 const double J12 = J(q,0,1,e);
99 const double J22 = J(q,1,1,e);
100 const double c_detJ = W[q] * COEFF / ((J11*J22)-(J21*J12));
101 y(q,0,e) = c_detJ * (J12*J12 + J22*J22);
102 y(q,1,e) = -c_detJ * (J12*J11 + J22*J21);
103 y(q,2,e) = c_detJ * (J11*J11 + J21*J21);
109 static void PADiffusionSetup3D(
const int Q1D,
111 const Array<double> &w,
116 const int NQ = Q1D*Q1D*Q1D;
118 auto J =
Reshape(j.Read(), NQ, 3, 3, NE);
119 auto y =
Reshape(op.Write(), NQ, 6, NE);
122 for (
int q = 0; q < NQ; ++q)
124 const double J11 = J(q,0,0,e);
125 const double J21 = J(q,1,0,e);
126 const double J31 = J(q,2,0,e);
127 const double J12 = J(q,0,1,e);
128 const double J22 = J(q,1,1,e);
129 const double J32 = J(q,2,1,e);
130 const double J13 = J(q,0,2,e);
131 const double J23 = J(q,1,2,e);
132 const double J33 = J(q,2,2,e);
133 const double detJ = J11 * (J22 * J33 - J32 * J23) -
134 J21 * (J12 * J33 - J32 * J13) +
135 J31 * (J12 * J23 - J22 * J13);
136 const double c_detJ = W[q] * COEFF / detJ;
138 const double A11 = (J22 * J33) - (J23 * J32);
139 const double A12 = (J32 * J13) - (J12 * J33);
140 const double A13 = (J12 * J23) - (J22 * J13);
141 const double A21 = (J31 * J23) - (J21 * J33);
142 const double A22 = (J11 * J33) - (J13 * J31);
143 const double A23 = (J21 * J13) - (J11 * J23);
144 const double A31 = (J21 * J32) - (J31 * J22);
145 const double A32 = (J31 * J12) - (J11 * J32);
146 const double A33 = (J11 * J22) - (J12 * J21);
148 y(q,0,e) = c_detJ * (A11*A11 + A12*A12 + A13*A13);
149 y(q,1,e) = c_detJ * (A11*A21 + A12*A22 + A13*A23);
150 y(q,2,e) = c_detJ * (A11*A31 + A12*A32 + A13*A33);
151 y(q,3,e) = c_detJ * (A21*A21 + A22*A22 + A23*A23);
152 y(q,4,e) = c_detJ * (A21*A31 + A22*A32 + A23*A33);
153 y(q,5,e) = c_detJ * (A31*A31 + A32*A32 + A33*A33);
158 static void PADiffusionSetup(
const int dim,
162 const Array<double> &W,
167 if (dim == 1) { MFEM_ABORT(
"dim==1 not supported in PADiffusionSetup"); }
173 OccaPADiffusionSetup2D(D1D, Q1D, NE, W, J, COEFF, op);
176 #endif // MFEM_USE_OCCA
177 PADiffusionSetup2D(Q1D, NE, W, J, COEFF, op);
184 OccaPADiffusionSetup3D(D1D, Q1D, NE, W, J, COEFF, op);
187 #endif // MFEM_USE_OCCA
188 PADiffusionSetup3D(Q1D, NE, W, J, COEFF, op);
198 const int dims = el.
GetDim();
199 const int symmDims = (dims * (dims + 1)) / 2;
209 MFEM_VERIFY(cQ != NULL,
"only ConstantCoefficient is supported!");
211 PADiffusionSetup(dim, dofs1D, quad1D, ne, ir->
GetWeights(),
geom->J,
217 static void OccaPADiffusionApply2D(
const int D1D,
228 occa::properties props;
229 props[
"defines/D1D"] = D1D;
230 props[
"defines/Q1D"] = Q1D;
238 const occa_id_t id = std::make_pair(D1D,Q1D);
239 if (!Device::Allows(Backend::OCCA_CUDA))
242 if (OccaDiffApply2D_cpu.find(
id) == OccaDiffApply2D_cpu.end())
244 const occa::kernel DiffusionApply2D_CPU =
246 "DiffusionApply2D_CPU", props);
247 OccaDiffApply2D_cpu.emplace(
id, DiffusionApply2D_CPU);
249 OccaDiffApply2D_cpu.at(
id)(NE, o_B, o_G, o_Bt, o_Gt, o_op, o_x, o_y);
254 if (OccaDiffApply2D_gpu.find(
id) == OccaDiffApply2D_gpu.end())
256 const occa::kernel DiffusionApply2D_GPU =
258 "DiffusionApply2D_GPU", props);
259 OccaDiffApply2D_gpu.emplace(
id, DiffusionApply2D_GPU);
261 OccaDiffApply2D_gpu.at(
id)(NE, o_B, o_G, o_Bt, o_Gt, o_op, o_x, o_y);
266 static void OccaPADiffusionApply3D(
const int D1D,
269 const Array<double> &B,
270 const Array<double> &G,
271 const Array<double> &Bt,
272 const Array<double> &Gt,
277 occa::properties props;
278 props[
"defines/D1D"] = D1D;
279 props[
"defines/Q1D"] = Q1D;
282 const occa::memory o_Bt =
OccaMemoryRead(Bt.GetMemory(), Bt.Size());
283 const occa::memory o_Gt =
OccaMemoryRead(Gt.GetMemory(), Gt.Size());
284 const occa::memory o_op =
OccaMemoryRead(op.GetMemory(), op.Size());
287 const occa_id_t id = std::make_pair(D1D,Q1D);
288 if (!Device::Allows(Backend::OCCA_CUDA))
291 if (OccaDiffApply3D_cpu.find(
id) == OccaDiffApply3D_cpu.end())
293 const occa::kernel DiffusionApply3D_CPU =
295 "DiffusionApply3D_CPU", props);
296 OccaDiffApply3D_cpu.emplace(
id, DiffusionApply3D_CPU);
298 OccaDiffApply3D_cpu.at(
id)(NE, o_B, o_G, o_Bt, o_Gt, o_op, o_x, o_y);
303 if (OccaDiffApply3D_gpu.find(
id) == OccaDiffApply3D_gpu.end())
305 const occa::kernel DiffusionApply3D_GPU =
307 "DiffusionApply3D_GPU", props);
308 OccaDiffApply3D_gpu.emplace(
id, DiffusionApply3D_GPU);
310 OccaDiffApply3D_gpu.at(
id)(NE, o_B, o_G, o_Bt, o_Gt, o_op, o_x, o_y);
313 #endif // MFEM_USE_OCCA
316 template<
int T_D1D = 0,
int T_Q1D = 0>
static
317 void PADiffusionApply2D(
const int NE,
318 const Array<double> &b,
319 const Array<double> &g,
320 const Array<double> &bt,
321 const Array<double> >,
328 const int D1D = T_D1D ? T_D1D : d1d;
329 const int Q1D = T_Q1D ? T_Q1D : q1d;
330 MFEM_VERIFY(D1D <=
MAX_D1D,
"");
331 MFEM_VERIFY(Q1D <=
MAX_Q1D,
"");
332 auto B =
Reshape(b.Read(), Q1D, D1D);
333 auto G =
Reshape(g.Read(), Q1D, D1D);
334 auto Bt =
Reshape(bt.Read(), D1D, Q1D);
335 auto Gt =
Reshape(gt.Read(), D1D, Q1D);
336 auto op =
Reshape(_op.Read(), Q1D*Q1D, 3, NE);
337 auto x =
Reshape(_x.Read(), D1D, D1D, NE);
338 auto y =
Reshape(_y.ReadWrite(), D1D, D1D, NE);
341 const int D1D = T_D1D ? T_D1D : d1d;
342 const int Q1D = T_Q1D ? T_Q1D : q1d;
344 constexpr
int max_D1D = T_D1D ? T_D1D :
MAX_D1D;
345 constexpr
int max_Q1D = T_Q1D ? T_Q1D :
MAX_Q1D;
347 double grad[max_Q1D][max_Q1D][2];
348 for (
int qy = 0; qy < Q1D; ++qy)
350 for (
int qx = 0; qx < Q1D; ++qx)
352 grad[qy][qx][0] = 0.0;
353 grad[qy][qx][1] = 0.0;
356 for (
int dy = 0; dy < D1D; ++dy)
358 double gradX[max_Q1D][2];
359 for (
int qx = 0; qx < Q1D; ++qx)
364 for (
int dx = 0; dx < D1D; ++dx)
366 const double s = x(dx,dy,e);
367 for (
int qx = 0; qx < Q1D; ++qx)
369 gradX[qx][0] += s * B(qx,dx);
370 gradX[qx][1] += s * G(qx,dx);
373 for (
int qy = 0; qy < Q1D; ++qy)
375 const double wy = B(qy,dy);
376 const double wDy = G(qy,dy);
377 for (
int qx = 0; qx < Q1D; ++qx)
379 grad[qy][qx][0] += gradX[qx][1] * wy;
380 grad[qy][qx][1] += gradX[qx][0] * wDy;
385 for (
int qy = 0; qy < Q1D; ++qy)
387 for (
int qx = 0; qx < Q1D; ++qx)
389 const int q = qx + qy * Q1D;
391 const double O11 = op(q,0,e);
392 const double O12 = op(q,1,e);
393 const double O22 = op(q,2,e);
395 const double gradX = grad[qy][qx][0];
396 const double gradY = grad[qy][qx][1];
398 grad[qy][qx][0] = (O11 * gradX) + (O12 * gradY);
399 grad[qy][qx][1] = (O12 * gradX) + (O22 * gradY);
402 for (
int qy = 0; qy < Q1D; ++qy)
404 double gradX[max_D1D][2];
405 for (
int dx = 0; dx < D1D; ++dx)
410 for (
int qx = 0; qx < Q1D; ++qx)
412 const double gX = grad[qy][qx][0];
413 const double gY = grad[qy][qx][1];
414 for (
int dx = 0; dx < D1D; ++dx)
416 const double wx = Bt(dx,qx);
417 const double wDx = Gt(dx,qx);
418 gradX[dx][0] += gX * wDx;
419 gradX[dx][1] += gY * wx;
422 for (
int dy = 0; dy < D1D; ++dy)
424 const double wy = Bt(dy,qy);
425 const double wDy = Gt(dy,qy);
426 for (
int dx = 0; dx < D1D; ++dx)
428 y(dx,dy,e) += ((gradX[dx][0] * wy) + (gradX[dx][1] * wDy));
436 template<
const int T_D1D = 0,
439 static void SmemPADiffusionApply2D(
const int NE,
440 const Array<double> &_b,
441 const Array<double> &_g,
442 const Array<double> &_bt,
443 const Array<double> &_gt,
450 const int D1D = T_D1D ? T_D1D : d1d;
451 const int Q1D = T_Q1D ? T_Q1D : q1d;
452 constexpr
int NBZ = T_NBZ ? T_NBZ : 1;
453 constexpr
int MQ1 = T_Q1D ? T_Q1D :
MAX_Q1D;
454 constexpr
int MD1 = T_D1D ? T_D1D :
MAX_D1D;
455 MFEM_VERIFY(D1D <= MD1,
"");
456 MFEM_VERIFY(Q1D <= MQ1,
"");
457 auto b =
Reshape(_b.Read(), Q1D, D1D);
458 auto g =
Reshape(_g.Read(), Q1D, D1D);
459 auto op =
Reshape(_op.Read(), Q1D*Q1D, 3, NE);
460 auto x =
Reshape(_x.Read(), D1D, D1D, NE);
461 auto y =
Reshape(_y.ReadWrite(), D1D, D1D, NE);
462 MFEM_FORALL_2D(e, NE, Q1D, Q1D, NBZ,
464 const int tidz = MFEM_THREAD_ID(z);
465 const int D1D = T_D1D ? T_D1D : d1d;
466 const int Q1D = T_Q1D ? T_Q1D : q1d;
467 constexpr
int NBZ = T_NBZ ? T_NBZ : 1;
468 constexpr
int MQ1 = T_Q1D ? T_Q1D :
MAX_Q1D;
469 constexpr
int MD1 = T_D1D ? T_D1D :
MAX_D1D;
470 MFEM_SHARED
double sBG[2][MQ1*MD1];
471 double (*B)[MD1] = (double (*)[MD1]) (sBG+0);
472 double (*G)[MD1] = (double (*)[MD1]) (sBG+1);
473 double (*Bt)[MQ1] = (double (*)[MQ1]) (sBG+0);
474 double (*Gt)[MQ1] = (double (*)[MQ1]) (sBG+1);
475 MFEM_SHARED
double Xz[NBZ][MD1][MD1];
476 MFEM_SHARED
double GD[2][NBZ][MD1][MQ1];
477 MFEM_SHARED
double GQ[2][NBZ][MD1][MQ1];
478 double (*X)[MD1] = (double (*)[MD1])(Xz + tidz);
479 double (*DQ0)[MD1] = (double (*)[MD1])(GD[0] + tidz);
480 double (*DQ1)[MD1] = (double (*)[MD1])(GD[1] + tidz);
481 double (*QQ0)[MD1] = (double (*)[MD1])(GQ[0] + tidz);
482 double (*QQ1)[MD1] = (double (*)[MD1])(GQ[1] + tidz);
483 MFEM_FOREACH_THREAD(dy,y,D1D)
485 MFEM_FOREACH_THREAD(dx,x,D1D)
487 X[dy][dx] = x(dx,dy,e);
492 MFEM_FOREACH_THREAD(d,y,D1D)
494 MFEM_FOREACH_THREAD(q,x,Q1D)
502 MFEM_FOREACH_THREAD(dy,y,D1D)
504 MFEM_FOREACH_THREAD(qx,x,Q1D)
508 for (
int dx = 0; dx < D1D; ++dx)
510 const double coords = X[dy][dx];
511 u += B[qx][dx] * coords;
512 v += G[qx][dx] * coords;
519 MFEM_FOREACH_THREAD(qy,y,Q1D)
521 MFEM_FOREACH_THREAD(qx,x,Q1D)
525 for (
int dy = 0; dy < D1D; ++dy)
527 u += DQ1[dy][qx] * B[qy][dy];
528 v += DQ0[dy][qx] * G[qy][dy];
535 MFEM_FOREACH_THREAD(qy,y,Q1D)
537 MFEM_FOREACH_THREAD(qx,x,Q1D)
539 const int q = (qx + ((qy) * Q1D));
540 const double O11 = op(q,0,e);
541 const double O12 = op(q,1,e);
542 const double O22 = op(q,2,e);
543 const double gX = QQ0[qy][qx];
544 const double gY = QQ1[qy][qx];
545 QQ0[qy][qx] = (O11 * gX) + (O12 * gY);
546 QQ1[qy][qx] = (O12 * gX) + (O22 * gY);
552 MFEM_FOREACH_THREAD(d,y,D1D)
554 MFEM_FOREACH_THREAD(q,x,Q1D)
562 MFEM_FOREACH_THREAD(qy,y,Q1D)
564 MFEM_FOREACH_THREAD(dx,x,D1D)
568 for (
int qx = 0; qx < Q1D; ++qx)
570 u += Gt[dx][qx] * QQ0[qy][qx];
571 v += Bt[dx][qx] * QQ1[qy][qx];
578 MFEM_FOREACH_THREAD(dy,y,D1D)
580 MFEM_FOREACH_THREAD(dx,x,D1D)
584 for (
int qy = 0; qy < Q1D; ++qy)
586 u += DQ0[qy][dx] * Bt[dy][qy];
587 v += DQ1[qy][dx] * Gt[dy][qy];
589 y(dx,dy,e) += (u + v);
597 template<
const int T_D1D = 0,
598 const int T_Q1D = 0>
static
599 void PADiffusionApply3D(
const int NE,
600 const Array<double> &b,
601 const Array<double> &g,
602 const Array<double> &bt,
603 const Array<double> >,
607 int d1d = 0,
int q1d = 0)
609 const int D1D = T_D1D ? T_D1D : d1d;
610 const int Q1D = T_Q1D ? T_Q1D : q1d;
611 MFEM_VERIFY(D1D <=
MAX_D1D,
"");
612 MFEM_VERIFY(Q1D <=
MAX_Q1D,
"");
613 auto B =
Reshape(b.Read(), Q1D, D1D);
614 auto G =
Reshape(g.Read(), Q1D, D1D);
615 auto Bt =
Reshape(bt.Read(), D1D, Q1D);
616 auto Gt =
Reshape(gt.Read(), D1D, Q1D);
617 auto op =
Reshape(_op.Read(), Q1D*Q1D*Q1D, 6, NE);
618 auto x =
Reshape(_x.Read(), D1D, D1D, D1D, NE);
619 auto y =
Reshape(_y.ReadWrite(), D1D, D1D, D1D, NE);
622 const int D1D = T_D1D ? T_D1D : d1d;
623 const int Q1D = T_Q1D ? T_Q1D : q1d;
624 constexpr
int max_D1D = T_D1D ? T_D1D :
MAX_D1D;
625 constexpr
int max_Q1D = T_Q1D ? T_Q1D :
MAX_Q1D;
626 double grad[max_Q1D][max_Q1D][max_Q1D][4];
627 for (
int qz = 0; qz < Q1D; ++qz)
629 for (
int qy = 0; qy < Q1D; ++qy)
631 for (
int qx = 0; qx < Q1D; ++qx)
633 grad[qz][qy][qx][0] = 0.0;
634 grad[qz][qy][qx][1] = 0.0;
635 grad[qz][qy][qx][2] = 0.0;
639 for (
int dz = 0; dz < D1D; ++dz)
641 double gradXY[max_Q1D][max_Q1D][4];
642 for (
int qy = 0; qy < Q1D; ++qy)
644 for (
int qx = 0; qx < Q1D; ++qx)
646 gradXY[qy][qx][0] = 0.0;
647 gradXY[qy][qx][1] = 0.0;
648 gradXY[qy][qx][2] = 0.0;
651 for (
int dy = 0; dy < D1D; ++dy)
653 double gradX[max_Q1D][2];
654 for (
int qx = 0; qx < Q1D; ++qx)
659 for (
int dx = 0; dx < D1D; ++dx)
661 const double s = x(dx,dy,dz,e);
662 for (
int qx = 0; qx < Q1D; ++qx)
664 gradX[qx][0] += s * B(qx,dx);
665 gradX[qx][1] += s * G(qx,dx);
668 for (
int qy = 0; qy < Q1D; ++qy)
670 const double wy = B(qy,dy);
671 const double wDy = G(qy,dy);
672 for (
int qx = 0; qx < Q1D; ++qx)
674 const double wx = gradX[qx][0];
675 const double wDx = gradX[qx][1];
676 gradXY[qy][qx][0] += wDx * wy;
677 gradXY[qy][qx][1] += wx * wDy;
678 gradXY[qy][qx][2] += wx * wy;
682 for (
int qz = 0; qz < Q1D; ++qz)
684 const double wz = B(qz,dz);
685 const double wDz = G(qz,dz);
686 for (
int qy = 0; qy < Q1D; ++qy)
688 for (
int qx = 0; qx < Q1D; ++qx)
690 grad[qz][qy][qx][0] += gradXY[qy][qx][0] * wz;
691 grad[qz][qy][qx][1] += gradXY[qy][qx][1] * wz;
692 grad[qz][qy][qx][2] += gradXY[qy][qx][2] * wDz;
698 for (
int qz = 0; qz < Q1D; ++qz)
700 for (
int qy = 0; qy < Q1D; ++qy)
702 for (
int qx = 0; qx < Q1D; ++qx)
704 const int q = qx + (qy + qz * Q1D) * Q1D;
705 const double O11 = op(q,0,e);
706 const double O12 = op(q,1,e);
707 const double O13 = op(q,2,e);
708 const double O22 = op(q,3,e);
709 const double O23 = op(q,4,e);
710 const double O33 = op(q,5,e);
711 const double gradX = grad[qz][qy][qx][0];
712 const double gradY = grad[qz][qy][qx][1];
713 const double gradZ = grad[qz][qy][qx][2];
714 grad[qz][qy][qx][0] = (O11*gradX)+(O12*gradY)+(O13*gradZ);
715 grad[qz][qy][qx][1] = (O12*gradX)+(O22*gradY)+(O23*gradZ);
716 grad[qz][qy][qx][2] = (O13*gradX)+(O23*gradY)+(O33*gradZ);
720 for (
int qz = 0; qz < Q1D; ++qz)
722 double gradXY[max_D1D][max_D1D][4];
723 for (
int dy = 0; dy < D1D; ++dy)
725 for (
int dx = 0; dx < D1D; ++dx)
727 gradXY[dy][dx][0] = 0;
728 gradXY[dy][dx][1] = 0;
729 gradXY[dy][dx][2] = 0;
732 for (
int qy = 0; qy < Q1D; ++qy)
734 double gradX[max_D1D][4];
735 for (
int dx = 0; dx < D1D; ++dx)
741 for (
int qx = 0; qx < Q1D; ++qx)
743 const double gX = grad[qz][qy][qx][0];
744 const double gY = grad[qz][qy][qx][1];
745 const double gZ = grad[qz][qy][qx][2];
746 for (
int dx = 0; dx < D1D; ++dx)
748 const double wx = Bt(dx,qx);
749 const double wDx = Gt(dx,qx);
750 gradX[dx][0] += gX * wDx;
751 gradX[dx][1] += gY * wx;
752 gradX[dx][2] += gZ * wx;
755 for (
int dy = 0; dy < D1D; ++dy)
757 const double wy = Bt(dy,qy);
758 const double wDy = Gt(dy,qy);
759 for (
int dx = 0; dx < D1D; ++dx)
761 gradXY[dy][dx][0] += gradX[dx][0] * wy;
762 gradXY[dy][dx][1] += gradX[dx][1] * wDy;
763 gradXY[dy][dx][2] += gradX[dx][2] * wy;
767 for (
int dz = 0; dz < D1D; ++dz)
769 const double wz = Bt(dz,qz);
770 const double wDz = Gt(dz,qz);
771 for (
int dy = 0; dy < D1D; ++dy)
773 for (
int dx = 0; dx < D1D; ++dx)
776 ((gradXY[dy][dx][0] * wz) +
777 (gradXY[dy][dx][1] * wz) +
778 (gradXY[dy][dx][2] * wDz));
787 template<
const int T_D1D = 0,
789 static void SmemPADiffusionApply3D(
const int NE,
790 const Array<double> &_b,
791 const Array<double> &_g,
792 const Array<double> &_bt,
793 const Array<double> &_gt,
800 const int D1D = T_D1D ? T_D1D : d1d;
801 const int Q1D = T_Q1D ? T_Q1D : q1d;
802 constexpr
int MQ1 = T_Q1D ? T_Q1D :
MAX_Q1D;
803 constexpr
int MD1 = T_D1D ? T_D1D :
MAX_D1D;
804 MFEM_VERIFY(D1D <= MD1,
"");
805 MFEM_VERIFY(Q1D <= MQ1,
"");
806 auto b =
Reshape(_b.Read(), Q1D, D1D);
807 auto g =
Reshape(_g.Read(), Q1D, D1D);
808 auto op =
Reshape(_op.Read(), Q1D*Q1D*Q1D, 6, NE);
809 auto x =
Reshape(_x.Read(), D1D, D1D, D1D, NE);
810 auto y =
Reshape(_y.ReadWrite(), D1D, D1D, D1D, NE);
811 MFEM_FORALL_3D(e, NE, Q1D, Q1D, Q1D,
813 const int tidz = MFEM_THREAD_ID(z);
814 const int D1D = T_D1D ? T_D1D : d1d;
815 const int Q1D = T_Q1D ? T_Q1D : q1d;
816 constexpr
int MQ1 = T_Q1D ? T_Q1D :
MAX_Q1D;
817 constexpr
int MD1 = T_D1D ? T_D1D :
MAX_D1D;
818 constexpr
int MDQ = MQ1 > MD1 ? MQ1 : MD1;
819 MFEM_SHARED
double sBG[2][MQ1*MD1];
820 double (*B)[MD1] = (double (*)[MD1]) (sBG+0);
821 double (*G)[MD1] = (double (*)[MD1]) (sBG+1);
822 double (*Bt)[MQ1] = (double (*)[MQ1]) (sBG+0);
823 double (*Gt)[MQ1] = (double (*)[MQ1]) (sBG+1);
824 MFEM_SHARED
double sm0[3][MDQ*MDQ*MDQ];
825 MFEM_SHARED
double sm1[3][MDQ*MDQ*MDQ];
826 double (*X)[MD1][MD1] = (double (*)[MD1][MD1]) (sm0+2);
827 double (*DDQ0)[MD1][MQ1] = (double (*)[MD1][MQ1]) (sm0+0);
828 double (*DDQ1)[MD1][MQ1] = (double (*)[MD1][MQ1]) (sm0+1);
829 double (*DQQ0)[MQ1][MQ1] = (double (*)[MQ1][MQ1]) (sm1+0);
830 double (*DQQ1)[MQ1][MQ1] = (double (*)[MQ1][MQ1]) (sm1+1);
831 double (*DQQ2)[MQ1][MQ1] = (double (*)[MQ1][MQ1]) (sm1+2);
832 double (*QQQ0)[MQ1][MQ1] = (double (*)[MQ1][MQ1]) (sm0+0);
833 double (*QQQ1)[MQ1][MQ1] = (double (*)[MQ1][MQ1]) (sm0+1);
834 double (*QQQ2)[MQ1][MQ1] = (double (*)[MQ1][MQ1]) (sm0+2);
835 double (*QQD0)[MQ1][MD1] = (double (*)[MQ1][MD1]) (sm1+0);
836 double (*QQD1)[MQ1][MD1] = (double (*)[MQ1][MD1]) (sm1+1);
837 double (*QQD2)[MQ1][MD1] = (double (*)[MQ1][MD1]) (sm1+2);
838 double (*QDD0)[MD1][MD1] = (double (*)[MD1][MD1]) (sm0+0);
839 double (*QDD1)[MD1][MD1] = (double (*)[MD1][MD1]) (sm0+1);
840 double (*QDD2)[MD1][MD1] = (double (*)[MD1][MD1]) (sm0+2);
841 MFEM_FOREACH_THREAD(dz,z,D1D)
843 MFEM_FOREACH_THREAD(dy,y,D1D)
845 MFEM_FOREACH_THREAD(dx,x,D1D)
847 X[dz][dy][dx] = x(dx,dy,dz,e);
853 MFEM_FOREACH_THREAD(d,y,D1D)
855 MFEM_FOREACH_THREAD(q,x,Q1D)
863 MFEM_FOREACH_THREAD(dz,z,D1D)
865 MFEM_FOREACH_THREAD(dy,y,D1D)
867 MFEM_FOREACH_THREAD(qx,x,Q1D)
871 for (
int dx = 0; dx < D1D; ++dx)
873 const double coords = X[dz][dy][dx];
874 u += coords * B[qx][dx];
875 v += coords * G[qx][dx];
877 DDQ0[dz][dy][qx] = u;
878 DDQ1[dz][dy][qx] = v;
883 MFEM_FOREACH_THREAD(dz,z,D1D)
885 MFEM_FOREACH_THREAD(qy,y,Q1D)
887 MFEM_FOREACH_THREAD(qx,x,Q1D)
892 for (
int dy = 0; dy < D1D; ++dy)
894 u += DDQ1[dz][dy][qx] * B[qy][dy];
895 v += DDQ0[dz][dy][qx] * G[qy][dy];
896 w += DDQ0[dz][dy][qx] * B[qy][dy];
898 DQQ0[dz][qy][qx] = u;
899 DQQ1[dz][qy][qx] = v;
900 DQQ2[dz][qy][qx] = w;
905 MFEM_FOREACH_THREAD(qz,z,Q1D)
907 MFEM_FOREACH_THREAD(qy,y,Q1D)
909 MFEM_FOREACH_THREAD(qx,x,Q1D)
914 for (
int dz = 0; dz < D1D; ++dz)
916 u += DQQ0[dz][qy][qx] * B[qz][dz];
917 v += DQQ1[dz][qy][qx] * B[qz][dz];
918 w += DQQ2[dz][qy][qx] * G[qz][dz];
920 QQQ0[qz][qy][qx] = u;
921 QQQ1[qz][qy][qx] = v;
922 QQQ2[qz][qy][qx] = w;
927 MFEM_FOREACH_THREAD(qz,z,Q1D)
929 MFEM_FOREACH_THREAD(qy,y,Q1D)
931 MFEM_FOREACH_THREAD(qx,x,Q1D)
933 const int q = qx + ((qy*Q1D) + (qz*Q1D*Q1D));
934 const double O11 = op(q,0,e);
935 const double O12 = op(q,1,e);
936 const double O13 = op(q,2,e);
937 const double O22 = op(q,3,e);
938 const double O23 = op(q,4,e);
939 const double O33 = op(q,5,e);
940 const double gX = QQQ0[qz][qy][qx];
941 const double gY = QQQ1[qz][qy][qx];
942 const double gZ = QQQ2[qz][qy][qx];
943 QQQ0[qz][qy][qx] = (O11*gX) + (O12*gY) + (O13*gZ);
944 QQQ1[qz][qy][qx] = (O12*gX) + (O22*gY) + (O23*gZ);
945 QQQ2[qz][qy][qx] = (O13*gX) + (O23*gY) + (O33*gZ);
952 MFEM_FOREACH_THREAD(d,y,D1D)
954 MFEM_FOREACH_THREAD(q,x,Q1D)
962 MFEM_FOREACH_THREAD(qz,z,Q1D)
964 MFEM_FOREACH_THREAD(qy,y,Q1D)
966 MFEM_FOREACH_THREAD(dx,x,D1D)
971 for (
int qx = 0; qx < Q1D; ++qx)
973 u += QQQ0[qz][qy][qx] * Gt[dx][qx];
974 v += QQQ1[qz][qy][qx] * Bt[dx][qx];
975 w += QQQ2[qz][qy][qx] * Bt[dx][qx];
977 QQD0[qz][qy][dx] = u;
978 QQD1[qz][qy][dx] = v;
979 QQD2[qz][qy][dx] = w;
984 MFEM_FOREACH_THREAD(qz,z,Q1D)
986 MFEM_FOREACH_THREAD(dy,y,D1D)
988 MFEM_FOREACH_THREAD(dx,x,D1D)
993 for (
int qy = 0; qy < Q1D; ++qy)
995 u += QQD0[qz][qy][dx] * Bt[dy][qy];
996 v += QQD1[qz][qy][dx] * Gt[dy][qy];
997 w += QQD2[qz][qy][dx] * Bt[dy][qy];
999 QDD0[qz][dy][dx] = u;
1000 QDD1[qz][dy][dx] = v;
1001 QDD2[qz][dy][dx] = w;
1006 MFEM_FOREACH_THREAD(dz,z,D1D)
1008 MFEM_FOREACH_THREAD(dy,y,D1D)
1010 MFEM_FOREACH_THREAD(dx,x,D1D)
1015 for (
int qz = 0; qz < Q1D; ++qz)
1017 u += QDD0[qz][dy][dx] * Bt[dz][qz];
1018 v += QDD1[qz][dy][dx] * Bt[dz][qz];
1019 w += QDD2[qz][dy][dx] * Gt[dz][qz];
1021 y(dx,dy,dz,e) += (u + v + w);
1028 static void PADiffusionApply(
const int dim,
1032 const Array<double> &B,
1033 const Array<double> &G,
1034 const Array<double> &Bt,
1035 const Array<double> &Gt,
1040 #ifdef MFEM_USE_OCCA
1045 OccaPADiffusionApply2D(D1D, Q1D, NE, B, G, Bt, Gt, op, x, y);
1050 OccaPADiffusionApply3D(D1D, Q1D, NE, B, G, Bt, Gt, op, x, y);
1053 MFEM_ABORT(
"OCCA PADiffusionApply unknown kernel!");
1055 #endif // MFEM_USE_OCCA
1057 if (Device::Allows(Backend::RAJA_CUDA))
1061 switch ((D1D << 4 ) | Q1D)
1063 case 0x22:
return PADiffusionApply2D<2,2>(NE,B,G,Bt,Gt,op,x,y);
1064 case 0x33:
return PADiffusionApply2D<3,3>(NE,B,G,Bt,Gt,op,x,y);
1065 case 0x44:
return PADiffusionApply2D<4,4>(NE,B,G,Bt,Gt,op,x,y);
1066 case 0x55:
return PADiffusionApply2D<5,5>(NE,B,G,Bt,Gt,op,x,y);
1067 case 0x66:
return PADiffusionApply2D<6,6>(NE,B,G,Bt,Gt,op,x,y);
1068 case 0x77:
return PADiffusionApply2D<7,7>(NE,B,G,Bt,Gt,op,x,y);
1069 case 0x88:
return PADiffusionApply2D<8,8>(NE,B,G,Bt,Gt,op,x,y);
1070 case 0x99:
return PADiffusionApply2D<9,9>(NE,B,G,Bt,Gt,op,x,y);
1071 default:
return PADiffusionApply2D(NE,B,G,Bt,Gt,op,x,y,D1D,Q1D);
1076 switch ((D1D << 4 ) | Q1D)
1078 case 0x23:
return PADiffusionApply3D<2,3>(NE,B,G,Bt,Gt,op,x,y);
1079 case 0x34:
return PADiffusionApply3D<3,4>(NE,B,G,Bt,Gt,op,x,y);
1080 case 0x45:
return PADiffusionApply3D<4,5>(NE,B,G,Bt,Gt,op,x,y);
1081 case 0x56:
return PADiffusionApply3D<5,6>(NE,B,G,Bt,Gt,op,x,y);
1082 case 0x67:
return PADiffusionApply3D<6,7>(NE,B,G,Bt,Gt,op,x,y);
1083 case 0x78:
return PADiffusionApply3D<7,8>(NE,B,G,Bt,Gt,op,x,y);
1084 case 0x89:
return PADiffusionApply3D<8,9>(NE,B,G,Bt,Gt,op,x,y);
1085 default:
return PADiffusionApply3D(NE,B,G,Bt,Gt,op,x,y,D1D,Q1D);
1091 switch ((D1D << 4 ) | Q1D)
1093 case 0x22:
return SmemPADiffusionApply2D<2,2,16>(NE,B,G,Bt,Gt,op,x,y);
1094 case 0x33:
return SmemPADiffusionApply2D<3,3,16>(NE,B,G,Bt,Gt,op,x,y);
1095 case 0x44:
return SmemPADiffusionApply2D<4,4,8>(NE,B,G,Bt,Gt,op,x,y);
1096 case 0x55:
return SmemPADiffusionApply2D<5,5,8>(NE,B,G,Bt,Gt,op,x,y);
1097 case 0x66:
return SmemPADiffusionApply2D<6,6,4>(NE,B,G,Bt,Gt,op,x,y);
1098 case 0x77:
return SmemPADiffusionApply2D<7,7,4>(NE,B,G,Bt,Gt,op,x,y);
1099 case 0x88:
return SmemPADiffusionApply2D<8,8,2>(NE,B,G,Bt,Gt,op,x,y);
1100 case 0x99:
return SmemPADiffusionApply2D<9,9,2>(NE,B,G,Bt,Gt,op,x,y);
1101 default:
return PADiffusionApply2D(NE,B,G,Bt,Gt,op,x,y,D1D,Q1D);
1106 switch ((D1D << 4 ) | Q1D)
1108 case 0x23:
return SmemPADiffusionApply3D<2,3>(NE,B,G,Bt,Gt,op,x,y);
1109 case 0x34:
return SmemPADiffusionApply3D<3,4>(NE,B,G,Bt,Gt,op,x,y);
1110 case 0x45:
return SmemPADiffusionApply3D<4,5>(NE,B,G,Bt,Gt,op,x,y);
1111 case 0x56:
return SmemPADiffusionApply3D<5,6>(NE,B,G,Bt,Gt,op,x,y);
1112 case 0x67:
return SmemPADiffusionApply3D<6,7>(NE,B,G,Bt,Gt,op,x,y);
1113 case 0x78:
return SmemPADiffusionApply3D<7,8>(NE,B,G,Bt,Gt,op,x,y);
1114 case 0x89:
return SmemPADiffusionApply3D<8,9>(NE,B,G,Bt,Gt,op,x,y);
1115 default:
return PADiffusionApply3D(NE,B,G,Bt,Gt,op,x,y,D1D,Q1D);
1118 MFEM_ABORT(
"Unknown kernel.");
1124 PADiffusionApply(dim, dofs1D, quad1D, ne,
1125 maps->B, maps->G, maps->Bt, maps->Gt,
int GetNPoints() const
Returns the number of the points in the integration rule.
Abstract class for Finite Elements.
int Size() const
Logical size of the array.
int GetDim() const
Returns the reference space dimension for the finite element.
Class for an integration rule - an Array of IntegrationPoint.
Memory< T > & GetMemory()
Return a reference to the Memory object used by the Array.
Subclass constant coefficient.
occa::device & OccaDev()
Return the default occa::device used by MFEM.
int Size() const
Returns the size of the vector.
int ndof
Number of degrees of freedom = number of basis functions. When mode is TENSOR, this is the 1D number...
const Array< double > & GetWeights() const
Return the quadrature weights in a contiguous array.
std::map< occa_id_t, occa::kernel > occa_kernel_t
Memory< double > & GetMemory()
Return a reference to the Memory object used by the Vector.
occa::memory OccaMemoryReadWrite(Memory< T > &mem, size_t size)
Wrap a Memory object as occa::memory for read-write access with the mfem::Device MemoryClass. The returned occa::memory is associated with the default occa::device used by MFEM.
DeviceTensor< sizeof...(Dims), T > Reshape(T *ptr, Dims...dims)
Wrap a pointer as a DeviceTensor with automatically deduced template parameters.
int GetNE() const
Returns number of elements in the mesh.
const GeometricFactors * GetGeometricFactors(const IntegrationRule &ir, const int flags)
Return the mesh geometric factors corresponding to the given integration rule.
Mesh * GetMesh() const
Returns the mesh.
bool DeviceCanUseOcca()
Function that determines if an OCCA kernel should be used, based on the current mfem::Device configur...
const occa::memory OccaMemoryRead(const Memory< T > &mem, size_t size)
Wrap a Memory object as occa::memory for read only access with the mfem::Device MemoryClass. The returned occa::memory is associated with the default occa::device used by MFEM.
Class FiniteElementSpace - responsible for providing FEM view of the mesh, mainly managing the set of...
occa::memory OccaMemoryWrite(Memory< T > &mem, size_t size)
Wrap a Memory object as occa::memory for write only access with the mfem::Device MemoryClass. The returned occa::memory is associated with the default occa::device used by MFEM.
virtual const DofToQuad & GetDofToQuad(const IntegrationRule &ir, DofToQuad::Mode mode) const
MemoryType GetMemoryType(MemoryClass mc)
Return a suitable MemoryType for a given MemoryClass.
const FiniteElement * GetFE(int i) const
Returns pointer to the FiniteElement associated with i'th element.
std::pair< int, int > occa_id_t