12 #include "../general/forall.hpp"
15 #include "libceed/diffusion.hpp"
26 static void OccaPADiffusionSetup2D(
const int D1D,
29 const Array<double> &W,
34 occa::properties props;
35 props[
"defines/D1D"] = D1D;
36 props[
"defines/Q1D"] = Q1D;
41 const bool const_c = C.Size() == 1;
42 const occa_id_t id = std::make_pair(D1D,Q1D);
44 if (OccaDiffSetup2D_ker.find(
id) == OccaDiffSetup2D_ker.end())
46 const occa::kernel DiffusionSetup2D =
48 "DiffusionSetup2D", props);
49 OccaDiffSetup2D_ker.emplace(
id, DiffusionSetup2D);
51 OccaDiffSetup2D_ker.at(
id)(NE, o_W, o_J, o_C, o_op, const_c);
54 static void OccaPADiffusionSetup3D(
const int D1D,
57 const Array<double> &W,
62 occa::properties props;
63 props[
"defines/D1D"] = D1D;
64 props[
"defines/Q1D"] = Q1D;
69 const bool const_c = C.Size() == 1;
70 const occa_id_t id = std::make_pair(D1D,Q1D);
72 if (OccaDiffSetup3D_ker.find(
id) == OccaDiffSetup3D_ker.end())
74 const occa::kernel DiffusionSetup3D =
76 "DiffusionSetup3D", props);
77 OccaDiffSetup3D_ker.emplace(
id, DiffusionSetup3D);
79 OccaDiffSetup3D_ker.at(
id)(NE, o_W, o_J, o_C, o_op, const_c);
81 #endif // MFEM_USE_OCCA
84 static void PADiffusionSetup2D(
const int Q1D,
86 const Array<double> &w,
91 const int NQ = Q1D*Q1D;
92 const bool const_c = c.Size() == 1;
94 auto J =
Reshape(j.Read(), NQ, 2, 2, NE);
95 auto C = const_c ?
Reshape(c.Read(), 1, 1) :
Reshape(c.Read(), NQ, NE);
96 auto D =
Reshape(d.Write(), NQ, 3, NE);
100 for (
int q = 0; q < NQ; ++q)
102 const double J11 = J(q,0,0,e);
103 const double J21 = J(q,1,0,e);
104 const double J12 = J(q,0,1,e);
105 const double J22 = J(q,1,1,e);
106 const double coeff = const_c ? C(0,0) : C(q,e);
107 const double c_detJ = W[q] * coeff / ((J11*J22)-(J21*J12));
108 D(q,0,e) = c_detJ * (J12*J12 + J22*J22);
109 D(q,1,e) = -c_detJ * (J12*J11 + J22*J21);
110 D(q,2,e) = c_detJ * (J11*J11 + J21*J21);
116 static void PADiffusionSetup3D(
const int Q1D,
118 const Array<double> &w,
123 const int NQ = Q1D*Q1D*Q1D;
124 const bool const_c = c.Size() == 1;
126 auto J =
Reshape(j.Read(), NQ, 3, 3, NE);
127 auto C = const_c ?
Reshape(c.Read(), 1, 1) :
Reshape(c.Read(), NQ, NE);
128 auto D =
Reshape(d.Write(), NQ, 6, NE);
131 for (
int q = 0; q < NQ; ++q)
133 const double J11 = J(q,0,0,e);
134 const double J21 = J(q,1,0,e);
135 const double J31 = J(q,2,0,e);
136 const double J12 = J(q,0,1,e);
137 const double J22 = J(q,1,1,e);
138 const double J32 = J(q,2,1,e);
139 const double J13 = J(q,0,2,e);
140 const double J23 = J(q,1,2,e);
141 const double J33 = J(q,2,2,e);
142 const double detJ = J11 * (J22 * J33 - J32 * J23) -
143 J21 * (J12 * J33 - J32 * J13) +
144 J31 * (J12 * J23 - J22 * J13);
145 const double coeff = const_c ? C(0,0) : C(q,e);
146 const double c_detJ = W[q] * coeff / detJ;
148 const double A11 = (J22 * J33) - (J23 * J32);
149 const double A12 = (J32 * J13) - (J12 * J33);
150 const double A13 = (J12 * J23) - (J22 * J13);
151 const double A21 = (J31 * J23) - (J21 * J33);
152 const double A22 = (J11 * J33) - (J13 * J31);
153 const double A23 = (J21 * J13) - (J11 * J23);
154 const double A31 = (J21 * J32) - (J31 * J22);
155 const double A32 = (J31 * J12) - (J11 * J32);
156 const double A33 = (J11 * J22) - (J12 * J21);
158 D(q,0,e) = c_detJ * (A11*A11 + A12*A12 + A13*A13);
159 D(q,1,e) = c_detJ * (A11*A21 + A12*A22 + A13*A23);
160 D(q,2,e) = c_detJ * (A11*A31 + A12*A32 + A13*A33);
161 D(q,3,e) = c_detJ * (A21*A21 + A22*A22 + A23*A23);
162 D(q,4,e) = c_detJ * (A21*A31 + A22*A32 + A23*A33);
163 D(q,5,e) = c_detJ * (A31*A31 + A32*A32 + A33*A33);
168 static void PADiffusionSetup(
const int dim,
172 const Array<double> &W,
177 if (dim == 1) { MFEM_ABORT(
"dim==1 not supported in PADiffusionSetup"); }
183 OccaPADiffusionSetup2D(D1D, Q1D, NE, W, J, C, D);
186 #endif // MFEM_USE_OCCA
187 PADiffusionSetup2D(Q1D, NE, W, J, C, D);
194 OccaPADiffusionSetup3D(D1D, Q1D, NE, W, J, C, D);
197 #endif // MFEM_USE_OCCA
198 PADiffusionSetup3D(Q1D, NE, W, J, C, D);
208 if (mesh->
GetNE() == 0) {
return; }
212 if (DeviceCanUseCeed() && !force)
214 if (ceedDataPtr) {
delete ceedDataPtr; }
215 CeedData* ptr =
new CeedData();
217 InitCeedCoeff(Q, ptr);
218 return CeedPADiffusionAssemble(fes, *ir, *ptr);
221 const int dims = el.
GetDim();
222 const int symmDims = (dims * (dims + 1)) / 2;
230 pa_data.SetSize(symmDims * nq * ne, Device::GetDeviceMemoryType());
240 coeff(0) = cQ->constant;
246 for (
int e = 0; e < ne; ++e)
249 for (
int q = 0; q < nq; ++q)
251 C(q,e) = Q->Eval(T, ir->
IntPoint(q));
255 PADiffusionSetup(dim, dofs1D, quad1D, ne, ir->
GetWeights(),
geom->J, coeff,
265 template<
int T_D1D = 0,
int T_Q1D = 0>
266 static void PADiffusionDiagonal2D(
const int NE,
274 const int D1D = T_D1D ? T_D1D : d1d;
275 const int Q1D = T_Q1D ? T_Q1D : q1d;
276 MFEM_VERIFY(D1D <=
MAX_D1D,
"");
277 MFEM_VERIFY(Q1D <=
MAX_Q1D,
"");
286 const int D1D = T_D1D ? T_D1D : d1d;
287 const int Q1D = T_Q1D ? T_Q1D : q1d;
288 constexpr
int MD1 = T_D1D ? T_D1D :
MAX_D1D;
289 constexpr
int MQ1 = T_Q1D ? T_Q1D :
MAX_Q1D;
291 double QD0[MQ1][MD1];
292 double QD1[MQ1][MD1];
293 double QD2[MQ1][MD1];
294 for (
int qx = 0; qx < Q1D; ++qx)
296 for (
int dy = 0; dy < D1D; ++dy)
301 for (
int qy = 0; qy < Q1D; ++qy)
303 const int q = qx + qy * Q1D;
304 const double D0 = D(q,0,e);
305 const double D1 = D(q,1,e);
306 const double D2 = D(q,2,e);
307 QD0[qx][dy] += B(qy, dy) * B(qy, dy) * D0;
308 QD1[qx][dy] += B(qy, dy) * G(qy, dy) * D1;
309 QD2[qx][dy] += G(qy, dy) * G(qy, dy) * D2;
313 for (
int dy = 0; dy < D1D; ++dy)
315 for (
int dx = 0; dx < D1D; ++dx)
317 for (
int qx = 0; qx < Q1D; ++qx)
319 Y(dx,dy,e) += G(qx, dx) * G(qx, dx) * QD0[qx][dy];
320 Y(dx,dy,e) += G(qx, dx) * B(qx, dx) * QD1[qx][dy];
321 Y(dx,dy,e) += B(qx, dx) * G(qx, dx) * QD1[qx][dy];
322 Y(dx,dy,e) += B(qx, dx) * B(qx, dx) * QD2[qx][dy];
330 template<
int T_D1D = 0,
int T_Q1D = 0,
int T_NBZ = 0>
331 static void SmemPADiffusionDiagonal2D(
const int NE,
332 const Array<double> &b_,
333 const Array<double> &g_,
339 const int D1D = T_D1D ? T_D1D : d1d;
340 const int Q1D = T_Q1D ? T_Q1D : q1d;
341 constexpr
int NBZ = T_NBZ ? T_NBZ : 1;
342 constexpr
int MQ1 = T_Q1D ? T_Q1D :
MAX_Q1D;
343 constexpr
int MD1 = T_D1D ? T_D1D :
MAX_D1D;
344 MFEM_VERIFY(D1D <= MD1,
"");
345 MFEM_VERIFY(Q1D <= MQ1,
"");
346 auto b =
Reshape(b_.Read(), Q1D, D1D);
347 auto g =
Reshape(g_.Read(), Q1D, D1D);
348 auto D =
Reshape(d_.Read(), Q1D*Q1D, 3, NE);
349 auto Y =
Reshape(y_.ReadWrite(), D1D, D1D, NE);
350 MFEM_FORALL_2D(e, NE, Q1D, Q1D, NBZ,
352 const int tidz = MFEM_THREAD_ID(z);
353 const int D1D = T_D1D ? T_D1D : d1d;
354 const int Q1D = T_Q1D ? T_Q1D : q1d;
355 constexpr
int NBZ = T_NBZ ? T_NBZ : 1;
356 constexpr
int MQ1 = T_Q1D ? T_Q1D :
MAX_Q1D;
357 constexpr
int MD1 = T_D1D ? T_D1D :
MAX_D1D;
358 MFEM_SHARED
double BG[2][MQ1*MD1];
359 double (*B)[MD1] = (double (*)[MD1]) (BG+0);
360 double (*G)[MD1] = (double (*)[MD1]) (BG+1);
361 MFEM_SHARED
double QD[4][NBZ][MD1][MQ1];
362 double (*QD0)[MD1] = (double (*)[MD1])(QD[0] + tidz);
363 double (*QD1)[MD1] = (double (*)[MD1])(QD[1] + tidz);
364 double (*QD2)[MD1] = (double (*)[MD1])(QD[3] + tidz);
367 MFEM_FOREACH_THREAD(d,y,D1D)
369 MFEM_FOREACH_THREAD(q,x,Q1D)
377 MFEM_FOREACH_THREAD(qx,x,Q1D)
379 MFEM_FOREACH_THREAD(dy,y,D1D)
384 for (
int qy = 0; qy < Q1D; ++qy)
386 const int q = qx + qy * Q1D;
387 const double D0 = D(q,0,e);
388 const double D1 = D(q,1,e);
389 const double D2 = D(q,2,e);
390 const double By = B[qy][dy];
391 const double Gy = G[qy][dy];
392 const double BB = By * By;
393 const double BG = By * Gy;
394 const double GG = Gy * Gy;
395 QD0[qx][dy] += BB * D0;
396 QD1[qx][dy] += BG * D1;
397 QD2[qx][dy] += GG * D2;
402 MFEM_FOREACH_THREAD(dy,y,D1D)
404 MFEM_FOREACH_THREAD(dx,x,D1D)
406 for (
int qx = 0; qx < Q1D; ++qx)
408 const double Bx = B[qx][dx];
409 const double Gx = G[qx][dx];
410 const double BB = Bx * Bx;
411 const double BG = Bx * Gx;
412 const double GG = Gx * Gx;
413 Y(dx,dy,e) += GG * QD0[qx][dy];
414 Y(dx,dy,e) += BG * QD1[qx][dy];
415 Y(dx,dy,e) += BG * QD1[qx][dy];
416 Y(dx,dy,e) += BB * QD2[qx][dy];
423 template<
int T_D1D = 0,
int T_Q1D = 0>
424 static void PADiffusionDiagonal3D(
const int NE,
425 const Array<double> &b,
426 const Array<double> &g,
432 constexpr
int DIM = 3;
433 const int D1D = T_D1D ? T_D1D : d1d;
434 const int Q1D = T_Q1D ? T_Q1D : q1d;
435 constexpr
int MQ1 = T_Q1D ? T_Q1D :
MAX_Q1D;
436 constexpr
int MD1 = T_D1D ? T_D1D :
MAX_D1D;
437 MFEM_VERIFY(D1D <= MD1,
"");
438 MFEM_VERIFY(Q1D <= MQ1,
"");
439 auto B =
Reshape(b.Read(), Q1D, D1D);
440 auto G =
Reshape(g.Read(), Q1D, D1D);
441 auto Q =
Reshape(d.Read(), Q1D*Q1D*Q1D, 6, NE);
442 auto Y =
Reshape(y.ReadWrite(), D1D, D1D, D1D, NE);
445 const int D1D = T_D1D ? T_D1D : d1d;
446 const int Q1D = T_Q1D ? T_Q1D : q1d;
447 constexpr
int MD1 = T_D1D ? T_D1D :
MAX_D1D;
448 constexpr
int MQ1 = T_Q1D ? T_Q1D :
MAX_Q1D;
449 double QQD[MQ1][MQ1][MD1];
450 double QDD[MQ1][MD1][MD1];
451 for (
int i = 0; i < DIM; ++i)
453 for (
int j = 0; j < DIM; ++j)
456 for (
int qx = 0; qx < Q1D; ++qx)
458 for (
int qy = 0; qy < Q1D; ++qy)
460 for (
int dz = 0; dz < D1D; ++dz)
462 QQD[qx][qy][dz] = 0.0;
463 for (
int qz = 0; qz < Q1D; ++qz)
465 const int q = qx + (qy + qz * Q1D) * Q1D;
466 const int k = j >= i ?
467 3 - (3-i)*(2-i)/2 + j:
468 3 - (3-j)*(2-j)/2 + i;
469 const double O = Q(q,k,e);
470 const double Bz = B(qz,dz);
471 const double Gz = G(qz,dz);
472 const double L = i==2 ? Gz : Bz;
473 const double R = j==2 ? Gz : Bz;
474 QQD[qx][qy][dz] += L * O * R;
480 for (
int qx = 0; qx < Q1D; ++qx)
482 for (
int dz = 0; dz < D1D; ++dz)
484 for (
int dy = 0; dy < D1D; ++dy)
486 QDD[qx][dy][dz] = 0.0;
487 for (
int qy = 0; qy < Q1D; ++qy)
489 const double By = B(qy,dy);
490 const double Gy = G(qy,dy);
491 const double L = i==1 ? Gy : By;
492 const double R = j==1 ? Gy : By;
493 QDD[qx][dy][dz] += L * QQD[qx][qy][dz] * R;
499 for (
int dz = 0; dz < D1D; ++dz)
501 for (
int dy = 0; dy < D1D; ++dy)
503 for (
int dx = 0; dx < D1D; ++dx)
505 for (
int qx = 0; qx < Q1D; ++qx)
507 const double Bx = B(qx,dx);
508 const double Gx = G(qx,dx);
509 const double L = i==0 ? Gx : Bx;
510 const double R = j==0 ? Gx : Bx;
511 Y(dx, dy, dz, e) += L * QDD[qx][dy][dz] * R;
522 template<
int T_D1D = 0,
int T_Q1D = 0>
523 static void SmemPADiffusionDiagonal3D(
const int NE,
524 const Array<double> &b_,
525 const Array<double> &g_,
531 constexpr
int DIM = 3;
532 const int D1D = T_D1D ? T_D1D : d1d;
533 const int Q1D = T_Q1D ? T_Q1D : q1d;
534 constexpr
int MQ1 = T_Q1D ? T_Q1D :
MAX_Q1D;
535 constexpr
int MD1 = T_D1D ? T_D1D :
MAX_D1D;
536 MFEM_VERIFY(D1D <= MD1,
"");
537 MFEM_VERIFY(Q1D <= MQ1,
"");
538 auto b =
Reshape(b_.Read(), Q1D, D1D);
539 auto g =
Reshape(g_.Read(), Q1D, D1D);
540 auto D =
Reshape(d_.Read(), Q1D*Q1D*Q1D, 6, NE);
541 auto Y =
Reshape(y_.ReadWrite(), D1D, D1D, D1D, NE);
542 MFEM_FORALL_3D(e, NE, Q1D, Q1D, Q1D,
544 const int tidz = MFEM_THREAD_ID(z);
545 const int D1D = T_D1D ? T_D1D : d1d;
546 const int Q1D = T_Q1D ? T_Q1D : q1d;
547 constexpr
int MQ1 = T_Q1D ? T_Q1D :
MAX_Q1D;
548 constexpr
int MD1 = T_D1D ? T_D1D :
MAX_D1D;
549 MFEM_SHARED
double BG[2][MQ1*MD1];
550 double (*B)[MD1] = (double (*)[MD1]) (BG+0);
551 double (*G)[MD1] = (double (*)[MD1]) (BG+1);
552 MFEM_SHARED
double QQD[MQ1][MQ1][MD1];
553 MFEM_SHARED
double QDD[MQ1][MD1][MD1];
556 MFEM_FOREACH_THREAD(d,y,D1D)
558 MFEM_FOREACH_THREAD(q,x,Q1D)
566 for (
int i = 0; i < DIM; ++i)
568 for (
int j = 0; j < DIM; ++j)
571 MFEM_FOREACH_THREAD(qx,x,Q1D)
573 MFEM_FOREACH_THREAD(qy,y,Q1D)
575 MFEM_FOREACH_THREAD(dz,z,D1D)
577 QQD[qx][qy][dz] = 0.0;
578 for (
int qz = 0; qz < Q1D; ++qz)
580 const int q = qx + (qy + qz * Q1D) * Q1D;
581 const int k = j >= i ?
582 3 - (3-i)*(2-i)/2 + j:
583 3 - (3-j)*(2-j)/2 + i;
584 const double O = D(q,k,e);
585 const double Bz = B[qz][dz];
586 const double Gz = G[qz][dz];
587 const double L = i==2 ? Gz : Bz;
588 const double R = j==2 ? Gz : Bz;
589 QQD[qx][qy][dz] += L * O * R;
596 MFEM_FOREACH_THREAD(qx,x,Q1D)
598 MFEM_FOREACH_THREAD(dz,z,D1D)
600 MFEM_FOREACH_THREAD(dy,y,D1D)
602 QDD[qx][dy][dz] = 0.0;
603 for (
int qy = 0; qy < Q1D; ++qy)
605 const double By = B[qy][dy];
606 const double Gy = G[qy][dy];
607 const double L = i==1 ? Gy : By;
608 const double R = j==1 ? Gy : By;
609 QDD[qx][dy][dz] += L * QQD[qx][qy][dz] * R;
616 MFEM_FOREACH_THREAD(dz,z,D1D)
618 MFEM_FOREACH_THREAD(dy,y,D1D)
620 MFEM_FOREACH_THREAD(dx,x,D1D)
622 for (
int qx = 0; qx < Q1D; ++qx)
624 const double Bx = B[qx][dx];
625 const double Gx = G[qx][dx];
626 const double L = i==0 ? Gx : Bx;
627 const double R = j==0 ? Gx : Bx;
628 Y(dx, dy, dz, e) += L * QDD[qx][dy][dz] * R;
638 static void PADiffusionAssembleDiagonal(
const int dim,
642 const Array<double> &B,
643 const Array<double> &G,
649 switch ((D1D << 4 ) | Q1D)
651 case 0x22:
return SmemPADiffusionDiagonal2D<2,2,8>(NE,B,G,D,Y);
652 case 0x33:
return SmemPADiffusionDiagonal2D<3,3,8>(NE,B,G,D,Y);
653 case 0x44:
return SmemPADiffusionDiagonal2D<4,4,4>(NE,B,G,D,Y);
654 case 0x55:
return SmemPADiffusionDiagonal2D<5,5,4>(NE,B,G,D,Y);
655 case 0x66:
return SmemPADiffusionDiagonal2D<6,6,2>(NE,B,G,D,Y);
656 case 0x77:
return SmemPADiffusionDiagonal2D<7,7,2>(NE,B,G,D,Y);
657 case 0x88:
return SmemPADiffusionDiagonal2D<8,8,1>(NE,B,G,D,Y);
658 case 0x99:
return SmemPADiffusionDiagonal2D<9,9,1>(NE,B,G,D,Y);
659 default:
return PADiffusionDiagonal2D(NE,B,G,D,Y,D1D,Q1D);
664 switch ((D1D << 4 ) | Q1D)
666 case 0x23:
return SmemPADiffusionDiagonal3D<2,3>(NE,B,G,D,Y);
667 case 0x34:
return SmemPADiffusionDiagonal3D<3,4>(NE,B,G,D,Y);
668 case 0x45:
return SmemPADiffusionDiagonal3D<4,5>(NE,B,G,D,Y);
669 case 0x56:
return SmemPADiffusionDiagonal3D<5,6>(NE,B,G,D,Y);
670 case 0x67:
return SmemPADiffusionDiagonal3D<6,7>(NE,B,G,D,Y);
671 case 0x78:
return SmemPADiffusionDiagonal3D<7,8>(NE,B,G,D,Y);
672 case 0x89:
return SmemPADiffusionDiagonal3D<8,9>(NE,B,G,D,Y);
673 case 0x9A:
return SmemPADiffusionDiagonal3D<9,10>(NE,B,G,D,Y);
674 default:
return PADiffusionDiagonal3D(NE,B,G,D,Y,D1D,Q1D);
677 MFEM_ABORT(
"Unknown kernel.");
680 void DiffusionIntegrator::AssembleDiagonalPA(
Vector &diag)
682 if (pa_data.Size()==0) { SetupPA(*fespace,
true); }
683 PADiffusionAssembleDiagonal(dim, dofs1D, quad1D, ne,
684 maps->B, maps->G, pa_data, diag);
690 static void OccaPADiffusionApply2D(
const int D1D,
701 occa::properties props;
702 props[
"defines/D1D"] = D1D;
703 props[
"defines/Q1D"] = Q1D;
711 const occa_id_t id = std::make_pair(D1D,Q1D);
712 if (!Device::Allows(Backend::OCCA_CUDA))
715 if (OccaDiffApply2D_cpu.find(
id) == OccaDiffApply2D_cpu.end())
717 const occa::kernel DiffusionApply2D_CPU =
719 "DiffusionApply2D_CPU", props);
720 OccaDiffApply2D_cpu.emplace(
id, DiffusionApply2D_CPU);
722 OccaDiffApply2D_cpu.at(
id)(NE, o_B, o_G, o_Bt, o_Gt, o_D, o_X, o_Y);
727 if (OccaDiffApply2D_gpu.find(
id) == OccaDiffApply2D_gpu.end())
729 const occa::kernel DiffusionApply2D_GPU =
731 "DiffusionApply2D_GPU", props);
732 OccaDiffApply2D_gpu.emplace(
id, DiffusionApply2D_GPU);
734 OccaDiffApply2D_gpu.at(
id)(NE, o_B, o_G, o_Bt, o_Gt, o_D, o_X, o_Y);
739 static void OccaPADiffusionApply3D(
const int D1D,
742 const Array<double> &B,
743 const Array<double> &G,
744 const Array<double> &Bt,
745 const Array<double> &Gt,
750 occa::properties props;
751 props[
"defines/D1D"] = D1D;
752 props[
"defines/Q1D"] = Q1D;
755 const occa::memory o_Bt =
OccaMemoryRead(Bt.GetMemory(), Bt.Size());
756 const occa::memory o_Gt =
OccaMemoryRead(Gt.GetMemory(), Gt.Size());
760 const occa_id_t id = std::make_pair(D1D,Q1D);
761 if (!Device::Allows(Backend::OCCA_CUDA))
764 if (OccaDiffApply3D_cpu.find(
id) == OccaDiffApply3D_cpu.end())
766 const occa::kernel DiffusionApply3D_CPU =
768 "DiffusionApply3D_CPU", props);
769 OccaDiffApply3D_cpu.emplace(
id, DiffusionApply3D_CPU);
771 OccaDiffApply3D_cpu.at(
id)(NE, o_B, o_G, o_Bt, o_Gt, o_D, o_X, o_Y);
776 if (OccaDiffApply3D_gpu.find(
id) == OccaDiffApply3D_gpu.end())
778 const occa::kernel DiffusionApply3D_GPU =
780 "DiffusionApply3D_GPU", props);
781 OccaDiffApply3D_gpu.emplace(
id, DiffusionApply3D_GPU);
783 OccaDiffApply3D_gpu.at(
id)(NE, o_B, o_G, o_Bt, o_Gt, o_D, o_X, o_Y);
786 #endif // MFEM_USE_OCCA
789 template<
int T_D1D = 0,
int T_Q1D = 0>
790 static void PADiffusionApply2D(
const int NE,
791 const Array<double> &b_,
792 const Array<double> &g_,
793 const Array<double> &bt_,
794 const Array<double> >_,
801 const int D1D = T_D1D ? T_D1D : d1d;
802 const int Q1D = T_Q1D ? T_Q1D : q1d;
803 MFEM_VERIFY(D1D <=
MAX_D1D,
"");
804 MFEM_VERIFY(Q1D <=
MAX_Q1D,
"");
805 auto B =
Reshape(b_.Read(), Q1D, D1D);
806 auto G =
Reshape(g_.Read(), Q1D, D1D);
807 auto Bt =
Reshape(bt_.Read(), D1D, Q1D);
808 auto Gt =
Reshape(gt_.Read(), D1D, Q1D);
809 auto D =
Reshape(d_.Read(), Q1D*Q1D, 3, NE);
810 auto X =
Reshape(x_.Read(), D1D, D1D, NE);
811 auto Y =
Reshape(y_.ReadWrite(), D1D, D1D, NE);
814 const int D1D = T_D1D ? T_D1D : d1d;
815 const int Q1D = T_Q1D ? T_Q1D : q1d;
817 constexpr
int max_D1D = T_D1D ? T_D1D :
MAX_D1D;
818 constexpr
int max_Q1D = T_Q1D ? T_Q1D :
MAX_Q1D;
820 double grad[max_Q1D][max_Q1D][2];
821 for (
int qy = 0; qy < Q1D; ++qy)
823 for (
int qx = 0; qx < Q1D; ++qx)
825 grad[qy][qx][0] = 0.0;
826 grad[qy][qx][1] = 0.0;
829 for (
int dy = 0; dy < D1D; ++dy)
831 double gradX[max_Q1D][2];
832 for (
int qx = 0; qx < Q1D; ++qx)
837 for (
int dx = 0; dx < D1D; ++dx)
839 const double s = X(dx,dy,e);
840 for (
int qx = 0; qx < Q1D; ++qx)
842 gradX[qx][0] += s * B(qx,dx);
843 gradX[qx][1] += s * G(qx,dx);
846 for (
int qy = 0; qy < Q1D; ++qy)
848 const double wy = B(qy,dy);
849 const double wDy = G(qy,dy);
850 for (
int qx = 0; qx < Q1D; ++qx)
852 grad[qy][qx][0] += gradX[qx][1] * wy;
853 grad[qy][qx][1] += gradX[qx][0] * wDy;
858 for (
int qy = 0; qy < Q1D; ++qy)
860 for (
int qx = 0; qx < Q1D; ++qx)
862 const int q = qx + qy * Q1D;
864 const double O11 = D(q,0,e);
865 const double O12 = D(q,1,e);
866 const double O22 = D(q,2,e);
868 const double gradX = grad[qy][qx][0];
869 const double gradY = grad[qy][qx][1];
871 grad[qy][qx][0] = (O11 * gradX) + (O12 * gradY);
872 grad[qy][qx][1] = (O12 * gradX) + (O22 * gradY);
875 for (
int qy = 0; qy < Q1D; ++qy)
877 double gradX[max_D1D][2];
878 for (
int dx = 0; dx < D1D; ++dx)
883 for (
int qx = 0; qx < Q1D; ++qx)
885 const double gX = grad[qy][qx][0];
886 const double gY = grad[qy][qx][1];
887 for (
int dx = 0; dx < D1D; ++dx)
889 const double wx = Bt(dx,qx);
890 const double wDx = Gt(dx,qx);
891 gradX[dx][0] += gX * wDx;
892 gradX[dx][1] += gY * wx;
895 for (
int dy = 0; dy < D1D; ++dy)
897 const double wy = Bt(dy,qy);
898 const double wDy = Gt(dy,qy);
899 for (
int dx = 0; dx < D1D; ++dx)
901 Y(dx,dy,e) += ((gradX[dx][0] * wy) + (gradX[dx][1] * wDy));
909 template<
int T_D1D = 0,
int T_Q1D = 0,
int T_NBZ = 0>
910 static void SmemPADiffusionApply2D(
const int NE,
911 const Array<double> &b_,
912 const Array<double> &g_,
913 const Array<double> &bt_,
914 const Array<double> >_,
921 const int D1D = T_D1D ? T_D1D : d1d;
922 const int Q1D = T_Q1D ? T_Q1D : q1d;
923 constexpr
int NBZ = T_NBZ ? T_NBZ : 1;
924 constexpr
int MQ1 = T_Q1D ? T_Q1D :
MAX_Q1D;
925 constexpr
int MD1 = T_D1D ? T_D1D :
MAX_D1D;
926 MFEM_VERIFY(D1D <= MD1,
"");
927 MFEM_VERIFY(Q1D <= MQ1,
"");
928 auto b =
Reshape(b_.Read(), Q1D, D1D);
929 auto g =
Reshape(g_.Read(), Q1D, D1D);
930 auto D =
Reshape(d_.Read(), Q1D*Q1D, 3, NE);
931 auto x =
Reshape(x_.Read(), D1D, D1D, NE);
932 auto Y =
Reshape(y_.ReadWrite(), D1D, D1D, NE);
933 MFEM_FORALL_2D(e, NE, Q1D, Q1D, NBZ,
935 const int tidz = MFEM_THREAD_ID(z);
936 const int D1D = T_D1D ? T_D1D : d1d;
937 const int Q1D = T_Q1D ? T_Q1D : q1d;
938 constexpr
int NBZ = T_NBZ ? T_NBZ : 1;
939 constexpr
int MQ1 = T_Q1D ? T_Q1D :
MAX_Q1D;
940 constexpr
int MD1 = T_D1D ? T_D1D :
MAX_D1D;
941 MFEM_SHARED
double sBG[2][MQ1*MD1];
942 double (*B)[MD1] = (double (*)[MD1]) (sBG+0);
943 double (*G)[MD1] = (double (*)[MD1]) (sBG+1);
944 double (*Bt)[MQ1] = (double (*)[MQ1]) (sBG+0);
945 double (*Gt)[MQ1] = (double (*)[MQ1]) (sBG+1);
946 MFEM_SHARED
double Xz[NBZ][MD1][MD1];
947 MFEM_SHARED
double GD[2][NBZ][MD1][MQ1];
948 MFEM_SHARED
double GQ[2][NBZ][MD1][MQ1];
949 double (*X)[MD1] = (double (*)[MD1])(Xz + tidz);
950 double (*DQ0)[MD1] = (double (*)[MD1])(GD[0] + tidz);
951 double (*DQ1)[MD1] = (double (*)[MD1])(GD[1] + tidz);
952 double (*QQ0)[MD1] = (double (*)[MD1])(GQ[0] + tidz);
953 double (*QQ1)[MD1] = (double (*)[MD1])(GQ[1] + tidz);
954 MFEM_FOREACH_THREAD(dy,y,D1D)
956 MFEM_FOREACH_THREAD(dx,x,D1D)
958 X[dy][dx] = x(dx,dy,e);
963 MFEM_FOREACH_THREAD(dy,y,D1D)
965 MFEM_FOREACH_THREAD(q,x,Q1D)
973 MFEM_FOREACH_THREAD(dy,y,D1D)
975 MFEM_FOREACH_THREAD(qx,x,Q1D)
979 for (
int dx = 0; dx < D1D; ++dx)
981 const double coords = X[dy][dx];
982 u += B[qx][dx] * coords;
983 v += G[qx][dx] * coords;
990 MFEM_FOREACH_THREAD(qy,y,Q1D)
992 MFEM_FOREACH_THREAD(qx,x,Q1D)
996 for (
int dy = 0; dy < D1D; ++dy)
998 u += DQ1[dy][qx] * B[qy][dy];
999 v += DQ0[dy][qx] * G[qy][dy];
1006 MFEM_FOREACH_THREAD(qy,y,Q1D)
1008 MFEM_FOREACH_THREAD(qx,x,Q1D)
1010 const int q = (qx + ((qy) * Q1D));
1011 const double O11 = D(q,0,e);
1012 const double O12 = D(q,1,e);
1013 const double O22 = D(q,2,e);
1014 const double gX = QQ0[qy][qx];
1015 const double gY = QQ1[qy][qx];
1016 QQ0[qy][qx] = (O11 * gX) + (O12 * gY);
1017 QQ1[qy][qx] = (O12 * gX) + (O22 * gY);
1023 MFEM_FOREACH_THREAD(dy,y,D1D)
1025 MFEM_FOREACH_THREAD(q,x,Q1D)
1027 Bt[dy][q] =
b(q,dy);
1028 Gt[dy][q] = g(q,dy);
1033 MFEM_FOREACH_THREAD(qy,y,Q1D)
1035 MFEM_FOREACH_THREAD(dx,x,D1D)
1039 for (
int qx = 0; qx < Q1D; ++qx)
1041 u += Gt[dx][qx] * QQ0[qy][qx];
1042 v += Bt[dx][qx] * QQ1[qy][qx];
1049 MFEM_FOREACH_THREAD(dy,y,D1D)
1051 MFEM_FOREACH_THREAD(dx,x,D1D)
1055 for (
int qy = 0; qy < Q1D; ++qy)
1057 u += DQ0[qy][dx] * Bt[dy][qy];
1058 v += DQ1[qy][dx] * Gt[dy][qy];
1060 Y(dx,dy,e) += (u + v);
1067 template<
int T_D1D = 0,
int T_Q1D = 0>
1068 static void PADiffusionApply3D(
const int NE,
1069 const Array<double> &b,
1070 const Array<double> &g,
1071 const Array<double> &bt,
1072 const Array<double> >,
1076 int d1d = 0,
int q1d = 0)
1078 const int D1D = T_D1D ? T_D1D : d1d;
1079 const int Q1D = T_Q1D ? T_Q1D : q1d;
1080 MFEM_VERIFY(D1D <=
MAX_D1D,
"");
1081 MFEM_VERIFY(Q1D <=
MAX_Q1D,
"");
1082 auto B =
Reshape(b.Read(), Q1D, D1D);
1083 auto G =
Reshape(g.Read(), Q1D, D1D);
1084 auto Bt =
Reshape(bt.Read(), D1D, Q1D);
1085 auto Gt =
Reshape(gt.Read(), D1D, Q1D);
1086 auto D =
Reshape(d_.Read(), Q1D*Q1D*Q1D, 6, NE);
1087 auto X =
Reshape(x_.Read(), D1D, D1D, D1D, NE);
1088 auto Y =
Reshape(y_.ReadWrite(), D1D, D1D, D1D, NE);
1091 const int D1D = T_D1D ? T_D1D : d1d;
1092 const int Q1D = T_Q1D ? T_Q1D : q1d;
1093 constexpr
int max_D1D = T_D1D ? T_D1D :
MAX_D1D;
1094 constexpr
int max_Q1D = T_Q1D ? T_Q1D :
MAX_Q1D;
1095 double grad[max_Q1D][max_Q1D][max_Q1D][3];
1096 for (
int qz = 0; qz < Q1D; ++qz)
1098 for (
int qy = 0; qy < Q1D; ++qy)
1100 for (
int qx = 0; qx < Q1D; ++qx)
1102 grad[qz][qy][qx][0] = 0.0;
1103 grad[qz][qy][qx][1] = 0.0;
1104 grad[qz][qy][qx][2] = 0.0;
1108 for (
int dz = 0; dz < D1D; ++dz)
1110 double gradXY[max_Q1D][max_Q1D][3];
1111 for (
int qy = 0; qy < Q1D; ++qy)
1113 for (
int qx = 0; qx < Q1D; ++qx)
1115 gradXY[qy][qx][0] = 0.0;
1116 gradXY[qy][qx][1] = 0.0;
1117 gradXY[qy][qx][2] = 0.0;
1120 for (
int dy = 0; dy < D1D; ++dy)
1122 double gradX[max_Q1D][2];
1123 for (
int qx = 0; qx < Q1D; ++qx)
1128 for (
int dx = 0; dx < D1D; ++dx)
1130 const double s = X(dx,dy,dz,e);
1131 for (
int qx = 0; qx < Q1D; ++qx)
1133 gradX[qx][0] += s * B(qx,dx);
1134 gradX[qx][1] += s * G(qx,dx);
1137 for (
int qy = 0; qy < Q1D; ++qy)
1139 const double wy = B(qy,dy);
1140 const double wDy = G(qy,dy);
1141 for (
int qx = 0; qx < Q1D; ++qx)
1143 const double wx = gradX[qx][0];
1144 const double wDx = gradX[qx][1];
1145 gradXY[qy][qx][0] += wDx * wy;
1146 gradXY[qy][qx][1] += wx * wDy;
1147 gradXY[qy][qx][2] += wx * wy;
1151 for (
int qz = 0; qz < Q1D; ++qz)
1153 const double wz = B(qz,dz);
1154 const double wDz = G(qz,dz);
1155 for (
int qy = 0; qy < Q1D; ++qy)
1157 for (
int qx = 0; qx < Q1D; ++qx)
1159 grad[qz][qy][qx][0] += gradXY[qy][qx][0] * wz;
1160 grad[qz][qy][qx][1] += gradXY[qy][qx][1] * wz;
1161 grad[qz][qy][qx][2] += gradXY[qy][qx][2] * wDz;
1167 for (
int qz = 0; qz < Q1D; ++qz)
1169 for (
int qy = 0; qy < Q1D; ++qy)
1171 for (
int qx = 0; qx < Q1D; ++qx)
1173 const int q = qx + (qy + qz * Q1D) * Q1D;
1174 const double O11 = D(q,0,e);
1175 const double O12 = D(q,1,e);
1176 const double O13 = D(q,2,e);
1177 const double O22 = D(q,3,e);
1178 const double O23 = D(q,4,e);
1179 const double O33 = D(q,5,e);
1180 const double gradX = grad[qz][qy][qx][0];
1181 const double gradY = grad[qz][qy][qx][1];
1182 const double gradZ = grad[qz][qy][qx][2];
1183 grad[qz][qy][qx][0] = (O11*gradX)+(O12*gradY)+(O13*gradZ);
1184 grad[qz][qy][qx][1] = (O12*gradX)+(O22*gradY)+(O23*gradZ);
1185 grad[qz][qy][qx][2] = (O13*gradX)+(O23*gradY)+(O33*gradZ);
1189 for (
int qz = 0; qz < Q1D; ++qz)
1191 double gradXY[max_D1D][max_D1D][3];
1192 for (
int dy = 0; dy < D1D; ++dy)
1194 for (
int dx = 0; dx < D1D; ++dx)
1196 gradXY[dy][dx][0] = 0;
1197 gradXY[dy][dx][1] = 0;
1198 gradXY[dy][dx][2] = 0;
1201 for (
int qy = 0; qy < Q1D; ++qy)
1203 double gradX[max_D1D][3];
1204 for (
int dx = 0; dx < D1D; ++dx)
1210 for (
int qx = 0; qx < Q1D; ++qx)
1212 const double gX = grad[qz][qy][qx][0];
1213 const double gY = grad[qz][qy][qx][1];
1214 const double gZ = grad[qz][qy][qx][2];
1215 for (
int dx = 0; dx < D1D; ++dx)
1217 const double wx = Bt(dx,qx);
1218 const double wDx = Gt(dx,qx);
1219 gradX[dx][0] += gX * wDx;
1220 gradX[dx][1] += gY * wx;
1221 gradX[dx][2] += gZ * wx;
1224 for (
int dy = 0; dy < D1D; ++dy)
1226 const double wy = Bt(dy,qy);
1227 const double wDy = Gt(dy,qy);
1228 for (
int dx = 0; dx < D1D; ++dx)
1230 gradXY[dy][dx][0] += gradX[dx][0] * wy;
1231 gradXY[dy][dx][1] += gradX[dx][1] * wDy;
1232 gradXY[dy][dx][2] += gradX[dx][2] * wy;
1236 for (
int dz = 0; dz < D1D; ++dz)
1238 const double wz = Bt(dz,qz);
1239 const double wDz = Gt(dz,qz);
1240 for (
int dy = 0; dy < D1D; ++dy)
1242 for (
int dx = 0; dx < D1D; ++dx)
1245 ((gradXY[dy][dx][0] * wz) +
1246 (gradXY[dy][dx][1] * wz) +
1247 (gradXY[dy][dx][2] * wDz));
1256 template<
int T_D1D = 0,
int T_Q1D = 0>
1257 static void SmemPADiffusionApply3D(
const int NE,
1258 const Array<double> &b_,
1259 const Array<double> &g_,
1260 const Array<double> &bt_,
1261 const Array<double> >_,
1268 const int D1D = T_D1D ? T_D1D : d1d;
1269 const int Q1D = T_Q1D ? T_Q1D : q1d;
1270 constexpr
int MQ1 = T_Q1D ? T_Q1D :
MAX_Q1D;
1271 constexpr
int MD1 = T_D1D ? T_D1D :
MAX_D1D;
1272 MFEM_VERIFY(D1D <= MD1,
"");
1273 MFEM_VERIFY(Q1D <= MQ1,
"");
1274 auto b =
Reshape(b_.Read(), Q1D, D1D);
1275 auto g =
Reshape(g_.Read(), Q1D, D1D);
1276 auto d =
Reshape(d_.Read(), Q1D*Q1D*Q1D, 6, NE);
1277 auto x =
Reshape(x_.Read(), D1D, D1D, D1D, NE);
1278 auto y =
Reshape(y_.ReadWrite(), D1D, D1D, D1D, NE);
1279 MFEM_FORALL_3D(e, NE, Q1D, Q1D, Q1D,
1281 const int tidz = MFEM_THREAD_ID(z);
1282 const int D1D = T_D1D ? T_D1D : d1d;
1283 const int Q1D = T_Q1D ? T_Q1D : q1d;
1284 constexpr
int MQ1 = T_Q1D ? T_Q1D :
MAX_Q1D;
1285 constexpr
int MD1 = T_D1D ? T_D1D :
MAX_D1D;
1286 constexpr
int MDQ = MQ1 > MD1 ? MQ1 : MD1;
1287 MFEM_SHARED
double sBG[2][MQ1*MD1];
1288 double (*B)[MD1] = (double (*)[MD1]) (sBG+0);
1289 double (*G)[MD1] = (double (*)[MD1]) (sBG+1);
1290 double (*Bt)[MQ1] = (double (*)[MQ1]) (sBG+0);
1291 double (*Gt)[MQ1] = (double (*)[MQ1]) (sBG+1);
1292 MFEM_SHARED
double sm0[3][MDQ*MDQ*MDQ];
1293 MFEM_SHARED
double sm1[3][MDQ*MDQ*MDQ];
1294 double (*X)[MD1][MD1] = (double (*)[MD1][MD1]) (sm0+2);
1295 double (*DDQ0)[MD1][MQ1] = (double (*)[MD1][MQ1]) (sm0+0);
1296 double (*DDQ1)[MD1][MQ1] = (double (*)[MD1][MQ1]) (sm0+1);
1297 double (*DQQ0)[MQ1][MQ1] = (double (*)[MQ1][MQ1]) (sm1+0);
1298 double (*DQQ1)[MQ1][MQ1] = (double (*)[MQ1][MQ1]) (sm1+1);
1299 double (*DQQ2)[MQ1][MQ1] = (double (*)[MQ1][MQ1]) (sm1+2);
1300 double (*QQQ0)[MQ1][MQ1] = (double (*)[MQ1][MQ1]) (sm0+0);
1301 double (*QQQ1)[MQ1][MQ1] = (double (*)[MQ1][MQ1]) (sm0+1);
1302 double (*QQQ2)[MQ1][MQ1] = (double (*)[MQ1][MQ1]) (sm0+2);
1303 double (*QQD0)[MQ1][MD1] = (double (*)[MQ1][MD1]) (sm1+0);
1304 double (*QQD1)[MQ1][MD1] = (double (*)[MQ1][MD1]) (sm1+1);
1305 double (*QQD2)[MQ1][MD1] = (double (*)[MQ1][MD1]) (sm1+2);
1306 double (*QDD0)[MD1][MD1] = (double (*)[MD1][MD1]) (sm0+0);
1307 double (*QDD1)[MD1][MD1] = (double (*)[MD1][MD1]) (sm0+1);
1308 double (*QDD2)[MD1][MD1] = (double (*)[MD1][MD1]) (sm0+2);
1309 MFEM_FOREACH_THREAD(dz,z,D1D)
1311 MFEM_FOREACH_THREAD(dy,y,D1D)
1313 MFEM_FOREACH_THREAD(dx,x,D1D)
1315 X[dz][dy][dx] = x(dx,dy,dz,e);
1321 MFEM_FOREACH_THREAD(d,y,D1D)
1323 MFEM_FOREACH_THREAD(q,x,Q1D)
1331 MFEM_FOREACH_THREAD(dz,z,D1D)
1333 MFEM_FOREACH_THREAD(dy,y,D1D)
1335 MFEM_FOREACH_THREAD(qx,x,Q1D)
1339 for (
int dx = 0; dx < D1D; ++dx)
1341 const double coords = X[dz][dy][dx];
1342 u += coords * B[qx][dx];
1343 v += coords * G[qx][dx];
1345 DDQ0[dz][dy][qx] = u;
1346 DDQ1[dz][dy][qx] = v;
1351 MFEM_FOREACH_THREAD(dz,z,D1D)
1353 MFEM_FOREACH_THREAD(qy,y,Q1D)
1355 MFEM_FOREACH_THREAD(qx,x,Q1D)
1360 for (
int dy = 0; dy < D1D; ++dy)
1362 u += DDQ1[dz][dy][qx] * B[qy][dy];
1363 v += DDQ0[dz][dy][qx] * G[qy][dy];
1364 w += DDQ0[dz][dy][qx] * B[qy][dy];
1366 DQQ0[dz][qy][qx] = u;
1367 DQQ1[dz][qy][qx] = v;
1368 DQQ2[dz][qy][qx] = w;
1373 MFEM_FOREACH_THREAD(qz,z,Q1D)
1375 MFEM_FOREACH_THREAD(qy,y,Q1D)
1377 MFEM_FOREACH_THREAD(qx,x,Q1D)
1382 for (
int dz = 0; dz < D1D; ++dz)
1384 u += DQQ0[dz][qy][qx] * B[qz][dz];
1385 v += DQQ1[dz][qy][qx] * B[qz][dz];
1386 w += DQQ2[dz][qy][qx] * G[qz][dz];
1388 QQQ0[qz][qy][qx] = u;
1389 QQQ1[qz][qy][qx] = v;
1390 QQQ2[qz][qy][qx] = w;
1395 MFEM_FOREACH_THREAD(qz,z,Q1D)
1397 MFEM_FOREACH_THREAD(qy,y,Q1D)
1399 MFEM_FOREACH_THREAD(qx,x,Q1D)
1401 const int q = qx + ((qy*Q1D) + (qz*Q1D*Q1D));
1402 const double O11 = d(q,0,e);
1403 const double O12 = d(q,1,e);
1404 const double O13 = d(q,2,e);
1405 const double O22 = d(q,3,e);
1406 const double O23 = d(q,4,e);
1407 const double O33 = d(q,5,e);
1408 const double gX = QQQ0[qz][qy][qx];
1409 const double gY = QQQ1[qz][qy][qx];
1410 const double gZ = QQQ2[qz][qy][qx];
1411 QQQ0[qz][qy][qx] = (O11*gX) + (O12*gY) + (O13*gZ);
1412 QQQ1[qz][qy][qx] = (O12*gX) + (O22*gY) + (O23*gZ);
1413 QQQ2[qz][qy][qx] = (O13*gX) + (O23*gY) + (O33*gZ);
1420 MFEM_FOREACH_THREAD(d,y,D1D)
1422 MFEM_FOREACH_THREAD(q,x,Q1D)
1430 MFEM_FOREACH_THREAD(qz,z,Q1D)
1432 MFEM_FOREACH_THREAD(qy,y,Q1D)
1434 MFEM_FOREACH_THREAD(dx,x,D1D)
1439 for (
int qx = 0; qx < Q1D; ++qx)
1441 u += QQQ0[qz][qy][qx] * Gt[dx][qx];
1442 v += QQQ1[qz][qy][qx] * Bt[dx][qx];
1443 w += QQQ2[qz][qy][qx] * Bt[dx][qx];
1445 QQD0[qz][qy][dx] = u;
1446 QQD1[qz][qy][dx] = v;
1447 QQD2[qz][qy][dx] = w;
1452 MFEM_FOREACH_THREAD(qz,z,Q1D)
1454 MFEM_FOREACH_THREAD(dy,y,D1D)
1456 MFEM_FOREACH_THREAD(dx,x,D1D)
1461 for (
int qy = 0; qy < Q1D; ++qy)
1463 u += QQD0[qz][qy][dx] * Bt[dy][qy];
1464 v += QQD1[qz][qy][dx] * Gt[dy][qy];
1465 w += QQD2[qz][qy][dx] * Bt[dy][qy];
1467 QDD0[qz][dy][dx] = u;
1468 QDD1[qz][dy][dx] = v;
1469 QDD2[qz][dy][dx] = w;
1474 MFEM_FOREACH_THREAD(dz,z,D1D)
1476 MFEM_FOREACH_THREAD(dy,y,D1D)
1478 MFEM_FOREACH_THREAD(dx,x,D1D)
1483 for (
int qz = 0; qz < Q1D; ++qz)
1485 u += QDD0[qz][dy][dx] * Bt[dz][qz];
1486 v += QDD1[qz][dy][dx] * Bt[dz][qz];
1487 w += QDD2[qz][dy][dx] * Gt[dz][qz];
1489 y(dx,dy,dz,e) += (u + v + w);
1496 static void PADiffusionApply(
const int dim,
1500 const Array<double> &B,
1501 const Array<double> &G,
1502 const Array<double> &Bt,
1503 const Array<double> &Gt,
1508 #ifdef MFEM_USE_OCCA
1513 OccaPADiffusionApply2D(D1D,Q1D,NE,B,G,Bt,Gt,D,X,Y);
1518 OccaPADiffusionApply3D(D1D,Q1D,NE,B,G,Bt,Gt,D,X,Y);
1521 MFEM_ABORT(
"OCCA PADiffusionApply unknown kernel!");
1523 #endif // MFEM_USE_OCCA
1526 switch ((D1D << 4 ) | Q1D)
1528 case 0x22:
return SmemPADiffusionApply2D<2,2,16>(NE,B,G,Bt,Gt,D,X,Y);
1529 case 0x33:
return SmemPADiffusionApply2D<3,3,16>(NE,B,G,Bt,Gt,D,X,Y);
1530 case 0x44:
return SmemPADiffusionApply2D<4,4,8>(NE,B,G,Bt,Gt,D,X,Y);
1531 case 0x55:
return SmemPADiffusionApply2D<5,5,8>(NE,B,G,Bt,Gt,D,X,Y);
1532 case 0x66:
return SmemPADiffusionApply2D<6,6,4>(NE,B,G,Bt,Gt,D,X,Y);
1533 case 0x77:
return SmemPADiffusionApply2D<7,7,4>(NE,B,G,Bt,Gt,D,X,Y);
1534 case 0x88:
return SmemPADiffusionApply2D<8,8,2>(NE,B,G,Bt,Gt,D,X,Y);
1535 case 0x99:
return SmemPADiffusionApply2D<9,9,2>(NE,B,G,Bt,Gt,D,X,Y);
1536 default:
return PADiffusionApply2D(NE,B,G,Bt,Gt,D,X,Y,D1D,Q1D);
1541 switch ((D1D << 4 ) | Q1D)
1543 case 0x23:
return SmemPADiffusionApply3D<2,3>(NE,B,G,Bt,Gt,D,X,Y);
1544 case 0x34:
return SmemPADiffusionApply3D<3,4>(NE,B,G,Bt,Gt,D,X,Y);
1545 case 0x45:
return SmemPADiffusionApply3D<4,5>(NE,B,G,Bt,Gt,D,X,Y);
1546 case 0x46:
return SmemPADiffusionApply3D<4,6>(NE,B,G,Bt,Gt,D,X,Y);
1547 case 0x56:
return SmemPADiffusionApply3D<5,6>(NE,B,G,Bt,Gt,D,X,Y);
1548 case 0x58:
return SmemPADiffusionApply3D<5,8>(NE,B,G,Bt,Gt,D,X,Y);
1549 case 0x67:
return SmemPADiffusionApply3D<6,7>(NE,B,G,Bt,Gt,D,X,Y);
1550 case 0x78:
return SmemPADiffusionApply3D<7,8>(NE,B,G,Bt,Gt,D,X,Y);
1551 case 0x89:
return SmemPADiffusionApply3D<8,9>(NE,B,G,Bt,Gt,D,X,Y);
1552 default:
return PADiffusionApply3D(NE,B,G,Bt,Gt,D,X,Y,D1D,Q1D);
1555 MFEM_ABORT(
"Unknown kernel.");
1561 #ifdef MFEM_USE_CEED
1562 if (DeviceCanUseCeed())
1564 const CeedScalar *x_ptr;
1567 CeedGetPreferredMemType(internal::ceed, &mem);
1568 if ( Device::Allows(Backend::CUDA) && mem==CEED_MEM_DEVICE )
1577 mem = CEED_MEM_HOST;
1579 CeedVectorSetArray(ceedDataPtr->u, mem, CEED_USE_POINTER,
1580 const_cast<CeedScalar*>(x_ptr));
1581 CeedVectorSetArray(ceedDataPtr->v, mem, CEED_USE_POINTER, y_ptr);
1583 CeedOperatorApplyAdd(ceedDataPtr->oper, ceedDataPtr->u, ceedDataPtr->v,
1584 CEED_REQUEST_IMMEDIATE);
1586 CeedVectorSyncArray(ceedDataPtr->v, mem);
1591 PADiffusionApply(dim, dofs1D, quad1D, ne,
1592 maps->B, maps->G, maps->Bt, maps->Gt,
int GetNPoints() const
Returns the number of the points in the integration rule.
Abstract class for Finite Elements.
int Size() const
Logical size of the array.
double * HostReadWrite()
Shortcut for mfem::ReadWrite(vec.GetMemory(), vec.Size(), false).
int GetDim() const
Returns the reference space dimension for the finite element.
Class for an integration rule - an Array of IntegrationPoint.
Memory< T > & GetMemory()
Return a reference to the Memory object used by the Array.
Subclass constant coefficient.
void SetSize(int s)
Resize the vector to size s.
occa::device & OccaDev()
Return the default occa::device used by MFEM.
int Size() const
Returns the size of the vector.
int ndof
Number of degrees of freedom = number of basis functions. When mode is TENSOR, this is the 1D number...
int GetNE() const
Returns number of elements.
const Array< double > & GetWeights() const
Return the quadrature weights in a contiguous array.
std::map< occa_id_t, occa::kernel > occa_kernel_t
Memory< double > & GetMemory()
Return a reference to the Memory object used by the Vector.
occa::memory OccaMemoryReadWrite(Memory< T > &mem, size_t size)
Wrap a Memory object as occa::memory for read-write access with the mfem::Device MemoryClass. The returned occa::memory is associated with the default occa::device used by MFEM.
DeviceTensor< sizeof...(Dims), T > Reshape(T *ptr, Dims...dims)
Wrap a pointer as a DeviceTensor with automatically deduced template parameters.
int GetNE() const
Returns number of elements in the mesh.
const double * HostRead() const
Shortcut for mfem::Read(vec.GetMemory(), vec.Size(), false).
IntegrationPoint & IntPoint(int i)
Returns a reference to the i-th integration point.
const GeometricFactors * GetGeometricFactors(const IntegrationRule &ir, const int flags)
Return the mesh geometric factors corresponding to the given integration rule.
double * ReadWrite(bool on_dev=true)
Shortcut for mfem::ReadWrite(vec.GetMemory(), vec.Size(), on_dev).
Mesh * GetMesh() const
Returns the mesh.
bool DeviceCanUseOcca()
Function that determines if an OCCA kernel should be used, based on the current mfem::Device configur...
const occa::memory OccaMemoryRead(const Memory< T > &mem, size_t size)
Wrap a Memory object as occa::memory for read only access with the mfem::Device MemoryClass. The returned occa::memory is associated with the default occa::device used by MFEM.
const T * Read(bool on_dev=true) const
Shortcut for mfem::Read(a.GetMemory(), a.Size(), on_dev).
ElementTransformation * GetElementTransformation(int i) const
Returns ElementTransformation for the i-th element.
Class FiniteElementSpace - responsible for providing FEM view of the mesh, mainly managing the set of...
occa::memory OccaMemoryWrite(Memory< T > &mem, size_t size)
Wrap a Memory object as occa::memory for write only access with the mfem::Device MemoryClass. The returned occa::memory is associated with the default occa::device used by MFEM.
virtual const DofToQuad & GetDofToQuad(const IntegrationRule &ir, DofToQuad::Mode mode) const
const double * Read(bool on_dev=true) const
Shortcut for mfem::Read(vec.GetMemory(), vec.Size(), on_dev).
const FiniteElement * GetFE(int i) const
Returns pointer to the FiniteElement associated with i'th element.
double * HostWrite()
Shortcut for mfem::Write(vec.GetMemory(), vec.Size(), false).
std::pair< int, int > occa_id_t