12#ifndef MFEM_FEM_KERNELS_HPP
13#define MFEM_FEM_KERNELS_HPP
34#if ((defined(MFEM_USE_CUDA) && defined(__CUDA_ARCH__)) || \
35 (defined(MFEM_USE_HIP) && defined(__HIP_DEVICE_COMPILE__)))
39template <
int VDIM,
int N>
42template <
int VDIM,
int DIM,
int N = 0>
48template <
int VDIM,
int N>
51template <
int VDIM,
int DIM,
int N>
55constexpr int SetMaxOf(
int n) {
return n; }
60template <
int VDIM,
int N>
63template <
int VDIM,
int DIM,
int N>
69template <
int VDIM,
int N>
72template <
int VDIM,
int DIM,
int N>
77constexpr int NextMultipleOf(
int n)
79 static_assert(N > 0 && (N & (N - 1)) == 0,
"N must be a power of 2");
80 return (n + (N - 1)) & ~(N - 1);
82constexpr int SetMaxOf(
int n) {
return NextMultipleOf<4>(n); }
87inline MFEM_HOST_DEVICE
void LoadMatrix(
const int d1d,
const int q1d,
90 MFEM_FOREACH_THREAD_DIRECT(dy, y, d1d)
92 MFEM_FOREACH_THREAD_DIRECT(qx, x, q1d)
94 N[dy][qx] = M[dy * q1d + qx];
101template <
int VDIM,
int DIM,
int MQ1 = 0>
102inline MFEM_HOST_DEVICE
void LoadDofs2d(
const int e,
const int d1d,
const int c,
103 const DeviceTensor<4, const real_t> &X,
104 vd_regs2d_t<VDIM, DIM, MQ1> &Y)
106 for (
int d = 0; d <
DIM; d++)
108 MFEM_FOREACH_THREAD_DIRECT(dy, y, d1d)
110 MFEM_FOREACH_THREAD_DIRECT(dx, x, d1d)
112 Y[c][d][dy][dx] = X(dx, dy, c, e);
120template <
int VDIM,
int DIM,
int MQ1 = 0>
121inline MFEM_HOST_DEVICE
void LoadDofs2d(
const int e,
const int d1d,
122 const DeviceTensor<4, const real_t> &X,
123 vd_regs2d_t<VDIM, DIM, MQ1> &Y)
125 for (
int c = 0; c < VDIM; ++c) { LoadDofs2d(e, d1d, c, X, Y); }
129template <
int VDIM,
int MQ1 = 0>
130inline MFEM_HOST_DEVICE
void LoadDofs2d(
const int e,
const int d1d,
131 const DeviceTensor<4, const real_t> &X,
132 v_regs2d_t<VDIM, MQ1> &Y)
134 for (
int c = 0; c < VDIM; ++c)
136 MFEM_FOREACH_THREAD_DIRECT(dy, y, d1d)
138 MFEM_FOREACH_THREAD_DIRECT(dx, x, d1d)
140 Y[c][dy][dx] = X(dx, dy, c, e);
148template <
int MQ1 = 0>
149inline MFEM_HOST_DEVICE
void LoadDofs2d(
const int e,
const int d1d,
150 const DeviceTensor<3, const real_t> &X,
153 MFEM_FOREACH_THREAD_DIRECT(dy, y, d1d)
155 MFEM_FOREACH_THREAD_DIRECT(dx, x, d1d)
157 Y[dy][dx] = X(dx, dy, e);
164template <
int VDIM,
int DIM,
int MQ1 = 0>
165inline MFEM_HOST_DEVICE
void WriteDofs2d(
const int e,
const int d1d,
166 const int i,
const int j,
167 vd_regs2d_t<VDIM, DIM, MQ1> &X,
168 const DeviceTensor<4, real_t> &Y)
170 MFEM_FOREACH_THREAD_DIRECT(dy, y, d1d)
172 MFEM_FOREACH_THREAD_DIRECT(dx, x, d1d)
175 for (
int d = 0; d <
DIM; d++) { y += X(i, d, dy, dx); }
176 Y(dx, dy, j, e) += y;
183template <
int VDIM,
int DIM,
int MQ1 = 0>
184inline MFEM_HOST_DEVICE
void WriteDofs2d(
const int e,
const int d1d,
185 vd_regs2d_t<VDIM, DIM, MQ1> &X,
186 const DeviceTensor<4, real_t> &Y)
188 for (
int c = 0; c < VDIM; ++c) { WriteDofs2d(e, d1d, c, c, X, Y); }
192template <
int VDIM,
int MQ1 = 0>
193inline MFEM_HOST_DEVICE
void WriteDofs2d(
const int e,
const int d1d,
194 v_regs2d_t<VDIM, MQ1> &X,
195 const DeviceTensor<4, real_t> &Y)
197 for (
int c = 0; c < VDIM; ++c)
199 MFEM_FOREACH_THREAD_DIRECT(dy, y, d1d)
201 MFEM_FOREACH_THREAD_DIRECT(dx, x, d1d)
203 Y(dx, dy, c, e) += X(c, dy, dx);
211template <
int VDIM,
int DIM,
int MQ1>
212inline MFEM_HOST_DEVICE
void LoadDofs3d(
const int e,
const int d1d,
const int c,
213 const DeviceTensor<5, const real_t> &X,
214 vd_regs3d_t<VDIM, DIM, MQ1> &Y)
216 for (
int d = 0; d <
DIM; d++)
218 for (
int dz = 0; dz < d1d; ++dz)
220 MFEM_FOREACH_THREAD_DIRECT(dy, y, d1d)
222 MFEM_FOREACH_THREAD_DIRECT(dx, x, d1d)
224 Y[c][d][dz][dy][dx] = X(dx, dy, dz, c, e);
233template <
int VDIM,
int DIM,
int MQ1>
234inline MFEM_HOST_DEVICE
void LoadDofs3d(
const int e,
const int d1d,
235 const DeviceTensor<5, const real_t> &X,
236 vd_regs3d_t<VDIM, DIM, MQ1> &Y)
238 for (
int c = 0; c < VDIM; ++c) { LoadDofs3d(e, d1d, c, X, Y); }
242template <
int VDIM,
int MQ1>
243inline MFEM_HOST_DEVICE
void LoadDofs3d(
const int e,
const int d1d,
244 const DeviceTensor<5, const real_t> &X,
245 v_regs3d_t<VDIM, MQ1> &Y)
247 for (
int c = 0; c < VDIM; ++c)
249 for (
int dz = 0; dz < d1d; ++dz)
251 MFEM_FOREACH_THREAD_DIRECT(dy, y, d1d)
253 MFEM_FOREACH_THREAD_DIRECT(dx, x, d1d)
255 Y[c][dz][dy][dx] = X(dx,dy,dz,c,e);
265inline MFEM_HOST_DEVICE
void LoadDofs3d(
const int e,
const int d1d,
266 const DeviceTensor<4, const real_t> &X,
269 for (
int dz = 0; dz < d1d; ++dz)
271 MFEM_FOREACH_THREAD_DIRECT(dy, y, d1d)
273 MFEM_FOREACH_THREAD_DIRECT(dx, x, d1d)
275 Y[dz][dy][dx] = X(dx,dy,dz,e);
283template <
int VDIM,
int DIM,
int MQ1>
284inline MFEM_HOST_DEVICE
void WriteDofs3d(
const int e,
const int d1d,
285 const int i,
const int j,
286 vd_regs3d_t<VDIM, DIM, MQ1> &X,
287 const DeviceTensor<5, real_t> &Y)
289 for (
int dz = 0; dz < d1d; ++dz)
291 MFEM_FOREACH_THREAD_DIRECT(dy, y, d1d)
293 MFEM_FOREACH_THREAD_DIRECT(dx, x, d1d)
296 for (
int d = 0; d <
DIM; d++) { value += X(i, d, dz, dy, dx); }
297 Y(dx, dy, dz, j, e) += value;
305template <
int VDIM,
int DIM,
int MQ1>
306inline MFEM_HOST_DEVICE
void WriteDofs3d(
const int e,
const int d1d,
307 vd_regs3d_t<VDIM, DIM, MQ1> &X,
308 const DeviceTensor<5, real_t> &Y)
310 for (
int c = 0; c < VDIM; ++c) { WriteDofs3d(e, d1d, c, c, X, Y); }
314template <
int VDIM,
int MQ1>
315inline MFEM_HOST_DEVICE
void WriteDofs3d(
const int e,
const int d1d,
316 v_regs3d_t<VDIM, MQ1> &X,
317 const DeviceTensor<5, real_t> &Y)
319 for (
int c = 0; c < VDIM; ++c)
321 for (
int dz = 0; dz < d1d; ++dz)
323 MFEM_FOREACH_THREAD_DIRECT(dy, y, d1d)
325 MFEM_FOREACH_THREAD_DIRECT(dx, x, d1d)
327 Y(dx, dy, dz, c, e) += X(c, dz, dy, dx);
336template <
bool Transpose,
int MQ1>
337inline MFEM_HOST_DEVICE
void ContractX2d(
const int d1d,
const int q1d,
340 const s_regs2d_t<MQ1> &X,
343 MFEM_FOREACH_THREAD_DIRECT(y, y, d1d)
345 MFEM_FOREACH_THREAD_DIRECT(x, x, (
Transpose ? q1d : d1d))
347 smem[y][x] = X[y][x];
351 MFEM_FOREACH_THREAD_DIRECT(y, y, d1d)
353 MFEM_FOREACH_THREAD_DIRECT(x, x, (
Transpose ? d1d : q1d))
356 for (
int k = 0; k < (
Transpose ? q1d : d1d); ++k)
358 u += (
Transpose ? B[x][k] : B[k][x]) * smem[y][k];
367template <
bool Transpose,
int MQ1>
368inline MFEM_HOST_DEVICE
void ContractY2d(
const int d1d,
const int q1d,
371 const s_regs2d_t<MQ1> &X,
374 MFEM_FOREACH_THREAD_DIRECT(y, y, (
Transpose ? q1d : d1d))
376 MFEM_FOREACH_THREAD_DIRECT(x, x, q1d) { smem[y][x] = X[y][x]; }
379 MFEM_FOREACH_THREAD_DIRECT(y, y, (
Transpose ? d1d : q1d))
381 MFEM_FOREACH_THREAD_DIRECT(x, x, q1d)
384 for (
int k = 0; k < (
Transpose ? q1d : d1d); ++k)
386 u += (
Transpose ? B[y][k] : B[k][y]) * smem[k][x];
395template <
int MQ1 = 0>
396inline MFEM_HOST_DEVICE
void Copy2d(
const int q1d,
400 MFEM_FOREACH_THREAD_DIRECT(y, y, q1d)
402 MFEM_FOREACH_THREAD_DIRECT(x, x, q1d) { Y[y][x] = X[y][x]; }
408template <
bool Transpose,
int MQ1>
409inline MFEM_HOST_DEVICE
void Contract2d(
const int d1d,
const int q1d,
418 ContractX2d<false>(d1d, q1d, smem, Bx, X, Y);
419 ContractY2d<false>(d1d, q1d, smem, By, Y, X);
425 ContractY2d<true>(d1d, q1d, smem, By, Y, X);
426 ContractX2d<true>(d1d, q1d, smem, Bx, X, Y);
431template <
int MQ1,
bool Transpose = false>
432inline MFEM_HOST_DEVICE
void Eval2d(
const int d1d,
const int q1d,
438 Contract2d<Transpose, MQ1>(d1d, q1d, smem, B, B, X, Y);
442template <
int VDIM,
int MQ1,
bool Transpose = false>
443inline MFEM_HOST_DEVICE
void Eval2d(
const int d1d,
const int q1d,
446 v_regs2d_t<VDIM, MQ1> &X,
447 v_regs2d_t<VDIM, MQ1> &Y)
449 for (
int c = 0; c < VDIM; c++)
451 Eval2d<MQ1, Transpose>(d1d, q1d, smem, B, X[c], Y[c]);
456template <
int VDIM,
int MQ1>
457inline MFEM_HOST_DEVICE
void EvalTranspose2d(
const int d1d,
const int q1d,
460 v_regs2d_t<VDIM, MQ1> &X,
461 v_regs2d_t<VDIM, MQ1> &Y)
463 Eval2d<VDIM, MQ1, true>(d1d, q1d, smem, B, X, Y);
467template <
int VDIM,
int DIM,
int MQ1,
bool Transpose = false>
468inline MFEM_HOST_DEVICE
void Grad2d(
const int d1d,
const int q1d,
472 vd_regs2d_t<VDIM, DIM, MQ1> &X,
473 vd_regs2d_t<VDIM, DIM, MQ1> &Y,
477 for (
int d = 0; d <
DIM; d++)
479 const real_t (*Bx)[MQ1] = (d == 0) ? G : B;
480 const real_t (*By)[MQ1] = (d == 1) ? G : B;
481 Contract2d<Transpose>(d1d, q1d, smem, Bx, By, X[c][d], Y[c][d]);
487template <
int VDIM,
int DIM,
int MQ1,
bool Transpose = false>
488inline MFEM_HOST_DEVICE
void Grad2d(
const int d1d,
const int q1d,
492 vd_regs2d_t<VDIM, DIM, MQ1> &X,
493 vd_regs2d_t<VDIM, DIM, MQ1> &Y)
495 for (
int c = 0; c < VDIM; ++c)
497 Grad2d<VDIM, DIM, MQ1, Transpose>(d1d, q1d, smem, B, G, X, Y, c);
502template <
int VDIM,
int DIM,
int MQ1>
503inline MFEM_HOST_DEVICE
void GradTranspose2d(
const int d1d,
const int q1d,
507 vd_regs2d_t<VDIM, DIM, MQ1> &X,
508 vd_regs2d_t<VDIM, DIM, MQ1> &Y)
511 Grad2d<VDIM, DIM, MQ1, Transpose>(d1d, q1d, smem, B, G, X, Y);
515template <
int VDIM,
int DIM,
int MQ1>
516inline MFEM_HOST_DEVICE
void GradTranspose2d(
const int d1d,
const int q1d,
520 vd_regs2d_t<VDIM, DIM, MQ1> &X,
521 vd_regs2d_t<VDIM, DIM, MQ1> &Y,
525 Grad2d<VDIM, DIM, MQ1, Transpose>(d1d, q1d, smem, B, G, X, Y, c);
529template <
bool Transpose,
int MQ1>
530inline MFEM_HOST_DEVICE
void ContractX3d(
const int d1d,
const int q1d,
533 const s_regs3d_t<MQ1> &X,
536 for (
int z = 0; z < d1d; ++z)
538 MFEM_FOREACH_THREAD_DIRECT(y, y, d1d)
540 MFEM_FOREACH_THREAD_DIRECT(x, x, (
Transpose ? q1d : d1d))
542 smem[y][x] = X[z][y][x];
546 MFEM_FOREACH_THREAD_DIRECT(y, y, d1d)
548 MFEM_FOREACH_THREAD_DIRECT(x, x, (
Transpose ? d1d : q1d))
551 for (
int k = 0; k < (
Transpose ? q1d : d1d); ++k)
553 u += (
Transpose ? B[x][k] : B[k][x]) * smem[y][k];
563template <
bool Transpose,
int MQ1>
564inline MFEM_HOST_DEVICE
void ContractY3d(
const int d1d,
const int q1d,
567 const s_regs3d_t<MQ1> &X,
570 for (
int z = 0; z < d1d; ++z)
572 MFEM_FOREACH_THREAD_DIRECT(y, y, (
Transpose ? q1d : d1d))
574 MFEM_FOREACH_THREAD_DIRECT(x, x, q1d) { smem[y][x] = X[z][y][x]; }
577 MFEM_FOREACH_THREAD_DIRECT(y, y, (
Transpose ? d1d : q1d))
579 MFEM_FOREACH_THREAD_DIRECT(x, x, q1d)
582 for (
int k = 0; k < (
Transpose ? q1d : d1d); ++k)
584 u += (
Transpose ? B[y][k] : B[k][y]) * smem[k][x];
594template <
bool Transpose,
int MQ1>
595inline MFEM_HOST_DEVICE
void ContractZ3d(
const int d1d,
const int q1d,
597 const s_regs3d_t<MQ1> &X,
600 for (
int z = 0; z < (
Transpose ? d1d : q1d); ++z)
602 MFEM_FOREACH_THREAD_DIRECT(y, y, q1d)
604 MFEM_FOREACH_THREAD_DIRECT(x, x, q1d)
607 for (
int k = 0; k < (
Transpose ? q1d : d1d); ++k)
609 u += (
Transpose ? B[z][k] : B[k][z]) * X[k][y][x];
618template <
bool Transpose,
int MQ1>
619inline MFEM_HOST_DEVICE
void Contract3d(
const int d1d,
const int q1d,
629 ContractX3d<false>(d1d, q1d, smem, Bx, X, Y);
630 ContractY3d<false>(d1d, q1d, smem, By, Y, X);
631 ContractZ3d<false>(d1d, q1d, Bz, X, Y);
635 ContractZ3d<true>(d1d, q1d, Bz, X, Y);
636 ContractY3d<true>(d1d, q1d, smem, By, Y, X);
637 ContractX3d<true>(d1d, q1d, smem, Bx, X, Y);
642template <
int MQ1,
bool Transpose = false>
643inline MFEM_HOST_DEVICE
void Eval3d(
const int d1d,
const int q1d,
649 Contract3d<Transpose>(d1d, q1d, smem, B, B, B, X, Y);
653template <
int VDIM,
int MQ1,
bool Transpose = false>
654inline MFEM_HOST_DEVICE
void Eval3d(
const int d1d,
const int q1d,
657 v_regs3d_t<VDIM, MQ1> &X,
658 v_regs3d_t<VDIM, MQ1> &Y)
660 for (
int c = 0; c < VDIM; c++)
662 Eval3d<MQ1, Transpose>(d1d, q1d, smem, B, X[c], Y[c]);
667template <
int VDIM,
int MQ1>
668inline MFEM_HOST_DEVICE
void EvalTranspose3d(
const int d1d,
const int q1d,
671 v_regs3d_t<VDIM, MQ1> &X,
672 v_regs3d_t<VDIM, MQ1> &Y)
674 Eval3d<VDIM, MQ1, true>(d1d, q1d, smem, B, X, Y);
678template <
int VDIM,
int DIM,
int MQ1,
bool Transpose = false>
679inline MFEM_HOST_DEVICE
void Grad3d(
const int d1d,
const int q1d,
683 vd_regs3d_t<VDIM, DIM, MQ1> &X,
684 vd_regs3d_t<VDIM, DIM, MQ1> &Y,
687 for (
int d = 0; d <
DIM; d++)
689 const real_t (*Bx)[MQ1] = (d == 0) ? G : B;
690 const real_t (*By)[MQ1] = (d == 1) ? G : B;
691 const real_t (*Bz)[MQ1] = (d == 2) ? G : B;
692 Contract3d<Transpose>(d1d, q1d, smem, Bx, By, Bz, X[c][d], Y[c][d]);
697template <
int VDIM,
int DIM,
int MQ1,
bool Transpose = false>
698inline MFEM_HOST_DEVICE
void Grad3d(
const int d1d,
const int q1d,
702 vd_regs3d_t<VDIM, DIM, MQ1> &X,
703 vd_regs3d_t<VDIM, DIM, MQ1> &Y)
705 for (
int c = 0; c < VDIM; c++)
707 Grad3d<VDIM, DIM, MQ1, Transpose>(d1d, q1d, smem, B, G, X, Y, c);
712template <
int VDIM,
int DIM,
int MQ1>
713inline MFEM_HOST_DEVICE
void GradTranspose3d(
const int d1d,
const int q1d,
717 vd_regs3d_t<VDIM, DIM, MQ1> &X,
718 vd_regs3d_t<VDIM, DIM, MQ1> &Y)
720 Grad3d<VDIM, DIM, MQ1, true>(d1d, q1d, smem, B, G, X, Y);
724template <
int VDIM,
int DIM,
int MQ1>
725inline MFEM_HOST_DEVICE
void GradTranspose3d(
const int d1d,
const int q1d,
729 vd_regs3d_t<VDIM, DIM, MQ1> &X,
730 vd_regs3d_t<VDIM, DIM, MQ1> &Y,
733 Grad3d<VDIM, DIM, MQ1, true>(d1d, q1d, smem, B, G, X, Y, c);
737template<
int MD1,
int MQ1>
738MFEM_HOST_DEVICE
inline void LoadB(
const int D1D,
const int Q1D,
742 const int tidz = MFEM_THREAD_ID(z);
747 MFEM_FOREACH_THREAD(d,y,D1D)
749 MFEM_FOREACH_THREAD(q,x,Q1D)
759template<
int MD1,
int MQ1>
760MFEM_HOST_DEVICE
inline void LoadBt(
const int D1D,
const int Q1D,
764 const int tidz = MFEM_THREAD_ID(z);
769 MFEM_FOREACH_THREAD(d,y,D1D)
771 MFEM_FOREACH_THREAD(q,x,Q1D)
781template<
int MD1,
int MQ1>
782MFEM_HOST_DEVICE
inline void LoadBG(
const int D1D,
const int Q1D,
785 real_t (&sBG)[2][MQ1*MD1])
787 const int tidz = MFEM_THREAD_ID(z);
793 MFEM_FOREACH_THREAD(d,y,D1D)
795 MFEM_FOREACH_THREAD(q,x,Q1D)
806template<
int MD1,
int MQ1>
807MFEM_HOST_DEVICE
inline void LoadBGt(
const int D1D,
const int Q1D,
810 real_t (&sBG)[2][MQ1*MD1])
812 const int tidz = MFEM_THREAD_ID(z);
818 MFEM_FOREACH_THREAD(d,y,D1D)
820 MFEM_FOREACH_THREAD(q,x,Q1D)
831MFEM_HOST_DEVICE
inline void LoadX(
const int e,
const int D1D,
832 const DeviceTensor<3, const real_t> &x,
835 MFEM_FOREACH_THREAD(dy,y,D1D)
837 MFEM_FOREACH_THREAD(dx,x,D1D)
839 DD(dx,dy) = x(dx,dy,e);
847template<
int MD1,
int NBZ>
848MFEM_HOST_DEVICE
inline void LoadX(
const int e,
const int D1D,
849 const DeviceTensor<3, const real_t> &x,
850 real_t (&sX)[NBZ][MD1*MD1])
852 const int tidz = MFEM_THREAD_ID(z);
858MFEM_HOST_DEVICE
inline void LoadX(
const int e,
const int D1D,
const int c,
859 const DeviceTensor<4, const real_t> &x,
862 MFEM_FOREACH_THREAD(dy,y,D1D)
864 MFEM_FOREACH_THREAD(dx,x,D1D)
866 DD(dx,dy) = x(dx,dy,c,e);
872template<
int MD1,
int NBZ>
873MFEM_HOST_DEVICE
inline void LoadX(
const int e,
const int D1D,
const int c,
874 const DeviceTensor<4, const real_t> &x,
875 real_t (&sm)[NBZ][MD1*MD1])
877 const int tidz = MFEM_THREAD_ID(z);
883MFEM_HOST_DEVICE
inline void EvalX(
const int D1D,
const int Q1D,
888 MFEM_FOREACH_THREAD(dy,y,D1D)
890 MFEM_FOREACH_THREAD(qx,x,Q1D)
893 for (
int dx = 0; dx < D1D; ++dx)
895 u += B(dx,qx) * DD(dx,dy);
903template<
int MD1,
int MQ1,
int NBZ>
904MFEM_HOST_DEVICE
inline void EvalX(
const int D1D,
const int Q1D,
905 const real_t (&sB)[MQ1*MD1],
906 real_t (&sDD)[NBZ][MD1*MD1],
907 real_t (&sDQ)[NBZ][MD1*MQ1])
909 const int tidz = MFEM_THREAD_ID(z);
913 EvalX(D1D,Q1D,B,DD,DQ);
917MFEM_HOST_DEVICE
inline void EvalY(
const int D1D,
const int Q1D,
922 MFEM_FOREACH_THREAD(qy,y,Q1D)
924 MFEM_FOREACH_THREAD(qx,x,Q1D)
927 for (
int dy = 0; dy < D1D; ++dy)
929 u += DQ(dy,qx) * B(dy,qy);
937template<
int MD1,
int MQ1,
int NBZ>
938MFEM_HOST_DEVICE
inline void EvalY(
const int D1D,
const int Q1D,
939 const real_t (&sB)[MQ1*MD1],
940 real_t (&sDQ)[NBZ][MD1*MQ1],
941 real_t (&sQQ)[NBZ][MQ1*MQ1])
943 const int tidz = MFEM_THREAD_ID(z);
947 EvalY(D1D,Q1D,B,DQ,QQ);
951MFEM_HOST_DEVICE
inline void PullEval(
const int qx,
const int qy,
958template<
int MQ1,
int NBZ>
959MFEM_HOST_DEVICE
inline void PullEval(
const int Q1D,
960 const int qx,
const int qy,
961 real_t (&sQQ)[NBZ][MQ1*MQ1],
964 const int tidz = MFEM_THREAD_ID(z);
966 PullEval(qx,qy,QQ,P);
970template<
int MD1,
int NBZ>
971MFEM_HOST_DEVICE
inline void LoadX(
const int e,
const int D1D,
972 const DeviceTensor<4, const real_t> &X,
973 real_t (&sX)[2][NBZ][MD1*MD1])
975 const int tidz = MFEM_THREAD_ID(z);
979 MFEM_FOREACH_THREAD(dy,y,D1D)
981 MFEM_FOREACH_THREAD(dx,x,D1D)
983 X0(dx,dy) = X(dx,dy,0,e);
984 X1(dx,dy) = X(dx,dy,1,e);
991template<
int MD1,
int MQ1,
int NBZ>
992MFEM_HOST_DEVICE
inline void EvalX(
const int D1D,
const int Q1D,
993 const real_t (&sB)[MQ1*MD1],
994 const real_t (&sX)[2][NBZ][MD1*MD1],
995 real_t (&sDQ)[2][NBZ][MD1*MQ1])
997 const int tidz = MFEM_THREAD_ID(z);
1004 MFEM_FOREACH_THREAD(dy,y,D1D)
1006 MFEM_FOREACH_THREAD(qx,x,Q1D)
1009 for (
int dx = 0; dx < D1D; ++dx)
1011 const real_t xx = X0(dx,dy);
1012 const real_t xy = X1(dx,dy);
1013 u[0] += B(dx,qx) * xx;
1014 u[1] += B(dx,qx) * xy;
1024template<
int MD1,
int MQ1,
int NBZ>
1025MFEM_HOST_DEVICE
inline void EvalY(
const int D1D,
const int Q1D,
1026 const real_t (&sB)[MQ1*MD1],
1027 const real_t (&sDQ)[2][NBZ][MD1*MQ1],
1028 real_t (&sQQ)[2][NBZ][MQ1*MQ1])
1030 const int tidz = MFEM_THREAD_ID(z);
1037 MFEM_FOREACH_THREAD(qy,y,Q1D)
1039 MFEM_FOREACH_THREAD(qx,x,Q1D)
1042 for (
int dy = 0; dy < D1D; ++dy)
1044 u[0] += DQ0(qx,dy) * B(dy,qy);
1045 u[1] += DQ1(qx,dy) * B(dy,qy);
1055template<
int MQ1,
int NBZ>
1056MFEM_HOST_DEVICE
inline void PullEval(
const int Q1D,
1057 const int qx,
const int qy,
1058 const real_t (&sQQ)[2][NBZ][MQ1*MQ1],
1061 const int tidz = MFEM_THREAD_ID(z);
1070template<
int MQ1,
int NBZ>
1071MFEM_HOST_DEVICE
inline void PushEval(
const int Q1D,
1072 const int qx,
const int qy,
1074 real_t (&sQQ)[2][NBZ][MQ1*MQ1])
1076 const int tidz = MFEM_THREAD_ID(z);
1085template<
int MD1,
int MQ1,
int NBZ>
1086MFEM_HOST_DEVICE
inline void EvalXt(
const int D1D,
const int Q1D,
1087 const real_t (&sB)[MQ1*MD1],
1088 const real_t (&sQQ)[2][NBZ][MQ1*MQ1],
1089 real_t (&sDQ)[2][NBZ][MD1*MQ1])
1091 const int tidz = MFEM_THREAD_ID(z);
1098 MFEM_FOREACH_THREAD(qy,y,Q1D)
1100 MFEM_FOREACH_THREAD(dx,x,D1D)
1103 for (
int qx = 0; qx < Q1D; ++qx)
1105 u[0] += QQ0(qx,qy) * Bt(qx,dx);
1106 u[1] += QQ1(qx,qy) * Bt(qx,dx);
1116template<
int MD1,
int MQ1,
int NBZ>
1117MFEM_HOST_DEVICE
inline void EvalYt(
const int D1D,
const int Q1D,
1118 const real_t (&sB)[MQ1*MD1],
1119 const real_t (&sDQ)[2][NBZ][MD1*MQ1],
1120 const DeviceTensor<4> &Y,
1123 const int tidz = MFEM_THREAD_ID(z);
1128 MFEM_FOREACH_THREAD(dy,y,D1D)
1130 MFEM_FOREACH_THREAD(dx,x,D1D)
1133 for (
int qy = 0; qy < Q1D; ++qy)
1135 u[0] += Bt(qy,dy) * DQ0(qy,dx);
1136 u[1] += Bt(qy,dy) * DQ1(qy,dx);
1138 Y(dx,dy,0,e) +=
u[0];
1139 Y(dx,dy,1,e) +=
u[1];
1146template<
int MD1,
int MQ1,
int NBZ>
1147MFEM_HOST_DEVICE
inline void GradX(
const int D1D,
const int Q1D,
1148 const real_t (&sBG)[2][MQ1*MD1],
1149 const real_t (&sX)[2][NBZ][MD1*MD1],
1150 real_t (&sDQ)[4][NBZ][MD1*MQ1])
1152 const int tidz = MFEM_THREAD_ID(z);
1162 MFEM_FOREACH_THREAD(dy,y,D1D)
1164 MFEM_FOREACH_THREAD(qx,x,Q1D)
1167 real_t v[2] = {0.0, 0.0};
1168 for (
int dx = 0; dx < D1D; ++dx)
1170 const real_t Bx = B(dx,qx);
1171 const real_t Gx = G(dx,qx);
1172 const real_t x0 = X0(dx,dy);
1173 const real_t x1 = X1(dx,dy);
1189template<
int MD1,
int MQ1,
int NBZ>
1190MFEM_HOST_DEVICE
inline void GradY(
const int D1D,
const int Q1D,
1191 const real_t (&sBG)[2][MQ1*MD1],
1192 const real_t (&sDQ)[4][NBZ][MD1*MQ1],
1193 real_t (&sQQ)[4][NBZ][MQ1*MQ1])
1195 const int tidz = MFEM_THREAD_ID(z);
1207 MFEM_FOREACH_THREAD(qy,y,Q1D)
1209 MFEM_FOREACH_THREAD(qx,x,Q1D)
1212 real_t v[2] = {0.0, 0.0};
1213 for (
int dy = 0; dy < D1D; ++dy)
1215 const real_t By = B(dy,qy);
1216 const real_t Gy = G(dy,qy);
1217 u[0] += X0G(qx,dy) * By;
1218 v[0] += X0B(qx,dy) * Gy;
1219 u[1] += X1G(qx,dy) * By;
1220 v[1] += X1B(qx,dy) * Gy;
1232template<
int MQ1,
int NBZ>
1233MFEM_HOST_DEVICE
inline void PullGrad(
const int Q1D,
1234 const int qx,
const int qy,
1235 const real_t (&sQQ)[4][NBZ][MQ1*MQ1],
1238 const int tidz = MFEM_THREAD_ID(z);
1244 Jpr[0] = X0GB(qx,qy);
1245 Jpr[1] = X1GB(qx,qy);
1246 Jpr[2] = X0BG(qx,qy);
1247 Jpr[3] = X1BG(qx,qy);
1251template<
int MQ1,
int NBZ>
1252MFEM_HOST_DEVICE
inline void PushGrad(
const int Q1D,
1253 const int qx,
const int qy,
1255 real_t (&sQQ)[4][NBZ][MQ1*MQ1])
1257 const int tidz = MFEM_THREAD_ID(z);
1270template<
int MD1,
int MQ1,
int NBZ>
1271MFEM_HOST_DEVICE
inline void GradYt(
const int D1D,
const int Q1D,
1272 const real_t (&sBG)[2][MQ1*MD1],
1273 const real_t (&GQ)[4][NBZ][MQ1*MQ1],
1274 real_t (&GD)[4][NBZ][MD1*MQ1])
1276 const int tidz = MFEM_THREAD_ID(z);
1288 MFEM_FOREACH_THREAD(qy,y,Q1D)
1290 MFEM_FOREACH_THREAD(dx,x,D1D)
1293 real_t v[2] = {0.0, 0.0};
1294 for (
int qx = 0; qx < Q1D; ++qx)
1296 u[0] += Gt(qx,dx) * QQx0(qx,qy);
1297 u[1] += Gt(qx,dx) * QQy0(qx,qy);
1298 v[0] += Bt(qx,dx) * QQx1(qx,qy);
1299 v[1] += Bt(qx,dx) * QQy1(qx,qy);
1311template<
int MD1,
int MQ1,
int NBZ>
1312MFEM_HOST_DEVICE
inline void GradXt(
const int D1D,
const int Q1D,
1313 const real_t (&sBG)[2][MQ1*MD1],
1314 const real_t (&GD)[4][NBZ][MD1*MQ1],
1315 const DeviceTensor<4> &Y,
1318 const int tidz = MFEM_THREAD_ID(z);
1326 MFEM_FOREACH_THREAD(dy,y,D1D)
1328 MFEM_FOREACH_THREAD(dx,x,D1D)
1331 real_t v[2] = {0.0, 0.0};
1332 for (
int qy = 0; qy < Q1D; ++qy)
1334 u[0] += DQxB(qy,dx) * Bt(qy,dy);
1335 u[1] += DQyB(qy,dx) * Bt(qy,dy);
1336 v[0] += DQxG(qy,dx) * Gt(qy,dy);
1337 v[1] += DQyG(qy,dx) * Gt(qy,dy);
1339 Y(dx,dy,0,e) +=
u[0] + v[0];
1340 Y(dx,dy,1,e) +=
u[1] + v[1];
1347MFEM_HOST_DEVICE
inline void LoadX(
const int e,
const int D1D,
1348 const DeviceTensor<4, const real_t> &x,
1351 MFEM_FOREACH_THREAD(dz,z,D1D)
1353 MFEM_FOREACH_THREAD(dy,y,D1D)
1355 MFEM_FOREACH_THREAD(dx,x,D1D)
1357 X(dx,dy,dz) = x(dx,dy,dz,e);
1365MFEM_HOST_DEVICE
inline void LoadX(
const int e,
const int D1D,
1366 const DeviceTensor<4, const real_t> &x,
1367 real_t (&sm)[MD1*MD1*MD1])
1374MFEM_HOST_DEVICE
inline void LoadX(
const int e,
const int D1D,
const int c,
1375 const DeviceTensor<5, const real_t> &x,
1378 MFEM_FOREACH_THREAD(dz,z,D1D)
1380 MFEM_FOREACH_THREAD(dy,y,D1D)
1382 MFEM_FOREACH_THREAD(dx,x,D1D)
1384 X(dx,dy,dz) = x(dx,dy,dz,c,e);
1393MFEM_HOST_DEVICE
inline void LoadX(
const int e,
const int D1D,
const int c,
1394 const DeviceTensor<5, const real_t> &x,
1395 real_t (&sm)[MD1*MD1*MD1])
1398 return LoadX<MD1>(e,D1D,c,x,X);
1402MFEM_HOST_DEVICE
inline void EvalX(
const int D1D,
const int Q1D,
1407 MFEM_FOREACH_THREAD(dz,z,D1D)
1409 MFEM_FOREACH_THREAD(dy,y,D1D)
1411 MFEM_FOREACH_THREAD(qx,x,Q1D)
1414 for (
int dx = 0; dx < D1D; ++dx)
1416 const real_t Bx = B(dx,qx);
1417 u += Bx * DDD(dx,dy,dz);
1426template<
int MD1,
int MQ1>
1427MFEM_HOST_DEVICE
inline void EvalX(
const int D1D,
const int Q1D,
1428 const real_t (&sB)[MQ1*MD1],
1429 const real_t (&sDDD)[MD1*MD1*MD1],
1430 real_t (&sDDQ)[MD1*MD1*MQ1])
1435 EvalX(D1D,Q1D,B,DDD,DDQ);
1439MFEM_HOST_DEVICE
inline void EvalY(
const int D1D,
const int Q1D,
1444 MFEM_FOREACH_THREAD(dz,z,D1D)
1446 MFEM_FOREACH_THREAD(qy,y,Q1D)
1448 MFEM_FOREACH_THREAD(qx,x,Q1D)
1451 for (
int dy = 0; dy < D1D; ++dy)
1453 const real_t By = B(dy,qy);
1454 u += DDQ(dz,dy,qx) * By;
1463template<
int MD1,
int MQ1>
1464MFEM_HOST_DEVICE
inline void EvalY(
const int D1D,
const int Q1D,
1465 const real_t (&sB)[MQ1*MD1],
1466 const real_t (&sDDQ)[MD1*MD1*MQ1],
1467 real_t (&sDQQ)[MD1*MQ1*MQ1])
1472 EvalY(D1D,Q1D,B,DDQ,DQQ);
1476MFEM_HOST_DEVICE
inline void EvalZ(
const int D1D,
const int Q1D,
1481 MFEM_FOREACH_THREAD(qz,z,Q1D)
1483 MFEM_FOREACH_THREAD(qy,y,Q1D)
1485 MFEM_FOREACH_THREAD(qx,x,Q1D)
1488 for (
int dz = 0; dz < D1D; ++dz)
1490 const real_t Bz = B(dz,qz);
1491 u += DQQ(dz,qy,qx) * Bz;
1500template<
int MD1,
int MQ1>
1501MFEM_HOST_DEVICE
inline void EvalZ(
const int D1D,
const int Q1D,
1502 const real_t (&sB)[MQ1*MD1],
1503 const real_t (&sDQQ)[MD1*MQ1*MQ1],
1504 real_t (&sQQQ)[MQ1*MQ1*MQ1])
1509 EvalZ(D1D,Q1D,B,DQQ,QQQ);
1513MFEM_HOST_DEVICE
inline void PullEval(
const int x,
const int y,
const int z,
1521MFEM_HOST_DEVICE
inline void PullEval(
const int Q1D,
1522 const int x,
const int y,
const int z,
1523 const real_t (&sQQQ)[MQ1*MQ1*MQ1],
1527 PullEval(x,y,z,QQQ,X);
1532MFEM_HOST_DEVICE
inline void LoadX(
const int e,
const int D1D,
1533 const DeviceTensor<5, const real_t> &X,
1534 real_t (*sm)[MD1*MD1*MD1])
1540 MFEM_FOREACH_THREAD(dz,z,D1D)
1542 MFEM_FOREACH_THREAD(dy,y,D1D)
1544 MFEM_FOREACH_THREAD(dx,x,D1D)
1546 Xx(dx,dy,dz) = X(dx,dy,dz,0,e);
1547 Xy(dx,dy,dz) = X(dx,dy,dz,1,e);
1548 Xz(dx,dy,dz) = X(dx,dy,dz,2,e);
1556template<
int MD1,
int MQ1>
1557MFEM_HOST_DEVICE
inline void EvalX(
const int D1D,
const int Q1D,
1558 const real_t (&sB)[MQ1*MD1],
1559 const real_t (&sDDD)[3][MD1*MD1*MD1],
1560 real_t (&sDDQ)[3][MD1*MD1*MQ1])
1570 MFEM_FOREACH_THREAD(dz,z,D1D)
1572 MFEM_FOREACH_THREAD(dy,y,D1D)
1574 MFEM_FOREACH_THREAD(qx,x,Q1D)
1576 real_t u[3] = {0.0, 0.0, 0.0};
1577 for (
int dx = 0; dx < D1D; ++dx)
1579 const real_t Bx = B(dx,qx);
1580 u[0] += Bx * Xx(dx,dy,dz);
1581 u[1] += Bx * Xy(dx,dy,dz);
1582 u[2] += Bx * Xz(dx,dy,dz);
1584 XxB(qx,dy,dz) =
u[0];
1585 XyB(qx,dy,dz) =
u[1];
1586 XzB(qx,dy,dz) =
u[2];
1594template<
int MD1,
int MQ1>
1595MFEM_HOST_DEVICE
inline void EvalY(
const int D1D,
const int Q1D,
1596 const real_t (&sB)[MQ1*MD1],
1597 const real_t (&sDDQ)[3][MD1*MD1*MQ1],
1598 real_t (&sDQQ)[3][MD1*MQ1*MQ1])
1608 MFEM_FOREACH_THREAD(dz,z,D1D)
1610 MFEM_FOREACH_THREAD(qy,y,Q1D)
1612 MFEM_FOREACH_THREAD(qx,x,Q1D)
1614 real_t u[3] = {0.0, 0.0, 0.0};
1615 for (
int dy = 0; dy < D1D; ++dy)
1617 const real_t By = B(dy,qy);
1618 u[0] += XxB(qx,dy,dz) * By;
1619 u[1] += XyB(qx,dy,dz) * By;
1620 u[2] += XzB(qx,dy,dz) * By;
1622 XxBB(qx,qy,dz) =
u[0];
1623 XyBB(qx,qy,dz) =
u[1];
1624 XzBB(qx,qy,dz) =
u[2];
1632template<
int MD1,
int MQ1>
1633MFEM_HOST_DEVICE
inline void EvalZ(
const int D1D,
const int Q1D,
1634 const real_t (&sB)[MQ1*MD1],
1635 const real_t (&sDQQ)[3][MD1*MQ1*MQ1],
1636 real_t (&sQQQ)[3][MQ1*MQ1*MQ1])
1646 MFEM_FOREACH_THREAD(qz,z,Q1D)
1648 MFEM_FOREACH_THREAD(qy,y,Q1D)
1650 MFEM_FOREACH_THREAD(qx,x,Q1D)
1652 real_t u[3] = {0.0, 0.0, 0.0};
1653 for (
int dz = 0; dz < D1D; ++dz)
1655 const real_t Bz = B(dz,qz);
1656 u[0] += XxBB(qx,qy,dz) * Bz;
1657 u[1] += XyBB(qx,qy,dz) * Bz;
1658 u[2] += XzBB(qx,qy,dz) * Bz;
1660 XxBBB(qx,qy,qz) =
u[0];
1661 XyBBB(qx,qy,qz) =
u[1];
1662 XzBBB(qx,qy,qz) =
u[2];
1671MFEM_HOST_DEVICE
inline void PullEval(
const int Q1D,
1672 const int x,
const int y,
const int z,
1673 const real_t (&sQQQ)[3][MQ1*MQ1*MQ1],
1680 X[0] = XxBBB(x,y,z);
1681 X[1] = XyBBB(x,y,z);
1682 X[2] = XzBBB(x,y,z);
1687MFEM_HOST_DEVICE
inline void PushEval(
const int Q1D,
1688 const int x,
const int y,
const int z,
1690 real_t (&sQQQ)[3][MQ1*MQ1*MQ1])
1696 XxBBB(x,y,z) = A[0];
1697 XyBBB(x,y,z) = A[1];
1698 XzBBB(x,y,z) = A[2];
1702template<
int MD1,
int MQ1>
1703MFEM_HOST_DEVICE
inline void EvalXt(
const int D1D,
const int Q1D,
1704 const real_t (&sB)[MQ1*MD1],
1705 const real_t (&sQQQ)[3][MQ1*MQ1*MQ1],
1706 real_t (&sDQQ)[3][MD1*MQ1*MQ1])
1716 MFEM_FOREACH_THREAD(qz,z,Q1D)
1718 MFEM_FOREACH_THREAD(qy,y,Q1D)
1720 MFEM_FOREACH_THREAD(dx,x,D1D)
1722 real_t u[3] = {0.0, 0.0, 0.0};
1723 for (
int qx = 0; qx < Q1D; ++qx)
1725 const real_t Btx = Bt(qx,dx);
1726 u[0] += XxBBB(qx,qy,qz) * Btx;
1727 u[1] += XyBBB(qx,qy,qz) * Btx;
1728 u[2] += XzBBB(qx,qy,qz) * Btx;
1730 XxBB(qz,qy,dx) =
u[0];
1731 XyBB(qz,qy,dx) =
u[1];
1732 XzBB(qz,qy,dx) =
u[2];
1740template<
int MD1,
int MQ1>
1741MFEM_HOST_DEVICE
inline void EvalYt(
const int D1D,
const int Q1D,
1742 const real_t (&sB)[MQ1*MD1],
1743 const real_t (&sDQQ)[3][MD1*MQ1*MQ1],
1744 real_t (&sDDQ)[3][MD1*MD1*MQ1])
1754 MFEM_FOREACH_THREAD(qz,z,Q1D)
1756 MFEM_FOREACH_THREAD(dy,y,D1D)
1758 MFEM_FOREACH_THREAD(dx,x,D1D)
1760 real_t u[3] = {0.0, 0.0, 0.0};
1761 for (
int qy = 0; qy < Q1D; ++qy)
1763 const real_t Bty = Bt(qy,dy);
1764 u[0] += XxBB(qz,qy,dx) * Bty;
1765 u[1] += XyBB(qz,qy,dx) * Bty;
1766 u[2] += XzBB(qz,qy,dx) * Bty;
1769 XxB(qz,dy,dx) =
u[0];
1770 XyB(qz,dy,dx) =
u[1];
1771 XzB(qz,dy,dx)=
u[2];
1779template<
int MD1,
int MQ1>
1780MFEM_HOST_DEVICE
inline void EvalZt(
const int D1D,
const int Q1D,
1781 const real_t (&sB)[MQ1*MD1],
1782 const real_t (&sDDQ)[3][MD1*MD1*MQ1],
1783 const DeviceTensor<5> &Y,
1791 MFEM_FOREACH_THREAD(dz,z,D1D)
1793 MFEM_FOREACH_THREAD(dy,y,D1D)
1795 MFEM_FOREACH_THREAD(dx,x,D1D)
1797 real_t u[3] = {0.0, 0.0, 0.0};
1798 for (
int qz = 0; qz < Q1D; ++qz)
1800 const real_t Btz = Bt(qz,dz);
1801 u[0] += XxB(qz,dy,dx) * Btz;
1802 u[1] += XyB(qz,dy,dx) * Btz;
1803 u[2] += XzB(qz,dy,dx) * Btz;
1805 Y(dx,dy,dz,0,e) +=
u[0];
1806 Y(dx,dy,dz,1,e) +=
u[1];
1807 Y(dx,dy,dz,2,e) +=
u[2];
1814template<
int MD1,
int MQ1>
1815MFEM_HOST_DEVICE
inline void GradX(
const int D1D,
const int Q1D,
1816 const real_t (*sBG)[MQ1*MD1],
1817 const real_t (*sDDD)[MD1*MD1*MD1],
1818 real_t (*sDDQ)[MD1*MD1*MQ1])
1832 MFEM_FOREACH_THREAD(dz,z,D1D)
1834 MFEM_FOREACH_THREAD(dy,y,D1D)
1836 MFEM_FOREACH_THREAD(qx,x,Q1D)
1838 real_t u[3] = {0.0, 0.0, 0.0};
1839 real_t v[3] = {0.0, 0.0, 0.0};
1840 for (
int dx = 0; dx < D1D; ++dx)
1842 const real_t xx = Xx(dx,dy,dz);
1843 const real_t xy = Xy(dx,dy,dz);
1844 const real_t xz = Xz(dx,dy,dz);
1845 const real_t Bx = B(dx,qx);
1846 const real_t Gx = G(dx,qx);
1855 XxB(qx,dy,dz) =
u[0];
1856 XyB(qx,dy,dz) =
u[1];
1857 XzB(qx,dy,dz) =
u[2];
1859 XxG(qx,dy,dz) = v[0];
1860 XyG(qx,dy,dz) = v[1];
1861 XzG(qx,dy,dz) = v[2];
1869template<
int MD1,
int MQ1>
1870MFEM_HOST_DEVICE
inline void GradY(
const int D1D,
const int Q1D,
1871 const real_t (*sBG)[MQ1*MD1],
1872 const real_t (*sDDQ)[MD1*MD1*MQ1],
1873 real_t (*sDQQ)[MD1*MQ1*MQ1])
1893 MFEM_FOREACH_THREAD(dz,z,D1D)
1895 MFEM_FOREACH_THREAD(qy,y,Q1D)
1897 MFEM_FOREACH_THREAD(qx,x,Q1D)
1899 real_t u[3] = {0.0, 0.0, 0.0};
1900 real_t v[3] = {0.0, 0.0, 0.0};
1901 real_t w[3] = {0.0, 0.0, 0.0};
1902 for (
int dy = 0; dy < D1D; ++dy)
1904 const real_t By = B(dy,qy);
1905 const real_t Gy = G(dy,qy);
1907 u[0] += XxB(qx,dy,dz) * By;
1908 u[1] += XyB(qx,dy,dz) * By;
1909 u[2] += XzB(qx,dy,dz) * By;
1911 v[0] += XxG(qx,dy,dz) * By;
1912 v[1] += XyG(qx,dy,dz) * By;
1913 v[2] += XzG(qx,dy,dz) * By;
1915 w[0] += XxB(qx,dy,dz) * Gy;
1916 w[1] += XyB(qx,dy,dz) * Gy;
1917 w[2] += XzB(qx,dy,dz) * Gy;
1919 XxBB(qx,qy,dz) =
u[0];
1920 XyBB(qx,qy,dz) =
u[1];
1921 XzBB(qx,qy,dz) =
u[2];
1923 XxBG(qx,qy,dz) = v[0];
1924 XyBG(qx,qy,dz) = v[1];
1925 XzBG(qx,qy,dz) = v[2];
1927 XxGB(qx,qy,dz) = w[0];
1928 XyGB(qx,qy,dz) = w[1];
1929 XzGB(qx,qy,dz) = w[2];
1937template<
int MD1,
int MQ1>
1938MFEM_HOST_DEVICE
inline void GradZ(
const int D1D,
const int Q1D,
1939 const real_t (*sBG)[MQ1*MD1],
1940 const real_t (*sDQQ)[MD1*MQ1*MQ1],
1941 real_t (*sQQQ)[MQ1*MQ1*MQ1])
1964 MFEM_FOREACH_THREAD(qz,z,Q1D)
1966 MFEM_FOREACH_THREAD(qy,y,Q1D)
1968 MFEM_FOREACH_THREAD(qx,x,Q1D)
1970 real_t u[3] = {0.0, 0.0, 0.0};
1971 real_t v[3] = {0.0, 0.0, 0.0};
1972 real_t w[3] = {0.0, 0.0, 0.0};
1973 for (
int dz = 0; dz < D1D; ++dz)
1975 const real_t Bz = B(dz,qz);
1976 const real_t Gz = G(dz,qz);
1978 u[0] += XxBG(qx,qy,dz) * Bz;
1979 u[1] += XyBG(qx,qy,dz) * Bz;
1980 u[2] += XzBG(qx,qy,dz) * Bz;
1982 v[0] += XxGB(qx,qy,dz) * Bz;
1983 v[1] += XyGB(qx,qy,dz) * Bz;
1984 v[2] += XzGB(qx,qy,dz) * Bz;
1986 w[0] += XxBB(qx,qy,dz) * Gz;
1987 w[1] += XyBB(qx,qy,dz) * Gz;
1988 w[2] += XzBB(qx,qy,dz) * Gz;
1990 XxBBG(qx,qy,qz) =
u[0];
1991 XyBBG(qx,qy,qz) =
u[1];
1992 XzBBG(qx,qy,qz) =
u[2];
1994 XxBGB(qx,qy,qz) = v[0];
1995 XyBGB(qx,qy,qz) = v[1];
1996 XzBGB(qx,qy,qz) = v[2];
1998 XxGBB(qx,qy,qz)= w[0];
1999 XyGBB(qx,qy,qz) = w[1];
2000 XzGBB(qx,qy,qz) = w[2];
2009MFEM_HOST_DEVICE
inline void PullGrad(
const int Q1D,
2010 const int x,
const int y,
const int z,
2011 const real_t (*sQQQ)[MQ1*MQ1*MQ1],
2024 Jpr[0] = XxBBG(x,y,z);
2025 Jpr[3] = XxBGB(x,y,z);
2026 Jpr[6] = XxGBB(x,y,z);
2027 Jpr[1] = XyBBG(x,y,z);
2028 Jpr[4] = XyBGB(x,y,z);
2029 Jpr[7] = XyGBB(x,y,z);
2030 Jpr[2] = XzBBG(x,y,z);
2031 Jpr[5] = XzBGB(x,y,z);
2032 Jpr[8] = XzGBB(x,y,z);
2037MFEM_HOST_DEVICE
inline void PushGrad(
const int Q1D,
2038 const int x,
const int y,
const int z,
2040 real_t (&sQQQ)[9][MQ1*MQ1*MQ1])
2052 XxBBG(x,y,z) = A[0];
2053 XxBGB(x,y,z) = A[1];
2054 XxGBB(x,y,z) = A[2];
2055 XyBBG(x,y,z) = A[3];
2056 XyBGB(x,y,z) = A[4];
2057 XyGBB(x,y,z) = A[5];
2058 XzBBG(x,y,z) = A[6];
2059 XzBGB(x,y,z) = A[7];
2060 XzGBB(x,y,z) = A[8];
2064template<
int MD1,
int MQ1>
2065MFEM_HOST_DEVICE
inline void GradZt(
const int D1D,
const int Q1D,
2066 const real_t (&sBG)[2][MQ1*MD1],
2067 const real_t (&sQQQ)[9][MQ1*MQ1*MQ1],
2068 real_t (&sDQQ)[9][MD1*MQ1*MQ1])
2092 MFEM_FOREACH_THREAD(qz,z,Q1D)
2094 MFEM_FOREACH_THREAD(qy,y,Q1D)
2096 MFEM_FOREACH_THREAD(dx,x,D1D)
2098 real_t u[3] = {0.0, 0.0, 0.0};
2099 real_t v[3] = {0.0, 0.0, 0.0};
2100 real_t w[3] = {0.0, 0.0, 0.0};
2101 for (
int qx = 0; qx < Q1D; ++qx)
2103 const real_t Btx = Bt(qx,dx);
2104 const real_t Gtx = Gt(qx,dx);
2106 u[0] += XxBBG(qx,qy,qz) * Gtx;
2107 v[0] += XxBGB(qx,qy,qz) * Btx;
2108 w[0] += XxGBB(qx,qy,qz) * Btx;
2110 u[1] += XyBBG(qx,qy,qz) * Gtx;
2111 v[1] += XyBGB(qx,qy,qz) * Btx;
2112 w[1] += XyGBB(qx,qy,qz) * Btx;
2114 u[2] += XzBBG(qx,qy,qz) * Gtx;
2115 v[2] += XzBGB(qx,qy,qz) * Btx;
2116 w[2] += XzGBB(qx,qy,qz) * Btx;
2118 XxBB(qz,qy,dx) =
u[0];
2119 XxBG(qz,qy,dx) = v[0];
2120 XxGB(qz,qy,dx) = w[0];
2122 XyBB(qz,qy,dx) =
u[1];
2123 XyBG(qz,qy,dx) = v[1];
2124 XyGB(qz,qy,dx) = w[1];
2126 XzBB(qz,qy,dx) =
u[2];
2127 XzBG(qz,qy,dx) = v[2];
2128 XzGB(qz,qy,dx) = w[2];
2136template<
int MD1,
int MQ1>
2137MFEM_HOST_DEVICE
inline void GradYt(
const int D1D,
const int Q1D,
2138 const real_t (&sBG)[2][MQ1*MD1],
2139 const real_t (&sDQQ)[9][MD1*MQ1*MQ1],
2140 real_t (&sDDQ)[9][MD1*MD1*MQ1])
2163 MFEM_FOREACH_THREAD(qz,z,Q1D)
2165 MFEM_FOREACH_THREAD(dy,y,D1D)
2167 MFEM_FOREACH_THREAD(dx,x,D1D)
2169 real_t u[3] = {0.0, 0.0, 0.0};
2170 real_t v[3] = {0.0, 0.0, 0.0};
2171 real_t w[3] = {0.0, 0.0, 0.0};
2172 for (
int qy = 0; qy < Q1D; ++qy)
2174 const real_t Bty = Bt(qy,dy);
2175 const real_t Gty = Gt(qy,dy);
2177 u[0] += XxBB(qz,qy,dx) * Bty;
2178 v[0] += XxBG(qz,qy,dx) * Gty;
2179 w[0] += XxGB(qz,qy,dx) * Bty;
2181 u[1] += XyBB(qz,qy,dx) * Bty;
2182 v[1] += XyBG(qz,qy,dx) * Gty;
2183 w[1] += XyGB(qz,qy,dx) * Bty;
2185 u[2] += XzBB(qz,qy,dx) * Bty;
2186 v[2] += XzBG(qz,qy,dx) * Gty;
2187 w[2] += XzGB(qz,qy,dx) * Bty;
2190 XxB(qz,dy,dx) =
u[0];
2191 XxC(qz,dy,dx) = v[0];
2192 XxG(qz,dy,dx) = w[0];
2194 XyB(qz,dy,dx) =
u[1];
2195 XyC(qz,dy,dx) = v[1];
2196 XyG(qz,dy,dx) = w[1];
2198 XzB(qz,dy,dx) =
u[2];
2199 XzC(qz,dy,dx) = v[2];
2200 XzG(qz,dy,dx) = w[2];
2208template<
int MD1,
int MQ1>
2209MFEM_HOST_DEVICE
inline void GradXt(
const int D1D,
const int Q1D,
2210 const real_t (&sBG)[2][MQ1*MD1],
2211 const real_t (&sDDQ)[9][MD1*MD1*MQ1],
2212 const DeviceTensor<5> &Y,
2227 MFEM_FOREACH_THREAD(dz,z,D1D)
2229 MFEM_FOREACH_THREAD(dy,y,D1D)
2231 MFEM_FOREACH_THREAD(dx,x,D1D)
2233 real_t u[3] = {0.0, 0.0, 0.0};
2234 real_t v[3] = {0.0, 0.0, 0.0};
2235 real_t w[3] = {0.0, 0.0, 0.0};
2236 for (
int qz = 0; qz < Q1D; ++qz)
2238 const real_t Btz = Bt(qz,dz);
2239 const real_t Gtz = Gt(qz,dz);
2241 u[0] += XxB(qz,dy,dx) * Btz;
2242 v[0] += XxC(qz,dy,dx) * Btz;
2243 w[0] += XxG(qz,dy,dx) * Gtz;
2245 u[1] += XyB(qz,dy,dx) * Btz;
2246 v[1] += XyC(qz,dy,dx)* Btz;
2247 w[1] += XyG(qz,dy,dx) * Gtz;
2249 u[2] += XzB(qz,dy,dx) * Btz;
2250 v[2] += XzC(qz,dy,dx) * Btz;
2251 w[2] += XzG(qz,dy,dx) * Gtz;
2253 Y(dx,dy,dz,0,e) +=
u[0] + v[0] + w[0];
2254 Y(dx,dy,dz,1,e) +=
u[1] + v[1] + w[1];
2255 Y(dx,dy,dz,2,e) +=
u[2] + v[2] + w[2];
DeviceTensor< 3, real_t > DeviceCube
real_t u(const Vector &xvec)
DeviceTensor< 3, const real_t > ConstDeviceCube
void Transpose(const Table &A, Table &At, int ncols_A_)
Transpose a Table.
DeviceTensor< 2, const real_t > ConstDeviceMatrix
DeviceTensor< 2, real_t > DeviceMatrix
Implementation of the tensor class.