12#ifndef BILININTEG_DGTRACE_KERNELS_HPP
13#define BILININTEG_DGTRACE_KERNELS_HPP
29template <
int T_D1D = 0,
int T_Q1D = 0>
30static void PADGTraceApply2D(
const int NF,
const Array<real_t> &
b,
31 const Array<real_t> &bt,
const Vector &op_,
32 const Vector &x_, Vector &y_,
const int d1d = 0,
36 const int D1D = T_D1D ? T_D1D : d1d;
37 const int Q1D = T_Q1D ? T_Q1D : q1d;
40 auto B =
Reshape(
b.Read(), Q1D, D1D);
41 auto Bt =
Reshape(bt.Read(), D1D, Q1D);
42 auto op =
Reshape(op_.Read(), Q1D, 2, 2, NF);
43 auto x =
Reshape(x_.Read(), D1D, VDIM, 2, NF);
44 auto y =
Reshape(y_.ReadWrite(), D1D, VDIM, 2, NF);
49 const int D1D = T_D1D ? T_D1D : d1d;
50 const int Q1D = T_Q1D ? T_Q1D : q1d;
52 constexpr int max_D1D = T_D1D ? T_D1D : DofQuadLimits::MAX_D1D;
53 constexpr int max_Q1D = T_Q1D ? T_Q1D : DofQuadLimits::MAX_Q1D;
56 for (
int d = 0; d < D1D; d++)
58 for (
int c = 0; c < VDIM; c++)
60 u0[d][c] = x(d, c, 0,
f);
61 u1[d][c] = x(d, c, 1,
f);
66 for (
int q = 0; q < Q1D; ++q)
68 for (
int c = 0; c < VDIM; c++)
73 for (
int d = 0; d < D1D; ++d)
76 for (
int c = 0; c < VDIM; c++)
78 Bu0[q][c] +=
b * u0[d][c];
79 Bu1[q][c] +=
b * u1[d][c];
84 for (
int q = 0; q < Q1D; ++q)
86 for (
int c = 0; c < VDIM; c++)
88 DBu[q][c] = op(q, 0, 0,
f) * Bu0[q][c] + op(q, 1, 0,
f) * Bu1[q][c];
91 real_t BDBu[max_D1D][VDIM];
92 for (
int d = 0; d < D1D; ++d)
94 for (
int c = 0; c < VDIM; c++)
98 for (
int q = 0; q < Q1D; ++q)
101 for (
int c = 0; c < VDIM; c++)
103 BDBu[d][c] +=
b * DBu[q][c];
106 for (
int c = 0; c < VDIM; c++)
108 y(d, c, 0,
f) += BDBu[d][c];
109 y(d, c, 1,
f) += -BDBu[d][c];
116template <
int T_D1D = 0,
int T_Q1D = 0>
117static void PADGTraceApply3D(
const int NF,
const Array<real_t> &
b,
118 const Array<real_t> &bt,
const Vector &op_,
119 const Vector &x_, Vector &y_,
const int d1d = 0,
123 const int D1D = T_D1D ? T_D1D : d1d;
124 const int Q1D = T_Q1D ? T_Q1D : q1d;
127 auto B =
Reshape(
b.Read(), Q1D, D1D);
128 auto Bt =
Reshape(bt.Read(), D1D, Q1D);
129 auto op =
Reshape(op_.Read(), Q1D, Q1D, 2, 2, NF);
130 auto x =
Reshape(x_.Read(), D1D, D1D, VDIM, 2, NF);
131 auto y =
Reshape(y_.ReadWrite(), D1D, D1D, VDIM, 2, NF);
136 const int D1D = T_D1D ? T_D1D : d1d;
137 const int Q1D = T_Q1D ? T_Q1D : q1d;
139 constexpr int max_D1D = T_D1D ? T_D1D : DofQuadLimits::MAX_D1D;
140 constexpr int max_Q1D = T_Q1D ? T_Q1D : DofQuadLimits::MAX_Q1D;
141 real_t u0[max_D1D][max_D1D][VDIM];
142 real_t u1[max_D1D][max_D1D][VDIM];
143 for (
int d1 = 0; d1 < D1D; d1++)
145 for (
int d2 = 0; d2 < D1D; d2++)
147 for (
int c = 0; c < VDIM; c++)
149 u0[d1][d2][c] = x(d1, d2, c, 0,
f);
150 u1[d1][d2][c] = x(d1, d2, c, 1,
f);
154 real_t Bu0[max_Q1D][max_D1D][VDIM];
155 real_t Bu1[max_Q1D][max_D1D][VDIM];
156 for (
int q = 0; q < Q1D; ++q)
158 for (
int d2 = 0; d2 < D1D; d2++)
160 for (
int c = 0; c < VDIM; c++)
165 for (
int d1 = 0; d1 < D1D; ++d1)
168 for (
int c = 0; c < VDIM; c++)
170 Bu0[q][d2][c] +=
b * u0[d1][d2][c];
171 Bu1[q][d2][c] +=
b * u1[d1][d2][c];
176 real_t BBu0[max_Q1D][max_Q1D][VDIM];
177 real_t BBu1[max_Q1D][max_Q1D][VDIM];
178 for (
int q1 = 0; q1 < Q1D; ++q1)
180 for (
int q2 = 0; q2 < Q1D; q2++)
182 for (
int c = 0; c < VDIM; c++)
184 BBu0[q1][q2][c] = 0.0;
185 BBu1[q1][q2][c] = 0.0;
187 for (
int d2 = 0; d2 < D1D; ++d2)
190 for (
int c = 0; c < VDIM; c++)
192 BBu0[q1][q2][c] +=
b * Bu0[q1][d2][c];
193 BBu1[q1][q2][c] +=
b * Bu1[q1][d2][c];
198 real_t DBBu[max_Q1D][max_Q1D][VDIM];
199 for (
int q1 = 0; q1 < Q1D; ++q1)
201 for (
int q2 = 0; q2 < Q1D; q2++)
203 for (
int c = 0; c < VDIM; c++)
205 DBBu[q1][q2][c] = op(q1, q2, 0, 0,
f) * BBu0[q1][q2][c] +
206 op(q1, q2, 1, 0,
f) * BBu1[q1][q2][c];
210 real_t BDBBu[max_Q1D][max_D1D][VDIM];
211 for (
int q1 = 0; q1 < Q1D; ++q1)
213 for (
int d2 = 0; d2 < D1D; d2++)
215 for (
int c = 0; c < VDIM; c++)
217 BDBBu[q1][d2][c] = 0.0;
219 for (
int q2 = 0; q2 < Q1D; ++q2)
222 for (
int c = 0; c < VDIM; c++)
224 BDBBu[q1][d2][c] +=
b * DBBu[q1][q2][c];
229 real_t BBDBBu[max_D1D][max_D1D][VDIM];
230 for (
int d1 = 0; d1 < D1D; ++d1)
232 for (
int d2 = 0; d2 < D1D; d2++)
234 for (
int c = 0; c < VDIM; c++)
236 BBDBBu[d1][d2][c] = 0.0;
238 for (
int q1 = 0; q1 < Q1D; ++q1)
241 for (
int c = 0; c < VDIM; c++)
243 BBDBBu[d1][d2][c] +=
b * BDBBu[q1][d2][c];
246 for (
int c = 0; c < VDIM; c++)
248 y(d1, d2, c, 0,
f) += BBDBBu[d1][d2][c];
249 y(d1, d2, c, 1,
f) += -BBDBBu[d1][d2][c];
257template <
int T_D1D = 0,
int T_Q1D = 0,
int T_NBZ = 0>
258static void SmemPADGTraceApply3D(
const int NF,
const Array<real_t> &
b,
259 const Array<real_t> &bt,
const Vector &op_,
260 const Vector &x_, Vector &y_,
261 const int d1d = 0,
const int q1d = 0)
263 const int D1D = T_D1D ? T_D1D : d1d;
264 const int Q1D = T_Q1D ? T_Q1D : q1d;
265 constexpr int NBZ = T_NBZ ? T_NBZ : 1;
268 auto B =
Reshape(
b.Read(), Q1D, D1D);
269 auto Bt =
Reshape(bt.Read(), D1D, Q1D);
270 auto op =
Reshape(op_.Read(), Q1D, Q1D, 2, 2, NF);
271 auto x =
Reshape(x_.Read(), D1D, D1D, 2, NF);
272 auto y =
Reshape(y_.ReadWrite(), D1D, D1D, 2, NF);
276 const int tidz = MFEM_THREAD_ID(z);
277 const int D1D = T_D1D ? T_D1D : d1d;
278 const int Q1D = T_Q1D ? T_Q1D : q1d;
280 constexpr int NBZ = T_NBZ ? T_NBZ : 1;
281 constexpr int max_D1D = T_D1D ? T_D1D : DofQuadLimits::MAX_D1D;
282 constexpr int max_Q1D = T_Q1D ? T_Q1D : DofQuadLimits::MAX_Q1D;
283 MFEM_SHARED
real_t u0[NBZ][max_D1D][max_D1D];
284 MFEM_SHARED
real_t u1[NBZ][max_D1D][max_D1D];
285 MFEM_FOREACH_THREAD(d1, x, D1D)
287 MFEM_FOREACH_THREAD(d2, y, D1D)
289 u0[tidz][d1][d2] = x(d1, d2, 0,
f);
290 u1[tidz][d1][d2] = x(d1, d2, 1,
f);
294 MFEM_SHARED
real_t Bu0[NBZ][max_Q1D][max_D1D];
295 MFEM_SHARED
real_t Bu1[NBZ][max_Q1D][max_D1D];
296 MFEM_FOREACH_THREAD(q1, x, Q1D)
298 MFEM_FOREACH_THREAD(d2, y, D1D)
302 for (
int d1 = 0; d1 < D1D; ++d1)
305 Bu0_ +=
b * u0[tidz][d1][d2];
306 Bu1_ +=
b * u1[tidz][d1][d2];
308 Bu0[tidz][q1][d2] = Bu0_;
309 Bu1[tidz][q1][d2] = Bu1_;
313 MFEM_SHARED
real_t BBu0[NBZ][max_Q1D][max_Q1D];
314 MFEM_SHARED
real_t BBu1[NBZ][max_Q1D][max_Q1D];
315 MFEM_FOREACH_THREAD(q1, x, Q1D)
317 MFEM_FOREACH_THREAD(q2, y, Q1D)
321 for (
int d2 = 0; d2 < D1D; ++d2)
324 BBu0_ +=
b * Bu0[tidz][q1][d2];
325 BBu1_ +=
b * Bu1[tidz][q1][d2];
327 BBu0[tidz][q1][q2] = BBu0_;
328 BBu1[tidz][q1][q2] = BBu1_;
332 MFEM_SHARED
real_t DBBu[NBZ][max_Q1D][max_Q1D];
333 MFEM_FOREACH_THREAD(q1, x, Q1D)
335 MFEM_FOREACH_THREAD(q2, y, Q1D)
337 DBBu[tidz][q1][q2] = op(q1, q2, 0, 0,
f) * BBu0[tidz][q1][q2] +
338 op(q1, q2, 1, 0,
f) * BBu1[tidz][q1][q2];
342 MFEM_SHARED
real_t BDBBu[NBZ][max_Q1D][max_D1D];
343 MFEM_FOREACH_THREAD(q1, x, Q1D)
345 MFEM_FOREACH_THREAD(d2, y, D1D)
348 for (
int q2 = 0; q2 < Q1D; ++q2)
351 BDBBu_ +=
b * DBBu[tidz][q1][q2];
353 BDBBu[tidz][q1][d2] = BDBBu_;
357 MFEM_FOREACH_THREAD(d1, x, D1D)
359 MFEM_FOREACH_THREAD(d2, y, D1D)
362 for (
int q1 = 0; q1 < Q1D; ++q1)
365 BBDBBu_ +=
b * BDBBu[tidz][q1][d2];
367 y(d1, d2, 0,
f) += BBDBBu_;
368 y(d1, d2, 1,
f) += -BBDBBu_;
375template <
int T_D1D = 0,
int T_Q1D = 0>
376static void PADGTraceApplyTranspose2D(
const int NF,
const Array<real_t> &
b,
377 const Array<real_t> &bt,
378 const Vector &op_,
const Vector &x_,
379 Vector &y_,
const int d1d = 0,
383 const int D1D = T_D1D ? T_D1D : d1d;
384 const int Q1D = T_Q1D ? T_Q1D : q1d;
387 auto B =
Reshape(
b.Read(), Q1D, D1D);
388 auto Bt =
Reshape(bt.Read(), D1D, Q1D);
389 auto op =
Reshape(op_.Read(), Q1D, 2, 2, NF);
390 auto x =
Reshape(x_.Read(), D1D, VDIM, 2, NF);
391 auto y =
Reshape(y_.ReadWrite(), D1D, VDIM, 2, NF);
396 const int D1D = T_D1D ? T_D1D : d1d;
397 const int Q1D = T_Q1D ? T_Q1D : q1d;
399 constexpr int max_D1D = T_D1D ? T_D1D : DofQuadLimits::MAX_D1D;
400 constexpr int max_Q1D = T_Q1D ? T_Q1D : DofQuadLimits::MAX_Q1D;
403 for (
int d = 0; d < D1D; d++)
405 for (
int c = 0; c < VDIM; c++)
407 u0[d][c] = x(d, c, 0,
f);
408 u1[d][c] = x(d, c, 1,
f);
411 real_t Bu0[max_Q1D][VDIM];
412 real_t Bu1[max_Q1D][VDIM];
413 for (
int q = 0; q < Q1D; ++q)
415 for (
int c = 0; c < VDIM; c++)
420 for (
int d = 0; d < D1D; ++d)
423 for (
int c = 0; c < VDIM; c++)
425 Bu0[q][c] +=
b * u0[d][c];
426 Bu1[q][c] +=
b * u1[d][c];
430 real_t DBu0[max_Q1D][VDIM];
431 real_t DBu1[max_Q1D][VDIM];
432 for (
int q = 0; q < Q1D; ++q)
434 for (
int c = 0; c < VDIM; c++)
437 op(q, 0, 0,
f) * Bu0[q][c] + op(q, 0, 1,
f) * Bu1[q][c];
439 op(q, 1, 0,
f) * Bu0[q][c] + op(q, 1, 1,
f) * Bu1[q][c];
442 real_t BDBu0[max_D1D][VDIM];
443 real_t BDBu1[max_D1D][VDIM];
444 for (
int d = 0; d < D1D; ++d)
446 for (
int c = 0; c < VDIM; c++)
451 for (
int q = 0; q < Q1D; ++q)
454 for (
int c = 0; c < VDIM; c++)
456 BDBu0[d][c] +=
b * DBu0[q][c];
457 BDBu1[d][c] +=
b * DBu1[q][c];
460 for (
int c = 0; c < VDIM; c++)
462 y(d, c, 0,
f) += BDBu0[d][c];
463 y(d, c, 1,
f) += BDBu1[d][c];
470template <
int T_D1D = 0,
int T_Q1D = 0>
471static void PADGTraceApplyTranspose3D(
const int NF,
const Array<real_t> &
b,
472 const Array<real_t> &bt,
473 const Vector &op_,
const Vector &x_,
474 Vector &y_,
const int d1d = 0,
478 const int D1D = T_D1D ? T_D1D : d1d;
479 const int Q1D = T_Q1D ? T_Q1D : q1d;
482 auto B =
Reshape(
b.Read(), Q1D, D1D);
483 auto Bt =
Reshape(bt.Read(), D1D, Q1D);
484 auto op =
Reshape(op_.Read(), Q1D, Q1D, 2, 2, NF);
485 auto x =
Reshape(x_.Read(), D1D, D1D, VDIM, 2, NF);
486 auto y =
Reshape(y_.ReadWrite(), D1D, D1D, VDIM, 2, NF);
491 const int D1D = T_D1D ? T_D1D : d1d;
492 const int Q1D = T_Q1D ? T_Q1D : q1d;
494 constexpr int max_D1D = T_D1D ? T_D1D : DofQuadLimits::MAX_D1D;
495 constexpr int max_Q1D = T_Q1D ? T_Q1D : DofQuadLimits::MAX_Q1D;
496 real_t u0[max_D1D][max_D1D][VDIM];
497 real_t u1[max_D1D][max_D1D][VDIM];
498 for (
int d1 = 0; d1 < D1D; d1++)
500 for (
int d2 = 0; d2 < D1D; d2++)
502 for (
int c = 0; c < VDIM; c++)
504 u0[d1][d2][c] = x(d1, d2, c, 0,
f);
505 u1[d1][d2][c] = x(d1, d2, c, 1,
f);
509 real_t Bu0[max_Q1D][max_D1D][VDIM];
510 real_t Bu1[max_Q1D][max_D1D][VDIM];
511 for (
int q1 = 0; q1 < Q1D; ++q1)
513 for (
int d2 = 0; d2 < D1D; ++d2)
515 for (
int c = 0; c < VDIM; c++)
517 Bu0[q1][d2][c] = 0.0;
518 Bu1[q1][d2][c] = 0.0;
520 for (
int d1 = 0; d1 < D1D; ++d1)
523 for (
int c = 0; c < VDIM; c++)
525 Bu0[q1][d2][c] +=
b * u0[d1][d2][c];
526 Bu1[q1][d2][c] +=
b * u1[d1][d2][c];
531 real_t BBu0[max_Q1D][max_Q1D][VDIM];
532 real_t BBu1[max_Q1D][max_Q1D][VDIM];
533 for (
int q1 = 0; q1 < Q1D; ++q1)
535 for (
int q2 = 0; q2 < Q1D; ++q2)
537 for (
int c = 0; c < VDIM; c++)
539 BBu0[q1][q2][c] = 0.0;
540 BBu1[q1][q2][c] = 0.0;
542 for (
int d2 = 0; d2 < D1D; ++d2)
545 for (
int c = 0; c < VDIM; c++)
547 BBu0[q1][q2][c] +=
b * Bu0[q1][d2][c];
548 BBu1[q1][q2][c] +=
b * Bu1[q1][d2][c];
553 real_t DBu0[max_Q1D][max_Q1D][VDIM];
554 real_t DBu1[max_Q1D][max_Q1D][VDIM];
555 for (
int q1 = 0; q1 < Q1D; ++q1)
557 for (
int q2 = 0; q2 < Q1D; ++q2)
559 const real_t D00 = op(q1, q2, 0, 0,
f);
560 const real_t D01 = op(q1, q2, 0, 1,
f);
561 const real_t D10 = op(q1, q2, 1, 0,
f);
562 const real_t D11 = op(q1, q2, 1, 1,
f);
563 for (
int c = 0; c < VDIM; c++)
565 DBu0[q1][q2][c] = D00 * BBu0[q1][q2][c] + D01 * BBu1[q1][q2][c];
566 DBu1[q1][q2][c] = D10 * BBu0[q1][q2][c] + D11 * BBu1[q1][q2][c];
570 real_t BDBu0[max_D1D][max_Q1D][VDIM];
571 real_t BDBu1[max_D1D][max_Q1D][VDIM];
572 for (
int d1 = 0; d1 < D1D; ++d1)
574 for (
int q2 = 0; q2 < Q1D; ++q2)
576 for (
int c = 0; c < VDIM; c++)
578 BDBu0[d1][q2][c] = 0.0;
579 BDBu1[d1][q2][c] = 0.0;
581 for (
int q1 = 0; q1 < Q1D; ++q1)
584 for (
int c = 0; c < VDIM; c++)
586 BDBu0[d1][q2][c] +=
b * DBu0[q1][q2][c];
587 BDBu1[d1][q2][c] +=
b * DBu1[q1][q2][c];
592 real_t BBDBu0[max_D1D][max_D1D][VDIM];
593 real_t BBDBu1[max_D1D][max_D1D][VDIM];
594 for (
int d1 = 0; d1 < D1D; ++d1)
596 for (
int d2 = 0; d2 < D1D; ++d2)
598 for (
int c = 0; c < VDIM; c++)
600 BBDBu0[d1][d2][c] = 0.0;
601 BBDBu1[d1][d2][c] = 0.0;
603 for (
int q2 = 0; q2 < Q1D; ++q2)
606 for (
int c = 0; c < VDIM; c++)
608 BBDBu0[d1][d2][c] +=
b * BDBu0[d1][q2][c];
609 BBDBu1[d1][d2][c] +=
b * BDBu1[d1][q2][c];
612 for (
int c = 0; c < VDIM; c++)
614 y(d1, d2, c, 0,
f) += BBDBu0[d1][d2][c];
615 y(d1, d2, c, 1,
f) += BBDBu1[d1][d2][c];
623template <
int T_D1D = 0,
int T_Q1D = 0,
int T_NBZ = 0>
624static void SmemPADGTraceApplyTranspose3D(
const int NF,
const Array<real_t> &
b,
625 const Array<real_t> &bt,
626 const Vector &op_,
const Vector &x_,
627 Vector &y_,
const int d1d = 0,
630 const int D1D = T_D1D ? T_D1D : d1d;
631 const int Q1D = T_Q1D ? T_Q1D : q1d;
632 constexpr int NBZ = T_NBZ ? T_NBZ : 1;
635 auto B =
Reshape(
b.Read(), Q1D, D1D);
636 auto Bt =
Reshape(bt.Read(), D1D, Q1D);
637 auto op =
Reshape(op_.Read(), Q1D, Q1D, 2, 2, NF);
638 auto x =
Reshape(x_.Read(), D1D, D1D, 2, NF);
639 auto y =
Reshape(y_.ReadWrite(), D1D, D1D, 2, NF);
643 const int tidz = MFEM_THREAD_ID(z);
644 const int D1D = T_D1D ? T_D1D : d1d;
645 const int Q1D = T_Q1D ? T_Q1D : q1d;
647 constexpr int NBZ = T_NBZ ? T_NBZ : 1;
648 constexpr int max_D1D = T_D1D ? T_D1D : DofQuadLimits::MAX_D1D;
649 constexpr int max_Q1D = T_Q1D ? T_Q1D : DofQuadLimits::MAX_Q1D;
650 MFEM_SHARED
real_t u0[NBZ][max_D1D][max_D1D];
651 MFEM_SHARED
real_t u1[NBZ][max_D1D][max_D1D];
652 MFEM_FOREACH_THREAD(d1, x, D1D)
654 MFEM_FOREACH_THREAD(d2, y, D1D)
656 u0[tidz][d1][d2] = x(d1, d2, 0,
f);
657 u1[tidz][d1][d2] = x(d1, d2, 1,
f);
661 MFEM_SHARED
real_t Bu0[NBZ][max_Q1D][max_D1D];
662 MFEM_SHARED
real_t Bu1[NBZ][max_Q1D][max_D1D];
663 MFEM_FOREACH_THREAD(q1, x, Q1D)
665 MFEM_FOREACH_THREAD(d2, y, D1D)
669 for (
int d1 = 0; d1 < D1D; ++d1)
672 Bu0_ +=
b * u0[tidz][d1][d2];
673 Bu1_ +=
b * u1[tidz][d1][d2];
675 Bu0[tidz][q1][d2] = Bu0_;
676 Bu1[tidz][q1][d2] = Bu1_;
680 MFEM_SHARED
real_t BBu0[NBZ][max_Q1D][max_Q1D];
681 MFEM_SHARED
real_t BBu1[NBZ][max_Q1D][max_Q1D];
682 MFEM_FOREACH_THREAD(q1, x, Q1D)
684 MFEM_FOREACH_THREAD(q2, y, Q1D)
688 for (
int d2 = 0; d2 < D1D; ++d2)
691 BBu0_ +=
b * Bu0[tidz][q1][d2];
692 BBu1_ +=
b * Bu1[tidz][q1][d2];
694 BBu0[tidz][q1][q2] = BBu0_;
695 BBu1[tidz][q1][q2] = BBu1_;
699 MFEM_SHARED
real_t DBBu0[NBZ][max_Q1D][max_Q1D];
700 MFEM_SHARED
real_t DBBu1[NBZ][max_Q1D][max_Q1D];
701 MFEM_FOREACH_THREAD(q1, x, Q1D)
703 MFEM_FOREACH_THREAD(q2, y, Q1D)
705 const real_t D00 = op(q1, q2, 0, 0,
f);
706 const real_t D01 = op(q1, q2, 0, 1,
f);
707 const real_t D10 = op(q1, q2, 1, 0,
f);
708 const real_t D11 = op(q1, q2, 1, 1,
f);
709 const real_t u0q = BBu0[tidz][q1][q2];
710 const real_t u1q = BBu1[tidz][q1][q2];
711 DBBu0[tidz][q1][q2] = D00 * u0q + D01 * u1q;
712 DBBu1[tidz][q1][q2] = D10 * u0q + D11 * u1q;
716 MFEM_SHARED
real_t BDBBu0[NBZ][max_Q1D][max_D1D];
717 MFEM_SHARED
real_t BDBBu1[NBZ][max_Q1D][max_D1D];
718 MFEM_FOREACH_THREAD(q1, x, Q1D)
720 MFEM_FOREACH_THREAD(d2, y, D1D)
724 for (
int q2 = 0; q2 < Q1D; ++q2)
727 BDBBu0_ +=
b * DBBu0[tidz][q1][q2];
728 BDBBu1_ +=
b * DBBu1[tidz][q1][q2];
730 BDBBu0[tidz][q1][d2] = BDBBu0_;
731 BDBBu1[tidz][q1][d2] = BDBBu1_;
735 MFEM_FOREACH_THREAD(d1, x, D1D)
737 MFEM_FOREACH_THREAD(d2, y, D1D)
741 for (
int q1 = 0; q1 < Q1D; ++q1)
744 BBDBBu0_ +=
b * BDBBu0[tidz][q1][d2];
745 BBDBBu1_ +=
b * BDBBu1[tidz][q1][d2];
747 y(d1, d2, 0,
f) += BBDBBu0_;
748 y(d1, d2, 1,
f) += BBDBBu1_;
756template <
int DIM,
int D1D,
int Q1D>
759 if constexpr (
DIM == 2)
761 return internal::PADGTraceApply2D<D1D, Q1D>;
763 else if constexpr (
DIM == 3)
765 if constexpr (D1D == 3 || D1D == 4)
767 return internal::SmemPADGTraceApply3D<D1D, Q1D, 2>;
771 return internal::SmemPADGTraceApply3D<D1D, Q1D>;
777template <
int DIM,
int D1D,
int Q1D>
780 if constexpr (
DIM == 2)
782 return internal::PADGTraceApplyTranspose2D<D1D, Q1D>;
784 else if constexpr (
DIM == 3)
786 return internal::SmemPADGTraceApplyTranspose3D<D1D, Q1D>;
void(*)(const int, const Array< real_t > &, const Array< real_t > &, const Vector &, const Vector &, Vector &, const int, const int) ApplyKernelType
arguments: nf, B, Bt, pa_data, x, y, dofs1D, quad1D
MFEM_HOST_DEVICE DeviceTensor< sizeof...(Dims), T > Reshape(T *ptr, Dims... dims)
Wrap a pointer as a DeviceTensor with automatically deduced template parameters.
void forall_2D_batch(int N, int X, int Y, int BZ, lambda &&body)
std::function< real_t(const Vector &)> f(real_t mass_coeff)
void forall(int N, lambda &&body)
static const DeviceDofQuadLimits & Get()
Return a const reference to the DeviceDofQuadLimits singleton.