MFEM v4.9.0
Finite element discretization library
Loading...
Searching...
No Matches
bilininteg_dgtrace_kernels.hpp
Go to the documentation of this file.
1// Copyright (c) 2010-2025, Lawrence Livermore National Security, LLC. Produced
2// at the Lawrence Livermore National Laboratory. All Rights reserved. See files
3// LICENSE and NOTICE for details. LLNL-CODE-806117.
4//
5// This file is part of the MFEM library. For more information and source code
6// availability visit https://mfem.org.
7//
8// MFEM is free software; you can redistribute it and/or modify it under the
9// terms of the BSD-3 license. We welcome feedback and contributions, see file
10// CONTRIBUTING.md for details.
11
12#ifndef BILININTEG_DGTRACE_KERNELS_HPP
13#define BILININTEG_DGTRACE_KERNELS_HPP
14
16#include "../bilininteg.hpp"
17#include "../gridfunc.hpp"
18#include "../qfunction.hpp"
19#include "../restriction.hpp"
20
21/// \cond DO_NOT_DOCUMENT
22namespace mfem
23{
24
25namespace internal
26{
27
28// PA DGTrace Apply 2D kernel for Gauss-Lobatto/Bernstein
29template <int T_D1D = 0, int T_Q1D = 0>
30static void PADGTraceApply2D(const int NF, const Array<real_t> &b,
31 const Array<real_t> &bt, const Vector &op_,
32 const Vector &x_, Vector &y_, const int d1d = 0,
33 const int q1d = 0)
34{
35 const int VDIM = 1;
36 const int D1D = T_D1D ? T_D1D : d1d;
37 const int Q1D = T_Q1D ? T_Q1D : q1d;
38 MFEM_VERIFY(D1D <= DeviceDofQuadLimits::Get().MAX_D1D, "");
39 MFEM_VERIFY(Q1D <= DeviceDofQuadLimits::Get().MAX_Q1D, "");
40 auto B = Reshape(b.Read(), Q1D, D1D);
41 auto Bt = Reshape(bt.Read(), D1D, Q1D);
42 auto op = Reshape(op_.Read(), Q1D, 2, 2, NF);
43 auto x = Reshape(x_.Read(), D1D, VDIM, 2, NF);
44 auto y = Reshape(y_.ReadWrite(), D1D, VDIM, 2, NF);
45
46 mfem::forall(NF, [=] MFEM_HOST_DEVICE(int f)
47 {
48 const int VDIM = 1;
49 const int D1D = T_D1D ? T_D1D : d1d;
50 const int Q1D = T_Q1D ? T_Q1D : q1d;
51 // the following variables are evaluated at compile time
52 constexpr int max_D1D = T_D1D ? T_D1D : DofQuadLimits::MAX_D1D;
53 constexpr int max_Q1D = T_Q1D ? T_Q1D : DofQuadLimits::MAX_Q1D;
54 real_t u0[max_D1D][VDIM];
55 real_t u1[max_D1D][VDIM];
56 for (int d = 0; d < D1D; d++)
57 {
58 for (int c = 0; c < VDIM; c++)
59 {
60 u0[d][c] = x(d, c, 0, f);
61 u1[d][c] = x(d, c, 1, f);
62 }
63 }
64 real_t Bu0[max_Q1D][VDIM];
65 real_t Bu1[max_Q1D][VDIM];
66 for (int q = 0; q < Q1D; ++q)
67 {
68 for (int c = 0; c < VDIM; c++)
69 {
70 Bu0[q][c] = 0.0;
71 Bu1[q][c] = 0.0;
72 }
73 for (int d = 0; d < D1D; ++d)
74 {
75 const real_t b = B(q, d);
76 for (int c = 0; c < VDIM; c++)
77 {
78 Bu0[q][c] += b * u0[d][c];
79 Bu1[q][c] += b * u1[d][c];
80 }
81 }
82 }
83 real_t DBu[max_Q1D][VDIM];
84 for (int q = 0; q < Q1D; ++q)
85 {
86 for (int c = 0; c < VDIM; c++)
87 {
88 DBu[q][c] = op(q, 0, 0, f) * Bu0[q][c] + op(q, 1, 0, f) * Bu1[q][c];
89 }
90 }
91 real_t BDBu[max_D1D][VDIM];
92 for (int d = 0; d < D1D; ++d)
93 {
94 for (int c = 0; c < VDIM; c++)
95 {
96 BDBu[d][c] = 0.0;
97 }
98 for (int q = 0; q < Q1D; ++q)
99 {
100 const real_t b = Bt(d, q);
101 for (int c = 0; c < VDIM; c++)
102 {
103 BDBu[d][c] += b * DBu[q][c];
104 }
105 }
106 for (int c = 0; c < VDIM; c++)
107 {
108 y(d, c, 0, f) += BDBu[d][c];
109 y(d, c, 1, f) += -BDBu[d][c];
110 }
111 }
112 });
113}
114
115// PA DGTrace Apply 3D kernel for Gauss-Lobatto/Bernstein
116template <int T_D1D = 0, int T_Q1D = 0>
117static void PADGTraceApply3D(const int NF, const Array<real_t> &b,
118 const Array<real_t> &bt, const Vector &op_,
119 const Vector &x_, Vector &y_, const int d1d = 0,
120 const int q1d = 0)
121{
122 const int VDIM = 1;
123 const int D1D = T_D1D ? T_D1D : d1d;
124 const int Q1D = T_Q1D ? T_Q1D : q1d;
125 MFEM_VERIFY(D1D <= DeviceDofQuadLimits::Get().MAX_D1D, "");
126 MFEM_VERIFY(Q1D <= DeviceDofQuadLimits::Get().MAX_Q1D, "");
127 auto B = Reshape(b.Read(), Q1D, D1D);
128 auto Bt = Reshape(bt.Read(), D1D, Q1D);
129 auto op = Reshape(op_.Read(), Q1D, Q1D, 2, 2, NF);
130 auto x = Reshape(x_.Read(), D1D, D1D, VDIM, 2, NF);
131 auto y = Reshape(y_.ReadWrite(), D1D, D1D, VDIM, 2, NF);
132
133 mfem::forall(NF, [=] MFEM_HOST_DEVICE(int f)
134 {
135 const int VDIM = 1;
136 const int D1D = T_D1D ? T_D1D : d1d;
137 const int Q1D = T_Q1D ? T_Q1D : q1d;
138 // the following variables are evaluated at compile time
139 constexpr int max_D1D = T_D1D ? T_D1D : DofQuadLimits::MAX_D1D;
140 constexpr int max_Q1D = T_Q1D ? T_Q1D : DofQuadLimits::MAX_Q1D;
141 real_t u0[max_D1D][max_D1D][VDIM];
142 real_t u1[max_D1D][max_D1D][VDIM];
143 for (int d1 = 0; d1 < D1D; d1++)
144 {
145 for (int d2 = 0; d2 < D1D; d2++)
146 {
147 for (int c = 0; c < VDIM; c++)
148 {
149 u0[d1][d2][c] = x(d1, d2, c, 0, f);
150 u1[d1][d2][c] = x(d1, d2, c, 1, f);
151 }
152 }
153 }
154 real_t Bu0[max_Q1D][max_D1D][VDIM];
155 real_t Bu1[max_Q1D][max_D1D][VDIM];
156 for (int q = 0; q < Q1D; ++q)
157 {
158 for (int d2 = 0; d2 < D1D; d2++)
159 {
160 for (int c = 0; c < VDIM; c++)
161 {
162 Bu0[q][d2][c] = 0.0;
163 Bu1[q][d2][c] = 0.0;
164 }
165 for (int d1 = 0; d1 < D1D; ++d1)
166 {
167 const real_t b = B(q, d1);
168 for (int c = 0; c < VDIM; c++)
169 {
170 Bu0[q][d2][c] += b * u0[d1][d2][c];
171 Bu1[q][d2][c] += b * u1[d1][d2][c];
172 }
173 }
174 }
175 }
176 real_t BBu0[max_Q1D][max_Q1D][VDIM];
177 real_t BBu1[max_Q1D][max_Q1D][VDIM];
178 for (int q1 = 0; q1 < Q1D; ++q1)
179 {
180 for (int q2 = 0; q2 < Q1D; q2++)
181 {
182 for (int c = 0; c < VDIM; c++)
183 {
184 BBu0[q1][q2][c] = 0.0;
185 BBu1[q1][q2][c] = 0.0;
186 }
187 for (int d2 = 0; d2 < D1D; ++d2)
188 {
189 const real_t b = B(q2, d2);
190 for (int c = 0; c < VDIM; c++)
191 {
192 BBu0[q1][q2][c] += b * Bu0[q1][d2][c];
193 BBu1[q1][q2][c] += b * Bu1[q1][d2][c];
194 }
195 }
196 }
197 }
198 real_t DBBu[max_Q1D][max_Q1D][VDIM];
199 for (int q1 = 0; q1 < Q1D; ++q1)
200 {
201 for (int q2 = 0; q2 < Q1D; q2++)
202 {
203 for (int c = 0; c < VDIM; c++)
204 {
205 DBBu[q1][q2][c] = op(q1, q2, 0, 0, f) * BBu0[q1][q2][c] +
206 op(q1, q2, 1, 0, f) * BBu1[q1][q2][c];
207 }
208 }
209 }
210 real_t BDBBu[max_Q1D][max_D1D][VDIM];
211 for (int q1 = 0; q1 < Q1D; ++q1)
212 {
213 for (int d2 = 0; d2 < D1D; d2++)
214 {
215 for (int c = 0; c < VDIM; c++)
216 {
217 BDBBu[q1][d2][c] = 0.0;
218 }
219 for (int q2 = 0; q2 < Q1D; ++q2)
220 {
221 const real_t b = Bt(d2, q2);
222 for (int c = 0; c < VDIM; c++)
223 {
224 BDBBu[q1][d2][c] += b * DBBu[q1][q2][c];
225 }
226 }
227 }
228 }
229 real_t BBDBBu[max_D1D][max_D1D][VDIM];
230 for (int d1 = 0; d1 < D1D; ++d1)
231 {
232 for (int d2 = 0; d2 < D1D; d2++)
233 {
234 for (int c = 0; c < VDIM; c++)
235 {
236 BBDBBu[d1][d2][c] = 0.0;
237 }
238 for (int q1 = 0; q1 < Q1D; ++q1)
239 {
240 const real_t b = Bt(d1, q1);
241 for (int c = 0; c < VDIM; c++)
242 {
243 BBDBBu[d1][d2][c] += b * BDBBu[q1][d2][c];
244 }
245 }
246 for (int c = 0; c < VDIM; c++)
247 {
248 y(d1, d2, c, 0, f) += BBDBBu[d1][d2][c];
249 y(d1, d2, c, 1, f) += -BBDBBu[d1][d2][c];
250 }
251 }
252 }
253 });
254}
255
256// Optimized PA DGTrace Apply 3D kernel for Gauss-Lobatto/Bernstein
257template <int T_D1D = 0, int T_Q1D = 0, int T_NBZ = 0>
258static void SmemPADGTraceApply3D(const int NF, const Array<real_t> &b,
259 const Array<real_t> &bt, const Vector &op_,
260 const Vector &x_, Vector &y_,
261 const int d1d = 0, const int q1d = 0)
262{
263 const int D1D = T_D1D ? T_D1D : d1d;
264 const int Q1D = T_Q1D ? T_Q1D : q1d;
265 constexpr int NBZ = T_NBZ ? T_NBZ : 1;
266 MFEM_VERIFY(D1D <= DeviceDofQuadLimits::Get().MAX_D1D, "");
267 MFEM_VERIFY(Q1D <= DeviceDofQuadLimits::Get().MAX_Q1D, "");
268 auto B = Reshape(b.Read(), Q1D, D1D);
269 auto Bt = Reshape(bt.Read(), D1D, Q1D);
270 auto op = Reshape(op_.Read(), Q1D, Q1D, 2, 2, NF);
271 auto x = Reshape(x_.Read(), D1D, D1D, 2, NF);
272 auto y = Reshape(y_.ReadWrite(), D1D, D1D, 2, NF);
273
274 mfem::forall_2D_batch(NF, Q1D, Q1D, NBZ, [=] MFEM_HOST_DEVICE(int f)
275 {
276 const int tidz = MFEM_THREAD_ID(z);
277 const int D1D = T_D1D ? T_D1D : d1d;
278 const int Q1D = T_Q1D ? T_Q1D : q1d;
279 // the following variables are evaluated at compile time
280 constexpr int NBZ = T_NBZ ? T_NBZ : 1;
281 constexpr int max_D1D = T_D1D ? T_D1D : DofQuadLimits::MAX_D1D;
282 constexpr int max_Q1D = T_Q1D ? T_Q1D : DofQuadLimits::MAX_Q1D;
283 MFEM_SHARED real_t u0[NBZ][max_D1D][max_D1D];
284 MFEM_SHARED real_t u1[NBZ][max_D1D][max_D1D];
285 MFEM_FOREACH_THREAD(d1, x, D1D)
286 {
287 MFEM_FOREACH_THREAD(d2, y, D1D)
288 {
289 u0[tidz][d1][d2] = x(d1, d2, 0, f);
290 u1[tidz][d1][d2] = x(d1, d2, 1, f);
291 }
292 }
293 MFEM_SYNC_THREAD;
294 MFEM_SHARED real_t Bu0[NBZ][max_Q1D][max_D1D];
295 MFEM_SHARED real_t Bu1[NBZ][max_Q1D][max_D1D];
296 MFEM_FOREACH_THREAD(q1, x, Q1D)
297 {
298 MFEM_FOREACH_THREAD(d2, y, D1D)
299 {
300 real_t Bu0_ = 0.0;
301 real_t Bu1_ = 0.0;
302 for (int d1 = 0; d1 < D1D; ++d1)
303 {
304 const real_t b = B(q1, d1);
305 Bu0_ += b * u0[tidz][d1][d2];
306 Bu1_ += b * u1[tidz][d1][d2];
307 }
308 Bu0[tidz][q1][d2] = Bu0_;
309 Bu1[tidz][q1][d2] = Bu1_;
310 }
311 }
312 MFEM_SYNC_THREAD;
313 MFEM_SHARED real_t BBu0[NBZ][max_Q1D][max_Q1D];
314 MFEM_SHARED real_t BBu1[NBZ][max_Q1D][max_Q1D];
315 MFEM_FOREACH_THREAD(q1, x, Q1D)
316 {
317 MFEM_FOREACH_THREAD(q2, y, Q1D)
318 {
319 real_t BBu0_ = 0.0;
320 real_t BBu1_ = 0.0;
321 for (int d2 = 0; d2 < D1D; ++d2)
322 {
323 const real_t b = B(q2, d2);
324 BBu0_ += b * Bu0[tidz][q1][d2];
325 BBu1_ += b * Bu1[tidz][q1][d2];
326 }
327 BBu0[tidz][q1][q2] = BBu0_;
328 BBu1[tidz][q1][q2] = BBu1_;
329 }
330 }
331 MFEM_SYNC_THREAD;
332 MFEM_SHARED real_t DBBu[NBZ][max_Q1D][max_Q1D];
333 MFEM_FOREACH_THREAD(q1, x, Q1D)
334 {
335 MFEM_FOREACH_THREAD(q2, y, Q1D)
336 {
337 DBBu[tidz][q1][q2] = op(q1, q2, 0, 0, f) * BBu0[tidz][q1][q2] +
338 op(q1, q2, 1, 0, f) * BBu1[tidz][q1][q2];
339 }
340 }
341 MFEM_SYNC_THREAD;
342 MFEM_SHARED real_t BDBBu[NBZ][max_Q1D][max_D1D];
343 MFEM_FOREACH_THREAD(q1, x, Q1D)
344 {
345 MFEM_FOREACH_THREAD(d2, y, D1D)
346 {
347 real_t BDBBu_ = 0.0;
348 for (int q2 = 0; q2 < Q1D; ++q2)
349 {
350 const real_t b = Bt(d2, q2);
351 BDBBu_ += b * DBBu[tidz][q1][q2];
352 }
353 BDBBu[tidz][q1][d2] = BDBBu_;
354 }
355 }
356 MFEM_SYNC_THREAD;
357 MFEM_FOREACH_THREAD(d1, x, D1D)
358 {
359 MFEM_FOREACH_THREAD(d2, y, D1D)
360 {
361 real_t BBDBBu_ = 0.0;
362 for (int q1 = 0; q1 < Q1D; ++q1)
363 {
364 const real_t b = Bt(d1, q1);
365 BBDBBu_ += b * BDBBu[tidz][q1][d2];
366 }
367 y(d1, d2, 0, f) += BBDBBu_;
368 y(d1, d2, 1, f) += -BBDBBu_;
369 }
370 }
371 });
372}
373
374// PA DGTrace Apply 2D kernel for Gauss-Lobatto/Bernstein
375template <int T_D1D = 0, int T_Q1D = 0>
376static void PADGTraceApplyTranspose2D(const int NF, const Array<real_t> &b,
377 const Array<real_t> &bt,
378 const Vector &op_, const Vector &x_,
379 Vector &y_, const int d1d = 0,
380 const int q1d = 0)
381{
382 const int VDIM = 1;
383 const int D1D = T_D1D ? T_D1D : d1d;
384 const int Q1D = T_Q1D ? T_Q1D : q1d;
385 MFEM_VERIFY(D1D <= DeviceDofQuadLimits::Get().MAX_D1D, "");
386 MFEM_VERIFY(Q1D <= DeviceDofQuadLimits::Get().MAX_Q1D, "");
387 auto B = Reshape(b.Read(), Q1D, D1D);
388 auto Bt = Reshape(bt.Read(), D1D, Q1D);
389 auto op = Reshape(op_.Read(), Q1D, 2, 2, NF);
390 auto x = Reshape(x_.Read(), D1D, VDIM, 2, NF);
391 auto y = Reshape(y_.ReadWrite(), D1D, VDIM, 2, NF);
392
393 mfem::forall(NF, [=] MFEM_HOST_DEVICE(int f)
394 {
395 const int VDIM = 1;
396 const int D1D = T_D1D ? T_D1D : d1d;
397 const int Q1D = T_Q1D ? T_Q1D : q1d;
398 // the following variables are evaluated at compile time
399 constexpr int max_D1D = T_D1D ? T_D1D : DofQuadLimits::MAX_D1D;
400 constexpr int max_Q1D = T_Q1D ? T_Q1D : DofQuadLimits::MAX_Q1D;
401 real_t u0[max_D1D][VDIM];
402 real_t u1[max_D1D][VDIM];
403 for (int d = 0; d < D1D; d++)
404 {
405 for (int c = 0; c < VDIM; c++)
406 {
407 u0[d][c] = x(d, c, 0, f);
408 u1[d][c] = x(d, c, 1, f);
409 }
410 }
411 real_t Bu0[max_Q1D][VDIM];
412 real_t Bu1[max_Q1D][VDIM];
413 for (int q = 0; q < Q1D; ++q)
414 {
415 for (int c = 0; c < VDIM; c++)
416 {
417 Bu0[q][c] = 0.0;
418 Bu1[q][c] = 0.0;
419 }
420 for (int d = 0; d < D1D; ++d)
421 {
422 const real_t b = B(q, d);
423 for (int c = 0; c < VDIM; c++)
424 {
425 Bu0[q][c] += b * u0[d][c];
426 Bu1[q][c] += b * u1[d][c];
427 }
428 }
429 }
430 real_t DBu0[max_Q1D][VDIM];
431 real_t DBu1[max_Q1D][VDIM];
432 for (int q = 0; q < Q1D; ++q)
433 {
434 for (int c = 0; c < VDIM; c++)
435 {
436 DBu0[q][c] =
437 op(q, 0, 0, f) * Bu0[q][c] + op(q, 0, 1, f) * Bu1[q][c];
438 DBu1[q][c] =
439 op(q, 1, 0, f) * Bu0[q][c] + op(q, 1, 1, f) * Bu1[q][c];
440 }
441 }
442 real_t BDBu0[max_D1D][VDIM];
443 real_t BDBu1[max_D1D][VDIM];
444 for (int d = 0; d < D1D; ++d)
445 {
446 for (int c = 0; c < VDIM; c++)
447 {
448 BDBu0[d][c] = 0.0;
449 BDBu1[d][c] = 0.0;
450 }
451 for (int q = 0; q < Q1D; ++q)
452 {
453 const real_t b = Bt(d, q);
454 for (int c = 0; c < VDIM; c++)
455 {
456 BDBu0[d][c] += b * DBu0[q][c];
457 BDBu1[d][c] += b * DBu1[q][c];
458 }
459 }
460 for (int c = 0; c < VDIM; c++)
461 {
462 y(d, c, 0, f) += BDBu0[d][c];
463 y(d, c, 1, f) += BDBu1[d][c];
464 }
465 }
466 });
467}
468
469// PA DGTrace Apply Transpose 3D kernel for Gauss-Lobatto/Bernstein
470template <int T_D1D = 0, int T_Q1D = 0>
471static void PADGTraceApplyTranspose3D(const int NF, const Array<real_t> &b,
472 const Array<real_t> &bt,
473 const Vector &op_, const Vector &x_,
474 Vector &y_, const int d1d = 0,
475 const int q1d = 0)
476{
477 const int VDIM = 1;
478 const int D1D = T_D1D ? T_D1D : d1d;
479 const int Q1D = T_Q1D ? T_Q1D : q1d;
480 MFEM_VERIFY(D1D <= DeviceDofQuadLimits::Get().MAX_D1D, "");
481 MFEM_VERIFY(Q1D <= DeviceDofQuadLimits::Get().MAX_Q1D, "");
482 auto B = Reshape(b.Read(), Q1D, D1D);
483 auto Bt = Reshape(bt.Read(), D1D, Q1D);
484 auto op = Reshape(op_.Read(), Q1D, Q1D, 2, 2, NF);
485 auto x = Reshape(x_.Read(), D1D, D1D, VDIM, 2, NF);
486 auto y = Reshape(y_.ReadWrite(), D1D, D1D, VDIM, 2, NF);
487
488 mfem::forall(NF, [=] MFEM_HOST_DEVICE(int f)
489 {
490 const int VDIM = 1;
491 const int D1D = T_D1D ? T_D1D : d1d;
492 const int Q1D = T_Q1D ? T_Q1D : q1d;
493 // the following variables are evaluated at compile time
494 constexpr int max_D1D = T_D1D ? T_D1D : DofQuadLimits::MAX_D1D;
495 constexpr int max_Q1D = T_Q1D ? T_Q1D : DofQuadLimits::MAX_Q1D;
496 real_t u0[max_D1D][max_D1D][VDIM];
497 real_t u1[max_D1D][max_D1D][VDIM];
498 for (int d1 = 0; d1 < D1D; d1++)
499 {
500 for (int d2 = 0; d2 < D1D; d2++)
501 {
502 for (int c = 0; c < VDIM; c++)
503 {
504 u0[d1][d2][c] = x(d1, d2, c, 0, f);
505 u1[d1][d2][c] = x(d1, d2, c, 1, f);
506 }
507 }
508 }
509 real_t Bu0[max_Q1D][max_D1D][VDIM];
510 real_t Bu1[max_Q1D][max_D1D][VDIM];
511 for (int q1 = 0; q1 < Q1D; ++q1)
512 {
513 for (int d2 = 0; d2 < D1D; ++d2)
514 {
515 for (int c = 0; c < VDIM; c++)
516 {
517 Bu0[q1][d2][c] = 0.0;
518 Bu1[q1][d2][c] = 0.0;
519 }
520 for (int d1 = 0; d1 < D1D; ++d1)
521 {
522 const real_t b = B(q1, d1);
523 for (int c = 0; c < VDIM; c++)
524 {
525 Bu0[q1][d2][c] += b * u0[d1][d2][c];
526 Bu1[q1][d2][c] += b * u1[d1][d2][c];
527 }
528 }
529 }
530 }
531 real_t BBu0[max_Q1D][max_Q1D][VDIM];
532 real_t BBu1[max_Q1D][max_Q1D][VDIM];
533 for (int q1 = 0; q1 < Q1D; ++q1)
534 {
535 for (int q2 = 0; q2 < Q1D; ++q2)
536 {
537 for (int c = 0; c < VDIM; c++)
538 {
539 BBu0[q1][q2][c] = 0.0;
540 BBu1[q1][q2][c] = 0.0;
541 }
542 for (int d2 = 0; d2 < D1D; ++d2)
543 {
544 const real_t b = B(q2, d2);
545 for (int c = 0; c < VDIM; c++)
546 {
547 BBu0[q1][q2][c] += b * Bu0[q1][d2][c];
548 BBu1[q1][q2][c] += b * Bu1[q1][d2][c];
549 }
550 }
551 }
552 }
553 real_t DBu0[max_Q1D][max_Q1D][VDIM];
554 real_t DBu1[max_Q1D][max_Q1D][VDIM];
555 for (int q1 = 0; q1 < Q1D; ++q1)
556 {
557 for (int q2 = 0; q2 < Q1D; ++q2)
558 {
559 const real_t D00 = op(q1, q2, 0, 0, f);
560 const real_t D01 = op(q1, q2, 0, 1, f);
561 const real_t D10 = op(q1, q2, 1, 0, f);
562 const real_t D11 = op(q1, q2, 1, 1, f);
563 for (int c = 0; c < VDIM; c++)
564 {
565 DBu0[q1][q2][c] = D00 * BBu0[q1][q2][c] + D01 * BBu1[q1][q2][c];
566 DBu1[q1][q2][c] = D10 * BBu0[q1][q2][c] + D11 * BBu1[q1][q2][c];
567 }
568 }
569 }
570 real_t BDBu0[max_D1D][max_Q1D][VDIM];
571 real_t BDBu1[max_D1D][max_Q1D][VDIM];
572 for (int d1 = 0; d1 < D1D; ++d1)
573 {
574 for (int q2 = 0; q2 < Q1D; ++q2)
575 {
576 for (int c = 0; c < VDIM; c++)
577 {
578 BDBu0[d1][q2][c] = 0.0;
579 BDBu1[d1][q2][c] = 0.0;
580 }
581 for (int q1 = 0; q1 < Q1D; ++q1)
582 {
583 const real_t b = Bt(d1, q1);
584 for (int c = 0; c < VDIM; c++)
585 {
586 BDBu0[d1][q2][c] += b * DBu0[q1][q2][c];
587 BDBu1[d1][q2][c] += b * DBu1[q1][q2][c];
588 }
589 }
590 }
591 }
592 real_t BBDBu0[max_D1D][max_D1D][VDIM];
593 real_t BBDBu1[max_D1D][max_D1D][VDIM];
594 for (int d1 = 0; d1 < D1D; ++d1)
595 {
596 for (int d2 = 0; d2 < D1D; ++d2)
597 {
598 for (int c = 0; c < VDIM; c++)
599 {
600 BBDBu0[d1][d2][c] = 0.0;
601 BBDBu1[d1][d2][c] = 0.0;
602 }
603 for (int q2 = 0; q2 < Q1D; ++q2)
604 {
605 const real_t b = Bt(d2, q2);
606 for (int c = 0; c < VDIM; c++)
607 {
608 BBDBu0[d1][d2][c] += b * BDBu0[d1][q2][c];
609 BBDBu1[d1][d2][c] += b * BDBu1[d1][q2][c];
610 }
611 }
612 for (int c = 0; c < VDIM; c++)
613 {
614 y(d1, d2, c, 0, f) += BBDBu0[d1][d2][c];
615 y(d1, d2, c, 1, f) += BBDBu1[d1][d2][c];
616 }
617 }
618 }
619 });
620}
621
622// Optimized PA DGTrace Apply Transpose 3D kernel for Gauss-Lobatto/Bernstein
623template <int T_D1D = 0, int T_Q1D = 0, int T_NBZ = 0>
624static void SmemPADGTraceApplyTranspose3D(const int NF, const Array<real_t> &b,
625 const Array<real_t> &bt,
626 const Vector &op_, const Vector &x_,
627 Vector &y_, const int d1d = 0,
628 const int q1d = 0)
629{
630 const int D1D = T_D1D ? T_D1D : d1d;
631 const int Q1D = T_Q1D ? T_Q1D : q1d;
632 constexpr int NBZ = T_NBZ ? T_NBZ : 1;
633 MFEM_VERIFY(D1D <= DeviceDofQuadLimits::Get().MAX_D1D, "");
634 MFEM_VERIFY(Q1D <= DeviceDofQuadLimits::Get().MAX_Q1D, "");
635 auto B = Reshape(b.Read(), Q1D, D1D);
636 auto Bt = Reshape(bt.Read(), D1D, Q1D);
637 auto op = Reshape(op_.Read(), Q1D, Q1D, 2, 2, NF);
638 auto x = Reshape(x_.Read(), D1D, D1D, 2, NF);
639 auto y = Reshape(y_.ReadWrite(), D1D, D1D, 2, NF);
640
641 mfem::forall_2D_batch(NF, Q1D, Q1D, NBZ, [=] MFEM_HOST_DEVICE(int f)
642 {
643 const int tidz = MFEM_THREAD_ID(z);
644 const int D1D = T_D1D ? T_D1D : d1d;
645 const int Q1D = T_Q1D ? T_Q1D : q1d;
646 // the following variables are evaluated at compile time
647 constexpr int NBZ = T_NBZ ? T_NBZ : 1;
648 constexpr int max_D1D = T_D1D ? T_D1D : DofQuadLimits::MAX_D1D;
649 constexpr int max_Q1D = T_Q1D ? T_Q1D : DofQuadLimits::MAX_Q1D;
650 MFEM_SHARED real_t u0[NBZ][max_D1D][max_D1D];
651 MFEM_SHARED real_t u1[NBZ][max_D1D][max_D1D];
652 MFEM_FOREACH_THREAD(d1, x, D1D)
653 {
654 MFEM_FOREACH_THREAD(d2, y, D1D)
655 {
656 u0[tidz][d1][d2] = x(d1, d2, 0, f);
657 u1[tidz][d1][d2] = x(d1, d2, 1, f);
658 }
659 }
660 MFEM_SYNC_THREAD;
661 MFEM_SHARED real_t Bu0[NBZ][max_Q1D][max_D1D];
662 MFEM_SHARED real_t Bu1[NBZ][max_Q1D][max_D1D];
663 MFEM_FOREACH_THREAD(q1, x, Q1D)
664 {
665 MFEM_FOREACH_THREAD(d2, y, D1D)
666 {
667 real_t Bu0_ = 0.0;
668 real_t Bu1_ = 0.0;
669 for (int d1 = 0; d1 < D1D; ++d1)
670 {
671 const real_t b = B(q1, d1);
672 Bu0_ += b * u0[tidz][d1][d2];
673 Bu1_ += b * u1[tidz][d1][d2];
674 }
675 Bu0[tidz][q1][d2] = Bu0_;
676 Bu1[tidz][q1][d2] = Bu1_;
677 }
678 }
679 MFEM_SYNC_THREAD;
680 MFEM_SHARED real_t BBu0[NBZ][max_Q1D][max_Q1D];
681 MFEM_SHARED real_t BBu1[NBZ][max_Q1D][max_Q1D];
682 MFEM_FOREACH_THREAD(q1, x, Q1D)
683 {
684 MFEM_FOREACH_THREAD(q2, y, Q1D)
685 {
686 real_t BBu0_ = 0.0;
687 real_t BBu1_ = 0.0;
688 for (int d2 = 0; d2 < D1D; ++d2)
689 {
690 const real_t b = B(q2, d2);
691 BBu0_ += b * Bu0[tidz][q1][d2];
692 BBu1_ += b * Bu1[tidz][q1][d2];
693 }
694 BBu0[tidz][q1][q2] = BBu0_;
695 BBu1[tidz][q1][q2] = BBu1_;
696 }
697 }
698 MFEM_SYNC_THREAD;
699 MFEM_SHARED real_t DBBu0[NBZ][max_Q1D][max_Q1D];
700 MFEM_SHARED real_t DBBu1[NBZ][max_Q1D][max_Q1D];
701 MFEM_FOREACH_THREAD(q1, x, Q1D)
702 {
703 MFEM_FOREACH_THREAD(q2, y, Q1D)
704 {
705 const real_t D00 = op(q1, q2, 0, 0, f);
706 const real_t D01 = op(q1, q2, 0, 1, f);
707 const real_t D10 = op(q1, q2, 1, 0, f);
708 const real_t D11 = op(q1, q2, 1, 1, f);
709 const real_t u0q = BBu0[tidz][q1][q2];
710 const real_t u1q = BBu1[tidz][q1][q2];
711 DBBu0[tidz][q1][q2] = D00 * u0q + D01 * u1q;
712 DBBu1[tidz][q1][q2] = D10 * u0q + D11 * u1q;
713 }
714 }
715 MFEM_SYNC_THREAD;
716 MFEM_SHARED real_t BDBBu0[NBZ][max_Q1D][max_D1D];
717 MFEM_SHARED real_t BDBBu1[NBZ][max_Q1D][max_D1D];
718 MFEM_FOREACH_THREAD(q1, x, Q1D)
719 {
720 MFEM_FOREACH_THREAD(d2, y, D1D)
721 {
722 real_t BDBBu0_ = 0.0;
723 real_t BDBBu1_ = 0.0;
724 for (int q2 = 0; q2 < Q1D; ++q2)
725 {
726 const real_t b = Bt(d2, q2);
727 BDBBu0_ += b * DBBu0[tidz][q1][q2];
728 BDBBu1_ += b * DBBu1[tidz][q1][q2];
729 }
730 BDBBu0[tidz][q1][d2] = BDBBu0_;
731 BDBBu1[tidz][q1][d2] = BDBBu1_;
732 }
733 }
734 MFEM_SYNC_THREAD;
735 MFEM_FOREACH_THREAD(d1, x, D1D)
736 {
737 MFEM_FOREACH_THREAD(d2, y, D1D)
738 {
739 real_t BBDBBu0_ = 0.0;
740 real_t BBDBBu1_ = 0.0;
741 for (int q1 = 0; q1 < Q1D; ++q1)
742 {
743 const real_t b = Bt(d1, q1);
744 BBDBBu0_ += b * BDBBu0[tidz][q1][d2];
745 BBDBBu1_ += b * BDBBu1[tidz][q1][d2];
746 }
747 y(d1, d2, 0, f) += BBDBBu0_;
748 y(d1, d2, 1, f) += BBDBBu1_;
749 }
750 }
751 });
752}
753
754} // namespace internal
755
756template <int DIM, int D1D, int Q1D>
757DGTraceIntegrator::ApplyKernelType DGTraceIntegrator::ApplyPAKernels::Kernel()
758{
759 if constexpr (DIM == 2)
760 {
761 return internal::PADGTraceApply2D<D1D, Q1D>;
762 }
763 else if constexpr (DIM == 3)
764 {
765 if constexpr (D1D == 3 || D1D == 4)
766 {
767 return internal::SmemPADGTraceApply3D<D1D, Q1D, 2>;
768 }
769 else
770 {
771 return internal::SmemPADGTraceApply3D<D1D, Q1D>;
772 }
773 }
774 MFEM_ABORT("");
775}
776
777template <int DIM, int D1D, int Q1D>
778DGTraceIntegrator::ApplyKernelType DGTraceIntegrator::ApplyPATKernels::Kernel()
779{
780 if constexpr (DIM == 2)
781 {
782 return internal::PADGTraceApplyTranspose2D<D1D, Q1D>;
783 }
784 else if constexpr (DIM == 3)
785 {
786 return internal::SmemPADGTraceApplyTranspose3D<D1D, Q1D>;
787 }
788 MFEM_ABORT("");
789}
790} // namespace mfem
791
792/// \endcond DO_NOT_DOCUMENT
793#endif
void(*)(const int, const Array< real_t > &, const Array< real_t > &, const Vector &, const Vector &, Vector &, const int, const int) ApplyKernelType
arguments: nf, B, Bt, pa_data, x, y, dofs1D, quad1D
real_t b
Definition lissajous.cpp:42
constexpr int DIM
mfem::real_t real_t
MFEM_HOST_DEVICE DeviceTensor< sizeof...(Dims), T > Reshape(T *ptr, Dims... dims)
Wrap a pointer as a DeviceTensor with automatically deduced template parameters.
Definition dtensor.hpp:138
void forall_2D_batch(int N, int X, int Y, int BZ, lambda &&body)
Definition forall.hpp:931
std::function< real_t(const Vector &)> f(real_t mass_coeff)
Definition lor_mms.hpp:30
void forall(int N, lambda &&body)
Definition forall.hpp:839
static const DeviceDofQuadLimits & Get()
Return a const reference to the DeviceDofQuadLimits singleton.
Definition forall.hpp:128