MFEM v4.9.0
Finite element discretization library
Loading...
Searching...
No Matches
bilininteg_convection_kernels.hpp
Go to the documentation of this file.
1// Copyright (c) 2010-2025, Lawrence Livermore National Security, LLC. Produced
2// at the Lawrence Livermore National Laboratory. All Rights reserved. See files
3// LICENSE and NOTICE for details. LLNL-CODE-806117.
4//
5// This file is part of the MFEM library. For more information and source code
6// availability visit https://mfem.org.
7//
8// MFEM is free software; you can redistribute it and/or modify it under the
9// terms of the BSD-3 license. We welcome feedback and contributions, see file
10// CONTRIBUTING.md for details.
11
12#ifndef MFEM_BILININTEG_CONVECTION_KERNELS_HPP
13#define MFEM_BILININTEG_CONVECTION_KERNELS_HPP
14
16#include "../bilininteg.hpp"
17#include "../gridfunc.hpp"
18#include "../qfunction.hpp"
20
21/// \cond DO_NOT_DOCUMENT
22namespace mfem
23{
24
25// PA Convection Apply 2D kernel
26template<int T_D1D = 0, int T_Q1D = 0> static
27void PAConvectionApply2D(const int ne,
28 const Array<real_t> &b,
29 const Array<real_t> &g,
30 const Array<real_t> &bt,
31 const Array<real_t> &gt,
32 const Vector &op_,
33 const Vector &x_,
34 Vector &y_,
35 const int d1d = 0,
36 const int q1d = 0)
37{
38 const int NE = ne;
39 const int D1D = T_D1D ? T_D1D : d1d;
40 const int Q1D = T_Q1D ? T_Q1D : q1d;
41 MFEM_VERIFY(D1D <= DeviceDofQuadLimits::Get().MAX_D1D, "");
42 MFEM_VERIFY(Q1D <= DeviceDofQuadLimits::Get().MAX_Q1D, "");
43 auto B = Reshape(b.Read(), Q1D, D1D);
44 auto G = Reshape(g.Read(), Q1D, D1D);
45 auto Bt = Reshape(bt.Read(), D1D, Q1D);
46 auto op = Reshape(op_.Read(), Q1D, Q1D, 2, NE);
47 auto x = Reshape(x_.Read(), D1D, D1D, NE);
48 auto y = Reshape(y_.ReadWrite(), D1D, D1D, NE);
49 mfem::forall(NE, [=] MFEM_HOST_DEVICE (int e)
50 {
51 const int D1D = T_D1D ? T_D1D : d1d;
52 const int Q1D = T_Q1D ? T_Q1D : q1d;
53 // the following variables are evaluated at compile time
54 constexpr int max_D1D = T_D1D ? T_D1D : DofQuadLimits::MAX_D1D;
55 constexpr int max_Q1D = T_Q1D ? T_Q1D : DofQuadLimits::MAX_Q1D;
56
57 real_t u[max_D1D][max_D1D];
58 for (int dy = 0; dy < D1D; ++dy)
59 {
60 for (int dx = 0; dx < D1D; ++dx)
61 {
62 u[dy][dx] = x(dx,dy,e);
63 }
64 }
65 real_t Bu[max_D1D][max_Q1D];
66 real_t Gu[max_D1D][max_Q1D];
67 for (int dy = 0; dy < D1D; ++dy)
68 {
69 for (int qx = 0; qx < Q1D; ++qx)
70 {
71 Bu[dy][qx] = 0.0;
72 Gu[dy][qx] = 0.0;
73 for (int dx = 0; dx < D1D; ++dx)
74 {
75 const real_t bx = B(qx,dx);
76 const real_t gx = G(qx,dx);
77 const real_t x = u[dy][dx];
78 Bu[dy][qx] += bx * x;
79 Gu[dy][qx] += gx * x;
80 }
81 }
82 }
83 real_t GBu[max_Q1D][max_Q1D];
84 real_t BGu[max_Q1D][max_Q1D];
85 for (int qx = 0; qx < Q1D; ++qx)
86 {
87 for (int qy = 0; qy < Q1D; ++qy)
88 {
89 GBu[qy][qx] = 0.0;
90 BGu[qy][qx] = 0.0;
91 for (int dy = 0; dy < D1D; ++dy)
92 {
93 const real_t bx = B(qy,dy);
94 const real_t gx = G(qy,dy);
95 GBu[qy][qx] += gx * Bu[dy][qx];
96 BGu[qy][qx] += bx * Gu[dy][qx];
97 }
98 }
99 }
100 // Calculate Dxy, xDy in plane
101 real_t DGu[max_Q1D][max_Q1D];
102 for (int qy = 0; qy < Q1D; ++qy)
103 {
104 for (int qx = 0; qx < Q1D; ++qx)
105 {
106 const real_t O1 = op(qx,qy,0,e);
107 const real_t O2 = op(qx,qy,1,e);
108
109 const real_t gradX = BGu[qy][qx];
110 const real_t gradY = GBu[qy][qx];
111
112 DGu[qy][qx] = (O1 * gradX) + (O2 * gradY);
113 }
114 }
115 real_t BDGu[max_D1D][max_Q1D];
116 for (int qx = 0; qx < Q1D; ++qx)
117 {
118 for (int dy = 0; dy < D1D; ++dy)
119 {
120 BDGu[dy][qx] = 0.0;
121 for (int qy = 0; qy < Q1D; ++qy)
122 {
123 const real_t w = Bt(dy,qy);
124 BDGu[dy][qx] += w * DGu[qy][qx];
125 }
126 }
127 }
128 for (int dx = 0; dx < D1D; ++dx)
129 {
130 for (int dy = 0; dy < D1D; ++dy)
131 {
132 real_t BBDGu = 0.0;
133 for (int qx = 0; qx < Q1D; ++qx)
134 {
135 const real_t w = Bt(dx,qx);
136 BBDGu += w * BDGu[dy][qx];
137 }
138 y(dx,dy,e) += BBDGu;
139 }
140 }
141 });
142}
143
144// Optimized PA Convection Apply 2D kernel
145template<int T_D1D = 0, int T_Q1D = 0, int T_NBZ = 0> static
146void SmemPAConvectionApply2D(const int ne,
147 const Array<real_t> &b,
148 const Array<real_t> &g,
149 const Array<real_t> &bt,
150 const Array<real_t> &gt,
151 const Vector &op_,
152 const Vector &x_,
153 Vector &y_,
154 const int d1d = 0,
155 const int q1d = 0)
156{
157 const int NE = ne;
158 const int D1D = T_D1D ? T_D1D : d1d;
159 const int Q1D = T_Q1D ? T_Q1D : q1d;
160 constexpr int NBZ = T_NBZ ? T_NBZ : 1;
161 MFEM_VERIFY(D1D <= DeviceDofQuadLimits::Get().MAX_D1D, "");
162 MFEM_VERIFY(Q1D <= DeviceDofQuadLimits::Get().MAX_Q1D, "");
163 auto B = Reshape(b.Read(), Q1D, D1D);
164 auto G = Reshape(g.Read(), Q1D, D1D);
165 auto Bt = Reshape(bt.Read(), D1D, Q1D);
166 auto op = Reshape(op_.Read(), Q1D, Q1D, 2, NE);
167 auto x = Reshape(x_.Read(), D1D, D1D, NE);
168 auto y = Reshape(y_.ReadWrite(), D1D, D1D, NE);
169 mfem::forall_2D_batch(NE, Q1D, Q1D, NBZ, [=] MFEM_HOST_DEVICE (int e)
170 {
171 const int tidz = MFEM_THREAD_ID(z);
172 const int D1D = T_D1D ? T_D1D : d1d;
173 const int Q1D = T_Q1D ? T_Q1D : q1d;
174 // the following variables are evaluated at compile time
175 constexpr int NBZ = T_NBZ ? T_NBZ : 1;
176 constexpr int max_D1D = T_D1D ? T_D1D : DofQuadLimits::MAX_D1D;
177 constexpr int max_Q1D = T_Q1D ? T_Q1D : DofQuadLimits::MAX_Q1D;
178 // constexpr int MDQ = (max_Q1D > max_D1D) ? max_Q1D : max_D1D;
179 MFEM_SHARED real_t u[NBZ][max_D1D][max_D1D];
180 MFEM_FOREACH_THREAD(dy,y,D1D)
181 {
182 MFEM_FOREACH_THREAD(dx,x,D1D)
183 {
184 // e is really equal to e+tidz
185 u[tidz][dy][dx] = x(dx,dy,e);
186 }
187 }
188 MFEM_SYNC_THREAD;
189 MFEM_SHARED real_t Bu[NBZ][max_D1D][max_Q1D];
190 MFEM_SHARED real_t Gu[NBZ][max_D1D][max_Q1D];
191 MFEM_FOREACH_THREAD(dy,y,D1D)
192 {
193 MFEM_FOREACH_THREAD(qx,x,Q1D)
194 {
195 Bu[tidz][dy][qx] = 0.0;
196 Gu[tidz][dy][qx] = 0.0;
197 for (int dx = 0; dx < D1D; ++dx)
198 {
199 const real_t bx = B(qx,dx);
200 const real_t gx = G(qx,dx);
201 const real_t x = u[tidz][dy][dx];
202 Bu[tidz][dy][qx] += bx * x;
203 Gu[tidz][dy][qx] += gx * x;
204 }
205 }
206 }
207 MFEM_SYNC_THREAD;
208 MFEM_SHARED real_t GBu[NBZ][max_Q1D][max_Q1D];
209 MFEM_SHARED real_t BGu[NBZ][max_Q1D][max_Q1D];
210 MFEM_FOREACH_THREAD(qx,x,Q1D)
211 {
212 MFEM_FOREACH_THREAD(qy,y,Q1D)
213 {
214 GBu[tidz][qy][qx] = 0.0;
215 BGu[tidz][qy][qx] = 0.0;
216 for (int dy = 0; dy < D1D; ++dy)
217 {
218 const real_t bx = B(qy,dy);
219 const real_t gx = G(qy,dy);
220 GBu[tidz][qy][qx] += gx * Bu[tidz][dy][qx];
221 BGu[tidz][qy][qx] += bx * Gu[tidz][dy][qx];
222 }
223 }
224 }
225 MFEM_SYNC_THREAD;
226 // Calculate Dxy, xDy in plane
227 MFEM_SHARED real_t DGu[NBZ][max_Q1D][max_Q1D];
228 MFEM_FOREACH_THREAD(qy,y,Q1D)
229 {
230 MFEM_FOREACH_THREAD(qx,x,Q1D)
231 {
232 const real_t O1 = op(qx,qy,0,e);
233 const real_t O2 = op(qx,qy,1,e);
234
235 const real_t gradX = BGu[tidz][qy][qx];
236 const real_t gradY = GBu[tidz][qy][qx];
237
238 DGu[tidz][qy][qx] = (O1 * gradX) + (O2 * gradY);
239 }
240 }
241 MFEM_SYNC_THREAD;
242 MFEM_SHARED real_t BDGu[NBZ][max_D1D][max_Q1D];
243 MFEM_FOREACH_THREAD(qx,x,Q1D)
244 {
245 MFEM_FOREACH_THREAD(dy,y,D1D)
246 {
247 BDGu[tidz][dy][qx] = 0.0;
248 for (int qy = 0; qy < Q1D; ++qy)
249 {
250 const real_t w = Bt(dy,qy);
251 BDGu[tidz][dy][qx] += w * DGu[tidz][qy][qx];
252 }
253 }
254 }
255 MFEM_SYNC_THREAD;
256 MFEM_FOREACH_THREAD(dx,x,D1D)
257 {
258 MFEM_FOREACH_THREAD(dy,y,D1D)
259 {
260 real_t BBDGu = 0.0;
261 for (int qx = 0; qx < Q1D; ++qx)
262 {
263 const real_t w = Bt(dx,qx);
264 BBDGu += w * BDGu[tidz][dy][qx];
265 }
266 y(dx,dy,e) += BBDGu;
267 }
268 }
269 });
270}
271
272// PA Convection Apply 3D kernel
273template<int T_D1D = 0, int T_Q1D = 0> static
274void PAConvectionApply3D(const int ne,
275 const Array<real_t> &b,
276 const Array<real_t> &g,
277 const Array<real_t> &bt,
278 const Array<real_t> &gt,
279 const Vector &op_,
280 const Vector &x_,
281 Vector &y_,
282 const int d1d = 0,
283 const int q1d = 0)
284{
285 const int NE = ne;
286 const int D1D = T_D1D ? T_D1D : d1d;
287 const int Q1D = T_Q1D ? T_Q1D : q1d;
288 MFEM_VERIFY(D1D <= DeviceDofQuadLimits::Get().MAX_D1D, "");
289 MFEM_VERIFY(Q1D <= DeviceDofQuadLimits::Get().MAX_Q1D, "");
290 auto B = Reshape(b.Read(), Q1D, D1D);
291 auto G = Reshape(g.Read(), Q1D, D1D);
292 auto Bt = Reshape(bt.Read(), D1D, Q1D);
293 auto op = Reshape(op_.Read(), Q1D, Q1D, Q1D, 3, NE);
294 auto x = Reshape(x_.Read(), D1D, D1D, D1D, NE);
295 auto y = Reshape(y_.ReadWrite(), D1D, D1D, D1D, NE);
296 mfem::forall(NE, [=] MFEM_HOST_DEVICE (int e)
297 {
298 const int D1D = T_D1D ? T_D1D : d1d;
299 const int Q1D = T_Q1D ? T_Q1D : q1d;
300 // the following variables are evaluated at compile time
301 constexpr int max_D1D = T_D1D ? T_D1D : DofQuadLimits::MAX_D1D;
302 constexpr int max_Q1D = T_Q1D ? T_Q1D : DofQuadLimits::MAX_Q1D;
303
304 real_t u[max_D1D][max_D1D][max_D1D];
305 for (int dz = 0; dz < D1D; ++dz)
306 {
307 for (int dy = 0; dy < D1D; ++dy)
308 {
309 for (int dx = 0; dx < D1D; ++dx)
310 {
311 u[dz][dy][dx] = x(dx,dy,dz,e);
312 }
313 }
314 }
315 real_t Bu[max_D1D][max_D1D][max_Q1D];
316 real_t Gu[max_D1D][max_D1D][max_Q1D];
317 for (int dz = 0; dz < D1D; ++dz)
318 {
319 for (int dy = 0; dy < D1D; ++dy)
320 {
321 for (int qx = 0; qx < Q1D; ++qx)
322 {
323 Bu[dz][dy][qx] = 0.0;
324 Gu[dz][dy][qx] = 0.0;
325 for (int dx = 0; dx < D1D; ++dx)
326 {
327 const real_t bx = B(qx,dx);
328 const real_t gx = G(qx,dx);
329 const real_t x = u[dz][dy][dx];
330 Bu[dz][dy][qx] += bx * x;
331 Gu[dz][dy][qx] += gx * x;
332 }
333 }
334 }
335 }
336 real_t BBu[max_D1D][max_Q1D][max_Q1D];
337 real_t GBu[max_D1D][max_Q1D][max_Q1D];
338 real_t BGu[max_D1D][max_Q1D][max_Q1D];
339 for (int dz = 0; dz < D1D; ++dz)
340 {
341 for (int qx = 0; qx < Q1D; ++qx)
342 {
343 for (int qy = 0; qy < Q1D; ++qy)
344 {
345 BBu[dz][qy][qx] = 0.0;
346 GBu[dz][qy][qx] = 0.0;
347 BGu[dz][qy][qx] = 0.0;
348 for (int dy = 0; dy < D1D; ++dy)
349 {
350 const real_t bx = B(qy,dy);
351 const real_t gx = G(qy,dy);
352 BBu[dz][qy][qx] += bx * Bu[dz][dy][qx];
353 GBu[dz][qy][qx] += gx * Bu[dz][dy][qx];
354 BGu[dz][qy][qx] += bx * Gu[dz][dy][qx];
355 }
356 }
357 }
358 }
359 real_t GBBu[max_Q1D][max_Q1D][max_Q1D];
360 real_t BGBu[max_Q1D][max_Q1D][max_Q1D];
361 real_t BBGu[max_Q1D][max_Q1D][max_Q1D];
362 for (int qx = 0; qx < Q1D; ++qx)
363 {
364 for (int qy = 0; qy < Q1D; ++qy)
365 {
366 for (int qz = 0; qz < Q1D; ++qz)
367 {
368 GBBu[qz][qy][qx] = 0.0;
369 BGBu[qz][qy][qx] = 0.0;
370 BBGu[qz][qy][qx] = 0.0;
371 for (int dz = 0; dz < D1D; ++dz)
372 {
373 const real_t bx = B(qz,dz);
374 const real_t gx = G(qz,dz);
375 GBBu[qz][qy][qx] += gx * BBu[dz][qy][qx];
376 BGBu[qz][qy][qx] += bx * GBu[dz][qy][qx];
377 BBGu[qz][qy][qx] += bx * BGu[dz][qy][qx];
378 }
379 }
380 }
381 }
382 // Calculate Dxy, xDy in plane
383 real_t DGu[max_Q1D][max_Q1D][max_Q1D];
384 for (int qz = 0; qz < Q1D; ++qz)
385 {
386 for (int qy = 0; qy < Q1D; ++qy)
387 {
388 for (int qx = 0; qx < Q1D; ++qx)
389 {
390 const real_t O1 = op(qx,qy,qz,0,e);
391 const real_t O2 = op(qx,qy,qz,1,e);
392 const real_t O3 = op(qx,qy,qz,2,e);
393
394 const real_t gradX = BBGu[qz][qy][qx];
395 const real_t gradY = BGBu[qz][qy][qx];
396 const real_t gradZ = GBBu[qz][qy][qx];
397
398 DGu[qz][qy][qx] = (O1 * gradX) + (O2 * gradY) + (O3 * gradZ);
399 }
400 }
401 }
402 real_t BDGu[max_D1D][max_Q1D][max_Q1D];
403 for (int qx = 0; qx < Q1D; ++qx)
404 {
405 for (int qy = 0; qy < Q1D; ++qy)
406 {
407 for (int dz = 0; dz < D1D; ++dz)
408 {
409 BDGu[dz][qy][qx] = 0.0;
410 for (int qz = 0; qz < Q1D; ++qz)
411 {
412 const real_t w = Bt(dz,qz);
413 BDGu[dz][qy][qx] += w * DGu[qz][qy][qx];
414 }
415 }
416 }
417 }
418 real_t BBDGu[max_D1D][max_D1D][max_Q1D];
419 for (int dz = 0; dz < D1D; ++dz)
420 {
421 for (int qx = 0; qx < Q1D; ++qx)
422 {
423 for (int dy = 0; dy < D1D; ++dy)
424 {
425 BBDGu[dz][dy][qx] = 0.0;
426 for (int qy = 0; qy < Q1D; ++qy)
427 {
428 const real_t w = Bt(dy,qy);
429 BBDGu[dz][dy][qx] += w * BDGu[dz][qy][qx];
430 }
431 }
432 }
433 }
434 for (int dz = 0; dz < D1D; ++dz)
435 {
436 for (int dy = 0; dy < D1D; ++dy)
437 {
438 for (int dx = 0; dx < D1D; ++dx)
439 {
440 real_t BBBDGu = 0.0;
441 for (int qx = 0; qx < Q1D; ++qx)
442 {
443 const real_t w = Bt(dx,qx);
444 BBBDGu += w * BBDGu[dz][dy][qx];
445 }
446 y(dx,dy,dz,e) += BBBDGu;
447 }
448 }
449 }
450 });
451}
452
453// Optimized PA Convection Apply 3D kernel
454template<int T_D1D = 0, int T_Q1D = 0> static
455void SmemPAConvectionApply3D(const int ne,
456 const Array<real_t> &b,
457 const Array<real_t> &g,
458 const Array<real_t> &bt,
459 const Array<real_t> &gt,
460 const Vector &op_,
461 const Vector &x_,
462 Vector &y_,
463 const int d1d = 0,
464 const int q1d = 0)
465{
466 const int NE = ne;
467 const int D1D = T_D1D ? T_D1D : d1d;
468 const int Q1D = T_Q1D ? T_Q1D : q1d;
469 MFEM_VERIFY(D1D <= DeviceDofQuadLimits::Get().MAX_D1D, "");
470 MFEM_VERIFY(Q1D <= DeviceDofQuadLimits::Get().MAX_Q1D, "");
471 auto B = Reshape(b.Read(), Q1D, D1D);
472 auto G = Reshape(g.Read(), Q1D, D1D);
473 auto Bt = Reshape(bt.Read(), D1D, Q1D);
474 auto op = Reshape(op_.Read(), Q1D, Q1D, Q1D, 3, NE);
475 auto x = Reshape(x_.Read(), D1D, D1D, D1D, NE);
476 auto y = Reshape(y_.ReadWrite(), D1D, D1D, D1D, NE);
477 mfem::forall_3D(NE, Q1D, Q1D, Q1D, [=] MFEM_HOST_DEVICE (int e)
478 {
479 const int D1D = T_D1D ? T_D1D : d1d;
480 const int Q1D = T_Q1D ? T_Q1D : q1d;
481 // the following variables are evaluated at compile time
482 constexpr int max_D1D = T_D1D ? T_D1D : DofQuadLimits::MAX_D1D;
483 constexpr int max_Q1D = T_Q1D ? T_Q1D : DofQuadLimits::MAX_Q1D;
484 constexpr int max_DQ = (max_Q1D > max_D1D) ? max_Q1D : max_D1D;
485 MFEM_SHARED real_t sm0[max_DQ*max_DQ*max_DQ];
486 MFEM_SHARED real_t sm1[max_DQ*max_DQ*max_DQ];
487 MFEM_SHARED real_t sm2[max_DQ*max_DQ*max_DQ];
488 MFEM_SHARED real_t sm3[max_DQ*max_DQ*max_DQ];
489 MFEM_SHARED real_t sm4[max_DQ*max_DQ*max_DQ];
490 MFEM_SHARED real_t sm5[max_DQ*max_DQ*max_DQ];
491
492 real_t (*u)[max_D1D][max_D1D] = (real_t (*)[max_D1D][max_D1D]) sm0;
493 MFEM_FOREACH_THREAD(dz,z,D1D)
494 {
495 MFEM_FOREACH_THREAD(dy,y,D1D)
496 {
497 MFEM_FOREACH_THREAD(dx,x,D1D)
498 {
499 u[dz][dy][dx] = x(dx,dy,dz,e);
500 }
501 }
502 }
503 MFEM_SYNC_THREAD;
504 real_t (*Bu)[max_D1D][max_Q1D] = (real_t (*)[max_D1D][max_Q1D])sm1;
505 real_t (*Gu)[max_D1D][max_Q1D] = (real_t (*)[max_D1D][max_Q1D])sm2;
506 MFEM_FOREACH_THREAD(dz,z,D1D)
507 {
508 MFEM_FOREACH_THREAD(dy,y,D1D)
509 {
510 MFEM_FOREACH_THREAD(qx,x,Q1D)
511 {
512 real_t Bu_ = 0.0;
513 real_t Gu_ = 0.0;
514 for (int dx = 0; dx < D1D; ++dx)
515 {
516 const real_t bx = B(qx,dx);
517 const real_t gx = G(qx,dx);
518 const real_t x = u[dz][dy][dx];
519 Bu_ += bx * x;
520 Gu_ += gx * x;
521 }
522 Bu[dz][dy][qx] = Bu_;
523 Gu[dz][dy][qx] = Gu_;
524 }
525 }
526 }
527 MFEM_SYNC_THREAD;
528 real_t (*BBu)[max_Q1D][max_Q1D] = (real_t (*)[max_Q1D][max_Q1D])sm3;
529 real_t (*GBu)[max_Q1D][max_Q1D] = (real_t (*)[max_Q1D][max_Q1D])sm4;
530 real_t (*BGu)[max_Q1D][max_Q1D] = (real_t (*)[max_Q1D][max_Q1D])sm5;
531 MFEM_FOREACH_THREAD(dz,z,D1D)
532 {
533 MFEM_FOREACH_THREAD(qx,x,Q1D)
534 {
535 MFEM_FOREACH_THREAD(qy,y,Q1D)
536 {
537 real_t BBu_ = 0.0;
538 real_t GBu_ = 0.0;
539 real_t BGu_ = 0.0;
540 for (int dy = 0; dy < D1D; ++dy)
541 {
542 const real_t bx = B(qy,dy);
543 const real_t gx = G(qy,dy);
544 BBu_ += bx * Bu[dz][dy][qx];
545 GBu_ += gx * Bu[dz][dy][qx];
546 BGu_ += bx * Gu[dz][dy][qx];
547 }
548 BBu[dz][qy][qx] = BBu_;
549 GBu[dz][qy][qx] = GBu_;
550 BGu[dz][qy][qx] = BGu_;
551 }
552 }
553 }
554 MFEM_SYNC_THREAD;
555 real_t (*GBBu)[max_Q1D][max_Q1D] = (real_t (*)[max_Q1D][max_Q1D])sm0;
556 real_t (*BGBu)[max_Q1D][max_Q1D] = (real_t (*)[max_Q1D][max_Q1D])sm1;
557 real_t (*BBGu)[max_Q1D][max_Q1D] = (real_t (*)[max_Q1D][max_Q1D])sm2;
558 MFEM_FOREACH_THREAD(qx,x,Q1D)
559 {
560 MFEM_FOREACH_THREAD(qy,y,Q1D)
561 {
562 MFEM_FOREACH_THREAD(qz,z,Q1D)
563 {
564 real_t GBBu_ = 0.0;
565 real_t BGBu_ = 0.0;
566 real_t BBGu_ = 0.0;
567 for (int dz = 0; dz < D1D; ++dz)
568 {
569 const real_t bx = B(qz,dz);
570 const real_t gx = G(qz,dz);
571 GBBu_ += gx * BBu[dz][qy][qx];
572 BGBu_ += bx * GBu[dz][qy][qx];
573 BBGu_ += bx * BGu[dz][qy][qx];
574 }
575 GBBu[qz][qy][qx] = GBBu_;
576 BGBu[qz][qy][qx] = BGBu_;
577 BBGu[qz][qy][qx] = BBGu_;
578 }
579 }
580 }
581 MFEM_SYNC_THREAD;
582 real_t (*DGu)[max_Q1D][max_Q1D] = (real_t (*)[max_Q1D][max_Q1D])sm3;
583 MFEM_FOREACH_THREAD(qz,z,Q1D)
584 {
585 MFEM_FOREACH_THREAD(qy,y,Q1D)
586 {
587 MFEM_FOREACH_THREAD(qx,x,Q1D)
588 {
589 const real_t O1 = op(qx,qy,qz,0,e);
590 const real_t O2 = op(qx,qy,qz,1,e);
591 const real_t O3 = op(qx,qy,qz,2,e);
592
593 const real_t gradX = BBGu[qz][qy][qx];
594 const real_t gradY = BGBu[qz][qy][qx];
595 const real_t gradZ = GBBu[qz][qy][qx];
596
597 DGu[qz][qy][qx] = (O1 * gradX) + (O2 * gradY) + (O3 * gradZ);
598 }
599 }
600 }
601 MFEM_SYNC_THREAD;
602 real_t (*BDGu)[max_Q1D][max_Q1D] = (real_t (*)[max_Q1D][max_Q1D])sm4;
603 MFEM_FOREACH_THREAD(qx,x,Q1D)
604 {
605 MFEM_FOREACH_THREAD(qy,y,Q1D)
606 {
607 MFEM_FOREACH_THREAD(dz,z,D1D)
608 {
609 real_t BDGu_ = 0.0;
610 for (int qz = 0; qz < Q1D; ++qz)
611 {
612 const real_t w = Bt(dz,qz);
613 BDGu_ += w * DGu[qz][qy][qx];
614 }
615 BDGu[dz][qy][qx] = BDGu_;
616 }
617 }
618 }
619 MFEM_SYNC_THREAD;
620 real_t (*BBDGu)[max_D1D][max_Q1D] = (real_t (*)[max_D1D][max_Q1D])sm5;
621 MFEM_FOREACH_THREAD(dz,z,D1D)
622 {
623 MFEM_FOREACH_THREAD(qx,x,Q1D)
624 {
625 MFEM_FOREACH_THREAD(dy,y,D1D)
626 {
627 real_t BBDGu_ = 0.0;
628 for (int qy = 0; qy < Q1D; ++qy)
629 {
630 const real_t w = Bt(dy,qy);
631 BBDGu_ += w * BDGu[dz][qy][qx];
632 }
633 BBDGu[dz][dy][qx] = BBDGu_;
634 }
635 }
636 }
637 MFEM_SYNC_THREAD;
638 MFEM_FOREACH_THREAD(dz,z,D1D)
639 {
640 MFEM_FOREACH_THREAD(dy,y,D1D)
641 {
642 MFEM_FOREACH_THREAD(dx,x,D1D)
643 {
644 real_t BBBDGu = 0.0;
645 for (int qx = 0; qx < Q1D; ++qx)
646 {
647 const real_t w = Bt(dx,qx);
648 BBBDGu += w * BBDGu[dz][dy][qx];
649 }
650 y(dx,dy,dz,e) += BBBDGu;
651 }
652 }
653 }
654 });
655}
656
657// PA Convection Apply 2D kernel
658template<int T_D1D = 0, int T_Q1D = 0> static
659void PAConvectionApplyT2D(const int ne,
660 const Array<real_t> &b,
661 const Array<real_t> &g,
662 const Array<real_t> &bt,
663 const Array<real_t> &gt,
664 const Vector &op_,
665 const Vector &x_,
666 Vector &y_,
667 const int d1d = 0,
668 const int q1d = 0)
669{
670 const int NE = ne;
671 const int D1D = T_D1D ? T_D1D : d1d;
672 const int Q1D = T_Q1D ? T_Q1D : q1d;
673 MFEM_VERIFY(D1D <= DeviceDofQuadLimits::Get().MAX_D1D, "");
674 MFEM_VERIFY(Q1D <= DeviceDofQuadLimits::Get().MAX_Q1D, "");
675 auto B = Reshape(b.Read(), Q1D, D1D);
676 auto Bt = Reshape(bt.Read(), D1D, Q1D);
677 auto Gt = Reshape(gt.Read(), D1D, Q1D);
678 auto op = Reshape(op_.Read(), Q1D, Q1D, 2, NE);
679 auto x = Reshape(x_.Read(), D1D, D1D, NE);
680 auto y = Reshape(y_.ReadWrite(), D1D, D1D, NE);
681 mfem::forall(NE, [=] MFEM_HOST_DEVICE (int e)
682 {
683 const int D1D = T_D1D ? T_D1D : d1d;
684 const int Q1D = T_Q1D ? T_Q1D : q1d;
685 // the following variables are evaluated at compile time
686 constexpr int max_D1D = T_D1D ? T_D1D : DofQuadLimits::MAX_D1D;
687 constexpr int max_Q1D = T_Q1D ? T_Q1D : DofQuadLimits::MAX_Q1D;
688
689 real_t u[max_D1D][max_D1D];
690 for (int dy = 0; dy < D1D; ++dy)
691 {
692 for (int dx = 0; dx < D1D; ++dx)
693 {
694 u[dy][dx] = x(dx,dy,e);
695 }
696 }
697 real_t Bu[max_D1D][max_Q1D];
698 for (int dy = 0; dy < D1D; ++dy)
699 {
700 for (int qx = 0; qx < Q1D; ++qx)
701 {
702 Bu[dy][qx] = 0.0;
703 for (int dx = 0; dx < D1D; ++dx)
704 {
705 const real_t bx = B(qx,dx);
706 const real_t x = u[dy][dx];
707 Bu[dy][qx] += bx * x;
708 }
709 }
710 }
711 real_t BBu[max_Q1D][max_Q1D];
712 for (int qx = 0; qx < Q1D; ++qx)
713 {
714 for (int qy = 0; qy < Q1D; ++qy)
715 {
716 BBu[qy][qx] = 0.0;
717 for (int dy = 0; dy < D1D; ++dy)
718 {
719 const real_t bx = B(qy,dy);
720 BBu[qy][qx] += bx * Bu[dy][qx];
721 }
722 }
723 }
724 // Calculate Dxy, xDy in plane
725 real_t DBu[max_Q1D][max_Q1D][2];
726 for (int qy = 0; qy < Q1D; ++qy)
727 {
728 for (int qx = 0; qx < Q1D; ++qx)
729 {
730 const real_t O1 = op(qx,qy,0,e);
731 const real_t O2 = op(qx,qy,1,e);
732
733 const real_t X = BBu[qy][qx];
734
735 DBu[qy][qx][0] = O1 * X;
736 DBu[qy][qx][1] = O2 * X;
737 }
738 }
739 real_t GDBu[max_D1D][max_Q1D][2];
740 for (int qx = 0; qx < Q1D; ++qx)
741 {
742 for (int dy = 0; dy < D1D; ++dy)
743 {
744 GDBu[dy][qx][0] = 0.0;
745 GDBu[dy][qx][1] = 0.0;
746 for (int qy = 0; qy < Q1D; ++qy)
747 {
748 const real_t by = Bt(dy,qy);
749 const real_t gy = Gt(dy,qy);
750 GDBu[dy][qx][0] += by * DBu[qy][qx][0];
751 GDBu[dy][qx][1] += gy * DBu[qy][qx][1];
752 }
753 }
754 }
755 for (int dx = 0; dx < D1D; ++dx)
756 {
757 for (int dy = 0; dy < D1D; ++dy)
758 {
759 real_t res = 0.0;
760 for (int qx = 0; qx < Q1D; ++qx)
761 {
762 const real_t bx = Bt(dx,qx);
763 const real_t gx = Gt(dx,qx);
764 res += gx * GDBu[dy][qx][0] + bx * GDBu[dy][qx][1];
765 }
766 y(dx,dy,e) += res;
767 }
768 }
769 });
770}
771
772// Optimized PA Convection Apply 2D kernel
773template<int T_D1D = 0, int T_Q1D = 0, int T_NBZ = 0> static
774void SmemPAConvectionApplyT2D(const int ne,
775 const Array<real_t> &b,
776 const Array<real_t> &g,
777 const Array<real_t> &bt,
778 const Array<real_t> &gt,
779 const Vector &op_,
780 const Vector &x_,
781 Vector &y_,
782 const int d1d = 0,
783 const int q1d = 0)
784{
785 const int NE = ne;
786 const int D1D = T_D1D ? T_D1D : d1d;
787 const int Q1D = T_Q1D ? T_Q1D : q1d;
788 constexpr int NBZ = T_NBZ ? T_NBZ : 1;
789 MFEM_VERIFY(D1D <= DeviceDofQuadLimits::Get().MAX_D1D, "");
790 MFEM_VERIFY(Q1D <= DeviceDofQuadLimits::Get().MAX_Q1D, "");
791 auto B = Reshape(b.Read(), Q1D, D1D);
792 auto Bt = Reshape(bt.Read(), D1D, Q1D);
793 auto Gt = Reshape(gt.Read(), D1D, Q1D);
794 auto op = Reshape(op_.Read(), Q1D, Q1D, 2, NE);
795 auto x = Reshape(x_.Read(), D1D, D1D, NE);
796 auto y = Reshape(y_.ReadWrite(), D1D, D1D, NE);
797 mfem::forall_2D_batch(NE, Q1D, Q1D, NBZ, [=] MFEM_HOST_DEVICE (int e)
798 {
799 const int tidz = MFEM_THREAD_ID(z);
800 const int D1D = T_D1D ? T_D1D : d1d;
801 const int Q1D = T_Q1D ? T_Q1D : q1d;
802 // the following variables are evaluated at compile time
803 constexpr int NBZ = T_NBZ ? T_NBZ : 1;
804 constexpr int max_D1D = T_D1D ? T_D1D : DofQuadLimits::MAX_D1D;
805 constexpr int max_Q1D = T_Q1D ? T_Q1D : DofQuadLimits::MAX_Q1D;
806 MFEM_SHARED real_t u[NBZ][max_D1D][max_D1D];
807 MFEM_FOREACH_THREAD(dy,y,D1D)
808 {
809 MFEM_FOREACH_THREAD(dx,x,D1D)
810 {
811 // e is really equal to e+tidz
812 u[tidz][dy][dx] = x(dx,dy,e);
813 }
814 }
815 MFEM_SYNC_THREAD;
816 MFEM_SHARED real_t Bu[NBZ][max_D1D][max_Q1D];
817 MFEM_FOREACH_THREAD(dy,y,D1D)
818 {
819 MFEM_FOREACH_THREAD(qx,x,Q1D)
820 {
821 Bu[tidz][dy][qx] = 0.0;
822 for (int dx = 0; dx < D1D; ++dx)
823 {
824 const real_t bx = B(qx,dx);
825 const real_t x = u[tidz][dy][dx];
826 Bu[tidz][dy][qx] += bx * x;
827 }
828 }
829 }
830 MFEM_SYNC_THREAD;
831 MFEM_SHARED real_t BBu[NBZ][max_Q1D][max_Q1D];
832 MFEM_FOREACH_THREAD(qx,x,Q1D)
833 {
834 MFEM_FOREACH_THREAD(qy,y,Q1D)
835 {
836 BBu[tidz][qy][qx] = 0.0;
837 for (int dy = 0; dy < D1D; ++dy)
838 {
839 const real_t bx = B(qy,dy);
840 BBu[tidz][qy][qx] += bx * Bu[tidz][dy][qx];
841 }
842 }
843 }
844 MFEM_SYNC_THREAD;
845 // Calculate Dxy, xDy in plane
846 MFEM_SHARED real_t DBu[NBZ][max_Q1D][max_Q1D][2];
847 MFEM_FOREACH_THREAD(qy,y,Q1D)
848 {
849 MFEM_FOREACH_THREAD(qx,x,Q1D)
850 {
851 const real_t O1 = op(qx,qy,0,e);
852 const real_t O2 = op(qx,qy,1,e);
853
854 const real_t X = BBu[tidz][qy][qx];
855
856 DBu[tidz][qy][qx][0] = O1 * X;
857 DBu[tidz][qy][qx][1] = O2 * X;
858 }
859 }
860 MFEM_SYNC_THREAD;
861 MFEM_SHARED real_t GDBu[NBZ][max_D1D][max_Q1D][2];
862 MFEM_FOREACH_THREAD(qx,x,Q1D)
863 {
864 MFEM_FOREACH_THREAD(dy,y,D1D)
865 {
866 GDBu[tidz][dy][qx][0] = 0.0;
867 GDBu[tidz][dy][qx][1] = 0.0;
868 for (int qy = 0; qy < Q1D; ++qy)
869 {
870 const real_t by = Bt(dy,qy);
871 const real_t gy = Gt(dy,qy);
872 GDBu[tidz][dy][qx][0] += by * DBu[tidz][qy][qx][0];
873 GDBu[tidz][dy][qx][1] += gy * DBu[tidz][qy][qx][1];
874 }
875 }
876 }
877 MFEM_SYNC_THREAD;
878 MFEM_FOREACH_THREAD(dx,x,D1D)
879 {
880 MFEM_FOREACH_THREAD(dy,y,D1D)
881 {
882 real_t res = 0.0;
883 for (int qx = 0; qx < Q1D; ++qx)
884 {
885 const real_t bx = Bt(dx,qx);
886 const real_t gx = Gt(dx,qx);
887 res += gx * GDBu[tidz][dy][qx][0] + bx * GDBu[tidz][dy][qx][1];
888 }
889 y(dx,dy,e) += res;
890 }
891 }
892 });
893}
894
895// PA Convection Apply 3D kernel
896template<int T_D1D = 0, int T_Q1D = 0> static
897void PAConvectionApplyT3D(const int ne,
898 const Array<real_t> &b,
899 const Array<real_t> &g,
900 const Array<real_t> &bt,
901 const Array<real_t> &gt,
902 const Vector &op_,
903 const Vector &x_,
904 Vector &y_,
905 const int d1d = 0,
906 const int q1d = 0)
907{
908 const int NE = ne;
909 const int D1D = T_D1D ? T_D1D : d1d;
910 const int Q1D = T_Q1D ? T_Q1D : q1d;
911 MFEM_VERIFY(D1D <= DeviceDofQuadLimits::Get().MAX_D1D, "");
912 MFEM_VERIFY(Q1D <= DeviceDofQuadLimits::Get().MAX_Q1D, "");
913 auto B = Reshape(b.Read(), Q1D, D1D);
914 auto Bt = Reshape(bt.Read(), D1D, Q1D);
915 auto Gt = Reshape(gt.Read(), D1D, Q1D);
916 auto op = Reshape(op_.Read(), Q1D, Q1D, Q1D, 3, NE);
917 auto x = Reshape(x_.Read(), D1D, D1D, D1D, NE);
918 auto y = Reshape(y_.ReadWrite(), D1D, D1D, D1D, NE);
919 mfem::forall(NE, [=] MFEM_HOST_DEVICE (int e)
920 {
921 const int D1D = T_D1D ? T_D1D : d1d;
922 const int Q1D = T_Q1D ? T_Q1D : q1d;
923 // the following variables are evaluated at compile time
924 constexpr int max_D1D = T_D1D ? T_D1D : DofQuadLimits::MAX_D1D;
925 constexpr int max_Q1D = T_Q1D ? T_Q1D : DofQuadLimits::MAX_Q1D;
926
927 real_t u[max_D1D][max_D1D][max_D1D];
928 for (int dz = 0; dz < D1D; ++dz)
929 {
930 for (int dy = 0; dy < D1D; ++dy)
931 {
932 for (int dx = 0; dx < D1D; ++dx)
933 {
934 u[dz][dy][dx] = x(dx,dy,dz,e);
935 }
936 }
937 }
938 real_t Bu[max_D1D][max_D1D][max_Q1D];
939 for (int dz = 0; dz < D1D; ++dz)
940 {
941 for (int dy = 0; dy < D1D; ++dy)
942 {
943 for (int qx = 0; qx < Q1D; ++qx)
944 {
945 Bu[dz][dy][qx] = 0.0;
946 for (int dx = 0; dx < D1D; ++dx)
947 {
948 const real_t bx = B(qx,dx);
949 const real_t x = u[dz][dy][dx];
950 Bu[dz][dy][qx] += bx * x;
951 }
952 }
953 }
954 }
955 real_t BBu[max_D1D][max_Q1D][max_Q1D];
956 for (int dz = 0; dz < D1D; ++dz)
957 {
958 for (int qx = 0; qx < Q1D; ++qx)
959 {
960 for (int qy = 0; qy < Q1D; ++qy)
961 {
962 BBu[dz][qy][qx] = 0.0;
963 for (int dy = 0; dy < D1D; ++dy)
964 {
965 const real_t bx = B(qy,dy);
966 BBu[dz][qy][qx] += bx * Bu[dz][dy][qx];
967 }
968 }
969 }
970 }
971 real_t BBBu[max_Q1D][max_Q1D][max_Q1D];
972 for (int qx = 0; qx < Q1D; ++qx)
973 {
974 for (int qy = 0; qy < Q1D; ++qy)
975 {
976 for (int qz = 0; qz < Q1D; ++qz)
977 {
978 BBBu[qz][qy][qx] = 0.0;
979 for (int dz = 0; dz < D1D; ++dz)
980 {
981 const real_t bx = B(qz,dz);
982 BBBu[qz][qy][qx] += bx * BBu[dz][qy][qx];
983 }
984 }
985 }
986 }
987 // Calculate Dxy, xDy in plane
988 real_t DBu[max_Q1D][max_Q1D][max_Q1D][3];
989 for (int qz = 0; qz < Q1D; ++qz)
990 {
991 for (int qy = 0; qy < Q1D; ++qy)
992 {
993 for (int qx = 0; qx < Q1D; ++qx)
994 {
995 const real_t O1 = op(qx,qy,qz,0,e);
996 const real_t O2 = op(qx,qy,qz,1,e);
997 const real_t O3 = op(qx,qy,qz,2,e);
998
999 const real_t X = BBBu[qz][qy][qx];
1000
1001 DBu[qz][qy][qx][0] = O1 * X;
1002 DBu[qz][qy][qx][1] = O2 * X;
1003 DBu[qz][qy][qx][2] = O3 * X;
1004 }
1005 }
1006 }
1007 real_t GDBu[max_D1D][max_Q1D][max_Q1D][3];
1008 for (int qx = 0; qx < Q1D; ++qx)
1009 {
1010 for (int qy = 0; qy < Q1D; ++qy)
1011 {
1012 for (int dz = 0; dz < D1D; ++dz)
1013 {
1014 GDBu[dz][qy][qx][0] = 0.0;
1015 GDBu[dz][qy][qx][1] = 0.0;
1016 GDBu[dz][qy][qx][2] = 0.0;
1017 for (int qz = 0; qz < Q1D; ++qz)
1018 {
1019 const real_t bz = Bt(dz,qz);
1020 const real_t gz = Gt(dz,qz);
1021 GDBu[dz][qy][qx][0] += bz * DBu[qz][qy][qx][0];
1022 GDBu[dz][qy][qx][1] += bz * DBu[qz][qy][qx][1];
1023 GDBu[dz][qy][qx][2] += gz * DBu[qz][qy][qx][2];
1024 }
1025 }
1026 }
1027 }
1028 real_t GGDBu[max_D1D][max_D1D][max_Q1D][3];
1029 for (int dz = 0; dz < D1D; ++dz)
1030 {
1031 for (int qx = 0; qx < Q1D; ++qx)
1032 {
1033 for (int dy = 0; dy < D1D; ++dy)
1034 {
1035 GGDBu[dz][dy][qx][0] = 0.0;
1036 GGDBu[dz][dy][qx][1] = 0.0;
1037 GGDBu[dz][dy][qx][2] = 0.0;
1038 for (int qy = 0; qy < Q1D; ++qy)
1039 {
1040 const real_t by = Bt(dy,qy);
1041 const real_t gy = Gt(dy,qy);
1042 GGDBu[dz][dy][qx][0] += by * GDBu[dz][qy][qx][0];
1043 GGDBu[dz][dy][qx][1] += gy * GDBu[dz][qy][qx][1];
1044 GGDBu[dz][dy][qx][2] += by * GDBu[dz][qy][qx][2];
1045 }
1046 }
1047 }
1048 }
1049 for (int dz = 0; dz < D1D; ++dz)
1050 {
1051 for (int dy = 0; dy < D1D; ++dy)
1052 {
1053 for (int dx = 0; dx < D1D; ++dx)
1054 {
1055 real_t res = 0.0;
1056 for (int qx = 0; qx < Q1D; ++qx)
1057 {
1058 const real_t bx = Bt(dx,qx);
1059 const real_t gx = Gt(dx,qx);
1060 res += gx * GGDBu[dz][dy][qx][0];
1061 res += bx * GGDBu[dz][dy][qx][1];
1062 res += bx * GGDBu[dz][dy][qx][2];
1063 }
1064 y(dx,dy,dz,e) += res;
1065 }
1066 }
1067 }
1068 });
1069}
1070
1071// Optimized PA Convection Apply 3D kernel
1072template<int T_D1D = 0, int T_Q1D = 0> static
1073void SmemPAConvectionApplyT3D(const int ne,
1074 const Array<real_t> &b,
1075 const Array<real_t> &g,
1076 const Array<real_t> &bt,
1077 const Array<real_t> &gt,
1078 const Vector &op_,
1079 const Vector &x_,
1080 Vector &y_,
1081 const int d1d = 0,
1082 const int q1d = 0)
1083{
1084 const int NE = ne;
1085 const int D1D = T_D1D ? T_D1D : d1d;
1086 const int Q1D = T_Q1D ? T_Q1D : q1d;
1087 MFEM_VERIFY(D1D <= DeviceDofQuadLimits::Get().MAX_D1D, "");
1088 MFEM_VERIFY(Q1D <= DeviceDofQuadLimits::Get().MAX_Q1D, "");
1089 auto B = Reshape(b.Read(), Q1D, D1D);
1090 auto Bt = Reshape(bt.Read(), D1D, Q1D);
1091 auto Gt = Reshape(gt.Read(), D1D, Q1D);
1092 auto op = Reshape(op_.Read(), Q1D, Q1D, Q1D, 3, NE);
1093 auto x = Reshape(x_.Read(), D1D, D1D, D1D, NE);
1094 auto y = Reshape(y_.ReadWrite(), D1D, D1D, D1D, NE);
1095 mfem::forall_3D(NE, Q1D, Q1D, Q1D, [=] MFEM_HOST_DEVICE (int e)
1096 {
1097 const int D1D = T_D1D ? T_D1D : d1d;
1098 const int Q1D = T_Q1D ? T_Q1D : q1d;
1099 // the following variables are evaluated at compile time
1100 constexpr int max_D1D = T_D1D ? T_D1D : DofQuadLimits::MAX_D1D;
1101 constexpr int max_Q1D = T_Q1D ? T_Q1D : DofQuadLimits::MAX_Q1D;
1102 constexpr int max_DQ = (max_Q1D > max_D1D) ? max_Q1D : max_D1D;
1103 MFEM_SHARED real_t sm0[3*max_DQ*max_DQ*max_DQ];
1104 MFEM_SHARED real_t sm1[3*max_DQ*max_DQ*max_DQ];
1105
1106 real_t (*u)[max_D1D][max_D1D] = (real_t (*)[max_D1D][max_D1D]) sm0;
1107 MFEM_FOREACH_THREAD(dz,z,D1D)
1108 {
1109 MFEM_FOREACH_THREAD(dy,y,D1D)
1110 {
1111 MFEM_FOREACH_THREAD(dx,x,D1D)
1112 {
1113 u[dz][dy][dx] = x(dx,dy,dz,e);
1114 }
1115 }
1116 }
1117 MFEM_SYNC_THREAD;
1118 real_t (*Bu)[max_D1D][max_Q1D] = (real_t (*)[max_D1D][max_Q1D])sm1;
1119 MFEM_FOREACH_THREAD(dz,z,D1D)
1120 {
1121 MFEM_FOREACH_THREAD(dy,y,D1D)
1122 {
1123 MFEM_FOREACH_THREAD(qx,x,Q1D)
1124 {
1125 real_t Bu_ = 0.0;
1126 for (int dx = 0; dx < D1D; ++dx)
1127 {
1128 const real_t bx = B(qx,dx);
1129 const real_t x = u[dz][dy][dx];
1130 Bu_ += bx * x;
1131 }
1132 Bu[dz][dy][qx] = Bu_;
1133 }
1134 }
1135 }
1136 MFEM_SYNC_THREAD;
1137 real_t (*BBu)[max_Q1D][max_Q1D] = (real_t (*)[max_Q1D][max_Q1D])sm0;
1138 MFEM_FOREACH_THREAD(dz,z,D1D)
1139 {
1140 MFEM_FOREACH_THREAD(qx,x,Q1D)
1141 {
1142 MFEM_FOREACH_THREAD(qy,y,Q1D)
1143 {
1144 real_t BBu_ = 0.0;
1145 for (int dy = 0; dy < D1D; ++dy)
1146 {
1147 const real_t bx = B(qy,dy);
1148 BBu_ += bx * Bu[dz][dy][qx];
1149 }
1150 BBu[dz][qy][qx] = BBu_;
1151 }
1152 }
1153 }
1154 MFEM_SYNC_THREAD;
1155 real_t (*BBBu)[max_Q1D][max_Q1D] = (real_t (*)[max_Q1D][max_Q1D])sm1;
1156 MFEM_FOREACH_THREAD(qx,x,Q1D)
1157 {
1158 MFEM_FOREACH_THREAD(qy,y,Q1D)
1159 {
1160 MFEM_FOREACH_THREAD(qz,z,Q1D)
1161 {
1162 real_t BBBu_ = 0.0;
1163 for (int dz = 0; dz < D1D; ++dz)
1164 {
1165 const real_t bx = B(qz,dz);
1166 BBBu_ += bx * BBu[dz][qy][qx];
1167 }
1168 BBBu[qz][qy][qx] = BBBu_;
1169 }
1170 }
1171 }
1172 MFEM_SYNC_THREAD;
1173 real_t (*DBu)[max_Q1D][max_Q1D][3] = (real_t (*)[max_Q1D][max_Q1D][3])sm0;
1174 MFEM_FOREACH_THREAD(qz,z,Q1D)
1175 {
1176 MFEM_FOREACH_THREAD(qy,y,Q1D)
1177 {
1178 MFEM_FOREACH_THREAD(qx,x,Q1D)
1179 {
1180 const real_t O1 = op(qx,qy,qz,0,e);
1181 const real_t O2 = op(qx,qy,qz,1,e);
1182 const real_t O3 = op(qx,qy,qz,2,e);
1183
1184 const real_t X = BBBu[qz][qy][qx];
1185
1186 DBu[qz][qy][qx][0] = O1 * X;
1187 DBu[qz][qy][qx][1] = O2 * X;
1188 DBu[qz][qy][qx][2] = O3 * X;
1189 }
1190 }
1191 }
1192 MFEM_SYNC_THREAD;
1193 real_t (*GDBu)[max_Q1D][max_Q1D][3] = (real_t (*)[max_Q1D][max_Q1D][3])sm1;
1194 MFEM_FOREACH_THREAD(qx,x,Q1D)
1195 {
1196 MFEM_FOREACH_THREAD(qy,y,Q1D)
1197 {
1198 MFEM_FOREACH_THREAD(dz,z,D1D)
1199 {
1200 real_t GDBu0 = 0.0;
1201 real_t GDBu1 = 0.0;
1202 real_t GDBu2 = 0.0;
1203 for (int qz = 0; qz < Q1D; ++qz)
1204 {
1205 const real_t bz = Bt(dz,qz);
1206 const real_t gz = Gt(dz,qz);
1207 GDBu0 += bz * DBu[qz][qy][qx][0];
1208 GDBu1 += bz * DBu[qz][qy][qx][1];
1209 GDBu2 += gz * DBu[qz][qy][qx][2];
1210 }
1211 GDBu[dz][qy][qx][0] = GDBu0;
1212 GDBu[dz][qy][qx][1] = GDBu1;
1213 GDBu[dz][qy][qx][2] = GDBu2;
1214 }
1215 }
1216 }
1217 MFEM_SYNC_THREAD;
1218 real_t (*GGDBu)[max_D1D][max_Q1D][3] = (real_t (*)[max_D1D][max_Q1D][3])sm0;
1219 MFEM_FOREACH_THREAD(dz,z,D1D)
1220 {
1221 MFEM_FOREACH_THREAD(qx,x,Q1D)
1222 {
1223 MFEM_FOREACH_THREAD(dy,y,D1D)
1224 {
1225 real_t GGDBu0 = 0.0;
1226 real_t GGDBu1 = 0.0;
1227 real_t GGDBu2 = 0.0;
1228 for (int qy = 0; qy < Q1D; ++qy)
1229 {
1230 const real_t by = Bt(dy,qy);
1231 const real_t gy = Gt(dy,qy);
1232 GGDBu0 += by * GDBu[dz][qy][qx][0];
1233 GGDBu1 += gy * GDBu[dz][qy][qx][1];
1234 GGDBu2 += by * GDBu[dz][qy][qx][2];
1235 }
1236 GGDBu[dz][dy][qx][0] = GGDBu0;
1237 GGDBu[dz][dy][qx][1] = GGDBu1;
1238 GGDBu[dz][dy][qx][2] = GGDBu2;
1239 }
1240 }
1241 }
1242 MFEM_SYNC_THREAD;
1243 MFEM_FOREACH_THREAD(dz,z,D1D)
1244 {
1245 MFEM_FOREACH_THREAD(dy,y,D1D)
1246 {
1247 MFEM_FOREACH_THREAD(dx,x,D1D)
1248 {
1249 real_t res = 0.0;
1250 for (int qx = 0; qx < Q1D; ++qx)
1251 {
1252 const real_t bx = Bt(dx,qx);
1253 const real_t gx = Gt(dx,qx);
1254 res += gx * GGDBu[dz][dy][qx][0];
1255 res += bx * GGDBu[dz][dy][qx][1];
1256 res += bx * GGDBu[dz][dy][qx][2];
1257 }
1258 y(dx,dy,dz,e) += res;
1259 }
1260 }
1261 }
1262 });
1263}
1264
1265namespace convection
1266{
1267constexpr int ipow(int x, int p) { return p == 0 ? 1 : x*ipow(x, p-1); }
1268constexpr int D(int D1D) { return (11 - D1D) / 2; }
1269constexpr int NBZ(int D1D)
1270{
1271 return ipow(2, D(D1D) >= 0 ? D(D1D) : 0);
1272}
1273}
1274
1275template <int DIM, int T_D1D, int T_Q1D>
1277ConvectionIntegrator::ApplyPAKernels::Kernel()
1278{
1279 if constexpr (DIM == 2)
1280 {
1281 constexpr int T_NBZ = convection::NBZ(T_D1D);
1282 return SmemPAConvectionApply2D<T_D1D, T_Q1D, T_NBZ>;
1283 }
1284 else if constexpr (DIM == 3)
1285 {
1286 return SmemPAConvectionApply3D<T_D1D, T_Q1D>;
1287 }
1288 MFEM_ABORT("");
1289}
1290
1291template <int DIM, int T_D1D, int T_Q1D>
1293ConvectionIntegrator::ApplyPATKernels::Kernel()
1294{
1295 if constexpr (DIM == 2)
1296 {
1297 constexpr int T_NBZ = convection::NBZ(T_D1D);
1298 return SmemPAConvectionApplyT2D<T_D1D, T_Q1D, T_NBZ>;
1299 }
1300 else if constexpr (DIM == 3)
1301 {
1302 return SmemPAConvectionApplyT3D<T_D1D, T_Q1D>;
1303 }
1304 MFEM_ABORT("");
1305}
1306} // namespace mfem
1307/// \endcond DO_NOT_DOCUMENT
1308#endif
void(*)(const int, const Array< real_t > &, const Array< real_t > &, const Array< real_t > &, const Array< real_t > &, const Vector &, const Vector &, Vector &, const int, const int) ApplyKernelType
arguments: NE, B, G, Bt, Gt, pa_data, x, y, D1D, Q1D
real_t b
Definition lissajous.cpp:42
constexpr int DIM
mfem::real_t real_t
real_t u(const Vector &xvec)
Definition lor_mms.hpp:22
MFEM_HOST_DEVICE DeviceTensor< sizeof...(Dims), T > Reshape(T *ptr, Dims... dims)
Wrap a pointer as a DeviceTensor with automatically deduced template parameters.
Definition dtensor.hpp:138
void forall_2D_batch(int N, int X, int Y, int BZ, lambda &&body)
Definition forall.hpp:931
void forall_3D(int N, int X, int Y, int Z, lambda &&body)
Definition forall.hpp:937
float real_t
Definition config.hpp:46
void forall(int N, lambda &&body)
Definition forall.hpp:839
real_t p(const Vector &x, real_t t)
static const DeviceDofQuadLimits & Get()
Return a const reference to the DeviceDofQuadLimits singleton.
Definition forall.hpp:128