MFEM v4.8.0
Finite element discretization library
Loading...
Searching...
No Matches
bilininteg_mass_kernels.hpp
Go to the documentation of this file.
1// Copyright (c) 2010-2025, Lawrence Livermore National Security, LLC. Produced
2// at the Lawrence Livermore National Laboratory. All Rights reserved. See files
3// LICENSE and NOTICE for details. LLNL-CODE-806117.
4//
5// This file is part of the MFEM library. For more information and source code
6// availability visit https://mfem.org.
7//
8// MFEM is free software; you can redistribute it and/or modify it under the
9// terms of the BSD-3 license. We welcome feedback and contributions, see file
10// CONTRIBUTING.md for details.
11
12#ifndef MFEM_BILININTEG_MASS_KERNELS_HPP
13#define MFEM_BILININTEG_MASS_KERNELS_HPP
14
20#include "../bilininteg.hpp"
21
22namespace mfem
23{
24
25/// \cond DO_NOT_DOCUMENT
26
27namespace internal
28{
29
30// PA Mass Diagonal 1D kernel
31static void PAMassAssembleDiagonal1D(const int NE,
32 const Array<real_t> &b,
33 const Vector &d,
34 Vector &y,
35 const int D1D,
36 const int Q1D)
37{
38 auto B = Reshape(b.Read(), Q1D, D1D);
39 auto D = Reshape(d.Read(), Q1D, NE);
40 auto Y = Reshape(y.ReadWrite(), D1D, NE);
41 mfem::forall(NE, [=] MFEM_HOST_DEVICE (int e)
42 {
43 for (int dx = 0; dx < D1D; ++dx)
44 {
45 for (int qx = 0; qx < Q1D; ++qx)
46 {
47 Y(dx, e) += B(qx, dx) * B(qx, dx) * D(qx, e);
48 }
49 }
50 });
51}
52
53MFEM_HOST_DEVICE inline
54void PAMassApply1D_Element(const int e,
55 const int NE,
56 const real_t *b_,
57 const real_t *bt_,
58 const real_t *d_,
59 const real_t *x_,
60 real_t *y_,
61 const int d1d = 0,
62 const int q1d = 0)
63{
64 const int D1D = d1d;
65 const int Q1D = q1d;
66 auto B = ConstDeviceMatrix(b_, Q1D, D1D);
67 auto Bt = ConstDeviceMatrix(bt_, D1D, Q1D);
68 auto D = ConstDeviceMatrix(d_, Q1D, NE);
69 auto X = ConstDeviceMatrix(x_, D1D, NE);
70 auto Y = DeviceMatrix(y_, D1D, NE);
71
72 real_t XQ[DofQuadLimits::MAX_Q1D];
73 for (int qx = 0; qx < Q1D; ++qx)
74 {
75 XQ[qx] = 0.0;
76 }
77 for (int dx = 0; dx < D1D; ++dx)
78 {
79 const real_t s = X(dx,e);
80 for (int qx = 0; qx < Q1D; ++qx)
81 {
82 XQ[qx] += B(qx,dx)*s;
83 }
84 }
85 for (int qx = 0; qx < Q1D; ++qx)
86 {
87 const double q = XQ[qx]*D(qx,e);
88 for (int dx = 0; dx < D1D; ++dx)
89 {
90 Y(dx,e) += Bt(dx,qx) * q;
91 }
92 }
93}
94
95// PA Mass Apply 1D kernel
96static void PAMassApply1D(const int NE,
97 const Array<real_t> &b_,
98 const Array<real_t> &bt_,
99 const Vector &d_,
100 const Vector &x_,
101 Vector &y_,
102 const int d1d = 0,
103 const int q1d = 0)
104{
105 MFEM_VERIFY(d1d <= DeviceDofQuadLimits::Get().MAX_D1D, "");
106 MFEM_VERIFY(q1d <= DeviceDofQuadLimits::Get().MAX_Q1D, "");
107
108 const auto B = b_.Read();
109 const auto Bt = bt_.Read();
110 const auto D = d_.Read();
111 const auto X = x_.Read();
112 auto Y = y_.ReadWrite();
113
114 mfem::forall(NE, [=] MFEM_HOST_DEVICE (int e)
115 {
116 internal::PAMassApply1D_Element(e, NE, B, Bt, D, X, Y, d1d, q1d);
117 });
118}
119
120// PA Mass Diagonal 2D kernel
121template<int T_D1D = 0, int T_Q1D = 0>
122inline void PAMassAssembleDiagonal2D(const int NE,
123 const Array<real_t> &b,
124 const Vector &d,
125 Vector &y,
126 const int d1d = 0,
127 const int q1d = 0)
128{
129 const int D1D = T_D1D ? T_D1D : d1d;
130 const int Q1D = T_Q1D ? T_Q1D : q1d;
131 MFEM_VERIFY(D1D <= DeviceDofQuadLimits::Get().MAX_D1D, "");
132 MFEM_VERIFY(Q1D <= DeviceDofQuadLimits::Get().MAX_Q1D, "");
133 auto B = Reshape(b.Read(), Q1D, D1D);
134 auto D = Reshape(d.Read(), Q1D, Q1D, NE);
135 auto Y = Reshape(y.ReadWrite(), D1D, D1D, NE);
136 mfem::forall(NE, [=] MFEM_HOST_DEVICE (int e)
137 {
138 const int D1D = T_D1D ? T_D1D : d1d;
139 const int Q1D = T_Q1D ? T_Q1D : q1d;
140 constexpr int MD1 = T_D1D ? T_D1D : DofQuadLimits::MAX_D1D;
141 constexpr int MQ1 = T_Q1D ? T_Q1D : DofQuadLimits::MAX_Q1D;
142 real_t QD[MQ1][MD1];
143 for (int qx = 0; qx < Q1D; ++qx)
144 {
145 for (int dy = 0; dy < D1D; ++dy)
146 {
147 QD[qx][dy] = 0.0;
148 for (int qy = 0; qy < Q1D; ++qy)
149 {
150 QD[qx][dy] += B(qy, dy) * B(qy, dy) * D(qx, qy, e);
151 }
152 }
153 }
154 for (int dy = 0; dy < D1D; ++dy)
155 {
156 for (int dx = 0; dx < D1D; ++dx)
157 {
158 for (int qx = 0; qx < Q1D; ++qx)
159 {
160 Y(dx,dy,e) += B(qx, dx) * B(qx, dx) * QD[qx][dy];
161 }
162 }
163 }
164 });
165}
166
167namespace mass
168{
169constexpr int ipow(int x, int p) { return p == 0 ? 1 : x*ipow(x, p-1); }
170constexpr int D(int D1D) { return (11 - D1D) / 2; }
171constexpr int NBZ(int D1D)
172{
173 return ipow(2, D(D1D) >= 0 ? D(D1D) : 0);
174}
175}
176
177// Shared memory PA Mass Diagonal 2D kernel
178template<int T_D1D = 0, int T_Q1D = 0>
179inline void SmemPAMassAssembleDiagonal2D(const int NE,
180 const Array<real_t> &b_,
181 const Vector &d_,
182 Vector &y_,
183 const int d1d = 0,
184 const int q1d = 0)
185{
186 static constexpr int T_NBZ = mass::NBZ(T_D1D);
187 static constexpr int NBZ = T_NBZ ? T_NBZ : 1;
188 const int D1D = T_D1D ? T_D1D : d1d;
189 const int Q1D = T_Q1D ? T_Q1D : q1d;
190 const int max_q1d = T_Q1D ? T_Q1D : DeviceDofQuadLimits::Get().MAX_Q1D;
191 const int max_d1d = T_D1D ? T_D1D : DeviceDofQuadLimits::Get().MAX_D1D;
192 MFEM_VERIFY(D1D <= max_d1d, "");
193 MFEM_VERIFY(Q1D <= max_q1d, "");
194 auto b = Reshape(b_.Read(), Q1D, D1D);
195 auto D = Reshape(d_.Read(), Q1D, Q1D, NE);
196 auto Y = Reshape(y_.ReadWrite(), D1D, D1D, NE);
197 mfem::forall_2D_batch(NE, Q1D, Q1D, NBZ, [=] MFEM_HOST_DEVICE (int e)
198 {
199 const int tidz = MFEM_THREAD_ID(z);
200 const int D1D = T_D1D ? T_D1D : d1d;
201 const int Q1D = T_Q1D ? T_Q1D : q1d;
202 constexpr int MQ1 = T_Q1D ? T_Q1D : DofQuadLimits::MAX_Q1D;
203 constexpr int MD1 = T_D1D ? T_D1D : DofQuadLimits::MAX_D1D;
204 MFEM_SHARED real_t B[MQ1][MD1];
205 MFEM_SHARED real_t QDZ[NBZ][MQ1][MD1];
206 real_t (*QD)[MD1] = (real_t (*)[MD1])(QDZ + tidz);
207 if (tidz == 0)
208 {
209 MFEM_FOREACH_THREAD(d,y,D1D)
210 {
211 MFEM_FOREACH_THREAD(q,x,Q1D)
212 {
213 B[q][d] = b(q,d);
214 }
215 }
216 }
217 MFEM_SYNC_THREAD;
218 MFEM_FOREACH_THREAD(qx,x,Q1D)
219 {
220 MFEM_FOREACH_THREAD(dy,y,D1D)
221 {
222 QD[qx][dy] = 0.0;
223 for (int qy = 0; qy < Q1D; ++qy)
224 {
225 QD[qx][dy] += B[qy][dy] * B[qy][dy] * D(qx, qy, e);
226 }
227 }
228 }
229 MFEM_SYNC_THREAD;
230 MFEM_FOREACH_THREAD(dy,y,D1D)
231 {
232 MFEM_FOREACH_THREAD(dx,x,D1D)
233 {
234 for (int qx = 0; qx < Q1D; ++qx)
235 {
236 // might need absolute values on next line
237 Y(dx,dy,e) += B[qx][dx] * B[qx][dx] * QD[qx][dy];
238 }
239 }
240 }
241 });
242}
243
244// PA Mass Diagonal 3D kernel
245template<int T_D1D = 0, int T_Q1D = 0>
246inline void PAMassAssembleDiagonal3D(const int NE,
247 const Array<real_t> &b,
248 const Vector &d,
249 Vector &y,
250 const int d1d = 0,
251 const int q1d = 0)
252{
253 const int D1D = T_D1D ? T_D1D : d1d;
254 const int Q1D = T_Q1D ? T_Q1D : q1d;
255 MFEM_VERIFY(D1D <= DeviceDofQuadLimits::Get().MAX_D1D, "");
256 MFEM_VERIFY(Q1D <= DeviceDofQuadLimits::Get().MAX_Q1D, "");
257 auto B = Reshape(b.Read(), Q1D, D1D);
258 auto D = Reshape(d.Read(), Q1D, Q1D, Q1D, NE);
259 auto Y = Reshape(y.ReadWrite(), D1D, D1D, D1D, NE);
260 mfem::forall(NE, [=] MFEM_HOST_DEVICE (int e)
261 {
262 const int D1D = T_D1D ? T_D1D : d1d;
263 const int Q1D = T_Q1D ? T_Q1D : q1d;
264 constexpr int MD1 = T_D1D ? T_D1D : DofQuadLimits::MAX_D1D;
265 constexpr int MQ1 = T_Q1D ? T_Q1D : DofQuadLimits::MAX_Q1D;
266 real_t QQD[MQ1][MQ1][MD1];
267 real_t QDD[MQ1][MD1][MD1];
268 for (int qx = 0; qx < Q1D; ++qx)
269 {
270 for (int qy = 0; qy < Q1D; ++qy)
271 {
272 for (int dz = 0; dz < D1D; ++dz)
273 {
274 QQD[qx][qy][dz] = 0.0;
275 for (int qz = 0; qz < Q1D; ++qz)
276 {
277 QQD[qx][qy][dz] += B(qz, dz) * B(qz, dz) * D(qx, qy, qz, e);
278 }
279 }
280 }
281 }
282 for (int qx = 0; qx < Q1D; ++qx)
283 {
284 for (int dz = 0; dz < D1D; ++dz)
285 {
286 for (int dy = 0; dy < D1D; ++dy)
287 {
288 QDD[qx][dy][dz] = 0.0;
289 for (int qy = 0; qy < Q1D; ++qy)
290 {
291 QDD[qx][dy][dz] += B(qy, dy) * B(qy, dy) * QQD[qx][qy][dz];
292 }
293 }
294 }
295 }
296 for (int dz = 0; dz < D1D; ++dz)
297 {
298 for (int dy = 0; dy < D1D; ++dy)
299 {
300 for (int dx = 0; dx < D1D; ++dx)
301 {
302 real_t t = 0.0;
303 for (int qx = 0; qx < Q1D; ++qx)
304 {
305 t += B(qx, dx) * B(qx, dx) * QDD[qx][dy][dz];
306 }
307 Y(dx, dy, dz, e) += t;
308 }
309 }
310 }
311 });
312}
313
314// Shared memory PA Mass Diagonal 3D kernel
315template<int T_D1D = 0, int T_Q1D = 0>
316inline void SmemPAMassAssembleDiagonal3D(const int NE,
317 const Array<real_t> &b_,
318 const Vector &d_,
319 Vector &y_,
320 const int d1d = 0,
321 const int q1d = 0)
322{
323 const int D1D = T_D1D ? T_D1D : d1d;
324 const int Q1D = T_Q1D ? T_Q1D : q1d;
325 const int max_q1d = T_Q1D ? T_Q1D : DeviceDofQuadLimits::Get().MAX_Q1D;
326 const int max_d1d = T_D1D ? T_D1D : DeviceDofQuadLimits::Get().MAX_D1D;
327 MFEM_VERIFY(D1D <= max_d1d, "");
328 MFEM_VERIFY(Q1D <= max_q1d, "");
329 auto b = Reshape(b_.Read(), Q1D, D1D);
330 auto D = Reshape(d_.Read(), Q1D, Q1D, Q1D, NE);
331 auto Y = Reshape(y_.ReadWrite(), D1D, D1D, D1D, NE);
332 mfem::forall_3D(NE, Q1D, Q1D, Q1D, [=] MFEM_HOST_DEVICE (int e)
333 {
334 const int tidz = MFEM_THREAD_ID(z);
335 const int D1D = T_D1D ? T_D1D : d1d;
336 const int Q1D = T_Q1D ? T_Q1D : q1d;
337 constexpr int MD1 = T_D1D ? T_D1D : DofQuadLimits::MAX_D1D;
338 constexpr int MQ1 = T_Q1D ? T_Q1D : DofQuadLimits::MAX_Q1D;
339 MFEM_SHARED real_t B[MQ1][MD1];
340 MFEM_SHARED real_t QQD[MQ1][MQ1][MD1];
341 MFEM_SHARED real_t QDD[MQ1][MD1][MD1];
342 if (tidz == 0)
343 {
344 MFEM_FOREACH_THREAD(d,y,D1D)
345 {
346 MFEM_FOREACH_THREAD(q,x,Q1D)
347 {
348 B[q][d] = b(q,d);
349 }
350 }
351 }
352 MFEM_SYNC_THREAD;
353 MFEM_FOREACH_THREAD(qx,x,Q1D)
354 {
355 MFEM_FOREACH_THREAD(qy,y,Q1D)
356 {
357 MFEM_FOREACH_THREAD(dz,z,D1D)
358 {
359 QQD[qx][qy][dz] = 0.0;
360 for (int qz = 0; qz < Q1D; ++qz)
361 {
362 QQD[qx][qy][dz] += B[qz][dz] * B[qz][dz] * D(qx, qy, qz, e);
363 }
364 }
365 }
366 }
367 MFEM_SYNC_THREAD;
368 MFEM_FOREACH_THREAD(qx,x,Q1D)
369 {
370 MFEM_FOREACH_THREAD(dz,z,D1D)
371 {
372 MFEM_FOREACH_THREAD(dy,y,D1D)
373 {
374 QDD[qx][dy][dz] = 0.0;
375 for (int qy = 0; qy < Q1D; ++qy)
376 {
377 QDD[qx][dy][dz] += B[qy][dy] * B[qy][dy] * QQD[qx][qy][dz];
378 }
379 }
380 }
381 }
382 MFEM_SYNC_THREAD;
383 MFEM_FOREACH_THREAD(dz,z,D1D)
384 {
385 MFEM_FOREACH_THREAD(dy,y,D1D)
386 {
387 MFEM_FOREACH_THREAD(dx,x,D1D)
388 {
389 real_t t = 0.0;
390 for (int qx = 0; qx < Q1D; ++qx)
391 {
392 t += B[qx][dx] * B[qx][dx] * QDD[qx][dy][dz];
393 }
394 Y(dx, dy, dz, e) += t;
395 }
396 }
397 }
398 });
399}
400
401#ifdef MFEM_USE_OCCA
402// OCCA PA Mass Apply 2D kernel
403void OccaPAMassApply2D(const int D1D,
404 const int Q1D,
405 const int NE,
406 const Array<real_t> &B,
407 const Array<real_t> &Bt,
408 const Vector &D,
409 const Vector &X,
410 Vector &Y);
411
412// OCCA PA Mass Apply 3D kernel
413void OccaPAMassApply3D(const int D1D,
414 const int Q1D,
415 const int NE,
416 const Array<real_t> &B,
417 const Array<real_t> &Bt,
418 const Vector &D,
419 const Vector &X,
420 Vector &Y);
421#endif // MFEM_USE_OCCA
422
423template <bool ACCUMULATE = true>
424MFEM_HOST_DEVICE inline
425void PAMassApply2D_Element(const int e,
426 const int NE,
427 const real_t *b_,
428 const real_t *bt_,
429 const real_t *d_,
430 const real_t *x_,
431 real_t *y_,
432 const int d1d = 0,
433 const int q1d = 0)
434{
435 const int D1D = d1d;
436 const int Q1D = q1d;
437 auto B = ConstDeviceMatrix(b_, Q1D, D1D);
438 auto Bt = ConstDeviceMatrix(bt_, D1D, Q1D);
439 auto D = ConstDeviceCube(d_, Q1D, Q1D, NE);
440 auto X = ConstDeviceCube(x_, D1D, D1D, NE);
441 auto Y = DeviceCube(y_, D1D, D1D, NE);
442
443 if (!ACCUMULATE)
444 {
445 for (int dy = 0; dy < D1D; ++dy)
446 {
447 for (int dx = 0; dx < D1D; ++dx)
448 {
449 Y(dx, dy, e) = 0.0;
450 }
451 }
452 }
453
454 constexpr int max_D1D = DofQuadLimits::MAX_D1D;
455 constexpr int max_Q1D = DofQuadLimits::MAX_Q1D;
456 real_t sol_xy[max_Q1D][max_Q1D];
457 for (int qy = 0; qy < Q1D; ++qy)
458 {
459 for (int qx = 0; qx < Q1D; ++qx)
460 {
461 sol_xy[qy][qx] = 0.0;
462 }
463 }
464 for (int dy = 0; dy < D1D; ++dy)
465 {
466 real_t sol_x[max_Q1D];
467 for (int qy = 0; qy < Q1D; ++qy)
468 {
469 sol_x[qy] = 0.0;
470 }
471 for (int dx = 0; dx < D1D; ++dx)
472 {
473 const real_t s = X(dx,dy,e);
474 for (int qx = 0; qx < Q1D; ++qx)
475 {
476 sol_x[qx] += B(qx,dx)* s;
477 }
478 }
479 for (int qy = 0; qy < Q1D; ++qy)
480 {
481 const real_t d2q = B(qy,dy);
482 for (int qx = 0; qx < Q1D; ++qx)
483 {
484 sol_xy[qy][qx] += d2q * sol_x[qx];
485 }
486 }
487 }
488 for (int qy = 0; qy < Q1D; ++qy)
489 {
490 for (int qx = 0; qx < Q1D; ++qx)
491 {
492 sol_xy[qy][qx] *= D(qx,qy,e);
493 }
494 }
495 for (int qy = 0; qy < Q1D; ++qy)
496 {
497 real_t sol_x[max_D1D];
498 for (int dx = 0; dx < D1D; ++dx)
499 {
500 sol_x[dx] = 0.0;
501 }
502 for (int qx = 0; qx < Q1D; ++qx)
503 {
504 const real_t s = sol_xy[qy][qx];
505 for (int dx = 0; dx < D1D; ++dx)
506 {
507 sol_x[dx] += Bt(dx,qx) * s;
508 }
509 }
510 for (int dy = 0; dy < D1D; ++dy)
511 {
512 const real_t q2d = Bt(dy,qy);
513 for (int dx = 0; dx < D1D; ++dx)
514 {
515 Y(dx,dy,e) += q2d * sol_x[dx];
516 }
517 }
518 }
519}
520
521template<int T_D1D, int T_Q1D, int T_NBZ, bool ACCUMULATE = true>
522MFEM_HOST_DEVICE inline
523void SmemPAMassApply2D_Element(const int e,
524 const int NE,
525 const real_t *b_,
526 const real_t *d_,
527 const real_t *x_,
528 real_t *y_,
529 int d1d = 0,
530 int q1d = 0)
531{
532 const int D1D = T_D1D ? T_D1D : d1d;
533 const int Q1D = T_Q1D ? T_Q1D : q1d;
534 constexpr int NBZ = T_NBZ ? T_NBZ : 1;
535
536 constexpr int MQ1 = T_Q1D ? T_Q1D : DofQuadLimits::MAX_Q1D;
537 constexpr int MD1 = T_D1D ? T_D1D : DofQuadLimits::MAX_D1D;
538 constexpr int MDQ = (MQ1 > MD1) ? MQ1 : MD1;
539
540 auto b = ConstDeviceMatrix(b_, Q1D, D1D);
541 auto D = ConstDeviceCube(d_, Q1D, Q1D, NE);
542 auto x = ConstDeviceCube(x_, D1D, D1D, NE);
543 auto Y = DeviceCube(y_, D1D, D1D, NE);
544
545 const int tidz = MFEM_THREAD_ID(z);
546
547 MFEM_SHARED real_t BBt[MQ1*MD1];
548 real_t (*B)[MD1] = (real_t (*)[MD1]) BBt;
549 real_t (*Bt)[MQ1] = (real_t (*)[MQ1]) BBt;
550 MFEM_SHARED real_t sm0[NBZ][MDQ*MDQ];
551 MFEM_SHARED real_t sm1[NBZ][MDQ*MDQ];
552 real_t (*X)[MD1] = (real_t (*)[MD1]) (sm0 + tidz);
553 real_t (*DQ)[MQ1] = (real_t (*)[MQ1]) (sm1 + tidz);
554 real_t (*QQ)[MQ1] = (real_t (*)[MQ1]) (sm0 + tidz);
555 real_t (*QD)[MD1] = (real_t (*)[MD1]) (sm1 + tidz);
556
557
558 MFEM_FOREACH_THREAD(dy,y,D1D)
559 {
560 MFEM_FOREACH_THREAD(dx,x,D1D)
561 {
562 X[dy][dx] = x(dx,dy,e);
563 }
564 }
565 if (tidz == 0)
566 {
567 MFEM_FOREACH_THREAD(dy,y,D1D)
568 {
569 MFEM_FOREACH_THREAD(q,x,Q1D)
570 {
571 B[q][dy] = b(q,dy);
572 }
573 }
574 }
575 MFEM_SYNC_THREAD;
576 MFEM_FOREACH_THREAD(dy,y,D1D)
577 {
578 MFEM_FOREACH_THREAD(qx,x,Q1D)
579 {
580 real_t dq = 0.0;
581 for (int dx = 0; dx < D1D; ++dx)
582 {
583 dq += X[dy][dx] * B[qx][dx];
584 }
585 DQ[dy][qx] = dq;
586 }
587 }
588 MFEM_SYNC_THREAD;
589 MFEM_FOREACH_THREAD(qy,y,Q1D)
590 {
591 MFEM_FOREACH_THREAD(qx,x,Q1D)
592 {
593 real_t qq = 0.0;
594 for (int dy = 0; dy < D1D; ++dy)
595 {
596 qq += DQ[dy][qx] * B[qy][dy];
597 }
598 QQ[qy][qx] = qq * D(qx, qy, e);
599 }
600 }
601 MFEM_SYNC_THREAD;
602 if (tidz == 0)
603 {
604 MFEM_FOREACH_THREAD(dy,y,D1D)
605 {
606 MFEM_FOREACH_THREAD(q,x,Q1D)
607 {
608 Bt[dy][q] = b(q,dy);
609 }
610 }
611 }
612 MFEM_SYNC_THREAD;
613 MFEM_FOREACH_THREAD(qy,y,Q1D)
614 {
615 MFEM_FOREACH_THREAD(dx,x,D1D)
616 {
617 real_t dq = 0.0;
618 for (int qx = 0; qx < Q1D; ++qx)
619 {
620 dq += QQ[qy][qx] * Bt[dx][qx];
621 }
622 QD[qy][dx] = dq;
623 }
624 }
625 MFEM_SYNC_THREAD;
626 MFEM_FOREACH_THREAD(dy,y,D1D)
627 {
628 MFEM_FOREACH_THREAD(dx,x,D1D)
629 {
630 real_t dd = 0.0;
631 for (int qy = 0; qy < Q1D; ++qy)
632 {
633 dd += (QD[qy][dx] * Bt[dy][qy]);
634 }
635 if (ACCUMULATE)
636 {
637 Y(dx, dy, e) += dd;
638 }
639 else
640 {
641 Y(dx, dy, e) = dd;
642 }
643 }
644 }
645}
646
647template <bool ACCUMULATE = true>
648MFEM_HOST_DEVICE inline
649void PAMassApply3D_Element(const int e,
650 const int NE,
651 const real_t *b_,
652 const real_t *bt_,
653 const real_t *d_,
654 const real_t *x_,
655 real_t *y_,
656 const int d1d,
657 const int q1d)
658{
659 const int D1D = d1d;
660 const int Q1D = q1d;
661 auto B = ConstDeviceMatrix(b_, Q1D, D1D);
662 auto Bt = ConstDeviceMatrix(bt_, D1D, Q1D);
663 auto D = DeviceTensor<4,const real_t>(d_, Q1D, Q1D, Q1D, NE);
664 auto X = DeviceTensor<4,const real_t>(x_, D1D, D1D, D1D, NE);
665 auto Y = DeviceTensor<4,real_t>(y_, D1D, D1D, D1D, NE);
666
667 if (!ACCUMULATE)
668 {
669 for (int dz = 0; dz < D1D; ++dz)
670 {
671 for (int dy = 0; dy < D1D; ++dy)
672 {
673 for (int dx = 0; dx < D1D; ++dx)
674 {
675 Y(dx, dy, dz, e) = 0.0;
676 }
677 }
678 }
679 }
680
681 constexpr int max_D1D = DofQuadLimits::MAX_D1D;
682 constexpr int max_Q1D = DofQuadLimits::MAX_Q1D;
683 real_t sol_xyz[max_Q1D][max_Q1D][max_Q1D];
684 for (int qz = 0; qz < Q1D; ++qz)
685 {
686 for (int qy = 0; qy < Q1D; ++qy)
687 {
688 for (int qx = 0; qx < Q1D; ++qx)
689 {
690 sol_xyz[qz][qy][qx] = 0.0;
691 }
692 }
693 }
694 for (int dz = 0; dz < D1D; ++dz)
695 {
696 real_t sol_xy[max_Q1D][max_Q1D];
697 for (int qy = 0; qy < Q1D; ++qy)
698 {
699 for (int qx = 0; qx < Q1D; ++qx)
700 {
701 sol_xy[qy][qx] = 0.0;
702 }
703 }
704 for (int dy = 0; dy < D1D; ++dy)
705 {
706 real_t sol_x[max_Q1D];
707 for (int qx = 0; qx < Q1D; ++qx)
708 {
709 sol_x[qx] = 0;
710 }
711 for (int dx = 0; dx < D1D; ++dx)
712 {
713 const real_t s = X(dx,dy,dz,e);
714 for (int qx = 0; qx < Q1D; ++qx)
715 {
716 sol_x[qx] += B(qx,dx) * s;
717 }
718 }
719 for (int qy = 0; qy < Q1D; ++qy)
720 {
721 const real_t wy = B(qy,dy);
722 for (int qx = 0; qx < Q1D; ++qx)
723 {
724 sol_xy[qy][qx] += wy * sol_x[qx];
725 }
726 }
727 }
728 for (int qz = 0; qz < Q1D; ++qz)
729 {
730 const real_t wz = B(qz,dz);
731 for (int qy = 0; qy < Q1D; ++qy)
732 {
733 for (int qx = 0; qx < Q1D; ++qx)
734 {
735 sol_xyz[qz][qy][qx] += wz * sol_xy[qy][qx];
736 }
737 }
738 }
739 }
740 for (int qz = 0; qz < Q1D; ++qz)
741 {
742 for (int qy = 0; qy < Q1D; ++qy)
743 {
744 for (int qx = 0; qx < Q1D; ++qx)
745 {
746 sol_xyz[qz][qy][qx] *= D(qx,qy,qz,e);
747 }
748 }
749 }
750 for (int qz = 0; qz < Q1D; ++qz)
751 {
752 real_t sol_xy[max_D1D][max_D1D];
753 for (int dy = 0; dy < D1D; ++dy)
754 {
755 for (int dx = 0; dx < D1D; ++dx)
756 {
757 sol_xy[dy][dx] = 0;
758 }
759 }
760 for (int qy = 0; qy < Q1D; ++qy)
761 {
762 real_t sol_x[max_D1D];
763 for (int dx = 0; dx < D1D; ++dx)
764 {
765 sol_x[dx] = 0;
766 }
767 for (int qx = 0; qx < Q1D; ++qx)
768 {
769 const real_t s = sol_xyz[qz][qy][qx];
770 for (int dx = 0; dx < D1D; ++dx)
771 {
772 sol_x[dx] += Bt(dx,qx) * s;
773 }
774 }
775 for (int dy = 0; dy < D1D; ++dy)
776 {
777 const real_t wy = Bt(dy,qy);
778 for (int dx = 0; dx < D1D; ++dx)
779 {
780 sol_xy[dy][dx] += wy * sol_x[dx];
781 }
782 }
783 }
784 for (int dz = 0; dz < D1D; ++dz)
785 {
786 const real_t wz = Bt(dz,qz);
787 for (int dy = 0; dy < D1D; ++dy)
788 {
789 for (int dx = 0; dx < D1D; ++dx)
790 {
791 Y(dx,dy,dz,e) += wz * sol_xy[dy][dx];
792 }
793 }
794 }
795 }
796}
797
798template<int T_D1D, int T_Q1D, bool ACCUMULATE = true>
799MFEM_HOST_DEVICE inline
800void SmemPAMassApply3D_Element(const int e,
801 const int NE,
802 const real_t *b_,
803 const real_t *d_,
804 const real_t *x_,
805 real_t *y_,
806 const int d1d = 0,
807 const int q1d = 0)
808{
809 constexpr int D1D = T_D1D ? T_D1D : d1d;
810 constexpr int Q1D = T_Q1D ? T_Q1D : q1d;
811 constexpr int MQ1 = T_Q1D ? T_Q1D : DofQuadLimits::MAX_Q1D;
812 constexpr int MD1 = T_D1D ? T_D1D : DofQuadLimits::MAX_D1D;
813 constexpr int MDQ = (MQ1 > MD1) ? MQ1 : MD1;
814
815 auto b = ConstDeviceMatrix(b_, Q1D, D1D);
816 auto d = DeviceTensor<4,const real_t>(d_, Q1D, Q1D, Q1D, NE);
817 auto x = DeviceTensor<4,const real_t>(x_, D1D, D1D, D1D, NE);
818 auto y = DeviceTensor<4,real_t>(y_, D1D, D1D, D1D, NE);
819
820 MFEM_SHARED real_t sDQ[MQ1*MD1];
821 real_t (*B)[MD1] = (real_t (*)[MD1]) sDQ;
822 real_t (*Bt)[MQ1] = (real_t (*)[MQ1]) sDQ;
823 MFEM_SHARED real_t sm0[MDQ*MDQ*MDQ];
824 MFEM_SHARED real_t sm1[MDQ*MDQ*MDQ];
825 real_t (*X)[MD1][MD1] = (real_t (*)[MD1][MD1]) sm0;
826 real_t (*DDQ)[MD1][MQ1] = (real_t (*)[MD1][MQ1]) sm1;
827 real_t (*DQQ)[MQ1][MQ1] = (real_t (*)[MQ1][MQ1]) sm0;
828 real_t (*QQQ)[MQ1][MQ1] = (real_t (*)[MQ1][MQ1]) sm1;
829 real_t (*QQD)[MQ1][MD1] = (real_t (*)[MQ1][MD1]) sm0;
830 real_t (*QDD)[MD1][MD1] = (real_t (*)[MD1][MD1]) sm1;
831 MFEM_FOREACH_THREAD(dy,y,D1D)
832 {
833 MFEM_FOREACH_THREAD(dx,x,D1D)
834 {
835 MFEM_UNROLL(MD1)
836 for (int dz = 0; dz < D1D; ++dz)
837 {
838 X[dz][dy][dx] = x(dx,dy,dz,e);
839 }
840 }
841 MFEM_FOREACH_THREAD(dx,x,Q1D)
842 {
843 B[dx][dy] = b(dx,dy);
844 }
845 }
846 MFEM_SYNC_THREAD;
847 MFEM_FOREACH_THREAD(dy,y,D1D)
848 {
849 MFEM_FOREACH_THREAD(qx,x,Q1D)
850 {
851 real_t u[D1D];
852 MFEM_UNROLL(MD1)
853 for (int dz = 0; dz < D1D; dz++)
854 {
855 u[dz] = 0;
856 }
857 MFEM_UNROLL(MD1)
858 for (int dx = 0; dx < D1D; ++dx)
859 {
860 MFEM_UNROLL(MD1)
861 for (int dz = 0; dz < D1D; ++dz)
862 {
863 u[dz] += X[dz][dy][dx] * B[qx][dx];
864 }
865 }
866 MFEM_UNROLL(MD1)
867 for (int dz = 0; dz < D1D; ++dz)
868 {
869 DDQ[dz][dy][qx] = u[dz];
870 }
871 }
872 }
873 MFEM_SYNC_THREAD;
874 MFEM_FOREACH_THREAD(qy,y,Q1D)
875 {
876 MFEM_FOREACH_THREAD(qx,x,Q1D)
877 {
878 real_t u[D1D];
879 MFEM_UNROLL(MD1)
880 for (int dz = 0; dz < D1D; dz++)
881 {
882 u[dz] = 0;
883 }
884 MFEM_UNROLL(MD1)
885 for (int dy = 0; dy < D1D; ++dy)
886 {
887 MFEM_UNROLL(MD1)
888 for (int dz = 0; dz < D1D; dz++)
889 {
890 u[dz] += DDQ[dz][dy][qx] * B[qy][dy];
891 }
892 }
893 MFEM_UNROLL(MD1)
894 for (int dz = 0; dz < D1D; dz++)
895 {
896 DQQ[dz][qy][qx] = u[dz];
897 }
898 }
899 }
900 MFEM_SYNC_THREAD;
901 MFEM_FOREACH_THREAD(qy,y,Q1D)
902 {
903 MFEM_FOREACH_THREAD(qx,x,Q1D)
904 {
905 real_t u[Q1D];
906 MFEM_UNROLL(MQ1)
907 for (int qz = 0; qz < Q1D; qz++)
908 {
909 u[qz] = 0;
910 }
911 MFEM_UNROLL(MD1)
912 for (int dz = 0; dz < D1D; ++dz)
913 {
914 MFEM_UNROLL(MQ1)
915 for (int qz = 0; qz < Q1D; qz++)
916 {
917 u[qz] += DQQ[dz][qy][qx] * B[qz][dz];
918 }
919 }
920 MFEM_UNROLL(MQ1)
921 for (int qz = 0; qz < Q1D; qz++)
922 {
923 QQQ[qz][qy][qx] = u[qz] * d(qx,qy,qz,e);
924 }
925 }
926 }
927 MFEM_SYNC_THREAD;
928 MFEM_FOREACH_THREAD(di,y,D1D)
929 {
930 MFEM_FOREACH_THREAD(q,x,Q1D)
931 {
932 Bt[di][q] = b(q,di);
933 }
934 }
935 MFEM_SYNC_THREAD;
936 MFEM_FOREACH_THREAD(qy,y,Q1D)
937 {
938 MFEM_FOREACH_THREAD(dx,x,D1D)
939 {
940 real_t u[Q1D];
941 MFEM_UNROLL(MQ1)
942 for (int qz = 0; qz < Q1D; ++qz)
943 {
944 u[qz] = 0;
945 }
946 MFEM_UNROLL(MQ1)
947 for (int qx = 0; qx < Q1D; ++qx)
948 {
949 MFEM_UNROLL(MQ1)
950 for (int qz = 0; qz < Q1D; ++qz)
951 {
952 u[qz] += QQQ[qz][qy][qx] * Bt[dx][qx];
953 }
954 }
955 MFEM_UNROLL(MQ1)
956 for (int qz = 0; qz < Q1D; ++qz)
957 {
958 QQD[qz][qy][dx] = u[qz];
959 }
960 }
961 }
962 MFEM_SYNC_THREAD;
963 MFEM_FOREACH_THREAD(dy,y,D1D)
964 {
965 MFEM_FOREACH_THREAD(dx,x,D1D)
966 {
967 real_t u[Q1D];
968 MFEM_UNROLL(MQ1)
969 for (int qz = 0; qz < Q1D; ++qz)
970 {
971 u[qz] = 0;
972 }
973 MFEM_UNROLL(MQ1)
974 for (int qy = 0; qy < Q1D; ++qy)
975 {
976 MFEM_UNROLL(MQ1)
977 for (int qz = 0; qz < Q1D; ++qz)
978 {
979 u[qz] += QQD[qz][qy][dx] * Bt[dy][qy];
980 }
981 }
982 MFEM_UNROLL(MQ1)
983 for (int qz = 0; qz < Q1D; ++qz)
984 {
985 QDD[qz][dy][dx] = u[qz];
986 }
987 }
988 }
989 MFEM_SYNC_THREAD;
990 MFEM_FOREACH_THREAD(dy,y,D1D)
991 {
992 MFEM_FOREACH_THREAD(dx,x,D1D)
993 {
994 real_t u[D1D];
995 MFEM_UNROLL(MD1)
996 for (int dz = 0; dz < D1D; ++dz)
997 {
998 u[dz] = 0;
999 }
1000 MFEM_UNROLL(MQ1)
1001 for (int qz = 0; qz < Q1D; ++qz)
1002 {
1003 MFEM_UNROLL(MD1)
1004 for (int dz = 0; dz < D1D; ++dz)
1005 {
1006 u[dz] += QDD[qz][dy][dx] * Bt[dz][qz];
1007 }
1008 }
1009 MFEM_UNROLL(MD1)
1010 for (int dz = 0; dz < D1D; ++dz)
1011 {
1012 if (ACCUMULATE)
1013 {
1014 y(dx,dy,dz,e) += u[dz];
1015 }
1016 else
1017 {
1018 y(dx,dy,dz,e) = u[dz];
1019 }
1020 }
1021 }
1022 }
1023 MFEM_SYNC_THREAD;
1024}
1025
1026// PA Mass Apply 2D kernel
1027template<int T_D1D = 0, int T_Q1D = 0>
1028inline void PAMassApply2D(const int NE,
1029 const Array<real_t> &b_,
1030 const Array<real_t> &bt_,
1031 const Vector &d_,
1032 const Vector &x_,
1033 Vector &y_,
1034 const int d1d = 0,
1035 const int q1d = 0)
1036{
1037 MFEM_VERIFY(T_D1D ? T_D1D : d1d <= DeviceDofQuadLimits::Get().MAX_D1D, "");
1038 MFEM_VERIFY(T_Q1D ? T_Q1D : q1d <= DeviceDofQuadLimits::Get().MAX_Q1D, "");
1039
1040 const auto B = b_.Read();
1041 const auto Bt = bt_.Read();
1042 const auto D = d_.Read();
1043 const auto X = x_.Read();
1044 auto Y = y_.ReadWrite();
1045
1046 mfem::forall(NE, [=] MFEM_HOST_DEVICE (int e)
1047 {
1048 internal::PAMassApply2D_Element(e, NE, B, Bt, D, X, Y, d1d, q1d);
1049 });
1050}
1051
1052// Shared memory PA Mass Apply 2D kernel
1053template<int T_D1D = 0, int T_Q1D = 0>
1054inline void SmemPAMassApply2D(const int NE,
1055 const Array<real_t> &b_,
1056 const Array<real_t> &bt_,
1057 const Vector &d_,
1058 const Vector &x_,
1059 Vector &y_,
1060 const int d1d = 0,
1061 const int q1d = 0)
1062{
1063 MFEM_CONTRACT_VAR(bt_);
1064 static constexpr int T_NBZ = mass::NBZ(T_D1D);
1065 static constexpr int NBZ = T_NBZ ? T_NBZ : 1;
1066 const int D1D = T_D1D ? T_D1D : d1d;
1067 const int Q1D = T_Q1D ? T_Q1D : q1d;
1068 const int max_q1d = T_Q1D ? T_Q1D : DeviceDofQuadLimits::Get().MAX_Q1D;
1069 const int max_d1d = T_D1D ? T_D1D : DeviceDofQuadLimits::Get().MAX_D1D;
1070 MFEM_VERIFY(D1D <= max_d1d, "");
1071 MFEM_VERIFY(Q1D <= max_q1d, "");
1072 const auto b = b_.Read();
1073 const auto D = d_.Read();
1074 const auto x = x_.Read();
1075 auto Y = y_.ReadWrite();
1076 mfem::forall_2D_batch(NE, Q1D, Q1D, NBZ, [=] MFEM_HOST_DEVICE (int e)
1077 {
1078 internal::SmemPAMassApply2D_Element<T_D1D,T_Q1D,T_NBZ>(
1079 e, NE, b, D, x, Y, d1d, q1d);
1080 });
1081}
1082
1083// PA Mass Apply 3D kernel
1084template<int T_D1D = 0, int T_Q1D = 0>
1085inline void PAMassApply3D(const int NE,
1086 const Array<real_t> &b_,
1087 const Array<real_t> &bt_,
1088 const Vector &d_,
1089 const Vector &x_,
1090 Vector &y_,
1091 const int d1d = 0,
1092 const int q1d = 0)
1093{
1094 MFEM_VERIFY(T_D1D ? T_D1D : d1d <= DeviceDofQuadLimits::Get().MAX_D1D, "");
1095 MFEM_VERIFY(T_Q1D ? T_Q1D : q1d <= DeviceDofQuadLimits::Get().MAX_Q1D, "");
1096
1097 const auto B = b_.Read();
1098 const auto Bt = bt_.Read();
1099 const auto D = d_.Read();
1100 const auto X = x_.Read();
1101 auto Y = y_.ReadWrite();
1102
1103 mfem::forall(NE, [=] MFEM_HOST_DEVICE (int e)
1104 {
1105 internal::PAMassApply3D_Element(e, NE, B, Bt, D, X, Y, d1d, q1d);
1106 });
1107}
1108
1109// Shared memory PA Mass Apply 2D kernel
1110template<int T_D1D = 0, int T_Q1D = 0>
1111inline void SmemPAMassApply3D(const int NE,
1112 const Array<real_t> &b_,
1113 const Array<real_t> &bt_,
1114 const Vector &d_,
1115 const Vector &x_,
1116 Vector &y_,
1117 const int d1d = 0,
1118 const int q1d = 0)
1119{
1120 MFEM_CONTRACT_VAR(bt_);
1121 const int D1D = T_D1D ? T_D1D : d1d;
1122 const int Q1D = T_Q1D ? T_Q1D : q1d;
1123 const int max_q1d = T_Q1D ? T_Q1D : DeviceDofQuadLimits::Get().MAX_Q1D;
1124 const int max_d1d = T_D1D ? T_D1D : DeviceDofQuadLimits::Get().MAX_D1D;
1125 MFEM_VERIFY(D1D <= max_d1d, "");
1126 MFEM_VERIFY(Q1D <= max_q1d, "");
1127 auto b = b_.Read();
1128 auto d = d_.Read();
1129 auto x = x_.Read();
1130 auto y = y_.ReadWrite();
1131 mfem::forall_2D(NE, Q1D, Q1D, [=] MFEM_HOST_DEVICE (int e)
1132 {
1133 internal::SmemPAMassApply3D_Element<T_D1D,T_Q1D>(e, NE, b, d, x, y, d1d, q1d);
1134 });
1135}
1136
1137template<int T_D1D = 0, int T_Q1D = 0>
1138inline void EAMassAssemble1D(const int NE,
1139 const Array<real_t> &basis,
1140 const Vector &padata,
1141 Vector &eadata,
1142 const bool add,
1143 const int d1d = 0,
1144 const int q1d = 0)
1145{
1146 const int D1D = T_D1D ? T_D1D : d1d;
1147 const int Q1D = T_Q1D ? T_Q1D : q1d;
1148 MFEM_VERIFY(D1D <= DeviceDofQuadLimits::Get().MAX_D1D, "");
1149 MFEM_VERIFY(Q1D <= DeviceDofQuadLimits::Get().MAX_Q1D, "");
1150 auto B = Reshape(basis.Read(), Q1D, D1D);
1151 auto D = Reshape(padata.Read(), Q1D, NE);
1152 auto M = Reshape(add ? eadata.ReadWrite() : eadata.Write(), D1D, D1D, NE);
1153 mfem::forall_2D(NE, D1D, D1D, [=] MFEM_HOST_DEVICE (int e)
1154 {
1155 const int D1D = T_D1D ? T_D1D : d1d;
1156 const int Q1D = T_Q1D ? T_Q1D : q1d;
1157 constexpr int MQ1 = T_Q1D ? T_Q1D : DofQuadLimits::MAX_Q1D;
1158 MFEM_FOREACH_THREAD(i1,x,D1D)
1159 {
1160 real_t r_Bi[MQ1];
1161 for (int q = 0; q < Q1D; q++) { r_Bi[q] = B(q,i1); }
1162 MFEM_FOREACH_THREAD(j1,y,D1D)
1163 {
1164 real_t r_Bj[MQ1];
1165 for (int q = 0; q < Q1D; q++) { r_Bj[q] = B(q,j1); }
1166
1167 real_t val = 0.0;
1168 for (int k1 = 0; k1 < Q1D; ++k1)
1169 {
1170 val += r_Bi[k1] * r_Bj[k1] * D(k1, e);
1171 }
1172 if (add)
1173 {
1174 M(i1, j1, e) += val;
1175 }
1176 else
1177 {
1178 M(i1, j1, e) = val;
1179 }
1180 }
1181 }
1182 });
1183}
1184
1185template<int T_D1D = 0, int T_Q1D = 0>
1186inline void EAMassAssemble2D(const int NE,
1187 const Array<real_t> &basis,
1188 const Vector &padata,
1189 Vector &eadata,
1190 const bool add,
1191 const int d1d = 0,
1192 const int q1d = 0)
1193{
1194 const int D1D = T_D1D ? T_D1D : d1d;
1195 const int Q1D = T_Q1D ? T_Q1D : q1d;
1196 MFEM_VERIFY(D1D <= DeviceDofQuadLimits::Get().MAX_D1D, "");
1197 MFEM_VERIFY(Q1D <= DeviceDofQuadLimits::Get().MAX_Q1D, "");
1198 auto B = Reshape(basis.Read(), Q1D, D1D);
1199 auto D = Reshape(padata.Read(), Q1D, Q1D, NE);
1200 auto M = Reshape(add ? eadata.ReadWrite() : eadata.Write(), D1D, D1D, D1D, D1D,
1201 NE);
1202 mfem::forall_2D(NE, D1D, D1D, [=] MFEM_HOST_DEVICE (int e)
1203 {
1204 const int D1D = T_D1D ? T_D1D : d1d;
1205 const int Q1D = T_Q1D ? T_Q1D : q1d;
1206 constexpr int MD1 = T_D1D ? T_D1D : DofQuadLimits::MAX_D1D;
1207 constexpr int MQ1 = T_Q1D ? T_Q1D : DofQuadLimits::MAX_Q1D;
1208 real_t r_B[MQ1][MD1];
1209 for (int d = 0; d < D1D; d++)
1210 {
1211 for (int q = 0; q < Q1D; q++)
1212 {
1213 r_B[q][d] = B(q,d);
1214 }
1215 }
1216 MFEM_SHARED real_t s_D[MQ1][MQ1];
1217 MFEM_FOREACH_THREAD(k1,x,Q1D)
1218 {
1219 MFEM_FOREACH_THREAD(k2,y,Q1D)
1220 {
1221 s_D[k1][k2] = D(k1,k2,e);
1222 }
1223 }
1224 MFEM_SYNC_THREAD;
1225 MFEM_FOREACH_THREAD(i1,x,D1D)
1226 {
1227 MFEM_FOREACH_THREAD(i2,y,D1D)
1228 {
1229 for (int j1 = 0; j1 < D1D; ++j1)
1230 {
1231 for (int j2 = 0; j2 < D1D; ++j2)
1232 {
1233 real_t val = 0.0;
1234 for (int k1 = 0; k1 < Q1D; ++k1)
1235 {
1236 for (int k2 = 0; k2 < Q1D; ++k2)
1237 {
1238 val += r_B[k1][i1] * r_B[k1][j1]
1239 * r_B[k2][i2] * r_B[k2][j2]
1240 * s_D[k1][k2];
1241 }
1242 }
1243 if (add)
1244 {
1245 M(i1, i2, j1, j2, e) += val;
1246 }
1247 else
1248 {
1249 M(i1, i2, j1, j2, e) = val;
1250 }
1251 }
1252 }
1253 }
1254 }
1255 });
1256}
1257
1258template<int T_D1D = 0, int T_Q1D = 0>
1259inline void EAMassAssemble3D(const int NE,
1260 const Array<real_t> &basis,
1261 const Vector &padata,
1262 Vector &eadata,
1263 const bool add,
1264 const int d1d = 0,
1265 const int q1d = 0)
1266{
1267 const int D1D = T_D1D ? T_D1D : d1d;
1268 const int Q1D = T_Q1D ? T_Q1D : q1d;
1269 MFEM_VERIFY(D1D <= DeviceDofQuadLimits::Get().MAX_D1D, "");
1270 MFEM_VERIFY(Q1D <= DeviceDofQuadLimits::Get().MAX_Q1D, "");
1271 auto B = Reshape(basis.Read(), Q1D, D1D);
1272 auto D = Reshape(padata.Read(), Q1D, Q1D, Q1D, NE);
1273 auto M = Reshape(add ? eadata.ReadWrite() : eadata.Write(), D1D, D1D, D1D, D1D,
1274 D1D, D1D, NE);
1275 mfem::forall_3D(NE, D1D, D1D, D1D, [=] MFEM_HOST_DEVICE (int e)
1276 {
1277 const int D1D = T_D1D ? T_D1D : d1d;
1278 const int Q1D = T_Q1D ? T_Q1D : q1d;
1279 constexpr int MD1 = T_D1D ? T_D1D : DofQuadLimits::MAX_D1D;
1280 constexpr int MQ1 = T_Q1D ? T_Q1D : DofQuadLimits::MAX_Q1D;
1281 constexpr int DQ = T_D1D * T_Q1D;
1282
1283 // For quadratic and lower it's better to use registers but for higher-order you start to
1284 // spill and it's better to use shared memory
1285 constexpr bool USE_REG = DQ != 0 && DQ <= 12;
1286 constexpr int MD1r = USE_REG ? MD1 : 1;
1287 constexpr int MQ1r = USE_REG ? MQ1 : 1;
1288 constexpr int MD1s = USE_REG ? 1 : MD1;
1289 constexpr int MQ1s = USE_REG ? 1 : MQ1;
1290
1291 MFEM_SHARED real_t s_B[MQ1s][MD1s];
1292 real_t r_B[MQ1r][MD1r];
1293 real_t (*l_B)[MD1] = nullptr;
1294 if (USE_REG)
1295 {
1296 for (int d = 0; d < D1D; d++)
1297 {
1298 for (int q = 0; q < Q1D; q++)
1299 {
1300 r_B[q][d] = B(q,d);
1301 }
1302 }
1303 l_B = (real_t (*)[MD1])r_B;
1304 }
1305 else
1306 {
1307 if (MFEM_THREAD_ID(z) == 0)
1308 {
1309 MFEM_FOREACH_THREAD(d,x,D1D)
1310 {
1311 MFEM_FOREACH_THREAD(q,y,Q1D)
1312 {
1313 s_B[q][d] = B(q,d);
1314 }
1315 }
1316 }
1317 l_B = (real_t (*)[MD1])s_B;
1318 }
1319
1320 MFEM_SHARED real_t s_D[MQ1][MQ1][MQ1];
1321 MFEM_FOREACH_THREAD(k1,x,Q1D)
1322 {
1323 MFEM_FOREACH_THREAD(k2,y,Q1D)
1324 {
1325 MFEM_FOREACH_THREAD(k3,z,Q1D)
1326 {
1327 s_D[k1][k2][k3] = D(k1,k2,k3,e);
1328 }
1329 }
1330 }
1331 MFEM_SYNC_THREAD;
1332 MFEM_FOREACH_THREAD(i1,x,D1D)
1333 {
1334 MFEM_FOREACH_THREAD(i2,y,D1D)
1335 {
1336 MFEM_FOREACH_THREAD(i3,z,D1D)
1337 {
1338 for (int j1 = 0; j1 < D1D; ++j1)
1339 {
1340 for (int j2 = 0; j2 < D1D; ++j2)
1341 {
1342 for (int j3 = 0; j3 < D1D; ++j3)
1343 {
1344 real_t val = 0.0;
1345 for (int k1 = 0; k1 < Q1D; ++k1)
1346 {
1347 for (int k2 = 0; k2 < Q1D; ++k2)
1348 {
1349 for (int k3 = 0; k3 < Q1D; ++k3)
1350 {
1351 val += l_B[k1][i1] * l_B[k1][j1]
1352 * l_B[k2][i2] * l_B[k2][j2]
1353 * l_B[k3][i3] * l_B[k3][j3]
1354 * s_D[k1][k2][k3];
1355 }
1356 }
1357 }
1358 if (add)
1359 {
1360 M(i1, i2, i3, j1, j2, j3, e) += val;
1361 }
1362 else
1363 {
1364 M(i1, i2, i3, j1, j2, j3, e) = val;
1365 }
1366 }
1367 }
1368 }
1369 }
1370 }
1371 }
1372 });
1373}
1374
1375} // namespace internal
1376
1377namespace
1378{
1379using ApplyKernelType = MassIntegrator::ApplyKernelType;
1380using DiagonalKernelType = MassIntegrator::DiagonalKernelType;
1381}
1382
1383template<int DIM, int T_D1D, int T_Q1D>
1384ApplyKernelType MassIntegrator::ApplyPAKernels::Kernel()
1385{
1386 if (DIM == 1) { return internal::PAMassApply1D; }
1387 else if (DIM == 2) { return internal::SmemPAMassApply2D<T_D1D,T_Q1D>; }
1388 else if (DIM == 3) { return internal::SmemPAMassApply3D<T_D1D, T_Q1D>; }
1389 else { MFEM_ABORT(""); }
1390}
1391
1392inline ApplyKernelType MassIntegrator::ApplyPAKernels::Fallback(
1393 int DIM, int, int)
1394{
1395 if (DIM == 1) { return internal::PAMassApply1D; }
1396 else if (DIM == 2) { return internal::PAMassApply2D; }
1397 else if (DIM == 3) { return internal::PAMassApply3D; }
1398 else { MFEM_ABORT(""); }
1399}
1400
1401template<int DIM, int T_D1D, int T_Q1D>
1402DiagonalKernelType MassIntegrator::DiagonalPAKernels::Kernel()
1403{
1404 if (DIM == 1) { return internal::PAMassAssembleDiagonal1D; }
1405 else if (DIM == 2) { return internal::SmemPAMassAssembleDiagonal2D<T_D1D,T_Q1D>; }
1406 else if (DIM == 3) { return internal::SmemPAMassAssembleDiagonal3D<T_D1D, T_Q1D>; }
1407 else { MFEM_ABORT(""); }
1408}
1409
1410inline DiagonalKernelType MassIntegrator::DiagonalPAKernels::Fallback(
1411 int DIM, int, int)
1412{
1413 if (DIM == 1) { return internal::PAMassAssembleDiagonal1D; }
1414 else if (DIM == 2) { return internal::PAMassAssembleDiagonal2D; }
1415 else if (DIM == 3) { return internal::PAMassAssembleDiagonal3D; }
1416 else { MFEM_ABORT(""); }
1417}
1418/// \endcond DO_NOT_DOCUMENT
1419} // namespace mfem
1420
1421#endif
void(*)(const int, const Array< real_t > &, const Array< real_t > &, const Vector &, const Vector &, Vector &, const int, const int) ApplyKernelType
void(*)(const int, const Array< real_t > &, const Vector &, Vector &, const int, const int) DiagonalKernelType
real_t b
Definition lissajous.cpp:42
constexpr int DIM
real_t u(const Vector &xvec)
Definition lor_mms.hpp:22
DeviceTensor< 2, real_t > DeviceMatrix
Definition dtensor.hpp:143
T * Write(Memory< T > &mem, int size, bool on_dev=true)
Get a pointer for write access to mem with the mfem::Device's DeviceMemoryClass, if on_dev = true,...
Definition device.hpp:358
void add(const Vector &v1, const Vector &v2, Vector &v)
Definition vector.cpp:391
MFEM_HOST_DEVICE DeviceTensor< sizeof...(Dims), T > Reshape(T *ptr, Dims... dims)
Wrap a pointer as a DeviceTensor with automatically deduced template parameters.
Definition dtensor.hpp:131
void forall_2D_batch(int N, int X, int Y, int BZ, lambda &&body)
Definition forall.hpp:768
DeviceTensor< 2, const real_t > ConstDeviceMatrix
Definition dtensor.hpp:144
DeviceTensor< 3, const real_t > ConstDeviceCube
Definition dtensor.hpp:147
void forall_2D(int N, int X, int Y, lambda &&body)
Definition forall.hpp:762
void forall_3D(int N, int X, int Y, int Z, lambda &&body)
Definition forall.hpp:774
float real_t
Definition config.hpp:43
DeviceTensor< 3, real_t > DeviceCube
Definition dtensor.hpp:146
void forall(int N, lambda &&body)
Definition forall.hpp:753
real_t p(const Vector &x, real_t t)
static const DeviceDofQuadLimits & Get()
Return a const reference to the DeviceDofQuadLimits singleton.
Definition forall.hpp:128
int MAX_D1D
Maximum number of 1D nodal points.
Definition forall.hpp:118
int MAX_Q1D
Maximum number of 1D quadrature points.
Definition forall.hpp:119