MFEM v4.9.0
Finite element discretization library
Loading...
Searching...
No Matches
bilininteg_mass_kernels.hpp
Go to the documentation of this file.
1// Copyright (c) 2010-2025, Lawrence Livermore National Security, LLC. Produced
2// at the Lawrence Livermore National Laboratory. All Rights reserved. See files
3// LICENSE and NOTICE for details. LLNL-CODE-806117.
4//
5// This file is part of the MFEM library. For more information and source code
6// availability visit https://mfem.org.
7//
8// MFEM is free software; you can redistribute it and/or modify it under the
9// terms of the BSD-3 license. We welcome feedback and contributions, see file
10// CONTRIBUTING.md for details.
11
12#ifndef MFEM_BILININTEG_MASS_KERNELS_HPP
13#define MFEM_BILININTEG_MASS_KERNELS_HPP
14
20#include "../bilininteg.hpp"
21
22namespace mfem
23{
24
25/// \cond DO_NOT_DOCUMENT
26
27namespace internal
28{
29
30// PA Mass Diagonal 1D kernel
31static void PAMassAssembleDiagonal1D(const int NE,
32 const Array<real_t> &b,
33 const Vector &d,
34 Vector &y,
35 const int D1D,
36 const int Q1D)
37{
38 auto B = Reshape(b.Read(), Q1D, D1D);
39 auto D = Reshape(d.Read(), Q1D, NE);
40 auto Y = Reshape(y.ReadWrite(), D1D, NE);
41 mfem::forall(NE, [=] MFEM_HOST_DEVICE (int e)
42 {
43 for (int dx = 0; dx < D1D; ++dx)
44 {
45 for (int qx = 0; qx < Q1D; ++qx)
46 {
47 Y(dx, e) += B(qx, dx) * B(qx, dx) * D(qx, e);
48 }
49 }
50 });
51}
52
53template <bool ACCUMULATE = true>
54MFEM_HOST_DEVICE inline
55void PAMassApply1D_Element(const int e,
56 const int NE,
57 const real_t *b_,
58 const real_t *bt_,
59 const real_t *d_,
60 const real_t *x_,
61 real_t *y_,
62 const int d1d = 0,
63 const int q1d = 0)
64{
65 const int D1D = d1d;
66 const int Q1D = q1d;
67 auto B = ConstDeviceMatrix(b_, Q1D, D1D);
68 auto Bt = ConstDeviceMatrix(bt_, D1D, Q1D);
69 auto D = ConstDeviceMatrix(d_, Q1D, NE);
70 auto X = ConstDeviceMatrix(x_, D1D, NE);
71 auto Y = DeviceMatrix(y_, D1D, NE);
72
73 if (!ACCUMULATE)
74 {
75 for (int dx = 0; dx < D1D; ++dx)
76 {
77 Y(dx, e) = 0.0;
78 }
79 }
80
81 real_t XQ[DofQuadLimits::MAX_Q1D];
82 for (int qx = 0; qx < Q1D; ++qx)
83 {
84 XQ[qx] = 0.0;
85 }
86 for (int dx = 0; dx < D1D; ++dx)
87 {
88 const real_t s = X(dx,e);
89 for (int qx = 0; qx < Q1D; ++qx)
90 {
91 XQ[qx] += B(qx,dx)*s;
92 }
93 }
94 for (int qx = 0; qx < Q1D; ++qx)
95 {
96 const double q = XQ[qx]*D(qx,e);
97 for (int dx = 0; dx < D1D; ++dx)
98 {
99 Y(dx,e) += Bt(dx,qx) * q;
100 }
101 }
102}
103
104// PA Mass Apply 1D kernel
105static void PAMassApply1D(const int NE,
106 const Array<real_t> &b_,
107 const Array<real_t> &bt_,
108 const Vector &d_,
109 const Vector &x_,
110 Vector &y_,
111 const int d1d = 0,
112 const int q1d = 0)
113{
114 MFEM_VERIFY(d1d <= DeviceDofQuadLimits::Get().MAX_D1D, "");
115 MFEM_VERIFY(q1d <= DeviceDofQuadLimits::Get().MAX_Q1D, "");
116
117 const auto B = b_.Read();
118 const auto Bt = bt_.Read();
119 const auto D = d_.Read();
120 const auto X = x_.Read();
121 auto Y = y_.ReadWrite();
122
123 mfem::forall(NE, [=] MFEM_HOST_DEVICE (int e)
124 {
125 internal::PAMassApply1D_Element(e, NE, B, Bt, D, X, Y, d1d, q1d);
126 });
127}
128
129// PA Mass Diagonal 2D kernel
130template<int T_D1D = 0, int T_Q1D = 0>
131inline void PAMassAssembleDiagonal2D(const int NE,
132 const Array<real_t> &b,
133 const Vector &d,
134 Vector &y,
135 const int d1d = 0,
136 const int q1d = 0)
137{
138 const int D1D = T_D1D ? T_D1D : d1d;
139 const int Q1D = T_Q1D ? T_Q1D : q1d;
140 MFEM_VERIFY(D1D <= DeviceDofQuadLimits::Get().MAX_D1D, "");
141 MFEM_VERIFY(Q1D <= DeviceDofQuadLimits::Get().MAX_Q1D, "");
142 auto B = Reshape(b.Read(), Q1D, D1D);
143 auto D = Reshape(d.Read(), Q1D, Q1D, NE);
144 auto Y = Reshape(y.ReadWrite(), D1D, D1D, NE);
145 mfem::forall(NE, [=] MFEM_HOST_DEVICE (int e)
146 {
147 const int D1D = T_D1D ? T_D1D : d1d;
148 const int Q1D = T_Q1D ? T_Q1D : q1d;
149 constexpr int MD1 = T_D1D ? T_D1D : DofQuadLimits::MAX_D1D;
150 constexpr int MQ1 = T_Q1D ? T_Q1D : DofQuadLimits::MAX_Q1D;
151 real_t QD[MQ1][MD1];
152 for (int qx = 0; qx < Q1D; ++qx)
153 {
154 for (int dy = 0; dy < D1D; ++dy)
155 {
156 QD[qx][dy] = 0.0;
157 for (int qy = 0; qy < Q1D; ++qy)
158 {
159 QD[qx][dy] += B(qy, dy) * B(qy, dy) * D(qx, qy, e);
160 }
161 }
162 }
163 for (int dy = 0; dy < D1D; ++dy)
164 {
165 for (int dx = 0; dx < D1D; ++dx)
166 {
167 for (int qx = 0; qx < Q1D; ++qx)
168 {
169 Y(dx,dy,e) += B(qx, dx) * B(qx, dx) * QD[qx][dy];
170 }
171 }
172 }
173 });
174}
175
176namespace mass
177{
178constexpr int ipow(int x, int p) { return p == 0 ? 1 : x*ipow(x, p-1); }
179constexpr int D(int D1D) { return (11 - D1D) / 2; }
180constexpr int NBZ(int D1D)
181{
182 return ipow(2, D(D1D) >= 0 ? D(D1D) : 0);
183}
184}
185
186// Shared memory PA Mass Diagonal 2D kernel
187template<int T_D1D = 0, int T_Q1D = 0>
188inline void SmemPAMassAssembleDiagonal2D(const int NE,
189 const Array<real_t> &b_,
190 const Vector &d_,
191 Vector &y_,
192 const int d1d = 0,
193 const int q1d = 0)
194{
195 static constexpr int T_NBZ = mass::NBZ(T_D1D);
196 static constexpr int NBZ = T_NBZ ? T_NBZ : 1;
197 const int D1D = T_D1D ? T_D1D : d1d;
198 const int Q1D = T_Q1D ? T_Q1D : q1d;
199 const int max_q1d = T_Q1D ? T_Q1D : DeviceDofQuadLimits::Get().MAX_Q1D;
200 const int max_d1d = T_D1D ? T_D1D : DeviceDofQuadLimits::Get().MAX_D1D;
201 MFEM_VERIFY(D1D <= max_d1d, "");
202 MFEM_VERIFY(Q1D <= max_q1d, "");
203 auto b = Reshape(b_.Read(), Q1D, D1D);
204 auto D = Reshape(d_.Read(), Q1D, Q1D, NE);
205 auto Y = Reshape(y_.ReadWrite(), D1D, D1D, NE);
206 mfem::forall_2D_batch(NE, Q1D, Q1D, NBZ, [=] MFEM_HOST_DEVICE (int e)
207 {
208 const int tidz = MFEM_THREAD_ID(z);
209 const int D1D = T_D1D ? T_D1D : d1d;
210 const int Q1D = T_Q1D ? T_Q1D : q1d;
211 constexpr int MQ1 = T_Q1D ? T_Q1D : DofQuadLimits::MAX_Q1D;
212 constexpr int MD1 = T_D1D ? T_D1D : DofQuadLimits::MAX_D1D;
213 MFEM_SHARED real_t B[MQ1][MD1];
214 MFEM_SHARED real_t QDZ[NBZ][MQ1][MD1];
215 real_t (*QD)[MD1] = (real_t (*)[MD1])(QDZ + tidz);
216 if (tidz == 0)
217 {
218 MFEM_FOREACH_THREAD(d,y,D1D)
219 {
220 MFEM_FOREACH_THREAD(q,x,Q1D)
221 {
222 B[q][d] = b(q,d);
223 }
224 }
225 }
226 MFEM_SYNC_THREAD;
227 MFEM_FOREACH_THREAD(qx,x,Q1D)
228 {
229 MFEM_FOREACH_THREAD(dy,y,D1D)
230 {
231 QD[qx][dy] = 0.0;
232 for (int qy = 0; qy < Q1D; ++qy)
233 {
234 QD[qx][dy] += B[qy][dy] * B[qy][dy] * D(qx, qy, e);
235 }
236 }
237 }
238 MFEM_SYNC_THREAD;
239 MFEM_FOREACH_THREAD(dy,y,D1D)
240 {
241 MFEM_FOREACH_THREAD(dx,x,D1D)
242 {
243 for (int qx = 0; qx < Q1D; ++qx)
244 {
245 // might need absolute values on next line
246 Y(dx,dy,e) += B[qx][dx] * B[qx][dx] * QD[qx][dy];
247 }
248 }
249 }
250 });
251}
252
253// PA Mass Diagonal 3D kernel
254template<int T_D1D = 0, int T_Q1D = 0>
255inline void PAMassAssembleDiagonal3D(const int NE,
256 const Array<real_t> &b,
257 const Vector &d,
258 Vector &y,
259 const int d1d = 0,
260 const int q1d = 0)
261{
262 const int D1D = T_D1D ? T_D1D : d1d;
263 const int Q1D = T_Q1D ? T_Q1D : q1d;
264 MFEM_VERIFY(D1D <= DeviceDofQuadLimits::Get().MAX_D1D, "");
265 MFEM_VERIFY(Q1D <= DeviceDofQuadLimits::Get().MAX_Q1D, "");
266 auto B = Reshape(b.Read(), Q1D, D1D);
267 auto D = Reshape(d.Read(), Q1D, Q1D, Q1D, NE);
268 auto Y = Reshape(y.ReadWrite(), D1D, D1D, D1D, NE);
269 mfem::forall(NE, [=] MFEM_HOST_DEVICE (int e)
270 {
271 const int D1D = T_D1D ? T_D1D : d1d;
272 const int Q1D = T_Q1D ? T_Q1D : q1d;
273 constexpr int MD1 = T_D1D ? T_D1D : DofQuadLimits::MAX_D1D;
274 constexpr int MQ1 = T_Q1D ? T_Q1D : DofQuadLimits::MAX_Q1D;
275 real_t QQD[MQ1][MQ1][MD1];
276 real_t QDD[MQ1][MD1][MD1];
277 for (int qx = 0; qx < Q1D; ++qx)
278 {
279 for (int qy = 0; qy < Q1D; ++qy)
280 {
281 for (int dz = 0; dz < D1D; ++dz)
282 {
283 QQD[qx][qy][dz] = 0.0;
284 for (int qz = 0; qz < Q1D; ++qz)
285 {
286 QQD[qx][qy][dz] += B(qz, dz) * B(qz, dz) * D(qx, qy, qz, e);
287 }
288 }
289 }
290 }
291 for (int qx = 0; qx < Q1D; ++qx)
292 {
293 for (int dz = 0; dz < D1D; ++dz)
294 {
295 for (int dy = 0; dy < D1D; ++dy)
296 {
297 QDD[qx][dy][dz] = 0.0;
298 for (int qy = 0; qy < Q1D; ++qy)
299 {
300 QDD[qx][dy][dz] += B(qy, dy) * B(qy, dy) * QQD[qx][qy][dz];
301 }
302 }
303 }
304 }
305 for (int dz = 0; dz < D1D; ++dz)
306 {
307 for (int dy = 0; dy < D1D; ++dy)
308 {
309 for (int dx = 0; dx < D1D; ++dx)
310 {
311 real_t t = 0.0;
312 for (int qx = 0; qx < Q1D; ++qx)
313 {
314 t += B(qx, dx) * B(qx, dx) * QDD[qx][dy][dz];
315 }
316 Y(dx, dy, dz, e) += t;
317 }
318 }
319 }
320 });
321}
322
323// Shared memory PA Mass Diagonal 3D kernel
324template<int T_D1D = 0, int T_Q1D = 0>
325inline void SmemPAMassAssembleDiagonal3D(const int NE,
326 const Array<real_t> &b_,
327 const Vector &d_,
328 Vector &y_,
329 const int d1d = 0,
330 const int q1d = 0)
331{
332 const int D1D = T_D1D ? T_D1D : d1d;
333 const int Q1D = T_Q1D ? T_Q1D : q1d;
334 const int max_q1d = T_Q1D ? T_Q1D : DeviceDofQuadLimits::Get().MAX_Q1D;
335 const int max_d1d = T_D1D ? T_D1D : DeviceDofQuadLimits::Get().MAX_D1D;
336 MFEM_VERIFY(D1D <= max_d1d, "");
337 MFEM_VERIFY(Q1D <= max_q1d, "");
338 auto b = Reshape(b_.Read(), Q1D, D1D);
339 auto D = Reshape(d_.Read(), Q1D, Q1D, Q1D, NE);
340 auto Y = Reshape(y_.ReadWrite(), D1D, D1D, D1D, NE);
341 mfem::forall_3D(NE, Q1D, Q1D, Q1D, [=] MFEM_HOST_DEVICE (int e)
342 {
343 const int tidz = MFEM_THREAD_ID(z);
344 const int D1D = T_D1D ? T_D1D : d1d;
345 const int Q1D = T_Q1D ? T_Q1D : q1d;
346 constexpr int MD1 = T_D1D ? T_D1D : DofQuadLimits::MAX_D1D;
347 constexpr int MQ1 = T_Q1D ? T_Q1D : DofQuadLimits::MAX_Q1D;
348 MFEM_SHARED real_t B[MQ1][MD1];
349 MFEM_SHARED real_t QQD[MQ1][MQ1][MD1];
350 MFEM_SHARED real_t QDD[MQ1][MD1][MD1];
351 if (tidz == 0)
352 {
353 MFEM_FOREACH_THREAD(d,y,D1D)
354 {
355 MFEM_FOREACH_THREAD(q,x,Q1D)
356 {
357 B[q][d] = b(q,d);
358 }
359 }
360 }
361 MFEM_SYNC_THREAD;
362 MFEM_FOREACH_THREAD(qx,x,Q1D)
363 {
364 MFEM_FOREACH_THREAD(qy,y,Q1D)
365 {
366 MFEM_FOREACH_THREAD(dz,z,D1D)
367 {
368 QQD[qx][qy][dz] = 0.0;
369 for (int qz = 0; qz < Q1D; ++qz)
370 {
371 QQD[qx][qy][dz] += B[qz][dz] * B[qz][dz] * D(qx, qy, qz, e);
372 }
373 }
374 }
375 }
376 MFEM_SYNC_THREAD;
377 MFEM_FOREACH_THREAD(qx,x,Q1D)
378 {
379 MFEM_FOREACH_THREAD(dz,z,D1D)
380 {
381 MFEM_FOREACH_THREAD(dy,y,D1D)
382 {
383 QDD[qx][dy][dz] = 0.0;
384 for (int qy = 0; qy < Q1D; ++qy)
385 {
386 QDD[qx][dy][dz] += B[qy][dy] * B[qy][dy] * QQD[qx][qy][dz];
387 }
388 }
389 }
390 }
391 MFEM_SYNC_THREAD;
392 MFEM_FOREACH_THREAD(dz,z,D1D)
393 {
394 MFEM_FOREACH_THREAD(dy,y,D1D)
395 {
396 MFEM_FOREACH_THREAD(dx,x,D1D)
397 {
398 real_t t = 0.0;
399 for (int qx = 0; qx < Q1D; ++qx)
400 {
401 t += B[qx][dx] * B[qx][dx] * QDD[qx][dy][dz];
402 }
403 Y(dx, dy, dz, e) += t;
404 }
405 }
406 }
407 });
408}
409
410#ifdef MFEM_USE_OCCA
411// OCCA PA Mass Apply 2D kernel
412void OccaPAMassApply2D(const int D1D,
413 const int Q1D,
414 const int NE,
415 const Array<real_t> &B,
416 const Array<real_t> &Bt,
417 const Vector &D,
418 const Vector &X,
419 Vector &Y);
420
421// OCCA PA Mass Apply 3D kernel
422void OccaPAMassApply3D(const int D1D,
423 const int Q1D,
424 const int NE,
425 const Array<real_t> &B,
426 const Array<real_t> &Bt,
427 const Vector &D,
428 const Vector &X,
429 Vector &Y);
430#endif // MFEM_USE_OCCA
431
432template <bool ACCUMULATE = true>
433MFEM_HOST_DEVICE inline
434void PAMassApply2D_Element(const int e,
435 const int NE,
436 const real_t *b_,
437 const real_t *bt_,
438 const real_t *d_,
439 const real_t *x_,
440 real_t *y_,
441 const int d1d = 0,
442 const int q1d = 0)
443{
444 const int D1D = d1d;
445 const int Q1D = q1d;
446 auto B = ConstDeviceMatrix(b_, Q1D, D1D);
447 auto Bt = ConstDeviceMatrix(bt_, D1D, Q1D);
448 auto D = ConstDeviceCube(d_, Q1D, Q1D, NE);
449 auto X = ConstDeviceCube(x_, D1D, D1D, NE);
450 auto Y = DeviceCube(y_, D1D, D1D, NE);
451
452 if (!ACCUMULATE)
453 {
454 for (int dy = 0; dy < D1D; ++dy)
455 {
456 for (int dx = 0; dx < D1D; ++dx)
457 {
458 Y(dx, dy, e) = 0.0;
459 }
460 }
461 }
462
463 constexpr int max_D1D = DofQuadLimits::MAX_D1D;
464 constexpr int max_Q1D = DofQuadLimits::MAX_Q1D;
465 real_t sol_xy[max_Q1D][max_Q1D];
466 for (int qy = 0; qy < Q1D; ++qy)
467 {
468 for (int qx = 0; qx < Q1D; ++qx)
469 {
470 sol_xy[qy][qx] = 0.0;
471 }
472 }
473 for (int dy = 0; dy < D1D; ++dy)
474 {
475 real_t sol_x[max_Q1D];
476 for (int qy = 0; qy < Q1D; ++qy)
477 {
478 sol_x[qy] = 0.0;
479 }
480 for (int dx = 0; dx < D1D; ++dx)
481 {
482 const real_t s = X(dx,dy,e);
483 for (int qx = 0; qx < Q1D; ++qx)
484 {
485 sol_x[qx] += B(qx,dx)* s;
486 }
487 }
488 for (int qy = 0; qy < Q1D; ++qy)
489 {
490 const real_t d2q = B(qy,dy);
491 for (int qx = 0; qx < Q1D; ++qx)
492 {
493 sol_xy[qy][qx] += d2q * sol_x[qx];
494 }
495 }
496 }
497 for (int qy = 0; qy < Q1D; ++qy)
498 {
499 for (int qx = 0; qx < Q1D; ++qx)
500 {
501 sol_xy[qy][qx] *= D(qx,qy,e);
502 }
503 }
504 for (int qy = 0; qy < Q1D; ++qy)
505 {
506 real_t sol_x[max_D1D];
507 for (int dx = 0; dx < D1D; ++dx)
508 {
509 sol_x[dx] = 0.0;
510 }
511 for (int qx = 0; qx < Q1D; ++qx)
512 {
513 const real_t s = sol_xy[qy][qx];
514 for (int dx = 0; dx < D1D; ++dx)
515 {
516 sol_x[dx] += Bt(dx,qx) * s;
517 }
518 }
519 for (int dy = 0; dy < D1D; ++dy)
520 {
521 const real_t q2d = Bt(dy,qy);
522 for (int dx = 0; dx < D1D; ++dx)
523 {
524 Y(dx,dy,e) += q2d * sol_x[dx];
525 }
526 }
527 }
528}
529
530template<int T_D1D, int T_Q1D, int T_NBZ, bool ACCUMULATE = true>
531MFEM_HOST_DEVICE inline
532void SmemPAMassApply2D_Element(const int e,
533 const int NE,
534 const real_t *b_,
535 const real_t *d_,
536 const real_t *x_,
537 real_t *y_,
538 int d1d = 0,
539 int q1d = 0)
540{
541 const int D1D = T_D1D ? T_D1D : d1d;
542 const int Q1D = T_Q1D ? T_Q1D : q1d;
543 constexpr int NBZ = T_NBZ ? T_NBZ : 1;
544
545 constexpr int MQ1 = T_Q1D ? T_Q1D : DofQuadLimits::MAX_Q1D;
546 constexpr int MD1 = T_D1D ? T_D1D : DofQuadLimits::MAX_D1D;
547 constexpr int MDQ = (MQ1 > MD1) ? MQ1 : MD1;
548
549 auto b = ConstDeviceMatrix(b_, Q1D, D1D);
550 auto D = ConstDeviceCube(d_, Q1D, Q1D, NE);
551 auto x = ConstDeviceCube(x_, D1D, D1D, NE);
552 auto Y = DeviceCube(y_, D1D, D1D, NE);
553
554 const int tidz = MFEM_THREAD_ID(z);
555
556 MFEM_SHARED real_t BBt[MQ1*MD1];
557 real_t (*B)[MD1] = (real_t (*)[MD1]) BBt;
558 real_t (*Bt)[MQ1] = (real_t (*)[MQ1]) BBt;
559 MFEM_SHARED real_t sm0[NBZ][MDQ*MDQ];
560 MFEM_SHARED real_t sm1[NBZ][MDQ*MDQ];
561 real_t (*X)[MD1] = (real_t (*)[MD1]) (sm0 + tidz);
562 real_t (*DQ)[MQ1] = (real_t (*)[MQ1]) (sm1 + tidz);
563 real_t (*QQ)[MQ1] = (real_t (*)[MQ1]) (sm0 + tidz);
564 real_t (*QD)[MD1] = (real_t (*)[MD1]) (sm1 + tidz);
565
566
567 MFEM_FOREACH_THREAD(dy,y,D1D)
568 {
569 MFEM_FOREACH_THREAD(dx,x,D1D)
570 {
571 X[dy][dx] = x(dx,dy,e);
572 }
573 }
574 if (tidz == 0)
575 {
576 MFEM_FOREACH_THREAD(dy,y,D1D)
577 {
578 MFEM_FOREACH_THREAD(q,x,Q1D)
579 {
580 B[q][dy] = b(q,dy);
581 }
582 }
583 }
584 MFEM_SYNC_THREAD;
585 MFEM_FOREACH_THREAD(dy,y,D1D)
586 {
587 MFEM_FOREACH_THREAD(qx,x,Q1D)
588 {
589 real_t dq = 0.0;
590 for (int dx = 0; dx < D1D; ++dx)
591 {
592 dq += X[dy][dx] * B[qx][dx];
593 }
594 DQ[dy][qx] = dq;
595 }
596 }
597 MFEM_SYNC_THREAD;
598 MFEM_FOREACH_THREAD(qy,y,Q1D)
599 {
600 MFEM_FOREACH_THREAD(qx,x,Q1D)
601 {
602 real_t qq = 0.0;
603 for (int dy = 0; dy < D1D; ++dy)
604 {
605 qq += DQ[dy][qx] * B[qy][dy];
606 }
607 QQ[qy][qx] = qq * D(qx, qy, e);
608 }
609 }
610 MFEM_SYNC_THREAD;
611 if (tidz == 0)
612 {
613 MFEM_FOREACH_THREAD(dy,y,D1D)
614 {
615 MFEM_FOREACH_THREAD(q,x,Q1D)
616 {
617 Bt[dy][q] = b(q,dy);
618 }
619 }
620 }
621 MFEM_SYNC_THREAD;
622 MFEM_FOREACH_THREAD(qy,y,Q1D)
623 {
624 MFEM_FOREACH_THREAD(dx,x,D1D)
625 {
626 real_t dq = 0.0;
627 for (int qx = 0; qx < Q1D; ++qx)
628 {
629 dq += QQ[qy][qx] * Bt[dx][qx];
630 }
631 QD[qy][dx] = dq;
632 }
633 }
634 MFEM_SYNC_THREAD;
635 MFEM_FOREACH_THREAD(dy,y,D1D)
636 {
637 MFEM_FOREACH_THREAD(dx,x,D1D)
638 {
639 real_t dd = 0.0;
640 for (int qy = 0; qy < Q1D; ++qy)
641 {
642 dd += (QD[qy][dx] * Bt[dy][qy]);
643 }
644 if (ACCUMULATE)
645 {
646 Y(dx, dy, e) += dd;
647 }
648 else
649 {
650 Y(dx, dy, e) = dd;
651 }
652 }
653 }
654}
655
656template <bool ACCUMULATE = true>
657MFEM_HOST_DEVICE inline
658void PAMassApply3D_Element(const int e,
659 const int NE,
660 const real_t *b_,
661 const real_t *bt_,
662 const real_t *d_,
663 const real_t *x_,
664 real_t *y_,
665 const int d1d,
666 const int q1d)
667{
668 const int D1D = d1d;
669 const int Q1D = q1d;
670 auto B = ConstDeviceMatrix(b_, Q1D, D1D);
671 auto Bt = ConstDeviceMatrix(bt_, D1D, Q1D);
672 auto D = DeviceTensor<4,const real_t>(d_, Q1D, Q1D, Q1D, NE);
673 auto X = DeviceTensor<4,const real_t>(x_, D1D, D1D, D1D, NE);
674 auto Y = DeviceTensor<4,real_t>(y_, D1D, D1D, D1D, NE);
675
676 if (!ACCUMULATE)
677 {
678 for (int dz = 0; dz < D1D; ++dz)
679 {
680 for (int dy = 0; dy < D1D; ++dy)
681 {
682 for (int dx = 0; dx < D1D; ++dx)
683 {
684 Y(dx, dy, dz, e) = 0.0;
685 }
686 }
687 }
688 }
689
690 constexpr int max_D1D = DofQuadLimits::MAX_D1D;
691 constexpr int max_Q1D = DofQuadLimits::MAX_Q1D;
692 real_t sol_xyz[max_Q1D][max_Q1D][max_Q1D];
693 for (int qz = 0; qz < Q1D; ++qz)
694 {
695 for (int qy = 0; qy < Q1D; ++qy)
696 {
697 for (int qx = 0; qx < Q1D; ++qx)
698 {
699 sol_xyz[qz][qy][qx] = 0.0;
700 }
701 }
702 }
703 for (int dz = 0; dz < D1D; ++dz)
704 {
705 real_t sol_xy[max_Q1D][max_Q1D];
706 for (int qy = 0; qy < Q1D; ++qy)
707 {
708 for (int qx = 0; qx < Q1D; ++qx)
709 {
710 sol_xy[qy][qx] = 0.0;
711 }
712 }
713 for (int dy = 0; dy < D1D; ++dy)
714 {
715 real_t sol_x[max_Q1D];
716 for (int qx = 0; qx < Q1D; ++qx)
717 {
718 sol_x[qx] = 0;
719 }
720 for (int dx = 0; dx < D1D; ++dx)
721 {
722 const real_t s = X(dx,dy,dz,e);
723 for (int qx = 0; qx < Q1D; ++qx)
724 {
725 sol_x[qx] += B(qx,dx) * s;
726 }
727 }
728 for (int qy = 0; qy < Q1D; ++qy)
729 {
730 const real_t wy = B(qy,dy);
731 for (int qx = 0; qx < Q1D; ++qx)
732 {
733 sol_xy[qy][qx] += wy * sol_x[qx];
734 }
735 }
736 }
737 for (int qz = 0; qz < Q1D; ++qz)
738 {
739 const real_t wz = B(qz,dz);
740 for (int qy = 0; qy < Q1D; ++qy)
741 {
742 for (int qx = 0; qx < Q1D; ++qx)
743 {
744 sol_xyz[qz][qy][qx] += wz * sol_xy[qy][qx];
745 }
746 }
747 }
748 }
749 for (int qz = 0; qz < Q1D; ++qz)
750 {
751 for (int qy = 0; qy < Q1D; ++qy)
752 {
753 for (int qx = 0; qx < Q1D; ++qx)
754 {
755 sol_xyz[qz][qy][qx] *= D(qx,qy,qz,e);
756 }
757 }
758 }
759 for (int qz = 0; qz < Q1D; ++qz)
760 {
761 real_t sol_xy[max_D1D][max_D1D];
762 for (int dy = 0; dy < D1D; ++dy)
763 {
764 for (int dx = 0; dx < D1D; ++dx)
765 {
766 sol_xy[dy][dx] = 0;
767 }
768 }
769 for (int qy = 0; qy < Q1D; ++qy)
770 {
771 real_t sol_x[max_D1D];
772 for (int dx = 0; dx < D1D; ++dx)
773 {
774 sol_x[dx] = 0;
775 }
776 for (int qx = 0; qx < Q1D; ++qx)
777 {
778 const real_t s = sol_xyz[qz][qy][qx];
779 for (int dx = 0; dx < D1D; ++dx)
780 {
781 sol_x[dx] += Bt(dx,qx) * s;
782 }
783 }
784 for (int dy = 0; dy < D1D; ++dy)
785 {
786 const real_t wy = Bt(dy,qy);
787 for (int dx = 0; dx < D1D; ++dx)
788 {
789 sol_xy[dy][dx] += wy * sol_x[dx];
790 }
791 }
792 }
793 for (int dz = 0; dz < D1D; ++dz)
794 {
795 const real_t wz = Bt(dz,qz);
796 for (int dy = 0; dy < D1D; ++dy)
797 {
798 for (int dx = 0; dx < D1D; ++dx)
799 {
800 Y(dx,dy,dz,e) += wz * sol_xy[dy][dx];
801 }
802 }
803 }
804 }
805}
806
807template<int T_D1D, int T_Q1D, bool ACCUMULATE = true>
808MFEM_HOST_DEVICE inline
809void SmemPAMassApply3D_Element(const int e,
810 const int NE,
811 const real_t *b_,
812 const real_t *d_,
813 const real_t *x_,
814 real_t *y_,
815 const int d1d = 0,
816 const int q1d = 0)
817{
818 constexpr int D1D = T_D1D ? T_D1D : d1d;
819 constexpr int Q1D = T_Q1D ? T_Q1D : q1d;
820 constexpr int MQ1 = T_Q1D ? T_Q1D : DofQuadLimits::MAX_Q1D;
821 constexpr int MD1 = T_D1D ? T_D1D : DofQuadLimits::MAX_D1D;
822 constexpr int MDQ = (MQ1 > MD1) ? MQ1 : MD1;
823
824 auto b = ConstDeviceMatrix(b_, Q1D, D1D);
825 auto d = DeviceTensor<4,const real_t>(d_, Q1D, Q1D, Q1D, NE);
826 auto x = DeviceTensor<4,const real_t>(x_, D1D, D1D, D1D, NE);
827 auto y = DeviceTensor<4,real_t>(y_, D1D, D1D, D1D, NE);
828
829 MFEM_SHARED real_t sDQ[MQ1*MD1];
830 real_t (*B)[MD1] = (real_t (*)[MD1]) sDQ;
831 real_t (*Bt)[MQ1] = (real_t (*)[MQ1]) sDQ;
832 MFEM_SHARED real_t sm0[MDQ*MDQ*MDQ];
833 MFEM_SHARED real_t sm1[MDQ*MDQ*MDQ];
834 real_t (*X)[MD1][MD1] = (real_t (*)[MD1][MD1]) sm0;
835 real_t (*DDQ)[MD1][MQ1] = (real_t (*)[MD1][MQ1]) sm1;
836 real_t (*DQQ)[MQ1][MQ1] = (real_t (*)[MQ1][MQ1]) sm0;
837 real_t (*QQQ)[MQ1][MQ1] = (real_t (*)[MQ1][MQ1]) sm1;
838 real_t (*QQD)[MQ1][MD1] = (real_t (*)[MQ1][MD1]) sm0;
839 real_t (*QDD)[MD1][MD1] = (real_t (*)[MD1][MD1]) sm1;
840 MFEM_FOREACH_THREAD(dy,y,D1D)
841 {
842 MFEM_FOREACH_THREAD(dx,x,D1D)
843 {
844 MFEM_UNROLL(MD1)
845 for (int dz = 0; dz < D1D; ++dz)
846 {
847 X[dz][dy][dx] = x(dx,dy,dz,e);
848 }
849 }
850 MFEM_FOREACH_THREAD(dx,x,Q1D)
851 {
852 B[dx][dy] = b(dx,dy);
853 }
854 }
855 MFEM_SYNC_THREAD;
856 MFEM_FOREACH_THREAD(dy,y,D1D)
857 {
858 MFEM_FOREACH_THREAD(qx,x,Q1D)
859 {
860 real_t u[D1D];
861 MFEM_UNROLL(MD1)
862 for (int dz = 0; dz < D1D; dz++)
863 {
864 u[dz] = 0;
865 }
866 MFEM_UNROLL(MD1)
867 for (int dx = 0; dx < D1D; ++dx)
868 {
869 MFEM_UNROLL(MD1)
870 for (int dz = 0; dz < D1D; ++dz)
871 {
872 u[dz] += X[dz][dy][dx] * B[qx][dx];
873 }
874 }
875 MFEM_UNROLL(MD1)
876 for (int dz = 0; dz < D1D; ++dz)
877 {
878 DDQ[dz][dy][qx] = u[dz];
879 }
880 }
881 }
882 MFEM_SYNC_THREAD;
883 MFEM_FOREACH_THREAD(qy,y,Q1D)
884 {
885 MFEM_FOREACH_THREAD(qx,x,Q1D)
886 {
887 real_t u[D1D];
888 MFEM_UNROLL(MD1)
889 for (int dz = 0; dz < D1D; dz++)
890 {
891 u[dz] = 0;
892 }
893 MFEM_UNROLL(MD1)
894 for (int dy = 0; dy < D1D; ++dy)
895 {
896 MFEM_UNROLL(MD1)
897 for (int dz = 0; dz < D1D; dz++)
898 {
899 u[dz] += DDQ[dz][dy][qx] * B[qy][dy];
900 }
901 }
902 MFEM_UNROLL(MD1)
903 for (int dz = 0; dz < D1D; dz++)
904 {
905 DQQ[dz][qy][qx] = u[dz];
906 }
907 }
908 }
909 MFEM_SYNC_THREAD;
910 MFEM_FOREACH_THREAD(qy,y,Q1D)
911 {
912 MFEM_FOREACH_THREAD(qx,x,Q1D)
913 {
914 real_t u[Q1D];
915 MFEM_UNROLL(MQ1)
916 for (int qz = 0; qz < Q1D; qz++)
917 {
918 u[qz] = 0;
919 }
920 MFEM_UNROLL(MD1)
921 for (int dz = 0; dz < D1D; ++dz)
922 {
923 MFEM_UNROLL(MQ1)
924 for (int qz = 0; qz < Q1D; qz++)
925 {
926 u[qz] += DQQ[dz][qy][qx] * B[qz][dz];
927 }
928 }
929 MFEM_UNROLL(MQ1)
930 for (int qz = 0; qz < Q1D; qz++)
931 {
932 QQQ[qz][qy][qx] = u[qz] * d(qx,qy,qz,e);
933 }
934 }
935 }
936 MFEM_SYNC_THREAD;
937 MFEM_FOREACH_THREAD(di,y,D1D)
938 {
939 MFEM_FOREACH_THREAD(q,x,Q1D)
940 {
941 Bt[di][q] = b(q,di);
942 }
943 }
944 MFEM_SYNC_THREAD;
945 MFEM_FOREACH_THREAD(qy,y,Q1D)
946 {
947 MFEM_FOREACH_THREAD(dx,x,D1D)
948 {
949 real_t u[Q1D];
950 MFEM_UNROLL(MQ1)
951 for (int qz = 0; qz < Q1D; ++qz)
952 {
953 u[qz] = 0;
954 }
955 MFEM_UNROLL(MQ1)
956 for (int qx = 0; qx < Q1D; ++qx)
957 {
958 MFEM_UNROLL(MQ1)
959 for (int qz = 0; qz < Q1D; ++qz)
960 {
961 u[qz] += QQQ[qz][qy][qx] * Bt[dx][qx];
962 }
963 }
964 MFEM_UNROLL(MQ1)
965 for (int qz = 0; qz < Q1D; ++qz)
966 {
967 QQD[qz][qy][dx] = u[qz];
968 }
969 }
970 }
971 MFEM_SYNC_THREAD;
972 MFEM_FOREACH_THREAD(dy,y,D1D)
973 {
974 MFEM_FOREACH_THREAD(dx,x,D1D)
975 {
976 real_t u[Q1D];
977 MFEM_UNROLL(MQ1)
978 for (int qz = 0; qz < Q1D; ++qz)
979 {
980 u[qz] = 0;
981 }
982 MFEM_UNROLL(MQ1)
983 for (int qy = 0; qy < Q1D; ++qy)
984 {
985 MFEM_UNROLL(MQ1)
986 for (int qz = 0; qz < Q1D; ++qz)
987 {
988 u[qz] += QQD[qz][qy][dx] * Bt[dy][qy];
989 }
990 }
991 MFEM_UNROLL(MQ1)
992 for (int qz = 0; qz < Q1D; ++qz)
993 {
994 QDD[qz][dy][dx] = u[qz];
995 }
996 }
997 }
998 MFEM_SYNC_THREAD;
999 MFEM_FOREACH_THREAD(dy,y,D1D)
1000 {
1001 MFEM_FOREACH_THREAD(dx,x,D1D)
1002 {
1003 real_t u[D1D];
1004 MFEM_UNROLL(MD1)
1005 for (int dz = 0; dz < D1D; ++dz)
1006 {
1007 u[dz] = 0;
1008 }
1009 MFEM_UNROLL(MQ1)
1010 for (int qz = 0; qz < Q1D; ++qz)
1011 {
1012 MFEM_UNROLL(MD1)
1013 for (int dz = 0; dz < D1D; ++dz)
1014 {
1015 u[dz] += QDD[qz][dy][dx] * Bt[dz][qz];
1016 }
1017 }
1018 MFEM_UNROLL(MD1)
1019 for (int dz = 0; dz < D1D; ++dz)
1020 {
1021 if (ACCUMULATE)
1022 {
1023 y(dx,dy,dz,e) += u[dz];
1024 }
1025 else
1026 {
1027 y(dx,dy,dz,e) = u[dz];
1028 }
1029 }
1030 }
1031 }
1032 MFEM_SYNC_THREAD;
1033}
1034
1035// PA Mass Apply 2D kernel
1036template<int T_D1D = 0, int T_Q1D = 0>
1037inline void PAMassApply2D(const int NE,
1038 const Array<real_t> &b_,
1039 const Array<real_t> &bt_,
1040 const Vector &d_,
1041 const Vector &x_,
1042 Vector &y_,
1043 const int d1d = 0,
1044 const int q1d = 0)
1045{
1046 MFEM_VERIFY(T_D1D ? T_D1D : d1d <= DeviceDofQuadLimits::Get().MAX_D1D, "");
1047 MFEM_VERIFY(T_Q1D ? T_Q1D : q1d <= DeviceDofQuadLimits::Get().MAX_Q1D, "");
1048
1049 const auto B = b_.Read();
1050 const auto Bt = bt_.Read();
1051 const auto D = d_.Read();
1052 const auto X = x_.Read();
1053 auto Y = y_.ReadWrite();
1054
1055 mfem::forall(NE, [=] MFEM_HOST_DEVICE (int e)
1056 {
1057 internal::PAMassApply2D_Element(e, NE, B, Bt, D, X, Y, d1d, q1d);
1058 });
1059}
1060
1061// Shared memory PA Mass Apply 2D kernel
1062template<int T_D1D = 0, int T_Q1D = 0>
1063inline void SmemPAMassApply2D(const int NE,
1064 const Array<real_t> &b_,
1065 const Array<real_t> &bt_,
1066 const Vector &d_,
1067 const Vector &x_,
1068 Vector &y_,
1069 const int d1d = 0,
1070 const int q1d = 0)
1071{
1072 MFEM_CONTRACT_VAR(bt_);
1073 static constexpr int T_NBZ = mass::NBZ(T_D1D);
1074 static constexpr int NBZ = T_NBZ ? T_NBZ : 1;
1075 const int D1D = T_D1D ? T_D1D : d1d;
1076 const int Q1D = T_Q1D ? T_Q1D : q1d;
1077 const int max_q1d = T_Q1D ? T_Q1D : DeviceDofQuadLimits::Get().MAX_Q1D;
1078 const int max_d1d = T_D1D ? T_D1D : DeviceDofQuadLimits::Get().MAX_D1D;
1079 MFEM_VERIFY(D1D <= max_d1d, "");
1080 MFEM_VERIFY(Q1D <= max_q1d, "");
1081 const auto b = b_.Read();
1082 const auto D = d_.Read();
1083 const auto x = x_.Read();
1084 auto Y = y_.ReadWrite();
1085 mfem::forall_2D_batch(NE, Q1D, Q1D, NBZ, [=] MFEM_HOST_DEVICE (int e)
1086 {
1087 internal::SmemPAMassApply2D_Element<T_D1D,T_Q1D,T_NBZ>(
1088 e, NE, b, D, x, Y, d1d, q1d);
1089 });
1090}
1091
1092// PA Mass Apply 3D kernel
1093template<int T_D1D = 0, int T_Q1D = 0>
1094inline void PAMassApply3D(const int NE,
1095 const Array<real_t> &b_,
1096 const Array<real_t> &bt_,
1097 const Vector &d_,
1098 const Vector &x_,
1099 Vector &y_,
1100 const int d1d = 0,
1101 const int q1d = 0)
1102{
1103 MFEM_VERIFY(T_D1D ? T_D1D : d1d <= DeviceDofQuadLimits::Get().MAX_D1D, "");
1104 MFEM_VERIFY(T_Q1D ? T_Q1D : q1d <= DeviceDofQuadLimits::Get().MAX_Q1D, "");
1105
1106 const auto B = b_.Read();
1107 const auto Bt = bt_.Read();
1108 const auto D = d_.Read();
1109 const auto X = x_.Read();
1110 auto Y = y_.ReadWrite();
1111
1112 mfem::forall(NE, [=] MFEM_HOST_DEVICE (int e)
1113 {
1114 internal::PAMassApply3D_Element(e, NE, B, Bt, D, X, Y, d1d, q1d);
1115 });
1116}
1117
1118// Shared memory PA Mass Apply 2D kernel
1119template<int T_D1D = 0, int T_Q1D = 0>
1120inline void SmemPAMassApply3D(const int NE,
1121 const Array<real_t> &b_,
1122 const Array<real_t> &bt_,
1123 const Vector &d_,
1124 const Vector &x_,
1125 Vector &y_,
1126 const int d1d = 0,
1127 const int q1d = 0)
1128{
1129 MFEM_CONTRACT_VAR(bt_);
1130 const int D1D = T_D1D ? T_D1D : d1d;
1131 const int Q1D = T_Q1D ? T_Q1D : q1d;
1132 const int max_q1d = T_Q1D ? T_Q1D : DeviceDofQuadLimits::Get().MAX_Q1D;
1133 const int max_d1d = T_D1D ? T_D1D : DeviceDofQuadLimits::Get().MAX_D1D;
1134 MFEM_VERIFY(D1D <= max_d1d, "");
1135 MFEM_VERIFY(Q1D <= max_q1d, "");
1136 auto b = b_.Read();
1137 auto d = d_.Read();
1138 auto x = x_.Read();
1139 auto y = y_.ReadWrite();
1140 mfem::forall_2D(NE, Q1D, Q1D, [=] MFEM_HOST_DEVICE (int e)
1141 {
1142 internal::SmemPAMassApply3D_Element<T_D1D,T_Q1D>(e, NE, b, d, x, y, d1d, q1d);
1143 });
1144}
1145
1146template<int T_D1D = 0, int T_Q1D = 0>
1147inline void EAMassAssemble1D(const int NE,
1148 const Array<real_t> &basis,
1149 const Vector &padata,
1150 Vector &eadata,
1151 const bool add,
1152 const int d1d = 0,
1153 const int q1d = 0)
1154{
1155 const int D1D = T_D1D ? T_D1D : d1d;
1156 const int Q1D = T_Q1D ? T_Q1D : q1d;
1157 MFEM_VERIFY(D1D <= DeviceDofQuadLimits::Get().MAX_D1D, "");
1158 MFEM_VERIFY(Q1D <= DeviceDofQuadLimits::Get().MAX_Q1D, "");
1159 auto B = Reshape(basis.Read(), Q1D, D1D);
1160 auto D = Reshape(padata.Read(), Q1D, NE);
1161 auto M = Reshape(add ? eadata.ReadWrite() : eadata.Write(), D1D, D1D, NE);
1162 mfem::forall_2D(NE, D1D, D1D, [=] MFEM_HOST_DEVICE (int e)
1163 {
1164 const int D1D = T_D1D ? T_D1D : d1d;
1165 const int Q1D = T_Q1D ? T_Q1D : q1d;
1166 constexpr int MQ1 = T_Q1D ? T_Q1D : DofQuadLimits::MAX_Q1D;
1167 MFEM_FOREACH_THREAD(i1,x,D1D)
1168 {
1169 real_t r_Bi[MQ1];
1170 for (int q = 0; q < Q1D; q++) { r_Bi[q] = B(q,i1); }
1171 MFEM_FOREACH_THREAD(j1,y,D1D)
1172 {
1173 real_t r_Bj[MQ1];
1174 for (int q = 0; q < Q1D; q++) { r_Bj[q] = B(q,j1); }
1175
1176 real_t val = 0.0;
1177 for (int k1 = 0; k1 < Q1D; ++k1)
1178 {
1179 val += r_Bi[k1] * r_Bj[k1] * D(k1, e);
1180 }
1181 if (add)
1182 {
1183 M(i1, j1, e) += val;
1184 }
1185 else
1186 {
1187 M(i1, j1, e) = val;
1188 }
1189 }
1190 }
1191 });
1192}
1193
1194template<int T_D1D = 0, int T_Q1D = 0>
1195inline void EAMassAssemble2D(const int NE,
1196 const Array<real_t> &basis,
1197 const Vector &padata,
1198 Vector &eadata,
1199 const bool add,
1200 const int d1d = 0,
1201 const int q1d = 0)
1202{
1203 const int D1D = T_D1D ? T_D1D : d1d;
1204 const int Q1D = T_Q1D ? T_Q1D : q1d;
1205 MFEM_VERIFY(D1D <= DeviceDofQuadLimits::Get().MAX_D1D, "");
1206 MFEM_VERIFY(Q1D <= DeviceDofQuadLimits::Get().MAX_Q1D, "");
1207 auto B = Reshape(basis.Read(), Q1D, D1D);
1208 auto D = Reshape(padata.Read(), Q1D, Q1D, NE);
1209 auto M = Reshape(add ? eadata.ReadWrite() : eadata.Write(), D1D, D1D, D1D, D1D,
1210 NE);
1211 mfem::forall_2D(NE, D1D, D1D, [=] MFEM_HOST_DEVICE (int e)
1212 {
1213 const int D1D = T_D1D ? T_D1D : d1d;
1214 const int Q1D = T_Q1D ? T_Q1D : q1d;
1215 constexpr int MD1 = T_D1D ? T_D1D : DofQuadLimits::MAX_D1D;
1216 constexpr int MQ1 = T_Q1D ? T_Q1D : DofQuadLimits::MAX_Q1D;
1217 real_t r_B[MQ1][MD1];
1218 for (int d = 0; d < D1D; d++)
1219 {
1220 for (int q = 0; q < Q1D; q++)
1221 {
1222 r_B[q][d] = B(q,d);
1223 }
1224 }
1225 MFEM_SHARED real_t s_D[MQ1][MQ1];
1226 MFEM_FOREACH_THREAD(k1,x,Q1D)
1227 {
1228 MFEM_FOREACH_THREAD(k2,y,Q1D)
1229 {
1230 s_D[k1][k2] = D(k1,k2,e);
1231 }
1232 }
1233 MFEM_SYNC_THREAD;
1234 MFEM_FOREACH_THREAD(i1,x,D1D)
1235 {
1236 MFEM_FOREACH_THREAD(i2,y,D1D)
1237 {
1238 for (int j1 = 0; j1 < D1D; ++j1)
1239 {
1240 for (int j2 = 0; j2 < D1D; ++j2)
1241 {
1242 real_t val = 0.0;
1243 for (int k1 = 0; k1 < Q1D; ++k1)
1244 {
1245 for (int k2 = 0; k2 < Q1D; ++k2)
1246 {
1247 val += r_B[k1][i1] * r_B[k1][j1]
1248 * r_B[k2][i2] * r_B[k2][j2]
1249 * s_D[k1][k2];
1250 }
1251 }
1252 if (add)
1253 {
1254 M(i1, i2, j1, j2, e) += val;
1255 }
1256 else
1257 {
1258 M(i1, i2, j1, j2, e) = val;
1259 }
1260 }
1261 }
1262 }
1263 }
1264 });
1265}
1266
1267template<int T_D1D = 0, int T_Q1D = 0>
1268inline void EAMassAssemble3D(const int NE,
1269 const Array<real_t> &basis,
1270 const Vector &padata,
1271 Vector &eadata,
1272 const bool add,
1273 const int d1d = 0,
1274 const int q1d = 0)
1275{
1276 const int D1D = T_D1D ? T_D1D : d1d;
1277 const int Q1D = T_Q1D ? T_Q1D : q1d;
1278 MFEM_VERIFY(D1D <= DeviceDofQuadLimits::Get().MAX_D1D, "");
1279 MFEM_VERIFY(Q1D <= DeviceDofQuadLimits::Get().MAX_Q1D, "");
1280 auto B = Reshape(basis.Read(), Q1D, D1D);
1281 auto D = Reshape(padata.Read(), Q1D, Q1D, Q1D, NE);
1282 auto M = Reshape(add ? eadata.ReadWrite() : eadata.Write(), D1D, D1D, D1D, D1D,
1283 D1D, D1D, NE);
1284 mfem::forall_3D(NE, D1D, D1D, D1D, [=] MFEM_HOST_DEVICE (int e)
1285 {
1286 const int D1D = T_D1D ? T_D1D : d1d;
1287 const int Q1D = T_Q1D ? T_Q1D : q1d;
1288 constexpr int MD1 = T_D1D ? T_D1D : DofQuadLimits::MAX_D1D;
1289 constexpr int MQ1 = T_Q1D ? T_Q1D : DofQuadLimits::MAX_Q1D;
1290 constexpr int DQ = T_D1D * T_Q1D;
1291
1292 // For quadratic and lower it's better to use registers but for higher-order you start to
1293 // spill and it's better to use shared memory
1294 constexpr bool USE_REG = DQ != 0 && DQ <= 12;
1295 constexpr int MD1r = USE_REG ? MD1 : 1;
1296 constexpr int MQ1r = USE_REG ? MQ1 : 1;
1297 constexpr int MD1s = USE_REG ? 1 : MD1;
1298 constexpr int MQ1s = USE_REG ? 1 : MQ1;
1299
1300 MFEM_SHARED real_t s_B[MQ1s][MD1s];
1301 real_t r_B[MQ1r][MD1r];
1302 real_t (*l_B)[MD1] = nullptr;
1303 if (USE_REG)
1304 {
1305 for (int d = 0; d < D1D; d++)
1306 {
1307 for (int q = 0; q < Q1D; q++)
1308 {
1309 r_B[q][d] = B(q,d);
1310 }
1311 }
1312 l_B = (real_t (*)[MD1])r_B;
1313 }
1314 else
1315 {
1316 if (MFEM_THREAD_ID(z) == 0)
1317 {
1318 MFEM_FOREACH_THREAD(d,x,D1D)
1319 {
1320 MFEM_FOREACH_THREAD(q,y,Q1D)
1321 {
1322 s_B[q][d] = B(q,d);
1323 }
1324 }
1325 }
1326 l_B = (real_t (*)[MD1])s_B;
1327 }
1328
1329 MFEM_SHARED real_t s_D[MQ1][MQ1][MQ1];
1330 MFEM_FOREACH_THREAD(k1,x,Q1D)
1331 {
1332 MFEM_FOREACH_THREAD(k2,y,Q1D)
1333 {
1334 MFEM_FOREACH_THREAD(k3,z,Q1D)
1335 {
1336 s_D[k1][k2][k3] = D(k1,k2,k3,e);
1337 }
1338 }
1339 }
1340 MFEM_SYNC_THREAD;
1341 MFEM_FOREACH_THREAD(i1,x,D1D)
1342 {
1343 MFEM_FOREACH_THREAD(i2,y,D1D)
1344 {
1345 MFEM_FOREACH_THREAD(i3,z,D1D)
1346 {
1347 for (int j1 = 0; j1 < D1D; ++j1)
1348 {
1349 for (int j2 = 0; j2 < D1D; ++j2)
1350 {
1351 for (int j3 = 0; j3 < D1D; ++j3)
1352 {
1353 real_t val = 0.0;
1354 for (int k1 = 0; k1 < Q1D; ++k1)
1355 {
1356 for (int k2 = 0; k2 < Q1D; ++k2)
1357 {
1358 for (int k3 = 0; k3 < Q1D; ++k3)
1359 {
1360 val += l_B[k1][i1] * l_B[k1][j1]
1361 * l_B[k2][i2] * l_B[k2][j2]
1362 * l_B[k3][i3] * l_B[k3][j3]
1363 * s_D[k1][k2][k3];
1364 }
1365 }
1366 }
1367 if (add)
1368 {
1369 M(i1, i2, i3, j1, j2, j3, e) += val;
1370 }
1371 else
1372 {
1373 M(i1, i2, i3, j1, j2, j3, e) = val;
1374 }
1375 }
1376 }
1377 }
1378 }
1379 }
1380 }
1381 });
1382}
1383
1384} // namespace internal
1385
1386namespace
1387{
1388using ApplyKernelType = MassIntegrator::ApplyKernelType;
1389using DiagonalKernelType = MassIntegrator::DiagonalKernelType;
1390}
1391
1392template<int DIM, int T_D1D, int T_Q1D>
1393ApplyKernelType MassIntegrator::ApplyPAKernels::Kernel()
1394{
1395 if constexpr (DIM == 1) { return internal::PAMassApply1D; }
1396 else if constexpr (DIM == 2) { return internal::SmemPAMassApply2D<T_D1D,T_Q1D>; }
1397 else if constexpr (DIM == 3) { return internal::SmemPAMassApply3D<T_D1D, T_Q1D>; }
1398 MFEM_ABORT("");
1399}
1400
1401inline ApplyKernelType MassIntegrator::ApplyPAKernels::Fallback(
1402 int DIM, int, int)
1403{
1404 if (DIM == 1) { return internal::PAMassApply1D; }
1405 else if (DIM == 2) { return internal::PAMassApply2D; }
1406 else if (DIM == 3) { return internal::PAMassApply3D; }
1407 else { MFEM_ABORT(""); }
1408}
1409
1410template<int DIM, int T_D1D, int T_Q1D>
1411DiagonalKernelType MassIntegrator::DiagonalPAKernels::Kernel()
1412{
1413 if constexpr (DIM == 1) { return internal::PAMassAssembleDiagonal1D; }
1414 else if constexpr (DIM == 2) { return internal::SmemPAMassAssembleDiagonal2D<T_D1D,T_Q1D>; }
1415 else if constexpr (DIM == 3) { return internal::SmemPAMassAssembleDiagonal3D<T_D1D, T_Q1D>; }
1416 MFEM_ABORT("");
1417}
1418
1419inline DiagonalKernelType MassIntegrator::DiagonalPAKernels::Fallback(
1420 int DIM, int, int)
1421{
1422 if (DIM == 1) { return internal::PAMassAssembleDiagonal1D; }
1423 else if (DIM == 2) { return internal::PAMassAssembleDiagonal2D; }
1424 else if (DIM == 3) { return internal::PAMassAssembleDiagonal3D; }
1425 else { MFEM_ABORT(""); }
1426}
1427/// \endcond DO_NOT_DOCUMENT
1428} // namespace mfem
1429
1430#endif
void(*)(const int, const Array< real_t > &, const Array< real_t > &, const Vector &, const Vector &, Vector &, const int, const int) ApplyKernelType
void(*)(const int, const Array< real_t > &, const Vector &, Vector &, const int, const int) DiagonalKernelType
real_t b
Definition lissajous.cpp:42
constexpr int DIM
mfem::real_t real_t
DeviceTensor< 3, real_t > DeviceCube
Definition dtensor.hpp:153
real_t u(const Vector &xvec)
Definition lor_mms.hpp:22
DeviceTensor< 3, const real_t > ConstDeviceCube
Definition dtensor.hpp:154
T * Write(Memory< T > &mem, int size, bool on_dev=true)
Get a pointer for write access to mem with the mfem::Device's DeviceMemoryClass, if on_dev = true,...
Definition device.hpp:365
void add(const Vector &v1, const Vector &v2, Vector &v)
Definition vector.cpp:414
MFEM_HOST_DEVICE DeviceTensor< sizeof...(Dims), T > Reshape(T *ptr, Dims... dims)
Wrap a pointer as a DeviceTensor with automatically deduced template parameters.
Definition dtensor.hpp:138
void forall_2D_batch(int N, int X, int Y, int BZ, lambda &&body)
Definition forall.hpp:931
void forall_2D(int N, int X, int Y, lambda &&body)
Definition forall.hpp:925
void forall_3D(int N, int X, int Y, int Z, lambda &&body)
Definition forall.hpp:937
float real_t
Definition config.hpp:46
DeviceTensor< 2, const real_t > ConstDeviceMatrix
Definition dtensor.hpp:151
void forall(int N, lambda &&body)
Definition forall.hpp:839
DeviceTensor< 2, real_t > DeviceMatrix
Definition dtensor.hpp:150
real_t p(const Vector &x, real_t t)
static const DeviceDofQuadLimits & Get()
Return a const reference to the DeviceDofQuadLimits singleton.
Definition forall.hpp:128
int MAX_D1D
Maximum number of 1D nodal points.
Definition forall.hpp:118
int MAX_Q1D
Maximum number of 1D quadrature points.
Definition forall.hpp:119