MFEM v4.8.0
Finite element discretization library
Loading...
Searching...
No Matches
bilininteg_diffusion_kernels.hpp
Go to the documentation of this file.
1// Copyright (c) 2010-2025, Lawrence Livermore National Security, LLC. Produced
2// at the Lawrence Livermore National Laboratory. All Rights reserved. See files
3// LICENSE and NOTICE for details. LLNL-CODE-806117.
4//
5// This file is part of the MFEM library. For more information and source code
6// availability visit https://mfem.org.
7//
8// MFEM is free software; you can redistribute it and/or modify it under the
9// terms of the BSD-3 license. We welcome feedback and contributions, see file
10// CONTRIBUTING.md for details.
11
12#ifndef MFEM_BILININTEG_DIFFUSION_KERNELS_HPP
13#define MFEM_BILININTEG_DIFFUSION_KERNELS_HPP
14
21#include "../bilininteg.hpp"
22
23namespace mfem
24{
25
26/// \cond DO_NOT_DOCUMENT
27namespace internal
28{
29
30void PADiffusionSetup(const int dim,
31 const int sdim,
32 const int D1D,
33 const int Q1D,
34 const int coeffDim,
35 const int NE,
36 const Array<real_t> &W,
37 const Vector &J,
38 const Vector &C,
39 Vector &D);
40
41// PA Diffusion Assemble 2D f
42template<int T_SDIM>
43void PADiffusionSetup2D(const int Q1D,
44 const int coeffDim,
45 const int NE,
46 const Array<real_t> &w,
47 const Vector &j,
48 const Vector &c,
49 Vector &d);
50
51// PA Diffusion Assemble 3D kernel
52void PADiffusionSetup3D(const int Q1D,
53 const int coeffDim,
54 const int NE,
55 const Array<real_t> &w,
56 const Vector &j,
57 const Vector &c,
58 Vector &d);
59
60#ifdef MFEM_USE_OCCA
61// OCCA 2D Assemble kernel
62void OccaPADiffusionSetup2D(const int D1D,
63 const int Q1D,
64 const int NE,
65 const Array<real_t> &W,
66 const Vector &J,
67 const Vector &C,
68 Vector &op);
69
70// OCCA 3D Assemble kernel
71void OccaPADiffusionSetup3D(const int D1D,
72 const int Q1D,
73 const int NE,
74 const Array<real_t> &W,
75 const Vector &J,
76 const Vector &C,
77 Vector &op);
78#endif // MFEM_USE_OCCA
79
80void PADiffusionAssembleDiagonal(const int dim,
81 const int D1D,
82 const int Q1D,
83 const int NE,
84 const bool symm,
85 const Array<real_t> &B,
86 const Array<real_t> &G,
87 const Vector &D,
88 Vector &Y);
89
90// PA Diffusion Diagonal 2D kernel
91template<int T_D1D = 0, int T_Q1D = 0>
92inline void PADiffusionDiagonal2D(const int NE,
93 const bool symmetric,
94 const Array<real_t> &b,
95 const Array<real_t> &g,
96 const Vector &d,
97 Vector &y,
98 const int d1d = 0,
99 const int q1d = 0)
100{
101 const int D1D = T_D1D ? T_D1D : d1d;
102 const int Q1D = T_Q1D ? T_Q1D : q1d;
103 MFEM_VERIFY(D1D <= DeviceDofQuadLimits::Get().MAX_D1D, "");
104 MFEM_VERIFY(Q1D <= DeviceDofQuadLimits::Get().MAX_Q1D, "");
105 auto B = Reshape(b.Read(), Q1D, D1D);
106 auto G = Reshape(g.Read(), Q1D, D1D);
107 // note the different shape for D, if this is a symmetric matrix we only
108 // store necessary entries
109 auto D = Reshape(d.Read(), Q1D*Q1D, symmetric ? 3 : 4, NE);
110 auto Y = Reshape(y.ReadWrite(), D1D, D1D, NE);
111 mfem::forall(NE, [=] MFEM_HOST_DEVICE (int e)
112 {
113 const int D1D = T_D1D ? T_D1D : d1d;
114 const int Q1D = T_Q1D ? T_Q1D : q1d;
115 constexpr int MD1 = T_D1D ? T_D1D : DofQuadLimits::MAX_D1D;
116 constexpr int MQ1 = T_Q1D ? T_Q1D : DofQuadLimits::MAX_Q1D;
117 // gradphi \cdot Q \gradphi has four terms
118 real_t QD0[MQ1][MD1];
119 real_t QD1[MQ1][MD1];
120 real_t QD2[MQ1][MD1];
121 for (int qx = 0; qx < Q1D; ++qx)
122 {
123 for (int dy = 0; dy < D1D; ++dy)
124 {
125 QD0[qx][dy] = 0.0;
126 QD1[qx][dy] = 0.0;
127 QD2[qx][dy] = 0.0;
128 for (int qy = 0; qy < Q1D; ++qy)
129 {
130 const int q = qx + qy * Q1D;
131 const real_t D00 = D(q,0,e);
132 const real_t D10 = D(q,1,e);
133 const real_t D01 = symmetric ? D10 : D(q,2,e);
134 const real_t D11 = symmetric ? D(q,2,e) : D(q,3,e);
135 QD0[qx][dy] += B(qy, dy) * B(qy, dy) * D00;
136 QD1[qx][dy] += B(qy, dy) * G(qy, dy) * (D01 + D10);
137 QD2[qx][dy] += G(qy, dy) * G(qy, dy) * D11;
138 }
139 }
140 }
141 for (int dy = 0; dy < D1D; ++dy)
142 {
143 for (int dx = 0; dx < D1D; ++dx)
144 {
145 for (int qx = 0; qx < Q1D; ++qx)
146 {
147 Y(dx,dy,e) += G(qx, dx) * G(qx, dx) * QD0[qx][dy];
148 Y(dx,dy,e) += G(qx, dx) * B(qx, dx) * QD1[qx][dy];
149 Y(dx,dy,e) += B(qx, dx) * B(qx, dx) * QD2[qx][dy];
150 }
151 }
152 }
153 });
154}
155
156namespace diffusion
157{
158constexpr int ipow(int x, int p) { return p == 0 ? 1 : x*ipow(x, p-1); }
159constexpr int D11(int x) { return (11 - x)/2; }
160constexpr int D10(int x) { return (10 - x)/2; }
161constexpr int NBZApply(int D1D)
162{
163 return ipow(2, D11(D1D) >= 0 ? D11(D1D) : 0);
164}
165constexpr int NBZDiagonal(int D1D)
166{
167 return ipow(2, D10(D1D) >= 0 ? D10(D1D) : 0);
168}
169}
170
171// Shared memory PA Diffusion Diagonal 2D kernel
172template<int T_D1D = 0, int T_Q1D = 0>
173inline void SmemPADiffusionDiagonal2D(const int NE,
174 const bool symmetric,
175 const Array<real_t> &b_,
176 const Array<real_t> &g_,
177 const Vector &d_,
178 Vector &y_,
179 const int d1d = 0,
180 const int q1d = 0)
181{
182 static constexpr int T_NBZ = diffusion::NBZDiagonal(T_D1D);
183 static constexpr int NBZ = T_NBZ ? T_NBZ : 1;
184 const int D1D = T_D1D ? T_D1D : d1d;
185 const int Q1D = T_Q1D ? T_Q1D : q1d;
186 const int max_q1d = T_Q1D ? T_Q1D : DeviceDofQuadLimits::Get().MAX_Q1D;
187 const int max_d1d = T_D1D ? T_D1D : DeviceDofQuadLimits::Get().MAX_D1D;
188 MFEM_VERIFY(D1D <= max_d1d, "");
189 MFEM_VERIFY(Q1D <= max_q1d, "");
190 auto b = Reshape(b_.Read(), Q1D, D1D);
191 auto g = Reshape(g_.Read(), Q1D, D1D);
192 auto D = Reshape(d_.Read(), Q1D*Q1D, symmetric ? 3 : 4, NE);
193 auto Y = Reshape(y_.ReadWrite(), D1D, D1D, NE);
194 mfem::forall_2D_batch(NE, Q1D, Q1D, NBZ, [=] MFEM_HOST_DEVICE (int e)
195 {
196 const int tidz = MFEM_THREAD_ID(z);
197 const int D1D = T_D1D ? T_D1D : d1d;
198 const int Q1D = T_Q1D ? T_Q1D : q1d;
199 constexpr int MQ1 = T_Q1D ? T_Q1D : DofQuadLimits::MAX_Q1D;
200 constexpr int MD1 = T_D1D ? T_D1D : DofQuadLimits::MAX_D1D;
201 MFEM_SHARED real_t BG[2][MQ1*MD1];
202 real_t (*B)[MD1] = (real_t (*)[MD1]) (BG+0);
203 real_t (*G)[MD1] = (real_t (*)[MD1]) (BG+1);
204 MFEM_SHARED real_t QD[3][NBZ][MD1][MQ1];
205 real_t (*QD0)[MD1] = (real_t (*)[MD1])(QD[0] + tidz);
206 real_t (*QD1)[MD1] = (real_t (*)[MD1])(QD[1] + tidz);
207 real_t (*QD2)[MD1] = (real_t (*)[MD1])(QD[2] + tidz);
208 if (tidz == 0)
209 {
210 MFEM_FOREACH_THREAD(d,y,D1D)
211 {
212 MFEM_FOREACH_THREAD(q,x,Q1D)
213 {
214 B[q][d] = b(q,d);
215 G[q][d] = g(q,d);
216 }
217 }
218 }
219 MFEM_SYNC_THREAD;
220 MFEM_FOREACH_THREAD(qx,x,Q1D)
221 {
222 MFEM_FOREACH_THREAD(dy,y,D1D)
223 {
224 QD0[qx][dy] = 0.0;
225 QD1[qx][dy] = 0.0;
226 QD2[qx][dy] = 0.0;
227 for (int qy = 0; qy < Q1D; ++qy)
228 {
229 const int q = qx + qy * Q1D;
230 const real_t D00 = D(q,0,e);
231 const real_t D10 = D(q,1,e);
232 const real_t D01 = symmetric ? D10 : D(q,2,e);
233 const real_t D11 = symmetric ? D(q,2,e) : D(q,3,e);
234 const real_t By = B[qy][dy];
235 const real_t Gy = G[qy][dy];
236 const real_t BBy = By * By;
237 const real_t BGy = By * Gy;
238 const real_t GGy = Gy * Gy;
239 QD0[qx][dy] += BBy * D00;
240 QD1[qx][dy] += BGy * (D01 + D10);
241 QD2[qx][dy] += GGy * D11;
242 }
243 }
244 }
245 MFEM_SYNC_THREAD;
246 MFEM_FOREACH_THREAD(dy,y,D1D)
247 {
248 MFEM_FOREACH_THREAD(dx,x,D1D)
249 {
250 for (int qx = 0; qx < Q1D; ++qx)
251 {
252 const real_t Bx = B[qx][dx];
253 const real_t Gx = G[qx][dx];
254 const real_t BBx = Bx * Bx;
255 const real_t BGx = Bx * Gx;
256 const real_t GGx = Gx * Gx;
257 Y(dx,dy,e) += GGx * QD0[qx][dy];
258 Y(dx,dy,e) += BGx * QD1[qx][dy];
259 Y(dx,dy,e) += BBx * QD2[qx][dy];
260 }
261 }
262 }
263 });
264}
265
266// PA Diffusion Diagonal 3D kernel
267template<int T_D1D = 0, int T_Q1D = 0>
268inline void PADiffusionDiagonal3D(const int NE,
269 const bool symmetric,
270 const Array<real_t> &b,
271 const Array<real_t> &g,
272 const Vector &d,
273 Vector &y,
274 const int d1d = 0,
275 const int q1d = 0)
276{
277 constexpr int DIM = 3;
278 const int D1D = T_D1D ? T_D1D : d1d;
279 const int Q1D = T_Q1D ? T_Q1D : q1d;
280 const int max_q1d = T_Q1D ? T_Q1D : DeviceDofQuadLimits::Get().MAX_Q1D;
281 const int max_d1d = T_D1D ? T_D1D : DeviceDofQuadLimits::Get().MAX_D1D;
282 MFEM_VERIFY(D1D <= max_d1d, "");
283 MFEM_VERIFY(Q1D <= max_q1d, "");
284 auto B = Reshape(b.Read(), Q1D, D1D);
285 auto G = Reshape(g.Read(), Q1D, D1D);
286 auto Q = Reshape(d.Read(), Q1D*Q1D*Q1D, symmetric ? 6 : 9, NE);
287 auto Y = Reshape(y.ReadWrite(), D1D, D1D, D1D, NE);
288 mfem::forall(NE, [=] MFEM_HOST_DEVICE (int e)
289 {
290 const int D1D = T_D1D ? T_D1D : d1d;
291 const int Q1D = T_Q1D ? T_Q1D : q1d;
292 constexpr int MD1 = T_D1D ? T_D1D : DofQuadLimits::MAX_D1D;
293 constexpr int MQ1 = T_Q1D ? T_Q1D : DofQuadLimits::MAX_Q1D;
294 real_t QQD[MQ1][MQ1][MD1];
295 real_t QDD[MQ1][MD1][MD1];
296 for (int i = 0; i < DIM; ++i)
297 {
298 for (int j = 0; j < DIM; ++j)
299 {
300 // first tensor contraction, along z direction
301 for (int qx = 0; qx < Q1D; ++qx)
302 {
303 for (int qy = 0; qy < Q1D; ++qy)
304 {
305 for (int dz = 0; dz < D1D; ++dz)
306 {
307 QQD[qx][qy][dz] = 0.0;
308 for (int qz = 0; qz < Q1D; ++qz)
309 {
310 const int q = qx + (qy + qz * Q1D) * Q1D;
311 const int ksym = j >= i ?
312 3 - (3-i)*(2-i)/2 + j:
313 3 - (3-j)*(2-j)/2 + i;
314 const int k = symmetric ? ksym : (i*DIM) + j;
315 const real_t O = Q(q,k,e);
316 const real_t Bz = B(qz,dz);
317 const real_t Gz = G(qz,dz);
318 const real_t L = i==2 ? Gz : Bz;
319 const real_t R = j==2 ? Gz : Bz;
320 QQD[qx][qy][dz] += L * O * R;
321 }
322 }
323 }
324 }
325 // second tensor contraction, along y direction
326 for (int qx = 0; qx < Q1D; ++qx)
327 {
328 for (int dz = 0; dz < D1D; ++dz)
329 {
330 for (int dy = 0; dy < D1D; ++dy)
331 {
332 QDD[qx][dy][dz] = 0.0;
333 for (int qy = 0; qy < Q1D; ++qy)
334 {
335 const real_t By = B(qy,dy);
336 const real_t Gy = G(qy,dy);
337 const real_t L = i==1 ? Gy : By;
338 const real_t R = j==1 ? Gy : By;
339 QDD[qx][dy][dz] += L * QQD[qx][qy][dz] * R;
340 }
341 }
342 }
343 }
344 // third tensor contraction, along x direction
345 for (int dz = 0; dz < D1D; ++dz)
346 {
347 for (int dy = 0; dy < D1D; ++dy)
348 {
349 for (int dx = 0; dx < D1D; ++dx)
350 {
351 for (int qx = 0; qx < Q1D; ++qx)
352 {
353 const real_t Bx = B(qx,dx);
354 const real_t Gx = G(qx,dx);
355 const real_t L = i==0 ? Gx : Bx;
356 const real_t R = j==0 ? Gx : Bx;
357 Y(dx, dy, dz, e) += L * QDD[qx][dy][dz] * R;
358 }
359 }
360 }
361 }
362 }
363 }
364 });
365}
366
367// Shared memory PA Diffusion Diagonal 3D kernel
368template<int T_D1D = 0, int T_Q1D = 0>
369inline void SmemPADiffusionDiagonal3D(const int NE,
370 const bool symmetric,
371 const Array<real_t> &b_,
372 const Array<real_t> &g_,
373 const Vector &d_,
374 Vector &y_,
375 const int d1d = 0,
376 const int q1d = 0)
377{
378 constexpr int DIM = 3;
379 const int D1D = T_D1D ? T_D1D : d1d;
380 const int Q1D = T_Q1D ? T_Q1D : q1d;
381 const int max_q1d = T_Q1D ? T_Q1D : DeviceDofQuadLimits::Get().MAX_Q1D;
382 const int max_d1d = T_D1D ? T_D1D : DeviceDofQuadLimits::Get().MAX_D1D;
383 MFEM_VERIFY(D1D <= max_d1d, "");
384 MFEM_VERIFY(Q1D <= max_q1d, "");
385 auto b = Reshape(b_.Read(), Q1D, D1D);
386 auto g = Reshape(g_.Read(), Q1D, D1D);
387 auto D = Reshape(d_.Read(), Q1D*Q1D*Q1D, symmetric ? 6 : 9, NE);
388 auto Y = Reshape(y_.ReadWrite(), D1D, D1D, D1D, NE);
389 mfem::forall_3D(NE, Q1D, Q1D, Q1D, [=] MFEM_HOST_DEVICE (int e)
390 {
391 const int tidz = MFEM_THREAD_ID(z);
392 const int D1D = T_D1D ? T_D1D : d1d;
393 const int Q1D = T_Q1D ? T_Q1D : q1d;
394 constexpr int MQ1 = T_Q1D ? T_Q1D : DofQuadLimits::MAX_Q1D;
395 constexpr int MD1 = T_D1D ? T_D1D : DofQuadLimits::MAX_D1D;
396 MFEM_SHARED real_t BG[2][MQ1*MD1];
397 real_t (*B)[MD1] = (real_t (*)[MD1]) (BG+0);
398 real_t (*G)[MD1] = (real_t (*)[MD1]) (BG+1);
399 MFEM_SHARED real_t QQD[MQ1][MQ1][MD1];
400 MFEM_SHARED real_t QDD[MQ1][MD1][MD1];
401 if (tidz == 0)
402 {
403 MFEM_FOREACH_THREAD(d,y,D1D)
404 {
405 MFEM_FOREACH_THREAD(q,x,Q1D)
406 {
407 B[q][d] = b(q,d);
408 G[q][d] = g(q,d);
409 }
410 }
411 }
412 MFEM_SYNC_THREAD;
413 for (int i = 0; i < DIM; ++i)
414 {
415 for (int j = 0; j < DIM; ++j)
416 {
417 // first tensor contraction, along z direction
418 MFEM_FOREACH_THREAD(qx,x,Q1D)
419 {
420 MFEM_FOREACH_THREAD(qy,y,Q1D)
421 {
422 MFEM_FOREACH_THREAD(dz,z,D1D)
423 {
424 QQD[qx][qy][dz] = 0.0;
425 for (int qz = 0; qz < Q1D; ++qz)
426 {
427 const int q = qx + (qy + qz * Q1D) * Q1D;
428 const int ksym = j >= i ?
429 3 - (3-i)*(2-i)/2 + j:
430 3 - (3-j)*(2-j)/2 + i;
431 const int k = symmetric ? ksym : (i*DIM) + j;
432 const real_t O = D(q,k,e);
433 const real_t Bz = B[qz][dz];
434 const real_t Gz = G[qz][dz];
435 const real_t L = i==2 ? Gz : Bz;
436 const real_t R = j==2 ? Gz : Bz;
437 QQD[qx][qy][dz] += L * O * R;
438 }
439 }
440 }
441 }
442 MFEM_SYNC_THREAD;
443 // second tensor contraction, along y direction
444 MFEM_FOREACH_THREAD(qx,x,Q1D)
445 {
446 MFEM_FOREACH_THREAD(dz,z,D1D)
447 {
448 MFEM_FOREACH_THREAD(dy,y,D1D)
449 {
450 QDD[qx][dy][dz] = 0.0;
451 for (int qy = 0; qy < Q1D; ++qy)
452 {
453 const real_t By = B[qy][dy];
454 const real_t Gy = G[qy][dy];
455 const real_t L = i==1 ? Gy : By;
456 const real_t R = j==1 ? Gy : By;
457 QDD[qx][dy][dz] += L * QQD[qx][qy][dz] * R;
458 }
459 }
460 }
461 }
462 MFEM_SYNC_THREAD;
463 // third tensor contraction, along x direction
464 MFEM_FOREACH_THREAD(dz,z,D1D)
465 {
466 MFEM_FOREACH_THREAD(dy,y,D1D)
467 {
468 MFEM_FOREACH_THREAD(dx,x,D1D)
469 {
470 for (int qx = 0; qx < Q1D; ++qx)
471 {
472 const real_t Bx = B[qx][dx];
473 const real_t Gx = G[qx][dx];
474 const real_t L = i==0 ? Gx : Bx;
475 const real_t R = j==0 ? Gx : Bx;
476 Y(dx, dy, dz, e) += L * QDD[qx][dy][dz] * R;
477 }
478 }
479 }
480 }
481 }
482 }
483 });
484}
485
486void PADiffusionApply(const int dim,
487 const int D1D,
488 const int Q1D,
489 const int NE,
490 const bool symm,
491 const Array<real_t> &B,
492 const Array<real_t> &G,
493 const Array<real_t> &Bt,
494 const Array<real_t> &Gt,
495 const Vector &D,
496 const Vector &X,
497 Vector &Y);
498
499#ifdef MFEM_USE_OCCA
500// OCCA PA Diffusion Apply 2D kernel
501void OccaPADiffusionApply2D(const int D1D,
502 const int Q1D,
503 const int NE,
504 const Array<real_t> &B,
505 const Array<real_t> &G,
506 const Array<real_t> &Bt,
507 const Array<real_t> &Gt,
508 const Vector &D,
509 const Vector &X,
510 Vector &Y);
511
512// OCCA PA Diffusion Apply 3D kernel
513void OccaPADiffusionApply3D(const int D1D,
514 const int Q1D,
515 const int NE,
516 const Array<real_t> &B,
517 const Array<real_t> &G,
518 const Array<real_t> &Bt,
519 const Array<real_t> &Gt,
520 const Vector &D,
521 const Vector &X,
522 Vector &Y);
523#endif // MFEM_USE_OCCA
524
525// PA Diffusion Apply 2D kernel
526template<int T_D1D = 0, int T_Q1D = 0>
527inline void PADiffusionApply2D(const int NE,
528 const bool symmetric,
529 const Array<real_t> &b_,
530 const Array<real_t> &g_,
531 const Array<real_t> &bt_,
532 const Array<real_t> &gt_,
533 const Vector &d_,
534 const Vector &x_,
535 Vector &y_,
536 const int d1d = 0,
537 const int q1d = 0)
538{
539 const int D1D = T_D1D ? T_D1D : d1d;
540 const int Q1D = T_Q1D ? T_Q1D : q1d;
541 MFEM_VERIFY(D1D <= DeviceDofQuadLimits::Get().MAX_D1D, "");
542 MFEM_VERIFY(Q1D <= DeviceDofQuadLimits::Get().MAX_Q1D, "");
543 auto B = Reshape(b_.Read(), Q1D, D1D);
544 auto G = Reshape(g_.Read(), Q1D, D1D);
545 auto Bt = Reshape(bt_.Read(), D1D, Q1D);
546 auto Gt = Reshape(gt_.Read(), D1D, Q1D);
547 auto D = Reshape(d_.Read(), Q1D*Q1D, symmetric ? 3 : 4, NE);
548 auto X = Reshape(x_.Read(), D1D, D1D, NE);
549 auto Y = Reshape(y_.ReadWrite(), D1D, D1D, NE);
550 mfem::forall(NE, [=] MFEM_HOST_DEVICE (int e)
551 {
552 const int D1D = T_D1D ? T_D1D : d1d;
553 const int Q1D = T_Q1D ? T_Q1D : q1d;
554 // the following variables are evaluated at compile time
555 constexpr int max_D1D = T_D1D ? T_D1D : DofQuadLimits::MAX_D1D;
556 constexpr int max_Q1D = T_Q1D ? T_Q1D : DofQuadLimits::MAX_Q1D;
557
558 real_t grad[max_Q1D][max_Q1D][2];
559 for (int qy = 0; qy < Q1D; ++qy)
560 {
561 for (int qx = 0; qx < Q1D; ++qx)
562 {
563 grad[qy][qx][0] = 0.0;
564 grad[qy][qx][1] = 0.0;
565 }
566 }
567 for (int dy = 0; dy < D1D; ++dy)
568 {
569 real_t gradX[max_Q1D][2];
570 for (int qx = 0; qx < Q1D; ++qx)
571 {
572 gradX[qx][0] = 0.0;
573 gradX[qx][1] = 0.0;
574 }
575 for (int dx = 0; dx < D1D; ++dx)
576 {
577 const real_t s = X(dx,dy,e);
578 for (int qx = 0; qx < Q1D; ++qx)
579 {
580 gradX[qx][0] += s * B(qx,dx);
581 gradX[qx][1] += s * G(qx,dx);
582 }
583 }
584 for (int qy = 0; qy < Q1D; ++qy)
585 {
586 const real_t wy = B(qy,dy);
587 const real_t wDy = G(qy,dy);
588 for (int qx = 0; qx < Q1D; ++qx)
589 {
590 grad[qy][qx][0] += gradX[qx][1] * wy;
591 grad[qy][qx][1] += gradX[qx][0] * wDy;
592 }
593 }
594 }
595 // Calculate Dxy, xDy in plane
596 for (int qy = 0; qy < Q1D; ++qy)
597 {
598 for (int qx = 0; qx < Q1D; ++qx)
599 {
600 const int q = qx + qy * Q1D;
601
602 const real_t O11 = D(q,0,e);
603 const real_t O21 = D(q,1,e);
604 const real_t O12 = symmetric ? O21 : D(q,2,e);
605 const real_t O22 = symmetric ? D(q,2,e) : D(q,3,e);
606
607 const real_t gradX = grad[qy][qx][0];
608 const real_t gradY = grad[qy][qx][1];
609
610 grad[qy][qx][0] = (O11 * gradX) + (O12 * gradY);
611 grad[qy][qx][1] = (O21 * gradX) + (O22 * gradY);
612 }
613 }
614 for (int qy = 0; qy < Q1D; ++qy)
615 {
616 real_t gradX[max_D1D][2];
617 for (int dx = 0; dx < D1D; ++dx)
618 {
619 gradX[dx][0] = 0;
620 gradX[dx][1] = 0;
621 }
622 for (int qx = 0; qx < Q1D; ++qx)
623 {
624 const real_t gX = grad[qy][qx][0];
625 const real_t gY = grad[qy][qx][1];
626 for (int dx = 0; dx < D1D; ++dx)
627 {
628 const real_t wx = Bt(dx,qx);
629 const real_t wDx = Gt(dx,qx);
630 gradX[dx][0] += gX * wDx;
631 gradX[dx][1] += gY * wx;
632 }
633 }
634 for (int dy = 0; dy < D1D; ++dy)
635 {
636 const real_t wy = Bt(dy,qy);
637 const real_t wDy = Gt(dy,qy);
638 for (int dx = 0; dx < D1D; ++dx)
639 {
640 Y(dx,dy,e) += ((gradX[dx][0] * wy) + (gradX[dx][1] * wDy));
641 }
642 }
643 }
644 });
645}
646
647// Shared memory PA Diffusion Apply 2D kernel
648template<int T_D1D = 0, int T_Q1D = 0>
649inline void SmemPADiffusionApply2D(const int NE,
650 const bool symmetric,
651 const Array<real_t> &b_,
652 const Array<real_t> &g_,
653 const Array<real_t> &bt_,
654 const Array<real_t> &gt_,
655 const Vector &d_,
656 const Vector &x_,
657 Vector &y_,
658 const int d1d = 0,
659 const int q1d = 0)
660{
661 static constexpr int T_NBZ = diffusion::NBZApply(T_D1D);
662 static constexpr int NBZ = T_NBZ ? T_NBZ : 1;
663 const int D1D = T_D1D ? T_D1D : d1d;
664 const int Q1D = T_Q1D ? T_Q1D : q1d;
665 const int max_q1d = T_Q1D ? T_Q1D : DeviceDofQuadLimits::Get().MAX_Q1D;
666 const int max_d1d = T_D1D ? T_D1D : DeviceDofQuadLimits::Get().MAX_D1D;
667 MFEM_VERIFY(D1D <= max_d1d, "");
668 MFEM_VERIFY(Q1D <= max_q1d, "");
669 auto b = Reshape(b_.Read(), Q1D, D1D);
670 auto g = Reshape(g_.Read(), Q1D, D1D);
671 auto D = Reshape(d_.Read(), Q1D*Q1D, symmetric ? 3 : 4, NE);
672 auto x = Reshape(x_.Read(), D1D, D1D, NE);
673 auto Y = Reshape(y_.ReadWrite(), D1D, D1D, NE);
674 mfem::forall_2D_batch(NE, Q1D, Q1D, NBZ, [=] MFEM_HOST_DEVICE(int e)
675 {
676 const int tidz = MFEM_THREAD_ID(z);
677 const int D1D = T_D1D ? T_D1D : d1d;
678 const int Q1D = T_Q1D ? T_Q1D : q1d;
679 constexpr int MQ1 = T_Q1D ? T_Q1D : DofQuadLimits::MAX_Q1D;
680 constexpr int MD1 = T_D1D ? T_D1D : DofQuadLimits::MAX_D1D;
681 MFEM_SHARED real_t sBG[2][MQ1*MD1];
682 real_t (*B)[MD1] = (real_t (*)[MD1]) (sBG+0);
683 real_t (*G)[MD1] = (real_t (*)[MD1]) (sBG+1);
684 real_t (*Bt)[MQ1] = (real_t (*)[MQ1]) (sBG+0);
685 real_t (*Gt)[MQ1] = (real_t (*)[MQ1]) (sBG+1);
686 MFEM_SHARED real_t Xz[NBZ][MD1][MD1];
687 MFEM_SHARED real_t GD[2][NBZ][MD1][MQ1];
688 MFEM_SHARED real_t GQ[2][NBZ][MD1][MQ1];
689 real_t (*X)[MD1] = (real_t (*)[MD1])(Xz + tidz);
690 real_t (*DQ0)[MD1] = (real_t (*)[MD1])(GD[0] + tidz);
691 real_t (*DQ1)[MD1] = (real_t (*)[MD1])(GD[1] + tidz);
692 real_t (*QQ0)[MD1] = (real_t (*)[MD1])(GQ[0] + tidz);
693 real_t (*QQ1)[MD1] = (real_t (*)[MD1])(GQ[1] + tidz);
694 MFEM_FOREACH_THREAD(dy,y,D1D)
695 {
696 MFEM_FOREACH_THREAD(dx,x,D1D)
697 {
698 X[dy][dx] = x(dx,dy,e);
699 }
700 }
701 if (tidz == 0)
702 {
703 MFEM_FOREACH_THREAD(dy,y,D1D)
704 {
705 MFEM_FOREACH_THREAD(q,x,Q1D)
706 {
707 B[q][dy] = b(q,dy);
708 G[q][dy] = g(q,dy);
709 }
710 }
711 }
712 MFEM_SYNC_THREAD;
713 MFEM_FOREACH_THREAD(dy,y,D1D)
714 {
715 MFEM_FOREACH_THREAD(qx,x,Q1D)
716 {
717 real_t u = 0.0;
718 real_t v = 0.0;
719 for (int dx = 0; dx < D1D; ++dx)
720 {
721 const real_t coords = X[dy][dx];
722 u += B[qx][dx] * coords;
723 v += G[qx][dx] * coords;
724 }
725 DQ0[dy][qx] = u;
726 DQ1[dy][qx] = v;
727 }
728 }
729 MFEM_SYNC_THREAD;
730 MFEM_FOREACH_THREAD(qy,y,Q1D)
731 {
732 MFEM_FOREACH_THREAD(qx,x,Q1D)
733 {
734 real_t u = 0.0;
735 real_t v = 0.0;
736 for (int dy = 0; dy < D1D; ++dy)
737 {
738 u += DQ1[dy][qx] * B[qy][dy];
739 v += DQ0[dy][qx] * G[qy][dy];
740 }
741 QQ0[qy][qx] = u;
742 QQ1[qy][qx] = v;
743 }
744 }
745 MFEM_SYNC_THREAD;
746 MFEM_FOREACH_THREAD(qy,y,Q1D)
747 {
748 MFEM_FOREACH_THREAD(qx,x,Q1D)
749 {
750 const int q = (qx + ((qy) * Q1D));
751 const real_t O11 = D(q,0,e);
752 const real_t O21 = D(q,1,e);
753 const real_t O12 = symmetric ? O21 : D(q,2,e);
754 const real_t O22 = symmetric ? D(q,2,e) : D(q,3,e);
755 const real_t gX = QQ0[qy][qx];
756 const real_t gY = QQ1[qy][qx];
757 QQ0[qy][qx] = (O11 * gX) + (O12 * gY);
758 QQ1[qy][qx] = (O21 * gX) + (O22 * gY);
759 }
760 }
761 MFEM_SYNC_THREAD;
762 if (tidz == 0)
763 {
764 MFEM_FOREACH_THREAD(dy,y,D1D)
765 {
766 MFEM_FOREACH_THREAD(q,x,Q1D)
767 {
768 Bt[dy][q] = b(q,dy);
769 Gt[dy][q] = g(q,dy);
770 }
771 }
772 }
773 MFEM_SYNC_THREAD;
774 MFEM_FOREACH_THREAD(qy,y,Q1D)
775 {
776 MFEM_FOREACH_THREAD(dx,x,D1D)
777 {
778 real_t u = 0.0;
779 real_t v = 0.0;
780 for (int qx = 0; qx < Q1D; ++qx)
781 {
782 u += Gt[dx][qx] * QQ0[qy][qx];
783 v += Bt[dx][qx] * QQ1[qy][qx];
784 }
785 DQ0[qy][dx] = u;
786 DQ1[qy][dx] = v;
787 }
788 }
789 MFEM_SYNC_THREAD;
790 MFEM_FOREACH_THREAD(dy,y,D1D)
791 {
792 MFEM_FOREACH_THREAD(dx,x,D1D)
793 {
794 real_t u = 0.0;
795 real_t v = 0.0;
796 for (int qy = 0; qy < Q1D; ++qy)
797 {
798 u += DQ0[qy][dx] * Bt[dy][qy];
799 v += DQ1[qy][dx] * Gt[dy][qy];
800 }
801 Y(dx,dy,e) += (u + v);
802 }
803 }
804 });
805}
806
807// PA Diffusion Apply 3D kernel
808template<int T_D1D = 0, int T_Q1D = 0>
809inline void PADiffusionApply3D(const int NE,
810 const bool symmetric,
811 const Array<real_t> &b,
812 const Array<real_t> &g,
813 const Array<real_t> &bt,
814 const Array<real_t> &gt,
815 const Vector &d_,
816 const Vector &x_,
817 Vector &y_,
818 int d1d = 0, int q1d = 0)
819{
820 const int D1D = T_D1D ? T_D1D : d1d;
821 const int Q1D = T_Q1D ? T_Q1D : q1d;
822 MFEM_VERIFY(D1D <= DeviceDofQuadLimits::Get().MAX_D1D, "");
823 MFEM_VERIFY(Q1D <= DeviceDofQuadLimits::Get().MAX_Q1D, "");
824 auto B = Reshape(b.Read(), Q1D, D1D);
825 auto G = Reshape(g.Read(), Q1D, D1D);
826 auto Bt = Reshape(bt.Read(), D1D, Q1D);
827 auto Gt = Reshape(gt.Read(), D1D, Q1D);
828 auto D = Reshape(d_.Read(), Q1D*Q1D*Q1D, symmetric ? 6 : 9, NE);
829 auto X = Reshape(x_.Read(), D1D, D1D, D1D, NE);
830 auto Y = Reshape(y_.ReadWrite(), D1D, D1D, D1D, NE);
831 mfem::forall(NE, [=] MFEM_HOST_DEVICE (int e)
832 {
833 const int D1D = T_D1D ? T_D1D : d1d;
834 const int Q1D = T_Q1D ? T_Q1D : q1d;
835 constexpr int max_D1D = T_D1D ? T_D1D : DofQuadLimits::MAX_D1D;
836 constexpr int max_Q1D = T_Q1D ? T_Q1D : DofQuadLimits::MAX_Q1D;
837 real_t grad[max_Q1D][max_Q1D][max_Q1D][3];
838 for (int qz = 0; qz < Q1D; ++qz)
839 {
840 for (int qy = 0; qy < Q1D; ++qy)
841 {
842 for (int qx = 0; qx < Q1D; ++qx)
843 {
844 grad[qz][qy][qx][0] = 0.0;
845 grad[qz][qy][qx][1] = 0.0;
846 grad[qz][qy][qx][2] = 0.0;
847 }
848 }
849 }
850 for (int dz = 0; dz < D1D; ++dz)
851 {
852 real_t gradXY[max_Q1D][max_Q1D][3];
853 for (int qy = 0; qy < Q1D; ++qy)
854 {
855 for (int qx = 0; qx < Q1D; ++qx)
856 {
857 gradXY[qy][qx][0] = 0.0;
858 gradXY[qy][qx][1] = 0.0;
859 gradXY[qy][qx][2] = 0.0;
860 }
861 }
862 for (int dy = 0; dy < D1D; ++dy)
863 {
864 real_t gradX[max_Q1D][2];
865 for (int qx = 0; qx < Q1D; ++qx)
866 {
867 gradX[qx][0] = 0.0;
868 gradX[qx][1] = 0.0;
869 }
870 for (int dx = 0; dx < D1D; ++dx)
871 {
872 const real_t s = X(dx,dy,dz,e);
873 for (int qx = 0; qx < Q1D; ++qx)
874 {
875 gradX[qx][0] += s * B(qx,dx);
876 gradX[qx][1] += s * G(qx,dx);
877 }
878 }
879 for (int qy = 0; qy < Q1D; ++qy)
880 {
881 const real_t wy = B(qy,dy);
882 const real_t wDy = G(qy,dy);
883 for (int qx = 0; qx < Q1D; ++qx)
884 {
885 const real_t wx = gradX[qx][0];
886 const real_t wDx = gradX[qx][1];
887 gradXY[qy][qx][0] += wDx * wy;
888 gradXY[qy][qx][1] += wx * wDy;
889 gradXY[qy][qx][2] += wx * wy;
890 }
891 }
892 }
893 for (int qz = 0; qz < Q1D; ++qz)
894 {
895 const real_t wz = B(qz,dz);
896 const real_t wDz = G(qz,dz);
897 for (int qy = 0; qy < Q1D; ++qy)
898 {
899 for (int qx = 0; qx < Q1D; ++qx)
900 {
901 grad[qz][qy][qx][0] += gradXY[qy][qx][0] * wz;
902 grad[qz][qy][qx][1] += gradXY[qy][qx][1] * wz;
903 grad[qz][qy][qx][2] += gradXY[qy][qx][2] * wDz;
904 }
905 }
906 }
907 }
908 // Calculate Dxyz, xDyz, xyDz in plane
909 for (int qz = 0; qz < Q1D; ++qz)
910 {
911 for (int qy = 0; qy < Q1D; ++qy)
912 {
913 for (int qx = 0; qx < Q1D; ++qx)
914 {
915 const int q = qx + (qy + qz * Q1D) * Q1D;
916 const real_t O11 = D(q,0,e);
917 const real_t O12 = D(q,1,e);
918 const real_t O13 = D(q,2,e);
919 const real_t O21 = symmetric ? O12 : D(q,3,e);
920 const real_t O22 = symmetric ? D(q,3,e) : D(q,4,e);
921 const real_t O23 = symmetric ? D(q,4,e) : D(q,5,e);
922 const real_t O31 = symmetric ? O13 : D(q,6,e);
923 const real_t O32 = symmetric ? O23 : D(q,7,e);
924 const real_t O33 = symmetric ? D(q,5,e) : D(q,8,e);
925 const real_t gradX = grad[qz][qy][qx][0];
926 const real_t gradY = grad[qz][qy][qx][1];
927 const real_t gradZ = grad[qz][qy][qx][2];
928 grad[qz][qy][qx][0] = (O11*gradX)+(O12*gradY)+(O13*gradZ);
929 grad[qz][qy][qx][1] = (O21*gradX)+(O22*gradY)+(O23*gradZ);
930 grad[qz][qy][qx][2] = (O31*gradX)+(O32*gradY)+(O33*gradZ);
931 }
932 }
933 }
934 for (int qz = 0; qz < Q1D; ++qz)
935 {
936 real_t gradXY[max_D1D][max_D1D][3];
937 for (int dy = 0; dy < D1D; ++dy)
938 {
939 for (int dx = 0; dx < D1D; ++dx)
940 {
941 gradXY[dy][dx][0] = 0;
942 gradXY[dy][dx][1] = 0;
943 gradXY[dy][dx][2] = 0;
944 }
945 }
946 for (int qy = 0; qy < Q1D; ++qy)
947 {
948 real_t gradX[max_D1D][3];
949 for (int dx = 0; dx < D1D; ++dx)
950 {
951 gradX[dx][0] = 0;
952 gradX[dx][1] = 0;
953 gradX[dx][2] = 0;
954 }
955 for (int qx = 0; qx < Q1D; ++qx)
956 {
957 const real_t gX = grad[qz][qy][qx][0];
958 const real_t gY = grad[qz][qy][qx][1];
959 const real_t gZ = grad[qz][qy][qx][2];
960 for (int dx = 0; dx < D1D; ++dx)
961 {
962 const real_t wx = Bt(dx,qx);
963 const real_t wDx = Gt(dx,qx);
964 gradX[dx][0] += gX * wDx;
965 gradX[dx][1] += gY * wx;
966 gradX[dx][2] += gZ * wx;
967 }
968 }
969 for (int dy = 0; dy < D1D; ++dy)
970 {
971 const real_t wy = Bt(dy,qy);
972 const real_t wDy = Gt(dy,qy);
973 for (int dx = 0; dx < D1D; ++dx)
974 {
975 gradXY[dy][dx][0] += gradX[dx][0] * wy;
976 gradXY[dy][dx][1] += gradX[dx][1] * wDy;
977 gradXY[dy][dx][2] += gradX[dx][2] * wy;
978 }
979 }
980 }
981 for (int dz = 0; dz < D1D; ++dz)
982 {
983 const real_t wz = Bt(dz,qz);
984 const real_t wDz = Gt(dz,qz);
985 for (int dy = 0; dy < D1D; ++dy)
986 {
987 for (int dx = 0; dx < D1D; ++dx)
988 {
989 Y(dx,dy,dz,e) +=
990 ((gradXY[dy][dx][0] * wz) +
991 (gradXY[dy][dx][1] * wz) +
992 (gradXY[dy][dx][2] * wDz));
993 }
994 }
995 }
996 }
997 });
998}
999
1000// Shared memory PA Diffusion Apply 3D kernel
1001template<int T_D1D = 0, int T_Q1D = 0>
1002inline void SmemPADiffusionApply3D(const int NE,
1003 const bool symmetric,
1004 const Array<real_t> &b_,
1005 const Array<real_t> &g_,
1006 const Array<real_t> &,
1007 const Array<real_t> &,
1008 const Vector &d_,
1009 const Vector &x_,
1010 Vector &y_,
1011 const int d1d = 0,
1012 const int q1d = 0)
1013{
1014 const int D1D = T_D1D ? T_D1D : d1d;
1015 const int Q1D = T_Q1D ? T_Q1D : q1d;
1016 const int max_q1d = T_Q1D ? T_Q1D : DeviceDofQuadLimits::Get().MAX_Q1D;
1017 const int max_d1d = T_D1D ? T_D1D : DeviceDofQuadLimits::Get().MAX_D1D;
1018 MFEM_VERIFY(D1D <= max_d1d, "");
1019 MFEM_VERIFY(Q1D <= max_q1d, "");
1020 auto b = Reshape(b_.Read(), Q1D, D1D);
1021 auto g = Reshape(g_.Read(), Q1D, D1D);
1022 auto d = Reshape(d_.Read(), Q1D, Q1D, Q1D, symmetric ? 6 : 9, NE);
1023 auto x = Reshape(x_.Read(), D1D, D1D, D1D, NE);
1024 auto y = Reshape(y_.ReadWrite(), D1D, D1D, D1D, NE);
1025 mfem::forall_3D(NE, Q1D, Q1D, Q1D, [=] MFEM_HOST_DEVICE (int e)
1026 {
1027 const int D1D = T_D1D ? T_D1D : d1d;
1028 const int Q1D = T_Q1D ? T_Q1D : q1d;
1029 constexpr int MQ1 = T_Q1D ? T_Q1D : DofQuadLimits::MAX_Q1D;
1030 constexpr int MD1 = T_D1D ? T_D1D : DofQuadLimits::MAX_D1D;
1031 constexpr int MDQ = (MQ1 > MD1) ? MQ1 : MD1;
1032 MFEM_SHARED real_t sBG[2][MQ1*MD1];
1033 real_t (*B)[MD1] = (real_t (*)[MD1]) (sBG+0);
1034 real_t (*G)[MD1] = (real_t (*)[MD1]) (sBG+1);
1035 real_t (*Bt)[MQ1] = (real_t (*)[MQ1]) (sBG+0);
1036 real_t (*Gt)[MQ1] = (real_t (*)[MQ1]) (sBG+1);
1037 MFEM_SHARED real_t sm0[3][MDQ*MDQ*MDQ];
1038 MFEM_SHARED real_t sm1[3][MDQ*MDQ*MDQ];
1039 real_t (*X)[MD1][MD1] = (real_t (*)[MD1][MD1]) (sm0+2);
1040 real_t (*DDQ0)[MD1][MQ1] = (real_t (*)[MD1][MQ1]) (sm0+0);
1041 real_t (*DDQ1)[MD1][MQ1] = (real_t (*)[MD1][MQ1]) (sm0+1);
1042 real_t (*DQQ0)[MQ1][MQ1] = (real_t (*)[MQ1][MQ1]) (sm1+0);
1043 real_t (*DQQ1)[MQ1][MQ1] = (real_t (*)[MQ1][MQ1]) (sm1+1);
1044 real_t (*DQQ2)[MQ1][MQ1] = (real_t (*)[MQ1][MQ1]) (sm1+2);
1045 real_t (*QQQ0)[MQ1][MQ1] = (real_t (*)[MQ1][MQ1]) (sm0+0);
1046 real_t (*QQQ1)[MQ1][MQ1] = (real_t (*)[MQ1][MQ1]) (sm0+1);
1047 real_t (*QQQ2)[MQ1][MQ1] = (real_t (*)[MQ1][MQ1]) (sm0+2);
1048 real_t (*QQD0)[MQ1][MD1] = (real_t (*)[MQ1][MD1]) (sm1+0);
1049 real_t (*QQD1)[MQ1][MD1] = (real_t (*)[MQ1][MD1]) (sm1+1);
1050 real_t (*QQD2)[MQ1][MD1] = (real_t (*)[MQ1][MD1]) (sm1+2);
1051 real_t (*QDD0)[MD1][MD1] = (real_t (*)[MD1][MD1]) (sm0+0);
1052 real_t (*QDD1)[MD1][MD1] = (real_t (*)[MD1][MD1]) (sm0+1);
1053 real_t (*QDD2)[MD1][MD1] = (real_t (*)[MD1][MD1]) (sm0+2);
1054 MFEM_FOREACH_THREAD(dz,z,D1D)
1055 {
1056 MFEM_FOREACH_THREAD(dy,y,D1D)
1057 {
1058 MFEM_FOREACH_THREAD(dx,x,D1D)
1059 {
1060 X[dz][dy][dx] = x(dx,dy,dz,e);
1061 }
1062 }
1063 }
1064 if (MFEM_THREAD_ID(z) == 0)
1065 {
1066 MFEM_FOREACH_THREAD(dy,y,D1D)
1067 {
1068 MFEM_FOREACH_THREAD(qx,x,Q1D)
1069 {
1070 B[qx][dy] = b(qx,dy);
1071 G[qx][dy] = g(qx,dy);
1072 }
1073 }
1074 }
1075 MFEM_SYNC_THREAD;
1076 MFEM_FOREACH_THREAD(dz,z,D1D)
1077 {
1078 MFEM_FOREACH_THREAD(dy,y,D1D)
1079 {
1080 MFEM_FOREACH_THREAD(qx,x,Q1D)
1081 {
1082 real_t u = 0.0, v = 0.0;
1083 MFEM_UNROLL(MD1)
1084 for (int dx = 0; dx < D1D; ++dx)
1085 {
1086 const real_t coords = X[dz][dy][dx];
1087 u += coords * B[qx][dx];
1088 v += coords * G[qx][dx];
1089 }
1090 DDQ0[dz][dy][qx] = u;
1091 DDQ1[dz][dy][qx] = v;
1092 }
1093 }
1094 }
1095 MFEM_SYNC_THREAD;
1096 MFEM_FOREACH_THREAD(dz,z,D1D)
1097 {
1098 MFEM_FOREACH_THREAD(qy,y,Q1D)
1099 {
1100 MFEM_FOREACH_THREAD(qx,x,Q1D)
1101 {
1102 real_t u = 0.0, v = 0.0, w = 0.0;
1103 MFEM_UNROLL(MD1)
1104 for (int dy = 0; dy < D1D; ++dy)
1105 {
1106 u += DDQ1[dz][dy][qx] * B[qy][dy];
1107 v += DDQ0[dz][dy][qx] * G[qy][dy];
1108 w += DDQ0[dz][dy][qx] * B[qy][dy];
1109 }
1110 DQQ0[dz][qy][qx] = u;
1111 DQQ1[dz][qy][qx] = v;
1112 DQQ2[dz][qy][qx] = w;
1113 }
1114 }
1115 }
1116 MFEM_SYNC_THREAD;
1117 MFEM_FOREACH_THREAD(qz,z,Q1D)
1118 {
1119 MFEM_FOREACH_THREAD(qy,y,Q1D)
1120 {
1121 MFEM_FOREACH_THREAD(qx,x,Q1D)
1122 {
1123 real_t u = 0.0, v = 0.0, w = 0.0;
1124 MFEM_UNROLL(MD1)
1125 for (int dz = 0; dz < D1D; ++dz)
1126 {
1127 u += DQQ0[dz][qy][qx] * B[qz][dz];
1128 v += DQQ1[dz][qy][qx] * B[qz][dz];
1129 w += DQQ2[dz][qy][qx] * G[qz][dz];
1130 }
1131 const real_t O11 = d(qx,qy,qz,0,e);
1132 const real_t O12 = d(qx,qy,qz,1,e);
1133 const real_t O13 = d(qx,qy,qz,2,e);
1134 const real_t O21 = symmetric ? O12 : d(qx,qy,qz,3,e);
1135 const real_t O22 = symmetric ? d(qx,qy,qz,3,e) : d(qx,qy,qz,4,e);
1136 const real_t O23 = symmetric ? d(qx,qy,qz,4,e) : d(qx,qy,qz,5,e);
1137 const real_t O31 = symmetric ? O13 : d(qx,qy,qz,6,e);
1138 const real_t O32 = symmetric ? O23 : d(qx,qy,qz,7,e);
1139 const real_t O33 = symmetric ? d(qx,qy,qz,5,e) : d(qx,qy,qz,8,e);
1140 const real_t gX = u;
1141 const real_t gY = v;
1142 const real_t gZ = w;
1143 QQQ0[qz][qy][qx] = (O11*gX) + (O12*gY) + (O13*gZ);
1144 QQQ1[qz][qy][qx] = (O21*gX) + (O22*gY) + (O23*gZ);
1145 QQQ2[qz][qy][qx] = (O31*gX) + (O32*gY) + (O33*gZ);
1146 }
1147 }
1148 }
1149 MFEM_SYNC_THREAD;
1150 if (MFEM_THREAD_ID(z) == 0)
1151 {
1152 MFEM_FOREACH_THREAD(dy,y,D1D)
1153 {
1154 MFEM_FOREACH_THREAD(qx,x,Q1D)
1155 {
1156 Bt[dy][qx] = b(qx,dy);
1157 Gt[dy][qx] = g(qx,dy);
1158 }
1159 }
1160 }
1161 MFEM_SYNC_THREAD;
1162 MFEM_FOREACH_THREAD(qz,z,Q1D)
1163 {
1164 MFEM_FOREACH_THREAD(qy,y,Q1D)
1165 {
1166 MFEM_FOREACH_THREAD(dx,x,D1D)
1167 {
1168 real_t u = 0.0, v = 0.0, w = 0.0;
1169 MFEM_UNROLL(MQ1)
1170 for (int qx = 0; qx < Q1D; ++qx)
1171 {
1172 u += QQQ0[qz][qy][qx] * Gt[dx][qx];
1173 v += QQQ1[qz][qy][qx] * Bt[dx][qx];
1174 w += QQQ2[qz][qy][qx] * Bt[dx][qx];
1175 }
1176 QQD0[qz][qy][dx] = u;
1177 QQD1[qz][qy][dx] = v;
1178 QQD2[qz][qy][dx] = w;
1179 }
1180 }
1181 }
1182 MFEM_SYNC_THREAD;
1183 MFEM_FOREACH_THREAD(qz,z,Q1D)
1184 {
1185 MFEM_FOREACH_THREAD(dy,y,D1D)
1186 {
1187 MFEM_FOREACH_THREAD(dx,x,D1D)
1188 {
1189 real_t u = 0.0, v = 0.0, w = 0.0;
1190 MFEM_UNROLL(Q1D)
1191 for (int qy = 0; qy < Q1D; ++qy)
1192 {
1193 u += QQD0[qz][qy][dx] * Bt[dy][qy];
1194 v += QQD1[qz][qy][dx] * Gt[dy][qy];
1195 w += QQD2[qz][qy][dx] * Bt[dy][qy];
1196 }
1197 QDD0[qz][dy][dx] = u;
1198 QDD1[qz][dy][dx] = v;
1199 QDD2[qz][dy][dx] = w;
1200 }
1201 }
1202 }
1203 MFEM_SYNC_THREAD;
1204 MFEM_FOREACH_THREAD(dz,z,D1D)
1205 {
1206 MFEM_FOREACH_THREAD(dy,y,D1D)
1207 {
1208 MFEM_FOREACH_THREAD(dx,x,D1D)
1209 {
1210 real_t u = 0.0, v = 0.0, w = 0.0;
1211 MFEM_UNROLL(MQ1)
1212 for (int qz = 0; qz < Q1D; ++qz)
1213 {
1214 u += QDD0[qz][dy][dx] * Bt[dz][qz];
1215 v += QDD1[qz][dy][dx] * Bt[dz][qz];
1216 w += QDD2[qz][dy][dx] * Gt[dz][qz];
1217 }
1218 y(dx,dy,dz,e) += (u + v + w);
1219 }
1220 }
1221 }
1222 });
1223}
1224
1225} // namespace internal
1226
1227namespace
1228{
1229using ApplyKernelType = DiffusionIntegrator::ApplyKernelType;
1230using DiagonalKernelType = DiffusionIntegrator::DiagonalKernelType;
1231}
1232
1233template<int DIM, int T_D1D, int T_Q1D>
1234ApplyKernelType DiffusionIntegrator::ApplyPAKernels::Kernel()
1235{
1236 if (DIM == 2) { return internal::SmemPADiffusionApply2D<T_D1D,T_Q1D>; }
1237 else if (DIM == 3) { return internal::SmemPADiffusionApply3D<T_D1D, T_Q1D>; }
1238 else { MFEM_ABORT(""); }
1239}
1240
1241inline
1242ApplyKernelType DiffusionIntegrator::ApplyPAKernels::Fallback(int DIM, int, int)
1243{
1244 if (DIM == 2) { return internal::PADiffusionApply2D; }
1245 else if (DIM == 3) { return internal::PADiffusionApply3D; }
1246 else { MFEM_ABORT(""); }
1247}
1248
1249template<int DIM, int D1D, int Q1D>
1250DiagonalKernelType DiffusionIntegrator::DiagonalPAKernels::Kernel()
1251{
1252 if (DIM == 2) { return internal::SmemPADiffusionDiagonal2D<D1D,Q1D>; }
1253 else if (DIM == 3) { return internal::SmemPADiffusionDiagonal3D<D1D, Q1D>; }
1254 else { MFEM_ABORT(""); }
1255}
1256
1257inline DiagonalKernelType
1258DiffusionIntegrator::DiagonalPAKernels::Fallback(int DIM, int, int)
1259{
1260 if (DIM == 2) { return internal::PADiffusionDiagonal2D; }
1261 else if (DIM == 3) { return internal::PADiffusionDiagonal3D; }
1262 else { MFEM_ABORT(""); }
1263}
1264/// \endcond DO_NOT_DOCUMENT
1265
1266} // namespace mfem
1267
1268
1269#endif
void(*)(const int, const bool, const Array< real_t > &, const Array< real_t > &, const Vector &, Vector &, const int, const int) DiagonalKernelType
void(*)(const int, const bool, const Array< real_t > &, const Array< real_t > &, const Array< real_t > &, const Array< real_t > &, const Vector &, const Vector &, Vector &, const int, const int) ApplyKernelType
int dim
Definition ex24.cpp:53
real_t b
Definition lissajous.cpp:42
constexpr int DIM
real_t u(const Vector &xvec)
Definition lor_mms.hpp:22
MFEM_HOST_DEVICE DeviceTensor< sizeof...(Dims), T > Reshape(T *ptr, Dims... dims)
Wrap a pointer as a DeviceTensor with automatically deduced template parameters.
Definition dtensor.hpp:131
void forall_2D_batch(int N, int X, int Y, int BZ, lambda &&body)
Definition forall.hpp:768
void forall_3D(int N, int X, int Y, int Z, lambda &&body)
Definition forall.hpp:774
float real_t
Definition config.hpp:43
void forall(int N, lambda &&body)
Definition forall.hpp:753
real_t p(const Vector &x, real_t t)
static const DeviceDofQuadLimits & Get()
Return a const reference to the DeviceDofQuadLimits singleton.
Definition forall.hpp:128
int MAX_D1D
Maximum number of 1D nodal points.
Definition forall.hpp:118
int MAX_Q1D
Maximum number of 1D quadrature points.
Definition forall.hpp:119