MFEM v4.9.0
Finite element discretization library
Loading...
Searching...
No Matches
bilininteg_diffusion_kernels.hpp
Go to the documentation of this file.
1// Copyright (c) 2010-2025, Lawrence Livermore National Security, LLC. Produced
2// at the Lawrence Livermore National Laboratory. All Rights reserved. See files
3// LICENSE and NOTICE for details. LLNL-CODE-806117.
4//
5// This file is part of the MFEM library. For more information and source code
6// availability visit https://mfem.org.
7//
8// MFEM is free software; you can redistribute it and/or modify it under the
9// terms of the BSD-3 license. We welcome feedback and contributions, see file
10// CONTRIBUTING.md for details.
11
12#ifndef MFEM_BILININTEG_DIFFUSION_KERNELS_HPP
13#define MFEM_BILININTEG_DIFFUSION_KERNELS_HPP
14
21#include "../bilininteg.hpp"
22
23namespace mfem
24{
25
26/// \cond DO_NOT_DOCUMENT
27namespace internal
28{
29
30void PADiffusionSetup(const int dim,
31 const int sdim,
32 const int D1D,
33 const int Q1D,
34 const int coeffDim,
35 const int NE,
36 const Array<real_t> &W,
37 const Vector &J,
38 const Vector &C,
39 Vector &D);
40
41// PA Diffusion Assemble 2D f
42template<int T_SDIM>
43void PADiffusionSetup2D(const int Q1D,
44 const int coeffDim,
45 const int NE,
46 const Array<real_t> &w,
47 const Vector &j,
48 const Vector &c,
49 Vector &d);
50
51// PA Diffusion Assemble 3D kernel
52void PADiffusionSetup3D(const int Q1D,
53 const int coeffDim,
54 const int NE,
55 const Array<real_t> &w,
56 const Vector &j,
57 const Vector &c,
58 Vector &d);
59
60#ifdef MFEM_USE_OCCA
61// OCCA 2D Assemble kernel
62void OccaPADiffusionSetup2D(const int D1D,
63 const int Q1D,
64 const int NE,
65 const Array<real_t> &W,
66 const Vector &J,
67 const Vector &C,
68 Vector &op);
69
70// OCCA 3D Assemble kernel
71void OccaPADiffusionSetup3D(const int D1D,
72 const int Q1D,
73 const int NE,
74 const Array<real_t> &W,
75 const Vector &J,
76 const Vector &C,
77 Vector &op);
78#endif // MFEM_USE_OCCA
79
80void PADiffusionAssembleDiagonal(const int dim,
81 const int D1D,
82 const int Q1D,
83 const int NE,
84 const bool symm,
85 const Array<real_t> &B,
86 const Array<real_t> &G,
87 const Vector &D,
88 Vector &Y);
89
90// PA Diffusion Diagonal 2D kernel
91template<int T_D1D = 0, int T_Q1D = 0>
92inline void PADiffusionDiagonal2D(const int NE,
93 const bool symmetric,
94 const Array<real_t> &b,
95 const Array<real_t> &g,
96 const Vector &d,
97 Vector &y,
98 const int d1d = 0,
99 const int q1d = 0)
100{
101 const int D1D = T_D1D ? T_D1D : d1d;
102 const int Q1D = T_Q1D ? T_Q1D : q1d;
103 MFEM_VERIFY(D1D <= DeviceDofQuadLimits::Get().MAX_D1D, "");
104 MFEM_VERIFY(Q1D <= DeviceDofQuadLimits::Get().MAX_Q1D, "");
105 auto B = Reshape(b.Read(), Q1D, D1D);
106 auto G = Reshape(g.Read(), Q1D, D1D);
107 // note the different shape for D, if this is a symmetric matrix we only
108 // store necessary entries
109 auto D = Reshape(d.Read(), Q1D*Q1D, symmetric ? 3 : 4, NE);
110 auto Y = Reshape(y.ReadWrite(), D1D, D1D, NE);
111 mfem::forall(NE, [=] MFEM_HOST_DEVICE (int e)
112 {
113 const int D1D = T_D1D ? T_D1D : d1d;
114 const int Q1D = T_Q1D ? T_Q1D : q1d;
115 constexpr int MD1 = T_D1D ? T_D1D : DofQuadLimits::MAX_D1D;
116 constexpr int MQ1 = T_Q1D ? T_Q1D : DofQuadLimits::MAX_Q1D;
117 // gradphi \cdot Q \gradphi has four terms
118 real_t QD0[MQ1][MD1];
119 real_t QD1[MQ1][MD1];
120 real_t QD2[MQ1][MD1];
121 for (int qx = 0; qx < Q1D; ++qx)
122 {
123 for (int dy = 0; dy < D1D; ++dy)
124 {
125 QD0[qx][dy] = 0.0;
126 QD1[qx][dy] = 0.0;
127 QD2[qx][dy] = 0.0;
128 for (int qy = 0; qy < Q1D; ++qy)
129 {
130 const int q = qx + qy * Q1D;
131 const real_t D00 = D(q,0,e);
132 const real_t D10 = D(q,1,e);
133 const real_t D01 = symmetric ? D10 : D(q,2,e);
134 const real_t D11 = symmetric ? D(q,2,e) : D(q,3,e);
135 QD0[qx][dy] += B(qy, dy) * B(qy, dy) * D00;
136 QD1[qx][dy] += B(qy, dy) * G(qy, dy) * (D01 + D10);
137 QD2[qx][dy] += G(qy, dy) * G(qy, dy) * D11;
138 }
139 }
140 }
141 for (int dy = 0; dy < D1D; ++dy)
142 {
143 for (int dx = 0; dx < D1D; ++dx)
144 {
145 for (int qx = 0; qx < Q1D; ++qx)
146 {
147 Y(dx,dy,e) += G(qx, dx) * G(qx, dx) * QD0[qx][dy];
148 Y(dx,dy,e) += G(qx, dx) * B(qx, dx) * QD1[qx][dy];
149 Y(dx,dy,e) += B(qx, dx) * B(qx, dx) * QD2[qx][dy];
150 }
151 }
152 }
153 });
154}
155
156namespace diffusion
157{
158constexpr int ipow(int x, int p) { return p == 0 ? 1 : x*ipow(x, p-1); }
159constexpr int D11(int x) { return (11 - x)/2; }
160constexpr int D10(int x) { return (10 - x)/2; }
161constexpr int NBZApply(int D1D)
162{
163 return ipow(2, D11(D1D) >= 0 ? D11(D1D) : 0);
164}
165constexpr int NBZDiagonal(int D1D)
166{
167 return ipow(2, D10(D1D) >= 0 ? D10(D1D) : 0);
168}
169}
170
171// Shared memory PA Diffusion Diagonal 2D kernel
172template<int T_D1D = 0, int T_Q1D = 0>
173inline void SmemPADiffusionDiagonal2D(const int NE,
174 const bool symmetric,
175 const Array<real_t> &b_,
176 const Array<real_t> &g_,
177 const Vector &d_,
178 Vector &y_,
179 const int d1d = 0,
180 const int q1d = 0)
181{
182 static constexpr int T_NBZ = diffusion::NBZDiagonal(T_D1D);
183 static constexpr int NBZ = T_NBZ ? T_NBZ : 1;
184 const int D1D = T_D1D ? T_D1D : d1d;
185 const int Q1D = T_Q1D ? T_Q1D : q1d;
186 const int max_q1d = T_Q1D ? T_Q1D : DeviceDofQuadLimits::Get().MAX_Q1D;
187 const int max_d1d = T_D1D ? T_D1D : DeviceDofQuadLimits::Get().MAX_D1D;
188 MFEM_VERIFY(D1D <= max_d1d, "");
189 MFEM_VERIFY(Q1D <= max_q1d, "");
190 auto b = Reshape(b_.Read(), Q1D, D1D);
191 auto g = Reshape(g_.Read(), Q1D, D1D);
192 auto D = Reshape(d_.Read(), Q1D*Q1D, symmetric ? 3 : 4, NE);
193 auto Y = Reshape(y_.ReadWrite(), D1D, D1D, NE);
194 mfem::forall_2D_batch(NE, Q1D, Q1D, NBZ, [=] MFEM_HOST_DEVICE (int e)
195 {
196 const int tidz = MFEM_THREAD_ID(z);
197 const int D1D = T_D1D ? T_D1D : d1d;
198 const int Q1D = T_Q1D ? T_Q1D : q1d;
199 constexpr int MQ1 = T_Q1D ? T_Q1D : DofQuadLimits::MAX_Q1D;
200 constexpr int MD1 = T_D1D ? T_D1D : DofQuadLimits::MAX_D1D;
201 MFEM_SHARED real_t BG[2][MQ1*MD1];
202 real_t (*B)[MD1] = (real_t (*)[MD1]) (BG+0);
203 real_t (*G)[MD1] = (real_t (*)[MD1]) (BG+1);
204 MFEM_SHARED real_t QD[3][NBZ][MQ1][MD1];
205 real_t (*QD0)[MD1] = (real_t (*)[MD1])(QD[0] + tidz);
206 real_t (*QD1)[MD1] = (real_t (*)[MD1])(QD[1] + tidz);
207 real_t (*QD2)[MD1] = (real_t (*)[MD1])(QD[2] + tidz);
208 if (tidz == 0)
209 {
210 MFEM_FOREACH_THREAD(d,y,D1D)
211 {
212 MFEM_FOREACH_THREAD(q,x,Q1D)
213 {
214 B[q][d] = b(q,d);
215 G[q][d] = g(q,d);
216 }
217 }
218 }
219 MFEM_SYNC_THREAD;
220 MFEM_FOREACH_THREAD(qx,x,Q1D)
221 {
222 MFEM_FOREACH_THREAD(dy,y,D1D)
223 {
224 QD0[qx][dy] = 0.0;
225 QD1[qx][dy] = 0.0;
226 QD2[qx][dy] = 0.0;
227 for (int qy = 0; qy < Q1D; ++qy)
228 {
229 const int q = qx + qy * Q1D;
230 const real_t D00 = D(q,0,e);
231 const real_t D10 = D(q,1,e);
232 const real_t D01 = symmetric ? D10 : D(q,2,e);
233 const real_t D11 = symmetric ? D(q,2,e) : D(q,3,e);
234 const real_t By = B[qy][dy];
235 const real_t Gy = G[qy][dy];
236 const real_t BBy = By * By;
237 const real_t BGy = By * Gy;
238 const real_t GGy = Gy * Gy;
239 QD0[qx][dy] += BBy * D00;
240 QD1[qx][dy] += BGy * (D01 + D10);
241 QD2[qx][dy] += GGy * D11;
242 }
243 }
244 }
245 MFEM_SYNC_THREAD;
246 MFEM_FOREACH_THREAD(dy,y,D1D)
247 {
248 MFEM_FOREACH_THREAD(dx,x,D1D)
249 {
250 for (int qx = 0; qx < Q1D; ++qx)
251 {
252 const real_t Bx = B[qx][dx];
253 const real_t Gx = G[qx][dx];
254 const real_t BBx = Bx * Bx;
255 const real_t BGx = Bx * Gx;
256 const real_t GGx = Gx * Gx;
257 Y(dx,dy,e) += GGx * QD0[qx][dy];
258 Y(dx,dy,e) += BGx * QD1[qx][dy];
259 Y(dx,dy,e) += BBx * QD2[qx][dy];
260 }
261 }
262 }
263 });
264}
265
266// PA Diffusion Diagonal 3D kernel
267template<int T_D1D = 0, int T_Q1D = 0>
268inline void PADiffusionDiagonal3D(const int NE,
269 const bool symmetric,
270 const Array<real_t> &b,
271 const Array<real_t> &g,
272 const Vector &d,
273 Vector &y,
274 const int d1d = 0,
275 const int q1d = 0)
276{
277 constexpr int DIM = 3;
278 const int D1D = T_D1D ? T_D1D : d1d;
279 const int Q1D = T_Q1D ? T_Q1D : q1d;
280 const int max_q1d = T_Q1D ? T_Q1D : DeviceDofQuadLimits::Get().MAX_Q1D;
281 const int max_d1d = T_D1D ? T_D1D : DeviceDofQuadLimits::Get().MAX_D1D;
282 MFEM_VERIFY(D1D <= max_d1d, "");
283 MFEM_VERIFY(Q1D <= max_q1d, "");
284 auto B = Reshape(b.Read(), Q1D, D1D);
285 auto G = Reshape(g.Read(), Q1D, D1D);
286 auto Q = Reshape(d.Read(), Q1D*Q1D*Q1D, symmetric ? 6 : 9, NE);
287 auto Y = Reshape(y.ReadWrite(), D1D, D1D, D1D, NE);
288 mfem::forall(NE, [=] MFEM_HOST_DEVICE (int e)
289 {
290 const int D1D = T_D1D ? T_D1D : d1d;
291 const int Q1D = T_Q1D ? T_Q1D : q1d;
292 constexpr int MD1 = T_D1D ? T_D1D : DofQuadLimits::MAX_D1D;
293 constexpr int MQ1 = T_Q1D ? T_Q1D : DofQuadLimits::MAX_Q1D;
294 real_t QQD[MQ1][MQ1][MD1];
295 real_t QDD[MQ1][MD1][MD1];
296 for (int i = 0; i < DIM; ++i)
297 {
298 for (int j = 0; j < DIM; ++j)
299 {
300 // first tensor contraction, along z direction
301 for (int qx = 0; qx < Q1D; ++qx)
302 {
303 for (int qy = 0; qy < Q1D; ++qy)
304 {
305 for (int dz = 0; dz < D1D; ++dz)
306 {
307 QQD[qx][qy][dz] = 0.0;
308 for (int qz = 0; qz < Q1D; ++qz)
309 {
310 const int q = qx + (qy + qz * Q1D) * Q1D;
311 const int ksym = j >= i ?
312 3 - (3-i)*(2-i)/2 + j:
313 3 - (3-j)*(2-j)/2 + i;
314 const int k = symmetric ? ksym : (i*DIM) + j;
315 const real_t O = Q(q,k,e);
316 const real_t Bz = B(qz,dz);
317 const real_t Gz = G(qz,dz);
318 const real_t L = i==2 ? Gz : Bz;
319 const real_t R = j==2 ? Gz : Bz;
320 QQD[qx][qy][dz] += L * O * R;
321 }
322 }
323 }
324 }
325 // second tensor contraction, along y direction
326 for (int qx = 0; qx < Q1D; ++qx)
327 {
328 for (int dz = 0; dz < D1D; ++dz)
329 {
330 for (int dy = 0; dy < D1D; ++dy)
331 {
332 QDD[qx][dy][dz] = 0.0;
333 for (int qy = 0; qy < Q1D; ++qy)
334 {
335 const real_t By = B(qy,dy);
336 const real_t Gy = G(qy,dy);
337 const real_t L = i==1 ? Gy : By;
338 const real_t R = j==1 ? Gy : By;
339 QDD[qx][dy][dz] += L * QQD[qx][qy][dz] * R;
340 }
341 }
342 }
343 }
344 // third tensor contraction, along x direction
345 for (int dz = 0; dz < D1D; ++dz)
346 {
347 for (int dy = 0; dy < D1D; ++dy)
348 {
349 for (int dx = 0; dx < D1D; ++dx)
350 {
351 for (int qx = 0; qx < Q1D; ++qx)
352 {
353 const real_t Bx = B(qx,dx);
354 const real_t Gx = G(qx,dx);
355 const real_t L = i==0 ? Gx : Bx;
356 const real_t R = j==0 ? Gx : Bx;
357 Y(dx, dy, dz, e) += L * QDD[qx][dy][dz] * R;
358 }
359 }
360 }
361 }
362 }
363 }
364 });
365}
366
367// Shared memory PA Diffusion Diagonal 3D kernel
368template<int T_D1D = 0, int T_Q1D = 0>
369inline void SmemPADiffusionDiagonal3D(const int NE,
370 const bool symmetric,
371 const Array<real_t> &b_,
372 const Array<real_t> &g_,
373 const Vector &d_,
374 Vector &y_,
375 const int d1d = 0,
376 const int q1d = 0)
377{
378 constexpr int DIM = 3;
379 const int D1D = T_D1D ? T_D1D : d1d;
380 const int Q1D = T_Q1D ? T_Q1D : q1d;
381 const int max_q1d = T_Q1D ? T_Q1D : DeviceDofQuadLimits::Get().MAX_Q1D;
382 const int max_d1d = T_D1D ? T_D1D : DeviceDofQuadLimits::Get().MAX_D1D;
383 MFEM_VERIFY(D1D <= max_d1d, "");
384 MFEM_VERIFY(Q1D <= max_q1d, "");
385 auto b = Reshape(b_.Read(), Q1D, D1D);
386 auto g = Reshape(g_.Read(), Q1D, D1D);
387 auto D = Reshape(d_.Read(), Q1D*Q1D*Q1D, symmetric ? 6 : 9, NE);
388 auto Y = Reshape(y_.ReadWrite(), D1D, D1D, D1D, NE);
389 mfem::forall_3D(NE, Q1D, Q1D, Q1D, [=] MFEM_HOST_DEVICE (int e)
390 {
391 const int tidz = MFEM_THREAD_ID(z);
392 const int D1D = T_D1D ? T_D1D : d1d;
393 const int Q1D = T_Q1D ? T_Q1D : q1d;
394 constexpr int MQ1 = T_Q1D ? T_Q1D : DofQuadLimits::MAX_Q1D;
395 constexpr int MD1 = T_D1D ? T_D1D : DofQuadLimits::MAX_D1D;
396 MFEM_SHARED real_t BG[2][MQ1*MD1];
397 real_t (*B)[MD1] = (real_t (*)[MD1]) (BG+0);
398 real_t (*G)[MD1] = (real_t (*)[MD1]) (BG+1);
399 MFEM_SHARED real_t QQD[MQ1][MQ1][MD1];
400 MFEM_SHARED real_t QDD[MQ1][MD1][MD1];
401 if (tidz == 0)
402 {
403 MFEM_FOREACH_THREAD(d,y,D1D)
404 {
405 MFEM_FOREACH_THREAD(q,x,Q1D)
406 {
407 B[q][d] = b(q,d);
408 G[q][d] = g(q,d);
409 }
410 }
411 }
412 MFEM_SYNC_THREAD;
413 for (int i = 0; i < DIM; ++i)
414 {
415 for (int j = 0; j < DIM; ++j)
416 {
417 // first tensor contraction, along z direction
418 MFEM_FOREACH_THREAD(qx,x,Q1D)
419 {
420 MFEM_FOREACH_THREAD(qy,y,Q1D)
421 {
422 MFEM_FOREACH_THREAD(dz,z,D1D)
423 {
424 QQD[qx][qy][dz] = 0.0;
425 for (int qz = 0; qz < Q1D; ++qz)
426 {
427 const int q = qx + (qy + qz * Q1D) * Q1D;
428 const int ksym = j >= i ?
429 3 - (3-i)*(2-i)/2 + j:
430 3 - (3-j)*(2-j)/2 + i;
431 const int k = symmetric ? ksym : (i*DIM) + j;
432 const real_t O = D(q,k,e);
433 const real_t Bz = B[qz][dz];
434 const real_t Gz = G[qz][dz];
435 const real_t L = i==2 ? Gz : Bz;
436 const real_t R = j==2 ? Gz : Bz;
437 QQD[qx][qy][dz] += L * O * R;
438 }
439 }
440 }
441 }
442 MFEM_SYNC_THREAD;
443 // second tensor contraction, along y direction
444 MFEM_FOREACH_THREAD(qx,x,Q1D)
445 {
446 MFEM_FOREACH_THREAD(dz,z,D1D)
447 {
448 MFEM_FOREACH_THREAD(dy,y,D1D)
449 {
450 QDD[qx][dy][dz] = 0.0;
451 for (int qy = 0; qy < Q1D; ++qy)
452 {
453 const real_t By = B[qy][dy];
454 const real_t Gy = G[qy][dy];
455 const real_t L = i==1 ? Gy : By;
456 const real_t R = j==1 ? Gy : By;
457 QDD[qx][dy][dz] += L * QQD[qx][qy][dz] * R;
458 }
459 }
460 }
461 }
462 MFEM_SYNC_THREAD;
463 // third tensor contraction, along x direction
464 MFEM_FOREACH_THREAD(dz,z,D1D)
465 {
466 MFEM_FOREACH_THREAD(dy,y,D1D)
467 {
468 MFEM_FOREACH_THREAD(dx,x,D1D)
469 {
470 for (int qx = 0; qx < Q1D; ++qx)
471 {
472 const real_t Bx = B[qx][dx];
473 const real_t Gx = G[qx][dx];
474 const real_t L = i==0 ? Gx : Bx;
475 const real_t R = j==0 ? Gx : Bx;
476 Y(dx, dy, dz, e) += L * QDD[qx][dy][dz] * R;
477 }
478 }
479 }
480 }
481 }
482 }
483 });
484}
485
486#ifdef MFEM_USE_OCCA
487// OCCA PA Diffusion Apply 2D kernel
488void OccaPADiffusionApply2D(const int D1D,
489 const int Q1D,
490 const int NE,
491 const Array<real_t> &B,
492 const Array<real_t> &G,
493 const Array<real_t> &Bt,
494 const Array<real_t> &Gt,
495 const Vector &D,
496 const Vector &X,
497 Vector &Y);
498
499// OCCA PA Diffusion Apply 3D kernel
500void OccaPADiffusionApply3D(const int D1D,
501 const int Q1D,
502 const int NE,
503 const Array<real_t> &B,
504 const Array<real_t> &G,
505 const Array<real_t> &Bt,
506 const Array<real_t> &Gt,
507 const Vector &D,
508 const Vector &X,
509 Vector &Y);
510#endif // MFEM_USE_OCCA
511
512// PA Diffusion Apply 2D kernel
513template<int T_D1D = 0, int T_Q1D = 0>
514inline void PADiffusionApply2D(const int NE,
515 const bool symmetric,
516 const Array<real_t> &b_,
517 const Array<real_t> &g_,
518 const Array<real_t> &bt_,
519 const Array<real_t> &gt_,
520 const Vector &d_,
521 const Vector &x_,
522 Vector &y_,
523 const int d1d = 0,
524 const int q1d = 0)
525{
526 const int D1D = T_D1D ? T_D1D : d1d;
527 const int Q1D = T_Q1D ? T_Q1D : q1d;
528 MFEM_VERIFY(D1D <= DeviceDofQuadLimits::Get().MAX_D1D, "");
529 MFEM_VERIFY(Q1D <= DeviceDofQuadLimits::Get().MAX_Q1D, "");
530 auto B = Reshape(b_.Read(), Q1D, D1D);
531 auto G = Reshape(g_.Read(), Q1D, D1D);
532 auto Bt = Reshape(bt_.Read(), D1D, Q1D);
533 auto Gt = Reshape(gt_.Read(), D1D, Q1D);
534 auto D = Reshape(d_.Read(), Q1D*Q1D, symmetric ? 3 : 4, NE);
535 auto X = Reshape(x_.Read(), D1D, D1D, NE);
536 auto Y = Reshape(y_.ReadWrite(), D1D, D1D, NE);
537 mfem::forall(NE, [=] MFEM_HOST_DEVICE (int e)
538 {
539 const int D1D = T_D1D ? T_D1D : d1d;
540 const int Q1D = T_Q1D ? T_Q1D : q1d;
541 // the following variables are evaluated at compile time
542 constexpr int max_D1D = T_D1D ? T_D1D : DofQuadLimits::MAX_D1D;
543 constexpr int max_Q1D = T_Q1D ? T_Q1D : DofQuadLimits::MAX_Q1D;
544
545 real_t grad[max_Q1D][max_Q1D][2];
546 for (int qy = 0; qy < Q1D; ++qy)
547 {
548 for (int qx = 0; qx < Q1D; ++qx)
549 {
550 grad[qy][qx][0] = 0.0;
551 grad[qy][qx][1] = 0.0;
552 }
553 }
554 for (int dy = 0; dy < D1D; ++dy)
555 {
556 real_t gradX[max_Q1D][2];
557 for (int qx = 0; qx < Q1D; ++qx)
558 {
559 gradX[qx][0] = 0.0;
560 gradX[qx][1] = 0.0;
561 }
562 for (int dx = 0; dx < D1D; ++dx)
563 {
564 const real_t s = X(dx,dy,e);
565 for (int qx = 0; qx < Q1D; ++qx)
566 {
567 gradX[qx][0] += s * B(qx,dx);
568 gradX[qx][1] += s * G(qx,dx);
569 }
570 }
571 for (int qy = 0; qy < Q1D; ++qy)
572 {
573 const real_t wy = B(qy,dy);
574 const real_t wDy = G(qy,dy);
575 for (int qx = 0; qx < Q1D; ++qx)
576 {
577 grad[qy][qx][0] += gradX[qx][1] * wy;
578 grad[qy][qx][1] += gradX[qx][0] * wDy;
579 }
580 }
581 }
582 // Calculate Dxy, xDy in plane
583 for (int qy = 0; qy < Q1D; ++qy)
584 {
585 for (int qx = 0; qx < Q1D; ++qx)
586 {
587 const int q = qx + qy * Q1D;
588
589 const real_t O11 = D(q,0,e);
590 const real_t O21 = D(q,1,e);
591 const real_t O12 = symmetric ? O21 : D(q,2,e);
592 const real_t O22 = symmetric ? D(q,2,e) : D(q,3,e);
593
594 const real_t gradX = grad[qy][qx][0];
595 const real_t gradY = grad[qy][qx][1];
596
597 grad[qy][qx][0] = (O11 * gradX) + (O12 * gradY);
598 grad[qy][qx][1] = (O21 * gradX) + (O22 * gradY);
599 }
600 }
601 for (int qy = 0; qy < Q1D; ++qy)
602 {
603 real_t gradX[max_D1D][2];
604 for (int dx = 0; dx < D1D; ++dx)
605 {
606 gradX[dx][0] = 0;
607 gradX[dx][1] = 0;
608 }
609 for (int qx = 0; qx < Q1D; ++qx)
610 {
611 const real_t gX = grad[qy][qx][0];
612 const real_t gY = grad[qy][qx][1];
613 for (int dx = 0; dx < D1D; ++dx)
614 {
615 const real_t wx = Bt(dx,qx);
616 const real_t wDx = Gt(dx,qx);
617 gradX[dx][0] += gX * wDx;
618 gradX[dx][1] += gY * wx;
619 }
620 }
621 for (int dy = 0; dy < D1D; ++dy)
622 {
623 const real_t wy = Bt(dy,qy);
624 const real_t wDy = Gt(dy,qy);
625 for (int dx = 0; dx < D1D; ++dx)
626 {
627 Y(dx,dy,e) += ((gradX[dx][0] * wy) + (gradX[dx][1] * wDy));
628 }
629 }
630 }
631 });
632}
633
634// Shared memory PA Diffusion Apply 2D kernel
635template<int T_D1D = 0, int T_Q1D = 0>
636inline void SmemPADiffusionApply2D(const int NE,
637 const bool symmetric,
638 const Array<real_t> &b_,
639 const Array<real_t> &g_,
640 const Array<real_t> &bt_,
641 const Array<real_t> &gt_,
642 const Vector &d_,
643 const Vector &x_,
644 Vector &y_,
645 const int d1d = 0,
646 const int q1d = 0)
647{
648 static constexpr int T_NBZ = diffusion::NBZApply(T_D1D);
649 static constexpr int NBZ = T_NBZ ? T_NBZ : 1;
650 const int D1D = T_D1D ? T_D1D : d1d;
651 const int Q1D = T_Q1D ? T_Q1D : q1d;
652 const int max_q1d = T_Q1D ? T_Q1D : DeviceDofQuadLimits::Get().MAX_Q1D;
653 const int max_d1d = T_D1D ? T_D1D : DeviceDofQuadLimits::Get().MAX_D1D;
654 MFEM_VERIFY(D1D <= max_d1d, "");
655 MFEM_VERIFY(Q1D <= max_q1d, "");
656 auto b = Reshape(b_.Read(), Q1D, D1D);
657 auto g = Reshape(g_.Read(), Q1D, D1D);
658 auto D = Reshape(d_.Read(), Q1D*Q1D, symmetric ? 3 : 4, NE);
659 auto x = Reshape(x_.Read(), D1D, D1D, NE);
660 auto Y = Reshape(y_.ReadWrite(), D1D, D1D, NE);
661 mfem::forall_2D_batch(NE, Q1D, Q1D, NBZ, [=] MFEM_HOST_DEVICE(int e)
662 {
663 const int tidz = MFEM_THREAD_ID(z);
664 const int D1D = T_D1D ? T_D1D : d1d;
665 const int Q1D = T_Q1D ? T_Q1D : q1d;
666 constexpr int MQ1 = T_Q1D ? T_Q1D : DofQuadLimits::MAX_Q1D;
667 constexpr int MD1 = T_D1D ? T_D1D : DofQuadLimits::MAX_D1D;
668 MFEM_SHARED real_t sBG[2][MQ1*MD1];
669 real_t (*B)[MD1] = (real_t (*)[MD1]) (sBG+0);
670 real_t (*G)[MD1] = (real_t (*)[MD1]) (sBG+1);
671 real_t (*Bt)[MQ1] = (real_t (*)[MQ1]) (sBG+0);
672 real_t (*Gt)[MQ1] = (real_t (*)[MQ1]) (sBG+1);
673 MFEM_SHARED real_t Xz[NBZ][MD1][MD1];
674 MFEM_SHARED real_t GD[2][NBZ][MD1][MQ1];
675 MFEM_SHARED real_t GQ[2][NBZ][MQ1][MQ1];
676 real_t (*X)[MD1] = (real_t (*)[MD1])(Xz + tidz);
677 real_t (*DQ0)[MQ1] = (real_t (*)[MQ1])(GD[0] + tidz);
678 real_t (*DQ1)[MQ1] = (real_t (*)[MQ1])(GD[1] + tidz);
679 real_t (*QQ0)[MQ1] = (real_t (*)[MQ1])(GQ[0] + tidz);
680 real_t (*QQ1)[MQ1] = (real_t (*)[MQ1])(GQ[1] + tidz);
681 MFEM_FOREACH_THREAD(dy,y,D1D)
682 {
683 MFEM_FOREACH_THREAD(dx,x,D1D)
684 {
685 X[dy][dx] = x(dx,dy,e);
686 }
687 }
688 if (tidz == 0)
689 {
690 MFEM_FOREACH_THREAD(dy,y,D1D)
691 {
692 MFEM_FOREACH_THREAD(q,x,Q1D)
693 {
694 B[q][dy] = b(q,dy);
695 G[q][dy] = g(q,dy);
696 }
697 }
698 }
699 MFEM_SYNC_THREAD;
700 MFEM_FOREACH_THREAD(dy,y,D1D)
701 {
702 MFEM_FOREACH_THREAD(qx,x,Q1D)
703 {
704 real_t u = 0.0;
705 real_t v = 0.0;
706 for (int dx = 0; dx < D1D; ++dx)
707 {
708 const real_t coords = X[dy][dx];
709 u += B[qx][dx] * coords;
710 v += G[qx][dx] * coords;
711 }
712 DQ0[dy][qx] = u;
713 DQ1[dy][qx] = v;
714 }
715 }
716 MFEM_SYNC_THREAD;
717 MFEM_FOREACH_THREAD(qy,y,Q1D)
718 {
719 MFEM_FOREACH_THREAD(qx,x,Q1D)
720 {
721 real_t u = 0.0;
722 real_t v = 0.0;
723 for (int dy = 0; dy < D1D; ++dy)
724 {
725 u += DQ1[dy][qx] * B[qy][dy];
726 v += DQ0[dy][qx] * G[qy][dy];
727 }
728 QQ0[qy][qx] = u;
729 QQ1[qy][qx] = v;
730 }
731 }
732 MFEM_SYNC_THREAD;
733 MFEM_FOREACH_THREAD(qy,y,Q1D)
734 {
735 MFEM_FOREACH_THREAD(qx,x,Q1D)
736 {
737 const int q = (qx + ((qy) * Q1D));
738 const real_t O11 = D(q,0,e);
739 const real_t O21 = D(q,1,e);
740 const real_t O12 = symmetric ? O21 : D(q,2,e);
741 const real_t O22 = symmetric ? D(q,2,e) : D(q,3,e);
742 const real_t gX = QQ0[qy][qx];
743 const real_t gY = QQ1[qy][qx];
744 QQ0[qy][qx] = (O11 * gX) + (O12 * gY);
745 QQ1[qy][qx] = (O21 * gX) + (O22 * gY);
746 }
747 }
748 MFEM_SYNC_THREAD;
749 if (tidz == 0)
750 {
751 MFEM_FOREACH_THREAD(dy,y,D1D)
752 {
753 MFEM_FOREACH_THREAD(q,x,Q1D)
754 {
755 Bt[dy][q] = b(q,dy);
756 Gt[dy][q] = g(q,dy);
757 }
758 }
759 }
760 MFEM_SYNC_THREAD;
761 MFEM_FOREACH_THREAD(qy,y,Q1D)
762 {
763 MFEM_FOREACH_THREAD(dx,x,D1D)
764 {
765 real_t u = 0.0;
766 real_t v = 0.0;
767 for (int qx = 0; qx < Q1D; ++qx)
768 {
769 u += Gt[dx][qx] * QQ0[qy][qx];
770 v += Bt[dx][qx] * QQ1[qy][qx];
771 }
772 DQ0[dx][qy] = u;
773 DQ1[dx][qy] = v;
774 }
775 }
776 MFEM_SYNC_THREAD;
777 MFEM_FOREACH_THREAD(dy,y,D1D)
778 {
779 MFEM_FOREACH_THREAD(dx,x,D1D)
780 {
781 real_t u = 0.0;
782 real_t v = 0.0;
783 for (int qy = 0; qy < Q1D; ++qy)
784 {
785 u += DQ0[dx][qy] * Bt[dy][qy];
786 v += DQ1[dx][qy] * Gt[dy][qy];
787 }
788 Y(dx,dy,e) += (u + v);
789 }
790 }
791 });
792}
793
794// PA Diffusion Apply 3D kernel
795template<int T_D1D = 0, int T_Q1D = 0>
796inline void PADiffusionApply3D(const int NE,
797 const bool symmetric,
798 const Array<real_t> &b,
799 const Array<real_t> &g,
800 const Array<real_t> &bt,
801 const Array<real_t> &gt,
802 const Vector &d_,
803 const Vector &x_,
804 Vector &y_,
805 int d1d = 0, int q1d = 0)
806{
807 const int D1D = T_D1D ? T_D1D : d1d;
808 const int Q1D = T_Q1D ? T_Q1D : q1d;
809 MFEM_VERIFY(D1D <= DeviceDofQuadLimits::Get().MAX_D1D, "");
810 MFEM_VERIFY(Q1D <= DeviceDofQuadLimits::Get().MAX_Q1D, "");
811 auto B = Reshape(b.Read(), Q1D, D1D);
812 auto G = Reshape(g.Read(), Q1D, D1D);
813 auto Bt = Reshape(bt.Read(), D1D, Q1D);
814 auto Gt = Reshape(gt.Read(), D1D, Q1D);
815 auto D = Reshape(d_.Read(), Q1D*Q1D*Q1D, symmetric ? 6 : 9, NE);
816 auto X = Reshape(x_.Read(), D1D, D1D, D1D, NE);
817 auto Y = Reshape(y_.ReadWrite(), D1D, D1D, D1D, NE);
818 mfem::forall(NE, [=] MFEM_HOST_DEVICE (int e)
819 {
820 const int D1D = T_D1D ? T_D1D : d1d;
821 const int Q1D = T_Q1D ? T_Q1D : q1d;
822 constexpr int max_D1D = T_D1D ? T_D1D : DofQuadLimits::MAX_D1D;
823 constexpr int max_Q1D = T_Q1D ? T_Q1D : DofQuadLimits::MAX_Q1D;
824 real_t grad[max_Q1D][max_Q1D][max_Q1D][3];
825 for (int qz = 0; qz < Q1D; ++qz)
826 {
827 for (int qy = 0; qy < Q1D; ++qy)
828 {
829 for (int qx = 0; qx < Q1D; ++qx)
830 {
831 grad[qz][qy][qx][0] = 0.0;
832 grad[qz][qy][qx][1] = 0.0;
833 grad[qz][qy][qx][2] = 0.0;
834 }
835 }
836 }
837 for (int dz = 0; dz < D1D; ++dz)
838 {
839 real_t gradXY[max_Q1D][max_Q1D][3];
840 for (int qy = 0; qy < Q1D; ++qy)
841 {
842 for (int qx = 0; qx < Q1D; ++qx)
843 {
844 gradXY[qy][qx][0] = 0.0;
845 gradXY[qy][qx][1] = 0.0;
846 gradXY[qy][qx][2] = 0.0;
847 }
848 }
849 for (int dy = 0; dy < D1D; ++dy)
850 {
851 real_t gradX[max_Q1D][2];
852 for (int qx = 0; qx < Q1D; ++qx)
853 {
854 gradX[qx][0] = 0.0;
855 gradX[qx][1] = 0.0;
856 }
857 for (int dx = 0; dx < D1D; ++dx)
858 {
859 const real_t s = X(dx,dy,dz,e);
860 for (int qx = 0; qx < Q1D; ++qx)
861 {
862 gradX[qx][0] += s * B(qx,dx);
863 gradX[qx][1] += s * G(qx,dx);
864 }
865 }
866 for (int qy = 0; qy < Q1D; ++qy)
867 {
868 const real_t wy = B(qy,dy);
869 const real_t wDy = G(qy,dy);
870 for (int qx = 0; qx < Q1D; ++qx)
871 {
872 const real_t wx = gradX[qx][0];
873 const real_t wDx = gradX[qx][1];
874 gradXY[qy][qx][0] += wDx * wy;
875 gradXY[qy][qx][1] += wx * wDy;
876 gradXY[qy][qx][2] += wx * wy;
877 }
878 }
879 }
880 for (int qz = 0; qz < Q1D; ++qz)
881 {
882 const real_t wz = B(qz,dz);
883 const real_t wDz = G(qz,dz);
884 for (int qy = 0; qy < Q1D; ++qy)
885 {
886 for (int qx = 0; qx < Q1D; ++qx)
887 {
888 grad[qz][qy][qx][0] += gradXY[qy][qx][0] * wz;
889 grad[qz][qy][qx][1] += gradXY[qy][qx][1] * wz;
890 grad[qz][qy][qx][2] += gradXY[qy][qx][2] * wDz;
891 }
892 }
893 }
894 }
895 // Calculate Dxyz, xDyz, xyDz in plane
896 for (int qz = 0; qz < Q1D; ++qz)
897 {
898 for (int qy = 0; qy < Q1D; ++qy)
899 {
900 for (int qx = 0; qx < Q1D; ++qx)
901 {
902 const int q = qx + (qy + qz * Q1D) * Q1D;
903 const real_t O11 = D(q,0,e);
904 const real_t O12 = D(q,1,e);
905 const real_t O13 = D(q,2,e);
906 const real_t O21 = symmetric ? O12 : D(q,3,e);
907 const real_t O22 = symmetric ? D(q,3,e) : D(q,4,e);
908 const real_t O23 = symmetric ? D(q,4,e) : D(q,5,e);
909 const real_t O31 = symmetric ? O13 : D(q,6,e);
910 const real_t O32 = symmetric ? O23 : D(q,7,e);
911 const real_t O33 = symmetric ? D(q,5,e) : D(q,8,e);
912 const real_t gradX = grad[qz][qy][qx][0];
913 const real_t gradY = grad[qz][qy][qx][1];
914 const real_t gradZ = grad[qz][qy][qx][2];
915 grad[qz][qy][qx][0] = (O11*gradX)+(O12*gradY)+(O13*gradZ);
916 grad[qz][qy][qx][1] = (O21*gradX)+(O22*gradY)+(O23*gradZ);
917 grad[qz][qy][qx][2] = (O31*gradX)+(O32*gradY)+(O33*gradZ);
918 }
919 }
920 }
921 for (int qz = 0; qz < Q1D; ++qz)
922 {
923 real_t gradXY[max_D1D][max_D1D][3];
924 for (int dy = 0; dy < D1D; ++dy)
925 {
926 for (int dx = 0; dx < D1D; ++dx)
927 {
928 gradXY[dy][dx][0] = 0;
929 gradXY[dy][dx][1] = 0;
930 gradXY[dy][dx][2] = 0;
931 }
932 }
933 for (int qy = 0; qy < Q1D; ++qy)
934 {
935 real_t gradX[max_D1D][3];
936 for (int dx = 0; dx < D1D; ++dx)
937 {
938 gradX[dx][0] = 0;
939 gradX[dx][1] = 0;
940 gradX[dx][2] = 0;
941 }
942 for (int qx = 0; qx < Q1D; ++qx)
943 {
944 const real_t gX = grad[qz][qy][qx][0];
945 const real_t gY = grad[qz][qy][qx][1];
946 const real_t gZ = grad[qz][qy][qx][2];
947 for (int dx = 0; dx < D1D; ++dx)
948 {
949 const real_t wx = Bt(dx,qx);
950 const real_t wDx = Gt(dx,qx);
951 gradX[dx][0] += gX * wDx;
952 gradX[dx][1] += gY * wx;
953 gradX[dx][2] += gZ * wx;
954 }
955 }
956 for (int dy = 0; dy < D1D; ++dy)
957 {
958 const real_t wy = Bt(dy,qy);
959 const real_t wDy = Gt(dy,qy);
960 for (int dx = 0; dx < D1D; ++dx)
961 {
962 gradXY[dy][dx][0] += gradX[dx][0] * wy;
963 gradXY[dy][dx][1] += gradX[dx][1] * wDy;
964 gradXY[dy][dx][2] += gradX[dx][2] * wy;
965 }
966 }
967 }
968 for (int dz = 0; dz < D1D; ++dz)
969 {
970 const real_t wz = Bt(dz,qz);
971 const real_t wDz = Gt(dz,qz);
972 for (int dy = 0; dy < D1D; ++dy)
973 {
974 for (int dx = 0; dx < D1D; ++dx)
975 {
976 Y(dx,dy,dz,e) +=
977 ((gradXY[dy][dx][0] * wz) +
978 (gradXY[dy][dx][1] * wz) +
979 (gradXY[dy][dx][2] * wDz));
980 }
981 }
982 }
983 }
984 });
985}
986
987// Shared memory PA Diffusion Apply 3D kernel
988template<int T_D1D = 0, int T_Q1D = 0>
989inline void SmemPADiffusionApply3D(const int NE,
990 const bool symmetric,
991 const Array<real_t> &b_,
992 const Array<real_t> &g_,
993 const Array<real_t> &,
994 const Array<real_t> &,
995 const Vector &d_,
996 const Vector &x_,
997 Vector &y_,
998 const int d1d = 0,
999 const int q1d = 0)
1000{
1001 const int D1D = T_D1D ? T_D1D : d1d;
1002 const int Q1D = T_Q1D ? T_Q1D : q1d;
1003 const int max_q1d = T_Q1D ? T_Q1D : DeviceDofQuadLimits::Get().MAX_Q1D;
1004 const int max_d1d = T_D1D ? T_D1D : DeviceDofQuadLimits::Get().MAX_D1D;
1005 MFEM_VERIFY(D1D <= max_d1d, "");
1006 MFEM_VERIFY(Q1D <= max_q1d, "");
1007 auto b = Reshape(b_.Read(), Q1D, D1D);
1008 auto g = Reshape(g_.Read(), Q1D, D1D);
1009 auto d = Reshape(d_.Read(), Q1D, Q1D, Q1D, symmetric ? 6 : 9, NE);
1010 auto x = Reshape(x_.Read(), D1D, D1D, D1D, NE);
1011 auto y = Reshape(y_.ReadWrite(), D1D, D1D, D1D, NE);
1012 MFEM_VERIFY(D1D <= Q1D, "THREAD_DIRECT requires D1D <= Q1D");
1013 mfem::forall_3D(NE, Q1D, Q1D, Q1D, [=] MFEM_HOST_DEVICE (int e)
1014 {
1015 const int D1D = T_D1D ? T_D1D : d1d;
1016 const int Q1D = T_Q1D ? T_Q1D : q1d;
1017 constexpr int MQ1 = T_Q1D ? T_Q1D : DofQuadLimits::MAX_Q1D;
1018 constexpr int MD1 = T_D1D ? T_D1D : DofQuadLimits::MAX_D1D;
1019 constexpr int MDQ = (MQ1 > MD1) ? MQ1 : MD1;
1020 MFEM_SHARED real_t sBG[2][MQ1*MD1];
1021 real_t (*B)[MD1] = (real_t (*)[MD1]) (sBG+0);
1022 real_t (*G)[MD1] = (real_t (*)[MD1]) (sBG+1);
1023 real_t (*Bt)[MQ1] = (real_t (*)[MQ1]) (sBG+0);
1024 real_t (*Gt)[MQ1] = (real_t (*)[MQ1]) (sBG+1);
1025 MFEM_SHARED real_t sm0[3][MDQ*MDQ*MDQ];
1026 MFEM_SHARED real_t sm1[3][MDQ*MDQ*MDQ];
1027 real_t (*X)[MD1][MD1] = (real_t (*)[MD1][MD1]) (sm0+2);
1028 real_t (*DDQ0)[MD1][MQ1] = (real_t (*)[MD1][MQ1]) (sm0+0);
1029 real_t (*DDQ1)[MD1][MQ1] = (real_t (*)[MD1][MQ1]) (sm0+1);
1030 real_t (*DQQ0)[MQ1][MQ1] = (real_t (*)[MQ1][MQ1]) (sm1+0);
1031 real_t (*DQQ1)[MQ1][MQ1] = (real_t (*)[MQ1][MQ1]) (sm1+1);
1032 real_t (*DQQ2)[MQ1][MQ1] = (real_t (*)[MQ1][MQ1]) (sm1+2);
1033 real_t (*QQQ0)[MQ1][MQ1] = (real_t (*)[MQ1][MQ1]) (sm0+0);
1034 real_t (*QQQ1)[MQ1][MQ1] = (real_t (*)[MQ1][MQ1]) (sm0+1);
1035 real_t (*QQQ2)[MQ1][MQ1] = (real_t (*)[MQ1][MQ1]) (sm0+2);
1036 real_t (*QQD0)[MQ1][MD1] = (real_t (*)[MQ1][MD1]) (sm1+0);
1037 real_t (*QQD1)[MQ1][MD1] = (real_t (*)[MQ1][MD1]) (sm1+1);
1038 real_t (*QQD2)[MQ1][MD1] = (real_t (*)[MQ1][MD1]) (sm1+2);
1039 real_t (*QDD0)[MD1][MD1] = (real_t (*)[MD1][MD1]) (sm0+0);
1040 real_t (*QDD1)[MD1][MD1] = (real_t (*)[MD1][MD1]) (sm0+1);
1041 real_t (*QDD2)[MD1][MD1] = (real_t (*)[MD1][MD1]) (sm0+2);
1042 MFEM_FOREACH_THREAD_DIRECT(dz,z,D1D)
1043 {
1044 MFEM_FOREACH_THREAD_DIRECT(dy,y,D1D)
1045 {
1046 MFEM_FOREACH_THREAD_DIRECT(dx,x,D1D)
1047 {
1048 X[dz][dy][dx] = x(dx,dy,dz,e);
1049 }
1050 }
1051 }
1052 if (MFEM_THREAD_ID(z) == 0)
1053 {
1054 MFEM_FOREACH_THREAD_DIRECT(dy,y,D1D)
1055 {
1056 MFEM_FOREACH_THREAD_DIRECT(qx,x,Q1D)
1057 {
1058 B[qx][dy] = b(qx,dy);
1059 G[qx][dy] = g(qx,dy);
1060 }
1061 }
1062 }
1063 MFEM_SYNC_THREAD;
1064 MFEM_FOREACH_THREAD_DIRECT(dz,z,D1D)
1065 {
1066 MFEM_FOREACH_THREAD_DIRECT(dy,y,D1D)
1067 {
1068 MFEM_FOREACH_THREAD_DIRECT(qx,x,Q1D)
1069 {
1070 real_t u = 0.0, v = 0.0;
1071 MFEM_UNROLL(MD1)
1072 for (int dx = 0; dx < D1D; ++dx)
1073 {
1074 const real_t coords = X[dz][dy][dx];
1075 u += coords * B[qx][dx];
1076 v += coords * G[qx][dx];
1077 }
1078 DDQ0[dz][dy][qx] = u;
1079 DDQ1[dz][dy][qx] = v;
1080 }
1081 }
1082 }
1083 MFEM_SYNC_THREAD;
1084 MFEM_FOREACH_THREAD_DIRECT(dz,z,D1D)
1085 {
1086 MFEM_FOREACH_THREAD_DIRECT(qy,y,Q1D)
1087 {
1088 MFEM_FOREACH_THREAD_DIRECT(qx,x,Q1D)
1089 {
1090 real_t u = 0.0, v = 0.0, w = 0.0;
1091 MFEM_UNROLL(MD1)
1092 for (int dy = 0; dy < D1D; ++dy)
1093 {
1094 u += DDQ1[dz][dy][qx] * B[qy][dy];
1095 v += DDQ0[dz][dy][qx] * G[qy][dy];
1096 w += DDQ0[dz][dy][qx] * B[qy][dy];
1097 }
1098 DQQ0[dz][qy][qx] = u;
1099 DQQ1[dz][qy][qx] = v;
1100 DQQ2[dz][qy][qx] = w;
1101 }
1102 }
1103 }
1104 MFEM_SYNC_THREAD;
1105 MFEM_FOREACH_THREAD_DIRECT(qz,z,Q1D)
1106 {
1107 MFEM_FOREACH_THREAD_DIRECT(qy,y,Q1D)
1108 {
1109 MFEM_FOREACH_THREAD_DIRECT(qx,x,Q1D)
1110 {
1111 real_t u = 0.0, v = 0.0, w = 0.0;
1112 MFEM_UNROLL(MD1)
1113 for (int dz = 0; dz < D1D; ++dz)
1114 {
1115 u += DQQ0[dz][qy][qx] * B[qz][dz];
1116 v += DQQ1[dz][qy][qx] * B[qz][dz];
1117 w += DQQ2[dz][qy][qx] * G[qz][dz];
1118 }
1119 const real_t O11 = d(qx,qy,qz,0,e);
1120 const real_t O12 = d(qx,qy,qz,1,e);
1121 const real_t O13 = d(qx,qy,qz,2,e);
1122 const real_t O21 = symmetric ? O12 : d(qx,qy,qz,3,e);
1123 const real_t O22 = symmetric ? d(qx,qy,qz,3,e) : d(qx,qy,qz,4,e);
1124 const real_t O23 = symmetric ? d(qx,qy,qz,4,e) : d(qx,qy,qz,5,e);
1125 const real_t O31 = symmetric ? O13 : d(qx,qy,qz,6,e);
1126 const real_t O32 = symmetric ? O23 : d(qx,qy,qz,7,e);
1127 const real_t O33 = symmetric ? d(qx,qy,qz,5,e) : d(qx,qy,qz,8,e);
1128 const real_t gX = u;
1129 const real_t gY = v;
1130 const real_t gZ = w;
1131 QQQ0[qz][qy][qx] = (O11*gX) + (O12*gY) + (O13*gZ);
1132 QQQ1[qz][qy][qx] = (O21*gX) + (O22*gY) + (O23*gZ);
1133 QQQ2[qz][qy][qx] = (O31*gX) + (O32*gY) + (O33*gZ);
1134 }
1135 }
1136 }
1137 MFEM_SYNC_THREAD;
1138 if (MFEM_THREAD_ID(z) == 0)
1139 {
1140 MFEM_FOREACH_THREAD_DIRECT(dy,y,D1D)
1141 {
1142 MFEM_FOREACH_THREAD_DIRECT(qx,x,Q1D)
1143 {
1144 Bt[dy][qx] = b(qx,dy);
1145 Gt[dy][qx] = g(qx,dy);
1146 }
1147 }
1148 }
1149 MFEM_SYNC_THREAD;
1150 MFEM_FOREACH_THREAD_DIRECT(qz,z,Q1D)
1151 {
1152 MFEM_FOREACH_THREAD_DIRECT(qy,y,Q1D)
1153 {
1154 MFEM_FOREACH_THREAD_DIRECT(dx,x,D1D)
1155 {
1156 real_t u = 0.0, v = 0.0, w = 0.0;
1157 MFEM_UNROLL(MQ1)
1158 for (int qx = 0; qx < Q1D; ++qx)
1159 {
1160 u += QQQ0[qz][qy][qx] * Gt[dx][qx];
1161 v += QQQ1[qz][qy][qx] * Bt[dx][qx];
1162 w += QQQ2[qz][qy][qx] * Bt[dx][qx];
1163 }
1164 QQD0[qz][qy][dx] = u;
1165 QQD1[qz][qy][dx] = v;
1166 QQD2[qz][qy][dx] = w;
1167 }
1168 }
1169 }
1170 MFEM_SYNC_THREAD;
1171 MFEM_FOREACH_THREAD_DIRECT(qz,z,Q1D)
1172 {
1173 MFEM_FOREACH_THREAD_DIRECT(dy,y,D1D)
1174 {
1175 MFEM_FOREACH_THREAD_DIRECT(dx,x,D1D)
1176 {
1177 real_t u = 0.0, v = 0.0, w = 0.0;
1178 MFEM_UNROLL(Q1D)
1179 for (int qy = 0; qy < Q1D; ++qy)
1180 {
1181 u += QQD0[qz][qy][dx] * Bt[dy][qy];
1182 v += QQD1[qz][qy][dx] * Gt[dy][qy];
1183 w += QQD2[qz][qy][dx] * Bt[dy][qy];
1184 }
1185 QDD0[qz][dy][dx] = u;
1186 QDD1[qz][dy][dx] = v;
1187 QDD2[qz][dy][dx] = w;
1188 }
1189 }
1190 }
1191 MFEM_SYNC_THREAD;
1192 MFEM_FOREACH_THREAD_DIRECT(dz,z,D1D)
1193 {
1194 MFEM_FOREACH_THREAD_DIRECT(dy,y,D1D)
1195 {
1196 MFEM_FOREACH_THREAD_DIRECT(dx,x,D1D)
1197 {
1198 real_t u = 0.0, v = 0.0, w = 0.0;
1199 MFEM_UNROLL(MQ1)
1200 for (int qz = 0; qz < Q1D; ++qz)
1201 {
1202 u += QDD0[qz][dy][dx] * Bt[dz][qz];
1203 v += QDD1[qz][dy][dx] * Bt[dz][qz];
1204 w += QDD2[qz][dy][dx] * Gt[dz][qz];
1205 }
1206 y(dx,dy,dz,e) += (u + v + w);
1207 }
1208 }
1209 }
1210 });
1211}
1212
1213} // namespace internal
1214
1215namespace
1216{
1217using ApplyKernelType = DiffusionIntegrator::ApplyKernelType;
1218using DiagonalKernelType = DiffusionIntegrator::DiagonalKernelType;
1219}
1220
1221template<int DIM, int T_D1D, int T_Q1D>
1222ApplyKernelType DiffusionIntegrator::ApplyPAKernels::Kernel()
1223{
1224 if constexpr (DIM == 2) { return internal::SmemPADiffusionApply2D<T_D1D,T_Q1D>; }
1225 else if constexpr (DIM == 3) { return internal::SmemPADiffusionApply3D<T_D1D, T_Q1D>; }
1226 MFEM_ABORT("");
1227}
1228
1229inline
1230ApplyKernelType DiffusionIntegrator::ApplyPAKernels::Fallback(int DIM, int, int)
1231{
1232 if (DIM == 2) { return internal::PADiffusionApply2D; }
1233 else if (DIM == 3) { return internal::PADiffusionApply3D; }
1234 else { MFEM_ABORT(""); }
1235}
1236
1237template<int DIM, int D1D, int Q1D>
1238DiagonalKernelType DiffusionIntegrator::DiagonalPAKernels::Kernel()
1239{
1240 if constexpr (DIM == 2) { return internal::SmemPADiffusionDiagonal2D<D1D,Q1D>; }
1241 else if constexpr (DIM == 3) { return internal::SmemPADiffusionDiagonal3D<D1D, Q1D>; }
1242 MFEM_ABORT("");
1243}
1244
1245inline DiagonalKernelType
1246DiffusionIntegrator::DiagonalPAKernels::Fallback(int DIM, int, int)
1247{
1248 if (DIM == 2) { return internal::PADiffusionDiagonal2D; }
1249 else if (DIM == 3) { return internal::PADiffusionDiagonal3D; }
1250 else { MFEM_ABORT(""); }
1251}
1252/// \endcond DO_NOT_DOCUMENT
1253
1254} // namespace mfem
1255
1256
1257#endif
void(*)(const int, const bool, const Array< real_t > &, const Array< real_t > &, const Vector &, Vector &, const int, const int) DiagonalKernelType
void(*)(const int, const bool, const Array< real_t > &, const Array< real_t > &, const Array< real_t > &, const Array< real_t > &, const Vector &, const Vector &, Vector &, const int, const int) ApplyKernelType
int dim
Definition ex24.cpp:53
real_t b
Definition lissajous.cpp:42
constexpr int DIM
mfem::real_t real_t
real_t u(const Vector &xvec)
Definition lor_mms.hpp:22
MFEM_HOST_DEVICE DeviceTensor< sizeof...(Dims), T > Reshape(T *ptr, Dims... dims)
Wrap a pointer as a DeviceTensor with automatically deduced template parameters.
Definition dtensor.hpp:138
void forall_2D_batch(int N, int X, int Y, int BZ, lambda &&body)
Definition forall.hpp:931
void forall_3D(int N, int X, int Y, int Z, lambda &&body)
Definition forall.hpp:937
float real_t
Definition config.hpp:46
void forall(int N, lambda &&body)
Definition forall.hpp:839
real_t p(const Vector &x, real_t t)
static const DeviceDofQuadLimits & Get()
Return a const reference to the DeviceDofQuadLimits singleton.
Definition forall.hpp:128
int MAX_D1D
Maximum number of 1D nodal points.
Definition forall.hpp:118
int MAX_Q1D
Maximum number of 1D quadrature points.
Definition forall.hpp:119