MFEM v4.9.0
Finite element discretization library
Loading...
Searching...
No Matches
bilininteg_hcurl_kernels.hpp
Go to the documentation of this file.
1// Copyright (c) 2010-2025, Lawrence Livermore National Security, LLC. Produced
2// at the Lawrence Livermore National Laboratory. All Rights reserved. See files
3// LICENSE and NOTICE for details. LLNL-CODE-806117.
4//
5// This file is part of the MFEM library. For more information and source code
6// availability visit https://mfem.org.
7//
8// MFEM is free software; you can redistribute it and/or modify it under the
9// terms of the BSD-3 license. We welcome feedback and contributions, see file
10// CONTRIBUTING.md for details.
11
12#ifndef MFEM_BILININTEG_HCURL_KERNELS_HPP
13#define MFEM_BILININTEG_HCURL_KERNELS_HPP
14
20#include "../bilininteg.hpp"
21
22// Piola transformation in H(curl): w = dF^{-T} \hat{w}
23// curl w = (1 / det (dF)) dF \hat{curl} \hat{w}
24
25namespace mfem
26{
27/// \cond DO_NOT_DOCUMENT
28namespace internal
29{
30
31// PA H(curl) Mass Diagonal 2D kernel
32void PAHcurlMassAssembleDiagonal2D(const int D1D,
33 const int Q1D,
34 const int NE,
35 const bool symmetric,
36 const Array<real_t> &bo,
37 const Array<real_t> &bc,
38 const Vector &pa_data,
39 Vector &diag);
40
41// PA H(curl) Mass Diagonal 3D kernel
42void PAHcurlMassAssembleDiagonal3D(const int D1D,
43 const int Q1D,
44 const int NE,
45 const bool symmetric,
46 const Array<real_t> &bo,
47 const Array<real_t> &bc,
48 const Vector &pa_data,
49 Vector &diag);
50
51// Shared memory PA H(curl) Mass Diagonal 3D kernel
52template<int T_D1D = 0, int T_Q1D = 0>
53inline void SmemPAHcurlMassAssembleDiagonal3D(const int d1d,
54 const int q1d,
55 const int NE,
56 const bool symmetric,
57 const Array<real_t> &bo,
58 const Array<real_t> &bc,
59 const Vector &pa_data,
60 Vector &diag)
61{
62 MFEM_VERIFY(T_D1D || d1d <= DeviceDofQuadLimits::Get().HCURL_MAX_D1D,
63 "Error: d1d > HCURL_MAX_D1D");
64 MFEM_VERIFY(T_Q1D || q1d <= DeviceDofQuadLimits::Get().HCURL_MAX_Q1D,
65 "Error: q1d > HCURL_MAX_Q1D");
66 const int D1D = T_D1D ? T_D1D : d1d;
67 const int Q1D = T_Q1D ? T_Q1D : q1d;
68
69 auto Bo = Reshape(bo.Read(), Q1D, D1D-1);
70 auto Bc = Reshape(bc.Read(), Q1D, D1D);
71 auto op = Reshape(pa_data.Read(), Q1D, Q1D, Q1D, symmetric ? 6 : 9, NE);
72 auto D = Reshape(diag.ReadWrite(), 3*(D1D-1)*D1D*D1D, NE);
73
74 mfem::forall_3D(NE, Q1D, Q1D, Q1D, [=] MFEM_HOST_DEVICE (int e)
75 {
76 constexpr int VDIM = 3;
77 constexpr int MD1D = T_D1D ? T_D1D : DofQuadLimits::HCURL_MAX_D1D;
78 constexpr int MQ1D = T_Q1D ? T_Q1D : DofQuadLimits::HCURL_MAX_Q1D;
79 const int D1D = T_D1D ? T_D1D : d1d;
80 const int Q1D = T_Q1D ? T_Q1D : q1d;
81
82 MFEM_SHARED real_t sBo[MQ1D][MD1D];
83 MFEM_SHARED real_t sBc[MQ1D][MD1D];
84
85 real_t op3[3];
86 MFEM_SHARED real_t sop[3][MQ1D][MQ1D];
87
88 MFEM_FOREACH_THREAD(qx,x,Q1D)
89 {
90 MFEM_FOREACH_THREAD(qy,y,Q1D)
91 {
92 MFEM_FOREACH_THREAD(qz,z,Q1D)
93 {
94 op3[0] = op(qx,qy,qz,0,e);
95 op3[1] = op(qx,qy,qz,symmetric ? 3 : 4,e);
96 op3[2] = op(qx,qy,qz,symmetric ? 5 : 8,e);
97 }
98 }
99 }
100
101 const int tidx = MFEM_THREAD_ID(x);
102 const int tidy = MFEM_THREAD_ID(y);
103 const int tidz = MFEM_THREAD_ID(z);
104
105 if (tidz == 0)
106 {
107 MFEM_FOREACH_THREAD(d,y,D1D)
108 {
109 MFEM_FOREACH_THREAD(q,x,Q1D)
110 {
111 sBc[q][d] = Bc(q,d);
112 if (d < D1D-1)
113 {
114 sBo[q][d] = Bo(q,d);
115 }
116 }
117 }
118 }
119 MFEM_SYNC_THREAD;
120
121 int osc = 0;
122 for (int c = 0; c < VDIM; ++c) // loop over x, y, z components
123 {
124 const int D1Dz = (c == 2) ? D1D - 1 : D1D;
125 const int D1Dy = (c == 1) ? D1D - 1 : D1D;
126 const int D1Dx = (c == 0) ? D1D - 1 : D1D;
127
128 real_t dxyz = 0.0;
129
130 for (int qz=0; qz < Q1D; ++qz)
131 {
132 if (tidz == qz)
133 {
134 for (int i=0; i<3; ++i)
135 {
136 sop[i][tidx][tidy] = op3[i];
137 }
138 }
139
140 MFEM_SYNC_THREAD;
141
142 MFEM_FOREACH_THREAD(dz,z,D1Dz)
143 {
144 const real_t wz = ((c == 2) ? sBo[qz][dz] : sBc[qz][dz]);
145
146 MFEM_FOREACH_THREAD(dy,y,D1Dy)
147 {
148 MFEM_FOREACH_THREAD(dx,x,D1Dx)
149 {
150 for (int qy = 0; qy < Q1D; ++qy)
151 {
152 const real_t wy = ((c == 1) ? sBo[qy][dy] : sBc[qy][dy]);
153
154 for (int qx = 0; qx < Q1D; ++qx)
155 {
156 const real_t wx = ((c == 0) ? sBo[qx][dx] : sBc[qx][dx]);
157 dxyz += sop[c][qx][qy] * wx * wx * wy * wy * wz * wz;
158 }
159 }
160 }
161 }
162 }
163
164 MFEM_SYNC_THREAD;
165 } // qz loop
166
167 MFEM_FOREACH_THREAD(dz,z,D1Dz)
168 {
169 MFEM_FOREACH_THREAD(dy,y,D1Dy)
170 {
171 MFEM_FOREACH_THREAD(dx,x,D1Dx)
172 {
173 D(dx + ((dy + (dz * D1Dy)) * D1Dx) + osc, e) += dxyz;
174 }
175 }
176 }
177
178 osc += D1Dx * D1Dy * D1Dz;
179 } // c loop
180 }); // end of element loop
181}
182
183// PA H(curl) Mass Apply 2D kernel
184void PAHcurlMassApply2D(const int D1D,
185 const int Q1D,
186 const int NE,
187 const bool symmetric,
188 const Array<real_t> &bo,
189 const Array<real_t> &bc,
190 const Array<real_t> &bot,
191 const Array<real_t> &bct,
192 const Vector &pa_data,
193 const Vector &x,
194 Vector &y);
195
196// PA H(curl) Mass Apply 3D kernel
197void PAHcurlMassApply3D(const int D1D,
198 const int Q1D,
199 const int NE,
200 const bool symmetric,
201 const Array<real_t> &bo,
202 const Array<real_t> &bc,
203 const Array<real_t> &bot,
204 const Array<real_t> &bct,
205 const Vector &pa_data,
206 const Vector &x,
207 Vector &y);
208
209// Shared memory PA H(curl) Mass Apply 3D kernel
210template<int T_D1D = 0, int T_Q1D = 0>
211inline void SmemPAHcurlMassApply3D(const int d1d,
212 const int q1d,
213 const int NE,
214 const bool symmetric,
215 const Array<real_t> &bo,
216 const Array<real_t> &bc,
217 const Array<real_t> &bot,
218 const Array<real_t> &bct,
219 const Vector &pa_data,
220 const Vector &x,
221 Vector &y)
222{
223 MFEM_VERIFY(T_D1D || d1d <= DeviceDofQuadLimits::Get().HCURL_MAX_D1D,
224 "Error: d1d > HCURL_MAX_D1D");
225 MFEM_VERIFY(T_Q1D || q1d <= DeviceDofQuadLimits::Get().HCURL_MAX_Q1D,
226 "Error: q1d > HCURL_MAX_Q1D");
227 const int D1D = T_D1D ? T_D1D : d1d;
228 const int Q1D = T_Q1D ? T_Q1D : q1d;
229
230 const int dataSize = symmetric ? 6 : 9;
231
232 auto Bo = Reshape(bo.Read(), Q1D, D1D-1);
233 auto Bc = Reshape(bc.Read(), Q1D, D1D);
234 auto op = Reshape(pa_data.Read(), Q1D, Q1D, Q1D, dataSize, NE);
235 auto X = Reshape(x.Read(), 3*(D1D-1)*D1D*D1D, NE);
236 auto Y = Reshape(y.ReadWrite(), 3*(D1D-1)*D1D*D1D, NE);
237
238 mfem::forall_3D(NE, Q1D, Q1D, Q1D, [=] MFEM_HOST_DEVICE (int e)
239 {
240 constexpr int VDIM = 3;
241 constexpr int MD1D = T_D1D ? T_D1D : DofQuadLimits::HCURL_MAX_D1D;
242 constexpr int MQ1D = T_Q1D ? T_Q1D : DofQuadLimits::HCURL_MAX_Q1D;
243 const int D1D = T_D1D ? T_D1D : d1d;
244 const int Q1D = T_Q1D ? T_Q1D : q1d;
245
246 MFEM_SHARED real_t sBo[MQ1D][MD1D];
247 MFEM_SHARED real_t sBc[MQ1D][MD1D];
248
249 real_t op9[9];
250 MFEM_SHARED real_t sop[9*MQ1D*MQ1D];
251 MFEM_SHARED real_t mass[MQ1D][MQ1D][3];
252
253 MFEM_SHARED real_t sX[MD1D][MD1D][MD1D];
254
255 MFEM_FOREACH_THREAD(qx,x,Q1D)
256 {
257 MFEM_FOREACH_THREAD(qy,y,Q1D)
258 {
259 MFEM_FOREACH_THREAD(qz,z,Q1D)
260 {
261 for (int i=0; i<dataSize; ++i)
262 {
263 op9[i] = op(qx,qy,qz,i,e);
264 }
265 }
266 }
267 }
268
269 const int tidx = MFEM_THREAD_ID(x);
270 const int tidy = MFEM_THREAD_ID(y);
271 const int tidz = MFEM_THREAD_ID(z);
272
273 if (tidz == 0)
274 {
275 MFEM_FOREACH_THREAD(d,y,D1D)
276 {
277 MFEM_FOREACH_THREAD(q,x,Q1D)
278 {
279 sBc[q][d] = Bc(q,d);
280 if (d < D1D-1)
281 {
282 sBo[q][d] = Bo(q,d);
283 }
284 }
285 }
286 }
287 MFEM_SYNC_THREAD;
288
289 for (int qz=0; qz < Q1D; ++qz)
290 {
291 int osc = 0;
292 for (int c = 0; c < VDIM; ++c) // loop over x, y, z components
293 {
294 const int D1Dz = (c == 2) ? D1D - 1 : D1D;
295 const int D1Dy = (c == 1) ? D1D - 1 : D1D;
296 const int D1Dx = (c == 0) ? D1D - 1 : D1D;
297
298 MFEM_FOREACH_THREAD(dz,z,D1Dz)
299 {
300 MFEM_FOREACH_THREAD(dy,y,D1Dy)
301 {
302 MFEM_FOREACH_THREAD(dx,x,D1Dx)
303 {
304 sX[dz][dy][dx] = X(dx + ((dy + (dz * D1Dy)) * D1Dx) + osc, e);
305 }
306 }
307 }
308 MFEM_SYNC_THREAD;
309
310 if (tidz == qz)
311 {
312 for (int i=0; i<dataSize; ++i)
313 {
314 sop[i + (dataSize*tidx) + (dataSize*Q1D*tidy)] = op9[i];
315 }
316
317 MFEM_FOREACH_THREAD(qy,y,Q1D)
318 {
319 MFEM_FOREACH_THREAD(qx,x,Q1D)
320 {
321 real_t u = 0.0;
322
323 for (int dz = 0; dz < D1Dz; ++dz)
324 {
325 const real_t wz = (c == 2) ? sBo[qz][dz] : sBc[qz][dz];
326 for (int dy = 0; dy < D1Dy; ++dy)
327 {
328 const real_t wy = (c == 1) ? sBo[qy][dy] : sBc[qy][dy];
329 for (int dx = 0; dx < D1Dx; ++dx)
330 {
331 const real_t t = sX[dz][dy][dx];
332 const real_t wx = (c == 0) ? sBo[qx][dx] : sBc[qx][dx];
333 u += t * wx * wy * wz;
334 }
335 }
336 }
337
338 mass[qy][qx][c] = u;
339 } // qx
340 } // qy
341 } // tidz == qz
342
343 osc += D1Dx * D1Dy * D1Dz;
344 MFEM_SYNC_THREAD;
345 } // c
346
347 MFEM_SYNC_THREAD; // Sync mass[qy][qx][d] and sop
348
349 osc = 0;
350 for (int c = 0; c < VDIM; ++c) // loop over x, y, z components
351 {
352 const int D1Dz = (c == 2) ? D1D - 1 : D1D;
353 const int D1Dy = (c == 1) ? D1D - 1 : D1D;
354 const int D1Dx = (c == 0) ? D1D - 1 : D1D;
355
356 real_t dxyz = 0.0;
357
358 MFEM_FOREACH_THREAD(dz,z,D1Dz)
359 {
360 const real_t wz = (c == 2) ? sBo[qz][dz] : sBc[qz][dz];
361
362 MFEM_FOREACH_THREAD(dy,y,D1Dy)
363 {
364 MFEM_FOREACH_THREAD(dx,x,D1Dx)
365 {
366 for (int qy = 0; qy < Q1D; ++qy)
367 {
368 const real_t wy = (c == 1) ? sBo[qy][dy] : sBc[qy][dy];
369 for (int qx = 0; qx < Q1D; ++qx)
370 {
371 const int os = (dataSize*qx) + (dataSize*Q1D*qy);
372 const int id1 = os + ((c == 0) ? 0 : ((c == 1) ? (symmetric ? 1 : 3) :
373 (symmetric ? 2 : 6))); // O11, O21, O31
374 const int id2 = os + ((c == 0) ? 1 : ((c == 1) ? (symmetric ? 3 : 4) :
375 (symmetric ? 4 : 7))); // O12, O22, O32
376 const int id3 = os + ((c == 0) ? 2 : ((c == 1) ? (symmetric ? 4 : 5) :
377 (symmetric ? 5 : 8))); // O13, O23, O33
378
379 const real_t m_c = (sop[id1] * mass[qy][qx][0]) + (sop[id2] * mass[qy][qx][1]) +
380 (sop[id3] * mass[qy][qx][2]);
381
382 const real_t wx = (c == 0) ? sBo[qx][dx] : sBc[qx][dx];
383 dxyz += m_c * wx * wy * wz;
384 }
385 }
386 }
387 }
388 }
389
390 MFEM_SYNC_THREAD;
391
392 MFEM_FOREACH_THREAD(dz,z,D1Dz)
393 {
394 MFEM_FOREACH_THREAD(dy,y,D1Dy)
395 {
396 MFEM_FOREACH_THREAD(dx,x,D1Dx)
397 {
398 Y(dx + ((dy + (dz * D1Dy)) * D1Dx) + osc, e) += dxyz;
399 }
400 }
401 }
402
403 osc += D1Dx * D1Dy * D1Dz;
404 } // c loop
405 } // qz
406 }); // end of element loop
407}
408
409// PA H(curl) curl-curl Assemble 2D kernel
410void PACurlCurlSetup2D(const int Q1D,
411 const int NE,
412 const Array<real_t> &w,
413 const Vector &j,
414 Vector &coeff,
415 Vector &op);
416
417// PA H(curl) curl-curl Assemble 3D kernel
418void PACurlCurlSetup3D(const int Q1D,
419 const int coeffDim,
420 const int NE,
421 const Array<real_t> &w,
422 const Vector &j,
423 Vector &coeff,
424 Vector &op);
425
426// PA H(curl) curl-curl Diagonal 2D kernel
427void PACurlCurlAssembleDiagonal2D(const int D1D,
428 const int Q1D,
429 const bool symmetric, // unused
430 const int NE,
431 const Array<real_t> &bo,
432 const Array<real_t> &bc, // unused
433 const Array<real_t> &go, // unused
434 const Array<real_t> &gc,
435 const Vector &pa_data,
436 Vector &diag);
437
438// PA H(curl) curl-curl Diagonal 3D kernel
439template<int T_D1D = 0, int T_Q1D = 0>
440inline void PACurlCurlAssembleDiagonal3D(const int d1d,
441 const int q1d,
442 const bool symmetric,
443 const int NE,
444 const Array<real_t> &bo,
445 const Array<real_t> &bc,
446 const Array<real_t> &go,
447 const Array<real_t> &gc,
448 const Vector &pa_data,
449 Vector &diag)
450{
451 MFEM_VERIFY(T_D1D || d1d <= DeviceDofQuadLimits::Get().HCURL_MAX_D1D,
452 "Error: d1d > HCURL_MAX_D1D");
453 MFEM_VERIFY(T_Q1D || q1d <= DeviceDofQuadLimits::Get().HCURL_MAX_Q1D,
454 "Error: q1d > HCURL_MAX_Q1D");
455 const int D1D = T_D1D ? T_D1D : d1d;
456 const int Q1D = T_Q1D ? T_Q1D : q1d;
457
458 auto Bo = Reshape(bo.Read(), Q1D, D1D-1);
459 auto Bc = Reshape(bc.Read(), Q1D, D1D);
460 auto Go = Reshape(go.Read(), Q1D, D1D-1);
461 auto Gc = Reshape(gc.Read(), Q1D, D1D);
462 auto op = Reshape(pa_data.Read(), Q1D, Q1D, Q1D, (symmetric ? 6 : 9), NE);
463 auto D = Reshape(diag.ReadWrite(), 3*(D1D-1)*D1D*D1D, NE);
464
465 const int s = symmetric ? 6 : 9;
466 const int i11 = 0;
467 const int i12 = 1;
468 const int i13 = 2;
469 const int i21 = symmetric ? i12 : 3;
470 const int i22 = symmetric ? 3 : 4;
471 const int i23 = symmetric ? 4 : 5;
472 const int i31 = symmetric ? i13 : 6;
473 const int i32 = symmetric ? i23 : 7;
474 const int i33 = symmetric ? 5 : 8;
475
476 mfem::forall(NE, [=] MFEM_HOST_DEVICE (int e)
477 {
478 // Using (\nabla\times u) F = 1/det(dF) dF \hat{\nabla}\times\hat{u} (p. 78 of Monk), we get
479 // (\nabla\times u) \cdot (\nabla\times u) = 1/det(dF)^2 \hat{\nabla}\times\hat{u}^T dF^T dF \hat{\nabla}\times\hat{u}
480 // If c = 0, \hat{\nabla}\times\hat{u} reduces to [0, (u_0)_{x_2}, -(u_0)_{x_1}]
481 // If c = 1, \hat{\nabla}\times\hat{u} reduces to [-(u_1)_{x_2}, 0, (u_1)_{x_0}]
482 // If c = 2, \hat{\nabla}\times\hat{u} reduces to [(u_2)_{x_1}, -(u_2)_{x_0}, 0]
483
484 // For each c, we will keep 9 arrays for derivatives multiplied by the 9 entries of the 3x3 matrix (dF^T C dF),
485 // which may be non-symmetric depending on a possibly non-symmetric matrix coefficient.
486
487 constexpr int VDIM = 3;
488 constexpr int MD1D = T_D1D ? T_D1D : DofQuadLimits::HCURL_MAX_D1D;
489 constexpr int MQ1D = T_Q1D ? T_Q1D : DofQuadLimits::HCURL_MAX_Q1D;
490 const int D1D = T_D1D ? T_D1D : d1d;
491 const int Q1D = T_Q1D ? T_Q1D : q1d;
492
493 int osc = 0;
494
495 for (int c = 0; c < VDIM; ++c) // loop over x, y, z components
496 {
497 const int D1Dz = (c == 2) ? D1D - 1 : D1D;
498 const int D1Dy = (c == 1) ? D1D - 1 : D1D;
499 const int D1Dx = (c == 0) ? D1D - 1 : D1D;
500
501 real_t zt[MQ1D][MQ1D][MD1D][9][3];
502
503 // z contraction
504 for (int qx = 0; qx < Q1D; ++qx)
505 {
506 for (int qy = 0; qy < Q1D; ++qy)
507 {
508 for (int dz = 0; dz < D1Dz; ++dz)
509 {
510 for (int i=0; i<s; ++i)
511 {
512 for (int d=0; d<3; ++d)
513 {
514 zt[qx][qy][dz][i][d] = 0.0;
515 }
516 }
517
518 for (int qz = 0; qz < Q1D; ++qz)
519 {
520 const real_t wz = ((c == 2) ? Bo(qz,dz) : Bc(qz,dz));
521 const real_t wDz = ((c == 2) ? Go(qz,dz) : Gc(qz,dz));
522
523 for (int i=0; i<s; ++i)
524 {
525 zt[qx][qy][dz][i][0] += wz * wz * op(qx,qy,qz,i,e);
526 zt[qx][qy][dz][i][1] += wDz * wz * op(qx,qy,qz,i,e);
527 zt[qx][qy][dz][i][2] += wDz * wDz * op(qx,qy,qz,i,e);
528 }
529 }
530 }
531 }
532 } // end of z contraction
533
534 real_t yt[MQ1D][MD1D][MD1D][9][3][3];
535
536 // y contraction
537 for (int qx = 0; qx < Q1D; ++qx)
538 {
539 for (int dz = 0; dz < D1Dz; ++dz)
540 {
541 for (int dy = 0; dy < D1Dy; ++dy)
542 {
543 for (int i=0; i<s; ++i)
544 {
545 for (int d=0; d<3; ++d)
546 for (int j=0; j<3; ++j)
547 {
548 yt[qx][dy][dz][i][d][j] = 0.0;
549 }
550 }
551
552 for (int qy = 0; qy < Q1D; ++qy)
553 {
554 const real_t wy = ((c == 1) ? Bo(qy,dy) : Bc(qy,dy));
555 const real_t wDy = ((c == 1) ? Go(qy,dy) : Gc(qy,dy));
556
557 for (int i=0; i<s; ++i)
558 {
559 for (int d=0; d<3; ++d)
560 {
561 yt[qx][dy][dz][i][d][0] += wy * wy * zt[qx][qy][dz][i][d];
562 yt[qx][dy][dz][i][d][1] += wDy * wy * zt[qx][qy][dz][i][d];
563 yt[qx][dy][dz][i][d][2] += wDy * wDy * zt[qx][qy][dz][i][d];
564 }
565 }
566 }
567 }
568 }
569 } // end of y contraction
570
571 // x contraction
572 for (int dz = 0; dz < D1Dz; ++dz)
573 {
574 for (int dy = 0; dy < D1Dy; ++dy)
575 {
576 for (int dx = 0; dx < D1Dx; ++dx)
577 {
578 for (int qx = 0; qx < Q1D; ++qx)
579 {
580 const real_t wx = ((c == 0) ? Bo(qx,dx) : Bc(qx,dx));
581 const real_t wDx = ((c == 0) ? Go(qx,dx) : Gc(qx,dx));
582
583 // Using (\nabla\times u) F = 1/det(dF) dF \hat{\nabla}\times\hat{u} (p. 78 of Monk), we get
584 // (\nabla\times u) \cdot (\nabla\times u) = 1/det(dF)^2 \hat{\nabla}\times\hat{u}^T dF^T dF \hat{\nabla}\times\hat{u}
585 // If c = 0, \hat{\nabla}\times\hat{u} reduces to [0, (u_0)_{x_2}, -(u_0)_{x_1}]
586 // If c = 1, \hat{\nabla}\times\hat{u} reduces to [-(u_1)_{x_2}, 0, (u_1)_{x_0}]
587 // If c = 2, \hat{\nabla}\times\hat{u} reduces to [(u_2)_{x_1}, -(u_2)_{x_0}, 0]
588
589 /*
590 const double O11 = op(q,0,e);
591 const double O12 = op(q,1,e);
592 const double O13 = op(q,2,e);
593 const double O22 = op(q,3,e);
594 const double O23 = op(q,4,e);
595 const double O33 = op(q,5,e);
596 */
597
598 if (c == 0)
599 {
600 // (u_0)_{x_2} (O22 (u_0)_{x_2} - O23 (u_0)_{x_1}) - (u_0)_{x_1} (O32 (u_0)_{x_2} - O33 (u_0)_{x_1})
601 const real_t sumy = yt[qx][dy][dz][i22][2][0] - yt[qx][dy][dz][i23][1][1]
602 - yt[qx][dy][dz][i32][1][1] + yt[qx][dy][dz][i33][0][2];
603
604 D(dx + ((dy + (dz * D1Dy)) * D1Dx) + osc, e) += sumy * wx * wx;
605 }
606 else if (c == 1)
607 {
608 // (u_1)_{x_2} (O11 (u_1)_{x_2} - O13 (u_1)_{x_0}) + (u_1)_{x_0} (-O31 (u_1)_{x_2} + O33 (u_1)_{x_0})
609 const real_t d = (yt[qx][dy][dz][i11][2][0] * wx * wx)
610 - ((yt[qx][dy][dz][i13][1][0] + yt[qx][dy][dz][i31][1][0]) * wDx * wx)
611 + (yt[qx][dy][dz][i33][0][0] * wDx * wDx);
612
613 D(dx + ((dy + (dz * D1Dy)) * D1Dx) + osc, e) += d;
614 }
615 else
616 {
617 // (u_2)_{x_1} (O11 (u_2)_{x_1} - O12 (u_2)_{x_0}) - (u_2)_{x_0} (O21 (u_2)_{x_1} - O22 (u_2)_{x_0})
618 const real_t d = (yt[qx][dy][dz][i11][0][2] * wx * wx)
619 - ((yt[qx][dy][dz][i12][0][1] + yt[qx][dy][dz][i21][0][1]) * wDx * wx)
620 + (yt[qx][dy][dz][i22][0][0] * wDx * wDx);
621
622 D(dx + ((dy + (dz * D1Dy)) * D1Dx) + osc, e) += d;
623 }
624 }
625 }
626 }
627 } // end of x contraction
628
629 osc += D1Dx * D1Dy * D1Dz;
630 } // loop c
631 }); // end of element loop
632}
633
634// Shared memory PA H(curl) curl-curl Diagonal 3D kernel
635template<int T_D1D = 0, int T_Q1D = 0>
636inline void SmemPACurlCurlAssembleDiagonal3D(const int d1d,
637 const int q1d,
638 const bool symmetric,
639 const int NE,
640 const Array<real_t> &bo,
641 const Array<real_t> &bc,
642 const Array<real_t> &go,
643 const Array<real_t> &gc,
644 const Vector &pa_data,
645 Vector &diag)
646{
647 MFEM_VERIFY(T_D1D || d1d <= DeviceDofQuadLimits::Get().HCURL_MAX_D1D,
648 "Error: d1d > HCURL_MAX_D1D");
649 MFEM_VERIFY(T_Q1D || q1d <= DeviceDofQuadLimits::Get().HCURL_MAX_Q1D,
650 "Error: q1d > HCURL_MAX_Q1D");
651 const int D1D = T_D1D ? T_D1D : d1d;
652 const int Q1D = T_Q1D ? T_Q1D : q1d;
653
654 auto Bo = Reshape(bo.Read(), Q1D, D1D-1);
655 auto Bc = Reshape(bc.Read(), Q1D, D1D);
656 auto Go = Reshape(go.Read(), Q1D, D1D-1);
657 auto Gc = Reshape(gc.Read(), Q1D, D1D);
658 auto op = Reshape(pa_data.Read(), Q1D, Q1D, Q1D, (symmetric ? 6 : 9), NE);
659 auto D = Reshape(diag.ReadWrite(), 3*(D1D-1)*D1D*D1D, NE);
660
661 const int s = symmetric ? 6 : 9;
662 const int i11 = 0;
663 const int i12 = 1;
664 const int i13 = 2;
665 const int i21 = symmetric ? i12 : 3;
666 const int i22 = symmetric ? 3 : 4;
667 const int i23 = symmetric ? 4 : 5;
668 const int i31 = symmetric ? i13 : 6;
669 const int i32 = symmetric ? i23 : 7;
670 const int i33 = symmetric ? 5 : 8;
671
672 mfem::forall_3D(NE, Q1D, Q1D, Q1D, [=] MFEM_HOST_DEVICE (int e)
673 {
674 // Using (\nabla\times u) F = 1/det(dF) dF \hat{\nabla}\times\hat{u} (p. 78 of Monk), we get
675 // (\nabla\times u) \cdot (\nabla\times u) = 1/det(dF)^2 \hat{\nabla}\times\hat{u}^T dF^T dF \hat{\nabla}\times\hat{u}
676 // If c = 0, \hat{\nabla}\times\hat{u} reduces to [0, (u_0)_{x_2}, -(u_0)_{x_1}]
677 // If c = 1, \hat{\nabla}\times\hat{u} reduces to [-(u_1)_{x_2}, 0, (u_1)_{x_0}]
678 // If c = 2, \hat{\nabla}\times\hat{u} reduces to [(u_2)_{x_1}, -(u_2)_{x_0}, 0]
679
680 constexpr int VDIM = 3;
681 constexpr int MD1D = T_D1D ? T_D1D : DofQuadLimits::HCURL_MAX_D1D;
682 constexpr int MQ1D = T_Q1D ? T_Q1D : DofQuadLimits::HCURL_MAX_Q1D;
683 const int D1D = T_D1D ? T_D1D : d1d;
684 const int Q1D = T_Q1D ? T_Q1D : q1d;
685
686 MFEM_SHARED real_t sBo[MQ1D][MD1D];
687 MFEM_SHARED real_t sBc[MQ1D][MD1D];
688 MFEM_SHARED real_t sGo[MQ1D][MD1D];
689 MFEM_SHARED real_t sGc[MQ1D][MD1D];
690
691 real_t ope[9];
692 MFEM_SHARED real_t sop[9][MQ1D][MQ1D];
693
694 MFEM_FOREACH_THREAD(qx,x,Q1D)
695 {
696 MFEM_FOREACH_THREAD(qy,y,Q1D)
697 {
698 MFEM_FOREACH_THREAD(qz,z,Q1D)
699 {
700 for (int i=0; i<s; ++i)
701 {
702 ope[i] = op(qx,qy,qz,i,e);
703 }
704 }
705 }
706 }
707
708 const int tidx = MFEM_THREAD_ID(x);
709 const int tidy = MFEM_THREAD_ID(y);
710 const int tidz = MFEM_THREAD_ID(z);
711
712 if (tidz == 0)
713 {
714 MFEM_FOREACH_THREAD(d,y,D1D)
715 {
716 MFEM_FOREACH_THREAD(q,x,Q1D)
717 {
718 sBc[q][d] = Bc(q,d);
719 sGc[q][d] = Gc(q,d);
720 if (d < D1D-1)
721 {
722 sBo[q][d] = Bo(q,d);
723 sGo[q][d] = Go(q,d);
724 }
725 }
726 }
727 }
728 MFEM_SYNC_THREAD;
729
730 int osc = 0;
731 for (int c = 0; c < VDIM; ++c) // loop over x, y, z components
732 {
733 const int D1Dz = (c == 2) ? D1D - 1 : D1D;
734 const int D1Dy = (c == 1) ? D1D - 1 : D1D;
735 const int D1Dx = (c == 0) ? D1D - 1 : D1D;
736
737 real_t dxyz = 0.0;
738
739 for (int qz=0; qz < Q1D; ++qz)
740 {
741 if (tidz == qz)
742 {
743 for (int i=0; i<s; ++i)
744 {
745 sop[i][tidx][tidy] = ope[i];
746 }
747 }
748
749 MFEM_SYNC_THREAD;
750
751 MFEM_FOREACH_THREAD(dz,z,D1Dz)
752 {
753 const real_t wz = ((c == 2) ? sBo[qz][dz] : sBc[qz][dz]);
754 const real_t wDz = ((c == 2) ? sGo[qz][dz] : sGc[qz][dz]);
755
756 MFEM_FOREACH_THREAD(dy,y,D1Dy)
757 {
758 MFEM_FOREACH_THREAD(dx,x,D1Dx)
759 {
760 for (int qy = 0; qy < Q1D; ++qy)
761 {
762 const real_t wy = ((c == 1) ? sBo[qy][dy] : sBc[qy][dy]);
763 const real_t wDy = ((c == 1) ? sGo[qy][dy] : sGc[qy][dy]);
764
765 for (int qx = 0; qx < Q1D; ++qx)
766 {
767 const real_t wx = ((c == 0) ? sBo[qx][dx] : sBc[qx][dx]);
768 const real_t wDx = ((c == 0) ? sGo[qx][dx] : sGc[qx][dx]);
769
770 if (c == 0)
771 {
772 // (u_0)_{x_2} (O22 (u_0)_{x_2} - O23 (u_0)_{x_1}) - (u_0)_{x_1} (O32 (u_0)_{x_2} - O33 (u_0)_{x_1})
773
774 // (u_0)_{x_2} O22 (u_0)_{x_2}
775 dxyz += sop[i22][qx][qy] * wx * wx * wy * wy * wDz * wDz;
776
777 // -(u_0)_{x_2} O23 (u_0)_{x_1} - (u_0)_{x_1} O32 (u_0)_{x_2}
778 dxyz += -(sop[i23][qx][qy] + sop[i32][qx][qy]) * wx * wx * wDy * wy * wDz * wz;
779
780 // (u_0)_{x_1} O33 (u_0)_{x_1}
781 dxyz += sop[i33][qx][qy] * wx * wx * wDy * wDy * wz * wz;
782 }
783 else if (c == 1)
784 {
785 // (u_1)_{x_2} (O11 (u_1)_{x_2} - O13 (u_1)_{x_0}) + (u_1)_{x_0} (-O31 (u_1)_{x_2} + O33 (u_1)_{x_0})
786
787 // (u_1)_{x_2} O11 (u_1)_{x_2}
788 dxyz += sop[i11][qx][qy] * wx * wx * wy * wy * wDz * wDz;
789
790 // -(u_1)_{x_2} O13 (u_1)_{x_0} - (u_1)_{x_0} O31 (u_1)_{x_2}
791 dxyz += -(sop[i13][qx][qy] + sop[i31][qx][qy]) * wDx * wx * wy * wy * wDz * wz;
792
793 // (u_1)_{x_0} O33 (u_1)_{x_0})
794 dxyz += sop[i33][qx][qy] * wDx * wDx * wy * wy * wz * wz;
795 }
796 else
797 {
798 // (u_2)_{x_1} (O11 (u_2)_{x_1} - O12 (u_2)_{x_0}) - (u_2)_{x_0} (O21 (u_2)_{x_1} - O22 (u_2)_{x_0})
799
800 // (u_2)_{x_1} O11 (u_2)_{x_1}
801 dxyz += sop[i11][qx][qy] * wx * wx * wDy * wDy * wz * wz;
802
803 // -(u_2)_{x_1} O12 (u_2)_{x_0} - (u_2)_{x_0} O21 (u_2)_{x_1}
804 dxyz += -(sop[i12][qx][qy] + sop[i21][qx][qy]) * wDx * wx * wDy * wy * wz * wz;
805
806 // (u_2)_{x_0} O22 (u_2)_{x_0}
807 dxyz += sop[i22][qx][qy] * wDx * wDx * wy * wy * wz * wz;
808 }
809 }
810 }
811 }
812 }
813 }
814
815 MFEM_SYNC_THREAD;
816 } // qz loop
817
818 MFEM_FOREACH_THREAD(dz,z,D1Dz)
819 {
820 MFEM_FOREACH_THREAD(dy,y,D1Dy)
821 {
822 MFEM_FOREACH_THREAD(dx,x,D1Dx)
823 {
824 D(dx + ((dy + (dz * D1Dy)) * D1Dx) + osc, e) += dxyz;
825 }
826 }
827 }
828
829 osc += D1Dx * D1Dy * D1Dz;
830 } // c loop
831 }); // end of element loop
832}
833
834// PA H(curl) curl-curl Apply/AbsApply 2D kernel
835void PACurlCurlApply2D(const int D1D,
836 const int Q1D,
837 const bool symmetric, // unused
838 const int NE,
839 const Array<real_t> &bo,
840 const Array<real_t> &bc, // unused
841 const Array<real_t> &bot,
842 const Array<real_t> &bct, // unused
843 const Array<real_t> &gc,
844 const Array<real_t> &gct,
845 const Vector &pa_data,
846 const Vector &x,
847 Vector &y,
848 const bool useAbs = false);
849
850// PA H(curl) curl-curl Apply/AbsApply 3D kernel
851template<int T_D1D = 0, int T_Q1D = 0>
852inline void PACurlCurlApply3D(const int d1d,
853 const int q1d,
854 const bool symmetric,
855 const int NE,
856 const Array<real_t> &bo,
857 const Array<real_t> &bc,
858 const Array<real_t> &bot,
859 const Array<real_t> &bct,
860 const Array<real_t> &gc,
861 const Array<real_t> &gct,
862 const Vector &pa_data,
863 const Vector &x,
864 Vector &y,
865 const bool useAbs = false)
866{
867 MFEM_VERIFY(T_D1D || d1d <= DeviceDofQuadLimits::Get().HCURL_MAX_D1D,
868 "Error: d1d > HCURL_MAX_D1D");
869 MFEM_VERIFY(T_Q1D || q1d <= DeviceDofQuadLimits::Get().HCURL_MAX_Q1D,
870 "Error: q1d > HCURL_MAX_Q1D");
871 const int D1D = T_D1D ? T_D1D : d1d;
872 const int Q1D = T_Q1D ? T_Q1D : q1d;
873
874 auto Bo = Reshape(bo.Read(), Q1D, D1D-1);
875 auto Bc = Reshape(bc.Read(), Q1D, D1D);
876 auto Bot = Reshape(bot.Read(), D1D-1, Q1D);
877 auto Bct = Reshape(bct.Read(), D1D, Q1D);
878 auto Gc = Reshape(gc.Read(), Q1D, D1D);
879 auto Gct = Reshape(gct.Read(), D1D, Q1D);
880 auto op = Reshape(pa_data.Read(), Q1D, Q1D, Q1D, (symmetric ? 6 : 9), NE);
881 auto X = Reshape(x.Read(), 3*(D1D-1)*D1D*D1D, NE);
882 auto Y = Reshape(y.ReadWrite(), 3*(D1D-1)*D1D*D1D, NE);
883
884 mfem::forall(NE, [=] MFEM_HOST_DEVICE (int e)
885 {
886 // Using (\nabla\times u) F = 1/det(dF) dF \hat{\nabla}\times\hat{u} (p. 78 of Monk),
887 // we get:
888 // (\nabla\times u) \cdot (\nabla\times v)
889 // = 1/det(dF)^2 \hat{\nabla}\times\hat{u}^T dF^T dF \hat{\nabla}\times\hat{v}
890 // If c = 0, \hat{\nabla}\times\hat{u} reduces to [0, (u_0)_{x_2}, -(u_0)_{x_1}]
891 // If c = 1, \hat{\nabla}\times\hat{u} reduces to [-(u_1)_{x_2}, 0, (u_1)_{x_0}]
892 // If c = 2, \hat{\nabla}\times\hat{u} reduces to [(u_2)_{x_1}, -(u_2)_{x_0}, 0]
893
894 constexpr int VDIM = 3;
895 constexpr int MD1D = T_D1D ? T_D1D : DofQuadLimits::HCURL_MAX_D1D;
896 constexpr int MQ1D = T_Q1D ? T_Q1D : DofQuadLimits::HCURL_MAX_Q1D;
897 const int D1D = T_D1D ? T_D1D : d1d;
898 const int Q1D = T_Q1D ? T_Q1D : q1d;
899
900 real_t curl[MQ1D][MQ1D][MQ1D][VDIM];
901 // curl[qz][qy][qx] will be computed as the vector curl at each quadrature point.
902
903 for (int qz = 0; qz < Q1D; ++qz)
904 {
905 for (int qy = 0; qy < Q1D; ++qy)
906 {
907 for (int qx = 0; qx < Q1D; ++qx)
908 {
909 for (int c = 0; c < VDIM; ++c)
910 {
911 curl[qz][qy][qx][c] = 0.0;
912 }
913 }
914 }
915 }
916
917 // We treat x, y, z components separately for optimization specific to each.
918
919 int osc = 0;
920
921 {
922 // x component
923 const int D1Dz = D1D;
924 const int D1Dy = D1D;
925 const int D1Dx = D1D - 1;
926
927 for (int dz = 0; dz < D1Dz; ++dz)
928 {
929 real_t gradXY[MQ1D][MQ1D][2];
930 for (int qy = 0; qy < Q1D; ++qy)
931 {
932 for (int qx = 0; qx < Q1D; ++qx)
933 {
934 for (int d = 0; d < 2; ++d)
935 {
936 gradXY[qy][qx][d] = 0.0;
937 }
938 }
939 }
940
941 for (int dy = 0; dy < D1Dy; ++dy)
942 {
943 real_t massX[MQ1D];
944 for (int qx = 0; qx < Q1D; ++qx)
945 {
946 massX[qx] = 0.0;
947 }
948
949 for (int dx = 0; dx < D1Dx; ++dx)
950 {
951 const real_t t = X(dx + ((dy + (dz * D1Dy)) * D1Dx) + osc, e);
952 for (int qx = 0; qx < Q1D; ++qx)
953 {
954 massX[qx] += t * Bo(qx,dx);
955 }
956 }
957
958 for (int qy = 0; qy < Q1D; ++qy)
959 {
960 const real_t wy = Bc(qy,dy);
961 const real_t wDy = Gc(qy,dy);
962 for (int qx = 0; qx < Q1D; ++qx)
963 {
964 const real_t wx = massX[qx];
965 gradXY[qy][qx][0] += wx * wDy;
966 gradXY[qy][qx][1] += wx * wy;
967 }
968 }
969 }
970
971 for (int qz = 0; qz < Q1D; ++qz)
972 {
973 const real_t wz = Bc(qz,dz);
974 const real_t wDz = Gc(qz,dz);
975 for (int qy = 0; qy < Q1D; ++qy)
976 {
977 for (int qx = 0; qx < Q1D; ++qx)
978 {
979 // \hat{\nabla}\times\hat{u} is [0, (u_0)_{x_2}, -(u_0)_{x_1}]
980 curl[qz][qy][qx][1] += gradXY[qy][qx][1] * wDz; // (u_0)_{x_2}
981 if (useAbs)
982 {
983 // +(u_0)_{x_1}
984 curl[qz][qy][qx][2] += gradXY[qy][qx][0] * wz;
985 }
986 else
987 {
988 // -(u_0)_{x_1}
989 curl[qz][qy][qx][2] -= gradXY[qy][qx][0] * wz;
990 }
991 }
992 }
993 }
994 }
995
996 osc += D1Dx * D1Dy * D1Dz;
997 }
998
999 {
1000 // y component
1001 const int D1Dz = D1D;
1002 const int D1Dy = D1D - 1;
1003 const int D1Dx = D1D;
1004
1005 for (int dz = 0; dz < D1Dz; ++dz)
1006 {
1007 real_t gradXY[MQ1D][MQ1D][2];
1008 for (int qy = 0; qy < Q1D; ++qy)
1009 {
1010 for (int qx = 0; qx < Q1D; ++qx)
1011 {
1012 for (int d = 0; d < 2; ++d)
1013 {
1014 gradXY[qy][qx][d] = 0.0;
1015 }
1016 }
1017 }
1018
1019 for (int dx = 0; dx < D1Dx; ++dx)
1020 {
1021 real_t massY[MQ1D];
1022 for (int qy = 0; qy < Q1D; ++qy)
1023 {
1024 massY[qy] = 0.0;
1025 }
1026
1027 for (int dy = 0; dy < D1Dy; ++dy)
1028 {
1029 const real_t t = X(dx + ((dy + (dz * D1Dy)) * D1Dx) + osc, e);
1030 for (int qy = 0; qy < Q1D; ++qy)
1031 {
1032 massY[qy] += t * Bo(qy,dy);
1033 }
1034 }
1035
1036 for (int qx = 0; qx < Q1D; ++qx)
1037 {
1038 const real_t wx = Bc(qx,dx);
1039 const real_t wDx = Gc(qx,dx);
1040 for (int qy = 0; qy < Q1D; ++qy)
1041 {
1042 const real_t wy = massY[qy];
1043 gradXY[qy][qx][0] += wDx * wy;
1044 gradXY[qy][qx][1] += wx * wy;
1045 }
1046 }
1047 }
1048
1049 for (int qz = 0; qz < Q1D; ++qz)
1050 {
1051 const real_t wz = Bc(qz,dz);
1052 const real_t wDz = Gc(qz,dz);
1053 for (int qy = 0; qy < Q1D; ++qy)
1054 {
1055 for (int qx = 0; qx < Q1D; ++qx)
1056 {
1057 // \hat{\nabla}\times\hat{u} is [-(u_1)_{x_2}, 0, (u_1)_{x_0}]
1058 if (useAbs)
1059 {
1060 // +(u_1)_{x_2}
1061 curl[qz][qy][qx][0] += gradXY[qy][qx][1] * wDz;
1062 }
1063 else
1064 {
1065 // -(u_1)_{x_2}
1066 curl[qz][qy][qx][0] -= gradXY[qy][qx][1] * wDz;
1067 }
1068 curl[qz][qy][qx][2] += gradXY[qy][qx][0] * wz; // (u_1)_{x_0}
1069 }
1070 }
1071 }
1072 }
1073
1074 osc += D1Dx * D1Dy * D1Dz;
1075 }
1076
1077 {
1078 // z component
1079 const int D1Dz = D1D - 1;
1080 const int D1Dy = D1D;
1081 const int D1Dx = D1D;
1082
1083 for (int dx = 0; dx < D1Dx; ++dx)
1084 {
1085 real_t gradYZ[MQ1D][MQ1D][2];
1086 for (int qz = 0; qz < Q1D; ++qz)
1087 {
1088 for (int qy = 0; qy < Q1D; ++qy)
1089 {
1090 for (int d = 0; d < 2; ++d)
1091 {
1092 gradYZ[qz][qy][d] = 0.0;
1093 }
1094 }
1095 }
1096
1097 for (int dy = 0; dy < D1Dy; ++dy)
1098 {
1099 real_t massZ[MQ1D];
1100 for (int qz = 0; qz < Q1D; ++qz)
1101 {
1102 massZ[qz] = 0.0;
1103 }
1104
1105 for (int dz = 0; dz < D1Dz; ++dz)
1106 {
1107 const real_t t = X(dx + ((dy + (dz * D1Dy)) * D1Dx) + osc, e);
1108 for (int qz = 0; qz < Q1D; ++qz)
1109 {
1110 massZ[qz] += t * Bo(qz,dz);
1111 }
1112 }
1113
1114 for (int qy = 0; qy < Q1D; ++qy)
1115 {
1116 const real_t wy = Bc(qy,dy);
1117 const real_t wDy = Gc(qy,dy);
1118 for (int qz = 0; qz < Q1D; ++qz)
1119 {
1120 const real_t wz = massZ[qz];
1121 gradYZ[qz][qy][0] += wz * wy;
1122 gradYZ[qz][qy][1] += wz * wDy;
1123 }
1124 }
1125 }
1126
1127 for (int qx = 0; qx < Q1D; ++qx)
1128 {
1129 const real_t wx = Bc(qx,dx);
1130 const real_t wDx = Gc(qx,dx);
1131
1132 for (int qy = 0; qy < Q1D; ++qy)
1133 {
1134 for (int qz = 0; qz < Q1D; ++qz)
1135 {
1136 // \hat{\nabla}\times\hat{u} is [(u_2)_{x_1}, -(u_2)_{x_0}, 0]
1137 curl[qz][qy][qx][0] += gradYZ[qz][qy][1] * wx; // (u_2)_{x_1}
1138 if (useAbs)
1139 {
1140 // +(u_2)_{x_0}
1141 curl[qz][qy][qx][1] += gradYZ[qz][qy][0] * wDx;
1142 }
1143 else
1144 {
1145 // -(u_2)_{x_0}
1146 curl[qz][qy][qx][1] -= gradYZ[qz][qy][0] * wDx;
1147 }
1148 }
1149 }
1150 }
1151 }
1152 }
1153
1154 // Apply D operator.
1155 for (int qz = 0; qz < Q1D; ++qz)
1156 {
1157 for (int qy = 0; qy < Q1D; ++qy)
1158 {
1159 for (int qx = 0; qx < Q1D; ++qx)
1160 {
1161 const real_t O11 = op(qx,qy,qz,0,e);
1162 const real_t O12 = op(qx,qy,qz,1,e);
1163 const real_t O13 = op(qx,qy,qz,2,e);
1164 const real_t O21 = symmetric ? O12 : op(qx,qy,qz,3,e);
1165 const real_t O22 = symmetric ? op(qx,qy,qz,3,e) : op(qx,qy,qz,4,e);
1166 const real_t O23 = symmetric ? op(qx,qy,qz,4,e) : op(qx,qy,qz,5,e);
1167 const real_t O31 = symmetric ? O13 : op(qx,qy,qz,6,e);
1168 const real_t O32 = symmetric ? O23 : op(qx,qy,qz,7,e);
1169 const real_t O33 = symmetric ? op(qx,qy,qz,5,e) : op(qx,qy,qz,8,e);
1170
1171 const real_t c1 = (O11 * curl[qz][qy][qx][0]) + (O12 * curl[qz][qy][qx][1]) +
1172 (O13 * curl[qz][qy][qx][2]);
1173 const real_t c2 = (O21 * curl[qz][qy][qx][0]) + (O22 * curl[qz][qy][qx][1]) +
1174 (O23 * curl[qz][qy][qx][2]);
1175 const real_t c3 = (O31 * curl[qz][qy][qx][0]) + (O32 * curl[qz][qy][qx][1]) +
1176 (O33 * curl[qz][qy][qx][2]);
1177
1178 curl[qz][qy][qx][0] = c1;
1179 curl[qz][qy][qx][1] = c2;
1180 curl[qz][qy][qx][2] = c3;
1181 }
1182 }
1183 }
1184
1185 // x component
1186 osc = 0;
1187 {
1188 const int D1Dz = D1D;
1189 const int D1Dy = D1D;
1190 const int D1Dx = D1D - 1;
1191
1192 for (int qz = 0; qz < Q1D; ++qz)
1193 {
1194 real_t gradXY12[MD1D][MD1D];
1195 real_t gradXY21[MD1D][MD1D];
1196
1197 for (int dy = 0; dy < D1Dy; ++dy)
1198 {
1199 for (int dx = 0; dx < D1Dx; ++dx)
1200 {
1201 gradXY12[dy][dx] = 0.0;
1202 gradXY21[dy][dx] = 0.0;
1203 }
1204 }
1205 for (int qy = 0; qy < Q1D; ++qy)
1206 {
1207 real_t massX[MD1D][2];
1208 for (int dx = 0; dx < D1Dx; ++dx)
1209 {
1210 for (int n = 0; n < 2; ++n)
1211 {
1212 massX[dx][n] = 0.0;
1213 }
1214 }
1215 for (int qx = 0; qx < Q1D; ++qx)
1216 {
1217 for (int dx = 0; dx < D1Dx; ++dx)
1218 {
1219 const real_t wx = Bot(dx,qx);
1220
1221 massX[dx][0] += wx * curl[qz][qy][qx][1];
1222 massX[dx][1] += wx * curl[qz][qy][qx][2];
1223 }
1224 }
1225 for (int dy = 0; dy < D1Dy; ++dy)
1226 {
1227 const real_t wy = Bct(dy,qy);
1228 const real_t wDy = Gct(dy,qy);
1229
1230 for (int dx = 0; dx < D1Dx; ++dx)
1231 {
1232 gradXY21[dy][dx] += massX[dx][0] * wy;
1233 gradXY12[dy][dx] += massX[dx][1] * wDy;
1234 }
1235 }
1236 }
1237
1238 for (int dz = 0; dz < D1Dz; ++dz)
1239 {
1240 const real_t wz = Bct(dz,qz);
1241 const real_t wDz = Gct(dz,qz);
1242 for (int dy = 0; dy < D1Dy; ++dy)
1243 {
1244 for (int dx = 0; dx < D1Dx; ++dx)
1245 {
1246 // \hat{\nabla}\times\hat{u} is [0, (u_0)_{x_2}, -(u_0)_{x_1}]
1247 const int idx = dx + ((dy + (dz * D1Dy)) * D1Dx) + osc;
1248 if (useAbs)
1249 {
1250 // (u_0)_{x_2} * (op * curl)_1 +
1251 // (u_0)_{x_1} * (op * curl)_2
1252 Y(idx, e) += (gradXY21[dy][dx] * wDz) +
1253 (gradXY12[dy][dx] * wz);
1254 }
1255 else
1256 {
1257 // (u_0)_{x_2} * (op * curl)_1 -
1258 // (u_0)_{x_1} * (op * curl)_2
1259 Y(idx, e) += (gradXY21[dy][dx] * wDz) -
1260 (gradXY12[dy][dx] * wz);
1261 }
1262 }
1263 }
1264 }
1265 } // loop qz
1266
1267 osc += D1Dx * D1Dy * D1Dz;
1268 }
1269
1270 // y component
1271 {
1272 const int D1Dz = D1D;
1273 const int D1Dy = D1D - 1;
1274 const int D1Dx = D1D;
1275
1276 for (int qz = 0; qz < Q1D; ++qz)
1277 {
1278 real_t gradXY02[MD1D][MD1D];
1279 real_t gradXY20[MD1D][MD1D];
1280
1281 for (int dy = 0; dy < D1Dy; ++dy)
1282 {
1283 for (int dx = 0; dx < D1Dx; ++dx)
1284 {
1285 gradXY02[dy][dx] = 0.0;
1286 gradXY20[dy][dx] = 0.0;
1287 }
1288 }
1289 for (int qx = 0; qx < Q1D; ++qx)
1290 {
1291 real_t massY[MD1D][2];
1292 for (int dy = 0; dy < D1Dy; ++dy)
1293 {
1294 massY[dy][0] = 0.0;
1295 massY[dy][1] = 0.0;
1296 }
1297 for (int qy = 0; qy < Q1D; ++qy)
1298 {
1299 for (int dy = 0; dy < D1Dy; ++dy)
1300 {
1301 const real_t wy = Bot(dy,qy);
1302
1303 massY[dy][0] += wy * curl[qz][qy][qx][2];
1304 massY[dy][1] += wy * curl[qz][qy][qx][0];
1305 }
1306 }
1307 for (int dx = 0; dx < D1Dx; ++dx)
1308 {
1309 const real_t wx = Bct(dx,qx);
1310 const real_t wDx = Gct(dx,qx);
1311
1312 for (int dy = 0; dy < D1Dy; ++dy)
1313 {
1314 gradXY02[dy][dx] += massY[dy][0] * wDx;
1315 gradXY20[dy][dx] += massY[dy][1] * wx;
1316 }
1317 }
1318 }
1319
1320 for (int dz = 0; dz < D1Dz; ++dz)
1321 {
1322 const real_t wz = Bct(dz,qz);
1323 const real_t wDz = Gct(dz,qz);
1324 for (int dy = 0; dy < D1Dy; ++dy)
1325 {
1326 for (int dx = 0; dx < D1Dx; ++dx)
1327 {
1328 const int idx = dx + ((dy + (dz * D1Dy)) * D1Dx) + osc;
1329 // \hat{\nabla}\times\hat{u} is [-(u_1)_{x_2}, 0, (u_1)_{x_0}]
1330 if (useAbs)
1331 {
1332 // +(u_1)_{x_2} * (op * curl)_0 +
1333 // (u_1)_{x_0} * (op * curl)_2
1334 Y(idx, e) += (gradXY20[dy][dx] * wDz) +
1335 (gradXY02[dy][dx] * wz);
1336 }
1337 else
1338 {
1339 // -(u_1)_{x_2} * (op * curl)_0 +
1340 // (u_1)_{x_0} * (op * curl)_2
1341 Y(idx, e) += (-gradXY20[dy][dx] * wDz) +
1342 (gradXY02[dy][dx] * wz);
1343 }
1344 }
1345 }
1346 }
1347 } // loop qz
1348
1349 osc += D1Dx * D1Dy * D1Dz;
1350 }
1351
1352 // z component
1353 {
1354 const int D1Dz = D1D - 1;
1355 const int D1Dy = D1D;
1356 const int D1Dx = D1D;
1357
1358 for (int qx = 0; qx < Q1D; ++qx)
1359 {
1360 real_t gradYZ01[MD1D][MD1D];
1361 real_t gradYZ10[MD1D][MD1D];
1362
1363 for (int dy = 0; dy < D1Dy; ++dy)
1364 {
1365 for (int dz = 0; dz < D1Dz; ++dz)
1366 {
1367 gradYZ01[dz][dy] = 0.0;
1368 gradYZ10[dz][dy] = 0.0;
1369 }
1370 }
1371 for (int qy = 0; qy < Q1D; ++qy)
1372 {
1373 real_t massZ[MD1D][2];
1374 for (int dz = 0; dz < D1Dz; ++dz)
1375 {
1376 for (int n = 0; n < 2; ++n)
1377 {
1378 massZ[dz][n] = 0.0;
1379 }
1380 }
1381 for (int qz = 0; qz < Q1D; ++qz)
1382 {
1383 for (int dz = 0; dz < D1Dz; ++dz)
1384 {
1385 const real_t wz = Bot(dz,qz);
1386
1387 massZ[dz][0] += wz * curl[qz][qy][qx][0];
1388 massZ[dz][1] += wz * curl[qz][qy][qx][1];
1389 }
1390 }
1391 for (int dy = 0; dy < D1Dy; ++dy)
1392 {
1393 const real_t wy = Bct(dy,qy);
1394 const real_t wDy = Gct(dy,qy);
1395
1396 for (int dz = 0; dz < D1Dz; ++dz)
1397 {
1398 gradYZ01[dz][dy] += wy * massZ[dz][1];
1399 gradYZ10[dz][dy] += wDy * massZ[dz][0];
1400 }
1401 }
1402 }
1403
1404 for (int dx = 0; dx < D1Dx; ++dx)
1405 {
1406 const real_t wx = Bct(dx,qx);
1407 const real_t wDx = Gct(dx,qx);
1408
1409 for (int dy = 0; dy < D1Dy; ++dy)
1410 {
1411 for (int dz = 0; dz < D1Dz; ++dz)
1412 {
1413 const int idx = dx + ((dy + (dz * D1Dy)) * D1Dx) + osc;
1414 // \hat{\nabla}\times\hat{u} is [(u_2)_{x_1}, -(u_2)_{x_0}, 0]
1415 if (useAbs)
1416 {
1417 // (u_2)_{x_1} * (op * curl)_0 +
1418 // (u_2)_{x_0} * (op * curl)_1
1419 Y(idx, e) += (gradYZ10[dz][dy] * wx) +
1420 (gradYZ01[dz][dy] * wDx);
1421 }
1422 else
1423 {
1424 // (u_2)_{x_1} * (op * curl)_0 -
1425 // (u_2)_{x_0} * (op * curl)_1
1426 Y(idx, e) += (gradYZ10[dz][dy] * wx) -
1427 (gradYZ01[dz][dy] * wDx);
1428 }
1429 }
1430 }
1431 }
1432 } // loop qx
1433 }
1434 }); // end of element loop
1435}
1436
1437// Shared memory PA H(curl) curl-curl Apply/AbsApply 3D kernel
1438template<int T_D1D = 0, int T_Q1D = 0>
1439inline void SmemPACurlCurlApply3D(const int d1d,
1440 const int q1d,
1441 const bool symmetric,
1442 const int NE,
1443 const Array<real_t> &bo,
1444 const Array<real_t> &bc,
1445 const Array<real_t> &bot,
1446 const Array<real_t> &bct,
1447 const Array<real_t> &gc,
1448 const Array<real_t> &gct,
1449 const Vector &pa_data,
1450 const Vector &x,
1451 Vector &y,
1452 const bool useAbs = false)
1453{
1454 MFEM_VERIFY(T_D1D || d1d <= DeviceDofQuadLimits::Get().HCURL_MAX_D1D,
1455 "Error: d1d > HCURL_MAX_D1D");
1456 MFEM_VERIFY(T_Q1D || q1d <= DeviceDofQuadLimits::Get().HCURL_MAX_Q1D,
1457 "Error: q1d > HCURL_MAX_Q1D");
1458 const int D1D = T_D1D ? T_D1D : d1d;
1459 const int Q1D = T_Q1D ? T_Q1D : q1d;
1460
1461 // Using (\nabla\times u) F = 1/det(dF) dF \hat{\nabla}\times\hat{u} (p. 78 of Monk), we get
1462 // (\nabla\times u) \cdot (\nabla\times v) = 1/det(dF)^2 \hat{\nabla}\times\hat{u}^T dF^T dF \hat{\nabla}\times\hat{v}
1463 // If c = 0, \hat{\nabla}\times\hat{u} reduces to [0, (u_0)_{x_2}, -(u_0)_{x_1}]
1464 // If c = 1, \hat{\nabla}\times\hat{u} reduces to [-(u_1)_{x_2}, 0, (u_1)_{x_0}]
1465 // If c = 2, \hat{\nabla}\times\hat{u} reduces to [(u_2)_{x_1}, -(u_2)_{x_0}, 0]
1466
1467 auto Bo = Reshape(bo.Read(), Q1D, D1D-1);
1468 auto Bc = Reshape(bc.Read(), Q1D, D1D);
1469 auto Gc = Reshape(gc.Read(), Q1D, D1D);
1470 auto op = Reshape(pa_data.Read(), Q1D, Q1D, Q1D, symmetric ? 6 : 9, NE);
1471 auto X = Reshape(x.Read(), 3*(D1D-1)*D1D*D1D, NE);
1472 auto Y = Reshape(y.ReadWrite(), 3*(D1D-1)*D1D*D1D, NE);
1473
1474 const int s = symmetric ? 6 : 9;
1475
1476 auto device_kernel = [=] MFEM_DEVICE (int e)
1477 {
1478 constexpr int VDIM = 3;
1479 constexpr int MD1D = T_D1D ? T_D1D : DofQuadLimits::HCURL_MAX_D1D;
1480 constexpr int MQ1D = T_Q1D ? T_Q1D : DofQuadLimits::HCURL_MAX_Q1D;
1481 const int D1D = T_D1D ? T_D1D : d1d;
1482 const int Q1D = T_Q1D ? T_Q1D : q1d;
1483
1484 MFEM_SHARED real_t sBo[MD1D][MQ1D];
1485 MFEM_SHARED real_t sBc[MD1D][MQ1D];
1486 MFEM_SHARED real_t sGc[MD1D][MQ1D];
1487
1488 real_t ope[9];
1489 MFEM_SHARED real_t sop[9][MQ1D][MQ1D];
1490 MFEM_SHARED real_t curl[MQ1D][MQ1D][3];
1491
1492 MFEM_SHARED real_t sX[MD1D][MD1D][MD1D];
1493
1494 MFEM_FOREACH_THREAD(qx,x,Q1D)
1495 {
1496 MFEM_FOREACH_THREAD(qy,y,Q1D)
1497 {
1498 MFEM_FOREACH_THREAD(qz,z,Q1D)
1499 {
1500 for (int i=0; i<s; ++i)
1501 {
1502 ope[i] = op(qx,qy,qz,i,e);
1503 }
1504 }
1505 }
1506 }
1507
1508 const int tidx = MFEM_THREAD_ID(x);
1509 const int tidy = MFEM_THREAD_ID(y);
1510 const int tidz = MFEM_THREAD_ID(z);
1511
1512 if (tidz == 0)
1513 {
1514 MFEM_FOREACH_THREAD(d,y,D1D)
1515 {
1516 MFEM_FOREACH_THREAD(q,x,Q1D)
1517 {
1518 sBc[d][q] = Bc(q,d);
1519 sGc[d][q] = Gc(q,d);
1520 if (d < D1D-1)
1521 {
1522 sBo[d][q] = Bo(q,d);
1523 }
1524 }
1525 }
1526 }
1527 MFEM_SYNC_THREAD;
1528
1529 for (int qz=0; qz < Q1D; ++qz)
1530 {
1531 if (tidz == qz)
1532 {
1533 MFEM_FOREACH_THREAD(qy,y,Q1D)
1534 {
1535 MFEM_FOREACH_THREAD(qx,x,Q1D)
1536 {
1537 for (int i=0; i<3; ++i)
1538 {
1539 curl[qy][qx][i] = 0.0;
1540 }
1541 }
1542 }
1543 }
1544
1545 int osc = 0;
1546 for (int c = 0; c < VDIM; ++c) // loop over x, y, z components
1547 {
1548 const int D1Dz = (c == 2) ? D1D - 1 : D1D;
1549 const int D1Dy = (c == 1) ? D1D - 1 : D1D;
1550 const int D1Dx = (c == 0) ? D1D - 1 : D1D;
1551
1552 MFEM_FOREACH_THREAD(dz,z,D1Dz)
1553 {
1554 MFEM_FOREACH_THREAD(dy,y,D1Dy)
1555 {
1556 MFEM_FOREACH_THREAD(dx,x,D1Dx)
1557 {
1558 sX[dz][dy][dx] = X(dx + ((dy + (dz * D1Dy)) * D1Dx) + osc, e);
1559 }
1560 }
1561 }
1562 MFEM_SYNC_THREAD;
1563
1564 if (tidz == qz)
1565 {
1566 if (c == 0)
1567 {
1568 for (int i=0; i<s; ++i)
1569 {
1570 sop[i][tidx][tidy] = ope[i];
1571 }
1572 }
1573
1574 MFEM_FOREACH_THREAD(qy,y,Q1D)
1575 {
1576 MFEM_FOREACH_THREAD(qx,x,Q1D)
1577 {
1578 real_t u = 0.0;
1579 real_t v = 0.0;
1580
1581 // We treat x, y, z components separately for optimization specific to each.
1582 if (c == 0) // x component
1583 {
1584 // \hat{\nabla}\times\hat{u} is [0, (u_0)_{x_2}, -(u_0)_{x_1}]
1585
1586 for (int dz = 0; dz < D1Dz; ++dz)
1587 {
1588 const real_t wz = sBc[dz][qz];
1589 const real_t wDz = sGc[dz][qz];
1590
1591 for (int dy = 0; dy < D1Dy; ++dy)
1592 {
1593 const real_t wy = sBc[dy][qy];
1594 const real_t wDy = sGc[dy][qy];
1595
1596 for (int dx = 0; dx < D1Dx; ++dx)
1597 {
1598 const real_t wx = sX[dz][dy][dx] * sBo[dx][qx];
1599 u += wx * wDy * wz;
1600 v += wx * wy * wDz;
1601 }
1602 }
1603 }
1604
1605 curl[qy][qx][1] += v; // (u_0)_{x_2}
1606 if (useAbs) { curl[qy][qx][2] += u; } // +(u_0)_{x_1}
1607 else { curl[qy][qx][2] -= u; } // -(u_0)_{x_1}
1608 }
1609 else if (c == 1) // y component
1610 {
1611 // \hat{\nabla}\times\hat{u} is [-(u_1)_{x_2}, 0, (u_1)_{x_0}]
1612
1613 for (int dz = 0; dz < D1Dz; ++dz)
1614 {
1615 const real_t wz = sBc[dz][qz];
1616 const real_t wDz = sGc[dz][qz];
1617
1618 for (int dy = 0; dy < D1Dy; ++dy)
1619 {
1620 const real_t wy = sBo[dy][qy];
1621
1622 for (int dx = 0; dx < D1Dx; ++dx)
1623 {
1624 const real_t t = sX[dz][dy][dx];
1625 const real_t wx = t * sBc[dx][qx];
1626 const real_t wDx = t * sGc[dx][qx];
1627
1628 u += wDx * wy * wz;
1629 v += wx * wy * wDz;
1630 }
1631 }
1632 }
1633
1634 if (useAbs) { curl[qy][qx][0] += v; } // +(u_1)_{x_2}
1635 else { curl[qy][qx][0] -= v; } // -(u_1)_{x_2}
1636 curl[qy][qx][2] += u; // (u_1)_{x_0}
1637 }
1638 else // z component
1639 {
1640 // \hat{\nabla}\times\hat{u} is [(u_2)_{x_1}, -(u_2)_{x_0}, 0]
1641
1642 for (int dz = 0; dz < D1Dz; ++dz)
1643 {
1644 const real_t wz = sBo[dz][qz];
1645
1646 for (int dy = 0; dy < D1Dy; ++dy)
1647 {
1648 const real_t wy = sBc[dy][qy];
1649 const real_t wDy = sGc[dy][qy];
1650
1651 for (int dx = 0; dx < D1Dx; ++dx)
1652 {
1653 const real_t t = sX[dz][dy][dx];
1654 const real_t wx = t * sBc[dx][qx];
1655 const real_t wDx = t * sGc[dx][qx];
1656
1657 u += wDx * wy * wz;
1658 v += wx * wDy * wz;
1659 }
1660 }
1661 }
1662
1663 curl[qy][qx][0] += v; // (u_2)_{x_1}
1664 if (useAbs) { curl[qy][qx][1] += u; }// +(u_2)_{x_0}
1665 else { curl[qy][qx][1] -= u; } // -(u_2)_{x_0}
1666 }
1667 } // qx
1668 } // qy
1669 } // tidz == qz
1670
1671 osc += D1Dx * D1Dy * D1Dz;
1672 MFEM_SYNC_THREAD;
1673 } // c
1674
1675 real_t dxyz1 = 0.0;
1676 real_t dxyz2 = 0.0;
1677 real_t dxyz3 = 0.0;
1678
1679 MFEM_FOREACH_THREAD(dz,z,D1D)
1680 {
1681 const real_t wcz = sBc[dz][qz];
1682 const real_t wcDz = sGc[dz][qz];
1683 const real_t wz = (dz < D1D-1) ? sBo[dz][qz] : 0.0;
1684
1685 MFEM_FOREACH_THREAD(dy,y,D1D)
1686 {
1687 MFEM_FOREACH_THREAD(dx,x,D1D)
1688 {
1689 for (int qy = 0; qy < Q1D; ++qy)
1690 {
1691 const real_t wcy = sBc[dy][qy];
1692 const real_t wcDy = sGc[dy][qy];
1693 const real_t wy = (dy < D1D-1) ? sBo[dy][qy] : 0.0;
1694
1695 for (int qx = 0; qx < Q1D; ++qx)
1696 {
1697 const real_t O11 = sop[0][qx][qy];
1698 const real_t O12 = sop[1][qx][qy];
1699 const real_t O13 = sop[2][qx][qy];
1700 const real_t O21 = symmetric ? O12 : sop[3][qx][qy];
1701 const real_t O22 = symmetric ? sop[3][qx][qy] : sop[4][qx][qy];
1702 const real_t O23 = symmetric ? sop[4][qx][qy] : sop[5][qx][qy];
1703 const real_t O31 = symmetric ? O13 : sop[6][qx][qy];
1704 const real_t O32 = symmetric ? O23 : sop[7][qx][qy];
1705 const real_t O33 = symmetric ? sop[5][qx][qy] : sop[8][qx][qy];
1706
1707 const real_t c1 = (O11 * curl[qy][qx][0]) + (O12 * curl[qy][qx][1]) +
1708 (O13 * curl[qy][qx][2]);
1709 const real_t c2 = (O21 * curl[qy][qx][0]) + (O22 * curl[qy][qx][1]) +
1710 (O23 * curl[qy][qx][2]);
1711 const real_t c3 = (O31 * curl[qy][qx][0]) + (O32 * curl[qy][qx][1]) +
1712 (O33 * curl[qy][qx][2]);
1713
1714 const real_t wcx = sBc[dx][qx];
1715 const real_t wDx = sGc[dx][qx];
1716
1717 if (dx < D1D-1)
1718 {
1719 // \hat{\nabla}\times\hat{u} is [0, (u_0)_{x_2}, -(u_0)_{x_1}]
1720 const real_t wx = sBo[dx][qx];
1721 if (useAbs)
1722 {
1723 // (u_0)_{x_2} * (op * curl)_1 +
1724 // (u_0)_{x_1} * (op * curl)_2
1725 dxyz1 += (wx * c2 * wcy * wcDz) +
1726 (wx * c3 * wcDy * wcz);
1727 }
1728 else
1729 {
1730 // (u_0)_{x_2} * (op * curl)_1 -
1731 // (u_0)_{x_1} * (op * curl)_2
1732 dxyz1 += (wx * c2 * wcy * wcDz) -
1733 (wx * c3 * wcDy * wcz);
1734 }
1735 }
1736
1737 // \hat{\nabla}\times\hat{u} is [-(u_1)_{x_2}, 0, (u_1)_{x_0}]
1738 if (useAbs)
1739 {
1740 // +(u_1)_{x_2} * (op * curl)_0 +
1741 // (u_1)_{x_0} * (op * curl)_2
1742 dxyz2 += (wy * c1 * wcx * wcDz) +
1743 (wy * c3 * wDx * wcz);
1744 }
1745 else
1746 {
1747 // -(u_1)_{x_2} * (op * curl)_0 +
1748 // (u_1)_{x_0} * (op * curl)_2
1749 dxyz2 += (-wy * c1 * wcx * wcDz) +
1750 (wy * c3 * wDx * wcz);
1751 }
1752
1753 // \hat{\nabla}\times\hat{u} is [(u_2)_{x_1}, -(u_2)_{x_0}, 0]
1754 if (useAbs)
1755 {
1756 // (u_2)_{x_1} * (op * curl)_0 +
1757 // (u_2)_{x_0} * (op * curl)_1
1758 dxyz3 += (wcDy * wz * c1 * wcx) +
1759 (wcy * wz * c2 * wDx);
1760 }
1761 else
1762 {
1763 // (u_2)_{x_1} * (op * curl)_0 -
1764 // (u_2)_{x_0} * (op * curl)_1
1765 dxyz3 += (wcDy * wz * c1 * wcx) -
1766 (wcy * wz * c2 * wDx);
1767 }
1768 } // qx
1769 } // qy
1770 } // dx
1771 } // dy
1772 } // dz
1773
1774 MFEM_SYNC_THREAD;
1775
1776 MFEM_FOREACH_THREAD(dz,z,D1D)
1777 {
1778 MFEM_FOREACH_THREAD(dy,y,D1D)
1779 {
1780 MFEM_FOREACH_THREAD(dx,x,D1D)
1781 {
1782 if (dx < D1D-1)
1783 {
1784 Y(dx + ((dy + (dz * D1D)) * (D1D-1)), e) += dxyz1;
1785 }
1786 if (dy < D1D-1)
1787 {
1788 Y(dx + ((dy + (dz * (D1D-1))) * D1D) + ((D1D-1)*D1D*D1D), e) += dxyz2;
1789 }
1790 if (dz < D1D-1)
1791 {
1792 Y(dx + ((dy + (dz * D1D)) * D1D) + (2*(D1D-1)*D1D*D1D), e) += dxyz3;
1793 }
1794 }
1795 }
1796 }
1797 } // qz
1798 }; // end of element loop
1799
1800 auto host_kernel = [&] MFEM_LAMBDA (int)
1801 {
1802 MFEM_ABORT_KERNEL("This kernel should only be used on GPU.");
1803 };
1804
1805 ForallWrap<3>(true, NE, device_kernel, host_kernel, Q1D, Q1D, Q1D);
1806}
1807
1808// PA H(curl)-L2 Assemble 2D kernel
1809void PAHcurlL2Setup2D(const int Q1D,
1810 const int NE,
1811 const Array<real_t> &w,
1812 Vector &coeff,
1813 Vector &op);
1814
1815// PA H(curl)-L2 Assemble 3D kernel
1816void PAHcurlL2Setup3D(const int NQ,
1817 const int coeffDim,
1818 const int NE,
1819 const Array<real_t> &w,
1820 Vector &coeff,
1821 Vector &op);
1822
1823// PA H(curl)-L2 Apply 2D kernel
1824void PAHcurlL2Apply2D(const int D1D,
1825 const int D1Dtest,
1826 const int Q1D,
1827 const int NE,
1828 const Array<real_t> &bo,
1829 const Array<real_t> &bot,
1830 const Array<real_t> &bt,
1831 const Array<real_t> &gc,
1832 const Vector &pa_data,
1833 const Vector &x,
1834 Vector &y);
1835
1836// PA H(curl)-L2 Apply Transpose 2D kernel
1837void PAHcurlL2ApplyTranspose2D(const int D1D,
1838 const int D1Dtest,
1839 const int Q1D,
1840 const int NE,
1841 const Array<real_t> &bo,
1842 const Array<real_t> &bot,
1843 const Array<real_t> &b,
1844 const Array<real_t> &gct,
1845 const Vector &pa_data,
1846 const Vector &x,
1847 Vector &y);
1848
1849// PA H(curl)-L2 Apply 3D kernel
1850template<int T_D1D = 0, int T_Q1D = 0>
1851inline void PAHcurlL2Apply3D(const int d1d,
1852 const int q1d,
1853 const int coeffDim,
1854 const int NE,
1855 const Array<real_t> &bo,
1856 const Array<real_t> &bc,
1857 const Array<real_t> &bot,
1858 const Array<real_t> &bct,
1859 const Array<real_t> &gc,
1860 const Vector &pa_data,
1861 const Vector &x,
1862 Vector &y)
1863{
1864 MFEM_VERIFY(T_D1D || d1d <= DeviceDofQuadLimits::Get().HCURL_MAX_D1D,
1865 "Error: d1d > HCURL_MAX_D1D");
1866 MFEM_VERIFY(T_Q1D || q1d <= DeviceDofQuadLimits::Get().HCURL_MAX_Q1D,
1867 "Error: q1d > HCURL_MAX_Q1D");
1868 const int D1D = T_D1D ? T_D1D : d1d;
1869 const int Q1D = T_Q1D ? T_Q1D : q1d;
1870
1871 auto Bo = Reshape(bo.Read(), Q1D, D1D-1);
1872 auto Bc = Reshape(bc.Read(), Q1D, D1D);
1873 auto Bot = Reshape(bot.Read(), D1D-1, Q1D);
1874 auto Bct = Reshape(bct.Read(), D1D, Q1D);
1875 auto Gc = Reshape(gc.Read(), Q1D, D1D);
1876 auto op = Reshape(pa_data.Read(), coeffDim, Q1D, Q1D, Q1D, NE);
1877 auto X = Reshape(x.Read(), 3*(D1D-1)*D1D*D1D, NE);
1878 auto Y = Reshape(y.ReadWrite(), 3*(D1D-1)*D1D*D1D, NE);
1879
1880 mfem::forall(NE, [=] MFEM_HOST_DEVICE (int e)
1881 {
1882 // Using u = dF^{-T} \hat{u} and (\nabla\times u) F =
1883 // 1/det(dF) dF \hat{\nabla}\times\hat{u} (p. 78 of Monk), we get:
1884 // (\nabla\times u) \cdot v
1885 // = 1/det(dF) \hat{\nabla}\times\hat{u}^T dF^T dF^{-T} \hat{v}
1886 // = 1/det(dF) \hat{\nabla}\times\hat{u}^T \hat{v}
1887 // If c = 0, \hat{\nabla}\times\hat{u} reduces to [0, (u_0)_{x_2}, -(u_0)_{x_1}]
1888 // If c = 1, \hat{\nabla}\times\hat{u} reduces to [-(u_1)_{x_2}, 0, (u_1)_{x_0}]
1889 // If c = 2, \hat{\nabla}\times\hat{u} reduces to [(u_2)_{x_1}, -(u_2)_{x_0}, 0]
1890
1891 constexpr int VDIM = 3;
1892 constexpr int MD1D = T_D1D ? T_D1D : DofQuadLimits::HCURL_MAX_D1D;
1893 constexpr int MQ1D = T_Q1D ? T_Q1D : DofQuadLimits::HCURL_MAX_Q1D;
1894 const int D1D = T_D1D ? T_D1D : d1d;
1895 const int Q1D = T_Q1D ? T_Q1D : q1d;
1896
1897 real_t curl[MQ1D][MQ1D][MQ1D][VDIM];
1898 // curl[qz][qy][qx] will be computed as the vector curl at each quadrature point.
1899
1900 for (int qz = 0; qz < Q1D; ++qz)
1901 {
1902 for (int qy = 0; qy < Q1D; ++qy)
1903 {
1904 for (int qx = 0; qx < Q1D; ++qx)
1905 {
1906 for (int c = 0; c < VDIM; ++c)
1907 {
1908 curl[qz][qy][qx][c] = 0.0;
1909 }
1910 }
1911 }
1912 }
1913
1914 // We treat x, y, z components separately for optimization specific to each.
1915
1916 int osc = 0;
1917
1918 {
1919 // x component
1920 const int D1Dz = D1D;
1921 const int D1Dy = D1D;
1922 const int D1Dx = D1D - 1;
1923
1924 for (int dz = 0; dz < D1Dz; ++dz)
1925 {
1926 real_t gradXY[MQ1D][MQ1D][2];
1927 for (int qy = 0; qy < Q1D; ++qy)
1928 {
1929 for (int qx = 0; qx < Q1D; ++qx)
1930 {
1931 for (int d = 0; d < 2; ++d)
1932 {
1933 gradXY[qy][qx][d] = 0.0;
1934 }
1935 }
1936 }
1937
1938 for (int dy = 0; dy < D1Dy; ++dy)
1939 {
1940 real_t massX[MQ1D];
1941 for (int qx = 0; qx < Q1D; ++qx)
1942 {
1943 massX[qx] = 0.0;
1944 }
1945
1946 for (int dx = 0; dx < D1Dx; ++dx)
1947 {
1948 const real_t t = X(dx + ((dy + (dz * D1Dy)) * D1Dx) + osc, e);
1949 for (int qx = 0; qx < Q1D; ++qx)
1950 {
1951 massX[qx] += t * Bo(qx,dx);
1952 }
1953 }
1954
1955 for (int qy = 0; qy < Q1D; ++qy)
1956 {
1957 const real_t wy = Bc(qy,dy);
1958 const real_t wDy = Gc(qy,dy);
1959 for (int qx = 0; qx < Q1D; ++qx)
1960 {
1961 const real_t wx = massX[qx];
1962 gradXY[qy][qx][0] += wx * wDy;
1963 gradXY[qy][qx][1] += wx * wy;
1964 }
1965 }
1966 }
1967
1968 for (int qz = 0; qz < Q1D; ++qz)
1969 {
1970 const real_t wz = Bc(qz,dz);
1971 const real_t wDz = Gc(qz,dz);
1972 for (int qy = 0; qy < Q1D; ++qy)
1973 {
1974 for (int qx = 0; qx < Q1D; ++qx)
1975 {
1976 // \hat{\nabla}\times\hat{u} is [0, (u_0)_{x_2}, -(u_0)_{x_1}]
1977 curl[qz][qy][qx][1] += gradXY[qy][qx][1] * wDz; // (u_0)_{x_2}
1978 curl[qz][qy][qx][2] -= gradXY[qy][qx][0] * wz; // -(u_0)_{x_1}
1979 }
1980 }
1981 }
1982 }
1983
1984 osc += D1Dx * D1Dy * D1Dz;
1985 }
1986
1987 {
1988 // y component
1989 const int D1Dz = D1D;
1990 const int D1Dy = D1D - 1;
1991 const int D1Dx = D1D;
1992
1993 for (int dz = 0; dz < D1Dz; ++dz)
1994 {
1995 real_t gradXY[MQ1D][MQ1D][2];
1996 for (int qy = 0; qy < Q1D; ++qy)
1997 {
1998 for (int qx = 0; qx < Q1D; ++qx)
1999 {
2000 for (int d = 0; d < 2; ++d)
2001 {
2002 gradXY[qy][qx][d] = 0.0;
2003 }
2004 }
2005 }
2006
2007 for (int dx = 0; dx < D1Dx; ++dx)
2008 {
2009 real_t massY[MQ1D];
2010 for (int qy = 0; qy < Q1D; ++qy)
2011 {
2012 massY[qy] = 0.0;
2013 }
2014
2015 for (int dy = 0; dy < D1Dy; ++dy)
2016 {
2017 const real_t t = X(dx + ((dy + (dz * D1Dy)) * D1Dx) + osc, e);
2018 for (int qy = 0; qy < Q1D; ++qy)
2019 {
2020 massY[qy] += t * Bo(qy,dy);
2021 }
2022 }
2023
2024 for (int qx = 0; qx < Q1D; ++qx)
2025 {
2026 const real_t wx = Bc(qx,dx);
2027 const real_t wDx = Gc(qx,dx);
2028 for (int qy = 0; qy < Q1D; ++qy)
2029 {
2030 const real_t wy = massY[qy];
2031 gradXY[qy][qx][0] += wDx * wy;
2032 gradXY[qy][qx][1] += wx * wy;
2033 }
2034 }
2035 }
2036
2037 for (int qz = 0; qz < Q1D; ++qz)
2038 {
2039 const real_t wz = Bc(qz,dz);
2040 const real_t wDz = Gc(qz,dz);
2041 for (int qy = 0; qy < Q1D; ++qy)
2042 {
2043 for (int qx = 0; qx < Q1D; ++qx)
2044 {
2045 // \hat{\nabla}\times\hat{u} is [-(u_1)_{x_2}, 0, (u_1)_{x_0}]
2046 curl[qz][qy][qx][0] -= gradXY[qy][qx][1] * wDz; // -(u_1)_{x_2}
2047 curl[qz][qy][qx][2] += gradXY[qy][qx][0] * wz; // (u_1)_{x_0}
2048 }
2049 }
2050 }
2051 }
2052
2053 osc += D1Dx * D1Dy * D1Dz;
2054 }
2055
2056 {
2057 // z component
2058 const int D1Dz = D1D - 1;
2059 const int D1Dy = D1D;
2060 const int D1Dx = D1D;
2061
2062 for (int dx = 0; dx < D1Dx; ++dx)
2063 {
2064 real_t gradYZ[MQ1D][MQ1D][2];
2065 for (int qz = 0; qz < Q1D; ++qz)
2066 {
2067 for (int qy = 0; qy < Q1D; ++qy)
2068 {
2069 for (int d = 0; d < 2; ++d)
2070 {
2071 gradYZ[qz][qy][d] = 0.0;
2072 }
2073 }
2074 }
2075
2076 for (int dy = 0; dy < D1Dy; ++dy)
2077 {
2078 real_t massZ[MQ1D];
2079 for (int qz = 0; qz < Q1D; ++qz)
2080 {
2081 massZ[qz] = 0.0;
2082 }
2083
2084 for (int dz = 0; dz < D1Dz; ++dz)
2085 {
2086 const real_t t = X(dx + ((dy + (dz * D1Dy)) * D1Dx) + osc, e);
2087 for (int qz = 0; qz < Q1D; ++qz)
2088 {
2089 massZ[qz] += t * Bo(qz,dz);
2090 }
2091 }
2092
2093 for (int qy = 0; qy < Q1D; ++qy)
2094 {
2095 const real_t wy = Bc(qy,dy);
2096 const real_t wDy = Gc(qy,dy);
2097 for (int qz = 0; qz < Q1D; ++qz)
2098 {
2099 const real_t wz = massZ[qz];
2100 gradYZ[qz][qy][0] += wz * wy;
2101 gradYZ[qz][qy][1] += wz * wDy;
2102 }
2103 }
2104 }
2105
2106 for (int qx = 0; qx < Q1D; ++qx)
2107 {
2108 const real_t wx = Bc(qx,dx);
2109 const real_t wDx = Gc(qx,dx);
2110
2111 for (int qy = 0; qy < Q1D; ++qy)
2112 {
2113 for (int qz = 0; qz < Q1D; ++qz)
2114 {
2115 // \hat{\nabla}\times\hat{u} is [(u_2)_{x_1}, -(u_2)_{x_0}, 0]
2116 curl[qz][qy][qx][0] += gradYZ[qz][qy][1] * wx; // (u_2)_{x_1}
2117 curl[qz][qy][qx][1] -= gradYZ[qz][qy][0] * wDx; // -(u_2)_{x_0}
2118 }
2119 }
2120 }
2121 }
2122 }
2123
2124 // Apply D operator.
2125 for (int qz = 0; qz < Q1D; ++qz)
2126 {
2127 for (int qy = 0; qy < Q1D; ++qy)
2128 {
2129 for (int qx = 0; qx < Q1D; ++qx)
2130 {
2131 const real_t O11 = op(0,qx,qy,qz,e);
2132 if (coeffDim == 1)
2133 {
2134 for (int c = 0; c < VDIM; ++c)
2135 {
2136 curl[qz][qy][qx][c] *= O11;
2137 }
2138 }
2139 else
2140 {
2141 const real_t O21 = op(1,qx,qy,qz,e);
2142 const real_t O31 = op(2,qx,qy,qz,e);
2143 const real_t O12 = op(3,qx,qy,qz,e);
2144 const real_t O22 = op(4,qx,qy,qz,e);
2145 const real_t O32 = op(5,qx,qy,qz,e);
2146 const real_t O13 = op(6,qx,qy,qz,e);
2147 const real_t O23 = op(7,qx,qy,qz,e);
2148 const real_t O33 = op(8,qx,qy,qz,e);
2149 const real_t curlX = curl[qz][qy][qx][0];
2150 const real_t curlY = curl[qz][qy][qx][1];
2151 const real_t curlZ = curl[qz][qy][qx][2];
2152 curl[qz][qy][qx][0] = (O11*curlX)+(O12*curlY)+(O13*curlZ);
2153 curl[qz][qy][qx][1] = (O21*curlX)+(O22*curlY)+(O23*curlZ);
2154 curl[qz][qy][qx][2] = (O31*curlX)+(O32*curlY)+(O33*curlZ);
2155 }
2156 }
2157 }
2158 }
2159
2160 for (int qz = 0; qz < Q1D; ++qz)
2161 {
2162 real_t massXY[MD1D][MD1D];
2163
2164 osc = 0;
2165
2166 for (int c = 0; c < VDIM; ++c) // loop over x, y, z components
2167 {
2168 const int D1Dz = (c == 2) ? D1D - 1 : D1D;
2169 const int D1Dy = (c == 1) ? D1D - 1 : D1D;
2170 const int D1Dx = (c == 0) ? D1D - 1 : D1D;
2171
2172 for (int dy = 0; dy < D1Dy; ++dy)
2173 {
2174 for (int dx = 0; dx < D1Dx; ++dx)
2175 {
2176 massXY[dy][dx] = 0;
2177 }
2178 }
2179 for (int qy = 0; qy < Q1D; ++qy)
2180 {
2181 real_t massX[MD1D];
2182 for (int dx = 0; dx < D1Dx; ++dx)
2183 {
2184 massX[dx] = 0.0;
2185 }
2186 for (int qx = 0; qx < Q1D; ++qx)
2187 {
2188 for (int dx = 0; dx < D1Dx; ++dx)
2189 {
2190 massX[dx] += curl[qz][qy][qx][c] * ((c == 0) ? Bot(dx,qx) : Bct(dx,qx));
2191 }
2192 }
2193
2194 for (int dy = 0; dy < D1Dy; ++dy)
2195 {
2196 const real_t wy = (c == 1) ? Bot(dy,qy) : Bct(dy,qy);
2197 for (int dx = 0; dx < D1Dx; ++dx)
2198 {
2199 massXY[dy][dx] += massX[dx] * wy;
2200 }
2201 }
2202 }
2203
2204 for (int dz = 0; dz < D1Dz; ++dz)
2205 {
2206 const real_t wz = (c == 2) ? Bot(dz,qz) : Bct(dz,qz);
2207 for (int dy = 0; dy < D1Dy; ++dy)
2208 {
2209 for (int dx = 0; dx < D1Dx; ++dx)
2210 {
2211 Y(dx + ((dy + (dz * D1Dy)) * D1Dx) + osc, e) += massXY[dy][dx] * wz;
2212 }
2213 }
2214 }
2215
2216 osc += D1Dx * D1Dy * D1Dz;
2217 } // loop c
2218 } // loop qz
2219 }); // end of element loop
2220}
2221
2222// Shared memory PA H(curl)-L2 Apply 3D kernel
2223template<int T_D1D = 0, int T_Q1D = 0>
2224inline void SmemPAHcurlL2Apply3D(const int d1d,
2225 const int q1d,
2226 const int coeffDim,
2227 const int NE,
2228 const Array<real_t> &bo,
2229 const Array<real_t> &bc,
2230 const Array<real_t> &gc,
2231 const Vector &pa_data,
2232 const Vector &x,
2233 Vector &y)
2234{
2235 MFEM_VERIFY(T_D1D || d1d <= DeviceDofQuadLimits::Get().HCURL_MAX_D1D,
2236 "Error: d1d > HCURL_MAX_D1D");
2237 MFEM_VERIFY(T_Q1D || q1d <= DeviceDofQuadLimits::Get().HCURL_MAX_Q1D,
2238 "Error: q1d > HCURL_MAX_Q1D");
2239 const int D1D = T_D1D ? T_D1D : d1d;
2240 const int Q1D = T_Q1D ? T_Q1D : q1d;
2241
2242 auto Bo = Reshape(bo.Read(), Q1D, D1D-1);
2243 auto Bc = Reshape(bc.Read(), Q1D, D1D);
2244 auto Gc = Reshape(gc.Read(), Q1D, D1D);
2245 auto op = Reshape(pa_data.Read(), coeffDim, Q1D, Q1D, Q1D, NE);
2246 auto X = Reshape(x.Read(), 3*(D1D-1)*D1D*D1D, NE);
2247 auto Y = Reshape(y.ReadWrite(), 3*(D1D-1)*D1D*D1D, NE);
2248
2249 auto device_kernel = [=] MFEM_DEVICE (int e)
2250 {
2251 constexpr int VDIM = 3;
2252 constexpr int maxCoeffDim = 9;
2253 constexpr int MD1D = T_D1D ? T_D1D : DofQuadLimits::HCURL_MAX_D1D;
2254 constexpr int MQ1D = T_Q1D ? T_Q1D : DofQuadLimits::HCURL_MAX_Q1D;
2255 const int D1D = T_D1D ? T_D1D : d1d;
2256 const int Q1D = T_Q1D ? T_Q1D : q1d;
2257
2258 MFEM_SHARED real_t sBo[MD1D][MQ1D];
2259 MFEM_SHARED real_t sBc[MD1D][MQ1D];
2260 MFEM_SHARED real_t sGc[MD1D][MQ1D];
2261
2262 real_t opc[maxCoeffDim];
2263 MFEM_SHARED real_t sop[maxCoeffDim][MQ1D][MQ1D];
2264 MFEM_SHARED real_t curl[MQ1D][MQ1D][3];
2265
2266 MFEM_SHARED real_t sX[MD1D][MD1D][MD1D];
2267
2268 MFEM_FOREACH_THREAD(qx,x,Q1D)
2269 {
2270 MFEM_FOREACH_THREAD(qy,y,Q1D)
2271 {
2272 MFEM_FOREACH_THREAD(qz,z,Q1D)
2273 {
2274 for (int i=0; i<coeffDim; ++i)
2275 {
2276 opc[i] = op(i,qx,qy,qz,e);
2277 }
2278 }
2279 }
2280 }
2281
2282 const int tidx = MFEM_THREAD_ID(x);
2283 const int tidy = MFEM_THREAD_ID(y);
2284 const int tidz = MFEM_THREAD_ID(z);
2285
2286 if (tidz == 0)
2287 {
2288 MFEM_FOREACH_THREAD(d,y,D1D)
2289 {
2290 MFEM_FOREACH_THREAD(q,x,Q1D)
2291 {
2292 sBc[d][q] = Bc(q,d);
2293 sGc[d][q] = Gc(q,d);
2294 if (d < D1D-1)
2295 {
2296 sBo[d][q] = Bo(q,d);
2297 }
2298 }
2299 }
2300 }
2301 MFEM_SYNC_THREAD;
2302
2303 for (int qz=0; qz < Q1D; ++qz)
2304 {
2305 if (tidz == qz)
2306 {
2307 MFEM_FOREACH_THREAD(qy,y,Q1D)
2308 {
2309 MFEM_FOREACH_THREAD(qx,x,Q1D)
2310 {
2311 for (int i=0; i<3; ++i)
2312 {
2313 curl[qy][qx][i] = 0.0;
2314 }
2315 }
2316 }
2317 }
2318
2319 int osc = 0;
2320 for (int c = 0; c < VDIM; ++c) // loop over x, y, z components
2321 {
2322 const int D1Dz = (c == 2) ? D1D - 1 : D1D;
2323 const int D1Dy = (c == 1) ? D1D - 1 : D1D;
2324 const int D1Dx = (c == 0) ? D1D - 1 : D1D;
2325
2326 MFEM_FOREACH_THREAD(dz,z,D1Dz)
2327 {
2328 MFEM_FOREACH_THREAD(dy,y,D1Dy)
2329 {
2330 MFEM_FOREACH_THREAD(dx,x,D1Dx)
2331 {
2332 sX[dz][dy][dx] = X(dx + ((dy + (dz * D1Dy)) * D1Dx) + osc, e);
2333 }
2334 }
2335 }
2336 MFEM_SYNC_THREAD;
2337
2338 if (tidz == qz)
2339 {
2340 if (c == 0)
2341 {
2342 for (int i=0; i<coeffDim; ++i)
2343 {
2344 sop[i][tidx][tidy] = opc[i];
2345 }
2346 }
2347
2348 MFEM_FOREACH_THREAD(qy,y,Q1D)
2349 {
2350 MFEM_FOREACH_THREAD(qx,x,Q1D)
2351 {
2352 real_t u = 0.0;
2353 real_t v = 0.0;
2354
2355 // We treat x, y, z components separately for optimization specific to each.
2356 if (c == 0) // x component
2357 {
2358 // \hat{\nabla}\times\hat{u} is [0, (u_0)_{x_2}, -(u_0)_{x_1}]
2359
2360 for (int dz = 0; dz < D1Dz; ++dz)
2361 {
2362 const real_t wz = sBc[dz][qz];
2363 const real_t wDz = sGc[dz][qz];
2364
2365 for (int dy = 0; dy < D1Dy; ++dy)
2366 {
2367 const real_t wy = sBc[dy][qy];
2368 const real_t wDy = sGc[dy][qy];
2369
2370 for (int dx = 0; dx < D1Dx; ++dx)
2371 {
2372 const real_t wx = sX[dz][dy][dx] * sBo[dx][qx];
2373 u += wx * wDy * wz;
2374 v += wx * wy * wDz;
2375 }
2376 }
2377 }
2378
2379 curl[qy][qx][1] += v; // (u_0)_{x_2}
2380 curl[qy][qx][2] -= u; // -(u_0)_{x_1}
2381 }
2382 else if (c == 1) // y component
2383 {
2384 // \hat{\nabla}\times\hat{u} is [-(u_1)_{x_2}, 0, (u_1)_{x_0}]
2385
2386 for (int dz = 0; dz < D1Dz; ++dz)
2387 {
2388 const real_t wz = sBc[dz][qz];
2389 const real_t wDz = sGc[dz][qz];
2390
2391 for (int dy = 0; dy < D1Dy; ++dy)
2392 {
2393 const real_t wy = sBo[dy][qy];
2394
2395 for (int dx = 0; dx < D1Dx; ++dx)
2396 {
2397 const real_t t = sX[dz][dy][dx];
2398 const real_t wx = t * sBc[dx][qx];
2399 const real_t wDx = t * sGc[dx][qx];
2400
2401 u += wDx * wy * wz;
2402 v += wx * wy * wDz;
2403 }
2404 }
2405 }
2406
2407 curl[qy][qx][0] -= v; // -(u_1)_{x_2}
2408 curl[qy][qx][2] += u; // (u_1)_{x_0}
2409 }
2410 else // z component
2411 {
2412 // \hat{\nabla}\times\hat{u} is [(u_2)_{x_1}, -(u_2)_{x_0}, 0]
2413
2414 for (int dz = 0; dz < D1Dz; ++dz)
2415 {
2416 const real_t wz = sBo[dz][qz];
2417
2418 for (int dy = 0; dy < D1Dy; ++dy)
2419 {
2420 const real_t wy = sBc[dy][qy];
2421 const real_t wDy = sGc[dy][qy];
2422
2423 for (int dx = 0; dx < D1Dx; ++dx)
2424 {
2425 const real_t t = sX[dz][dy][dx];
2426 const real_t wx = t * sBc[dx][qx];
2427 const real_t wDx = t * sGc[dx][qx];
2428
2429 u += wDx * wy * wz;
2430 v += wx * wDy * wz;
2431 }
2432 }
2433 }
2434
2435 curl[qy][qx][0] += v; // (u_2)_{x_1}
2436 curl[qy][qx][1] -= u; // -(u_2)_{x_0}
2437 }
2438 } // qx
2439 } // qy
2440 } // tidz == qz
2441
2442 osc += D1Dx * D1Dy * D1Dz;
2443 MFEM_SYNC_THREAD;
2444 } // c
2445
2446 real_t dxyz1 = 0.0;
2447 real_t dxyz2 = 0.0;
2448 real_t dxyz3 = 0.0;
2449
2450 MFEM_FOREACH_THREAD(dz,z,D1D)
2451 {
2452 const real_t wcz = sBc[dz][qz];
2453 const real_t wz = (dz < D1D-1) ? sBo[dz][qz] : 0.0;
2454
2455 MFEM_FOREACH_THREAD(dy,y,D1D)
2456 {
2457 MFEM_FOREACH_THREAD(dx,x,D1D)
2458 {
2459 for (int qy = 0; qy < Q1D; ++qy)
2460 {
2461 const real_t wcy = sBc[dy][qy];
2462 const real_t wy = (dy < D1D-1) ? sBo[dy][qy] : 0.0;
2463
2464 for (int qx = 0; qx < Q1D; ++qx)
2465 {
2466 const real_t O11 = sop[0][qx][qy];
2467 real_t c1, c2, c3;
2468 if (coeffDim == 1)
2469 {
2470 c1 = O11 * curl[qy][qx][0];
2471 c2 = O11 * curl[qy][qx][1];
2472 c3 = O11 * curl[qy][qx][2];
2473 }
2474 else
2475 {
2476 const real_t O21 = sop[1][qx][qy];
2477 const real_t O31 = sop[2][qx][qy];
2478 const real_t O12 = sop[3][qx][qy];
2479 const real_t O22 = sop[4][qx][qy];
2480 const real_t O32 = sop[5][qx][qy];
2481 const real_t O13 = sop[6][qx][qy];
2482 const real_t O23 = sop[7][qx][qy];
2483 const real_t O33 = sop[8][qx][qy];
2484 c1 = (O11*curl[qy][qx][0])+(O12*curl[qy][qx][1])+(O13*curl[qy][qx][2]);
2485 c2 = (O21*curl[qy][qx][0])+(O22*curl[qy][qx][1])+(O23*curl[qy][qx][2]);
2486 c3 = (O31*curl[qy][qx][0])+(O32*curl[qy][qx][1])+(O33*curl[qy][qx][2]);
2487 }
2488
2489 const real_t wcx = sBc[dx][qx];
2490
2491 if (dx < D1D-1)
2492 {
2493 const real_t wx = sBo[dx][qx];
2494 dxyz1 += c1 * wx * wcy * wcz;
2495 }
2496
2497 dxyz2 += c2 * wcx * wy * wcz;
2498 dxyz3 += c3 * wcx * wcy * wz;
2499 } // qx
2500 } // qy
2501 } // dx
2502 } // dy
2503 } // dz
2504
2505 MFEM_SYNC_THREAD;
2506
2507 MFEM_FOREACH_THREAD(dz,z,D1D)
2508 {
2509 MFEM_FOREACH_THREAD(dy,y,D1D)
2510 {
2511 MFEM_FOREACH_THREAD(dx,x,D1D)
2512 {
2513 if (dx < D1D-1)
2514 {
2515 Y(dx + ((dy + (dz * D1D)) * (D1D-1)), e) += dxyz1;
2516 }
2517 if (dy < D1D-1)
2518 {
2519 Y(dx + ((dy + (dz * (D1D-1))) * D1D) + ((D1D-1)*D1D*D1D), e) += dxyz2;
2520 }
2521 if (dz < D1D-1)
2522 {
2523 Y(dx + ((dy + (dz * D1D)) * D1D) + (2*(D1D-1)*D1D*D1D), e) += dxyz3;
2524 }
2525 }
2526 }
2527 }
2528 } // qz
2529 }; // end of element loop
2530
2531 auto host_kernel = [&] MFEM_LAMBDA (int)
2532 {
2533 MFEM_ABORT_KERNEL("This kernel should only be used on GPU.");
2534 };
2535
2536 ForallWrap<3>(true, NE, device_kernel, host_kernel, Q1D, Q1D, Q1D);
2537}
2538
2539// PA H(curl)-L2 Apply Transpose 3D kernel
2540template<int T_D1D = 0, int T_Q1D = 0>
2541inline void PAHcurlL2ApplyTranspose3D(const int d1d,
2542 const int q1d,
2543 const int coeffDim,
2544 const int NE,
2545 const Array<real_t> &bo,
2546 const Array<real_t> &bc,
2547 const Array<real_t> &bot,
2548 const Array<real_t> &bct,
2549 const Array<real_t> &gct,
2550 const Vector &pa_data,
2551 const Vector &x,
2552 Vector &y)
2553{
2554 // See PAHcurlL2Apply3D for comments.
2555 MFEM_VERIFY(T_D1D || d1d <= DeviceDofQuadLimits::Get().HCURL_MAX_D1D,
2556 "Error: d1d > HCURL_MAX_D1D");
2557 MFEM_VERIFY(T_Q1D || q1d <= DeviceDofQuadLimits::Get().HCURL_MAX_Q1D,
2558 "Error: q1d > HCURL_MAX_Q1D");
2559 const int D1D = T_D1D ? T_D1D : d1d;
2560 const int Q1D = T_Q1D ? T_Q1D : q1d;
2561
2562 auto Bo = Reshape(bo.Read(), Q1D, D1D-1);
2563 auto Bc = Reshape(bc.Read(), Q1D, D1D);
2564 auto Bot = Reshape(bot.Read(), D1D-1, Q1D);
2565 auto Bct = Reshape(bct.Read(), D1D, Q1D);
2566 auto Gct = Reshape(gct.Read(), D1D, Q1D);
2567 auto op = Reshape(pa_data.Read(), coeffDim, Q1D, Q1D, Q1D, NE);
2568 auto X = Reshape(x.Read(), 3*(D1D-1)*D1D*D1D, NE);
2569 auto Y = Reshape(y.ReadWrite(), 3*(D1D-1)*D1D*D1D, NE);
2570
2571 mfem::forall(NE, [=] MFEM_HOST_DEVICE (int e)
2572 {
2573 constexpr int VDIM = 3;
2574 constexpr int MD1D = T_D1D ? T_D1D : DofQuadLimits::HCURL_MAX_D1D;
2575 constexpr int MQ1D = T_Q1D ? T_Q1D : DofQuadLimits::HCURL_MAX_Q1D;
2576 const int D1D = T_D1D ? T_D1D : d1d;
2577 const int Q1D = T_Q1D ? T_Q1D : q1d;
2578
2579 real_t mass[MQ1D][MQ1D][MQ1D][VDIM];
2580
2581 for (int qz = 0; qz < Q1D; ++qz)
2582 {
2583 for (int qy = 0; qy < Q1D; ++qy)
2584 {
2585 for (int qx = 0; qx < Q1D; ++qx)
2586 {
2587 for (int c = 0; c < VDIM; ++c)
2588 {
2589 mass[qz][qy][qx][c] = 0.0;
2590 }
2591 }
2592 }
2593 }
2594
2595 int osc = 0;
2596
2597 for (int c = 0; c < VDIM; ++c) // loop over x, y, z components
2598 {
2599 const int D1Dz = (c == 2) ? D1D - 1 : D1D;
2600 const int D1Dy = (c == 1) ? D1D - 1 : D1D;
2601 const int D1Dx = (c == 0) ? D1D - 1 : D1D;
2602
2603 for (int dz = 0; dz < D1Dz; ++dz)
2604 {
2605 real_t massXY[MQ1D][MQ1D];
2606 for (int qy = 0; qy < Q1D; ++qy)
2607 {
2608 for (int qx = 0; qx < Q1D; ++qx)
2609 {
2610 massXY[qy][qx] = 0.0;
2611 }
2612 }
2613
2614 for (int dy = 0; dy < D1Dy; ++dy)
2615 {
2616 real_t massX[MQ1D];
2617 for (int qx = 0; qx < Q1D; ++qx)
2618 {
2619 massX[qx] = 0.0;
2620 }
2621
2622 for (int dx = 0; dx < D1Dx; ++dx)
2623 {
2624 const real_t t = X(dx + ((dy + (dz * D1Dy)) * D1Dx) + osc, e);
2625 for (int qx = 0; qx < Q1D; ++qx)
2626 {
2627 massX[qx] += t * ((c == 0) ? Bo(qx,dx) : Bc(qx,dx));
2628 }
2629 }
2630
2631 for (int qy = 0; qy < Q1D; ++qy)
2632 {
2633 const real_t wy = (c == 1) ? Bo(qy,dy) : Bc(qy,dy);
2634 for (int qx = 0; qx < Q1D; ++qx)
2635 {
2636 const real_t wx = massX[qx];
2637 massXY[qy][qx] += wx * wy;
2638 }
2639 }
2640 }
2641
2642 for (int qz = 0; qz < Q1D; ++qz)
2643 {
2644 const real_t wz = (c == 2) ? Bo(qz,dz) : Bc(qz,dz);
2645 for (int qy = 0; qy < Q1D; ++qy)
2646 {
2647 for (int qx = 0; qx < Q1D; ++qx)
2648 {
2649 mass[qz][qy][qx][c] += massXY[qy][qx] * wz;
2650 }
2651 }
2652 }
2653 }
2654
2655 osc += D1Dx * D1Dy * D1Dz;
2656 } // loop (c) over components
2657
2658 // Apply D operator.
2659 for (int qz = 0; qz < Q1D; ++qz)
2660 {
2661 for (int qy = 0; qy < Q1D; ++qy)
2662 {
2663 for (int qx = 0; qx < Q1D; ++qx)
2664 {
2665 const real_t O11 = op(0,qx,qy,qz,e);
2666 if (coeffDim == 1)
2667 {
2668 for (int c = 0; c < VDIM; ++c)
2669 {
2670 mass[qz][qy][qx][c] *= O11;
2671 }
2672 }
2673 else
2674 {
2675 const real_t O12 = op(1,qx,qy,qz,e);
2676 const real_t O13 = op(2,qx,qy,qz,e);
2677 const real_t O21 = op(3,qx,qy,qz,e);
2678 const real_t O22 = op(4,qx,qy,qz,e);
2679 const real_t O23 = op(5,qx,qy,qz,e);
2680 const real_t O31 = op(6,qx,qy,qz,e);
2681 const real_t O32 = op(7,qx,qy,qz,e);
2682 const real_t O33 = op(8,qx,qy,qz,e);
2683 const real_t massX = mass[qz][qy][qx][0];
2684 const real_t massY = mass[qz][qy][qx][1];
2685 const real_t massZ = mass[qz][qy][qx][2];
2686 mass[qz][qy][qx][0] = (O11*massX)+(O12*massY)+(O13*massZ);
2687 mass[qz][qy][qx][1] = (O21*massX)+(O22*massY)+(O23*massZ);
2688 mass[qz][qy][qx][2] = (O31*massX)+(O32*massY)+(O33*massZ);
2689 }
2690 }
2691 }
2692 }
2693
2694 // x component
2695 osc = 0;
2696 {
2697 const int D1Dz = D1D;
2698 const int D1Dy = D1D;
2699 const int D1Dx = D1D - 1;
2700
2701 for (int qz = 0; qz < Q1D; ++qz)
2702 {
2703 real_t gradXY12[MD1D][MD1D];
2704 real_t gradXY21[MD1D][MD1D];
2705
2706 for (int dy = 0; dy < D1Dy; ++dy)
2707 {
2708 for (int dx = 0; dx < D1Dx; ++dx)
2709 {
2710 gradXY12[dy][dx] = 0.0;
2711 gradXY21[dy][dx] = 0.0;
2712 }
2713 }
2714 for (int qy = 0; qy < Q1D; ++qy)
2715 {
2716 real_t massX[MD1D][2];
2717 for (int dx = 0; dx < D1Dx; ++dx)
2718 {
2719 for (int n = 0; n < 2; ++n)
2720 {
2721 massX[dx][n] = 0.0;
2722 }
2723 }
2724 for (int qx = 0; qx < Q1D; ++qx)
2725 {
2726 for (int dx = 0; dx < D1Dx; ++dx)
2727 {
2728 const real_t wx = Bot(dx,qx);
2729
2730 massX[dx][0] += wx * mass[qz][qy][qx][1];
2731 massX[dx][1] += wx * mass[qz][qy][qx][2];
2732 }
2733 }
2734 for (int dy = 0; dy < D1Dy; ++dy)
2735 {
2736 const real_t wy = Bct(dy,qy);
2737 const real_t wDy = Gct(dy,qy);
2738
2739 for (int dx = 0; dx < D1Dx; ++dx)
2740 {
2741 gradXY21[dy][dx] += massX[dx][0] * wy;
2742 gradXY12[dy][dx] += massX[dx][1] * wDy;
2743 }
2744 }
2745 }
2746
2747 for (int dz = 0; dz < D1Dz; ++dz)
2748 {
2749 const real_t wz = Bct(dz,qz);
2750 const real_t wDz = Gct(dz,qz);
2751 for (int dy = 0; dy < D1Dy; ++dy)
2752 {
2753 for (int dx = 0; dx < D1Dx; ++dx)
2754 {
2755 // \hat{\nabla}\times\hat{u} is [0, (u_0)_{x_2}, -(u_0)_{x_1}]
2756 // (u_0)_{x_2} * (op * curl)_1 - (u_0)_{x_1} * (op * curl)_2
2757 Y(dx + ((dy + (dz * D1Dy)) * D1Dx) + osc,
2758 e) += (gradXY21[dy][dx] * wDz) - (gradXY12[dy][dx] * wz);
2759 }
2760 }
2761 }
2762 } // loop qz
2763
2764 osc += D1Dx * D1Dy * D1Dz;
2765 }
2766
2767 // y component
2768 {
2769 const int D1Dz = D1D;
2770 const int D1Dy = D1D - 1;
2771 const int D1Dx = D1D;
2772
2773 for (int qz = 0; qz < Q1D; ++qz)
2774 {
2775 real_t gradXY02[MD1D][MD1D];
2776 real_t gradXY20[MD1D][MD1D];
2777
2778 for (int dy = 0; dy < D1Dy; ++dy)
2779 {
2780 for (int dx = 0; dx < D1Dx; ++dx)
2781 {
2782 gradXY02[dy][dx] = 0.0;
2783 gradXY20[dy][dx] = 0.0;
2784 }
2785 }
2786 for (int qx = 0; qx < Q1D; ++qx)
2787 {
2788 real_t massY[MD1D][2];
2789 for (int dy = 0; dy < D1Dy; ++dy)
2790 {
2791 massY[dy][0] = 0.0;
2792 massY[dy][1] = 0.0;
2793 }
2794 for (int qy = 0; qy < Q1D; ++qy)
2795 {
2796 for (int dy = 0; dy < D1Dy; ++dy)
2797 {
2798 const real_t wy = Bot(dy,qy);
2799
2800 massY[dy][0] += wy * mass[qz][qy][qx][2];
2801 massY[dy][1] += wy * mass[qz][qy][qx][0];
2802 }
2803 }
2804 for (int dx = 0; dx < D1Dx; ++dx)
2805 {
2806 const real_t wx = Bct(dx,qx);
2807 const real_t wDx = Gct(dx,qx);
2808
2809 for (int dy = 0; dy < D1Dy; ++dy)
2810 {
2811 gradXY02[dy][dx] += massY[dy][0] * wDx;
2812 gradXY20[dy][dx] += massY[dy][1] * wx;
2813 }
2814 }
2815 }
2816
2817 for (int dz = 0; dz < D1Dz; ++dz)
2818 {
2819 const real_t wz = Bct(dz,qz);
2820 const real_t wDz = Gct(dz,qz);
2821 for (int dy = 0; dy < D1Dy; ++dy)
2822 {
2823 for (int dx = 0; dx < D1Dx; ++dx)
2824 {
2825 // \hat{\nabla}\times\hat{u} is [-(u_1)_{x_2}, 0, (u_1)_{x_0}]
2826 // -(u_1)_{x_2} * (op * curl)_0 + (u_1)_{x_0} * (op * curl)_2
2827 Y(dx + ((dy + (dz * D1Dy)) * D1Dx) + osc,
2828 e) += (-gradXY20[dy][dx] * wDz) + (gradXY02[dy][dx] * wz);
2829 }
2830 }
2831 }
2832 } // loop qz
2833
2834 osc += D1Dx * D1Dy * D1Dz;
2835 }
2836
2837 // z component
2838 {
2839 const int D1Dz = D1D - 1;
2840 const int D1Dy = D1D;
2841 const int D1Dx = D1D;
2842
2843 for (int qx = 0; qx < Q1D; ++qx)
2844 {
2845 real_t gradYZ01[MD1D][MD1D];
2846 real_t gradYZ10[MD1D][MD1D];
2847
2848 for (int dy = 0; dy < D1Dy; ++dy)
2849 {
2850 for (int dz = 0; dz < D1Dz; ++dz)
2851 {
2852 gradYZ01[dz][dy] = 0.0;
2853 gradYZ10[dz][dy] = 0.0;
2854 }
2855 }
2856 for (int qy = 0; qy < Q1D; ++qy)
2857 {
2858 real_t massZ[MD1D][2];
2859 for (int dz = 0; dz < D1Dz; ++dz)
2860 {
2861 for (int n = 0; n < 2; ++n)
2862 {
2863 massZ[dz][n] = 0.0;
2864 }
2865 }
2866 for (int qz = 0; qz < Q1D; ++qz)
2867 {
2868 for (int dz = 0; dz < D1Dz; ++dz)
2869 {
2870 const real_t wz = Bot(dz,qz);
2871
2872 massZ[dz][0] += wz * mass[qz][qy][qx][0];
2873 massZ[dz][1] += wz * mass[qz][qy][qx][1];
2874 }
2875 }
2876 for (int dy = 0; dy < D1Dy; ++dy)
2877 {
2878 const real_t wy = Bct(dy,qy);
2879 const real_t wDy = Gct(dy,qy);
2880
2881 for (int dz = 0; dz < D1Dz; ++dz)
2882 {
2883 gradYZ01[dz][dy] += wy * massZ[dz][1];
2884 gradYZ10[dz][dy] += wDy * massZ[dz][0];
2885 }
2886 }
2887 }
2888
2889 for (int dx = 0; dx < D1Dx; ++dx)
2890 {
2891 const real_t wx = Bct(dx,qx);
2892 const real_t wDx = Gct(dx,qx);
2893
2894 for (int dy = 0; dy < D1Dy; ++dy)
2895 {
2896 for (int dz = 0; dz < D1Dz; ++dz)
2897 {
2898 // \hat{\nabla}\times\hat{u} is [(u_2)_{x_1}, -(u_2)_{x_0}, 0]
2899 // (u_2)_{x_1} * (op * curl)_0 - (u_2)_{x_0} * (op * curl)_1
2900 Y(dx + ((dy + (dz * D1Dy)) * D1Dx) + osc,
2901 e) += (gradYZ10[dz][dy] * wx) - (gradYZ01[dz][dy] * wDx);
2902 }
2903 }
2904 }
2905 } // loop qx
2906 }
2907 });
2908}
2909
2910// PA H(curl)-L2 Apply Transpose 3D kernel
2911template<int T_D1D = 0, int T_Q1D = 0>
2912inline void SmemPAHcurlL2ApplyTranspose3D(const int d1d,
2913 const int q1d,
2914 const int coeffDim,
2915 const int NE,
2916 const Array<real_t> &bo,
2917 const Array<real_t> &bc,
2918 const Array<real_t> &gc,
2919 const Vector &pa_data,
2920 const Vector &x,
2921 Vector &y)
2922{
2923 MFEM_VERIFY(T_D1D || d1d <= DeviceDofQuadLimits::Get().HCURL_MAX_D1D,
2924 "Error: d1d > HCURL_MAX_D1D");
2925 MFEM_VERIFY(T_Q1D || q1d <= DeviceDofQuadLimits::Get().HCURL_MAX_Q1D,
2926 "Error: q1d > HCURL_MAX_Q1D");
2927 const int D1D = T_D1D ? T_D1D : d1d;
2928 const int Q1D = T_Q1D ? T_Q1D : q1d;
2929
2930 auto Bo = Reshape(bo.Read(), Q1D, D1D-1);
2931 auto Bc = Reshape(bc.Read(), Q1D, D1D);
2932 auto Gc = Reshape(gc.Read(), Q1D, D1D);
2933 auto op = Reshape(pa_data.Read(), coeffDim, Q1D, Q1D, Q1D, NE);
2934 auto X = Reshape(x.Read(), 3*(D1D-1)*D1D*D1D, NE);
2935 auto Y = Reshape(y.ReadWrite(), 3*(D1D-1)*D1D*D1D, NE);
2936
2937 auto device_kernel = [=] MFEM_DEVICE (int e)
2938 {
2939 constexpr int VDIM = 3;
2940 constexpr int maxCoeffDim = 9;
2941 constexpr int MD1D = T_D1D ? T_D1D : DofQuadLimits::HCURL_MAX_D1D;
2942 constexpr int MQ1D = T_Q1D ? T_Q1D : DofQuadLimits::HCURL_MAX_Q1D;
2943 const int D1D = T_D1D ? T_D1D : d1d;
2944 const int Q1D = T_Q1D ? T_Q1D : q1d;
2945
2946 MFEM_SHARED real_t sBo[MD1D][MQ1D];
2947 MFEM_SHARED real_t sBc[MD1D][MQ1D];
2948 MFEM_SHARED real_t sGc[MD1D][MQ1D];
2949
2950 real_t opc[maxCoeffDim];
2951 MFEM_SHARED real_t sop[maxCoeffDim][MQ1D][MQ1D];
2952 MFEM_SHARED real_t mass[MQ1D][MQ1D][3];
2953
2954 MFEM_SHARED real_t sX[MD1D][MD1D][MD1D];
2955
2956 MFEM_FOREACH_THREAD(qx,x,Q1D)
2957 {
2958 MFEM_FOREACH_THREAD(qy,y,Q1D)
2959 {
2960 MFEM_FOREACH_THREAD(qz,z,Q1D)
2961 {
2962 for (int i=0; i<coeffDim; ++i)
2963 {
2964 opc[i] = op(i,qx,qy,qz,e);
2965 }
2966 }
2967 }
2968 }
2969
2970 const int tidx = MFEM_THREAD_ID(x);
2971 const int tidy = MFEM_THREAD_ID(y);
2972 const int tidz = MFEM_THREAD_ID(z);
2973
2974 if (tidz == 0)
2975 {
2976 MFEM_FOREACH_THREAD(d,y,D1D)
2977 {
2978 MFEM_FOREACH_THREAD(q,x,Q1D)
2979 {
2980 sBc[d][q] = Bc(q,d);
2981 sGc[d][q] = Gc(q,d);
2982 if (d < D1D-1)
2983 {
2984 sBo[d][q] = Bo(q,d);
2985 }
2986 }
2987 }
2988 }
2989 MFEM_SYNC_THREAD;
2990
2991 for (int qz=0; qz < Q1D; ++qz)
2992 {
2993 if (tidz == qz)
2994 {
2995 MFEM_FOREACH_THREAD(qy,y,Q1D)
2996 {
2997 MFEM_FOREACH_THREAD(qx,x,Q1D)
2998 {
2999 for (int i=0; i<3; ++i)
3000 {
3001 mass[qy][qx][i] = 0.0;
3002 }
3003 }
3004 }
3005 }
3006
3007 int osc = 0;
3008 for (int c = 0; c < VDIM; ++c) // loop over x, y, z components
3009 {
3010 const int D1Dz = (c == 2) ? D1D - 1 : D1D;
3011 const int D1Dy = (c == 1) ? D1D - 1 : D1D;
3012 const int D1Dx = (c == 0) ? D1D - 1 : D1D;
3013
3014 MFEM_FOREACH_THREAD(dz,z,D1Dz)
3015 {
3016 MFEM_FOREACH_THREAD(dy,y,D1Dy)
3017 {
3018 MFEM_FOREACH_THREAD(dx,x,D1Dx)
3019 {
3020 sX[dz][dy][dx] = X(dx + ((dy + (dz * D1Dy)) * D1Dx) + osc, e);
3021 }
3022 }
3023 }
3024 MFEM_SYNC_THREAD;
3025
3026 if (tidz == qz)
3027 {
3028 if (c == 0)
3029 {
3030 for (int i=0; i<coeffDim; ++i)
3031 {
3032 sop[i][tidx][tidy] = opc[i];
3033 }
3034 }
3035
3036 MFEM_FOREACH_THREAD(qy,y,Q1D)
3037 {
3038 MFEM_FOREACH_THREAD(qx,x,Q1D)
3039 {
3040 real_t u = 0.0;
3041
3042 for (int dz = 0; dz < D1Dz; ++dz)
3043 {
3044 const real_t wz = (c == 2) ? sBo[dz][qz] : sBc[dz][qz];
3045
3046 for (int dy = 0; dy < D1Dy; ++dy)
3047 {
3048 const real_t wy = (c == 1) ? sBo[dy][qy] : sBc[dy][qy];
3049
3050 for (int dx = 0; dx < D1Dx; ++dx)
3051 {
3052 const real_t wx = sX[dz][dy][dx] * ((c == 0) ? sBo[dx][qx] : sBc[dx][qx]);
3053 u += wx * wy * wz;
3054 }
3055 }
3056 }
3057
3058 mass[qy][qx][c] += u;
3059 } // qx
3060 } // qy
3061 } // tidz == qz
3062
3063 osc += D1Dx * D1Dy * D1Dz;
3064 MFEM_SYNC_THREAD;
3065 } // c
3066
3067 real_t dxyz1 = 0.0;
3068 real_t dxyz2 = 0.0;
3069 real_t dxyz3 = 0.0;
3070
3071 MFEM_FOREACH_THREAD(dz,z,D1D)
3072 {
3073 const real_t wcz = sBc[dz][qz];
3074 const real_t wcDz = sGc[dz][qz];
3075 const real_t wz = (dz < D1D-1) ? sBo[dz][qz] : 0.0;
3076
3077 MFEM_FOREACH_THREAD(dy,y,D1D)
3078 {
3079 MFEM_FOREACH_THREAD(dx,x,D1D)
3080 {
3081 for (int qy = 0; qy < Q1D; ++qy)
3082 {
3083 const real_t wcy = sBc[dy][qy];
3084 const real_t wcDy = sGc[dy][qy];
3085 const real_t wy = (dy < D1D-1) ? sBo[dy][qy] : 0.0;
3086
3087 for (int qx = 0; qx < Q1D; ++qx)
3088 {
3089 const real_t O11 = sop[0][qx][qy];
3090 real_t c1, c2, c3;
3091 if (coeffDim == 1)
3092 {
3093 c1 = O11 * mass[qy][qx][0];
3094 c2 = O11 * mass[qy][qx][1];
3095 c3 = O11 * mass[qy][qx][2];
3096 }
3097 else
3098 {
3099 const real_t O12 = sop[1][qx][qy];
3100 const real_t O13 = sop[2][qx][qy];
3101 const real_t O21 = sop[3][qx][qy];
3102 const real_t O22 = sop[4][qx][qy];
3103 const real_t O23 = sop[5][qx][qy];
3104 const real_t O31 = sop[6][qx][qy];
3105 const real_t O32 = sop[7][qx][qy];
3106 const real_t O33 = sop[8][qx][qy];
3107
3108 c1 = (O11*mass[qy][qx][0])+(O12*mass[qy][qx][1])+(O13*mass[qy][qx][2]);
3109 c2 = (O21*mass[qy][qx][0])+(O22*mass[qy][qx][1])+(O23*mass[qy][qx][2]);
3110 c3 = (O31*mass[qy][qx][0])+(O32*mass[qy][qx][1])+(O33*mass[qy][qx][2]);
3111 }
3112
3113 const real_t wcx = sBc[dx][qx];
3114 const real_t wDx = sGc[dx][qx];
3115
3116 if (dx < D1D-1)
3117 {
3118 const real_t wx = sBo[dx][qx];
3119 dxyz1 += (wx * c2 * wcy * wcDz) - (wx * c3 * wcDy * wcz);
3120 }
3121
3122 dxyz2 += (-wy * c1 * wcx * wcDz) + (wy * c3 * wDx * wcz);
3123
3124 dxyz3 += (wcDy * wz * c1 * wcx) - (wcy * wz * c2 * wDx);
3125 } // qx
3126 } // qy
3127 } // dx
3128 } // dy
3129 } // dz
3130
3131 MFEM_SYNC_THREAD;
3132
3133 MFEM_FOREACH_THREAD(dz,z,D1D)
3134 {
3135 MFEM_FOREACH_THREAD(dy,y,D1D)
3136 {
3137 MFEM_FOREACH_THREAD(dx,x,D1D)
3138 {
3139 if (dx < D1D-1)
3140 {
3141 Y(dx + ((dy + (dz * D1D)) * (D1D-1)), e) += dxyz1;
3142 }
3143 if (dy < D1D-1)
3144 {
3145 Y(dx + ((dy + (dz * (D1D-1))) * D1D) + ((D1D-1)*D1D*D1D), e) += dxyz2;
3146 }
3147 if (dz < D1D-1)
3148 {
3149 Y(dx + ((dy + (dz * D1D)) * D1D) + (2*(D1D-1)*D1D*D1D), e) += dxyz3;
3150 }
3151 }
3152 }
3153 }
3154 } // qz
3155 }; // end of element loop
3156
3157 auto host_kernel = [&] MFEM_LAMBDA (int)
3158 {
3159 MFEM_ABORT_KERNEL("This kernel should only be used on GPU.");
3160 };
3161
3162 ForallWrap<3>(true, NE, device_kernel, host_kernel, Q1D, Q1D, Q1D);
3163}
3164
3165} // namespace internal
3166
3167template<int DIM, int T_D1D, int T_Q1D>
3168CurlCurlIntegrator::ApplyKernelType CurlCurlIntegrator::ApplyPAKernels::Kernel()
3169{
3170 if constexpr (DIM == 2)
3171 {
3172 return internal::PACurlCurlApply2D;
3173 }
3174 else if constexpr (DIM == 3)
3175 {
3177 {
3178 return internal::SmemPACurlCurlApply3D<T_D1D, T_Q1D>;
3179 }
3180 else
3181 {
3182 return internal::PACurlCurlApply3D;
3183 }
3184 }
3185 MFEM_ABORT("");
3186}
3187
3188template <int DIM, int T_D1D, int T_Q1D>
3190CurlCurlIntegrator::DiagonalPAKernels::Kernel()
3191{
3192 if constexpr (DIM == 2)
3193 {
3194 return internal::PACurlCurlAssembleDiagonal2D;
3195 }
3196 else if constexpr (DIM == 3)
3197 {
3199 {
3200 return internal::SmemPACurlCurlAssembleDiagonal3D<T_D1D, T_Q1D>;
3201 }
3202 else
3203 {
3204 return internal::PACurlCurlAssembleDiagonal3D;
3205 }
3206 }
3207 MFEM_ABORT("");
3208}
3209/// \endcond DO_NOT_DOCUMENT
3210} // namespace mfem
3211
3212#endif
void(*)(const int, const int, const bool, const int, const Array< real_t > &, const Array< real_t > &, const Array< real_t > &, const Array< real_t > &, const Vector &, Vector &) DiagonalKernelType
arguments: d1d, q1d, symmetric, ne, Bo, Bc, Go, Gc, pa_data, diag
void(*)( const int, const int, const bool, const int, const Array< real_t > &, const Array< real_t > &, const Array< real_t > &, const Array< real_t > &, const Array< real_t > &, const Array< real_t > &, const Vector &, const Vector &, Vector &, const bool) ApplyKernelType
static bool Allows(unsigned long b_mask)
Return true if any of the backends in the backend mask, b_mask, are allowed.
Definition device.hpp:262
real_t b
Definition lissajous.cpp:42
constexpr int DIM
mfem::real_t real_t
real_t u(const Vector &xvec)
Definition lor_mms.hpp:22
MFEM_HOST_DEVICE DeviceTensor< sizeof...(Dims), T > Reshape(T *ptr, Dims... dims)
Wrap a pointer as a DeviceTensor with automatically deduced template parameters.
Definition dtensor.hpp:138
void ForallWrap(const bool use_dev, const int N, d_lambda &&d_body, h_lambda &&h_body, const int X=0, const int Y=0, const int Z=0, const int G=0)
The forall kernel body wrapper.
Definition forall.hpp:761
void forall_3D(int N, int X, int Y, int Z, lambda &&body)
Definition forall.hpp:937
void forall(int N, lambda &&body)
Definition forall.hpp:839
@ DEVICE_MASK
Biwise-OR of all device backends.
Definition device.hpp:99
static const DeviceDofQuadLimits & Get()
Return a const reference to the DeviceDofQuadLimits singleton.
Definition forall.hpp:128