MFEM v4.8.0
Finite element discretization library
Loading...
Searching...
No Matches
bilininteg_hcurl_kernels.hpp
Go to the documentation of this file.
1// Copyright (c) 2010-2025, Lawrence Livermore National Security, LLC. Produced
2// at the Lawrence Livermore National Laboratory. All Rights reserved. See files
3// LICENSE and NOTICE for details. LLNL-CODE-806117.
4//
5// This file is part of the MFEM library. For more information and source code
6// availability visit https://mfem.org.
7//
8// MFEM is free software; you can redistribute it and/or modify it under the
9// terms of the BSD-3 license. We welcome feedback and contributions, see file
10// CONTRIBUTING.md for details.
11
12#ifndef MFEM_BILININTEG_HCURL_KERNELS_HPP
13#define MFEM_BILININTEG_HCURL_KERNELS_HPP
14
20#include "../bilininteg.hpp"
21
22// Piola transformation in H(curl): w = dF^{-T} \hat{w}
23// curl w = (1 / det (dF)) dF \hat{curl} \hat{w}
24
25namespace mfem
26{
27
28namespace internal
29{
30
31// PA H(curl) Mass Diagonal 2D kernel
32void PAHcurlMassAssembleDiagonal2D(const int D1D,
33 const int Q1D,
34 const int NE,
35 const bool symmetric,
36 const Array<real_t> &bo,
37 const Array<real_t> &bc,
38 const Vector &pa_data,
39 Vector &diag);
40
41// PA H(curl) Mass Diagonal 3D kernel
42void PAHcurlMassAssembleDiagonal3D(const int D1D,
43 const int Q1D,
44 const int NE,
45 const bool symmetric,
46 const Array<real_t> &bo,
47 const Array<real_t> &bc,
48 const Vector &pa_data,
49 Vector &diag);
50
51// Shared memory PA H(curl) Mass Diagonal 3D kernel
52template<int T_D1D = 0, int T_Q1D = 0>
53inline void SmemPAHcurlMassAssembleDiagonal3D(const int d1d,
54 const int q1d,
55 const int NE,
56 const bool symmetric,
57 const Array<real_t> &bo,
58 const Array<real_t> &bc,
59 const Vector &pa_data,
60 Vector &diag)
61{
62 MFEM_VERIFY(T_D1D || d1d <= DeviceDofQuadLimits::Get().HCURL_MAX_D1D,
63 "Error: d1d > HCURL_MAX_D1D");
64 MFEM_VERIFY(T_Q1D || q1d <= DeviceDofQuadLimits::Get().HCURL_MAX_Q1D,
65 "Error: q1d > HCURL_MAX_Q1D");
66 const int D1D = T_D1D ? T_D1D : d1d;
67 const int Q1D = T_Q1D ? T_Q1D : q1d;
68
69 auto Bo = Reshape(bo.Read(), Q1D, D1D-1);
70 auto Bc = Reshape(bc.Read(), Q1D, D1D);
71 auto op = Reshape(pa_data.Read(), Q1D, Q1D, Q1D, symmetric ? 6 : 9, NE);
72 auto D = Reshape(diag.ReadWrite(), 3*(D1D-1)*D1D*D1D, NE);
73
74 mfem::forall_3D(NE, Q1D, Q1D, Q1D, [=] MFEM_HOST_DEVICE (int e)
75 {
76 constexpr int VDIM = 3;
77 constexpr int MD1D = T_D1D ? T_D1D : DofQuadLimits::HCURL_MAX_D1D;
78 constexpr int MQ1D = T_Q1D ? T_Q1D : DofQuadLimits::HCURL_MAX_Q1D;
79 const int D1D = T_D1D ? T_D1D : d1d;
80 const int Q1D = T_Q1D ? T_Q1D : q1d;
81
82 MFEM_SHARED real_t sBo[MQ1D][MD1D];
83 MFEM_SHARED real_t sBc[MQ1D][MD1D];
84
85 real_t op3[3];
86 MFEM_SHARED real_t sop[3][MQ1D][MQ1D];
87
88 MFEM_FOREACH_THREAD(qx,x,Q1D)
89 {
90 MFEM_FOREACH_THREAD(qy,y,Q1D)
91 {
92 MFEM_FOREACH_THREAD(qz,z,Q1D)
93 {
94 op3[0] = op(qx,qy,qz,0,e);
95 op3[1] = op(qx,qy,qz,symmetric ? 3 : 4,e);
96 op3[2] = op(qx,qy,qz,symmetric ? 5 : 8,e);
97 }
98 }
99 }
100
101 const int tidx = MFEM_THREAD_ID(x);
102 const int tidy = MFEM_THREAD_ID(y);
103 const int tidz = MFEM_THREAD_ID(z);
104
105 if (tidz == 0)
106 {
107 MFEM_FOREACH_THREAD(d,y,D1D)
108 {
109 MFEM_FOREACH_THREAD(q,x,Q1D)
110 {
111 sBc[q][d] = Bc(q,d);
112 if (d < D1D-1)
113 {
114 sBo[q][d] = Bo(q,d);
115 }
116 }
117 }
118 }
119 MFEM_SYNC_THREAD;
120
121 int osc = 0;
122 for (int c = 0; c < VDIM; ++c) // loop over x, y, z components
123 {
124 const int D1Dz = (c == 2) ? D1D - 1 : D1D;
125 const int D1Dy = (c == 1) ? D1D - 1 : D1D;
126 const int D1Dx = (c == 0) ? D1D - 1 : D1D;
127
128 real_t dxyz = 0.0;
129
130 for (int qz=0; qz < Q1D; ++qz)
131 {
132 if (tidz == qz)
133 {
134 for (int i=0; i<3; ++i)
135 {
136 sop[i][tidx][tidy] = op3[i];
137 }
138 }
139
140 MFEM_SYNC_THREAD;
141
142 MFEM_FOREACH_THREAD(dz,z,D1Dz)
143 {
144 const real_t wz = ((c == 2) ? sBo[qz][dz] : sBc[qz][dz]);
145
146 MFEM_FOREACH_THREAD(dy,y,D1Dy)
147 {
148 MFEM_FOREACH_THREAD(dx,x,D1Dx)
149 {
150 for (int qy = 0; qy < Q1D; ++qy)
151 {
152 const real_t wy = ((c == 1) ? sBo[qy][dy] : sBc[qy][dy]);
153
154 for (int qx = 0; qx < Q1D; ++qx)
155 {
156 const real_t wx = ((c == 0) ? sBo[qx][dx] : sBc[qx][dx]);
157 dxyz += sop[c][qx][qy] * wx * wx * wy * wy * wz * wz;
158 }
159 }
160 }
161 }
162 }
163
164 MFEM_SYNC_THREAD;
165 } // qz loop
166
167 MFEM_FOREACH_THREAD(dz,z,D1Dz)
168 {
169 MFEM_FOREACH_THREAD(dy,y,D1Dy)
170 {
171 MFEM_FOREACH_THREAD(dx,x,D1Dx)
172 {
173 D(dx + ((dy + (dz * D1Dy)) * D1Dx) + osc, e) += dxyz;
174 }
175 }
176 }
177
178 osc += D1Dx * D1Dy * D1Dz;
179 } // c loop
180 }); // end of element loop
181}
182
183// PA H(curl) Mass Apply 2D kernel
184void PAHcurlMassApply2D(const int D1D,
185 const int Q1D,
186 const int NE,
187 const bool symmetric,
188 const Array<real_t> &bo,
189 const Array<real_t> &bc,
190 const Array<real_t> &bot,
191 const Array<real_t> &bct,
192 const Vector &pa_data,
193 const Vector &x,
194 Vector &y);
195
196// PA H(curl) Mass Apply 3D kernel
197void PAHcurlMassApply3D(const int D1D,
198 const int Q1D,
199 const int NE,
200 const bool symmetric,
201 const Array<real_t> &bo,
202 const Array<real_t> &bc,
203 const Array<real_t> &bot,
204 const Array<real_t> &bct,
205 const Vector &pa_data,
206 const Vector &x,
207 Vector &y);
208
209// Shared memory PA H(curl) Mass Apply 3D kernel
210template<int T_D1D = 0, int T_Q1D = 0>
211inline void SmemPAHcurlMassApply3D(const int d1d,
212 const int q1d,
213 const int NE,
214 const bool symmetric,
215 const Array<real_t> &bo,
216 const Array<real_t> &bc,
217 const Array<real_t> &bot,
218 const Array<real_t> &bct,
219 const Vector &pa_data,
220 const Vector &x,
221 Vector &y)
222{
223 MFEM_VERIFY(T_D1D || d1d <= DeviceDofQuadLimits::Get().HCURL_MAX_D1D,
224 "Error: d1d > HCURL_MAX_D1D");
225 MFEM_VERIFY(T_Q1D || q1d <= DeviceDofQuadLimits::Get().HCURL_MAX_Q1D,
226 "Error: q1d > HCURL_MAX_Q1D");
227 const int D1D = T_D1D ? T_D1D : d1d;
228 const int Q1D = T_Q1D ? T_Q1D : q1d;
229
230 const int dataSize = symmetric ? 6 : 9;
231
232 auto Bo = Reshape(bo.Read(), Q1D, D1D-1);
233 auto Bc = Reshape(bc.Read(), Q1D, D1D);
234 auto op = Reshape(pa_data.Read(), Q1D, Q1D, Q1D, dataSize, NE);
235 auto X = Reshape(x.Read(), 3*(D1D-1)*D1D*D1D, NE);
236 auto Y = Reshape(y.ReadWrite(), 3*(D1D-1)*D1D*D1D, NE);
237
238 mfem::forall_3D(NE, Q1D, Q1D, Q1D, [=] MFEM_HOST_DEVICE (int e)
239 {
240 constexpr int VDIM = 3;
241 constexpr int MD1D = T_D1D ? T_D1D : DofQuadLimits::HCURL_MAX_D1D;
242 constexpr int MQ1D = T_Q1D ? T_Q1D : DofQuadLimits::HCURL_MAX_Q1D;
243 const int D1D = T_D1D ? T_D1D : d1d;
244 const int Q1D = T_Q1D ? T_Q1D : q1d;
245
246 MFEM_SHARED real_t sBo[MQ1D][MD1D];
247 MFEM_SHARED real_t sBc[MQ1D][MD1D];
248
249 real_t op9[9];
250 MFEM_SHARED real_t sop[9*MQ1D*MQ1D];
251 MFEM_SHARED real_t mass[MQ1D][MQ1D][3];
252
253 MFEM_SHARED real_t sX[MD1D][MD1D][MD1D];
254
255 MFEM_FOREACH_THREAD(qx,x,Q1D)
256 {
257 MFEM_FOREACH_THREAD(qy,y,Q1D)
258 {
259 MFEM_FOREACH_THREAD(qz,z,Q1D)
260 {
261 for (int i=0; i<dataSize; ++i)
262 {
263 op9[i] = op(qx,qy,qz,i,e);
264 }
265 }
266 }
267 }
268
269 const int tidx = MFEM_THREAD_ID(x);
270 const int tidy = MFEM_THREAD_ID(y);
271 const int tidz = MFEM_THREAD_ID(z);
272
273 if (tidz == 0)
274 {
275 MFEM_FOREACH_THREAD(d,y,D1D)
276 {
277 MFEM_FOREACH_THREAD(q,x,Q1D)
278 {
279 sBc[q][d] = Bc(q,d);
280 if (d < D1D-1)
281 {
282 sBo[q][d] = Bo(q,d);
283 }
284 }
285 }
286 }
287 MFEM_SYNC_THREAD;
288
289 for (int qz=0; qz < Q1D; ++qz)
290 {
291 int osc = 0;
292 for (int c = 0; c < VDIM; ++c) // loop over x, y, z components
293 {
294 const int D1Dz = (c == 2) ? D1D - 1 : D1D;
295 const int D1Dy = (c == 1) ? D1D - 1 : D1D;
296 const int D1Dx = (c == 0) ? D1D - 1 : D1D;
297
298 MFEM_FOREACH_THREAD(dz,z,D1Dz)
299 {
300 MFEM_FOREACH_THREAD(dy,y,D1Dy)
301 {
302 MFEM_FOREACH_THREAD(dx,x,D1Dx)
303 {
304 sX[dz][dy][dx] = X(dx + ((dy + (dz * D1Dy)) * D1Dx) + osc, e);
305 }
306 }
307 }
308 MFEM_SYNC_THREAD;
309
310 if (tidz == qz)
311 {
312 for (int i=0; i<dataSize; ++i)
313 {
314 sop[i + (dataSize*tidx) + (dataSize*Q1D*tidy)] = op9[i];
315 }
316
317 MFEM_FOREACH_THREAD(qy,y,Q1D)
318 {
319 MFEM_FOREACH_THREAD(qx,x,Q1D)
320 {
321 real_t u = 0.0;
322
323 for (int dz = 0; dz < D1Dz; ++dz)
324 {
325 const real_t wz = (c == 2) ? sBo[qz][dz] : sBc[qz][dz];
326 for (int dy = 0; dy < D1Dy; ++dy)
327 {
328 const real_t wy = (c == 1) ? sBo[qy][dy] : sBc[qy][dy];
329 for (int dx = 0; dx < D1Dx; ++dx)
330 {
331 const real_t t = sX[dz][dy][dx];
332 const real_t wx = (c == 0) ? sBo[qx][dx] : sBc[qx][dx];
333 u += t * wx * wy * wz;
334 }
335 }
336 }
337
338 mass[qy][qx][c] = u;
339 } // qx
340 } // qy
341 } // tidz == qz
342
343 osc += D1Dx * D1Dy * D1Dz;
344 MFEM_SYNC_THREAD;
345 } // c
346
347 MFEM_SYNC_THREAD; // Sync mass[qy][qx][d] and sop
348
349 osc = 0;
350 for (int c = 0; c < VDIM; ++c) // loop over x, y, z components
351 {
352 const int D1Dz = (c == 2) ? D1D - 1 : D1D;
353 const int D1Dy = (c == 1) ? D1D - 1 : D1D;
354 const int D1Dx = (c == 0) ? D1D - 1 : D1D;
355
356 real_t dxyz = 0.0;
357
358 MFEM_FOREACH_THREAD(dz,z,D1Dz)
359 {
360 const real_t wz = (c == 2) ? sBo[qz][dz] : sBc[qz][dz];
361
362 MFEM_FOREACH_THREAD(dy,y,D1Dy)
363 {
364 MFEM_FOREACH_THREAD(dx,x,D1Dx)
365 {
366 for (int qy = 0; qy < Q1D; ++qy)
367 {
368 const real_t wy = (c == 1) ? sBo[qy][dy] : sBc[qy][dy];
369 for (int qx = 0; qx < Q1D; ++qx)
370 {
371 const int os = (dataSize*qx) + (dataSize*Q1D*qy);
372 const int id1 = os + ((c == 0) ? 0 : ((c == 1) ? (symmetric ? 1 : 3) :
373 (symmetric ? 2 : 6))); // O11, O21, O31
374 const int id2 = os + ((c == 0) ? 1 : ((c == 1) ? (symmetric ? 3 : 4) :
375 (symmetric ? 4 : 7))); // O12, O22, O32
376 const int id3 = os + ((c == 0) ? 2 : ((c == 1) ? (symmetric ? 4 : 5) :
377 (symmetric ? 5 : 8))); // O13, O23, O33
378
379 const real_t m_c = (sop[id1] * mass[qy][qx][0]) + (sop[id2] * mass[qy][qx][1]) +
380 (sop[id3] * mass[qy][qx][2]);
381
382 const real_t wx = (c == 0) ? sBo[qx][dx] : sBc[qx][dx];
383 dxyz += m_c * wx * wy * wz;
384 }
385 }
386 }
387 }
388 }
389
390 MFEM_SYNC_THREAD;
391
392 MFEM_FOREACH_THREAD(dz,z,D1Dz)
393 {
394 MFEM_FOREACH_THREAD(dy,y,D1Dy)
395 {
396 MFEM_FOREACH_THREAD(dx,x,D1Dx)
397 {
398 Y(dx + ((dy + (dz * D1Dy)) * D1Dx) + osc, e) += dxyz;
399 }
400 }
401 }
402
403 osc += D1Dx * D1Dy * D1Dz;
404 } // c loop
405 } // qz
406 }); // end of element loop
407}
408
409// PA H(curl) curl-curl Assemble 2D kernel
410void PACurlCurlSetup2D(const int Q1D,
411 const int NE,
412 const Array<real_t> &w,
413 const Vector &j,
414 Vector &coeff,
415 Vector &op);
416
417// PA H(curl) curl-curl Assemble 3D kernel
418void PACurlCurlSetup3D(const int Q1D,
419 const int coeffDim,
420 const int NE,
421 const Array<real_t> &w,
422 const Vector &j,
423 Vector &coeff,
424 Vector &op);
425
426// PA H(curl) curl-curl Diagonal 2D kernel
427void PACurlCurlAssembleDiagonal2D(const int D1D,
428 const int Q1D,
429 const int NE,
430 const Array<real_t> &bo,
431 const Array<real_t> &gc,
432 const Vector &pa_data,
433 Vector &diag);
434
435// PA H(curl) curl-curl Diagonal 3D kernel
436template<int T_D1D = 0, int T_Q1D = 0>
437inline void PACurlCurlAssembleDiagonal3D(const int d1d,
438 const int q1d,
439 const bool symmetric,
440 const int NE,
441 const Array<real_t> &bo,
442 const Array<real_t> &bc,
443 const Array<real_t> &go,
444 const Array<real_t> &gc,
445 const Vector &pa_data,
446 Vector &diag)
447{
448 MFEM_VERIFY(T_D1D || d1d <= DeviceDofQuadLimits::Get().HCURL_MAX_D1D,
449 "Error: d1d > HCURL_MAX_D1D");
450 MFEM_VERIFY(T_Q1D || q1d <= DeviceDofQuadLimits::Get().HCURL_MAX_Q1D,
451 "Error: q1d > HCURL_MAX_Q1D");
452 const int D1D = T_D1D ? T_D1D : d1d;
453 const int Q1D = T_Q1D ? T_Q1D : q1d;
454
455 auto Bo = Reshape(bo.Read(), Q1D, D1D-1);
456 auto Bc = Reshape(bc.Read(), Q1D, D1D);
457 auto Go = Reshape(go.Read(), Q1D, D1D-1);
458 auto Gc = Reshape(gc.Read(), Q1D, D1D);
459 auto op = Reshape(pa_data.Read(), Q1D, Q1D, Q1D, (symmetric ? 6 : 9), NE);
460 auto D = Reshape(diag.ReadWrite(), 3*(D1D-1)*D1D*D1D, NE);
461
462 const int s = symmetric ? 6 : 9;
463 const int i11 = 0;
464 const int i12 = 1;
465 const int i13 = 2;
466 const int i21 = symmetric ? i12 : 3;
467 const int i22 = symmetric ? 3 : 4;
468 const int i23 = symmetric ? 4 : 5;
469 const int i31 = symmetric ? i13 : 6;
470 const int i32 = symmetric ? i23 : 7;
471 const int i33 = symmetric ? 5 : 8;
472
473 mfem::forall(NE, [=] MFEM_HOST_DEVICE (int e)
474 {
475 // Using (\nabla\times u) F = 1/det(dF) dF \hat{\nabla}\times\hat{u} (p. 78 of Monk), we get
476 // (\nabla\times u) \cdot (\nabla\times u) = 1/det(dF)^2 \hat{\nabla}\times\hat{u}^T dF^T dF \hat{\nabla}\times\hat{u}
477 // If c = 0, \hat{\nabla}\times\hat{u} reduces to [0, (u_0)_{x_2}, -(u_0)_{x_1}]
478 // If c = 1, \hat{\nabla}\times\hat{u} reduces to [-(u_1)_{x_2}, 0, (u_1)_{x_0}]
479 // If c = 2, \hat{\nabla}\times\hat{u} reduces to [(u_2)_{x_1}, -(u_2)_{x_0}, 0]
480
481 // For each c, we will keep 9 arrays for derivatives multiplied by the 9 entries of the 3x3 matrix (dF^T C dF),
482 // which may be non-symmetric depending on a possibly non-symmetric matrix coefficient.
483
484 constexpr int VDIM = 3;
485 constexpr int MD1D = T_D1D ? T_D1D : DofQuadLimits::HCURL_MAX_D1D;
486 constexpr int MQ1D = T_Q1D ? T_Q1D : DofQuadLimits::HCURL_MAX_Q1D;
487 const int D1D = T_D1D ? T_D1D : d1d;
488 const int Q1D = T_Q1D ? T_Q1D : q1d;
489
490 int osc = 0;
491
492 for (int c = 0; c < VDIM; ++c) // loop over x, y, z components
493 {
494 const int D1Dz = (c == 2) ? D1D - 1 : D1D;
495 const int D1Dy = (c == 1) ? D1D - 1 : D1D;
496 const int D1Dx = (c == 0) ? D1D - 1 : D1D;
497
498 real_t zt[MQ1D][MQ1D][MD1D][9][3];
499
500 // z contraction
501 for (int qx = 0; qx < Q1D; ++qx)
502 {
503 for (int qy = 0; qy < Q1D; ++qy)
504 {
505 for (int dz = 0; dz < D1Dz; ++dz)
506 {
507 for (int i=0; i<s; ++i)
508 {
509 for (int d=0; d<3; ++d)
510 {
511 zt[qx][qy][dz][i][d] = 0.0;
512 }
513 }
514
515 for (int qz = 0; qz < Q1D; ++qz)
516 {
517 const real_t wz = ((c == 2) ? Bo(qz,dz) : Bc(qz,dz));
518 const real_t wDz = ((c == 2) ? Go(qz,dz) : Gc(qz,dz));
519
520 for (int i=0; i<s; ++i)
521 {
522 zt[qx][qy][dz][i][0] += wz * wz * op(qx,qy,qz,i,e);
523 zt[qx][qy][dz][i][1] += wDz * wz * op(qx,qy,qz,i,e);
524 zt[qx][qy][dz][i][2] += wDz * wDz * op(qx,qy,qz,i,e);
525 }
526 }
527 }
528 }
529 } // end of z contraction
530
531 real_t yt[MQ1D][MD1D][MD1D][9][3][3];
532
533 // y contraction
534 for (int qx = 0; qx < Q1D; ++qx)
535 {
536 for (int dz = 0; dz < D1Dz; ++dz)
537 {
538 for (int dy = 0; dy < D1Dy; ++dy)
539 {
540 for (int i=0; i<s; ++i)
541 {
542 for (int d=0; d<3; ++d)
543 for (int j=0; j<3; ++j)
544 {
545 yt[qx][dy][dz][i][d][j] = 0.0;
546 }
547 }
548
549 for (int qy = 0; qy < Q1D; ++qy)
550 {
551 const real_t wy = ((c == 1) ? Bo(qy,dy) : Bc(qy,dy));
552 const real_t wDy = ((c == 1) ? Go(qy,dy) : Gc(qy,dy));
553
554 for (int i=0; i<s; ++i)
555 {
556 for (int d=0; d<3; ++d)
557 {
558 yt[qx][dy][dz][i][d][0] += wy * wy * zt[qx][qy][dz][i][d];
559 yt[qx][dy][dz][i][d][1] += wDy * wy * zt[qx][qy][dz][i][d];
560 yt[qx][dy][dz][i][d][2] += wDy * wDy * zt[qx][qy][dz][i][d];
561 }
562 }
563 }
564 }
565 }
566 } // end of y contraction
567
568 // x contraction
569 for (int dz = 0; dz < D1Dz; ++dz)
570 {
571 for (int dy = 0; dy < D1Dy; ++dy)
572 {
573 for (int dx = 0; dx < D1Dx; ++dx)
574 {
575 for (int qx = 0; qx < Q1D; ++qx)
576 {
577 const real_t wx = ((c == 0) ? Bo(qx,dx) : Bc(qx,dx));
578 const real_t wDx = ((c == 0) ? Go(qx,dx) : Gc(qx,dx));
579
580 // Using (\nabla\times u) F = 1/det(dF) dF \hat{\nabla}\times\hat{u} (p. 78 of Monk), we get
581 // (\nabla\times u) \cdot (\nabla\times u) = 1/det(dF)^2 \hat{\nabla}\times\hat{u}^T dF^T dF \hat{\nabla}\times\hat{u}
582 // If c = 0, \hat{\nabla}\times\hat{u} reduces to [0, (u_0)_{x_2}, -(u_0)_{x_1}]
583 // If c = 1, \hat{\nabla}\times\hat{u} reduces to [-(u_1)_{x_2}, 0, (u_1)_{x_0}]
584 // If c = 2, \hat{\nabla}\times\hat{u} reduces to [(u_2)_{x_1}, -(u_2)_{x_0}, 0]
585
586 /*
587 const double O11 = op(q,0,e);
588 const double O12 = op(q,1,e);
589 const double O13 = op(q,2,e);
590 const double O22 = op(q,3,e);
591 const double O23 = op(q,4,e);
592 const double O33 = op(q,5,e);
593 */
594
595 if (c == 0)
596 {
597 // (u_0)_{x_2} (O22 (u_0)_{x_2} - O23 (u_0)_{x_1}) - (u_0)_{x_1} (O32 (u_0)_{x_2} - O33 (u_0)_{x_1})
598 const real_t sumy = yt[qx][dy][dz][i22][2][0] - yt[qx][dy][dz][i23][1][1]
599 - yt[qx][dy][dz][i32][1][1] + yt[qx][dy][dz][i33][0][2];
600
601 D(dx + ((dy + (dz * D1Dy)) * D1Dx) + osc, e) += sumy * wx * wx;
602 }
603 else if (c == 1)
604 {
605 // (u_1)_{x_2} (O11 (u_1)_{x_2} - O13 (u_1)_{x_0}) + (u_1)_{x_0} (-O31 (u_1)_{x_2} + O33 (u_1)_{x_0})
606 const real_t d = (yt[qx][dy][dz][i11][2][0] * wx * wx)
607 - ((yt[qx][dy][dz][i13][1][0] + yt[qx][dy][dz][i31][1][0]) * wDx * wx)
608 + (yt[qx][dy][dz][i33][0][0] * wDx * wDx);
609
610 D(dx + ((dy + (dz * D1Dy)) * D1Dx) + osc, e) += d;
611 }
612 else
613 {
614 // (u_2)_{x_1} (O11 (u_2)_{x_1} - O12 (u_2)_{x_0}) - (u_2)_{x_0} (O21 (u_2)_{x_1} - O22 (u_2)_{x_0})
615 const real_t d = (yt[qx][dy][dz][i11][0][2] * wx * wx)
616 - ((yt[qx][dy][dz][i12][0][1] + yt[qx][dy][dz][i21][0][1]) * wDx * wx)
617 + (yt[qx][dy][dz][i22][0][0] * wDx * wDx);
618
619 D(dx + ((dy + (dz * D1Dy)) * D1Dx) + osc, e) += d;
620 }
621 }
622 }
623 }
624 } // end of x contraction
625
626 osc += D1Dx * D1Dy * D1Dz;
627 } // loop c
628 }); // end of element loop
629}
630
631// Shared memory PA H(curl) curl-curl Diagonal 3D kernel
632template<int T_D1D = 0, int T_Q1D = 0>
633inline void SmemPACurlCurlAssembleDiagonal3D(const int d1d,
634 const int q1d,
635 const bool symmetric,
636 const int NE,
637 const Array<real_t> &bo,
638 const Array<real_t> &bc,
639 const Array<real_t> &go,
640 const Array<real_t> &gc,
641 const Vector &pa_data,
642 Vector &diag)
643{
644 MFEM_VERIFY(T_D1D || d1d <= DeviceDofQuadLimits::Get().HCURL_MAX_D1D,
645 "Error: d1d > HCURL_MAX_D1D");
646 MFEM_VERIFY(T_Q1D || q1d <= DeviceDofQuadLimits::Get().HCURL_MAX_Q1D,
647 "Error: q1d > HCURL_MAX_Q1D");
648 const int D1D = T_D1D ? T_D1D : d1d;
649 const int Q1D = T_Q1D ? T_Q1D : q1d;
650
651 auto Bo = Reshape(bo.Read(), Q1D, D1D-1);
652 auto Bc = Reshape(bc.Read(), Q1D, D1D);
653 auto Go = Reshape(go.Read(), Q1D, D1D-1);
654 auto Gc = Reshape(gc.Read(), Q1D, D1D);
655 auto op = Reshape(pa_data.Read(), Q1D, Q1D, Q1D, (symmetric ? 6 : 9), NE);
656 auto D = Reshape(diag.ReadWrite(), 3*(D1D-1)*D1D*D1D, NE);
657
658 const int s = symmetric ? 6 : 9;
659 const int i11 = 0;
660 const int i12 = 1;
661 const int i13 = 2;
662 const int i21 = symmetric ? i12 : 3;
663 const int i22 = symmetric ? 3 : 4;
664 const int i23 = symmetric ? 4 : 5;
665 const int i31 = symmetric ? i13 : 6;
666 const int i32 = symmetric ? i23 : 7;
667 const int i33 = symmetric ? 5 : 8;
668
669 mfem::forall_3D(NE, Q1D, Q1D, Q1D, [=] MFEM_HOST_DEVICE (int e)
670 {
671 // Using (\nabla\times u) F = 1/det(dF) dF \hat{\nabla}\times\hat{u} (p. 78 of Monk), we get
672 // (\nabla\times u) \cdot (\nabla\times u) = 1/det(dF)^2 \hat{\nabla}\times\hat{u}^T dF^T dF \hat{\nabla}\times\hat{u}
673 // If c = 0, \hat{\nabla}\times\hat{u} reduces to [0, (u_0)_{x_2}, -(u_0)_{x_1}]
674 // If c = 1, \hat{\nabla}\times\hat{u} reduces to [-(u_1)_{x_2}, 0, (u_1)_{x_0}]
675 // If c = 2, \hat{\nabla}\times\hat{u} reduces to [(u_2)_{x_1}, -(u_2)_{x_0}, 0]
676
677 constexpr int VDIM = 3;
678 constexpr int MD1D = T_D1D ? T_D1D : DofQuadLimits::HCURL_MAX_D1D;
679 constexpr int MQ1D = T_Q1D ? T_Q1D : DofQuadLimits::HCURL_MAX_Q1D;
680 const int D1D = T_D1D ? T_D1D : d1d;
681 const int Q1D = T_Q1D ? T_Q1D : q1d;
682
683 MFEM_SHARED real_t sBo[MQ1D][MD1D];
684 MFEM_SHARED real_t sBc[MQ1D][MD1D];
685 MFEM_SHARED real_t sGo[MQ1D][MD1D];
686 MFEM_SHARED real_t sGc[MQ1D][MD1D];
687
688 real_t ope[9];
689 MFEM_SHARED real_t sop[9][MQ1D][MQ1D];
690
691 MFEM_FOREACH_THREAD(qx,x,Q1D)
692 {
693 MFEM_FOREACH_THREAD(qy,y,Q1D)
694 {
695 MFEM_FOREACH_THREAD(qz,z,Q1D)
696 {
697 for (int i=0; i<s; ++i)
698 {
699 ope[i] = op(qx,qy,qz,i,e);
700 }
701 }
702 }
703 }
704
705 const int tidx = MFEM_THREAD_ID(x);
706 const int tidy = MFEM_THREAD_ID(y);
707 const int tidz = MFEM_THREAD_ID(z);
708
709 if (tidz == 0)
710 {
711 MFEM_FOREACH_THREAD(d,y,D1D)
712 {
713 MFEM_FOREACH_THREAD(q,x,Q1D)
714 {
715 sBc[q][d] = Bc(q,d);
716 sGc[q][d] = Gc(q,d);
717 if (d < D1D-1)
718 {
719 sBo[q][d] = Bo(q,d);
720 sGo[q][d] = Go(q,d);
721 }
722 }
723 }
724 }
725 MFEM_SYNC_THREAD;
726
727 int osc = 0;
728 for (int c = 0; c < VDIM; ++c) // loop over x, y, z components
729 {
730 const int D1Dz = (c == 2) ? D1D - 1 : D1D;
731 const int D1Dy = (c == 1) ? D1D - 1 : D1D;
732 const int D1Dx = (c == 0) ? D1D - 1 : D1D;
733
734 real_t dxyz = 0.0;
735
736 for (int qz=0; qz < Q1D; ++qz)
737 {
738 if (tidz == qz)
739 {
740 for (int i=0; i<s; ++i)
741 {
742 sop[i][tidx][tidy] = ope[i];
743 }
744 }
745
746 MFEM_SYNC_THREAD;
747
748 MFEM_FOREACH_THREAD(dz,z,D1Dz)
749 {
750 const real_t wz = ((c == 2) ? sBo[qz][dz] : sBc[qz][dz]);
751 const real_t wDz = ((c == 2) ? sGo[qz][dz] : sGc[qz][dz]);
752
753 MFEM_FOREACH_THREAD(dy,y,D1Dy)
754 {
755 MFEM_FOREACH_THREAD(dx,x,D1Dx)
756 {
757 for (int qy = 0; qy < Q1D; ++qy)
758 {
759 const real_t wy = ((c == 1) ? sBo[qy][dy] : sBc[qy][dy]);
760 const real_t wDy = ((c == 1) ? sGo[qy][dy] : sGc[qy][dy]);
761
762 for (int qx = 0; qx < Q1D; ++qx)
763 {
764 const real_t wx = ((c == 0) ? sBo[qx][dx] : sBc[qx][dx]);
765 const real_t wDx = ((c == 0) ? sGo[qx][dx] : sGc[qx][dx]);
766
767 if (c == 0)
768 {
769 // (u_0)_{x_2} (O22 (u_0)_{x_2} - O23 (u_0)_{x_1}) - (u_0)_{x_1} (O32 (u_0)_{x_2} - O33 (u_0)_{x_1})
770
771 // (u_0)_{x_2} O22 (u_0)_{x_2}
772 dxyz += sop[i22][qx][qy] * wx * wx * wy * wy * wDz * wDz;
773
774 // -(u_0)_{x_2} O23 (u_0)_{x_1} - (u_0)_{x_1} O32 (u_0)_{x_2}
775 dxyz += -(sop[i23][qx][qy] + sop[i32][qx][qy]) * wx * wx * wDy * wy * wDz * wz;
776
777 // (u_0)_{x_1} O33 (u_0)_{x_1}
778 dxyz += sop[i33][qx][qy] * wx * wx * wDy * wDy * wz * wz;
779 }
780 else if (c == 1)
781 {
782 // (u_1)_{x_2} (O11 (u_1)_{x_2} - O13 (u_1)_{x_0}) + (u_1)_{x_0} (-O31 (u_1)_{x_2} + O33 (u_1)_{x_0})
783
784 // (u_1)_{x_2} O11 (u_1)_{x_2}
785 dxyz += sop[i11][qx][qy] * wx * wx * wy * wy * wDz * wDz;
786
787 // -(u_1)_{x_2} O13 (u_1)_{x_0} - (u_1)_{x_0} O31 (u_1)_{x_2}
788 dxyz += -(sop[i13][qx][qy] + sop[i31][qx][qy]) * wDx * wx * wy * wy * wDz * wz;
789
790 // (u_1)_{x_0} O33 (u_1)_{x_0})
791 dxyz += sop[i33][qx][qy] * wDx * wDx * wy * wy * wz * wz;
792 }
793 else
794 {
795 // (u_2)_{x_1} (O11 (u_2)_{x_1} - O12 (u_2)_{x_0}) - (u_2)_{x_0} (O21 (u_2)_{x_1} - O22 (u_2)_{x_0})
796
797 // (u_2)_{x_1} O11 (u_2)_{x_1}
798 dxyz += sop[i11][qx][qy] * wx * wx * wDy * wDy * wz * wz;
799
800 // -(u_2)_{x_1} O12 (u_2)_{x_0} - (u_2)_{x_0} O21 (u_2)_{x_1}
801 dxyz += -(sop[i12][qx][qy] + sop[i21][qx][qy]) * wDx * wx * wDy * wy * wz * wz;
802
803 // (u_2)_{x_0} O22 (u_2)_{x_0}
804 dxyz += sop[i22][qx][qy] * wDx * wDx * wy * wy * wz * wz;
805 }
806 }
807 }
808 }
809 }
810 }
811
812 MFEM_SYNC_THREAD;
813 } // qz loop
814
815 MFEM_FOREACH_THREAD(dz,z,D1Dz)
816 {
817 MFEM_FOREACH_THREAD(dy,y,D1Dy)
818 {
819 MFEM_FOREACH_THREAD(dx,x,D1Dx)
820 {
821 D(dx + ((dy + (dz * D1Dy)) * D1Dx) + osc, e) += dxyz;
822 }
823 }
824 }
825
826 osc += D1Dx * D1Dy * D1Dz;
827 } // c loop
828 }); // end of element loop
829}
830
831// PA H(curl) curl-curl Apply 2D kernel
832void PACurlCurlApply2D(const int D1D,
833 const int Q1D,
834 const int NE,
835 const Array<real_t> &bo,
836 const Array<real_t> &bot,
837 const Array<real_t> &gc,
838 const Array<real_t> &gct,
839 const Vector &pa_data,
840 const Vector &x,
841 Vector &y);
842
843// PA H(curl) curl-curl Apply 3D kernel
844template<int T_D1D = 0, int T_Q1D = 0>
845inline void PACurlCurlApply3D(const int d1d,
846 const int q1d,
847 const bool symmetric,
848 const int NE,
849 const Array<real_t> &bo,
850 const Array<real_t> &bc,
851 const Array<real_t> &bot,
852 const Array<real_t> &bct,
853 const Array<real_t> &gc,
854 const Array<real_t> &gct,
855 const Vector &pa_data,
856 const Vector &x,
857 Vector &y)
858{
859 MFEM_VERIFY(T_D1D || d1d <= DeviceDofQuadLimits::Get().HCURL_MAX_D1D,
860 "Error: d1d > HCURL_MAX_D1D");
861 MFEM_VERIFY(T_Q1D || q1d <= DeviceDofQuadLimits::Get().HCURL_MAX_Q1D,
862 "Error: q1d > HCURL_MAX_Q1D");
863 const int D1D = T_D1D ? T_D1D : d1d;
864 const int Q1D = T_Q1D ? T_Q1D : q1d;
865
866 auto Bo = Reshape(bo.Read(), Q1D, D1D-1);
867 auto Bc = Reshape(bc.Read(), Q1D, D1D);
868 auto Bot = Reshape(bot.Read(), D1D-1, Q1D);
869 auto Bct = Reshape(bct.Read(), D1D, Q1D);
870 auto Gc = Reshape(gc.Read(), Q1D, D1D);
871 auto Gct = Reshape(gct.Read(), D1D, Q1D);
872 auto op = Reshape(pa_data.Read(), Q1D, Q1D, Q1D, (symmetric ? 6 : 9), NE);
873 auto X = Reshape(x.Read(), 3*(D1D-1)*D1D*D1D, NE);
874 auto Y = Reshape(y.ReadWrite(), 3*(D1D-1)*D1D*D1D, NE);
875
876 mfem::forall(NE, [=] MFEM_HOST_DEVICE (int e)
877 {
878 // Using (\nabla\times u) F = 1/det(dF) dF \hat{\nabla}\times\hat{u} (p. 78 of Monk),
879 // we get:
880 // (\nabla\times u) \cdot (\nabla\times v)
881 // = 1/det(dF)^2 \hat{\nabla}\times\hat{u}^T dF^T dF \hat{\nabla}\times\hat{v}
882 // If c = 0, \hat{\nabla}\times\hat{u} reduces to [0, (u_0)_{x_2}, -(u_0)_{x_1}]
883 // If c = 1, \hat{\nabla}\times\hat{u} reduces to [-(u_1)_{x_2}, 0, (u_1)_{x_0}]
884 // If c = 2, \hat{\nabla}\times\hat{u} reduces to [(u_2)_{x_1}, -(u_2)_{x_0}, 0]
885
886 constexpr int VDIM = 3;
887 constexpr int MD1D = T_D1D ? T_D1D : DofQuadLimits::HCURL_MAX_D1D;
888 constexpr int MQ1D = T_Q1D ? T_Q1D : DofQuadLimits::HCURL_MAX_Q1D;
889 const int D1D = T_D1D ? T_D1D : d1d;
890 const int Q1D = T_Q1D ? T_Q1D : q1d;
891
892 real_t curl[MQ1D][MQ1D][MQ1D][VDIM];
893 // curl[qz][qy][qx] will be computed as the vector curl at each quadrature point.
894
895 for (int qz = 0; qz < Q1D; ++qz)
896 {
897 for (int qy = 0; qy < Q1D; ++qy)
898 {
899 for (int qx = 0; qx < Q1D; ++qx)
900 {
901 for (int c = 0; c < VDIM; ++c)
902 {
903 curl[qz][qy][qx][c] = 0.0;
904 }
905 }
906 }
907 }
908
909 // We treat x, y, z components separately for optimization specific to each.
910
911 int osc = 0;
912
913 {
914 // x component
915 const int D1Dz = D1D;
916 const int D1Dy = D1D;
917 const int D1Dx = D1D - 1;
918
919 for (int dz = 0; dz < D1Dz; ++dz)
920 {
921 real_t gradXY[MQ1D][MQ1D][2];
922 for (int qy = 0; qy < Q1D; ++qy)
923 {
924 for (int qx = 0; qx < Q1D; ++qx)
925 {
926 for (int d = 0; d < 2; ++d)
927 {
928 gradXY[qy][qx][d] = 0.0;
929 }
930 }
931 }
932
933 for (int dy = 0; dy < D1Dy; ++dy)
934 {
935 real_t massX[MQ1D];
936 for (int qx = 0; qx < Q1D; ++qx)
937 {
938 massX[qx] = 0.0;
939 }
940
941 for (int dx = 0; dx < D1Dx; ++dx)
942 {
943 const real_t t = X(dx + ((dy + (dz * D1Dy)) * D1Dx) + osc, e);
944 for (int qx = 0; qx < Q1D; ++qx)
945 {
946 massX[qx] += t * Bo(qx,dx);
947 }
948 }
949
950 for (int qy = 0; qy < Q1D; ++qy)
951 {
952 const real_t wy = Bc(qy,dy);
953 const real_t wDy = Gc(qy,dy);
954 for (int qx = 0; qx < Q1D; ++qx)
955 {
956 const real_t wx = massX[qx];
957 gradXY[qy][qx][0] += wx * wDy;
958 gradXY[qy][qx][1] += wx * wy;
959 }
960 }
961 }
962
963 for (int qz = 0; qz < Q1D; ++qz)
964 {
965 const real_t wz = Bc(qz,dz);
966 const real_t wDz = Gc(qz,dz);
967 for (int qy = 0; qy < Q1D; ++qy)
968 {
969 for (int qx = 0; qx < Q1D; ++qx)
970 {
971 // \hat{\nabla}\times\hat{u} is [0, (u_0)_{x_2}, -(u_0)_{x_1}]
972 curl[qz][qy][qx][1] += gradXY[qy][qx][1] * wDz; // (u_0)_{x_2}
973 curl[qz][qy][qx][2] -= gradXY[qy][qx][0] * wz; // -(u_0)_{x_1}
974 }
975 }
976 }
977 }
978
979 osc += D1Dx * D1Dy * D1Dz;
980 }
981
982 {
983 // y component
984 const int D1Dz = D1D;
985 const int D1Dy = D1D - 1;
986 const int D1Dx = D1D;
987
988 for (int dz = 0; dz < D1Dz; ++dz)
989 {
990 real_t gradXY[MQ1D][MQ1D][2];
991 for (int qy = 0; qy < Q1D; ++qy)
992 {
993 for (int qx = 0; qx < Q1D; ++qx)
994 {
995 for (int d = 0; d < 2; ++d)
996 {
997 gradXY[qy][qx][d] = 0.0;
998 }
999 }
1000 }
1001
1002 for (int dx = 0; dx < D1Dx; ++dx)
1003 {
1004 real_t massY[MQ1D];
1005 for (int qy = 0; qy < Q1D; ++qy)
1006 {
1007 massY[qy] = 0.0;
1008 }
1009
1010 for (int dy = 0; dy < D1Dy; ++dy)
1011 {
1012 const real_t t = X(dx + ((dy + (dz * D1Dy)) * D1Dx) + osc, e);
1013 for (int qy = 0; qy < Q1D; ++qy)
1014 {
1015 massY[qy] += t * Bo(qy,dy);
1016 }
1017 }
1018
1019 for (int qx = 0; qx < Q1D; ++qx)
1020 {
1021 const real_t wx = Bc(qx,dx);
1022 const real_t wDx = Gc(qx,dx);
1023 for (int qy = 0; qy < Q1D; ++qy)
1024 {
1025 const real_t wy = massY[qy];
1026 gradXY[qy][qx][0] += wDx * wy;
1027 gradXY[qy][qx][1] += wx * wy;
1028 }
1029 }
1030 }
1031
1032 for (int qz = 0; qz < Q1D; ++qz)
1033 {
1034 const real_t wz = Bc(qz,dz);
1035 const real_t wDz = Gc(qz,dz);
1036 for (int qy = 0; qy < Q1D; ++qy)
1037 {
1038 for (int qx = 0; qx < Q1D; ++qx)
1039 {
1040 // \hat{\nabla}\times\hat{u} is [-(u_1)_{x_2}, 0, (u_1)_{x_0}]
1041 curl[qz][qy][qx][0] -= gradXY[qy][qx][1] * wDz; // -(u_1)_{x_2}
1042 curl[qz][qy][qx][2] += gradXY[qy][qx][0] * wz; // (u_1)_{x_0}
1043 }
1044 }
1045 }
1046 }
1047
1048 osc += D1Dx * D1Dy * D1Dz;
1049 }
1050
1051 {
1052 // z component
1053 const int D1Dz = D1D - 1;
1054 const int D1Dy = D1D;
1055 const int D1Dx = D1D;
1056
1057 for (int dx = 0; dx < D1Dx; ++dx)
1058 {
1059 real_t gradYZ[MQ1D][MQ1D][2];
1060 for (int qz = 0; qz < Q1D; ++qz)
1061 {
1062 for (int qy = 0; qy < Q1D; ++qy)
1063 {
1064 for (int d = 0; d < 2; ++d)
1065 {
1066 gradYZ[qz][qy][d] = 0.0;
1067 }
1068 }
1069 }
1070
1071 for (int dy = 0; dy < D1Dy; ++dy)
1072 {
1073 real_t massZ[MQ1D];
1074 for (int qz = 0; qz < Q1D; ++qz)
1075 {
1076 massZ[qz] = 0.0;
1077 }
1078
1079 for (int dz = 0; dz < D1Dz; ++dz)
1080 {
1081 const real_t t = X(dx + ((dy + (dz * D1Dy)) * D1Dx) + osc, e);
1082 for (int qz = 0; qz < Q1D; ++qz)
1083 {
1084 massZ[qz] += t * Bo(qz,dz);
1085 }
1086 }
1087
1088 for (int qy = 0; qy < Q1D; ++qy)
1089 {
1090 const real_t wy = Bc(qy,dy);
1091 const real_t wDy = Gc(qy,dy);
1092 for (int qz = 0; qz < Q1D; ++qz)
1093 {
1094 const real_t wz = massZ[qz];
1095 gradYZ[qz][qy][0] += wz * wy;
1096 gradYZ[qz][qy][1] += wz * wDy;
1097 }
1098 }
1099 }
1100
1101 for (int qx = 0; qx < Q1D; ++qx)
1102 {
1103 const real_t wx = Bc(qx,dx);
1104 const real_t wDx = Gc(qx,dx);
1105
1106 for (int qy = 0; qy < Q1D; ++qy)
1107 {
1108 for (int qz = 0; qz < Q1D; ++qz)
1109 {
1110 // \hat{\nabla}\times\hat{u} is [(u_2)_{x_1}, -(u_2)_{x_0}, 0]
1111 curl[qz][qy][qx][0] += gradYZ[qz][qy][1] * wx; // (u_2)_{x_1}
1112 curl[qz][qy][qx][1] -= gradYZ[qz][qy][0] * wDx; // -(u_2)_{x_0}
1113 }
1114 }
1115 }
1116 }
1117 }
1118
1119 // Apply D operator.
1120 for (int qz = 0; qz < Q1D; ++qz)
1121 {
1122 for (int qy = 0; qy < Q1D; ++qy)
1123 {
1124 for (int qx = 0; qx < Q1D; ++qx)
1125 {
1126 const real_t O11 = op(qx,qy,qz,0,e);
1127 const real_t O12 = op(qx,qy,qz,1,e);
1128 const real_t O13 = op(qx,qy,qz,2,e);
1129 const real_t O21 = symmetric ? O12 : op(qx,qy,qz,3,e);
1130 const real_t O22 = symmetric ? op(qx,qy,qz,3,e) : op(qx,qy,qz,4,e);
1131 const real_t O23 = symmetric ? op(qx,qy,qz,4,e) : op(qx,qy,qz,5,e);
1132 const real_t O31 = symmetric ? O13 : op(qx,qy,qz,6,e);
1133 const real_t O32 = symmetric ? O23 : op(qx,qy,qz,7,e);
1134 const real_t O33 = symmetric ? op(qx,qy,qz,5,e) : op(qx,qy,qz,8,e);
1135
1136 const real_t c1 = (O11 * curl[qz][qy][qx][0]) + (O12 * curl[qz][qy][qx][1]) +
1137 (O13 * curl[qz][qy][qx][2]);
1138 const real_t c2 = (O21 * curl[qz][qy][qx][0]) + (O22 * curl[qz][qy][qx][1]) +
1139 (O23 * curl[qz][qy][qx][2]);
1140 const real_t c3 = (O31 * curl[qz][qy][qx][0]) + (O32 * curl[qz][qy][qx][1]) +
1141 (O33 * curl[qz][qy][qx][2]);
1142
1143 curl[qz][qy][qx][0] = c1;
1144 curl[qz][qy][qx][1] = c2;
1145 curl[qz][qy][qx][2] = c3;
1146 }
1147 }
1148 }
1149
1150 // x component
1151 osc = 0;
1152 {
1153 const int D1Dz = D1D;
1154 const int D1Dy = D1D;
1155 const int D1Dx = D1D - 1;
1156
1157 for (int qz = 0; qz < Q1D; ++qz)
1158 {
1159 real_t gradXY12[MD1D][MD1D];
1160 real_t gradXY21[MD1D][MD1D];
1161
1162 for (int dy = 0; dy < D1Dy; ++dy)
1163 {
1164 for (int dx = 0; dx < D1Dx; ++dx)
1165 {
1166 gradXY12[dy][dx] = 0.0;
1167 gradXY21[dy][dx] = 0.0;
1168 }
1169 }
1170 for (int qy = 0; qy < Q1D; ++qy)
1171 {
1172 real_t massX[MD1D][2];
1173 for (int dx = 0; dx < D1Dx; ++dx)
1174 {
1175 for (int n = 0; n < 2; ++n)
1176 {
1177 massX[dx][n] = 0.0;
1178 }
1179 }
1180 for (int qx = 0; qx < Q1D; ++qx)
1181 {
1182 for (int dx = 0; dx < D1Dx; ++dx)
1183 {
1184 const real_t wx = Bot(dx,qx);
1185
1186 massX[dx][0] += wx * curl[qz][qy][qx][1];
1187 massX[dx][1] += wx * curl[qz][qy][qx][2];
1188 }
1189 }
1190 for (int dy = 0; dy < D1Dy; ++dy)
1191 {
1192 const real_t wy = Bct(dy,qy);
1193 const real_t wDy = Gct(dy,qy);
1194
1195 for (int dx = 0; dx < D1Dx; ++dx)
1196 {
1197 gradXY21[dy][dx] += massX[dx][0] * wy;
1198 gradXY12[dy][dx] += massX[dx][1] * wDy;
1199 }
1200 }
1201 }
1202
1203 for (int dz = 0; dz < D1Dz; ++dz)
1204 {
1205 const real_t wz = Bct(dz,qz);
1206 const real_t wDz = Gct(dz,qz);
1207 for (int dy = 0; dy < D1Dy; ++dy)
1208 {
1209 for (int dx = 0; dx < D1Dx; ++dx)
1210 {
1211 // \hat{\nabla}\times\hat{u} is [0, (u_0)_{x_2}, -(u_0)_{x_1}]
1212 // (u_0)_{x_2} * (op * curl)_1 - (u_0)_{x_1} * (op * curl)_2
1213 Y(dx + ((dy + (dz * D1Dy)) * D1Dx) + osc,
1214 e) += (gradXY21[dy][dx] * wDz) - (gradXY12[dy][dx] * wz);
1215 }
1216 }
1217 }
1218 } // loop qz
1219
1220 osc += D1Dx * D1Dy * D1Dz;
1221 }
1222
1223 // y component
1224 {
1225 const int D1Dz = D1D;
1226 const int D1Dy = D1D - 1;
1227 const int D1Dx = D1D;
1228
1229 for (int qz = 0; qz < Q1D; ++qz)
1230 {
1231 real_t gradXY02[MD1D][MD1D];
1232 real_t gradXY20[MD1D][MD1D];
1233
1234 for (int dy = 0; dy < D1Dy; ++dy)
1235 {
1236 for (int dx = 0; dx < D1Dx; ++dx)
1237 {
1238 gradXY02[dy][dx] = 0.0;
1239 gradXY20[dy][dx] = 0.0;
1240 }
1241 }
1242 for (int qx = 0; qx < Q1D; ++qx)
1243 {
1244 real_t massY[MD1D][2];
1245 for (int dy = 0; dy < D1Dy; ++dy)
1246 {
1247 massY[dy][0] = 0.0;
1248 massY[dy][1] = 0.0;
1249 }
1250 for (int qy = 0; qy < Q1D; ++qy)
1251 {
1252 for (int dy = 0; dy < D1Dy; ++dy)
1253 {
1254 const real_t wy = Bot(dy,qy);
1255
1256 massY[dy][0] += wy * curl[qz][qy][qx][2];
1257 massY[dy][1] += wy * curl[qz][qy][qx][0];
1258 }
1259 }
1260 for (int dx = 0; dx < D1Dx; ++dx)
1261 {
1262 const real_t wx = Bct(dx,qx);
1263 const real_t wDx = Gct(dx,qx);
1264
1265 for (int dy = 0; dy < D1Dy; ++dy)
1266 {
1267 gradXY02[dy][dx] += massY[dy][0] * wDx;
1268 gradXY20[dy][dx] += massY[dy][1] * wx;
1269 }
1270 }
1271 }
1272
1273 for (int dz = 0; dz < D1Dz; ++dz)
1274 {
1275 const real_t wz = Bct(dz,qz);
1276 const real_t wDz = Gct(dz,qz);
1277 for (int dy = 0; dy < D1Dy; ++dy)
1278 {
1279 for (int dx = 0; dx < D1Dx; ++dx)
1280 {
1281 // \hat{\nabla}\times\hat{u} is [-(u_1)_{x_2}, 0, (u_1)_{x_0}]
1282 // -(u_1)_{x_2} * (op * curl)_0 + (u_1)_{x_0} * (op * curl)_2
1283 Y(dx + ((dy + (dz * D1Dy)) * D1Dx) + osc,
1284 e) += (-gradXY20[dy][dx] * wDz) + (gradXY02[dy][dx] * wz);
1285 }
1286 }
1287 }
1288 } // loop qz
1289
1290 osc += D1Dx * D1Dy * D1Dz;
1291 }
1292
1293 // z component
1294 {
1295 const int D1Dz = D1D - 1;
1296 const int D1Dy = D1D;
1297 const int D1Dx = D1D;
1298
1299 for (int qx = 0; qx < Q1D; ++qx)
1300 {
1301 real_t gradYZ01[MD1D][MD1D];
1302 real_t gradYZ10[MD1D][MD1D];
1303
1304 for (int dy = 0; dy < D1Dy; ++dy)
1305 {
1306 for (int dz = 0; dz < D1Dz; ++dz)
1307 {
1308 gradYZ01[dz][dy] = 0.0;
1309 gradYZ10[dz][dy] = 0.0;
1310 }
1311 }
1312 for (int qy = 0; qy < Q1D; ++qy)
1313 {
1314 real_t massZ[MD1D][2];
1315 for (int dz = 0; dz < D1Dz; ++dz)
1316 {
1317 for (int n = 0; n < 2; ++n)
1318 {
1319 massZ[dz][n] = 0.0;
1320 }
1321 }
1322 for (int qz = 0; qz < Q1D; ++qz)
1323 {
1324 for (int dz = 0; dz < D1Dz; ++dz)
1325 {
1326 const real_t wz = Bot(dz,qz);
1327
1328 massZ[dz][0] += wz * curl[qz][qy][qx][0];
1329 massZ[dz][1] += wz * curl[qz][qy][qx][1];
1330 }
1331 }
1332 for (int dy = 0; dy < D1Dy; ++dy)
1333 {
1334 const real_t wy = Bct(dy,qy);
1335 const real_t wDy = Gct(dy,qy);
1336
1337 for (int dz = 0; dz < D1Dz; ++dz)
1338 {
1339 gradYZ01[dz][dy] += wy * massZ[dz][1];
1340 gradYZ10[dz][dy] += wDy * massZ[dz][0];
1341 }
1342 }
1343 }
1344
1345 for (int dx = 0; dx < D1Dx; ++dx)
1346 {
1347 const real_t wx = Bct(dx,qx);
1348 const real_t wDx = Gct(dx,qx);
1349
1350 for (int dy = 0; dy < D1Dy; ++dy)
1351 {
1352 for (int dz = 0; dz < D1Dz; ++dz)
1353 {
1354 // \hat{\nabla}\times\hat{u} is [(u_2)_{x_1}, -(u_2)_{x_0}, 0]
1355 // (u_2)_{x_1} * (op * curl)_0 - (u_2)_{x_0} * (op * curl)_1
1356 Y(dx + ((dy + (dz * D1Dy)) * D1Dx) + osc,
1357 e) += (gradYZ10[dz][dy] * wx) - (gradYZ01[dz][dy] * wDx);
1358 }
1359 }
1360 }
1361 } // loop qx
1362 }
1363 }); // end of element loop
1364}
1365
1366// Shared memory PA H(curl) curl-curl Apply 3D kernel
1367template<int T_D1D = 0, int T_Q1D = 0>
1368inline void SmemPACurlCurlApply3D(const int d1d,
1369 const int q1d,
1370 const bool symmetric,
1371 const int NE,
1372 const Array<real_t> &bo,
1373 const Array<real_t> &bc,
1374 const Array<real_t> &bot,
1375 const Array<real_t> &bct,
1376 const Array<real_t> &gc,
1377 const Array<real_t> &gct,
1378 const Vector &pa_data,
1379 const Vector &x,
1380 Vector &y)
1381{
1382 MFEM_VERIFY(T_D1D || d1d <= DeviceDofQuadLimits::Get().HCURL_MAX_D1D,
1383 "Error: d1d > HCURL_MAX_D1D");
1384 MFEM_VERIFY(T_Q1D || q1d <= DeviceDofQuadLimits::Get().HCURL_MAX_Q1D,
1385 "Error: q1d > HCURL_MAX_Q1D");
1386 const int D1D = T_D1D ? T_D1D : d1d;
1387 const int Q1D = T_Q1D ? T_Q1D : q1d;
1388
1389 // Using (\nabla\times u) F = 1/det(dF) dF \hat{\nabla}\times\hat{u} (p. 78 of Monk), we get
1390 // (\nabla\times u) \cdot (\nabla\times v) = 1/det(dF)^2 \hat{\nabla}\times\hat{u}^T dF^T dF \hat{\nabla}\times\hat{v}
1391 // If c = 0, \hat{\nabla}\times\hat{u} reduces to [0, (u_0)_{x_2}, -(u_0)_{x_1}]
1392 // If c = 1, \hat{\nabla}\times\hat{u} reduces to [-(u_1)_{x_2}, 0, (u_1)_{x_0}]
1393 // If c = 2, \hat{\nabla}\times\hat{u} reduces to [(u_2)_{x_1}, -(u_2)_{x_0}, 0]
1394
1395 auto Bo = Reshape(bo.Read(), Q1D, D1D-1);
1396 auto Bc = Reshape(bc.Read(), Q1D, D1D);
1397 auto Gc = Reshape(gc.Read(), Q1D, D1D);
1398 auto op = Reshape(pa_data.Read(), Q1D, Q1D, Q1D, symmetric ? 6 : 9, NE);
1399 auto X = Reshape(x.Read(), 3*(D1D-1)*D1D*D1D, NE);
1400 auto Y = Reshape(y.ReadWrite(), 3*(D1D-1)*D1D*D1D, NE);
1401
1402 const int s = symmetric ? 6 : 9;
1403
1404 auto device_kernel = [=] MFEM_DEVICE (int e)
1405 {
1406 constexpr int VDIM = 3;
1407 constexpr int MD1D = T_D1D ? T_D1D : DofQuadLimits::HCURL_MAX_D1D;
1408 constexpr int MQ1D = T_Q1D ? T_Q1D : DofQuadLimits::HCURL_MAX_Q1D;
1409 const int D1D = T_D1D ? T_D1D : d1d;
1410 const int Q1D = T_Q1D ? T_Q1D : q1d;
1411
1412 MFEM_SHARED real_t sBo[MD1D][MQ1D];
1413 MFEM_SHARED real_t sBc[MD1D][MQ1D];
1414 MFEM_SHARED real_t sGc[MD1D][MQ1D];
1415
1416 real_t ope[9];
1417 MFEM_SHARED real_t sop[9][MQ1D][MQ1D];
1418 MFEM_SHARED real_t curl[MQ1D][MQ1D][3];
1419
1420 MFEM_SHARED real_t sX[MD1D][MD1D][MD1D];
1421
1422 MFEM_FOREACH_THREAD(qx,x,Q1D)
1423 {
1424 MFEM_FOREACH_THREAD(qy,y,Q1D)
1425 {
1426 MFEM_FOREACH_THREAD(qz,z,Q1D)
1427 {
1428 for (int i=0; i<s; ++i)
1429 {
1430 ope[i] = op(qx,qy,qz,i,e);
1431 }
1432 }
1433 }
1434 }
1435
1436 const int tidx = MFEM_THREAD_ID(x);
1437 const int tidy = MFEM_THREAD_ID(y);
1438 const int tidz = MFEM_THREAD_ID(z);
1439
1440 if (tidz == 0)
1441 {
1442 MFEM_FOREACH_THREAD(d,y,D1D)
1443 {
1444 MFEM_FOREACH_THREAD(q,x,Q1D)
1445 {
1446 sBc[d][q] = Bc(q,d);
1447 sGc[d][q] = Gc(q,d);
1448 if (d < D1D-1)
1449 {
1450 sBo[d][q] = Bo(q,d);
1451 }
1452 }
1453 }
1454 }
1455 MFEM_SYNC_THREAD;
1456
1457 for (int qz=0; qz < Q1D; ++qz)
1458 {
1459 if (tidz == qz)
1460 {
1461 MFEM_FOREACH_THREAD(qy,y,Q1D)
1462 {
1463 MFEM_FOREACH_THREAD(qx,x,Q1D)
1464 {
1465 for (int i=0; i<3; ++i)
1466 {
1467 curl[qy][qx][i] = 0.0;
1468 }
1469 }
1470 }
1471 }
1472
1473 int osc = 0;
1474 for (int c = 0; c < VDIM; ++c) // loop over x, y, z components
1475 {
1476 const int D1Dz = (c == 2) ? D1D - 1 : D1D;
1477 const int D1Dy = (c == 1) ? D1D - 1 : D1D;
1478 const int D1Dx = (c == 0) ? D1D - 1 : D1D;
1479
1480 MFEM_FOREACH_THREAD(dz,z,D1Dz)
1481 {
1482 MFEM_FOREACH_THREAD(dy,y,D1Dy)
1483 {
1484 MFEM_FOREACH_THREAD(dx,x,D1Dx)
1485 {
1486 sX[dz][dy][dx] = X(dx + ((dy + (dz * D1Dy)) * D1Dx) + osc, e);
1487 }
1488 }
1489 }
1490 MFEM_SYNC_THREAD;
1491
1492 if (tidz == qz)
1493 {
1494 if (c == 0)
1495 {
1496 for (int i=0; i<s; ++i)
1497 {
1498 sop[i][tidx][tidy] = ope[i];
1499 }
1500 }
1501
1502 MFEM_FOREACH_THREAD(qy,y,Q1D)
1503 {
1504 MFEM_FOREACH_THREAD(qx,x,Q1D)
1505 {
1506 real_t u = 0.0;
1507 real_t v = 0.0;
1508
1509 // We treat x, y, z components separately for optimization specific to each.
1510 if (c == 0) // x component
1511 {
1512 // \hat{\nabla}\times\hat{u} is [0, (u_0)_{x_2}, -(u_0)_{x_1}]
1513
1514 for (int dz = 0; dz < D1Dz; ++dz)
1515 {
1516 const real_t wz = sBc[dz][qz];
1517 const real_t wDz = sGc[dz][qz];
1518
1519 for (int dy = 0; dy < D1Dy; ++dy)
1520 {
1521 const real_t wy = sBc[dy][qy];
1522 const real_t wDy = sGc[dy][qy];
1523
1524 for (int dx = 0; dx < D1Dx; ++dx)
1525 {
1526 const real_t wx = sX[dz][dy][dx] * sBo[dx][qx];
1527 u += wx * wDy * wz;
1528 v += wx * wy * wDz;
1529 }
1530 }
1531 }
1532
1533 curl[qy][qx][1] += v; // (u_0)_{x_2}
1534 curl[qy][qx][2] -= u; // -(u_0)_{x_1}
1535 }
1536 else if (c == 1) // y component
1537 {
1538 // \hat{\nabla}\times\hat{u} is [-(u_1)_{x_2}, 0, (u_1)_{x_0}]
1539
1540 for (int dz = 0; dz < D1Dz; ++dz)
1541 {
1542 const real_t wz = sBc[dz][qz];
1543 const real_t wDz = sGc[dz][qz];
1544
1545 for (int dy = 0; dy < D1Dy; ++dy)
1546 {
1547 const real_t wy = sBo[dy][qy];
1548
1549 for (int dx = 0; dx < D1Dx; ++dx)
1550 {
1551 const real_t t = sX[dz][dy][dx];
1552 const real_t wx = t * sBc[dx][qx];
1553 const real_t wDx = t * sGc[dx][qx];
1554
1555 u += wDx * wy * wz;
1556 v += wx * wy * wDz;
1557 }
1558 }
1559 }
1560
1561 curl[qy][qx][0] -= v; // -(u_1)_{x_2}
1562 curl[qy][qx][2] += u; // (u_1)_{x_0}
1563 }
1564 else // z component
1565 {
1566 // \hat{\nabla}\times\hat{u} is [(u_2)_{x_1}, -(u_2)_{x_0}, 0]
1567
1568 for (int dz = 0; dz < D1Dz; ++dz)
1569 {
1570 const real_t wz = sBo[dz][qz];
1571
1572 for (int dy = 0; dy < D1Dy; ++dy)
1573 {
1574 const real_t wy = sBc[dy][qy];
1575 const real_t wDy = sGc[dy][qy];
1576
1577 for (int dx = 0; dx < D1Dx; ++dx)
1578 {
1579 const real_t t = sX[dz][dy][dx];
1580 const real_t wx = t * sBc[dx][qx];
1581 const real_t wDx = t * sGc[dx][qx];
1582
1583 u += wDx * wy * wz;
1584 v += wx * wDy * wz;
1585 }
1586 }
1587 }
1588
1589 curl[qy][qx][0] += v; // (u_2)_{x_1}
1590 curl[qy][qx][1] -= u; // -(u_2)_{x_0}
1591 }
1592 } // qx
1593 } // qy
1594 } // tidz == qz
1595
1596 osc += D1Dx * D1Dy * D1Dz;
1597 MFEM_SYNC_THREAD;
1598 } // c
1599
1600 real_t dxyz1 = 0.0;
1601 real_t dxyz2 = 0.0;
1602 real_t dxyz3 = 0.0;
1603
1604 MFEM_FOREACH_THREAD(dz,z,D1D)
1605 {
1606 const real_t wcz = sBc[dz][qz];
1607 const real_t wcDz = sGc[dz][qz];
1608 const real_t wz = (dz < D1D-1) ? sBo[dz][qz] : 0.0;
1609
1610 MFEM_FOREACH_THREAD(dy,y,D1D)
1611 {
1612 MFEM_FOREACH_THREAD(dx,x,D1D)
1613 {
1614 for (int qy = 0; qy < Q1D; ++qy)
1615 {
1616 const real_t wcy = sBc[dy][qy];
1617 const real_t wcDy = sGc[dy][qy];
1618 const real_t wy = (dy < D1D-1) ? sBo[dy][qy] : 0.0;
1619
1620 for (int qx = 0; qx < Q1D; ++qx)
1621 {
1622 const real_t O11 = sop[0][qx][qy];
1623 const real_t O12 = sop[1][qx][qy];
1624 const real_t O13 = sop[2][qx][qy];
1625 const real_t O21 = symmetric ? O12 : sop[3][qx][qy];
1626 const real_t O22 = symmetric ? sop[3][qx][qy] : sop[4][qx][qy];
1627 const real_t O23 = symmetric ? sop[4][qx][qy] : sop[5][qx][qy];
1628 const real_t O31 = symmetric ? O13 : sop[6][qx][qy];
1629 const real_t O32 = symmetric ? O23 : sop[7][qx][qy];
1630 const real_t O33 = symmetric ? sop[5][qx][qy] : sop[8][qx][qy];
1631
1632 const real_t c1 = (O11 * curl[qy][qx][0]) + (O12 * curl[qy][qx][1]) +
1633 (O13 * curl[qy][qx][2]);
1634 const real_t c2 = (O21 * curl[qy][qx][0]) + (O22 * curl[qy][qx][1]) +
1635 (O23 * curl[qy][qx][2]);
1636 const real_t c3 = (O31 * curl[qy][qx][0]) + (O32 * curl[qy][qx][1]) +
1637 (O33 * curl[qy][qx][2]);
1638
1639 const real_t wcx = sBc[dx][qx];
1640 const real_t wDx = sGc[dx][qx];
1641
1642 if (dx < D1D-1)
1643 {
1644 // \hat{\nabla}\times\hat{u} is [0, (u_0)_{x_2}, -(u_0)_{x_1}]
1645 // (u_0)_{x_2} * (op * curl)_1 - (u_0)_{x_1} * (op * curl)_2
1646 const real_t wx = sBo[dx][qx];
1647 dxyz1 += (wx * c2 * wcy * wcDz) - (wx * c3 * wcDy * wcz);
1648 }
1649
1650 // \hat{\nabla}\times\hat{u} is [-(u_1)_{x_2}, 0, (u_1)_{x_0}]
1651 // -(u_1)_{x_2} * (op * curl)_0 + (u_1)_{x_0} * (op * curl)_2
1652 dxyz2 += (-wy * c1 * wcx * wcDz) + (wy * c3 * wDx * wcz);
1653
1654 // \hat{\nabla}\times\hat{u} is [(u_2)_{x_1}, -(u_2)_{x_0}, 0]
1655 // (u_2)_{x_1} * (op * curl)_0 - (u_2)_{x_0} * (op * curl)_1
1656 dxyz3 += (wcDy * wz * c1 * wcx) - (wcy * wz * c2 * wDx);
1657 } // qx
1658 } // qy
1659 } // dx
1660 } // dy
1661 } // dz
1662
1663 MFEM_SYNC_THREAD;
1664
1665 MFEM_FOREACH_THREAD(dz,z,D1D)
1666 {
1667 MFEM_FOREACH_THREAD(dy,y,D1D)
1668 {
1669 MFEM_FOREACH_THREAD(dx,x,D1D)
1670 {
1671 if (dx < D1D-1)
1672 {
1673 Y(dx + ((dy + (dz * D1D)) * (D1D-1)), e) += dxyz1;
1674 }
1675 if (dy < D1D-1)
1676 {
1677 Y(dx + ((dy + (dz * (D1D-1))) * D1D) + ((D1D-1)*D1D*D1D), e) += dxyz2;
1678 }
1679 if (dz < D1D-1)
1680 {
1681 Y(dx + ((dy + (dz * D1D)) * D1D) + (2*(D1D-1)*D1D*D1D), e) += dxyz3;
1682 }
1683 }
1684 }
1685 }
1686 } // qz
1687 }; // end of element loop
1688
1689 auto host_kernel = [&] MFEM_LAMBDA (int)
1690 {
1691 MFEM_ABORT_KERNEL("This kernel should only be used on GPU.");
1692 };
1693
1694 ForallWrap<3>(true, NE, device_kernel, host_kernel, Q1D, Q1D, Q1D);
1695}
1696
1697// PA H(curl)-L2 Assemble 2D kernel
1698void PAHcurlL2Setup2D(const int Q1D,
1699 const int NE,
1700 const Array<real_t> &w,
1701 Vector &coeff,
1702 Vector &op);
1703
1704// PA H(curl)-L2 Assemble 3D kernel
1705void PAHcurlL2Setup3D(const int NQ,
1706 const int coeffDim,
1707 const int NE,
1708 const Array<real_t> &w,
1709 Vector &coeff,
1710 Vector &op);
1711
1712// PA H(curl)-L2 Apply 2D kernel
1713void PAHcurlL2Apply2D(const int D1D,
1714 const int D1Dtest,
1715 const int Q1D,
1716 const int NE,
1717 const Array<real_t> &bo,
1718 const Array<real_t> &bot,
1719 const Array<real_t> &bt,
1720 const Array<real_t> &gc,
1721 const Vector &pa_data,
1722 const Vector &x,
1723 Vector &y);
1724
1725// PA H(curl)-L2 Apply Transpose 2D kernel
1726void PAHcurlL2ApplyTranspose2D(const int D1D,
1727 const int D1Dtest,
1728 const int Q1D,
1729 const int NE,
1730 const Array<real_t> &bo,
1731 const Array<real_t> &bot,
1732 const Array<real_t> &b,
1733 const Array<real_t> &gct,
1734 const Vector &pa_data,
1735 const Vector &x,
1736 Vector &y);
1737
1738// PA H(curl)-L2 Apply 3D kernel
1739template<int T_D1D = 0, int T_Q1D = 0>
1740inline void PAHcurlL2Apply3D(const int d1d,
1741 const int q1d,
1742 const int coeffDim,
1743 const int NE,
1744 const Array<real_t> &bo,
1745 const Array<real_t> &bc,
1746 const Array<real_t> &bot,
1747 const Array<real_t> &bct,
1748 const Array<real_t> &gc,
1749 const Vector &pa_data,
1750 const Vector &x,
1751 Vector &y)
1752{
1753 MFEM_VERIFY(T_D1D || d1d <= DeviceDofQuadLimits::Get().HCURL_MAX_D1D,
1754 "Error: d1d > HCURL_MAX_D1D");
1755 MFEM_VERIFY(T_Q1D || q1d <= DeviceDofQuadLimits::Get().HCURL_MAX_Q1D,
1756 "Error: q1d > HCURL_MAX_Q1D");
1757 const int D1D = T_D1D ? T_D1D : d1d;
1758 const int Q1D = T_Q1D ? T_Q1D : q1d;
1759
1760 auto Bo = Reshape(bo.Read(), Q1D, D1D-1);
1761 auto Bc = Reshape(bc.Read(), Q1D, D1D);
1762 auto Bot = Reshape(bot.Read(), D1D-1, Q1D);
1763 auto Bct = Reshape(bct.Read(), D1D, Q1D);
1764 auto Gc = Reshape(gc.Read(), Q1D, D1D);
1765 auto op = Reshape(pa_data.Read(), coeffDim, Q1D, Q1D, Q1D, NE);
1766 auto X = Reshape(x.Read(), 3*(D1D-1)*D1D*D1D, NE);
1767 auto Y = Reshape(y.ReadWrite(), 3*(D1D-1)*D1D*D1D, NE);
1768
1769 mfem::forall(NE, [=] MFEM_HOST_DEVICE (int e)
1770 {
1771 // Using u = dF^{-T} \hat{u} and (\nabla\times u) F =
1772 // 1/det(dF) dF \hat{\nabla}\times\hat{u} (p. 78 of Monk), we get:
1773 // (\nabla\times u) \cdot v
1774 // = 1/det(dF) \hat{\nabla}\times\hat{u}^T dF^T dF^{-T} \hat{v}
1775 // = 1/det(dF) \hat{\nabla}\times\hat{u}^T \hat{v}
1776 // If c = 0, \hat{\nabla}\times\hat{u} reduces to [0, (u_0)_{x_2}, -(u_0)_{x_1}]
1777 // If c = 1, \hat{\nabla}\times\hat{u} reduces to [-(u_1)_{x_2}, 0, (u_1)_{x_0}]
1778 // If c = 2, \hat{\nabla}\times\hat{u} reduces to [(u_2)_{x_1}, -(u_2)_{x_0}, 0]
1779
1780 constexpr int VDIM = 3;
1781 constexpr int MD1D = T_D1D ? T_D1D : DofQuadLimits::HCURL_MAX_D1D;
1782 constexpr int MQ1D = T_Q1D ? T_Q1D : DofQuadLimits::HCURL_MAX_Q1D;
1783 const int D1D = T_D1D ? T_D1D : d1d;
1784 const int Q1D = T_Q1D ? T_Q1D : q1d;
1785
1786 real_t curl[MQ1D][MQ1D][MQ1D][VDIM];
1787 // curl[qz][qy][qx] will be computed as the vector curl at each quadrature point.
1788
1789 for (int qz = 0; qz < Q1D; ++qz)
1790 {
1791 for (int qy = 0; qy < Q1D; ++qy)
1792 {
1793 for (int qx = 0; qx < Q1D; ++qx)
1794 {
1795 for (int c = 0; c < VDIM; ++c)
1796 {
1797 curl[qz][qy][qx][c] = 0.0;
1798 }
1799 }
1800 }
1801 }
1802
1803 // We treat x, y, z components separately for optimization specific to each.
1804
1805 int osc = 0;
1806
1807 {
1808 // x component
1809 const int D1Dz = D1D;
1810 const int D1Dy = D1D;
1811 const int D1Dx = D1D - 1;
1812
1813 for (int dz = 0; dz < D1Dz; ++dz)
1814 {
1815 real_t gradXY[MQ1D][MQ1D][2];
1816 for (int qy = 0; qy < Q1D; ++qy)
1817 {
1818 for (int qx = 0; qx < Q1D; ++qx)
1819 {
1820 for (int d = 0; d < 2; ++d)
1821 {
1822 gradXY[qy][qx][d] = 0.0;
1823 }
1824 }
1825 }
1826
1827 for (int dy = 0; dy < D1Dy; ++dy)
1828 {
1829 real_t massX[MQ1D];
1830 for (int qx = 0; qx < Q1D; ++qx)
1831 {
1832 massX[qx] = 0.0;
1833 }
1834
1835 for (int dx = 0; dx < D1Dx; ++dx)
1836 {
1837 const real_t t = X(dx + ((dy + (dz * D1Dy)) * D1Dx) + osc, e);
1838 for (int qx = 0; qx < Q1D; ++qx)
1839 {
1840 massX[qx] += t * Bo(qx,dx);
1841 }
1842 }
1843
1844 for (int qy = 0; qy < Q1D; ++qy)
1845 {
1846 const real_t wy = Bc(qy,dy);
1847 const real_t wDy = Gc(qy,dy);
1848 for (int qx = 0; qx < Q1D; ++qx)
1849 {
1850 const real_t wx = massX[qx];
1851 gradXY[qy][qx][0] += wx * wDy;
1852 gradXY[qy][qx][1] += wx * wy;
1853 }
1854 }
1855 }
1856
1857 for (int qz = 0; qz < Q1D; ++qz)
1858 {
1859 const real_t wz = Bc(qz,dz);
1860 const real_t wDz = Gc(qz,dz);
1861 for (int qy = 0; qy < Q1D; ++qy)
1862 {
1863 for (int qx = 0; qx < Q1D; ++qx)
1864 {
1865 // \hat{\nabla}\times\hat{u} is [0, (u_0)_{x_2}, -(u_0)_{x_1}]
1866 curl[qz][qy][qx][1] += gradXY[qy][qx][1] * wDz; // (u_0)_{x_2}
1867 curl[qz][qy][qx][2] -= gradXY[qy][qx][0] * wz; // -(u_0)_{x_1}
1868 }
1869 }
1870 }
1871 }
1872
1873 osc += D1Dx * D1Dy * D1Dz;
1874 }
1875
1876 {
1877 // y component
1878 const int D1Dz = D1D;
1879 const int D1Dy = D1D - 1;
1880 const int D1Dx = D1D;
1881
1882 for (int dz = 0; dz < D1Dz; ++dz)
1883 {
1884 real_t gradXY[MQ1D][MQ1D][2];
1885 for (int qy = 0; qy < Q1D; ++qy)
1886 {
1887 for (int qx = 0; qx < Q1D; ++qx)
1888 {
1889 for (int d = 0; d < 2; ++d)
1890 {
1891 gradXY[qy][qx][d] = 0.0;
1892 }
1893 }
1894 }
1895
1896 for (int dx = 0; dx < D1Dx; ++dx)
1897 {
1898 real_t massY[MQ1D];
1899 for (int qy = 0; qy < Q1D; ++qy)
1900 {
1901 massY[qy] = 0.0;
1902 }
1903
1904 for (int dy = 0; dy < D1Dy; ++dy)
1905 {
1906 const real_t t = X(dx + ((dy + (dz * D1Dy)) * D1Dx) + osc, e);
1907 for (int qy = 0; qy < Q1D; ++qy)
1908 {
1909 massY[qy] += t * Bo(qy,dy);
1910 }
1911 }
1912
1913 for (int qx = 0; qx < Q1D; ++qx)
1914 {
1915 const real_t wx = Bc(qx,dx);
1916 const real_t wDx = Gc(qx,dx);
1917 for (int qy = 0; qy < Q1D; ++qy)
1918 {
1919 const real_t wy = massY[qy];
1920 gradXY[qy][qx][0] += wDx * wy;
1921 gradXY[qy][qx][1] += wx * wy;
1922 }
1923 }
1924 }
1925
1926 for (int qz = 0; qz < Q1D; ++qz)
1927 {
1928 const real_t wz = Bc(qz,dz);
1929 const real_t wDz = Gc(qz,dz);
1930 for (int qy = 0; qy < Q1D; ++qy)
1931 {
1932 for (int qx = 0; qx < Q1D; ++qx)
1933 {
1934 // \hat{\nabla}\times\hat{u} is [-(u_1)_{x_2}, 0, (u_1)_{x_0}]
1935 curl[qz][qy][qx][0] -= gradXY[qy][qx][1] * wDz; // -(u_1)_{x_2}
1936 curl[qz][qy][qx][2] += gradXY[qy][qx][0] * wz; // (u_1)_{x_0}
1937 }
1938 }
1939 }
1940 }
1941
1942 osc += D1Dx * D1Dy * D1Dz;
1943 }
1944
1945 {
1946 // z component
1947 const int D1Dz = D1D - 1;
1948 const int D1Dy = D1D;
1949 const int D1Dx = D1D;
1950
1951 for (int dx = 0; dx < D1Dx; ++dx)
1952 {
1953 real_t gradYZ[MQ1D][MQ1D][2];
1954 for (int qz = 0; qz < Q1D; ++qz)
1955 {
1956 for (int qy = 0; qy < Q1D; ++qy)
1957 {
1958 for (int d = 0; d < 2; ++d)
1959 {
1960 gradYZ[qz][qy][d] = 0.0;
1961 }
1962 }
1963 }
1964
1965 for (int dy = 0; dy < D1Dy; ++dy)
1966 {
1967 real_t massZ[MQ1D];
1968 for (int qz = 0; qz < Q1D; ++qz)
1969 {
1970 massZ[qz] = 0.0;
1971 }
1972
1973 for (int dz = 0; dz < D1Dz; ++dz)
1974 {
1975 const real_t t = X(dx + ((dy + (dz * D1Dy)) * D1Dx) + osc, e);
1976 for (int qz = 0; qz < Q1D; ++qz)
1977 {
1978 massZ[qz] += t * Bo(qz,dz);
1979 }
1980 }
1981
1982 for (int qy = 0; qy < Q1D; ++qy)
1983 {
1984 const real_t wy = Bc(qy,dy);
1985 const real_t wDy = Gc(qy,dy);
1986 for (int qz = 0; qz < Q1D; ++qz)
1987 {
1988 const real_t wz = massZ[qz];
1989 gradYZ[qz][qy][0] += wz * wy;
1990 gradYZ[qz][qy][1] += wz * wDy;
1991 }
1992 }
1993 }
1994
1995 for (int qx = 0; qx < Q1D; ++qx)
1996 {
1997 const real_t wx = Bc(qx,dx);
1998 const real_t wDx = Gc(qx,dx);
1999
2000 for (int qy = 0; qy < Q1D; ++qy)
2001 {
2002 for (int qz = 0; qz < Q1D; ++qz)
2003 {
2004 // \hat{\nabla}\times\hat{u} is [(u_2)_{x_1}, -(u_2)_{x_0}, 0]
2005 curl[qz][qy][qx][0] += gradYZ[qz][qy][1] * wx; // (u_2)_{x_1}
2006 curl[qz][qy][qx][1] -= gradYZ[qz][qy][0] * wDx; // -(u_2)_{x_0}
2007 }
2008 }
2009 }
2010 }
2011 }
2012
2013 // Apply D operator.
2014 for (int qz = 0; qz < Q1D; ++qz)
2015 {
2016 for (int qy = 0; qy < Q1D; ++qy)
2017 {
2018 for (int qx = 0; qx < Q1D; ++qx)
2019 {
2020 const real_t O11 = op(0,qx,qy,qz,e);
2021 if (coeffDim == 1)
2022 {
2023 for (int c = 0; c < VDIM; ++c)
2024 {
2025 curl[qz][qy][qx][c] *= O11;
2026 }
2027 }
2028 else
2029 {
2030 const real_t O21 = op(1,qx,qy,qz,e);
2031 const real_t O31 = op(2,qx,qy,qz,e);
2032 const real_t O12 = op(3,qx,qy,qz,e);
2033 const real_t O22 = op(4,qx,qy,qz,e);
2034 const real_t O32 = op(5,qx,qy,qz,e);
2035 const real_t O13 = op(6,qx,qy,qz,e);
2036 const real_t O23 = op(7,qx,qy,qz,e);
2037 const real_t O33 = op(8,qx,qy,qz,e);
2038 const real_t curlX = curl[qz][qy][qx][0];
2039 const real_t curlY = curl[qz][qy][qx][1];
2040 const real_t curlZ = curl[qz][qy][qx][2];
2041 curl[qz][qy][qx][0] = (O11*curlX)+(O12*curlY)+(O13*curlZ);
2042 curl[qz][qy][qx][1] = (O21*curlX)+(O22*curlY)+(O23*curlZ);
2043 curl[qz][qy][qx][2] = (O31*curlX)+(O32*curlY)+(O33*curlZ);
2044 }
2045 }
2046 }
2047 }
2048
2049 for (int qz = 0; qz < Q1D; ++qz)
2050 {
2051 real_t massXY[MD1D][MD1D];
2052
2053 osc = 0;
2054
2055 for (int c = 0; c < VDIM; ++c) // loop over x, y, z components
2056 {
2057 const int D1Dz = (c == 2) ? D1D - 1 : D1D;
2058 const int D1Dy = (c == 1) ? D1D - 1 : D1D;
2059 const int D1Dx = (c == 0) ? D1D - 1 : D1D;
2060
2061 for (int dy = 0; dy < D1Dy; ++dy)
2062 {
2063 for (int dx = 0; dx < D1Dx; ++dx)
2064 {
2065 massXY[dy][dx] = 0;
2066 }
2067 }
2068 for (int qy = 0; qy < Q1D; ++qy)
2069 {
2070 real_t massX[MD1D];
2071 for (int dx = 0; dx < D1Dx; ++dx)
2072 {
2073 massX[dx] = 0.0;
2074 }
2075 for (int qx = 0; qx < Q1D; ++qx)
2076 {
2077 for (int dx = 0; dx < D1Dx; ++dx)
2078 {
2079 massX[dx] += curl[qz][qy][qx][c] * ((c == 0) ? Bot(dx,qx) : Bct(dx,qx));
2080 }
2081 }
2082
2083 for (int dy = 0; dy < D1Dy; ++dy)
2084 {
2085 const real_t wy = (c == 1) ? Bot(dy,qy) : Bct(dy,qy);
2086 for (int dx = 0; dx < D1Dx; ++dx)
2087 {
2088 massXY[dy][dx] += massX[dx] * wy;
2089 }
2090 }
2091 }
2092
2093 for (int dz = 0; dz < D1Dz; ++dz)
2094 {
2095 const real_t wz = (c == 2) ? Bot(dz,qz) : Bct(dz,qz);
2096 for (int dy = 0; dy < D1Dy; ++dy)
2097 {
2098 for (int dx = 0; dx < D1Dx; ++dx)
2099 {
2100 Y(dx + ((dy + (dz * D1Dy)) * D1Dx) + osc, e) += massXY[dy][dx] * wz;
2101 }
2102 }
2103 }
2104
2105 osc += D1Dx * D1Dy * D1Dz;
2106 } // loop c
2107 } // loop qz
2108 }); // end of element loop
2109}
2110
2111// Shared memory PA H(curl)-L2 Apply 3D kernel
2112template<int T_D1D = 0, int T_Q1D = 0>
2113inline void SmemPAHcurlL2Apply3D(const int d1d,
2114 const int q1d,
2115 const int coeffDim,
2116 const int NE,
2117 const Array<real_t> &bo,
2118 const Array<real_t> &bc,
2119 const Array<real_t> &gc,
2120 const Vector &pa_data,
2121 const Vector &x,
2122 Vector &y)
2123{
2124 MFEM_VERIFY(T_D1D || d1d <= DeviceDofQuadLimits::Get().HCURL_MAX_D1D,
2125 "Error: d1d > HCURL_MAX_D1D");
2126 MFEM_VERIFY(T_Q1D || q1d <= DeviceDofQuadLimits::Get().HCURL_MAX_Q1D,
2127 "Error: q1d > HCURL_MAX_Q1D");
2128 const int D1D = T_D1D ? T_D1D : d1d;
2129 const int Q1D = T_Q1D ? T_Q1D : q1d;
2130
2131 auto Bo = Reshape(bo.Read(), Q1D, D1D-1);
2132 auto Bc = Reshape(bc.Read(), Q1D, D1D);
2133 auto Gc = Reshape(gc.Read(), Q1D, D1D);
2134 auto op = Reshape(pa_data.Read(), coeffDim, Q1D, Q1D, Q1D, NE);
2135 auto X = Reshape(x.Read(), 3*(D1D-1)*D1D*D1D, NE);
2136 auto Y = Reshape(y.ReadWrite(), 3*(D1D-1)*D1D*D1D, NE);
2137
2138 auto device_kernel = [=] MFEM_DEVICE (int e)
2139 {
2140 constexpr int VDIM = 3;
2141 constexpr int maxCoeffDim = 9;
2142 constexpr int MD1D = T_D1D ? T_D1D : DofQuadLimits::HCURL_MAX_D1D;
2143 constexpr int MQ1D = T_Q1D ? T_Q1D : DofQuadLimits::HCURL_MAX_Q1D;
2144 const int D1D = T_D1D ? T_D1D : d1d;
2145 const int Q1D = T_Q1D ? T_Q1D : q1d;
2146
2147 MFEM_SHARED real_t sBo[MD1D][MQ1D];
2148 MFEM_SHARED real_t sBc[MD1D][MQ1D];
2149 MFEM_SHARED real_t sGc[MD1D][MQ1D];
2150
2151 real_t opc[maxCoeffDim];
2152 MFEM_SHARED real_t sop[maxCoeffDim][MQ1D][MQ1D];
2153 MFEM_SHARED real_t curl[MQ1D][MQ1D][3];
2154
2155 MFEM_SHARED real_t sX[MD1D][MD1D][MD1D];
2156
2157 MFEM_FOREACH_THREAD(qx,x,Q1D)
2158 {
2159 MFEM_FOREACH_THREAD(qy,y,Q1D)
2160 {
2161 MFEM_FOREACH_THREAD(qz,z,Q1D)
2162 {
2163 for (int i=0; i<coeffDim; ++i)
2164 {
2165 opc[i] = op(i,qx,qy,qz,e);
2166 }
2167 }
2168 }
2169 }
2170
2171 const int tidx = MFEM_THREAD_ID(x);
2172 const int tidy = MFEM_THREAD_ID(y);
2173 const int tidz = MFEM_THREAD_ID(z);
2174
2175 if (tidz == 0)
2176 {
2177 MFEM_FOREACH_THREAD(d,y,D1D)
2178 {
2179 MFEM_FOREACH_THREAD(q,x,Q1D)
2180 {
2181 sBc[d][q] = Bc(q,d);
2182 sGc[d][q] = Gc(q,d);
2183 if (d < D1D-1)
2184 {
2185 sBo[d][q] = Bo(q,d);
2186 }
2187 }
2188 }
2189 }
2190 MFEM_SYNC_THREAD;
2191
2192 for (int qz=0; qz < Q1D; ++qz)
2193 {
2194 if (tidz == qz)
2195 {
2196 MFEM_FOREACH_THREAD(qy,y,Q1D)
2197 {
2198 MFEM_FOREACH_THREAD(qx,x,Q1D)
2199 {
2200 for (int i=0; i<3; ++i)
2201 {
2202 curl[qy][qx][i] = 0.0;
2203 }
2204 }
2205 }
2206 }
2207
2208 int osc = 0;
2209 for (int c = 0; c < VDIM; ++c) // loop over x, y, z components
2210 {
2211 const int D1Dz = (c == 2) ? D1D - 1 : D1D;
2212 const int D1Dy = (c == 1) ? D1D - 1 : D1D;
2213 const int D1Dx = (c == 0) ? D1D - 1 : D1D;
2214
2215 MFEM_FOREACH_THREAD(dz,z,D1Dz)
2216 {
2217 MFEM_FOREACH_THREAD(dy,y,D1Dy)
2218 {
2219 MFEM_FOREACH_THREAD(dx,x,D1Dx)
2220 {
2221 sX[dz][dy][dx] = X(dx + ((dy + (dz * D1Dy)) * D1Dx) + osc, e);
2222 }
2223 }
2224 }
2225 MFEM_SYNC_THREAD;
2226
2227 if (tidz == qz)
2228 {
2229 if (c == 0)
2230 {
2231 for (int i=0; i<coeffDim; ++i)
2232 {
2233 sop[i][tidx][tidy] = opc[i];
2234 }
2235 }
2236
2237 MFEM_FOREACH_THREAD(qy,y,Q1D)
2238 {
2239 MFEM_FOREACH_THREAD(qx,x,Q1D)
2240 {
2241 real_t u = 0.0;
2242 real_t v = 0.0;
2243
2244 // We treat x, y, z components separately for optimization specific to each.
2245 if (c == 0) // x component
2246 {
2247 // \hat{\nabla}\times\hat{u} is [0, (u_0)_{x_2}, -(u_0)_{x_1}]
2248
2249 for (int dz = 0; dz < D1Dz; ++dz)
2250 {
2251 const real_t wz = sBc[dz][qz];
2252 const real_t wDz = sGc[dz][qz];
2253
2254 for (int dy = 0; dy < D1Dy; ++dy)
2255 {
2256 const real_t wy = sBc[dy][qy];
2257 const real_t wDy = sGc[dy][qy];
2258
2259 for (int dx = 0; dx < D1Dx; ++dx)
2260 {
2261 const real_t wx = sX[dz][dy][dx] * sBo[dx][qx];
2262 u += wx * wDy * wz;
2263 v += wx * wy * wDz;
2264 }
2265 }
2266 }
2267
2268 curl[qy][qx][1] += v; // (u_0)_{x_2}
2269 curl[qy][qx][2] -= u; // -(u_0)_{x_1}
2270 }
2271 else if (c == 1) // y component
2272 {
2273 // \hat{\nabla}\times\hat{u} is [-(u_1)_{x_2}, 0, (u_1)_{x_0}]
2274
2275 for (int dz = 0; dz < D1Dz; ++dz)
2276 {
2277 const real_t wz = sBc[dz][qz];
2278 const real_t wDz = sGc[dz][qz];
2279
2280 for (int dy = 0; dy < D1Dy; ++dy)
2281 {
2282 const real_t wy = sBo[dy][qy];
2283
2284 for (int dx = 0; dx < D1Dx; ++dx)
2285 {
2286 const real_t t = sX[dz][dy][dx];
2287 const real_t wx = t * sBc[dx][qx];
2288 const real_t wDx = t * sGc[dx][qx];
2289
2290 u += wDx * wy * wz;
2291 v += wx * wy * wDz;
2292 }
2293 }
2294 }
2295
2296 curl[qy][qx][0] -= v; // -(u_1)_{x_2}
2297 curl[qy][qx][2] += u; // (u_1)_{x_0}
2298 }
2299 else // z component
2300 {
2301 // \hat{\nabla}\times\hat{u} is [(u_2)_{x_1}, -(u_2)_{x_0}, 0]
2302
2303 for (int dz = 0; dz < D1Dz; ++dz)
2304 {
2305 const real_t wz = sBo[dz][qz];
2306
2307 for (int dy = 0; dy < D1Dy; ++dy)
2308 {
2309 const real_t wy = sBc[dy][qy];
2310 const real_t wDy = sGc[dy][qy];
2311
2312 for (int dx = 0; dx < D1Dx; ++dx)
2313 {
2314 const real_t t = sX[dz][dy][dx];
2315 const real_t wx = t * sBc[dx][qx];
2316 const real_t wDx = t * sGc[dx][qx];
2317
2318 u += wDx * wy * wz;
2319 v += wx * wDy * wz;
2320 }
2321 }
2322 }
2323
2324 curl[qy][qx][0] += v; // (u_2)_{x_1}
2325 curl[qy][qx][1] -= u; // -(u_2)_{x_0}
2326 }
2327 } // qx
2328 } // qy
2329 } // tidz == qz
2330
2331 osc += D1Dx * D1Dy * D1Dz;
2332 MFEM_SYNC_THREAD;
2333 } // c
2334
2335 real_t dxyz1 = 0.0;
2336 real_t dxyz2 = 0.0;
2337 real_t dxyz3 = 0.0;
2338
2339 MFEM_FOREACH_THREAD(dz,z,D1D)
2340 {
2341 const real_t wcz = sBc[dz][qz];
2342 const real_t wz = (dz < D1D-1) ? sBo[dz][qz] : 0.0;
2343
2344 MFEM_FOREACH_THREAD(dy,y,D1D)
2345 {
2346 MFEM_FOREACH_THREAD(dx,x,D1D)
2347 {
2348 for (int qy = 0; qy < Q1D; ++qy)
2349 {
2350 const real_t wcy = sBc[dy][qy];
2351 const real_t wy = (dy < D1D-1) ? sBo[dy][qy] : 0.0;
2352
2353 for (int qx = 0; qx < Q1D; ++qx)
2354 {
2355 const real_t O11 = sop[0][qx][qy];
2356 real_t c1, c2, c3;
2357 if (coeffDim == 1)
2358 {
2359 c1 = O11 * curl[qy][qx][0];
2360 c2 = O11 * curl[qy][qx][1];
2361 c3 = O11 * curl[qy][qx][2];
2362 }
2363 else
2364 {
2365 const real_t O21 = sop[1][qx][qy];
2366 const real_t O31 = sop[2][qx][qy];
2367 const real_t O12 = sop[3][qx][qy];
2368 const real_t O22 = sop[4][qx][qy];
2369 const real_t O32 = sop[5][qx][qy];
2370 const real_t O13 = sop[6][qx][qy];
2371 const real_t O23 = sop[7][qx][qy];
2372 const real_t O33 = sop[8][qx][qy];
2373 c1 = (O11*curl[qy][qx][0])+(O12*curl[qy][qx][1])+(O13*curl[qy][qx][2]);
2374 c2 = (O21*curl[qy][qx][0])+(O22*curl[qy][qx][1])+(O23*curl[qy][qx][2]);
2375 c3 = (O31*curl[qy][qx][0])+(O32*curl[qy][qx][1])+(O33*curl[qy][qx][2]);
2376 }
2377
2378 const real_t wcx = sBc[dx][qx];
2379
2380 if (dx < D1D-1)
2381 {
2382 const real_t wx = sBo[dx][qx];
2383 dxyz1 += c1 * wx * wcy * wcz;
2384 }
2385
2386 dxyz2 += c2 * wcx * wy * wcz;
2387 dxyz3 += c3 * wcx * wcy * wz;
2388 } // qx
2389 } // qy
2390 } // dx
2391 } // dy
2392 } // dz
2393
2394 MFEM_SYNC_THREAD;
2395
2396 MFEM_FOREACH_THREAD(dz,z,D1D)
2397 {
2398 MFEM_FOREACH_THREAD(dy,y,D1D)
2399 {
2400 MFEM_FOREACH_THREAD(dx,x,D1D)
2401 {
2402 if (dx < D1D-1)
2403 {
2404 Y(dx + ((dy + (dz * D1D)) * (D1D-1)), e) += dxyz1;
2405 }
2406 if (dy < D1D-1)
2407 {
2408 Y(dx + ((dy + (dz * (D1D-1))) * D1D) + ((D1D-1)*D1D*D1D), e) += dxyz2;
2409 }
2410 if (dz < D1D-1)
2411 {
2412 Y(dx + ((dy + (dz * D1D)) * D1D) + (2*(D1D-1)*D1D*D1D), e) += dxyz3;
2413 }
2414 }
2415 }
2416 }
2417 } // qz
2418 }; // end of element loop
2419
2420 auto host_kernel = [&] MFEM_LAMBDA (int)
2421 {
2422 MFEM_ABORT_KERNEL("This kernel should only be used on GPU.");
2423 };
2424
2425 ForallWrap<3>(true, NE, device_kernel, host_kernel, Q1D, Q1D, Q1D);
2426}
2427
2428// PA H(curl)-L2 Apply Transpose 3D kernel
2429template<int T_D1D = 0, int T_Q1D = 0>
2430inline void PAHcurlL2ApplyTranspose3D(const int d1d,
2431 const int q1d,
2432 const int coeffDim,
2433 const int NE,
2434 const Array<real_t> &bo,
2435 const Array<real_t> &bc,
2436 const Array<real_t> &bot,
2437 const Array<real_t> &bct,
2438 const Array<real_t> &gct,
2439 const Vector &pa_data,
2440 const Vector &x,
2441 Vector &y)
2442{
2443 // See PAHcurlL2Apply3D for comments.
2444 MFEM_VERIFY(T_D1D || d1d <= DeviceDofQuadLimits::Get().HCURL_MAX_D1D,
2445 "Error: d1d > HCURL_MAX_D1D");
2446 MFEM_VERIFY(T_Q1D || q1d <= DeviceDofQuadLimits::Get().HCURL_MAX_Q1D,
2447 "Error: q1d > HCURL_MAX_Q1D");
2448 const int D1D = T_D1D ? T_D1D : d1d;
2449 const int Q1D = T_Q1D ? T_Q1D : q1d;
2450
2451 auto Bo = Reshape(bo.Read(), Q1D, D1D-1);
2452 auto Bc = Reshape(bc.Read(), Q1D, D1D);
2453 auto Bot = Reshape(bot.Read(), D1D-1, Q1D);
2454 auto Bct = Reshape(bct.Read(), D1D, Q1D);
2455 auto Gct = Reshape(gct.Read(), D1D, Q1D);
2456 auto op = Reshape(pa_data.Read(), coeffDim, Q1D, Q1D, Q1D, NE);
2457 auto X = Reshape(x.Read(), 3*(D1D-1)*D1D*D1D, NE);
2458 auto Y = Reshape(y.ReadWrite(), 3*(D1D-1)*D1D*D1D, NE);
2459
2460 mfem::forall(NE, [=] MFEM_HOST_DEVICE (int e)
2461 {
2462 constexpr int VDIM = 3;
2463 constexpr int MD1D = T_D1D ? T_D1D : DofQuadLimits::HCURL_MAX_D1D;
2464 constexpr int MQ1D = T_Q1D ? T_Q1D : DofQuadLimits::HCURL_MAX_Q1D;
2465 const int D1D = T_D1D ? T_D1D : d1d;
2466 const int Q1D = T_Q1D ? T_Q1D : q1d;
2467
2468 real_t mass[MQ1D][MQ1D][MQ1D][VDIM];
2469
2470 for (int qz = 0; qz < Q1D; ++qz)
2471 {
2472 for (int qy = 0; qy < Q1D; ++qy)
2473 {
2474 for (int qx = 0; qx < Q1D; ++qx)
2475 {
2476 for (int c = 0; c < VDIM; ++c)
2477 {
2478 mass[qz][qy][qx][c] = 0.0;
2479 }
2480 }
2481 }
2482 }
2483
2484 int osc = 0;
2485
2486 for (int c = 0; c < VDIM; ++c) // loop over x, y, z components
2487 {
2488 const int D1Dz = (c == 2) ? D1D - 1 : D1D;
2489 const int D1Dy = (c == 1) ? D1D - 1 : D1D;
2490 const int D1Dx = (c == 0) ? D1D - 1 : D1D;
2491
2492 for (int dz = 0; dz < D1Dz; ++dz)
2493 {
2494 real_t massXY[MQ1D][MQ1D];
2495 for (int qy = 0; qy < Q1D; ++qy)
2496 {
2497 for (int qx = 0; qx < Q1D; ++qx)
2498 {
2499 massXY[qy][qx] = 0.0;
2500 }
2501 }
2502
2503 for (int dy = 0; dy < D1Dy; ++dy)
2504 {
2505 real_t massX[MQ1D];
2506 for (int qx = 0; qx < Q1D; ++qx)
2507 {
2508 massX[qx] = 0.0;
2509 }
2510
2511 for (int dx = 0; dx < D1Dx; ++dx)
2512 {
2513 const real_t t = X(dx + ((dy + (dz * D1Dy)) * D1Dx) + osc, e);
2514 for (int qx = 0; qx < Q1D; ++qx)
2515 {
2516 massX[qx] += t * ((c == 0) ? Bo(qx,dx) : Bc(qx,dx));
2517 }
2518 }
2519
2520 for (int qy = 0; qy < Q1D; ++qy)
2521 {
2522 const real_t wy = (c == 1) ? Bo(qy,dy) : Bc(qy,dy);
2523 for (int qx = 0; qx < Q1D; ++qx)
2524 {
2525 const real_t wx = massX[qx];
2526 massXY[qy][qx] += wx * wy;
2527 }
2528 }
2529 }
2530
2531 for (int qz = 0; qz < Q1D; ++qz)
2532 {
2533 const real_t wz = (c == 2) ? Bo(qz,dz) : Bc(qz,dz);
2534 for (int qy = 0; qy < Q1D; ++qy)
2535 {
2536 for (int qx = 0; qx < Q1D; ++qx)
2537 {
2538 mass[qz][qy][qx][c] += massXY[qy][qx] * wz;
2539 }
2540 }
2541 }
2542 }
2543
2544 osc += D1Dx * D1Dy * D1Dz;
2545 } // loop (c) over components
2546
2547 // Apply D operator.
2548 for (int qz = 0; qz < Q1D; ++qz)
2549 {
2550 for (int qy = 0; qy < Q1D; ++qy)
2551 {
2552 for (int qx = 0; qx < Q1D; ++qx)
2553 {
2554 const real_t O11 = op(0,qx,qy,qz,e);
2555 if (coeffDim == 1)
2556 {
2557 for (int c = 0; c < VDIM; ++c)
2558 {
2559 mass[qz][qy][qx][c] *= O11;
2560 }
2561 }
2562 else
2563 {
2564 const real_t O12 = op(1,qx,qy,qz,e);
2565 const real_t O13 = op(2,qx,qy,qz,e);
2566 const real_t O21 = op(3,qx,qy,qz,e);
2567 const real_t O22 = op(4,qx,qy,qz,e);
2568 const real_t O23 = op(5,qx,qy,qz,e);
2569 const real_t O31 = op(6,qx,qy,qz,e);
2570 const real_t O32 = op(7,qx,qy,qz,e);
2571 const real_t O33 = op(8,qx,qy,qz,e);
2572 const real_t massX = mass[qz][qy][qx][0];
2573 const real_t massY = mass[qz][qy][qx][1];
2574 const real_t massZ = mass[qz][qy][qx][2];
2575 mass[qz][qy][qx][0] = (O11*massX)+(O12*massY)+(O13*massZ);
2576 mass[qz][qy][qx][1] = (O21*massX)+(O22*massY)+(O23*massZ);
2577 mass[qz][qy][qx][2] = (O31*massX)+(O32*massY)+(O33*massZ);
2578 }
2579 }
2580 }
2581 }
2582
2583 // x component
2584 osc = 0;
2585 {
2586 const int D1Dz = D1D;
2587 const int D1Dy = D1D;
2588 const int D1Dx = D1D - 1;
2589
2590 for (int qz = 0; qz < Q1D; ++qz)
2591 {
2592 real_t gradXY12[MD1D][MD1D];
2593 real_t gradXY21[MD1D][MD1D];
2594
2595 for (int dy = 0; dy < D1Dy; ++dy)
2596 {
2597 for (int dx = 0; dx < D1Dx; ++dx)
2598 {
2599 gradXY12[dy][dx] = 0.0;
2600 gradXY21[dy][dx] = 0.0;
2601 }
2602 }
2603 for (int qy = 0; qy < Q1D; ++qy)
2604 {
2605 real_t massX[MD1D][2];
2606 for (int dx = 0; dx < D1Dx; ++dx)
2607 {
2608 for (int n = 0; n < 2; ++n)
2609 {
2610 massX[dx][n] = 0.0;
2611 }
2612 }
2613 for (int qx = 0; qx < Q1D; ++qx)
2614 {
2615 for (int dx = 0; dx < D1Dx; ++dx)
2616 {
2617 const real_t wx = Bot(dx,qx);
2618
2619 massX[dx][0] += wx * mass[qz][qy][qx][1];
2620 massX[dx][1] += wx * mass[qz][qy][qx][2];
2621 }
2622 }
2623 for (int dy = 0; dy < D1Dy; ++dy)
2624 {
2625 const real_t wy = Bct(dy,qy);
2626 const real_t wDy = Gct(dy,qy);
2627
2628 for (int dx = 0; dx < D1Dx; ++dx)
2629 {
2630 gradXY21[dy][dx] += massX[dx][0] * wy;
2631 gradXY12[dy][dx] += massX[dx][1] * wDy;
2632 }
2633 }
2634 }
2635
2636 for (int dz = 0; dz < D1Dz; ++dz)
2637 {
2638 const real_t wz = Bct(dz,qz);
2639 const real_t wDz = Gct(dz,qz);
2640 for (int dy = 0; dy < D1Dy; ++dy)
2641 {
2642 for (int dx = 0; dx < D1Dx; ++dx)
2643 {
2644 // \hat{\nabla}\times\hat{u} is [0, (u_0)_{x_2}, -(u_0)_{x_1}]
2645 // (u_0)_{x_2} * (op * curl)_1 - (u_0)_{x_1} * (op * curl)_2
2646 Y(dx + ((dy + (dz * D1Dy)) * D1Dx) + osc,
2647 e) += (gradXY21[dy][dx] * wDz) - (gradXY12[dy][dx] * wz);
2648 }
2649 }
2650 }
2651 } // loop qz
2652
2653 osc += D1Dx * D1Dy * D1Dz;
2654 }
2655
2656 // y component
2657 {
2658 const int D1Dz = D1D;
2659 const int D1Dy = D1D - 1;
2660 const int D1Dx = D1D;
2661
2662 for (int qz = 0; qz < Q1D; ++qz)
2663 {
2664 real_t gradXY02[MD1D][MD1D];
2665 real_t gradXY20[MD1D][MD1D];
2666
2667 for (int dy = 0; dy < D1Dy; ++dy)
2668 {
2669 for (int dx = 0; dx < D1Dx; ++dx)
2670 {
2671 gradXY02[dy][dx] = 0.0;
2672 gradXY20[dy][dx] = 0.0;
2673 }
2674 }
2675 for (int qx = 0; qx < Q1D; ++qx)
2676 {
2677 real_t massY[MD1D][2];
2678 for (int dy = 0; dy < D1Dy; ++dy)
2679 {
2680 massY[dy][0] = 0.0;
2681 massY[dy][1] = 0.0;
2682 }
2683 for (int qy = 0; qy < Q1D; ++qy)
2684 {
2685 for (int dy = 0; dy < D1Dy; ++dy)
2686 {
2687 const real_t wy = Bot(dy,qy);
2688
2689 massY[dy][0] += wy * mass[qz][qy][qx][2];
2690 massY[dy][1] += wy * mass[qz][qy][qx][0];
2691 }
2692 }
2693 for (int dx = 0; dx < D1Dx; ++dx)
2694 {
2695 const real_t wx = Bct(dx,qx);
2696 const real_t wDx = Gct(dx,qx);
2697
2698 for (int dy = 0; dy < D1Dy; ++dy)
2699 {
2700 gradXY02[dy][dx] += massY[dy][0] * wDx;
2701 gradXY20[dy][dx] += massY[dy][1] * wx;
2702 }
2703 }
2704 }
2705
2706 for (int dz = 0; dz < D1Dz; ++dz)
2707 {
2708 const real_t wz = Bct(dz,qz);
2709 const real_t wDz = Gct(dz,qz);
2710 for (int dy = 0; dy < D1Dy; ++dy)
2711 {
2712 for (int dx = 0; dx < D1Dx; ++dx)
2713 {
2714 // \hat{\nabla}\times\hat{u} is [-(u_1)_{x_2}, 0, (u_1)_{x_0}]
2715 // -(u_1)_{x_2} * (op * curl)_0 + (u_1)_{x_0} * (op * curl)_2
2716 Y(dx + ((dy + (dz * D1Dy)) * D1Dx) + osc,
2717 e) += (-gradXY20[dy][dx] * wDz) + (gradXY02[dy][dx] * wz);
2718 }
2719 }
2720 }
2721 } // loop qz
2722
2723 osc += D1Dx * D1Dy * D1Dz;
2724 }
2725
2726 // z component
2727 {
2728 const int D1Dz = D1D - 1;
2729 const int D1Dy = D1D;
2730 const int D1Dx = D1D;
2731
2732 for (int qx = 0; qx < Q1D; ++qx)
2733 {
2734 real_t gradYZ01[MD1D][MD1D];
2735 real_t gradYZ10[MD1D][MD1D];
2736
2737 for (int dy = 0; dy < D1Dy; ++dy)
2738 {
2739 for (int dz = 0; dz < D1Dz; ++dz)
2740 {
2741 gradYZ01[dz][dy] = 0.0;
2742 gradYZ10[dz][dy] = 0.0;
2743 }
2744 }
2745 for (int qy = 0; qy < Q1D; ++qy)
2746 {
2747 real_t massZ[MD1D][2];
2748 for (int dz = 0; dz < D1Dz; ++dz)
2749 {
2750 for (int n = 0; n < 2; ++n)
2751 {
2752 massZ[dz][n] = 0.0;
2753 }
2754 }
2755 for (int qz = 0; qz < Q1D; ++qz)
2756 {
2757 for (int dz = 0; dz < D1Dz; ++dz)
2758 {
2759 const real_t wz = Bot(dz,qz);
2760
2761 massZ[dz][0] += wz * mass[qz][qy][qx][0];
2762 massZ[dz][1] += wz * mass[qz][qy][qx][1];
2763 }
2764 }
2765 for (int dy = 0; dy < D1Dy; ++dy)
2766 {
2767 const real_t wy = Bct(dy,qy);
2768 const real_t wDy = Gct(dy,qy);
2769
2770 for (int dz = 0; dz < D1Dz; ++dz)
2771 {
2772 gradYZ01[dz][dy] += wy * massZ[dz][1];
2773 gradYZ10[dz][dy] += wDy * massZ[dz][0];
2774 }
2775 }
2776 }
2777
2778 for (int dx = 0; dx < D1Dx; ++dx)
2779 {
2780 const real_t wx = Bct(dx,qx);
2781 const real_t wDx = Gct(dx,qx);
2782
2783 for (int dy = 0; dy < D1Dy; ++dy)
2784 {
2785 for (int dz = 0; dz < D1Dz; ++dz)
2786 {
2787 // \hat{\nabla}\times\hat{u} is [(u_2)_{x_1}, -(u_2)_{x_0}, 0]
2788 // (u_2)_{x_1} * (op * curl)_0 - (u_2)_{x_0} * (op * curl)_1
2789 Y(dx + ((dy + (dz * D1Dy)) * D1Dx) + osc,
2790 e) += (gradYZ10[dz][dy] * wx) - (gradYZ01[dz][dy] * wDx);
2791 }
2792 }
2793 }
2794 } // loop qx
2795 }
2796 });
2797}
2798
2799// PA H(curl)-L2 Apply Transpose 3D kernel
2800template<int T_D1D = 0, int T_Q1D = 0>
2801inline void SmemPAHcurlL2ApplyTranspose3D(const int d1d,
2802 const int q1d,
2803 const int coeffDim,
2804 const int NE,
2805 const Array<real_t> &bo,
2806 const Array<real_t> &bc,
2807 const Array<real_t> &gc,
2808 const Vector &pa_data,
2809 const Vector &x,
2810 Vector &y)
2811{
2812 MFEM_VERIFY(T_D1D || d1d <= DeviceDofQuadLimits::Get().HCURL_MAX_D1D,
2813 "Error: d1d > HCURL_MAX_D1D");
2814 MFEM_VERIFY(T_Q1D || q1d <= DeviceDofQuadLimits::Get().HCURL_MAX_Q1D,
2815 "Error: q1d > HCURL_MAX_Q1D");
2816 const int D1D = T_D1D ? T_D1D : d1d;
2817 const int Q1D = T_Q1D ? T_Q1D : q1d;
2818
2819 auto Bo = Reshape(bo.Read(), Q1D, D1D-1);
2820 auto Bc = Reshape(bc.Read(), Q1D, D1D);
2821 auto Gc = Reshape(gc.Read(), Q1D, D1D);
2822 auto op = Reshape(pa_data.Read(), coeffDim, Q1D, Q1D, Q1D, NE);
2823 auto X = Reshape(x.Read(), 3*(D1D-1)*D1D*D1D, NE);
2824 auto Y = Reshape(y.ReadWrite(), 3*(D1D-1)*D1D*D1D, NE);
2825
2826 auto device_kernel = [=] MFEM_DEVICE (int e)
2827 {
2828 constexpr int VDIM = 3;
2829 constexpr int maxCoeffDim = 9;
2830 constexpr int MD1D = T_D1D ? T_D1D : DofQuadLimits::HCURL_MAX_D1D;
2831 constexpr int MQ1D = T_Q1D ? T_Q1D : DofQuadLimits::HCURL_MAX_Q1D;
2832 const int D1D = T_D1D ? T_D1D : d1d;
2833 const int Q1D = T_Q1D ? T_Q1D : q1d;
2834
2835 MFEM_SHARED real_t sBo[MD1D][MQ1D];
2836 MFEM_SHARED real_t sBc[MD1D][MQ1D];
2837 MFEM_SHARED real_t sGc[MD1D][MQ1D];
2838
2839 real_t opc[maxCoeffDim];
2840 MFEM_SHARED real_t sop[maxCoeffDim][MQ1D][MQ1D];
2841 MFEM_SHARED real_t mass[MQ1D][MQ1D][3];
2842
2843 MFEM_SHARED real_t sX[MD1D][MD1D][MD1D];
2844
2845 MFEM_FOREACH_THREAD(qx,x,Q1D)
2846 {
2847 MFEM_FOREACH_THREAD(qy,y,Q1D)
2848 {
2849 MFEM_FOREACH_THREAD(qz,z,Q1D)
2850 {
2851 for (int i=0; i<coeffDim; ++i)
2852 {
2853 opc[i] = op(i,qx,qy,qz,e);
2854 }
2855 }
2856 }
2857 }
2858
2859 const int tidx = MFEM_THREAD_ID(x);
2860 const int tidy = MFEM_THREAD_ID(y);
2861 const int tidz = MFEM_THREAD_ID(z);
2862
2863 if (tidz == 0)
2864 {
2865 MFEM_FOREACH_THREAD(d,y,D1D)
2866 {
2867 MFEM_FOREACH_THREAD(q,x,Q1D)
2868 {
2869 sBc[d][q] = Bc(q,d);
2870 sGc[d][q] = Gc(q,d);
2871 if (d < D1D-1)
2872 {
2873 sBo[d][q] = Bo(q,d);
2874 }
2875 }
2876 }
2877 }
2878 MFEM_SYNC_THREAD;
2879
2880 for (int qz=0; qz < Q1D; ++qz)
2881 {
2882 if (tidz == qz)
2883 {
2884 MFEM_FOREACH_THREAD(qy,y,Q1D)
2885 {
2886 MFEM_FOREACH_THREAD(qx,x,Q1D)
2887 {
2888 for (int i=0; i<3; ++i)
2889 {
2890 mass[qy][qx][i] = 0.0;
2891 }
2892 }
2893 }
2894 }
2895
2896 int osc = 0;
2897 for (int c = 0; c < VDIM; ++c) // loop over x, y, z components
2898 {
2899 const int D1Dz = (c == 2) ? D1D - 1 : D1D;
2900 const int D1Dy = (c == 1) ? D1D - 1 : D1D;
2901 const int D1Dx = (c == 0) ? D1D - 1 : D1D;
2902
2903 MFEM_FOREACH_THREAD(dz,z,D1Dz)
2904 {
2905 MFEM_FOREACH_THREAD(dy,y,D1Dy)
2906 {
2907 MFEM_FOREACH_THREAD(dx,x,D1Dx)
2908 {
2909 sX[dz][dy][dx] = X(dx + ((dy + (dz * D1Dy)) * D1Dx) + osc, e);
2910 }
2911 }
2912 }
2913 MFEM_SYNC_THREAD;
2914
2915 if (tidz == qz)
2916 {
2917 if (c == 0)
2918 {
2919 for (int i=0; i<coeffDim; ++i)
2920 {
2921 sop[i][tidx][tidy] = opc[i];
2922 }
2923 }
2924
2925 MFEM_FOREACH_THREAD(qy,y,Q1D)
2926 {
2927 MFEM_FOREACH_THREAD(qx,x,Q1D)
2928 {
2929 real_t u = 0.0;
2930
2931 for (int dz = 0; dz < D1Dz; ++dz)
2932 {
2933 const real_t wz = (c == 2) ? sBo[dz][qz] : sBc[dz][qz];
2934
2935 for (int dy = 0; dy < D1Dy; ++dy)
2936 {
2937 const real_t wy = (c == 1) ? sBo[dy][qy] : sBc[dy][qy];
2938
2939 for (int dx = 0; dx < D1Dx; ++dx)
2940 {
2941 const real_t wx = sX[dz][dy][dx] * ((c == 0) ? sBo[dx][qx] : sBc[dx][qx]);
2942 u += wx * wy * wz;
2943 }
2944 }
2945 }
2946
2947 mass[qy][qx][c] += u;
2948 } // qx
2949 } // qy
2950 } // tidz == qz
2951
2952 osc += D1Dx * D1Dy * D1Dz;
2953 MFEM_SYNC_THREAD;
2954 } // c
2955
2956 real_t dxyz1 = 0.0;
2957 real_t dxyz2 = 0.0;
2958 real_t dxyz3 = 0.0;
2959
2960 MFEM_FOREACH_THREAD(dz,z,D1D)
2961 {
2962 const real_t wcz = sBc[dz][qz];
2963 const real_t wcDz = sGc[dz][qz];
2964 const real_t wz = (dz < D1D-1) ? sBo[dz][qz] : 0.0;
2965
2966 MFEM_FOREACH_THREAD(dy,y,D1D)
2967 {
2968 MFEM_FOREACH_THREAD(dx,x,D1D)
2969 {
2970 for (int qy = 0; qy < Q1D; ++qy)
2971 {
2972 const real_t wcy = sBc[dy][qy];
2973 const real_t wcDy = sGc[dy][qy];
2974 const real_t wy = (dy < D1D-1) ? sBo[dy][qy] : 0.0;
2975
2976 for (int qx = 0; qx < Q1D; ++qx)
2977 {
2978 const real_t O11 = sop[0][qx][qy];
2979 real_t c1, c2, c3;
2980 if (coeffDim == 1)
2981 {
2982 c1 = O11 * mass[qy][qx][0];
2983 c2 = O11 * mass[qy][qx][1];
2984 c3 = O11 * mass[qy][qx][2];
2985 }
2986 else
2987 {
2988 const real_t O12 = sop[1][qx][qy];
2989 const real_t O13 = sop[2][qx][qy];
2990 const real_t O21 = sop[3][qx][qy];
2991 const real_t O22 = sop[4][qx][qy];
2992 const real_t O23 = sop[5][qx][qy];
2993 const real_t O31 = sop[6][qx][qy];
2994 const real_t O32 = sop[7][qx][qy];
2995 const real_t O33 = sop[8][qx][qy];
2996
2997 c1 = (O11*mass[qy][qx][0])+(O12*mass[qy][qx][1])+(O13*mass[qy][qx][2]);
2998 c2 = (O21*mass[qy][qx][0])+(O22*mass[qy][qx][1])+(O23*mass[qy][qx][2]);
2999 c3 = (O31*mass[qy][qx][0])+(O32*mass[qy][qx][1])+(O33*mass[qy][qx][2]);
3000 }
3001
3002 const real_t wcx = sBc[dx][qx];
3003 const real_t wDx = sGc[dx][qx];
3004
3005 if (dx < D1D-1)
3006 {
3007 const real_t wx = sBo[dx][qx];
3008 dxyz1 += (wx * c2 * wcy * wcDz) - (wx * c3 * wcDy * wcz);
3009 }
3010
3011 dxyz2 += (-wy * c1 * wcx * wcDz) + (wy * c3 * wDx * wcz);
3012
3013 dxyz3 += (wcDy * wz * c1 * wcx) - (wcy * wz * c2 * wDx);
3014 } // qx
3015 } // qy
3016 } // dx
3017 } // dy
3018 } // dz
3019
3020 MFEM_SYNC_THREAD;
3021
3022 MFEM_FOREACH_THREAD(dz,z,D1D)
3023 {
3024 MFEM_FOREACH_THREAD(dy,y,D1D)
3025 {
3026 MFEM_FOREACH_THREAD(dx,x,D1D)
3027 {
3028 if (dx < D1D-1)
3029 {
3030 Y(dx + ((dy + (dz * D1D)) * (D1D-1)), e) += dxyz1;
3031 }
3032 if (dy < D1D-1)
3033 {
3034 Y(dx + ((dy + (dz * (D1D-1))) * D1D) + ((D1D-1)*D1D*D1D), e) += dxyz2;
3035 }
3036 if (dz < D1D-1)
3037 {
3038 Y(dx + ((dy + (dz * D1D)) * D1D) + (2*(D1D-1)*D1D*D1D), e) += dxyz3;
3039 }
3040 }
3041 }
3042 }
3043 } // qz
3044 }; // end of element loop
3045
3046 auto host_kernel = [&] MFEM_LAMBDA (int)
3047 {
3048 MFEM_ABORT_KERNEL("This kernel should only be used on GPU.");
3049 };
3050
3051 ForallWrap<3>(true, NE, device_kernel, host_kernel, Q1D, Q1D, Q1D);
3052}
3053
3054} // namespace internal
3055
3056} // namespace mfem
3057
3058#endif
real_t b
Definition lissajous.cpp:42
real_t u(const Vector &xvec)
Definition lor_mms.hpp:22
MFEM_HOST_DEVICE DeviceTensor< sizeof...(Dims), T > Reshape(T *ptr, Dims... dims)
Wrap a pointer as a DeviceTensor with automatically deduced template parameters.
Definition dtensor.hpp:131
void ForallWrap(const bool use_dev, const int N, d_lambda &&d_body, h_lambda &&h_body, const int X=0, const int Y=0, const int Z=0, const int G=0)
The forall kernel body wrapper.
Definition forall.hpp:675
void forall_3D(int N, int X, int Y, int Z, lambda &&body)
Definition forall.hpp:774
float real_t
Definition config.hpp:43
void forall(int N, lambda &&body)
Definition forall.hpp:753
static const DeviceDofQuadLimits & Get()
Return a const reference to the DeviceDofQuadLimits singleton.
Definition forall.hpp:128