MFEM  v4.5.2
Finite element discretization library
bilininteg_diffusion_pa.cpp
Go to the documentation of this file.
1 // Copyright (c) 2010-2023, Lawrence Livermore National Security, LLC. Produced
2 // at the Lawrence Livermore National Laboratory. All Rights reserved. See files
3 // LICENSE and NOTICE for details. LLNL-CODE-806117.
4 //
5 // This file is part of the MFEM library. For more information and source code
6 // availability visit https://mfem.org.
7 //
8 // MFEM is free software; you can redistribute it and/or modify it under the
9 // terms of the BSD-3 license. We welcome feedback and contributions, see file
10 // CONTRIBUTING.md for details.
11 
12 #include "../general/forall.hpp"
13 #include "bilininteg.hpp"
14 #include "gridfunc.hpp"
15 #include "qfunction.hpp"
17 
18 using namespace std;
19 
20 namespace mfem
21 {
22 
23 // PA Diffusion Integrator
24 
25 // OCCA 2D Assemble kernel
26 #ifdef MFEM_USE_OCCA
27 static void OccaPADiffusionSetup2D(const int D1D,
28  const int Q1D,
29  const int NE,
30  const Array<double> &W,
31  const Vector &J,
32  const Vector &C,
33  Vector &op)
34 {
35  occa::properties props;
36  props["defines/D1D"] = D1D;
37  props["defines/Q1D"] = Q1D;
38  const occa::memory o_W = OccaMemoryRead(W.GetMemory(), W.Size());
39  const occa::memory o_J = OccaMemoryRead(J.GetMemory(), J.Size());
40  const occa::memory o_C = OccaMemoryRead(C.GetMemory(), C.Size());
41  occa::memory o_op = OccaMemoryWrite(op.GetMemory(), op.Size());
42  const bool const_c = C.Size() == 1;
43  const occa_id_t id = std::make_pair(D1D,Q1D);
44  static occa_kernel_t OccaDiffSetup2D_ker;
45  if (OccaDiffSetup2D_ker.find(id) == OccaDiffSetup2D_ker.end())
46  {
47  const occa::kernel DiffusionSetup2D =
48  mfem::OccaDev().buildKernel("occa://mfem/fem/occa.okl",
49  "DiffusionSetup2D", props);
50  OccaDiffSetup2D_ker.emplace(id, DiffusionSetup2D);
51  }
52  OccaDiffSetup2D_ker.at(id)(NE, o_W, o_J, o_C, o_op, const_c);
53 }
54 
55 static void OccaPADiffusionSetup3D(const int D1D,
56  const int Q1D,
57  const int NE,
58  const Array<double> &W,
59  const Vector &J,
60  const Vector &C,
61  Vector &op)
62 {
63  occa::properties props;
64  props["defines/D1D"] = D1D;
65  props["defines/Q1D"] = Q1D;
66  const occa::memory o_W = OccaMemoryRead(W.GetMemory(), W.Size());
67  const occa::memory o_J = OccaMemoryRead(J.GetMemory(), J.Size());
68  const occa::memory o_C = OccaMemoryRead(C.GetMemory(), C.Size());
69  occa::memory o_op = OccaMemoryWrite(op.GetMemory(), op.Size());
70  const bool const_c = C.Size() == 1;
71  const occa_id_t id = std::make_pair(D1D,Q1D);
72  static occa_kernel_t OccaDiffSetup3D_ker;
73  if (OccaDiffSetup3D_ker.find(id) == OccaDiffSetup3D_ker.end())
74  {
75  const occa::kernel DiffusionSetup3D =
76  mfem::OccaDev().buildKernel("occa://mfem/fem/occa.okl",
77  "DiffusionSetup3D", props);
78  OccaDiffSetup3D_ker.emplace(id, DiffusionSetup3D);
79  }
80  OccaDiffSetup3D_ker.at(id)(NE, o_W, o_J, o_C, o_op, const_c);
81 }
82 #endif // MFEM_USE_OCCA
83 
84 template<>
85 void PADiffusionSetup2D<2>(const int Q1D,
86  const int coeffDim,
87  const int NE,
88  const Array<double> &w,
89  const Vector &j,
90  const Vector &c,
91  Vector &d)
92 {
93  const bool symmetric = (coeffDim != 4);
94  const bool const_c = c.Size() == 1;
95  MFEM_VERIFY(coeffDim < 3 ||
96  !const_c, "Constant matrix coefficient not supported");
97  const auto W = Reshape(w.Read(), Q1D,Q1D);
98  const auto J = Reshape(j.Read(), Q1D,Q1D,2,2,NE);
99  const auto C = const_c ? Reshape(c.Read(), 1,1,1,1) :
100  Reshape(c.Read(), coeffDim,Q1D,Q1D,NE);
101  auto D = Reshape(d.Write(), Q1D,Q1D, symmetric ? 3 : 4, NE);
102  MFEM_FORALL_2D(e, NE, Q1D,Q1D,1,
103  {
104  MFEM_FOREACH_THREAD(qx,x,Q1D)
105  {
106  MFEM_FOREACH_THREAD(qy,y,Q1D)
107  {
108  const double J11 = J(qx,qy,0,0,e);
109  const double J21 = J(qx,qy,1,0,e);
110  const double J12 = J(qx,qy,0,1,e);
111  const double J22 = J(qx,qy,1,1,e);
112  const double w_detJ = W(qx,qy) / ((J11*J22)-(J21*J12));
113  if (coeffDim == 3 || coeffDim == 4) // Matrix coefficient
114  {
115  // First compute entries of R = MJ^{-T}, without det J factor.
116  const double M11 = C(0,qx,qy,e);
117  const double M12 = C(1,qx,qy,e);
118  const double M21 = symmetric ? M12 : C(2,qx,qy,e);
119  const double M22 = symmetric ? C(2,qx,qy,e) : C(3,qx,qy,e);
120  const double R11 = M11*J22 - M12*J12;
121  const double R21 = M21*J22 - M22*J12;
122  const double R12 = -M11*J21 + M12*J11;
123  const double R22 = -M21*J21 + M22*J11;
124 
125  // Now set y to J^{-1}R.
126  D(qx,qy,0,e) = w_detJ * ( J22*R11 - J12*R21); // 1,1
127  D(qx,qy,1,e) = w_detJ * (-J21*R11 + J11*R21); // 2,1
128  D(qx,qy,2,e) = w_detJ * (symmetric ? (-J21*R12 + J11*R22) :
129  (J22*R12 - J12*R22)); // 2,2 or 1,2
130  if (!symmetric)
131  {
132  D(qx,qy,3,e) = w_detJ * (-J21*R12 + J11*R22); // 2,2
133  }
134  }
135  else // Vector or scalar coefficient
136  {
137  const double C1 = const_c ? C(0,0,0,0) : C(0,qx,qy,e);
138  const double C2 = const_c ? C(0,0,0,0) :
139  (coeffDim == 2 ? C(1,qx,qy,e) : C(0,qx,qy,e));
140 
141  D(qx,qy,0,e) = w_detJ * (C2*J12*J12 + C1*J22*J22); // 1,1
142  D(qx,qy,1,e) = -w_detJ * (C2*J12*J11 + C1*J22*J21); // 1,2
143  D(qx,qy,2,e) = w_detJ * (C2*J11*J11 + C1*J21*J21); // 2,2
144  }
145  }
146  }
147  });
148 }
149 
150 // PA Diffusion Assemble 2D kernel with 3D node coords
151 template<>
152 void PADiffusionSetup2D<3>(const int Q1D,
153  const int coeffDim,
154  const int NE,
155  const Array<double> &w,
156  const Vector &j,
157  const Vector &c,
158  Vector &d)
159 {
160  MFEM_VERIFY(coeffDim == 1, "Matrix and vector coefficients not supported");
161  constexpr int DIM = 2;
162  constexpr int SDIM = 3;
163  const bool const_c = c.Size() == 1;
164  const auto W = Reshape(w.Read(), Q1D,Q1D);
165  const auto J = Reshape(j.Read(), Q1D,Q1D,SDIM,DIM,NE);
166  const auto C = const_c ? Reshape(c.Read(), 1,1,1) :
167  Reshape(c.Read(), Q1D,Q1D,NE);
168  auto D = Reshape(d.Write(), Q1D,Q1D, 3, NE);
169  MFEM_FORALL_2D(e, NE, Q1D,Q1D,1,
170  {
171  MFEM_FOREACH_THREAD(qx,x,Q1D)
172  {
173  MFEM_FOREACH_THREAD(qy,y,Q1D)
174  {
175  const double wq = W(qx,qy);
176  const double J11 = J(qx,qy,0,0,e);
177  const double J21 = J(qx,qy,1,0,e);
178  const double J31 = J(qx,qy,2,0,e);
179  const double J12 = J(qx,qy,0,1,e);
180  const double J22 = J(qx,qy,1,1,e);
181  const double J32 = J(qx,qy,2,1,e);
182  const double E = J11*J11 + J21*J21 + J31*J31;
183  const double G = J12*J12 + J22*J22 + J32*J32;
184  const double F = J11*J12 + J21*J22 + J31*J32;
185  const double iw = 1.0 / sqrt(E*G - F*F);
186  const double coeff = const_c ? C(0,0,0) : C(qx,qy,e);
187  const double alpha = wq * coeff * iw;
188  D(qx,qy,0,e) = alpha * G; // 1,1
189  D(qx,qy,1,e) = -alpha * F; // 1,2
190  D(qx,qy,2,e) = alpha * E; // 2,2
191  }
192  }
193  });
194 }
195 
196 // PA Diffusion Assemble 3D kernel
197 void PADiffusionSetup3D(const int Q1D,
198  const int coeffDim,
199  const int NE,
200  const Array<double> &w,
201  const Vector &j,
202  const Vector &c,
203  Vector &d)
204 {
205  const bool symmetric = (coeffDim != 9);
206  const bool const_c = c.Size() == 1;
207  MFEM_VERIFY(coeffDim < 6 ||
208  !const_c, "Constant matrix coefficient not supported");
209  const auto W = Reshape(w.Read(), Q1D,Q1D,Q1D);
210  const auto J = Reshape(j.Read(), Q1D,Q1D,Q1D,3,3,NE);
211  const auto C = const_c ? Reshape(c.Read(), 1,1,1,1,1) :
212  Reshape(c.Read(), coeffDim,Q1D,Q1D,Q1D,NE);
213  auto D = Reshape(d.Write(), Q1D,Q1D,Q1D, symmetric ? 6 : 9, NE);
214  MFEM_FORALL_3D(e, NE, Q1D, Q1D, Q1D,
215  {
216  MFEM_FOREACH_THREAD(qx,x,Q1D)
217  {
218  MFEM_FOREACH_THREAD(qy,y,Q1D)
219  {
220  MFEM_FOREACH_THREAD(qz,z,Q1D)
221  {
222  const double J11 = J(qx,qy,qz,0,0,e);
223  const double J21 = J(qx,qy,qz,1,0,e);
224  const double J31 = J(qx,qy,qz,2,0,e);
225  const double J12 = J(qx,qy,qz,0,1,e);
226  const double J22 = J(qx,qy,qz,1,1,e);
227  const double J32 = J(qx,qy,qz,2,1,e);
228  const double J13 = J(qx,qy,qz,0,2,e);
229  const double J23 = J(qx,qy,qz,1,2,e);
230  const double J33 = J(qx,qy,qz,2,2,e);
231  const double detJ = J11 * (J22 * J33 - J32 * J23) -
232  /* */ J21 * (J12 * J33 - J32 * J13) +
233  /* */ J31 * (J12 * J23 - J22 * J13);
234  const double w_detJ = W(qx,qy,qz) / detJ;
235  // adj(J)
236  const double A11 = (J22 * J33) - (J23 * J32);
237  const double A12 = (J32 * J13) - (J12 * J33);
238  const double A13 = (J12 * J23) - (J22 * J13);
239  const double A21 = (J31 * J23) - (J21 * J33);
240  const double A22 = (J11 * J33) - (J13 * J31);
241  const double A23 = (J21 * J13) - (J11 * J23);
242  const double A31 = (J21 * J32) - (J31 * J22);
243  const double A32 = (J31 * J12) - (J11 * J32);
244  const double A33 = (J11 * J22) - (J12 * J21);
245 
246  if (coeffDim == 6 || coeffDim == 9) // Matrix coefficient version
247  {
248  // Compute entries of R = MJ^{-T} = M adj(J)^T, without det J.
249  const double M11 = C(0, qx,qy,qz, e);
250  const double M12 = C(1, qx,qy,qz, e);
251  const double M13 = C(2, qx,qy,qz, e);
252  const double M21 = (!symmetric) ? C(3, qx,qy,qz, e) : M12;
253  const double M22 = (!symmetric) ? C(4, qx,qy,qz, e) : C(3, qx,qy,qz, e);
254  const double M23 = (!symmetric) ? C(5, qx,qy,qz, e) : C(4, qx,qy,qz, e);
255  const double M31 = (!symmetric) ? C(6, qx,qy,qz, e) : M13;
256  const double M32 = (!symmetric) ? C(7, qx,qy,qz, e) : M23;
257  const double M33 = (!symmetric) ? C(8, qx,qy,qz, e) : C(5, qx,qy,qz, e);
258 
259  const double R11 = M11*A11 + M12*A12 + M13*A13;
260  const double R12 = M11*A21 + M12*A22 + M13*A23;
261  const double R13 = M11*A31 + M12*A32 + M13*A33;
262  const double R21 = M21*A11 + M22*A12 + M23*A13;
263  const double R22 = M21*A21 + M22*A22 + M23*A23;
264  const double R23 = M21*A31 + M22*A32 + M23*A33;
265  const double R31 = M31*A11 + M32*A12 + M33*A13;
266  const double R32 = M31*A21 + M32*A22 + M33*A23;
267  const double R33 = M31*A31 + M32*A32 + M33*A33;
268 
269  // Now set D to J^{-1} R = adj(J) R
270  D(qx,qy,qz,0,e) = w_detJ * (A11*R11 + A12*R21 + A13*R31); // 1,1
271  const double D12 = w_detJ * (A11*R12 + A12*R22 + A13*R32);
272  D(qx,qy,qz,1,e) = D12; // 1,2
273  D(qx,qy,qz,2,e) = w_detJ * (A11*R13 + A12*R23 + A13*R33); // 1,3
274 
275  const double D22 = w_detJ * (A21*R12 + A22*R22 + A23*R32);
276  const double D23 = w_detJ * (A21*R13 + A22*R23 + A23*R33);
277 
278  const double D33 = w_detJ * (A31*R13 + A32*R23 + A33*R33);
279 
280  D(qx,qy,qz,4,e) = symmetric ? D23 : D22; // 2,3 or 2,2
281  D(qx,qy,qz,5,e) = symmetric ? D33 : D23; // 3,3 or 2,3
282 
283  if (symmetric)
284  {
285  D(qx,qy,qz,3,e) = D22; // 2,2
286  }
287  else
288  {
289  D(qx,qy,qz,3,e) = w_detJ * (A21*R11 + A22*R21 + A23*R31); // 2,1
290  D(qx,qy,qz,6,e) = w_detJ * (A31*R11 + A32*R21 + A33*R31); // 3,1
291  D(qx,qy,qz,7,e) = w_detJ * (A31*R12 + A32*R22 + A33*R32); // 3,2
292  D(qx,qy,qz,8,e) = D33; // 3,3
293  }
294  }
295  else // Vector or scalar coefficient version
296  {
297  const double C1 = const_c ? C(0,0,0,0,0) : C(0,qx,qy,qz,e);
298  const double C2 = const_c ? C(0,0,0,0,0) :
299  (coeffDim == 3 ? C(1,qx,qy,qz,e) : C(0,qx,qy,qz,e));
300  const double C3 = const_c ? C(0,0,0,0,0) :
301  (coeffDim == 3 ? C(2,qx,qy,qz,e) : C(0,qx,qy,qz,e));
302 
303  // detJ J^{-1} J^{-T} = (1/detJ) adj(J) adj(J)^T
304  D(qx,qy,qz,0,e) = w_detJ * (C1*A11*A11 + C2*A12*A12 + C3*A13*A13); // 1,1
305  D(qx,qy,qz,1,e) = w_detJ * (C1*A11*A21 + C2*A12*A22 + C3*A13*A23); // 2,1
306  D(qx,qy,qz,2,e) = w_detJ * (C1*A11*A31 + C2*A12*A32 + C3*A13*A33); // 3,1
307  D(qx,qy,qz,3,e) = w_detJ * (C1*A21*A21 + C2*A22*A22 + C3*A23*A23); // 2,2
308  D(qx,qy,qz,4,e) = w_detJ * (C1*A21*A31 + C2*A22*A32 + C3*A23*A33); // 3,2
309  D(qx,qy,qz,5,e) = w_detJ * (C1*A31*A31 + C2*A32*A32 + C3*A33*A33); // 3,3
310  }
311  }
312  }
313  }
314  });
315 }
316 
317 static void PADiffusionSetup(const int dim,
318  const int sdim,
319  const int D1D,
320  const int Q1D,
321  const int coeffDim,
322  const int NE,
323  const Array<double> &W,
324  const Vector &J,
325  const Vector &C,
326  Vector &D)
327 {
328  if (dim == 1) { MFEM_ABORT("dim==1 not supported in PADiffusionSetup"); }
329  if (dim == 2)
330  {
331 #ifdef MFEM_USE_OCCA
332  if (DeviceCanUseOcca())
333  {
334  OccaPADiffusionSetup2D(D1D, Q1D, NE, W, J, C, D);
335  return;
336  }
337 #else
338  MFEM_CONTRACT_VAR(D1D);
339 #endif // MFEM_USE_OCCA
340  if (sdim == 2) { PADiffusionSetup2D<2>(Q1D, coeffDim, NE, W, J, C, D); }
341  if (sdim == 3) { PADiffusionSetup2D<3>(Q1D, coeffDim, NE, W, J, C, D); }
342  }
343  if (dim == 3)
344  {
345 #ifdef MFEM_USE_OCCA
346  if (DeviceCanUseOcca())
347  {
348  OccaPADiffusionSetup3D(D1D, Q1D, NE, W, J, C, D);
349  return;
350  }
351 #endif // MFEM_USE_OCCA
352  PADiffusionSetup3D(Q1D, coeffDim, NE, W, J, C, D);
353  }
354 }
355 
356 void DiffusionIntegrator::AssemblePA(const FiniteElementSpace &fes)
357 {
358  const MemoryType mt = (pa_mt == MemoryType::DEFAULT) ?
359  Device::GetDeviceMemoryType() : pa_mt;
360  // Assuming the same element type
361  fespace = &fes;
362  Mesh *mesh = fes.GetMesh();
363  if (mesh->GetNE() == 0) { return; }
364  const FiniteElement &el = *fes.GetFE(0);
365  const IntegrationRule *ir = IntRule ? IntRule : &GetRule(el, el);
366  if (DeviceCanUseCeed())
367  {
368  delete ceedOp;
369  MFEM_VERIFY(!VQ && !MQ,
370  "Only scalar coefficient supported for DiffusionIntegrator"
371  " with libCEED");
372  const bool mixed = mesh->GetNumGeometries(mesh->Dimension()) > 1 ||
373  fes.IsVariableOrder();
374  if (mixed)
375  {
376  ceedOp = new ceed::MixedPADiffusionIntegrator(*this, fes, Q);
377  }
378  else
379  {
380  ceedOp = new ceed::PADiffusionIntegrator(fes, *ir, Q);
381  }
382  return;
383  }
384  const int dims = el.GetDim();
385  const int symmDims = (dims * (dims + 1)) / 2; // 1x1: 1, 2x2: 3, 3x3: 6
386  const int nq = ir->GetNPoints();
387  dim = mesh->Dimension();
388  ne = fes.GetNE();
389  geom = mesh->GetGeometricFactors(*ir, GeometricFactors::JACOBIANS, mt);
390  const int sdim = mesh->SpaceDimension();
391  maps = &el.GetDofToQuad(*ir, DofToQuad::TENSOR);
392  dofs1D = maps->ndof;
393  quad1D = maps->nqpt;
394 
395  QuadratureSpace qs(*mesh, *ir);
396  CoefficientVector coeff(qs, CoefficientStorage::COMPRESSED);
397 
398  if (MQ) { coeff.ProjectTranspose(*MQ); }
399  else if (VQ) { coeff.Project(*VQ); }
400  else if (Q) { coeff.Project(*Q); }
401  else { coeff.SetConstant(1.0); }
402 
403  const int coeff_dim = coeff.GetVDim();
404  symmetric = (coeff_dim != dims*dims);
405  const int pa_size = symmetric ? symmDims : dims*dims;
406 
407  pa_data.SetSize(pa_size * nq * ne, mt);
408  PADiffusionSetup(dim, sdim, dofs1D, quad1D, coeff_dim, ne, ir->GetWeights(),
409  geom->J, coeff, pa_data);
410 }
411 
412 template<int T_D1D = 0, int T_Q1D = 0>
413 static void PADiffusionDiagonal2D(const int NE,
414  const bool symmetric,
415  const Array<double> &b,
416  const Array<double> &g,
417  const Vector &d,
418  Vector &y,
419  const int d1d = 0,
420  const int q1d = 0)
421 {
422  const int D1D = T_D1D ? T_D1D : d1d;
423  const int Q1D = T_Q1D ? T_Q1D : q1d;
424  MFEM_VERIFY(D1D <= MAX_D1D, "");
425  MFEM_VERIFY(Q1D <= MAX_Q1D, "");
426  auto B = Reshape(b.Read(), Q1D, D1D);
427  auto G = Reshape(g.Read(), Q1D, D1D);
428  // note the different shape for D, if this is a symmetric matrix we only
429  // store necessary entries
430  auto D = Reshape(d.Read(), Q1D*Q1D, symmetric ? 3 : 4, NE);
431  auto Y = Reshape(y.ReadWrite(), D1D, D1D, NE);
432  MFEM_FORALL(e, NE,
433  {
434  const int D1D = T_D1D ? T_D1D : d1d;
435  const int Q1D = T_Q1D ? T_Q1D : q1d;
436  constexpr int MD1 = T_D1D ? T_D1D : MAX_D1D;
437  constexpr int MQ1 = T_Q1D ? T_Q1D : MAX_Q1D;
438  // gradphi \cdot Q \gradphi has four terms
439  double QD0[MQ1][MD1];
440  double QD1[MQ1][MD1];
441  double QD2[MQ1][MD1];
442  for (int qx = 0; qx < Q1D; ++qx)
443  {
444  for (int dy = 0; dy < D1D; ++dy)
445  {
446  QD0[qx][dy] = 0.0;
447  QD1[qx][dy] = 0.0;
448  QD2[qx][dy] = 0.0;
449  for (int qy = 0; qy < Q1D; ++qy)
450  {
451  const int q = qx + qy * Q1D;
452  const double D00 = D(q,0,e);
453  const double D10 = D(q,1,e);
454  const double D01 = symmetric ? D10 : D(q,2,e);
455  const double D11 = symmetric ? D(q,2,e) : D(q,3,e);
456  QD0[qx][dy] += B(qy, dy) * B(qy, dy) * D00;
457  QD1[qx][dy] += B(qy, dy) * G(qy, dy) * (D01 + D10);
458  QD2[qx][dy] += G(qy, dy) * G(qy, dy) * D11;
459  }
460  }
461  }
462  for (int dy = 0; dy < D1D; ++dy)
463  {
464  for (int dx = 0; dx < D1D; ++dx)
465  {
466  for (int qx = 0; qx < Q1D; ++qx)
467  {
468  Y(dx,dy,e) += G(qx, dx) * G(qx, dx) * QD0[qx][dy];
469  Y(dx,dy,e) += G(qx, dx) * B(qx, dx) * QD1[qx][dy];
470  Y(dx,dy,e) += B(qx, dx) * B(qx, dx) * QD2[qx][dy];
471  }
472  }
473  }
474  });
475 }
476 
477 // Shared memory PA Diffusion Diagonal 2D kernel
478 template<int T_D1D = 0, int T_Q1D = 0, int T_NBZ = 0>
479 static void SmemPADiffusionDiagonal2D(const int NE,
480  const bool symmetric,
481  const Array<double> &b_,
482  const Array<double> &g_,
483  const Vector &d_,
484  Vector &y_,
485  const int d1d = 0,
486  const int q1d = 0)
487 {
488  const int D1D = T_D1D ? T_D1D : d1d;
489  const int Q1D = T_Q1D ? T_Q1D : q1d;
490  constexpr int NBZ = T_NBZ ? T_NBZ : 1;
491  constexpr int MQ1 = T_Q1D ? T_Q1D : MAX_Q1D;
492  constexpr int MD1 = T_D1D ? T_D1D : MAX_D1D;
493  MFEM_VERIFY(D1D <= MD1, "");
494  MFEM_VERIFY(Q1D <= MQ1, "");
495  auto b = Reshape(b_.Read(), Q1D, D1D);
496  auto g = Reshape(g_.Read(), Q1D, D1D);
497  auto D = Reshape(d_.Read(), Q1D*Q1D, symmetric ? 3 : 4, NE);
498  auto Y = Reshape(y_.ReadWrite(), D1D, D1D, NE);
499  MFEM_FORALL_2D(e, NE, Q1D, Q1D, NBZ,
500  {
501  const int tidz = MFEM_THREAD_ID(z);
502  const int D1D = T_D1D ? T_D1D : d1d;
503  const int Q1D = T_Q1D ? T_Q1D : q1d;
504  constexpr int NBZ = T_NBZ ? T_NBZ : 1;
505  constexpr int MQ1 = T_Q1D ? T_Q1D : MAX_Q1D;
506  constexpr int MD1 = T_D1D ? T_D1D : MAX_D1D;
507  MFEM_SHARED double BG[2][MQ1*MD1];
508  double (*B)[MD1] = (double (*)[MD1]) (BG+0);
509  double (*G)[MD1] = (double (*)[MD1]) (BG+1);
510  MFEM_SHARED double QD[3][NBZ][MD1][MQ1];
511  double (*QD0)[MD1] = (double (*)[MD1])(QD[0] + tidz);
512  double (*QD1)[MD1] = (double (*)[MD1])(QD[1] + tidz);
513  double (*QD2)[MD1] = (double (*)[MD1])(QD[2] + tidz);
514  if (tidz == 0)
515  {
516  MFEM_FOREACH_THREAD(d,y,D1D)
517  {
518  MFEM_FOREACH_THREAD(q,x,Q1D)
519  {
520  B[q][d] = b(q,d);
521  G[q][d] = g(q,d);
522  }
523  }
524  }
525  MFEM_SYNC_THREAD;
526  MFEM_FOREACH_THREAD(qx,x,Q1D)
527  {
528  MFEM_FOREACH_THREAD(dy,y,D1D)
529  {
530  QD0[qx][dy] = 0.0;
531  QD1[qx][dy] = 0.0;
532  QD2[qx][dy] = 0.0;
533  for (int qy = 0; qy < Q1D; ++qy)
534  {
535  const int q = qx + qy * Q1D;
536  const double D00 = D(q,0,e);
537  const double D10 = D(q,1,e);
538  const double D01 = symmetric ? D10 : D(q,2,e);
539  const double D11 = symmetric ? D(q,2,e) : D(q,3,e);
540  const double By = B[qy][dy];
541  const double Gy = G[qy][dy];
542  const double BBy = By * By;
543  const double BGy = By * Gy;
544  const double GGy = Gy * Gy;
545  QD0[qx][dy] += BBy * D00;
546  QD1[qx][dy] += BGy * (D01 + D10);
547  QD2[qx][dy] += GGy * D11;
548  }
549  }
550  }
551  MFEM_SYNC_THREAD;
552  MFEM_FOREACH_THREAD(dy,y,D1D)
553  {
554  MFEM_FOREACH_THREAD(dx,x,D1D)
555  {
556  for (int qx = 0; qx < Q1D; ++qx)
557  {
558  const double Bx = B[qx][dx];
559  const double Gx = G[qx][dx];
560  const double BBx = Bx * Bx;
561  const double BGx = Bx * Gx;
562  const double GGx = Gx * Gx;
563  Y(dx,dy,e) += GGx * QD0[qx][dy];
564  Y(dx,dy,e) += BGx * QD1[qx][dy];
565  Y(dx,dy,e) += BBx * QD2[qx][dy];
566  }
567  }
568  }
569  });
570 }
571 
572 template<int T_D1D = 0, int T_Q1D = 0>
573 static void PADiffusionDiagonal3D(const int NE,
574  const bool symmetric,
575  const Array<double> &b,
576  const Array<double> &g,
577  const Vector &d,
578  Vector &y,
579  const int d1d = 0,
580  const int q1d = 0)
581 {
582  constexpr int DIM = 3;
583  const int D1D = T_D1D ? T_D1D : d1d;
584  const int Q1D = T_Q1D ? T_Q1D : q1d;
585  constexpr int MQ1 = T_Q1D ? T_Q1D : MAX_Q1D;
586  constexpr int MD1 = T_D1D ? T_D1D : MAX_D1D;
587  MFEM_VERIFY(D1D <= MD1, "");
588  MFEM_VERIFY(Q1D <= MQ1, "");
589  auto B = Reshape(b.Read(), Q1D, D1D);
590  auto G = Reshape(g.Read(), Q1D, D1D);
591  auto Q = Reshape(d.Read(), Q1D*Q1D*Q1D, symmetric ? 6 : 9, NE);
592  auto Y = Reshape(y.ReadWrite(), D1D, D1D, D1D, NE);
593  MFEM_FORALL(e, NE,
594  {
595  const int D1D = T_D1D ? T_D1D : d1d;
596  const int Q1D = T_Q1D ? T_Q1D : q1d;
597  constexpr int MD1 = T_D1D ? T_D1D : MAX_D1D;
598  constexpr int MQ1 = T_Q1D ? T_Q1D : MAX_Q1D;
599  double QQD[MQ1][MQ1][MD1];
600  double QDD[MQ1][MD1][MD1];
601  for (int i = 0; i < DIM; ++i)
602  {
603  for (int j = 0; j < DIM; ++j)
604  {
605  // first tensor contraction, along z direction
606  for (int qx = 0; qx < Q1D; ++qx)
607  {
608  for (int qy = 0; qy < Q1D; ++qy)
609  {
610  for (int dz = 0; dz < D1D; ++dz)
611  {
612  QQD[qx][qy][dz] = 0.0;
613  for (int qz = 0; qz < Q1D; ++qz)
614  {
615  const int q = qx + (qy + qz * Q1D) * Q1D;
616  const int ksym = j >= i ?
617  3 - (3-i)*(2-i)/2 + j:
618  3 - (3-j)*(2-j)/2 + i;
619  const int k = symmetric ? ksym : (i*DIM) + j;
620  const double O = Q(q,k,e);
621  const double Bz = B(qz,dz);
622  const double Gz = G(qz,dz);
623  const double L = i==2 ? Gz : Bz;
624  const double R = j==2 ? Gz : Bz;
625  QQD[qx][qy][dz] += L * O * R;
626  }
627  }
628  }
629  }
630  // second tensor contraction, along y direction
631  for (int qx = 0; qx < Q1D; ++qx)
632  {
633  for (int dz = 0; dz < D1D; ++dz)
634  {
635  for (int dy = 0; dy < D1D; ++dy)
636  {
637  QDD[qx][dy][dz] = 0.0;
638  for (int qy = 0; qy < Q1D; ++qy)
639  {
640  const double By = B(qy,dy);
641  const double Gy = G(qy,dy);
642  const double L = i==1 ? Gy : By;
643  const double R = j==1 ? Gy : By;
644  QDD[qx][dy][dz] += L * QQD[qx][qy][dz] * R;
645  }
646  }
647  }
648  }
649  // third tensor contraction, along x direction
650  for (int dz = 0; dz < D1D; ++dz)
651  {
652  for (int dy = 0; dy < D1D; ++dy)
653  {
654  for (int dx = 0; dx < D1D; ++dx)
655  {
656  for (int qx = 0; qx < Q1D; ++qx)
657  {
658  const double Bx = B(qx,dx);
659  const double Gx = G(qx,dx);
660  const double L = i==0 ? Gx : Bx;
661  const double R = j==0 ? Gx : Bx;
662  Y(dx, dy, dz, e) += L * QDD[qx][dy][dz] * R;
663  }
664  }
665  }
666  }
667  }
668  }
669  });
670 }
671 
672 // Shared memory PA Diffusion Diagonal 3D kernel
673 template<int T_D1D = 0, int T_Q1D = 0>
674 static void SmemPADiffusionDiagonal3D(const int NE,
675  const bool symmetric,
676  const Array<double> &b_,
677  const Array<double> &g_,
678  const Vector &d_,
679  Vector &y_,
680  const int d1d = 0,
681  const int q1d = 0)
682 {
683  constexpr int DIM = 3;
684  const int D1D = T_D1D ? T_D1D : d1d;
685  const int Q1D = T_Q1D ? T_Q1D : q1d;
686  constexpr int MQ1 = T_Q1D ? T_Q1D : MAX_Q1D;
687  constexpr int MD1 = T_D1D ? T_D1D : MAX_D1D;
688  MFEM_VERIFY(D1D <= MD1, "");
689  MFEM_VERIFY(Q1D <= MQ1, "");
690  auto b = Reshape(b_.Read(), Q1D, D1D);
691  auto g = Reshape(g_.Read(), Q1D, D1D);
692  auto D = Reshape(d_.Read(), Q1D*Q1D*Q1D, symmetric ? 6 : 9, NE);
693  auto Y = Reshape(y_.ReadWrite(), D1D, D1D, D1D, NE);
694  MFEM_FORALL_3D(e, NE, Q1D, Q1D, Q1D,
695  {
696  const int tidz = MFEM_THREAD_ID(z);
697  const int D1D = T_D1D ? T_D1D : d1d;
698  const int Q1D = T_Q1D ? T_Q1D : q1d;
699  constexpr int MQ1 = T_Q1D ? T_Q1D : MAX_Q1D;
700  constexpr int MD1 = T_D1D ? T_D1D : MAX_D1D;
701  MFEM_SHARED double BG[2][MQ1*MD1];
702  double (*B)[MD1] = (double (*)[MD1]) (BG+0);
703  double (*G)[MD1] = (double (*)[MD1]) (BG+1);
704  MFEM_SHARED double QQD[MQ1][MQ1][MD1];
705  MFEM_SHARED double QDD[MQ1][MD1][MD1];
706  if (tidz == 0)
707  {
708  MFEM_FOREACH_THREAD(d,y,D1D)
709  {
710  MFEM_FOREACH_THREAD(q,x,Q1D)
711  {
712  B[q][d] = b(q,d);
713  G[q][d] = g(q,d);
714  }
715  }
716  }
717  MFEM_SYNC_THREAD;
718  for (int i = 0; i < DIM; ++i)
719  {
720  for (int j = 0; j < DIM; ++j)
721  {
722  // first tensor contraction, along z direction
723  MFEM_FOREACH_THREAD(qx,x,Q1D)
724  {
725  MFEM_FOREACH_THREAD(qy,y,Q1D)
726  {
727  MFEM_FOREACH_THREAD(dz,z,D1D)
728  {
729  QQD[qx][qy][dz] = 0.0;
730  for (int qz = 0; qz < Q1D; ++qz)
731  {
732  const int q = qx + (qy + qz * Q1D) * Q1D;
733  const int ksym = j >= i ?
734  3 - (3-i)*(2-i)/2 + j:
735  3 - (3-j)*(2-j)/2 + i;
736  const int k = symmetric ? ksym : (i*DIM) + j;
737  const double O = D(q,k,e);
738  const double Bz = B[qz][dz];
739  const double Gz = G[qz][dz];
740  const double L = i==2 ? Gz : Bz;
741  const double R = j==2 ? Gz : Bz;
742  QQD[qx][qy][dz] += L * O * R;
743  }
744  }
745  }
746  }
747  MFEM_SYNC_THREAD;
748  // second tensor contraction, along y direction
749  MFEM_FOREACH_THREAD(qx,x,Q1D)
750  {
751  MFEM_FOREACH_THREAD(dz,z,D1D)
752  {
753  MFEM_FOREACH_THREAD(dy,y,D1D)
754  {
755  QDD[qx][dy][dz] = 0.0;
756  for (int qy = 0; qy < Q1D; ++qy)
757  {
758  const double By = B[qy][dy];
759  const double Gy = G[qy][dy];
760  const double L = i==1 ? Gy : By;
761  const double R = j==1 ? Gy : By;
762  QDD[qx][dy][dz] += L * QQD[qx][qy][dz] * R;
763  }
764  }
765  }
766  }
767  MFEM_SYNC_THREAD;
768  // third tensor contraction, along x direction
769  MFEM_FOREACH_THREAD(dz,z,D1D)
770  {
771  MFEM_FOREACH_THREAD(dy,y,D1D)
772  {
773  MFEM_FOREACH_THREAD(dx,x,D1D)
774  {
775  for (int qx = 0; qx < Q1D; ++qx)
776  {
777  const double Bx = B[qx][dx];
778  const double Gx = G[qx][dx];
779  const double L = i==0 ? Gx : Bx;
780  const double R = j==0 ? Gx : Bx;
781  Y(dx, dy, dz, e) += L * QDD[qx][dy][dz] * R;
782  }
783  }
784  }
785  }
786  }
787  }
788  });
789 }
790 
791 static void PADiffusionAssembleDiagonal(const int dim,
792  const int D1D,
793  const int Q1D,
794  const int NE,
795  const bool symm,
796  const Array<double> &B,
797  const Array<double> &G,
798  const Vector &D,
799  Vector &Y)
800 {
801  if (dim == 2)
802  {
803  switch ((D1D << 4 ) | Q1D)
804  {
805  case 0x22: return SmemPADiffusionDiagonal2D<2,2,8>(NE,symm,B,G,D,Y);
806  case 0x33: return SmemPADiffusionDiagonal2D<3,3,8>(NE,symm,B,G,D,Y);
807  case 0x44: return SmemPADiffusionDiagonal2D<4,4,4>(NE,symm,B,G,D,Y);
808  case 0x55: return SmemPADiffusionDiagonal2D<5,5,4>(NE,symm,B,G,D,Y);
809  case 0x66: return SmemPADiffusionDiagonal2D<6,6,2>(NE,symm,B,G,D,Y);
810  case 0x77: return SmemPADiffusionDiagonal2D<7,7,2>(NE,symm,B,G,D,Y);
811  case 0x88: return SmemPADiffusionDiagonal2D<8,8,1>(NE,symm,B,G,D,Y);
812  case 0x99: return SmemPADiffusionDiagonal2D<9,9,1>(NE,symm,B,G,D,Y);
813  default: return PADiffusionDiagonal2D(NE,symm,B,G,D,Y,D1D,Q1D);
814  }
815  }
816  else if (dim == 3)
817  {
818  switch ((D1D << 4 ) | Q1D)
819  {
820  case 0x22: return SmemPADiffusionDiagonal3D<2,2>(NE,symm,B,G,D,Y);
821  case 0x23: return SmemPADiffusionDiagonal3D<2,3>(NE,symm,B,G,D,Y);
822  case 0x34: return SmemPADiffusionDiagonal3D<3,4>(NE,symm,B,G,D,Y);
823  case 0x45: return SmemPADiffusionDiagonal3D<4,5>(NE,symm,B,G,D,Y);
824  case 0x46: return SmemPADiffusionDiagonal3D<4,6>(NE,symm,B,G,D,Y);
825  case 0x56: return SmemPADiffusionDiagonal3D<5,6>(NE,symm,B,G,D,Y);
826  case 0x67: return SmemPADiffusionDiagonal3D<6,7>(NE,symm,B,G,D,Y);
827  case 0x78: return SmemPADiffusionDiagonal3D<7,8>(NE,symm,B,G,D,Y);
828  case 0x89: return SmemPADiffusionDiagonal3D<8,9>(NE,symm,B,G,D,Y);
829  case 0x9A: return SmemPADiffusionDiagonal3D<9,10>(NE,symm,B,G,D,Y);
830  default: return PADiffusionDiagonal3D(NE,symm,B,G,D,Y,D1D,Q1D);
831  }
832  }
833  MFEM_ABORT("Unknown kernel.");
834 }
835 
836 void DiffusionIntegrator::AssembleDiagonalPA(Vector &diag)
837 {
838  if (DeviceCanUseCeed())
839  {
840  ceedOp->GetDiagonal(diag);
841  }
842  else
843  {
844  if (pa_data.Size()==0) { AssemblePA(*fespace); }
845  PADiffusionAssembleDiagonal(dim, dofs1D, quad1D, ne, symmetric,
846  maps->B, maps->G, pa_data, diag);
847  }
848 }
849 
850 
851 #ifdef MFEM_USE_OCCA
852 // OCCA PA Diffusion Apply 2D kernel
853 static void OccaPADiffusionApply2D(const int D1D,
854  const int Q1D,
855  const int NE,
856  const Array<double> &B,
857  const Array<double> &G,
858  const Array<double> &Bt,
859  const Array<double> &Gt,
860  const Vector &D,
861  const Vector &X,
862  Vector &Y)
863 {
864  occa::properties props;
865  props["defines/D1D"] = D1D;
866  props["defines/Q1D"] = Q1D;
867  const occa::memory o_B = OccaMemoryRead(B.GetMemory(), B.Size());
868  const occa::memory o_G = OccaMemoryRead(G.GetMemory(), G.Size());
869  const occa::memory o_Bt = OccaMemoryRead(Bt.GetMemory(), Bt.Size());
870  const occa::memory o_Gt = OccaMemoryRead(Gt.GetMemory(), Gt.Size());
871  const occa::memory o_D = OccaMemoryRead(D.GetMemory(), D.Size());
872  const occa::memory o_X = OccaMemoryRead(X.GetMemory(), X.Size());
873  occa::memory o_Y = OccaMemoryReadWrite(Y.GetMemory(), Y.Size());
874  const occa_id_t id = std::make_pair(D1D,Q1D);
875  if (!Device::Allows(Backend::OCCA_CUDA))
876  {
877  static occa_kernel_t OccaDiffApply2D_cpu;
878  if (OccaDiffApply2D_cpu.find(id) == OccaDiffApply2D_cpu.end())
879  {
880  const occa::kernel DiffusionApply2D_CPU =
881  mfem::OccaDev().buildKernel("occa://mfem/fem/occa.okl",
882  "DiffusionApply2D_CPU", props);
883  OccaDiffApply2D_cpu.emplace(id, DiffusionApply2D_CPU);
884  }
885  OccaDiffApply2D_cpu.at(id)(NE, o_B, o_G, o_Bt, o_Gt, o_D, o_X, o_Y);
886  }
887  else
888  {
889  static occa_kernel_t OccaDiffApply2D_gpu;
890  if (OccaDiffApply2D_gpu.find(id) == OccaDiffApply2D_gpu.end())
891  {
892  const occa::kernel DiffusionApply2D_GPU =
893  mfem::OccaDev().buildKernel("occa://mfem/fem/occa.okl",
894  "DiffusionApply2D_GPU", props);
895  OccaDiffApply2D_gpu.emplace(id, DiffusionApply2D_GPU);
896  }
897  OccaDiffApply2D_gpu.at(id)(NE, o_B, o_G, o_Bt, o_Gt, o_D, o_X, o_Y);
898  }
899 }
900 
901 // OCCA PA Diffusion Apply 3D kernel
902 static void OccaPADiffusionApply3D(const int D1D,
903  const int Q1D,
904  const int NE,
905  const Array<double> &B,
906  const Array<double> &G,
907  const Array<double> &Bt,
908  const Array<double> &Gt,
909  const Vector &D,
910  const Vector &X,
911  Vector &Y)
912 {
913  occa::properties props;
914  props["defines/D1D"] = D1D;
915  props["defines/Q1D"] = Q1D;
916  const occa::memory o_B = OccaMemoryRead(B.GetMemory(), B.Size());
917  const occa::memory o_G = OccaMemoryRead(G.GetMemory(), G.Size());
918  const occa::memory o_Bt = OccaMemoryRead(Bt.GetMemory(), Bt.Size());
919  const occa::memory o_Gt = OccaMemoryRead(Gt.GetMemory(), Gt.Size());
920  const occa::memory o_D = OccaMemoryRead(D.GetMemory(), D.Size());
921  const occa::memory o_X = OccaMemoryRead(X.GetMemory(), X.Size());
922  occa::memory o_Y = OccaMemoryReadWrite(Y.GetMemory(), Y.Size());
923  const occa_id_t id = std::make_pair(D1D,Q1D);
924  if (!Device::Allows(Backend::OCCA_CUDA))
925  {
926  static occa_kernel_t OccaDiffApply3D_cpu;
927  if (OccaDiffApply3D_cpu.find(id) == OccaDiffApply3D_cpu.end())
928  {
929  const occa::kernel DiffusionApply3D_CPU =
930  mfem::OccaDev().buildKernel("occa://mfem/fem/occa.okl",
931  "DiffusionApply3D_CPU", props);
932  OccaDiffApply3D_cpu.emplace(id, DiffusionApply3D_CPU);
933  }
934  OccaDiffApply3D_cpu.at(id)(NE, o_B, o_G, o_Bt, o_Gt, o_D, o_X, o_Y);
935  }
936  else
937  {
938  static occa_kernel_t OccaDiffApply3D_gpu;
939  if (OccaDiffApply3D_gpu.find(id) == OccaDiffApply3D_gpu.end())
940  {
941  const occa::kernel DiffusionApply3D_GPU =
942  mfem::OccaDev().buildKernel("occa://mfem/fem/occa.okl",
943  "DiffusionApply3D_GPU", props);
944  OccaDiffApply3D_gpu.emplace(id, DiffusionApply3D_GPU);
945  }
946  OccaDiffApply3D_gpu.at(id)(NE, o_B, o_G, o_Bt, o_Gt, o_D, o_X, o_Y);
947  }
948 }
949 #endif // MFEM_USE_OCCA
950 
951 // PA Diffusion Apply 2D kernel
952 template<int T_D1D = 0, int T_Q1D = 0>
953 static void PADiffusionApply2D(const int NE,
954  const bool symmetric,
955  const Array<double> &b_,
956  const Array<double> &g_,
957  const Array<double> &bt_,
958  const Array<double> &gt_,
959  const Vector &d_,
960  const Vector &x_,
961  Vector &y_,
962  const int d1d = 0,
963  const int q1d = 0)
964 {
965  const int D1D = T_D1D ? T_D1D : d1d;
966  const int Q1D = T_Q1D ? T_Q1D : q1d;
967  MFEM_VERIFY(D1D <= MAX_D1D, "");
968  MFEM_VERIFY(Q1D <= MAX_Q1D, "");
969  auto B = Reshape(b_.Read(), Q1D, D1D);
970  auto G = Reshape(g_.Read(), Q1D, D1D);
971  auto Bt = Reshape(bt_.Read(), D1D, Q1D);
972  auto Gt = Reshape(gt_.Read(), D1D, Q1D);
973  auto D = Reshape(d_.Read(), Q1D*Q1D, symmetric ? 3 : 4, NE);
974  auto X = Reshape(x_.Read(), D1D, D1D, NE);
975  auto Y = Reshape(y_.ReadWrite(), D1D, D1D, NE);
976  MFEM_FORALL(e, NE,
977  {
978  const int D1D = T_D1D ? T_D1D : d1d;
979  const int Q1D = T_Q1D ? T_Q1D : q1d;
980  // the following variables are evaluated at compile time
981  constexpr int max_D1D = T_D1D ? T_D1D : MAX_D1D;
982  constexpr int max_Q1D = T_Q1D ? T_Q1D : MAX_Q1D;
983 
984  double grad[max_Q1D][max_Q1D][2];
985  for (int qy = 0; qy < Q1D; ++qy)
986  {
987  for (int qx = 0; qx < Q1D; ++qx)
988  {
989  grad[qy][qx][0] = 0.0;
990  grad[qy][qx][1] = 0.0;
991  }
992  }
993  for (int dy = 0; dy < D1D; ++dy)
994  {
995  double gradX[max_Q1D][2];
996  for (int qx = 0; qx < Q1D; ++qx)
997  {
998  gradX[qx][0] = 0.0;
999  gradX[qx][1] = 0.0;
1000  }
1001  for (int dx = 0; dx < D1D; ++dx)
1002  {
1003  const double s = X(dx,dy,e);
1004  for (int qx = 0; qx < Q1D; ++qx)
1005  {
1006  gradX[qx][0] += s * B(qx,dx);
1007  gradX[qx][1] += s * G(qx,dx);
1008  }
1009  }
1010  for (int qy = 0; qy < Q1D; ++qy)
1011  {
1012  const double wy = B(qy,dy);
1013  const double wDy = G(qy,dy);
1014  for (int qx = 0; qx < Q1D; ++qx)
1015  {
1016  grad[qy][qx][0] += gradX[qx][1] * wy;
1017  grad[qy][qx][1] += gradX[qx][0] * wDy;
1018  }
1019  }
1020  }
1021  // Calculate Dxy, xDy in plane
1022  for (int qy = 0; qy < Q1D; ++qy)
1023  {
1024  for (int qx = 0; qx < Q1D; ++qx)
1025  {
1026  const int q = qx + qy * Q1D;
1027 
1028  const double O11 = D(q,0,e);
1029  const double O21 = D(q,1,e);
1030  const double O12 = symmetric ? O21 : D(q,2,e);
1031  const double O22 = symmetric ? D(q,2,e) : D(q,3,e);
1032 
1033  const double gradX = grad[qy][qx][0];
1034  const double gradY = grad[qy][qx][1];
1035 
1036  grad[qy][qx][0] = (O11 * gradX) + (O12 * gradY);
1037  grad[qy][qx][1] = (O21 * gradX) + (O22 * gradY);
1038  }
1039  }
1040  for (int qy = 0; qy < Q1D; ++qy)
1041  {
1042  double gradX[max_D1D][2];
1043  for (int dx = 0; dx < D1D; ++dx)
1044  {
1045  gradX[dx][0] = 0;
1046  gradX[dx][1] = 0;
1047  }
1048  for (int qx = 0; qx < Q1D; ++qx)
1049  {
1050  const double gX = grad[qy][qx][0];
1051  const double gY = grad[qy][qx][1];
1052  for (int dx = 0; dx < D1D; ++dx)
1053  {
1054  const double wx = Bt(dx,qx);
1055  const double wDx = Gt(dx,qx);
1056  gradX[dx][0] += gX * wDx;
1057  gradX[dx][1] += gY * wx;
1058  }
1059  }
1060  for (int dy = 0; dy < D1D; ++dy)
1061  {
1062  const double wy = Bt(dy,qy);
1063  const double wDy = Gt(dy,qy);
1064  for (int dx = 0; dx < D1D; ++dx)
1065  {
1066  Y(dx,dy,e) += ((gradX[dx][0] * wy) + (gradX[dx][1] * wDy));
1067  }
1068  }
1069  }
1070  });
1071 }
1072 
1073 // Shared memory PA Diffusion Apply 2D kernel
1074 template<int T_D1D = 0, int T_Q1D = 0, int T_NBZ = 0>
1075 static void SmemPADiffusionApply2D(const int NE,
1076  const bool symmetric,
1077  const Array<double> &b_,
1078  const Array<double> &g_,
1079  const Vector &d_,
1080  const Vector &x_,
1081  Vector &y_,
1082  const int d1d = 0,
1083  const int q1d = 0)
1084 {
1085  const int D1D = T_D1D ? T_D1D : d1d;
1086  const int Q1D = T_Q1D ? T_Q1D : q1d;
1087  constexpr int NBZ = T_NBZ ? T_NBZ : 1;
1088  constexpr int MQ1 = T_Q1D ? T_Q1D : MAX_Q1D;
1089  constexpr int MD1 = T_D1D ? T_D1D : MAX_D1D;
1090  MFEM_VERIFY(D1D <= MD1, "");
1091  MFEM_VERIFY(Q1D <= MQ1, "");
1092  auto b = Reshape(b_.Read(), Q1D, D1D);
1093  auto g = Reshape(g_.Read(), Q1D, D1D);
1094  auto D = Reshape(d_.Read(), Q1D*Q1D, symmetric ? 3 : 4, NE);
1095  auto x = Reshape(x_.Read(), D1D, D1D, NE);
1096  auto Y = Reshape(y_.ReadWrite(), D1D, D1D, NE);
1097  MFEM_FORALL_2D(e, NE, Q1D, Q1D, NBZ,
1098  {
1099  const int tidz = MFEM_THREAD_ID(z);
1100  const int D1D = T_D1D ? T_D1D : d1d;
1101  const int Q1D = T_Q1D ? T_Q1D : q1d;
1102  constexpr int NBZ = T_NBZ ? T_NBZ : 1;
1103  constexpr int MQ1 = T_Q1D ? T_Q1D : MAX_Q1D;
1104  constexpr int MD1 = T_D1D ? T_D1D : MAX_D1D;
1105  MFEM_SHARED double sBG[2][MQ1*MD1];
1106  double (*B)[MD1] = (double (*)[MD1]) (sBG+0);
1107  double (*G)[MD1] = (double (*)[MD1]) (sBG+1);
1108  double (*Bt)[MQ1] = (double (*)[MQ1]) (sBG+0);
1109  double (*Gt)[MQ1] = (double (*)[MQ1]) (sBG+1);
1110  MFEM_SHARED double Xz[NBZ][MD1][MD1];
1111  MFEM_SHARED double GD[2][NBZ][MD1][MQ1];
1112  MFEM_SHARED double GQ[2][NBZ][MD1][MQ1];
1113  double (*X)[MD1] = (double (*)[MD1])(Xz + tidz);
1114  double (*DQ0)[MD1] = (double (*)[MD1])(GD[0] + tidz);
1115  double (*DQ1)[MD1] = (double (*)[MD1])(GD[1] + tidz);
1116  double (*QQ0)[MD1] = (double (*)[MD1])(GQ[0] + tidz);
1117  double (*QQ1)[MD1] = (double (*)[MD1])(GQ[1] + tidz);
1118  MFEM_FOREACH_THREAD(dy,y,D1D)
1119  {
1120  MFEM_FOREACH_THREAD(dx,x,D1D)
1121  {
1122  X[dy][dx] = x(dx,dy,e);
1123  }
1124  }
1125  if (tidz == 0)
1126  {
1127  MFEM_FOREACH_THREAD(dy,y,D1D)
1128  {
1129  MFEM_FOREACH_THREAD(q,x,Q1D)
1130  {
1131  B[q][dy] = b(q,dy);
1132  G[q][dy] = g(q,dy);
1133  }
1134  }
1135  }
1136  MFEM_SYNC_THREAD;
1137  MFEM_FOREACH_THREAD(dy,y,D1D)
1138  {
1139  MFEM_FOREACH_THREAD(qx,x,Q1D)
1140  {
1141  double u = 0.0;
1142  double v = 0.0;
1143  for (int dx = 0; dx < D1D; ++dx)
1144  {
1145  const double coords = X[dy][dx];
1146  u += B[qx][dx] * coords;
1147  v += G[qx][dx] * coords;
1148  }
1149  DQ0[dy][qx] = u;
1150  DQ1[dy][qx] = v;
1151  }
1152  }
1153  MFEM_SYNC_THREAD;
1154  MFEM_FOREACH_THREAD(qy,y,Q1D)
1155  {
1156  MFEM_FOREACH_THREAD(qx,x,Q1D)
1157  {
1158  double u = 0.0;
1159  double v = 0.0;
1160  for (int dy = 0; dy < D1D; ++dy)
1161  {
1162  u += DQ1[dy][qx] * B[qy][dy];
1163  v += DQ0[dy][qx] * G[qy][dy];
1164  }
1165  QQ0[qy][qx] = u;
1166  QQ1[qy][qx] = v;
1167  }
1168  }
1169  MFEM_SYNC_THREAD;
1170  MFEM_FOREACH_THREAD(qy,y,Q1D)
1171  {
1172  MFEM_FOREACH_THREAD(qx,x,Q1D)
1173  {
1174  const int q = (qx + ((qy) * Q1D));
1175  const double O11 = D(q,0,e);
1176  const double O21 = D(q,1,e);
1177  const double O12 = symmetric ? O21 : D(q,2,e);
1178  const double O22 = symmetric ? D(q,2,e) : D(q,3,e);
1179  const double gX = QQ0[qy][qx];
1180  const double gY = QQ1[qy][qx];
1181  QQ0[qy][qx] = (O11 * gX) + (O12 * gY);
1182  QQ1[qy][qx] = (O21 * gX) + (O22 * gY);
1183  }
1184  }
1185  MFEM_SYNC_THREAD;
1186  if (tidz == 0)
1187  {
1188  MFEM_FOREACH_THREAD(dy,y,D1D)
1189  {
1190  MFEM_FOREACH_THREAD(q,x,Q1D)
1191  {
1192  Bt[dy][q] = b(q,dy);
1193  Gt[dy][q] = g(q,dy);
1194  }
1195  }
1196  }
1197  MFEM_SYNC_THREAD;
1198  MFEM_FOREACH_THREAD(qy,y,Q1D)
1199  {
1200  MFEM_FOREACH_THREAD(dx,x,D1D)
1201  {
1202  double u = 0.0;
1203  double v = 0.0;
1204  for (int qx = 0; qx < Q1D; ++qx)
1205  {
1206  u += Gt[dx][qx] * QQ0[qy][qx];
1207  v += Bt[dx][qx] * QQ1[qy][qx];
1208  }
1209  DQ0[qy][dx] = u;
1210  DQ1[qy][dx] = v;
1211  }
1212  }
1213  MFEM_SYNC_THREAD;
1214  MFEM_FOREACH_THREAD(dy,y,D1D)
1215  {
1216  MFEM_FOREACH_THREAD(dx,x,D1D)
1217  {
1218  double u = 0.0;
1219  double v = 0.0;
1220  for (int qy = 0; qy < Q1D; ++qy)
1221  {
1222  u += DQ0[qy][dx] * Bt[dy][qy];
1223  v += DQ1[qy][dx] * Gt[dy][qy];
1224  }
1225  Y(dx,dy,e) += (u + v);
1226  }
1227  }
1228  });
1229 }
1230 
1231 // PA Diffusion Apply 3D kernel
1232 template<int T_D1D = 0, int T_Q1D = 0>
1233 static void PADiffusionApply3D(const int NE,
1234  const bool symmetric,
1235  const Array<double> &b,
1236  const Array<double> &g,
1237  const Array<double> &bt,
1238  const Array<double> &gt,
1239  const Vector &d_,
1240  const Vector &x_,
1241  Vector &y_,
1242  int d1d = 0, int q1d = 0)
1243 {
1244  const int D1D = T_D1D ? T_D1D : d1d;
1245  const int Q1D = T_Q1D ? T_Q1D : q1d;
1246  MFEM_VERIFY(D1D <= MAX_D1D, "");
1247  MFEM_VERIFY(Q1D <= MAX_Q1D, "");
1248  auto B = Reshape(b.Read(), Q1D, D1D);
1249  auto G = Reshape(g.Read(), Q1D, D1D);
1250  auto Bt = Reshape(bt.Read(), D1D, Q1D);
1251  auto Gt = Reshape(gt.Read(), D1D, Q1D);
1252  auto D = Reshape(d_.Read(), Q1D*Q1D*Q1D, symmetric ? 6 : 9, NE);
1253  auto X = Reshape(x_.Read(), D1D, D1D, D1D, NE);
1254  auto Y = Reshape(y_.ReadWrite(), D1D, D1D, D1D, NE);
1255  MFEM_FORALL(e, NE,
1256  {
1257  const int D1D = T_D1D ? T_D1D : d1d;
1258  const int Q1D = T_Q1D ? T_Q1D : q1d;
1259  constexpr int max_D1D = T_D1D ? T_D1D : MAX_D1D;
1260  constexpr int max_Q1D = T_Q1D ? T_Q1D : MAX_Q1D;
1261  double grad[max_Q1D][max_Q1D][max_Q1D][3];
1262  for (int qz = 0; qz < Q1D; ++qz)
1263  {
1264  for (int qy = 0; qy < Q1D; ++qy)
1265  {
1266  for (int qx = 0; qx < Q1D; ++qx)
1267  {
1268  grad[qz][qy][qx][0] = 0.0;
1269  grad[qz][qy][qx][1] = 0.0;
1270  grad[qz][qy][qx][2] = 0.0;
1271  }
1272  }
1273  }
1274  for (int dz = 0; dz < D1D; ++dz)
1275  {
1276  double gradXY[max_Q1D][max_Q1D][3];
1277  for (int qy = 0; qy < Q1D; ++qy)
1278  {
1279  for (int qx = 0; qx < Q1D; ++qx)
1280  {
1281  gradXY[qy][qx][0] = 0.0;
1282  gradXY[qy][qx][1] = 0.0;
1283  gradXY[qy][qx][2] = 0.0;
1284  }
1285  }
1286  for (int dy = 0; dy < D1D; ++dy)
1287  {
1288  double gradX[max_Q1D][2];
1289  for (int qx = 0; qx < Q1D; ++qx)
1290  {
1291  gradX[qx][0] = 0.0;
1292  gradX[qx][1] = 0.0;
1293  }
1294  for (int dx = 0; dx < D1D; ++dx)
1295  {
1296  const double s = X(dx,dy,dz,e);
1297  for (int qx = 0; qx < Q1D; ++qx)
1298  {
1299  gradX[qx][0] += s * B(qx,dx);
1300  gradX[qx][1] += s * G(qx,dx);
1301  }
1302  }
1303  for (int qy = 0; qy < Q1D; ++qy)
1304  {
1305  const double wy = B(qy,dy);
1306  const double wDy = G(qy,dy);
1307  for (int qx = 0; qx < Q1D; ++qx)
1308  {
1309  const double wx = gradX[qx][0];
1310  const double wDx = gradX[qx][1];
1311  gradXY[qy][qx][0] += wDx * wy;
1312  gradXY[qy][qx][1] += wx * wDy;
1313  gradXY[qy][qx][2] += wx * wy;
1314  }
1315  }
1316  }
1317  for (int qz = 0; qz < Q1D; ++qz)
1318  {
1319  const double wz = B(qz,dz);
1320  const double wDz = G(qz,dz);
1321  for (int qy = 0; qy < Q1D; ++qy)
1322  {
1323  for (int qx = 0; qx < Q1D; ++qx)
1324  {
1325  grad[qz][qy][qx][0] += gradXY[qy][qx][0] * wz;
1326  grad[qz][qy][qx][1] += gradXY[qy][qx][1] * wz;
1327  grad[qz][qy][qx][2] += gradXY[qy][qx][2] * wDz;
1328  }
1329  }
1330  }
1331  }
1332  // Calculate Dxyz, xDyz, xyDz in plane
1333  for (int qz = 0; qz < Q1D; ++qz)
1334  {
1335  for (int qy = 0; qy < Q1D; ++qy)
1336  {
1337  for (int qx = 0; qx < Q1D; ++qx)
1338  {
1339  const int q = qx + (qy + qz * Q1D) * Q1D;
1340  const double O11 = D(q,0,e);
1341  const double O12 = D(q,1,e);
1342  const double O13 = D(q,2,e);
1343  const double O21 = symmetric ? O12 : D(q,3,e);
1344  const double O22 = symmetric ? D(q,3,e) : D(q,4,e);
1345  const double O23 = symmetric ? D(q,4,e) : D(q,5,e);
1346  const double O31 = symmetric ? O13 : D(q,6,e);
1347  const double O32 = symmetric ? O23 : D(q,7,e);
1348  const double O33 = symmetric ? D(q,5,e) : D(q,8,e);
1349  const double gradX = grad[qz][qy][qx][0];
1350  const double gradY = grad[qz][qy][qx][1];
1351  const double gradZ = grad[qz][qy][qx][2];
1352  grad[qz][qy][qx][0] = (O11*gradX)+(O12*gradY)+(O13*gradZ);
1353  grad[qz][qy][qx][1] = (O21*gradX)+(O22*gradY)+(O23*gradZ);
1354  grad[qz][qy][qx][2] = (O31*gradX)+(O32*gradY)+(O33*gradZ);
1355  }
1356  }
1357  }
1358  for (int qz = 0; qz < Q1D; ++qz)
1359  {
1360  double gradXY[max_D1D][max_D1D][3];
1361  for (int dy = 0; dy < D1D; ++dy)
1362  {
1363  for (int dx = 0; dx < D1D; ++dx)
1364  {
1365  gradXY[dy][dx][0] = 0;
1366  gradXY[dy][dx][1] = 0;
1367  gradXY[dy][dx][2] = 0;
1368  }
1369  }
1370  for (int qy = 0; qy < Q1D; ++qy)
1371  {
1372  double gradX[max_D1D][3];
1373  for (int dx = 0; dx < D1D; ++dx)
1374  {
1375  gradX[dx][0] = 0;
1376  gradX[dx][1] = 0;
1377  gradX[dx][2] = 0;
1378  }
1379  for (int qx = 0; qx < Q1D; ++qx)
1380  {
1381  const double gX = grad[qz][qy][qx][0];
1382  const double gY = grad[qz][qy][qx][1];
1383  const double gZ = grad[qz][qy][qx][2];
1384  for (int dx = 0; dx < D1D; ++dx)
1385  {
1386  const double wx = Bt(dx,qx);
1387  const double wDx = Gt(dx,qx);
1388  gradX[dx][0] += gX * wDx;
1389  gradX[dx][1] += gY * wx;
1390  gradX[dx][2] += gZ * wx;
1391  }
1392  }
1393  for (int dy = 0; dy < D1D; ++dy)
1394  {
1395  const double wy = Bt(dy,qy);
1396  const double wDy = Gt(dy,qy);
1397  for (int dx = 0; dx < D1D; ++dx)
1398  {
1399  gradXY[dy][dx][0] += gradX[dx][0] * wy;
1400  gradXY[dy][dx][1] += gradX[dx][1] * wDy;
1401  gradXY[dy][dx][2] += gradX[dx][2] * wy;
1402  }
1403  }
1404  }
1405  for (int dz = 0; dz < D1D; ++dz)
1406  {
1407  const double wz = Bt(dz,qz);
1408  const double wDz = Gt(dz,qz);
1409  for (int dy = 0; dy < D1D; ++dy)
1410  {
1411  for (int dx = 0; dx < D1D; ++dx)
1412  {
1413  Y(dx,dy,dz,e) +=
1414  ((gradXY[dy][dx][0] * wz) +
1415  (gradXY[dy][dx][1] * wz) +
1416  (gradXY[dy][dx][2] * wDz));
1417  }
1418  }
1419  }
1420  }
1421  });
1422 }
1423 
1424 template<int T_D1D = 0, int T_Q1D = 0>
1425 static void SmemPADiffusionApply3D(const int NE,
1426  const bool symmetric,
1427  const Array<double> &b_,
1428  const Array<double> &g_,
1429  const Vector &d_,
1430  const Vector &x_,
1431  Vector &y_,
1432  const int d1d = 0,
1433  const int q1d = 0)
1434 {
1435  const int D1D = T_D1D ? T_D1D : d1d;
1436  const int Q1D = T_Q1D ? T_Q1D : q1d;
1437  constexpr int M1Q = T_Q1D ? T_Q1D : MAX_Q1D;
1438  constexpr int M1D = T_D1D ? T_D1D : MAX_D1D;
1439  MFEM_VERIFY(D1D <= M1D, "");
1440  MFEM_VERIFY(Q1D <= M1Q, "");
1441  auto b = Reshape(b_.Read(), Q1D, D1D);
1442  auto g = Reshape(g_.Read(), Q1D, D1D);
1443  auto d = Reshape(d_.Read(), Q1D, Q1D, Q1D, symmetric ? 6 : 9, NE);
1444  auto x = Reshape(x_.Read(), D1D, D1D, D1D, NE);
1445  auto y = Reshape(y_.ReadWrite(), D1D, D1D, D1D, NE);
1446  MFEM_FORALL_3D(e, NE, Q1D, Q1D, Q1D,
1447  {
1448  const int D1D = T_D1D ? T_D1D : d1d;
1449  const int Q1D = T_Q1D ? T_Q1D : q1d;
1450  constexpr int MQ1 = T_Q1D ? T_Q1D : MAX_Q1D;
1451  constexpr int MD1 = T_D1D ? T_D1D : MAX_D1D;
1452  constexpr int MDQ = (MQ1 > MD1) ? MQ1 : MD1;
1453  MFEM_SHARED double sBG[2][MQ1*MD1];
1454  double (*B)[MD1] = (double (*)[MD1]) (sBG+0);
1455  double (*G)[MD1] = (double (*)[MD1]) (sBG+1);
1456  double (*Bt)[MQ1] = (double (*)[MQ1]) (sBG+0);
1457  double (*Gt)[MQ1] = (double (*)[MQ1]) (sBG+1);
1458  MFEM_SHARED double sm0[3][MDQ*MDQ*MDQ];
1459  MFEM_SHARED double sm1[3][MDQ*MDQ*MDQ];
1460  double (*X)[MD1][MD1] = (double (*)[MD1][MD1]) (sm0+2);
1461  double (*DDQ0)[MD1][MQ1] = (double (*)[MD1][MQ1]) (sm0+0);
1462  double (*DDQ1)[MD1][MQ1] = (double (*)[MD1][MQ1]) (sm0+1);
1463  double (*DQQ0)[MQ1][MQ1] = (double (*)[MQ1][MQ1]) (sm1+0);
1464  double (*DQQ1)[MQ1][MQ1] = (double (*)[MQ1][MQ1]) (sm1+1);
1465  double (*DQQ2)[MQ1][MQ1] = (double (*)[MQ1][MQ1]) (sm1+2);
1466  double (*QQQ0)[MQ1][MQ1] = (double (*)[MQ1][MQ1]) (sm0+0);
1467  double (*QQQ1)[MQ1][MQ1] = (double (*)[MQ1][MQ1]) (sm0+1);
1468  double (*QQQ2)[MQ1][MQ1] = (double (*)[MQ1][MQ1]) (sm0+2);
1469  double (*QQD0)[MQ1][MD1] = (double (*)[MQ1][MD1]) (sm1+0);
1470  double (*QQD1)[MQ1][MD1] = (double (*)[MQ1][MD1]) (sm1+1);
1471  double (*QQD2)[MQ1][MD1] = (double (*)[MQ1][MD1]) (sm1+2);
1472  double (*QDD0)[MD1][MD1] = (double (*)[MD1][MD1]) (sm0+0);
1473  double (*QDD1)[MD1][MD1] = (double (*)[MD1][MD1]) (sm0+1);
1474  double (*QDD2)[MD1][MD1] = (double (*)[MD1][MD1]) (sm0+2);
1475  MFEM_FOREACH_THREAD(dz,z,D1D)
1476  {
1477  MFEM_FOREACH_THREAD(dy,y,D1D)
1478  {
1479  MFEM_FOREACH_THREAD(dx,x,D1D)
1480  {
1481  X[dz][dy][dx] = x(dx,dy,dz,e);
1482  }
1483  }
1484  }
1485  if (MFEM_THREAD_ID(z) == 0)
1486  {
1487  MFEM_FOREACH_THREAD(dy,y,D1D)
1488  {
1489  MFEM_FOREACH_THREAD(qx,x,Q1D)
1490  {
1491  B[qx][dy] = b(qx,dy);
1492  G[qx][dy] = g(qx,dy);
1493  }
1494  }
1495  }
1496  MFEM_SYNC_THREAD;
1497  MFEM_FOREACH_THREAD(dz,z,D1D)
1498  {
1499  MFEM_FOREACH_THREAD(dy,y,D1D)
1500  {
1501  MFEM_FOREACH_THREAD(qx,x,Q1D)
1502  {
1503  double u = 0.0, v = 0.0;
1504  MFEM_UNROLL(MD1)
1505  for (int dx = 0; dx < D1D; ++dx)
1506  {
1507  const double coords = X[dz][dy][dx];
1508  u += coords * B[qx][dx];
1509  v += coords * G[qx][dx];
1510  }
1511  DDQ0[dz][dy][qx] = u;
1512  DDQ1[dz][dy][qx] = v;
1513  }
1514  }
1515  }
1516  MFEM_SYNC_THREAD;
1517  MFEM_FOREACH_THREAD(dz,z,D1D)
1518  {
1519  MFEM_FOREACH_THREAD(qy,y,Q1D)
1520  {
1521  MFEM_FOREACH_THREAD(qx,x,Q1D)
1522  {
1523  double u = 0.0, v = 0.0, w = 0.0;
1524  MFEM_UNROLL(MD1)
1525  for (int dy = 0; dy < D1D; ++dy)
1526  {
1527  u += DDQ1[dz][dy][qx] * B[qy][dy];
1528  v += DDQ0[dz][dy][qx] * G[qy][dy];
1529  w += DDQ0[dz][dy][qx] * B[qy][dy];
1530  }
1531  DQQ0[dz][qy][qx] = u;
1532  DQQ1[dz][qy][qx] = v;
1533  DQQ2[dz][qy][qx] = w;
1534  }
1535  }
1536  }
1537  MFEM_SYNC_THREAD;
1538  MFEM_FOREACH_THREAD(qz,z,Q1D)
1539  {
1540  MFEM_FOREACH_THREAD(qy,y,Q1D)
1541  {
1542  MFEM_FOREACH_THREAD(qx,x,Q1D)
1543  {
1544  double u = 0.0, v = 0.0, w = 0.0;
1545  MFEM_UNROLL(MD1)
1546  for (int dz = 0; dz < D1D; ++dz)
1547  {
1548  u += DQQ0[dz][qy][qx] * B[qz][dz];
1549  v += DQQ1[dz][qy][qx] * B[qz][dz];
1550  w += DQQ2[dz][qy][qx] * G[qz][dz];
1551  }
1552  const double O11 = d(qx,qy,qz,0,e);
1553  const double O12 = d(qx,qy,qz,1,e);
1554  const double O13 = d(qx,qy,qz,2,e);
1555  const double O21 = symmetric ? O12 : d(qx,qy,qz,3,e);
1556  const double O22 = symmetric ? d(qx,qy,qz,3,e) : d(qx,qy,qz,4,e);
1557  const double O23 = symmetric ? d(qx,qy,qz,4,e) : d(qx,qy,qz,5,e);
1558  const double O31 = symmetric ? O13 : d(qx,qy,qz,6,e);
1559  const double O32 = symmetric ? O23 : d(qx,qy,qz,7,e);
1560  const double O33 = symmetric ? d(qx,qy,qz,5,e) : d(qx,qy,qz,8,e);
1561  const double gX = u;
1562  const double gY = v;
1563  const double gZ = w;
1564  QQQ0[qz][qy][qx] = (O11*gX) + (O12*gY) + (O13*gZ);
1565  QQQ1[qz][qy][qx] = (O21*gX) + (O22*gY) + (O23*gZ);
1566  QQQ2[qz][qy][qx] = (O31*gX) + (O32*gY) + (O33*gZ);
1567  }
1568  }
1569  }
1570  MFEM_SYNC_THREAD;
1571  if (MFEM_THREAD_ID(z) == 0)
1572  {
1573  MFEM_FOREACH_THREAD(dy,y,D1D)
1574  {
1575  MFEM_FOREACH_THREAD(qx,x,Q1D)
1576  {
1577  Bt[dy][qx] = b(qx,dy);
1578  Gt[dy][qx] = g(qx,dy);
1579  }
1580  }
1581  }
1582  MFEM_SYNC_THREAD;
1583  MFEM_FOREACH_THREAD(qz,z,Q1D)
1584  {
1585  MFEM_FOREACH_THREAD(qy,y,Q1D)
1586  {
1587  MFEM_FOREACH_THREAD(dx,x,D1D)
1588  {
1589  double u = 0.0, v = 0.0, w = 0.0;
1590  MFEM_UNROLL(MQ1)
1591  for (int qx = 0; qx < Q1D; ++qx)
1592  {
1593  u += QQQ0[qz][qy][qx] * Gt[dx][qx];
1594  v += QQQ1[qz][qy][qx] * Bt[dx][qx];
1595  w += QQQ2[qz][qy][qx] * Bt[dx][qx];
1596  }
1597  QQD0[qz][qy][dx] = u;
1598  QQD1[qz][qy][dx] = v;
1599  QQD2[qz][qy][dx] = w;
1600  }
1601  }
1602  }
1603  MFEM_SYNC_THREAD;
1604  MFEM_FOREACH_THREAD(qz,z,Q1D)
1605  {
1606  MFEM_FOREACH_THREAD(dy,y,D1D)
1607  {
1608  MFEM_FOREACH_THREAD(dx,x,D1D)
1609  {
1610  double u = 0.0, v = 0.0, w = 0.0;
1611  MFEM_UNROLL(Q1D)
1612  for (int qy = 0; qy < Q1D; ++qy)
1613  {
1614  u += QQD0[qz][qy][dx] * Bt[dy][qy];
1615  v += QQD1[qz][qy][dx] * Gt[dy][qy];
1616  w += QQD2[qz][qy][dx] * Bt[dy][qy];
1617  }
1618  QDD0[qz][dy][dx] = u;
1619  QDD1[qz][dy][dx] = v;
1620  QDD2[qz][dy][dx] = w;
1621  }
1622  }
1623  }
1624  MFEM_SYNC_THREAD;
1625  MFEM_FOREACH_THREAD(dz,z,D1D)
1626  {
1627  MFEM_FOREACH_THREAD(dy,y,D1D)
1628  {
1629  MFEM_FOREACH_THREAD(dx,x,D1D)
1630  {
1631  double u = 0.0, v = 0.0, w = 0.0;
1632  MFEM_UNROLL(MQ1)
1633  for (int qz = 0; qz < Q1D; ++qz)
1634  {
1635  u += QDD0[qz][dy][dx] * Bt[dz][qz];
1636  v += QDD1[qz][dy][dx] * Bt[dz][qz];
1637  w += QDD2[qz][dy][dx] * Gt[dz][qz];
1638  }
1639  y(dx,dy,dz,e) += (u + v + w);
1640  }
1641  }
1642  }
1643  });
1644 }
1645 
1646 static void PADiffusionApply(const int dim,
1647  const int D1D,
1648  const int Q1D,
1649  const int NE,
1650  const bool symm,
1651  const Array<double> &B,
1652  const Array<double> &G,
1653  const Array<double> &Bt,
1654  const Array<double> &Gt,
1655  const Vector &D,
1656  const Vector &X,
1657  Vector &Y)
1658 {
1659 #ifdef MFEM_USE_OCCA
1660  if (DeviceCanUseOcca())
1661  {
1662  if (dim == 2)
1663  {
1664  OccaPADiffusionApply2D(D1D,Q1D,NE,B,G,Bt,Gt,D,X,Y);
1665  return;
1666  }
1667  if (dim == 3)
1668  {
1669  OccaPADiffusionApply3D(D1D,Q1D,NE,B,G,Bt,Gt,D,X,Y);
1670  return;
1671  }
1672  MFEM_ABORT("OCCA PADiffusionApply unknown kernel!");
1673  }
1674 #endif // MFEM_USE_OCCA
1675  const int id = (D1D << 4) | Q1D;
1676 
1677  if (dim == 2)
1678  {
1679  switch (id)
1680  {
1681  case 0x22: return SmemPADiffusionApply2D<2,2,16>(NE,symm,B,G,D,X,Y);
1682  case 0x33: return SmemPADiffusionApply2D<3,3,16>(NE,symm,B,G,D,X,Y);
1683  case 0x44: return SmemPADiffusionApply2D<4,4,8>(NE,symm,B,G,D,X,Y);
1684  case 0x55: return SmemPADiffusionApply2D<5,5,8>(NE,symm,B,G,D,X,Y);
1685  case 0x66: return SmemPADiffusionApply2D<6,6,4>(NE,symm,B,G,D,X,Y);
1686  case 0x77: return SmemPADiffusionApply2D<7,7,4>(NE,symm,B,G,D,X,Y);
1687  case 0x88: return SmemPADiffusionApply2D<8,8,2>(NE,symm,B,G,D,X,Y);
1688  case 0x99: return SmemPADiffusionApply2D<9,9,2>(NE,symm,B,G,D,X,Y);
1689  default: return PADiffusionApply2D(NE,symm,B,G,Bt,Gt,D,X,Y,D1D,Q1D);
1690  }
1691  }
1692 
1693  if (dim == 3)
1694  {
1695  switch (id)
1696  {
1697  case 0x22: return SmemPADiffusionApply3D<2,2>(NE,symm,B,G,D,X,Y);
1698  case 0x23: return SmemPADiffusionApply3D<2,3>(NE,symm,B,G,D,X,Y);
1699  case 0x34: return SmemPADiffusionApply3D<3,4>(NE,symm,B,G,D,X,Y);
1700  case 0x45: return SmemPADiffusionApply3D<4,5>(NE,symm,B,G,D,X,Y);
1701  case 0x46: return SmemPADiffusionApply3D<4,6>(NE,symm,B,G,D,X,Y);
1702  case 0x56: return SmemPADiffusionApply3D<5,6>(NE,symm,B,G,D,X,Y);
1703  case 0x58: return SmemPADiffusionApply3D<5,8>(NE,symm,B,G,D,X,Y);
1704  case 0x67: return SmemPADiffusionApply3D<6,7>(NE,symm,B,G,D,X,Y);
1705  case 0x78: return SmemPADiffusionApply3D<7,8>(NE,symm,B,G,D,X,Y);
1706  case 0x89: return SmemPADiffusionApply3D<8,9>(NE,symm,B,G,D,X,Y);
1707  default: return PADiffusionApply3D(NE,symm,B,G,Bt,Gt,D,X,Y,D1D,Q1D);
1708  }
1709  }
1710  MFEM_ABORT("Unknown kernel: 0x"<<std::hex << id << std::dec);
1711 }
1712 
1713 // PA Diffusion Apply kernel
1714 void DiffusionIntegrator::AddMultPA(const Vector &x, Vector &y) const
1715 {
1716  if (DeviceCanUseCeed())
1717  {
1718  ceedOp->AddMult(x, y);
1719  }
1720  else
1721  {
1722  PADiffusionApply(dim, dofs1D, quad1D, ne, symmetric,
1723  maps->B, maps->G, maps->Bt, maps->Gt,
1724  pa_data, x, y);
1725  }
1726 }
1727 
1728 void DiffusionIntegrator::AddMultTransposePA(const Vector &x, Vector &y) const
1729 {
1730  if (symmetric)
1731  {
1732  AddMultPA(x, y);
1733  }
1734  else
1735  {
1736  MFEM_ABORT("DiffusionIntegrator::AddMultTransposePA only implemented in "
1737  "the symmetric case.")
1738  }
1739 }
1740 
1741 } // namespace mfem
const T * Read(bool on_dev=true) const
Shortcut for mfem::Read(a.GetMemory(), a.Size(), on_dev).
Definition: array.hpp:307
Abstract class for all finite elements.
Definition: fe_base.hpp:232
int GetNPoints() const
Returns the number of the points in the integration rule.
Definition: intrules.hpp:247
Class for an integration rule - an Array of IntegrationPoint.
Definition: intrules.hpp:90
bool IsVariableOrder() const
Returns true if the space contains elements of varying polynomial orders.
Definition: fespace.hpp:463
Memory< T > & GetMemory()
Return a reference to the Memory object used by the Array.
Definition: array.hpp:120
const GeometricFactors * GetGeometricFactors(const IntegrationRule &ir, const int flags, MemoryType d_mt=MemoryType::DEFAULT)
Return the mesh geometric factors corresponding to the given integration rule.
Definition: mesh.cpp:840
int Dimension() const
Definition: mesh.hpp:1047
occa::device & OccaDev()
Return the default occa::device used by MFEM.
Definition: occa.cpp:27
int Size() const
Returns the size of the vector.
Definition: vector.hpp:199
int ndof
Number of degrees of freedom = number of basis functions. When mode is TENSOR, this is the 1D number...
Definition: fe_base.hpp:168
virtual const double * Read(bool on_dev=true) const
Shortcut for mfem::Read(vec.GetMemory(), vec.Size(), on_dev).
Definition: vector.hpp:448
constexpr int DIM
STL namespace.
void SetConstant(double constant)
Set this vector to the given constant.
std::map< occa_id_t, occa::kernel > occa_kernel_t
Definition: occa.hpp:79
Memory< double > & GetMemory()
Return a reference to the Memory object used by the Vector.
Definition: vector.hpp:230
Class to represent a coefficient evaluated at quadrature points.
virtual const FiniteElement * GetFE(int i) const
Returns pointer to the FiniteElement in the FiniteElementCollection associated with i&#39;th element in t...
Definition: fespace.cpp:2783
int GetNumGeometries(int dim) const
Return the number of geometries of the given dimension present in the mesh.
Definition: mesh.cpp:5921
occa::memory OccaMemoryReadWrite(Memory< T > &mem, size_t size)
Wrap a Memory object as occa::memory for read-write access with the mfem::Device MemoryClass. The returned occa::memory is associated with the default occa::device used by MFEM.
Definition: occa.hpp:59
void Project(Coefficient &coeff)
Evaluate the given Coefficient at the quadrature points defined by qs.
virtual const DofToQuad & GetDofToQuad(const IntegrationRule &ir, DofToQuad::Mode mode) const
Return a DofToQuad structure corresponding to the given IntegrationRule using the given DofToQuad::Mo...
Definition: fe_base.cpp:364
const Array< double > & GetWeights() const
Return the quadrature weights in a contiguous array.
Definition: intrules.cpp:83
const int MAX_Q1D
Definition: forall.hpp:29
bool DeviceCanUseOcca()
Function that determines if an OCCA kernel should be used, based on the current mfem::Device configur...
Definition: occa.hpp:69
virtual double * Write(bool on_dev=true)
Shortcut for mfem::Write(vec.GetMemory(), vec.Size(), on_dev).
Definition: vector.hpp:456
double b
Definition: lissajous.cpp:42
const occa::memory OccaMemoryRead(const Memory< T > &mem, size_t size)
Wrap a Memory object as occa::memory for read only access with the mfem::Device MemoryClass. The returned occa::memory is associated with the default occa::device used by MFEM.
Definition: occa.hpp:37
int GetNE() const
Returns number of elements in the mesh.
Definition: fespace.hpp:614
void PADiffusionSetup2D< 3 >(const int Q1D, const int coeffDim, const int NE, const Array< double > &w, const Vector &j, const Vector &c, Vector &d)
void PADiffusionSetup2D< 2 >(const int Q1D, const int coeffDim, const int NE, const Array< double > &w, const Vector &j, const Vector &c, Vector &d)
int GetVDim() const
Return the number of values per quadrature point.
Mesh * GetMesh() const
Returns the mesh.
Definition: fespace.hpp:441
Represent a DiffusionIntegrator with AssemblyLevel::Partial using libCEED.
Definition: diffusion.hpp:26
int GetDim() const
Returns the reference space dimension for the finite element.
Definition: fe_base.hpp:310
Class FiniteElementSpace - responsible for providing FEM view of the mesh, mainly managing the set of...
Definition: fespace.hpp:96
occa::memory OccaMemoryWrite(Memory< T > &mem, size_t size)
Wrap a Memory object as occa::memory for write only access with the mfem::Device MemoryClass. The returned occa::memory is associated with the default occa::device used by MFEM.
Definition: occa.hpp:48
MemoryType
Memory types supported by MFEM.
Definition: mem_manager.hpp:31
bool DeviceCanUseCeed()
Function that determines if a CEED kernel should be used, based on the current mfem::Device configura...
Definition: util.cpp:33
int SpaceDimension() const
Definition: mesh.hpp:1048
int GetNE() const
Returns number of elements.
Definition: mesh.hpp:936
const IntegrationRule & GetRule(const Integrator &integ, const FiniteElement &trial_fe, const FiniteElement &test_fe, ElementTransformation &Trans)
void PADiffusionSetup3D(const int Q1D, const int coeffDim, const int NE, const Array< double > &w, const Vector &j, const Vector &c, Vector &d)
virtual double * ReadWrite(bool on_dev=true)
Shortcut for mfem::ReadWrite(vec.GetMemory(), vec.Size(), on_dev).
Definition: vector.hpp:464
void ProjectTranspose(MatrixCoefficient &coeff)
Project the transpose of coeff.
int dim
Definition: ex24.cpp:53
const int MAX_D1D
Definition: forall.hpp:28
constexpr int SDIM
int Size() const
Return the logical size of the array.
Definition: array.hpp:141
const double alpha
Definition: ex15.cpp:369
Vector data type.
Definition: vector.hpp:60
std::pair< int, int > occa_id_t
Definition: occa.hpp:78
RefCoord s[3]
Class representing the storage layout of a QuadratureFunction.
Definition: qspace.hpp:92
double u(const Vector &xvec)
Definition: lor_mms.hpp:24
MFEM_HOST_DEVICE DeviceTensor< sizeof...(Dims), T > Reshape(T *ptr, Dims... dims)
Wrap a pointer as a DeviceTensor with automatically deduced template parameters.
Definition: dtensor.hpp:131