MFEM  v4.2.0
Finite element discretization library
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Pages
bilininteg_diffusion_pa.cpp
Go to the documentation of this file.
1 // Copyright (c) 2010-2020, Lawrence Livermore National Security, LLC. Produced
2 // at the Lawrence Livermore National Laboratory. All Rights reserved. See files
3 // LICENSE and NOTICE for details. LLNL-CODE-806117.
4 //
5 // This file is part of the MFEM library. For more information and source code
6 // availability visit https://mfem.org.
7 //
8 // MFEM is free software; you can redistribute it and/or modify it under the
9 // terms of the BSD-3 license. We welcome feedback and contributions, see file
10 // CONTRIBUTING.md for details.
11 
12 #include "../general/forall.hpp"
13 #include "bilininteg.hpp"
14 #include "gridfunc.hpp"
15 #include "libceed/diffusion.hpp"
16 
17 using namespace std;
18 
19 namespace mfem
20 {
21 
22 // PA Diffusion Integrator
23 
24 // OCCA 2D Assemble kernel
25 #ifdef MFEM_USE_OCCA
26 static void OccaPADiffusionSetup2D(const int D1D,
27  const int Q1D,
28  const int NE,
29  const Array<double> &W,
30  const Vector &J,
31  const Vector &C,
32  Vector &op)
33 {
34  occa::properties props;
35  props["defines/D1D"] = D1D;
36  props["defines/Q1D"] = Q1D;
37  const occa::memory o_W = OccaMemoryRead(W.GetMemory(), W.Size());
38  const occa::memory o_J = OccaMemoryRead(J.GetMemory(), J.Size());
39  const occa::memory o_C = OccaMemoryRead(C.GetMemory(), C.Size());
40  occa::memory o_op = OccaMemoryWrite(op.GetMemory(), op.Size());
41  const bool const_c = C.Size() == 1;
42  const occa_id_t id = std::make_pair(D1D,Q1D);
43  static occa_kernel_t OccaDiffSetup2D_ker;
44  if (OccaDiffSetup2D_ker.find(id) == OccaDiffSetup2D_ker.end())
45  {
46  const occa::kernel DiffusionSetup2D =
47  mfem::OccaDev().buildKernel("occa://mfem/fem/occa.okl",
48  "DiffusionSetup2D", props);
49  OccaDiffSetup2D_ker.emplace(id, DiffusionSetup2D);
50  }
51  OccaDiffSetup2D_ker.at(id)(NE, o_W, o_J, o_C, o_op, const_c);
52 }
53 
54 static void OccaPADiffusionSetup3D(const int D1D,
55  const int Q1D,
56  const int NE,
57  const Array<double> &W,
58  const Vector &J,
59  const Vector &C,
60  Vector &op)
61 {
62  occa::properties props;
63  props["defines/D1D"] = D1D;
64  props["defines/Q1D"] = Q1D;
65  const occa::memory o_W = OccaMemoryRead(W.GetMemory(), W.Size());
66  const occa::memory o_J = OccaMemoryRead(J.GetMemory(), J.Size());
67  const occa::memory o_C = OccaMemoryRead(C.GetMemory(), C.Size());
68  occa::memory o_op = OccaMemoryWrite(op.GetMemory(), op.Size());
69  const bool const_c = C.Size() == 1;
70  const occa_id_t id = std::make_pair(D1D,Q1D);
71  static occa_kernel_t OccaDiffSetup3D_ker;
72  if (OccaDiffSetup3D_ker.find(id) == OccaDiffSetup3D_ker.end())
73  {
74  const occa::kernel DiffusionSetup3D =
75  mfem::OccaDev().buildKernel("occa://mfem/fem/occa.okl",
76  "DiffusionSetup3D", props);
77  OccaDiffSetup3D_ker.emplace(id, DiffusionSetup3D);
78  }
79  OccaDiffSetup3D_ker.at(id)(NE, o_W, o_J, o_C, o_op, const_c);
80 }
81 #endif // MFEM_USE_OCCA
82 
83 template<>
84 void PADiffusionSetup2D<2>(const int Q1D,
85  const int coeffDim,
86  const int NE,
87  const Array<double> &w,
88  const Vector &j,
89  const Vector &c,
90  Vector &d)
91 {
92  const bool symmetric = (coeffDim != 4);
93  const bool const_c = c.Size() == 1;
94  MFEM_VERIFY(coeffDim < 3 ||
95  !const_c, "Constant matrix coefficient not supported");
96  const auto W = Reshape(w.Read(), Q1D,Q1D);
97  const auto J = Reshape(j.Read(), Q1D,Q1D,2,2,NE);
98  const auto C = const_c ? Reshape(c.Read(), 1,1,1,1) :
99  Reshape(c.Read(), coeffDim,Q1D,Q1D,NE);
100  auto D = Reshape(d.Write(), Q1D,Q1D, symmetric ? 3 : 4, NE);
101  MFEM_FORALL_2D(e, NE, Q1D,Q1D,1,
102  {
103  MFEM_FOREACH_THREAD(qx,x,Q1D)
104  {
105  MFEM_FOREACH_THREAD(qy,y,Q1D)
106  {
107  const double J11 = J(qx,qy,0,0,e);
108  const double J21 = J(qx,qy,1,0,e);
109  const double J12 = J(qx,qy,0,1,e);
110  const double J22 = J(qx,qy,1,1,e);
111  const double w_detJ = W(qx,qy) / ((J11*J22)-(J21*J12));
112  if (coeffDim == 3 || coeffDim == 4) // Matrix coefficient
113  {
114  // First compute entries of R = MJ^{-T}, without det J factor.
115  const double M11 = C(0,qx,qy,e);
116  const double M12 = C(1,qx,qy,e);
117  const double M21 = symmetric ? M12 : C(2,qx,qy,e);
118  const double M22 = symmetric ? C(2,qx,qy,e) : C(3,qx,qy,e);
119  const double R11 = M11*J22 - M12*J12;
120  const double R21 = M21*J22 - M22*J12;
121  const double R12 = -M11*J21 + M12*J11;
122  const double R22 = -M21*J21 + M22*J11;
123 
124  // Now set y to J^{-1}R.
125  D(qx,qy,0,e) = w_detJ * ( J22*R11 - J12*R21); // 1,1
126  D(qx,qy,1,e) = w_detJ * (-J21*R11 + J11*R21); // 2,1
127  D(qx,qy,2,e) = w_detJ * (symmetric ? (-J21*R12 + J11*R22) :
128  (J22*R12 - J12*R22)); // 2,2 or 1,2
129  if (!symmetric)
130  {
131  D(qx,qy,3,e) = w_detJ * (-J21*R12 + J11*R22); // 2,2
132  }
133  }
134  else // Vector or scalar coefficient
135  {
136  const double C1 = const_c ? C(0,0,0,0) : C(0,qx,qy,e);
137  const double C2 = const_c ? C(0,0,0,0) :
138  (coeffDim == 2 ? C(1,qx,qy,e) : C(0,qx,qy,e));
139 
140  D(qx,qy,0,e) = w_detJ * (C2*J12*J12 + C1*J22*J22); // 1,1
141  D(qx,qy,1,e) = -w_detJ * (C2*J12*J11 + C1*J22*J21); // 1,2
142  D(qx,qy,2,e) = w_detJ * (C2*J11*J11 + C1*J21*J21); // 2,2
143  }
144  }
145  }
146  });
147 }
148 
149 // PA Diffusion Assemble 2D kernel with 3D node coords
150 template<>
151 void PADiffusionSetup2D<3>(const int Q1D,
152  const int coeffDim,
153  const int NE,
154  const Array<double> &w,
155  const Vector &j,
156  const Vector &c,
157  Vector &d)
158 {
159  MFEM_VERIFY(coeffDim == 1, "Matrix and vector coefficients not supported");
160  constexpr int DIM = 2;
161  constexpr int SDIM = 3;
162  const bool const_c = c.Size() == 1;
163  const auto W = Reshape(w.Read(), Q1D,Q1D);
164  const auto J = Reshape(j.Read(), Q1D,Q1D,SDIM,DIM,NE);
165  const auto C = const_c ? Reshape(c.Read(), 1,1,1) :
166  Reshape(c.Read(), Q1D,Q1D,NE);
167  auto D = Reshape(d.Write(), Q1D,Q1D, 3, NE);
168  MFEM_FORALL_2D(e, NE, Q1D,Q1D,1,
169  {
170  MFEM_FOREACH_THREAD(qx,x,Q1D)
171  {
172  MFEM_FOREACH_THREAD(qy,y,Q1D)
173  {
174  const double wq = W(qx,qy);
175  const double J11 = J(qx,qy,0,0,e);
176  const double J21 = J(qx,qy,1,0,e);
177  const double J31 = J(qx,qy,2,0,e);
178  const double J12 = J(qx,qy,0,1,e);
179  const double J22 = J(qx,qy,1,1,e);
180  const double J32 = J(qx,qy,2,1,e);
181  const double E = J11*J11 + J21*J21 + J31*J31;
182  const double G = J12*J12 + J22*J22 + J32*J32;
183  const double F = J11*J12 + J21*J22 + J31*J32;
184  const double iw = 1.0 / sqrt(E*G - F*F);
185  const double coeff = const_c ? C(0,0,0) : C(qx,qy,e);
186  const double alpha = wq * coeff * iw;
187  D(qx,qy,0,e) = alpha * G; // 1,1
188  D(qx,qy,1,e) = -alpha * F; // 1,2
189  D(qx,qy,2,e) = alpha * E; // 2,2
190  }
191  }
192  });
193 }
194 
195 // PA Diffusion Assemble 3D kernel
196 void PADiffusionSetup3D(const int Q1D,
197  const int coeffDim,
198  const int NE,
199  const Array<double> &w,
200  const Vector &j,
201  const Vector &c,
202  Vector &d)
203 {
204  const bool symmetric = (coeffDim != 9);
205  const bool const_c = c.Size() == 1;
206  MFEM_VERIFY(coeffDim < 6 ||
207  !const_c, "Constant matrix coefficient not supported");
208  const auto W = Reshape(w.Read(), Q1D,Q1D,Q1D);
209  const auto J = Reshape(j.Read(), Q1D,Q1D,Q1D,3,3,NE);
210  const auto C = const_c ? Reshape(c.Read(), 1,1,1,1,1) :
211  Reshape(c.Read(), coeffDim,Q1D,Q1D,Q1D,NE);
212  auto D = Reshape(d.Write(), Q1D,Q1D,Q1D, symmetric ? 6 : 9, NE);
213  MFEM_FORALL_3D(e, NE, Q1D, Q1D, Q1D,
214  {
215  MFEM_FOREACH_THREAD(qx,x,Q1D)
216  {
217  MFEM_FOREACH_THREAD(qy,y,Q1D)
218  {
219  MFEM_FOREACH_THREAD(qz,z,Q1D)
220  {
221  const double J11 = J(qx,qy,qz,0,0,e);
222  const double J21 = J(qx,qy,qz,1,0,e);
223  const double J31 = J(qx,qy,qz,2,0,e);
224  const double J12 = J(qx,qy,qz,0,1,e);
225  const double J22 = J(qx,qy,qz,1,1,e);
226  const double J32 = J(qx,qy,qz,2,1,e);
227  const double J13 = J(qx,qy,qz,0,2,e);
228  const double J23 = J(qx,qy,qz,1,2,e);
229  const double J33 = J(qx,qy,qz,2,2,e);
230  const double detJ = J11 * (J22 * J33 - J32 * J23) -
231  /* */ J21 * (J12 * J33 - J32 * J13) +
232  /* */ J31 * (J12 * J23 - J22 * J13);
233  const double w_detJ = W(qx,qy,qz) / detJ;
234  // adj(J)
235  const double A11 = (J22 * J33) - (J23 * J32);
236  const double A12 = (J32 * J13) - (J12 * J33);
237  const double A13 = (J12 * J23) - (J22 * J13);
238  const double A21 = (J31 * J23) - (J21 * J33);
239  const double A22 = (J11 * J33) - (J13 * J31);
240  const double A23 = (J21 * J13) - (J11 * J23);
241  const double A31 = (J21 * J32) - (J31 * J22);
242  const double A32 = (J31 * J12) - (J11 * J32);
243  const double A33 = (J11 * J22) - (J12 * J21);
244 
245  if (coeffDim == 6 || coeffDim == 9) // Matrix coefficient version
246  {
247  // Compute entries of R = MJ^{-T} = M adj(J)^T, without det J.
248  const double M11 = C(0, qx,qy,qz, e);
249  const double M12 = C(1, qx,qy,qz, e);
250  const double M13 = C(2, qx,qy,qz, e);
251  const double M21 = (!symmetric) ? C(3, qx,qy,qz, e) : M12;
252  const double M22 = (!symmetric) ? C(4, qx,qy,qz, e) : C(3, qx,qy,qz, e);
253  const double M23 = (!symmetric) ? C(5, qx,qy,qz, e) : C(4, qx,qy,qz, e);
254  const double M31 = (!symmetric) ? C(6, qx,qy,qz, e) : M13;
255  const double M32 = (!symmetric) ? C(7, qx,qy,qz, e) : M23;
256  const double M33 = (!symmetric) ? C(8, qx,qy,qz, e) : C(5, qx,qy,qz, e);
257 
258  const double R11 = M11*A11 + M12*A12 + M13*A13;
259  const double R12 = M11*A21 + M12*A22 + M13*A23;
260  const double R13 = M11*A31 + M12*A32 + M13*A33;
261  const double R21 = M21*A11 + M22*A12 + M23*A13;
262  const double R22 = M21*A21 + M22*A22 + M23*A23;
263  const double R23 = M21*A31 + M22*A32 + M23*A33;
264  const double R31 = M31*A11 + M32*A12 + M33*A13;
265  const double R32 = M31*A21 + M32*A22 + M33*A23;
266  const double R33 = M31*A31 + M32*A32 + M33*A33;
267 
268  // Now set D to J^{-1} R = adj(J) R
269  D(qx,qy,qz,0,e) = w_detJ * (A11*R11 + A12*R21 + A13*R31); // 1,1
270  const double D12 = w_detJ * (A11*R12 + A12*R22 + A13*R32);
271  D(qx,qy,qz,1,e) = D12; // 1,2
272  D(qx,qy,qz,2,e) = w_detJ * (A11*R13 + A12*R23 + A13*R33); // 1,3
273 
274  const double D21 = w_detJ * (A21*R11 + A22*R21 + A23*R31);
275  const double D22 = w_detJ * (A21*R12 + A22*R22 + A23*R32);
276  const double D23 = w_detJ * (A21*R13 + A22*R23 + A23*R33);
277 
278  const double D33 = w_detJ * (A31*R13 + A32*R23 + A33*R33);
279 
280  D(qx,qy,qz,3,e) = symmetric ? D22 : D21; // 2,2 or 2,1
281  D(qx,qy,qz,4,e) = symmetric ? D23 : D22; // 2,3 or 2,2
282  D(qx,qy,qz,5,e) = symmetric ? D33 : D23; // 3,3 or 2,3
283 
284  if (!symmetric)
285  {
286  D(qx,qy,qz,6,e) = w_detJ * (A31*R11 + A32*R21 + A33*R31); // 3,1
287  D(qx,qy,qz,7,e) = w_detJ * (A31*R12 + A32*R22 + A33*R32); // 3,2
288  D(qx,qy,qz,8,e) = D33; // 3,3
289  }
290  }
291  else // Vector or scalar coefficient version
292  {
293  const double C1 = const_c ? C(0,0,0,0,0) : C(0,qx,qy,qz,e);
294  const double C2 = const_c ? C(0,0,0,0,0) :
295  (coeffDim == 3 ? C(1,qx,qy,qz,e) : C(0,qx,qy,qz,e));
296  const double C3 = const_c ? C(0,0,0,0,0) :
297  (coeffDim == 3 ? C(2,qx,qy,qz,e) : C(0,qx,qy,qz,e));
298 
299  // detJ J^{-1} J^{-T} = (1/detJ) adj(J) adj(J)^T
300  D(qx,qy,qz,0,e) = w_detJ * (C1*A11*A11 + C2*A12*A12 + C3*A13*A13); // 1,1
301  D(qx,qy,qz,1,e) = w_detJ * (C1*A11*A21 + C2*A12*A22 + C3*A13*A23); // 2,1
302  D(qx,qy,qz,2,e) = w_detJ * (C1*A11*A31 + C2*A12*A32 + C3*A13*A33); // 3,1
303  D(qx,qy,qz,3,e) = w_detJ * (C1*A21*A21 + C2*A22*A22 + C3*A23*A23); // 2,2
304  D(qx,qy,qz,4,e) = w_detJ * (C1*A21*A31 + C2*A22*A32 + C3*A23*A33); // 3,2
305  D(qx,qy,qz,5,e) = w_detJ * (C1*A31*A31 + C2*A32*A32 + C3*A33*A33); // 3,3
306  }
307  }
308  }
309  }
310  });
311 }
312 
313 static void PADiffusionSetup(const int dim,
314  const int sdim,
315  const int D1D,
316  const int Q1D,
317  const int coeffDim,
318  const int NE,
319  const Array<double> &W,
320  const Vector &J,
321  const Vector &C,
322  Vector &D)
323 {
324  if (dim == 1) { MFEM_ABORT("dim==1 not supported in PADiffusionSetup"); }
325  if (dim == 2)
326  {
327 #ifdef MFEM_USE_OCCA
328  if (DeviceCanUseOcca())
329  {
330  OccaPADiffusionSetup2D(D1D, Q1D, NE, W, J, C, D);
331  return;
332  }
333 #else
334  MFEM_CONTRACT_VAR(D1D);
335 #endif // MFEM_USE_OCCA
336  if (sdim == 2) { PADiffusionSetup2D<2>(Q1D, coeffDim, NE, W, J, C, D); }
337  if (sdim == 3) { PADiffusionSetup2D<3>(Q1D, coeffDim, NE, W, J, C, D); }
338  }
339  if (dim == 3)
340  {
341 #ifdef MFEM_USE_OCCA
342  if (DeviceCanUseOcca())
343  {
344  OccaPADiffusionSetup3D(D1D, Q1D, NE, W, J, C, D);
345  return;
346  }
347 #endif // MFEM_USE_OCCA
348  PADiffusionSetup3D(Q1D, coeffDim, NE, W, J, C, D);
349  }
350 }
351 
352 void DiffusionIntegrator::AssemblePA(const FiniteElementSpace &fes)
353 {
354  // Assuming the same element type
355  fespace = &fes;
356  Mesh *mesh = fes.GetMesh();
357  if (mesh->GetNE() == 0) { return; }
358  const FiniteElement &el = *fes.GetFE(0);
359  const IntegrationRule *ir = IntRule ? IntRule : &GetRule(el, el);
360  if (DeviceCanUseCeed())
361  {
362  delete ceedDataPtr;
363  ceedDataPtr = new CeedData;
364  InitCeedCoeff(Q, *mesh, *ir, ceedDataPtr);
365  return CeedPADiffusionAssemble(fes, *ir, *ceedDataPtr);
366  }
367  const int dims = el.GetDim();
368  const int symmDims = (dims * (dims + 1)) / 2; // 1x1: 1, 2x2: 3, 3x3: 6
369  const int nq = ir->GetNPoints();
370  dim = mesh->Dimension();
371  ne = fes.GetNE();
372  geom = mesh->GetGeometricFactors(*ir, GeometricFactors::JACOBIANS);
373  const int sdim = mesh->SpaceDimension();
374  maps = &el.GetDofToQuad(*ir, DofToQuad::TENSOR);
375  dofs1D = maps->ndof;
376  quad1D = maps->nqpt;
377  int coeffDim = 1;
378  Vector coeff;
379  const int MQfullDim = MQ ? MQ->GetHeight() * MQ->GetWidth() : 0;
380  if (MQ)
381  {
382  MFEM_VERIFY(MQ->GetHeight() == dim && MQ->GetWidth() == dim, "");
383  const int MQsymmDim = MQ->GetWidth() * (MQ->GetWidth() + 1) / 2;
384 
385  const int MQdim = MQ->IsSymmetric() ? MQsymmDim : MQfullDim;
386  coeffDim = MQdim;
387 
388  coeff.SetSize(MQdim * nq * ne);
389  symmetric = MQ ? MQ->IsSymmetric() : true;
390 
391  DenseMatrix M;
392  Vector Msymm;
393  if (symmetric)
394  {
395  Msymm.SetSize(MQsymmDim);
396  }
397  else
398  {
399  M.SetSize(dim);
400  }
401 
402  auto C = Reshape(coeff.HostWrite(), MQdim, nq, ne);
403  for (int e=0; e<ne; ++e)
404  {
406  for (int p=0; p<nq; ++p)
407  {
408  if (MQ->IsSymmetric())
409  {
410  MQ->EvalSymmetric(Msymm, *tr, ir->IntPoint(p));
411 
412  for (int i=0; i<MQsymmDim; ++i)
413  {
414  C(i, p, e) = Msymm[i];
415  }
416  }
417  else
418  {
419  MQ->Eval(M, *tr, ir->IntPoint(p));
420 
421  for (int i=0; i<dim; ++i)
422  for (int j=0; j<dim; ++j)
423  {
424  C(j+(i*dim), p, e) = M(i,j);
425  }
426  }
427  }
428  }
429  }
430  else if (VQ)
431  {
432  MFEM_VERIFY(VQ->GetVDim() == dim, "");
433  coeffDim = VQ->GetVDim();
434  coeff.SetSize(coeffDim * nq * ne);
435  auto C = Reshape(coeff.HostWrite(), coeffDim, nq, ne);
436  Vector D(coeffDim);
437  for (int e=0; e<ne; ++e)
438  {
440  for (int p=0; p<nq; ++p)
441  {
442  VQ->Eval(D, *tr, ir->IntPoint(p));
443  for (int i=0; i<coeffDim; ++i)
444  {
445  C(i, p, e) = D[i];
446  }
447  }
448  }
449  }
450  else if (Q == nullptr)
451  {
452  coeff.SetSize(1);
453  coeff(0) = 1.0;
454  }
455  else if (ConstantCoefficient* cQ = dynamic_cast<ConstantCoefficient*>(Q))
456  {
457  coeff.SetSize(1);
458  coeff(0) = cQ->constant;
459  }
460  else if (QuadratureFunctionCoefficient* cQ =
461  dynamic_cast<QuadratureFunctionCoefficient*>(Q))
462  {
463  const QuadratureFunction &qFun = cQ->GetQuadFunction();
464  MFEM_VERIFY(qFun.Size() == ne*nq,
465  "Incompatible QuadratureFunction dimension \n");
466 
467  MFEM_VERIFY(ir == &qFun.GetSpace()->GetElementIntRule(0),
468  "IntegrationRule used within integrator and in"
469  " QuadratureFunction appear to be different");
470  qFun.Read();
471  coeff.MakeRef(const_cast<QuadratureFunction &>(qFun),0);
472  }
473  else
474  {
475  coeff.SetSize(nq * ne);
476  auto C = Reshape(coeff.HostWrite(), nq, ne);
477  for (int e = 0; e < ne; ++e)
478  {
480  for (int q = 0; q < nq; ++q)
481  {
482  C(q,e) = Q->Eval(T, ir->IntPoint(q));
483  }
484  }
485  }
486  pa_data.SetSize((symmetric ? symmDims : MQfullDim) * nq * ne,
487  Device::GetDeviceMemoryType());
488  PADiffusionSetup(dim, sdim, dofs1D, quad1D, coeffDim, ne, ir->GetWeights(),
489  geom->J, coeff, pa_data);
490 }
491 
492 template<int T_D1D = 0, int T_Q1D = 0>
493 static void PADiffusionDiagonal2D(const int NE,
494  const bool symmetric,
495  const Array<double> &b,
496  const Array<double> &g,
497  const Vector &d,
498  Vector &y,
499  const int d1d = 0,
500  const int q1d = 0)
501 {
502  const int D1D = T_D1D ? T_D1D : d1d;
503  const int Q1D = T_Q1D ? T_Q1D : q1d;
504  MFEM_VERIFY(D1D <= MAX_D1D, "");
505  MFEM_VERIFY(Q1D <= MAX_Q1D, "");
506  auto B = Reshape(b.Read(), Q1D, D1D);
507  auto G = Reshape(g.Read(), Q1D, D1D);
508  // note the different shape for D, if this is a symmetric matrix we only
509  // store necessary entries
510  auto D = Reshape(d.Read(), Q1D*Q1D, symmetric ? 3 : 4, NE);
511  auto Y = Reshape(y.ReadWrite(), D1D, D1D, NE);
512  MFEM_FORALL(e, NE,
513  {
514  const int D1D = T_D1D ? T_D1D : d1d;
515  const int Q1D = T_Q1D ? T_Q1D : q1d;
516  constexpr int MD1 = T_D1D ? T_D1D : MAX_D1D;
517  constexpr int MQ1 = T_Q1D ? T_Q1D : MAX_Q1D;
518  // gradphi \cdot Q \gradphi has four terms
519  double QD0[MQ1][MD1];
520  double QD1[MQ1][MD1];
521  double QD2[MQ1][MD1];
522  for (int qx = 0; qx < Q1D; ++qx)
523  {
524  for (int dy = 0; dy < D1D; ++dy)
525  {
526  QD0[qx][dy] = 0.0;
527  QD1[qx][dy] = 0.0;
528  QD2[qx][dy] = 0.0;
529  for (int qy = 0; qy < Q1D; ++qy)
530  {
531  const int q = qx + qy * Q1D;
532  const double D00 = D(q,0,e);
533  const double D10 = D(q,1,e);
534  const double D01 = symmetric ? D10 : D(q,2,e);
535  const double D11 = symmetric ? D(q,2,e) : D(q,3,e);
536  QD0[qx][dy] += B(qy, dy) * B(qy, dy) * D00;
537  QD1[qx][dy] += B(qy, dy) * G(qy, dy) * (D01 + D10);
538  QD2[qx][dy] += G(qy, dy) * G(qy, dy) * D11;
539  }
540  }
541  }
542  for (int dy = 0; dy < D1D; ++dy)
543  {
544  for (int dx = 0; dx < D1D; ++dx)
545  {
546  for (int qx = 0; qx < Q1D; ++qx)
547  {
548  Y(dx,dy,e) += G(qx, dx) * G(qx, dx) * QD0[qx][dy];
549  Y(dx,dy,e) += G(qx, dx) * B(qx, dx) * QD1[qx][dy];
550  Y(dx,dy,e) += B(qx, dx) * B(qx, dx) * QD2[qx][dy];
551  }
552  }
553  }
554  });
555 }
556 
557 // Shared memory PA Diffusion Diagonal 2D kernel
558 template<int T_D1D = 0, int T_Q1D = 0, int T_NBZ = 0>
559 static void SmemPADiffusionDiagonal2D(const int NE,
560  const bool symmetric,
561  const Array<double> &b_,
562  const Array<double> &g_,
563  const Vector &d_,
564  Vector &y_,
565  const int d1d = 0,
566  const int q1d = 0)
567 {
568  const int D1D = T_D1D ? T_D1D : d1d;
569  const int Q1D = T_Q1D ? T_Q1D : q1d;
570  constexpr int NBZ = T_NBZ ? T_NBZ : 1;
571  constexpr int MQ1 = T_Q1D ? T_Q1D : MAX_Q1D;
572  constexpr int MD1 = T_D1D ? T_D1D : MAX_D1D;
573  MFEM_VERIFY(D1D <= MD1, "");
574  MFEM_VERIFY(Q1D <= MQ1, "");
575  auto b = Reshape(b_.Read(), Q1D, D1D);
576  auto g = Reshape(g_.Read(), Q1D, D1D);
577  auto D = Reshape(d_.Read(), Q1D*Q1D, symmetric ? 3 : 4, NE);
578  auto Y = Reshape(y_.ReadWrite(), D1D, D1D, NE);
579  MFEM_FORALL_2D(e, NE, Q1D, Q1D, NBZ,
580  {
581  const int tidz = MFEM_THREAD_ID(z);
582  const int D1D = T_D1D ? T_D1D : d1d;
583  const int Q1D = T_Q1D ? T_Q1D : q1d;
584  constexpr int NBZ = T_NBZ ? T_NBZ : 1;
585  constexpr int MQ1 = T_Q1D ? T_Q1D : MAX_Q1D;
586  constexpr int MD1 = T_D1D ? T_D1D : MAX_D1D;
587  MFEM_SHARED double BG[2][MQ1*MD1];
588  double (*B)[MD1] = (double (*)[MD1]) (BG+0);
589  double (*G)[MD1] = (double (*)[MD1]) (BG+1);
590  MFEM_SHARED double QD[3][NBZ][MD1][MQ1];
591  double (*QD0)[MD1] = (double (*)[MD1])(QD[0] + tidz);
592  double (*QD1)[MD1] = (double (*)[MD1])(QD[1] + tidz);
593  double (*QD2)[MD1] = (double (*)[MD1])(QD[2] + tidz);
594  if (tidz == 0)
595  {
596  MFEM_FOREACH_THREAD(d,y,D1D)
597  {
598  MFEM_FOREACH_THREAD(q,x,Q1D)
599  {
600  B[q][d] = b(q,d);
601  G[q][d] = g(q,d);
602  }
603  }
604  }
605  MFEM_SYNC_THREAD;
606  MFEM_FOREACH_THREAD(qx,x,Q1D)
607  {
608  MFEM_FOREACH_THREAD(dy,y,D1D)
609  {
610  QD0[qx][dy] = 0.0;
611  QD1[qx][dy] = 0.0;
612  QD2[qx][dy] = 0.0;
613  for (int qy = 0; qy < Q1D; ++qy)
614  {
615  const int q = qx + qy * Q1D;
616  const double D00 = D(q,0,e);
617  const double D10 = D(q,1,e);
618  const double D01 = symmetric ? D10 : D(q,2,e);
619  const double D11 = symmetric ? D(q,2,e) : D(q,3,e);
620  const double By = B[qy][dy];
621  const double Gy = G[qy][dy];
622  const double BB = By * By;
623  const double BG = By * Gy;
624  const double GG = Gy * Gy;
625  QD0[qx][dy] += BB * D00;
626  QD1[qx][dy] += BG * (D01 + D10);
627  QD2[qx][dy] += GG * D11;
628  }
629  }
630  }
631  MFEM_SYNC_THREAD;
632  MFEM_FOREACH_THREAD(dy,y,D1D)
633  {
634  MFEM_FOREACH_THREAD(dx,x,D1D)
635  {
636  for (int qx = 0; qx < Q1D; ++qx)
637  {
638  const double Bx = B[qx][dx];
639  const double Gx = G[qx][dx];
640  const double BB = Bx * Bx;
641  const double BG = Bx * Gx;
642  const double GG = Gx * Gx;
643  Y(dx,dy,e) += GG * QD0[qx][dy];
644  Y(dx,dy,e) += BG * QD1[qx][dy];
645  Y(dx,dy,e) += BB * QD2[qx][dy];
646  }
647  }
648  }
649  });
650 }
651 
652 template<int T_D1D = 0, int T_Q1D = 0>
653 static void PADiffusionDiagonal3D(const int NE,
654  const bool symmetric,
655  const Array<double> &b,
656  const Array<double> &g,
657  const Vector &d,
658  Vector &y,
659  const int d1d = 0,
660  const int q1d = 0)
661 {
662  constexpr int DIM = 3;
663  const int D1D = T_D1D ? T_D1D : d1d;
664  const int Q1D = T_Q1D ? T_Q1D : q1d;
665  constexpr int MQ1 = T_Q1D ? T_Q1D : MAX_Q1D;
666  constexpr int MD1 = T_D1D ? T_D1D : MAX_D1D;
667  MFEM_VERIFY(D1D <= MD1, "");
668  MFEM_VERIFY(Q1D <= MQ1, "");
669  auto B = Reshape(b.Read(), Q1D, D1D);
670  auto G = Reshape(g.Read(), Q1D, D1D);
671  auto Q = Reshape(d.Read(), Q1D*Q1D*Q1D, symmetric ? 6 : 9, NE);
672  auto Y = Reshape(y.ReadWrite(), D1D, D1D, D1D, NE);
673  MFEM_FORALL(e, NE,
674  {
675  const int D1D = T_D1D ? T_D1D : d1d;
676  const int Q1D = T_Q1D ? T_Q1D : q1d;
677  constexpr int MD1 = T_D1D ? T_D1D : MAX_D1D;
678  constexpr int MQ1 = T_Q1D ? T_Q1D : MAX_Q1D;
679  double QQD[MQ1][MQ1][MD1];
680  double QDD[MQ1][MD1][MD1];
681  for (int i = 0; i < DIM; ++i)
682  {
683  for (int j = 0; j < DIM; ++j)
684  {
685  // first tensor contraction, along z direction
686  for (int qx = 0; qx < Q1D; ++qx)
687  {
688  for (int qy = 0; qy < Q1D; ++qy)
689  {
690  for (int dz = 0; dz < D1D; ++dz)
691  {
692  QQD[qx][qy][dz] = 0.0;
693  for (int qz = 0; qz < Q1D; ++qz)
694  {
695  const int q = qx + (qy + qz * Q1D) * Q1D;
696  const int ksym = j >= i ?
697  3 - (3-i)*(2-i)/2 + j:
698  3 - (3-j)*(2-j)/2 + i;
699  const int k = symmetric ? ksym : (i*DIM) + j;
700  const double O = Q(q,k,e);
701  const double Bz = B(qz,dz);
702  const double Gz = G(qz,dz);
703  const double L = i==2 ? Gz : Bz;
704  const double R = j==2 ? Gz : Bz;
705  QQD[qx][qy][dz] += L * O * R;
706  }
707  }
708  }
709  }
710  // second tensor contraction, along y direction
711  for (int qx = 0; qx < Q1D; ++qx)
712  {
713  for (int dz = 0; dz < D1D; ++dz)
714  {
715  for (int dy = 0; dy < D1D; ++dy)
716  {
717  QDD[qx][dy][dz] = 0.0;
718  for (int qy = 0; qy < Q1D; ++qy)
719  {
720  const double By = B(qy,dy);
721  const double Gy = G(qy,dy);
722  const double L = i==1 ? Gy : By;
723  const double R = j==1 ? Gy : By;
724  QDD[qx][dy][dz] += L * QQD[qx][qy][dz] * R;
725  }
726  }
727  }
728  }
729  // third tensor contraction, along x direction
730  for (int dz = 0; dz < D1D; ++dz)
731  {
732  for (int dy = 0; dy < D1D; ++dy)
733  {
734  for (int dx = 0; dx < D1D; ++dx)
735  {
736  for (int qx = 0; qx < Q1D; ++qx)
737  {
738  const double Bx = B(qx,dx);
739  const double Gx = G(qx,dx);
740  const double L = i==0 ? Gx : Bx;
741  const double R = j==0 ? Gx : Bx;
742  Y(dx, dy, dz, e) += L * QDD[qx][dy][dz] * R;
743  }
744  }
745  }
746  }
747  }
748  }
749  });
750 }
751 
752 // Shared memory PA Diffusion Diagonal 3D kernel
753 template<int T_D1D = 0, int T_Q1D = 0>
754 static void SmemPADiffusionDiagonal3D(const int NE,
755  const bool symmetric,
756  const Array<double> &b_,
757  const Array<double> &g_,
758  const Vector &d_,
759  Vector &y_,
760  const int d1d = 0,
761  const int q1d = 0)
762 {
763  constexpr int DIM = 3;
764  const int D1D = T_D1D ? T_D1D : d1d;
765  const int Q1D = T_Q1D ? T_Q1D : q1d;
766  constexpr int MQ1 = T_Q1D ? T_Q1D : MAX_Q1D;
767  constexpr int MD1 = T_D1D ? T_D1D : MAX_D1D;
768  MFEM_VERIFY(D1D <= MD1, "");
769  MFEM_VERIFY(Q1D <= MQ1, "");
770  auto b = Reshape(b_.Read(), Q1D, D1D);
771  auto g = Reshape(g_.Read(), Q1D, D1D);
772  auto D = Reshape(d_.Read(), Q1D*Q1D*Q1D, symmetric ? 6 : 9, NE);
773  auto Y = Reshape(y_.ReadWrite(), D1D, D1D, D1D, NE);
774  MFEM_FORALL_3D(e, NE, Q1D, Q1D, Q1D,
775  {
776  const int tidz = MFEM_THREAD_ID(z);
777  const int D1D = T_D1D ? T_D1D : d1d;
778  const int Q1D = T_Q1D ? T_Q1D : q1d;
779  constexpr int MQ1 = T_Q1D ? T_Q1D : MAX_Q1D;
780  constexpr int MD1 = T_D1D ? T_D1D : MAX_D1D;
781  MFEM_SHARED double BG[2][MQ1*MD1];
782  double (*B)[MD1] = (double (*)[MD1]) (BG+0);
783  double (*G)[MD1] = (double (*)[MD1]) (BG+1);
784  MFEM_SHARED double QQD[MQ1][MQ1][MD1];
785  MFEM_SHARED double QDD[MQ1][MD1][MD1];
786  if (tidz == 0)
787  {
788  MFEM_FOREACH_THREAD(d,y,D1D)
789  {
790  MFEM_FOREACH_THREAD(q,x,Q1D)
791  {
792  B[q][d] = b(q,d);
793  G[q][d] = g(q,d);
794  }
795  }
796  }
797  MFEM_SYNC_THREAD;
798  for (int i = 0; i < DIM; ++i)
799  {
800  for (int j = 0; j < DIM; ++j)
801  {
802  // first tensor contraction, along z direction
803  MFEM_FOREACH_THREAD(qx,x,Q1D)
804  {
805  MFEM_FOREACH_THREAD(qy,y,Q1D)
806  {
807  MFEM_FOREACH_THREAD(dz,z,D1D)
808  {
809  QQD[qx][qy][dz] = 0.0;
810  for (int qz = 0; qz < Q1D; ++qz)
811  {
812  const int q = qx + (qy + qz * Q1D) * Q1D;
813  const int ksym = j >= i ?
814  3 - (3-i)*(2-i)/2 + j:
815  3 - (3-j)*(2-j)/2 + i;
816  const int k = symmetric ? ksym : (i*DIM) + j;
817  const double O = D(q,k,e);
818  const double Bz = B[qz][dz];
819  const double Gz = G[qz][dz];
820  const double L = i==2 ? Gz : Bz;
821  const double R = j==2 ? Gz : Bz;
822  QQD[qx][qy][dz] += L * O * R;
823  }
824  }
825  }
826  }
827  MFEM_SYNC_THREAD;
828  // second tensor contraction, along y direction
829  MFEM_FOREACH_THREAD(qx,x,Q1D)
830  {
831  MFEM_FOREACH_THREAD(dz,z,D1D)
832  {
833  MFEM_FOREACH_THREAD(dy,y,D1D)
834  {
835  QDD[qx][dy][dz] = 0.0;
836  for (int qy = 0; qy < Q1D; ++qy)
837  {
838  const double By = B[qy][dy];
839  const double Gy = G[qy][dy];
840  const double L = i==1 ? Gy : By;
841  const double R = j==1 ? Gy : By;
842  QDD[qx][dy][dz] += L * QQD[qx][qy][dz] * R;
843  }
844  }
845  }
846  }
847  MFEM_SYNC_THREAD;
848  // third tensor contraction, along x direction
849  MFEM_FOREACH_THREAD(dz,z,D1D)
850  {
851  MFEM_FOREACH_THREAD(dy,y,D1D)
852  {
853  MFEM_FOREACH_THREAD(dx,x,D1D)
854  {
855  for (int qx = 0; qx < Q1D; ++qx)
856  {
857  const double Bx = B[qx][dx];
858  const double Gx = G[qx][dx];
859  const double L = i==0 ? Gx : Bx;
860  const double R = j==0 ? Gx : Bx;
861  Y(dx, dy, dz, e) += L * QDD[qx][dy][dz] * R;
862  }
863  }
864  }
865  }
866  }
867  }
868  });
869 }
870 
871 static void PADiffusionAssembleDiagonal(const int dim,
872  const int D1D,
873  const int Q1D,
874  const int NE,
875  const bool symm,
876  const Array<double> &B,
877  const Array<double> &G,
878  const Vector &D,
879  Vector &Y)
880 {
881  if (dim == 2)
882  {
883  switch ((D1D << 4 ) | Q1D)
884  {
885  case 0x22: return SmemPADiffusionDiagonal2D<2,2,8>(NE,symm,B,G,D,Y);
886  case 0x33: return SmemPADiffusionDiagonal2D<3,3,8>(NE,symm,B,G,D,Y);
887  case 0x44: return SmemPADiffusionDiagonal2D<4,4,4>(NE,symm,B,G,D,Y);
888  case 0x55: return SmemPADiffusionDiagonal2D<5,5,4>(NE,symm,B,G,D,Y);
889  case 0x66: return SmemPADiffusionDiagonal2D<6,6,2>(NE,symm,B,G,D,Y);
890  case 0x77: return SmemPADiffusionDiagonal2D<7,7,2>(NE,symm,B,G,D,Y);
891  case 0x88: return SmemPADiffusionDiagonal2D<8,8,1>(NE,symm,B,G,D,Y);
892  case 0x99: return SmemPADiffusionDiagonal2D<9,9,1>(NE,symm,B,G,D,Y);
893  default: return PADiffusionDiagonal2D(NE,symm,B,G,D,Y,D1D,Q1D);
894  }
895  }
896  else if (dim == 3)
897  {
898  switch ((D1D << 4 ) | Q1D)
899  {
900  case 0x23: return SmemPADiffusionDiagonal3D<2,3>(NE,symm,B,G,D,Y);
901  case 0x34: return SmemPADiffusionDiagonal3D<3,4>(NE,symm,B,G,D,Y);
902  case 0x45: return SmemPADiffusionDiagonal3D<4,5>(NE,symm,B,G,D,Y);
903  case 0x56: return SmemPADiffusionDiagonal3D<5,6>(NE,symm,B,G,D,Y);
904  case 0x67: return SmemPADiffusionDiagonal3D<6,7>(NE,symm,B,G,D,Y);
905  case 0x78: return SmemPADiffusionDiagonal3D<7,8>(NE,symm,B,G,D,Y);
906  case 0x89: return SmemPADiffusionDiagonal3D<8,9>(NE,symm,B,G,D,Y);
907  case 0x9A: return SmemPADiffusionDiagonal3D<9,10>(NE,symm,B,G,D,Y);
908  default: return PADiffusionDiagonal3D(NE,symm,B,G,D,Y,D1D,Q1D);
909  }
910  }
911  MFEM_ABORT("Unknown kernel.");
912 }
913 
914 void DiffusionIntegrator::AssembleDiagonalPA(Vector &diag)
915 {
916  if (DeviceCanUseCeed())
917  {
918  CeedAssembleDiagonal(ceedDataPtr, diag);
919  }
920  else
921  {
922  if (pa_data.Size()==0) { AssemblePA(*fespace); }
923  PADiffusionAssembleDiagonal(dim, dofs1D, quad1D, ne, symmetric,
924  maps->B, maps->G, pa_data, diag);
925  }
926 }
927 
928 
929 #ifdef MFEM_USE_OCCA
930 // OCCA PA Diffusion Apply 2D kernel
931 static void OccaPADiffusionApply2D(const int D1D,
932  const int Q1D,
933  const int NE,
934  const Array<double> &B,
935  const Array<double> &G,
936  const Array<double> &Bt,
937  const Array<double> &Gt,
938  const Vector &D,
939  const Vector &X,
940  Vector &Y)
941 {
942  occa::properties props;
943  props["defines/D1D"] = D1D;
944  props["defines/Q1D"] = Q1D;
945  const occa::memory o_B = OccaMemoryRead(B.GetMemory(), B.Size());
946  const occa::memory o_G = OccaMemoryRead(G.GetMemory(), G.Size());
947  const occa::memory o_Bt = OccaMemoryRead(Bt.GetMemory(), Bt.Size());
948  const occa::memory o_Gt = OccaMemoryRead(Gt.GetMemory(), Gt.Size());
949  const occa::memory o_D = OccaMemoryRead(D.GetMemory(), D.Size());
950  const occa::memory o_X = OccaMemoryRead(X.GetMemory(), X.Size());
951  occa::memory o_Y = OccaMemoryReadWrite(Y.GetMemory(), Y.Size());
952  const occa_id_t id = std::make_pair(D1D,Q1D);
953  if (!Device::Allows(Backend::OCCA_CUDA))
954  {
955  static occa_kernel_t OccaDiffApply2D_cpu;
956  if (OccaDiffApply2D_cpu.find(id) == OccaDiffApply2D_cpu.end())
957  {
958  const occa::kernel DiffusionApply2D_CPU =
959  mfem::OccaDev().buildKernel("occa://mfem/fem/occa.okl",
960  "DiffusionApply2D_CPU", props);
961  OccaDiffApply2D_cpu.emplace(id, DiffusionApply2D_CPU);
962  }
963  OccaDiffApply2D_cpu.at(id)(NE, o_B, o_G, o_Bt, o_Gt, o_D, o_X, o_Y);
964  }
965  else
966  {
967  static occa_kernel_t OccaDiffApply2D_gpu;
968  if (OccaDiffApply2D_gpu.find(id) == OccaDiffApply2D_gpu.end())
969  {
970  const occa::kernel DiffusionApply2D_GPU =
971  mfem::OccaDev().buildKernel("occa://mfem/fem/occa.okl",
972  "DiffusionApply2D_GPU", props);
973  OccaDiffApply2D_gpu.emplace(id, DiffusionApply2D_GPU);
974  }
975  OccaDiffApply2D_gpu.at(id)(NE, o_B, o_G, o_Bt, o_Gt, o_D, o_X, o_Y);
976  }
977 }
978 
979 // OCCA PA Diffusion Apply 3D kernel
980 static void OccaPADiffusionApply3D(const int D1D,
981  const int Q1D,
982  const int NE,
983  const Array<double> &B,
984  const Array<double> &G,
985  const Array<double> &Bt,
986  const Array<double> &Gt,
987  const Vector &D,
988  const Vector &X,
989  Vector &Y)
990 {
991  occa::properties props;
992  props["defines/D1D"] = D1D;
993  props["defines/Q1D"] = Q1D;
994  const occa::memory o_B = OccaMemoryRead(B.GetMemory(), B.Size());
995  const occa::memory o_G = OccaMemoryRead(G.GetMemory(), G.Size());
996  const occa::memory o_Bt = OccaMemoryRead(Bt.GetMemory(), Bt.Size());
997  const occa::memory o_Gt = OccaMemoryRead(Gt.GetMemory(), Gt.Size());
998  const occa::memory o_D = OccaMemoryRead(D.GetMemory(), D.Size());
999  const occa::memory o_X = OccaMemoryRead(X.GetMemory(), X.Size());
1000  occa::memory o_Y = OccaMemoryReadWrite(Y.GetMemory(), Y.Size());
1001  const occa_id_t id = std::make_pair(D1D,Q1D);
1002  if (!Device::Allows(Backend::OCCA_CUDA))
1003  {
1004  static occa_kernel_t OccaDiffApply3D_cpu;
1005  if (OccaDiffApply3D_cpu.find(id) == OccaDiffApply3D_cpu.end())
1006  {
1007  const occa::kernel DiffusionApply3D_CPU =
1008  mfem::OccaDev().buildKernel("occa://mfem/fem/occa.okl",
1009  "DiffusionApply3D_CPU", props);
1010  OccaDiffApply3D_cpu.emplace(id, DiffusionApply3D_CPU);
1011  }
1012  OccaDiffApply3D_cpu.at(id)(NE, o_B, o_G, o_Bt, o_Gt, o_D, o_X, o_Y);
1013  }
1014  else
1015  {
1016  static occa_kernel_t OccaDiffApply3D_gpu;
1017  if (OccaDiffApply3D_gpu.find(id) == OccaDiffApply3D_gpu.end())
1018  {
1019  const occa::kernel DiffusionApply3D_GPU =
1020  mfem::OccaDev().buildKernel("occa://mfem/fem/occa.okl",
1021  "DiffusionApply3D_GPU", props);
1022  OccaDiffApply3D_gpu.emplace(id, DiffusionApply3D_GPU);
1023  }
1024  OccaDiffApply3D_gpu.at(id)(NE, o_B, o_G, o_Bt, o_Gt, o_D, o_X, o_Y);
1025  }
1026 }
1027 #endif // MFEM_USE_OCCA
1028 
1029 // PA Diffusion Apply 2D kernel
1030 template<int T_D1D = 0, int T_Q1D = 0>
1031 static void PADiffusionApply2D(const int NE,
1032  const bool symmetric,
1033  const Array<double> &b_,
1034  const Array<double> &g_,
1035  const Array<double> &bt_,
1036  const Array<double> &gt_,
1037  const Vector &d_,
1038  const Vector &x_,
1039  Vector &y_,
1040  const int d1d = 0,
1041  const int q1d = 0)
1042 {
1043  const int D1D = T_D1D ? T_D1D : d1d;
1044  const int Q1D = T_Q1D ? T_Q1D : q1d;
1045  MFEM_VERIFY(D1D <= MAX_D1D, "");
1046  MFEM_VERIFY(Q1D <= MAX_Q1D, "");
1047  auto B = Reshape(b_.Read(), Q1D, D1D);
1048  auto G = Reshape(g_.Read(), Q1D, D1D);
1049  auto Bt = Reshape(bt_.Read(), D1D, Q1D);
1050  auto Gt = Reshape(gt_.Read(), D1D, Q1D);
1051  auto D = Reshape(d_.Read(), Q1D*Q1D, symmetric ? 3 : 4, NE);
1052  auto X = Reshape(x_.Read(), D1D, D1D, NE);
1053  auto Y = Reshape(y_.ReadWrite(), D1D, D1D, NE);
1054  MFEM_FORALL(e, NE,
1055  {
1056  const int D1D = T_D1D ? T_D1D : d1d;
1057  const int Q1D = T_Q1D ? T_Q1D : q1d;
1058  // the following variables are evaluated at compile time
1059  constexpr int max_D1D = T_D1D ? T_D1D : MAX_D1D;
1060  constexpr int max_Q1D = T_Q1D ? T_Q1D : MAX_Q1D;
1061 
1062  double grad[max_Q1D][max_Q1D][2];
1063  for (int qy = 0; qy < Q1D; ++qy)
1064  {
1065  for (int qx = 0; qx < Q1D; ++qx)
1066  {
1067  grad[qy][qx][0] = 0.0;
1068  grad[qy][qx][1] = 0.0;
1069  }
1070  }
1071  for (int dy = 0; dy < D1D; ++dy)
1072  {
1073  double gradX[max_Q1D][2];
1074  for (int qx = 0; qx < Q1D; ++qx)
1075  {
1076  gradX[qx][0] = 0.0;
1077  gradX[qx][1] = 0.0;
1078  }
1079  for (int dx = 0; dx < D1D; ++dx)
1080  {
1081  const double s = X(dx,dy,e);
1082  for (int qx = 0; qx < Q1D; ++qx)
1083  {
1084  gradX[qx][0] += s * B(qx,dx);
1085  gradX[qx][1] += s * G(qx,dx);
1086  }
1087  }
1088  for (int qy = 0; qy < Q1D; ++qy)
1089  {
1090  const double wy = B(qy,dy);
1091  const double wDy = G(qy,dy);
1092  for (int qx = 0; qx < Q1D; ++qx)
1093  {
1094  grad[qy][qx][0] += gradX[qx][1] * wy;
1095  grad[qy][qx][1] += gradX[qx][0] * wDy;
1096  }
1097  }
1098  }
1099  // Calculate Dxy, xDy in plane
1100  for (int qy = 0; qy < Q1D; ++qy)
1101  {
1102  for (int qx = 0; qx < Q1D; ++qx)
1103  {
1104  const int q = qx + qy * Q1D;
1105 
1106  const double O11 = D(q,0,e);
1107  const double O21 = D(q,1,e);
1108  const double O12 = symmetric ? O21 : D(q,2,e);
1109  const double O22 = symmetric ? D(q,2,e) : D(q,3,e);
1110 
1111  const double gradX = grad[qy][qx][0];
1112  const double gradY = grad[qy][qx][1];
1113 
1114  grad[qy][qx][0] = (O11 * gradX) + (O12 * gradY);
1115  grad[qy][qx][1] = (O21 * gradX) + (O22 * gradY);
1116  }
1117  }
1118  for (int qy = 0; qy < Q1D; ++qy)
1119  {
1120  double gradX[max_D1D][2];
1121  for (int dx = 0; dx < D1D; ++dx)
1122  {
1123  gradX[dx][0] = 0;
1124  gradX[dx][1] = 0;
1125  }
1126  for (int qx = 0; qx < Q1D; ++qx)
1127  {
1128  const double gX = grad[qy][qx][0];
1129  const double gY = grad[qy][qx][1];
1130  for (int dx = 0; dx < D1D; ++dx)
1131  {
1132  const double wx = Bt(dx,qx);
1133  const double wDx = Gt(dx,qx);
1134  gradX[dx][0] += gX * wDx;
1135  gradX[dx][1] += gY * wx;
1136  }
1137  }
1138  for (int dy = 0; dy < D1D; ++dy)
1139  {
1140  const double wy = Bt(dy,qy);
1141  const double wDy = Gt(dy,qy);
1142  for (int dx = 0; dx < D1D; ++dx)
1143  {
1144  Y(dx,dy,e) += ((gradX[dx][0] * wy) + (gradX[dx][1] * wDy));
1145  }
1146  }
1147  }
1148  });
1149 }
1150 
1151 // Shared memory PA Diffusion Apply 2D kernel
1152 template<int T_D1D = 0, int T_Q1D = 0, int T_NBZ = 0>
1153 static void SmemPADiffusionApply2D(const int NE,
1154  const bool symmetric,
1155  const Array<double> &b_,
1156  const Array<double> &g_,
1157  const Vector &d_,
1158  const Vector &x_,
1159  Vector &y_,
1160  const int d1d = 0,
1161  const int q1d = 0)
1162 {
1163  const int D1D = T_D1D ? T_D1D : d1d;
1164  const int Q1D = T_Q1D ? T_Q1D : q1d;
1165  constexpr int NBZ = T_NBZ ? T_NBZ : 1;
1166  constexpr int MQ1 = T_Q1D ? T_Q1D : MAX_Q1D;
1167  constexpr int MD1 = T_D1D ? T_D1D : MAX_D1D;
1168  MFEM_VERIFY(D1D <= MD1, "");
1169  MFEM_VERIFY(Q1D <= MQ1, "");
1170  auto b = Reshape(b_.Read(), Q1D, D1D);
1171  auto g = Reshape(g_.Read(), Q1D, D1D);
1172  auto D = Reshape(d_.Read(), Q1D*Q1D, symmetric ? 3 : 4, NE);
1173  auto x = Reshape(x_.Read(), D1D, D1D, NE);
1174  auto Y = Reshape(y_.ReadWrite(), D1D, D1D, NE);
1175  MFEM_FORALL_2D(e, NE, Q1D, Q1D, NBZ,
1176  {
1177  const int tidz = MFEM_THREAD_ID(z);
1178  const int D1D = T_D1D ? T_D1D : d1d;
1179  const int Q1D = T_Q1D ? T_Q1D : q1d;
1180  constexpr int NBZ = T_NBZ ? T_NBZ : 1;
1181  constexpr int MQ1 = T_Q1D ? T_Q1D : MAX_Q1D;
1182  constexpr int MD1 = T_D1D ? T_D1D : MAX_D1D;
1183  MFEM_SHARED double sBG[2][MQ1*MD1];
1184  double (*B)[MD1] = (double (*)[MD1]) (sBG+0);
1185  double (*G)[MD1] = (double (*)[MD1]) (sBG+1);
1186  double (*Bt)[MQ1] = (double (*)[MQ1]) (sBG+0);
1187  double (*Gt)[MQ1] = (double (*)[MQ1]) (sBG+1);
1188  MFEM_SHARED double Xz[NBZ][MD1][MD1];
1189  MFEM_SHARED double GD[2][NBZ][MD1][MQ1];
1190  MFEM_SHARED double GQ[2][NBZ][MD1][MQ1];
1191  double (*X)[MD1] = (double (*)[MD1])(Xz + tidz);
1192  double (*DQ0)[MD1] = (double (*)[MD1])(GD[0] + tidz);
1193  double (*DQ1)[MD1] = (double (*)[MD1])(GD[1] + tidz);
1194  double (*QQ0)[MD1] = (double (*)[MD1])(GQ[0] + tidz);
1195  double (*QQ1)[MD1] = (double (*)[MD1])(GQ[1] + tidz);
1196  MFEM_FOREACH_THREAD(dy,y,D1D)
1197  {
1198  MFEM_FOREACH_THREAD(dx,x,D1D)
1199  {
1200  X[dy][dx] = x(dx,dy,e);
1201  }
1202  }
1203  if (tidz == 0)
1204  {
1205  MFEM_FOREACH_THREAD(dy,y,D1D)
1206  {
1207  MFEM_FOREACH_THREAD(q,x,Q1D)
1208  {
1209  B[q][dy] = b(q,dy);
1210  G[q][dy] = g(q,dy);
1211  }
1212  }
1213  }
1214  MFEM_SYNC_THREAD;
1215  MFEM_FOREACH_THREAD(dy,y,D1D)
1216  {
1217  MFEM_FOREACH_THREAD(qx,x,Q1D)
1218  {
1219  double u = 0.0;
1220  double v = 0.0;
1221  for (int dx = 0; dx < D1D; ++dx)
1222  {
1223  const double coords = X[dy][dx];
1224  u += B[qx][dx] * coords;
1225  v += G[qx][dx] * coords;
1226  }
1227  DQ0[dy][qx] = u;
1228  DQ1[dy][qx] = v;
1229  }
1230  }
1231  MFEM_SYNC_THREAD;
1232  MFEM_FOREACH_THREAD(qy,y,Q1D)
1233  {
1234  MFEM_FOREACH_THREAD(qx,x,Q1D)
1235  {
1236  double u = 0.0;
1237  double v = 0.0;
1238  for (int dy = 0; dy < D1D; ++dy)
1239  {
1240  u += DQ1[dy][qx] * B[qy][dy];
1241  v += DQ0[dy][qx] * G[qy][dy];
1242  }
1243  QQ0[qy][qx] = u;
1244  QQ1[qy][qx] = v;
1245  }
1246  }
1247  MFEM_SYNC_THREAD;
1248  MFEM_FOREACH_THREAD(qy,y,Q1D)
1249  {
1250  MFEM_FOREACH_THREAD(qx,x,Q1D)
1251  {
1252  const int q = (qx + ((qy) * Q1D));
1253  const double O11 = D(q,0,e);
1254  const double O21 = D(q,1,e);
1255  const double O12 = symmetric ? O21 : D(q,2,e);
1256  const double O22 = symmetric ? D(q,2,e) : D(q,3,e);
1257  const double gX = QQ0[qy][qx];
1258  const double gY = QQ1[qy][qx];
1259  QQ0[qy][qx] = (O11 * gX) + (O12 * gY);
1260  QQ1[qy][qx] = (O21 * gX) + (O22 * gY);
1261  }
1262  }
1263  MFEM_SYNC_THREAD;
1264  if (tidz == 0)
1265  {
1266  MFEM_FOREACH_THREAD(dy,y,D1D)
1267  {
1268  MFEM_FOREACH_THREAD(q,x,Q1D)
1269  {
1270  Bt[dy][q] = b(q,dy);
1271  Gt[dy][q] = g(q,dy);
1272  }
1273  }
1274  }
1275  MFEM_SYNC_THREAD;
1276  MFEM_FOREACH_THREAD(qy,y,Q1D)
1277  {
1278  MFEM_FOREACH_THREAD(dx,x,D1D)
1279  {
1280  double u = 0.0;
1281  double v = 0.0;
1282  for (int qx = 0; qx < Q1D; ++qx)
1283  {
1284  u += Gt[dx][qx] * QQ0[qy][qx];
1285  v += Bt[dx][qx] * QQ1[qy][qx];
1286  }
1287  DQ0[qy][dx] = u;
1288  DQ1[qy][dx] = v;
1289  }
1290  }
1291  MFEM_SYNC_THREAD;
1292  MFEM_FOREACH_THREAD(dy,y,D1D)
1293  {
1294  MFEM_FOREACH_THREAD(dx,x,D1D)
1295  {
1296  double u = 0.0;
1297  double v = 0.0;
1298  for (int qy = 0; qy < Q1D; ++qy)
1299  {
1300  u += DQ0[qy][dx] * Bt[dy][qy];
1301  v += DQ1[qy][dx] * Gt[dy][qy];
1302  }
1303  Y(dx,dy,e) += (u + v);
1304  }
1305  }
1306  });
1307 }
1308 
1309 // PA Diffusion Apply 3D kernel
1310 template<int T_D1D = 0, int T_Q1D = 0>
1311 static void PADiffusionApply3D(const int NE,
1312  const bool symmetric,
1313  const Array<double> &b,
1314  const Array<double> &g,
1315  const Array<double> &bt,
1316  const Array<double> &gt,
1317  const Vector &d_,
1318  const Vector &x_,
1319  Vector &y_,
1320  int d1d = 0, int q1d = 0)
1321 {
1322  const int D1D = T_D1D ? T_D1D : d1d;
1323  const int Q1D = T_Q1D ? T_Q1D : q1d;
1324  MFEM_VERIFY(D1D <= MAX_D1D, "");
1325  MFEM_VERIFY(Q1D <= MAX_Q1D, "");
1326  auto B = Reshape(b.Read(), Q1D, D1D);
1327  auto G = Reshape(g.Read(), Q1D, D1D);
1328  auto Bt = Reshape(bt.Read(), D1D, Q1D);
1329  auto Gt = Reshape(gt.Read(), D1D, Q1D);
1330  auto D = Reshape(d_.Read(), Q1D*Q1D*Q1D, symmetric ? 6 : 9, NE);
1331  auto X = Reshape(x_.Read(), D1D, D1D, D1D, NE);
1332  auto Y = Reshape(y_.ReadWrite(), D1D, D1D, D1D, NE);
1333  MFEM_FORALL(e, NE,
1334  {
1335  const int D1D = T_D1D ? T_D1D : d1d;
1336  const int Q1D = T_Q1D ? T_Q1D : q1d;
1337  constexpr int max_D1D = T_D1D ? T_D1D : MAX_D1D;
1338  constexpr int max_Q1D = T_Q1D ? T_Q1D : MAX_Q1D;
1339  double grad[max_Q1D][max_Q1D][max_Q1D][3];
1340  for (int qz = 0; qz < Q1D; ++qz)
1341  {
1342  for (int qy = 0; qy < Q1D; ++qy)
1343  {
1344  for (int qx = 0; qx < Q1D; ++qx)
1345  {
1346  grad[qz][qy][qx][0] = 0.0;
1347  grad[qz][qy][qx][1] = 0.0;
1348  grad[qz][qy][qx][2] = 0.0;
1349  }
1350  }
1351  }
1352  for (int dz = 0; dz < D1D; ++dz)
1353  {
1354  double gradXY[max_Q1D][max_Q1D][3];
1355  for (int qy = 0; qy < Q1D; ++qy)
1356  {
1357  for (int qx = 0; qx < Q1D; ++qx)
1358  {
1359  gradXY[qy][qx][0] = 0.0;
1360  gradXY[qy][qx][1] = 0.0;
1361  gradXY[qy][qx][2] = 0.0;
1362  }
1363  }
1364  for (int dy = 0; dy < D1D; ++dy)
1365  {
1366  double gradX[max_Q1D][2];
1367  for (int qx = 0; qx < Q1D; ++qx)
1368  {
1369  gradX[qx][0] = 0.0;
1370  gradX[qx][1] = 0.0;
1371  }
1372  for (int dx = 0; dx < D1D; ++dx)
1373  {
1374  const double s = X(dx,dy,dz,e);
1375  for (int qx = 0; qx < Q1D; ++qx)
1376  {
1377  gradX[qx][0] += s * B(qx,dx);
1378  gradX[qx][1] += s * G(qx,dx);
1379  }
1380  }
1381  for (int qy = 0; qy < Q1D; ++qy)
1382  {
1383  const double wy = B(qy,dy);
1384  const double wDy = G(qy,dy);
1385  for (int qx = 0; qx < Q1D; ++qx)
1386  {
1387  const double wx = gradX[qx][0];
1388  const double wDx = gradX[qx][1];
1389  gradXY[qy][qx][0] += wDx * wy;
1390  gradXY[qy][qx][1] += wx * wDy;
1391  gradXY[qy][qx][2] += wx * wy;
1392  }
1393  }
1394  }
1395  for (int qz = 0; qz < Q1D; ++qz)
1396  {
1397  const double wz = B(qz,dz);
1398  const double wDz = G(qz,dz);
1399  for (int qy = 0; qy < Q1D; ++qy)
1400  {
1401  for (int qx = 0; qx < Q1D; ++qx)
1402  {
1403  grad[qz][qy][qx][0] += gradXY[qy][qx][0] * wz;
1404  grad[qz][qy][qx][1] += gradXY[qy][qx][1] * wz;
1405  grad[qz][qy][qx][2] += gradXY[qy][qx][2] * wDz;
1406  }
1407  }
1408  }
1409  }
1410  // Calculate Dxyz, xDyz, xyDz in plane
1411  for (int qz = 0; qz < Q1D; ++qz)
1412  {
1413  for (int qy = 0; qy < Q1D; ++qy)
1414  {
1415  for (int qx = 0; qx < Q1D; ++qx)
1416  {
1417  const int q = qx + (qy + qz * Q1D) * Q1D;
1418  const double O11 = D(q,0,e);
1419  const double O12 = D(q,1,e);
1420  const double O13 = D(q,2,e);
1421  const double O21 = symmetric ? O12 : D(q,3,e);
1422  const double O22 = symmetric ? D(q,3,e) : D(q,4,e);
1423  const double O23 = symmetric ? D(q,4,e) : D(q,5,e);
1424  const double O31 = symmetric ? O13 : D(q,6,e);
1425  const double O32 = symmetric ? O23 : D(q,7,e);
1426  const double O33 = symmetric ? D(q,5,e) : D(q,8,e);
1427  const double gradX = grad[qz][qy][qx][0];
1428  const double gradY = grad[qz][qy][qx][1];
1429  const double gradZ = grad[qz][qy][qx][2];
1430  grad[qz][qy][qx][0] = (O11*gradX)+(O12*gradY)+(O13*gradZ);
1431  grad[qz][qy][qx][1] = (O21*gradX)+(O22*gradY)+(O23*gradZ);
1432  grad[qz][qy][qx][2] = (O31*gradX)+(O32*gradY)+(O33*gradZ);
1433  }
1434  }
1435  }
1436  for (int qz = 0; qz < Q1D; ++qz)
1437  {
1438  double gradXY[max_D1D][max_D1D][3];
1439  for (int dy = 0; dy < D1D; ++dy)
1440  {
1441  for (int dx = 0; dx < D1D; ++dx)
1442  {
1443  gradXY[dy][dx][0] = 0;
1444  gradXY[dy][dx][1] = 0;
1445  gradXY[dy][dx][2] = 0;
1446  }
1447  }
1448  for (int qy = 0; qy < Q1D; ++qy)
1449  {
1450  double gradX[max_D1D][3];
1451  for (int dx = 0; dx < D1D; ++dx)
1452  {
1453  gradX[dx][0] = 0;
1454  gradX[dx][1] = 0;
1455  gradX[dx][2] = 0;
1456  }
1457  for (int qx = 0; qx < Q1D; ++qx)
1458  {
1459  const double gX = grad[qz][qy][qx][0];
1460  const double gY = grad[qz][qy][qx][1];
1461  const double gZ = grad[qz][qy][qx][2];
1462  for (int dx = 0; dx < D1D; ++dx)
1463  {
1464  const double wx = Bt(dx,qx);
1465  const double wDx = Gt(dx,qx);
1466  gradX[dx][0] += gX * wDx;
1467  gradX[dx][1] += gY * wx;
1468  gradX[dx][2] += gZ * wx;
1469  }
1470  }
1471  for (int dy = 0; dy < D1D; ++dy)
1472  {
1473  const double wy = Bt(dy,qy);
1474  const double wDy = Gt(dy,qy);
1475  for (int dx = 0; dx < D1D; ++dx)
1476  {
1477  gradXY[dy][dx][0] += gradX[dx][0] * wy;
1478  gradXY[dy][dx][1] += gradX[dx][1] * wDy;
1479  gradXY[dy][dx][2] += gradX[dx][2] * wy;
1480  }
1481  }
1482  }
1483  for (int dz = 0; dz < D1D; ++dz)
1484  {
1485  const double wz = Bt(dz,qz);
1486  const double wDz = Gt(dz,qz);
1487  for (int dy = 0; dy < D1D; ++dy)
1488  {
1489  for (int dx = 0; dx < D1D; ++dx)
1490  {
1491  Y(dx,dy,dz,e) +=
1492  ((gradXY[dy][dx][0] * wz) +
1493  (gradXY[dy][dx][1] * wz) +
1494  (gradXY[dy][dx][2] * wDz));
1495  }
1496  }
1497  }
1498  }
1499  });
1500 }
1501 
1502 // Half of B and G are stored in shared to get B, Bt, G and Gt.
1503 // Indices computation for SmemPADiffusionApply3D.
1504 static MFEM_HOST_DEVICE inline int qi(const int q, const int d, const int Q)
1505 {
1506  return (q<=d) ? q : Q-1-q;
1507 }
1508 
1509 static MFEM_HOST_DEVICE inline int dj(const int q, const int d, const int D)
1510 {
1511  return (q<=d) ? d : D-1-d;
1512 }
1513 
1514 static MFEM_HOST_DEVICE inline int qk(const int q, const int d, const int Q)
1515 {
1516  return (q<=d) ? Q-1-q : q;
1517 }
1518 
1519 static MFEM_HOST_DEVICE inline int dl(const int q, const int d, const int D)
1520 {
1521  return (q<=d) ? D-1-d : d;
1522 }
1523 
1524 static MFEM_HOST_DEVICE inline double sign(const int q, const int d)
1525 {
1526  return (q<=d) ? -1.0 : 1.0;
1527 }
1528 
1529 template<int T_D1D = 0, int T_Q1D = 0>
1530 static void SmemPADiffusionApply3D(const int NE,
1531  const bool symmetric,
1532  const Array<double> &b_,
1533  const Array<double> &g_,
1534  const Vector &d_,
1535  const Vector &x_,
1536  Vector &y_,
1537  const int d1d = 0,
1538  const int q1d = 0)
1539 {
1540  const int D1D = T_D1D ? T_D1D : d1d;
1541  const int Q1D = T_Q1D ? T_Q1D : q1d;
1542  constexpr int M1Q = T_Q1D ? T_Q1D : MAX_Q1D;
1543  constexpr int M1D = T_D1D ? T_D1D : MAX_D1D;
1544  MFEM_VERIFY(D1D <= M1D, "");
1545  MFEM_VERIFY(Q1D <= M1Q, "");
1546  auto b = Reshape(b_.Read(), Q1D, D1D);
1547  auto g = Reshape(g_.Read(), Q1D, D1D);
1548  auto d = Reshape(d_.Read(), Q1D, Q1D, Q1D, symmetric ? 6 : 9, NE);
1549  auto x = Reshape(x_.Read(), D1D, D1D, D1D, NE);
1550  auto y = Reshape(y_.ReadWrite(), D1D, D1D, D1D, NE);
1551  MFEM_FORALL_3D(e, NE, Q1D, Q1D, 1,
1552  {
1553  const int D1D = T_D1D ? T_D1D : d1d;
1554  const int Q1D = T_Q1D ? T_Q1D : q1d;
1555  constexpr int MQ1 = T_Q1D ? T_Q1D : MAX_Q1D;
1556  constexpr int MD1 = T_D1D ? T_D1D : MAX_D1D;
1557  constexpr int MDQ = (MQ1 > MD1) ? MQ1 : MD1;
1558  MFEM_SHARED double sBG[MQ1*MD1];
1559  double (*B)[MD1] = (double (*)[MD1]) sBG;
1560  double (*G)[MD1] = (double (*)[MD1]) sBG;
1561  double (*Bt)[MQ1] = (double (*)[MQ1]) sBG;
1562  double (*Gt)[MQ1] = (double (*)[MQ1]) sBG;
1563  MFEM_SHARED double sm0[3][MDQ*MDQ*MDQ];
1564  MFEM_SHARED double sm1[3][MDQ*MDQ*MDQ];
1565  double (*X)[MD1][MD1] = (double (*)[MD1][MD1]) (sm0+2);
1566  double (*DDQ0)[MD1][MQ1] = (double (*)[MD1][MQ1]) (sm0+0);
1567  double (*DDQ1)[MD1][MQ1] = (double (*)[MD1][MQ1]) (sm0+1);
1568  double (*DQQ0)[MQ1][MQ1] = (double (*)[MQ1][MQ1]) (sm1+0);
1569  double (*DQQ1)[MQ1][MQ1] = (double (*)[MQ1][MQ1]) (sm1+1);
1570  double (*DQQ2)[MQ1][MQ1] = (double (*)[MQ1][MQ1]) (sm1+2);
1571  double (*QQQ0)[MQ1][MQ1] = (double (*)[MQ1][MQ1]) (sm0+0);
1572  double (*QQQ1)[MQ1][MQ1] = (double (*)[MQ1][MQ1]) (sm0+1);
1573  double (*QQQ2)[MQ1][MQ1] = (double (*)[MQ1][MQ1]) (sm0+2);
1574  double (*QQD0)[MQ1][MD1] = (double (*)[MQ1][MD1]) (sm1+0);
1575  double (*QQD1)[MQ1][MD1] = (double (*)[MQ1][MD1]) (sm1+1);
1576  double (*QQD2)[MQ1][MD1] = (double (*)[MQ1][MD1]) (sm1+2);
1577  double (*QDD0)[MD1][MD1] = (double (*)[MD1][MD1]) (sm0+0);
1578  double (*QDD1)[MD1][MD1] = (double (*)[MD1][MD1]) (sm0+1);
1579  double (*QDD2)[MD1][MD1] = (double (*)[MD1][MD1]) (sm0+2);
1580  MFEM_FOREACH_THREAD(dy,y,D1D)
1581  {
1582  MFEM_FOREACH_THREAD(dx,x,D1D)
1583  {
1584  MFEM_UNROLL(MD1)
1585  for (int dz = 0; dz < D1D; ++dz)
1586  {
1587  X[dz][dy][dx] = x(dx,dy,dz,e);
1588  }
1589  }
1590  MFEM_FOREACH_THREAD(qx,x,Q1D)
1591  {
1592  const int i = qi(qx,dy,Q1D);
1593  const int j = dj(qx,dy,D1D);
1594  const int k = qk(qx,dy,Q1D);
1595  const int l = dl(qx,dy,D1D);
1596  B[i][j] = b(qx,dy);
1597  G[k][l] = g(qx,dy) * sign(qx,dy);
1598  }
1599  }
1600  MFEM_SYNC_THREAD;
1601  MFEM_FOREACH_THREAD(dy,y,D1D)
1602  {
1603  MFEM_FOREACH_THREAD(qx,x,Q1D)
1604  {
1605  double u[D1D], v[D1D];
1606  MFEM_UNROLL(MD1)
1607  for (int dz = 0; dz < D1D; dz++) { u[dz] = v[dz] = 0.0; }
1608  MFEM_UNROLL(MD1)
1609  for (int dx = 0; dx < D1D; ++dx)
1610  {
1611  const int i = qi(qx,dx,Q1D);
1612  const int j = dj(qx,dx,D1D);
1613  const int k = qk(qx,dx,Q1D);
1614  const int l = dl(qx,dx,D1D);
1615  const double s = sign(qx,dx);
1616  MFEM_UNROLL(MD1)
1617  for (int dz = 0; dz < D1D; ++dz)
1618  {
1619  const double coords = X[dz][dy][dx];
1620  u[dz] += coords * B[i][j];
1621  v[dz] += coords * G[k][l] * s;
1622  }
1623  }
1624  MFEM_UNROLL(MD1)
1625  for (int dz = 0; dz < D1D; ++dz)
1626  {
1627  DDQ0[dz][dy][qx] = u[dz];
1628  DDQ1[dz][dy][qx] = v[dz];
1629  }
1630  }
1631  }
1632  MFEM_SYNC_THREAD;
1633  MFEM_FOREACH_THREAD(qy,y,Q1D)
1634  {
1635  MFEM_FOREACH_THREAD(qx,x,Q1D)
1636  {
1637  double u[D1D], v[D1D], w[D1D];
1638  MFEM_UNROLL(MD1)
1639  for (int dz = 0; dz < D1D; dz++) { u[dz] = v[dz] = w[dz] = 0.0; }
1640  MFEM_UNROLL(MD1)
1641  for (int dy = 0; dy < D1D; ++dy)
1642  {
1643  const int i = qi(qy,dy,Q1D);
1644  const int j = dj(qy,dy,D1D);
1645  const int k = qk(qy,dy,Q1D);
1646  const int l = dl(qy,dy,D1D);
1647  const double s = sign(qy,dy);
1648  MFEM_UNROLL(MD1)
1649  for (int dz = 0; dz < D1D; dz++)
1650  {
1651  u[dz] += DDQ1[dz][dy][qx] * B[i][j];
1652  v[dz] += DDQ0[dz][dy][qx] * G[k][l] * s;
1653  w[dz] += DDQ0[dz][dy][qx] * B[i][j];
1654  }
1655  }
1656  MFEM_UNROLL(MD1)
1657  for (int dz = 0; dz < D1D; dz++)
1658  {
1659  DQQ0[dz][qy][qx] = u[dz];
1660  DQQ1[dz][qy][qx] = v[dz];
1661  DQQ2[dz][qy][qx] = w[dz];
1662  }
1663  }
1664  }
1665  MFEM_SYNC_THREAD;
1666  MFEM_FOREACH_THREAD(qy,y,Q1D)
1667  {
1668  MFEM_FOREACH_THREAD(qx,x,Q1D)
1669  {
1670  double u[Q1D], v[Q1D], w[Q1D];
1671  MFEM_UNROLL(MQ1)
1672  for (int qz = 0; qz < Q1D; qz++) { u[qz] = v[qz] = w[qz] = 0.0; }
1673  MFEM_UNROLL(MD1)
1674  for (int dz = 0; dz < D1D; ++dz)
1675  {
1676  MFEM_UNROLL(MQ1)
1677  for (int qz = 0; qz < Q1D; qz++)
1678  {
1679  const int i = qi(qz,dz,Q1D);
1680  const int j = dj(qz,dz,D1D);
1681  const int k = qk(qz,dz,Q1D);
1682  const int l = dl(qz,dz,D1D);
1683  const double s = sign(qz,dz);
1684  u[qz] += DQQ0[dz][qy][qx] * B[i][j];
1685  v[qz] += DQQ1[dz][qy][qx] * B[i][j];
1686  w[qz] += DQQ2[dz][qy][qx] * G[k][l] * s;
1687  }
1688  }
1689  MFEM_UNROLL(MQ1)
1690  for (int qz = 0; qz < Q1D; qz++)
1691  {
1692  const double O11 = d(qx,qy,qz,0,e);
1693  const double O12 = d(qx,qy,qz,1,e);
1694  const double O13 = d(qx,qy,qz,2,e);
1695  const double O21 = symmetric ? O12 : d(qx,qy,qz,3,e);
1696  const double O22 = symmetric ? d(qx,qy,qz,3,e) : d(qx,qy,qz,4,e);
1697  const double O23 = symmetric ? d(qx,qy,qz,4,e) : d(qx,qy,qz,5,e);
1698  const double O31 = symmetric ? O13 : d(qx,qy,qz,6,e);
1699  const double O32 = symmetric ? O23 : d(qx,qy,qz,7,e);
1700  const double O33 = symmetric ? d(qx,qy,qz,5,e) : d(qx,qy,qz,8,e);
1701  const double gX = u[qz];
1702  const double gY = v[qz];
1703  const double gZ = w[qz];
1704  QQQ0[qz][qy][qx] = (O11*gX) + (O12*gY) + (O13*gZ);
1705  QQQ1[qz][qy][qx] = (O21*gX) + (O22*gY) + (O23*gZ);
1706  QQQ2[qz][qy][qx] = (O31*gX) + (O32*gY) + (O33*gZ);
1707  }
1708  }
1709  }
1710  MFEM_SYNC_THREAD;
1711  MFEM_FOREACH_THREAD(d,y,D1D)
1712  {
1713  MFEM_FOREACH_THREAD(q,x,Q1D)
1714  {
1715  const int i = qi(q,d,Q1D);
1716  const int j = dj(q,d,D1D);
1717  const int k = qk(q,d,Q1D);
1718  const int l = dl(q,d,D1D);
1719  Bt[j][i] = b(q,d);
1720  Gt[l][k] = g(q,d) * sign(q,d);
1721  }
1722  }
1723  MFEM_SYNC_THREAD;
1724  MFEM_FOREACH_THREAD(qy,y,Q1D)
1725  {
1726  MFEM_FOREACH_THREAD(dx,x,D1D)
1727  {
1728  double u[Q1D], v[Q1D], w[Q1D];
1729  MFEM_UNROLL(MQ1)
1730  for (int qz = 0; qz < Q1D; ++qz) { u[qz] = v[qz] = w[qz] = 0.0; }
1731  MFEM_UNROLL(MQ1)
1732  for (int qx = 0; qx < Q1D; ++qx)
1733  {
1734  const int i = qi(qx,dx,Q1D);
1735  const int j = dj(qx,dx,D1D);
1736  const int k = qk(qx,dx,Q1D);
1737  const int l = dl(qx,dx,D1D);
1738  const double s = sign(qx,dx);
1739  MFEM_UNROLL(MQ1)
1740  for (int qz = 0; qz < Q1D; ++qz)
1741  {
1742  u[qz] += QQQ0[qz][qy][qx] * Gt[l][k] * s;
1743  v[qz] += QQQ1[qz][qy][qx] * Bt[j][i];
1744  w[qz] += QQQ2[qz][qy][qx] * Bt[j][i];
1745  }
1746  }
1747  MFEM_UNROLL(MQ1)
1748  for (int qz = 0; qz < Q1D; ++qz)
1749  {
1750  QQD0[qz][qy][dx] = u[qz];
1751  QQD1[qz][qy][dx] = v[qz];
1752  QQD2[qz][qy][dx] = w[qz];
1753  }
1754  }
1755  }
1756  MFEM_SYNC_THREAD;
1757  MFEM_FOREACH_THREAD(dy,y,D1D)
1758  {
1759  MFEM_FOREACH_THREAD(dx,x,D1D)
1760  {
1761  double u[Q1D], v[Q1D], w[Q1D];
1762  MFEM_UNROLL(MQ1)
1763  for (int qz = 0; qz < Q1D; ++qz) { u[qz] = v[qz] = w[qz] = 0.0; }
1764  MFEM_UNROLL(MQ1)
1765  for (int qy = 0; qy < Q1D; ++qy)
1766  {
1767  const int i = qi(qy,dy,Q1D);
1768  const int j = dj(qy,dy,D1D);
1769  const int k = qk(qy,dy,Q1D);
1770  const int l = dl(qy,dy,D1D);
1771  const double s = sign(qy,dy);
1772  MFEM_UNROLL(MQ1)
1773  for (int qz = 0; qz < Q1D; ++qz)
1774  {
1775  u[qz] += QQD0[qz][qy][dx] * Bt[j][i];
1776  v[qz] += QQD1[qz][qy][dx] * Gt[l][k] * s;
1777  w[qz] += QQD2[qz][qy][dx] * Bt[j][i];
1778  }
1779  }
1780  MFEM_UNROLL(MQ1)
1781  for (int qz = 0; qz < Q1D; ++qz)
1782  {
1783  QDD0[qz][dy][dx] = u[qz];
1784  QDD1[qz][dy][dx] = v[qz];
1785  QDD2[qz][dy][dx] = w[qz];
1786  }
1787  }
1788  }
1789  MFEM_SYNC_THREAD;
1790  MFEM_FOREACH_THREAD(dy,y,D1D)
1791  {
1792  MFEM_FOREACH_THREAD(dx,x,D1D)
1793  {
1794  double u[D1D], v[D1D], w[D1D];
1795  MFEM_UNROLL(MD1)
1796  for (int dz = 0; dz < D1D; ++dz) { u[dz] = v[dz] = w[dz] = 0.0; }
1797  MFEM_UNROLL(MQ1)
1798  for (int qz = 0; qz < Q1D; ++qz)
1799  {
1800  MFEM_UNROLL(MD1)
1801  for (int dz = 0; dz < D1D; ++dz)
1802  {
1803  const int i = qi(qz,dz,Q1D);
1804  const int j = dj(qz,dz,D1D);
1805  const int k = qk(qz,dz,Q1D);
1806  const int l = dl(qz,dz,D1D);
1807  const double s = sign(qz,dz);
1808  u[dz] += QDD0[qz][dy][dx] * Bt[j][i];
1809  v[dz] += QDD1[qz][dy][dx] * Bt[j][i];
1810  w[dz] += QDD2[qz][dy][dx] * Gt[l][k] * s;
1811  }
1812  }
1813  MFEM_UNROLL(MD1)
1814  for (int dz = 0; dz < D1D; ++dz)
1815  {
1816  y(dx,dy,dz,e) += (u[dz] + v[dz] + w[dz]);
1817  }
1818  }
1819  }
1820  });
1821 }
1822 
1823 static void PADiffusionApply(const int dim,
1824  const int D1D,
1825  const int Q1D,
1826  const int NE,
1827  const bool symm,
1828  const Array<double> &B,
1829  const Array<double> &G,
1830  const Array<double> &Bt,
1831  const Array<double> &Gt,
1832  const Vector &D,
1833  const Vector &X,
1834  Vector &Y)
1835 {
1836 #ifdef MFEM_USE_OCCA
1837  if (DeviceCanUseOcca())
1838  {
1839  if (dim == 2)
1840  {
1841  OccaPADiffusionApply2D(D1D,Q1D,NE,B,G,Bt,Gt,D,X,Y);
1842  return;
1843  }
1844  if (dim == 3)
1845  {
1846  OccaPADiffusionApply3D(D1D,Q1D,NE,B,G,Bt,Gt,D,X,Y);
1847  return;
1848  }
1849  MFEM_ABORT("OCCA PADiffusionApply unknown kernel!");
1850  }
1851 #endif // MFEM_USE_OCCA
1852  const int ID = (D1D << 4) | Q1D;
1853 
1854  if (dim == 2)
1855  {
1856  switch (ID)
1857  {
1858  case 0x22: return SmemPADiffusionApply2D<2,2,16>(NE,symm,B,G,D,X,Y);
1859  case 0x33: return SmemPADiffusionApply2D<3,3,16>(NE,symm,B,G,D,X,Y);
1860  case 0x44: return SmemPADiffusionApply2D<4,4,8>(NE,symm,B,G,D,X,Y);
1861  case 0x55: return SmemPADiffusionApply2D<5,5,8>(NE,symm,B,G,D,X,Y);
1862  case 0x66: return SmemPADiffusionApply2D<6,6,4>(NE,symm,B,G,D,X,Y);
1863  case 0x77: return SmemPADiffusionApply2D<7,7,4>(NE,symm,B,G,D,X,Y);
1864  case 0x88: return SmemPADiffusionApply2D<8,8,2>(NE,symm,B,G,D,X,Y);
1865  case 0x99: return SmemPADiffusionApply2D<9,9,2>(NE,symm,B,G,D,X,Y);
1866  default: return PADiffusionApply2D(NE,symm,B,G,Bt,Gt,D,X,Y,D1D,Q1D);
1867  }
1868  }
1869 
1870  if (dim == 3)
1871  {
1872  switch (ID)
1873  {
1874  case 0x23: return SmemPADiffusionApply3D<2,3>(NE,symm,B,G,D,X,Y);
1875  case 0x34: return SmemPADiffusionApply3D<3,4>(NE,symm,B,G,D,X,Y);
1876  case 0x45: return SmemPADiffusionApply3D<4,5>(NE,symm,B,G,D,X,Y);
1877  case 0x46: return SmemPADiffusionApply3D<4,6>(NE,symm,B,G,D,X,Y);
1878  case 0x56: return SmemPADiffusionApply3D<5,6>(NE,symm,B,G,D,X,Y);
1879  case 0x58: return SmemPADiffusionApply3D<5,8>(NE,symm,B,G,D,X,Y);
1880  case 0x67: return SmemPADiffusionApply3D<6,7>(NE,symm,B,G,D,X,Y);
1881  case 0x78: return SmemPADiffusionApply3D<7,8>(NE,symm,B,G,D,X,Y);
1882  case 0x89: return SmemPADiffusionApply3D<8,9>(NE,symm,B,G,D,X,Y);
1883  default: return PADiffusionApply3D(NE,symm,B,G,Bt,Gt,D,X,Y,D1D,Q1D);
1884  }
1885  }
1886  MFEM_ABORT("Unknown kernel.");
1887 }
1888 
1889 // PA Diffusion Apply kernel
1890 void DiffusionIntegrator::AddMultPA(const Vector &x, Vector &y) const
1891 {
1892  if (DeviceCanUseCeed())
1893  {
1894  CeedAddMult(ceedDataPtr, x, y);
1895  }
1896  else
1897  {
1898  PADiffusionApply(dim, dofs1D, quad1D, ne, symmetric,
1899  maps->B, maps->G, maps->Bt, maps->Gt,
1900  pa_data, x, y);
1901  }
1902 }
1903 
1904 } // namespace mfem
int GetNPoints() const
Returns the number of the points in the integration rule.
Definition: intrules.hpp:245
Abstract class for all finite elements.
Definition: fe.hpp:235
int Size() const
Return the logical size of the array.
Definition: array.hpp:124
int GetDim() const
Returns the reference space dimension for the finite element.
Definition: fe.hpp:309
Class for an integration rule - an Array of IntegrationPoint.
Definition: intrules.hpp:90
Memory< T > & GetMemory()
Return a reference to the Memory object used by the Array.
Definition: array.hpp:103
A coefficient that is constant across space and time.
Definition: coefficient.hpp:78
const IntegrationRule & GetElementIntRule(int idx) const
Get the IntegrationRule associated with mesh element idx.
Definition: fespace.hpp:766
void SetSize(int s)
Resize the vector to size s.
Definition: vector.hpp:459
const Geometry::Type geom
Definition: ex1.cpp:40
occa::device & OccaDev()
Return the default occa::device used by MFEM.
Definition: occa.cpp:27
Data type dense matrix using column-major storage.
Definition: densemat.hpp:23
int Size() const
Returns the size of the vector.
Definition: vector.hpp:160
int ndof
Number of degrees of freedom = number of basis functions. When mode is TENSOR, this is the 1D number...
Definition: fe.hpp:165
int GetNE() const
Returns number of elements.
Definition: mesh.hpp:737
const Array< double > & GetWeights() const
Return the quadrature weights in a contiguous array.
Definition: intrules.cpp:85
constexpr int DIM
double * Write(bool on_dev=true)
Shortcut for mfem::Write(vec.GetMemory(), vec.Size(), on_dev).
Definition: vector.hpp:380
std::map< occa_id_t, occa::kernel > occa_kernel_t
Definition: occa.hpp:79
Memory< double > & GetMemory()
Return a reference to the Memory object used by the Vector.
Definition: vector.hpp:183
occa::memory OccaMemoryReadWrite(Memory< T > &mem, size_t size)
Wrap a Memory object as occa::memory for read-write access with the mfem::Device MemoryClass. The returned occa::memory is associated with the default occa::device used by MFEM.
Definition: occa.hpp:59
DeviceTensor< sizeof...(Dims), T > Reshape(T *ptr, Dims...dims)
Wrap a pointer as a DeviceTensor with automatically deduced template parameters.
Definition: dtensor.hpp:134
int GetNE() const
Returns number of elements in the mesh.
Definition: fespace.hpp:427
IntegrationPoint & IntPoint(int i)
Returns a reference to the i-th integration point.
Definition: intrules.hpp:248
Quadrature function coefficient which requires that the quadrature rules used for this coefficient be...
const GeometricFactors * GetGeometricFactors(const IntegrationRule &ir, const int flags)
Return the mesh geometric factors corresponding to the given integration rule.
Definition: mesh.cpp:768
double * ReadWrite(bool on_dev=true)
Shortcut for mfem::ReadWrite(vec.GetMemory(), vec.Size(), on_dev).
Definition: vector.hpp:388
const int MAX_Q1D
Definition: forall.hpp:27
Mesh * GetMesh() const
Returns the mesh.
Definition: fespace.hpp:314
bool DeviceCanUseOcca()
Function that determines if an OCCA kernel should be used, based on the current mfem::Device configur...
Definition: occa.hpp:69
double b
Definition: lissajous.cpp:42
const occa::memory OccaMemoryRead(const Memory< T > &mem, size_t size)
Wrap a Memory object as occa::memory for read only access with the mfem::Device MemoryClass. The returned occa::memory is associated with the default occa::device used by MFEM.
Definition: occa.hpp:37
QuadratureSpace * GetSpace() const
Get the associated QuadratureSpace.
Definition: gridfunc.hpp:711
const T * Read(bool on_dev=true) const
Shortcut for mfem::Read(a.GetMemory(), a.Size(), on_dev).
Definition: array.hpp:290
int Dimension() const
Definition: mesh.hpp:788
void PADiffusionSetup2D< 3 >(const int Q1D, const int coeffDim, const int NE, const Array< double > &w, const Vector &j, const Vector &c, Vector &d)
void PADiffusionSetup2D< 2 >(const int Q1D, const int coeffDim, const int NE, const Array< double > &w, const Vector &j, const Vector &c, Vector &d)
int SpaceDimension() const
Definition: mesh.hpp:789
ElementTransformation * GetElementTransformation(int i) const
Returns ElementTransformation for the i-th element.
Definition: fespace.hpp:460
Class FiniteElementSpace - responsible for providing FEM view of the mesh, mainly managing the set of...
Definition: fespace.hpp:87
occa::memory OccaMemoryWrite(Memory< T > &mem, size_t size)
Wrap a Memory object as occa::memory for write only access with the mfem::Device MemoryClass. The returned occa::memory is associated with the default occa::device used by MFEM.
Definition: occa.hpp:48
virtual const DofToQuad & GetDofToQuad(const IntegrationRule &ir, DofToQuad::Mode mode) const
Return a DofToQuad structure corresponding to the given IntegrationRule using the given DofToQuad::Mo...
Definition: fe.cpp:376
void PADiffusionSetup3D(const int Q1D, const int coeffDim, const int NE, const Array< double > &w, const Vector &j, const Vector &c, Vector &d)
const double * Read(bool on_dev=true) const
Shortcut for mfem::Read(vec.GetMemory(), vec.Size(), on_dev).
Definition: vector.hpp:372
void GetElementTransformation(int i, IsoparametricTransformation *ElTr)
Definition: mesh.cpp:336
int dim
Definition: ex24.cpp:53
const int MAX_D1D
Definition: forall.hpp:26
const FiniteElement * GetFE(int i) const
Returns pointer to the FiniteElement in the FiniteElementCollection associated with i&#39;th element in t...
Definition: fespace.cpp:1798
constexpr int SDIM
double * HostWrite()
Shortcut for mfem::Write(vec.GetMemory(), vec.Size(), false).
Definition: vector.hpp:384
const double alpha
Definition: ex15.cpp:336
Vector data type.
Definition: vector.hpp:51
void MakeRef(Vector &base, int offset, int size)
Reset the Vector to be a reference to a sub-vector of base.
Definition: vector.hpp:517
std::pair< int, int > occa_id_t
Definition: occa.hpp:78
void SetSize(int s)
Change the size of the DenseMatrix to s x s.
Definition: densemat.hpp:90
Class representing a function through its values (scalar or vector) at quadrature points...
Definition: gridfunc.hpp:670