MFEM  v4.4.0
Finite element discretization library
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Pages
bilininteg_diffusion_pa.cpp
Go to the documentation of this file.
1 // Copyright (c) 2010-2022, Lawrence Livermore National Security, LLC. Produced
2 // at the Lawrence Livermore National Laboratory. All Rights reserved. See files
3 // LICENSE and NOTICE for details. LLNL-CODE-806117.
4 //
5 // This file is part of the MFEM library. For more information and source code
6 // availability visit https://mfem.org.
7 //
8 // MFEM is free software; you can redistribute it and/or modify it under the
9 // terms of the BSD-3 license. We welcome feedback and contributions, see file
10 // CONTRIBUTING.md for details.
11 
12 #include "../general/forall.hpp"
13 #include "bilininteg.hpp"
14 #include "gridfunc.hpp"
15 #include "ceed/diffusion.hpp"
16 
17 using namespace std;
18 
19 namespace mfem
20 {
21 
22 // PA Diffusion Integrator
23 
24 // OCCA 2D Assemble kernel
25 #ifdef MFEM_USE_OCCA
26 static void OccaPADiffusionSetup2D(const int D1D,
27  const int Q1D,
28  const int NE,
29  const Array<double> &W,
30  const Vector &J,
31  const Vector &C,
32  Vector &op)
33 {
34  occa::properties props;
35  props["defines/D1D"] = D1D;
36  props["defines/Q1D"] = Q1D;
37  const occa::memory o_W = OccaMemoryRead(W.GetMemory(), W.Size());
38  const occa::memory o_J = OccaMemoryRead(J.GetMemory(), J.Size());
39  const occa::memory o_C = OccaMemoryRead(C.GetMemory(), C.Size());
40  occa::memory o_op = OccaMemoryWrite(op.GetMemory(), op.Size());
41  const bool const_c = C.Size() == 1;
42  const occa_id_t id = std::make_pair(D1D,Q1D);
43  static occa_kernel_t OccaDiffSetup2D_ker;
44  if (OccaDiffSetup2D_ker.find(id) == OccaDiffSetup2D_ker.end())
45  {
46  const occa::kernel DiffusionSetup2D =
47  mfem::OccaDev().buildKernel("occa://mfem/fem/occa.okl",
48  "DiffusionSetup2D", props);
49  OccaDiffSetup2D_ker.emplace(id, DiffusionSetup2D);
50  }
51  OccaDiffSetup2D_ker.at(id)(NE, o_W, o_J, o_C, o_op, const_c);
52 }
53 
54 static void OccaPADiffusionSetup3D(const int D1D,
55  const int Q1D,
56  const int NE,
57  const Array<double> &W,
58  const Vector &J,
59  const Vector &C,
60  Vector &op)
61 {
62  occa::properties props;
63  props["defines/D1D"] = D1D;
64  props["defines/Q1D"] = Q1D;
65  const occa::memory o_W = OccaMemoryRead(W.GetMemory(), W.Size());
66  const occa::memory o_J = OccaMemoryRead(J.GetMemory(), J.Size());
67  const occa::memory o_C = OccaMemoryRead(C.GetMemory(), C.Size());
68  occa::memory o_op = OccaMemoryWrite(op.GetMemory(), op.Size());
69  const bool const_c = C.Size() == 1;
70  const occa_id_t id = std::make_pair(D1D,Q1D);
71  static occa_kernel_t OccaDiffSetup3D_ker;
72  if (OccaDiffSetup3D_ker.find(id) == OccaDiffSetup3D_ker.end())
73  {
74  const occa::kernel DiffusionSetup3D =
75  mfem::OccaDev().buildKernel("occa://mfem/fem/occa.okl",
76  "DiffusionSetup3D", props);
77  OccaDiffSetup3D_ker.emplace(id, DiffusionSetup3D);
78  }
79  OccaDiffSetup3D_ker.at(id)(NE, o_W, o_J, o_C, o_op, const_c);
80 }
81 #endif // MFEM_USE_OCCA
82 
83 template<>
84 void PADiffusionSetup2D<2>(const int Q1D,
85  const int coeffDim,
86  const int NE,
87  const Array<double> &w,
88  const Vector &j,
89  const Vector &c,
90  Vector &d)
91 {
92  const bool symmetric = (coeffDim != 4);
93  const bool const_c = c.Size() == 1;
94  MFEM_VERIFY(coeffDim < 3 ||
95  !const_c, "Constant matrix coefficient not supported");
96  const auto W = Reshape(w.Read(), Q1D,Q1D);
97  const auto J = Reshape(j.Read(), Q1D,Q1D,2,2,NE);
98  const auto C = const_c ? Reshape(c.Read(), 1,1,1,1) :
99  Reshape(c.Read(), coeffDim,Q1D,Q1D,NE);
100  auto D = Reshape(d.Write(), Q1D,Q1D, symmetric ? 3 : 4, NE);
101  MFEM_FORALL_2D(e, NE, Q1D,Q1D,1,
102  {
103  MFEM_FOREACH_THREAD(qx,x,Q1D)
104  {
105  MFEM_FOREACH_THREAD(qy,y,Q1D)
106  {
107  const double J11 = J(qx,qy,0,0,e);
108  const double J21 = J(qx,qy,1,0,e);
109  const double J12 = J(qx,qy,0,1,e);
110  const double J22 = J(qx,qy,1,1,e);
111  const double w_detJ = W(qx,qy) / ((J11*J22)-(J21*J12));
112  if (coeffDim == 3 || coeffDim == 4) // Matrix coefficient
113  {
114  // First compute entries of R = MJ^{-T}, without det J factor.
115  const double M11 = C(0,qx,qy,e);
116  const double M12 = C(1,qx,qy,e);
117  const double M21 = symmetric ? M12 : C(2,qx,qy,e);
118  const double M22 = symmetric ? C(2,qx,qy,e) : C(3,qx,qy,e);
119  const double R11 = M11*J22 - M12*J12;
120  const double R21 = M21*J22 - M22*J12;
121  const double R12 = -M11*J21 + M12*J11;
122  const double R22 = -M21*J21 + M22*J11;
123 
124  // Now set y to J^{-1}R.
125  D(qx,qy,0,e) = w_detJ * ( J22*R11 - J12*R21); // 1,1
126  D(qx,qy,1,e) = w_detJ * (-J21*R11 + J11*R21); // 2,1
127  D(qx,qy,2,e) = w_detJ * (symmetric ? (-J21*R12 + J11*R22) :
128  (J22*R12 - J12*R22)); // 2,2 or 1,2
129  if (!symmetric)
130  {
131  D(qx,qy,3,e) = w_detJ * (-J21*R12 + J11*R22); // 2,2
132  }
133  }
134  else // Vector or scalar coefficient
135  {
136  const double C1 = const_c ? C(0,0,0,0) : C(0,qx,qy,e);
137  const double C2 = const_c ? C(0,0,0,0) :
138  (coeffDim == 2 ? C(1,qx,qy,e) : C(0,qx,qy,e));
139 
140  D(qx,qy,0,e) = w_detJ * (C2*J12*J12 + C1*J22*J22); // 1,1
141  D(qx,qy,1,e) = -w_detJ * (C2*J12*J11 + C1*J22*J21); // 1,2
142  D(qx,qy,2,e) = w_detJ * (C2*J11*J11 + C1*J21*J21); // 2,2
143  }
144  }
145  }
146  });
147 }
148 
149 // PA Diffusion Assemble 2D kernel with 3D node coords
150 template<>
151 void PADiffusionSetup2D<3>(const int Q1D,
152  const int coeffDim,
153  const int NE,
154  const Array<double> &w,
155  const Vector &j,
156  const Vector &c,
157  Vector &d)
158 {
159  MFEM_VERIFY(coeffDim == 1, "Matrix and vector coefficients not supported");
160  constexpr int DIM = 2;
161  constexpr int SDIM = 3;
162  const bool const_c = c.Size() == 1;
163  const auto W = Reshape(w.Read(), Q1D,Q1D);
164  const auto J = Reshape(j.Read(), Q1D,Q1D,SDIM,DIM,NE);
165  const auto C = const_c ? Reshape(c.Read(), 1,1,1) :
166  Reshape(c.Read(), Q1D,Q1D,NE);
167  auto D = Reshape(d.Write(), Q1D,Q1D, 3, NE);
168  MFEM_FORALL_2D(e, NE, Q1D,Q1D,1,
169  {
170  MFEM_FOREACH_THREAD(qx,x,Q1D)
171  {
172  MFEM_FOREACH_THREAD(qy,y,Q1D)
173  {
174  const double wq = W(qx,qy);
175  const double J11 = J(qx,qy,0,0,e);
176  const double J21 = J(qx,qy,1,0,e);
177  const double J31 = J(qx,qy,2,0,e);
178  const double J12 = J(qx,qy,0,1,e);
179  const double J22 = J(qx,qy,1,1,e);
180  const double J32 = J(qx,qy,2,1,e);
181  const double E = J11*J11 + J21*J21 + J31*J31;
182  const double G = J12*J12 + J22*J22 + J32*J32;
183  const double F = J11*J12 + J21*J22 + J31*J32;
184  const double iw = 1.0 / sqrt(E*G - F*F);
185  const double coeff = const_c ? C(0,0,0) : C(qx,qy,e);
186  const double alpha = wq * coeff * iw;
187  D(qx,qy,0,e) = alpha * G; // 1,1
188  D(qx,qy,1,e) = -alpha * F; // 1,2
189  D(qx,qy,2,e) = alpha * E; // 2,2
190  }
191  }
192  });
193 }
194 
195 // PA Diffusion Assemble 3D kernel
196 void PADiffusionSetup3D(const int Q1D,
197  const int coeffDim,
198  const int NE,
199  const Array<double> &w,
200  const Vector &j,
201  const Vector &c,
202  Vector &d)
203 {
204  const bool symmetric = (coeffDim != 9);
205  const bool const_c = c.Size() == 1;
206  MFEM_VERIFY(coeffDim < 6 ||
207  !const_c, "Constant matrix coefficient not supported");
208  const auto W = Reshape(w.Read(), Q1D,Q1D,Q1D);
209  const auto J = Reshape(j.Read(), Q1D,Q1D,Q1D,3,3,NE);
210  const auto C = const_c ? Reshape(c.Read(), 1,1,1,1,1) :
211  Reshape(c.Read(), coeffDim,Q1D,Q1D,Q1D,NE);
212  auto D = Reshape(d.Write(), Q1D,Q1D,Q1D, symmetric ? 6 : 9, NE);
213  MFEM_FORALL_3D(e, NE, Q1D, Q1D, Q1D,
214  {
215  MFEM_FOREACH_THREAD(qx,x,Q1D)
216  {
217  MFEM_FOREACH_THREAD(qy,y,Q1D)
218  {
219  MFEM_FOREACH_THREAD(qz,z,Q1D)
220  {
221  const double J11 = J(qx,qy,qz,0,0,e);
222  const double J21 = J(qx,qy,qz,1,0,e);
223  const double J31 = J(qx,qy,qz,2,0,e);
224  const double J12 = J(qx,qy,qz,0,1,e);
225  const double J22 = J(qx,qy,qz,1,1,e);
226  const double J32 = J(qx,qy,qz,2,1,e);
227  const double J13 = J(qx,qy,qz,0,2,e);
228  const double J23 = J(qx,qy,qz,1,2,e);
229  const double J33 = J(qx,qy,qz,2,2,e);
230  const double detJ = J11 * (J22 * J33 - J32 * J23) -
231  /* */ J21 * (J12 * J33 - J32 * J13) +
232  /* */ J31 * (J12 * J23 - J22 * J13);
233  const double w_detJ = W(qx,qy,qz) / detJ;
234  // adj(J)
235  const double A11 = (J22 * J33) - (J23 * J32);
236  const double A12 = (J32 * J13) - (J12 * J33);
237  const double A13 = (J12 * J23) - (J22 * J13);
238  const double A21 = (J31 * J23) - (J21 * J33);
239  const double A22 = (J11 * J33) - (J13 * J31);
240  const double A23 = (J21 * J13) - (J11 * J23);
241  const double A31 = (J21 * J32) - (J31 * J22);
242  const double A32 = (J31 * J12) - (J11 * J32);
243  const double A33 = (J11 * J22) - (J12 * J21);
244 
245  if (coeffDim == 6 || coeffDim == 9) // Matrix coefficient version
246  {
247  // Compute entries of R = MJ^{-T} = M adj(J)^T, without det J.
248  const double M11 = C(0, qx,qy,qz, e);
249  const double M12 = C(1, qx,qy,qz, e);
250  const double M13 = C(2, qx,qy,qz, e);
251  const double M21 = (!symmetric) ? C(3, qx,qy,qz, e) : M12;
252  const double M22 = (!symmetric) ? C(4, qx,qy,qz, e) : C(3, qx,qy,qz, e);
253  const double M23 = (!symmetric) ? C(5, qx,qy,qz, e) : C(4, qx,qy,qz, e);
254  const double M31 = (!symmetric) ? C(6, qx,qy,qz, e) : M13;
255  const double M32 = (!symmetric) ? C(7, qx,qy,qz, e) : M23;
256  const double M33 = (!symmetric) ? C(8, qx,qy,qz, e) : C(5, qx,qy,qz, e);
257 
258  const double R11 = M11*A11 + M12*A12 + M13*A13;
259  const double R12 = M11*A21 + M12*A22 + M13*A23;
260  const double R13 = M11*A31 + M12*A32 + M13*A33;
261  const double R21 = M21*A11 + M22*A12 + M23*A13;
262  const double R22 = M21*A21 + M22*A22 + M23*A23;
263  const double R23 = M21*A31 + M22*A32 + M23*A33;
264  const double R31 = M31*A11 + M32*A12 + M33*A13;
265  const double R32 = M31*A21 + M32*A22 + M33*A23;
266  const double R33 = M31*A31 + M32*A32 + M33*A33;
267 
268  // Now set D to J^{-1} R = adj(J) R
269  D(qx,qy,qz,0,e) = w_detJ * (A11*R11 + A12*R21 + A13*R31); // 1,1
270  const double D12 = w_detJ * (A11*R12 + A12*R22 + A13*R32);
271  D(qx,qy,qz,1,e) = D12; // 1,2
272  D(qx,qy,qz,2,e) = w_detJ * (A11*R13 + A12*R23 + A13*R33); // 1,3
273 
274  const double D21 = w_detJ * (A21*R11 + A22*R21 + A23*R31);
275  const double D22 = w_detJ * (A21*R12 + A22*R22 + A23*R32);
276  const double D23 = w_detJ * (A21*R13 + A22*R23 + A23*R33);
277 
278  const double D33 = w_detJ * (A31*R13 + A32*R23 + A33*R33);
279 
280  D(qx,qy,qz,3,e) = symmetric ? D22 : D21; // 2,2 or 2,1
281  D(qx,qy,qz,4,e) = symmetric ? D23 : D22; // 2,3 or 2,2
282  D(qx,qy,qz,5,e) = symmetric ? D33 : D23; // 3,3 or 2,3
283 
284  if (!symmetric)
285  {
286  D(qx,qy,qz,6,e) = w_detJ * (A31*R11 + A32*R21 + A33*R31); // 3,1
287  D(qx,qy,qz,7,e) = w_detJ * (A31*R12 + A32*R22 + A33*R32); // 3,2
288  D(qx,qy,qz,8,e) = D33; // 3,3
289  }
290  }
291  else // Vector or scalar coefficient version
292  {
293  const double C1 = const_c ? C(0,0,0,0,0) : C(0,qx,qy,qz,e);
294  const double C2 = const_c ? C(0,0,0,0,0) :
295  (coeffDim == 3 ? C(1,qx,qy,qz,e) : C(0,qx,qy,qz,e));
296  const double C3 = const_c ? C(0,0,0,0,0) :
297  (coeffDim == 3 ? C(2,qx,qy,qz,e) : C(0,qx,qy,qz,e));
298 
299  // detJ J^{-1} J^{-T} = (1/detJ) adj(J) adj(J)^T
300  D(qx,qy,qz,0,e) = w_detJ * (C1*A11*A11 + C2*A12*A12 + C3*A13*A13); // 1,1
301  D(qx,qy,qz,1,e) = w_detJ * (C1*A11*A21 + C2*A12*A22 + C3*A13*A23); // 2,1
302  D(qx,qy,qz,2,e) = w_detJ * (C1*A11*A31 + C2*A12*A32 + C3*A13*A33); // 3,1
303  D(qx,qy,qz,3,e) = w_detJ * (C1*A21*A21 + C2*A22*A22 + C3*A23*A23); // 2,2
304  D(qx,qy,qz,4,e) = w_detJ * (C1*A21*A31 + C2*A22*A32 + C3*A23*A33); // 3,2
305  D(qx,qy,qz,5,e) = w_detJ * (C1*A31*A31 + C2*A32*A32 + C3*A33*A33); // 3,3
306  }
307  }
308  }
309  }
310  });
311 }
312 
313 static void PADiffusionSetup(const int dim,
314  const int sdim,
315  const int D1D,
316  const int Q1D,
317  const int coeffDim,
318  const int NE,
319  const Array<double> &W,
320  const Vector &J,
321  const Vector &C,
322  Vector &D)
323 {
324  if (dim == 1) { MFEM_ABORT("dim==1 not supported in PADiffusionSetup"); }
325  if (dim == 2)
326  {
327 #ifdef MFEM_USE_OCCA
328  if (DeviceCanUseOcca())
329  {
330  OccaPADiffusionSetup2D(D1D, Q1D, NE, W, J, C, D);
331  return;
332  }
333 #else
334  MFEM_CONTRACT_VAR(D1D);
335 #endif // MFEM_USE_OCCA
336  if (sdim == 2) { PADiffusionSetup2D<2>(Q1D, coeffDim, NE, W, J, C, D); }
337  if (sdim == 3) { PADiffusionSetup2D<3>(Q1D, coeffDim, NE, W, J, C, D); }
338  }
339  if (dim == 3)
340  {
341 #ifdef MFEM_USE_OCCA
342  if (DeviceCanUseOcca())
343  {
344  OccaPADiffusionSetup3D(D1D, Q1D, NE, W, J, C, D);
345  return;
346  }
347 #endif // MFEM_USE_OCCA
348  PADiffusionSetup3D(Q1D, coeffDim, NE, W, J, C, D);
349  }
350 }
351 
352 void DiffusionIntegrator::AssemblePA(const FiniteElementSpace &fes)
353 {
354  const MemoryType mt = (pa_mt == MemoryType::DEFAULT) ?
355  Device::GetDeviceMemoryType() : pa_mt;
356  // Assuming the same element type
357  fespace = &fes;
358  Mesh *mesh = fes.GetMesh();
359  if (mesh->GetNE() == 0) { return; }
360  const FiniteElement &el = *fes.GetFE(0);
361  const IntegrationRule *ir = IntRule ? IntRule : &GetRule(el, el);
362  if (DeviceCanUseCeed())
363  {
364  delete ceedOp;
365  MFEM_VERIFY(!VQ && !MQ && !SMQ,
366  "Only scalar coefficient supported for DiffusionIntegrator"
367  " with libCEED");
368  ceedOp = new ceed::PADiffusionIntegrator(fes, *ir, Q);
369  return;
370  }
371  const int dims = el.GetDim();
372  const int symmDims = (dims * (dims + 1)) / 2; // 1x1: 1, 2x2: 3, 3x3: 6
373  const int nq = ir->GetNPoints();
374  dim = mesh->Dimension();
375  ne = fes.GetNE();
376  geom = mesh->GetGeometricFactors(*ir, GeometricFactors::JACOBIANS, mt);
377  const int sdim = mesh->SpaceDimension();
378  maps = &el.GetDofToQuad(*ir, DofToQuad::TENSOR);
379  dofs1D = maps->ndof;
380  quad1D = maps->nqpt;
381  int coeffDim = 1;
382  Vector coeff;
383  const int MQfullDim = MQ ? MQ->GetHeight() * MQ->GetWidth() : 0;
384  if (MQ)
385  {
386  symmetric = false;
387  MFEM_VERIFY(MQ->GetHeight() == dim && MQ->GetWidth() == dim, "");
388 
389  coeffDim = MQfullDim;
390 
391  coeff.SetSize(MQfullDim * nq * ne);
392 
393  DenseMatrix GM;
394  GM.SetSize(dim);
395 
396  auto C = Reshape(coeff.HostWrite(), MQfullDim, nq, ne);
397  for (int e=0; e<ne; ++e)
398  {
400  for (int p=0; p<nq; ++p)
401  {
402  MQ->Eval(GM, *tr, ir->IntPoint(p));
403  for (int i=0; i<dim; ++i)
404  for (int j=0; j<dim; ++j)
405  {
406  C(j+(i*dim), p, e) = GM(i,j);
407  }
408  }
409  }
410  }
411  else if (SMQ)
412  {
413  MFEM_VERIFY(SMQ->GetSize() == dim, "");
414  coeffDim = symmDims;
415  coeff.SetSize(symmDims * nq * ne);
416 
418  SM.SetSize(dim);
419 
420  auto C = Reshape(coeff.HostWrite(), symmDims, nq, ne);
421 
422  for (int e=0; e<ne; ++e)
423  {
425  for (int p=0; p<nq; ++p)
426  {
427  SMQ->Eval(SM, *tr, ir->IntPoint(p));
428  int cnt = 0;
429  for (int i=0; i<dim; ++i)
430  for (int j=i; j<dim; ++j, ++cnt)
431  {
432  C(cnt, p, e) = SM(i,j);
433  }
434  }
435  }
436  }
437  else if (VQ)
438  {
439  MFEM_VERIFY(VQ->GetVDim() == dim, "");
440  coeffDim = VQ->GetVDim();
441  coeff.SetSize(coeffDim * nq * ne);
442  auto C = Reshape(coeff.HostWrite(), coeffDim, nq, ne);
443  Vector DM(coeffDim);
444  for (int e=0; e<ne; ++e)
445  {
447  for (int p=0; p<nq; ++p)
448  {
449  VQ->Eval(DM, *tr, ir->IntPoint(p));
450  for (int i=0; i<coeffDim; ++i)
451  {
452  C(i, p, e) = DM[i];
453  }
454  }
455  }
456  }
457  else if (Q == nullptr)
458  {
459  coeff.SetSize(1);
460  coeff(0) = 1.0;
461  }
462  else if (ConstantCoefficient* cQ = dynamic_cast<ConstantCoefficient*>(Q))
463  {
464  coeff.SetSize(1);
465  coeff(0) = cQ->constant;
466  }
467  else if (QuadratureFunctionCoefficient* qfQ =
468  dynamic_cast<QuadratureFunctionCoefficient*>(Q))
469  {
470  const QuadratureFunction &qFun = qfQ->GetQuadFunction();
471  MFEM_VERIFY(qFun.Size() == ne*nq,
472  "Incompatible QuadratureFunction dimension \n");
473 
474  MFEM_VERIFY(ir == &qFun.GetSpace()->GetElementIntRule(0),
475  "IntegrationRule used within integrator and in"
476  " QuadratureFunction appear to be different");
477  qFun.Read();
478  coeff.MakeRef(const_cast<QuadratureFunction &>(qFun),0);
479  }
480  else
481  {
482  coeff.SetSize(nq * ne);
483  auto C = Reshape(coeff.HostWrite(), nq, ne);
484  for (int e = 0; e < ne; ++e)
485  {
487  for (int q = 0; q < nq; ++q)
488  {
489  C(q,e) = Q->Eval(T, ir->IntPoint(q));
490  }
491  }
492  }
493  pa_data.SetSize((symmetric ? symmDims : MQfullDim) * nq * ne, mt);
494  PADiffusionSetup(dim, sdim, dofs1D, quad1D, coeffDim, ne, ir->GetWeights(),
495  geom->J, coeff, pa_data);
496 }
497 
498 template<int T_D1D = 0, int T_Q1D = 0>
499 static void PADiffusionDiagonal2D(const int NE,
500  const bool symmetric,
501  const Array<double> &b,
502  const Array<double> &g,
503  const Vector &d,
504  Vector &y,
505  const int d1d = 0,
506  const int q1d = 0)
507 {
508  const int D1D = T_D1D ? T_D1D : d1d;
509  const int Q1D = T_Q1D ? T_Q1D : q1d;
510  MFEM_VERIFY(D1D <= MAX_D1D, "");
511  MFEM_VERIFY(Q1D <= MAX_Q1D, "");
512  auto B = Reshape(b.Read(), Q1D, D1D);
513  auto G = Reshape(g.Read(), Q1D, D1D);
514  // note the different shape for D, if this is a symmetric matrix we only
515  // store necessary entries
516  auto D = Reshape(d.Read(), Q1D*Q1D, symmetric ? 3 : 4, NE);
517  auto Y = Reshape(y.ReadWrite(), D1D, D1D, NE);
518  MFEM_FORALL(e, NE,
519  {
520  const int D1D = T_D1D ? T_D1D : d1d;
521  const int Q1D = T_Q1D ? T_Q1D : q1d;
522  constexpr int MD1 = T_D1D ? T_D1D : MAX_D1D;
523  constexpr int MQ1 = T_Q1D ? T_Q1D : MAX_Q1D;
524  // gradphi \cdot Q \gradphi has four terms
525  double QD0[MQ1][MD1];
526  double QD1[MQ1][MD1];
527  double QD2[MQ1][MD1];
528  for (int qx = 0; qx < Q1D; ++qx)
529  {
530  for (int dy = 0; dy < D1D; ++dy)
531  {
532  QD0[qx][dy] = 0.0;
533  QD1[qx][dy] = 0.0;
534  QD2[qx][dy] = 0.0;
535  for (int qy = 0; qy < Q1D; ++qy)
536  {
537  const int q = qx + qy * Q1D;
538  const double D00 = D(q,0,e);
539  const double D10 = D(q,1,e);
540  const double D01 = symmetric ? D10 : D(q,2,e);
541  const double D11 = symmetric ? D(q,2,e) : D(q,3,e);
542  QD0[qx][dy] += B(qy, dy) * B(qy, dy) * D00;
543  QD1[qx][dy] += B(qy, dy) * G(qy, dy) * (D01 + D10);
544  QD2[qx][dy] += G(qy, dy) * G(qy, dy) * D11;
545  }
546  }
547  }
548  for (int dy = 0; dy < D1D; ++dy)
549  {
550  for (int dx = 0; dx < D1D; ++dx)
551  {
552  for (int qx = 0; qx < Q1D; ++qx)
553  {
554  Y(dx,dy,e) += G(qx, dx) * G(qx, dx) * QD0[qx][dy];
555  Y(dx,dy,e) += G(qx, dx) * B(qx, dx) * QD1[qx][dy];
556  Y(dx,dy,e) += B(qx, dx) * B(qx, dx) * QD2[qx][dy];
557  }
558  }
559  }
560  });
561 }
562 
563 // Shared memory PA Diffusion Diagonal 2D kernel
564 template<int T_D1D = 0, int T_Q1D = 0, int T_NBZ = 0>
565 static void SmemPADiffusionDiagonal2D(const int NE,
566  const bool symmetric,
567  const Array<double> &b_,
568  const Array<double> &g_,
569  const Vector &d_,
570  Vector &y_,
571  const int d1d = 0,
572  const int q1d = 0)
573 {
574  const int D1D = T_D1D ? T_D1D : d1d;
575  const int Q1D = T_Q1D ? T_Q1D : q1d;
576  constexpr int NBZ = T_NBZ ? T_NBZ : 1;
577  constexpr int MQ1 = T_Q1D ? T_Q1D : MAX_Q1D;
578  constexpr int MD1 = T_D1D ? T_D1D : MAX_D1D;
579  MFEM_VERIFY(D1D <= MD1, "");
580  MFEM_VERIFY(Q1D <= MQ1, "");
581  auto b = Reshape(b_.Read(), Q1D, D1D);
582  auto g = Reshape(g_.Read(), Q1D, D1D);
583  auto D = Reshape(d_.Read(), Q1D*Q1D, symmetric ? 3 : 4, NE);
584  auto Y = Reshape(y_.ReadWrite(), D1D, D1D, NE);
585  MFEM_FORALL_2D(e, NE, Q1D, Q1D, NBZ,
586  {
587  const int tidz = MFEM_THREAD_ID(z);
588  const int D1D = T_D1D ? T_D1D : d1d;
589  const int Q1D = T_Q1D ? T_Q1D : q1d;
590  constexpr int NBZ = T_NBZ ? T_NBZ : 1;
591  constexpr int MQ1 = T_Q1D ? T_Q1D : MAX_Q1D;
592  constexpr int MD1 = T_D1D ? T_D1D : MAX_D1D;
593  MFEM_SHARED double BG[2][MQ1*MD1];
594  double (*B)[MD1] = (double (*)[MD1]) (BG+0);
595  double (*G)[MD1] = (double (*)[MD1]) (BG+1);
596  MFEM_SHARED double QD[3][NBZ][MD1][MQ1];
597  double (*QD0)[MD1] = (double (*)[MD1])(QD[0] + tidz);
598  double (*QD1)[MD1] = (double (*)[MD1])(QD[1] + tidz);
599  double (*QD2)[MD1] = (double (*)[MD1])(QD[2] + tidz);
600  if (tidz == 0)
601  {
602  MFEM_FOREACH_THREAD(d,y,D1D)
603  {
604  MFEM_FOREACH_THREAD(q,x,Q1D)
605  {
606  B[q][d] = b(q,d);
607  G[q][d] = g(q,d);
608  }
609  }
610  }
611  MFEM_SYNC_THREAD;
612  MFEM_FOREACH_THREAD(qx,x,Q1D)
613  {
614  MFEM_FOREACH_THREAD(dy,y,D1D)
615  {
616  QD0[qx][dy] = 0.0;
617  QD1[qx][dy] = 0.0;
618  QD2[qx][dy] = 0.0;
619  for (int qy = 0; qy < Q1D; ++qy)
620  {
621  const int q = qx + qy * Q1D;
622  const double D00 = D(q,0,e);
623  const double D10 = D(q,1,e);
624  const double D01 = symmetric ? D10 : D(q,2,e);
625  const double D11 = symmetric ? D(q,2,e) : D(q,3,e);
626  const double By = B[qy][dy];
627  const double Gy = G[qy][dy];
628  const double BBy = By * By;
629  const double BGy = By * Gy;
630  const double GGy = Gy * Gy;
631  QD0[qx][dy] += BBy * D00;
632  QD1[qx][dy] += BGy * (D01 + D10);
633  QD2[qx][dy] += GGy * D11;
634  }
635  }
636  }
637  MFEM_SYNC_THREAD;
638  MFEM_FOREACH_THREAD(dy,y,D1D)
639  {
640  MFEM_FOREACH_THREAD(dx,x,D1D)
641  {
642  for (int qx = 0; qx < Q1D; ++qx)
643  {
644  const double Bx = B[qx][dx];
645  const double Gx = G[qx][dx];
646  const double BBx = Bx * Bx;
647  const double BGx = Bx * Gx;
648  const double GGx = Gx * Gx;
649  Y(dx,dy,e) += GGx * QD0[qx][dy];
650  Y(dx,dy,e) += BGx * QD1[qx][dy];
651  Y(dx,dy,e) += BBx * QD2[qx][dy];
652  }
653  }
654  }
655  });
656 }
657 
658 template<int T_D1D = 0, int T_Q1D = 0>
659 static void PADiffusionDiagonal3D(const int NE,
660  const bool symmetric,
661  const Array<double> &b,
662  const Array<double> &g,
663  const Vector &d,
664  Vector &y,
665  const int d1d = 0,
666  const int q1d = 0)
667 {
668  constexpr int DIM = 3;
669  const int D1D = T_D1D ? T_D1D : d1d;
670  const int Q1D = T_Q1D ? T_Q1D : q1d;
671  constexpr int MQ1 = T_Q1D ? T_Q1D : MAX_Q1D;
672  constexpr int MD1 = T_D1D ? T_D1D : MAX_D1D;
673  MFEM_VERIFY(D1D <= MD1, "");
674  MFEM_VERIFY(Q1D <= MQ1, "");
675  auto B = Reshape(b.Read(), Q1D, D1D);
676  auto G = Reshape(g.Read(), Q1D, D1D);
677  auto Q = Reshape(d.Read(), Q1D*Q1D*Q1D, symmetric ? 6 : 9, NE);
678  auto Y = Reshape(y.ReadWrite(), D1D, D1D, D1D, NE);
679  MFEM_FORALL(e, NE,
680  {
681  const int D1D = T_D1D ? T_D1D : d1d;
682  const int Q1D = T_Q1D ? T_Q1D : q1d;
683  constexpr int MD1 = T_D1D ? T_D1D : MAX_D1D;
684  constexpr int MQ1 = T_Q1D ? T_Q1D : MAX_Q1D;
685  double QQD[MQ1][MQ1][MD1];
686  double QDD[MQ1][MD1][MD1];
687  for (int i = 0; i < DIM; ++i)
688  {
689  for (int j = 0; j < DIM; ++j)
690  {
691  // first tensor contraction, along z direction
692  for (int qx = 0; qx < Q1D; ++qx)
693  {
694  for (int qy = 0; qy < Q1D; ++qy)
695  {
696  for (int dz = 0; dz < D1D; ++dz)
697  {
698  QQD[qx][qy][dz] = 0.0;
699  for (int qz = 0; qz < Q1D; ++qz)
700  {
701  const int q = qx + (qy + qz * Q1D) * Q1D;
702  const int ksym = j >= i ?
703  3 - (3-i)*(2-i)/2 + j:
704  3 - (3-j)*(2-j)/2 + i;
705  const int k = symmetric ? ksym : (i*DIM) + j;
706  const double O = Q(q,k,e);
707  const double Bz = B(qz,dz);
708  const double Gz = G(qz,dz);
709  const double L = i==2 ? Gz : Bz;
710  const double R = j==2 ? Gz : Bz;
711  QQD[qx][qy][dz] += L * O * R;
712  }
713  }
714  }
715  }
716  // second tensor contraction, along y direction
717  for (int qx = 0; qx < Q1D; ++qx)
718  {
719  for (int dz = 0; dz < D1D; ++dz)
720  {
721  for (int dy = 0; dy < D1D; ++dy)
722  {
723  QDD[qx][dy][dz] = 0.0;
724  for (int qy = 0; qy < Q1D; ++qy)
725  {
726  const double By = B(qy,dy);
727  const double Gy = G(qy,dy);
728  const double L = i==1 ? Gy : By;
729  const double R = j==1 ? Gy : By;
730  QDD[qx][dy][dz] += L * QQD[qx][qy][dz] * R;
731  }
732  }
733  }
734  }
735  // third tensor contraction, along x direction
736  for (int dz = 0; dz < D1D; ++dz)
737  {
738  for (int dy = 0; dy < D1D; ++dy)
739  {
740  for (int dx = 0; dx < D1D; ++dx)
741  {
742  for (int qx = 0; qx < Q1D; ++qx)
743  {
744  const double Bx = B(qx,dx);
745  const double Gx = G(qx,dx);
746  const double L = i==0 ? Gx : Bx;
747  const double R = j==0 ? Gx : Bx;
748  Y(dx, dy, dz, e) += L * QDD[qx][dy][dz] * R;
749  }
750  }
751  }
752  }
753  }
754  }
755  });
756 }
757 
758 // Shared memory PA Diffusion Diagonal 3D kernel
759 template<int T_D1D = 0, int T_Q1D = 0>
760 static void SmemPADiffusionDiagonal3D(const int NE,
761  const bool symmetric,
762  const Array<double> &b_,
763  const Array<double> &g_,
764  const Vector &d_,
765  Vector &y_,
766  const int d1d = 0,
767  const int q1d = 0)
768 {
769  constexpr int DIM = 3;
770  const int D1D = T_D1D ? T_D1D : d1d;
771  const int Q1D = T_Q1D ? T_Q1D : q1d;
772  constexpr int MQ1 = T_Q1D ? T_Q1D : MAX_Q1D;
773  constexpr int MD1 = T_D1D ? T_D1D : MAX_D1D;
774  MFEM_VERIFY(D1D <= MD1, "");
775  MFEM_VERIFY(Q1D <= MQ1, "");
776  auto b = Reshape(b_.Read(), Q1D, D1D);
777  auto g = Reshape(g_.Read(), Q1D, D1D);
778  auto D = Reshape(d_.Read(), Q1D*Q1D*Q1D, symmetric ? 6 : 9, NE);
779  auto Y = Reshape(y_.ReadWrite(), D1D, D1D, D1D, NE);
780  MFEM_FORALL_3D(e, NE, Q1D, Q1D, Q1D,
781  {
782  const int tidz = MFEM_THREAD_ID(z);
783  const int D1D = T_D1D ? T_D1D : d1d;
784  const int Q1D = T_Q1D ? T_Q1D : q1d;
785  constexpr int MQ1 = T_Q1D ? T_Q1D : MAX_Q1D;
786  constexpr int MD1 = T_D1D ? T_D1D : MAX_D1D;
787  MFEM_SHARED double BG[2][MQ1*MD1];
788  double (*B)[MD1] = (double (*)[MD1]) (BG+0);
789  double (*G)[MD1] = (double (*)[MD1]) (BG+1);
790  MFEM_SHARED double QQD[MQ1][MQ1][MD1];
791  MFEM_SHARED double QDD[MQ1][MD1][MD1];
792  if (tidz == 0)
793  {
794  MFEM_FOREACH_THREAD(d,y,D1D)
795  {
796  MFEM_FOREACH_THREAD(q,x,Q1D)
797  {
798  B[q][d] = b(q,d);
799  G[q][d] = g(q,d);
800  }
801  }
802  }
803  MFEM_SYNC_THREAD;
804  for (int i = 0; i < DIM; ++i)
805  {
806  for (int j = 0; j < DIM; ++j)
807  {
808  // first tensor contraction, along z direction
809  MFEM_FOREACH_THREAD(qx,x,Q1D)
810  {
811  MFEM_FOREACH_THREAD(qy,y,Q1D)
812  {
813  MFEM_FOREACH_THREAD(dz,z,D1D)
814  {
815  QQD[qx][qy][dz] = 0.0;
816  for (int qz = 0; qz < Q1D; ++qz)
817  {
818  const int q = qx + (qy + qz * Q1D) * Q1D;
819  const int ksym = j >= i ?
820  3 - (3-i)*(2-i)/2 + j:
821  3 - (3-j)*(2-j)/2 + i;
822  const int k = symmetric ? ksym : (i*DIM) + j;
823  const double O = D(q,k,e);
824  const double Bz = B[qz][dz];
825  const double Gz = G[qz][dz];
826  const double L = i==2 ? Gz : Bz;
827  const double R = j==2 ? Gz : Bz;
828  QQD[qx][qy][dz] += L * O * R;
829  }
830  }
831  }
832  }
833  MFEM_SYNC_THREAD;
834  // second tensor contraction, along y direction
835  MFEM_FOREACH_THREAD(qx,x,Q1D)
836  {
837  MFEM_FOREACH_THREAD(dz,z,D1D)
838  {
839  MFEM_FOREACH_THREAD(dy,y,D1D)
840  {
841  QDD[qx][dy][dz] = 0.0;
842  for (int qy = 0; qy < Q1D; ++qy)
843  {
844  const double By = B[qy][dy];
845  const double Gy = G[qy][dy];
846  const double L = i==1 ? Gy : By;
847  const double R = j==1 ? Gy : By;
848  QDD[qx][dy][dz] += L * QQD[qx][qy][dz] * R;
849  }
850  }
851  }
852  }
853  MFEM_SYNC_THREAD;
854  // third tensor contraction, along x direction
855  MFEM_FOREACH_THREAD(dz,z,D1D)
856  {
857  MFEM_FOREACH_THREAD(dy,y,D1D)
858  {
859  MFEM_FOREACH_THREAD(dx,x,D1D)
860  {
861  for (int qx = 0; qx < Q1D; ++qx)
862  {
863  const double Bx = B[qx][dx];
864  const double Gx = G[qx][dx];
865  const double L = i==0 ? Gx : Bx;
866  const double R = j==0 ? Gx : Bx;
867  Y(dx, dy, dz, e) += L * QDD[qx][dy][dz] * R;
868  }
869  }
870  }
871  }
872  }
873  }
874  });
875 }
876 
877 static void PADiffusionAssembleDiagonal(const int dim,
878  const int D1D,
879  const int Q1D,
880  const int NE,
881  const bool symm,
882  const Array<double> &B,
883  const Array<double> &G,
884  const Vector &D,
885  Vector &Y)
886 {
887  if (dim == 2)
888  {
889  switch ((D1D << 4 ) | Q1D)
890  {
891  case 0x22: return SmemPADiffusionDiagonal2D<2,2,8>(NE,symm,B,G,D,Y);
892  case 0x33: return SmemPADiffusionDiagonal2D<3,3,8>(NE,symm,B,G,D,Y);
893  case 0x44: return SmemPADiffusionDiagonal2D<4,4,4>(NE,symm,B,G,D,Y);
894  case 0x55: return SmemPADiffusionDiagonal2D<5,5,4>(NE,symm,B,G,D,Y);
895  case 0x66: return SmemPADiffusionDiagonal2D<6,6,2>(NE,symm,B,G,D,Y);
896  case 0x77: return SmemPADiffusionDiagonal2D<7,7,2>(NE,symm,B,G,D,Y);
897  case 0x88: return SmemPADiffusionDiagonal2D<8,8,1>(NE,symm,B,G,D,Y);
898  case 0x99: return SmemPADiffusionDiagonal2D<9,9,1>(NE,symm,B,G,D,Y);
899  default: return PADiffusionDiagonal2D(NE,symm,B,G,D,Y,D1D,Q1D);
900  }
901  }
902  else if (dim == 3)
903  {
904  switch ((D1D << 4 ) | Q1D)
905  {
906  case 0x22: return SmemPADiffusionDiagonal3D<2,2>(NE,symm,B,G,D,Y);
907  case 0x23: return SmemPADiffusionDiagonal3D<2,3>(NE,symm,B,G,D,Y);
908  case 0x34: return SmemPADiffusionDiagonal3D<3,4>(NE,symm,B,G,D,Y);
909  case 0x45: return SmemPADiffusionDiagonal3D<4,5>(NE,symm,B,G,D,Y);
910  case 0x46: return SmemPADiffusionDiagonal3D<4,6>(NE,symm,B,G,D,Y);
911  case 0x56: return SmemPADiffusionDiagonal3D<5,6>(NE,symm,B,G,D,Y);
912  case 0x67: return SmemPADiffusionDiagonal3D<6,7>(NE,symm,B,G,D,Y);
913  case 0x78: return SmemPADiffusionDiagonal3D<7,8>(NE,symm,B,G,D,Y);
914  case 0x89: return SmemPADiffusionDiagonal3D<8,9>(NE,symm,B,G,D,Y);
915  case 0x9A: return SmemPADiffusionDiagonal3D<9,10>(NE,symm,B,G,D,Y);
916  default: return PADiffusionDiagonal3D(NE,symm,B,G,D,Y,D1D,Q1D);
917  }
918  }
919  MFEM_ABORT("Unknown kernel.");
920 }
921 
922 void DiffusionIntegrator::AssembleDiagonalPA(Vector &diag)
923 {
924  if (DeviceCanUseCeed())
925  {
926  ceedOp->GetDiagonal(diag);
927  }
928  else
929  {
930  if (pa_data.Size()==0) { AssemblePA(*fespace); }
931  PADiffusionAssembleDiagonal(dim, dofs1D, quad1D, ne, symmetric,
932  maps->B, maps->G, pa_data, diag);
933  }
934 }
935 
936 
937 #ifdef MFEM_USE_OCCA
938 // OCCA PA Diffusion Apply 2D kernel
939 static void OccaPADiffusionApply2D(const int D1D,
940  const int Q1D,
941  const int NE,
942  const Array<double> &B,
943  const Array<double> &G,
944  const Array<double> &Bt,
945  const Array<double> &Gt,
946  const Vector &D,
947  const Vector &X,
948  Vector &Y)
949 {
950  occa::properties props;
951  props["defines/D1D"] = D1D;
952  props["defines/Q1D"] = Q1D;
953  const occa::memory o_B = OccaMemoryRead(B.GetMemory(), B.Size());
954  const occa::memory o_G = OccaMemoryRead(G.GetMemory(), G.Size());
955  const occa::memory o_Bt = OccaMemoryRead(Bt.GetMemory(), Bt.Size());
956  const occa::memory o_Gt = OccaMemoryRead(Gt.GetMemory(), Gt.Size());
957  const occa::memory o_D = OccaMemoryRead(D.GetMemory(), D.Size());
958  const occa::memory o_X = OccaMemoryRead(X.GetMemory(), X.Size());
959  occa::memory o_Y = OccaMemoryReadWrite(Y.GetMemory(), Y.Size());
960  const occa_id_t id = std::make_pair(D1D,Q1D);
961  if (!Device::Allows(Backend::OCCA_CUDA))
962  {
963  static occa_kernel_t OccaDiffApply2D_cpu;
964  if (OccaDiffApply2D_cpu.find(id) == OccaDiffApply2D_cpu.end())
965  {
966  const occa::kernel DiffusionApply2D_CPU =
967  mfem::OccaDev().buildKernel("occa://mfem/fem/occa.okl",
968  "DiffusionApply2D_CPU", props);
969  OccaDiffApply2D_cpu.emplace(id, DiffusionApply2D_CPU);
970  }
971  OccaDiffApply2D_cpu.at(id)(NE, o_B, o_G, o_Bt, o_Gt, o_D, o_X, o_Y);
972  }
973  else
974  {
975  static occa_kernel_t OccaDiffApply2D_gpu;
976  if (OccaDiffApply2D_gpu.find(id) == OccaDiffApply2D_gpu.end())
977  {
978  const occa::kernel DiffusionApply2D_GPU =
979  mfem::OccaDev().buildKernel("occa://mfem/fem/occa.okl",
980  "DiffusionApply2D_GPU", props);
981  OccaDiffApply2D_gpu.emplace(id, DiffusionApply2D_GPU);
982  }
983  OccaDiffApply2D_gpu.at(id)(NE, o_B, o_G, o_Bt, o_Gt, o_D, o_X, o_Y);
984  }
985 }
986 
987 // OCCA PA Diffusion Apply 3D kernel
988 static void OccaPADiffusionApply3D(const int D1D,
989  const int Q1D,
990  const int NE,
991  const Array<double> &B,
992  const Array<double> &G,
993  const Array<double> &Bt,
994  const Array<double> &Gt,
995  const Vector &D,
996  const Vector &X,
997  Vector &Y)
998 {
999  occa::properties props;
1000  props["defines/D1D"] = D1D;
1001  props["defines/Q1D"] = Q1D;
1002  const occa::memory o_B = OccaMemoryRead(B.GetMemory(), B.Size());
1003  const occa::memory o_G = OccaMemoryRead(G.GetMemory(), G.Size());
1004  const occa::memory o_Bt = OccaMemoryRead(Bt.GetMemory(), Bt.Size());
1005  const occa::memory o_Gt = OccaMemoryRead(Gt.GetMemory(), Gt.Size());
1006  const occa::memory o_D = OccaMemoryRead(D.GetMemory(), D.Size());
1007  const occa::memory o_X = OccaMemoryRead(X.GetMemory(), X.Size());
1008  occa::memory o_Y = OccaMemoryReadWrite(Y.GetMemory(), Y.Size());
1009  const occa_id_t id = std::make_pair(D1D,Q1D);
1010  if (!Device::Allows(Backend::OCCA_CUDA))
1011  {
1012  static occa_kernel_t OccaDiffApply3D_cpu;
1013  if (OccaDiffApply3D_cpu.find(id) == OccaDiffApply3D_cpu.end())
1014  {
1015  const occa::kernel DiffusionApply3D_CPU =
1016  mfem::OccaDev().buildKernel("occa://mfem/fem/occa.okl",
1017  "DiffusionApply3D_CPU", props);
1018  OccaDiffApply3D_cpu.emplace(id, DiffusionApply3D_CPU);
1019  }
1020  OccaDiffApply3D_cpu.at(id)(NE, o_B, o_G, o_Bt, o_Gt, o_D, o_X, o_Y);
1021  }
1022  else
1023  {
1024  static occa_kernel_t OccaDiffApply3D_gpu;
1025  if (OccaDiffApply3D_gpu.find(id) == OccaDiffApply3D_gpu.end())
1026  {
1027  const occa::kernel DiffusionApply3D_GPU =
1028  mfem::OccaDev().buildKernel("occa://mfem/fem/occa.okl",
1029  "DiffusionApply3D_GPU", props);
1030  OccaDiffApply3D_gpu.emplace(id, DiffusionApply3D_GPU);
1031  }
1032  OccaDiffApply3D_gpu.at(id)(NE, o_B, o_G, o_Bt, o_Gt, o_D, o_X, o_Y);
1033  }
1034 }
1035 #endif // MFEM_USE_OCCA
1036 
1037 // PA Diffusion Apply 2D kernel
1038 template<int T_D1D = 0, int T_Q1D = 0>
1039 static void PADiffusionApply2D(const int NE,
1040  const bool symmetric,
1041  const Array<double> &b_,
1042  const Array<double> &g_,
1043  const Array<double> &bt_,
1044  const Array<double> &gt_,
1045  const Vector &d_,
1046  const Vector &x_,
1047  Vector &y_,
1048  const int d1d = 0,
1049  const int q1d = 0)
1050 {
1051  const int D1D = T_D1D ? T_D1D : d1d;
1052  const int Q1D = T_Q1D ? T_Q1D : q1d;
1053  MFEM_VERIFY(D1D <= MAX_D1D, "");
1054  MFEM_VERIFY(Q1D <= MAX_Q1D, "");
1055  auto B = Reshape(b_.Read(), Q1D, D1D);
1056  auto G = Reshape(g_.Read(), Q1D, D1D);
1057  auto Bt = Reshape(bt_.Read(), D1D, Q1D);
1058  auto Gt = Reshape(gt_.Read(), D1D, Q1D);
1059  auto D = Reshape(d_.Read(), Q1D*Q1D, symmetric ? 3 : 4, NE);
1060  auto X = Reshape(x_.Read(), D1D, D1D, NE);
1061  auto Y = Reshape(y_.ReadWrite(), D1D, D1D, NE);
1062  MFEM_FORALL(e, NE,
1063  {
1064  const int D1D = T_D1D ? T_D1D : d1d;
1065  const int Q1D = T_Q1D ? T_Q1D : q1d;
1066  // the following variables are evaluated at compile time
1067  constexpr int max_D1D = T_D1D ? T_D1D : MAX_D1D;
1068  constexpr int max_Q1D = T_Q1D ? T_Q1D : MAX_Q1D;
1069 
1070  double grad[max_Q1D][max_Q1D][2];
1071  for (int qy = 0; qy < Q1D; ++qy)
1072  {
1073  for (int qx = 0; qx < Q1D; ++qx)
1074  {
1075  grad[qy][qx][0] = 0.0;
1076  grad[qy][qx][1] = 0.0;
1077  }
1078  }
1079  for (int dy = 0; dy < D1D; ++dy)
1080  {
1081  double gradX[max_Q1D][2];
1082  for (int qx = 0; qx < Q1D; ++qx)
1083  {
1084  gradX[qx][0] = 0.0;
1085  gradX[qx][1] = 0.0;
1086  }
1087  for (int dx = 0; dx < D1D; ++dx)
1088  {
1089  const double s = X(dx,dy,e);
1090  for (int qx = 0; qx < Q1D; ++qx)
1091  {
1092  gradX[qx][0] += s * B(qx,dx);
1093  gradX[qx][1] += s * G(qx,dx);
1094  }
1095  }
1096  for (int qy = 0; qy < Q1D; ++qy)
1097  {
1098  const double wy = B(qy,dy);
1099  const double wDy = G(qy,dy);
1100  for (int qx = 0; qx < Q1D; ++qx)
1101  {
1102  grad[qy][qx][0] += gradX[qx][1] * wy;
1103  grad[qy][qx][1] += gradX[qx][0] * wDy;
1104  }
1105  }
1106  }
1107  // Calculate Dxy, xDy in plane
1108  for (int qy = 0; qy < Q1D; ++qy)
1109  {
1110  for (int qx = 0; qx < Q1D; ++qx)
1111  {
1112  const int q = qx + qy * Q1D;
1113 
1114  const double O11 = D(q,0,e);
1115  const double O21 = D(q,1,e);
1116  const double O12 = symmetric ? O21 : D(q,2,e);
1117  const double O22 = symmetric ? D(q,2,e) : D(q,3,e);
1118 
1119  const double gradX = grad[qy][qx][0];
1120  const double gradY = grad[qy][qx][1];
1121 
1122  grad[qy][qx][0] = (O11 * gradX) + (O12 * gradY);
1123  grad[qy][qx][1] = (O21 * gradX) + (O22 * gradY);
1124  }
1125  }
1126  for (int qy = 0; qy < Q1D; ++qy)
1127  {
1128  double gradX[max_D1D][2];
1129  for (int dx = 0; dx < D1D; ++dx)
1130  {
1131  gradX[dx][0] = 0;
1132  gradX[dx][1] = 0;
1133  }
1134  for (int qx = 0; qx < Q1D; ++qx)
1135  {
1136  const double gX = grad[qy][qx][0];
1137  const double gY = grad[qy][qx][1];
1138  for (int dx = 0; dx < D1D; ++dx)
1139  {
1140  const double wx = Bt(dx,qx);
1141  const double wDx = Gt(dx,qx);
1142  gradX[dx][0] += gX * wDx;
1143  gradX[dx][1] += gY * wx;
1144  }
1145  }
1146  for (int dy = 0; dy < D1D; ++dy)
1147  {
1148  const double wy = Bt(dy,qy);
1149  const double wDy = Gt(dy,qy);
1150  for (int dx = 0; dx < D1D; ++dx)
1151  {
1152  Y(dx,dy,e) += ((gradX[dx][0] * wy) + (gradX[dx][1] * wDy));
1153  }
1154  }
1155  }
1156  });
1157 }
1158 
1159 // Shared memory PA Diffusion Apply 2D kernel
1160 template<int T_D1D = 0, int T_Q1D = 0, int T_NBZ = 0>
1161 static void SmemPADiffusionApply2D(const int NE,
1162  const bool symmetric,
1163  const Array<double> &b_,
1164  const Array<double> &g_,
1165  const Vector &d_,
1166  const Vector &x_,
1167  Vector &y_,
1168  const int d1d = 0,
1169  const int q1d = 0)
1170 {
1171  const int D1D = T_D1D ? T_D1D : d1d;
1172  const int Q1D = T_Q1D ? T_Q1D : q1d;
1173  constexpr int NBZ = T_NBZ ? T_NBZ : 1;
1174  constexpr int MQ1 = T_Q1D ? T_Q1D : MAX_Q1D;
1175  constexpr int MD1 = T_D1D ? T_D1D : MAX_D1D;
1176  MFEM_VERIFY(D1D <= MD1, "");
1177  MFEM_VERIFY(Q1D <= MQ1, "");
1178  auto b = Reshape(b_.Read(), Q1D, D1D);
1179  auto g = Reshape(g_.Read(), Q1D, D1D);
1180  auto D = Reshape(d_.Read(), Q1D*Q1D, symmetric ? 3 : 4, NE);
1181  auto x = Reshape(x_.Read(), D1D, D1D, NE);
1182  auto Y = Reshape(y_.ReadWrite(), D1D, D1D, NE);
1183  MFEM_FORALL_2D(e, NE, Q1D, Q1D, NBZ,
1184  {
1185  const int tidz = MFEM_THREAD_ID(z);
1186  const int D1D = T_D1D ? T_D1D : d1d;
1187  const int Q1D = T_Q1D ? T_Q1D : q1d;
1188  constexpr int NBZ = T_NBZ ? T_NBZ : 1;
1189  constexpr int MQ1 = T_Q1D ? T_Q1D : MAX_Q1D;
1190  constexpr int MD1 = T_D1D ? T_D1D : MAX_D1D;
1191  MFEM_SHARED double sBG[2][MQ1*MD1];
1192  double (*B)[MD1] = (double (*)[MD1]) (sBG+0);
1193  double (*G)[MD1] = (double (*)[MD1]) (sBG+1);
1194  double (*Bt)[MQ1] = (double (*)[MQ1]) (sBG+0);
1195  double (*Gt)[MQ1] = (double (*)[MQ1]) (sBG+1);
1196  MFEM_SHARED double Xz[NBZ][MD1][MD1];
1197  MFEM_SHARED double GD[2][NBZ][MD1][MQ1];
1198  MFEM_SHARED double GQ[2][NBZ][MD1][MQ1];
1199  double (*X)[MD1] = (double (*)[MD1])(Xz + tidz);
1200  double (*DQ0)[MD1] = (double (*)[MD1])(GD[0] + tidz);
1201  double (*DQ1)[MD1] = (double (*)[MD1])(GD[1] + tidz);
1202  double (*QQ0)[MD1] = (double (*)[MD1])(GQ[0] + tidz);
1203  double (*QQ1)[MD1] = (double (*)[MD1])(GQ[1] + tidz);
1204  MFEM_FOREACH_THREAD(dy,y,D1D)
1205  {
1206  MFEM_FOREACH_THREAD(dx,x,D1D)
1207  {
1208  X[dy][dx] = x(dx,dy,e);
1209  }
1210  }
1211  if (tidz == 0)
1212  {
1213  MFEM_FOREACH_THREAD(dy,y,D1D)
1214  {
1215  MFEM_FOREACH_THREAD(q,x,Q1D)
1216  {
1217  B[q][dy] = b(q,dy);
1218  G[q][dy] = g(q,dy);
1219  }
1220  }
1221  }
1222  MFEM_SYNC_THREAD;
1223  MFEM_FOREACH_THREAD(dy,y,D1D)
1224  {
1225  MFEM_FOREACH_THREAD(qx,x,Q1D)
1226  {
1227  double u = 0.0;
1228  double v = 0.0;
1229  for (int dx = 0; dx < D1D; ++dx)
1230  {
1231  const double coords = X[dy][dx];
1232  u += B[qx][dx] * coords;
1233  v += G[qx][dx] * coords;
1234  }
1235  DQ0[dy][qx] = u;
1236  DQ1[dy][qx] = v;
1237  }
1238  }
1239  MFEM_SYNC_THREAD;
1240  MFEM_FOREACH_THREAD(qy,y,Q1D)
1241  {
1242  MFEM_FOREACH_THREAD(qx,x,Q1D)
1243  {
1244  double u = 0.0;
1245  double v = 0.0;
1246  for (int dy = 0; dy < D1D; ++dy)
1247  {
1248  u += DQ1[dy][qx] * B[qy][dy];
1249  v += DQ0[dy][qx] * G[qy][dy];
1250  }
1251  QQ0[qy][qx] = u;
1252  QQ1[qy][qx] = v;
1253  }
1254  }
1255  MFEM_SYNC_THREAD;
1256  MFEM_FOREACH_THREAD(qy,y,Q1D)
1257  {
1258  MFEM_FOREACH_THREAD(qx,x,Q1D)
1259  {
1260  const int q = (qx + ((qy) * Q1D));
1261  const double O11 = D(q,0,e);
1262  const double O21 = D(q,1,e);
1263  const double O12 = symmetric ? O21 : D(q,2,e);
1264  const double O22 = symmetric ? D(q,2,e) : D(q,3,e);
1265  const double gX = QQ0[qy][qx];
1266  const double gY = QQ1[qy][qx];
1267  QQ0[qy][qx] = (O11 * gX) + (O12 * gY);
1268  QQ1[qy][qx] = (O21 * gX) + (O22 * gY);
1269  }
1270  }
1271  MFEM_SYNC_THREAD;
1272  if (tidz == 0)
1273  {
1274  MFEM_FOREACH_THREAD(dy,y,D1D)
1275  {
1276  MFEM_FOREACH_THREAD(q,x,Q1D)
1277  {
1278  Bt[dy][q] = b(q,dy);
1279  Gt[dy][q] = g(q,dy);
1280  }
1281  }
1282  }
1283  MFEM_SYNC_THREAD;
1284  MFEM_FOREACH_THREAD(qy,y,Q1D)
1285  {
1286  MFEM_FOREACH_THREAD(dx,x,D1D)
1287  {
1288  double u = 0.0;
1289  double v = 0.0;
1290  for (int qx = 0; qx < Q1D; ++qx)
1291  {
1292  u += Gt[dx][qx] * QQ0[qy][qx];
1293  v += Bt[dx][qx] * QQ1[qy][qx];
1294  }
1295  DQ0[qy][dx] = u;
1296  DQ1[qy][dx] = v;
1297  }
1298  }
1299  MFEM_SYNC_THREAD;
1300  MFEM_FOREACH_THREAD(dy,y,D1D)
1301  {
1302  MFEM_FOREACH_THREAD(dx,x,D1D)
1303  {
1304  double u = 0.0;
1305  double v = 0.0;
1306  for (int qy = 0; qy < Q1D; ++qy)
1307  {
1308  u += DQ0[qy][dx] * Bt[dy][qy];
1309  v += DQ1[qy][dx] * Gt[dy][qy];
1310  }
1311  Y(dx,dy,e) += (u + v);
1312  }
1313  }
1314  });
1315 }
1316 
1317 // PA Diffusion Apply 3D kernel
1318 template<int T_D1D = 0, int T_Q1D = 0>
1319 static void PADiffusionApply3D(const int NE,
1320  const bool symmetric,
1321  const Array<double> &b,
1322  const Array<double> &g,
1323  const Array<double> &bt,
1324  const Array<double> &gt,
1325  const Vector &d_,
1326  const Vector &x_,
1327  Vector &y_,
1328  int d1d = 0, int q1d = 0)
1329 {
1330  const int D1D = T_D1D ? T_D1D : d1d;
1331  const int Q1D = T_Q1D ? T_Q1D : q1d;
1332  MFEM_VERIFY(D1D <= MAX_D1D, "");
1333  MFEM_VERIFY(Q1D <= MAX_Q1D, "");
1334  auto B = Reshape(b.Read(), Q1D, D1D);
1335  auto G = Reshape(g.Read(), Q1D, D1D);
1336  auto Bt = Reshape(bt.Read(), D1D, Q1D);
1337  auto Gt = Reshape(gt.Read(), D1D, Q1D);
1338  auto D = Reshape(d_.Read(), Q1D*Q1D*Q1D, symmetric ? 6 : 9, NE);
1339  auto X = Reshape(x_.Read(), D1D, D1D, D1D, NE);
1340  auto Y = Reshape(y_.ReadWrite(), D1D, D1D, D1D, NE);
1341  MFEM_FORALL(e, NE,
1342  {
1343  const int D1D = T_D1D ? T_D1D : d1d;
1344  const int Q1D = T_Q1D ? T_Q1D : q1d;
1345  constexpr int max_D1D = T_D1D ? T_D1D : MAX_D1D;
1346  constexpr int max_Q1D = T_Q1D ? T_Q1D : MAX_Q1D;
1347  double grad[max_Q1D][max_Q1D][max_Q1D][3];
1348  for (int qz = 0; qz < Q1D; ++qz)
1349  {
1350  for (int qy = 0; qy < Q1D; ++qy)
1351  {
1352  for (int qx = 0; qx < Q1D; ++qx)
1353  {
1354  grad[qz][qy][qx][0] = 0.0;
1355  grad[qz][qy][qx][1] = 0.0;
1356  grad[qz][qy][qx][2] = 0.0;
1357  }
1358  }
1359  }
1360  for (int dz = 0; dz < D1D; ++dz)
1361  {
1362  double gradXY[max_Q1D][max_Q1D][3];
1363  for (int qy = 0; qy < Q1D; ++qy)
1364  {
1365  for (int qx = 0; qx < Q1D; ++qx)
1366  {
1367  gradXY[qy][qx][0] = 0.0;
1368  gradXY[qy][qx][1] = 0.0;
1369  gradXY[qy][qx][2] = 0.0;
1370  }
1371  }
1372  for (int dy = 0; dy < D1D; ++dy)
1373  {
1374  double gradX[max_Q1D][2];
1375  for (int qx = 0; qx < Q1D; ++qx)
1376  {
1377  gradX[qx][0] = 0.0;
1378  gradX[qx][1] = 0.0;
1379  }
1380  for (int dx = 0; dx < D1D; ++dx)
1381  {
1382  const double s = X(dx,dy,dz,e);
1383  for (int qx = 0; qx < Q1D; ++qx)
1384  {
1385  gradX[qx][0] += s * B(qx,dx);
1386  gradX[qx][1] += s * G(qx,dx);
1387  }
1388  }
1389  for (int qy = 0; qy < Q1D; ++qy)
1390  {
1391  const double wy = B(qy,dy);
1392  const double wDy = G(qy,dy);
1393  for (int qx = 0; qx < Q1D; ++qx)
1394  {
1395  const double wx = gradX[qx][0];
1396  const double wDx = gradX[qx][1];
1397  gradXY[qy][qx][0] += wDx * wy;
1398  gradXY[qy][qx][1] += wx * wDy;
1399  gradXY[qy][qx][2] += wx * wy;
1400  }
1401  }
1402  }
1403  for (int qz = 0; qz < Q1D; ++qz)
1404  {
1405  const double wz = B(qz,dz);
1406  const double wDz = G(qz,dz);
1407  for (int qy = 0; qy < Q1D; ++qy)
1408  {
1409  for (int qx = 0; qx < Q1D; ++qx)
1410  {
1411  grad[qz][qy][qx][0] += gradXY[qy][qx][0] * wz;
1412  grad[qz][qy][qx][1] += gradXY[qy][qx][1] * wz;
1413  grad[qz][qy][qx][2] += gradXY[qy][qx][2] * wDz;
1414  }
1415  }
1416  }
1417  }
1418  // Calculate Dxyz, xDyz, xyDz in plane
1419  for (int qz = 0; qz < Q1D; ++qz)
1420  {
1421  for (int qy = 0; qy < Q1D; ++qy)
1422  {
1423  for (int qx = 0; qx < Q1D; ++qx)
1424  {
1425  const int q = qx + (qy + qz * Q1D) * Q1D;
1426  const double O11 = D(q,0,e);
1427  const double O12 = D(q,1,e);
1428  const double O13 = D(q,2,e);
1429  const double O21 = symmetric ? O12 : D(q,3,e);
1430  const double O22 = symmetric ? D(q,3,e) : D(q,4,e);
1431  const double O23 = symmetric ? D(q,4,e) : D(q,5,e);
1432  const double O31 = symmetric ? O13 : D(q,6,e);
1433  const double O32 = symmetric ? O23 : D(q,7,e);
1434  const double O33 = symmetric ? D(q,5,e) : D(q,8,e);
1435  const double gradX = grad[qz][qy][qx][0];
1436  const double gradY = grad[qz][qy][qx][1];
1437  const double gradZ = grad[qz][qy][qx][2];
1438  grad[qz][qy][qx][0] = (O11*gradX)+(O12*gradY)+(O13*gradZ);
1439  grad[qz][qy][qx][1] = (O21*gradX)+(O22*gradY)+(O23*gradZ);
1440  grad[qz][qy][qx][2] = (O31*gradX)+(O32*gradY)+(O33*gradZ);
1441  }
1442  }
1443  }
1444  for (int qz = 0; qz < Q1D; ++qz)
1445  {
1446  double gradXY[max_D1D][max_D1D][3];
1447  for (int dy = 0; dy < D1D; ++dy)
1448  {
1449  for (int dx = 0; dx < D1D; ++dx)
1450  {
1451  gradXY[dy][dx][0] = 0;
1452  gradXY[dy][dx][1] = 0;
1453  gradXY[dy][dx][2] = 0;
1454  }
1455  }
1456  for (int qy = 0; qy < Q1D; ++qy)
1457  {
1458  double gradX[max_D1D][3];
1459  for (int dx = 0; dx < D1D; ++dx)
1460  {
1461  gradX[dx][0] = 0;
1462  gradX[dx][1] = 0;
1463  gradX[dx][2] = 0;
1464  }
1465  for (int qx = 0; qx < Q1D; ++qx)
1466  {
1467  const double gX = grad[qz][qy][qx][0];
1468  const double gY = grad[qz][qy][qx][1];
1469  const double gZ = grad[qz][qy][qx][2];
1470  for (int dx = 0; dx < D1D; ++dx)
1471  {
1472  const double wx = Bt(dx,qx);
1473  const double wDx = Gt(dx,qx);
1474  gradX[dx][0] += gX * wDx;
1475  gradX[dx][1] += gY * wx;
1476  gradX[dx][2] += gZ * wx;
1477  }
1478  }
1479  for (int dy = 0; dy < D1D; ++dy)
1480  {
1481  const double wy = Bt(dy,qy);
1482  const double wDy = Gt(dy,qy);
1483  for (int dx = 0; dx < D1D; ++dx)
1484  {
1485  gradXY[dy][dx][0] += gradX[dx][0] * wy;
1486  gradXY[dy][dx][1] += gradX[dx][1] * wDy;
1487  gradXY[dy][dx][2] += gradX[dx][2] * wy;
1488  }
1489  }
1490  }
1491  for (int dz = 0; dz < D1D; ++dz)
1492  {
1493  const double wz = Bt(dz,qz);
1494  const double wDz = Gt(dz,qz);
1495  for (int dy = 0; dy < D1D; ++dy)
1496  {
1497  for (int dx = 0; dx < D1D; ++dx)
1498  {
1499  Y(dx,dy,dz,e) +=
1500  ((gradXY[dy][dx][0] * wz) +
1501  (gradXY[dy][dx][1] * wz) +
1502  (gradXY[dy][dx][2] * wDz));
1503  }
1504  }
1505  }
1506  }
1507  });
1508 }
1509 
1510 template<int T_D1D = 0, int T_Q1D = 0>
1511 static void SmemPADiffusionApply3D(const int NE,
1512  const bool symmetric,
1513  const Array<double> &b_,
1514  const Array<double> &g_,
1515  const Vector &d_,
1516  const Vector &x_,
1517  Vector &y_,
1518  const int d1d = 0,
1519  const int q1d = 0)
1520 {
1521  const int D1D = T_D1D ? T_D1D : d1d;
1522  const int Q1D = T_Q1D ? T_Q1D : q1d;
1523  constexpr int M1Q = T_Q1D ? T_Q1D : MAX_Q1D;
1524  constexpr int M1D = T_D1D ? T_D1D : MAX_D1D;
1525  MFEM_VERIFY(D1D <= M1D, "");
1526  MFEM_VERIFY(Q1D <= M1Q, "");
1527  auto b = Reshape(b_.Read(), Q1D, D1D);
1528  auto g = Reshape(g_.Read(), Q1D, D1D);
1529  auto d = Reshape(d_.Read(), Q1D, Q1D, Q1D, symmetric ? 6 : 9, NE);
1530  auto x = Reshape(x_.Read(), D1D, D1D, D1D, NE);
1531  auto y = Reshape(y_.ReadWrite(), D1D, D1D, D1D, NE);
1532  MFEM_FORALL_3D(e, NE, Q1D, Q1D, Q1D,
1533  {
1534  const int D1D = T_D1D ? T_D1D : d1d;
1535  const int Q1D = T_Q1D ? T_Q1D : q1d;
1536  constexpr int MQ1 = T_Q1D ? T_Q1D : MAX_Q1D;
1537  constexpr int MD1 = T_D1D ? T_D1D : MAX_D1D;
1538  constexpr int MDQ = (MQ1 > MD1) ? MQ1 : MD1;
1539  MFEM_SHARED double sBG[2][MQ1*MD1];
1540  double (*B)[MD1] = (double (*)[MD1]) (sBG+0);
1541  double (*G)[MD1] = (double (*)[MD1]) (sBG+1);
1542  double (*Bt)[MQ1] = (double (*)[MQ1]) (sBG+0);
1543  double (*Gt)[MQ1] = (double (*)[MQ1]) (sBG+1);
1544  MFEM_SHARED double sm0[3][MDQ*MDQ*MDQ];
1545  MFEM_SHARED double sm1[3][MDQ*MDQ*MDQ];
1546  double (*X)[MD1][MD1] = (double (*)[MD1][MD1]) (sm0+2);
1547  double (*DDQ0)[MD1][MQ1] = (double (*)[MD1][MQ1]) (sm0+0);
1548  double (*DDQ1)[MD1][MQ1] = (double (*)[MD1][MQ1]) (sm0+1);
1549  double (*DQQ0)[MQ1][MQ1] = (double (*)[MQ1][MQ1]) (sm1+0);
1550  double (*DQQ1)[MQ1][MQ1] = (double (*)[MQ1][MQ1]) (sm1+1);
1551  double (*DQQ2)[MQ1][MQ1] = (double (*)[MQ1][MQ1]) (sm1+2);
1552  double (*QQQ0)[MQ1][MQ1] = (double (*)[MQ1][MQ1]) (sm0+0);
1553  double (*QQQ1)[MQ1][MQ1] = (double (*)[MQ1][MQ1]) (sm0+1);
1554  double (*QQQ2)[MQ1][MQ1] = (double (*)[MQ1][MQ1]) (sm0+2);
1555  double (*QQD0)[MQ1][MD1] = (double (*)[MQ1][MD1]) (sm1+0);
1556  double (*QQD1)[MQ1][MD1] = (double (*)[MQ1][MD1]) (sm1+1);
1557  double (*QQD2)[MQ1][MD1] = (double (*)[MQ1][MD1]) (sm1+2);
1558  double (*QDD0)[MD1][MD1] = (double (*)[MD1][MD1]) (sm0+0);
1559  double (*QDD1)[MD1][MD1] = (double (*)[MD1][MD1]) (sm0+1);
1560  double (*QDD2)[MD1][MD1] = (double (*)[MD1][MD1]) (sm0+2);
1561  MFEM_FOREACH_THREAD(dz,z,D1D)
1562  {
1563  MFEM_FOREACH_THREAD(dy,y,D1D)
1564  {
1565  MFEM_FOREACH_THREAD(dx,x,D1D)
1566  {
1567  X[dz][dy][dx] = x(dx,dy,dz,e);
1568  }
1569  }
1570  }
1571  if (MFEM_THREAD_ID(z) == 0)
1572  {
1573  MFEM_FOREACH_THREAD(dy,y,D1D)
1574  {
1575  MFEM_FOREACH_THREAD(qx,x,Q1D)
1576  {
1577  B[qx][dy] = b(qx,dy);
1578  G[qx][dy] = g(qx,dy);
1579  }
1580  }
1581  }
1582  MFEM_SYNC_THREAD;
1583  MFEM_FOREACH_THREAD(dz,z,D1D)
1584  {
1585  MFEM_FOREACH_THREAD(dy,y,D1D)
1586  {
1587  MFEM_FOREACH_THREAD(qx,x,Q1D)
1588  {
1589  double u = 0.0, v = 0.0;
1590  MFEM_UNROLL(MD1)
1591  for (int dx = 0; dx < D1D; ++dx)
1592  {
1593  const double coords = X[dz][dy][dx];
1594  u += coords * B[qx][dx];
1595  v += coords * G[qx][dx];
1596  }
1597  DDQ0[dz][dy][qx] = u;
1598  DDQ1[dz][dy][qx] = v;
1599  }
1600  }
1601  }
1602  MFEM_SYNC_THREAD;
1603  MFEM_FOREACH_THREAD(dz,z,D1D)
1604  {
1605  MFEM_FOREACH_THREAD(qy,y,Q1D)
1606  {
1607  MFEM_FOREACH_THREAD(qx,x,Q1D)
1608  {
1609  double u = 0.0, v = 0.0, w = 0.0;
1610  MFEM_UNROLL(MD1)
1611  for (int dy = 0; dy < D1D; ++dy)
1612  {
1613  u += DDQ1[dz][dy][qx] * B[qy][dy];
1614  v += DDQ0[dz][dy][qx] * G[qy][dy];
1615  w += DDQ0[dz][dy][qx] * B[qy][dy];
1616  }
1617  DQQ0[dz][qy][qx] = u;
1618  DQQ1[dz][qy][qx] = v;
1619  DQQ2[dz][qy][qx] = w;
1620  }
1621  }
1622  }
1623  MFEM_SYNC_THREAD;
1624  MFEM_FOREACH_THREAD(qz,z,Q1D)
1625  {
1626  MFEM_FOREACH_THREAD(qy,y,Q1D)
1627  {
1628  MFEM_FOREACH_THREAD(qx,x,Q1D)
1629  {
1630  double u = 0.0, v = 0.0, w = 0.0;
1631  MFEM_UNROLL(MD1)
1632  for (int dz = 0; dz < D1D; ++dz)
1633  {
1634  u += DQQ0[dz][qy][qx] * B[qz][dz];
1635  v += DQQ1[dz][qy][qx] * B[qz][dz];
1636  w += DQQ2[dz][qy][qx] * G[qz][dz];
1637  }
1638  const double O11 = d(qx,qy,qz,0,e);
1639  const double O12 = d(qx,qy,qz,1,e);
1640  const double O13 = d(qx,qy,qz,2,e);
1641  const double O21 = symmetric ? O12 : d(qx,qy,qz,3,e);
1642  const double O22 = symmetric ? d(qx,qy,qz,3,e) : d(qx,qy,qz,4,e);
1643  const double O23 = symmetric ? d(qx,qy,qz,4,e) : d(qx,qy,qz,5,e);
1644  const double O31 = symmetric ? O13 : d(qx,qy,qz,6,e);
1645  const double O32 = symmetric ? O23 : d(qx,qy,qz,7,e);
1646  const double O33 = symmetric ? d(qx,qy,qz,5,e) : d(qx,qy,qz,8,e);
1647  const double gX = u;
1648  const double gY = v;
1649  const double gZ = w;
1650  QQQ0[qz][qy][qx] = (O11*gX) + (O12*gY) + (O13*gZ);
1651  QQQ1[qz][qy][qx] = (O21*gX) + (O22*gY) + (O23*gZ);
1652  QQQ2[qz][qy][qx] = (O31*gX) + (O32*gY) + (O33*gZ);
1653  }
1654  }
1655  }
1656  MFEM_SYNC_THREAD;
1657  if (MFEM_THREAD_ID(z) == 0)
1658  {
1659  MFEM_FOREACH_THREAD(dy,y,D1D)
1660  {
1661  MFEM_FOREACH_THREAD(qx,x,Q1D)
1662  {
1663  Bt[dy][qx] = b(qx,dy);
1664  Gt[dy][qx] = g(qx,dy);
1665  }
1666  }
1667  }
1668  MFEM_SYNC_THREAD;
1669  MFEM_FOREACH_THREAD(qz,z,Q1D)
1670  {
1671  MFEM_FOREACH_THREAD(qy,y,Q1D)
1672  {
1673  MFEM_FOREACH_THREAD(dx,x,D1D)
1674  {
1675  double u = 0.0, v = 0.0, w = 0.0;
1676  MFEM_UNROLL(MQ1)
1677  for (int qx = 0; qx < Q1D; ++qx)
1678  {
1679  u += QQQ0[qz][qy][qx] * Gt[dx][qx];
1680  v += QQQ1[qz][qy][qx] * Bt[dx][qx];
1681  w += QQQ2[qz][qy][qx] * Bt[dx][qx];
1682  }
1683  QQD0[qz][qy][dx] = u;
1684  QQD1[qz][qy][dx] = v;
1685  QQD2[qz][qy][dx] = w;
1686  }
1687  }
1688  }
1689  MFEM_SYNC_THREAD;
1690  MFEM_FOREACH_THREAD(qz,z,Q1D)
1691  {
1692  MFEM_FOREACH_THREAD(dy,y,D1D)
1693  {
1694  MFEM_FOREACH_THREAD(dx,x,D1D)
1695  {
1696  double u = 0.0, v = 0.0, w = 0.0;
1697  MFEM_UNROLL(Q1D)
1698  for (int qy = 0; qy < Q1D; ++qy)
1699  {
1700  u += QQD0[qz][qy][dx] * Bt[dy][qy];
1701  v += QQD1[qz][qy][dx] * Gt[dy][qy];
1702  w += QQD2[qz][qy][dx] * Bt[dy][qy];
1703  }
1704  QDD0[qz][dy][dx] = u;
1705  QDD1[qz][dy][dx] = v;
1706  QDD2[qz][dy][dx] = w;
1707  }
1708  }
1709  }
1710  MFEM_SYNC_THREAD;
1711  MFEM_FOREACH_THREAD(dz,z,D1D)
1712  {
1713  MFEM_FOREACH_THREAD(dy,y,D1D)
1714  {
1715  MFEM_FOREACH_THREAD(dx,x,D1D)
1716  {
1717  double u = 0.0, v = 0.0, w = 0.0;
1718  MFEM_UNROLL(MQ1)
1719  for (int qz = 0; qz < Q1D; ++qz)
1720  {
1721  u += QDD0[qz][dy][dx] * Bt[dz][qz];
1722  v += QDD1[qz][dy][dx] * Bt[dz][qz];
1723  w += QDD2[qz][dy][dx] * Gt[dz][qz];
1724  }
1725  y(dx,dy,dz,e) += (u + v + w);
1726  }
1727  }
1728  }
1729  });
1730 }
1731 
1732 static void PADiffusionApply(const int dim,
1733  const int D1D,
1734  const int Q1D,
1735  const int NE,
1736  const bool symm,
1737  const Array<double> &B,
1738  const Array<double> &G,
1739  const Array<double> &Bt,
1740  const Array<double> &Gt,
1741  const Vector &D,
1742  const Vector &X,
1743  Vector &Y)
1744 {
1745 #ifdef MFEM_USE_OCCA
1746  if (DeviceCanUseOcca())
1747  {
1748  if (dim == 2)
1749  {
1750  OccaPADiffusionApply2D(D1D,Q1D,NE,B,G,Bt,Gt,D,X,Y);
1751  return;
1752  }
1753  if (dim == 3)
1754  {
1755  OccaPADiffusionApply3D(D1D,Q1D,NE,B,G,Bt,Gt,D,X,Y);
1756  return;
1757  }
1758  MFEM_ABORT("OCCA PADiffusionApply unknown kernel!");
1759  }
1760 #endif // MFEM_USE_OCCA
1761  const int id = (D1D << 4) | Q1D;
1762 
1763  if (dim == 2)
1764  {
1765  switch (id)
1766  {
1767  case 0x22: return SmemPADiffusionApply2D<2,2,16>(NE,symm,B,G,D,X,Y);
1768  case 0x33: return SmemPADiffusionApply2D<3,3,16>(NE,symm,B,G,D,X,Y);
1769  case 0x44: return SmemPADiffusionApply2D<4,4,8>(NE,symm,B,G,D,X,Y);
1770  case 0x55: return SmemPADiffusionApply2D<5,5,8>(NE,symm,B,G,D,X,Y);
1771  case 0x66: return SmemPADiffusionApply2D<6,6,4>(NE,symm,B,G,D,X,Y);
1772  case 0x77: return SmemPADiffusionApply2D<7,7,4>(NE,symm,B,G,D,X,Y);
1773  case 0x88: return SmemPADiffusionApply2D<8,8,2>(NE,symm,B,G,D,X,Y);
1774  case 0x99: return SmemPADiffusionApply2D<9,9,2>(NE,symm,B,G,D,X,Y);
1775  default: return PADiffusionApply2D(NE,symm,B,G,Bt,Gt,D,X,Y,D1D,Q1D);
1776  }
1777  }
1778 
1779  if (dim == 3)
1780  {
1781  switch (id)
1782  {
1783  case 0x22: return SmemPADiffusionApply3D<2,2>(NE,symm,B,G,D,X,Y);
1784  case 0x23: return SmemPADiffusionApply3D<2,3>(NE,symm,B,G,D,X,Y);
1785  case 0x34: return SmemPADiffusionApply3D<3,4>(NE,symm,B,G,D,X,Y);
1786  case 0x45: return SmemPADiffusionApply3D<4,5>(NE,symm,B,G,D,X,Y);
1787  case 0x46: return SmemPADiffusionApply3D<4,6>(NE,symm,B,G,D,X,Y);
1788  case 0x56: return SmemPADiffusionApply3D<5,6>(NE,symm,B,G,D,X,Y);
1789  case 0x58: return SmemPADiffusionApply3D<5,8>(NE,symm,B,G,D,X,Y);
1790  case 0x67: return SmemPADiffusionApply3D<6,7>(NE,symm,B,G,D,X,Y);
1791  case 0x78: return SmemPADiffusionApply3D<7,8>(NE,symm,B,G,D,X,Y);
1792  case 0x89: return SmemPADiffusionApply3D<8,9>(NE,symm,B,G,D,X,Y);
1793  default: return PADiffusionApply3D(NE,symm,B,G,Bt,Gt,D,X,Y,D1D,Q1D);
1794  }
1795  }
1796  MFEM_ABORT("Unknown kernel: 0x"<<std::hex << id << std::dec);
1797 }
1798 
1799 // PA Diffusion Apply kernel
1800 void DiffusionIntegrator::AddMultPA(const Vector &x, Vector &y) const
1801 {
1802  if (DeviceCanUseCeed())
1803  {
1804  ceedOp->AddMult(x, y);
1805  }
1806  else
1807  {
1808  PADiffusionApply(dim, dofs1D, quad1D, ne, symmetric,
1809  maps->B, maps->G, maps->Bt, maps->Gt,
1810  pa_data, x, y);
1811  }
1812 }
1813 
1814 void DiffusionIntegrator::AddMultTransposePA(const Vector &x, Vector &y) const
1815 {
1816  if (symmetric)
1817  {
1818  AddMultPA(x, y);
1819  }
1820  else
1821  {
1822  MFEM_ABORT("DiffusionIntegrator::AddMultTransposePA only implemented in "
1823  "the symmetric case.")
1824  }
1825 }
1826 
1827 } // namespace mfem
int GetNPoints() const
Returns the number of the points in the integration rule.
Definition: intrules.hpp:247
Abstract class for all finite elements.
Definition: fe_base.hpp:235
int Size() const
Return the logical size of the array.
Definition: array.hpp:138
int GetDim() const
Returns the reference space dimension for the finite element.
Definition: fe_base.hpp:311
Class for an integration rule - an Array of IntegrationPoint.
Definition: intrules.hpp:90
Memory< T > & GetMemory()
Return a reference to the Memory object used by the Array.
Definition: array.hpp:117
A coefficient that is constant across space and time.
Definition: coefficient.hpp:78
const GeometricFactors * GetGeometricFactors(const IntegrationRule &ir, const int flags, MemoryType d_mt=MemoryType::DEFAULT)
Return the mesh geometric factors corresponding to the given integration rule.
Definition: mesh.cpp:840
const IntegrationRule & GetElementIntRule(int idx) const
Get the IntegrationRule associated with mesh element idx.
Definition: fespace.hpp:976
void SetSize(int s)
Change the size of the DenseSymmetricMatrix to s x s.
Definition: symmat.cpp:32
void SetSize(int s)
Resize the vector to size s.
Definition: vector.hpp:521
occa::device & OccaDev()
Return the default occa::device used by MFEM.
Definition: occa.cpp:27
Data type dense matrix using column-major storage.
Definition: densemat.hpp:23
virtual double * HostWrite()
Shortcut for mfem::Write(vec.GetMemory(), vec.Size(), false).
Definition: vector.hpp:450
int Size() const
Returns the size of the vector.
Definition: vector.hpp:199
int ndof
Number of degrees of freedom = number of basis functions. When mode is TENSOR, this is the 1D number...
Definition: fe_base.hpp:170
int GetNE() const
Returns number of elements.
Definition: mesh.hpp:928
const Array< double > & GetWeights() const
Return the quadrature weights in a contiguous array.
Definition: intrules.cpp:83
constexpr int DIM
std::map< occa_id_t, occa::kernel > occa_kernel_t
Definition: occa.hpp:79
Memory< double > & GetMemory()
Return a reference to the Memory object used by the Vector.
Definition: vector.hpp:234
occa::memory OccaMemoryReadWrite(Memory< T > &mem, size_t size)
Wrap a Memory object as occa::memory for read-write access with the mfem::Device MemoryClass. The returned occa::memory is associated with the default occa::device used by MFEM.
Definition: occa.hpp:59
DeviceTensor< sizeof...(Dims), T > Reshape(T *ptr, Dims...dims)
Wrap a pointer as a DeviceTensor with automatically deduced template parameters.
Definition: dtensor.hpp:131
int GetNE() const
Returns number of elements in the mesh.
Definition: fespace.hpp:590
IntegrationPoint & IntPoint(int i)
Returns a reference to the i-th integration point.
Definition: intrules.hpp:250
Quadrature function coefficient which requires that the quadrature rules used for this coefficient be...
const int MAX_Q1D
Definition: forall.hpp:29
Mesh * GetMesh() const
Returns the mesh.
Definition: fespace.hpp:433
bool DeviceCanUseOcca()
Function that determines if an OCCA kernel should be used, based on the current mfem::Device configur...
Definition: occa.hpp:69
virtual double * Write(bool on_dev=true)
Shortcut for mfem::Write(vec.GetMemory(), vec.Size(), on_dev).
Definition: vector.hpp:446
double b
Definition: lissajous.cpp:42
const occa::memory OccaMemoryRead(const Memory< T > &mem, size_t size)
Wrap a Memory object as occa::memory for read only access with the mfem::Device MemoryClass. The returned occa::memory is associated with the default occa::device used by MFEM.
Definition: occa.hpp:37
QuadratureSpace * GetSpace() const
Get the associated QuadratureSpace.
Definition: gridfunc.hpp:798
const T * Read(bool on_dev=true) const
Shortcut for mfem::Read(a.GetMemory(), a.Size(), on_dev).
Definition: array.hpp:304
int Dimension() const
Definition: mesh.hpp:999
void PADiffusionSetup2D< 3 >(const int Q1D, const int coeffDim, const int NE, const Array< double > &w, const Vector &j, const Vector &c, Vector &d)
void PADiffusionSetup2D< 2 >(const int Q1D, const int coeffDim, const int NE, const Array< double > &w, const Vector &j, const Vector &c, Vector &d)
int SpaceDimension() const
Definition: mesh.hpp:1000
ElementTransformation * GetElementTransformation(int i) const
Returns ElementTransformation for the i-th element.
Definition: fespace.hpp:623
Class FiniteElementSpace - responsible for providing FEM view of the mesh, mainly managing the set of...
Definition: fespace.hpp:88
occa::memory OccaMemoryWrite(Memory< T > &mem, size_t size)
Wrap a Memory object as occa::memory for write only access with the mfem::Device MemoryClass. The returned occa::memory is associated with the default occa::device used by MFEM.
Definition: occa.hpp:48
MemoryType
Memory types supported by MFEM.
Definition: mem_manager.hpp:31
virtual const DofToQuad & GetDofToQuad(const IntegrationRule &ir, DofToQuad::Mode mode) const
Return a DofToQuad structure corresponding to the given IntegrationRule using the given DofToQuad::Mo...
Definition: fe_base.cpp:366
void PADiffusionSetup3D(const int Q1D, const int coeffDim, const int NE, const Array< double > &w, const Vector &j, const Vector &c, Vector &d)
virtual double * ReadWrite(bool on_dev=true)
Shortcut for mfem::ReadWrite(vec.GetMemory(), vec.Size(), on_dev).
Definition: vector.hpp:454
void GetElementTransformation(int i, IsoparametricTransformation *ElTr)
Definition: mesh.cpp:348
FDualNumber< tbase > sqrt(const FDualNumber< tbase > &f)
sqrt([dual number])
Definition: fdual.hpp:600
int dim
Definition: ex24.cpp:53
const int MAX_D1D
Definition: forall.hpp:28
virtual const FiniteElement * GetFE(int i) const
Returns pointer to the FiniteElement in the FiniteElementCollection associated with i&#39;th element in t...
Definition: fespace.cpp:2783
constexpr int SDIM
const double alpha
Definition: ex15.cpp:369
Vector data type.
Definition: vector.hpp:60
void MakeRef(Vector &base, int offset, int size)
Reset the Vector to be a reference to a sub-vector of base.
Definition: vector.hpp:585
std::pair< int, int > occa_id_t
Definition: occa.hpp:78
RefCoord s[3]
double u(const Vector &xvec)
Definition: lor_mms.hpp:24
void SetSize(int s)
Change the size of the DenseMatrix to s x s.
Definition: densemat.hpp:105
Class representing a function through its values (scalar or vector) at quadrature points...
Definition: gridfunc.hpp:757
virtual const double * Read(bool on_dev=true) const
Shortcut for mfem::Read(vec.GetMemory(), vec.Size(), on_dev).
Definition: vector.hpp:438