MFEM  v4.3.0
Finite element discretization library
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Pages
bilininteg_diffusion_pa.cpp
Go to the documentation of this file.
1 // Copyright (c) 2010-2021, Lawrence Livermore National Security, LLC. Produced
2 // at the Lawrence Livermore National Laboratory. All Rights reserved. See files
3 // LICENSE and NOTICE for details. LLNL-CODE-806117.
4 //
5 // This file is part of the MFEM library. For more information and source code
6 // availability visit https://mfem.org.
7 //
8 // MFEM is free software; you can redistribute it and/or modify it under the
9 // terms of the BSD-3 license. We welcome feedback and contributions, see file
10 // CONTRIBUTING.md for details.
11 
12 #include "../general/forall.hpp"
13 #include "bilininteg.hpp"
14 #include "gridfunc.hpp"
15 #include "ceed/diffusion.hpp"
16 
17 using namespace std;
18 
19 namespace mfem
20 {
21 
22 // PA Diffusion Integrator
23 
24 // OCCA 2D Assemble kernel
25 #ifdef MFEM_USE_OCCA
26 static void OccaPADiffusionSetup2D(const int D1D,
27  const int Q1D,
28  const int NE,
29  const Array<double> &W,
30  const Vector &J,
31  const Vector &C,
32  Vector &op)
33 {
34  occa::properties props;
35  props["defines/D1D"] = D1D;
36  props["defines/Q1D"] = Q1D;
37  const occa::memory o_W = OccaMemoryRead(W.GetMemory(), W.Size());
38  const occa::memory o_J = OccaMemoryRead(J.GetMemory(), J.Size());
39  const occa::memory o_C = OccaMemoryRead(C.GetMemory(), C.Size());
40  occa::memory o_op = OccaMemoryWrite(op.GetMemory(), op.Size());
41  const bool const_c = C.Size() == 1;
42  const occa_id_t id = std::make_pair(D1D,Q1D);
43  static occa_kernel_t OccaDiffSetup2D_ker;
44  if (OccaDiffSetup2D_ker.find(id) == OccaDiffSetup2D_ker.end())
45  {
46  const occa::kernel DiffusionSetup2D =
47  mfem::OccaDev().buildKernel("occa://mfem/fem/occa.okl",
48  "DiffusionSetup2D", props);
49  OccaDiffSetup2D_ker.emplace(id, DiffusionSetup2D);
50  }
51  OccaDiffSetup2D_ker.at(id)(NE, o_W, o_J, o_C, o_op, const_c);
52 }
53 
54 static void OccaPADiffusionSetup3D(const int D1D,
55  const int Q1D,
56  const int NE,
57  const Array<double> &W,
58  const Vector &J,
59  const Vector &C,
60  Vector &op)
61 {
62  occa::properties props;
63  props["defines/D1D"] = D1D;
64  props["defines/Q1D"] = Q1D;
65  const occa::memory o_W = OccaMemoryRead(W.GetMemory(), W.Size());
66  const occa::memory o_J = OccaMemoryRead(J.GetMemory(), J.Size());
67  const occa::memory o_C = OccaMemoryRead(C.GetMemory(), C.Size());
68  occa::memory o_op = OccaMemoryWrite(op.GetMemory(), op.Size());
69  const bool const_c = C.Size() == 1;
70  const occa_id_t id = std::make_pair(D1D,Q1D);
71  static occa_kernel_t OccaDiffSetup3D_ker;
72  if (OccaDiffSetup3D_ker.find(id) == OccaDiffSetup3D_ker.end())
73  {
74  const occa::kernel DiffusionSetup3D =
75  mfem::OccaDev().buildKernel("occa://mfem/fem/occa.okl",
76  "DiffusionSetup3D", props);
77  OccaDiffSetup3D_ker.emplace(id, DiffusionSetup3D);
78  }
79  OccaDiffSetup3D_ker.at(id)(NE, o_W, o_J, o_C, o_op, const_c);
80 }
81 #endif // MFEM_USE_OCCA
82 
83 template<>
84 void PADiffusionSetup2D<2>(const int Q1D,
85  const int coeffDim,
86  const int NE,
87  const Array<double> &w,
88  const Vector &j,
89  const Vector &c,
90  Vector &d)
91 {
92  const bool symmetric = (coeffDim != 4);
93  const bool const_c = c.Size() == 1;
94  MFEM_VERIFY(coeffDim < 3 ||
95  !const_c, "Constant matrix coefficient not supported");
96  const auto W = Reshape(w.Read(), Q1D,Q1D);
97  const auto J = Reshape(j.Read(), Q1D,Q1D,2,2,NE);
98  const auto C = const_c ? Reshape(c.Read(), 1,1,1,1) :
99  Reshape(c.Read(), coeffDim,Q1D,Q1D,NE);
100  auto D = Reshape(d.Write(), Q1D,Q1D, symmetric ? 3 : 4, NE);
101  MFEM_FORALL_2D(e, NE, Q1D,Q1D,1,
102  {
103  MFEM_FOREACH_THREAD(qx,x,Q1D)
104  {
105  MFEM_FOREACH_THREAD(qy,y,Q1D)
106  {
107  const double J11 = J(qx,qy,0,0,e);
108  const double J21 = J(qx,qy,1,0,e);
109  const double J12 = J(qx,qy,0,1,e);
110  const double J22 = J(qx,qy,1,1,e);
111  const double w_detJ = W(qx,qy) / ((J11*J22)-(J21*J12));
112  if (coeffDim == 3 || coeffDim == 4) // Matrix coefficient
113  {
114  // First compute entries of R = MJ^{-T}, without det J factor.
115  const double M11 = C(0,qx,qy,e);
116  const double M12 = C(1,qx,qy,e);
117  const double M21 = symmetric ? M12 : C(2,qx,qy,e);
118  const double M22 = symmetric ? C(2,qx,qy,e) : C(3,qx,qy,e);
119  const double R11 = M11*J22 - M12*J12;
120  const double R21 = M21*J22 - M22*J12;
121  const double R12 = -M11*J21 + M12*J11;
122  const double R22 = -M21*J21 + M22*J11;
123 
124  // Now set y to J^{-1}R.
125  D(qx,qy,0,e) = w_detJ * ( J22*R11 - J12*R21); // 1,1
126  D(qx,qy,1,e) = w_detJ * (-J21*R11 + J11*R21); // 2,1
127  D(qx,qy,2,e) = w_detJ * (symmetric ? (-J21*R12 + J11*R22) :
128  (J22*R12 - J12*R22)); // 2,2 or 1,2
129  if (!symmetric)
130  {
131  D(qx,qy,3,e) = w_detJ * (-J21*R12 + J11*R22); // 2,2
132  }
133  }
134  else // Vector or scalar coefficient
135  {
136  const double C1 = const_c ? C(0,0,0,0) : C(0,qx,qy,e);
137  const double C2 = const_c ? C(0,0,0,0) :
138  (coeffDim == 2 ? C(1,qx,qy,e) : C(0,qx,qy,e));
139 
140  D(qx,qy,0,e) = w_detJ * (C2*J12*J12 + C1*J22*J22); // 1,1
141  D(qx,qy,1,e) = -w_detJ * (C2*J12*J11 + C1*J22*J21); // 1,2
142  D(qx,qy,2,e) = w_detJ * (C2*J11*J11 + C1*J21*J21); // 2,2
143  }
144  }
145  }
146  });
147 }
148 
149 // PA Diffusion Assemble 2D kernel with 3D node coords
150 template<>
151 void PADiffusionSetup2D<3>(const int Q1D,
152  const int coeffDim,
153  const int NE,
154  const Array<double> &w,
155  const Vector &j,
156  const Vector &c,
157  Vector &d)
158 {
159  MFEM_VERIFY(coeffDim == 1, "Matrix and vector coefficients not supported");
160  constexpr int DIM = 2;
161  constexpr int SDIM = 3;
162  const bool const_c = c.Size() == 1;
163  const auto W = Reshape(w.Read(), Q1D,Q1D);
164  const auto J = Reshape(j.Read(), Q1D,Q1D,SDIM,DIM,NE);
165  const auto C = const_c ? Reshape(c.Read(), 1,1,1) :
166  Reshape(c.Read(), Q1D,Q1D,NE);
167  auto D = Reshape(d.Write(), Q1D,Q1D, 3, NE);
168  MFEM_FORALL_2D(e, NE, Q1D,Q1D,1,
169  {
170  MFEM_FOREACH_THREAD(qx,x,Q1D)
171  {
172  MFEM_FOREACH_THREAD(qy,y,Q1D)
173  {
174  const double wq = W(qx,qy);
175  const double J11 = J(qx,qy,0,0,e);
176  const double J21 = J(qx,qy,1,0,e);
177  const double J31 = J(qx,qy,2,0,e);
178  const double J12 = J(qx,qy,0,1,e);
179  const double J22 = J(qx,qy,1,1,e);
180  const double J32 = J(qx,qy,2,1,e);
181  const double E = J11*J11 + J21*J21 + J31*J31;
182  const double G = J12*J12 + J22*J22 + J32*J32;
183  const double F = J11*J12 + J21*J22 + J31*J32;
184  const double iw = 1.0 / sqrt(E*G - F*F);
185  const double coeff = const_c ? C(0,0,0) : C(qx,qy,e);
186  const double alpha = wq * coeff * iw;
187  D(qx,qy,0,e) = alpha * G; // 1,1
188  D(qx,qy,1,e) = -alpha * F; // 1,2
189  D(qx,qy,2,e) = alpha * E; // 2,2
190  }
191  }
192  });
193 }
194 
195 // PA Diffusion Assemble 3D kernel
196 void PADiffusionSetup3D(const int Q1D,
197  const int coeffDim,
198  const int NE,
199  const Array<double> &w,
200  const Vector &j,
201  const Vector &c,
202  Vector &d)
203 {
204  const bool symmetric = (coeffDim != 9);
205  const bool const_c = c.Size() == 1;
206  MFEM_VERIFY(coeffDim < 6 ||
207  !const_c, "Constant matrix coefficient not supported");
208  const auto W = Reshape(w.Read(), Q1D,Q1D,Q1D);
209  const auto J = Reshape(j.Read(), Q1D,Q1D,Q1D,3,3,NE);
210  const auto C = const_c ? Reshape(c.Read(), 1,1,1,1,1) :
211  Reshape(c.Read(), coeffDim,Q1D,Q1D,Q1D,NE);
212  auto D = Reshape(d.Write(), Q1D,Q1D,Q1D, symmetric ? 6 : 9, NE);
213  MFEM_FORALL_3D(e, NE, Q1D, Q1D, Q1D,
214  {
215  MFEM_FOREACH_THREAD(qx,x,Q1D)
216  {
217  MFEM_FOREACH_THREAD(qy,y,Q1D)
218  {
219  MFEM_FOREACH_THREAD(qz,z,Q1D)
220  {
221  const double J11 = J(qx,qy,qz,0,0,e);
222  const double J21 = J(qx,qy,qz,1,0,e);
223  const double J31 = J(qx,qy,qz,2,0,e);
224  const double J12 = J(qx,qy,qz,0,1,e);
225  const double J22 = J(qx,qy,qz,1,1,e);
226  const double J32 = J(qx,qy,qz,2,1,e);
227  const double J13 = J(qx,qy,qz,0,2,e);
228  const double J23 = J(qx,qy,qz,1,2,e);
229  const double J33 = J(qx,qy,qz,2,2,e);
230  const double detJ = J11 * (J22 * J33 - J32 * J23) -
231  /* */ J21 * (J12 * J33 - J32 * J13) +
232  /* */ J31 * (J12 * J23 - J22 * J13);
233  const double w_detJ = W(qx,qy,qz) / detJ;
234  // adj(J)
235  const double A11 = (J22 * J33) - (J23 * J32);
236  const double A12 = (J32 * J13) - (J12 * J33);
237  const double A13 = (J12 * J23) - (J22 * J13);
238  const double A21 = (J31 * J23) - (J21 * J33);
239  const double A22 = (J11 * J33) - (J13 * J31);
240  const double A23 = (J21 * J13) - (J11 * J23);
241  const double A31 = (J21 * J32) - (J31 * J22);
242  const double A32 = (J31 * J12) - (J11 * J32);
243  const double A33 = (J11 * J22) - (J12 * J21);
244 
245  if (coeffDim == 6 || coeffDim == 9) // Matrix coefficient version
246  {
247  // Compute entries of R = MJ^{-T} = M adj(J)^T, without det J.
248  const double M11 = C(0, qx,qy,qz, e);
249  const double M12 = C(1, qx,qy,qz, e);
250  const double M13 = C(2, qx,qy,qz, e);
251  const double M21 = (!symmetric) ? C(3, qx,qy,qz, e) : M12;
252  const double M22 = (!symmetric) ? C(4, qx,qy,qz, e) : C(3, qx,qy,qz, e);
253  const double M23 = (!symmetric) ? C(5, qx,qy,qz, e) : C(4, qx,qy,qz, e);
254  const double M31 = (!symmetric) ? C(6, qx,qy,qz, e) : M13;
255  const double M32 = (!symmetric) ? C(7, qx,qy,qz, e) : M23;
256  const double M33 = (!symmetric) ? C(8, qx,qy,qz, e) : C(5, qx,qy,qz, e);
257 
258  const double R11 = M11*A11 + M12*A12 + M13*A13;
259  const double R12 = M11*A21 + M12*A22 + M13*A23;
260  const double R13 = M11*A31 + M12*A32 + M13*A33;
261  const double R21 = M21*A11 + M22*A12 + M23*A13;
262  const double R22 = M21*A21 + M22*A22 + M23*A23;
263  const double R23 = M21*A31 + M22*A32 + M23*A33;
264  const double R31 = M31*A11 + M32*A12 + M33*A13;
265  const double R32 = M31*A21 + M32*A22 + M33*A23;
266  const double R33 = M31*A31 + M32*A32 + M33*A33;
267 
268  // Now set D to J^{-1} R = adj(J) R
269  D(qx,qy,qz,0,e) = w_detJ * (A11*R11 + A12*R21 + A13*R31); // 1,1
270  const double D12 = w_detJ * (A11*R12 + A12*R22 + A13*R32);
271  D(qx,qy,qz,1,e) = D12; // 1,2
272  D(qx,qy,qz,2,e) = w_detJ * (A11*R13 + A12*R23 + A13*R33); // 1,3
273 
274  const double D21 = w_detJ * (A21*R11 + A22*R21 + A23*R31);
275  const double D22 = w_detJ * (A21*R12 + A22*R22 + A23*R32);
276  const double D23 = w_detJ * (A21*R13 + A22*R23 + A23*R33);
277 
278  const double D33 = w_detJ * (A31*R13 + A32*R23 + A33*R33);
279 
280  D(qx,qy,qz,3,e) = symmetric ? D22 : D21; // 2,2 or 2,1
281  D(qx,qy,qz,4,e) = symmetric ? D23 : D22; // 2,3 or 2,2
282  D(qx,qy,qz,5,e) = symmetric ? D33 : D23; // 3,3 or 2,3
283 
284  if (!symmetric)
285  {
286  D(qx,qy,qz,6,e) = w_detJ * (A31*R11 + A32*R21 + A33*R31); // 3,1
287  D(qx,qy,qz,7,e) = w_detJ * (A31*R12 + A32*R22 + A33*R32); // 3,2
288  D(qx,qy,qz,8,e) = D33; // 3,3
289  }
290  }
291  else // Vector or scalar coefficient version
292  {
293  const double C1 = const_c ? C(0,0,0,0,0) : C(0,qx,qy,qz,e);
294  const double C2 = const_c ? C(0,0,0,0,0) :
295  (coeffDim == 3 ? C(1,qx,qy,qz,e) : C(0,qx,qy,qz,e));
296  const double C3 = const_c ? C(0,0,0,0,0) :
297  (coeffDim == 3 ? C(2,qx,qy,qz,e) : C(0,qx,qy,qz,e));
298 
299  // detJ J^{-1} J^{-T} = (1/detJ) adj(J) adj(J)^T
300  D(qx,qy,qz,0,e) = w_detJ * (C1*A11*A11 + C2*A12*A12 + C3*A13*A13); // 1,1
301  D(qx,qy,qz,1,e) = w_detJ * (C1*A11*A21 + C2*A12*A22 + C3*A13*A23); // 2,1
302  D(qx,qy,qz,2,e) = w_detJ * (C1*A11*A31 + C2*A12*A32 + C3*A13*A33); // 3,1
303  D(qx,qy,qz,3,e) = w_detJ * (C1*A21*A21 + C2*A22*A22 + C3*A23*A23); // 2,2
304  D(qx,qy,qz,4,e) = w_detJ * (C1*A21*A31 + C2*A22*A32 + C3*A23*A33); // 3,2
305  D(qx,qy,qz,5,e) = w_detJ * (C1*A31*A31 + C2*A32*A32 + C3*A33*A33); // 3,3
306  }
307  }
308  }
309  }
310  });
311 }
312 
313 static void PADiffusionSetup(const int dim,
314  const int sdim,
315  const int D1D,
316  const int Q1D,
317  const int coeffDim,
318  const int NE,
319  const Array<double> &W,
320  const Vector &J,
321  const Vector &C,
322  Vector &D)
323 {
324  if (dim == 1) { MFEM_ABORT("dim==1 not supported in PADiffusionSetup"); }
325  if (dim == 2)
326  {
327 #ifdef MFEM_USE_OCCA
328  if (DeviceCanUseOcca())
329  {
330  OccaPADiffusionSetup2D(D1D, Q1D, NE, W, J, C, D);
331  return;
332  }
333 #else
334  MFEM_CONTRACT_VAR(D1D);
335 #endif // MFEM_USE_OCCA
336  if (sdim == 2) { PADiffusionSetup2D<2>(Q1D, coeffDim, NE, W, J, C, D); }
337  if (sdim == 3) { PADiffusionSetup2D<3>(Q1D, coeffDim, NE, W, J, C, D); }
338  }
339  if (dim == 3)
340  {
341 #ifdef MFEM_USE_OCCA
342  if (DeviceCanUseOcca())
343  {
344  OccaPADiffusionSetup3D(D1D, Q1D, NE, W, J, C, D);
345  return;
346  }
347 #endif // MFEM_USE_OCCA
348  PADiffusionSetup3D(Q1D, coeffDim, NE, W, J, C, D);
349  }
350 }
351 
352 void DiffusionIntegrator::AssemblePA(const FiniteElementSpace &fes)
353 {
354  const MemoryType mt = (pa_mt == MemoryType::DEFAULT) ?
355  Device::GetDeviceMemoryType() : pa_mt;
356  // Assuming the same element type
357  fespace = &fes;
358  Mesh *mesh = fes.GetMesh();
359  if (mesh->GetNE() == 0) { return; }
360  const FiniteElement &el = *fes.GetFE(0);
361  const IntegrationRule *ir = IntRule ? IntRule : &GetRule(el, el);
362  if (DeviceCanUseCeed())
363  {
364  delete ceedOp;
365  MFEM_VERIFY(!VQ && !MQ && !SMQ,
366  "Only scalar coefficient supported for DiffusionIntegrator"
367  " with libCEED");
368  ceedOp = new ceed::PADiffusionIntegrator(fes, *ir, Q);
369  return;
370  }
371  const int dims = el.GetDim();
372  const int symmDims = (dims * (dims + 1)) / 2; // 1x1: 1, 2x2: 3, 3x3: 6
373  const int nq = ir->GetNPoints();
374  dim = mesh->Dimension();
375  ne = fes.GetNE();
376  geom = mesh->GetGeometricFactors(*ir, GeometricFactors::JACOBIANS, mt);
377  const int sdim = mesh->SpaceDimension();
378  maps = &el.GetDofToQuad(*ir, DofToQuad::TENSOR);
379  dofs1D = maps->ndof;
380  quad1D = maps->nqpt;
381  int coeffDim = 1;
382  Vector coeff;
383  const int MQfullDim = MQ ? MQ->GetHeight() * MQ->GetWidth() : 0;
384  if (MQ)
385  {
386  symmetric = false;
387  MFEM_VERIFY(MQ->GetHeight() == dim && MQ->GetWidth() == dim, "");
388 
389  coeffDim = MQfullDim;
390 
391  coeff.SetSize(MQfullDim * nq * ne);
392 
393  DenseMatrix M;
394  M.SetSize(dim);
395 
396  auto C = Reshape(coeff.HostWrite(), MQfullDim, nq, ne);
397  for (int e=0; e<ne; ++e)
398  {
400  for (int p=0; p<nq; ++p)
401  {
402  MQ->Eval(M, *tr, ir->IntPoint(p));
403  for (int i=0; i<dim; ++i)
404  for (int j=0; j<dim; ++j)
405  {
406  C(j+(i*dim), p, e) = M(i,j);
407  }
408  }
409  }
410  }
411  else if (SMQ)
412  {
413  MFEM_VERIFY(SMQ->GetSize() == dim, "");
414  coeffDim = symmDims;
415  coeff.SetSize(symmDims * nq * ne);
416 
418  M.SetSize(dim);
419 
420  auto C = Reshape(coeff.HostWrite(), symmDims, nq, ne);
421 
422  for (int e=0; e<ne; ++e)
423  {
425  for (int p=0; p<nq; ++p)
426  {
427  SMQ->Eval(M, *tr, ir->IntPoint(p));
428  int cnt = 0;
429  for (int i=0; i<dim; ++i)
430  for (int j=i; j<dim; ++j, ++cnt)
431  {
432  C(cnt, p, e) = M(i,j);
433  }
434  }
435  }
436  }
437  else if (VQ)
438  {
439  MFEM_VERIFY(VQ->GetVDim() == dim, "");
440  coeffDim = VQ->GetVDim();
441  coeff.SetSize(coeffDim * nq * ne);
442  auto C = Reshape(coeff.HostWrite(), coeffDim, nq, ne);
443  Vector D(coeffDim);
444  for (int e=0; e<ne; ++e)
445  {
447  for (int p=0; p<nq; ++p)
448  {
449  VQ->Eval(D, *tr, ir->IntPoint(p));
450  for (int i=0; i<coeffDim; ++i)
451  {
452  C(i, p, e) = D[i];
453  }
454  }
455  }
456  }
457  else if (Q == nullptr)
458  {
459  coeff.SetSize(1);
460  coeff(0) = 1.0;
461  }
462  else if (ConstantCoefficient* cQ = dynamic_cast<ConstantCoefficient*>(Q))
463  {
464  coeff.SetSize(1);
465  coeff(0) = cQ->constant;
466  }
467  else if (QuadratureFunctionCoefficient* cQ =
468  dynamic_cast<QuadratureFunctionCoefficient*>(Q))
469  {
470  const QuadratureFunction &qFun = cQ->GetQuadFunction();
471  MFEM_VERIFY(qFun.Size() == ne*nq,
472  "Incompatible QuadratureFunction dimension \n");
473 
474  MFEM_VERIFY(ir == &qFun.GetSpace()->GetElementIntRule(0),
475  "IntegrationRule used within integrator and in"
476  " QuadratureFunction appear to be different");
477  qFun.Read();
478  coeff.MakeRef(const_cast<QuadratureFunction &>(qFun),0);
479  }
480  else
481  {
482  coeff.SetSize(nq * ne);
483  auto C = Reshape(coeff.HostWrite(), nq, ne);
484  for (int e = 0; e < ne; ++e)
485  {
487  for (int q = 0; q < nq; ++q)
488  {
489  C(q,e) = Q->Eval(T, ir->IntPoint(q));
490  }
491  }
492  }
493  pa_data.SetSize((symmetric ? symmDims : MQfullDim) * nq * ne, mt);
494  PADiffusionSetup(dim, sdim, dofs1D, quad1D, coeffDim, ne, ir->GetWeights(),
495  geom->J, coeff, pa_data);
496 }
497 
498 template<int T_D1D = 0, int T_Q1D = 0>
499 static void PADiffusionDiagonal2D(const int NE,
500  const bool symmetric,
501  const Array<double> &b,
502  const Array<double> &g,
503  const Vector &d,
504  Vector &y,
505  const int d1d = 0,
506  const int q1d = 0)
507 {
508  const int D1D = T_D1D ? T_D1D : d1d;
509  const int Q1D = T_Q1D ? T_Q1D : q1d;
510  MFEM_VERIFY(D1D <= MAX_D1D, "");
511  MFEM_VERIFY(Q1D <= MAX_Q1D, "");
512  auto B = Reshape(b.Read(), Q1D, D1D);
513  auto G = Reshape(g.Read(), Q1D, D1D);
514  // note the different shape for D, if this is a symmetric matrix we only
515  // store necessary entries
516  auto D = Reshape(d.Read(), Q1D*Q1D, symmetric ? 3 : 4, NE);
517  auto Y = Reshape(y.ReadWrite(), D1D, D1D, NE);
518  MFEM_FORALL(e, NE,
519  {
520  const int D1D = T_D1D ? T_D1D : d1d;
521  const int Q1D = T_Q1D ? T_Q1D : q1d;
522  constexpr int MD1 = T_D1D ? T_D1D : MAX_D1D;
523  constexpr int MQ1 = T_Q1D ? T_Q1D : MAX_Q1D;
524  // gradphi \cdot Q \gradphi has four terms
525  double QD0[MQ1][MD1];
526  double QD1[MQ1][MD1];
527  double QD2[MQ1][MD1];
528  for (int qx = 0; qx < Q1D; ++qx)
529  {
530  for (int dy = 0; dy < D1D; ++dy)
531  {
532  QD0[qx][dy] = 0.0;
533  QD1[qx][dy] = 0.0;
534  QD2[qx][dy] = 0.0;
535  for (int qy = 0; qy < Q1D; ++qy)
536  {
537  const int q = qx + qy * Q1D;
538  const double D00 = D(q,0,e);
539  const double D10 = D(q,1,e);
540  const double D01 = symmetric ? D10 : D(q,2,e);
541  const double D11 = symmetric ? D(q,2,e) : D(q,3,e);
542  QD0[qx][dy] += B(qy, dy) * B(qy, dy) * D00;
543  QD1[qx][dy] += B(qy, dy) * G(qy, dy) * (D01 + D10);
544  QD2[qx][dy] += G(qy, dy) * G(qy, dy) * D11;
545  }
546  }
547  }
548  for (int dy = 0; dy < D1D; ++dy)
549  {
550  for (int dx = 0; dx < D1D; ++dx)
551  {
552  for (int qx = 0; qx < Q1D; ++qx)
553  {
554  Y(dx,dy,e) += G(qx, dx) * G(qx, dx) * QD0[qx][dy];
555  Y(dx,dy,e) += G(qx, dx) * B(qx, dx) * QD1[qx][dy];
556  Y(dx,dy,e) += B(qx, dx) * B(qx, dx) * QD2[qx][dy];
557  }
558  }
559  }
560  });
561 }
562 
563 // Shared memory PA Diffusion Diagonal 2D kernel
564 template<int T_D1D = 0, int T_Q1D = 0, int T_NBZ = 0>
565 static void SmemPADiffusionDiagonal2D(const int NE,
566  const bool symmetric,
567  const Array<double> &b_,
568  const Array<double> &g_,
569  const Vector &d_,
570  Vector &y_,
571  const int d1d = 0,
572  const int q1d = 0)
573 {
574  const int D1D = T_D1D ? T_D1D : d1d;
575  const int Q1D = T_Q1D ? T_Q1D : q1d;
576  constexpr int NBZ = T_NBZ ? T_NBZ : 1;
577  constexpr int MQ1 = T_Q1D ? T_Q1D : MAX_Q1D;
578  constexpr int MD1 = T_D1D ? T_D1D : MAX_D1D;
579  MFEM_VERIFY(D1D <= MD1, "");
580  MFEM_VERIFY(Q1D <= MQ1, "");
581  auto b = Reshape(b_.Read(), Q1D, D1D);
582  auto g = Reshape(g_.Read(), Q1D, D1D);
583  auto D = Reshape(d_.Read(), Q1D*Q1D, symmetric ? 3 : 4, NE);
584  auto Y = Reshape(y_.ReadWrite(), D1D, D1D, NE);
585  MFEM_FORALL_2D(e, NE, Q1D, Q1D, NBZ,
586  {
587  const int tidz = MFEM_THREAD_ID(z);
588  const int D1D = T_D1D ? T_D1D : d1d;
589  const int Q1D = T_Q1D ? T_Q1D : q1d;
590  constexpr int NBZ = T_NBZ ? T_NBZ : 1;
591  constexpr int MQ1 = T_Q1D ? T_Q1D : MAX_Q1D;
592  constexpr int MD1 = T_D1D ? T_D1D : MAX_D1D;
593  MFEM_SHARED double BG[2][MQ1*MD1];
594  double (*B)[MD1] = (double (*)[MD1]) (BG+0);
595  double (*G)[MD1] = (double (*)[MD1]) (BG+1);
596  MFEM_SHARED double QD[3][NBZ][MD1][MQ1];
597  double (*QD0)[MD1] = (double (*)[MD1])(QD[0] + tidz);
598  double (*QD1)[MD1] = (double (*)[MD1])(QD[1] + tidz);
599  double (*QD2)[MD1] = (double (*)[MD1])(QD[2] + tidz);
600  if (tidz == 0)
601  {
602  MFEM_FOREACH_THREAD(d,y,D1D)
603  {
604  MFEM_FOREACH_THREAD(q,x,Q1D)
605  {
606  B[q][d] = b(q,d);
607  G[q][d] = g(q,d);
608  }
609  }
610  }
611  MFEM_SYNC_THREAD;
612  MFEM_FOREACH_THREAD(qx,x,Q1D)
613  {
614  MFEM_FOREACH_THREAD(dy,y,D1D)
615  {
616  QD0[qx][dy] = 0.0;
617  QD1[qx][dy] = 0.0;
618  QD2[qx][dy] = 0.0;
619  for (int qy = 0; qy < Q1D; ++qy)
620  {
621  const int q = qx + qy * Q1D;
622  const double D00 = D(q,0,e);
623  const double D10 = D(q,1,e);
624  const double D01 = symmetric ? D10 : D(q,2,e);
625  const double D11 = symmetric ? D(q,2,e) : D(q,3,e);
626  const double By = B[qy][dy];
627  const double Gy = G[qy][dy];
628  const double BB = By * By;
629  const double BG = By * Gy;
630  const double GG = Gy * Gy;
631  QD0[qx][dy] += BB * D00;
632  QD1[qx][dy] += BG * (D01 + D10);
633  QD2[qx][dy] += GG * D11;
634  }
635  }
636  }
637  MFEM_SYNC_THREAD;
638  MFEM_FOREACH_THREAD(dy,y,D1D)
639  {
640  MFEM_FOREACH_THREAD(dx,x,D1D)
641  {
642  for (int qx = 0; qx < Q1D; ++qx)
643  {
644  const double Bx = B[qx][dx];
645  const double Gx = G[qx][dx];
646  const double BB = Bx * Bx;
647  const double BG = Bx * Gx;
648  const double GG = Gx * Gx;
649  Y(dx,dy,e) += GG * QD0[qx][dy];
650  Y(dx,dy,e) += BG * QD1[qx][dy];
651  Y(dx,dy,e) += BB * QD2[qx][dy];
652  }
653  }
654  }
655  });
656 }
657 
658 template<int T_D1D = 0, int T_Q1D = 0>
659 static void PADiffusionDiagonal3D(const int NE,
660  const bool symmetric,
661  const Array<double> &b,
662  const Array<double> &g,
663  const Vector &d,
664  Vector &y,
665  const int d1d = 0,
666  const int q1d = 0)
667 {
668  constexpr int DIM = 3;
669  const int D1D = T_D1D ? T_D1D : d1d;
670  const int Q1D = T_Q1D ? T_Q1D : q1d;
671  constexpr int MQ1 = T_Q1D ? T_Q1D : MAX_Q1D;
672  constexpr int MD1 = T_D1D ? T_D1D : MAX_D1D;
673  MFEM_VERIFY(D1D <= MD1, "");
674  MFEM_VERIFY(Q1D <= MQ1, "");
675  auto B = Reshape(b.Read(), Q1D, D1D);
676  auto G = Reshape(g.Read(), Q1D, D1D);
677  auto Q = Reshape(d.Read(), Q1D*Q1D*Q1D, symmetric ? 6 : 9, NE);
678  auto Y = Reshape(y.ReadWrite(), D1D, D1D, D1D, NE);
679  MFEM_FORALL(e, NE,
680  {
681  const int D1D = T_D1D ? T_D1D : d1d;
682  const int Q1D = T_Q1D ? T_Q1D : q1d;
683  constexpr int MD1 = T_D1D ? T_D1D : MAX_D1D;
684  constexpr int MQ1 = T_Q1D ? T_Q1D : MAX_Q1D;
685  double QQD[MQ1][MQ1][MD1];
686  double QDD[MQ1][MD1][MD1];
687  for (int i = 0; i < DIM; ++i)
688  {
689  for (int j = 0; j < DIM; ++j)
690  {
691  // first tensor contraction, along z direction
692  for (int qx = 0; qx < Q1D; ++qx)
693  {
694  for (int qy = 0; qy < Q1D; ++qy)
695  {
696  for (int dz = 0; dz < D1D; ++dz)
697  {
698  QQD[qx][qy][dz] = 0.0;
699  for (int qz = 0; qz < Q1D; ++qz)
700  {
701  const int q = qx + (qy + qz * Q1D) * Q1D;
702  const int ksym = j >= i ?
703  3 - (3-i)*(2-i)/2 + j:
704  3 - (3-j)*(2-j)/2 + i;
705  const int k = symmetric ? ksym : (i*DIM) + j;
706  const double O = Q(q,k,e);
707  const double Bz = B(qz,dz);
708  const double Gz = G(qz,dz);
709  const double L = i==2 ? Gz : Bz;
710  const double R = j==2 ? Gz : Bz;
711  QQD[qx][qy][dz] += L * O * R;
712  }
713  }
714  }
715  }
716  // second tensor contraction, along y direction
717  for (int qx = 0; qx < Q1D; ++qx)
718  {
719  for (int dz = 0; dz < D1D; ++dz)
720  {
721  for (int dy = 0; dy < D1D; ++dy)
722  {
723  QDD[qx][dy][dz] = 0.0;
724  for (int qy = 0; qy < Q1D; ++qy)
725  {
726  const double By = B(qy,dy);
727  const double Gy = G(qy,dy);
728  const double L = i==1 ? Gy : By;
729  const double R = j==1 ? Gy : By;
730  QDD[qx][dy][dz] += L * QQD[qx][qy][dz] * R;
731  }
732  }
733  }
734  }
735  // third tensor contraction, along x direction
736  for (int dz = 0; dz < D1D; ++dz)
737  {
738  for (int dy = 0; dy < D1D; ++dy)
739  {
740  for (int dx = 0; dx < D1D; ++dx)
741  {
742  for (int qx = 0; qx < Q1D; ++qx)
743  {
744  const double Bx = B(qx,dx);
745  const double Gx = G(qx,dx);
746  const double L = i==0 ? Gx : Bx;
747  const double R = j==0 ? Gx : Bx;
748  Y(dx, dy, dz, e) += L * QDD[qx][dy][dz] * R;
749  }
750  }
751  }
752  }
753  }
754  }
755  });
756 }
757 
758 // Shared memory PA Diffusion Diagonal 3D kernel
759 template<int T_D1D = 0, int T_Q1D = 0>
760 static void SmemPADiffusionDiagonal3D(const int NE,
761  const bool symmetric,
762  const Array<double> &b_,
763  const Array<double> &g_,
764  const Vector &d_,
765  Vector &y_,
766  const int d1d = 0,
767  const int q1d = 0)
768 {
769  constexpr int DIM = 3;
770  const int D1D = T_D1D ? T_D1D : d1d;
771  const int Q1D = T_Q1D ? T_Q1D : q1d;
772  constexpr int MQ1 = T_Q1D ? T_Q1D : MAX_Q1D;
773  constexpr int MD1 = T_D1D ? T_D1D : MAX_D1D;
774  MFEM_VERIFY(D1D <= MD1, "");
775  MFEM_VERIFY(Q1D <= MQ1, "");
776  auto b = Reshape(b_.Read(), Q1D, D1D);
777  auto g = Reshape(g_.Read(), Q1D, D1D);
778  auto D = Reshape(d_.Read(), Q1D*Q1D*Q1D, symmetric ? 6 : 9, NE);
779  auto Y = Reshape(y_.ReadWrite(), D1D, D1D, D1D, NE);
780  MFEM_FORALL_3D(e, NE, Q1D, Q1D, Q1D,
781  {
782  const int tidz = MFEM_THREAD_ID(z);
783  const int D1D = T_D1D ? T_D1D : d1d;
784  const int Q1D = T_Q1D ? T_Q1D : q1d;
785  constexpr int MQ1 = T_Q1D ? T_Q1D : MAX_Q1D;
786  constexpr int MD1 = T_D1D ? T_D1D : MAX_D1D;
787  MFEM_SHARED double BG[2][MQ1*MD1];
788  double (*B)[MD1] = (double (*)[MD1]) (BG+0);
789  double (*G)[MD1] = (double (*)[MD1]) (BG+1);
790  MFEM_SHARED double QQD[MQ1][MQ1][MD1];
791  MFEM_SHARED double QDD[MQ1][MD1][MD1];
792  if (tidz == 0)
793  {
794  MFEM_FOREACH_THREAD(d,y,D1D)
795  {
796  MFEM_FOREACH_THREAD(q,x,Q1D)
797  {
798  B[q][d] = b(q,d);
799  G[q][d] = g(q,d);
800  }
801  }
802  }
803  MFEM_SYNC_THREAD;
804  for (int i = 0; i < DIM; ++i)
805  {
806  for (int j = 0; j < DIM; ++j)
807  {
808  // first tensor contraction, along z direction
809  MFEM_FOREACH_THREAD(qx,x,Q1D)
810  {
811  MFEM_FOREACH_THREAD(qy,y,Q1D)
812  {
813  MFEM_FOREACH_THREAD(dz,z,D1D)
814  {
815  QQD[qx][qy][dz] = 0.0;
816  for (int qz = 0; qz < Q1D; ++qz)
817  {
818  const int q = qx + (qy + qz * Q1D) * Q1D;
819  const int ksym = j >= i ?
820  3 - (3-i)*(2-i)/2 + j:
821  3 - (3-j)*(2-j)/2 + i;
822  const int k = symmetric ? ksym : (i*DIM) + j;
823  const double O = D(q,k,e);
824  const double Bz = B[qz][dz];
825  const double Gz = G[qz][dz];
826  const double L = i==2 ? Gz : Bz;
827  const double R = j==2 ? Gz : Bz;
828  QQD[qx][qy][dz] += L * O * R;
829  }
830  }
831  }
832  }
833  MFEM_SYNC_THREAD;
834  // second tensor contraction, along y direction
835  MFEM_FOREACH_THREAD(qx,x,Q1D)
836  {
837  MFEM_FOREACH_THREAD(dz,z,D1D)
838  {
839  MFEM_FOREACH_THREAD(dy,y,D1D)
840  {
841  QDD[qx][dy][dz] = 0.0;
842  for (int qy = 0; qy < Q1D; ++qy)
843  {
844  const double By = B[qy][dy];
845  const double Gy = G[qy][dy];
846  const double L = i==1 ? Gy : By;
847  const double R = j==1 ? Gy : By;
848  QDD[qx][dy][dz] += L * QQD[qx][qy][dz] * R;
849  }
850  }
851  }
852  }
853  MFEM_SYNC_THREAD;
854  // third tensor contraction, along x direction
855  MFEM_FOREACH_THREAD(dz,z,D1D)
856  {
857  MFEM_FOREACH_THREAD(dy,y,D1D)
858  {
859  MFEM_FOREACH_THREAD(dx,x,D1D)
860  {
861  for (int qx = 0; qx < Q1D; ++qx)
862  {
863  const double Bx = B[qx][dx];
864  const double Gx = G[qx][dx];
865  const double L = i==0 ? Gx : Bx;
866  const double R = j==0 ? Gx : Bx;
867  Y(dx, dy, dz, e) += L * QDD[qx][dy][dz] * R;
868  }
869  }
870  }
871  }
872  }
873  }
874  });
875 }
876 
877 static void PADiffusionAssembleDiagonal(const int dim,
878  const int D1D,
879  const int Q1D,
880  const int NE,
881  const bool symm,
882  const Array<double> &B,
883  const Array<double> &G,
884  const Vector &D,
885  Vector &Y)
886 {
887  if (dim == 2)
888  {
889  switch ((D1D << 4 ) | Q1D)
890  {
891  case 0x22: return SmemPADiffusionDiagonal2D<2,2,8>(NE,symm,B,G,D,Y);
892  case 0x33: return SmemPADiffusionDiagonal2D<3,3,8>(NE,symm,B,G,D,Y);
893  case 0x44: return SmemPADiffusionDiagonal2D<4,4,4>(NE,symm,B,G,D,Y);
894  case 0x55: return SmemPADiffusionDiagonal2D<5,5,4>(NE,symm,B,G,D,Y);
895  case 0x66: return SmemPADiffusionDiagonal2D<6,6,2>(NE,symm,B,G,D,Y);
896  case 0x77: return SmemPADiffusionDiagonal2D<7,7,2>(NE,symm,B,G,D,Y);
897  case 0x88: return SmemPADiffusionDiagonal2D<8,8,1>(NE,symm,B,G,D,Y);
898  case 0x99: return SmemPADiffusionDiagonal2D<9,9,1>(NE,symm,B,G,D,Y);
899  default: return PADiffusionDiagonal2D(NE,symm,B,G,D,Y,D1D,Q1D);
900  }
901  }
902  else if (dim == 3)
903  {
904  switch ((D1D << 4 ) | Q1D)
905  {
906  case 0x23: return SmemPADiffusionDiagonal3D<2,3>(NE,symm,B,G,D,Y);
907  case 0x34: return SmemPADiffusionDiagonal3D<3,4>(NE,symm,B,G,D,Y);
908  case 0x45: return SmemPADiffusionDiagonal3D<4,5>(NE,symm,B,G,D,Y);
909  case 0x56: return SmemPADiffusionDiagonal3D<5,6>(NE,symm,B,G,D,Y);
910  case 0x67: return SmemPADiffusionDiagonal3D<6,7>(NE,symm,B,G,D,Y);
911  case 0x78: return SmemPADiffusionDiagonal3D<7,8>(NE,symm,B,G,D,Y);
912  case 0x89: return SmemPADiffusionDiagonal3D<8,9>(NE,symm,B,G,D,Y);
913  case 0x9A: return SmemPADiffusionDiagonal3D<9,10>(NE,symm,B,G,D,Y);
914  default: return PADiffusionDiagonal3D(NE,symm,B,G,D,Y,D1D,Q1D);
915  }
916  }
917  MFEM_ABORT("Unknown kernel.");
918 }
919 
920 void DiffusionIntegrator::AssembleDiagonalPA(Vector &diag)
921 {
922  if (DeviceCanUseCeed())
923  {
924  ceedOp->GetDiagonal(diag);
925  }
926  else
927  {
928  if (pa_data.Size()==0) { AssemblePA(*fespace); }
929  PADiffusionAssembleDiagonal(dim, dofs1D, quad1D, ne, symmetric,
930  maps->B, maps->G, pa_data, diag);
931  }
932 }
933 
934 
935 #ifdef MFEM_USE_OCCA
936 // OCCA PA Diffusion Apply 2D kernel
937 static void OccaPADiffusionApply2D(const int D1D,
938  const int Q1D,
939  const int NE,
940  const Array<double> &B,
941  const Array<double> &G,
942  const Array<double> &Bt,
943  const Array<double> &Gt,
944  const Vector &D,
945  const Vector &X,
946  Vector &Y)
947 {
948  occa::properties props;
949  props["defines/D1D"] = D1D;
950  props["defines/Q1D"] = Q1D;
951  const occa::memory o_B = OccaMemoryRead(B.GetMemory(), B.Size());
952  const occa::memory o_G = OccaMemoryRead(G.GetMemory(), G.Size());
953  const occa::memory o_Bt = OccaMemoryRead(Bt.GetMemory(), Bt.Size());
954  const occa::memory o_Gt = OccaMemoryRead(Gt.GetMemory(), Gt.Size());
955  const occa::memory o_D = OccaMemoryRead(D.GetMemory(), D.Size());
956  const occa::memory o_X = OccaMemoryRead(X.GetMemory(), X.Size());
957  occa::memory o_Y = OccaMemoryReadWrite(Y.GetMemory(), Y.Size());
958  const occa_id_t id = std::make_pair(D1D,Q1D);
959  if (!Device::Allows(Backend::OCCA_CUDA))
960  {
961  static occa_kernel_t OccaDiffApply2D_cpu;
962  if (OccaDiffApply2D_cpu.find(id) == OccaDiffApply2D_cpu.end())
963  {
964  const occa::kernel DiffusionApply2D_CPU =
965  mfem::OccaDev().buildKernel("occa://mfem/fem/occa.okl",
966  "DiffusionApply2D_CPU", props);
967  OccaDiffApply2D_cpu.emplace(id, DiffusionApply2D_CPU);
968  }
969  OccaDiffApply2D_cpu.at(id)(NE, o_B, o_G, o_Bt, o_Gt, o_D, o_X, o_Y);
970  }
971  else
972  {
973  static occa_kernel_t OccaDiffApply2D_gpu;
974  if (OccaDiffApply2D_gpu.find(id) == OccaDiffApply2D_gpu.end())
975  {
976  const occa::kernel DiffusionApply2D_GPU =
977  mfem::OccaDev().buildKernel("occa://mfem/fem/occa.okl",
978  "DiffusionApply2D_GPU", props);
979  OccaDiffApply2D_gpu.emplace(id, DiffusionApply2D_GPU);
980  }
981  OccaDiffApply2D_gpu.at(id)(NE, o_B, o_G, o_Bt, o_Gt, o_D, o_X, o_Y);
982  }
983 }
984 
985 // OCCA PA Diffusion Apply 3D kernel
986 static void OccaPADiffusionApply3D(const int D1D,
987  const int Q1D,
988  const int NE,
989  const Array<double> &B,
990  const Array<double> &G,
991  const Array<double> &Bt,
992  const Array<double> &Gt,
993  const Vector &D,
994  const Vector &X,
995  Vector &Y)
996 {
997  occa::properties props;
998  props["defines/D1D"] = D1D;
999  props["defines/Q1D"] = Q1D;
1000  const occa::memory o_B = OccaMemoryRead(B.GetMemory(), B.Size());
1001  const occa::memory o_G = OccaMemoryRead(G.GetMemory(), G.Size());
1002  const occa::memory o_Bt = OccaMemoryRead(Bt.GetMemory(), Bt.Size());
1003  const occa::memory o_Gt = OccaMemoryRead(Gt.GetMemory(), Gt.Size());
1004  const occa::memory o_D = OccaMemoryRead(D.GetMemory(), D.Size());
1005  const occa::memory o_X = OccaMemoryRead(X.GetMemory(), X.Size());
1006  occa::memory o_Y = OccaMemoryReadWrite(Y.GetMemory(), Y.Size());
1007  const occa_id_t id = std::make_pair(D1D,Q1D);
1008  if (!Device::Allows(Backend::OCCA_CUDA))
1009  {
1010  static occa_kernel_t OccaDiffApply3D_cpu;
1011  if (OccaDiffApply3D_cpu.find(id) == OccaDiffApply3D_cpu.end())
1012  {
1013  const occa::kernel DiffusionApply3D_CPU =
1014  mfem::OccaDev().buildKernel("occa://mfem/fem/occa.okl",
1015  "DiffusionApply3D_CPU", props);
1016  OccaDiffApply3D_cpu.emplace(id, DiffusionApply3D_CPU);
1017  }
1018  OccaDiffApply3D_cpu.at(id)(NE, o_B, o_G, o_Bt, o_Gt, o_D, o_X, o_Y);
1019  }
1020  else
1021  {
1022  static occa_kernel_t OccaDiffApply3D_gpu;
1023  if (OccaDiffApply3D_gpu.find(id) == OccaDiffApply3D_gpu.end())
1024  {
1025  const occa::kernel DiffusionApply3D_GPU =
1026  mfem::OccaDev().buildKernel("occa://mfem/fem/occa.okl",
1027  "DiffusionApply3D_GPU", props);
1028  OccaDiffApply3D_gpu.emplace(id, DiffusionApply3D_GPU);
1029  }
1030  OccaDiffApply3D_gpu.at(id)(NE, o_B, o_G, o_Bt, o_Gt, o_D, o_X, o_Y);
1031  }
1032 }
1033 #endif // MFEM_USE_OCCA
1034 
1035 // PA Diffusion Apply 2D kernel
1036 template<int T_D1D = 0, int T_Q1D = 0>
1037 static void PADiffusionApply2D(const int NE,
1038  const bool symmetric,
1039  const Array<double> &b_,
1040  const Array<double> &g_,
1041  const Array<double> &bt_,
1042  const Array<double> &gt_,
1043  const Vector &d_,
1044  const Vector &x_,
1045  Vector &y_,
1046  const int d1d = 0,
1047  const int q1d = 0)
1048 {
1049  const int D1D = T_D1D ? T_D1D : d1d;
1050  const int Q1D = T_Q1D ? T_Q1D : q1d;
1051  MFEM_VERIFY(D1D <= MAX_D1D, "");
1052  MFEM_VERIFY(Q1D <= MAX_Q1D, "");
1053  auto B = Reshape(b_.Read(), Q1D, D1D);
1054  auto G = Reshape(g_.Read(), Q1D, D1D);
1055  auto Bt = Reshape(bt_.Read(), D1D, Q1D);
1056  auto Gt = Reshape(gt_.Read(), D1D, Q1D);
1057  auto D = Reshape(d_.Read(), Q1D*Q1D, symmetric ? 3 : 4, NE);
1058  auto X = Reshape(x_.Read(), D1D, D1D, NE);
1059  auto Y = Reshape(y_.ReadWrite(), D1D, D1D, NE);
1060  MFEM_FORALL(e, NE,
1061  {
1062  const int D1D = T_D1D ? T_D1D : d1d;
1063  const int Q1D = T_Q1D ? T_Q1D : q1d;
1064  // the following variables are evaluated at compile time
1065  constexpr int max_D1D = T_D1D ? T_D1D : MAX_D1D;
1066  constexpr int max_Q1D = T_Q1D ? T_Q1D : MAX_Q1D;
1067 
1068  double grad[max_Q1D][max_Q1D][2];
1069  for (int qy = 0; qy < Q1D; ++qy)
1070  {
1071  for (int qx = 0; qx < Q1D; ++qx)
1072  {
1073  grad[qy][qx][0] = 0.0;
1074  grad[qy][qx][1] = 0.0;
1075  }
1076  }
1077  for (int dy = 0; dy < D1D; ++dy)
1078  {
1079  double gradX[max_Q1D][2];
1080  for (int qx = 0; qx < Q1D; ++qx)
1081  {
1082  gradX[qx][0] = 0.0;
1083  gradX[qx][1] = 0.0;
1084  }
1085  for (int dx = 0; dx < D1D; ++dx)
1086  {
1087  const double s = X(dx,dy,e);
1088  for (int qx = 0; qx < Q1D; ++qx)
1089  {
1090  gradX[qx][0] += s * B(qx,dx);
1091  gradX[qx][1] += s * G(qx,dx);
1092  }
1093  }
1094  for (int qy = 0; qy < Q1D; ++qy)
1095  {
1096  const double wy = B(qy,dy);
1097  const double wDy = G(qy,dy);
1098  for (int qx = 0; qx < Q1D; ++qx)
1099  {
1100  grad[qy][qx][0] += gradX[qx][1] * wy;
1101  grad[qy][qx][1] += gradX[qx][0] * wDy;
1102  }
1103  }
1104  }
1105  // Calculate Dxy, xDy in plane
1106  for (int qy = 0; qy < Q1D; ++qy)
1107  {
1108  for (int qx = 0; qx < Q1D; ++qx)
1109  {
1110  const int q = qx + qy * Q1D;
1111 
1112  const double O11 = D(q,0,e);
1113  const double O21 = D(q,1,e);
1114  const double O12 = symmetric ? O21 : D(q,2,e);
1115  const double O22 = symmetric ? D(q,2,e) : D(q,3,e);
1116 
1117  const double gradX = grad[qy][qx][0];
1118  const double gradY = grad[qy][qx][1];
1119 
1120  grad[qy][qx][0] = (O11 * gradX) + (O12 * gradY);
1121  grad[qy][qx][1] = (O21 * gradX) + (O22 * gradY);
1122  }
1123  }
1124  for (int qy = 0; qy < Q1D; ++qy)
1125  {
1126  double gradX[max_D1D][2];
1127  for (int dx = 0; dx < D1D; ++dx)
1128  {
1129  gradX[dx][0] = 0;
1130  gradX[dx][1] = 0;
1131  }
1132  for (int qx = 0; qx < Q1D; ++qx)
1133  {
1134  const double gX = grad[qy][qx][0];
1135  const double gY = grad[qy][qx][1];
1136  for (int dx = 0; dx < D1D; ++dx)
1137  {
1138  const double wx = Bt(dx,qx);
1139  const double wDx = Gt(dx,qx);
1140  gradX[dx][0] += gX * wDx;
1141  gradX[dx][1] += gY * wx;
1142  }
1143  }
1144  for (int dy = 0; dy < D1D; ++dy)
1145  {
1146  const double wy = Bt(dy,qy);
1147  const double wDy = Gt(dy,qy);
1148  for (int dx = 0; dx < D1D; ++dx)
1149  {
1150  Y(dx,dy,e) += ((gradX[dx][0] * wy) + (gradX[dx][1] * wDy));
1151  }
1152  }
1153  }
1154  });
1155 }
1156 
1157 // Shared memory PA Diffusion Apply 2D kernel
1158 template<int T_D1D = 0, int T_Q1D = 0, int T_NBZ = 0>
1159 static void SmemPADiffusionApply2D(const int NE,
1160  const bool symmetric,
1161  const Array<double> &b_,
1162  const Array<double> &g_,
1163  const Vector &d_,
1164  const Vector &x_,
1165  Vector &y_,
1166  const int d1d = 0,
1167  const int q1d = 0)
1168 {
1169  const int D1D = T_D1D ? T_D1D : d1d;
1170  const int Q1D = T_Q1D ? T_Q1D : q1d;
1171  constexpr int NBZ = T_NBZ ? T_NBZ : 1;
1172  constexpr int MQ1 = T_Q1D ? T_Q1D : MAX_Q1D;
1173  constexpr int MD1 = T_D1D ? T_D1D : MAX_D1D;
1174  MFEM_VERIFY(D1D <= MD1, "");
1175  MFEM_VERIFY(Q1D <= MQ1, "");
1176  auto b = Reshape(b_.Read(), Q1D, D1D);
1177  auto g = Reshape(g_.Read(), Q1D, D1D);
1178  auto D = Reshape(d_.Read(), Q1D*Q1D, symmetric ? 3 : 4, NE);
1179  auto x = Reshape(x_.Read(), D1D, D1D, NE);
1180  auto Y = Reshape(y_.ReadWrite(), D1D, D1D, NE);
1181  MFEM_FORALL_2D(e, NE, Q1D, Q1D, NBZ,
1182  {
1183  const int tidz = MFEM_THREAD_ID(z);
1184  const int D1D = T_D1D ? T_D1D : d1d;
1185  const int Q1D = T_Q1D ? T_Q1D : q1d;
1186  constexpr int NBZ = T_NBZ ? T_NBZ : 1;
1187  constexpr int MQ1 = T_Q1D ? T_Q1D : MAX_Q1D;
1188  constexpr int MD1 = T_D1D ? T_D1D : MAX_D1D;
1189  MFEM_SHARED double sBG[2][MQ1*MD1];
1190  double (*B)[MD1] = (double (*)[MD1]) (sBG+0);
1191  double (*G)[MD1] = (double (*)[MD1]) (sBG+1);
1192  double (*Bt)[MQ1] = (double (*)[MQ1]) (sBG+0);
1193  double (*Gt)[MQ1] = (double (*)[MQ1]) (sBG+1);
1194  MFEM_SHARED double Xz[NBZ][MD1][MD1];
1195  MFEM_SHARED double GD[2][NBZ][MD1][MQ1];
1196  MFEM_SHARED double GQ[2][NBZ][MD1][MQ1];
1197  double (*X)[MD1] = (double (*)[MD1])(Xz + tidz);
1198  double (*DQ0)[MD1] = (double (*)[MD1])(GD[0] + tidz);
1199  double (*DQ1)[MD1] = (double (*)[MD1])(GD[1] + tidz);
1200  double (*QQ0)[MD1] = (double (*)[MD1])(GQ[0] + tidz);
1201  double (*QQ1)[MD1] = (double (*)[MD1])(GQ[1] + tidz);
1202  MFEM_FOREACH_THREAD(dy,y,D1D)
1203  {
1204  MFEM_FOREACH_THREAD(dx,x,D1D)
1205  {
1206  X[dy][dx] = x(dx,dy,e);
1207  }
1208  }
1209  if (tidz == 0)
1210  {
1211  MFEM_FOREACH_THREAD(dy,y,D1D)
1212  {
1213  MFEM_FOREACH_THREAD(q,x,Q1D)
1214  {
1215  B[q][dy] = b(q,dy);
1216  G[q][dy] = g(q,dy);
1217  }
1218  }
1219  }
1220  MFEM_SYNC_THREAD;
1221  MFEM_FOREACH_THREAD(dy,y,D1D)
1222  {
1223  MFEM_FOREACH_THREAD(qx,x,Q1D)
1224  {
1225  double u = 0.0;
1226  double v = 0.0;
1227  for (int dx = 0; dx < D1D; ++dx)
1228  {
1229  const double coords = X[dy][dx];
1230  u += B[qx][dx] * coords;
1231  v += G[qx][dx] * coords;
1232  }
1233  DQ0[dy][qx] = u;
1234  DQ1[dy][qx] = v;
1235  }
1236  }
1237  MFEM_SYNC_THREAD;
1238  MFEM_FOREACH_THREAD(qy,y,Q1D)
1239  {
1240  MFEM_FOREACH_THREAD(qx,x,Q1D)
1241  {
1242  double u = 0.0;
1243  double v = 0.0;
1244  for (int dy = 0; dy < D1D; ++dy)
1245  {
1246  u += DQ1[dy][qx] * B[qy][dy];
1247  v += DQ0[dy][qx] * G[qy][dy];
1248  }
1249  QQ0[qy][qx] = u;
1250  QQ1[qy][qx] = v;
1251  }
1252  }
1253  MFEM_SYNC_THREAD;
1254  MFEM_FOREACH_THREAD(qy,y,Q1D)
1255  {
1256  MFEM_FOREACH_THREAD(qx,x,Q1D)
1257  {
1258  const int q = (qx + ((qy) * Q1D));
1259  const double O11 = D(q,0,e);
1260  const double O21 = D(q,1,e);
1261  const double O12 = symmetric ? O21 : D(q,2,e);
1262  const double O22 = symmetric ? D(q,2,e) : D(q,3,e);
1263  const double gX = QQ0[qy][qx];
1264  const double gY = QQ1[qy][qx];
1265  QQ0[qy][qx] = (O11 * gX) + (O12 * gY);
1266  QQ1[qy][qx] = (O21 * gX) + (O22 * gY);
1267  }
1268  }
1269  MFEM_SYNC_THREAD;
1270  if (tidz == 0)
1271  {
1272  MFEM_FOREACH_THREAD(dy,y,D1D)
1273  {
1274  MFEM_FOREACH_THREAD(q,x,Q1D)
1275  {
1276  Bt[dy][q] = b(q,dy);
1277  Gt[dy][q] = g(q,dy);
1278  }
1279  }
1280  }
1281  MFEM_SYNC_THREAD;
1282  MFEM_FOREACH_THREAD(qy,y,Q1D)
1283  {
1284  MFEM_FOREACH_THREAD(dx,x,D1D)
1285  {
1286  double u = 0.0;
1287  double v = 0.0;
1288  for (int qx = 0; qx < Q1D; ++qx)
1289  {
1290  u += Gt[dx][qx] * QQ0[qy][qx];
1291  v += Bt[dx][qx] * QQ1[qy][qx];
1292  }
1293  DQ0[qy][dx] = u;
1294  DQ1[qy][dx] = v;
1295  }
1296  }
1297  MFEM_SYNC_THREAD;
1298  MFEM_FOREACH_THREAD(dy,y,D1D)
1299  {
1300  MFEM_FOREACH_THREAD(dx,x,D1D)
1301  {
1302  double u = 0.0;
1303  double v = 0.0;
1304  for (int qy = 0; qy < Q1D; ++qy)
1305  {
1306  u += DQ0[qy][dx] * Bt[dy][qy];
1307  v += DQ1[qy][dx] * Gt[dy][qy];
1308  }
1309  Y(dx,dy,e) += (u + v);
1310  }
1311  }
1312  });
1313 }
1314 
1315 // PA Diffusion Apply 3D kernel
1316 template<int T_D1D = 0, int T_Q1D = 0>
1317 static void PADiffusionApply3D(const int NE,
1318  const bool symmetric,
1319  const Array<double> &b,
1320  const Array<double> &g,
1321  const Array<double> &bt,
1322  const Array<double> &gt,
1323  const Vector &d_,
1324  const Vector &x_,
1325  Vector &y_,
1326  int d1d = 0, int q1d = 0)
1327 {
1328  const int D1D = T_D1D ? T_D1D : d1d;
1329  const int Q1D = T_Q1D ? T_Q1D : q1d;
1330  MFEM_VERIFY(D1D <= MAX_D1D, "");
1331  MFEM_VERIFY(Q1D <= MAX_Q1D, "");
1332  auto B = Reshape(b.Read(), Q1D, D1D);
1333  auto G = Reshape(g.Read(), Q1D, D1D);
1334  auto Bt = Reshape(bt.Read(), D1D, Q1D);
1335  auto Gt = Reshape(gt.Read(), D1D, Q1D);
1336  auto D = Reshape(d_.Read(), Q1D*Q1D*Q1D, symmetric ? 6 : 9, NE);
1337  auto X = Reshape(x_.Read(), D1D, D1D, D1D, NE);
1338  auto Y = Reshape(y_.ReadWrite(), D1D, D1D, D1D, NE);
1339  MFEM_FORALL(e, NE,
1340  {
1341  const int D1D = T_D1D ? T_D1D : d1d;
1342  const int Q1D = T_Q1D ? T_Q1D : q1d;
1343  constexpr int max_D1D = T_D1D ? T_D1D : MAX_D1D;
1344  constexpr int max_Q1D = T_Q1D ? T_Q1D : MAX_Q1D;
1345  double grad[max_Q1D][max_Q1D][max_Q1D][3];
1346  for (int qz = 0; qz < Q1D; ++qz)
1347  {
1348  for (int qy = 0; qy < Q1D; ++qy)
1349  {
1350  for (int qx = 0; qx < Q1D; ++qx)
1351  {
1352  grad[qz][qy][qx][0] = 0.0;
1353  grad[qz][qy][qx][1] = 0.0;
1354  grad[qz][qy][qx][2] = 0.0;
1355  }
1356  }
1357  }
1358  for (int dz = 0; dz < D1D; ++dz)
1359  {
1360  double gradXY[max_Q1D][max_Q1D][3];
1361  for (int qy = 0; qy < Q1D; ++qy)
1362  {
1363  for (int qx = 0; qx < Q1D; ++qx)
1364  {
1365  gradXY[qy][qx][0] = 0.0;
1366  gradXY[qy][qx][1] = 0.0;
1367  gradXY[qy][qx][2] = 0.0;
1368  }
1369  }
1370  for (int dy = 0; dy < D1D; ++dy)
1371  {
1372  double gradX[max_Q1D][2];
1373  for (int qx = 0; qx < Q1D; ++qx)
1374  {
1375  gradX[qx][0] = 0.0;
1376  gradX[qx][1] = 0.0;
1377  }
1378  for (int dx = 0; dx < D1D; ++dx)
1379  {
1380  const double s = X(dx,dy,dz,e);
1381  for (int qx = 0; qx < Q1D; ++qx)
1382  {
1383  gradX[qx][0] += s * B(qx,dx);
1384  gradX[qx][1] += s * G(qx,dx);
1385  }
1386  }
1387  for (int qy = 0; qy < Q1D; ++qy)
1388  {
1389  const double wy = B(qy,dy);
1390  const double wDy = G(qy,dy);
1391  for (int qx = 0; qx < Q1D; ++qx)
1392  {
1393  const double wx = gradX[qx][0];
1394  const double wDx = gradX[qx][1];
1395  gradXY[qy][qx][0] += wDx * wy;
1396  gradXY[qy][qx][1] += wx * wDy;
1397  gradXY[qy][qx][2] += wx * wy;
1398  }
1399  }
1400  }
1401  for (int qz = 0; qz < Q1D; ++qz)
1402  {
1403  const double wz = B(qz,dz);
1404  const double wDz = G(qz,dz);
1405  for (int qy = 0; qy < Q1D; ++qy)
1406  {
1407  for (int qx = 0; qx < Q1D; ++qx)
1408  {
1409  grad[qz][qy][qx][0] += gradXY[qy][qx][0] * wz;
1410  grad[qz][qy][qx][1] += gradXY[qy][qx][1] * wz;
1411  grad[qz][qy][qx][2] += gradXY[qy][qx][2] * wDz;
1412  }
1413  }
1414  }
1415  }
1416  // Calculate Dxyz, xDyz, xyDz in plane
1417  for (int qz = 0; qz < Q1D; ++qz)
1418  {
1419  for (int qy = 0; qy < Q1D; ++qy)
1420  {
1421  for (int qx = 0; qx < Q1D; ++qx)
1422  {
1423  const int q = qx + (qy + qz * Q1D) * Q1D;
1424  const double O11 = D(q,0,e);
1425  const double O12 = D(q,1,e);
1426  const double O13 = D(q,2,e);
1427  const double O21 = symmetric ? O12 : D(q,3,e);
1428  const double O22 = symmetric ? D(q,3,e) : D(q,4,e);
1429  const double O23 = symmetric ? D(q,4,e) : D(q,5,e);
1430  const double O31 = symmetric ? O13 : D(q,6,e);
1431  const double O32 = symmetric ? O23 : D(q,7,e);
1432  const double O33 = symmetric ? D(q,5,e) : D(q,8,e);
1433  const double gradX = grad[qz][qy][qx][0];
1434  const double gradY = grad[qz][qy][qx][1];
1435  const double gradZ = grad[qz][qy][qx][2];
1436  grad[qz][qy][qx][0] = (O11*gradX)+(O12*gradY)+(O13*gradZ);
1437  grad[qz][qy][qx][1] = (O21*gradX)+(O22*gradY)+(O23*gradZ);
1438  grad[qz][qy][qx][2] = (O31*gradX)+(O32*gradY)+(O33*gradZ);
1439  }
1440  }
1441  }
1442  for (int qz = 0; qz < Q1D; ++qz)
1443  {
1444  double gradXY[max_D1D][max_D1D][3];
1445  for (int dy = 0; dy < D1D; ++dy)
1446  {
1447  for (int dx = 0; dx < D1D; ++dx)
1448  {
1449  gradXY[dy][dx][0] = 0;
1450  gradXY[dy][dx][1] = 0;
1451  gradXY[dy][dx][2] = 0;
1452  }
1453  }
1454  for (int qy = 0; qy < Q1D; ++qy)
1455  {
1456  double gradX[max_D1D][3];
1457  for (int dx = 0; dx < D1D; ++dx)
1458  {
1459  gradX[dx][0] = 0;
1460  gradX[dx][1] = 0;
1461  gradX[dx][2] = 0;
1462  }
1463  for (int qx = 0; qx < Q1D; ++qx)
1464  {
1465  const double gX = grad[qz][qy][qx][0];
1466  const double gY = grad[qz][qy][qx][1];
1467  const double gZ = grad[qz][qy][qx][2];
1468  for (int dx = 0; dx < D1D; ++dx)
1469  {
1470  const double wx = Bt(dx,qx);
1471  const double wDx = Gt(dx,qx);
1472  gradX[dx][0] += gX * wDx;
1473  gradX[dx][1] += gY * wx;
1474  gradX[dx][2] += gZ * wx;
1475  }
1476  }
1477  for (int dy = 0; dy < D1D; ++dy)
1478  {
1479  const double wy = Bt(dy,qy);
1480  const double wDy = Gt(dy,qy);
1481  for (int dx = 0; dx < D1D; ++dx)
1482  {
1483  gradXY[dy][dx][0] += gradX[dx][0] * wy;
1484  gradXY[dy][dx][1] += gradX[dx][1] * wDy;
1485  gradXY[dy][dx][2] += gradX[dx][2] * wy;
1486  }
1487  }
1488  }
1489  for (int dz = 0; dz < D1D; ++dz)
1490  {
1491  const double wz = Bt(dz,qz);
1492  const double wDz = Gt(dz,qz);
1493  for (int dy = 0; dy < D1D; ++dy)
1494  {
1495  for (int dx = 0; dx < D1D; ++dx)
1496  {
1497  Y(dx,dy,dz,e) +=
1498  ((gradXY[dy][dx][0] * wz) +
1499  (gradXY[dy][dx][1] * wz) +
1500  (gradXY[dy][dx][2] * wDz));
1501  }
1502  }
1503  }
1504  }
1505  });
1506 }
1507 
1508 // Half of B and G are stored in shared to get B, Bt, G and Gt.
1509 // Indices computation for SmemPADiffusionApply3D.
1510 static MFEM_HOST_DEVICE inline int qi(const int q, const int d, const int Q)
1511 {
1512  return (q<=d) ? q : Q-1-q;
1513 }
1514 
1515 static MFEM_HOST_DEVICE inline int dj(const int q, const int d, const int D)
1516 {
1517  return (q<=d) ? d : D-1-d;
1518 }
1519 
1520 static MFEM_HOST_DEVICE inline int qk(const int q, const int d, const int Q)
1521 {
1522  return (q<=d) ? Q-1-q : q;
1523 }
1524 
1525 static MFEM_HOST_DEVICE inline int dl(const int q, const int d, const int D)
1526 {
1527  return (q<=d) ? D-1-d : d;
1528 }
1529 
1530 static MFEM_HOST_DEVICE inline double sign(const int q, const int d)
1531 {
1532  return (q<=d) ? -1.0 : 1.0;
1533 }
1534 
1535 template<int T_D1D = 0, int T_Q1D = 0>
1536 static void SmemPADiffusionApply3D(const int NE,
1537  const bool symmetric,
1538  const Array<double> &b_,
1539  const Array<double> &g_,
1540  const Vector &d_,
1541  const Vector &x_,
1542  Vector &y_,
1543  const int d1d = 0,
1544  const int q1d = 0)
1545 {
1546  const int D1D = T_D1D ? T_D1D : d1d;
1547  const int Q1D = T_Q1D ? T_Q1D : q1d;
1548  constexpr int M1Q = T_Q1D ? T_Q1D : MAX_Q1D;
1549  constexpr int M1D = T_D1D ? T_D1D : MAX_D1D;
1550  MFEM_VERIFY(D1D <= M1D, "");
1551  MFEM_VERIFY(Q1D <= M1Q, "");
1552  auto b = Reshape(b_.Read(), Q1D, D1D);
1553  auto g = Reshape(g_.Read(), Q1D, D1D);
1554  auto d = Reshape(d_.Read(), Q1D, Q1D, Q1D, symmetric ? 6 : 9, NE);
1555  auto x = Reshape(x_.Read(), D1D, D1D, D1D, NE);
1556  auto y = Reshape(y_.ReadWrite(), D1D, D1D, D1D, NE);
1557  MFEM_FORALL_3D(e, NE, Q1D, Q1D, 1,
1558  {
1559  const int D1D = T_D1D ? T_D1D : d1d;
1560  const int Q1D = T_Q1D ? T_Q1D : q1d;
1561  constexpr int MQ1 = T_Q1D ? T_Q1D : MAX_Q1D;
1562  constexpr int MD1 = T_D1D ? T_D1D : MAX_D1D;
1563  constexpr int MDQ = (MQ1 > MD1) ? MQ1 : MD1;
1564  MFEM_SHARED double sBG[MQ1*MD1];
1565  double (*B)[MD1] = (double (*)[MD1]) sBG;
1566  double (*G)[MD1] = (double (*)[MD1]) sBG;
1567  double (*Bt)[MQ1] = (double (*)[MQ1]) sBG;
1568  double (*Gt)[MQ1] = (double (*)[MQ1]) sBG;
1569  MFEM_SHARED double sm0[3][MDQ*MDQ*MDQ];
1570  MFEM_SHARED double sm1[3][MDQ*MDQ*MDQ];
1571  double (*X)[MD1][MD1] = (double (*)[MD1][MD1]) (sm0+2);
1572  double (*DDQ0)[MD1][MQ1] = (double (*)[MD1][MQ1]) (sm0+0);
1573  double (*DDQ1)[MD1][MQ1] = (double (*)[MD1][MQ1]) (sm0+1);
1574  double (*DQQ0)[MQ1][MQ1] = (double (*)[MQ1][MQ1]) (sm1+0);
1575  double (*DQQ1)[MQ1][MQ1] = (double (*)[MQ1][MQ1]) (sm1+1);
1576  double (*DQQ2)[MQ1][MQ1] = (double (*)[MQ1][MQ1]) (sm1+2);
1577  double (*QQQ0)[MQ1][MQ1] = (double (*)[MQ1][MQ1]) (sm0+0);
1578  double (*QQQ1)[MQ1][MQ1] = (double (*)[MQ1][MQ1]) (sm0+1);
1579  double (*QQQ2)[MQ1][MQ1] = (double (*)[MQ1][MQ1]) (sm0+2);
1580  double (*QQD0)[MQ1][MD1] = (double (*)[MQ1][MD1]) (sm1+0);
1581  double (*QQD1)[MQ1][MD1] = (double (*)[MQ1][MD1]) (sm1+1);
1582  double (*QQD2)[MQ1][MD1] = (double (*)[MQ1][MD1]) (sm1+2);
1583  double (*QDD0)[MD1][MD1] = (double (*)[MD1][MD1]) (sm0+0);
1584  double (*QDD1)[MD1][MD1] = (double (*)[MD1][MD1]) (sm0+1);
1585  double (*QDD2)[MD1][MD1] = (double (*)[MD1][MD1]) (sm0+2);
1586  MFEM_FOREACH_THREAD(dy,y,D1D)
1587  {
1588  MFEM_FOREACH_THREAD(dx,x,D1D)
1589  {
1590  MFEM_UNROLL(MD1)
1591  for (int dz = 0; dz < D1D; ++dz)
1592  {
1593  X[dz][dy][dx] = x(dx,dy,dz,e);
1594  }
1595  }
1596  MFEM_FOREACH_THREAD(qx,x,Q1D)
1597  {
1598  const int i = qi(qx,dy,Q1D);
1599  const int j = dj(qx,dy,D1D);
1600  const int k = qk(qx,dy,Q1D);
1601  const int l = dl(qx,dy,D1D);
1602  B[i][j] = b(qx,dy);
1603  G[k][l] = g(qx,dy) * sign(qx,dy);
1604  }
1605  }
1606  MFEM_SYNC_THREAD;
1607  MFEM_FOREACH_THREAD(dy,y,D1D)
1608  {
1609  MFEM_FOREACH_THREAD(qx,x,Q1D)
1610  {
1611  double u[D1D], v[D1D];
1612  MFEM_UNROLL(MD1)
1613  for (int dz = 0; dz < D1D; dz++) { u[dz] = v[dz] = 0.0; }
1614  MFEM_UNROLL(MD1)
1615  for (int dx = 0; dx < D1D; ++dx)
1616  {
1617  const int i = qi(qx,dx,Q1D);
1618  const int j = dj(qx,dx,D1D);
1619  const int k = qk(qx,dx,Q1D);
1620  const int l = dl(qx,dx,D1D);
1621  const double s = sign(qx,dx);
1622  MFEM_UNROLL(MD1)
1623  for (int dz = 0; dz < D1D; ++dz)
1624  {
1625  const double coords = X[dz][dy][dx];
1626  u[dz] += coords * B[i][j];
1627  v[dz] += coords * G[k][l] * s;
1628  }
1629  }
1630  MFEM_UNROLL(MD1)
1631  for (int dz = 0; dz < D1D; ++dz)
1632  {
1633  DDQ0[dz][dy][qx] = u[dz];
1634  DDQ1[dz][dy][qx] = v[dz];
1635  }
1636  }
1637  }
1638  MFEM_SYNC_THREAD;
1639  MFEM_FOREACH_THREAD(qy,y,Q1D)
1640  {
1641  MFEM_FOREACH_THREAD(qx,x,Q1D)
1642  {
1643  double u[D1D], v[D1D], w[D1D];
1644  MFEM_UNROLL(MD1)
1645  for (int dz = 0; dz < D1D; dz++) { u[dz] = v[dz] = w[dz] = 0.0; }
1646  MFEM_UNROLL(MD1)
1647  for (int dy = 0; dy < D1D; ++dy)
1648  {
1649  const int i = qi(qy,dy,Q1D);
1650  const int j = dj(qy,dy,D1D);
1651  const int k = qk(qy,dy,Q1D);
1652  const int l = dl(qy,dy,D1D);
1653  const double s = sign(qy,dy);
1654  MFEM_UNROLL(MD1)
1655  for (int dz = 0; dz < D1D; dz++)
1656  {
1657  u[dz] += DDQ1[dz][dy][qx] * B[i][j];
1658  v[dz] += DDQ0[dz][dy][qx] * G[k][l] * s;
1659  w[dz] += DDQ0[dz][dy][qx] * B[i][j];
1660  }
1661  }
1662  MFEM_UNROLL(MD1)
1663  for (int dz = 0; dz < D1D; dz++)
1664  {
1665  DQQ0[dz][qy][qx] = u[dz];
1666  DQQ1[dz][qy][qx] = v[dz];
1667  DQQ2[dz][qy][qx] = w[dz];
1668  }
1669  }
1670  }
1671  MFEM_SYNC_THREAD;
1672  MFEM_FOREACH_THREAD(qy,y,Q1D)
1673  {
1674  MFEM_FOREACH_THREAD(qx,x,Q1D)
1675  {
1676  double u[Q1D], v[Q1D], w[Q1D];
1677  MFEM_UNROLL(MQ1)
1678  for (int qz = 0; qz < Q1D; qz++) { u[qz] = v[qz] = w[qz] = 0.0; }
1679  MFEM_UNROLL(MD1)
1680  for (int dz = 0; dz < D1D; ++dz)
1681  {
1682  MFEM_UNROLL(MQ1)
1683  for (int qz = 0; qz < Q1D; qz++)
1684  {
1685  const int i = qi(qz,dz,Q1D);
1686  const int j = dj(qz,dz,D1D);
1687  const int k = qk(qz,dz,Q1D);
1688  const int l = dl(qz,dz,D1D);
1689  const double s = sign(qz,dz);
1690  u[qz] += DQQ0[dz][qy][qx] * B[i][j];
1691  v[qz] += DQQ1[dz][qy][qx] * B[i][j];
1692  w[qz] += DQQ2[dz][qy][qx] * G[k][l] * s;
1693  }
1694  }
1695  MFEM_UNROLL(MQ1)
1696  for (int qz = 0; qz < Q1D; qz++)
1697  {
1698  const double O11 = d(qx,qy,qz,0,e);
1699  const double O12 = d(qx,qy,qz,1,e);
1700  const double O13 = d(qx,qy,qz,2,e);
1701  const double O21 = symmetric ? O12 : d(qx,qy,qz,3,e);
1702  const double O22 = symmetric ? d(qx,qy,qz,3,e) : d(qx,qy,qz,4,e);
1703  const double O23 = symmetric ? d(qx,qy,qz,4,e) : d(qx,qy,qz,5,e);
1704  const double O31 = symmetric ? O13 : d(qx,qy,qz,6,e);
1705  const double O32 = symmetric ? O23 : d(qx,qy,qz,7,e);
1706  const double O33 = symmetric ? d(qx,qy,qz,5,e) : d(qx,qy,qz,8,e);
1707  const double gX = u[qz];
1708  const double gY = v[qz];
1709  const double gZ = w[qz];
1710  QQQ0[qz][qy][qx] = (O11*gX) + (O12*gY) + (O13*gZ);
1711  QQQ1[qz][qy][qx] = (O21*gX) + (O22*gY) + (O23*gZ);
1712  QQQ2[qz][qy][qx] = (O31*gX) + (O32*gY) + (O33*gZ);
1713  }
1714  }
1715  }
1716  MFEM_SYNC_THREAD;
1717  MFEM_FOREACH_THREAD(d,y,D1D)
1718  {
1719  MFEM_FOREACH_THREAD(q,x,Q1D)
1720  {
1721  const int i = qi(q,d,Q1D);
1722  const int j = dj(q,d,D1D);
1723  const int k = qk(q,d,Q1D);
1724  const int l = dl(q,d,D1D);
1725  Bt[j][i] = b(q,d);
1726  Gt[l][k] = g(q,d) * sign(q,d);
1727  }
1728  }
1729  MFEM_SYNC_THREAD;
1730  MFEM_FOREACH_THREAD(qy,y,Q1D)
1731  {
1732  MFEM_FOREACH_THREAD(dx,x,D1D)
1733  {
1734  double u[Q1D], v[Q1D], w[Q1D];
1735  MFEM_UNROLL(MQ1)
1736  for (int qz = 0; qz < Q1D; ++qz) { u[qz] = v[qz] = w[qz] = 0.0; }
1737  MFEM_UNROLL(MQ1)
1738  for (int qx = 0; qx < Q1D; ++qx)
1739  {
1740  const int i = qi(qx,dx,Q1D);
1741  const int j = dj(qx,dx,D1D);
1742  const int k = qk(qx,dx,Q1D);
1743  const int l = dl(qx,dx,D1D);
1744  const double s = sign(qx,dx);
1745  MFEM_UNROLL(MQ1)
1746  for (int qz = 0; qz < Q1D; ++qz)
1747  {
1748  u[qz] += QQQ0[qz][qy][qx] * Gt[l][k] * s;
1749  v[qz] += QQQ1[qz][qy][qx] * Bt[j][i];
1750  w[qz] += QQQ2[qz][qy][qx] * Bt[j][i];
1751  }
1752  }
1753  MFEM_UNROLL(MQ1)
1754  for (int qz = 0; qz < Q1D; ++qz)
1755  {
1756  QQD0[qz][qy][dx] = u[qz];
1757  QQD1[qz][qy][dx] = v[qz];
1758  QQD2[qz][qy][dx] = w[qz];
1759  }
1760  }
1761  }
1762  MFEM_SYNC_THREAD;
1763  MFEM_FOREACH_THREAD(dy,y,D1D)
1764  {
1765  MFEM_FOREACH_THREAD(dx,x,D1D)
1766  {
1767  double u[Q1D], v[Q1D], w[Q1D];
1768  MFEM_UNROLL(MQ1)
1769  for (int qz = 0; qz < Q1D; ++qz) { u[qz] = v[qz] = w[qz] = 0.0; }
1770  MFEM_UNROLL(MQ1)
1771  for (int qy = 0; qy < Q1D; ++qy)
1772  {
1773  const int i = qi(qy,dy,Q1D);
1774  const int j = dj(qy,dy,D1D);
1775  const int k = qk(qy,dy,Q1D);
1776  const int l = dl(qy,dy,D1D);
1777  const double s = sign(qy,dy);
1778  MFEM_UNROLL(MQ1)
1779  for (int qz = 0; qz < Q1D; ++qz)
1780  {
1781  u[qz] += QQD0[qz][qy][dx] * Bt[j][i];
1782  v[qz] += QQD1[qz][qy][dx] * Gt[l][k] * s;
1783  w[qz] += QQD2[qz][qy][dx] * Bt[j][i];
1784  }
1785  }
1786  MFEM_UNROLL(MQ1)
1787  for (int qz = 0; qz < Q1D; ++qz)
1788  {
1789  QDD0[qz][dy][dx] = u[qz];
1790  QDD1[qz][dy][dx] = v[qz];
1791  QDD2[qz][dy][dx] = w[qz];
1792  }
1793  }
1794  }
1795  MFEM_SYNC_THREAD;
1796  MFEM_FOREACH_THREAD(dy,y,D1D)
1797  {
1798  MFEM_FOREACH_THREAD(dx,x,D1D)
1799  {
1800  double u[D1D], v[D1D], w[D1D];
1801  MFEM_UNROLL(MD1)
1802  for (int dz = 0; dz < D1D; ++dz) { u[dz] = v[dz] = w[dz] = 0.0; }
1803  MFEM_UNROLL(MQ1)
1804  for (int qz = 0; qz < Q1D; ++qz)
1805  {
1806  MFEM_UNROLL(MD1)
1807  for (int dz = 0; dz < D1D; ++dz)
1808  {
1809  const int i = qi(qz,dz,Q1D);
1810  const int j = dj(qz,dz,D1D);
1811  const int k = qk(qz,dz,Q1D);
1812  const int l = dl(qz,dz,D1D);
1813  const double s = sign(qz,dz);
1814  u[dz] += QDD0[qz][dy][dx] * Bt[j][i];
1815  v[dz] += QDD1[qz][dy][dx] * Bt[j][i];
1816  w[dz] += QDD2[qz][dy][dx] * Gt[l][k] * s;
1817  }
1818  }
1819  MFEM_UNROLL(MD1)
1820  for (int dz = 0; dz < D1D; ++dz)
1821  {
1822  y(dx,dy,dz,e) += (u[dz] + v[dz] + w[dz]);
1823  }
1824  }
1825  }
1826  });
1827 }
1828 
1829 static void PADiffusionApply(const int dim,
1830  const int D1D,
1831  const int Q1D,
1832  const int NE,
1833  const bool symm,
1834  const Array<double> &B,
1835  const Array<double> &G,
1836  const Array<double> &Bt,
1837  const Array<double> &Gt,
1838  const Vector &D,
1839  const Vector &X,
1840  Vector &Y)
1841 {
1842 #ifdef MFEM_USE_OCCA
1843  if (DeviceCanUseOcca())
1844  {
1845  if (dim == 2)
1846  {
1847  OccaPADiffusionApply2D(D1D,Q1D,NE,B,G,Bt,Gt,D,X,Y);
1848  return;
1849  }
1850  if (dim == 3)
1851  {
1852  OccaPADiffusionApply3D(D1D,Q1D,NE,B,G,Bt,Gt,D,X,Y);
1853  return;
1854  }
1855  MFEM_ABORT("OCCA PADiffusionApply unknown kernel!");
1856  }
1857 #endif // MFEM_USE_OCCA
1858  const int ID = (D1D << 4) | Q1D;
1859 
1860  if (dim == 2)
1861  {
1862  switch (ID)
1863  {
1864  case 0x22: return SmemPADiffusionApply2D<2,2,16>(NE,symm,B,G,D,X,Y);
1865  case 0x33: return SmemPADiffusionApply2D<3,3,16>(NE,symm,B,G,D,X,Y);
1866  case 0x44: return SmemPADiffusionApply2D<4,4,8>(NE,symm,B,G,D,X,Y);
1867  case 0x55: return SmemPADiffusionApply2D<5,5,8>(NE,symm,B,G,D,X,Y);
1868  case 0x66: return SmemPADiffusionApply2D<6,6,4>(NE,symm,B,G,D,X,Y);
1869  case 0x77: return SmemPADiffusionApply2D<7,7,4>(NE,symm,B,G,D,X,Y);
1870  case 0x88: return SmemPADiffusionApply2D<8,8,2>(NE,symm,B,G,D,X,Y);
1871  case 0x99: return SmemPADiffusionApply2D<9,9,2>(NE,symm,B,G,D,X,Y);
1872  default: return PADiffusionApply2D(NE,symm,B,G,Bt,Gt,D,X,Y,D1D,Q1D);
1873  }
1874  }
1875 
1876  if (dim == 3)
1877  {
1878  switch (ID)
1879  {
1880  case 0x23: return SmemPADiffusionApply3D<2,3>(NE,symm,B,G,D,X,Y);
1881  case 0x34: return SmemPADiffusionApply3D<3,4>(NE,symm,B,G,D,X,Y);
1882  case 0x45: return SmemPADiffusionApply3D<4,5>(NE,symm,B,G,D,X,Y);
1883  case 0x46: return SmemPADiffusionApply3D<4,6>(NE,symm,B,G,D,X,Y);
1884  case 0x56: return SmemPADiffusionApply3D<5,6>(NE,symm,B,G,D,X,Y);
1885  case 0x58: return SmemPADiffusionApply3D<5,8>(NE,symm,B,G,D,X,Y);
1886  case 0x67: return SmemPADiffusionApply3D<6,7>(NE,symm,B,G,D,X,Y);
1887  case 0x78: return SmemPADiffusionApply3D<7,8>(NE,symm,B,G,D,X,Y);
1888  case 0x89: return SmemPADiffusionApply3D<8,9>(NE,symm,B,G,D,X,Y);
1889  default: return PADiffusionApply3D(NE,symm,B,G,Bt,Gt,D,X,Y,D1D,Q1D);
1890  }
1891  }
1892  MFEM_ABORT("Unknown kernel.");
1893 }
1894 
1895 // PA Diffusion Apply kernel
1896 void DiffusionIntegrator::AddMultPA(const Vector &x, Vector &y) const
1897 {
1898  if (DeviceCanUseCeed())
1899  {
1900  ceedOp->AddMult(x, y);
1901  }
1902  else
1903  {
1904  PADiffusionApply(dim, dofs1D, quad1D, ne, symmetric,
1905  maps->B, maps->G, maps->Bt, maps->Gt,
1906  pa_data, x, y);
1907  }
1908 }
1909 
1910 void DiffusionIntegrator::AddMultTransposePA(const Vector &x, Vector &y) const
1911 {
1912  if (symmetric)
1913  {
1914  AddMultPA(x, y);
1915  }
1916  else
1917  {
1918  MFEM_ABORT("DiffusionIntegrator::AddMultTransposePA only implemented in "
1919  "the symmetric case.")
1920  }
1921 }
1922 
1923 } // namespace mfem
int GetNPoints() const
Returns the number of the points in the integration rule.
Definition: intrules.hpp:247
Abstract class for all finite elements.
Definition: fe.hpp:243
int Size() const
Return the logical size of the array.
Definition: array.hpp:134
int GetDim() const
Returns the reference space dimension for the finite element.
Definition: fe.hpp:317
Class for an integration rule - an Array of IntegrationPoint.
Definition: intrules.hpp:90
Memory< T > & GetMemory()
Return a reference to the Memory object used by the Array.
Definition: array.hpp:113
A coefficient that is constant across space and time.
Definition: coefficient.hpp:78
const GeometricFactors * GetGeometricFactors(const IntegrationRule &ir, const int flags, MemoryType d_mt=MemoryType::DEFAULT)
Return the mesh geometric factors corresponding to the given integration rule.
Definition: mesh.cpp:784
const IntegrationRule & GetElementIntRule(int idx) const
Get the IntegrationRule associated with mesh element idx.
Definition: fespace.hpp:950
void SetSize(int s)
Change the size of the DenseSymmetricMatrix to s x s.
Definition: symmat.cpp:39
void SetSize(int s)
Resize the vector to size s.
Definition: vector.hpp:513
const Geometry::Type geom
Definition: ex1.cpp:40
occa::device & OccaDev()
Return the default occa::device used by MFEM.
Definition: occa.cpp:27
Data type dense matrix using column-major storage.
Definition: densemat.hpp:23
virtual double * HostWrite()
Shortcut for mfem::Write(vec.GetMemory(), vec.Size(), false).
Definition: vector.hpp:438
int Size() const
Returns the size of the vector.
Definition: vector.hpp:190
int ndof
Number of degrees of freedom = number of basis functions. When mode is TENSOR, this is the 1D number...
Definition: fe.hpp:173
int GetNE() const
Returns number of elements.
Definition: mesh.hpp:846
const Array< double > & GetWeights() const
Return the quadrature weights in a contiguous array.
Definition: intrules.cpp:83
constexpr int DIM
std::map< occa_id_t, occa::kernel > occa_kernel_t
Definition: occa.hpp:79
Memory< double > & GetMemory()
Return a reference to the Memory object used by the Vector.
Definition: vector.hpp:225
occa::memory OccaMemoryReadWrite(Memory< T > &mem, size_t size)
Wrap a Memory object as occa::memory for read-write access with the mfem::Device MemoryClass. The returned occa::memory is associated with the default occa::device used by MFEM.
Definition: occa.hpp:59
DeviceTensor< sizeof...(Dims), T > Reshape(T *ptr, Dims...dims)
Wrap a pointer as a DeviceTensor with automatically deduced template parameters.
Definition: dtensor.hpp:136
int GetNE() const
Returns number of elements in the mesh.
Definition: fespace.hpp:567
IntegrationPoint & IntPoint(int i)
Returns a reference to the i-th integration point.
Definition: intrules.hpp:250
Quadrature function coefficient which requires that the quadrature rules used for this coefficient be...
const int MAX_Q1D
Definition: forall.hpp:28
Mesh * GetMesh() const
Returns the mesh.
Definition: fespace.hpp:410
bool DeviceCanUseOcca()
Function that determines if an OCCA kernel should be used, based on the current mfem::Device configur...
Definition: occa.hpp:69
virtual double * Write(bool on_dev=true)
Shortcut for mfem::Write(vec.GetMemory(), vec.Size(), on_dev).
Definition: vector.hpp:434
double b
Definition: lissajous.cpp:42
const occa::memory OccaMemoryRead(const Memory< T > &mem, size_t size)
Wrap a Memory object as occa::memory for read only access with the mfem::Device MemoryClass. The returned occa::memory is associated with the default occa::device used by MFEM.
Definition: occa.hpp:37
QuadratureSpace * GetSpace() const
Get the associated QuadratureSpace.
Definition: gridfunc.hpp:775
const T * Read(bool on_dev=true) const
Shortcut for mfem::Read(a.GetMemory(), a.Size(), on_dev).
Definition: array.hpp:300
int Dimension() const
Definition: mesh.hpp:911
void PADiffusionSetup2D< 3 >(const int Q1D, const int coeffDim, const int NE, const Array< double > &w, const Vector &j, const Vector &c, Vector &d)
void PADiffusionSetup2D< 2 >(const int Q1D, const int coeffDim, const int NE, const Array< double > &w, const Vector &j, const Vector &c, Vector &d)
int SpaceDimension() const
Definition: mesh.hpp:912
ElementTransformation * GetElementTransformation(int i) const
Returns ElementTransformation for the i-th element.
Definition: fespace.hpp:600
Class FiniteElementSpace - responsible for providing FEM view of the mesh, mainly managing the set of...
Definition: fespace.hpp:87
occa::memory OccaMemoryWrite(Memory< T > &mem, size_t size)
Wrap a Memory object as occa::memory for write only access with the mfem::Device MemoryClass. The returned occa::memory is associated with the default occa::device used by MFEM.
Definition: occa.hpp:48
MemoryType
Memory types supported by MFEM.
Definition: mem_manager.hpp:31
virtual const DofToQuad & GetDofToQuad(const IntegrationRule &ir, DofToQuad::Mode mode) const
Return a DofToQuad structure corresponding to the given IntegrationRule using the given DofToQuad::Mo...
Definition: fe.cpp:367
void PADiffusionSetup3D(const int Q1D, const int coeffDim, const int NE, const Array< double > &w, const Vector &j, const Vector &c, Vector &d)
virtual double * ReadWrite(bool on_dev=true)
Shortcut for mfem::ReadWrite(vec.GetMemory(), vec.Size(), on_dev).
Definition: vector.hpp:442
void GetElementTransformation(int i, IsoparametricTransformation *ElTr)
Definition: mesh.cpp:347
int dim
Definition: ex24.cpp:53
const int MAX_D1D
Definition: forall.hpp:27
virtual const FiniteElement * GetFE(int i) const
Returns pointer to the FiniteElement in the FiniteElementCollection associated with i&#39;th element in t...
Definition: fespace.cpp:2388
constexpr int SDIM
const double alpha
Definition: ex15.cpp:369
Vector data type.
Definition: vector.hpp:60
void MakeRef(Vector &base, int offset, int size)
Reset the Vector to be a reference to a sub-vector of base.
Definition: vector.hpp:577
std::pair< int, int > occa_id_t
Definition: occa.hpp:78
RefCoord s[3]
double u(const Vector &xvec)
Definition: lor_mms.hpp:24
void SetSize(int s)
Change the size of the DenseMatrix to s x s.
Definition: densemat.hpp:105
Class representing a function through its values (scalar or vector) at quadrature points...
Definition: gridfunc.hpp:734
virtual const double * Read(bool on_dev=true) const
Shortcut for mfem::Read(vec.GetMemory(), vec.Size(), on_dev).
Definition: vector.hpp:426