template <Ordering::Type Order, bool Atomic>
static void ParDerefMultKernelImpl(const ParDerefineMatrixOp &op,
                                   const Vector &x, Vector &y)
{
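   // Stages: pack the DOF values other ranks need into the send buffer, post
   // the non-blocking receives and sends, apply the purely local (diagonal)
   // blocks, then wait for the exchange and apply the off-diagonal blocks
   // that read the received ghost values.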
   if (op.xghost_send.Size())
   {
      auto idcs = op.send_permutations.Read();
      auto dst = Device::GetGPUAwareMPI() ? op.xghost_send.Write()
                                          : op.xghost_send.HostWrite();
      auto vdims = op.fespace->GetVDim();
      auto sptr = op.send_segment_idcs.Read();
      auto lptr = op.send_segments.Read();
      auto old_ndofs = x.Size() / vdims;
      auto xptr = x.Read();
      forall(op.send_permutations.Size(), [=] MFEM_HOST_DEVICE(int i)
      {
         const int seg = sptr[i];
         const int col = idcs[i];
         const int width = lptr[seg + 1] - lptr[seg];
         auto tdst = dst + i + lptr[seg] * vdims;
         for (int vdim = 0; vdim < vdims; ++vdim)
         {
            tdst[vdim * width] = xptr[(Order == Ordering::byNODES)
                                      ? (col + vdim * old_ndofs)
                                      : (col * vdims + vdim)];
         }
      });
   }
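   // Post non-blocking receives for the ghost values this rank needs from the
   // ranks that own the fine elements being derefined.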
   if (op.xghost_recv.Size())
   {
      auto vdims = op.fespace->GetVDim();
      auto rcv = Device::GetGPUAwareMPI() ? op.xghost_recv.Write()
                                          : op.xghost_recv.HostWrite();
      for (int i = 0; i < op.recv_ranks.Size(); ++i)
      {
         op.requests.emplace_back();
         MPI_Irecv(rcv + op.recv_segments[i] * vdims,
                   (op.recv_segments[i + 1] - op.recv_segments[i]) * vdims,
                   MPITypeMap<real_t>::mpi_type, op.recv_ranks[i],
                   DEREFINEMENT_MATRIX_CONSTRUCTION_DATA,
                   op.fespace->GetComm(), &op.requests.back());
      }
   }
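   // Post the matching non-blocking sends of the packed ghost values.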
   if (op.xghost_send.Size())
   {
      auto vdims = op.fespace->GetVDim();
      auto dst = Device::GetGPUAwareMPI() ? op.xghost_send.Write()
                                          : op.xghost_send.HostWrite();
      for (int i = 0; i < op.send_ranks.Size(); ++i)
      {
         op.requests.emplace_back();
         MPI_Isend(dst + op.send_segments[i] * vdims,
                   (op.send_segments[i + 1] - op.send_segments[i]) * vdims,
                   MPITypeMap<real_t>::mpi_type, op.send_ranks[i],
                   DEREFINEMENT_MATRIX_CONSTRUCTION_DATA,
                   op.fespace->GetComm(), &op.requests.back());
      }
   }
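   // Apply the on-rank (diagonal) blocks while the ghost exchange completes in
   // the background.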
   DerefineMatrixOpMultFunctor<Order, Atomic, true> func;
   func.xptr = x.Read();
   func.yptr = y.ReadWrite();
   func.bsptr = op.block_storage.Read();
   func.boptr = op.block_offsets.Read();
   func.brptr = op.block_row_idcs_offsets.Read();
   func.bcptr = op.block_col_idcs_offsets.Read();
   func.rptr = op.row_idcs.Read();
   func.cptr = op.col_idcs.Read();
   func.vdims = op.fespace->GetVDim();
   func.nblocks = op.block_offsets.Size();
   func.width = op.Width() / func.vdims;
   func.height = op.Height() / func.vdims;
   func.Run(op.max_rows);
   if (op.requests.size())
   {
      MPI_Waitall(op.requests.size(), op.requests.data(), MPI_STATUSES_IGNORE);
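      // Apply the off-diagonal blocks, whose columns come from the ghost
      // values just received.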
      if (op.xghost_recv.Size())
      {
         DerefineMatrixOpMultFunctor<Order, Atomic, false> func;
         func.xptr = Device::GetGPUAwareMPI() ? op.xghost_recv.Read()
                                              : op.xghost_recv.HostRead();
         func.yptr = y.ReadWrite();
         func.bsptr = op.block_storage.Read();
         func.boptr = op.off_diag_block_offsets.Read();
         func.brptr = op.block_off_diag_row_idcs_offsets.Read();
         func.rsptr = op.recv_segment_idcs.Read();
         func.segptr = op.recv_segments.Read();
         func.coptr = op.block_off_diag_col_offsets.Read();
         func.bwptr = op.block_off_diag_widths.Read();
         func.rptr = op.row_off_diag_idcs.Read();
         func.vdims = op.fespace->GetVDim();
         func.nblocks = op.off_diag_block_offsets.Size();
         func.width = op.xghost_recv.Size() / func.vdims;
         func.height = op.Height() / func.vdims;
         func.Run(op.max_rows);
      }
   }
}
template <Ordering::Type Order, bool Atomic>
ParDerefineMatrixOp::MultKernelType ParDerefineMatrixOp::MultKernel::Kernel()
{
   return internal::ParDerefMultKernelImpl<Order, Atomic>;
}
ParDerefineMatrixOp::MultKernelType
ParDerefineMatrixOp::MultKernel::Fallback(Ordering::Type, bool)
{
   MFEM_ABORT("invalid MultKernel parameters");
}
ParDerefineMatrixOp::Kernels::Kernels()
{
   // Register all four MultKernel specializations.
   MultKernel::Specialization<Ordering::byNODES, false>::Add();
   MultKernel::Specialization<Ordering::byVDIM, false>::Add();
   MultKernel::Specialization<Ordering::byNODES, true>::Add();
   MultKernel::Specialization<Ordering::byVDIM, true>::Add();
}
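// Mult selects a registered kernel at run time from the space's DOF ordering
// and from whether the space is discontinuous (DG).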
void ParDerefineMatrixOp::Mult(const Vector &x, Vector &y) const
{
   const bool is_dg = fespace->FEColl()->GetContType()
                      == FiniteElementCollection::DISCONTINUOUS;
   MultKernel::Run(fespace->GetOrdering(), is_dg, *this, x, y);
}
ParDerefineMatrixOp::ParDerefineMatrixOp(ParFiniteElementSpace &fespace_,
                                         const Table *old_elem_dof,
                                         const Table *old_elem_fos)
{
   constexpr int max_team_size = 256;
   const int NRanks = fespace->GetNRanks();
   const int nrk = HYPRE_AssumedPartitionCheck() ? 2 : NRanks;
   MFEM_VERIFY(fespace->Nonconforming(),
               "Not implemented for conforming meshes.");
   MFEM_VERIFY(fespace->old_dof_offsets[nrk],
               "Missing previous (finer) space.");
   const int MyRank = fespace->GetMyRank();
   ParNCMesh *old_pncmesh = fespace->GetParMesh()->pncmesh;
   const CoarseFineTransformations &dtrans =
      old_pncmesh->GetDerefinementTransforms();
   const Array<int> &old_ranks = old_pncmesh->GetDerefineOldRanks();
   const bool is_dg = fespace->FEColl()->GetContType()
                      == FiniteElementCollection::DISCONTINUOUS;
   DenseMatrix localRVO;
   DenseTensor localR[Geometry::NumGeom];
   int off_diag_rows = 0;
   // Number of DOFs on the coarse parent element of embedding k.
   auto get_ldofs = [&](int k) -> int
   {
      const Embedding &emb = dtrans.embeddings[k];
      if (fespace->IsVariableOrder())
      {
         const FiniteElement *fe = fespace->GetFE(emb.parent);
         return fe->GetDof();
      }
      Geometry::Type geom =
         fespace->GetParMesh()->GetElementBaseGeometry(emb.parent);
      return fespace->FEColl()->FiniteElementForGeometry(geom)->GetDof();
   };
   Array<int> dofs, old_dofs;
   int send_len = 0, recv_len = 0;
   int diag_rows = 0, diag_cols = 0;
   int total_size = 0;
   // to_send: coarse-owning rank -> fine DOFs whose values must be sent there.
   std::map<int, std::vector<int>> to_send;
   // od_ks: fine-owning rank -> embedding indices received from that rank.
   std::map<int, std::vector<int>> od_ks;
   // od_seg_lens: fine-owning rank -> length of its ghost receive segment.
   std::map<int, int> od_seg_lens;
   int num_diagonal_blocks = 0;
   int num_offdiagonal_blocks = 0;
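   // First pass over the derefinement embeddings: classify each fine->coarse
   // embedding as send (coarse parent owned elsewhere), off-diagonal (fine
   // child owned elsewhere), or diagonal (both local), and count the storage
   // each case needs.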
   for (int k = 0; k < dtrans.embeddings.Size(); ++k)
   {
      const Embedding &emb = dtrans.embeddings[k];
      int fine_rank = old_ranks[k];
      int coarse_rank = (emb.parent < 0) ? (-1 - emb.parent)
                        : old_pncmesh->ElementRank(emb.parent);
      if (coarse_rank != MyRank && fine_rank == MyRank)
      {
         // Our fine element, foreign coarse parent: send its old DOF values.
         old_elem_dof->GetRow(k, old_dofs);
         auto &tmp = to_send[coarse_rank];
         send_len += old_dofs.Size();
         for (int i = 0; i < old_dofs.Size(); ++i)
         {
            tmp.emplace_back(old_dofs[i]);
         }
      }
      else if (coarse_rank == MyRank && fine_rank != MyRank)
      {
         // Our coarse parent, foreign fine child: an off-diagonal block fed
         // by received ghost values.
         MFEM_ASSERT(emb.parent >= 0, "");
         auto ldofs = get_ldofs(k);
         off_diag_rows += ldofs;
         recv_len += ldofs;
         od_ks[fine_rank].emplace_back(k);
         od_seg_lens[fine_rank] += ldofs;
         ++num_offdiagonal_blocks;
         if (fespace->IsVariableOrder())
         {
            total_size += ldofs * ldofs;
         }
      }
      else if (coarse_rank == MyRank && fine_rank == MyRank)
      {
         // Fully local embedding: a diagonal block.
         MFEM_ASSERT(emb.parent >= 0, "");
         ++num_diagonal_blocks;
         auto ldofs = get_ldofs(k);
         diag_rows += ldofs;
         diag_cols += ldofs;
         if (fespace->IsVariableOrder())
         {
            total_size += ldofs * ldofs;
         }
      }
   }
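   // Size the index arrays and communication buffers from the counts above.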
   send_segments.SetSize(to_send.size() + 1);
   send_segments.HostWrite();
   send_ranks.SetSize(to_send.size());
   send_ranks.HostWrite();
   int idx = 0;
   send_segments[0] = 0;
   for (auto &tmp : to_send)
   {
      send_ranks[idx] = tmp.first;
      send_segments[idx + 1] = send_segments[idx] + tmp.second.size();
      ++idx;
   }
   recv_segment_idcs.SetSize(off_diag_rows);
   recv_segment_idcs.HostWrite();
   recv_segments.SetSize(od_ks.size() + 1);
   recv_segments.HostWrite();
   recv_ranks.SetSize(od_ks.size());
   recv_ranks.HostWrite();
   row_idcs.SetSize(diag_rows);
   row_idcs.HostWrite();
   row_off_diag_idcs.SetSize(off_diag_rows);
   row_off_diag_idcs.HostWrite();
   col_idcs.SetSize(diag_cols);
   col_idcs.HostWrite();
   block_row_idcs_offsets.SetSize(num_diagonal_blocks + 1);
   block_row_idcs_offsets.HostWrite();
   block_col_idcs_offsets.SetSize(num_diagonal_blocks + 1);
   block_col_idcs_offsets.HostWrite();
   block_off_diag_row_idcs_offsets.SetSize(num_offdiagonal_blocks + 1);
   block_off_diag_row_idcs_offsets.HostWrite();
   block_off_diag_col_offsets.SetSize(num_offdiagonal_blocks);
   block_off_diag_col_offsets.HostWrite();
   block_off_diag_widths.SetSize(num_offdiagonal_blocks);
   block_off_diag_widths.HostWrite();
   pack_col_idcs.SetSize(send_len);
#if defined(MFEM_USE_CUDA) || defined(MFEM_USE_HIP)
   // Ghost buffers live in device memory with GPU-aware MPI, otherwise in
   // pinned host memory.
   xghost_send.SetSize(send_len * fespace->GetVDim(),
                       Device::GetGPUAwareMPI() ? MemoryType::DEFAULT
                                                : MemoryType::HOST_PINNED);
   xghost_recv.SetSize(recv_len * fespace->GetVDim(),
                       Device::GetGPUAwareMPI() ? MemoryType::DEFAULT
                                                : MemoryType::HOST_PINNED);
#else
   xghost_send.SetSize(send_len * fespace->GetVDim());
   xghost_recv.SetSize(recv_len * fespace->GetVDim());
#endif
   send_permutations.SetSize(send_len);
   send_segment_idcs.SetSize(send_len);
   block_offsets.SetSize(num_diagonal_blocks);
   block_offsets.HostWrite();
   off_diag_block_offsets.SetSize(num_offdiagonal_blocks);
   off_diag_block_offsets.HostWrite();
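   // Stage the local derefinement (restriction) matrices: variable-order
   // spaces get one block per parent element (filled in the second pass
   // below), otherwise one shared block per geometry type.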
   int geom_offsets[Geometry::NumGeom];
   real_t *bs_ptr = nullptr;
   if (fespace->IsVariableOrder())
   {
      block_storage.SetSize(total_size);
      bs_ptr = block_storage.HostWrite();
   }
   else
   {
      int size = 0;
      Mesh::GeometryList elem_geoms(*fespace->GetMesh());
      for (int i = 0; i < elem_geoms.Size(); ++i)
      {
         fespace->GetLocalDerefinementMatrices(elem_geoms[i],
                                               localR[elem_geoms[i]]);
         geom_offsets[elem_geoms[i]] = size;
         size += localR[elem_geoms[i]].TotalSize();
      }
      block_storage.SetSize(size);
      bs_ptr = block_storage.HostWrite();
      for (int i = 0; i < elem_geoms.Size(); ++i)
      {
         std::copy(localR[elem_geoms[i]].Data(),
                   localR[elem_geoms[i]].Data()
                   + localR[elem_geoms[i]].TotalSize(),
                   bs_ptr);
         bs_ptr += localR[elem_geoms[i]].TotalSize();
      }
   }
   // Record, for each entry packed into the send buffer, which send segment
   // (destination rank) it belongs to.
   auto ptr = send_permutations.HostWrite();
   auto ptr2 = send_segment_idcs.HostWrite();
   int seg = 0;
   for (auto &v : to_send)
   {
      ptr = std::copy(v.second.begin(), v.second.end(), ptr);
      for (size_t idx = 0; idx < v.second.size(); ++idx)
      {
         *ptr2++ = seg;
      }
      ++seg;
   }
   block_row_idcs_offsets[0] = 0;
   block_col_idcs_offsets[0] = 0;
   block_off_diag_row_idcs_offsets[0] = 0;
   Array<int> mark(fespace->GetNDofs());
   mark = 0;
   idx = 0;
   recv_segments[0] = 0;
   for (auto &v : od_seg_lens)
   {
      recv_ranks[idx] = v.first;
      recv_segments[idx + 1] = recv_segments[idx] + v.second;
      ++idx;
   }
   // ks_map[k] = {off-diag block index, starting off-diag row, recv segment}.
   std::unordered_map<int, std::array<int, 3>> ks_map;
   int od_ridx = 0, seg_idx = 0;
   for (auto &v1 : od_ks)
   {
      for (auto k : v1.second)
      {
         auto &tmp = ks_map[k];
         tmp[0] = ks_map.size() - 1;
         tmp[1] = od_ridx;
         tmp[2] = seg_idx;
         od_ridx += get_ldofs(k);
      }
      ++seg_idx;
   }
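   // Second pass over the embeddings: record block offsets into block_storage
   // and fill the row/column index arrays for the diagonal and off-diagonal
   // blocks (for variable-order spaces the local restriction matrices are
   // computed and stored here as well).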
   int diag_idx = 0, ridx = 0, cidx = 0, var_offset = 0;
   for (int k = 0; k < dtrans.embeddings.Size(); ++k)
   {
      const Embedding &emb = dtrans.embeddings[k];
      int fine_rank = old_ranks[k];
      int coarse_rank = (emb.parent < 0) ? (-1 - emb.parent)
                        : old_pncmesh->ElementRank(emb.parent);
      if (coarse_rank == MyRank)
      {
         Geometry::Type geom =
            fespace->GetMesh()->GetElementBaseGeometry(emb.parent);
         if (fespace->IsVariableOrder())
         {
            // Variable order: build this parent's restriction and append it.
            const FiniteElement *fe = fespace->GetFE(emb.parent);
            const DenseTensor &pmats = dtrans.point_matrices[geom];
            const int ldof = fe->GetDof();
            IsoparametricTransformation isotr;
            isotr.SetIdentityTransformation(geom);
            localRVO.SetSize(ldof, ldof);
            isotr.SetPointMat(pmats(emb.matrix));
            fe->GetLocalRestriction(isotr, localRVO);
            auto s = localRVO.Height() * localRVO.Width();
            std::copy(localRVO.Data(), localRVO.Data() + s, bs_ptr);
            bs_ptr += s;
         }
         const DenseMatrix &lR =
            fespace->IsVariableOrder() ? localRVO : localR[geom](emb.matrix);
         max_rows = std::max(lR.Height(), max_rows);
         auto size = lR.Height() * lR.Width();
         fespace->elem_dof->GetRow(emb.parent, dofs);
         if (fine_rank == MyRank)
         {
            // Diagonal block: the fine child is also local.
            old_elem_dof->GetRow(k, old_dofs);
            MFEM_VERIFY(old_dofs.Size() == dofs.Size(),
                        "Parent and child must have same #dofs.");
            block_row_idcs_offsets[diag_idx + 1] =
               block_row_idcs_offsets[diag_idx] + lR.Height();
            block_col_idcs_offsets[diag_idx + 1] =
               block_col_idcs_offsets[diag_idx] + lR.Width();
            if (fespace->IsVariableOrder())
            {
               block_offsets[diag_idx] = var_offset;
               var_offset += size;
            }
            else
            {
               block_offsets[diag_idx] = geom_offsets[geom] + size * emb.matrix;
            }
            for (int i = 0; i < lR.Height(); ++i, ++ridx)
            {
               if (!std::isfinite(lR(i, 0)))
               {
                  row_idcs[ridx] = INT_MAX;
                  continue;
               }
               int r = dofs[i];
               int m = (r >= 0) ? r : (-1 - r);
               if (is_dg || !mark[m])
               {
                  row_idcs[ridx] = r;
                  mark[m] = 1;
               }
               else
               {
                  row_idcs[ridx] = INT_MAX;
               }
            }
            for (int i = 0; i < lR.Width(); ++i, ++cidx)
            {
               col_idcs[cidx] = old_dofs[i];
            }
            ++diag_idx;
         }
         else
         {
            // Off-diagonal block: the fine child is owned by another rank.
            auto &tmp = ks_map.at(k);
            auto od_idx = tmp[0];
            auto od_ridx = tmp[1];
            block_off_diag_row_idcs_offsets[od_idx + 1] =
               block_off_diag_row_idcs_offsets[od_idx] + lR.Height();
            block_off_diag_col_offsets[od_idx] = od_ridx;
            block_off_diag_widths[od_idx] = lR.Width();
            recv_segment_idcs[od_idx] = tmp[2];
            if (fespace->IsVariableOrder())
            {
               off_diag_block_offsets[od_idx] = var_offset;
               var_offset += size;
            }
            else
            {
               off_diag_block_offsets[od_idx] =
                  geom_offsets[geom] + size * emb.matrix;
            }
            for (int i = 0; i < lR.Height(); ++i, ++od_ridx)
            {
               if (!std::isfinite(lR(i, 0)))
               {
                  row_off_diag_idcs[od_ridx] = INT_MAX;
                  continue;
               }
               int r = dofs[i];
               int m = (r >= 0) ? r : (-1 - r);
               if (is_dg || !mark[m])
               {
                  row_off_diag_idcs[od_ridx] = r;
                  mark[m] = 1;
               }
               else
               {
                  row_off_diag_idcs[od_ridx] = INT_MAX;
               }
            }
         }
      }
   }
   if (Device::Allows(Backend::DEVICE_MASK))
   {
      // On device backends, cap the per-block row count at the team size.
      max_rows = std::min(max_rows, max_team_size);
   }
   requests.reserve(recv_ranks.Size() + send_ranks.Size());
}