pderefmat_op.cpp
// Copyright (c) 2010-2025, Lawrence Livermore National Security, LLC. Produced
// at the Lawrence Livermore National Laboratory. All Rights reserved. See files
// LICENSE and NOTICE for details. LLNL-CODE-806117.
//
// This file is part of the MFEM library. For more information and source code
// availability visit https://mfem.org.
//
// MFEM is free software; you can redistribute it and/or modify it under the
// terms of the BSD-3 license. We welcome feedback and contributions, see file
// CONTRIBUTING.md for details.

#include "pderefmat_op.hpp"

#ifdef MFEM_USE_MPI

#include "fes_kernels.hpp"
/// \cond DO_NOT_DOCUMENT
namespace mfem
{
namespace internal
{
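// Kernel implementing ParDerefineMatrixOp::Mult: pack the locally owned fine
// dofs needed by other (coarse) ranks, post the non-blocking ghost exchange,
// apply the purely local (diagonal) blocks while the messages are in flight,
// and finally apply the off-diagonal blocks once the ghost data has arrived.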
template <Ordering::Type Order, bool Atomic>
static void ParDerefMultKernelImpl(const ParDerefineMatrixOp &op,
                                   const Vector &x, Vector &y)
{
   // pack sends
   if (op.xghost_send.Size())
   {
      auto src = x.Read();
      auto idcs = op.send_permutations.Read();
      auto dst = Device::GetGPUAwareMPI() ? op.xghost_send.Write()
                                          : op.xghost_send.HostWrite();
      auto vdims = op.fespace->GetVDim();
      auto sptr = op.send_segment_idcs.Read();
      auto lptr = op.send_segments.Read();
      auto old_ndofs = x.Size() / vdims;

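      // pack each outgoing dof (applying its sign) into the send buffer; the
      // buffer is split into one contiguous segment per destination rank, and
      // within a segment the vdim components of a dof are strided by the
      // segment width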
      forall(op.send_permutations.Size(), [=] MFEM_HOST_DEVICE(int i)
      {
         int seg = sptr[i];
         int width = lptr[seg + 1] - lptr[seg];
         auto tdst = dst + i + lptr[seg] * vdims;
         int sign = 1;
         int col = idcs[i];
         if (col < 0)
         {
            sign = -1;
            col = -1 - col;
         }
         for (int vdim = 0; vdim < vdims; ++vdim)
         {
            tdst[vdim * width] =
               sign
               * src[Order == Ordering::byNODES ? (col + vdim * old_ndofs)
                                                : (col * vdims + vdim)];
         }
      });
      // TODO: is this needed so we can send the packed data correctly?
      // unclear for GPU-aware MPI, definitely required otherwise
      MFEM_DEVICE_SYNC;
   }
   // initialize off-diagonal receive and send
   op.requests.clear();
   if (op.xghost_recv.Size())
   {
      auto vdims = op.fespace->GetVDim();
      auto rcv = Device::GetGPUAwareMPI() ? op.xghost_recv.Write()
                                          : op.xghost_recv.HostWrite();
      for (int i = 0; i < op.recv_ranks.Size(); ++i)
      {
         op.requests.emplace_back();
         MPI_Irecv(rcv + op.recv_segments[i] * vdims,
                   (op.recv_segments[i + 1] - op.recv_segments[i]) * vdims,
                   MPITypeMap<real_t>::mpi_type, op.recv_ranks[i],
                   DEREFINEMENT_MATRIX_CONSTRUCTION_DATA,
                   op.fespace->GetComm(), &op.requests.back());
      }
   }
   if (op.xghost_send.Size())
   {
      auto vdims = op.fespace->GetVDim();
      // this is a GPU memory pointer only if GPU-aware MPI is enabled
      auto dst = Device::GetGPUAwareMPI() ? op.xghost_send.Write()
                                          : op.xghost_send.HostWrite();
      for (int i = 0; i < op.send_ranks.Size(); ++i)
      {
         op.requests.emplace_back();
         MPI_Isend(dst + op.send_segments[i] * vdims,
                   (op.send_segments[i + 1] - op.send_segments[i]) * vdims,
                   MPITypeMap<real_t>::mpi_type, op.send_ranks[i],
                   DEREFINEMENT_MATRIX_CONSTRUCTION_DATA,
                   op.fespace->GetComm(), &op.requests.back());
      }
   }
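   // the ghost exchange now proceeds in the background while the purely local
   // (diagonal) part of the product is computed below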
   {
      // diagonal
      DerefineMatrixOpMultFunctor<Order, Atomic, true> func;
      func.xptr = x.Read();
      y.UseDevice();
      y = 0.;
      func.yptr = y.ReadWrite();
      func.bsptr = op.block_storage.Read();
      func.boptr = op.block_offsets.Read();
      func.brptr = op.block_row_idcs_offsets.Read();
      func.bcptr = op.block_col_idcs_offsets.Read();
      func.rptr = op.row_idcs.Read();
      func.cptr = op.col_idcs.Read();
      func.vdims = op.fespace->GetVDim();
      func.nblocks = op.block_offsets.Size();
      func.width = op.Width() / func.vdims;
      func.height = op.Height() / func.vdims;
      func.Run(op.max_rows);
   }
   // wait for comm to finish, if any
   if (op.requests.size())
   {
      MPI_Waitall(op.requests.size(), op.requests.data(), MPI_STATUSES_IGNORE);
      if (op.xghost_recv.Size())
      {
         // off-diagonal kernel
         DerefineMatrixOpMultFunctor<Order, Atomic, false> func;
         // directly read from host-pinned memory if not using GPU-aware MPI
         func.xptr = Device::GetGPUAwareMPI() ? op.xghost_recv.Read()
                                              : op.xghost_recv.HostRead();
         func.yptr = y.ReadWrite();
         func.bsptr = op.block_storage.Read();
         func.boptr = op.off_diag_block_offsets.Read();
         func.brptr = op.block_off_diag_row_idcs_offsets.Read();
         func.rsptr = op.recv_segment_idcs.Read();
         func.segptr = op.recv_segments.Read();
         func.coptr = op.block_off_diag_col_offsets.Read();
         func.bwptr = op.block_off_diag_widths.Read();
         func.rptr = op.row_off_diag_idcs.Read();
         func.vdims = op.fespace->GetVDim();
         func.nblocks = op.off_diag_block_offsets.Size();
         func.width = op.xghost_recv.Size() / func.vdims;
         func.height = op.Height() / func.vdims;
         func.Run(op.max_rows);
      }
   }
}
} // namespace internal

template <Ordering::Type Order, bool Atomic>
ParDerefineMatrixOp::MultKernelType ParDerefineMatrixOp::MultKernel::Kernel()
{
   return internal::ParDerefMultKernelImpl<Order, Atomic>;
}

ParDerefineMatrixOp::MultKernelType
ParDerefineMatrixOp::MultKernel::Fallback(Ordering::Type, bool)
{
   MFEM_ABORT("invalid MultKernel parameters");
}

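// Register the kernel specializations: byNODES/byVDIM ordering, each with and
// without atomic row updates. The static Kernels object in the constructor
// ensures this registration runs once.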
ParDerefineMatrixOp::Kernels::Kernels()
{
   MultKernel::Specialization<Ordering::byNODES, false>::Add();
   MultKernel::Specialization<Ordering::byVDIM, false>::Add();
   MultKernel::Specialization<Ordering::byNODES, true>::Add();
   MultKernel::Specialization<Ordering::byVDIM, true>::Add();
}

void ParDerefineMatrixOp::Mult(const Vector &x, Vector &y) const
{
   const bool is_dg = fespace->FEColl()->GetContType()
                      == FiniteElementCollection::DISCONTINUOUS;
   // DG needs atomic summation
   MultKernel::Run(fespace->GetOrdering(), is_dg, *this, x, y);
   // use this to prevent xghost* from being re-purposed for subsequent Mult
   // calls
   MFEM_DEVICE_SYNC;
}

ParDerefineMatrixOp::ParDerefineMatrixOp(ParFiniteElementSpace &fespace_,
                                         int old_ndofs,
                                         const Table *old_elem_dof,
                                         const Table *old_elem_fos)
   : Operator(fespace_.GetVSize(), old_ndofs * fespace_.GetVDim()),
     fespace(&fespace_)
{
   static Kernels kernels;
   constexpr int max_team_size = 256;

   const int NRanks = fespace->GetNRanks();

   const int nrk = HYPRE_AssumedPartitionCheck() ? 2 : NRanks;

   MFEM_VERIFY(fespace->Nonconforming(),
               "Not implemented for conforming meshes.");
   MFEM_VERIFY(fespace->old_dof_offsets[nrk],
               "Missing previous (finer) space.");

   const int MyRank = fespace->GetMyRank();
   ParNCMesh *old_pncmesh = fespace->GetParMesh()->pncmesh;
   const CoarseFineTransformations &dtrans =
      old_pncmesh->GetDerefinementTransforms();
   const Array<int> &old_ranks = old_pncmesh->GetDerefineOldRanks();

   const bool is_dg = fespace->FEColl()->GetContType()
                      == FiniteElementCollection::DISCONTINUOUS;
   DenseMatrix localRVO; // for variable-order only

   DenseTensor localR[Geometry::NumGeom];
   int diag_rows = 0;
   int off_diag_rows = 0;
   int diag_cols = 0;

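   // number of coarse (parent) dofs associated with embedding k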
   auto get_ldofs = [&](int k) -> int
   {
      const Embedding &emb = dtrans.embeddings[k];
      if (fespace->IsVariableOrder())
      {
         const FiniteElement *fe = fespace->GetFE(emb.parent);
         return fe->GetDof();
      }
      else
      {
         Geometry::Type geom =
            fespace->GetParMesh()->GetElementBaseGeometry(emb.parent);
         return fespace->FEColl()->FiniteElementForGeometry(geom)->GetDof();
      }
   };
   Array<int> dofs, old_dofs;
   max_rows = 1;
   // first pass:
   // - determine memory block lengths
   // - identify dofs in x we need to send/receive
   // we don't need to send the indices: the fine rank re-arranges and
   // sign-changes x before transmitting the ghost data

   // key: coarse rank to send to
   // value: old dofs to send (with sign)
   std::map<int, std::vector<int>> to_send;
   // key: fine rank
   // value: indices into dtrans.embeddings
   std::map<int, std::vector<int>> od_ks;
   // key: fine rank
   // value: recv segment length
   std::map<int, int> od_seg_lens;
   int send_len = 0;
   int recv_len = 0;
   // size of block_storage, if fespace->IsVariableOrder()
   // otherwise unused
   int total_size = 0;
   int num_diagonal_blocks = 0;
   int num_offdiagonal_blocks = 0;
   for (int k = 0; k < dtrans.embeddings.Size(); ++k)
   {
      const Embedding &emb = dtrans.embeddings[k];
      int fine_rank = old_ranks[k];
      int coarse_rank = (emb.parent < 0) ? (-1 - emb.parent)
                        : old_pncmesh->ElementRank(emb.parent);
      if (coarse_rank != MyRank && fine_rank == MyRank)
      {
         // this rank needs to send data in x to coarse_rank
         old_elem_dof->GetRow(k, old_dofs);
         auto &tmp = to_send[coarse_rank];
         send_len += old_dofs.Size();
         for (int i = 0; i < old_dofs.Size(); ++i)
         {
            tmp.emplace_back(old_dofs[i]);
         }
      }
      else if (coarse_rank == MyRank && fine_rank != MyRank)
      {
         // this rank needs to receive data in x from fine_rank
         MFEM_ASSERT(emb.parent >= 0, "");
         auto ldofs = get_ldofs(k);
         off_diag_rows += ldofs;
         recv_len += ldofs;
         od_ks[fine_rank].emplace_back(k);
         od_seg_lens[fine_rank] += ldofs;
         ++num_offdiagonal_blocks;
         if (fespace->IsVariableOrder())
         {
            total_size += ldofs * ldofs;
         }
      }
      else if (coarse_rank == MyRank && fine_rank == MyRank)
      {
         MFEM_ASSERT(emb.parent >= 0, "");
         // diagonal
         ++num_diagonal_blocks;
         auto ldofs = get_ldofs(k);
         diag_rows += ldofs;
         diag_cols += ldofs;
         if (fespace->IsVariableOrder())
         {
            total_size += ldofs * ldofs;
         }
      }
   }
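   // note: to_send, od_ks and od_seg_lens are keyed by rank, so the send and
   // receive segments built below are ordered by rank; within a segment, both
   // sides traverse dtrans.embeddings in the same order, which is what lets
   // the ranks agree on the packing order without exchanging indices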
   send_segments.SetSize(to_send.size() + 1);
   send_segments.HostWrite();
   send_ranks.SetSize(to_send.size());
   send_ranks.HostWrite();
   {
      int idx = 0;
      send_segments[0] = 0;
      for (auto &tmp : to_send)
      {
         send_ranks[idx] = tmp.first;
         send_segments[idx + 1] = send_segments[idx] + tmp.second.size();
         ++idx;
      }
   }
   recv_segment_idcs.SetSize(off_diag_rows);
   recv_segment_idcs.HostWrite();
   recv_segments.SetSize(od_ks.size() + 1);
   recv_segments.HostWrite();
   recv_ranks.SetSize(od_ks.size());
   recv_ranks.HostWrite();

   // set sizes
   row_idcs.SetSize(diag_rows);
   row_idcs.HostWrite();
   row_off_diag_idcs.SetSize(off_diag_rows);
   row_off_diag_idcs.HostWrite();
   col_idcs.SetSize(diag_cols);
   col_idcs.HostWrite();
   block_row_idcs_offsets.SetSize(num_diagonal_blocks + 1);
   block_row_idcs_offsets.HostWrite();
   block_col_idcs_offsets.SetSize(num_diagonal_blocks + 1);
   block_col_idcs_offsets.HostWrite();
   block_off_diag_row_idcs_offsets.SetSize(num_offdiagonal_blocks + 1);
   block_off_diag_row_idcs_offsets.HostWrite();
   block_off_diag_col_offsets.SetSize(num_offdiagonal_blocks);
   block_off_diag_col_offsets.HostWrite();
   block_off_diag_widths.SetSize(num_offdiagonal_blocks);
   block_off_diag_widths.HostWrite();
   pack_col_idcs.SetSize(send_len);
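   // ghost buffers: staged in pinned host memory when MPI is not GPU aware so
   // the packed data can be moved between device and host around the MPI
   // calls; with GPU-aware MPI the device buffers are handed to MPI directly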
   // memory manager doesn't appear to have a graceful fallback for
   // HOST_PINNED if not built with CUDA or HIP
#if defined(MFEM_USE_CUDA) || defined(MFEM_USE_HIP)
   xghost_send.SetSize(send_len * fespace->GetVDim(),
                       Device::GetGPUAwareMPI() ? MemoryType::DEFAULT
                                                : MemoryType::HOST_PINNED);
   xghost_recv.SetSize(recv_len * fespace->GetVDim(),
                       Device::GetGPUAwareMPI() ? MemoryType::DEFAULT
                                                : MemoryType::HOST_PINNED);
#else
   xghost_send.SetSize(send_len * fespace->GetVDim());
   xghost_recv.SetSize(recv_len * fespace->GetVDim());
#endif
   send_permutations.SetSize(send_len);
   send_segment_idcs.SetSize(send_len);
   block_offsets.SetSize(num_diagonal_blocks);
   block_offsets.HostWrite();
   off_diag_block_offsets.SetSize(num_offdiagonal_blocks);
   off_diag_block_offsets.HostWrite();
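   // block_storage holds the dense restriction blocks in one flat array:
   // - variable order: one ldof x ldof block per diagonal/off-diagonal
   //   embedding, stored consecutively in traversal order (filled during the
   //   second pass below)
   // - fixed order: a single copy of localR per element geometry, a block
   //   being addressed as geom_offsets[geom] + block_size * emb.matrix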
   int geom_offsets[Geometry::NumGeom];
   real_t *bs_ptr;

   if (fespace->IsVariableOrder())
   {
      block_storage.SetSize(total_size);
      bs_ptr = block_storage.HostWrite();
      // compute block data later
   }
   else
   {
      // compression scheme:
      // block_offsets is the start of each block, potentially repeated
      // only need to store localR for used shapes
      Mesh::GeometryList elem_geoms(*fespace->GetMesh());

      int size = 0;
      for (int i = 0; i < elem_geoms.Size(); ++i)
      {
         fespace->GetLocalDerefinementMatrices(elem_geoms[i],
                                               localR[elem_geoms[i]]);
         geom_offsets[elem_geoms[i]] = size;
         size += localR[elem_geoms[i]].TotalSize();
      }
      block_storage.SetSize(size);
      bs_ptr = block_storage.HostWrite();
      // copy blocks into block_storage
      for (int i = 0; i < elem_geoms.Size(); ++i)
      {
         std::copy(localR[elem_geoms[i]].Data(),
                   localR[elem_geoms[i]].Data()
                      + localR[elem_geoms[i]].TotalSize(),
                   bs_ptr);
         bs_ptr += localR[elem_geoms[i]].TotalSize();
      }
   }

   // second pass:
   // - initialize buffers

   {
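      // send_permutations: signed old dof index for every value to pack;
      // send_segment_idcs: destination-segment index of every packed value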
      auto ptr = send_permutations.HostWrite();
      auto ptr2 = send_segment_idcs.HostWrite();
      int i = 0;
      for (auto &v : to_send)
      {
         ptr = std::copy(v.second.begin(), v.second.end(), ptr);
         for (size_t idx = 0; idx < v.second.size(); ++idx)
         {
            *ptr2 = i;
            ++ptr2;
         }
         ++i;
      }
   }

   block_row_idcs_offsets[0] = 0;
   block_col_idcs_offsets[0] = 0;
   block_off_diag_row_idcs_offsets[0] = 0;
   Array<int> mark(fespace->GetNDofs());
   mark = 0;
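   // mark records which coarse dofs have already been assigned to a block row
   // so that, except for DG spaces, each shared dof is written by exactly one
   // block; skipped rows (and rows of lR that are not finite) get INT_MAX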
   {
      int idx = 0;
      recv_segments[0] = 0;
      for (auto &v : od_seg_lens)
      {
         recv_ranks[idx] = v.first;
         recv_segments[idx + 1] = recv_segments[idx] + v.second;
         ++idx;
      }
   }
   // key: index into dtrans.embeddings
   // value: off-diagonal block offset, od_ridx, seg id
   std::unordered_map<int, std::array<int, 3>> ks_map;
   {
      int od_ridx = 0;
      int seg_id = 0;
      for (auto &v1 : od_ks)
      {
         for (auto k : v1.second)
         {
            auto &tmp = ks_map[k];
            tmp[0] = ks_map.size() - 1;
            tmp[1] = od_ridx;
            tmp[2] = seg_id;
            od_ridx += get_ldofs(k);
         }
         ++seg_id;
      }
   }
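   // ks_map gives the second pass, for each received embedding, its
   // off-diagonal block index, its column offset in the receive buffer
   // (od_ridx) and its receive segment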
   int diag_idx = 0;
   int var_offset = 0;
   int ridx = 0;
   int cidx = 0;
   // can't break this up into separate diagonals/off-diagonals loops because
   // of mark
   for (int k = 0; k < dtrans.embeddings.Size(); ++k)
   {
      const Embedding &emb = dtrans.embeddings[k];
      if (emb.parent < 0)
      {
         continue;
      }
      int fine_rank = old_ranks[k];
      int coarse_rank = (emb.parent < 0) ? (-1 - emb.parent)
                        : old_pncmesh->ElementRank(emb.parent);
      if (coarse_rank == MyRank)
      {
         // either diagonal or off-diagonal
         Geometry::Type geom =
            fespace->GetMesh()->GetElementBaseGeometry(emb.parent);
         if (fespace->IsVariableOrder())
         {
            const FiniteElement *fe = fespace->GetFE(emb.parent);
            const DenseTensor &pmats = dtrans.point_matrices[geom];
            const int ldof = fe->GetDof();

            IsoparametricTransformation isotr;
            isotr.SetIdentityTransformation(geom);

            localRVO.SetSize(ldof, ldof);
            isotr.SetPointMat(pmats(emb.matrix));
            // Local restriction is size ldof x ldof assuming that the parent
            // and child are of same polynomial order.
            fe->GetLocalRestriction(isotr, localRVO);
            // copy block
            auto s = localRVO.Height() * localRVO.Width();
            std::copy(localRVO.Data(), localRVO.Data() + s, bs_ptr);
            bs_ptr += s;
         }
         DenseMatrix &lR =
            fespace->IsVariableOrder() ? localRVO : localR[geom](emb.matrix);
         max_rows = std::max(lR.Height(), max_rows);
         auto size = lR.Height() * lR.Width();
         fespace->elem_dof->GetRow(emb.parent, dofs);
         if (fine_rank == MyRank)
         {
            // diagonal
            old_elem_dof->GetRow(k, old_dofs);
            MFEM_VERIFY(old_dofs.Size() == dofs.Size(),
                        "Parent and child must have same #dofs.");
            block_row_idcs_offsets[diag_idx + 1] =
               block_row_idcs_offsets[diag_idx] + lR.Height();
            block_col_idcs_offsets[diag_idx + 1] =
               block_col_idcs_offsets[diag_idx] + lR.Width();

            if (fespace->IsVariableOrder())
            {
               block_offsets[diag_idx] = var_offset;
               var_offset += size;
            }
            else
            {
               block_offsets[diag_idx] = geom_offsets[geom] + size * emb.matrix;
            }
            for (int i = 0; i < lR.Height(); ++i, ++ridx)
            {
               if (!std::isfinite(lR(i, 0)))
               {
                  row_idcs[ridx] = INT_MAX;
                  continue;
               }
               int r = dofs[i];
               int m = (r >= 0) ? r : (-1 - r);
               if (is_dg || !mark[m])
               {
                  row_idcs[ridx] = r;
                  mark[m] = 1;
               }
               else
               {
                  row_idcs[ridx] = INT_MAX;
               }
            }
            for (int i = 0; i < lR.Width(); ++i, ++cidx)
            {
               col_idcs[cidx] = old_dofs[i];
            }
            ++diag_idx;
         }
         else
         {
            // off-diagonal
            auto &tmp = ks_map.at(k);
            auto od_idx = tmp[0];
            auto od_ridx = tmp[1];
            block_off_diag_row_idcs_offsets[od_idx + 1] =
               block_off_diag_row_idcs_offsets[od_idx] + lR.Height();
            block_off_diag_col_offsets[od_idx] = od_ridx;
            block_off_diag_widths[od_idx] = lR.Width();
            recv_segment_idcs[od_idx] = tmp[2];

            if (fespace->IsVariableOrder())
            {
               off_diag_block_offsets[od_idx] = var_offset;
               var_offset += size;
            }
            else
            {
               off_diag_block_offsets[od_idx] =
                  geom_offsets[geom] + size * emb.matrix;
            }
            for (int i = 0; i < lR.Height(); ++i, ++od_ridx)
            {
               if (!std::isfinite(lR(i, 0)))
               {
                  row_off_diag_idcs[od_ridx] = INT_MAX;
                  continue;
               }
               int r = dofs[i];
               int m = (r >= 0) ? r : (-1 - r);
               if (is_dg || !mark[m])
               {
                  row_off_diag_idcs[od_ridx] = r;
                  mark[m] = 1;
               }
               else
               {
                  row_off_diag_idcs[od_ridx] = INT_MAX;
               }
            }
            ++od_idx;
         }
      }
   }
   // with a device backend, cap max_rows at max_team_size; otherwise process
   // one row at a time (max_rows = 1)
   if (Device::Allows(Backend::DEVICE_MASK))
   {
      max_rows = std::min(max_rows, max_team_size);
   }
   else
   {
      max_rows = 1;
   }
   requests.reserve(recv_ranks.Size() + send_ranks.Size());
}
} // namespace mfem
/// \endcond DO_NOT_DOCUMENT

#endif