html/gpu__blas_8cpp_source.html

// Copyright (c) 2010-2025, Lawrence Livermore National Security, LLC. Produced

// at the Lawrence Livermore National Laboratory. All Rights reserved. See files

// LICENSE and NOTICE for details. LLNL-CODE-806117.

//

// This file is part of the MFEM library. For more information and source code

// availability visit https://mfem.org.

//

// MFEM is free software; you can redistribute it and/or modify it under the

// terms of the BSD-3 license. We welcome feedback and contributions, see file

// CONTRIBUTING.md for details.


#include "gpu_blas.hpp"

#include "../../general/forall.hpp"


// Vendor-prefix helpers for GPU BLAS symbols. MFEM_cu_or_hip(stub) pastes a
// lower-case "cu" (CUDA) or "hip" (HIP) in front of stub; it is used below for
// function and type names such as blasCreate and blasStatus_t.
// MFEM_CU_or_HIP(stub) pastes the upper-case "CU" or "HIP"; it is used below
// for constants such as BLAS_STATUS_SUCCESS. Neither macro is defined when
// both MFEM_USE_CUDA and MFEM_USE_HIP are undefined.
#if defined(MFEM_USE_CUDA)

#define MFEM_cu_or_hip(stub) cu##stub

#define MFEM_CU_or_HIP(stub) CU##stub

#elif defined(MFEM_USE_HIP)

#define MFEM_cu_or_hip(stub) hip##stub

#define MFEM_CU_or_HIP(stub) HIP##stub

#endif


// Three-way token pasting. MFEM_CONCAT_ performs the actual ## joining;
// MFEM_CONCAT forwards to it so that arguments which are themselves macros are
// expanded before they are pasted together.
#define MFEM_CONCAT_(a, b, c) a ## b ## c
#define MFEM_CONCAT(a, b, c) MFEM_CONCAT_(a, b, c)


// MFEM_GPUBLAS_PREFIX(stub) builds the precision-specific GPU BLAS routine
// name by joining <cu|hip>blas, the letter S (MFEM_USE_SINGLE) or D
// (MFEM_USE_DOUBLE), and stub. For example, in a CUDA build with
// MFEM_USE_DOUBLE, MFEM_GPUBLAS_PREFIX(gemmStridedBatched) expands to
// cublasDgemmStridedBatched. The macro is left undefined if neither precision
// macro is defined.
#ifdef MFEM_USE_SINGLE

#define MFEM_GPUBLAS_PREFIX(stub) MFEM_CONCAT(MFEM_cu_or_hip(blas), S, stub)

#elif defined(MFEM_USE_DOUBLE)

#define MFEM_GPUBLAS_PREFIX(stub) MFEM_CONCAT(MFEM_cu_or_hip(blas), D, stub)

#endif


// Status value that every GPU BLAS call in this file is compared against:
// expands to CUBLAS_STATUS_SUCCESS or HIPBLAS_STATUS_SUCCESS.
#define MFEM_BLAS_SUCCESS MFEM_CU_or_HIP(BLAS_STATUS_SUCCESS)


namespace mfem

{


// Return a reference to the single GPUBlas object. The object is a
// function-local static, so it is constructed the first time this function is
// called and the same object is returned on every later call.
GPUBlas &GPUBlas::Instance()
{
   static GPUBlas singleton;
   return singleton;
}


// Return the BLAS handle stored in the shared GPUBlas object. Going through
// Instance() means the object (and hence its handle) is created on first use.
GPUBlas::HandleType GPUBlas::Handle()
{
   GPUBlas &blas = Instance();
   return blas.handle;
}


#ifndef MFEM_USE_CUDA_OR_HIP


// Build without MFEM_USE_CUDA_OR_HIP: there is no GPU BLAS library to talk to,
// so construction, destruction and the atomics switches are all no-ops.

GPUBlas::GPUBlas()
{
}

GPUBlas::~GPUBlas()
{
}

void GPUBlas::EnableAtomics()
{
}

void GPUBlas::DisableAtomics()
{
}


#else


// Vendor-neutral name for the status type returned by the GPU BLAS calls
// below: cublasStatus_t in CUDA builds, hipblasStatus_t in HIP builds.
using blasStatus_t = MFEM_cu_or_hip(blasStatus_t);


// Create the GPU BLAS handle and store it in the `handle` member. Failure to
// create the handle is reported through MFEM_VERIFY.
GPUBlas::GPUBlas()
{
   const blasStatus_t create_status = MFEM_cu_or_hip(blasCreate)(&handle);
   MFEM_VERIFY(create_status == MFEM_BLAS_SUCCESS,
               "Cannot initialize GPU BLAS.");
}


// Release the GPU BLAS handle created by the constructor. The status returned
// by the destroy call is not inspected.
GPUBlas::~GPUBlas()
{
   static_cast<void>(MFEM_cu_or_hip(blasDestroy)(handle));
}


void GPUBlas::EnableAtomics()

{

   const blasStatus_t status = MFEM_cu_or_hip(blasSetAtomicsMode)(

                                  Handle(), MFEM_CU_or_HIP(BLAS_ATOMICS_ALLOWED));

   MFEM_VERIFY(status == MFEM_BLAS_SUCCESS, "GPU BLAS error.");

}


void GPUBlas::DisableAtomics()

{

   const blasStatus_t status = MFEM_cu_or_hip(blasSetAtomicsMode)(

                                  Handle(), MFEM_CU_or_HIP(BLAS_ATOMICS_NOT_ALLOWED));

   MFEM_VERIFY(status == MFEM_BLAS_SUCCESS, "GPU BLAS error.");

}


// Batched product y := alpha * op(A) * x + beta * y, computed with a single
// strided-batched GEMM call.
//
//   A      batch of A.SizeK() matrices, each A.SizeI() x A.SizeJ().
//   x      input, interpreted as shape (n, k, n_mat).
//   y      output, interpreted as shape (m, k, n_mat). It is opened write-only
//          when beta == 0 and read-write otherwise.
//   alpha, beta  scalar factors of the product and of the previous y.
//   op     Op::T multiplies by the transpose of each matrix; otherwise the
//          matrices are used as stored.
//
// Here (m, n) are the dimensions of op(A) and k = x.Size() / (n * n_mat) is
// the number of right-hand-side columns per matrix. An empty batch is a no-op.
void GPUBlasBatchedLinAlg::AddMult(const DenseTensor &A, const Vector &x,
                                   Vector &y, real_t alpha, real_t beta,
                                   Op op) const
{
   const int n_mat = A.SizeK();
   // Nothing to multiply; also avoids dividing by zero when computing k.
   if (n_mat == 0) { return; }

   const bool tr = (op == Op::T);

   // Dimensions of op(A): m rows, n columns.
   const int m = tr ? A.SizeJ() : A.SizeI();
   const int n = tr ? A.SizeI() : A.SizeJ();
   const int k = x.Size() / n / n_mat;

   // The leading dimension describes how A is *stored*, not op(A): every
   // matrix in the batch has A.SizeI() rows regardless of the requested
   // operation. Using m here is wrong for transposed, non-square matrices.
   const int lda = A.SizeI();

   auto d_A = A.Read();
   auto d_x = x.Read(); // Shape: (n, k, n_mat)
   auto d_y = beta == 0.0 ? y.Write() : y.ReadWrite(); // Shape (m, k, n_mat)

   const auto op_A = tr ? MFEM_CU_or_HIP(BLAS_OP_T) : MFEM_CU_or_HIP(BLAS_OP_N);
   const auto op_B = MFEM_CU_or_HIP(BLAS_OP_N);

   // Strides between consecutive batch entries: m*n for A (same count of
   // entries with or without transposition), n*k for x and m*k for y.
   const blasStatus_t status = MFEM_GPUBLAS_PREFIX(gemmStridedBatched)(
                                  GPUBlas::Handle(), op_A, op_B, m, k, n,
                                  &alpha, d_A, lda, m*n, d_x, n, n*k, &beta,
                                  d_y, m, m*k, n_mat);
   MFEM_VERIFY(status == MFEM_BLAS_SUCCESS, "GPU BLAS error.");
}


// Replace every matrix in A by its LU factors using the batched getrf routine.
//
//   A  batch of A.SizeK() matrices; n = A.SizeI() is used as both the matrix
//      size and the leading dimension. Overwritten with the factors.
//   P  resized to n * A.SizeK() and filled with the pivot indices.
//
// Only the status returned by the library call is verified. The per-matrix
// info values written by getrfBatched are stored in a local array that is not
// inspected, so a failed factorization of an individual matrix is not
// reported here.
void GPUBlasBatchedLinAlg::LUFactor(DenseTensor &A, Array<int> &P) const
{
   const int n = A.SizeI();
   const int n_mat = A.SizeK();

   P.SetSize(n*n_mat);

   Array<int> info_array(n_mat);

   // The batched routine takes an array of per-matrix pointers; fill it with
   // the address of each n x n matrix inside A's contiguous storage.
   real_t *A_base = A.ReadWrite();
   Array<real_t*> A_ptrs(n_mat);
   real_t **d_A_ptrs = A_ptrs.Write();
   mfem::forall(n_mat, [=] MFEM_HOST_DEVICE (int i)
   {
      d_A_ptrs[i] = A_base + i*n*n;
   });

   const blasStatus_t status = MFEM_GPUBLAS_PREFIX(getrfBatched)(
                                  GPUBlas::Handle(), n, d_A_ptrs, n, P.Write(),
                                  info_array.Write(), n_mat);
   MFEM_VERIFY(status == MFEM_BLAS_SUCCESS,
               "GPU BLAS error in batched LU factorization (getrfBatched).");
}


void GPUBlasBatchedLinAlg::LUSolve(

   const DenseTensor &LU, const Array<int> &P, Vector &x) const

{

   const int n = LU.SizeI();

   const int n_mat = LU.SizeK();

   const int n_rhs = x.Size() / n / n_mat;


   Array<real_t*> A_ptrs(n_mat);

   real_t **d_A_ptrs = A_ptrs.Write();

   Array<real_t*> B_ptrs(n_mat);

   real_t **d_B_ptrs = B_ptrs.Write();


   {

      real_t *A_base = const_cast<real_t*>(LU.Read());

      real_t *B_base = x.ReadWrite();

      mfem::forall(n_mat, [=] MFEM_HOST_DEVICE (int i)

      {

         d_A_ptrs[i] = A_base + i*n*n;

         d_B_ptrs[i] = B_base + i*n*n_rhs;

      });

   }


   int info = 0;

   const blasStatus_t status = MFEM_GPUBLAS_PREFIX(getrsBatched)(

                                  GPUBlas::Handle(), MFEM_CU_or_HIP(BLAS_OP_N),

                                  n, n_rhs, d_A_ptrs, n, P.Read(), d_B_ptrs, n,

                                  &info, n_mat);

   MFEM_VERIFY(status == MFEM_BLAS_SUCCESS, "");

}


// Replace every matrix in A by its inverse.
//
// A is first copied into a temporary tensor LU, which is factored with the
// batched getrf routine; the batched getri routine then writes the inverses
// computed from those factors back into A.
//
//   A  batch of A.SizeK() matrices; n = A.SizeI() is used as both the matrix
//      size and the leading dimension. Overwritten with the inverses.
//
// Only the statuses returned by the two library calls are verified. The
// per-matrix info values are written to a local array that is not inspected,
// so a singular matrix in the batch is not reported here.
void GPUBlasBatchedLinAlg::Invert(DenseTensor &A) const
{
   const int n = A.SizeI();
   const int n_mat = A.SizeK();

   // Working copy of A that will hold the LU factors.
   DenseTensor LU(A.SizeI(), A.SizeJ(), A.SizeK());
   // NOTE(review): Write() is called before the copy, presumably so that
   // CopyFrom targets LU's device memory -- confirm against Memory::CopyFrom.
   LU.Write();
   LU.GetMemory().CopyFrom(A.GetMemory(), A.TotalSize());

   // Per-matrix pointer arrays required by the batched routines.
   Array<real_t*> LU_ptrs(n_mat);
   Array<real_t*> A_ptrs(n_mat);
   real_t **d_A_ptrs = A_ptrs.Write();
   real_t **d_LU_ptrs = LU_ptrs.Write();
   {
      real_t *A_base = A.ReadWrite();
      real_t *LU_base = LU.Write();
      mfem::forall(n_mat, [=] MFEM_HOST_DEVICE (int i)
      {
         d_A_ptrs[i] = A_base + i*n*n;
         d_LU_ptrs[i] = LU_base + i*n*n;
      });
   }

   Array<int> P(n*n_mat);
   Array<int> info_array(n_mat);
   blasStatus_t status;

   // Step 1: factor the copy in place.
   status = MFEM_GPUBLAS_PREFIX(getrfBatched)(
               GPUBlas::Handle(), n, d_LU_ptrs, n, P.Write(),
               info_array.Write(), n_mat);
   MFEM_VERIFY(status == MFEM_BLAS_SUCCESS,
               "GPU BLAS error in batched LU factorization (getrfBatched).");

   // Step 2: compute the inverses from the factors, writing them into A.
   status = MFEM_GPUBLAS_PREFIX(getriBatched)(
               GPUBlas::Handle(), n, d_LU_ptrs, n, P.ReadWrite(), d_A_ptrs, n,
               info_array.Write(), n_mat);
   MFEM_VERIFY(status == MFEM_BLAS_SUCCESS,
               "GPU BLAS error in batched matrix inversion (getriBatched).");
}


#endif


} // namespace mfem

mfem::Array
Definition array.hpp:48

mfem::Array::ReadWrite
T * ReadWrite(bool on_dev=true)
Shortcut for mfem::ReadWrite(a.GetMemory(), a.Size(), on_dev).
Definition array.hpp:397

mfem::Array::SetSize
void SetSize(int nsize)
Change the logical size of the array, keep existing entries.
Definition array.hpp:840

mfem::Array::Write
T * Write(bool on_dev=true)
Shortcut for mfem::Write(a.GetMemory(), a.Size(), on_dev).
Definition array.hpp:389

mfem::Array::Read
const T * Read(bool on_dev=true) const
Shortcut for mfem::Read(a.GetMemory(), a.Size(), on_dev).
Definition array.hpp:381

mfem::BatchedLinAlg::Op
Op
Operation type (transposed or not transposed)
Definition batched.hpp:54

mfem::DenseTensor
Rank 3 tensor (array of matrices)
Definition densemat.hpp:1111

mfem::DenseTensor::GetMemory
Memory< real_t > & GetMemory()
Definition densemat.hpp:1229

mfem::DenseTensor::SizeJ
int SizeJ() const
Definition densemat.hpp:1129

mfem::DenseTensor::TotalSize
int TotalSize() const
Definition densemat.hpp:1132

mfem::DenseTensor::Read
const real_t * Read(bool on_dev=true) const
Shortcut for mfem::Read( GetMemory(), TotalSize(), on_dev).
Definition densemat.hpp:1242

mfem::DenseTensor::Write
real_t * Write(bool on_dev=true)
Shortcut for mfem::Write(GetMemory(), TotalSize(), on_dev).
Definition densemat.hpp:1248

mfem::DenseTensor::ReadWrite
real_t * ReadWrite(bool on_dev=true)
Shortcut for mfem::ReadWrite(GetMemory(), TotalSize(), on_dev).
Definition densemat.hpp:1254

mfem::DenseTensor::SizeI
int SizeI() const
Definition densemat.hpp:1128

mfem::DenseTensor::SizeK
int SizeK() const
Definition densemat.hpp:1130

mfem::GPUBlasBatchedLinAlg::AddMult
void AddMult(const DenseTensor &A, const Vector &x, Vector &y, real_t alpha=1.0, real_t beta=1.0, Op op=Op::N) const override
See BatchedLinAlg::AddMult.
Definition gpu_blas.cpp:84

mfem::GPUBlasBatchedLinAlg::LUSolve
void LUSolve(const DenseTensor &LU, const Array< int > &P, Vector &x) const override
See BatchedLinAlg::LUSolve.
Definition gpu_blas.cpp:132

mfem::GPUBlasBatchedLinAlg::LUFactor
void LUFactor(DenseTensor &A, Array< int > &P) const override
See BatchedLinAlg::LUFactor.
Definition gpu_blas.cpp:109

mfem::GPUBlasBatchedLinAlg::Invert
void Invert(DenseTensor &A) const override
See BatchedLinAlg::Invert.
Definition gpu_blas.cpp:162

mfem::GPUBlas::EnableAtomics
static void EnableAtomics()
Enable atomic operations.
Definition gpu_blas.cpp:52

mfem::GPUBlas::DisableAtomics
static void DisableAtomics()
Disable atomic operations.
Definition gpu_blas.cpp:53

mfem::GPUBlas::Handle
static HandleType Handle()
Return the handle, creating it if needed.
Definition gpu_blas.cpp:43

mfem::Memory::CopyFrom
void CopyFrom(const Memory &src, int size)
Copy size entries from src to *this.
Definition mem_manager.hpp:1282

mfem::Vector
Vector data type.
Definition vector.hpp:82

mfem::Vector::Read
virtual const real_t * Read(bool on_dev=true) const
Shortcut for mfem::Read(vec.GetMemory(), vec.Size(), on_dev).
Definition vector.hpp:520

mfem::Vector::ReadWrite
virtual real_t * ReadWrite(bool on_dev=true)
Shortcut for mfem::ReadWrite(vec.GetMemory(), vec.Size(), on_dev).
Definition vector.hpp:536

mfem::Vector::Size
int Size() const
Returns the size of the vector.
Definition vector.hpp:234

mfem::Vector::Write
virtual real_t * Write(bool on_dev=true)
Shortcut for mfem::Write(vec.GetMemory(), vec.Size(), on_dev).
Definition vector.hpp:528

alpha
const real_t alpha
Definition ex15.cpp:369

forall.hpp

gpu_blas.hpp

mfem
Definition CodeDocumentation.dox:1

mfem::blasStatus_t
MFEM_cu_or_hip(blasStatus_t) blasStatus_t
Definition gpu_blas.cpp:57

mfem::real_t
float real_t
Definition config.hpp:46

mfem::forall
void forall(int N, lambda &&body)
Definition forall.hpp:839