4.6/amgxsolver_8cpp_source.html

 // Copyright (c) 2010-2023, Lawrence Livermore National Security, LLC. Produced
 // at the Lawrence Livermore National Laboratory. All Rights reserved. See files
 // LICENSE and NOTICE for details. LLNL-CODE-806117.
 //
 // This file is part of the MFEM library. For more information and source code
 // availability visit https://mfem.org.
 //
 // MFEM is free software; you can redistribute it and/or modify it under the
 // terms of the BSD-3 license. We welcome feedback and contributions, see file
 // CONTRIBUTING.md for details.

 // Implementation of the MFEM wrapper for Nvidia's multigrid library, AmgX
 //
 // This work is partially based on:
 //
 //    Pi-Yueh Chuang and Lorena A. Barba (2017).
 //    AmgXWrapper: An interface between PETSc and the NVIDIA AmgX library.
 //    J. Open Source Software, 2(16):280, doi:10.21105/joss.00280
 //
 // See https://github.com/barbagroup/AmgXWrapper.

 #include "../config/config.hpp"
 #include "amgxsolver.hpp"
 #ifdef MFEM_USE_AMGX

 namespace mfem
 {

 int AmgXSolver::count = 0;

 AMGX_resources_handle AmgXSolver::rsrc = nullptr;

 AmgXSolver::AmgXSolver()
    : ConvergenceCheck(false) {};

 AmgXSolver::AmgXSolver(const AMGX_MODE amgxMode_, const bool verbose)
 {
    amgxMode = amgxMode_;

    if (amgxMode == AmgXSolver::SOLVER) { ConvergenceCheck = true;}
    else { ConvergenceCheck = false;}

    DefaultParameters(amgxMode, verbose);

    InitSerial();
 }

 #ifdef MFEM_USE_MPI

 AmgXSolver::AmgXSolver(const MPI_Comm &comm,
                        const AMGX_MODE amgxMode_, const bool verbose)
 {
    std::string config;
    amgxMode = amgxMode_;

    if (amgxMode == AmgXSolver::SOLVER) { ConvergenceCheck = true;}
    else { ConvergenceCheck = false;}

    DefaultParameters(amgxMode, verbose);

    InitExclusiveGPU(comm);
 }

 AmgXSolver::AmgXSolver(const MPI_Comm &comm, const int nDevs,
                        const AMGX_MODE amgxMode_, const bool verbose)
 {
    std::string config;
    amgxMode = amgxMode_;

    if (amgxMode == AmgXSolver::SOLVER) { ConvergenceCheck = true;}
    else { ConvergenceCheck = false;}

    DefaultParameters(amgxMode_, verbose);

    InitMPITeams(comm, nDevs);
 }

 #endif

 AmgXSolver::~AmgXSolver()
 {
    if (isInitialized) { Finalize(); }
 }

 void AmgXSolver::InitSerial()
 {
    count++;

    mpi_gpu_mode = "serial";

    AMGX_SAFE_CALL(AMGX_initialize());

    AMGX_SAFE_CALL(AMGX_initialize_plugins());

    AMGX_SAFE_CALL(AMGX_install_signal_handler());

    MFEM_VERIFY(configSrc != CONFIG_SRC::UNDEFINED,
                "AmgX configuration is not defined \n");

    if (configSrc == CONFIG_SRC::EXTERNAL)
    {
       AMGX_SAFE_CALL(AMGX_config_create_from_file(&cfg, amgx_config.c_str()));
    }
    else
    {
       AMGX_SAFE_CALL(AMGX_config_create(&cfg, amgx_config.c_str()));
    }

    AMGX_SAFE_CALL(AMGX_resources_create_simple(&rsrc, cfg));
    AMGX_SAFE_CALL(AMGX_solver_create(&solver, rsrc, precision_mode, cfg));
    AMGX_SAFE_CALL(AMGX_matrix_create(&AmgXA, rsrc, precision_mode));
    AMGX_SAFE_CALL(AMGX_vector_create(&AmgXP, rsrc, precision_mode));
    AMGX_SAFE_CALL(AMGX_vector_create(&AmgXRHS, rsrc, precision_mode));

    isInitialized = true;
 }

 #ifdef MFEM_USE_MPI

 void AmgXSolver::InitExclusiveGPU(const MPI_Comm &comm)
 {
    // If this instance has already been initialized, skip
    if (isInitialized)
    {
       mfem_error("This AmgXSolver instance has been initialized on this process.");
    }

    // Note that every MPI rank may talk to a GPU
    mpi_gpu_mode = "mpi-gpu-exclusive";
    gpuProc = 0;

    // Increment number of AmgX instances
    count++;

    MPI_Comm_dup(comm, &gpuWorld);
    MPI_Comm_size(gpuWorld, &gpuWorldSize);
    MPI_Comm_rank(gpuWorld, &myGpuWorldRank);

    // Each rank will only see 1 device call it device 0
    nDevs = 1, devID = 0;

    InitAmgX();

    isInitialized = true;
 }

 // Initialize for MPI ranks > GPUs, all devices are visible to all of the MPI
 // ranks
 void AmgXSolver::InitMPITeams(const MPI_Comm &comm,
                               const int nDevs)
 {
    // If this instance has already been initialized, skip
    if (isInitialized)
    {
       mfem_error("This AmgXSolver instance has been initialized on this process.");
    }

    mpi_gpu_mode = "mpi-teams";

    // Increment number of AmgX instances
    count++;

    // Get the name of this node
    int     len;
    char    name[MPI_MAX_PROCESSOR_NAME];
    MPI_Get_processor_name(name, &len);
    nodeName = name;
    int globalcommrank;

    MPI_Comm_rank(comm, &globalcommrank);

    // Initialize communicators and corresponding information
    InitMPIcomms(comm, nDevs);

    // Only processes in gpuWorld are required to initialize AmgX
    if (gpuProc == 0)
    {
       InitAmgX();
    }

    isInitialized = true;
 }

 #endif

 void AmgXSolver::ReadParameters(const std::string config,
                                 const CONFIG_SRC source)
 {
    amgx_config = config;
    configSrc = source;
 }

 void AmgXSolver::SetConvergenceCheck(bool setConvergenceCheck_)
 {
    ConvergenceCheck = setConvergenceCheck_;
 }

 void AmgXSolver::DefaultParameters(const AMGX_MODE amgxMode_,
                                    const bool verbose)
 {
    amgxMode = amgxMode_;

    configSrc = INTERNAL;

    if (amgxMode == AMGX_MODE::PRECONDITIONER)
    {
       amgx_config = "{\n"
                     " \"config_version\": 2, \n"
                     " \"solver\": { \n"
                     "   \"solver\": \"AMG\", \n"
                     "   \"scope\": \"main\", \n"
                     "   \"smoother\": \"JACOBI_L1\", \n"
                     "   \"presweeps\": 1, \n"
                     "   \"interpolator\": \"D2\", \n"
                     "   \"max_row_sum\" : 0.9, \n"
                     "   \"strength_threshold\" : 0.25, \n"
                     "   \"postsweeps\": 1, \n"
                     "   \"max_iters\": 1, \n"
                     "   \"cycle\": \"V\"";
       if (verbose)
       {
          amgx_config = amgx_config + ",\n"
                        "   \"obtain_timings\": 1, \n"
                        "   \"print_grid_stats\": 1, \n"
                        "   \"monitor_residual\": 1, \n"
                        "   \"print_solve_stats\": 1 \n";
       }
       else
       {
          amgx_config = amgx_config + "\n";
       }
       amgx_config = amgx_config + " }\n" + "}\n";
       // use a zero initial guess in Mult()
       iterative_mode = false;
    }
    else if (amgxMode == AMGX_MODE::SOLVER)
    {
       amgx_config = "{ \n"
                     " \"config_version\": 2, \n"
                     " \"solver\": { \n"
                     "   \"preconditioner\": { \n"
                     "     \"solver\": \"AMG\", \n"
                     "     \"smoother\": { \n"
                     "     \"scope\": \"jacobi\", \n"
                     "     \"solver\": \"JACOBI_L1\" \n"
                     "       }, \n"
                     "     \"presweeps\": 1, \n"
                     "     \"interpolator\": \"D2\", \n"
                     "     \"max_row_sum\" : 0.9, \n"
                     "     \"strength_threshold\" : 0.25, \n"
                     "     \"max_iters\": 1, \n"
                     "     \"scope\": \"amg\", \n"
                     "     \"max_levels\": 100, \n"
                     "     \"cycle\": \"V\", \n"
                     "     \"postsweeps\": 1 \n"
                     "    }, \n"
                     "  \"solver\": \"PCG\", \n"
                     "  \"max_iters\": 150, \n"
                     "  \"convergence\": \"RELATIVE_INI_CORE\", \n"
                     "  \"scope\": \"main\", \n"
                     "  \"tolerance\": 1e-12, \n"
                     "  \"monitor_residual\": 1, \n"
                     "  \"norm\": \"L2\" ";
       if (verbose)
       {
          amgx_config = amgx_config + ", \n"
                        "        \"obtain_timings\": 1, \n"
                        "        \"print_grid_stats\": 1, \n"
                        "        \"print_solve_stats\": 1 \n";
       }
       else
       {
          amgx_config = amgx_config + "\n";
       }
       amgx_config = amgx_config + "   } \n" + "} \n";
       // use the user-specified vector as an initial guess in Mult()
       iterative_mode = true;
    }
    else
    {
       mfem_error("AmgX mode not supported \n");
    }
 }

 // Sets up AmgX library for MPI builds
 #ifdef MFEM_USE_MPI
 void AmgXSolver::InitAmgX()
 {
    // Set up once
    if (count == 1)
    {
       AMGX_SAFE_CALL(AMGX_initialize());

       AMGX_SAFE_CALL(AMGX_initialize_plugins());

       AMGX_SAFE_CALL(AMGX_install_signal_handler());

       AMGX_SAFE_CALL(AMGX_register_print_callback(
                         [](const char *msg, int length)->void
       {
          int irank; MPI_Comm_rank(MPI_COMM_WORLD, &irank);
          if (irank == 0) { mfem::out<<msg;} }));
    }

    MFEM_VERIFY(configSrc != CONFIG_SRC::UNDEFINED,
                "AmgX configuration is not defined \n");

    if (configSrc == CONFIG_SRC::EXTERNAL)
    {
       AMGX_SAFE_CALL(AMGX_config_create_from_file(&cfg, amgx_config.c_str()));
    }
    else
    {
       AMGX_SAFE_CALL(AMGX_config_create(&cfg, amgx_config.c_str()));
    }

    // Let AmgX handle returned error codes internally
    AMGX_SAFE_CALL(AMGX_config_add_parameters(&cfg, "exception_handling=1"));

    // Create an AmgX resource object, only the first instance needs to create
    // the resource object.
    if (count == 1) { AMGX_SAFE_CALL(AMGX_resources_create(&rsrc, cfg, &gpuWorld, 1, &devID)); }

    // Create AmgX vector object for unknowns and RHS
    AMGX_SAFE_CALL(AMGX_vector_create(&AmgXP, rsrc, precision_mode));
    AMGX_SAFE_CALL(AMGX_vector_create(&AmgXRHS, rsrc, precision_mode));

    // Create AmgX matrix object for unknowns and RHS
    AMGX_SAFE_CALL(AMGX_matrix_create(&AmgXA, rsrc, precision_mode));

    // Create an AmgX solver object
    AMGX_SAFE_CALL(AMGX_solver_create(&solver, rsrc, precision_mode, cfg));

    // Obtain the default number of rings based on current configuration
    AMGX_SAFE_CALL(AMGX_config_get_default_number_of_rings(cfg, &ring));
 }

 // Groups MPI ranks into teams and assigns the roots to talk to GPUs
 void AmgXSolver::InitMPIcomms(const MPI_Comm &comm, const int nDevs)
 {
    // Duplicate the global communicator
    MPI_Comm_dup(comm, &globalCpuWorld);
    MPI_Comm_set_name(globalCpuWorld, "globalCpuWorld");

    // Get size and rank for global communicator
    MPI_Comm_size(globalCpuWorld, &globalSize);
    MPI_Comm_rank(globalCpuWorld, &myGlobalRank);

    // Get the communicator for processors on the same node (local world)
    MPI_Comm_split_type(globalCpuWorld,
                        MPI_COMM_TYPE_SHARED, 0, MPI_INFO_NULL, &localCpuWorld);
    MPI_Comm_set_name(localCpuWorld, "localCpuWorld");

    // Get size and rank for local communicator
    MPI_Comm_size(localCpuWorld, &localSize);
    MPI_Comm_rank(localCpuWorld, &myLocalRank);

    // Set up corresponding ID of the device used by each local process
    SetDeviceIDs(nDevs);

    MPI_Barrier(globalCpuWorld);

    // Split the global world into a world involved in AmgX and a null world
    MPI_Comm_split(globalCpuWorld, gpuProc, 0, &gpuWorld);

    // Get size and rank for the communicator corresponding to gpuWorld
    if (gpuWorld != MPI_COMM_NULL)
    {
       MPI_Comm_set_name(gpuWorld, "gpuWorld");
       MPI_Comm_size(gpuWorld, &gpuWorldSize);
       MPI_Comm_rank(gpuWorld, &myGpuWorldRank);
    }
    else // for those that will not communicate with the GPU
    {
       gpuWorldSize = MPI_UNDEFINED;
       myGpuWorldRank = MPI_UNDEFINED;
    }

    // Split local world into worlds corresponding to each CUDA device
    MPI_Comm_split(localCpuWorld, devID, 0, &devWorld);
    MPI_Comm_set_name(devWorld, "devWorld");

    // Get size and rank for the communicator corresponding to myWorld
    MPI_Comm_size(devWorld, &devWorldSize);
    MPI_Comm_rank(devWorld, &myDevWorldRank);

    MPI_Barrier(globalCpuWorld);
 }

 // Determine MPI teams based on available devices
 void AmgXSolver::SetDeviceIDs(const int nDevs)
 {
    // Set the ID of device that each local process will use
    if (nDevs == localSize) // # of the devices and local process are the same
    {
       devID = myLocalRank;
       gpuProc = 0;
    }
    else if (nDevs > localSize) // there are more devices than processes
    {
       MFEM_WARNING("CUDA devices on the node " << nodeName.c_str() <<
                    " are more than the MPI processes launched. Only "<<
                    nDevs << " devices will be used.\n");
       devID = myLocalRank;
       gpuProc = 0;
    }
    else // in case there are more ranks than devices
    {
       int     nBasic = localSize / nDevs,
               nRemain = localSize % nDevs;

       if (myLocalRank < (nBasic+1)*nRemain)
       {
          devID = myLocalRank / (nBasic + 1);
          if (myLocalRank % (nBasic + 1) == 0) { gpuProc = 0; }
       }
       else
       {
          devID = (myLocalRank - (nBasic+1)*nRemain) / nBasic + nRemain;
          if ((myLocalRank - (nBasic+1)*nRemain) % nBasic == 0) { gpuProc = 0; }
       }
    }
 }

 void AmgXSolver::GatherArray(const Array<double> &inArr, Array<double> &outArr,
                              const int mpiTeamSz, const MPI_Comm &mpiTeamComm) const
 {
    // Calculate number of elements to be collected from each process
    Array<int> Apart(mpiTeamSz);
    int locAsz = inArr.Size();
    MPI_Gather(&locAsz, 1, MPI_INT,
               Apart.HostWrite(),1, MPI_INT,0,mpiTeamComm);

    MPI_Barrier(mpiTeamComm);

    // Determine stride for process (to be used by root)
    Array<int> Adisp(mpiTeamSz);
    int myid; MPI_Comm_rank(mpiTeamComm, &myid);
    if (myid == 0)
    {
       Adisp[0] = 0;
       for (int i=1; i<mpiTeamSz; ++i)
       {
          Adisp[i] = Adisp[i-1] + Apart[i-1];
       }
    }

    MPI_Gatherv(inArr.HostRead(), inArr.Size(), MPI_DOUBLE,
                outArr.HostWrite(), Apart.HostRead(), Adisp.HostRead(),
                MPI_DOUBLE, 0, mpiTeamComm);
 }

 void AmgXSolver::GatherArray(const Vector &inArr, Vector &outArr,
                              const int mpiTeamSz, const MPI_Comm &mpiTeamComm) const
 {
    // Calculate number of elements to be collected from each process
    Array<int> Apart(mpiTeamSz);
    int locAsz = inArr.Size();
    MPI_Gather(&locAsz, 1, MPI_INT,
               Apart.HostWrite(),1, MPI_INT,0,mpiTeamComm);

    MPI_Barrier(mpiTeamComm);

    // Determine stride for process (to be used by root)
    Array<int> Adisp(mpiTeamSz);
    int myid; MPI_Comm_rank(mpiTeamComm, &myid);
    if (myid == 0)
    {
       Adisp[0] = 0;
       for (int i=1; i<mpiTeamSz; ++i)
       {
          Adisp[i] = Adisp[i-1] + Apart[i-1];
       }
    }

    MPI_Gatherv(inArr.HostRead(), inArr.Size(), MPI_DOUBLE,
                outArr.HostWrite(), Apart.HostRead(), Adisp.HostRead(),
                MPI_DOUBLE, 0, mpiTeamComm);
 }

 void AmgXSolver::GatherArray(const Array<int> &inArr, Array<int> &outArr,
                              const int mpiTeamSz, const MPI_Comm &mpiTeamComm) const
 {
    // Calculate number of elements to be collected from each process
    Array<int> Apart(mpiTeamSz);
    int locAsz = inArr.Size();
    MPI_Gather(&locAsz, 1, MPI_INT,
               Apart.GetData(),1, MPI_INT,0,mpiTeamComm);

    MPI_Barrier(mpiTeamComm);

    // Determine stride for process (to be used by root)
    Array<int> Adisp(mpiTeamSz);
    int myid; MPI_Comm_rank(mpiTeamComm, &myid);
    if (myid == 0)
    {
       Adisp[0] = 0;
       for (int i=1; i<mpiTeamSz; ++i)
       {
          Adisp[i] = Adisp[i-1] + Apart[i-1];
       }
    }

    MPI_Gatherv(inArr.HostRead(), inArr.Size(), MPI_INT,
                outArr.HostWrite(), Apart.HostRead(), Adisp.HostRead(),
                MPI_INT, 0, mpiTeamComm);
 }


 void AmgXSolver::GatherArray(const Array<int64_t> &inArr,
                              Array<int64_t> &outArr,
                              const int mpiTeamSz, const MPI_Comm &mpiTeamComm) const
 {
    // Calculate number of elements to be collected from each process
    Array<int> Apart(mpiTeamSz);
    int locAsz = inArr.Size();
    MPI_Gather(&locAsz, 1, MPI_INT,
               Apart.GetData(),1, MPI_INT,0,mpiTeamComm);

    MPI_Barrier(mpiTeamComm);

    // Determine stride for process
    Array<int> Adisp(mpiTeamSz);
    int myid; MPI_Comm_rank(mpiTeamComm, &myid);
    if (myid == 0)
    {
       Adisp[0] = 0;
       for (int i=1; i<mpiTeamSz; ++i)
       {
          Adisp[i] = Adisp[i-1] + Apart[i-1];
       }
    }

    MPI_Gatherv(inArr.HostRead(), inArr.Size(), MPI_INT64_T,
                outArr.HostWrite(), Apart.HostRead(), Adisp.HostRead(),
                MPI_INT64_T, 0, mpiTeamComm);

    MPI_Barrier(mpiTeamComm);
 }

 void AmgXSolver::GatherArray(const Vector &inArr, Vector &outArr,
                              const int mpiTeamSz, const MPI_Comm &mpiTeamComm,
                              Array<int> &Apart, Array<int> &Adisp) const
 {
    // Calculate number of elements to be collected from each process
    int locAsz = inArr.Size();
    MPI_Allgather(&locAsz, 1, MPI_INT,
                  Apart.HostWrite(),1, MPI_INT, mpiTeamComm);

    MPI_Barrier(mpiTeamComm);

    // Determine stride for process
    Adisp[0] = 0;
    for (int i=1; i<mpiTeamSz; ++i)
    {
       Adisp[i] = Adisp[i-1] + Apart[i-1];
    }

    MPI_Gatherv(inArr.HostRead(), inArr.Size(), MPI_DOUBLE,
                outArr.HostWrite(), Apart.HostRead(), Adisp.HostRead(),
                MPI_DOUBLE, 0, mpiTeamComm);
 }

 void AmgXSolver::ScatterArray(const Vector &inArr, Vector &outArr,
                               const int mpiTeamSz, const MPI_Comm &mpiTeamComm,
                               Array<int> &Apart, Array<int> &Adisp) const
 {
    MPI_Scatterv(inArr.HostRead(),Apart.HostRead(),Adisp.HostRead(),
                 MPI_DOUBLE,outArr.HostWrite(),outArr.Size(),
                 MPI_DOUBLE, 0, mpiTeamComm);
 }
 #endif

 void AmgXSolver::SetMatrix(const SparseMatrix &in_A, const bool update_mat)
 {
    if (update_mat == false)
    {
       AMGX_SAFE_CALL(AMGX_matrix_upload_all(AmgXA, in_A.Height(),
                                             in_A.NumNonZeroElems(),
                                             1, 1,
                                             in_A.ReadI(),
                                             in_A.ReadJ(),
                                             in_A.ReadData(), NULL));

       AMGX_SAFE_CALL(AMGX_solver_setup(solver, AmgXA));
       AMGX_SAFE_CALL(AMGX_vector_bind(AmgXP, AmgXA));
       AMGX_SAFE_CALL(AMGX_vector_bind(AmgXRHS, AmgXA));
    }
    else
    {
       AMGX_SAFE_CALL(AMGX_matrix_replace_coefficients(AmgXA,
                                                       in_A.Height(),
                                                       in_A.NumNonZeroElems(),
                                                       in_A.ReadData(), NULL));
    }
 }

 #ifdef MFEM_USE_MPI

 void AmgXSolver::SetMatrix(const HypreParMatrix &A, const bool update_mat)
 {
    // Require hypre >= 2.16.
 #if MFEM_HYPRE_VERSION < 21600
    mfem_error("Hypre version 2.16+ is required when using AmgX \n");
 #endif

    // Ensure HypreParMatrix is on the host
    A.HostRead();

    hypre_ParCSRMatrix * A_ptr =
       (hypre_ParCSRMatrix *)const_cast<HypreParMatrix&>(A);

    hypre_CSRMatrix *A_csr = hypre_MergeDiagAndOffd(A_ptr);

    A.HypreRead();

    Array<double> loc_A(A_csr->data, (int)A_csr->num_nonzeros);
    const Array<HYPRE_Int> loc_I(A_csr->i, (int)A_csr->num_rows+1);

    // Column index must be int64_t so we must promote here
    Array<int64_t> loc_J((int)A_csr->num_nonzeros);
    for (int i=0; i<A_csr->num_nonzeros; ++i)
    {
       loc_J[i] = A_csr->big_j[i];
    }

    // Assumes one GPU per MPI rank
    if (mpi_gpu_mode=="mpi-gpu-exclusive")
    {
       SetMatrixMPIGPUExclusive(A, loc_A, loc_I, loc_J, update_mat);
       // Free A_csr data from hypre_MergeDiagAndOffd method
       hypre_CSRMatrixDestroy(A_csr);
       return;
    }

    // Assumes teams of MPI ranks are sharing a GPU
    if (mpi_gpu_mode == "mpi-teams")
    {
       SetMatrixMPITeams(A, loc_A, loc_I, loc_J, update_mat);
       // Free A_csr data from hypre_MergeDiagAndOffd method
       hypre_CSRMatrixDestroy(A_csr);
       return;
    }

    mfem_error("Unsupported MPI_GPU combination \n");
 }

 void AmgXSolver::SetMatrixMPIGPUExclusive(const HypreParMatrix &A,
                                           const Array<double> &loc_A,
                                           const Array<int> &loc_I,
                                           const Array<int64_t> &loc_J,
                                           const bool update_mat)
 {
    // Create a vector of offsets describing matrix row partitions
    Array<int64_t> rowPart(gpuWorldSize+1); rowPart = 0.0;

    int64_t myStart = A.GetRowStarts()[0];

    MPI_Allgather(&myStart, 1, MPI_INT64_T,
                  rowPart.GetData(),1, MPI_INT64_T
                  ,gpuWorld);
    MPI_Barrier(gpuWorld);

    rowPart[gpuWorldSize] = A.M();

    const int nGlobalRows = A.M();
    const int local_rows = loc_I.Size()-1;
    const int num_nnz = loc_I[local_rows];

    if (update_mat == false)
    {
       AMGX_distribution_handle dist;
       AMGX_SAFE_CALL(AMGX_distribution_create(&dist, cfg));
       AMGX_SAFE_CALL(AMGX_distribution_set_partition_data(dist,
                                                           AMGX_DIST_PARTITION_OFFSETS,
                                                           rowPart.GetData()));

       AMGX_SAFE_CALL(AMGX_matrix_upload_distributed(AmgXA, nGlobalRows,
                                                     local_rows, num_nnz, 1, 1,
                                                     loc_I.Read(), loc_J.Read(),
                                                     loc_A.Read(), NULL, dist));

       AMGX_SAFE_CALL(AMGX_distribution_destroy(dist));

       MPI_Barrier(gpuWorld);

       AMGX_SAFE_CALL(AMGX_solver_setup(solver, AmgXA));

       AMGX_SAFE_CALL(AMGX_vector_bind(AmgXP, AmgXA));
       AMGX_SAFE_CALL(AMGX_vector_bind(AmgXRHS, AmgXA));
    }
    else
    {
       AMGX_SAFE_CALL(AMGX_matrix_replace_coefficients(AmgXA, nGlobalRows,
                                                       num_nnz, loc_A, NULL));
    }
 }

 void AmgXSolver::SetMatrixMPITeams(const HypreParMatrix &A,
                                    const Array<double> &loc_A,
                                    const Array<int> &loc_I,
                                    const Array<int64_t> &loc_J,
                                    const bool update_mat)
 {
    // The following arrays hold the consolidated diagonal + off-diagonal matrix
    // data
    Array<int> all_I;
    Array<int64_t> all_J;
    Array<double> all_A;

    // Determine array sizes
    int J_allsz(0), all_NNZ(0), nDevRows(0);
    const int loc_row_len = std::abs(A.RowPart()[1] -
                                     A.RowPart()[0]); // end of row partition
    const int loc_Jz_sz = loc_J.Size();
    const int loc_A_sz = loc_A.Size();

    MPI_Reduce(&loc_row_len, &nDevRows, 1, MPI_INT, MPI_SUM, 0, devWorld);
    MPI_Reduce(&loc_Jz_sz, &J_allsz, 1, MPI_INT, MPI_SUM, 0, devWorld);
    MPI_Reduce(&loc_A_sz, &all_NNZ, 1, MPI_INT, MPI_SUM, 0, devWorld);

    MPI_Barrier(devWorld);

    if (myDevWorldRank == 0)
    {
       all_I.SetSize(nDevRows+devWorldSize);
       all_J.SetSize(J_allsz); all_J = 0.0;
       all_A.SetSize(all_NNZ);
    }

    GatherArray(loc_I, all_I, devWorldSize, devWorld);
    GatherArray(loc_J, all_J, devWorldSize, devWorld);
    GatherArray(loc_A, all_A, devWorldSize, devWorld);

    MPI_Barrier(devWorld);

    int local_nnz(0);
    int64_t local_rows(0);

    if (myDevWorldRank == 0)
    {
       // A fix up step is needed for the array holding row data to remove extra
       // zeros when consolidating team data.
       Array<int> z_ind(devWorldSize+1);
       int iter = 1;
       while (iter < devWorldSize-1)
       {
          // Determine the indices of zeros in global all_I array
          int counter = 0;
          z_ind[counter] = counter;
          counter++;
          for (int idx=1; idx<all_I.Size()-1; idx++)
          {
             if (all_I[idx]==0)
             {
                z_ind[counter] = idx-1;
                counter++;
             }
          }
          z_ind[devWorldSize] = all_I.Size()-1;
          // End of determining indices of zeros in global all_I Array

          // Bump all_I
          for (int idx=z_ind[1]+1; idx < z_ind[2]; idx++)
          {
             all_I[idx] = all_I[idx-1] + (all_I[idx+1] - all_I[idx]);
          }

          // Shift array after bump to remove unnecessary values in middle of
          // array
          for (int idx=z_ind[2]; idx < all_I.Size()-1; ++idx)
          {
             all_I[idx] = all_I[idx+1];
          }
          iter++;
       }

       // LAST TIME THROUGH ARRAY
       // Determine the indices of zeros in global row_ptr array
       int counter = 0;
       z_ind[counter] = counter;
       counter++;
       for (int idx=1; idx<all_I.Size()-1; idx++)
       {
          if (all_I[idx]==0)
          {
             z_ind[counter] = idx-1;
             counter++;
          }
       }

       z_ind[devWorldSize] = all_I.Size()-1;
       // End of determining indices of zeros in global all_I Array BUMP all_I
       // one last time
       for (int idx=z_ind[1]+1; idx < all_I.Size()-1; idx++)
       {
          all_I[idx] = all_I[idx-1] + (all_I[idx+1] - all_I[idx]);
       }
       local_nnz = all_I[all_I.Size()-devWorldSize];
       local_rows = nDevRows;
    }

    // Create row partition
    mat_local_rows = local_rows; // class copy
    Array<int64_t> rowPart;
    if (gpuProc == 0)
    {
       rowPart.SetSize(gpuWorldSize+1); rowPart=0;

       MPI_Allgather(&local_rows, 1, MPI_INT64_T,
                     &rowPart.GetData()[1], 1, MPI_INT64_T,
                     gpuWorld);
       MPI_Barrier(gpuWorld);

       // Fixup step
       for (int i=1; i<rowPart.Size(); ++i)
       {
          rowPart[i] += rowPart[i-1];
       }

       // Upload A matrix to AmgX
       MPI_Barrier(gpuWorld);

       int nGlobalRows = A.M();
       if (update_mat == false)
       {
          AMGX_distribution_handle dist;
          AMGX_SAFE_CALL(AMGX_distribution_create(&dist, cfg));
          AMGX_SAFE_CALL(AMGX_distribution_set_partition_data(dist,
                                                              AMGX_DIST_PARTITION_OFFSETS,
                                                              rowPart.GetData()));

          AMGX_SAFE_CALL(AMGX_matrix_upload_distributed(AmgXA, nGlobalRows,
                                                        local_rows, local_nnz,
                                                        1, 1, all_I.ReadWrite(),
                                                        all_J.Read(),
                                                        all_A.Read(),
                                                        nullptr, dist));

          AMGX_SAFE_CALL(AMGX_distribution_destroy(dist));
          MPI_Barrier(gpuWorld);

          AMGX_SAFE_CALL(AMGX_solver_setup(solver, AmgXA));

          // Bind vectors to A
          AMGX_SAFE_CALL(AMGX_vector_bind(AmgXP, AmgXA));
          AMGX_SAFE_CALL(AMGX_vector_bind(AmgXRHS, AmgXA));
       }
       else
       {
          AMGX_SAFE_CALL(AMGX_matrix_replace_coefficients(AmgXA, nGlobalRows,
                                                          local_nnz, all_A, NULL));
       }
    }
 }

 #endif

 void AmgXSolver::SetOperator(const Operator& op)
 {
    height = op.Height();
    width = op.Width();

    if (const SparseMatrix* Aptr =
           dynamic_cast<const SparseMatrix*>(&op))
    {
       SetMatrix(*Aptr);
    }
 #ifdef MFEM_USE_MPI
    else if (const HypreParMatrix* Aptr =
                dynamic_cast<const HypreParMatrix*>(&op))
    {
       SetMatrix(*Aptr);
    }
 #endif
    else
    {
       mfem_error("Unsupported Operator Type \n");
    }
 }

 void AmgXSolver::UpdateOperator(const Operator& op)
 {
    if (const SparseMatrix* Aptr =
           dynamic_cast<const SparseMatrix*>(&op))
    {
       SetMatrix(*Aptr, true);
    }
 #ifdef MFEM_USE_MPI
    else if (const HypreParMatrix* Aptr =
                dynamic_cast<const HypreParMatrix*>(&op))
    {
       SetMatrix(*Aptr, true);
    }
 #endif
    else
    {
       mfem_error("Unsupported Operator Type \n");
    }
 }

 void AmgXSolver::Mult(const Vector& B, Vector& X) const
 {
    // Set initial guess to zero
    X.UseDevice(true);
    if (!iterative_mode) { X = 0.0; }

    // Mult for serial, and mpi-exclusive modes
    if (mpi_gpu_mode != "mpi-teams")
    {
       AMGX_SAFE_CALL(AMGX_vector_upload(AmgXP, X.Size(), 1, X.ReadWrite()));
       AMGX_SAFE_CALL(AMGX_vector_upload(AmgXRHS, B.Size(), 1, B.Read()));

       if (mpi_gpu_mode != "serial")
       {
 #ifdef MFEM_USE_MPI
          MPI_Barrier(gpuWorld);
 #endif
       }

       AMGX_SAFE_CALL(AMGX_solver_solve(solver,AmgXRHS, AmgXP));

       AMGX_SOLVE_STATUS   status;
       AMGX_SAFE_CALL(AMGX_solver_get_status(solver, &status));
       if (status != AMGX_SOLVE_SUCCESS && ConvergenceCheck)
       {
          if (status == AMGX_SOLVE_DIVERGED)
          {
             mfem_error("AmgX solver diverged \n");
          }
          else
          {
             mfem_error("AmgX solver failed to solve system \n");
          }
       }

       AMGX_SAFE_CALL(AMGX_vector_download(AmgXP, X.Write()));
       return;
    }

 #ifdef MFEM_USE_MPI
    Vector all_X(mat_local_rows);
    Vector all_B(mat_local_rows);
    Array<int> Apart_X(devWorldSize);
    Array<int> Adisp_X(devWorldSize);
    Array<int> Apart_B(devWorldSize);
    Array<int> Adisp_B(devWorldSize);

    GatherArray(X, all_X, devWorldSize, devWorld, Apart_X, Adisp_X);
    GatherArray(B, all_B, devWorldSize, devWorld, Apart_B, Adisp_B);
    MPI_Barrier(devWorld);

    if (gpuWorld != MPI_COMM_NULL)
    {
       AMGX_SAFE_CALL(AMGX_vector_upload(AmgXP, all_X.Size(), 1, all_X.ReadWrite()));
       AMGX_SAFE_CALL(AMGX_vector_upload(AmgXRHS, all_B.Size(), 1, all_B.ReadWrite()));

       MPI_Barrier(gpuWorld);

       AMGX_SAFE_CALL(AMGX_solver_solve(solver,AmgXRHS, AmgXP));

       AMGX_SOLVE_STATUS   status;
       AMGX_SAFE_CALL(AMGX_solver_get_status(solver, &status));
       if (status != AMGX_SOLVE_SUCCESS && amgxMode == SOLVER)
       {
          if (status == AMGX_SOLVE_DIVERGED)
          {
             mfem_error("AmgX solver diverged \n");
          }
          else
          {
             mfem_error("AmgX solver failed to solve system \n");
          }
       }

       AMGX_SAFE_CALL(AMGX_vector_download(AmgXP, all_X.Write()));
    }

    ScatterArray(all_X, X, devWorldSize, devWorld, Apart_X, Adisp_X);
 #endif
 }

 int AmgXSolver::GetNumIterations()
 {
    int getIters;
    AMGX_SAFE_CALL(AMGX_solver_get_iterations_number(solver, &getIters));
    return getIters;
 }

 void AmgXSolver::Finalize()
 {
    // Check instance is initialized
    if (! isInitialized || count < 1)
    {
       mfem_error("Error in AmgXSolver::Finalize(). \n"
                  "This AmgXWrapper has not been initialized. \n"
                  "Please initialize it before finalization.\n");
    }

    // Only processes using GPU are required to destroy AmgX content
 #ifdef MFEM_USE_MPI
    if (gpuProc == 0 || mpi_gpu_mode == "serial")
 #endif
    {
       // Destroy solver instance
       AMGX_SAFE_CALL(AMGX_solver_destroy(solver));

       // Destroy matrix instance
       AMGX_SAFE_CALL(AMGX_matrix_destroy(AmgXA));

       // Destroy RHS and unknown vectors
       AMGX_SAFE_CALL(AMGX_vector_destroy(AmgXP));
       AMGX_SAFE_CALL(AMGX_vector_destroy(AmgXRHS));

       // Only the last instance need to destroy resource and finalizing AmgX
       if (count == 1)
       {
          AMGX_SAFE_CALL(AMGX_resources_destroy(rsrc));
          AMGX_SAFE_CALL(AMGX_config_destroy(cfg));

          AMGX_SAFE_CALL(AMGX_finalize_plugins());
          AMGX_SAFE_CALL(AMGX_finalize());
       }
       else
       {
          AMGX_SAFE_CALL(AMGX_config_destroy(cfg));
       }
 #ifdef MFEM_USE_MPI
       // destroy gpuWorld
       if (mpi_gpu_mode != "serial")
       {
          MPI_Comm_free(&gpuWorld);
       }
 #endif
    }

    // reset necessary variables in case users want to reuse the variable of
    // this instance for a new instance
 #ifdef MFEM_USE_MPI
    gpuProc = MPI_UNDEFINED;
    if (globalCpuWorld != MPI_COMM_NULL)
    {
       MPI_Comm_free(&globalCpuWorld);
       MPI_Comm_free(&localCpuWorld);
       MPI_Comm_free(&devWorld);
    }
 #endif
    // decrease the number of instances
    count -= 1;

    // change status
    isInitialized = false;
 }

 } // mfem namespace

 #endif
mfem::AmgXSolver::ConvergenceCheck
bool ConvergenceCheck
Flag to check for convergence.
Definition: amgxsolver.hpp:77

mfem::Vector::UseDevice
virtual void UseDevice(bool use_dev) const
Enable execution of Vector operations using the mfem::Device.
Definition: vector.hpp:115

amgxsolver.hpp

mfem::Operator::Width
int Width() const
Get the width (size of input) of the Operator. Synonym with NumCols().
Definition: operator.hpp:72

mfem::Vector::Size
int Size() const
Returns the size of the vector.
Definition: vector.hpp:197

mfem::AmgXSolver::SOLVER
Definition: amgxsolver.hpp:74

mfem::AmgXSolver::CONFIG_SRC
CONFIG_SRC
Definition: amgxsolver.hpp:83

mfem::Vector::Read
virtual const double * Read(bool on_dev=true) const
Shortcut for mfem::Read(vec.GetMemory(), vec.Size(), on_dev).
Definition: vector.hpp:453

mfem::Solver::iterative_mode
bool iterative_mode
If true, use the second argument of Mult() as an initial guess.
Definition: operator.hpp:686

mfem::AmgXSolver::InitMPITeams
void InitMPITeams(const MPI_Comm &comm, const int nDevs)
Definition: amgxsolver.cpp:149

mfem::AmgXSolver::ReadParameters
void ReadParameters(const std::string config, CONFIG_SRC source)
Definition: amgxsolver.cpp:186

mfem::AmgXSolver::AmgXSolver
AmgXSolver()
Definition: amgxsolver.cpp:33

mfem::AmgXSolver::Finalize
void Finalize()
Definition: amgxsolver.cpp:990

source
void source(const Vector &x, Vector &f)
Definition: ex25.cpp:617

mfem::AmgXSolver::SetConvergenceCheck
void SetConvergenceCheck(bool setConvergenceCheck_=true)
Add a check for convergence after applying Mult.
Definition: amgxsolver.cpp:193

mfem::AmgXSolver::~AmgXSolver
~AmgXSolver()
Definition: amgxsolver.cpp:80

mfem::SparseMatrix
Data type sparse matrix.
Definition: sparsemat.hpp:50

mfem
Definition: CodeDocumentation.dox:1

mfem::AmgXSolver::InitSerial
void InitSerial()
Definition: amgxsolver.cpp:85

mfem::Vector::Write
virtual double * Write(bool on_dev=true)
Shortcut for mfem::Write(vec.GetMemory(), vec.Size(), on_dev).
Definition: vector.hpp:461

mfem::mfem_error
void mfem_error(const char *msg)
Function called when an error is encountered. Used by the macros MFEM_ABORT, MFEM_ASSERT, MFEM_VERIFY.
Definition: error.cpp:154

mfem::Array< int >

mfem::AmgXSolver::UpdateOperator
void UpdateOperator(const Operator &op)
Definition: amgxsolver.cpp:882

mfem::AmgXSolver::INTERNAL
Definition: amgxsolver.hpp:83

mfem::out
OutStream out(std::cout)
Global stream used by the library for standard output. Initially it uses the same std::streambuf as s...
Definition: globals.hpp:66

mfem::AmgXSolver::SetOperator
virtual void SetOperator(const Operator &op)
Definition: amgxsolver.cpp:859

mfem::Operator::Height
int Height() const
Get the height (size of output) of the Operator. Synonym with NumRows().
Definition: operator.hpp:66

mfem::AmgXSolver::DefaultParameters
void DefaultParameters(const AMGX_MODE amgxMode_, const bool verbose)
Definition: amgxsolver.cpp:198

mfem::Operator::height
int height
Dimension of the output / number of rows in the matrix.
Definition: operator.hpp:27

mfem::Vector::ReadWrite
virtual double * ReadWrite(bool on_dev=true)
Shortcut for mfem::ReadWrite(vec.GetMemory(), vec.Size(), on_dev).
Definition: vector.hpp:469

mfem::AmgXSolver::InitExclusiveGPU
void InitExclusiveGPU(const MPI_Comm &comm)
Definition: amgxsolver.cpp:120

mfem::AmgXSolver::Mult
virtual void Mult(const Vector &b, Vector &x) const
Operator application: y=A(x).
Definition: amgxsolver.cpp:902

mfem::Vector
Vector data type.
Definition: vector.hpp:58

mfem::Operator
Abstract operator.
Definition: operator.hpp:24

mfem::HypreParMatrix
Wrapper for hypre&#39;s ParCSR matrix class.
Definition: hypre.hpp:343

mfem::AmgXSolver::AMGX_MODE
AMGX_MODE
Flags to configure AmgXSolver as a solver or preconditioner.
Definition: amgxsolver.hpp:74

mfem::AmgXSolver::GetNumIterations
int GetNumIterations()
Definition: amgxsolver.cpp:983

mfem::Operator::width
int width
Dimension of the input / number of columns in the matrix.
Definition: operator.hpp:28