Program Listing for File CUDACholeksySparseSolver.inl
↰ Return to documentation for file (src/SofaCUDALinearSolver/CUDACholeksySparseSolver.inl)
/******************************************************************************
* SOFA, Simulation Open-Framework Architecture *
* (c) 2006 INRIA, USTL, UJF, CNRS, MGH *
* *
* This program is free software; you can redistribute it and/or modify it *
* under the terms of the GNU Lesser General Public License as published by *
* the Free Software Foundation; either version 2.1 of the License, or (at *
* your option) any later version. *
* *
* This program is distributed in the hope that it will be useful, but WITHOUT *
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or *
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License *
* for more details. *
* *
* You should have received a copy of the GNU Lesser General Public License *
* along with this program. If not, see <http://www.gnu.org/licenses/>. *
*******************************************************************************
* Authors: The SOFA Team and external contributors (see Authors.txt) *
* *
* Contact information: contact@sofa-framework.org *
******************************************************************************/
#pragma once
#include <SofaCUDALinearSolver/CUDACholeksySparseSolver.h>
#include <sofa/helper/ScopedAdvancedTimer.h>
#include <SofaCUDALinearSolver/utils.h>
#include <cusparse.h>
namespace sofa::component::linearsolver::direct
{
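// Direct linear solver based on a sparse Cholesky factorization performed with
// cuSOLVER (cusolverSp). The factorization can run on the CPU (host API) or on
// the GPU, and an optional fill-in reducing permutation (RCM, AMD or METIS) can
// be applied beforehand to limit fill-in.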
template<class TMatrix , class TVector>
CUDASparseCholeskySolver<TMatrix,TVector>::CUDASparseCholeskySolver()
: Inherit1()
, d_typePermutation(initData(&d_typePermutation, "permutation", "Type of fill-in reducing permutation"))
, d_hardware(initData(&d_hardware, "hardware", "On which hardware to solve the linear system: CPU or GPU"))
{
sofa::helper::OptionsGroup typePermutationOptions(4, "None", "RCM", "AMD", "METIS");
typePermutationOptions.setSelectedItem(0); // default: None
d_typePermutation.setValue(typePermutationOptions);
sofa::helper::OptionsGroup hardwareOptions(2, "CPU", "GPU");
hardwareOptions.setSelectedItem(1); // default: GPU
d_hardware.setValue(hardwareOptions);
cusolverSpCreate(&handle);
cusparseCreate(&cusparseHandle);
cudaStreamCreate(&stream);
cusolverSpSetStream(handle, stream);
cusparseSetStream(cusparseHandle, stream);
cusparseCreateMatDescr(&descr);
cusparseSetMatType(descr, CUSPARSE_MATRIX_TYPE_GENERAL);
cusparseSetMatIndexBase(descr, CUSPARSE_INDEX_BASE_ZERO);
host_RowPtr = nullptr;
host_ColsInd = nullptr;
host_values = nullptr;
device_RowPtr = nullptr;
device_ColsInd = nullptr;
device_values = nullptr;
device_x = nullptr;
device_b = nullptr;
buffer_cpu = nullptr;
buffer_gpu = nullptr;
device_info = nullptr;
host_info = nullptr;
singularity = 0;
size_internal = 0;
size_work = 0;
size_perm = 0;
notSameShape = true;
nnz = 0;
previous_n = 0;
previous_nnz = 0;
rows = 0;
reorder = 0;
previous_ColsInd.clear();
previous_RowPtr.clear();
}
template<class TMatrix , class TVector>
CUDASparseCholeskySolver<TMatrix,TVector>::~CUDASparseCholeskySolver()
{
previous_RowPtr.clear();
previous_ColsInd.clear();
checkCudaErrors(cudaFree(device_x));
checkCudaErrors(cudaFree(device_b));
checkCudaErrors(cudaFree(device_RowPtr));
checkCudaErrors(cudaFree(device_ColsInd));
checkCudaErrors(cudaFree(device_values));
cusolverSpDestroy(handle);
cusolverSpDestroyCsrcholInfo(device_info);
cusolverSpDestroyCsrcholInfoHost(host_info);
cusparseDestroyMatDescr(descr);
cudaStreamDestroy(stream);
}
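// Query the workspace sizes (size_internal, size_work) required by the Cholesky
// factorization, using the host or device variant of the cusolverSp buffer-info
// routine depending on the selected hardware.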
template <class TMatrix, class TVector>
void CUDASparseCholeskySolver<TMatrix, TVector>::setWorkspace()
{
if (d_hardware.getValue().getSelectedId() == 0)
{
const int* hRow = reorder != 0 ? host_rowPermuted.data() : host_RowPtr;
const int* hCol = reorder != 0 ? host_colPermuted.data() : host_ColsInd;
const Real* hValues = reorder != 0 ? host_valuePermuted.data() : host_values;
if constexpr (std::is_same_v<Real, double>)
{
checksolver(cusolverSpDcsrcholBufferInfoHost( handle, rows, nnz, descr, hValues, hRow, hCol,
host_info, &size_internal, &size_work ));
}
else
{
checksolver(cusolverSpScsrcholBufferInfoHost( handle, rows, nnz, descr, hValues, hRow, hCol,
host_info, &size_internal, &size_work ));
}
}
else
{
if constexpr (std::is_same_v<Real, double>)
{
checksolver(cusolverSpDcsrcholBufferInfo( handle, rows, nnz, descr, device_values, device_RowPtr, device_ColsInd,
device_info, &size_internal, &size_work ));
}
else
{
checksolver(cusolverSpScsrcholBufferInfo( handle, rows, nnz, descr, device_values, device_RowPtr, device_ColsInd,
device_info, &size_internal, &size_work ));
}
}
}
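// Numeric factorization: compute the Cholesky factor of the (possibly permuted)
// matrix, reusing the pattern produced by the symbolic analysis.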
template <class TMatrix, class TVector>
void CUDASparseCholeskySolver<TMatrix, TVector>::numericFactorization()
{
if (d_hardware.getValue().getSelectedId() == 0)
{
const int* hRow = reorder != 0 ? host_rowPermuted.data() : host_RowPtr;
const int* hCol = reorder != 0 ? host_colPermuted.data() : host_ColsInd;
const Real* hValues = reorder != 0 ? host_valuePermuted.data() : host_values;
if constexpr (std::is_same_v<Real, double>)
{
checksolver(cusolverSpDcsrcholFactorHost( handle, rows, nnz, descr, hValues, hRow, hCol,
host_info, buffer_gpu ));
}
else
{
checksolver(cusolverSpScsrcholFactorHost( handle, rows, nnz, descr, hValues, hRow, hCol,
host_info, buffer_gpu ));
}
}
else
{
if constexpr (std::is_same_v<Real, double>)
{
checksolver(cusolverSpDcsrcholFactor( handle, rows, nnz, descr, device_values, device_RowPtr, device_ColsInd,
device_info, buffer_gpu ));
}
else
{
checksolver(cusolverSpScsrcholFactor( handle, rows, nnz, descr, device_values, device_RowPtr, device_ColsInd,
device_info, buffer_gpu ));
}
}
}
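// (Re)create the opaque csrcholInfo structure that stores the factorization
// state; called whenever the matrix shape changes.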
template <class TMatrix, class TVector>
void CUDASparseCholeskySolver<TMatrix, TVector>::createCholeskyInfo()
{
cusolverSpDestroyCsrcholInfo(device_info);
cusolverSpDestroyCsrcholInfoHost(host_info);
if (d_hardware.getValue().getSelectedId() == 0)
{
checksolver(cusolverSpCreateCsrcholInfoHost(&host_info));
}
else
{
checksolver(cusolverSpCreateCsrcholInfo(&device_info));
}
}
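// Symbolic factorization: analyze the sparsity pattern so that subsequent
// numeric factorizations can reuse it as long as the shape is unchanged.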
template <class TMatrix, class TVector>
void CUDASparseCholeskySolver<TMatrix, TVector>::symbolicFactorization()
{
if (d_hardware.getValue().getSelectedId() == 0)
{
const int* hRow = reorder != 0 ? host_rowPermuted.data() : host_RowPtr;
const int* hCol = reorder != 0 ? host_colPermuted.data() : host_ColsInd;
checksolver(cusolverSpXcsrcholAnalysisHost( handle, rows, nnz, descr, hRow, hCol, host_info ));
}
else
{
checksolver(cusolverSpXcsrcholAnalysis( handle, rows, nnz, descr, device_RowPtr, device_ColsInd, device_info ));
cudaStreamSynchronize(stream);// for the timer
}
}
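// Build the factorization of M: filter and compress the matrix, recompute the
// fill-in reducing permutation when the shape has changed, upload the CSR data
// to the device if solving on the GPU, then run the symbolic (on shape change
// only) and numeric factorizations.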
template<class TMatrix , class TVector>
void CUDASparseCholeskySolver<TMatrix,TVector>::invert(Matrix& M)
{
sofa::helper::ScopedAdvancedTimer invertTimer("invert");
{
sofa::helper::ScopedAdvancedTimer copyTimer("copyMatrixData");
m_filteredMatrix.copyNonZeros(M);
m_filteredMatrix.compress();
}
reorder = d_typePermutation.getValue().getSelectedId();
rows = m_filteredMatrix.rowSize();
cols = m_filteredMatrix.colSize();
nnz = m_filteredMatrix.getColsValue().size(); // number of non-zero coefficients
host_RowPtr = (int*) m_filteredMatrix.getRowBegin().data();
host_ColsInd = (int*) m_filteredMatrix.getColsIndex().data();
host_values = (Real*) m_filteredMatrix.getColsValue().data();
{
sofa::helper::ScopedAdvancedTimer compareMatrixShapeTimer("compareMatrixShape");
notSameShape = compareMatrixShape(rows , host_ColsInd, host_RowPtr, previous_RowPtr.size()-1, previous_ColsInd.data(), previous_RowPtr.data() );
}
// allocate memory
if(previous_n < rows)
{
host_rowPermuted.resize(rows + 1);
if (d_hardware.getValue().getSelectedId() != 0)
{
checkCudaErrors(cudaMalloc( &device_RowPtr, sizeof(int)*( rows +1) ));
checkCudaErrors(cudaMalloc(&device_x, sizeof(Real)*cols));
checkCudaErrors(cudaMalloc(&device_b, sizeof(Real)*cols));
}
}
if(previous_nnz < nnz)
{
if (d_hardware.getValue().getSelectedId() != 0)
{
if(device_ColsInd) cudaFree(device_ColsInd);
checkCudaErrors(cudaMalloc( &device_ColsInd, sizeof(int)*nnz ));
if(device_values) cudaFree(device_values);
checkCudaErrors(cudaMalloc( &device_values, sizeof(Real)*nnz ));
}
host_colPermuted.resize(nnz);
host_valuePermuted.resize(nnz);
}
// B = P*A*P^t
// compute the fill-in reducing permutation
if( reorder != 0 && notSameShape)
{
sofa::helper::ScopedAdvancedTimer permutationsTimer("Permutations");
host_perm.resize(rows);
switch( reorder )
{
default:
case 0: // None; this case should not be reached
break;
case 1: // RCM, symmetric reverse Cuthill-McKee permutation
checksolver( cusolverSpXcsrsymrcmHost(handle, rows, nnz, descr, host_RowPtr, host_ColsInd, host_perm.data()) );
break;
case 2: // AMD, symmetric approximate minimum degree based on a quotient graph
checksolver( cusolverSpXcsrsymamdHost(handle, rows, nnz, descr, host_RowPtr, host_ColsInd, host_perm.data()) );
break;
case 3: // METIS, nested dissection
checksolver( cusolverSpXcsrmetisndHost(handle, rows, nnz, descr, host_RowPtr, host_ColsInd, nullptr, host_perm.data()) );
break;
}
checksolver( cusolverSpXcsrperm_bufferSizeHost(handle, rows, cols, nnz, descr, host_RowPtr, host_ColsInd
, host_perm.data(), host_perm.data(), &size_perm) );
if(buffer_cpu) free(buffer_cpu);
buffer_cpu = (void*)malloc(sizeof(char)*size_perm) ;
host_map.resize(nnz);
std::iota(host_map.begin(), host_map.end(), 0);
//apply the permutation
for(int i=0;i<rows+1;i++) host_rowPermuted[i] = host_RowPtr[i];
for(int i=0;i<nnz;i++) host_colPermuted[i] = host_ColsInd[i];
checksolver( cusolverSpXcsrpermHost( handle, rows, cols, nnz, descr, host_rowPermuted.data(), host_colPermuted.data()
, host_perm.data(), host_perm.data(), host_map.data(), buffer_cpu ));
}
// send data to the device
if (notSameShape && d_hardware.getValue().getSelectedId() != 0)
{
const int* host_rowPtrToCopy = reorder != 0 ? host_rowPermuted.data() : host_RowPtr;
const int* host_colPtrToCopy = reorder != 0 ? host_colPermuted.data() : host_ColsInd;
checkCudaErrors( cudaMemcpyAsync( device_RowPtr, host_rowPtrToCopy, sizeof(int)*(rows+1), cudaMemcpyHostToDevice, stream) );
checkCudaErrors( cudaMemcpyAsync( device_ColsInd, host_colPtrToCopy, sizeof(int)*nnz, cudaMemcpyHostToDevice, stream ) );
}
//store the shape of the matrix
if(notSameShape)
{
previous_nnz = nnz ;
previous_RowPtr.resize( rows + 1 );
previous_ColsInd.resize( nnz );
for(int i=0;i<nnz;i++) previous_ColsInd[i] = host_ColsInd[i];
for(int i=0; i<rows +1; i++) previous_RowPtr[i] = host_RowPtr[i];
}
if( reorder != 0)
{
sofa::helper::ScopedAdvancedTimer reorderValuesTimer("ReorderValues");
for(int i=0;i<nnz;i++)
{
host_valuePermuted[i] = host_values[ host_map[i] ];
}
}
if (d_hardware.getValue().getSelectedId() != 0)
{
const Real* hostValuesToCopy = reorder != 0 ? host_valuePermuted.data() : host_values;
checkCudaErrors( cudaMemcpyAsync( device_values, hostValuesToCopy, sizeof(Real)*nnz, cudaMemcpyHostToDevice, stream ) );
}
// factorize LL^t = PAP^t, on the host or the device depending on d_hardware
if(notSameShape)
{
createCholeskyInfo();
{
sofa::helper::ScopedAdvancedTimer symbolicTimer("Symbolic factorization");
symbolicFactorization();
}
setWorkspace();
if(buffer_gpu) cudaFree(buffer_gpu);
if (d_hardware.getValue().getSelectedId() == 0)
{
checkCudaErrors(cudaMallocHost(&buffer_gpu, sizeof(char)*size_work));
}
else
{
checkCudaErrors(cudaMalloc(&buffer_gpu, sizeof(char)*size_work));
}
}
{
sofa::helper::ScopedAdvancedTimer numericTimer("Numeric factorization");
numericFactorization();
cudaStreamSynchronize(stream);// for the timer
}
}
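// Triangular solves with the precomputed Cholesky factor, on the host or the
// device depending on the selected hardware. b is the right-hand side, x the
// solution.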
template <class TMatrix, class TVector>
void CUDASparseCholeskySolver<TMatrix, TVector>::solve_impl(int n, Real* b, Real* x)
{
if (d_hardware.getValue().getSelectedId() == 0)
{
Real* host_b = (reorder != 0) ? host_bPermuted.data() : b;
Real* host_x = (reorder != 0) ? host_xPermuted.data() : x;
if constexpr (std::is_same_v<Real, double>)
{
checksolver(cusolverSpDcsrcholSolveHost( handle, n, host_b, host_x, host_info, buffer_gpu ));
}
else
{
checksolver(cusolverSpScsrcholSolveHost( handle, n, host_b, host_x, host_info, buffer_gpu ));
}
}
else
{
if constexpr (std::is_same_v<Real, double>)
{
checksolver(cusolverSpDcsrcholSolve( handle, n, device_b, device_x, device_info, buffer_gpu ));
}
else
{
checksolver(cusolverSpScsrcholSolve( handle, n, device_b, device_x, device_info, buffer_gpu ));
}
}
}
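// Solve M x = b with the factorization computed in invert(): permute the
// right-hand side if a reordering is active, copy it to the device when solving
// on the GPU, run the triangular solves, then copy the solution back and apply
// the inverse permutation.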
template<class TMatrix , class TVector>
void CUDASparseCholeskySolver<TMatrix,TVector>::solve(Matrix& M, Vector& x, Vector& b)
{
sofa::helper::ScopedAdvancedTimer solveTimer("solve");
int n = M.colSize(); // local copy to prevent a compilation warning
if( previous_n < n && reorder != 0 )
{
host_bPermuted.resize(n);
host_xPermuted.resize(n);
}
if( reorder != 0 )
{
sofa::helper::ScopedAdvancedTimer reorderRHSTimer("reorderRHS");
for(int i = 0; i < n; ++i)
{
host_bPermuted[i] = b[ host_perm[i] ];
}
}
Real* host_b = (reorder != 0) ? host_bPermuted.data() : b.ptr();
Real* host_x = (reorder != 0) ? host_xPermuted.data() : x.ptr();
if (d_hardware.getValue().getSelectedId() != 0)
{
sofa::helper::ScopedAdvancedTimer copyRHSToDeviceTimer("copyRHSToDevice");
checkCudaErrors(cudaMemcpyAsync( device_b, host_b, sizeof(Real)*n, cudaMemcpyHostToDevice,stream));
checkCudaErrors(cudaStreamSynchronize(stream));
}
{
// LL^t y = Pb
sofa::helper::ScopedAdvancedTimer solveTimer("Solve");
solve_impl(n, host_b, host_x); // signature is (n, rhs, solution)
checkCudaErrors(cudaStreamSynchronize(stream));
}
if (d_hardware.getValue().getSelectedId() != 0)
{
sofa::helper::ScopedAdvancedTimer copySolutionToHostTimer("copySolutionToHost");
checkCudaErrors(cudaMemcpyAsync( host_x, device_x, sizeof(Real)*n, cudaMemcpyDeviceToHost,stream));
cudaStreamSynchronize(stream);
}
if( reorder != 0 )
{
sofa::helper::ScopedAdvancedTimer reorderSolutionTimer("reorderSolution");
for(int i = 0; i < n; ++i)
{
x[host_perm[i]] = host_xPermuted[ i ]; // Px = y
}
}
previous_n = n ;
}
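// Return true if the CSR pattern (row pointers and column indices) of M differs
// from the previously stored pattern P, so that the analysis is redone only when
// necessary.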
inline bool compareMatrixShape(const int s_M,const int * M_colind,const int * M_rowptr,const int s_P,const int * P_colind,const int * P_rowptr)
{
if (s_M != s_P) return true;
if (M_rowptr[s_M] != P_rowptr[s_M] ) return true;
for (int i=0;i<s_P;i++) {
if (M_rowptr[i]!=P_rowptr[i]) return true;
}
for (int i=0;i<M_rowptr[s_M];i++) {
if (M_colind[i]!=P_colind[i]) return true;
}
return false;
}
}// namespace sofa::component::linearsolver::direct