From 50cdd0e7660507ed25e3e3bf6a6464e91340a117 Mon Sep 17 00:00:00 2001 From: Kartick Ramakrishnan Date: Sun, 10 Mar 2024 21:12:50 +0530 Subject: [PATCH 1/2] BLASWrapper bug fixes --- include/BLASWrapper.h | 105 ++++++++----- utils/BLASWrapperDevice.cu.cc | 239 ++++++++++++++++++++++++++++- utils/BLASWrapperDevice.hip.cc | 245 +++++++++++++++++++++++++++++- utils/BLASWrapperDeviceKernels.cc | 113 +++++++++++++- utils/BLASWrapperHost.cc | 207 ++++++++++++++++++++++--- 5 files changed, 838 insertions(+), 71 deletions(-) diff --git a/include/BLASWrapper.h b/include/BLASWrapper.h index 02a9b8632..9c3f40d62 100644 --- a/include/BLASWrapper.h +++ b/include/BLASWrapper.h @@ -207,6 +207,17 @@ namespace dftfe const unsigned int INCY, std::complex * result) const; + // Complex dot proeuct with all Reduce call + void + xdot(const unsigned int N, + const std::complex *X, + const unsigned int INCX, + const std::complex *Y, + const unsigned int INCY, + const MPI_Comm & mpi_communicator, + std::complex * result) const; + + // Real double Ax+y void xaxpy(const unsigned int n, @@ -233,14 +244,6 @@ namespace dftfe double * y, const unsigned int incy) const; - // Real copy of double data to float - void - xcopy(const unsigned int n, - double * x, - const unsigned int incx, - float * y, - const unsigned int incy) const; - // Complex double copy of data void xcopy(const unsigned int n, @@ -265,13 +268,6 @@ namespace dftfe std::complex * y, const unsigned int incy) const; - void - xcopy(const unsigned int n, - std::complex *x, - const unsigned int incx, - std::complex * y, - const unsigned int incy) const; - // Real double symmetric matrix-vector product void xsymv(const char UPLO, @@ -500,6 +496,14 @@ namespace dftfe const ValueType1 * copyFromVec, ValueType2 * copyToVec); + template + void + axpby(const unsigned int n, + const ValueType2 alpha, + const ValueType1 * x, + const ValueType2 beta, + ValueType1 * y) const; + template void axpyStridedBlockAtomicAdd(const dftfe::size_type contiguousBlockSize, @@ -509,13 +513,14 @@ namespace dftfe const dftfe::global_size_type *addToVecStartingContiguousBlockIds) const; - template + template void axpyStridedBlockAtomicAdd(const dftfe::size_type contiguousBlockSize, const dftfe::size_type numContiguousBlocks, - const ValueType * addFromVec, - double * addToVecReal, - double * addToVecImag, + const ValueType1 a, + const ValueType1 * s, + const ValueType2 * addFromVec, + ValueType2 * addToVec, const dftfe::global_size_type *addToVecStartingContiguousBlockIds) const; @@ -527,6 +532,17 @@ namespace dftfe const ValueType1 * s, ValueType2 * x); + template + void + stridedBlockScaleCopy( + const dftfe::size_type contiguousBlockSize, + const dftfe::size_type numContiguousBlocks, + const ValueType1 a, + const ValueType1 * s, + const ValueType2 * copyFromVec, + ValueType2 * copyToVecBlock, + const dftfe::global_size_type *copyFromVecStartingContiguousBlockIds); + void add(double * y, const double * x, @@ -721,6 +737,16 @@ namespace dftfe const unsigned int INCY, std::complex * result) const; + // Complex dot product + void + xdot(const unsigned int N, + const std::complex *X, + const unsigned int INCX, + const std::complex *Y, + const unsigned int INCY, + const MPI_Comm & mpi_communicator, + std::complex * result) const; + // Real double Ax+y void xaxpy(const unsigned int n, @@ -747,14 +773,6 @@ namespace dftfe double * y, const unsigned int incy) const; - // Real copy of double data - void - xcopy(const unsigned int n, - double * x, - const unsigned int incx, - float * y, - const unsigned int incy) const; - // Complex double copy of data void xcopy(const unsigned int n, @@ -779,13 +797,6 @@ namespace dftfe std::complex * y, const unsigned int incy) const; - void - xcopy(const unsigned int n, - std::complex *x, - const unsigned int incx, - std::complex * y, - const unsigned int incy) const; - // Real double symmetric matrix-vector product void xsymv(const char UPLO, @@ -1011,6 +1022,13 @@ namespace dftfe const dftfe::size_type startingId, const ValueType1 * copyFromVec, ValueType2 * copyToVec); + template + void + axpby(const unsigned int n, + const ValueType2 alpha, + const ValueType1 * x, + const ValueType2 beta, + ValueType1 * y) const; template void @@ -1021,13 +1039,14 @@ namespace dftfe const dftfe::global_size_type *addToVecStartingContiguousBlockIds) const; - template + template void axpyStridedBlockAtomicAdd(const dftfe::size_type contiguousBlockSize, const dftfe::size_type numContiguousBlocks, - const ValueType * addFromVec, - double * addToVecReal, - double * addToVecImag, + const ValueType1 a, + const ValueType1 * s, + const ValueType2 * addFromVec, + ValueType2 * addToVec, const dftfe::global_size_type *addToVecStartingContiguousBlockIds) const; @@ -1038,6 +1057,16 @@ namespace dftfe const ValueType1 a, const ValueType1 * s, ValueType2 * x); + template + void + stridedBlockScaleCopy( + const dftfe::size_type contiguousBlockSize, + const dftfe::size_type numContiguousBlocks, + const ValueType1 a, + const ValueType1 * s, + const ValueType2 * copyFromVec, + ValueType2 * copyToVecBlock, + const dftfe::global_size_type *copyFromVecStartingContiguousBlockIds); void add(double * y, diff --git a/utils/BLASWrapperDevice.cu.cc b/utils/BLASWrapperDevice.cu.cc index 1433721a8..f505005d7 100644 --- a/utils/BLASWrapperDevice.cu.cc +++ b/utils/BLASWrapperDevice.cu.cc @@ -41,6 +41,67 @@ namespace dftfe return d_deviceBlasHandle; } + void + BLASWrapper::xcopy( + const unsigned int n, + const std::complex *x, + const unsigned int incx, + std::complex * y, + const unsigned int incy) const + { + dftfe::utils::deviceBlasStatus_t status = + cublasZcopy(d_deviceBlasHandle, + n, + dftfe::utils::makeDataTypeDeviceCompatible(x), + incx, + dftfe::utils::makeDataTypeDeviceCompatible(y), + incy); + DEVICEBLAS_API_CHECK(status); + } + + void + BLASWrapper::xcopy( + const unsigned int n, + const std::complex *x, + const unsigned int incx, + std::complex * y, + const unsigned int incy) const + { + dftfe::utils::deviceBlasStatus_t status = + cublasCcopy(d_deviceBlasHandle, + n, + dftfe::utils::makeDataTypeDeviceCompatible(x), + incx, + dftfe::utils::makeDataTypeDeviceCompatible(y), + incy); + DEVICEBLAS_API_CHECK(status); + } + + void + BLASWrapper::xcopy( + const unsigned int n, + const double * x, + const unsigned int incx, + double * y, + const unsigned int incy) const + { + dftfe::utils::deviceBlasStatus_t status = + cublasDcopy(d_deviceBlasHandle, n, x, incx, y, incy); + DEVICEBLAS_API_CHECK(status); + } + + void + BLASWrapper::xcopy( + const unsigned int n, + const float * x, + const unsigned int incx, + float * y, + const unsigned int incy) const + { + dftfe::utils::deviceBlasStatus_t status = + cublasScopy(d_deviceBlasHandle, n, x, incx, y, incy); + DEVICEBLAS_API_CHECK(status); + } void BLASWrapper::xgemm( @@ -496,6 +557,25 @@ namespace dftfe DEVICEBLAS_API_CHECK(status); } + template + void + BLASWrapper::axpby( + const unsigned int n, + const ValueType2 alpha, + const ValueType1 * x, + const ValueType2 beta, + ValueType1 * y) const + { + axpbyDeviceKernel<<<(n / dftfe::utils::DEVICE_BLOCK_SIZE) + 1, + dftfe::utils::DEVICE_BLOCK_SIZE>>>( + n, + dftfe::utils::makeDataTypeDeviceCompatible(x), + dftfe::utils::makeDataTypeDeviceCompatible(y), + alpha, + beta); + } + + template void BLASWrapper::axpyStridedBlockAtomicAdd( @@ -518,6 +598,32 @@ namespace dftfe } + template + void + BLASWrapper::axpyStridedBlockAtomicAdd( + const dftfe::size_type contiguousBlockSize, + const dftfe::size_type numContiguousBlocks, + const ValueType1 a, + const ValueType1 * s, + const ValueType2 * addFromVec, + ValueType2 * addToVec, + const dftfe::global_size_type *addToVecStartingContiguousBlockIds) const + { + axpyStridedBlockAtomicAddDeviceKernel<<< + (contiguousBlockSize * numContiguousBlocks) / + dftfe::utils::DEVICE_BLOCK_SIZE + + 1, + dftfe::utils::DEVICE_BLOCK_SIZE>>>( + contiguousBlockSize, + numContiguousBlocks, + dftfe::utils::makeDataTypeDeviceCompatible(a), + dftfe::utils::makeDataTypeDeviceCompatible(s), + dftfe::utils::makeDataTypeDeviceCompatible(addFromVec), + dftfe::utils::makeDataTypeDeviceCompatible(addToVec), + addToVecStartingContiguousBlockIds); + } + + void BLASWrapper::xdot( @@ -529,8 +635,9 @@ namespace dftfe const MPI_Comm & mpi_communicator, double * result) const { - double localResult = 0.0; - dftfe::utils::deviceBlasStatus_t status = cublasDdot( + double localResult = 0.0; + *result = 0.0; + dftfe::utils::deviceBlasStatus_t status = cublasDdot( d_deviceBlasHandle, int(N), X, int(INCX), Y, int(INCY), &localResult); DEVICEBLAS_API_CHECK(status); MPI_Allreduce( @@ -572,6 +679,35 @@ namespace dftfe DEVICEBLAS_API_CHECK(status); } + void + BLASWrapper::xdot( + const unsigned int N, + const std::complex *X, + const unsigned int INCX, + const std::complex *Y, + const unsigned int INCY, + const MPI_Comm & mpi_communicator, + std::complex * result) const + { + std::complex localResult = 0.0; + *result = 0.0; + dftfe::utils::deviceBlasStatus_t status = + cublasZdotc(d_deviceBlasHandle, + int(N), + dftfe::utils::makeDataTypeDeviceCompatible(X), + int(INCX), + dftfe::utils::makeDataTypeDeviceCompatible(Y), + int(INCY), + dftfe::utils::makeDataTypeDeviceCompatible(&localResult)); + DEVICEBLAS_API_CHECK(status); + MPI_Allreduce(&localResult, + result, + 1, + dataTypes::mpi_type_id(result), + MPI_SUM, + mpi_communicator); + } + void BLASWrapper::xgemmStridedBatched( const char transA, @@ -1068,7 +1204,8 @@ namespace dftfe const MPI_Comm & mpi_communicator, double * result) const { - double localresult = 0.0; + double localresult = 0.0; + *result = 0.0; dftfe::utils::deviceBlasStatus_t status = cublasDznrm2(d_deviceBlasHandle, int(n), @@ -1089,12 +1226,14 @@ namespace dftfe const MPI_Comm & mpi_communicator, double * result) const { - double localresult = 0.0; + double localresult = 0.0; + *result = 0.0; dftfe::utils::deviceBlasStatus_t status = cublasDnrm2(d_deviceBlasHandle, int(n), x, int(incx), &localresult); localresult *= localresult; MPI_Allreduce( &localresult, result, 1, MPI_DOUBLE, MPI_SUM, mpi_communicator); + *result = std::sqrt(*result); } @@ -1286,6 +1425,50 @@ namespace dftfe dftfe::utils::makeDataTypeDeviceCompatible(copyFromVec), dftfe::utils::makeDataTypeDeviceCompatible(copyToVec)); } + + template + void + BLASWrapper::stridedBlockScaleCopy( + const dftfe::size_type contiguousBlockSize, + const dftfe::size_type numContiguousBlocks, + const ValueType1 a, + const ValueType1 * s, + const ValueType2 * copyFromVec, + ValueType2 * copyToVecBlock, + const dftfe::global_size_type *copyFromVecStartingContiguousBlockIds) + { + stridedCopyToBlockScaleDeviceKernel<<< + (contiguousBlockSize * numContiguousBlocks) / + dftfe::utils::DEVICE_BLOCK_SIZE + + 1, + dftfe::utils::DEVICE_BLOCK_SIZE>>>( + contiguousBlockSize, + numContiguousBlocks, + dftfe::utils::makeDataTypeDeviceCompatible(a), + dftfe::utils::makeDataTypeDeviceCompatible(s), + dftfe::utils::makeDataTypeDeviceCompatible(copyFromVec), + dftfe::utils::makeDataTypeDeviceCompatible(copyToVecBlock), + copyFromVecStartingContiguousBlockIds); + } + template void + BLASWrapper::stridedBlockScaleCopy( + const dftfe::size_type contiguousBlockSize, + const dftfe::size_type numContiguousBlocks, + const double a, + const double * s, + const double * copyFromVec, + double * copyToVecBlock, + const dftfe::global_size_type *addToVecStartingContiguousBlockIds); + template void + BLASWrapper::stridedBlockScaleCopy( + const dftfe::size_type contiguousBlockSize, + const dftfe::size_type numContiguousBlocks, + const double a, + const double * s, + const std::complex * copyFromVec, + std::complex * copyToVecBlock, + const dftfe::global_size_type *addToVecStartingContiguousBlockIds); + template void BLASWrapper::stridedBlockScale( @@ -1404,6 +1587,42 @@ namespace dftfe std::complex * addToVec, const dftfe::global_size_type *addToVecStartingContiguousBlockIds) const; + template void + BLASWrapper::axpyStridedBlockAtomicAdd( + const dftfe::size_type contiguousBlockSize, + const dftfe::size_type numContiguousBlocks, + const double a, + const double * s, + const double * addFromVec, + double * addToVec, + const dftfe::global_size_type *addToVecStartingContiguousBlockIds) const; + + template void + BLASWrapper::axpyStridedBlockAtomicAdd( + const dftfe::size_type contiguousBlockSize, + const dftfe::size_type numContiguousBlocks, + const double a, + const double * s, + const std::complex * addFromVec, + std::complex * addToVec, + const dftfe::global_size_type *addToVecStartingContiguousBlockIds) const; + + template void + BLASWrapper::axpby(const unsigned int n, + const double alpha, + const double *x, + const double beta, + double *y) const; + + + template void + BLASWrapper::axpby( + const unsigned int n, + const double alpha, + const std::complex *x, + const double beta, + std::complex * y) const; + // for xscal template void BLASWrapper::xscal( @@ -1429,6 +1648,12 @@ namespace dftfe const std::complex a, const dftfe::size_type n) const; + template void + BLASWrapper::xscal( + std::complex * x, + const double a, + const dftfe::size_type n) const; + template void BLASWrapper::stridedCopyToBlock( const dftfe::size_type contiguousBlockSize, @@ -1520,6 +1745,12 @@ namespace dftfe const std::complex *copyFromVec, std::complex * copyToVec); + template void + BLASWrapper::copyRealArrsToComplexArr( + const dftfe::size_type size, + const double * realArr, + const double * imagArr, + std::complex * complexArr); } // End of namespace linearAlgebra } // End of namespace dftfe diff --git a/utils/BLASWrapperDevice.hip.cc b/utils/BLASWrapperDevice.hip.cc index 873d28386..0bc0a822e 100644 --- a/utils/BLASWrapperDevice.hip.cc +++ b/utils/BLASWrapperDevice.hip.cc @@ -129,6 +129,67 @@ namespace dftfe return d_deviceBlasHandle; } + void + BLASWrapper::xcopy( + const unsigned int n, + const std::complex *x, + const unsigned int incx, + std::complex * y, + const unsigned int incy) const + { + dftfe::utils::deviceBlasStatus_t status = + hipblasZcopy(d_deviceBlasHandle, + n, + dftfe::utils::makeDataTypeHipBlasCompatible(x), + incx, + dftfe::utils::makeDataTypeHipBlasCompatible(y), + incy); + DEVICEBLAS_API_CHECK(status); + } + + void + BLASWrapper::xcopy( + const unsigned int n, + const std::complex *x, + const unsigned int incx, + std::complex * y, + const unsigned int incy) const + { + dftfe::utils::deviceBlasStatus_t status = + hipblasCcopy(d_deviceBlasHandle, + n, + dftfe::utils::makeDataTypeHipBlasCompatible(x), + incx, + dftfe::utils::makeDataTypeHipBlasCompatible(y), + incy); + DEVICEBLAS_API_CHECK(status); + } + + void + BLASWrapper::xcopy( + const unsigned int n, + const double * x, + const unsigned int incx, + double * y, + const unsigned int incy) const + { + dftfe::utils::deviceBlasStatus_t status = + hipblasDcopy(d_deviceBlasHandle, n, x, incx, y, incy); + DEVICEBLAS_API_CHECK(status); + } + + void + BLASWrapper::xcopy( + const unsigned int n, + const float * x, + const unsigned int incx, + float * y, + const unsigned int incy) const + { + dftfe::utils::deviceBlasStatus_t status = + hipblasScopy(d_deviceBlasHandle, n, x, incx, y, incy); + DEVICEBLAS_API_CHECK(status); + } void BLASWrapper::xgemm( @@ -574,6 +635,25 @@ namespace dftfe DEVICEBLAS_API_CHECK(status); } + + template + void + BLASWrapper::axpby( + const unsigned int n, + const ValueType2 alpha, + const ValueType1 * x, + const ValueType2 beta, + ValueType1 * y) const + { + hipLaunchKernelGGL(axpbyDeviceKernel,n / dftfe::utils::DEVICE_BLOCK_SIZE) + + 1,dftfe::utils::DEVICE_BLOCK_SIZE,0,0,n, + dftfe::utils::makeDataTypeDeviceCompatible(x), + dftfe::utils::makeDataTypeDeviceCompatible(y), + alpha, + beta); + } + + template void BLASWrapper::axpyStridedBlockAtomicAdd( @@ -597,6 +677,33 @@ namespace dftfe addToVecStartingContiguousBlockIds); } + template + void + BLASWrapper::axpyStridedBlockAtomicAdd( + const dftfe::size_type contiguousBlockSize, + const dftfe::size_type numContiguousBlocks, + const ValueType1 a, + const ValueType1 * s, + const ValueType2 * addFromVec, + ValueType2 * addToVec, + const dftfe::global_size_type *addToVecStartingContiguousBlockIds) const + { + hipLaunchKernelGGL(axpyStridedBlockAtomicAddDeviceKernel, + (contiguousBlockSize * numContiguousBlocks) / + dftfe::utils::DEVICE_BLOCK_SIZE + + 1, + dftfe::utils::DEVICE_BLOCK_SIZE, + 0, + 0, + contiguousBlockSize, + numContiguousBlocks, + dftfe::utils::makeDataTypeDeviceCompatible(a), + dftfe::utils::makeDataTypeDeviceCompatible(s), + dftfe::utils::makeDataTypeDeviceCompatible(addFromVec), + dftfe::utils::makeDataTypeDeviceCompatible(addToVec), + addToVecStartingContiguousBlockIds); + } + void @@ -623,8 +730,9 @@ namespace dftfe const MPI_Comm & mpi_communicator, double * result) const { - double localResult = 0.0; - dftfe::utils::deviceBlasStatus_t status = hipblasDdot( + double localResult = 0.0; + *result = 0.0; + dftfe::utils::deviceBlasStatus_t status = hipblasDdot( d_deviceBlasHandle, int(N), X, int(INCX), Y, int(INCY), &localResult); DEVICEBLAS_API_CHECK(status); MPI_Allreduce( @@ -651,6 +759,35 @@ namespace dftfe DEVICEBLAS_API_CHECK(status); } + void + BLASWrapper::xdot( + const unsigned int N, + const std::complex *X, + const unsigned int INCX, + const std::complex *Y, + const unsigned int INCY, + const MPI_Comm & mpi_communicator, + std::complex * result) const + { + std::complex localResult = 0.0; + *result = 0.0; + dftfe::utils::deviceBlasStatus_t status = + hipblasZdotc(d_deviceBlasHandle, + int(N), + dftfe::utils::makeDataTypeHipBlasCompatible(X), + int(INCX), + dftfe::utils::makeDataTypeHipBlasCompatible(Y), + int(INCY), + dftfe::utils::makeDataTypeHipBlasCompatible(&localResult)); + DEVICEBLAS_API_CHECK(status); + MPI_Allreduce(&localResult, + result, + 1, + dataTypes::mpi_type_id(result), + MPI_SUM, + mpi_communicator); + } + void BLASWrapper::xgemmStridedBatched( const char transA, @@ -1147,7 +1284,8 @@ namespace dftfe const MPI_Comm & mpi_communicator, double * result) const { - double localresult = 0.0; + double localresult = 0.0; + *result = 0.0; dftfe::utils::deviceBlasStatus_t status = hipblasDznrm2(d_deviceBlasHandle, int(n), @@ -1168,12 +1306,14 @@ namespace dftfe const MPI_Comm & mpi_communicator, double * result) const { - double localresult = 0.0; + double localresult = 0.0; + *result = 0.0; dftfe::utils::deviceBlasStatus_t status = hipblasDnrm2(d_deviceBlasHandle, int(n), x, int(incx), &localresult); localresult *= localresult; MPI_Allreduce( &localresult, result, 1, MPI_DOUBLE, MPI_SUM, mpi_communicator); + *result = std::sqrt(*result); } template @@ -1275,6 +1415,52 @@ namespace dftfe copyFromVecStartingContiguousBlockIds); } + template + void + BLASWrapper::stridedBlockScaleCopy( + const dftfe::size_type contiguousBlockSize, + const dftfe::size_type numContiguousBlocks, + const ValueType1 a, + const ValueType1 * s, + const ValueType2 * copyFromVec, + ValueType2 * copyToVecBlock, + const dftfe::global_size_type *copyFromVecStartingContiguousBlockIds) + { + hipLaunchKernelGGL( + stridedCopyToBlockScaleDeviceKernel, + (contiguousBlockSize * numContiguousBlocks) / + dftfe::utils::DEVICE_BLOCK_SIZE + + 1, + dftfe::utils::DEVICE_BLOCK_SIZE, + 0, + 0, + contiguousBlockSize, + numContiguousBlocks, + dftfe::utils::makeDataTypeDeviceCompatible(a), + dftfe::utils::makeDataTypeDeviceCompatible(s), + dftfe::utils::makeDataTypeDeviceCompatible(copyFromVec), + dftfe::utils::makeDataTypeDeviceCompatible(copyToVecBlock), + copyFromVecStartingContiguousBlockIds); + } + template void + BLASWrapper::stridedBlockScaleCopy( + const dftfe::size_type contiguousBlockSize, + const dftfe::size_type numContiguousBlocks, + const double a, + const double * s, + const double * copyFromVec, + double * copyToVecBlock, + const dftfe::global_size_type *addToVecStartingContiguousBlockIds); + template void + BLASWrapper::stridedBlockScaleCopy( + const dftfe::size_type contiguousBlockSize, + const dftfe::size_type numContiguousBlocks, + const double a, + const double * s, + const std::complex * copyFromVec, + std::complex * copyToVecBlock, + const dftfe::global_size_type *addToVecStartingContiguousBlockIds); + void BLASWrapper::add( double * y, @@ -1516,6 +1702,12 @@ namespace dftfe const std::complex a, const dftfe::size_type n) const; + template void + BLASWrapper::xscal( + std::complex * x, + const double a, + const dftfe::size_type n) const; + template void BLASWrapper::stridedCopyToBlock( const dftfe::size_type contiguousBlockSize, @@ -1622,5 +1814,50 @@ namespace dftfe const std::complex * addFromVec, std::complex * addToVec, const dftfe::global_size_type *addToVecStartingContiguousBlockIds) const; + + template void + BLASWrapper::axpyStridedBlockAtomicAdd( + const dftfe::size_type contiguousBlockSize, + const dftfe::size_type numContiguousBlocks, + const double a, + const double * s, + const double * addFromVec, + double * addToVec, + const dftfe::global_size_type *addToVecStartingContiguousBlockIds) const; + + template void + BLASWrapper::axpyStridedBlockAtomicAdd( + const dftfe::size_type contiguousBlockSize, + const dftfe::size_type numContiguousBlocks, + const double a, + const double * s, + const std::complex * addFromVec, + std::complex * addToVec, + const dftfe::global_size_type *addToVecStartingContiguousBlockIds) const; + + + template void + BLASWrapper::axpby(const unsigned int n, + const double alpha, + const double *x, + const double beta, + double *y) const; + + + template void + BLASWrapper::axpby( + const unsigned int n, + const double alpha, + const std::complex *x, + const double beta, + std::complex * y) const; + + template void + BLASWrapper::copyRealArrsToComplexArr( + const dftfe::size_type size, + const double * realArr, + const double * imagArr, + std::complex * complexArr); + } // End of namespace linearAlgebra } // End of namespace dftfe diff --git a/utils/BLASWrapperDeviceKernels.cc b/utils/BLASWrapperDeviceKernels.cc index edad51bca..e0f1d6682 100644 --- a/utils/BLASWrapperDeviceKernels.cc +++ b/utils/BLASWrapperDeviceKernels.cc @@ -113,6 +113,39 @@ namespace dftfe } } + template + __global__ void + stridedCopyToBlockScaleDeviceKernel( + const dftfe::size_type contiguousBlockSize, + const dftfe::size_type numContiguousBlocks, + const ValueType1 a, + const ValueType1 * s, + const ValueType2 * copyFromVec, + ValueType2 * copyToVec, + const dftfe::global_size_type *copyFromVecStartingContiguousBlockIds) + { + const dftfe::size_type globalThreadId = + blockIdx.x * blockDim.x + threadIdx.x; + const dftfe::size_type numberEntries = + numContiguousBlocks * contiguousBlockSize; + + for (dftfe::size_type index = globalThreadId; index < numberEntries; + index += blockDim.x * gridDim.x) + { + dftfe::size_type blockIndex = index / contiguousBlockSize; + dftfe::size_type intraBlockIndex = + index - blockIndex * contiguousBlockSize; + const ValueType1 coeff = dftfe::utils::mult(a, s[blockIndex]); + dftfe::utils::copyValue( + copyToVec + index, + dftfe::utils::mult( + copyFromVec[copyFromVecStartingContiguousBlockIds[blockIndex] + + intraBlockIndex], + coeff)); + } + } + + template __global__ void stridedCopyFromBlockDeviceKernel( @@ -305,15 +338,93 @@ namespace dftfe } } + __global__ void + axpyStridedBlockAtomicAddDeviceKernel( + const dftfe::size_type contiguousBlockSize, + const dftfe::size_type numContiguousBlocks, + const double a, + const double * s, + const double * addFromVec, + double * addToVec, + const dftfe::global_size_type *addToVecStartingContiguousBlockIds) + { + const dftfe::size_type globalThreadId = + blockIdx.x * blockDim.x + threadIdx.x; + const dftfe::size_type numberEntries = + numContiguousBlocks * contiguousBlockSize; + + for (dftfe::size_type index = globalThreadId; index < numberEntries; + index += blockDim.x * gridDim.x) + { + dftfe::size_type blockIndex = index / contiguousBlockSize; + dftfe::size_type intraBlockIndex = index % contiguousBlockSize; + const double coeff = dftfe::utils::mult(a, s[blockIndex]); + atomicAdd(&addToVec[addToVecStartingContiguousBlockIds[blockIndex] + + intraBlockIndex], + dftfe::utils::mult(addFromVec[index], coeff)); + } + } __global__ void axpyStridedBlockAtomicAddDeviceKernel( const dftfe::size_type contiguousBlockSize, const dftfe::size_type numContiguousBlocks, + const double a, + const double * s, const dftfe::utils::deviceDoubleComplex *addFromVec, dftfe::utils::deviceDoubleComplex * addToVec, const dftfe::global_size_type *addToVecStartingContiguousBlockIds) - {} + { + const dftfe::size_type globalThreadId = + blockIdx.x * blockDim.x + threadIdx.x; + const dftfe::size_type numberEntries = + numContiguousBlocks * contiguousBlockSize; + + for (dftfe::size_type index = globalThreadId; index < numberEntries; + index += blockDim.x * gridDim.x) + { + dftfe::size_type blockIndex = index / contiguousBlockSize; + dftfe::size_type intraBlockIndex = index % contiguousBlockSize; + const double coeff = dftfe::utils::mult(a, s[blockIndex]); + atomicAdd(&(addToVec[addToVecStartingContiguousBlockIds[blockIndex] + + intraBlockIndex] + .x), + dftfe::utils::mult(addFromVec[index].x, coeff)); + atomicAdd(&(addToVec[addToVecStartingContiguousBlockIds[blockIndex] + + intraBlockIndex] + .y), + dftfe::utils::mult(addFromVec[index].y, coeff)); + } + } + + __global__ void + axpyStridedBlockAtomicAddDeviceKernel( + const dftfe::size_type contiguousBlockSize, + const dftfe::size_type numContiguousBlocks, + const dftfe::utils::deviceDoubleComplex *addFromVec, + dftfe::utils::deviceDoubleComplex * addToVec, + const dftfe::global_size_type *addToVecStartingContiguousBlockIds) + { + const dftfe::size_type globalThreadId = + blockIdx.x * blockDim.x + threadIdx.x; + const dftfe::size_type numberEntries = + numContiguousBlocks * contiguousBlockSize; + + for (dftfe::size_type index = globalThreadId; index < numberEntries; + index += blockDim.x * gridDim.x) + { + dftfe::size_type blockIndex = index / contiguousBlockSize; + dftfe::size_type intraBlockIndex = index % contiguousBlockSize; + atomicAdd(&addToVec[addToVecStartingContiguousBlockIds[blockIndex] + + intraBlockIndex] + .x, + addFromVec[index].x); + atomicAdd(&addToVec[addToVecStartingContiguousBlockIds[blockIndex] + + intraBlockIndex] + .y, + addFromVec[index].y); + } + } __global__ void diff --git a/utils/BLASWrapperHost.cc b/utils/BLASWrapperHost.cc index 86897ecd1..1bdd6bddb 100644 --- a/utils/BLASWrapperHost.cc +++ b/utils/BLASWrapperHost.cc @@ -72,7 +72,7 @@ namespace dftfe const ValueType2 alpha, const dftfe::size_type n) const { - std::transform(x, x + n, x, [&](auto &c) { return alpha * c; }); + std::transform(x, x + n, x, [&alpha](auto &c) { return alpha * c; }); } // for xscal template void @@ -93,6 +93,12 @@ namespace dftfe const std::complex a, const dftfe::size_type n) const; + template void + BLASWrapper::xscal( + std::complex * x, + const double a, + const dftfe::size_type n) const; + template void BLASWrapper::xscal( std::complex * x, @@ -110,17 +116,6 @@ namespace dftfe dcopy_(&n, x, &incx, y, &incy); } - void - BLASWrapper::xcopy( - const unsigned int n, - double * x, - const unsigned int incx, - float * y, - const unsigned int incy) const - { - std::memcpy(x, y, n * sizeof(x)); - } - void BLASWrapper::xcopy( const unsigned int n, @@ -253,17 +248,6 @@ namespace dftfe zcopy_(&n, x, &incx, y, &incy); } - void - BLASWrapper::xcopy( - const unsigned int n, - std::complex *x, - const unsigned int incx, - std::complex * y, - const unsigned int incy) const - { - std::memcpy(x, y, n * sizeof(x)); - } - void BLASWrapper::xcopy( const unsigned int n, @@ -284,9 +268,11 @@ namespace dftfe double * result) const { double localresult = dnrm2_(&n, x, &incx); + *result = 0.0; localresult *= localresult; MPI_Allreduce( &localresult, result, 1, MPI_DOUBLE, MPI_SUM, mpi_communicator); + *result = std::sqrt(*result); } void @@ -298,9 +284,11 @@ namespace dftfe double * result) const { double localresult = dznrm2_(&n, x, &incx); + *result = 0.0; localresult *= localresult; MPI_Allreduce( &localresult, result, 1, MPI_DOUBLE, MPI_SUM, mpi_communicator); + *result = std::sqrt(*result); } @@ -326,6 +314,53 @@ namespace dftfe *result = zdotc_(&N, X, &INCX, Y, &INCY); } + void + BLASWrapper::xdot( + const unsigned int N, + const double * X, + const unsigned int INCX, + const double * Y, + const unsigned int INCY, + const MPI_Comm & mpi_communicator, + double * result) const + { + double localResult = 0.0; + *result = 0.0; + localResult = ddot_(&N, X, &INCX, Y, &INCY); + MPI_Allreduce(&localResult, + result, + 1, + dataTypes::mpi_type_id(result), + MPI_SUM, + mpi_communicator); + } + void + BLASWrapper::xdot( + const unsigned int N, + const std::complex *X, + const unsigned int INCX, + const std::complex *Y, + const unsigned int INCY, + const MPI_Comm & mpi_communicator, + std::complex * result) const + { + std::complex localResult = 0.0; + *result = 0.0; + localResult = + std::inner_product(X, + X + N, + Y, + std::complex(0.0), + std::plus<>{}, + [](auto &a, auto &b) { return std::conj(a) * b; }); + MPI_Allreduce(&localResult, + result, + 1, + dataTypes::mpi_type_id(result), + MPI_SUM, + mpi_communicator); + } + void BLASWrapper::xaxpy( @@ -369,6 +404,41 @@ namespace dftfe std::plus<>{}); } + template + void + BLASWrapper::axpyStridedBlockAtomicAdd( + const dftfe::size_type contiguousBlockSize, + const dftfe::size_type numContiguousBlocks, + const ValueType1 a, + const ValueType1 * s, + const ValueType2 * addFromVec, + ValueType2 * addToVec, + const dftfe::global_size_type *addToVecStartingContiguousBlockIds) const + { + for (unsigned int iBlock = 0; iBlock < numContiguousBlocks; ++iBlock) + { + ValueType1 coeff = a * s[iBlock]; + std::transform(addFromVec + iBlock * contiguousBlockSize, + addFromVec + (iBlock + 1) * contiguousBlockSize, + addToVec + addToVecStartingContiguousBlockIds[iBlock], + addToVec + addToVecStartingContiguousBlockIds[iBlock], + [&coeff](auto &p, auto &q) { return p * coeff + q; }); + } + } + + template + void + BLASWrapper::axpby(const unsigned int n, + const ValueType2 alpha, + const ValueType1 *x, + const ValueType2 beta, + ValueType1 *y) const + { + std::transform(x, x + n, y, y, [&alpha, &beta](auto &p, auto &q) { + return alpha * p + beta * q; + }); + } + void BLASWrapper::xsymv( const char UPLO, @@ -536,7 +606,11 @@ namespace dftfe const ValueTypeReal * imagArr, ValueTypeComplex * complexArr) { - AssertThrow(false, dftUtils::ExcNotImplementedYet()); + std::transform(realArr, + realArr + size, + imagArr, + complexArr, + [](auto &a, auto &b) { return ValueTypeComplex(a, b); }); } template @@ -624,6 +698,48 @@ namespace dftfe { AssertThrow(false, dftUtils::ExcNotImplementedYet()); } + template + void + BLASWrapper::stridedBlockScaleCopy( + const dftfe::size_type contiguousBlockSize, + const dftfe::size_type numContiguousBlocks, + const ValueType1 a, + const ValueType1 * s, + const ValueType2 * copyFromVec, + ValueType2 * copyToVecBlock, + const dftfe::global_size_type *copyFromVecStartingContiguousBlockIds) + { + for (int iBatch = 0; iBatch < numContiguousBlocks; iBatch++) + { + ValueType1 alpha = a * s[iBatch]; + std::transform(copyFromVec + + copyFromVecStartingContiguousBlockIds[iBatch], + copyFromVec + + copyFromVecStartingContiguousBlockIds[iBatch] + + contiguousBlockSize, + copyToVecBlock + iBatch * contiguousBlockSize, + [&alpha](auto &a) { return alpha * a; }); + } + } + template void + BLASWrapper::stridedBlockScaleCopy( + const dftfe::size_type contiguousBlockSize, + const dftfe::size_type numContiguousBlocks, + const double a, + const double * s, + const double * copyFromVec, + double * copyToVecBlock, + const dftfe::global_size_type *addToVecStartingContiguousBlockIds); + template void + BLASWrapper::stridedBlockScaleCopy( + const dftfe::size_type contiguousBlockSize, + const dftfe::size_type numContiguousBlocks, + const double a, + const double * s, + const std::complex * copyFromVec, + std::complex * copyToVecBlock, + const dftfe::global_size_type *addToVecStartingContiguousBlockIds); + template void BLASWrapper::stridedBlockScale( @@ -801,5 +917,48 @@ namespace dftfe std::complex * addToVec, const dftfe::global_size_type *addToVecStartingContiguousBlockIds) const; + template void + BLASWrapper::axpyStridedBlockAtomicAdd( + const dftfe::size_type contiguousBlockSize, + const dftfe::size_type numContiguousBlocks, + const double a, + const double * s, + const double * addFromVec, + double * addToVec, + const dftfe::global_size_type *addToVecStartingContiguousBlockIds) const; + + template void + BLASWrapper::axpyStridedBlockAtomicAdd( + const dftfe::size_type contiguousBlockSize, + const dftfe::size_type numContiguousBlocks, + const double a, + const double * s, + const std::complex * addFromVec, + std::complex * addToVec, + const dftfe::global_size_type *addToVecStartingContiguousBlockIds) const; + + template void + BLASWrapper::axpby(const unsigned int n, + const double alpha, + const double *x, + const double beta, + double * y) const; + + + template void + BLASWrapper::axpby( + const unsigned int n, + const double alpha, + const std::complex *x, + const double beta, + std::complex * y) const; + + template void + BLASWrapper::copyRealArrsToComplexArr( + const dftfe::size_type size, + const double * realArr, + const double * imagArr, + std::complex * complexArr); + } // End of namespace linearAlgebra } // End of namespace dftfe From 43061d76203df88a16c5486d737e4d5a6afa3112 Mon Sep 17 00:00:00 2001 From: Kartick Ramakrishnan Date: Mon, 11 Mar 2024 11:53:23 +0530 Subject: [PATCH 2/2] Adding GPU poisson Ctests and changes --- .../real/accuracyBenchmarks/outputMg2x_15 | 103 ++++++++++++++++++ .../real/accuracyBenchmarks/outputMg2x_16 | 68 ++++++++++++ .../real/accuracyBenchmarks/outputN2_5 | 44 ++++++++ .../real/jobscripts/crusher6GPUsMPS1.slurm | 3 + .../frontierJobScript6GCDs6MPITasks.rc | 3 + .../jobscripts/matrixlabgpu18Tasks6GPUs.slurm | 3 + .../jobscripts/perlmutter6Tasks6GPUs.slurm | 3 + .../real/jobscripts/summit.lsf | 3 + .../real/parameterFileMg2x_15.prm | 70 ++++++++++++ .../real/parameterFileMg2x_16.prm | 79 ++++++++++++++ .../real/parameterFileN2_5.prm | 74 +++++++++++++ 11 files changed, 453 insertions(+) create mode 100644 testsGPU/pseudopotential/real/accuracyBenchmarks/outputMg2x_15 create mode 100644 testsGPU/pseudopotential/real/accuracyBenchmarks/outputMg2x_16 create mode 100644 testsGPU/pseudopotential/real/accuracyBenchmarks/outputN2_5 create mode 100644 testsGPU/pseudopotential/real/parameterFileMg2x_15.prm create mode 100644 testsGPU/pseudopotential/real/parameterFileMg2x_16.prm create mode 100644 testsGPU/pseudopotential/real/parameterFileN2_5.prm diff --git a/testsGPU/pseudopotential/real/accuracyBenchmarks/outputMg2x_15 b/testsGPU/pseudopotential/real/accuracyBenchmarks/outputMg2x_15 new file mode 100644 index 000000000..659697bde --- /dev/null +++ b/testsGPU/pseudopotential/real/accuracyBenchmarks/outputMg2x_15 @@ -0,0 +1,103 @@ +number of atoms: 31 +number of atoms types: 1 +Z:12 +============================================================================================================================= +number of electrons: 310 +number of eigen values: 180 +============================================================================================================================= +-----------Simulation Domain bounding vectors (lattice vectors in fully periodic case)------------- +v1 : 1.176399999999999935e+01 0.000000000000000000e+00 0.000000000000000000e+00 +v2 : 0.000000000000000000e+00 1.917200000000000060e+01 0.000000000000000000e+00 +v3 : 0.000000000000000000e+00 0.000000000000000000e+00 2.037584570023999930e+01 +----------------------------------------------------------------------------------------- +-----Fractional coordinates of atoms------ +AtomId 0: 2.500000000000000000e-01 2.500000000000000000e-01 4.166666666669999741e-01 +AtomId 1: 2.500000000000000000e-01 0.000000000000000000e+00 2.500000000000000000e-01 +AtomId 2: 0.000000000000000000e+00 2.500000000000000000e-01 1.666666666670000019e-01 +AtomId 3: 0.000000000000000000e+00 0.000000000000000000e+00 5.000000000000000000e-01 +AtomId 4: 2.500000000000000000e-01 2.500000000000000000e-01 9.166666666670000296e-01 +AtomId 5: 2.500000000000000000e-01 0.000000000000000000e+00 7.500000000000000000e-01 +AtomId 6: 0.000000000000000000e+00 2.500000000000000000e-01 6.666666666670000296e-01 +AtomId 7: 0.000000000000000000e+00 5.000000000000000000e-01 0.000000000000000000e+00 +AtomId 8: 2.500000000000000000e-01 7.500000000000000000e-01 4.166666666669999741e-01 +AtomId 9: 2.500000000000000000e-01 5.000000000000000000e-01 2.500000000000000000e-01 +AtomId 10: 0.000000000000000000e+00 7.500000000000000000e-01 1.666666666670000019e-01 +AtomId 11: 0.000000000000000000e+00 5.000000000000000000e-01 5.000000000000000000e-01 +AtomId 12: 2.500000000000000000e-01 7.500000000000000000e-01 9.166666666670000296e-01 +AtomId 13: 2.500000000000000000e-01 5.000000000000000000e-01 7.500000000000000000e-01 +AtomId 14: 0.000000000000000000e+00 7.500000000000000000e-01 6.666666666670000296e-01 +AtomId 15: 5.000000000000000000e-01 0.000000000000000000e+00 0.000000000000000000e+00 +AtomId 16: 7.500000000000000000e-01 2.500000000000000000e-01 4.166666666669999741e-01 +AtomId 17: 7.500000000000000000e-01 0.000000000000000000e+00 2.500000000000000000e-01 +AtomId 18: 5.000000000000000000e-01 2.500000000000000000e-01 1.666666666670000019e-01 +AtomId 19: 5.000000000000000000e-01 0.000000000000000000e+00 5.000000000000000000e-01 +AtomId 20: 7.500000000000000000e-01 2.500000000000000000e-01 9.166666666670000296e-01 +AtomId 21: 7.500000000000000000e-01 0.000000000000000000e+00 7.500000000000000000e-01 +AtomId 22: 5.000000000000000000e-01 2.500000000000000000e-01 6.666666666670000296e-01 +AtomId 23: 5.000000000000000000e-01 5.000000000000000000e-01 0.000000000000000000e+00 +AtomId 24: 7.500000000000000000e-01 7.500000000000000000e-01 4.166666666669999741e-01 +AtomId 25: 7.500000000000000000e-01 5.000000000000000000e-01 2.500000000000000000e-01 +AtomId 26: 5.000000000000000000e-01 7.500000000000000000e-01 1.666666666670000019e-01 +AtomId 27: 5.000000000000000000e-01 5.000000000000000000e-01 5.000000000000000000e-01 +AtomId 28: 7.500000000000000000e-01 7.500000000000000000e-01 9.166666666670000296e-01 +AtomId 29: 7.500000000000000000e-01 5.000000000000000000e-01 7.500000000000000000e-01 +AtomId 30: 5.000000000000000000e-01 7.500000000000000000e-01 6.666666666670000296e-01 +----------------------------------------------------------------------------------------- +Number Image Charges 2094 + +Finite element mesh information +------------------------------------------------- +FE interpolating polynomial order for Kohn-Sham eigenvalue problem: 3 +FE interpolating polynomial order for electrostatics solve: 3 +FE interpolating polynomial order for nodal electron density computation: 5 +number of elements: 1440 +number of degrees of freedom for the Kohn-Sham eigenvalue problem : 52791 +------------------------------------------------- + +Setting initial guess for wavefunctions.... + +Reading initial guess for electron-density..... + +Pseudopotential initalization.... + +Starting SCF iterations.... +SCF iterations converged to the specified tolerance after: 15 iterations. + +Energy computations (Hartree) +------------------- + Total energy: -1673.62180253 + +Absolute values of ion forces (Hartree/Bohr) +-------------------------------------------------------------------------------------------- +AtomId 0: 0.001805,0.001878,0.013126 +AtomId 1: 0.002319,0.000000,0.003373 +AtomId 2: 0.000233,0.001707,0.140373 +AtomId 3: 0.000194,0.000000,0.004429 +AtomId 4: 0.001175,0.001333,0.013673 +AtomId 5: 0.001964,0.000000,0.000305 +AtomId 6: 0.000233,0.000011,0.138744 +AtomId 7: 0.000191,0.000000,0.000954 +AtomId 8: 0.001805,0.001877,0.013126 +AtomId 9: 0.003732,0.000000,0.001505 +AtomId 10: 0.000233,0.001707,0.140373 +AtomId 11: 0.000192,0.000000,0.000588 +AtomId 12: 0.001175,0.001333,0.013673 +AtomId 13: 0.000157,0.000000,0.001220 +AtomId 14: 0.000233,0.000011,0.138744 +AtomId 15: 0.000004,0.000000,0.001199 +AtomId 16: 0.001805,0.001878,0.013126 +AtomId 17: 0.002319,0.000000,0.003373 +AtomId 18: 0.000000,0.001220,0.140761 +AtomId 19: 0.000096,0.000000,0.000014 +AtomId 20: 0.001175,0.001333,0.013673 +AtomId 21: 0.001964,0.000000,0.000305 +AtomId 22: 0.000000,0.002938,0.140140 +AtomId 23: 0.000093,0.000000,0.004499 +AtomId 24: 0.001805,0.001877,0.013126 +AtomId 25: 0.003732,0.000000,0.001505 +AtomId 26: 0.000000,0.001220,0.140761 +AtomId 27: 0.000000,0.000000,0.000041 +AtomId 28: 0.001175,0.001333,0.013673 +AtomId 29: 0.000157,0.000000,0.001220 +AtomId 30: 0.000000,0.002938,0.140140 +-------------------------------------------------------------------------------------------- diff --git a/testsGPU/pseudopotential/real/accuracyBenchmarks/outputMg2x_16 b/testsGPU/pseudopotential/real/accuracyBenchmarks/outputMg2x_16 new file mode 100644 index 000000000..b7e5302cd --- /dev/null +++ b/testsGPU/pseudopotential/real/accuracyBenchmarks/outputMg2x_16 @@ -0,0 +1,68 @@ +number of atoms: 31 +number of atoms types: 1 +Z:12 +============================================================================================================================= +number of electrons: 310 +number of eigen values: 180 +============================================================================================================================= +-----------Simulation Domain bounding vectors (lattice vectors in fully periodic case)------------- +v1 : 1.176399999999999935e+01 0.000000000000000000e+00 0.000000000000000000e+00 +v2 : 0.000000000000000000e+00 1.917200000000000060e+01 0.000000000000000000e+00 +v3 : 0.000000000000000000e+00 0.000000000000000000e+00 2.037584570023999930e+01 +----------------------------------------------------------------------------------------- +-----Fractional coordinates of atoms------ +AtomId 0: 2.500000000000000000e-01 2.500000000000000000e-01 4.166666666669999741e-01 +AtomId 1: 2.500000000000000000e-01 0.000000000000000000e+00 2.500000000000000000e-01 +AtomId 2: 0.000000000000000000e+00 2.500000000000000000e-01 1.666666666670000019e-01 +AtomId 3: 0.000000000000000000e+00 0.000000000000000000e+00 5.000000000000000000e-01 +AtomId 4: 2.500000000000000000e-01 2.500000000000000000e-01 9.166666666670000296e-01 +AtomId 5: 2.500000000000000000e-01 0.000000000000000000e+00 7.500000000000000000e-01 +AtomId 6: 0.000000000000000000e+00 2.500000000000000000e-01 6.666666666670000296e-01 +AtomId 7: 0.000000000000000000e+00 5.000000000000000000e-01 0.000000000000000000e+00 +AtomId 8: 2.500000000000000000e-01 7.500000000000000000e-01 4.166666666669999741e-01 +AtomId 9: 2.500000000000000000e-01 5.000000000000000000e-01 2.500000000000000000e-01 +AtomId 10: 0.000000000000000000e+00 7.500000000000000000e-01 1.666666666670000019e-01 +AtomId 11: 0.000000000000000000e+00 5.000000000000000000e-01 5.000000000000000000e-01 +AtomId 12: 2.500000000000000000e-01 7.500000000000000000e-01 9.166666666670000296e-01 +AtomId 13: 2.500000000000000000e-01 5.000000000000000000e-01 7.500000000000000000e-01 +AtomId 14: 0.000000000000000000e+00 7.500000000000000000e-01 6.666666666670000296e-01 +AtomId 15: 5.000000000000000000e-01 0.000000000000000000e+00 0.000000000000000000e+00 +AtomId 16: 7.500000000000000000e-01 2.500000000000000000e-01 4.166666666669999741e-01 +AtomId 17: 7.500000000000000000e-01 0.000000000000000000e+00 2.500000000000000000e-01 +AtomId 18: 5.000000000000000000e-01 2.500000000000000000e-01 1.666666666670000019e-01 +AtomId 19: 5.000000000000000000e-01 0.000000000000000000e+00 5.000000000000000000e-01 +AtomId 20: 7.500000000000000000e-01 2.500000000000000000e-01 9.166666666670000296e-01 +AtomId 21: 7.500000000000000000e-01 0.000000000000000000e+00 7.500000000000000000e-01 +AtomId 22: 5.000000000000000000e-01 2.500000000000000000e-01 6.666666666670000296e-01 +AtomId 23: 5.000000000000000000e-01 5.000000000000000000e-01 0.000000000000000000e+00 +AtomId 24: 7.500000000000000000e-01 7.500000000000000000e-01 4.166666666669999741e-01 +AtomId 25: 7.500000000000000000e-01 5.000000000000000000e-01 2.500000000000000000e-01 +AtomId 26: 5.000000000000000000e-01 7.500000000000000000e-01 1.666666666670000019e-01 +AtomId 27: 5.000000000000000000e-01 5.000000000000000000e-01 5.000000000000000000e-01 +AtomId 28: 7.500000000000000000e-01 7.500000000000000000e-01 9.166666666670000296e-01 +AtomId 29: 7.500000000000000000e-01 5.000000000000000000e-01 7.500000000000000000e-01 +AtomId 30: 5.000000000000000000e-01 7.500000000000000000e-01 6.666666666670000296e-01 +----------------------------------------------------------------------------------------- +Number Image Charges 2094 + +Finite element mesh information +------------------------------------------------- +FE interpolating polynomial order for Kohn-Sham eigenvalue problem: 3 +FE interpolating polynomial order for electrostatics solve: 3 +FE interpolating polynomial order for nodal electron density computation: 5 +number of elements: 1440 +number of degrees of freedom for the Kohn-Sham eigenvalue problem : 52791 +------------------------------------------------- + +Setting initial guess for wavefunctions.... + +Reading initial guess for electron-density..... + +Pseudopotential initalization.... + +Starting SCF iterations.... +SCF iterations converged to the specified tolerance after: 26 iterations. + +Energy computations (Hartree) +------------------- + Total energy: -1666.902177 diff --git a/testsGPU/pseudopotential/real/accuracyBenchmarks/outputN2_5 b/testsGPU/pseudopotential/real/accuracyBenchmarks/outputN2_5 new file mode 100644 index 000000000..50870a6bb --- /dev/null +++ b/testsGPU/pseudopotential/real/accuracyBenchmarks/outputN2_5 @@ -0,0 +1,44 @@ +number of atoms: 2 +number of atoms types: 1 +Z:7 +============================================================================================================================= +number of electrons: 10 +number of eigen values: 8 +============================================================================================================================= +-----------Simulation Domain bounding vectors (lattice vectors in fully periodic case)------------- +v1 : 2.000000000000000000e+01 0.000000000000000000e+00 0.000000000000000000e+00 +v2 : 0.000000000000000000e+00 2.000000000000000000e+01 0.000000000000000000e+00 +v3 : 0.000000000000000000e+00 0.000000000000000000e+00 2.000000000000000000e+01 +----------------------------------------------------------------------------------------- +------------Cartesian coordinates of atoms (origin at center of domain)------------------ +AtomId 0: -1.300000000000000044e+00 0.000000000000000000e+00 0.000000000000000000e+00 +AtomId 1: 1.300000000000000044e+00 0.000000000000000000e+00 0.000000000000000000e+00 +----------------------------------------------------------------------------------------- + +Finite element mesh information +------------------------------------------------- +FE interpolating polynomial order for Kohn-Sham eigenvalue problem: 2 +FE interpolating polynomial order for electrostatics solve: 2 +FE interpolating polynomial order for nodal electron density computation: 4 +number of elements: 400 +number of degrees of freedom for the Kohn-Sham eigenvalue problem : 4529 +------------------------------------------------- + +Setting initial guess for wavefunctions.... + +Reading initial guess for electron-density..... + +Pseudopotential initalization.... + +Starting SCF iterations.... +SCF iterations converged to the specified tolerance after: 9 iterations. + +Energy computations (Hartree) +------------------- + Total energy: -19.66693754 + +Absolute values of ion forces (Hartree/Bohr) +-------------------------------------------------------------------------------------------- +AtomId 0: 0.323342,0.000000,0.000000 +AtomId 1: 0.323342,0.000000,0.000000 +-------------------------------------------------------------------------------------------- diff --git a/testsGPU/pseudopotential/real/jobscripts/crusher6GPUsMPS1.slurm b/testsGPU/pseudopotential/real/jobscripts/crusher6GPUsMPS1.slurm index ffc26c5b9..2765e4feb 100644 --- a/testsGPU/pseudopotential/real/jobscripts/crusher6GPUsMPS1.slurm +++ b/testsGPU/pseudopotential/real/jobscripts/crusher6GPUsMPS1.slurm @@ -18,9 +18,12 @@ srun -n 6 -c 1 ./dftfe parameterFileN2_1.prm > outputN2_1 srun -n 6 -c 1 ./dftfe parameterFileN2_2.prm > outputN2_2 srun -n 6 -c 1 ./dftfe parameterFileN2_3.prm > outputN2_3 srun -n 6 -c 1 ./dftfe parameterFileN2_4.prm > outputN2_4 +srun -n 6 -c 1 ./dftfe parameterFileN2_5.prm > outputN2_5 srun -n 6 -c 1 ./dftfe parameterFileMg2x_8.prm > outputMg2x_8 srun -n 6 -c 1 ./dftfe parameterFileMg2x_9.prm > outputMg2x_9 srun -n 6 -c 1 ./dftfe parameterFileMg2x_10.prm > outputMg2x_10 srun -n 6 -c 1 ./dftfe parameterFileMg2x_11.prm > outputMg2x_11 srun -n 6 -c 1 ./dftfe parameterFileMg2x_14.prm > outputMg2x_14 +srun -n 6 -c 1 ./dftfe parameterFileMg2x_15.prm > outputMg2x_15 +srun -n 6 -c 1 ./dftfe parameterFileMg2x_16.prm > outputMg2x_16 diff --git a/testsGPU/pseudopotential/real/jobscripts/frontierJobScript6GCDs6MPITasks.rc b/testsGPU/pseudopotential/real/jobscripts/frontierJobScript6GCDs6MPITasks.rc index 33d1e53b4..46948cdea 100644 --- a/testsGPU/pseudopotential/real/jobscripts/frontierJobScript6GCDs6MPITasks.rc +++ b/testsGPU/pseudopotential/real/jobscripts/frontierJobScript6GCDs6MPITasks.rc @@ -27,8 +27,11 @@ srun -n 6 -c 7 --gpu-bind closest $BASE/dftfe parameterFileN2_1.prm > outputN2_1 srun -n 6 -c 7 --gpu-bind closest $BASE/dftfe parameterFileN2_2.prm > outputN2_2 srun -n 6 -c 7 --gpu-bind closest $BASE/dftfe parameterFileN2_3.prm > outputN2_3 srun -n 6 -c 7 --gpu-bind closest $BASE/dftfe parameterFileN2_4.prm > outputN2_4 +srun -n 6 -c 7 --gpu-bind closest $BASE/dftfe parameterFileN2_5.prm > outputN2_5 srun -n 6 -c 7 --gpu-bind closest $BASE/dftfe parameterFileMg2x_8.prm > outputMg2x_8 srun -n 6 -c 7 --gpu-bind closest $BASE/dftfe parameterFileMg2x_9.prm > outputMg2x_9 srun -n 6 -c 7 --gpu-bind closest $BASE/dftfe parameterFileMg2x_10.prm > outputMg2x_10 srun -n 6 -c 7 --gpu-bind closest $BASE/dftfe parameterFileMg2x_11.prm > outputMg2x_11 srun -n 6 -c 7 --gpu-bind closest $BASE/dftfe parameterFileMg2x_14.prm > outputMg2x_14 +srun -n 6 -c 7 --gpu-bind closest $BASE/dftfe parameterFileMg2x_15.prm > outputMg2x_15 +srun -n 6 -c 7 --gpu-bind closest $BASE/dftfe parameterFileMg2x_16.prm > outputMg2x_16 diff --git a/testsGPU/pseudopotential/real/jobscripts/matrixlabgpu18Tasks6GPUs.slurm b/testsGPU/pseudopotential/real/jobscripts/matrixlabgpu18Tasks6GPUs.slurm index bd78760be..537d71c34 100644 --- a/testsGPU/pseudopotential/real/jobscripts/matrixlabgpu18Tasks6GPUs.slurm +++ b/testsGPU/pseudopotential/real/jobscripts/matrixlabgpu18Tasks6GPUs.slurm @@ -34,9 +34,12 @@ srun -n 6 --mpi=pmi2 ./dftfe parameterFileN2_1.prm > outputN2_1 srun -n 6 --mpi=pmi2 ./dftfe parameterFileN2_2.prm > outputN2_2 srun -n 6 --mpi=pmi2 ./dftfe parameterFileN2_3.prm > outputN2_3 srun -n 6 --mpi=pmi2 ./dftfe parameterFileN2_4.prm > outputN2_4 +srun -n 6 --mpi=pmi2 ./dftfe parameterFileN2_5.prm > outputN2_5 srun -n 6 --mpi=pmi2 ./dftfe parameterFileMg2x_8.prm > outputMg2x_8 srun -n 6 --mpi=pmi2 ./dftfe parameterFileMg2x_9.prm > outputMg2x_9 srun -n 6 --mpi=pmi2 ./dftfe parameterFileMg2x_10.prm > outputMg2x_10 srun -n 6 --mpi=pmi2 ./dftfe parameterFileMg2x_11.prm > outputMg2x_11 srun -n 6 --mpi=pmi2 ./dftfe parameterFileMg2x_14.prm > outputMg2x_14 +srun -n 6 --mpi=pmi2 ./dftfe parameterFileMg2x_15.prm > outputMg2x_15 +srun -n 6 --mpi=pmi2 ./dftfe parameterFileMg2x_16.prm > outputMg2x_16 diff --git a/testsGPU/pseudopotential/real/jobscripts/perlmutter6Tasks6GPUs.slurm b/testsGPU/pseudopotential/real/jobscripts/perlmutter6Tasks6GPUs.slurm index 3edd79114..fc7b39268 100644 --- a/testsGPU/pseudopotential/real/jobscripts/perlmutter6Tasks6GPUs.slurm +++ b/testsGPU/pseudopotential/real/jobscripts/perlmutter6Tasks6GPUs.slurm @@ -21,8 +21,11 @@ srun ./dftfe parameterFileN2_1.prm > outputN2_1 srun ./dftfe parameterFileN2_2.prm > outputN2_2 srun ./dftfe parameterFileN2_3.prm > outputN2_3 srun ./dftfe parameterFileN2_4.prm > outputN2_4 +srun ./dftfe parameterFileN2_5.prm > outputN2_5 srun ./dftfe parameterFileMg2x_8.prm > outputMg2x_8 srun ./dftfe parameterFileMg2x_9.prm > outputMg2x_9 srun ./dftfe parameterFileMg2x_10.prm > outputMg2x_10 srun ./dftfe parameterFileMg2x_11.prm > outputMg2x_11 srun ./dftfe parameterFileMg2x_14.prm > outputMg2x_14 +srun ./dftfe parameterFileMg2x_15.prm > outputMg2x_15 +srun ./dftfe parameterFileMg2x_16.prm > outputMg2x_16 diff --git a/testsGPU/pseudopotential/real/jobscripts/summit.lsf b/testsGPU/pseudopotential/real/jobscripts/summit.lsf index e2a15c837..298de7769 100644 --- a/testsGPU/pseudopotential/real/jobscripts/summit.lsf +++ b/testsGPU/pseudopotential/real/jobscripts/summit.lsf @@ -16,11 +16,14 @@ jsrun --smpiargs "-gpu" -n 6 -a 1 -c 7 -g 1 -r 6 -d packed -b packed:7 ./dftfe p jsrun --smpiargs "-gpu" -n 6 -a 1 -c 7 -g 1 -r 6 -d packed -b packed:7 ./dftfe parameterFileN2_2.prm > outputN2_2 jsrun --smpiargs "-gpu" -n 6 -a 1 -c 7 -g 1 -r 6 -d packed -b packed:7 ./dftfe parameterFileN2_3.prm > outputN2_3 jsrun --smpiargs "-gpu" -n 6 -a 1 -c 7 -g 1 -r 6 -d packed -b packed:7 ./dftfe parameterFileN2_4.prm > outputN2_4 +jsrun --smpiargs "-gpu" -n 6 -a 1 -c 7 -g 1 -r 6 -d packed -b packed:7 ./dftfe parameterFileN2_5.prm > outputN2_5 jsrun --smpiargs "-gpu" -n 6 -a 1 -c 7 -g 1 -r 6 -d packed -b packed:7 ./dftfe parameterFileMg2x_8.prm > outputMg2x_8 jsrun --smpiargs "-gpu" -n 6 -a 1 -c 7 -g 1 -r 6 -d packed -b packed:7 ./dftfe parameterFileMg2x_9.prm > outputMg2x_9 jsrun --smpiargs "-gpu" -n 6 -a 1 -c 7 -g 1 -r 6 -d packed -b packed:7 ./dftfe parameterFileMg2x_10.prm > outputMg2x_10 jsrun --smpiargs "-gpu" -n 6 -a 1 -c 7 -g 1 -r 6 -d packed -b packed:7 ./dftfe parameterFileMg2x_11.prm > outputMg2x_11 jsrun --smpiargs "-gpu" -n 6 -a 1 -c 7 -g 1 -r 6 -d packed -b packed:7 ./dftfe parameterFileMg2x_14.prm > outputMg2x_14 +jsrun --smpiargs "-gpu" -n 6 -a 1 -c 7 -g 1 -r 6 -d packed -b packed:7 ./dftfe parameterFileMg2x_15.prm > outputMg2x_15 +jsrun --smpiargs "-gpu" -n 6 -a 1 -c 7 -g 1 -r 6 -d packed -b packed:7 ./dftfe parameterFileMg2x_16.prm > outputMg2x_16 jsrun --smpiargs "-gpu" -n 6 -a 3 -c 7 -g 1 -r 6 -d packed -b packed:2 ./dftfe parameterFileMg2x_1.prm > outputMg2x_1 jsrun --smpiargs "-gpu" -n 6 -a 3 -c 7 -g 1 -r 6 -d packed -b packed:2 ./dftfe parameterFileMg2x_1_spingpu.prm > outputMg2x_1_spin_gpu diff --git a/testsGPU/pseudopotential/real/parameterFileMg2x_15.prm b/testsGPU/pseudopotential/real/parameterFileMg2x_15.prm new file mode 100644 index 000000000..a9f907c12 --- /dev/null +++ b/testsGPU/pseudopotential/real/parameterFileMg2x_15.prm @@ -0,0 +1,70 @@ +set VERBOSITY = 0 +set REPRODUCIBLE OUTPUT=true + +subsection GPU + set USE GPU=true + set AUTO GPU BLOCK SIZES=false +end + +subsection Boundary conditions + set PERIODIC1 = true + set PERIODIC2 = true + set PERIODIC3 = true +end + + +subsection DFT functional parameters + set EXCHANGE CORRELATION TYPE = 4 + set PSEUDOPOTENTIAL CALCULATION = true + set PSEUDOPOTENTIAL FILE NAMES LIST = pseudoMg.inp +end + + +subsection Finite element mesh parameters + set POLYNOMIAL ORDER = 3 + subsection Auto mesh generation parameters + set AUTO ADAPT BASE MESH SIZE=false + set ATOM BALL RADIUS = 2.0 + set BASE MESH SIZE = 4.0 + set MESH SIZE AROUND ATOM = 1.0 + set MESH SIZE AT ATOM = 1.0 + end +end + +subsection Geometry + set NATOMS=31 + set NATOM TYPES=1 + set ATOMIC COORDINATES FILE = coordinatesMg2x.inp + set DOMAIN VECTORS FILE = domainVectorsMg2x.inp + subsection Optimization + set ION FORCE = true + end +end + +subsection SCF parameters + set COMPUTE ENERGY EACH ITER = false + set MIXING PARAMETER =0.7 + set MIXING METHOD=LOW_RANK_DIELECM_PRECOND + set MAXIMUM ITERATIONS = 100 + set TEMPERATURE = 500 + set TOLERANCE = 1e-6 + set STARTING WFC=ATOMIC + subsection LOW RANK DIELECM PRECOND + set METHOD SUB TYPE=ADAPTIVE + set STARTING NORM LARGE DAMPING=2.0 + set ADAPTIVE RANK REL TOL=0.1 + set POISSON SOLVER ABS TOL=1.0e-10 + end + subsection Eigen-solver parameters + set CHEBYSHEV POLYNOMIAL DEGREE = 20 + set NUMBER OF KOHN-SHAM WAVEFUNCTIONS = 180 + set CHEBY WFC BLOCK SIZE=45 + set WFC BLOCK SIZE=45 + set CHEBYSHEV FILTER TOLERANCE=1e-5 + end +end +subsection Poisson problem parameters + set GPU MODE = true + set MAXIMUM ITERATIONS = 20000 + set TOLERANCE = 1e-10 +end diff --git a/testsGPU/pseudopotential/real/parameterFileMg2x_16.prm b/testsGPU/pseudopotential/real/parameterFileMg2x_16.prm new file mode 100644 index 000000000..bfcffcc80 --- /dev/null +++ b/testsGPU/pseudopotential/real/parameterFileMg2x_16.prm @@ -0,0 +1,79 @@ +set VERBOSITY = 0 +set REPRODUCIBLE OUTPUT=true + +subsection GPU + set USE GPU=true + set AUTO GPU BLOCK SIZES=false + set USE GPUDIRECT MPI ALL REDUCE = true +end + +subsection Boundary conditions + set SMEARED NUCLEAR CHARGES=false + set FLOATING NUCLEAR CHARGES=false + set CONSTRAINTS FROM SERIAL DOFHANDLER = false + set CONSTRAINTS PARALLEL CHECK = false + set PERIODIC1 = true + set PERIODIC2 = true + set PERIODIC3 = true + set SELF POTENTIAL RADIUS = 3.0 + set POINT WISE DIRICHLET CONSTRAINT =true +end + + +subsection DFT functional parameters + set EXCHANGE CORRELATION TYPE = 4 + set PSEUDOPOTENTIAL CALCULATION = true + set PSEUDOPOTENTIAL FILE NAMES LIST = pseudoMg.inp +end + + +subsection Finite element mesh parameters + set POLYNOMIAL ORDER = 3 + subsection Auto mesh generation parameters + set AUTO ADAPT BASE MESH SIZE=false + set ATOM BALL RADIUS = 2.0 + set BASE MESH SIZE = 4.0 + set MESH SIZE AROUND ATOM = 1.0 + set MESH SIZE AT ATOM = 1.0 + end +end + +subsection Geometry + set NATOMS=31 + set NATOM TYPES=1 + set ATOMIC COORDINATES FILE = coordinatesMg2x.inp + set DOMAIN VECTORS FILE = domainVectorsMg2x.inp + subsection Optimization + set ION FORCE = false + end +end + +subsection Parallelization + set NPBAND=2 +end + +subsection SCF parameters + set COMPUTE ENERGY EACH ITER = false + set MAXIMUM ITERATIONS = 100 + set TEMPERATURE = 500 + set TOLERANCE = 1e-6 + set STARTING WFC=ATOMIC + subsection Eigen-solver parameters + set CHEBYSHEV POLYNOMIAL DEGREE = 20 + set NUMBER OF KOHN-SHAM WAVEFUNCTIONS = 180 + set SPECTRUM SPLIT CORE EIGENSTATES=90 + set CHEBY WFC BLOCK SIZE=45 + set WFC BLOCK SIZE=45 + set CHEBYSHEV FILTER TOLERANCE=1e-5 + set OVERLAP COMPUTE COMMUN ORTHO RR=true + set USE MIXED PREC XTHX SPECTRUM SPLIT=true + set USE MIXED PREC CGS SR=true + set USE MIXED PREC CGS O=false + end +end +subsection Poisson problem parameters + set GPU MODE = true + set MAXIMUM ITERATIONS = 20000 + set TOLERANCE = 1e-10 +end + diff --git a/testsGPU/pseudopotential/real/parameterFileN2_5.prm b/testsGPU/pseudopotential/real/parameterFileN2_5.prm new file mode 100644 index 000000000..68aea0825 --- /dev/null +++ b/testsGPU/pseudopotential/real/parameterFileN2_5.prm @@ -0,0 +1,74 @@ +set VERBOSITY=0 +set REPRODUCIBLE OUTPUT=true + +subsection GPU + set USE GPU =true + set AUTO GPU BLOCK SIZES=false +end +subsection Geometry + set NATOMS=2 + set NATOM TYPES=1 + set ATOMIC COORDINATES FILE = coordinates.inp + set DOMAIN VECTORS FILE = domainVectors.inp + subsection Optimization + set ION FORCE = true + end +end + +subsection Boundary conditions + set SMEARED NUCLEAR CHARGES=false + set FLOATING NUCLEAR CHARGES=false + set PERIODIC1 = false + set PERIODIC2 = false + set PERIODIC3 = false + set CONSTRAINTS FROM SERIAL DOFHANDLER = false + set CONSTRAINTS PARALLEL CHECK = false + set SELF POTENTIAL RADIUS=4.0 +end + +subsection Finite element mesh parameters + set POLYNOMIAL ORDER=2 + subsection Auto mesh generation parameters + set AUTO ADAPT BASE MESH SIZE=false + set MESH SIZE AROUND ATOM = 0.7 + set ATOM BALL RADIUS=2.0 + set BASE MESH SIZE = 10.0 + end +end + +subsection DFT functional parameters + set EXCHANGE CORRELATION TYPE = 4 + set PSEUDOPOTENTIAL CALCULATION = true + set PSEUDOPOTENTIAL FILE NAMES LIST = pseudo.inp +end + +subsection Parallelization + set NPBAND=2 +end + +subsection SCF parameters + set MIXING HISTORY = 70 + set MIXING PARAMETER = 0.5 + set MAXIMUM ITERATIONS = 100 + set TEMPERATURE = 500 + set TOLERANCE = 1e-6 + set STARTING WFC = ATOMIC + subsection Eigen-solver parameters + set CHEBYSHEV POLYNOMIAL DEGREE=15 + set NUMBER OF KOHN-SHAM WAVEFUNCTIONS = 8 + set SUBSPACE ROT DOFS BLOCK SIZE = 2000 + set WFC BLOCK SIZE=2 + set CHEBY WFC BLOCK SIZE=2 + set CHEBYSHEV FILTER TOLERANCE=1e-5 + set USE ELPA=true + set SCALAPACKPROCS=2 + set OVERLAP COMPUTE COMMUN CHEBY=true + set OVERLAP COMPUTE COMMUN ORTHO RR=false + end +end + +subsection Poisson problem parameters + set GPU MODE = true + set MAXIMUM ITERATIONS = 20000 + set TOLERANCE = 1e-10 +end \ No newline at end of file