From 3a4289e8379cb3a0bbb24f735450aeb5192203a8 Mon Sep 17 00:00:00 2001 From: Nikhil Kodali Date: Thu, 18 Apr 2024 11:40:55 +0530 Subject: [PATCH 01/11] Prelim Cleanups --- CMakeLists.txt | 4 +- include/AtomicCenteredNonLocalOperator.h | 2 - include/MPICommunicatorP2P.h | 34 ++++------ include/dftfeDataTypes.h | 19 ++++++ include/oncvClass.h | 2 - include/sphericalHarmonicUtils.h | 2 +- ...t.cc => AtomicCenteredNonLocalOperator.cc} | 10 ++- .../oncv/{oncvClass.t.cc => oncvClass.cc} | 7 +- utils/BLASWrapperDevice.cu.cc | 64 +++++++++++++++++++ utils/BLASWrapperDevice.hip.cc | 64 +++++++++++++++++++ utils/MPICommunicatorP2P.cc | 46 +++++++------ 11 files changed, 206 insertions(+), 48 deletions(-) rename src/atom/{AtomicCenteredNonLocalOperator.t.cc => AtomicCenteredNonLocalOperator.cc} (99%) rename src/pseudo/oncv/{oncvClass.t.cc => oncvClass.cc} (99%) diff --git a/CMakeLists.txt b/CMakeLists.txt index cf2e9ac09..d3ee23c9a 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -161,6 +161,7 @@ SET(TARGET_SRC ./utils/FEBasisOperations.cc ./utils/FEBasisOperationsKernels.cc ./src/force/locateAtomCoreNodesForce.cc + ./src/atom/AtomicCenteredNonLocalOperator.cc ./src/atom/AtomPseudoWavefunctions.cc ./src/atom/AtomCenteredSphericalFunctionContainer.cc ./src/atom/AtomCenteredSphericalFunctionBase.cc @@ -171,7 +172,8 @@ SET(TARGET_SRC ./src/atom/AtomCenteredSphericalFunctionValenceDensitySpline.cc ./src/atom/AtomCenteredSphericalFunctionCoreDensitySpline.cc ./src/atom/AtomCenteredSphericalFunctionLocalPotentialSpline.cc - ./src/atom/AtomCenteredSphericalFunctionProjectorSpline.cc) + ./src/atom/AtomCenteredSphericalFunctionProjectorSpline.cc + ./src/pseudo/oncv/oncvClass.cc) IF ("${GPU_LANG}" STREQUAL "cuda") diff --git a/include/AtomicCenteredNonLocalOperator.h b/include/AtomicCenteredNonLocalOperator.h index de38af052..ac8642f7c 100644 --- a/include/AtomicCenteredNonLocalOperator.h +++ b/include/AtomicCenteredNonLocalOperator.h @@ -477,6 +477,4 @@ namespace dftfe } // namespace dftfe -#include "../src/atom/AtomicCenteredNonLocalOperator.t.cc" - #endif // DFTFE_ATOMICCENTEREDNONLOCALOPERATOR_H diff --git a/include/MPICommunicatorP2P.h b/include/MPICommunicatorP2P.h index f531c8496..bfc30ec7f 100644 --- a/include/MPICommunicatorP2P.h +++ b/include/MPICommunicatorP2P.h @@ -58,22 +58,6 @@ namespace dftfe }; - template - struct singlePrecType - { - typedef T type; - }; - template <> - struct singlePrecType - { - typedef float type; - }; - template <> - struct singlePrecType> - { - typedef std::complex type; - }; - template class MPICommunicatorP2P { @@ -136,10 +120,14 @@ namespace dftfe MemoryStorage d_tempFloatImagArrayForAtomics; - MemoryStorage::type, memorySpace> + MemoryStorage< + typename dftfe::dataTypes::singlePrecType::type, + memorySpace> d_sendRecvBufferSinglePrec; - MemoryStorage::type, memorySpace> + MemoryStorage< + typename dftfe::dataTypes::singlePrecType::type, + memorySpace> d_ghostDataCopySinglePrec; #ifdef DFTFE_WITH_DEVICE @@ -149,12 +137,14 @@ namespace dftfe std::shared_ptr> d_sendRecvBufferHostPinnedPtr; - std::shared_ptr::type, - MemorySpace::HOST_PINNED>> + std::shared_ptr::type, + MemorySpace::HOST_PINNED>> d_ghostDataCopySinglePrecHostPinnedPtr; - std::shared_ptr::type, - MemorySpace::HOST_PINNED>> + std::shared_ptr::type, + MemorySpace::HOST_PINNED>> d_sendRecvBufferSinglePrecHostPinnedPtr; #endif // DFTFE_WITH_DEVICE diff --git a/include/dftfeDataTypes.h b/include/dftfeDataTypes.h index 1ccb651a6..5f5eb51d0 100644 --- a/include/dftfeDataTypes.h +++ b/include/dftfeDataTypes.h @@ -108,6 +108,25 @@ namespace dftfe { return MPI_DOUBLE_COMPLEX; } + + template + struct singlePrecType + { + typedef T type; + }; + + template <> + struct singlePrecType + { + typedef float type; + }; + + template <> + struct singlePrecType> + { + typedef std::complex type; + }; + } // namespace dataTypes } // namespace dftfe diff --git a/include/oncvClass.h b/include/oncvClass.h index 90264513d..e261e64fd 100644 --- a/include/oncvClass.h +++ b/include/oncvClass.h @@ -312,7 +312,5 @@ namespace dftfe }; // end of class - } // end of namespace dftfe -#include "../src/pseudo/oncv/oncvClass.t.cc" #endif // DFTFE_PSEUDOPOTENTIALBASE_H diff --git a/include/sphericalHarmonicUtils.h b/include/sphericalHarmonicUtils.h index fd25e4edc..ccb24c113 100644 --- a/include/sphericalHarmonicUtils.h +++ b/include/sphericalHarmonicUtils.h @@ -18,7 +18,7 @@ // #ifndef DFTFE_SPHERICALHARMONICUTILS_H #define DFTFE_SPHERICALHARMONICUTILS_H - +#include namespace dftfe { namespace sphericalHarmonicUtils diff --git a/src/atom/AtomicCenteredNonLocalOperator.t.cc b/src/atom/AtomicCenteredNonLocalOperator.cc similarity index 99% rename from src/atom/AtomicCenteredNonLocalOperator.t.cc rename to src/atom/AtomicCenteredNonLocalOperator.cc index a7cc505c1..143bb54d8 100644 --- a/src/atom/AtomicCenteredNonLocalOperator.t.cc +++ b/src/atom/AtomicCenteredNonLocalOperator.cc @@ -16,6 +16,7 @@ // // @author Kartick Ramakrishnan, Sambit Das, Phani Motamarri, Vishal Subramanian // +#include #if defined(DFTFE_WITH_DEVICE) # include # include @@ -23,7 +24,6 @@ # include # include #endif - namespace dftfe { template @@ -2271,5 +2271,13 @@ namespace dftfe { return (d_flattenedNonLocalCellDofIndexToProcessDofIndexMap); } + template class AtomicCenteredNonLocalOperator< + dataTypes::number, + dftfe::utils::MemorySpace::HOST>; +#if defined(DFTFE_WITH_DEVICE) + template class AtomicCenteredNonLocalOperator< + dataTypes::number, + dftfe::utils::MemorySpace::DEVICE>; +#endif } // namespace dftfe diff --git a/src/pseudo/oncv/oncvClass.t.cc b/src/pseudo/oncv/oncvClass.cc similarity index 99% rename from src/pseudo/oncv/oncvClass.t.cc rename to src/pseudo/oncv/oncvClass.cc index 695b30986..aff7df6de 100644 --- a/src/pseudo/oncv/oncvClass.t.cc +++ b/src/pseudo/oncv/oncvClass.cc @@ -16,7 +16,7 @@ // // @author Kartick Ramakrishnan // - +#include namespace dftfe { template @@ -716,4 +716,9 @@ namespace dftfe d_atomicProjectorFnsContainer->getTotalNumberOfSphericalFunctionsPerAtom( atomicNumbers[atomId])); } + template class oncvClass; +#if defined(DFTFE_WITH_DEVICE) + template class oncvClass; +#endif } // namespace dftfe diff --git a/utils/BLASWrapperDevice.cu.cc b/utils/BLASWrapperDevice.cu.cc index f505005d7..184d66737 100644 --- a/utils/BLASWrapperDevice.cu.cc +++ b/utils/BLASWrapperDevice.cu.cc @@ -1702,6 +1702,70 @@ namespace dftfe std::complex * copyToVecBlock, const dftfe::global_size_type *copyFromVecStartingContiguousBlockIds); + // strided copy from block + template void + BLASWrapper::stridedCopyFromBlock( + const dftfe::size_type contiguousBlockSize, + const dftfe::size_type numContiguousBlocks, + const double * copyFromVecBlock, + double * copyToVec, + const dftfe::global_size_type *copyFromVecStartingContiguousBlockIds); + + template void + BLASWrapper::stridedCopyFromBlock( + const dftfe::size_type contiguousBlockSize, + const dftfe::size_type numContiguousBlocks, + const float * copyFromVecBlock, + float * copyToVec, + const dftfe::global_size_type *copyFromVecStartingContiguousBlockIds); + + template void + BLASWrapper::stridedCopyFromBlock( + const dftfe::size_type contiguousBlockSize, + const dftfe::size_type numContiguousBlocks, + const std::complex * copyFromVecBlock, + std::complex * copyToVec, + const dftfe::global_size_type *copyFromVecStartingContiguousBlockIds); + + template void + BLASWrapper::stridedCopyFromBlock( + const dftfe::size_type contiguousBlockSize, + const dftfe::size_type numContiguousBlocks, + const std::complex * copyFromVecBlock, + std::complex * copyToVec, + const dftfe::global_size_type *copyFromVecStartingContiguousBlockIds); + + template void + BLASWrapper::stridedCopyFromBlock( + const dftfe::size_type contiguousBlockSize, + const dftfe::size_type numContiguousBlocks, + const double * copyFromVecBlock, + float * copyToVec, + const dftfe::global_size_type *copyFromVecStartingContiguousBlockIds); + + template void + BLASWrapper::stridedCopyFromBlock( + const dftfe::size_type contiguousBlockSize, + const dftfe::size_type numContiguousBlocks, + const float * copyFromVecBlock, + double * copyToVec, + const dftfe::global_size_type *copyFromVecStartingContiguousBlockIds); + + template void + BLASWrapper::stridedCopyFromBlock( + const dftfe::size_type contiguousBlockSize, + const dftfe::size_type numContiguousBlocks, + const std::complex * copyFromVecBlock, + std::complex * copyToVec, + const dftfe::global_size_type *copyFromVecStartingContiguousBlockIds); + + template void + BLASWrapper::stridedCopyFromBlock( + const dftfe::size_type contiguousBlockSize, + const dftfe::size_type numContiguousBlocks, + const std::complex * copyFromVecBlock, + std::complex * copyToVec, + const dftfe::global_size_type *copyFromVecStartingContiguousBlockIds); template void diff --git a/utils/BLASWrapperDevice.hip.cc b/utils/BLASWrapperDevice.hip.cc index 50632c7f8..db616689f 100644 --- a/utils/BLASWrapperDevice.hip.cc +++ b/utils/BLASWrapperDevice.hip.cc @@ -1760,6 +1760,70 @@ namespace dftfe std::complex * copyToVecBlock, const dftfe::global_size_type *copyFromVecStartingContiguousBlockIds); + // strided copy from block + template void + BLASWrapper::stridedCopyFromBlock( + const dftfe::size_type contiguousBlockSize, + const dftfe::size_type numContiguousBlocks, + const double * copyFromVecBlock, + double * copyToVec, + const dftfe::global_size_type *copyFromVecStartingContiguousBlockIds); + + template void + BLASWrapper::stridedCopyFromBlock( + const dftfe::size_type contiguousBlockSize, + const dftfe::size_type numContiguousBlocks, + const float * copyFromVecBlock, + float * copyToVec, + const dftfe::global_size_type *copyFromVecStartingContiguousBlockIds); + + template void + BLASWrapper::stridedCopyFromBlock( + const dftfe::size_type contiguousBlockSize, + const dftfe::size_type numContiguousBlocks, + const std::complex * copyFromVecBlock, + std::complex * copyToVec, + const dftfe::global_size_type *copyFromVecStartingContiguousBlockIds); + + template void + BLASWrapper::stridedCopyFromBlock( + const dftfe::size_type contiguousBlockSize, + const dftfe::size_type numContiguousBlocks, + const std::complex * copyFromVecBlock, + std::complex * copyToVec, + const dftfe::global_size_type *copyFromVecStartingContiguousBlockIds); + + template void + BLASWrapper::stridedCopyFromBlock( + const dftfe::size_type contiguousBlockSize, + const dftfe::size_type numContiguousBlocks, + const double * copyFromVecBlock, + float * copyToVec, + const dftfe::global_size_type *copyFromVecStartingContiguousBlockIds); + + template void + BLASWrapper::stridedCopyFromBlock( + const dftfe::size_type contiguousBlockSize, + const dftfe::size_type numContiguousBlocks, + const float * copyFromVecBlock, + double * copyToVec, + const dftfe::global_size_type *copyFromVecStartingContiguousBlockIds); + + template void + BLASWrapper::stridedCopyFromBlock( + const dftfe::size_type contiguousBlockSize, + const dftfe::size_type numContiguousBlocks, + const std::complex * copyFromVecBlock, + std::complex * copyToVec, + const dftfe::global_size_type *copyFromVecStartingContiguousBlockIds); + + template void + BLASWrapper::stridedCopyFromBlock( + const dftfe::size_type contiguousBlockSize, + const dftfe::size_type numContiguousBlocks, + const std::complex * copyFromVecBlock, + std::complex * copyToVec, + const dftfe::global_size_type *copyFromVecStartingContiguousBlockIds); template void BLASWrapper:: diff --git a/utils/MPICommunicatorP2P.cc b/utils/MPICommunicatorP2P.cc index 348cb225c..6a6ed00eb 100644 --- a/utils/MPICommunicatorP2P.cc +++ b/utils/MPICommunicatorP2P.cc @@ -134,11 +134,13 @@ namespace dftfe if (d_commProtocol == communicationProtocol::mpiHost) { d_ghostDataCopySinglePrecHostPinnedPtr = std::make_shared< - MemoryStorage::type, + MemoryStorage::type, MemorySpace::HOST_PINNED>>( d_mpiPatternP2P->localGhostSize() * d_blockSize, 0.0); d_sendRecvBufferSinglePrecHostPinnedPtr = std::make_shared< - MemoryStorage::type, + MemoryStorage::type, MemorySpace::HOST_PINNED>>( d_mpiPatternP2P->getOwnedLocalIndicesForTargetProcs() .size() * @@ -347,8 +349,8 @@ namespace dftfe } else { - typename singlePrecType::type *recvArrayStartPtr = - d_ghostDataCopySinglePrec.data(); + typename dftfe::dataTypes::singlePrecType::type + *recvArrayStartPtr = d_ghostDataCopySinglePrec.data(); #ifdef DFTFE_WITH_DEVICE if constexpr (memorySpace == MemorySpace::DEVICE) @@ -372,7 +374,8 @@ namespace dftfe .data()[2 * i]) * d_blockSize * sizeof( - typename singlePrecType::type), + typename dftfe::dataTypes::singlePrecType< + ValueType>::type), MPI_BYTE, d_mpiPatternP2P->getGhostProcIds().data()[i], static_cast( @@ -416,8 +419,8 @@ namespace dftfe d_sendRecvBufferSinglePrec); // initiate non-blocking sends to target processors - typename singlePrecType::type *sendArrayStartPtr = - d_sendRecvBufferSinglePrec.data(); + typename dftfe::dataTypes::singlePrecType::type + *sendArrayStartPtr = d_sendRecvBufferSinglePrec.data(); #ifdef DFTFE_WITH_DEVICE if constexpr (memorySpace == MemorySpace::DEVICE) @@ -456,7 +459,8 @@ namespace dftfe d_mpiPatternP2P->getNumOwnedIndicesForTargetProcs() .data()[i] * d_blockSize * - (sizeof(typename singlePrecType::type) / + (sizeof(typename dftfe::dataTypes::singlePrecType< + ValueType>::type) / 4), ncclFloat, d_mpiPatternP2P->getTargetProcIds().data()[i], @@ -483,7 +487,8 @@ namespace dftfe d_mpiPatternP2P->getGhostLocalIndicesRanges() .data()[2 * i]) * d_blockSize * - (sizeof(typename singlePrecType::type) / + (sizeof(typename dftfe::dataTypes::singlePrecType< + ValueType>::type) / 4), ncclFloat, d_mpiPatternP2P->getGhostProcIds().data()[i], @@ -511,7 +516,8 @@ namespace dftfe d_mpiPatternP2P->getNumOwnedIndicesForTargetProcs() .data()[i] * d_blockSize * - sizeof(typename singlePrecType::type), + sizeof(typename dftfe::dataTypes::singlePrecType< + ValueType>::type), MPI_BYTE, d_mpiPatternP2P->getTargetProcIds().data()[i], static_cast( @@ -793,8 +799,8 @@ namespace dftfe else { // initiate non-blocking receives from target processors - typename singlePrecType::type *recvArrayStartPtr = - d_sendRecvBufferSinglePrec.data(); + typename dftfe::dataTypes::singlePrecType::type + *recvArrayStartPtr = d_sendRecvBufferSinglePrec.data(); #ifdef DFTFE_WITH_DEVICE if constexpr (memorySpace == MemorySpace::DEVICE) { @@ -815,7 +821,8 @@ namespace dftfe d_mpiPatternP2P->getNumOwnedIndicesForTargetProcs() .data()[i] * d_blockSize * - sizeof(typename singlePrecType::type), + sizeof(typename dftfe::dataTypes::singlePrecType< + ValueType>::type), MPI_BYTE, d_mpiPatternP2P->getTargetProcIds().data()[i], static_cast( @@ -855,8 +862,8 @@ namespace dftfe d_ghostDataCopySinglePrec.data()); // initiate non-blocking sends to ghost processors - typename singlePrecType::type *sendArrayStartPtr = - d_ghostDataCopySinglePrec.data(); + typename dftfe::dataTypes::singlePrecType::type + *sendArrayStartPtr = d_ghostDataCopySinglePrec.data(); #ifdef DFTFE_WITH_DEVICE if constexpr (memorySpace == MemorySpace::DEVICE) @@ -898,7 +905,8 @@ namespace dftfe d_mpiPatternP2P->getGhostLocalIndicesRanges() .data()[2 * i]) * d_blockSize * - (sizeof(typename singlePrecType::type) / + (sizeof(typename dftfe::dataTypes::singlePrecType< + ValueType>::type) / 4), ncclFloat, d_mpiPatternP2P->getGhostProcIds().data()[i], @@ -923,7 +931,8 @@ namespace dftfe d_mpiPatternP2P->getNumOwnedIndicesForTargetProcs() .data()[i] * d_blockSize * - (sizeof(typename singlePrecType::type) / + (sizeof(typename dftfe::dataTypes::singlePrecType< + ValueType>::type) / 4), ncclFloat, d_mpiPatternP2P->getTargetProcIds().data()[i], @@ -951,7 +960,8 @@ namespace dftfe d_mpiPatternP2P->getGhostLocalIndicesRanges() .data()[2 * i]) * d_blockSize * - sizeof(typename singlePrecType::type), + sizeof(typename dftfe::dataTypes::singlePrecType< + ValueType>::type), MPI_BYTE, d_mpiPatternP2P->getGhostProcIds().data()[i], static_cast( From 02f89fbb8e226b4a3fdcf72f3db0ac90702053cc Mon Sep 17 00:00:00 2001 From: Nikhil Kodali Date: Sun, 21 Apr 2024 23:52:48 +0530 Subject: [PATCH 02/11] Residual based single prec algo for Chebyshev Filtering --- include/AtomicCenteredNonLocalOperator.h | 19 +- include/BLASWrapper.h | 36 ++- include/FEBasisOperations.h | 30 ++ include/KohnShamHamiltonianOperator.h | 33 +- ...hevOrthogonalizedSubspaceIterationSolver.h | 8 +- include/constraintMatrixInfoDevice.h | 8 +- include/dftParameters.h | 1 + include/linearAlgebraOperations.h | 29 ++ include/linearAlgebraOperationsDevice.h | 33 ++ include/oncvClass.h | 35 ++- include/operator.h | 18 ++ src/atom/AtomicCenteredNonLocalOperator.cc | 27 +- ...icCenteredNonLocalOperatorKernelsDevice.cc | 56 ++-- src/dft/dft.cc | 11 +- src/dft/initBoundaryConditions.cc | 13 + src/dft/kohnShamEigenSolve.cc | 27 +- .../KohnShamHamiltonianOperator.cc | 213 +++++++++++++ src/linAlg/linearAlgebraOperationsDevice.cc | 295 ++++++++++++++++++ src/linAlg/linearAlgebraOperationsOpt.cc | 176 ++++++++++- src/pseudo/oncv/oncvClass.cc | 85 ++++- ...evOrthogonalizedSubspaceIterationSolver.cc | 72 ++++- ...ogonalizedSubspaceIterationSolverDevice.cc | 116 ++++++- utils/BLASWrapperDevice.cu.cc | 146 ++++++++- utils/BLASWrapperDevice.hip.cc | 150 ++++++++- utils/BLASWrapperDeviceKernels.cc | 207 ++++++++++++ utils/BLASWrapperHost.cc | 275 +++++++++++++++- utils/FEBasisOperations.cc | 50 +++ utils/constraintMatrixInfo.cc | 38 +++ utils/constraintMatrixInfoDevice.cc | 161 +++++++--- utils/dftParameters.cc | 11 +- 30 files changed, 2232 insertions(+), 147 deletions(-) diff --git a/include/AtomicCenteredNonLocalOperator.h b/include/AtomicCenteredNonLocalOperator.h index ac8642f7c..77d39a486 100644 --- a/include/AtomicCenteredNonLocalOperator.h +++ b/include/AtomicCenteredNonLocalOperator.h @@ -61,7 +61,7 @@ namespace dftfe std::shared_ptr> BLASWrapperPtr, std::shared_ptr< - dftfe::basis::FEBasisOperations> + dftfe::basis::FEBasisOperations> basisOperatorPtr, std::shared_ptr atomCenteredSphericalFunctionContainer, @@ -102,8 +102,9 @@ namespace dftfe const std::vector &kPointWeights, const std::vector &kPointCoordinates, std::shared_ptr< - dftfe::basis:: - FEBasisOperations> + dftfe::basis::FEBasisOperations> basisOperationsPtr, const unsigned int quadratureIndex); #if defined(DFTFE_WITH_DEVICE) @@ -313,7 +314,7 @@ namespace dftfe std::vector d_numberCellsForEachAtom; std::shared_ptr< - dftfe::basis::FEBasisOperations> + dftfe::basis::FEBasisOperations> d_basisOperatorPtr; @@ -419,11 +420,11 @@ namespace dftfe */ void computeCMatrixEntries( - std::shared_ptr< - dftfe::basis:: - FEBasisOperations> - basisOperationsPtr, - const unsigned int quadratureIndex); + std::shared_ptr> basisOperationsPtr, + const unsigned int quadratureIndex); std::map< unsigned int, diff --git a/include/BLASWrapper.h b/include/BLASWrapper.h index 9c3f40d62..aa26dde45 100644 --- a/include/BLASWrapper.h +++ b/include/BLASWrapper.h @@ -503,6 +503,19 @@ namespace dftfe const ValueType1 * x, const ValueType2 beta, ValueType1 * y) const; + template + void + ApaBD(const unsigned int m, + const unsigned int n, + const ValueType0 alpha, + const ValueType1 * A, + const ValueType2 * B, + const ValueType3 * D, + ValueType4 * C) const; template void @@ -513,14 +526,14 @@ namespace dftfe const dftfe::global_size_type *addToVecStartingContiguousBlockIds) const; - template + template void axpyStridedBlockAtomicAdd(const dftfe::size_type contiguousBlockSize, const dftfe::size_type numContiguousBlocks, const ValueType1 a, const ValueType1 * s, const ValueType2 * addFromVec, - ValueType2 * addToVec, + ValueType3 * addToVec, const dftfe::global_size_type *addToVecStartingContiguousBlockIds) const; @@ -1030,6 +1043,21 @@ namespace dftfe const ValueType2 beta, ValueType1 * y) const; + template + void + ApaBD(const unsigned int m, + const unsigned int n, + const ValueType0 alpha, + const ValueType1 * A, + const ValueType2 * B, + const ValueType3 * D, + ValueType4 * C) const; + + template void axpyStridedBlockAtomicAdd(const dftfe::size_type contiguousBlockSize, @@ -1039,14 +1067,14 @@ namespace dftfe const dftfe::global_size_type *addToVecStartingContiguousBlockIds) const; - template + template void axpyStridedBlockAtomicAdd(const dftfe::size_type contiguousBlockSize, const dftfe::size_type numContiguousBlocks, const ValueType1 a, const ValueType1 * s, const ValueType2 * addFromVec, - ValueType2 * addToVec, + ValueType3 * addToVec, const dftfe::global_size_type *addToVecStartingContiguousBlockIds) const; diff --git a/include/FEBasisOperations.h b/include/FEBasisOperations.h index a17ed8ab6..520ce1148 100644 --- a/include/FEBasisOperations.h +++ b/include/FEBasisOperations.h @@ -719,6 +719,17 @@ namespace dftfe createScratchMultiVectors(const unsigned int vecBlockSize, const unsigned int numMultiVecs = 1) const; + /** + * @brief Creates single precision scratch multivectors. + * @param[in] vecBlockSize Number of vectors in the multivector. + * @param[out] numMultiVecs number of scratch multivectors needed with + * this vecBlockSize. + */ + void + createScratchMultiVectorsSinglePrec( + const unsigned int vecBlockSize, + const unsigned int numMultiVecs = 1) const; + /** * @brief Clears scratch multivectors. */ @@ -735,6 +746,18 @@ namespace dftfe getMultiVector(const unsigned int vecBlockSize, const unsigned int index = 0) const; + /** + * @brief Gets single precision scratch multivectors. + * @param[in] vecBlockSize Number of vectors in the multivector. + * @param[out] numMultiVecs index of the multivector among those with the + * same vecBlockSize. + */ + dftfe::linearAlgebra::MultiVector< + typename dftfe::dataTypes::singlePrecType::type, + memorySpace> & + getMultiVectorSinglePrec(const unsigned int vecBlockSize, + const unsigned int index = 0) const; + /** * @brief Apply constraints on given multivector. * @param[inout] multiVector the given multivector. @@ -864,6 +887,13 @@ namespace dftfe dftfe::linearAlgebra::MultiVector>> scratchMultiVectors; + mutable std::map< + unsigned int, + std::vector::type, + memorySpace>>> + scratchMultiVectorsSinglePrec; + std::vector d_quadratureIDsVector; unsigned int d_quadratureID; unsigned int d_quadratureIndex; diff --git a/include/KohnShamHamiltonianOperator.h b/include/KohnShamHamiltonianOperator.h index 2f04cd3b1..16715a88a 100644 --- a/include/KohnShamHamiltonianOperator.h +++ b/include/KohnShamHamiltonianOperator.h @@ -88,6 +88,11 @@ namespace dftfe const unsigned int index); + dftfe::linearAlgebra::MultiVector & + getScratchFEMultivectorSinglePrec(const unsigned int numVectors, + const unsigned int index); + + /** * @brief Computes effective potential involving exchange-correlation functionals * @param rhoValues electron-density @@ -178,6 +183,19 @@ namespace dftfe const bool skip2 = false, const bool skip3 = false); + void + HXCheby(dftfe::linearAlgebra::MultiVector &src, + const double scalarHX, + const double scalarY, + const double scalarX, + dftfe::linearAlgebra::MultiVector &dst, + const bool onlyHPrimePartForFirstOrderDensityMatResponse, + const bool skip1, + const bool skip2, + const bool skip3); + void HXRR( dftfe::linearAlgebra::MultiVector &src, @@ -190,6 +208,10 @@ namespace dftfe AtomicCenteredNonLocalOperator> d_ONCVnonLocalOperator; + std::shared_ptr< + AtomicCenteredNonLocalOperator> + d_ONCVnonLocalOperatorSinglePrec; + std::shared_ptr> d_BLASWrapperPtr; std::shared_ptr< @@ -207,6 +229,8 @@ namespace dftfe std::vector> d_cellHamiltonianMatrix; + std::vector> + d_cellHamiltonianMatrixSinglePrec; dftfe::utils::MemoryStorage d_cellHamiltonianMatrixExtPot; @@ -216,8 +240,15 @@ namespace dftfe dftfe::utils::MemoryStorage d_cellWaveFunctionMatrixDst; + dftfe::utils::MemoryStorage + d_cellWaveFunctionMatrixSrcSinglePrec; + dftfe::utils::MemoryStorage + d_cellWaveFunctionMatrixDstSinglePrec; + dftfe::linearAlgebra::MultiVector - d_ONCVNonLocalProjectorTimesVectorBlock; + d_ONCVNonLocalProjectorTimesVectorBlock; + dftfe::linearAlgebra::MultiVector + d_ONCVNonLocalProjectorTimesVectorBlockSinglePrec; dftfe::utils::MemoryStorage d_VeffJxW; dftfe::utils::MemoryStorage d_VeffExtPotJxW; diff --git a/include/chebyshevOrthogonalizedSubspaceIterationSolver.h b/include/chebyshevOrthogonalizedSubspaceIterationSolver.h index 52e242366..2b7b8a065 100644 --- a/include/chebyshevOrthogonalizedSubspaceIterationSolver.h +++ b/include/chebyshevOrthogonalizedSubspaceIterationSolver.h @@ -24,7 +24,7 @@ #include "operator.h" #include "elpaScalaManager.h" #include "dftParameters.h" - +#include "BLASWrapper.h" namespace dftfe { @@ -65,7 +65,10 @@ namespace dftfe */ void solve(operatorDFTClass &operatorMatrix, - elpaScalaManager & elpaScala, + const std::shared_ptr< + dftfe::linearAlgebra::BLASWrapper> + & BLASWrapperPtr, + elpaScalaManager & elpaScala, dataTypes::number * eigenVectorsFlattened, dataTypes::number * eigenVectorsRotFracDensityFlattened, const unsigned int totalNumberWaveFunctions, @@ -74,6 +77,7 @@ namespace dftfe std::vector &residuals, const MPI_Comm & interBandGroupComm, const MPI_Comm & mpiCommDomain, + const bool isFirstFilteringCall, const bool computeResidual, const bool useMixedPrec = false, const bool isFirstScf = false); diff --git a/include/constraintMatrixInfoDevice.h b/include/constraintMatrixInfoDevice.h index c4127109e..071c52562 100644 --- a/include/constraintMatrixInfoDevice.h +++ b/include/constraintMatrixInfoDevice.h @@ -95,14 +95,10 @@ namespace dftfe * care of constraints * @param blockSize number of components for a given node */ + template void distribute_slave_to_master( - distributedDeviceVec &fieldVector) const; - - void - distribute_slave_to_master( - distributedDeviceVec> &fieldVector) const; - + distributedDeviceVec &fieldVector) const; /** * @brief sets field values at constrained nodes to be zero diff --git a/include/dftParameters.h b/include/dftParameters.h index 11da05b2d..51b3d4ea0 100644 --- a/include/dftParameters.h +++ b/include/dftParameters.h @@ -137,6 +137,7 @@ namespace dftfe bool deviceFineGrainedTimings; bool allowFullCPUMemSubspaceRot; bool useMixedPrecCheby; + bool useSinglePrecCheby; bool overlapComputeCommunCheby; bool overlapComputeCommunOrthoRR; bool autoDeviceBlockSizes; diff --git a/include/linearAlgebraOperations.h b/include/linearAlgebraOperations.h index 7e3df2ed2..9d13de6a2 100644 --- a/include/linearAlgebraOperations.h +++ b/include/linearAlgebraOperations.h @@ -144,6 +144,13 @@ namespace dftfe double * y, const unsigned int *incy); void + saxpy_(const unsigned int *n, + const float * alpha, + const float * x, + const unsigned int *incx, + float * y, + const unsigned int *incy); + void dgemm_(const char * transA, const char * transB, const unsigned int *m, @@ -412,6 +419,13 @@ namespace dftfe std::complex * y, const unsigned int * incy); void + caxpy_(const unsigned int * n, + const std::complex *alpha, + const std::complex *x, + const unsigned int * incx, + std::complex * y, + const unsigned int * incy); + void dpotrf_(const char * uplo, const unsigned int *n, double * a, @@ -617,6 +631,21 @@ namespace dftfe const double b, const double a0); + template + void + chebyshevFilterSinglePrec( + const std::shared_ptr> + & BLASWrapperPtr, + operatorDFTClass & operatorMatrix, + dftfe::linearAlgebra::MultiVector & X, + dftfe::linearAlgebra::MultiVector & Y, + dftfe::linearAlgebra::MultiVector &X_SP, + dftfe::linearAlgebra::MultiVector &Y_SP, + std::vector eigenvalues, + const unsigned int m, + const double a, + const double b, + const double a0); /** @brief Orthogonalize given subspace using GramSchmidt orthogonalization diff --git a/include/linearAlgebraOperationsDevice.h b/include/linearAlgebraOperationsDevice.h index f614adc91..97e8c4b18 100644 --- a/include/linearAlgebraOperationsDevice.h +++ b/include/linearAlgebraOperationsDevice.h @@ -25,6 +25,7 @@ # include "elpaScalaManager.h" # include "deviceDirectCCLWrapper.h" # include "dftParameters.h" +# include namespace dftfe { @@ -94,6 +95,38 @@ namespace dftfe const double b, const double a0); + void + chebyshevFilterOverlapComputeCommunicationSinglePrec( + const std::shared_ptr< + dftfe::linearAlgebra::BLASWrapper> + & BLASWrapperPtr, + operatorDFTClass &operatorMatrix, + dftfe::linearAlgebra::MultiVector &X1, + dftfe::linearAlgebra::MultiVector &Y1, + dftfe::linearAlgebra::MultiVector &X2, + dftfe::linearAlgebra::MultiVector &Y2, + dftfe::linearAlgebra::MultiVector + &X1_SP, + dftfe::linearAlgebra::MultiVector + &Y1_SP, + dftfe::linearAlgebra::MultiVector + &X2_SP, + dftfe::linearAlgebra::MultiVector + & Y2_SP, + std::vector eigenvalues, + const unsigned int m, + const double a, + const double b, + const double a0); + /** @brief Computes Sc=X^{T}*Xc. * * diff --git a/include/oncvClass.h b/include/oncvClass.h index e261e64fd..215662821 100644 --- a/include/oncvClass.h +++ b/include/oncvClass.h @@ -101,7 +101,8 @@ namespace dftfe unsigned int densityQuadratureIdElectro, std::shared_ptr excFunctionalPtr, const std::vector> &atomLocations, - unsigned int numEigenValues); + unsigned int numEigenValues, + const bool singlePrecNonLocalOperator); /** * @brief Initialises all the data members with addresses/values to/of dftClass. @@ -201,6 +202,17 @@ namespace dftfe AtomicCenteredNonLocalOperator> getNonLocalOperator(); + const dftfe::utils::MemoryStorage< + typename dftfe::dataTypes::singlePrecType::type, + memorySpace> & + getCouplingMatrixSinglePrec(); + + + const std::shared_ptr::type, + memorySpace>> + getNonLocalOperatorSinglePrec(); + private: /** * @brief Converts the periodic image data structure to relevant form for the container class @@ -241,8 +253,13 @@ namespace dftfe std::map> d_atomicNonLocalPseudoPotentialConstants; dftfe::utils::MemoryStorage d_couplingMatrixEntries; + dftfe::utils::MemoryStorage< + typename dftfe::dataTypes::singlePrecType::type, + memorySpace> + d_couplingMatrixEntriesSinglePrec; bool d_HamiltonianCouplingMatrixEntriesUpdated; + bool d_HamiltonianCouplingMatrixSinglePrecEntriesUpdated; std::vector> d_atomicWaveFnsVector; std::shared_ptr @@ -276,11 +293,12 @@ namespace dftfe d_BasisOperatorDevicePtr; #endif - std::map d_atomTypeCoreFlagMap; - bool d_floatingNuclearCharges; - int d_verbosity; - std::vector> d_atomLocations; - std::set d_atomTypes; + std::map d_atomTypeCoreFlagMap; + bool d_floatingNuclearCharges; + bool d_singlePrecNonLocalOperator; + int d_verbosity; + std::vector> d_atomLocations; + std::set d_atomTypes; std::map> d_atomTypesList; std::string d_dftfeScratchFolderName; std::vector d_imageIds; @@ -292,6 +310,11 @@ namespace dftfe std::shared_ptr> d_nonLocalOperator; + std::shared_ptr::type, + memorySpace>> + d_nonLocalOperatorSinglePrec; + std::vector> d_atomicProjectorFnsVector; diff --git a/include/operator.h b/include/operator.h index 7ea1f6e0b..fd694df10 100644 --- a/include/operator.h +++ b/include/operator.h @@ -66,6 +66,11 @@ namespace dftfe getScratchFEMultivector(const unsigned int numVectors, const unsigned int index) = 0; + virtual dftfe::linearAlgebra::MultiVector & + getScratchFEMultivectorSinglePrec(const unsigned int numVectors, + const unsigned int index) = 0; + virtual void init(const std::vector &kPointCoordinates, const std::vector &kPointWeights) = 0; @@ -98,6 +103,19 @@ namespace dftfe const bool skip2 = false, const bool skip3 = false) = 0; + virtual void + HXCheby(dftfe::linearAlgebra::MultiVector &src, + const double scalarHX, + const double scalarY, + const double scalarX, + dftfe::linearAlgebra::MultiVector &dst, + const bool onlyHPrimePartForFirstOrderDensityMatResponse = false, + const bool skip1 = false, + const bool skip2 = false, + const bool skip3 = false) = 0; + virtual dftUtils::constraintMatrixInfo * getOverloadedConstraintMatrixHost() const = 0; diff --git a/src/atom/AtomicCenteredNonLocalOperator.cc b/src/atom/AtomicCenteredNonLocalOperator.cc index 143bb54d8..a2620f068 100644 --- a/src/atom/AtomicCenteredNonLocalOperator.cc +++ b/src/atom/AtomicCenteredNonLocalOperator.cc @@ -32,7 +32,7 @@ namespace dftfe std::shared_ptr> BLASWrapperPtr, std::shared_ptr< - dftfe::basis::FEBasisOperations> + dftfe::basis::FEBasisOperations> basisOperatorPtr, std::shared_ptr atomCenteredSphericalFunctionContainer, @@ -117,11 +117,11 @@ namespace dftfe template void AtomicCenteredNonLocalOperator::computeCMatrixEntries( - std::shared_ptr< - dftfe::basis:: - FEBasisOperations> - basisOperationsPtr, - const unsigned int quadratureIndex) + std::shared_ptr> basisOperationsPtr, + const unsigned int quadratureIndex) { d_locallyOwnedCells = basisOperationsPtr->nCells(); basisOperationsPtr->reinit(0, 0, quadratureIndex); @@ -138,7 +138,7 @@ namespace dftfe ->shapeFunctionBasisData(); // shapeFunctionData() for complex const dftfe::utils::MemoryStorage quadraturePointsVector = basisOperationsPtr->quadPoints(); - const dftfe::utils::MemoryStorage JxwVector = basisOperationsPtr->JxW(); const std::vector &atomicNumber = @@ -2047,7 +2047,7 @@ namespace dftfe else { initialiseOperatorActionOnX(kPointIndex); - dftfe::utils::MemoryStorage cellWaveFunctionMatrix; cellWaveFunctionMatrix.resize(d_locallyOwnedCells * @@ -2246,8 +2246,9 @@ namespace dftfe const std::vector &kPointWeights, const std::vector &kPointCoordinates, std::shared_ptr< - dftfe::basis:: - FEBasisOperations> + dftfe::basis::FEBasisOperations> basisOperationsPtr, const unsigned int quadratureIndex) { @@ -2274,10 +2275,16 @@ namespace dftfe template class AtomicCenteredNonLocalOperator< dataTypes::number, dftfe::utils::MemorySpace::HOST>; + template class AtomicCenteredNonLocalOperator< + dataTypes::numberFP32, + dftfe::utils::MemorySpace::HOST>; #if defined(DFTFE_WITH_DEVICE) template class AtomicCenteredNonLocalOperator< dataTypes::number, dftfe::utils::MemorySpace::DEVICE>; + template class AtomicCenteredNonLocalOperator< + dataTypes::numberFP32, + dftfe::utils::MemorySpace::DEVICE>; #endif } // namespace dftfe diff --git a/src/atom/AtomicCenteredNonLocalOperatorKernelsDevice.cc b/src/atom/AtomicCenteredNonLocalOperatorKernelsDevice.cc index d3986724c..2f2c4d6ee 100644 --- a/src/atom/AtomicCenteredNonLocalOperatorKernelsDevice.cc +++ b/src/atom/AtomicCenteredNonLocalOperatorKernelsDevice.cc @@ -246,38 +246,38 @@ namespace dftfe template void copyToDealiiParallelNonLocalVec( - const unsigned int numWfcs, - const unsigned int totalEntries, - const double * sphericalFnTimesWfcParallelVec, - double * sphericalFnTimesWfcDealiiParallelVec, - const unsigned int *indexMapDealiiParallelNumbering); + const unsigned int numWfcs, + const unsigned int totalEntries, + const dataTypes::number *sphericalFnTimesWfcParallelVec, + dataTypes::number * sphericalFnTimesWfcDealiiParallelVec, + const unsigned int * indexMapDealiiParallelNumbering); template void copyToDealiiParallelNonLocalVec( - const unsigned int numWfcs, - const unsigned int totalEntries, - const std::complex *sphericalFnTimesWfcParallelVec, - std::complex * sphericalFnTimesWfcDealiiParallelVec, - const unsigned int * indexMapDealiiParallelNumbering); + const unsigned int numWfcs, + const unsigned int totalEntries, + const dataTypes::numberFP32 *sphericalFnTimesWfcParallelVec, + dataTypes::numberFP32 * sphericalFnTimesWfcDealiiParallelVec, + const unsigned int * indexMapDealiiParallelNumbering); template void copyFromParallelNonLocalVecToAllCellsVec( - const unsigned int numWfcs, - const unsigned int numNonLocalCells, - const unsigned int maxSingleAtomContribution, - const double * sphericalFnTimesWfcParallelVec, - double * sphericalFnTimesWfcAllCellsVec, - const int * indexMapPaddedToParallelVec); + const unsigned int numWfcs, + const unsigned int numNonLocalCells, + const unsigned int maxSingleAtomContribution, + const dataTypes::number *sphericalFnTimesWfcParallelVec, + dataTypes::number * sphericalFnTimesWfcAllCellsVec, + const int * indexMapPaddedToParallelVec); template void copyFromParallelNonLocalVecToAllCellsVec( - const unsigned int numWfcs, - const unsigned int numNonLocalCells, - const unsigned int maxSingleAtomContribution, - const std::complex *sphericalFnTimesWfcParallelVec, - std::complex * sphericalFnTimesWfcAllCellsVec, - const int * indexMapPaddedToParallelVec); + const unsigned int numWfcs, + const unsigned int numNonLocalCells, + const unsigned int maxSingleAtomContribution, + const dataTypes::numberFP32 *sphericalFnTimesWfcParallelVec, + dataTypes::numberFP32 * sphericalFnTimesWfcAllCellsVec, + const int * indexMapPaddedToParallelVec); template void @@ -286,10 +286,10 @@ namespace dftfe const unsigned int numberNodesPerElement, const unsigned int numberWfc, const unsigned int numberCellsTraversed, - const dftfe::utils::MemoryStorage - & nonLocalContribution, - double *TotalContribution, + & nonLocalContribution, + dataTypes::number *TotalContribution, const dftfe::utils::MemoryStorage &cellNodeIdMapNonLocalToLocal); @@ -300,10 +300,10 @@ namespace dftfe const unsigned int numberNodesPerElement, const unsigned int numberWfc, const unsigned int numberCellsTraversed, - const dftfe::utils::MemoryStorage, + const dftfe::utils::MemoryStorage - & nonLocalContribution, - std::complex *TotalContribution, + & nonLocalContribution, + dataTypes::numberFP32 *TotalContribution, const dftfe::utils::MemoryStorage &cellNodeIdMapNonLocalToLocal); diff --git a/src/dft/dft.cc b/src/dft/dft.cc index 713b6c5fe..58cec5283 100644 --- a/src/dft/dft.cc +++ b/src/dft/dft.cc @@ -1184,7 +1184,8 @@ namespace dftfe d_densityQuadratureIdElectro, d_excManagerPtr, atomLocations, - d_numEigenValues); + d_numEigenValues, + d_dftParamsPtr->useSinglePrecCheby); // @@ -3059,6 +3060,14 @@ namespace dftfe #ifdef DFTFE_WITH_DEVICE + std::cout + << "DEBUG ss dft.cc " + << (scfIter < + d_dftParamsPtr->spectrumSplitStartingScfIter || + scfConverged) + << " " << scfIter << " " + << d_dftParamsPtr->spectrumSplitStartingScfIter << " " + << scfConverged << std::endl; if constexpr (dftfe::utils::MemorySpace::DEVICE == memorySpace) kohnShamEigenSpaceCompute( diff --git a/src/dft/initBoundaryConditions.cc b/src/dft/initBoundaryConditions.cc index f9e5825ce..0513b6ca4 100644 --- a/src/dft/initBoundaryConditions.cc +++ b/src/dft/initBoundaryConditions.cc @@ -317,10 +317,16 @@ namespace dftfe d_basisOperationsPtrHost->createScratchMultiVectors(1, 3); d_basisOperationsPtrHost->createScratchMultiVectors(BVec, 2); + if (d_dftParamsPtr->useSinglePrecCheby) + d_basisOperationsPtrHost->createScratchMultiVectorsSinglePrec(BVec, + 2); if (d_numEigenValues % BVec != 0) d_basisOperationsPtrHost->createScratchMultiVectors(d_numEigenValues % BVec, 2); + if (d_dftParamsPtr->useSinglePrecCheby) + d_basisOperationsPtrHost->createScratchMultiVectorsSinglePrec( + d_numEigenValues % BVec, 2); if (d_numEigenValues != d_numEigenValuesRR && d_numEigenValuesRR % BVec != 0) d_basisOperationsPtrHost->createScratchMultiVectors( @@ -360,6 +366,10 @@ namespace dftfe d_basisOperationsPtrDevice->createScratchMultiVectors(1, 3); d_basisOperationsPtrDevice->createScratchMultiVectors( BVec, d_dftParamsPtr->overlapComputeCommunCheby ? 4 : 2); + if (d_dftParamsPtr->useSinglePrecCheby) + d_basisOperationsPtrDevice->createScratchMultiVectorsSinglePrec( + BVec, d_dftParamsPtr->overlapComputeCommunCheby ? 4 : 2); + d_basisOperationsPtrDevice->computeCellStiffnessMatrix( d_feOrderPlusOneQuadratureId, 50, true, false); if (std::is_same>::value) @@ -408,6 +418,9 @@ namespace dftfe d_basisOperationsPtrDevice->createScratchMultiVectors(1, 3); d_basisOperationsPtrDevice->createScratchMultiVectors( BVec, d_dftParamsPtr->overlapComputeCommunCheby ? 4 : 2); + if (d_dftParamsPtr->useSinglePrecCheby) + d_basisOperationsPtrDevice->createScratchMultiVectorsSinglePrec( + BVec, d_dftParamsPtr->overlapComputeCommunCheby ? 4 : 2); } #endif diff --git a/src/dft/kohnShamEigenSolve.cc b/src/dft/kohnShamEigenSolve.cc index 93b36e041..9b28be56f 100644 --- a/src/dft/kohnShamEigenSolve.cc +++ b/src/dft/kohnShamEigenSolve.cc @@ -338,6 +338,12 @@ namespace dftfe std::vector eigenValuesTemp(isSpectrumSplit ? d_numEigenValuesRR : d_numEigenValues, 0.0); + if (d_dftParamsPtr->useSinglePrecCheby) + for (unsigned int i = 0; i < d_numEigenValues; i++) + { + eigenValuesTemp[i] = + eigenValues[kPointIndex][spinType * d_numEigenValues + i]; + } if (d_isFirstFilteringCall[(1 + d_dftParamsPtr->spinPolarized) * kPointIndex + @@ -405,6 +411,7 @@ namespace dftfe subspaceIterationSolver.solve( kohnShamDFTEigenOperator, + d_BLASWrapperPtrHost, elpaScala, d_eigenVectorsFlattenedHost.data() + ((1 + d_dftParamsPtr->spinPolarized) * kPointIndex + spinType) * @@ -420,6 +427,8 @@ namespace dftfe residualNormWaveFunctions, interBandGroupComm, mpi_communicator, + d_isFirstFilteringCall[(1 + d_dftParamsPtr->spinPolarized) * kPointIndex + + spinType], computeResidual, useMixedPrec, isFirstScf); @@ -519,13 +528,19 @@ namespace dftfe if (d_dftParamsPtr->spinPolarized == 1) pcout << "spin: " << spinType + 1 << std::endl; } - + std::cout << "DEBUG ss " << isSpectrumSplit << std::endl; std::vector eigenValuesTemp(isSpectrumSplit ? d_numEigenValuesRR : d_numEigenValues, 0.0); std::vector eigenValuesDummy(isSpectrumSplit ? d_numEigenValuesRR : d_numEigenValues, 0.0); + if (d_dftParamsPtr->useSinglePrecCheby) + for (unsigned int i = 0; i < d_numEigenValues; i++) + { + eigenValuesTemp[i] = + eigenValues[kPointIndex][spinType * d_numEigenValues + i]; + } subspaceIterationSolverDevice.reinitSpectrumBounds( a0[(1 + d_dftParamsPtr->spinPolarized) * kPointIndex + spinType], @@ -813,6 +828,13 @@ namespace dftfe std::vector eigenValuesTemp(d_numEigenValues, 0.0); + if (d_dftParamsPtr->useSinglePrecCheby) + for (unsigned int i = 0; i < d_numEigenValues; i++) + { + eigenValuesTemp[i] = + eigenValues[kPointIndex][spinType * d_numEigenValues + i]; + } + if (d_isFirstFilteringCall[(1 + d_dftParamsPtr->spinPolarized) * kPointIndex + @@ -868,6 +890,7 @@ namespace dftfe subspaceIterationSolver.solve( kohnShamDFTEigenOperator, + d_BLASWrapperPtrHost, *d_elpaScala, d_eigenVectorsFlattenedHost.data() + ((1 + d_dftParamsPtr->spinPolarized) * kPointIndex + spinType) * @@ -883,6 +906,8 @@ namespace dftfe residualNormWaveFunctions, interBandGroupComm, mpi_communicator, + d_isFirstFilteringCall[(1 + d_dftParamsPtr->spinPolarized) * kPointIndex + + spinType], true, false); diff --git a/src/dftOperator/KohnShamHamiltonianOperator.cc b/src/dftOperator/KohnShamHamiltonianOperator.cc index d6a2fff51..52cf0617a 100644 --- a/src/dftOperator/KohnShamHamiltonianOperator.cc +++ b/src/dftOperator/KohnShamHamiltonianOperator.cc @@ -71,6 +71,9 @@ namespace dftfe { if (d_dftParamsPtr->isPseudopotential) d_ONCVnonLocalOperator = oncvClassPtr->getNonLocalOperator(); + if (d_dftParamsPtr->isPseudopotential && d_dftParamsPtr->useSinglePrecCheby) + d_ONCVnonLocalOperatorSinglePrec = + oncvClassPtr->getNonLocalOperatorSinglePrec(); d_cellsBlockSizeHamiltonianConstruction = memorySpace == dftfe::utils::MemorySpace::HOST ? 1 : 50; d_cellsBlockSizeHX = memorySpace == dftfe::utils::MemorySpace::HOST ? @@ -110,6 +113,8 @@ namespace dftfe d_dftParamsPtr->memOptMode ? 1 : (d_kPointWeights.size() * (d_dftParamsPtr->spinPolarized + 1))); + d_cellHamiltonianMatrixSinglePrec.resize( + d_dftParamsPtr->useSinglePrecCheby ? d_cellHamiltonianMatrix.size() : 0); const unsigned int nCells = d_basisOperationsPtr->nCells(); const unsigned int nDofsPerCell = d_basisOperationsPtr->nDofsPerCell(); @@ -123,6 +128,12 @@ namespace dftfe ++iHamiltonian) d_cellHamiltonianMatrix[iHamiltonian].resize(nDofsPerCell * nDofsPerCell * nCells); + for (unsigned int iHamiltonian = 0; + iHamiltonian < d_cellHamiltonianMatrixSinglePrec.size(); + ++iHamiltonian) + d_cellHamiltonianMatrixSinglePrec[iHamiltonian].resize( + nDofsPerCell * nDofsPerCell * nCells); + d_basisOperationsPtrHost->reinit(0, 0, d_densityQuadratureID, false); const unsigned int numberQuadraturePoints = d_basisOperationsPtrHost->nQuadsPerCell(); @@ -617,6 +628,11 @@ namespace dftfe if constexpr (dftfe::utils::MemorySpace::DEVICE == memorySpace) if (d_dftParamsPtr->isPseudopotential) d_ONCVnonLocalOperator->initialiseOperatorActionOnX(d_kPointIndex); + if constexpr (dftfe::utils::MemorySpace::DEVICE == memorySpace) + if (d_dftParamsPtr->isPseudopotential && + d_dftParamsPtr->useSinglePrecCheby) + d_ONCVnonLocalOperatorSinglePrec->initialiseOperatorActionOnX( + d_kPointIndex); } @@ -631,10 +647,20 @@ namespace dftfe nCells * nDofsPerCell * numWaveFunctions) d_cellWaveFunctionMatrixSrc.resize(nCells * nDofsPerCell * numWaveFunctions); + if (d_dftParamsPtr->useSinglePrecCheby && + d_cellWaveFunctionMatrixSrcSinglePrec.size() < + nCells * nDofsPerCell * numWaveFunctions) + d_cellWaveFunctionMatrixSrcSinglePrec.resize(nCells * nDofsPerCell * + numWaveFunctions); if (d_cellWaveFunctionMatrixDst.size() < d_cellsBlockSizeHX * nDofsPerCell * numWaveFunctions) d_cellWaveFunctionMatrixDst.resize(d_cellsBlockSizeHX * nDofsPerCell * numWaveFunctions); + if (d_dftParamsPtr->useSinglePrecCheby && + d_cellWaveFunctionMatrixDstSinglePrec.size() < + d_cellsBlockSizeHX * nDofsPerCell * numWaveFunctions) + d_cellWaveFunctionMatrixDstSinglePrec.resize( + d_cellsBlockSizeHX * nDofsPerCell * numWaveFunctions); if (d_dftParamsPtr->isPseudopotential) { @@ -649,6 +675,23 @@ namespace dftfe d_ONCVnonLocalOperator->initialiseFlattenedDataStructure( numWaveFunctions, d_ONCVNonLocalProjectorTimesVectorBlock); } + if (d_dftParamsPtr->isPseudopotential && d_dftParamsPtr->useSinglePrecCheby) + { + if constexpr (dftfe::utils::MemorySpace::DEVICE == memorySpace) + { + d_ONCVnonLocalOperatorSinglePrec->initialiseFlattenedDataStructure( + numWaveFunctions, + d_ONCVNonLocalProjectorTimesVectorBlockSinglePrec); + d_ONCVnonLocalOperatorSinglePrec + ->initialiseCellWaveFunctionPointers( + d_cellWaveFunctionMatrixSrcSinglePrec); + } + else + d_ONCVnonLocalOperatorSinglePrec->initialiseFlattenedDataStructure( + numWaveFunctions, + d_ONCVNonLocalProjectorTimesVectorBlockSinglePrec); + } + d_basisOperationsPtr->reinit(numWaveFunctions, d_cellsBlockSizeHX, d_densityQuadratureID, @@ -696,6 +739,15 @@ namespace dftfe return d_basisOperationsPtr->getMultiVector(numVectors, index); } + template + dftfe::linearAlgebra::MultiVector & + KohnShamHamiltonianOperator::getScratchFEMultivectorSinglePrec( + const unsigned int numVectors, + const unsigned int index) + { + return d_basisOperationsPtr->getMultiVectorSinglePrec(numVectors, index); + } + template void KohnShamHamiltonianOperator< @@ -820,6 +872,11 @@ namespace dftfe 1); } } + if (d_dftParamsPtr->useSinglePrecCheby) + d_BLASWrapperPtr->copyValueType1ArrToValueType2Arr( + d_cellHamiltonianMatrix[d_HamiltonianIndex].size(), + d_cellHamiltonianMatrix[d_HamiltonianIndex].data(), + d_cellHamiltonianMatrixSinglePrec[d_HamiltonianIndex].data()); if (d_dftParamsPtr->memOptMode) if ((d_dftParamsPtr->isPseudopotential || d_dftParamsPtr->smearedNuclearCharges) && @@ -1109,6 +1166,162 @@ namespace dftfe } } + template + void + KohnShamHamiltonianOperator::HXCheby( + dftfe::linearAlgebra::MultiVector &src, + const double scalarHX, + const double scalarY, + const double scalarX, + dftfe::linearAlgebra::MultiVector &dst, + const bool onlyHPrimePartForFirstOrderDensityMatResponse, + const bool skip1, + const bool skip2, + const bool skip3) + { + const unsigned int numCells = d_basisOperationsPtr->nCells(); + const unsigned int numDoFsPerCell = d_basisOperationsPtr->nDofsPerCell(); + const unsigned int numberWavefunctions = src.numVectors(); + if (d_numVectorsInternal != numberWavefunctions) + reinitNumberWavefunctions(numberWavefunctions); + + if (d_basisOperationsPtr->d_nVectors != numberWavefunctions) + d_basisOperationsPtr->reinit(numberWavefunctions, + d_cellsBlockSizeHX, + d_densityQuadratureID, + false, + false); + const bool hasNonlocalComponents = + d_dftParamsPtr->isPseudopotential && + (d_ONCVnonLocalOperatorSinglePrec + ->getTotalNonLocalElementsInCurrentProcessor() > 0) && + !onlyHPrimePartForFirstOrderDensityMatResponse; + const dataTypes::numberFP32 scalarCoeffAlpha = dataTypes::numberFP32(1.0), + scalarCoeffBeta = dataTypes::numberFP32(0.0); + + if (!skip1 && !skip2 && !skip3) + src.updateGhostValues(); + if (!skip1) + { + d_basisOperationsPtr + ->d_constraintInfo[d_basisOperationsPtr->d_dofHandlerID] + .distribute(src); + if constexpr (memorySpace == dftfe::utils::MemorySpace::HOST) + if (d_dftParamsPtr->isPseudopotential) + d_ONCVnonLocalOperatorSinglePrec->initialiseOperatorActionOnX( + d_kPointIndex); + for (unsigned int iCell = 0; iCell < numCells; + iCell += d_cellsBlockSizeHX) + { + std::pair cellRange( + iCell, std::min(iCell + d_cellsBlockSizeHX, numCells)); + d_BLASWrapperPtr->stridedCopyToBlock( + numberWavefunctions, + numDoFsPerCell * (cellRange.second - cellRange.first), + src.data(), + d_cellWaveFunctionMatrixSrcSinglePrec.data() + + cellRange.first * numDoFsPerCell * numberWavefunctions, + d_basisOperationsPtr->d_flattenedCellDofIndexToProcessDofIndexMap + .data() + + cellRange.first * numDoFsPerCell); + if (hasNonlocalComponents) + d_ONCVnonLocalOperatorSinglePrec->applyCconjtransOnX( + d_cellWaveFunctionMatrixSrcSinglePrec.data() + + cellRange.first * numDoFsPerCell * numberWavefunctions, + cellRange); + } + } + if (!skip2) + { + if (d_dftParamsPtr->isPseudopotential && + !onlyHPrimePartForFirstOrderDensityMatResponse) + { + d_ONCVNonLocalProjectorTimesVectorBlockSinglePrec.setValue(0); + d_ONCVnonLocalOperatorSinglePrec->applyAllReduceOnCconjtransX( + d_ONCVNonLocalProjectorTimesVectorBlockSinglePrec, true); + d_ONCVNonLocalProjectorTimesVectorBlockSinglePrec + .accumulateAddLocallyOwnedBegin(); + } + src.zeroOutGhosts(); + inverseMassVectorScaledConstraintsNoneDataInfoPtr->set_zero(src); + if (d_dftParamsPtr->isPseudopotential && + !onlyHPrimePartForFirstOrderDensityMatResponse) + { + d_ONCVNonLocalProjectorTimesVectorBlockSinglePrec + .accumulateAddLocallyOwnedEnd(); + d_ONCVNonLocalProjectorTimesVectorBlockSinglePrec + .updateGhostValuesBegin(); + } + d_BLASWrapperPtr->axpby(src.locallyOwnedSize() * src.numVectors(), + scalarX, + src.data(), + scalarY, + dst.data()); + if (d_dftParamsPtr->isPseudopotential && + !onlyHPrimePartForFirstOrderDensityMatResponse) + { + d_ONCVNonLocalProjectorTimesVectorBlockSinglePrec + .updateGhostValuesEnd(); + d_ONCVnonLocalOperatorSinglePrec->applyVOnCconjtransX( + CouplingStructure::diagonal, + d_oncvClassPtr->getCouplingMatrixSinglePrec(), + d_ONCVNonLocalProjectorTimesVectorBlockSinglePrec, + true); + } + } + if (!skip3) + { + for (unsigned int iCell = 0; iCell < numCells; + iCell += d_cellsBlockSizeHX) + { + std::pair cellRange( + iCell, std::min(iCell + d_cellsBlockSizeHX, numCells)); + + d_BLASWrapperPtr->xgemmStridedBatched( + 'N', + 'N', + numberWavefunctions, + numDoFsPerCell, + numDoFsPerCell, + &scalarCoeffAlpha, + d_cellWaveFunctionMatrixSrcSinglePrec.data() + + cellRange.first * numDoFsPerCell * numberWavefunctions, + numberWavefunctions, + numDoFsPerCell * numberWavefunctions, + d_cellHamiltonianMatrixSinglePrec[d_HamiltonianIndex].data() + + cellRange.first * numDoFsPerCell * numDoFsPerCell, + numDoFsPerCell, + numDoFsPerCell * numDoFsPerCell, + &scalarCoeffBeta, + d_cellWaveFunctionMatrixDstSinglePrec.data(), + numberWavefunctions, + numDoFsPerCell * numberWavefunctions, + cellRange.second - cellRange.first); + if (hasNonlocalComponents) + d_ONCVnonLocalOperatorSinglePrec->applyCOnVCconjtransX( + d_cellWaveFunctionMatrixDstSinglePrec.data(), cellRange); + d_BLASWrapperPtr->axpyStridedBlockAtomicAdd( + numberWavefunctions, + numDoFsPerCell * (cellRange.second - cellRange.first), + scalarHX, + d_basisOperationsPtr->cellInverseMassVectorBasisData().data() + + cellRange.first * numDoFsPerCell, + d_cellWaveFunctionMatrixDstSinglePrec.data(), + dst.data(), + d_basisOperationsPtr->d_flattenedCellDofIndexToProcessDofIndexMap + .data() + + cellRange.first * numDoFsPerCell); + } + + inverseMassVectorScaledConstraintsNoneDataInfoPtr + ->distribute_slave_to_master(dst); + } + if (!skip1 && !skip2 && !skip3) + { + dst.accumulateAddLocallyOwned(); + dst.zeroOutGhosts(); + } + } template class KohnShamHamiltonianOperator; diff --git a/src/linAlg/linearAlgebraOperationsDevice.cc b/src/linAlg/linearAlgebraOperationsDevice.cc index aa25b7a35..dd353519e 100644 --- a/src/linAlg/linearAlgebraOperationsDevice.cc +++ b/src/linAlg/linearAlgebraOperationsDevice.cc @@ -490,6 +490,301 @@ namespace dftfe X2 = Y2; } + void + chebyshevFilterOverlapComputeCommunicationSinglePrec( + const std::shared_ptr< + dftfe::linearAlgebra::BLASWrapper> + & BLASWrapperPtr, + operatorDFTClass &operatorMatrix, + dftfe::linearAlgebra::MultiVector &X1, + dftfe::linearAlgebra::MultiVector &Y1, + dftfe::linearAlgebra::MultiVector &X2, + dftfe::linearAlgebra::MultiVector &Y2, + dftfe::linearAlgebra::MultiVector + &X1_SP, + dftfe::linearAlgebra::MultiVector + &Y1_SP, + dftfe::linearAlgebra::MultiVector + &X2_SP, + dftfe::linearAlgebra::MultiVector + & Y2_SP, + std::vector eigenvalues, + const unsigned int m, + const double a, + const double b, + const double a0) + { + double e, c, sigma, sigma1, sigma2, gamma, alpha1Old, alpha2Old; + e = (b - a) / 2.0; + c = (b + a) / 2.0; + sigma = e / (a0 - c); + sigma1 = sigma; + gamma = 2.0 / sigma1; + + dftfe::utils::MemoryStorage + eigenValuesFiltered, eigenValuesFiltered1, eigenValuesFiltered2; + eigenValuesFiltered.resize(eigenvalues.size()); + eigenValuesFiltered.copyFrom(eigenvalues); + eigenValuesFiltered1 = eigenValuesFiltered; + eigenValuesFiltered2 = eigenValuesFiltered; + eigenValuesFiltered1.setValue(1.0); + + // + // create YArray + // initialize to zeros. + // x + operatorMatrix.HXCheby(X1, 1.0, 0.0, 0.0, Y1); + + + // + // call HX + // + + + double alpha1 = sigma1 / e, alpha2 = -c; + eigenValuesFiltered2.setValue(alpha1 * alpha2); + BLASWrapperPtr->ApaBD(1, + eigenValuesFiltered2.size(), + alpha1, + eigenValuesFiltered2.data(), + eigenValuesFiltered1.data(), + eigenValuesFiltered.data(), + eigenValuesFiltered2.data()); + BLASWrapperPtr->ApaBD(X1.locallyOwnedSize(), + X1.numVectors(), + -1.0, + Y1.data(), + X1.data(), + eigenValuesFiltered.data(), + Y1.data()); + X1_SP.setValue(0.0); + X2_SP.setValue(0.0); + BLASWrapperPtr->copyValueType1ArrToValueType2Arr( + X1.locallyOwnedSize() * X1.numVectors(), Y1.data(), Y1_SP.data()); + BLASWrapperPtr->xscal(Y1_SP.data(), + dataTypes::numberFP32(alpha1), + X2.locallyOwnedSize() * X2.numVectors()); + X2.updateGhostValues(); + operatorMatrix.HXCheby(X2, 1.0, 0.0, 0.0, Y2, false, false, true, true); + // + // polynomial loop + // + for (unsigned int degree = 2; degree < m + 1; ++degree) + { + sigma2 = 1.0 / (gamma - sigma); + alpha1Old = alpha1, alpha2Old = alpha2; + alpha1 = 2.0 * sigma2 / e, alpha2 = -(sigma * sigma2); + + if (degree == 2) + { + operatorMatrix.HXCheby( + X2, 1.0, 0.0, 0.0, Y2, false, true, false, true); + Y1_SP.updateGhostValuesBegin(); + operatorMatrix.HXCheby( + X2, 1.0, 0.0, 0.0, Y2, false, true, true, false); + Y1_SP.updateGhostValuesEnd(); + Y2.accumulateAddLocallyOwnedBegin(); + } + else + { + operatorMatrix.HXCheby(Y2_SP, + alpha1Old, + alpha2Old, + -c * alpha1Old, + X2_SP, + false, + true, + false, + true); + Y1_SP.updateGhostValuesBegin(); + operatorMatrix.HXCheby(Y2_SP, + alpha1Old, + alpha2Old, + -c * alpha1Old, + X2_SP, + false, + true, + true, + false); + Y1_SP.updateGhostValuesEnd(); + X2_SP.accumulateAddLocallyOwnedBegin(); + } + + + // + // call HX + // + operatorMatrix.HXCheby(Y1_SP, + alpha1, + alpha2, + -c * alpha1, + X1_SP, + false, + false, + true, + true); + if (degree == 2) + { + Y2.accumulateAddLocallyOwnedEnd(); + Y2.zeroOutGhosts(); + BLASWrapperPtr->ApaBD(X2.locallyOwnedSize(), + X2.numVectors(), + -1.0, + Y2.data(), + X2.data(), + eigenValuesFiltered.data() + + X1.numVectors(), + Y2.data()); + BLASWrapperPtr->copyValueType1ArrToValueType2Arr( + X2.locallyOwnedSize() * X2.numVectors(), + Y2.data(), + Y2_SP.data()); + BLASWrapperPtr->xscal(Y2_SP.data(), + dataTypes::numberFP32(alpha1Old), + X2.locallyOwnedSize() * X2.numVectors()); + } + else + { + X2_SP.accumulateAddLocallyOwnedEnd(); + X2_SP.zeroOutGhosts(); + BLASWrapperPtr->ApaBD(X2_SP.locallyOwnedSize(), + X2_SP.numVectors(), + alpha1Old, + X2_SP.data(), + Y2.data(), + eigenValuesFiltered2.data() + + X1_SP.numVectors(), + X2_SP.data()); + BLASWrapperPtr->axpby(eigenValuesFiltered2.size(), + -c * alpha1Old, + eigenValuesFiltered2.data(), + alpha2Old, + eigenValuesFiltered1.data()); + BLASWrapperPtr->ApaBD(1, + eigenValuesFiltered1.size(), + alpha1Old, + eigenValuesFiltered1.data(), + eigenValuesFiltered2.data(), + eigenValuesFiltered.data(), + eigenValuesFiltered1.data()); + X2_SP.swap(Y2_SP); + eigenValuesFiltered1.swap(eigenValuesFiltered2); + } + + operatorMatrix.HXCheby(Y1_SP, + alpha1, + alpha2, + -c * alpha1, + X1_SP, + false, + true, + false, + true); + Y2_SP.updateGhostValuesBegin(); + operatorMatrix.HXCheby(Y1_SP, + alpha1, + alpha2, + -c * alpha1, + X1_SP, + false, + true, + true, + false); + Y2_SP.updateGhostValuesEnd(); + X1_SP.accumulateAddLocallyOwnedBegin(); + operatorMatrix.HXCheby(Y2_SP, + alpha1, + alpha2, + -c * alpha1, + X2_SP, + false, + false, + true, + true); + X1_SP.accumulateAddLocallyOwnedEnd(); + X1_SP.zeroOutGhosts(); + BLASWrapperPtr->ApaBD(X1_SP.locallyOwnedSize(), + X1_SP.numVectors(), + alpha1, + X1_SP.data(), + Y1.data(), + eigenValuesFiltered2.data(), + X1_SP.data()); + + // + // XArray = YArray + // + X1_SP.swap(Y1_SP); + + if (degree == m) + { + operatorMatrix.HXCheby(Y2_SP, + alpha1, + alpha2, + -c * alpha1, + X2_SP, + false, + true, + false, + false); + X2_SP.accumulateAddLocallyOwned(); + X2_SP.zeroOutGhosts(); + BLASWrapperPtr->ApaBD(X2_SP.locallyOwnedSize(), + X2_SP.numVectors(), + alpha1, + X2_SP.data(), + Y2.data(), + eigenValuesFiltered2.data() + + X1_SP.numVectors(), + X2_SP.data()); + BLASWrapperPtr->axpby(eigenValuesFiltered2.size(), + -c * alpha1, + eigenValuesFiltered2.data(), + alpha2, + eigenValuesFiltered1.data()); + BLASWrapperPtr->ApaBD(1, + eigenValuesFiltered1.size(), + alpha1, + eigenValuesFiltered1.data(), + eigenValuesFiltered2.data(), + eigenValuesFiltered.data(), + eigenValuesFiltered1.data()); + X2_SP.swap(Y2_SP); + eigenValuesFiltered1.swap(eigenValuesFiltered2); + } + + // + // YArray = YNewArray + // + sigma = sigma2; + } + + // copy back YArray to XArray + BLASWrapperPtr->ApaBD(X1.locallyOwnedSize(), + X1.numVectors(), + 1.0, + Y1_SP.data(), + X1.data(), + eigenValuesFiltered2.data(), + X1.data()); + + BLASWrapperPtr->ApaBD(X2.locallyOwnedSize(), + X2.numVectors(), + 1.0, + Y2_SP.data(), + X2.data(), + eigenValuesFiltered2.data() + X1.numVectors(), + X2.data()); + } + void subspaceRotationSpectrumSplitScalapack( diff --git a/src/linAlg/linearAlgebraOperationsOpt.cc b/src/linAlg/linearAlgebraOperationsOpt.cc index 31e68657a..ed09bdb21 100644 --- a/src/linAlg/linearAlgebraOperationsOpt.cc +++ b/src/linAlg/linearAlgebraOperationsOpt.cc @@ -336,6 +336,129 @@ namespace dftfe X = Y; } + // + // chebyshev filtering of given subspace XArray + // + template + void + chebyshevFilterSinglePrec( + const std::shared_ptr> + & BLASWrapperPtr, + operatorDFTClass & operatorMatrix, + dftfe::linearAlgebra::MultiVector & X, + dftfe::linearAlgebra::MultiVector & Y, + dftfe::linearAlgebra::MultiVector &X_SP, + dftfe::linearAlgebra::MultiVector &Y_SP, + std::vector eigenvalues, + const unsigned int m, + const double a, + const double b, + const double a0) + { + double e, c, sigma, sigma1, sigma2, gamma; + e = (b - a) / 2.0; + c = (b + a) / 2.0; + sigma = e / (a0 - c); + sigma1 = sigma; + gamma = 2.0 / sigma1; + + dftfe::utils::MemoryStorage eigenValuesFiltered, + eigenValuesFiltered1, eigenValuesFiltered2; + eigenValuesFiltered.resize(eigenvalues.size()); + eigenValuesFiltered.copyFrom(eigenvalues); + eigenValuesFiltered1 = eigenValuesFiltered; + eigenValuesFiltered2 = eigenValuesFiltered; + eigenValuesFiltered1.setValue(1.0); + // + // create YArray + // initialize to zeros. + // x + operatorMatrix.HXCheby(X, 1.0, 0.0, 0.0, Y); + // Y=HX + // + // call HX + // + double alpha1 = sigma1 / e, alpha2 = -c; + // Y=alpha1*HX+alpha1 * alpha2*X + eigenValuesFiltered2.setValue(alpha1 * alpha2); + BLASWrapperPtr->ApaBD(1, + eigenValuesFiltered2.size(), + alpha1, + eigenValuesFiltered2.data(), + eigenValuesFiltered1.data(), + eigenValuesFiltered.data(), + eigenValuesFiltered2.data()); + BLASWrapperPtr->ApaBD(X.locallyOwnedSize(), + X.numVectors(), + -1.0, + Y.data(), + X.data(), + eigenValuesFiltered.data(), + Y.data()); + X_SP.setValue(0.0); + BLASWrapperPtr->copyValueType1ArrToValueType2Arr( + X.locallyOwnedSize() * X.numVectors(), Y.data(), Y_SP.data()); + BLASWrapperPtr->xscal(Y_SP.data(), + dataTypes::numberFP32(alpha1), + X.locallyOwnedSize() * X.numVectors()); + // + // polynomial loop + // + for (unsigned int degree = 2; degree < m + 1; ++degree) + { + sigma2 = 1.0 / (gamma - sigma); + alpha1 = 2.0 * sigma2 / e, alpha2 = -(sigma * sigma2); + + + operatorMatrix.HXCheby(Y_SP, alpha1, alpha2, -c * alpha1, X_SP); + BLASWrapperPtr->ApaBD(X.locallyOwnedSize(), + X.numVectors(), + alpha1, + X_SP.data(), + Y.data(), + eigenValuesFiltered2.data(), + X_SP.data()); + + // + // call HX + // + // operatorMatrix.HXCheby( + // Y, X, Y, eigenValuesFiltered2, alpha1, alpha2, -c * alpha1, X); + BLASWrapperPtr->axpby(eigenValuesFiltered2.size(), + -c * alpha1, + eigenValuesFiltered2.data(), + alpha2, + eigenValuesFiltered1.data()); + BLASWrapperPtr->ApaBD(1, + eigenValuesFiltered1.size(), + alpha1, + eigenValuesFiltered1.data(), + eigenValuesFiltered2.data(), + eigenValuesFiltered.data(), + eigenValuesFiltered1.data()); + + + // + // XArray = YArray + // + X_SP.swap(Y_SP); + eigenValuesFiltered1.swap(eigenValuesFiltered2); + + // + // YArray = YNewArray + // + sigma = sigma2; + } + BLASWrapperPtr->ApaBD(X.locallyOwnedSize(), + X.numVectors(), + 1.0, + Y_SP.data(), + X.data(), + eigenValuesFiltered2.data(), + X.data()); + + // copy back YArray to XArray + } template @@ -2912,18 +3035,20 @@ namespace dftfe 1, operatorMatrix.getMPICommunicatorDomain(), &YNorm); + double lowerBound = std::floor(eigenValuesT[0]); + double upperBound = + std::ceil(eigenValuesT[lanczosIterations - 1] + + (dftParams.reproducible_output ? YNorm : YNorm / 10.0)); - if (dftParams.verbosity >= 5 && this_mpi_process == 0) + if (dftParams.verbosity >= 4 && this_mpi_process == 0) { std::cout << "bUp1: " << eigenValuesT[lanczosIterations - 1] << ", fvector norm: " << YNorm << std::endl; std::cout << "aLow: " << eigenValuesT[0] << std::endl; + std::cout << "boundL: " << lowerBound << std::endl; + std::cout << "boundU: " << upperBound << std::endl; } - double lowerBound = std::floor(eigenValuesT[0]); - double upperBound = - std::ceil(eigenValuesT[lanczosIterations - 1] + - (dftParams.reproducible_output ? YNorm : YNorm / 10.0)); return (std::make_pair(lowerBound, upperBound)); } @@ -3694,6 +3819,26 @@ namespace dftfe const double, const double, const double); + template void + chebyshevFilterSinglePrec( + const std::shared_ptr< + dftfe::linearAlgebra::BLASWrapper> + & BLASWrapperPtr, + operatorDFTClass &operatorMatrix, + dftfe::linearAlgebra::MultiVector &X, + dftfe::linearAlgebra::MultiVector &Y, + dftfe::linearAlgebra::MultiVector &X_SP, + dftfe::linearAlgebra::MultiVector &Y_SP, + std::vector eigenvalues, + const unsigned int m, + const double a, + const double b, + const double a0); + #ifdef DFTFE_WITH_DEVICE template void chebyshevFilter( @@ -3706,6 +3851,27 @@ namespace dftfe const double, const double, const double); + template void + chebyshevFilterSinglePrec( + const std::shared_ptr< + dftfe::linearAlgebra::BLASWrapper> + & BLASWrapperPtr, + operatorDFTClass &operatorMatrix, + dftfe::linearAlgebra::MultiVector &X, + dftfe::linearAlgebra::MultiVector &Y, + dftfe::linearAlgebra::MultiVector + &X_SP, + dftfe::linearAlgebra::MultiVector + & Y_SP, + std::vector eigenvalues, + const unsigned int m, + const double a, + const double b, + const double a0); #endif diff --git a/src/pseudo/oncv/oncvClass.cc b/src/pseudo/oncv/oncvClass.cc index aff7df6de..7190f30fe 100644 --- a/src/pseudo/oncv/oncvClass.cc +++ b/src/pseudo/oncv/oncvClass.cc @@ -118,7 +118,8 @@ namespace dftfe unsigned int densityQuadratureIdElectro, std::shared_ptr excFunctionalPtr, const std::vector> &atomLocations, - unsigned int numEigenValues) + unsigned int numEigenValues, + const bool singlePrecNonLocalOperator) { MPI_Barrier(d_mpiCommParent); d_BasisOperatorHostPtr = basisOperationsHostPtr; @@ -142,6 +143,7 @@ namespace dftfe d_nlpspQuadratureId = nlpspQuadratureId; d_excManagerPtr = excFunctionalPtr; d_numEigenValues = numEigenValues; + d_singlePrecNonLocalOperator = singlePrecNonLocalOperator; createAtomCenteredSphericalFunctionsForDensities(); createAtomCenteredSphericalFunctionsForProjectors(); @@ -161,6 +163,15 @@ namespace dftfe d_BasisOperatorHostPtr, d_atomicProjectorFnsContainer, d_mpiCommParent); + if constexpr (dftfe::utils::MemorySpace::HOST == memorySpace) + if (d_singlePrecNonLocalOperator) + d_nonLocalOperatorSinglePrec = + std::make_shared::type, + memorySpace>>(d_BLASWrapperHostPtr, + d_BasisOperatorHostPtr, + d_atomicProjectorFnsContainer, + d_mpiCommParent); } #if defined(DFTFE_WITH_DEVICE) else @@ -172,6 +183,15 @@ namespace dftfe d_BasisOperatorDevicePtr, d_atomicProjectorFnsContainer, d_mpiCommParent); + if constexpr (dftfe::utils::MemorySpace::DEVICE == memorySpace) + if (d_singlePrecNonLocalOperator) + d_nonLocalOperatorSinglePrec = + std::make_shared::type, + memorySpace>>(d_BLASWrapperDevicePtr, + d_BasisOperatorDevicePtr, + d_atomicProjectorFnsContainer, + d_mpiCommParent); } #endif @@ -206,7 +226,8 @@ namespace dftfe if (updateNonlocalSparsity) { - d_HamiltonianCouplingMatrixEntriesUpdated = false; + d_HamiltonianCouplingMatrixEntriesUpdated = false; + d_HamiltonianCouplingMatrixSinglePrecEntriesUpdated = false; MPI_Barrier(d_mpiCommParent); double InitTime = MPI_Wtime(); d_atomicProjectorFnsContainer->computeSparseStructure( @@ -226,6 +247,15 @@ namespace dftfe kPointCoordinates, d_BasisOperatorHostPtr, d_nlpspQuadratureId); + if (d_singlePrecNonLocalOperator) + d_nonLocalOperatorSinglePrec + ->intitialisePartitionerKPointsAndComputeCMatrixEntries( + updateNonlocalSparsity, + kPointWeights, + kPointCoordinates, + d_BasisOperatorHostPtr, + d_nlpspQuadratureId); + MPI_Barrier(d_mpiCommParent); double TotalTime = MPI_Wtime() - InitTimeTotal; @@ -270,7 +300,8 @@ namespace dftfe if (updateNonlocalSparsity) { - d_HamiltonianCouplingMatrixEntriesUpdated = false; + d_HamiltonianCouplingMatrixEntriesUpdated = false; + d_HamiltonianCouplingMatrixSinglePrecEntriesUpdated = false; MPI_Barrier(d_mpiCommParent); double InitTime = MPI_Wtime(); d_atomicProjectorFnsContainer->getDataForSparseStructure( @@ -295,6 +326,15 @@ namespace dftfe kPointCoordinates, d_BasisOperatorHostPtr, d_nlpspQuadratureId); + if (d_singlePrecNonLocalOperator) + d_nonLocalOperatorSinglePrec + ->intitialisePartitionerKPointsAndComputeCMatrixEntries( + updateNonlocalSparsity, + kPointWeights, + kPointCoordinates, + d_BasisOperatorHostPtr, + d_nlpspQuadratureId); + MPI_Barrier(d_mpiCommParent); double TotalTime = MPI_Wtime() - InitTimeTotal; if (d_verbosity >= 2) @@ -678,6 +718,36 @@ namespace dftfe #endif } + template + const dftfe::utils::MemoryStorage< + typename dftfe::dataTypes::singlePrecType::type, + memorySpace> & + oncvClass::getCouplingMatrixSinglePrec() + { + getCouplingMatrix(); + if (!d_HamiltonianCouplingMatrixSinglePrecEntriesUpdated) + { + d_couplingMatrixEntriesSinglePrec.resize( + d_couplingMatrixEntries.size()); + if constexpr (memorySpace == dftfe::utils::MemorySpace::HOST) + d_BLASWrapperHostPtr->copyValueType1ArrToValueType2Arr( + d_couplingMatrixEntriesSinglePrec.size(), + d_couplingMatrixEntries.data(), + d_couplingMatrixEntriesSinglePrec.data()); +#if defined(DFTFE_WITH_DEVICE) + else + d_BLASWrapperDevicePtr->copyValueType1ArrToValueType2Arr( + d_couplingMatrixEntriesSinglePrec.size(), + d_couplingMatrixEntries.data(), + d_couplingMatrixEntriesSinglePrec.data()); +#endif + + d_HamiltonianCouplingMatrixSinglePrecEntriesUpdated = true; + } + + return (d_couplingMatrixEntriesSinglePrec); + } + template const std::shared_ptr> @@ -686,6 +756,15 @@ namespace dftfe return d_nonLocalOperator; } + template + const std::shared_ptr::type, + memorySpace>> + oncvClass::getNonLocalOperatorSinglePrec() + { + return d_nonLocalOperatorSinglePrec; + } + template diff --git a/src/solvers/eigenSolvers/chebyshevOrthogonalizedSubspaceIterationSolver.cc b/src/solvers/eigenSolvers/chebyshevOrthogonalizedSubspaceIterationSolver.cc index 80147c97b..268b4bef2 100644 --- a/src/solvers/eigenSolvers/chebyshevOrthogonalizedSubspaceIterationSolver.cc +++ b/src/solvers/eigenSolvers/chebyshevOrthogonalizedSubspaceIterationSolver.cc @@ -140,8 +140,11 @@ namespace dftfe void chebyshevOrthogonalizedSubspaceIterationSolver::solve( operatorDFTClass &operatorMatrix, - elpaScalaManager & elpaScala, - dataTypes::number * eigenVectorsFlattened, + const std::shared_ptr< + dftfe::linearAlgebra::BLASWrapper> + & BLASWrapperPtr, + elpaScalaManager & elpaScala, + dataTypes::number * eigenVectorsFlattened, dataTypes::number * eigenVectorsRotFracDensityFlattened, const unsigned int totalNumberWaveFunctions, const unsigned int localVectorSize, @@ -149,6 +152,7 @@ namespace dftfe std::vector &residualNorms, const MPI_Comm & interBandGroupComm, const MPI_Comm & mpiCommDomain, + const bool isFirstFilteringCall, const bool computeResidual, const bool useMixedPrec, const bool isFirstScf) @@ -240,6 +244,20 @@ namespace dftfe distributedCPUMultiVec *eigenVectorsFlattenedArrayBlock2 = &operatorMatrix.getScratchFEMultivector(vectorsBlockSize, 1); + distributedCPUMultiVec + *eigenVectorsFlattenedArrayBlockFP32 = + d_dftParams.useSinglePrecCheby ? + &operatorMatrix.getScratchFEMultivectorSinglePrec(vectorsBlockSize, + 0) : + NULL; + distributedCPUMultiVec + *eigenVectorsFlattenedArrayBlock2FP32 = + d_dftParams.useSinglePrecCheby ? + &operatorMatrix.getScratchFEMultivectorSinglePrec(vectorsBlockSize, + 1) : + NULL; + + std::vector eigenValuesBlock(vectorsBlockSize); /// storage for cell wavefunction matrix std::vector cellWaveFunctionMatrix; @@ -267,6 +285,15 @@ namespace dftfe &operatorMatrix.getScratchFEMultivector(BVec, 0); eigenVectorsFlattenedArrayBlock2 = &operatorMatrix.getScratchFEMultivector(BVec, 1); + if (d_dftParams.useSinglePrecCheby) + { + eigenVectorsFlattenedArrayBlockFP32 = + &operatorMatrix.getScratchFEMultivectorSinglePrec(BVec, + 0); + eigenVectorsFlattenedArrayBlock2FP32 = + &operatorMatrix.getScratchFEMultivectorSinglePrec(BVec, + 1); + } } // fill the eigenVectorsFlattenedArrayBlock from @@ -288,17 +315,36 @@ namespace dftfe // call Chebyshev filtering function only for the current block to // be filtered and does in-place filtering computing_timer.enter_subsection("Chebyshev filtering"); - - - - linearAlgebraOperations::chebyshevFilter( - operatorMatrix, - *eigenVectorsFlattenedArrayBlock, - *eigenVectorsFlattenedArrayBlock2, - chebyshevOrder, - d_lowerBoundUnWantedSpectrum, - d_upperBoundUnWantedSpectrum, - d_lowerBoundWantedSpectrum); + if (d_dftParams.useSinglePrecCheby && !isFirstFilteringCall) + { + eigenValuesBlock.resize(BVec); + for (unsigned int i = 0; i < BVec; i++) + { + eigenValuesBlock[i] = eigenValues[jvec + i]; + } + + linearAlgebraOperations::chebyshevFilterSinglePrec( + BLASWrapperPtr, + operatorMatrix, + (*eigenVectorsFlattenedArrayBlock), + (*eigenVectorsFlattenedArrayBlock2), + (*eigenVectorsFlattenedArrayBlockFP32), + (*eigenVectorsFlattenedArrayBlock2FP32), + eigenValuesBlock, + chebyshevOrder, + d_lowerBoundUnWantedSpectrum, + d_upperBoundUnWantedSpectrum, + d_lowerBoundWantedSpectrum); + } + else + linearAlgebraOperations::chebyshevFilter( + operatorMatrix, + *eigenVectorsFlattenedArrayBlock, + *eigenVectorsFlattenedArrayBlock2, + chebyshevOrder, + d_lowerBoundUnWantedSpectrum, + d_upperBoundUnWantedSpectrum, + d_lowerBoundWantedSpectrum); computing_timer.leave_subsection("Chebyshev filtering"); diff --git a/src/solvers/eigenSolvers/chebyshevOrthogonalizedSubspaceIterationSolverDevice.cc b/src/solvers/eigenSolvers/chebyshevOrthogonalizedSubspaceIterationSolverDevice.cc index 32679f1a0..36c99f402 100644 --- a/src/solvers/eigenSolvers/chebyshevOrthogonalizedSubspaceIterationSolverDevice.cc +++ b/src/solvers/eigenSolvers/chebyshevOrthogonalizedSubspaceIterationSolverDevice.cc @@ -216,7 +216,26 @@ namespace dftfe d_dftParams.overlapComputeCommunCheby ? &operatorMatrix.getScratchFEMultivector(vectorsBlockSize, 3) : NULL; + distributedDeviceVec *XBlockFP32 = + d_dftParams.useSinglePrecCheby ? + &operatorMatrix.getScratchFEMultivectorSinglePrec(vectorsBlockSize, 0) : + NULL; + distributedDeviceVec *HXBlockFP32 = + d_dftParams.useSinglePrecCheby ? + &operatorMatrix.getScratchFEMultivectorSinglePrec(vectorsBlockSize, 1) : + NULL; + + distributedDeviceVec *XBlock2FP32 = + d_dftParams.overlapComputeCommunCheby && d_dftParams.useSinglePrecCheby ? + &operatorMatrix.getScratchFEMultivectorSinglePrec(vectorsBlockSize, 2) : + NULL; + distributedDeviceVec *HXBlock2FP32 = + d_dftParams.overlapComputeCommunCheby && d_dftParams.useSinglePrecCheby ? + &operatorMatrix.getScratchFEMultivectorSinglePrec(vectorsBlockSize, 3) : + NULL; + operatorMatrix.reinitNumberWavefunctions(vectorsBlockSize); + std::vector eigenValuesBlock(vectorsBlockSize); if (isFirstFilteringCall) { @@ -272,7 +291,7 @@ namespace dftfe dftfe::utils::deviceSynchronize(); computingTimerStandard.leave_subsection("Lanczos upper bound"); } - + // d_lowerBoundWantedSpectrum = bounds.first; d_upperBoundUnWantedSpectrum = bounds.second; } @@ -323,6 +342,11 @@ namespace dftfe "lower bound of unwanted spectrum", d_lowerBoundUnWantedSpectrum); pcout << buffer; + sprintf(buffer, + "%s:%18.10e\n", + "lower bound of wanted spectrum", + d_lowerBoundWantedSpectrum); + pcout << buffer; sprintf(buffer, "%s: %u\n\n", "Chebyshev polynomial degree", @@ -396,8 +420,94 @@ namespace dftfe // call Chebyshev filtering function only for the current block // or two simulataneous blocks (in case of overlap computation // and communication) to be filtered and does in-place filtering - if (d_dftParams.overlapComputeCommunCheby && - numSimultaneousBlocksCurrent == 2) + if (d_dftParams.useSinglePrecCheby && !isFirstFilteringCall) + { + eigenValuesBlock.resize(vectorsBlockSize * + numSimultaneousBlocksCurrent); + if (d_dftParams.overlapComputeCommunCheby && + numSimultaneousBlocksCurrent == 2) + { + for (unsigned int i = 0; i < 2 * BVec; i++) + { + eigenValuesBlock[i] = eigenValues[jvec + i]; + } + if (useMixedPrecOverall && d_dftParams.useMixedPrecCheby) + { + (*XBlock).setCommunicationPrecision( + dftfe::utils::mpi::communicationPrecision::single); + (*HXBlock).setCommunicationPrecision( + dftfe::utils::mpi::communicationPrecision::single); + (*XBlock2).setCommunicationPrecision( + dftfe::utils::mpi::communicationPrecision::single); + (*HXBlock2).setCommunicationPrecision( + dftfe::utils::mpi::communicationPrecision::single); + } + linearAlgebraOperationsDevice:: + chebyshevFilterOverlapComputeCommunicationSinglePrec( + BLASWrapperPtr, + operatorMatrix, + (*XBlock), + (*HXBlock), + (*XBlock2), + (*HXBlock2), + (*XBlockFP32), + (*HXBlockFP32), + (*XBlock2FP32), + (*HXBlock2FP32), + eigenValuesBlock, + chebyshevOrder, + d_lowerBoundUnWantedSpectrum, + d_upperBoundUnWantedSpectrum, + d_lowerBoundWantedSpectrum); + if (useMixedPrecOverall && d_dftParams.useMixedPrecCheby) + { + (*XBlock).setCommunicationPrecision( + dftfe::utils::mpi::communicationPrecision::full); + (*HXBlock).setCommunicationPrecision( + dftfe::utils::mpi::communicationPrecision::full); + (*XBlock2).setCommunicationPrecision( + dftfe::utils::mpi::communicationPrecision::full); + (*HXBlock2).setCommunicationPrecision( + dftfe::utils::mpi::communicationPrecision::full); + } + } + else + { + for (unsigned int i = 0; i < BVec; i++) + { + eigenValuesBlock[i] = eigenValues[jvec + i]; + } + if (useMixedPrecOverall && d_dftParams.useMixedPrecCheby) + { + (*XBlock).setCommunicationPrecision( + dftfe::utils::mpi::communicationPrecision::single); + (*HXBlock).setCommunicationPrecision( + dftfe::utils::mpi::communicationPrecision::single); + } + linearAlgebraOperations::chebyshevFilterSinglePrec( + BLASWrapperPtr, + operatorMatrix, + (*XBlock), + (*HXBlock), + (*XBlockFP32), + (*HXBlockFP32), + eigenValuesBlock, + chebyshevOrder, + d_lowerBoundUnWantedSpectrum, + d_upperBoundUnWantedSpectrum, + d_lowerBoundWantedSpectrum); + + if (useMixedPrecOverall && d_dftParams.useMixedPrecCheby) + { + (*XBlock).setCommunicationPrecision( + dftfe::utils::mpi::communicationPrecision::full); + (*HXBlock).setCommunicationPrecision( + dftfe::utils::mpi::communicationPrecision::full); + } + } + } + else if (d_dftParams.overlapComputeCommunCheby && + numSimultaneousBlocksCurrent == 2) { if (useMixedPrecOverall && d_dftParams.useMixedPrecCheby) { diff --git a/utils/BLASWrapperDevice.cu.cc b/utils/BLASWrapperDevice.cu.cc index 184d66737..526072ed4 100644 --- a/utils/BLASWrapperDevice.cu.cc +++ b/utils/BLASWrapperDevice.cu.cc @@ -575,6 +575,32 @@ namespace dftfe beta); } + template + void + BLASWrapper::ApaBD( + const unsigned int m, + const unsigned int n, + const ValueType0 alpha, + const ValueType1 * A, + const ValueType2 * B, + const ValueType3 * D, + ValueType4 * C) const + { + ApaBDDeviceKernel<<<(n * m / dftfe::utils::DEVICE_BLOCK_SIZE) + 1, + dftfe::utils::DEVICE_BLOCK_SIZE>>>( + m, + n, + alpha, + dftfe::utils::makeDataTypeDeviceCompatible(A), + dftfe::utils::makeDataTypeDeviceCompatible(B), + dftfe::utils::makeDataTypeDeviceCompatible(D), + dftfe::utils::makeDataTypeDeviceCompatible(C)); + } + template void @@ -598,7 +624,7 @@ namespace dftfe } - template + template void BLASWrapper::axpyStridedBlockAtomicAdd( const dftfe::size_type contiguousBlockSize, @@ -606,7 +632,7 @@ namespace dftfe const ValueType1 a, const ValueType1 * s, const ValueType2 * addFromVec, - ValueType2 * addToVec, + ValueType3 * addToVec, const dftfe::global_size_type *addToVecStartingContiguousBlockIds) const { axpyStridedBlockAtomicAddDeviceKernel<<< @@ -1607,6 +1633,46 @@ namespace dftfe std::complex * addToVec, const dftfe::global_size_type *addToVecStartingContiguousBlockIds) const; + template void + BLASWrapper::axpyStridedBlockAtomicAdd( + const dftfe::size_type contiguousBlockSize, + const dftfe::size_type numContiguousBlocks, + const double a, + const double * s, + const float * addFromVec, + float * addToVec, + const dftfe::global_size_type *addToVecStartingContiguousBlockIds) const; + + template void + BLASWrapper::axpyStridedBlockAtomicAdd( + const dftfe::size_type contiguousBlockSize, + const dftfe::size_type numContiguousBlocks, + const double a, + const double * s, + const std::complex * addFromVec, + std::complex * addToVec, + const dftfe::global_size_type *addToVecStartingContiguousBlockIds) const; + + template void + BLASWrapper::axpyStridedBlockAtomicAdd( + const dftfe::size_type contiguousBlockSize, + const dftfe::size_type numContiguousBlocks, + const float a, + const float * s, + const float * addFromVec, + float * addToVec, + const dftfe::global_size_type *addToVecStartingContiguousBlockIds) const; + + template void + BLASWrapper::axpyStridedBlockAtomicAdd( + const dftfe::size_type contiguousBlockSize, + const dftfe::size_type numContiguousBlocks, + const float a, + const float * s, + const std::complex * addFromVec, + std::complex * addToVec, + const dftfe::global_size_type *addToVecStartingContiguousBlockIds) const; + template void BLASWrapper::axpby(const unsigned int n, const double alpha, @@ -1623,6 +1689,22 @@ namespace dftfe const double beta, std::complex * y) const; + template void + BLASWrapper::axpby(const unsigned int n, + const double alpha, + const float *x, + const double beta, + float * y) const; + + + template void + BLASWrapper::axpby( + const unsigned int n, + const double alpha, + const std::complex *x, + const double beta, + std::complex * y) const; + // for xscal template void BLASWrapper::xscal( @@ -1790,6 +1872,12 @@ namespace dftfe copyValueType1ArrToValueType2Arr(const dftfe::size_type size, const double * valueType1Arr, float * valueType2Arr); + template void + BLASWrapper:: + copyValueType1ArrToValueType2Arr( + const dftfe::size_type size, + const std::complex *valueType1Arr, + std::complex * valueType2Arr); template void BLASWrapper:: @@ -1808,6 +1896,60 @@ namespace dftfe const dftfe::size_type startingId, const std::complex *copyFromVec, std::complex * copyToVec); + template void + BLASWrapper::ApaBD(const unsigned int m, + const unsigned int n, + const double alpha, + const double *A, + const double *B, + const double *D, + double *C) const; + template void + BLASWrapper::ApaBD( + const unsigned int m, + const unsigned int n, + const double alpha, + const std::complex *A, + const std::complex *B, + const double * D, + std::complex * C) const; + + template void + BLASWrapper::ApaBD(const unsigned int m, + const unsigned int n, + const double alpha, + const float * A, + const double *B, + const double *D, + float *C) const; + template void + BLASWrapper::ApaBD(const unsigned int m, + const unsigned int n, + const double alpha, + const float * A, + const double *B, + const double *D, + double *C) const; + template void + BLASWrapper::ApaBD( + const unsigned int m, + const unsigned int n, + const double alpha, + const std::complex * A, + const std::complex *B, + const double * D, + std::complex * C) const; + + template void + BLASWrapper::ApaBD( + const unsigned int m, + const unsigned int n, + const double alpha, + const std::complex * A, + const std::complex *B, + const double * D, + std::complex * C) const; + template void BLASWrapper::copyRealArrsToComplexArr( diff --git a/utils/BLASWrapperDevice.hip.cc b/utils/BLASWrapperDevice.hip.cc index db616689f..da1b697c1 100644 --- a/utils/BLASWrapperDevice.hip.cc +++ b/utils/BLASWrapperDevice.hip.cc @@ -657,6 +657,35 @@ namespace dftfe beta); } + template + void + BLASWrapper::ApaBD( + const unsigned int m, + const unsigned int n, + const ValueType0 alpha, + const ValueType1 * A, + const ValueType2 * B, + const ValueType3 * D, + ValueType4 * C) const + { + hipLaunchKernelGGL(ApaBDDeviceKernel, + (n * m / dftfe::utils::DEVICE_BLOCK_SIZE) + 1, + dftfe::utils::DEVICE_BLOCK_SIZE, + 0, + 0, + m, + n, + alpha, + dftfe::utils::makeDataTypeDeviceCompatible(A), + dftfe::utils::makeDataTypeDeviceCompatible(B), + dftfe::utils::makeDataTypeDeviceCompatible(D), + dftfe::utils::makeDataTypeDeviceCompatible(C)); + } + template void @@ -681,7 +710,7 @@ namespace dftfe addToVecStartingContiguousBlockIds); } - template + template void BLASWrapper::axpyStridedBlockAtomicAdd( const dftfe::size_type contiguousBlockSize, @@ -689,7 +718,7 @@ namespace dftfe const ValueType1 a, const ValueType1 * s, const ValueType2 * addFromVec, - ValueType2 * addToVec, + ValueType3 * addToVec, const dftfe::global_size_type *addToVecStartingContiguousBlockIds) const { hipLaunchKernelGGL(axpyStridedBlockAtomicAddDeviceKernel, @@ -1847,6 +1876,14 @@ namespace dftfe copyValueType1ArrToValueType2Arr(const dftfe::size_type size, const double * valueType1Arr, float * valueType2Arr); + + template void + BLASWrapper:: + copyValueType1ArrToValueType2Arr( + const dftfe::size_type size, + const std::complex *valueType1Arr, + std::complex * valueType2Arr); + template void BLASWrapper:: stridedCopyToBlockConstantStride(const dftfe::size_type blockSizeTo, @@ -1903,6 +1940,45 @@ namespace dftfe std::complex * addToVec, const dftfe::global_size_type *addToVecStartingContiguousBlockIds) const; + template void + BLASWrapper::axpyStridedBlockAtomicAdd( + const dftfe::size_type contiguousBlockSize, + const dftfe::size_type numContiguousBlocks, + const double a, + const double * s, + const float * addFromVec, + float * addToVec, + const dftfe::global_size_type *addToVecStartingContiguousBlockIds) const; + + template void + BLASWrapper::axpyStridedBlockAtomicAdd( + const dftfe::size_type contiguousBlockSize, + const dftfe::size_type numContiguousBlocks, + const double a, + const double * s, + const std::complex * addFromVec, + std::complex * addToVec, + const dftfe::global_size_type *addToVecStartingContiguousBlockIds) const; + + template void + BLASWrapper::axpyStridedBlockAtomicAdd( + const dftfe::size_type contiguousBlockSize, + const dftfe::size_type numContiguousBlocks, + const float a, + const float * s, + const float * addFromVec, + float * addToVec, + const dftfe::global_size_type *addToVecStartingContiguousBlockIds) const; + + template void + BLASWrapper::axpyStridedBlockAtomicAdd( + const dftfe::size_type contiguousBlockSize, + const dftfe::size_type numContiguousBlocks, + const float a, + const float * s, + const std::complex * addFromVec, + std::complex * addToVec, + const dftfe::global_size_type *addToVecStartingContiguousBlockIds) const; template void BLASWrapper::axpby(const unsigned int n, @@ -1920,6 +1996,76 @@ namespace dftfe const double beta, std::complex * y) const; + template void + BLASWrapper::axpby(const unsigned int n, + const double alpha, + const float *x, + const double beta, + float * y) const; + + + template void + BLASWrapper::axpby( + const unsigned int n, + const double alpha, + const std::complex *x, + const double beta, + std::complex * y) const; + + template void + BLASWrapper::ApaBD(const unsigned int m, + const unsigned int n, + const double alpha, + const double *A, + const double *B, + const double *D, + double *C) const; + template void + BLASWrapper::ApaBD( + const unsigned int m, + const unsigned int n, + const double alpha, + const std::complex *A, + const std::complex *B, + const double * D, + std::complex * C) const; + + template void + BLASWrapper::ApaBD(const unsigned int m, + const unsigned int n, + const double alpha, + const float * A, + const double *B, + const double *D, + float *C) const; + template void + BLASWrapper::ApaBD(const unsigned int m, + const unsigned int n, + const double alpha, + const float * A, + const double *B, + const double *D, + double *C) const; + template void + BLASWrapper::ApaBD( + const unsigned int m, + const unsigned int n, + const double alpha, + const std::complex * A, + const std::complex *B, + const double * D, + std::complex * C) const; + + template void + BLASWrapper::ApaBD( + const unsigned int m, + const unsigned int n, + const double alpha, + const std::complex * A, + const std::complex *B, + const double * D, + std::complex * C) const; + template void BLASWrapper::copyRealArrsToComplexArr( const dftfe::size_type size, diff --git a/utils/BLASWrapperDeviceKernels.cc b/utils/BLASWrapperDeviceKernels.cc index e0f1d6682..399584065 100644 --- a/utils/BLASWrapperDeviceKernels.cc +++ b/utils/BLASWrapperDeviceKernels.cc @@ -397,6 +397,183 @@ namespace dftfe } } + __global__ void + axpyStridedBlockAtomicAddDeviceKernel( + const dftfe::size_type contiguousBlockSize, + const dftfe::size_type numContiguousBlocks, + const double a, + const double * s, + const float * addFromVec, + double * addToVec, + const dftfe::global_size_type *addToVecStartingContiguousBlockIds) + { + const dftfe::size_type globalThreadId = + blockIdx.x * blockDim.x + threadIdx.x; + const dftfe::size_type numberEntries = + numContiguousBlocks * contiguousBlockSize; + + for (dftfe::size_type index = globalThreadId; index < numberEntries; + index += blockDim.x * gridDim.x) + { + dftfe::size_type blockIndex = index / contiguousBlockSize; + dftfe::size_type intraBlockIndex = index % contiguousBlockSize; + const double coeff = dftfe::utils::mult(a, s[blockIndex]); + atomicAdd(&addToVec[addToVecStartingContiguousBlockIds[blockIndex] + + intraBlockIndex], + dftfe::utils::mult(addFromVec[index], coeff)); + } + } + + __global__ void + axpyStridedBlockAtomicAddDeviceKernel( + const dftfe::size_type contiguousBlockSize, + const dftfe::size_type numContiguousBlocks, + const double a, + const double * s, + const dftfe::utils::deviceFloatComplex *addFromVec, + dftfe::utils::deviceDoubleComplex * addToVec, + const dftfe::global_size_type *addToVecStartingContiguousBlockIds) + { + const dftfe::size_type globalThreadId = + blockIdx.x * blockDim.x + threadIdx.x; + const dftfe::size_type numberEntries = + numContiguousBlocks * contiguousBlockSize; + + for (dftfe::size_type index = globalThreadId; index < numberEntries; + index += blockDim.x * gridDim.x) + { + dftfe::size_type blockIndex = index / contiguousBlockSize; + dftfe::size_type intraBlockIndex = index % contiguousBlockSize; + const double coeff = dftfe::utils::mult(a, s[blockIndex]); + atomicAdd(&(addToVec[addToVecStartingContiguousBlockIds[blockIndex] + + intraBlockIndex] + .x), + dftfe::utils::mult(addFromVec[index].x, coeff)); + atomicAdd(&(addToVec[addToVecStartingContiguousBlockIds[blockIndex] + + intraBlockIndex] + .y), + dftfe::utils::mult(addFromVec[index].y, coeff)); + } + } + + __global__ void + axpyStridedBlockAtomicAddDeviceKernel( + const dftfe::size_type contiguousBlockSize, + const dftfe::size_type numContiguousBlocks, + const double a, + const double * s, + const float * addFromVec, + float * addToVec, + const dftfe::global_size_type *addToVecStartingContiguousBlockIds) + { + const dftfe::size_type globalThreadId = + blockIdx.x * blockDim.x + threadIdx.x; + const dftfe::size_type numberEntries = + numContiguousBlocks * contiguousBlockSize; + + for (dftfe::size_type index = globalThreadId; index < numberEntries; + index += blockDim.x * gridDim.x) + { + dftfe::size_type blockIndex = index / contiguousBlockSize; + dftfe::size_type intraBlockIndex = index % contiguousBlockSize; + const double coeff = dftfe::utils::mult(a, s[blockIndex]); + atomicAdd(&addToVec[addToVecStartingContiguousBlockIds[blockIndex] + + intraBlockIndex], + dftfe::utils::mult(addFromVec[index], coeff)); + } + } + + __global__ void + axpyStridedBlockAtomicAddDeviceKernel( + const dftfe::size_type contiguousBlockSize, + const dftfe::size_type numContiguousBlocks, + const double a, + const double * s, + const dftfe::utils::deviceFloatComplex *addFromVec, + dftfe::utils::deviceFloatComplex * addToVec, + const dftfe::global_size_type *addToVecStartingContiguousBlockIds) + { + const dftfe::size_type globalThreadId = + blockIdx.x * blockDim.x + threadIdx.x; + const dftfe::size_type numberEntries = + numContiguousBlocks * contiguousBlockSize; + + for (dftfe::size_type index = globalThreadId; index < numberEntries; + index += blockDim.x * gridDim.x) + { + dftfe::size_type blockIndex = index / contiguousBlockSize; + dftfe::size_type intraBlockIndex = index % contiguousBlockSize; + const double coeff = dftfe::utils::mult(a, s[blockIndex]); + atomicAdd(&(addToVec[addToVecStartingContiguousBlockIds[blockIndex] + + intraBlockIndex] + .x), + dftfe::utils::mult(addFromVec[index].x, coeff)); + atomicAdd(&(addToVec[addToVecStartingContiguousBlockIds[blockIndex] + + intraBlockIndex] + .y), + dftfe::utils::mult(addFromVec[index].y, coeff)); + } + } + + __global__ void + axpyStridedBlockAtomicAddDeviceKernel( + const dftfe::size_type contiguousBlockSize, + const dftfe::size_type numContiguousBlocks, + const float a, + const float * s, + const float * addFromVec, + float * addToVec, + const dftfe::global_size_type *addToVecStartingContiguousBlockIds) + { + const dftfe::size_type globalThreadId = + blockIdx.x * blockDim.x + threadIdx.x; + const dftfe::size_type numberEntries = + numContiguousBlocks * contiguousBlockSize; + + for (dftfe::size_type index = globalThreadId; index < numberEntries; + index += blockDim.x * gridDim.x) + { + dftfe::size_type blockIndex = index / contiguousBlockSize; + dftfe::size_type intraBlockIndex = index % contiguousBlockSize; + const double coeff = dftfe::utils::mult(a, s[blockIndex]); + atomicAdd(&addToVec[addToVecStartingContiguousBlockIds[blockIndex] + + intraBlockIndex], + dftfe::utils::mult(addFromVec[index], coeff)); + } + } + + __global__ void + axpyStridedBlockAtomicAddDeviceKernel( + const dftfe::size_type contiguousBlockSize, + const dftfe::size_type numContiguousBlocks, + const float a, + const float * s, + const dftfe::utils::deviceFloatComplex *addFromVec, + dftfe::utils::deviceFloatComplex * addToVec, + const dftfe::global_size_type *addToVecStartingContiguousBlockIds) + { + const dftfe::size_type globalThreadId = + blockIdx.x * blockDim.x + threadIdx.x; + const dftfe::size_type numberEntries = + numContiguousBlocks * contiguousBlockSize; + + for (dftfe::size_type index = globalThreadId; index < numberEntries; + index += blockDim.x * gridDim.x) + { + dftfe::size_type blockIndex = index / contiguousBlockSize; + dftfe::size_type intraBlockIndex = index % contiguousBlockSize; + const double coeff = dftfe::utils::mult(a, s[blockIndex]); + atomicAdd(&(addToVec[addToVecStartingContiguousBlockIds[blockIndex] + + intraBlockIndex] + .x), + dftfe::utils::mult(addFromVec[index].x, coeff)); + atomicAdd(&(addToVec[addToVecStartingContiguousBlockIds[blockIndex] + + intraBlockIndex] + .y), + dftfe::utils::mult(addFromVec[index].y, coeff)); + } + } + __global__ void axpyStridedBlockAtomicAddDeviceKernel( const dftfe::size_type contiguousBlockSize, @@ -487,5 +664,35 @@ namespace dftfe } } + template + __global__ void + ApaBDDeviceKernel(const dftfe::size_type nRows, + const dftfe::size_type nCols, + const ValueType0 alpha, + const ValueType1 * A, + const ValueType2 * B, + const ValueType3 * D, + ValueType4 * C) + { + const dftfe::size_type globalThreadId = + blockIdx.x * blockDim.x + threadIdx.x; + const dftfe::size_type numberEntries = nCols * nRows; + + for (dftfe::size_type index = globalThreadId; index < numberEntries; + index += blockDim.x * gridDim.x) + { + dftfe::size_type iRow = index % nCols; + const ValueType0 alphaD = alpha * D[iRow]; + dftfe::utils::copyValue( + C + index, + dftfe::utils::add(A[index], dftfe::utils::mult(B[index], alphaD))); + } + } + + } // namespace } // namespace dftfe diff --git a/utils/BLASWrapperHost.cc b/utils/BLASWrapperHost.cc index 1bdd6bddb..fad0db017 100644 --- a/utils/BLASWrapperHost.cc +++ b/utils/BLASWrapperHost.cc @@ -65,6 +65,72 @@ namespace dftfe &transA, &transB, &m, &n, &k, alpha, A, &lda, B, &ldb, beta, C, &ldc); } + template + void + BLASWrapper::ApaBD(const unsigned int m, + const unsigned int n, + const ValueType0 alpha, + const ValueType1 *A, + const ValueType2 *B, + const ValueType3 *D, + ValueType4 *C) const + { + for (unsigned int iRow = 0; iRow < m; ++iRow) + { + for (unsigned int iCol = 0; iCol < n; ++iCol) + { + C[iCol + n * iRow] = + A[iCol + n * iRow] + alpha * B[iCol + n * iRow] * D[iCol]; + } + } + } + + template <> + void + BLASWrapper::ApaBD( + const unsigned int m, + const unsigned int n, + const double alpha, + const std::complex * A, + const std::complex *B, + const double * D, + std::complex * C) const + { + for (unsigned int iRow = 0; iRow < m; ++iRow) + { + for (unsigned int iCol = 0; iCol < n; ++iCol) + { + C[iCol + n * iRow] = std::complex(A[iCol + n * iRow]) + + alpha * B[iCol + n * iRow] * D[iCol]; + } + } + } + + template <> + void + BLASWrapper::ApaBD( + const unsigned int m, + const unsigned int n, + const double alpha, + const std::complex * A, + const std::complex *B, + const double * D, + std::complex * C) const + { + for (unsigned int iRow = 0; iRow < m; ++iRow) + { + for (unsigned int iCol = 0; iCol < n; ++iCol) + { + C[iCol + n * iRow] = std::complex(A[iCol + n * iRow]) + + alpha * B[iCol + n * iRow] * D[iCol]; + } + } + } + template void BLASWrapper::xscal( @@ -404,7 +470,7 @@ namespace dftfe std::plus<>{}); } - template + template void BLASWrapper::axpyStridedBlockAtomicAdd( const dftfe::size_type contiguousBlockSize, @@ -412,7 +478,7 @@ namespace dftfe const ValueType1 a, const ValueType1 * s, const ValueType2 * addFromVec, - ValueType2 * addToVec, + ValueType3 * addToVec, const dftfe::global_size_type *addToVecStartingContiguousBlockIds) const { for (unsigned int iBlock = 0; iBlock < numContiguousBlocks; ++iBlock) @@ -426,6 +492,31 @@ namespace dftfe } } + template <> + void + BLASWrapper::axpyStridedBlockAtomicAdd( + const dftfe::size_type contiguousBlockSize, + const dftfe::size_type numContiguousBlocks, + const double a, + const double * s, + const std::complex * addFromVec, + std::complex * addToVec, + const dftfe::global_size_type *addToVecStartingContiguousBlockIds) const + { + for (unsigned int iBlock = 0; iBlock < numContiguousBlocks; ++iBlock) + { + double coeff = a * s[iBlock]; + std::transform(addFromVec + iBlock * contiguousBlockSize, + addFromVec + (iBlock + 1) * contiguousBlockSize, + addToVec + addToVecStartingContiguousBlockIds[iBlock], + addToVec + addToVecStartingContiguousBlockIds[iBlock], + [&coeff](auto &p, auto &q) { + return std::complex(p) * coeff + + std::complex(q); + }); + } + } + template void BLASWrapper::axpby(const unsigned int n, @@ -439,6 +530,20 @@ namespace dftfe }); } + template <> + void + BLASWrapper::axpby( + const unsigned int n, + const double alpha, + const std::complex *x, + const double beta, + std::complex * y) const + { + std::transform(x, x + n, y, y, [&alpha, &beta](auto &p, auto &q) { + return alpha * std::complex(p) + beta * std::complex(q); + }); + } + void BLASWrapper::xsymv( const char UPLO, @@ -585,6 +690,83 @@ namespace dftfe } } + void + BLASWrapper::xgemmStridedBatched( + const char transA, + const char transB, + const unsigned int m, + const unsigned int n, + const unsigned int k, + const float * alpha, + const float * A, + const unsigned int lda, + long long int strideA, + const float * B, + const unsigned int ldb, + long long int strideB, + const float * beta, + float * C, + const unsigned int ldc, + long long int strideC, + const int batchCount) const + { + for (int iBatch = 0; iBatch < batchCount; iBatch++) + { + xgemm(transA, + transB, + m, + n, + k, + alpha, + A + iBatch * strideA, + lda, + B + iBatch * strideB, + ldb, + beta, + C + iBatch * strideC, + ldc); + } + } + + + void + BLASWrapper::xgemmStridedBatched( + const char transA, + const char transB, + const unsigned int m, + const unsigned int n, + const unsigned int k, + const std::complex *alpha, + const std::complex *A, + const unsigned int lda, + long long int strideA, + const std::complex *B, + const unsigned int ldb, + long long int strideB, + const std::complex *beta, + std::complex * C, + const unsigned int ldc, + long long int strideC, + const int batchCount) const + { + for (int iBatch = 0; iBatch < batchCount; iBatch++) + { + xgemm(transA, + transB, + m, + n, + k, + alpha, + A + iBatch * strideA, + lda, + B + iBatch * strideB, + ldb, + beta, + C + iBatch * strideC, + ldc); + } + } + template void BLASWrapper::copyComplexArrToRealArrs( @@ -899,6 +1081,12 @@ namespace dftfe copyValueType1ArrToValueType2Arr(const dftfe::size_type size, const double * valueType1Arr, float * valueType2Arr); + template void + BLASWrapper:: + copyValueType1ArrToValueType2Arr( + const dftfe::size_type size, + const std::complex *valueType1Arr, + std::complex * valueType2Arr); // axpyStridedBlockAtomicAdd template void @@ -937,6 +1125,46 @@ namespace dftfe std::complex * addToVec, const dftfe::global_size_type *addToVecStartingContiguousBlockIds) const; + template void + BLASWrapper::axpyStridedBlockAtomicAdd( + const dftfe::size_type contiguousBlockSize, + const dftfe::size_type numContiguousBlocks, + const double a, + const double * s, + const float * addFromVec, + float * addToVec, + const dftfe::global_size_type *addToVecStartingContiguousBlockIds) const; + + template void + BLASWrapper::axpyStridedBlockAtomicAdd( + const dftfe::size_type contiguousBlockSize, + const dftfe::size_type numContiguousBlocks, + const double a, + const double * s, + const std::complex * addFromVec, + std::complex * addToVec, + const dftfe::global_size_type *addToVecStartingContiguousBlockIds) const; + + template void + BLASWrapper::axpyStridedBlockAtomicAdd( + const dftfe::size_type contiguousBlockSize, + const dftfe::size_type numContiguousBlocks, + const float a, + const float * s, + const float * addFromVec, + float * addToVec, + const dftfe::global_size_type *addToVecStartingContiguousBlockIds) const; + + template void + BLASWrapper::axpyStridedBlockAtomicAdd( + const dftfe::size_type contiguousBlockSize, + const dftfe::size_type numContiguousBlocks, + const float a, + const float * s, + const std::complex * addFromVec, + std::complex * addToVec, + const dftfe::global_size_type *addToVecStartingContiguousBlockIds) const; + template void BLASWrapper::axpby(const unsigned int n, const double alpha, @@ -953,6 +1181,49 @@ namespace dftfe const double beta, std::complex * y) const; + template void + BLASWrapper::axpby(const unsigned int n, + const double alpha, + const float *x, + const double beta, + float * y) const; + + template void + BLASWrapper::ApaBD(const unsigned int m, + const unsigned int n, + const double alpha, + const double *A, + const double *B, + const double *D, + double * C) const; + template void + BLASWrapper::ApaBD( + const unsigned int m, + const unsigned int n, + const double alpha, + const std::complex *A, + const std::complex *B, + const double * D, + std::complex * C) const; + + template void + BLASWrapper::ApaBD(const unsigned int m, + const unsigned int n, + const double alpha, + const float * A, + const double *B, + const double *D, + float * C) const; + + template void + BLASWrapper::ApaBD(const unsigned int m, + const unsigned int n, + const double alpha, + const float * A, + const double *B, + const double *D, + double * C) const; + template void BLASWrapper::copyRealArrsToComplexArr( const dftfe::size_type size, diff --git a/utils/FEBasisOperations.cc b/utils/FEBasisOperations.cc index cc2952630..38308e208 100644 --- a/utils/FEBasisOperations.cc +++ b/utils/FEBasisOperations.cc @@ -2120,6 +2120,38 @@ namespace dftfe } } + template + void + FEBasisOperations:: + createScratchMultiVectorsSinglePrec(const unsigned int vecBlockSize, + const unsigned int numMultiVecs) const + { + auto iter = scratchMultiVectorsSinglePrec.find(vecBlockSize); + if (iter == scratchMultiVectorsSinglePrec.end()) + { + scratchMultiVectorsSinglePrec[vecBlockSize] = + std::vector::type, + memorySpace>>(numMultiVecs); + for (unsigned int iVec = 0; iVec < numMultiVecs; ++iVec) + scratchMultiVectorsSinglePrec[vecBlockSize][iVec].reinit( + mpiPatternP2P, vecBlockSize); + } + else + { + scratchMultiVectorsSinglePrec[vecBlockSize].resize( + scratchMultiVectorsSinglePrec[vecBlockSize].size() + numMultiVecs); + for (unsigned int iVec = 0; + iVec < scratchMultiVectorsSinglePrec[vecBlockSize].size(); + ++iVec) + scratchMultiVectorsSinglePrec[vecBlockSize][iVec].reinit( + mpiPatternP2P, vecBlockSize); + } + } + template @@ -2128,6 +2160,7 @@ namespace dftfe clearScratchMultiVectors() const { scratchMultiVectors.clear(); + scratchMultiVectorsSinglePrec.clear(); } template + dftfe::linearAlgebra::MultiVector< + typename dftfe::dataTypes::singlePrecType::type, + memorySpace> & + FEBasisOperations:: + getMultiVectorSinglePrec(const unsigned int vecBlockSize, + const unsigned int index) const + { + AssertThrow(scratchMultiVectorsSinglePrec.find(vecBlockSize) != + scratchMultiVectorsSinglePrec.end(), + dealii::ExcMessage( + "DFT-FE Error: MultiVector not found in scratch storage.")); + return scratchMultiVectorsSinglePrec[vecBlockSize][index]; + } + template *alpha, + std::complex * x, + const unsigned int * incx, + std::complex * y, + const unsigned int * incy) + { + caxpy_(n, alpha, x, incx, y, incy); + } + // @@ -479,14 +501,30 @@ namespace dftfe constraintMatrixInfo::distribute( distributedCPUMultiVec> &fieldVector) const; + template void + constraintMatrixInfo::distribute( + distributedCPUMultiVec &fieldVector) const; + + template void + constraintMatrixInfo::distribute( + distributedCPUMultiVec> &fieldVector) const; + template void constraintMatrixInfo::distribute_slave_to_master( distributedCPUMultiVec &fieldVector) const; + template void + constraintMatrixInfo::distribute_slave_to_master( + distributedCPUMultiVec &fieldVector) const; + template void constraintMatrixInfo::set_zero( distributedCPUMultiVec &fieldVector) const; + template void + constraintMatrixInfo::set_zero( + distributedCPUMultiVec &fieldVector) const; + } // namespace dftUtils } // namespace dftfe diff --git a/utils/constraintMatrixInfoDevice.cc b/utils/constraintMatrixInfoDevice.cc index 9e8cffade..a19f5bb19 100644 --- a/utils/constraintMatrixInfoDevice.cc +++ b/utils/constraintMatrixInfoDevice.cc @@ -367,6 +367,102 @@ namespace dftfe } } + __global__ void + distributeSlaveToMasterKernelAtomicAdd( + const unsigned int contiguousBlockSize, + float * xVec, + const unsigned int *constraintLocalRowIdsUnflattened, + const unsigned int numConstraints, + const unsigned int *constraintRowSizes, + const unsigned int *constraintRowSizesAccumulated, + const unsigned int *constraintLocalColumnIdsAllRowsUnflattened, + const double * constraintColumnValuesAllRowsUnflattened) + { + const dealii::types::global_dof_index globalThreadId = + blockIdx.x * blockDim.x + threadIdx.x; + const dealii::types::global_dof_index numberEntries = + numConstraints * contiguousBlockSize; + + for (dealii::types::global_dof_index index = globalThreadId; + index < numberEntries; + index += blockDim.x * gridDim.x) + { + const unsigned int blockIndex = index / contiguousBlockSize; + const unsigned int intraBlockIndex = index % contiguousBlockSize; + const unsigned int constrainedRowId = + constraintLocalRowIdsUnflattened[blockIndex]; + const unsigned int numberColumns = constraintRowSizes[blockIndex]; + const unsigned int startingColumnNumber = + constraintRowSizesAccumulated[blockIndex]; + const dealii::types::global_dof_index xVecStartingIdRow = + constrainedRowId * contiguousBlockSize; + for (unsigned int i = 0; i < numberColumns; ++i) + { + const unsigned int constrainedColumnId = + constraintLocalColumnIdsAllRowsUnflattened + [startingColumnNumber + i]; + const dealii::types::global_dof_index xVecStartingIdColumn = + constrainedColumnId * contiguousBlockSize; + atomicAdd(&(xVec[xVecStartingIdColumn + intraBlockIndex]), + constraintColumnValuesAllRowsUnflattened + [startingColumnNumber + i] * + xVec[xVecStartingIdRow + intraBlockIndex]); + } + xVec[xVecStartingIdRow + intraBlockIndex] = 0.0; + } + } + + + __global__ void + distributeSlaveToMasterKernelAtomicAdd( + const unsigned int contiguousBlockSize, + dftfe::utils::deviceFloatComplex *xVec, + const unsigned int * constraintLocalRowIdsUnflattened, + const unsigned int numConstraints, + const unsigned int * constraintRowSizes, + const unsigned int * constraintRowSizesAccumulated, + const unsigned int *constraintLocalColumnIdsAllRowsUnflattened, + const double * constraintColumnValuesAllRowsUnflattened) + { + const dealii::types::global_dof_index globalThreadId = + blockIdx.x * blockDim.x + threadIdx.x; + const dealii::types::global_dof_index numberEntries = + numConstraints * contiguousBlockSize; + + for (dealii::types::global_dof_index index = globalThreadId; + index < numberEntries; + index += blockDim.x * gridDim.x) + { + const unsigned int blockIndex = index / contiguousBlockSize; + const unsigned int intraBlockIndex = index % contiguousBlockSize; + const unsigned int constrainedRowId = + constraintLocalRowIdsUnflattened[blockIndex]; + const unsigned int numberColumns = constraintRowSizes[blockIndex]; + const unsigned int startingColumnNumber = + constraintRowSizesAccumulated[blockIndex]; + const dealii::types::global_dof_index xVecStartingIdRow = + constrainedRowId * contiguousBlockSize; + for (unsigned int i = 0; i < numberColumns; ++i) + { + const unsigned int constrainedColumnId = + constraintLocalColumnIdsAllRowsUnflattened + [startingColumnNumber + i]; + const dealii::types::global_dof_index xVecStartingIdColumn = + constrainedColumnId * contiguousBlockSize; + const dftfe::utils::deviceDoubleComplex tempComplval = + dftfe::utils::mult(constraintColumnValuesAllRowsUnflattened + [startingColumnNumber + i], + xVec[xVecStartingIdRow + intraBlockIndex]); + atomicAdd(&(xVec[xVecStartingIdColumn + intraBlockIndex].x), + tempComplval.x); + atomicAdd(&(xVec[xVecStartingIdColumn + intraBlockIndex].y), + tempComplval.y); + } + xVec[xVecStartingIdRow + intraBlockIndex].x = 0.0; + xVec[xVecStartingIdRow + intraBlockIndex].y = 0.0; + } + } + __global__ void setzeroKernel(const unsigned int contiguousBlockSize, @@ -678,54 +774,10 @@ namespace dftfe // set the constrained degrees of freedom to values so that constraints // are satisfied for flattened array // + template void constraintMatrixInfoDevice::distribute_slave_to_master( - distributedDeviceVec &fieldVector) const - { - if (d_numConstrainedDofs == 0) - return; - - const unsigned int blockSize = fieldVector.numVectors(); -#ifdef DFTFE_WITH_DEVICE_LANG_CUDA - distributeSlaveToMasterKernelAtomicAdd<<< - min((blockSize * d_numConstrainedDofs + - (dftfe::utils::DEVICE_BLOCK_SIZE - 1)) / - dftfe::utils::DEVICE_BLOCK_SIZE, - 30000), - dftfe::utils::DEVICE_BLOCK_SIZE>>>( - blockSize, - dftfe::utils::makeDataTypeDeviceCompatible(fieldVector.begin()), - d_rowIdsLocalDevice.begin(), - d_numConstrainedDofs, - d_rowSizesDevice.begin(), - d_rowSizesAccumulatedDevice.begin(), - d_columnIdsLocalDevice.begin(), - d_columnValuesDevice.begin()); -#elif DFTFE_WITH_DEVICE_LANG_HIP - hipLaunchKernelGGL(distributeSlaveToMasterKernelAtomicAdd, - min((blockSize * d_numConstrainedDofs + - (dftfe::utils::DEVICE_BLOCK_SIZE - 1)) / - dftfe::utils::DEVICE_BLOCK_SIZE, - 30000), - dftfe::utils::DEVICE_BLOCK_SIZE, - 0, - 0, - blockSize, - dftfe::utils::makeDataTypeDeviceCompatible( - fieldVector.begin()), - d_rowIdsLocalDevice.begin(), - d_numConstrainedDofs, - d_rowSizesDevice.begin(), - d_rowSizesAccumulatedDevice.begin(), - d_columnIdsLocalDevice.begin(), - d_columnValuesDevice.begin()); -#endif - } - - - void - constraintMatrixInfoDevice::distribute_slave_to_master( - distributedDeviceVec> &fieldVector) const + distributedDeviceVec &fieldVector) const { if (d_numConstrainedDofs == 0) return; @@ -767,7 +819,6 @@ namespace dftfe #endif } - template void constraintMatrixInfoDevice::set_zero( @@ -867,6 +918,22 @@ namespace dftfe constraintMatrixInfoDevice::set_zero( distributedDeviceVec> &fieldVector) const; + template void + constraintMatrixInfoDevice::distribute_slave_to_master( + distributedDeviceVec &fieldVector) const; + + template void + constraintMatrixInfoDevice::distribute_slave_to_master( + distributedDeviceVec> &fieldVector) const; + + template void + constraintMatrixInfoDevice::distribute_slave_to_master( + distributedDeviceVec &fieldVector) const; + + template void + constraintMatrixInfoDevice::distribute_slave_to_master( + distributedDeviceVec> &fieldVector) const; + } // namespace dftUtils } // namespace dftfe diff --git a/utils/dftParameters.cc b/utils/dftParameters.cc index 51b32a929..0f113d44e 100644 --- a/utils/dftParameters.cc +++ b/utils/dftParameters.cc @@ -879,7 +879,7 @@ namespace dftfe prm.declare_entry( "SPECTRUM SPLIT STARTING SCF ITER", - "0", + "1000", dealii::Patterns::Integer(0), "[Advanced] SCF iteration no beyond which spectrum splitting based can be used."); @@ -988,6 +988,12 @@ namespace dftfe dealii::Patterns::Bool(), "[Advanced] Use mixed precision communication only for XtX and XtHX instead of mixed precision compute and communication. This setting has been found to be more optimal on certain architectures. Default setting is false."); + prm.declare_entry( + "USE SINGLE PREC CHEBY", + "false", + dealii::Patterns::Bool(), + "[Advanced] Use a modified single precision algorithm Chebyshev filtering."); + prm.declare_entry( "OVERLAP COMPUTE COMMUN CHEBY", "true", @@ -1613,7 +1619,8 @@ namespace dftfe useMixedPrecSubspaceRotRR = prm.get_bool("USE MIXED PREC RR_SR"); useMixedPrecCommunOnlyXTHXCGSO = prm.get_bool("USE MIXED PREC COMMUN ONLY XTX XTHX"); - useMixedPrecCheby = prm.get_bool("USE MIXED PREC CHEBY"); + useMixedPrecCheby = prm.get_bool("USE MIXED PREC CHEBY"); + useSinglePrecCheby = prm.get_bool("USE SINGLE PREC CHEBY"); overlapComputeCommunCheby = prm.get_bool("OVERLAP COMPUTE COMMUN CHEBY"); overlapComputeCommunOrthoRR = From 291a90be8411289fefb4aed537f66c5aad91e71c Mon Sep 17 00:00:00 2001 From: Nikhil Kodali Date: Mon, 22 Apr 2024 00:09:25 +0530 Subject: [PATCH 03/11] Remove couts --- src/dft/dft.cc | 8 -------- src/dft/kohnShamEigenSolve.cc | 1 - 2 files changed, 9 deletions(-) diff --git a/src/dft/dft.cc b/src/dft/dft.cc index d89969f3b..6451dd297 100644 --- a/src/dft/dft.cc +++ b/src/dft/dft.cc @@ -3060,14 +3060,6 @@ namespace dftfe #ifdef DFTFE_WITH_DEVICE - std::cout - << "DEBUG ss dft.cc " - << (scfIter < - d_dftParamsPtr->spectrumSplitStartingScfIter || - scfConverged) - << " " << scfIter << " " - << d_dftParamsPtr->spectrumSplitStartingScfIter << " " - << scfConverged << std::endl; if constexpr (dftfe::utils::MemorySpace::DEVICE == memorySpace) kohnShamEigenSpaceCompute( diff --git a/src/dft/kohnShamEigenSolve.cc b/src/dft/kohnShamEigenSolve.cc index 9b28be56f..c83f081b4 100644 --- a/src/dft/kohnShamEigenSolve.cc +++ b/src/dft/kohnShamEigenSolve.cc @@ -528,7 +528,6 @@ namespace dftfe if (d_dftParamsPtr->spinPolarized == 1) pcout << "spin: " << spinType + 1 << std::endl; } - std::cout << "DEBUG ss " << isSpectrumSplit << std::endl; std::vector eigenValuesTemp(isSpectrumSplit ? d_numEigenValuesRR : d_numEigenValues, 0.0); From 117e320676586303943332c5fbb0c1a9ea650bb9 Mon Sep 17 00:00:00 2001 From: Nikhil Kodali Date: Mon, 22 Apr 2024 11:43:53 +0530 Subject: [PATCH 04/11] Cleanups --- ...ogonalizedSubspaceIterationSolverDevice.cc | 7 +- utils/BLASWrapperDevice.cu.cc | 461 +---------------- utils/BLASWrapperDevice.hip.cc | 463 +----------------- utils/BLASWrapperDevice.inst.cc | 457 +++++++++++++++++ utils/dftParameters.cc | 2 +- 5 files changed, 461 insertions(+), 929 deletions(-) create mode 100644 utils/BLASWrapperDevice.inst.cc diff --git a/src/solvers/eigenSolvers/chebyshevOrthogonalizedSubspaceIterationSolverDevice.cc b/src/solvers/eigenSolvers/chebyshevOrthogonalizedSubspaceIterationSolverDevice.cc index 36c99f402..24490e39f 100644 --- a/src/solvers/eigenSolvers/chebyshevOrthogonalizedSubspaceIterationSolverDevice.cc +++ b/src/solvers/eigenSolvers/chebyshevOrthogonalizedSubspaceIterationSolverDevice.cc @@ -291,7 +291,7 @@ namespace dftfe dftfe::utils::deviceSynchronize(); computingTimerStandard.leave_subsection("Lanczos upper bound"); } - // d_lowerBoundWantedSpectrum = bounds.first; + d_upperBoundUnWantedSpectrum = bounds.second; } @@ -342,11 +342,6 @@ namespace dftfe "lower bound of unwanted spectrum", d_lowerBoundUnWantedSpectrum); pcout << buffer; - sprintf(buffer, - "%s:%18.10e\n", - "lower bound of wanted spectrum", - d_lowerBoundWantedSpectrum); - pcout << buffer; sprintf(buffer, "%s: %u\n\n", "Chebyshev polynomial degree", diff --git a/utils/BLASWrapperDevice.cu.cc b/utils/BLASWrapperDevice.cu.cc index 526072ed4..4c070c216 100644 --- a/utils/BLASWrapperDevice.cu.cc +++ b/utils/BLASWrapperDevice.cu.cc @@ -1476,24 +1476,6 @@ namespace dftfe dftfe::utils::makeDataTypeDeviceCompatible(copyToVecBlock), copyFromVecStartingContiguousBlockIds); } - template void - BLASWrapper::stridedBlockScaleCopy( - const dftfe::size_type contiguousBlockSize, - const dftfe::size_type numContiguousBlocks, - const double a, - const double * s, - const double * copyFromVec, - double * copyToVecBlock, - const dftfe::global_size_type *addToVecStartingContiguousBlockIds); - template void - BLASWrapper::stridedBlockScaleCopy( - const dftfe::size_type contiguousBlockSize, - const dftfe::size_type numContiguousBlocks, - const double a, - const double * s, - const std::complex * copyFromVec, - std::complex * copyToVecBlock, - const dftfe::global_size_type *addToVecStartingContiguousBlockIds); template void @@ -1515,448 +1497,7 @@ namespace dftfe dftfe::utils::makeDataTypeDeviceCompatible(s), dftfe::utils::makeDataTypeDeviceCompatible(x)); } - // for stridedBlockScale - template void - BLASWrapper::stridedBlockScale( - const dftfe::size_type contiguousBlockSize, - const dftfe::size_type numContiguousBlocks, - const double a, - const double * s, - double * x); - - template void - BLASWrapper::stridedBlockScale( - const dftfe::size_type contiguousBlockSize, - const dftfe::size_type numContiguousBlocks, - const float a, - const float * s, - float * x); - - template void - BLASWrapper::stridedBlockScale( - const dftfe::size_type contiguousBlockSize, - const dftfe::size_type numContiguousBlocks, - const std::complex a, - const std::complex *s, - std::complex * x); - - template void - BLASWrapper::stridedBlockScale( - const dftfe::size_type contiguousBlockSize, - const dftfe::size_type numContiguousBlocks, - const std::complex a, - const std::complex *s, - std::complex * x); - - template void - BLASWrapper::stridedBlockScale( - const dftfe::size_type contiguousBlockSize, - const dftfe::size_type numContiguousBlocks, - const double a, - const double * s, - float * x); - - template void - BLASWrapper::stridedBlockScale( - const dftfe::size_type contiguousBlockSize, - const dftfe::size_type numContiguousBlocks, - const float a, - const float * s, - double * x); - - template void - BLASWrapper::stridedBlockScale( - const dftfe::size_type contiguousBlockSize, - const dftfe::size_type numContiguousBlocks, - const std::complex a, - const std::complex *s, - std::complex * x); - - template void - BLASWrapper::stridedBlockScale( - const dftfe::size_type contiguousBlockSize, - const dftfe::size_type numContiguousBlocks, - const std::complex a, - const std::complex *s, - std::complex * x); - - template void - BLASWrapper::stridedBlockScale( - const dftfe::size_type contiguousBlockSize, - const dftfe::size_type numContiguousBlocks, - const double a, - const double * s, - std::complex * x); - - template void - BLASWrapper::stridedBlockScale( - const dftfe::size_type contiguousBlockSize, - const dftfe::size_type numContiguousBlocks, - const double a, - const double * s, - std::complex * x); - - // axpyStridedBlockAtomicAdd - template void - BLASWrapper::axpyStridedBlockAtomicAdd( - const dftfe::size_type contiguousBlockSize, - const dftfe::size_type numContiguousBlocks, - const double * addFromVec, - double * addToVec, - const dftfe::global_size_type *addToVecStartingContiguousBlockIds) const; - - template void - BLASWrapper::axpyStridedBlockAtomicAdd( - const dftfe::size_type contiguousBlockSize, - const dftfe::size_type numContiguousBlocks, - const std::complex * addFromVec, - std::complex * addToVec, - const dftfe::global_size_type *addToVecStartingContiguousBlockIds) const; - - template void - BLASWrapper::axpyStridedBlockAtomicAdd( - const dftfe::size_type contiguousBlockSize, - const dftfe::size_type numContiguousBlocks, - const double a, - const double * s, - const double * addFromVec, - double * addToVec, - const dftfe::global_size_type *addToVecStartingContiguousBlockIds) const; - - template void - BLASWrapper::axpyStridedBlockAtomicAdd( - const dftfe::size_type contiguousBlockSize, - const dftfe::size_type numContiguousBlocks, - const double a, - const double * s, - const std::complex * addFromVec, - std::complex * addToVec, - const dftfe::global_size_type *addToVecStartingContiguousBlockIds) const; - - template void - BLASWrapper::axpyStridedBlockAtomicAdd( - const dftfe::size_type contiguousBlockSize, - const dftfe::size_type numContiguousBlocks, - const double a, - const double * s, - const float * addFromVec, - float * addToVec, - const dftfe::global_size_type *addToVecStartingContiguousBlockIds) const; - - template void - BLASWrapper::axpyStridedBlockAtomicAdd( - const dftfe::size_type contiguousBlockSize, - const dftfe::size_type numContiguousBlocks, - const double a, - const double * s, - const std::complex * addFromVec, - std::complex * addToVec, - const dftfe::global_size_type *addToVecStartingContiguousBlockIds) const; - - template void - BLASWrapper::axpyStridedBlockAtomicAdd( - const dftfe::size_type contiguousBlockSize, - const dftfe::size_type numContiguousBlocks, - const float a, - const float * s, - const float * addFromVec, - float * addToVec, - const dftfe::global_size_type *addToVecStartingContiguousBlockIds) const; - - template void - BLASWrapper::axpyStridedBlockAtomicAdd( - const dftfe::size_type contiguousBlockSize, - const dftfe::size_type numContiguousBlocks, - const float a, - const float * s, - const std::complex * addFromVec, - std::complex * addToVec, - const dftfe::global_size_type *addToVecStartingContiguousBlockIds) const; - - template void - BLASWrapper::axpby(const unsigned int n, - const double alpha, - const double *x, - const double beta, - double *y) const; - - - template void - BLASWrapper::axpby( - const unsigned int n, - const double alpha, - const std::complex *x, - const double beta, - std::complex * y) const; - - template void - BLASWrapper::axpby(const unsigned int n, - const double alpha, - const float *x, - const double beta, - float * y) const; - - - template void - BLASWrapper::axpby( - const unsigned int n, - const double alpha, - const std::complex *x, - const double beta, - std::complex * y) const; - - // for xscal - template void - BLASWrapper::xscal( - double * x, - const double a, - const dftfe::size_type n) const; - - template void - BLASWrapper::xscal( - float * x, - const float a, - const dftfe::size_type n) const; - - template void - BLASWrapper::xscal( - std::complex * x, - const std::complex a, - const dftfe::size_type n) const; - - template void - BLASWrapper::xscal( - std::complex * x, - const std::complex a, - const dftfe::size_type n) const; - - template void - BLASWrapper::xscal( - std::complex * x, - const double a, - const dftfe::size_type n) const; - - template void - BLASWrapper::stridedCopyToBlock( - const dftfe::size_type contiguousBlockSize, - const dftfe::size_type numContiguousBlocks, - const double * copyFromVec, - double * copyToVecBlock, - const dftfe::global_size_type *copyFromVecStartingContiguousBlockIds); - - template void - BLASWrapper::stridedCopyToBlock( - const dftfe::size_type contiguousBlockSize, - const dftfe::size_type numContiguousBlocks, - const double * copyFromVec, - float * copyToVecBlock, - const dftfe::global_size_type *copyFromVecStartingContiguousBlockIds); - - template void - BLASWrapper::stridedCopyToBlock( - const dftfe::size_type contiguousBlockSize, - const dftfe::size_type numContiguousBlocks, - const float * copyFromVec, - float * copyToVecBlock, - const dftfe::global_size_type *copyFromVecStartingContiguousBlockIds); - - template void - BLASWrapper::stridedCopyToBlock( - const dftfe::size_type contiguousBlockSize, - const dftfe::size_type numContiguousBlocks, - const std::complex * copyFromVec, - std::complex * copyToVecBlock, - const dftfe::global_size_type *copyFromVecStartingContiguousBlockIds); - - template void - BLASWrapper::stridedCopyToBlock( - const dftfe::size_type contiguousBlockSize, - const dftfe::size_type numContiguousBlocks, - const std::complex * copyFromVec, - std::complex * copyToVecBlock, - const dftfe::global_size_type *copyFromVecStartingContiguousBlockIds); - - template void - BLASWrapper::stridedCopyToBlock( - const dftfe::size_type contiguousBlockSize, - const dftfe::size_type numContiguousBlocks, - const std::complex * copyFromVec, - std::complex * copyToVecBlock, - const dftfe::global_size_type *copyFromVecStartingContiguousBlockIds); - - // strided copy from block - template void - BLASWrapper::stridedCopyFromBlock( - const dftfe::size_type contiguousBlockSize, - const dftfe::size_type numContiguousBlocks, - const double * copyFromVecBlock, - double * copyToVec, - const dftfe::global_size_type *copyFromVecStartingContiguousBlockIds); - - template void - BLASWrapper::stridedCopyFromBlock( - const dftfe::size_type contiguousBlockSize, - const dftfe::size_type numContiguousBlocks, - const float * copyFromVecBlock, - float * copyToVec, - const dftfe::global_size_type *copyFromVecStartingContiguousBlockIds); - - template void - BLASWrapper::stridedCopyFromBlock( - const dftfe::size_type contiguousBlockSize, - const dftfe::size_type numContiguousBlocks, - const std::complex * copyFromVecBlock, - std::complex * copyToVec, - const dftfe::global_size_type *copyFromVecStartingContiguousBlockIds); - - template void - BLASWrapper::stridedCopyFromBlock( - const dftfe::size_type contiguousBlockSize, - const dftfe::size_type numContiguousBlocks, - const std::complex * copyFromVecBlock, - std::complex * copyToVec, - const dftfe::global_size_type *copyFromVecStartingContiguousBlockIds); - - template void - BLASWrapper::stridedCopyFromBlock( - const dftfe::size_type contiguousBlockSize, - const dftfe::size_type numContiguousBlocks, - const double * copyFromVecBlock, - float * copyToVec, - const dftfe::global_size_type *copyFromVecStartingContiguousBlockIds); - - template void - BLASWrapper::stridedCopyFromBlock( - const dftfe::size_type contiguousBlockSize, - const dftfe::size_type numContiguousBlocks, - const float * copyFromVecBlock, - double * copyToVec, - const dftfe::global_size_type *copyFromVecStartingContiguousBlockIds); - - template void - BLASWrapper::stridedCopyFromBlock( - const dftfe::size_type contiguousBlockSize, - const dftfe::size_type numContiguousBlocks, - const std::complex * copyFromVecBlock, - std::complex * copyToVec, - const dftfe::global_size_type *copyFromVecStartingContiguousBlockIds); - - template void - BLASWrapper::stridedCopyFromBlock( - const dftfe::size_type contiguousBlockSize, - const dftfe::size_type numContiguousBlocks, - const std::complex * copyFromVecBlock, - std::complex * copyToVec, - const dftfe::global_size_type *copyFromVecStartingContiguousBlockIds); - - - template void - BLASWrapper:: - copyValueType1ArrToValueType2Arr(const dftfe::size_type size, - const double * valueType1Arr, - std::complex * valueType2Arr); - - template void - BLASWrapper:: - copyValueType1ArrToValueType2Arr(const dftfe::size_type size, - const double * valueType1Arr, - std::complex * valueType2Arr); - - template void - BLASWrapper:: - copyValueType1ArrToValueType2Arr(const dftfe::size_type size, - const double * valueType1Arr, - double * valueType2Arr); - template void - BLASWrapper:: - copyValueType1ArrToValueType2Arr(const dftfe::size_type size, - const double * valueType1Arr, - float * valueType2Arr); - template void - BLASWrapper:: - copyValueType1ArrToValueType2Arr( - const dftfe::size_type size, - const std::complex *valueType1Arr, - std::complex * valueType2Arr); - - template void - BLASWrapper:: - stridedCopyToBlockConstantStride(const dftfe::size_type blockSizeTo, - const dftfe::size_type blockSizeFrom, - const dftfe::size_type numBlocks, - const dftfe::size_type startingId, - const double * copyFromVec, - double * copyToVec); - - template void - BLASWrapper:: - stridedCopyToBlockConstantStride(const dftfe::size_type blockSizeTo, - const dftfe::size_type blockSizeFrom, - const dftfe::size_type numBlocks, - const dftfe::size_type startingId, - const std::complex *copyFromVec, - std::complex * copyToVec); - template void - BLASWrapper::ApaBD(const unsigned int m, - const unsigned int n, - const double alpha, - const double *A, - const double *B, - const double *D, - double *C) const; - template void - BLASWrapper::ApaBD( - const unsigned int m, - const unsigned int n, - const double alpha, - const std::complex *A, - const std::complex *B, - const double * D, - std::complex * C) const; - - template void - BLASWrapper::ApaBD(const unsigned int m, - const unsigned int n, - const double alpha, - const float * A, - const double *B, - const double *D, - float *C) const; - template void - BLASWrapper::ApaBD(const unsigned int m, - const unsigned int n, - const double alpha, - const float * A, - const double *B, - const double *D, - double *C) const; - template void - BLASWrapper::ApaBD( - const unsigned int m, - const unsigned int n, - const double alpha, - const std::complex * A, - const std::complex *B, - const double * D, - std::complex * C) const; - - template void - BLASWrapper::ApaBD( - const unsigned int m, - const unsigned int n, - const double alpha, - const std::complex * A, - const std::complex *B, - const double * D, - std::complex * C) const; - - - template void - BLASWrapper::copyRealArrsToComplexArr( - const dftfe::size_type size, - const double * realArr, - const double * imagArr, - std::complex * complexArr); +#include "./BLASWrapperDevice.inst.cc" } // End of namespace linearAlgebra } // End of namespace dftfe diff --git a/utils/BLASWrapperDevice.hip.cc b/utils/BLASWrapperDevice.hip.cc index da1b697c1..fdbc991bd 100644 --- a/utils/BLASWrapperDevice.hip.cc +++ b/utils/BLASWrapperDevice.hip.cc @@ -1475,24 +1475,6 @@ namespace dftfe dftfe::utils::makeDataTypeDeviceCompatible(copyToVecBlock), copyFromVecStartingContiguousBlockIds); } - template void - BLASWrapper::stridedBlockScaleCopy( - const dftfe::size_type contiguousBlockSize, - const dftfe::size_type numContiguousBlocks, - const double a, - const double * s, - const double * copyFromVec, - double * copyToVecBlock, - const dftfe::global_size_type *addToVecStartingContiguousBlockIds); - template void - BLASWrapper::stridedBlockScaleCopy( - const dftfe::size_type contiguousBlockSize, - const dftfe::size_type numContiguousBlocks, - const double a, - const double * s, - const std::complex * copyFromVec, - std::complex * copyToVecBlock, - const dftfe::global_size_type *addToVecStartingContiguousBlockIds); void BLASWrapper::add( @@ -1628,450 +1610,7 @@ namespace dftfe dftfe::utils::makeDataTypeDeviceCompatible(s), dftfe::utils::makeDataTypeDeviceCompatible(x)); } - // for stridedBlockScale - template void - BLASWrapper::stridedBlockScale( - const dftfe::size_type contiguousBlockSize, - const dftfe::size_type numContiguousBlocks, - const double a, - const double * s, - double * x); - - template void - BLASWrapper::stridedBlockScale( - const dftfe::size_type contiguousBlockSize, - const dftfe::size_type numContiguousBlocks, - const float a, - const float * s, - float * x); - - template void - BLASWrapper::stridedBlockScale( - const dftfe::size_type contiguousBlockSize, - const dftfe::size_type numContiguousBlocks, - const std::complex a, - const std::complex *s, - std::complex * x); - - template void - BLASWrapper::stridedBlockScale( - const dftfe::size_type contiguousBlockSize, - const dftfe::size_type numContiguousBlocks, - const std::complex a, - const std::complex *s, - std::complex * x); - - template void - BLASWrapper::stridedBlockScale( - const dftfe::size_type contiguousBlockSize, - const dftfe::size_type numContiguousBlocks, - const double a, - const double * s, - float * x); - - template void - BLASWrapper::stridedBlockScale( - const dftfe::size_type contiguousBlockSize, - const dftfe::size_type numContiguousBlocks, - const float a, - const float * s, - double * x); - - template void - BLASWrapper::stridedBlockScale( - const dftfe::size_type contiguousBlockSize, - const dftfe::size_type numContiguousBlocks, - const std::complex a, - const std::complex *s, - std::complex * x); - - template void - BLASWrapper::stridedBlockScale( - const dftfe::size_type contiguousBlockSize, - const dftfe::size_type numContiguousBlocks, - const std::complex a, - const std::complex *s, - std::complex * x); - - template void - BLASWrapper::stridedBlockScale( - const dftfe::size_type contiguousBlockSize, - const dftfe::size_type numContiguousBlocks, - const double a, - const double * s, - std::complex * x); - - template void - BLASWrapper::stridedBlockScale( - const dftfe::size_type contiguousBlockSize, - const dftfe::size_type numContiguousBlocks, - const double a, - const double * s, - std::complex * x); - - - // for xscal - template void - BLASWrapper::xscal( - double * x, - const double a, - const dftfe::size_type n) const; - - template void - BLASWrapper::xscal( - float * x, - const float a, - const dftfe::size_type n) const; - - template void - BLASWrapper::xscal( - std::complex * x, - const std::complex a, - const dftfe::size_type n) const; - - template void - BLASWrapper::xscal( - std::complex * x, - const std::complex a, - const dftfe::size_type n) const; - - template void - BLASWrapper::xscal( - std::complex * x, - const double a, - const dftfe::size_type n) const; - - template void - BLASWrapper::stridedCopyToBlock( - const dftfe::size_type contiguousBlockSize, - const dftfe::size_type numContiguousBlocks, - const double * copyFromVec, - double * copyToVecBlock, - const dftfe::global_size_type *copyFromVecStartingContiguousBlockIds); - - template void - BLASWrapper::stridedCopyToBlock( - const dftfe::size_type contiguousBlockSize, - const dftfe::size_type numContiguousBlocks, - const double * copyFromVec, - float * copyToVecBlock, - const dftfe::global_size_type *copyFromVecStartingContiguousBlockIds); - - template void - BLASWrapper::stridedCopyToBlock( - const dftfe::size_type contiguousBlockSize, - const dftfe::size_type numContiguousBlocks, - const float * copyFromVec, - float * copyToVecBlock, - const dftfe::global_size_type *copyFromVecStartingContiguousBlockIds); - - template void - BLASWrapper::stridedCopyToBlock( - const dftfe::size_type contiguousBlockSize, - const dftfe::size_type numContiguousBlocks, - const std::complex * copyFromVec, - std::complex * copyToVecBlock, - const dftfe::global_size_type *copyFromVecStartingContiguousBlockIds); - - template void - BLASWrapper::stridedCopyToBlock( - const dftfe::size_type contiguousBlockSize, - const dftfe::size_type numContiguousBlocks, - const std::complex * copyFromVec, - std::complex * copyToVecBlock, - const dftfe::global_size_type *copyFromVecStartingContiguousBlockIds); - - template void - BLASWrapper::stridedCopyToBlock( - const dftfe::size_type contiguousBlockSize, - const dftfe::size_type numContiguousBlocks, - const std::complex * copyFromVec, - std::complex * copyToVecBlock, - const dftfe::global_size_type *copyFromVecStartingContiguousBlockIds); - - // strided copy from block - template void - BLASWrapper::stridedCopyFromBlock( - const dftfe::size_type contiguousBlockSize, - const dftfe::size_type numContiguousBlocks, - const double * copyFromVecBlock, - double * copyToVec, - const dftfe::global_size_type *copyFromVecStartingContiguousBlockIds); - - template void - BLASWrapper::stridedCopyFromBlock( - const dftfe::size_type contiguousBlockSize, - const dftfe::size_type numContiguousBlocks, - const float * copyFromVecBlock, - float * copyToVec, - const dftfe::global_size_type *copyFromVecStartingContiguousBlockIds); - - template void - BLASWrapper::stridedCopyFromBlock( - const dftfe::size_type contiguousBlockSize, - const dftfe::size_type numContiguousBlocks, - const std::complex * copyFromVecBlock, - std::complex * copyToVec, - const dftfe::global_size_type *copyFromVecStartingContiguousBlockIds); - - template void - BLASWrapper::stridedCopyFromBlock( - const dftfe::size_type contiguousBlockSize, - const dftfe::size_type numContiguousBlocks, - const std::complex * copyFromVecBlock, - std::complex * copyToVec, - const dftfe::global_size_type *copyFromVecStartingContiguousBlockIds); - - template void - BLASWrapper::stridedCopyFromBlock( - const dftfe::size_type contiguousBlockSize, - const dftfe::size_type numContiguousBlocks, - const double * copyFromVecBlock, - float * copyToVec, - const dftfe::global_size_type *copyFromVecStartingContiguousBlockIds); - - template void - BLASWrapper::stridedCopyFromBlock( - const dftfe::size_type contiguousBlockSize, - const dftfe::size_type numContiguousBlocks, - const float * copyFromVecBlock, - double * copyToVec, - const dftfe::global_size_type *copyFromVecStartingContiguousBlockIds); - - template void - BLASWrapper::stridedCopyFromBlock( - const dftfe::size_type contiguousBlockSize, - const dftfe::size_type numContiguousBlocks, - const std::complex * copyFromVecBlock, - std::complex * copyToVec, - const dftfe::global_size_type *copyFromVecStartingContiguousBlockIds); - - template void - BLASWrapper::stridedCopyFromBlock( - const dftfe::size_type contiguousBlockSize, - const dftfe::size_type numContiguousBlocks, - const std::complex * copyFromVecBlock, - std::complex * copyToVec, - const dftfe::global_size_type *copyFromVecStartingContiguousBlockIds); - - template void - BLASWrapper:: - copyValueType1ArrToValueType2Arr(const dftfe::size_type size, - const double * valueType1Arr, - std::complex * valueType2Arr); - - template void - BLASWrapper:: - copyValueType1ArrToValueType2Arr(const dftfe::size_type size, - const double * valueType1Arr, - std::complex * valueType2Arr); - - template void - BLASWrapper:: - copyValueType1ArrToValueType2Arr(const dftfe::size_type size, - const double * valueType1Arr, - double * valueType2Arr); - template void - BLASWrapper:: - copyValueType1ArrToValueType2Arr(const dftfe::size_type size, - const double * valueType1Arr, - float * valueType2Arr); - - template void - BLASWrapper:: - copyValueType1ArrToValueType2Arr( - const dftfe::size_type size, - const std::complex *valueType1Arr, - std::complex * valueType2Arr); - - template void - BLASWrapper:: - stridedCopyToBlockConstantStride(const dftfe::size_type blockSizeTo, - const dftfe::size_type blockSizeFrom, - const dftfe::size_type numBlocks, - const dftfe::size_type startingId, - const double * copyFromVec, - double * copyToVec); - - template void - BLASWrapper:: - stridedCopyToBlockConstantStride(const dftfe::size_type blockSizeTo, - const dftfe::size_type blockSizeFrom, - const dftfe::size_type numBlocks, - const dftfe::size_type startingId, - const std::complex *copyFromVec, - std::complex * copyToVec); - - - // axpyStridedBlockAtomicAdd - template void - BLASWrapper::axpyStridedBlockAtomicAdd( - const dftfe::size_type contiguousBlockSize, - const dftfe::size_type numContiguousBlocks, - const double * addFromVec, - double * addToVec, - const dftfe::global_size_type *addToVecStartingContiguousBlockIds) const; - - template void - BLASWrapper::axpyStridedBlockAtomicAdd( - const dftfe::size_type contiguousBlockSize, - const dftfe::size_type numContiguousBlocks, - const std::complex * addFromVec, - std::complex * addToVec, - const dftfe::global_size_type *addToVecStartingContiguousBlockIds) const; - - template void - BLASWrapper::axpyStridedBlockAtomicAdd( - const dftfe::size_type contiguousBlockSize, - const dftfe::size_type numContiguousBlocks, - const double a, - const double * s, - const double * addFromVec, - double * addToVec, - const dftfe::global_size_type *addToVecStartingContiguousBlockIds) const; - - template void - BLASWrapper::axpyStridedBlockAtomicAdd( - const dftfe::size_type contiguousBlockSize, - const dftfe::size_type numContiguousBlocks, - const double a, - const double * s, - const std::complex * addFromVec, - std::complex * addToVec, - const dftfe::global_size_type *addToVecStartingContiguousBlockIds) const; - - template void - BLASWrapper::axpyStridedBlockAtomicAdd( - const dftfe::size_type contiguousBlockSize, - const dftfe::size_type numContiguousBlocks, - const double a, - const double * s, - const float * addFromVec, - float * addToVec, - const dftfe::global_size_type *addToVecStartingContiguousBlockIds) const; - - template void - BLASWrapper::axpyStridedBlockAtomicAdd( - const dftfe::size_type contiguousBlockSize, - const dftfe::size_type numContiguousBlocks, - const double a, - const double * s, - const std::complex * addFromVec, - std::complex * addToVec, - const dftfe::global_size_type *addToVecStartingContiguousBlockIds) const; - - template void - BLASWrapper::axpyStridedBlockAtomicAdd( - const dftfe::size_type contiguousBlockSize, - const dftfe::size_type numContiguousBlocks, - const float a, - const float * s, - const float * addFromVec, - float * addToVec, - const dftfe::global_size_type *addToVecStartingContiguousBlockIds) const; - - template void - BLASWrapper::axpyStridedBlockAtomicAdd( - const dftfe::size_type contiguousBlockSize, - const dftfe::size_type numContiguousBlocks, - const float a, - const float * s, - const std::complex * addFromVec, - std::complex * addToVec, - const dftfe::global_size_type *addToVecStartingContiguousBlockIds) const; - - template void - BLASWrapper::axpby(const unsigned int n, - const double alpha, - const double *x, - const double beta, - double *y) const; - - - template void - BLASWrapper::axpby( - const unsigned int n, - const double alpha, - const std::complex *x, - const double beta, - std::complex * y) const; - - template void - BLASWrapper::axpby(const unsigned int n, - const double alpha, - const float *x, - const double beta, - float * y) const; - - - template void - BLASWrapper::axpby( - const unsigned int n, - const double alpha, - const std::complex *x, - const double beta, - std::complex * y) const; - - template void - BLASWrapper::ApaBD(const unsigned int m, - const unsigned int n, - const double alpha, - const double *A, - const double *B, - const double *D, - double *C) const; - template void - BLASWrapper::ApaBD( - const unsigned int m, - const unsigned int n, - const double alpha, - const std::complex *A, - const std::complex *B, - const double * D, - std::complex * C) const; - - template void - BLASWrapper::ApaBD(const unsigned int m, - const unsigned int n, - const double alpha, - const float * A, - const double *B, - const double *D, - float *C) const; - template void - BLASWrapper::ApaBD(const unsigned int m, - const unsigned int n, - const double alpha, - const float * A, - const double *B, - const double *D, - double *C) const; - template void - BLASWrapper::ApaBD( - const unsigned int m, - const unsigned int n, - const double alpha, - const std::complex * A, - const std::complex *B, - const double * D, - std::complex * C) const; - - template void - BLASWrapper::ApaBD( - const unsigned int m, - const unsigned int n, - const double alpha, - const std::complex * A, - const std::complex *B, - const double * D, - std::complex * C) const; - - template void - BLASWrapper::copyRealArrsToComplexArr( - const dftfe::size_type size, - const double * realArr, - const double * imagArr, - std::complex * complexArr); +#include "./BLASWrapperDevice.inst.cc" } // End of namespace linearAlgebra } // End of namespace dftfe diff --git a/utils/BLASWrapperDevice.inst.cc b/utils/BLASWrapperDevice.inst.cc new file mode 100644 index 000000000..0238120a8 --- /dev/null +++ b/utils/BLASWrapperDevice.inst.cc @@ -0,0 +1,457 @@ +template void +BLASWrapper::stridedBlockScaleCopy( + const dftfe::size_type contiguousBlockSize, + const dftfe::size_type numContiguousBlocks, + const double a, + const double * s, + const double * copyFromVec, + double * copyToVecBlock, + const dftfe::global_size_type *addToVecStartingContiguousBlockIds); +template void +BLASWrapper::stridedBlockScaleCopy( + const dftfe::size_type contiguousBlockSize, + const dftfe::size_type numContiguousBlocks, + const double a, + const double * s, + const std::complex * copyFromVec, + std::complex * copyToVecBlock, + const dftfe::global_size_type *addToVecStartingContiguousBlockIds); +// for stridedBlockScale +template void +BLASWrapper::stridedBlockScale( + const dftfe::size_type contiguousBlockSize, + const dftfe::size_type numContiguousBlocks, + const double a, + const double * s, + double * x); + +template void +BLASWrapper::stridedBlockScale( + const dftfe::size_type contiguousBlockSize, + const dftfe::size_type numContiguousBlocks, + const float a, + const float * s, + float * x); + +template void +BLASWrapper::stridedBlockScale( + const dftfe::size_type contiguousBlockSize, + const dftfe::size_type numContiguousBlocks, + const std::complex a, + const std::complex *s, + std::complex * x); + +template void +BLASWrapper::stridedBlockScale( + const dftfe::size_type contiguousBlockSize, + const dftfe::size_type numContiguousBlocks, + const std::complex a, + const std::complex *s, + std::complex * x); + +template void +BLASWrapper::stridedBlockScale( + const dftfe::size_type contiguousBlockSize, + const dftfe::size_type numContiguousBlocks, + const double a, + const double * s, + float * x); + +template void +BLASWrapper::stridedBlockScale( + const dftfe::size_type contiguousBlockSize, + const dftfe::size_type numContiguousBlocks, + const float a, + const float * s, + double * x); + +template void +BLASWrapper::stridedBlockScale( + const dftfe::size_type contiguousBlockSize, + const dftfe::size_type numContiguousBlocks, + const std::complex a, + const std::complex *s, + std::complex * x); + +template void +BLASWrapper::stridedBlockScale( + const dftfe::size_type contiguousBlockSize, + const dftfe::size_type numContiguousBlocks, + const std::complex a, + const std::complex *s, + std::complex * x); + +template void +BLASWrapper::stridedBlockScale( + const dftfe::size_type contiguousBlockSize, + const dftfe::size_type numContiguousBlocks, + const double a, + const double * s, + std::complex * x); + +template void +BLASWrapper::stridedBlockScale( + const dftfe::size_type contiguousBlockSize, + const dftfe::size_type numContiguousBlocks, + const double a, + const double * s, + std::complex * x); +// for xscal +template void +BLASWrapper::xscal( + double * x, + const double a, + const dftfe::size_type n) const; + +template void +BLASWrapper::xscal( + float * x, + const float a, + const dftfe::size_type n) const; + +template void +BLASWrapper::xscal( + std::complex * x, + const std::complex a, + const dftfe::size_type n) const; + +template void +BLASWrapper::xscal( + std::complex * x, + const std::complex a, + const dftfe::size_type n) const; + +template void +BLASWrapper::xscal( + std::complex * x, + const double a, + const dftfe::size_type n) const; + +template void +BLASWrapper::stridedCopyToBlock( + const dftfe::size_type contiguousBlockSize, + const dftfe::size_type numContiguousBlocks, + const double * copyFromVec, + double * copyToVecBlock, + const dftfe::global_size_type *copyFromVecStartingContiguousBlockIds); + +template void +BLASWrapper::stridedCopyToBlock( + const dftfe::size_type contiguousBlockSize, + const dftfe::size_type numContiguousBlocks, + const double * copyFromVec, + float * copyToVecBlock, + const dftfe::global_size_type *copyFromVecStartingContiguousBlockIds); + +template void +BLASWrapper::stridedCopyToBlock( + const dftfe::size_type contiguousBlockSize, + const dftfe::size_type numContiguousBlocks, + const float * copyFromVec, + float * copyToVecBlock, + const dftfe::global_size_type *copyFromVecStartingContiguousBlockIds); + +template void +BLASWrapper::stridedCopyToBlock( + const dftfe::size_type contiguousBlockSize, + const dftfe::size_type numContiguousBlocks, + const std::complex * copyFromVec, + std::complex * copyToVecBlock, + const dftfe::global_size_type *copyFromVecStartingContiguousBlockIds); + +template void +BLASWrapper::stridedCopyToBlock( + const dftfe::size_type contiguousBlockSize, + const dftfe::size_type numContiguousBlocks, + const std::complex * copyFromVec, + std::complex * copyToVecBlock, + const dftfe::global_size_type *copyFromVecStartingContiguousBlockIds); + +template void +BLASWrapper::stridedCopyToBlock( + const dftfe::size_type contiguousBlockSize, + const dftfe::size_type numContiguousBlocks, + const std::complex * copyFromVec, + std::complex * copyToVecBlock, + const dftfe::global_size_type *copyFromVecStartingContiguousBlockIds); + +// strided copy from block +template void +BLASWrapper::stridedCopyFromBlock( + const dftfe::size_type contiguousBlockSize, + const dftfe::size_type numContiguousBlocks, + const double * copyFromVecBlock, + double * copyToVec, + const dftfe::global_size_type *copyFromVecStartingContiguousBlockIds); + +template void +BLASWrapper::stridedCopyFromBlock( + const dftfe::size_type contiguousBlockSize, + const dftfe::size_type numContiguousBlocks, + const float * copyFromVecBlock, + float * copyToVec, + const dftfe::global_size_type *copyFromVecStartingContiguousBlockIds); + +template void +BLASWrapper::stridedCopyFromBlock( + const dftfe::size_type contiguousBlockSize, + const dftfe::size_type numContiguousBlocks, + const std::complex * copyFromVecBlock, + std::complex * copyToVec, + const dftfe::global_size_type *copyFromVecStartingContiguousBlockIds); + +template void +BLASWrapper::stridedCopyFromBlock( + const dftfe::size_type contiguousBlockSize, + const dftfe::size_type numContiguousBlocks, + const std::complex * copyFromVecBlock, + std::complex * copyToVec, + const dftfe::global_size_type *copyFromVecStartingContiguousBlockIds); + +template void +BLASWrapper::stridedCopyFromBlock( + const dftfe::size_type contiguousBlockSize, + const dftfe::size_type numContiguousBlocks, + const double * copyFromVecBlock, + float * copyToVec, + const dftfe::global_size_type *copyFromVecStartingContiguousBlockIds); + +template void +BLASWrapper::stridedCopyFromBlock( + const dftfe::size_type contiguousBlockSize, + const dftfe::size_type numContiguousBlocks, + const float * copyFromVecBlock, + double * copyToVec, + const dftfe::global_size_type *copyFromVecStartingContiguousBlockIds); + +template void +BLASWrapper::stridedCopyFromBlock( + const dftfe::size_type contiguousBlockSize, + const dftfe::size_type numContiguousBlocks, + const std::complex * copyFromVecBlock, + std::complex * copyToVec, + const dftfe::global_size_type *copyFromVecStartingContiguousBlockIds); + +template void +BLASWrapper::stridedCopyFromBlock( + const dftfe::size_type contiguousBlockSize, + const dftfe::size_type numContiguousBlocks, + const std::complex * copyFromVecBlock, + std::complex * copyToVec, + const dftfe::global_size_type *copyFromVecStartingContiguousBlockIds); + + +template void +BLASWrapper:: + copyValueType1ArrToValueType2Arr(const dftfe::size_type size, + const double * valueType1Arr, + std::complex * valueType2Arr); + +template void +BLASWrapper:: + copyValueType1ArrToValueType2Arr(const dftfe::size_type size, + const double * valueType1Arr, + std::complex * valueType2Arr); + +template void +BLASWrapper:: + copyValueType1ArrToValueType2Arr(const dftfe::size_type size, + const double * valueType1Arr, + double * valueType2Arr); +template void +BLASWrapper:: + copyValueType1ArrToValueType2Arr(const dftfe::size_type size, + const double * valueType1Arr, + float * valueType2Arr); +template void +BLASWrapper:: + copyValueType1ArrToValueType2Arr(const dftfe::size_type size, + const std::complex *valueType1Arr, + std::complex * valueType2Arr); + +template void +BLASWrapper:: + stridedCopyToBlockConstantStride(const dftfe::size_type blockSizeTo, + const dftfe::size_type blockSizeFrom, + const dftfe::size_type numBlocks, + const dftfe::size_type startingId, + const double * copyFromVec, + double * copyToVec); + +template void +BLASWrapper:: + stridedCopyToBlockConstantStride(const dftfe::size_type blockSizeTo, + const dftfe::size_type blockSizeFrom, + const dftfe::size_type numBlocks, + const dftfe::size_type startingId, + const std::complex *copyFromVec, + std::complex * copyToVec); +// axpyStridedBlockAtomicAdd +template void +BLASWrapper::axpyStridedBlockAtomicAdd( + const dftfe::size_type contiguousBlockSize, + const dftfe::size_type numContiguousBlocks, + const double * addFromVec, + double * addToVec, + const dftfe::global_size_type *addToVecStartingContiguousBlockIds) const; + +template void +BLASWrapper::axpyStridedBlockAtomicAdd( + const dftfe::size_type contiguousBlockSize, + const dftfe::size_type numContiguousBlocks, + const std::complex * addFromVec, + std::complex * addToVec, + const dftfe::global_size_type *addToVecStartingContiguousBlockIds) const; + +template void +BLASWrapper::axpyStridedBlockAtomicAdd( + const dftfe::size_type contiguousBlockSize, + const dftfe::size_type numContiguousBlocks, + const double a, + const double * s, + const double * addFromVec, + double * addToVec, + const dftfe::global_size_type *addToVecStartingContiguousBlockIds) const; + +template void +BLASWrapper::axpyStridedBlockAtomicAdd( + const dftfe::size_type contiguousBlockSize, + const dftfe::size_type numContiguousBlocks, + const double a, + const double * s, + const std::complex * addFromVec, + std::complex * addToVec, + const dftfe::global_size_type *addToVecStartingContiguousBlockIds) const; + +template void +BLASWrapper::axpyStridedBlockAtomicAdd( + const dftfe::size_type contiguousBlockSize, + const dftfe::size_type numContiguousBlocks, + const double a, + const double * s, + const float * addFromVec, + float * addToVec, + const dftfe::global_size_type *addToVecStartingContiguousBlockIds) const; + +template void +BLASWrapper::axpyStridedBlockAtomicAdd( + const dftfe::size_type contiguousBlockSize, + const dftfe::size_type numContiguousBlocks, + const double a, + const double * s, + const std::complex * addFromVec, + std::complex * addToVec, + const dftfe::global_size_type *addToVecStartingContiguousBlockIds) const; + +template void +BLASWrapper::axpyStridedBlockAtomicAdd( + const dftfe::size_type contiguousBlockSize, + const dftfe::size_type numContiguousBlocks, + const float a, + const float * s, + const float * addFromVec, + float * addToVec, + const dftfe::global_size_type *addToVecStartingContiguousBlockIds) const; + +template void +BLASWrapper::axpyStridedBlockAtomicAdd( + const dftfe::size_type contiguousBlockSize, + const dftfe::size_type numContiguousBlocks, + const float a, + const float * s, + const std::complex * addFromVec, + std::complex * addToVec, + const dftfe::global_size_type *addToVecStartingContiguousBlockIds) const; + +template void +BLASWrapper::axpby(const unsigned int n, + const double alpha, + const double * x, + const double beta, + double *y) const; + + +template void +BLASWrapper::axpby( + const unsigned int n, + const double alpha, + const std::complex *x, + const double beta, + std::complex * y) const; + +template void +BLASWrapper::axpby(const unsigned int n, + const double alpha, + const float * x, + const double beta, + float *y) const; + + +template void +BLASWrapper::axpby( + const unsigned int n, + const double alpha, + const std::complex *x, + const double beta, + std::complex * y) const; + +template void +BLASWrapper::ApaBD(const unsigned int m, + const unsigned int n, + const double alpha, + const double * A, + const double * B, + const double * D, + double *C) const; +template void +BLASWrapper::ApaBD( + const unsigned int m, + const unsigned int n, + const double alpha, + const std::complex *A, + const std::complex *B, + const double * D, + std::complex * C) const; + +template void +BLASWrapper::ApaBD(const unsigned int m, + const unsigned int n, + const double alpha, + const float * A, + const double * B, + const double * D, + float *C) const; +template void +BLASWrapper::ApaBD(const unsigned int m, + const unsigned int n, + const double alpha, + const float * A, + const double * B, + const double * D, + double *C) const; +template void +BLASWrapper::ApaBD( + const unsigned int m, + const unsigned int n, + const double alpha, + const std::complex * A, + const std::complex *B, + const double * D, + std::complex * C) const; + +template void +BLASWrapper::ApaBD( + const unsigned int m, + const unsigned int n, + const double alpha, + const std::complex * A, + const std::complex *B, + const double * D, + std::complex * C) const; + +template void +BLASWrapper::copyRealArrsToComplexArr( + const dftfe::size_type size, + const double * realArr, + const double * imagArr, + std::complex * complexArr); diff --git a/utils/dftParameters.cc b/utils/dftParameters.cc index 0f113d44e..9fd04bfc8 100644 --- a/utils/dftParameters.cc +++ b/utils/dftParameters.cc @@ -879,7 +879,7 @@ namespace dftfe prm.declare_entry( "SPECTRUM SPLIT STARTING SCF ITER", - "1000", + "0", dealii::Patterns::Integer(0), "[Advanced] SCF iteration no beyond which spectrum splitting based can be used."); From dee2731c16631f29230b8cacd9852b5dd5456052 Mon Sep 17 00:00:00 2001 From: Nikhil Kodali Date: Fri, 26 Apr 2024 11:31:26 +0530 Subject: [PATCH 05/11] Fix merge conflicts --- include/constraintMatrixInfo.h | 8 ++------ utils/constraintMatrixInfo.cc | 30 +++++++++++++++++++++-------- utils/constraintMatrixInfoDevice.cc | 21 +++++++++++--------- 3 files changed, 36 insertions(+), 23 deletions(-) diff --git a/include/constraintMatrixInfo.h b/include/constraintMatrixInfo.h index b15dc0c19..19265a36e 100644 --- a/include/constraintMatrixInfo.h +++ b/include/constraintMatrixInfo.h @@ -237,14 +237,10 @@ namespace dftfe * care of constraints * @param blockSize number of components for a given node */ + template void distribute_slave_to_master( - distributedDeviceVec &fieldVector) const; - - void - distribute_slave_to_master( - distributedDeviceVec> &fieldVector) const; - + distributedDeviceVec &fieldVector) const; template void diff --git a/utils/constraintMatrixInfo.cc b/utils/constraintMatrixInfo.cc index d8945db13..d9a8f9e96 100644 --- a/utils/constraintMatrixInfo.cc +++ b/utils/constraintMatrixInfo.cc @@ -521,27 +521,41 @@ namespace dftfe template void constraintMatrixInfo::distribute( - distributedCPUMultiVec &fieldVector) const; + dftfe::linearAlgebra::MultiVector + &fieldVector) const; template void constraintMatrixInfo::distribute( - dftfe::linearAlgebra::MultiVector, dftfe::utils::MemorySpace::HOST> &fieldVector) const; + dftfe::linearAlgebra::MultiVector, + dftfe::utils::MemorySpace::HOST> + &fieldVector) const; template void - constraintMatrixInfo::distribute_slave_to_master( - dftfe::linearAlgebra::MultiVector &fieldVector) const; + constraintMatrixInfo:: + distribute_slave_to_master( + dftfe::linearAlgebra::MultiVector + &fieldVector) const; template void - constraintMatrixInfo::distribute_slave_to_master( - dftfe::linearAlgebra::MultiVector &fieldVector) const; + constraintMatrixInfo:: + distribute_slave_to_master( + dftfe::linearAlgebra::MultiVector + &fieldVector) const; template void constraintMatrixInfo::set_zero( - dftfe::linearAlgebra::MultiVector &fieldVector) const; + dftfe::linearAlgebra::MultiVector + &fieldVector) const; template void constraintMatrixInfo::set_zero( - dftfe::linearAlgebra::MultiVector &fieldVector) const; + dftfe::linearAlgebra::MultiVector + &fieldVector) const; + template class constraintMatrixInfo; } // namespace dftUtils diff --git a/utils/constraintMatrixInfoDevice.cc b/utils/constraintMatrixInfoDevice.cc index a7271dee2..fc47463c5 100644 --- a/utils/constraintMatrixInfoDevice.cc +++ b/utils/constraintMatrixInfoDevice.cc @@ -967,22 +967,25 @@ namespace dftfe distributedDeviceVec> &fieldVector) const; template void - constraintMatrixInfo::distribute_slave_to_master( - distributedDeviceVec &fieldVector) const; + constraintMatrixInfo:: + distribute_slave_to_master( + distributedDeviceVec &fieldVector) const; template void - constraintMatrixInfo::distribute_slave_to_master( - distributedDeviceVec> &fieldVector) const; + constraintMatrixInfo:: + distribute_slave_to_master( + distributedDeviceVec> &fieldVector) const; template void - constraintMatrixInfo::distribute_slave_to_master( - distributedDeviceVec &fieldVector) const; + constraintMatrixInfo:: + distribute_slave_to_master( + distributedDeviceVec &fieldVector) const; template void - constraintMatrixInfo::distribute_slave_to_master( - distributedDeviceVec> &fieldVector) const; + constraintMatrixInfo:: + distribute_slave_to_master( + distributedDeviceVec> &fieldVector) const; template class constraintMatrixInfo; - } // namespace dftUtils } // namespace dftfe From d7dbfef2c2fdf6de6b5a6b72015c1025ca1eec62 Mon Sep 17 00:00:00 2001 From: Nikhil Kodali Date: Fri, 26 Apr 2024 11:36:12 +0530 Subject: [PATCH 06/11] Fix parameters --- utils/dftParameters.cc | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/utils/dftParameters.cc b/utils/dftParameters.cc index d517b7ef4..2711bf49b 100644 --- a/utils/dftParameters.cc +++ b/utils/dftParameters.cc @@ -970,10 +970,10 @@ namespace dftfe "[Advanced] Use mixed precision arithmetic in Rayleigh-Ritz subspace rotation step. Default setting is false."); prm.declare_entry( - "USE MIXED PREC CHEBY", + "USE SINGLE PREC COMMUN CHEBY", "false", dealii::Patterns::Bool(), - "[Advanced] Use mixed precision arithmetic in Chebyshev filtering. Currently this option is only available for real executable and USE ELPA=true for which DFT-FE also has to be linked to ELPA library. Default setting is false."); + "[Advanced] Use single precision communication in Chebyshev filtering. Currently this option is only available if USE ELPA=true for which DFT-FE also has to be linked to ELPA library. Default setting is false."); prm.declare_entry( "USE MIXED PREC COMMUN ONLY XTX XTHX", @@ -985,7 +985,7 @@ namespace dftfe "USE SINGLE PREC CHEBY", "false", dealii::Patterns::Bool(), - "[Advanced] Use a modified single precision algorithm Chebyshev filtering."); + "[Advanced] Use a modified single precision algorithm Chebyshev filtering. This cannot be used in conjunction with spectrum splitting"); prm.declare_entry( "OVERLAP COMPUTE COMMUN CHEBY", @@ -1610,7 +1610,7 @@ namespace dftfe useMixedPrecSubspaceRotRR = prm.get_bool("USE MIXED PREC RR_SR"); useMixedPrecCommunOnlyXTHXCGSO = prm.get_bool("USE MIXED PREC COMMUN ONLY XTX XTHX"); - useMixedPrecCheby = prm.get_bool("USE MIXED PREC CHEBY"); + useMixedPrecCheby = prm.get_bool("USE SINGLE PREC COMMUN CHEBY"); useSinglePrecCheby = prm.get_bool("USE SINGLE PREC CHEBY"); overlapComputeCommunCheby = prm.get_bool("OVERLAP COMPUTE COMMUN CHEBY"); @@ -2014,6 +2014,9 @@ namespace dftfe if (numCoreWfcRR == 0) spectrumSplitStartingScfIter = 10000; + + if (numCoreWfcRR != 0) + useSinglePrecCheby = false; } From 8f76612a9caa6905f5dd4aba7954a835361cc83e Mon Sep 17 00:00:00 2001 From: Nikhil Kodali Date: Fri, 26 Apr 2024 11:41:43 +0530 Subject: [PATCH 07/11] Update documentation --- doc/manual/parameters.tex | 29 +++++++++++++++++++++++------ utils/dftParameters.cc | 4 ++-- 2 files changed, 25 insertions(+), 8 deletions(-) diff --git a/doc/manual/parameters.tex b/doc/manual/parameters.tex index a87b0ea7a..2d7a66e50 100644 --- a/doc/manual/parameters.tex +++ b/doc/manual/parameters.tex @@ -2872,20 +2872,20 @@ \subsection{Parameters in section \tt SCF parameters/Eigen-solver parameters} {\it Possible values:} A boolean value (true or false) -\item {\it Parameter name:} {\tt USE MIXED PREC CHEBY} -\phantomsection\label{parameters:SCF parameters/Eigen_2dsolver parameters/USE MIXED PREC CHEBY} -\label{parameters:SCF_20parameters/Eigen_2dsolver_20parameters/USE_20MIXED_20PREC_20CHEBY} +\item {\it Parameter name:} {\tt USE SINGLE PREC COMMUN CHEBY} +\phantomsection\label{parameters:SCF parameters/Eigen_2dsolver parameters/USE USE SINGLE PREC COMMUN CHEBY} +\label{parameters:SCF_20parameters/Eigen_2dsolver_20parameters/USE_20SINGLE_20PREC_20COMMUN_20CHEBY} -\index[prmindex]{USE MIXED PREC CHEBY} -\index[prmindexfull]{SCF parameters!Eigen-solver parameters!USE MIXED PREC CHEBY} +\index[prmindex]{USE SINGLE PREC COMMUN CHEBY} +\index[prmindexfull]{SCF parameters!Eigen-solver parameters!USE SINGLE PREC COMMUN CHEBY} {\it Value:} false {\it Default:} false -{\it Description:} [Advanced] Use mixed precision arithmetic in Chebyshev filtering. Currently this option is only available for real executable and USE ELPA=true for which DFT-FE also has to be linked to ELPA library. Default setting is false. +{\it Description:} [Advanced] Use single precision communication in Chebyshev filtering. Currently this option is only available if USE ELPA=true for which DFT-FE also has to be linked to ELPA library. Default setting is false. {\it Possible values:} A boolean value (true or false) @@ -2906,6 +2906,23 @@ \subsection{Parameters in section \tt SCF parameters/Eigen-solver parameters} {\it Possible values:} A boolean value (true or false) +\item {\it Parameter name:} {\tt USE SINGLE PREC CHEBY} +\phantomsection\label{parameters:SCF parameters/Eigen_2dsolver parameters/USE USE SINGLE PREC CHEBY} +\label{parameters:SCF_20parameters/Eigen_2dsolver_20parameters/USE_20SINGLE_20PREC_20CHEBY} + + +\index[prmindex]{USE SINGLE PREC CHEBY} +\index[prmindexfull]{SCF parameters!Eigen-solver parameters!USE SINGLE PREC CHEBY} +{\it Value:} false + + +{\it Default:} false + + +{\it Description:} [Advanced] Use a modified single precision algorithm for Chebyshev filtering. This cannot be used in conjunction with spectrum splitting. Default setting is false. + +{\it Possible values:} A boolean value (true or false) + \item {\it Parameter name:} {\tt USE MIXED PREC RR\_SR} \phantomsection\label{parameters:SCF parameters/Eigen_2dsolver parameters/USE MIXED PREC RR_5fSR} \label{parameters:SCF_20parameters/Eigen_2dsolver_20parameters/USE_20MIXED_20PREC_20RR_5fSR} diff --git a/utils/dftParameters.cc b/utils/dftParameters.cc index 2711bf49b..4f0bee4c2 100644 --- a/utils/dftParameters.cc +++ b/utils/dftParameters.cc @@ -985,7 +985,7 @@ namespace dftfe "USE SINGLE PREC CHEBY", "false", dealii::Patterns::Bool(), - "[Advanced] Use a modified single precision algorithm Chebyshev filtering. This cannot be used in conjunction with spectrum splitting"); + "[Advanced] Use a modified single precision algorithm for Chebyshev filtering. This cannot be used in conjunction with spectrum splitting. Default setting is false."); prm.declare_entry( "OVERLAP COMPUTE COMMUN CHEBY", @@ -1973,7 +1973,7 @@ namespace dftfe AssertThrow( useELPA, dealii::ExcMessage( - "DFT-FE Error: USE ELPA must be set to true for USE MIXED PREC CHEBY.")); + "DFT-FE Error: USE ELPA must be set to true for USE SINGLE PREC COMMUN CHEBY.")); if (verbosity >= 5) computeEnergyEverySCF = true; From 4a2868ddbe8e8458e15c248c2871293fe5e6ac99 Mon Sep 17 00:00:00 2001 From: Nikhil Kodali Date: Fri, 26 Apr 2024 11:53:56 +0530 Subject: [PATCH 08/11] Undo verbosity change --- src/linAlg/linearAlgebraOperationsOpt.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/linAlg/linearAlgebraOperationsOpt.cc b/src/linAlg/linearAlgebraOperationsOpt.cc index ed09bdb21..499211267 100644 --- a/src/linAlg/linearAlgebraOperationsOpt.cc +++ b/src/linAlg/linearAlgebraOperationsOpt.cc @@ -3040,7 +3040,7 @@ namespace dftfe std::ceil(eigenValuesT[lanczosIterations - 1] + (dftParams.reproducible_output ? YNorm : YNorm / 10.0)); - if (dftParams.verbosity >= 4 && this_mpi_process == 0) + if (dftParams.verbosity >= 5 && this_mpi_process == 0) { std::cout << "bUp1: " << eigenValuesT[lanczosIterations - 1] << ", fvector norm: " << YNorm << std::endl; From 9d2bdfbcb7f9f89c9f34d5bec263cbeeed62eb4a Mon Sep 17 00:00:00 2001 From: Nikhil Kodali Date: Sat, 8 Jun 2024 16:13:43 +0530 Subject: [PATCH 09/11] Change variable name --- include/dftParameters.h | 2 +- src/dft/energyCalculator.cc | 2 +- ...ogonalizedSubspaceIterationSolverDevice.cc | 28 +++++++++++-------- utils/dftParameters.cc | 14 +++------- 4 files changed, 22 insertions(+), 24 deletions(-) diff --git a/include/dftParameters.h b/include/dftParameters.h index 7aca3f73f..7c0d17af0 100644 --- a/include/dftParameters.h +++ b/include/dftParameters.h @@ -135,7 +135,7 @@ namespace dftfe bool useDevice; bool deviceFineGrainedTimings; bool allowFullCPUMemSubspaceRot; - bool useMixedPrecCheby; + bool useSinglePrecCommunCheby; bool useSinglePrecCheby; bool overlapComputeCommunCheby; bool overlapComputeCommunOrthoRR; diff --git a/src/dft/energyCalculator.cc b/src/dft/energyCalculator.cc index dfb51a744..6512208fc 100644 --- a/src/dft/energyCalculator.cc +++ b/src/dft/energyCalculator.cc @@ -117,7 +117,7 @@ namespace dftfe pcout << std::endl << "Energy computations (Hartree) " << std::endl; pcout << "-------------------" << std::endl; if (dftParams.useMixedPrecCGS_O || dftParams.useMixedPrecCGS_SR || - dftParams.useMixedPrecCheby) + dftParams.useSinglePrecCommunCheby) pcout << std::setw(25) << "Total energy" << ": " << std::fixed << std::setprecision(6) << std::setw(20) << totalEnergyTrunc << std::endl; diff --git a/src/solvers/eigenSolvers/chebyshevOrthogonalizedSubspaceIterationSolverDevice.cc b/src/solvers/eigenSolvers/chebyshevOrthogonalizedSubspaceIterationSolverDevice.cc index 24490e39f..d90d65e72 100644 --- a/src/solvers/eigenSolvers/chebyshevOrthogonalizedSubspaceIterationSolverDevice.cc +++ b/src/solvers/eigenSolvers/chebyshevOrthogonalizedSubspaceIterationSolverDevice.cc @@ -426,7 +426,8 @@ namespace dftfe { eigenValuesBlock[i] = eigenValues[jvec + i]; } - if (useMixedPrecOverall && d_dftParams.useMixedPrecCheby) + if (useMixedPrecOverall && + d_dftParams.useSinglePrecCommunCheby) { (*XBlock).setCommunicationPrecision( dftfe::utils::mpi::communicationPrecision::single); @@ -454,7 +455,8 @@ namespace dftfe d_lowerBoundUnWantedSpectrum, d_upperBoundUnWantedSpectrum, d_lowerBoundWantedSpectrum); - if (useMixedPrecOverall && d_dftParams.useMixedPrecCheby) + if (useMixedPrecOverall && + d_dftParams.useSinglePrecCommunCheby) { (*XBlock).setCommunicationPrecision( dftfe::utils::mpi::communicationPrecision::full); @@ -472,7 +474,8 @@ namespace dftfe { eigenValuesBlock[i] = eigenValues[jvec + i]; } - if (useMixedPrecOverall && d_dftParams.useMixedPrecCheby) + if (useMixedPrecOverall && + d_dftParams.useSinglePrecCommunCheby) { (*XBlock).setCommunicationPrecision( dftfe::utils::mpi::communicationPrecision::single); @@ -492,7 +495,8 @@ namespace dftfe d_upperBoundUnWantedSpectrum, d_lowerBoundWantedSpectrum); - if (useMixedPrecOverall && d_dftParams.useMixedPrecCheby) + if (useMixedPrecOverall && + d_dftParams.useSinglePrecCommunCheby) { (*XBlock).setCommunicationPrecision( dftfe::utils::mpi::communicationPrecision::full); @@ -504,7 +508,7 @@ namespace dftfe else if (d_dftParams.overlapComputeCommunCheby && numSimultaneousBlocksCurrent == 2) { - if (useMixedPrecOverall && d_dftParams.useMixedPrecCheby) + if (useMixedPrecOverall && d_dftParams.useSinglePrecCommunCheby) { (*XBlock).setCommunicationPrecision( dftfe::utils::mpi::communicationPrecision::single); @@ -526,7 +530,7 @@ namespace dftfe d_lowerBoundUnWantedSpectrum, d_upperBoundUnWantedSpectrum, d_lowerBoundWantedSpectrum); - if (useMixedPrecOverall && d_dftParams.useMixedPrecCheby) + if (useMixedPrecOverall && d_dftParams.useSinglePrecCommunCheby) { (*XBlock).setCommunicationPrecision( dftfe::utils::mpi::communicationPrecision::full); @@ -540,7 +544,7 @@ namespace dftfe } else { - if (useMixedPrecOverall && d_dftParams.useMixedPrecCheby) + if (useMixedPrecOverall && d_dftParams.useSinglePrecCommunCheby) { (*XBlock).setCommunicationPrecision( dftfe::utils::mpi::communicationPrecision::single); @@ -555,7 +559,7 @@ namespace dftfe d_lowerBoundUnWantedSpectrum, d_upperBoundUnWantedSpectrum, d_lowerBoundWantedSpectrum); - if (useMixedPrecOverall && d_dftParams.useMixedPrecCheby) + if (useMixedPrecOverall && d_dftParams.useSinglePrecCommunCheby) { (*XBlock).setCommunicationPrecision( dftfe::utils::mpi::communicationPrecision::full); @@ -1025,7 +1029,7 @@ namespace dftfe numSimultaneousBlocksCurrent == 2) { if (useMixedPrecOverall && - d_dftParams.useMixedPrecCheby) + d_dftParams.useSinglePrecCommunCheby) { (*XBlock).setCommunicationPrecision( dftfe::utils::mpi::communicationPrecision:: @@ -1053,7 +1057,7 @@ namespace dftfe d_upperBoundUnWantedSpectrum, d_lowerBoundWantedSpectrum); if (useMixedPrecOverall && - d_dftParams.useMixedPrecCheby) + d_dftParams.useSinglePrecCommunCheby) { (*XBlock).setCommunicationPrecision( dftfe::utils::mpi::communicationPrecision::full); @@ -1068,7 +1072,7 @@ namespace dftfe else { if (useMixedPrecOverall && - d_dftParams.useMixedPrecCheby) + d_dftParams.useSinglePrecCommunCheby) { (*XBlock).setCommunicationPrecision( dftfe::utils::mpi::communicationPrecision:: @@ -1086,7 +1090,7 @@ namespace dftfe d_upperBoundUnWantedSpectrum, d_lowerBoundWantedSpectrum); if (useMixedPrecOverall && - d_dftParams.useMixedPrecCheby) + d_dftParams.useSinglePrecCommunCheby) { (*XBlock).setCommunicationPrecision( dftfe::utils::mpi::communicationPrecision::full); diff --git a/utils/dftParameters.cc b/utils/dftParameters.cc index 1c9a90322..81a3f3f87 100644 --- a/utils/dftParameters.cc +++ b/utils/dftParameters.cc @@ -1269,7 +1269,7 @@ namespace dftfe useDevice = false; deviceFineGrainedTimings = false; allowFullCPUMemSubspaceRot = true; - useMixedPrecCheby = false; + useSinglePrecCommunCheby = false; overlapComputeCommunCheby = false; overlapComputeCommunOrthoRR = false; autoDeviceBlockSizes = true; @@ -1610,8 +1610,8 @@ namespace dftfe useMixedPrecSubspaceRotRR = prm.get_bool("USE MIXED PREC RR_SR"); useMixedPrecCommunOnlyXTHXCGSO = prm.get_bool("USE MIXED PREC COMMUN ONLY XTX XTHX"); - useMixedPrecCheby = prm.get_bool("USE SINGLE PREC COMMUN CHEBY"); - useSinglePrecCheby = prm.get_bool("USE SINGLE PREC CHEBY"); + useSinglePrecCommunCheby = prm.get_bool("USE SINGLE PREC COMMUN CHEBY"); + useSinglePrecCheby = prm.get_bool("USE SINGLE PREC CHEBY"); overlapComputeCommunCheby = prm.get_bool("OVERLAP COMPUTE COMMUN CHEBY"); overlapComputeCommunOrthoRR = @@ -1936,7 +1936,7 @@ namespace dftfe useMixedPrecCGS_O = true; useMixedPrecCGS_SR = true; useMixedPrecXTHXSpectrumSplit = true; - useMixedPrecCheby = true; + useSinglePrecCommunCheby = true; reuseLanczosUpperBoundFromFirstCall = true; } @@ -1972,12 +1972,6 @@ namespace dftfe useDeviceDirectAllReduce = useDCCL && useDeviceDirectAllReduce; #endif - if (useMixedPrecCheby) - AssertThrow( - useELPA, - dealii::ExcMessage( - "DFT-FE Error: USE ELPA must be set to true for USE SINGLE PREC COMMUN CHEBY.")); - if (verbosity >= 5) computeEnergyEverySCF = true; From 4b80cdea1e6b33338a85ab6498bc4bac7602667d Mon Sep 17 00:00:00 2001 From: Nikhil Kodali Date: Tue, 11 Jun 2024 13:16:12 +0530 Subject: [PATCH 10/11] Add ctests and correct parameter description --- .../complex/fccAl_05.mpirun=16.output | 542 ++++++++++++++++++ .../pseudopotential/complex/fccAl_05.prm.in | 91 +++ .../complex/accuracyBenchmarks/outputMg2x_7 | 104 ++++ .../complex/jobscripts/crusher.slurm | 2 + .../frontierJobScript6GCDs6MPITasks.rc | 1 + .../jobscripts/matrixlabgpu18Tasks6GPUs.slurm | 1 + .../complex/jobscripts/perlmutter.slurm | 2 + .../complex/jobscripts/summit.lsf | 2 + .../complex/parameterFileMg2x_7.prm | 86 +++ .../real/parameterFileN2_3.prm | 2 +- utils/dftParameters.cc | 2 +- 11 files changed, 833 insertions(+), 2 deletions(-) create mode 100644 tests/dft/pseudopotential/complex/fccAl_05.mpirun=16.output create mode 100644 tests/dft/pseudopotential/complex/fccAl_05.prm.in create mode 100644 testsGPU/pseudopotential/complex/accuracyBenchmarks/outputMg2x_7 create mode 100644 testsGPU/pseudopotential/complex/parameterFileMg2x_7.prm diff --git a/tests/dft/pseudopotential/complex/fccAl_05.mpirun=16.output b/tests/dft/pseudopotential/complex/fccAl_05.mpirun=16.output new file mode 100644 index 000000000..626058985 --- /dev/null +++ b/tests/dft/pseudopotential/complex/fccAl_05.mpirun=16.output @@ -0,0 +1,542 @@ +number of atoms: 4 +number of atoms types: 1 +Z:13 +============================================================================================================================= +number of electrons: 12 +number of eigen values: 20 +============================================================================================================================= +Reduced k-Point-coordinates and weights: +2.500000000000000000e-01 2.500000000000000000e-01 2.500000000000000000e-01 1.250000000000000000e-01 +2.500000000000000000e-01 2.500000000000000000e-01 -2.500000000000000000e-01 1.250000000000000000e-01 +2.500000000000000000e-01 -2.500000000000000000e-01 2.500000000000000000e-01 1.250000000000000000e-01 +2.500000000000000000e-01 -2.500000000000000000e-01 -2.500000000000000000e-01 1.250000000000000000e-01 +-2.500000000000000000e-01 2.500000000000000000e-01 2.500000000000000000e-01 1.250000000000000000e-01 +-2.500000000000000000e-01 2.500000000000000000e-01 -2.500000000000000000e-01 1.250000000000000000e-01 +-2.500000000000000000e-01 -2.500000000000000000e-01 2.500000000000000000e-01 1.250000000000000000e-01 +-2.500000000000000000e-01 -2.500000000000000000e-01 -2.500000000000000000e-01 1.250000000000000000e-01 + check 0.1 + check 0.2 +-----------Simulation Domain bounding vectors (lattice vectors in fully periodic case)------------- +v1 : 7.599999999999999645e+00 0.000000000000000000e+00 0.000000000000000000e+00 +v2 : 0.000000000000000000e+00 7.599999999999999645e+00 0.000000000000000000e+00 +v3 : 0.000000000000000000e+00 0.000000000000000000e+00 7.599999999999999645e+00 +----------------------------------------------------------------------------------------- +-----Fractional coordinates of atoms------ +AtomId 0: 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 +AtomId 1: 0.000000000000000000e+00 5.000000000000000000e-01 5.000000000000000000e-01 +AtomId 2: 5.000000000000000000e-01 0.000000000000000000e+00 5.000000000000000000e-01 +AtomId 3: 5.000000000000000000e-01 5.000000000000000000e-01 0.000000000000000000e+00 +----------------------------------------------------------------------------------------- +Number Image Charges 1724 +Reading tria info and rho data from checkpoint in progress... +...Reading from checkpoint done. + +Finite element mesh information +------------------------------------------------- +FE interpolating polynomial order for Kohn-Sham eigenvalue problem: 2 +FE interpolating polynomial order for electrostatics solve: 2 +FE interpolating polynomial order for nodal electron density computation: 4 +number of elements: 1408 +number of degrees of freedom for the Kohn-Sham eigenvalue problem : 15849 +------------------------------------------------- + +Setting initial guess for wavefunctions.... + +Reading initial guess for electron-density..... + +Pseudopotential initalization.... +Starting NSCF iteration.... +Fermi Energy computed: -2.456430006978324310e-01 + +Energy computations (Hartree) +------------------- + Total energy: -8.36780768 +Total energy : -8.36780768 +Total entropic energy: 0.00604495 +Total free energy: -8.37385263 +Writing tdos File... +epsValue Dos +-16.54768903 0.00008134 +-16.52047764 0.00008176 +-16.49326625 0.00008218 +-16.46605487 0.00008260 +-16.43884348 0.00008303 +-16.41163210 0.00008346 +-16.38442071 0.00008389 +-16.35720932 0.00008433 +-16.32999794 0.00008477 +-16.30278655 0.00008521 +-16.27557516 0.00008566 +-16.24836378 0.00008611 +-16.22115239 0.00008657 +-16.19394101 0.00008703 +-16.16672962 0.00008749 +-16.13951824 0.00008796 +-16.11230685 0.00008844 +-16.08509546 0.00008891 +-16.05788408 0.00008939 +-16.03067269 0.00008988 +-16.00346130 0.00009037 +-15.97624992 0.00009086 +-15.94903853 0.00009136 +-15.92182715 0.00009186 +-15.89461576 0.00009237 +-15.86740437 0.00009288 +-15.84019299 0.00009340 +-15.81298160 0.00009392 +-15.78577022 0.00009444 +-15.75855883 0.00009498 +-15.73134745 0.00009551 +-15.70413606 0.00009605 +-15.67692467 0.00009660 +-15.64971329 0.00009715 +-15.62250190 0.00009771 +-15.59529052 0.00009827 +-15.56807913 0.00009883 +-15.54086774 0.00009941 +-15.51365636 0.00009998 +-15.48644497 0.00010057 +-15.45923358 0.00010116 +-15.43202220 0.00010175 +-15.40481081 0.00010235 +-15.37759943 0.00010296 +-15.35038804 0.00010357 +-15.32317665 0.00010419 +-15.29596527 0.00010482 +-15.26875388 0.00010545 +-15.24154250 0.00010608 +-15.21433111 0.00010673 +-15.18711973 0.00010738 +-15.15990834 0.00010804 +-15.13269695 0.00010870 +-15.10548557 0.00010937 +-15.07827418 0.00011005 +-15.05106279 0.00011073 +-15.02385141 0.00011142 +-14.99664002 0.00011212 +-14.96942864 0.00011283 +-14.94221725 0.00011354 +-14.91500586 0.00011426 +-14.88779448 0.00011499 +-14.86058309 0.00011573 +-14.83337171 0.00011647 +-14.80616032 0.00011723 +-14.77894893 0.00011799 +-14.75173755 0.00011876 +-14.72452616 0.00011953 +-14.69731478 0.00012032 +-14.67010339 0.00012112 +-14.64289200 0.00012192 +-14.61568062 0.00012273 +-14.58846923 0.00012355 +-14.56125785 0.00012439 +-14.53404646 0.00012523 +-14.50683507 0.00012608 +-14.47962369 0.00012694 +-14.45241230 0.00012781 +-14.42520092 0.00012869 +-14.39798953 0.00012958 +-14.37077814 0.00013048 +-14.34356676 0.00013139 +-14.31635537 0.00013232 +-14.28914399 0.00013325 +-14.26193260 0.00013419 +-14.23472121 0.00013515 +-14.20750983 0.00013612 +-14.18029844 0.00013710 +-14.15308706 0.00013809 +-14.12587567 0.00013909 +-14.09866428 0.00014011 +-14.07145290 0.00014114 +-14.04424151 0.00014218 +-14.01703013 0.00014324 +-13.98981874 0.00014431 +-13.96260735 0.00014539 +-13.93539597 0.00014648 +-13.90818458 0.00014759 +-13.88097320 0.00014872 +-13.85376181 0.00014986 +-13.82655042 0.00015101 +-13.79933904 0.00015218 +-13.77212765 0.00015337 +-13.74491627 0.00015457 +-13.71770488 0.00015579 +-13.69049349 0.00015702 +-13.66328211 0.00015827 +-13.63607072 0.00015954 +-13.60885934 0.00016083 +-13.58164795 0.00016213 +-13.55443656 0.00016345 +-13.52722518 0.00016479 +-13.50001379 0.00016615 +-13.47280240 0.00016753 +-13.44559102 0.00016893 +-13.41837963 0.00017035 +-13.39116825 0.00017179 +-13.36395686 0.00017325 +-13.33674548 0.00017473 +-13.30953409 0.00017624 +-13.28232270 0.00017777 +-13.25511132 0.00017932 +-13.22789993 0.00018089 +-13.20068855 0.00018249 +-13.17347716 0.00018411 +-13.14626577 0.00018576 +-13.11905439 0.00018744 +-13.09184300 0.00018914 +-13.06463161 0.00019086 +-13.03742023 0.00019262 +-13.01020884 0.00019440 +-12.98299746 0.00019621 +-12.95578607 0.00019806 +-12.92857468 0.00019993 +-12.90136330 0.00020183 +-12.87415191 0.00020377 +-12.84694053 0.00020573 +-12.81972914 0.00020773 +-12.79251776 0.00020977 +-12.76530637 0.00021184 +-12.73809498 0.00021395 +-12.71088360 0.00021609 +-12.68367221 0.00021827 +-12.65646082 0.00022049 +-12.62924944 0.00022275 +-12.60203805 0.00022506 +-12.57482667 0.00022740 +-12.54761528 0.00022979 +-12.52040389 0.00023222 +-12.49319251 0.00023469 +-12.46598112 0.00023722 +-12.43876974 0.00023979 +-12.41155835 0.00024241 +-12.38434696 0.00024508 +-12.35713558 0.00024781 +-12.32992419 0.00025059 +-12.30271281 0.00025342 +-12.27550142 0.00025631 +-12.24829003 0.00025926 +-12.22107865 0.00026227 +-12.19386726 0.00026534 +-12.16665588 0.00026848 +-12.13944449 0.00027168 +-12.11223310 0.00027495 +-12.08502172 0.00027829 +-12.05781033 0.00028171 +-12.03059895 0.00028519 +-12.00338756 0.00028876 +-11.97617617 0.00029241 +-11.94896479 0.00029613 +-11.92175340 0.00029995 +-11.89454202 0.00030385 +-11.86733063 0.00030784 +-11.84011924 0.00031193 +-11.81290786 0.00031611 +-11.78569647 0.00032039 +-11.75848509 0.00032478 +-11.73127370 0.00032927 +-11.70406231 0.00033388 +-11.67685093 0.00033860 +-11.64963954 0.00034345 +-11.62242816 0.00034841 +-11.59521677 0.00035351 +-11.56800538 0.00035874 +-11.54079400 0.00036411 +-11.51358261 0.00036962 +-11.48637123 0.00037528 +-11.45915984 0.00038110 +-11.43194845 0.00038708 +-11.40473707 0.00039323 +-11.37752568 0.00039956 +-11.35031430 0.00040606 +-11.32310291 0.00041276 +-11.29589152 0.00041966 +-11.26868014 0.00042676 +-11.24146875 0.00043408 +-11.21425737 0.00044162 +-11.18704598 0.00044940 +-11.15983459 0.00045743 +-11.13262321 0.00046571 +-11.10541182 0.00047426 +-11.07820043 0.00048309 +-11.05098905 0.00049221 +-11.02377766 0.00050165 +-10.99656628 0.00051141 +-10.96935489 0.00052151 +-10.94214351 0.00053196 +-10.91493212 0.00054279 +-10.88772073 0.00055401 +-10.86050935 0.00056565 +-10.83329796 0.00057772 +-10.80608658 0.00059025 +-10.77887519 0.00060327 +-10.75166380 0.00061679 +-10.72445242 0.00063086 +-10.69724103 0.00064550 +-10.67002964 0.00066073 +-10.64281826 0.00067661 +-10.61560687 0.00069316 +-10.58839549 0.00071043 +-10.56118410 0.00072846 +-10.53397272 0.00074729 +-10.50676133 0.00076698 +-10.47954994 0.00078759 +-10.45233856 0.00080918 +-10.42512717 0.00083180 +-10.39791579 0.00085553 +-10.37070440 0.00088044 +-10.34349301 0.00090663 +-10.31628163 0.00093418 +-10.28907024 0.00096318 +-10.26185885 0.00099376 +-10.23464747 0.00102602 +-10.20743608 0.00106011 +-10.18022470 0.00109615 +-10.15301331 0.00113431 +-10.12580192 0.00117477 +-10.09859054 0.00121772 +-10.07137915 0.00126337 +-10.04416777 0.00131196 +-10.01695638 0.00136375 +-9.98974500 0.00141904 +-9.96253361 0.00147816 +-9.93532222 0.00154148 +-9.90811084 0.00160941 +-9.88089945 0.00168243 +-9.85368807 0.00176106 +-9.82647668 0.00184590 +-9.79926529 0.00193764 +-9.77205391 0.00203706 +-9.74484252 0.00214506 +-9.71763113 0.00226266 +-9.69041975 0.00239105 +-9.66320836 0.00253162 +-9.63599698 0.00268597 +-9.60878559 0.00285597 +-9.58157420 0.00304386 +-9.55436282 0.00325224 +-9.52715143 0.00348423 +-9.49994005 0.00374358 +-9.47272866 0.00403478 +-9.44551727 0.00436331 +-9.41830589 0.00473586 +-9.39109450 0.00516069 +-9.36388312 0.00564812 +-9.33667173 0.00621108 +-9.30946034 0.00686609 +-9.28224896 0.00763436 +-9.25503757 0.00854362 +-9.22782619 0.00963058 +-9.20061480 0.01094467 +-9.17340341 0.01255371 +-9.14619203 0.01455269 +-9.11898064 0.01707790 +-9.09176926 0.02033029 +-9.06455787 0.02461568 +-9.03734648 0.03041702 +-9.01013510 0.03853110 +-8.98292371 0.05034263 +-8.95571233 0.06841301 +-8.92850094 0.09784615 +-8.90128955 0.14971543 +-8.87407817 0.24999785 +-8.84686678 0.45738644 +-8.81965540 0.80573500 +-8.79244401 0.85773698 +-8.76523262 0.51002225 +-8.73802124 0.27592182 +-8.71080985 0.16251138 +-8.68359847 0.10478173 +-8.65638708 0.07252403 +-8.62917569 0.05296640 +-8.60196431 0.04031008 +-8.57475292 0.03168571 +-8.54754154 0.02556052 +-8.52033015 0.02106134 +-8.49311876 0.01766328 +-8.46590738 0.01503632 +-8.43869599 0.01296490 +-8.41148461 0.01130357 +-8.38427322 0.00995140 +-8.35706183 0.00883663 +-8.32985045 0.00790711 +-8.30263906 0.00712425 +-8.27542767 0.00645899 +-8.24821629 0.00588911 +-8.22100490 0.00539742 +-8.19379352 0.00497041 +-8.16658213 0.00459738 +-8.13937075 0.00426975 +-8.11215936 0.00398059 +-8.08494797 0.00372424 +-8.05773659 0.00349605 +-8.03052520 0.00329216 +-8.00331382 0.00310936 +-7.97610243 0.00294496 +-7.94889104 0.00279669 +-7.92167966 0.00266261 +-7.89446827 0.00254108 +-7.86725688 0.00243069 +-7.84004550 0.00233022 +-7.81283411 0.00223862 +-7.78562273 0.00215498 +-7.75841134 0.00207851 +-7.73119996 0.00200851 +-7.70398857 0.00194438 +-7.67677718 0.00188558 +-7.64956580 0.00183164 +-7.62235441 0.00178213 +-7.59514302 0.00173670 +-7.56793164 0.00169501 +-7.54072025 0.00165676 +-7.51350887 0.00162170 +-7.48629748 0.00158959 +-7.45908609 0.00156023 +-7.43187471 0.00153342 +-7.40466332 0.00150900 +-7.37745194 0.00148681 +-7.35024055 0.00146672 +-7.32302917 0.00144861 +-7.29581778 0.00143237 +-7.26860639 0.00141789 +-7.24139501 0.00140509 +-7.21418362 0.00139389 +-7.18697223 0.00138422 +-7.15976085 0.00137601 +-7.13254946 0.00136921 +-7.10533808 0.00136376 +-7.07812669 0.00135962 +-7.05091531 0.00135676 +-7.02370392 0.00135512 +-6.99649253 0.00135469 +-6.96928115 0.00135545 +-6.94206976 0.00135735 +-6.91485837 0.00136041 +-6.88764699 0.00136459 +-6.86043560 0.00136988 +-6.83322422 0.00137629 +-6.80601283 0.00138381 +-6.77880144 0.00139244 +-6.75159006 0.00140217 +-6.72437867 0.00141303 +-6.69716729 0.00142502 +-6.66995590 0.00143815 +-6.64274451 0.00145244 +-6.61553313 0.00146792 +-6.58832174 0.00148459 +-6.56111036 0.00150250 +-6.53389897 0.00152167 +-6.50668758 0.00154213 +-6.47947620 0.00156393 +-6.45226481 0.00158711 +-6.42505343 0.00161171 +-6.39784204 0.00163778 +-6.37063065 0.00166539 +-6.34341927 0.00169459 +-6.31620788 0.00172545 +-6.28899650 0.00175804 +-6.26178511 0.00179245 +-6.23457372 0.00182875 +-6.20736234 0.00186704 +-6.18015095 0.00190742 +-6.15293957 0.00195001 +-6.12572818 0.00199491 +-6.09851679 0.00204227 +-6.07130541 0.00209222 +-6.04409402 0.00214491 +-6.01688264 0.00220050 +-5.98967125 0.00225919 +-5.96245986 0.00232116 +-5.93524848 0.00238662 +-5.90803709 0.00245582 +-5.88082571 0.00252900 +-5.85361432 0.00260645 +-5.82640293 0.00268846 +-5.79919155 0.00277538 +-5.77198016 0.00286756 +-5.74476878 0.00296542 +-5.71755739 0.00306940 +-5.69034600 0.00317998 +-5.66313462 0.00329771 +-5.63592323 0.00342319 +-5.60871185 0.00355709 +-5.58150046 0.00370015 +-5.55428907 0.00385318 +-5.52707769 0.00401713 +-5.49986630 0.00419301 +-5.47265491 0.00438199 +-5.44544353 0.00458538 +-5.41823214 0.00480464 +-5.39102076 0.00504146 +-5.36380937 0.00529773 +-5.33659799 0.00557561 +-5.30938660 0.00587757 +-5.28217521 0.00620646 +-5.25496383 0.00656554 +-5.22775244 0.00695861 +-5.20054106 0.00739005 +-5.17332967 0.00786501 +-5.14611828 0.00838952 +-5.11890690 0.00897068 +-5.09169551 0.00961693 +-5.06448412 0.01033832 +-5.03727274 0.01114694 +-5.01006135 0.01205741 +-4.98284997 0.01308752 +-4.95563858 0.01425916 +-4.92842720 0.01559940 +-4.90121581 0.01714215 +-4.87400442 0.01893025 +-4.84679304 0.02101848 +-4.81958165 0.02347774 +-4.79237026 0.02640115 +-4.76515888 0.02991284 +-4.73794749 0.03418120 +-4.71073611 0.03943916 +-4.68352472 0.04601613 +-4.65631334 0.05438983 +-4.62910195 0.06527347 +-4.60189056 0.07976846 +-4.57467918 0.09964434 +-4.54746779 0.12788076 +-4.52025640 0.16978468 +-4.49304502 0.23546598 +-4.46583363 0.34577718 +-4.43862225 0.54760540 +-4.41141086 0.95229794 +-4.38419947 1.77020450 +-4.35698809 2.70074066 +-4.32977670 2.16450795 +-4.30256532 1.18697729 +-4.27535393 0.66043653 +-4.24814254 0.40386505 +-4.22093116 0.26829958 +-4.19371977 0.18988206 +-4.16650839 0.14099032 +-4.13929700 0.10864096 +-4.11208561 0.08620145 +-4.08487423 0.07003182 +-4.05766284 0.05801104 +-4.03045146 0.04883982 +-4.00324007 0.04168805 +-3.97602868 0.03600609 +-3.94881730 0.03141869 +-3.92160591 0.02766280 +-3.89439453 0.02454968 +-3.86718314 0.02194113 +-3.83997175 0.01973414 +-3.81276037 0.01785063 +-3.78554898 0.01623061 +-3.75833760 0.01482736 +-3.73112621 0.01360406 +-3.70391482 0.01253138 +-3.67670344 0.01158574 +-3.64949205 0.01074799 +-3.62228067 0.01000245 +-3.59506928 0.00933622 +-3.56785789 0.00873855 +-3.54064651 0.00820047 +-3.51343512 0.00771443 +-3.48622374 0.00727405 +-3.45901235 0.00687389 +-3.43180096 0.00650931 +-3.40458958 0.00617632 diff --git a/tests/dft/pseudopotential/complex/fccAl_05.prm.in b/tests/dft/pseudopotential/complex/fccAl_05.prm.in new file mode 100644 index 000000000..f770a3fff --- /dev/null +++ b/tests/dft/pseudopotential/complex/fccAl_05.prm.in @@ -0,0 +1,91 @@ +set VERBOSITY = 0 +set REPRODUCIBLE OUTPUT=true +set SOLVER MODE = NSCF +set RESTART FOLDER = ../../fccAl_03.release/mpirun=16/fccAl_03 + +subsection Post-processing Options + set WRITE BANDS = false + set WRITE DENSITY OF STATES=true +end + +subsection Boundary conditions + set SMEARED NUCLEAR CHARGES=false + set FLOATING NUCLEAR CHARGES=false + set PERIODIC1 = true + set PERIODIC2 = true + set PERIODIC3 = true + set POINT WISE DIRICHLET CONSTRAINT=true + set SELF POTENTIAL RADIUS = 0 + set CONSTRAINTS PARALLEL CHECK=true +end + + +subsection Brillouin zone k point sampling options + set USE TIME REVERSAL SYMMETRY = false + set kPOINT RULE FILE = @SOURCE_DIR@/kPoint2x2x2File +end + +subsection SCF Checkpointing and Restart + set LOAD RHO DATA = true + set RESTART SP FROM NO SP = false + set SAVE RHO DATA = false +end + +subsection Parallelization + set NPKPT=2 +end + +subsection DFT functional parameters + set EXCHANGE CORRELATION TYPE = 1 + set PSEUDOPOTENTIAL CALCULATION = true + set PSEUDOPOTENTIAL FILE NAMES LIST = @SOURCE_DIR@/pseudoAlKB.inp + set PSEUDO TESTS FLAG = true +end + + + +subsection Finite element mesh parameters + set POLYNOMIAL ORDER = 2 + subsection Auto mesh generation parameters + set AUTO ADAPT BASE MESH SIZE=false + set ATOM BALL RADIUS = 2.0 + set BASE MESH SIZE = 4.0 + set MESH SIZE AT ATOM = 0.6 + set MESH SIZE AROUND ATOM = 0.6 + end + +end + +subsection Geometry + set NATOMS=4 + set NATOM TYPES=1 + set ATOMIC COORDINATES FILE = @SOURCE_DIR@/fccAl_coordinates.inp + set DOMAIN VECTORS FILE = @SOURCE_DIR@/fccAl_domainBoundingVectors.inp +end + + +subsection Poisson problem parameters + set MAXIMUM ITERATIONS = 4000 + set TOLERANCE = 1e-12 +end + + +subsection SCF parameters + set COMPUTE ENERGY EACH ITER=false + + set MIXING HISTORY = 70 + set MIXING PARAMETER = 0.5 + set MAXIMUM ITERATIONS = 50 + set TEMPERATURE = 500 + set TOLERANCE = 1e-6 + set STARTING WFC=ATOMIC + + subsection Eigen-solver parameters + set NUMBER OF KOHN-SHAM WAVEFUNCTIONS = 20 + set CHEBYSHEV FILTER TOLERANCE=1e-5 + set ORTHOGONALIZATION TYPE=CGS + set CHEBYSHEV POLYNOMIAL DEGREE=8 + set USE ELPA=true + set USE SINGLE PREC CHEBY = true + end +end diff --git a/testsGPU/pseudopotential/complex/accuracyBenchmarks/outputMg2x_7 b/testsGPU/pseudopotential/complex/accuracyBenchmarks/outputMg2x_7 new file mode 100644 index 000000000..37260f3a9 --- /dev/null +++ b/testsGPU/pseudopotential/complex/accuracyBenchmarks/outputMg2x_7 @@ -0,0 +1,104 @@ +number of atoms: 31 +number of atoms types: 1 +Z:12 +============================================================================================================================= +number of electrons: 310 +number of eigen values: 180 +============================================================================================================================= +Total number of k-points 2 +-----------Simulation Domain bounding vectors (lattice vectors in fully periodic case)------------- +v1 : 1.176399999999999935e+01 0.000000000000000000e+00 0.000000000000000000e+00 +v2 : 0.000000000000000000e+00 1.917200000000000060e+01 0.000000000000000000e+00 +v3 : 0.000000000000000000e+00 0.000000000000000000e+00 2.037584570023999930e+01 +----------------------------------------------------------------------------------------- +-----Fractional coordinates of atoms------ +AtomId 0: 2.500000000000000000e-01 2.500000000000000000e-01 4.166666666669999741e-01 +AtomId 1: 2.500000000000000000e-01 0.000000000000000000e+00 2.500000000000000000e-01 +AtomId 2: 0.000000000000000000e+00 2.500000000000000000e-01 1.666666666670000019e-01 +AtomId 3: 0.000000000000000000e+00 0.000000000000000000e+00 5.000000000000000000e-01 +AtomId 4: 2.500000000000000000e-01 2.500000000000000000e-01 9.166666666670000296e-01 +AtomId 5: 2.500000000000000000e-01 0.000000000000000000e+00 7.500000000000000000e-01 +AtomId 6: 0.000000000000000000e+00 2.500000000000000000e-01 6.666666666670000296e-01 +AtomId 7: 0.000000000000000000e+00 5.000000000000000000e-01 0.000000000000000000e+00 +AtomId 8: 2.500000000000000000e-01 7.500000000000000000e-01 4.166666666669999741e-01 +AtomId 9: 2.500000000000000000e-01 5.000000000000000000e-01 2.500000000000000000e-01 +AtomId 10: 0.000000000000000000e+00 7.500000000000000000e-01 1.666666666670000019e-01 +AtomId 11: 0.000000000000000000e+00 5.000000000000000000e-01 5.000000000000000000e-01 +AtomId 12: 2.500000000000000000e-01 7.500000000000000000e-01 9.166666666670000296e-01 +AtomId 13: 2.500000000000000000e-01 5.000000000000000000e-01 7.500000000000000000e-01 +AtomId 14: 0.000000000000000000e+00 7.500000000000000000e-01 6.666666666670000296e-01 +AtomId 15: 5.000000000000000000e-01 0.000000000000000000e+00 0.000000000000000000e+00 +AtomId 16: 7.500000000000000000e-01 2.500000000000000000e-01 4.166666666669999741e-01 +AtomId 17: 7.500000000000000000e-01 0.000000000000000000e+00 2.500000000000000000e-01 +AtomId 18: 5.000000000000000000e-01 2.500000000000000000e-01 1.666666666670000019e-01 +AtomId 19: 5.000000000000000000e-01 0.000000000000000000e+00 5.000000000000000000e-01 +AtomId 20: 7.500000000000000000e-01 2.500000000000000000e-01 9.166666666670000296e-01 +AtomId 21: 7.500000000000000000e-01 0.000000000000000000e+00 7.500000000000000000e-01 +AtomId 22: 5.000000000000000000e-01 2.500000000000000000e-01 6.666666666670000296e-01 +AtomId 23: 5.000000000000000000e-01 5.000000000000000000e-01 0.000000000000000000e+00 +AtomId 24: 7.500000000000000000e-01 7.500000000000000000e-01 4.166666666669999741e-01 +AtomId 25: 7.500000000000000000e-01 5.000000000000000000e-01 2.500000000000000000e-01 +AtomId 26: 5.000000000000000000e-01 7.500000000000000000e-01 1.666666666670000019e-01 +AtomId 27: 5.000000000000000000e-01 5.000000000000000000e-01 5.000000000000000000e-01 +AtomId 28: 7.500000000000000000e-01 7.500000000000000000e-01 9.166666666670000296e-01 +AtomId 29: 7.500000000000000000e-01 5.000000000000000000e-01 7.500000000000000000e-01 +AtomId 30: 5.000000000000000000e-01 7.500000000000000000e-01 6.666666666670000296e-01 +----------------------------------------------------------------------------------------- +Number Image Charges 2094 + +Finite element mesh information +------------------------------------------------- +FE interpolating polynomial order for Kohn-Sham eigenvalue problem: 3 +FE interpolating polynomial order for electrostatics solve: 3 +FE interpolating polynomial order for nodal electron density computation: 5 +number of elements: 1440 +number of degrees of freedom for the Kohn-Sham eigenvalue problem : 52791 +------------------------------------------------- + +Setting initial guess for wavefunctions.... + +Reading initial guess for electron-density..... + +Pseudopotential initalization.... + +Starting SCF iterations.... +SCF iterations converged to the specified tolerance after: 39 iterations. + +Energy computations (Hartree) +------------------- + Total energy: -1673.56610253 + +Absolute values of ion forces (Hartree/Bohr) +-------------------------------------------------------------------------------------------- +AtomId 0: 0.000820,0.001893,0.012176 +AtomId 1: 0.001957,0.000000,0.000544 +AtomId 2: 0.000000,0.001558,0.141588 +AtomId 3: 0.000050,0.000000,0.001169 +AtomId 4: 0.000591,0.001197,0.010672 +AtomId 5: 0.002144,0.000000,0.000842 +AtomId 6: 0.000000,0.000888,0.144486 +AtomId 7: 0.000007,0.000000,0.001077 +AtomId 8: 0.000820,0.001893,0.012176 +AtomId 9: 0.000298,0.000000,0.001856 +AtomId 10: 0.000000,0.001558,0.141588 +AtomId 11: 0.000007,0.000000,0.000027 +AtomId 12: 0.000591,0.001197,0.010671 +AtomId 13: 0.000615,0.000000,0.002028 +AtomId 14: 0.000000,0.000888,0.144486 +AtomId 15: 0.000048,0.000000,0.001309 +AtomId 16: 0.000820,0.001893,0.012176 +AtomId 17: 0.001957,0.000000,0.000544 +AtomId 18: 0.000059,0.001056,0.143329 +AtomId 19: 0.000039,0.000000,0.000261 +AtomId 20: 0.000591,0.001197,0.010672 +AtomId 21: 0.002144,0.000000,0.000842 +AtomId 22: 0.000059,0.001259,0.144065 +AtomId 23: 0.000007,0.000000,0.000709 +AtomId 24: 0.000820,0.001893,0.012176 +AtomId 25: 0.000298,0.000000,0.001856 +AtomId 26: 0.000059,0.001056,0.143329 +AtomId 27: 0.000007,0.000000,0.000171 +AtomId 28: 0.000591,0.001197,0.010671 +AtomId 29: 0.000615,0.000000,0.002028 +AtomId 30: 0.000059,0.001259,0.144065 +-------------------------------------------------------------------------------------------- diff --git a/testsGPU/pseudopotential/complex/jobscripts/crusher.slurm b/testsGPU/pseudopotential/complex/jobscripts/crusher.slurm index ffdd0573e..15df4fb3c 100644 --- a/testsGPU/pseudopotential/complex/jobscripts/crusher.slurm +++ b/testsGPU/pseudopotential/complex/jobscripts/crusher.slurm @@ -26,5 +26,7 @@ srun -n 6 -c 1 ./dftfe parameterFileMg2x_5.prm > outputMg2x_5 srun -n 6 -c 1 ./dftfe parameterFileMg2x_6.prm > outputMg2x_6 +srun -n 6 -c 1 ./dftfe parameterFileMg2x_7.prm > outputMg2x_7 + srun -n 6 -c 1 ./dftfe parameterFileBe.prm > outputBe diff --git a/testsGPU/pseudopotential/complex/jobscripts/frontierJobScript6GCDs6MPITasks.rc b/testsGPU/pseudopotential/complex/jobscripts/frontierJobScript6GCDs6MPITasks.rc index aa420fa35..8de363644 100644 --- a/testsGPU/pseudopotential/complex/jobscripts/frontierJobScript6GCDs6MPITasks.rc +++ b/testsGPU/pseudopotential/complex/jobscripts/frontierJobScript6GCDs6MPITasks.rc @@ -29,4 +29,5 @@ srun -n 6 -c 7 --gpu-bind closest $BASE/dftfe parameterFileMg2x_3.prm > outputMg srun -n 6 -c 7 --gpu-bind closest $BASE/dftfe parameterFileMg2x_4.prm > outputMg2x_4 srun -n 6 -c 7 --gpu-bind closest $BASE/dftfe parameterFileMg2x_5.prm > outputMg2x_5 srun -n 6 -c 7 --gpu-bind closest $BASE/dftfe parameterFileMg2x_6.prm > outputMg2x_6 +srun -n 6 -c 7 --gpu-bind closest $BASE/dftfe parameterFileMg2x_7.prm > outputMg2x_7 srun -n 6 -c 7 --gpu-bind closest $BASE/dftfe parameterFileBe.prm > outputBe diff --git a/testsGPU/pseudopotential/complex/jobscripts/matrixlabgpu18Tasks6GPUs.slurm b/testsGPU/pseudopotential/complex/jobscripts/matrixlabgpu18Tasks6GPUs.slurm index bfb8883f8..0677d90d2 100644 --- a/testsGPU/pseudopotential/complex/jobscripts/matrixlabgpu18Tasks6GPUs.slurm +++ b/testsGPU/pseudopotential/complex/jobscripts/matrixlabgpu18Tasks6GPUs.slurm @@ -21,5 +21,6 @@ srun -n $SLURM_NTASKS --mpi=pmi2 ./dftfe parameterFileMg2x_3.prm > outputMg2x_3 srun -n $SLURM_NTASKS --mpi=pmi2 ./dftfe parameterFileMg2x_4.prm > outputMg2x_4 srun -n $SLURM_NTASKS --mpi=pmi2 ./dftfe parameterFileMg2x_5.prm > outputMg2x_5 srun -n $SLURM_NTASKS --mpi=pmi2 ./dftfe parameterFileMg2x_6.prm > outputMg2x_6 +srun -n $SLURM_NTASKS --mpi=pmi2 ./dftfe parameterFileMg2x_7.prm > outputMg2x_7 srun -n $SLURM_NTASKS --mpi=pmi2 ./dftfe parameterFileBe.prm > outputBe diff --git a/testsGPU/pseudopotential/complex/jobscripts/perlmutter.slurm b/testsGPU/pseudopotential/complex/jobscripts/perlmutter.slurm index bfff8ad16..d688032eb 100644 --- a/testsGPU/pseudopotential/complex/jobscripts/perlmutter.slurm +++ b/testsGPU/pseudopotential/complex/jobscripts/perlmutter.slurm @@ -31,5 +31,7 @@ srun ./dftfe parameterFileMg2x_5.prm > outputMg2x_5 srun ./dftfe parameterFileMg2x_6.prm > outputMg2x_6 +srun ./dftfe parameterFileMg2x_7.prm > outputMg2x_7 + srun -n 6 -c 1 ./dftfe parameterFileBe.prm > outputBe diff --git a/testsGPU/pseudopotential/complex/jobscripts/summit.lsf b/testsGPU/pseudopotential/complex/jobscripts/summit.lsf index 4925d9782..d2c1bc50f 100644 --- a/testsGPU/pseudopotential/complex/jobscripts/summit.lsf +++ b/testsGPU/pseudopotential/complex/jobscripts/summit.lsf @@ -23,4 +23,6 @@ jsrun --smpiargs "-gpu" -n 6 -a 1 -c 7 -g 1 -r 6 -d packed -b packed:7 ./dftfe p jsrun --smpiargs "-gpu" -n 6 -a 1 -c 7 -g 1 -r 6 -d packed -b packed:7 ./dftfe parameterFileMg2x_6.prm > outputMg2x_6 +jsrun --smpiargs "-gpu" -n 6 -a 1 -c 7 -g 1 -r 6 -d packed -b packed:7 ./dftfe parameterFileMg2x_7.prm > outputMg2x_7 + jsrun --smpiargs "-gpu" -n 6 -a 1 -c 7 -g 1 -r 6 -d packed -b packed:7 ./dftfe parameterFileBe.prm > outputBe diff --git a/testsGPU/pseudopotential/complex/parameterFileMg2x_7.prm b/testsGPU/pseudopotential/complex/parameterFileMg2x_7.prm new file mode 100644 index 000000000..3a13a0bb2 --- /dev/null +++ b/testsGPU/pseudopotential/complex/parameterFileMg2x_7.prm @@ -0,0 +1,86 @@ +set VERBOSITY = 0 +set REPRODUCIBLE OUTPUT=true +set USE GPU=true +subsection GPU + set AUTO GPU BLOCK SIZES=false + set USE GPUDIRECT MPI ALL REDUCE = true + set USE ELPA GPU KERNEL=false +end + +subsection Boundary conditions + set SMEARED NUCLEAR CHARGES=true + set FLOATING NUCLEAR CHARGES=true + set CONSTRAINTS FROM SERIAL DOFHANDLER = false + set CONSTRAINTS PARALLEL CHECK = false + set PERIODIC1 = true + set PERIODIC2 = true + set PERIODIC3 = true + set SELF POTENTIAL RADIUS = 3.0 + set POINT WISE DIRICHLET CONSTRAINT =false +end + + +subsection DFT functional parameters + set EXCHANGE CORRELATION TYPE = 4 + set PSEUDOPOTENTIAL CALCULATION = true + set PSEUDOPOTENTIAL FILE NAMES LIST = pseudoMg.inp + set SPIN POLARIZATION=1 + set START MAGNETIZATION=0.1 +end + + +subsection Finite element mesh parameters + set POLYNOMIAL ORDER = 3 + subsection Auto mesh generation parameters + set AUTO ADAPT BASE MESH SIZE=false + set ATOM BALL RADIUS = 2.0 + set BASE MESH SIZE = 4.0 + set MESH SIZE AROUND ATOM = 1.0 + set MESH SIZE AT ATOM = 1.0 + end +end + +subsection Geometry + set NATOMS=31 + set NATOM TYPES=1 + set ATOMIC COORDINATES FILE = coordinatesMg2x.inp + set DOMAIN VECTORS FILE = domainVectorsMg2x.inp + subsection Optimization + set ION FORCE = true + end +end + +subsection Brillouin zone k point sampling options + set USE TIME REVERSAL SYMMETRY = false + subsection Monkhorst-Pack (MP) grid generation + set SAMPLING POINTS 1 = 2 + set SAMPLING POINTS 2 = 1 + set SAMPLING POINTS 3 = 1 + set SAMPLING SHIFT 1 = 1 + set SAMPLING SHIFT 2 = 0 + set SAMPLING SHIFT 3 = 0 + end +end + + +subsection Parallelization + set NPBAND=1 + set NPKPT=2 +end + +subsection SCF parameters + set COMPUTE ENERGY EACH ITER = false + set MAXIMUM ITERATIONS = 100 + set TEMPERATURE = 500 + set TOLERANCE = 1e-6 + set STARTING WFC=ATOMIC + subsection Eigen-solver parameters + set CHEBYSHEV POLYNOMIAL DEGREE = 20 + set NUMBER OF KOHN-SHAM WAVEFUNCTIONS = 180 + set CHEBY WFC BLOCK SIZE=90 + set WFC BLOCK SIZE=90 + set CHEBYSHEV FILTER TOLERANCE=1e-5 + set SCALAPACKPROCS=2 + set USE SINGLE PREC CHEBY = true + end +end diff --git a/testsGPU/pseudopotential/real/parameterFileN2_3.prm b/testsGPU/pseudopotential/real/parameterFileN2_3.prm index 51b5c5fb7..3678f49bf 100644 --- a/testsGPU/pseudopotential/real/parameterFileN2_3.prm +++ b/testsGPU/pseudopotential/real/parameterFileN2_3.prm @@ -62,7 +62,7 @@ subsection SCF parameters set USE ELPA=true set SCALAPACKPROCS=2 set OVERLAP COMPUTE COMMUN CHEBY=true - set USE MIXED PREC CHEBY=true + set USE SINGLE PREC COMMUN CHEBY=true set OVERLAP COMPUTE COMMUN ORTHO RR=false end end diff --git a/utils/dftParameters.cc b/utils/dftParameters.cc index 81a3f3f87..aca4cf417 100644 --- a/utils/dftParameters.cc +++ b/utils/dftParameters.cc @@ -973,7 +973,7 @@ namespace dftfe "USE SINGLE PREC COMMUN CHEBY", "false", dealii::Patterns::Bool(), - "[Advanced] Use single precision communication in Chebyshev filtering. Currently this option is only available if USE ELPA=true for which DFT-FE also has to be linked to ELPA library. Default setting is false."); + "[Advanced] Use single precision communication in Chebyshev filtering. Default setting is false."); prm.declare_entry( "USE MIXED PREC COMMUN ONLY XTX XTHX", From 4a589e92a0d530d95141a142766384cbb58e804c Mon Sep 17 00:00:00 2001 From: Nikhil Kodali Date: Tue, 11 Jun 2024 15:42:41 +0530 Subject: [PATCH 11/11] Update documentation --- doc/manual/parameters.tex | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/manual/parameters.tex b/doc/manual/parameters.tex index 9b74d7fb6..e2b7526af 100644 --- a/doc/manual/parameters.tex +++ b/doc/manual/parameters.tex @@ -2885,7 +2885,7 @@ \subsection{Parameters in section \tt SCF parameters/Eigen-solver parameters} {\it Default:} false -{\it Description:} [Advanced] Use single precision communication in Chebyshev filtering. Currently this option is only available if USE ELPA=true for which DFT-FE also has to be linked to ELPA library. Default setting is false. +{\it Description:} [Advanced] Use single precision communication in Chebyshev filtering. Default setting is false. {\it Possible values:} A boolean value (true or false)