diff --git a/CMakeLists.txt b/CMakeLists.txt index e70e8974f..f0b6a6830 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -162,6 +162,7 @@ SET(TARGET_SRC ./utils/FEBasisOperations.cc ./utils/FEBasisOperationsKernels.cc ./src/force/locateAtomCoreNodesForce.cc + ./src/atom/AtomicCenteredNonLocalOperator.cc ./src/atom/AtomPseudoWavefunctions.cc ./src/atom/AtomCenteredSphericalFunctionContainer.cc ./src/atom/AtomCenteredSphericalFunctionBase.cc @@ -172,7 +173,8 @@ SET(TARGET_SRC ./src/atom/AtomCenteredSphericalFunctionValenceDensitySpline.cc ./src/atom/AtomCenteredSphericalFunctionCoreDensitySpline.cc ./src/atom/AtomCenteredSphericalFunctionLocalPotentialSpline.cc - ./src/atom/AtomCenteredSphericalFunctionProjectorSpline.cc) + ./src/atom/AtomCenteredSphericalFunctionProjectorSpline.cc + ./src/pseudo/oncv/oncvClass.cc) IF ("${GPU_LANG}" STREQUAL "cuda") diff --git a/doc/manual/parameters.tex b/doc/manual/parameters.tex index 8fc7aa12c..e2b7526af 100644 --- a/doc/manual/parameters.tex +++ b/doc/manual/parameters.tex @@ -2872,20 +2872,20 @@ \subsection{Parameters in section \tt SCF parameters/Eigen-solver parameters} {\it Possible values:} A boolean value (true or false) -\item {\it Parameter name:} {\tt USE MIXED PREC CHEBY} -\phantomsection\label{parameters:SCF parameters/Eigen_2dsolver parameters/USE MIXED PREC CHEBY} -\label{parameters:SCF_20parameters/Eigen_2dsolver_20parameters/USE_20MIXED_20PREC_20CHEBY} +\item {\it Parameter name:} {\tt USE SINGLE PREC COMMUN CHEBY} +\phantomsection\label{parameters:SCF parameters/Eigen_2dsolver parameters/USE USE SINGLE PREC COMMUN CHEBY} +\label{parameters:SCF_20parameters/Eigen_2dsolver_20parameters/USE_20SINGLE_20PREC_20COMMUN_20CHEBY} -\index[prmindex]{USE MIXED PREC CHEBY} -\index[prmindexfull]{SCF parameters!Eigen-solver parameters!USE MIXED PREC CHEBY} +\index[prmindex]{USE SINGLE PREC COMMUN CHEBY} +\index[prmindexfull]{SCF parameters!Eigen-solver parameters!USE SINGLE PREC COMMUN CHEBY} {\it Value:} false {\it Default:} false -{\it Description:} [Advanced] Use mixed precision arithmetic in Chebyshev filtering. Currently this option is only available for real executable and USE ELPA=true for which DFT-FE also has to be linked to ELPA library. Default setting is false. +{\it Description:} [Advanced] Use single precision communication in Chebyshev filtering. Default setting is false. {\it Possible values:} A boolean value (true or false) @@ -2906,6 +2906,23 @@ \subsection{Parameters in section \tt SCF parameters/Eigen-solver parameters} {\it Possible values:} A boolean value (true or false) +\item {\it Parameter name:} {\tt USE SINGLE PREC CHEBY} +\phantomsection\label{parameters:SCF parameters/Eigen_2dsolver parameters/USE USE SINGLE PREC CHEBY} +\label{parameters:SCF_20parameters/Eigen_2dsolver_20parameters/USE_20SINGLE_20PREC_20CHEBY} + + +\index[prmindex]{USE SINGLE PREC CHEBY} +\index[prmindexfull]{SCF parameters!Eigen-solver parameters!USE SINGLE PREC CHEBY} +{\it Value:} false + + +{\it Default:} false + + +{\it Description:} [Advanced] Use a modified single precision algorithm for Chebyshev filtering. This cannot be used in conjunction with spectrum splitting. Default setting is false. 
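The splitting behind this option can be read off the chebyshevFilterSinglePrec routines added later in this patch; the sketch below is editorial (the symbols $r_k$ and $p_k$ do not appear in the code) and uses the usual filter notation $e=(b-a)/2$, $c=(b+a)/2$, with $p_k$ the shifted and scaled Chebyshev polynomial of degree $k$ and $\lambda_i$ the current Ritz value of band $x_i$:
\[
p_m(H)\,x_i \;=\; p_m(\lambda_i)\,x_i \;+\; r_m, \qquad r_k \;=\; p_k(H)\,x_i - p_k(\lambda_i)\,x_i ,
\]
where the first term is kept in double precision and $r_m$ is generated in single precision through
\[
r_{k+1} \;=\; \alpha_1 (H - cI)\, r_k \;+\; \alpha_2\, r_{k-1} \;+\; \alpha_1 (H - \lambda_i I)\, x_i \, p_k(\lambda_i), \qquad r_0 = 0, \quad r_1 = (\sigma_1/e)\,(H - \lambda_i I)\, x_i ,
\]
with the same $\alpha_1$, $\alpha_2$, $\sigma$-updates as in the existing double-precision filter. Because $x_i$ is already a reasonable approximate eigenvector, $r_k$ stays small and tolerates single precision, while the scalars $p_k(\lambda_i)$ are accumulated in double precision and the filtered vectors are reassembled at the end. This is also why the modified algorithm is only invoked once eigenvalue estimates from a previous pass are available (the full-precision filter is used on the first filtering call) and why it cannot be combined with spectrum splitting.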
+ +{\it Possible values:} A boolean value (true or false) + \item {\it Parameter name:} {\tt USE MIXED PREC RR\_SR} \phantomsection\label{parameters:SCF parameters/Eigen_2dsolver parameters/USE MIXED PREC RR_5fSR} \label{parameters:SCF_20parameters/Eigen_2dsolver_20parameters/USE_20MIXED_20PREC_20RR_5fSR} diff --git a/include/AtomicCenteredNonLocalOperator.h b/include/AtomicCenteredNonLocalOperator.h index de38af052..77d39a486 100644 --- a/include/AtomicCenteredNonLocalOperator.h +++ b/include/AtomicCenteredNonLocalOperator.h @@ -61,7 +61,7 @@ namespace dftfe std::shared_ptr> BLASWrapperPtr, std::shared_ptr< - dftfe::basis::FEBasisOperations> + dftfe::basis::FEBasisOperations> basisOperatorPtr, std::shared_ptr atomCenteredSphericalFunctionContainer, @@ -102,8 +102,9 @@ namespace dftfe const std::vector &kPointWeights, const std::vector &kPointCoordinates, std::shared_ptr< - dftfe::basis:: - FEBasisOperations> + dftfe::basis::FEBasisOperations> basisOperationsPtr, const unsigned int quadratureIndex); #if defined(DFTFE_WITH_DEVICE) @@ -313,7 +314,7 @@ namespace dftfe std::vector d_numberCellsForEachAtom; std::shared_ptr< - dftfe::basis::FEBasisOperations> + dftfe::basis::FEBasisOperations> d_basisOperatorPtr; @@ -419,11 +420,11 @@ namespace dftfe */ void computeCMatrixEntries( - std::shared_ptr< - dftfe::basis:: - FEBasisOperations> - basisOperationsPtr, - const unsigned int quadratureIndex); + std::shared_ptr> basisOperationsPtr, + const unsigned int quadratureIndex); std::map< unsigned int, @@ -477,6 +478,4 @@ namespace dftfe } // namespace dftfe -#include "../src/atom/AtomicCenteredNonLocalOperator.t.cc" - #endif // DFTFE_ATOMICCENTEREDNONLOCALOPERATOR_H diff --git a/include/BLASWrapper.h b/include/BLASWrapper.h index 7d30dbc67..18cb8749c 100644 --- a/include/BLASWrapper.h +++ b/include/BLASWrapper.h @@ -503,6 +503,19 @@ namespace dftfe const ValueType1 * x, const ValueType2 beta, ValueType1 * y) const; + template + void + ApaBD(const unsigned int m, + const unsigned int n, + const ValueType0 alpha, + const ValueType1 * A, + const ValueType2 * B, + const ValueType3 * D, + ValueType4 * C) const; template void @@ -513,14 +526,14 @@ namespace dftfe const dftfe::global_size_type *addToVecStartingContiguousBlockIds) const; - template + template void axpyStridedBlockAtomicAdd(const dftfe::size_type contiguousBlockSize, const dftfe::size_type numContiguousBlocks, const ValueType1 a, const ValueType1 * s, const ValueType2 * addFromVec, - ValueType2 * addToVec, + ValueType3 * addToVec, const dftfe::global_size_type *addToVecStartingContiguousBlockIds) const; @@ -1030,6 +1043,21 @@ namespace dftfe const ValueType2 beta, ValueType1 * y) const; + template + void + ApaBD(const unsigned int m, + const unsigned int n, + const ValueType0 alpha, + const ValueType1 * A, + const ValueType2 * B, + const ValueType3 * D, + ValueType4 * C) const; + + template void axpyStridedBlockAtomicAdd(const dftfe::size_type contiguousBlockSize, @@ -1039,14 +1067,14 @@ namespace dftfe const dftfe::global_size_type *addToVecStartingContiguousBlockIds) const; - template + template void axpyStridedBlockAtomicAdd(const dftfe::size_type contiguousBlockSize, const dftfe::size_type numContiguousBlocks, const ValueType1 a, const ValueType1 * s, const ValueType2 * addFromVec, - ValueType2 * addToVec, + ValueType3 * addToVec, const dftfe::global_size_type *addToVecStartingContiguousBlockIds) const; diff --git a/include/FEBasisOperations.h b/include/FEBasisOperations.h index db56b3d1b..4fe728a2a 100644 --- 
a/include/FEBasisOperations.h +++ b/include/FEBasisOperations.h @@ -708,6 +708,17 @@ namespace dftfe createScratchMultiVectors(const unsigned int vecBlockSize, const unsigned int numMultiVecs = 1) const; + /** + * @brief Creates single precision scratch multivectors. + * @param[in] vecBlockSize Number of vectors in the multivector. + * @param[out] numMultiVecs number of scratch multivectors needed with + * this vecBlockSize. + */ + void + createScratchMultiVectorsSinglePrec( + const unsigned int vecBlockSize, + const unsigned int numMultiVecs = 1) const; + /** * @brief Clears scratch multivectors. */ @@ -724,6 +735,18 @@ namespace dftfe getMultiVector(const unsigned int vecBlockSize, const unsigned int index = 0) const; + /** + * @brief Gets single precision scratch multivectors. + * @param[in] vecBlockSize Number of vectors in the multivector. + * @param[out] numMultiVecs index of the multivector among those with the + * same vecBlockSize. + */ + dftfe::linearAlgebra::MultiVector< + typename dftfe::dataTypes::singlePrecType::type, + memorySpace> & + getMultiVectorSinglePrec(const unsigned int vecBlockSize, + const unsigned int index = 0) const; + /** * @brief Apply constraints on given multivector. * @param[inout] multiVector the given multivector. @@ -853,6 +876,13 @@ namespace dftfe dftfe::linearAlgebra::MultiVector>> scratchMultiVectors; + mutable std::map< + unsigned int, + std::vector::type, + memorySpace>>> + scratchMultiVectorsSinglePrec; + std::vector d_quadratureIDsVector; unsigned int d_quadratureID; unsigned int d_quadratureIndex; diff --git a/include/KohnShamHamiltonianOperator.h b/include/KohnShamHamiltonianOperator.h index 395d53b73..c05a6cf23 100644 --- a/include/KohnShamHamiltonianOperator.h +++ b/include/KohnShamHamiltonianOperator.h @@ -78,6 +78,11 @@ namespace dftfe const unsigned int index); + dftfe::linearAlgebra::MultiVector & + getScratchFEMultivectorSinglePrec(const unsigned int numVectors, + const unsigned int index); + + /** * @brief Computes effective potential involving exchange-correlation functionals * @param rhoValues electron-density @@ -168,6 +173,19 @@ namespace dftfe const bool skip2 = false, const bool skip3 = false); + void + HXCheby(dftfe::linearAlgebra::MultiVector &src, + const double scalarHX, + const double scalarY, + const double scalarX, + dftfe::linearAlgebra::MultiVector &dst, + const bool onlyHPrimePartForFirstOrderDensityMatResponse, + const bool skip1, + const bool skip2, + const bool skip3); + void HXRR( dftfe::linearAlgebra::MultiVector &src, @@ -180,6 +198,10 @@ namespace dftfe AtomicCenteredNonLocalOperator> d_ONCVnonLocalOperator; + std::shared_ptr< + AtomicCenteredNonLocalOperator> + d_ONCVnonLocalOperatorSinglePrec; + std::shared_ptr> d_BLASWrapperPtr; std::shared_ptr< @@ -197,6 +219,8 @@ namespace dftfe std::vector> d_cellHamiltonianMatrix; + std::vector> + d_cellHamiltonianMatrixSinglePrec; dftfe::utils::MemoryStorage d_cellHamiltonianMatrixExtPot; @@ -206,8 +230,15 @@ namespace dftfe dftfe::utils::MemoryStorage d_cellWaveFunctionMatrixDst; + dftfe::utils::MemoryStorage + d_cellWaveFunctionMatrixSrcSinglePrec; + dftfe::utils::MemoryStorage + d_cellWaveFunctionMatrixDstSinglePrec; + dftfe::linearAlgebra::MultiVector - d_ONCVNonLocalProjectorTimesVectorBlock; + d_ONCVNonLocalProjectorTimesVectorBlock; + dftfe::linearAlgebra::MultiVector + d_ONCVNonLocalProjectorTimesVectorBlockSinglePrec; dftfe::utils::MemoryStorage d_VeffJxW; dftfe::utils::MemoryStorage d_VeffExtPotJxW; diff --git a/include/MPICommunicatorP2P.h 
b/include/MPICommunicatorP2P.h index f531c8496..bfc30ec7f 100644 --- a/include/MPICommunicatorP2P.h +++ b/include/MPICommunicatorP2P.h @@ -58,22 +58,6 @@ namespace dftfe }; - template - struct singlePrecType - { - typedef T type; - }; - template <> - struct singlePrecType - { - typedef float type; - }; - template <> - struct singlePrecType> - { - typedef std::complex type; - }; - template class MPICommunicatorP2P { @@ -136,10 +120,14 @@ namespace dftfe MemoryStorage d_tempFloatImagArrayForAtomics; - MemoryStorage::type, memorySpace> + MemoryStorage< + typename dftfe::dataTypes::singlePrecType::type, + memorySpace> d_sendRecvBufferSinglePrec; - MemoryStorage::type, memorySpace> + MemoryStorage< + typename dftfe::dataTypes::singlePrecType::type, + memorySpace> d_ghostDataCopySinglePrec; #ifdef DFTFE_WITH_DEVICE @@ -149,12 +137,14 @@ namespace dftfe std::shared_ptr> d_sendRecvBufferHostPinnedPtr; - std::shared_ptr::type, - MemorySpace::HOST_PINNED>> + std::shared_ptr::type, + MemorySpace::HOST_PINNED>> d_ghostDataCopySinglePrecHostPinnedPtr; - std::shared_ptr::type, - MemorySpace::HOST_PINNED>> + std::shared_ptr::type, + MemorySpace::HOST_PINNED>> d_sendRecvBufferSinglePrecHostPinnedPtr; #endif // DFTFE_WITH_DEVICE diff --git a/include/chebyshevOrthogonalizedSubspaceIterationSolver.h b/include/chebyshevOrthogonalizedSubspaceIterationSolver.h index 52e242366..2b7b8a065 100644 --- a/include/chebyshevOrthogonalizedSubspaceIterationSolver.h +++ b/include/chebyshevOrthogonalizedSubspaceIterationSolver.h @@ -24,7 +24,7 @@ #include "operator.h" #include "elpaScalaManager.h" #include "dftParameters.h" - +#include "BLASWrapper.h" namespace dftfe { @@ -65,7 +65,10 @@ namespace dftfe */ void solve(operatorDFTClass &operatorMatrix, - elpaScalaManager & elpaScala, + const std::shared_ptr< + dftfe::linearAlgebra::BLASWrapper> + & BLASWrapperPtr, + elpaScalaManager & elpaScala, dataTypes::number * eigenVectorsFlattened, dataTypes::number * eigenVectorsRotFracDensityFlattened, const unsigned int totalNumberWaveFunctions, @@ -74,6 +77,7 @@ namespace dftfe std::vector &residuals, const MPI_Comm & interBandGroupComm, const MPI_Comm & mpiCommDomain, + const bool isFirstFilteringCall, const bool computeResidual, const bool useMixedPrec = false, const bool isFirstScf = false); diff --git a/include/constraintMatrixInfo.h b/include/constraintMatrixInfo.h index b15dc0c19..19265a36e 100644 --- a/include/constraintMatrixInfo.h +++ b/include/constraintMatrixInfo.h @@ -237,14 +237,10 @@ namespace dftfe * care of constraints * @param blockSize number of components for a given node */ + template void distribute_slave_to_master( - distributedDeviceVec &fieldVector) const; - - void - distribute_slave_to_master( - distributedDeviceVec> &fieldVector) const; - + distributedDeviceVec &fieldVector) const; template void diff --git a/include/dftParameters.h b/include/dftParameters.h index ce50253dc..7c0d17af0 100644 --- a/include/dftParameters.h +++ b/include/dftParameters.h @@ -135,7 +135,8 @@ namespace dftfe bool useDevice; bool deviceFineGrainedTimings; bool allowFullCPUMemSubspaceRot; - bool useMixedPrecCheby; + bool useSinglePrecCommunCheby; + bool useSinglePrecCheby; bool overlapComputeCommunCheby; bool overlapComputeCommunOrthoRR; bool autoDeviceBlockSizes; diff --git a/include/dftfeDataTypes.h b/include/dftfeDataTypes.h index 1ccb651a6..5f5eb51d0 100644 --- a/include/dftfeDataTypes.h +++ b/include/dftfeDataTypes.h @@ -108,6 +108,25 @@ namespace dftfe { return MPI_DOUBLE_COMPLEX; } + + template + struct singlePrecType + { + 
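      // Primary template: types without a dedicated single-precision
      // counterpart map to themselves; the specializations below map
      // double -> float and std::complex<double> -> std::complex<float>.
      // (The trait is moved here from MPICommunicatorP2P.h so that other
      // headers, e.g. FEBasisOperations.h and oncvClass.h, can refer to it
      // as dftfe::dataTypes::singlePrecType.)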
typedef T type; + }; + + template <> + struct singlePrecType + { + typedef float type; + }; + + template <> + struct singlePrecType> + { + typedef std::complex type; + }; + } // namespace dataTypes } // namespace dftfe diff --git a/include/linearAlgebraOperations.h b/include/linearAlgebraOperations.h index 7e3df2ed2..9d13de6a2 100644 --- a/include/linearAlgebraOperations.h +++ b/include/linearAlgebraOperations.h @@ -144,6 +144,13 @@ namespace dftfe double * y, const unsigned int *incy); void + saxpy_(const unsigned int *n, + const float * alpha, + const float * x, + const unsigned int *incx, + float * y, + const unsigned int *incy); + void dgemm_(const char * transA, const char * transB, const unsigned int *m, @@ -412,6 +419,13 @@ namespace dftfe std::complex * y, const unsigned int * incy); void + caxpy_(const unsigned int * n, + const std::complex *alpha, + const std::complex *x, + const unsigned int * incx, + std::complex * y, + const unsigned int * incy); + void dpotrf_(const char * uplo, const unsigned int *n, double * a, @@ -617,6 +631,21 @@ namespace dftfe const double b, const double a0); + template + void + chebyshevFilterSinglePrec( + const std::shared_ptr> + & BLASWrapperPtr, + operatorDFTClass & operatorMatrix, + dftfe::linearAlgebra::MultiVector & X, + dftfe::linearAlgebra::MultiVector & Y, + dftfe::linearAlgebra::MultiVector &X_SP, + dftfe::linearAlgebra::MultiVector &Y_SP, + std::vector eigenvalues, + const unsigned int m, + const double a, + const double b, + const double a0); /** @brief Orthogonalize given subspace using GramSchmidt orthogonalization diff --git a/include/linearAlgebraOperationsDevice.h b/include/linearAlgebraOperationsDevice.h index f614adc91..97e8c4b18 100644 --- a/include/linearAlgebraOperationsDevice.h +++ b/include/linearAlgebraOperationsDevice.h @@ -25,6 +25,7 @@ # include "elpaScalaManager.h" # include "deviceDirectCCLWrapper.h" # include "dftParameters.h" +# include namespace dftfe { @@ -94,6 +95,38 @@ namespace dftfe const double b, const double a0); + void + chebyshevFilterOverlapComputeCommunicationSinglePrec( + const std::shared_ptr< + dftfe::linearAlgebra::BLASWrapper> + & BLASWrapperPtr, + operatorDFTClass &operatorMatrix, + dftfe::linearAlgebra::MultiVector &X1, + dftfe::linearAlgebra::MultiVector &Y1, + dftfe::linearAlgebra::MultiVector &X2, + dftfe::linearAlgebra::MultiVector &Y2, + dftfe::linearAlgebra::MultiVector + &X1_SP, + dftfe::linearAlgebra::MultiVector + &Y1_SP, + dftfe::linearAlgebra::MultiVector + &X2_SP, + dftfe::linearAlgebra::MultiVector + & Y2_SP, + std::vector eigenvalues, + const unsigned int m, + const double a, + const double b, + const double a0); + /** @brief Computes Sc=X^{T}*Xc. * * diff --git a/include/oncvClass.h b/include/oncvClass.h index 90264513d..215662821 100644 --- a/include/oncvClass.h +++ b/include/oncvClass.h @@ -101,7 +101,8 @@ namespace dftfe unsigned int densityQuadratureIdElectro, std::shared_ptr excFunctionalPtr, const std::vector> &atomLocations, - unsigned int numEigenValues); + unsigned int numEigenValues, + const bool singlePrecNonLocalOperator); /** * @brief Initialises all the data members with addresses/values to/of dftClass. 
@@ -201,6 +202,17 @@ namespace dftfe AtomicCenteredNonLocalOperator> getNonLocalOperator(); + const dftfe::utils::MemoryStorage< + typename dftfe::dataTypes::singlePrecType::type, + memorySpace> & + getCouplingMatrixSinglePrec(); + + + const std::shared_ptr::type, + memorySpace>> + getNonLocalOperatorSinglePrec(); + private: /** * @brief Converts the periodic image data structure to relevant form for the container class @@ -241,8 +253,13 @@ namespace dftfe std::map> d_atomicNonLocalPseudoPotentialConstants; dftfe::utils::MemoryStorage d_couplingMatrixEntries; + dftfe::utils::MemoryStorage< + typename dftfe::dataTypes::singlePrecType::type, + memorySpace> + d_couplingMatrixEntriesSinglePrec; bool d_HamiltonianCouplingMatrixEntriesUpdated; + bool d_HamiltonianCouplingMatrixSinglePrecEntriesUpdated; std::vector> d_atomicWaveFnsVector; std::shared_ptr @@ -276,11 +293,12 @@ namespace dftfe d_BasisOperatorDevicePtr; #endif - std::map d_atomTypeCoreFlagMap; - bool d_floatingNuclearCharges; - int d_verbosity; - std::vector> d_atomLocations; - std::set d_atomTypes; + std::map d_atomTypeCoreFlagMap; + bool d_floatingNuclearCharges; + bool d_singlePrecNonLocalOperator; + int d_verbosity; + std::vector> d_atomLocations; + std::set d_atomTypes; std::map> d_atomTypesList; std::string d_dftfeScratchFolderName; std::vector d_imageIds; @@ -292,6 +310,11 @@ namespace dftfe std::shared_ptr> d_nonLocalOperator; + std::shared_ptr::type, + memorySpace>> + d_nonLocalOperatorSinglePrec; + std::vector> d_atomicProjectorFnsVector; @@ -312,7 +335,5 @@ namespace dftfe }; // end of class - } // end of namespace dftfe -#include "../src/pseudo/oncv/oncvClass.t.cc" #endif // DFTFE_PSEUDOPOTENTIALBASE_H diff --git a/include/operator.h b/include/operator.h index 030f6a59d..83d58a134 100644 --- a/include/operator.h +++ b/include/operator.h @@ -56,6 +56,11 @@ namespace dftfe getScratchFEMultivector(const unsigned int numVectors, const unsigned int index) = 0; + virtual dftfe::linearAlgebra::MultiVector & + getScratchFEMultivectorSinglePrec(const unsigned int numVectors, + const unsigned int index) = 0; + virtual void init(const std::vector &kPointCoordinates, const std::vector &kPointWeights) = 0; @@ -88,6 +93,19 @@ namespace dftfe const bool skip2 = false, const bool skip3 = false) = 0; + virtual void + HXCheby(dftfe::linearAlgebra::MultiVector &src, + const double scalarHX, + const double scalarY, + const double scalarX, + dftfe::linearAlgebra::MultiVector &dst, + const bool onlyHPrimePartForFirstOrderDensityMatResponse = false, + const bool skip1 = false, + const bool skip2 = false, + const bool skip3 = false) = 0; + virtual dftUtils::constraintMatrixInfo * getOverloadedConstraintMatrixHost() const = 0; diff --git a/include/sphericalHarmonicUtils.h b/include/sphericalHarmonicUtils.h index fd25e4edc..ccb24c113 100644 --- a/include/sphericalHarmonicUtils.h +++ b/include/sphericalHarmonicUtils.h @@ -18,7 +18,7 @@ // #ifndef DFTFE_SPHERICALHARMONICUTILS_H #define DFTFE_SPHERICALHARMONICUTILS_H - +#include namespace dftfe { namespace sphericalHarmonicUtils diff --git a/src/atom/AtomicCenteredNonLocalOperator.t.cc b/src/atom/AtomicCenteredNonLocalOperator.cc similarity index 98% rename from src/atom/AtomicCenteredNonLocalOperator.t.cc rename to src/atom/AtomicCenteredNonLocalOperator.cc index 214c394b3..483301e3d 100644 --- a/src/atom/AtomicCenteredNonLocalOperator.t.cc +++ b/src/atom/AtomicCenteredNonLocalOperator.cc @@ -16,6 +16,7 @@ // // @author Kartick Ramakrishnan, Sambit Das, Phani Motamarri, Vishal Subramanian // 
+#include #if defined(DFTFE_WITH_DEVICE) # include # include @@ -23,7 +24,6 @@ # include # include #endif - namespace dftfe { template @@ -32,7 +32,7 @@ namespace dftfe std::shared_ptr> BLASWrapperPtr, std::shared_ptr< - dftfe::basis::FEBasisOperations> + dftfe::basis::FEBasisOperations> basisOperatorPtr, std::shared_ptr atomCenteredSphericalFunctionContainer, @@ -117,11 +117,11 @@ namespace dftfe template void AtomicCenteredNonLocalOperator::computeCMatrixEntries( - std::shared_ptr< - dftfe::basis:: - FEBasisOperations> - basisOperationsPtr, - const unsigned int quadratureIndex) + std::shared_ptr> basisOperationsPtr, + const unsigned int quadratureIndex) { d_locallyOwnedCells = basisOperationsPtr->nCells(); basisOperationsPtr->reinit(0, 0, quadratureIndex); @@ -138,7 +138,7 @@ namespace dftfe ->shapeFunctionBasisData(); // shapeFunctionData() for complex const dftfe::utils::MemoryStorage quadraturePointsVector = basisOperationsPtr->quadPoints(); - const dftfe::utils::MemoryStorage JxwVector = basisOperationsPtr->JxW(); const std::vector &atomicNumber = @@ -2061,7 +2061,7 @@ namespace dftfe else { initialiseOperatorActionOnX(kPointIndex); - dftfe::utils::MemoryStorage cellWaveFunctionMatrix; cellWaveFunctionMatrix.resize(d_locallyOwnedCells * @@ -2260,8 +2260,9 @@ namespace dftfe const std::vector &kPointWeights, const std::vector &kPointCoordinates, std::shared_ptr< - dftfe::basis:: - FEBasisOperations> + dftfe::basis::FEBasisOperations> basisOperationsPtr, const unsigned int quadratureIndex) { @@ -2285,5 +2286,19 @@ namespace dftfe { return (d_flattenedNonLocalCellDofIndexToProcessDofIndexMap); } + template class AtomicCenteredNonLocalOperator< + dataTypes::number, + dftfe::utils::MemorySpace::HOST>; + template class AtomicCenteredNonLocalOperator< + dataTypes::numberFP32, + dftfe::utils::MemorySpace::HOST>; +#if defined(DFTFE_WITH_DEVICE) + template class AtomicCenteredNonLocalOperator< + dataTypes::number, + dftfe::utils::MemorySpace::DEVICE>; + template class AtomicCenteredNonLocalOperator< + dataTypes::numberFP32, + dftfe::utils::MemorySpace::DEVICE>; +#endif } // namespace dftfe diff --git a/src/atom/AtomicCenteredNonLocalOperatorKernelsDevice.cc b/src/atom/AtomicCenteredNonLocalOperatorKernelsDevice.cc index d3986724c..2f2c4d6ee 100644 --- a/src/atom/AtomicCenteredNonLocalOperatorKernelsDevice.cc +++ b/src/atom/AtomicCenteredNonLocalOperatorKernelsDevice.cc @@ -246,38 +246,38 @@ namespace dftfe template void copyToDealiiParallelNonLocalVec( - const unsigned int numWfcs, - const unsigned int totalEntries, - const double * sphericalFnTimesWfcParallelVec, - double * sphericalFnTimesWfcDealiiParallelVec, - const unsigned int *indexMapDealiiParallelNumbering); + const unsigned int numWfcs, + const unsigned int totalEntries, + const dataTypes::number *sphericalFnTimesWfcParallelVec, + dataTypes::number * sphericalFnTimesWfcDealiiParallelVec, + const unsigned int * indexMapDealiiParallelNumbering); template void copyToDealiiParallelNonLocalVec( - const unsigned int numWfcs, - const unsigned int totalEntries, - const std::complex *sphericalFnTimesWfcParallelVec, - std::complex * sphericalFnTimesWfcDealiiParallelVec, - const unsigned int * indexMapDealiiParallelNumbering); + const unsigned int numWfcs, + const unsigned int totalEntries, + const dataTypes::numberFP32 *sphericalFnTimesWfcParallelVec, + dataTypes::numberFP32 * sphericalFnTimesWfcDealiiParallelVec, + const unsigned int * indexMapDealiiParallelNumbering); template void copyFromParallelNonLocalVecToAllCellsVec( - const unsigned 
int numWfcs, - const unsigned int numNonLocalCells, - const unsigned int maxSingleAtomContribution, - const double * sphericalFnTimesWfcParallelVec, - double * sphericalFnTimesWfcAllCellsVec, - const int * indexMapPaddedToParallelVec); + const unsigned int numWfcs, + const unsigned int numNonLocalCells, + const unsigned int maxSingleAtomContribution, + const dataTypes::number *sphericalFnTimesWfcParallelVec, + dataTypes::number * sphericalFnTimesWfcAllCellsVec, + const int * indexMapPaddedToParallelVec); template void copyFromParallelNonLocalVecToAllCellsVec( - const unsigned int numWfcs, - const unsigned int numNonLocalCells, - const unsigned int maxSingleAtomContribution, - const std::complex *sphericalFnTimesWfcParallelVec, - std::complex * sphericalFnTimesWfcAllCellsVec, - const int * indexMapPaddedToParallelVec); + const unsigned int numWfcs, + const unsigned int numNonLocalCells, + const unsigned int maxSingleAtomContribution, + const dataTypes::numberFP32 *sphericalFnTimesWfcParallelVec, + dataTypes::numberFP32 * sphericalFnTimesWfcAllCellsVec, + const int * indexMapPaddedToParallelVec); template void @@ -286,10 +286,10 @@ namespace dftfe const unsigned int numberNodesPerElement, const unsigned int numberWfc, const unsigned int numberCellsTraversed, - const dftfe::utils::MemoryStorage - & nonLocalContribution, - double *TotalContribution, + & nonLocalContribution, + dataTypes::number *TotalContribution, const dftfe::utils::MemoryStorage &cellNodeIdMapNonLocalToLocal); @@ -300,10 +300,10 @@ namespace dftfe const unsigned int numberNodesPerElement, const unsigned int numberWfc, const unsigned int numberCellsTraversed, - const dftfe::utils::MemoryStorage, + const dftfe::utils::MemoryStorage - & nonLocalContribution, - std::complex *TotalContribution, + & nonLocalContribution, + dataTypes::numberFP32 *TotalContribution, const dftfe::utils::MemoryStorage &cellNodeIdMapNonLocalToLocal); diff --git a/src/dft/dft.cc b/src/dft/dft.cc index 04e474c56..3a20f4067 100644 --- a/src/dft/dft.cc +++ b/src/dft/dft.cc @@ -1187,7 +1187,8 @@ namespace dftfe d_densityQuadratureIdElectro, d_excManagerPtr, atomLocations, - d_numEigenValues); + d_numEigenValues, + d_dftParamsPtr->useSinglePrecCheby); // diff --git a/src/dft/energyCalculator.cc b/src/dft/energyCalculator.cc index dfb51a744..6512208fc 100644 --- a/src/dft/energyCalculator.cc +++ b/src/dft/energyCalculator.cc @@ -117,7 +117,7 @@ namespace dftfe pcout << std::endl << "Energy computations (Hartree) " << std::endl; pcout << "-------------------" << std::endl; if (dftParams.useMixedPrecCGS_O || dftParams.useMixedPrecCGS_SR || - dftParams.useMixedPrecCheby) + dftParams.useSinglePrecCommunCheby) pcout << std::setw(25) << "Total energy" << ": " << std::fixed << std::setprecision(6) << std::setw(20) << totalEnergyTrunc << std::endl; diff --git a/src/dft/initBoundaryConditions.cc b/src/dft/initBoundaryConditions.cc index f9e5825ce..0513b6ca4 100644 --- a/src/dft/initBoundaryConditions.cc +++ b/src/dft/initBoundaryConditions.cc @@ -317,10 +317,16 @@ namespace dftfe d_basisOperationsPtrHost->createScratchMultiVectors(1, 3); d_basisOperationsPtrHost->createScratchMultiVectors(BVec, 2); + if (d_dftParamsPtr->useSinglePrecCheby) + d_basisOperationsPtrHost->createScratchMultiVectorsSinglePrec(BVec, + 2); if (d_numEigenValues % BVec != 0) d_basisOperationsPtrHost->createScratchMultiVectors(d_numEigenValues % BVec, 2); + if (d_dftParamsPtr->useSinglePrecCheby) + d_basisOperationsPtrHost->createScratchMultiVectorsSinglePrec( + d_numEigenValues % BVec, 2); if 
(d_numEigenValues != d_numEigenValuesRR && d_numEigenValuesRR % BVec != 0) d_basisOperationsPtrHost->createScratchMultiVectors( @@ -360,6 +366,10 @@ namespace dftfe d_basisOperationsPtrDevice->createScratchMultiVectors(1, 3); d_basisOperationsPtrDevice->createScratchMultiVectors( BVec, d_dftParamsPtr->overlapComputeCommunCheby ? 4 : 2); + if (d_dftParamsPtr->useSinglePrecCheby) + d_basisOperationsPtrDevice->createScratchMultiVectorsSinglePrec( + BVec, d_dftParamsPtr->overlapComputeCommunCheby ? 4 : 2); + d_basisOperationsPtrDevice->computeCellStiffnessMatrix( d_feOrderPlusOneQuadratureId, 50, true, false); if (std::is_same>::value) @@ -408,6 +418,9 @@ namespace dftfe d_basisOperationsPtrDevice->createScratchMultiVectors(1, 3); d_basisOperationsPtrDevice->createScratchMultiVectors( BVec, d_dftParamsPtr->overlapComputeCommunCheby ? 4 : 2); + if (d_dftParamsPtr->useSinglePrecCheby) + d_basisOperationsPtrDevice->createScratchMultiVectorsSinglePrec( + BVec, d_dftParamsPtr->overlapComputeCommunCheby ? 4 : 2); } #endif diff --git a/src/dft/kohnShamEigenSolve.cc b/src/dft/kohnShamEigenSolve.cc index ecaa6dd2f..b8b10791a 100644 --- a/src/dft/kohnShamEigenSolve.cc +++ b/src/dft/kohnShamEigenSolve.cc @@ -338,6 +338,12 @@ namespace dftfe std::vector eigenValuesTemp(isSpectrumSplit ? d_numEigenValuesRR : d_numEigenValues, 0.0); + if (d_dftParamsPtr->useSinglePrecCheby) + for (unsigned int i = 0; i < d_numEigenValues; i++) + { + eigenValuesTemp[i] = + eigenValues[kPointIndex][spinType * d_numEigenValues + i]; + } if (d_isFirstFilteringCall[(1 + d_dftParamsPtr->spinPolarized) * kPointIndex + @@ -405,6 +411,7 @@ namespace dftfe subspaceIterationSolver.solve( kohnShamDFTEigenOperator, + d_BLASWrapperPtrHost, elpaScala, d_eigenVectorsFlattenedHost.data() + ((1 + d_dftParamsPtr->spinPolarized) * kPointIndex + spinType) * @@ -420,6 +427,8 @@ namespace dftfe residualNormWaveFunctions, interBandGroupComm, mpi_communicator, + d_isFirstFilteringCall[(1 + d_dftParamsPtr->spinPolarized) * kPointIndex + + spinType], computeResidual, useMixedPrec, isFirstScf); @@ -519,13 +528,18 @@ namespace dftfe if (d_dftParamsPtr->spinPolarized == 1) pcout << "spin: " << spinType + 1 << std::endl; } - std::vector eigenValuesTemp(isSpectrumSplit ? d_numEigenValuesRR : d_numEigenValues, 0.0); std::vector eigenValuesDummy(isSpectrumSplit ? 
d_numEigenValuesRR : d_numEigenValues, 0.0); + if (d_dftParamsPtr->useSinglePrecCheby) + for (unsigned int i = 0; i < d_numEigenValues; i++) + { + eigenValuesTemp[i] = + eigenValues[kPointIndex][spinType * d_numEigenValues + i]; + } subspaceIterationSolverDevice.reinitSpectrumBounds( a0[(1 + d_dftParamsPtr->spinPolarized) * kPointIndex + spinType], @@ -813,6 +827,13 @@ namespace dftfe std::vector eigenValuesTemp(d_numEigenValues, 0.0); + if (d_dftParamsPtr->useSinglePrecCheby) + for (unsigned int i = 0; i < d_numEigenValues; i++) + { + eigenValuesTemp[i] = + eigenValues[kPointIndex][spinType * d_numEigenValues + i]; + } + if (d_isFirstFilteringCall[(1 + d_dftParamsPtr->spinPolarized) * kPointIndex + @@ -868,6 +889,7 @@ namespace dftfe subspaceIterationSolver.solve( kohnShamDFTEigenOperator, + d_BLASWrapperPtrHost, *d_elpaScala, d_eigenVectorsFlattenedHost.data() + ((1 + d_dftParamsPtr->spinPolarized) * kPointIndex + spinType) * @@ -883,6 +905,8 @@ namespace dftfe residualNormWaveFunctions, interBandGroupComm, mpi_communicator, + d_isFirstFilteringCall[(1 + d_dftParamsPtr->spinPolarized) * kPointIndex + + spinType], true, false); diff --git a/src/dftOperator/KohnShamHamiltonianOperator.cc b/src/dftOperator/KohnShamHamiltonianOperator.cc index 553d9f6db..6dbde6271 100644 --- a/src/dftOperator/KohnShamHamiltonianOperator.cc +++ b/src/dftOperator/KohnShamHamiltonianOperator.cc @@ -71,6 +71,9 @@ namespace dftfe { if (d_dftParamsPtr->isPseudopotential) d_ONCVnonLocalOperator = oncvClassPtr->getNonLocalOperator(); + if (d_dftParamsPtr->isPseudopotential && d_dftParamsPtr->useSinglePrecCheby) + d_ONCVnonLocalOperatorSinglePrec = + oncvClassPtr->getNonLocalOperatorSinglePrec(); d_cellsBlockSizeHamiltonianConstruction = memorySpace == dftfe::utils::MemorySpace::HOST ? 1 : 50; d_cellsBlockSizeHX = memorySpace == dftfe::utils::MemorySpace::HOST ? @@ -110,6 +113,8 @@ namespace dftfe d_dftParamsPtr->memOptMode ? 1 : (d_kPointWeights.size() * (d_dftParamsPtr->spinPolarized + 1))); + d_cellHamiltonianMatrixSinglePrec.resize( + d_dftParamsPtr->useSinglePrecCheby ? 
d_cellHamiltonianMatrix.size() : 0); const unsigned int nCells = d_basisOperationsPtr->nCells(); const unsigned int nDofsPerCell = d_basisOperationsPtr->nDofsPerCell(); @@ -123,6 +128,12 @@ namespace dftfe ++iHamiltonian) d_cellHamiltonianMatrix[iHamiltonian].resize(nDofsPerCell * nDofsPerCell * nCells); + for (unsigned int iHamiltonian = 0; + iHamiltonian < d_cellHamiltonianMatrixSinglePrec.size(); + ++iHamiltonian) + d_cellHamiltonianMatrixSinglePrec[iHamiltonian].resize( + nDofsPerCell * nDofsPerCell * nCells); + d_basisOperationsPtrHost->reinit(0, 0, d_densityQuadratureID, false); const unsigned int numberQuadraturePoints = d_basisOperationsPtrHost->nQuadsPerCell(); @@ -617,6 +628,11 @@ namespace dftfe if constexpr (dftfe::utils::MemorySpace::DEVICE == memorySpace) if (d_dftParamsPtr->isPseudopotential) d_ONCVnonLocalOperator->initialiseOperatorActionOnX(d_kPointIndex); + if constexpr (dftfe::utils::MemorySpace::DEVICE == memorySpace) + if (d_dftParamsPtr->isPseudopotential && + d_dftParamsPtr->useSinglePrecCheby) + d_ONCVnonLocalOperatorSinglePrec->initialiseOperatorActionOnX( + d_kPointIndex); } @@ -631,10 +647,20 @@ namespace dftfe nCells * nDofsPerCell * numWaveFunctions) d_cellWaveFunctionMatrixSrc.resize(nCells * nDofsPerCell * numWaveFunctions); + if (d_dftParamsPtr->useSinglePrecCheby && + d_cellWaveFunctionMatrixSrcSinglePrec.size() < + nCells * nDofsPerCell * numWaveFunctions) + d_cellWaveFunctionMatrixSrcSinglePrec.resize(nCells * nDofsPerCell * + numWaveFunctions); if (d_cellWaveFunctionMatrixDst.size() < d_cellsBlockSizeHX * nDofsPerCell * numWaveFunctions) d_cellWaveFunctionMatrixDst.resize(d_cellsBlockSizeHX * nDofsPerCell * numWaveFunctions); + if (d_dftParamsPtr->useSinglePrecCheby && + d_cellWaveFunctionMatrixDstSinglePrec.size() < + d_cellsBlockSizeHX * nDofsPerCell * numWaveFunctions) + d_cellWaveFunctionMatrixDstSinglePrec.resize( + d_cellsBlockSizeHX * nDofsPerCell * numWaveFunctions); if (d_dftParamsPtr->isPseudopotential) { @@ -649,6 +675,23 @@ namespace dftfe d_ONCVnonLocalOperator->initialiseFlattenedDataStructure( numWaveFunctions, d_ONCVNonLocalProjectorTimesVectorBlock); } + if (d_dftParamsPtr->isPseudopotential && d_dftParamsPtr->useSinglePrecCheby) + { + if constexpr (dftfe::utils::MemorySpace::DEVICE == memorySpace) + { + d_ONCVnonLocalOperatorSinglePrec->initialiseFlattenedDataStructure( + numWaveFunctions, + d_ONCVNonLocalProjectorTimesVectorBlockSinglePrec); + d_ONCVnonLocalOperatorSinglePrec + ->initialiseCellWaveFunctionPointers( + d_cellWaveFunctionMatrixSrcSinglePrec); + } + else + d_ONCVnonLocalOperatorSinglePrec->initialiseFlattenedDataStructure( + numWaveFunctions, + d_ONCVNonLocalProjectorTimesVectorBlockSinglePrec); + } + d_basisOperationsPtr->reinit(numWaveFunctions, d_cellsBlockSizeHX, d_densityQuadratureID, @@ -696,6 +739,15 @@ namespace dftfe return d_basisOperationsPtr->getMultiVector(numVectors, index); } + template + dftfe::linearAlgebra::MultiVector & + KohnShamHamiltonianOperator::getScratchFEMultivectorSinglePrec( + const unsigned int numVectors, + const unsigned int index) + { + return d_basisOperationsPtr->getMultiVectorSinglePrec(numVectors, index); + } + template void KohnShamHamiltonianOperator< @@ -820,6 +872,11 @@ namespace dftfe 1); } } + if (d_dftParamsPtr->useSinglePrecCheby) + d_BLASWrapperPtr->copyValueType1ArrToValueType2Arr( + d_cellHamiltonianMatrix[d_HamiltonianIndex].size(), + d_cellHamiltonianMatrix[d_HamiltonianIndex].data(), + d_cellHamiltonianMatrixSinglePrec[d_HamiltonianIndex].data()); if 
(d_dftParamsPtr->memOptMode) if ((d_dftParamsPtr->isPseudopotential || d_dftParamsPtr->smearedNuclearCharges) && @@ -1109,6 +1166,162 @@ namespace dftfe } } + template + void + KohnShamHamiltonianOperator::HXCheby( + dftfe::linearAlgebra::MultiVector &src, + const double scalarHX, + const double scalarY, + const double scalarX, + dftfe::linearAlgebra::MultiVector &dst, + const bool onlyHPrimePartForFirstOrderDensityMatResponse, + const bool skip1, + const bool skip2, + const bool skip3) + { + const unsigned int numCells = d_basisOperationsPtr->nCells(); + const unsigned int numDoFsPerCell = d_basisOperationsPtr->nDofsPerCell(); + const unsigned int numberWavefunctions = src.numVectors(); + if (d_numVectorsInternal != numberWavefunctions) + reinitNumberWavefunctions(numberWavefunctions); + + if (d_basisOperationsPtr->d_nVectors != numberWavefunctions) + d_basisOperationsPtr->reinit(numberWavefunctions, + d_cellsBlockSizeHX, + d_densityQuadratureID, + false, + false); + const bool hasNonlocalComponents = + d_dftParamsPtr->isPseudopotential && + (d_ONCVnonLocalOperatorSinglePrec + ->getTotalNonLocalElementsInCurrentProcessor() > 0) && + !onlyHPrimePartForFirstOrderDensityMatResponse; + const dataTypes::numberFP32 scalarCoeffAlpha = dataTypes::numberFP32(1.0), + scalarCoeffBeta = dataTypes::numberFP32(0.0); + + if (!skip1 && !skip2 && !skip3) + src.updateGhostValues(); + if (!skip1) + { + d_basisOperationsPtr + ->d_constraintInfo[d_basisOperationsPtr->d_dofHandlerID] + .distribute(src); + if constexpr (memorySpace == dftfe::utils::MemorySpace::HOST) + if (d_dftParamsPtr->isPseudopotential) + d_ONCVnonLocalOperatorSinglePrec->initialiseOperatorActionOnX( + d_kPointIndex); + for (unsigned int iCell = 0; iCell < numCells; + iCell += d_cellsBlockSizeHX) + { + std::pair cellRange( + iCell, std::min(iCell + d_cellsBlockSizeHX, numCells)); + d_BLASWrapperPtr->stridedCopyToBlock( + numberWavefunctions, + numDoFsPerCell * (cellRange.second - cellRange.first), + src.data(), + d_cellWaveFunctionMatrixSrcSinglePrec.data() + + cellRange.first * numDoFsPerCell * numberWavefunctions, + d_basisOperationsPtr->d_flattenedCellDofIndexToProcessDofIndexMap + .data() + + cellRange.first * numDoFsPerCell); + if (hasNonlocalComponents) + d_ONCVnonLocalOperatorSinglePrec->applyCconjtransOnX( + d_cellWaveFunctionMatrixSrcSinglePrec.data() + + cellRange.first * numDoFsPerCell * numberWavefunctions, + cellRange); + } + } + if (!skip2) + { + if (d_dftParamsPtr->isPseudopotential && + !onlyHPrimePartForFirstOrderDensityMatResponse) + { + d_ONCVNonLocalProjectorTimesVectorBlockSinglePrec.setValue(0); + d_ONCVnonLocalOperatorSinglePrec->applyAllReduceOnCconjtransX( + d_ONCVNonLocalProjectorTimesVectorBlockSinglePrec, true); + d_ONCVNonLocalProjectorTimesVectorBlockSinglePrec + .accumulateAddLocallyOwnedBegin(); + } + src.zeroOutGhosts(); + inverseMassVectorScaledConstraintsNoneDataInfoPtr->set_zero(src); + if (d_dftParamsPtr->isPseudopotential && + !onlyHPrimePartForFirstOrderDensityMatResponse) + { + d_ONCVNonLocalProjectorTimesVectorBlockSinglePrec + .accumulateAddLocallyOwnedEnd(); + d_ONCVNonLocalProjectorTimesVectorBlockSinglePrec + .updateGhostValuesBegin(); + } + d_BLASWrapperPtr->axpby(src.locallyOwnedSize() * src.numVectors(), + scalarX, + src.data(), + scalarY, + dst.data()); + if (d_dftParamsPtr->isPseudopotential && + !onlyHPrimePartForFirstOrderDensityMatResponse) + { + d_ONCVNonLocalProjectorTimesVectorBlockSinglePrec + .updateGhostValuesEnd(); + d_ONCVnonLocalOperatorSinglePrec->applyVOnCconjtransX( + 
CouplingStructure::diagonal, + d_oncvClassPtr->getCouplingMatrixSinglePrec(), + d_ONCVNonLocalProjectorTimesVectorBlockSinglePrec, + true); + } + } + if (!skip3) + { + for (unsigned int iCell = 0; iCell < numCells; + iCell += d_cellsBlockSizeHX) + { + std::pair cellRange( + iCell, std::min(iCell + d_cellsBlockSizeHX, numCells)); + + d_BLASWrapperPtr->xgemmStridedBatched( + 'N', + 'N', + numberWavefunctions, + numDoFsPerCell, + numDoFsPerCell, + &scalarCoeffAlpha, + d_cellWaveFunctionMatrixSrcSinglePrec.data() + + cellRange.first * numDoFsPerCell * numberWavefunctions, + numberWavefunctions, + numDoFsPerCell * numberWavefunctions, + d_cellHamiltonianMatrixSinglePrec[d_HamiltonianIndex].data() + + cellRange.first * numDoFsPerCell * numDoFsPerCell, + numDoFsPerCell, + numDoFsPerCell * numDoFsPerCell, + &scalarCoeffBeta, + d_cellWaveFunctionMatrixDstSinglePrec.data(), + numberWavefunctions, + numDoFsPerCell * numberWavefunctions, + cellRange.second - cellRange.first); + if (hasNonlocalComponents) + d_ONCVnonLocalOperatorSinglePrec->applyCOnVCconjtransX( + d_cellWaveFunctionMatrixDstSinglePrec.data(), cellRange); + d_BLASWrapperPtr->axpyStridedBlockAtomicAdd( + numberWavefunctions, + numDoFsPerCell * (cellRange.second - cellRange.first), + scalarHX, + d_basisOperationsPtr->cellInverseMassVectorBasisData().data() + + cellRange.first * numDoFsPerCell, + d_cellWaveFunctionMatrixDstSinglePrec.data(), + dst.data(), + d_basisOperationsPtr->d_flattenedCellDofIndexToProcessDofIndexMap + .data() + + cellRange.first * numDoFsPerCell); + } + + inverseMassVectorScaledConstraintsNoneDataInfoPtr + ->distribute_slave_to_master(dst); + } + if (!skip1 && !skip2 && !skip3) + { + dst.accumulateAddLocallyOwned(); + dst.zeroOutGhosts(); + } + } template class KohnShamHamiltonianOperator; diff --git a/src/linAlg/linearAlgebraOperationsDevice.cc b/src/linAlg/linearAlgebraOperationsDevice.cc index a3999b6ca..dde29c22b 100644 --- a/src/linAlg/linearAlgebraOperationsDevice.cc +++ b/src/linAlg/linearAlgebraOperationsDevice.cc @@ -490,6 +490,301 @@ namespace dftfe X2 = Y2; } + void + chebyshevFilterOverlapComputeCommunicationSinglePrec( + const std::shared_ptr< + dftfe::linearAlgebra::BLASWrapper> + & BLASWrapperPtr, + operatorDFTClass &operatorMatrix, + dftfe::linearAlgebra::MultiVector &X1, + dftfe::linearAlgebra::MultiVector &Y1, + dftfe::linearAlgebra::MultiVector &X2, + dftfe::linearAlgebra::MultiVector &Y2, + dftfe::linearAlgebra::MultiVector + &X1_SP, + dftfe::linearAlgebra::MultiVector + &Y1_SP, + dftfe::linearAlgebra::MultiVector + &X2_SP, + dftfe::linearAlgebra::MultiVector + & Y2_SP, + std::vector eigenvalues, + const unsigned int m, + const double a, + const double b, + const double a0) + { + double e, c, sigma, sigma1, sigma2, gamma, alpha1Old, alpha2Old; + e = (b - a) / 2.0; + c = (b + a) / 2.0; + sigma = e / (a0 - c); + sigma1 = sigma; + gamma = 2.0 / sigma1; + + dftfe::utils::MemoryStorage + eigenValuesFiltered, eigenValuesFiltered1, eigenValuesFiltered2; + eigenValuesFiltered.resize(eigenvalues.size()); + eigenValuesFiltered.copyFrom(eigenvalues); + eigenValuesFiltered1 = eigenValuesFiltered; + eigenValuesFiltered2 = eigenValuesFiltered; + eigenValuesFiltered1.setValue(1.0); + + // + // create YArray + // initialize to zeros. 
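    // In what follows, HXCheby(src, sHX, sY, sX, dst, ...) computes
    //   dst <- sY * dst + sX * src + sHX * (inverse-mass-vector-scaled H) * src,
    // and its skip1/skip2/skip3 flags expose its three phases: (i) constraint
    // distribution, extraction of src into cell storage and the projector
    // product C^dagger X; (ii) the MPI accumulation/ghost exchange of the
    // projector vector, the sX/sY update of dst and the application of the
    // coupling matrix V; (iii) the cell-level Hamiltonian apply and assembly
    // back into dst. The loop below interleaves these phases for the two
    // wavefunction blocks X1/X2 so that communication for one block overlaps
    // computation for the other, while the *_SP multivectors carry
    // p_k(H)X - X p_k(Lambda) in FP32 and the eigenValuesFiltered arrays
    // carry p_k(lambda_i) in double precision.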
+ // x + operatorMatrix.HXCheby(X1, 1.0, 0.0, 0.0, Y1); + + + // + // call HX + // + + + double alpha1 = sigma1 / e, alpha2 = -c; + eigenValuesFiltered2.setValue(alpha1 * alpha2); + BLASWrapperPtr->ApaBD(1, + eigenValuesFiltered2.size(), + alpha1, + eigenValuesFiltered2.data(), + eigenValuesFiltered1.data(), + eigenValuesFiltered.data(), + eigenValuesFiltered2.data()); + BLASWrapperPtr->ApaBD(X1.locallyOwnedSize(), + X1.numVectors(), + -1.0, + Y1.data(), + X1.data(), + eigenValuesFiltered.data(), + Y1.data()); + X1_SP.setValue(0.0); + X2_SP.setValue(0.0); + BLASWrapperPtr->copyValueType1ArrToValueType2Arr( + X1.locallyOwnedSize() * X1.numVectors(), Y1.data(), Y1_SP.data()); + BLASWrapperPtr->xscal(Y1_SP.data(), + dataTypes::numberFP32(alpha1), + X2.locallyOwnedSize() * X2.numVectors()); + X2.updateGhostValues(); + operatorMatrix.HXCheby(X2, 1.0, 0.0, 0.0, Y2, false, false, true, true); + // + // polynomial loop + // + for (unsigned int degree = 2; degree < m + 1; ++degree) + { + sigma2 = 1.0 / (gamma - sigma); + alpha1Old = alpha1, alpha2Old = alpha2; + alpha1 = 2.0 * sigma2 / e, alpha2 = -(sigma * sigma2); + + if (degree == 2) + { + operatorMatrix.HXCheby( + X2, 1.0, 0.0, 0.0, Y2, false, true, false, true); + Y1_SP.updateGhostValuesBegin(); + operatorMatrix.HXCheby( + X2, 1.0, 0.0, 0.0, Y2, false, true, true, false); + Y1_SP.updateGhostValuesEnd(); + Y2.accumulateAddLocallyOwnedBegin(); + } + else + { + operatorMatrix.HXCheby(Y2_SP, + alpha1Old, + alpha2Old, + -c * alpha1Old, + X2_SP, + false, + true, + false, + true); + Y1_SP.updateGhostValuesBegin(); + operatorMatrix.HXCheby(Y2_SP, + alpha1Old, + alpha2Old, + -c * alpha1Old, + X2_SP, + false, + true, + true, + false); + Y1_SP.updateGhostValuesEnd(); + X2_SP.accumulateAddLocallyOwnedBegin(); + } + + + // + // call HX + // + operatorMatrix.HXCheby(Y1_SP, + alpha1, + alpha2, + -c * alpha1, + X1_SP, + false, + false, + true, + true); + if (degree == 2) + { + Y2.accumulateAddLocallyOwnedEnd(); + Y2.zeroOutGhosts(); + BLASWrapperPtr->ApaBD(X2.locallyOwnedSize(), + X2.numVectors(), + -1.0, + Y2.data(), + X2.data(), + eigenValuesFiltered.data() + + X1.numVectors(), + Y2.data()); + BLASWrapperPtr->copyValueType1ArrToValueType2Arr( + X2.locallyOwnedSize() * X2.numVectors(), + Y2.data(), + Y2_SP.data()); + BLASWrapperPtr->xscal(Y2_SP.data(), + dataTypes::numberFP32(alpha1Old), + X2.locallyOwnedSize() * X2.numVectors()); + } + else + { + X2_SP.accumulateAddLocallyOwnedEnd(); + X2_SP.zeroOutGhosts(); + BLASWrapperPtr->ApaBD(X2_SP.locallyOwnedSize(), + X2_SP.numVectors(), + alpha1Old, + X2_SP.data(), + Y2.data(), + eigenValuesFiltered2.data() + + X1_SP.numVectors(), + X2_SP.data()); + BLASWrapperPtr->axpby(eigenValuesFiltered2.size(), + -c * alpha1Old, + eigenValuesFiltered2.data(), + alpha2Old, + eigenValuesFiltered1.data()); + BLASWrapperPtr->ApaBD(1, + eigenValuesFiltered1.size(), + alpha1Old, + eigenValuesFiltered1.data(), + eigenValuesFiltered2.data(), + eigenValuesFiltered.data(), + eigenValuesFiltered1.data()); + X2_SP.swap(Y2_SP); + eigenValuesFiltered1.swap(eigenValuesFiltered2); + } + + operatorMatrix.HXCheby(Y1_SP, + alpha1, + alpha2, + -c * alpha1, + X1_SP, + false, + true, + false, + true); + Y2_SP.updateGhostValuesBegin(); + operatorMatrix.HXCheby(Y1_SP, + alpha1, + alpha2, + -c * alpha1, + X1_SP, + false, + true, + true, + false); + Y2_SP.updateGhostValuesEnd(); + X1_SP.accumulateAddLocallyOwnedBegin(); + operatorMatrix.HXCheby(Y2_SP, + alpha1, + alpha2, + -c * alpha1, + X2_SP, + false, + false, + true, + true); + 
X1_SP.accumulateAddLocallyOwnedEnd(); + X1_SP.zeroOutGhosts(); + BLASWrapperPtr->ApaBD(X1_SP.locallyOwnedSize(), + X1_SP.numVectors(), + alpha1, + X1_SP.data(), + Y1.data(), + eigenValuesFiltered2.data(), + X1_SP.data()); + + // + // XArray = YArray + // + X1_SP.swap(Y1_SP); + + if (degree == m) + { + operatorMatrix.HXCheby(Y2_SP, + alpha1, + alpha2, + -c * alpha1, + X2_SP, + false, + true, + false, + false); + X2_SP.accumulateAddLocallyOwned(); + X2_SP.zeroOutGhosts(); + BLASWrapperPtr->ApaBD(X2_SP.locallyOwnedSize(), + X2_SP.numVectors(), + alpha1, + X2_SP.data(), + Y2.data(), + eigenValuesFiltered2.data() + + X1_SP.numVectors(), + X2_SP.data()); + BLASWrapperPtr->axpby(eigenValuesFiltered2.size(), + -c * alpha1, + eigenValuesFiltered2.data(), + alpha2, + eigenValuesFiltered1.data()); + BLASWrapperPtr->ApaBD(1, + eigenValuesFiltered1.size(), + alpha1, + eigenValuesFiltered1.data(), + eigenValuesFiltered2.data(), + eigenValuesFiltered.data(), + eigenValuesFiltered1.data()); + X2_SP.swap(Y2_SP); + eigenValuesFiltered1.swap(eigenValuesFiltered2); + } + + // + // YArray = YNewArray + // + sigma = sigma2; + } + + // copy back YArray to XArray + BLASWrapperPtr->ApaBD(X1.locallyOwnedSize(), + X1.numVectors(), + 1.0, + Y1_SP.data(), + X1.data(), + eigenValuesFiltered2.data(), + X1.data()); + + BLASWrapperPtr->ApaBD(X2.locallyOwnedSize(), + X2.numVectors(), + 1.0, + Y2_SP.data(), + X2.data(), + eigenValuesFiltered2.data() + X1.numVectors(), + X2.data()); + } + void subspaceRotationSpectrumSplitScalapack( diff --git a/src/linAlg/linearAlgebraOperationsOpt.cc b/src/linAlg/linearAlgebraOperationsOpt.cc index 31e68657a..499211267 100644 --- a/src/linAlg/linearAlgebraOperationsOpt.cc +++ b/src/linAlg/linearAlgebraOperationsOpt.cc @@ -336,6 +336,129 @@ namespace dftfe X = Y; } + // + // chebyshev filtering of given subspace XArray + // + template + void + chebyshevFilterSinglePrec( + const std::shared_ptr> + & BLASWrapperPtr, + operatorDFTClass & operatorMatrix, + dftfe::linearAlgebra::MultiVector & X, + dftfe::linearAlgebra::MultiVector & Y, + dftfe::linearAlgebra::MultiVector &X_SP, + dftfe::linearAlgebra::MultiVector &Y_SP, + std::vector eigenvalues, + const unsigned int m, + const double a, + const double b, + const double a0) + { + double e, c, sigma, sigma1, sigma2, gamma; + e = (b - a) / 2.0; + c = (b + a) / 2.0; + sigma = e / (a0 - c); + sigma1 = sigma; + gamma = 2.0 / sigma1; + + dftfe::utils::MemoryStorage eigenValuesFiltered, + eigenValuesFiltered1, eigenValuesFiltered2; + eigenValuesFiltered.resize(eigenvalues.size()); + eigenValuesFiltered.copyFrom(eigenvalues); + eigenValuesFiltered1 = eigenValuesFiltered; + eigenValuesFiltered2 = eigenValuesFiltered; + eigenValuesFiltered1.setValue(1.0); + // + // create YArray + // initialize to zeros. 
+ // x + operatorMatrix.HXCheby(X, 1.0, 0.0, 0.0, Y); + // Y=HX + // + // call HX + // + double alpha1 = sigma1 / e, alpha2 = -c; + // Y=alpha1*HX+alpha1 * alpha2*X + eigenValuesFiltered2.setValue(alpha1 * alpha2); + BLASWrapperPtr->ApaBD(1, + eigenValuesFiltered2.size(), + alpha1, + eigenValuesFiltered2.data(), + eigenValuesFiltered1.data(), + eigenValuesFiltered.data(), + eigenValuesFiltered2.data()); + BLASWrapperPtr->ApaBD(X.locallyOwnedSize(), + X.numVectors(), + -1.0, + Y.data(), + X.data(), + eigenValuesFiltered.data(), + Y.data()); + X_SP.setValue(0.0); + BLASWrapperPtr->copyValueType1ArrToValueType2Arr( + X.locallyOwnedSize() * X.numVectors(), Y.data(), Y_SP.data()); + BLASWrapperPtr->xscal(Y_SP.data(), + dataTypes::numberFP32(alpha1), + X.locallyOwnedSize() * X.numVectors()); + // + // polynomial loop + // + for (unsigned int degree = 2; degree < m + 1; ++degree) + { + sigma2 = 1.0 / (gamma - sigma); + alpha1 = 2.0 * sigma2 / e, alpha2 = -(sigma * sigma2); + + + operatorMatrix.HXCheby(Y_SP, alpha1, alpha2, -c * alpha1, X_SP); + BLASWrapperPtr->ApaBD(X.locallyOwnedSize(), + X.numVectors(), + alpha1, + X_SP.data(), + Y.data(), + eigenValuesFiltered2.data(), + X_SP.data()); + + // + // call HX + // + // operatorMatrix.HXCheby( + // Y, X, Y, eigenValuesFiltered2, alpha1, alpha2, -c * alpha1, X); + BLASWrapperPtr->axpby(eigenValuesFiltered2.size(), + -c * alpha1, + eigenValuesFiltered2.data(), + alpha2, + eigenValuesFiltered1.data()); + BLASWrapperPtr->ApaBD(1, + eigenValuesFiltered1.size(), + alpha1, + eigenValuesFiltered1.data(), + eigenValuesFiltered2.data(), + eigenValuesFiltered.data(), + eigenValuesFiltered1.data()); + + + // + // XArray = YArray + // + X_SP.swap(Y_SP); + eigenValuesFiltered1.swap(eigenValuesFiltered2); + + // + // YArray = YNewArray + // + sigma = sigma2; + } + BLASWrapperPtr->ApaBD(X.locallyOwnedSize(), + X.numVectors(), + 1.0, + Y_SP.data(), + X.data(), + eigenValuesFiltered2.data(), + X.data()); + + // copy back YArray to XArray + } template @@ -2912,18 +3035,20 @@ namespace dftfe 1, operatorMatrix.getMPICommunicatorDomain(), &YNorm); + double lowerBound = std::floor(eigenValuesT[0]); + double upperBound = + std::ceil(eigenValuesT[lanczosIterations - 1] + + (dftParams.reproducible_output ? YNorm : YNorm / 10.0)); if (dftParams.verbosity >= 5 && this_mpi_process == 0) { std::cout << "bUp1: " << eigenValuesT[lanczosIterations - 1] << ", fvector norm: " << YNorm << std::endl; std::cout << "aLow: " << eigenValuesT[0] << std::endl; + std::cout << "boundL: " << lowerBound << std::endl; + std::cout << "boundU: " << upperBound << std::endl; } - double lowerBound = std::floor(eigenValuesT[0]); - double upperBound = - std::ceil(eigenValuesT[lanczosIterations - 1] + - (dftParams.reproducible_output ? 
YNorm : YNorm / 10.0)); return (std::make_pair(lowerBound, upperBound)); } @@ -3694,6 +3819,26 @@ namespace dftfe const double, const double, const double); + template void + chebyshevFilterSinglePrec( + const std::shared_ptr< + dftfe::linearAlgebra::BLASWrapper> + & BLASWrapperPtr, + operatorDFTClass &operatorMatrix, + dftfe::linearAlgebra::MultiVector &X, + dftfe::linearAlgebra::MultiVector &Y, + dftfe::linearAlgebra::MultiVector &X_SP, + dftfe::linearAlgebra::MultiVector &Y_SP, + std::vector eigenvalues, + const unsigned int m, + const double a, + const double b, + const double a0); + #ifdef DFTFE_WITH_DEVICE template void chebyshevFilter( @@ -3706,6 +3851,27 @@ namespace dftfe const double, const double, const double); + template void + chebyshevFilterSinglePrec( + const std::shared_ptr< + dftfe::linearAlgebra::BLASWrapper> + & BLASWrapperPtr, + operatorDFTClass &operatorMatrix, + dftfe::linearAlgebra::MultiVector &X, + dftfe::linearAlgebra::MultiVector &Y, + dftfe::linearAlgebra::MultiVector + &X_SP, + dftfe::linearAlgebra::MultiVector + & Y_SP, + std::vector eigenvalues, + const unsigned int m, + const double a, + const double b, + const double a0); #endif diff --git a/src/pseudo/oncv/oncvClass.t.cc b/src/pseudo/oncv/oncvClass.cc similarity index 87% rename from src/pseudo/oncv/oncvClass.t.cc rename to src/pseudo/oncv/oncvClass.cc index 695b30986..7190f30fe 100644 --- a/src/pseudo/oncv/oncvClass.t.cc +++ b/src/pseudo/oncv/oncvClass.cc @@ -16,7 +16,7 @@ // // @author Kartick Ramakrishnan // - +#include namespace dftfe { template @@ -118,7 +118,8 @@ namespace dftfe unsigned int densityQuadratureIdElectro, std::shared_ptr excFunctionalPtr, const std::vector> &atomLocations, - unsigned int numEigenValues) + unsigned int numEigenValues, + const bool singlePrecNonLocalOperator) { MPI_Barrier(d_mpiCommParent); d_BasisOperatorHostPtr = basisOperationsHostPtr; @@ -142,6 +143,7 @@ namespace dftfe d_nlpspQuadratureId = nlpspQuadratureId; d_excManagerPtr = excFunctionalPtr; d_numEigenValues = numEigenValues; + d_singlePrecNonLocalOperator = singlePrecNonLocalOperator; createAtomCenteredSphericalFunctionsForDensities(); createAtomCenteredSphericalFunctionsForProjectors(); @@ -161,6 +163,15 @@ namespace dftfe d_BasisOperatorHostPtr, d_atomicProjectorFnsContainer, d_mpiCommParent); + if constexpr (dftfe::utils::MemorySpace::HOST == memorySpace) + if (d_singlePrecNonLocalOperator) + d_nonLocalOperatorSinglePrec = + std::make_shared::type, + memorySpace>>(d_BLASWrapperHostPtr, + d_BasisOperatorHostPtr, + d_atomicProjectorFnsContainer, + d_mpiCommParent); } #if defined(DFTFE_WITH_DEVICE) else @@ -172,6 +183,15 @@ namespace dftfe d_BasisOperatorDevicePtr, d_atomicProjectorFnsContainer, d_mpiCommParent); + if constexpr (dftfe::utils::MemorySpace::DEVICE == memorySpace) + if (d_singlePrecNonLocalOperator) + d_nonLocalOperatorSinglePrec = + std::make_shared::type, + memorySpace>>(d_BLASWrapperDevicePtr, + d_BasisOperatorDevicePtr, + d_atomicProjectorFnsContainer, + d_mpiCommParent); } #endif @@ -206,7 +226,8 @@ namespace dftfe if (updateNonlocalSparsity) { - d_HamiltonianCouplingMatrixEntriesUpdated = false; + d_HamiltonianCouplingMatrixEntriesUpdated = false; + d_HamiltonianCouplingMatrixSinglePrecEntriesUpdated = false; MPI_Barrier(d_mpiCommParent); double InitTime = MPI_Wtime(); d_atomicProjectorFnsContainer->computeSparseStructure( @@ -226,6 +247,15 @@ namespace dftfe kPointCoordinates, d_BasisOperatorHostPtr, d_nlpspQuadratureId); + if (d_singlePrecNonLocalOperator) + d_nonLocalOperatorSinglePrec + 
->intitialisePartitionerKPointsAndComputeCMatrixEntries( + updateNonlocalSparsity, + kPointWeights, + kPointCoordinates, + d_BasisOperatorHostPtr, + d_nlpspQuadratureId); + MPI_Barrier(d_mpiCommParent); double TotalTime = MPI_Wtime() - InitTimeTotal; @@ -270,7 +300,8 @@ namespace dftfe if (updateNonlocalSparsity) { - d_HamiltonianCouplingMatrixEntriesUpdated = false; + d_HamiltonianCouplingMatrixEntriesUpdated = false; + d_HamiltonianCouplingMatrixSinglePrecEntriesUpdated = false; MPI_Barrier(d_mpiCommParent); double InitTime = MPI_Wtime(); d_atomicProjectorFnsContainer->getDataForSparseStructure( @@ -295,6 +326,15 @@ namespace dftfe kPointCoordinates, d_BasisOperatorHostPtr, d_nlpspQuadratureId); + if (d_singlePrecNonLocalOperator) + d_nonLocalOperatorSinglePrec + ->intitialisePartitionerKPointsAndComputeCMatrixEntries( + updateNonlocalSparsity, + kPointWeights, + kPointCoordinates, + d_BasisOperatorHostPtr, + d_nlpspQuadratureId); + MPI_Barrier(d_mpiCommParent); double TotalTime = MPI_Wtime() - InitTimeTotal; if (d_verbosity >= 2) @@ -678,6 +718,36 @@ namespace dftfe #endif } + template + const dftfe::utils::MemoryStorage< + typename dftfe::dataTypes::singlePrecType::type, + memorySpace> & + oncvClass::getCouplingMatrixSinglePrec() + { + getCouplingMatrix(); + if (!d_HamiltonianCouplingMatrixSinglePrecEntriesUpdated) + { + d_couplingMatrixEntriesSinglePrec.resize( + d_couplingMatrixEntries.size()); + if constexpr (memorySpace == dftfe::utils::MemorySpace::HOST) + d_BLASWrapperHostPtr->copyValueType1ArrToValueType2Arr( + d_couplingMatrixEntriesSinglePrec.size(), + d_couplingMatrixEntries.data(), + d_couplingMatrixEntriesSinglePrec.data()); +#if defined(DFTFE_WITH_DEVICE) + else + d_BLASWrapperDevicePtr->copyValueType1ArrToValueType2Arr( + d_couplingMatrixEntriesSinglePrec.size(), + d_couplingMatrixEntries.data(), + d_couplingMatrixEntriesSinglePrec.data()); +#endif + + d_HamiltonianCouplingMatrixSinglePrecEntriesUpdated = true; + } + + return (d_couplingMatrixEntriesSinglePrec); + } + template const std::shared_ptr> @@ -686,6 +756,15 @@ namespace dftfe return d_nonLocalOperator; } + template + const std::shared_ptr::type, + memorySpace>> + oncvClass::getNonLocalOperatorSinglePrec() + { + return d_nonLocalOperatorSinglePrec; + } + template @@ -716,4 +795,9 @@ namespace dftfe d_atomicProjectorFnsContainer->getTotalNumberOfSphericalFunctionsPerAtom( atomicNumbers[atomId])); } + template class oncvClass; +#if defined(DFTFE_WITH_DEVICE) + template class oncvClass; +#endif } // namespace dftfe diff --git a/src/solvers/eigenSolvers/chebyshevOrthogonalizedSubspaceIterationSolver.cc b/src/solvers/eigenSolvers/chebyshevOrthogonalizedSubspaceIterationSolver.cc index 80147c97b..268b4bef2 100644 --- a/src/solvers/eigenSolvers/chebyshevOrthogonalizedSubspaceIterationSolver.cc +++ b/src/solvers/eigenSolvers/chebyshevOrthogonalizedSubspaceIterationSolver.cc @@ -140,8 +140,11 @@ namespace dftfe void chebyshevOrthogonalizedSubspaceIterationSolver::solve( operatorDFTClass &operatorMatrix, - elpaScalaManager & elpaScala, - dataTypes::number * eigenVectorsFlattened, + const std::shared_ptr< + dftfe::linearAlgebra::BLASWrapper> + & BLASWrapperPtr, + elpaScalaManager & elpaScala, + dataTypes::number * eigenVectorsFlattened, dataTypes::number * eigenVectorsRotFracDensityFlattened, const unsigned int totalNumberWaveFunctions, const unsigned int localVectorSize, @@ -149,6 +152,7 @@ namespace dftfe std::vector &residualNorms, const MPI_Comm & interBandGroupComm, const MPI_Comm & mpiCommDomain, + const bool 
isFirstFilteringCall, const bool computeResidual, const bool useMixedPrec, const bool isFirstScf) @@ -240,6 +244,20 @@ namespace dftfe distributedCPUMultiVec *eigenVectorsFlattenedArrayBlock2 = &operatorMatrix.getScratchFEMultivector(vectorsBlockSize, 1); + distributedCPUMultiVec + *eigenVectorsFlattenedArrayBlockFP32 = + d_dftParams.useSinglePrecCheby ? + &operatorMatrix.getScratchFEMultivectorSinglePrec(vectorsBlockSize, + 0) : + NULL; + distributedCPUMultiVec + *eigenVectorsFlattenedArrayBlock2FP32 = + d_dftParams.useSinglePrecCheby ? + &operatorMatrix.getScratchFEMultivectorSinglePrec(vectorsBlockSize, + 1) : + NULL; + + std::vector eigenValuesBlock(vectorsBlockSize); /// storage for cell wavefunction matrix std::vector cellWaveFunctionMatrix; @@ -267,6 +285,15 @@ namespace dftfe &operatorMatrix.getScratchFEMultivector(BVec, 0); eigenVectorsFlattenedArrayBlock2 = &operatorMatrix.getScratchFEMultivector(BVec, 1); + if (d_dftParams.useSinglePrecCheby) + { + eigenVectorsFlattenedArrayBlockFP32 = + &operatorMatrix.getScratchFEMultivectorSinglePrec(BVec, + 0); + eigenVectorsFlattenedArrayBlock2FP32 = + &operatorMatrix.getScratchFEMultivectorSinglePrec(BVec, + 1); + } } // fill the eigenVectorsFlattenedArrayBlock from @@ -288,17 +315,36 @@ namespace dftfe // call Chebyshev filtering function only for the current block to // be filtered and does in-place filtering computing_timer.enter_subsection("Chebyshev filtering"); - - - - linearAlgebraOperations::chebyshevFilter( - operatorMatrix, - *eigenVectorsFlattenedArrayBlock, - *eigenVectorsFlattenedArrayBlock2, - chebyshevOrder, - d_lowerBoundUnWantedSpectrum, - d_upperBoundUnWantedSpectrum, - d_lowerBoundWantedSpectrum); + if (d_dftParams.useSinglePrecCheby && !isFirstFilteringCall) + { + eigenValuesBlock.resize(BVec); + for (unsigned int i = 0; i < BVec; i++) + { + eigenValuesBlock[i] = eigenValues[jvec + i]; + } + + linearAlgebraOperations::chebyshevFilterSinglePrec( + BLASWrapperPtr, + operatorMatrix, + (*eigenVectorsFlattenedArrayBlock), + (*eigenVectorsFlattenedArrayBlock2), + (*eigenVectorsFlattenedArrayBlockFP32), + (*eigenVectorsFlattenedArrayBlock2FP32), + eigenValuesBlock, + chebyshevOrder, + d_lowerBoundUnWantedSpectrum, + d_upperBoundUnWantedSpectrum, + d_lowerBoundWantedSpectrum); + } + else + linearAlgebraOperations::chebyshevFilter( + operatorMatrix, + *eigenVectorsFlattenedArrayBlock, + *eigenVectorsFlattenedArrayBlock2, + chebyshevOrder, + d_lowerBoundUnWantedSpectrum, + d_upperBoundUnWantedSpectrum, + d_lowerBoundWantedSpectrum); computing_timer.leave_subsection("Chebyshev filtering"); diff --git a/src/solvers/eigenSolvers/chebyshevOrthogonalizedSubspaceIterationSolverDevice.cc b/src/solvers/eigenSolvers/chebyshevOrthogonalizedSubspaceIterationSolverDevice.cc index 32679f1a0..d90d65e72 100644 --- a/src/solvers/eigenSolvers/chebyshevOrthogonalizedSubspaceIterationSolverDevice.cc +++ b/src/solvers/eigenSolvers/chebyshevOrthogonalizedSubspaceIterationSolverDevice.cc @@ -216,7 +216,26 @@ namespace dftfe d_dftParams.overlapComputeCommunCheby ? &operatorMatrix.getScratchFEMultivector(vectorsBlockSize, 3) : NULL; + distributedDeviceVec *XBlockFP32 = + d_dftParams.useSinglePrecCheby ? + &operatorMatrix.getScratchFEMultivectorSinglePrec(vectorsBlockSize, 0) : + NULL; + distributedDeviceVec *HXBlockFP32 = + d_dftParams.useSinglePrecCheby ? + &operatorMatrix.getScratchFEMultivectorSinglePrec(vectorsBlockSize, 1) : + NULL; + + distributedDeviceVec *XBlock2FP32 = + d_dftParams.overlapComputeCommunCheby && d_dftParams.useSinglePrecCheby ? 
+ &operatorMatrix.getScratchFEMultivectorSinglePrec(vectorsBlockSize, 2) : + NULL; + distributedDeviceVec *HXBlock2FP32 = + d_dftParams.overlapComputeCommunCheby && d_dftParams.useSinglePrecCheby ? + &operatorMatrix.getScratchFEMultivectorSinglePrec(vectorsBlockSize, 3) : + NULL; + operatorMatrix.reinitNumberWavefunctions(vectorsBlockSize); + std::vector eigenValuesBlock(vectorsBlockSize); if (isFirstFilteringCall) { @@ -396,10 +415,100 @@ namespace dftfe // call Chebyshev filtering function only for the current block // or two simulataneous blocks (in case of overlap computation // and communication) to be filtered and does in-place filtering - if (d_dftParams.overlapComputeCommunCheby && - numSimultaneousBlocksCurrent == 2) + if (d_dftParams.useSinglePrecCheby && !isFirstFilteringCall) + { + eigenValuesBlock.resize(vectorsBlockSize * + numSimultaneousBlocksCurrent); + if (d_dftParams.overlapComputeCommunCheby && + numSimultaneousBlocksCurrent == 2) + { + for (unsigned int i = 0; i < 2 * BVec; i++) + { + eigenValuesBlock[i] = eigenValues[jvec + i]; + } + if (useMixedPrecOverall && + d_dftParams.useSinglePrecCommunCheby) + { + (*XBlock).setCommunicationPrecision( + dftfe::utils::mpi::communicationPrecision::single); + (*HXBlock).setCommunicationPrecision( + dftfe::utils::mpi::communicationPrecision::single); + (*XBlock2).setCommunicationPrecision( + dftfe::utils::mpi::communicationPrecision::single); + (*HXBlock2).setCommunicationPrecision( + dftfe::utils::mpi::communicationPrecision::single); + } + linearAlgebraOperationsDevice:: + chebyshevFilterOverlapComputeCommunicationSinglePrec( + BLASWrapperPtr, + operatorMatrix, + (*XBlock), + (*HXBlock), + (*XBlock2), + (*HXBlock2), + (*XBlockFP32), + (*HXBlockFP32), + (*XBlock2FP32), + (*HXBlock2FP32), + eigenValuesBlock, + chebyshevOrder, + d_lowerBoundUnWantedSpectrum, + d_upperBoundUnWantedSpectrum, + d_lowerBoundWantedSpectrum); + if (useMixedPrecOverall && + d_dftParams.useSinglePrecCommunCheby) + { + (*XBlock).setCommunicationPrecision( + dftfe::utils::mpi::communicationPrecision::full); + (*HXBlock).setCommunicationPrecision( + dftfe::utils::mpi::communicationPrecision::full); + (*XBlock2).setCommunicationPrecision( + dftfe::utils::mpi::communicationPrecision::full); + (*HXBlock2).setCommunicationPrecision( + dftfe::utils::mpi::communicationPrecision::full); + } + } + else + { + for (unsigned int i = 0; i < BVec; i++) + { + eigenValuesBlock[i] = eigenValues[jvec + i]; + } + if (useMixedPrecOverall && + d_dftParams.useSinglePrecCommunCheby) + { + (*XBlock).setCommunicationPrecision( + dftfe::utils::mpi::communicationPrecision::single); + (*HXBlock).setCommunicationPrecision( + dftfe::utils::mpi::communicationPrecision::single); + } + linearAlgebraOperations::chebyshevFilterSinglePrec( + BLASWrapperPtr, + operatorMatrix, + (*XBlock), + (*HXBlock), + (*XBlockFP32), + (*HXBlockFP32), + eigenValuesBlock, + chebyshevOrder, + d_lowerBoundUnWantedSpectrum, + d_upperBoundUnWantedSpectrum, + d_lowerBoundWantedSpectrum); + + if (useMixedPrecOverall && + d_dftParams.useSinglePrecCommunCheby) + { + (*XBlock).setCommunicationPrecision( + dftfe::utils::mpi::communicationPrecision::full); + (*HXBlock).setCommunicationPrecision( + dftfe::utils::mpi::communicationPrecision::full); + } + } + } + else if (d_dftParams.overlapComputeCommunCheby && + numSimultaneousBlocksCurrent == 2) { - if (useMixedPrecOverall && d_dftParams.useMixedPrecCheby) + if (useMixedPrecOverall && d_dftParams.useSinglePrecCommunCheby) { (*XBlock).setCommunicationPrecision( 
dftfe::utils::mpi::communicationPrecision::single); @@ -421,7 +530,7 @@ namespace dftfe d_lowerBoundUnWantedSpectrum, d_upperBoundUnWantedSpectrum, d_lowerBoundWantedSpectrum); - if (useMixedPrecOverall && d_dftParams.useMixedPrecCheby) + if (useMixedPrecOverall && d_dftParams.useSinglePrecCommunCheby) { (*XBlock).setCommunicationPrecision( dftfe::utils::mpi::communicationPrecision::full); @@ -435,7 +544,7 @@ namespace dftfe } else { - if (useMixedPrecOverall && d_dftParams.useMixedPrecCheby) + if (useMixedPrecOverall && d_dftParams.useSinglePrecCommunCheby) { (*XBlock).setCommunicationPrecision( dftfe::utils::mpi::communicationPrecision::single); @@ -450,7 +559,7 @@ namespace dftfe d_lowerBoundUnWantedSpectrum, d_upperBoundUnWantedSpectrum, d_lowerBoundWantedSpectrum); - if (useMixedPrecOverall && d_dftParams.useMixedPrecCheby) + if (useMixedPrecOverall && d_dftParams.useSinglePrecCommunCheby) { (*XBlock).setCommunicationPrecision( dftfe::utils::mpi::communicationPrecision::full); @@ -920,7 +1029,7 @@ namespace dftfe numSimultaneousBlocksCurrent == 2) { if (useMixedPrecOverall && - d_dftParams.useMixedPrecCheby) + d_dftParams.useSinglePrecCommunCheby) { (*XBlock).setCommunicationPrecision( dftfe::utils::mpi::communicationPrecision:: @@ -948,7 +1057,7 @@ namespace dftfe d_upperBoundUnWantedSpectrum, d_lowerBoundWantedSpectrum); if (useMixedPrecOverall && - d_dftParams.useMixedPrecCheby) + d_dftParams.useSinglePrecCommunCheby) { (*XBlock).setCommunicationPrecision( dftfe::utils::mpi::communicationPrecision::full); @@ -963,7 +1072,7 @@ namespace dftfe else { if (useMixedPrecOverall && - d_dftParams.useMixedPrecCheby) + d_dftParams.useSinglePrecCommunCheby) { (*XBlock).setCommunicationPrecision( dftfe::utils::mpi::communicationPrecision:: @@ -981,7 +1090,7 @@ namespace dftfe d_upperBoundUnWantedSpectrum, d_lowerBoundWantedSpectrum); if (useMixedPrecOverall && - d_dftParams.useMixedPrecCheby) + d_dftParams.useSinglePrecCommunCheby) { (*XBlock).setCommunicationPrecision( dftfe::utils::mpi::communicationPrecision::full); diff --git a/tests/dft/pseudopotential/complex/fccAl_05.mpirun=16.output b/tests/dft/pseudopotential/complex/fccAl_05.mpirun=16.output new file mode 100644 index 000000000..626058985 --- /dev/null +++ b/tests/dft/pseudopotential/complex/fccAl_05.mpirun=16.output @@ -0,0 +1,542 @@ +number of atoms: 4 +number of atoms types: 1 +Z:13 +============================================================================================================================= +number of electrons: 12 +number of eigen values: 20 +============================================================================================================================= +Reduced k-Point-coordinates and weights: +2.500000000000000000e-01 2.500000000000000000e-01 2.500000000000000000e-01 1.250000000000000000e-01 +2.500000000000000000e-01 2.500000000000000000e-01 -2.500000000000000000e-01 1.250000000000000000e-01 +2.500000000000000000e-01 -2.500000000000000000e-01 2.500000000000000000e-01 1.250000000000000000e-01 +2.500000000000000000e-01 -2.500000000000000000e-01 -2.500000000000000000e-01 1.250000000000000000e-01 +-2.500000000000000000e-01 2.500000000000000000e-01 2.500000000000000000e-01 1.250000000000000000e-01 +-2.500000000000000000e-01 2.500000000000000000e-01 -2.500000000000000000e-01 1.250000000000000000e-01 +-2.500000000000000000e-01 -2.500000000000000000e-01 2.500000000000000000e-01 1.250000000000000000e-01 +-2.500000000000000000e-01 -2.500000000000000000e-01 -2.500000000000000000e-01 1.250000000000000000e-01 + 
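A side note on the parameter rename running through the solver hunks earlier in this diff: USE MIXED PREC CHEBY becomes USE SINGLE PREC COMMUN CHEBY, and the guards now toggle the block multivectors' communication precision between single and full around each filtering call. Purely to illustrate the trade that toggle controls (less communication volume at reduced accuracy), a small sketch follows. It is not DFT-FE's mechanism, which switches precision inside the MultiVector communication objects via setCommunicationPrecision rather than converting buffers by hand; the helper name allreduceSinglePrec is an assumption, and the all-reduce stands in for the actual ghost-update/compress traffic only for brevity.

    // Illustrative sketch only: demote a double buffer to float for the
    // exchange, then promote the result back. DFT-FE instead switches
    // precision inside its MultiVector communication objects via
    // setCommunicationPrecision; the all-reduce here merely stands in for
    // the ghost-update/compress traffic around the Chebyshev filter.
    #include <mpi.h>
    #include <vector>

    void allreduceSinglePrec(std::vector<double> &data, MPI_Comm comm)
    {
      std::vector<float> buf(data.begin(), data.end()); // double -> float
      MPI_Allreduce(MPI_IN_PLACE,
                    buf.data(),
                    static_cast<int>(buf.size()),
                    MPI_FLOAT,
                    MPI_SUM,
                    comm);
      data.assign(buf.begin(), buf.end()); // float -> double
    }

In the patch itself the toggle is applied only when useMixedPrecOverall and USE SINGLE PREC COMMUN CHEBY are both set, and the precision is restored to full immediately after the filtering call.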
check 0.1 + check 0.2 +-----------Simulation Domain bounding vectors (lattice vectors in fully periodic case)------------- +v1 : 7.599999999999999645e+00 0.000000000000000000e+00 0.000000000000000000e+00 +v2 : 0.000000000000000000e+00 7.599999999999999645e+00 0.000000000000000000e+00 +v3 : 0.000000000000000000e+00 0.000000000000000000e+00 7.599999999999999645e+00 +----------------------------------------------------------------------------------------- +-----Fractional coordinates of atoms------ +AtomId 0: 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 +AtomId 1: 0.000000000000000000e+00 5.000000000000000000e-01 5.000000000000000000e-01 +AtomId 2: 5.000000000000000000e-01 0.000000000000000000e+00 5.000000000000000000e-01 +AtomId 3: 5.000000000000000000e-01 5.000000000000000000e-01 0.000000000000000000e+00 +----------------------------------------------------------------------------------------- +Number Image Charges 1724 +Reading tria info and rho data from checkpoint in progress... +...Reading from checkpoint done. + +Finite element mesh information +------------------------------------------------- +FE interpolating polynomial order for Kohn-Sham eigenvalue problem: 2 +FE interpolating polynomial order for electrostatics solve: 2 +FE interpolating polynomial order for nodal electron density computation: 4 +number of elements: 1408 +number of degrees of freedom for the Kohn-Sham eigenvalue problem : 15849 +------------------------------------------------- + +Setting initial guess for wavefunctions.... + +Reading initial guess for electron-density..... + +Pseudopotential initalization.... +Starting NSCF iteration.... +Fermi Energy computed: -2.456430006978324310e-01 + +Energy computations (Hartree) +------------------- + Total energy: -8.36780768 +Total energy : -8.36780768 +Total entropic energy: 0.00604495 +Total free energy: -8.37385263 +Writing tdos File... 
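The two-column epsValue / Dos listing below is the reference density-of-states data for the new fccAl_05 NSCF test; the matching fccAl_05.prm.in added later in this diff runs it with USE SINGLE PREC CHEBY enabled. As orientation for that option, here is a minimal sketch of a standard three-term Chebyshev filter recurrence evaluated in single precision on one vector and promoted back to double at the end. It is illustrative only and is not the chebyshevFilterSinglePrec routine added in this patch, which filters block multivectors, additionally takes the current eigenvalue estimates together with separate full- and single-precision scratch multivectors, and goes through the BLASWrapper and operatorDFTClass interfaces; the names chebyshevFilterFloatSketch and matvecSP are placeholders.

    // Minimal, self-contained sketch: degree-m Chebyshev filter damping the
    // unwanted spectral interval [a, b], with a0 the lower bound of the
    // wanted spectrum. The three-term recurrence is evaluated in float and
    // the result is promoted back to double at the end.
    #include <cstddef>
    #include <functional>
    #include <vector>

    void chebyshevFilterFloatSketch(
      const std::function<void(const std::vector<float> &, std::vector<float> &)>
        &matvecSP,              // y = H x, applied in single precision
      std::vector<double> &x,   // vector to be filtered (double in/out)
      const unsigned int  m,    // polynomial degree
      const double a, const double b, const double a0)
    {
      const double e = (b - a) / 2.0, c = (b + a) / 2.0;
      double       sigma = e / (a0 - c);
      const double gamma = 2.0 / sigma;

      const std::size_t  n = x.size();
      std::vector<float> xs(n), ys(n), hx(n);
      for (std::size_t i = 0; i < n; ++i)
        xs[i] = static_cast<float>(x[i]);

      // y_1 = (sigma / e) * (H x - c x)
      matvecSP(xs, hx);
      for (std::size_t i = 0; i < n; ++i)
        ys[i] = static_cast<float>((hx[i] - c * xs[i]) * sigma / e);

      for (unsigned int k = 2; k <= m; ++k)
        {
          const double sigma2 = 1.0 / (gamma - sigma);
          matvecSP(ys, hx);
          for (std::size_t i = 0; i < n; ++i)
            {
              // y_k = (2 sigma2 / e) (H y_{k-1} - c y_{k-1}) - sigma sigma2 y_{k-2}
              const double t =
                2.0 * sigma2 / e * (hx[i] - c * ys[i]) - sigma * sigma2 * xs[i];
              xs[i] = ys[i];
              ys[i] = static_cast<float>(t);
            }
          sigma = sigma2;
        }

      // promote the filtered vector back to double precision
      for (std::size_t i = 0; i < n; ++i)
        x[i] = static_cast<double>(ys[i]);
    }

In the subspace iteration solvers modified by this patch, the single-precision path is taken per wavefunction block only when USE SINGLE PREC CHEBY is set and the call is not the first filtering pass, i.e. once eigenvalue estimates for the block are available; otherwise the existing double-precision chebyshevFilter is used.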
+epsValue Dos +-16.54768903 0.00008134 +-16.52047764 0.00008176 +-16.49326625 0.00008218 +-16.46605487 0.00008260 +-16.43884348 0.00008303 +-16.41163210 0.00008346 +-16.38442071 0.00008389 +-16.35720932 0.00008433 +-16.32999794 0.00008477 +-16.30278655 0.00008521 +-16.27557516 0.00008566 +-16.24836378 0.00008611 +-16.22115239 0.00008657 +-16.19394101 0.00008703 +-16.16672962 0.00008749 +-16.13951824 0.00008796 +-16.11230685 0.00008844 +-16.08509546 0.00008891 +-16.05788408 0.00008939 +-16.03067269 0.00008988 +-16.00346130 0.00009037 +-15.97624992 0.00009086 +-15.94903853 0.00009136 +-15.92182715 0.00009186 +-15.89461576 0.00009237 +-15.86740437 0.00009288 +-15.84019299 0.00009340 +-15.81298160 0.00009392 +-15.78577022 0.00009444 +-15.75855883 0.00009498 +-15.73134745 0.00009551 +-15.70413606 0.00009605 +-15.67692467 0.00009660 +-15.64971329 0.00009715 +-15.62250190 0.00009771 +-15.59529052 0.00009827 +-15.56807913 0.00009883 +-15.54086774 0.00009941 +-15.51365636 0.00009998 +-15.48644497 0.00010057 +-15.45923358 0.00010116 +-15.43202220 0.00010175 +-15.40481081 0.00010235 +-15.37759943 0.00010296 +-15.35038804 0.00010357 +-15.32317665 0.00010419 +-15.29596527 0.00010482 +-15.26875388 0.00010545 +-15.24154250 0.00010608 +-15.21433111 0.00010673 +-15.18711973 0.00010738 +-15.15990834 0.00010804 +-15.13269695 0.00010870 +-15.10548557 0.00010937 +-15.07827418 0.00011005 +-15.05106279 0.00011073 +-15.02385141 0.00011142 +-14.99664002 0.00011212 +-14.96942864 0.00011283 +-14.94221725 0.00011354 +-14.91500586 0.00011426 +-14.88779448 0.00011499 +-14.86058309 0.00011573 +-14.83337171 0.00011647 +-14.80616032 0.00011723 +-14.77894893 0.00011799 +-14.75173755 0.00011876 +-14.72452616 0.00011953 +-14.69731478 0.00012032 +-14.67010339 0.00012112 +-14.64289200 0.00012192 +-14.61568062 0.00012273 +-14.58846923 0.00012355 +-14.56125785 0.00012439 +-14.53404646 0.00012523 +-14.50683507 0.00012608 +-14.47962369 0.00012694 +-14.45241230 0.00012781 +-14.42520092 0.00012869 +-14.39798953 0.00012958 +-14.37077814 0.00013048 +-14.34356676 0.00013139 +-14.31635537 0.00013232 +-14.28914399 0.00013325 +-14.26193260 0.00013419 +-14.23472121 0.00013515 +-14.20750983 0.00013612 +-14.18029844 0.00013710 +-14.15308706 0.00013809 +-14.12587567 0.00013909 +-14.09866428 0.00014011 +-14.07145290 0.00014114 +-14.04424151 0.00014218 +-14.01703013 0.00014324 +-13.98981874 0.00014431 +-13.96260735 0.00014539 +-13.93539597 0.00014648 +-13.90818458 0.00014759 +-13.88097320 0.00014872 +-13.85376181 0.00014986 +-13.82655042 0.00015101 +-13.79933904 0.00015218 +-13.77212765 0.00015337 +-13.74491627 0.00015457 +-13.71770488 0.00015579 +-13.69049349 0.00015702 +-13.66328211 0.00015827 +-13.63607072 0.00015954 +-13.60885934 0.00016083 +-13.58164795 0.00016213 +-13.55443656 0.00016345 +-13.52722518 0.00016479 +-13.50001379 0.00016615 +-13.47280240 0.00016753 +-13.44559102 0.00016893 +-13.41837963 0.00017035 +-13.39116825 0.00017179 +-13.36395686 0.00017325 +-13.33674548 0.00017473 +-13.30953409 0.00017624 +-13.28232270 0.00017777 +-13.25511132 0.00017932 +-13.22789993 0.00018089 +-13.20068855 0.00018249 +-13.17347716 0.00018411 +-13.14626577 0.00018576 +-13.11905439 0.00018744 +-13.09184300 0.00018914 +-13.06463161 0.00019086 +-13.03742023 0.00019262 +-13.01020884 0.00019440 +-12.98299746 0.00019621 +-12.95578607 0.00019806 +-12.92857468 0.00019993 +-12.90136330 0.00020183 +-12.87415191 0.00020377 +-12.84694053 0.00020573 +-12.81972914 0.00020773 +-12.79251776 0.00020977 +-12.76530637 0.00021184 +-12.73809498 0.00021395 +-12.71088360 
0.00021609 +-12.68367221 0.00021827 +-12.65646082 0.00022049 +-12.62924944 0.00022275 +-12.60203805 0.00022506 +-12.57482667 0.00022740 +-12.54761528 0.00022979 +-12.52040389 0.00023222 +-12.49319251 0.00023469 +-12.46598112 0.00023722 +-12.43876974 0.00023979 +-12.41155835 0.00024241 +-12.38434696 0.00024508 +-12.35713558 0.00024781 +-12.32992419 0.00025059 +-12.30271281 0.00025342 +-12.27550142 0.00025631 +-12.24829003 0.00025926 +-12.22107865 0.00026227 +-12.19386726 0.00026534 +-12.16665588 0.00026848 +-12.13944449 0.00027168 +-12.11223310 0.00027495 +-12.08502172 0.00027829 +-12.05781033 0.00028171 +-12.03059895 0.00028519 +-12.00338756 0.00028876 +-11.97617617 0.00029241 +-11.94896479 0.00029613 +-11.92175340 0.00029995 +-11.89454202 0.00030385 +-11.86733063 0.00030784 +-11.84011924 0.00031193 +-11.81290786 0.00031611 +-11.78569647 0.00032039 +-11.75848509 0.00032478 +-11.73127370 0.00032927 +-11.70406231 0.00033388 +-11.67685093 0.00033860 +-11.64963954 0.00034345 +-11.62242816 0.00034841 +-11.59521677 0.00035351 +-11.56800538 0.00035874 +-11.54079400 0.00036411 +-11.51358261 0.00036962 +-11.48637123 0.00037528 +-11.45915984 0.00038110 +-11.43194845 0.00038708 +-11.40473707 0.00039323 +-11.37752568 0.00039956 +-11.35031430 0.00040606 +-11.32310291 0.00041276 +-11.29589152 0.00041966 +-11.26868014 0.00042676 +-11.24146875 0.00043408 +-11.21425737 0.00044162 +-11.18704598 0.00044940 +-11.15983459 0.00045743 +-11.13262321 0.00046571 +-11.10541182 0.00047426 +-11.07820043 0.00048309 +-11.05098905 0.00049221 +-11.02377766 0.00050165 +-10.99656628 0.00051141 +-10.96935489 0.00052151 +-10.94214351 0.00053196 +-10.91493212 0.00054279 +-10.88772073 0.00055401 +-10.86050935 0.00056565 +-10.83329796 0.00057772 +-10.80608658 0.00059025 +-10.77887519 0.00060327 +-10.75166380 0.00061679 +-10.72445242 0.00063086 +-10.69724103 0.00064550 +-10.67002964 0.00066073 +-10.64281826 0.00067661 +-10.61560687 0.00069316 +-10.58839549 0.00071043 +-10.56118410 0.00072846 +-10.53397272 0.00074729 +-10.50676133 0.00076698 +-10.47954994 0.00078759 +-10.45233856 0.00080918 +-10.42512717 0.00083180 +-10.39791579 0.00085553 +-10.37070440 0.00088044 +-10.34349301 0.00090663 +-10.31628163 0.00093418 +-10.28907024 0.00096318 +-10.26185885 0.00099376 +-10.23464747 0.00102602 +-10.20743608 0.00106011 +-10.18022470 0.00109615 +-10.15301331 0.00113431 +-10.12580192 0.00117477 +-10.09859054 0.00121772 +-10.07137915 0.00126337 +-10.04416777 0.00131196 +-10.01695638 0.00136375 +-9.98974500 0.00141904 +-9.96253361 0.00147816 +-9.93532222 0.00154148 +-9.90811084 0.00160941 +-9.88089945 0.00168243 +-9.85368807 0.00176106 +-9.82647668 0.00184590 +-9.79926529 0.00193764 +-9.77205391 0.00203706 +-9.74484252 0.00214506 +-9.71763113 0.00226266 +-9.69041975 0.00239105 +-9.66320836 0.00253162 +-9.63599698 0.00268597 +-9.60878559 0.00285597 +-9.58157420 0.00304386 +-9.55436282 0.00325224 +-9.52715143 0.00348423 +-9.49994005 0.00374358 +-9.47272866 0.00403478 +-9.44551727 0.00436331 +-9.41830589 0.00473586 +-9.39109450 0.00516069 +-9.36388312 0.00564812 +-9.33667173 0.00621108 +-9.30946034 0.00686609 +-9.28224896 0.00763436 +-9.25503757 0.00854362 +-9.22782619 0.00963058 +-9.20061480 0.01094467 +-9.17340341 0.01255371 +-9.14619203 0.01455269 +-9.11898064 0.01707790 +-9.09176926 0.02033029 +-9.06455787 0.02461568 +-9.03734648 0.03041702 +-9.01013510 0.03853110 +-8.98292371 0.05034263 +-8.95571233 0.06841301 +-8.92850094 0.09784615 +-8.90128955 0.14971543 +-8.87407817 0.24999785 +-8.84686678 0.45738644 +-8.81965540 0.80573500 +-8.79244401 
0.85773698 +-8.76523262 0.51002225 +-8.73802124 0.27592182 +-8.71080985 0.16251138 +-8.68359847 0.10478173 +-8.65638708 0.07252403 +-8.62917569 0.05296640 +-8.60196431 0.04031008 +-8.57475292 0.03168571 +-8.54754154 0.02556052 +-8.52033015 0.02106134 +-8.49311876 0.01766328 +-8.46590738 0.01503632 +-8.43869599 0.01296490 +-8.41148461 0.01130357 +-8.38427322 0.00995140 +-8.35706183 0.00883663 +-8.32985045 0.00790711 +-8.30263906 0.00712425 +-8.27542767 0.00645899 +-8.24821629 0.00588911 +-8.22100490 0.00539742 +-8.19379352 0.00497041 +-8.16658213 0.00459738 +-8.13937075 0.00426975 +-8.11215936 0.00398059 +-8.08494797 0.00372424 +-8.05773659 0.00349605 +-8.03052520 0.00329216 +-8.00331382 0.00310936 +-7.97610243 0.00294496 +-7.94889104 0.00279669 +-7.92167966 0.00266261 +-7.89446827 0.00254108 +-7.86725688 0.00243069 +-7.84004550 0.00233022 +-7.81283411 0.00223862 +-7.78562273 0.00215498 +-7.75841134 0.00207851 +-7.73119996 0.00200851 +-7.70398857 0.00194438 +-7.67677718 0.00188558 +-7.64956580 0.00183164 +-7.62235441 0.00178213 +-7.59514302 0.00173670 +-7.56793164 0.00169501 +-7.54072025 0.00165676 +-7.51350887 0.00162170 +-7.48629748 0.00158959 +-7.45908609 0.00156023 +-7.43187471 0.00153342 +-7.40466332 0.00150900 +-7.37745194 0.00148681 +-7.35024055 0.00146672 +-7.32302917 0.00144861 +-7.29581778 0.00143237 +-7.26860639 0.00141789 +-7.24139501 0.00140509 +-7.21418362 0.00139389 +-7.18697223 0.00138422 +-7.15976085 0.00137601 +-7.13254946 0.00136921 +-7.10533808 0.00136376 +-7.07812669 0.00135962 +-7.05091531 0.00135676 +-7.02370392 0.00135512 +-6.99649253 0.00135469 +-6.96928115 0.00135545 +-6.94206976 0.00135735 +-6.91485837 0.00136041 +-6.88764699 0.00136459 +-6.86043560 0.00136988 +-6.83322422 0.00137629 +-6.80601283 0.00138381 +-6.77880144 0.00139244 +-6.75159006 0.00140217 +-6.72437867 0.00141303 +-6.69716729 0.00142502 +-6.66995590 0.00143815 +-6.64274451 0.00145244 +-6.61553313 0.00146792 +-6.58832174 0.00148459 +-6.56111036 0.00150250 +-6.53389897 0.00152167 +-6.50668758 0.00154213 +-6.47947620 0.00156393 +-6.45226481 0.00158711 +-6.42505343 0.00161171 +-6.39784204 0.00163778 +-6.37063065 0.00166539 +-6.34341927 0.00169459 +-6.31620788 0.00172545 +-6.28899650 0.00175804 +-6.26178511 0.00179245 +-6.23457372 0.00182875 +-6.20736234 0.00186704 +-6.18015095 0.00190742 +-6.15293957 0.00195001 +-6.12572818 0.00199491 +-6.09851679 0.00204227 +-6.07130541 0.00209222 +-6.04409402 0.00214491 +-6.01688264 0.00220050 +-5.98967125 0.00225919 +-5.96245986 0.00232116 +-5.93524848 0.00238662 +-5.90803709 0.00245582 +-5.88082571 0.00252900 +-5.85361432 0.00260645 +-5.82640293 0.00268846 +-5.79919155 0.00277538 +-5.77198016 0.00286756 +-5.74476878 0.00296542 +-5.71755739 0.00306940 +-5.69034600 0.00317998 +-5.66313462 0.00329771 +-5.63592323 0.00342319 +-5.60871185 0.00355709 +-5.58150046 0.00370015 +-5.55428907 0.00385318 +-5.52707769 0.00401713 +-5.49986630 0.00419301 +-5.47265491 0.00438199 +-5.44544353 0.00458538 +-5.41823214 0.00480464 +-5.39102076 0.00504146 +-5.36380937 0.00529773 +-5.33659799 0.00557561 +-5.30938660 0.00587757 +-5.28217521 0.00620646 +-5.25496383 0.00656554 +-5.22775244 0.00695861 +-5.20054106 0.00739005 +-5.17332967 0.00786501 +-5.14611828 0.00838952 +-5.11890690 0.00897068 +-5.09169551 0.00961693 +-5.06448412 0.01033832 +-5.03727274 0.01114694 +-5.01006135 0.01205741 +-4.98284997 0.01308752 +-4.95563858 0.01425916 +-4.92842720 0.01559940 +-4.90121581 0.01714215 +-4.87400442 0.01893025 +-4.84679304 0.02101848 +-4.81958165 0.02347774 +-4.79237026 0.02640115 +-4.76515888 
0.02991284 +-4.73794749 0.03418120 +-4.71073611 0.03943916 +-4.68352472 0.04601613 +-4.65631334 0.05438983 +-4.62910195 0.06527347 +-4.60189056 0.07976846 +-4.57467918 0.09964434 +-4.54746779 0.12788076 +-4.52025640 0.16978468 +-4.49304502 0.23546598 +-4.46583363 0.34577718 +-4.43862225 0.54760540 +-4.41141086 0.95229794 +-4.38419947 1.77020450 +-4.35698809 2.70074066 +-4.32977670 2.16450795 +-4.30256532 1.18697729 +-4.27535393 0.66043653 +-4.24814254 0.40386505 +-4.22093116 0.26829958 +-4.19371977 0.18988206 +-4.16650839 0.14099032 +-4.13929700 0.10864096 +-4.11208561 0.08620145 +-4.08487423 0.07003182 +-4.05766284 0.05801104 +-4.03045146 0.04883982 +-4.00324007 0.04168805 +-3.97602868 0.03600609 +-3.94881730 0.03141869 +-3.92160591 0.02766280 +-3.89439453 0.02454968 +-3.86718314 0.02194113 +-3.83997175 0.01973414 +-3.81276037 0.01785063 +-3.78554898 0.01623061 +-3.75833760 0.01482736 +-3.73112621 0.01360406 +-3.70391482 0.01253138 +-3.67670344 0.01158574 +-3.64949205 0.01074799 +-3.62228067 0.01000245 +-3.59506928 0.00933622 +-3.56785789 0.00873855 +-3.54064651 0.00820047 +-3.51343512 0.00771443 +-3.48622374 0.00727405 +-3.45901235 0.00687389 +-3.43180096 0.00650931 +-3.40458958 0.00617632 diff --git a/tests/dft/pseudopotential/complex/fccAl_05.prm.in b/tests/dft/pseudopotential/complex/fccAl_05.prm.in new file mode 100644 index 000000000..f770a3fff --- /dev/null +++ b/tests/dft/pseudopotential/complex/fccAl_05.prm.in @@ -0,0 +1,91 @@ +set VERBOSITY = 0 +set REPRODUCIBLE OUTPUT=true +set SOLVER MODE = NSCF +set RESTART FOLDER = ../../fccAl_03.release/mpirun=16/fccAl_03 + +subsection Post-processing Options + set WRITE BANDS = false + set WRITE DENSITY OF STATES=true +end + +subsection Boundary conditions + set SMEARED NUCLEAR CHARGES=false + set FLOATING NUCLEAR CHARGES=false + set PERIODIC1 = true + set PERIODIC2 = true + set PERIODIC3 = true + set POINT WISE DIRICHLET CONSTRAINT=true + set SELF POTENTIAL RADIUS = 0 + set CONSTRAINTS PARALLEL CHECK=true +end + + +subsection Brillouin zone k point sampling options + set USE TIME REVERSAL SYMMETRY = false + set kPOINT RULE FILE = @SOURCE_DIR@/kPoint2x2x2File +end + +subsection SCF Checkpointing and Restart + set LOAD RHO DATA = true + set RESTART SP FROM NO SP = false + set SAVE RHO DATA = false +end + +subsection Parallelization + set NPKPT=2 +end + +subsection DFT functional parameters + set EXCHANGE CORRELATION TYPE = 1 + set PSEUDOPOTENTIAL CALCULATION = true + set PSEUDOPOTENTIAL FILE NAMES LIST = @SOURCE_DIR@/pseudoAlKB.inp + set PSEUDO TESTS FLAG = true +end + + + +subsection Finite element mesh parameters + set POLYNOMIAL ORDER = 2 + subsection Auto mesh generation parameters + set AUTO ADAPT BASE MESH SIZE=false + set ATOM BALL RADIUS = 2.0 + set BASE MESH SIZE = 4.0 + set MESH SIZE AT ATOM = 0.6 + set MESH SIZE AROUND ATOM = 0.6 + end + +end + +subsection Geometry + set NATOMS=4 + set NATOM TYPES=1 + set ATOMIC COORDINATES FILE = @SOURCE_DIR@/fccAl_coordinates.inp + set DOMAIN VECTORS FILE = @SOURCE_DIR@/fccAl_domainBoundingVectors.inp +end + + +subsection Poisson problem parameters + set MAXIMUM ITERATIONS = 4000 + set TOLERANCE = 1e-12 +end + + +subsection SCF parameters + set COMPUTE ENERGY EACH ITER=false + + set MIXING HISTORY = 70 + set MIXING PARAMETER = 0.5 + set MAXIMUM ITERATIONS = 50 + set TEMPERATURE = 500 + set TOLERANCE = 1e-6 + set STARTING WFC=ATOMIC + + subsection Eigen-solver parameters + set NUMBER OF KOHN-SHAM WAVEFUNCTIONS = 20 + set CHEBYSHEV FILTER TOLERANCE=1e-5 + set ORTHOGONALIZATION TYPE=CGS + set 
CHEBYSHEV POLYNOMIAL DEGREE=8 + set USE ELPA=true + set USE SINGLE PREC CHEBY = true + end +end diff --git a/testsGPU/pseudopotential/complex/accuracyBenchmarks/outputMg2x_7 b/testsGPU/pseudopotential/complex/accuracyBenchmarks/outputMg2x_7 new file mode 100644 index 000000000..37260f3a9 --- /dev/null +++ b/testsGPU/pseudopotential/complex/accuracyBenchmarks/outputMg2x_7 @@ -0,0 +1,104 @@ +number of atoms: 31 +number of atoms types: 1 +Z:12 +============================================================================================================================= +number of electrons: 310 +number of eigen values: 180 +============================================================================================================================= +Total number of k-points 2 +-----------Simulation Domain bounding vectors (lattice vectors in fully periodic case)------------- +v1 : 1.176399999999999935e+01 0.000000000000000000e+00 0.000000000000000000e+00 +v2 : 0.000000000000000000e+00 1.917200000000000060e+01 0.000000000000000000e+00 +v3 : 0.000000000000000000e+00 0.000000000000000000e+00 2.037584570023999930e+01 +----------------------------------------------------------------------------------------- +-----Fractional coordinates of atoms------ +AtomId 0: 2.500000000000000000e-01 2.500000000000000000e-01 4.166666666669999741e-01 +AtomId 1: 2.500000000000000000e-01 0.000000000000000000e+00 2.500000000000000000e-01 +AtomId 2: 0.000000000000000000e+00 2.500000000000000000e-01 1.666666666670000019e-01 +AtomId 3: 0.000000000000000000e+00 0.000000000000000000e+00 5.000000000000000000e-01 +AtomId 4: 2.500000000000000000e-01 2.500000000000000000e-01 9.166666666670000296e-01 +AtomId 5: 2.500000000000000000e-01 0.000000000000000000e+00 7.500000000000000000e-01 +AtomId 6: 0.000000000000000000e+00 2.500000000000000000e-01 6.666666666670000296e-01 +AtomId 7: 0.000000000000000000e+00 5.000000000000000000e-01 0.000000000000000000e+00 +AtomId 8: 2.500000000000000000e-01 7.500000000000000000e-01 4.166666666669999741e-01 +AtomId 9: 2.500000000000000000e-01 5.000000000000000000e-01 2.500000000000000000e-01 +AtomId 10: 0.000000000000000000e+00 7.500000000000000000e-01 1.666666666670000019e-01 +AtomId 11: 0.000000000000000000e+00 5.000000000000000000e-01 5.000000000000000000e-01 +AtomId 12: 2.500000000000000000e-01 7.500000000000000000e-01 9.166666666670000296e-01 +AtomId 13: 2.500000000000000000e-01 5.000000000000000000e-01 7.500000000000000000e-01 +AtomId 14: 0.000000000000000000e+00 7.500000000000000000e-01 6.666666666670000296e-01 +AtomId 15: 5.000000000000000000e-01 0.000000000000000000e+00 0.000000000000000000e+00 +AtomId 16: 7.500000000000000000e-01 2.500000000000000000e-01 4.166666666669999741e-01 +AtomId 17: 7.500000000000000000e-01 0.000000000000000000e+00 2.500000000000000000e-01 +AtomId 18: 5.000000000000000000e-01 2.500000000000000000e-01 1.666666666670000019e-01 +AtomId 19: 5.000000000000000000e-01 0.000000000000000000e+00 5.000000000000000000e-01 +AtomId 20: 7.500000000000000000e-01 2.500000000000000000e-01 9.166666666670000296e-01 +AtomId 21: 7.500000000000000000e-01 0.000000000000000000e+00 7.500000000000000000e-01 +AtomId 22: 5.000000000000000000e-01 2.500000000000000000e-01 6.666666666670000296e-01 +AtomId 23: 5.000000000000000000e-01 5.000000000000000000e-01 0.000000000000000000e+00 +AtomId 24: 7.500000000000000000e-01 7.500000000000000000e-01 4.166666666669999741e-01 +AtomId 25: 7.500000000000000000e-01 5.000000000000000000e-01 2.500000000000000000e-01 +AtomId 26: 5.000000000000000000e-01 
7.500000000000000000e-01 1.666666666670000019e-01 +AtomId 27: 5.000000000000000000e-01 5.000000000000000000e-01 5.000000000000000000e-01 +AtomId 28: 7.500000000000000000e-01 7.500000000000000000e-01 9.166666666670000296e-01 +AtomId 29: 7.500000000000000000e-01 5.000000000000000000e-01 7.500000000000000000e-01 +AtomId 30: 5.000000000000000000e-01 7.500000000000000000e-01 6.666666666670000296e-01 +----------------------------------------------------------------------------------------- +Number Image Charges 2094 + +Finite element mesh information +------------------------------------------------- +FE interpolating polynomial order for Kohn-Sham eigenvalue problem: 3 +FE interpolating polynomial order for electrostatics solve: 3 +FE interpolating polynomial order for nodal electron density computation: 5 +number of elements: 1440 +number of degrees of freedom for the Kohn-Sham eigenvalue problem : 52791 +------------------------------------------------- + +Setting initial guess for wavefunctions.... + +Reading initial guess for electron-density..... + +Pseudopotential initalization.... + +Starting SCF iterations.... +SCF iterations converged to the specified tolerance after: 39 iterations. + +Energy computations (Hartree) +------------------- + Total energy: -1673.56610253 + +Absolute values of ion forces (Hartree/Bohr) +-------------------------------------------------------------------------------------------- +AtomId 0: 0.000820,0.001893,0.012176 +AtomId 1: 0.001957,0.000000,0.000544 +AtomId 2: 0.000000,0.001558,0.141588 +AtomId 3: 0.000050,0.000000,0.001169 +AtomId 4: 0.000591,0.001197,0.010672 +AtomId 5: 0.002144,0.000000,0.000842 +AtomId 6: 0.000000,0.000888,0.144486 +AtomId 7: 0.000007,0.000000,0.001077 +AtomId 8: 0.000820,0.001893,0.012176 +AtomId 9: 0.000298,0.000000,0.001856 +AtomId 10: 0.000000,0.001558,0.141588 +AtomId 11: 0.000007,0.000000,0.000027 +AtomId 12: 0.000591,0.001197,0.010671 +AtomId 13: 0.000615,0.000000,0.002028 +AtomId 14: 0.000000,0.000888,0.144486 +AtomId 15: 0.000048,0.000000,0.001309 +AtomId 16: 0.000820,0.001893,0.012176 +AtomId 17: 0.001957,0.000000,0.000544 +AtomId 18: 0.000059,0.001056,0.143329 +AtomId 19: 0.000039,0.000000,0.000261 +AtomId 20: 0.000591,0.001197,0.010672 +AtomId 21: 0.002144,0.000000,0.000842 +AtomId 22: 0.000059,0.001259,0.144065 +AtomId 23: 0.000007,0.000000,0.000709 +AtomId 24: 0.000820,0.001893,0.012176 +AtomId 25: 0.000298,0.000000,0.001856 +AtomId 26: 0.000059,0.001056,0.143329 +AtomId 27: 0.000007,0.000000,0.000171 +AtomId 28: 0.000591,0.001197,0.010671 +AtomId 29: 0.000615,0.000000,0.002028 +AtomId 30: 0.000059,0.001259,0.144065 +-------------------------------------------------------------------------------------------- diff --git a/testsGPU/pseudopotential/complex/jobscripts/crusher.slurm b/testsGPU/pseudopotential/complex/jobscripts/crusher.slurm index ffdd0573e..15df4fb3c 100644 --- a/testsGPU/pseudopotential/complex/jobscripts/crusher.slurm +++ b/testsGPU/pseudopotential/complex/jobscripts/crusher.slurm @@ -26,5 +26,7 @@ srun -n 6 -c 1 ./dftfe parameterFileMg2x_5.prm > outputMg2x_5 srun -n 6 -c 1 ./dftfe parameterFileMg2x_6.prm > outputMg2x_6 +srun -n 6 -c 1 ./dftfe parameterFileMg2x_7.prm > outputMg2x_7 + srun -n 6 -c 1 ./dftfe parameterFileBe.prm > outputBe diff --git a/testsGPU/pseudopotential/complex/jobscripts/frontierJobScript6GCDs6MPITasks.rc b/testsGPU/pseudopotential/complex/jobscripts/frontierJobScript6GCDs6MPITasks.rc index aa420fa35..8de363644 100644 --- 
a/testsGPU/pseudopotential/complex/jobscripts/frontierJobScript6GCDs6MPITasks.rc +++ b/testsGPU/pseudopotential/complex/jobscripts/frontierJobScript6GCDs6MPITasks.rc @@ -29,4 +29,5 @@ srun -n 6 -c 7 --gpu-bind closest $BASE/dftfe parameterFileMg2x_3.prm > outputMg srun -n 6 -c 7 --gpu-bind closest $BASE/dftfe parameterFileMg2x_4.prm > outputMg2x_4 srun -n 6 -c 7 --gpu-bind closest $BASE/dftfe parameterFileMg2x_5.prm > outputMg2x_5 srun -n 6 -c 7 --gpu-bind closest $BASE/dftfe parameterFileMg2x_6.prm > outputMg2x_6 +srun -n 6 -c 7 --gpu-bind closest $BASE/dftfe parameterFileMg2x_7.prm > outputMg2x_7 srun -n 6 -c 7 --gpu-bind closest $BASE/dftfe parameterFileBe.prm > outputBe diff --git a/testsGPU/pseudopotential/complex/jobscripts/matrixlabgpu18Tasks6GPUs.slurm b/testsGPU/pseudopotential/complex/jobscripts/matrixlabgpu18Tasks6GPUs.slurm index bfb8883f8..0677d90d2 100644 --- a/testsGPU/pseudopotential/complex/jobscripts/matrixlabgpu18Tasks6GPUs.slurm +++ b/testsGPU/pseudopotential/complex/jobscripts/matrixlabgpu18Tasks6GPUs.slurm @@ -21,5 +21,6 @@ srun -n $SLURM_NTASKS --mpi=pmi2 ./dftfe parameterFileMg2x_3.prm > outputMg2x_3 srun -n $SLURM_NTASKS --mpi=pmi2 ./dftfe parameterFileMg2x_4.prm > outputMg2x_4 srun -n $SLURM_NTASKS --mpi=pmi2 ./dftfe parameterFileMg2x_5.prm > outputMg2x_5 srun -n $SLURM_NTASKS --mpi=pmi2 ./dftfe parameterFileMg2x_6.prm > outputMg2x_6 +srun -n $SLURM_NTASKS --mpi=pmi2 ./dftfe parameterFileMg2x_7.prm > outputMg2x_7 srun -n $SLURM_NTASKS --mpi=pmi2 ./dftfe parameterFileBe.prm > outputBe diff --git a/testsGPU/pseudopotential/complex/jobscripts/perlmutter.slurm b/testsGPU/pseudopotential/complex/jobscripts/perlmutter.slurm index bfff8ad16..d688032eb 100644 --- a/testsGPU/pseudopotential/complex/jobscripts/perlmutter.slurm +++ b/testsGPU/pseudopotential/complex/jobscripts/perlmutter.slurm @@ -31,5 +31,7 @@ srun ./dftfe parameterFileMg2x_5.prm > outputMg2x_5 srun ./dftfe parameterFileMg2x_6.prm > outputMg2x_6 +srun ./dftfe parameterFileMg2x_7.prm > outputMg2x_7 + srun -n 6 -c 1 ./dftfe parameterFileBe.prm > outputBe diff --git a/testsGPU/pseudopotential/complex/jobscripts/summit.lsf b/testsGPU/pseudopotential/complex/jobscripts/summit.lsf index 4925d9782..d2c1bc50f 100644 --- a/testsGPU/pseudopotential/complex/jobscripts/summit.lsf +++ b/testsGPU/pseudopotential/complex/jobscripts/summit.lsf @@ -23,4 +23,6 @@ jsrun --smpiargs "-gpu" -n 6 -a 1 -c 7 -g 1 -r 6 -d packed -b packed:7 ./dftfe p jsrun --smpiargs "-gpu" -n 6 -a 1 -c 7 -g 1 -r 6 -d packed -b packed:7 ./dftfe parameterFileMg2x_6.prm > outputMg2x_6 +jsrun --smpiargs "-gpu" -n 6 -a 1 -c 7 -g 1 -r 6 -d packed -b packed:7 ./dftfe parameterFileMg2x_7.prm > outputMg2x_7 + jsrun --smpiargs "-gpu" -n 6 -a 1 -c 7 -g 1 -r 6 -d packed -b packed:7 ./dftfe parameterFileBe.prm > outputBe diff --git a/testsGPU/pseudopotential/complex/parameterFileMg2x_7.prm b/testsGPU/pseudopotential/complex/parameterFileMg2x_7.prm new file mode 100644 index 000000000..3a13a0bb2 --- /dev/null +++ b/testsGPU/pseudopotential/complex/parameterFileMg2x_7.prm @@ -0,0 +1,86 @@ +set VERBOSITY = 0 +set REPRODUCIBLE OUTPUT=true +set USE GPU=true +subsection GPU + set AUTO GPU BLOCK SIZES=false + set USE GPUDIRECT MPI ALL REDUCE = true + set USE ELPA GPU KERNEL=false +end + +subsection Boundary conditions + set SMEARED NUCLEAR CHARGES=true + set FLOATING NUCLEAR CHARGES=true + set CONSTRAINTS FROM SERIAL DOFHANDLER = false + set CONSTRAINTS PARALLEL CHECK = false + set PERIODIC1 = true + set PERIODIC2 = true + set PERIODIC3 = true + set SELF POTENTIAL 
RADIUS = 3.0 + set POINT WISE DIRICHLET CONSTRAINT =false +end + + +subsection DFT functional parameters + set EXCHANGE CORRELATION TYPE = 4 + set PSEUDOPOTENTIAL CALCULATION = true + set PSEUDOPOTENTIAL FILE NAMES LIST = pseudoMg.inp + set SPIN POLARIZATION=1 + set START MAGNETIZATION=0.1 +end + + +subsection Finite element mesh parameters + set POLYNOMIAL ORDER = 3 + subsection Auto mesh generation parameters + set AUTO ADAPT BASE MESH SIZE=false + set ATOM BALL RADIUS = 2.0 + set BASE MESH SIZE = 4.0 + set MESH SIZE AROUND ATOM = 1.0 + set MESH SIZE AT ATOM = 1.0 + end +end + +subsection Geometry + set NATOMS=31 + set NATOM TYPES=1 + set ATOMIC COORDINATES FILE = coordinatesMg2x.inp + set DOMAIN VECTORS FILE = domainVectorsMg2x.inp + subsection Optimization + set ION FORCE = true + end +end + +subsection Brillouin zone k point sampling options + set USE TIME REVERSAL SYMMETRY = false + subsection Monkhorst-Pack (MP) grid generation + set SAMPLING POINTS 1 = 2 + set SAMPLING POINTS 2 = 1 + set SAMPLING POINTS 3 = 1 + set SAMPLING SHIFT 1 = 1 + set SAMPLING SHIFT 2 = 0 + set SAMPLING SHIFT 3 = 0 + end +end + + +subsection Parallelization + set NPBAND=1 + set NPKPT=2 +end + +subsection SCF parameters + set COMPUTE ENERGY EACH ITER = false + set MAXIMUM ITERATIONS = 100 + set TEMPERATURE = 500 + set TOLERANCE = 1e-6 + set STARTING WFC=ATOMIC + subsection Eigen-solver parameters + set CHEBYSHEV POLYNOMIAL DEGREE = 20 + set NUMBER OF KOHN-SHAM WAVEFUNCTIONS = 180 + set CHEBY WFC BLOCK SIZE=90 + set WFC BLOCK SIZE=90 + set CHEBYSHEV FILTER TOLERANCE=1e-5 + set SCALAPACKPROCS=2 + set USE SINGLE PREC CHEBY = true + end +end diff --git a/testsGPU/pseudopotential/real/parameterFileN2_3.prm b/testsGPU/pseudopotential/real/parameterFileN2_3.prm index 51b5c5fb7..3678f49bf 100644 --- a/testsGPU/pseudopotential/real/parameterFileN2_3.prm +++ b/testsGPU/pseudopotential/real/parameterFileN2_3.prm @@ -62,7 +62,7 @@ subsection SCF parameters set USE ELPA=true set SCALAPACKPROCS=2 set OVERLAP COMPUTE COMMUN CHEBY=true - set USE MIXED PREC CHEBY=true + set USE SINGLE PREC COMMUN CHEBY=true set OVERLAP COMPUTE COMMUN ORTHO RR=false end end diff --git a/utils/BLASWrapperDevice.cu.cc b/utils/BLASWrapperDevice.cu.cc index ed0df0434..129b0a10f 100644 --- a/utils/BLASWrapperDevice.cu.cc +++ b/utils/BLASWrapperDevice.cu.cc @@ -574,6 +574,32 @@ namespace dftfe beta); } + template + void + BLASWrapper::ApaBD( + const unsigned int m, + const unsigned int n, + const ValueType0 alpha, + const ValueType1 * A, + const ValueType2 * B, + const ValueType3 * D, + ValueType4 * C) const + { + ApaBDDeviceKernel<<<(n * m / dftfe::utils::DEVICE_BLOCK_SIZE) + 1, + dftfe::utils::DEVICE_BLOCK_SIZE>>>( + m, + n, + alpha, + dftfe::utils::makeDataTypeDeviceCompatible(A), + dftfe::utils::makeDataTypeDeviceCompatible(B), + dftfe::utils::makeDataTypeDeviceCompatible(D), + dftfe::utils::makeDataTypeDeviceCompatible(C)); + } + template void @@ -597,7 +623,7 @@ namespace dftfe } - template + template void BLASWrapper::axpyStridedBlockAtomicAdd( const dftfe::size_type contiguousBlockSize, @@ -605,7 +631,7 @@ namespace dftfe const ValueType1 a, const ValueType1 * s, const ValueType2 * addFromVec, - ValueType2 * addToVec, + ValueType3 * addToVec, const dftfe::global_size_type *addToVecStartingContiguousBlockIds) const { axpyStridedBlockAtomicAddDeviceKernel<<< @@ -1449,24 +1475,6 @@ namespace dftfe dftfe::utils::makeDataTypeDeviceCompatible(copyToVecBlock), copyFromVecStartingContiguousBlockIds); } - template void - 
BLASWrapper::stridedBlockScaleCopy( - const dftfe::size_type contiguousBlockSize, - const dftfe::size_type numContiguousBlocks, - const double a, - const double * s, - const double * copyFromVec, - double * copyToVecBlock, - const dftfe::global_size_type *addToVecStartingContiguousBlockIds); - template void - BLASWrapper::stridedBlockScaleCopy( - const dftfe::size_type contiguousBlockSize, - const dftfe::size_type numContiguousBlocks, - const double a, - const double * s, - const std::complex * copyFromVec, - std::complex * copyToVecBlock, - const dftfe::global_size_type *addToVecStartingContiguousBlockIds); template void @@ -1488,268 +1496,7 @@ namespace dftfe dftfe::utils::makeDataTypeDeviceCompatible(s), dftfe::utils::makeDataTypeDeviceCompatible(x)); } - // for stridedBlockScale - template void - BLASWrapper::stridedBlockScale( - const dftfe::size_type contiguousBlockSize, - const dftfe::size_type numContiguousBlocks, - const double a, - const double * s, - double * x); - - template void - BLASWrapper::stridedBlockScale( - const dftfe::size_type contiguousBlockSize, - const dftfe::size_type numContiguousBlocks, - const float a, - const float * s, - float * x); - - template void - BLASWrapper::stridedBlockScale( - const dftfe::size_type contiguousBlockSize, - const dftfe::size_type numContiguousBlocks, - const std::complex a, - const std::complex *s, - std::complex * x); - - template void - BLASWrapper::stridedBlockScale( - const dftfe::size_type contiguousBlockSize, - const dftfe::size_type numContiguousBlocks, - const std::complex a, - const std::complex *s, - std::complex * x); - - template void - BLASWrapper::stridedBlockScale( - const dftfe::size_type contiguousBlockSize, - const dftfe::size_type numContiguousBlocks, - const double a, - const double * s, - float * x); - - template void - BLASWrapper::stridedBlockScale( - const dftfe::size_type contiguousBlockSize, - const dftfe::size_type numContiguousBlocks, - const float a, - const float * s, - double * x); - - template void - BLASWrapper::stridedBlockScale( - const dftfe::size_type contiguousBlockSize, - const dftfe::size_type numContiguousBlocks, - const std::complex a, - const std::complex *s, - std::complex * x); - - template void - BLASWrapper::stridedBlockScale( - const dftfe::size_type contiguousBlockSize, - const dftfe::size_type numContiguousBlocks, - const std::complex a, - const std::complex *s, - std::complex * x); - - template void - BLASWrapper::stridedBlockScale( - const dftfe::size_type contiguousBlockSize, - const dftfe::size_type numContiguousBlocks, - const double a, - const double * s, - std::complex * x); - - template void - BLASWrapper::stridedBlockScale( - const dftfe::size_type contiguousBlockSize, - const dftfe::size_type numContiguousBlocks, - const double a, - const double * s, - std::complex * x); - - // axpyStridedBlockAtomicAdd - template void - BLASWrapper::axpyStridedBlockAtomicAdd( - const dftfe::size_type contiguousBlockSize, - const dftfe::size_type numContiguousBlocks, - const double * addFromVec, - double * addToVec, - const dftfe::global_size_type *addToVecStartingContiguousBlockIds) const; - - template void - BLASWrapper::axpyStridedBlockAtomicAdd( - const dftfe::size_type contiguousBlockSize, - const dftfe::size_type numContiguousBlocks, - const std::complex * addFromVec, - std::complex * addToVec, - const dftfe::global_size_type *addToVecStartingContiguousBlockIds) const; - - template void - BLASWrapper::axpyStridedBlockAtomicAdd( - const dftfe::size_type contiguousBlockSize, - const 
dftfe::size_type numContiguousBlocks, - const double a, - const double * s, - const double * addFromVec, - double * addToVec, - const dftfe::global_size_type *addToVecStartingContiguousBlockIds) const; - - template void - BLASWrapper::axpyStridedBlockAtomicAdd( - const dftfe::size_type contiguousBlockSize, - const dftfe::size_type numContiguousBlocks, - const double a, - const double * s, - const std::complex * addFromVec, - std::complex * addToVec, - const dftfe::global_size_type *addToVecStartingContiguousBlockIds) const; - - template void - BLASWrapper::axpby(const unsigned int n, - const double alpha, - const double *x, - const double beta, - double *y) const; - - - template void - BLASWrapper::axpby( - const unsigned int n, - const double alpha, - const std::complex *x, - const double beta, - std::complex * y) const; - - // for xscal - template void - BLASWrapper::xscal( - double * x, - const double a, - const dftfe::size_type n) const; - - template void - BLASWrapper::xscal( - float * x, - const float a, - const dftfe::size_type n) const; - - template void - BLASWrapper::xscal( - std::complex * x, - const std::complex a, - const dftfe::size_type n) const; - - template void - BLASWrapper::xscal( - std::complex * x, - const std::complex a, - const dftfe::size_type n) const; - - template void - BLASWrapper::xscal( - std::complex * x, - const double a, - const dftfe::size_type n) const; - - template void - BLASWrapper::stridedCopyToBlock( - const dftfe::size_type contiguousBlockSize, - const dftfe::size_type numContiguousBlocks, - const double * copyFromVec, - double * copyToVecBlock, - const dftfe::global_size_type *copyFromVecStartingContiguousBlockIds); - - template void - BLASWrapper::stridedCopyToBlock( - const dftfe::size_type contiguousBlockSize, - const dftfe::size_type numContiguousBlocks, - const double * copyFromVec, - float * copyToVecBlock, - const dftfe::global_size_type *copyFromVecStartingContiguousBlockIds); - - template void - BLASWrapper::stridedCopyToBlock( - const dftfe::size_type contiguousBlockSize, - const dftfe::size_type numContiguousBlocks, - const float * copyFromVec, - float * copyToVecBlock, - const dftfe::global_size_type *copyFromVecStartingContiguousBlockIds); - - template void - BLASWrapper::stridedCopyToBlock( - const dftfe::size_type contiguousBlockSize, - const dftfe::size_type numContiguousBlocks, - const std::complex * copyFromVec, - std::complex * copyToVecBlock, - const dftfe::global_size_type *copyFromVecStartingContiguousBlockIds); - - template void - BLASWrapper::stridedCopyToBlock( - const dftfe::size_type contiguousBlockSize, - const dftfe::size_type numContiguousBlocks, - const std::complex * copyFromVec, - std::complex * copyToVecBlock, - const dftfe::global_size_type *copyFromVecStartingContiguousBlockIds); - - template void - BLASWrapper::stridedCopyToBlock( - const dftfe::size_type contiguousBlockSize, - const dftfe::size_type numContiguousBlocks, - const std::complex * copyFromVec, - std::complex * copyToVecBlock, - const dftfe::global_size_type *copyFromVecStartingContiguousBlockIds); - - - - template void - BLASWrapper:: - copyValueType1ArrToValueType2Arr(const dftfe::size_type size, - const double * valueType1Arr, - std::complex * valueType2Arr); - - template void - BLASWrapper:: - copyValueType1ArrToValueType2Arr(const dftfe::size_type size, - const double * valueType1Arr, - std::complex * valueType2Arr); - - template void - BLASWrapper:: - copyValueType1ArrToValueType2Arr(const dftfe::size_type size, - const double * valueType1Arr, 
- double * valueType2Arr); - template void - BLASWrapper:: - copyValueType1ArrToValueType2Arr(const dftfe::size_type size, - const double * valueType1Arr, - float * valueType2Arr); - - template void - BLASWrapper:: - stridedCopyToBlockConstantStride(const dftfe::size_type blockSizeTo, - const dftfe::size_type blockSizeFrom, - const dftfe::size_type numBlocks, - const dftfe::size_type startingId, - const double * copyFromVec, - double * copyToVec); - - template void - BLASWrapper:: - stridedCopyToBlockConstantStride(const dftfe::size_type blockSizeTo, - const dftfe::size_type blockSizeFrom, - const dftfe::size_type numBlocks, - const dftfe::size_type startingId, - const std::complex *copyFromVec, - std::complex * copyToVec); - - template void - BLASWrapper::copyRealArrsToComplexArr( - const dftfe::size_type size, - const double * realArr, - const double * imagArr, - std::complex * complexArr); +#include "./BLASWrapperDevice.inst.cc" } // End of namespace linearAlgebra } // End of namespace dftfe diff --git a/utils/BLASWrapperDevice.hip.cc b/utils/BLASWrapperDevice.hip.cc index 50632c7f8..fdbc991bd 100644 --- a/utils/BLASWrapperDevice.hip.cc +++ b/utils/BLASWrapperDevice.hip.cc @@ -657,6 +657,35 @@ namespace dftfe beta); } + template + void + BLASWrapper::ApaBD( + const unsigned int m, + const unsigned int n, + const ValueType0 alpha, + const ValueType1 * A, + const ValueType2 * B, + const ValueType3 * D, + ValueType4 * C) const + { + hipLaunchKernelGGL(ApaBDDeviceKernel, + (n * m / dftfe::utils::DEVICE_BLOCK_SIZE) + 1, + dftfe::utils::DEVICE_BLOCK_SIZE, + 0, + 0, + m, + n, + alpha, + dftfe::utils::makeDataTypeDeviceCompatible(A), + dftfe::utils::makeDataTypeDeviceCompatible(B), + dftfe::utils::makeDataTypeDeviceCompatible(D), + dftfe::utils::makeDataTypeDeviceCompatible(C)); + } + template void @@ -681,7 +710,7 @@ namespace dftfe addToVecStartingContiguousBlockIds); } - template + template void BLASWrapper::axpyStridedBlockAtomicAdd( const dftfe::size_type contiguousBlockSize, @@ -689,7 +718,7 @@ namespace dftfe const ValueType1 a, const ValueType1 * s, const ValueType2 * addFromVec, - ValueType2 * addToVec, + ValueType3 * addToVec, const dftfe::global_size_type *addToVecStartingContiguousBlockIds) const { hipLaunchKernelGGL(axpyStridedBlockAtomicAddDeviceKernel, @@ -1446,24 +1475,6 @@ namespace dftfe dftfe::utils::makeDataTypeDeviceCompatible(copyToVecBlock), copyFromVecStartingContiguousBlockIds); } - template void - BLASWrapper::stridedBlockScaleCopy( - const dftfe::size_type contiguousBlockSize, - const dftfe::size_type numContiguousBlocks, - const double a, - const double * s, - const double * copyFromVec, - double * copyToVecBlock, - const dftfe::global_size_type *addToVecStartingContiguousBlockIds); - template void - BLASWrapper::stridedBlockScaleCopy( - const dftfe::size_type contiguousBlockSize, - const dftfe::size_type numContiguousBlocks, - const double a, - const double * s, - const std::complex * copyFromVec, - std::complex * copyToVecBlock, - const dftfe::global_size_type *addToVecStartingContiguousBlockIds); void BLASWrapper::add( @@ -1599,269 +1610,7 @@ namespace dftfe dftfe::utils::makeDataTypeDeviceCompatible(s), dftfe::utils::makeDataTypeDeviceCompatible(x)); } - // for stridedBlockScale - template void - BLASWrapper::stridedBlockScale( - const dftfe::size_type contiguousBlockSize, - const dftfe::size_type numContiguousBlocks, - const double a, - const double * s, - double * x); - - template void - BLASWrapper::stridedBlockScale( - const dftfe::size_type 
contiguousBlockSize, - const dftfe::size_type numContiguousBlocks, - const float a, - const float * s, - float * x); - - template void - BLASWrapper::stridedBlockScale( - const dftfe::size_type contiguousBlockSize, - const dftfe::size_type numContiguousBlocks, - const std::complex a, - const std::complex *s, - std::complex * x); - - template void - BLASWrapper::stridedBlockScale( - const dftfe::size_type contiguousBlockSize, - const dftfe::size_type numContiguousBlocks, - const std::complex a, - const std::complex *s, - std::complex * x); - - template void - BLASWrapper::stridedBlockScale( - const dftfe::size_type contiguousBlockSize, - const dftfe::size_type numContiguousBlocks, - const double a, - const double * s, - float * x); - - template void - BLASWrapper::stridedBlockScale( - const dftfe::size_type contiguousBlockSize, - const dftfe::size_type numContiguousBlocks, - const float a, - const float * s, - double * x); - - template void - BLASWrapper::stridedBlockScale( - const dftfe::size_type contiguousBlockSize, - const dftfe::size_type numContiguousBlocks, - const std::complex a, - const std::complex *s, - std::complex * x); - - template void - BLASWrapper::stridedBlockScale( - const dftfe::size_type contiguousBlockSize, - const dftfe::size_type numContiguousBlocks, - const std::complex a, - const std::complex *s, - std::complex * x); - - template void - BLASWrapper::stridedBlockScale( - const dftfe::size_type contiguousBlockSize, - const dftfe::size_type numContiguousBlocks, - const double a, - const double * s, - std::complex * x); - - template void - BLASWrapper::stridedBlockScale( - const dftfe::size_type contiguousBlockSize, - const dftfe::size_type numContiguousBlocks, - const double a, - const double * s, - std::complex * x); - - - // for xscal - template void - BLASWrapper::xscal( - double * x, - const double a, - const dftfe::size_type n) const; - - template void - BLASWrapper::xscal( - float * x, - const float a, - const dftfe::size_type n) const; - - template void - BLASWrapper::xscal( - std::complex * x, - const std::complex a, - const dftfe::size_type n) const; - - template void - BLASWrapper::xscal( - std::complex * x, - const std::complex a, - const dftfe::size_type n) const; - - template void - BLASWrapper::xscal( - std::complex * x, - const double a, - const dftfe::size_type n) const; - - template void - BLASWrapper::stridedCopyToBlock( - const dftfe::size_type contiguousBlockSize, - const dftfe::size_type numContiguousBlocks, - const double * copyFromVec, - double * copyToVecBlock, - const dftfe::global_size_type *copyFromVecStartingContiguousBlockIds); - - template void - BLASWrapper::stridedCopyToBlock( - const dftfe::size_type contiguousBlockSize, - const dftfe::size_type numContiguousBlocks, - const double * copyFromVec, - float * copyToVecBlock, - const dftfe::global_size_type *copyFromVecStartingContiguousBlockIds); - - template void - BLASWrapper::stridedCopyToBlock( - const dftfe::size_type contiguousBlockSize, - const dftfe::size_type numContiguousBlocks, - const float * copyFromVec, - float * copyToVecBlock, - const dftfe::global_size_type *copyFromVecStartingContiguousBlockIds); - - template void - BLASWrapper::stridedCopyToBlock( - const dftfe::size_type contiguousBlockSize, - const dftfe::size_type numContiguousBlocks, - const std::complex * copyFromVec, - std::complex * copyToVecBlock, - const dftfe::global_size_type *copyFromVecStartingContiguousBlockIds); - - template void - BLASWrapper::stridedCopyToBlock( - const dftfe::size_type 
contiguousBlockSize, - const dftfe::size_type numContiguousBlocks, - const std::complex * copyFromVec, - std::complex * copyToVecBlock, - const dftfe::global_size_type *copyFromVecStartingContiguousBlockIds); - - template void - BLASWrapper::stridedCopyToBlock( - const dftfe::size_type contiguousBlockSize, - const dftfe::size_type numContiguousBlocks, - const std::complex * copyFromVec, - std::complex * copyToVecBlock, - const dftfe::global_size_type *copyFromVecStartingContiguousBlockIds); - - - template void - BLASWrapper:: - copyValueType1ArrToValueType2Arr(const dftfe::size_type size, - const double * valueType1Arr, - std::complex * valueType2Arr); - - template void - BLASWrapper:: - copyValueType1ArrToValueType2Arr(const dftfe::size_type size, - const double * valueType1Arr, - std::complex * valueType2Arr); - - template void - BLASWrapper:: - copyValueType1ArrToValueType2Arr(const dftfe::size_type size, - const double * valueType1Arr, - double * valueType2Arr); - template void - BLASWrapper:: - copyValueType1ArrToValueType2Arr(const dftfe::size_type size, - const double * valueType1Arr, - float * valueType2Arr); - template void - BLASWrapper:: - stridedCopyToBlockConstantStride(const dftfe::size_type blockSizeTo, - const dftfe::size_type blockSizeFrom, - const dftfe::size_type numBlocks, - const dftfe::size_type startingId, - const double * copyFromVec, - double * copyToVec); - - template void - BLASWrapper:: - stridedCopyToBlockConstantStride(const dftfe::size_type blockSizeTo, - const dftfe::size_type blockSizeFrom, - const dftfe::size_type numBlocks, - const dftfe::size_type startingId, - const std::complex *copyFromVec, - std::complex * copyToVec); - - - // axpyStridedBlockAtomicAdd - template void - BLASWrapper::axpyStridedBlockAtomicAdd( - const dftfe::size_type contiguousBlockSize, - const dftfe::size_type numContiguousBlocks, - const double * addFromVec, - double * addToVec, - const dftfe::global_size_type *addToVecStartingContiguousBlockIds) const; - - template void - BLASWrapper::axpyStridedBlockAtomicAdd( - const dftfe::size_type contiguousBlockSize, - const dftfe::size_type numContiguousBlocks, - const std::complex * addFromVec, - std::complex * addToVec, - const dftfe::global_size_type *addToVecStartingContiguousBlockIds) const; - - template void - BLASWrapper::axpyStridedBlockAtomicAdd( - const dftfe::size_type contiguousBlockSize, - const dftfe::size_type numContiguousBlocks, - const double a, - const double * s, - const double * addFromVec, - double * addToVec, - const dftfe::global_size_type *addToVecStartingContiguousBlockIds) const; - - template void - BLASWrapper::axpyStridedBlockAtomicAdd( - const dftfe::size_type contiguousBlockSize, - const dftfe::size_type numContiguousBlocks, - const double a, - const double * s, - const std::complex * addFromVec, - std::complex * addToVec, - const dftfe::global_size_type *addToVecStartingContiguousBlockIds) const; - - - template void - BLASWrapper::axpby(const unsigned int n, - const double alpha, - const double *x, - const double beta, - double *y) const; - - - template void - BLASWrapper::axpby( - const unsigned int n, - const double alpha, - const std::complex *x, - const double beta, - std::complex * y) const; - - template void - BLASWrapper::copyRealArrsToComplexArr( - const dftfe::size_type size, - const double * realArr, - const double * imagArr, - std::complex * complexArr); +#include "./BLASWrapperDevice.inst.cc" } // End of namespace linearAlgebra } // End of namespace dftfe diff --git 
a/utils/BLASWrapperDevice.inst.cc b/utils/BLASWrapperDevice.inst.cc new file mode 100644 index 000000000..0238120a8 --- /dev/null +++ b/utils/BLASWrapperDevice.inst.cc @@ -0,0 +1,457 @@ +template void +BLASWrapper::stridedBlockScaleCopy( + const dftfe::size_type contiguousBlockSize, + const dftfe::size_type numContiguousBlocks, + const double a, + const double * s, + const double * copyFromVec, + double * copyToVecBlock, + const dftfe::global_size_type *addToVecStartingContiguousBlockIds); +template void +BLASWrapper::stridedBlockScaleCopy( + const dftfe::size_type contiguousBlockSize, + const dftfe::size_type numContiguousBlocks, + const double a, + const double * s, + const std::complex * copyFromVec, + std::complex * copyToVecBlock, + const dftfe::global_size_type *addToVecStartingContiguousBlockIds); +// for stridedBlockScale +template void +BLASWrapper::stridedBlockScale( + const dftfe::size_type contiguousBlockSize, + const dftfe::size_type numContiguousBlocks, + const double a, + const double * s, + double * x); + +template void +BLASWrapper::stridedBlockScale( + const dftfe::size_type contiguousBlockSize, + const dftfe::size_type numContiguousBlocks, + const float a, + const float * s, + float * x); + +template void +BLASWrapper::stridedBlockScale( + const dftfe::size_type contiguousBlockSize, + const dftfe::size_type numContiguousBlocks, + const std::complex a, + const std::complex *s, + std::complex * x); + +template void +BLASWrapper::stridedBlockScale( + const dftfe::size_type contiguousBlockSize, + const dftfe::size_type numContiguousBlocks, + const std::complex a, + const std::complex *s, + std::complex * x); + +template void +BLASWrapper::stridedBlockScale( + const dftfe::size_type contiguousBlockSize, + const dftfe::size_type numContiguousBlocks, + const double a, + const double * s, + float * x); + +template void +BLASWrapper::stridedBlockScale( + const dftfe::size_type contiguousBlockSize, + const dftfe::size_type numContiguousBlocks, + const float a, + const float * s, + double * x); + +template void +BLASWrapper::stridedBlockScale( + const dftfe::size_type contiguousBlockSize, + const dftfe::size_type numContiguousBlocks, + const std::complex a, + const std::complex *s, + std::complex * x); + +template void +BLASWrapper::stridedBlockScale( + const dftfe::size_type contiguousBlockSize, + const dftfe::size_type numContiguousBlocks, + const std::complex a, + const std::complex *s, + std::complex * x); + +template void +BLASWrapper::stridedBlockScale( + const dftfe::size_type contiguousBlockSize, + const dftfe::size_type numContiguousBlocks, + const double a, + const double * s, + std::complex * x); + +template void +BLASWrapper::stridedBlockScale( + const dftfe::size_type contiguousBlockSize, + const dftfe::size_type numContiguousBlocks, + const double a, + const double * s, + std::complex * x); +// for xscal +template void +BLASWrapper::xscal( + double * x, + const double a, + const dftfe::size_type n) const; + +template void +BLASWrapper::xscal( + float * x, + const float a, + const dftfe::size_type n) const; + +template void +BLASWrapper::xscal( + std::complex * x, + const std::complex a, + const dftfe::size_type n) const; + +template void +BLASWrapper::xscal( + std::complex * x, + const std::complex a, + const dftfe::size_type n) const; + +template void +BLASWrapper::xscal( + std::complex * x, + const double a, + const dftfe::size_type n) const; + +template void +BLASWrapper::stridedCopyToBlock( + const dftfe::size_type contiguousBlockSize, + const dftfe::size_type 
numContiguousBlocks, + const double * copyFromVec, + double * copyToVecBlock, + const dftfe::global_size_type *copyFromVecStartingContiguousBlockIds); + +template void +BLASWrapper::stridedCopyToBlock( + const dftfe::size_type contiguousBlockSize, + const dftfe::size_type numContiguousBlocks, + const double * copyFromVec, + float * copyToVecBlock, + const dftfe::global_size_type *copyFromVecStartingContiguousBlockIds); + +template void +BLASWrapper::stridedCopyToBlock( + const dftfe::size_type contiguousBlockSize, + const dftfe::size_type numContiguousBlocks, + const float * copyFromVec, + float * copyToVecBlock, + const dftfe::global_size_type *copyFromVecStartingContiguousBlockIds); + +template void +BLASWrapper::stridedCopyToBlock( + const dftfe::size_type contiguousBlockSize, + const dftfe::size_type numContiguousBlocks, + const std::complex * copyFromVec, + std::complex * copyToVecBlock, + const dftfe::global_size_type *copyFromVecStartingContiguousBlockIds); + +template void +BLASWrapper::stridedCopyToBlock( + const dftfe::size_type contiguousBlockSize, + const dftfe::size_type numContiguousBlocks, + const std::complex * copyFromVec, + std::complex * copyToVecBlock, + const dftfe::global_size_type *copyFromVecStartingContiguousBlockIds); + +template void +BLASWrapper::stridedCopyToBlock( + const dftfe::size_type contiguousBlockSize, + const dftfe::size_type numContiguousBlocks, + const std::complex * copyFromVec, + std::complex * copyToVecBlock, + const dftfe::global_size_type *copyFromVecStartingContiguousBlockIds); + +// strided copy from block +template void +BLASWrapper::stridedCopyFromBlock( + const dftfe::size_type contiguousBlockSize, + const dftfe::size_type numContiguousBlocks, + const double * copyFromVecBlock, + double * copyToVec, + const dftfe::global_size_type *copyFromVecStartingContiguousBlockIds); + +template void +BLASWrapper::stridedCopyFromBlock( + const dftfe::size_type contiguousBlockSize, + const dftfe::size_type numContiguousBlocks, + const float * copyFromVecBlock, + float * copyToVec, + const dftfe::global_size_type *copyFromVecStartingContiguousBlockIds); + +template void +BLASWrapper::stridedCopyFromBlock( + const dftfe::size_type contiguousBlockSize, + const dftfe::size_type numContiguousBlocks, + const std::complex * copyFromVecBlock, + std::complex * copyToVec, + const dftfe::global_size_type *copyFromVecStartingContiguousBlockIds); + +template void +BLASWrapper::stridedCopyFromBlock( + const dftfe::size_type contiguousBlockSize, + const dftfe::size_type numContiguousBlocks, + const std::complex * copyFromVecBlock, + std::complex * copyToVec, + const dftfe::global_size_type *copyFromVecStartingContiguousBlockIds); + +template void +BLASWrapper::stridedCopyFromBlock( + const dftfe::size_type contiguousBlockSize, + const dftfe::size_type numContiguousBlocks, + const double * copyFromVecBlock, + float * copyToVec, + const dftfe::global_size_type *copyFromVecStartingContiguousBlockIds); + +template void +BLASWrapper::stridedCopyFromBlock( + const dftfe::size_type contiguousBlockSize, + const dftfe::size_type numContiguousBlocks, + const float * copyFromVecBlock, + double * copyToVec, + const dftfe::global_size_type *copyFromVecStartingContiguousBlockIds); + +template void +BLASWrapper::stridedCopyFromBlock( + const dftfe::size_type contiguousBlockSize, + const dftfe::size_type numContiguousBlocks, + const std::complex * copyFromVecBlock, + std::complex * copyToVec, + const dftfe::global_size_type *copyFromVecStartingContiguousBlockIds); + +template void 
+BLASWrapper::stridedCopyFromBlock( + const dftfe::size_type contiguousBlockSize, + const dftfe::size_type numContiguousBlocks, + const std::complex * copyFromVecBlock, + std::complex * copyToVec, + const dftfe::global_size_type *copyFromVecStartingContiguousBlockIds); + + +template void +BLASWrapper:: + copyValueType1ArrToValueType2Arr(const dftfe::size_type size, + const double * valueType1Arr, + std::complex * valueType2Arr); + +template void +BLASWrapper:: + copyValueType1ArrToValueType2Arr(const dftfe::size_type size, + const double * valueType1Arr, + std::complex * valueType2Arr); + +template void +BLASWrapper:: + copyValueType1ArrToValueType2Arr(const dftfe::size_type size, + const double * valueType1Arr, + double * valueType2Arr); +template void +BLASWrapper:: + copyValueType1ArrToValueType2Arr(const dftfe::size_type size, + const double * valueType1Arr, + float * valueType2Arr); +template void +BLASWrapper:: + copyValueType1ArrToValueType2Arr(const dftfe::size_type size, + const std::complex *valueType1Arr, + std::complex * valueType2Arr); + +template void +BLASWrapper:: + stridedCopyToBlockConstantStride(const dftfe::size_type blockSizeTo, + const dftfe::size_type blockSizeFrom, + const dftfe::size_type numBlocks, + const dftfe::size_type startingId, + const double * copyFromVec, + double * copyToVec); + +template void +BLASWrapper:: + stridedCopyToBlockConstantStride(const dftfe::size_type blockSizeTo, + const dftfe::size_type blockSizeFrom, + const dftfe::size_type numBlocks, + const dftfe::size_type startingId, + const std::complex *copyFromVec, + std::complex * copyToVec); +// axpyStridedBlockAtomicAdd +template void +BLASWrapper::axpyStridedBlockAtomicAdd( + const dftfe::size_type contiguousBlockSize, + const dftfe::size_type numContiguousBlocks, + const double * addFromVec, + double * addToVec, + const dftfe::global_size_type *addToVecStartingContiguousBlockIds) const; + +template void +BLASWrapper::axpyStridedBlockAtomicAdd( + const dftfe::size_type contiguousBlockSize, + const dftfe::size_type numContiguousBlocks, + const std::complex * addFromVec, + std::complex * addToVec, + const dftfe::global_size_type *addToVecStartingContiguousBlockIds) const; + +template void +BLASWrapper::axpyStridedBlockAtomicAdd( + const dftfe::size_type contiguousBlockSize, + const dftfe::size_type numContiguousBlocks, + const double a, + const double * s, + const double * addFromVec, + double * addToVec, + const dftfe::global_size_type *addToVecStartingContiguousBlockIds) const; + +template void +BLASWrapper::axpyStridedBlockAtomicAdd( + const dftfe::size_type contiguousBlockSize, + const dftfe::size_type numContiguousBlocks, + const double a, + const double * s, + const std::complex * addFromVec, + std::complex * addToVec, + const dftfe::global_size_type *addToVecStartingContiguousBlockIds) const; + +template void +BLASWrapper::axpyStridedBlockAtomicAdd( + const dftfe::size_type contiguousBlockSize, + const dftfe::size_type numContiguousBlocks, + const double a, + const double * s, + const float * addFromVec, + float * addToVec, + const dftfe::global_size_type *addToVecStartingContiguousBlockIds) const; + +template void +BLASWrapper::axpyStridedBlockAtomicAdd( + const dftfe::size_type contiguousBlockSize, + const dftfe::size_type numContiguousBlocks, + const double a, + const double * s, + const std::complex * addFromVec, + std::complex * addToVec, + const dftfe::global_size_type *addToVecStartingContiguousBlockIds) const; + +template void +BLASWrapper::axpyStridedBlockAtomicAdd( + const 
dftfe::size_type contiguousBlockSize, + const dftfe::size_type numContiguousBlocks, + const float a, + const float * s, + const float * addFromVec, + float * addToVec, + const dftfe::global_size_type *addToVecStartingContiguousBlockIds) const; + +template void +BLASWrapper::axpyStridedBlockAtomicAdd( + const dftfe::size_type contiguousBlockSize, + const dftfe::size_type numContiguousBlocks, + const float a, + const float * s, + const std::complex * addFromVec, + std::complex * addToVec, + const dftfe::global_size_type *addToVecStartingContiguousBlockIds) const; + +template void +BLASWrapper::axpby(const unsigned int n, + const double alpha, + const double * x, + const double beta, + double *y) const; + + +template void +BLASWrapper::axpby( + const unsigned int n, + const double alpha, + const std::complex *x, + const double beta, + std::complex * y) const; + +template void +BLASWrapper::axpby(const unsigned int n, + const double alpha, + const float * x, + const double beta, + float *y) const; + + +template void +BLASWrapper::axpby( + const unsigned int n, + const double alpha, + const std::complex *x, + const double beta, + std::complex * y) const; + +template void +BLASWrapper::ApaBD(const unsigned int m, + const unsigned int n, + const double alpha, + const double * A, + const double * B, + const double * D, + double *C) const; +template void +BLASWrapper::ApaBD( + const unsigned int m, + const unsigned int n, + const double alpha, + const std::complex *A, + const std::complex *B, + const double * D, + std::complex * C) const; + +template void +BLASWrapper::ApaBD(const unsigned int m, + const unsigned int n, + const double alpha, + const float * A, + const double * B, + const double * D, + float *C) const; +template void +BLASWrapper::ApaBD(const unsigned int m, + const unsigned int n, + const double alpha, + const float * A, + const double * B, + const double * D, + double *C) const; +template void +BLASWrapper::ApaBD( + const unsigned int m, + const unsigned int n, + const double alpha, + const std::complex * A, + const std::complex *B, + const double * D, + std::complex * C) const; + +template void +BLASWrapper::ApaBD( + const unsigned int m, + const unsigned int n, + const double alpha, + const std::complex * A, + const std::complex *B, + const double * D, + std::complex * C) const; + +template void +BLASWrapper::copyRealArrsToComplexArr( + const dftfe::size_type size, + const double * realArr, + const double * imagArr, + std::complex * complexArr); diff --git a/utils/BLASWrapperDeviceKernels.cc b/utils/BLASWrapperDeviceKernels.cc index e0f1d6682..399584065 100644 --- a/utils/BLASWrapperDeviceKernels.cc +++ b/utils/BLASWrapperDeviceKernels.cc @@ -397,6 +397,183 @@ namespace dftfe } } + __global__ void + axpyStridedBlockAtomicAddDeviceKernel( + const dftfe::size_type contiguousBlockSize, + const dftfe::size_type numContiguousBlocks, + const double a, + const double * s, + const float * addFromVec, + double * addToVec, + const dftfe::global_size_type *addToVecStartingContiguousBlockIds) + { + const dftfe::size_type globalThreadId = + blockIdx.x * blockDim.x + threadIdx.x; + const dftfe::size_type numberEntries = + numContiguousBlocks * contiguousBlockSize; + + for (dftfe::size_type index = globalThreadId; index < numberEntries; + index += blockDim.x * gridDim.x) + { + dftfe::size_type blockIndex = index / contiguousBlockSize; + dftfe::size_type intraBlockIndex = index % contiguousBlockSize; + const double coeff = dftfe::utils::mult(a, s[blockIndex]); + 
atomicAdd(&addToVec[addToVecStartingContiguousBlockIds[blockIndex] + + intraBlockIndex], + dftfe::utils::mult(addFromVec[index], coeff)); + } + } + + __global__ void + axpyStridedBlockAtomicAddDeviceKernel( + const dftfe::size_type contiguousBlockSize, + const dftfe::size_type numContiguousBlocks, + const double a, + const double * s, + const dftfe::utils::deviceFloatComplex *addFromVec, + dftfe::utils::deviceDoubleComplex * addToVec, + const dftfe::global_size_type *addToVecStartingContiguousBlockIds) + { + const dftfe::size_type globalThreadId = + blockIdx.x * blockDim.x + threadIdx.x; + const dftfe::size_type numberEntries = + numContiguousBlocks * contiguousBlockSize; + + for (dftfe::size_type index = globalThreadId; index < numberEntries; + index += blockDim.x * gridDim.x) + { + dftfe::size_type blockIndex = index / contiguousBlockSize; + dftfe::size_type intraBlockIndex = index % contiguousBlockSize; + const double coeff = dftfe::utils::mult(a, s[blockIndex]); + atomicAdd(&(addToVec[addToVecStartingContiguousBlockIds[blockIndex] + + intraBlockIndex] + .x), + dftfe::utils::mult(addFromVec[index].x, coeff)); + atomicAdd(&(addToVec[addToVecStartingContiguousBlockIds[blockIndex] + + intraBlockIndex] + .y), + dftfe::utils::mult(addFromVec[index].y, coeff)); + } + } + + __global__ void + axpyStridedBlockAtomicAddDeviceKernel( + const dftfe::size_type contiguousBlockSize, + const dftfe::size_type numContiguousBlocks, + const double a, + const double * s, + const float * addFromVec, + float * addToVec, + const dftfe::global_size_type *addToVecStartingContiguousBlockIds) + { + const dftfe::size_type globalThreadId = + blockIdx.x * blockDim.x + threadIdx.x; + const dftfe::size_type numberEntries = + numContiguousBlocks * contiguousBlockSize; + + for (dftfe::size_type index = globalThreadId; index < numberEntries; + index += blockDim.x * gridDim.x) + { + dftfe::size_type blockIndex = index / contiguousBlockSize; + dftfe::size_type intraBlockIndex = index % contiguousBlockSize; + const double coeff = dftfe::utils::mult(a, s[blockIndex]); + atomicAdd(&addToVec[addToVecStartingContiguousBlockIds[blockIndex] + + intraBlockIndex], + dftfe::utils::mult(addFromVec[index], coeff)); + } + } + + __global__ void + axpyStridedBlockAtomicAddDeviceKernel( + const dftfe::size_type contiguousBlockSize, + const dftfe::size_type numContiguousBlocks, + const double a, + const double * s, + const dftfe::utils::deviceFloatComplex *addFromVec, + dftfe::utils::deviceFloatComplex * addToVec, + const dftfe::global_size_type *addToVecStartingContiguousBlockIds) + { + const dftfe::size_type globalThreadId = + blockIdx.x * blockDim.x + threadIdx.x; + const dftfe::size_type numberEntries = + numContiguousBlocks * contiguousBlockSize; + + for (dftfe::size_type index = globalThreadId; index < numberEntries; + index += blockDim.x * gridDim.x) + { + dftfe::size_type blockIndex = index / contiguousBlockSize; + dftfe::size_type intraBlockIndex = index % contiguousBlockSize; + const double coeff = dftfe::utils::mult(a, s[blockIndex]); + atomicAdd(&(addToVec[addToVecStartingContiguousBlockIds[blockIndex] + + intraBlockIndex] + .x), + dftfe::utils::mult(addFromVec[index].x, coeff)); + atomicAdd(&(addToVec[addToVecStartingContiguousBlockIds[blockIndex] + + intraBlockIndex] + .y), + dftfe::utils::mult(addFromVec[index].y, coeff)); + } + } + + __global__ void + axpyStridedBlockAtomicAddDeviceKernel( + const dftfe::size_type contiguousBlockSize, + const dftfe::size_type numContiguousBlocks, + const float a, + const float * s, + const 
float * addFromVec, + float * addToVec, + const dftfe::global_size_type *addToVecStartingContiguousBlockIds) + { + const dftfe::size_type globalThreadId = + blockIdx.x * blockDim.x + threadIdx.x; + const dftfe::size_type numberEntries = + numContiguousBlocks * contiguousBlockSize; + + for (dftfe::size_type index = globalThreadId; index < numberEntries; + index += blockDim.x * gridDim.x) + { + dftfe::size_type blockIndex = index / contiguousBlockSize; + dftfe::size_type intraBlockIndex = index % contiguousBlockSize; + const double coeff = dftfe::utils::mult(a, s[blockIndex]); + atomicAdd(&addToVec[addToVecStartingContiguousBlockIds[blockIndex] + + intraBlockIndex], + dftfe::utils::mult(addFromVec[index], coeff)); + } + } + + __global__ void + axpyStridedBlockAtomicAddDeviceKernel( + const dftfe::size_type contiguousBlockSize, + const dftfe::size_type numContiguousBlocks, + const float a, + const float * s, + const dftfe::utils::deviceFloatComplex *addFromVec, + dftfe::utils::deviceFloatComplex * addToVec, + const dftfe::global_size_type *addToVecStartingContiguousBlockIds) + { + const dftfe::size_type globalThreadId = + blockIdx.x * blockDim.x + threadIdx.x; + const dftfe::size_type numberEntries = + numContiguousBlocks * contiguousBlockSize; + + for (dftfe::size_type index = globalThreadId; index < numberEntries; + index += blockDim.x * gridDim.x) + { + dftfe::size_type blockIndex = index / contiguousBlockSize; + dftfe::size_type intraBlockIndex = index % contiguousBlockSize; + const double coeff = dftfe::utils::mult(a, s[blockIndex]); + atomicAdd(&(addToVec[addToVecStartingContiguousBlockIds[blockIndex] + + intraBlockIndex] + .x), + dftfe::utils::mult(addFromVec[index].x, coeff)); + atomicAdd(&(addToVec[addToVecStartingContiguousBlockIds[blockIndex] + + intraBlockIndex] + .y), + dftfe::utils::mult(addFromVec[index].y, coeff)); + } + } + __global__ void axpyStridedBlockAtomicAddDeviceKernel( const dftfe::size_type contiguousBlockSize, @@ -487,5 +664,35 @@ namespace dftfe } } + template + __global__ void + ApaBDDeviceKernel(const dftfe::size_type nRows, + const dftfe::size_type nCols, + const ValueType0 alpha, + const ValueType1 * A, + const ValueType2 * B, + const ValueType3 * D, + ValueType4 * C) + { + const dftfe::size_type globalThreadId = + blockIdx.x * blockDim.x + threadIdx.x; + const dftfe::size_type numberEntries = nCols * nRows; + + for (dftfe::size_type index = globalThreadId; index < numberEntries; + index += blockDim.x * gridDim.x) + { + dftfe::size_type iRow = index % nCols; + const ValueType0 alphaD = alpha * D[iRow]; + dftfe::utils::copyValue( + C + index, + dftfe::utils::add(A[index], dftfe::utils::mult(B[index], alphaD))); + } + } + + } // namespace } // namespace dftfe diff --git a/utils/BLASWrapperHost.cc b/utils/BLASWrapperHost.cc index 1bdd6bddb..fad0db017 100644 --- a/utils/BLASWrapperHost.cc +++ b/utils/BLASWrapperHost.cc @@ -65,6 +65,72 @@ namespace dftfe &transA, &transB, &m, &n, &k, alpha, A, &lda, B, &ldb, beta, C, &ldc); } + template + void + BLASWrapper::ApaBD(const unsigned int m, + const unsigned int n, + const ValueType0 alpha, + const ValueType1 *A, + const ValueType2 *B, + const ValueType3 *D, + ValueType4 *C) const + { + for (unsigned int iRow = 0; iRow < m; ++iRow) + { + for (unsigned int iCol = 0; iCol < n; ++iCol) + { + C[iCol + n * iRow] = + A[iCol + n * iRow] + alpha * B[iCol + n * iRow] * D[iCol]; + } + } + } + + template <> + void + BLASWrapper::ApaBD( + const unsigned int m, + const unsigned int n, + const double alpha, + const std::complex * A, + 
const std::complex *B, + const double * D, + std::complex * C) const + { + for (unsigned int iRow = 0; iRow < m; ++iRow) + { + for (unsigned int iCol = 0; iCol < n; ++iCol) + { + C[iCol + n * iRow] = std::complex(A[iCol + n * iRow]) + + alpha * B[iCol + n * iRow] * D[iCol]; + } + } + } + + template <> + void + BLASWrapper::ApaBD( + const unsigned int m, + const unsigned int n, + const double alpha, + const std::complex * A, + const std::complex *B, + const double * D, + std::complex * C) const + { + for (unsigned int iRow = 0; iRow < m; ++iRow) + { + for (unsigned int iCol = 0; iCol < n; ++iCol) + { + C[iCol + n * iRow] = std::complex(A[iCol + n * iRow]) + + alpha * B[iCol + n * iRow] * D[iCol]; + } + } + } + template void BLASWrapper::xscal( @@ -404,7 +470,7 @@ namespace dftfe std::plus<>{}); } - template + template void BLASWrapper::axpyStridedBlockAtomicAdd( const dftfe::size_type contiguousBlockSize, @@ -412,7 +478,7 @@ namespace dftfe const ValueType1 a, const ValueType1 * s, const ValueType2 * addFromVec, - ValueType2 * addToVec, + ValueType3 * addToVec, const dftfe::global_size_type *addToVecStartingContiguousBlockIds) const { for (unsigned int iBlock = 0; iBlock < numContiguousBlocks; ++iBlock) @@ -426,6 +492,31 @@ namespace dftfe } } + template <> + void + BLASWrapper::axpyStridedBlockAtomicAdd( + const dftfe::size_type contiguousBlockSize, + const dftfe::size_type numContiguousBlocks, + const double a, + const double * s, + const std::complex * addFromVec, + std::complex * addToVec, + const dftfe::global_size_type *addToVecStartingContiguousBlockIds) const + { + for (unsigned int iBlock = 0; iBlock < numContiguousBlocks; ++iBlock) + { + double coeff = a * s[iBlock]; + std::transform(addFromVec + iBlock * contiguousBlockSize, + addFromVec + (iBlock + 1) * contiguousBlockSize, + addToVec + addToVecStartingContiguousBlockIds[iBlock], + addToVec + addToVecStartingContiguousBlockIds[iBlock], + [&coeff](auto &p, auto &q) { + return std::complex(p) * coeff + + std::complex(q); + }); + } + } + template void BLASWrapper::axpby(const unsigned int n, @@ -439,6 +530,20 @@ namespace dftfe }); } + template <> + void + BLASWrapper::axpby( + const unsigned int n, + const double alpha, + const std::complex *x, + const double beta, + std::complex * y) const + { + std::transform(x, x + n, y, y, [&alpha, &beta](auto &p, auto &q) { + return alpha * std::complex(p) + beta * std::complex(q); + }); + } + void BLASWrapper::xsymv( const char UPLO, @@ -585,6 +690,83 @@ namespace dftfe } } + void + BLASWrapper::xgemmStridedBatched( + const char transA, + const char transB, + const unsigned int m, + const unsigned int n, + const unsigned int k, + const float * alpha, + const float * A, + const unsigned int lda, + long long int strideA, + const float * B, + const unsigned int ldb, + long long int strideB, + const float * beta, + float * C, + const unsigned int ldc, + long long int strideC, + const int batchCount) const + { + for (int iBatch = 0; iBatch < batchCount; iBatch++) + { + xgemm(transA, + transB, + m, + n, + k, + alpha, + A + iBatch * strideA, + lda, + B + iBatch * strideB, + ldb, + beta, + C + iBatch * strideC, + ldc); + } + } + + + void + BLASWrapper::xgemmStridedBatched( + const char transA, + const char transB, + const unsigned int m, + const unsigned int n, + const unsigned int k, + const std::complex *alpha, + const std::complex *A, + const unsigned int lda, + long long int strideA, + const std::complex *B, + const unsigned int ldb, + long long int strideB, + const std::complex *beta, + 
std::complex * C, + const unsigned int ldc, + long long int strideC, + const int batchCount) const + { + for (int iBatch = 0; iBatch < batchCount; iBatch++) + { + xgemm(transA, + transB, + m, + n, + k, + alpha, + A + iBatch * strideA, + lda, + B + iBatch * strideB, + ldb, + beta, + C + iBatch * strideC, + ldc); + } + } + template void BLASWrapper::copyComplexArrToRealArrs( @@ -899,6 +1081,12 @@ namespace dftfe copyValueType1ArrToValueType2Arr(const dftfe::size_type size, const double * valueType1Arr, float * valueType2Arr); + template void + BLASWrapper:: + copyValueType1ArrToValueType2Arr( + const dftfe::size_type size, + const std::complex *valueType1Arr, + std::complex * valueType2Arr); // axpyStridedBlockAtomicAdd template void @@ -937,6 +1125,46 @@ namespace dftfe std::complex * addToVec, const dftfe::global_size_type *addToVecStartingContiguousBlockIds) const; + template void + BLASWrapper::axpyStridedBlockAtomicAdd( + const dftfe::size_type contiguousBlockSize, + const dftfe::size_type numContiguousBlocks, + const double a, + const double * s, + const float * addFromVec, + float * addToVec, + const dftfe::global_size_type *addToVecStartingContiguousBlockIds) const; + + template void + BLASWrapper::axpyStridedBlockAtomicAdd( + const dftfe::size_type contiguousBlockSize, + const dftfe::size_type numContiguousBlocks, + const double a, + const double * s, + const std::complex * addFromVec, + std::complex * addToVec, + const dftfe::global_size_type *addToVecStartingContiguousBlockIds) const; + + template void + BLASWrapper::axpyStridedBlockAtomicAdd( + const dftfe::size_type contiguousBlockSize, + const dftfe::size_type numContiguousBlocks, + const float a, + const float * s, + const float * addFromVec, + float * addToVec, + const dftfe::global_size_type *addToVecStartingContiguousBlockIds) const; + + template void + BLASWrapper::axpyStridedBlockAtomicAdd( + const dftfe::size_type contiguousBlockSize, + const dftfe::size_type numContiguousBlocks, + const float a, + const float * s, + const std::complex * addFromVec, + std::complex * addToVec, + const dftfe::global_size_type *addToVecStartingContiguousBlockIds) const; + template void BLASWrapper::axpby(const unsigned int n, const double alpha, @@ -953,6 +1181,49 @@ namespace dftfe const double beta, std::complex * y) const; + template void + BLASWrapper::axpby(const unsigned int n, + const double alpha, + const float *x, + const double beta, + float * y) const; + + template void + BLASWrapper::ApaBD(const unsigned int m, + const unsigned int n, + const double alpha, + const double *A, + const double *B, + const double *D, + double * C) const; + template void + BLASWrapper::ApaBD( + const unsigned int m, + const unsigned int n, + const double alpha, + const std::complex *A, + const std::complex *B, + const double * D, + std::complex * C) const; + + template void + BLASWrapper::ApaBD(const unsigned int m, + const unsigned int n, + const double alpha, + const float * A, + const double *B, + const double *D, + float * C) const; + + template void + BLASWrapper::ApaBD(const unsigned int m, + const unsigned int n, + const double alpha, + const float * A, + const double *B, + const double *D, + double * C) const; + template void BLASWrapper::copyRealArrsToComplexArr( const dftfe::size_type size, diff --git a/utils/FEBasisOperations.cc b/utils/FEBasisOperations.cc index cc2952630..38308e208 100644 --- a/utils/FEBasisOperations.cc +++ b/utils/FEBasisOperations.cc @@ -2120,6 +2120,38 @@ namespace dftfe } } + template + void + FEBasisOperations:: + 
createScratchMultiVectorsSinglePrec(const unsigned int vecBlockSize, + const unsigned int numMultiVecs) const + { + auto iter = scratchMultiVectorsSinglePrec.find(vecBlockSize); + if (iter == scratchMultiVectorsSinglePrec.end()) + { + scratchMultiVectorsSinglePrec[vecBlockSize] = + std::vector::type, + memorySpace>>(numMultiVecs); + for (unsigned int iVec = 0; iVec < numMultiVecs; ++iVec) + scratchMultiVectorsSinglePrec[vecBlockSize][iVec].reinit( + mpiPatternP2P, vecBlockSize); + } + else + { + scratchMultiVectorsSinglePrec[vecBlockSize].resize( + scratchMultiVectorsSinglePrec[vecBlockSize].size() + numMultiVecs); + for (unsigned int iVec = 0; + iVec < scratchMultiVectorsSinglePrec[vecBlockSize].size(); + ++iVec) + scratchMultiVectorsSinglePrec[vecBlockSize][iVec].reinit( + mpiPatternP2P, vecBlockSize); + } + } + template @@ -2128,6 +2160,7 @@ namespace dftfe clearScratchMultiVectors() const { scratchMultiVectors.clear(); + scratchMultiVectorsSinglePrec.clear(); } template + dftfe::linearAlgebra::MultiVector< + typename dftfe::dataTypes::singlePrecType::type, + memorySpace> & + FEBasisOperations:: + getMultiVectorSinglePrec(const unsigned int vecBlockSize, + const unsigned int index) const + { + AssertThrow(scratchMultiVectorsSinglePrec.find(vecBlockSize) != + scratchMultiVectorsSinglePrec.end(), + dealii::ExcMessage( + "DFT-FE Error: MultiVector not found in scratch storage.")); + return scratchMultiVectorsSinglePrec[vecBlockSize][index]; + } + template ::type, + MemoryStorage::type, MemorySpace::HOST_PINNED>>( d_mpiPatternP2P->localGhostSize() * d_blockSize, 0.0); d_sendRecvBufferSinglePrecHostPinnedPtr = std::make_shared< - MemoryStorage::type, + MemoryStorage::type, MemorySpace::HOST_PINNED>>( d_mpiPatternP2P->getOwnedLocalIndicesForTargetProcs() .size() * @@ -347,8 +349,8 @@ namespace dftfe } else { - typename singlePrecType::type *recvArrayStartPtr = - d_ghostDataCopySinglePrec.data(); + typename dftfe::dataTypes::singlePrecType::type + *recvArrayStartPtr = d_ghostDataCopySinglePrec.data(); #ifdef DFTFE_WITH_DEVICE if constexpr (memorySpace == MemorySpace::DEVICE) @@ -372,7 +374,8 @@ namespace dftfe .data()[2 * i]) * d_blockSize * sizeof( - typename singlePrecType::type), + typename dftfe::dataTypes::singlePrecType< + ValueType>::type), MPI_BYTE, d_mpiPatternP2P->getGhostProcIds().data()[i], static_cast( @@ -416,8 +419,8 @@ namespace dftfe d_sendRecvBufferSinglePrec); // initiate non-blocking sends to target processors - typename singlePrecType::type *sendArrayStartPtr = - d_sendRecvBufferSinglePrec.data(); + typename dftfe::dataTypes::singlePrecType::type + *sendArrayStartPtr = d_sendRecvBufferSinglePrec.data(); #ifdef DFTFE_WITH_DEVICE if constexpr (memorySpace == MemorySpace::DEVICE) @@ -456,7 +459,8 @@ namespace dftfe d_mpiPatternP2P->getNumOwnedIndicesForTargetProcs() .data()[i] * d_blockSize * - (sizeof(typename singlePrecType::type) / + (sizeof(typename dftfe::dataTypes::singlePrecType< + ValueType>::type) / 4), ncclFloat, d_mpiPatternP2P->getTargetProcIds().data()[i], @@ -483,7 +487,8 @@ namespace dftfe d_mpiPatternP2P->getGhostLocalIndicesRanges() .data()[2 * i]) * d_blockSize * - (sizeof(typename singlePrecType::type) / + (sizeof(typename dftfe::dataTypes::singlePrecType< + ValueType>::type) / 4), ncclFloat, d_mpiPatternP2P->getGhostProcIds().data()[i], @@ -511,7 +516,8 @@ namespace dftfe d_mpiPatternP2P->getNumOwnedIndicesForTargetProcs() .data()[i] * d_blockSize * - sizeof(typename singlePrecType::type), + sizeof(typename dftfe::dataTypes::singlePrecType< + 
ValueType>::type), MPI_BYTE, d_mpiPatternP2P->getTargetProcIds().data()[i], static_cast( @@ -793,8 +799,8 @@ namespace dftfe else { // initiate non-blocking receives from target processors - typename singlePrecType::type *recvArrayStartPtr = - d_sendRecvBufferSinglePrec.data(); + typename dftfe::dataTypes::singlePrecType::type + *recvArrayStartPtr = d_sendRecvBufferSinglePrec.data(); #ifdef DFTFE_WITH_DEVICE if constexpr (memorySpace == MemorySpace::DEVICE) { @@ -815,7 +821,8 @@ namespace dftfe d_mpiPatternP2P->getNumOwnedIndicesForTargetProcs() .data()[i] * d_blockSize * - sizeof(typename singlePrecType::type), + sizeof(typename dftfe::dataTypes::singlePrecType< + ValueType>::type), MPI_BYTE, d_mpiPatternP2P->getTargetProcIds().data()[i], static_cast( @@ -855,8 +862,8 @@ namespace dftfe d_ghostDataCopySinglePrec.data()); // initiate non-blocking sends to ghost processors - typename singlePrecType::type *sendArrayStartPtr = - d_ghostDataCopySinglePrec.data(); + typename dftfe::dataTypes::singlePrecType::type + *sendArrayStartPtr = d_ghostDataCopySinglePrec.data(); #ifdef DFTFE_WITH_DEVICE if constexpr (memorySpace == MemorySpace::DEVICE) @@ -898,7 +905,8 @@ namespace dftfe d_mpiPatternP2P->getGhostLocalIndicesRanges() .data()[2 * i]) * d_blockSize * - (sizeof(typename singlePrecType::type) / + (sizeof(typename dftfe::dataTypes::singlePrecType< + ValueType>::type) / 4), ncclFloat, d_mpiPatternP2P->getGhostProcIds().data()[i], @@ -923,7 +931,8 @@ namespace dftfe d_mpiPatternP2P->getNumOwnedIndicesForTargetProcs() .data()[i] * d_blockSize * - (sizeof(typename singlePrecType::type) / + (sizeof(typename dftfe::dataTypes::singlePrecType< + ValueType>::type) / 4), ncclFloat, d_mpiPatternP2P->getTargetProcIds().data()[i], @@ -951,7 +960,8 @@ namespace dftfe d_mpiPatternP2P->getGhostLocalIndicesRanges() .data()[2 * i]) * d_blockSize * - sizeof(typename singlePrecType::type), + sizeof(typename dftfe::dataTypes::singlePrecType< + ValueType>::type), MPI_BYTE, d_mpiPatternP2P->getGhostProcIds().data()[i], static_cast( diff --git a/utils/constraintMatrixInfo.cc b/utils/constraintMatrixInfo.cc index 1006bc717..d9a8f9e96 100644 --- a/utils/constraintMatrixInfo.cc +++ b/utils/constraintMatrixInfo.cc @@ -53,6 +53,28 @@ namespace dftfe zaxpy_(n, alpha, x, incx, y, incy); } + void + callaxpy(const unsigned int *n, + const float * alpha, + float * x, + const unsigned int *incx, + float * y, + const unsigned int *incy) + { + saxpy_(n, alpha, x, incx, y, incy); + } + + void + callaxpy(const unsigned int * n, + const std::complex *alpha, + std::complex * x, + const unsigned int * incx, + std::complex * y, + const unsigned int * incy) + { + caxpy_(n, alpha, x, incx, y, incy); + } + // @@ -497,6 +519,17 @@ namespace dftfe dftfe::utils::MemorySpace::HOST> &fieldVector) const; + template void + constraintMatrixInfo::distribute( + dftfe::linearAlgebra::MultiVector + &fieldVector) const; + + template void + constraintMatrixInfo::distribute( + dftfe::linearAlgebra::MultiVector, + dftfe::utils::MemorySpace::HOST> + &fieldVector) const; + template void constraintMatrixInfo:: distribute_slave_to_master( @@ -504,12 +537,24 @@ namespace dftfe dftfe::utils::MemorySpace::HOST> &fieldVector) const; + template void + constraintMatrixInfo:: + distribute_slave_to_master( + dftfe::linearAlgebra::MultiVector + &fieldVector) const; + template void constraintMatrixInfo::set_zero( dftfe::linearAlgebra::MultiVector &fieldVector) const; + template void + constraintMatrixInfo::set_zero( + dftfe::linearAlgebra::MultiVector + &fieldVector) 
const; template class constraintMatrixInfo; } // namespace dftUtils diff --git a/utils/constraintMatrixInfoDevice.cc b/utils/constraintMatrixInfoDevice.cc index df1067acb..fc47463c5 100644 --- a/utils/constraintMatrixInfoDevice.cc +++ b/utils/constraintMatrixInfoDevice.cc @@ -368,6 +368,102 @@ namespace dftfe } } + __global__ void + distributeSlaveToMasterKernelAtomicAdd( + const unsigned int contiguousBlockSize, + float * xVec, + const unsigned int *constraintLocalRowIdsUnflattened, + const unsigned int numConstraints, + const unsigned int *constraintRowSizes, + const unsigned int *constraintRowSizesAccumulated, + const unsigned int *constraintLocalColumnIdsAllRowsUnflattened, + const double * constraintColumnValuesAllRowsUnflattened) + { + const dealii::types::global_dof_index globalThreadId = + blockIdx.x * blockDim.x + threadIdx.x; + const dealii::types::global_dof_index numberEntries = + numConstraints * contiguousBlockSize; + + for (dealii::types::global_dof_index index = globalThreadId; + index < numberEntries; + index += blockDim.x * gridDim.x) + { + const unsigned int blockIndex = index / contiguousBlockSize; + const unsigned int intraBlockIndex = index % contiguousBlockSize; + const unsigned int constrainedRowId = + constraintLocalRowIdsUnflattened[blockIndex]; + const unsigned int numberColumns = constraintRowSizes[blockIndex]; + const unsigned int startingColumnNumber = + constraintRowSizesAccumulated[blockIndex]; + const dealii::types::global_dof_index xVecStartingIdRow = + constrainedRowId * contiguousBlockSize; + for (unsigned int i = 0; i < numberColumns; ++i) + { + const unsigned int constrainedColumnId = + constraintLocalColumnIdsAllRowsUnflattened + [startingColumnNumber + i]; + const dealii::types::global_dof_index xVecStartingIdColumn = + constrainedColumnId * contiguousBlockSize; + atomicAdd(&(xVec[xVecStartingIdColumn + intraBlockIndex]), + constraintColumnValuesAllRowsUnflattened + [startingColumnNumber + i] * + xVec[xVecStartingIdRow + intraBlockIndex]); + } + xVec[xVecStartingIdRow + intraBlockIndex] = 0.0; + } + } + + + __global__ void + distributeSlaveToMasterKernelAtomicAdd( + const unsigned int contiguousBlockSize, + dftfe::utils::deviceFloatComplex *xVec, + const unsigned int * constraintLocalRowIdsUnflattened, + const unsigned int numConstraints, + const unsigned int * constraintRowSizes, + const unsigned int * constraintRowSizesAccumulated, + const unsigned int *constraintLocalColumnIdsAllRowsUnflattened, + const double * constraintColumnValuesAllRowsUnflattened) + { + const dealii::types::global_dof_index globalThreadId = + blockIdx.x * blockDim.x + threadIdx.x; + const dealii::types::global_dof_index numberEntries = + numConstraints * contiguousBlockSize; + + for (dealii::types::global_dof_index index = globalThreadId; + index < numberEntries; + index += blockDim.x * gridDim.x) + { + const unsigned int blockIndex = index / contiguousBlockSize; + const unsigned int intraBlockIndex = index % contiguousBlockSize; + const unsigned int constrainedRowId = + constraintLocalRowIdsUnflattened[blockIndex]; + const unsigned int numberColumns = constraintRowSizes[blockIndex]; + const unsigned int startingColumnNumber = + constraintRowSizesAccumulated[blockIndex]; + const dealii::types::global_dof_index xVecStartingIdRow = + constrainedRowId * contiguousBlockSize; + for (unsigned int i = 0; i < numberColumns; ++i) + { + const unsigned int constrainedColumnId = + constraintLocalColumnIdsAllRowsUnflattened + [startingColumnNumber + i]; + const 
dealii::types::global_dof_index xVecStartingIdColumn = + constrainedColumnId * contiguousBlockSize; + const dftfe::utils::deviceDoubleComplex tempComplval = + dftfe::utils::mult(constraintColumnValuesAllRowsUnflattened + [startingColumnNumber + i], + xVec[xVecStartingIdRow + intraBlockIndex]); + atomicAdd(&(xVec[xVecStartingIdColumn + intraBlockIndex].x), + tempComplval.x); + atomicAdd(&(xVec[xVecStartingIdColumn + intraBlockIndex].y), + tempComplval.y); + } + xVec[xVecStartingIdRow + intraBlockIndex].x = 0.0; + xVec[xVecStartingIdRow + intraBlockIndex].y = 0.0; + } + } + __global__ void setzeroKernel(const unsigned int contiguousBlockSize, @@ -682,56 +778,11 @@ namespace dftfe // set the constrained degrees of freedom to values so that constraints // are satisfied for flattened array // + template void constraintMatrixInfo:: distribute_slave_to_master( - distributedDeviceVec &fieldVector) const - { - if (d_numConstrainedDofs == 0) - return; - - const unsigned int blockSize = fieldVector.numVectors(); -#ifdef DFTFE_WITH_DEVICE_LANG_CUDA - distributeSlaveToMasterKernelAtomicAdd<<< - min((blockSize * d_numConstrainedDofs + - (dftfe::utils::DEVICE_BLOCK_SIZE - 1)) / - dftfe::utils::DEVICE_BLOCK_SIZE, - 30000), - dftfe::utils::DEVICE_BLOCK_SIZE>>>( - blockSize, - dftfe::utils::makeDataTypeDeviceCompatible(fieldVector.begin()), - d_rowIdsLocalDevice.begin(), - d_numConstrainedDofs, - d_rowSizesDevice.begin(), - d_rowSizesAccumulatedDevice.begin(), - d_columnIdsLocalDevice.begin(), - d_columnValuesDevice.begin()); -#elif DFTFE_WITH_DEVICE_LANG_HIP - hipLaunchKernelGGL(distributeSlaveToMasterKernelAtomicAdd, - min((blockSize * d_numConstrainedDofs + - (dftfe::utils::DEVICE_BLOCK_SIZE - 1)) / - dftfe::utils::DEVICE_BLOCK_SIZE, - 30000), - dftfe::utils::DEVICE_BLOCK_SIZE, - 0, - 0, - blockSize, - dftfe::utils::makeDataTypeDeviceCompatible( - fieldVector.begin()), - d_rowIdsLocalDevice.begin(), - d_numConstrainedDofs, - d_rowSizesDevice.begin(), - d_rowSizesAccumulatedDevice.begin(), - d_columnIdsLocalDevice.begin(), - d_columnValuesDevice.begin()); -#endif - } - - - void - constraintMatrixInfo:: - distribute_slave_to_master( - distributedDeviceVec> &fieldVector) const + distributedDeviceVec &fieldVector) const { if (d_numConstrainedDofs == 0) return; @@ -773,7 +824,6 @@ namespace dftfe #endif } - template void constraintMatrixInfo::set_zero( @@ -916,6 +966,25 @@ namespace dftfe constraintMatrixInfo::set_zero( distributedDeviceVec> &fieldVector) const; + template void + constraintMatrixInfo:: + distribute_slave_to_master( + distributedDeviceVec &fieldVector) const; + + template void + constraintMatrixInfo:: + distribute_slave_to_master( + distributedDeviceVec> &fieldVector) const; + + template void + constraintMatrixInfo:: + distribute_slave_to_master( + distributedDeviceVec &fieldVector) const; + + template void + constraintMatrixInfo:: + distribute_slave_to_master( + distributedDeviceVec> &fieldVector) const; template class constraintMatrixInfo; } // namespace dftUtils diff --git a/utils/dftParameters.cc b/utils/dftParameters.cc index b3d308d83..aca4cf417 100644 --- a/utils/dftParameters.cc +++ b/utils/dftParameters.cc @@ -970,10 +970,10 @@ namespace dftfe "[Advanced] Use mixed precision arithmetic in Rayleigh-Ritz subspace rotation step. Default setting is false."); prm.declare_entry( - "USE MIXED PREC CHEBY", + "USE SINGLE PREC COMMUN CHEBY", "false", dealii::Patterns::Bool(), - "[Advanced] Use mixed precision arithmetic in Chebyshev filtering. 
Currently this option is only available for real executable and USE ELPA=true for which DFT-FE also has to be linked to ELPA library. Default setting is false."); + "[Advanced] Use single precision communication in Chebyshev filtering. Default setting is false."); prm.declare_entry( "USE MIXED PREC COMMUN ONLY XTX XTHX", @@ -981,6 +981,12 @@ namespace dftfe dealii::Patterns::Bool(), "[Advanced] Use mixed precision communication only for XtX and XtHX instead of mixed precision compute and communication. This setting has been found to be more optimal on certain architectures. Default setting is false."); + prm.declare_entry( + "USE SINGLE PREC CHEBY", + "false", + dealii::Patterns::Bool(), + "[Advanced] Use a modified single precision algorithm for Chebyshev filtering. This cannot be used in conjunction with spectrum splitting. Default setting is false."); + prm.declare_entry( "OVERLAP COMPUTE COMMUN CHEBY", "true", @@ -1263,7 +1269,7 @@ namespace dftfe useDevice = false; deviceFineGrainedTimings = false; allowFullCPUMemSubspaceRot = true; - useMixedPrecCheby = false; + useSinglePrecCommunCheby = false; overlapComputeCommunCheby = false; overlapComputeCommunOrthoRR = false; autoDeviceBlockSizes = true; @@ -1604,7 +1610,8 @@ namespace dftfe useMixedPrecSubspaceRotRR = prm.get_bool("USE MIXED PREC RR_SR"); useMixedPrecCommunOnlyXTHXCGSO = prm.get_bool("USE MIXED PREC COMMUN ONLY XTX XTHX"); - useMixedPrecCheby = prm.get_bool("USE MIXED PREC CHEBY"); + useSinglePrecCommunCheby = prm.get_bool("USE SINGLE PREC COMMUN CHEBY"); + useSinglePrecCheby = prm.get_bool("USE SINGLE PREC CHEBY"); overlapComputeCommunCheby = prm.get_bool("OVERLAP COMPUTE COMMUN CHEBY"); overlapComputeCommunOrthoRR = @@ -1929,7 +1936,7 @@ namespace dftfe useMixedPrecCGS_O = true; useMixedPrecCGS_SR = true; useMixedPrecXTHXSpectrumSplit = true; - useMixedPrecCheby = true; + useSinglePrecCommunCheby = true; reuseLanczosUpperBoundFromFirstCall = true; } @@ -1965,12 +1972,6 @@ namespace dftfe useDeviceDirectAllReduce = useDCCL && useDeviceDirectAllReduce; #endif - if (useMixedPrecCheby) - AssertThrow( - useELPA, - dealii::ExcMessage( - "DFT-FE Error: USE ELPA must be set to true for USE MIXED PREC CHEBY.")); - if (verbosity >= 5) computeEnergyEverySCF = true; @@ -2010,6 +2011,9 @@ namespace dftfe if (numCoreWfcRR == 0) spectrumSplitStartingScfIter = 10000; + + if (numCoreWfcRR != 0) + useSinglePrecCheby = false; }
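For reference, the new BLASWrapper::ApaBD primitive introduced in this patch computes, per the HOST implementation above, C = A + alpha * (B scaled column-wise by D), i.e. C[iCol + n*iRow] = A[iCol + n*iRow] + alpha * B[iCol + n*iRow] * D[iCol]. Below is a minimal standalone sketch of that reference loop with made-up sizes and data; it is illustration only and not part of the patch (the real entry point dispatches through BLASWrapper<memorySpace> and supports mixed value types).

// Standalone sketch of the ApaBD semantics used by the HOST path:
//   C[iCol + n*iRow] = A[iCol + n*iRow] + alpha * B[iCol + n*iRow] * D[iCol]
// i.e. C = A + alpha*B with each column scaled by D. Sizes/values are hypothetical.
#include <iostream>
#include <vector>

template <typename TA, typename TB, typename TD, typename TC>
void apaBDReference(const unsigned int m,  // number of rows (blocks)
                    const unsigned int n,  // fast dimension, length of D
                    const double       alpha,
                    const TA *A, const TB *B, const TD *D, TC *C)
{
  for (unsigned int iRow = 0; iRow < m; ++iRow)
    for (unsigned int iCol = 0; iCol < n; ++iCol)
      C[iCol + n * iRow] =
        A[iCol + n * iRow] + alpha * B[iCol + n * iRow] * D[iCol];
}

int main()
{
  const unsigned int  m = 2, n = 3;
  std::vector<double> A(m * n, 1.0), B(m * n, 2.0), D{1.0, 10.0, 100.0};
  std::vector<double> C(m * n, 0.0);
  apaBDReference(m, n, 0.5, A.data(), B.data(), D.data(), C.data());
  for (double c : C)
    std::cout << c << ' ';  // prints: 2 11 101 2 11 101
  std::cout << '\n';
}

The ApaBDDeviceKernel added above performs the same elementwise update on the device, with index % nCols selecting the entry of D.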
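Similarly, the widened axpyStridedBlockAtomicAdd overloads (a separate template parameter for the destination type) allow, for example, a single-precision addFromVec to be accumulated into a double-precision addToVec. The following is a hedged host-side sketch of the indexing implied by the HOST implementation in this patch, with hypothetical data; the new device kernels perform the same update via atomicAdd.

// Sketch of the strided block "axpy with atomic add" semantics:
//   for each block b and intra-block index i:
//     addToVec[startIds[b] + i] += (a * s[b]) * addFromVec[b*blockSize + i]
// Here addFromVec is float and addToVec is double, matching the new
// mixed-precision instantiations added by this patch. Values are made up.
#include <cstddef>
#include <iostream>
#include <vector>

void axpyStridedBlockAddReference(const std::size_t  blockSize,
                                  const std::size_t  numBlocks,
                                  const double       a,
                                  const double *     s,
                                  const float *      addFromVec,
                                  double *           addToVec,
                                  const std::size_t *startIds)
{
  for (std::size_t b = 0; b < numBlocks; ++b)
    {
      const double coeff = a * s[b]; // per-block scaling factor
      for (std::size_t i = 0; i < blockSize; ++i)
        addToVec[startIds[b] + i] +=
          coeff * static_cast<double>(addFromVec[b * blockSize + i]);
    }
}

int main()
{
  const std::size_t        blockSize = 2, numBlocks = 2;
  std::vector<double>      s{1.0, 2.0}, addToVec(6, 0.0);
  std::vector<float>       addFromVec{1.f, 2.f, 3.f, 4.f};
  std::vector<std::size_t> startIds{0, 4}; // blocks land at offsets 0 and 4
  axpyStridedBlockAddReference(blockSize, numBlocks, 0.5, s.data(),
                               addFromVec.data(), addToVec.data(),
                               startIds.data());
  for (double v : addToVec)
    std::cout << v << ' ';  // prints: 0.5 1 0 0 3 4
  std::cout << '\n';
}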
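Finally, the two new input flags declared in dftParameters.cc are read from the usual deal.II-style parameter file. A minimal illustrative snippet is given below; the subsection nesting is an assumption about where these entries live in the input file, and both flags default to false.

subsection SCF parameters
  subsection Eigen-solver parameters
    set USE SINGLE PREC COMMUN CHEBY = true
    set USE SINGLE PREC CHEBY        = true
  end
end

Note that, per the parsing code above, USE SINGLE PREC CHEBY is forced back to false whenever spectrum splitting is active (numCoreWfcRR != 0).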