From 8b90a1980bff097225a524b6b9f9a4e3f2a92594 Mon Sep 17 00:00:00 2001 From: Nikhil Kodali Date: Fri, 9 Feb 2024 14:27:14 +0530 Subject: [PATCH 01/24] Cleanups --- CMakeLists.txt | 9 +- include/BLASWrapper.h | 179 ++++- include/FEBasisOperations.h | 627 ++++-------------- include/FEBasisOperationsKernelsDevice.h | 52 -- include/FEBasisOperationsKernelsInternal.h | 88 +++ include/densityCalculator.h | 14 +- include/dft.h | 32 - include/kohnShamDFTOperator.h | 3 - include/kohnShamDFTOperatorDevice.h | 6 - include/linearAlgebraOperations.h | 47 ++ include/operator.h | 4 - include/operatorDevice.h | 14 - include/solveVselfInBinsDevice.h | 10 +- include/vselfBinsManager.h | 22 +- src/dft/atomicRho.cc | 45 -- src/dft/charge.cc | 179 ----- src/dft/density.cc | 4 + src/dft/densityCalculator.cc | 21 +- src/dft/densityCalculatorDeviceKernels.cc | 58 +- src/dft/dft.cc | 51 +- src/dft/femUtilityFunctions.cc | 31 - src/dft/initBoundaryConditions.cc | 7 + src/dft/pRefinedDoFHandler.cc | 6 + src/dft/solveNSCF.cc | 5 +- src/dft/solveVselfInBins.cc | 26 +- src/dft/solveVselfInBinsDevice.cc | 13 +- ...iltonianMatrixCalculatorFlattenedDevice.cc | 8 +- src/dftOperator/kohnShamDFTOperator.cc | 9 - src/dftOperator/kohnShamDFTOperatorDevice.cc | 18 - .../shapeFunctionDataCalculatorDevice.cc | 44 -- utils/BLASWrapperDevice.cu.cc | 250 +++++++ utils/BLASWrapperDevice.hip.cc | 227 +++++++ utils/BLASWrapperHost.cc | 166 ++++- ...isOperations.t.cc => FEBasisOperations.cc} | 557 +++++++++------- utils/FEBasisOperationsDevice.t.cc | 325 --------- utils/FEBasisOperationsHost.t.cc | 493 -------------- utils/FEBasisOperationsKernels.cc | 441 ++++++++++++ utils/FEBasisOperationsKernelsDevice.cc | 110 --- .../FEBasisOperationsKernelsInternalDevice.cc | 181 +++++ utils/FEBasisOperationsKernelsInternalHost.cc | 89 +++ 40 files changed, 2238 insertions(+), 2233 deletions(-) delete mode 100644 include/FEBasisOperationsKernelsDevice.h create mode 100644 include/FEBasisOperationsKernelsInternal.h rename 
utils/{FEBasisOperations.t.cc => FEBasisOperations.cc} (75%) delete mode 100644 utils/FEBasisOperationsDevice.t.cc delete mode 100644 utils/FEBasisOperationsHost.t.cc create mode 100644 utils/FEBasisOperationsKernels.cc delete mode 100644 utils/FEBasisOperationsKernelsDevice.cc create mode 100644 utils/FEBasisOperationsKernelsInternalDevice.cc create mode 100644 utils/FEBasisOperationsKernelsInternalHost.cc diff --git a/CMakeLists.txt b/CMakeLists.txt index 195a3f4a0..644aea0f9 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -156,7 +156,10 @@ SET(TARGET_SRC ./src/force/configurationalStressCompute/computeStressEself.cc ./src/force/configurationalStressCompute/stress.cc ./src/force/createBinObjectsForce.cc - ./src/force/locateAtomCoreNodesForce.cc) + ./src/force/locateAtomCoreNodesForce.cc + ./utils/FEBasisOperationsKernelsInternalHost.cc + ./utils/FEBasisOperations.cc + ./utils/FEBasisOperationsKernels.cc) IF ("${GPU_LANG}" STREQUAL "cuda") @@ -184,7 +187,7 @@ SET(DEVICE_SRC ./src/solvers/linearSolverProblemDevice.cc ./src/poisson/poissonSolverProblemDevice.cc ./src/helmholtz/kerkerSolverProblemDevice.cc - ./utils/FEBasisOperationsKernelsDevice.cc + ./utils/FEBasisOperationsKernelsInternalDevice.cc ) ELSEIF ("${GPU_LANG}" STREQUAL "hip") @@ -213,7 +216,7 @@ SET(DEVICE_SRC ./src/solvers/linearSolverProblemDevice.cc ./src/poisson/poissonSolverProblemDevice.cc ./src/helmholtz/kerkerSolverProblemDevice.cc - ./utils/FEBasisOperationsKernelsDevice.cc + ./utils/FEBasisOperationsKernelsInternalDevice.cc ) ENDIF() diff --git a/include/BLASWrapper.h b/include/BLASWrapper.h index e005f77db..1120b3f0d 100644 --- a/include/BLASWrapper.h +++ b/include/BLASWrapper.h @@ -101,6 +101,59 @@ namespace dftfe std::complex * C, const unsigned int ldc) const; + void + xgemv(const char transA, + const unsigned int m, + const unsigned int n, + const double * alpha, + const double * A, + const unsigned int lda, + const double * x, + const unsigned int incx, + const double * beta, + 
double * y, + const unsigned int incy) const; + + void + xgemv(const char transA, + const unsigned int m, + const unsigned int n, + const float * alpha, + const float * A, + const unsigned int lda, + const float * x, + const unsigned int incx, + const float * beta, + float * y, + const unsigned int incy) const; + + void + xgemv(const char transA, + const unsigned int m, + const unsigned int n, + const std::complex *alpha, + const std::complex *A, + const unsigned int lda, + const std::complex *x, + const unsigned int incx, + const std::complex *beta, + std::complex * y, + const unsigned int incy) const; + + void + xgemv(const char transA, + const unsigned int m, + const unsigned int n, + const std::complex *alpha, + const std::complex *A, + const unsigned int lda, + const std::complex *x, + const unsigned int incx, + const std::complex *beta, + std::complex * y, + const unsigned int incy) const; + + template void xscal(ValueType1 * x, @@ -187,6 +240,14 @@ namespace dftfe float * y, const unsigned int incy) const; + // Complex float copy of data + void + xcopy(const unsigned int n, + const std::complex *x, + const unsigned int incx, + std::complex * y, + const unsigned int incy) const; + // Real double symmetric matrix-vector product void xsymv(const char UPLO, @@ -417,22 +478,22 @@ namespace dftfe template void - axpyStridedBlockAtomicAdd( - const dftfe::size_type contiguousBlockSize, - const dftfe::size_type numContiguousBlocks, - const ValueType * addFromVec, - ValueType * addToVec, - const dftfe::global_size_type *addToVecStartingContiguousBlockIds); + axpyStridedBlockAtomicAdd(const dftfe::size_type contiguousBlockSize, + const dftfe::size_type numContiguousBlocks, + const ValueType * addFromVec, + ValueType * addToVec, + const dftfe::global_size_type + *addToVecStartingContiguousBlockIds) const; template void - axpyStridedBlockAtomicAdd( - const dftfe::size_type contiguousBlockSize, - const dftfe::size_type numContiguousBlocks, - const ValueType * addFromVec, 
- double * addToVecReal, - double * addToVecImag, - const dftfe::global_size_type *addToVecStartingContiguousBlockIds); + axpyStridedBlockAtomicAdd(const dftfe::size_type contiguousBlockSize, + const dftfe::size_type numContiguousBlocks, + const ValueType * addFromVec, + double * addToVecReal, + double * addToVecImag, + const dftfe::global_size_type + *addToVecStartingContiguousBlockIds) const; template void @@ -528,6 +589,60 @@ namespace dftfe const unsigned int ldc) const; + void + xgemv(const char transA, + const unsigned int m, + const unsigned int n, + const double * alpha, + const double * A, + const unsigned int lda, + const double * x, + const unsigned int incx, + const double * beta, + double * y, + const unsigned int incy) const; + + void + xgemv(const char transA, + const unsigned int m, + const unsigned int n, + const float * alpha, + const float * A, + const unsigned int lda, + const float * x, + const unsigned int incx, + const float * beta, + float * y, + const unsigned int incy) const; + + void + xgemv(const char transA, + const unsigned int m, + const unsigned int n, + const std::complex *alpha, + const std::complex *A, + const unsigned int lda, + const std::complex *x, + const unsigned int incx, + const std::complex *beta, + std::complex * y, + const unsigned int incy) const; + + void + xgemv(const char transA, + const unsigned int m, + const unsigned int n, + const std::complex *alpha, + const std::complex *A, + const unsigned int lda, + const std::complex *x, + const unsigned int incx, + const std::complex *beta, + std::complex * y, + const unsigned int incy) const; + + + template void xscal(ValueType1 * x, @@ -613,6 +728,14 @@ namespace dftfe float * y, const unsigned int incy) const; + // Complex float copy of data + void + xcopy(const unsigned int n, + const std::complex *x, + const unsigned int incx, + std::complex * y, + const unsigned int incy) const; + // Real double symmetric matrix-vector product void xsymv(const char UPLO, @@ -841,22 
+964,22 @@ namespace dftfe template void - axpyStridedBlockAtomicAdd( - const dftfe::size_type contiguousBlockSize, - const dftfe::size_type numContiguousBlocks, - const ValueType * addFromVec, - ValueType * addToVec, - const dftfe::global_size_type *addToVecStartingContiguousBlockIds); + axpyStridedBlockAtomicAdd(const dftfe::size_type contiguousBlockSize, + const dftfe::size_type numContiguousBlocks, + const ValueType * addFromVec, + ValueType * addToVec, + const dftfe::global_size_type + *addToVecStartingContiguousBlockIds) const; template void - axpyStridedBlockAtomicAdd( - const dftfe::size_type contiguousBlockSize, - const dftfe::size_type numContiguousBlocks, - const ValueType * addFromVec, - double * addToVecReal, - double * addToVecImag, - const dftfe::global_size_type *addToVecStartingContiguousBlockIds); + axpyStridedBlockAtomicAdd(const dftfe::size_type contiguousBlockSize, + const dftfe::size_type numContiguousBlocks, + const ValueType * addFromVec, + double * addToVecReal, + double * addToVecImag, + const dftfe::global_size_type + *addToVecStartingContiguousBlockIds) const; template void @@ -879,6 +1002,9 @@ namespace dftfe const ValueType beta, const dftfe::size_type size); + dftfe::utils::deviceBlasHandle_t & + getDeviceBlasHandle(); + private: # ifdef DFTFE_WITH_DEVICE_AMD void @@ -889,7 +1015,6 @@ namespace dftfe dftfe::utils::deviceBlasHandle_t d_deviceBlasHandle; dftfe::utils::deviceStream_t d_streamId; - dftfe::utils::deviceBlasStatus_t create(); diff --git a/include/FEBasisOperations.h b/include/FEBasisOperations.h index 2f6c6379a..9a58b05af 100644 --- a/include/FEBasisOperations.h +++ b/include/FEBasisOperations.h @@ -81,7 +81,7 @@ namespace dftfe template - class FEBasisOperationsBase + class FEBasisOperations { protected: mutable dftfe::utils::MemoryStorage @@ -99,7 +99,7 @@ namespace dftfe * be the same vector which was passed for the construction of the given * MatrixFree object. 
*/ - FEBasisOperationsBase( + FEBasisOperations( dealii::MatrixFree<3, ValueTypeBasisData> &matrixFreeData, std::vector *> &constraintsVector, @@ -110,7 +110,7 @@ namespace dftfe /** * @brief Default Destructor */ - ~FEBasisOperationsBase() = default; + ~FEBasisOperations() = default; /** * @brief fills required data structures for the given dofHandlerID @@ -130,9 +130,9 @@ namespace dftfe */ template void - init(const FEBasisOperationsBase &basisOperationsSrc); + init(const FEBasisOperations &basisOperationsSrc); /** * @brief sets internal variables and optionally resizes internal temp storage for interpolation operations @@ -208,6 +208,16 @@ namespace dftfe void initializeShapeFunctionAndJacobianBasisData(); + + /** + * @brief Computes the cell-level stiffness matrix. + */ + void + computeCellStiffnessMatrix(const unsigned int quadratureID, + const unsigned int cellsBlockSize, + const bool basisType = false, + const bool ceoffType = true); + /** * @brief Resizes the internal temp storage to be sufficient for the vector and cell block sizes provided in reinit. */ @@ -292,17 +302,24 @@ namespace dftfe * d_nDofsPerCell + iNode] and if true it is indexed as [iNode * * d_nQuadsPerCell + iQuad]. */ - template ::value, int> = 0> - const dftfe::utils::MemoryStorage & - shapeFunctionBasisData(bool transpose = false) const; - template ::value, int> = 0> - const dftfe::utils::MemoryStorage & - shapeFunctionBasisData(bool transpose = false) const; - + const auto & + shapeFunctionBasisData(bool transpose = false) const + { + if constexpr (std::is_same::value) + { + return transpose ? + d_shapeFunctionDataTranspose.find(d_quadratureID)->second : + d_shapeFunctionData.find(d_quadratureID)->second; + } + else + { + return transpose ? + d_shapeFunctionBasisDataTranspose.find(d_quadratureID) + ->second : + d_shapeFunctionBasisData.find(d_quadratureID)->second; + } + } /** * @brief Shape function gradient values at quadrature points in ValueTypeBasisData. 
* @param[in] transpose if false the the data is indexed as [iDim * @@ -310,16 +327,27 @@ namespace dftfe * if true it is indexed as [iDim * d_nQuadsPerCell * d_nDofsPerCell + * iNode * d_nQuadsPerCell + iQuad]. */ - template ::value, int> = 0> - const dftfe::utils::MemoryStorage & - shapeFunctionGradientBasisData(bool transpose = false) const; - template ::value, int> = 0> - const dftfe::utils::MemoryStorage & - shapeFunctionGradientBasisData(bool transpose = false) const; + const auto & + shapeFunctionGradientBasisData(bool transpose = false) const + { + if constexpr (std::is_same::value) + { + return transpose ? + d_shapeFunctionGradientDataTranspose.find(d_quadratureID) + ->second : + d_shapeFunctionGradientData.find(d_quadratureID)->second; + } + else + { + return transpose ? + d_shapeFunctionGradientBasisDataTranspose + .find(d_quadratureID) + ->second : + d_shapeFunctionGradientBasisData.find(d_quadratureID) + ->second; + } + } /** * @brief Inverse Jacobian matrices in ValueTypeBasisData, for cartesian cells returns the @@ -327,31 +355,65 @@ namespace dftfe * affine cells returns the 3x3 inverse Jacobians for each cell otherwise * returns the 3x3 inverse Jacobians at each quad point for each cell. */ - template ::value, int> = 0> - const dftfe::utils::MemoryStorage & - inverseJacobiansBasisData() const; - template ::value, int> = 0> - const dftfe::utils::MemoryStorage & - inverseJacobiansBasisData() const; + const auto & + inverseJacobiansBasisData() const + { + if constexpr (std::is_same::value) + { + return d_inverseJacobianData + .find(areAllCellsAffine ? 0 : d_quadratureID) + ->second; + } + else + { + return d_inverseJacobianBasisData + .find(areAllCellsAffine ? 0 : d_quadratureID) + ->second; + } + } /** * @brief determinant of Jacobian times the quadrature weight in ValueTypeBasisData at each * quad point for each cell. 
*/ - template ::value, int> = 0> - const dftfe::utils::MemoryStorage & - JxWBasisData() const; - template ::value, int> = 0> + const auto & + JxWBasisData() const + { + if constexpr (std::is_same::value) + { + return d_JxWData.find(d_quadratureID)->second; + } + else + { + return d_JxWBasisData.find(d_quadratureID)->second; + } + } + + /** + * @brief Cell level stiffness matrix in ValueTypeBasisCoeff + */ + const auto & + cellStiffnessMatrix() const + { + if constexpr (std::is_same::value) + { + return d_cellStiffnessMatrixBasisType; + } + else + { + return d_cellStiffnessMatrixCoeffType; + } + } + + + /** + * @brief Cell level stiffness matrix in ValueTypeBasisData + */ const dftfe::utils::MemoryStorage & - JxWBasisData() const; + cellStiffnessMatrixBasisData() const; /** * @brief returns 2 if all cells on current processor are Cartesian, @@ -494,7 +556,10 @@ namespace dftfe dftfe::utils::MemoryStorage> d_shapeFunctionGradientBasisDataTranspose; - + dftfe::utils::MemoryStorage + d_cellStiffnessMatrixBasisType; + dftfe::utils::MemoryStorage + d_cellStiffnessMatrixCoeffType; mutable std::map< unsigned int, std::vector< @@ -518,129 +583,6 @@ namespace dftfe std::shared_ptr> mpiPatternP2P; - }; - template - class FEBasisOperations : FEBasisOperationsBase - {}; - - template - class FEBasisOperations - : public FEBasisOperationsBase - { - public: - using FEBasisOperationsBase< - ValueTypeBasisCoeff, - ValueTypeBasisData, - dftfe::utils::MemorySpace::HOST>::FEBasisOperationsBase; - - using FEBasisOperationsBase::d_nCells; - using FEBasisOperationsBase::d_localSize; - using FEBasisOperationsBase< - ValueTypeBasisCoeff, - ValueTypeBasisData, - dftfe::utils::MemorySpace::HOST>::d_locallyOwnedSize; - using FEBasisOperationsBase< - ValueTypeBasisCoeff, - ValueTypeBasisData, - dftfe::utils::MemorySpace::HOST>::tempCellNodalData; - using FEBasisOperationsBase< - ValueTypeBasisCoeff, - ValueTypeBasisData, - dftfe::utils::MemorySpace::HOST>::tempQuadratureGradientsData; - using 
FEBasisOperationsBase< - ValueTypeBasisCoeff, - ValueTypeBasisData, - dftfe::utils::MemorySpace::HOST>::tempQuadratureGradientsDataNonAffine; - using FEBasisOperationsBase::d_nVectors; - using FEBasisOperationsBase< - ValueTypeBasisCoeff, - ValueTypeBasisData, - dftfe::utils::MemorySpace::HOST>::d_BLASWrapperPtr; - using FEBasisOperationsBase< - ValueTypeBasisCoeff, - ValueTypeBasisData, - dftfe::utils::MemorySpace::HOST>::d_quadratureID; - using FEBasisOperationsBase< - ValueTypeBasisCoeff, - ValueTypeBasisData, - dftfe::utils::MemorySpace::HOST>::d_quadratureIndex; - using FEBasisOperationsBase< - ValueTypeBasisCoeff, - ValueTypeBasisData, - dftfe::utils::MemorySpace::HOST>::d_nQuadsPerCell; - using FEBasisOperationsBase< - ValueTypeBasisCoeff, - ValueTypeBasisData, - dftfe::utils::MemorySpace::HOST>::d_nDofsPerCell; - using FEBasisOperationsBase< - ValueTypeBasisCoeff, - ValueTypeBasisData, - dftfe::utils::MemorySpace::HOST>::areAllCellsAffine; - using FEBasisOperationsBase< - ValueTypeBasisCoeff, - ValueTypeBasisData, - dftfe::utils::MemorySpace::HOST>::areAllCellsCartesian; - using FEBasisOperationsBase< - ValueTypeBasisCoeff, - ValueTypeBasisData, - dftfe::utils::MemorySpace::HOST>::d_updateFlags; - using FEBasisOperationsBase< - ValueTypeBasisCoeff, - ValueTypeBasisData, - dftfe::utils::MemorySpace::HOST>::d_shapeFunctionData; - using FEBasisOperationsBase< - ValueTypeBasisCoeff, - ValueTypeBasisData, - dftfe::utils::MemorySpace::HOST>::d_shapeFunctionDataTranspose; - using FEBasisOperationsBase< - ValueTypeBasisCoeff, - ValueTypeBasisData, - dftfe::utils::MemorySpace::HOST>::d_shapeFunctionGradientData; - using FEBasisOperationsBase< - ValueTypeBasisCoeff, - ValueTypeBasisData, - dftfe::utils::MemorySpace::HOST>::d_shapeFunctionGradientDataTranspose; - using FEBasisOperationsBase:: - d_shapeFunctionGradientDataInternalLayout; - using FEBasisOperationsBase::d_JxWData; - using FEBasisOperationsBase< - ValueTypeBasisCoeff, - ValueTypeBasisData, - 
dftfe::utils::MemorySpace::HOST>::d_inverseJacobianData; - using FEBasisOperationsBase< - ValueTypeBasisCoeff, - ValueTypeBasisData, - dftfe::utils::MemorySpace::HOST>::d_cellIndexToCellIdMap; - using FEBasisOperationsBase< - ValueTypeBasisCoeff, - ValueTypeBasisData, - dftfe::utils::MemorySpace::HOST>::d_cellDofIndexToProcessDofIndexMap; - using FEBasisOperationsBase:: - d_flattenedCellDofIndexToProcessDofIndexMap; - using FEBasisOperationsBase< - ValueTypeBasisCoeff, - ValueTypeBasisData, - dftfe::utils::MemorySpace::HOST>::d_constraintsVector; /** @@ -654,12 +596,10 @@ namespace dftfe * d_nQuadsPerCell * d_nVectors + iQuad * d_nVectors + iVec]. */ void - interpolate( - dftfe::linearAlgebra::MultiVector - & nodalData, - ValueTypeBasisCoeff *quadratureValues, - ValueTypeBasisCoeff *quadratureGradients = NULL) const; + interpolate(dftfe::linearAlgebra::MultiVector &nodalData, + ValueTypeBasisCoeff *quadratureValues, + ValueTypeBasisCoeff *quadratureGradients = NULL) const; // FIXME Untested function /** @@ -675,8 +615,7 @@ namespace dftfe integrateWithBasis( ValueTypeBasisCoeff *quadratureValues, ValueTypeBasisCoeff *quadratureGradients, - dftfe::linearAlgebra::MultiVector + dftfe::linearAlgebra::MultiVector &nodalData) const; /** @@ -688,8 +627,7 @@ namespace dftfe */ void extractToCellNodalData( - dftfe::linearAlgebra::MultiVector + dftfe::linearAlgebra::MultiVector & nodalData, ValueTypeBasisCoeff *cellNodalDataPtr) const; // FIXME Untested function @@ -702,8 +640,7 @@ namespace dftfe void accumulateFromCellNodalData( const ValueTypeBasisCoeff *cellNodalDataPtr, - dftfe::linearAlgebra::MultiVector + dftfe::linearAlgebra::MultiVector &nodalData) const; /** @@ -721,9 +658,8 @@ namespace dftfe void interpolateKernel( const dftfe::linearAlgebra::MultiVector - & nodalData, - ValueTypeBasisCoeff * quadratureValues, + memorySpace> &nodalData, + ValueTypeBasisCoeff * quadratureValues, ValueTypeBasisCoeff * quadratureGradients, const std::pair cellRange) const; @@ -762,8 
+698,7 @@ namespace dftfe integrateWithBasisKernel( const ValueTypeBasisCoeff *quadratureValues, const ValueTypeBasisCoeff *quadratureGradients, - dftfe::linearAlgebra::MultiVector + dftfe::linearAlgebra::MultiVector & nodalData, const std::pair cellRange) const; @@ -780,10 +715,9 @@ namespace dftfe void extractToCellNodalDataKernel( const dftfe::linearAlgebra::MultiVector - & nodalData, - ValueTypeBasisCoeff * cellNodalDataPtr, - const std::pair cellRange) const; + memorySpace> &nodalData, + ValueTypeBasisCoeff * cellNodalDataPtr, + const std::pair cellRange) const; // FIXME Untested function /** @@ -797,318 +731,13 @@ namespace dftfe void accumulateFromCellNodalDataKernel( const ValueTypeBasisCoeff *cellNodalDataPtr, - dftfe::linearAlgebra::MultiVector - & nodalData, - const std::pair cellRange) const; - }; -#if defined(DFTFE_WITH_DEVICE) - template - class FEBasisOperations - : public FEBasisOperationsBase - { - public: - using FEBasisOperationsBase< - ValueTypeBasisCoeff, - ValueTypeBasisData, - dftfe::utils::MemorySpace::DEVICE>::FEBasisOperationsBase; - using FEBasisOperationsBase::d_nCells; - using FEBasisOperationsBase< - ValueTypeBasisCoeff, - ValueTypeBasisData, - dftfe::utils::MemorySpace::DEVICE>::d_localSize; - using FEBasisOperationsBase< - ValueTypeBasisCoeff, - ValueTypeBasisData, - dftfe::utils::MemorySpace::DEVICE>::d_locallyOwnedSize; - using FEBasisOperationsBase< - ValueTypeBasisCoeff, - ValueTypeBasisData, - dftfe::utils::MemorySpace::DEVICE>::tempCellNodalData; - using FEBasisOperationsBase< - ValueTypeBasisCoeff, - ValueTypeBasisData, - dftfe::utils::MemorySpace::DEVICE>::d_BLASWrapperPtr; - using FEBasisOperationsBase< - ValueTypeBasisCoeff, - ValueTypeBasisData, - dftfe::utils::MemorySpace::DEVICE>::tempQuadratureGradientsData; - using FEBasisOperationsBase:: - tempQuadratureGradientsDataNonAffine; - using FEBasisOperationsBase< - ValueTypeBasisCoeff, - ValueTypeBasisData, - dftfe::utils::MemorySpace::DEVICE>::d_nVectors; - using 
FEBasisOperationsBase< - ValueTypeBasisCoeff, - ValueTypeBasisData, - dftfe::utils::MemorySpace::DEVICE>::d_cellsBlockSize; - using FEBasisOperationsBase< - ValueTypeBasisCoeff, - ValueTypeBasisData, - dftfe::utils::MemorySpace::DEVICE>::d_quadratureID; - using FEBasisOperationsBase< - ValueTypeBasisCoeff, - ValueTypeBasisData, - dftfe::utils::MemorySpace::DEVICE>::d_quadratureIndex; - using FEBasisOperationsBase< - ValueTypeBasisCoeff, - ValueTypeBasisData, - dftfe::utils::MemorySpace::DEVICE>::d_nQuadsPerCell; - using FEBasisOperationsBase< - ValueTypeBasisCoeff, - ValueTypeBasisData, - dftfe::utils::MemorySpace::DEVICE>::d_nDofsPerCell; - using FEBasisOperationsBase< - ValueTypeBasisCoeff, - ValueTypeBasisData, - dftfe::utils::MemorySpace::DEVICE>::areAllCellsAffine; - using FEBasisOperationsBase< - ValueTypeBasisCoeff, - ValueTypeBasisData, - dftfe::utils::MemorySpace::DEVICE>::areAllCellsCartesian; - using FEBasisOperationsBase< - ValueTypeBasisCoeff, - ValueTypeBasisData, - dftfe::utils::MemorySpace::DEVICE>::d_updateFlags; - using FEBasisOperationsBase< - ValueTypeBasisCoeff, - ValueTypeBasisData, - dftfe::utils::MemorySpace::DEVICE>::d_shapeFunctionData; - using FEBasisOperationsBase< - ValueTypeBasisCoeff, - ValueTypeBasisData, - dftfe::utils::MemorySpace::DEVICE>::d_shapeFunctionDataTranspose; - using FEBasisOperationsBase< - ValueTypeBasisCoeff, - ValueTypeBasisData, - dftfe::utils::MemorySpace::DEVICE>::d_shapeFunctionGradientData; - using FEBasisOperationsBase:: - d_shapeFunctionGradientDataTranspose; - using FEBasisOperationsBase:: - d_shapeFunctionGradientDataInternalLayout; - using FEBasisOperationsBase::d_JxWData; - using FEBasisOperationsBase< - ValueTypeBasisCoeff, - ValueTypeBasisData, - dftfe::utils::MemorySpace::DEVICE>::d_inverseJacobianData; - using FEBasisOperationsBase< - ValueTypeBasisCoeff, - ValueTypeBasisData, - dftfe::utils::MemorySpace::DEVICE>::d_cellIndexToCellIdMap; - using FEBasisOperationsBase< - ValueTypeBasisCoeff, - 
ValueTypeBasisData, - dftfe::utils::MemorySpace::DEVICE>::d_cellDofIndexToProcessDofIndexMap; - using FEBasisOperationsBase:: - d_flattenedCellDofIndexToProcessDofIndexMap; - using FEBasisOperationsBase< - ValueTypeBasisCoeff, - ValueTypeBasisData, - dftfe::utils::MemorySpace::DEVICE>::d_constraintsVector; - - // FIXME has to be removed in a future PR - /** - * @brief sets device blas handle for internal blas operations. - */ - dftfe::utils::deviceBlasHandle_t *d_deviceBlasHandlePtr; - void - setDeviceBLASHandle( - dftfe::utils::deviceBlasHandle_t *deviceBlasHandlePtr); - - // FIXME has to be removed in a future PR - /** - * @brief gets device blas handle for blas operations. - */ - dftfe::utils::deviceBlasHandle_t & - getDeviceBLASHandle(); - - - - /** - * @brief Interpolate process level nodal data to cell level quadrature data. - * @param[in] nodalData process level nodal data, the multivector should - * already have ghost data and constraints should have been applied. - * @param[out] quadratureValues Cell level quadrature values, indexed by - * [iCell * d_nQuadsPerCell * d_nVectors + iQuad * d_nVectors + iVec]. - * @param[out] quadratureGradients Cell level quadrature gradients, - * indexed by [iCell * 3 * d_nQuadsPerCell * d_nVectors + iDim * - * d_nQuadsPerCell * d_nVectors + iQuad * d_nVectors + iVec]. - */ - void - interpolate( - dftfe::linearAlgebra::MultiVector - & nodalData, - ValueTypeBasisCoeff *quadratureValues, - ValueTypeBasisCoeff *quadratureGradients = NULL) const; - - - // FIXME Untested function - /** - * @brief Integrate cell level quadrature data times shape functions to process level nodal data. - * @param[in] quadratureValues Cell level quadrature values, indexed by - * [iCell * d_nQuadsPerCell * d_nVectors + iQuad * d_nVectors + iVec]. - * @param[in] quadratureGradients Cell level quadrature gradients, - * indexed by [iCell * 3 * d_nQuadsPerCell * d_nVectors + iDim * - * d_nQuadsPerCell * d_nVectors + iQuad * d_nVectors + iVec]. 
- * @param[out] nodalData process level nodal data. - */ - void - integrateWithBasis( - ValueTypeBasisCoeff *quadratureValues, - ValueTypeBasisCoeff *quadratureGradients, - dftfe::linearAlgebra::MultiVector - &nodalData) const; - - /** - * @brief Get cell level nodal data from process level nodal data. - * @param[in] nodalData process level nodal data, the multivector should - * already have ghost data and constraints should have been applied. - * @param[out] cellNodalDataPtr Cell level nodal values, indexed by - * [iCell * d_nDofsPerCell * d_nVectors + iDoF * d_nVectors + iVec]. - */ - void - extractToCellNodalData( - dftfe::linearAlgebra::MultiVector - & nodalData, - ValueTypeBasisCoeff *cellNodalDataPtr) const; - - // FIXME Untested function - /** - * @brief Accumulate cell level nodal data into process level nodal data. - * @param[in] cellNodalDataPtr Cell level nodal values, indexed by - * [iCell * d_nDofsPerCell * d_nVectors + iDoF * d_nVectors + iVec]. - * @param[out] nodalData process level nodal data. - */ - void - accumulateFromCellNodalData( - const ValueTypeBasisCoeff *cellNodalDataPtr, - dftfe::linearAlgebra::MultiVector - &nodalData) const; - - /** - * @brief Interpolate process level nodal data to cell level quadrature data. - * @param[in] nodalData process level nodal data, the multivector should - * already have ghost data and constraints should have been applied. - * @param[out] quadratureValues Cell level quadrature values, indexed by - * [iCell * d_nQuadsPerCell * d_nVectors + iQuad * d_nVectors + iVec]. - * @param[out] quadratureGradients Cell level quadrature gradients, - * indexed by [iCell * 3 * d_nQuadsPerCell * d_nVectors + iDim * - * d_nQuadsPerCell * d_nVectors + iQuad * d_nVectors + iVec]. - * @param[in] cellRange the range of cells for which interpolation has to - * be done. 
- */ - void - interpolateKernel( - const dftfe::linearAlgebra::MultiVector< - ValueTypeBasisCoeff, - dftfe::utils::MemorySpace::DEVICE> & nodalData, - ValueTypeBasisCoeff * quadratureValues, - ValueTypeBasisCoeff * quadratureGradients, - const std::pair cellRange) const; - - /** - * @brief Interpolate cell level nodal data to cell level quadrature data. - * @param[in] nodalData cell level nodal data, the multivector should - * already have ghost data and constraints should have been applied. - * @param[out] quadratureValues Cell level quadrature values, indexed by - * [iCell * d_nQuadsPerCell * d_nVectors + iQuad * d_nVectors + iVec]. - * @param[out] quadratureGradients Cell level quadrature gradients, - * indexed by [iCell * 3 * d_nQuadsPerCell * d_nVectors + iDim * - * d_nQuadsPerCell * d_nVectors + iQuad * d_nVectors + iVec]. - * @param[in] cellRange the range of cells for which interpolation has to - * be done. - */ - void - interpolateKernel( - const ValueTypeBasisCoeff * nodalData, - ValueTypeBasisCoeff * quadratureValues, - ValueTypeBasisCoeff * quadratureGradients, - const std::pair cellRange) const; - - // FIXME Untested function - /** - * @brief Integrate cell level quadrature data times shape functions to process level nodal data. - * @param[in] quadratureValues Cell level quadrature values, indexed by - * [iCell * d_nQuadsPerCell * d_nVectors + iQuad * d_nVectors + iVec]. - * @param[in] quadratureGradients Cell level quadrature gradients, - * indexed by [iCell * 3 * d_nQuadsPerCell * d_nVectors + iDim * - * d_nQuadsPerCell * d_nVectors + iQuad * d_nVectors + iVec]. - * @param[out] nodalData process level nodal data. - * @param[in] cellRange the range of cells for which integration has to be - * done. 
- */ - void - integrateWithBasisKernel( - const ValueTypeBasisCoeff *quadratureValues, - const ValueTypeBasisCoeff *quadratureGradients, - dftfe::linearAlgebra::MultiVector - & nodalData, - const std::pair cellRange) const; - - - /** - * @brief Get cell level nodal data from process level nodal data. - * @param[in] nodalData process level nodal data, the multivector should - * already have ghost data and constraints should have been applied. - * @param[out] cellNodalDataPtr Cell level nodal values, indexed by - * [iCell * d_nDofsPerCell * d_nVectors + iDoF * d_nVectors + iVec]. - * @param[in] cellRange the range of cells for which extraction has to be - * done. - */ - void - extractToCellNodalDataKernel( - const dftfe::linearAlgebra::MultiVector< - ValueTypeBasisCoeff, - dftfe::utils::MemorySpace::DEVICE> & nodalData, - ValueTypeBasisCoeff * cellNodalDataPtr, - const std::pair cellRange) const; - - // FIXME Untested function - /** - * @brief Accumulate cell level nodal data into process level nodal data. - * @param[in] cellNodalDataPtr Cell level nodal values, indexed by - * [iCell * d_nDofsPerCell * d_nVectors + iDoF * d_nVectors + iVec]. - * @param[out] nodalData process level nodal data. - * @param[in] cellRange the range of cells for which extraction has to be - * done. 
- */ - void - accumulateFromCellNodalDataKernel( - const ValueTypeBasisCoeff *cellNodalDataPtr, - dftfe::linearAlgebra::MultiVector + dftfe::linearAlgebra::MultiVector & nodalData, const std::pair cellRange) const; }; -#endif } // end of namespace basis } // end of namespace dftfe -#include "../utils/FEBasisOperations.t.cc" -#include "../utils/FEBasisOperationsHost.t.cc" -#if defined(DFTFE_WITH_DEVICE) -# include "../utils/FEBasisOperationsDevice.t.cc" -#endif +// #include "../utils/FEBasisOperations.t.cc" +// #include "../utils/FEBasisOperationsKernels.t.cc" #endif // dftfeBasisOperations_h diff --git a/include/FEBasisOperationsKernelsDevice.h b/include/FEBasisOperationsKernelsDevice.h deleted file mode 100644 index 8a38c53a8..000000000 --- a/include/FEBasisOperationsKernelsDevice.h +++ /dev/null @@ -1,52 +0,0 @@ -// --------------------------------------------------------------------- -// -// Copyright (c) 2017-2022 The Regents of the University of Michigan and DFT-FE -// authors. -// -// This file is part of the DFT-FE code. -// -// The DFT-FE code is free software; you can use it, redistribute -// it, and/or modify it under the terms of the GNU Lesser General -// Public License as published by the Free Software Foundation; either -// version 2.1 of the License, or (at your option) any later version. -// The full text of the license can be found in the file LICENSE at -// the top level of the DFT-FE distribution. -// -// --------------------------------------------------------------------- - -#ifndef dftfeFEBasisOperationsKernelsDevice_h -#define dftfeFEBasisOperationsKernelsDevice_h - -#ifdef DFTFE_WITH_DEVICE -# include - -namespace dftfe -{ - namespace basis - { - namespace FEBasisOperationsKernelsDevice - { - /** - * @brief rehsape gradient data from [iCell * 3 * d_nQuadsPerCell * d_nVectors + iQuad * 3 * d_nVectors + iDim * d_nVectors + iVec] to [iCell * 3 * d_nQuadsPerCell * d_nVectors + iDim * - * d_nQuadsPerCell * d_nVectors + iQuad * d_nVectors + iVec]. 
- * @param[in] numVecs number of vectors. - * @param[in] numQuads number of quadrature points per cell. - * @param[in] numCells number of locally owned cells. - * @param[in] copyFromVec source data pointer. - * @param[out] copyToVec destination data pointer. - */ - template - void - reshapeNonAffineCase(const dftfe::size_type numVecs, - const dftfe::size_type numQuads, - const dftfe::size_type numCells, - const ValueType1 * copyFromVec, - ValueType2 * copyToVec); - - - }; // namespace FEBasisOperationsKernelsDevice - } // namespace basis -} // namespace dftfe - -#endif // DFTFE_WITH_DEVICE -#endif // dftfeFEBasisOperationsKernelsDevice_h diff --git a/include/FEBasisOperationsKernelsInternal.h b/include/FEBasisOperationsKernelsInternal.h new file mode 100644 index 000000000..9aff3c66d --- /dev/null +++ b/include/FEBasisOperationsKernelsInternal.h @@ -0,0 +1,88 @@ +// --------------------------------------------------------------------- +// +// Copyright (c) 2017-2022 The Regents of the University of Michigan and DFT-FE +// authors. +// +// This file is part of the DFT-FE code. +// +// The DFT-FE code is free software; you can use it, redistribute +// it, and/or modify it under the terms of the GNU Lesser General +// Public License as published by the Free Software Foundation; either +// version 2.1 of the License, or (at your option) any later version. +// The full text of the license can be found in the file LICENSE at +// the top level of the DFT-FE distribution. 
+// +// --------------------------------------------------------------------- + +#ifndef dftfeFEBasisOperationsKernelsInternal_h +#define dftfeFEBasisOperationsKernelsInternal_h + +#ifdef DFTFE_WITH_DEVICE +# include +# include +# include +# include +# include +# include +namespace dftfe +{ + namespace basis + { + namespace FEBasisOperationsKernelsInternal + { + /** + * @brief rehsape gradient data from [iCell * 3 * d_nQuadsPerCell * d_nVectors + iQuad * 3 * d_nVectors + iDim * d_nVectors + iVec] to [iCell * 3 * d_nQuadsPerCell * d_nVectors + iDim * + * d_nQuadsPerCell * d_nVectors + iQuad * d_nVectors + iVec]. + * @param[in] numVecs number of vectors. + * @param[in] numQuads number of quadrature points per cell. + * @param[in] numCells number of locally owned cells. + * @param[in] copyFromVec source data pointer. + * @param[out] copyToVec destination data pointer. + */ + template + void + reshapeFromNonAffineLayoutDevice(const dftfe::size_type numVecs, + const dftfe::size_type numQuads, + const dftfe::size_type numCells, + const ValueType * copyFromVec, + ValueType * copyToVec); + + template + void + reshapeFromNonAffineLayoutHost(const dftfe::size_type numVecs, + const dftfe::size_type numQuads, + const dftfe::size_type numCells, + const ValueType * copyFromVec, + ValueType * copyToVec); + + /** + * @brief rehsape gradient data to [iCell * 3 * d_nQuadsPerCell * d_nVectors + iQuad * 3 * d_nVectors + iDim * d_nVectors + iVec] from [iCell * 3 * d_nQuadsPerCell * d_nVectors + iDim * + * d_nQuadsPerCell * d_nVectors + iQuad * d_nVectors + iVec]. + * @param[in] numVecs number of vectors. + * @param[in] numQuads number of quadrature points per cell. + * @param[in] numCells number of locally owned cells. + * @param[in] copyFromVec source data pointer. + * @param[out] copyToVec destination data pointer. 
+ */ + template + void + reshapeToNonAffineLayoutDevice(const dftfe::size_type numVecs, + const dftfe::size_type numQuads, + const dftfe::size_type numCells, + const ValueType * copyFromVec, + ValueType * copyToVec); + + template + void + reshapeToNonAffineLayoutHost(const dftfe::size_type numVecs, + const dftfe::size_type numQuads, + const dftfe::size_type numCells, + const ValueType * copyFromVec, + ValueType * copyToVec); + + } // namespace FEBasisOperationsKernelsInternal + } // namespace basis +} // namespace dftfe + +#endif // DFTFE_WITH_DEVICE +#endif // dftfeFEBasisOperationsKernelsInternal_h diff --git a/include/densityCalculator.h b/include/densityCalculator.h index 8b95986f5..5f73db5a8 100644 --- a/include/densityCalculator.h +++ b/include/densityCalculator.h @@ -38,7 +38,9 @@ namespace dftfe const double fermiEnergyDown, std::shared_ptr< dftfe::basis::FEBasisOperations> - & basisOperationsPtr, + &basisOperationsPtr, + std::shared_ptr> + & BLASWrapperPtr, const unsigned int matrixFreeDofhandlerIndex, const unsigned int quadratureIndex, const std::vector &kPointWeights, @@ -61,7 +63,10 @@ namespace dftfe std::shared_ptr< dftfe::basis:: FEBasisOperations> - & basisOperationsPtr, + &basisOperationsPtr, + std::shared_ptr< + dftfe::linearAlgebra::BLASWrapper> + & BLASWrapperPtr, const std::pair cellRange, const std::pair vecRange, double * partialOccupVec, @@ -81,7 +86,10 @@ namespace dftfe dftfe::basis::FEBasisOperations> - & basisOperationsPtr, + &basisOperationsPtr, + std::shared_ptr< + dftfe::linearAlgebra::BLASWrapper> + & BLASWrapperPtr, const std::pair cellRange, const std::pair vecRange, double * partialOccupVec, diff --git a/include/dft.h b/include/dft.h index e5a26978c..171274f2f 100644 --- a/include/dft.h +++ b/include/dft.h @@ -648,16 +648,6 @@ namespace dftfe & quadratureGradValueData, const bool isConsiderGradData = false); - /** - *@brief subtract atomic densities at quadrature points - * - */ - void - subtractAtomicRhoQuadValuesGradients( - 
std::map> &quadratureValueData, - std::map> &quadratureGradValueData, - const bool isConsiderGradData = false); - /** *@brief Finds the global dof ids of the nodes containing atoms. @@ -782,12 +772,6 @@ namespace dftfe /** *@brief Computes total charge by integrating the electron-density */ - double - totalCharge(const dealii::DoFHandler<3> & dofHandlerOfField, - const distributedCPUVec &rhoNodalField, - std::map> &rhoQuadValues); - - double totalCharge(const dealii::DoFHandler<3> & dofHandlerOfField, const distributedCPUVec &rhoNodalField); @@ -810,10 +794,6 @@ namespace dftfe const distributedCPUVec & rhoNodalField); - void - dipole(const dealii::DoFHandler<3> &dofHandlerOfField, - const std::map> *rhoQuadValues, - bool centerofCharge); double rhofieldl2Norm(const dealii::MatrixFree<3, double> &matrixFreeDataObject, @@ -834,18 +814,6 @@ namespace dftfe fieldGradl2Norm(const dealii::MatrixFree<3, double> &matrixFreeDataObject, const distributedCPUVec & field); - /** - *@brief l2 projection - */ - void - l2ProjectionQuadToNodal( - const dealii::MatrixFree<3, double> & matrixFreeDataObject, - const dealii::AffineConstraints & constraintMatrix, - const unsigned int dofHandlerId, - const unsigned int quadratureId, - const std::map> &quadratureValueData, - distributedCPUVec & nodalField); - /** *@brief l2 projection */ diff --git a/include/kohnShamDFTOperator.h b/include/kohnShamDFTOperator.h index 39bba1b7f..263bf5d09 100644 --- a/include/kohnShamDFTOperator.h +++ b/include/kohnShamDFTOperator.h @@ -433,9 +433,6 @@ node is stored distributedCPUMultiVec & getParallelProjectorKetTimesBlockVector(); - const std::vector & - getShapeFunctionValuesDensityGaussQuad() const; - const std::vector & getShapeFunctionGradValuesDensityGaussQuad() const; diff --git a/include/kohnShamDFTOperatorDevice.h b/include/kohnShamDFTOperatorDevice.h index 42f524f74..2399ada00 100644 --- a/include/kohnShamDFTOperatorDevice.h +++ b/include/kohnShamDFTOperatorDevice.h @@ -72,12 +72,6 @@ 
namespace dftfe distributedCPUVec & getProjectorKetTimesVectorSingle(); - dftfe::utils::MemoryStorage & - getShapeFunctionGradientIntegral(); - - dftfe::utils::MemoryStorage & - getShapeFunctionGradientIntegralElectro(); - const dftfe::utils::MemoryStorage & getShapeFunctionValues(); diff --git a/include/linearAlgebraOperations.h b/include/linearAlgebraOperations.h index b35aeb4a3..b10a9d486 100644 --- a/include/linearAlgebraOperations.h +++ b/include/linearAlgebraOperations.h @@ -45,6 +45,45 @@ namespace dftfe const double * beta, double * C, const unsigned int *INCY); + + void + sgemv_(const char * TRANS, + const unsigned int *M, + const unsigned int *N, + const float * alpha, + const float * A, + const unsigned int *LDA, + const float * X, + const unsigned int *INCX, + const float * beta, + float * C, + const unsigned int *INCY); + + void + zgemv_(const char * TRANS, + const unsigned int * M, + const unsigned int * N, + const std::complex *alpha, + const std::complex *A, + const unsigned int * LDA, + const std::complex *X, + const unsigned int * INCX, + const std::complex *beta, + std::complex * C, + const unsigned int * INCY); + + void + cgemv_(const char * TRANS, + const unsigned int * M, + const unsigned int * N, + const std::complex *alpha, + const std::complex *A, + const unsigned int * LDA, + const std::complex *X, + const unsigned int * INCX, + const std::complex *beta, + std::complex * C, + const unsigned int * INCY); void dsymv_(const char * UPLO, const unsigned int *N, @@ -337,6 +376,14 @@ namespace dftfe const unsigned int * incx, std::complex * y, const unsigned int * incy); + + void + ccopy_(const unsigned int * n, + const std::complex *x, + const unsigned int * incx, + std::complex * y, + const unsigned int * incy); + std::complex zdotc_(const unsigned int * N, const std::complex *X, diff --git a/include/operator.h b/include/operator.h index 3c74807d9..f4c495c55 100644 --- a/include/operator.h +++ b/include/operator.h @@ -263,10 +263,6 @@ namespace 
dftfe virtual distributedCPUMultiVec & getParallelProjectorKetTimesBlockVector() = 0; - - virtual const std::vector & - getShapeFunctionValuesDensityGaussQuad() const = 0; - virtual const std::vector & getShapeFunctionGradValuesDensityGaussQuad() const = 0; diff --git a/include/operatorDevice.h b/include/operatorDevice.h index 09daee7bf..933c9a267 100644 --- a/include/operatorDevice.h +++ b/include/operatorDevice.h @@ -77,14 +77,6 @@ namespace dftfe virtual distributedCPUVec & getProjectorKetTimesVectorSingle() = 0; - virtual dftfe::utils::MemoryStorage & - getShapeFunctionGradientIntegral() = 0; - - virtual dftfe::utils::MemoryStorage & - getShapeFunctionGradientIntegralElectro() = 0; - virtual const dftfe::utils::MemoryStorage & @@ -405,12 +397,6 @@ namespace dftfe // const dealii::MatrixFree<3, double> *d_matrix_free_data; - dftfe::utils::MemoryStorage - d_cellShapeFunctionGradientIntegralFlattenedDevice; - - dftfe::utils::MemoryStorage - d_cellShapeFunctionGradientIntegralFlattenedDeviceElectro; - dftfe::utils::MemoryStorage d_shapeFunctionGradientValueNLPTransposedDevice; diff --git a/include/solveVselfInBinsDevice.h b/include/solveVselfInBinsDevice.h index 8908efd49..6d69c68cd 100644 --- a/include/solveVselfInBinsDevice.h +++ b/include/solveVselfInBinsDevice.h @@ -21,15 +21,19 @@ # include # include -# include - +# include namespace dftfe { namespace poissonDevice { void solveVselfInBins( - operatorDFTDeviceClass & operatorMatrix, + const dftfe::utils::MemoryStorage + &cellGradNIGradNJIntergralDevice, + const std::shared_ptr< + dftfe::linearAlgebra::BLASWrapper> + & BLASWrapperPtr, const dealii::MatrixFree<3, double> & matrixFreeData, const unsigned int mfDofHandlerIndex, const dealii::AffineConstraints &hangingPeriodicConstraintMatrix, diff --git a/include/vselfBinsManager.h b/include/vselfBinsManager.h index c9d614eb1..d1ef26d48 100644 --- a/include/vselfBinsManager.h +++ b/include/vselfBinsManager.h @@ -172,11 +172,16 @@ namespace dftfe const 
std::shared_ptr< dftfe::basis:: FEBasisOperations> - & basisOperationsPtr, - const unsigned int mfBaseDofHandlerIndex, - const unsigned int matrixFreeQuadratureIdAX, - const unsigned int offset, - operatorDFTDeviceClass & operatorMatrix, + & basisOperationsPtr, + const unsigned int mfBaseDofHandlerIndex, + const unsigned int matrixFreeQuadratureIdAX, + const unsigned int offset, + const dftfe::utils::MemoryStorage + &cellGradNIGradNJIntergralDevice, + const std::shared_ptr< + dftfe::linearAlgebra::BLASWrapper> + & BLASWrapperPtr, const dealii::AffineConstraints &hangingPeriodicConstraintMatrix, const std::vector> & imagePositions, const std::vector & imageIds, @@ -216,7 +221,12 @@ namespace dftfe const unsigned int matrixFreeQuadratureIdAX, const unsigned int offset, # ifdef DFTFE_WITH_DEVICE - operatorDFTDeviceClass &operatorMatrix, + const dftfe::utils::MemoryStorage + &cellGradNIGradNJIntergralDevice, + const std::shared_ptr< + dftfe::linearAlgebra::BLASWrapper> + &BLASWrapperPtr, # endif const dealii::AffineConstraints &hangingPeriodicConstraintMatrix, const std::vector> & imagePositions, diff --git a/src/dft/atomicRho.cc b/src/dft/atomicRho.cc index 3e57e992f..3311996d6 100644 --- a/src/dft/atomicRho.cc +++ b/src/dft/atomicRho.cc @@ -539,51 +539,6 @@ namespace dftfe } } - // - // - template - void - dftClass::subtractAtomicRhoQuadValuesGradients( - std::map> &quadratureValueData, - std::map> &quadratureGradValueData, - const bool isConsiderGradData) - { - const dealii::Quadrature<3> &quadrature_formula = - matrix_free_data.get_quadrature(d_densityQuadratureId); - const unsigned int n_q_points = quadrature_formula.size(); - - dealii::DoFHandler<3>::active_cell_iterator cell = - dofHandler.begin_active(), - endc = dofHandler.end(); - for (; cell != endc; ++cell) - if (cell->is_locally_owned()) - { - std::vector &rhoValues = - quadratureValueData.find(cell->id())->second; - const std::vector &rhoAtomicValues = - d_rhoAtomsValues.find(cell->id())->second; - for 
(unsigned int q_point = 0; q_point < n_q_points; ++q_point) - rhoValues[q_point] -= rhoAtomicValues[q_point]; - - if (isConsiderGradData) - { - std::vector &gradRhoValues = - quadratureGradValueData.find(cell->id())->second; - const std::vector &gradRhoAtomicValues = - d_gradRhoAtomsValues.find(cell->id())->second; - for (unsigned int q_point = 0; q_point < n_q_points; ++q_point) - { - gradRhoValues[3 * q_point + 0] -= - gradRhoAtomicValues[3 * q_point + 0]; - gradRhoValues[3 * q_point + 1] -= - gradRhoAtomicValues[3 * q_point + 1]; - gradRhoValues[3 * q_point + 2] -= - gradRhoAtomicValues[3 * q_point + 2]; - } - } - } - } - // // compute l2 projection of quad data to nodal data // diff --git a/src/dft/charge.cc b/src/dft/charge.cc index 99ac277ce..9e098ca89 100644 --- a/src/dft/charge.cc +++ b/src/dft/charge.cc @@ -135,49 +135,6 @@ namespace dftfe return dealii::Utilities::MPI::sum(normValue, mpi_communicator); } - // - // compute total charge using nodal point values by filling the quadrature - // point values of the nodal field - // - template - double - dftClass::totalCharge( - const dealii::DoFHandler<3> & dofHandlerOfField, - const distributedCPUVec & rhoNodalField, - std::map> &rhoQuadValues) - { - double normValue = 0.0; - const dealii::Quadrature<3> &quadrature_formula = - matrix_free_data.get_quadrature(d_densityQuadratureId); - dealii::FEValues<3> fe_values(dofHandlerOfField.get_fe(), - quadrature_formula, - dealii::update_values | - dealii::update_JxW_values); - const unsigned int dofs_per_cell = dofHandlerOfField.get_fe().dofs_per_cell; - const unsigned int n_q_points = quadrature_formula.size(); - rhoNodalField.update_ghost_values(); - std::vector tempRho(n_q_points); - - dealii::DoFHandler<3>::active_cell_iterator cell = dofHandlerOfField - .begin_active(), - endc = dofHandlerOfField.end(); - for (; cell != endc; ++cell) - { - if (cell->is_locally_owned()) - { - fe_values.reinit(cell); - fe_values.get_function_values(rhoNodalField, tempRho); - 
rhoQuadValues[cell->id()].resize(n_q_points); - for (unsigned int q_point = 0; q_point < n_q_points; ++q_point) - { - rhoQuadValues[cell->id()][q_point] = tempRho[q_point]; - normValue += tempRho[q_point] * fe_values.JxW(q_point); - } - } - } - return dealii::Utilities::MPI::sum(normValue, mpi_communicator); - } - // // compute total charge using nodal point values by using FEEvaluation object // @@ -402,142 +359,6 @@ namespace dftfe return dealii::Utilities::MPI::sum(value, mpi_communicator); } - - -/*template -void -dftClass::dipole( - const dealii::DoFHandler<3> & dofHandlerOfField, - const std::map> *rhoQuadValues, - bool centerofCharge) -{ - std::vector dipolevector(3, 0.0); - pcout << " Here!! " << dipolevector.size() << std::endl; - const dealii::Quadrature<3> &quadrature_formula = - matrix_free_data.get_quadrature(d_densityQuadratureId); - dealii::FEValues<3> fe_values(dofHandlerOfField.get_fe(), - quadrature_formula, - dealii::update_values | dealii::update_JxW_values | - dealii::update_quadrature_points); - const unsigned int dofs_per_cell = dofHandlerOfField.get_fe().dofs_per_cell; - const unsigned int n_q_points = quadrature_formula.size(); - pcout << "Setting of qpoints and ndofs" << std::endl; - dealii::DoFHandler<3>::active_cell_iterator cell = -dofHandlerOfField.begin_active(), endc = dofHandlerOfField.end(); if -(!centerofCharge) - - { - for (; cell != endc; ++cell) - { - if (cell->is_locally_owned()) - { - fe_values.reinit(cell); - const std::vector &rhoValues = - (*rhoQuadValues).find(cell->id())->second; - for (unsigned int q_point = 0; q_point < n_q_points; ++q_point) - { - if (!d_dftParamsPtr->periodicX) - dipolevector[0] += rhoValues[q_point] * - fe_values.JxW(q_point) * - fe_values.quadrature_point(q_point)[0]; - if (!d_dftParamsPtr->periodicY) - dipolevector[1] += rhoValues[q_point] * - fe_values.JxW(q_point) * - fe_values.quadrature_point(q_point)[1]; - if (!d_dftParamsPtr->periodicZ) - dipolevector[2] += rhoValues[q_point] * - 
fe_values.JxW(q_point) * - fe_values.quadrature_point(q_point)[2]; - } - } - } - MPI_Allreduce(MPI_IN_PLACE, - &dipolevector[0], - 3, - MPI_DOUBLE, - MPI_SUM, - mpi_communicator); - pcout << "Electron dipole moment: " << dipolevector[0] << " " - << dipolevector[1] << " " << dipolevector[2] << std::endl; - - for (int iAtom = 0; iAtom < atomLocations.size(); iAtom++) - { - if (!d_dftParamsPtr->periodicX) - dipolevector[0] += - -atomLocations[iAtom][1] * atomLocations[iAtom][2]; - if (!d_dftParamsPtr->periodicY) - dipolevector[1] += - -atomLocations[iAtom][1] * atomLocations[iAtom][3]; - if (!d_dftParamsPtr->periodicZ) - dipolevector[2] += - -atomLocations[iAtom][1] * atomLocations[iAtom][4]; - } - pcout << "Total dipole moment: " << dipolevector[0] << " " - << dipolevector[1] << " " << dipolevector[2] << std::endl; - } - else - { - std::vector COM(3, 0.0); - double Mass = 0.0; - for (int iAtom = 0; iAtom < atomLocations.size(); iAtom++) - { - COM[0] += -atomLocations[iAtom][1] * (atomLocations[iAtom][2]); - COM[1] += -atomLocations[iAtom][1] * (atomLocations[iAtom][3]); - COM[2] += -atomLocations[iAtom][1] * (atomLocations[iAtom][4]); - Mass += -atomLocations[iAtom][1]; - } - COM[0] /= Mass; - COM[1] /= Mass; - COM[2] /= Mass; - for (; cell != endc; ++cell) - { - if (cell->is_locally_owned()) - { - fe_values.reinit(cell); - const std::vector &rhoValues = - (*rhoQuadValues).find(cell->id())->second; - for (unsigned int q_point = 0; q_point < n_q_points; ++q_point) - { - if (!d_dftParamsPtr->periodicX) - dipolevector[0] += - rhoValues[q_point] * fe_values.JxW(q_point) * - (fe_values.quadrature_point(q_point)[0] - COM[0]); - if (!d_dftParamsPtr->periodicY) - dipolevector[1] += - rhoValues[q_point] * fe_values.JxW(q_point) * - (fe_values.quadrature_point(q_point)[1] - COM[1]); - if (!d_dftParamsPtr->periodicZ) - dipolevector[2] += - rhoValues[q_point] * fe_values.JxW(q_point) * - (fe_values.quadrature_point(q_point)[2] - COM[2]); - } - } - } - MPI_Allreduce(MPI_IN_PLACE, 
- &dipolevector[0], - 3, - MPI_DOUBLE, - MPI_SUM, - mpi_communicator); - pcout << "Electron dipole moment wrt COM: " << dipolevector[0] << " " - << dipolevector[1] << " " << dipolevector[2] << std::endl; - - for (int iAtom = 0; iAtom < atomLocations.size(); iAtom++) - { - if (!d_dftParamsPtr->periodicX) - dipolevector[0] += - -atomLocations[iAtom][1] * (atomLocations[iAtom][2] - COM[0]); - if (!d_dftParamsPtr->periodicY) - dipolevector[1] += - -atomLocations[iAtom][1] * (atomLocations[iAtom][3] - COM[1]); - if (!d_dftParamsPtr->periodicZ) - dipolevector[2] += - -atomLocations[iAtom][1] * (atomLocations[iAtom][4] - COM[2]); - } - pcout << "Total dipole moment wrt COM: " << dipolevector[0] << " " - << dipolevector[1] << " " << dipolevector[2] << std::endl; - } -} */ #include "dft.inst.cc" } // namespace dftfe diff --git a/src/dft/density.cc b/src/dft/density.cc index d5845e205..c5828e296 100644 --- a/src/dft/density.cc +++ b/src/dft/density.cc @@ -118,6 +118,7 @@ namespace dftfe fermiEnergyUp, fermiEnergyDown, d_basisOperationsPtrDevice, + d_BLASWrapperPtr, d_densityDofHandlerIndex, d_densityQuadratureId, d_kPointWeights, @@ -142,6 +143,7 @@ namespace dftfe fermiEnergyUp, fermiEnergyDown, d_basisOperationsPtrHost, + d_BLASWrapperPtrHost, d_densityDofHandlerIndex, d_densityQuadratureId, d_kPointWeights, @@ -359,6 +361,7 @@ namespace dftfe fermiEnergyUp, fermiEnergyDown, d_basisOperationsPtrDevice, + d_BLASWrapperPtr, d_densityDofHandlerIndex, d_gllQuadratureId, d_kPointWeights, @@ -382,6 +385,7 @@ namespace dftfe fermiEnergyUp, fermiEnergyDown, d_basisOperationsPtrHost, + d_BLASWrapperPtrHost, d_densityDofHandlerIndex, d_gllQuadratureId, d_kPointWeights, diff --git a/src/dft/densityCalculator.cc b/src/dft/densityCalculator.cc index d4ba99528..932dc15d0 100644 --- a/src/dft/densityCalculator.cc +++ b/src/dft/densityCalculator.cc @@ -47,7 +47,9 @@ namespace dftfe const double fermiEnergyDown, std::shared_ptr< dftfe::basis::FEBasisOperations> - & basisOperationsPtr, + 
&basisOperationsPtr, + std::shared_ptr> + & BLASWrapperPtr, const unsigned int matrixFreeDofhandlerIndex, const unsigned int quadratureIndex, const std::vector &kPointWeights, @@ -324,6 +326,7 @@ namespace dftfe ++spinIndex) computeRhoGradRhoFromInterpolatedValues( basisOperationsPtr, + BLASWrapperPtr, std::pair( startingCellId, startingCellId + currentCellsBlockSize), @@ -484,6 +487,7 @@ namespace dftfe ++spinIndex) computeRhoGradRhoFromInterpolatedValues( basisOperationsPtr, + BLASWrapperPtr, std::pair( startingCellId, startingCellId + currentCellsBlockSize), @@ -621,7 +625,10 @@ namespace dftfe std::shared_ptr< dftfe::basis:: FEBasisOperations> - & basisOperationsPtr, + &basisOperationsPtr, + std::shared_ptr< + dftfe::linearAlgebra::BLASWrapper> + & BLASWrapperPtr, const std::pair cellRange, const std::pair vecRange, double * partialOccupVec, @@ -697,7 +704,10 @@ namespace dftfe dftfe::basis::FEBasisOperations> - & basisOperationsPtrDevice, + &basisOperationsPtrDevice, + std::shared_ptr< + dftfe::linearAlgebra::BLASWrapper> + & BLASWrapperPtr, const unsigned int matrixFreeDofhandlerIndex, const unsigned int quadratureIndex, const std::vector &kPointWeights, @@ -731,7 +741,10 @@ namespace dftfe dftfe::basis::FEBasisOperations> - & basisOperationsPtr, + &basisOperationsPtr, + std::shared_ptr< + dftfe::linearAlgebra::BLASWrapper> + & BLASWrapperPtr, const unsigned int matrixFreeDofhandlerIndex, const unsigned int quadratureIndex, const std::vector &kPointWeights, diff --git a/src/dft/densityCalculatorDeviceKernels.cc b/src/dft/densityCalculatorDeviceKernels.cc index 8b8ac25c1..8b9253d79 100644 --- a/src/dft/densityCalculatorDeviceKernels.cc +++ b/src/dft/densityCalculatorDeviceKernels.cc @@ -144,7 +144,10 @@ namespace dftfe dftfe::basis::FEBasisOperations> - & basisOperationsPtr, + &basisOperationsPtr, + std::shared_ptr< + dftfe::linearAlgebra::BLASWrapper> + & BLASWrapperPtr, const std::pair cellRange, const std::pair vecRange, double * partialOccupVec, @@ -194,36 
+197,32 @@ namespace dftfe dftfe::utils::makeDataTypeDeviceCompatible(gradRhoCellsWfcContributions), isEvaluateGradRho); #endif - dftfe::utils::deviceBlasWrapper::gemv( - basisOperationsPtr->getDeviceBLASHandle(), - dftfe::utils::DEVICEBLAS_OP_T, - vectorsBlockSize, - cellsBlockSize * nQuadsPerCell, - &scalarCoeffAlphaRho, - rhoCellsWfcContributions, - vectorsBlockSize, - partialOccupVec, - 1, - &scalarCoeffBetaRho, - rho + cellRange.first * nQuadsPerCell, - 1); + BLASWrapperPtr->xgemv('T', + vectorsBlockSize, + cellsBlockSize * nQuadsPerCell, + &scalarCoeffAlphaRho, + rhoCellsWfcContributions, + vectorsBlockSize, + partialOccupVec, + 1, + &scalarCoeffBetaRho, + rho + cellRange.first * nQuadsPerCell, + 1); if (isEvaluateGradRho) { - dftfe::utils::deviceBlasWrapper::gemv( - basisOperationsPtr->getDeviceBLASHandle(), - dftfe::utils::DEVICEBLAS_OP_T, - vectorsBlockSize, - cellsBlockSize * nQuadsPerCell * 3, - &scalarCoeffAlphaGradRho, - gradRhoCellsWfcContributions, - vectorsBlockSize, - partialOccupVec, - 1, - &scalarCoeffBetaGradRho, - gradRho + cellRange.first * nQuadsPerCell * 3, - 1); + BLASWrapperPtr->xgemv('T', + vectorsBlockSize, + cellsBlockSize * nQuadsPerCell * 3, + &scalarCoeffAlphaGradRho, + gradRhoCellsWfcContributions, + vectorsBlockSize, + partialOccupVec, + 1, + &scalarCoeffBetaGradRho, + gradRho + cellRange.first * nQuadsPerCell * 3, + 1); } } template void @@ -232,7 +231,10 @@ namespace dftfe dftfe::basis::FEBasisOperations> - & basisOperationsPtr, + &basisOperationsPtr, + std::shared_ptr< + dftfe::linearAlgebra::BLASWrapper> + & BLASWrapperPtr, const std::pair cellRange, const std::pair vecRange, double * partialOccupVec, diff --git a/src/dft/dft.cc b/src/dft/dft.cc index b2181e97f..3df43fdaf 100644 --- a/src/dft/dft.cc +++ b/src/dft/dft.cc @@ -1772,12 +1772,6 @@ namespace dftfe if (d_dftParamsPtr->writeLocalizationLengths) compute_localizationLength("localizationLengths.out"); - /*if (d_dftParamsPtr->computeDipoleMoment) - { - 
dipole(d_dofHandlerPRefined, rhoOutValues, false); - dipole(d_dofHandlerPRefined, rhoOutValues, true); - } */ - if (d_dftParamsPtr->verbosity >= 1) pcout << std::endl @@ -1843,8 +1837,6 @@ namespace dftfe if (initializeCublas) { kohnShamDFTEigenOperatorDevice.createDeviceBlasHandle(); - d_basisOperationsPtrDevice->setDeviceBLASHandle( - &(kohnShamDFTEigenOperatorDevice.getDeviceBlasHandle())); } AssertThrow( @@ -1945,9 +1937,6 @@ namespace dftfe d_kohnShamDFTOperatorDevicePtr->reinit( std::min(d_dftParamsPtr->chebyWfcBlockSize, d_numEigenValues), true); - - d_basisOperationsPtrDevice->setDeviceBLASHandle( - &(d_kohnShamDFTOperatorDevicePtr->getDeviceBlasHandle())); } #endif } @@ -2074,7 +2063,10 @@ namespace dftfe d_baseDofHandlerIndexElectro, d_phiTotAXQuadratureIdElectro, d_binsStartDofHandlerIndexElectro, - kohnShamDFTEigenOperatorDevice, + FEOrder == FEOrderElectro ? + d_basisOperationsPtrDevice->cellStiffnessMatrixBasisData() : + d_basisOperationsPtrElectroDevice->cellStiffnessMatrixBasisData(), + d_BLASWrapperPtr, d_constraintsPRefined, d_imagePositionsTrunc, d_imageIdsTrunc, @@ -3811,11 +3803,6 @@ namespace dftfe #endif ); } -#ifdef DFTFE_WITH_DEVICE - if (d_dftParamsPtr->useDevice) - d_basisOperationsPtrDevice->setDeviceBLASHandle( - &(d_kohnShamDFTOperatorDevicePtr->getDeviceBlasHandle())); -#endif forcePtr->computeStress(matrix_free_data, #ifdef DFTFE_WITH_DEVICE @@ -3903,13 +3890,6 @@ namespace dftfe false, d_dftParamsPtr->verbosity >= 4 ? true : false); -#ifdef DFTFE_WITH_DEVICE - if (d_dftParamsPtr->useDevice) - kohnShamDFTEigenOperatorDevice - .preComputeShapeFunctionGradientIntegrals(d_lpspQuadratureId, - true); -#endif - computing_timer.enter_subsection( "Nuclear self-potential perturbation solve"); @@ -3919,7 +3899,10 @@ namespace dftfe d_phiTotAXQuadratureIdElectro, d_binsStartDofHandlerIndexElectro, #ifdef DFTFE_WITH_DEVICE - kohnShamDFTEigenOperatorDevice, + FEOrder == FEOrderElectro ? 
+ d_basisOperationsPtrDevice->cellStiffnessMatrixBasisData() : + d_basisOperationsPtrElectroDevice->cellStiffnessMatrixBasisData(), + d_BLASWrapperPtr, #endif d_constraintsPRefined, d_imagePositionsTrunc, @@ -3955,13 +3938,6 @@ namespace dftfe false, d_dftParamsPtr->verbosity >= 4 ? true : false); -#ifdef DFTFE_WITH_DEVICE - if (d_dftParamsPtr->useDevice) - kohnShamDFTEigenOperatorDevice - .preComputeShapeFunctionGradientIntegrals(d_lpspQuadratureId, - true); -#endif - computing_timer.enter_subsection( "Nuclear self-potential perturbation solve"); @@ -3971,7 +3947,10 @@ namespace dftfe d_phiTotAXQuadratureIdElectro, d_binsStartDofHandlerIndexElectro, #ifdef DFTFE_WITH_DEVICE - kohnShamDFTEigenOperatorDevice, + FEOrder == FEOrderElectro ? + d_basisOperationsPtrDevice->cellStiffnessMatrixBasisData() : + d_basisOperationsPtrElectroDevice->cellStiffnessMatrixBasisData(), + d_BLASWrapperPtr, #endif d_constraintsPRefined, d_imagePositionsTrunc, @@ -4006,12 +3985,6 @@ namespace dftfe true, false, d_dftParamsPtr->verbosity >= 4 ? 
true : false); - -#ifdef DFTFE_WITH_DEVICE - if (d_dftParamsPtr->useDevice) - kohnShamDFTEigenOperatorDevice.preComputeShapeFunctionGradientIntegrals( - d_lpspQuadratureId, true); -#endif } // Output wfc diff --git a/src/dft/femUtilityFunctions.cc b/src/dft/femUtilityFunctions.cc index aae16dc34..b97786a1d 100644 --- a/src/dft/femUtilityFunctions.cc +++ b/src/dft/femUtilityFunctions.cc @@ -399,37 +399,6 @@ namespace dftfe return dealii::Utilities::MPI::sum(value, mpi_communicator); } - // - // compute l2 projection of quad data to nodal data - // - template - void - dftClass::l2ProjectionQuadToNodal( - const dealii::MatrixFree<3, double> & matrixFreeDataObject, - const dealii::AffineConstraints & constraintMatrix, - const unsigned int dofHandlerId, - const unsigned int quadratureId, - const std::map> &quadratureValueData, - distributedCPUVec & nodalField) - { - std::function< - double(const typename dealii::DoFHandler<3>::active_cell_iterator &cell, - const unsigned int q)> - funcRho = - [&](const typename dealii::DoFHandler<3>::active_cell_iterator &cell, - const unsigned int q) { - return quadratureValueData.find(cell->id())->second[q]; - }; - dealii::VectorTools::project<3, distributedCPUVec>( - dealii::MappingQ1<3, 3>(), - matrixFreeDataObject.get_dof_handler(dofHandlerId), - constraintMatrix, - matrixFreeDataObject.get_quadrature(quadratureId), - funcRho, - nodalField); - constraintMatrix.set_zero(nodalField); - nodalField.update_ghost_values(); - } template void dftClass::l2ProjectionQuadToNodal( diff --git a/src/dft/initBoundaryConditions.cc b/src/dft/initBoundaryConditions.cc index 26868affe..920be0981 100644 --- a/src/dft/initBoundaryConditions.cc +++ b/src/dft/initBoundaryConditions.cc @@ -294,6 +294,8 @@ namespace dftfe d_basisOperationsPtrHost->init(d_densityDofHandlerIndex, quadratureIndices, updateFlags); + d_basisOperationsPtrHost->computeCellStiffnessMatrix( + d_feOrderPlusOneQuadratureId, 1, true, false); } } if (!d_dftParamsPtr->useDevice && 
recomputeBasisData) @@ -336,6 +338,8 @@ namespace dftfe else d_basisOperationsPtrDevice->createScratchMultiVectors( BVec, (d_dftParamsPtr->spinPolarized + 1)); + d_basisOperationsPtrDevice->computeCellStiffnessMatrix( + d_feOrderPlusOneQuadratureId, 50, true, false); } else { @@ -361,6 +365,9 @@ namespace dftfe d_basisOperationsPtrDevice->init(d_densityDofHandlerIndex, quadratureIndices, updateFlags); + if (FEOrder != FEOrderElectro) + d_basisOperationsPtrDevice->computeCellStiffnessMatrix( + d_feOrderPlusOneQuadratureId, 50, true, false); } } else if (d_dftParamsPtr->useDevice) diff --git a/src/dft/pRefinedDoFHandler.cc b/src/dft/pRefinedDoFHandler.cc index c81daf6fd..ba27203e7 100644 --- a/src/dft/pRefinedDoFHandler.cc +++ b/src/dft/pRefinedDoFHandler.cc @@ -434,6 +434,9 @@ namespace dftfe d_BLASWrapperPtr); d_basisOperationsPtrElectroDevice->init( *d_basisOperationsPtrElectroHost); + if (FEOrder != FEOrderElectro) + d_basisOperationsPtrElectroDevice->computeCellStiffnessMatrix( + d_phiTotAXQuadratureIdElectro, 50, true, false); } else { @@ -447,6 +450,9 @@ namespace dftfe updateFlagsGradientsAndInvJacobians}; d_basisOperationsPtrElectroDevice->init( d_baseDofHandlerIndexElectro, quadratureIndices, updateFlags); + if (FEOrder != FEOrderElectro) + d_basisOperationsPtrElectroDevice->computeCellStiffnessMatrix( + d_phiTotAXQuadratureIdElectro, 50, true, false); } } #endif diff --git a/src/dft/solveNSCF.cc b/src/dft/solveNSCF.cc index a8b4cc549..ad3ac2d8e 100644 --- a/src/dft/solveNSCF.cc +++ b/src/dft/solveNSCF.cc @@ -79,7 +79,10 @@ namespace dftfe d_baseDofHandlerIndexElectro, d_phiTotAXQuadratureIdElectro, d_binsStartDofHandlerIndexElectro, - kohnShamDFTEigenOperatorDevice, + FEOrder == FEOrderElectro ? 
+ d_basisOperationsPtrDevice->cellStiffnessMatrixBasisData() : + d_basisOperationsPtrElectroDevice->cellStiffnessMatrixBasisData(), + d_BLASWrapperPtr, d_constraintsPRefined, d_imagePositionsTrunc, d_imageIdsTrunc, diff --git a/src/dft/solveVselfInBins.cc b/src/dft/solveVselfInBins.cc index 9acf114a0..1fa908a85 100644 --- a/src/dft/solveVselfInBins.cc +++ b/src/dft/solveVselfInBins.cc @@ -822,11 +822,15 @@ namespace dftfe const std::shared_ptr< dftfe::basis:: FEBasisOperations> - & basisOperationsPtr, - const unsigned int mfBaseDofHandlerIndex, - const unsigned int matrixFreeQuadratureIdAX, - const unsigned int offset, - operatorDFTDeviceClass & operatorMatrix, + & basisOperationsPtr, + const unsigned int mfBaseDofHandlerIndex, + const unsigned int matrixFreeQuadratureIdAX, + const unsigned int offset, + const dftfe::utils::MemoryStorage + &cellGradNIGradNJIntergralDevice, + const std::shared_ptr< + dftfe::linearAlgebra::BLASWrapper> + & BLASWrapperPtr, const dealii::AffineConstraints &hangingPeriodicConstraintMatrix, const std::vector> & imagePositions, const std::vector & imageIds, @@ -1400,7 +1404,8 @@ namespace dftfe // // Device poisson solve // - poissonDevice::solveVselfInBins(operatorMatrix, + poissonDevice::solveVselfInBins(cellGradNIGradNJIntergralDevice, + BLASWrapperPtr, matrix_free_data, mfBaseDofHandlerIndex, hangingPeriodicConstraintMatrix, @@ -1636,7 +1641,11 @@ namespace dftfe const unsigned int matrixFreeQuadratureIdAX, const unsigned int offset, #ifdef DFTFE_WITH_DEVICE - operatorDFTDeviceClass &operatorMatrix, + const dftfe::utils::MemoryStorage + &cellGradNIGradNJIntergralDevice, + const std::shared_ptr< + dftfe::linearAlgebra::BLASWrapper> + &BLASWrapperPtr, #endif const dealii::AffineConstraints &hangingPeriodicConstraintMatrix, const std::vector> & imagePositions, @@ -1668,7 +1677,8 @@ namespace dftfe mfBaseDofHandlerIndex, matrixFreeQuadratureIdAX, offset, - operatorMatrix, + cellGradNIGradNJIntergralDevice, + BLASWrapperPtr, 
hangingPeriodicConstraintMatrix, imagePositions, imageIds, diff --git a/src/dft/solveVselfInBinsDevice.cc b/src/dft/solveVselfInBinsDevice.cc index 3fc681a31..ee36aa725 100644 --- a/src/dft/solveVselfInBinsDevice.cc +++ b/src/dft/solveVselfInBinsDevice.cc @@ -404,7 +404,12 @@ namespace dftfe void solveVselfInBins( - operatorDFTDeviceClass & operatorMatrix, + const dftfe::utils::MemoryStorage + &cellGradNIGradNJIntergralDevice, + const std::shared_ptr< + dftfe::linearAlgebra::BLASWrapper> + & BLASWrapperPtr, const dealii::MatrixFree<3, double> & matrixFreeData, const unsigned int mfDofHandlerIndex, const dealii::AffineConstraints &hangingPeriodicConstraintMatrix, @@ -561,13 +566,11 @@ namespace dftfe << " poissonDevice::solveVselfInBins: time for mem allocation: " << time << std::endl; - cgSolver(operatorMatrix.getDeviceBlasHandle(), + cgSolver(BLASWrapperPtr->getDeviceBlasHandle(), constraintsMatrixDataInfoDevice, bD.begin(), diagonalAD.begin(), - isElectroFEOrderDifferentFromFEOrder ? - operatorMatrix.getShapeFunctionGradientIntegralElectro() : - operatorMatrix.getShapeFunctionGradientIntegral(), + cellGradNIGradNJIntergralDevice, inhomoIdsColoredVecFlattenedD, cellLocalProcIndexIdMapD, localSize, diff --git a/src/dftOperator/hamiltonianMatrixCalculatorFlattenedDevice.cc b/src/dftOperator/hamiltonianMatrixCalculatorFlattenedDevice.cc index 6ff668b2b..db9888a15 100644 --- a/src/dftOperator/hamiltonianMatrixCalculatorFlattenedDevice.cc +++ b/src/dftOperator/hamiltonianMatrixCalculatorFlattenedDevice.cc @@ -1258,7 +1258,7 @@ kohnShamDFTOperatorDeviceClass:: d_basisOperationsPtrDevice->shapeFunctionGradientBasisData().data(), d_basisOperationsPtrDevice->inverseJacobiansBasisData().data(), d_basisOperationsPtrDevice->cellsTypeFlag(), - d_cellShapeFunctionGradientIntegralFlattenedDevice.begin(), + d_basisOperationsPtrDevice->cellStiffnessMatrixBasisData().begin(), d_vEffJxWDevice.begin(), d_basisOperationsPtrDevice->JxWBasisData().begin(), 
d_derExcWithSigmaTimesGradRhoJxWDevice.begin(), @@ -1290,7 +1290,7 @@ kohnShamDFTOperatorDeviceClass:: d_basisOperationsPtrDevice->shapeFunctionGradientBasisData().data(), d_basisOperationsPtrDevice->inverseJacobiansBasisData().data(), d_basisOperationsPtrDevice->cellsTypeFlag(), - d_cellShapeFunctionGradientIntegralFlattenedDevice.begin(), + d_basisOperationsPtrDevice->cellStiffnessMatrixBasisData().begin(), d_vEffJxWDevice.begin(), d_basisOperationsPtrDevice->JxWBasisData().begin(), d_derExcWithSigmaTimesGradRhoJxWDevice.begin(), @@ -1321,7 +1321,7 @@ kohnShamDFTOperatorDeviceClass:: d_basisOperationsPtrDevice->shapeFunctionGradientBasisData().data(), d_basisOperationsPtrDevice->inverseJacobiansBasisData().data(), d_basisOperationsPtrDevice->cellsTypeFlag(), - d_cellShapeFunctionGradientIntegralFlattenedDevice.begin(), + d_basisOperationsPtrDevice->cellStiffnessMatrixBasisData().begin(), d_vEffJxWDevice.begin(), d_basisOperationsPtrDevice->JxWBasisData().begin(), d_cellHamiltonianMatrixExternalPotCorrFlattenedDevice.begin(), @@ -1352,7 +1352,7 @@ kohnShamDFTOperatorDeviceClass:: d_basisOperationsPtrDevice->shapeFunctionGradientBasisData().data(), d_basisOperationsPtrDevice->inverseJacobiansBasisData().data(), d_basisOperationsPtrDevice->cellsTypeFlag(), - d_cellShapeFunctionGradientIntegralFlattenedDevice.begin(), + d_basisOperationsPtrDevice->cellStiffnessMatrixBasisData().begin(), d_vEffJxWDevice.begin(), d_basisOperationsPtrDevice->JxWBasisData().begin(), d_cellHamiltonianMatrixExternalPotCorrFlattenedDevice.begin(), diff --git a/src/dftOperator/kohnShamDFTOperator.cc b/src/dftOperator/kohnShamDFTOperator.cc index 47a30c06f..f11ae9812 100644 --- a/src/dftOperator/kohnShamDFTOperator.cc +++ b/src/dftOperator/kohnShamDFTOperator.cc @@ -392,15 +392,6 @@ namespace dftfe return dftPtr->d_projectorKetTimesVectorParFlattened; } - template - const std::vector & - kohnShamDFTOperatorClass:: - getShapeFunctionValuesDensityGaussQuad() const - { - return 
d_densityGaussQuadShapeFunctionValues; - } - - template const std::vector & kohnShamDFTOperatorClass:: diff --git a/src/dftOperator/kohnShamDFTOperatorDevice.cc b/src/dftOperator/kohnShamDFTOperatorDevice.cc index 57e12c165..c22a704b2 100644 --- a/src/dftOperator/kohnShamDFTOperatorDevice.cc +++ b/src/dftOperator/kohnShamDFTOperatorDevice.cc @@ -342,24 +342,6 @@ namespace dftfe } - template - dftfe::utils::MemoryStorage & - kohnShamDFTOperatorDeviceClass:: - getShapeFunctionGradientIntegral() - { - return d_cellShapeFunctionGradientIntegralFlattenedDevice; - } - - - template - dftfe::utils::MemoryStorage & - kohnShamDFTOperatorDeviceClass:: - getShapeFunctionGradientIntegralElectro() - { - return d_cellShapeFunctionGradientIntegralFlattenedDeviceElectro; - } - - template const dftfe::utils::MemoryStorage & kohnShamDFTOperatorDeviceClass:: const unsigned int numberPhysicalCells = dftPtr->matrix_free_data.n_physical_cells(); - dftfe::utils::deviceSynchronize(); - MPI_Barrier(d_mpiCommParent); - double device_time = MPI_Wtime(); - - - d_basisOperationsPtrDevice->reinit(0, - 0, - dftPtr->d_feOrderPlusOneQuadratureId); - unsigned int numberQuadraturePointsPlusOne = - d_basisOperationsPtrDevice->nQuadsPerCell(); - shapeFuncDevice::computeShapeGradNINJIntegral( - d_basisOperationsPtrDevice, - d_cellShapeFunctionGradientIntegralFlattenedDevice); - - dftfe::utils::deviceSynchronize(); - MPI_Barrier(d_mpiCommParent); - device_time = MPI_Wtime() - device_time; - - if (this_mpi_process == 0 && dftPtr->d_dftParamsPtr->verbosity >= 2) - std::cout - << "Time for shapeFuncDevice::computeShapeGradNINJIntegral for FEOrder: " - << device_time << std::endl; - - if (FEOrderElectro != FEOrder) - { - dftfe::utils::deviceSynchronize(); - MPI_Barrier(d_mpiCommParent); - device_time = MPI_Wtime(); - - dftPtr->d_basisOperationsPtrElectroDevice->reinit( - 0, 0, dftPtr->d_phiTotAXQuadratureIdElectro); - shapeFuncDevice::computeShapeGradNINJIntegral( - dftPtr->d_basisOperationsPtrElectroDevice, 
- d_cellShapeFunctionGradientIntegralFlattenedDeviceElectro); - - dftfe::utils::deviceSynchronize(); - MPI_Barrier(d_mpiCommParent); - device_time = MPI_Wtime() - device_time; - - if (this_mpi_process == 0 && dftPtr->d_dftParamsPtr->verbosity >= 2) - std::cout - << "Time for shapeFuncDevice::computeShapeGradNINJIntegral for FEOrderElectro: " - << device_time << std::endl; - } if (!onlyUpdateGradNiNjIntegral) { dealii::QIterated<3> quadratureNLP(dealii::QGauss<1>( diff --git a/utils/BLASWrapperDevice.cu.cc b/utils/BLASWrapperDevice.cu.cc index 17e08942b..b0addaf70 100644 --- a/utils/BLASWrapperDevice.cu.cc +++ b/utils/BLASWrapperDevice.cu.cc @@ -35,6 +35,12 @@ namespace dftfe status = setMathMode(dftfe::utils::DEVICEBLAS_TF32_TENSOR_OP_MATH); } + dftfe::utils::deviceBlasHandle_t & + BLASWrapper::getDeviceBlasHandle() + { + return d_deviceBlasHandle; + } + void BLASWrapper::xgemm( @@ -255,6 +261,168 @@ namespace dftfe DEVICEBLAS_API_CHECK(status); } + void + BLASWrapper::xgemv( + const char transA, + const unsigned int m, + const unsigned int n, + const double * alpha, + const double * A, + const unsigned int lda, + const double * x, + const unsigned int incx, + const double * beta, + double * y, + const unsigned int incy) const + { + dftfe::utils::deviceBlasOperation_t transa; + if (transA == 'N') + transa = dftfe::utils::DEVICEBLAS_OP_N; + else if (transA == 'T') + transa = dftfe::utils::DEVICEBLAS_OP_T; + else + { + // Assert Statement + } + dftfe::utils::deviceBlasStatus_t status = cublasDgemv(d_deviceBlasHandle, + transa, + int(m), + int(n), + alpha, + A, + int(lda), + x, + int(incx), + beta, + y, + int(incy)); + DEVICEBLAS_API_CHECK(status); + } + + + void + BLASWrapper::xgemv( + const char transA, + const unsigned int m, + const unsigned int n, + const float * alpha, + const float * A, + const unsigned int lda, + const float * x, + const unsigned int incx, + const float * beta, + float * y, + const unsigned int incy) const + { + 
dftfe::utils::deviceBlasOperation_t transa, transb; + if (transA == 'N') + transa = dftfe::utils::DEVICEBLAS_OP_N; + else if (transA == 'T') + transa = dftfe::utils::DEVICEBLAS_OP_T; + else + { + // Assert Statement + } + + dftfe::utils::deviceBlasStatus_t status = cublasSgemv(d_deviceBlasHandle, + transa, + int(m), + int(n), + alpha, + A, + int(lda), + x, + int(incx), + beta, + y, + int(incy)); + DEVICEBLAS_API_CHECK(status); + } + + void + BLASWrapper::xgemv( + const char transA, + const unsigned int m, + const unsigned int n, + const std::complex *alpha, + const std::complex *A, + const unsigned int lda, + const std::complex *x, + const unsigned int incx, + const std::complex *beta, + std::complex * y, + const unsigned int incy) const + { + dftfe::utils::deviceBlasOperation_t transa, transb; + if (transA == 'N') + transa = dftfe::utils::DEVICEBLAS_OP_N; + else if (transA == 'T') + transa = dftfe::utils::DEVICEBLAS_OP_T; + else if (transA == 'C') + transa = dftfe::utils::DEVICEBLAS_OP_C; + else + { + // Assert Statement + } + + dftfe::utils::deviceBlasStatus_t status = + cublasZgemv(d_deviceBlasHandle, + transa, + int(m), + int(n), + dftfe::utils::makeDataTypeDeviceCompatible(alpha), + dftfe::utils::makeDataTypeDeviceCompatible(A), + int(lda), + dftfe::utils::makeDataTypeDeviceCompatible(x), + int(incx), + dftfe::utils::makeDataTypeDeviceCompatible(beta), + dftfe::utils::makeDataTypeDeviceCompatible(y), + int(incy)); + DEVICEBLAS_API_CHECK(status); + } + + void + BLASWrapper::xgemv( + const char transA, + const unsigned int m, + const unsigned int n, + const std::complex *alpha, + const std::complex *A, + const unsigned int lda, + const std::complex *x, + const unsigned int incx, + const std::complex *beta, + std::complex * y, + const unsigned int incy) const + { + dftfe::utils::deviceBlasOperation_t transa, transb; + if (transA == 'N') + transa = dftfe::utils::DEVICEBLAS_OP_N; + else if (transA == 'T') + transa = dftfe::utils::DEVICEBLAS_OP_T; + else if (transA 
== 'C') + transa = dftfe::utils::DEVICEBLAS_OP_C; + else + { + // Assert Statement + } + + dftfe::utils::deviceBlasStatus_t status = + cublasCgemv(d_deviceBlasHandle, + transa, + int(m), + int(n), + dftfe::utils::makeDataTypeDeviceCompatible(alpha), + dftfe::utils::makeDataTypeDeviceCompatible(A), + int(lda), + dftfe::utils::makeDataTypeDeviceCompatible(x), + int(incx), + dftfe::utils::makeDataTypeDeviceCompatible(beta), + dftfe::utils::makeDataTypeDeviceCompatible(y), + int(incy)); + DEVICEBLAS_API_CHECK(status); + } + dftfe::utils::deviceBlasStatus_t BLASWrapper::create() { @@ -328,6 +496,29 @@ namespace dftfe DEVICEBLAS_API_CHECK(status); } + template + void + BLASWrapper::axpyStridedBlockAtomicAdd( + const dftfe::size_type contiguousBlockSize, + const dftfe::size_type numContiguousBlocks, + const ValueType * addFromVec, + ValueType * addToVec, + const dftfe::global_size_type *addToVecStartingContiguousBlockIds) const + { + axpyStridedBlockAtomicAddDeviceKernel<<< + (contiguousBlockSize * numContiguousBlocks) / + dftfe::utils::DEVICE_BLOCK_SIZE + + 1, + dftfe::utils::DEVICE_BLOCK_SIZE>>>( + contiguousBlockSize, + numContiguousBlocks, + dftfe::utils::makeDataTypeDeviceCompatible(addFromVec), + dftfe::utils::makeDataTypeDeviceCompatible(addToVec), + addToVecStartingContiguousBlockIds); + } + + + void BLASWrapper::xdot( const unsigned int N, @@ -1167,6 +1358,22 @@ namespace dftfe const double * s, std::complex * x); + // axpyStridedBlockAtomicAdd + template void + BLASWrapper::axpyStridedBlockAtomicAdd( + const dftfe::size_type contiguousBlockSize, + const dftfe::size_type numContiguousBlocks, + const double * addFromVec, + double * addToVec, + const dftfe::global_size_type *addToVecStartingContiguousBlockIds) const; + + template void + BLASWrapper::axpyStridedBlockAtomicAdd( + const dftfe::size_type contiguousBlockSize, + const dftfe::size_type numContiguousBlocks, + const std::complex * addFromVec, + std::complex * addToVec, + const dftfe::global_size_type 
*addToVecStartingContiguousBlockIds) const; // for xscal template void @@ -1193,5 +1400,48 @@ namespace dftfe const std::complex a, const dftfe::size_type n) const; + template void + BLASWrapper::stridedCopyToBlock( + const dftfe::size_type contiguousBlockSize, + const dftfe::size_type numContiguousBlocks, + const double * copyFromVec, + double * copyToVecBlock, + const dftfe::global_size_type *copyFromVecStartingContiguousBlockIds); + + template void + BLASWrapper::stridedCopyToBlock( + const dftfe::size_type contiguousBlockSize, + const dftfe::size_type numContiguousBlocks, + const float * copyFromVec, + float * copyToVecBlock, + const dftfe::global_size_type *copyFromVecStartingContiguousBlockIds); + + template void + BLASWrapper::stridedCopyToBlock( + const dftfe::size_type contiguousBlockSize, + const dftfe::size_type numContiguousBlocks, + const std::complex * copyFromVec, + std::complex * copyToVecBlock, + const dftfe::global_size_type *copyFromVecStartingContiguousBlockIds); + + template void + BLASWrapper::stridedCopyToBlock( + const dftfe::size_type contiguousBlockSize, + const dftfe::size_type numContiguousBlocks, + const std::complex * copyFromVec, + std::complex * copyToVecBlock, + const dftfe::global_size_type *copyFromVecStartingContiguousBlockIds); + + template void + BLASWrapper:: + copyValueType1ArrToValueType2Arr(const dftfe::size_type size, + const double * valueType1Arr, + std::complex * valueType2Arr); + template void + BLASWrapper:: + copyValueType1ArrToValueType2Arr(const dftfe::size_type size, + const double * valueType1Arr, + double * valueType2Arr); + } // End of namespace linearAlgebra } // End of namespace dftfe diff --git a/utils/BLASWrapperDevice.hip.cc b/utils/BLASWrapperDevice.hip.cc index 4582b482d..24a985030 100644 --- a/utils/BLASWrapperDevice.hip.cc +++ b/utils/BLASWrapperDevice.hip.cc @@ -87,6 +87,12 @@ namespace dftfe status = setStream(NULL); } + dftfe::utils::deviceBlasHandle_t & + BLASWrapper::getDeviceBlasHandle() + { + 
return d_deviceBlasHandle; + } + void BLASWrapper::xgemm( @@ -303,6 +309,170 @@ namespace dftfe int(ldc)); DEVICEBLAS_API_CHECK(status); } + + void + BLASWrapper::xgemv( + const char transA, + const unsigned int m, + const unsigned int n, + const double * alpha, + const double * A, + const unsigned int lda, + const double * x, + const unsigned int incx, + const double * beta, + double * y, + const unsigned int incy) const + { + dftfe::utils::deviceBlasOperation_t transa; + if (transA == 'N') + transa = dftfe::utils::DEVICEBLAS_OP_N; + else if (transA == 'T') + transa = dftfe::utils::DEVICEBLAS_OP_T; + else + { + // Assert Statement + } + deviceBlasStatus_t status = hipblasDgemv(d_deviceBlasHandle, + transa, + int(m), + int(n), + alpha, + A, + int(lda), + x, + int(incx), + beta, + y, + int(incy)); + DEVICEBLAS_API_CHECK(status); + } + + + void + BLASWrapper::xgemv( + const char transA, + const unsigned int m, + const unsigned int n, + const float * alpha, + const float * A, + const unsigned int lda, + const float * x, + const unsigned int incx, + const float * beta, + float * y, + const unsigned int incy) const + { + dftfe::utils::deviceBlasOperation_t transa, transb; + if (transA == 'N') + transa = dftfe::utils::DEVICEBLAS_OP_N; + else if (transA == 'T') + transa = dftfe::utils::DEVICEBLAS_OP_T; + else + { + // Assert Statement + } + + deviceBlasStatus_t status = hipblasSgemv(d_deviceBlasHandle, + transa, + int(m), + int(n), + alpha, + A, + int(lda), + x, + int(incx), + beta, + y, + int(incy)); + DEVICEBLAS_API_CHECK(status); + } + + void + BLASWrapper::xgemv( + const char transA, + const unsigned int m, + const unsigned int n, + const std::complex *alpha, + const std::complex *A, + const unsigned int lda, + const std::complex *x, + const unsigned int incx, + const std::complex *beta, + std::complex * y, + const unsigned int incy) const + { + dftfe::utils::deviceBlasOperation_t transa, transb; + if (transA == 'N') + transa = dftfe::utils::DEVICEBLAS_OP_N; + else if 
(transA == 'T') + transa = dftfe::utils::DEVICEBLAS_OP_T; + else if (transA == 'C') + transa = dftfe::utils::DEVICEBLAS_OP_C; + else + { + // Assert Statement + } + + deviceBlasStatus_t status = + hipblasZgemv(d_deviceBlasHandle, + transa, + int(m), + int(n), + dftfe::utils::makeDataTypeHipBlasCompatible(alpha), + dftfe::utils::makeDataTypeHipBlasCompatible(A), + int(lda), + dftfe::utils::makeDataTypeHipBlasCompatible(x), + int(incx), + dftfe::utils::makeDataTypeHipBlasCompatible(beta), + dftfe::utils::makeDataTypeHipBlasCompatible(y), + int(incy)); + DEVICEBLAS_API_CHECK(status); + } + + void + BLASWrapper::xgemv( + const char transA, + const unsigned int m, + const unsigned int n, + const std::complex *alpha, + const std::complex *A, + const unsigned int lda, + const std::complex *x, + const unsigned int incx, + const std::complex *beta, + std::complex * y, + const unsigned int incy) const + { + dftfe::utils::deviceBlasOperation_t transa, transb; + if (transA == 'N') + transa = dftfe::utils::DEVICEBLAS_OP_N; + else if (transA == 'T') + transa = dftfe::utils::DEVICEBLAS_OP_T; + else if (transA == 'C') + transa = dftfe::utils::DEVICEBLAS_OP_C; + else + { + // Assert Statement + } + + deviceBlasStatus_t status = + hipblasCgemv(d_deviceBlasHandle, + transa, + int(m), + int(n), + dftfe::utils::makeDataTypeHipBlasCompatible(alpha), + dftfe::utils::makeDataTypeHipBlasCompatible(A), + int(lda), + dftfe::utils::makeDataTypeHipBlasCompatible(x), + int(incx), + dftfe::utils::makeDataTypeHipBlasCompatible(beta), + dftfe::utils::makeDataTypeHipBlasCompatible(y), + int(incy)); + DEVICEBLAS_API_CHECK(status); + } + + dftfe::utils::deviceBlasStatus_t BLASWrapper::create() { @@ -368,6 +538,31 @@ namespace dftfe DEVICEBLAS_API_CHECK(status); } + template + void + BLASWrapper::axpyStridedBlockAtomicAdd( + const dftfe::size_type contiguousBlockSize, + const dftfe::size_type numContiguousBlocks, + const ValueType * addFromVec, + ValueType * addToVec, + const dftfe::global_size_type 
*addToVecStartingContiguousBlockIds) const + { + hipLaunchKernelGGL(axpyStridedBlockAtomicAddDeviceKernel, + (contiguousBlockSize * numContiguousBlocks) / + dftfe::utils::DEVICE_BLOCK_SIZE + + 1, + dftfe::utils::DEVICE_BLOCK_SIZE, + 0, + 0, + contiguousBlockSize, + numContiguousBlocks, + dftfe::utils::makeDataTypeDeviceCompatible(addFromVec), + dftfe::utils::makeDataTypeDeviceCompatible(addToVec), + addToVecStartingContiguousBlockIds); + } + + + void BLASWrapper::xdot( const unsigned int N, @@ -1255,5 +1450,37 @@ namespace dftfe const std::complex a, const dftfe::size_type n) const; + template void + BLASWrapper::stridedCopyToBlock( + const dftfe::size_type contiguousBlockSize, + const dftfe::size_type numContiguousBlocks, + const double * copyFromVec, + double * copyToVecBlock, + const dftfe::global_size_type *copyFromVecStartingContiguousBlockIds); + + template void + BLASWrapper::stridedCopyToBlock( + const dftfe::size_type contiguousBlockSize, + const dftfe::size_type numContiguousBlocks, + const float * copyFromVec, + float * copyToVecBlock, + const dftfe::global_size_type *copyFromVecStartingContiguousBlockIds); + + template void + BLASWrapper::stridedCopyToBlock( + const dftfe::size_type contiguousBlockSize, + const dftfe::size_type numContiguousBlocks, + const std::complex * copyFromVec, + std::complex * copyToVecBlock, + const dftfe::global_size_type *copyFromVecStartingContiguousBlockIds); + + template void + BLASWrapper::stridedCopyToBlock( + const dftfe::size_type contiguousBlockSize, + const dftfe::size_type numContiguousBlocks, + const std::complex * copyFromVec, + std::complex * copyToVecBlock, + const dftfe::global_size_type *copyFromVecStartingContiguousBlockIds); + } // End of namespace linearAlgebra } // End of namespace dftfe diff --git a/utils/BLASWrapperHost.cc b/utils/BLASWrapperHost.cc index dd9c677d7..42089df5a 100644 --- a/utils/BLASWrapperHost.cc +++ b/utils/BLASWrapperHost.cc @@ -161,6 +161,74 @@ namespace dftfe &transA, &transB, &m, &n, 
&k, alpha, A, &lda, B, &ldb, beta, C, &ldc); } + void + BLASWrapper::xgemv( + const char transA, + const unsigned int m, + const unsigned int n, + const double * alpha, + const double * A, + const unsigned int lda, + const double * x, + const unsigned int incx, + const double * beta, + double * y, + const unsigned int incy) const + { + dgemv_(&transA, &m, &n, alpha, A, &lda, x, &incx, beta, y, &incy); + } + + + void + BLASWrapper::xgemv( + const char transA, + const unsigned int m, + const unsigned int n, + const float * alpha, + const float * A, + const unsigned int lda, + const float * x, + const unsigned int incx, + const float * beta, + float * y, + const unsigned int incy) const + { + sgemv_(&transA, &m, &n, alpha, A, &lda, x, &incx, beta, y, &incy); + } + + void + BLASWrapper::xgemv( + const char transA, + const unsigned int m, + const unsigned int n, + const std::complex *alpha, + const std::complex *A, + const unsigned int lda, + const std::complex *x, + const unsigned int incx, + const std::complex *beta, + std::complex * y, + const unsigned int incy) const + { + zgemv_(&transA, &m, &n, alpha, A, &lda, x, &incx, beta, y, &incy); + } + + void + BLASWrapper::xgemv( + const char transA, + const unsigned int m, + const unsigned int n, + const std::complex *alpha, + const std::complex *A, + const unsigned int lda, + const std::complex *x, + const unsigned int incx, + const std::complex *beta, + std::complex * y, + const unsigned int incy) const + { + cgemv_(&transA, &m, &n, alpha, A, &lda, x, &incx, beta, y, &incy); + } void @@ -174,6 +242,17 @@ namespace dftfe zcopy_(&n, x, &incx, y, &incy); } + void + BLASWrapper::xcopy( + const unsigned int n, + const std::complex *x, + const unsigned int incx, + std::complex * y, + const unsigned int incy) const + { + ccopy_(&n, x, &incx, y, &incy); + } + void BLASWrapper::xnrm2( const unsigned int n, @@ -251,6 +330,22 @@ namespace dftfe } + template + void + BLASWrapper::axpyStridedBlockAtomicAdd( + const dftfe::size_type 
contiguousBlockSize, + const dftfe::size_type numContiguousBlocks, + const ValueType * addFromVec, + ValueType * addToVec, + const dftfe::global_size_type *addToVecStartingContiguousBlockIds) const + { + for (unsigned int iBlock = 0; iBlock < numContiguousBlocks; ++iBlock) + std::transform(addFromVec + iBlock * contiguousBlockSize, + addFromVec + (iBlock + 1) * contiguousBlockSize, + addToVec + addToVecStartingContiguousBlockIds[iBlock], + addToVec + addToVecStartingContiguousBlockIds[iBlock], + std::plus<>{}); + } void BLASWrapper::xsymv( @@ -420,7 +515,8 @@ namespace dftfe const ValueType1 * valueType1Arr, ValueType2 * valueType2Arr) { - AssertThrow(false, dftUtils::ExcNotImplementedYet()); + for (unsigned int i = 0; i < size; ++i) + valueType2Arr[i] = valueType1Arr[i]; } template @@ -432,7 +528,14 @@ namespace dftfe ValueType2 * copyToVecBlock, const dftfe::global_size_type *copyFromVecStartingContiguousBlockIds) { - AssertThrow(false, dftUtils::ExcNotImplementedYet()); + for (unsigned int iBlock = 0; iBlock < numContiguousBlocks; ++iBlock) + { + xcopy(contiguousBlockSize, + copyFromVec + copyFromVecStartingContiguousBlockIds[iBlock], + 1, + copyToVecBlock + iBlock * contiguousBlockSize, + 1); + } } @@ -585,6 +688,65 @@ namespace dftfe // const double a, // const double * s, // std::complex * x); + template void + BLASWrapper::stridedCopyToBlock( + const dftfe::size_type contiguousBlockSize, + const dftfe::size_type numContiguousBlocks, + const double * copyFromVec, + double * copyToVecBlock, + const dftfe::global_size_type *copyFromVecStartingContiguousBlockIds); + + template void + BLASWrapper::stridedCopyToBlock( + const dftfe::size_type contiguousBlockSize, + const dftfe::size_type numContiguousBlocks, + const float * copyFromVec, + float * copyToVecBlock, + const dftfe::global_size_type *copyFromVecStartingContiguousBlockIds); + + template void + BLASWrapper::stridedCopyToBlock( + const dftfe::size_type contiguousBlockSize, + const dftfe::size_type 
numContiguousBlocks, + const std::complex * copyFromVec, + std::complex * copyToVecBlock, + const dftfe::global_size_type *copyFromVecStartingContiguousBlockIds); + + template void + BLASWrapper::stridedCopyToBlock( + const dftfe::size_type contiguousBlockSize, + const dftfe::size_type numContiguousBlocks, + const std::complex * copyFromVec, + std::complex * copyToVecBlock, + const dftfe::global_size_type *copyFromVecStartingContiguousBlockIds); + + template void + BLASWrapper:: + copyValueType1ArrToValueType2Arr(const dftfe::size_type size, + const double * valueType1Arr, + std::complex * valueType2Arr); + + template void + BLASWrapper:: + copyValueType1ArrToValueType2Arr(const dftfe::size_type size, + const double * valueType1Arr, + double * valueType2Arr); + // axpyStridedBlockAtomicAdd + template void + BLASWrapper::axpyStridedBlockAtomicAdd( + const dftfe::size_type contiguousBlockSize, + const dftfe::size_type numContiguousBlocks, + const double * addFromVec, + double * addToVec, + const dftfe::global_size_type *addToVecStartingContiguousBlockIds) const; + + template void + BLASWrapper::axpyStridedBlockAtomicAdd( + const dftfe::size_type contiguousBlockSize, + const dftfe::size_type numContiguousBlocks, + const std::complex * addFromVec, + std::complex * addToVec, + const dftfe::global_size_type *addToVecStartingContiguousBlockIds) const; } // End of namespace linearAlgebra } // End of namespace dftfe diff --git a/utils/FEBasisOperations.t.cc b/utils/FEBasisOperations.cc similarity index 75% rename from utils/FEBasisOperations.t.cc rename to utils/FEBasisOperations.cc index bf8edc54b..6a3182af8 100644 --- a/utils/FEBasisOperations.t.cc +++ b/utils/FEBasisOperations.cc @@ -14,6 +14,8 @@ // // --------------------------------------------------------------------- // +#include +#include namespace dftfe { @@ -22,10 +24,8 @@ namespace dftfe template - FEBasisOperationsBase:: - FEBasisOperationsBase( + FEBasisOperations:: + FEBasisOperations( dealii::MatrixFree<3, 
ValueTypeBasisData> &matrixFreeData, std::vector *> &constraintsVector, @@ -68,12 +68,10 @@ namespace dftfe typename ValueTypeBasisData, dftfe::utils::MemorySpace memorySpace> void - FEBasisOperationsBase< - ValueTypeBasisCoeff, - ValueTypeBasisData, - memorySpace>::init(const unsigned int & dofHandlerID, - const std::vector &quadratureID, - const std::vector updateFlags) + FEBasisOperations:: + init(const unsigned int & dofHandlerID, + const std::vector &quadratureID, + const std::vector updateFlags) { AssertThrow( updateFlags.size() == quadratureID.size(), @@ -87,7 +85,8 @@ namespace dftfe initializeIndexMaps(); initializeMPIPattern(); initializeShapeFunctionAndJacobianData(); - if (!std::is_same::value) + if constexpr (!std::is_same::value) initializeShapeFunctionAndJacobianBasisData(); } @@ -97,12 +96,10 @@ namespace dftfe dftfe::utils::MemorySpace memorySpace> template void - FEBasisOperationsBase:: - init(const FEBasisOperationsBase &basisOperationsSrc) + FEBasisOperations:: + init(const FEBasisOperations &basisOperationsSrc) { d_matrixFreeDataPtr = basisOperationsSrc.d_matrixFreeDataPtr; d_constraintsVector = basisOperationsSrc.d_constraintsVector; @@ -196,7 +193,8 @@ namespace dftfe } } } - if (!std::is_same::value) + if constexpr (!std::is_same::value) for (unsigned int iQuadIndex = 0; iQuadIndex < d_quadratureIDsVector.size(); ++iQuadIndex) @@ -272,13 +270,11 @@ namespace dftfe typename ValueTypeBasisData, dftfe::utils::MemorySpace memorySpace> void - FEBasisOperationsBase::reinit(const unsigned int &vecBlockSize, - const unsigned int - &cellsBlockSize, - const unsigned int &quadratureID, - const bool isResizeTempStorage) + FEBasisOperations:: + reinit(const unsigned int &vecBlockSize, + const unsigned int &cellsBlockSize, + const unsigned int &quadratureID, + const bool isResizeTempStorage) { d_quadratureID = quadratureID; auto itr = std::find(d_quadratureIDsVector.begin(), @@ -304,9 +300,8 @@ namespace dftfe typename ValueTypeBasisData, 
dftfe::utils::MemorySpace memorySpace> unsigned int - FEBasisOperationsBase::nQuadsPerCell() const + FEBasisOperations:: + nQuadsPerCell() const { return d_nQuadsPerCell[d_quadratureIndex]; } @@ -315,9 +310,8 @@ namespace dftfe typename ValueTypeBasisData, dftfe::utils::MemorySpace memorySpace> unsigned int - FEBasisOperationsBase::nDofsPerCell() const + FEBasisOperations:: + nDofsPerCell() const { return d_nDofsPerCell; } @@ -326,9 +320,8 @@ namespace dftfe typename ValueTypeBasisData, dftfe::utils::MemorySpace memorySpace> unsigned int - FEBasisOperationsBase::nCells() const + FEBasisOperations:: + nCells() const { return d_nCells; } @@ -337,9 +330,8 @@ namespace dftfe typename ValueTypeBasisData, dftfe::utils::MemorySpace memorySpace> unsigned int - FEBasisOperationsBase::nRelaventDofs() const + FEBasisOperations:: + nRelaventDofs() const { return d_localSize; } @@ -348,9 +340,8 @@ namespace dftfe typename ValueTypeBasisData, dftfe::utils::MemorySpace memorySpace> unsigned int - FEBasisOperationsBase::nOwnedDofs() const + FEBasisOperations:: + nOwnedDofs() const { return d_locallyOwnedSize; } @@ -359,9 +350,8 @@ namespace dftfe typename ValueTypeBasisData, dftfe::utils::MemorySpace memorySpace> const dftfe::utils::MemoryStorage & - FEBasisOperationsBase::shapeFunctionData(bool transpose) const + FEBasisOperations:: + shapeFunctionData(bool transpose) const { return transpose ? d_shapeFunctionDataTranspose.find(d_quadratureID)->second : @@ -372,10 +362,8 @@ namespace dftfe typename ValueTypeBasisData, dftfe::utils::MemorySpace memorySpace> const dftfe::utils::MemoryStorage & - FEBasisOperationsBase< - ValueTypeBasisCoeff, - ValueTypeBasisData, - memorySpace>::shapeFunctionGradientData(bool transpose) const + FEBasisOperations:: + shapeFunctionGradientData(bool transpose) const { return transpose ? 
d_shapeFunctionGradientDataTranspose.find(d_quadratureID) @@ -387,9 +375,8 @@ namespace dftfe typename ValueTypeBasisData, dftfe::utils::MemorySpace memorySpace> const dftfe::utils::MemoryStorage & - FEBasisOperationsBase::inverseJacobians() const + FEBasisOperations:: + inverseJacobians() const { return d_inverseJacobianData.find(areAllCellsAffine ? 0 : d_quadratureID) ->second; @@ -400,9 +387,8 @@ namespace dftfe dftfe::utils::MemorySpace memorySpace> const dftfe::utils::MemoryStorage & - FEBasisOperationsBase::quadPoints() const + FEBasisOperations:: + quadPoints() const { return d_quadPoints.find(d_quadratureID)->second; } @@ -413,141 +399,22 @@ namespace dftfe typename ValueTypeBasisData, dftfe::utils::MemorySpace memorySpace> const dftfe::utils::MemoryStorage & - FEBasisOperationsBase::JxW() const + FEBasisOperations:: + JxW() const { return d_JxWData.find(d_quadratureID)->second; } - template - template ::value, int>> - const dftfe::utils::MemoryStorage & - FEBasisOperationsBase::JxWBasisData() const - { - return d_JxWData.find(d_quadratureID)->second; - } - - template - template ::value, int>> - const dftfe::utils::MemoryStorage & - FEBasisOperationsBase::JxWBasisData() const - { - return d_JxWBasisData.find(d_quadratureID)->second; - } - - template - template ::value, int>> - const dftfe::utils::MemoryStorage & - FEBasisOperationsBase::inverseJacobiansBasisData() const - { - return d_inverseJacobianData.find(areAllCellsAffine ? 0 : d_quadratureID) - ->second; - } - - template - template ::value, int>> - const dftfe::utils::MemoryStorage & - FEBasisOperationsBase::inverseJacobiansBasisData() const - { - return d_inverseJacobianBasisData - .find(areAllCellsAffine ? 0 : d_quadratureID) - ->second; - } - template - template ::value, int>> - const dftfe::utils::MemoryStorage & - FEBasisOperationsBase::shapeFunctionBasisData(bool transpose) - const - { - return transpose ? 
- d_shapeFunctionDataTranspose.find(d_quadratureID)->second : - d_shapeFunctionData.find(d_quadratureID)->second; - } template - template ::value, int>> const dftfe::utils::MemoryStorage & - FEBasisOperationsBase::shapeFunctionBasisData(bool transpose) - const + FEBasisOperations:: + cellStiffnessMatrixBasisData() const { - return transpose ? - d_shapeFunctionBasisDataTranspose.find(d_quadratureID)->second : - d_shapeFunctionBasisData.find(d_quadratureID)->second; - } - - - template - template ::value, int>> - const dftfe::utils::MemoryStorage & - FEBasisOperationsBase< - ValueTypeBasisCoeff, - ValueTypeBasisData, - memorySpace>::shapeFunctionGradientBasisData(bool transpose) const - { - return transpose ? - d_shapeFunctionGradientDataTranspose.find(d_quadratureID) - ->second : - d_shapeFunctionGradientData.find(d_quadratureID)->second; - } - - template - template ::value, int>> - const dftfe::utils::MemoryStorage & - FEBasisOperationsBase< - ValueTypeBasisCoeff, - ValueTypeBasisData, - memorySpace>::shapeFunctionGradientBasisData(bool transpose) const - { - return transpose ? 
- d_shapeFunctionGradientBasisDataTranspose.find(d_quadratureID) - ->second : - d_shapeFunctionGradientBasisData.find(d_quadratureID)->second; + return d_cellStiffnessMatrixBasisType; } @@ -555,9 +422,8 @@ namespace dftfe typename ValueTypeBasisData, dftfe::utils::MemorySpace memorySpace> unsigned int - FEBasisOperationsBase::cellsTypeFlag() const + FEBasisOperations:: + cellsTypeFlag() const { return (unsigned int)areAllCellsAffine + (unsigned int)areAllCellsCartesian; @@ -567,9 +433,8 @@ namespace dftfe typename ValueTypeBasisData, dftfe::utils::MemorySpace memorySpace> dealii::CellId - FEBasisOperationsBase::cellID(const unsigned int iElem) const + FEBasisOperations:: + cellID(const unsigned int iElem) const { return d_cellIndexToCellIdMap[iElem]; } @@ -578,10 +443,8 @@ namespace dftfe typename ValueTypeBasisData, dftfe::utils::MemorySpace memorySpace> unsigned int - FEBasisOperationsBase::cellIndex(const dealii::CellId cellid) - const + FEBasisOperations:: + cellIndex(const dealii::CellId cellid) const { return d_cellIdToCellIndexMap.find(cellid)->second; } @@ -591,9 +454,8 @@ namespace dftfe typename ValueTypeBasisData, dftfe::utils::MemorySpace memorySpace> const dealii::MatrixFree<3, ValueTypeBasisData> & - FEBasisOperationsBase::matrixFreeData() const + FEBasisOperations:: + matrixFreeData() const { return *d_matrixFreeDataPtr; } @@ -602,9 +464,8 @@ namespace dftfe typename ValueTypeBasisData, dftfe::utils::MemorySpace memorySpace> const dealii::DoFHandler<3> & - FEBasisOperationsBase::getDofHandler() const + FEBasisOperations:: + getDofHandler() const { return d_matrixFreeDataPtr->get_dof_handler(d_dofHandlerID); } @@ -615,9 +476,8 @@ namespace dftfe typename ValueTypeBasisData, dftfe::utils::MemorySpace memorySpace> void - FEBasisOperationsBase::resizeTempStorage() + FEBasisOperations:: + resizeTempStorage() { tempCellNodalData.resize(d_nVectors * d_nDofsPerCell * d_cellsBlockSize); if (d_updateFlags[d_quadratureIndex] & update_gradients) @@ -638,9 +498,8 
@@ namespace dftfe typename ValueTypeBasisData, dftfe::utils::MemorySpace memorySpace> void - FEBasisOperationsBase::initializeFlattenedIndexMaps() + FEBasisOperations:: + initializeFlattenedIndexMaps() { #if defined(DFTFE_WITH_DEVICE) dftfe::utils::MemoryStorage void - FEBasisOperationsBase::initializeMPIPattern() + FEBasisOperations:: + initializeMPIPattern() { const std::pair &locallyOwnedRange = d_matrixFreeDataPtr->get_vector_partitioner(d_dofHandlerID) @@ -695,9 +553,8 @@ namespace dftfe typename ValueTypeBasisData, dftfe::utils::MemorySpace memorySpace> void - FEBasisOperationsBase::initializeIndexMaps() + FEBasisOperations:: + initializeIndexMaps() { d_nCells = d_matrixFreeDataPtr->n_physical_cells(); d_nDofsPerCell = d_matrixFreeDataPtr->get_dof_handler(d_dofHandlerID) @@ -747,9 +604,7 @@ namespace dftfe typename ValueTypeBasisData, dftfe::utils::MemorySpace memorySpace> void - FEBasisOperationsBase:: + FEBasisOperations:: reinitializeConstraints( std::vector *> &constraintsVector) @@ -763,9 +618,8 @@ namespace dftfe typename ValueTypeBasisData, dftfe::utils::MemorySpace memorySpace> void - FEBasisOperationsBase::initializeConstraints() + FEBasisOperations:: + initializeConstraints() { d_constraintInfo.clear(); d_constraintInfo.resize((*d_constraintsVector).size()); @@ -781,9 +635,8 @@ namespace dftfe typename ValueTypeBasisData, dftfe::utils::MemorySpace memorySpace> void - FEBasisOperationsBase::initializeShapeFunctionAndJacobianData() + FEBasisOperations:: + initializeShapeFunctionAndJacobianData() { d_nQuadsPerCell.resize(d_quadratureIDsVector.size()); for (unsigned int iQuadIndex = 0; @@ -1035,10 +888,8 @@ namespace dftfe typename ValueTypeBasisData, dftfe::utils::MemorySpace memorySpace> void - FEBasisOperationsBase< - ValueTypeBasisCoeff, - ValueTypeBasisData, - memorySpace>::initializeShapeFunctionAndJacobianBasisData() + FEBasisOperations:: + initializeShapeFunctionAndJacobianBasisData() { for (unsigned int iQuadIndex = 0; iQuadIndex < 
d_quadratureIDsVector.size(); @@ -1240,9 +1091,221 @@ namespace dftfe typename ValueTypeBasisData, dftfe::utils::MemorySpace memorySpace> void - FEBasisOperationsBase:: + FEBasisOperations:: + computeCellStiffnessMatrix(const unsigned int quadratureID, + const unsigned int cellsBlockSize, + const bool basisType, + const bool ceoffType) + { + auto itr = std::find(d_quadratureIDsVector.begin(), + d_quadratureIDsVector.end(), + quadratureID); + AssertThrow( + itr != d_quadratureIDsVector.end(), + dealii::ExcMessage( + "DFT-FE Error: FEBasisOperations Class not initialized with this quadrature Index.")); + + if (basisType) + d_cellStiffnessMatrixBasisType.resize(d_nDofsPerCell * d_nDofsPerCell * + d_nCells); + if (ceoffType) + if constexpr (std::is_same::value) + { + if (!basisType) + d_cellStiffnessMatrixBasisType.resize(d_nDofsPerCell * + d_nDofsPerCell * d_nCells); + } + else + d_cellStiffnessMatrixCoeffType.resize(d_nDofsPerCell * + d_nDofsPerCell * d_nCells); + + unsigned int quadratureIndex = + std::distance(d_quadratureIDsVector.begin(), itr); + unsigned int nQuadsPerCell = d_nQuadsPerCell[quadratureIndex]; + dftfe::utils::MemoryStorage + d_jacobianFactorHost; + +#if defined(DFTFE_WITH_DEVICE) + dftfe::utils::MemoryStorage + d_jacobianFactor; +#else + auto &d_jacobianFactor = d_jacobianFactorHost; +#endif + d_jacobianFactorHost.resize(9 * nQuadsPerCell * d_nCells); + + const dealii::Quadrature<3> &quadrature = + d_matrixFreeDataPtr->get_quadrature(quadratureID); + dealii::FEValues<3> fe_values( + d_matrixFreeDataPtr->get_dof_handler(d_dofHandlerID).get_fe(), + quadrature, + dealii::update_JxW_values | dealii::update_inverse_jacobians); + auto cellPtr = + d_matrixFreeDataPtr->get_dof_handler(d_dofHandlerID).begin_active(); + auto endcPtr = d_matrixFreeDataPtr->get_dof_handler(d_dofHandlerID).end(); + for (unsigned int iCell = 0; cellPtr != endcPtr; ++cellPtr) + if (cellPtr->is_locally_owned()) + { + fe_values.reinit(cellPtr); + const auto &inverseJacobians = 
fe_values.get_inverse_jacobians(); + for (unsigned int iQuad = 0; iQuad < nQuadsPerCell; ++iQuad) + { + const auto &inverseJacobianQuad = inverseJacobians[iQuad]; + const auto jxw = fe_values.JxW(iQuad); + const auto jacobianFactorPtr = d_jacobianFactorHost.data() + + iCell * nQuadsPerCell * 9 + + iQuad * 9; + for (unsigned int kDim = 0; kDim < 3; ++kDim) + for (unsigned int jDim = 0; jDim < 3; ++jDim) + for (unsigned int iDim = 0; iDim < 3; ++iDim) + jacobianFactorPtr[3 * jDim + iDim] += + inverseJacobianQuad[kDim][iDim] * + inverseJacobianQuad[kDim][jDim] * jxw; + } + ++iCell; + } +#if defined(DFTFE_WITH_DEVICE) + d_jacobianFactor.resize(d_jacobianFactorHost.size()); + d_jacobianFactor.copyFrom(d_jacobianFactorHost); +#endif + dftfe::utils::MemoryStorage + tempStiffnessMatrixBlock(d_nDofsPerCell * d_nDofsPerCell * + cellsBlockSize); + dftfe::utils::MemoryStorage + tempCellGradientsBlock(nQuadsPerCell * d_nDofsPerCell * cellsBlockSize * + 3); + dftfe::utils::MemoryStorage + tempCellGradientsBlock2(nQuadsPerCell * d_nDofsPerCell * + cellsBlockSize * 3); + dftfe::utils::MemoryStorage + zeroIndexVec(cellsBlockSize - 1, 0); + if constexpr (memorySpace == dftfe::utils::MemorySpace::HOST) + { + if constexpr (std::is_same::value) + dftfe::basis::FEBasisOperationsKernelsInternal:: + reshapeToNonAffineLayoutHost( + d_nDofsPerCell, + nQuadsPerCell, + 1, + d_shapeFunctionGradientData[quadratureID].data(), + tempCellGradientsBlock.data()); + else + dftfe::basis::FEBasisOperationsKernelsInternal:: + reshapeToNonAffineLayoutHost( + d_nDofsPerCell, + nQuadsPerCell, + 1, + d_shapeFunctionGradientBasisData[quadratureID].data(), + tempCellGradientsBlock.data()); + } + else + { + if constexpr (std::is_same::value) + dftfe::basis::FEBasisOperationsKernelsInternal:: + reshapeToNonAffineLayoutDevice( + d_nDofsPerCell, + nQuadsPerCell, + 1, + d_shapeFunctionGradientData[quadratureID].data(), + tempCellGradientsBlock.data()); + else + dftfe::basis::FEBasisOperationsKernelsInternal:: + 
reshapeToNonAffineLayoutDevice( + d_nDofsPerCell, + nQuadsPerCell, + 1, + d_shapeFunctionGradientBasisData[quadratureID].data(), + tempCellGradientsBlock.data()); + } + if (cellsBlockSize > 1) + d_BLASWrapperPtr->stridedCopyToBlock(nQuadsPerCell * d_nDofsPerCell * 3, + cellsBlockSize - 1, + tempCellGradientsBlock.data(), + tempCellGradientsBlock.data() + + nQuadsPerCell * d_nDofsPerCell * + 3, + zeroIndexVec.data()); + const ValueTypeBasisData scalarCoeffAlpha = ValueTypeBasisData(1.0), + scalarCoeffBeta = ValueTypeBasisData(0.0); + + for (unsigned int iCell = 0; iCell < d_nCells; iCell += cellsBlockSize) + { + std::pair cellRange( + iCell, std::min(iCell + cellsBlockSize, d_nCells)); + d_BLASWrapperPtr->xgemmStridedBatched( + 'N', + 'N', + d_nDofsPerCell, + 3, + 3, + &scalarCoeffAlpha, + tempCellGradientsBlock.data(), + d_nDofsPerCell, + d_nDofsPerCell * 3, + d_jacobianFactor.data() + 9 * cellRange.first * nQuadsPerCell, + 3, + 9, + &scalarCoeffBeta, + tempCellGradientsBlock2.data(), + d_nDofsPerCell, + d_nDofsPerCell * 3, + (cellRange.second - cellRange.first) * nQuadsPerCell); + d_BLASWrapperPtr->xgemmStridedBatched('N', + 'T', + d_nDofsPerCell, + d_nDofsPerCell, + nQuadsPerCell * 3, + &scalarCoeffAlpha, + tempCellGradientsBlock2.data(), + d_nDofsPerCell, + d_nDofsPerCell * nQuadsPerCell * + 3, + tempCellGradientsBlock.data(), + d_nDofsPerCell, + 0, + &scalarCoeffBeta, + tempStiffnessMatrixBlock.data(), + d_nDofsPerCell, + d_nDofsPerCell * d_nDofsPerCell, + cellRange.second - + cellRange.first); + if (basisType) + d_cellStiffnessMatrixBasisType.copyFrom( + tempStiffnessMatrixBlock, + d_nDofsPerCell * d_nDofsPerCell * + (cellRange.second - cellRange.first), + 0, + cellRange.first * d_nDofsPerCell * d_nDofsPerCell); + if (ceoffType) + if constexpr (std::is_same::value) + { + if (!basisType) + d_cellStiffnessMatrixBasisType.copyFrom( + tempStiffnessMatrixBlock, + d_nDofsPerCell * d_nDofsPerCell * + (cellRange.second - cellRange.first), + 0, + cellRange.first * 
d_nDofsPerCell * d_nDofsPerCell); + } + else + d_BLASWrapperPtr->copyValueType1ArrToValueType2Arr( + d_nDofsPerCell * d_nDofsPerCell * + (cellRange.second - cellRange.first), + tempStiffnessMatrixBlock.data(), + d_cellStiffnessMatrixCoeffType.data() + + cellRange.first * d_nDofsPerCell * d_nDofsPerCell); + } + } + + template + void + FEBasisOperations:: createMultiVector( const unsigned int blocksize, dftfe::linearAlgebra::MultiVector @@ -1255,9 +1318,7 @@ namespace dftfe typename ValueTypeBasisData, dftfe::utils::MemorySpace memorySpace> void - FEBasisOperationsBase:: + FEBasisOperations:: createScratchMultiVectors(const unsigned int vecBlockSize, const unsigned int numMultiVecs) const { @@ -1288,9 +1349,8 @@ namespace dftfe typename ValueTypeBasisData, dftfe::utils::MemorySpace memorySpace> void - FEBasisOperationsBase::clearScratchMultiVectors() const + FEBasisOperations:: + clearScratchMultiVectors() const { scratchMultiVectors.clear(); } @@ -1299,11 +1359,9 @@ namespace dftfe typename ValueTypeBasisData, dftfe::utils::MemorySpace memorySpace> dftfe::linearAlgebra::MultiVector & - FEBasisOperationsBase< - ValueTypeBasisCoeff, - ValueTypeBasisData, - memorySpace>::getMultiVector(const unsigned int vecBlockSize, - const unsigned int index) const + FEBasisOperations:: + getMultiVector(const unsigned int vecBlockSize, + const unsigned int index) const { AssertThrow(scratchMultiVectors.find(vecBlockSize) != scratchMultiVectors.end(), @@ -1317,9 +1375,7 @@ namespace dftfe typename ValueTypeBasisData, dftfe::utils::MemorySpace memorySpace> void - FEBasisOperationsBase:: + FEBasisOperations:: distribute(dftfe::linearAlgebra::MultiVector &multiVector, unsigned int constraintIndex) const @@ -1331,6 +1387,35 @@ namespace dftfe .distribute(multiVector, multiVector.numVectors()); } - + template class FEBasisOperations; +#if defined(USE_COMPLEX) + template class FEBasisOperations, + double, + dftfe::utils::MemorySpace::HOST>; +#endif +#if defined(DFTFE_WITH_DEVICE) + 
template class FEBasisOperations; + template void + FEBasisOperations::init( + const FEBasisOperations + &basisOperationsSrc); +# if defined(USE_COMPLEX) + template class FEBasisOperations, + double, + dftfe::utils::MemorySpace::DEVICE>; + template void + FEBasisOperations, + double, + dftfe::utils::MemorySpace::DEVICE>:: + init(const FEBasisOperations, + double, + dftfe::utils::MemorySpace::HOST> + &basisOperationsSrc); +# endif +#endif } // namespace basis } // namespace dftfe diff --git a/utils/FEBasisOperationsDevice.t.cc b/utils/FEBasisOperationsDevice.t.cc deleted file mode 100644 index e59fbde44..000000000 --- a/utils/FEBasisOperationsDevice.t.cc +++ /dev/null @@ -1,325 +0,0 @@ -// --------------------------------------------------------------------- -// -// Copyright (c) 2017-2022 The Regents of the University of Michigan and DFT-FE -// authors. -// -// This file is part of the DFT-FE code. -// -// The DFT-FE code is free software; you can use it, redistribute -// it, and/or modify it under the terms of the GNU Lesser General -// Public License as published by the Free Software Foundation; either -// version 2.1 of the License, or (at your option) any later version. -// The full text of the license can be found in the file LICENSE at -// the top level of the DFT-FE distribution. 
-// -// --------------------------------------------------------------------- -// - -#include -#include -#include -#include -#include -#include -#include -#include -namespace dftfe -{ - namespace basis - { - template - void - FEBasisOperations:: - interpolate( - dftfe::linearAlgebra::MultiVector - & nodalData, - ValueTypeBasisCoeff *quadratureValues, - ValueTypeBasisCoeff *quadratureGradients) const - { - interpolateKernel(nodalData, - quadratureValues, - quadratureGradients, - std::pair(0, d_nCells)); - } - - template - void - FEBasisOperations:: - integrateWithBasis( - ValueTypeBasisCoeff *quadratureValues, - ValueTypeBasisCoeff *quadratureGradients, - dftfe::linearAlgebra::MultiVector - &nodalData) const - { - integrateWithBasisKernel(quadratureValues, - quadratureGradients, - nodalData, - std::pair(0, - d_nCells)); - } - - - template - void - FEBasisOperations:: - extractToCellNodalData( - dftfe::linearAlgebra::MultiVector - & nodalData, - ValueTypeBasisCoeff *cellNodalDataPtr) const - { - extractToCellNodalDataKernel( - nodalData, - cellNodalDataPtr, - std::pair(0, d_nCells)); - } - - template - void - FEBasisOperations:: - accumulateFromCellNodalData( - const ValueTypeBasisCoeff *cellNodalDataPtr, - dftfe::linearAlgebra::MultiVector - &nodalData) const - { - accumulateFromCellNodalDataKernel( - cellNodalDataPtr, - nodalData, - std::pair(0, d_nCells)); - } - - - - template - void - FEBasisOperations:: - interpolateKernel( - const dftfe::linearAlgebra::MultiVector< - ValueTypeBasisCoeff, - dftfe::utils::MemorySpace::DEVICE> & nodalValues, - ValueTypeBasisCoeff * quadratureValues, - ValueTypeBasisCoeff * quadratureGradients, - const std::pair cellRange) const - { - extractToCellNodalDataKernel(nodalValues, - tempCellNodalData.data(), - cellRange); - interpolateKernel(tempCellNodalData.data(), - quadratureValues, - quadratureGradients, - cellRange); - } - - template - void - FEBasisOperations:: - interpolateKernel( - const ValueTypeBasisCoeff * cellNodalValues, - 
ValueTypeBasisCoeff * quadratureValues, - ValueTypeBasisCoeff * quadratureGradients, - const std::pair cellRange) const - { - const ValueTypeBasisCoeff scalarCoeffAlpha = ValueTypeBasisCoeff(1.0), - scalarCoeffBeta = ValueTypeBasisCoeff(0.0); - - if (quadratureValues != NULL) - { - d_BLASWrapperPtr->xgemmStridedBatched( - 'N', - 'N', - d_nVectors, - d_nQuadsPerCell[d_quadratureIndex], - d_nDofsPerCell, - &scalarCoeffAlpha, - cellNodalValues, - d_nVectors, - d_nVectors * d_nDofsPerCell, - d_shapeFunctionData.find(d_quadratureID)->second.data(), - d_nDofsPerCell, - 0, - &scalarCoeffBeta, - quadratureValues, - d_nVectors, - d_nVectors * d_nQuadsPerCell[d_quadratureIndex], - cellRange.second - cellRange.first); - } - - if (quadratureGradients != NULL) - { - d_BLASWrapperPtr->xgemmStridedBatched( - 'N', - 'N', - d_nVectors, - d_nQuadsPerCell[d_quadratureIndex] * 3, - d_nDofsPerCell, - &scalarCoeffAlpha, - cellNodalValues, - d_nVectors, - d_nVectors * d_nDofsPerCell, - d_shapeFunctionGradientDataInternalLayout.find(d_quadratureID) - ->second.data(), - d_nDofsPerCell, - 0, - &scalarCoeffBeta, - areAllCellsCartesian ? 
quadratureGradients : - tempQuadratureGradientsData.data(), - d_nVectors, - d_nVectors * d_nQuadsPerCell[d_quadratureIndex] * 3, - cellRange.second - cellRange.first); - if (areAllCellsCartesian) - { - d_BLASWrapperPtr->stridedBlockScale( - d_nQuadsPerCell[d_quadratureIndex] * d_nVectors, - 3 * (cellRange.second - cellRange.first), - ValueTypeBasisCoeff(1.0), - d_inverseJacobianData.find(0)->second.data() + - cellRange.first * 3, - quadratureGradients); - } - else if (areAllCellsAffine) - { - d_BLASWrapperPtr->xgemmStridedBatched( - 'N', - 'N', - d_nQuadsPerCell[d_quadratureIndex] * d_nVectors, - 3, - 3, - &scalarCoeffAlpha, - tempQuadratureGradientsData.data(), - d_nQuadsPerCell[d_quadratureIndex] * d_nVectors, - d_nQuadsPerCell[d_quadratureIndex] * d_nVectors * 3, - d_inverseJacobianData.find(0)->second.data() + - 9 * cellRange.first, - 3, - 9, - &scalarCoeffBeta, - quadratureGradients, - d_nQuadsPerCell[d_quadratureIndex] * d_nVectors, - d_nVectors * d_nQuadsPerCell[d_quadratureIndex] * 3, - cellRange.second - cellRange.first); - } - else - { - d_BLASWrapperPtr->xgemmStridedBatched( - 'N', - 'N', - d_nVectors, - 3, - 3, - &scalarCoeffAlpha, - tempQuadratureGradientsData.data(), - d_nVectors, - d_nVectors * 3, - d_inverseJacobianData.find(d_quadratureID)->second.data() + - 9 * cellRange.first * d_nQuadsPerCell[d_quadratureIndex], - 3, - 9, - &scalarCoeffBeta, - tempQuadratureGradientsDataNonAffine.data(), - d_nVectors, - d_nVectors * 3, - (cellRange.second - cellRange.first) * - d_nQuadsPerCell[d_quadratureIndex]); - - dftfe::basis::FEBasisOperationsKernelsDevice:: - reshapeNonAffineCase( - d_nVectors, - d_nQuadsPerCell[d_quadratureIndex], - (cellRange.second - cellRange.first), - tempQuadratureGradientsDataNonAffine.data(), - quadratureGradients); - } - } - } - - template - void - FEBasisOperations:: - integrateWithBasisKernel( - const ValueTypeBasisCoeff *quadratureValues, - const ValueTypeBasisCoeff *quadratureGradients, - dftfe::linearAlgebra::MultiVector - & 
nodalData, - const std::pair cellRange) const - {} - - template - void - FEBasisOperations:: - extractToCellNodalDataKernel( - const dftfe::linearAlgebra::MultiVector< - ValueTypeBasisCoeff, - dftfe::utils::MemorySpace::DEVICE> & nodalData, - ValueTypeBasisCoeff * cellNodalDataPtr, - const std::pair cellRange) const - { - dftfe::utils::deviceKernelsGeneric::stridedCopyToBlock( - d_nVectors, - (cellRange.second - cellRange.first) * d_nDofsPerCell, - nodalData.data(), - cellNodalDataPtr, - d_flattenedCellDofIndexToProcessDofIndexMap.data() + - cellRange.first * d_nDofsPerCell); - } - - template - void - FEBasisOperations:: - accumulateFromCellNodalDataKernel( - const ValueTypeBasisCoeff *cellNodalDataPtr, - dftfe::linearAlgebra::MultiVector - & nodalData, - const std::pair cellRange) const - { - dftfe::utils::deviceKernelsGeneric::axpyStridedBlockAtomicAdd( - d_nVectors, - (cellRange.second - cellRange.first) * d_nDofsPerCell, - cellNodalDataPtr, - nodalData.begin(), - d_flattenedCellDofIndexToProcessDofIndexMap.begin() + - cellRange.first * d_nDofsPerCell); - } - - template - void - FEBasisOperations:: - setDeviceBLASHandle(dftfe::utils::deviceBlasHandle_t *deviceBlasHandlePtr) - { - d_deviceBlasHandlePtr = deviceBlasHandlePtr; - } - - template - dftfe::utils::deviceBlasHandle_t & - FEBasisOperations::getDeviceBLASHandle() - { - return *d_deviceBlasHandlePtr; - } - } // namespace basis -} // namespace dftfe diff --git a/utils/FEBasisOperationsHost.t.cc b/utils/FEBasisOperationsHost.t.cc deleted file mode 100644 index 353359dfb..000000000 --- a/utils/FEBasisOperationsHost.t.cc +++ /dev/null @@ -1,493 +0,0 @@ -// --------------------------------------------------------------------- -// -// Copyright (c) 2017-2022 The Regents of the University of Michigan and DFT-FE -// authors. -// -// This file is part of the DFT-FE code. 
-// -// The DFT-FE code is free software; you can use it, redistribute -// it, and/or modify it under the terms of the GNU Lesser General -// Public License as published by the Free Software Foundation; either -// version 2.1 of the License, or (at your option) any later version. -// The full text of the license can be found in the file LICENSE at -// the top level of the DFT-FE distribution. -// -// --------------------------------------------------------------------- -// - -#include -namespace dftfe -{ - namespace basis - { - template - void - FEBasisOperations:: - interpolate( - dftfe::linearAlgebra::MultiVector - & nodalData, - ValueTypeBasisCoeff *quadratureValues, - ValueTypeBasisCoeff *quadratureGradients) const - { - interpolateKernel(nodalData, - quadratureValues, - quadratureGradients, - std::pair(0, d_nCells)); - } - - template - void - FEBasisOperations:: - integrateWithBasis( - ValueTypeBasisCoeff *quadratureValues, - ValueTypeBasisCoeff *quadratureGradients, - dftfe::linearAlgebra::MultiVector - &nodalData) const - { - integrateWithBasisKernel(quadratureValues, - quadratureGradients, - nodalData, - std::pair(0, - d_nCells)); - } - - - template - void - FEBasisOperations:: - extractToCellNodalData( - dftfe::linearAlgebra::MultiVector - & nodalData, - ValueTypeBasisCoeff *cellNodalDataPtr) const - { - extractToCellNodalDataKernel( - nodalData, - cellNodalDataPtr, - std::pair(0, d_nCells)); - } - - template - void - FEBasisOperations:: - accumulateFromCellNodalData( - const ValueTypeBasisCoeff *cellNodalDataPtr, - dftfe::linearAlgebra::MultiVector - &nodalData) const - { - accumulateFromCellNodalDataKernel( - cellNodalDataPtr, - nodalData, - std::pair(0, d_nCells)); - } - template - void - FEBasisOperations:: - interpolateKernel( - const dftfe::linearAlgebra::MultiVector - & nodalValues, - ValueTypeBasisCoeff * quadratureValues, - ValueTypeBasisCoeff * quadratureGradients, - const std::pair cellRange) const - { - for (unsigned int iCell = 
cellRange.first; iCell < cellRange.second; - ++iCell) - { - extractToCellNodalDataKernel( - nodalValues, - tempCellNodalData.data(), - std::pair(iCell, iCell + 1)); - interpolateKernel(tempCellNodalData.data(), - quadratureValues, - quadratureGradients, - std::pair(iCell, - iCell + 1)); - } - } - template - void - FEBasisOperations:: - interpolateKernel( - const ValueTypeBasisCoeff * cellNodalValues, - ValueTypeBasisCoeff * quadratureValues, - ValueTypeBasisCoeff * quadratureGradients, - const std::pair cellRange) const - { - const ValueTypeBasisCoeff scalarCoeffAlpha = ValueTypeBasisCoeff(1.0), - scalarCoeffBeta = ValueTypeBasisCoeff(0.0); - const char transA = 'N', transB = 'N'; - - if (quadratureValues != NULL) - { - d_BLASWrapperPtr->xgemmStridedBatched( - transA, - transB, - d_nVectors, - d_nQuadsPerCell[d_quadratureIndex], - d_nDofsPerCell, - &scalarCoeffAlpha, - cellNodalValues, - d_nVectors, - d_nVectors * d_nDofsPerCell, - d_shapeFunctionData.find(d_quadratureID)->second.data(), - d_nDofsPerCell, - 0, - &scalarCoeffBeta, - quadratureValues, - d_nVectors, - d_nVectors * d_nQuadsPerCell[d_quadratureIndex], - cellRange.second - cellRange.first); - } - if (quadratureGradients != NULL) - { - const unsigned int d_nQuadsPerCellTimesThree = - d_nQuadsPerCell[d_quadratureIndex] * 3; - - d_BLASWrapperPtr->xgemmStridedBatched( - transA, - transB, - d_nVectors, - d_nQuadsPerCellTimesThree, - d_nDofsPerCell, - &scalarCoeffAlpha, - cellNodalValues, - d_nVectors, - d_nVectors * d_nDofsPerCell, - d_shapeFunctionGradientDataInternalLayout.find(d_quadratureID) - ->second.data(), - d_nDofsPerCell, - 0, - &scalarCoeffBeta, - areAllCellsCartesian ? 
quadratureGradients : - tempQuadratureGradientsData.data(), - d_nVectors, - d_nVectors * d_nQuadsPerCell[d_quadratureIndex] * 3, - cellRange.second - cellRange.first); - if (areAllCellsCartesian) - { - const unsigned int d_nQuadsPerCellTimesnVectors = - d_nQuadsPerCell[d_quadratureIndex] * d_nVectors; - const unsigned int one = 1; - for (unsigned int iCell = cellRange.first; - iCell < cellRange.second; - ++iCell) - { - for (unsigned int iDim = 0; iDim < 3; ++iDim) - d_BLASWrapperPtr->xscal( - quadratureGradients + - d_nQuadsPerCell[d_quadratureIndex] * d_nVectors * 3 * - (iCell - cellRange.first) + - d_nQuadsPerCell[d_quadratureIndex] * d_nVectors * iDim, - *(d_inverseJacobianData.find(0)->second.data() + - 3 * iCell + iDim), - d_nQuadsPerCellTimesnVectors); - } - } - else if (areAllCellsAffine) - { - const unsigned int d_nQuadsPerCellTimesnVectors = - d_nQuadsPerCell[d_quadratureIndex] * d_nVectors; - const unsigned int three = 3; - d_BLASWrapperPtr->xgemmStridedBatched( - transA, - transB, - d_nQuadsPerCellTimesnVectors, - three, - three, - &scalarCoeffAlpha, - tempQuadratureGradientsData.data(), - d_nQuadsPerCellTimesnVectors, - d_nQuadsPerCellTimesnVectors * 3, - d_inverseJacobianData.find(0)->second.data() + - 9 * cellRange.first, - three, - 9, - &scalarCoeffBeta, - quadratureGradients, - d_nQuadsPerCell[d_quadratureIndex] * d_nVectors, - d_nVectors * d_nQuadsPerCell[d_quadratureIndex] * 3, - cellRange.second - cellRange.first); - } - else - { - const unsigned int three = 3; - d_BLASWrapperPtr->xgemmStridedBatched( - transA, - transB, - d_nVectors, - three, - three, - &scalarCoeffAlpha, - tempQuadratureGradientsData.data(), - d_nVectors, - d_nVectors * 3, - d_inverseJacobianData.find(d_quadratureID)->second.data() + - 9 * cellRange.first * d_nQuadsPerCell[d_quadratureIndex], - 3, - 9, - &scalarCoeffBeta, - tempQuadratureGradientsDataNonAffine.data(), - d_nVectors, - d_nVectors * 3, - (cellRange.second - cellRange.first) * - d_nQuadsPerCell[d_quadratureIndex]); 
- for (unsigned int iCell = cellRange.first; - iCell < cellRange.second; - ++iCell) - { - for (unsigned int iQuad = 0; - iQuad < d_nQuadsPerCell[d_quadratureIndex]; - ++iQuad) - - { - for (unsigned int iDim = 0; iDim < 3; ++iDim) - std::memcpy( - quadratureGradients + - d_nVectors * 3 * - d_nQuadsPerCell[d_quadratureIndex] * - (iCell - cellRange.first) + - d_nVectors * d_nQuadsPerCell[d_quadratureIndex] * - iDim + - d_nVectors * iQuad, - tempQuadratureGradientsDataNonAffine.data() + - d_nVectors * 3 * iQuad + d_nVectors * iDim, - d_nVectors * sizeof(ValueTypeBasisCoeff)); - } - } - } - } - } - - template - void - FEBasisOperations:: - integrateWithBasisKernel( - const ValueTypeBasisCoeff *quadratureValues, - const ValueTypeBasisCoeff *quadratureGradients, - dftfe::linearAlgebra::MultiVector - & nodalData, - const std::pair cellRange) const - { - dftfe::utils::MemoryStorage - cellNodalData, tempQuadratureGradientsData, - tempQuadratureGradientsDataNonAffine; - cellNodalData.resize(d_nVectors * d_nDofsPerCell * d_nCells); - if (quadratureGradients != NULL) - tempQuadratureGradientsData.resize(3 * d_nVectors * - d_nQuadsPerCell[d_quadratureIndex]); - - if (quadratureGradients != NULL) - tempQuadratureGradientsDataNonAffine.resize( - areAllCellsAffine ? 
- 0 : - (3 * d_nVectors * d_nQuadsPerCell[d_quadratureIndex])); - - - - for (unsigned int iCell = cellRange.first; iCell < cellRange.second; - ++iCell) - { - const ValueTypeBasisCoeff scalarCoeffAlpha = ValueTypeBasisCoeff(1.0), - scalarCoeffBeta = ValueTypeBasisCoeff(0.0); - const char transA = 'N', transB = 'T'; - d_BLASWrapperPtr->xgemm( - transA, - transB, - d_nVectors, - d_nDofsPerCell, - d_nQuadsPerCell[d_quadratureIndex], - &scalarCoeffAlpha, - quadratureValues + d_nQuadsPerCell[d_quadratureIndex] * iCell, - d_nVectors, - d_shapeFunctionData.find(d_quadratureID)->second.data(), - d_nQuadsPerCell[d_quadratureIndex], - &scalarCoeffBeta, - cellNodalData.data() + d_nDofsPerCell * iCell, - d_nVectors); - if (quadratureGradients != NULL) - { - if (areAllCellsCartesian) - { - const unsigned int d_nQuadsPerCellTimesnVectors = - d_nQuadsPerCell[d_quadratureIndex] * d_nVectors; - const unsigned int one = 1; - std::memcpy(tempQuadratureGradientsData.data(), - quadratureGradients + - d_nQuadsPerCell[d_quadratureIndex] * - d_nVectors * 3 * iCell, - 3 * d_nQuadsPerCellTimesnVectors * - sizeof(ValueTypeBasisCoeff)); - for (unsigned int iDim = 0; iDim < 3; ++iDim) - { - d_BLASWrapperPtr->xscal( - tempQuadratureGradientsData.data() + - d_nQuadsPerCell[d_quadratureIndex] * d_nVectors * - iDim, - *(d_inverseJacobianData.find(0)->second.data() + - 3 * iCell + iDim), - d_nQuadsPerCellTimesnVectors); - } - } - else if (areAllCellsAffine) - { - const unsigned int d_nQuadsPerCellTimesnVectors = - d_nQuadsPerCell[d_quadratureIndex] * d_nVectors; - const unsigned int three = 3; - d_BLASWrapperPtr->xgemm( - transA, - transB, - d_nQuadsPerCellTimesnVectors, - three, - three, - &scalarCoeffAlpha, - quadratureGradients + d_nQuadsPerCell[d_quadratureIndex] * - d_nVectors * 3 * iCell, - d_nQuadsPerCellTimesnVectors, - d_inverseJacobianData.find(0)->second.data() + 9 * iCell, - three, - &scalarCoeffBeta, - tempQuadratureGradientsData.data(), - d_nQuadsPerCellTimesnVectors); - } - else - { - 
for (unsigned int iQuad = 0; - iQuad < d_nQuadsPerCell[d_quadratureIndex]; - ++iQuad) - for (unsigned int iDim = 0; iDim < 3; ++iDim) - std::memcpy(tempQuadratureGradientsDataNonAffine.data() + - d_nVectors * 3 * iQuad + d_nVectors * iDim, - quadratureGradients + - d_nVectors * 3 * - d_nQuadsPerCell[d_quadratureIndex] * - iCell + - d_nVectors * - d_nQuadsPerCell[d_quadratureIndex] * - iDim + - d_nVectors * iQuad, - d_nVectors * sizeof(ValueTypeBasisCoeff)); - const unsigned int three = 3; - for (unsigned int iQuad = 0; - iQuad < d_nQuadsPerCell[d_quadratureIndex]; - ++iQuad) - d_BLASWrapperPtr->xgemm( - transA, - transB, - d_nVectors, - three, - three, - &scalarCoeffAlpha, - tempQuadratureGradientsDataNonAffine.data() + - d_nVectors * 3 * iQuad, - d_nVectors, - d_inverseJacobianData.find(d_quadratureID) - ->second.data() + - 9 * d_nQuadsPerCell[d_quadratureIndex] * iCell + - 9 * iQuad, - three, - &scalarCoeffBeta, - tempQuadratureGradientsData.data() + - d_nVectors * 3 * iQuad, - d_nVectors); - } - const unsigned int d_nQuadsPerCellTimesThree = - d_nQuadsPerCell[d_quadratureIndex] * 3; - d_BLASWrapperPtr->xgemm(transA, - transB, - d_nVectors, - d_nQuadsPerCellTimesThree, - d_nDofsPerCell, - &scalarCoeffAlpha, - tempQuadratureGradientsData.data(), - d_nVectors, - d_shapeFunctionGradientDataInternalLayout - .find(d_quadratureID) - ->second.data(), - d_nDofsPerCell, - &scalarCoeffBeta, - cellNodalData.data() + - d_nDofsPerCell * iCell, - d_nVectors); - } - accumulateFromCellNodalDataKernel( - cellNodalData.data(), - nodalData, - std::pair(iCell, iCell + 1)); - } - } - - template - void - FEBasisOperations:: - extractToCellNodalDataKernel( - const dftfe::linearAlgebra::MultiVector - & nodalData, - ValueTypeBasisCoeff * cellNodalDataPtr, - const std::pair cellRange) const - { - for (unsigned int iCell = cellRange.first; iCell < cellRange.second; - ++iCell) - for (unsigned int iDof = 0; iDof < d_nDofsPerCell; ++iDof) - { - std::memcpy(cellNodalDataPtr + - (iCell - 
cellRange.first) * d_nVectors * - d_nDofsPerCell + - iDof * d_nVectors, - nodalData.data() + - d_flattenedCellDofIndexToProcessDofIndexMap - [iCell * d_nDofsPerCell + iDof], - d_nVectors * sizeof(ValueTypeBasisCoeff)); - } - } - - template - void - FEBasisOperations:: - accumulateFromCellNodalDataKernel( - const ValueTypeBasisCoeff *cellNodalDataPtr, - dftfe::linearAlgebra::MultiVector - & nodalData, - const std::pair cellRange) const - { - for (unsigned int iCell = cellRange.first; iCell < cellRange.second; - ++iCell) - for (unsigned int iDof = 0; iDof < d_nDofsPerCell; ++iDof) - std::transform( - cellNodalDataPtr + iCell * d_nVectors * d_nDofsPerCell + - iDof * d_nVectors, - cellNodalDataPtr + iCell * d_nVectors * d_nDofsPerCell + - iDof * d_nVectors + d_nVectors, - nodalData.data() + d_flattenedCellDofIndexToProcessDofIndexMap - [iCell * d_nDofsPerCell + iDof], - nodalData.data() + d_flattenedCellDofIndexToProcessDofIndexMap - [iCell * d_nDofsPerCell + iDof], - std::plus()); - } - } // namespace basis -} // namespace dftfe diff --git a/utils/FEBasisOperationsKernels.cc b/utils/FEBasisOperationsKernels.cc new file mode 100644 index 000000000..a000f899b --- /dev/null +++ b/utils/FEBasisOperationsKernels.cc @@ -0,0 +1,441 @@ +// --------------------------------------------------------------------- +// +// Copyright (c) 2017-2022 The Regents of the University of Michigan and DFT-FE +// authors. +// +// This file is part of the DFT-FE code. +// +// The DFT-FE code is free software; you can use it, redistribute +// it, and/or modify it under the terms of the GNU Lesser General +// Public License as published by the Free Software Foundation; either +// version 2.1 of the License, or (at your option) any later version. +// The full text of the license can be found in the file LICENSE at +// the top level of the DFT-FE distribution. 
+// +// --------------------------------------------------------------------- +// + +#include +#include +#include + +namespace dftfe +{ + namespace basis + { + template + void + FEBasisOperations:: + interpolate(dftfe::linearAlgebra::MultiVector &nodalData, + ValueTypeBasisCoeff *quadratureValues, + ValueTypeBasisCoeff *quadratureGradients) const + { + interpolateKernel(nodalData, + quadratureValues, + quadratureGradients, + std::pair(0, d_nCells)); + } + + template + void + FEBasisOperations:: + integrateWithBasis( + ValueTypeBasisCoeff *quadratureValues, + ValueTypeBasisCoeff *quadratureGradients, + dftfe::linearAlgebra::MultiVector + &nodalData) const + { + integrateWithBasisKernel(quadratureValues, + quadratureGradients, + nodalData, + std::pair(0, + d_nCells)); + } + + + template + void + FEBasisOperations:: + extractToCellNodalData( + dftfe::linearAlgebra::MultiVector + & nodalData, + ValueTypeBasisCoeff *cellNodalDataPtr) const + { + extractToCellNodalDataKernel( + nodalData, + cellNodalDataPtr, + std::pair(0, d_nCells)); + } + + template + void + FEBasisOperations:: + accumulateFromCellNodalData( + const ValueTypeBasisCoeff *cellNodalDataPtr, + dftfe::linearAlgebra::MultiVector + &nodalData) const + { + accumulateFromCellNodalDataKernel( + cellNodalDataPtr, + nodalData, + std::pair(0, d_nCells)); + } + template + void + FEBasisOperations:: + interpolateKernel( + const dftfe::linearAlgebra::MultiVector &nodalValues, + ValueTypeBasisCoeff * quadratureValues, + ValueTypeBasisCoeff * quadratureGradients, + const std::pair cellRange) const + { + for (unsigned int iCell = cellRange.first; iCell < cellRange.second; + iCell += d_cellsBlockSize) + { + extractToCellNodalDataKernel( + nodalValues, + tempCellNodalData.data(), + std::pair( + iCell, std::min(d_nCells, iCell + d_cellsBlockSize))); + interpolateKernel(tempCellNodalData.data(), + quadratureValues, + quadratureGradients, + std::pair( + iCell, + std::min(d_nCells, iCell + d_cellsBlockSize))); + } + } + 
template + void + FEBasisOperations:: + interpolateKernel( + const ValueTypeBasisCoeff * cellNodalValues, + ValueTypeBasisCoeff * quadratureValues, + ValueTypeBasisCoeff * quadratureGradients, + const std::pair cellRange) const + { + const ValueTypeBasisCoeff scalarCoeffAlpha = ValueTypeBasisCoeff(1.0), + scalarCoeffBeta = ValueTypeBasisCoeff(0.0); + + if (quadratureValues != NULL) + { + d_BLASWrapperPtr->xgemmStridedBatched( + 'N', + 'N', + d_nVectors, + d_nQuadsPerCell[d_quadratureIndex], + d_nDofsPerCell, + &scalarCoeffAlpha, + cellNodalValues, + d_nVectors, + d_nVectors * d_nDofsPerCell, + d_shapeFunctionData.find(d_quadratureID)->second.data(), + d_nDofsPerCell, + 0, + &scalarCoeffBeta, + quadratureValues, + d_nVectors, + d_nVectors * d_nQuadsPerCell[d_quadratureIndex], + cellRange.second - cellRange.first); + } + + if (quadratureGradients != NULL) + { + d_BLASWrapperPtr->xgemmStridedBatched( + 'N', + 'N', + d_nVectors, + d_nQuadsPerCell[d_quadratureIndex] * 3, + d_nDofsPerCell, + &scalarCoeffAlpha, + cellNodalValues, + d_nVectors, + d_nVectors * d_nDofsPerCell, + d_shapeFunctionGradientDataInternalLayout.find(d_quadratureID) + ->second.data(), + d_nDofsPerCell, + 0, + &scalarCoeffBeta, + areAllCellsCartesian ? 
quadratureGradients : + tempQuadratureGradientsData.data(), + d_nVectors, + d_nVectors * d_nQuadsPerCell[d_quadratureIndex] * 3, + cellRange.second - cellRange.first); + if (areAllCellsCartesian) + { + d_BLASWrapperPtr->stridedBlockScale( + d_nQuadsPerCell[d_quadratureIndex] * d_nVectors, + 3 * (cellRange.second - cellRange.first), + ValueTypeBasisCoeff(1.0), + d_inverseJacobianData.find(0)->second.data() + + cellRange.first * 3, + quadratureGradients); + } + else if (areAllCellsAffine) + { + d_BLASWrapperPtr->xgemmStridedBatched( + 'N', + 'N', + d_nQuadsPerCell[d_quadratureIndex] * d_nVectors, + 3, + 3, + &scalarCoeffAlpha, + tempQuadratureGradientsData.data(), + d_nQuadsPerCell[d_quadratureIndex] * d_nVectors, + d_nQuadsPerCell[d_quadratureIndex] * d_nVectors * 3, + d_inverseJacobianData.find(0)->second.data() + + 9 * cellRange.first, + 3, + 9, + &scalarCoeffBeta, + quadratureGradients, + d_nQuadsPerCell[d_quadratureIndex] * d_nVectors, + d_nVectors * d_nQuadsPerCell[d_quadratureIndex] * 3, + cellRange.second - cellRange.first); + } + else + { + d_BLASWrapperPtr->xgemmStridedBatched( + 'N', + 'N', + d_nVectors, + 3, + 3, + &scalarCoeffAlpha, + tempQuadratureGradientsData.data(), + d_nVectors, + d_nVectors * 3, + d_inverseJacobianData.find(d_quadratureID)->second.data() + + 9 * cellRange.first * d_nQuadsPerCell[d_quadratureIndex], + 3, + 9, + &scalarCoeffBeta, + tempQuadratureGradientsDataNonAffine.data(), + d_nVectors, + d_nVectors * 3, + (cellRange.second - cellRange.first) * + d_nQuadsPerCell[d_quadratureIndex]); + if (memorySpace == dftfe::utils::MemorySpace::HOST) + dftfe::basis::FEBasisOperationsKernelsInternal:: + reshapeFromNonAffineLayoutHost( + d_nVectors, + d_nQuadsPerCell[d_quadratureIndex], + (cellRange.second - cellRange.first), + tempQuadratureGradientsDataNonAffine.data(), + quadratureGradients); + else + dftfe::basis::FEBasisOperationsKernelsInternal:: + reshapeFromNonAffineLayoutDevice( + d_nVectors, + d_nQuadsPerCell[d_quadratureIndex], + 
(cellRange.second - cellRange.first), + tempQuadratureGradientsDataNonAffine.data(), + quadratureGradients); + } + } + } + + template + void + FEBasisOperations:: + integrateWithBasisKernel( + const ValueTypeBasisCoeff *quadratureValues, + const ValueTypeBasisCoeff *quadratureGradients, + dftfe::linearAlgebra::MultiVector + & nodalData, + const std::pair cellRange) const + { + const ValueTypeBasisCoeff scalarCoeffAlpha = ValueTypeBasisCoeff(1.0), + scalarCoeffBeta = ValueTypeBasisCoeff(0.0); + if (quadratureValues != NULL) + { + d_BLASWrapperPtr->xgemmStridedBatched( + 'N', + 'T', + d_nVectors, + d_nDofsPerCell, + d_nQuadsPerCell[d_quadratureIndex], + &scalarCoeffAlpha, + quadratureValues, + d_nVectors, + d_nVectors * d_nQuadsPerCell[d_quadratureIndex], + d_shapeFunctionData.find(d_quadratureID)->second.data(), + d_nQuadsPerCell[d_quadratureIndex], + 0, + &scalarCoeffBeta, + tempCellNodalData.data(), + d_nVectors, + d_nVectors * d_nDofsPerCell, + cellRange.second - cellRange.first); + } + if (quadratureGradients != NULL) + { + if (areAllCellsCartesian) + { + tempQuadratureGradientsData.template copyFrom( + quadratureGradients, + 3 * d_nQuadsPerCell[d_quadratureIndex] * d_nVectors * + (cellRange.second - cellRange.first), + 3 * d_nQuadsPerCell[d_quadratureIndex] * d_nVectors * + cellRange.first, + 0); + d_BLASWrapperPtr->stridedBlockScale( + d_nQuadsPerCell[d_quadratureIndex] * d_nVectors, + 3 * (cellRange.second - cellRange.first), + ValueTypeBasisCoeff(1.0), + d_inverseJacobianData.find(0)->second.data() + + cellRange.first * 3, + tempQuadratureGradientsData.data()); + } + else if (areAllCellsAffine) + { + d_BLASWrapperPtr->xgemmStridedBatched( + 'N', + 'T', + d_nQuadsPerCell[d_quadratureIndex] * d_nVectors, + 3, + 3, + &scalarCoeffAlpha, + quadratureGradients, + d_nQuadsPerCell[d_quadratureIndex] * d_nVectors, + d_nQuadsPerCell[d_quadratureIndex] * d_nVectors * 3, + d_inverseJacobianData.find(0)->second.data() + + 9 * cellRange.first, + 3, + 9, + 
&scalarCoeffBeta, + tempQuadratureGradientsData.data(), + d_nQuadsPerCell[d_quadratureIndex] * d_nVectors, + d_nVectors * d_nQuadsPerCell[d_quadratureIndex] * 3, + cellRange.second - cellRange.first); + } + else + { + if (memorySpace == dftfe::utils::MemorySpace::HOST) + dftfe::basis::FEBasisOperationsKernelsInternal:: + reshapeToNonAffineLayoutHost( + d_nVectors, + d_nQuadsPerCell[d_quadratureIndex], + (cellRange.second - cellRange.first), + quadratureGradients, + tempQuadratureGradientsDataNonAffine.data()); + else + dftfe::basis::FEBasisOperationsKernelsInternal:: + reshapeToNonAffineLayoutDevice( + d_nVectors, + d_nQuadsPerCell[d_quadratureIndex], + (cellRange.second - cellRange.first), + quadratureGradients, + tempQuadratureGradientsDataNonAffine.data()); + d_BLASWrapperPtr->xgemmStridedBatched( + 'N', + 'T', + d_nVectors, + 3, + 3, + &scalarCoeffAlpha, + tempQuadratureGradientsDataNonAffine.data(), + d_nVectors, + d_nVectors * 3, + d_inverseJacobianData.find(d_quadratureID)->second.data() + + 9 * cellRange.first * d_nQuadsPerCell[d_quadratureIndex], + 3, + 9, + &scalarCoeffBeta, + tempQuadratureGradientsData.data(), + d_nVectors, + d_nVectors * 3, + (cellRange.second - cellRange.first) * + d_nQuadsPerCell[d_quadratureIndex]); + } + d_BLASWrapperPtr->xgemmStridedBatched( + 'N', + 'T', + d_nVectors, + d_nDofsPerCell, + d_nQuadsPerCell[d_quadratureIndex] * 3, + &scalarCoeffAlpha, + tempQuadratureGradientsData.data(), + d_nVectors, + d_nVectors * d_nQuadsPerCell[d_quadratureIndex], + d_shapeFunctionGradientDataInternalLayout.find(d_quadratureID) + ->second.data(), + d_nQuadsPerCell[d_quadratureIndex], + 0, + &scalarCoeffBeta, + tempCellNodalData.data(), + d_nVectors, + d_nVectors * d_nDofsPerCell, + cellRange.second - cellRange.first); + } + accumulateFromCellNodalDataKernel(tempCellNodalData.data(), + nodalData, + cellRange); + } + + template + void + FEBasisOperations:: + extractToCellNodalDataKernel( + const dftfe::linearAlgebra::MultiVector &nodalData, + 
ValueTypeBasisCoeff * cellNodalDataPtr, + const std::pair cellRange) const + { + d_BLASWrapperPtr->stridedCopyToBlock( + d_nVectors, + (cellRange.second - cellRange.first) * d_nDofsPerCell, + nodalData.data(), + cellNodalDataPtr, + d_flattenedCellDofIndexToProcessDofIndexMap.data() + + cellRange.first * d_nDofsPerCell); + } + + template + void + FEBasisOperations:: + accumulateFromCellNodalDataKernel( + const ValueTypeBasisCoeff *cellNodalDataPtr, + dftfe::linearAlgebra::MultiVector + & nodalData, + const std::pair cellRange) const + { + d_BLASWrapperPtr->axpyStridedBlockAtomicAdd( + d_nVectors, + (cellRange.second - cellRange.first) * d_nDofsPerCell, + cellNodalDataPtr, + nodalData.begin(), + d_flattenedCellDofIndexToProcessDofIndexMap.begin() + + cellRange.first * d_nDofsPerCell); + } + template class FEBasisOperations; +#if defined(DFTFE_WITH_DEVICE) + template class FEBasisOperations; +#endif + + } // namespace basis +} // namespace dftfe diff --git a/utils/FEBasisOperationsKernelsDevice.cc b/utils/FEBasisOperationsKernelsDevice.cc deleted file mode 100644 index 2ad2a4706..000000000 --- a/utils/FEBasisOperationsKernelsDevice.cc +++ /dev/null @@ -1,110 +0,0 @@ -// --------------------------------------------------------------------- -// -// Copyright (c) 2017-2022 The Regents of the University of Michigan and DFT-FE -// authors. -// -// This file is part of the DFT-FE code. -// -// The DFT-FE code is free software; you can use it, redistribute -// it, and/or modify it under the terms of the GNU Lesser General -// Public License as published by the Free Software Foundation; either -// version 2.1 of the License, or (at your option) any later version. -// The full text of the license can be found in the file LICENSE at -// the top level of the DFT-FE distribution. 
-// -// --------------------------------------------------------------------- -// - -#include -#include -#include -#include -#include - - -namespace dftfe -{ - namespace - { - template - __global__ void - reshapeNonAffineCaseDeviceKernel(const dftfe::size_type numVecs, - const dftfe::size_type numQuads, - const dftfe::size_type numCells, - const ValueType1 * copyFromVec, - ValueType2 * copyToVec) - { - const dftfe::size_type globalThreadId = - blockIdx.x * blockDim.x + threadIdx.x; - const dftfe::size_type numberEntries = numQuads * numCells * numVecs * 3; - - for (dftfe::size_type index = globalThreadId; index < numberEntries; - index += blockDim.x * gridDim.x) - { - dftfe::size_type blockIndex = index / numVecs; - dftfe::size_type iVec = index - blockIndex * numVecs; - dftfe::size_type blockIndex2 = blockIndex / numQuads; - dftfe::size_type iQuad = blockIndex - blockIndex2 * numQuads; - dftfe::size_type iCell = blockIndex2 / 3; - dftfe::size_type iDim = blockIndex2 - iCell * 3; - dftfe::utils::copyValue( - copyToVec + index, - copyFromVec[iVec + iDim * numVecs + iQuad * 3 * numVecs + - iCell * 3 * numQuads * numVecs]); - } - } - } // namespace - namespace basis - { - namespace FEBasisOperationsKernelsDevice - { - template - void - reshapeNonAffineCase(const dftfe::size_type numVecs, - const dftfe::size_type numQuads, - const dftfe::size_type numCells, - const ValueType1 * copyFromVec, - ValueType2 * copyToVec) - { -#ifdef DFTFE_WITH_DEVICE_LANG_CUDA - reshapeNonAffineCaseDeviceKernel<<<(numVecs * numCells * numQuads * 3) / - dftfe::utils::DEVICE_BLOCK_SIZE + - 1, - dftfe::utils::DEVICE_BLOCK_SIZE>>>( - numVecs, - numQuads, - numCells, - dftfe::utils::makeDataTypeDeviceCompatible(copyFromVec), - dftfe::utils::makeDataTypeDeviceCompatible(copyToVec)); -#elif DFTFE_WITH_DEVICE_LANG_HIP - hipLaunchKernelGGL( - reshapeNonAffineCaseDeviceKernel, - (numVecs * numCells * numQuads * 3) / - dftfe::utils::DEVICE_BLOCK_SIZE + - 1, - dftfe::utils::DEVICE_BLOCK_SIZE, - 0, - 0, 
- numVecs, - numQuads, - numCells, - dftfe::utils::makeDataTypeDeviceCompatible(copyFromVec), - dftfe::utils::makeDataTypeDeviceCompatible(copyToVec)); -#endif - } - template void - reshapeNonAffineCase(const dftfe::size_type numVecs, - const dftfe::size_type numQuads, - const dftfe::size_type numCells, - const double * copyFromVec, - double * copyToVec); - template void - reshapeNonAffineCase(const dftfe::size_type numVecs, - const dftfe::size_type numQuads, - const dftfe::size_type numCells, - const std::complex *copyFromVec, - std::complex * copyToVec); - - } // namespace FEBasisOperationsKernelsDevice - } // namespace basis -} // namespace dftfe diff --git a/utils/FEBasisOperationsKernelsInternalDevice.cc b/utils/FEBasisOperationsKernelsInternalDevice.cc new file mode 100644 index 000000000..9f0c6ddb4 --- /dev/null +++ b/utils/FEBasisOperationsKernelsInternalDevice.cc @@ -0,0 +1,181 @@ +// --------------------------------------------------------------------- +// +// Copyright (c) 2017-2022 The Regents of the University of Michigan and DFT-FE +// authors. +// +// This file is part of the DFT-FE code. +// +// The DFT-FE code is free software; you can use it, redistribute +// it, and/or modify it under the terms of the GNU Lesser General +// Public License as published by the Free Software Foundation; either +// version 2.1 of the License, or (at your option) any later version. +// The full text of the license can be found in the file LICENSE at +// the top level of the DFT-FE distribution. 
+// +// --------------------------------------------------------------------- +// + +#include + + +namespace dftfe +{ + namespace + { + template + __global__ void + reshapeFromNonAffineDeviceKernel(const dftfe::size_type numVecs, + const dftfe::size_type numQuads, + const dftfe::size_type numCells, + const ValueType * copyFromVec, + ValueType * copyToVec) + { + const dftfe::size_type globalThreadId = + blockIdx.x * blockDim.x + threadIdx.x; + const dftfe::size_type numberEntries = numQuads * numCells * numVecs * 3; + + for (dftfe::size_type index = globalThreadId; index < numberEntries; + index += blockDim.x * gridDim.x) + { + dftfe::size_type blockIndex = index / numVecs; + dftfe::size_type iVec = index - blockIndex * numVecs; + dftfe::size_type blockIndex2 = blockIndex / numQuads; + dftfe::size_type iQuad = blockIndex - blockIndex2 * numQuads; + dftfe::size_type iCell = blockIndex2 / 3; + dftfe::size_type iDim = blockIndex2 - iCell * 3; + dftfe::utils::copyValue( + copyToVec + index, + copyFromVec[iVec + iDim * numVecs + iQuad * 3 * numVecs + + iCell * 3 * numQuads * numVecs]); + } + } + template + __global__ void + reshapeToNonAffineDeviceKernel(const dftfe::size_type numVecs, + const dftfe::size_type numQuads, + const dftfe::size_type numCells, + const ValueType * copyFromVec, + ValueType * copyToVec) + { + const dftfe::size_type globalThreadId = + blockIdx.x * blockDim.x + threadIdx.x; + const dftfe::size_type numberEntries = numQuads * numCells * numVecs * 3; + + for (dftfe::size_type index = globalThreadId; index < numberEntries; + index += blockDim.x * gridDim.x) + { + dftfe::size_type blockIndex = index / numVecs; + dftfe::size_type iVec = index - blockIndex * numVecs; + dftfe::size_type blockIndex2 = blockIndex / numQuads; + dftfe::size_type iQuad = blockIndex - blockIndex2 * numQuads; + dftfe::size_type iCell = blockIndex2 / 3; + dftfe::size_type iDim = blockIndex2 - iCell * 3; + dftfe::utils::copyValue(copyToVec + iVec + iDim * numVecs + + iQuad * 3 * 
numVecs + + iCell * 3 * numQuads * numVecs, + copyFromVec[index]); + } + } + } // namespace + namespace basis + { + namespace FEBasisOperationsKernelsInternal + { + template + + void + reshapeFromNonAffineLayoutDevice(const dftfe::size_type numVecs, + const dftfe::size_type numQuads, + const dftfe::size_type numCells, + const ValueType * copyFromVec, + ValueType * copyToVec) + { +#ifdef DFTFE_WITH_DEVICE_LANG_CUDA + reshapeFromNonAffineDeviceKernel<<<(numVecs * numCells * numQuads * 3) / + dftfe::utils::DEVICE_BLOCK_SIZE + + 1, + dftfe::utils::DEVICE_BLOCK_SIZE>>>( + numVecs, + numQuads, + numCells, + dftfe::utils::makeDataTypeDeviceCompatible(copyFromVec), + dftfe::utils::makeDataTypeDeviceCompatible(copyToVec)); +#elif DFTFE_WITH_DEVICE_LANG_HIP + hipLaunchKernelGGL( + reshapeFromNonAffineDeviceKernel, + (numVecs * numCells * numQuads * 3) / + dftfe::utils::DEVICE_BLOCK_SIZE + + 1, + dftfe::utils::DEVICE_BLOCK_SIZE, + 0, + 0, + numVecs, + numQuads, + numCells, + dftfe::utils::makeDataTypeDeviceCompatible(copyFromVec), + dftfe::utils::makeDataTypeDeviceCompatible(copyToVec)); +#endif + } + template + void + reshapeToNonAffineLayoutDevice(const dftfe::size_type numVecs, + const dftfe::size_type numQuads, + const dftfe::size_type numCells, + const ValueType * copyFromVec, + ValueType * copyToVec) + { +#ifdef DFTFE_WITH_DEVICE_LANG_CUDA + reshapeToNonAffineDeviceKernel<<<(numVecs * numCells * numQuads * 3) / + dftfe::utils::DEVICE_BLOCK_SIZE + + 1, + dftfe::utils::DEVICE_BLOCK_SIZE>>>( + numVecs, + numQuads, + numCells, + dftfe::utils::makeDataTypeDeviceCompatible(copyFromVec), + dftfe::utils::makeDataTypeDeviceCompatible(copyToVec)); +#elif DFTFE_WITH_DEVICE_LANG_HIP + hipLaunchKernelGGL( + reshapeToNonAffineDeviceKernel, + (numVecs * numCells * numQuads * 3) / + dftfe::utils::DEVICE_BLOCK_SIZE + + 1, + dftfe::utils::DEVICE_BLOCK_SIZE, + 0, + 0, + numVecs, + numQuads, + numCells, + dftfe::utils::makeDataTypeDeviceCompatible(copyFromVec), + 
dftfe::utils::makeDataTypeDeviceCompatible(copyToVec)); +#endif + } + template void + reshapeFromNonAffineLayoutDevice(const dftfe::size_type numVecs, + const dftfe::size_type numQuads, + const dftfe::size_type numCells, + const double * copyFromVec, + double * copyToVec); + template void + reshapeFromNonAffineLayoutDevice(const dftfe::size_type numVecs, + const dftfe::size_type numQuads, + const dftfe::size_type numCells, + const std::complex *copyFromVec, + std::complex * copyToVec); + + template void + reshapeToNonAffineLayoutDevice(const dftfe::size_type numVecs, + const dftfe::size_type numQuads, + const dftfe::size_type numCells, + const double * copyFromVec, + double * copyToVec); + template void + reshapeToNonAffineLayoutDevice(const dftfe::size_type numVecs, + const dftfe::size_type numQuads, + const dftfe::size_type numCells, + const std::complex *copyFromVec, + std::complex * copyToVec); + + } // namespace FEBasisOperationsKernelsInternal + } // namespace basis +} // namespace dftfe diff --git a/utils/FEBasisOperationsKernelsInternalHost.cc b/utils/FEBasisOperationsKernelsInternalHost.cc new file mode 100644 index 000000000..d75ffdf79 --- /dev/null +++ b/utils/FEBasisOperationsKernelsInternalHost.cc @@ -0,0 +1,89 @@ +// --------------------------------------------------------------------- +// +// Copyright (c) 2017-2022 The Regents of the University of Michigan and DFT-FE +// authors. +// +// This file is part of the DFT-FE code. +// +// The DFT-FE code is free software; you can use it, redistribute +// it, and/or modify it under the terms of the GNU Lesser General +// Public License as published by the Free Software Foundation; either +// version 2.1 of the License, or (at your option) any later version. +// The full text of the license can be found in the file LICENSE at +// the top level of the DFT-FE distribution. 
+// +// --------------------------------------------------------------------- +// + +#include + + +namespace dftfe +{ + namespace basis + { + namespace FEBasisOperationsKernelsInternal + { + template + void + reshapeFromNonAffineLayoutHost(const dftfe::size_type numVecs, + const dftfe::size_type numQuads, + const dftfe::size_type numCells, + const ValueType * copyFromVec, + ValueType * copyToVec) + { + for (unsigned int iCell = 0; iCell < numCells; ++iCell) + for (unsigned int iQuad = 0; iQuad < numQuads; ++iQuad) + for (unsigned int iDim = 0; iDim < 3; ++iDim) + std::memcpy(copyToVec + numVecs * 3 * numQuads * iCell + + numVecs * numQuads * iDim + numVecs * iQuad, + copyFromVec + numVecs * 3 * numQuads * iCell + + numVecs * 3 * iQuad + numVecs * iDim, + numVecs * sizeof(ValueType)); + } + template + void + reshapeToNonAffineLayoutHost(const dftfe::size_type numVecs, + const dftfe::size_type numQuads, + const dftfe::size_type numCells, + const ValueType * copyFromVec, + ValueType * copyToVec) + { + for (unsigned int iCell = 0; iCell < numCells; ++iCell) + for (unsigned int iQuad = 0; iQuad < numQuads; ++iQuad) + for (unsigned int iDim = 0; iDim < 3; ++iDim) + std::memcpy(copyToVec + numVecs * 3 * numQuads * iCell + + numVecs * 3 * iQuad + numVecs * iDim, + copyFromVec + numVecs * 3 * numQuads * iCell + + numVecs * numQuads * iDim + numVecs * iQuad, + numVecs * sizeof(ValueType)); + } + template void + reshapeFromNonAffineLayoutHost(const dftfe::size_type numVecs, + const dftfe::size_type numQuads, + const dftfe::size_type numCells, + const double * copyFromVec, + double * copyToVec); + template void + reshapeFromNonAffineLayoutHost(const dftfe::size_type numVecs, + const dftfe::size_type numQuads, + const dftfe::size_type numCells, + const std::complex *copyFromVec, + std::complex * copyToVec); + + template void + reshapeToNonAffineLayoutHost(const dftfe::size_type numVecs, + const dftfe::size_type numQuads, + const dftfe::size_type numCells, + const double * 
copyFromVec, + double * copyToVec); + template void + reshapeToNonAffineLayoutHost(const dftfe::size_type numVecs, + const dftfe::size_type numQuads, + const dftfe::size_type numCells, + const std::complex *copyFromVec, + std::complex * copyToVec); + + } // namespace FEBasisOperationsKernelsInternal + } // namespace basis +} // namespace dftfe From 75c2f90f22d67cc0a97a4636d48c835228e4fac5 Mon Sep 17 00:00:00 2001 From: Nikhil Kodali Date: Fri, 9 Feb 2024 19:01:27 +0530 Subject: [PATCH 02/24] Bugfix for ONCV PR --- include/FEBasisOperations.h | 29 +++++++----- src/dft/dft.cc | 19 ++++++++ src/dft/initBoundaryConditions.cc | 23 ++++------ src/dft/pRefinedDoFHandler.cc | 31 +++++++------ utils/FEBasisOperations.cc | 75 +++++++++++++++++++++++-------- 5 files changed, 118 insertions(+), 59 deletions(-) diff --git a/include/FEBasisOperations.h b/include/FEBasisOperations.h index 9a58b05af..fc420d56a 100644 --- a/include/FEBasisOperations.h +++ b/include/FEBasisOperations.h @@ -93,16 +93,9 @@ namespace dftfe public: /** - * @brief Constructor, fills required data structures using deal.ii's MatrixFree and AffineConstraints objects - * @param[in] matrixFreeData MatrixFree object. - * @param[in] constraintsVector std::vector of AffineConstraints, should - * be the same vector which was passed for the construction of the given - * MatrixFree object. + * @brief Constructor */ FEBasisOperations( - dealii::MatrixFree<3, ValueTypeBasisData> &matrixFreeData, - std::vector *> - &constraintsVector, std::shared_ptr> BLASWrapperPtr); @@ -112,8 +105,18 @@ namespace dftfe */ ~FEBasisOperations() = default; + /** + * @brief Clears the FEBasisOperations internal storage. + */ + void + clear(); + /** * @brief fills required data structures for the given dofHandlerID + * @param[in] matrixFreeData MatrixFree object. + * @param[in] constraintsVector std::vector of AffineConstraints, should + * be the same vector which was passed for the construction of the given + * MatrixFree object. 
* @param[in] dofHandlerID dofHandler index to be used for getting data * from the MatrixFree object. * @param[in] quadratureID std::vector of quadratureIDs to be used, should @@ -121,9 +124,13 @@ namespace dftfe * MatrixFree object. */ void - init(const unsigned int & dofHandlerID, - const std::vector &quadratureID, - const std::vector updateFlags); + init(dealii::MatrixFree<3, ValueTypeBasisData> &matrixFreeData, + std::vector *> + & constraintsVector, + const unsigned int & dofHandlerID, + const std::vector &quadratureID, + const std::vector updateFlags); + /** * @brief fills required data structures from another FEBasisOperations object * @param[in] basisOperationsSrc Source FEBasisOperations object. diff --git a/src/dft/dft.cc b/src/dft/dft.cc index 3df43fdaf..61a3b7823 100644 --- a/src/dft/dft.cc +++ b/src/dft/dft.cc @@ -1038,11 +1038,30 @@ namespace dftfe d_BLASWrapperPtrHost = std::make_shared< dftfe::linearAlgebra::BLASWrapper>(); + d_basisOperationsPtrHost = std::make_shared< + dftfe::basis::FEBasisOperations>( + d_BLASWrapperPtrHost); + d_basisOperationsPtrElectroHost = std::make_shared< + dftfe::basis:: + FEBasisOperations>( + d_BLASWrapperPtrHost); #if defined(DFTFE_WITH_DEVICE) if (d_dftParamsPtr->useDevice) { d_BLASWrapperPtr = std::make_shared>(); + d_basisOperationsPtrDevice = std::make_shared< + dftfe::basis::FEBasisOperations>( + d_BLASWrapperPtr); + d_basisOperationsPtrElectroDevice = std::make_shared< + dftfe::basis::FEBasisOperations>( + d_BLASWrapperPtr); } #endif initImageChargesUpdateKPoints(); diff --git a/src/dft/initBoundaryConditions.cc b/src/dft/initBoundaryConditions.cc index 920be0981..a19ea8393 100644 --- a/src/dft/initBoundaryConditions.cc +++ b/src/dft/initBoundaryConditions.cc @@ -269,11 +269,7 @@ namespace dftfe { if (!vselfPerturbationUpdateForStress) { - d_basisOperationsPtrHost = std::make_shared< - dftfe::basis::FEBasisOperations>( - matrix_free_data, d_constraintsVector, d_BLASWrapperPtrHost); + 
d_basisOperationsPtrHost->clear(); dftfe::basis::UpdateFlags updateFlagsAll = dftfe::basis::update_values | dftfe::basis::update_jxw | dftfe::basis::update_inversejacobians | @@ -291,7 +287,9 @@ namespace dftfe updateFlagsAll, updateFlagsAll, updateFlagsAll}; - d_basisOperationsPtrHost->init(d_densityDofHandlerIndex, + d_basisOperationsPtrHost->init(matrix_free_data, + d_constraintsVector, + d_densityDofHandlerIndex, quadratureIndices, updateFlags); d_basisOperationsPtrHost->computeCellStiffnessMatrix( @@ -322,13 +320,7 @@ namespace dftfe { if (!vselfPerturbationUpdateForStress) { - d_basisOperationsPtrDevice = - std::make_shared>(matrix_free_data, - d_constraintsVector, - d_BLASWrapperPtr); + d_basisOperationsPtrDevice->clear(); d_basisOperationsPtrDevice->init(*d_basisOperationsPtrHost); const unsigned int BVec = std::min(d_dftParamsPtr->chebyWfcBlockSize, d_numEigenValues); @@ -343,6 +335,7 @@ namespace dftfe } else { + d_basisOperationsPtrDevice->clear(); dftfe::basis::UpdateFlags updateFlagsGradientsAndInvJacobians = dftfe::basis::update_inversejacobians | dftfe::basis::update_jxw | dftfe::basis::update_gradients; @@ -362,7 +355,9 @@ namespace dftfe updateFlagsValuesGradients, updateFlagsAll, updateFlagsGradientsAndInvJacobians}; - d_basisOperationsPtrDevice->init(d_densityDofHandlerIndex, + d_basisOperationsPtrDevice->init(matrix_free_data, + d_constraintsVector, + d_densityDofHandlerIndex, quadratureIndices, updateFlags); if (FEOrder != FEOrderElectro) diff --git a/src/dft/pRefinedDoFHandler.cc b/src/dft/pRefinedDoFHandler.cc index ba27203e7..f8d2ffde1 100644 --- a/src/dft/pRefinedDoFHandler.cc +++ b/src/dft/pRefinedDoFHandler.cc @@ -380,14 +380,9 @@ namespace dftfe additional_data); if (recomputeBasisData) { - d_basisOperationsPtrElectroHost = std::make_shared< - dftfe::basis:: - FEBasisOperations>( - d_matrixFreeDataPRefined, - d_constraintsVectorElectro, - d_BLASWrapperPtrHost); if (!vselfPerturbationUpdateForStress) { + 
d_basisOperationsPtrElectroHost->clear(); dftfe::basis::UpdateFlags updateFlagsAll = dftfe::basis::update_values | dftfe::basis::update_jxw | dftfe::basis::update_inversejacobians | @@ -404,15 +399,20 @@ namespace dftfe updateFlagsAll, dftfe::basis::update_quadpoints, updateFlagsAll}; - d_basisOperationsPtrElectroHost->init(d_baseDofHandlerIndexElectro, + d_basisOperationsPtrElectroHost->init(d_matrixFreeDataPRefined, + d_constraintsVectorElectro, + d_baseDofHandlerIndexElectro, quadratureIndices, updateFlags); } else { + d_basisOperationsPtrElectroHost->clear(); std::vector quadratureIndices; std::vector updateFlags; - d_basisOperationsPtrElectroHost->init(d_baseDofHandlerIndexElectro, + d_basisOperationsPtrElectroHost->init(d_matrixFreeDataPRefined, + d_constraintsVectorElectro, + d_baseDofHandlerIndexElectro, quadratureIndices, updateFlags); } @@ -425,13 +425,7 @@ namespace dftfe { if (!vselfPerturbationUpdateForStress) { - d_basisOperationsPtrElectroDevice = - std::make_shared>(d_matrixFreeDataPRefined, - d_constraintsVectorElectro, - d_BLASWrapperPtr); + d_basisOperationsPtrElectroDevice->clear(); d_basisOperationsPtrElectroDevice->init( *d_basisOperationsPtrElectroHost); if (FEOrder != FEOrderElectro) @@ -440,6 +434,7 @@ namespace dftfe } else { + d_basisOperationsPtrElectroDevice->clear(); dftfe::basis::UpdateFlags updateFlagsGradientsAndInvJacobians = dftfe::basis::update_inversejacobians | dftfe::basis::update_jxw | dftfe::basis::update_gradients; @@ -449,7 +444,11 @@ namespace dftfe std::vector updateFlags{ updateFlagsGradientsAndInvJacobians}; d_basisOperationsPtrElectroDevice->init( - d_baseDofHandlerIndexElectro, quadratureIndices, updateFlags); + d_matrixFreeDataPRefined, + d_constraintsVectorElectro, + d_baseDofHandlerIndexElectro, + quadratureIndices, + updateFlags); if (FEOrder != FEOrderElectro) d_basisOperationsPtrElectroDevice->computeCellStiffnessMatrix( d_phiTotAXQuadratureIdElectro, 50, true, false); diff --git a/utils/FEBasisOperations.cc 
b/utils/FEBasisOperations.cc index 6a3182af8..441038c6f 100644 --- a/utils/FEBasisOperations.cc +++ b/utils/FEBasisOperations.cc @@ -26,21 +26,71 @@ namespace dftfe dftfe::utils::MemorySpace memorySpace> FEBasisOperations:: FEBasisOperations( - dealii::MatrixFree<3, ValueTypeBasisData> &matrixFreeData, - std::vector *> - &constraintsVector, std::shared_ptr> BLASWrapperPtr) + { + d_BLASWrapperPtr = BLASWrapperPtr; + d_nOMPThreads = 1; + if (const char *penv = std::getenv("DFTFE_NUM_THREADS")) + d_nOMPThreads = std::stoi(std::string(penv)); + } + + template + void + FEBasisOperations:: + clear() + { + d_constraintInfo.clear(); + d_cellDofIndexToProcessDofIndexMap.clear(); + d_quadPoints.clear(); + d_flattenedCellDofIndexToProcessDofIndexMap.clear(); + d_cellIndexToCellIdMap.clear(); + d_cellIdToCellIndexMap.clear(); + d_inverseJacobianData.clear(); + d_JxWData.clear(); + d_shapeFunctionData.clear(); + d_shapeFunctionGradientDataInternalLayout.clear(); + d_shapeFunctionGradientData.clear(); + d_shapeFunctionDataTranspose.clear(); + d_shapeFunctionGradientDataTranspose.clear(); + d_inverseJacobianBasisData.clear(); + d_JxWBasisData.clear(); + d_shapeFunctionBasisData.clear(); + d_shapeFunctionGradientBasisData.clear(); + d_shapeFunctionBasisDataTranspose.clear(); + d_shapeFunctionGradientBasisDataTranspose.clear(); + + d_cellStiffnessMatrixBasisType.clear(); + d_cellStiffnessMatrixCoeffType.clear(); + scratchMultiVectors.clear(); + tempCellNodalData.clear(); + tempQuadratureGradientsData.clear(); + tempQuadratureGradientsDataNonAffine.clear(); + + d_quadratureIDsVector.clear(); + d_nQuadsPerCell.clear(); + d_updateFlags.clear(); + } + + template + void + FEBasisOperations:: + init(dealii::MatrixFree<3, ValueTypeBasisData> &matrixFreeData, + std::vector *> + & constraintsVector, + const unsigned int & dofHandlerID, + const std::vector &quadratureID, + const std::vector updateFlags) { d_matrixFreeDataPtr = &matrixFreeData; d_constraintsVector = &constraintsVector; - 
d_BLASWrapperPtr = BLASWrapperPtr; d_dofHandlerID = 0; d_nVectors = 0; areAllCellsAffine = true; - d_nOMPThreads = 1; - if (const char *penv = std::getenv("DFTFE_NUM_THREADS")) - d_nOMPThreads = std::stoi(std::string(penv)); for (unsigned int iMacroCell = 0; iMacroCell < d_matrixFreeDataPtr->n_cell_batches(); @@ -62,17 +112,6 @@ namespace dftfe iMacroCell) == dealii::internal::MatrixFreeFunctions::cartesian); } initializeConstraints(); - } - - template - void - FEBasisOperations:: - init(const unsigned int & dofHandlerID, - const std::vector &quadratureID, - const std::vector updateFlags) - { AssertThrow( updateFlags.size() == quadratureID.size(), dealii::ExcMessage( From 989325950eca76bf82c06be5a2d2e856b2751e5c Mon Sep 17 00:00:00 2001 From: Kartick Ramakrishnan Date: Sat, 10 Feb 2024 15:51:54 +0530 Subject: [PATCH 03/24] poisson problem device changes. compilation works --- include/BLASWrapper.h | 30 +++++++++++++--- include/linearAlgebraOperations.h | 4 +-- include/operatorDevice.h | 2 +- include/poissonSolverProblemDevice.h | 8 ++--- ...mputeOutputDensityDirectionalDerivative.cc | 2 +- src/dft/dft.cc | 8 ++--- src/dft/solveNSCF.cc | 4 +-- src/dftOperator/kohnShamDFTOperatorDevice.cc | 6 ++-- src/poisson/poissonSolverProblemDevice.cc | 33 +++++++++-------- utils/BLASWrapperDevice.cu.cc | 35 +++++++++++++++++-- utils/BLASWrapperDevice.hip.cc | 33 +++++++++++++++-- utils/BLASWrapperHost.cc | 13 +++++-- 12 files changed, 136 insertions(+), 42 deletions(-) diff --git a/include/BLASWrapper.h b/include/BLASWrapper.h index 1120b3f0d..3cc807f8f 100644 --- a/include/BLASWrapper.h +++ b/include/BLASWrapper.h @@ -23,6 +23,7 @@ #include #include #include +#include namespace dftfe @@ -187,7 +188,15 @@ namespace dftfe const double * Y, const unsigned int INCY, double * result) const; - + // Real dot proeuct with all Reduce call + void + xdot(const unsigned int N, + const double * X, + const unsigned int INCX, + const double * Y, + const unsigned int INCY, + const MPI_Comm & 
mpi_communicator, + double * result) const; // Complex dot product void @@ -202,7 +211,7 @@ namespace dftfe void xaxpy(const unsigned int n, const double * alpha, - double * x, + const double * x, const unsigned int incx, double * y, const unsigned int incy) const; @@ -211,7 +220,7 @@ namespace dftfe void xaxpy(const unsigned int n, const std::complex *alpha, - std::complex * x, + const std::complex * x, const unsigned int incx, std::complex * y, const unsigned int incy) const; @@ -677,6 +686,17 @@ namespace dftfe const unsigned int INCY, double * result) const; + // + // Real dot product + void + xdot(const unsigned int N, + const double * X, + const unsigned int INCX, + const double * Y, + const unsigned int INCY, + const MPI_Comm & mpi_communicator, + double * result) const; + // Complex dot product void xdot(const unsigned int N, @@ -690,7 +710,7 @@ namespace dftfe void xaxpy(const unsigned int n, const double * alpha, - double * x, + const double * x, const unsigned int incx, double * y, const unsigned int incy) const; @@ -699,7 +719,7 @@ namespace dftfe void xaxpy(const unsigned int n, const std::complex *alpha, - std::complex * x, + const std::complex * x, const unsigned int incx, std::complex * y, const unsigned int incy) const; diff --git a/include/linearAlgebraOperations.h b/include/linearAlgebraOperations.h index b10a9d486..f9eb19567 100644 --- a/include/linearAlgebraOperations.h +++ b/include/linearAlgebraOperations.h @@ -139,7 +139,7 @@ namespace dftfe void daxpy_(const unsigned int *n, const double * alpha, - double * x, + const double * x, const unsigned int *incx, double * y, const unsigned int *incy); @@ -407,7 +407,7 @@ namespace dftfe void zaxpy_(const unsigned int * n, const std::complex *alpha, - std::complex * x, + const std::complex * x, const unsigned int * incx, std::complex * y, const unsigned int * incy); diff --git a/include/operatorDevice.h b/include/operatorDevice.h index 933c9a267..fa755c409 100644 --- a/include/operatorDevice.h +++ 
b/include/operatorDevice.h @@ -23,7 +23,7 @@ # include # include -# include +# include //FIX ME # include # include # include "process_grid.h" diff --git a/include/poissonSolverProblemDevice.h b/include/poissonSolverProblemDevice.h index f7f6a90ad..aff8dab45 100644 --- a/include/poissonSolverProblemDevice.h +++ b/include/poissonSolverProblemDevice.h @@ -23,10 +23,10 @@ # include # include # include -# include # include # include # include "FEBasisOperations.h" +# include "BLASWrapper.h" namespace dftfe { @@ -77,7 +77,7 @@ namespace dftfe const unsigned int smearedChargeQuadratureId, const dftfe::utils::MemoryStorage & rhoValues, - dftfe::utils::deviceBlasHandle_t &deviceBlasHandle, + const std::shared_ptr> BLASWrapperPtr, const bool isComputeDiagonalA = true, const bool isComputeMeanValueConstraints = false, const bool smearedNuclearCharges = false, @@ -246,8 +246,6 @@ namespace dftfe double *d_jacobianFactorPtr; int * d_mapPtr; - // cuBLAS handle for cuBLAS operations - dftfe::utils::deviceBlasHandle_t *d_deviceBlasHandlePtr; // constraints dftUtils::constraintMatrixInfoDevice d_constraintsTotalPotentialInfo; @@ -322,6 +320,8 @@ namespace dftfe FEBasisOperations> d_basisOperationsPtr; /// + std::shared_ptr> + d_BLASWrapperPtr; bool d_isFastConstraintsInitialized; const MPI_Comm mpi_communicator; diff --git a/src/dft/computeOutputDensityDirectionalDerivative.cc b/src/dft/computeOutputDensityDirectionalDerivative.cc index 2e8b14001..2976defac 100644 --- a/src/dft/computeOutputDensityDirectionalDerivative.cc +++ b/src/dft/computeOutputDensityDirectionalDerivative.cc @@ -103,7 +103,7 @@ namespace dftfe dummyMap, d_smearedChargeQuadratureIdElectro, charge, - d_kohnShamDFTOperatorDevicePtr->getDeviceBlasHandle(), + d_BLASWrapperPtr, false, false); #endif diff --git a/src/dft/dft.cc b/src/dft/dft.cc index 61a3b7823..a5bffa1d4 100644 --- a/src/dft/dft.cc +++ b/src/dft/dft.cc @@ -2528,7 +2528,7 @@ namespace dftfe d_bQuadValuesAllAtoms, d_smearedChargeQuadratureIdElectro, 
d_densityInQuadValues[0], - kohnShamDFTEigenOperatorDevice.getDeviceBlasHandle(), + d_BLASWrapperPtr, false, false, d_dftParamsPtr->smearedNuclearCharges, @@ -2550,7 +2550,7 @@ namespace dftfe d_bQuadValuesAllAtoms, d_smearedChargeQuadratureIdElectro, d_densityInQuadValues[0], - kohnShamDFTEigenOperatorDevice.getDeviceBlasHandle(), + d_BLASWrapperPtr, true, d_dftParamsPtr->periodicX && d_dftParamsPtr->periodicY && d_dftParamsPtr->periodicZ && @@ -3377,7 +3377,7 @@ namespace dftfe d_bQuadValuesAllAtoms, d_smearedChargeQuadratureIdElectro, d_densityOutQuadValues[0], - kohnShamDFTEigenOperatorDevice.getDeviceBlasHandle(), + d_BLASWrapperPtr, false, false, d_dftParamsPtr->smearedNuclearCharges, @@ -3596,7 +3596,7 @@ namespace dftfe d_bQuadValuesAllAtoms, d_smearedChargeQuadratureIdElectro, d_densityOutQuadValues[0], - kohnShamDFTEigenOperatorDevice.getDeviceBlasHandle(), + d_BLASWrapperPtr, false, false, d_dftParamsPtr->smearedNuclearCharges, diff --git a/src/dft/solveNSCF.cc b/src/dft/solveNSCF.cc index ad3ac2d8e..2352bd305 100644 --- a/src/dft/solveNSCF.cc +++ b/src/dft/solveNSCF.cc @@ -207,7 +207,7 @@ namespace dftfe d_bQuadValuesAllAtoms, d_smearedChargeQuadratureIdElectro, d_densityInQuadValues[0], - kohnShamDFTEigenOperatorDevice.getDeviceBlasHandle(), + d_BLASWrapperPtr, true, d_dftParamsPtr->periodicX && d_dftParamsPtr->periodicY && d_dftParamsPtr->periodicZ && !d_dftParamsPtr->pinnedNodeForPBC, @@ -958,7 +958,7 @@ namespace dftfe d_bQuadValuesAllAtoms, d_smearedChargeQuadratureIdElectro, d_densityInQuadValues[0], - kohnShamDFTEigenOperatorDevice.getDeviceBlasHandle(), + d_BLASWrapperPtr, false, false, d_dftParamsPtr->smearedNuclearCharges, diff --git a/src/dftOperator/kohnShamDFTOperatorDevice.cc b/src/dftOperator/kohnShamDFTOperatorDevice.cc index c22a704b2..a570734d2 100644 --- a/src/dftOperator/kohnShamDFTOperatorDevice.cc +++ b/src/dftOperator/kohnShamDFTOperatorDevice.cc @@ -17,12 +17,12 @@ // @author Phani Motamarri, Sambit Das // -#include -#include 
+#include //KFIX ME +#include #include #include #include -#include +#include //KFIX ME #include #include #include diff --git a/src/poisson/poissonSolverProblemDevice.cc b/src/poisson/poissonSolverProblemDevice.cc index e53603f08..c4eac1eef 100644 --- a/src/poisson/poissonSolverProblemDevice.cc +++ b/src/poisson/poissonSolverProblemDevice.cc @@ -81,7 +81,7 @@ namespace dftfe const unsigned int smearedChargeQuadratureId, const dftfe::utils::MemoryStorage & rhoValues, - dftfe::utils::deviceBlasHandle_t &deviceBlasHandle, + const std::shared_ptr> BLASWrapperPtr, const bool isComputeDiagonalA, const bool isComputeMeanValueConstraint, const bool smearedNuclearCharges, @@ -125,7 +125,7 @@ namespace dftfe d_smearedChargeGradientComponentId = smearedChargeGradientComponentId; d_isStoreSmearedChargeRhs = storeSmearedChargeRhs; d_isReuseSmearedChargeRhs = reuseSmearedChargeRhs; - d_deviceBlasHandlePtr = &deviceBlasHandle; + d_BLASWrapperPtr = BLASWrapperPtr; d_nLocalCells = d_matrixFreeDataPtr->n_cell_batches(); d_xLocalDof = d_xDevice.locallyOwnedSize() * d_xDevice.numVectors(); d_xLen = d_xDevice.localSize() * d_xDevice.numVectors(); @@ -452,12 +452,15 @@ namespace dftfe { // -\sum_{i \neq o} a_i * u_i computation which involves summation across // MPI tasks - const double constrainedNodeValue = dftfe::utils::deviceKernelsGeneric::dot( - d_meanValueConstraintDeviceVec.begin(), - vec.begin(), - d_xLocalDof, - mpi_communicator, - *d_deviceBlasHandlePtr); + const unsigned int one = 1; + double constrainedNodeValue = 0.0; + //dftfe::utils::deviceKernelsGeneric::dot( + // d_meanValueConstraintDeviceVec.begin(), + // vec.begin(), + // d_xLocalDof, + // mpi_communicator, + // *d_deviceBlasHandlePtr); //FIX ME + d_BLASWrapperPtr->xdot(d_xLocalDof,d_meanValueConstraintDeviceVec.begin(),one,vec.begin(),one,mpi_communicator, &constrainedNodeValue); if (dealii::Utilities::MPI::this_mpi_process(mpi_communicator) == d_meanValueConstraintProcId) @@ -494,12 +497,14 @@ namespace dftfe 
d_meanValueConstraintProcId, mpi_communicator); - dftfe::utils::deviceKernelsGeneric::add( - vec.begin(), - d_meanValueConstraintDeviceVec.begin(), - constrainedNodeValue, - d_xLocalDof, - *d_deviceBlasHandlePtr); + // dftfe::utils::deviceKernelsGeneric::add( + // vec.begin(), + // d_meanValueConstraintDeviceVec.begin(), + // constrainedNodeValue, + // d_xLocalDof, + // *d_deviceBlasHandlePtr); //FIX ME + + d_BLASWrapperPtr->add(vec.begin(),d_meanValueConstraintDeviceVec.begin(),constrainedNodeValue,d_xLocalDof); // meanValueConstraintSetZero if (d_isMeanValueConstraintComputed) diff --git a/utils/BLASWrapperDevice.cu.cc b/utils/BLASWrapperDevice.cu.cc index b0addaf70..28ca6cd61 100644 --- a/utils/BLASWrapperDevice.cu.cc +++ b/utils/BLASWrapperDevice.cu.cc @@ -466,7 +466,7 @@ namespace dftfe BLASWrapper::xaxpy( const unsigned int n, const double * alpha, - double * x, + const double * x, const unsigned int incx, double * y, const unsigned int incy) const @@ -480,7 +480,7 @@ namespace dftfe BLASWrapper::xaxpy( const unsigned int n, const std::complex *alpha, - std::complex * x, + const std::complex * x, const unsigned int incx, std::complex * y, const unsigned int incy) const @@ -519,6 +519,25 @@ namespace dftfe + void + BLASWrapper::xdot( + const unsigned int N, + const double * X, + const unsigned int INCX, + const double * Y, + const unsigned int INCY, + const MPI_Comm & mpi_communicator, + double * result) const + { + double localResult = 0.0; + dftfe::utils::deviceBlasStatus_t status = cublasDdot( + d_deviceBlasHandle, int(N), X, int(INCX), Y, int(INCY), &localResult); + DEVICEBLAS_API_CHECK(status); + MPI_Allreduce( + &localResult, result, 1, MPI_DOUBLE, MPI_SUM, mpi_communicator); + + } + void BLASWrapper::xdot( const unsigned int N, @@ -531,6 +550,7 @@ namespace dftfe dftfe::utils::deviceBlasStatus_t status = cublasDdot( d_deviceBlasHandle, int(N), X, int(INCX), Y, int(INCY), result); DEVICEBLAS_API_CHECK(status); + } @@ -1060,6 +1080,7 @@ namespace dftfe 
localresult *= localresult; MPI_Allreduce( &localresult, result, 1, MPI_DOUBLE, MPI_SUM, mpi_communicator); + *result = std::sqrt(*result); } void @@ -1079,6 +1100,16 @@ namespace dftfe } + void + BLASWrapper::add( + double * y, + const double * x, + const double alpha, + const dftfe::size_type size) + { + xaxpy(size, &alpha, x, 1, y, 1); + } + template void BLASWrapper::xscal( diff --git a/utils/BLASWrapperDevice.hip.cc b/utils/BLASWrapperDevice.hip.cc index 24a985030..dbd99ebd1 100644 --- a/utils/BLASWrapperDevice.hip.cc +++ b/utils/BLASWrapperDevice.hip.cc @@ -508,7 +508,7 @@ namespace dftfe BLASWrapper::xaxpy( const unsigned int n, const double * alpha, - double * x, + const double * x, const unsigned int incx, double * y, const unsigned int incy) const @@ -522,7 +522,7 @@ namespace dftfe BLASWrapper::xaxpy( const unsigned int n, const std::complex *alpha, - std::complex * x, + const std::complex * x, const unsigned int incx, std::complex * y, const unsigned int incy) const @@ -577,6 +577,24 @@ namespace dftfe DEVICEBLAS_API_CHECK(status); } + void + BLASWrapper::xdot( + const unsigned int N, + const double * X, + const unsigned int INCX, + const double * Y, + const unsigned int INCY, + const MPI_Comm & mpi_communicator, + double * result) const + { + double localResult = 0.0; + dftfe::utils::deviceBlasStatus_t status = hipblasDdot( + d_deviceBlasHandle, int(N), X, int(INCX), Y, int(INCY), &localResult); + DEVICEBLAS_API_CHECK(status); + MPI_Allreduce( + &localResult, result, 1, MPI_DOUBLE, MPI_SUM, mpi_communicator); + } + void BLASWrapper::xdot( const unsigned int N, @@ -1103,6 +1121,7 @@ namespace dftfe localresult *= localresult; MPI_Allreduce( &localresult, result, 1, MPI_DOUBLE, MPI_SUM, mpi_communicator); + *result = std::sqrt(*result); } void @@ -1218,6 +1237,16 @@ namespace dftfe copyFromVecStartingContiguousBlockIds); } + void + BLASWrapper::add( + double * y, + const double * x, + const double alpha, + const dftfe::size_type size) + { + xaxpy(size, 
&alpha, x, 1, y, 1); + } + + template void diff --git a/utils/BLASWrapperHost.cc b/utils/BLASWrapperHost.cc index 42089df5a..fffbaf3c4 100644 --- a/utils/BLASWrapperHost.cc +++ b/utils/BLASWrapperHost.cc @@ -309,7 +309,7 @@ namespace dftfe BLASWrapper::xaxpy( const unsigned int n, const double * alpha, - double * x, + const double * x, const unsigned int incx, double * y, const unsigned int incy) const @@ -321,7 +321,7 @@ namespace dftfe BLASWrapper::xaxpy( const unsigned int n, const std::complex *alpha, - std::complex * x, + const std::complex * x, const unsigned int incx, std::complex * y, const unsigned int incy) const @@ -363,6 +363,15 @@ namespace dftfe dsymv_(&UPLO, &N, alpha, A, &LDA, X, &INCX, beta, C, &INCY); } + void + BLASWrapper::add( + double * y, + const double * x, + const double alpha, + const dftfe::size_type size) + { + xaxpy(size, &alpha, x, 1, y, 1); + } void BLASWrapper::xgemmBatched( From b88551b02f0074d24007d110fad7dfe1042cd9b4 Mon Sep 17 00:00:00 2001 From: Kartick Ramakrishnan Date: Sat, 10 Feb 2024 17:40:13 +0530 Subject: [PATCH 04/24] Removed getDeviceHandle calls from more places --- include/BLASWrapper.h | 44 +++++- include/densityFirstOrderResponseCalculator.h | 6 +- include/linearAlgebraOperations.h | 4 +- include/linearSolverCGDevice.h | 25 ++-- include/linearSolverDevice.h | 11 +- include/poissonSolverProblemDevice.h | 29 ++-- ...mputeOutputDensityDirectionalDerivative.cc | 19 +-- ...nsityFirstOrderResponseCalculatorDevice.cc | 125 +++++++++--------- src/dft/dft.cc | 33 ++--- src/dft/kerker.cc | 12 +- src/dft/solveNSCF.cc | 23 ++-- src/dftOperator/kohnShamDFTOperatorDevice.cc | 4 +- src/poisson/poissonSolverProblemDevice.cc | 55 +++++--- src/solvers/linearSolverCGDevice.cc | 69 ++++++---- utils/BLASWrapperDevice.cu.cc | 71 ++++++++-- utils/BLASWrapperDevice.hip.cc | 78 +++++++++-- utils/BLASWrapperHost.cc | 56 +++++++- 17 files changed, 438 insertions(+), 226 deletions(-) diff --git a/include/BLASWrapper.h b/include/BLASWrapper.h
index 3cc807f8f..02a9b8632 100644 --- a/include/BLASWrapper.h +++ b/include/BLASWrapper.h @@ -188,7 +188,7 @@ namespace dftfe const double * Y, const unsigned int INCY, double * result) const; - // Real dot proeuct with all Reduce call + // Real dot product with MPI_Allreduce call void xdot(const unsigned int N, const double * X, @@ -211,7 +211,7 @@ namespace dftfe void xaxpy(const unsigned int n, const double * alpha, - const double * x, + const double * x, const unsigned int incx, double * y, const unsigned int incy) const; @@ -220,7 +220,7 @@ namespace dftfe void xaxpy(const unsigned int n, const std::complex *alpha, - const std::complex * x, + const std::complex *x, const unsigned int incx, std::complex * y, const unsigned int incy) const; @@ -233,6 +233,14 @@ namespace dftfe double * y, const unsigned int incy) const; + // Real copy of double data to float + void + xcopy(const unsigned int n, + double * x, + const unsigned int incx, + float * y, + const unsigned int incy) const; + // Complex double copy of data void xcopy(const unsigned int n, @@ -257,6 +265,13 @@ namespace dftfe std::complex * y, const unsigned int incy) const; + void + xcopy(const unsigned int n, + std::complex *x, + const unsigned int incx, + std::complex * y, + const unsigned int incy) const; + // Real double symmetric matrix-vector product void xsymv(const char UPLO, @@ -687,7 +702,7 @@ namespace dftfe double * result) const; // - // Real dot product + // Real dot product void xdot(const unsigned int N, const double * X, @@ -695,7 +710,7 @@ namespace dftfe const double * Y, const unsigned int INCY, const MPI_Comm & mpi_communicator, - double * result) const; + double * result) const; // Complex dot product void @@ -710,7 +725,7 @@ namespace dftfe void xaxpy(const unsigned int n, const double * alpha, - const double * x, + const double * x, const unsigned int incx, double * y, const unsigned int incy) const; @@ -719,7 +734,7 @@ namespace dftfe void xaxpy(const unsigned int n, const
std::complex *alpha, - const std::complex * x, + const std::complex *x, const unsigned int incx, std::complex * y, const unsigned int incy) const; @@ -732,6 +747,14 @@ namespace dftfe double * y, const unsigned int incy) const; + // Real copy of double data + void + xcopy(const unsigned int n, + double * x, + const unsigned int incx, + float * y, + const unsigned int incy) const; + // Complex double copy of data void xcopy(const unsigned int n, @@ -756,6 +779,13 @@ namespace dftfe std::complex * y, const unsigned int incy) const; + void + xcopy(const unsigned int n, + std::complex *x, + const unsigned int incx, + std::complex * y, + const unsigned int incy) const; + // Real double symmetric matrix-vector product void xsymv(const char UPLO, diff --git a/include/densityFirstOrderResponseCalculator.h b/include/densityFirstOrderResponseCalculator.h index 4f5bb94f6..e9c53c53e 100644 --- a/include/densityFirstOrderResponseCalculator.h +++ b/include/densityFirstOrderResponseCalculator.h @@ -22,6 +22,7 @@ #include "headers.h" #include "operator.h" #include "dftParameters.h" +#include #if defined(DFTFE_WITH_DEVICE) # include "operatorDevice.h" @@ -108,7 +109,10 @@ namespace dftfe const MPI_Comm & mpiCommParent, const MPI_Comm & interpoolcomm, const MPI_Comm & interBandGroupComm, - const dftParameters &dftParams); + const dftParameters &dftParams, + const std::shared_ptr< + dftfe::linearAlgebra::BLASWrapper> + BLASWrapperPtr); #endif } // namespace dftfe #endif diff --git a/include/linearAlgebraOperations.h b/include/linearAlgebraOperations.h index f9eb19567..6edfcae03 100644 --- a/include/linearAlgebraOperations.h +++ b/include/linearAlgebraOperations.h @@ -139,7 +139,7 @@ namespace dftfe void daxpy_(const unsigned int *n, const double * alpha, - const double * x, + const double * x, const unsigned int *incx, double * y, const unsigned int *incy); @@ -407,7 +407,7 @@ namespace dftfe void zaxpy_(const unsigned int * n, const std::complex *alpha, - const std::complex * x, + 
const std::complex *x, const unsigned int * incx, std::complex * y, const unsigned int * incy); diff --git a/include/linearSolverCGDevice.h b/include/linearSolverCGDevice.h index fe8aee48d..52902a5b3 100644 --- a/include/linearSolverCGDevice.h +++ b/include/linearSolverCGDevice.h @@ -23,6 +23,7 @@ # include # include # include +# include namespace dftfe { /** @@ -46,9 +47,13 @@ namespace dftfe * @param mpi_comm_domain domain mpi communicator * @param type enum specifying the choice of the linear solver */ - linearSolverCGDevice(const MPI_Comm & mpi_comm_parent, - const MPI_Comm & mpi_comm_domain, - const solverType type); + linearSolverCGDevice( + const MPI_Comm & mpi_comm_parent, + const MPI_Comm & mpi_comm_domain, + const solverType type, + const std::shared_ptr< + dftfe::linearAlgebra::BLASWrapper> + BLASWrapperPtr); /** * @brief Solve linear system, A*x=Rhs @@ -62,12 +67,11 @@ namespace dftfe * 2 - all debug output. */ void - solve(linearSolverProblemDevice & problem, - const double absTolerance, - const unsigned int maxNumberIterations, - dftfe::utils::deviceBlasHandle_t &handle, - const int debugLevel = 0, - bool distributeFlag = true); + solve(linearSolverProblemDevice &problem, + const double absTolerance, + const unsigned int maxNumberIterations, + const int debugLevel = 0, + bool distributeFlag = true); private: /// enum denoting the choice of the linear solver @@ -86,6 +90,9 @@ namespace dftfe const unsigned int n_mpi_processes; const unsigned int this_mpi_process; dealii::ConditionalOStream pcout; + std::shared_ptr< + dftfe::linearAlgebra::BLASWrapper> + d_BLASWrapperPtr; /** * @brief Combines precondition and dot product diff --git a/include/linearSolverDevice.h b/include/linearSolverDevice.h index 05950dac3..5a94d98f4 100644 --- a/include/linearSolverDevice.h +++ b/include/linearSolverDevice.h @@ -47,12 +47,11 @@ namespace dftfe * 2 - all debug output. 
*/ virtual void - solve(linearSolverProblemDevice & problem, - const double absTolerance, - const unsigned int maxNumberIterations, - dftfe::utils::deviceBlasHandle_t &handle, - const int debugLevel = 0, - bool distributeFlag = true) = 0; + solve(linearSolverProblemDevice &problem, + const double absTolerance, + const unsigned int maxNumberIterations, + const int debugLevel = 0, + bool distributeFlag = true) = 0; private: }; diff --git a/include/poissonSolverProblemDevice.h b/include/poissonSolverProblemDevice.h index aff8dab45..b8d62ce79 100644 --- a/include/poissonSolverProblemDevice.h +++ b/include/poissonSolverProblemDevice.h @@ -76,17 +76,19 @@ namespace dftfe const std::map> &smearedChargeValues, const unsigned int smearedChargeQuadratureId, const dftfe::utils::MemoryStorage - & rhoValues, - const std::shared_ptr> BLASWrapperPtr, - const bool isComputeDiagonalA = true, - const bool isComputeMeanValueConstraints = false, - const bool smearedNuclearCharges = false, - const bool isRhoValues = true, - const bool isGradSmearedChargeRhs = false, - const unsigned int smearedChargeGradientComponentId = 0, - const bool storeSmearedChargeRhs = false, - const bool reuseSmearedChargeRhs = false, - const bool reinitializeFastConstraints = false); + &rhoValues, + const std::shared_ptr< + dftfe::linearAlgebra::BLASWrapper> + BLASWrapperPtr, + const bool isComputeDiagonalA = true, + const bool isComputeMeanValueConstraints = false, + const bool smearedNuclearCharges = false, + const bool isRhoValues = true, + const bool isGradSmearedChargeRhs = false, + const unsigned int smearedChargeGradientComponentId = 0, + const bool storeSmearedChargeRhs = false, + const bool reuseSmearedChargeRhs = false, + const bool reinitializeFastConstraints = false); /** * @brief Compute A matrix multipled by x. 
@@ -320,8 +322,9 @@ namespace dftfe FEBasisOperations> d_basisOperationsPtr; /// - std::shared_ptr> - d_BLASWrapperPtr; + std::shared_ptr< + dftfe::linearAlgebra::BLASWrapper> + d_BLASWrapperPtr; bool d_isFastConstraintsInitialized; const MPI_Comm mpi_communicator; diff --git a/src/dft/computeOutputDensityDirectionalDerivative.cc b/src/dft/computeOutputDensityDirectionalDerivative.cc index 2976defac..68b54a1e2 100644 --- a/src/dft/computeOutputDensityDirectionalDerivative.cc +++ b/src/dft/computeOutputDensityDirectionalDerivative.cc @@ -65,7 +65,8 @@ namespace dftfe // set up linear solver Device linearSolverCGDevice CGSolverDevice(d_mpiCommParent, mpi_communicator, - linearSolverCGDevice::CG); + linearSolverCGDevice::CG, + d_BLASWrapperPtr); #endif @@ -130,12 +131,10 @@ namespace dftfe not d_dftParamsPtr->pinnedNodeForPBC) { #ifdef DFTFE_WITH_DEVICE - CGSolverDevice.solve( - d_phiTotalSolverProblemDevice, - d_dftParamsPtr->absPoissonSolverToleranceLRD, - d_dftParamsPtr->maxLinearSolverIterations, - d_kohnShamDFTOperatorDevicePtr->getDeviceBlasHandle(), - d_dftParamsPtr->verbosity); + CGSolverDevice.solve(d_phiTotalSolverProblemDevice, + d_dftParamsPtr->absPoissonSolverToleranceLRD, + d_dftParamsPtr->maxLinearSolverIterations, + d_dftParamsPtr->verbosity); #endif } else @@ -527,7 +526,8 @@ namespace dftfe d_mpiCommParent, interpoolcomm, interBandGroupComm, - *d_dftParamsPtr); + *d_dftParamsPtr, + d_BLASWrapperPtr); else computeRhoFirstOrderResponseDevice( @@ -550,7 +550,8 @@ namespace dftfe d_mpiCommParent, interpoolcomm, interBandGroupComm, - *d_dftParamsPtr); + *d_dftParamsPtr, + d_BLASWrapperPtr); } #endif if (!d_dftParamsPtr->useDevice) diff --git a/src/dft/densityFirstOrderResponseCalculatorDevice.cc b/src/dft/densityFirstOrderResponseCalculatorDevice.cc index 08945271a..6d742111b 100644 --- a/src/dft/densityFirstOrderResponseCalculatorDevice.cc +++ b/src/dft/densityFirstOrderResponseCalculatorDevice.cc @@ -134,7 +134,10 @@ namespace dftfe const MPI_Comm & 
mpiCommParent, const MPI_Comm & interpoolcomm, const MPI_Comm & interBandGroupComm, - const dftParameters &dftParams) + const dftParameters &dftParams, + const std::shared_ptr< + dftfe::linearAlgebra::BLASWrapper> + BLASWrapperPtr) { int this_process; MPI_Comm_rank(mpiCommParent, &this_process); @@ -238,7 +241,7 @@ namespace dftfe shapeFunctionValuesTransposedDevice.setValue(zero); - dftfe::utils::deviceKernelsGeneric::copyValueType1ArrToValueType2Arr( + BLASWrapperPtr->copyValueType1ArrToValueType2Arr( numNodesPerElement * numQuadPoints, (operatorMatrix.getShapeFunctionValuesTransposed(true)).begin(), shapeFunctionValuesTransposedDevice.begin()); @@ -272,16 +275,15 @@ namespace dftfe .template copyTo( densityMatDerFermiEnergyVecDevice); - dftfe::utils::deviceKernelsGeneric:: - stridedCopyToBlockConstantStride( - BVec, - totalNumWaveFunctions, - numLocalDofs, - jvec, - X + numLocalDofs * totalNumWaveFunctions * - ((dftParams.spinPolarized + 1) * kPoint + - spinIndex), - deviceFlattenedArrayXBlock.begin()); + BLASWrapperPtr->stridedCopyToBlockConstantStride( + BVec, + totalNumWaveFunctions, + numLocalDofs, + jvec, + X + + numLocalDofs * totalNumWaveFunctions * + ((dftParams.spinPolarized + 1) * kPoint + spinIndex), + deviceFlattenedArrayXBlock.begin()); deviceFlattenedArrayXBlock.updateGhostValues(); @@ -289,16 +291,15 @@ namespace dftfe ->distribute(deviceFlattenedArrayXBlock, BVec); - dftfe::utils::deviceKernelsGeneric:: - stridedCopyToBlockConstantStride( - BVec, - totalNumWaveFunctions, - numLocalDofs, - jvec, - XPrime + numLocalDofs * totalNumWaveFunctions * - ((dftParams.spinPolarized + 1) * kPoint + - spinIndex), - deviceFlattenedArrayXPrimeBlock.begin()); + BLASWrapperPtr->stridedCopyToBlockConstantStride( + BVec, + totalNumWaveFunctions, + numLocalDofs, + jvec, + XPrime + + numLocalDofs * totalNumWaveFunctions * + ((dftParams.spinPolarized + 1) * kPoint + spinIndex), + deviceFlattenedArrayXPrimeBlock.begin()); 
deviceFlattenedArrayXPrimeBlock.updateGhostValues(); @@ -318,16 +319,15 @@ namespace dftfe - dftfe::utils::deviceKernelsGeneric:: - stridedCopyToBlock( - BVec, - currentCellsBlockSize * numNodesPerElement, - deviceFlattenedArrayXBlock.begin(), - cellWaveFunctionMatrix.begin(), - (operatorMatrix - .getFlattenedArrayCellLocalProcIndexIdMap()) - .begin() + - startingCellId * numNodesPerElement); + BLASWrapperPtr->stridedCopyToBlock( + BVec, + currentCellsBlockSize * numNodesPerElement, + deviceFlattenedArrayXBlock.begin(), + cellWaveFunctionMatrix.begin(), + (operatorMatrix + .getFlattenedArrayCellLocalProcIndexIdMap()) + .begin() + + startingCellId * numNodesPerElement); NumberTypeLowPrec scalarCoeffAlpha = 1.0; NumberTypeLowPrec scalarCoeffBeta = 0.0; @@ -336,10 +336,10 @@ namespace dftfe int strideC = BVec * numQuadPoints; - dftfe::utils::deviceBlasWrapper::gemmStridedBatched( - operatorMatrix.getDeviceBlasHandle(), - dftfe::utils::DEVICEBLAS_OP_N, - dftfe::utils::DEVICEBLAS_OP_N, + + BLASWrapperPtr->xgemmStridedBatched( + 'N', + 'N', BVec, numQuadPoints, numNodesPerElement, @@ -357,21 +357,19 @@ namespace dftfe currentCellsBlockSize); - dftfe::utils::deviceKernelsGeneric:: - stridedCopyToBlock( - BVec, - currentCellsBlockSize * numNodesPerElement, - deviceFlattenedArrayXPrimeBlock.begin(), - cellWaveFunctionMatrix.begin(), - (operatorMatrix - .getFlattenedArrayCellLocalProcIndexIdMap()) - .begin() + - startingCellId * numNodesPerElement); - - dftfe::utils::deviceBlasWrapper::gemmStridedBatched( - operatorMatrix.getDeviceBlasHandle(), - dftfe::utils::DEVICEBLAS_OP_N, - dftfe::utils::DEVICEBLAS_OP_N, + BLASWrapperPtr->stridedCopyToBlock( + BVec, + currentCellsBlockSize * numNodesPerElement, + deviceFlattenedArrayXPrimeBlock.begin(), + cellWaveFunctionMatrix.begin(), + (operatorMatrix + .getFlattenedArrayCellLocalProcIndexIdMap()) + .begin() + + startingCellId * numNodesPerElement); + + BLASWrapperPtr->xgemmStridedBatched( + 'N', + 'N', BVec, numQuadPoints, 
numNodesPerElement, @@ -388,6 +386,7 @@ namespace dftfe strideC, currentCellsBlockSize); + #ifdef DFTFE_WITH_DEVICE_LANG_CUDA computeRhoResponseFromInterpolatedValues<<< (BVec + (dftfe::utils::DEVICE_BLOCK_SIZE - 1)) / @@ -415,10 +414,9 @@ namespace dftfe XPrimeQuadsDevice.begin())); #endif - dftfe::utils::deviceBlasWrapper::gemm( - operatorMatrix.getDeviceBlasHandle(), - dftfe::utils::DEVICEBLAS_OP_N, - dftfe::utils::DEVICEBLAS_OP_N, + BLASWrapperPtr->xgemm( + 'N', + 'N', 1, currentCellsBlockSize * numQuadPoints, BVec, @@ -432,10 +430,9 @@ namespace dftfe startingCellId * numQuadPoints, 1); - dftfe::utils::deviceBlasWrapper::gemm( - operatorMatrix.getDeviceBlasHandle(), - dftfe::utils::DEVICEBLAS_OP_N, - dftfe::utils::DEVICEBLAS_OP_N, + BLASWrapperPtr->xgemm( + 'N', + 'N', 1, currentCellsBlockSize * numQuadPoints, BVec, @@ -647,7 +644,10 @@ namespace dftfe const MPI_Comm & mpiCommParent, const MPI_Comm & interpoolcomm, const MPI_Comm & interBandGroupComm, - const dftParameters &dftParams); + const dftParameters &dftParams, + const std::shared_ptr< + dftfe::linearAlgebra::BLASWrapper> + BLASWrapperPtr); template void computeRhoFirstOrderResponseDevice( @@ -672,5 +672,8 @@ namespace dftfe const MPI_Comm & mpiCommParent, const MPI_Comm & interpoolcomm, const MPI_Comm & interBandGroupComm, - const dftParameters &dftParams); + const dftParameters &dftParams, + const std::shared_ptr< + dftfe::linearAlgebra::BLASWrapper> + BLASWrapperPtr); } // namespace dftfe diff --git a/src/dft/dft.cc b/src/dft/dft.cc index a5bffa1d4..7b1c7dff4 100644 --- a/src/dft/dft.cc +++ b/src/dft/dft.cc @@ -2018,7 +2018,8 @@ namespace dftfe #ifdef DFTFE_WITH_DEVICE linearSolverCGDevice CGSolverDevice(d_mpiCommParent, mpi_communicator, - linearSolverCGDevice::CG); + linearSolverCGDevice::CG, + d_BLASWrapperPtr); #endif // @@ -2617,12 +2618,10 @@ namespace dftfe not d_dftParamsPtr->pinnedNodeForPBC) { #ifdef DFTFE_WITH_DEVICE - CGSolverDevice.solve( - d_phiTotalSolverProblemDevice, - 
d_dftParamsPtr->absLinearSolverTolerance, - d_dftParamsPtr->maxLinearSolverIterations, - kohnShamDFTEigenOperatorDevice.getDeviceBlasHandle(), - d_dftParamsPtr->verbosity); + CGSolverDevice.solve(d_phiTotalSolverProblemDevice, + d_dftParamsPtr->absLinearSolverTolerance, + d_dftParamsPtr->maxLinearSolverIterations, + d_dftParamsPtr->verbosity); #endif } else @@ -3387,12 +3386,10 @@ namespace dftfe false, true); - CGSolverDevice.solve( - d_phiTotalSolverProblemDevice, - d_dftParamsPtr->absLinearSolverTolerance, - d_dftParamsPtr->maxLinearSolverIterations, - kohnShamDFTEigenOperatorDevice.getDeviceBlasHandle(), - d_dftParamsPtr->verbosity); + CGSolverDevice.solve(d_phiTotalSolverProblemDevice, + d_dftParamsPtr->absLinearSolverTolerance, + d_dftParamsPtr->maxLinearSolverIterations, + d_dftParamsPtr->verbosity); #endif } else @@ -3606,12 +3603,10 @@ namespace dftfe false, true); - CGSolverDevice.solve( - d_phiTotalSolverProblemDevice, - d_dftParamsPtr->absLinearSolverTolerance, - d_dftParamsPtr->maxLinearSolverIterations, - kohnShamDFTEigenOperatorDevice.getDeviceBlasHandle(), - d_dftParamsPtr->verbosity); + CGSolverDevice.solve(d_phiTotalSolverProblemDevice, + d_dftParamsPtr->absLinearSolverTolerance, + d_dftParamsPtr->maxLinearSolverIterations, + d_dftParamsPtr->verbosity); #endif } else diff --git a/src/dft/kerker.cc b/src/dft/kerker.cc index 0c540c5b8..56b309999 100644 --- a/src/dft/kerker.cc +++ b/src/dft/kerker.cc @@ -80,13 +80,11 @@ namespace dftfe d_dftParamsPtr->poissonGPU) { #ifdef DFTFE_WITH_DEVICE - CGSolverDevice.solve( - kerkerPreconditionedResidualSolverProblemDevice, - d_dftParamsPtr->absLinearSolverToleranceHelmholtz, - d_dftParamsPtr->maxLinearSolverIterationsHelmholtz, - d_kohnShamDFTOperatorDevicePtr->getDeviceBlasHandle(), - d_dftParamsPtr->verbosity, - false); + CGSolverDevice.solve(kerkerPreconditionedResidualSolverProblemDevice, + d_dftParamsPtr->absLinearSolverToleranceHelmholtz, + d_dftParamsPtr->maxLinearSolverIterationsHelmholtz, + 
d_dftParamsPtr->verbosity, + false); #endif } else diff --git a/src/dft/solveNSCF.cc b/src/dft/solveNSCF.cc index 2352bd305..c626994bb 100644 --- a/src/dft/solveNSCF.cc +++ b/src/dft/solveNSCF.cc @@ -59,7 +59,8 @@ namespace dftfe #ifdef DFTFE_WITH_DEVICE linearSolverCGDevice CGSolverDevice(d_mpiCommParent, mpi_communicator, - linearSolverCGDevice::CG); + linearSolverCGDevice::CG, + d_BLASWrapperPtr); #endif @@ -251,12 +252,10 @@ namespace dftfe not d_dftParamsPtr->pinnedNodeForPBC) { #ifdef DFTFE_WITH_DEVICE - CGSolverDevice.solve( - d_phiTotalSolverProblemDevice, - d_dftParamsPtr->absLinearSolverTolerance, - d_dftParamsPtr->maxLinearSolverIterations, - kohnShamDFTEigenOperatorDevice.getDeviceBlasHandle(), - d_dftParamsPtr->verbosity); + CGSolverDevice.solve(d_phiTotalSolverProblemDevice, + d_dftParamsPtr->absLinearSolverTolerance, + d_dftParamsPtr->maxLinearSolverIterations, + d_dftParamsPtr->verbosity); #endif } else @@ -968,12 +967,10 @@ namespace dftfe false, true); - CGSolverDevice.solve( - d_phiTotalSolverProblemDevice, - d_dftParamsPtr->absLinearSolverTolerance, - d_dftParamsPtr->maxLinearSolverIterations, - kohnShamDFTEigenOperatorDevice.getDeviceBlasHandle(), - d_dftParamsPtr->verbosity); + CGSolverDevice.solve(d_phiTotalSolverProblemDevice, + d_dftParamsPtr->absLinearSolverTolerance, + d_dftParamsPtr->maxLinearSolverIterations, + d_dftParamsPtr->verbosity); #endif } else diff --git a/src/dftOperator/kohnShamDFTOperatorDevice.cc b/src/dftOperator/kohnShamDFTOperatorDevice.cc index a570734d2..0b8949b15 100644 --- a/src/dftOperator/kohnShamDFTOperatorDevice.cc +++ b/src/dftOperator/kohnShamDFTOperatorDevice.cc @@ -17,8 +17,8 @@ // @author Phani Motamarri, Sambit Das // -#include //KFIX ME -#include +#include //KFIX ME +#include #include #include #include diff --git a/src/poisson/poissonSolverProblemDevice.cc b/src/poisson/poissonSolverProblemDevice.cc index c4eac1eef..545ab22e3 100644 --- a/src/poisson/poissonSolverProblemDevice.cc +++ 
b/src/poisson/poissonSolverProblemDevice.cc @@ -80,17 +80,19 @@ namespace dftfe const std::map> &smearedChargeValues, const unsigned int smearedChargeQuadratureId, const dftfe::utils::MemoryStorage - & rhoValues, - const std::shared_ptr> BLASWrapperPtr, - const bool isComputeDiagonalA, - const bool isComputeMeanValueConstraint, - const bool smearedNuclearCharges, - const bool isRhoValues, - const bool isGradSmearedChargeRhs, - const unsigned int smearedChargeGradientComponentId, - const bool storeSmearedChargeRhs, - const bool reuseSmearedChargeRhs, - const bool reinitializeFastConstraints) + &rhoValues, + const std::shared_ptr< + dftfe::linearAlgebra::BLASWrapper> + BLASWrapperPtr, + const bool isComputeDiagonalA, + const bool isComputeMeanValueConstraint, + const bool smearedNuclearCharges, + const bool isRhoValues, + const bool isGradSmearedChargeRhs, + const unsigned int smearedChargeGradientComponentId, + const bool storeSmearedChargeRhs, + const bool reuseSmearedChargeRhs, + const bool reinitializeFastConstraints) { int this_process; MPI_Comm_rank(mpi_communicator, &this_process); @@ -125,7 +127,7 @@ namespace dftfe d_smearedChargeGradientComponentId = smearedChargeGradientComponentId; d_isStoreSmearedChargeRhs = storeSmearedChargeRhs; d_isReuseSmearedChargeRhs = reuseSmearedChargeRhs; - d_BLASWrapperPtr = BLASWrapperPtr; + d_BLASWrapperPtr = BLASWrapperPtr; d_nLocalCells = d_matrixFreeDataPtr->n_cell_batches(); d_xLocalDof = d_xDevice.locallyOwnedSize() * d_xDevice.numVectors(); d_xLen = d_xDevice.localSize() * d_xDevice.numVectors(); @@ -452,15 +454,21 @@ namespace dftfe { // -\sum_{i \neq o} a_i * u_i computation which involves summation across // MPI tasks - const unsigned int one = 1; - double constrainedNodeValue = 0.0; - //dftfe::utils::deviceKernelsGeneric::dot( - // d_meanValueConstraintDeviceVec.begin(), - // vec.begin(), - // d_xLocalDof, - // mpi_communicator, - // *d_deviceBlasHandlePtr); //FIX ME - 
d_BLASWrapperPtr->xdot(d_xLocalDof,d_meanValueConstraintDeviceVec.begin(),one,vec.begin(),one,mpi_communicator, &constrainedNodeValue); + const unsigned int one = 1; + double constrainedNodeValue = 0.0; + // dftfe::utils::deviceKernelsGeneric::dot( + // d_meanValueConstraintDeviceVec.begin(), + // vec.begin(), + // d_xLocalDof, + // mpi_communicator, + // *d_deviceBlasHandlePtr); //FIX ME + d_BLASWrapperPtr->xdot(d_xLocalDof, + d_meanValueConstraintDeviceVec.begin(), + one, + vec.begin(), + one, + mpi_communicator, + &constrainedNodeValue); if (dealii::Utilities::MPI::this_mpi_process(mpi_communicator) == d_meanValueConstraintProcId) @@ -504,7 +512,10 @@ namespace dftfe // d_xLocalDof, // *d_deviceBlasHandlePtr); //FIX ME - d_BLASWrapperPtr->add(vec.begin(),d_meanValueConstraintDeviceVec.begin(),constrainedNodeValue,d_xLocalDof); + d_BLASWrapperPtr->add(vec.begin(), + d_meanValueConstraintDeviceVec.begin(), + constrainedNodeValue, + d_xLocalDof); // meanValueConstraintSetZero if (d_isMeanValueConstraintComputed) diff --git a/src/solvers/linearSolverCGDevice.cc b/src/solvers/linearSolverCGDevice.cc index ba3ae5601..a11ec5212 100644 --- a/src/solvers/linearSolverCGDevice.cc +++ b/src/solvers/linearSolverCGDevice.cc @@ -245,9 +245,13 @@ namespace dftfe } // constructor - linearSolverCGDevice::linearSolverCGDevice(const MPI_Comm & mpi_comm_parent, - const MPI_Comm & mpi_comm_domain, - const solverType type) + linearSolverCGDevice::linearSolverCGDevice( + const MPI_Comm & mpi_comm_parent, + const MPI_Comm & mpi_comm_domain, + const solverType type, + const std::shared_ptr< + dftfe::linearAlgebra::BLASWrapper> + BLASWrapperPtr) : d_mpiCommParent(mpi_comm_parent) , mpi_communicator(mpi_comm_domain) , d_type(type) @@ -256,18 +260,17 @@ namespace dftfe dealii::Utilities::MPI::this_mpi_process(mpi_comm_domain)) , pcout(std::cout, (dealii::Utilities::MPI::this_mpi_process(mpi_comm_parent) == 0)) + , d_BLASWrapperPtr(BLASWrapperPtr) {} // solve void - 
linearSolverCGDevice::solve( - linearSolverProblemDevice & problem, - const double absTolerance, - const unsigned int maxNumberIterations, - dftfe::utils::deviceBlasHandle_t &deviceBlasHandle, - const int debugLevel, - bool distributeFlag) + linearSolverCGDevice::solve(linearSolverProblemDevice &problem, + const double absTolerance, + const unsigned int maxNumberIterations, + const int debugLevel, + bool distributeFlag) { int this_process; MPI_Comm_rank(mpi_communicator, &this_process); @@ -328,23 +331,27 @@ namespace dftfe double alpha = 0.0; double beta = 0.0; double delta = 0.0; - // r = Ax problem.computeAX(d_rvec, x); // r = Ax - rhs - dftfe::utils::deviceKernelsGeneric::add(d_rvec.begin(), - rhsDevice.begin(), - -1., - d_xLocalDof, - deviceBlasHandle); - + // dftfe::utils::deviceKernelsGeneric::add(d_rvec.begin(), + // rhsDevice.begin(), + // -1., + // d_xLocalDof, + // deviceBlasHandle); + d_BLASWrapperPtr->add(d_rvec.begin(), + rhsDevice.begin(), + -1, + d_xLocalDof); // res = r.r - res = dftfe::utils::deviceKernelsGeneric::l2_norm(d_rvec.begin(), - d_xLocalDof, - mpi_communicator, - deviceBlasHandle); + // res = dftfe::utils::deviceKernelsGeneric::l2_norm(d_rvec.begin(), + // d_xLocalDof, + // mpi_communicator, + // deviceBlasHandle); + d_BLASWrapperPtr->xnrm2( + d_xLocalDof, d_rvec.begin(), 1, mpi_communicator, &res); initial_res = res; if (res < absTolerance) @@ -385,12 +392,20 @@ namespace dftfe problem.computeAX(d_dvec, d_qvec); // alpha = q.d - alpha = - dftfe::utils::deviceKernelsGeneric::dot(d_qvec.begin(), - d_dvec.begin(), - d_xLocalDof, - mpi_communicator, - deviceBlasHandle); + // alpha = + // dftfe::utils::deviceKernelsGeneric::dot(d_qvec.begin(), + // d_dvec.begin(), + // d_xLocalDof, + // mpi_communicator, + // deviceBlasHandle); + + d_BLASWrapperPtr->xdot(d_xLocalDof, + d_qvec.begin(), + 1, + d_dvec.begin(), + 1, + mpi_communicator, + &alpha); AssertThrow(std::abs(alpha) != 0., dealii::ExcMessage("Division by zero\n")); diff --git 
a/utils/BLASWrapperDevice.cu.cc b/utils/BLASWrapperDevice.cu.cc index 28ca6cd61..1433721a8 100644 --- a/utils/BLASWrapperDevice.cu.cc +++ b/utils/BLASWrapperDevice.cu.cc @@ -466,7 +466,7 @@ namespace dftfe BLASWrapper::xaxpy( const unsigned int n, const double * alpha, - const double * x, + const double * x, const unsigned int incx, double * y, const unsigned int incy) const @@ -480,7 +480,7 @@ namespace dftfe BLASWrapper::xaxpy( const unsigned int n, const std::complex *alpha, - const std::complex * x, + const std::complex *x, const unsigned int incx, std::complex * y, const unsigned int incy) const @@ -529,13 +529,12 @@ namespace dftfe const MPI_Comm & mpi_communicator, double * result) const { - double localResult = 0.0; - dftfe::utils::deviceBlasStatus_t status = cublasDdot( + double localResult = 0.0; + dftfe::utils::deviceBlasStatus_t status = cublasDdot( d_deviceBlasHandle, int(N), X, int(INCX), Y, int(INCY), &localResult); DEVICEBLAS_API_CHECK(status); - MPI_Allreduce( + MPI_Allreduce( &localResult, result, 1, MPI_DOUBLE, MPI_SUM, mpi_communicator); - } void @@ -550,7 +549,6 @@ namespace dftfe dftfe::utils::deviceBlasStatus_t status = cublasDdot( d_deviceBlasHandle, int(N), X, int(INCX), Y, int(INCY), result); DEVICEBLAS_API_CHECK(status); - } @@ -1080,7 +1078,7 @@ namespace dftfe localresult *= localresult; MPI_Allreduce( &localresult, result, 1, MPI_DOUBLE, MPI_SUM, mpi_communicator); - *result = std::sqrt(*result); + *result = std::sqrt(*result); } void @@ -1103,12 +1101,12 @@ namespace dftfe void BLASWrapper::add( double * y, - const double * x, - const double alpha, - const dftfe::size_type size) + const double * x, + const double alpha, + const dftfe::size_type size) { xaxpy(size, &alpha, x, 1, y, 1); - } + } template void @@ -1439,6 +1437,14 @@ namespace dftfe double * copyToVecBlock, const dftfe::global_size_type *copyFromVecStartingContiguousBlockIds); + template void + BLASWrapper::stridedCopyToBlock( + const dftfe::size_type contiguousBlockSize, 
+ const dftfe::size_type numContiguousBlocks, + const double * copyFromVec, + float * copyToVecBlock, + const dftfe::global_size_type *copyFromVecStartingContiguousBlockIds); + template void BLASWrapper::stridedCopyToBlock( const dftfe::size_type contiguousBlockSize, @@ -1455,6 +1461,14 @@ namespace dftfe std::complex * copyToVecBlock, const dftfe::global_size_type *copyFromVecStartingContiguousBlockIds); + template void + BLASWrapper::stridedCopyToBlock( + const dftfe::size_type contiguousBlockSize, + const dftfe::size_type numContiguousBlocks, + const std::complex * copyFromVec, + std::complex * copyToVecBlock, + const dftfe::global_size_type *copyFromVecStartingContiguousBlockIds); + template void BLASWrapper::stridedCopyToBlock( const dftfe::size_type contiguousBlockSize, @@ -1463,16 +1477,49 @@ namespace dftfe std::complex * copyToVecBlock, const dftfe::global_size_type *copyFromVecStartingContiguousBlockIds); + + template void BLASWrapper:: copyValueType1ArrToValueType2Arr(const dftfe::size_type size, const double * valueType1Arr, std::complex * valueType2Arr); + + template void + BLASWrapper:: + copyValueType1ArrToValueType2Arr(const dftfe::size_type size, + const double * valueType1Arr, + std::complex * valueType2Arr); + template void BLASWrapper:: copyValueType1ArrToValueType2Arr(const dftfe::size_type size, const double * valueType1Arr, double * valueType2Arr); + template void + BLASWrapper:: + copyValueType1ArrToValueType2Arr(const dftfe::size_type size, + const double * valueType1Arr, + float * valueType2Arr); + + template void + BLASWrapper:: + stridedCopyToBlockConstantStride(const dftfe::size_type blockSizeTo, + const dftfe::size_type blockSizeFrom, + const dftfe::size_type numBlocks, + const dftfe::size_type startingId, + const double * copyFromVec, + double * copyToVec); + + template void + BLASWrapper:: + stridedCopyToBlockConstantStride(const dftfe::size_type blockSizeTo, + const dftfe::size_type blockSizeFrom, + const dftfe::size_type numBlocks, 
+ const dftfe::size_type startingId, + const std::complex *copyFromVec, + std::complex * copyToVec); + } // End of namespace linearAlgebra } // End of namespace dftfe diff --git a/utils/BLASWrapperDevice.hip.cc b/utils/BLASWrapperDevice.hip.cc index dbd99ebd1..d5a0fde54 100644 --- a/utils/BLASWrapperDevice.hip.cc +++ b/utils/BLASWrapperDevice.hip.cc @@ -508,7 +508,7 @@ namespace dftfe BLASWrapper::xaxpy( const unsigned int n, const double * alpha, - const double * x, + const double * x, const unsigned int incx, double * y, const unsigned int incy) const @@ -522,7 +522,7 @@ namespace dftfe BLASWrapper::xaxpy( const unsigned int n, const std::complex *alpha, - const std::complex * x, + const std::complex *x, const unsigned int incx, std::complex * y, const unsigned int incy) const @@ -587,11 +587,11 @@ namespace dftfe const MPI_Comm & mpi_communicator, double * result) const { - double localResult = 0.0; - dftfe::utils::deviceBlasStatus_t status = hipblasDdot( + double localResult = 0.0; + dftfe::utils::deviceBlasStatus_t status = hipblasDdot( d_deviceBlasHandle, int(N), X, int(INCX), Y, int(INCY), &localResult); DEVICEBLAS_API_CHECK(status); - MPI_Allreduce( + MPI_Allreduce( &localResult, result, 1, MPI_DOUBLE, MPI_SUM, mpi_communicator); } @@ -1121,7 +1121,7 @@ namespace dftfe localresult *= localresult; MPI_Allreduce( &localresult, result, 1, MPI_DOUBLE, MPI_SUM, mpi_communicator); - *result = std::sqrt(*result); + *result = std::sqrt(*result); } void @@ -1240,12 +1240,12 @@ namespace dftfe void BLASWrapper::add( double * y, - const double * x, - const double alpha, - const dftfe::size_type size) + const double * x, + const double alpha, + const dftfe::size_type size) { xaxpy(size, &alpha, x, 1, y, 1); - } + } template @@ -1487,6 +1487,14 @@ namespace dftfe double * copyToVecBlock, const dftfe::global_size_type *copyFromVecStartingContiguousBlockIds); + template void + BLASWrapper::stridedCopyToBlock( + const dftfe::size_type contiguousBlockSize, + const 
dftfe::size_type numContiguousBlocks, + const double * copyFromVec, + float * copyToVecBlock, + const dftfe::global_size_type *copyFromVecStartingContiguousBlockIds); + template void BLASWrapper::stridedCopyToBlock( const dftfe::size_type contiguousBlockSize, @@ -1503,6 +1511,14 @@ namespace dftfe std::complex * copyToVecBlock, const dftfe::global_size_type *copyFromVecStartingContiguousBlockIds); + template void + BLASWrapper::stridedCopyToBlock( + const dftfe::size_type contiguousBlockSize, + const dftfe::size_type numContiguousBlocks, + const std::complex * copyFromVec, + std::complex * copyToVecBlock, + const dftfe::global_size_type *copyFromVecStartingContiguousBlockIds); + template void BLASWrapper::stridedCopyToBlock( const dftfe::size_type contiguousBlockSize, @@ -1511,5 +1527,47 @@ namespace dftfe std::complex * copyToVecBlock, const dftfe::global_size_type *copyFromVecStartingContiguousBlockIds); + + template void + BLASWrapper:: + copyValueType1ArrToValueType2Arr(const dftfe::size_type size, + const double * valueType1Arr, + std::complex * valueType2Arr); + + template void + BLASWrapper:: + copyValueType1ArrToValueType2Arr(const dftfe::size_type size, + const double * valueType1Arr, + std::complex * valueType2Arr); + + template void + BLASWrapper:: + copyValueType1ArrToValueType2Arr(const dftfe::size_type size, + const double * valueType1Arr, + double * valueType2Arr); + template void + BLASWrapper:: + copyValueType1ArrToValueType2Arr(const dftfe::size_type size, + const double * valueType1Arr, + float * valueType2Arr); + template void + BLASWrapper:: + stridedCopyToBlockConstantStride(const dftfe::size_type blockSizeTo, + const dftfe::size_type blockSizeFrom, + const dftfe::size_type numBlocks, + const dftfe::size_type startingId, + const double * copyFromVec, + double * copyToVec); + + template void + BLASWrapper:: + stridedCopyToBlockConstantStride(const dftfe::size_type blockSizeTo, + const dftfe::size_type blockSizeFrom, + const dftfe::size_type 
numBlocks, + const dftfe::size_type startingId, + const std::complex *copyFromVec, + std::complex * copyToVec); + + } // End of namespace linearAlgebra } // End of namespace dftfe diff --git a/utils/BLASWrapperHost.cc b/utils/BLASWrapperHost.cc index fffbaf3c4..86897ecd1 100644 --- a/utils/BLASWrapperHost.cc +++ b/utils/BLASWrapperHost.cc @@ -110,6 +110,17 @@ namespace dftfe dcopy_(&n, x, &incx, y, &incy); } + void + BLASWrapper::xcopy( + const unsigned int n, + double * x, + const unsigned int incx, + float * y, + const unsigned int incy) const + { + std::memcpy(x, y, n * sizeof(x)); + } + void BLASWrapper::xcopy( const unsigned int n, @@ -242,6 +253,17 @@ namespace dftfe zcopy_(&n, x, &incx, y, &incy); } + void + BLASWrapper::xcopy( + const unsigned int n, + std::complex *x, + const unsigned int incx, + std::complex * y, + const unsigned int incy) const + { + std::memcpy(x, y, n * sizeof(x)); + } + void BLASWrapper::xcopy( const unsigned int n, @@ -309,7 +331,7 @@ namespace dftfe BLASWrapper::xaxpy( const unsigned int n, const double * alpha, - const double * x, + const double * x, const unsigned int incx, double * y, const unsigned int incy) const @@ -321,7 +343,7 @@ namespace dftfe BLASWrapper::xaxpy( const unsigned int n, const std::complex *alpha, - const std::complex * x, + const std::complex *x, const unsigned int incx, std::complex * y, const unsigned int incy) const @@ -366,12 +388,12 @@ namespace dftfe void BLASWrapper::add( double * y, - const double * x, - const double alpha, - const dftfe::size_type size) + const double * x, + const double alpha, + const dftfe::size_type size) { xaxpy(size, &alpha, x, 1, y, 1); - } + } void BLASWrapper::xgemmBatched( @@ -713,6 +735,14 @@ namespace dftfe float * copyToVecBlock, const dftfe::global_size_type *copyFromVecStartingContiguousBlockIds); + // template void + // BLASWrapper::stridedCopyToBlock( + // const dftfe::size_type contiguousBlockSize, + // const dftfe::size_type numContiguousBlocks, + // const double * 
copyFromVec, + // float * copyToVecBlock, + // const dftfe::global_size_type *copyFromVecStartingContiguousBlockIds); + template void BLASWrapper::stridedCopyToBlock( const dftfe::size_type contiguousBlockSize, @@ -729,6 +759,14 @@ namespace dftfe std::complex * copyToVecBlock, const dftfe::global_size_type *copyFromVecStartingContiguousBlockIds); + // template void + // BLASWrapper::stridedCopyToBlock( + // const dftfe::size_type contiguousBlockSize, + // const dftfe::size_type numContiguousBlocks, + // const std::complex * copyFromVec, + // std::complex * copyToVecBlock, + // const dftfe::global_size_type *copyFromVecStartingContiguousBlockIds); + template void BLASWrapper:: copyValueType1ArrToValueType2Arr(const dftfe::size_type size, @@ -740,6 +778,12 @@ namespace dftfe copyValueType1ArrToValueType2Arr(const dftfe::size_type size, const double * valueType1Arr, double * valueType2Arr); + template void + BLASWrapper:: + copyValueType1ArrToValueType2Arr(const dftfe::size_type size, + const double * valueType1Arr, + float * valueType2Arr); + // axpyStridedBlockAtomicAdd template void BLASWrapper::axpyStridedBlockAtomicAdd( From feaff25edaff7ba81309aaa94880258d57f8a13f Mon Sep 17 00:00:00 2001 From: Kartick Ramakrishnan Date: Sat, 10 Feb 2024 18:38:21 +0530 Subject: [PATCH 05/24] Removed getDeviceBlasHandle from KSDFTOperator --- ...hogonalizedSubspaceIterationSolverDevice.h | 20 +++++++---- include/forceWfcContractionsDevice.h | 6 +++- include/kohnShamDFTOperatorDevice.h | 2 -- include/operatorDevice.h | 3 -- src/dft/kohnShamEigenSolve.cc | 3 ++ src/dftOperator/kohnShamDFTOperatorDevice.cc | 7 +--- ...onfigurationalForceEEshelbyFPSPFnlLinFE.cc | 1 + .../computeStressEEshelbyEPSPEnlEk.cc | 1 + src/force/forceWfcContractionsDevice.cc | 35 ++++++++++++++----- ...ogonalizedSubspaceIterationSolverDevice.cc | 35 +++++++++---------- 10 files changed, 67 insertions(+), 46 deletions(-) diff --git a/include/chebyshevOrthogonalizedSubspaceIterationSolverDevice.h 
b/include/chebyshevOrthogonalizedSubspaceIterationSolverDevice.h index d666a58fb..50e2b7ab5 100644 --- a/include/chebyshevOrthogonalizedSubspaceIterationSolverDevice.h +++ b/include/chebyshevOrthogonalizedSubspaceIterationSolverDevice.h @@ -25,6 +25,7 @@ # include "operatorDevice.h" # include "elpaScalaManager.h" # include "dftParameters.h" +# include namespace dftfe { @@ -58,9 +59,11 @@ namespace dftfe * @brief Solve a generalized eigen problem. */ double - solve(operatorDFTDeviceClass & operatorMatrix, - elpaScalaManager & elpaScala, - dataTypes::number * eigenVectorsFlattenedDevice, + solve(operatorDFTDeviceClass & operatorMatrix, + const std::shared_ptr> &BLASWrapperPtr, + elpaScalaManager & elpaScala, + dataTypes::number * eigenVectorsFlattenedDevice, dataTypes::number * eigenVectorsRotFracDensityFlattenedDevice, const unsigned int flattenedSize, const unsigned int totalNumberWaveFunctions, @@ -78,8 +81,10 @@ namespace dftfe * @brief Used for XL-BOMD. */ void - solveNoRR(operatorDFTDeviceClass & operatorMatrix, - elpaScalaManager & elpaScala, + solveNoRR(operatorDFTDeviceClass & operatorMatrix, + const std::shared_ptr> &BLASWrapperPtr, + elpaScalaManager & elpaScala, dataTypes::number * eigenVectorsFlattenedDevice, const unsigned int flattenedSize, const unsigned int totalNumberWaveFunctions, @@ -95,7 +100,10 @@ namespace dftfe */ void densityMatrixEigenBasisFirstOrderResponse( - operatorDFTDeviceClass & operatorMatrix, + operatorDFTDeviceClass &operatorMatrix, + const std::shared_ptr< + dftfe::linearAlgebra::BLASWrapper> + & BLASWrapperPtr, dataTypes::number * eigenVectorsFlattenedDevice, const unsigned int flattenedSize, const unsigned int totalNumberWaveFunctions, diff --git a/include/forceWfcContractionsDevice.h b/include/forceWfcContractionsDevice.h index 4f0771dfe..de6d3599c 100644 --- a/include/forceWfcContractionsDevice.h +++ b/include/forceWfcContractionsDevice.h @@ -23,6 +23,7 @@ # include "operatorDevice.h" # include "dftParameters.h" # include 
"FEBasisOperations.h" +# include namespace dftfe { @@ -34,7 +35,10 @@ namespace dftfe dftfe::basis::FEBasisOperations> - & basisOperationsPtr, + &basisOperationsPtr, + const std::shared_ptr< + dftfe::linearAlgebra::BLASWrapper> + & BLASWrapperPtr, operatorDFTDeviceClass & operatorMatrix, const dataTypes::number * X, const unsigned int spinPolarizedFlag, diff --git a/include/kohnShamDFTOperatorDevice.h b/include/kohnShamDFTOperatorDevice.h index 2399ada00..c02964e55 100644 --- a/include/kohnShamDFTOperatorDevice.h +++ b/include/kohnShamDFTOperatorDevice.h @@ -59,8 +59,6 @@ namespace dftfe void destroyDeviceBlasHandle(); - dftfe::utils::deviceBlasHandle_t & - getDeviceBlasHandle(); const double * getSqrtMassVec(); diff --git a/include/operatorDevice.h b/include/operatorDevice.h index fa755c409..48cd526be 100644 --- a/include/operatorDevice.h +++ b/include/operatorDevice.h @@ -65,9 +65,6 @@ namespace dftfe virtual void destroyDeviceBlasHandle() = 0; - virtual dftfe::utils::deviceBlasHandle_t & - getDeviceBlasHandle() = 0; - virtual const double * getSqrtMassVec() = 0; diff --git a/src/dft/kohnShamEigenSolve.cc b/src/dft/kohnShamEigenSolve.cc index d13555b85..8a36a291b 100644 --- a/src/dft/kohnShamEigenSolve.cc +++ b/src/dft/kohnShamEigenSolve.cc @@ -544,6 +544,7 @@ namespace dftfe { subspaceIterationSolverDevice.solveNoRR( kohnShamDFTEigenOperator, + d_BLASWrapperPtr, elpaScala, d_eigenVectorsFlattenedDevice.begin() + ((1 + d_dftParamsPtr->spinPolarized) * kPointIndex + spinType) * @@ -565,6 +566,7 @@ namespace dftfe spinType] = subspaceIterationSolverDevice.solve( kohnShamDFTEigenOperator, + d_BLASWrapperPtr, elpaScala, d_eigenVectorsFlattenedDevice.begin() + ((1 + d_dftParamsPtr->spinPolarized) * kPointIndex + spinType) * @@ -756,6 +758,7 @@ namespace dftfe subspaceIterationSolverDevice.densityMatrixEigenBasisFirstOrderResponse( kohnShamDFTEigenOperator, + d_BLASWrapperPtr, d_eigenVectorsDensityMatrixPrimeFlattenedDevice.begin() + ((1 + 
d_dftParamsPtr->spinPolarized) * kPointIndex + spinType) * d_numEigenValues * diff --git a/src/dftOperator/kohnShamDFTOperatorDevice.cc b/src/dftOperator/kohnShamDFTOperatorDevice.cc index 0b8949b15..7b3790e78 100644 --- a/src/dftOperator/kohnShamDFTOperatorDevice.cc +++ b/src/dftOperator/kohnShamDFTOperatorDevice.cc @@ -310,12 +310,7 @@ namespace dftfe dftfe::utils::deviceBlasWrapper::destroy(d_deviceBlasHandle); } - template - dftfe::utils::deviceBlasHandle_t & - kohnShamDFTOperatorDeviceClass::getDeviceBlasHandle() - { - return d_deviceBlasHandle; - } + template const double * diff --git a/src/force/configurationalForceCompute/configurationalForceEEshelbyFPSPFnlLinFE.cc b/src/force/configurationalForceCompute/configurationalForceEEshelbyFPSPFnlLinFE.cc index 64a091d42..cb20c1662 100644 --- a/src/force/configurationalForceCompute/configurationalForceEEshelbyFPSPFnlLinFE.cc +++ b/src/force/configurationalForceCompute/configurationalForceEEshelbyFPSPFnlLinFE.cc @@ -246,6 +246,7 @@ namespace dftfe forceDevice::wfcContractionsForceKernelsAllH( dftPtr->d_basisOperationsPtrDevice, + dftPtr->d_BLASWrapperPtr, kohnShamDFTEigenOperatorDevice, dftPtr->d_eigenVectorsFlattenedDevice.begin(), d_dftParams.spinPolarized, diff --git a/src/force/configurationalStressCompute/computeStressEEshelbyEPSPEnlEk.cc b/src/force/configurationalStressCompute/computeStressEEshelbyEPSPEnlEk.cc index e96051838..24886e1cc 100644 --- a/src/force/configurationalStressCompute/computeStressEEshelbyEPSPEnlEk.cc +++ b/src/force/configurationalStressCompute/computeStressEEshelbyEPSPEnlEk.cc @@ -245,6 +245,7 @@ namespace dftfe forceDevice::wfcContractionsForceKernelsAllH( dftPtr->d_basisOperationsPtrDevice, + dftPtr->d_BLASWrapperPtr, kohnShamDFTEigenOperatorDevice, dftPtr->d_eigenVectorsFlattenedDevice.begin(), d_dftParams.spinPolarized, diff --git a/src/force/forceWfcContractionsDevice.cc b/src/force/forceWfcContractionsDevice.cc index be22e7144..f9e7aec74 100644 --- 
a/src/force/forceWfcContractionsDevice.cc +++ b/src/force/forceWfcContractionsDevice.cc @@ -29,6 +29,8 @@ #include #include + + namespace dftfe { namespace forceDevice @@ -423,7 +425,10 @@ namespace dftfe dftfe::basis::FEBasisOperations> - & basisOperationsPtr, + &basisOperationsPtr, + const std::shared_ptr< + dftfe::linearAlgebra::BLASWrapper> + & BLASWrapperPtr, operatorDFTDeviceClass & operatorMatrix, distributedDeviceVec &Xb, const unsigned int BVec, @@ -605,7 +610,7 @@ namespace dftfe dftfe::utils::deviceBlasWrapper::gemm( - operatorMatrix.getDeviceBlasHandle(), + BLASWrapperPtr->getDeviceBlasHandle(), dftfe::utils::DEVICEBLAS_OP_N, dftfe::utils::DEVICEBLAS_OP_N, 1, @@ -629,7 +634,7 @@ namespace dftfe const int strideBNLP = 0; dftfe::utils::deviceBlasWrapper::gemmStridedBatched( - operatorMatrix.getDeviceBlasHandle(), + BLASWrapperPtr->getDeviceBlasHandle(), dftfe::utils::DEVICEBLAS_OP_N, dftfe::utils::DEVICEBLAS_OP_N, BVec, @@ -652,7 +657,7 @@ namespace dftfe // shapeGradRef^T*invJacobian^T dftfe::utils::deviceBlasWrapper::gemmStridedBatched( - operatorMatrix.getDeviceBlasHandle(), + BLASWrapperPtr->getDeviceBlasHandle(), dftfe::utils::DEVICEBLAS_OP_N, dftfe::utils::DEVICEBLAS_OP_N, numNodesPerElement, @@ -683,7 +688,7 @@ namespace dftfe numNodesPerElement * 3 * numQuadsNLP; dftfe::utils::deviceBlasWrapper::gemmStridedBatched( - operatorMatrix.getDeviceBlasHandle(), + BLASWrapperPtr->getDeviceBlasHandle(), dftfe::utils::DEVICEBLAS_OP_N, dftfe::utils::DEVICEBLAS_OP_N, BVec, @@ -711,6 +716,9 @@ namespace dftfe void nlpPsiContractionD( operatorDFTDeviceClass &operatorMatrix, + const std::shared_ptr< + dftfe::linearAlgebra::BLASWrapper> + &BLASWrapperPtr, #ifdef USE_COMPLEX const dftfe::utils::MemoryStorage @@ -814,7 +822,7 @@ namespace dftfe #endif dftfe::utils::deviceBlasWrapper::gemm( - operatorMatrix.getDeviceBlasHandle(), + BLASWrapperPtr->getDeviceBlasHandle(), dftfe::utils::DEVICEBLAS_OP_N, dftfe::utils::DEVICEBLAS_OP_N, 1, @@ -891,7 +899,7 @@ namespace 
dftfe # endif dftfe::utils::deviceBlasWrapper::gemm( - operatorMatrix.getDeviceBlasHandle(), + BLASWrapperPtr->getDeviceBlasHandle(), dftfe::utils::DEVICEBLAS_OP_N, dftfe::utils::DEVICEBLAS_OP_N, 1, @@ -935,7 +943,10 @@ namespace dftfe dftfe::basis::FEBasisOperations> - & basisOperationsPtr, + &basisOperationsPtr, + const std::shared_ptr< + dftfe::linearAlgebra::BLASWrapper> + & BLASWrapperPtr, operatorDFTDeviceClass & operatorMatrix, distributedDeviceVec &deviceFlattenedArrayBlock, distributedDeviceVec &projectorKetTimesVectorD, @@ -1033,6 +1044,7 @@ namespace dftfe // double kernel1_time = MPI_Wtime(); interpolatePsiComputeELocWfcEshelbyTensorD(basisOperationsPtr, + BLASWrapperPtr, operatorMatrix, deviceFlattenedArrayBlock, numPsi, @@ -1097,6 +1109,7 @@ namespace dftfe { nlpPsiContractionD( operatorMatrix, + BLASWrapperPtr, #ifdef USE_COMPLEX psiQuadsNLPD, #endif @@ -1139,7 +1152,10 @@ namespace dftfe dftfe::basis::FEBasisOperations> - & basisOperationsPtr, + &basisOperationsPtr, + const std::shared_ptr< + dftfe::linearAlgebra::BLASWrapper> + & BLASWrapperPtr, operatorDFTDeviceClass & operatorMatrix, const dataTypes::number * X, const unsigned int spinPolarizedFlag, @@ -1361,6 +1377,7 @@ namespace dftfe devicePortedForceKernelsAllD( basisOperationsPtr, + BLASWrapperPtr, operatorMatrix, deviceFlattenedArrayBlock, projectorKetTimesVectorD, diff --git a/src/solvers/eigenSolvers/chebyshevOrthogonalizedSubspaceIterationSolverDevice.cc b/src/solvers/eigenSolvers/chebyshevOrthogonalizedSubspaceIterationSolverDevice.cc index 3f6289361..f1672f65f 100644 --- a/src/solvers/eigenSolvers/chebyshevOrthogonalizedSubspaceIterationSolverDevice.cc +++ b/src/solvers/eigenSolvers/chebyshevOrthogonalizedSubspaceIterationSolverDevice.cc @@ -154,7 +154,10 @@ namespace dftfe // double chebyshevOrthogonalizedSubspaceIterationSolverDevice::solve( - operatorDFTDeviceClass & operatorMatrix, + operatorDFTDeviceClass &operatorMatrix, + const std::shared_ptr< + 
dftfe::linearAlgebra::BLASWrapper> + & BLASWrapperPtr, elpaScalaManager & elpaScala, dataTypes::number * eigenVectorsFlattenedDevice, dataTypes::number * eigenVectorsRotFracDensityFlattenedDevice, @@ -178,7 +181,7 @@ namespace dftfe dealii::TimerOutput::wall_times); dftfe::utils::deviceBlasHandle_t &deviceBlasHandle = - operatorMatrix.getDeviceBlasHandle(); + BLASWrapperPtr->getDeviceBlasHandle(); // // allocate memory for full flattened array on device and fill it up @@ -566,19 +569,7 @@ namespace dftfe // if (d_dftParams.measureOnlyChebyTime) // exit(0); - /* - int inc=1; - double result=0.0; - dftfe::utils::deviceBlasWrapper::nrm2(deviceBlasHandle, - flattenedSize, - eigenVectorsFlattenedDevice, - inc, - &result); - result=result*result; - result=dealii::Utilities::MPI::sum(result,operatorMatrix.getMPICommunicator()); - std::cout<<"l2 norm Chebyshev filtered x: - "<> + & BLASWrapperPtr, elpaScalaManager & elpaScala, dataTypes::number * eigenVectorsFlattenedDevice, const unsigned int flattenedSize, @@ -761,7 +755,7 @@ namespace dftfe const bool useMixedPrecOverall) { dftfe::utils::deviceBlasHandle_t &deviceBlasHandle = - operatorMatrix.getDeviceBlasHandle(); + BLASWrapperPtr->getDeviceBlasHandle(); // // allocate memory for full flattened array on device and fill it up @@ -1088,7 +1082,10 @@ namespace dftfe void chebyshevOrthogonalizedSubspaceIterationSolverDevice:: densityMatrixEigenBasisFirstOrderResponse( - operatorDFTDeviceClass & operatorMatrix, + operatorDFTDeviceClass &operatorMatrix, + const std::shared_ptr< + dftfe::linearAlgebra::BLASWrapper> + & BLASWrapperPtr, dataTypes::number * eigenVectorsFlattenedDevice, const unsigned int flattenedSize, const unsigned int totalNumberWaveFunctions, @@ -1112,7 +1109,7 @@ namespace dftfe "Density matrix first order response on Device"); dftfe::utils::deviceBlasHandle_t &deviceBlasHandle = - operatorMatrix.getDeviceBlasHandle(); + BLASWrapperPtr->getDeviceBlasHandle(); // // allocate memory for full flattened array 
on device and fill it up From 00fd615046fb457e66c9b6075f6deba5329368be Mon Sep 17 00:00:00 2001 From: Sambit Das Date: Sat, 10 Feb 2024 17:43:17 -0500 Subject: [PATCH 06/24] wip --- CMakeLists.txt | 6 +- include/densityCalculator.h | 5 +- include/densityFirstOrderResponseCalculator.h | 128 ++- setupDevelopPetsc.sh | 8 +- ...mputeOutputDensityDirectionalDerivative.cc | 176 +--- src/dft/density.cc | 2 +- .../densityFirstOrderResponseCalculator.cc | 527 +++++++++++ .../densityFirstOrderResponseCalculatorCPU.cc | 871 ------------------ ...nsityFirstOrderResponseCalculatorDevice.cc | 676 -------------- ...rstOrderResponseCalculatorDeviceKernels.cc | 185 ++++ src/dft/initBoundaryConditions.cc | 20 +- 11 files changed, 836 insertions(+), 1768 deletions(-) create mode 100644 src/dft/densityFirstOrderResponseCalculator.cc delete mode 100644 src/dft/densityFirstOrderResponseCalculatorCPU.cc delete mode 100644 src/dft/densityFirstOrderResponseCalculatorDevice.cc create mode 100644 src/dft/densityFirstOrderResponseCalculatorDeviceKernels.cc diff --git a/CMakeLists.txt b/CMakeLists.txt index 644aea0f9..444fb9d24 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -48,7 +48,7 @@ SET(TARGET_SRC ./src/dft/vselfBinsManager.cc ./src/dft/energyCalculator.cc ./src/dft/densityCalculator.cc - ./src/dft/densityFirstOrderResponseCalculatorCPU.cc + ./src/dft/densityFirstOrderResponseCalculator.cc ./src/excManager/excDensityBaseClass.cpp ./src/excManager/excDensityLDAClass.cpp ./src/excManager/excWavefunctionBaseClass.cpp @@ -168,7 +168,7 @@ SET(DEVICE_SRC ./utils/DeviceKernelsGeneric.cc ./utils/DeviceDirectCCLWrapper.cc ./src/dft/densityCalculatorDeviceKernels.cc - ./src/dft/densityFirstOrderResponseCalculatorDevice.cc + ./src/dft/densityFirstOrderResponseCalculatorDeviceKernels.cc ./src/dftOperator/operatorDevice.cc ./src/dftOperator/kohnShamDFTOperatorDevice.cc ./src/dft/solveVselfInBinsDevice.cc @@ -197,7 +197,7 @@ SET(DEVICE_SRC ./utils/DeviceKernelsGeneric.cc 
./utils/DeviceDirectCCLWrapper.cc ./src/dft/densityCalculatorDeviceKernels.cc - ./src/dft/densityFirstOrderResponseCalculatorDevice.cc + ./src/dft/densityFirstOrderResponseCalculatorDeviceKernels.cc ./src/dftOperator/operatorDevice.cc ./src/dftOperator/kohnShamDFTOperatorDevice.cc ./src/dft/solveVselfInBinsDevice.cc diff --git a/include/densityCalculator.h b/include/densityCalculator.h index 5f73db5a8..1b07d5266 100644 --- a/include/densityCalculator.h +++ b/include/densityCalculator.h @@ -15,11 +15,10 @@ // --------------------------------------------------------------------- // -#ifndef densityCalculatorDevice_H_ -#define densityCalculatorDevice_H_ +#ifndef densityCalculator_H_ +#define densityCalculator_H_ #include -#include #include "dftParameters.h" #include "FEBasisOperations.h" diff --git a/include/densityFirstOrderResponseCalculator.h b/include/densityFirstOrderResponseCalculator.h index 4f5bb94f6..39ce276ed 100644 --- a/include/densityFirstOrderResponseCalculator.h +++ b/include/densityFirstOrderResponseCalculator.h @@ -20,95 +20,79 @@ #define densityFirstOrderResponseCalculator_H_ #include "headers.h" -#include "operator.h" #include "dftParameters.h" +#include "FEBasisOperations.h" -#if defined(DFTFE_WITH_DEVICE) -# include "operatorDevice.h" -# include "dftfeDataTypes.h" -#endif namespace dftfe { - template + template void - computeRhoFirstOrderResponseCPU( - const NumberType * X, - const NumberType * XPrime, - const std::vector> & densityMatDerFermiEnergy, + computeRhoFirstOrderResponse( + const dftfe::utils::MemoryStorage & X, + const dftfe::utils::MemoryStorage & XPrime, const unsigned int totalNumWaveFunctions, - const unsigned int numLocalDofs, - operatorDFTClass & operatorMatrix, - const unsigned int matrixFreeDofhandlerIndex, - const dealii::DoFHandler<3> & dofHandler, - const unsigned int totalLocallyOwnedCells, - const unsigned int numberNodesPerElement, - const unsigned int numQuadPoints, - const std::vector & kPointWeights, - std::map> 
&rhoResponseValuesHam, - std::map> &rhoResponseValuesFermiEnergy, - std::map> - &rhoResponseValuesHamSpinPolarized, - std::map> - & rhoResponseValuesFermiEnergySpinPolarized, - const MPI_Comm & mpiCommParent, - const MPI_Comm & interpoolcomm, - const MPI_Comm & interBandGroupComm, - const dftParameters &dftParams); - - template - void - computeRhoFirstOrderResponseCPUMixedPrec( - const NumberType * X, - const NumberType * XPrime, const std::vector> & densityMatDerFermiEnergy, - const unsigned int totalNumWaveFunctions, - const unsigned int numLocalDofs, - operatorDFTClass & operatorMatrix, - const unsigned int matrixFreeDofhandlerIndex, - const dealii::DoFHandler<3> & dofHandler, - const unsigned int totalLocallyOwnedCells, - const unsigned int numberNodesPerElement, - const unsigned int numQuadPoints, + std::shared_ptr< + dftfe::basis::FEBasisOperations> + &basisOperationsPtr, + std::shared_ptr> + & BLASWrapperPtr, + const unsigned int matrixFreeDofhandlerIndex, + const unsigned int quadratureIndex, const std::vector & kPointWeights, - std::map> &rhoResponseValuesHam, - std::map> &rhoResponseValuesFermiEnergy, - std::map> - &rhoResponseValuesHamSpinPolarized, - std::map> - & rhoResponseValuesFermiEnergySpinPolarized, + std::vector> &rhoResponseValuesHam, + std::vector> &rhoResponseValuesFermiEnergy, const MPI_Comm & mpiCommParent, const MPI_Comm & interpoolcomm, const MPI_Comm & interBandGroupComm, const dftParameters &dftParams); + template + void + computeRhoResponseFromInterpolatedValues( + std::shared_ptr< + dftfe::basis:: + FEBasisOperations> + &basisOperationsPtr, + std::shared_ptr< + dftfe::linearAlgebra::BLASWrapper> + & BLASWrapperPtr, + const std::pair cellRange, + const std::pair vecRange, + double * onesVec, + double * partialOccupVecPrime, + NumberType * wfcQuadPointData, + NumberType * wfcPrimeQuadPointData, + double * rhoResponseHamCellsWfcContributions, + double * rhoResponseFermiEnergyCellsWfcContributions, + double * rhoResponseHam, + double * 
rhoResponseFermiEnergy); + #if defined(DFTFE_WITH_DEVICE) - template + template void - computeRhoFirstOrderResponseDevice( - const NumberType * X, - const NumberType * XPrime, - const std::vector> & densityMatDerFermiEnergy, - const unsigned int totalNumWaveFunctions, - const unsigned int numLocalDofs, - operatorDFTDeviceClass & operatorMatrix, - const unsigned int matrixFreeDofhandlerIndex, - const dealii::DoFHandler<3> & dofHandler, - const unsigned int totalLocallyOwnedCells, - const unsigned int numberNodesPerElement, - const unsigned int numQuadPoints, - const std::vector & kPointWeights, - std::map> &rhoResponseValuesHam, - std::map> &rhoResponseValuesFermiEnergy, - std::map> - &rhoResponseValuesHamSpinPolarized, - std::map> - & rhoResponseValuesFermiEnergySpinPolarized, - const MPI_Comm & mpiCommParent, - const MPI_Comm & interpoolcomm, - const MPI_Comm & interBandGroupComm, - const dftParameters &dftParams); + computeRhoResponseFromInterpolatedValues( + std::shared_ptr< + dftfe::basis::FEBasisOperations> + &basisOperationsPtr, + std::shared_ptr< + dftfe::linearAlgebra::BLASWrapper> + & BLASWrapperPtr, + const std::pair cellRange, + const std::pair vecRange, + double * onesVec, + double * partialOccupVecPrime, + NumberType * wfcQuadPointData, + NumberType * wfcPrimeQuadPointData, + double * rhoResponseHamCellsWfcContributions, + double * rhoResponseFermiEnergyCellsWfcContributions, + double * rhoResponseHam, + double * rhoResponseFermiEnergy); #endif + } // namespace dftfe #endif diff --git a/setupDevelopPetsc.sh b/setupDevelopPetsc.sh index 2e5e47c45..f550abd6f 100755 --- a/setupDevelopPetsc.sh +++ b/setupDevelopPetsc.sh @@ -49,7 +49,7 @@ withCustomizedDealii=ON #Compiler options and flags cxx_compiler=/sw/pkgs/arc/stacks/gcc/10.3.0/openmpi/4.1.6rc2/bin/mpicxx #sets DCMAKE_CXX_COMPILER -cxx_flags="-std=c++14 -march=native -fopenmp -fPIC" #sets DCMAKE_CXX_FLAGS +cxx_flags="-std=c++17 -march=native -fopenmp -fPIC" #sets DCMAKE_CXX_FLAGS cxx_flagsRelease="-O2" 
#sets DCMAKE_CXX_FLAGS_RELEASE device_flags="-arch=sm_70" # set DCMAKE_CXX_CUDA_FLAGS #(only applicable for withGPU=ON) @@ -79,7 +79,7 @@ out=`echo "$build_type" | tr '[:upper:]' '[:lower:]'` function cmake_configure() { if [ "$gpuLang" = "cuda" ]; then - cmake -DCMAKE_CXX_STANDARD=14 -DCMAKE_CXX_COMPILER=$cxx_compiler\ + cmake -DCMAKE_CXX_STANDARD=17 -DCMAKE_CXX_COMPILER=$cxx_compiler\ -DCMAKE_CXX_FLAGS="$cxx_flags"\ -DCMAKE_CXX_FLAGS_RELEASE="$cxx_flagsRelease" \ -DCMAKE_BUILD_TYPE=$build_type -DDEAL_II_DIR=$dealiiDir \ @@ -93,7 +93,7 @@ function cmake_configure() { -DWITH_TESTING=$testing -DMINIMAL_COMPILE=$minimal_compile\ -DHIGHERQUAD_PSP=$withHigherQuadPSP $1 elif [ "$gpuLang" = "hip" ]; then - cmake -DCMAKE_CXX_STANDARD=14 -DCMAKE_CXX_COMPILER=$cxx_compiler\ + cmake -DCMAKE_CXX_STANDARD=17 -DCMAKE_CXX_COMPILER=$cxx_compiler\ -DCMAKE_CXX_FLAGS="$cxx_flags"\ -DCMAKE_CXX_FLAGS_RELEASE="$cxx_flagsRelease" \ -DCMAKE_BUILD_TYPE=$build_type -DDEAL_II_DIR=$dealiiDir \ @@ -107,7 +107,7 @@ function cmake_configure() { -DWITH_TESTING=$testing -DMINIMAL_COMPILE=$minimal_compile\ -DHIGHERQUAD_PSP=$withHigherQuadPSP $1 else - cmake -DCMAKE_CXX_STANDARD=14 -DCMAKE_CXX_COMPILER=$cxx_compiler\ + cmake -DCMAKE_CXX_STANDARD=17 -DCMAKE_CXX_COMPILER=$cxx_compiler\ -DCMAKE_CXX_FLAGS="$cxx_flags"\ -DCMAKE_CXX_FLAGS_RELEASE="$cxx_flagsRelease" \ -DCMAKE_BUILD_TYPE=$build_type -DDEAL_II_DIR=$dealiiDir \ diff --git a/src/dft/computeOutputDensityDirectionalDerivative.cc b/src/dft/computeOutputDensityDirectionalDerivative.cc index 2e8b14001..6c4258be1 100644 --- a/src/dft/computeOutputDensityDirectionalDerivative.cc +++ b/src/dft/computeOutputDensityDirectionalDerivative.cc @@ -431,17 +431,16 @@ namespace dftfe fvFermiEnergySpin1 = 0; } - std::map> + std::vector> rhoResponseHamPRefinedNodalData; - std::map> + std::vector> rhoResponseFermiEnergyPRefinedNodalData; - std::map> - rhoResponseHamPRefinedNodalDataSpinPolarized; - std::map> - 
rhoResponseFermiEnergyPRefinedNodalDataSpinPolarized; // initialize variables to be used later + d_basisOperationsPtrHost->reinit(0, 0, d_gllQuadratureId, false); + const unsigned int numLocallyOwnedCells = + d_basisOperationsPtrHost->nCells(); const unsigned int dofs_per_cell = d_dofHandlerRhoNodal.get_fe().dofs_per_cell; typename dealii::DoFHandler<3>::active_cell_iterator @@ -453,7 +452,7 @@ namespace dftfe matrix_free_data.get_quadrature(d_gllQuadratureId); const unsigned int numQuadPoints = quadrature_formula.size(); - // get access to quadrature point coordinates and 2p DoFHandler nodal points + // get access to quadrature point coordinates and density DoFHandler nodal points const std::vector> &quadraturePointCoor = quadrature_formula.get_points(); const std::vector> &supportPointNaturalCoor = @@ -478,26 +477,18 @@ namespace dftfe } // allocate the storage to compute 2p nodal values from wavefunctions - for (; cell != endc; ++cell) - { - if (cell->is_locally_owned()) - { - const dealii::CellId cellId = cell->id(); - rhoResponseHamPRefinedNodalData[cellId] = - std::vector(numQuadPoints, 0.0); - rhoResponseFermiEnergyPRefinedNodalData[cellId] = - std::vector(numQuadPoints, 0.0); - if (d_dftParamsPtr->spinPolarized == 1) - { - const dealii::CellId cellId = cell->id(); - rhoResponseHamPRefinedNodalDataSpinPolarized[cellId] = - std::vector(2 * numQuadPoints, 0.0); - rhoResponseFermiEnergyPRefinedNodalDataSpinPolarized[cellId] = - std::vector(2 * numQuadPoints, 0.0); - } - } - } + rhoResponseHamPRefinedNodalData.resize(d_dftParamsPtr->spinPolarized == 1 ? 2 : + 1); + rhoResponseFermiEnergyPRefinedNodalData.resize(d_dftParamsPtr->spinPolarized == 1 ? 
2 : + 1); + + for (unsigned int iComp = 0; iComp < rhoResponseHamPRefinedNodalData.size(); + ++iComp) + { + rhoResponseHamPRefinedNodalData[iComp].resize(numLocallyOwnedCells*numQuadPoints,0); + rhoResponseFermiEnergyPRefinedNodalData[iComp].resize(numLocallyOwnedCells*numQuadPoints,0); + } // compute first order density response at nodal locations of 2p @@ -505,48 +496,17 @@ namespace dftfe #ifdef DFTFE_WITH_DEVICE if (d_dftParamsPtr->useDevice) { - if (d_dftParamsPtr->singlePrecLRD) - computeRhoFirstOrderResponseDevice( - d_eigenVectorsFlattenedDevice.begin(), - d_eigenVectorsDensityMatrixPrimeFlattenedDevice.begin(), - d_densityMatDerFermiEnergy, + computeRhoFirstOrderResponse( + d_eigenVectorsFlattenedDevice, + d_eigenVectorsDensityMatrixPrimeDevice, d_numEigenValues, - matrix_free_data.get_vector_partitioner()->locally_owned_size(), - kohnShamDFTEigenOperatorDevice, - d_eigenDofHandlerIndex, - dofHandler, - matrix_free_data.n_physical_cells(), - matrix_free_data.get_dofs_per_cell(), - quadrature_formula.size(), - d_kPointWeights, - rhoResponseHamPRefinedNodalData, - rhoResponseFermiEnergyPRefinedNodalData, - rhoResponseHamPRefinedNodalDataSpinPolarized, - rhoResponseFermiEnergyPRefinedNodalDataSpinPolarized, - d_mpiCommParent, - interpoolcomm, - interBandGroupComm, - *d_dftParamsPtr); - else - computeRhoFirstOrderResponseDevice( - d_eigenVectorsFlattenedDevice.begin(), - d_eigenVectorsDensityMatrixPrimeFlattenedDevice.begin(), d_densityMatDerFermiEnergy, - d_numEigenValues, - matrix_free_data.get_vector_partitioner()->locally_owned_size(), - kohnShamDFTEigenOperatorDevice, - d_eigenDofHandlerIndex, - dofHandler, - matrix_free_data.n_physical_cells(), - matrix_free_data.get_dofs_per_cell(), - quadrature_formula.size(), + d_BLASWrapperPtrDevice, + d_densityDofHandlerIndex, + d_gllQuadratureId, d_kPointWeights, rhoResponseHamPRefinedNodalData, rhoResponseFermiEnergyPRefinedNodalData, - rhoResponseHamPRefinedNodalDataSpinPolarized, - 
rhoResponseFermiEnergyPRefinedNodalDataSpinPolarized, d_mpiCommParent, interpoolcomm, interBandGroupComm, @@ -555,47 +515,17 @@ namespace dftfe #endif if (!d_dftParamsPtr->useDevice) { - if (d_dftParamsPtr->singlePrecLRD) - computeRhoFirstOrderResponseCPUMixedPrec( - d_eigenVectorsFlattenedHost.data(), - d_eigenVectorsDensityMatrixPrimeHost.data(), - d_densityMatDerFermiEnergy, + computeRhoFirstOrderResponse( + d_eigenVectorsFlattenedHost, + d_eigenVectorsDensityMatrixPrimeHost, d_numEigenValues, - matrix_free_data.get_vector_partitioner()->locally_owned_size(), - kohnShamDFTEigenOperatorCPU, - d_eigenDofHandlerIndex, - dofHandler, - matrix_free_data.n_physical_cells(), - matrix_free_data.get_dofs_per_cell(), - quadrature_formula.size(), - d_kPointWeights, - rhoResponseHamPRefinedNodalData, - rhoResponseFermiEnergyPRefinedNodalData, - rhoResponseHamPRefinedNodalDataSpinPolarized, - rhoResponseFermiEnergyPRefinedNodalDataSpinPolarized, - d_mpiCommParent, - interpoolcomm, - interBandGroupComm, - *d_dftParamsPtr); - else - computeRhoFirstOrderResponseCPU( - d_eigenVectorsFlattenedHost.data(), - d_eigenVectorsDensityMatrixPrimeHost.data(), d_densityMatDerFermiEnergy, - d_numEigenValues, - matrix_free_data.get_vector_partitioner()->locally_owned_size(), - kohnShamDFTEigenOperatorCPU, - d_eigenDofHandlerIndex, - dofHandler, - matrix_free_data.n_physical_cells(), - matrix_free_data.get_dofs_per_cell(), - quadrature_formula.size(), + d_BLASWrapperPtrHost, + d_densityDofHandlerIndex, + d_gllQuadratureId, d_kPointWeights, rhoResponseHamPRefinedNodalData, rhoResponseFermiEnergyPRefinedNodalData, - rhoResponseHamPRefinedNodalDataSpinPolarized, - rhoResponseFermiEnergyPRefinedNodalDataSpinPolarized, d_mpiCommParent, interpoolcomm, interBandGroupComm, @@ -608,22 +538,19 @@ namespace dftfe endcP = d_dofHandlerRhoNodal.end(); + unsigned int iCell = 0; for (; cellP != endcP; ++cellP) if (cellP->is_locally_owned()) { std::vector cell_dof_indices( dofs_per_cell); 
cellP->get_dof_indices(cell_dof_indices); - const std::vector &nodalValuesResponseHam = - rhoResponseHamPRefinedNodalData.find(cellP->id())->second; + const double * nodalValuesResponseHam = + rhoResponseHamPRefinedNodalData[0].data() + iCell * dofs_per_cell; - const std::vector &nodalValuesResponseFermiEnergy = - rhoResponseFermiEnergyPRefinedNodalData.find(cellP->id())->second; - Assert( - nodalValuesResponseHam.size() == dofs_per_cell, - dealii::ExcMessage( - "Number of nodes in 2p DoFHandler does not match with data stored in rhoNodal Values variable")); + const double * nodalValuesResponseFermiEnergy = + rhoResponseFermiEnergyPRefinedNodalData[0].data() + iCell * dofs_per_cell; for (unsigned int iNode = 0; iNode < dofs_per_cell; ++iNode) { @@ -640,6 +567,7 @@ namespace dftfe } } } + iCell++; } const double firstOrderResponseFermiEnergy = @@ -657,25 +585,25 @@ namespace dftfe cellP = d_dofHandlerRhoNodal.begin_active(); endcP = d_dofHandlerRhoNodal.end(); + iCell = 0; for (; cellP != endcP; ++cellP) if (cellP->is_locally_owned()) { std::vector cell_dof_indices( dofs_per_cell); cellP->get_dof_indices(cell_dof_indices); - const std::vector &nodalValuesResponseHam = - rhoResponseHamPRefinedNodalDataSpinPolarized.find(cellP->id()) - ->second; + const double * nodalValuesRhoTotResponseHam = + rhoResponseHamPRefinedNodalData[0].data() + iCell * dofs_per_cell; + + const double * nodalValuesRhoTotResponseFermiEnergy = + rhoResponseFermiEnergyPRefinedNodalData[0].data() + iCell * dofs_per_cell; + + const double * nodalValuesRhoMagResponseHam = + rhoResponseHamPRefinedNodalData[1].data() + iCell * dofs_per_cell; - const std::vector &nodalValuesResponseFermiEnergy = - rhoResponseFermiEnergyPRefinedNodalDataSpinPolarized - .find(cellP->id()) - ->second; + const double * nodalValuesRhoMagResponseFermiEnergy = + rhoResponseFermiEnergyPRefinedNodalData[1].data() + iCell * dofs_per_cell; - Assert( - nodalValuesResponseHam.size() == 2 * dofs_per_cell, - dealii::ExcMessage( - 
"Number of nodes in 2p DoFHandler does not match with data stored in rhoNodal Values variable")); for (unsigned int iNode = 0; iNode < dofs_per_cell; ++iNode) { @@ -686,19 +614,17 @@ namespace dftfe if (locallyOwnedDofs.is_element(nodeID)) { fvHamSpin0(nodeID) = - nodalValuesResponseHam[2 * renumberingMap[iNode]]; + 0.5*(nodalValuesRhoTotResponseHam[renumberingMap[iNode]]+nodalValuesRhoMagResponseHam[renumberingMap[iNode]]); fvHamSpin1(nodeID) = - nodalValuesResponseHam[2 * renumberingMap[iNode] + - 1]; + 0.5*(nodalValuesRhoTotResponseHam[renumberingMap[iNode]]-nodalValuesRhoMagResponseHam[renumberingMap[iNode]]); fvFermiEnergySpin0(nodeID) = - nodalValuesResponseFermiEnergy - [2 * renumberingMap[iNode]]; + 0.5*(nodalValuesRhoTotResponseFermiEnergy[renumberingMap[iNode]]+nodalValuesRhoMagResponseFermiEnergy[renumberingMap[iNode]]); fvFermiEnergySpin1(nodeID) = - nodalValuesResponseFermiEnergy - [2 * renumberingMap[iNode] + 1]; + 0.5*(nodalValuesRhoTotResponseFermiEnergy[renumberingMap[iNode]]-nodalValuesRhoMagResponseFermiEnergy[renumberingMap[iNode]]); } } } + iCell++; } for (unsigned int i = 0; i < fvHamSpin0.local_size(); i++) diff --git a/src/dft/density.cc b/src/dft/density.cc index c5828e296..662d69504 100644 --- a/src/dft/density.cc +++ b/src/dft/density.cc @@ -321,7 +321,7 @@ namespace dftfe matrix_free_data.get_quadrature(d_gllQuadratureId); const unsigned int numQuadPoints = quadrature_formula.size(); - // get access to quadrature point coordinates and 2p DoFHandler nodal points + // get access to quadrature point coordinates and density DoFHandler nodal points const std::vector> &quadraturePointCoor = quadrature_formula.get_points(); const std::vector> &supportPointNaturalCoor = diff --git a/src/dft/densityFirstOrderResponseCalculator.cc b/src/dft/densityFirstOrderResponseCalculator.cc new file mode 100644 index 000000000..635cc8de2 --- /dev/null +++ b/src/dft/densityFirstOrderResponseCalculator.cc @@ -0,0 +1,527 @@ +// 
--------------------------------------------------------------------- +// +// Copyright (c) 2017-2022 The Regents of the University of Michigan and DFT-FE +// authors. +// +// This file is part of the DFT-FE code. +// +// The DFT-FE code is free software; you can use it, redistribute +// it, and/or modify it under the terms of the GNU Lesser General +// Public License as published by the Free Software Foundation; either +// version 2.1 of the License, or (at your option) any later version. +// The full text of the license can be found in the file LICENSE at +// the top level of the DFT-FE distribution. +// +// --------------------------------------------------------------------- +// +// @author Sambit Das +// + +// source file for electron density related computations +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace dftfe +{ + template + void + computeRhoFirstOrderResponse( + const dftfe::utils::MemoryStorage & X, + const dftfe::utils::MemoryStorage & XPrime, + const unsigned int totalNumWaveFunctions, + const std::vector> & densityMatDerFermiEnergy, + std::shared_ptr< + dftfe::basis::FEBasisOperations> + &basisOperationsPtr, + std::shared_ptr> + & BLASWrapperPtr, + const unsigned int matrixFreeDofhandlerIndex, + const unsigned int quadratureIndex, + const std::vector & kPointWeights, + std::vector> &rhoResponseValuesHam, + std::vector> &rhoResponseValuesFermiEnergy, + const MPI_Comm & mpiCommParent, + const MPI_Comm & interpoolcomm, + const MPI_Comm & interBandGroupComm, + const dftParameters &dftParams) + { + int this_process; + MPI_Comm_rank(mpiCommParent, &this_process); +#if defined(DFTFE_WITH_DEVICE) + if (memorySpace == dftfe::utils::MemorySpace::DEVICE) + dftfe::utils::deviceSynchronize(); +#endif + MPI_Barrier(mpiCommParent); + double computeRho_time = MPI_Wtime(); + const unsigned int numKPoints = kPointWeights.size(); + const unsigned int numLocalDofs = 
basisOperationsPtr->nOwnedDofs(); + const unsigned int totalLocallyOwnedCells = basisOperationsPtr->nCells(); + const unsigned int numNodesPerElement = basisOperationsPtr->nDofsPerCell(); + // band group parallelization data structures + const unsigned int numberBandGroups = + dealii::Utilities::MPI::n_mpi_processes(interBandGroupComm); + const unsigned int bandGroupTaskId = + dealii::Utilities::MPI::this_mpi_process(interBandGroupComm); + std::vector bandGroupLowHighPlusOneIndices; + dftUtils::createBandParallelizationIndices(interBandGroupComm, + totalNumWaveFunctions, + bandGroupLowHighPlusOneIndices); + + const unsigned int BVec = + std::min(dftParams.chebyWfcBlockSize, bandGroupLowHighPlusOneIndices[1]); + + const double spinPolarizedFactor = + (dftParams.spinPolarized == 1) ? 1.0 : 2.0; + const unsigned int numSpinComponents = + (dftParams.spinPolarized == 1) ? 2 : 1; + + const NumberType zero = 0; + const NumberType scalarCoeffAlphaRho = 1.0; + const NumberType scalarCoeffBetaRho = 1.0; + + const unsigned int cellsBlockSize = + memorySpace == dftfe::utils::MemorySpace::DEVICE ? 
50 : 1; + const unsigned int numCellBlocks = totalLocallyOwnedCells / cellsBlockSize; + const unsigned int remCellBlockSize = + totalLocallyOwnedCells - numCellBlocks * cellsBlockSize; + basisOperationsPtr->reinit(BVec, cellsBlockSize, quadratureIndex); + const unsigned int numQuadPoints = basisOperationsPtr->nQuadsPerCell(); + + std::vector> + wfcQuadPointData(numSpinComponents); + std::vector> + wfcPrimeQuadPointData(numSpinComponents); + std::vector> + rhoResponseHamWfcContributions(numSpinComponents); + std::vector> + rhoResponseFermiEnergyWfcContributions(numSpinComponents); + dftfe::utils::MemoryStorage + rhoResponseHamHost; + + dftfe::utils::MemoryStorage + rhoResponseFermiEnergyHost; +#if defined(DFTFE_WITH_DEVICE) + dftfe::utils::MemoryStorage rhoResponseHam; + dftfe::utils::MemoryStorage rhoResponseFermiEnergy; +#else + auto &rhoResponseHam = rhoResponseHamHost; + auto &rhoResponseFermiEnergy = rhoResponseFermiEnergyHost; +#endif + + rhoResponseHam.resize(totalLocallyOwnedCells * numQuadPoints * numSpinComponents, 0.0); + rhoResponseFermiEnergy.resize(totalLocallyOwnedCells * numQuadPoints * numSpinComponents, 0.0); + for (unsigned int spinIndex = 0; spinIndex < numSpinComponents; ++spinIndex) + { + wfcQuadPointData[spinIndex].resize(cellsBlockSize * numQuadPoints * + BVec, + zero); + + wfcPrimeQuadPointData[spinIndex].resize(cellsBlockSize * numQuadPoints * + BVec, + zero); + + if (memorySpace == dftfe::utils::MemorySpace::DEVICE) + { + rhoResponseHamWfcContributions[spinIndex].resize(cellsBlockSize * numQuadPoints * + BVec, + 0.0); + + rhoResponseFermiEnergyWfcContributions[spinIndex].resize(cellsBlockSize * numQuadPoints * + BVec, + 0.0); + } + } + + + dftfe::utils::MemoryStorage onesVec(BVec,1.0); + + std::vector< + dftfe::utils::MemoryStorage> + partialOccupPrimeVecHost( + numSpinComponents, + dftfe::utils::MemoryStorage( + BVec, 0.0)); +#if defined(DFTFE_WITH_DEVICE) + std::vector> + partialOccupPrimeVec(numSpinComponents); + for (unsigned int 
spinIndex = 0; spinIndex < numSpinComponents; ++spinIndex) + partialOccupPrimeVec[spinIndex].resize(partialOccupPrimeVecHost[spinIndex].size()); +#else + auto &partialOccupPrimeVec = partialOccupPrimeVecHost; +#endif + + std::vector *> + flattenedArrayBlock(numSpinComponents*2); + + for (unsigned int kPoint = 0; kPoint < kPointWeights.size(); ++kPoint) + { + for (unsigned int spinIndex = 0; spinIndex < numSpinComponents; + ++spinIndex) + { + wfcQuadPointData[spinIndex].setValue(zero); + wfcPrimeQuadPointData[spinIndex].setValue(zero); + rhoResponseHamWfcContributions[spinIndex].setValue(0.0); + rhoResponseFermiEnergyWfcContributions[spinIndex].setValue(0.0); + } + for (unsigned int jvec = 0; jvec < totalNumWaveFunctions; jvec += BVec) + { + const unsigned int currentBlockSize = + std::min(BVec, totalNumWaveFunctions - jvec); + for (unsigned int icomp = 0; icomp < flattenedArrayBlock.size(); + ++icomp) + flattenedArrayBlock[icomp] = + &(basisOperationsPtr->getMultiVector(currentBlockSize, + icomp)); + + if ((jvec + currentBlockSize) <= + bandGroupLowHighPlusOneIndices[2 * bandGroupTaskId + 1] && + (jvec + currentBlockSize) > + bandGroupLowHighPlusOneIndices[2 * bandGroupTaskId]) + { + for (unsigned int spinIndex = 0; spinIndex < numSpinComponents; + ++spinIndex) + for (unsigned int iEigenVec = 0; + iEigenVec < currentBlockSize; + ++iEigenVec) + { + *(partialOccupPrimeVecHost[spinIndex].begin() + + iEigenVec) = + densityMatDerFermiEnergy[kPoint][totalNumWaveFunctions * + spinIndex + + jvec + iEigenVec]*kPointWeights[kPoint] * spinPolarizedFactor; + } +#if defined(DFTFE_WITH_DEVICE) + for (unsigned int spinIndex = 0; spinIndex < numSpinComponents; + ++spinIndex) + partialOccupPrimeVec[spinIndex].copyFrom( + partialOccupPrimeVecHost[spinIndex]); +#endif + for (unsigned int spinIndex = 0; spinIndex < numSpinComponents; + ++spinIndex) + if (memorySpace == dftfe::utils::MemorySpace::HOST) + for (unsigned int iNode = 0; iNode < numLocalDofs; ++iNode) + 
std::memcpy(flattenedArrayBlock[spinIndex]->data() + + iNode * currentBlockSize, + X.data() + + numLocalDofs * totalNumWaveFunctions * + (numSpinComponents * kPoint + spinIndex) + + iNode * totalNumWaveFunctions + jvec, + currentBlockSize * sizeof(NumberType)); +#if defined(DFTFE_WITH_DEVICE) + else if (memorySpace == dftfe::utils::MemorySpace::DEVICE) + dftfe::utils::deviceKernelsGeneric:: + stridedCopyToBlockConstantStride( + currentBlockSize, + totalNumWaveFunctions, + numLocalDofs, + jvec, + X.data() + numLocalDofs * totalNumWaveFunctions * + (numSpinComponents * kPoint + spinIndex), + flattenedArrayBlock[spinIndex]->data()); +#endif + + + for (unsigned int spinIndex = 0; spinIndex < numSpinComponents; + ++spinIndex) + if (memorySpace == dftfe::utils::MemorySpace::HOST) + for (unsigned int iNode = 0; iNode < numLocalDofs; ++iNode) + std::memcpy(flattenedArrayBlock[numSpinComponents+spinIndex]->data() + + iNode * currentBlockSize, + XPrime.data() + + numLocalDofs * totalNumWaveFunctions * + (numSpinComponents * kPoint + spinIndex) + + iNode * totalNumWaveFunctions + jvec, + currentBlockSize * sizeof(NumberType)); +#if defined(DFTFE_WITH_DEVICE) + else if (memorySpace == dftfe::utils::MemorySpace::DEVICE) + dftfe::utils::deviceKernelsGeneric:: + stridedCopyToBlockConstantStride( + currentBlockSize, + totalNumWaveFunctions, + numLocalDofs, + jvec, + XPrime.data() + numLocalDofs * totalNumWaveFunctions * + (numSpinComponents * kPoint + spinIndex), + flattenedArrayBlock[numSpinComponents+spinIndex]->data()); +#endif + + basisOperationsPtr->reinit(currentBlockSize, + cellsBlockSize, + quadratureIndex, + false); + + + for (unsigned int spinIndex = 0; spinIndex < numSpinComponents; + ++spinIndex) + { + flattenedArrayBlock[spinIndex]->updateGhostValues(); + basisOperationsPtr->distribute( + *(flattenedArrayBlock[spinIndex])); + + flattenedArrayBlock[numSpinComponents+spinIndex]->updateGhostValues(); + /* apply the constraints to the XPrime block itself, not to the X block a second time */ + basisOperationsPtr->distribute( + *(flattenedArrayBlock[numSpinComponents+spinIndex])); 
+ } + + for (int iblock = 0; iblock < (numCellBlocks + 1); iblock++) + { + const unsigned int currentCellsBlockSize = + (iblock == numCellBlocks) ? remCellBlockSize : + cellsBlockSize; + if (currentCellsBlockSize > 0) + { + const unsigned int startingCellId = + iblock * cellsBlockSize; + + for (unsigned int spinIndex = 0; + spinIndex < numSpinComponents; + ++spinIndex) + basisOperationsPtr->interpolateKernel( + *(flattenedArrayBlock[spinIndex]), + wfcQuadPointData[spinIndex].data(), + NULL, + std::pair( + startingCellId, + startingCellId + currentCellsBlockSize)); + + for (unsigned int spinIndex = 0; + spinIndex < numSpinComponents; + ++spinIndex) + basisOperationsPtr->interpolateKernel( + *(flattenedArrayBlock[numSpinComponents+spinIndex]), + wfcPrimeQuadPointData[spinIndex].data(), + NULL, + std::pair( + startingCellId, + startingCellId + currentCellsBlockSize)); + + + for (unsigned int spinIndex = 0; + spinIndex < numSpinComponents; + ++spinIndex) + computeRhoResponseFromInterpolatedValues( + basisOperationsPtr, + BLASWrapperPtr, + std::pair( + startingCellId, + startingCellId + currentCellsBlockSize), + std::pair( + jvec, jvec + currentBlockSize), + onesVec.data(), + partialOccupPrimeVec[spinIndex].data(), + wfcQuadPointData[spinIndex].data(), + wfcPrimeQuadPointData[spinIndex].data(), + rhoResponseHamWfcContributions[spinIndex].data(), + rhoResponseFermiEnergyWfcContributions[spinIndex].data(), + rhoResponseHam.data() + spinIndex * totalLocallyOwnedCells * + numQuadPoints, + rhoResponseFermiEnergy.data() + spinIndex * + totalLocallyOwnedCells * + numQuadPoints); + } // non-trivial cell block check + } // cells block loop + } + } + } +#if defined(DFTFE_WITH_DEVICE) + rhoResponseHamHost.resize(rhoResponseHam.size()); + + rhoResponseHamHost.copyFrom(rhoResponseHam); + + rhoResponseFermiEnergyHost.resize(rhoResponseFermiEnergy.size()); + + rhoResponseFermiEnergyHost.copyFrom(rhoResponseFermiEnergy); +#endif + + int size; + MPI_Comm_size(interpoolcomm, &size); + if 
(size > 1) + { + MPI_Allreduce(MPI_IN_PLACE, + rhoResponseHamHost.data(), + totalLocallyOwnedCells * numQuadPoints * + numSpinComponents, + dataTypes::mpi_type_id(rhoResponseHamHost.data()), + MPI_SUM, + interpoolcomm); + + MPI_Allreduce(MPI_IN_PLACE, + rhoResponseFermiEnergyHost.data(), + totalLocallyOwnedCells * numQuadPoints * + numSpinComponents, + dataTypes::mpi_type_id(rhoResponseFermiEnergyHost.data()), + MPI_SUM, + interpoolcomm); + + } + MPI_Comm_size(interBandGroupComm, &size); + if (size > 1) + { + MPI_Allreduce(MPI_IN_PLACE, + rhoResponseHamHost.data(), + totalLocallyOwnedCells * numQuadPoints * + numSpinComponents, + dataTypes::mpi_type_id(rhoResponseHamHost.data()), + MPI_SUM, + interBandGroupComm); + + MPI_Allreduce(MPI_IN_PLACE, + rhoResponseFermiEnergyHost.data(), + totalLocallyOwnedCells * numQuadPoints * + numSpinComponents, + dataTypes::mpi_type_id(rhoResponseFermiEnergyHost.data()), + MPI_SUM, + interBandGroupComm); + } + + if (dftParams.spinPolarized == 1) + { + rhoResponseValuesHam[0].resize(totalLocallyOwnedCells * numQuadPoints); + rhoResponseValuesHam[1].resize(totalLocallyOwnedCells * numQuadPoints); + std::transform(rhoResponseHamHost.begin(), + rhoResponseHamHost.begin() + totalLocallyOwnedCells * numQuadPoints, + rhoResponseHamHost.begin() + totalLocallyOwnedCells * numQuadPoints, + rhoResponseValuesHam[0].begin(), + std::plus<>{}); + std::transform(rhoResponseHamHost.begin(), + rhoResponseHamHost.begin() + totalLocallyOwnedCells * numQuadPoints, + rhoResponseHamHost.begin() + totalLocallyOwnedCells * numQuadPoints, + rhoResponseValuesHam[1].begin(), + std::minus<>{}); + + rhoResponseValuesFermiEnergy[0].resize(totalLocallyOwnedCells * numQuadPoints); + rhoResponseValuesFermiEnergy[1].resize(totalLocallyOwnedCells * numQuadPoints); + std::transform(rhoResponseFermiEnergyHost.begin(), + rhoResponseFermiEnergyHost.begin() + totalLocallyOwnedCells * numQuadPoints, + rhoResponseFermiEnergyHost.begin() + totalLocallyOwnedCells * 
numQuadPoints, + rhoResponseValuesFermiEnergy[0].begin(), + std::plus<>{}); + std::transform(rhoResponseFermiEnergyHost.begin(), + rhoResponseFermiEnergyHost.begin() + totalLocallyOwnedCells * numQuadPoints, + rhoResponseFermiEnergyHost.begin() + totalLocallyOwnedCells * numQuadPoints, + rhoResponseValuesFermiEnergy[1].begin(), + std::minus<>{}); + + } + else + { + rhoResponseValuesHam[0] = rhoResponseHamHost; + rhoResponseValuesFermiEnergy[0] = rhoResponseFermiEnergyHost; + } +#if defined(DFTFE_WITH_DEVICE) + if (memorySpace == dftfe::utils::MemorySpace::DEVICE) + dftfe::utils::deviceSynchronize(); +#endif + MPI_Barrier(mpiCommParent); + computeRho_time = MPI_Wtime() - computeRho_time; + + if (this_process == 0 && dftParams.verbosity >= 2) + if (memorySpace == dftfe::utils::MemorySpace::HOST) + std::cout << "Time for compute rho on CPU: " << computeRho_time + << std::endl; + else if (memorySpace == dftfe::utils::MemorySpace::DEVICE) + std::cout << "Time for compute rho on Device: " << computeRho_time + << std::endl; + } + template + void + computeRhoResponseFromInterpolatedValues( + std::shared_ptr< + dftfe::basis:: + FEBasisOperations> + &basisOperationsPtr, + std::shared_ptr< + dftfe::linearAlgebra::BLASWrapper> + & BLASWrapperPtr, + const std::pair cellRange, + const std::pair vecRange, + double * onesVec, + double * partialOccupVecPrime, + NumberType * wfcQuadPointData, + NumberType * wfcPrimeQuadPointData, + double * rhoResponseHamCellsWfcContributions, + double * rhoResponseFermiEnergyCellsWfcContributions, + double * rhoResponseHam, + double * rhoResponseFermiEnergy) + { + const unsigned int cellsBlockSize = cellRange.second - cellRange.first; + const unsigned int vectorsBlockSize = vecRange.second - vecRange.first; + const unsigned int nQuadsPerCell = basisOperationsPtr->nQuadsPerCell(); + const unsigned int nCells = basisOperationsPtr->nCells(); + for (unsigned int iCell = cellRange.first; iCell < cellRange.second; + ++iCell) + for (unsigned int iQuad = 
0; iQuad < nQuadsPerCell; ++iQuad) + for (unsigned int iWave = 0; iWave < vecRange.second - vecRange.first; + ++iWave) + { + const NumberType psi = + wfcQuadPointData[(iCell - cellRange.first) * nQuadsPerCell * + vectorsBlockSize + + iQuad * vectorsBlockSize + iWave]; + const NumberType psiPrime = + wfcPrimeQuadPointData[(iCell - cellRange.first) * nQuadsPerCell * + vectorsBlockSize + + iQuad * vectorsBlockSize + iWave]; + rhoResponseHam[iCell * nQuadsPerCell + iQuad] += + dftfe::utils::realPart(psi * dftfe::utils::complexConj(psiPrime)); + + rhoResponseFermiEnergy[iCell * nQuadsPerCell + iQuad] += + partialOccupVecPrime[iWave] * dftfe::utils::realPart(psi * dftfe::utils::complexConj(psi)); + + } + } +#if defined(DFTFE_WITH_DEVICE) + template void + computeRhoFirstOrderResponse( + const dftfe::utils::MemoryStorage & X, + const dftfe::utils::MemoryStorage & XPrime, + const unsigned int totalNumWaveFunctions, + const std::vector> & densityMatDerFermiEnergy, + std::shared_ptr< + dftfe::basis::FEBasisOperations> + &basisOperationsPtr, + std::shared_ptr> + & BLASWrapperPtr, + const unsigned int matrixFreeDofhandlerIndex, + const unsigned int quadratureIndex, + const std::vector & kPointWeights, + std::vector> &rhoResponseValuesHam, + std::vector> &rhoResponseValuesFermiEnergy, + const MPI_Comm & mpiCommParent, + const MPI_Comm & interpoolcomm, + const MPI_Comm & interBandGroupComm, + const dftParameters &dftParams); +#endif + + template void + computeRhoFirstOrderResponse( + const dftfe::utils::MemoryStorage & X, + const dftfe::utils::MemoryStorage & XPrime, + const unsigned int totalNumWaveFunctions, + const std::vector> & densityMatDerFermiEnergy, + std::shared_ptr< + dftfe::basis::FEBasisOperations> + &basisOperationsPtr, + std::shared_ptr> + & BLASWrapperPtr, + const unsigned int matrixFreeDofhandlerIndex, + const unsigned int quadratureIndex, + const std::vector & kPointWeights, + std::vector> &rhoResponseValuesHam, + std::vector> &rhoResponseValuesFermiEnergy, + 
const MPI_Comm & mpiCommParent, + const MPI_Comm & interpoolcomm, + const MPI_Comm & interBandGroupComm, + const dftParameters &dftParams); +} // namespace dftfe diff --git a/src/dft/densityFirstOrderResponseCalculatorCPU.cc b/src/dft/densityFirstOrderResponseCalculatorCPU.cc deleted file mode 100644 index b0efcd893..000000000 --- a/src/dft/densityFirstOrderResponseCalculatorCPU.cc +++ /dev/null @@ -1,871 +0,0 @@ -// --------------------------------------------------------------------- -// -// Copyright (c) 2017-2018 The Regents of the University of Michigan and DFT-FE -// authors. -// -// This file is part of the DFT-FE code. -// -// The DFT-FE code is free software; you can use it, redistribute -// it, and/or modify it under the terms of the GNU Lesser General -// Public License as published by the Free Software Foundation; either -// version 2.1 of the License, or (at your option) any later version. -// The full text of the license can be found in the file LICENSE at -// the top level of the DFT-FE distribution. 
-// -// --------------------------------------------------------------------- -// -// @author Sambit Das -// - -// source file for electron density related computations -#include -#include -#include -#include -#include -#include -#include - -namespace dftfe -{ - template - void - computeRhoFirstOrderResponseCPU( - const T * X, - const T * XPrime, - const std::vector> & densityMatDerFermiEnergy, - const unsigned int totalNumWaveFunctions, - const unsigned int numLocalDofs, - operatorDFTClass & operatorMatrix, - const unsigned int matrixFreeDofhandlerIndex, - const dealii::DoFHandler<3> & dofHandler, - const unsigned int totalLocallyOwnedCells, - const unsigned int numNodesPerElement, - const unsigned int numQuadPoints, - const std::vector & kPointWeights, - std::map> &rhoResponseValuesHam, - std::map> &rhoResponseValuesFermiEnergy, - std::map> - &rhoResponseValuesHamSpinPolarized, - std::map> - & rhoResponseValuesFermiEnergySpinPolarized, - const MPI_Comm & mpiCommParent, - const MPI_Comm & interpoolcomm, - const MPI_Comm & interBandGroupComm, - const dftParameters &dftParams) - { - int this_process; - MPI_Comm_rank(mpiCommParent, &this_process); - MPI_Barrier(mpiCommParent); - double cpu_time = MPI_Wtime(); - - // band group parallelization data structures - const unsigned int numberBandGroups = - dealii::Utilities::MPI::n_mpi_processes(interBandGroupComm); - const unsigned int bandGroupTaskId = - dealii::Utilities::MPI::this_mpi_process(interBandGroupComm); - std::vector bandGroupLowHighPlusOneIndices; - dftUtils::createBandParallelizationIndices(interBandGroupComm, - totalNumWaveFunctions, - bandGroupLowHighPlusOneIndices); - - const unsigned int BVec = - std::min(dftParams.chebyWfcBlockSize, bandGroupLowHighPlusOneIndices[1]); - - const double spinPolarizedFactor = - (dftParams.spinPolarized == 1) ? 
1.0 : 2.0; - - std::vector wfcQuads(numQuadPoints * BVec, T(0.0)); - std::vector wfcPrimeQuads(numQuadPoints * BVec, T(0.0)); - - std::vector shapeFunctionValues(numQuadPoints * numNodesPerElement, - T(0.0)); - const unsigned int numQuadPointsTimes3 = numQuadPoints * 3; - - for (unsigned int iquad = 0; iquad < numQuadPoints; ++iquad) - for (unsigned int iNode = 0; iNode < numNodesPerElement; ++iNode) - shapeFunctionValues[iquad * numNodesPerElement + iNode] = - T(operatorMatrix.getShapeFunctionValuesDensityGaussLobattoQuad() - [iquad * numNodesPerElement + iNode]); - - - dftfe::distributedCPUVec flattenedArrayBlock1, flattenedArrayBlock2; - - std::vector cellWaveFunctionMatrix(numNodesPerElement * BVec, T(0.0)); - - std::vector cellWaveFunctionPrimeMatrix(numNodesPerElement * BVec, - T(0.0)); - - // set density to zero - typename dealii::DoFHandler<3>::active_cell_iterator cell = - dofHandler.begin_active(); - typename dealii::DoFHandler<3>::active_cell_iterator endc = - dofHandler.end(); - for (; cell != endc; ++cell) - if (cell->is_locally_owned()) - { - const dealii::CellId cellid = cell->id(); - - std::fill((rhoResponseValuesHam)[cellid].begin(), - (rhoResponseValuesHam)[cellid].end(), - 0.0); - std::fill((rhoResponseValuesFermiEnergy)[cellid].begin(), - (rhoResponseValuesFermiEnergy)[cellid].end(), - 0.0); - - if (dftParams.spinPolarized == 1) - { - std::fill((rhoResponseValuesHamSpinPolarized)[cellid].begin(), - (rhoResponseValuesHamSpinPolarized)[cellid].end(), - 0.0); - std::fill( - (rhoResponseValuesFermiEnergySpinPolarized)[cellid].begin(), - (rhoResponseValuesFermiEnergySpinPolarized)[cellid].end(), - 0.0); - } - } - - - std::vector rhoResponseHam(totalLocallyOwnedCells * numQuadPoints, - 0.0); - std::vector rhoResponseFermiEnergy(totalLocallyOwnedCells * - numQuadPoints, - 0.0); - - std::vector rhoResponseHamSpinPolarized(totalLocallyOwnedCells * - numQuadPoints * 2, - 0.0); - std::vector rhoResponseFermiEnergySpinPolarized( - totalLocallyOwnedCells * 
numQuadPoints * 2, 0.0); - - for (unsigned int spinIndex = 0; spinIndex < (1 + dftParams.spinPolarized); - ++spinIndex) - { - std::vector rhoResponseContributionHam(totalLocallyOwnedCells * - numQuadPoints, - 0.0); - std::vector rhoResponseContributionFermiEnergy( - totalLocallyOwnedCells * numQuadPoints, 0.0); - - for (unsigned int kPoint = 0; kPoint < kPointWeights.size(); ++kPoint) - { - const T *XCurrentKPoint = - X + ((dftParams.spinPolarized + 1) * kPoint + spinIndex) * - numLocalDofs * totalNumWaveFunctions; - - const T *XPrimeCurrentKPoint = - XPrime + ((dftParams.spinPolarized + 1) * kPoint + spinIndex) * - numLocalDofs * totalNumWaveFunctions; - - const std::vector &densityMatDerFermiEnergyVec = - densityMatDerFermiEnergy[(dftParams.spinPolarized + 1) * kPoint + - spinIndex]; - - for (unsigned int jvec = 0; jvec < totalNumWaveFunctions; - jvec += BVec) - { - const unsigned int currentBlockSize = - std::min(BVec, totalNumWaveFunctions - jvec); - - if (currentBlockSize != BVec || jvec == 0) - { - operatorMatrix.reinit(currentBlockSize, - flattenedArrayBlock1, - true); - flattenedArrayBlock2.reinit(flattenedArrayBlock1); - } - - if ((jvec + currentBlockSize) <= - bandGroupLowHighPlusOneIndices[2 * bandGroupTaskId + 1] && - (jvec + currentBlockSize) > - bandGroupLowHighPlusOneIndices[2 * bandGroupTaskId]) - { - for (unsigned int iNode = 0; iNode < numLocalDofs; ++iNode) - for (unsigned int iWave = 0; iWave < currentBlockSize; - ++iWave) - flattenedArrayBlock1.local_element( - iNode * currentBlockSize + iWave) = - XCurrentKPoint[iNode * totalNumWaveFunctions + jvec + - iWave]; - - - (operatorMatrix.getOverloadedConstraintMatrix()) - ->distribute(flattenedArrayBlock1, currentBlockSize); - - - for (unsigned int iNode = 0; iNode < numLocalDofs; ++iNode) - for (unsigned int iWave = 0; iWave < currentBlockSize; - ++iWave) - flattenedArrayBlock2.local_element( - iNode * currentBlockSize + iWave) = - XPrimeCurrentKPoint[iNode * totalNumWaveFunctions + - jvec + 
iWave]; - - - (operatorMatrix.getOverloadedConstraintMatrix()) - ->distribute(flattenedArrayBlock2, currentBlockSize); - - - for (int icell = 0; icell < totalLocallyOwnedCells; icell++) - { - const unsigned int inc = 1; - for (unsigned int iNode = 0; iNode < numNodesPerElement; - ++iNode) - { - xcopy( - ¤tBlockSize, - flattenedArrayBlock1.begin() + - operatorMatrix - .getFlattenedArrayCellLocalProcIndexIdMap() - [icell * numNodesPerElement + iNode], - &inc, - &cellWaveFunctionMatrix[currentBlockSize * iNode], - &inc); - - xcopy( - ¤tBlockSize, - flattenedArrayBlock2.begin() + - operatorMatrix - .getFlattenedArrayCellLocalProcIndexIdMap() - [icell * numNodesPerElement + iNode], - &inc, - &cellWaveFunctionPrimeMatrix[currentBlockSize * - iNode], - &inc); - } - - - const T scalarCoeffAlpha = T(1.0), - scalarCoeffBeta = T(0.0); - const char transA = 'N', transB = 'N'; - - xgemm(&transA, - &transB, - ¤tBlockSize, - &numQuadPoints, - &numNodesPerElement, - &scalarCoeffAlpha, - &cellWaveFunctionMatrix[0], - ¤tBlockSize, - &shapeFunctionValues[0], - &numNodesPerElement, - &scalarCoeffBeta, - &wfcQuads[0], - ¤tBlockSize); - - xgemm(&transA, - &transB, - ¤tBlockSize, - &numQuadPoints, - &numNodesPerElement, - &scalarCoeffAlpha, - &cellWaveFunctionPrimeMatrix[0], - ¤tBlockSize, - &shapeFunctionValues[0], - &numNodesPerElement, - &scalarCoeffBeta, - &wfcPrimeQuads[0], - ¤tBlockSize); - - for (unsigned int iquad = 0; iquad < numQuadPoints; - ++iquad) - for (unsigned int iWave = 0; iWave < currentBlockSize; - ++iWave) - { - rhoResponseContributionHam[icell * numQuadPoints + - iquad] += - kPointWeights[kPoint] * spinPolarizedFactor * - dftfe::utils::realPart( - wfcQuads[iquad * currentBlockSize + iWave] * - dftfe::utils::complexConj( - wfcPrimeQuads[iquad * currentBlockSize + - iWave])); - - rhoResponseContributionFermiEnergy - [icell * numQuadPoints + iquad] += - kPointWeights[kPoint] * spinPolarizedFactor * - densityMatDerFermiEnergyVec[jvec + iWave] * - dftfe::utils::realPart( 
- wfcQuads[iquad * currentBlockSize + iWave] * - dftfe::utils::complexConj( - wfcQuads[iquad * currentBlockSize + - iWave])); - } - - } // cells loop - } // band parallelizatoin check - } // wave function block loop - } // kpoint loop - - - for (int icell = 0; icell < totalLocallyOwnedCells; icell++) - for (unsigned int iquad = 0; iquad < numQuadPoints; ++iquad) - { - rhoResponseHam[icell * numQuadPoints + iquad] += - rhoResponseContributionHam[icell * numQuadPoints + iquad]; - rhoResponseFermiEnergy[icell * numQuadPoints + iquad] += - rhoResponseContributionFermiEnergy[icell * numQuadPoints + - iquad]; - } - - if (dftParams.spinPolarized == 1) - { - for (int icell = 0; icell < totalLocallyOwnedCells; icell++) - for (unsigned int iquad = 0; iquad < numQuadPoints; ++iquad) - { - rhoResponseHamSpinPolarized[icell * numQuadPoints * 2 + - 2 * iquad + spinIndex] = - rhoResponseContributionHam[icell * numQuadPoints + iquad]; - rhoResponseFermiEnergySpinPolarized[icell * numQuadPoints * - 2 + - 2 * iquad + spinIndex] = - rhoResponseContributionFermiEnergy[icell * numQuadPoints + - iquad]; - } - } - } // spin index loop - - // gather density response from all inter communicators - dealii::Utilities::MPI::sum(rhoResponseHam, - interBandGroupComm, - rhoResponseHam); - - dealii::Utilities::MPI::sum(rhoResponseHam, interpoolcomm, rhoResponseHam); - - dealii::Utilities::MPI::sum(rhoResponseFermiEnergy, - interBandGroupComm, - rhoResponseFermiEnergy); - - dealii::Utilities::MPI::sum(rhoResponseFermiEnergy, - interpoolcomm, - rhoResponseFermiEnergy); - - if (dftParams.spinPolarized == 1) - { - dealii::Utilities::MPI::sum(rhoResponseHamSpinPolarized, - interBandGroupComm, - rhoResponseHamSpinPolarized); - - dealii::Utilities::MPI::sum(rhoResponseHamSpinPolarized, - interpoolcomm, - rhoResponseHamSpinPolarized); - - dealii::Utilities::MPI::sum(rhoResponseFermiEnergySpinPolarized, - interBandGroupComm, - rhoResponseFermiEnergySpinPolarized); - - 
dealii::Utilities::MPI::sum(rhoResponseFermiEnergySpinPolarized, - interpoolcomm, - rhoResponseFermiEnergySpinPolarized); - } - - - unsigned int iElem = 0; - cell = dofHandler.begin_active(); - endc = dofHandler.end(); - for (; cell != endc; ++cell) - if (cell->is_locally_owned()) - { - const dealii::CellId cellid = cell->id(); - - std::vector &temp1Quads = (rhoResponseValuesHam)[cellid]; - - std::vector &temp2Quads = - (rhoResponseValuesFermiEnergy)[cellid]; - - for (unsigned int q = 0; q < numQuadPoints; ++q) - { - temp1Quads[q] = rhoResponseHam[iElem * numQuadPoints + q]; - temp2Quads[q] = rhoResponseFermiEnergy[iElem * numQuadPoints + q]; - } - - if (dftParams.spinPolarized == 1) - { - std::vector &temp3Quads = - (rhoResponseValuesHamSpinPolarized)[cellid]; - - std::vector &temp4Quads = - (rhoResponseValuesFermiEnergySpinPolarized)[cellid]; - - for (unsigned int q = 0; q < numQuadPoints; ++q) - { - temp3Quads[2 * q + 0] = - rhoResponseHamSpinPolarized[iElem * numQuadPoints * 2 + - 2 * q + 0]; - temp3Quads[2 * q + 1] = - rhoResponseHamSpinPolarized[iElem * numQuadPoints * 2 + - 2 * q + 1]; - temp4Quads[2 * q + 0] = - rhoResponseFermiEnergySpinPolarized[iElem * numQuadPoints * - 2 + - 2 * q + 0]; - temp4Quads[2 * q + 1] = - rhoResponseFermiEnergySpinPolarized[iElem * numQuadPoints * - 2 + - 2 * q + 1]; - } - } - - iElem++; - } - - - MPI_Barrier(mpiCommParent); - cpu_time = MPI_Wtime() - cpu_time; - - if (this_process == 0 && dftParams.verbosity >= 2) - std::cout << "Time for compute rhoprime on CPU: " << cpu_time - << std::endl; - } - - - template - void - computeRhoFirstOrderResponseCPUMixedPrec( - const T * X, - const T * XPrime, - const std::vector> & densityMatDerFermiEnergy, - const unsigned int totalNumWaveFunctions, - const unsigned int numLocalDofs, - operatorDFTClass & operatorMatrix, - const unsigned int matrixFreeDofhandlerIndex, - const dealii::DoFHandler<3> & dofHandler, - const unsigned int totalLocallyOwnedCells, - const unsigned int 
numNodesPerElement, - const unsigned int numQuadPoints, - const std::vector & kPointWeights, - std::map> &rhoResponseValuesHam, - std::map> &rhoResponseValuesFermiEnergy, - std::map> - &rhoResponseValuesHamSpinPolarized, - std::map> - & rhoResponseValuesFermiEnergySpinPolarized, - const MPI_Comm & mpiCommParent, - const MPI_Comm & interpoolcomm, - const MPI_Comm & interBandGroupComm, - const dftParameters &dftParams) - { - int this_process; - MPI_Comm_rank(mpiCommParent, &this_process); - MPI_Barrier(mpiCommParent); - double cpu_time = MPI_Wtime(); - - // band group parallelization data structures - const unsigned int numberBandGroups = - dealii::Utilities::MPI::n_mpi_processes(interBandGroupComm); - const unsigned int bandGroupTaskId = - dealii::Utilities::MPI::this_mpi_process(interBandGroupComm); - std::vector bandGroupLowHighPlusOneIndices; - dftUtils::createBandParallelizationIndices(interBandGroupComm, - totalNumWaveFunctions, - bandGroupLowHighPlusOneIndices); - - const unsigned int BVec = - std::min(dftParams.chebyWfcBlockSize, bandGroupLowHighPlusOneIndices[1]); - - const double spinPolarizedFactor = - (dftParams.spinPolarized == 1) ? 
1.0 : 2.0; - - std::vector wfcQuads(numQuadPoints * BVec, TLowPrec(0.0)); - std::vector wfcPrimeQuads(numQuadPoints * BVec, TLowPrec(0.0)); - - std::vector shapeFunctionValues(numQuadPoints * - numNodesPerElement, - TLowPrec(0.0)); - const unsigned int numQuadPointsTimes3 = numQuadPoints * 3; - - for (unsigned int iquad = 0; iquad < numQuadPoints; ++iquad) - for (unsigned int iNode = 0; iNode < numNodesPerElement; ++iNode) - shapeFunctionValues[iquad * numNodesPerElement + iNode] = TLowPrec( - operatorMatrix.getShapeFunctionValuesDensityGaussLobattoQuad() - [iquad * numNodesPerElement + iNode]); - - - dftfe::distributedCPUVec flattenedArrayBlock1, flattenedArrayBlock2; - - std::vector cellWaveFunctionMatrix(numNodesPerElement * BVec, - TLowPrec(0.0)); - - std::vector cellWaveFunctionPrimeMatrix(numNodesPerElement * BVec, - TLowPrec(0.0)); - - // set density to zero - typename dealii::DoFHandler<3>::active_cell_iterator cell = - dofHandler.begin_active(); - typename dealii::DoFHandler<3>::active_cell_iterator endc = - dofHandler.end(); - for (; cell != endc; ++cell) - if (cell->is_locally_owned()) - { - const dealii::CellId cellid = cell->id(); - - std::fill((rhoResponseValuesHam)[cellid].begin(), - (rhoResponseValuesHam)[cellid].end(), - 0.0); - std::fill((rhoResponseValuesFermiEnergy)[cellid].begin(), - (rhoResponseValuesFermiEnergy)[cellid].end(), - 0.0); - - if (dftParams.spinPolarized == 1) - { - std::fill((rhoResponseValuesHamSpinPolarized)[cellid].begin(), - (rhoResponseValuesHamSpinPolarized)[cellid].end(), - 0.0); - std::fill( - (rhoResponseValuesFermiEnergySpinPolarized)[cellid].begin(), - (rhoResponseValuesFermiEnergySpinPolarized)[cellid].end(), - 0.0); - } - } - - - std::vector rhoResponseHam(totalLocallyOwnedCells * numQuadPoints, - 0.0); - std::vector rhoResponseFermiEnergy(totalLocallyOwnedCells * - numQuadPoints, - 0.0); - - std::vector rhoResponseHamSpinPolarized(totalLocallyOwnedCells * - numQuadPoints * 2, - 0.0); - std::vector 
rhoResponseFermiEnergySpinPolarized( - totalLocallyOwnedCells * numQuadPoints * 2, 0.0); - - const std::vector &indexMap = - operatorMatrix.getFlattenedArrayCellLocalProcIndexIdMap(); - - for (unsigned int spinIndex = 0; spinIndex < (1 + dftParams.spinPolarized); - ++spinIndex) - { - std::vector rhoResponseContributionHam(totalLocallyOwnedCells * - numQuadPoints, - 0.0); - std::vector rhoResponseContributionFermiEnergy( - totalLocallyOwnedCells * numQuadPoints, 0.0); - - for (unsigned int kPoint = 0; kPoint < kPointWeights.size(); ++kPoint) - { - const T *XCurrentKPoint = - X + ((dftParams.spinPolarized + 1) * kPoint + spinIndex) * - numLocalDofs * totalNumWaveFunctions; - - const T *XPrimeCurrentKPoint = - XPrime + ((dftParams.spinPolarized + 1) * kPoint + spinIndex) * - numLocalDofs * totalNumWaveFunctions; - - const std::vector &densityMatDerFermiEnergyVec = - densityMatDerFermiEnergy[(dftParams.spinPolarized + 1) * kPoint + - spinIndex]; - - for (unsigned int jvec = 0; jvec < totalNumWaveFunctions; - jvec += BVec) - { - const unsigned int currentBlockSize = - std::min(BVec, totalNumWaveFunctions - jvec); - - if (currentBlockSize != BVec || jvec == 0) - { - operatorMatrix.reinit(currentBlockSize, - flattenedArrayBlock1, - true); - flattenedArrayBlock2.reinit(flattenedArrayBlock1); - } - - if ((jvec + currentBlockSize) <= - bandGroupLowHighPlusOneIndices[2 * bandGroupTaskId + 1] && - (jvec + currentBlockSize) > - bandGroupLowHighPlusOneIndices[2 * bandGroupTaskId]) - { - for (unsigned int iNode = 0; iNode < numLocalDofs; ++iNode) - for (unsigned int iWave = 0; iWave < currentBlockSize; - ++iWave) - flattenedArrayBlock1.local_element( - iNode * currentBlockSize + iWave) = - XCurrentKPoint[iNode * totalNumWaveFunctions + jvec + - iWave]; - - - (operatorMatrix.getOverloadedConstraintMatrix()) - ->distribute(flattenedArrayBlock1, currentBlockSize); - - - for (unsigned int iNode = 0; iNode < numLocalDofs; ++iNode) - for (unsigned int iWave = 0; iWave < 
currentBlockSize; - ++iWave) - flattenedArrayBlock2.local_element( - iNode * currentBlockSize + iWave) = - XPrimeCurrentKPoint[iNode * totalNumWaveFunctions + - jvec + iWave]; - - - (operatorMatrix.getOverloadedConstraintMatrix()) - ->distribute(flattenedArrayBlock2, currentBlockSize); - - - for (int icell = 0; icell < totalLocallyOwnedCells; icell++) - { - for (unsigned int iNode = 0; iNode < numNodesPerElement; - ++iNode) - { - const unsigned startIndex = - indexMap[icell * numNodesPerElement + iNode]; - for (unsigned int iwave = 0; - iwave < currentBlockSize; - iwave++) - { - cellWaveFunctionMatrix[currentBlockSize * - iNode + - iwave] = - *(flattenedArrayBlock1.begin() + startIndex + - iwave); - } - - for (unsigned int iwave = 0; - iwave < currentBlockSize; - iwave++) - { - cellWaveFunctionPrimeMatrix[currentBlockSize * - iNode + - iwave] = - *(flattenedArrayBlock2.begin() + startIndex + - iwave); - } - } - - - const TLowPrec scalarCoeffAlpha = TLowPrec(1.0), - scalarCoeffBeta = TLowPrec(0.0); - const char transA = 'N', transB = 'N'; - - xgemm(&transA, - &transB, - ¤tBlockSize, - &numQuadPoints, - &numNodesPerElement, - &scalarCoeffAlpha, - &cellWaveFunctionMatrix[0], - ¤tBlockSize, - &shapeFunctionValues[0], - &numNodesPerElement, - &scalarCoeffBeta, - &wfcQuads[0], - ¤tBlockSize); - - xgemm(&transA, - &transB, - ¤tBlockSize, - &numQuadPoints, - &numNodesPerElement, - &scalarCoeffAlpha, - &cellWaveFunctionPrimeMatrix[0], - ¤tBlockSize, - &shapeFunctionValues[0], - &numNodesPerElement, - &scalarCoeffBeta, - &wfcPrimeQuads[0], - ¤tBlockSize); - - for (unsigned int iquad = 0; iquad < numQuadPoints; - ++iquad) - for (unsigned int iWave = 0; iWave < currentBlockSize; - ++iWave) - { - rhoResponseContributionHam[icell * numQuadPoints + - iquad] += - kPointWeights[kPoint] * spinPolarizedFactor * - dftfe::utils::realPart( - wfcQuads[iquad * currentBlockSize + iWave] * - dftfe::utils::complexConj( - wfcPrimeQuads[iquad * currentBlockSize + - iWave])); - - 
rhoResponseContributionFermiEnergy - [icell * numQuadPoints + iquad] += - kPointWeights[kPoint] * spinPolarizedFactor * - densityMatDerFermiEnergyVec[jvec + iWave] * - dftfe::utils::realPart( - wfcQuads[iquad * currentBlockSize + iWave] * - dftfe::utils::complexConj( - wfcQuads[iquad * currentBlockSize + - iWave])); - } - - } // cells loop - } // band parallelizatoin check - } // wave function block loop - } // kpoint loop - - - for (int icell = 0; icell < totalLocallyOwnedCells; icell++) - for (unsigned int iquad = 0; iquad < numQuadPoints; ++iquad) - { - rhoResponseHam[icell * numQuadPoints + iquad] += - rhoResponseContributionHam[icell * numQuadPoints + iquad]; - rhoResponseFermiEnergy[icell * numQuadPoints + iquad] += - rhoResponseContributionFermiEnergy[icell * numQuadPoints + - iquad]; - } - - if (dftParams.spinPolarized == 1) - { - for (int icell = 0; icell < totalLocallyOwnedCells; icell++) - for (unsigned int iquad = 0; iquad < numQuadPoints; ++iquad) - { - rhoResponseHamSpinPolarized[icell * numQuadPoints * 2 + - 2 * iquad + spinIndex] = - rhoResponseContributionHam[icell * numQuadPoints + iquad]; - rhoResponseFermiEnergySpinPolarized[icell * numQuadPoints * - 2 + - 2 * iquad + spinIndex] = - rhoResponseContributionFermiEnergy[icell * numQuadPoints + - iquad]; - } - } - } // spin index loop - - // gather density response from all inter communicators - dealii::Utilities::MPI::sum(rhoResponseHam, - interBandGroupComm, - rhoResponseHam); - - dealii::Utilities::MPI::sum(rhoResponseHam, interpoolcomm, rhoResponseHam); - - dealii::Utilities::MPI::sum(rhoResponseFermiEnergy, - interBandGroupComm, - rhoResponseFermiEnergy); - - dealii::Utilities::MPI::sum(rhoResponseFermiEnergy, - interpoolcomm, - rhoResponseFermiEnergy); - - if (dftParams.spinPolarized == 1) - { - dealii::Utilities::MPI::sum(rhoResponseHamSpinPolarized, - interBandGroupComm, - rhoResponseHamSpinPolarized); - - dealii::Utilities::MPI::sum(rhoResponseHamSpinPolarized, - interpoolcomm, - 
rhoResponseHamSpinPolarized); - - dealii::Utilities::MPI::sum(rhoResponseFermiEnergySpinPolarized, - interBandGroupComm, - rhoResponseFermiEnergySpinPolarized); - - dealii::Utilities::MPI::sum(rhoResponseFermiEnergySpinPolarized, - interpoolcomm, - rhoResponseFermiEnergySpinPolarized); - } - - - unsigned int iElem = 0; - cell = dofHandler.begin_active(); - endc = dofHandler.end(); - for (; cell != endc; ++cell) - if (cell->is_locally_owned()) - { - const dealii::CellId cellid = cell->id(); - - std::vector &temp1Quads = (rhoResponseValuesHam)[cellid]; - - std::vector &temp2Quads = - (rhoResponseValuesFermiEnergy)[cellid]; - - for (unsigned int q = 0; q < numQuadPoints; ++q) - { - temp1Quads[q] = rhoResponseHam[iElem * numQuadPoints + q]; - temp2Quads[q] = rhoResponseFermiEnergy[iElem * numQuadPoints + q]; - } - - if (dftParams.spinPolarized == 1) - { - std::vector &temp3Quads = - (rhoResponseValuesHamSpinPolarized)[cellid]; - - std::vector &temp4Quads = - (rhoResponseValuesFermiEnergySpinPolarized)[cellid]; - - for (unsigned int q = 0; q < numQuadPoints; ++q) - { - temp3Quads[2 * q + 0] = - rhoResponseHamSpinPolarized[iElem * numQuadPoints * 2 + - 2 * q + 0]; - temp3Quads[2 * q + 1] = - rhoResponseHamSpinPolarized[iElem * numQuadPoints * 2 + - 2 * q + 1]; - temp4Quads[2 * q + 0] = - rhoResponseFermiEnergySpinPolarized[iElem * numQuadPoints * - 2 + - 2 * q + 0]; - temp4Quads[2 * q + 1] = - rhoResponseFermiEnergySpinPolarized[iElem * numQuadPoints * - 2 + - 2 * q + 1]; - } - } - - iElem++; - } - - - MPI_Barrier(mpiCommParent); - cpu_time = MPI_Wtime() - cpu_time; - - if (this_process == 0 && dftParams.verbosity >= 2) - std::cout << "Time for compute rhoprime on CPU: " << cpu_time - << std::endl; - } - - - template void - computeRhoFirstOrderResponseCPU( - const dataTypes::number * X, - const dataTypes::number * XPrime, - const std::vector> & densityMatDerFermiEnergy, - const unsigned int totalNumWaveFunctions, - const unsigned int numLocalDofs, - operatorDFTClass & 
operatorMatrix, - const unsigned int matrixFreeDofhandlerIndex, - const dealii::DoFHandler<3> & dofHandler, - const unsigned int totalLocallyOwnedCells, - const unsigned int numberNodesPerElement, - const unsigned int numQuadPoints, - const std::vector & kPointWeights, - std::map> &rhoResponseValuesHam, - std::map> &rhoResponseValuesFermiEnergy, - std::map> - &rhoResponseValuesHamSpinPolarized, - std::map> - & rhoResponseValuesFermiEnergySpinPolarized, - const MPI_Comm & mpiCommParent, - const MPI_Comm & interpoolcomm, - const MPI_Comm & interBandGroupComm, - const dftParameters &dftParams); - - template void - computeRhoFirstOrderResponseCPUMixedPrec( - const dataTypes::number * X, - const dataTypes::number * XPrime, - const std::vector> & densityMatDerFermiEnergy, - const unsigned int totalNumWaveFunctions, - const unsigned int numLocalDofs, - operatorDFTClass & operatorMatrix, - const unsigned int matrixFreeDofhandlerIndex, - const dealii::DoFHandler<3> & dofHandler, - const unsigned int totalLocallyOwnedCells, - const unsigned int numberNodesPerElement, - const unsigned int numQuadPoints, - const std::vector & kPointWeights, - std::map> &rhoResponseValuesHam, - std::map> &rhoResponseValuesFermiEnergy, - std::map> - &rhoResponseValuesHamSpinPolarized, - std::map> - & rhoResponseValuesFermiEnergySpinPolarized, - const MPI_Comm & mpiCommParent, - const MPI_Comm & interpoolcomm, - const MPI_Comm & interBandGroupComm, - const dftParameters &dftParams); - -} // namespace dftfe diff --git a/src/dft/densityFirstOrderResponseCalculatorDevice.cc b/src/dft/densityFirstOrderResponseCalculatorDevice.cc deleted file mode 100644 index 08945271a..000000000 --- a/src/dft/densityFirstOrderResponseCalculatorDevice.cc +++ /dev/null @@ -1,676 +0,0 @@ -// --------------------------------------------------------------------- -// -// Copyright (c) 2017-2018 The Regents of the University of Michigan and DFT-FE -// authors. -// -// This file is part of the DFT-FE code. 
-// -// The DFT-FE code is free software; you can use it, redistribute -// it, and/or modify it under the terms of the GNU Lesser General -// Public License as published by the Free Software Foundation; either -// version 2.1 of the License, or (at your option) any later version. -// The full text of the license can be found in the file LICENSE at -// the top level of the DFT-FE distribution. -// -// --------------------------------------------------------------------- -// -// @author Sambit Das -// - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -namespace dftfe -{ - namespace - { - __global__ void - computeRhoResponseFromInterpolatedValues(const unsigned int numberEntries, - double * XQuads, - double * XPrimeQuads) - { - const unsigned int globalThreadId = blockIdx.x * blockDim.x + threadIdx.x; - - for (unsigned int index = globalThreadId; index < numberEntries; - index += blockDim.x * gridDim.x) - { - const double psi = XQuads[index]; - const double psiPrime = XPrimeQuads[index]; - XPrimeQuads[index] = psi * psiPrime; - XQuads[index] = psi * psi; - } - } - - __global__ void - computeRhoResponseFromInterpolatedValues( - const unsigned int numberEntries, - dftfe::utils::deviceDoubleComplex *XQuads, - dftfe::utils::deviceDoubleComplex *XPrimeQuads) - { - const unsigned int globalThreadId = blockIdx.x * blockDim.x + threadIdx.x; - - for (unsigned int index = globalThreadId; index < numberEntries; - index += blockDim.x * gridDim.x) - { - const dftfe::utils::deviceDoubleComplex psi = XQuads[index]; - const dftfe::utils::deviceDoubleComplex psiPrime = XPrimeQuads[index]; - dftfe::utils::copyValue(XPrimeQuads + index, - psi.x * psiPrime.x + psi.y * psiPrime.y); - dftfe::utils::copyValue(XQuads + index, - psi.x * psi.x + psi.y * psi.y); - } - } - - __global__ void - computeRhoResponseFromInterpolatedValues(const unsigned int numberEntries, - float * XQuads, - float * XPrimeQuads) - { - 
const unsigned int globalThreadId = blockIdx.x * blockDim.x + threadIdx.x; - - for (unsigned int index = globalThreadId; index < numberEntries; - index += blockDim.x * gridDim.x) - { - const float psi = XQuads[index]; - const float psiPrime = XPrimeQuads[index]; - XPrimeQuads[index] = psi * psiPrime; - XQuads[index] = psi * psi; - } - } - - __global__ void - computeRhoResponseFromInterpolatedValues( - const unsigned int numberEntries, - dftfe::utils::deviceFloatComplex *XQuads, - dftfe::utils::deviceFloatComplex *XPrimeQuads) - { - const unsigned int globalThreadId = blockIdx.x * blockDim.x + threadIdx.x; - - for (unsigned int index = globalThreadId; index < numberEntries; - index += blockDim.x * gridDim.x) - { - const dftfe::utils::deviceFloatComplex psi = XQuads[index]; - const dftfe::utils::deviceFloatComplex psiPrime = XPrimeQuads[index]; - dftfe::utils::copyValue(XPrimeQuads + index, - psi.x * psiPrime.x + psi.y * psiPrime.y); - dftfe::utils::copyValue(XQuads + index, - psi.x * psi.x + psi.y * psi.y); - } - } - } // namespace - - template - void - computeRhoFirstOrderResponseDevice( - const NumberType * X, - const NumberType * XPrime, - const std::vector> & densityMatDerFermiEnergy, - const unsigned int totalNumWaveFunctions, - const unsigned int numLocalDofs, - operatorDFTDeviceClass & operatorMatrix, - const unsigned int matrixFreeDofhandlerIndex, - const dealii::DoFHandler<3> & dofHandler, - const unsigned int totalLocallyOwnedCells, - const unsigned int numNodesPerElement, - const unsigned int numQuadPoints, - const std::vector & kPointWeights, - std::map> &rhoResponseValuesHam, - std::map> &rhoResponseValuesFermiEnergy, - std::map> - &rhoResponseValuesHamSpinPolarized, - std::map> - & rhoResponseValuesFermiEnergySpinPolarized, - const MPI_Comm & mpiCommParent, - const MPI_Comm & interpoolcomm, - const MPI_Comm & interBandGroupComm, - const dftParameters &dftParams) - { - int this_process; - MPI_Comm_rank(mpiCommParent, &this_process); - 
dftfe::utils::deviceSynchronize(); - MPI_Barrier(mpiCommParent); - double device_time = MPI_Wtime(); - const unsigned int numKPoints = kPointWeights.size(); - - // band group parallelization data structures - const unsigned int numberBandGroups = - dealii::Utilities::MPI::n_mpi_processes(interBandGroupComm); - const unsigned int bandGroupTaskId = - dealii::Utilities::MPI::this_mpi_process(interBandGroupComm); - std::vector bandGroupLowHighPlusOneIndices; - dftUtils::createBandParallelizationIndices(interBandGroupComm, - totalNumWaveFunctions, - bandGroupLowHighPlusOneIndices); - - const unsigned int BVec = - std::min(dftParams.chebyWfcBlockSize, totalNumWaveFunctions); - - const double spinPolarizedFactor = - (dftParams.spinPolarized == 1) ? 1.0 : 2.0; - - const NumberTypeLowPrec zero = 0; - const NumberTypeLowPrec one = 1.0; - const NumberTypeLowPrec scalarCoeffAlphaRho = 1.0; - const NumberTypeLowPrec scalarCoeffBetaRho = 1.0; - - const unsigned int cellsBlockSize = 50; - const unsigned int numCellBlocks = totalLocallyOwnedCells / cellsBlockSize; - const unsigned int remCellBlockSize = - totalLocallyOwnedCells - numCellBlocks * cellsBlockSize; - - dftfe::utils::MemoryStorage - rhoResponseContributionHamDevice(totalLocallyOwnedCells * numQuadPoints, - zero); - - dftfe::utils::MemoryStorage - rhoResponseContributionFermiEnergyDevice(totalLocallyOwnedCells * - numQuadPoints, - zero); - - dftfe::utils::MemoryStorage - rhoResponseContributionHamHost(totalLocallyOwnedCells * numQuadPoints, - zero); - - dftfe::utils::MemoryStorage - rhoResponseContributionFermiEnergyHost(totalLocallyOwnedCells * - numQuadPoints, - zero); - - std::vector rhoResponseValuesHamFlattenedHost( - totalLocallyOwnedCells * numQuadPoints, 0.0); - std::vector rhoResponseValuesFermiEnergyFlattenedHost( - totalLocallyOwnedCells * numQuadPoints, 0.0); - - std::vector rhoResponseValuesSpinPolarizedHamFlattenedHost( - totalLocallyOwnedCells * numQuadPoints * 2, 0.0); - std::vector 
rhoResponseValuesSpinPolarizedFermiEnergyFlattenedHost( - totalLocallyOwnedCells * numQuadPoints * 2, 0.0); - - dftfe::utils::MemoryStorage - XQuadsDevice(cellsBlockSize * numQuadPoints * BVec, zero); - - dftfe::utils::MemoryStorage - XPrimeQuadsDevice(cellsBlockSize * numQuadPoints * BVec, zero); - dftfe::utils::MemoryStorage - onesVecDevice(BVec, one); - - dftfe::utils::MemoryStorage - densityMatDerFermiEnergyVec(BVec, zero); - dftfe::utils::MemoryStorage - densityMatDerFermiEnergyVecDevice(BVec, zero); - - distributedDeviceVec &deviceFlattenedArrayXBlock = - operatorMatrix.getParallelChebyBlockVectorDevice(); - - distributedDeviceVec &deviceFlattenedArrayXPrimeBlock = - operatorMatrix.getParallelChebyBlockVector2Device(); - - - dftfe::utils::MemoryStorage - cellWaveFunctionMatrix(cellsBlockSize * numNodesPerElement * BVec, zero); - - dftfe::utils::MemoryStorage - shapeFunctionValuesTransposedDevice(numNodesPerElement * numQuadPoints, - zero); - - shapeFunctionValuesTransposedDevice.setValue(zero); - - - dftfe::utils::deviceKernelsGeneric::copyValueType1ArrToValueType2Arr( - numNodesPerElement * numQuadPoints, - (operatorMatrix.getShapeFunctionValuesTransposed(true)).begin(), - shapeFunctionValuesTransposedDevice.begin()); - - for (unsigned int spinIndex = 0; spinIndex < (1 + dftParams.spinPolarized); - ++spinIndex) - { - for (unsigned int kPoint = 0; kPoint < kPointWeights.size(); ++kPoint) - { - rhoResponseContributionHamDevice.setValue(zero); - rhoResponseContributionFermiEnergyDevice.setValue(zero); - - for (unsigned int jvec = 0; jvec < totalNumWaveFunctions; - jvec += BVec) - { - if ((jvec + BVec) <= - bandGroupLowHighPlusOneIndices[2 * bandGroupTaskId + 1] && - (jvec + BVec) > - bandGroupLowHighPlusOneIndices[2 * bandGroupTaskId]) - { - for (unsigned int iEigenVec = 0; iEigenVec < BVec; - ++iEigenVec) - { - *(densityMatDerFermiEnergyVec.begin() + iEigenVec) = - densityMatDerFermiEnergy - [(dftParams.spinPolarized + 1) * kPoint + spinIndex] - [jvec + 
iEigenVec]; - } - - densityMatDerFermiEnergyVec - .template copyTo( - densityMatDerFermiEnergyVecDevice); - - dftfe::utils::deviceKernelsGeneric:: - stridedCopyToBlockConstantStride( - BVec, - totalNumWaveFunctions, - numLocalDofs, - jvec, - X + numLocalDofs * totalNumWaveFunctions * - ((dftParams.spinPolarized + 1) * kPoint + - spinIndex), - deviceFlattenedArrayXBlock.begin()); - - deviceFlattenedArrayXBlock.updateGhostValues(); - - (operatorMatrix.getOverloadedConstraintMatrix()) - ->distribute(deviceFlattenedArrayXBlock, BVec); - - - dftfe::utils::deviceKernelsGeneric:: - stridedCopyToBlockConstantStride( - BVec, - totalNumWaveFunctions, - numLocalDofs, - jvec, - XPrime + numLocalDofs * totalNumWaveFunctions * - ((dftParams.spinPolarized + 1) * kPoint + - spinIndex), - deviceFlattenedArrayXPrimeBlock.begin()); - - deviceFlattenedArrayXPrimeBlock.updateGhostValues(); - - (operatorMatrix.getOverloadedConstraintMatrix()) - ->distribute(deviceFlattenedArrayXPrimeBlock, BVec); - - - for (int iblock = 0; iblock < (numCellBlocks + 1); iblock++) - { - const unsigned int currentCellsBlockSize = - (iblock == numCellBlocks) ? 
remCellBlockSize : - cellsBlockSize; - if (currentCellsBlockSize > 0) - { - const unsigned int startingCellId = - iblock * cellsBlockSize; - - - - dftfe::utils::deviceKernelsGeneric:: - stridedCopyToBlock( - BVec, - currentCellsBlockSize * numNodesPerElement, - deviceFlattenedArrayXBlock.begin(), - cellWaveFunctionMatrix.begin(), - (operatorMatrix - .getFlattenedArrayCellLocalProcIndexIdMap()) - .begin() + - startingCellId * numNodesPerElement); - - NumberTypeLowPrec scalarCoeffAlpha = 1.0; - NumberTypeLowPrec scalarCoeffBeta = 0.0; - int strideA = BVec * numNodesPerElement; - int strideB = 0; - int strideC = BVec * numQuadPoints; - - - dftfe::utils::deviceBlasWrapper::gemmStridedBatched( - operatorMatrix.getDeviceBlasHandle(), - dftfe::utils::DEVICEBLAS_OP_N, - dftfe::utils::DEVICEBLAS_OP_N, - BVec, - numQuadPoints, - numNodesPerElement, - &scalarCoeffAlpha, - cellWaveFunctionMatrix.begin(), - BVec, - strideA, - shapeFunctionValuesTransposedDevice.begin(), - numNodesPerElement, - strideB, - &scalarCoeffBeta, - XQuadsDevice.begin(), - BVec, - strideC, - currentCellsBlockSize); - - - dftfe::utils::deviceKernelsGeneric:: - stridedCopyToBlock( - BVec, - currentCellsBlockSize * numNodesPerElement, - deviceFlattenedArrayXPrimeBlock.begin(), - cellWaveFunctionMatrix.begin(), - (operatorMatrix - .getFlattenedArrayCellLocalProcIndexIdMap()) - .begin() + - startingCellId * numNodesPerElement); - - dftfe::utils::deviceBlasWrapper::gemmStridedBatched( - operatorMatrix.getDeviceBlasHandle(), - dftfe::utils::DEVICEBLAS_OP_N, - dftfe::utils::DEVICEBLAS_OP_N, - BVec, - numQuadPoints, - numNodesPerElement, - &scalarCoeffAlpha, - cellWaveFunctionMatrix.begin(), - BVec, - strideA, - shapeFunctionValuesTransposedDevice.begin(), - numNodesPerElement, - strideB, - &scalarCoeffBeta, - XPrimeQuadsDevice.begin(), - BVec, - strideC, - currentCellsBlockSize); - -#ifdef DFTFE_WITH_DEVICE_LANG_CUDA - computeRhoResponseFromInterpolatedValues<<< - (BVec + (dftfe::utils::DEVICE_BLOCK_SIZE - 1)) 
/ - dftfe::utils::DEVICE_BLOCK_SIZE * - numQuadPoints * currentCellsBlockSize, - dftfe::utils::DEVICE_BLOCK_SIZE>>>( - BVec * numQuadPoints * currentCellsBlockSize, - dftfe::utils::makeDataTypeDeviceCompatible( - XQuadsDevice.begin()), - dftfe::utils::makeDataTypeDeviceCompatible( - XPrimeQuadsDevice.begin())); -#elif DFTFE_WITH_DEVICE_LANG_HIP - hipLaunchKernelGGL( - computeRhoResponseFromInterpolatedValues, - (BVec + (dftfe::utils::DEVICE_BLOCK_SIZE - 1)) / - dftfe::utils::DEVICE_BLOCK_SIZE * - numQuadPoints * currentCellsBlockSize, - dftfe::utils::DEVICE_BLOCK_SIZE, - 0, - 0, - BVec * numQuadPoints * currentCellsBlockSize, - dftfe::utils::makeDataTypeDeviceCompatible( - XQuadsDevice.begin()), - dftfe::utils::makeDataTypeDeviceCompatible( - XPrimeQuadsDevice.begin())); -#endif - - dftfe::utils::deviceBlasWrapper::gemm( - operatorMatrix.getDeviceBlasHandle(), - dftfe::utils::DEVICEBLAS_OP_N, - dftfe::utils::DEVICEBLAS_OP_N, - 1, - currentCellsBlockSize * numQuadPoints, - BVec, - &scalarCoeffAlphaRho, - onesVecDevice.begin(), - 1, - XPrimeQuadsDevice.begin(), - BVec, - &scalarCoeffBetaRho, - rhoResponseContributionHamDevice.begin() + - startingCellId * numQuadPoints, - 1); - - dftfe::utils::deviceBlasWrapper::gemm( - operatorMatrix.getDeviceBlasHandle(), - dftfe::utils::DEVICEBLAS_OP_N, - dftfe::utils::DEVICEBLAS_OP_N, - 1, - currentCellsBlockSize * numQuadPoints, - BVec, - &scalarCoeffAlphaRho, - densityMatDerFermiEnergyVecDevice.begin(), - 1, - XQuadsDevice.begin(), - BVec, - &scalarCoeffBetaRho, - rhoResponseContributionFermiEnergyDevice.begin() + - startingCellId * numQuadPoints, - 1); - - } // non-trivial cell block check - } // cells block loop - } // band parallelizatoin check - } // wave function block loop - - - // do memcopy to host - rhoResponseContributionHamDevice - .template copyTo( - rhoResponseContributionHamHost.begin(), - totalLocallyOwnedCells * numQuadPoints, - 0, - 0); - - rhoResponseContributionFermiEnergyDevice - .template copyTo( - 
rhoResponseContributionFermiEnergyHost.begin(), - totalLocallyOwnedCells * numQuadPoints, - 0, - 0); - - for (int icell = 0; icell < totalLocallyOwnedCells; icell++) - for (unsigned int iquad = 0; iquad < numQuadPoints; ++iquad) - { - rhoResponseValuesHamFlattenedHost[icell * numQuadPoints + - iquad] += - kPointWeights[kPoint] * spinPolarizedFactor * - dftfe::utils::realPart( - *(rhoResponseContributionHamHost.begin() + - icell * numQuadPoints + iquad)); - - rhoResponseValuesFermiEnergyFlattenedHost[icell * - numQuadPoints + - iquad] += - kPointWeights[kPoint] * spinPolarizedFactor * - dftfe::utils::realPart( - *(rhoResponseContributionFermiEnergyHost.begin() + - icell * numQuadPoints + iquad)); - } - - - if (dftParams.spinPolarized == 1) - { - for (int icell = 0; icell < totalLocallyOwnedCells; icell++) - for (unsigned int iquad = 0; iquad < numQuadPoints; ++iquad) - { - rhoResponseValuesSpinPolarizedHamFlattenedHost - [icell * numQuadPoints * 2 + iquad * 2 + spinIndex] += - kPointWeights[kPoint] * - dftfe::utils::realPart( - *(rhoResponseContributionHamHost.begin() + - icell * numQuadPoints + iquad)); - - rhoResponseValuesSpinPolarizedFermiEnergyFlattenedHost - [icell * numQuadPoints * 2 + iquad * 2 + spinIndex] += - kPointWeights[kPoint] * - dftfe::utils::realPart( - *(rhoResponseContributionFermiEnergyHost.begin() + - icell * numQuadPoints + iquad)); - } - } - - - } // kpoint loop - } // spin index loop - - // gather density from all inter communicators - if (dealii::Utilities::MPI::n_mpi_processes(interpoolcomm) > 1) - { - dealii::Utilities::MPI::sum(rhoResponseValuesHamFlattenedHost, - interpoolcomm, - rhoResponseValuesHamFlattenedHost); - - dealii::Utilities::MPI::sum(rhoResponseValuesFermiEnergyFlattenedHost, - interpoolcomm, - rhoResponseValuesFermiEnergyFlattenedHost); - - if (dftParams.spinPolarized == 1) - { - dealii::Utilities::MPI::sum( - rhoResponseValuesSpinPolarizedHamFlattenedHost, - interpoolcomm, - 
rhoResponseValuesSpinPolarizedHamFlattenedHost); - - dealii::Utilities::MPI::sum( - rhoResponseValuesSpinPolarizedFermiEnergyFlattenedHost, - interpoolcomm, - rhoResponseValuesSpinPolarizedFermiEnergyFlattenedHost); - } - } - - if (dealii::Utilities::MPI::n_mpi_processes(interBandGroupComm) > 1) - { - dealii::Utilities::MPI::sum(rhoResponseValuesHamFlattenedHost, - interBandGroupComm, - rhoResponseValuesHamFlattenedHost); - - dealii::Utilities::MPI::sum(rhoResponseValuesFermiEnergyFlattenedHost, - interBandGroupComm, - rhoResponseValuesFermiEnergyFlattenedHost); - - if (dftParams.spinPolarized == 1) - { - dealii::Utilities::MPI::sum( - rhoResponseValuesSpinPolarizedHamFlattenedHost, - interBandGroupComm, - rhoResponseValuesSpinPolarizedHamFlattenedHost); - - dealii::Utilities::MPI::sum( - rhoResponseValuesSpinPolarizedFermiEnergyFlattenedHost, - interBandGroupComm, - rhoResponseValuesSpinPolarizedFermiEnergyFlattenedHost); - } - } - - unsigned int iElem = 0; - typename dealii::DoFHandler<3>::active_cell_iterator cell = - dofHandler.begin_active(); - typename dealii::DoFHandler<3>::active_cell_iterator endc = - dofHandler.end(); - for (; cell != endc; ++cell) - if (cell->is_locally_owned()) - { - const dealii::CellId cellid = cell->id(); - - std::vector &temp1Quads = (rhoResponseValuesHam)[cellid]; - std::vector &temp2Quads = - (rhoResponseValuesFermiEnergy)[cellid]; - for (unsigned int q = 0; q < numQuadPoints; ++q) - { - temp1Quads[q] = - rhoResponseValuesHamFlattenedHost[iElem * numQuadPoints + q]; - temp2Quads[q] = - rhoResponseValuesFermiEnergyFlattenedHost[iElem * - numQuadPoints + - q]; - } - - if (dftParams.spinPolarized == 1) - { - std::vector &temp3Quads = - (rhoResponseValuesHamSpinPolarized)[cellid]; - - std::vector &temp4Quads = - (rhoResponseValuesFermiEnergySpinPolarized)[cellid]; - - for (unsigned int q = 0; q < numQuadPoints; ++q) - { - temp3Quads[2 * q + 0] = - rhoResponseValuesSpinPolarizedHamFlattenedHost - [iElem * numQuadPoints * 2 + 2 * q + 
0]; - temp3Quads[2 * q + 1] = - rhoResponseValuesSpinPolarizedHamFlattenedHost - [iElem * numQuadPoints * 2 + 2 * q + 1]; - temp4Quads[2 * q + 0] = - rhoResponseValuesSpinPolarizedFermiEnergyFlattenedHost - [iElem * numQuadPoints * 2 + 2 * q + 0]; - temp4Quads[2 * q + 1] = - rhoResponseValuesSpinPolarizedFermiEnergyFlattenedHost - [iElem * numQuadPoints * 2 + 2 * q + 1]; - } - } - - iElem++; - } - - dftfe::utils::deviceSynchronize(); - MPI_Barrier(mpiCommParent); - device_time = MPI_Wtime() - device_time; - - if (this_process == 0 && dftParams.verbosity >= 2) - std::cout << "Time for compute rhoprime on Device: " << device_time - << std::endl; - } - - template void - computeRhoFirstOrderResponseDevice( - const dataTypes::number * X, - const dataTypes::number * XPrime, - const std::vector> & densityMatDerFermiEnergy, - const unsigned int totalNumWaveFunctions, - const unsigned int numLocalDofs, - operatorDFTDeviceClass & operatorMatrix, - const unsigned int matrixFreeDofhandlerIndex, - const dealii::DoFHandler<3> & dofHandler, - const unsigned int totalLocallyOwnedCells, - const unsigned int numNodesPerElement, - const unsigned int numQuadPoints, - const std::vector & kPointWeights, - std::map> &rhoResponseValuesHam, - std::map> &rhoResponseValuesFermiEnergy, - std::map> - &rhoResponseValuesHamSpinPolarized, - std::map> - & rhoResponseValuesFermiEnergySpinPolarized, - const MPI_Comm & mpiCommParent, - const MPI_Comm & interpoolcomm, - const MPI_Comm & interBandGroupComm, - const dftParameters &dftParams); - - template void - computeRhoFirstOrderResponseDevice( - const dataTypes::number * X, - const dataTypes::number * XPrime, - const std::vector> & densityMatDerFermiEnergy, - const unsigned int totalNumWaveFunctions, - const unsigned int numLocalDofs, - operatorDFTDeviceClass & operatorMatrix, - const unsigned int matrixFreeDofhandlerIndex, - const dealii::DoFHandler<3> & dofHandler, - const unsigned int totalLocallyOwnedCells, - const unsigned int 
numNodesPerElement, - const unsigned int numQuadPoints, - const std::vector & kPointWeights, - std::map> &rhoResponseValuesHam, - std::map> &rhoResponseValuesFermiEnergy, - std::map> - &rhoResponseValuesHamSpinPolarized, - std::map> - & rhoResponseValuesFermiEnergySpinPolarized, - const MPI_Comm & mpiCommParent, - const MPI_Comm & interpoolcomm, - const MPI_Comm & interBandGroupComm, - const dftParameters &dftParams); -} // namespace dftfe diff --git a/src/dft/densityFirstOrderResponseCalculatorDeviceKernels.cc b/src/dft/densityFirstOrderResponseCalculatorDeviceKernels.cc new file mode 100644 index 000000000..13f7f0fe9 --- /dev/null +++ b/src/dft/densityFirstOrderResponseCalculatorDeviceKernels.cc @@ -0,0 +1,185 @@ +// --------------------------------------------------------------------- +// +// Copyright (c) 2017-2022 The Regents of the University of Michigan and DFT-FE +// authors. +// +// This file is part of the DFT-FE code. +// +// The DFT-FE code is free software; you can use it, redistribute +// it, and/or modify it under the terms of the GNU Lesser General +// Public License as published by the Free Software Foundation; either +// version 2.1 of the License, or (at your option) any later version. +// The full text of the license can be found in the file LICENSE at +// the top level of the DFT-FE distribution. 
+// +// --------------------------------------------------------------------- +// +// @author Sambit Das +// + +// source file for electron density related computations +#include +#include +#include +#include +#include +#include +#include +#include + +namespace dftfe +{ + namespace + { + __global__ void + computeRhoResponseFromInterpolatedValues( + const unsigned int numVectors, + const unsigned int numCells, + const unsigned int nQuadsPerCell, + double * wfcContributions, + double * wfcPrimeContributions, + double * rhoResponseHamCellsWfcContributions, + double * rhoResponseFermiEnergyCellsWfcContributions) + { + const unsigned int globalThreadId = blockIdx.x * blockDim.x + threadIdx.x; + const unsigned int numEntriesPerCell = numVectors * nQuadsPerCell; + const unsigned int numberEntries = numEntriesPerCell * numCells; + + for (unsigned int index = globalThreadId; index < numberEntries; + index += blockDim.x * gridDim.x) + { + const double psi = wfcContributions[index]; + const double psiPrime = wfcPrimeContributions[index]; + rhoResponseFermiEnergyCellsWfcContributions[index] = psi * psi; + rhoResponseHamCellsWfcContributions[index] = psi * psiPrime; + + } + } + + __global__ void + computeRhoResponseFromInterpolatedValues( + const unsigned int numVectors, + const unsigned int numCells, + const unsigned int nQuadsPerCell, + dftfe::utils::deviceDoubleComplex *wfcContributions, + dftfe::utils::deviceDoubleComplex *wfcPrimeContributions, + double * rhoResponseHamCellsWfcContributions, + double * rhoResponseFermiEnergyCellsWfcContributions) + { + const unsigned int globalThreadId = blockIdx.x * blockDim.x + threadIdx.x; + const unsigned int numEntriesPerCell = numVectors * nQuadsPerCell; + const unsigned int numberEntries = numEntriesPerCell * numCells; + + for (unsigned int index = globalThreadId; index < numberEntries; + index += blockDim.x * gridDim.x) + { + const dftfe::utils::deviceDoubleComplex psi = wfcContributions[index]; + const dftfe::utils::deviceDoubleComplex
psiPrime = wfcPrimeContributions[index]; + rhoResponseFermiEnergyCellsWfcContributions[index] = psi.x * psi.x + psi.y * psi.y; + rhoResponseHamCellsWfcContributions[index] = psi.x * psiPrime.x + psi.y * psiPrime.y; + + } + } + } // namespace + template + void + computeRhoResponseFromInterpolatedValues( + std::shared_ptr< + dftfe::basis::FEBasisOperations> + &basisOperationsPtr, + std::shared_ptr< + dftfe::linearAlgebra::BLASWrapper> + & BLASWrapperPtr, + const std::pair cellRange, + const std::pair vecRange, + double * onesVec, + double * partialOccupPrimeVec, + NumberType * wfcQuadPointData, + NumberType * wfcPrimeQuadPointData, + double * rhoResponseHamCellsWfcContributions, + double * rhoResponseFermiEnergyCellsWfcContributions, + double * rhoResponseHam, + double * rhoResponseFermiEnergy) + { + const unsigned int cellsBlockSize = cellRange.second - cellRange.first; + const unsigned int vectorsBlockSize = vecRange.second - vecRange.first; + const unsigned int nQuadsPerCell = basisOperationsPtr->nQuadsPerCell(); + const unsigned int nCells = basisOperationsPtr->nCells(); + const double scalarCoeffAlphaRho = 1.0; + const double scalarCoeffBetaRho = 1.0; +#ifdef DFTFE_WITH_DEVICE_LANG_CUDA + computeRhoResponseFromInterpolatedValues<<< + (vectorsBlockSize + (dftfe::utils::DEVICE_BLOCK_SIZE - 1)) / + dftfe::utils::DEVICE_BLOCK_SIZE * nQuadsPerCell * cellsBlockSize, + dftfe::utils::DEVICE_BLOCK_SIZE>>>( + vectorsBlockSize, + cellsBlockSize, + nQuadsPerCell, + dftfe::utils::makeDataTypeDeviceCompatible(wfcQuadPointData), + dftfe::utils::makeDataTypeDeviceCompatible(wfcPrimeQuadPointData), + dftfe::utils::makeDataTypeDeviceCompatible(rhoResponseHamCellsWfcContributions), + dftfe::utils::makeDataTypeDeviceCompatible(rhoResponseFermiEnergyCellsWfcContributions)); +#elif DFTFE_WITH_DEVICE_LANG_HIP + hipLaunchKernelGGL( + computeRhoResponseFromInterpolatedValues, + (vectorsBlockSize + (dftfe::utils::DEVICE_BLOCK_SIZE - 1)) / + dftfe::utils::DEVICE_BLOCK_SIZE * nQuadsPerCell 
* cellsBlockSize, + dftfe::utils::DEVICE_BLOCK_SIZE, + 0, + 0, + vectorsBlockSize, + cellsBlockSize, + nQuadsPerCell, + dftfe::utils::makeDataTypeDeviceCompatible(wfcQuadPointData), + dftfe::utils::makeDataTypeDeviceCompatible(wfcPrimeQuadPointData), + dftfe::utils::makeDataTypeDeviceCompatible(rhoResponseHamCellsWfcContributions), + dftfe::utils::makeDataTypeDeviceCompatible(rhoResponseFermiEnergyCellsWfcContributions)); +#endif + BLASWrapperPtr->xgemv('T', + vectorsBlockSize, + cellsBlockSize * nQuadsPerCell, + &scalarCoeffAlphaRho, + rhoResponseHamCellsWfcContributions, + vectorsBlockSize, + onesVec, + 1, + &scalarCoeffBetaRho, + rhoResponseHam + cellRange.first * nQuadsPerCell, + 1); + + BLASWrapperPtr->xgemv('T', + vectorsBlockSize, + cellsBlockSize * nQuadsPerCell, + &scalarCoeffAlphaRho, + rhoResponseFermiEnergyCellsWfcContributions, + vectorsBlockSize, + partialOccupPrimeVec, + 1, + &scalarCoeffBetaRho, + rhoResponseFermiEnergy + cellRange.first * nQuadsPerCell, + 1); + } + template void + computeRhoResponseFromInterpolatedValues( + std::shared_ptr< + dftfe::basis::FEBasisOperations> + &basisOperationsPtr, + std::shared_ptr< + dftfe::linearAlgebra::BLASWrapper> + & BLASWrapperPtr, + const std::pair cellRange, + const std::pair vecRange, + double * onesVec, + double * partialOccupPrimeVec, + dataTypes::number * wfcQuadPointData, + dataTypes::number * wfcPrimeQuadPointData, + double * rhoResponseHamCellsWfcContributions, + double * rhoResponseFermiEnergyCellsWfcContributions, + double * rhoResponseHam, + double * rhoResponseFermiEnergy); + +} // namespace dftfe diff --git a/src/dft/initBoundaryConditions.cc b/src/dft/initBoundaryConditions.cc index a19ea8393..f13327224 100644 --- a/src/dft/initBoundaryConditions.cc +++ b/src/dft/initBoundaryConditions.cc @@ -306,14 +306,14 @@ namespace dftfe bandGroupLowHighPlusOneIndices[1]); d_basisOperationsPtrHost->createScratchMultiVectors( - BVec, (d_dftParamsPtr->spinPolarized + 1)); + BVec, (d_dftParamsPtr->spinPolarized + 1)*2); if
(d_numEigenValues % BVec != 0) d_basisOperationsPtrHost->createScratchMultiVectors( - d_numEigenValues % BVec, (d_dftParamsPtr->spinPolarized + 1)); + d_numEigenValues % BVec, (d_dftParamsPtr->spinPolarized + 1)*2); if (d_numEigenValues != d_numEigenValuesRR && d_numEigenValuesRR % BVec != 0) d_basisOperationsPtrHost->createScratchMultiVectors( - d_numEigenValuesRR % BVec, (d_dftParamsPtr->spinPolarized + 1)); + d_numEigenValuesRR % BVec, (d_dftParamsPtr->spinPolarized + 1)*2); } #if defined(DFTFE_WITH_DEVICE) if (d_dftParamsPtr->useDevice && recomputeBasisData) @@ -325,11 +325,8 @@ namespace dftfe const unsigned int BVec = std::min(d_dftParamsPtr->chebyWfcBlockSize, d_numEigenValues); - if (d_dftParamsPtr->mixingMethod == "LOW_RANK_DIELECM_PRECOND") - d_basisOperationsPtrDevice->createScratchMultiVectors(BVec, 2); - else - d_basisOperationsPtrDevice->createScratchMultiVectors( - BVec, (d_dftParamsPtr->spinPolarized + 1)); + d_basisOperationsPtrDevice->createScratchMultiVectors( + BVec, (d_dftParamsPtr->spinPolarized + 1)*2); d_basisOperationsPtrDevice->computeCellStiffnessMatrix( d_feOrderPlusOneQuadratureId, 50, true, false); } @@ -371,11 +368,8 @@ namespace dftfe const unsigned int BVec = std::min(d_dftParamsPtr->chebyWfcBlockSize, d_numEigenValues); - if (d_dftParamsPtr->mixingMethod == "LOW_RANK_DIELECM_PRECOND") - d_basisOperationsPtrDevice->createScratchMultiVectors(BVec, 2); - else - d_basisOperationsPtrDevice->createScratchMultiVectors( - BVec, (d_dftParamsPtr->spinPolarized + 1)); + d_basisOperationsPtrDevice->createScratchMultiVectors( + BVec, (d_dftParamsPtr->spinPolarized + 1)*2); } #endif From b09194f12a49a15861db649753b6233271c0fa29 Mon Sep 17 00:00:00 2001 From: Sambit Das Date: Sat, 10 Feb 2024 17:50:42 -0500 Subject: [PATCH 07/24] delete older file --- ...nsityFirstOrderResponseCalculatorDevice.cc | 679 ------------------ 1 file changed, 679 deletions(-) delete mode 100644 src/dft/densityFirstOrderResponseCalculatorDevice.cc diff --git 
a/src/dft/densityFirstOrderResponseCalculatorDevice.cc b/src/dft/densityFirstOrderResponseCalculatorDevice.cc deleted file mode 100644 index 6d742111b..000000000 --- a/src/dft/densityFirstOrderResponseCalculatorDevice.cc +++ /dev/null @@ -1,679 +0,0 @@ -// --------------------------------------------------------------------- -// -// Copyright (c) 2017-2018 The Regents of the University of Michigan and DFT-FE -// authors. -// -// This file is part of the DFT-FE code. -// -// The DFT-FE code is free software; you can use it, redistribute -// it, and/or modify it under the terms of the GNU Lesser General -// Public License as published by the Free Software Foundation; either -// version 2.1 of the License, or (at your option) any later version. -// The full text of the license can be found in the file LICENSE at -// the top level of the DFT-FE distribution. -// -// --------------------------------------------------------------------- -// -// @author Sambit Das -// - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -namespace dftfe -{ - namespace - { - __global__ void - computeRhoResponseFromInterpolatedValues(const unsigned int numberEntries, - double * XQuads, - double * XPrimeQuads) - { - const unsigned int globalThreadId = blockIdx.x * blockDim.x + threadIdx.x; - - for (unsigned int index = globalThreadId; index < numberEntries; - index += blockDim.x * gridDim.x) - { - const double psi = XQuads[index]; - const double psiPrime = XPrimeQuads[index]; - XPrimeQuads[index] = psi * psiPrime; - XQuads[index] = psi * psi; - } - } - - __global__ void - computeRhoResponseFromInterpolatedValues( - const unsigned int numberEntries, - dftfe::utils::deviceDoubleComplex *XQuads, - dftfe::utils::deviceDoubleComplex *XPrimeQuads) - { - const unsigned int globalThreadId = blockIdx.x * blockDim.x + threadIdx.x; - - for (unsigned int index = globalThreadId; index < numberEntries; - index += blockDim.x * 
gridDim.x) - { - const dftfe::utils::deviceDoubleComplex psi = XQuads[index]; - const dftfe::utils::deviceDoubleComplex psiPrime = XPrimeQuads[index]; - dftfe::utils::copyValue(XPrimeQuads + index, - psi.x * psiPrime.x + psi.y * psiPrime.y); - dftfe::utils::copyValue(XQuads + index, - psi.x * psi.x + psi.y * psi.y); - } - } - - __global__ void - computeRhoResponseFromInterpolatedValues(const unsigned int numberEntries, - float * XQuads, - float * XPrimeQuads) - { - const unsigned int globalThreadId = blockIdx.x * blockDim.x + threadIdx.x; - - for (unsigned int index = globalThreadId; index < numberEntries; - index += blockDim.x * gridDim.x) - { - const float psi = XQuads[index]; - const float psiPrime = XPrimeQuads[index]; - XPrimeQuads[index] = psi * psiPrime; - XQuads[index] = psi * psi; - } - } - - __global__ void - computeRhoResponseFromInterpolatedValues( - const unsigned int numberEntries, - dftfe::utils::deviceFloatComplex *XQuads, - dftfe::utils::deviceFloatComplex *XPrimeQuads) - { - const unsigned int globalThreadId = blockIdx.x * blockDim.x + threadIdx.x; - - for (unsigned int index = globalThreadId; index < numberEntries; - index += blockDim.x * gridDim.x) - { - const dftfe::utils::deviceFloatComplex psi = XQuads[index]; - const dftfe::utils::deviceFloatComplex psiPrime = XPrimeQuads[index]; - dftfe::utils::copyValue(XPrimeQuads + index, - psi.x * psiPrime.x + psi.y * psiPrime.y); - dftfe::utils::copyValue(XQuads + index, - psi.x * psi.x + psi.y * psi.y); - } - } - } // namespace - - template - void - computeRhoFirstOrderResponseDevice( - const NumberType * X, - const NumberType * XPrime, - const std::vector> & densityMatDerFermiEnergy, - const unsigned int totalNumWaveFunctions, - const unsigned int numLocalDofs, - operatorDFTDeviceClass & operatorMatrix, - const unsigned int matrixFreeDofhandlerIndex, - const dealii::DoFHandler<3> & dofHandler, - const unsigned int totalLocallyOwnedCells, - const unsigned int numNodesPerElement, - const unsigned int 
numQuadPoints, - const std::vector & kPointWeights, - std::map> &rhoResponseValuesHam, - std::map> &rhoResponseValuesFermiEnergy, - std::map> - &rhoResponseValuesHamSpinPolarized, - std::map> - & rhoResponseValuesFermiEnergySpinPolarized, - const MPI_Comm & mpiCommParent, - const MPI_Comm & interpoolcomm, - const MPI_Comm & interBandGroupComm, - const dftParameters &dftParams, - const std::shared_ptr< - dftfe::linearAlgebra::BLASWrapper> - BLASWrapperPtr) - { - int this_process; - MPI_Comm_rank(mpiCommParent, &this_process); - dftfe::utils::deviceSynchronize(); - MPI_Barrier(mpiCommParent); - double device_time = MPI_Wtime(); - const unsigned int numKPoints = kPointWeights.size(); - - // band group parallelization data structures - const unsigned int numberBandGroups = - dealii::Utilities::MPI::n_mpi_processes(interBandGroupComm); - const unsigned int bandGroupTaskId = - dealii::Utilities::MPI::this_mpi_process(interBandGroupComm); - std::vector bandGroupLowHighPlusOneIndices; - dftUtils::createBandParallelizationIndices(interBandGroupComm, - totalNumWaveFunctions, - bandGroupLowHighPlusOneIndices); - - const unsigned int BVec = - std::min(dftParams.chebyWfcBlockSize, totalNumWaveFunctions); - - const double spinPolarizedFactor = - (dftParams.spinPolarized == 1) ? 
1.0 : 2.0; - - const NumberTypeLowPrec zero = 0; - const NumberTypeLowPrec one = 1.0; - const NumberTypeLowPrec scalarCoeffAlphaRho = 1.0; - const NumberTypeLowPrec scalarCoeffBetaRho = 1.0; - - const unsigned int cellsBlockSize = 50; - const unsigned int numCellBlocks = totalLocallyOwnedCells / cellsBlockSize; - const unsigned int remCellBlockSize = - totalLocallyOwnedCells - numCellBlocks * cellsBlockSize; - - dftfe::utils::MemoryStorage - rhoResponseContributionHamDevice(totalLocallyOwnedCells * numQuadPoints, - zero); - - dftfe::utils::MemoryStorage - rhoResponseContributionFermiEnergyDevice(totalLocallyOwnedCells * - numQuadPoints, - zero); - - dftfe::utils::MemoryStorage - rhoResponseContributionHamHost(totalLocallyOwnedCells * numQuadPoints, - zero); - - dftfe::utils::MemoryStorage - rhoResponseContributionFermiEnergyHost(totalLocallyOwnedCells * - numQuadPoints, - zero); - - std::vector rhoResponseValuesHamFlattenedHost( - totalLocallyOwnedCells * numQuadPoints, 0.0); - std::vector rhoResponseValuesFermiEnergyFlattenedHost( - totalLocallyOwnedCells * numQuadPoints, 0.0); - - std::vector rhoResponseValuesSpinPolarizedHamFlattenedHost( - totalLocallyOwnedCells * numQuadPoints * 2, 0.0); - std::vector rhoResponseValuesSpinPolarizedFermiEnergyFlattenedHost( - totalLocallyOwnedCells * numQuadPoints * 2, 0.0); - - dftfe::utils::MemoryStorage - XQuadsDevice(cellsBlockSize * numQuadPoints * BVec, zero); - - dftfe::utils::MemoryStorage - XPrimeQuadsDevice(cellsBlockSize * numQuadPoints * BVec, zero); - dftfe::utils::MemoryStorage - onesVecDevice(BVec, one); - - dftfe::utils::MemoryStorage - densityMatDerFermiEnergyVec(BVec, zero); - dftfe::utils::MemoryStorage - densityMatDerFermiEnergyVecDevice(BVec, zero); - - distributedDeviceVec &deviceFlattenedArrayXBlock = - operatorMatrix.getParallelChebyBlockVectorDevice(); - - distributedDeviceVec &deviceFlattenedArrayXPrimeBlock = - operatorMatrix.getParallelChebyBlockVector2Device(); - - - dftfe::utils::MemoryStorage - 
cellWaveFunctionMatrix(cellsBlockSize * numNodesPerElement * BVec, zero); - - dftfe::utils::MemoryStorage - shapeFunctionValuesTransposedDevice(numNodesPerElement * numQuadPoints, - zero); - - shapeFunctionValuesTransposedDevice.setValue(zero); - - - BLASWrapperPtr->copyValueType1ArrToValueType2Arr( - numNodesPerElement * numQuadPoints, - (operatorMatrix.getShapeFunctionValuesTransposed(true)).begin(), - shapeFunctionValuesTransposedDevice.begin()); - - for (unsigned int spinIndex = 0; spinIndex < (1 + dftParams.spinPolarized); - ++spinIndex) - { - for (unsigned int kPoint = 0; kPoint < kPointWeights.size(); ++kPoint) - { - rhoResponseContributionHamDevice.setValue(zero); - rhoResponseContributionFermiEnergyDevice.setValue(zero); - - for (unsigned int jvec = 0; jvec < totalNumWaveFunctions; - jvec += BVec) - { - if ((jvec + BVec) <= - bandGroupLowHighPlusOneIndices[2 * bandGroupTaskId + 1] && - (jvec + BVec) > - bandGroupLowHighPlusOneIndices[2 * bandGroupTaskId]) - { - for (unsigned int iEigenVec = 0; iEigenVec < BVec; - ++iEigenVec) - { - *(densityMatDerFermiEnergyVec.begin() + iEigenVec) = - densityMatDerFermiEnergy - [(dftParams.spinPolarized + 1) * kPoint + spinIndex] - [jvec + iEigenVec]; - } - - densityMatDerFermiEnergyVec - .template copyTo( - densityMatDerFermiEnergyVecDevice); - - BLASWrapperPtr->stridedCopyToBlockConstantStride( - BVec, - totalNumWaveFunctions, - numLocalDofs, - jvec, - X + - numLocalDofs * totalNumWaveFunctions * - ((dftParams.spinPolarized + 1) * kPoint + spinIndex), - deviceFlattenedArrayXBlock.begin()); - - deviceFlattenedArrayXBlock.updateGhostValues(); - - (operatorMatrix.getOverloadedConstraintMatrix()) - ->distribute(deviceFlattenedArrayXBlock, BVec); - - - BLASWrapperPtr->stridedCopyToBlockConstantStride( - BVec, - totalNumWaveFunctions, - numLocalDofs, - jvec, - XPrime + - numLocalDofs * totalNumWaveFunctions * - ((dftParams.spinPolarized + 1) * kPoint + spinIndex), - deviceFlattenedArrayXPrimeBlock.begin()); - - 
deviceFlattenedArrayXPrimeBlock.updateGhostValues(); - - (operatorMatrix.getOverloadedConstraintMatrix()) - ->distribute(deviceFlattenedArrayXPrimeBlock, BVec); - - - for (int iblock = 0; iblock < (numCellBlocks + 1); iblock++) - { - const unsigned int currentCellsBlockSize = - (iblock == numCellBlocks) ? remCellBlockSize : - cellsBlockSize; - if (currentCellsBlockSize > 0) - { - const unsigned int startingCellId = - iblock * cellsBlockSize; - - - - BLASWrapperPtr->stridedCopyToBlock( - BVec, - currentCellsBlockSize * numNodesPerElement, - deviceFlattenedArrayXBlock.begin(), - cellWaveFunctionMatrix.begin(), - (operatorMatrix - .getFlattenedArrayCellLocalProcIndexIdMap()) - .begin() + - startingCellId * numNodesPerElement); - - NumberTypeLowPrec scalarCoeffAlpha = 1.0; - NumberTypeLowPrec scalarCoeffBeta = 0.0; - int strideA = BVec * numNodesPerElement; - int strideB = 0; - int strideC = BVec * numQuadPoints; - - - - BLASWrapperPtr->xgemmStridedBatched( - 'N', - 'N', - BVec, - numQuadPoints, - numNodesPerElement, - &scalarCoeffAlpha, - cellWaveFunctionMatrix.begin(), - BVec, - strideA, - shapeFunctionValuesTransposedDevice.begin(), - numNodesPerElement, - strideB, - &scalarCoeffBeta, - XQuadsDevice.begin(), - BVec, - strideC, - currentCellsBlockSize); - - - BLASWrapperPtr->stridedCopyToBlock( - BVec, - currentCellsBlockSize * numNodesPerElement, - deviceFlattenedArrayXPrimeBlock.begin(), - cellWaveFunctionMatrix.begin(), - (operatorMatrix - .getFlattenedArrayCellLocalProcIndexIdMap()) - .begin() + - startingCellId * numNodesPerElement); - - BLASWrapperPtr->xgemmStridedBatched( - 'N', - 'N', - BVec, - numQuadPoints, - numNodesPerElement, - &scalarCoeffAlpha, - cellWaveFunctionMatrix.begin(), - BVec, - strideA, - shapeFunctionValuesTransposedDevice.begin(), - numNodesPerElement, - strideB, - &scalarCoeffBeta, - XPrimeQuadsDevice.begin(), - BVec, - strideC, - currentCellsBlockSize); - - -#ifdef DFTFE_WITH_DEVICE_LANG_CUDA - computeRhoResponseFromInterpolatedValues<<< 
- (BVec + (dftfe::utils::DEVICE_BLOCK_SIZE - 1)) / - dftfe::utils::DEVICE_BLOCK_SIZE * - numQuadPoints * currentCellsBlockSize, - dftfe::utils::DEVICE_BLOCK_SIZE>>>( - BVec * numQuadPoints * currentCellsBlockSize, - dftfe::utils::makeDataTypeDeviceCompatible( - XQuadsDevice.begin()), - dftfe::utils::makeDataTypeDeviceCompatible( - XPrimeQuadsDevice.begin())); -#elif DFTFE_WITH_DEVICE_LANG_HIP - hipLaunchKernelGGL( - computeRhoResponseFromInterpolatedValues, - (BVec + (dftfe::utils::DEVICE_BLOCK_SIZE - 1)) / - dftfe::utils::DEVICE_BLOCK_SIZE * - numQuadPoints * currentCellsBlockSize, - dftfe::utils::DEVICE_BLOCK_SIZE, - 0, - 0, - BVec * numQuadPoints * currentCellsBlockSize, - dftfe::utils::makeDataTypeDeviceCompatible( - XQuadsDevice.begin()), - dftfe::utils::makeDataTypeDeviceCompatible( - XPrimeQuadsDevice.begin())); -#endif - - BLASWrapperPtr->xgemm( - 'N', - 'N', - 1, - currentCellsBlockSize * numQuadPoints, - BVec, - &scalarCoeffAlphaRho, - onesVecDevice.begin(), - 1, - XPrimeQuadsDevice.begin(), - BVec, - &scalarCoeffBetaRho, - rhoResponseContributionHamDevice.begin() + - startingCellId * numQuadPoints, - 1); - - BLASWrapperPtr->xgemm( - 'N', - 'N', - 1, - currentCellsBlockSize * numQuadPoints, - BVec, - &scalarCoeffAlphaRho, - densityMatDerFermiEnergyVecDevice.begin(), - 1, - XQuadsDevice.begin(), - BVec, - &scalarCoeffBetaRho, - rhoResponseContributionFermiEnergyDevice.begin() + - startingCellId * numQuadPoints, - 1); - - } // non-trivial cell block check - } // cells block loop - } // band parallelizatoin check - } // wave function block loop - - - // do memcopy to host - rhoResponseContributionHamDevice - .template copyTo( - rhoResponseContributionHamHost.begin(), - totalLocallyOwnedCells * numQuadPoints, - 0, - 0); - - rhoResponseContributionFermiEnergyDevice - .template copyTo( - rhoResponseContributionFermiEnergyHost.begin(), - totalLocallyOwnedCells * numQuadPoints, - 0, - 0); - - for (int icell = 0; icell < totalLocallyOwnedCells; icell++) - for 
(unsigned int iquad = 0; iquad < numQuadPoints; ++iquad) - { - rhoResponseValuesHamFlattenedHost[icell * numQuadPoints + - iquad] += - kPointWeights[kPoint] * spinPolarizedFactor * - dftfe::utils::realPart( - *(rhoResponseContributionHamHost.begin() + - icell * numQuadPoints + iquad)); - - rhoResponseValuesFermiEnergyFlattenedHost[icell * - numQuadPoints + - iquad] += - kPointWeights[kPoint] * spinPolarizedFactor * - dftfe::utils::realPart( - *(rhoResponseContributionFermiEnergyHost.begin() + - icell * numQuadPoints + iquad)); - } - - - if (dftParams.spinPolarized == 1) - { - for (int icell = 0; icell < totalLocallyOwnedCells; icell++) - for (unsigned int iquad = 0; iquad < numQuadPoints; ++iquad) - { - rhoResponseValuesSpinPolarizedHamFlattenedHost - [icell * numQuadPoints * 2 + iquad * 2 + spinIndex] += - kPointWeights[kPoint] * - dftfe::utils::realPart( - *(rhoResponseContributionHamHost.begin() + - icell * numQuadPoints + iquad)); - - rhoResponseValuesSpinPolarizedFermiEnergyFlattenedHost - [icell * numQuadPoints * 2 + iquad * 2 + spinIndex] += - kPointWeights[kPoint] * - dftfe::utils::realPart( - *(rhoResponseContributionFermiEnergyHost.begin() + - icell * numQuadPoints + iquad)); - } - } - - - } // kpoint loop - } // spin index loop - - // gather density from all inter communicators - if (dealii::Utilities::MPI::n_mpi_processes(interpoolcomm) > 1) - { - dealii::Utilities::MPI::sum(rhoResponseValuesHamFlattenedHost, - interpoolcomm, - rhoResponseValuesHamFlattenedHost); - - dealii::Utilities::MPI::sum(rhoResponseValuesFermiEnergyFlattenedHost, - interpoolcomm, - rhoResponseValuesFermiEnergyFlattenedHost); - - if (dftParams.spinPolarized == 1) - { - dealii::Utilities::MPI::sum( - rhoResponseValuesSpinPolarizedHamFlattenedHost, - interpoolcomm, - rhoResponseValuesSpinPolarizedHamFlattenedHost); - - dealii::Utilities::MPI::sum( - rhoResponseValuesSpinPolarizedFermiEnergyFlattenedHost, - interpoolcomm, - rhoResponseValuesSpinPolarizedFermiEnergyFlattenedHost); - } 
- } - - if (dealii::Utilities::MPI::n_mpi_processes(interBandGroupComm) > 1) - { - dealii::Utilities::MPI::sum(rhoResponseValuesHamFlattenedHost, - interBandGroupComm, - rhoResponseValuesHamFlattenedHost); - - dealii::Utilities::MPI::sum(rhoResponseValuesFermiEnergyFlattenedHost, - interBandGroupComm, - rhoResponseValuesFermiEnergyFlattenedHost); - - if (dftParams.spinPolarized == 1) - { - dealii::Utilities::MPI::sum( - rhoResponseValuesSpinPolarizedHamFlattenedHost, - interBandGroupComm, - rhoResponseValuesSpinPolarizedHamFlattenedHost); - - dealii::Utilities::MPI::sum( - rhoResponseValuesSpinPolarizedFermiEnergyFlattenedHost, - interBandGroupComm, - rhoResponseValuesSpinPolarizedFermiEnergyFlattenedHost); - } - } - - unsigned int iElem = 0; - typename dealii::DoFHandler<3>::active_cell_iterator cell = - dofHandler.begin_active(); - typename dealii::DoFHandler<3>::active_cell_iterator endc = - dofHandler.end(); - for (; cell != endc; ++cell) - if (cell->is_locally_owned()) - { - const dealii::CellId cellid = cell->id(); - - std::vector &temp1Quads = (rhoResponseValuesHam)[cellid]; - std::vector &temp2Quads = - (rhoResponseValuesFermiEnergy)[cellid]; - for (unsigned int q = 0; q < numQuadPoints; ++q) - { - temp1Quads[q] = - rhoResponseValuesHamFlattenedHost[iElem * numQuadPoints + q]; - temp2Quads[q] = - rhoResponseValuesFermiEnergyFlattenedHost[iElem * - numQuadPoints + - q]; - } - - if (dftParams.spinPolarized == 1) - { - std::vector &temp3Quads = - (rhoResponseValuesHamSpinPolarized)[cellid]; - - std::vector &temp4Quads = - (rhoResponseValuesFermiEnergySpinPolarized)[cellid]; - - for (unsigned int q = 0; q < numQuadPoints; ++q) - { - temp3Quads[2 * q + 0] = - rhoResponseValuesSpinPolarizedHamFlattenedHost - [iElem * numQuadPoints * 2 + 2 * q + 0]; - temp3Quads[2 * q + 1] = - rhoResponseValuesSpinPolarizedHamFlattenedHost - [iElem * numQuadPoints * 2 + 2 * q + 1]; - temp4Quads[2 * q + 0] = - rhoResponseValuesSpinPolarizedFermiEnergyFlattenedHost - [iElem * 
numQuadPoints * 2 + 2 * q + 0]; - temp4Quads[2 * q + 1] = - rhoResponseValuesSpinPolarizedFermiEnergyFlattenedHost - [iElem * numQuadPoints * 2 + 2 * q + 1]; - } - } - - iElem++; - } - - dftfe::utils::deviceSynchronize(); - MPI_Barrier(mpiCommParent); - device_time = MPI_Wtime() - device_time; - - if (this_process == 0 && dftParams.verbosity >= 2) - std::cout << "Time for compute rhoprime on Device: " << device_time - << std::endl; - } - - template void - computeRhoFirstOrderResponseDevice( - const dataTypes::number * X, - const dataTypes::number * XPrime, - const std::vector> & densityMatDerFermiEnergy, - const unsigned int totalNumWaveFunctions, - const unsigned int numLocalDofs, - operatorDFTDeviceClass & operatorMatrix, - const unsigned int matrixFreeDofhandlerIndex, - const dealii::DoFHandler<3> & dofHandler, - const unsigned int totalLocallyOwnedCells, - const unsigned int numNodesPerElement, - const unsigned int numQuadPoints, - const std::vector & kPointWeights, - std::map> &rhoResponseValuesHam, - std::map> &rhoResponseValuesFermiEnergy, - std::map> - &rhoResponseValuesHamSpinPolarized, - std::map> - & rhoResponseValuesFermiEnergySpinPolarized, - const MPI_Comm & mpiCommParent, - const MPI_Comm & interpoolcomm, - const MPI_Comm & interBandGroupComm, - const dftParameters &dftParams, - const std::shared_ptr< - dftfe::linearAlgebra::BLASWrapper> - BLASWrapperPtr); - - template void - computeRhoFirstOrderResponseDevice( - const dataTypes::number * X, - const dataTypes::number * XPrime, - const std::vector> & densityMatDerFermiEnergy, - const unsigned int totalNumWaveFunctions, - const unsigned int numLocalDofs, - operatorDFTDeviceClass & operatorMatrix, - const unsigned int matrixFreeDofhandlerIndex, - const dealii::DoFHandler<3> & dofHandler, - const unsigned int totalLocallyOwnedCells, - const unsigned int numNodesPerElement, - const unsigned int numQuadPoints, - const std::vector & kPointWeights, - std::map> &rhoResponseValuesHam, - std::map> 
&rhoResponseValuesFermiEnergy, - std::map> - &rhoResponseValuesHamSpinPolarized, - std::map> - & rhoResponseValuesFermiEnergySpinPolarized, - const MPI_Comm & mpiCommParent, - const MPI_Comm & interpoolcomm, - const MPI_Comm & interBandGroupComm, - const dftParameters &dftParams, - const std::shared_ptr< - dftfe::linearAlgebra::BLASWrapper> - BLASWrapperPtr); -} // namespace dftfe From 3acdc486f8538836311351f68d0f368f7f11a8b8 Mon Sep 17 00:00:00 2001 From: Sambit Das Date: Sat, 10 Feb 2024 18:05:38 -0500 Subject: [PATCH 08/24] wip, compilation issues --- include/densityFirstOrderResponseCalculator.h | 36 +-- ...mputeOutputDensityDirectionalDerivative.cc | 139 ++++++---- src/dft/density.cc | 3 +- .../densityFirstOrderResponseCalculator.cc | 247 ++++++++++-------- ...rstOrderResponseCalculatorDeviceKernels.cc | 55 ++-- src/dft/initBoundaryConditions.cc | 10 +- 6 files changed, 282 insertions(+), 208 deletions(-) diff --git a/include/densityFirstOrderResponseCalculator.h b/include/densityFirstOrderResponseCalculator.h index ab829be33..931690634 100644 --- a/include/densityFirstOrderResponseCalculator.h +++ b/include/densityFirstOrderResponseCalculator.h @@ -27,13 +27,13 @@ namespace dftfe { - template + template void computeRhoFirstOrderResponse( - const dftfe::utils::MemoryStorage & X, - const dftfe::utils::MemoryStorage & XPrime, - const unsigned int totalNumWaveFunctions, - const std::vector> & densityMatDerFermiEnergy, + const dftfe::utils::MemoryStorage &X, + const dftfe::utils::MemoryStorage &XPrime, + const unsigned int totalNumWaveFunctions, + const std::vector> &densityMatDerFermiEnergy, std::shared_ptr< dftfe::basis::FEBasisOperations> &basisOperationsPtr, @@ -41,9 +41,13 @@ namespace dftfe & BLASWrapperPtr, const unsigned int matrixFreeDofhandlerIndex, const unsigned int quadratureIndex, - const std::vector & kPointWeights, - std::vector> &rhoResponseValuesHam, - std::vector> &rhoResponseValuesFermiEnergy, + const std::vector &kPointWeights, + std::vector< 
+ dftfe::utils::MemoryStorage> + &rhoResponseValuesHam, + std::vector< + dftfe::utils::MemoryStorage> + & rhoResponseValuesFermiEnergy, const MPI_Comm & mpiCommParent, const MPI_Comm & interpoolcomm, const MPI_Comm & interBandGroupComm, @@ -66,10 +70,10 @@ namespace dftfe double * partialOccupVecPrime, NumberType * wfcQuadPointData, NumberType * wfcPrimeQuadPointData, - double * rhoResponseHamCellsWfcContributions, - double * rhoResponseFermiEnergyCellsWfcContributions, - double * rhoResponseHam, - double * rhoResponseFermiEnergy); + double *rhoResponseHamCellsWfcContributions, + double *rhoResponseFermiEnergyCellsWfcContributions, + double *rhoResponseHam, + double *rhoResponseFermiEnergy); #if defined(DFTFE_WITH_DEVICE) template @@ -89,10 +93,10 @@ namespace dftfe double * partialOccupVecPrime, NumberType * wfcQuadPointData, NumberType * wfcPrimeQuadPointData, - double * rhoResponseHamCellsWfcContributions, - double * rhoResponseFermiEnergyCellsWfcContributions, - double * rhoResponseHam, - double * rhoResponseFermiEnergy); + double *rhoResponseHamCellsWfcContributions, + double *rhoResponseFermiEnergyCellsWfcContributions, + double *rhoResponseHam, + double *rhoResponseFermiEnergy); #endif } // namespace dftfe diff --git a/src/dft/computeOutputDensityDirectionalDerivative.cc b/src/dft/computeOutputDensityDirectionalDerivative.cc index 46d295452..9c34272c3 100644 --- a/src/dft/computeOutputDensityDirectionalDerivative.cc +++ b/src/dft/computeOutputDensityDirectionalDerivative.cc @@ -430,9 +430,11 @@ namespace dftfe fvFermiEnergySpin1 = 0; } - std::vector> + std::vector< + dftfe::utils::MemoryStorage> rhoResponseHamPRefinedNodalData; - std::vector> + std::vector< + dftfe::utils::MemoryStorage> rhoResponseFermiEnergyPRefinedNodalData; @@ -451,7 +453,8 @@ namespace dftfe matrix_free_data.get_quadrature(d_gllQuadratureId); const unsigned int numQuadPoints = quadrature_formula.size(); - // get access to quadrature point coordinates and density DoFHandler nodal points 
+ // get access to quadrature point coordinates and density DoFHandler nodal + // points const std::vector> &quadraturePointCoor = quadrature_formula.get_points(); const std::vector> &supportPointNaturalCoor = @@ -477,17 +480,20 @@ namespace dftfe // allocate the storage to compute 2p nodal values from wavefunctions - rhoResponseHamPRefinedNodalData.resize(d_dftParamsPtr->spinPolarized == 1 ? 2 : - 1); - rhoResponseFermiEnergyPRefinedNodalData.resize(d_dftParamsPtr->spinPolarized == 1 ? 2 : - 1); + rhoResponseHamPRefinedNodalData.resize( + d_dftParamsPtr->spinPolarized == 1 ? 2 : 1); + rhoResponseFermiEnergyPRefinedNodalData.resize( + d_dftParamsPtr->spinPolarized == 1 ? 2 : 1); for (unsigned int iComp = 0; iComp < rhoResponseHamPRefinedNodalData.size(); - ++iComp) - { - rhoResponseHamPRefinedNodalData[iComp].resize(numLocallyOwnedCells*numQuadPoints,0); - rhoResponseFermiEnergyPRefinedNodalData[iComp].resize(numLocallyOwnedCells*numQuadPoints,0); - } + ++iComp) + { + rhoResponseHamPRefinedNodalData[iComp].resize(numLocallyOwnedCells * + numQuadPoints, + 0); + rhoResponseFermiEnergyPRefinedNodalData[iComp].resize( + numLocallyOwnedCells * numQuadPoints, 0); + } // compute first order density response at nodal locations of 2p @@ -495,40 +501,40 @@ namespace dftfe #ifdef DFTFE_WITH_DEVICE if (d_dftParamsPtr->useDevice) { - computeRhoFirstOrderResponse( - d_eigenVectorsFlattenedDevice, - d_eigenVectorsDensityMatrixPrimeDevice, - d_numEigenValues, - d_densityMatDerFermiEnergy, - d_BLASWrapperPtr, - d_densityDofHandlerIndex, - d_gllQuadratureId, - d_kPointWeights, - rhoResponseHamPRefinedNodalData, - rhoResponseFermiEnergyPRefinedNodalData, - d_mpiCommParent, - interpoolcomm, - interBandGroupComm, - *d_dftParamsPtr); + computeRhoFirstOrderResponse(d_eigenVectorsFlattenedDevice, + d_eigenVectorsDensityMatrixPrimeDevice, + d_numEigenValues, + d_densityMatDerFermiEnergy, + d_basisOperationsPtrDevice, + d_BLASWrapperPtr, + d_densityDofHandlerIndex, + d_gllQuadratureId, + 
d_kPointWeights, + rhoResponseHamPRefinedNodalData, + rhoResponseFermiEnergyPRefinedNodalData, + d_mpiCommParent, + interpoolcomm, + interBandGroupComm, + *d_dftParamsPtr); } #endif if (!d_dftParamsPtr->useDevice) { - computeRhoFirstOrderResponse( - d_eigenVectorsFlattenedHost, - d_eigenVectorsDensityMatrixPrimeHost, - d_numEigenValues, - d_densityMatDerFermiEnergy, - d_BLASWrapperPtr, - d_densityDofHandlerIndex, - d_gllQuadratureId, - d_kPointWeights, - rhoResponseHamPRefinedNodalData, - rhoResponseFermiEnergyPRefinedNodalData, - d_mpiCommParent, - interpoolcomm, - interBandGroupComm, - *d_dftParamsPtr); + computeRhoFirstOrderResponse(d_eigenVectorsFlattenedHost, + d_eigenVectorsDensityMatrixPrimeHost, + d_numEigenValues, + d_densityMatDerFermiEnergy, + d_basisOperationsPtrHost, + d_BLASWrapperPtr, + d_densityDofHandlerIndex, + d_gllQuadratureId, + d_kPointWeights, + rhoResponseHamPRefinedNodalData, + rhoResponseFermiEnergyPRefinedNodalData, + d_mpiCommParent, + interpoolcomm, + interBandGroupComm, + *d_dftParamsPtr); } // copy Lobatto quadrature data to fill in 2p DoFHandler nodal data @@ -544,12 +550,13 @@ namespace dftfe std::vector cell_dof_indices( dofs_per_cell); cellP->get_dof_indices(cell_dof_indices); - const double * nodalValuesResponseHam = - rhoResponseHamPRefinedNodalData[0].data() + iCell * dofs_per_cell; + const double *nodalValuesResponseHam = + rhoResponseHamPRefinedNodalData[0].data() + iCell * dofs_per_cell; - const double * nodalValuesResponseFermiEnergy = - rhoResponseFermiEnergyPRefinedNodalData[0].data() + iCell * dofs_per_cell; + const double *nodalValuesResponseFermiEnergy = + rhoResponseFermiEnergyPRefinedNodalData[0].data() + + iCell * dofs_per_cell; for (unsigned int iNode = 0; iNode < dofs_per_cell; ++iNode) { @@ -591,17 +598,21 @@ namespace dftfe std::vector cell_dof_indices( dofs_per_cell); cellP->get_dof_indices(cell_dof_indices); - const double * nodalValuesRhoTotResponseHam = - rhoResponseHamPRefinedNodalData[0].data() + iCell * 
dofs_per_cell; + const double *nodalValuesRhoTotResponseHam = + rhoResponseHamPRefinedNodalData[0].data() + + iCell * dofs_per_cell; - const double * nodalValuesRhoTotResponseFermiEnergy = - rhoResponseFermiEnergyPRefinedNodalData[0].data() + iCell * dofs_per_cell; + const double *nodalValuesRhoTotResponseFermiEnergy = + rhoResponseFermiEnergyPRefinedNodalData[0].data() + + iCell * dofs_per_cell; - const double * nodalValuesRhoMagResponseHam = - rhoResponseHamPRefinedNodalData[1].data() + iCell * dofs_per_cell; + const double *nodalValuesRhoMagResponseHam = + rhoResponseHamPRefinedNodalData[1].data() + + iCell * dofs_per_cell; - const double * nodalValuesRhoMagResponseFermiEnergy = - rhoResponseFermiEnergyPRefinedNodalData[1].data() + iCell * dofs_per_cell; + const double *nodalValuesRhoMagResponseFermiEnergy = + rhoResponseFermiEnergyPRefinedNodalData[1].data() + + iCell * dofs_per_cell; for (unsigned int iNode = 0; iNode < dofs_per_cell; ++iNode) @@ -613,13 +624,25 @@ namespace dftfe if (locallyOwnedDofs.is_element(nodeID)) { fvHamSpin0(nodeID) = - 0.5*(nodalValuesRhoTotResponseHam[renumberingMap[iNode]]+nodalValuesRhoMagResponseHam[renumberingMap[iNode]]); + 0.5 * (nodalValuesRhoTotResponseHam + [renumberingMap[iNode]] + + nodalValuesRhoMagResponseHam + [renumberingMap[iNode]]); fvHamSpin1(nodeID) = - 0.5*(nodalValuesRhoTotResponseHam[renumberingMap[iNode]]-nodalValuesRhoMagResponseHam[renumberingMap[iNode]]); + 0.5 * (nodalValuesRhoTotResponseHam + [renumberingMap[iNode]] - + nodalValuesRhoMagResponseHam + [renumberingMap[iNode]]); fvFermiEnergySpin0(nodeID) = - 0.5*(nodalValuesRhoTotResponseFermiEnergy[renumberingMap[iNode]]+nodalValuesRhoMagResponseFermiEnergy[renumberingMap[iNode]]); + 0.5 * (nodalValuesRhoTotResponseFermiEnergy + [renumberingMap[iNode]] + + nodalValuesRhoMagResponseFermiEnergy + [renumberingMap[iNode]]); fvFermiEnergySpin1(nodeID) = - 
0.5*(nodalValuesRhoTotResponseFermiEnergy[renumberingMap[iNode]]-nodalValuesRhoMagResponseFermiEnergy[renumberingMap[iNode]]); + 0.5 * (nodalValuesRhoTotResponseFermiEnergy + [renumberingMap[iNode]] - + nodalValuesRhoMagResponseFermiEnergy + [renumberingMap[iNode]]); } } } diff --git a/src/dft/density.cc b/src/dft/density.cc index 662d69504..f27b0a3cb 100644 --- a/src/dft/density.cc +++ b/src/dft/density.cc @@ -321,7 +321,8 @@ namespace dftfe matrix_free_data.get_quadrature(d_gllQuadratureId); const unsigned int numQuadPoints = quadrature_formula.size(); - // get access to quadrature point coordinates and density DoFHandler nodal points + // get access to quadrature point coordinates and density DoFHandler nodal + // points const std::vector> &quadraturePointCoor = quadrature_formula.get_points(); const std::vector> &supportPointNaturalCoor = diff --git a/src/dft/densityFirstOrderResponseCalculator.cc b/src/dft/densityFirstOrderResponseCalculator.cc index 635cc8de2..2c9452669 100644 --- a/src/dft/densityFirstOrderResponseCalculator.cc +++ b/src/dft/densityFirstOrderResponseCalculator.cc @@ -37,10 +37,10 @@ namespace dftfe template void computeRhoFirstOrderResponse( - const dftfe::utils::MemoryStorage & X, - const dftfe::utils::MemoryStorage & XPrime, - const unsigned int totalNumWaveFunctions, - const std::vector> & densityMatDerFermiEnergy, + const dftfe::utils::MemoryStorage &X, + const dftfe::utils::MemoryStorage &XPrime, + const unsigned int totalNumWaveFunctions, + const std::vector> &densityMatDerFermiEnergy, std::shared_ptr< dftfe::basis::FEBasisOperations> &basisOperationsPtr, @@ -48,9 +48,13 @@ namespace dftfe & BLASWrapperPtr, const unsigned int matrixFreeDofhandlerIndex, const unsigned int quadratureIndex, - const std::vector & kPointWeights, - std::vector> &rhoResponseValuesHam, - std::vector> &rhoResponseValuesFermiEnergy, + const std::vector &kPointWeights, + std::vector< + dftfe::utils::MemoryStorage> + &rhoResponseValuesHam, + std::vector< + 
dftfe::utils::MemoryStorage> + & rhoResponseValuesFermiEnergy, const MPI_Comm & mpiCommParent, const MPI_Comm & interpoolcomm, const MPI_Comm & interBandGroupComm, @@ -86,9 +90,9 @@ namespace dftfe const unsigned int numSpinComponents = (dftParams.spinPolarized == 1) ? 2 : 1; - const NumberType zero = 0; - const NumberType scalarCoeffAlphaRho = 1.0; - const NumberType scalarCoeffBetaRho = 1.0; + const NumberType zero = 0; + const NumberType scalarCoeffAlphaRho = 1.0; + const NumberType scalarCoeffBetaRho = 1.0; const unsigned int cellsBlockSize = memorySpace == dftfe::utils::MemorySpace::DEVICE ? 50 : 1; @@ -115,12 +119,16 @@ namespace dftfe dftfe::utils::MemoryStorage rhoResponseHam; dftfe::utils::MemoryStorage rhoResponseFermiEnergy; #else - auto &rhoResponseHam = rhoResponseHamHost; - auto &rhoResponseFermiEnergy = rhoResponseFermiEnergyHost; + auto &rhoResponseHam = rhoResponseHamHost; + auto &rhoResponseFermiEnergy = rhoResponseFermiEnergyHost; #endif - rhoResponseHam.resize(totalLocallyOwnedCells * numQuadPoints * numSpinComponents, 0.0); - rhoResponseFermiEnergy.resize(totalLocallyOwnedCells * numQuadPoints * numSpinComponents, 0.0); + rhoResponseHam.resize(totalLocallyOwnedCells * numQuadPoints * + numSpinComponents, + 0.0); + rhoResponseFermiEnergy.resize(totalLocallyOwnedCells * numQuadPoints * + numSpinComponents, + 0.0); for (unsigned int spinIndex = 0; spinIndex < numSpinComponents; ++spinIndex) { wfcQuadPointData[spinIndex].resize(cellsBlockSize * numQuadPoints * @@ -128,23 +136,21 @@ namespace dftfe zero); wfcPrimeQuadPointData[spinIndex].resize(cellsBlockSize * numQuadPoints * - BVec, - zero); + BVec, + zero); if (memorySpace == dftfe::utils::MemorySpace::DEVICE) - { - rhoResponseHamWfcContributions[spinIndex].resize(cellsBlockSize * numQuadPoints * - BVec, - 0.0); + { + rhoResponseHamWfcContributions[spinIndex].resize( + cellsBlockSize * numQuadPoints * BVec, 0.0); - rhoResponseFermiEnergyWfcContributions[spinIndex].resize(cellsBlockSize * 
numQuadPoints * - BVec, - 0.0); - } + rhoResponseFermiEnergyWfcContributions[spinIndex].resize( + cellsBlockSize * numQuadPoints * BVec, 0.0); + } } - dftfe::utils::MemoryStorage onesVec(BVec,1.0); + dftfe::utils::MemoryStorage onesVec(BVec, 1.0); std::vector< dftfe::utils::MemoryStorage> @@ -156,13 +162,14 @@ namespace dftfe std::vector> partialOccupPrimeVec(numSpinComponents); for (unsigned int spinIndex = 0; spinIndex < numSpinComponents; ++spinIndex) - partialOccupPrimeVec[spinIndex].resize(partialOccupPrimeVecHost[spinIndex].size()); + partialOccupPrimeVec[spinIndex].resize( + partialOccupPrimeVecHost[spinIndex].size()); #else - auto &partialOccupPrimeVec = partialOccupPrimeVecHost; + auto &partialOccupPrimeVec = partialOccupPrimeVecHost; #endif std::vector *> - flattenedArrayBlock(numSpinComponents*2); + flattenedArrayBlock(numSpinComponents * 2); for (unsigned int kPoint = 0; kPoint < kPointWeights.size(); ++kPoint) { @@ -181,8 +188,7 @@ namespace dftfe for (unsigned int icomp = 0; icomp < flattenedArrayBlock.size(); ++icomp) flattenedArrayBlock[icomp] = - &(basisOperationsPtr->getMultiVector(currentBlockSize, - icomp)); + &(basisOperationsPtr->getMultiVector(currentBlockSize, icomp)); if ((jvec + currentBlockSize) <= bandGroupLowHighPlusOneIndices[2 * bandGroupTaskId + 1] && @@ -191,16 +197,16 @@ namespace dftfe { for (unsigned int spinIndex = 0; spinIndex < numSpinComponents; ++spinIndex) - for (unsigned int iEigenVec = 0; - iEigenVec < currentBlockSize; - ++iEigenVec) - { - *(partialOccupPrimeVecHost[spinIndex].begin() + - iEigenVec) = - densityMatDerFermiEnergy[kPoint][totalNumWaveFunctions * - spinIndex + - jvec + iEigenVec]*kPointWeights[kPoint] * spinPolarizedFactor; - } + for (unsigned int iEigenVec = 0; iEigenVec < currentBlockSize; + ++iEigenVec) + { + *(partialOccupPrimeVecHost[spinIndex].begin() + + iEigenVec) = + densityMatDerFermiEnergy[kPoint][totalNumWaveFunctions * + spinIndex + + jvec + iEigenVec] * + kPointWeights[kPoint] * 
spinPolarizedFactor; + } #if defined(DFTFE_WITH_DEVICE) for (unsigned int spinIndex = 0; spinIndex < numSpinComponents; ++spinIndex) @@ -227,7 +233,7 @@ namespace dftfe numLocalDofs, jvec, X.data() + numLocalDofs * totalNumWaveFunctions * - (numSpinComponents * kPoint + spinIndex), + (numSpinComponents * kPoint + spinIndex), flattenedArrayBlock[spinIndex]->data()); #endif @@ -236,13 +242,15 @@ namespace dftfe ++spinIndex) if (memorySpace == dftfe::utils::MemorySpace::HOST) for (unsigned int iNode = 0; iNode < numLocalDofs; ++iNode) - std::memcpy(flattenedArrayBlock[numSpinComponents+spinIndex]->data() + - iNode * currentBlockSize, - XPrime.data() + - numLocalDofs * totalNumWaveFunctions * - (numSpinComponents * kPoint + spinIndex) + - iNode * totalNumWaveFunctions + jvec, - currentBlockSize * sizeof(NumberType)); + std::memcpy( + flattenedArrayBlock[numSpinComponents + spinIndex] + ->data() + + iNode * currentBlockSize, + XPrime.data() + + numLocalDofs * totalNumWaveFunctions * + (numSpinComponents * kPoint + spinIndex) + + iNode * totalNumWaveFunctions + jvec, + currentBlockSize * sizeof(NumberType)); #if defined(DFTFE_WITH_DEVICE) else if (memorySpace == dftfe::utils::MemorySpace::DEVICE) dftfe::utils::deviceKernelsGeneric:: @@ -251,9 +259,11 @@ namespace dftfe totalNumWaveFunctions, numLocalDofs, jvec, - XPrime.data() + numLocalDofs * totalNumWaveFunctions * - (numSpinComponents * kPoint + spinIndex), - flattenedArrayBlock[numSpinComponents+spinIndex]->data()); + XPrime.data() + + numLocalDofs * totalNumWaveFunctions * + (numSpinComponents * kPoint + spinIndex), + flattenedArrayBlock[numSpinComponents + spinIndex] + ->data()); #endif basisOperationsPtr->reinit(currentBlockSize, @@ -269,7 +279,8 @@ namespace dftfe basisOperationsPtr->distribute( *(flattenedArrayBlock[spinIndex])); - flattenedArrayBlock[numSpinComponents+spinIndex]->updateGhostValues(); + flattenedArrayBlock[numSpinComponents + spinIndex] + ->updateGhostValues(); basisOperationsPtr->distribute( 
*(flattenedArrayBlock[spinIndex])); } @@ -290,7 +301,7 @@ namespace dftfe basisOperationsPtr->interpolateKernel( *(flattenedArrayBlock[spinIndex]), wfcQuadPointData[spinIndex].data(), - NULL, + NULL, std::pair( startingCellId, startingCellId + currentCellsBlockSize)); @@ -299,9 +310,10 @@ namespace dftfe spinIndex < numSpinComponents; ++spinIndex) basisOperationsPtr->interpolateKernel( - *(flattenedArrayBlock[numSpinComponents+spinIndex]), + *(flattenedArrayBlock[numSpinComponents + + spinIndex]), wfcPrimeQuadPointData[spinIndex].data(), - NULL, + NULL, std::pair( startingCellId, startingCellId + currentCellsBlockSize)); @@ -323,12 +335,14 @@ namespace dftfe wfcQuadPointData[spinIndex].data(), wfcPrimeQuadPointData[spinIndex].data(), rhoResponseHamWfcContributions[spinIndex].data(), - rhoResponseFermiEnergyWfcContributions[spinIndex].data(), - rhoResponseHam.data() + spinIndex * totalLocallyOwnedCells * - numQuadPoints, - rhoResponseFermiEnergy.data() + spinIndex * - totalLocallyOwnedCells * - numQuadPoints); + rhoResponseFermiEnergyWfcContributions[spinIndex] + .data(), + rhoResponseHam.data() + spinIndex * + totalLocallyOwnedCells * + numQuadPoints, + rhoResponseFermiEnergy.data() + + spinIndex * totalLocallyOwnedCells * + numQuadPoints); } // non-trivial cell block check } // cells block loop } @@ -341,7 +355,7 @@ namespace dftfe rhoResponseFermiEnergyHost.resize(rhoResponseFermiEnergy.size()); - rhoResponseFermiEnergyHost.copyFrom(rhoResponseFermiEnergy); + rhoResponseFermiEnergyHost.copyFrom(rhoResponseFermiEnergy); #endif int size; @@ -363,7 +377,6 @@ namespace dftfe dataTypes::mpi_type_id(rhoResponseFermiEnergyHost.data()), MPI_SUM, interpoolcomm); - } MPI_Comm_size(interBandGroupComm, &size); if (size > 1) @@ -390,34 +403,43 @@ namespace dftfe rhoResponseValuesHam[0].resize(totalLocallyOwnedCells * numQuadPoints); rhoResponseValuesHam[1].resize(totalLocallyOwnedCells * numQuadPoints); std::transform(rhoResponseHamHost.begin(), - rhoResponseHamHost.begin() + 
totalLocallyOwnedCells * numQuadPoints, - rhoResponseHamHost.begin() + totalLocallyOwnedCells * numQuadPoints, + rhoResponseHamHost.begin() + + totalLocallyOwnedCells * numQuadPoints, + rhoResponseHamHost.begin() + + totalLocallyOwnedCells * numQuadPoints, rhoResponseValuesHam[0].begin(), std::plus<>{}); std::transform(rhoResponseHamHost.begin(), - rhoResponseHamHost.begin() + totalLocallyOwnedCells * numQuadPoints, - rhoResponseHamHost.begin() + totalLocallyOwnedCells * numQuadPoints, + rhoResponseHamHost.begin() + + totalLocallyOwnedCells * numQuadPoints, + rhoResponseHamHost.begin() + + totalLocallyOwnedCells * numQuadPoints, rhoResponseValuesHam[1].begin(), std::minus<>{}); - rhoResponseValuesFermiEnergy[0].resize(totalLocallyOwnedCells * numQuadPoints); - rhoResponseValuesFermiEnergy[1].resize(totalLocallyOwnedCells * numQuadPoints); + rhoResponseValuesFermiEnergy[0].resize(totalLocallyOwnedCells * + numQuadPoints); + rhoResponseValuesFermiEnergy[1].resize(totalLocallyOwnedCells * + numQuadPoints); std::transform(rhoResponseFermiEnergyHost.begin(), - rhoResponseFermiEnergyHost.begin() + totalLocallyOwnedCells * numQuadPoints, - rhoResponseFermiEnergyHost.begin() + totalLocallyOwnedCells * numQuadPoints, + rhoResponseFermiEnergyHost.begin() + + totalLocallyOwnedCells * numQuadPoints, + rhoResponseFermiEnergyHost.begin() + + totalLocallyOwnedCells * numQuadPoints, rhoResponseValuesFermiEnergy[0].begin(), std::plus<>{}); std::transform(rhoResponseFermiEnergyHost.begin(), - rhoResponseFermiEnergyHost.begin() + totalLocallyOwnedCells * numQuadPoints, - rhoResponseFermiEnergyHost.begin() + totalLocallyOwnedCells * numQuadPoints, + rhoResponseFermiEnergyHost.begin() + + totalLocallyOwnedCells * numQuadPoints, + rhoResponseFermiEnergyHost.begin() + + totalLocallyOwnedCells * numQuadPoints, rhoResponseValuesFermiEnergy[1].begin(), std::minus<>{}); - } else { - rhoResponseValuesHam[0] = rhoResponseHamHost; - rhoResponseValuesFermiEnergy[0] = rhoResponseFermiEnergyHost; 
+ rhoResponseValuesHam[0] = rhoResponseHamHost; + rhoResponseValuesFermiEnergy[0] = rhoResponseFermiEnergyHost; } #if defined(DFTFE_WITH_DEVICE) if (memorySpace == dftfe::utils::MemorySpace::DEVICE) @@ -450,10 +472,10 @@ namespace dftfe double * partialOccupVecPrime, NumberType * wfcQuadPointData, NumberType * wfcPrimeQuadPointData, - double * rhoResponseHamCellsWfcContributions, - double * rhoResponseFermiEnergyCellsWfcContributions, - double * rhoResponseHam, - double * rhoResponseFermiEnergy) + double *rhoResponseHamCellsWfcContributions, + double *rhoResponseFermiEnergyCellsWfcContributions, + double *rhoResponseHam, + double *rhoResponseFermiEnergy) { const unsigned int cellsBlockSize = cellRange.second - cellRange.first; const unsigned int vectorsBlockSize = vecRange.second - vecRange.first; @@ -471,33 +493,43 @@ namespace dftfe iQuad * vectorsBlockSize + iWave]; const NumberType psiPrime = wfcPrimeQuadPointData[(iCell - cellRange.first) * nQuadsPerCell * - vectorsBlockSize + - iQuad * vectorsBlockSize + iWave]; + vectorsBlockSize + + iQuad * vectorsBlockSize + iWave]; rhoResponseHam[iCell * nQuadsPerCell + iQuad] += - dftfe::utils::realPart(psi * dftfe::utils::complexConj(psiPrime)); + dftfe::utils::realPart(psi * dftfe::utils::complexConj(psiPrime)); rhoResponseFermiEnergy[iCell * nQuadsPerCell + iQuad] += - partialOccupVecPrime[iWave] * dftfe::utils::realPart(psi * dftfe::utils::complexConj(psi)); - + partialOccupVecPrime[iWave] * + dftfe::utils::realPart(psi * dftfe::utils::complexConj(psi)); } } #if defined(DFTFE_WITH_DEVICE) template void computeRhoFirstOrderResponse( - const dftfe::utils::MemoryStorage & X, - const dftfe::utils::MemoryStorage & XPrime, - const unsigned int totalNumWaveFunctions, - const std::vector> & densityMatDerFermiEnergy, + const dftfe::utils::MemoryStorage &X, + const dftfe::utils::MemoryStorage + & XPrime, + const unsigned int totalNumWaveFunctions, + const std::vector> &densityMatDerFermiEnergy, std::shared_ptr< - 
dftfe::basis::FEBasisOperations> + dftfe::basis::FEBasisOperations> &basisOperationsPtr, - std::shared_ptr> + std::shared_ptr< + dftfe::linearAlgebra::BLASWrapper> & BLASWrapperPtr, const unsigned int matrixFreeDofhandlerIndex, const unsigned int quadratureIndex, - const std::vector & kPointWeights, - std::vector> &rhoResponseValuesHam, - std::vector> &rhoResponseValuesFermiEnergy, + const std::vector &kPointWeights, + std::vector< + dftfe::utils::MemoryStorage> + &rhoResponseValuesHam, + std::vector< + dftfe::utils::MemoryStorage> + & rhoResponseValuesFermiEnergy, const MPI_Comm & mpiCommParent, const MPI_Comm & interpoolcomm, const MPI_Comm & interBandGroupComm, @@ -506,20 +538,29 @@ namespace dftfe template void computeRhoFirstOrderResponse( - const dftfe::utils::MemoryStorage & X, - const dftfe::utils::MemoryStorage & XPrime, - const unsigned int totalNumWaveFunctions, - const std::vector> & densityMatDerFermiEnergy, + const dftfe::utils::MemoryStorage &X, + const dftfe::utils::MemoryStorage &XPrime, + const unsigned int totalNumWaveFunctions, + const std::vector> &densityMatDerFermiEnergy, std::shared_ptr< - dftfe::basis::FEBasisOperations> + dftfe::basis::FEBasisOperations> &basisOperationsPtr, - std::shared_ptr> + std::shared_ptr< + dftfe::linearAlgebra::BLASWrapper> & BLASWrapperPtr, const unsigned int matrixFreeDofhandlerIndex, const unsigned int quadratureIndex, - const std::vector & kPointWeights, - std::vector> &rhoResponseValuesHam, - std::vector> &rhoResponseValuesFermiEnergy, + const std::vector &kPointWeights, + std::vector< + dftfe::utils::MemoryStorage> + &rhoResponseValuesHam, + std::vector< + dftfe::utils::MemoryStorage> + & rhoResponseValuesFermiEnergy, const MPI_Comm & mpiCommParent, const MPI_Comm & interpoolcomm, const MPI_Comm & interBandGroupComm, diff --git a/src/dft/densityFirstOrderResponseCalculatorDeviceKernels.cc b/src/dft/densityFirstOrderResponseCalculatorDeviceKernels.cc index 13f7f0fe9..82043d587 100644 --- 
a/src/dft/densityFirstOrderResponseCalculatorDeviceKernels.cc +++ b/src/dft/densityFirstOrderResponseCalculatorDeviceKernels.cc @@ -48,11 +48,10 @@ namespace dftfe for (unsigned int index = globalThreadId; index < numberEntries; index += blockDim.x * gridDim.x) { - const double psi = wfcContributions[index]; - const double psiPrime = wfcContributions[index]; + const double psi = wfcContributions[index]; + const double psiPrime = wfcContributions[index]; rhoResponseFermiEnergyCellsWfcContributions[index] = psi * psi; - rhoResponseHamCellsWfcContributions[index] = psi * psiPrime; - + rhoResponseHamCellsWfcContributions[index] = psi * psiPrime; } } @@ -64,7 +63,7 @@ namespace dftfe dftfe::utils::deviceDoubleComplex *wfcContributions, dftfe::utils::deviceDoubleComplex *wfcPrimeContributions, double * rhoResponseHamCellsWfcContributions, - double * rhoResponseFermiEnergyCellsWfcContributions) + double *rhoResponseFermiEnergyCellsWfcContributions) { const unsigned int globalThreadId = blockIdx.x * blockDim.x + threadIdx.x; const unsigned int numEntriesPerCell = numVectors * nQuadsPerCell; @@ -74,10 +73,12 @@ namespace dftfe index += blockDim.x * gridDim.x) { const dftfe::utils::deviceDoubleComplex psi = wfcContributions[index]; - const dftfe::utils::deviceDoubleComplex psiPrime = wfcPrimeContributions[index]; - rhoResponseFermiEnergyCellsWfcContributions[index] = psi.x * psi.x + psi.y * psi.y; - rhoResponseHamCellsWfcContributions[index] = psi.x * psiPrime.x + psi.y * psiPrime.y; - + const dftfe::utils::deviceDoubleComplex psiPrime = + wfcPrimeContributions[index]; + rhoResponseFermiEnergyCellsWfcContributions[index] = + psi.x * psi.x + psi.y * psi.y; + rhoResponseHamCellsWfcContributions[index] = + psi.x * psiPrime.x + psi.y * psiPrime.y; } } } // namespace @@ -98,17 +99,17 @@ namespace dftfe double * partialOccupPrimeVec, NumberType * wfcQuadPointData, NumberType * wfcPrimeQuadPointData, - double * rhoResponseHamCellsWfcContributions, - double * 
rhoResponseFermiEnergyCellsWfcContributions, - double * rhoResponseHam, - double * rhoResponseFermiEnergy) + double *rhoResponseHamCellsWfcContributions, + double *rhoResponseFermiEnergyCellsWfcContributions, + double *rhoResponseHam, + double *rhoResponseFermiEnergy) { const unsigned int cellsBlockSize = cellRange.second - cellRange.first; const unsigned int vectorsBlockSize = vecRange.second - vecRange.first; const unsigned int nQuadsPerCell = basisOperationsPtr->nQuadsPerCell(); const unsigned int nCells = basisOperationsPtr->nCells(); - const double scalarCoeffAlphaRho = 1.0; - const double scalarCoeffBetaRho = 1.0; + const double scalarCoeffAlphaRho = 1.0; + const double scalarCoeffBetaRho = 1.0; #ifdef DFTFE_WITH_DEVICE_LANG_CUDA computeRhoResponseFromInterpolatedValues<<< (vectorsBlockSize + (dftfe::utils::DEVICE_BLOCK_SIZE - 1)) / @@ -119,8 +120,10 @@ namespace dftfe nQuadsPerCell, dftfe::utils::makeDataTypeDeviceCompatible(wfcQuadPointData), dftfe::utils::makeDataTypeDeviceCompatible(wfcPrimeQuadPointData), - dftfe::utils::makeDataTypeDeviceCompatible(rhoResponseHamCellsWfcContributions), - dftfe::utils::makeDataTypeDeviceCompatible(rhoResponseFermiEnergyCellsWfcContributions)); + dftfe::utils::makeDataTypeDeviceCompatible( + rhoResponseHamCellsWfcContributions), + dftfe::utils::makeDataTypeDeviceCompatible( + rhoResponseFermiEnergyCellsWfcContributions)); #elif DFTFE_WITH_DEVICE_LANG_HIP hipLaunchKernelGGL( computeRhoResponseFromInterpolatedValues, @@ -134,8 +137,10 @@ namespace dftfe nQuadsPerCell, dftfe::utils::makeDataTypeDeviceCompatible(wfcQuadPointData), dftfe::utils::makeDataTypeDeviceCompatible(wfcPrimeQuadPointData), - dftfe::utils::makeDataTypeDeviceCompatible(rhoResponseHamCellsWfcContributions), - dftfe::utils::makeDataTypeDeviceCompatible(rhoResponseFermiEnergyCellsWfcContributions)); + dftfe::utils::makeDataTypeDeviceCompatible( + rhoResponseHamCellsWfcContributions), + dftfe::utils::makeDataTypeDeviceCompatible( + 
rhoResponseFermiEnergyCellsWfcContributions)); #endif BLASWrapperPtr->xgemv('T', vectorsBlockSize, @@ -158,9 +163,9 @@ namespace dftfe partialOccupPrimeVec, 1, &scalarCoeffBetaRho, - rhoResponseFermiEnergy + cellRange.first * nQuadsPerCell, + rhoResponseFermiEnergy + + cellRange.first * nQuadsPerCell, 1); - } template void computeRhoResponseFromInterpolatedValues( @@ -177,9 +182,9 @@ namespace dftfe double * partialOccupVec, dataTypes::number * wfcQuadPointData, dataTypes::number * wfcPrimeQuadPointData, - double * rhoResponseHamCellsWfcContributions, - double * rhoResponseFermiEnergyCellsWfcContributions, - double * rhoResponseHam, - double * rhoResponseFermiEnergy); + double *rhoResponseHamCellsWfcContributions, + double *rhoResponseFermiEnergyCellsWfcContributions, + double *rhoResponseHam, + double *rhoResponseFermiEnergy); } // namespace dftfe diff --git a/src/dft/initBoundaryConditions.cc b/src/dft/initBoundaryConditions.cc index f13327224..f7595175a 100644 --- a/src/dft/initBoundaryConditions.cc +++ b/src/dft/initBoundaryConditions.cc @@ -306,14 +306,14 @@ namespace dftfe bandGroupLowHighPlusOneIndices[1]); d_basisOperationsPtrHost->createScratchMultiVectors( - BVec, (d_dftParamsPtr->spinPolarized + 1)*2); + BVec, (d_dftParamsPtr->spinPolarized + 1) * 2); if (d_numEigenValues % BVec != 0) d_basisOperationsPtrHost->createScratchMultiVectors( - d_numEigenValues % BVec, (d_dftParamsPtr->spinPolarized + 1)*2); + d_numEigenValues % BVec, (d_dftParamsPtr->spinPolarized + 1) * 2); if (d_numEigenValues != d_numEigenValuesRR && d_numEigenValuesRR % BVec != 0) d_basisOperationsPtrHost->createScratchMultiVectors( - d_numEigenValuesRR % BVec, (d_dftParamsPtr->spinPolarized + 1)*2); + d_numEigenValuesRR % BVec, (d_dftParamsPtr->spinPolarized + 1) * 2); } #if defined(DFTFE_WITH_DEVICE) if (d_dftParamsPtr->useDevice && recomputeBasisData) @@ -326,7 +326,7 @@ namespace dftfe std::min(d_dftParamsPtr->chebyWfcBlockSize, d_numEigenValues); 
d_basisOperationsPtrDevice->createScratchMultiVectors( - BVec, (d_dftParamsPtr->spinPolarized + 1)*2); + BVec, (d_dftParamsPtr->spinPolarized + 1) * 2); d_basisOperationsPtrDevice->computeCellStiffnessMatrix( d_feOrderPlusOneQuadratureId, 50, true, false); } @@ -369,7 +369,7 @@ namespace dftfe std::min(d_dftParamsPtr->chebyWfcBlockSize, d_numEigenValues); d_basisOperationsPtrDevice->createScratchMultiVectors( - BVec, (d_dftParamsPtr->spinPolarized + 1)*2); + BVec, (d_dftParamsPtr->spinPolarized + 1) * 2); } #endif From bf3e4c7eebfa476bef1e1fc323a9b93385a00bf5 Mon Sep 17 00:00:00 2001 From: Sambit Das Date: Sat, 10 Feb 2024 21:00:32 -0500 Subject: [PATCH 09/24] compile fix --- CMakeLists.txt | 2 ++ include/FEBasisOperationsKernelsInternal.h | 7 ++++--- include/densityFirstOrderResponseCalculator.h | 2 +- setupDevelopPetsc.sh | 2 +- src/dft/computeOutputDensityDirectionalDerivative.cc | 4 ++-- utils/FEBasisOperationsKernelsInternalHost.cc | 6 +++++- 6 files changed, 15 insertions(+), 8 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 444fb9d24..076572c96 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -2,6 +2,8 @@ # CMakeList file for DFT-FE ## CMAKE_MINIMUM_REQUIRED(VERSION 3.17) +set(CMAKE_CXX_STANDARD 17) +set(CMAKE_CXX_STANDARD_REQUIRED ON) # Set the name of the project and target: SET(TARGET "dftfe_exe") SET(TARGETLIB "dftfe") diff --git a/include/FEBasisOperationsKernelsInternal.h b/include/FEBasisOperationsKernelsInternal.h index 9aff3c66d..604739241 100644 --- a/include/FEBasisOperationsKernelsInternal.h +++ b/include/FEBasisOperationsKernelsInternal.h @@ -17,13 +17,15 @@ #ifndef dftfeFEBasisOperationsKernelsInternal_h #define dftfeFEBasisOperationsKernelsInternal_h +#include +#include #ifdef DFTFE_WITH_DEVICE -# include # include # include # include # include -# include +#endif // DFTFE_WITH_DEVICE + namespace dftfe { namespace basis @@ -84,5 +86,4 @@ namespace dftfe } // namespace basis } // namespace dftfe -#endif // DFTFE_WITH_DEVICE 
#endif // dftfeFEBasisOperationsKernelsInternal_h diff --git a/include/densityFirstOrderResponseCalculator.h b/include/densityFirstOrderResponseCalculator.h index 931690634..8b36b12fe 100644 --- a/include/densityFirstOrderResponseCalculator.h +++ b/include/densityFirstOrderResponseCalculator.h @@ -67,7 +67,7 @@ namespace dftfe const std::pair cellRange, const std::pair vecRange, double * onesVec, - double * partialOccupVecPrime, + double * partialOccupPrimeVec, NumberType * wfcQuadPointData, NumberType * wfcPrimeQuadPointData, double *rhoResponseHamCellsWfcContributions, diff --git a/setupDevelopPetsc.sh b/setupDevelopPetsc.sh index f550abd6f..15102e820 100755 --- a/setupDevelopPetsc.sh +++ b/setupDevelopPetsc.sh @@ -48,7 +48,7 @@ withTorch=OFF withCustomizedDealii=ON #Compiler options and flags -cxx_compiler=/sw/pkgs/arc/stacks/gcc/10.3.0/openmpi/4.1.6rc2/bin/mpicxx #sets DCMAKE_CXX_COMPILER +cxx_compiler=/sw/pkgs/arc/stacks/gcc/10.3.0/openmpi/4.1.6/bin/mpicxx #sets DCMAKE_CXX_COMPILER cxx_flags="-std=c++17 -march=native -fopenmp -fPIC" #sets DCMAKE_CXX_FLAGS cxx_flagsRelease="-O2" #sets DCMAKE_CXX_FLAGS_RELEASE device_flags="-arch=sm_70" # set DCMAKE_CXX_CUDA_FLAGS diff --git a/src/dft/computeOutputDensityDirectionalDerivative.cc b/src/dft/computeOutputDensityDirectionalDerivative.cc index 9c34272c3..737f3b6f2 100644 --- a/src/dft/computeOutputDensityDirectionalDerivative.cc +++ b/src/dft/computeOutputDensityDirectionalDerivative.cc @@ -502,7 +502,7 @@ namespace dftfe if (d_dftParamsPtr->useDevice) { computeRhoFirstOrderResponse(d_eigenVectorsFlattenedDevice, - d_eigenVectorsDensityMatrixPrimeDevice, + d_eigenVectorsDensityMatrixPrimeFlattenedDevice, d_numEigenValues, d_densityMatDerFermiEnergy, d_basisOperationsPtrDevice, @@ -525,7 +525,7 @@ namespace dftfe d_numEigenValues, d_densityMatDerFermiEnergy, d_basisOperationsPtrHost, - d_BLASWrapperPtr, + d_BLASWrapperPtrHost, d_densityDofHandlerIndex, d_gllQuadratureId, d_kPointWeights, diff --git 
a/utils/FEBasisOperationsKernelsInternalHost.cc b/utils/FEBasisOperationsKernelsInternalHost.cc index d75ffdf79..19c3730fe 100644 --- a/utils/FEBasisOperationsKernelsInternalHost.cc +++ b/utils/FEBasisOperationsKernelsInternalHost.cc @@ -16,7 +16,11 @@ // #include - +#include +#include +#include +#include +#include namespace dftfe { From a90384b1af04a48c694d1901e11cf545f524141e Mon Sep 17 00:00:00 2001 From: Sambit Das Date: Mon, 12 Feb 2024 09:01:07 -0500 Subject: [PATCH 10/24] gpu compile fix --- src/dft/densityFirstOrderResponseCalculatorDeviceKernels.cc | 1 + 1 file changed, 1 insertion(+) diff --git a/src/dft/densityFirstOrderResponseCalculatorDeviceKernels.cc b/src/dft/densityFirstOrderResponseCalculatorDeviceKernels.cc index 82043d587..c5d3e9ced 100644 --- a/src/dft/densityFirstOrderResponseCalculatorDeviceKernels.cc +++ b/src/dft/densityFirstOrderResponseCalculatorDeviceKernels.cc @@ -179,6 +179,7 @@ namespace dftfe & BLASWrapperPtr, const std::pair cellRange, const std::pair vecRange, + double * onesVec, double * partialOccupVec, dataTypes::number * wfcQuadPointData, dataTypes::number * wfcPrimeQuadPointData, From b3cb93b84185a2e2cd600e3bc7eb0967182704f1 Mon Sep 17 00:00:00 2001 From: Sambit Das Date: Mon, 12 Feb 2024 11:06:00 -0500 Subject: [PATCH 11/24] bug fix, ctests still not passing --- src/dft/densityFirstOrderResponseCalculator.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/dft/densityFirstOrderResponseCalculator.cc b/src/dft/densityFirstOrderResponseCalculator.cc index 2c9452669..5166cc45d 100644 --- a/src/dft/densityFirstOrderResponseCalculator.cc +++ b/src/dft/densityFirstOrderResponseCalculator.cc @@ -282,7 +282,7 @@ namespace dftfe flattenedArrayBlock[numSpinComponents + spinIndex] ->updateGhostValues(); basisOperationsPtr->distribute( - *(flattenedArrayBlock[spinIndex])); + *(flattenedArrayBlock[numSpinComponents + spinIndex])); } for (int iblock = 0; iblock < (numCellBlocks + 1); iblock++) From 
db1c275f7591462ee99102bddef1000462292d82 Mon Sep 17 00:00:00 2001 From: Sambit Das Date: Mon, 12 Feb 2024 18:59:21 -0500 Subject: [PATCH 12/24] spin LRDM refactoring bug fix --- src/dft/densityFirstOrderResponseCalculator.cc | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/src/dft/densityFirstOrderResponseCalculator.cc b/src/dft/densityFirstOrderResponseCalculator.cc index 5166cc45d..d246a42f8 100644 --- a/src/dft/densityFirstOrderResponseCalculator.cc +++ b/src/dft/densityFirstOrderResponseCalculator.cc @@ -202,9 +202,8 @@ namespace dftfe { *(partialOccupPrimeVecHost[spinIndex].begin() + iEigenVec) = - densityMatDerFermiEnergy[kPoint][totalNumWaveFunctions * - spinIndex + - jvec + iEigenVec] * + densityMatDerFermiEnergy[numSpinComponents* kPoint + + spinIndex][jvec + iEigenVec] * kPointWeights[kPoint] * spinPolarizedFactor; } #if defined(DFTFE_WITH_DEVICE) From f05a07476bb6e1d2aa674cd0f0d4d0be51ca7bf2 Mon Sep 17 00:00:00 2001 From: Sambit Das Date: Mon, 12 Feb 2024 19:20:55 -0500 Subject: [PATCH 13/24] hip compilation fixes --- utils/BLASWrapperDevice.hip.cc | 137 +++++++++++++++++++++------------ 1 file changed, 87 insertions(+), 50 deletions(-) diff --git a/utils/BLASWrapperDevice.hip.cc b/utils/BLASWrapperDevice.hip.cc index d5a0fde54..8ffb81bce 100644 --- a/utils/BLASWrapperDevice.hip.cc +++ b/utils/BLASWrapperDevice.hip.cc @@ -28,10 +28,46 @@ #include "BLASWrapperDeviceKernels.cc" namespace dftfe { - namespace linearAlgebra - { - namespace + + namespace utils { + inline double + makeDataTypeHipBlasCompatible(double a) + { + return a; + } + + inline float + makeDataTypeHipBlasCompatible(float a) + { + return a; + } + + inline float * + makeDataTypeHipBlasCompatible(float *a) + { + return reinterpret_cast(a); + } + + inline const float * + makeDataTypeHipBlasCompatible(const float *a) + { + return reinterpret_cast(a); + } + + inline double * + makeDataTypeHipBlasCompatible(double *a) + { + return reinterpret_cast(a); + } + + inline const 
double * + makeDataTypeHipBlasCompatible(const double *a) + { + return reinterpret_cast(a); + } + + inline hipblasDoubleComplex makeDataTypeHipBlasCompatible(std::complex a) { @@ -67,9 +103,10 @@ namespace dftfe { return reinterpret_cast(a); } + } - } // namespace - + namespace linearAlgebra + { #ifdef DFTFE_WITH_DEVICE_AMD void BLASWrapper::initialize() @@ -190,13 +227,13 @@ namespace dftfe int(m), int(n), int(k), - makeDataTypeHipBlasCompatible(alpha), - makeDataTypeHipBlasCompatible(A), + dftfe::utils::makeDataTypeHipBlasCompatible(alpha), + dftfe::utils::makeDataTypeHipBlasCompatible(A), int(lda), - makeDataTypeHipBlasCompatible(B), + dftfe::utils::makeDataTypeHipBlasCompatible(B), int(ldb), - makeDataTypeHipBlasCompatible(beta), - makeDataTypeHipBlasCompatible(C), + dftfe::utils::makeDataTypeHipBlasCompatible(beta), + dftfe::utils::makeDataTypeHipBlasCompatible(C), int(ldc)); DEVICEBLAS_API_CHECK(status); } @@ -299,13 +336,13 @@ namespace dftfe int(m), int(n), int(k), - makeDataTypeHipBlasCompatible(alpha), - makeDataTypeHipBlasCompatible(A), + dftfe::utils::makeDataTypeHipBlasCompatible(alpha), + dftfe::utils::makeDataTypeHipBlasCompatible(A), int(lda), - makeDataTypeHipBlasCompatible(B), + dftfe::utils::makeDataTypeHipBlasCompatible(B), int(ldb), - makeDataTypeHipBlasCompatible(beta), - makeDataTypeHipBlasCompatible(C), + dftfe::utils::makeDataTypeHipBlasCompatible(beta), + dftfe::utils::makeDataTypeHipBlasCompatible(C), int(ldc)); DEVICEBLAS_API_CHECK(status); } @@ -333,7 +370,7 @@ namespace dftfe { // Assert Statement } - deviceBlasStatus_t status = hipblasDgemv(d_deviceBlasHandle, + dftfe::utils::deviceBlasStatus_t status = hipblasDgemv(d_deviceBlasHandle, transa, int(m), int(n), @@ -373,7 +410,7 @@ namespace dftfe // Assert Statement } - deviceBlasStatus_t status = hipblasSgemv(d_deviceBlasHandle, + dftfe::utils::deviceBlasStatus_t status = hipblasSgemv(d_deviceBlasHandle, transa, int(m), int(n), @@ -414,7 +451,7 @@ namespace dftfe // Assert Statement } 
- deviceBlasStatus_t status = + dftfe::utils::deviceBlasStatus_t status = hipblasZgemv(d_deviceBlasHandle, transa, int(m), @@ -456,7 +493,7 @@ namespace dftfe // Assert Statement } - deviceBlasStatus_t status = + dftfe::utils::deviceBlasStatus_t status = hipblasCgemv(d_deviceBlasHandle, transa, int(m), @@ -530,10 +567,10 @@ namespace dftfe dftfe::utils::deviceBlasStatus_t status = hipblasZaxpy(d_deviceBlasHandle, int(n), - makeDataTypeHipBlasCompatible(alpha), - makeDataTypeHipBlasCompatible(x), + dftfe::utils::makeDataTypeHipBlasCompatible(alpha), + dftfe::utils::makeDataTypeHipBlasCompatible(x), int(incx), - makeDataTypeHipBlasCompatible(y), + dftfe::utils::makeDataTypeHipBlasCompatible(y), int(incy)); DEVICEBLAS_API_CHECK(status); } @@ -607,11 +644,11 @@ namespace dftfe dftfe::utils::deviceBlasStatus_t status = hipblasZdotc(d_deviceBlasHandle, int(N), - makeDataTypeHipBlasCompatible(X), + dftfe::utils::makeDataTypeHipBlasCompatible(X), int(INCX), - makeDataTypeHipBlasCompatible(Y), + dftfe::utils::makeDataTypeHipBlasCompatible(Y), int(INCY), - makeDataTypeHipBlasCompatible(result)); + dftfe::utils::makeDataTypeHipBlasCompatible(result)); DEVICEBLAS_API_CHECK(status); } @@ -727,15 +764,15 @@ namespace dftfe int(m), int(n), int(k), - makeDataTypeHipBlasCompatible(alpha), - makeDataTypeHipBlasCompatible(A), + dftfe::utils::makeDataTypeHipBlasCompatible(alpha), + dftfe::utils::makeDataTypeHipBlasCompatible(A), int(lda), strideA, - makeDataTypeHipBlasCompatible(B), + dftfe::utils::makeDataTypeHipBlasCompatible(B), int(ldb), strideB, - makeDataTypeHipBlasCompatible(beta), - makeDataTypeHipBlasCompatible(C), + dftfe::utils::makeDataTypeHipBlasCompatible(beta), + dftfe::utils::makeDataTypeHipBlasCompatible(C), int(ldc), strideC, int(batchCount)); @@ -855,15 +892,15 @@ namespace dftfe int(m), int(n), int(k), - makeDataTypeHipBlasCompatible(alpha), - makeDataTypeHipBlasCompatible(A), + dftfe::utils::makeDataTypeHipBlasCompatible(alpha), + 
dftfe::utils::makeDataTypeHipBlasCompatible(A), int(lda), strideA, - makeDataTypeHipBlasCompatible(B), + dftfe::utils::makeDataTypeHipBlasCompatible(B), int(ldb), strideB, - makeDataTypeHipBlasCompatible(beta), - makeDataTypeHipBlasCompatible(C), + dftfe::utils::makeDataTypeHipBlasCompatible(beta), + dftfe::utils::makeDataTypeHipBlasCompatible(C), int(ldc), strideC, int(batchCount)); @@ -973,12 +1010,12 @@ namespace dftfe int(m), int(n), int(k), - makeDataTypeHipBlasCompatible(alpha), + dftfe::utils::makeDataTypeHipBlasCompatible(alpha), (const hipblasDoubleComplex **)A, int(lda), (const hipblasDoubleComplex **)B, int(ldb), - makeDataTypeHipBlasCompatible(beta), + dftfe::utils::makeDataTypeHipBlasCompatible(beta), (hipblasDoubleComplex **)C, int(ldc), int(batchCount)); @@ -1090,12 +1127,12 @@ namespace dftfe int(m), int(n), int(k), - makeDataTypeHipBlasCompatible(alpha), + dftfe::utils::makeDataTypeHipBlasCompatible(alpha), (const hipblasComplex **)A, int(lda), (const hipblasComplex **)B, int(ldb), - makeDataTypeHipBlasCompatible(beta), + dftfe::utils::makeDataTypeHipBlasCompatible(beta), (hipblasComplex **)C, int(ldc), int(batchCount)); @@ -1115,7 +1152,7 @@ namespace dftfe dftfe::utils::deviceBlasStatus_t status = hipblasDznrm2(d_deviceBlasHandle, int(n), - makeDataTypeHipBlasCompatible(x), + dftfe::utils::makeDataTypeHipBlasCompatible(x), int(incx), &localresult); localresult *= localresult; @@ -1210,8 +1247,8 @@ namespace dftfe dftfe::utils::DEVICE_BLOCK_SIZE, 0, d_streamId>>>(size, - makeDataTypeHipBlasCompatible(valueType1Arr), - makeDataTypeHipBlasCompatible(valueType2Arr)); + dftfe::utils::makeDataTypeDeviceCompatible(valueType1Arr), + dftfe::utils::makeDataTypeDeviceCompatible(valueType2Arr)); } template @@ -1232,8 +1269,8 @@ namespace dftfe 0, contiguousBlockSize, numContiguousBlocks, - makeDataTypeHipBlasCompatible(copyFromVec), - makeDataTypeHipBlasCompatible(copyToVecBlock), + dftfe::utils::makeDataTypeDeviceCompatible(copyFromVec), + 
dftfe::utils::makeDataTypeDeviceCompatible(copyToVecBlock), copyFromVecStartingContiguousBlockIds); } @@ -1266,8 +1303,8 @@ namespace dftfe 0, contiguousBlockSize, numContiguousBlocks, - makeDataTypeHipBlasCompatible(copyFromVecBlock), - makeDataTypeHipBlasCompatible(copyToVec), + dftfe::utils::makeDataTypeDeviceCompatible(copyFromVecBlock), + dftfe::utils::makeDataTypeDeviceCompatible(copyToVec), copyFromVecStartingContiguousBlockIds); } @@ -1293,8 +1330,8 @@ namespace dftfe blockSizeFrom, numBlocks, startingId, - makeDataTypeHipBlasCompatible(copyFromVec), - makeDataTypeHipBlasCompatible(copyToVec)); + dftfe::utils::makeDataTypeDeviceCompatible(copyFromVec), + dftfe::utils::makeDataTypeDeviceCompatible(copyToVec)); } template @@ -1322,8 +1359,8 @@ namespace dftfe numBlocks, startingToId, startingFromId, - makeDataTypeHipBlasCompatible(copyFromVec), - makeDataTypeHipBlasCompatible(copyToVec)); + dftfe::utils::makeDataTypeDeviceCompatible(copyFromVec), + dftfe::utils::makeDataTypeDeviceCompatible(copyToVec)); } From 892910c84b8cbea95c19aefa21a67f15fcc0c021 Mon Sep 17 00:00:00 2001 From: Sambit Das Date: Mon, 12 Feb 2024 20:39:18 -0500 Subject: [PATCH 14/24] hip compilation succesful --- utils/BLASWrapperDevice.hip.cc | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/utils/BLASWrapperDevice.hip.cc b/utils/BLASWrapperDevice.hip.cc index 8ffb81bce..c64713f56 100644 --- a/utils/BLASWrapperDevice.hip.cc +++ b/utils/BLASWrapperDevice.hip.cc @@ -1606,5 +1606,21 @@ namespace dftfe std::complex * copyToVec); + // axpyStridedBlockAtomicAdd + template void + BLASWrapper::axpyStridedBlockAtomicAdd( + const dftfe::size_type contiguousBlockSize, + const dftfe::size_type numContiguousBlocks, + const double * addFromVec, + double * addToVec, + const dftfe::global_size_type *addToVecStartingContiguousBlockIds) const; + + template void + BLASWrapper::axpyStridedBlockAtomicAdd( + const dftfe::size_type contiguousBlockSize, + const dftfe::size_type 
numContiguousBlocks, + const std::complex * addFromVec, + std::complex * addToVec, + const dftfe::global_size_type *addToVecStartingContiguousBlockIds) const; } // End of namespace linearAlgebra } // End of namespace dftfe From c6440bc3800748b10a59b702cb54b088a42b954f Mon Sep 17 00:00:00 2001 From: Sambit Das Date: Mon, 12 Feb 2024 20:57:07 -0500 Subject: [PATCH 15/24] cleanups and apply indentation standard --- doc/manual/installation.tex | 14 +- setupUser.sh | 6 +- setupUserPetsc.sh | 6 +- ...mputeOutputDensityDirectionalDerivative.cc | 31 +- .../densityFirstOrderResponseCalculator.cc | 4 +- utils/BLASWrapperDevice.hip.cc | 350 +++++++++--------- 6 files changed, 207 insertions(+), 204 deletions(-) diff --git a/doc/manual/installation.tex b/doc/manual/installation.tex index 060c67cc3..b84fc42fa 100644 --- a/doc/manual/installation.tex +++ b/doc/manual/installation.tex @@ -56,7 +56,7 @@ \subsubsection{Instructions for dependencies: ALGLIB, Libxc, spglib, Libxml2, Sc \end{verbatim} There might be errors complaining that it can not create regular file libxml2.py in /usr/lib/python2.7/site-packages, but that should not matter. -\item {\bf ScaLAPACK}: ScaLAPACK library is used by DFT-FE via ELPA for its parallel linear algebra routines involving dense matrices, as well being a dependency for ELPA. \textcolor{red}{\bf If Intel MKL math library is available, please skip this step, as the ScaLAPACK libraries therein can be used directly.} If Intel MKL math library is not available, Netlib ScaLAPACK \url{http://www.netlib.org/scalapack/} needs to be installed using the instructions below. Download the current release version (2.2.0) from \url{http://www.netlib.org/scalapack/#\_software}, and build a shared library (use \verb|BUILD_SHARED_LIBS=ON|, \verb|BUILD_STATIC_LIBS=OFF| and \verb|BUILD_TESTING=OFF|) installation of ScaLAPACK using cmake. We recommend using the ccmake gui interface for the installation. 
Further, use the appropriate compilers for \verb|CMAKE_C_COMPILER| and \verb|CMAKE_FORTRAN_COMPILER|, and also use \verb|-fPIC| flag for \verb|CMAKE_C_FLAGS| and \verb|CMAKE_Fortran_FLAGS|. For best performance, ScaLAPACK must be linked to optimized BLAS-LAPACK libraries by using\\ \verb|USE_OPTIMIZED_LAPACK_BLAS=ON|, and providing external paths to BLAS-LAPACK libraries (MKL, OpenBlas, ESSL etc.) during the cmake configuration. +\item {\bf ScaLAPACK}: ScaLAPACK library is used by DFT-FE via ELPA for its parallel linear algebra routines involving dense matrices, as well being a dependency for ELPA. \textcolor{red}{\bf If Intel MKL math library is available, please skip this step, as the ScaLAPACK libraries therein can be used directly.} If Intel MKL math library is not available, Netlib ScaLAPACK \url{http://www.netlib.org/scalapack/} needs to be installed using the instructions below. Download the current release version (2.2.0) from \url{http://www.netlib.org/scalapack/#\_software}, and build a shared library (use \verb|BUILD_SHARED_LIBS=ON|, \verb|BUILD_STATIC_LIBS=OFF| and \verb|BUILD_TESTING=OFF|) installation of ScaLAPACK using cmake. We recommend using the ccmake gui interface for the installation. Further, use the appropriate compilers for \verb|CMAKE_C_COMPILER| and \verb|CMAKE_FORTRAN_COMPILER|, and also use \verb|-fPIC| flag for \verb|CMAKE_C_FLAGS| and \verb|-fPIC -fallow-argument-mismatch| for \verb|CMAKE_Fortran_FLAGS|. For best performance, ScaLAPACK must be linked to optimized BLAS-LAPACK libraries by using\\ \verb|USE_OPTIMIZED_LAPACK_BLAS=ON|, and providing external paths to BLAS-LAPACK libraries (MKL, OpenBlas, ESSL etc.) during the cmake configuration. %Alternatively one can also use the python based installer~\url{http://www.netlib.org/scalapack/scalapack_installer.tgz} for Linux. \item {\bf ELPA}: ELPA library is used by DFT-FE for its parallel linear algebra routines involving dense matrices. 
ELPA requires the ScaLAPACK library (see above) as a dependency. Download the latest version elpa-2022.11.001 from \url{https://elpa.mpcdf.mpg.de/software/} and follow the installation instructions in there. Example of ELPA installation on UMICH Greatlakes supercomputer with GNU compiler, Open MPI library, and Intel MKL math library: @@ -171,7 +171,7 @@ \subsubsection{Instructions for deal.II} \end{verbatim} {\bf ``otherCmakeOptions'' include} the following options for CPU installation: \begin{verbatim} --DCMAKE_CXX_STANDARD=14 +-DCMAKE_CXX_STANDARD=17 -DCMAKE_C_COMPILER=c_compiler -DCMAKE_CXX_COMPILER=cxx_compiler -DCMAKE_Fortran_COMPILER=fortran_compiler @@ -189,6 +189,7 @@ \subsubsection{Instructions for deal.II} -DDEAL_II_WITH_TBB=OFF -DDEAL_II_WITH_TASKFLOW=OFF -DDEAL_II_COMPONENT_EXAMPLES=OFF +-DDEAL_II_FORCE_BUNDLED_BOOST=ON \end{verbatim} @@ -199,21 +200,22 @@ \subsubsection{Instructions for deal.II} $ mkdir build $ cd build $ cmake --DCMAKE_CXX_STANDARD=14 +-DCMAKE_CXX_STANDARD=17 -DCMAKE_C_COMPILER=gcc -DCMAKE_CXX_COMPILER=g++ -DCMAKE_Fortran_COMPILER=gfortran -DMPI_C_COMPILER=mpicc -DMPI_CXX_COMPILER=mpicxx -DMPI_Fortran_COMPILER=mpif90 --DCMAKE_CXX_FLAGS="-march=native -std=c++14" --DCMAKE_C_FLAGS="-march=native -std=c++14" +-DCMAKE_CXX_FLAGS="-march=native -std=c++17" +-DCMAKE_C_FLAGS="-march=native -std=c++17" -DDEAL_II_CXX_FLAGS_RELEASE="-O2" -DDEAL_II_COMPONENT_EXAMPLES=OFF -DDEAL_II_WITH_MPI=ON -DDEAL_II_WITH_64BIT_INDICES=ON -DDEAL_II_WITH_TBB=OFF -DDEAL_II_WITH_TASKFLOW=OFF +-DDEAL_II_FORCE_BUNDLED_BOOST=ON -DDEAL_II_WITH_P4EST=ON -DP4EST_DIR=p4est_install_path -DDEAL_II_WITH_LAPACK=ON -DLAPACK_DIR="${MKLROOT}/lib/intel64" @@ -252,7 +254,7 @@ \subsubsection{Instructions for simple-dftd3 and dftd4} $ cmake --build _build $ cmake --install _build \end{verbatim} - \item {\bf dftd4}: Used by \dftfe{} to provide dftd4 corrections to energy, force and stress. 
Download the current release (3.460 at the time of writing) of dftd4 from \url{https://github.com/dftd4/dftd4/releases}. After downloading and unpacking, use cmake to build the library. For example, using GNU compiler and Intel MKL for BLAS and LAPACK, do + \item {\bf dftd4}: Used by \dftfe{} to provide dftd4 corrections to energy, force and stress. Download the current release (3.6.0 at the time of writing) of dftd4 from \url{https://github.com/dftd4/dftd4/releases}. After downloading and unpacking, use cmake to build the library. For example, using GNU compiler and Intel MKL for BLAS and LAPACK, do \begin{verbatim} $ cd dftd4-3.6.0 $ mkdir build diff --git a/setupUser.sh b/setupUser.sh index 23a3d3339..c4c7edf79 100755 --- a/setupUser.sh +++ b/setupUser.sh @@ -80,7 +80,7 @@ out=`echo "$build_type" | tr '[:upper:]' '[:lower:]'` function cmake_configure() { if [ "$gpuLang" = "cuda" ]; then - cmake -DCMAKE_CXX_STANDARD=14 -DCMAKE_CXX_COMPILER=$cxx_compiler\ + cmake -DCMAKE_CXX_STANDARD=17 -DCMAKE_CXX_COMPILER=$cxx_compiler\ -DCMAKE_CXX_FLAGS="$cxx_flags"\ -DCMAKE_CXX_FLAGS_RELEASE="$cxx_flagsRelease" \ -DCMAKE_BUILD_TYPE=$build_type -DDEAL_II_DIR=$dealiiDir \ @@ -94,7 +94,7 @@ function cmake_configure() { -DWITH_TESTING=$testing -DMINIMAL_COMPILE=$minimal_compile\ -DHIGHERQUAD_PSP=$withHigherQuadPSP $1 elif [ "$gpuLang" = "hip" ]; then - cmake -DCMAKE_CXX_STANDARD=14 -DCMAKE_CXX_COMPILER=$cxx_compiler\ + cmake -DCMAKE_CXX_STANDARD=17 -DCMAKE_CXX_COMPILER=$cxx_compiler\ -DCMAKE_CXX_FLAGS="$cxx_flags"\ -DCMAKE_CXX_FLAGS_RELEASE="$cxx_flagsRelease" \ -DCMAKE_BUILD_TYPE=$build_type -DDEAL_II_DIR=$dealiiDir \ @@ -108,7 +108,7 @@ function cmake_configure() { -DWITH_TESTING=$testing -DMINIMAL_COMPILE=$minimal_compile\ -DHIGHERQUAD_PSP=$withHigherQuadPSP $1 else - cmake -DCMAKE_CXX_STANDARD=14 -DCMAKE_CXX_COMPILER=$cxx_compiler\ + cmake -DCMAKE_CXX_STANDARD=17 -DCMAKE_CXX_COMPILER=$cxx_compiler\ -DCMAKE_CXX_FLAGS="$cxx_flags"\ -DCMAKE_CXX_FLAGS_RELEASE="$cxx_flagsRelease" \ 
-DCMAKE_BUILD_TYPE=$build_type -DDEAL_II_DIR=$dealiiDir \ diff --git a/setupUserPetsc.sh b/setupUserPetsc.sh index da7048c66..9dbc9166d 100755 --- a/setupUserPetsc.sh +++ b/setupUserPetsc.sh @@ -82,7 +82,7 @@ out=`echo "$build_type" | tr '[:upper:]' '[:lower:]'` function cmake_configure() { mkdir -p complex && cd complex if [ "$gpuLang" = "cuda" ]; then - cmake -DCMAKE_CXX_STANDARD=14 -DCMAKE_CXX_COMPILER=$cxx_compiler\ + cmake -DCMAKE_CXX_STANDARD=17 -DCMAKE_CXX_COMPILER=$cxx_compiler\ -DCMAKE_CXX_FLAGS="$cxx_flags"\ -DCMAKE_CXX_FLAGS_RELEASE="$cxx_flagsRelease" \ -DCMAKE_BUILD_TYPE=$build_type -DDEAL_II_DIR=$dealiiDir \ @@ -96,7 +96,7 @@ function cmake_configure() { -DWITH_TESTING=$testing -DMINIMAL_COMPILE=$minimal_compile\ -DHIGHERQUAD_PSP=$withHigherQuadPSP $1 elif [ "$gpuLang" = "hip" ]; then - cmake -DCMAKE_CXX_STANDARD=14 -DCMAKE_CXX_COMPILER=$cxx_compiler\ + cmake -DCMAKE_CXX_STANDARD=17 -DCMAKE_CXX_COMPILER=$cxx_compiler\ -DCMAKE_CXX_FLAGS="$cxx_flags"\ -DCMAKE_CXX_FLAGS_RELEASE="$cxx_flagsRelease" \ -DCMAKE_BUILD_TYPE=$build_type -DDEAL_II_DIR=$dealiiDir \ @@ -110,7 +110,7 @@ function cmake_configure() { -DWITH_TESTING=$testing -DMINIMAL_COMPILE=$minimal_compile\ -DHIGHERQUAD_PSP=$withHigherQuadPSP $1 else - cmake -DCMAKE_CXX_STANDARD=14 -DCMAKE_CXX_COMPILER=$cxx_compiler\ + cmake -DCMAKE_CXX_STANDARD=17 -DCMAKE_CXX_COMPILER=$cxx_compiler\ -DCMAKE_CXX_FLAGS="$cxx_flags"\ -DCMAKE_CXX_FLAGS_RELEASE="$cxx_flagsRelease" \ -DCMAKE_BUILD_TYPE=$build_type -DDEAL_II_DIR=$dealiiDir \ diff --git a/src/dft/computeOutputDensityDirectionalDerivative.cc b/src/dft/computeOutputDensityDirectionalDerivative.cc index 737f3b6f2..6062a5a81 100644 --- a/src/dft/computeOutputDensityDirectionalDerivative.cc +++ b/src/dft/computeOutputDensityDirectionalDerivative.cc @@ -501,21 +501,22 @@ namespace dftfe #ifdef DFTFE_WITH_DEVICE if (d_dftParamsPtr->useDevice) { - computeRhoFirstOrderResponse(d_eigenVectorsFlattenedDevice, - d_eigenVectorsDensityMatrixPrimeFlattenedDevice, - 
d_numEigenValues, - d_densityMatDerFermiEnergy, - d_basisOperationsPtrDevice, - d_BLASWrapperPtr, - d_densityDofHandlerIndex, - d_gllQuadratureId, - d_kPointWeights, - rhoResponseHamPRefinedNodalData, - rhoResponseFermiEnergyPRefinedNodalData, - d_mpiCommParent, - interpoolcomm, - interBandGroupComm, - *d_dftParamsPtr); + computeRhoFirstOrderResponse( + d_eigenVectorsFlattenedDevice, + d_eigenVectorsDensityMatrixPrimeFlattenedDevice, + d_numEigenValues, + d_densityMatDerFermiEnergy, + d_basisOperationsPtrDevice, + d_BLASWrapperPtr, + d_densityDofHandlerIndex, + d_gllQuadratureId, + d_kPointWeights, + rhoResponseHamPRefinedNodalData, + rhoResponseFermiEnergyPRefinedNodalData, + d_mpiCommParent, + interpoolcomm, + interBandGroupComm, + *d_dftParamsPtr); } #endif if (!d_dftParamsPtr->useDevice) diff --git a/src/dft/densityFirstOrderResponseCalculator.cc b/src/dft/densityFirstOrderResponseCalculator.cc index d246a42f8..a57fe3795 100644 --- a/src/dft/densityFirstOrderResponseCalculator.cc +++ b/src/dft/densityFirstOrderResponseCalculator.cc @@ -202,8 +202,8 @@ namespace dftfe { *(partialOccupPrimeVecHost[spinIndex].begin() + iEigenVec) = - densityMatDerFermiEnergy[numSpinComponents* kPoint + - spinIndex][jvec + iEigenVec] * + densityMatDerFermiEnergy[numSpinComponents * kPoint + + spinIndex][jvec + iEigenVec] * kPointWeights[kPoint] * spinPolarizedFactor; } #if defined(DFTFE_WITH_DEVICE) diff --git a/utils/BLASWrapperDevice.hip.cc b/utils/BLASWrapperDevice.hip.cc index c64713f56..873d28386 100644 --- a/utils/BLASWrapperDevice.hip.cc +++ b/utils/BLASWrapperDevice.hip.cc @@ -28,82 +28,81 @@ #include "BLASWrapperDeviceKernels.cc" namespace dftfe { + namespace utils + { + inline double + makeDataTypeHipBlasCompatible(double a) + { + return a; + } + + inline float + makeDataTypeHipBlasCompatible(float a) + { + return a; + } + + inline float * + makeDataTypeHipBlasCompatible(float *a) + { + return reinterpret_cast(a); + } + + inline const float * + 
makeDataTypeHipBlasCompatible(const float *a) + { + return reinterpret_cast(a); + } + + inline double * + makeDataTypeHipBlasCompatible(double *a) + { + return reinterpret_cast(a); + } + + inline const double * + makeDataTypeHipBlasCompatible(const double *a) + { + return reinterpret_cast(a); + } + + + inline hipblasDoubleComplex + makeDataTypeHipBlasCompatible(std::complex a) + { + return hipblasDoubleComplex(a.real(), a.imag()); + } + + inline hipblasComplex + makeDataTypeHipBlasCompatible(std::complex a) + { + return hipblasComplex(a.real(), a.imag()); + } + + inline hipblasComplex * + makeDataTypeHipBlasCompatible(std::complex *a) + { + return reinterpret_cast(a); + } + + inline const hipblasComplex * + makeDataTypeHipBlasCompatible(const std::complex *a) + { + return reinterpret_cast(a); + } + + inline hipblasDoubleComplex * + makeDataTypeHipBlasCompatible(std::complex *a) + { + return reinterpret_cast(a); + } - namespace utils + inline const hipblasDoubleComplex * + makeDataTypeHipBlasCompatible(const std::complex *a) { - inline double - makeDataTypeHipBlasCompatible(double a) - { - return a; - } - - inline float - makeDataTypeHipBlasCompatible(float a) - { - return a; - } - - inline float * - makeDataTypeHipBlasCompatible(float *a) - { - return reinterpret_cast(a); - } - - inline const float * - makeDataTypeHipBlasCompatible(const float *a) - { - return reinterpret_cast(a); - } - - inline double * - makeDataTypeHipBlasCompatible(double *a) - { - return reinterpret_cast(a); - } - - inline const double * - makeDataTypeHipBlasCompatible(const double *a) - { - return reinterpret_cast(a); - } - - - inline hipblasDoubleComplex - makeDataTypeHipBlasCompatible(std::complex a) - { - return hipblasDoubleComplex(a.real(), a.imag()); - } - - inline hipblasComplex - makeDataTypeHipBlasCompatible(std::complex a) - { - return hipblasComplex(a.real(), a.imag()); - } - - inline hipblasComplex * - makeDataTypeHipBlasCompatible(std::complex *a) - { - return 
reinterpret_cast(a); - } - - inline const hipblasComplex * - makeDataTypeHipBlasCompatible(const std::complex *a) - { - return reinterpret_cast(a); - } - - inline hipblasDoubleComplex * - makeDataTypeHipBlasCompatible(std::complex *a) - { - return reinterpret_cast(a); - } - - inline const hipblasDoubleComplex * - makeDataTypeHipBlasCompatible(const std::complex *a) - { - return reinterpret_cast(a); - } + return reinterpret_cast(a); } + } // namespace utils namespace linearAlgebra { @@ -371,17 +370,17 @@ namespace dftfe // Assert Statement } dftfe::utils::deviceBlasStatus_t status = hipblasDgemv(d_deviceBlasHandle, - transa, - int(m), - int(n), - alpha, - A, - int(lda), - x, - int(incx), - beta, - y, - int(incy)); + transa, + int(m), + int(n), + alpha, + A, + int(lda), + x, + int(incx), + beta, + y, + int(incy)); DEVICEBLAS_API_CHECK(status); } @@ -411,17 +410,17 @@ namespace dftfe } dftfe::utils::deviceBlasStatus_t status = hipblasSgemv(d_deviceBlasHandle, - transa, - int(m), - int(n), - alpha, - A, - int(lda), - x, - int(incx), - beta, - y, - int(incy)); + transa, + int(m), + int(n), + alpha, + A, + int(lda), + x, + int(incx), + beta, + y, + int(incy)); DEVICEBLAS_API_CHECK(status); } @@ -757,25 +756,25 @@ namespace dftfe // Assert Statement } - dftfe::utils::deviceBlasStatus_t status = - hipblasZgemmStridedBatched(d_deviceBlasHandle, - transa, - transb, - int(m), - int(n), - int(k), - dftfe::utils::makeDataTypeHipBlasCompatible(alpha), - dftfe::utils::makeDataTypeHipBlasCompatible(A), - int(lda), - strideA, - dftfe::utils::makeDataTypeHipBlasCompatible(B), - int(ldb), - strideB, - dftfe::utils::makeDataTypeHipBlasCompatible(beta), - dftfe::utils::makeDataTypeHipBlasCompatible(C), - int(ldc), - strideC, - int(batchCount)); + dftfe::utils::deviceBlasStatus_t status = hipblasZgemmStridedBatched( + d_deviceBlasHandle, + transa, + transb, + int(m), + int(n), + int(k), + dftfe::utils::makeDataTypeHipBlasCompatible(alpha), + 
dftfe::utils::makeDataTypeHipBlasCompatible(A), + int(lda), + strideA, + dftfe::utils::makeDataTypeHipBlasCompatible(B), + int(ldb), + strideB, + dftfe::utils::makeDataTypeHipBlasCompatible(beta), + dftfe::utils::makeDataTypeHipBlasCompatible(C), + int(ldc), + strideC, + int(batchCount)); DEVICEBLAS_API_CHECK(status); } @@ -885,25 +884,25 @@ namespace dftfe // Assert Statement } - dftfe::utils::deviceBlasStatus_t status = - hipblasCgemmStridedBatched(d_deviceBlasHandle, - transa, - transb, - int(m), - int(n), - int(k), - dftfe::utils::makeDataTypeHipBlasCompatible(alpha), - dftfe::utils::makeDataTypeHipBlasCompatible(A), - int(lda), - strideA, - dftfe::utils::makeDataTypeHipBlasCompatible(B), - int(ldb), - strideB, - dftfe::utils::makeDataTypeHipBlasCompatible(beta), - dftfe::utils::makeDataTypeHipBlasCompatible(C), - int(ldc), - strideC, - int(batchCount)); + dftfe::utils::deviceBlasStatus_t status = hipblasCgemmStridedBatched( + d_deviceBlasHandle, + transa, + transb, + int(m), + int(n), + int(k), + dftfe::utils::makeDataTypeHipBlasCompatible(alpha), + dftfe::utils::makeDataTypeHipBlasCompatible(A), + int(lda), + strideA, + dftfe::utils::makeDataTypeHipBlasCompatible(B), + int(ldb), + strideB, + dftfe::utils::makeDataTypeHipBlasCompatible(beta), + dftfe::utils::makeDataTypeHipBlasCompatible(C), + int(ldc), + strideC, + int(batchCount)); DEVICEBLAS_API_CHECK(status); } void @@ -1248,7 +1247,8 @@ namespace dftfe 0, d_streamId>>>(size, dftfe::utils::makeDataTypeDeviceCompatible(valueType1Arr), - dftfe::utils::makeDataTypeDeviceCompatible(valueType2Arr)); + dftfe::utils::makeDataTypeDeviceCompatible( + valueType2Arr)); } template @@ -1260,18 +1260,19 @@ namespace dftfe ValueType2 * copyToVecBlock, const dftfe::global_size_type *copyFromVecStartingContiguousBlockIds) { - hipLaunchKernelGGL(stridedCopyToBlockDeviceKernel, - (contiguousBlockSize * numContiguousBlocks) / - dftfe::utils::DEVICE_BLOCK_SIZE + - 1, - dftfe::utils::DEVICE_BLOCK_SIZE, - 0, - 0, - 
contiguousBlockSize, - numContiguousBlocks, - dftfe::utils::makeDataTypeDeviceCompatible(copyFromVec), - dftfe::utils::makeDataTypeDeviceCompatible(copyToVecBlock), - copyFromVecStartingContiguousBlockIds); + hipLaunchKernelGGL( + stridedCopyToBlockDeviceKernel, + (contiguousBlockSize * numContiguousBlocks) / + dftfe::utils::DEVICE_BLOCK_SIZE + + 1, + dftfe::utils::DEVICE_BLOCK_SIZE, + 0, + 0, + contiguousBlockSize, + numContiguousBlocks, + dftfe::utils::makeDataTypeDeviceCompatible(copyFromVec), + dftfe::utils::makeDataTypeDeviceCompatible(copyToVecBlock), + copyFromVecStartingContiguousBlockIds); } void @@ -1303,7 +1304,8 @@ namespace dftfe 0, contiguousBlockSize, numContiguousBlocks, - dftfe::utils::makeDataTypeDeviceCompatible(copyFromVecBlock), + dftfe::utils::makeDataTypeDeviceCompatible( + copyFromVecBlock), dftfe::utils::makeDataTypeDeviceCompatible(copyToVec), copyFromVecStartingContiguousBlockIds); } @@ -1319,19 +1321,18 @@ namespace dftfe const ValueType1 * copyFromVec, ValueType2 * copyToVec) { - hipLaunchKernelGGL(stridedCopyToBlockConstantStrideDeviceKernel, - (blockSizeTo * numBlocks) / - dftfe::utils::DEVICE_BLOCK_SIZE + - 1, - dftfe::utils::DEVICE_BLOCK_SIZE, - 0, - 0, - blockSizeTo, - blockSizeFrom, - numBlocks, - startingId, - dftfe::utils::makeDataTypeDeviceCompatible(copyFromVec), - dftfe::utils::makeDataTypeDeviceCompatible(copyToVec)); + hipLaunchKernelGGL( + stridedCopyToBlockConstantStrideDeviceKernel, + (blockSizeTo * numBlocks) / dftfe::utils::DEVICE_BLOCK_SIZE + 1, + dftfe::utils::DEVICE_BLOCK_SIZE, + 0, + 0, + blockSizeTo, + blockSizeFrom, + numBlocks, + startingId, + dftfe::utils::makeDataTypeDeviceCompatible(copyFromVec), + dftfe::utils::makeDataTypeDeviceCompatible(copyToVec)); } template @@ -1346,21 +1347,20 @@ namespace dftfe const ValueType1 * copyFromVec, ValueType2 * copyToVec) { - hipLaunchKernelGGL(stridedCopyConstantStrideDeviceKernel, - (blockSize * numBlocks) / - dftfe::utils::DEVICE_BLOCK_SIZE + - 1, - 
dftfe::utils::DEVICE_BLOCK_SIZE, - 0, - 0, - blockSize, - strideTo, - strideFrom, - numBlocks, - startingToId, - startingFromId, - dftfe::utils::makeDataTypeDeviceCompatible(copyFromVec), - dftfe::utils::makeDataTypeDeviceCompatible(copyToVec)); + hipLaunchKernelGGL( + stridedCopyConstantStrideDeviceKernel, + (blockSize * numBlocks) / dftfe::utils::DEVICE_BLOCK_SIZE + 1, + dftfe::utils::DEVICE_BLOCK_SIZE, + 0, + 0, + blockSize, + strideTo, + strideFrom, + numBlocks, + startingToId, + startingFromId, + dftfe::utils::makeDataTypeDeviceCompatible(copyFromVec), + dftfe::utils::makeDataTypeDeviceCompatible(copyToVec)); } From 1dd551f686520b3b9456ea5153fd1680553e687a Mon Sep 17 00:00:00 2001 From: Nikhil Kodali Date: Tue, 13 Feb 2024 07:29:28 +0530 Subject: [PATCH 16/24] Bugfix --- src/dft/initBoundaryConditions.cc | 1 - utils/FEBasisOperations.cc | 1 + 2 files changed, 1 insertion(+), 1 deletion(-) diff --git a/src/dft/initBoundaryConditions.cc b/src/dft/initBoundaryConditions.cc index f7595175a..0d41f32af 100644 --- a/src/dft/initBoundaryConditions.cc +++ b/src/dft/initBoundaryConditions.cc @@ -332,7 +332,6 @@ namespace dftfe } else { - d_basisOperationsPtrDevice->clear(); dftfe::basis::UpdateFlags updateFlagsGradientsAndInvJacobians = dftfe::basis::update_inversejacobians | dftfe::basis::update_jxw | dftfe::basis::update_gradients; diff --git a/utils/FEBasisOperations.cc b/utils/FEBasisOperations.cc index 441038c6f..7242113d1 100644 --- a/utils/FEBasisOperations.cc +++ b/utils/FEBasisOperations.cc @@ -161,6 +161,7 @@ namespace dftfe initializeMPIPattern(); d_nQuadsPerCell.resize(d_quadratureIDsVector.size()); d_quadPoints = basisOperationsSrc.d_quadPoints; + initializeConstraints(); for (unsigned int iQuadIndex = 0; iQuadIndex < d_quadratureIDsVector.size(); ++iQuadIndex) From f5ae1869e338a0a26492393a133deab28d146e68 Mon Sep 17 00:00:00 2001 From: Nikhil Kodali Date: Tue, 13 Feb 2024 11:17:56 +0530 Subject: [PATCH 17/24] BugFix --- utils/FEBasisOperations.cc | 10 
+++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/utils/FEBasisOperations.cc b/utils/FEBasisOperations.cc index 7242113d1..c3e114b7a 100644 --- a/utils/FEBasisOperations.cc +++ b/utils/FEBasisOperations.cc @@ -1196,12 +1196,12 @@ namespace dftfe const auto jacobianFactorPtr = d_jacobianFactorHost.data() + iCell * nQuadsPerCell * 9 + iQuad * 9; - for (unsigned int kDim = 0; kDim < 3; ++kDim) - for (unsigned int jDim = 0; jDim < 3; ++jDim) - for (unsigned int iDim = 0; iDim < 3; ++iDim) + for (unsigned int jDim = 0; jDim < 3; ++jDim) + for (unsigned int iDim = 0; iDim < 3; ++iDim) + for (unsigned int kDim = 0; kDim < 3; ++kDim) jacobianFactorPtr[3 * jDim + iDim] += - inverseJacobianQuad[kDim][iDim] * - inverseJacobianQuad[kDim][jDim] * jxw; + inverseJacobianQuad[iDim][kDim] * + inverseJacobianQuad[jDim][kDim] * jxw; } ++iCell; } From 8c5bc8f96555baa771825fa0adb955841c61ee7d Mon Sep 17 00:00:00 2001 From: Nikhil Kodali Date: Tue, 13 Feb 2024 13:13:20 +0530 Subject: [PATCH 18/24] Fix stress compute --- src/dft/initBoundaryConditions.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/dft/initBoundaryConditions.cc b/src/dft/initBoundaryConditions.cc index 0d41f32af..932bb3e82 100644 --- a/src/dft/initBoundaryConditions.cc +++ b/src/dft/initBoundaryConditions.cc @@ -356,7 +356,7 @@ namespace dftfe d_densityDofHandlerIndex, quadratureIndices, updateFlags); - if (FEOrder != FEOrderElectro) + if (FEOrder == FEOrderElectro) d_basisOperationsPtrDevice->computeCellStiffnessMatrix( d_feOrderPlusOneQuadratureId, 50, true, false); } From 685acba358f8d3d4684675b13e61baa821fe6d70 Mon Sep 17 00:00:00 2001 From: Sambit Das Date: Tue, 13 Feb 2024 07:30:54 -0500 Subject: [PATCH 19/24] lrdm gpu bug fix --- ...rstOrderResponseCalculatorDeviceKernels.cc | 25 +++++++++---------- 1 file changed, 12 insertions(+), 13 deletions(-) diff --git a/src/dft/densityFirstOrderResponseCalculatorDeviceKernels.cc 
b/src/dft/densityFirstOrderResponseCalculatorDeviceKernels.cc index c5d3e9ced..13d008a81 100644 --- a/src/dft/densityFirstOrderResponseCalculatorDeviceKernels.cc +++ b/src/dft/densityFirstOrderResponseCalculatorDeviceKernels.cc @@ -36,8 +36,8 @@ namespace dftfe const unsigned int numVectors, const unsigned int numCells, const unsigned int nQuadsPerCell, - double * wfcContributions, - double * wfcPrimeContributions, + const double * wfc, + const double * wfcPrime, double * rhoResponseHamCellsWfcContributions, double * rhoResponseFermiEnergyCellsWfcContributions) { @@ -48,8 +48,8 @@ namespace dftfe for (unsigned int index = globalThreadId; index < numberEntries; index += blockDim.x * gridDim.x) { - const double psi = wfcContributions[index]; - const double psiPrime = wfcContributions[index]; + const double psi = wfc[index]; + const double psiPrime = wfcPrime[index]; rhoResponseFermiEnergyCellsWfcContributions[index] = psi * psi; rhoResponseHamCellsWfcContributions[index] = psi * psiPrime; } @@ -57,12 +57,12 @@ namespace dftfe __global__ void computeRhoResponseFromInterpolatedValues( - const unsigned int numVectors, - const unsigned int numCells, - const unsigned int nQuadsPerCell, - dftfe::utils::deviceDoubleComplex *wfcContributions, - dftfe::utils::deviceDoubleComplex *wfcPrimeContributions, - double * rhoResponseHamCellsWfcContributions, + const unsigned int numVectors, + const unsigned int numCells, + const unsigned int nQuadsPerCell, + const dftfe::utils::deviceDoubleComplex *wfc, + const dftfe::utils::deviceDoubleComplex *wfcPrime, + double *rhoResponseHamCellsWfcContributions, double *rhoResponseFermiEnergyCellsWfcContributions) { const unsigned int globalThreadId = blockIdx.x * blockDim.x + threadIdx.x; @@ -72,9 +72,8 @@ namespace dftfe for (unsigned int index = globalThreadId; index < numberEntries; index += blockDim.x * gridDim.x) { - const dftfe::utils::deviceDoubleComplex psi = wfcContributions[index]; - const dftfe::utils::deviceDoubleComplex psiPrime = 
- wfcPrimeContributions[index]; + const dftfe::utils::deviceDoubleComplex psi = wfc[index]; + const dftfe::utils::deviceDoubleComplex psiPrime = wfcPrime[index]; rhoResponseFermiEnergyCellsWfcContributions[index] = psi.x * psi.x + psi.y * psi.y; rhoResponseHamCellsWfcContributions[index] = From 72eeed0744eebef53d2c2aea322266a39d2ed546 Mon Sep 17 00:00:00 2001 From: Sambit Das Date: Tue, 13 Feb 2024 08:02:10 -0500 Subject: [PATCH 20/24] update frontier job script --- .../frontierJobScript6GCDs6MPITasks.rc | 23 ++++++++----------- 1 file changed, 9 insertions(+), 14 deletions(-) diff --git a/testsGPU/pseudopotential/real/jobscripts/frontierJobScript6GCDs6MPITasks.rc b/testsGPU/pseudopotential/real/jobscripts/frontierJobScript6GCDs6MPITasks.rc index 9c051b5e5..cef08dc05 100644 --- a/testsGPU/pseudopotential/real/jobscripts/frontierJobScript6GCDs6MPITasks.rc +++ b/testsGPU/pseudopotential/real/jobscripts/frontierJobScript6GCDs6MPITasks.rc @@ -23,17 +23,12 @@ LD_LIBRARY_PATH=$CRAY_LD_LIBRARY_PATH:$LD_LIBRARY_PATH BASE = $WD/src/dftfeDebug/build/release/real n=`{echo $SLURM_JOB_NUM_NODES '*' 8 | bc} -srun -n 18 -c 7 --gpu-bind closest $BASE/dftfe Input_MD_0.prm > output_MD_0 -srun -n 18 -c 7 --gpu-bind closest $BASE/dftfe Input_MD_1.prm > output_MD_1 -srun -n 18 -c 7 --gpu-bind closest $BASE/dftfe Input_MD_2.prm > output_MD_2 -srun -n 18 -c 7 --gpu-bind closest $BASE/dftfe parameterFileMg2x_1.prm > outputMg2x_1 -srun -n 18 -c 7 --gpu-bind closest $BASE/dftfe parameterFileMg2x_1_spingpu.prm > outputMg2x_1_spin_gpu -srun -n 18 -c 7 --gpu-bind closest $BASE/dftfe parameterFileMg2x_2.prm > outputMg2x_2 -srun -n 18 -c 7 --gpu-bind closest $BASE/dftfe parameterFileMg2x_3.prm > outputMg2x_3 -srun -n 18 -c 7 --gpu-bind closest $BASE/dftfe parameterFileMg2x_4.prm > outputMg2x_4 -srun -n 18 -c 7 --gpu-bind closest $BASE/dftfe parameterFileMg2x_5.prm > outputMg2x_5 -srun -n 18 -c 7 --gpu-bind closest $BASE/dftfe parameterFileMg2x_6.prm > outputMg2x_6 -srun -n 18 -c 7 --gpu-bind 
closest $BASE/dftfe parameterFileMg2x_7.prm > outputMg2x_7 -srun -n 18 -c 7 --gpu-bind closest $BASE/dftfe parameterFileMg2x_12.prm > outputMg2x_12 -srun -n 18 -c 7 --gpu-bind closest $BASE/dftfe parameterFileMg2x_13.prm > outputMg2x_13 -srun -n 18 -c 7 --gpu-bind closest $BASE/dftfe parameterFileBe.prm > outputBe +srun -n 6 -c 7 --gpu-bind closest $BASE/dftfe parameterFileN2_1.prm > outputN2_1 +srun -n 6 -c 7 --gpu-bind closest $BASE/dftfe ./dftfe parameterFileN2_2.prm > outputN2_2 +srun -n 6 -c 7 --gpu-bind closest $BASE/dftfe ./dftfe parameterFileN2_3.prm > outputN2_3 +srun -n 6 -c 7 --gpu-bind closest $BASE/dftfe ./dftfe parameterFileN2_4.prm > outputN2_4 +srun -n 6 -c 7 --gpu-bind closest $BASE/dftfe ./dftfe parameterFileMg2x_8.prm > outputMg2x_8 +srun -n 6 -c 7 --gpu-bind closest $BASE/dftfe ./dftfe parameterFileMg2x_9.prm > outputMg2x_9 +srun -n 6 -c 7 --gpu-bind closest $BASE/dftfe ./dftfe parameterFileMg2x_10.prm > outputMg2x_10 +srun -n 6 -c 7 --gpu-bind closest $BASE/dftfe ./dftfe parameterFileMg2x_11.prm > outputMg2x_11 +srun -n 6 -c 7 --gpu-bind closest $BASE/dftfe --mpi=pmi2 ./dftfe parameterFileMg2x_14.prm > outputMg2x_14 From 8a7d4b17b6bf4b3127403ca156763aeb7885eaca Mon Sep 17 00:00:00 2001 From: Sambit Das Date: Tue, 13 Feb 2024 08:06:37 -0500 Subject: [PATCH 21/24] update frontier job script --- .../frontierJobScript6GCDs6MPITasks.rc | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/testsGPU/pseudopotential/real/jobscripts/frontierJobScript6GCDs6MPITasks.rc b/testsGPU/pseudopotential/real/jobscripts/frontierJobScript6GCDs6MPITasks.rc index cef08dc05..33d1e53b4 100644 --- a/testsGPU/pseudopotential/real/jobscripts/frontierJobScript6GCDs6MPITasks.rc +++ b/testsGPU/pseudopotential/real/jobscripts/frontierJobScript6GCDs6MPITasks.rc @@ -24,11 +24,11 @@ BASE = $WD/src/dftfeDebug/build/release/real n=`{echo $SLURM_JOB_NUM_NODES '*' 8 | bc} srun -n 6 -c 7 --gpu-bind closest $BASE/dftfe parameterFileN2_1.prm > outputN2_1 
-srun -n 6 -c 7 --gpu-bind closest $BASE/dftfe ./dftfe parameterFileN2_2.prm > outputN2_2 -srun -n 6 -c 7 --gpu-bind closest $BASE/dftfe ./dftfe parameterFileN2_3.prm > outputN2_3 -srun -n 6 -c 7 --gpu-bind closest $BASE/dftfe ./dftfe parameterFileN2_4.prm > outputN2_4 -srun -n 6 -c 7 --gpu-bind closest $BASE/dftfe ./dftfe parameterFileMg2x_8.prm > outputMg2x_8 -srun -n 6 -c 7 --gpu-bind closest $BASE/dftfe ./dftfe parameterFileMg2x_9.prm > outputMg2x_9 -srun -n 6 -c 7 --gpu-bind closest $BASE/dftfe ./dftfe parameterFileMg2x_10.prm > outputMg2x_10 -srun -n 6 -c 7 --gpu-bind closest $BASE/dftfe ./dftfe parameterFileMg2x_11.prm > outputMg2x_11 -srun -n 6 -c 7 --gpu-bind closest $BASE/dftfe --mpi=pmi2 ./dftfe parameterFileMg2x_14.prm > outputMg2x_14 +srun -n 6 -c 7 --gpu-bind closest $BASE/dftfe parameterFileN2_2.prm > outputN2_2 +srun -n 6 -c 7 --gpu-bind closest $BASE/dftfe parameterFileN2_3.prm > outputN2_3 +srun -n 6 -c 7 --gpu-bind closest $BASE/dftfe parameterFileN2_4.prm > outputN2_4 +srun -n 6 -c 7 --gpu-bind closest $BASE/dftfe parameterFileMg2x_8.prm > outputMg2x_8 +srun -n 6 -c 7 --gpu-bind closest $BASE/dftfe parameterFileMg2x_9.prm > outputMg2x_9 +srun -n 6 -c 7 --gpu-bind closest $BASE/dftfe parameterFileMg2x_10.prm > outputMg2x_10 +srun -n 6 -c 7 --gpu-bind closest $BASE/dftfe parameterFileMg2x_11.prm > outputMg2x_11 +srun -n 6 -c 7 --gpu-bind closest $BASE/dftfe parameterFileMg2x_14.prm > outputMg2x_14 From 53dae6b3b2897c972eb9449f40b52fc0d83e261b Mon Sep 17 00:00:00 2001 From: Sambit Das Date: Tue, 13 Feb 2024 12:57:39 -0500 Subject: [PATCH 22/24] lrdm bug fix, all gpu ctests passed --- src/dft/densityFirstOrderResponseCalculator.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/dft/densityFirstOrderResponseCalculator.cc b/src/dft/densityFirstOrderResponseCalculator.cc index a57fe3795..726f7126e 100644 --- a/src/dft/densityFirstOrderResponseCalculator.cc +++ b/src/dft/densityFirstOrderResponseCalculator.cc @@ -150,7 +150,7 @@ 
namespace dftfe } - dftfe::utils::MemoryStorage onesVec(BVec, 1.0); + dftfe::utils::MemoryStorage onesVec(BVec, spinPolarizedFactor); std::vector< dftfe::utils::MemoryStorage> From ec58e9f0edc5961d32eaff2d745cc195faeec8a4 Mon Sep 17 00:00:00 2001 From: Sambit Das Date: Tue, 13 Feb 2024 13:37:15 -0500 Subject: [PATCH 23/24] apply indentation standard --- src/dft/densityFirstOrderResponseCalculator.cc | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/dft/densityFirstOrderResponseCalculator.cc b/src/dft/densityFirstOrderResponseCalculator.cc index 726f7126e..8dad13e0d 100644 --- a/src/dft/densityFirstOrderResponseCalculator.cc +++ b/src/dft/densityFirstOrderResponseCalculator.cc @@ -150,7 +150,8 @@ namespace dftfe } - dftfe::utils::MemoryStorage onesVec(BVec, spinPolarizedFactor); + dftfe::utils::MemoryStorage onesVec( + BVec, spinPolarizedFactor); std::vector< dftfe::utils::MemoryStorage> From 46be03ac1878077809aae75d101d97310c0aa579 Mon Sep 17 00:00:00 2001 From: Nikhil Kodali Date: Wed, 14 Feb 2024 00:44:23 +0530 Subject: [PATCH 24/24] Update sparsity pattern --- src/dft/dft.cc | 3 +-- .../real/nitrogenMolecule_02.mpirun=10.output | 22 +++++++++---------- 2 files changed, 12 insertions(+), 13 deletions(-) diff --git a/src/dft/dft.cc b/src/dft/dft.cc index 7b1c7dff4..ca2d24a03 100644 --- a/src/dft/dft.cc +++ b/src/dft/dft.cc @@ -1442,8 +1442,7 @@ namespace dftfe MPI_Barrier(d_mpiCommParent); init_pseudo = MPI_Wtime(); - initPseudoPotentialAll(d_dftParamsPtr->floatingNuclearCharges ? 
true : - false); + initPseudoPotentialAll(); MPI_Barrier(d_mpiCommParent); init_pseudo = MPI_Wtime() - init_pseudo; diff --git a/tests/dft/pseudopotential/real/nitrogenMolecule_02.mpirun=10.output b/tests/dft/pseudopotential/real/nitrogenMolecule_02.mpirun=10.output index 601398edb..c95041f96 100644 --- a/tests/dft/pseudopotential/real/nitrogenMolecule_02.mpirun=10.output +++ b/tests/dft/pseudopotential/real/nitrogenMolecule_02.mpirun=10.output @@ -63,12 +63,12 @@ SCF iterations converged to the specified tolerance after: 9 iterations. Energy computations (Hartree) ------------------- - Total energy: -19.90165650 + Total energy: -19.90165611 Absolute values of ion forces (Hartree/Bohr) -------------------------------------------------------------------------------------------- -AtomId 0: 0.043017,0.000000,0.000000 -AtomId 1: 0.043019,0.000000,0.000000 +AtomId 0: 0.043016,0.000000,0.000000 +AtomId 1: 0.043018,0.000000,0.000000 -------------------------------------------------------------------------------------------- Finite element mesh information @@ -89,12 +89,12 @@ SCF iterations converged to the specified tolerance after: 8 iterations. Energy computations (Hartree) ------------------- - Total energy: -19.90269941 + Total energy: -19.90269981 Absolute values of ion forces (Hartree/Bohr) -------------------------------------------------------------------------------------------- -AtomId 0: 0.080319,0.000000,0.000000 -AtomId 1: 0.080319,0.000000,0.000000 +AtomId 0: 0.080316,0.000000,0.000000 +AtomId 1: 0.080317,0.000000,0.000000 -------------------------------------------------------------------------------------------- Finite element mesh information @@ -115,12 +115,12 @@ SCF iterations converged to the specified tolerance after: 9 iterations. 
Energy computations (Hartree) ------------------- - Total energy: -19.89838187 + Total energy: -19.89838181 Absolute values of ion forces (Hartree/Bohr) -------------------------------------------------------------------------------------------- -AtomId 0: 0.088664,0.000000,0.000000 -AtomId 1: 0.088664,0.000000,0.000000 +AtomId 0: 0.088662,0.000000,0.000000 +AtomId 1: 0.088663,0.000000,0.000000 -------------------------------------------------------------------------------------------- Finite element mesh information @@ -141,10 +141,10 @@ SCF iterations converged to the specified tolerance after: 8 iterations. Energy computations (Hartree) ------------------- - Total energy: -19.90290962 + Total energy: -19.90290922 Absolute values of ion forces (Hartree/Bohr) -------------------------------------------------------------------------------------------- -AtomId 0: 0.009384,0.000000,0.000000 +AtomId 0: 0.009383,0.000000,0.000000 AtomId 1: 0.009383,0.000000,0.000000 --------------------------------------------------------------------------------------------