diff --git a/CMakeLists.txt b/CMakeLists.txt index 8e26cd7a8..93dd0db76 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -47,7 +47,7 @@ SET(TARGET_SRC ./src/dft/dft.cc ./src/dft/vselfBinsManager.cc ./src/dft/energyCalculator.cc - ./src/dft/densityCalculatorCPU.cc + ./src/dft/densityCalculator.cc ./src/dft/densityFirstOrderResponseCalculatorCPU.cc ./src/excManager/excDensityBaseClass.cpp ./src/excManager/excDensityLDAClass.cpp @@ -166,7 +166,7 @@ SET(DEVICE_SRC ./utils/MemoryTransferKernelsDevice.cc ./utils/DeviceKernelsGeneric.cc ./utils/DeviceDirectCCLWrapper.cc - ./src/dft/densityCalculatorDevice.cc + ./src/dft/densityCalculatorDeviceKernels.cc ./src/dft/densityFirstOrderResponseCalculatorDevice.cc ./src/dftOperator/operatorDevice.cc ./src/dftOperator/kohnShamDFTOperatorDevice.cc @@ -185,6 +185,7 @@ SET(DEVICE_SRC ./src/solvers/linearSolverProblemDevice.cc ./src/poisson/poissonSolverProblemDevice.cc ./src/helmholtz/kerkerSolverProblemDevice.cc + ./utils/FEBasisOperationsKernelsDevice.cc ) ELSEIF ("${GPU_LANG}" STREQUAL "hip") @@ -193,7 +194,7 @@ SET(DEVICE_SRC ./utils/MemoryTransferKernelsDevice.cc ./utils/DeviceKernelsGeneric.cc ./utils/DeviceDirectCCLWrapper.cc - ./src/dft/densityCalculatorDevice.cc + ./src/dft/densityCalculatorDeviceKernels.cc ./src/dft/densityFirstOrderResponseCalculatorDevice.cc ./src/dftOperator/operatorDevice.cc ./src/dftOperator/kohnShamDFTOperatorDevice.cc @@ -212,6 +213,7 @@ SET(DEVICE_SRC ./src/solvers/linearSolverProblemDevice.cc ./src/poisson/poissonSolverProblemDevice.cc ./src/helmholtz/kerkerSolverProblemDevice.cc + ./utils/FEBasisOperationsKernelsDevice.cc ) ENDIF() @@ -222,7 +224,11 @@ IF (WITH_GPU) set_source_files_properties(${DEVICE_SRC} PROPERTIES LANGUAGE CUDA) ELSEIF ("${GPU_LANG}" STREQUAL "hip") set_source_files_properties(${DEVICE_SRC} PROPERTIES LANGUAGE HIP) - ADD_DEFINITIONS(-D__HIP_PLATFORM_AMD__) + IF ("${GPU_VENDOR}" STREQUAL "amd") + ADD_DEFINITIONS(-D__HIP_PLATFORM_AMD__) + ELSEIF ("${GPU_VENDOR}" STREQUAL 
"nvidia") + ADD_DEFINITIONS(-D__HIP_PLATFORM_NVIDIA__) + ENDIF() ENDIF() ENDIF() IF (WITH_GPU) diff --git a/include/DeviceBlasWrapper.h b/include/DeviceBlasWrapper.h index 0331c7e45..4925b51f2 100644 --- a/include/DeviceBlasWrapper.h +++ b/include/DeviceBlasWrapper.h @@ -260,6 +260,62 @@ namespace dftfe long long int strideC, int batchCount); + deviceBlasStatus_t + gemv(deviceBlasHandle_t handle, + deviceBlasOperation_t trans, + int m, + int n, + const double * alpha, + const double * A, + int lda, + const double * x, + int incx, + const double * beta, + double * y, + int incy); + + deviceBlasStatus_t + gemv(deviceBlasHandle_t handle, + deviceBlasOperation_t trans, + int m, + int n, + const float * alpha, + const float * A, + int lda, + const float * x, + int incx, + const float * beta, + float * y, + int incy); + + deviceBlasStatus_t + gemv(deviceBlasHandle_t handle, + deviceBlasOperation_t trans, + int m, + int n, + const std::complex *alpha, + const std::complex *A, + int lda, + const std::complex *x, + int incx, + const std::complex *beta, + std::complex * y, + int incy); + + deviceBlasStatus_t + gemv(deviceBlasHandle_t handle, + deviceBlasOperation_t trans, + int m, + int n, + const std::complex *alpha, + const std::complex *A, + int lda, + const std::complex *x, + int incx, + const std::complex *beta, + std::complex * y, + int incy); + } // namespace deviceBlasWrapper } // namespace utils diff --git a/include/FEBasisOperations.h b/include/FEBasisOperations.h new file mode 100644 index 000000000..62021d158 --- /dev/null +++ b/include/FEBasisOperations.h @@ -0,0 +1,1019 @@ +// --------------------------------------------------------------------- +// +// Copyright (c) 2017-2022 The Regents of the University of Michigan and DFT-FE +// authors. +// +// This file is part of the DFT-FE code. 
+// +// The DFT-FE code is free software; you can use it, redistribute +// it, and/or modify it under the terms of the GNU Lesser General +// Public License as published by the Free Software Foundation; either +// version 2.1 of the License, or (at your option) any later version. +// The full text of the license can be found in the file LICENSE at +// the top level of the DFT-FE distribution. +// +// --------------------------------------------------------------------- +// + +#ifndef dftfeFEBasisOperations_h +#define dftfeFEBasisOperations_h + +#include +#include +#include +#include +#include + +namespace dftfe +{ + namespace basis + { + enum UpdateFlags + { + update_default = 0, + + update_values = 0x0001, + + update_gradients = 0x0002, + + update_transpose = 0x0004 + }; + + inline UpdateFlags + operator|(const UpdateFlags f1, const UpdateFlags f2) + { + return static_cast(static_cast(f1) | + static_cast(f2)); + } + + + + inline UpdateFlags & + operator|=(UpdateFlags &f1, const UpdateFlags f2) + { + f1 = f1 | f2; + return f1; + } + + + inline UpdateFlags operator&(const UpdateFlags f1, const UpdateFlags f2) + { + return static_cast(static_cast(f1) & + static_cast(f2)); + } + + + inline UpdateFlags & + operator&=(UpdateFlags &f1, const UpdateFlags f2) + { + f1 = f1 & f2; + return f1; + } + + + template + class FEBasisOperationsBase + { + protected: + mutable dftfe::utils::MemoryStorage + tempCellNodalData, tempQuadratureGradientsData, + tempQuadratureGradientsDataNonAffine; + + public: + /** + * @brief Constructor, fills required data structures using deal.ii's MatrixFree and AffineConstraints objects + * @param[in] matrixFreeData MatrixFree object. + * @param[in] constraintsVector std::vector of AffineConstraints, should + * be the same vector which was passed for the construction of the given + * MatrixFree object. 
+ */ + FEBasisOperationsBase( + dealii::MatrixFree<3, ValueTypeBasisData> &matrixFreeData, + std::vector *> + &constraintsVector); + + /** + * @brief Default Destructor + */ + ~FEBasisOperationsBase() = default; + + /** + * @brief fills required data structures for the given dofHandlerID + * @param[in] dofHandlerID dofHandler index to be used for getting data + * from the MatrixFree object. + * @param[in] quadratureID std::vector of quadratureIDs to be used, should + * be the same IDs which were used during the construction of the given + * MatrixFree object. + */ + void + init(const unsigned int & dofHandlerID, + const std::vector &quadratureID, + const UpdateFlags updateFlags = update_values); + + /** + * @brief sets internal variables and optionally resizes internal temp storage for interpolation operations + * @param[in] vecBlockSize block size to be used for operations on vectors, + * this has to be set to the exact value before any such operations are + * called. + * @param[in] cellBlockSize block size to be used for cells, this has to be + * set to a value greater than or equal to the required value before any + * such operations are called. + * @param[in] quadratureID Quadrature index to be used. + * @param[in] isResizeTempStorage whether to resize internal temp storage. + */ + void + reinit(const unsigned int &vecBlockSize, + const unsigned int &cellBlockSize, + const unsigned int &quadratureID, + const bool isResizeTempStorage = true); + + // private: +#if defined(DFTFE_WITH_DEVICE) + using constraintInfoClass = + typename std::conditional::type; +#else + using constraintInfoClass = dftUtils::constraintMatrixInfo; +#endif + + + + /** + * @brief Initializes indexset maps from process level indices to cell level indices for a single vector, also initializes cell index to cellid map. + */ + void + initializeIndexMaps(); + + /** + * @brief Initializes indexset maps from process level indices to cell level indices for multivectors.
+ */ + void + initializeFlattenedIndexMaps(); + + /** + * @brief Initializes the constraintMatrixInfo object. + */ + void + initializeConstraints(); + + /** + * @brief Constructs the MPIPatternP2P object. + */ + void + initializeMPIPattern(); + + /** + * @brief Fill the shape function data and jacobian data in the ValueTypeBasisCoeff datatype. + */ + void + initializeShapeFunctionAndJacobianData(); + + /** + * @brief Fill the shape function data and jacobian data in the ValueTypeBasisData datatype. + */ + void + initializeShapeFunctionAndJacobianBasisData(); + + /** + * @brief Resizes the internal temp storage to be sufficient for the vector and cell block sizes provided in reinit. + */ + void + resizeTempStorage(); + + /** + * @brief Number of quadrature points per cell for the quadratureID set in reinit. + */ + unsigned int + nQuadsPerCell() const; + + /** + * @brief Number of DoFs per cell for the dofHandlerID set in init. + */ + unsigned int + nDofsPerCell() const; + + /** + * @brief Number of locally owned cells on the current processor. + */ + unsigned int + nCells() const; + + /** + * @brief Number of DoFs on the current processor, locally owned + ghosts. + */ + unsigned int + nRelaventDofs() const; + + /** + * @brief Number of locally owned DoFs on the current processor. + */ + unsigned int + nOwnedDofs() const; + + /** + * @brief Shape function values at quadrature points. + * @param[in] transpose if false the data is indexed as [iQuad * + * d_nDofsPerCell + iNode] and if true it is indexed as [iNode * + * d_nQuadsPerCell + iQuad]. + */ + const dftfe::utils::MemoryStorage & + shapeFunctionData(bool transpose = false) const; + + /** + * @brief Shape function gradient values at quadrature points. + * @param[in] transpose if false the data is indexed as [iDim * + * d_nQuadsPerCell * d_nDofsPerCell + iQuad * d_nDofsPerCell + iNode] and + * if true it is indexed as [iDim * d_nQuadsPerCell * d_nDofsPerCell + + * iNode * d_nQuadsPerCell + iQuad].
+ */ + const dftfe::utils::MemoryStorage & + shapeFunctionGradientData(bool transpose = false) const; + + /** + * @brief Inverse Jacobian matrices, for cartesian cells returns the + * diagonal elements of the inverse Jacobian matrices for each cell, for + * affine cells returns the 3x3 inverse Jacobians for each cell otherwise + * returns the 3x3 inverse Jacobians at each quad point for each cell. + */ + const dftfe::utils::MemoryStorage & + inverseJacobians() const; + + /** + * @brief determinant of Jacobian times the quadrature weight at each + * quad point for each cell. + */ + const dftfe::utils::MemoryStorage & + JxW() const; + + /** + * @brief Shape function values at quadrature points in ValueTypeBasisData. + * @param[in] transpose if false the data is indexed as [iQuad * + * d_nDofsPerCell + iNode] and if true it is indexed as [iNode * + * d_nQuadsPerCell + iQuad]. + */ + template ::value, int> = 0> + const dftfe::utils::MemoryStorage & + shapeFunctionBasisData(bool transpose = false) const; + template ::value, int> = 0> + const dftfe::utils::MemoryStorage & + shapeFunctionBasisData(bool transpose = false) const; + + /** + * @brief Shape function gradient values at quadrature points in ValueTypeBasisData. + * @param[in] transpose if false the data is indexed as [iDim * + * d_nQuadsPerCell * d_nDofsPerCell + iQuad * d_nDofsPerCell + iNode] and + * if true it is indexed as [iDim * d_nQuadsPerCell * d_nDofsPerCell + + * iNode * d_nQuadsPerCell + iQuad].
+ */ + template ::value, int> = 0> + const dftfe::utils::MemoryStorage & + shapeFunctionGradientBasisData(bool transpose = false) const; + template ::value, int> = 0> + const dftfe::utils::MemoryStorage & + shapeFunctionGradientBasisData(bool transpose = false) const; + + /** + * @brief Inverse Jacobian matrices in ValueTypeBasisData, for cartesian cells returns the + * diagonal elements of the inverse Jacobian matrices for each cell, for + * affine cells returns the 3x3 inverse Jacobians for each cell otherwise + * returns the 3x3 inverse Jacobians at each quad point for each cell. + */ + template ::value, int> = 0> + const dftfe::utils::MemoryStorage & + inverseJacobiansBasisData() const; + template ::value, int> = 0> + const dftfe::utils::MemoryStorage & + inverseJacobiansBasisData() const; + + /** + * @brief determinant of Jacobian times the quadrature weight in ValueTypeBasisData at each + * quad point for each cell. + */ + template ::value, int> = 0> + const dftfe::utils::MemoryStorage & + JxWBasisData() const; + template ::value, int> = 0> + const dftfe::utils::MemoryStorage & + JxWBasisData() const; + + /** + * @brief returns 2 if all cells on current processor are Cartesian, + * 1 if all cells on current processor are affine and 0 otherwise. + */ + unsigned int + cellsTypeFlag() const; + + /** + * @brief returns the deal.ii cellID corresponding to given cell Index. + * @param[in] iElem cell Index + */ + dealii::CellId + cellID(const unsigned int iElem) const; + + /** + * @brief Creates a multivector. + * @param[in] blocksize Number of vectors in the multivector. + * @param[out] multiVector the created multivector. + */ + void + createMultiVector( + const unsigned int blocksize, + dftfe::linearAlgebra::MultiVector + &multiVector) const; + + /** + * @brief Creates scratch multivectors. + * @param[in] vecBlockSize Number of vectors in the multivector. + * @param[in] numMultiVecs number of scratch multivectors needed with + * this vecBlockSize.
+ */ + void + createScratchMultiVectors(const unsigned int vecBlockSize, + const unsigned int numMultiVecs = 1) const; + + /** + * @brief Clears scratch multivectors. + */ + void + clearScratchMultiVectors() const; + + /** + * @brief Gets scratch multivectors. + * @param[in] vecBlockSize Number of vectors in the multivector. + * @param[in] index index of the multivector among those with the + * same vecBlockSize. + */ + dftfe::linearAlgebra::MultiVector & + getMultiVector(const unsigned int vecBlockSize, + const unsigned int index = 0) const; + + /** + * @brief Apply constraints on given multivector. + * @param[inout] multiVector the given multivector. + */ + void + distribute( + dftfe::linearAlgebra::MultiVector + &multiVector) const; + + + + constraintInfoClass d_constraintInfo; + std::vector *> + * d_constraintsVector; + const dealii::MatrixFree<3, ValueTypeBasisData> *d_matrixFreeDataPtr; + dftfe::utils::MemoryStorage + d_cellDofIndexToProcessDofIndexMap; + dftfe::utils::MemoryStorage + d_flattenedCellDofIndexToProcessDofIndexMap; + std::vector d_cellIndexToCellIdMap; + std::vector> + d_inverseJacobianData; + std::vector> + d_JxWData; + std::vector> + d_shapeFunctionData; + std::vector> + d_shapeFunctionGradientDataInternalLayout; + std::vector> + d_shapeFunctionGradientData; + std::vector> + d_shapeFunctionDataTranspose; + std::vector> + d_shapeFunctionGradientDataTranspose; + + std::vector> + d_inverseJacobianBasisData; + std::vector> + d_JxWBasisData; + std::vector> + d_shapeFunctionBasisData; + std::vector> + d_shapeFunctionGradientBasisData; + std::vector> + d_shapeFunctionBasisDataTranspose; + std::vector> + d_shapeFunctionGradientBasisDataTranspose; + + + mutable std::map< + unsigned int, + std::vector< + dftfe::linearAlgebra::MultiVector>> + scratchMultiVectors; + + std::vector d_quadratureIDsVector; + unsigned int d_quadratureID; + std::vector d_nQuadsPerCell; + unsigned int d_dofHandlerID; + unsigned int d_nVectors; + unsigned int d_nCells; +
unsigned int d_cellsBlockSize; + unsigned int d_nDofsPerCell; + unsigned int d_localSize; + unsigned int d_locallyOwnedSize; + bool areAllCellsAffine; + bool areAllCellsCartesian; + UpdateFlags d_updateFlags; + + std::shared_ptr> + mpiPatternP2P; + }; + template + class FEBasisOperations : FEBasisOperationsBase + {}; + + template + class FEBasisOperations + : public FEBasisOperationsBase + { + public: + using FEBasisOperationsBase< + ValueTypeBasisCoeff, + ValueTypeBasisData, + dftfe::utils::MemorySpace::HOST>::FEBasisOperationsBase; + + using FEBasisOperationsBase::d_nCells; + using FEBasisOperationsBase::d_localSize; + using FEBasisOperationsBase< + ValueTypeBasisCoeff, + ValueTypeBasisData, + dftfe::utils::MemorySpace::HOST>::d_locallyOwnedSize; + using FEBasisOperationsBase< + ValueTypeBasisCoeff, + ValueTypeBasisData, + dftfe::utils::MemorySpace::HOST>::tempCellNodalData; + using FEBasisOperationsBase< + ValueTypeBasisCoeff, + ValueTypeBasisData, + dftfe::utils::MemorySpace::HOST>::tempQuadratureGradientsData; + using FEBasisOperationsBase< + ValueTypeBasisCoeff, + ValueTypeBasisData, + dftfe::utils::MemorySpace::HOST>::tempQuadratureGradientsDataNonAffine; + using FEBasisOperationsBase::d_nVectors; + using FEBasisOperationsBase< + ValueTypeBasisCoeff, + ValueTypeBasisData, + dftfe::utils::MemorySpace::HOST>::d_quadratureID; + using FEBasisOperationsBase< + ValueTypeBasisCoeff, + ValueTypeBasisData, + dftfe::utils::MemorySpace::HOST>::d_nQuadsPerCell; + using FEBasisOperationsBase< + ValueTypeBasisCoeff, + ValueTypeBasisData, + dftfe::utils::MemorySpace::HOST>::d_nDofsPerCell; + using FEBasisOperationsBase< + ValueTypeBasisCoeff, + ValueTypeBasisData, + dftfe::utils::MemorySpace::HOST>::areAllCellsAffine; + using FEBasisOperationsBase< + ValueTypeBasisCoeff, + ValueTypeBasisData, + dftfe::utils::MemorySpace::HOST>::areAllCellsCartesian; + using FEBasisOperationsBase< + ValueTypeBasisCoeff, + ValueTypeBasisData, + 
dftfe::utils::MemorySpace::HOST>::d_updateFlags; + using FEBasisOperationsBase< + ValueTypeBasisCoeff, + ValueTypeBasisData, + dftfe::utils::MemorySpace::HOST>::d_shapeFunctionData; + using FEBasisOperationsBase< + ValueTypeBasisCoeff, + ValueTypeBasisData, + dftfe::utils::MemorySpace::HOST>::d_shapeFunctionDataTranspose; + using FEBasisOperationsBase< + ValueTypeBasisCoeff, + ValueTypeBasisData, + dftfe::utils::MemorySpace::HOST>::d_shapeFunctionGradientData; + using FEBasisOperationsBase< + ValueTypeBasisCoeff, + ValueTypeBasisData, + dftfe::utils::MemorySpace::HOST>::d_shapeFunctionGradientDataTranspose; + using FEBasisOperationsBase:: + d_shapeFunctionGradientDataInternalLayout; + using FEBasisOperationsBase::d_JxWData; + using FEBasisOperationsBase< + ValueTypeBasisCoeff, + ValueTypeBasisData, + dftfe::utils::MemorySpace::HOST>::d_inverseJacobianData; + using FEBasisOperationsBase< + ValueTypeBasisCoeff, + ValueTypeBasisData, + dftfe::utils::MemorySpace::HOST>::d_cellIndexToCellIdMap; + using FEBasisOperationsBase< + ValueTypeBasisCoeff, + ValueTypeBasisData, + dftfe::utils::MemorySpace::HOST>::d_cellDofIndexToProcessDofIndexMap; + using FEBasisOperationsBase:: + d_flattenedCellDofIndexToProcessDofIndexMap; + using FEBasisOperationsBase< + ValueTypeBasisCoeff, + ValueTypeBasisData, + dftfe::utils::MemorySpace::HOST>::d_constraintsVector; + + + /** + * @brief Interpolate process level nodal data to cell level quadrature data. + * @param[in] nodalData process level nodal data, the multivector should + * already have ghost data and constraints should have been applied. + * @param[out] quadratureValues Cell level quadrature values, indexed by + * [iCell * d_nQuadsPerCell * d_nVectors + iQuad * d_nVectors + iVec]. + * @param[out] quadratureGradients Cell level quadrature gradients, + * indexed by [iCell * 3 * d_nQuadsPerCell * d_nVectors + iDim * + * d_nQuadsPerCell * d_nVectors + iQuad * d_nVectors + iVec]. 
+ */ + void + interpolate( + dftfe::linearAlgebra::MultiVector + & nodalData, + ValueTypeBasisCoeff *quadratureValues, + ValueTypeBasisCoeff *quadratureGradients = NULL) const; + + // FIXME Untested function + /** + * @brief Integrate cell level quadrature data times shape functions to process level nodal data. + * @param[in] quadratureValues Cell level quadrature values, indexed by + * [iCell * d_nQuadsPerCell * d_nVectors + iQuad * d_nVectors + iVec]. + * @param[in] quadratureGradients Cell level quadrature gradients, + * indexed by [iCell * 3 * d_nQuadsPerCell * d_nVectors + iDim * + * d_nQuadsPerCell * d_nVectors + iQuad * d_nVectors + iVec]. + * @param[out] nodalData process level nodal data. + */ + void + integrateWithBasis( + ValueTypeBasisCoeff *quadratureValues, + ValueTypeBasisCoeff *quadratureGradients, + dftfe::linearAlgebra::MultiVector + &nodalData) const; + + /** + * @brief Get cell level nodal data from process level nodal data. + * @param[in] nodalData process level nodal data, the multivector should + * already have ghost data and constraints should have been applied. + * @param[out] cellNodalDataPtr Cell level nodal values, indexed by + * [iCell * d_nDofsPerCell * d_nVectors + iDoF * d_nVectors + iVec]. + */ + void + extractToCellNodalData( + dftfe::linearAlgebra::MultiVector + & nodalData, + ValueTypeBasisCoeff *cellNodalDataPtr) const; + // FIXME Untested function + /** + * @brief Accumulate cell level nodal data into process level nodal data. + * @param[in] cellNodalDataPtr Cell level nodal values, indexed by + * [iCell * d_nDofsPerCell * d_nVectors + iDoF * d_nVectors + iVec]. + * @param[out] nodalData process level nodal data. + */ + void + accumulateFromCellNodalData( + const ValueTypeBasisCoeff *cellNodalDataPtr, + dftfe::linearAlgebra::MultiVector + &nodalData) const; + + /** + * @brief Interpolate process level nodal data to cell level quadrature data. 
+ * @param[in] nodalData process level nodal data, the multivector should + * already have ghost data and constraints should have been applied. + * @param[out] quadratureValues Cell level quadrature values, indexed by + * [iCell * d_nQuadsPerCell * d_nVectors + iQuad * d_nVectors + iVec]. + * @param[out] quadratureGradients Cell level quadrature gradients, + * indexed by [iCell * 3 * d_nQuadsPerCell * d_nVectors + iDim * + * d_nQuadsPerCell * d_nVectors + iQuad * d_nVectors + iVec]. + * @param[in] cellRange the range of cells for which interpolation has to + * be done. + */ + void + interpolateKernel( + const dftfe::linearAlgebra::MultiVector + & nodalData, + ValueTypeBasisCoeff * quadratureValues, + ValueTypeBasisCoeff * quadratureGradients, + const std::pair cellRange) const; + + /** + * @brief Interpolate cell level nodal data to cell level quadrature data. + * @param[in] nodalData cell level nodal data, the multivector should + * already have ghost data and constraints should have been applied. + * @param[out] quadratureValues Cell level quadrature values, indexed by + * [iCell * d_nQuadsPerCell * d_nVectors + iQuad * d_nVectors + iVec]. + * @param[out] quadratureGradients Cell level quadrature gradients, + * indexed by [iCell * 3 * d_nQuadsPerCell * d_nVectors + iDim * + * d_nQuadsPerCell * d_nVectors + iQuad * d_nVectors + iVec]. + * @param[in] cellRange the range of cells for which interpolation has to + * be done. + */ + void + interpolateKernel( + const ValueTypeBasisCoeff * nodalData, + ValueTypeBasisCoeff * quadratureValues, + ValueTypeBasisCoeff * quadratureGradients, + const std::pair cellRange) const; + + // FIXME Untested function + /** + * @brief Integrate cell level quadrature data times shape functions to process level nodal data. + * @param[in] quadratureValues Cell level quadrature values, indexed by + * [iCell * d_nQuadsPerCell * d_nVectors + iQuad * d_nVectors + iVec]. 
+ * @param[in] quadratureGradients Cell level quadrature gradients, + * indexed by [iCell * 3 * d_nQuadsPerCell * d_nVectors + iDim * + * d_nQuadsPerCell * d_nVectors + iQuad * d_nVectors + iVec]. + * @param[out] nodalData process level nodal data. + * @param[in] cellRange the range of cells for which integration has to be + * done. + */ + void + integrateWithBasisKernel( + const ValueTypeBasisCoeff *quadratureValues, + const ValueTypeBasisCoeff *quadratureGradients, + dftfe::linearAlgebra::MultiVector + & nodalData, + const std::pair cellRange) const; + + + /** + * @brief Get cell level nodal data from process level nodal data. + * @param[in] nodalData process level nodal data, the multivector should + * already have ghost data and constraints should have been applied. + * @param[out] cellNodalDataPtr Cell level nodal values, indexed by + * [iCell * d_nDofsPerCell * d_nVectors + iDoF * d_nVectors + iVec]. + * @param[in] cellRange the range of cells for which extraction has to be + * done. + */ + void + extractToCellNodalDataKernel( + const dftfe::linearAlgebra::MultiVector + & nodalData, + ValueTypeBasisCoeff * cellNodalDataPtr, + const std::pair cellRange) const; + + // FIXME Untested function + /** + * @brief Accumulate cell level nodal data into process level nodal data. + * @param[in] cellNodalDataPtr Cell level nodal values, indexed by + * [iCell * d_nDofsPerCell * d_nVectors + iDoF * d_nVectors + iVec]. + * @param[out] nodalData process level nodal data. + * @param[in] cellRange the range of cells for which accumulation has to be + * done.
+ */ + void + accumulateFromCellNodalDataKernel( + const ValueTypeBasisCoeff *cellNodalDataPtr, + dftfe::linearAlgebra::MultiVector + & nodalData, + const std::pair cellRange) const; + }; +#if defined(DFTFE_WITH_DEVICE) + template + class FEBasisOperations + : public FEBasisOperationsBase + { + public: + using FEBasisOperationsBase< + ValueTypeBasisCoeff, + ValueTypeBasisData, + dftfe::utils::MemorySpace::DEVICE>::FEBasisOperationsBase; + using FEBasisOperationsBase::d_nCells; + using FEBasisOperationsBase< + ValueTypeBasisCoeff, + ValueTypeBasisData, + dftfe::utils::MemorySpace::DEVICE>::d_localSize; + using FEBasisOperationsBase< + ValueTypeBasisCoeff, + ValueTypeBasisData, + dftfe::utils::MemorySpace::DEVICE>::d_locallyOwnedSize; + using FEBasisOperationsBase< + ValueTypeBasisCoeff, + ValueTypeBasisData, + dftfe::utils::MemorySpace::DEVICE>::tempCellNodalData; + using FEBasisOperationsBase< + ValueTypeBasisCoeff, + ValueTypeBasisData, + dftfe::utils::MemorySpace::DEVICE>::tempQuadratureGradientsData; + using FEBasisOperationsBase:: + tempQuadratureGradientsDataNonAffine; + using FEBasisOperationsBase< + ValueTypeBasisCoeff, + ValueTypeBasisData, + dftfe::utils::MemorySpace::DEVICE>::d_nVectors; + using FEBasisOperationsBase< + ValueTypeBasisCoeff, + ValueTypeBasisData, + dftfe::utils::MemorySpace::DEVICE>::d_cellsBlockSize; + using FEBasisOperationsBase< + ValueTypeBasisCoeff, + ValueTypeBasisData, + dftfe::utils::MemorySpace::DEVICE>::d_quadratureID; + using FEBasisOperationsBase< + ValueTypeBasisCoeff, + ValueTypeBasisData, + dftfe::utils::MemorySpace::DEVICE>::d_nQuadsPerCell; + using FEBasisOperationsBase< + ValueTypeBasisCoeff, + ValueTypeBasisData, + dftfe::utils::MemorySpace::DEVICE>::d_nDofsPerCell; + using FEBasisOperationsBase< + ValueTypeBasisCoeff, + ValueTypeBasisData, + dftfe::utils::MemorySpace::DEVICE>::areAllCellsAffine; + using FEBasisOperationsBase< + ValueTypeBasisCoeff, + ValueTypeBasisData, + 
dftfe::utils::MemorySpace::DEVICE>::areAllCellsCartesian; + using FEBasisOperationsBase< + ValueTypeBasisCoeff, + ValueTypeBasisData, + dftfe::utils::MemorySpace::DEVICE>::d_updateFlags; + using FEBasisOperationsBase< + ValueTypeBasisCoeff, + ValueTypeBasisData, + dftfe::utils::MemorySpace::DEVICE>::d_shapeFunctionData; + using FEBasisOperationsBase< + ValueTypeBasisCoeff, + ValueTypeBasisData, + dftfe::utils::MemorySpace::DEVICE>::d_shapeFunctionDataTranspose; + using FEBasisOperationsBase< + ValueTypeBasisCoeff, + ValueTypeBasisData, + dftfe::utils::MemorySpace::DEVICE>::d_shapeFunctionGradientData; + using FEBasisOperationsBase:: + d_shapeFunctionGradientDataTranspose; + using FEBasisOperationsBase:: + d_shapeFunctionGradientDataInternalLayout; + using FEBasisOperationsBase::d_JxWData; + using FEBasisOperationsBase< + ValueTypeBasisCoeff, + ValueTypeBasisData, + dftfe::utils::MemorySpace::DEVICE>::d_inverseJacobianData; + using FEBasisOperationsBase< + ValueTypeBasisCoeff, + ValueTypeBasisData, + dftfe::utils::MemorySpace::DEVICE>::d_cellIndexToCellIdMap; + using FEBasisOperationsBase< + ValueTypeBasisCoeff, + ValueTypeBasisData, + dftfe::utils::MemorySpace::DEVICE>::d_cellDofIndexToProcessDofIndexMap; + using FEBasisOperationsBase:: + d_flattenedCellDofIndexToProcessDofIndexMap; + using FEBasisOperationsBase< + ValueTypeBasisCoeff, + ValueTypeBasisData, + dftfe::utils::MemorySpace::DEVICE>::d_constraintsVector; + + // FIXME has to be removed in a future PR + /** + * @brief sets device blas handle for internal blas operations. + */ + dftfe::utils::deviceBlasHandle_t *d_deviceBlasHandlePtr; + void + setDeviceBLASHandle( + dftfe::utils::deviceBlasHandle_t *deviceBlasHandlePtr); + + // FIXME has to be removed in a future PR + /** + * @brief gets device blas handle for blas operations. + */ + dftfe::utils::deviceBlasHandle_t & + getDeviceBLASHandle(); + + + + /** + * @brief Interpolate process level nodal data to cell level quadrature data. 
+ * @param[in] nodalData process level nodal data, the multivector should + * already have ghost data and constraints should have been applied. + * @param[out] quadratureValues Cell level quadrature values, indexed by + * [iCell * d_nQuadsPerCell * d_nVectors + iQuad * d_nVectors + iVec]. + * @param[out] quadratureGradients Cell level quadrature gradients, + * indexed by [iCell * 3 * d_nQuadsPerCell * d_nVectors + iDim * + * d_nQuadsPerCell * d_nVectors + iQuad * d_nVectors + iVec]. + */ + void + interpolate( + dftfe::linearAlgebra::MultiVector + & nodalData, + ValueTypeBasisCoeff *quadratureValues, + ValueTypeBasisCoeff *quadratureGradients = NULL) const; + + + // FIXME Untested function + /** + * @brief Integrate cell level quadrature data times shape functions to process level nodal data. + * @param[in] quadratureValues Cell level quadrature values, indexed by + * [iCell * d_nQuadsPerCell * d_nVectors + iQuad * d_nVectors + iVec]. + * @param[in] quadratureGradients Cell level quadrature gradients, + * indexed by [iCell * 3 * d_nQuadsPerCell * d_nVectors + iDim * + * d_nQuadsPerCell * d_nVectors + iQuad * d_nVectors + iVec]. + * @param[out] nodalData process level nodal data. + */ + void + integrateWithBasis( + ValueTypeBasisCoeff *quadratureValues, + ValueTypeBasisCoeff *quadratureGradients, + dftfe::linearAlgebra::MultiVector + &nodalData) const; + + /** + * @brief Get cell level nodal data from process level nodal data. + * @param[in] nodalData process level nodal data, the multivector should + * already have ghost data and constraints should have been applied. + * @param[out] cellNodalDataPtr Cell level nodal values, indexed by + * [iCell * d_nDofsPerCell * d_nVectors + iDoF * d_nVectors + iVec]. + */ + void + extractToCellNodalData( + dftfe::linearAlgebra::MultiVector + & nodalData, + ValueTypeBasisCoeff *cellNodalDataPtr) const; + + // FIXME Untested function + /** + * @brief Accumulate cell level nodal data into process level nodal data. 
+ * @param[in] cellNodalDataPtr Cell level nodal values, indexed by + * [iCell * d_nDofsPerCell * d_nVectors + iDoF * d_nVectors + iVec]. + * @param[out] nodalData process level nodal data. + */ + void + accumulateFromCellNodalData( + const ValueTypeBasisCoeff *cellNodalDataPtr, + dftfe::linearAlgebra::MultiVector + &nodalData) const; + + /** + * @brief Interpolate process level nodal data to cell level quadrature data. + * @param[in] nodalData process level nodal data, the multivector should + * already have ghost data and constraints should have been applied. + * @param[out] quadratureValues Cell level quadrature values, indexed by + * [iCell * d_nQuadsPerCell * d_nVectors + iQuad * d_nVectors + iVec]. + * @param[out] quadratureGradients Cell level quadrature gradients, + * indexed by [iCell * 3 * d_nQuadsPerCell * d_nVectors + iDim * + * d_nQuadsPerCell * d_nVectors + iQuad * d_nVectors + iVec]. + * @param[in] cellRange the range of cells for which interpolation has to + * be done. + */ + void + interpolateKernel( + const dftfe::linearAlgebra::MultiVector< + ValueTypeBasisCoeff, + dftfe::utils::MemorySpace::DEVICE> & nodalData, + ValueTypeBasisCoeff * quadratureValues, + ValueTypeBasisCoeff * quadratureGradients, + const std::pair cellRange) const; + + /** + * @brief Interpolate cell level nodal data to cell level quadrature data. + * @param[in] nodalData cell level nodal data, the multivector should + * already have ghost data and constraints should have been applied. + * @param[out] quadratureValues Cell level quadrature values, indexed by + * [iCell * d_nQuadsPerCell * d_nVectors + iQuad * d_nVectors + iVec]. + * @param[out] quadratureGradients Cell level quadrature gradients, + * indexed by [iCell * 3 * d_nQuadsPerCell * d_nVectors + iDim * + * d_nQuadsPerCell * d_nVectors + iQuad * d_nVectors + iVec]. + * @param[in] cellRange the range of cells for which interpolation has to + * be done. 
+ */ + void + interpolateKernel( + const ValueTypeBasisCoeff * nodalData, + ValueTypeBasisCoeff * quadratureValues, + ValueTypeBasisCoeff * quadratureGradients, + const std::pair cellRange) const; + + // FIXME Untested function + /** + * @brief Integrate cell level quadrature data times shape functions to process level nodal data. + * @param[in] quadratureValues Cell level quadrature values, indexed by + * [iCell * d_nQuadsPerCell * d_nVectors + iQuad * d_nVectors + iVec]. + * @param[in] quadratureGradients Cell level quadrature gradients, + * indexed by [iCell * 3 * d_nQuadsPerCell * d_nVectors + iDim * + * d_nQuadsPerCell * d_nVectors + iQuad * d_nVectors + iVec]. + * @param[out] nodalData process level nodal data. + * @param[in] cellRange the range of cells for which integration has to be + * done. + */ + void + integrateWithBasisKernel( + const ValueTypeBasisCoeff *quadratureValues, + const ValueTypeBasisCoeff *quadratureGradients, + dftfe::linearAlgebra::MultiVector + & nodalData, + const std::pair cellRange) const; + + + /** + * @brief Get cell level nodal data from process level nodal data. + * @param[in] nodalData process level nodal data, the multivector should + * already have ghost data and constraints should have been applied. + * @param[out] cellNodalDataPtr Cell level nodal values, indexed by + * [iCell * d_nDofsPerCell * d_nVectors + iDoF * d_nVectors + iVec]. + * @param[in] cellRange the range of cells for which extraction has to be + * done. + */ + void + extractToCellNodalDataKernel( + const dftfe::linearAlgebra::MultiVector< + ValueTypeBasisCoeff, + dftfe::utils::MemorySpace::DEVICE> & nodalData, + ValueTypeBasisCoeff * cellNodalDataPtr, + const std::pair cellRange) const; + + // FIXME Untested function + /** + * @brief Accumulate cell level nodal data into process level nodal data. + * @param[in] cellNodalDataPtr Cell level nodal values, indexed by + * [iCell * d_nDofsPerCell * d_nVectors + iDoF * d_nVectors + iVec]. 
+ * @param[out] nodalData process level nodal data. + * @param[in] cellRange the range of cells for which accumulation has to be + * done. + */ + void + accumulateFromCellNodalDataKernel( + const ValueTypeBasisCoeff *cellNodalDataPtr, + dftfe::linearAlgebra::MultiVector + & nodalData, + const std::pair cellRange) const; + }; +#endif + } // end of namespace basis +} // end of namespace dftfe +#include "../utils/FEBasisOperations.t.cc" +#include "../utils/FEBasisOperationsHost.t.cc" +#if defined(DFTFE_WITH_DEVICE) +# include "../utils/FEBasisOperationsDevice.t.cc" +#endif + +#endif // dftfeBasisOperations_h diff --git a/include/FEBasisOperationsKernelsDevice.h b/include/FEBasisOperationsKernelsDevice.h new file mode 100644 index 000000000..8a38c53a8 --- /dev/null +++ b/include/FEBasisOperationsKernelsDevice.h @@ -0,0 +1,52 @@ +// --------------------------------------------------------------------- +// +// Copyright (c) 2017-2022 The Regents of the University of Michigan and DFT-FE +// authors. +// +// This file is part of the DFT-FE code. +// +// The DFT-FE code is free software; you can use it, redistribute +// it, and/or modify it under the terms of the GNU Lesser General +// Public License as published by the Free Software Foundation; either +// version 2.1 of the License, or (at your option) any later version. +// The full text of the license can be found in the file LICENSE at +// the top level of the DFT-FE distribution. 
+// +// --------------------------------------------------------------------- + +#ifndef dftfeFEBasisOperationsKernelsDevice_h +#define dftfeFEBasisOperationsKernelsDevice_h + +#ifdef DFTFE_WITH_DEVICE +# include + +namespace dftfe +{ + namespace basis + { + namespace FEBasisOperationsKernelsDevice + { + /** + * @brief reshape gradient data from [iCell * 3 * d_nQuadsPerCell * d_nVectors + iQuad * 3 * d_nVectors + iDim * d_nVectors + iVec] to [iCell * 3 * d_nQuadsPerCell * d_nVectors + iDim * + * d_nQuadsPerCell * d_nVectors + iQuad * d_nVectors + iVec]. + * @param[in] numVecs number of vectors. + * @param[in] numQuads number of quadrature points per cell. + * @param[in] numCells number of locally owned cells. + * @param[in] copyFromVec source data pointer. + * @param[out] copyToVec destination data pointer. + */ + template + void + reshapeNonAffineCase(const dftfe::size_type numVecs, + const dftfe::size_type numQuads, + const dftfe::size_type numCells, + const ValueType1 * copyFromVec, + ValueType2 * copyToVec); + + + }; // namespace FEBasisOperationsKernelsDevice + } // namespace basis +} // namespace dftfe + +#endif // DFTFE_WITH_DEVICE +#endif // dftfeFEBasisOperationsKernelsDevice_h diff --git a/include/chebyshevOrthogonalizedSubspaceIterationSolver.h b/include/chebyshevOrthogonalizedSubspaceIterationSolver.h index c0dadb055..929ef8534 100644 --- a/include/chebyshevOrthogonalizedSubspaceIterationSolver.h +++ b/include/chebyshevOrthogonalizedSubspaceIterationSolver.h @@ -64,17 +64,18 @@ namespace dftfe * @brief Solve a generalized eigen problem. 
*/ void - solve(operatorDFTClass & operatorMatrix, - elpaScalaManager & elpaScala, - std::vector &eigenVectorsFlattened, - std::vector &eigenVectorsRotFracDensityFlattened, - const unsigned int totalNumberWaveFunctions, - std::vector & eigenValues, - std::vector & residuals, - const MPI_Comm & interBandGroupComm, - const bool computeResidual, - const bool useMixedPrec = false, - const bool isFirstScf = false); + solve(operatorDFTClass & operatorMatrix, + elpaScalaManager & elpaScala, + dataTypes::number * eigenVectorsFlattened, + dataTypes::number * eigenVectorsRotFracDensityFlattened, + const unsigned int totalNumberWaveFunctions, + const unsigned int localVectorSize, + std::vector &eigenValues, + std::vector &residuals, + const MPI_Comm & interBandGroupComm, + const bool computeResidual, + const bool useMixedPrec = false, + const bool isFirstScf = false); /** * @brief Solve a generalized eigen problem. diff --git a/include/constraintMatrixInfo.h b/include/constraintMatrixInfo.h index 7fdb002db..92a71e0f4 100644 --- a/include/constraintMatrixInfo.h +++ b/include/constraintMatrixInfo.h @@ -63,33 +63,6 @@ namespace dftfe & partitioner, const dealii::AffineConstraints &constraintMatrixData); - /** - * @brief precompute map between local processor index of unflattened deallii array to the local processor index of - * the first field associated with the multi-field flattened dealii array - * - * @param partitioner1 associated with unflattened dealii vector - * @param partitioner2 associated with flattened dealii vector storing multi-fields - */ - void - precomputeMaps( - const std::shared_ptr - &partitioner1, - const std::shared_ptr - & partitioner2, - const unsigned int blockSize); - - /** - * @brief precompute map between local processor index of unflattened deallii array to the local processor index of - * the first field associated with the multi-field flattened dealii array - * - * @param partitioner1 associated with unflattened dealii vector - * @param 
partitioner2 associated with flattened dealii vector storing multi-fields - */ - void - precomputeMaps(const std::shared_ptr> &partitioner2, - const unsigned int blockSize); - /** * @brief overloaded dealii internal function "distribute" which sets the slave node * field values from master nodes diff --git a/include/constraintMatrixInfoDevice.h b/include/constraintMatrixInfoDevice.h index e682a25b5..dd5338a0e 100644 --- a/include/constraintMatrixInfoDevice.h +++ b/include/constraintMatrixInfoDevice.h @@ -61,27 +61,6 @@ namespace dftfe & partitioner, const dealii::AffineConstraints &constraintMatrixData); - /** - * @brief precompute map between local processor index of unflattened deallii array to the local processor index of - * the first field associated with the multi-field flattened dealii array - * - * @param partitioner1 associated with unflattened dealii vector - * @param partitioner2 associated with flattened dealii vector storing multi-fields - */ - void - precomputeMaps(const std::shared_ptr> &partitioner2, - const unsigned int blockSize); - - void - precomputeMaps( - const std::shared_ptr - &partitioner1, - const std::shared_ptr - & partitioner2, - const unsigned int blockSize); - - /** * @brief overloaded dealii internal function distribute for flattened dealii array which sets * the slave node field values from master nodes diff --git a/include/densityCalculator.h b/include/densityCalculator.h new file mode 100644 index 000000000..b6154609e --- /dev/null +++ b/include/densityCalculator.h @@ -0,0 +1,96 @@ +// --------------------------------------------------------------------- +// +// Copyright (c) 2017-2022 The Regents of the University of Michigan and DFT-FE +// authors. +// +// This file is part of the DFT-FE code. 
+// +// The DFT-FE code is free software; you can use it, redistribute +// it, and/or modify it under the terms of the GNU Lesser General +// Public License as published by the Free Software Foundation; either +// version 2.1 of the License, or (at your option) any later version. +// The full text of the license can be found in the file LICENSE at +// the top level of the DFT-FE distribution. +// +// --------------------------------------------------------------------- +// + +#ifndef densityCalculatorDevice_H_ +#define densityCalculatorDevice_H_ + +#include +#include +#include "dftParameters.h" +#include "FEBasisOperations.h" + +namespace dftfe +{ + template + void + computeRhoFromPSI( + const dftfe::utils::MemoryStorage *X, + const dftfe::utils::MemoryStorage *XFrac, + const unsigned int totalNumWaveFunctions, + const unsigned int Nfr, + const std::vector> &eigenValues, + const double fermiEnergy, + const double fermiEnergyUp, + const double fermiEnergyDown, + std::shared_ptr< + dftfe::basis::FEBasisOperations> + & basisOperationsPtr, + const unsigned int matrixFreeDofhandlerIndex, + const unsigned int quadratureIndex, + const std::vector & kPointWeights, + std::map> *rhoValues, + std::map> *gradRhoValues, + std::map> *rhoValuesSpinPolarized, + std::map> *gradRhoValuesSpinPolarized, + const bool isEvaluateGradRho, + const MPI_Comm & mpiCommParent, + const MPI_Comm & interpoolcomm, + const MPI_Comm & interBandGroupComm, + const dftParameters & dftParams, + const bool spectrumSplit); + + template + void + computeRhoGradRhoFromInterpolatedValues( + std::shared_ptr< + dftfe::basis:: + FEBasisOperations> + & basisOperationsPtr, + const std::pair cellRange, + const std::pair vecRange, + double * partialOccupVec, + NumberType * wfcQuadPointData, + NumberType * gradWfcQuadPointData, + double * rhoCellsWfcContributions, + double * gradRhoCellsWfcContributions, + double * rho, + double * gradRho, + const bool isEvaluateGradRho); + +#if defined(DFTFE_WITH_DEVICE) + template 
+ void + computeRhoGradRhoFromInterpolatedValues( + std::shared_ptr< + dftfe::basis::FEBasisOperations> + & basisOperationsPtr, + const std::pair cellRange, + const std::pair vecRange, + double * partialOccupVec, + NumberType * wfcQuadPointData, + NumberType * gradWfcQuadPointData, + double * rhoCellsWfcContributions, + double * gradRhoCellsWfcContributions, + double * rho, + double * gradRho, + const bool isEvaluateGradRho); +#endif + +} // namespace dftfe +#endif diff --git a/include/densityCalculatorCPU.h b/include/densityCalculatorCPU.h deleted file mode 100644 index e324cdad6..000000000 --- a/include/densityCalculatorCPU.h +++ /dev/null @@ -1,63 +0,0 @@ -// --------------------------------------------------------------------- -// -// Copyright (c) 2017-2022 The Regents of the University of Michigan and DFT-FE -// authors. -// -// This file is part of the DFT-FE code. -// -// The DFT-FE code is free software; you can use it, redistribute -// it, and/or modify it under the terms of the GNU Lesser General -// Public License as published by the Free Software Foundation; either -// version 2.1 of the License, or (at your option) any later version. -// The full text of the license can be found in the file LICENSE at -// the top level of the DFT-FE distribution. 
-// -// --------------------------------------------------------------------- -// - -#ifndef densityCalculatorCPU_H_ -#define densityCalculatorCPU_H_ - -#include "headers.h" -#include "operator.h" -#include "dftParameters.h" - -namespace dftfe -{ - /** - * @brief Density calculator class using gemm recasting - * - * @author Sambit Das - */ - - template - void - computeRhoFromPSICPU( - const std::vector> & X, - const std::vector> & XFrac, - const unsigned int totalNumWaveFunctions, - const unsigned int Nfr, - const unsigned int numLocalDofs, - const std::vector> & eigenValues, - const double fermiEnergy, - const double fermiEnergyUp, - const double fermiEnergyDown, - operatorDFTClass & operatorMatrix, - const dealii::DoFHandler<3> & dofHandler, - const unsigned int totalLocallyOwnedCells, - const unsigned int numberNodesPerElement, - const unsigned int numQuadPoints, - const std::vector & kPointWeights, - std::map> *rhoValues, - std::map> *gradRhoValues, - std::map> *rhoValuesSpinPolarized, - std::map> *gradRhoValuesSpinPolarized, - const bool isEvaluateGradRho, - const MPI_Comm & mpiCommParent, - const MPI_Comm & interpoolcomm, - const MPI_Comm & interBandGroupComm, - const dftParameters & dftParams, - const bool spectrumSplit, - const bool useFEOrderRhoPlusOneGLQuad); -} // namespace dftfe -#endif diff --git a/include/densityCalculatorDevice.h b/include/densityCalculatorDevice.h deleted file mode 100644 index 847efc733..000000000 --- a/include/densityCalculatorDevice.h +++ /dev/null @@ -1,63 +0,0 @@ -// --------------------------------------------------------------------- -// -// Copyright (c) 2017-2022 The Regents of the University of Michigan and DFT-FE -// authors. -// -// This file is part of the DFT-FE code. 
-// -// The DFT-FE code is free software; you can use it, redistribute -// it, and/or modify it under the terms of the GNU Lesser General -// Public License as published by the Free Software Foundation; either -// version 2.1 of the License, or (at your option) any later version. -// The full text of the license can be found in the file LICENSE at -// the top level of the DFT-FE distribution. -// -// --------------------------------------------------------------------- -// - -#if defined(DFTFE_WITH_DEVICE) -# ifndef densityCalculatorDevice_H_ -# define densityCalculatorDevice_H_ - -# include -# include -# include "dftParameters.h" - -namespace dftfe -{ - namespace Device - { - template - void - computeRhoFromPSI( - const NumberType * X, - const NumberType * XFrac, - const unsigned int totalNumWaveFunctions, - const unsigned int Nfr, - const unsigned int numLocalDofs, - const std::vector> & eigenValues, - const double fermiEnergy, - const double fermiEnergyUp, - const double fermiEnergyDown, - operatorDFTDeviceClass & operatorMatrix, - const unsigned int matrixFreeDofhandlerIndex, - const dealii::DoFHandler<3> & dofHandler, - const unsigned int totalLocallyOwnedCells, - const unsigned int numberNodesPerElement, - const unsigned int numQuadPoints, - const std::vector & kPointWeights, - std::map> *rhoValues, - std::map> *gradRhoValues, - std::map> *rhoValuesSpinPolarized, - std::map> *gradRhoValuesSpinPolarized, - const bool isEvaluateGradRho, - const MPI_Comm & mpiCommParent, - const MPI_Comm & interpoolcomm, - const MPI_Comm & interBandGroupComm, - const dftParameters & dftParams, - const bool spectrumSplit, - const bool use2pPlusOneGLQuad = false); - } -} // namespace dftfe -# endif -#endif diff --git a/include/densityFirstOrderResponseCalculator.h b/include/densityFirstOrderResponseCalculator.h index c674d82d0..4f5bb94f6 100644 --- a/include/densityFirstOrderResponseCalculator.h +++ b/include/densityFirstOrderResponseCalculator.h @@ -33,8 +33,8 @@ namespace dftfe 
template void computeRhoFirstOrderResponseCPU( - const std::vector> & X, - const std::vector> & XPrime, + const NumberType * X, + const NumberType * XPrime, const std::vector> & densityMatDerFermiEnergy, const unsigned int totalNumWaveFunctions, const unsigned int numLocalDofs, @@ -59,8 +59,8 @@ namespace dftfe template void computeRhoFirstOrderResponseCPUMixedPrec( - const std::vector> & X, - const std::vector> & XPrime, + const NumberType * X, + const NumberType * XPrime, const std::vector> & densityMatDerFermiEnergy, const unsigned int totalNumWaveFunctions, const unsigned int numLocalDofs, diff --git a/include/deviceKernelsGeneric.h b/include/deviceKernelsGeneric.h index 6e6b631af..3703bdf02 100644 --- a/include/deviceKernelsGeneric.h +++ b/include/deviceKernelsGeneric.h @@ -86,6 +86,17 @@ namespace dftfe ValueType2 * copyToVec); + template + void + stridedCopyConstantStride(const dftfe::size_type blockSize, + const dftfe::size_type strideTo, + const dftfe::size_type strideFrom, + const dftfe::size_type numBlocks, + const dftfe::size_type startingToId, + const dftfe::size_type startingFromId, + const ValueType1 * copyFromVec, + ValueType2 * copyToVec); + template void diff --git a/include/dft.h b/include/dft.h index 1f64da879..de63bbf10 100644 --- a/include/dft.h +++ b/include/dft.h @@ -23,6 +23,7 @@ #include #include #include +#include #include #include @@ -1284,6 +1285,18 @@ namespace dftfe unsigned int d_densityQuadratureId; unsigned int d_densityQuadratureIdElectro; dealii::MatrixFree<3, double> matrix_free_data, d_matrixFreeDataPRefined; + std::shared_ptr< + dftfe::basis::FEBasisOperations> + basisOperationsPtrHost; +#if defined(DFTFE_WITH_DEVICE) + std::shared_ptr< + dftfe::basis::FEBasisOperations> + basisOperationsPtrDevice; +#endif std::map> d_supportPoints, d_supportPointsPRefined, d_supportPointsEigen; std::vector *> d_constraintsVector; @@ -1394,14 +1407,24 @@ namespace dftfe std::vector> d_densityMatDerFermiEnergy; /// Spectrum split higher 
eigenvalues computed in Rayleigh-Ritz step - std::vector> eigenValuesRRSplit; - std::vector> d_eigenVectorsFlattened; - std::vector> d_eigenVectorsFlattenedSTL; - std::vector> - d_eigenVectorsRotFracDensityFlattenedSTL; - - std::vector> - d_eigenVectorsDensityMatrixPrimeSTL; + std::vector> eigenValuesRRSplit; + + /** + * The indexing of d_eigenVectorsFlattenedHost and + * d_eigenVectorsFlattenedDevice [kPoint * numSpinComponents * + * numLocallyOwnedNodes * numWaveFunctions + iSpin * numLocallyOwnedNodes * + * numWaveFunctions + iNode * numWaveFunctions + iWaveFunction] + */ + dftfe::utils::MemoryStorage + d_eigenVectorsFlattenedHost; + + dftfe::utils::MemoryStorage + d_eigenVectorsRotFracDensityFlattenedHost; + dftfe::utils::MemoryStorage + d_eigenVectorsDensityMatrixPrimeHost; /// device eigenvectors #ifdef DFTFE_WITH_DEVICE diff --git a/include/forceWfcContractions.h b/include/forceWfcContractions.h index af612a197..70511ec7a 100644 --- a/include/forceWfcContractions.h +++ b/include/forceWfcContractions.h @@ -28,14 +28,14 @@ namespace dftfe { void wfcContractionsForceKernelsAllH( - operatorDFTClass & operatorMatrix, - const std::vector> &X, - const unsigned int spinPolarizedFlag, - const unsigned int spinIndex, - const std::vector> & eigenValuesH, - const std::vector> & partialOccupanciesH, - const std::vector & kPointCoordinates, - const unsigned int *nonTrivialIdToElemIdMapH, + operatorDFTClass & operatorMatrix, + const dataTypes::number * X, + const unsigned int spinPolarizedFlag, + const unsigned int spinIndex, + const std::vector> &eigenValuesH, + const std::vector> &partialOccupanciesH, + const std::vector & kPointCoordinates, + const unsigned int * nonTrivialIdToElemIdMapH, const unsigned int *projecterKetTimesFlattenedVectorLocalIdsH, const unsigned int MLoc, const unsigned int N, diff --git a/include/forceWfcContractionsDevice.h b/include/forceWfcContractionsDevice.h index e3253a854..4f0771dfe 100644 --- a/include/forceWfcContractionsDevice.h +++ 
b/include/forceWfcContractionsDevice.h @@ -22,6 +22,7 @@ # include "headers.h" # include "operatorDevice.h" # include "dftParameters.h" +# include "FEBasisOperations.h" namespace dftfe { @@ -29,6 +30,11 @@ namespace dftfe { void wfcContractionsForceKernelsAllH( + std::shared_ptr< + dftfe::basis::FEBasisOperations> + & basisOperationsPtr, operatorDFTDeviceClass & operatorMatrix, const dataTypes::number * X, const unsigned int spinPolarizedFlag, diff --git a/include/kohnShamDFTOperator.h b/include/kohnShamDFTOperator.h index 942255f3a..39754397f 100644 --- a/include/kohnShamDFTOperator.h +++ b/include/kohnShamDFTOperator.h @@ -121,9 +121,10 @@ node is stored * @return ProjMatrix projected small matrix */ void - XtHX(const std::vector &src, - const unsigned int numberComponents, - std::vector & ProjHam); + XtHX(const dataTypes::number * src, + const unsigned int numberComponents, + const unsigned int numberLocalDofs, + std::vector &ProjHam); /** * @brief Compute projection of the operator into a subspace spanned by a given orthogonal basis HConj=X^{T}*HConj*XConj @@ -139,8 +140,9 @@ node is stored * also avoids creation of another full X memory. 
*/ void - XtHX(const std::vector & X, + XtHX(const dataTypes::number * X, const unsigned int numberComponents, + const unsigned int numberLocalDofs, const std::shared_ptr &processGrid, dftfe::ScaLAPACKMatrix & projHamPar, const bool onlyHPrimePartForFirstOrderDensityMatResponse = false); @@ -161,9 +163,10 @@ node is stored */ void XtHXMixedPrec( - const std::vector & X, + const dataTypes::number * X, const unsigned int N, const unsigned int Ncore, + const unsigned int numberLocalDofs, const std::shared_ptr &processGrid, dftfe::ScaLAPACKMatrix & projHamPar, const bool onlyHPrimePartForFirstOrderDensityMatResponse = false); diff --git a/include/kohnShamDFTOperatorDevice.h b/include/kohnShamDFTOperatorDevice.h index a23c4d7ff..8e4d15b53 100644 --- a/include/kohnShamDFTOperatorDevice.h +++ b/include/kohnShamDFTOperatorDevice.h @@ -21,6 +21,7 @@ #include #include #include +#include namespace dftfe { @@ -85,14 +86,14 @@ namespace dftfe dftfe::utils::MemoryStorage & getShapeFunctionValuesNLPTransposed(); - dftfe::utils::MemoryStorage & - getShapeFunctionGradientValuesXTransposed(); + // dftfe::utils::MemoryStorage & + // getShapeFunctionGradientValuesXTransposed(); - dftfe::utils::MemoryStorage & - getShapeFunctionGradientValuesYTransposed(); + // dftfe::utils::MemoryStorage & + // getShapeFunctionGradientValuesYTransposed(); - dftfe::utils::MemoryStorage & - getShapeFunctionGradientValuesZTransposed(); + // dftfe::utils::MemoryStorage & + // getShapeFunctionGradientValuesZTransposed(); dftfe::utils::MemoryStorage & getShapeFunctionGradientValuesNLPTransposed(); @@ -647,6 +648,16 @@ namespace dftfe /// pointer to dft class dftClass *dftPtr; + std::shared_ptr< + dftfe::basis::FEBasisOperations> + basisOperationsPtrDevice; + std::shared_ptr< + dftfe::basis::FEBasisOperations> + basisOperationsPtrHost; /// data structures to store diagonal of inverse square root mass matrix and @@ -686,17 +697,6 @@ namespace dftfe dftfe::utils::MemoryStorage 
d_shapeFunctionValueTransposedLpspDevice; - /// storage for shapefunction gradients - std::vector d_shapeFunctionGradientValueX; - std::vector d_shapeFunctionGradientValueXTransposed; - - std::vector d_shapeFunctionGradientValueY; - std::vector d_shapeFunctionGradientValueYTransposed; - - std::vector d_shapeFunctionGradientValueZ; - std::vector d_shapeFunctionGradientValueZTransposed; - - std::vector d_cellJxWValues; dftfe::utils::MemoryStorage d_cellJxWValuesDevice; diff --git a/include/linearAlgebraOperations.h b/include/linearAlgebraOperations.h index bb361e976..cbb5c9f5a 100644 --- a/include/linearAlgebraOperations.h +++ b/include/linearAlgebraOperations.h @@ -584,8 +584,9 @@ namespace dftfe */ template void - gramSchmidtOrthogonalization(std::vector & X, + gramSchmidtOrthogonalization(T * X, const unsigned int numberComponents, + const unsigned int numberDofs, const MPI_Comm & mpiComm); @@ -621,8 +622,9 @@ namespace dftfe template unsigned int pseudoGramSchmidtOrthogonalization(elpaScalaManager & elpaScala, - std::vector & X, + T * X, const unsigned int numberComponents, + const unsigned int numberDofs, const MPI_Comm & mpiCommParent, const MPI_Comm & interBandGroupComm, const MPI_Comm & mpiCommDomain, @@ -647,8 +649,9 @@ namespace dftfe void rayleighRitzGEP(operatorDFTClass & operatorMatrix, elpaScalaManager & elpaScala, - std::vector & X, + T * X, const unsigned int numberComponents, + const unsigned int numberDofs, const MPI_Comm & mpiCommParent, const MPI_Comm & interBandGroupComm, const MPI_Comm & mpiCommDomain, @@ -674,8 +677,9 @@ namespace dftfe void rayleighRitz(operatorDFTClass & operatorMatrix, elpaScalaManager & elpaScala, - std::vector & X, + T * X, const unsigned int numberComponents, + const unsigned int numberDofs, const MPI_Comm & mpiCommParent, const MPI_Comm & interBandGroupComm, const MPI_Comm & mpiCommDomain, @@ -702,9 +706,10 @@ namespace dftfe void rayleighRitzGEPSpectrumSplitDirect(operatorDFTClass & operatorMatrix, elpaScalaManager & 
elpaScala, - std::vector & X, - std::vector & Y, + T * X, + T * Y, const unsigned int numberComponents, + const unsigned int numberDofs, const unsigned int numberCoreStates, const MPI_Comm & mpiCommParent, const MPI_Comm & interBandGroupComm, @@ -731,18 +736,19 @@ namespace dftfe */ template void - rayleighRitzSpectrumSplitDirect(operatorDFTClass & operatorMatrix, - elpaScalaManager & elpaScala, - const std::vector &X, - std::vector & Y, - const unsigned int numberComponents, - const unsigned int numberCoreStates, - const MPI_Comm & mpiCommParent, - const MPI_Comm & interBandGroupComm, - const MPI_Comm & mpiCommDomain, - const bool useMixedPrec, - std::vector & eigenValues, - const dftParameters & dftParams); + rayleighRitzSpectrumSplitDirect(operatorDFTClass & operatorMatrix, + elpaScalaManager & elpaScala, + const T * X, + T * Y, + const unsigned int numberComponents, + const unsigned int numberDofs, + const unsigned int numberCoreStates, + const MPI_Comm & mpiCommParent, + const MPI_Comm & interBandGroupComm, + const MPI_Comm & mpiCommDomain, + const bool useMixedPrec, + std::vector &eigenValues, + const dftParameters &dftParams); /** @brief Compute residual norm associated with eigenValue problem of the given operator @@ -757,8 +763,10 @@ namespace dftfe template void computeEigenResidualNorm(operatorDFTClass & operatorMatrix, - std::vector & X, + T * X, const std::vector &eigenValues, + const unsigned int numberComponents, + const unsigned int numberDofs, const MPI_Comm & mpiCommParent, const MPI_Comm & mpiCommDomain, const MPI_Comm & interBandGroupComm, @@ -772,8 +780,9 @@ namespace dftfe void densityMatrixEigenBasisFirstOrderResponse( operatorDFTClass & operatorMatrix, - std::vector & X, + T * X, const unsigned int N, + const unsigned int numberLocalDofs, const MPI_Comm & mpiCommParent, const MPI_Comm & mpiCommDomain, const MPI_Comm & interBandGroupComm, diff --git a/include/operator.h b/include/operator.h index 7395eb718..3c74807d9 100644 --- 
a/include/operator.h +++ b/include/operator.h @@ -171,9 +171,10 @@ namespace dftfe * @param ProjMatrix projected small matrix */ virtual void - XtHX(const std::vector &X, - const unsigned int numberComponents, - std::vector & ProjHam) = 0; + XtHX(const dataTypes::number * X, + const unsigned int numberComponents, + const unsigned int numberLocalDofs, + std::vector &ProjHam) = 0; /** * @brief Compute projection of the operator into a subspace spanned by a given orthogonal basis HProjConj=X^{T}*HConj*XConj @@ -185,8 +186,9 @@ namespace dftfe * of the operation into the given subspace */ virtual void - XtHX(const std::vector & X, + XtHX(const dataTypes::number * X, const unsigned int numberComponents, + const unsigned int numberLocalDofs, const std::shared_ptr &processGrid, dftfe::ScaLAPACKMatrix & projHamPar, const bool onlyHPrimePartForFirstOrderDensityMatResponse = false) = 0; @@ -207,9 +209,10 @@ namespace dftfe */ virtual void XtHXMixedPrec( - const std::vector & X, + const dataTypes::number * X, const unsigned int totalNumberComponents, const unsigned int singlePrecComponents, + const unsigned int numberLocalDofs, const std::shared_ptr &processGrid, dftfe::ScaLAPACKMatrix & projHamPar, const bool onlyHPrimePartForFirstOrderDensityMatResponse = false) = 0; diff --git a/include/operatorDevice.h b/include/operatorDevice.h index 5e980b89b..be38b5ff5 100644 --- a/include/operatorDevice.h +++ b/include/operatorDevice.h @@ -97,18 +97,6 @@ namespace dftfe dftfe::utils::MemorySpace::DEVICE> & getShapeFunctionValuesNLPTransposed() = 0; - virtual dftfe::utils::MemoryStorage & - getShapeFunctionGradientValuesXTransposed() = 0; - - virtual dftfe::utils::MemoryStorage & - getShapeFunctionGradientValuesYTransposed() = 0; - - virtual dftfe::utils::MemoryStorage & - getShapeFunctionGradientValuesZTransposed() = 0; - virtual dftfe::utils::MemoryStorage & getShapeFunctionGradientValuesNLPTransposed() = 0; @@ -426,14 +414,14 @@ namespace dftfe dftfe::utils::MemoryStorage 
d_shapeFunctionValueNLPTransposedDevice; - dftfe::utils::MemoryStorage - d_shapeFunctionGradientValueXTransposedDevice; + // dftfe::utils::MemoryStorage + // d_shapeFunctionGradientValueXTransposedDevice; - dftfe::utils::MemoryStorage - d_shapeFunctionGradientValueYTransposedDevice; + // dftfe::utils::MemoryStorage + // d_shapeFunctionGradientValueYTransposedDevice; - dftfe::utils::MemoryStorage - d_shapeFunctionGradientValueZTransposedDevice; + // dftfe::utils::MemoryStorage + // d_shapeFunctionGradientValueZTransposedDevice; dftfe::utils::MemoryStorage d_shapeFunctionGradientValueNLPTransposedDevice; @@ -455,10 +443,6 @@ namespace dftfe dftfe::utils::MemorySpace::DEVICE> d_cellWaveFunctionMatrix; - distributedDeviceVec d_parallelChebyBlockVectorDevice; - - distributedDeviceVec d_parallelChebyBlockVector2Device; - distributedDeviceVec d_parallelProjectorKetTimesBlockVectorDevice; diff --git a/include/vectorUtilities.h b/include/vectorUtilities.h index 5fbbcf4f7..890ddb1a1 100644 --- a/include/vectorUtilities.h +++ b/include/vectorUtilities.h @@ -169,8 +169,9 @@ namespace dftfe */ void copyFlattenedSTLVecToSingleCompVec( - const std::vector> & flattenedArray, + const std::complex * flattenedArray, const unsigned int totalNumberComponents, + const unsigned int localVectorSize, const std::pair componentIndexRange, const std::vector &localProcDofIndicesReal, @@ -180,8 +181,9 @@ namespace dftfe void copyFlattenedSTLVecToSingleCompVec( - const std::vector> & flattenedArray, + const std::complex * flattenedArray, const unsigned int totalNumberComponents, + const unsigned int localVectorSize, const std::pair componentIndexRange, std::vector> &componentVectors); @@ -206,8 +208,9 @@ namespace dftfe */ void copyFlattenedSTLVecToSingleCompVec( - const std::vector & flattenedArray, + const double * flattenedArray, const unsigned int totalNumberComponents, + const unsigned int localVectorSize, const std::pair componentIndexRange, std::vector> & componentVectors); diff --git 
a/src/dft/computeOutputDensityDirectionalDerivative.cc b/src/dft/computeOutputDensityDirectionalDerivative.cc index 599550180..c1deaa4ff 100644 --- a/src/dft/computeOutputDensityDirectionalDerivative.cc +++ b/src/dft/computeOutputDensityDirectionalDerivative.cc @@ -53,7 +53,7 @@ namespace dftfe d_eigenVectorsFlattenedDevice.begin()); #endif if (!d_dftParamsPtr->useDevice) - d_eigenVectorsDensityMatrixPrimeSTL = d_eigenVectorsFlattenedSTL; + d_eigenVectorsDensityMatrixPrimeHost = d_eigenVectorsFlattenedHost; // set up linear solver @@ -465,7 +465,7 @@ namespace dftfe d_eigenVectorsDensityMatrixPrimeFlattenedDevice.begin(), d_densityMatDerFermiEnergy, d_numEigenValues, - d_eigenVectorsFlattenedSTL[0].size() / d_numEigenValues, + matrix_free_data.get_vector_partitioner()->locally_owned_size(), kohnShamDFTEigenOperatorDevice, d_eigenDofHandlerIndex, dofHandler, @@ -488,7 +488,7 @@ namespace dftfe d_eigenVectorsDensityMatrixPrimeFlattenedDevice.begin(), d_densityMatDerFermiEnergy, d_numEigenValues, - d_eigenVectorsFlattenedSTL[0].size() / d_numEigenValues, + matrix_free_data.get_vector_partitioner()->locally_owned_size(), kohnShamDFTEigenOperatorDevice, d_eigenDofHandlerIndex, dofHandler, @@ -511,11 +511,11 @@ namespace dftfe if (d_dftParamsPtr->singlePrecLRD) computeRhoFirstOrderResponseCPUMixedPrec( - d_eigenVectorsFlattenedSTL, - d_eigenVectorsDensityMatrixPrimeSTL, + d_eigenVectorsFlattenedHost.data(), + d_eigenVectorsDensityMatrixPrimeHost.data(), d_densityMatDerFermiEnergy, d_numEigenValues, - d_eigenVectorsFlattenedSTL[0].size() / d_numEigenValues, + matrix_free_data.get_vector_partitioner()->locally_owned_size(), kohnShamDFTEigenOperatorCPU, d_eigenDofHandlerIndex, dofHandler, @@ -533,11 +533,11 @@ namespace dftfe *d_dftParamsPtr); else computeRhoFirstOrderResponseCPU( - d_eigenVectorsFlattenedSTL, - d_eigenVectorsDensityMatrixPrimeSTL, + d_eigenVectorsFlattenedHost.data(), + d_eigenVectorsDensityMatrixPrimeHost.data(), d_densityMatDerFermiEnergy, 
d_numEigenValues, - d_eigenVectorsFlattenedSTL[0].size() / d_numEigenValues, + matrix_free_data.get_vector_partitioner()->locally_owned_size(), kohnShamDFTEigenOperatorCPU, d_eigenDofHandlerIndex, dofHandler, diff --git a/src/dft/density.cc b/src/dft/density.cc index 3722b3cf9..20596afb8 100644 --- a/src/dft/density.cc +++ b/src/dft/density.cc @@ -19,10 +19,7 @@ // source file for electron density related computations #include -#include -#ifdef DFTFE_WITH_DEVICE -# include -#endif +#include namespace dftfe { @@ -193,66 +190,56 @@ namespace dftfe #ifdef DFTFE_WITH_DEVICE if (d_dftParamsPtr->useDevice) - Device::computeRhoFromPSI( - d_eigenVectorsFlattenedDevice.begin(), - d_eigenVectorsRotFracFlattenedDevice.begin(), - d_numEigenValues, - d_numEigenValuesRR, - d_eigenVectorsFlattenedSTL[0].size() / d_numEigenValues, - eigenValues, - fermiEnergy, - fermiEnergyUp, - fermiEnergyDown, - kohnShamDFTEigenOperator, - d_eigenDofHandlerIndex, - dofHandler, - matrix_free_data.n_physical_cells(), - matrix_free_data.get_dofs_per_cell(d_densityDofHandlerIndex), - matrix_free_data.get_quadrature(d_densityQuadratureId).size(), - d_kPointWeights, - rhoOutValues.get(), - gradRhoOutValues.get(), - rhoOutValuesSpinPolarized.get(), - gradRhoOutValuesSpinPolarized.get(), - d_excManagerPtr->getDensityBasedFamilyType() == - densityFamilyType::GGA, - d_mpiCommParent, - interpoolcomm, - interBandGroupComm, - *d_dftParamsPtr, - isConsiderSpectrumSplitting && - d_numEigenValues != d_numEigenValuesRR); + computeRhoFromPSI(&d_eigenVectorsFlattenedDevice, + &d_eigenVectorsRotFracFlattenedDevice, + d_numEigenValues, + d_numEigenValuesRR, + eigenValues, + fermiEnergy, + fermiEnergyUp, + fermiEnergyDown, + basisOperationsPtrDevice, + d_densityDofHandlerIndex, + d_densityQuadratureId, + d_kPointWeights, + rhoOutValues.get(), + gradRhoOutValues.get(), + rhoOutValuesSpinPolarized.get(), + gradRhoOutValuesSpinPolarized.get(), + d_excManagerPtr->getDensityBasedFamilyType() == + densityFamilyType::GGA, + 
d_mpiCommParent, + interpoolcomm, + interBandGroupComm, + *d_dftParamsPtr, + isConsiderSpectrumSplitting && + d_numEigenValues != d_numEigenValuesRR); #endif if (!d_dftParamsPtr->useDevice) - computeRhoFromPSICPU( - d_eigenVectorsFlattenedSTL, - d_eigenVectorsRotFracDensityFlattenedSTL, - d_numEigenValues, - d_numEigenValuesRR, - d_eigenVectorsFlattenedSTL[0].size() / d_numEigenValues, - eigenValues, - fermiEnergy, - fermiEnergyUp, - fermiEnergyDown, - kohnShamDFTEigenOperatorCPU, - dofHandler, - matrix_free_data.n_physical_cells(), - matrix_free_data.get_dofs_per_cell(d_densityDofHandlerIndex), - matrix_free_data.get_quadrature(d_densityQuadratureId).size(), - d_kPointWeights, - rhoOutValues.get(), - gradRhoOutValues.get(), - rhoOutValuesSpinPolarized.get(), - gradRhoOutValuesSpinPolarized.get(), - d_excManagerPtr->getDensityBasedFamilyType() == - densityFamilyType::GGA, - d_mpiCommParent, - interpoolcomm, - interBandGroupComm, - *d_dftParamsPtr, - isConsiderSpectrumSplitting && - d_numEigenValues != d_numEigenValuesRR, - false); + computeRhoFromPSI(&d_eigenVectorsFlattenedHost, + &d_eigenVectorsRotFracDensityFlattenedHost, + d_numEigenValues, + d_numEigenValuesRR, + eigenValues, + fermiEnergy, + fermiEnergyUp, + fermiEnergyDown, + basisOperationsPtrHost, + d_densityDofHandlerIndex, + d_densityQuadratureId, + d_kPointWeights, + rhoOutValues.get(), + gradRhoOutValues.get(), + rhoOutValuesSpinPolarized.get(), + gradRhoOutValuesSpinPolarized.get(), + d_excManagerPtr->getDensityBasedFamilyType() == + densityFamilyType::GGA, + d_mpiCommParent, + interpoolcomm, + interBandGroupComm, + *d_dftParamsPtr, + isConsiderSpectrumSplitting && + d_numEigenValues != d_numEigenValuesRR); // normalizeRhoOutQuadValues(); if (isGroundState) @@ -611,63 +598,54 @@ namespace dftfe // nodes in each cell #ifdef DFTFE_WITH_DEVICE if (d_dftParamsPtr->useDevice) - Device::computeRhoFromPSI( - d_eigenVectorsFlattenedDevice.begin(), - d_eigenVectorsRotFracFlattenedDevice.begin(), - 
d_numEigenValues, - d_numEigenValuesRR, - d_eigenVectorsFlattenedSTL[0].size() / d_numEigenValues, - eigenValues, - fermiEnergy, - fermiEnergyUp, - fermiEnergyDown, - kohnShamDFTEigenOperator, - d_eigenDofHandlerIndex, - dofHandler, - matrix_free_data.n_physical_cells(), - matrix_free_data.get_dofs_per_cell(d_densityDofHandlerIndex), - quadrature_formula.size(), - d_kPointWeights, - &rhoPRefinedNodalData, - &_gradRhoValues, - &rhoPRefinedSpinPolarizedNodalData, - &_gradRhoValuesSpinPolarized, - false, - d_mpiCommParent, - interpoolcomm, - interBandGroupComm, - *d_dftParamsPtr, - isConsiderSpectrumSplitting && d_numEigenValues != d_numEigenValuesRR, - true); + computeRhoFromPSI(&d_eigenVectorsFlattenedDevice, + &d_eigenVectorsRotFracFlattenedDevice, + d_numEigenValues, + d_numEigenValuesRR, + eigenValues, + fermiEnergy, + fermiEnergyUp, + fermiEnergyDown, + basisOperationsPtrDevice, + d_densityDofHandlerIndex, + d_gllQuadratureId, + d_kPointWeights, + &rhoPRefinedNodalData, + &_gradRhoValues, + &rhoPRefinedSpinPolarizedNodalData, + &_gradRhoValuesSpinPolarized, + false, + d_mpiCommParent, + interpoolcomm, + interBandGroupComm, + *d_dftParamsPtr, + isConsiderSpectrumSplitting && + d_numEigenValues != d_numEigenValuesRR); #endif if (!d_dftParamsPtr->useDevice) - computeRhoFromPSICPU( - d_eigenVectorsFlattenedSTL, - d_eigenVectorsRotFracDensityFlattenedSTL, - d_numEigenValues, - d_numEigenValuesRR, - d_eigenVectorsFlattenedSTL[0].size() / d_numEigenValues, - eigenValues, - fermiEnergy, - fermiEnergyUp, - fermiEnergyDown, - kohnShamDFTEigenOperatorCPU, - dofHandler, - matrix_free_data.n_physical_cells(), - matrix_free_data.get_dofs_per_cell(d_densityDofHandlerIndex), - quadrature_formula.size(), - d_kPointWeights, - &rhoPRefinedNodalData, - &_gradRhoValues, - &rhoPRefinedSpinPolarizedNodalData, - &_gradRhoValuesSpinPolarized, - false, - d_mpiCommParent, - interpoolcomm, - interBandGroupComm, - *d_dftParamsPtr, - isConsiderSpectrumSplitting && d_numEigenValues != 
d_numEigenValuesRR, - true); + computeRhoFromPSI(&d_eigenVectorsFlattenedHost, + &d_eigenVectorsRotFracDensityFlattenedHost, + d_numEigenValues, + d_numEigenValuesRR, + eigenValues, + fermiEnergy, + fermiEnergyUp, + fermiEnergyDown, + basisOperationsPtrHost, + d_densityDofHandlerIndex, + d_gllQuadratureId, + d_kPointWeights, + &rhoPRefinedNodalData, + &_gradRhoValues, + &rhoPRefinedSpinPolarizedNodalData, + &_gradRhoValuesSpinPolarized, + false, + d_mpiCommParent, + interpoolcomm, + interBandGroupComm, + *d_dftParamsPtr, + isConsiderSpectrumSplitting && + d_numEigenValues != d_numEigenValuesRR); // copy Lobatto quadrature data to fill in 2p DoFHandler nodal data dealii::DoFHandler<3>::active_cell_iterator cellP = d_dofHandlerRhoNodal diff --git a/src/dft/densityCalculator.cc b/src/dft/densityCalculator.cc new file mode 100644 index 000000000..9e4f1dbeb --- /dev/null +++ b/src/dft/densityCalculator.cc @@ -0,0 +1,772 @@ +// --------------------------------------------------------------------- +// +// Copyright (c) 2017-2022 The Regents of the University of Michigan and DFT-FE +// authors. +// +// This file is part of the DFT-FE code. +// +// The DFT-FE code is free software; you can use it, redistribute +// it, and/or modify it under the terms of the GNU Lesser General +// Public License as published by the Free Software Foundation; either +// version 2.1 of the License, or (at your option) any later version. +// The full text of the license can be found in the file LICENSE at +// the top level of the DFT-FE distribution. 
+// +// --------------------------------------------------------------------- +// +// @author Sambit Das +// + +// source file for electron density related computations +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace dftfe +{ + template + void + computeRhoFromPSI( + const dftfe::utils::MemoryStorage *X, + const dftfe::utils::MemoryStorage *XFrac, + const unsigned int totalNumWaveFunctions, + const unsigned int Nfr, + const std::vector> &eigenValues, + const double fermiEnergy, + const double fermiEnergyUp, + const double fermiEnergyDown, + std::shared_ptr< + dftfe::basis::FEBasisOperations> + & basisOperationsPtr, + const unsigned int matrixFreeDofhandlerIndex, + const unsigned int quadratureIndex, + const std::vector & kPointWeights, + std::map> *rhoValues, + std::map> *gradRhoValues, + std::map> *rhoValuesSpinPolarized, + std::map> *gradRhoValuesSpinPolarized, + const bool isEvaluateGradRho, + const MPI_Comm & mpiCommParent, + const MPI_Comm & interpoolcomm, + const MPI_Comm & interBandGroupComm, + const dftParameters & dftParams, + const bool spectrumSplit) + { + int this_process; + MPI_Comm_rank(mpiCommParent, &this_process); +#if defined(DFTFE_WITH_DEVICE) + dftfe::utils::deviceSynchronize(); +#endif + MPI_Barrier(mpiCommParent); + double computeRho_time = MPI_Wtime(); + const unsigned int numKPoints = kPointWeights.size(); + const unsigned int numLocalDofs = basisOperationsPtr->nOwnedDofs(); + const unsigned int totalLocallyOwnedCells = basisOperationsPtr->nCells(); + const unsigned int numNodesPerElement = basisOperationsPtr->nDofsPerCell(); + // band group parallelization data structures + const unsigned int numberBandGroups = + dealii::Utilities::MPI::n_mpi_processes(interBandGroupComm); + const unsigned int bandGroupTaskId = + dealii::Utilities::MPI::this_mpi_process(interBandGroupComm); + std::vector bandGroupLowHighPlusOneIndices; + 
dftUtils::createBandParallelizationIndices(interBandGroupComm, + totalNumWaveFunctions, + bandGroupLowHighPlusOneIndices); + + const unsigned int BVec = + std::min(dftParams.chebyWfcBlockSize, bandGroupLowHighPlusOneIndices[1]); + + const double spinPolarizedFactor = + (dftParams.spinPolarized == 1) ? 1.0 : 2.0; + const unsigned int numSpinComponents = + (dftParams.spinPolarized == 1) ? 2 : 1; + + const NumberType zero = 0; + const NumberType scalarCoeffAlphaRho = 1.0; + const NumberType scalarCoeffBetaRho = 1.0; + const NumberType scalarCoeffAlphaGradRho = 1.0; + const NumberType scalarCoeffBetaGradRho = 1.0; + + const unsigned int cellsBlockSize = + memorySpace == dftfe::utils::MemorySpace::DEVICE ? 50 : 1; + const unsigned int numCellBlocks = totalLocallyOwnedCells / cellsBlockSize; + const unsigned int remCellBlockSize = + totalLocallyOwnedCells - numCellBlocks * cellsBlockSize; + basisOperationsPtr->reinit(BVec, cellsBlockSize, quadratureIndex); + const unsigned int numQuadPoints = basisOperationsPtr->nQuadsPerCell(); + + std::vector> + wfcQuadPointData(numSpinComponents); + std::vector> + gradWfcQuadPointData(numSpinComponents); + std::vector> + rhoWfcContributions(numSpinComponents); + std::vector> + gradRhoWfcContributions(numSpinComponents); + dftfe::utils::MemoryStorage + rhoHost; + + dftfe::utils::MemoryStorage + gradRhoHost; +#if defined(DFTFE_WITH_DEVICE) + dftfe::utils::MemoryStorage rho; + dftfe::utils::MemoryStorage gradRho; +#else + auto &rho = rhoHost; + auto &gradRho = gradRhoHost; +#endif + + rho.resize(totalLocallyOwnedCells * numQuadPoints * numSpinComponents, 0.0); + for (unsigned int spinIndex = 0; spinIndex < numSpinComponents; ++spinIndex) + { + wfcQuadPointData[spinIndex].resize(cellsBlockSize * numQuadPoints * + BVec, + zero); + + if (memorySpace == dftfe::utils::MemorySpace::DEVICE) + rhoWfcContributions[spinIndex].resize(cellsBlockSize * numQuadPoints * + BVec, + 0.0); + } + if (isEvaluateGradRho) + { + 
gradRho.resize(totalLocallyOwnedCells * numQuadPoints * 3 * + numSpinComponents, + 0.0); + for (unsigned int spinIndex = 0; spinIndex < numSpinComponents; + ++spinIndex) + { + gradWfcQuadPointData[spinIndex].resize(cellsBlockSize * + numQuadPoints * BVec * 3, + zero); + if (memorySpace == dftfe::utils::MemorySpace::DEVICE) + gradRhoWfcContributions[spinIndex].resize( + cellsBlockSize * numQuadPoints * BVec * 3, 0.0); + } + } + + + + std::vector< + dftfe::utils::MemoryStorage> + partialOccupVecHost( + numSpinComponents, + dftfe::utils::MemoryStorage( + BVec, 0.0)); +#if defined(DFTFE_WITH_DEVICE) + std::vector> + partialOccupVec(numSpinComponents); + for (unsigned int spinIndex = 0; spinIndex < numSpinComponents; ++spinIndex) + partialOccupVec[spinIndex].resize(partialOccupVecHost[spinIndex].size()); +#else + auto &partialOccupVec = partialOccupVecHost; +#endif + + std::vector *> + flattenedArrayBlock(numSpinComponents); + + dftfe::utils::MemoryStorage cellWaveFunctionMatrix( + cellsBlockSize * numNodesPerElement * BVec); + + for (unsigned int kPoint = 0; kPoint < kPointWeights.size(); ++kPoint) + { + for (unsigned int spinIndex = 0; spinIndex < numSpinComponents; + ++spinIndex) + { + wfcQuadPointData[spinIndex].setValue(zero); + gradWfcQuadPointData[spinIndex].setValue(zero); + rhoWfcContributions[spinIndex].setValue(0.0); + gradRhoWfcContributions[spinIndex].setValue(0.0); + } + for (unsigned int jvec = 0; jvec < totalNumWaveFunctions; jvec += BVec) + { + const unsigned int currentBlockSize = + std::min(BVec, totalNumWaveFunctions - jvec); + for (unsigned int spinIndex = 0; spinIndex < numSpinComponents; + ++spinIndex) + flattenedArrayBlock[spinIndex] = + &(basisOperationsPtr->getMultiVector(currentBlockSize, + spinIndex)); + + if ((jvec + currentBlockSize) <= + bandGroupLowHighPlusOneIndices[2 * bandGroupTaskId + 1] && + (jvec + currentBlockSize) > + bandGroupLowHighPlusOneIndices[2 * bandGroupTaskId]) + { + for (unsigned int spinIndex = 0; spinIndex < 
numSpinComponents; + ++spinIndex) + if (spectrumSplit) + { + partialOccupVecHost[spinIndex].setValue( + kPointWeights[kPoint] * spinPolarizedFactor); + } + else + { + if (dftParams.constraintMagnetization) + { + const double fermiEnergyConstraintMag = + spinIndex == 0 ? fermiEnergyUp : fermiEnergyDown; + for (unsigned int iEigenVec = 0; + iEigenVec < currentBlockSize; + ++iEigenVec) + { + if (eigenValues[kPoint][totalNumWaveFunctions * + spinIndex + + jvec + iEigenVec] > + fermiEnergyConstraintMag) + *(partialOccupVecHost[spinIndex].begin() + + iEigenVec) = 0; + else + *(partialOccupVecHost[spinIndex].begin() + + iEigenVec) = + kPointWeights[kPoint] * spinPolarizedFactor; + } + } + else + { + for (unsigned int iEigenVec = 0; + iEigenVec < currentBlockSize; + ++iEigenVec) + { + *(partialOccupVecHost[spinIndex].begin() + + iEigenVec) = + dftUtils::getPartialOccupancy( + eigenValues[kPoint][totalNumWaveFunctions * + spinIndex + + jvec + iEigenVec], + fermiEnergy, + C_kb, + dftParams.TVal) * + kPointWeights[kPoint] * spinPolarizedFactor; + } + } + } +#if defined(DFTFE_WITH_DEVICE) + for (unsigned int spinIndex = 0; spinIndex < numSpinComponents; + ++spinIndex) + partialOccupVec[spinIndex].copyFrom( + partialOccupVecHost[spinIndex]); +#endif + for (unsigned int spinIndex = 0; spinIndex < numSpinComponents; + ++spinIndex) + if (memorySpace == dftfe::utils::MemorySpace::HOST) + for (unsigned int iNode = 0; iNode < numLocalDofs; ++iNode) + std::memcpy(flattenedArrayBlock[spinIndex]->data() + + iNode * currentBlockSize, + X->data() + + numLocalDofs * totalNumWaveFunctions * + (numSpinComponents * kPoint + spinIndex) + + iNode * totalNumWaveFunctions + jvec, + currentBlockSize * sizeof(NumberType)); +#if defined(DFTFE_WITH_DEVICE) + else if (memorySpace == dftfe::utils::MemorySpace::DEVICE) + dftfe::utils::deviceKernelsGeneric:: + stridedCopyToBlockConstantStride( + currentBlockSize, + totalNumWaveFunctions, + numLocalDofs, + jvec, + X->data() + numLocalDofs * 
totalNumWaveFunctions * + (numSpinComponents * kPoint + spinIndex), + flattenedArrayBlock[spinIndex]->data()); +#endif + + + basisOperationsPtr->reinit(currentBlockSize, + cellsBlockSize, + quadratureIndex, + false); + + + for (unsigned int spinIndex = 0; spinIndex < numSpinComponents; + ++spinIndex) + { + flattenedArrayBlock[spinIndex]->updateGhostValues(); + basisOperationsPtr->distribute( + *(flattenedArrayBlock[spinIndex])); + } + + for (int iblock = 0; iblock < (numCellBlocks + 1); iblock++) + { + const unsigned int currentCellsBlockSize = + (iblock == numCellBlocks) ? remCellBlockSize : + cellsBlockSize; + if (currentCellsBlockSize > 0) + { + const unsigned int startingCellId = + iblock * cellsBlockSize; + + for (unsigned int spinIndex = 0; + spinIndex < numSpinComponents; + ++spinIndex) + basisOperationsPtr->interpolateKernel( + *(flattenedArrayBlock[spinIndex]), + wfcQuadPointData[spinIndex].data(), + isEvaluateGradRho ? + gradWfcQuadPointData[spinIndex].data() : + NULL, + std::pair( + startingCellId, + startingCellId + currentCellsBlockSize)); + + for (unsigned int spinIndex = 0; + spinIndex < numSpinComponents; + ++spinIndex) + computeRhoGradRhoFromInterpolatedValues( + basisOperationsPtr, + std::pair( + startingCellId, + startingCellId + currentCellsBlockSize), + std::pair( + jvec, jvec + currentBlockSize), + partialOccupVec[spinIndex].data(), + wfcQuadPointData[spinIndex].data(), + gradWfcQuadPointData[spinIndex].data(), + rhoWfcContributions[spinIndex].data(), + gradRhoWfcContributions[spinIndex].data(), + rho.data() + spinIndex * totalLocallyOwnedCells * + numQuadPoints, + gradRho.data() + spinIndex * + totalLocallyOwnedCells * + numQuadPoints * 3, + isEvaluateGradRho); + } // non-trivial cell block check + } // cells block loop + } + } + + if (spectrumSplit) + for (unsigned int jvec = 0; jvec < Nfr; jvec += BVec) + { + const unsigned int currentBlockSize = std::min(BVec, Nfr - jvec); + for (unsigned int spinIndex = 0; spinIndex < numSpinComponents; + 
++spinIndex) + flattenedArrayBlock[spinIndex] = + &(basisOperationsPtr->getMultiVector(currentBlockSize, + spinIndex)); + if ((jvec + totalNumWaveFunctions - Nfr + currentBlockSize) <= + bandGroupLowHighPlusOneIndices[2 * bandGroupTaskId + 1] && + (jvec + totalNumWaveFunctions - Nfr + currentBlockSize) > + bandGroupLowHighPlusOneIndices[2 * bandGroupTaskId]) + { + for (unsigned int spinIndex = 0; + spinIndex < numSpinComponents; + ++spinIndex) + if (dftParams.constraintMagnetization) + { + const double fermiEnergyConstraintMag = + spinIndex == 0 ? fermiEnergyUp : fermiEnergyDown; + for (unsigned int iEigenVec = 0; + iEigenVec < currentBlockSize; + ++iEigenVec) + { + if (eigenValues[kPoint] + [totalNumWaveFunctions * spinIndex + + (totalNumWaveFunctions - Nfr) + + jvec + iEigenVec] > + fermiEnergyConstraintMag) + *(partialOccupVecHost[spinIndex].begin() + + iEigenVec) = + -kPointWeights[kPoint] * spinPolarizedFactor; + else + *(partialOccupVecHost[spinIndex].begin() + + iEigenVec) = 0; + } + } + else + { + for (unsigned int iEigenVec = 0; + iEigenVec < currentBlockSize; + ++iEigenVec) + { + *(partialOccupVecHost[spinIndex].begin() + + iEigenVec) = + (dftUtils::getPartialOccupancy( + eigenValues[kPoint] + [totalNumWaveFunctions * spinIndex + + (totalNumWaveFunctions - Nfr) + + jvec + iEigenVec], + fermiEnergy, + C_kb, + dftParams.TVal) - + 1.0) * + kPointWeights[kPoint] * spinPolarizedFactor; + } + } + +#if defined(DFTFE_WITH_DEVICE) + for (unsigned int spinIndex = 0; + spinIndex < numSpinComponents; + ++spinIndex) + { + partialOccupVec[spinIndex].resize( + partialOccupVecHost[spinIndex].size()); + partialOccupVec[spinIndex].copyFrom( + partialOccupVecHost[spinIndex]); + } +#endif + for (unsigned int spinIndex = 0; + spinIndex < numSpinComponents; + ++spinIndex) + if (memorySpace == dftfe::utils::MemorySpace::HOST) + for (unsigned int iNode = 0; iNode < numLocalDofs; + ++iNode) + std::memcpy(flattenedArrayBlock[spinIndex]->data() + + iNode * currentBlockSize, + 
XFrac->data() + + numLocalDofs * Nfr * + (numSpinComponents * kPoint + + spinIndex) + + iNode * Nfr + jvec, + currentBlockSize * sizeof(NumberType)); +#if defined(DFTFE_WITH_DEVICE) + else if (memorySpace == dftfe::utils::MemorySpace::DEVICE) + dftfe::utils::deviceKernelsGeneric:: + stridedCopyToBlockConstantStride( + currentBlockSize, + Nfr, + numLocalDofs, + jvec, + XFrac->data() + + numLocalDofs * Nfr * + (numSpinComponents * kPoint + spinIndex), + flattenedArrayBlock[spinIndex]->data()); +#endif + basisOperationsPtr->reinit(currentBlockSize, + cellsBlockSize, + quadratureIndex, + false); + + + for (unsigned int spinIndex = 0; + spinIndex < numSpinComponents; + ++spinIndex) + { + flattenedArrayBlock[spinIndex]->updateGhostValues(); + basisOperationsPtr->distribute( + *(flattenedArrayBlock[spinIndex])); + } + + for (int iblock = 0; iblock < (numCellBlocks + 1); iblock++) + { + const unsigned int currentCellsBlockSize = + (iblock == numCellBlocks) ? remCellBlockSize : + cellsBlockSize; + if (currentCellsBlockSize > 0) + { + const unsigned int startingCellId = + iblock * cellsBlockSize; + for (unsigned int spinIndex = 0; + spinIndex < numSpinComponents; + ++spinIndex) + basisOperationsPtr->interpolateKernel( + *(flattenedArrayBlock[spinIndex]), + wfcQuadPointData[spinIndex].data(), + isEvaluateGradRho ? 
+ gradWfcQuadPointData[spinIndex].data() : + NULL, + std::pair( + startingCellId, + startingCellId + currentCellsBlockSize)); + + for (unsigned int spinIndex = 0; + spinIndex < numSpinComponents; + ++spinIndex) + computeRhoGradRhoFromInterpolatedValues( + basisOperationsPtr, + std::pair( + startingCellId, + startingCellId + currentCellsBlockSize), + std::pair( + jvec, jvec + currentBlockSize), + partialOccupVec[spinIndex].data(), + wfcQuadPointData[spinIndex].data(), + gradWfcQuadPointData[spinIndex].data(), + rhoWfcContributions[spinIndex].data(), + gradRhoWfcContributions[spinIndex].data(), + rho.data() + spinIndex * totalLocallyOwnedCells * + numQuadPoints, + gradRho.data() + spinIndex * + totalLocallyOwnedCells * + numQuadPoints * 3, + isEvaluateGradRho); + } // non-trivial cells block + } // cells block loop + } + } // spectrum split block + } +#if defined(DFTFE_WITH_DEVICE) + rhoHost.resize(rho.size()); + rhoHost.copyFrom(rho); + if (isEvaluateGradRho) + { + gradRhoHost.resize(gradRho.size()); + gradRhoHost.copyFrom(gradRho); + } +#endif + + int size; + MPI_Comm_size(interpoolcomm, &size); + if (size > 1) + { + MPI_Allreduce(MPI_IN_PLACE, + rhoHost.data(), + totalLocallyOwnedCells * numQuadPoints * + numSpinComponents, + dataTypes::mpi_type_id(rhoHost.data()), + MPI_SUM, + interpoolcomm); + if (isEvaluateGradRho) + MPI_Allreduce(MPI_IN_PLACE, + gradRhoHost.data(), + totalLocallyOwnedCells * numQuadPoints * + numSpinComponents * 3, + dataTypes::mpi_type_id(gradRhoHost.data()), + MPI_SUM, + interpoolcomm); + } + MPI_Comm_size(interBandGroupComm, &size); + if (size > 1) + { + MPI_Allreduce(MPI_IN_PLACE, + rhoHost.data(), + totalLocallyOwnedCells * numQuadPoints * + numSpinComponents, + dataTypes::mpi_type_id(rhoHost.data()), + MPI_SUM, +
interBandGroupComm); + } + + for (unsigned int iElem = 0; iElem < totalLocallyOwnedCells; ++iElem) + { + const dealii::CellId cellid = basisOperationsPtr->cellID(iElem); + + std::vector dummy(1); + std::vector &tempRhoQuads = (*rhoValues)[cellid]; + std::vector &tempGradRhoQuads = + isEvaluateGradRho ? (*gradRhoValues)[cellid] : dummy; + + std::vector &tempRhoQuadsSP = + (dftParams.spinPolarized == 1) ? (*rhoValuesSpinPolarized)[cellid] : + dummy; + std::vector &tempGradRhoQuadsSP = + ((dftParams.spinPolarized == 1) && isEvaluateGradRho) ? + (*gradRhoValuesSpinPolarized)[cellid] : + dummy; + + if (dftParams.spinPolarized == 1) + { + for (unsigned int q = 0; q < numQuadPoints; ++q) + { + const double rho0 = rhoHost[iElem * numQuadPoints + q]; + const double rho1 = + rhoHost[totalLocallyOwnedCells * numQuadPoints + + iElem * numQuadPoints + q]; + tempRhoQuadsSP[2 * q + 0] = rho0; + + tempRhoQuadsSP[2 * q + 1] = rho1; + tempRhoQuads[q] = rho0 + rho1; + } + + if (isEvaluateGradRho) + for (unsigned int q = 0; q < numQuadPoints; ++q) + { + const double gradRho0x = + gradRhoHost[iElem * numQuadPoints * 3 + 3 * q]; + const double gradRho0y = + gradRhoHost[iElem * numQuadPoints * 3 + 3 * q + 1]; + const double gradRho0z = + gradRhoHost[iElem * numQuadPoints * 3 + 3 * q + 2]; + const double gradRho1x = + gradRhoHost[totalLocallyOwnedCells * numQuadPoints * 3 + + iElem * numQuadPoints * 3 + 3 * q]; + const double gradRho1y = + gradRhoHost[totalLocallyOwnedCells * numQuadPoints * 3 + + iElem * numQuadPoints * 3 + 3 * q + 1]; + const double gradRho1z = + gradRhoHost[totalLocallyOwnedCells * numQuadPoints * 3 + + iElem * numQuadPoints * 3 + 3 * q + 2]; + tempGradRhoQuadsSP[6 * q + 0] = gradRho0x; + tempGradRhoQuadsSP[6 * q + 1] = gradRho0y; + tempGradRhoQuadsSP[6 * q + 2] = gradRho0z; + tempGradRhoQuadsSP[6 * q + 3] = gradRho1x; + tempGradRhoQuadsSP[6 * q + 4] = gradRho1y; + tempGradRhoQuadsSP[6 * q + 5] = gradRho1z; + tempGradRhoQuads[3 * q] = gradRho0x + gradRho1x; + 
tempGradRhoQuads[3 * q + 1] = gradRho0y + gradRho1y; + tempGradRhoQuads[3 * q + 2] = gradRho0z + gradRho1z; + } + } + else + { + std::memcpy(tempRhoQuads.data(), + rhoHost.data() + iElem * numQuadPoints, + numQuadPoints * sizeof(double)); + + if (isEvaluateGradRho) + std::memcpy(tempGradRhoQuads.data(), + gradRhoHost.data() + iElem * numQuadPoints * 3, + 3 * numQuadPoints * sizeof(double)); + } + } +#if defined(DFTFE_WITH_DEVICE) + dftfe::utils::deviceSynchronize(); +#endif + MPI_Barrier(mpiCommParent); + computeRho_time = MPI_Wtime() - computeRho_time; + + if (this_process == 0 && dftParams.verbosity >= 2) + if (memorySpace == dftfe::utils::MemorySpace::HOST) + std::cout << "Time for compute rho on CPU: " << computeRho_time + << std::endl; + else if (memorySpace == dftfe::utils::MemorySpace::DEVICE) + std::cout << "Time for compute rho on Device: " << computeRho_time + << std::endl; + } + template + void + computeRhoGradRhoFromInterpolatedValues( + std::shared_ptr< + dftfe::basis:: + FEBasisOperations> + & basisOperationsPtr, + const std::pair cellRange, + const std::pair vecRange, + double * partialOccupVec, + NumberType * wfcQuadPointData, + NumberType * gradWfcQuadPointData, + double * rhoCellsWfcContributions, + double * gradRhoCellsWfcContributions, + double * rho, + double * gradRho, + const bool isEvaluateGradRho) + { + const unsigned int cellsBlockSize = cellRange.second - cellRange.first; + const unsigned int vectorsBlockSize = vecRange.second - vecRange.first; + const unsigned int nQuadsPerCell = basisOperationsPtr->nQuadsPerCell(); + const unsigned int nCells = basisOperationsPtr->nCells(); + for (unsigned int iCell = cellRange.first; iCell < cellRange.second; + ++iCell) + for (unsigned int iQuad = 0; iQuad < nQuadsPerCell; ++iQuad) + for (unsigned int iWave = 0; iWave < vecRange.second - vecRange.first; + ++iWave) + { + const NumberType psi = + wfcQuadPointData[(iCell - cellRange.first) * nQuadsPerCell * + vectorsBlockSize + + iQuad * vectorsBlockSize 
+ iWave]; + rho[iCell * nQuadsPerCell + iQuad] += + partialOccupVec[iWave] * std::abs(psi) * std::abs(psi); + if (isEvaluateGradRho) + { + gradRho[iCell * nQuadsPerCell * 3 + 3 * iQuad] += + 2 * partialOccupVec[iWave] * + dftfe::utils::realPart( + dftfe::utils::complexConj(psi) * + gradWfcQuadPointData[(iCell - cellRange.first) * + nQuadsPerCell * vectorsBlockSize * + 3 + + iQuad * vectorsBlockSize + iWave]); + gradRho[iCell * nQuadsPerCell * 3 + 3 * iQuad + 1] += + 2 * partialOccupVec[iWave] * + dftfe::utils::realPart( + dftfe::utils::complexConj(psi) * + gradWfcQuadPointData[(iCell - cellRange.first) * + nQuadsPerCell * vectorsBlockSize * + 3 + + nQuadsPerCell * vectorsBlockSize + + iQuad * vectorsBlockSize + iWave]); + gradRho[iCell * nQuadsPerCell * 3 + 3 * iQuad + 2] += + 2 * partialOccupVec[iWave] * + dftfe::utils::realPart( + dftfe::utils::complexConj(psi) * + gradWfcQuadPointData[(iCell - cellRange.first) * + nQuadsPerCell * vectorsBlockSize * + 3 + + 2 * nQuadsPerCell * vectorsBlockSize + + iQuad * vectorsBlockSize + iWave]); + } + } + } +#if defined(DFTFE_WITH_DEVICE) + template void + computeRhoFromPSI( + const dftfe::utils::MemoryStorage *X, + const dftfe::utils::MemoryStorage *XFrac, + const unsigned int totalNumWaveFunctions, + const unsigned int Nfr, + const std::vector> &eigenValues, + const double fermiEnergy, + const double fermiEnergyUp, + const double fermiEnergyDown, + std::shared_ptr< + dftfe::basis::FEBasisOperations> + & basisOperationsPtrDevice, + const unsigned int matrixFreeDofhandlerIndex, + const unsigned int quadratureIndex, + const std::vector & kPointWeights, + std::map> *rhoValues, + std::map> *gradRhoValues, + std::map> *rhoValuesSpinPolarized, + std::map> *gradRhoValuesSpinPolarized, + const bool isEvaluateGradRho, + const MPI_Comm & mpiCommParent, + const MPI_Comm & interpoolcomm, + const MPI_Comm & interBandGroupComm, + const dftParameters & dftParams, + const bool spectrumSplit); +#endif + + template void + computeRhoFromPSI( + 
const dftfe::utils::MemoryStorage *X, + const dftfe::utils::MemoryStorage *XFrac, + const unsigned int totalNumWaveFunctions, + const unsigned int Nfr, + const std::vector> &eigenValues, + const double fermiEnergy, + const double fermiEnergyUp, + const double fermiEnergyDown, + std::shared_ptr< + dftfe::basis::FEBasisOperations> + & basisOperationsPtr, + const unsigned int matrixFreeDofhandlerIndex, + const unsigned int quadratureIndex, + const std::vector & kPointWeights, + std::map> *rhoValues, + std::map> *gradRhoValues, + std::map> *rhoValuesSpinPolarized, + std::map> *gradRhoValuesSpinPolarized, + const bool isEvaluateGradRho, + const MPI_Comm & mpiCommParent, + const MPI_Comm & interpoolcomm, + const MPI_Comm & interBandGroupComm, + const dftParameters & dftParams, + const bool spectrumSplit); +} // namespace dftfe diff --git a/src/dft/densityCalculator.inst.cc b/src/dft/densityCalculator.inst.cc deleted file mode 100644 index 9fbf7a331..000000000 --- a/src/dft/densityCalculator.inst.cc +++ /dev/null @@ -1,44 +0,0 @@ -template class DensityCalculator<1, 1>; -template class DensityCalculator<1, 2>; -template class DensityCalculator<2, 2>; -template class DensityCalculator<2, 3>; -template class DensityCalculator<2, 4>; -template class DensityCalculator<3, 3>; -template class DensityCalculator<3, 4>; -template class DensityCalculator<3, 5>; -template class DensityCalculator<3, 6>; -template class DensityCalculator<4, 4>; -template class DensityCalculator<4, 5>; -template class DensityCalculator<4, 6>; -template class DensityCalculator<4, 7>; -template class DensityCalculator<4, 8>; -template class DensityCalculator<5, 5>; -template class DensityCalculator<5, 6>; -template class DensityCalculator<5, 7>; -template class DensityCalculator<5, 8>; -template class DensityCalculator<5, 9>; -template class DensityCalculator<5, 10>; -template class DensityCalculator<6, 6>; -template class DensityCalculator<6, 7>; -template class DensityCalculator<6, 8>; -template class 
DensityCalculator<6, 9>; -template class DensityCalculator<6, 10>; -template class DensityCalculator<6, 11>; -template class DensityCalculator<6, 12>; -template class DensityCalculator<7, 7>; -template class DensityCalculator<7, 8>; -template class DensityCalculator<7, 9>; -template class DensityCalculator<7, 10>; -template class DensityCalculator<7, 11>; -template class DensityCalculator<7, 12>; -template class DensityCalculator<7, 13>; -template class DensityCalculator<7, 14>; -template class DensityCalculator<8, 8>; -template class DensityCalculator<8, 9>; -template class DensityCalculator<8, 10>; -template class DensityCalculator<8, 11>; -template class DensityCalculator<8, 12>; -template class DensityCalculator<8, 13>; -template class DensityCalculator<8, 14>; -template class DensityCalculator<8, 15>; -template class DensityCalculator<8, 16>; diff --git a/src/dft/densityCalculatorCPU.cc b/src/dft/densityCalculatorCPU.cc deleted file mode 100644 index c94244e4e..000000000 --- a/src/dft/densityCalculatorCPU.cc +++ /dev/null @@ -1,849 +0,0 @@ -// --------------------------------------------------------------------- -// -// Copyright (c) 2017-2022 The Regents of the University of Michigan and DFT-FE -// authors. -// -// This file is part of the DFT-FE code. -// -// The DFT-FE code is free software; you can use it, redistribute -// it, and/or modify it under the terms of the GNU Lesser General -// Public License as published by the Free Software Foundation; either -// version 2.1 of the License, or (at your option) any later version. -// The full text of the license can be found in the file LICENSE at -// the top level of the DFT-FE distribution. 
-// -// --------------------------------------------------------------------- -// -// @author Sambit Das -// - -// source file for electron density related computations -#include -#include -#include -#include -#include -#include - -namespace dftfe -{ - template - void - computeRhoFromPSICPU( - const std::vector> & X, - const std::vector> & XFrac, - const unsigned int totalNumWaveFunctions, - const unsigned int Nfr, - const unsigned int numLocalDofs, - const std::vector> & eigenValues, - const double fermiEnergy, - const double fermiEnergyUp, - const double fermiEnergyDown, - operatorDFTClass & operatorMatrix, - const dealii::DoFHandler<3> & dofHandler, - const unsigned int totalLocallyOwnedCells, - const unsigned int numNodesPerElement, - const unsigned int numQuadPoints, - const std::vector & kPointWeights, - std::map> *rhoValues, - std::map> *gradRhoValues, - std::map> *rhoValuesSpinPolarized, - std::map> *gradRhoValuesSpinPolarized, - const bool isEvaluateGradRho, - const MPI_Comm & mpiCommParent, - const MPI_Comm & interpoolcomm, - const MPI_Comm & interBandGroupComm, - const dftParameters & dftParams, - const bool spectrumSplit, - const bool useFEOrderRhoPlusOneGLQuad) - { - int this_process; - MPI_Comm_rank(mpiCommParent, &this_process); - MPI_Barrier(mpiCommParent); - double cpu_time = MPI_Wtime(); - - // band group parallelization data structures - const unsigned int numberBandGroups = - dealii::Utilities::MPI::n_mpi_processes(interBandGroupComm); - const unsigned int bandGroupTaskId = - dealii::Utilities::MPI::this_mpi_process(interBandGroupComm); - std::vector bandGroupLowHighPlusOneIndices; - dftUtils::createBandParallelizationIndices(interBandGroupComm, - totalNumWaveFunctions, - bandGroupLowHighPlusOneIndices); - - const unsigned int BVec = - std::min(dftParams.chebyWfcBlockSize, bandGroupLowHighPlusOneIndices[1]); - - const double spinPolarizedFactor = - (dftParams.spinPolarized == 1) ? 
1.0 : 2.0; - - - std::vector wfcQuads(numQuadPoints * BVec, T(0.0)); - - std::vector gradWfcQuads(numQuadPoints * 3 * BVec, T(0.0)); - - std::vector shapeFunctionValues(numQuadPoints * numNodesPerElement, - T(0.0)); - std::vector shapeFunctionGradValues(numQuadPoints * 3 * - numNodesPerElement, - T(0.0)); - const unsigned int numQuadPointsTimes3 = numQuadPoints * 3; - - if (useFEOrderRhoPlusOneGLQuad) - { - for (unsigned int iquad = 0; iquad < numQuadPoints; ++iquad) - for (unsigned int iNode = 0; iNode < numNodesPerElement; ++iNode) - shapeFunctionValues[iquad * numNodesPerElement + iNode] = - T(operatorMatrix.getShapeFunctionValuesDensityGaussLobattoQuad() - [iquad * numNodesPerElement + iNode]); - } - else - { - for (unsigned int iquad = 0; iquad < numQuadPoints; ++iquad) - for (unsigned int iNode = 0; iNode < numNodesPerElement; ++iNode) - shapeFunctionValues[iquad * numNodesPerElement + iNode] = - T(operatorMatrix.getShapeFunctionValuesDensityGaussQuad() - [iquad * numNodesPerElement + iNode]); - } - - std::vector partialOccupVecTimesKptWeight(BVec, 0.0); - - - dftfe::distributedCPUMultiVec flattenedArrayBlock; - - std::vector cellWaveFunctionMatrix(numNodesPerElement * BVec, T(0.0)); - - - // set density to zero - typename dealii::DoFHandler<3>::active_cell_iterator cell = - dofHandler.begin_active(); - typename dealii::DoFHandler<3>::active_cell_iterator endc = - dofHandler.end(); - for (; cell != endc; ++cell) - if (cell->is_locally_owned()) - { - const dealii::CellId cellid = cell->id(); - - - std::fill((*rhoValues)[cellid].begin(), - (*rhoValues)[cellid].end(), - 0.0); - if (isEvaluateGradRho) - std::fill((*gradRhoValues)[cellid].begin(), - (*gradRhoValues)[cellid].end(), - 0.0); - - if (dftParams.spinPolarized == 1) - { - std::fill((*rhoValuesSpinPolarized)[cellid].begin(), - (*rhoValuesSpinPolarized)[cellid].end(), - 0.0); - if (isEvaluateGradRho) - std::fill((*gradRhoValuesSpinPolarized)[cellid].begin(), - (*gradRhoValuesSpinPolarized)[cellid].end(), - 
0.0); - } - } - - std::vector rhoValuesFlattened(totalLocallyOwnedCells * - numQuadPoints, - 0.0); - std::vector gradRhoValuesFlattened(totalLocallyOwnedCells * - numQuadPoints * 3, - 0.0); - std::vector rhoValuesSpinPolarizedFlattened(totalLocallyOwnedCells * - numQuadPoints * 2, - 0.0); - std::vector gradRhoValuesSpinPolarizedFlattened( - totalLocallyOwnedCells * numQuadPoints * 6, 0.0); - - - for (unsigned int spinIndex = 0; spinIndex < (1 + dftParams.spinPolarized); - ++spinIndex) - { - for (unsigned int kPoint = 0; kPoint < kPointWeights.size(); ++kPoint) - { - std::vector rhoContribution(totalLocallyOwnedCells * - numQuadPoints, - 0.0); - - std::vector gradRhoXContribution( - isEvaluateGradRho ? (totalLocallyOwnedCells * numQuadPoints) : 1, - 0.0); - std::vector gradRhoYContribution( - isEvaluateGradRho ? (totalLocallyOwnedCells * numQuadPoints) : 1, - 0.0); - std::vector gradRhoZContribution( - isEvaluateGradRho ? (totalLocallyOwnedCells * numQuadPoints) : 1, - 0.0); - - const std::vector &XCurrentKPoint = - X[(dftParams.spinPolarized + 1) * kPoint + spinIndex]; - const std::vector &XFracCurrentKPoint = - XFrac[(dftParams.spinPolarized + 1) * kPoint + spinIndex]; - - for (unsigned int jvec = 0; jvec < totalNumWaveFunctions; - jvec += BVec) - { - const unsigned int currentBlockSize = - std::min(BVec, totalNumWaveFunctions - jvec); - - if (currentBlockSize != BVec || jvec == 0) - operatorMatrix.reinit(currentBlockSize, - flattenedArrayBlock, - true); - - if ((jvec + currentBlockSize) <= - bandGroupLowHighPlusOneIndices[2 * bandGroupTaskId + 1] && - (jvec + currentBlockSize) > - bandGroupLowHighPlusOneIndices[2 * bandGroupTaskId]) - { - if (spectrumSplit) - { - std::fill(partialOccupVecTimesKptWeight.begin(), - partialOccupVecTimesKptWeight.end(), - kPointWeights[kPoint] * spinPolarizedFactor); - } - else - { - if (dftParams.constraintMagnetization) - { - const double fermiEnergyConstraintMag = - spinIndex == 0 ? 
fermiEnergyUp : fermiEnergyDown; - for (unsigned int iEigenVec = 0; - iEigenVec < currentBlockSize; - ++iEigenVec) - { - if (eigenValues[kPoint][totalNumWaveFunctions * - spinIndex + - jvec + iEigenVec] > - fermiEnergyConstraintMag) - partialOccupVecTimesKptWeight[iEigenVec] = - 0.0; - else - partialOccupVecTimesKptWeight[iEigenVec] = - kPointWeights[kPoint] * spinPolarizedFactor; - } - } - else - { - for (unsigned int iEigenVec = 0; - iEigenVec < currentBlockSize; - ++iEigenVec) - { - partialOccupVecTimesKptWeight[iEigenVec] = - dftUtils::getPartialOccupancy( - eigenValues[kPoint][totalNumWaveFunctions * - spinIndex + - jvec + iEigenVec], - fermiEnergy, - C_kb, - dftParams.TVal) * - kPointWeights[kPoint] * spinPolarizedFactor; - } - } - } - - - for (unsigned int iNode = 0; iNode < numLocalDofs; ++iNode) - for (unsigned int iWave = 0; iWave < currentBlockSize; - ++iWave) - flattenedArrayBlock - .data()[iNode * currentBlockSize + iWave] = - XCurrentKPoint[iNode * totalNumWaveFunctions + jvec + - iWave]; - - - (operatorMatrix.getOverloadedConstraintMatrix()) - ->distribute(flattenedArrayBlock, currentBlockSize); - - for (int icell = 0; icell < totalLocallyOwnedCells; icell++) - { - const unsigned int inc = 1; - for (unsigned int iNode = 0; iNode < numNodesPerElement; - ++iNode) - { - xcopy( - ¤tBlockSize, - flattenedArrayBlock.data() + - operatorMatrix - .getFlattenedArrayCellLocalProcIndexIdMap() - [icell * numNodesPerElement + iNode], - &inc, - &cellWaveFunctionMatrix[currentBlockSize * iNode], - &inc); - } - - - const T scalarCoeffAlpha = T(1.0), - scalarCoeffBeta = T(0.0); - const char transA = 'N', transB = 'N'; - - xgemm(&transA, - &transB, - ¤tBlockSize, - &numQuadPoints, - &numNodesPerElement, - &scalarCoeffAlpha, - &cellWaveFunctionMatrix[0], - ¤tBlockSize, - &shapeFunctionValues[0], - &numNodesPerElement, - &scalarCoeffBeta, - &wfcQuads[0], - ¤tBlockSize); - - for (unsigned int iquad = 0; iquad < numQuadPoints; - ++iquad) - for (unsigned int iWave = 0; 
iWave < currentBlockSize; - ++iWave) - rhoContribution[icell * numQuadPoints + iquad] += - partialOccupVecTimesKptWeight[iWave] * - std::abs( - wfcQuads[iquad * currentBlockSize + iWave]) * - std::abs( - wfcQuads[iquad * currentBlockSize + iWave]); - - if (isEvaluateGradRho) - { - for (unsigned int i = 0; - i < numNodesPerElement * 3 * numQuadPoints; - ++i) - { - shapeFunctionGradValues[i] = T( - operatorMatrix - .getShapeFunctionGradValuesDensityGaussQuad() - [icell * numNodesPerElement * 3 * - numQuadPoints + - i]); - } - - xgemm(&transA, - &transB, - ¤tBlockSize, - &numQuadPointsTimes3, - &numNodesPerElement, - &scalarCoeffAlpha, - &cellWaveFunctionMatrix[0], - ¤tBlockSize, - &shapeFunctionGradValues[0], - &numNodesPerElement, - &scalarCoeffBeta, - &gradWfcQuads[0], - ¤tBlockSize); - - for (unsigned int iquad = 0; iquad < numQuadPoints; - ++iquad) - for (unsigned int iWave = 0; - iWave < currentBlockSize; - ++iWave) - { - const T wfcQuadVal = - dftfe::utils::complexConj( - wfcQuads[iquad * currentBlockSize + - iWave]); - const T temp1 = - wfcQuadVal * - gradWfcQuads[iquad * 3 * currentBlockSize + - iWave]; - gradRhoXContribution[icell * numQuadPoints + - iquad] += - 2.0 * partialOccupVecTimesKptWeight[iWave] * - dftfe::utils::realPart(temp1); - } - - for (unsigned int iquad = 0; iquad < numQuadPoints; - ++iquad) - for (unsigned int iWave = 0; - iWave < currentBlockSize; - ++iWave) - { - const T wfcQuadVal = - dftfe::utils::complexConj( - wfcQuads[iquad * currentBlockSize + - iWave]); - const T temp1 = - wfcQuadVal * - gradWfcQuads[iquad * 3 * currentBlockSize + - currentBlockSize + iWave]; - gradRhoYContribution[icell * numQuadPoints + - iquad] += - 2.0 * partialOccupVecTimesKptWeight[iWave] * - dftfe::utils::realPart(temp1); - } - - for (unsigned int iquad = 0; iquad < numQuadPoints; - ++iquad) - for (unsigned int iWave = 0; - iWave < currentBlockSize; - ++iWave) - { - const T wfcQuadVal = - dftfe::utils::complexConj( - wfcQuads[iquad * currentBlockSize + - 
iWave]); - const T temp1 = - wfcQuadVal * - gradWfcQuads[iquad * 3 * currentBlockSize + - 2 * currentBlockSize + iWave]; - gradRhoZContribution[icell * numQuadPoints + - iquad] += - 2.0 * partialOccupVecTimesKptWeight[iWave] * - dftfe::utils::realPart(temp1); - } - } - - } // cells loop - } // band parallelizatoin check - } // wave function block loop - - if (spectrumSplit) - for (unsigned int jvec = 0; jvec < Nfr; jvec += BVec) - { - const unsigned int currentBlockSize = - std::min(BVec, Nfr - jvec); - - if (currentBlockSize != BVec || jvec == 0) - operatorMatrix.reinit(currentBlockSize, - flattenedArrayBlock, - true); - - if ((jvec + totalNumWaveFunctions - Nfr + currentBlockSize) <= - bandGroupLowHighPlusOneIndices[2 * bandGroupTaskId + - 1] && - (jvec + totalNumWaveFunctions - Nfr + currentBlockSize) > - bandGroupLowHighPlusOneIndices[2 * bandGroupTaskId]) - { - if (dftParams.constraintMagnetization) - { - const double fermiEnergyConstraintMag = - spinIndex == 0 ? fermiEnergyUp : fermiEnergyDown; - for (unsigned int iEigenVec = 0; - iEigenVec < currentBlockSize; - ++iEigenVec) - { - if (eigenValues[kPoint] - [totalNumWaveFunctions * - spinIndex + - (totalNumWaveFunctions - Nfr) + - jvec + iEigenVec] > - fermiEnergyConstraintMag) - partialOccupVecTimesKptWeight[iEigenVec] = - -kPointWeights[kPoint] * spinPolarizedFactor; - else - partialOccupVecTimesKptWeight[iEigenVec] = 0.0; - } - } - else - { - for (unsigned int iEigenVec = 0; - iEigenVec < currentBlockSize; - ++iEigenVec) - { - partialOccupVecTimesKptWeight[iEigenVec] = - (dftUtils::getPartialOccupancy( - eigenValues[kPoint] - [totalNumWaveFunctions * - spinIndex + - (totalNumWaveFunctions - Nfr) + - jvec + iEigenVec], - fermiEnergy, - C_kb, - dftParams.TVal) - - 1.0) * - kPointWeights[kPoint] * spinPolarizedFactor; - } - } - - - for (unsigned int iNode = 0; iNode < numLocalDofs; - ++iNode) - for (unsigned int iWave = 0; iWave < currentBlockSize; - ++iWave) - flattenedArrayBlock - .data()[iNode * 
currentBlockSize + iWave] = - XFracCurrentKPoint[iNode * Nfr + jvec + iWave]; - - (operatorMatrix.getOverloadedConstraintMatrix()) - ->distribute(flattenedArrayBlock, currentBlockSize); - - for (int icell = 0; icell < totalLocallyOwnedCells; - icell++) - { - const unsigned int inc = 1; - for (unsigned int iNode = 0; - iNode < numNodesPerElement; - ++iNode) - { - xcopy( - ¤tBlockSize, - flattenedArrayBlock.data() + - operatorMatrix - .getFlattenedArrayCellLocalProcIndexIdMap() - [icell * numNodesPerElement + iNode], - &inc, - &cellWaveFunctionMatrix[currentBlockSize * - iNode], - &inc); - } - - - const T scalarCoeffAlpha = T(1.0), - scalarCoeffBeta = T(0.0); - const char transA = 'N', transB = 'N'; - - xgemm(&transA, - &transB, - ¤tBlockSize, - &numQuadPoints, - &numNodesPerElement, - &scalarCoeffAlpha, - &cellWaveFunctionMatrix[0], - ¤tBlockSize, - &shapeFunctionValues[0], - &numNodesPerElement, - &scalarCoeffBeta, - &wfcQuads[0], - ¤tBlockSize); - - for (unsigned int iquad = 0; iquad < numQuadPoints; - ++iquad) - for (unsigned int iWave = 0; - iWave < currentBlockSize; - ++iWave) - rhoContribution[icell * numQuadPoints + iquad] += - partialOccupVecTimesKptWeight[iWave] * - std::abs( - wfcQuads[iquad * currentBlockSize + iWave]) * - std::abs( - wfcQuads[iquad * currentBlockSize + iWave]); - - if (isEvaluateGradRho) - { - for (unsigned int i = 0; - i < numNodesPerElement * 3 * numQuadPoints; - ++i) - { - shapeFunctionGradValues[i] = T( - operatorMatrix - .getShapeFunctionGradValuesDensityGaussQuad() - [icell * numNodesPerElement * 3 * - numQuadPoints + - i]); - } - - xgemm(&transA, - &transB, - ¤tBlockSize, - &numQuadPointsTimes3, - &numNodesPerElement, - &scalarCoeffAlpha, - &cellWaveFunctionMatrix[0], - ¤tBlockSize, - &shapeFunctionGradValues[0], - &numNodesPerElement, - &scalarCoeffBeta, - &gradWfcQuads[0], - ¤tBlockSize); - - for (unsigned int iquad = 0; - iquad < numQuadPoints; - ++iquad) - for (unsigned int iWave = 0; - iWave < currentBlockSize; - ++iWave) - { 
- const T wfcQuadVal = - dftfe::utils::complexConj( - wfcQuads[iquad * currentBlockSize + - iWave]); - const T temp1 = - wfcQuadVal * - gradWfcQuads[iquad * 3 * - currentBlockSize + - iWave]; - gradRhoXContribution[icell * numQuadPoints + - iquad] += - 2.0 * - partialOccupVecTimesKptWeight[iWave] * - dftfe::utils::realPart(temp1); - } - - for (unsigned int iquad = 0; - iquad < numQuadPoints; - ++iquad) - for (unsigned int iWave = 0; - iWave < currentBlockSize; - ++iWave) - { - const T wfcQuadVal = - dftfe::utils::complexConj( - wfcQuads[iquad * currentBlockSize + - iWave]); - const T temp1 = - wfcQuadVal * - gradWfcQuads[iquad * 3 * - currentBlockSize + - currentBlockSize + iWave]; - gradRhoYContribution[icell * numQuadPoints + - iquad] += - 2.0 * - partialOccupVecTimesKptWeight[iWave] * - dftfe::utils::realPart(temp1); - } - - for (unsigned int iquad = 0; - iquad < numQuadPoints; - ++iquad) - for (unsigned int iWave = 0; - iWave < currentBlockSize; - ++iWave) - { - const T wfcQuadVal = - dftfe::utils::complexConj( - wfcQuads[iquad * currentBlockSize + - iWave]); - const T temp1 = - wfcQuadVal * - gradWfcQuads[iquad * 3 * - currentBlockSize + - 2 * currentBlockSize + - iWave]; - gradRhoZContribution[icell * numQuadPoints + - iquad] += - 2.0 * - partialOccupVecTimesKptWeight[iWave] * - dftfe::utils::realPart(temp1); - } - } - - } // cells loop - } - } - - for (int icell = 0; icell < totalLocallyOwnedCells; icell++) - for (unsigned int iquad = 0; iquad < numQuadPoints; ++iquad) - { - rhoValuesFlattened[icell * numQuadPoints + iquad] += - rhoContribution[icell * numQuadPoints + iquad]; - } - - if (isEvaluateGradRho) - for (int icell = 0; icell < totalLocallyOwnedCells; icell++) - for (unsigned int iquad = 0; iquad < numQuadPoints; ++iquad) - { - gradRhoValuesFlattened[icell * numQuadPoints * 3 + - 3 * iquad + 0] += - gradRhoXContribution[icell * numQuadPoints + iquad]; - gradRhoValuesFlattened[icell * numQuadPoints * 3 + - 3 * iquad + 1] += - 
gradRhoYContribution[icell * numQuadPoints + iquad]; - gradRhoValuesFlattened[icell * numQuadPoints * 3 + - 3 * iquad + 2] += - gradRhoZContribution[icell * numQuadPoints + iquad]; - } - if (dftParams.spinPolarized == 1) - { - for (int icell = 0; icell < totalLocallyOwnedCells; icell++) - for (unsigned int iquad = 0; iquad < numQuadPoints; ++iquad) - { - rhoValuesSpinPolarizedFlattened[icell * numQuadPoints * - 2 + - iquad * 2 + spinIndex] += - rhoContribution[icell * numQuadPoints + iquad]; - } - - if (isEvaluateGradRho) - for (int icell = 0; icell < totalLocallyOwnedCells; icell++) - for (unsigned int iquad = 0; iquad < numQuadPoints; ++iquad) - { - gradRhoValuesSpinPolarizedFlattened - [icell * numQuadPoints * 6 + iquad * 6 + - spinIndex * 3] += - gradRhoXContribution[icell * numQuadPoints + iquad]; - gradRhoValuesSpinPolarizedFlattened - [icell * numQuadPoints * 6 + iquad * 6 + - spinIndex * 3 + 1] += - gradRhoYContribution[icell * numQuadPoints + iquad]; - gradRhoValuesSpinPolarizedFlattened - [icell * numQuadPoints * 6 + iquad * 6 + - spinIndex * 3 + 2] += - gradRhoZContribution[icell * numQuadPoints + iquad]; - } - } - - } // kpoint loop - } // spin index loop - - - // gather density from all inter communicators - if (dealii::Utilities::MPI::n_mpi_processes(interpoolcomm) > 1) - { - dealii::Utilities::MPI::sum(rhoValuesFlattened, - interpoolcomm, - rhoValuesFlattened); - - if (isEvaluateGradRho) - dealii::Utilities::MPI::sum(gradRhoValuesFlattened, - interpoolcomm, - gradRhoValuesFlattened); - - - - if (dftParams.spinPolarized == 1) - { - dealii::Utilities::MPI::sum(rhoValuesSpinPolarizedFlattened, - interpoolcomm, - rhoValuesSpinPolarizedFlattened); - - if (isEvaluateGradRho) - dealii::Utilities::MPI::sum(gradRhoValuesSpinPolarizedFlattened, - interpoolcomm, - gradRhoValuesSpinPolarizedFlattened); - } - } - - if (dealii::Utilities::MPI::n_mpi_processes(interBandGroupComm) > 1) - { - dealii::Utilities::MPI::sum(rhoValuesFlattened, - interBandGroupComm, - 
rhoValuesFlattened); - - if (isEvaluateGradRho) - dealii::Utilities::MPI::sum(gradRhoValuesFlattened, - interBandGroupComm, - gradRhoValuesFlattened); - - - if (dftParams.spinPolarized == 1) - { - dealii::Utilities::MPI::sum(rhoValuesSpinPolarizedFlattened, - interBandGroupComm, - rhoValuesSpinPolarizedFlattened); - - if (isEvaluateGradRho) - dealii::Utilities::MPI::sum(gradRhoValuesSpinPolarizedFlattened, - interBandGroupComm, - gradRhoValuesSpinPolarizedFlattened); - } - } - - - unsigned int iElem = 0; - cell = dofHandler.begin_active(); - endc = dofHandler.end(); - for (; cell != endc; ++cell) - if (cell->is_locally_owned()) - { - const dealii::CellId cellid = cell->id(); - - std::vector dummy(1); - std::vector &tempRhoQuads = (*rhoValues)[cellid]; - std::vector &tempGradRhoQuads = - isEvaluateGradRho ? (*gradRhoValues)[cellid] : dummy; - - std::vector &tempRhoQuadsSP = - (dftParams.spinPolarized == 1) ? (*rhoValuesSpinPolarized)[cellid] : - dummy; - std::vector &tempGradRhoQuadsSP = - ((dftParams.spinPolarized == 1) && isEvaluateGradRho) ? 
- (*gradRhoValuesSpinPolarized)[cellid] : - dummy; - - if (dftParams.spinPolarized == 1) - { - for (unsigned int q = 0; q < numQuadPoints; ++q) - { - tempRhoQuadsSP[2 * q + 0] = - rhoValuesSpinPolarizedFlattened[iElem * numQuadPoints * 2 + - q * 2 + 0]; - - tempRhoQuadsSP[2 * q + 1] = - rhoValuesSpinPolarizedFlattened[iElem * numQuadPoints * 2 + - q * 2 + 1]; - } - - if (isEvaluateGradRho) - for (unsigned int q = 0; q < numQuadPoints; ++q) - { - tempGradRhoQuadsSP[6 * q + 0] = - gradRhoValuesSpinPolarizedFlattened[iElem * - numQuadPoints * 6 + - 6 * q]; - tempGradRhoQuadsSP[6 * q + 1] = - gradRhoValuesSpinPolarizedFlattened[iElem * - numQuadPoints * 6 + - 6 * q + 1]; - tempGradRhoQuadsSP[6 * q + 2] = - gradRhoValuesSpinPolarizedFlattened[iElem * - numQuadPoints * 6 + - 6 * q + 2]; - tempGradRhoQuadsSP[6 * q + 3] = - gradRhoValuesSpinPolarizedFlattened[iElem * - numQuadPoints * 6 + - 6 * q + 3]; - tempGradRhoQuadsSP[6 * q + 4] = - gradRhoValuesSpinPolarizedFlattened[iElem * - numQuadPoints * 6 + - 6 * q + 4]; - tempGradRhoQuadsSP[6 * q + 5] = - gradRhoValuesSpinPolarizedFlattened[iElem * - numQuadPoints * 6 + - 6 * q + 5]; - } - } - - for (unsigned int q = 0; q < numQuadPoints; ++q) - tempRhoQuads[q] = rhoValuesFlattened[iElem * numQuadPoints + q]; - - - if (isEvaluateGradRho) - for (unsigned int q = 0; q < numQuadPoints; ++q) - { - tempGradRhoQuads[3 * q] = - gradRhoValuesFlattened[iElem * numQuadPoints * 3 + q * 3]; - tempGradRhoQuads[3 * q + 1] = - gradRhoValuesFlattened[iElem * numQuadPoints * 3 + q * 3 + 1]; - tempGradRhoQuads[3 * q + 2] = - gradRhoValuesFlattened[iElem * numQuadPoints * 3 + q * 3 + 2]; - } - iElem++; - } - - MPI_Barrier(mpiCommParent); - cpu_time = MPI_Wtime() - cpu_time; - - if (this_process == 0 && dftParams.verbosity >= 2) - std::cout << "Time for compute rho on CPU: " << cpu_time << std::endl; - } - - template void - computeRhoFromPSICPU( - const std::vector> &X, - const std::vector> &XFrac, - const unsigned int totalNumWaveFunctions, - 
const unsigned int Nfr, - const unsigned int numLocalDofs, - const std::vector> & eigenValues, - const double fermiEnergy, - const double fermiEnergyUp, - const double fermiEnergyDown, - operatorDFTClass & operatorMatrix, - const dealii::DoFHandler<3> & dofHandler, - const unsigned int totalLocallyOwnedCells, - const unsigned int numNodesPerElement, - const unsigned int numQuadPoints, - const std::vector & kPointWeights, - std::map> * rhoValues, - std::map> * gradRhoValues, - std::map> * rhoValuesSpinPolarized, - std::map> *gradRhoValuesSpinPolarized, - const bool isEvaluateGradRho, - const MPI_Comm & mpiCommParent, - const MPI_Comm & interpoolcomm, - const MPI_Comm & interBandGroupComm, - const dftParameters & dftParams, - const bool spectrumSplit, - const bool useFEOrderRhoPlusOneGLQuad); -} // namespace dftfe diff --git a/src/dft/densityCalculatorDevice.cc b/src/dft/densityCalculatorDevice.cc deleted file mode 100644 index ebc6865b2..000000000 --- a/src/dft/densityCalculatorDevice.cc +++ /dev/null @@ -1,1283 +0,0 @@ -// --------------------------------------------------------------------- -// -// Copyright (c) 2017-2022 The Regents of the University of Michigan and DFT-FE -// authors. -// -// This file is part of the DFT-FE code. -// -// The DFT-FE code is free software; you can use it, redistribute -// it, and/or modify it under the terms of the GNU Lesser General -// Public License as published by the Free Software Foundation; either -// version 2.1 of the License, or (at your option) any later version. -// The full text of the license can be found in the file LICENSE at -// the top level of the DFT-FE distribution. 
-// -// --------------------------------------------------------------------- -// -// @author Sambit Das -// - -// source file for electron density related computations -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -namespace dftfe -{ - namespace Device - { - namespace - { - __global__ void - computeRhoGradRhoFromInterpolatedValues( - const unsigned int numberEntries, - double * rhoCellsWfcContributions, - double * gradRhoCellsWfcContributionsX, - double * gradRhoCellsWfcContributionsY, - double * gradRhoCellsWfcContributionsZ, - const bool isEvaluateGradRho) - { - const unsigned int globalThreadId = - blockIdx.x * blockDim.x + threadIdx.x; - - for (unsigned int index = globalThreadId; index < numberEntries; - index += blockDim.x * gridDim.x) - { - const double psi = rhoCellsWfcContributions[index]; - rhoCellsWfcContributions[index] = psi * psi; - - if (isEvaluateGradRho) - { - const double gradPsiX = gradRhoCellsWfcContributionsX[index]; - gradRhoCellsWfcContributionsX[index] = 2.0 * psi * gradPsiX; - - const double gradPsiY = gradRhoCellsWfcContributionsY[index]; - gradRhoCellsWfcContributionsY[index] = 2.0 * psi * gradPsiY; - - const double gradPsiZ = gradRhoCellsWfcContributionsZ[index]; - gradRhoCellsWfcContributionsZ[index] = 2.0 * psi * gradPsiZ; - } - } - } - - __global__ void - computeRhoGradRhoFromInterpolatedValues( - const unsigned int numberEntries, - dftfe::utils::deviceDoubleComplex *rhoCellsWfcContributions, - dftfe::utils::deviceDoubleComplex *gradRhoCellsWfcContributionsX, - dftfe::utils::deviceDoubleComplex *gradRhoCellsWfcContributionsY, - dftfe::utils::deviceDoubleComplex *gradRhoCellsWfcContributionsZ, - const bool isEvaluateGradRho) - { - const unsigned int globalThreadId = - blockIdx.x * blockDim.x + threadIdx.x; - - for (unsigned int index = globalThreadId; index < numberEntries; - index += blockDim.x * gridDim.x) - { - const 
dftfe::utils::deviceDoubleComplex psi = - rhoCellsWfcContributions[index]; - rhoCellsWfcContributions[index] = - dftfe::utils::makeComplex(psi.x * psi.x + psi.y * psi.y, 0.0); - - if (isEvaluateGradRho) - { - const dftfe::utils::deviceDoubleComplex gradPsiX = - gradRhoCellsWfcContributionsX[index]; - gradRhoCellsWfcContributionsX[index] = - dftfe::utils::makeComplex(2.0 * (psi.x * gradPsiX.x + - psi.y * gradPsiX.y), - 0.0); - - const dftfe::utils::deviceDoubleComplex gradPsiY = - gradRhoCellsWfcContributionsY[index]; - gradRhoCellsWfcContributionsY[index] = - dftfe::utils::makeComplex(2.0 * (psi.x * gradPsiY.x + - psi.y * gradPsiY.y), - 0.0); - - const dftfe::utils::deviceDoubleComplex gradPsiZ = - gradRhoCellsWfcContributionsZ[index]; - gradRhoCellsWfcContributionsZ[index] = - dftfe::utils::makeComplex(2.0 * (psi.x * gradPsiZ.x + - psi.y * gradPsiZ.y), - 0.0); - } - } - } - } // namespace - - template - void - computeRhoFromPSI( - const NumberType * X, - const NumberType * XFrac, - const unsigned int totalNumWaveFunctions, - const unsigned int Nfr, - const unsigned int numLocalDofs, - const std::vector> & eigenValues, - const double fermiEnergy, - const double fermiEnergyUp, - const double fermiEnergyDown, - operatorDFTDeviceClass & operatorMatrix, - const unsigned int matrixFreeDofhandlerIndex, - const dealii::DoFHandler<3> & dofHandler, - const unsigned int totalLocallyOwnedCells, - const unsigned int numNodesPerElement, - const unsigned int numQuadPoints, - const std::vector & kPointWeights, - std::map> *rhoValues, - std::map> *gradRhoValues, - std::map> *rhoValuesSpinPolarized, - std::map> *gradRhoValuesSpinPolarized, - const bool isEvaluateGradRho, - const MPI_Comm & mpiCommParent, - const MPI_Comm & interpoolcomm, - const MPI_Comm & interBandGroupComm, - const dftParameters & dftParams, - const bool spectrumSplit, - const bool use2pPlusOneGLQuad) - { - if (use2pPlusOneGLQuad) - AssertThrow(!isEvaluateGradRho, dftUtils::ExcNotImplementedYet()); - - int 
this_process; - MPI_Comm_rank(mpiCommParent, &this_process); - dftfe::utils::deviceSynchronize(); - MPI_Barrier(mpiCommParent); - double device_time = MPI_Wtime(); - const unsigned int numKPoints = kPointWeights.size(); - - // band group parallelization data structures - const unsigned int numberBandGroups = - dealii::Utilities::MPI::n_mpi_processes(interBandGroupComm); - const unsigned int bandGroupTaskId = - dealii::Utilities::MPI::this_mpi_process(interBandGroupComm); - std::vector bandGroupLowHighPlusOneIndices; - dftUtils::createBandParallelizationIndices( - interBandGroupComm, - totalNumWaveFunctions, - bandGroupLowHighPlusOneIndices); - - const unsigned int BVec = - std::min(dftParams.chebyWfcBlockSize, totalNumWaveFunctions); - - const double spinPolarizedFactor = - (dftParams.spinPolarized == 1) ? 1.0 : 2.0; - - const NumberType zero = 0; - const NumberType scalarCoeffAlphaRho = 1.0; - const NumberType scalarCoeffBetaRho = 1.0; - const NumberType scalarCoeffAlphaGradRho = 1.0; - const NumberType scalarCoeffBetaGradRho = 1.0; - - const unsigned int cellsBlockSize = 50; - const unsigned int numCellBlocks = - totalLocallyOwnedCells / cellsBlockSize; - const unsigned int remCellBlockSize = - totalLocallyOwnedCells - numCellBlocks * cellsBlockSize; - - dftfe::utils::MemoryStorage - rhoDevice(totalLocallyOwnedCells * numQuadPoints, zero); - dftfe::utils::MemoryStorage - rhoWfcContributionsDevice(cellsBlockSize * numQuadPoints * BVec, zero); - - dftfe::utils::MemoryStorage - gradRhoDeviceX(isEvaluateGradRho ? - (totalLocallyOwnedCells * numQuadPoints) : - 1, - zero); - dftfe::utils::MemoryStorage - gradRhoDeviceY(isEvaluateGradRho ? - (totalLocallyOwnedCells * numQuadPoints) : - 1, - zero); - dftfe::utils::MemoryStorage - gradRhoDeviceZ(isEvaluateGradRho ? - (totalLocallyOwnedCells * numQuadPoints) : - 1, - zero); - dftfe::utils::MemoryStorage - gradRhoWfcContributionsDeviceX( - isEvaluateGradRho ? 
(cellsBlockSize * numQuadPoints * BVec) : 1, - zero); - dftfe::utils::MemoryStorage - gradRhoWfcContributionsDeviceY( - isEvaluateGradRho ? (cellsBlockSize * numQuadPoints * BVec) : 1, - zero); - dftfe::utils::MemoryStorage - gradRhoWfcContributionsDeviceZ( - isEvaluateGradRho ? (cellsBlockSize * numQuadPoints * BVec) : 1, - zero); - - dftfe::utils::MemoryStorage - rhoHost; - dftfe::utils::MemoryStorage - gradRhoHostX; - dftfe::utils::MemoryStorage - gradRhoHostY; - dftfe::utils::MemoryStorage - gradRhoHostZ; - - rhoHost.resize(totalLocallyOwnedCells * numQuadPoints, zero); - - if (isEvaluateGradRho) - { - gradRhoHostX.resize(totalLocallyOwnedCells * numQuadPoints, zero); - - gradRhoHostY.resize(totalLocallyOwnedCells * numQuadPoints, zero); - gradRhoHostZ.resize(totalLocallyOwnedCells * numQuadPoints, zero); - } - - - dftfe::utils::MemoryStorage - shapeFunctionValuesTransposedDevice(numNodesPerElement * numQuadPoints, - zero); - - shapeFunctionValuesTransposedDevice.setValue(zero); - - - dftfe::utils::deviceKernelsGeneric::copyValueType1ArrToValueType2Arr( - numNodesPerElement * numQuadPoints, - (operatorMatrix.getShapeFunctionValuesTransposed(use2pPlusOneGLQuad)) - .begin(), - shapeFunctionValuesTransposedDevice.begin()); - - dftfe::utils::MemoryStorage - shapeFunctionGradientValuesXTransposedDevice; - dftfe::utils::MemoryStorage - shapeFunctionGradientValuesYTransposedDevice; - dftfe::utils::MemoryStorage - shapeFunctionGradientValuesZTransposedDevice; - - if (isEvaluateGradRho) - { - shapeFunctionGradientValuesXTransposedDevice.resize( - cellsBlockSize * numNodesPerElement * numQuadPoints, 0); - shapeFunctionGradientValuesXTransposedDevice.setValue(0); - - shapeFunctionGradientValuesYTransposedDevice.resize( - cellsBlockSize * numNodesPerElement * numQuadPoints, 0); - shapeFunctionGradientValuesYTransposedDevice.setValue(0); - - shapeFunctionGradientValuesZTransposedDevice.resize( - cellsBlockSize * numNodesPerElement * numQuadPoints, 0); - 
shapeFunctionGradientValuesZTransposedDevice.setValue(0); - } - - dftfe::utils::MemoryStorage - partialOccupVec(BVec, zero); - dftfe::utils::MemoryStorage - partialOccupVecDevice(BVec, zero); - - distributedDeviceVec &deviceFlattenedArrayBlock = - operatorMatrix.getParallelChebyBlockVectorDevice(); - - NumberType *cellWaveFunctionMatrix = - (operatorMatrix.getCellWaveFunctionMatrix()).begin(); - - typename dealii::DoFHandler<3>::active_cell_iterator cell = - dofHandler.begin_active(); - typename dealii::DoFHandler<3>::active_cell_iterator endc = - dofHandler.end(); - - std::vector rhoValuesFlattened(totalLocallyOwnedCells * - numQuadPoints, - 0.0); - std::vector gradRhoValuesFlattened(totalLocallyOwnedCells * - numQuadPoints * 3, - 0.0); - std::vector rhoValuesSpinPolarizedFlattened( - totalLocallyOwnedCells * numQuadPoints * 2, 0.0); - std::vector gradRhoValuesSpinPolarizedFlattened( - totalLocallyOwnedCells * numQuadPoints * 6, 0.0); - - for (unsigned int spinIndex = 0; - spinIndex < (1 + dftParams.spinPolarized); - ++spinIndex) - { - for (unsigned int kPoint = 0; kPoint < kPointWeights.size(); ++kPoint) - { - rhoDevice.setValue(zero); - rhoWfcContributionsDevice.setValue(zero); - gradRhoDeviceX.setValue(zero); - gradRhoDeviceY.setValue(zero); - gradRhoDeviceZ.setValue(zero); - gradRhoWfcContributionsDeviceX.setValue(zero); - gradRhoWfcContributionsDeviceY.setValue(zero); - gradRhoWfcContributionsDeviceZ.setValue(zero); - - for (unsigned int jvec = 0; jvec < totalNumWaveFunctions; - jvec += BVec) - { - if ((jvec + BVec) <= - bandGroupLowHighPlusOneIndices[2 * bandGroupTaskId + - 1] && - (jvec + BVec) > - bandGroupLowHighPlusOneIndices[2 * bandGroupTaskId]) - { - if (spectrumSplit) - { - partialOccupVecDevice.setValue(kPointWeights[kPoint] * - spinPolarizedFactor); - } - else - { - if (dftParams.constraintMagnetization) - { - const double fermiEnergyConstraintMag = - spinIndex == 0 ? 
fermiEnergyUp : - fermiEnergyDown; - for (unsigned int iEigenVec = 0; iEigenVec < BVec; - ++iEigenVec) - { - if (eigenValues[kPoint] - [totalNumWaveFunctions * - spinIndex + - jvec + iEigenVec] > - fermiEnergyConstraintMag) - *(partialOccupVec.begin() + iEigenVec) = 0; - else - *(partialOccupVec.begin() + iEigenVec) = - kPointWeights[kPoint] * - spinPolarizedFactor; - } - } - else - { - for (unsigned int iEigenVec = 0; iEigenVec < BVec; - ++iEigenVec) - { - *(partialOccupVec.begin() + iEigenVec) = - dftUtils::getPartialOccupancy( - eigenValues[kPoint] - [totalNumWaveFunctions * - spinIndex + - jvec + iEigenVec], - fermiEnergy, - C_kb, - dftParams.TVal) * - kPointWeights[kPoint] * spinPolarizedFactor; - } - } - - partialOccupVec - .template copyTo( - partialOccupVecDevice); - } - - dftfe::utils::deviceKernelsGeneric:: - stridedCopyToBlockConstantStride( - BVec, - totalNumWaveFunctions, - numLocalDofs, - jvec, - X + numLocalDofs * totalNumWaveFunctions * - ((dftParams.spinPolarized + 1) * kPoint + - spinIndex), - deviceFlattenedArrayBlock.begin()); - - - deviceFlattenedArrayBlock.updateGhostValues(); - - (operatorMatrix.getOverloadedConstraintMatrix()) - ->distribute(deviceFlattenedArrayBlock, BVec); - - for (int iblock = 0; iblock < (numCellBlocks + 1); - iblock++) - { - const unsigned int currentCellsBlockSize = - (iblock == numCellBlocks) ? 
remCellBlockSize : - cellsBlockSize; - if (currentCellsBlockSize > 0) - { - const unsigned int startingCellId = - iblock * cellsBlockSize; - - dftfe::utils::deviceKernelsGeneric:: - stridedCopyToBlock( - BVec, - currentCellsBlockSize * numNodesPerElement, - deviceFlattenedArrayBlock.begin(), - cellWaveFunctionMatrix, - (operatorMatrix - .getFlattenedArrayCellLocalProcIndexIdMap()) - .begin() + - startingCellId * numNodesPerElement); - - NumberType scalarCoeffAlpha = 1.0; - NumberType scalarCoeffBeta = 0; - int strideA = BVec * numNodesPerElement; - int strideB = 0; - int strideC = BVec * numQuadPoints; - - dftfe::utils::deviceBlasWrapper:: - gemmStridedBatched( - operatorMatrix.getDeviceBlasHandle(), - dftfe::utils::DEVICEBLAS_OP_N, - dftfe::utils::DEVICEBLAS_OP_N, - BVec, - numQuadPoints, - numNodesPerElement, - &scalarCoeffAlpha, - cellWaveFunctionMatrix, - BVec, - strideA, - shapeFunctionValuesTransposedDevice.begin(), - numNodesPerElement, - strideB, - &scalarCoeffBeta, - rhoWfcContributionsDevice.begin(), - BVec, - strideC, - currentCellsBlockSize); - - - if (isEvaluateGradRho) - { - strideB = numNodesPerElement * numQuadPoints; - - - dftfe::utils::deviceKernelsGeneric:: - copyValueType1ArrToValueType2Arr( - currentCellsBlockSize * - numNodesPerElement * numQuadPoints, - (operatorMatrix - .getShapeFunctionGradientValuesXTransposed()) - .begin() + - startingCellId * numNodesPerElement * - numQuadPoints, - shapeFunctionGradientValuesXTransposedDevice - .begin()); - - dftfe::utils::deviceKernelsGeneric:: - copyValueType1ArrToValueType2Arr( - currentCellsBlockSize * - numNodesPerElement * numQuadPoints, - (operatorMatrix - .getShapeFunctionGradientValuesYTransposed()) - .begin() + - startingCellId * numNodesPerElement * - numQuadPoints, - shapeFunctionGradientValuesYTransposedDevice - .begin()); - - dftfe::utils::deviceKernelsGeneric:: - copyValueType1ArrToValueType2Arr( - currentCellsBlockSize * - numNodesPerElement * numQuadPoints, - (operatorMatrix - 
.getShapeFunctionGradientValuesZTransposed()) - .begin() + - startingCellId * numNodesPerElement * - numQuadPoints, - shapeFunctionGradientValuesZTransposedDevice - .begin()); - - dftfe::utils::deviceBlasWrapper:: - gemmStridedBatched( - operatorMatrix.getDeviceBlasHandle(), - dftfe::utils::DEVICEBLAS_OP_N, - dftfe::utils::DEVICEBLAS_OP_N, - BVec, - numQuadPoints, - numNodesPerElement, - &scalarCoeffAlpha, - cellWaveFunctionMatrix, - BVec, - strideA, - shapeFunctionGradientValuesXTransposedDevice - .begin(), - numNodesPerElement, - strideB, - &scalarCoeffBeta, - gradRhoWfcContributionsDeviceX.begin(), - BVec, - strideC, - currentCellsBlockSize); - - - dftfe::utils::deviceBlasWrapper:: - gemmStridedBatched( - operatorMatrix.getDeviceBlasHandle(), - dftfe::utils::DEVICEBLAS_OP_N, - dftfe::utils::DEVICEBLAS_OP_N, - BVec, - numQuadPoints, - numNodesPerElement, - &scalarCoeffAlpha, - cellWaveFunctionMatrix, - BVec, - strideA, - shapeFunctionGradientValuesYTransposedDevice - .begin(), - numNodesPerElement, - strideB, - &scalarCoeffBeta, - gradRhoWfcContributionsDeviceY.begin(), - BVec, - strideC, - currentCellsBlockSize); - - dftfe::utils::deviceBlasWrapper:: - gemmStridedBatched( - operatorMatrix.getDeviceBlasHandle(), - dftfe::utils::DEVICEBLAS_OP_N, - dftfe::utils::DEVICEBLAS_OP_N, - BVec, - numQuadPoints, - numNodesPerElement, - &scalarCoeffAlpha, - cellWaveFunctionMatrix, - BVec, - strideA, - shapeFunctionGradientValuesZTransposedDevice - .begin(), - numNodesPerElement, - strideB, - &scalarCoeffBeta, - gradRhoWfcContributionsDeviceZ.begin(), - BVec, - strideC, - currentCellsBlockSize); - } - - -#ifdef DFTFE_WITH_DEVICE_LANG_CUDA - computeRhoGradRhoFromInterpolatedValues<<< - (BVec + (dftfe::utils::DEVICE_BLOCK_SIZE - 1)) / - dftfe::utils::DEVICE_BLOCK_SIZE * - numQuadPoints * currentCellsBlockSize, - dftfe::utils::DEVICE_BLOCK_SIZE>>>( - currentCellsBlockSize * numQuadPoints * BVec, - dftfe::utils::makeDataTypeDeviceCompatible( - rhoWfcContributionsDevice.begin()), 
- dftfe::utils::makeDataTypeDeviceCompatible( - gradRhoWfcContributionsDeviceX.begin()), - dftfe::utils::makeDataTypeDeviceCompatible( - gradRhoWfcContributionsDeviceY.begin()), - dftfe::utils::makeDataTypeDeviceCompatible( - gradRhoWfcContributionsDeviceZ.begin()), - isEvaluateGradRho); -#elif DFTFE_WITH_DEVICE_LANG_HIP - hipLaunchKernelGGL( - computeRhoGradRhoFromInterpolatedValues, - (BVec + (dftfe::utils::DEVICE_BLOCK_SIZE - 1)) / - dftfe::utils::DEVICE_BLOCK_SIZE * - numQuadPoints * currentCellsBlockSize, - dftfe::utils::DEVICE_BLOCK_SIZE, - 0, - 0, - currentCellsBlockSize * numQuadPoints * BVec, - dftfe::utils::makeDataTypeDeviceCompatible( - rhoWfcContributionsDevice.begin()), - dftfe::utils::makeDataTypeDeviceCompatible( - gradRhoWfcContributionsDeviceX.begin()), - dftfe::utils::makeDataTypeDeviceCompatible( - gradRhoWfcContributionsDeviceY.begin()), - dftfe::utils::makeDataTypeDeviceCompatible( - gradRhoWfcContributionsDeviceZ.begin()), - isEvaluateGradRho); -#endif - - dftfe::utils::deviceBlasWrapper::gemm( - operatorMatrix.getDeviceBlasHandle(), - dftfe::utils::DEVICEBLAS_OP_N, - dftfe::utils::DEVICEBLAS_OP_N, - 1, - currentCellsBlockSize * numQuadPoints, - BVec, - &scalarCoeffAlphaRho, - partialOccupVecDevice.begin(), - 1, - rhoWfcContributionsDevice.begin(), - BVec, - &scalarCoeffBetaRho, - rhoDevice.begin() + - startingCellId * numQuadPoints, - 1); - - - if (isEvaluateGradRho) - { - dftfe::utils::deviceBlasWrapper::gemm( - operatorMatrix.getDeviceBlasHandle(), - dftfe::utils::DEVICEBLAS_OP_N, - dftfe::utils::DEVICEBLAS_OP_N, - 1, - currentCellsBlockSize * numQuadPoints, - BVec, - &scalarCoeffAlphaGradRho, - partialOccupVecDevice.begin(), - 1, - gradRhoWfcContributionsDeviceX.begin(), - BVec, - &scalarCoeffBetaGradRho, - gradRhoDeviceX.begin() + - startingCellId * numQuadPoints, - 1); - - - dftfe::utils::deviceBlasWrapper::gemm( - operatorMatrix.getDeviceBlasHandle(), - dftfe::utils::DEVICEBLAS_OP_N, - dftfe::utils::DEVICEBLAS_OP_N, - 1, - 
currentCellsBlockSize * numQuadPoints, - BVec, - &scalarCoeffAlphaGradRho, - partialOccupVecDevice.begin(), - 1, - gradRhoWfcContributionsDeviceY.begin(), - BVec, - &scalarCoeffBetaGradRho, - gradRhoDeviceY.begin() + - startingCellId * numQuadPoints, - 1); - - dftfe::utils::deviceBlasWrapper::gemm( - operatorMatrix.getDeviceBlasHandle(), - dftfe::utils::DEVICEBLAS_OP_N, - dftfe::utils::DEVICEBLAS_OP_N, - 1, - currentCellsBlockSize * numQuadPoints, - BVec, - &scalarCoeffAlphaGradRho, - partialOccupVecDevice.begin(), - 1, - gradRhoWfcContributionsDeviceZ.begin(), - BVec, - &scalarCoeffBetaGradRho, - gradRhoDeviceZ.begin() + - startingCellId * numQuadPoints, - 1); - } - } // non-trivial cell block check - } // cells block loop - } // band parallelizatoin check - } // wave function block loop - - if (spectrumSplit) - for (unsigned int jvec = 0; jvec < Nfr; jvec += BVec) - if ((jvec + totalNumWaveFunctions - Nfr + BVec) <= - bandGroupLowHighPlusOneIndices[2 * bandGroupTaskId + - 1] && - (jvec + totalNumWaveFunctions - Nfr + BVec) > - bandGroupLowHighPlusOneIndices[2 * bandGroupTaskId]) - { - if (dftParams.constraintMagnetization) - { - const double fermiEnergyConstraintMag = - spinIndex == 0 ? 
fermiEnergyUp : fermiEnergyDown; - for (unsigned int iEigenVec = 0; iEigenVec < BVec; - ++iEigenVec) - { - if (eigenValues[kPoint] - [totalNumWaveFunctions * - spinIndex + - (totalNumWaveFunctions - Nfr) + - jvec + iEigenVec] > - fermiEnergyConstraintMag) - *(partialOccupVec.begin() + iEigenVec) = - -kPointWeights[kPoint] * spinPolarizedFactor; - else - *(partialOccupVec.begin() + iEigenVec) = 0; - } - } - else - { - for (unsigned int iEigenVec = 0; iEigenVec < BVec; - ++iEigenVec) - { - *(partialOccupVec.begin() + iEigenVec) = - (dftUtils::getPartialOccupancy( - eigenValues[kPoint] - [totalNumWaveFunctions * - spinIndex + - (totalNumWaveFunctions - Nfr) + - jvec + iEigenVec], - fermiEnergy, - C_kb, - dftParams.TVal) - - 1.0) * - kPointWeights[kPoint] * spinPolarizedFactor; - } - } - - partialOccupVec - .template copyTo( - partialOccupVecDevice); - - - dftfe::utils::deviceKernelsGeneric:: - stridedCopyToBlockConstantStride( - BVec, - Nfr, - numLocalDofs, - jvec, - XFrac + numLocalDofs * Nfr * - ((dftParams.spinPolarized + 1) * kPoint + - spinIndex), - deviceFlattenedArrayBlock.begin()); - - deviceFlattenedArrayBlock.updateGhostValues(); - - (operatorMatrix.getOverloadedConstraintMatrix()) - ->distribute(deviceFlattenedArrayBlock, BVec); - - for (int iblock = 0; iblock < (numCellBlocks + 1); - iblock++) - { - const unsigned int currentCellsBlockSize = - (iblock == numCellBlocks) ? 
remCellBlockSize : - cellsBlockSize; - if (currentCellsBlockSize > 0) - { - const unsigned int startingCellId = - iblock * cellsBlockSize; - - dftfe::utils::deviceKernelsGeneric:: - stridedCopyToBlock( - BVec, - currentCellsBlockSize * numNodesPerElement, - deviceFlattenedArrayBlock.begin(), - cellWaveFunctionMatrix, - (operatorMatrix - .getFlattenedArrayCellLocalProcIndexIdMap()) - .begin() + - startingCellId * numNodesPerElement); - - NumberType scalarCoeffAlpha = 1.0; - NumberType scalarCoeffBeta = 0; - int strideA = BVec * numNodesPerElement; - int strideB = 0; - int strideC = BVec * numQuadPoints; - - - dftfe::utils::deviceBlasWrapper:: - gemmStridedBatched( - operatorMatrix.getDeviceBlasHandle(), - dftfe::utils::DEVICEBLAS_OP_N, - dftfe::utils::DEVICEBLAS_OP_N, - BVec, - numQuadPoints, - numNodesPerElement, - &scalarCoeffAlpha, - cellWaveFunctionMatrix, - BVec, - strideA, - shapeFunctionValuesTransposedDevice.begin(), - numNodesPerElement, - strideB, - &scalarCoeffBeta, - rhoWfcContributionsDevice.begin(), - BVec, - strideC, - currentCellsBlockSize); - - - - if (isEvaluateGradRho) - { - strideB = numNodesPerElement * numQuadPoints; - - dftfe::utils::deviceKernelsGeneric:: - copyValueType1ArrToValueType2Arr( - currentCellsBlockSize * - numNodesPerElement * numQuadPoints, - (operatorMatrix - .getShapeFunctionGradientValuesXTransposed()) - .begin() + - startingCellId * numNodesPerElement * - numQuadPoints, - shapeFunctionGradientValuesXTransposedDevice - .begin()); - - dftfe::utils::deviceKernelsGeneric:: - copyValueType1ArrToValueType2Arr( - currentCellsBlockSize * - numNodesPerElement * numQuadPoints, - (operatorMatrix - .getShapeFunctionGradientValuesYTransposed()) - .begin() + - startingCellId * numNodesPerElement * - numQuadPoints, - shapeFunctionGradientValuesYTransposedDevice - .begin()); - - dftfe::utils::deviceKernelsGeneric:: - copyValueType1ArrToValueType2Arr( - currentCellsBlockSize * - numNodesPerElement * numQuadPoints, - (operatorMatrix - 
.getShapeFunctionGradientValuesZTransposed()) - .begin() + - startingCellId * numNodesPerElement * - numQuadPoints, - shapeFunctionGradientValuesZTransposedDevice - .begin()); - - dftfe::utils::deviceBlasWrapper:: - gemmStridedBatched( - operatorMatrix.getDeviceBlasHandle(), - dftfe::utils::DEVICEBLAS_OP_N, - dftfe::utils::DEVICEBLAS_OP_N, - BVec, - numQuadPoints, - numNodesPerElement, - &scalarCoeffAlpha, - cellWaveFunctionMatrix, - BVec, - strideA, - shapeFunctionGradientValuesXTransposedDevice - .begin(), - numNodesPerElement, - strideB, - &scalarCoeffBeta, - gradRhoWfcContributionsDeviceX.begin(), - BVec, - strideC, - currentCellsBlockSize); - - - dftfe::utils::deviceBlasWrapper:: - gemmStridedBatched( - operatorMatrix.getDeviceBlasHandle(), - dftfe::utils::DEVICEBLAS_OP_N, - dftfe::utils::DEVICEBLAS_OP_N, - BVec, - numQuadPoints, - numNodesPerElement, - &scalarCoeffAlpha, - cellWaveFunctionMatrix, - BVec, - strideA, - shapeFunctionGradientValuesYTransposedDevice - .begin(), - numNodesPerElement, - strideB, - &scalarCoeffBeta, - gradRhoWfcContributionsDeviceY.begin(), - BVec, - strideC, - currentCellsBlockSize); - - dftfe::utils::deviceBlasWrapper:: - gemmStridedBatched( - operatorMatrix.getDeviceBlasHandle(), - dftfe::utils::DEVICEBLAS_OP_N, - dftfe::utils::DEVICEBLAS_OP_N, - BVec, - numQuadPoints, - numNodesPerElement, - &scalarCoeffAlpha, - cellWaveFunctionMatrix, - BVec, - strideA, - shapeFunctionGradientValuesZTransposedDevice - .begin(), - numNodesPerElement, - strideB, - &scalarCoeffBeta, - gradRhoWfcContributionsDeviceZ.begin(), - BVec, - strideC, - currentCellsBlockSize); - } - - -#ifdef DFTFE_WITH_DEVICE_LANG_CUDA - computeRhoGradRhoFromInterpolatedValues<<< - (BVec + (dftfe::utils::DEVICE_BLOCK_SIZE - 1)) / - dftfe::utils::DEVICE_BLOCK_SIZE * - numQuadPoints * currentCellsBlockSize, - dftfe::utils::DEVICE_BLOCK_SIZE>>>( - currentCellsBlockSize * numQuadPoints * BVec, - dftfe::utils::makeDataTypeDeviceCompatible( - rhoWfcContributionsDevice.begin()), 
- dftfe::utils::makeDataTypeDeviceCompatible( - gradRhoWfcContributionsDeviceX.begin()), - dftfe::utils::makeDataTypeDeviceCompatible( - gradRhoWfcContributionsDeviceY.begin()), - dftfe::utils::makeDataTypeDeviceCompatible( - gradRhoWfcContributionsDeviceZ.begin()), - isEvaluateGradRho); -#elif DFTFE_WITH_DEVICE_LANG_HIP - hipLaunchKernelGGL( - computeRhoGradRhoFromInterpolatedValues, - (BVec + (dftfe::utils::DEVICE_BLOCK_SIZE - 1)) / - dftfe::utils::DEVICE_BLOCK_SIZE * - numQuadPoints * currentCellsBlockSize, - dftfe::utils::DEVICE_BLOCK_SIZE, - 0, - 0, - currentCellsBlockSize * numQuadPoints * BVec, - dftfe::utils::makeDataTypeDeviceCompatible( - rhoWfcContributionsDevice.begin()), - dftfe::utils::makeDataTypeDeviceCompatible( - gradRhoWfcContributionsDeviceX.begin()), - dftfe::utils::makeDataTypeDeviceCompatible( - gradRhoWfcContributionsDeviceY.begin()), - dftfe::utils::makeDataTypeDeviceCompatible( - gradRhoWfcContributionsDeviceZ.begin()), - isEvaluateGradRho); -#endif - - dftfe::utils::deviceBlasWrapper::gemm( - operatorMatrix.getDeviceBlasHandle(), - dftfe::utils::DEVICEBLAS_OP_N, - dftfe::utils::DEVICEBLAS_OP_N, - 1, - currentCellsBlockSize * numQuadPoints, - BVec, - &scalarCoeffAlphaRho, - partialOccupVecDevice.begin(), - 1, - rhoWfcContributionsDevice.begin(), - BVec, - &scalarCoeffBetaRho, - rhoDevice.begin() + - startingCellId * numQuadPoints, - 1); - - - if (isEvaluateGradRho) - { - dftfe::utils::deviceBlasWrapper::gemm( - operatorMatrix.getDeviceBlasHandle(), - dftfe::utils::DEVICEBLAS_OP_N, - dftfe::utils::DEVICEBLAS_OP_N, - 1, - currentCellsBlockSize * numQuadPoints, - BVec, - &scalarCoeffAlphaGradRho, - partialOccupVecDevice.begin(), - 1, - gradRhoWfcContributionsDeviceX.begin(), - BVec, - &scalarCoeffBetaGradRho, - gradRhoDeviceX.begin() + - startingCellId * numQuadPoints, - 1); - - - dftfe::utils::deviceBlasWrapper::gemm( - operatorMatrix.getDeviceBlasHandle(), - dftfe::utils::DEVICEBLAS_OP_N, - dftfe::utils::DEVICEBLAS_OP_N, - 1, - 
currentCellsBlockSize * numQuadPoints, - BVec, - &scalarCoeffAlphaGradRho, - partialOccupVecDevice.begin(), - 1, - gradRhoWfcContributionsDeviceY.begin(), - BVec, - &scalarCoeffBetaGradRho, - gradRhoDeviceY.begin() + - startingCellId * numQuadPoints, - 1); - - dftfe::utils::deviceBlasWrapper::gemm( - operatorMatrix.getDeviceBlasHandle(), - dftfe::utils::DEVICEBLAS_OP_N, - dftfe::utils::DEVICEBLAS_OP_N, - 1, - currentCellsBlockSize * numQuadPoints, - BVec, - &scalarCoeffAlphaGradRho, - partialOccupVecDevice.begin(), - 1, - gradRhoWfcContributionsDeviceZ.begin(), - BVec, - &scalarCoeffBetaGradRho, - gradRhoDeviceZ.begin() + - startingCellId * numQuadPoints, - 1); - } - } // non-tivial cells block - } // cells block loop - } // spectrum split block - - - // do memcopy to host - rhoDevice.template copyTo( - rhoHost.begin(), totalLocallyOwnedCells * numQuadPoints, 0, 0); - - if (isEvaluateGradRho) - { - gradRhoDeviceX - .template copyTo( - gradRhoHostX.begin(), - totalLocallyOwnedCells * numQuadPoints, - 0, - 0); - - gradRhoDeviceY - .template copyTo( - gradRhoHostY.begin(), - totalLocallyOwnedCells * numQuadPoints, - 0, - 0); - - gradRhoDeviceZ - .template copyTo( - gradRhoHostZ.begin(), - totalLocallyOwnedCells * numQuadPoints, - 0, - 0); - } - - for (int icell = 0; icell < totalLocallyOwnedCells; icell++) - for (unsigned int iquad = 0; iquad < numQuadPoints; ++iquad) - { - rhoValuesFlattened[icell * numQuadPoints + iquad] += - dftfe::utils::realPart( - *(rhoHost.begin() + icell * numQuadPoints + iquad)); - } - - if (isEvaluateGradRho) - for (int icell = 0; icell < totalLocallyOwnedCells; icell++) - for (unsigned int iquad = 0; iquad < numQuadPoints; ++iquad) - { - gradRhoValuesFlattened[icell * numQuadPoints * 3 + - 3 * iquad + 0] += - dftfe::utils::realPart(*(gradRhoHostX.begin() + - icell * numQuadPoints + - iquad)); - gradRhoValuesFlattened[icell * numQuadPoints * 3 + - 3 * iquad + 1] += - dftfe::utils::realPart(*(gradRhoHostY.begin() + - icell * numQuadPoints + - 
iquad)); - gradRhoValuesFlattened[icell * numQuadPoints * 3 + - 3 * iquad + 2] += - dftfe::utils::realPart(*(gradRhoHostZ.begin() + - icell * numQuadPoints + - iquad)); - } - if (dftParams.spinPolarized == 1) - { - for (int icell = 0; icell < totalLocallyOwnedCells; icell++) - for (unsigned int iquad = 0; iquad < numQuadPoints; ++iquad) - { - rhoValuesSpinPolarizedFlattened - [icell * numQuadPoints * 2 + iquad * 2 + spinIndex] += - dftfe::utils::realPart( - *(rhoHost.begin() + icell * numQuadPoints + iquad)); - } - - if (isEvaluateGradRho) - for (int icell = 0; icell < totalLocallyOwnedCells; icell++) - for (unsigned int iquad = 0; iquad < numQuadPoints; - ++iquad) - { - gradRhoValuesSpinPolarizedFlattened - [icell * numQuadPoints * 6 + iquad * 6 + - spinIndex * 3] += - dftfe::utils::realPart(*(gradRhoHostX.begin() + - icell * numQuadPoints + - iquad)); - gradRhoValuesSpinPolarizedFlattened - [icell * numQuadPoints * 6 + iquad * 6 + - spinIndex * 3 + 1] += - dftfe::utils::realPart(*(gradRhoHostY.begin() + - icell * numQuadPoints + - iquad)); - gradRhoValuesSpinPolarizedFlattened - [icell * numQuadPoints * 6 + iquad * 6 + - spinIndex * 3 + 2] += - dftfe::utils::realPart(*(gradRhoHostZ.begin() + - icell * numQuadPoints + - iquad)); - } - } - } // kpoint loop - } // spin index - - - // gather density from all inter communicators - if (dealii::Utilities::MPI::n_mpi_processes(interpoolcomm) > 1) - { - dealii::Utilities::MPI::sum(rhoValuesFlattened, - interpoolcomm, - rhoValuesFlattened); - - if (isEvaluateGradRho) - dealii::Utilities::MPI::sum(gradRhoValuesFlattened, - interpoolcomm, - gradRhoValuesFlattened); - - - - if (dftParams.spinPolarized == 1) - { - dealii::Utilities::MPI::sum(rhoValuesSpinPolarizedFlattened, - interpoolcomm, - rhoValuesSpinPolarizedFlattened); - - if (isEvaluateGradRho) - dealii::Utilities::MPI::sum( - gradRhoValuesSpinPolarizedFlattened, - interpoolcomm, - gradRhoValuesSpinPolarizedFlattened); - } - } - - if 
(dealii::Utilities::MPI::n_mpi_processes(interBandGroupComm) > 1) - { - dealii::Utilities::MPI::sum(rhoValuesFlattened, - interBandGroupComm, - rhoValuesFlattened); - - if (isEvaluateGradRho) - dealii::Utilities::MPI::sum(gradRhoValuesFlattened, - interBandGroupComm, - gradRhoValuesFlattened); - - - if (dftParams.spinPolarized == 1) - { - dealii::Utilities::MPI::sum(rhoValuesSpinPolarizedFlattened, - interBandGroupComm, - rhoValuesSpinPolarizedFlattened); - - if (isEvaluateGradRho) - dealii::Utilities::MPI::sum( - gradRhoValuesSpinPolarizedFlattened, - interBandGroupComm, - gradRhoValuesSpinPolarizedFlattened); - } - } - - - unsigned int iElem = 0; - cell = dofHandler.begin_active(); - endc = dofHandler.end(); - for (; cell != endc; ++cell) - if (cell->is_locally_owned()) - { - const dealii::CellId cellid = cell->id(); - - std::vector dummy(1); - std::vector &tempRhoQuads = (*rhoValues)[cellid]; - std::vector &tempGradRhoQuads = - isEvaluateGradRho ? (*gradRhoValues)[cellid] : dummy; - - std::vector &tempRhoQuadsSP = - (dftParams.spinPolarized == 1) ? - (*rhoValuesSpinPolarized)[cellid] : - dummy; - std::vector &tempGradRhoQuadsSP = - ((dftParams.spinPolarized == 1) && isEvaluateGradRho) ? 
- (*gradRhoValuesSpinPolarized)[cellid] : - dummy; - - if (dftParams.spinPolarized == 1) - { - for (unsigned int q = 0; q < numQuadPoints; ++q) - { - tempRhoQuadsSP[2 * q + 0] = - rhoValuesSpinPolarizedFlattened[iElem * numQuadPoints * - 2 + - q * 2 + 0]; - - tempRhoQuadsSP[2 * q + 1] = - rhoValuesSpinPolarizedFlattened[iElem * numQuadPoints * - 2 + - q * 2 + 1]; - } - - if (isEvaluateGradRho) - for (unsigned int q = 0; q < numQuadPoints; ++q) - { - tempGradRhoQuadsSP[6 * q + 0] = - gradRhoValuesSpinPolarizedFlattened - [iElem * numQuadPoints * 6 + 6 * q]; - tempGradRhoQuadsSP[6 * q + 1] = - gradRhoValuesSpinPolarizedFlattened - [iElem * numQuadPoints * 6 + 6 * q + 1]; - tempGradRhoQuadsSP[6 * q + 2] = - gradRhoValuesSpinPolarizedFlattened - [iElem * numQuadPoints * 6 + 6 * q + 2]; - tempGradRhoQuadsSP[6 * q + 3] = - gradRhoValuesSpinPolarizedFlattened - [iElem * numQuadPoints * 6 + 6 * q + 3]; - tempGradRhoQuadsSP[6 * q + 4] = - gradRhoValuesSpinPolarizedFlattened - [iElem * numQuadPoints * 6 + 6 * q + 4]; - tempGradRhoQuadsSP[6 * q + 5] = - gradRhoValuesSpinPolarizedFlattened - [iElem * numQuadPoints * 6 + 6 * q + 5]; - } - } - - for (unsigned int q = 0; q < numQuadPoints; ++q) - tempRhoQuads[q] = rhoValuesFlattened[iElem * numQuadPoints + q]; - - - if (isEvaluateGradRho) - for (unsigned int q = 0; q < numQuadPoints; ++q) - { - tempGradRhoQuads[3 * q] = - gradRhoValuesFlattened[iElem * numQuadPoints * 3 + q * 3]; - tempGradRhoQuads[3 * q + 1] = - gradRhoValuesFlattened[iElem * numQuadPoints * 3 + q * 3 + - 1]; - tempGradRhoQuads[3 * q + 2] = - gradRhoValuesFlattened[iElem * numQuadPoints * 3 + q * 3 + - 2]; - } - iElem++; - } - - dftfe::utils::deviceSynchronize(); - MPI_Barrier(mpiCommParent); - device_time = MPI_Wtime() - device_time; - - if (this_process == 0 && dftParams.verbosity >= 2) - std::cout << "Time for compute rho on Device: " << device_time - << std::endl; - } - - template void - computeRhoFromPSI( - const dataTypes::number * X, - const 
dataTypes::number * XFrac, - const unsigned int totalNumWaveFunctions, - const unsigned int Nfr, - const unsigned int numLocalDofs, - const std::vector> & eigenValues, - const double fermiEnergy, - const double fermiEnergyUp, - const double fermiEnergyDown, - operatorDFTDeviceClass & operatorMatrix, - const unsigned int matrixFreeDofhandlerIndex, - const dealii::DoFHandler<3> & dofHandler, - const unsigned int totalLocallyOwnedCells, - const unsigned int numNodesPerElement, - const unsigned int numQuadPoints, - const std::vector & kPointWeights, - std::map> *rhoValues, - std::map> *gradRhoValues, - std::map> *rhoValuesSpinPolarized, - std::map> *gradRhoValuesSpinPolarized, - const bool isEvaluateGradRho, - const MPI_Comm & mpiCommParent, - const MPI_Comm & interpoolcomm, - const MPI_Comm & interBandGroupComm, - const dftParameters & dftParams, - const bool spectrumSplit, - const bool use2pPlusOneGLQuad); - } // namespace Device -} // namespace dftfe diff --git a/src/dft/densityCalculatorDeviceKernels.cc b/src/dft/densityCalculatorDeviceKernels.cc new file mode 100644 index 000000000..8b8ac25c1 --- /dev/null +++ b/src/dft/densityCalculatorDeviceKernels.cc @@ -0,0 +1,247 @@ +// --------------------------------------------------------------------- +// +// Copyright (c) 2017-2022 The Regents of the University of Michigan and DFT-FE +// authors. +// +// This file is part of the DFT-FE code. +// +// The DFT-FE code is free software; you can use it, redistribute +// it, and/or modify it under the terms of the GNU Lesser General +// Public License as published by the Free Software Foundation; either +// version 2.1 of the License, or (at your option) any later version. +// The full text of the license can be found in the file LICENSE at +// the top level of the DFT-FE distribution. 
+// +// --------------------------------------------------------------------- +// +// @author Sambit Das +// + +// source file for electron density related computations +#include +#include +#include +#include +#include +#include +#include +#include + +namespace dftfe +{ + namespace + { + /* Device kernel (real-valued wavefunctions). For every flattened entry index below numVectors * nQuadsPerCell * numCells it stores psi * psi into rhoCellsWfcContributions[index], where psi = wfcContributions[index]; when isEvaluateGradRho is true it also stores 2 * psi * gradPsi for the X, Y and Z components into gradRhoCellsWfcContributions. Each thread starts at its global thread id and advances by blockDim.x * gridDim.x, so all entries are visited whatever the launch size. The entry index is decomposed as iCell * numEntriesPerCell, then iQuad * numVectors, then iVec (vector index fastest). Gradient input is read per cell in [component][quad][vector] order; gradient output is written per cell in [quad][component][vector] order. */ __global__ void + computeRhoGradRhoFromInterpolatedValues( + const unsigned int numVectors, + const unsigned int numCells, + const unsigned int nQuadsPerCell, + double * wfcContributions, + double * gradwfcContributions, + double * rhoCellsWfcContributions, + double * gradRhoCellsWfcContributions, + const bool isEvaluateGradRho) + { + const unsigned int globalThreadId = blockIdx.x * blockDim.x + threadIdx.x; + const unsigned int numEntriesPerCell = numVectors * nQuadsPerCell; + const unsigned int numberEntries = numEntriesPerCell * numCells; + + for (unsigned int index = globalThreadId; index < numberEntries; + index += blockDim.x * gridDim.x) + { + const double psi = wfcContributions[index]; + rhoCellsWfcContributions[index] = psi * psi; + + if (isEvaluateGradRho) + { + unsigned int iCell = index / numEntriesPerCell; + unsigned int intraCellIndex = index - iCell * numEntriesPerCell; + unsigned int iQuad = intraCellIndex / numVectors; + unsigned int iVec = intraCellIndex - iQuad * numVectors; + const double gradPsiX = //[iVec * numCells * numVectors + + 0] + gradwfcContributions[intraCellIndex + + numEntriesPerCell * 3 * iCell]; + gradRhoCellsWfcContributions[iVec + 3 * iQuad * numVectors + + numEntriesPerCell * 3 * iCell] = + 2.0 * psi * gradPsiX; + + const double gradPsiY = + gradwfcContributions[intraCellIndex + numEntriesPerCell + + numEntriesPerCell * 3 * iCell]; + gradRhoCellsWfcContributions[iVec + numVectors + + 3 * iQuad * numVectors + + numEntriesPerCell * 3 * iCell] = + 2.0 * psi * gradPsiY; + + const double gradPsiZ = + gradwfcContributions[intraCellIndex + 2 * numEntriesPerCell + + numEntriesPerCell * 3 * iCell]; + gradRhoCellsWfcContributions[iVec + 2 * numVectors + + 
3 * iQuad * numVectors + + numEntriesPerCell * 3 * iCell] = + 2.0 * psi * gradPsiZ; + } + } + } + + /* Complex-valued overload: same index decomposition and the same input/output layouts as the real-valued kernel, but psi and its gradient components are deviceDoubleComplex values. The density contribution is the sum of psi.x squared and psi.y squared; each gradient contribution is twice the sum of psi.x * g.x and psi.y * g.y for the gradient component g. NOTE(review): x and y are presumably the real and imaginary parts - confirm against the deviceDoubleComplex definition. */ __global__ void + computeRhoGradRhoFromInterpolatedValues( + const unsigned int numVectors, + const unsigned int numCells, + const unsigned int nQuadsPerCell, + dftfe::utils::deviceDoubleComplex *wfcContributions, + dftfe::utils::deviceDoubleComplex *gradwfcContributions, + double * rhoCellsWfcContributions, + double * gradRhoCellsWfcContributions, + const bool isEvaluateGradRho) + { + const unsigned int globalThreadId = blockIdx.x * blockDim.x + threadIdx.x; + const unsigned int numEntriesPerCell = numVectors * nQuadsPerCell; + const unsigned int numberEntries = numEntriesPerCell * numCells; + + for (unsigned int index = globalThreadId; index < numberEntries; + index += blockDim.x * gridDim.x) + { + const dftfe::utils::deviceDoubleComplex psi = wfcContributions[index]; + rhoCellsWfcContributions[index] = psi.x * psi.x + psi.y * psi.y; + + if (isEvaluateGradRho) + { + unsigned int iCell = index / numEntriesPerCell; + unsigned int intraCellIndex = index - iCell * numEntriesPerCell; + unsigned int iQuad = intraCellIndex / numVectors; + unsigned int iVec = intraCellIndex - iQuad * numVectors; + const dftfe::utils::deviceDoubleComplex gradPsiX = + gradwfcContributions[intraCellIndex + + numEntriesPerCell * 3 * iCell]; + gradRhoCellsWfcContributions[iVec + 3 * iQuad * numVectors + + numEntriesPerCell * 3 * iCell] = + 2.0 * (psi.x * gradPsiX.x + psi.y * gradPsiX.y); + + const dftfe::utils::deviceDoubleComplex gradPsiY = + gradwfcContributions[intraCellIndex + numEntriesPerCell + + numEntriesPerCell * 3 * iCell]; + gradRhoCellsWfcContributions[iVec + numVectors + + 3 * iQuad * numVectors + + numEntriesPerCell * 3 * iCell] = + 2.0 * (psi.x * gradPsiY.x + psi.y * gradPsiY.y); + + const dftfe::utils::deviceDoubleComplex gradPsiZ = + gradwfcContributions[intraCellIndex + 2 * numEntriesPerCell + + numEntriesPerCell * 3 * iCell]; + gradRhoCellsWfcContributions[iVec + 2 * 
numVectors + + 3 * iQuad * numVectors + + numEntriesPerCell * 3 * iCell] = + 2.0 * (psi.x * gradPsiZ.x + psi.y * gradPsiZ.y); + } + } + } + } // namespace + /* Host-side driver. Launches the computeRhoGradRhoFromInterpolatedValues kernel for a block of (cellRange.second - cellRange.first) cells and (vecRange.second - vecRange.first) vectors, using the CUDA launch syntax or hipLaunchKernelGGL depending on which DFTFE_WITH_DEVICE_LANG_* macro is set. The launch uses ceil(vectorsBlockSize / DEVICE_BLOCK_SIZE) * nQuadsPerCell * cellsBlockSize blocks of DEVICE_BLOCK_SIZE threads. Afterwards it calls gemv to combine the per-vector contributions with partialOccupVec into rho, and into gradRho when isEvaluateGradRho is true, both offset by cellRange.first. NOTE(review): all pointer arguments are presumably device memory, since they are handed to a kernel and to the device BLAS wrapper - confirm against callers. */ template + void + computeRhoGradRhoFromInterpolatedValues( + std::shared_ptr< + dftfe::basis::FEBasisOperations> + & basisOperationsPtr, + const std::pair cellRange, + const std::pair vecRange, + double * partialOccupVec, + NumberType * wfcQuadPointData, + NumberType * gradWfcQuadPointData, + double * rhoCellsWfcContributions, + double * gradRhoCellsWfcContributions, + double * rho, + double * gradRho, + const bool isEvaluateGradRho) + { + const unsigned int cellsBlockSize = cellRange.second - cellRange.first; + const unsigned int vectorsBlockSize = vecRange.second - vecRange.first; + const unsigned int nQuadsPerCell = basisOperationsPtr->nQuadsPerCell(); + const unsigned int nCells = basisOperationsPtr->nCells(); /* NOTE(review): nCells is not referenced again in this function */ + const double scalarCoeffAlphaRho = 1.0; + const double scalarCoeffBetaRho = 1.0; + const double scalarCoeffAlphaGradRho = 1.0; + const double scalarCoeffBetaGradRho = 1.0; +#ifdef DFTFE_WITH_DEVICE_LANG_CUDA + computeRhoGradRhoFromInterpolatedValues<<< + (vectorsBlockSize + (dftfe::utils::DEVICE_BLOCK_SIZE - 1)) / + dftfe::utils::DEVICE_BLOCK_SIZE * nQuadsPerCell * cellsBlockSize, + dftfe::utils::DEVICE_BLOCK_SIZE>>>( + vectorsBlockSize, + cellsBlockSize, + nQuadsPerCell, + dftfe::utils::makeDataTypeDeviceCompatible(wfcQuadPointData), + dftfe::utils::makeDataTypeDeviceCompatible(gradWfcQuadPointData), + dftfe::utils::makeDataTypeDeviceCompatible(rhoCellsWfcContributions), + dftfe::utils::makeDataTypeDeviceCompatible(gradRhoCellsWfcContributions), + isEvaluateGradRho); +#elif DFTFE_WITH_DEVICE_LANG_HIP + hipLaunchKernelGGL( + computeRhoGradRhoFromInterpolatedValues, + (vectorsBlockSize + (dftfe::utils::DEVICE_BLOCK_SIZE - 1)) / + dftfe::utils::DEVICE_BLOCK_SIZE * nQuadsPerCell * cellsBlockSize, + dftfe::utils::DEVICE_BLOCK_SIZE, + 0, + 0, + vectorsBlockSize, + cellsBlockSize, + nQuadsPerCell, + 
dftfe::utils::makeDataTypeDeviceCompatible(wfcQuadPointData), + dftfe::utils::makeDataTypeDeviceCompatible(gradWfcQuadPointData), + dftfe::utils::makeDataTypeDeviceCompatible(rhoCellsWfcContributions), + dftfe::utils::makeDataTypeDeviceCompatible(gradRhoCellsWfcContributions), + isEvaluateGradRho); +#endif + /* Transposed gemv with a vectorsBlockSize by (cellsBlockSize * nQuadsPerCell) matrix, leading dimension vectorsBlockSize, and alpha = beta = 1.0. Assuming the wrapper follows standard BLAS gemv semantics, this adds the partialOccupVec-weighted sum over vectors to the existing values in rho rather than overwriting them. */ dftfe::utils::deviceBlasWrapper::gemv( + basisOperationsPtr->getDeviceBLASHandle(), + dftfe::utils::DEVICEBLAS_OP_T, + vectorsBlockSize, + cellsBlockSize * nQuadsPerCell, + &scalarCoeffAlphaRho, + rhoCellsWfcContributions, + vectorsBlockSize, + partialOccupVec, + 1, + &scalarCoeffBetaRho, + rho + cellRange.first * nQuadsPerCell, + 1); + + + if (isEvaluateGradRho) + { + /* Same reduction for the gradient: three values per quadrature point, hence the factor 3 in both the column count and the gradRho offset. */ dftfe::utils::deviceBlasWrapper::gemv( + basisOperationsPtr->getDeviceBLASHandle(), + dftfe::utils::DEVICEBLAS_OP_T, + vectorsBlockSize, + cellsBlockSize * nQuadsPerCell * 3, + &scalarCoeffAlphaGradRho, + gradRhoCellsWfcContributions, + vectorsBlockSize, + partialOccupVec, + 1, + &scalarCoeffBetaGradRho, + gradRho + cellRange.first * nQuadsPerCell * 3, + 1); + } + } + /* Explicit instantiation of the host-side driver with dataTypes::number as the wavefunction value type. */ template void + computeRhoGradRhoFromInterpolatedValues( + std::shared_ptr< + dftfe::basis::FEBasisOperations> + & basisOperationsPtr, + const std::pair cellRange, + const std::pair vecRange, + double * partialOccupVec, + dataTypes::number * wfcQuadPointData, + dataTypes::number * gradWfcQuadPointData, + double * rhoCellsWfcContributions, + double * gradRhoCellsWfcContributions, + double * rho, + double * gradRho, + const bool isEvaluateGradRho); + +} // namespace dftfe diff --git a/src/dft/densityFirstOrderResponseCalculatorCPU.cc b/src/dft/densityFirstOrderResponseCalculatorCPU.cc index 0c1eef5b0..b0efcd893 100644 --- a/src/dft/densityFirstOrderResponseCalculatorCPU.cc +++ b/src/dft/densityFirstOrderResponseCalculatorCPU.cc @@ -31,8 +31,8 @@ namespace dftfe template void computeRhoFirstOrderResponseCPU( - const std::vector> & X, - const std::vector> & XPrime, + const T * X, + const T * XPrime, const std::vector> & 
densityMatDerFermiEnergy, const unsigned int totalNumWaveFunctions, const unsigned int numLocalDofs, @@ -149,11 +149,13 @@ namespace dftfe for (unsigned int kPoint = 0; kPoint < kPointWeights.size(); ++kPoint) { - const std::vector &XCurrentKPoint = - X[(dftParams.spinPolarized + 1) * kPoint + spinIndex]; + const T *XCurrentKPoint = + X + ((dftParams.spinPolarized + 1) * kPoint + spinIndex) * + numLocalDofs * totalNumWaveFunctions; - const std::vector &XPrimeCurrentKPoint = - XPrime[(dftParams.spinPolarized + 1) * kPoint + spinIndex]; + const T *XPrimeCurrentKPoint = + XPrime + ((dftParams.spinPolarized + 1) * kPoint + spinIndex) * + numLocalDofs * totalNumWaveFunctions; const std::vector &densityMatDerFermiEnergyVec = densityMatDerFermiEnergy[(dftParams.spinPolarized + 1) * kPoint + @@ -420,8 +422,8 @@ namespace dftfe template void computeRhoFirstOrderResponseCPUMixedPrec( - const std::vector> & X, - const std::vector> & XPrime, + const T * X, + const T * XPrime, const std::vector> & densityMatDerFermiEnergy, const unsigned int totalNumWaveFunctions, const unsigned int numLocalDofs, @@ -543,11 +545,13 @@ namespace dftfe for (unsigned int kPoint = 0; kPoint < kPointWeights.size(); ++kPoint) { - const std::vector &XCurrentKPoint = - X[(dftParams.spinPolarized + 1) * kPoint + spinIndex]; + const T *XCurrentKPoint = + X + ((dftParams.spinPolarized + 1) * kPoint + spinIndex) * + numLocalDofs * totalNumWaveFunctions; - const std::vector &XPrimeCurrentKPoint = - XPrime[(dftParams.spinPolarized + 1) * kPoint + spinIndex]; + const T *XPrimeCurrentKPoint = + XPrime + ((dftParams.spinPolarized + 1) * kPoint + spinIndex) * + numLocalDofs * totalNumWaveFunctions; const std::vector &densityMatDerFermiEnergyVec = densityMatDerFermiEnergy[(dftParams.spinPolarized + 1) * kPoint + @@ -815,12 +819,12 @@ namespace dftfe template void computeRhoFirstOrderResponseCPU( - const std::vector> &X, - const std::vector> &XPrime, - const std::vector> & densityMatDerFermiEnergy, - const 
unsigned int totalNumWaveFunctions, - const unsigned int numLocalDofs, - operatorDFTClass & operatorMatrix, + const dataTypes::number * X, + const dataTypes::number * XPrime, + const std::vector> & densityMatDerFermiEnergy, + const unsigned int totalNumWaveFunctions, + const unsigned int numLocalDofs, + operatorDFTClass & operatorMatrix, const unsigned int matrixFreeDofhandlerIndex, const dealii::DoFHandler<3> & dofHandler, const unsigned int totalLocallyOwnedCells, @@ -841,12 +845,12 @@ namespace dftfe template void computeRhoFirstOrderResponseCPUMixedPrec( - const std::vector> &X, - const std::vector> &XPrime, - const std::vector> & densityMatDerFermiEnergy, - const unsigned int totalNumWaveFunctions, - const unsigned int numLocalDofs, - operatorDFTClass & operatorMatrix, + const dataTypes::number * X, + const dataTypes::number * XPrime, + const std::vector> & densityMatDerFermiEnergy, + const unsigned int totalNumWaveFunctions, + const unsigned int numLocalDofs, + operatorDFTClass & operatorMatrix, const unsigned int matrixFreeDofhandlerIndex, const dealii::DoFHandler<3> & dofHandler, const unsigned int totalLocallyOwnedCells, diff --git a/src/dft/dft.cc b/src/dft/dft.cc index fd371b8b7..decfb89fd 100644 --- a/src/dft/dft.cc +++ b/src/dft/dft.cc @@ -20,7 +20,6 @@ // Include header files #include #include -#include #include #include #include @@ -65,7 +64,6 @@ #include #ifdef DFTFE_WITH_DEVICE -# include # include #endif @@ -758,10 +756,6 @@ namespace dftfe d_upperBoundUnwantedSpectrumValues.resize( (d_dftParamsPtr->spinPolarized + 1) * d_kPointWeights.size(), 0.0); - d_eigenVectorsFlattenedSTL.resize((1 + d_dftParamsPtr->spinPolarized) * - d_kPointWeights.size()); - d_eigenVectorsRotFracDensityFlattenedSTL.resize( - (1 + d_dftParamsPtr->spinPolarized) * d_kPointWeights.size()); for (unsigned int kPoint = 0; kPoint < d_kPointWeights.size(); ++kPoint) { @@ -1654,8 +1648,9 @@ namespace dftfe vectorTools::copyFlattenedSTLVecToSingleCompVec( - 
d_eigenVectorsFlattenedSTL[0], + d_eigenVectorsFlattenedHost.data(), d_numEigenValues, + matrix_free_data.get_vector_partitioner()->locally_owned_size(), std::make_pair(0, numberWaveFunctionsErrorEstimate), eigenVectorsArray); @@ -1842,6 +1837,8 @@ namespace dftfe if (initializeCublas) { kohnShamDFTEigenOperatorDevice.createDeviceBlasHandle(); + basisOperationsPtrDevice->setDeviceBLASHandle( + &(kohnShamDFTEigenOperatorDevice.getDeviceBlasHandle())); } AssertThrow( @@ -1942,6 +1939,9 @@ namespace dftfe d_kohnShamDFTOperatorDevicePtr->reinit( std::min(d_dftParamsPtr->chebyWfcBlockSize, d_numEigenValues), true); + + basisOperationsPtrDevice->setDeviceBLASHandle( + &(d_kohnShamDFTOperatorDevicePtr->getDeviceBlasHandle())); } #endif } @@ -3552,8 +3552,7 @@ namespace dftfe dealii::Utilities::MPI::n_mpi_processes(interBandGroupComm); const unsigned int localVectorSize = - d_eigenVectorsFlattenedSTL[0].size() / d_numEigenValues; - + matrix_free_data.get_vector_partitioner()->locally_owned_size(); if (numberBandGroups > 1 && !d_dftParamsPtr->useDevice) { MPI_Barrier(interBandGroupComm); @@ -3569,13 +3568,17 @@ namespace dftfe { const unsigned int currentBlockSize = std::min(blockSize, d_numEigenValues * localVectorSize - i); - MPI_Allreduce(MPI_IN_PLACE, - &d_eigenVectorsFlattenedSTL[kPoint][0] + i, - currentBlockSize, - dataTypes::mpi_type_id( - &d_eigenVectorsFlattenedSTL[kPoint][0]), - MPI_SUM, - interBandGroupComm); + MPI_Allreduce( + MPI_IN_PLACE, + &d_eigenVectorsFlattenedHost[kPoint * d_numEigenValues * + localVectorSize] + + i, + currentBlockSize, + dataTypes::mpi_type_id( + &d_eigenVectorsFlattenedHost[kPoint * d_numEigenValues * + localVectorSize]), + MPI_SUM, + interBandGroupComm); } } @@ -3775,17 +3778,7 @@ namespace dftfe if (d_dftParamsPtr->useDevice && (d_dftParamsPtr->writeWfcSolutionFields || d_dftParamsPtr->writeLdosFile || d_dftParamsPtr->writePdosFile)) - for (unsigned int kPoint = 0; - kPoint < - (1 + d_dftParamsPtr->spinPolarized) * 
d_kPointWeights.size(); - ++kPoint) - { - d_eigenVectorsFlattenedDevice.copyTo( - &d_eigenVectorsFlattenedSTL[kPoint][0], - d_eigenVectorsFlattenedSTL[kPoint].size(), - (kPoint * d_eigenVectorsFlattenedSTL[0].size()), - 0); - } + d_eigenVectorsFlattenedDevice.copyTo(d_eigenVectorsFlattenedHost); #endif @@ -3891,6 +3884,11 @@ namespace dftfe #endif ); } +#ifdef DFTFE_WITH_DEVICE + if (d_dftParamsPtr->useDevice) + basisOperationsPtrDevice->setDeviceBLASHandle( + &(d_kohnShamDFTOperatorDevicePtr->getDeviceBlasHandle())); +#endif forcePtr->computeStress(matrix_free_data, #ifdef DFTFE_WITH_DEVICE @@ -4171,20 +4169,26 @@ namespace dftfe { #ifdef USE_COMPLEX vectorTools::copyFlattenedSTLVecToSingleCompVec( - d_eigenVectorsFlattenedSTL[k * - (1 + d_dftParamsPtr->spinPolarized) + - s], + d_eigenVectorsFlattenedHost.data() + + (k * (1 + d_dftParamsPtr->spinPolarized) + s) * + d_numEigenValues * + matrix_free_data.get_vector_partitioner() + ->locally_owned_size(), d_numEigenValues, + matrix_free_data.get_vector_partitioner()->locally_owned_size(), std::make_pair(i, i + 1), localProc_dof_indicesReal, localProc_dof_indicesImag, tempVec); #else vectorTools::copyFlattenedSTLVecToSingleCompVec( - d_eigenVectorsFlattenedSTL[k * - (1 + d_dftParamsPtr->spinPolarized) + - s], + d_eigenVectorsFlattenedHost.data() + + (k * (1 + d_dftParamsPtr->spinPolarized) + s) * + d_numEigenValues * + matrix_free_data.get_vector_partitioner() + ->locally_owned_size(), d_numEigenValues, + matrix_free_data.get_vector_partitioner()->locally_owned_size(), std::make_pair(i, i + 1), tempVec); #endif diff --git a/src/dft/dos.cc b/src/dft/dos.cc index d5abe5689..78ad4b545 100644 --- a/src/dft/dos.cc +++ b/src/dft/dos.cc @@ -445,7 +445,7 @@ namespace dftfe std::vector tempQuadPointValues(n_q_points); const unsigned int localVectorSize = - d_eigenVectorsFlattenedSTL[0].size() / d_numEigenValues; + matrix_free_data.get_vector_partitioner()->locally_owned_size(); std::vector>> eigenVectors( (1 + 
d_dftParamsPtr->spinPolarized) * d_kPointWeights.size()); std::vector> @@ -475,11 +475,6 @@ namespace dftfe eigenVectorsFlattenedBlock[kPoint]); eigenVectorsFlattenedBlock[kPoint] = dataTypes::number(0.0); } - - constraintsNoneDataInfo.precomputeMaps( - matrix_free_data.get_vector_partitioner(), - eigenVectorsFlattenedBlock[0].get_partitioner(), - currentBlockSize); } @@ -507,8 +502,10 @@ namespace dftfe for (unsigned int iWave = 0; iWave < currentBlockSize; ++iWave) eigenVectorsFlattenedBlock[kPoint].local_element( iNode * currentBlockSize + iWave) = - d_eigenVectorsFlattenedSTL[kPoint][iNode * d_numEigenValues + - ivec + iWave]; + d_eigenVectorsFlattenedHost[kPoint * d_numEigenValues * + localVectorSize + + iNode * d_numEigenValues + ivec + + iWave]; constraintsNoneDataInfo.distribute( eigenVectorsFlattenedBlock[kPoint], currentBlockSize); @@ -987,7 +984,7 @@ namespace dftfe std::vector tempQuadPointValues(n_q_points); const unsigned int localVectorSize = - d_eigenVectorsFlattenedSTL[0].size() / d_numEigenValues; + matrix_free_data.get_vector_partitioner()->locally_owned_size(); std::vector>> eigenVectors( (1 + d_dftParamsPtr->spinPolarized) * d_kPointWeights.size()); std::vector> @@ -1022,11 +1019,6 @@ namespace dftfe eigenVectorsFlattenedBlock[kPoint]); eigenVectorsFlattenedBlock[kPoint] = dataTypes::number(0.0); } - - constraintsNoneDataInfo.precomputeMaps( - matrix_free_data.get_vector_partitioner(), - eigenVectorsFlattenedBlock[0].get_partitioner(), - currentBlockSize); } @@ -1056,8 +1048,10 @@ namespace dftfe for (unsigned int iWave = 0; iWave < currentBlockSize; ++iWave) eigenVectorsFlattenedBlock[kPoint].local_element( iNode * currentBlockSize + iWave) = - d_eigenVectorsFlattenedSTL[kPoint][iNode * d_numEigenValues + - ivec + iWave]; + d_eigenVectorsFlattenedHost[kPoint * localVectorSize * + d_numEigenValues + + iNode * d_numEigenValues + ivec + + iWave]; constraintsNoneDataInfo.distribute( eigenVectorsFlattenedBlock[kPoint], currentBlockSize); diff --git 
a/src/dft/initBoundaryConditions.cc b/src/dft/initBoundaryConditions.cc index 5e0814c20..d7e207974 100644 --- a/src/dft/initBoundaryConditions.cc +++ b/src/dft/initBoundaryConditions.cc @@ -262,6 +262,60 @@ namespace dftfe d_constraintsVector, quadratureVector, additional_data); + basisOperationsPtrHost = std::make_shared< + dftfe::basis::FEBasisOperations>( + matrix_free_data, d_constraintsVector); + dftfe::basis::UpdateFlags updateFlags = dftfe::basis::update_values | + dftfe::basis::update_gradients | + dftfe::basis::update_transpose; + std::vector quadratureIndices(4, 0); + for (auto i = 0; i < 4; ++i) + quadratureIndices[i] = i; + basisOperationsPtrHost->init(d_densityDofHandlerIndex, + quadratureIndices, + updateFlags); + if (!d_dftParamsPtr->useDevice) + { + std::vector bandGroupLowHighPlusOneIndices; + dftUtils::createBandParallelizationIndices( + interBandGroupComm, d_numEigenValues, bandGroupLowHighPlusOneIndices); + + unsigned int BVec = std::min(d_dftParamsPtr->chebyWfcBlockSize, + bandGroupLowHighPlusOneIndices[1]); + + basisOperationsPtrHost->createScratchMultiVectors( + BVec, (d_dftParamsPtr->spinPolarized + 1)); + if (d_numEigenValues % BVec != 0) + basisOperationsPtrHost->createScratchMultiVectors( + d_numEigenValues % BVec, (d_dftParamsPtr->spinPolarized + 1)); + if (d_numEigenValues != d_numEigenValuesRR && + d_numEigenValuesRR % BVec != 0) + basisOperationsPtrHost->createScratchMultiVectors( + d_numEigenValuesRR % BVec, (d_dftParamsPtr->spinPolarized + 1)); + } +#if defined(DFTFE_WITH_DEVICE) + if (d_dftParamsPtr->useDevice) + { + basisOperationsPtrDevice = std::make_shared< + dftfe::basis::FEBasisOperations>( + matrix_free_data, d_constraintsVector); + basisOperationsPtrDevice->init(d_densityDofHandlerIndex, + quadratureIndices, + updateFlags); + const unsigned int BVec = + std::min(d_dftParamsPtr->chebyWfcBlockSize, d_numEigenValues); + + if (d_dftParamsPtr->mixingMethod == "LOW_RANK_DIELECM_PRECOND") + 
basisOperationsPtrDevice->createScratchMultiVectors(BVec, 2); + else + basisOperationsPtrDevice->createScratchMultiVectors( + BVec, (d_dftParamsPtr->spinPolarized + 1)); + } +#endif MPI_Barrier(d_mpiCommParent); init_mf = MPI_Wtime() - init_mf; diff --git a/src/dft/initElectronicFields.cc b/src/dft/initElectronicFields.cc index 90fb3c3d4..74ce870a7 100644 --- a/src/dft/initElectronicFields.cc +++ b/src/dft/initElectronicFields.cc @@ -104,16 +104,17 @@ namespace dftfe kPoint < (1 + d_dftParamsPtr->spinPolarized) * d_kPointWeights.size(); ++kPoint) { - d_eigenVectorsFlattenedSTL[kPoint].resize( - d_numEigenValues * - matrix_free_data.get_vector_partitioner()->local_size(), + d_eigenVectorsFlattenedHost.resize( + (d_numEigenValues * + matrix_free_data.get_vector_partitioner()->local_size()) * + (1 + d_dftParamsPtr->spinPolarized) * d_kPointWeights.size(), dataTypes::number(0.0)); - if (d_numEigenValuesRR != d_numEigenValues) { - d_eigenVectorsRotFracDensityFlattenedSTL[kPoint].resize( + d_eigenVectorsRotFracDensityFlattenedHost.resize( d_numEigenValuesRR * - matrix_free_data.get_vector_partitioner()->local_size(), + matrix_free_data.get_vector_partitioner()->local_size() * + (1 + d_dftParamsPtr->spinPolarized) * d_kPointWeights.size(), dataTypes::number(0.0)); } } @@ -145,40 +146,26 @@ namespace dftfe if (d_dftParamsPtr->useDevice) { d_eigenVectorsFlattenedDevice.resize( - d_eigenVectorsFlattenedSTL[0].size() * - (1 + d_dftParamsPtr->spinPolarized) * d_kPointWeights.size()); + d_eigenVectorsFlattenedHost.size()); if (d_dftParamsPtr->mixingMethod == "LOW_RANK_DIELECM_PRECOND") d_eigenVectorsDensityMatrixPrimeFlattenedDevice.resize( - d_eigenVectorsFlattenedSTL[0].size() * - (1 + d_dftParamsPtr->spinPolarized) * d_kPointWeights.size()); + d_eigenVectorsFlattenedHost.size()); if (d_numEigenValuesRR != d_numEigenValues) d_eigenVectorsRotFracFlattenedDevice.resize( - d_eigenVectorsRotFracDensityFlattenedSTL[0].size() * - (1 + d_dftParamsPtr->spinPolarized) * 
d_kPointWeights.size()); + d_eigenVectorsRotFracDensityFlattenedHost.size()); else d_eigenVectorsRotFracFlattenedDevice.resize(1); - for (unsigned int kPoint = 0; - kPoint < - (1 + d_dftParamsPtr->spinPolarized) * d_kPointWeights.size(); - ++kPoint) - { - d_eigenVectorsFlattenedDevice - .copyFrom( - &d_eigenVectorsFlattenedSTL[kPoint][0], - d_eigenVectorsFlattenedSTL[0].size(), - 0, - kPoint * d_eigenVectorsFlattenedSTL[0].size()); - } + d_eigenVectorsFlattenedDevice.copyFrom(d_eigenVectorsFlattenedHost); } #endif if (!d_dftParamsPtr->useDevice && d_dftParamsPtr->mixingMethod == "LOW_RANK_DIELECM_PRECOND") { - d_eigenVectorsDensityMatrixPrimeSTL = d_eigenVectorsFlattenedSTL; + d_eigenVectorsDensityMatrixPrimeHost = d_eigenVectorsFlattenedHost; } if (d_dftParamsPtr->verbosity >= 2 && d_dftParamsPtr->spinPolarized == 1) diff --git a/src/dft/kohnShamEigenSolve.cc b/src/dft/kohnShamEigenSolve.cc index 75c473ac6..92b0552c3 100644 --- a/src/dft/kohnShamEigenSolve.cc +++ b/src/dft/kohnShamEigenSolve.cc @@ -20,22 +20,17 @@ #include #include #include -#include namespace dftfe { namespace internal { void - pointWiseScaleWithDiagonal( - const distributedCPUVec &diagonal, - const std::shared_ptr - & singleComponentPartitioner, - const unsigned int numberFields, - std::vector &fieldsArrayFlattened) + pointWiseScaleWithDiagonal(const distributedCPUVec &diagonal, + const unsigned int numberFields, + const unsigned int numberDofs, + dataTypes::number *fieldsArrayFlattened) { - const unsigned int numberDofs = - fieldsArrayFlattened.size() / numberFields; const unsigned int inc = 1; for (unsigned int i = 0; i < numberDofs; ++i) @@ -178,9 +173,9 @@ namespace dftfe // by M^{1/2} internal::pointWiseScaleWithDiagonal( kohnShamDFTEigenOperator.d_sqrtMassVector, - matrix_free_data.get_vector_partitioner(), d_numEigenValues, - d_eigenVectorsFlattenedSTL[0]); + matrix_free_data.get_vector_partitioner()->locally_owned_size(), + d_eigenVectorsFlattenedHost.data()); // @@ -188,9 +183,11 @@ 
namespace dftfe // std::vector ProjHam; - kohnShamDFTEigenOperator.XtHX(d_eigenVectorsFlattenedSTL[0], - d_numEigenValues, - ProjHam); + kohnShamDFTEigenOperator.XtHX( + d_eigenVectorsFlattenedHost.data(), + d_numEigenValues, + matrix_free_data.get_vector_partitioner()->locally_owned_size(), + ProjHam); // // scale the eigenVectors with M^{-1/2} to represent the wavefunctions in @@ -198,9 +195,9 @@ namespace dftfe // internal::pointWiseScaleWithDiagonal( kohnShamDFTEigenOperator.d_invSqrtMassVector, - matrix_free_data.get_vector_partitioner(), d_numEigenValues, - d_eigenVectorsFlattenedSTL[0]); + matrix_free_data.get_vector_partitioner()->locally_owned_size(), + d_eigenVectorsFlattenedHost.data()); dataTypes::number trXtHX = 0.0; @@ -243,25 +240,30 @@ namespace dftfe // by M^{1/2} internal::pointWiseScaleWithDiagonal( kohnShamDFTEigenOperator.d_sqrtMassVector, - matrix_free_data.get_vector_partitioner(), d_numEigenValues, - d_eigenVectorsFlattenedSTL[0]); + matrix_free_data.get_vector_partitioner()->locally_owned_size(), + d_eigenVectorsFlattenedHost.data()); // // orthogonalize the vectors // linearAlgebraOperations::gramSchmidtOrthogonalization( - d_eigenVectorsFlattenedSTL[0], d_numEigenValues, mpi_communicator); + d_eigenVectorsFlattenedHost.data(), + d_numEigenValues, + matrix_free_data.get_vector_partitioner()->locally_owned_size(), + mpi_communicator); // // compute projected Hamiltonian // std::vector ProjHam; - kohnShamDFTEigenOperator.XtHX(d_eigenVectorsFlattenedSTL[0], - d_numEigenValues, - ProjHam); + kohnShamDFTEigenOperator.XtHX( + d_eigenVectorsFlattenedHost.data(), + d_numEigenValues, + matrix_free_data.get_vector_partitioner()->locally_owned_size(), + ProjHam); // // scale the eigenVectors with M^{-1/2} to represent the wavefunctions in @@ -269,9 +271,9 @@ namespace dftfe // internal::pointWiseScaleWithDiagonal( kohnShamDFTEigenOperator.d_invSqrtMassVector, - matrix_free_data.get_vector_partitioner(), d_numEigenValues, - 
d_eigenVectorsFlattenedSTL[0]); + matrix_free_data.get_vector_partitioner()->locally_owned_size(), + d_eigenVectorsFlattenedHost.data()); double trXtKX = 0.0; #ifdef USE_COMPLEX @@ -319,11 +321,12 @@ namespace dftfe // by M^{1/2} internal::pointWiseScaleWithDiagonal( kohnShamDFTEigenOperator.d_sqrtMassVector, - matrix_free_data.get_vector_partitioner(), d_numEigenValues, - d_eigenVectorsFlattenedSTL[(1 + d_dftParamsPtr->spinPolarized) * - kPointIndex + - spinType]); + matrix_free_data.get_vector_partitioner()->locally_owned_size(), + d_eigenVectorsFlattenedHost.data() + + ((1 + d_dftParamsPtr->spinPolarized) * kPointIndex + spinType) * + d_numEigenValues * + matrix_free_data.get_vector_partitioner()->locally_owned_size()); std::vector eigenValuesTemp(isSpectrumSplit ? d_numEigenValuesRR : d_numEigenValues, @@ -387,12 +390,16 @@ namespace dftfe subspaceIterationSolver.solve( kohnShamDFTEigenOperator, elpaScala, - d_eigenVectorsFlattenedSTL[(1 + d_dftParamsPtr->spinPolarized) * - kPointIndex + - spinType], - d_eigenVectorsRotFracDensityFlattenedSTL - [(1 + d_dftParamsPtr->spinPolarized) * kPointIndex + spinType], + d_eigenVectorsFlattenedHost.data() + + ((1 + d_dftParamsPtr->spinPolarized) * kPointIndex + spinType) * + d_numEigenValues * + matrix_free_data.get_vector_partitioner()->locally_owned_size(), + d_eigenVectorsRotFracDensityFlattenedHost.data() + + ((1 + d_dftParamsPtr->spinPolarized) * kPointIndex + spinType) * + d_numEigenValuesRR * + matrix_free_data.get_vector_partitioner()->locally_owned_size(), d_numEigenValues, + matrix_free_data.get_vector_partitioner()->locally_owned_size(), eigenValuesTemp, residualNormWaveFunctions, interBandGroupComm, @@ -406,20 +413,23 @@ namespace dftfe // internal::pointWiseScaleWithDiagonal( kohnShamDFTEigenOperator.d_invSqrtMassVector, - matrix_free_data.get_vector_partitioner(), d_numEigenValues, - d_eigenVectorsFlattenedSTL[(1 + d_dftParamsPtr->spinPolarized) * - kPointIndex + - spinType]); + 
matrix_free_data.get_vector_partitioner()->locally_owned_size(), + d_eigenVectorsFlattenedHost.data() + + ((1 + d_dftParamsPtr->spinPolarized) * kPointIndex + spinType) * + d_numEigenValues * + matrix_free_data.get_vector_partitioner()->locally_owned_size()); if (isSpectrumSplit && d_numEigenValuesRR != d_numEigenValues) { internal::pointWiseScaleWithDiagonal( kohnShamDFTEigenOperator.d_invSqrtMassVector, - matrix_free_data.get_vector_partitioner(), d_numEigenValuesRR, - d_eigenVectorsRotFracDensityFlattenedSTL - [(1 + d_dftParamsPtr->spinPolarized) * kPointIndex + spinType]); + matrix_free_data.get_vector_partitioner()->locally_owned_size(), + d_eigenVectorsRotFracDensityFlattenedHost.data() + + ((1 + d_dftParamsPtr->spinPolarized) * kPointIndex + spinType) * + d_numEigenValuesRR * + matrix_free_data.get_vector_partitioner()->locally_owned_size()); } // @@ -537,8 +547,10 @@ namespace dftfe elpaScala, d_eigenVectorsFlattenedDevice.begin() + ((1 + d_dftParamsPtr->spinPolarized) * kPointIndex + spinType) * - d_eigenVectorsFlattenedSTL[0].size(), - d_eigenVectorsFlattenedSTL[0].size(), + d_numEigenValues * + matrix_free_data.get_vector_partitioner()->locally_owned_size(), + d_numEigenValues * + matrix_free_data.get_vector_partitioner()->locally_owned_size(), d_numEigenValues, eigenValuesDummy, *d_devicecclMpiCommDomainPtr, @@ -556,11 +568,14 @@ namespace dftfe elpaScala, d_eigenVectorsFlattenedDevice.begin() + ((1 + d_dftParamsPtr->spinPolarized) * kPointIndex + spinType) * - d_eigenVectorsFlattenedSTL[0].size(), + d_numEigenValues * + matrix_free_data.get_vector_partitioner()->locally_owned_size(), d_eigenVectorsRotFracFlattenedDevice.begin() + ((1 + d_dftParamsPtr->spinPolarized) * kPointIndex + spinType) * - d_eigenVectorsRotFracDensityFlattenedSTL[0].size(), - d_eigenVectorsFlattenedSTL[0].size(), + d_numEigenValuesRR * + matrix_free_data.get_vector_partitioner()->locally_owned_size(), + d_numEigenValues * + 
matrix_free_data.get_vector_partitioner()->locally_owned_size(), d_numEigenValues, eigenValuesTemp, residualNormWaveFunctions, @@ -662,11 +677,12 @@ namespace dftfe // multiply by M^{1/2} internal::pointWiseScaleWithDiagonal( kohnShamDFTEigenOperator.d_sqrtMassVector, - matrix_free_data.get_vector_partitioner(d_densityDofHandlerIndex), d_numEigenValues, - d_eigenVectorsDensityMatrixPrimeSTL[(1 + d_dftParamsPtr->spinPolarized) * - kPointIndex + - spinType]); + matrix_free_data.get_vector_partitioner()->locally_owned_size(), + d_eigenVectorsDensityMatrixPrimeHost.data() + + ((1 + d_dftParamsPtr->spinPolarized) * kPointIndex + spinType) * + d_numEigenValues * + matrix_free_data.get_vector_partitioner()->locally_owned_size()); std::vector eigenValuesTemp(d_numEigenValues, 0.0); for (unsigned int i = 0; i < d_numEigenValues; i++) @@ -678,10 +694,12 @@ namespace dftfe linearAlgebraOperations::densityMatrixEigenBasisFirstOrderResponse( kohnShamDFTEigenOperator, - d_eigenVectorsDensityMatrixPrimeSTL[(1 + d_dftParamsPtr->spinPolarized) * - kPointIndex + - spinType], + d_eigenVectorsDensityMatrixPrimeHost.data() + + ((1 + d_dftParamsPtr->spinPolarized) * kPointIndex + spinType) * + d_numEigenValues * + matrix_free_data.get_vector_partitioner()->locally_owned_size(), d_numEigenValues, + matrix_free_data.get_vector_partitioner()->locally_owned_size(), d_mpiCommParent, kohnShamDFTEigenOperator.getMPICommunicator(), interBandGroupComm, @@ -700,11 +718,12 @@ namespace dftfe // internal::pointWiseScaleWithDiagonal( kohnShamDFTEigenOperator.d_invSqrtMassVector, - matrix_free_data.get_vector_partitioner(d_densityDofHandlerIndex), d_numEigenValues, - d_eigenVectorsDensityMatrixPrimeSTL[(1 + d_dftParamsPtr->spinPolarized) * - kPointIndex + - spinType]); + matrix_free_data.get_vector_partitioner()->locally_owned_size(), + d_eigenVectorsDensityMatrixPrimeHost.data() + + ((1 + d_dftParamsPtr->spinPolarized) * kPointIndex + spinType) * + d_numEigenValues * + 
matrix_free_data.get_vector_partitioner()->locally_owned_size()); } #ifdef DFTFE_WITH_DEVICE @@ -739,8 +758,10 @@ namespace dftfe kohnShamDFTEigenOperator, d_eigenVectorsDensityMatrixPrimeFlattenedDevice.begin() + ((1 + d_dftParamsPtr->spinPolarized) * kPointIndex + spinType) * - d_eigenVectorsFlattenedSTL[0].size(), - d_eigenVectorsFlattenedSTL[0].size(), + d_numEigenValues * + matrix_free_data.get_vector_partitioner()->locally_owned_size(), + d_numEigenValues * + matrix_free_data.get_vector_partitioner()->locally_owned_size(), d_numEigenValues, eigenValuesTemp, fermiEnergy, @@ -779,11 +800,12 @@ namespace dftfe if (ipass == 1) internal::pointWiseScaleWithDiagonal( kohnShamDFTEigenOperator.d_invSqrtMassVector, - matrix_free_data.get_vector_partitioner(), d_numEigenValues, - d_eigenVectorsFlattenedSTL[(1 + d_dftParamsPtr->spinPolarized) * - kPointIndex + - spinType]); + matrix_free_data.get_vector_partitioner()->locally_owned_size(), + d_eigenVectorsFlattenedHost.data() + + ((1 + d_dftParamsPtr->spinPolarized) * kPointIndex + spinType) * + d_numEigenValues * + matrix_free_data.get_vector_partitioner()->locally_owned_size()); std::vector eigenValuesTemp(d_numEigenValues, 0.0); @@ -834,13 +856,16 @@ namespace dftfe subspaceIterationSolver.solve( kohnShamDFTEigenOperator, *d_elpaScala, - d_eigenVectorsFlattenedSTL[(1 + d_dftParamsPtr->spinPolarized) * - kPointIndex + - spinType], - d_eigenVectorsFlattenedSTL[(1 + d_dftParamsPtr->spinPolarized) * - kPointIndex + - spinType], + d_eigenVectorsFlattenedHost.data() + + ((1 + d_dftParamsPtr->spinPolarized) * kPointIndex + spinType) * + d_numEigenValues * + matrix_free_data.get_vector_partitioner()->locally_owned_size(), + d_eigenVectorsFlattenedHost.data() + + ((1 + d_dftParamsPtr->spinPolarized) * kPointIndex + spinType) * + d_numEigenValues * + matrix_free_data.get_vector_partitioner()->locally_owned_size(), d_numEigenValues, + matrix_free_data.get_vector_partitioner()->locally_owned_size(), eigenValuesTemp, 
residualNormWaveFunctions, interBandGroupComm, diff --git a/src/dft/localizationLength.cc b/src/dft/localizationLength.cc index 64e06d019..90296b7eb 100644 --- a/src/dft/localizationLength.cc +++ b/src/dft/localizationLength.cc @@ -57,8 +57,9 @@ namespace dftfe for (unsigned int iWave = 0; iWave < d_numEigenValues; ++iWave) { vectorTools::copyFlattenedSTLVecToSingleCompVec( - d_eigenVectorsFlattenedSTL[0], + d_eigenVectorsFlattenedHost.data(), d_numEigenValues, + matrix_free_data.get_vector_partitioner()->locally_owned_size(), std::make_pair(iWave, iWave + 1), tempVec); diff --git a/src/dft/psiInitialGuess.cc b/src/dft/psiInitialGuess.cc index f9a8ae3a9..f065dec2f 100644 --- a/src/dft/psiInitialGuess.cc +++ b/src/dft/psiInitialGuess.cc @@ -344,14 +344,9 @@ namespace dftfe locallyOwnedSet.fill_index_vector(locallyOwnedDOFs); unsigned int numberDofs = locallyOwnedDOFs.size(); - for (unsigned int kPoint = 0; - kPoint < (1 + d_dftParamsPtr->spinPolarized) * d_kPointWeights.size(); - ++kPoint) - { - std::fill(d_eigenVectorsFlattenedSTL[kPoint].begin(), - d_eigenVectorsFlattenedSTL[kPoint].end(), - 0.0); - } + std::fill(d_eigenVectorsFlattenedHost.begin(), + d_eigenVectorsFlattenedHost.end(), + 0.0); const unsigned int numberGlobalAtoms = atomLocations.size(); @@ -498,8 +493,9 @@ namespace dftfe // spherical part if (it->m > 0) { - d_eigenVectorsFlattenedSTL - [kPoint][dof * d_numEigenValues + waveId] += + d_eigenVectorsFlattenedHost + [kPoint * d_numEigenValues * numberDofs + + dof * d_numEigenValues + waveId] += dataTypes::number( R * std::sqrt(2) * boost::math::spherical_harmonic_r( @@ -507,16 +503,18 @@ namespace dftfe } else if (it->m == 0) { - d_eigenVectorsFlattenedSTL - [kPoint][dof * d_numEigenValues + waveId] += + d_eigenVectorsFlattenedHost + [kPoint * d_numEigenValues * numberDofs + + dof * d_numEigenValues + waveId] += dataTypes::number( R * boost::math::spherical_harmonic_r( it->l, it->m, theta, phi)); } else { - d_eigenVectorsFlattenedSTL - [kPoint][dof * 
d_numEigenValues + waveId] += + d_eigenVectorsFlattenedHost + [kPoint * d_numEigenValues * numberDofs + + dof * d_numEigenValues + waveId] += dataTypes::number( R * std::sqrt(2) * boost::math::spherical_harmonic_i( @@ -539,8 +537,9 @@ namespace dftfe // // boost::math::normal normDist; - std::vector &temp = - d_eigenVectorsFlattenedSTL[kPoint]; + dataTypes::number *temp = + d_eigenVectorsFlattenedHost.data() + + kPoint * d_numEigenValues * numberDofs; for (unsigned int iWave = waveFunctionsVector.size(); iWave < d_numEigenValues; ++iWave) @@ -565,11 +564,10 @@ namespace dftfe (1 + d_dftParamsPtr->spinPolarized) * d_kPointWeights.size(); ++kPoint) { - std::vector &temp1 = - d_eigenVectorsFlattenedSTL[kPoint]; + dataTypes::number *temp1 = d_eigenVectorsFlattenedHost.data() + + kPoint * d_numEigenValues * numberDofs; - std::vector &temp2 = - d_eigenVectorsFlattenedSTL[0]; + dataTypes::number *temp2 = d_eigenVectorsFlattenedHost.data(); for (unsigned int idof = 0; idof < numberDofs; idof++) for (unsigned int iwave = 0; iwave < d_numEigenValues; iwave++) diff --git a/src/dft/solveNSCF.cc b/src/dft/solveNSCF.cc index bcdb9edf9..b03a0b22d 100644 --- a/src/dft/solveNSCF.cc +++ b/src/dft/solveNSCF.cc @@ -1130,17 +1130,7 @@ namespace dftfe if (d_dftParamsPtr->useDevice && (d_dftParamsPtr->writeWfcSolutionFields || d_dftParamsPtr->writeLdosFile || d_dftParamsPtr->writePdosFile)) - for (unsigned int kPoint = 0; - kPoint < - (1 + d_dftParamsPtr->spinPolarized) * d_kPointWeights.size(); - ++kPoint) - { - d_eigenVectorsFlattenedDevice.copyTo( - &d_eigenVectorsFlattenedSTL[kPoint][0], - d_eigenVectorsFlattenedSTL[kPoint].size(), - (kPoint * d_eigenVectorsFlattenedSTL[0].size()), - 0); - } + d_eigenVectorsFlattenedDevice.copyTo(d_eigenVectorsFlattenedHost); #endif //#ifdef USE_COMPLEX diff --git a/src/dft/solveVselfInBinsDevice.cc b/src/dft/solveVselfInBinsDevice.cc index 22bad5577..3fc681a31 100644 --- a/src/dft/solveVselfInBinsDevice.cc +++ b/src/dft/solveVselfInBinsDevice.cc @@ 
-515,10 +515,6 @@ namespace dftfe matrixFreeData.get_vector_partitioner(mfDofHandlerIndex), hangingPeriodicConstraintMatrix); - - constraintsMatrixDataInfoDevice.precomputeMaps( - flattenedArray.getMPIPatternP2P(), blockSize); - constraintsMatrixDataInfoDevice.set_zero(xD, blockSize); dftfe::utils::deviceSynchronize(); diff --git a/src/dftOperator/computeNonLocalHamiltonianTimesXMemoryOptBatchGEMMDevice.cc b/src/dftOperator/computeNonLocalHamiltonianTimesXMemoryOptBatchGEMMDevice.cc index 349ff0004..35ec71e9d 100644 --- a/src/dftOperator/computeNonLocalHamiltonianTimesXMemoryOptBatchGEMMDevice.cc +++ b/src/dftOperator/computeNonLocalHamiltonianTimesXMemoryOptBatchGEMMDevice.cc @@ -253,8 +253,7 @@ kohnShamDFTOperatorDeviceClass:: if (std::is_same>::value) { utils::deviceKernelsGeneric::copyComplexArrToRealArrsDevice( - (d_parallelChebyBlockVectorDevice.localSize() * - d_parallelChebyBlockVectorDevice.numVectors()), + (d_tempRealVec.size()), dst, d_tempRealVec.begin(), d_tempImagVec.begin()); @@ -269,8 +268,7 @@ kohnShamDFTOperatorDeviceClass:: utils::deviceKernelsGeneric::copyRealArrsToComplexArrDevice( - (d_parallelChebyBlockVectorDevice.localSize() * - d_parallelChebyBlockVectorDevice.numVectors()), + (d_tempRealVec.size()), d_tempRealVec.begin(), d_tempImagVec.begin(), dst); diff --git a/src/dftOperator/hamiltonianMatrixCalculatorFlattenedDevice.cc b/src/dftOperator/hamiltonianMatrixCalculatorFlattenedDevice.cc index 963d2cafd..661c67f8f 100644 --- a/src/dftOperator/hamiltonianMatrixCalculatorFlattenedDevice.cc +++ b/src/dftOperator/hamiltonianMatrixCalculatorFlattenedDevice.cc @@ -68,9 +68,9 @@ namespace const unsigned int numkPoints, const double * shapeFunctionValues, const double * shapeFunctionValuesTransposed, - const double * shapeFunctionGradientValuesXTransposed, - const double * shapeFunctionGradientValuesYTransposed, - const double * shapeFunctionGradientValuesZTransposed, + const double * shapeFunctionGradientValues, + const double * 
inverseJacobianValues, + const int areAllCellsAffineOrCartesianFlag, const double * cellShapeFunctionGradientIntegral, const double * vEffJxW, const double * JxW, @@ -124,9 +124,9 @@ namespace const unsigned int numkPoints, const double * shapeFunctionValues, const double * shapeFunctionValuesTransposed, - const double * shapeFunctionGradientValuesXTransposed, - const double * shapeFunctionGradientValuesYTransposed, - const double * shapeFunctionGradientValuesZTransposed, + const double * shapeFunctionGradientValues, + const double * inverseJacobianValues, + const int areAllCellsAffineOrCartesianFlag, const double * cellShapeFunctionGradientIntegral, const double * vEffJxW, const double * JxW, @@ -165,21 +165,82 @@ namespace const double shapeJ = shapeFunctionValuesTransposed[q * numDofsPerCell + cellDofIndexJ]; - const double gradShapeXI = - shapeFunctionGradientValuesXTransposed[cellIndex * numQuadPoints * - numDofsPerCell + - numDofsPerCell * q + - cellDofIndexI]; - const double gradShapeYI = - shapeFunctionGradientValuesYTransposed[cellIndex * numQuadPoints * - numDofsPerCell + - numDofsPerCell * q + - cellDofIndexI]; - const double gradShapeZI = - shapeFunctionGradientValuesZTransposed[cellIndex * numQuadPoints * - numDofsPerCell + - numDofsPerCell * q + - cellDofIndexI]; + double gradShapeXI, gradShapeXJ, gradShapeYI, gradShapeYJ, + gradShapeZI, gradShapeZJ; + const double gradShapeXIRef = + shapeFunctionGradientValues[numDofsPerCell * q + cellDofIndexI]; + const double gradShapeYIRef = + shapeFunctionGradientValues[numDofsPerCell * numQuadPoints + + numDofsPerCell * q + cellDofIndexI]; + const double gradShapeZIRef = + shapeFunctionGradientValues[numDofsPerCell * numQuadPoints * 2 + + numDofsPerCell * q + cellDofIndexI]; + if (areAllCellsAffineOrCartesianFlag == 0) + { + const double Jxx = + inverseJacobianValues[cellIndex * numQuadPoints * 9 + q * 9 + + 0]; + const double Jxy = + inverseJacobianValues[cellIndex * numQuadPoints * 9 + q * 9 + + 1]; + const 
double Jxz = + inverseJacobianValues[cellIndex * numQuadPoints * 9 + q * 9 + + 2]; + const double Jyx = + inverseJacobianValues[cellIndex * numQuadPoints * 9 + q * 9 + + 3]; + const double Jyy = + inverseJacobianValues[cellIndex * numQuadPoints * 9 + q * 9 + + 4]; + const double Jyz = + inverseJacobianValues[cellIndex * numQuadPoints * 9 + q * 9 + + 5]; + const double Jzx = + inverseJacobianValues[cellIndex * numQuadPoints * 9 + q * 9 + + 6]; + const double Jzy = + inverseJacobianValues[cellIndex * numQuadPoints * 9 + q * 9 + + 7]; + const double Jzz = + inverseJacobianValues[cellIndex * numQuadPoints * 9 + q * 9 + + 8]; + + gradShapeXI = gradShapeXIRef * Jxx + gradShapeYIRef * Jxy + + gradShapeZIRef * Jxz; + gradShapeYI = gradShapeXIRef * Jyx + gradShapeYIRef * Jyy + + gradShapeZIRef * Jyz; + gradShapeZI = gradShapeXIRef * Jzx + gradShapeYIRef * Jzy + + gradShapeZIRef * Jzz; + } + else if (areAllCellsAffineOrCartesianFlag == 1) + { + const double Jxx = inverseJacobianValues[cellIndex * 9 + 0]; + const double Jxy = inverseJacobianValues[cellIndex * 9 + 1]; + const double Jxz = inverseJacobianValues[cellIndex * 9 + 2]; + const double Jyx = inverseJacobianValues[cellIndex * 9 + 3]; + const double Jyy = inverseJacobianValues[cellIndex * 9 + 4]; + const double Jyz = inverseJacobianValues[cellIndex * 9 + 5]; + const double Jzx = inverseJacobianValues[cellIndex * 9 + 6]; + const double Jzy = inverseJacobianValues[cellIndex * 9 + 7]; + const double Jzz = inverseJacobianValues[cellIndex * 9 + 8]; + + gradShapeXI = gradShapeXIRef * Jxx + gradShapeYIRef * Jxy + + gradShapeZIRef * Jxz; + gradShapeYI = gradShapeXIRef * Jyx + gradShapeYIRef * Jyy + + gradShapeZIRef * Jyz; + gradShapeZI = gradShapeXIRef * Jzx + gradShapeYIRef * Jzy + + gradShapeZIRef * Jzz; + } + else if (areAllCellsAffineOrCartesianFlag == 2) + { + const double Jxx = inverseJacobianValues[cellIndex * 3 + 0]; + const double Jyy = inverseJacobianValues[cellIndex * 3 + 1]; + const double Jzz = 
inverseJacobianValues[cellIndex * 3 + 2]; + + gradShapeXI = gradShapeXIRef * Jxx; + gradShapeYI = gradShapeYIRef * Jyy; + gradShapeZI = gradShapeZIRef * Jzz; + } val += vEffJxW[cellIndex * numQuadPoints + q] * shapeI * shapeJ; @@ -227,9 +288,9 @@ namespace const unsigned int numkPoints, const double * shapeFunctionValues, const double * shapeFunctionValuesTransposed, - const double * shapeFunctionGradientValuesXTransposed, - const double * shapeFunctionGradientValuesYTransposed, - const double * shapeFunctionGradientValuesZTransposed, + const double * shapeFunctionGradientValues, + const double * inverseJacobianValues, + const int areAllCellsAffineOrCartesianFlag, const double * cellShapeFunctionGradientIntegral, const double * vEffJxW, const double * JxW, @@ -264,37 +325,105 @@ namespace const double shapeJ = shapeFunctionValuesTransposed[q * numDofsPerCell + cellDofIndexJ]; - const double gradShapeXI = - shapeFunctionGradientValuesXTransposed[cellIndex * numQuadPoints * - numDofsPerCell + - numDofsPerCell * q + - cellDofIndexI]; - const double gradShapeYI = - shapeFunctionGradientValuesYTransposed[cellIndex * numQuadPoints * - numDofsPerCell + - numDofsPerCell * q + - cellDofIndexI]; - const double gradShapeZI = - shapeFunctionGradientValuesZTransposed[cellIndex * numQuadPoints * - numDofsPerCell + - numDofsPerCell * q + - cellDofIndexI]; - - const double gradShapeXJ = - shapeFunctionGradientValuesXTransposed[cellIndex * numQuadPoints * - numDofsPerCell + - numDofsPerCell * q + - cellDofIndexJ]; - const double gradShapeYJ = - shapeFunctionGradientValuesYTransposed[cellIndex * numQuadPoints * - numDofsPerCell + - numDofsPerCell * q + - cellDofIndexJ]; - const double gradShapeZJ = - shapeFunctionGradientValuesZTransposed[cellIndex * numQuadPoints * - numDofsPerCell + - numDofsPerCell * q + - cellDofIndexJ]; + double gradShapeXI, gradShapeXJ, gradShapeYI, gradShapeYJ, + gradShapeZI, gradShapeZJ; + const double gradShapeXIRef = + 
shapeFunctionGradientValues[numDofsPerCell * q + cellDofIndexI]; + const double gradShapeYIRef = + shapeFunctionGradientValues[numDofsPerCell * numQuadPoints + + numDofsPerCell * q + cellDofIndexI]; + const double gradShapeZIRef = + shapeFunctionGradientValues[numDofsPerCell * numQuadPoints * 2 + + numDofsPerCell * q + cellDofIndexI]; + const double gradShapeXJRef = + shapeFunctionGradientValues[numDofsPerCell * q + cellDofIndexJ]; + const double gradShapeYJRef = + shapeFunctionGradientValues[numDofsPerCell * numQuadPoints + + numDofsPerCell * q + cellDofIndexJ]; + const double gradShapeZJRef = + shapeFunctionGradientValues[numDofsPerCell * numQuadPoints * 2 + + numDofsPerCell * q + cellDofIndexJ]; + if (areAllCellsAffineOrCartesianFlag == 0) + { + const double Jxx = + inverseJacobianValues[cellIndex * numQuadPoints * 9 + q * 9 + + 0]; + const double Jxy = + inverseJacobianValues[cellIndex * numQuadPoints * 9 + q * 9 + + 1]; + const double Jxz = + inverseJacobianValues[cellIndex * numQuadPoints * 9 + q * 9 + + 2]; + const double Jyx = + inverseJacobianValues[cellIndex * numQuadPoints * 9 + q * 9 + + 3]; + const double Jyy = + inverseJacobianValues[cellIndex * numQuadPoints * 9 + q * 9 + + 4]; + const double Jyz = + inverseJacobianValues[cellIndex * numQuadPoints * 9 + q * 9 + + 5]; + const double Jzx = + inverseJacobianValues[cellIndex * numQuadPoints * 9 + q * 9 + + 6]; + const double Jzy = + inverseJacobianValues[cellIndex * numQuadPoints * 9 + q * 9 + + 7]; + const double Jzz = + inverseJacobianValues[cellIndex * numQuadPoints * 9 + q * 9 + + 8]; + + gradShapeXI = gradShapeXIRef * Jxx + gradShapeYIRef * Jxy + + gradShapeZIRef * Jxz; + gradShapeYI = gradShapeXIRef * Jyx + gradShapeYIRef * Jyy + + gradShapeZIRef * Jyz; + gradShapeZI = gradShapeXIRef * Jzx + gradShapeYIRef * Jzy + + gradShapeZIRef * Jzz; + gradShapeXJ = gradShapeXJRef * Jxx + gradShapeYJRef * Jxy + + gradShapeZJRef * Jxz; + gradShapeYJ = gradShapeXJRef * Jyx + gradShapeYJRef * Jyy + + 
gradShapeZJRef * Jyz; + gradShapeZJ = gradShapeXJRef * Jzx + gradShapeYJRef * Jzy + + gradShapeZJRef * Jzz; + } + else if (areAllCellsAffineOrCartesianFlag == 1) + { + const double Jxx = inverseJacobianValues[cellIndex * 9 + 0]; + const double Jxy = inverseJacobianValues[cellIndex * 9 + 1]; + const double Jxz = inverseJacobianValues[cellIndex * 9 + 2]; + const double Jyx = inverseJacobianValues[cellIndex * 9 + 3]; + const double Jyy = inverseJacobianValues[cellIndex * 9 + 4]; + const double Jyz = inverseJacobianValues[cellIndex * 9 + 5]; + const double Jzx = inverseJacobianValues[cellIndex * 9 + 6]; + const double Jzy = inverseJacobianValues[cellIndex * 9 + 7]; + const double Jzz = inverseJacobianValues[cellIndex * 9 + 8]; + + gradShapeXI = gradShapeXIRef * Jxx + gradShapeYIRef * Jxy + + gradShapeZIRef * Jxz; + gradShapeYI = gradShapeXIRef * Jyx + gradShapeYIRef * Jyy + + gradShapeZIRef * Jyz; + gradShapeZI = gradShapeXIRef * Jzx + gradShapeYIRef * Jzy + + gradShapeZIRef * Jzz; + gradShapeXJ = gradShapeXJRef * Jxx + gradShapeYJRef * Jxy + + gradShapeZJRef * Jxz; + gradShapeYJ = gradShapeXJRef * Jyx + gradShapeYJRef * Jyy + + gradShapeZJRef * Jyz; + gradShapeZJ = gradShapeXJRef * Jzx + gradShapeYJRef * Jzy + + gradShapeZJRef * Jzz; + } + else if (areAllCellsAffineOrCartesianFlag == 2) + { + const double Jxx = inverseJacobianValues[cellIndex * 3 + 0]; + const double Jyy = inverseJacobianValues[cellIndex * 3 + 1]; + const double Jzz = inverseJacobianValues[cellIndex * 3 + 2]; + + gradShapeXI = gradShapeXIRef * Jxx; + gradShapeYI = gradShapeYIRef * Jyy; + gradShapeZI = gradShapeZIRef * Jzz; + gradShapeXJ = gradShapeXJRef * Jxx; + gradShapeYJ = gradShapeYJRef * Jyy; + gradShapeZJ = gradShapeZJRef * Jzz; + } val += @@ -334,9 +463,9 @@ namespace const unsigned int numkPoints, const double * shapeFunctionValues, const double * shapeFunctionValuesTransposed, - const double * shapeFunctionGradientValuesXTransposed, - const double * shapeFunctionGradientValuesYTransposed, - 
const double * shapeFunctionGradientValuesZTransposed, + const double * shapeFunctionGradientValues, + const double * inverseJacobianValues, + const int areAllCellsAffineOrCartesianFlag, const double * cellShapeFunctionGradientIntegral, const double * vEffJxW, const double * JxW, @@ -376,37 +505,105 @@ namespace const double shapeJ = shapeFunctionValuesTransposed[q * numDofsPerCell + cellDofIndexJ]; - const double gradShapeXI = - shapeFunctionGradientValuesXTransposed[cellIndex * numQuadPoints * - numDofsPerCell + - numDofsPerCell * q + - cellDofIndexI]; - const double gradShapeYI = - shapeFunctionGradientValuesYTransposed[cellIndex * numQuadPoints * - numDofsPerCell + - numDofsPerCell * q + - cellDofIndexI]; - const double gradShapeZI = - shapeFunctionGradientValuesZTransposed[cellIndex * numQuadPoints * - numDofsPerCell + - numDofsPerCell * q + - cellDofIndexI]; - - const double gradShapeXJ = - shapeFunctionGradientValuesXTransposed[cellIndex * numQuadPoints * - numDofsPerCell + - numDofsPerCell * q + - cellDofIndexJ]; - const double gradShapeYJ = - shapeFunctionGradientValuesYTransposed[cellIndex * numQuadPoints * - numDofsPerCell + - numDofsPerCell * q + - cellDofIndexJ]; - const double gradShapeZJ = - shapeFunctionGradientValuesZTransposed[cellIndex * numQuadPoints * - numDofsPerCell + - numDofsPerCell * q + - cellDofIndexJ]; + double gradShapeXI, gradShapeXJ, gradShapeYI, gradShapeYJ, + gradShapeZI, gradShapeZJ; + const double gradShapeXIRef = + shapeFunctionGradientValues[numDofsPerCell * q + cellDofIndexI]; + const double gradShapeYIRef = + shapeFunctionGradientValues[numDofsPerCell * numQuadPoints + + numDofsPerCell * q + cellDofIndexI]; + const double gradShapeZIRef = + shapeFunctionGradientValues[numDofsPerCell * numQuadPoints * 2 + + numDofsPerCell * q + cellDofIndexI]; + const double gradShapeXJRef = + shapeFunctionGradientValues[numDofsPerCell * q + cellDofIndexJ]; + const double gradShapeYJRef = + shapeFunctionGradientValues[numDofsPerCell * 
numQuadPoints + + numDofsPerCell * q + cellDofIndexJ]; + const double gradShapeZJRef = + shapeFunctionGradientValues[numDofsPerCell * numQuadPoints * 2 + + numDofsPerCell * q + cellDofIndexJ]; + if (areAllCellsAffineOrCartesianFlag == 0) + { + const double Jxx = + inverseJacobianValues[cellIndex * numQuadPoints * 9 + q * 9 + + 0]; + const double Jxy = + inverseJacobianValues[cellIndex * numQuadPoints * 9 + q * 9 + + 1]; + const double Jxz = + inverseJacobianValues[cellIndex * numQuadPoints * 9 + q * 9 + + 2]; + const double Jyx = + inverseJacobianValues[cellIndex * numQuadPoints * 9 + q * 9 + + 3]; + const double Jyy = + inverseJacobianValues[cellIndex * numQuadPoints * 9 + q * 9 + + 4]; + const double Jyz = + inverseJacobianValues[cellIndex * numQuadPoints * 9 + q * 9 + + 5]; + const double Jzx = + inverseJacobianValues[cellIndex * numQuadPoints * 9 + q * 9 + + 6]; + const double Jzy = + inverseJacobianValues[cellIndex * numQuadPoints * 9 + q * 9 + + 7]; + const double Jzz = + inverseJacobianValues[cellIndex * numQuadPoints * 9 + q * 9 + + 8]; + + gradShapeXI = gradShapeXIRef * Jxx + gradShapeYIRef * Jxy + + gradShapeZIRef * Jxz; + gradShapeYI = gradShapeXIRef * Jyx + gradShapeYIRef * Jyy + + gradShapeZIRef * Jyz; + gradShapeZI = gradShapeXIRef * Jzx + gradShapeYIRef * Jzy + + gradShapeZIRef * Jzz; + gradShapeXJ = gradShapeXJRef * Jxx + gradShapeYJRef * Jxy + + gradShapeZJRef * Jxz; + gradShapeYJ = gradShapeXJRef * Jyx + gradShapeYJRef * Jyy + + gradShapeZJRef * Jyz; + gradShapeZJ = gradShapeXJRef * Jzx + gradShapeYJRef * Jzy + + gradShapeZJRef * Jzz; + } + else if (areAllCellsAffineOrCartesianFlag == 1) + { + const double Jxx = inverseJacobianValues[cellIndex * 9 + 0]; + const double Jxy = inverseJacobianValues[cellIndex * 9 + 1]; + const double Jxz = inverseJacobianValues[cellIndex * 9 + 2]; + const double Jyx = inverseJacobianValues[cellIndex * 9 + 3]; + const double Jyy = inverseJacobianValues[cellIndex * 9 + 4]; + const double Jyz = 
inverseJacobianValues[cellIndex * 9 + 5]; + const double Jzx = inverseJacobianValues[cellIndex * 9 + 6]; + const double Jzy = inverseJacobianValues[cellIndex * 9 + 7]; + const double Jzz = inverseJacobianValues[cellIndex * 9 + 8]; + + gradShapeXI = gradShapeXIRef * Jxx + gradShapeYIRef * Jxy + + gradShapeZIRef * Jxz; + gradShapeYI = gradShapeXIRef * Jyx + gradShapeYIRef * Jyy + + gradShapeZIRef * Jyz; + gradShapeZI = gradShapeXIRef * Jzx + gradShapeYIRef * Jzy + + gradShapeZIRef * Jzz; + gradShapeXJ = gradShapeXJRef * Jxx + gradShapeYJRef * Jxy + + gradShapeZJRef * Jxz; + gradShapeYJ = gradShapeXJRef * Jyx + gradShapeYJRef * Jyy + + gradShapeZJRef * Jyz; + gradShapeZJ = gradShapeXJRef * Jzx + gradShapeYJRef * Jzy + + gradShapeZJRef * Jzz; + } + else if (areAllCellsAffineOrCartesianFlag == 2) + { + const double Jxx = inverseJacobianValues[cellIndex * 3 + 0]; + const double Jyy = inverseJacobianValues[cellIndex * 3 + 1]; + const double Jzz = inverseJacobianValues[cellIndex * 3 + 2]; + + gradShapeXI = gradShapeXIRef * Jxx; + gradShapeYI = gradShapeYIRef * Jyy; + gradShapeZI = gradShapeZIRef * Jzz; + gradShapeXJ = gradShapeXJRef * Jxx; + gradShapeYJ = gradShapeYJRef * Jyy; + gradShapeZJ = gradShapeZJRef * Jzz; + } val += @@ -462,12 +659,12 @@ namespace const unsigned int numQuadPoints, const double * shapeFunctionValues, const double * shapeFunctionValuesTransposed, - const double *shapeFunctionGradientValuesXTransposed, - const double *shapeFunctionGradientValuesYTransposed, - const double *shapeFunctionGradientValuesZTransposed, - const double *vEffPrimeJxW, - const double *JxW, - double * cellHamiltonianPrimeMatrixFlattened) + const double * shapeFunctionGradientValues, + const double * inverseJacobianValues, + const int areAllCellsAffineOrCartesianFlag, + const double * vEffPrimeJxW, + const double * JxW, + double *cellHamiltonianPrimeMatrixFlattened) { const unsigned int globalThreadId = blockIdx.x * blockDim.x + threadIdx.x; @@ -506,9 +703,9 @@ namespace const 
unsigned int numQuadPoints, const double * shapeFunctionValues, const double * shapeFunctionValuesTransposed, - const double * shapeFunctionGradientValuesXTransposed, - const double * shapeFunctionGradientValuesYTransposed, - const double * shapeFunctionGradientValuesZTransposed, + const double * shapeFunctionGradientValues, + const double * inverseJacobianValues, + const int areAllCellsAffineOrCartesianFlag, const double * vEffPrimeJxW, const double * JxW, dftfe::utils::deviceDoubleComplex *cellHamiltonianPrimeMatrixFlattened) @@ -553,9 +750,9 @@ namespace const unsigned int numQuadPoints, const double * shapeFunctionValues, const double * shapeFunctionValuesTransposed, - const double * shapeFunctionGradientValuesXTransposed, - const double * shapeFunctionGradientValuesYTransposed, - const double * shapeFunctionGradientValuesZTransposed, + const double * shapeFunctionGradientValues, + const double * inverseJacobianValues, + const int areAllCellsAffineOrCartesianFlag, const double * vEffPrimeJxW, const double * JxW, const double * derExcPrimeWithSigmaTimesGradRhoJxW, @@ -585,37 +782,105 @@ namespace const double shapeJ = shapeFunctionValuesTransposed[q * numDofsPerCell + cellDofIndexJ]; - const double gradShapeXI = - shapeFunctionGradientValuesXTransposed[cellIndex * numQuadPoints * - numDofsPerCell + - numDofsPerCell * q + - cellDofIndexI]; - const double gradShapeYI = - shapeFunctionGradientValuesYTransposed[cellIndex * numQuadPoints * - numDofsPerCell + - numDofsPerCell * q + - cellDofIndexI]; - const double gradShapeZI = - shapeFunctionGradientValuesZTransposed[cellIndex * numQuadPoints * - numDofsPerCell + - numDofsPerCell * q + - cellDofIndexI]; - - const double gradShapeXJ = - shapeFunctionGradientValuesXTransposed[cellIndex * numQuadPoints * - numDofsPerCell + - numDofsPerCell * q + - cellDofIndexJ]; - const double gradShapeYJ = - shapeFunctionGradientValuesYTransposed[cellIndex * numQuadPoints * - numDofsPerCell + - numDofsPerCell * q + - cellDofIndexJ]; - 
const double gradShapeZJ = - shapeFunctionGradientValuesZTransposed[cellIndex * numQuadPoints * - numDofsPerCell + - numDofsPerCell * q + - cellDofIndexJ]; + double gradShapeXI, gradShapeXJ, gradShapeYI, gradShapeYJ, + gradShapeZI, gradShapeZJ; + const double gradShapeXIRef = + shapeFunctionGradientValues[numDofsPerCell * q + cellDofIndexI]; + const double gradShapeYIRef = + shapeFunctionGradientValues[numDofsPerCell * numQuadPoints + + numDofsPerCell * q + cellDofIndexI]; + const double gradShapeZIRef = + shapeFunctionGradientValues[numDofsPerCell * numQuadPoints * 2 + + numDofsPerCell * q + cellDofIndexI]; + const double gradShapeXJRef = + shapeFunctionGradientValues[numDofsPerCell * q + cellDofIndexJ]; + const double gradShapeYJRef = + shapeFunctionGradientValues[numDofsPerCell * numQuadPoints + + numDofsPerCell * q + cellDofIndexJ]; + const double gradShapeZJRef = + shapeFunctionGradientValues[numDofsPerCell * numQuadPoints * 2 + + numDofsPerCell * q + cellDofIndexJ]; + if (areAllCellsAffineOrCartesianFlag == 0) + { + const double Jxx = + inverseJacobianValues[cellIndex * numQuadPoints * 9 + q * 9 + + 0]; + const double Jxy = + inverseJacobianValues[cellIndex * numQuadPoints * 9 + q * 9 + + 1]; + const double Jxz = + inverseJacobianValues[cellIndex * numQuadPoints * 9 + q * 9 + + 2]; + const double Jyx = + inverseJacobianValues[cellIndex * numQuadPoints * 9 + q * 9 + + 3]; + const double Jyy = + inverseJacobianValues[cellIndex * numQuadPoints * 9 + q * 9 + + 4]; + const double Jyz = + inverseJacobianValues[cellIndex * numQuadPoints * 9 + q * 9 + + 5]; + const double Jzx = + inverseJacobianValues[cellIndex * numQuadPoints * 9 + q * 9 + + 6]; + const double Jzy = + inverseJacobianValues[cellIndex * numQuadPoints * 9 + q * 9 + + 7]; + const double Jzz = + inverseJacobianValues[cellIndex * numQuadPoints * 9 + q * 9 + + 8]; + + gradShapeXI = gradShapeXIRef * Jxx + gradShapeYIRef * Jxy + + gradShapeZIRef * Jxz; + gradShapeYI = gradShapeXIRef * Jyx + gradShapeYIRef * 
Jyy + + gradShapeZIRef * Jyz; + gradShapeZI = gradShapeXIRef * Jzx + gradShapeYIRef * Jzy + + gradShapeZIRef * Jzz; + gradShapeXJ = gradShapeXJRef * Jxx + gradShapeYJRef * Jxy + + gradShapeZJRef * Jxz; + gradShapeYJ = gradShapeXJRef * Jyx + gradShapeYJRef * Jyy + + gradShapeZJRef * Jyz; + gradShapeZJ = gradShapeXJRef * Jzx + gradShapeYJRef * Jzy + + gradShapeZJRef * Jzz; + } + else if (areAllCellsAffineOrCartesianFlag == 1) + { + const double Jxx = inverseJacobianValues[cellIndex * 9 + 0]; + const double Jxy = inverseJacobianValues[cellIndex * 9 + 1]; + const double Jxz = inverseJacobianValues[cellIndex * 9 + 2]; + const double Jyx = inverseJacobianValues[cellIndex * 9 + 3]; + const double Jyy = inverseJacobianValues[cellIndex * 9 + 4]; + const double Jyz = inverseJacobianValues[cellIndex * 9 + 5]; + const double Jzx = inverseJacobianValues[cellIndex * 9 + 6]; + const double Jzy = inverseJacobianValues[cellIndex * 9 + 7]; + const double Jzz = inverseJacobianValues[cellIndex * 9 + 8]; + + gradShapeXI = gradShapeXIRef * Jxx + gradShapeYIRef * Jxy + + gradShapeZIRef * Jxz; + gradShapeYI = gradShapeXIRef * Jyx + gradShapeYIRef * Jyy + + gradShapeZIRef * Jyz; + gradShapeZI = gradShapeXIRef * Jzx + gradShapeYIRef * Jzy + + gradShapeZIRef * Jzz; + gradShapeXJ = gradShapeXJRef * Jxx + gradShapeYJRef * Jxy + + gradShapeZJRef * Jxz; + gradShapeYJ = gradShapeXJRef * Jyx + gradShapeYJRef * Jyy + + gradShapeZJRef * Jyz; + gradShapeZJ = gradShapeXJRef * Jzx + gradShapeYJRef * Jzy + + gradShapeZJRef * Jzz; + } + else if (areAllCellsAffineOrCartesianFlag == 2) + { + const double Jxx = inverseJacobianValues[cellIndex * 3 + 0]; + const double Jyy = inverseJacobianValues[cellIndex * 3 + 1]; + const double Jzz = inverseJacobianValues[cellIndex * 3 + 2]; + + gradShapeXI = gradShapeXIRef * Jxx; + gradShapeYI = gradShapeYIRef * Jyy; + gradShapeZI = gradShapeZIRef * Jzz; + gradShapeXJ = gradShapeXJRef * Jxx; + gradShapeYJ = gradShapeYJRef * Jyy; + gradShapeZJ = gradShapeZJRef * Jzz; + } 
val += @@ -647,9 +912,9 @@ namespace const unsigned int numQuadPoints, const double * shapeFunctionValues, const double * shapeFunctionValuesTransposed, - const double * shapeFunctionGradientValuesXTransposed, - const double * shapeFunctionGradientValuesYTransposed, - const double * shapeFunctionGradientValuesZTransposed, + const double * shapeFunctionGradientValues, + const double * inverseJacobianValues, + const int areAllCellsAffineOrCartesianFlag, const double * vEffPrimeJxW, const double * JxW, const double * derExcPrimeWithSigmaTimesGradRhoJxW, @@ -679,37 +944,105 @@ namespace const double shapeJ = shapeFunctionValuesTransposed[q * numDofsPerCell + cellDofIndexJ]; - const double gradShapeXI = - shapeFunctionGradientValuesXTransposed[cellIndex * numQuadPoints * - numDofsPerCell + - numDofsPerCell * q + - cellDofIndexI]; - const double gradShapeYI = - shapeFunctionGradientValuesYTransposed[cellIndex * numQuadPoints * - numDofsPerCell + - numDofsPerCell * q + - cellDofIndexI]; - const double gradShapeZI = - shapeFunctionGradientValuesZTransposed[cellIndex * numQuadPoints * - numDofsPerCell + - numDofsPerCell * q + - cellDofIndexI]; - - const double gradShapeXJ = - shapeFunctionGradientValuesXTransposed[cellIndex * numQuadPoints * - numDofsPerCell + - numDofsPerCell * q + - cellDofIndexJ]; - const double gradShapeYJ = - shapeFunctionGradientValuesYTransposed[cellIndex * numQuadPoints * - numDofsPerCell + - numDofsPerCell * q + - cellDofIndexJ]; - const double gradShapeZJ = - shapeFunctionGradientValuesZTransposed[cellIndex * numQuadPoints * - numDofsPerCell + - numDofsPerCell * q + - cellDofIndexJ]; + double gradShapeXI, gradShapeXJ, gradShapeYI, gradShapeYJ, + gradShapeZI, gradShapeZJ; + const double gradShapeXIRef = + shapeFunctionGradientValues[numDofsPerCell * q + cellDofIndexI]; + const double gradShapeYIRef = + shapeFunctionGradientValues[numDofsPerCell * numQuadPoints + + numDofsPerCell * q + cellDofIndexI]; + const double gradShapeZIRef = + 
shapeFunctionGradientValues[numDofsPerCell * numQuadPoints * 2 + + numDofsPerCell * q + cellDofIndexI]; + const double gradShapeXJRef = + shapeFunctionGradientValues[numDofsPerCell * q + cellDofIndexJ]; + const double gradShapeYJRef = + shapeFunctionGradientValues[numDofsPerCell * numQuadPoints + + numDofsPerCell * q + cellDofIndexJ]; + const double gradShapeZJRef = + shapeFunctionGradientValues[numDofsPerCell * numQuadPoints * 2 + + numDofsPerCell * q + cellDofIndexJ]; + if (areAllCellsAffineOrCartesianFlag == 0) + { + const double Jxx = + inverseJacobianValues[cellIndex * numQuadPoints * 9 + q * 9 + + 0]; + const double Jxy = + inverseJacobianValues[cellIndex * numQuadPoints * 9 + q * 9 + + 1]; + const double Jxz = + inverseJacobianValues[cellIndex * numQuadPoints * 9 + q * 9 + + 2]; + const double Jyx = + inverseJacobianValues[cellIndex * numQuadPoints * 9 + q * 9 + + 3]; + const double Jyy = + inverseJacobianValues[cellIndex * numQuadPoints * 9 + q * 9 + + 4]; + const double Jyz = + inverseJacobianValues[cellIndex * numQuadPoints * 9 + q * 9 + + 5]; + const double Jzx = + inverseJacobianValues[cellIndex * numQuadPoints * 9 + q * 9 + + 6]; + const double Jzy = + inverseJacobianValues[cellIndex * numQuadPoints * 9 + q * 9 + + 7]; + const double Jzz = + inverseJacobianValues[cellIndex * numQuadPoints * 9 + q * 9 + + 8]; + + gradShapeXI = gradShapeXIRef * Jxx + gradShapeYIRef * Jxy + + gradShapeZIRef * Jxz; + gradShapeYI = gradShapeXIRef * Jyx + gradShapeYIRef * Jyy + + gradShapeZIRef * Jyz; + gradShapeZI = gradShapeXIRef * Jzx + gradShapeYIRef * Jzy + + gradShapeZIRef * Jzz; + gradShapeXJ = gradShapeXJRef * Jxx + gradShapeYJRef * Jxy + + gradShapeZJRef * Jxz; + gradShapeYJ = gradShapeXJRef * Jyx + gradShapeYJRef * Jyy + + gradShapeZJRef * Jyz; + gradShapeZJ = gradShapeXJRef * Jzx + gradShapeYJRef * Jzy + + gradShapeZJRef * Jzz; + } + else if (areAllCellsAffineOrCartesianFlag == 1) + { + const double Jxx = inverseJacobianValues[cellIndex * 9 + 0]; + const double 
Jxy = inverseJacobianValues[cellIndex * 9 + 1]; + const double Jxz = inverseJacobianValues[cellIndex * 9 + 2]; + const double Jyx = inverseJacobianValues[cellIndex * 9 + 3]; + const double Jyy = inverseJacobianValues[cellIndex * 9 + 4]; + const double Jyz = inverseJacobianValues[cellIndex * 9 + 5]; + const double Jzx = inverseJacobianValues[cellIndex * 9 + 6]; + const double Jzy = inverseJacobianValues[cellIndex * 9 + 7]; + const double Jzz = inverseJacobianValues[cellIndex * 9 + 8]; + + gradShapeXI = gradShapeXIRef * Jxx + gradShapeYIRef * Jxy + + gradShapeZIRef * Jxz; + gradShapeYI = gradShapeXIRef * Jyx + gradShapeYIRef * Jyy + + gradShapeZIRef * Jyz; + gradShapeZI = gradShapeXIRef * Jzx + gradShapeYIRef * Jzy + + gradShapeZIRef * Jzz; + gradShapeXJ = gradShapeXJRef * Jxx + gradShapeYJRef * Jxy + + gradShapeZJRef * Jxz; + gradShapeYJ = gradShapeXJRef * Jyx + gradShapeYJRef * Jyy + + gradShapeZJRef * Jyz; + gradShapeZJ = gradShapeXJRef * Jzx + gradShapeYJRef * Jzy + + gradShapeZJRef * Jzz; + } + else if (areAllCellsAffineOrCartesianFlag == 2) + { + const double Jxx = inverseJacobianValues[cellIndex * 3 + 0]; + const double Jyy = inverseJacobianValues[cellIndex * 3 + 1]; + const double Jzz = inverseJacobianValues[cellIndex * 3 + 2]; + + gradShapeXI = gradShapeXIRef * Jxx; + gradShapeYI = gradShapeYIRef * Jyy; + gradShapeZI = gradShapeZIRef * Jzz; + gradShapeXJ = gradShapeXJRef * Jxx; + gradShapeYJ = gradShapeYJRef * Jyy; + gradShapeZJ = gradShapeZJRef * Jzz; + } val += @@ -764,17 +1097,18 @@ kohnShamDFTOperatorDeviceClass:: !d_isStiffnessMatrixExternalPotCorrComputed && !onlyHPrimePartForFirstOrderDensityMatResponse) { + basisOperationsPtrDevice->reinit(0, 0, dftPtr->d_lpspQuadratureId); #ifdef DFTFE_WITH_DEVICE_LANG_CUDA hamMatrixExtPotCorr<<<(d_numLocallyOwnedCells * d_numberNodesPerElement * d_numberNodesPerElement + (dftfe::utils::DEVICE_BLOCK_SIZE - 1)) / dftfe::utils::DEVICE_BLOCK_SIZE, dftfe::utils::DEVICE_BLOCK_SIZE>>>( - d_numLocallyOwnedCells, - 
d_numberNodesPerElement, - d_numQuadPointsLpsp, - d_shapeFunctionValueLpspDevice.begin(), - d_shapeFunctionValueTransposedLpspDevice.begin(), + basisOperationsPtrDevice->nCells(), + basisOperationsPtrDevice->nDofsPerCell(), + basisOperationsPtrDevice->nQuadsPerCell(), + basisOperationsPtrDevice->shapeFunctionBasisData(true).data(), + basisOperationsPtrDevice->shapeFunctionBasisData(false).data(), d_vEffExternalPotCorrJxWDevice.begin(), d_cellHamiltonianMatrixExternalPotCorrFlattenedDevice.begin()); #elif DFTFE_WITH_DEVICE_LANG_HIP @@ -787,18 +1121,18 @@ kohnShamDFTOperatorDeviceClass:: dftfe::utils::DEVICE_BLOCK_SIZE, 0, 0, - d_numLocallyOwnedCells, - d_numberNodesPerElement, - d_numQuadPointsLpsp, - d_shapeFunctionValueLpspDevice.begin(), - d_shapeFunctionValueTransposedLpspDevice.begin(), + basisOperationsPtrDevice->nCells(), + basisOperationsPtrDevice->nDofsPerCell(), + basisOperationsPtrDevice->nQuadsPerCell(), + basisOperationsPtrDevice->shapeFunctionBasisData(true).data(), + basisOperationsPtrDevice->shapeFunctionBasisData(false).data(), d_vEffExternalPotCorrJxWDevice.begin(), d_cellHamiltonianMatrixExternalPotCorrFlattenedDevice.begin()); #endif d_isStiffnessMatrixExternalPotCorrComputed = true; } - + basisOperationsPtrDevice->reinit(0, 0, dftPtr->d_densityQuadratureId); if (onlyHPrimePartForFirstOrderDensityMatResponse) { if (dftPtr->d_excManagerPtr->getDensityBasedFamilyType() == @@ -810,14 +1144,14 @@ kohnShamDFTOperatorDeviceClass:: (dftfe::utils::DEVICE_BLOCK_SIZE - 1)) / dftfe::utils::DEVICE_BLOCK_SIZE, dftfe::utils::DEVICE_BLOCK_SIZE>>>( - d_numLocallyOwnedCells, - d_numberNodesPerElement, - d_numQuadPoints, - d_shapeFunctionValueDevice.begin(), - d_shapeFunctionValueTransposedDevice.begin(), - d_shapeFunctionGradientValueXTransposedDevice.begin(), - d_shapeFunctionGradientValueYTransposedDevice.begin(), - d_shapeFunctionGradientValueZTransposedDevice.begin(), + basisOperationsPtrDevice->nCells(), + basisOperationsPtrDevice->nDofsPerCell(), + 
basisOperationsPtrDevice->nQuadsPerCell(), + basisOperationsPtrDevice->shapeFunctionBasisData(true).data(), + basisOperationsPtrDevice->shapeFunctionBasisData(false).data(), + basisOperationsPtrDevice->shapeFunctionGradientBasisData().data(), + basisOperationsPtrDevice->inverseJacobiansBasisData().data(), + basisOperationsPtrDevice->cellsTypeFlag(), d_vEffJxWDevice.begin(), d_cellJxWValuesDevice.begin(), d_derExcWithSigmaTimesGradRhoJxWDevice.begin(), @@ -835,14 +1169,14 @@ kohnShamDFTOperatorDeviceClass:: dftfe::utils::DEVICE_BLOCK_SIZE, 0, 0, - d_numLocallyOwnedCells, - d_numberNodesPerElement, - d_numQuadPoints, - d_shapeFunctionValueDevice.begin(), - d_shapeFunctionValueTransposedDevice.begin(), - d_shapeFunctionGradientValueXTransposedDevice.begin(), - d_shapeFunctionGradientValueYTransposedDevice.begin(), - d_shapeFunctionGradientValueZTransposedDevice.begin(), + basisOperationsPtrDevice->nCells(), + basisOperationsPtrDevice->nDofsPerCell(), + basisOperationsPtrDevice->nQuadsPerCell(), + basisOperationsPtrDevice->shapeFunctionBasisData(true).data(), + basisOperationsPtrDevice->shapeFunctionBasisData(false).data(), + basisOperationsPtrDevice->shapeFunctionGradientBasisData().data(), + basisOperationsPtrDevice->inverseJacobiansBasisData().data(), + basisOperationsPtrDevice->cellsTypeFlag(), d_vEffJxWDevice.begin(), d_cellJxWValuesDevice.begin(), d_derExcWithSigmaTimesGradRhoJxWDevice.begin(), @@ -860,14 +1194,14 @@ kohnShamDFTOperatorDeviceClass:: (dftfe::utils::DEVICE_BLOCK_SIZE - 1)) / dftfe::utils::DEVICE_BLOCK_SIZE, dftfe::utils::DEVICE_BLOCK_SIZE>>>( - d_numLocallyOwnedCells, - d_numberNodesPerElement, - d_numQuadPoints, - d_shapeFunctionValueDevice.begin(), - d_shapeFunctionValueTransposedDevice.begin(), - d_shapeFunctionGradientValueXTransposedDevice.begin(), - d_shapeFunctionGradientValueYTransposedDevice.begin(), - d_shapeFunctionGradientValueZTransposedDevice.begin(), + basisOperationsPtrDevice->nCells(), + basisOperationsPtrDevice->nDofsPerCell(), + 
basisOperationsPtrDevice->nQuadsPerCell(), + basisOperationsPtrDevice->shapeFunctionBasisData(true).data(), + basisOperationsPtrDevice->shapeFunctionBasisData(false).data(), + basisOperationsPtrDevice->shapeFunctionGradientBasisData().data(), + basisOperationsPtrDevice->inverseJacobiansBasisData().data(), + basisOperationsPtrDevice->cellsTypeFlag(), d_vEffJxWDevice.begin(), d_cellJxWValuesDevice.begin(), dftfe::utils::makeDataTypeDeviceCompatible( @@ -884,14 +1218,14 @@ kohnShamDFTOperatorDeviceClass:: dftfe::utils::DEVICE_BLOCK_SIZE, 0, 0, - d_numLocallyOwnedCells, - d_numberNodesPerElement, - d_numQuadPoints, - d_shapeFunctionValueDevice.begin(), - d_shapeFunctionValueTransposedDevice.begin(), - d_shapeFunctionGradientValueXTransposedDevice.begin(), - d_shapeFunctionGradientValueYTransposedDevice.begin(), - d_shapeFunctionGradientValueZTransposedDevice.begin(), + basisOperationsPtrDevice->nCells(), + basisOperationsPtrDevice->nDofsPerCell(), + basisOperationsPtrDevice->nQuadsPerCell(), + basisOperationsPtrDevice->shapeFunctionBasisData(true).data(), + basisOperationsPtrDevice->shapeFunctionBasisData(false).data(), + basisOperationsPtrDevice->shapeFunctionGradientBasisData().data(), + basisOperationsPtrDevice->inverseJacobiansBasisData().data(), + basisOperationsPtrDevice->cellsTypeFlag(), d_vEffJxWDevice.begin(), d_cellJxWValuesDevice.begin(), dftfe::utils::makeDataTypeDeviceCompatible( @@ -911,17 +1245,17 @@ kohnShamDFTOperatorDeviceClass:: (dftfe::utils::DEVICE_BLOCK_SIZE - 1)) / dftfe::utils::DEVICE_BLOCK_SIZE, dftfe::utils::DEVICE_BLOCK_SIZE>>>( - d_numLocallyOwnedCells, - d_numberNodesPerElement, - d_numQuadPoints, + basisOperationsPtrDevice->nCells(), + basisOperationsPtrDevice->nDofsPerCell(), + basisOperationsPtrDevice->nQuadsPerCell(), spinIndex, (1 + dftPtr->d_dftParamsPtr->spinPolarized), dftPtr->d_kPointWeights.size(), - d_shapeFunctionValueDevice.begin(), - d_shapeFunctionValueTransposedDevice.begin(), - 
d_shapeFunctionGradientValueXTransposedDevice.begin(), - d_shapeFunctionGradientValueYTransposedDevice.begin(), - d_shapeFunctionGradientValueZTransposedDevice.begin(), + basisOperationsPtrDevice->shapeFunctionBasisData(true).data(), + basisOperationsPtrDevice->shapeFunctionBasisData(false).data(), + basisOperationsPtrDevice->shapeFunctionGradientBasisData().data(), + basisOperationsPtrDevice->inverseJacobiansBasisData().data(), + basisOperationsPtrDevice->cellsTypeFlag(), d_cellShapeFunctionGradientIntegralFlattenedDevice.begin(), d_vEffJxWDevice.begin(), d_cellJxWValuesDevice.begin(), @@ -943,17 +1277,17 @@ kohnShamDFTOperatorDeviceClass:: dftfe::utils::DEVICE_BLOCK_SIZE, 0, 0, - d_numLocallyOwnedCells, - d_numberNodesPerElement, - d_numQuadPoints, + basisOperationsPtrDevice->nCells(), + basisOperationsPtrDevice->nDofsPerCell(), + basisOperationsPtrDevice->nQuadsPerCell(), spinIndex, (1 + dftPtr->d_dftParamsPtr->spinPolarized), dftPtr->d_kPointWeights.size(), - d_shapeFunctionValueDevice.begin(), - d_shapeFunctionValueTransposedDevice.begin(), - d_shapeFunctionGradientValueXTransposedDevice.begin(), - d_shapeFunctionGradientValueYTransposedDevice.begin(), - d_shapeFunctionGradientValueZTransposedDevice.begin(), + basisOperationsPtrDevice->shapeFunctionBasisData(true).data(), + basisOperationsPtrDevice->shapeFunctionBasisData(false).data(), + basisOperationsPtrDevice->shapeFunctionGradientBasisData().data(), + basisOperationsPtrDevice->inverseJacobiansBasisData().data(), + basisOperationsPtrDevice->cellsTypeFlag(), d_cellShapeFunctionGradientIntegralFlattenedDevice.begin(), d_vEffJxWDevice.begin(), d_cellJxWValuesDevice.begin(), @@ -974,17 +1308,17 @@ kohnShamDFTOperatorDeviceClass:: (dftfe::utils::DEVICE_BLOCK_SIZE - 1)) / dftfe::utils::DEVICE_BLOCK_SIZE, dftfe::utils::DEVICE_BLOCK_SIZE>>>( - d_numLocallyOwnedCells, - d_numberNodesPerElement, - d_numQuadPoints, + basisOperationsPtrDevice->nCells(), + basisOperationsPtrDevice->nDofsPerCell(), + 
basisOperationsPtrDevice->nQuadsPerCell(), spinIndex, (1 + dftPtr->d_dftParamsPtr->spinPolarized), dftPtr->d_kPointWeights.size(), - d_shapeFunctionValueDevice.begin(), - d_shapeFunctionValueTransposedDevice.begin(), - d_shapeFunctionGradientValueXTransposedDevice.begin(), - d_shapeFunctionGradientValueYTransposedDevice.begin(), - d_shapeFunctionGradientValueZTransposedDevice.begin(), + basisOperationsPtrDevice->shapeFunctionBasisData(true).data(), + basisOperationsPtrDevice->shapeFunctionBasisData(false).data(), + basisOperationsPtrDevice->shapeFunctionGradientBasisData().data(), + basisOperationsPtrDevice->inverseJacobiansBasisData().data(), + basisOperationsPtrDevice->cellsTypeFlag(), d_cellShapeFunctionGradientIntegralFlattenedDevice.begin(), d_vEffJxWDevice.begin(), d_cellJxWValuesDevice.begin(), @@ -1005,17 +1339,17 @@ kohnShamDFTOperatorDeviceClass:: dftfe::utils::DEVICE_BLOCK_SIZE, 0, 0, - d_numLocallyOwnedCells, - d_numberNodesPerElement, - d_numQuadPoints, + basisOperationsPtrDevice->nCells(), + basisOperationsPtrDevice->nDofsPerCell(), + basisOperationsPtrDevice->nQuadsPerCell(), spinIndex, (1 + dftPtr->d_dftParamsPtr->spinPolarized), dftPtr->d_kPointWeights.size(), - d_shapeFunctionValueDevice.begin(), - d_shapeFunctionValueTransposedDevice.begin(), - d_shapeFunctionGradientValueXTransposedDevice.begin(), - d_shapeFunctionGradientValueYTransposedDevice.begin(), - d_shapeFunctionGradientValueZTransposedDevice.begin(), + basisOperationsPtrDevice->shapeFunctionBasisData(true).data(), + basisOperationsPtrDevice->shapeFunctionBasisData(false).data(), + basisOperationsPtrDevice->shapeFunctionGradientBasisData().data(), + basisOperationsPtrDevice->inverseJacobiansBasisData().data(), + basisOperationsPtrDevice->cellsTypeFlag(), d_cellShapeFunctionGradientIntegralFlattenedDevice.begin(), d_vEffJxWDevice.begin(), d_cellJxWValuesDevice.begin(), diff --git a/src/dftOperator/kohnShamDFTOperator.cc b/src/dftOperator/kohnShamDFTOperator.cc index 7d3fb765d..fed3b321f 
100644 --- a/src/dftOperator/kohnShamDFTOperator.cc +++ b/src/dftOperator/kohnShamDFTOperator.cc @@ -170,11 +170,6 @@ namespace dftfe d_normalCellIdToMacroCellIdMap, d_macroCellIdToNormalCellIdMap, d_FullflattenedArrayCellLocalProcIndexIdMap); - - getOverloadedConstraintMatrix()->precomputeMaps( - dftPtr->matrix_free_data.get_vector_partitioner(), - flattenedArray.get_partitioner(), - numberWaveFunctions); } template @@ -217,9 +212,6 @@ namespace dftfe d_normalCellIdToMacroCellIdMap, d_macroCellIdToNormalCellIdMap, d_FullflattenedArrayCellLocalProcIndexIdMap); - - getOverloadedConstraintMatrix()->precomputeMaps( - flattenedArray.getMPIPatternP2P(), numberWaveFunctions); } template @@ -1239,14 +1231,14 @@ namespace dftfe template void kohnShamDFTOperatorClass::XtHX( - const std::vector &X, - const unsigned int numberWaveFunctions, - std::vector & ProjHam) + const dataTypes::number * X, + const unsigned int numberWaveFunctions, + const unsigned int numberDofs, + std::vector &ProjHam) { // // Get access to number of locally owned nodes on the current processor // - const unsigned int numberDofs = X.size() / numberWaveFunctions; // // Resize ProjHam @@ -1326,8 +1318,9 @@ namespace dftfe template void kohnShamDFTOperatorClass::XtHX( - const std::vector & X, + const dataTypes::number * X, const unsigned int numberWaveFunctions, + const unsigned int numberDofs, const std::shared_ptr &processGrid, dftfe::ScaLAPACKMatrix & projHamPar, const bool onlyHPrimePartForFirstOrderDensityMatResponse) @@ -1335,7 +1328,6 @@ namespace dftfe // // Get access to number of locally owned nodes on the current processor // - const unsigned int numberDofs = X.size() / numberWaveFunctions; // create temporary arrays XBlock,Hx distributedCPUMultiVec XBlock, HXBlock; @@ -1500,9 +1492,10 @@ namespace dftfe template void kohnShamDFTOperatorClass::XtHXMixedPrec( - const std::vector & X, + const dataTypes::number * X, const unsigned int N, const unsigned int Ncore, + const unsigned int numberDofs, 
const std::shared_ptr &processGrid, dftfe::ScaLAPACKMatrix & projHamPar, const bool onlyHPrimePartForFirstOrderDensityMatResponse) @@ -1510,7 +1503,6 @@ namespace dftfe // // Get access to number of locally owned nodes on the current processor // - const unsigned int numberDofs = X.size() / N; // create temporary arrays XBlock,Hx distributedCPUMultiVec XBlock, HXBlock; @@ -1558,7 +1550,7 @@ namespace dftfe std::vector HXBlockSinglePrec; - std::vector XSinglePrec(&X[0], &X[0] + X.size()); + std::vector XSinglePrec(X, X + numberDofs * N); if (dftPtr->d_dftParamsPtr->verbosity >= 4) dftUtils::printCurrentMemoryUsage( diff --git a/src/dftOperator/kohnShamDFTOperatorDevice.cc b/src/dftOperator/kohnShamDFTOperatorDevice.cc index 0485516e3..c63a353ae 100644 --- a/src/dftOperator/kohnShamDFTOperatorDevice.cc +++ b/src/dftOperator/kohnShamDFTOperatorDevice.cc @@ -386,29 +386,29 @@ namespace dftfe return d_shapeFunctionValueNLPTransposedDevice; } - template - dftfe::utils::MemoryStorage & - kohnShamDFTOperatorDeviceClass:: - getShapeFunctionGradientValuesXTransposed() - { - return d_shapeFunctionGradientValueXTransposedDevice; - } - - template - dftfe::utils::MemoryStorage & - kohnShamDFTOperatorDeviceClass:: - getShapeFunctionGradientValuesYTransposed() - { - return d_shapeFunctionGradientValueYTransposedDevice; - } - - template - dftfe::utils::MemoryStorage & - kohnShamDFTOperatorDeviceClass:: - getShapeFunctionGradientValuesZTransposed() - { - return d_shapeFunctionGradientValueZTransposedDevice; - } + // template + // dftfe::utils::MemoryStorage & + // kohnShamDFTOperatorDeviceClass:: + // getShapeFunctionGradientValuesXTransposed() + // { + // return d_shapeFunctionGradientValueXTransposedDevice; + // } + + // template + // dftfe::utils::MemoryStorage & + // kohnShamDFTOperatorDeviceClass:: + // getShapeFunctionGradientValuesYTransposed() + // { + // return d_shapeFunctionGradientValueYTransposedDevice; + // } + + // template + // dftfe::utils::MemoryStorage & + // 
kohnShamDFTOperatorDeviceClass:: + // getShapeFunctionGradientValuesZTransposed() + // { + // return d_shapeFunctionGradientValueZTransposedDevice; + // } template dftfe::utils::MemoryStorage & @@ -458,7 +458,10 @@ namespace dftfe kohnShamDFTOperatorDeviceClass:: getParallelChebyBlockVectorDevice() { - return d_parallelChebyBlockVectorDevice; + const unsigned int BVec = + std::min(dftPtr->d_dftParamsPtr->chebyWfcBlockSize, + dftPtr->d_numEigenValues); + return basisOperationsPtrDevice->getMultiVector(BVec); } template @@ -466,7 +469,10 @@ namespace dftfe kohnShamDFTOperatorDeviceClass:: getParallelChebyBlockVector2Device() { - return d_parallelChebyBlockVector2Device; + const unsigned int BVec = + std::min(dftPtr->d_dftParamsPtr->chebyWfcBlockSize, + dftPtr->d_numEigenValues); + return basisOperationsPtrDevice->getMultiVector(BVec, 1); } template @@ -495,6 +501,8 @@ namespace dftfe { computing_timer.enter_subsection("kohnShamDFTOperatorDeviceClass setup"); + basisOperationsPtrDevice = dftPtr->basisOperationsPtrDevice; + basisOperationsPtrHost = dftPtr->basisOperationsPtrHost; dftPtr->matrix_free_data.initialize_dof_vector( d_invSqrtMassVector, dftPtr->d_densityDofHandlerIndex); @@ -569,25 +577,6 @@ namespace dftfe std::min(dftPtr->d_dftParamsPtr->chebyWfcBlockSize, numberWaveFunctions); - dftfe::linearAlgebra::createMultiVectorFromDealiiPartitioner( - dftPtr->matrix_free_data.get_vector_partitioner( - dftPtr->d_densityDofHandlerIndex), - BVec, - d_parallelChebyBlockVectorDevice); - - if (dftPtr->d_dftParamsPtr->mixingMethod == "LOW_RANK_DIELECM_PRECOND") - d_parallelChebyBlockVector2Device.reinit( - d_parallelChebyBlockVectorDevice); - - if (std::is_same>::value) - { - d_tempRealVec.resize((d_parallelChebyBlockVectorDevice.localSize() * - d_parallelChebyBlockVectorDevice.numVectors()), - 0.0); - d_tempImagVec.resize((d_parallelChebyBlockVectorDevice.localSize() * - d_parallelChebyBlockVectorDevice.numVectors()), - 0.0); - } const unsigned int n_ghosts = 
dftPtr->matrix_free_data @@ -597,6 +586,11 @@ namespace dftfe dftPtr->matrix_free_data .get_vector_partitioner(dftPtr->d_densityDofHandlerIndex) ->local_size(); + if (std::is_same>::value) + { + d_tempRealVec.resize(((localSize + n_ghosts) * BVec), 0.0); + d_tempImagVec.resize(((localSize + n_ghosts) * BVec), 0.0); + } dftfe::utils::MemoryStorage locallyOwnedProcBoundaryNodesVector(localSize, 0); @@ -642,16 +636,6 @@ namespace dftfe d_flattenedArrayCellLocalProcIndexIdMap); - - getOverloadedConstraintMatrix()->precomputeMaps( - flattenedArray.getMPIPatternP2P(), numberWaveFunctions); - - getOverloadedConstraintMatrixHost()->precomputeMaps( - dftPtr->matrix_free_data.get_vector_partitioner(), - dftPtr->matrix_free_data.get_vector_partitioner(), - 1); - - const unsigned int totalLocallyOwnedCells = dftPtr->matrix_free_data.n_physical_cells(); diff --git a/src/dftOperator/matrixVectorProductImplementationsDevice.cc b/src/dftOperator/matrixVectorProductImplementationsDevice.cc index 33461fca7..625354d54 100644 --- a/src/dftOperator/matrixVectorProductImplementationsDevice.cc +++ b/src/dftOperator/matrixVectorProductImplementationsDevice.cc @@ -86,8 +86,7 @@ kohnShamDFTOperatorDeviceClass:: if (std::is_same>::value) { utils::deviceKernelsGeneric::copyComplexArrToRealArrsDevice( - (d_parallelChebyBlockVectorDevice.localSize() * - d_parallelChebyBlockVectorDevice.numVectors()), + (d_tempRealVec.size()), dst, d_tempRealVec.begin(), d_tempImagVec.begin()); @@ -102,8 +101,7 @@ kohnShamDFTOperatorDeviceClass:: utils::deviceKernelsGeneric::copyRealArrsToComplexArrDevice( - (d_parallelChebyBlockVectorDevice.localSize() * - d_parallelChebyBlockVectorDevice.numVectors()), + (d_tempRealVec.size()), d_tempRealVec.begin(), d_tempImagVec.begin(), dst); diff --git a/src/dftOperator/shapeFunctionDataCalculatorDevice.cc b/src/dftOperator/shapeFunctionDataCalculatorDevice.cc index 24a144115..c21491078 100644 --- a/src/dftOperator/shapeFunctionDataCalculatorDevice.cc +++ 
b/src/dftOperator/shapeFunctionDataCalculatorDevice.cc @@ -368,8 +368,6 @@ kohnShamDFTOperatorDeviceClass:: // // resize data members // - // d_cellShapeFunctionGradientIntegralFlattened.clear(); - // d_cellShapeFunctionGradientIntegralFlattened.resize(numberPhysicalCells*numberDofsPerElement*numberDofsPerElement); d_cellJxWValues.clear(); d_cellJxWValues.resize(numberPhysicalCells * numberQuadraturePoints); @@ -380,33 +378,6 @@ kohnShamDFTOperatorDeviceClass:: numberDofsPerElement, 0.0); - d_shapeFunctionGradientValueX.resize(numberPhysicalCells * - numberQuadraturePoints * - numberDofsPerElement, - 0.0); - d_shapeFunctionGradientValueXTransposed.resize(numberPhysicalCells * - numberQuadraturePoints * - numberDofsPerElement, - 0.0); - - d_shapeFunctionGradientValueY.resize(numberPhysicalCells * - numberQuadraturePoints * - numberDofsPerElement, - 0.0); - d_shapeFunctionGradientValueYTransposed.resize(numberPhysicalCells * - numberQuadraturePoints * - numberDofsPerElement, - 0.0); - - d_shapeFunctionGradientValueZ.resize(numberPhysicalCells * - numberQuadraturePoints * - numberDofsPerElement, - 0.0); - d_shapeFunctionGradientValueZTransposed.resize(numberPhysicalCells * - numberQuadraturePoints * - numberDofsPerElement, - 0.0); - std::vector shapeFunctionValueLpsp(numberQuadraturePointsLpsp * numberDofsPerElement, 0.0); @@ -435,38 +406,6 @@ kohnShamDFTOperatorDeviceClass:: d_cellJxWValues[iElem * numberQuadraturePoints + q_point] = fe_values.JxW(q_point); - for (unsigned int iNode = 0; iNode < numberDofsPerElement; ++iNode) - for (unsigned int q_point = 0; q_point < numberQuadraturePoints; - ++q_point) - { - const dealii::Tensor<1, 3, double> &shape_grad = - fe_values.shape_grad(iNode, q_point); - - d_shapeFunctionGradientValueX[iElem * numberDofsPerElement * - numberQuadraturePoints + - iNode * numberQuadraturePoints + - q_point] = shape_grad[0]; - d_shapeFunctionGradientValueXTransposed - [iElem * numberQuadraturePoints * numberDofsPerElement + - q_point * 
numberDofsPerElement + iNode] = shape_grad[0]; - - d_shapeFunctionGradientValueY[iElem * numberDofsPerElement * - numberQuadraturePoints + - iNode * numberQuadraturePoints + - q_point] = shape_grad[1]; - d_shapeFunctionGradientValueYTransposed - [iElem * numberQuadraturePoints * numberDofsPerElement + - q_point * numberDofsPerElement + iNode] = shape_grad[1]; - - d_shapeFunctionGradientValueZ[iElem * numberDofsPerElement * - numberQuadraturePoints + - iNode * numberQuadraturePoints + - q_point] = shape_grad[2]; - d_shapeFunctionGradientValueZTransposed - [iElem * numberQuadraturePoints * numberDofsPerElement + - q_point * numberDofsPerElement + iNode] = shape_grad[2]; - } - if (iElem == 0) { fe_values_lpsp.reinit(cellPtr); @@ -512,21 +451,6 @@ kohnShamDFTOperatorDeviceClass:: d_shapeFunctionValueTransposedDevice.copyFrom( d_shapeFunctionValueTransposed); - d_shapeFunctionGradientValueXTransposedDevice.resize( - d_shapeFunctionGradientValueXTransposed.size()); - d_shapeFunctionGradientValueXTransposedDevice.copyFrom( - d_shapeFunctionGradientValueXTransposed); - - d_shapeFunctionGradientValueYTransposedDevice.resize( - d_shapeFunctionGradientValueYTransposed.size()); - d_shapeFunctionGradientValueYTransposedDevice.copyFrom( - d_shapeFunctionGradientValueYTransposed); - - d_shapeFunctionGradientValueZTransposedDevice.resize( - d_shapeFunctionGradientValueZTransposed.size()); - d_shapeFunctionGradientValueZTransposedDevice.copyFrom( - d_shapeFunctionGradientValueZTransposed); - d_shapeFunctionValueLpspDevice.resize(shapeFunctionValueLpsp.size()); d_shapeFunctionValueLpspDevice.copyFrom(shapeFunctionValueLpsp); diff --git a/src/force/configurationalForceCompute/configurationalForceEEshelbyFPSPFnlLinFE.cc b/src/force/configurationalForceCompute/configurationalForceEEshelbyFPSPFnlLinFE.cc index 165b5026a..3cf5db1f2 100644 --- a/src/force/configurationalForceCompute/configurationalForceEEshelbyFPSPFnlLinFE.cc +++ 
b/src/force/configurationalForceCompute/configurationalForceEEshelbyFPSPFnlLinFE.cc @@ -173,7 +173,7 @@ namespace dftfe const unsigned int localVectorSize = - dftPtr->d_eigenVectorsFlattenedSTL[0].size() / numEigenVectors; + matrixFreeData.get_vector_partitioner()->locally_owned_size(); const unsigned int numMacroCells = matrixFreeData.n_cell_batches(); @@ -250,6 +250,7 @@ namespace dftfe double device_time = MPI_Wtime(); forceDevice::wfcContractionsForceKernelsAllH( + dftPtr->basisOperationsPtrDevice, kohnShamDFTEigenOperatorDevice, dftPtr->d_eigenVectorsFlattenedDevice.begin(), d_dftParams.spinPolarized, @@ -296,7 +297,7 @@ namespace dftfe force::wfcContractionsForceKernelsAllH( kohnShamDFTEigenOperator, - dftPtr->d_eigenVectorsFlattenedSTL, + dftPtr->d_eigenVectorsFlattenedHost.begin(), d_dftParams.spinPolarized, spinIndex, dftPtr->eigenValues, diff --git a/src/force/configurationalStressCompute/computeStressEEshelbyEPSPEnlEk.cc b/src/force/configurationalStressCompute/computeStressEEshelbyEPSPEnlEk.cc index 9c6003ec0..10c40a236 100644 --- a/src/force/configurationalStressCompute/computeStressEEshelbyEPSPEnlEk.cc +++ b/src/force/configurationalStressCompute/computeStressEEshelbyEPSPEnlEk.cc @@ -163,7 +163,7 @@ namespace dftfe bandGroupLowHighPlusOneIndices[1]); const unsigned int localVectorSize = - dftPtr->d_eigenVectorsFlattenedSTL[0].size() / numEigenVectors; + matrixFreeData.get_vector_partitioner()->locally_owned_size(); std::vector>> eigenVectors( dftPtr->d_kPointWeights.size()); std::vector> @@ -244,6 +244,7 @@ namespace dftfe double device_time = MPI_Wtime(); forceDevice::wfcContractionsForceKernelsAllH( + dftPtr->basisOperationsPtrDevice, kohnShamDFTEigenOperatorDevice, dftPtr->d_eigenVectorsFlattenedDevice.begin(), d_dftParams.spinPolarized, @@ -289,7 +290,7 @@ namespace dftfe force::wfcContractionsForceKernelsAllH( kohnShamDFTEigenOperator, - dftPtr->d_eigenVectorsFlattenedSTL, + dftPtr->d_eigenVectorsFlattenedHost.begin(), d_dftParams.spinPolarized, 
spinIndex, dftPtr->eigenValues, diff --git a/src/force/forceWfcContractions.cc b/src/force/forceWfcContractions.cc index fbc8a8628..e9b2e9771 100644 --- a/src/force/forceWfcContractions.cc +++ b/src/force/forceWfcContractions.cc @@ -607,7 +607,7 @@ namespace dftfe operatorDFTClass & operatorMatrix, distributedCPUMultiVec &flattenedArrayBlock, distributedCPUMultiVec &projectorKetTimesVector, - const std::vector & X, + const dataTypes::number * X, const std::vector & eigenValues, const std::vector & partialOccupancies, const std::vector & kcoord, @@ -725,14 +725,14 @@ namespace dftfe void wfcContractionsForceKernelsAllH( - operatorDFTClass & operatorMatrix, - const std::vector> &X, - const unsigned int spinPolarizedFlag, - const unsigned int spinIndex, - const std::vector> & eigenValuesH, - const std::vector> & partialOccupanciesH, - const std::vector & kPointCoordinates, - const unsigned int *nonTrivialIdToElemIdMapH, + operatorDFTClass & operatorMatrix, + const dataTypes::number * X, + const unsigned int spinPolarizedFlag, + const unsigned int spinIndex, + const std::vector> &eigenValuesH, + const std::vector> &partialOccupanciesH, + const std::vector & kPointCoordinates, + const unsigned int * nonTrivialIdToElemIdMapH, const unsigned int *projecterKetTimesFlattenedVectorLocalIdsH, const unsigned int MLoc, const unsigned int N, @@ -894,7 +894,8 @@ namespace dftfe operatorMatrix, flattenedArrayBlock, projectorKetTimesVector, - X[(1 + spinPolarizedFlag) * kPoint + spinIndex], + X + + ((1 + spinPolarizedFlag) * kPoint + spinIndex) * MLoc * N, blockedEigenValues, blockedPartialOccupancies, kcoord, diff --git a/src/force/forceWfcContractionsDevice.cc b/src/force/forceWfcContractionsDevice.cc index 985eb75c8..12a942a7c 100644 --- a/src/force/forceWfcContractionsDevice.cc +++ b/src/force/forceWfcContractionsDevice.cc @@ -41,9 +41,7 @@ namespace dftfe const unsigned int numContiguousBlocks, const unsigned int numQuads, const double * psiQuadValues, - const double * 
gradPsiQuadValuesX, - const double * gradPsiQuadValuesY, - const double * gradPsiQuadValuesZ, + const double * gradPsiQuadValues, const double * eigenValues, const double * partialOccupancies, double * eshelbyTensor) @@ -66,10 +64,16 @@ namespace dftfe const unsigned int tempIndex = (cellIndex)*numQuads * contiguousBlockSize + quadId * contiguousBlockSize + intraBlockIndex; - const double psi = psiQuadValues[tempIndex]; - const double gradPsiX = gradPsiQuadValuesX[tempIndex]; - const double gradPsiY = gradPsiQuadValuesY[tempIndex]; - const double gradPsiZ = gradPsiQuadValuesZ[tempIndex]; + const unsigned int tempIndex2 = + (cellIndex)*numQuads * contiguousBlockSize * 3 + + quadId * contiguousBlockSize + intraBlockIndex; + const double psi = psiQuadValues[tempIndex]; + const double gradPsiX = gradPsiQuadValues[tempIndex2]; + const double gradPsiY = + gradPsiQuadValues[tempIndex2 + numQuads * contiguousBlockSize]; + const double gradPsiZ = + gradPsiQuadValues[tempIndex2 + + 2 * numQuads * contiguousBlockSize]; const double eigenValue = eigenValues[intraBlockIndex]; const double partOcc = partialOccupancies[intraBlockIndex]; @@ -109,9 +113,7 @@ namespace dftfe const unsigned int numContiguousBlocks, const unsigned int numQuads, const dftfe::utils::deviceDoubleComplex *psiQuadValues, - const dftfe::utils::deviceDoubleComplex *gradPsiQuadValuesX, - const dftfe::utils::deviceDoubleComplex *gradPsiQuadValuesY, - const dftfe::utils::deviceDoubleComplex *gradPsiQuadValuesZ, + const dftfe::utils::deviceDoubleComplex *gradPsiQuadValues, const double * eigenValues, const double * partialOccupancies, const double kcoordx, @@ -138,22 +140,29 @@ namespace dftfe const unsigned int tempIndex = (cellIndex)*numQuads * contiguousBlockSize + quadId * contiguousBlockSize + intraBlockIndex; + const unsigned int tempIndex2 = + (cellIndex)*numQuads * contiguousBlockSize * 3 + + quadId * contiguousBlockSize + intraBlockIndex; const dftfe::utils::deviceDoubleComplex psi = 
psiQuadValues[tempIndex]; const dftfe::utils::deviceDoubleComplex psiConj = dftfe::utils::conj(psiQuadValues[tempIndex]); const dftfe::utils::deviceDoubleComplex gradPsiX = - gradPsiQuadValuesX[tempIndex]; + gradPsiQuadValues[tempIndex2]; const dftfe::utils::deviceDoubleComplex gradPsiY = - gradPsiQuadValuesY[tempIndex]; + gradPsiQuadValues[tempIndex2 + numQuads * contiguousBlockSize]; const dftfe::utils::deviceDoubleComplex gradPsiZ = - gradPsiQuadValuesZ[tempIndex]; + gradPsiQuadValues[tempIndex2 + + 2 * numQuads * contiguousBlockSize]; const dftfe::utils::deviceDoubleComplex gradPsiXConj = - dftfe::utils::conj(gradPsiQuadValuesX[tempIndex]); + dftfe::utils::conj(gradPsiQuadValues[tempIndex2]); const dftfe::utils::deviceDoubleComplex gradPsiYConj = - dftfe::utils::conj(gradPsiQuadValuesY[tempIndex]); + dftfe::utils::conj( + gradPsiQuadValues[tempIndex2 + numQuads * contiguousBlockSize]); const dftfe::utils::deviceDoubleComplex gradPsiZConj = - dftfe::utils::conj(gradPsiQuadValuesZ[tempIndex]); + dftfe::utils::conj( + gradPsiQuadValues[tempIndex2 + + 2 * numQuads * contiguousBlockSize]); const double eigenValue = eigenValues[intraBlockIndex]; const double partOcc = partialOccupancies[intraBlockIndex]; @@ -410,6 +419,11 @@ namespace dftfe void interpolatePsiComputeELocWfcEshelbyTensorD( + std::shared_ptr< + dftfe::basis::FEBasisOperations> + & basisOperationsPtr, operatorDFTDeviceClass & operatorMatrix, distributedDeviceVec &Xb, const unsigned int BVec, @@ -437,13 +451,7 @@ namespace dftfe &psiQuadsFlatD, dftfe::utils::MemoryStorage - &gradPsiQuadsXFlatD, - dftfe::utils::MemoryStorage - &gradPsiQuadsYFlatD, - dftfe::utils::MemoryStorage - &gradPsiQuadsZFlatD, + &gradPsiQuadsFlatD, #ifdef USE_COMPLEX dftfe::utils::MemoryStorage @@ -463,56 +471,24 @@ namespace dftfe dftfe::utils::MemoryStorage &cellWaveFunctionMatrix = operatorMatrix.getCellWaveFunctionMatrix(); - - dftfe::utils::deviceKernelsGeneric::stridedCopyToBlock( - BVec, - numCells * numNodesPerElement, - 
Xb.begin(), - cellWaveFunctionMatrix.begin(), - (operatorMatrix.getFlattenedArrayCellLocalProcIndexIdMap()).begin()); + dftfe::basis::UpdateFlags updateFlags = + dftfe::basis::update_values | dftfe::basis::update_gradients; + basisOperationsPtr->reinit(BVec, cellsBlockSize, 0); const int blockSize = cellsBlockSize; const int numberBlocks = numCells / blockSize; const int remBlockSize = numCells - numberBlocks * blockSize; - dftfe::utils::MemoryStorage - shapeFunctionValuesReferenceD(numQuads * numNodesPerElement, - dataTypes::number(0.0)); dftfe::utils::MemoryStorage shapeFunctionValuesNLPReferenceD(numQuadsNLP * numNodesPerElement, dataTypes::number(0.0)); - dftfe::utils::deviceKernelsGeneric::copyValueType1ArrToValueType2Arr( - numQuads * numNodesPerElement, - (operatorMatrix.getShapeFunctionValuesTransposed()).begin(), - shapeFunctionValuesReferenceD.begin()); - - dftfe::utils::deviceKernelsGeneric::copyValueType1ArrToValueType2Arr( numQuadsNLP * numNodesPerElement, (operatorMatrix.getShapeFunctionValuesNLPTransposed()).begin(), shapeFunctionValuesNLPReferenceD.begin()); - dftfe::utils::MemoryStorage - shapeFunctionGradientValuesXTransposedDevice(blockSize * numQuads * - numNodesPerElement, - dataTypes::number(0.0)); - - dftfe::utils::MemoryStorage - shapeFunctionGradientValuesYTransposedDevice(blockSize * numQuads * - numNodesPerElement, - dataTypes::number(0.0)); - - dftfe::utils::MemoryStorage - shapeFunctionGradientValuesZTransposedDevice(blockSize * numQuads * - numNodesPerElement, - dataTypes::number(0.0)); - dftfe::utils::MemoryStorage shapeFunctionGradientValuesNLPReferenceD(blockSize * numQuadsNLP * 3 * numNodesPerElement, @@ -535,6 +511,10 @@ namespace dftfe 0, i * numQuadsNLP * 3 * numNodesPerElement); + basisOperationsPtr->extractToCellNodalDataKernel( + Xb, + cellWaveFunctionMatrix.data(), + std::pair(0, numCells)); for (int iblock = 0; iblock < (numberBlocks + 1); iblock++) @@ -558,120 +538,13 @@ namespace dftfe if (!isFloatingChargeForces) { - 
dftfe::utils::deviceBlasWrapper::gemmStridedBatched( - operatorMatrix.getDeviceBlasHandle(), - dftfe::utils::DEVICEBLAS_OP_N, - dftfe::utils::DEVICEBLAS_OP_N, - BVec, - numQuads, - numNodesPerElement, - &scalarCoeffAlpha, - cellWaveFunctionMatrix.begin() + - startingId * numNodesPerElement * BVec, - BVec, - strideA, - shapeFunctionValuesReferenceD.begin(), - numNodesPerElement, - strideB, - &scalarCoeffBeta, - psiQuadsFlatD.begin(), - BVec, - strideC, - currentBlockSize); - - strideB = numNodesPerElement * numQuads; - - dftfe::utils::deviceKernelsGeneric:: - copyValueType1ArrToValueType2Arr( - currentBlockSize * numQuads * numNodesPerElement, - (operatorMatrix - .getShapeFunctionGradientValuesXTransposed()) - .begin() + - startingId * numQuads * numNodesPerElement, - shapeFunctionGradientValuesXTransposedDevice.begin()); - - dftfe::utils::deviceBlasWrapper::gemmStridedBatched( - operatorMatrix.getDeviceBlasHandle(), - dftfe::utils::DEVICEBLAS_OP_N, - dftfe::utils::DEVICEBLAS_OP_N, - BVec, - numQuads, - numNodesPerElement, - &scalarCoeffAlpha, - cellWaveFunctionMatrix.begin() + + basisOperationsPtr->interpolateKernel( + cellWaveFunctionMatrix.data() + startingId * numNodesPerElement * BVec, - BVec, - strideA, - shapeFunctionGradientValuesXTransposedDevice.begin(), - numNodesPerElement, - strideB, - &scalarCoeffBeta, - gradPsiQuadsXFlatD.begin(), - BVec, - strideC, - currentBlockSize); - - - dftfe::utils::deviceKernelsGeneric:: - copyValueType1ArrToValueType2Arr( - currentBlockSize * numQuads * numNodesPerElement, - (operatorMatrix - .getShapeFunctionGradientValuesYTransposed()) - .begin() + - startingId * numQuads * numNodesPerElement, - shapeFunctionGradientValuesYTransposedDevice.begin()); - - dftfe::utils::deviceBlasWrapper::gemmStridedBatched( - operatorMatrix.getDeviceBlasHandle(), - dftfe::utils::DEVICEBLAS_OP_N, - dftfe::utils::DEVICEBLAS_OP_N, - BVec, - numQuads, - numNodesPerElement, - &scalarCoeffAlpha, - cellWaveFunctionMatrix.begin() + - startingId * 
numNodesPerElement * BVec, - BVec, - strideA, - shapeFunctionGradientValuesYTransposedDevice.begin(), - numNodesPerElement, - strideB, - &scalarCoeffBeta, - gradPsiQuadsYFlatD.begin(), - BVec, - strideC, - currentBlockSize); - - dftfe::utils::deviceKernelsGeneric:: - copyValueType1ArrToValueType2Arr( - currentBlockSize * numQuads * numNodesPerElement, - (operatorMatrix - .getShapeFunctionGradientValuesZTransposed()) - .begin() + - startingId * numQuads * numNodesPerElement, - shapeFunctionGradientValuesZTransposedDevice.begin()); - - dftfe::utils::deviceBlasWrapper::gemmStridedBatched( - operatorMatrix.getDeviceBlasHandle(), - dftfe::utils::DEVICEBLAS_OP_N, - dftfe::utils::DEVICEBLAS_OP_N, - BVec, - numQuads, - numNodesPerElement, - &scalarCoeffAlpha, - cellWaveFunctionMatrix.begin() + - startingId * numNodesPerElement * BVec, - BVec, - strideA, - shapeFunctionGradientValuesZTransposedDevice.begin(), - numNodesPerElement, - strideB, - &scalarCoeffBeta, - gradPsiQuadsZFlatD.begin(), - BVec, - strideC, - currentBlockSize); - + psiQuadsFlatD.data(), + gradPsiQuadsFlatD.begin(), + std::pair( + startingId, startingId + currentBlockSize)); #ifdef DFTFE_WITH_DEVICE_LANG_CUDA computeELocWfcEshelbyTensorContributions<<< (BVec + (dftfe::utils::DEVICE_BLOCK_SIZE - 1)) / @@ -684,11 +557,7 @@ namespace dftfe dftfe::utils::makeDataTypeDeviceCompatible( psiQuadsFlatD.begin()), dftfe::utils::makeDataTypeDeviceCompatible( - gradPsiQuadsXFlatD.begin()), - dftfe::utils::makeDataTypeDeviceCompatible( - gradPsiQuadsYFlatD.begin()), - dftfe::utils::makeDataTypeDeviceCompatible( - gradPsiQuadsZFlatD.begin()), + gradPsiQuadsFlatD.begin()), eigenValuesD.begin(), partialOccupanciesD.begin(), # ifdef USE_COMPLEX @@ -717,11 +586,7 @@ namespace dftfe dftfe::utils::makeDataTypeDeviceCompatible( psiQuadsFlatD.begin()), dftfe::utils::makeDataTypeDeviceCompatible( - gradPsiQuadsXFlatD.begin()), - dftfe::utils::makeDataTypeDeviceCompatible( - gradPsiQuadsYFlatD.begin()), - 
dftfe::utils::makeDataTypeDeviceCompatible( - gradPsiQuadsZFlatD.begin()), + gradPsiQuadsFlatD.begin()), eigenValuesD.begin(), partialOccupanciesD.begin(), # ifdef USE_COMPLEX @@ -1069,6 +934,11 @@ namespace dftfe void devicePortedForceKernelsAllD( + std::shared_ptr< + dftfe::basis::FEBasisOperations> + & basisOperationsPtr, operatorDFTDeviceClass & operatorMatrix, distributedDeviceVec &deviceFlattenedArrayBlock, distributedDeviceVec &projectorKetTimesVectorD, @@ -1109,13 +979,7 @@ namespace dftfe &psiQuadsFlatD, dftfe::utils::MemoryStorage - &gradPsiQuadsXFlatD, - dftfe::utils::MemoryStorage - &gradPsiQuadsYFlatD, - dftfe::utils::MemoryStorage - &gradPsiQuadsZFlatD, + &gradPsiQuadsFlatD, #ifdef USE_COMPLEX dftfe::utils::MemoryStorage @@ -1154,11 +1018,13 @@ namespace dftfe // int this_process; // MPI_Comm_rank(d_mpiCommParent, &this_process); - const unsigned int M = operatorMatrix.getMatrixFreeData() - ->get_vector_partitioner() - ->local_size(); dftfe::utils::deviceKernelsGeneric::stridedCopyToBlockConstantStride( - numPsi, N, M, startingVecId, X, deviceFlattenedArrayBlock.begin()); + numPsi, + N, + basisOperationsPtr->nOwnedDofs(), + startingVecId, + X, + deviceFlattenedArrayBlock.begin()); deviceFlattenedArrayBlock.updateGhostValues(); (operatorMatrix.getOverloadedConstraintMatrix()) @@ -1169,7 +1035,8 @@ namespace dftfe // MPI_Barrier(d_mpiCommParent); // double kernel1_time = MPI_Wtime(); - interpolatePsiComputeELocWfcEshelbyTensorD(operatorMatrix, + interpolatePsiComputeELocWfcEshelbyTensorD(basisOperationsPtr, + operatorMatrix, deviceFlattenedArrayBlock, numPsi, numCells, @@ -1186,9 +1053,7 @@ namespace dftfe onesVecD, cellsBlockSize, psiQuadsFlatD, - gradPsiQuadsXFlatD, - gradPsiQuadsYFlatD, - gradPsiQuadsZFlatD, + gradPsiQuadsFlatD, #ifdef USE_COMPLEX psiQuadsNLPD, #endif @@ -1273,6 +1138,11 @@ namespace dftfe void wfcContractionsForceKernelsAllH( + std::shared_ptr< + dftfe::basis::FEBasisOperations> + & basisOperationsPtr, operatorDFTDeviceClass & 
operatorMatrix, const dataTypes::number * X, const unsigned int spinPolarizedFlag, @@ -1361,16 +1231,8 @@ namespace dftfe dataTypes::number(0.0)); dftfe::utils::MemoryStorage - gradPsiQuadsXFlatD(cellsBlockSize * numQuads * blockSize, - dataTypes::number(0.0)); - dftfe::utils::MemoryStorage - gradPsiQuadsYFlatD(cellsBlockSize * numQuads * blockSize, - dataTypes::number(0.0)); - dftfe::utils::MemoryStorage - gradPsiQuadsZFlatD(cellsBlockSize * numQuads * blockSize, - dataTypes::number(0.0)); + gradPsiQuadsFlatD(cellsBlockSize * numQuads * blockSize * 3, + dataTypes::number(0.0)); #ifdef USE_COMPLEX dftfe::utils::MemoryStorage @@ -1501,6 +1363,7 @@ namespace dftfe // double kernel_time = MPI_Wtime(); devicePortedForceKernelsAllD( + basisOperationsPtr, operatorMatrix, deviceFlattenedArrayBlock, projectorKetTimesVectorD, @@ -1526,9 +1389,7 @@ namespace dftfe numNodesPerElement, totalNonTrivialPseudoWfcs, psiQuadsFlatD, - gradPsiQuadsXFlatD, - gradPsiQuadsYFlatD, - gradPsiQuadsZFlatD, + gradPsiQuadsFlatD, #ifdef USE_COMPLEX psiQuadsNLPD, #endif diff --git a/src/helmholtz/kerkerSolverProblemDevice.cc b/src/helmholtz/kerkerSolverProblemDevice.cc index 2a2afc5d4..84712f276 100644 --- a/src/helmholtz/kerkerSolverProblemDevice.cc +++ b/src/helmholtz/kerkerSolverProblemDevice.cc @@ -103,11 +103,6 @@ namespace dftfe d_matrixFreeDataPRefinedPtr->get_vector_partitioner( d_matrixFreeVectorComponent), *d_constraintMatrixPRefinedPtr); - d_constraintsTotalPotentialInfo.precomputeMaps( - d_matrixFreeDataPRefinedPtr->get_vector_partitioner( - d_matrixFreeVectorComponent), - d_xPtr->get_partitioner(), - 1); } diff --git a/src/linAlg/linearAlgebraOperationsOpt.cc b/src/linAlg/linearAlgebraOperationsOpt.cc index 8eac264b7..4c5766e18 100644 --- a/src/linAlg/linearAlgebraOperationsOpt.cc +++ b/src/linAlg/linearAlgebraOperationsOpt.cc @@ -516,12 +516,12 @@ namespace dftfe template void - gramSchmidtOrthogonalization(std::vector & X, + gramSchmidtOrthogonalization(T * X, const unsigned int 
numberVectors, + const unsigned int localVectorSize, const MPI_Comm & mpiComm) { #ifdef USE_PETSC - const unsigned int localVectorSize = X.size() / numberVectors; // // Create template PETSc vector to create BV object later @@ -614,8 +614,9 @@ namespace dftfe void rayleighRitzGEP(operatorDFTClass & operatorMatrix, elpaScalaManager & elpaScala, - std::vector & X, + T * X, const unsigned int numberWaveFunctions, + const unsigned int localVectorSize, const MPI_Comm & mpiCommParent, const MPI_Comm & interBandGroupComm, const MPI_Comm & mpi_communicator, @@ -660,8 +661,9 @@ namespace dftfe // SConj=X^{T}*XConj. if (!(dftParams.useMixedPrecCGS_O && useMixedPrec)) { - internal::fillParallelOverlapMatrix(&X[0], - X.size(), + internal::fillParallelOverlapMatrix(X, + numberWaveFunctions * + localVectorSize, numberWaveFunctions, processGrid, interBandGroupComm, @@ -674,8 +676,8 @@ namespace dftfe if (std::is_same>::value) internal::fillParallelOverlapMatrixMixedPrec>( - &X[0], - X.size(), + X, + numberWaveFunctions * localVectorSize, numberWaveFunctions, processGrid, interBandGroupComm, @@ -684,8 +686,8 @@ namespace dftfe dftParams); else internal::fillParallelOverlapMatrixMixedPrec( - &X[0], - X.size(), + X, + numberWaveFunctions * localVectorSize, numberWaveFunctions, processGrid, interBandGroupComm, @@ -791,7 +793,8 @@ namespace dftfe T(0.0)); - operatorMatrix.XtHX(X, numberWaveFunctions, processGrid, projHamPar); + operatorMatrix.XtHX( + X, numberWaveFunctions, localVectorSize, processGrid, projHamPar); computing_timer.leave_subsection("Compute ProjHam, RR step"); computing_timer.enter_subsection( @@ -918,8 +921,8 @@ namespace dftfe projHamParCopy.mmult(projHamPar, LMatPar); if (!(dftParams.useMixedPrecSubspaceRotRR && useMixedPrec)) - internal::subspaceRotation(&X[0], - X.size(), + internal::subspaceRotation(X, + numberWaveFunctions * localVectorSize, numberWaveFunctions, processGrid, interBandGroupComm, @@ -933,8 +936,8 @@ namespace dftfe { if (std::is_same>::value) 
internal::subspaceRotationMixedPrec>( - &X[0], - X.size(), + X, + numberWaveFunctions * localVectorSize, numberWaveFunctions, processGrid, interBandGroupComm, @@ -944,8 +947,9 @@ namespace dftfe false, false); else - internal::subspaceRotationMixedPrec(&X[0], - X.size(), + internal::subspaceRotationMixedPrec(X, + numberWaveFunctions * + localVectorSize, numberWaveFunctions, processGrid, interBandGroupComm, @@ -968,8 +972,9 @@ namespace dftfe void rayleighRitz(operatorDFTClass & operatorMatrix, elpaScalaManager & elpaScala, - std::vector & X, + T * X, const unsigned int numberWaveFunctions, + const unsigned int localVectorSize, const MPI_Comm & mpiCommParent, const MPI_Comm & interBandGroupComm, const MPI_Comm & mpi_communicator, @@ -1006,7 +1011,8 @@ namespace dftfe T(0.0)); computing_timer.enter_subsection("Blocked XtHX, RR step"); - operatorMatrix.XtHX(X, numberWaveFunctions, processGrid, projHamPar); + operatorMatrix.XtHX( + X, numberWaveFunctions, localVectorSize, processGrid, projHamPar); computing_timer.leave_subsection("Blocked XtHX, RR step"); // @@ -1116,8 +1122,8 @@ namespace dftfe processGrid, rowsBlockSize); projHamParCopy.copy_conjugate_transposed(projHamPar); - internal::subspaceRotation(&X[0], - X.size(), + internal::subspaceRotation(X, + numberWaveFunctions * localVectorSize, numberWaveFunctions, processGrid, interBandGroupComm, @@ -1135,9 +1141,10 @@ namespace dftfe void rayleighRitzGEPSpectrumSplitDirect(operatorDFTClass & operatorMatrix, elpaScalaManager & elpaScala, - std::vector & X, - std::vector & Y, + T * X, + T * Y, const unsigned int numberWaveFunctions, + const unsigned int localVectorSize, const unsigned int numberCoreStates, const MPI_Comm & mpiCommParent, const MPI_Comm & interBandGroupComm, @@ -1183,8 +1190,9 @@ namespace dftfe // SConj=X^{T}*XConj if (!(dftParams.useMixedPrecCGS_O && useMixedPrec)) { - internal::fillParallelOverlapMatrix(&X[0], - X.size(), + internal::fillParallelOverlapMatrix(X, + numberWaveFunctions * + 
localVectorSize, numberWaveFunctions, processGrid, interBandGroupComm, @@ -1197,8 +1205,8 @@ namespace dftfe if (std::is_same>::value) internal::fillParallelOverlapMatrixMixedPrec>( - &X[0], - X.size(), + X, + numberWaveFunctions * localVectorSize, numberWaveFunctions, processGrid, interBandGroupComm, @@ -1207,8 +1215,8 @@ namespace dftfe dftParams); else internal::fillParallelOverlapMatrixMixedPrec( - &X[0], - X.size(), + X, + numberWaveFunctions * localVectorSize, numberWaveFunctions, processGrid, interBandGroupComm, @@ -1319,12 +1327,17 @@ namespace dftfe if (useMixedPrec && dftParams.useMixedPrecXTHXSpectrumSplit) { - operatorMatrix.XtHXMixedPrec( - X, numberWaveFunctions, numberCoreStates, processGrid, projHamPar); + operatorMatrix.XtHXMixedPrec(X, + numberWaveFunctions, + numberCoreStates, + localVectorSize, + processGrid, + projHamPar); } else { - operatorMatrix.XtHX(X, numberWaveFunctions, processGrid, projHamPar); + operatorMatrix.XtHX( + X, numberWaveFunctions, localVectorSize, processGrid, projHamPar); } @@ -1497,9 +1510,10 @@ namespace dftfe computing_timer.enter_subsection( "Xfr^{T}={QfrConjPrime}^{C}*LConj^{-1}*X^{T}, RR step"); - internal::subspaceRotationSpectrumSplit(&X[0], - &Y[0], - X.size(), + internal::subspaceRotationSpectrumSplit(X, + Y, + numberWaveFunctions * + localVectorSize, numberWaveFunctions, processGrid, numberWaveFunctions - @@ -1517,8 +1531,8 @@ namespace dftfe if (!(dftParams.useMixedPrecCGS_SR && useMixedPrec)) { computing_timer.enter_subsection("X^{T}=Lconj^{-1}*X^{T}, RR step"); - internal::subspaceRotation(&X[0], - X.size(), + internal::subspaceRotation(X, + numberWaveFunctions * localVectorSize, numberWaveFunctions, processGrid, interBandGroupComm, @@ -1536,8 +1550,8 @@ namespace dftfe "X^{T}=Lconj^{-1}*X^{T} mixed prec, RR step"); if (std::is_same>::value) internal::subspaceRotationCGSMixedPrec>( - &X[0], - X.size(), + X, + numberWaveFunctions * localVectorSize, numberWaveFunctions, processGrid, interBandGroupComm, @@ 
-1548,8 +1562,8 @@ namespace dftfe false); else internal::subspaceRotationCGSMixedPrec( - &X[0], - X.size(), + X, + numberWaveFunctions * localVectorSize, numberWaveFunctions, processGrid, interBandGroupComm, @@ -1566,18 +1580,19 @@ namespace dftfe template void - rayleighRitzSpectrumSplitDirect(operatorDFTClass & operatorMatrix, - elpaScalaManager & elpaScala, - const std::vector &X, - std::vector & Y, - const unsigned int numberWaveFunctions, - const unsigned int numberCoreStates, - const MPI_Comm & mpiCommParent, - const MPI_Comm & interBandGroupComm, - const MPI_Comm & mpi_communicator, - const bool useMixedPrec, - std::vector & eigenValues, - const dftParameters & dftParams) + rayleighRitzSpectrumSplitDirect(operatorDFTClass & operatorMatrix, + elpaScalaManager & elpaScala, + const T * X, + T * Y, + const unsigned int numberWaveFunctions, + const unsigned int localVectorSize, + const unsigned int numberCoreStates, + const MPI_Comm & mpiCommParent, + const MPI_Comm & interBandGroupComm, + const MPI_Comm & mpi_communicator, + const bool useMixedPrec, + std::vector &eigenValues, + const dftParameters &dftParams) { dealii::ConditionalOStream pcout( @@ -1611,15 +1626,20 @@ namespace dftfe if (useMixedPrec && dftParams.useMixedPrecXTHXSpectrumSplit) { computing_timer.enter_subsection("Blocked XtHX Mixed Prec, RR step"); - operatorMatrix.XtHXMixedPrec( - X, numberWaveFunctions, numberCoreStates, processGrid, projHamPar); + operatorMatrix.XtHXMixedPrec(X, + numberWaveFunctions, + numberCoreStates, + localVectorSize, + processGrid, + projHamPar); computing_timer.leave_subsection("Blocked XtHX Mixed Prec, RR step"); } else { computing_timer.enter_subsection("Blocked XtHX, RR step"); - operatorMatrix.XtHX(X, numberWaveFunctions, processGrid, projHamPar); + operatorMatrix.XtHX( + X, numberWaveFunctions, localVectorSize, processGrid, projHamPar); computing_timer.leave_subsection("Blocked XtHX, RR step"); } @@ -1766,9 +1786,10 @@ namespace dftfe 
computing_timer.enter_subsection("Blocked subspace rotation, RR step"); - internal::subspaceRotationSpectrumSplit(&X[0], - &Y[0], - X.size(), + internal::subspaceRotationSpectrumSplit(X, + Y, + numberWaveFunctions * + localVectorSize, numberWaveFunctions, processGrid, numberWaveFunctions - @@ -2373,8 +2394,10 @@ namespace dftfe template void computeEigenResidualNorm(operatorDFTClass & operatorMatrix, - std::vector & X, + T * X, const std::vector &eigenValues, + const unsigned int totalNumberVectors, + const unsigned int localVectorSize, const MPI_Comm & mpiCommParent, const MPI_Comm & mpiCommDomain, const MPI_Comm & interBandGroupComm, @@ -2385,8 +2408,6 @@ namespace dftfe // // get the number of eigenVectors // - const unsigned int totalNumberVectors = eigenValues.size(); - const unsigned int localVectorSize = X.size() / totalNumberVectors; std::vector residualNormSquare(totalNumberVectors, 0.0); // band group parallelization data structures @@ -3006,8 +3027,9 @@ namespace dftfe void densityMatrixEigenBasisFirstOrderResponse( operatorDFTClass & operatorMatrix, - std::vector & X, + T * X, const unsigned int N, + const unsigned int numberLocalDofs, const MPI_Comm & mpiCommParent, const MPI_Comm & mpiCommDomain, const MPI_Comm & interBandGroupComm, @@ -3050,10 +3072,11 @@ namespace dftfe if (dftParams.singlePrecLRD) { operatorMatrix.XtHXMixedPrec( - X, N, N, processGrid, projHamPrimePar, true); + X, N, N, numberLocalDofs, processGrid, projHamPrimePar, true); } else - operatorMatrix.XtHX(X, N, processGrid, projHamPrimePar, true); + operatorMatrix.XtHX( + X, N, numberLocalDofs, processGrid, projHamPrimePar, true); computing_timer.leave_subsection("Compute ProjHamPrime, DMFOR step"); @@ -3165,8 +3188,8 @@ namespace dftfe { if (std::is_same>::value) internal::subspaceRotationMixedPrec>( - &X[0], - X.size(), + X, + numberLocalDofs * N, N, processGrid, interBandGroupComm, @@ -3177,8 +3200,8 @@ namespace dftfe false); else internal::subspaceRotationMixedPrec( - &X[0], - 
X.size(), + X, + numberLocalDofs * N, N, processGrid, interBandGroupComm, @@ -3190,8 +3213,8 @@ namespace dftfe } else { - internal::subspaceRotation(&X[0], - X.size(), + internal::subspaceRotation(X, + numberLocalDofs * N, N, processGrid, interBandGroupComm, @@ -3237,14 +3260,16 @@ namespace dftfe template void - gramSchmidtOrthogonalization(std::vector &, + gramSchmidtOrthogonalization(dataTypes::number *, const unsigned int, + const unsigned int localVectorSize, const MPI_Comm &); template unsigned int pseudoGramSchmidtOrthogonalization(elpaScalaManager &elpaScala, - std::vector &, + dataTypes::number *, const unsigned int, + const unsigned int localVectorSize, const MPI_Comm &, const MPI_Comm &, const MPI_Comm & mpiComm, @@ -3254,8 +3279,9 @@ namespace dftfe template void rayleighRitz(operatorDFTClass &operatorMatrix, elpaScalaManager &elpaScala, - std::vector &, + dataTypes::number *, const unsigned int numberWaveFunctions, + const unsigned int localVectorSize, const MPI_Comm &, const MPI_Comm &, const MPI_Comm &, @@ -3266,8 +3292,9 @@ namespace dftfe template void rayleighRitzGEP(operatorDFTClass &operatorMatrix, elpaScalaManager &elpaScala, - std::vector &, + dataTypes::number *, const unsigned int numberWaveFunctions, + const unsigned int localVectorSize, const MPI_Comm &, const MPI_Comm &, const MPI_Comm &, @@ -3279,9 +3306,10 @@ namespace dftfe template void rayleighRitzSpectrumSplitDirect(operatorDFTClass &operatorMatrix, elpaScalaManager &elpaScala, - const std::vector &, - std::vector &, + const dataTypes::number *, + dataTypes::number *, const unsigned int numberWaveFunctions, + const unsigned int localVectorSize, const unsigned int numberCoreStates, const MPI_Comm &, const MPI_Comm &, @@ -3291,11 +3319,12 @@ namespace dftfe const dftParameters &dftParams); template void - rayleighRitzGEPSpectrumSplitDirect(operatorDFTClass &operatorMatrix, - elpaScalaManager &elpaScala, - std::vector &X, - std::vector &Y, + 
rayleighRitzGEPSpectrumSplitDirect(operatorDFTClass & operatorMatrix, + elpaScalaManager & elpaScala, + dataTypes::number * X, + dataTypes::number * Y, const unsigned int numberWaveFunctions, + const unsigned int localVectorSize, const unsigned int numberCoreStates, const MPI_Comm & mpiCommParent, const MPI_Comm & interBandGroupComm, @@ -3305,28 +3334,31 @@ namespace dftfe const dftParameters &dftParams); template void - computeEigenResidualNorm(operatorDFTClass & operatorMatrix, - std::vector &X, - const std::vector & eigenValues, - const MPI_Comm & mpiCommParent, - const MPI_Comm & mpiCommDomain, - const MPI_Comm & interBandGroupComm, - std::vector & residualNorm, - const dftParameters & dftParams); + computeEigenResidualNorm(operatorDFTClass & operatorMatrix, + dataTypes::number * X, + const std::vector &eigenValues, + const unsigned int totalNumberVectors, + const unsigned int localVectorSize, + const MPI_Comm & mpiCommParent, + const MPI_Comm & mpiCommDomain, + const MPI_Comm & interBandGroupComm, + std::vector & residualNorm, + const dftParameters & dftParams); template void densityMatrixEigenBasisFirstOrderResponse( - operatorDFTClass & operatorMatrix, - std::vector &X, - const unsigned int N, - const MPI_Comm & mpiCommParent, - const MPI_Comm & mpiCommDomain, - const MPI_Comm & interBandGroupComm, - const std::vector & eigenValues, - const double fermiEnergy, - std::vector & densityMatDerFermiEnergy, - elpaScalaManager & elpaScala, - const dftParameters & dftParams); + operatorDFTClass & operatorMatrix, + dataTypes::number * X, + const unsigned int N, + const unsigned int numberLocalDofs, + const MPI_Comm & mpiCommParent, + const MPI_Comm & mpiCommDomain, + const MPI_Comm & interBandGroupComm, + const std::vector &eigenValues, + const double fermiEnergy, + std::vector & densityMatDerFermiEnergy, + elpaScalaManager & elpaScala, + const dftParameters & dftParams); } // namespace linearAlgebraOperations diff --git a/src/linAlg/pseudoGS.cc 
b/src/linAlg/pseudoGS.cc index 1703e083c..8eb1898a4 100644 --- a/src/linAlg/pseudoGS.cc +++ b/src/linAlg/pseudoGS.cc @@ -29,8 +29,9 @@ namespace dftfe template unsigned int pseudoGramSchmidtOrthogonalization(elpaScalaManager & elpaScala, - std::vector & X, + T * X, const unsigned int numberVectors, + const unsigned int numLocalDofs, const MPI_Comm & mpiCommParent, const MPI_Comm & interBandGroupComm, const MPI_Comm & mpiComm, @@ -38,8 +39,6 @@ namespace dftfe const dftParameters &dftParams) { - const unsigned int numLocalDofs = X.size() / numberVectors; - dealii::ConditionalOStream pcout( std::cout, (dealii::Utilities::MPI::this_mpi_process(mpiCommParent) == 0)); @@ -74,8 +73,8 @@ namespace dftfe if (!(dftParams.useMixedPrecCGS_O && useMixedPrec)) { computing_timer.enter_subsection("Fill overlap matrix CGS"); - internal::fillParallelOverlapMatrix(&X[0], - X.size(), + internal::fillParallelOverlapMatrix(X, + numberVectors * numLocalDofs, numberVectors, processGrid, interBandGroupComm, @@ -91,8 +90,8 @@ namespace dftfe if (std::is_same>::value) internal::fillParallelOverlapMatrixMixedPrec>( - &X[0], - X.size(), + X, + numberVectors * numLocalDofs, numberVectors, processGrid, interBandGroupComm, @@ -101,8 +100,8 @@ namespace dftfe dftParams); else internal::fillParallelOverlapMatrixMixedPrec( - &X[0], - X.size(), + X, + numberVectors * numLocalDofs, numberVectors, processGrid, interBandGroupComm, @@ -218,8 +217,8 @@ namespace dftfe if (!(dftParams.useMixedPrecCGS_SR && useMixedPrec)) { computing_timer.enter_subsection("Subspace rotation CGS"); - internal::subspaceRotation(&X[0], - X.size(), + internal::subspaceRotation(X, + numberVectors * numLocalDofs, numberVectors, processGrid, interBandGroupComm, @@ -235,8 +234,8 @@ namespace dftfe computing_timer.enter_subsection("Subspace rotation mixed prec CGS"); if (std::is_same>::value) internal::subspaceRotationCGSMixedPrec>( - &X[0], - X.size(), + X, + numberVectors * numLocalDofs, numberVectors, processGrid, 
interBandGroupComm, @@ -245,8 +244,9 @@ namespace dftfe dftParams, false); else - internal::subspaceRotationCGSMixedPrec(&X[0], - X.size(), + internal::subspaceRotationCGSMixedPrec(X, + numberVectors * + numLocalDofs, numberVectors, processGrid, interBandGroupComm, diff --git a/src/poisson/poissonSolverProblemDevice.cc b/src/poisson/poissonSolverProblemDevice.cc index 4881183dc..27f4ad642 100644 --- a/src/poisson/poissonSolverProblemDevice.cc +++ b/src/poisson/poissonSolverProblemDevice.cc @@ -785,10 +785,6 @@ namespace dftfe d_constraintsTotalPotentialInfo.initialize( d_matrixFreeDataPtr->get_vector_partitioner(d_matrixFreeVectorComponent), *d_constraintMatrixPtr); - d_constraintsTotalPotentialInfo.precomputeMaps( - d_matrixFreeDataPtr->get_vector_partitioner(d_matrixFreeVectorComponent), - d_xPtr->get_partitioner(), - 1); } diff --git a/src/solvers/eigenSolvers/chebyshevOrthogonalizedSubspaceIterationSolver.cc b/src/solvers/eigenSolvers/chebyshevOrthogonalizedSubspaceIterationSolver.cc index 11ac14c22..36ce39a4f 100644 --- a/src/solvers/eigenSolvers/chebyshevOrthogonalizedSubspaceIterationSolver.cc +++ b/src/solvers/eigenSolvers/chebyshevOrthogonalizedSubspaceIterationSolver.cc @@ -114,17 +114,18 @@ namespace dftfe // void chebyshevOrthogonalizedSubspaceIterationSolver::solve( - operatorDFTClass & operatorMatrix, - elpaScalaManager & elpaScala, - std::vector &eigenVectorsFlattened, - std::vector &eigenVectorsRotFracDensityFlattened, - const unsigned int totalNumberWaveFunctions, - std::vector & eigenValues, - std::vector & residualNorms, - const MPI_Comm & interBandGroupComm, - const bool computeResidual, - const bool useMixedPrec, - const bool isFirstScf) + operatorDFTClass & operatorMatrix, + elpaScalaManager & elpaScala, + dataTypes::number * eigenVectorsFlattened, + dataTypes::number * eigenVectorsRotFracDensityFlattened, + const unsigned int totalNumberWaveFunctions, + const unsigned int localVectorSize, + std::vector &eigenValues, + std::vector 
&residualNorms, + const MPI_Comm & interBandGroupComm, + const bool computeResidual, + const bool useMixedPrec, + const bool isFirstScf) { dealii::TimerOutput computingTimerStandard( operatorMatrix.getMPICommunicator(), @@ -185,8 +186,6 @@ namespace dftfe dftUtils::printCurrentMemoryUsage(operatorMatrix.getMPICommunicator(), "Before starting chebyshev filtering"); - const unsigned int localVectorSize = - eigenVectorsFlattened.size() / totalNumberWaveFunctions; // band group parallelization data structures @@ -244,9 +243,9 @@ namespace dftfe computing_timer.enter_subsection( "Copy from full to block flattened array"); for (unsigned int iNode = 0; iNode < localVectorSize; ++iNode) - std::copy(eigenVectorsFlattened.data() + + std::copy(eigenVectorsFlattened + iNode * totalNumberWaveFunctions + jvec, - eigenVectorsFlattened.data() + + eigenVectorsFlattened + iNode * totalNumberWaveFunctions + jvec + BVec, eigenVectorsFlattenedArrayBlock.data() + iNode * BVec); computing_timer.leave_subsection( @@ -321,7 +320,7 @@ namespace dftfe std::copy(eigenVectorsFlattenedArrayBlock.data() + iNode * BVec, eigenVectorsFlattenedArrayBlock.data() + (iNode + 1) * BVec, - eigenVectorsFlattened.data() + + eigenVectorsFlattened + iNode * totalNumberWaveFunctions + jvec); computing_timer.leave_subsection( @@ -358,9 +357,9 @@ namespace dftfe std::min(blockSize, totalNumberWaveFunctions * localVectorSize - i); MPI_Allreduce(MPI_IN_PLACE, - &eigenVectorsFlattened[0] + i, + eigenVectorsFlattened + i, currentBlockSize, - dataTypes::mpi_type_id(&eigenVectorsFlattened[0]), + dataTypes::mpi_type_id(eigenVectorsFlattened), MPI_SUM, interBandGroupComm); } @@ -446,6 +445,7 @@ namespace dftfe eigenVectorsFlattened, eigenVectorsRotFracDensityFlattened, totalNumberWaveFunctions, + localVectorSize, totalNumberWaveFunctions - eigenValues.size(), d_mpiCommParent, interBandGroupComm, @@ -461,6 +461,7 @@ namespace dftfe elpaScala, eigenVectorsFlattened, totalNumberWaveFunctions, + localVectorSize, 
d_mpiCommParent, interBandGroupComm, operatorMatrix.getMPICommunicator(), @@ -477,6 +478,8 @@ namespace dftfe operatorMatrix, eigenVectorsRotFracDensityFlattened, eigenValues, + eigenValues.size(), + localVectorSize, d_mpiCommParent, operatorMatrix.getMPICommunicator(), interBandGroupComm, @@ -489,6 +492,8 @@ namespace dftfe operatorMatrix, eigenVectorsFlattened, eigenValues, + totalNumberWaveFunctions, + localVectorSize, d_mpiCommParent, operatorMatrix.getMPICommunicator(), interBandGroupComm, @@ -503,6 +508,7 @@ namespace dftfe linearAlgebraOperations::gramSchmidtOrthogonalization( eigenVectorsFlattened, totalNumberWaveFunctions, + localVectorSize, operatorMatrix.getMPICommunicator()); computing_timer.leave_subsection("Gram-Schmidt Orthogn Opt"); @@ -519,6 +525,7 @@ namespace dftfe eigenVectorsFlattened, eigenVectorsRotFracDensityFlattened, totalNumberWaveFunctions, + localVectorSize, totalNumberWaveFunctions - eigenValues.size(), d_mpiCommParent, interBandGroupComm, @@ -534,6 +541,7 @@ namespace dftfe elpaScala, eigenVectorsFlattened, totalNumberWaveFunctions, + localVectorSize, d_mpiCommParent, interBandGroupComm, operatorMatrix.getMPICommunicator(), @@ -561,6 +569,8 @@ namespace dftfe operatorMatrix, eigenVectorsRotFracDensityFlattened, eigenValues, + eigenValues.size(), + localVectorSize, d_mpiCommParent, operatorMatrix.getMPICommunicator(), interBandGroupComm, @@ -572,6 +582,8 @@ namespace dftfe operatorMatrix, eigenVectorsFlattened, eigenValues, + totalNumberWaveFunctions, + localVectorSize, d_mpiCommParent, operatorMatrix.getMPICommunicator(), interBandGroupComm, diff --git a/src/symmetry/symmetrizeRho.cc b/src/symmetry/symmetrizeRho.cc index ecc579bee..5ee927dee 100644 --- a/src/symmetry/symmetrizeRho.cc +++ b/src/symmetry/symmetrizeRho.cc @@ -223,7 +223,7 @@ namespace dftfe dftPtr->d_kPointWeights.size()); const unsigned int localVectorSize = - dftPtr->d_eigenVectorsFlattenedSTL[0].size() / dftPtr->d_numEigenValues; + 
dftPtr->matrix_free_data.get_vector_partitioner()->locally_owned_size(); distributedCPUVec eigenVectorsFlattenedArrayFullBlock; vectorTools::createDealiiVector( @@ -231,11 +231,6 @@ namespace dftfe dftPtr->d_numEigenValues, eigenVectorsFlattenedArrayFullBlock); - dftPtr->constraintsNoneDataInfo.precomputeMaps( - dftPtr->matrix_free_data.get_vector_partitioner(), - eigenVectorsFlattenedArrayFullBlock.get_partitioner(), - dftPtr->d_numEigenValues); - for (unsigned int kPoint = 0; kPoint < (1 + dftPtr->getParametersObject().spinPolarized) * dftPtr->d_kPointWeights.size(); @@ -250,8 +245,9 @@ namespace dftfe ++iWave) eigenVectorsFlattenedArrayFullBlock.local_element( iNode * dftPtr->d_numEigenValues + iWave) = - dftPtr->d_eigenVectorsFlattenedSTL - [kPoint][iNode * dftPtr->d_numEigenValues + iWave]; + dftPtr->d_eigenVectorsFlattenedHost + [kPoint * localVectorSize * dftPtr->d_numEigenValues + + iNode * dftPtr->d_numEigenValues + iWave]; dftPtr->constraintsNoneDataInfo.distribute( eigenVectorsFlattenedArrayFullBlock, dftPtr->d_numEigenValues); diff --git a/testsGPU/pseudopotential/complex/jobscripts/frontierJobScript6GCDs6MPITasks.rc b/testsGPU/pseudopotential/complex/jobscripts/frontierJobScript6GCDs6MPITasks.rc new file mode 100644 index 000000000..aa420fa35 --- /dev/null +++ b/testsGPU/pseudopotential/complex/jobscripts/frontierJobScript6GCDs6MPITasks.rc @@ -0,0 +1,32 @@ +#!/ccs/home/dsambit/frontier/bin/rc +#SBATCH -A mat239 +#SBATCH -J gputests +#SBATCH -t 1:00:00 +#SBATCH -p batch +#SBATCH -N 1 +#SBATCH --gpus-per-node 6 +#SBATCH --ntasks-per-gpu 1 +#SBATCH --gpu-bind closest + +OMP_NUM_THREADS = 1 +MPICH_VERSION_DISPLAY=1 +MPICH_ENV_DISPLAY=1 +MPICH_OFI_NIC_POLICY = NUMA +MPICH_GPU_SUPPORT_ENABLED=1 +MPICH_SMP_SINGLE_COPY_MODE=NONE + +LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$INST/lib +LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$INST/lib/lib64 +LD_LIBRARY_PATH=$CRAY_LD_LIBRARY_PATH:$LD_LIBRARY_PATH + + +BASE = $WD/src/dftfeDebug/build/release/complex +n=`{echo $SLURM_JOB_NUM_NODES 
'*' 8 | bc} + +srun -n 6 -c 7 --gpu-bind closest $BASE/dftfe parameterFileMg2x_1.prm > outputMg2x_1 +srun -n 6 -c 7 --gpu-bind closest $BASE/dftfe parameterFileMg2x_2.prm > outputMg2x_2 +srun -n 6 -c 7 --gpu-bind closest $BASE/dftfe parameterFileMg2x_3.prm > outputMg2x_3 +srun -n 6 -c 7 --gpu-bind closest $BASE/dftfe parameterFileMg2x_4.prm > outputMg2x_4 +srun -n 6 -c 7 --gpu-bind closest $BASE/dftfe parameterFileMg2x_5.prm > outputMg2x_5 +srun -n 6 -c 7 --gpu-bind closest $BASE/dftfe parameterFileMg2x_6.prm > outputMg2x_6 +srun -n 6 -c 7 --gpu-bind closest $BASE/dftfe parameterFileBe.prm > outputBe diff --git a/testsGPU/pseudopotential/real/jobscripts/frontierJobScript18GCDs18MPITasks.rc b/testsGPU/pseudopotential/real/jobscripts/frontierJobScript18GCDs18MPITasks.rc new file mode 100644 index 000000000..0659588b2 --- /dev/null +++ b/testsGPU/pseudopotential/real/jobscripts/frontierJobScript18GCDs18MPITasks.rc @@ -0,0 +1,41 @@ +#!/ccs/home/dsambit/frontier/bin/rc +#SBATCH -A mat239 +#SBATCH -J gputests +#SBATCH -t 1:00:00 +#SBATCH -p batch +#SBATCH -N 3 +#SBATCH --gpus-per-node 6 +#SBATCH --ntasks-per-gpu 1 +#SBATCH --gpu-bind closest + +OMP_NUM_THREADS = 1 +MPICH_VERSION_DISPLAY=1 +MPICH_ENV_DISPLAY=1 +MPICH_OFI_NIC_POLICY = NUMA +MPICH_GPU_SUPPORT_ENABLED=1 +MPICH_SMP_SINGLE_COPY_MODE=NONE + +LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$INST/lib +LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$INST/lib/lib64 +LD_LIBRARY_PATH=$CRAY_LD_LIBRARY_PATH:$LD_LIBRARY_PATH + + +BASE = $WD/src/dftfeDebug/build/release/real +n=`{echo $SLURM_JOB_NUM_NODES '*' 8 | bc} + +srun -n 18 -c 7 --gpu-bind closest $BASE/dftfe Input_MD_0.prm > output_MD_0 +srun -n 18 -c 7 --gpu-bind closest $BASE/dftfe Input_MD_1.prm > output_MD_1 +srun -n 18 -c 7 --gpu-bind closest $BASE/dftfe Input_MD_2.prm > output_MD_2 +srun -n 18 -c 7 --gpu-bind closest $BASE/dftfe parameterFileMg2x_1.prm > outputMg2x_1 +srun -n 18 -c 7 --gpu-bind closest $BASE/dftfe parameterFileMg2x_1_spingpu.prm > outputMg2x_1_spin_gpu +srun -n 18 -c 7 
--gpu-bind closest $BASE/dftfe parameterFileMg2x_2.prm > outputMg2x_2 +srun -n 18 -c 7 --gpu-bind closest $BASE/dftfe parameterFileMg2x_3.prm > outputMg2x_3 +srun -n 18 -c 7 --gpu-bind closest $BASE/dftfe parameterFileMg2x_4.prm > outputMg2x_4 +srun -n 18 -c 7 --gpu-bind closest $BASE/dftfe parameterFileMg2x_5.prm > outputMg2x_5 +srun -n 18 -c 7 --gpu-bind closest $BASE/dftfe parameterFileMg2x_6.prm > outputMg2x_6 +srun -n 18 -c 7 --gpu-bind closest $BASE/dftfe parameterFileMg2x_7.prm > outputMg2x_7 +srun -n 18 -c 7 --gpu-bind closest $BASE/dftfe parameterFileMg2x_12.prm > outputMg2x_12 +srun -n 18 -c 7 --gpu-bind closest $BASE/dftfe parameterFileMg2x_13.prm > outputMg2x_13 +srun -n 18 -c 7 --gpu-bind closest $BASE/dftfe parameterFileBe.prm > outputBe + + diff --git a/testsGPU/pseudopotential/real/jobscripts/frontierJobScript6GCDs6MPITasks.rc b/testsGPU/pseudopotential/real/jobscripts/frontierJobScript6GCDs6MPITasks.rc new file mode 100644 index 000000000..9c051b5e5 --- /dev/null +++ b/testsGPU/pseudopotential/real/jobscripts/frontierJobScript6GCDs6MPITasks.rc @@ -0,0 +1,39 @@ +#!/ccs/home/dsambit/frontier/bin/rc +#SBATCH -A mat239 +#SBATCH -J gputests +#SBATCH -t 1:00:00 +#SBATCH -p batch +#SBATCH -N 1 +#SBATCH --gpus-per-node 6 +#SBATCH --ntasks-per-gpu 1 +#SBATCH --gpu-bind closest + +OMP_NUM_THREADS = 1 +MPICH_VERSION_DISPLAY=1 +MPICH_ENV_DISPLAY=1 +MPICH_OFI_NIC_POLICY = NUMA +MPICH_GPU_SUPPORT_ENABLED=1 +MPICH_SMP_SINGLE_COPY_MODE=NONE + +LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$INST/lib +LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$INST/lib/lib64 +LD_LIBRARY_PATH=$CRAY_LD_LIBRARY_PATH:$LD_LIBRARY_PATH + + +BASE = $WD/src/dftfeDebug/build/release/real +n=`{echo $SLURM_JOB_NUM_NODES '*' 8 | bc} + +srun -n 18 -c 7 --gpu-bind closest $BASE/dftfe Input_MD_0.prm > output_MD_0 +srun -n 18 -c 7 --gpu-bind closest $BASE/dftfe Input_MD_1.prm > output_MD_1 +srun -n 18 -c 7 --gpu-bind closest $BASE/dftfe Input_MD_2.prm > output_MD_2 +srun -n 18 -c 7 --gpu-bind closest $BASE/dftfe 
parameterFileMg2x_1.prm > outputMg2x_1 +srun -n 18 -c 7 --gpu-bind closest $BASE/dftfe parameterFileMg2x_1_spingpu.prm > outputMg2x_1_spin_gpu +srun -n 18 -c 7 --gpu-bind closest $BASE/dftfe parameterFileMg2x_2.prm > outputMg2x_2 +srun -n 18 -c 7 --gpu-bind closest $BASE/dftfe parameterFileMg2x_3.prm > outputMg2x_3 +srun -n 18 -c 7 --gpu-bind closest $BASE/dftfe parameterFileMg2x_4.prm > outputMg2x_4 +srun -n 18 -c 7 --gpu-bind closest $BASE/dftfe parameterFileMg2x_5.prm > outputMg2x_5 +srun -n 18 -c 7 --gpu-bind closest $BASE/dftfe parameterFileMg2x_6.prm > outputMg2x_6 +srun -n 18 -c 7 --gpu-bind closest $BASE/dftfe parameterFileMg2x_7.prm > outputMg2x_7 +srun -n 18 -c 7 --gpu-bind closest $BASE/dftfe parameterFileMg2x_12.prm > outputMg2x_12 +srun -n 18 -c 7 --gpu-bind closest $BASE/dftfe parameterFileMg2x_13.prm > outputMg2x_13 +srun -n 18 -c 7 --gpu-bind closest $BASE/dftfe parameterFileBe.prm > outputBe diff --git a/utils/DeviceBlasWrapper.cu.cc b/utils/DeviceBlasWrapper.cu.cc index 6ed34f28a..dede70848 100644 --- a/utils/DeviceBlasWrapper.cu.cc +++ b/utils/DeviceBlasWrapper.cu.cc @@ -474,6 +474,108 @@ namespace dftfe return status; } + deviceBlasStatus_t + gemv(deviceBlasHandle_t handle, + deviceBlasOperation_t trans, + int m, + int n, + const double * alpha, + const double * A, + int lda, + const double * x, + int incx, + const double * beta, + double * y, + int incy) + { + deviceBlasStatus_t status = cublasDgemv( + handle, trans, m, n, alpha, A, lda, x, incx, beta, y, incy); + DEVICEBLAS_API_CHECK(status); + return status; + } + + deviceBlasStatus_t + gemv(deviceBlasHandle_t handle, + deviceBlasOperation_t trans, + int m, + int n, + const float * alpha, + const float * A, + int lda, + const float * x, + int incx, + const float * beta, + float * y, + int incy) + { + deviceBlasStatus_t status = cublasSgemv( + handle, trans, m, n, alpha, A, lda, x, incx, beta, y, incy); + DEVICEBLAS_API_CHECK(status); + return status; + } + + deviceBlasStatus_t + 
gemv(deviceBlasHandle_t handle, + deviceBlasOperation_t trans, + int m, + int n, + const std::complex *alpha, + const std::complex *A, + int lda, + const std::complex *x, + int incx, + const std::complex *beta, + std::complex * y, + int incy) + { + deviceBlasStatus_t status = + cublasZgemv(handle, + trans, + m, + n, + dftfe::utils::makeDataTypeDeviceCompatible(alpha), + dftfe::utils::makeDataTypeDeviceCompatible(A), + lda, + dftfe::utils::makeDataTypeDeviceCompatible(x), + incx, + dftfe::utils::makeDataTypeDeviceCompatible(beta), + dftfe::utils::makeDataTypeDeviceCompatible(y), + incy); + DEVICEBLAS_API_CHECK(status); + return status; + } + + deviceBlasStatus_t + gemv(deviceBlasHandle_t handle, + deviceBlasOperation_t trans, + int m, + int n, + const std::complex *alpha, + const std::complex *A, + int lda, + const std::complex *x, + int incx, + const std::complex *beta, + std::complex * y, + int incy) + { + deviceBlasStatus_t status = + cublasCgemv(handle, + trans, + m, + n, + dftfe::utils::makeDataTypeDeviceCompatible(alpha), + dftfe::utils::makeDataTypeDeviceCompatible(A), + lda, + dftfe::utils::makeDataTypeDeviceCompatible(x), + incx, + dftfe::utils::makeDataTypeDeviceCompatible(beta), + dftfe::utils::makeDataTypeDeviceCompatible(y), + incy); + DEVICEBLAS_API_CHECK(status); + return status; + } + } // namespace deviceBlasWrapper } // namespace utils } // namespace dftfe diff --git a/utils/DeviceBlasWrapper.hip.cc b/utils/DeviceBlasWrapper.hip.cc index ec6a5b316..f24a55ccf 100644 --- a/utils/DeviceBlasWrapper.hip.cc +++ b/utils/DeviceBlasWrapper.hip.cc @@ -517,6 +517,107 @@ namespace dftfe DEVICEBLAS_API_CHECK(status); return status; } + deviceBlasStatus_t + gemv(deviceBlasHandle_t handle, + deviceBlasOperation_t trans, + int m, + int n, + const double * alpha, + const double * A, + int lda, + const double * x, + int incx, + const double * beta, + double * y, + int incy) + { + deviceBlasStatus_t status = hipblasDgemv( + handle, trans, m, n, alpha, A, lda, x, 
incx, beta, y, incy); + DEVICEBLAS_API_CHECK(status); + return status; + } + + deviceBlasStatus_t + gemv(deviceBlasHandle_t handle, + deviceBlasOperation_t trans, + int m, + int n, + const float * alpha, + const float * A, + int lda, + const float * x, + int incx, + const float * beta, + float * y, + int incy) + { + deviceBlasStatus_t status = hipblasSgemv( + handle, trans, m, n, alpha, A, lda, x, incx, beta, y, incy); + DEVICEBLAS_API_CHECK(status); + return status; + } + + deviceBlasStatus_t + gemv(deviceBlasHandle_t handle, + deviceBlasOperation_t trans, + int m, + int n, + const std::complex *alpha, + const std::complex *A, + int lda, + const std::complex *x, + int incx, + const std::complex *beta, + std::complex * y, + int incy) + { + deviceBlasStatus_t status = + hipblasZgemv(handle, + trans, + m, + n, + makeDataTypeHipBlasCompatible(alpha), + makeDataTypeHipBlasCompatible(A), + lda, + makeDataTypeHipBlasCompatible(x), + incx, + makeDataTypeHipBlasCompatible(beta), + makeDataTypeHipBlasCompatible(y), + incy); + DEVICEBLAS_API_CHECK(status); + return status; + } + + deviceBlasStatus_t + gemv(deviceBlasHandle_t handle, + deviceBlasOperation_t trans, + int m, + int n, + const std::complex *alpha, + const std::complex *A, + int lda, + const std::complex *x, + int incx, + const std::complex *beta, + std::complex * y, + int incy) + { + deviceBlasStatus_t status = + hipblasCgemv(handle, + trans, + m, + n, + makeDataTypeHipBlasCompatible(alpha), + makeDataTypeHipBlasCompatible(A), + lda, + makeDataTypeHipBlasCompatible(x), + incx, + makeDataTypeHipBlasCompatible(beta), + makeDataTypeHipBlasCompatible(y), + incy); + DEVICEBLAS_API_CHECK(status); + return status; + } } // namespace deviceBlasWrapper } // namespace utils diff --git a/utils/DeviceKernelsGeneric.cc b/utils/DeviceKernelsGeneric.cc index 54c517e28..56ecd913d 100644 --- a/utils/DeviceKernelsGeneric.cc +++ b/utils/DeviceKernelsGeneric.cc @@ -208,6 +208,37 @@ namespace dftfe } } + template + __global__ void + 
stridedCopyConstantStrideDeviceKernel(const dftfe::size_type blockSize, + const dftfe::size_type strideTo, + const dftfe::size_type strideFrom, + const dftfe::size_type numBlocks, + const dftfe::size_type startingToId, + const dftfe::size_type startingFromId, + const ValueType1 * copyFromVec, + ValueType2 * copyToVec) + { + { + const dftfe::size_type globalThreadId = + blockIdx.x * blockDim.x + threadIdx.x; + const dftfe::size_type numberEntries = numBlocks * blockSize; + + for (dftfe::size_type index = globalThreadId; index < numberEntries; + index += blockDim.x * gridDim.x) + { + dftfe::size_type blockIndex = index / blockSize; + dftfe::size_type intraBlockIndex = index - blockIndex * blockSize; + dftfe::utils::copyValue( + copyToVec + blockIndex * strideTo + startingToId + + intraBlockIndex, + copyFromVec[blockIndex * strideFrom + startingFromId + + intraBlockIndex]); + } + } + } + + // x=a*x, with inc=1 template __global__ void @@ -586,6 +617,47 @@ namespace dftfe #endif } + template + void + stridedCopyConstantStride(const dftfe::size_type blockSize, + const dftfe::size_type strideTo, + const dftfe::size_type strideFrom, + const dftfe::size_type numBlocks, + const dftfe::size_type startingToId, + const dftfe::size_type startingFromId, + const ValueType1 * copyFromVec, + ValueType2 * copyToVec) + { +#ifdef DFTFE_WITH_DEVICE_LANG_CUDA + stridedCopyConstantStrideDeviceKernel<<< + (blockSize * numBlocks) / dftfe::utils::DEVICE_BLOCK_SIZE + 1, + dftfe::utils::DEVICE_BLOCK_SIZE>>>( + blockSize, + strideTo, + strideFrom, + numBlocks, + startingToId, + startingFromId, + dftfe::utils::makeDataTypeDeviceCompatible(copyFromVec), + dftfe::utils::makeDataTypeDeviceCompatible(copyToVec)); +#elif DFTFE_WITH_DEVICE_LANG_HIP + hipLaunchKernelGGL( + stridedCopyConstantStrideDeviceKernel, + (blockSize * numBlocks) / dftfe::utils::DEVICE_BLOCK_SIZE + 1, + dftfe::utils::DEVICE_BLOCK_SIZE, + 0, + 0, + blockSize, + strideTo, + strideFrom, + numBlocks, + startingToId, + 
startingFromId, + dftfe::utils::makeDataTypeDeviceCompatible(copyFromVec), + dftfe::utils::makeDataTypeDeviceCompatible(copyToVec)); +#endif + } + template void @@ -1312,6 +1384,87 @@ namespace dftfe const dftfe::size_type startingId, const std::complex *copyFromVec, std::complex * copyToVec); + // strided copy constant stride + template void + stridedCopyConstantStride(const dftfe::size_type blockSize, + const dftfe::size_type strideTo, + const dftfe::size_type strideFrom, + const dftfe::size_type numBlocks, + const dftfe::size_type startingToId, + const dftfe::size_type startingFromId, + const double * copyFromVec, + double * copyToVec); + + template void + stridedCopyConstantStride(const dftfe::size_type blockSize, + const dftfe::size_type strideTo, + const dftfe::size_type strideFrom, + const dftfe::size_type numBlocks, + const dftfe::size_type startingToId, + const dftfe::size_type startingFromId, + const float * copyFromVec, + float * copyToVec); + + template void + stridedCopyConstantStride(const dftfe::size_type blockSize, + const dftfe::size_type strideTo, + const dftfe::size_type strideFrom, + const dftfe::size_type numBlocks, + const dftfe::size_type startingToId, + const dftfe::size_type startingFromId, + const std::complex *copyFromVec, + std::complex * copyToVec); + + template void + stridedCopyConstantStride(const dftfe::size_type blockSize, + const dftfe::size_type strideTo, + const dftfe::size_type strideFrom, + const dftfe::size_type numBlocks, + const dftfe::size_type startingToId, + const dftfe::size_type startingFromId, + const std::complex *copyFromVec, + std::complex * copyToVec); + + + template void + stridedCopyConstantStride(const dftfe::size_type blockSize, + const dftfe::size_type strideTo, + const dftfe::size_type strideFrom, + const dftfe::size_type numBlocks, + const dftfe::size_type startingToId, + const dftfe::size_type startingFromId, + const double * copyFromVec, + float * copyToVec); + + template void + 
stridedCopyConstantStride(const dftfe::size_type blockSize, + const dftfe::size_type strideTo, + const dftfe::size_type strideFrom, + const dftfe::size_type numBlocks, + const dftfe::size_type startingToId, + const dftfe::size_type startingFromId, + const float * copyFromVec, + double * copyToVec); + + template void + stridedCopyConstantStride(const dftfe::size_type blockSize, + const dftfe::size_type strideTo, + const dftfe::size_type strideFrom, + const dftfe::size_type numBlocks, + const dftfe::size_type startingToId, + const dftfe::size_type startingFromId, + const std::complex *copyFromVec, + std::complex * copyToVec); + + template void + stridedCopyConstantStride(const dftfe::size_type blockSize, + const dftfe::size_type strideTo, + const dftfe::size_type strideFrom, + const dftfe::size_type numBlocks, + const dftfe::size_type startingToId, + const dftfe::size_type startingFromId, + const std::complex *copyFromVec, + std::complex * copyToVec); // stridedBlockScale template void diff --git a/utils/FEBasisOperations.t.cc b/utils/FEBasisOperations.t.cc new file mode 100644 index 000000000..b26ee0c40 --- /dev/null +++ b/utils/FEBasisOperations.t.cc @@ -0,0 +1,1059 @@ +// --------------------------------------------------------------------- +// +// Copyright (c) 2017-2022 The Regents of the University of Michigan and DFT-FE +// authors. +// +// This file is part of the DFT-FE code. +// +// The DFT-FE code is free software; you can use it, redistribute +// it, and/or modify it under the terms of the GNU Lesser General +// Public License as published by the Free Software Foundation; either +// version 2.1 of the License, or (at your option) any later version. +// The full text of the license can be found in the file LICENSE at +// the top level of the DFT-FE distribution. 
+// +// --------------------------------------------------------------------- +// + +#include +namespace dftfe +{ + namespace basis + { + template + FEBasisOperationsBase:: + FEBasisOperationsBase( + dealii::MatrixFree<3, ValueTypeBasisData> &matrixFreeData, + std::vector *> + &constraintsVector) + { + d_matrixFreeDataPtr = &matrixFreeData; + d_constraintsVector = &constraintsVector; + d_dofHandlerID = 0; + d_nVectors = 0; + d_updateFlags = update_default; + areAllCellsAffine = true; + for (unsigned int iMacroCell = 0; + iMacroCell < d_matrixFreeDataPtr->n_cell_batches(); + ++iMacroCell) + { + areAllCellsAffine = + areAllCellsAffine && + (d_matrixFreeDataPtr->get_mapping_info().get_cell_type( + iMacroCell) <= dealii::internal::MatrixFreeFunctions::affine); + } + areAllCellsCartesian = true; + for (unsigned int iMacroCell = 0; + iMacroCell < d_matrixFreeDataPtr->n_cell_batches(); + ++iMacroCell) + { + areAllCellsCartesian = + areAllCellsCartesian && + (d_matrixFreeDataPtr->get_mapping_info().get_cell_type( + iMacroCell) == dealii::internal::MatrixFreeFunctions::cartesian); + } + } + + template + void + FEBasisOperationsBase::init(const unsigned int &dofHandlerID, + const std::vector + & quadratureID, + const UpdateFlags updateFlags) + { + d_dofHandlerID = dofHandlerID; + d_quadratureIDsVector = quadratureID; + d_updateFlags = updateFlags; + initializeIndexMaps(); + initializeMPIPattern(); + initializeConstraints(); + initializeShapeFunctionAndJacobianData(); + if (!std::is_same::value) + initializeShapeFunctionAndJacobianBasisData(); + } + + template + void + FEBasisOperationsBase::reinit(const unsigned int &vecBlockSize, + const unsigned int + &cellsBlockSize, + const unsigned int &quadratureID, + const bool isResizeTempStorage) + { + d_quadratureID = quadratureID; + d_cellsBlockSize = cellsBlockSize; + if (d_nVectors != vecBlockSize) + { + d_nVectors = vecBlockSize; + initializeFlattenedIndexMaps(); + } + if (isResizeTempStorage) + resizeTempStorage(); + } + + 
template + unsigned int + FEBasisOperationsBase::nQuadsPerCell() const + { + return d_nQuadsPerCell[d_quadratureID]; + } + + template + unsigned int + FEBasisOperationsBase::nDofsPerCell() const + { + return d_nDofsPerCell; + } + + template + unsigned int + FEBasisOperationsBase::nCells() const + { + return d_nCells; + } + + template + unsigned int + FEBasisOperationsBase::nRelaventDofs() const + { + return d_localSize; + } + + template + unsigned int + FEBasisOperationsBase::nOwnedDofs() const + { + return d_locallyOwnedSize; + } + + template + const dftfe::utils::MemoryStorage & + FEBasisOperationsBase::shapeFunctionData(bool transpose) const + { + return transpose ? d_shapeFunctionDataTranspose[d_quadratureID] : + d_shapeFunctionData[d_quadratureID]; + } + + template + const dftfe::utils::MemoryStorage & + FEBasisOperationsBase< + ValueTypeBasisCoeff, + ValueTypeBasisData, + memorySpace>::shapeFunctionGradientData(bool transpose) const + { + return transpose ? d_shapeFunctionGradientDataTranspose[d_quadratureID] : + d_shapeFunctionGradientData[d_quadratureID]; + } + + template + const dftfe::utils::MemoryStorage & + FEBasisOperationsBase::inverseJacobians() const + { + return d_inverseJacobianData[areAllCellsAffine ? 0 : d_quadratureID]; + } + + template + const dftfe::utils::MemoryStorage & + FEBasisOperationsBase::JxW() const + { /* Returns the JxW table of the quadrature rule selected by reinit(). d_JxWData holds one entry per quadrature rule (it is filled as d_JxWData[iQuadID]), so it is indexed by the active rule even when all cells are affine; only the inverse Jacobians are stored once for affine meshes. */ + return d_JxWData[d_quadratureID]; + } + + template + template ::value, int>> + const dftfe::utils::MemoryStorage & + FEBasisOperationsBase::JxWBasisData() const + { /* Overload that reuses d_JxWData; indexed by the active quadrature rule because the table is stored per rule, independent of areAllCellsAffine. */ + return d_JxWData[d_quadratureID]; + } + + template + template ::value, int>> + const dftfe::utils::MemoryStorage & + FEBasisOperationsBase::JxWBasisData() const + { /* Overload backed by the separate d_JxWBasisData table, which is likewise filled once per quadrature rule and therefore indexed by the active rule. */ + return d_JxWBasisData[d_quadratureID]; + } + + template + template ::value, int>> + const dftfe::utils::MemoryStorage & + FEBasisOperationsBase::inverseJacobiansBasisData() const + { + return d_inverseJacobianData[areAllCellsAffine ?
0 : d_quadratureID]; + } + + template + template ::value, int>> + const dftfe::utils::MemoryStorage & + FEBasisOperationsBase::inverseJacobiansBasisData() const + { + return d_inverseJacobianBasisData[areAllCellsAffine ? 0 : d_quadratureID]; + } + + template + template ::value, int>> + const dftfe::utils::MemoryStorage & + FEBasisOperationsBase::shapeFunctionBasisData(bool transpose) + const + { + return transpose ? d_shapeFunctionDataTranspose[d_quadratureID] : + d_shapeFunctionData[d_quadratureID]; + } + + template + template ::value, int>> + const dftfe::utils::MemoryStorage & + FEBasisOperationsBase::shapeFunctionBasisData(bool transpose) + const + { + return transpose ? d_shapeFunctionBasisDataTranspose[d_quadratureID] : + d_shapeFunctionBasisData[d_quadratureID]; + } + + + template + template ::value, int>> + const dftfe::utils::MemoryStorage & + FEBasisOperationsBase< + ValueTypeBasisCoeff, + ValueTypeBasisData, + memorySpace>::shapeFunctionGradientBasisData(bool transpose) const + { + return transpose ? d_shapeFunctionGradientDataTranspose[d_quadratureID] : + d_shapeFunctionGradientData[d_quadratureID]; + } + + template + template ::value, int>> + const dftfe::utils::MemoryStorage & + FEBasisOperationsBase< + ValueTypeBasisCoeff, + ValueTypeBasisData, + memorySpace>::shapeFunctionGradientBasisData(bool transpose) const + { + return transpose ? 
+ d_shapeFunctionGradientBasisDataTranspose[d_quadratureID] : + d_shapeFunctionGradientBasisData[d_quadratureID]; + } + + + template + unsigned int + FEBasisOperationsBase::cellsTypeFlag() const + { + return (unsigned int)areAllCellsAffine + + (unsigned int)areAllCellsCartesian; + } + + template + dealii::CellId + FEBasisOperationsBase::cellID(const unsigned int iElem) const + { + return d_cellIndexToCellIdMap[iElem]; + } + + + + template + void + FEBasisOperationsBase::resizeTempStorage() + { + tempCellNodalData.resize(d_nVectors * d_nDofsPerCell * d_cellsBlockSize); + + if (d_updateFlags & update_gradients) + tempQuadratureGradientsData.resize( + areAllCellsCartesian ? 0 : + (d_nVectors * d_nQuadsPerCell[d_quadratureID] * + 3 * d_cellsBlockSize)); + + if (d_updateFlags & update_gradients) + tempQuadratureGradientsDataNonAffine.resize( + areAllCellsAffine ? 0 : + (d_nVectors * d_nQuadsPerCell[d_quadratureID] * + 3 * d_cellsBlockSize)); + } + + template + void + FEBasisOperationsBase::initializeFlattenedIndexMaps() + { +#if defined(DFTFE_WITH_DEVICE) + dftfe::utils::MemoryStorage + d_flattenedCellDofIndexToProcessDofIndexMapHost; +#else + auto &d_flattenedCellDofIndexToProcessDofIndexMapHost = + d_flattenedCellDofIndexToProcessDofIndexMap; +#endif + d_flattenedCellDofIndexToProcessDofIndexMapHost.clear(); + d_flattenedCellDofIndexToProcessDofIndexMapHost.resize(d_nCells * + d_nDofsPerCell); + + std::transform(d_cellDofIndexToProcessDofIndexMap.begin(), + d_cellDofIndexToProcessDofIndexMap.end(), + d_flattenedCellDofIndexToProcessDofIndexMapHost.begin(), + [&a = this->d_nVectors](auto &c) { return c * a; }); +#if defined(DFTFE_WITH_DEVICE) + d_flattenedCellDofIndexToProcessDofIndexMap.resize( + d_flattenedCellDofIndexToProcessDofIndexMapHost.size()); + d_flattenedCellDofIndexToProcessDofIndexMap.copyFrom( + d_flattenedCellDofIndexToProcessDofIndexMapHost); +#endif + } + + template + void + FEBasisOperationsBase::initializeMPIPattern() + { + const std::pair 
&locallyOwnedRange = + d_matrixFreeDataPtr->get_vector_partitioner(d_dofHandlerID) + ->local_range(); + + std::vector ghostIndices; + (d_matrixFreeDataPtr->get_vector_partitioner(d_dofHandlerID) + ->ghost_indices()) + .fill_index_vector(ghostIndices); + + mpiPatternP2P = + std::make_shared>( + locallyOwnedRange, + ghostIndices, + d_matrixFreeDataPtr->get_vector_partitioner(d_dofHandlerID) + ->get_mpi_communicator()); + } + + template + void + FEBasisOperationsBase::initializeIndexMaps() + { + d_nCells = d_matrixFreeDataPtr->n_physical_cells(); + d_nDofsPerCell = d_matrixFreeDataPtr->get_dof_handler(d_dofHandlerID) + .get_fe() + .dofs_per_cell; + d_locallyOwnedSize = + d_matrixFreeDataPtr->get_vector_partitioner(d_dofHandlerID) + ->locally_owned_size(); + d_localSize = d_locallyOwnedSize + + d_matrixFreeDataPtr->get_vector_partitioner(d_dofHandlerID) + ->n_ghost_indices(); + d_cellDofIndexToProcessDofIndexMap.clear(); + d_cellDofIndexToProcessDofIndexMap.resize(d_nCells * d_nDofsPerCell); + + d_cellIndexToCellIdMap.clear(); + d_cellIndexToCellIdMap.resize(d_nCells); + + auto cellPtr = + d_matrixFreeDataPtr->get_dof_handler(d_dofHandlerID).begin_active(); + auto endcPtr = d_matrixFreeDataPtr->get_dof_handler(d_dofHandlerID).end(); + + std::vector cellDofIndicesGlobal(d_nDofsPerCell); + std::map cellIdToCellIndexMap; + + unsigned int iCell = 0; + for (; cellPtr != endcPtr; ++cellPtr) + if (cellPtr->is_locally_owned()) + { + cellPtr->get_dof_indices(cellDofIndicesGlobal); + for (unsigned int iDof = 0; iDof < d_nDofsPerCell; ++iDof) + d_cellDofIndexToProcessDofIndexMap[iCell * d_nDofsPerCell + + iDof] = + d_matrixFreeDataPtr->get_vector_partitioner(d_dofHandlerID) + ->global_to_local(cellDofIndicesGlobal[iDof]); + + + d_cellIndexToCellIdMap[iCell] = cellPtr->id(); + + ++iCell; + } + } + + + template + void + FEBasisOperationsBase::initializeConstraints() + { + d_constraintInfo.initialize(d_matrixFreeDataPtr->get_vector_partitioner( + d_dofHandlerID), + 
*((*d_constraintsVector)[d_dofHandlerID])); + } + + template + void + FEBasisOperationsBase::initializeShapeFunctionAndJacobianData() + { /* For every quadrature rule in d_quadratureIDsVector, tabulates shape values/gradients (taken from the first locally owned cell), JxW and inverse Jacobians on the host and, in device builds, copies them into the member storages. */ + d_nQuadsPerCell.resize(d_quadratureIDsVector.size()); + d_inverseJacobianData.resize( + areAllCellsAffine ? 1 : d_quadratureIDsVector.size()); + d_JxWData.resize(d_quadratureIDsVector.size()); + + /* Size every per-rule container unconditionally. Both the host references + * taken in the loop below and the device copy-back index all of these + * containers by iQuadID regardless of d_updateFlags, so a container that + * was left empty for an unset flag would be accessed out of bounds. The + * per-rule storages themselves are still only filled when the matching + * flag is set; otherwise they simply stay empty. */ + d_shapeFunctionData.resize(d_quadratureIDsVector.size()); + d_shapeFunctionDataTranspose.resize(d_quadratureIDsVector.size()); + d_shapeFunctionGradientDataInternalLayout.resize( + d_quadratureIDsVector.size()); + d_shapeFunctionGradientData.resize(d_quadratureIDsVector.size()); + d_shapeFunctionGradientDataTranspose.resize( + d_quadratureIDsVector.size()); + + for (unsigned int iQuadID = 0; iQuadID < d_quadratureIDsVector.size(); + ++iQuadID) + { + const dealii::Quadrature<3> &quadrature = + d_matrixFreeDataPtr->get_quadrature(d_quadratureIDsVector[iQuadID]); + dealii::FEValues<3> fe_values( + d_matrixFreeDataPtr->get_dof_handler(d_dofHandlerID).get_fe(), + quadrature, + dealii::update_values | dealii::update_gradients | + dealii::update_jacobians | dealii::update_JxW_values | + dealii::update_inverse_jacobians); + + d_nQuadsPerCell[iQuadID] = quadrature.size(); + +#if defined(DFTFE_WITH_DEVICE) + dftfe::utils::MemoryStorage + d_inverseJacobianDataHost; + dftfe::utils::MemoryStorage + d_JxWDataHost; + dftfe::utils::MemoryStorage + d_shapeFunctionDataHost; + dftfe::utils::MemoryStorage + d_shapeFunctionDataTransposeHost; + dftfe::utils::MemoryStorage + d_shapeFunctionGradientDataInternalLayoutHost; + dftfe::utils::MemoryStorage + d_shapeFunctionGradientDataHost; + dftfe::utils::MemoryStorage + d_shapeFunctionGradientDataTransposeHost; +#else + auto &d_inverseJacobianDataHost = + d_inverseJacobianData[areAllCellsAffine ?
0 : iQuadID]; + auto &d_JxWDataHost = d_JxWData[iQuadID]; + auto &d_shapeFunctionDataHost = d_shapeFunctionData[iQuadID]; + auto &d_shapeFunctionGradientDataInternalLayoutHost = + d_shapeFunctionGradientDataInternalLayout[iQuadID]; + auto &d_shapeFunctionDataTransposeHost = + d_shapeFunctionDataTranspose[iQuadID]; + auto &d_shapeFunctionGradientDataHost = + d_shapeFunctionGradientData[iQuadID]; + auto &d_shapeFunctionGradientDataTransposeHost = + d_shapeFunctionGradientDataTranspose[iQuadID]; +#endif + + + d_shapeFunctionDataHost.clear(); + if (d_updateFlags & update_values) + d_shapeFunctionDataHost.resize(d_nQuadsPerCell[iQuadID] * + d_nDofsPerCell, + 0.0); + d_shapeFunctionDataTransposeHost.clear(); + if ((d_updateFlags & update_values) && + (d_updateFlags & update_transpose)) + d_shapeFunctionDataTransposeHost.resize(d_nQuadsPerCell[iQuadID] * + d_nDofsPerCell, + 0.0); + d_shapeFunctionGradientDataInternalLayoutHost.clear(); + d_shapeFunctionGradientDataHost.clear(); + d_shapeFunctionGradientDataTransposeHost.clear(); + if (d_updateFlags & update_gradients) + { + d_shapeFunctionGradientDataInternalLayoutHost.resize( + d_nQuadsPerCell[iQuadID] * d_nDofsPerCell * 3, 0.0); + d_shapeFunctionGradientDataHost.resize(d_nQuadsPerCell[iQuadID] * + d_nDofsPerCell * 3, + 0.0); + if (d_updateFlags & update_transpose) + d_shapeFunctionGradientDataTransposeHost.resize( + d_nQuadsPerCell[iQuadID] * d_nDofsPerCell * 3, 0.0); + } + + d_JxWDataHost.clear(); + if ((d_updateFlags & update_values) || + (d_updateFlags & update_gradients)) + d_JxWDataHost.resize(d_nCells * d_nQuadsPerCell[iQuadID]); + + d_inverseJacobianDataHost.clear(); + if (d_updateFlags & update_gradients) + d_inverseJacobianDataHost.resize( + areAllCellsCartesian ? + d_nCells * 3 : + (areAllCellsAffine ? d_nCells * 9 : + d_nCells * 9 * d_nQuadsPerCell[iQuadID])); + const unsigned int nJacobiansPerCell = + areAllCellsAffine ? 
1 : d_nQuadsPerCell[iQuadID]; + + auto cellPtr = + d_matrixFreeDataPtr->get_dof_handler(d_dofHandlerID).begin_active(); + auto endcPtr = + d_matrixFreeDataPtr->get_dof_handler(d_dofHandlerID).end(); + + unsigned int iCell = 0; + for (; cellPtr != endcPtr; ++cellPtr) + if (cellPtr->is_locally_owned()) + { + fe_values.reinit(cellPtr); + auto &jacobians = fe_values.get_jacobians(); + auto &inverseJacobians = fe_values.get_inverse_jacobians(); + if (iCell == 0) + { + if (d_updateFlags & update_values) + { + for (unsigned int iNode = 0; iNode < d_nDofsPerCell; + ++iNode) + for (unsigned int iQuad = 0; + iQuad < d_nQuadsPerCell[iQuadID]; + ++iQuad) + d_shapeFunctionDataHost[iQuad * d_nDofsPerCell + + iNode] = + fe_values.shape_value(iNode, iQuad); + if (d_updateFlags & update_transpose) + for (unsigned int iNode = 0; iNode < d_nDofsPerCell; + ++iNode) + for (unsigned int iQuad = 0; + iQuad < d_nQuadsPerCell[iQuadID]; + ++iQuad) + d_shapeFunctionDataTransposeHost + [iNode * d_nQuadsPerCell[iQuadID] + iQuad] = + fe_values.shape_value(iNode, iQuad); + } + + + if (d_updateFlags & update_gradients) + for (unsigned int iQuad = 0; + iQuad < d_nQuadsPerCell[iQuadID]; + ++iQuad) + for (unsigned int iNode = 0; iNode < d_nDofsPerCell; + ++iNode) + { + const auto &shape_grad_real = + fe_values.shape_grad(iNode, iQuad); + const auto &shape_grad_reference = + apply_transformation(jacobians[iQuad].transpose(), + shape_grad_real); + for (unsigned int iDim = 0; iDim < 3; ++iDim) + if (areAllCellsAffine) + d_shapeFunctionGradientDataInternalLayoutHost + [d_nQuadsPerCell[iQuadID] * d_nDofsPerCell * + iDim + + d_nDofsPerCell * iQuad + iNode] = + shape_grad_reference[iDim]; + else + d_shapeFunctionGradientDataInternalLayoutHost + [iQuad * d_nDofsPerCell * 3 + + d_nDofsPerCell * iDim + iNode] = + shape_grad_reference[iDim]; + + + for (unsigned int iDim = 0; iDim < 3; ++iDim) + d_shapeFunctionGradientDataHost + [iDim * d_nQuadsPerCell[iQuadID] * + d_nDofsPerCell + + iQuad * d_nDofsPerCell + 
iNode] = + shape_grad_reference[iDim]; + if (d_updateFlags & update_transpose) + for (unsigned int iDim = 0; iDim < 3; ++iDim) + d_shapeFunctionGradientDataTransposeHost + [iDim * d_nQuadsPerCell[iQuadID] * + d_nDofsPerCell + + iNode * d_nQuadsPerCell[iQuadID] + iQuad] = + shape_grad_reference[iDim]; + } + } + for (unsigned int iQuad = 0; iQuad < d_nQuadsPerCell[iQuadID]; + ++iQuad) + d_JxWDataHost[iCell * d_nQuadsPerCell[iQuadID] + iQuad] = + fe_values.JxW(iQuad); + for (unsigned int iQuad = 0; iQuad < nJacobiansPerCell; ++iQuad) + for (unsigned int iDim = 0; iDim < 3; ++iDim) + if (areAllCellsCartesian) + d_inverseJacobianDataHost[iCell * nJacobiansPerCell * 3 + + iDim * nJacobiansPerCell + + iQuad] = + inverseJacobians[iQuad][iDim][iDim]; + else + for (unsigned int jDim = 0; jDim < 3; ++jDim) + d_inverseJacobianDataHost[iCell * nJacobiansPerCell * + 9 + + 9 * iQuad + jDim * 3 + iDim] = + inverseJacobians[iQuad][iDim][jDim]; + ++iCell; + } + +#if defined(DFTFE_WITH_DEVICE) + d_inverseJacobianData[areAllCellsAffine ? 0 : iQuadID].resize( + d_inverseJacobianDataHost.size()); + d_inverseJacobianData[areAllCellsAffine ? 
0 : iQuadID].copyFrom( + d_inverseJacobianDataHost); + d_JxWData[iQuadID].resize(d_JxWDataHost.size()); + d_JxWData[iQuadID].copyFrom(d_JxWDataHost); + d_shapeFunctionData[iQuadID].resize(d_shapeFunctionDataHost.size()); + d_shapeFunctionData[iQuadID].copyFrom(d_shapeFunctionDataHost); + d_shapeFunctionGradientDataInternalLayout[iQuadID].resize( + d_shapeFunctionGradientDataInternalLayoutHost.size()); + d_shapeFunctionGradientDataInternalLayout[iQuadID].copyFrom( + d_shapeFunctionGradientDataInternalLayoutHost); + d_shapeFunctionDataTranspose[iQuadID].resize( + d_shapeFunctionDataTransposeHost.size()); + d_shapeFunctionDataTranspose[iQuadID].copyFrom( + d_shapeFunctionDataTransposeHost); + d_shapeFunctionGradientData[iQuadID].resize( + d_shapeFunctionGradientDataHost.size()); + d_shapeFunctionGradientData[iQuadID].copyFrom( + d_shapeFunctionGradientDataHost); + d_shapeFunctionGradientDataTranspose[iQuadID].resize( + d_shapeFunctionGradientDataTransposeHost.size()); + d_shapeFunctionGradientDataTranspose[iQuadID].copyFrom( + d_shapeFunctionGradientDataTransposeHost); +#endif + } + } + + + template + void + FEBasisOperationsBase< + ValueTypeBasisCoeff, + ValueTypeBasisData, + memorySpace>::initializeShapeFunctionAndJacobianBasisData() + { + d_inverseJacobianBasisData.resize( + areAllCellsAffine ? 
1 : d_quadratureIDsVector.size()); + d_JxWBasisData.resize(d_quadratureIDsVector.size()); + + /* Size every per-rule container unconditionally. Both the host references + * taken in the loop below and the device copy-back index all of these + * containers by iQuadID regardless of d_updateFlags, so a container that + * was left empty for an unset flag would be accessed out of bounds. The + * per-rule storages themselves are still only filled when the matching + * flag is set; otherwise they simply stay empty. */ + d_shapeFunctionBasisData.resize(d_quadratureIDsVector.size()); + d_shapeFunctionBasisDataTranspose.resize( + d_quadratureIDsVector.size()); + d_shapeFunctionGradientBasisData.resize(d_quadratureIDsVector.size()); + d_shapeFunctionGradientBasisDataTranspose.resize( + d_quadratureIDsVector.size()); + + for (unsigned int iQuadID = 0; iQuadID < d_quadratureIDsVector.size(); + ++iQuadID) + { + const dealii::Quadrature<3> &quadrature = + d_matrixFreeDataPtr->get_quadrature(d_quadratureIDsVector[iQuadID]); + dealii::FEValues<3> fe_values( + d_matrixFreeDataPtr->get_dof_handler(d_dofHandlerID).get_fe(), + quadrature, + dealii::update_values | dealii::update_gradients | + dealii::update_jacobians | dealii::update_JxW_values | + dealii::update_inverse_jacobians); + +#if defined(DFTFE_WITH_DEVICE) + dftfe::utils::MemoryStorage + d_inverseJacobianDataHost; + dftfe::utils::MemoryStorage + d_JxWDataHost; + dftfe::utils::MemoryStorage + d_shapeFunctionDataHost; + dftfe::utils::MemoryStorage + d_shapeFunctionDataTransposeHost; + dftfe::utils::MemoryStorage + d_shapeFunctionGradientDataHost; + dftfe::utils::MemoryStorage + d_shapeFunctionGradientDataTransposeHost; +#else + auto &d_inverseJacobianDataHost = + d_inverseJacobianBasisData[areAllCellsAffine ?
0 : iQuadID]; + auto &d_JxWDataHost = d_JxWBasisData[iQuadID]; + auto &d_shapeFunctionDataHost = d_shapeFunctionBasisData[iQuadID]; + auto &d_shapeFunctionDataTransposeHost = + d_shapeFunctionBasisDataTranspose[iQuadID]; + auto &d_shapeFunctionGradientDataHost = + d_shapeFunctionGradientBasisData[iQuadID]; + auto &d_shapeFunctionGradientDataTransposeHost = + d_shapeFunctionGradientBasisDataTranspose[iQuadID]; +#endif + + + d_shapeFunctionDataHost.clear(); + if (d_updateFlags & update_values) + d_shapeFunctionDataHost.resize(d_nQuadsPerCell[iQuadID] * + d_nDofsPerCell, + 0.0); + d_shapeFunctionDataTransposeHost.clear(); + if ((d_updateFlags & update_values) && + (d_updateFlags & update_transpose)) + d_shapeFunctionDataTransposeHost.resize(d_nQuadsPerCell[iQuadID] * + d_nDofsPerCell, + 0.0); + d_shapeFunctionGradientDataHost.clear(); + d_shapeFunctionGradientDataTransposeHost.clear(); + if (d_updateFlags & update_gradients) + { + d_shapeFunctionGradientDataHost.resize(d_nQuadsPerCell[iQuadID] * + d_nDofsPerCell * 3, + 0.0); + if (d_updateFlags & update_transpose) + d_shapeFunctionGradientDataTransposeHost.resize( + d_nQuadsPerCell[iQuadID] * d_nDofsPerCell * 3, 0.0); + } + + d_JxWDataHost.clear(); + if ((d_updateFlags & update_values) || + (d_updateFlags & update_gradients)) + d_JxWDataHost.resize(d_nCells * d_nQuadsPerCell[iQuadID]); + + d_inverseJacobianDataHost.clear(); + if (d_updateFlags & update_gradients) + d_inverseJacobianDataHost.resize( + areAllCellsCartesian ? + d_nCells * 3 : + (areAllCellsAffine ? d_nCells * 9 : + d_nCells * 9 * d_nQuadsPerCell[iQuadID])); + const unsigned int nJacobiansPerCell = + areAllCellsAffine ? 
1 : d_nQuadsPerCell[iQuadID]; + + auto cellPtr = + d_matrixFreeDataPtr->get_dof_handler(d_dofHandlerID).begin_active(); + auto endcPtr = + d_matrixFreeDataPtr->get_dof_handler(d_dofHandlerID).end(); + + unsigned int iCell = 0; + for (; cellPtr != endcPtr; ++cellPtr) + if (cellPtr->is_locally_owned()) + { + fe_values.reinit(cellPtr); + auto &jacobians = fe_values.get_jacobians(); + auto &inverseJacobians = fe_values.get_inverse_jacobians(); + if (iCell == 0) + { + if (d_updateFlags & update_values) + { + for (unsigned int iNode = 0; iNode < d_nDofsPerCell; + ++iNode) + for (unsigned int iQuad = 0; + iQuad < d_nQuadsPerCell[iQuadID]; + ++iQuad) + d_shapeFunctionDataHost[iQuad * d_nDofsPerCell + + iNode] = + fe_values.shape_value(iNode, iQuad); + if (d_updateFlags & update_transpose) + for (unsigned int iNode = 0; iNode < d_nDofsPerCell; + ++iNode) + for (unsigned int iQuad = 0; + iQuad < d_nQuadsPerCell[iQuadID]; + ++iQuad) + d_shapeFunctionDataTransposeHost + [iNode * d_nQuadsPerCell[iQuadID] + iQuad] = + fe_values.shape_value(iNode, iQuad); + } + + + if (d_updateFlags & update_gradients) + for (unsigned int iQuad = 0; + iQuad < d_nQuadsPerCell[iQuadID]; + ++iQuad) + for (unsigned int iNode = 0; iNode < d_nDofsPerCell; + ++iNode) + { + const auto &shape_grad_real = + fe_values.shape_grad(iNode, iQuad); + const auto &shape_grad_reference = + apply_transformation(jacobians[iQuad].transpose(), + shape_grad_real); + + for (unsigned int iDim = 0; iDim < 3; ++iDim) + d_shapeFunctionGradientDataHost + [iDim * d_nQuadsPerCell[iQuadID] * + d_nDofsPerCell + + iQuad * d_nDofsPerCell + iNode] = + shape_grad_reference[iDim]; + if (d_updateFlags & update_transpose) + for (unsigned int iDim = 0; iDim < 3; ++iDim) + d_shapeFunctionGradientDataTransposeHost + [iDim * d_nQuadsPerCell[iQuadID] * + d_nDofsPerCell + + iNode * d_nQuadsPerCell[iQuadID] + iQuad] = + shape_grad_reference[iDim]; + } + } + for (unsigned int iQuad = 0; iQuad < d_nQuadsPerCell[iQuadID]; + ++iQuad) + 
d_JxWDataHost[iCell * d_nQuadsPerCell[iQuadID] + iQuad] =
+                  fe_values.JxW(iQuad);
+              for (unsigned int iQuad = 0; iQuad < nJacobiansPerCell; ++iQuad)
+                for (unsigned int iDim = 0; iDim < 3; ++iDim)
+                  if (areAllCellsCartesian)
+                    d_inverseJacobianDataHost[iCell * nJacobiansPerCell * 3 +
+                                              iDim * nJacobiansPerCell +
+                                              iQuad] =
+                      inverseJacobians[iQuad][iDim][iDim];
+                  else
+                    for (unsigned int jDim = 0; jDim < 3; ++jDim)
+                      d_inverseJacobianDataHost[iCell * nJacobiansPerCell *
+                                                  9 +
+                                                9 * iQuad + jDim * 3 + iDim] =
+                        inverseJacobians[iQuad][iDim][jDim];
+              ++iCell;
+            }
+
+#if defined(DFTFE_WITH_DEVICE)
+        d_inverseJacobianBasisData[areAllCellsAffine ? 0 : iQuadID].resize(
+          d_inverseJacobianDataHost.size());
+        d_inverseJacobianBasisData[areAllCellsAffine ? 0 : iQuadID].copyFrom(
+          d_inverseJacobianDataHost);
+        d_JxWBasisData[iQuadID].resize(d_JxWDataHost.size());
+        d_JxWBasisData[iQuadID].copyFrom(d_JxWDataHost);
+        d_shapeFunctionBasisData[iQuadID].resize(
+          d_shapeFunctionDataHost.size());
+        d_shapeFunctionBasisData[iQuadID].copyFrom(d_shapeFunctionDataHost);
+        d_shapeFunctionBasisDataTranspose[iQuadID].resize(
+          d_shapeFunctionDataTransposeHost.size());
+        d_shapeFunctionBasisDataTranspose[iQuadID].copyFrom(
+          d_shapeFunctionDataTransposeHost);
+        d_shapeFunctionGradientBasisData[iQuadID].resize(
+          d_shapeFunctionGradientDataHost.size());
+        d_shapeFunctionGradientBasisData[iQuadID].copyFrom(
+          d_shapeFunctionGradientDataHost);
+        d_shapeFunctionGradientBasisDataTranspose[iQuadID].resize(
+          d_shapeFunctionGradientDataTransposeHost.size());
+        d_shapeFunctionGradientBasisDataTranspose[iQuadID].copyFrom(
+          d_shapeFunctionGradientDataTransposeHost);
+#endif
+      }
+    }
+
+
+    // Re-initialises the caller-supplied multivector on mpiPatternP2P with
+    // the requested block size.
+    template
+    void
+    FEBasisOperationsBase::
+      createMultiVector(
+        const unsigned int blocksize,
+        dftfe::linearAlgebra::MultiVector
+          &multiVector) const
+    {
+      multiVector.reinit(mpiPatternP2P, blocksize);
+    }
+
+    // Adds numMultiVecs scratch multivectors to the pool stored under
+    // vecBlockSize (creating the pool entry if it does not exist yet) and
+    // initialises every vector of that pool on mpiPatternP2P.
+    template
+    void
+    FEBasisOperationsBase::
+      createScratchMultiVectors(const unsigned int vecBlockSize,
+                                const unsigned int numMultiVecs) const
+    {
+      // operator[] inserts an empty pool when the key is missing, so the
+      // "new entry" and "grow existing entry" cases share one code path.
+      auto &pool = scratchMultiVectors[vecBlockSize];
+      pool.resize(pool.size() + numMultiVecs);
+      // NOTE(review): as before, vectors that already existed in the pool are
+      // re-initialised too, not only the newly appended ones -- confirm that
+      // callers do not rely on their previous contents.
+      for (unsigned int iVec = 0; iVec < pool.size(); ++iVec)
+        pool[iVec].reinit(mpiPatternP2P, vecBlockSize);
+    }
+
+    // Drops every scratch multivector, for all block sizes.
+    template
+    void
+    FEBasisOperationsBase::clearScratchMultiVectors() const
+    {
+      scratchMultiVectors.clear();
+    }
+
+    // Returns the index-th scratch multivector stored under vecBlockSize.
+    // Throws (AssertThrow) when no pool exists for vecBlockSize or when index
+    // is outside that pool.
+    template
+    dftfe::linearAlgebra::MultiVector &
+    FEBasisOperationsBase<
+      ValueTypeBasisCoeff,
+      ValueTypeBasisData,
+      memorySpace>::getMultiVector(const unsigned int vecBlockSize,
+                                   const unsigned int index) const
+    {
+      // Single lookup: the iterator is reused for the access below.
+      const auto poolIter = scratchMultiVectors.find(vecBlockSize);
+      AssertThrow(poolIter != scratchMultiVectors.end(),
+                  dealii::ExcMessage(
+                    "DFT-FE Error: MultiVector not found in scratch storage."));
+      // Guard the subscript; an out-of-range index was previously undefined
+      // behaviour.
+      AssertThrow(
+        index < poolIter->second.size(),
+        dealii::ExcMessage(
+          "DFT-FE Error: MultiVector index out of range in scratch storage."));
+      return poolIter->second[index];
+    }
+
+
+    // Applies the stored constraints to multiVector, using d_nVectors as the
+    // block size.
+    template
+    void
+    FEBasisOperationsBase::
+      distribute(
+        dftfe::linearAlgebra::MultiVector
+          &multiVector) const
+    {
+      d_constraintInfo.distribute(multiVector, d_nVectors);
+    }
+
+
+  } // namespace basis
+} // namespace dftfe
diff --git a/utils/FEBasisOperationsDevice.t.cc b/utils/FEBasisOperationsDevice.t.cc
new file mode 100644
index 000000000..ef79fe1d5
--- /dev/null
+++ b/utils/FEBasisOperationsDevice.t.cc
@@ -0,0 +1,392 @@
+// ---------------------------------------------------------------------
+//
+// Copyright (c) 2017-2022 The Regents of the University of Michigan and DFT-FE
+// authors.
+//
+// This file is part of the DFT-FE code.
+//
+// The DFT-FE code is free software; you can use it, redistribute
+// it, and/or modify it under the terms of the GNU Lesser General
+// Public License as published by the Free Software Foundation; either
+// version 2.1 of the License, or (at your option) any later version.
+// The full text of the license can be found in the file LICENSE at
+// the top level of the DFT-FE distribution.
+//
+// ---------------------------------------------------------------------
+//
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+namespace dftfe
+{
+  namespace basis
+  {
+    // Interpolates nodalData to quadrature points for all d_nCells cells.
+    template
+    void
+    FEBasisOperations::
+      interpolate(
+        dftfe::linearAlgebra::MultiVector
+          &                  nodalData,
+        ValueTypeBasisCoeff *quadratureValues,
+        ValueTypeBasisCoeff *quadratureGradients) const
+    {
+      interpolateKernel(nodalData,
+                        quadratureValues,
+                        quadratureGradients,
+                        std::pair(0, d_nCells));
+    }
+
+    // Forwards to integrateWithBasisKernel over all d_nCells cells.
+    template
+    void
+    FEBasisOperations::
+      integrateWithBasis(
+        ValueTypeBasisCoeff *quadratureValues,
+        ValueTypeBasisCoeff *quadratureGradients,
+        dftfe::linearAlgebra::MultiVector
+          &nodalData) const
+    {
+      integrateWithBasisKernel(quadratureValues,
+                               quadratureGradients,
+                               nodalData,
+                               std::pair(0, d_nCells));
+    }
+
+
+    // Gathers the cell-level nodal data of all d_nCells cells.
+    template
+    void
+    FEBasisOperations::
+      extractToCellNodalData(
+        dftfe::linearAlgebra::MultiVector
+          &                  nodalData,
+        ValueTypeBasisCoeff *cellNodalDataPtr) const
+    {
+      extractToCellNodalDataKernel(nodalData,
+                                   cellNodalDataPtr,
+                                   std::pair(0, d_nCells));
+    }
+
+    // Adds the cell-level nodal data of all d_nCells cells into nodalData.
+    template
+    void
+    FEBasisOperations::
+      accumulateFromCellNodalData(
+        const ValueTypeBasisCoeff *cellNodalDataPtr,
+        dftfe::linearAlgebra::MultiVector
+          &nodalData) const
+    {
+      accumulateFromCellNodalDataKernel(cellNodalDataPtr,
+                                        nodalData,
+                                        std::pair(0, d_nCells));
+    }
+
+
+
+    // Interpolation from a multivector: gathers the cells of cellRange into
+    // tempCellNodalData and then runs the cell-level overload below.
+    template
+    void
+    FEBasisOperations::
+      interpolateKernel(
+        const dftfe::linearAlgebra::MultiVector<
+          ValueTypeBasisCoeff,
+          dftfe::utils::MemorySpace::DEVICE> &nodalValues,
+        ValueTypeBasisCoeff *                 quadratureValues,
+        ValueTypeBasisCoeff *                 quadratureGradients,
+        const std::pair cellRange) const
+    {
+      extractToCellNodalDataKernel(nodalValues,
+                                   tempCellNodalData.data(),
+                                   cellRange);
+      interpolateKernel(tempCellNodalData.data(),
+                        quadratureValues,
+                        quadratureGradients,
+                        cellRange);
+    }
+
+    // Interpolation from cell-level nodal values for the cells of cellRange.
+    // Either output pointer may be null, in which case that quantity is
+    // skipped. All pointer arguments are indexed relative to cellRange.first;
+    // only the inverse-Jacobian data is indexed by absolute cell id.
+    template
+    void
+    FEBasisOperations::
+      interpolateKernel(
+        const ValueTypeBasisCoeff *cellNodalValues,
+        ValueTypeBasisCoeff *      quadratureValues,
+        ValueTypeBasisCoeff *      quadratureGradients,
+        const std::pair cellRange) const
+    {
+      const ValueTypeBasisCoeff scalarCoeffAlpha = ValueTypeBasisCoeff(1.0),
+                                scalarCoeffBeta  = ValueTypeBasisCoeff(0.0);
+      const auto nQuads        = d_nQuadsPerCell[d_quadratureID];
+      const auto nCellsInRange = cellRange.second - cellRange.first;
+
+      // Values: one GEMM per cell against the shared shape-function matrix
+      // (stride 0 on the second operand).
+      if (quadratureValues != nullptr)
+        dftfe::utils::deviceBlasWrapper::gemmStridedBatched(
+          *d_deviceBlasHandlePtr,
+          dftfe::utils::DEVICEBLAS_OP_N,
+          dftfe::utils::DEVICEBLAS_OP_N,
+          d_nVectors,
+          nQuads,
+          d_nDofsPerCell,
+          &scalarCoeffAlpha,
+          cellNodalValues,
+          d_nVectors,
+          d_nVectors * d_nDofsPerCell,
+          d_shapeFunctionData[d_quadratureID].data(),
+          d_nDofsPerCell,
+          0,
+          &scalarCoeffBeta,
+          quadratureValues,
+          d_nVectors,
+          d_nVectors * nQuads,
+          nCellsInRange);
+
+      if (quadratureGradients != nullptr)
+        {
+          // Reference-space gradients. In the Cartesian case they are written
+          // straight to the output and scaled in place afterwards; otherwise
+          // they go to a temporary that is transformed below.
+          dftfe::utils::deviceBlasWrapper::gemmStridedBatched(
+            *d_deviceBlasHandlePtr,
+            dftfe::utils::DEVICEBLAS_OP_N,
+            dftfe::utils::DEVICEBLAS_OP_N,
+            d_nVectors,
+            nQuads * 3,
+            d_nDofsPerCell,
+            &scalarCoeffAlpha,
+            cellNodalValues,
+            d_nVectors,
+            d_nVectors * d_nDofsPerCell,
+            d_shapeFunctionGradientDataInternalLayout[d_quadratureID].data(),
+            d_nDofsPerCell,
+            0,
+            &scalarCoeffBeta,
+            areAllCellsCartesian ? quadratureGradients :
+                                   tempQuadratureGradientsData.data(),
+            d_nVectors,
+            d_nVectors * nQuads * 3,
+            nCellsInRange);
+          if (areAllCellsCartesian)
+            {
+              // Three inverse-Jacobian entries per cell, one per block of
+              // nQuads * d_nVectors values.
+              dftfe::utils::deviceKernelsGeneric::stridedBlockScale(
+                nQuads * d_nVectors,
+                3 * nCellsInRange,
+                ValueTypeBasisCoeff(1.0),
+                d_inverseJacobianData[0].data() + cellRange.first * 3,
+                quadratureGradients);
+            }
+          else if (areAllCellsAffine)
+            {
+              // One 3x3 inverse Jacobian per cell.
+              dftfe::utils::deviceBlasWrapper::gemmStridedBatched(
+                *d_deviceBlasHandlePtr,
+                dftfe::utils::DEVICEBLAS_OP_N,
+                dftfe::utils::DEVICEBLAS_OP_N,
+                nQuads * d_nVectors,
+                3,
+                3,
+                &scalarCoeffAlpha,
+                tempQuadratureGradientsData.data(),
+                nQuads * d_nVectors,
+                nQuads * d_nVectors * 3,
+                d_inverseJacobianData[0].data() + 9 * cellRange.first,
+                3,
+                9,
+                &scalarCoeffBeta,
+                quadratureGradients,
+                nQuads * d_nVectors,
+                d_nVectors * nQuads * 3,
+                nCellsInRange);
+            }
+          else
+            {
+              // One 3x3 inverse Jacobian per quadrature point; the result is
+              // then reordered into the output layout by the reshape kernel.
+              dftfe::utils::deviceBlasWrapper::gemmStridedBatched(
+                *d_deviceBlasHandlePtr,
+                dftfe::utils::DEVICEBLAS_OP_N,
+                dftfe::utils::DEVICEBLAS_OP_N,
+                d_nVectors,
+                3,
+                3,
+                &scalarCoeffAlpha,
+                tempQuadratureGradientsData.data(),
+                d_nVectors,
+                d_nVectors * 3,
+                d_inverseJacobianData[d_quadratureID].data() +
+                  9 * cellRange.first * nQuads,
+                3,
+                9,
+                &scalarCoeffBeta,
+                tempQuadratureGradientsDataNonAffine.data(),
+                d_nVectors,
+                d_nVectors * 3,
+                nCellsInRange * nQuads);
+              dftfe::basis::FEBasisOperationsKernelsDevice::
+                reshapeNonAffineCase(
+                  d_nVectors,
+                  nQuads,
+                  nCellsInRange,
+                  tempQuadratureGradientsDataNonAffine.data(),
+                  quadratureGradients);
+            }
+        }
+    }
+
+    // NOTE(review): empty body -- on the device this performs no work, so
+    // integrateWithBasis() silently leaves nodalData untouched. Confirm that
+    // no caller depends on it yet.
+    template
+    void
+    FEBasisOperations::
+      integrateWithBasisKernel(
+        const ValueTypeBasisCoeff *quadratureValues,
+        const ValueTypeBasisCoeff *quadratureGradients,
+        dftfe::linearAlgebra::MultiVector
+          &             nodalData,
+        const std::pair cellRange) const
+    {}
+
+    // Gathers the nodal data of the cells in cellRange into cellNodalDataPtr
+    // (written from its start), using the flattened cell-dof -> process-dof
+    // map offset to cellRange.first.
+    template
+    void
+    FEBasisOperations::
+      extractToCellNodalDataKernel(
+        const dftfe::linearAlgebra::MultiVector<
+          ValueTypeBasisCoeff,
+          dftfe::utils::MemorySpace::DEVICE> &nodalData,
+        ValueTypeBasisCoeff *                 cellNodalDataPtr,
+        const std::pair cellRange) const
+    {
+      dftfe::utils::deviceKernelsGeneric::stridedCopyToBlock(
+        d_nVectors,
+        (cellRange.second - cellRange.first) * d_nDofsPerCell,
+        nodalData.data(),
+        cellNodalDataPtr,
+        d_flattenedCellDofIndexToProcessDofIndexMap.data() +
+          cellRange.first * d_nDofsPerCell);
+    }
+
+    // Adds the cell-level data in cellNodalDataPtr (read from its start) into
+    // nodalData for the cells in cellRange.
+    template
+    void
+    FEBasisOperations::
+      accumulateFromCellNodalDataKernel(
+        const ValueTypeBasisCoeff *cellNodalDataPtr,
+        dftfe::linearAlgebra::MultiVector
+          &             nodalData,
+        const std::pair cellRange) const
+    {
+      dftfe::utils::deviceKernelsGeneric::axpyStridedBlockAtomicAdd(
+        d_nVectors,
+        (cellRange.second - cellRange.first) * d_nDofsPerCell,
+        cellNodalDataPtr,
+        nodalData.begin(),
+        d_flattenedCellDofIndexToProcessDofIndexMap.begin() +
+          cellRange.first * d_nDofsPerCell);
+    }
+
+    // Stores the (non-owned) BLAS handle pointer used by the kernels above.
+    template
+    void
+    FEBasisOperations::
+      setDeviceBLASHandle(dftfe::utils::deviceBlasHandle_t *deviceBlasHandlePtr)
+    {
+      d_deviceBlasHandlePtr = deviceBlasHandlePtr;
+    }
+
+    // Returns the BLAS handle previously registered via setDeviceBLASHandle.
+    template
+    dftfe::utils::deviceBlasHandle_t &
+    FEBasisOperations::getDeviceBLASHandle()
+    {
+      return *d_deviceBlasHandlePtr;
+    }
+  } // namespace basis
+} // namespace dftfe
diff --git a/utils/FEBasisOperationsHost.t.cc b/utils/FEBasisOperationsHost.t.cc
new file mode 100644
index 000000000..8b17a7265
--- /dev/null
+++ b/utils/FEBasisOperationsHost.t.cc
@@ -0,0 +1,461 @@
+// ---------------------------------------------------------------------
+//
+// Copyright (c) 2017-2022 The Regents of the University of Michigan and DFT-FE
+// authors.
+//
+// This file is part of the DFT-FE code.
+//
+// The DFT-FE code is free software; you can use it, redistribute
+// it, and/or modify it under the terms of the GNU Lesser General
+// Public License as published by the Free Software Foundation; either
+// version 2.1 of the License, or (at your option) any later version.
+// The full text of the license can be found in the file LICENSE at
+// the top level of the DFT-FE distribution.
+//
+// ---------------------------------------------------------------------
+//
+
+#include
+#include
+namespace dftfe
+{
+  namespace basis
+  {
+    // Interpolates nodalData to quadrature points for all d_nCells cells.
+    template
+    void
+    FEBasisOperations::
+      interpolate(
+        dftfe::linearAlgebra::MultiVector
+          &                  nodalData,
+        ValueTypeBasisCoeff *quadratureValues,
+        ValueTypeBasisCoeff *quadratureGradients) const
+    {
+      interpolateKernel(nodalData,
+                        quadratureValues,
+                        quadratureGradients,
+                        std::pair(0, d_nCells));
+    }
+
+    // Forwards to integrateWithBasisKernel over all d_nCells cells.
+    template
+    void
+    FEBasisOperations::
+      integrateWithBasis(
+        ValueTypeBasisCoeff *quadratureValues,
+        ValueTypeBasisCoeff *quadratureGradients,
+        dftfe::linearAlgebra::MultiVector
+          &nodalData) const
+    {
+      integrateWithBasisKernel(quadratureValues,
+                               quadratureGradients,
+                               nodalData,
+                               std::pair(0, d_nCells));
+    }
+
+
+    // Gathers the cell-level nodal data of all d_nCells cells.
+    template
+    void
+    FEBasisOperations::
+      extractToCellNodalData(
+        dftfe::linearAlgebra::MultiVector
+          &                  nodalData,
+        ValueTypeBasisCoeff *cellNodalDataPtr) const
+    {
+      extractToCellNodalDataKernel(nodalData,
+                                   cellNodalDataPtr,
+                                   std::pair(0, d_nCells));
+    }
+
+    // Adds the cell-level nodal data of all d_nCells cells into nodalData.
+    template
+    void
+    FEBasisOperations::
+      accumulateFromCellNodalData(
+        const ValueTypeBasisCoeff *cellNodalDataPtr,
+        dftfe::linearAlgebra::MultiVector
+          &nodalData) const
+    {
+      accumulateFromCellNodalDataKernel(cellNodalDataPtr,
+                                        nodalData,
+                                        std::pair(0, d_nCells));
+    }
+
+    // Interpolation from a multivector, one cell at a time: each cell is
+    // gathered into tempCellNodalData and passed to the cell-level overload.
+    // Outputs are laid out cell after cell, relative to cellRange.first.
+    template
+    void
+    FEBasisOperations::
+      interpolateKernel(
+        const dftfe::linearAlgebra::MultiVector
+          &                  nodalValues,
+        ValueTypeBasisCoeff *quadratureValues,
+        ValueTypeBasisCoeff *quadratureGradients,
+        const std::pair cellRange) const
+    {
+      const unsigned int nQuadEntriesPerCell =
+        d_nQuadsPerCell[d_quadratureID] * d_nVectors;
+      for (unsigned int iCell = cellRange.first; iCell < cellRange.second;
+           ++iCell)
+        {
+          const unsigned int iCellInRange = iCell - cellRange.first;
+          extractToCellNodalDataKernel(nodalValues,
+                                       tempCellNodalData.data(),
+                                       std::pair(iCell, iCell + 1));
+          // The cell-level overload writes relative to the start of the range
+          // it is given, which here is always the single cell iCell. The
+          // output pointers therefore have to be advanced to this cell's
+          // slot; passing them unchanged made every cell overwrite the first
+          // slot.
+          interpolateKernel(
+            tempCellNodalData.data(),
+            quadratureValues != nullptr ?
+              quadratureValues + iCellInRange * nQuadEntriesPerCell :
+              nullptr,
+            quadratureGradients != nullptr ?
+              quadratureGradients + iCellInRange * 3 * nQuadEntriesPerCell :
+              nullptr,
+            std::pair(iCell, iCell + 1));
+        }
+    }
+
+    // Interpolation from cell-level nodal values for the cells of cellRange.
+    // Either output pointer may be null, in which case that quantity is
+    // skipped. cellNodalValues and both outputs are indexed relative to
+    // cellRange.first; the inverse-Jacobian data is indexed by absolute cell
+    // id.
+    template
+    void
+    FEBasisOperations::
+      interpolateKernel(
+        const ValueTypeBasisCoeff *cellNodalValues,
+        ValueTypeBasisCoeff *      quadratureValues,
+        ValueTypeBasisCoeff *      quadratureGradients,
+        const std::pair cellRange) const
+    {
+      const ValueTypeBasisCoeff scalarCoeffAlpha = ValueTypeBasisCoeff(1.0),
+                                scalarCoeffBeta  = ValueTypeBasisCoeff(0.0);
+      const char                transA = 'N', transB = 'N';
+      const unsigned int        one = 1, three = 3;
+      const unsigned int        d_nQuadsPerCellTimesThree =
+        d_nQuadsPerCell[d_quadratureID] * 3;
+      const unsigned int d_nQuadsPerCellTimesnVectors =
+        d_nQuadsPerCell[d_quadratureID] * d_nVectors;
+
+      for (unsigned int iCell = cellRange.first; iCell < cellRange.second;
+           ++iCell)
+        {
+          const unsigned int iCellInRange = iCell - cellRange.first;
+          const ValueTypeBasisCoeff *cellInput =
+            cellNodalValues + d_nDofsPerCell * iCellInRange * d_nVectors;
+
+          if (quadratureValues != nullptr)
+            xgemm(&transA,
+                  &transB,
+                  &d_nVectors,
+                  &d_nQuadsPerCell[d_quadratureID],
+                  &d_nDofsPerCell,
+                  &scalarCoeffAlpha,
+                  cellInput,
+                  &d_nVectors,
+                  d_shapeFunctionData[d_quadratureID].data(),
+                  &d_nDofsPerCell,
+                  &scalarCoeffBeta,
+                  quadratureValues + d_nQuadsPerCell[d_quadratureID] *
+                                       iCellInRange * d_nVectors,
+                  &d_nVectors);
+
+          if (quadratureGradients == nullptr)
+            continue;
+
+          ValueTypeBasisCoeff *cellGradients =
+            quadratureGradients +
+            d_nQuadsPerCell[d_quadratureID] * d_nVectors * 3 * iCellInRange;
+
+          // Reference-space gradients. In the Cartesian case they are written
+          // straight to the output and scaled in place afterwards; otherwise
+          // they go to a temporary that is transformed below.
+          xgemm(&transA,
+                &transB,
+                &d_nVectors,
+                &d_nQuadsPerCellTimesThree,
+                &d_nDofsPerCell,
+                &scalarCoeffAlpha,
+                cellInput,
+                &d_nVectors,
+                d_shapeFunctionGradientDataInternalLayout[d_quadratureID]
+                  .data(),
+                &d_nDofsPerCell,
+                &scalarCoeffBeta,
+                areAllCellsCartesian ? cellGradients :
+                                       tempQuadratureGradientsData.data(),
+                &d_nVectors);
+
+          if (areAllCellsCartesian)
+            {
+              // One inverse-Jacobian entry per direction and cell.
+              for (unsigned int iDim = 0; iDim < 3; ++iDim)
+                xscal(&d_nQuadsPerCellTimesnVectors,
+                      d_inverseJacobianData[0].data() + 3 * iCell + iDim,
+                      cellGradients +
+                        d_nQuadsPerCell[d_quadratureID] * d_nVectors * iDim,
+                      &one);
+            }
+          else if (areAllCellsAffine)
+            {
+              // One 3x3 inverse Jacobian per cell.
+              xgemm(&transA,
+                    &transB,
+                    &d_nQuadsPerCellTimesnVectors,
+                    &three,
+                    &three,
+                    &scalarCoeffAlpha,
+                    tempQuadratureGradientsData.data(),
+                    &d_nQuadsPerCellTimesnVectors,
+                    d_inverseJacobianData[0].data() + 9 * iCell,
+                    &three,
+                    &scalarCoeffBeta,
+                    cellGradients,
+                    &d_nQuadsPerCellTimesnVectors);
+            }
+          else
+            {
+              // One 3x3 inverse Jacobian per quadrature point ...
+              for (unsigned int iQuad = 0;
+                   iQuad < d_nQuadsPerCell[d_quadratureID];
+                   ++iQuad)
+                xgemm(&transA,
+                      &transB,
+                      &d_nVectors,
+                      &three,
+                      &three,
+                      &scalarCoeffAlpha,
+                      tempQuadratureGradientsData.data() +
+                        iQuad * d_nVectors * 3,
+                      &d_nVectors,
+                      d_inverseJacobianData[d_quadratureID].data() +
+                        9 * d_nQuadsPerCell[d_quadratureID] * iCell +
+                        9 * iQuad,
+                      &three,
+                      &scalarCoeffBeta,
+                      tempQuadratureGradientsDataNonAffine.data() +
+                        iQuad * d_nVectors * 3,
+                      &d_nVectors);
+              // ... followed by a reorder from (quad, dim, vec) to
+              // (dim, quad, vec) in the output.
+              for (unsigned int iQuad = 0;
+                   iQuad < d_nQuadsPerCell[d_quadratureID];
+                   ++iQuad)
+                for (unsigned int iDim = 0; iDim < 3; ++iDim)
+                  std::memcpy(cellGradients +
+                                d_nVectors *
+                                  d_nQuadsPerCell[d_quadratureID] * iDim +
+                                d_nVectors * iQuad,
+                              tempQuadratureGradientsDataNonAffine.data() +
+                                d_nVectors * 3 * iQuad + d_nVectors * iDim,
+                              d_nVectors * sizeof(ValueTypeBasisCoeff));
+            }
+        }
+    }
+
+    // Contracts quadrature data with the basis for the cells of cellRange and
+    // accumulates the result into nodalData. quadratureValues and
+    // quadratureGradients are indexed by absolute cell id.
+    //
+    // NOTE(review): only the per-cell pointer strides were corrected here.
+    // The following look unfinished and should be confirmed by the author:
+    //  - quadratureValues is dereferenced without a null check;
+    //  - the gradient GEMM uses beta = 0 into the same cell slot as the value
+    //    GEMM, so it replaces rather than adds to that result;
+    //  - the GEMM shapes / leading dimensions passed with transB = 'T' do not
+    //    obviously match the layouts used in interpolateKernel.
+    template
+    void
+    FEBasisOperations::
+      integrateWithBasisKernel(
+        const ValueTypeBasisCoeff *quadratureValues,
+        const ValueTypeBasisCoeff *quadratureGradients,
+        dftfe::linearAlgebra::MultiVector
+          &             nodalData,
+        const std::pair cellRange) const
+    {
+      dftfe::utils::MemoryStorage
+        cellNodalData, tempQuadratureGradientsData,
+        tempQuadratureGradientsDataNonAffine;
+      cellNodalData.resize(d_nVectors * d_nDofsPerCell * d_nCells);
+      if (quadratureGradients != nullptr)
+        {
+          tempQuadratureGradientsData.resize(3 * d_nVectors *
+                                             d_nQuadsPerCell[d_quadratureID]);
+          tempQuadratureGradientsDataNonAffine.resize(
+            areAllCellsAffine ?
+              0 :
+              (3 * d_nVectors * d_nQuadsPerCell[d_quadratureID]));
+        }
+
+      const ValueTypeBasisCoeff scalarCoeffAlpha = ValueTypeBasisCoeff(1.0),
+                                scalarCoeffBeta  = ValueTypeBasisCoeff(0.0);
+      const char                transA = 'N', transB = 'T';
+      const unsigned int        one = 1, three = 3;
+      const unsigned int        d_nQuadsPerCellTimesThree =
+        d_nQuadsPerCell[d_quadratureID] * 3;
+      const unsigned int d_nQuadsPerCellTimesnVectors =
+        d_nQuadsPerCell[d_quadratureID] * d_nVectors;
+
+      for (unsigned int iCell = cellRange.first; iCell < cellRange.second;
+           ++iCell)
+        {
+          // Each cell owns d_nVectors * d_nDofsPerCell entries of
+          // cellNodalData (see the resize above); the d_nVectors factor was
+          // previously missing from this offset and from the quadratureValues
+          // offset below.
+          ValueTypeBasisCoeff *cellSlot =
+            cellNodalData.data() + d_nVectors * d_nDofsPerCell * iCell;
+
+          xgemm(&transA,
+                &transB,
+                &d_nVectors,
+                &d_nDofsPerCell,
+                &d_nQuadsPerCell[d_quadratureID],
+                &scalarCoeffAlpha,
+                quadratureValues +
+                  d_nQuadsPerCell[d_quadratureID] * d_nVectors * iCell,
+                &d_nVectors,
+                d_shapeFunctionData[d_quadratureID].data(),
+                &d_nQuadsPerCell[d_quadratureID],
+                &scalarCoeffBeta,
+                cellSlot,
+                &d_nVectors);
+          if (quadratureGradients != nullptr)
+            {
+              const ValueTypeBasisCoeff *cellGradients =
+                quadratureGradients +
+                d_nQuadsPerCell[d_quadratureID] * d_nVectors * 3 * iCell;
+              if (areAllCellsCartesian)
+                {
+                  std::memcpy(tempQuadratureGradientsData.data(),
+                              cellGradients,
+                              3 * d_nQuadsPerCellTimesnVectors *
+                                sizeof(ValueTypeBasisCoeff));
+                  for (unsigned int iDim = 0; iDim < 3; ++iDim)
+                    xscal(&d_nQuadsPerCellTimesnVectors,
+                          d_inverseJacobianData[0].data() + 3 * iCell + iDim,
+                          tempQuadratureGradientsData.data() +
+                            d_nQuadsPerCell[d_quadratureID] * d_nVectors *
+                              iDim,
+                          &one);
+                }
+              else if (areAllCellsAffine)
+                {
+                  xgemm(&transA,
+                        &transB,
+                        &d_nQuadsPerCellTimesnVectors,
+                        &three,
+                        &three,
+                        &scalarCoeffAlpha,
+                        cellGradients,
+                        &d_nQuadsPerCellTimesnVectors,
+                        d_inverseJacobianData[0].data() + 9 * iCell,
+                        &three,
+                        &scalarCoeffBeta,
+                        tempQuadratureGradientsData.data(),
+                        &d_nQuadsPerCellTimesnVectors);
+                }
+              else
+                {
+                  // Reorder from (dim, quad, vec) to (quad, dim, vec) ...
+                  for (unsigned int iQuad = 0;
+                       iQuad < d_nQuadsPerCell[d_quadratureID];
+                       ++iQuad)
+                    for (unsigned int iDim = 0; iDim < 3; ++iDim)
+                      std::memcpy(
+                        tempQuadratureGradientsDataNonAffine.data() +
+                          d_nVectors * 3 * iQuad + d_nVectors * iDim,
+                        cellGradients +
+                          d_nVectors * d_nQuadsPerCell[d_quadratureID] *
+                            iDim +
+                          d_nVectors * iQuad,
+                        d_nVectors * sizeof(ValueTypeBasisCoeff));
+                  // ... then apply one 3x3 inverse Jacobian per quadrature
+                  // point.
+                  for (unsigned int iQuad = 0;
+                       iQuad < d_nQuadsPerCell[d_quadratureID];
+                       ++iQuad)
+                    xgemm(&transA,
+                          &transB,
+                          &d_nVectors,
+                          &three,
+                          &three,
+                          &scalarCoeffAlpha,
+                          tempQuadratureGradientsDataNonAffine.data() +
+                            d_nVectors * 3 * iQuad,
+                          &d_nVectors,
+                          d_inverseJacobianData[d_quadratureID].data() +
+                            9 * d_nQuadsPerCell[d_quadratureID] * iCell +
+                            9 * iQuad,
+                          &three,
+                          &scalarCoeffBeta,
+                          tempQuadratureGradientsData.data() +
+                            d_nVectors * 3 * iQuad,
+                          &d_nVectors);
+                }
+              xgemm(&transA,
+                    &transB,
+                    &d_nVectors,
+                    &d_nQuadsPerCellTimesThree,
+                    &d_nDofsPerCell,
+                    &scalarCoeffAlpha,
+                    tempQuadratureGradientsData.data(),
+                    &d_nVectors,
+                    d_shapeFunctionGradientDataInternalLayout[d_quadratureID]
+                      .data(),
+                    &d_nDofsPerCell,
+                    &scalarCoeffBeta,
+                    cellSlot,
+                    &d_nVectors);
+            }
+          // The accumulate kernel reads relative to the start of the range it
+          // is given, so hand it this cell's slot.
+          accumulateFromCellNodalDataKernel(cellSlot,
+                                            nodalData,
+                                            std::pair(iCell, iCell + 1));
+        }
+    }
+
+    // Gathers the nodal data of the cells in cellRange into cellNodalDataPtr.
+    // The destination is indexed relative to cellRange.first; the dof map is
+    // indexed by absolute cell id.
+    template
+    void
+    FEBasisOperations::
+      extractToCellNodalDataKernel(
+        const dftfe::linearAlgebra::MultiVector
+          &                  nodalData,
+        ValueTypeBasisCoeff *cellNodalDataPtr,
+        const std::pair cellRange) const
+    {
+      for (unsigned int iCell = cellRange.first; iCell < cellRange.second;
+           ++iCell)
+        for (unsigned int iDof = 0; iDof < d_nDofsPerCell; ++iDof)
+          std::memcpy(cellNodalDataPtr +
+                        (iCell - cellRange.first) * d_nVectors *
+                          d_nDofsPerCell +
+                        iDof * d_nVectors,
+                      nodalData.data() +
+                        d_flattenedCellDofIndexToProcessDofIndexMap
+                          [iCell * d_nDofsPerCell + iDof],
+                      d_nVectors * sizeof(ValueTypeBasisCoeff));
+    }
+
+    // Adds the cell-level data in cellNodalDataPtr into nodalData for the
+    // cells in cellRange. The source is indexed relative to cellRange.first,
+    // matching extractToCellNodalDataKernel above and the device
+    // implementation; it was previously indexed by absolute cell id.
+    template
+    void
+    FEBasisOperations::
+      accumulateFromCellNodalDataKernel(
+        const ValueTypeBasisCoeff *cellNodalDataPtr,
+        dftfe::linearAlgebra::MultiVector
+          &             nodalData,
+        const std::pair cellRange) const
+    {
+      for (unsigned int iCell = cellRange.first; iCell < cellRange.second;
+           ++iCell)
+        for (unsigned int iDof = 0; iDof < d_nDofsPerCell; ++iDof)
+          {
+            const ValueTypeBasisCoeff *source =
+              cellNodalDataPtr +
+              (iCell - cellRange.first) * d_nVectors * d_nDofsPerCell +
+              iDof * d_nVectors;
+            ValueTypeBasisCoeff *target =
+              nodalData.data() + d_flattenedCellDofIndexToProcessDofIndexMap
+                                   [iCell * d_nDofsPerCell + iDof];
+            std::transform(
+              source, source + d_nVectors, target, target, std::plus());
+          }
+    }
+  } // namespace basis
+} // namespace dftfe
diff --git a/utils/FEBasisOperationsKernelsDevice.cc b/utils/FEBasisOperationsKernelsDevice.cc
new file mode 100644
index 000000000..2ad2a4706
--- /dev/null
+++ b/utils/FEBasisOperationsKernelsDevice.cc
@@ -0,0 +1,110 @@
+//
---------------------------------------------------------------------
+//
+// Copyright (c) 2017-2022 The Regents of the University of Michigan and DFT-FE
+// authors.
+//
+// This file is part of the DFT-FE code.
+//
+// The DFT-FE code is free software; you can use it, redistribute
+// it, and/or modify it under the terms of the GNU Lesser General
+// Public License as published by the Free Software Foundation; either
+// version 2.1 of the License, or (at your option) any later version.
+// The full text of the license can be found in the file LICENSE at
+// the top level of the DFT-FE distribution.
+//
+// ---------------------------------------------------------------------
+//
+
+#include
+#include
+#include
+#include
+#include
+
+
+namespace dftfe
+{
+  namespace
+  {
+    // Reorders gradient data from (cell, quad, dim, vec) in copyFromVec to
+    // (cell, dim, quad, vec) in copyToVec. Each thread walks the
+    // 3 * numQuads * numCells * numVecs destination entries in a grid-stride
+    // loop.
+    template
+    __global__ void
+    reshapeNonAffineCaseDeviceKernel(const dftfe::size_type numVecs,
+                                     const dftfe::size_type numQuads,
+                                     const dftfe::size_type numCells,
+                                     const ValueType1 *     copyFromVec,
+                                     ValueType2 *           copyToVec)
+    {
+      const dftfe::size_type globalThreadId =
+        blockIdx.x * blockDim.x + threadIdx.x;
+      const dftfe::size_type numberEntries = numQuads * numCells * numVecs * 3;
+
+      for (dftfe::size_type index = globalThreadId; index < numberEntries;
+           index += blockDim.x * gridDim.x)
+        {
+          // Decompose the destination index, fastest to slowest:
+          // vec, quad, dim, cell.
+          dftfe::size_type blockIndex  = index / numVecs;
+          dftfe::size_type iVec        = index - blockIndex * numVecs;
+          dftfe::size_type blockIndex2 = blockIndex / numQuads;
+          dftfe::size_type iQuad       = blockIndex - blockIndex2 * numQuads;
+          dftfe::size_type iCell       = blockIndex2 / 3;
+          dftfe::size_type iDim        = blockIndex2 - iCell * 3;
+          dftfe::utils::copyValue(
+            copyToVec + index,
+            copyFromVec[iVec + iDim * numVecs + iQuad * 3 * numVecs +
+                        iCell * 3 * numQuads * numVecs]);
+        }
+    }
+  } // namespace
+  namespace basis
+  {
+    namespace FEBasisOperationsKernelsDevice
+    {
+      // Host-side launcher for reshapeNonAffineCaseDeviceKernel. The grid is
+      // sized as (number of entries) / DEVICE_BLOCK_SIZE + 1 blocks of
+      // DEVICE_BLOCK_SIZE threads.
+      template
+      void
+      reshapeNonAffineCase(const dftfe::size_type numVecs,
+                           const dftfe::size_type numQuads,
+                           const dftfe::size_type numCells,
+                           const ValueType1 *     copyFromVec,
+                           ValueType2 *           copyToVec)
+      {
+#ifdef DFTFE_WITH_DEVICE_LANG_CUDA
+        reshapeNonAffineCaseDeviceKernel<<<(numVecs * numCells * numQuads * 3) /
+                                               dftfe::utils::DEVICE_BLOCK_SIZE +
+                                             1,
+                                           dftfe::utils::DEVICE_BLOCK_SIZE>>>(
+          numVecs,
+          numQuads,
+          numCells,
+          dftfe::utils::makeDataTypeDeviceCompatible(copyFromVec),
+          dftfe::utils::makeDataTypeDeviceCompatible(copyToVec));
+// Test with defined() so this branch mirrors the #ifdef above and still
+// preprocesses when the macro is defined without a value.
+#elif defined(DFTFE_WITH_DEVICE_LANG_HIP)
+        hipLaunchKernelGGL(
+          reshapeNonAffineCaseDeviceKernel,
+          (numVecs * numCells * numQuads * 3) /
+              dftfe::utils::DEVICE_BLOCK_SIZE +
+            1,
+          dftfe::utils::DEVICE_BLOCK_SIZE,
+          0,
+          0,
+          numVecs,
+          numQuads,
+          numCells,
+          dftfe::utils::makeDataTypeDeviceCompatible(copyFromVec),
+          dftfe::utils::makeDataTypeDeviceCompatible(copyToVec));
+#endif
+      }
+      // Explicit instantiations for real and complex data.
+      template void
+      reshapeNonAffineCase(const dftfe::size_type numVecs,
+                           const dftfe::size_type numQuads,
+                           const dftfe::size_type numCells,
+                           const double *         copyFromVec,
+                           double *               copyToVec);
+      template void
+      reshapeNonAffineCase(const dftfe::size_type numVecs,
+                           const dftfe::size_type numQuads,
+                           const dftfe::size_type numCells,
+                           const std::complex *copyFromVec,
+                           std::complex *      copyToVec);
+
+    } // namespace FEBasisOperationsKernelsDevice
+  } // namespace basis
+} // namespace dftfe
diff --git a/utils/constraintMatrixInfo.cc b/utils/constraintMatrixInfo.cc
index 99098a218..7b3eb77a2 100644
--- a/utils/constraintMatrixInfo.cc
+++ b/utils/constraintMatrixInfo.cc
@@ -171,66 +171,6 @@ namespace dftfe
       }
   }
 
-
-  void
-  constraintMatrixInfo::precomputeMaps(
-    const std::shared_ptr
-      &unFlattenedPartitioner,
-    const std::shared_ptr
-      &                flattenedPartitioner,
-    const unsigned int blockSize)
-  {
-    //
-    // Get required sizes
-    //
-    const unsigned int n_ghosts  = unFlattenedPartitioner->n_ghost_indices();
-    const unsigned int localSize = unFlattenedPartitioner->local_size();
-    const unsigned int totalSize = n_ghosts + localSize;
-
-    d_localIndexMapUnflattenedToFlattened.clear();
-
d_localIndexMapUnflattenedToFlattened.resize(totalSize); - - // - // fill the data array - // - for (unsigned int ilocalDof = 0; ilocalDof < totalSize; ++ilocalDof) - { - const dealii::types::global_dof_index globalIndex = - unFlattenedPartitioner->local_to_global(ilocalDof); - d_localIndexMapUnflattenedToFlattened[ilocalDof] = - flattenedPartitioner->global_to_local(globalIndex * blockSize); - } - } - - void - constraintMatrixInfo::precomputeMaps( - const std::shared_ptr< - const utils::mpi::MPIPatternP2P> - & mpiPattern, - const unsigned int blockSize) - { - // - // Get required sizes - // - const unsigned int totalSize = - mpiPattern->localOwnedSize() + mpiPattern->localGhostSize(); - - d_localIndexMapUnflattenedToFlattened.clear(); - d_localIndexMapUnflattenedToFlattened.resize(totalSize); - - // - // fill the data array - // - for (unsigned int ilocalDof = 0; ilocalDof < totalSize; ++ilocalDof) - { - d_localIndexMapUnflattenedToFlattened[ilocalDof] = - (dealii::types::global_dof_index)ilocalDof * - (dealii::types::global_dof_index)blockSize; - } - } - - - // // set the constrained degrees of freedom to values so that constraints // are satisfied @@ -273,7 +213,7 @@ namespace dftfe d_inhomogenities[i]); const dealii::types::global_dof_index startingLocalDofIndexRow = - d_localIndexMapUnflattenedToFlattened[d_rowIdsLocal[i]]; + d_rowIdsLocal[i] * blockSize; for (unsigned int j = 0; j < d_rowSizes[i]; ++j) { @@ -284,8 +224,7 @@ namespace dftfe const dealii::types::global_dof_index startingLocalDofIndexColumn = - d_localIndexMapUnflattenedToFlattened - [d_columnIdsLocal[count]]; + d_columnIdsLocal[count] * blockSize; T alpha = d_columnValues[count]; @@ -323,7 +262,7 @@ namespace dftfe d_inhomogenities[i]); const dealii::types::global_dof_index startingLocalDofIndexRow = - d_localIndexMapUnflattenedToFlattened[d_rowIdsLocal[i]]; + d_rowIdsLocal[i] * blockSize; for (unsigned int j = 0; j < d_rowSizes[i]; ++j) { @@ -334,8 +273,7 @@ namespace dftfe const 
dealii::types::global_dof_index startingLocalDofIndexColumn = - d_localIndexMapUnflattenedToFlattened - [d_columnIdsLocal[count]]; + d_columnIdsLocal[count] * blockSize; T alpha = d_columnValues[count]; @@ -371,13 +309,12 @@ namespace dftfe for (unsigned int i = 0; i < d_rowIdsLocal.size(); ++i) { const dealii::types::global_dof_index startingLocalDofIndexRow = - d_localIndexMapUnflattenedToFlattened[d_rowIdsLocal[i]]; + d_rowIdsLocal[i] * blockSize; for (unsigned int j = 0; j < d_rowSizes[i]; ++j) { const dealii::types::global_dof_index startingLocalDofIndexColumn = - d_localIndexMapUnflattenedToFlattened - [d_columnIdsLocal[count]]; + d_columnIdsLocal[count] * blockSize; T alpha = d_columnValues[count]; callaxpy(&blockSize, @@ -411,13 +348,12 @@ namespace dftfe for (unsigned int i = 0; i < d_rowIdsLocal.size(); ++i) { const dealii::types::global_dof_index startingLocalDofIndexRow = - d_localIndexMapUnflattenedToFlattened[d_rowIdsLocal[i]]; + d_rowIdsLocal[i] * blockSize; for (unsigned int j = 0; j < d_rowSizes[i]; ++j) { const dealii::types::global_dof_index startingLocalDofIndexColumn = - d_localIndexMapUnflattenedToFlattened - [d_columnIdsLocal[count]]; + d_columnIdsLocal[count] * blockSize; T alpha = d_columnValues[count]; callaxpy(&blockSize, @@ -448,7 +384,7 @@ namespace dftfe for (unsigned int i = 0; i < d_rowIdsLocal.size(); ++i) { const dealii::types::global_dof_index startingLocalDofIndexRow = - d_localIndexMapUnflattenedToFlattened[d_rowIdsLocal[i]]; + d_rowIdsLocal[i] * blockSize; // set constrained nodes to zero std::fill(fieldVector.begin() + startingLocalDofIndexRow, @@ -465,7 +401,7 @@ namespace dftfe for (unsigned int i = 0; i < d_rowIdsLocal.size(); ++i) { const dealii::types::global_dof_index startingLocalDofIndexRow = - d_localIndexMapUnflattenedToFlattened[d_rowIdsLocal[i]]; + d_rowIdsLocal[i] * blockSize; // set constrained nodes to zero std::fill(fieldVector.data() + startingLocalDofIndexRow, @@ -488,7 +424,6 @@ namespace dftfe 
d_columnValues.clear(); d_inhomogenities.clear(); d_rowSizes.clear(); - d_localIndexMapUnflattenedToFlattened.clear(); } @@ -509,8 +444,13 @@ namespace dftfe template void constraintMatrixInfo::distribute( - distributedCPUMultiVec &fieldVector, - const unsigned int blockSize) const; + distributedCPUMultiVec &fieldVector, + const unsigned int blockSize) const; + + template void + constraintMatrixInfo::distribute( + distributedCPUMultiVec> &fieldVector, + const unsigned int blockSize) const; template void constraintMatrixInfo::distribute_slave_to_master( diff --git a/utils/constraintMatrixInfoDevice.cc b/utils/constraintMatrixInfoDevice.cc index e3434678c..119ef6fef 100644 --- a/utils/constraintMatrixInfoDevice.cc +++ b/utils/constraintMatrixInfoDevice.cc @@ -39,9 +39,7 @@ namespace dftfe const unsigned int *constraintRowSizesAccumulated, const unsigned int *constraintLocalColumnIdsAllRowsUnflattened, const double * constraintColumnValuesAllRowsUnflattened, - const double * inhomogenities, - const dealii::types::global_dof_index - *localIndexMapUnflattenedToFlattened) + const double * inhomogenities) { const dealii::types::global_dof_index globalThreadId = blockIdx.x * blockDim.x + threadIdx.x; @@ -60,7 +58,7 @@ namespace dftfe const unsigned int startingColumnNumber = constraintRowSizesAccumulated[blockIndex]; const dealii::types::global_dof_index xVecStartingIdRow = - localIndexMapUnflattenedToFlattened[constrainedRowId]; + constrainedRowId * contiguousBlockSize; xVec[xVecStartingIdRow + intraBlockIndex] = inhomogenities[blockIndex]; for (unsigned int i = 0; i < numberColumns; ++i) @@ -69,7 +67,7 @@ namespace dftfe constraintLocalColumnIdsAllRowsUnflattened [startingColumnNumber + i]; const dealii::types::global_dof_index xVecStartingIdColumn = - localIndexMapUnflattenedToFlattened[constrainedColumnId]; + constrainedColumnId * contiguousBlockSize; xVec[xVecStartingIdRow + intraBlockIndex] += constraintColumnValuesAllRowsUnflattened [startingColumnNumber + i] * @@ 
-89,9 +87,7 @@ namespace dftfe const unsigned int *constraintRowSizesAccumulated, const unsigned int *constraintLocalColumnIdsAllRowsUnflattened, const double * constraintColumnValuesAllRowsUnflattened, - const double * inhomogenities, - const dealii::types::global_dof_index - *localIndexMapUnflattenedToFlattened) + const double * inhomogenities) { const dealii::types::global_dof_index globalThreadId = blockIdx.x * blockDim.x + threadIdx.x; @@ -110,7 +106,7 @@ namespace dftfe const unsigned int startingColumnNumber = constraintRowSizesAccumulated[blockIndex]; const dealii::types::global_dof_index xVecStartingIdRow = - localIndexMapUnflattenedToFlattened[constrainedRowId]; + constrainedRowId * contiguousBlockSize; xVec[xVecStartingIdRow + intraBlockIndex] = inhomogenities[blockIndex]; for (unsigned int i = 0; i < numberColumns; ++i) @@ -119,7 +115,7 @@ namespace dftfe constraintLocalColumnIdsAllRowsUnflattened [startingColumnNumber + i]; const dealii::types::global_dof_index xVecStartingIdColumn = - localIndexMapUnflattenedToFlattened[constrainedColumnId]; + constrainedColumnId * contiguousBlockSize; xVec[xVecStartingIdRow + intraBlockIndex] += constraintColumnValuesAllRowsUnflattened [startingColumnNumber + i] * @@ -139,9 +135,7 @@ namespace dftfe const unsigned int * constraintRowSizesAccumulated, const unsigned int *constraintLocalColumnIdsAllRowsUnflattened, const double * constraintColumnValuesAllRowsUnflattened, - const double * inhomogenities, - const dealii::types::global_dof_index - *localIndexMapUnflattenedToFlattened) + const double * inhomogenities) { const dealii::types::global_dof_index globalThreadId = blockIdx.x * blockDim.x + threadIdx.x; @@ -160,7 +154,7 @@ namespace dftfe const unsigned int startingColumnNumber = constraintRowSizesAccumulated[blockIndex]; const dealii::types::global_dof_index xVecStartingIdRow = - localIndexMapUnflattenedToFlattened[constrainedRowId]; + constrainedRowId * contiguousBlockSize; dftfe::utils::copyValue(xVec + 
xVecStartingIdRow + intraBlockIndex, inhomogenities[blockIndex]); for (unsigned int i = 0; i < numberColumns; ++i) @@ -169,7 +163,7 @@ namespace dftfe constraintLocalColumnIdsAllRowsUnflattened [startingColumnNumber + i]; const dealii::types::global_dof_index xVecStartingIdColumn = - localIndexMapUnflattenedToFlattened[constrainedColumnId]; + constrainedColumnId * contiguousBlockSize; dftfe::utils::copyValue( xVec + xVecStartingIdRow + intraBlockIndex, dftfe::utils::add( @@ -196,9 +190,7 @@ namespace dftfe const unsigned int * constraintRowSizesAccumulated, const unsigned int *constraintLocalColumnIdsAllRowsUnflattened, const double * constraintColumnValuesAllRowsUnflattened, - const double * inhomogenities, - const dealii::types::global_dof_index - *localIndexMapUnflattenedToFlattened) + const double * inhomogenities) { const dealii::types::global_dof_index globalThreadId = blockIdx.x * blockDim.x + threadIdx.x; @@ -217,7 +209,7 @@ namespace dftfe const unsigned int startingColumnNumber = constraintRowSizesAccumulated[blockIndex]; const dealii::types::global_dof_index xVecStartingIdRow = - localIndexMapUnflattenedToFlattened[constrainedRowId]; + constrainedRowId * contiguousBlockSize; dftfe::utils::copyValue(xVec + xVecStartingIdRow + intraBlockIndex, inhomogenities[blockIndex]); for (unsigned int i = 0; i < numberColumns; ++i) @@ -226,7 +218,7 @@ namespace dftfe constraintLocalColumnIdsAllRowsUnflattened [startingColumnNumber + i]; const dealii::types::global_dof_index xVecStartingIdColumn = - localIndexMapUnflattenedToFlattened[constrainedColumnId]; + constrainedColumnId * contiguousBlockSize; dftfe::utils::copyValue( xVec + xVecStartingIdRow + intraBlockIndex, dftfe::utils::add( @@ -251,9 +243,7 @@ namespace dftfe const unsigned int *constraintRowSizes, const unsigned int *constraintRowSizesAccumulated, const unsigned int *constraintLocalColumnIdsAllRowsUnflattened, - const double * constraintColumnValuesAllRowsUnflattened, - const 
dealii::types::global_dof_index - *localIndexMapUnflattenedToFlattened) + const double * constraintColumnValuesAllRowsUnflattened) { const dealii::types::global_dof_index globalThreadId = blockIdx.x * blockDim.x + threadIdx.x; @@ -272,14 +262,14 @@ namespace dftfe const unsigned int startingColumnNumber = constraintRowSizesAccumulated[blockIndex]; const dealii::types::global_dof_index xVecStartingIdRow = - localIndexMapUnflattenedToFlattened[constrainedRowId]; + constrainedRowId * contiguousBlockSize; for (unsigned int i = 0; i < numberColumns; ++i) { const unsigned int constrainedColumnId = constraintLocalColumnIdsAllRowsUnflattened [startingColumnNumber + i]; const dealii::types::global_dof_index xVecStartingIdColumn = - localIndexMapUnflattenedToFlattened[constrainedColumnId]; + constrainedColumnId * contiguousBlockSize; atomicAdd(&(xVec[xVecStartingIdColumn + intraBlockIndex]), constraintColumnValuesAllRowsUnflattened [startingColumnNumber + i] * @@ -299,9 +289,7 @@ namespace dftfe const unsigned int *constraintRowSizes, const unsigned int *constraintRowSizesAccumulated, const unsigned int *constraintLocalColumnIdsAllRowsUnflattened, - const double * constraintColumnValuesAllRowsUnflattened, - const dealii::types::global_dof_index - *localIndexMapUnflattenedToFlattened) + const double * constraintColumnValuesAllRowsUnflattened) { const dealii::types::global_dof_index globalThreadId = blockIdx.x * blockDim.x + threadIdx.x; @@ -320,14 +308,14 @@ namespace dftfe const unsigned int startingColumnNumber = constraintRowSizesAccumulated[blockIndex]; const dealii::types::global_dof_index xVecStartingIdRow = - localIndexMapUnflattenedToFlattened[constrainedRowId]; + constrainedRowId * contiguousBlockSize; for (unsigned int i = 0; i < numberColumns; ++i) { const unsigned int constrainedColumnId = constraintLocalColumnIdsAllRowsUnflattened [startingColumnNumber + i]; const dealii::types::global_dof_index xVecStartingIdColumn = - 
localIndexMapUnflattenedToFlattened[constrainedColumnId]; + constrainedColumnId * contiguousBlockSize; const float tempfloatval = constraintColumnValuesAllRowsUnflattened [startingColumnNumber + i] * @@ -344,9 +332,7 @@ namespace dftfe setzeroKernel(const unsigned int contiguousBlockSize, double * xVec, const unsigned int *constraintLocalRowIdsUnflattened, - const unsigned int numConstraints, - const dealii::types::global_dof_index - *localIndexMapUnflattenedToFlattened) + const unsigned int numConstraints) { const dealii::types::global_dof_index globalThreadId = blockIdx.x * blockDim.x + threadIdx.x; @@ -359,8 +345,8 @@ namespace dftfe { const unsigned int blockIndex = index / contiguousBlockSize; const unsigned int intraBlockIndex = index % contiguousBlockSize; - xVec[localIndexMapUnflattenedToFlattened - [constraintLocalRowIdsUnflattened[blockIndex]] + + xVec[constraintLocalRowIdsUnflattened[blockIndex] * + contiguousBlockSize + intraBlockIndex] = 0; } } @@ -369,9 +355,7 @@ namespace dftfe setzeroKernel(const unsigned int contiguousBlockSize, float * xVec, const unsigned int *constraintLocalRowIdsUnflattened, - const unsigned int numConstraints, - const dealii::types::global_dof_index - *localIndexMapUnflattenedToFlattened) + const unsigned int numConstraints) { const dealii::types::global_dof_index globalThreadId = blockIdx.x * blockDim.x + threadIdx.x; @@ -384,8 +368,8 @@ namespace dftfe { const unsigned int blockIndex = index / contiguousBlockSize; const unsigned int intraBlockIndex = index % contiguousBlockSize; - xVec[localIndexMapUnflattenedToFlattened - [constraintLocalRowIdsUnflattened[blockIndex]] + + xVec[constraintLocalRowIdsUnflattened[blockIndex] * + contiguousBlockSize + intraBlockIndex] = 0; } } @@ -394,9 +378,7 @@ namespace dftfe setzeroKernel(const unsigned int contiguousBlockSize, dftfe::utils::deviceDoubleComplex *xVec, const unsigned int *constraintLocalRowIdsUnflattened, - const unsigned int numConstraints, - const 
dealii::types::global_dof_index - *localIndexMapUnflattenedToFlattened) + const unsigned int numConstraints) { const dealii::types::global_dof_index globalThreadId = blockIdx.x * blockDim.x + threadIdx.x; @@ -411,8 +393,8 @@ namespace dftfe const unsigned int intraBlockIndex = index % contiguousBlockSize; dftfe::utils::copyValue( xVec + - localIndexMapUnflattenedToFlattened - [constraintLocalRowIdsUnflattened[blockIndex]] + + constraintLocalRowIdsUnflattened[blockIndex] * + contiguousBlockSize + intraBlockIndex, 0.0); } @@ -423,9 +405,7 @@ namespace dftfe setzeroKernel(const unsigned int contiguousBlockSize, dftfe::utils::deviceFloatComplex *xVec, const unsigned int *constraintLocalRowIdsUnflattened, - const unsigned int numConstraints, - const dealii::types::global_dof_index - *localIndexMapUnflattenedToFlattened) + const unsigned int numConstraints) { const dealii::types::global_dof_index globalThreadId = blockIdx.x * blockDim.x + threadIdx.x; @@ -440,8 +420,8 @@ namespace dftfe const unsigned int intraBlockIndex = index % contiguousBlockSize; dftfe::utils::copyValue( xVec + - localIndexMapUnflattenedToFlattened - [constraintLocalRowIdsUnflattened[blockIndex]] + + constraintLocalRowIdsUnflattened[blockIndex] * + contiguousBlockSize + intraBlockIndex, 0.0); } @@ -561,77 +541,6 @@ namespace dftfe d_numConstrainedDofs = d_rowIdsLocal.size(); } - - void - constraintMatrixInfoDevice::precomputeMaps( - const std::shared_ptr - &unFlattenedPartitioner, - const std::shared_ptr - & flattenedPartitioner, - const unsigned int blockSize) - { - // - // Get required sizes - // - const unsigned int n_ghosts = unFlattenedPartitioner->n_ghost_indices(); - const unsigned int localSize = unFlattenedPartitioner->local_size(); - const unsigned int totalSize = n_ghosts + localSize; - - d_localIndexMapUnflattenedToFlattened.clear(); - d_localIndexMapUnflattenedToFlattened.resize(totalSize); - - // - // fill the data array - // - for (unsigned int ilocalDof = 0; ilocalDof < totalSize; 
++ilocalDof) - { - const dealii::types::global_dof_index globalIndex = - unFlattenedPartitioner->local_to_global(ilocalDof); - d_localIndexMapUnflattenedToFlattened[ilocalDof] = - flattenedPartitioner->global_to_local(globalIndex * blockSize); - } - - d_localIndexMapUnflattenedToFlattenedDevice.resize( - d_localIndexMapUnflattenedToFlattened.size()); - d_localIndexMapUnflattenedToFlattenedDevice.copyFrom( - d_localIndexMapUnflattenedToFlattened); - } - - void - constraintMatrixInfoDevice::precomputeMaps( - const std::shared_ptr< - const utils::mpi::MPIPatternP2P> - & mpiPattern, - const unsigned int blockSize) - { - // - // Get required sizes - // - const unsigned int totalSize = - mpiPattern->localOwnedSize() + mpiPattern->localGhostSize(); - - d_localIndexMapUnflattenedToFlattened.clear(); - d_localIndexMapUnflattenedToFlattened.resize(totalSize); - - // - // fill the data array - // - for (unsigned int ilocalDof = 0; ilocalDof < totalSize; ++ilocalDof) - { - // const dealii::types::global_dof_index globalIndex = - // unFlattenedPartitioner->local_to_global(ilocalDof); - d_localIndexMapUnflattenedToFlattened[ilocalDof] = - ilocalDof * blockSize; - // flattenedPartitioner->globalToLocal(globalIndex * blockSize); - } - - d_localIndexMapUnflattenedToFlattenedDevice.resize( - d_localIndexMapUnflattenedToFlattened.size()); - d_localIndexMapUnflattenedToFlattenedDevice.copyFrom( - d_localIndexMapUnflattenedToFlattened); - } - - template void constraintMatrixInfoDevice::distribute( @@ -656,8 +565,7 @@ namespace dftfe d_rowSizesAccumulatedDevice.begin(), d_columnIdsLocalDevice.begin(), d_columnValuesDevice.begin(), - d_inhomogenitiesDevice.begin(), - d_localIndexMapUnflattenedToFlattenedDevice.begin()); + d_inhomogenitiesDevice.begin()); #elif DFTFE_WITH_DEVICE_LANG_HIP hipLaunchKernelGGL( distributeKernel, @@ -675,8 +583,7 @@ namespace dftfe d_rowSizesAccumulatedDevice.begin(), d_columnIdsLocalDevice.begin(), d_columnValuesDevice.begin(), - 
d_inhomogenitiesDevice.begin(), - d_localIndexMapUnflattenedToFlattenedDevice.begin()); + d_inhomogenitiesDevice.begin()); #endif } @@ -707,8 +614,7 @@ namespace dftfe d_rowSizesDevice.begin(), d_rowSizesAccumulatedDevice.begin(), d_columnIdsLocalDevice.begin(), - d_columnValuesDevice.begin(), - d_localIndexMapUnflattenedToFlattenedDevice.begin()); + d_columnValuesDevice.begin()); #elif DFTFE_WITH_DEVICE_LANG_HIP hipLaunchKernelGGL( distributeSlaveToMasterKernelAtomicAdd, @@ -725,8 +631,7 @@ namespace dftfe d_rowSizesDevice.begin(), d_rowSizesAccumulatedDevice.begin(), d_columnIdsLocalDevice.begin(), - d_columnValuesDevice.begin(), - d_localIndexMapUnflattenedToFlattenedDevice.begin()); + d_columnValuesDevice.begin()); #endif } @@ -755,31 +660,27 @@ namespace dftfe min((blockSize + (dftfe::utils::DEVICE_BLOCK_SIZE - 1)) / dftfe::utils::DEVICE_BLOCK_SIZE * d_numConstrainedDofs, 30000), - dftfe::utils::DEVICE_BLOCK_SIZE>>>( - blockSize, - tempReal, - d_rowIdsLocalDevice.begin(), - d_numConstrainedDofs, - d_rowSizesDevice.begin(), - d_rowSizesAccumulatedDevice.begin(), - d_columnIdsLocalDevice.begin(), - d_columnValuesDevice.begin(), - d_localIndexMapUnflattenedToFlattenedDevice.begin()); + dftfe::utils::DEVICE_BLOCK_SIZE>>>(blockSize, + tempReal, + d_rowIdsLocalDevice.begin(), + d_numConstrainedDofs, + d_rowSizesDevice.begin(), + d_rowSizesAccumulatedDevice.begin(), + d_columnIdsLocalDevice.begin(), + d_columnValuesDevice.begin()); distributeSlaveToMasterKernelAtomicAdd<<< min((blockSize + (dftfe::utils::DEVICE_BLOCK_SIZE - 1)) / dftfe::utils::DEVICE_BLOCK_SIZE * d_numConstrainedDofs, 30000), - dftfe::utils::DEVICE_BLOCK_SIZE>>>( - blockSize, - tempImag, - d_rowIdsLocalDevice.begin(), - d_numConstrainedDofs, - d_rowSizesDevice.begin(), - d_rowSizesAccumulatedDevice.begin(), - d_columnIdsLocalDevice.begin(), - d_columnValuesDevice.begin(), - d_localIndexMapUnflattenedToFlattenedDevice.begin()); + dftfe::utils::DEVICE_BLOCK_SIZE>>>(blockSize, + tempImag, + 
d_rowIdsLocalDevice.begin(), + d_numConstrainedDofs, + d_rowSizesDevice.begin(), + d_rowSizesAccumulatedDevice.begin(), + d_columnIdsLocalDevice.begin(), + d_columnValuesDevice.begin()); #elif DFTFE_WITH_DEVICE_LANG_HIP hipLaunchKernelGGL( distributeSlaveToMasterKernelAtomicAdd, @@ -796,8 +697,7 @@ namespace dftfe d_rowSizesDevice.begin(), d_rowSizesAccumulatedDevice.begin(), d_columnIdsLocalDevice.begin(), - d_columnValuesDevice.begin(), - d_localIndexMapUnflattenedToFlattenedDevice.begin()); + d_columnValuesDevice.begin()); hipLaunchKernelGGL( distributeSlaveToMasterKernelAtomicAdd, @@ -814,8 +714,7 @@ namespace dftfe d_rowSizesDevice.begin(), d_rowSizesAccumulatedDevice.begin(), d_columnIdsLocalDevice.begin(), - d_columnValuesDevice.begin(), - d_localIndexMapUnflattenedToFlattenedDevice.begin()); + d_columnValuesDevice.begin()); #endif dftfe::utils::deviceKernelsGeneric::copyRealArrsToComplexArrDevice( @@ -850,31 +749,27 @@ namespace dftfe min((blockSize + (dftfe::utils::DEVICE_BLOCK_SIZE - 1)) / dftfe::utils::DEVICE_BLOCK_SIZE * d_numConstrainedDofs, 30000), - dftfe::utils::DEVICE_BLOCK_SIZE>>>( - blockSize, - tempReal, - d_rowIdsLocalDevice.begin(), - d_numConstrainedDofs, - d_rowSizesDevice.begin(), - d_rowSizesAccumulatedDevice.begin(), - d_columnIdsLocalDevice.begin(), - d_columnValuesDevice.begin(), - d_localIndexMapUnflattenedToFlattenedDevice.begin()); + dftfe::utils::DEVICE_BLOCK_SIZE>>>(blockSize, + tempReal, + d_rowIdsLocalDevice.begin(), + d_numConstrainedDofs, + d_rowSizesDevice.begin(), + d_rowSizesAccumulatedDevice.begin(), + d_columnIdsLocalDevice.begin(), + d_columnValuesDevice.begin()); distributeSlaveToMasterKernelAtomicAdd<<< min((blockSize + (dftfe::utils::DEVICE_BLOCK_SIZE - 1)) / dftfe::utils::DEVICE_BLOCK_SIZE * d_numConstrainedDofs, 30000), - dftfe::utils::DEVICE_BLOCK_SIZE>>>( - blockSize, - tempImag, - d_rowIdsLocalDevice.begin(), - d_numConstrainedDofs, - d_rowSizesDevice.begin(), - d_rowSizesAccumulatedDevice.begin(), - 
d_columnIdsLocalDevice.begin(), - d_columnValuesDevice.begin(), - d_localIndexMapUnflattenedToFlattenedDevice.begin()); + dftfe::utils::DEVICE_BLOCK_SIZE>>>(blockSize, + tempImag, + d_rowIdsLocalDevice.begin(), + d_numConstrainedDofs, + d_rowSizesDevice.begin(), + d_rowSizesAccumulatedDevice.begin(), + d_columnIdsLocalDevice.begin(), + d_columnValuesDevice.begin()); #elif DFTFE_WITH_DEVICE_LANG_HIP hipLaunchKernelGGL( distributeSlaveToMasterKernelAtomicAdd, @@ -891,8 +786,7 @@ namespace dftfe d_rowSizesDevice.begin(), d_rowSizesAccumulatedDevice.begin(), d_columnIdsLocalDevice.begin(), - d_columnValuesDevice.begin(), - d_localIndexMapUnflattenedToFlattenedDevice.begin()); + d_columnValuesDevice.begin()); hipLaunchKernelGGL( distributeSlaveToMasterKernelAtomicAdd, @@ -909,8 +803,7 @@ namespace dftfe d_rowSizesDevice.begin(), d_rowSizesAccumulatedDevice.begin(), d_columnIdsLocalDevice.begin(), - d_columnValuesDevice.begin(), - d_localIndexMapUnflattenedToFlattenedDevice.begin()); + d_columnValuesDevice.begin()); #endif dftfe::utils::deviceKernelsGeneric::copyRealArrsToComplexArrDevice( @@ -940,8 +833,7 @@ namespace dftfe blockSize, dftfe::utils::makeDataTypeDeviceCompatible(fieldVector.begin()), d_rowIdsLocalDevice.begin(), - numConstrainedDofs, - d_localIndexMapUnflattenedToFlattenedDevice.begin()); + numConstrainedDofs); #elif DFTFE_WITH_DEVICE_LANG_HIP hipLaunchKernelGGL( setzeroKernel, @@ -954,8 +846,7 @@ namespace dftfe blockSize, dftfe::utils::makeDataTypeDeviceCompatible(fieldVector.begin()), d_rowIdsLocalDevice.begin(), - numConstrainedDofs, - d_localIndexMapUnflattenedToFlattenedDevice.begin()); + numConstrainedDofs); #endif } diff --git a/utils/vectorTools/vectorUtilities.cc b/utils/vectorTools/vectorUtilities.cc index 508d87ee8..68a4a448d 100644 --- a/utils/vectorTools/vectorUtilities.cc +++ b/utils/vectorTools/vectorUtilities.cc @@ -682,8 +682,9 @@ namespace dftfe #ifdef USE_COMPLEX void copyFlattenedSTLVecToSingleCompVec( - const std::vector> & 
flattenedArray, + const std::complex * flattenedArray, const unsigned int totalNumberComponents, + const unsigned int localVectorSize, const std::pair componentIndexRange, const std::vector &localProcDofIndicesReal, @@ -699,8 +700,6 @@ namespace dftfe dealii::ExcMessage( "componentIndexRange doesn't lie within totalNumberComponents")); - const unsigned int localVectorSize = - flattenedArray.size() / totalNumberComponents; for (unsigned int iNode = 0; iNode < localVectorSize; ++iNode) for (unsigned int icomp = componentIndexRange.first; icomp < componentIndexRange.second; @@ -720,8 +719,9 @@ namespace dftfe void copyFlattenedSTLVecToSingleCompVec( - const std::vector> & flattenedArray, + const std::complex * flattenedArray, const unsigned int totalNumberComponents, + const unsigned int localVectorSize, const std::pair componentIndexRange, std::vector> & componentVectors) { @@ -733,8 +733,6 @@ namespace dftfe dealii::ExcMessage( "componentIndexRange doesn't lie within totalNumberComponents")); - const unsigned int localVectorSize = - flattenedArray.size() / totalNumberComponents; for (unsigned int iNode = 0; iNode < localVectorSize; ++iNode) for (unsigned int icomp = componentIndexRange.first; icomp < componentIndexRange.second; @@ -750,8 +748,9 @@ namespace dftfe #else void copyFlattenedSTLVecToSingleCompVec( - const std::vector &flattenedArray, + const double *flattenedArray, const unsigned int totalNumberComponents, + const unsigned int localVectorSize, const std::pair componentIndexRange, std::vector> &componentVectors) { @@ -762,8 +761,6 @@ namespace dftfe componentIndexRange.second <= totalNumberComponents, dealii::ExcMessage( "componentIndexRange doesn't lie within totalNumberComponents")); - const unsigned int localVectorSize = - flattenedArray.size() / totalNumberComponents; for (unsigned int iNode = 0; iNode < localVectorSize; ++iNode) for (unsigned int icomp = componentIndexRange.first; icomp < componentIndexRange.second;