From d9abf762b4469ef9ce3003f02a42481b061b07d5 Mon Sep 17 00:00:00 2001 From: Nikhil Kodali Date: Fri, 9 Jun 2023 20:17:16 +0530 Subject: [PATCH 01/25] Initial framework and host kernels --- CMakeLists.txt | 2 + include/FEBasisOperations.h | 230 ++++++++++++ include/constraintMatrixInfo.h | 4 + include/constraintMatrixInfoDevice.h | 4 + src/dftOperator/kohnShamDFTOperator.cc | 12 + utils/FEBasisOperations.cc | 496 +++++++++++++++++++++++++ utils/FEBasisOperationsHostKernels.cc | 270 ++++++++++++++ utils/constraintMatrixInfo.cc | 18 + utils/constraintMatrixInfoDevice.cc | 25 ++ 9 files changed, 1061 insertions(+) create mode 100644 include/FEBasisOperations.h create mode 100644 utils/FEBasisOperations.cc create mode 100644 utils/FEBasisOperationsHostKernels.cc diff --git a/CMakeLists.txt b/CMakeLists.txt index 4573cc1d6..7c07b9b1a 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -104,6 +104,8 @@ SET(TARGET_SRC ./pseudoConverters/upfToxml.cc ./utils/PeriodicTable.cc ./utils/xmlTodftfeParser.cc + ./utils/FEBasisOperations.cc + ./utils/FEBasisOperationsHostKernels.cc ./src/dft/dftd.cc ./src/mdi/MDIEngine.cpp ./src/mdi/libraryMDI.cpp diff --git a/include/FEBasisOperations.h b/include/FEBasisOperations.h new file mode 100644 index 000000000..d045898c1 --- /dev/null +++ b/include/FEBasisOperations.h @@ -0,0 +1,230 @@ +// --------------------------------------------------------------------- +// +// Copyright (c) 2017-2022 The Regents of the University of Michigan and DFT-FE +// authors. +// +// This file is part of the DFT-FE code. +// +// The DFT-FE code is free software; you can use it, redistribute +// it, and/or modify it under the terms of the GNU Lesser General +// Public License as published by the Free Software Foundation; either +// version 2.1 of the License, or (at your option) any later version. +// The full text of the license can be found in the file LICENSE at +// the top level of the DFT-FE distribution. 
+// +// --------------------------------------------------------------------- +// + +#ifndef dftfeFEBasisOperations_h +#define dftfeFEBasisOperations_h + +#include +#include +#include +#include +namespace dftfe +{ + namespace basis + { + enum UpdateFlags + { + update_default = 0, + + update_values = 0x0001, + + update_gradients = 0x0002, + + update_macrocell_map = 0x0004 + }; + template + class FEBasisOperations + { + public: + FEBasisOperations( + dealii::MatrixFree<3, ValueTypeBasisData> &matrixFreeData, + std::vector *> + &constraintsVector); + + ~FEBasisOperations() = default; + + void + reinit(const unsigned int &blockSize, + const unsigned int &dofHandlerID, + const unsigned int &quadratureID, + const UpdateFlags updateFlags = update_values); + + void + interpolate( + dftfe::linearAlgebra::MultiVector + &nodalData, + std::map> + *quadratureValues, + std::map> + * quadratureGradients = NULL, + bool useMacroCellSubCellOrdering = false) const; + + + void + interpolate(dftfe::linearAlgebra::MultiVector &nodalData, + dftfe::utils::MemoryStorage + *quadratureValues, + dftfe::utils::MemoryStorage + * quadratureGradients = NULL, + bool useMacroCellSubCellOrdering = false) const; + + void + integrateWithBasis( + const std::map< + dealii::CellId, + dftfe::utils::MemoryStorage> + &quadratureValues, + std::map> + *quadratureGradients, + dftfe::linearAlgebra::MultiVector + & nodalData, + bool useMacroCellSubCellOrdering = false) const; + + + void + integrateWithBasis( + dftfe::utils::MemoryStorage + *quadratureValues, + dftfe::utils::MemoryStorage + *quadratureGradients, + dftfe::linearAlgebra::MultiVector + & nodalData, + bool useMacroCellSubCellOrdering = false) const; + + void + extractToCellNodalData( + dftfe::linearAlgebra::MultiVector + &nodalData, + dftfe::utils::MemoryStorage + * cellNodalDataPtr, + bool useMacroCellSubCellOrdering = false) const; + + void + accumulateFromCellNodalData( + const dftfe::utils::MemoryStorage + *cellNodalDataPtr, + 
dftfe::linearAlgebra::MultiVector + & nodalData, + bool useMacroCellSubCellOrdering = false) const; + + private: +#if defined(DFTFE_WITH_DEVICE) + using constraintInfoClass = + typename std::conditional::type; +#else + using constraintInfoClass = dftUtils::constraintMatrixInfo; +#endif + + + + void + initializeIndexMaps(); + + void + initializeConstraints(); + + void + initializeShapeFunctionAndJacobianData(); + + void + interpolateHostKernel( + dftfe::linearAlgebra::MultiVector + &nodalData, + dftfe::utils::MemoryStorage + *quadratureValues, + dftfe::utils::MemoryStorage + * quadratureGradients, + std::pair cellRange, + bool useMacroCellSubCellOrdering = false) const; + + void + integrateWithBasisHostKernel( + dftfe::utils::MemoryStorage + *quadratureValues, + dftfe::utils::MemoryStorage + *quadratureGradients, + dftfe::linearAlgebra::MultiVector + & nodalData, + std::pair cellRange, + bool useMacroCellSubCellOrdering = false) const; + + + void + extractToCellNodalDataHostKernel( + dftfe::linearAlgebra::MultiVector + &nodalData, + dftfe::utils::MemoryStorage + * cellNodalDataPtr, + std::pair cellRange, + bool useMacroCellSubCellOrdering = false) const; + + void + accumulateFromCellNodalDataHostKernel( + dftfe::utils::MemoryStorage + *cellNodalDataPtr, + dftfe::linearAlgebra::MultiVector + & nodalData, + std::pair cellRange, + bool useMacroCellSubCellOrdering = false) const; + + + + constraintInfoClass d_constraintInfo; + std::vector *> + * d_constraintsVector; + const dealii::MatrixFree<3, ValueTypeBasisData> *d_matrixFreeDataPtr; + std::vector d_cellDofIndexToProcessDofIndexMap; + std::vector d_macroCellSubCellDofIndexToProcessDofIndexMap; + std::vector d_cellIndexToMacroCellSubCellIndexMap; + std::vector d_cellIndexToCellIdMap; + std::map> + d_inverseJacobianData; + std::map> + d_JxWData; + dftfe::utils::MemoryStorage + d_shapeFunctionData; + dftfe::utils::MemoryStorage + d_shapeFunctionGradientData; + unsigned int d_quadratureID; + unsigned int d_dofHandlerID; 
+ unsigned int d_nVectors; + unsigned int d_nCells; + unsigned int d_nMacroCells; + unsigned int d_nDofsPerCell; + unsigned int d_nQuadsPerCell; + bool areAllCellsAffine; + UpdateFlags d_updateFlags; + + }; // end of FEBasisOperations + } // end of namespace basis +} // end of namespace dftfe +#endif // dftfeBasisOperations_h diff --git a/include/constraintMatrixInfo.h b/include/constraintMatrixInfo.h index 7fdb002db..5c7083b97 100644 --- a/include/constraintMatrixInfo.h +++ b/include/constraintMatrixInfo.h @@ -90,6 +90,10 @@ namespace dftfe dftfe::utils::MemorySpace::HOST>> &partitioner2, const unsigned int blockSize); + void + precomputeMaps(const unsigned int totalSize, + const unsigned int blockSize); + /** * @brief overloaded dealii internal function "distribute" which sets the slave node * field values from master nodes diff --git a/include/constraintMatrixInfoDevice.h b/include/constraintMatrixInfoDevice.h index e682a25b5..4782c0541 100644 --- a/include/constraintMatrixInfoDevice.h +++ b/include/constraintMatrixInfoDevice.h @@ -73,6 +73,10 @@ namespace dftfe dftfe::utils::MemorySpace::HOST>> &partitioner2, const unsigned int blockSize); + void + precomputeMaps(const unsigned int totalSize, + const unsigned int blockSize); + void precomputeMaps( const std::shared_ptr diff --git a/src/dftOperator/kohnShamDFTOperator.cc b/src/dftOperator/kohnShamDFTOperator.cc index 7d3fb765d..80d4c15ee 100644 --- a/src/dftOperator/kohnShamDFTOperator.cc +++ b/src/dftOperator/kohnShamDFTOperator.cc @@ -405,6 +405,11 @@ namespace dftfe kohnShamDFTOperatorClass:: getShapeFunctionValuesDensityGaussQuad() const { + static bool once = [&]() { + std::cout << "DEBUG vals " << d_densityGaussQuadShapeFunctionValues.size() + << std::endl; + return true; + }(); return d_densityGaussQuadShapeFunctionValues; } @@ -414,6 +419,13 @@ namespace dftfe kohnShamDFTOperatorClass:: getShapeFunctionGradValuesDensityGaussQuad() const { + static bool once2 = [&]() { + std::cout << "DEBUG vals " + << 
d_densityGaussQuadShapeFunctionGradientValues.size() + << std::endl; + return true; + }(); + return d_densityGaussQuadShapeFunctionGradientValues; } diff --git a/utils/FEBasisOperations.cc b/utils/FEBasisOperations.cc new file mode 100644 index 000000000..121cf80ed --- /dev/null +++ b/utils/FEBasisOperations.cc @@ -0,0 +1,496 @@ +// --------------------------------------------------------------------- +// +// Copyright (c) 2017-2022 The Regents of the University of Michigan and DFT-FE +// authors. +// +// This file is part of the DFT-FE code. +// +// The DFT-FE code is free software; you can use it, redistribute +// it, and/or modify it under the terms of the GNU Lesser General +// Public License as published by the Free Software Foundation; either +// version 2.1 of the License, or (at your option) any later version. +// The full text of the license can be found in the file LICENSE at +// the top level of the DFT-FE distribution. +// +// --------------------------------------------------------------------- +// + +#include +namespace dftfe +{ + namespace basis + { + template + FEBasisOperations:: + FEBasisOperations( + dealii::MatrixFree<3, ValueTypeBasisData> &matrixFreeData, + std::vector *> + &constraintsVector) + { + d_matrixFreeDataPtr = &matrixFreeData; + d_constraintsVector = &constraintsVector; + d_dofHandlerID = 0; + d_quadratureID = 0; + d_nVectors = 0; + d_updateFlags = update_default; + areAllCellsAffine = true; + for (unsigned int iMacroCell = 0; + iMacroCell < d_matrixFreeDataPtr->n_cell_batches(); + ++iMacroCell) + { + areAllCellsAffine = + areAllCellsAffine && + (d_matrixFreeDataPtr->get_mapping_info().get_cell_type( + iMacroCell) <= dealii::internal::MatrixFreeFunctions::affine); + } + } + + template + void + FEBasisOperations:: + reinit(const unsigned int &blockSize, + const unsigned int &dofHandlerID, + const unsigned int &quadratureID, + const UpdateFlags updateFlags) + { + if ((d_dofHandlerID != dofHandlerID) || (d_updateFlags != updateFlags)) 
+ { + d_dofHandlerID = dofHandlerID; + d_quadratureID = quadratureID; + d_nVectors = blockSize; + d_updateFlags = updateFlags; + initializeIndexMaps(); + initializeConstraints(); + initializeShapeFunctionAndJacobianData(); + } + else if ((d_quadratureID != quadratureID) && (d_nVectors != blockSize)) + { + d_quadratureID = quadratureID; + d_nVectors = blockSize; + initializeConstraints(); + initializeShapeFunctionAndJacobianData(); + } + else if (d_quadratureID != quadratureID) + { + d_quadratureID = quadratureID; + initializeShapeFunctionAndJacobianData(); + } + else if (d_nVectors != blockSize) + { + d_nVectors = blockSize; + initializeConstraints(); + } + } + + + template + void + FEBasisOperations:: + initializeIndexMaps() + { + d_nMacroCells = d_matrixFreeDataPtr->n_cell_batches(); + d_nCells = d_matrixFreeDataPtr->n_physical_cells(); + d_nDofsPerCell = d_matrixFreeDataPtr->get_dof_handler(d_dofHandlerID) + .get_fe() + .dofs_per_cell; + d_cellDofIndexToProcessDofIndexMap.clear(); + d_cellDofIndexToProcessDofIndexMap.resize(d_nCells * d_nDofsPerCell); + + d_cellIndexToCellIdMap.clear(); + d_cellIndexToCellIdMap.resize(d_nCells); + + if (d_updateFlags & update_macrocell_map) + { + d_cellIndexToMacroCellSubCellIndexMap.clear(); + d_cellIndexToMacroCellSubCellIndexMap.resize(d_nCells); + + d_macroCellSubCellDofIndexToProcessDofIndexMap.clear(); + d_macroCellSubCellDofIndexToProcessDofIndexMap.resize(d_nCells * + d_nDofsPerCell); + } + + auto cellPtr = + d_matrixFreeDataPtr->get_dof_handler(d_dofHandlerID).begin_active(); + auto endcPtr = d_matrixFreeDataPtr->get_dof_handler(d_dofHandlerID).end(); + + std::vector cellDofIndicesGlobal(d_nDofsPerCell); + std::map cellIdToCellIndexMap; + + unsigned int iCell = 0; + for (; cellPtr != endcPtr; ++cellPtr) + if (cellPtr->is_locally_owned()) + { + cellPtr->get_dof_indices(cellDofIndicesGlobal); + for (unsigned int iDof = 0; iDof < d_nDofsPerCell; ++iDof) + d_cellDofIndexToProcessDofIndexMap[iCell * d_nDofsPerCell + + iDof] 
= + d_matrixFreeDataPtr->get_vector_partitioner(d_dofHandlerID) + ->global_to_local(cellDofIndicesGlobal[iDof]); + + if (d_updateFlags & update_macrocell_map) + cellIdToCellIndexMap[cellPtr->id()] = iCell; + + d_cellIndexToCellIdMap[iCell] = cellPtr->id(); + + ++iCell; + } + + iCell = 0; + for (unsigned int iMacroCell = 0; iMacroCell < d_nMacroCells; + ++iMacroCell) + { + const unsigned int numberSubCells = + d_matrixFreeDataPtr->n_components_filled(iMacroCell); + for (unsigned int iSubCell = 0; iSubCell < numberSubCells; ++iSubCell) + { + cellPtr = d_matrixFreeDataPtr->get_cell_iterator(iMacroCell, + iSubCell, + d_dofHandlerID); + size_type cellIndex = cellIdToCellIndexMap[cellPtr->id()]; + d_cellIndexToMacroCellSubCellIndexMap[cellIndex] = iCell; + std::copy(d_cellDofIndexToProcessDofIndexMap.begin() + + cellIndex * d_nDofsPerCell, + d_cellDofIndexToProcessDofIndexMap.begin() + + (cellIndex + 1) * d_nDofsPerCell, + d_macroCellSubCellDofIndexToProcessDofIndexMap.begin() + + iCell * d_nDofsPerCell); + ++iCell; + } + } + } + + + + template + void + FEBasisOperations:: + initializeConstraints() + { + d_constraintInfo.initialize(d_matrixFreeDataPtr->get_vector_partitioner( + d_dofHandlerID), + *((*d_constraintsVector)[d_dofHandlerID])); + d_constraintInfo.precomputeMaps( + d_matrixFreeDataPtr->get_vector_partitioner(d_dofHandlerID) + ->locally_owned_size() + + d_matrixFreeDataPtr->get_vector_partitioner(d_dofHandlerID) + ->n_ghost_indices(), + d_nVectors); + } + + template + void + FEBasisOperations:: + initializeShapeFunctionAndJacobianData() + { + const dealii::Quadrature<3> &quadrature = + d_matrixFreeDataPtr->get_quadrature(d_quadratureID); + dealii::FEValues<3> fe_values( + d_matrixFreeDataPtr->get_dof_handler(d_dofHandlerID).get_fe(), + quadrature, + dealii::update_values | dealii::update_gradients | + dealii::update_jacobians | dealii::update_JxW_values | + dealii::update_inverse_jacobians); + + d_nQuadsPerCell = quadrature.size(); + +#if 
defined(DFTFE_WITH_DEVICE) + std::map> + d_inverseJacobianDataHost; + std::map> + d_JxWDataHost; + dftfe::utils::MemoryStorage + d_shapeFunctionDataHost; + dftfe::utils::MemoryStorage + d_shapeFunctionGradientDataHost; + if (memorySpace == dftfe::utils::MemorySpace::HOST) + { + &d_inverseJacobianDataHost = d_inverseJacobianData; + &d_JxWDataHost = d_JxWData; + &d_shapeFunctionDataHost = d_shapeFunctionData; + &d_shapeFunctionGradientDataHost = d_shapeFunctionGradientData; + } +#else + auto &d_inverseJacobianDataHost = d_inverseJacobianData; + auto &d_JxWDataHost = d_JxWData; + auto &d_shapeFunctionDataHost = d_shapeFunctionData; + auto &d_shapeFunctionGradientDataHost = d_shapeFunctionGradientData; +#endif + + + d_shapeFunctionDataHost.clear(); + if (d_updateFlags & update_values) + d_shapeFunctionDataHost.resize(d_nQuadsPerCell * d_nDofsPerCell, 0.0); + d_shapeFunctionGradientDataHost.clear(); + if (d_updateFlags & update_gradients) + d_shapeFunctionGradientDataHost.resize(d_nQuadsPerCell * + d_nDofsPerCell * 3, + 0.0); + + d_JxWDataHost.clear(); + if ((d_updateFlags & update_values) || (d_updateFlags & update_gradients)) + d_JxWDataHost.resize(d_nCells * d_nQuadsPerCell); + + d_inverseJacobianDataHost.clear(); + if (d_updateFlags & update_gradients) + d_inverseJacobianDataHost.resize( + areAllCellsAffine ? d_nCells * 9 : d_nCells * 9 * d_nQuadsPerCell); + const unsigned int nJacobiansPerCell = + areAllCellsAffine ? 
1 : d_nQuadsPerCell; + + auto cellPtr = + d_matrixFreeDataPtr->get_dof_handler(d_dofHandlerID).begin_active(); + auto endcPtr = d_matrixFreeDataPtr->get_dof_handler(d_dofHandlerID).end(); + + unsigned int iCell = 0; + for (; cellPtr != endcPtr; ++cellPtr) + if (cellPtr->is_locally_owned()) + { + fe_values.reinit(cellPtr); + auto &jacobians = fe_values.get_jacobians(); + auto &inverseJacobians = fe_values.get_inverse_jacobians(); + if (iCell == 0) + { + if (d_updateFlags & update_values) + for (unsigned int iNode = 0; iNode < d_nDofsPerCell; ++iNode) + for (unsigned int q_point = 0; q_point < d_nQuadsPerCell; + ++q_point) + d_shapeFunctionDataHost[q_point * d_nDofsPerCell + + iNode] = + fe_values.shape_value(iNode, q_point); + + + if (d_updateFlags & update_gradients) + for (unsigned int q_point = 0; q_point < d_nQuadsPerCell; + ++q_point) + for (unsigned int iNode = 0; iNode < d_nDofsPerCell; + ++iNode) + { + const auto &shape_grad_real = + fe_values.shape_grad(iNode, q_point); + const auto &shape_grad_reference = + apply_transformation(jacobians[q_point].transpose(), + shape_grad_real); + for (unsigned int iDim = 0; iDim < 3; ++iDim) + d_shapeFunctionGradientDataHost + [d_nQuadsPerCell * d_nDofsPerCell * iDim + + d_nDofsPerCell * q_point + iNode] = + shape_grad_reference[iDim]; + } + } + for (unsigned int q_point = 0; q_point < d_nQuadsPerCell; ++q_point) + d_JxWDataHost[iCell * d_nQuadsPerCell + q_point] = + fe_values.JxW(q_point); + for (unsigned int q_point = 0; q_point < nJacobiansPerCell; + ++q_point) + for (unsigned int iDim = 0; iDim < 3; ++iDim) + for (unsigned int jDim = 0; jDim < 3; ++jDim) + d_inverseJacobianDataHost[iCell * nJacobiansPerCell * 9 + + q_point * 9 + jDim * 3 + iDim] = + inverseJacobians[q_point][jDim][iDim]; + } + +#if defined(DFTFE_WITH_DEVICE) + if (memorySpace == dftfe::utils::MemorySpace::DEVICE) + { + d_inverseJacobianData.resize(d_inverseJacobianDataHost.size()); + d_inverseJacobianData.copyFrom(d_inverseJacobianDataHost); + 
d_JxWData.resize(d_JxWDataHost.size()); + d_JxWData.copyFrom(d_JxWDataHost); + d_shapeFunctionData.resize(d_shapeFunctionDataHost.size()); + d_shapeFunctionData.copyFrom(d_shapeFunctionDataHost); + d_shapeFunctionGradientData.resize( + d_shapeFunctionGradientDataHost.size()); + d_shapeFunctionGradientData.copyFrom(d_shapeFunctionGradientDataHost); + } +#endif + } + + template + void + FEBasisOperations:: + interpolate( + dftfe::linearAlgebra::MultiVector + &nodalData, + std::map> + *quadratureValues, + std::map> + * quadratureGradients, + bool useMacroCellSubCellOrdering) const + { + if (memorySpace == dftfe::utils::MemorySpace::HOST) + { + for (unsigned int iCell = 0; iCell < d_nCells; ++iCell) + { + dealii::CellId currentCellId = d_cellIndexToCellIdMap[iCell]; + auto &cellQuadratureData = (*quadratureValues)[currentCellId]; + cellQuadratureData.resize(d_nQuadsPerCell * d_nVectors); + + auto &cellQuadratureGradientData = + (quadratureGradients != NULL) ? + (*quadratureGradients)[currentCellId] : + NULL; + if (quadratureGradients != NULL) + cellQuadratureGradientData.resize(d_nQuadsPerCell * d_nVectors * + 3); + interpolateHostKernel( + nodalData, + &cellQuadratureData, + &cellQuadratureGradientData, + std::pair(iCell, iCell + 1), + useMacroCellSubCellOrdering); + } + } + } + + template + void + FEBasisOperations:: + interpolate(dftfe::linearAlgebra::MultiVector &nodalData, + dftfe::utils::MemoryStorage + *quadratureValues, + dftfe::utils::MemoryStorage + * quadratureGradients, + bool useMacroCellSubCellOrdering) const + { + if (memorySpace == dftfe::utils::MemorySpace::HOST) + { + interpolateHostKernel(nodalData, + quadratureValues, + quadratureGradients, + std::pair(0, + d_nCells), + useMacroCellSubCellOrdering); + } + } + + template + void + FEBasisOperations:: + integrateWithBasis( + const std::map< + dealii::CellId, + dftfe::utils::MemoryStorage> + &quadratureValues, + std::map> + *quadratureGradients, + dftfe::linearAlgebra::MultiVector + & nodalData, + bool 
useMacroCellSubCellOrdering) const + { + if (memorySpace == dftfe::utils::MemorySpace::HOST) + { + for (unsigned int iCell = 0; iCell < d_nCells; ++iCell) + { + dealii::CellId currentCellId = d_cellIndexToCellIdMap[iCell]; + auto &cellQuadratureData = (*quadratureValues)[currentCellId]; + + auto &cellQuadratureGradientData = + (quadratureGradients != NULL) ? + (*quadratureGradients)[currentCellId] : + NULL; + integrateWithBasisHostKernel( + &cellQuadratureData, + &cellQuadratureGradientData, + nodalData, + std::pair(iCell, iCell + 1), + useMacroCellSubCellOrdering); + } + } + } + + template + void + FEBasisOperations:: + integrateWithBasis( + dftfe::utils::MemoryStorage + *quadratureValues, + dftfe::utils::MemoryStorage + *quadratureGradients, + dftfe::linearAlgebra::MultiVector + & nodalData, + bool useMacroCellSubCellOrdering) const + { + if (memorySpace == dftfe::utils::MemorySpace::HOST) + { + integrateWithBasisHostKernel( + quadratureValues, + quadratureGradients, + nodalData, + std::pair(0, d_nCells), + useMacroCellSubCellOrdering); + } + } + + + template + void + FEBasisOperations:: + extractToCellNodalData( + dftfe::linearAlgebra::MultiVector + &nodalData, + dftfe::utils::MemoryStorage + * cellNodalDataPtr, + bool useMacroCellSubCellOrdering) const + { + extractToCellNodalDataHostKernel( + nodalData, + cellNodalDataPtr, + std::pair(0, d_nCells), + useMacroCellSubCellOrdering); + } + + template + void + FEBasisOperations:: + accumulateFromCellNodalData( + const dftfe::utils::MemoryStorage + *cellNodalDataPtr, + dftfe::linearAlgebra::MultiVector + & nodalData, + bool useMacroCellSubCellOrdering) const + { + accumulateFromCellNodalDataHostKernel( + cellNodalDataPtr, + nodalData, + std::pair(0, d_nCells), + useMacroCellSubCellOrdering); + } + + } // namespace basis +} // namespace dftfe diff --git a/utils/FEBasisOperationsHostKernels.cc b/utils/FEBasisOperationsHostKernels.cc new file mode 100644 index 000000000..ec90708db --- /dev/null +++ 
b/utils/FEBasisOperationsHostKernels.cc @@ -0,0 +1,270 @@ +// --------------------------------------------------------------------- +// +// Copyright (c) 2017-2022 The Regents of the University of Michigan and DFT-FE +// authors. +// +// This file is part of the DFT-FE code. +// +// The DFT-FE code is free software; you can use it, redistribute +// it, and/or modify it under the terms of the GNU Lesser General +// Public License as published by the Free Software Foundation; either +// version 2.1 of the License, or (at your option) any later version. +// The full text of the license can be found in the file LICENSE at +// the top level of the DFT-FE distribution. +// +// --------------------------------------------------------------------- +// + +#include +namespace dftfe +{ + namespace basis + { + template + void + FEBasisOperations:: + interpolateHostKernel( + dftfe::linearAlgebra::MultiVector + &nodalValues, + dftfe::utils::MemoryStorage + *quadratureValues, + dftfe::utils::MemoryStorage + * quadratureGradients, + std::pair cellRange, + bool useMacroCellSubCellOrdering) const + { + dftfe::utils::MemoryStorage + cellNodalData, tempQuadratureGradientsData; + cellNodalData.resize(d_nVectors * d_nDofsPerCell * d_nCells); + + if (quadratureGradients != NULL) + tempQuadratureGradientsData.resize(d_nVectors * d_nQuadsPerCell * 3); + + + extractToCellNodalDataHostKernel(nodalValues, + &cellNodalData, + cellRange, + useMacroCellSubCellOrdering); + + for (unsigned int iCell = cellRange.first; iCell < cellRange.second; + ++iCell) + { + const ValueTypeBasisCoeff scalarCoeffAlpha = ValueTypeBasisCoeff(1.0), + scalarCoeffBeta = ValueTypeBasisCoeff(0.0); + const char transA = 'N', transB = 'N'; + + xgemm(&transA, + &transB, + &d_nVectors, + &d_nQuadsPerCell, + &d_nDofsPerCell, + &scalarCoeffAlpha, + cellNodalData.data() + d_nDofsPerCell * iCell, + &d_nVectors, + d_shapeFunctionData.data(), + &d_nDofsPerCell, + &scalarCoeffBeta, + quadratureValues->data() + d_nQuadsPerCell * 
iCell, + &d_nVectors); + if (quadratureGradients != NULL) + { + const unsigned int d_nQuadsPerCellTimesThree = + d_nQuadsPerCell * 3; + xgemm(&transA, + &transB, + &d_nVectors, + &d_nQuadsPerCellTimesThree, + &d_nDofsPerCell, + &scalarCoeffAlpha, + cellNodalData.data() + d_nDofsPerCell * iCell, + &d_nVectors, + d_shapeFunctionGradientData.data(), + &d_nDofsPerCell, + &scalarCoeffBeta, + tempQuadratureGradientsData.data(), + &d_nVectors); + const unsigned int d_nQuadsPerCellTimesnVectors = + d_nQuadsPerCell * d_nVectors; + const unsigned int three = 3; + xgemm(&transA, + &transB, + &d_nQuadsPerCellTimesnVectors, + &three, + &three, + &scalarCoeffAlpha, + tempQuadratureGradientsData.data(), + &d_nQuadsPerCellTimesnVectors, + d_inverseJacobianData.data() + 9 * iCell, + &three, + &scalarCoeffBeta, + quadratureGradients->data() + d_nQuadsPerCell * 3 * iCell, + &d_nQuadsPerCellTimesnVectors); + } + } + } + + template + void + FEBasisOperations:: + integrateWithBasisHostKernel( + dftfe::utils::MemoryStorage + *quadratureValues, + dftfe::utils::MemoryStorage + *quadratureGradients, + dftfe::linearAlgebra::MultiVector + & nodalData, + std::pair cellRange, + bool useMacroCellSubCellOrdering) const + { + dftfe::utils::MemoryStorage + cellNodalData, tempQuadratureGradientsData; + cellNodalData.resize(d_nVectors * d_nDofsPerCell * d_nCells); + if (quadratureGradients != NULL) + tempQuadratureGradientsData.resize(3 * d_nVectors * d_nQuadsPerCell); + + + + for (unsigned int iCell = cellRange.first; iCell < cellRange.second; + ++iCell) + { + const ValueTypeBasisCoeff scalarCoeffAlpha = ValueTypeBasisCoeff(1.0), + scalarCoeffBeta = ValueTypeBasisCoeff(0.0); + const char transA = 'N', transB = 'T'; + + xgemm(&transA, + &transB, + &d_nVectors, + &d_nDofsPerCell, + &d_nQuadsPerCell, + &scalarCoeffAlpha, + quadratureValues->data() + d_nQuadsPerCell * iCell, + &d_nVectors, + d_shapeFunctionData.data(), + &d_nQuadsPerCell, + &scalarCoeffBeta, + cellNodalData.data() + d_nDofsPerCell * 
iCell, + &d_nVectors); + if (quadratureGradients != NULL) + { + const unsigned int d_nQuadsPerCellTimesThree = + d_nQuadsPerCell * 3; + const unsigned int d_nQuadsPerCellTimesnVectors = + d_nQuadsPerCell * d_nVectors; + const unsigned int three = 3; + xgemm(&transA, + &transB, + &d_nQuadsPerCellTimesnVectors, + &three, + &three, + &scalarCoeffAlpha, + quadratureGradients->data() + d_nQuadsPerCell * 3 * iCell, + &d_nQuadsPerCellTimesnVectors, + d_inverseJacobianData.data() + 9 * iCell, + &three, + &scalarCoeffBeta, + tempQuadratureGradientsData.data(), + &d_nQuadsPerCellTimesnVectors); + xgemm(&transA, + &transB, + &d_nVectors, + &d_nDofsPerCell, + &d_nQuadsPerCellTimesThree, + &scalarCoeffAlpha, + tempQuadratureGradientsData.data(), + &d_nVectors, + d_shapeFunctionGradientData.data(), + &d_nQuadsPerCellTimesThree, + &scalarCoeffAlpha, + cellNodalData.data() + d_nDofsPerCell * iCell, + &d_nVectors); + } + } + accumulateFromCellNodalDataHostKernel(&cellNodalData, + nodalData, + cellRange, + useMacroCellSubCellOrdering); + } + + template + void + FEBasisOperations:: + extractToCellNodalDataHostKernel( + dftfe::linearAlgebra::MultiVector + &nodalData, + dftfe::utils::MemoryStorage + * cellNodalDataPtr, + std::pair cellRange, + bool useMacroCellSubCellOrdering) const + { + auto &cellDofIndexToProcessDofIndexMap = + useMacroCellSubCellOrdering ? 
+ d_macroCellSubCellDofIndexToProcessDofIndexMap : + d_cellDofIndexToProcessDofIndexMap; + + for (unsigned int iCell = cellRange.first; iCell < cellRange.second; + ++iCell) + for (unsigned int iDof = 0; iDof < d_nDofsPerCell; ++iDof) + std::memcpy( + cellNodalDataPtr + iCell * d_nVectors * d_nDofsPerCell + + iDof * d_nVectors, + nodalData.data() + + d_nVectors * + cellDofIndexToProcessDofIndexMap[iCell * d_nDofsPerCell + iDof], + d_nVectors * sizeof(ValueTypeBasisCoeff)); + } + + template + void + FEBasisOperations:: + accumulateFromCellNodalDataHostKernel( + dftfe::utils::MemoryStorage + *cellNodalDataPtr, + dftfe::linearAlgebra::MultiVector + & nodalData, + std::pair cellRange, + bool useMacroCellSubCellOrdering) const + { + auto &cellDofIndexToProcessDofIndexMap = + useMacroCellSubCellOrdering ? + d_macroCellSubCellDofIndexToProcessDofIndexMap : + d_cellDofIndexToProcessDofIndexMap; + + for (unsigned int iCell = cellRange.first; iCell < cellRange.second; + ++iCell) + for (unsigned int iDof = 0; iDof < d_nDofsPerCell; ++iDof) + std::transform( + cellNodalDataPtr + iCell * d_nVectors * d_nDofsPerCell + + iDof * d_nVectors, + cellNodalDataPtr + iCell * d_nVectors * d_nDofsPerCell + + iDof * d_nVectors + d_nVectors, + nodalData.data() + + d_nVectors * + cellDofIndexToProcessDofIndexMap[iCell * d_nDofsPerCell + iDof], + nodalData.data() + + d_nVectors * + cellDofIndexToProcessDofIndexMap[iCell * d_nDofsPerCell + iDof], + std::plus()); + } + + } // namespace basis +} // namespace dftfe diff --git a/utils/constraintMatrixInfo.cc b/utils/constraintMatrixInfo.cc index 99098a218..e0026e7e6 100644 --- a/utils/constraintMatrixInfo.cc +++ b/utils/constraintMatrixInfo.cc @@ -229,6 +229,24 @@ namespace dftfe } } + void + constraintMatrixInfo::precomputeMaps(const unsigned int totalSize, + const unsigned int blockSize) + { + d_localIndexMapUnflattenedToFlattened.clear(); + d_localIndexMapUnflattenedToFlattened.resize(totalSize); + + // + // fill the data array + // + for 
(unsigned int ilocalDof = 0; ilocalDof < totalSize; ++ilocalDof) + { + d_localIndexMapUnflattenedToFlattened[ilocalDof] = + (dealii::types::global_dof_index)ilocalDof * + (dealii::types::global_dof_index)blockSize; + } + } + // diff --git a/utils/constraintMatrixInfoDevice.cc b/utils/constraintMatrixInfoDevice.cc index e3434678c..7d992b1ad 100644 --- a/utils/constraintMatrixInfoDevice.cc +++ b/utils/constraintMatrixInfoDevice.cc @@ -631,6 +631,31 @@ namespace dftfe d_localIndexMapUnflattenedToFlattened); } + void + constraintMatrixInfoDevice::precomputeMaps(const unsigned int totalSize, + const unsigned int blockSize) + { + d_localIndexMapUnflattenedToFlattened.clear(); + d_localIndexMapUnflattenedToFlattened.resize(totalSize); + + // + // fill the data array + // + for (unsigned int ilocalDof = 0; ilocalDof < totalSize; ++ilocalDof) + { + // const dealii::types::global_dof_index globalIndex = + // unFlattenedPartitioner->local_to_global(ilocalDof); + d_localIndexMapUnflattenedToFlattened[ilocalDof] = + ilocalDof * blockSize; + // flattenedPartitioner->globalToLocal(globalIndex * blockSize); + } + + d_localIndexMapUnflattenedToFlattenedDevice.resize( + d_localIndexMapUnflattenedToFlattened.size()); + d_localIndexMapUnflattenedToFlattenedDevice.copyFrom( + d_localIndexMapUnflattenedToFlattened); + } + template void From 77f7148be50ddea167612639c886f84a6d7c3bc0 Mon Sep 17 00:00:00 2001 From: Nikhil Kodali Date: Thu, 21 Sep 2023 21:08:29 +0530 Subject: [PATCH 02/25] density calculator CPU/GPU merged cleanups pending --- CMakeLists.txt | 7 +- include/FEBasisOperations.h | 569 +++++++++--- ...hevOrthogonalizedSubspaceIterationSolver.h | 23 +- include/densityCalculator.h | 101 +++ include/densityCalculatorCPU.h | 25 +- include/densityCalculatorDevice.h | 26 +- include/densityFirstOrderResponseCalculator.h | 8 +- include/deviceKernelsGeneric.h | 11 + include/dft.h | 39 +- include/forceWfcContractions.h | 16 +- include/kohnShamDFTOperator.h | 13 +- 
include/linearAlgebraOperations.h | 49 +- include/operator.h | 13 +- include/vectorUtilities.h | 9 +- ...mputeOutputDensityDirectionalDerivative.cc | 18 +- src/dft/density.cc | 54 +- src/dft/densityCalculator.cc | 817 ++++++++++++++++++ src/dft/densityCalculatorCPU.cc | 172 ++-- src/dft/densityCalculatorDevice.cc | 121 ++- src/dft/densityCalculatorDeviceKernels.cc | 230 +++++ .../densityFirstOrderResponseCalculatorCPU.cc | 52 +- src/dft/dft.cc | 60 +- src/dft/dos.cc | 16 +- src/dft/initBoundaryConditions.cc | 13 + src/dft/initElectronicFields.cc | 37 +- src/dft/kohnShamEigenSolve.cc | 197 +++-- src/dft/localizationLength.cc | 3 +- src/dft/nscf.cc | 12 +- src/dft/psiInitialGuess.cc | 38 +- src/dftOperator/kohnShamDFTOperator.cc | 30 +- ...onfigurationalForceEEshelbyFPSPFnlLinFE.cc | 4 +- .../computeStressEEshelbyEPSPEnlEk.cc | 4 +- src/force/forceWfcContractions.cc | 21 +- src/linAlg/linearAlgebraOperationsOpt.cc | 230 ++--- src/linAlg/pseudoGS.cc | 30 +- ...evOrthogonalizedSubspaceIterationSolver.cc | 48 +- src/symmetry/symmetrizeRho.cc | 7 +- utils/DeviceKernelsGeneric.cc | 153 ++++ utils/FEBasisOperations.cc | 454 ++++------ utils/FEBasisOperationsDevice.cc | 445 ++++++++++ utils/FEBasisOperationsHost.cc | 535 ++++++++++++ utils/FEBasisOperationsHostKernels.cc | 270 ------ utils/vectorTools/vectorUtilities.cc | 15 +- 43 files changed, 3733 insertions(+), 1262 deletions(-) create mode 100644 include/densityCalculator.h create mode 100644 src/dft/densityCalculator.cc create mode 100644 src/dft/densityCalculatorDeviceKernels.cc create mode 100644 utils/FEBasisOperationsDevice.cc create mode 100644 utils/FEBasisOperationsHost.cc delete mode 100644 utils/FEBasisOperationsHostKernels.cc diff --git a/CMakeLists.txt b/CMakeLists.txt index 7c07b9b1a..3a7b947da 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -48,6 +48,7 @@ SET(TARGET_SRC ./src/dft/vselfBinsManager.cc ./src/dft/energyCalculator.cc ./src/dft/densityCalculatorCPU.cc + ./src/dft/densityCalculator.cc 
./src/dft/densityFirstOrderResponseCalculatorCPU.cc ./src/excManager/excDensityBaseClass.cpp ./src/excManager/excDensityLDAClass.cpp @@ -105,7 +106,7 @@ SET(TARGET_SRC ./utils/PeriodicTable.cc ./utils/xmlTodftfeParser.cc ./utils/FEBasisOperations.cc - ./utils/FEBasisOperationsHostKernels.cc + ./utils/FEBasisOperationsHost.cc ./src/dft/dftd.cc ./src/mdi/MDIEngine.cpp ./src/mdi/libraryMDI.cpp @@ -166,6 +167,7 @@ SET(DEVICE_SRC ./utils/DeviceKernelsGeneric.cc ./utils/DeviceDirectCCLWrapper.cc ./src/dft/densityCalculatorDevice.cc + ./src/dft/densityCalculatorDeviceKernels.cc ./src/dft/densityFirstOrderResponseCalculatorDevice.cc ./src/dftOperator/operatorDevice.cc ./src/dftOperator/kohnShamDFTOperatorDevice.cc @@ -184,6 +186,7 @@ SET(DEVICE_SRC ./src/solvers/linearSolverProblemDevice.cc ./src/poisson/poissonSolverProblemDevice.cc ./src/helmholtz/kerkerSolverProblemDevice.cc + ./utils/FEBasisOperationsDevice.cc ) ELSEIF ("${GPU_LANG}" STREQUAL "hip") @@ -193,6 +196,7 @@ SET(DEVICE_SRC ./utils/DeviceKernelsGeneric.cc ./utils/DeviceDirectCCLWrapper.cc ./src/dft/densityCalculatorDevice.cc + ./src/dft/densityCalculatorDeviceKernels.cc ./src/dft/densityFirstOrderResponseCalculatorDevice.cc ./src/dftOperator/operatorDevice.cc ./src/dftOperator/kohnShamDFTOperatorDevice.cc @@ -211,6 +215,7 @@ SET(DEVICE_SRC ./src/solvers/linearSolverProblemDevice.cc ./src/poisson/poissonSolverProblemDevice.cc ./src/helmholtz/kerkerSolverProblemDevice.cc + ./utils/FEBasisOperationsDevice.cc ) ENDIF() diff --git a/include/FEBasisOperations.h b/include/FEBasisOperations.h index d045898c1..f06eb0e84 100644 --- a/include/FEBasisOperations.h +++ b/include/FEBasisOperations.h @@ -22,6 +22,8 @@ #include #include #include +#include + namespace dftfe { namespace basis @@ -32,22 +34,53 @@ namespace dftfe update_values = 0x0001, - update_gradients = 0x0002, - - update_macrocell_map = 0x0004 + update_gradients = 0x0002 }; + + inline UpdateFlags + operator|(const UpdateFlags f1, const UpdateFlags f2) + { + 
return static_cast(static_cast(f1) | + static_cast(f2)); + } + + + + inline UpdateFlags & + operator|=(UpdateFlags &f1, const UpdateFlags f2) + { + f1 = f1 | f2; + return f1; + } + + + inline UpdateFlags operator&(const UpdateFlags f1, const UpdateFlags f2) + { + return static_cast(static_cast(f1) & + static_cast(f2)); + } + + + inline UpdateFlags & + operator&=(UpdateFlags &f1, const UpdateFlags f2) + { + f1 = f1 & f2; + return f1; + } + + template - class FEBasisOperations + class FEBasisOperationsBase { public: - FEBasisOperations( + FEBasisOperationsBase( dealii::MatrixFree<3, ValueTypeBasisData> &matrixFreeData, - std::vector *> + std::vector *> &constraintsVector); - ~FEBasisOperations() = default; + ~FEBasisOperationsBase() = default; void reinit(const unsigned int &blockSize, @@ -55,9 +88,157 @@ namespace dftfe const unsigned int &quadratureID, const UpdateFlags updateFlags = update_values); + // private: +#if defined(DFTFE_WITH_DEVICE) + using constraintInfoClass = + typename std::conditional::type; +#else + using constraintInfoClass = dftUtils::constraintMatrixInfo; +#endif + + + void - interpolate( + initializeIndexMaps(); + void + initializeFlattenedIndexMaps(); + + void + initializeConstraints(); + + void + initializeShapeFunctionAndJacobianData(); + + void + createMultiVector( + const unsigned int dofHandlerIndex, + const unsigned int blocksize, + dftfe::linearAlgebra::MultiVector + &multiVector) const; + + void + distribute( dftfe::linearAlgebra::MultiVector + &multiVector) const; + + + + constraintInfoClass d_constraintInfo; + std::vector *> + * d_constraintsVector; + const dealii::MatrixFree<3, ValueTypeBasisData> *d_matrixFreeDataPtr; + dftfe::utils::MemoryStorage + d_cellDofIndexToProcessDofIndexMap; + dftfe::utils::MemoryStorage + d_flattenedCellDofIndexToProcessDofIndexMap; + std::vector d_cellIndexToCellIdMap; + dftfe::utils::MemoryStorage + d_inverseJacobianData; + dftfe::utils::MemoryStorage d_JxWData; + dftfe::utils::MemoryStorage + 
d_shapeFunctionData; + dftfe::utils::MemoryStorage + d_shapeFunctionGradientData; + dftfe::utils::MemoryStorage + d_nonAffineReshapeIDs; + + unsigned int d_quadratureID; + unsigned int d_dofHandlerID; + unsigned int d_nVectors; + unsigned int d_nCells; + unsigned int d_nDofsPerCell; + unsigned int d_nQuadsPerCell; + bool areAllCellsAffine; + bool areAllCellsCartesian; + UpdateFlags d_updateFlags; + }; + template + class FEBasisOperations : FEBasisOperationsBase + {}; + + template + class FEBasisOperations + : public FEBasisOperationsBase + { + public: + using FEBasisOperationsBase< + ValueTypeBasisCoeff, + ValueTypeBasisData, + dftfe::utils::MemorySpace::HOST>::FEBasisOperationsBase; + + using FEBasisOperationsBase::d_nCells; + using FEBasisOperationsBase::d_nVectors; + using FEBasisOperationsBase< + ValueTypeBasisCoeff, + ValueTypeBasisData, + dftfe::utils::MemorySpace::HOST>::d_nQuadsPerCell; + using FEBasisOperationsBase< + ValueTypeBasisCoeff, + ValueTypeBasisData, + dftfe::utils::MemorySpace::HOST>::d_nDofsPerCell; + using FEBasisOperationsBase< + ValueTypeBasisCoeff, + ValueTypeBasisData, + dftfe::utils::MemorySpace::HOST>::areAllCellsAffine; + using FEBasisOperationsBase< + ValueTypeBasisCoeff, + ValueTypeBasisData, + dftfe::utils::MemorySpace::HOST>::areAllCellsCartesian; + using FEBasisOperationsBase< + ValueTypeBasisCoeff, + ValueTypeBasisData, + dftfe::utils::MemorySpace::HOST>::d_updateFlags; + using FEBasisOperationsBase< + ValueTypeBasisCoeff, + ValueTypeBasisData, + dftfe::utils::MemorySpace::HOST>::d_shapeFunctionData; + using FEBasisOperationsBase< + ValueTypeBasisCoeff, + ValueTypeBasisData, + dftfe::utils::MemorySpace::HOST>::d_shapeFunctionGradientData; + using FEBasisOperationsBase::d_JxWData; + using FEBasisOperationsBase< + ValueTypeBasisCoeff, + ValueTypeBasisData, + dftfe::utils::MemorySpace::HOST>::d_inverseJacobianData; + using FEBasisOperationsBase< + ValueTypeBasisCoeff, + ValueTypeBasisData, + 
dftfe::utils::MemorySpace::HOST>::d_cellIndexToCellIdMap; + using FEBasisOperationsBase< + ValueTypeBasisCoeff, + ValueTypeBasisData, + dftfe::utils::MemorySpace::HOST>::d_cellDofIndexToProcessDofIndexMap; + using FEBasisOperationsBase:: + d_flattenedCellDofIndexToProcessDofIndexMap; + using FEBasisOperationsBase< + ValueTypeBasisCoeff, + ValueTypeBasisData, + dftfe::utils::MemorySpace::HOST>::d_constraintsVector; + void + interpolate( + dftfe::linearAlgebra::MultiVector &nodalData, std::map> - * quadratureGradients = NULL, - bool useMacroCellSubCellOrdering = false) const; + *quadratureGradients = NULL) const; void - interpolate(dftfe::linearAlgebra::MultiVector &nodalData, - dftfe::utils::MemoryStorage - *quadratureValues, - dftfe::utils::MemoryStorage - * quadratureGradients = NULL, - bool useMacroCellSubCellOrdering = false) const; + interpolate( + dftfe::linearAlgebra::MultiVector + &nodalData, + dftfe::utils::MemoryStorage + *quadratureValues, + dftfe::utils::MemoryStorage + *quadratureGradients = NULL) const; void integrateWithBasis( - const std::map< - dealii::CellId, - dftfe::utils::MemoryStorage> - &quadratureValues, + std::map> + *quadratureValues, std::map> *quadratureGradients, - dftfe::linearAlgebra::MultiVector - & nodalData, - bool useMacroCellSubCellOrdering = false) const; + dftfe::linearAlgebra::MultiVector + &nodalData) const; void integrateWithBasis( - dftfe::utils::MemoryStorage + dftfe::utils::MemoryStorage *quadratureValues, - dftfe::utils::MemoryStorage + dftfe::utils::MemoryStorage *quadratureGradients, - dftfe::linearAlgebra::MultiVector - & nodalData, - bool useMacroCellSubCellOrdering = false) const; + dftfe::linearAlgebra::MultiVector + &nodalData) const; void extractToCellNodalData( - dftfe::linearAlgebra::MultiVector + dftfe::linearAlgebra::MultiVector &nodalData, - dftfe::utils::MemoryStorage - * cellNodalDataPtr, - bool useMacroCellSubCellOrdering = false) const; + dftfe::utils::MemoryStorage + *cellNodalDataPtr) const; void 
accumulateFromCellNodalData( - const dftfe::utils::MemoryStorage + const dftfe::utils::MemoryStorage *cellNodalDataPtr, - dftfe::linearAlgebra::MultiVector - & nodalData, - bool useMacroCellSubCellOrdering = false) const; + dftfe::linearAlgebra::MultiVector + &nodalData) const; - private: -#if defined(DFTFE_WITH_DEVICE) - using constraintInfoClass = - typename std::conditional::type; -#else - using constraintInfoClass = dftUtils::constraintMatrixInfo; -#endif + void + interpolateKernel( + const dftfe::linearAlgebra::MultiVector + &nodalData, + dftfe::utils::MemoryStorage + *quadratureValues, + dftfe::utils::MemoryStorage + * quadratureGradients, + const std::pair cellRange) const; + void + integrateWithBasisKernel( + const dftfe::utils::MemoryStorage + *quadratureValues, + const dftfe::utils::MemoryStorage + *quadratureGradients, + dftfe::linearAlgebra::MultiVector + & nodalData, + const std::pair cellRange) const; void - initializeIndexMaps(); + extractToCellNodalDataKernel( + const dftfe::linearAlgebra::MultiVector + &nodalData, + dftfe::utils::MemoryStorage + * cellNodalDataPtr, + const std::pair cellRange) const; void - initializeConstraints(); + accumulateFromCellNodalDataKernel( + const dftfe::utils::MemoryStorage + *cellNodalDataPtr, + dftfe::linearAlgebra::MultiVector + & nodalData, + const std::pair cellRange) const; + }; +#if defined(DFTFE_WITH_DEVICE) + template + class FEBasisOperations + : public FEBasisOperationsBase + { + public: + using FEBasisOperationsBase< + ValueTypeBasisCoeff, + ValueTypeBasisData, + dftfe::utils::MemorySpace::DEVICE>::FEBasisOperationsBase; + using FEBasisOperationsBase::d_nCells; + using FEBasisOperationsBase< + ValueTypeBasisCoeff, + ValueTypeBasisData, + dftfe::utils::MemorySpace::DEVICE>::d_nVectors; + using FEBasisOperationsBase< + ValueTypeBasisCoeff, + ValueTypeBasisData, + dftfe::utils::MemorySpace::DEVICE>::d_nQuadsPerCell; + using FEBasisOperationsBase< + ValueTypeBasisCoeff, + ValueTypeBasisData, + 
dftfe::utils::MemorySpace::DEVICE>::d_nDofsPerCell; + using FEBasisOperationsBase< + ValueTypeBasisCoeff, + ValueTypeBasisData, + dftfe::utils::MemorySpace::DEVICE>::areAllCellsAffine; + using FEBasisOperationsBase< + ValueTypeBasisCoeff, + ValueTypeBasisData, + dftfe::utils::MemorySpace::DEVICE>::areAllCellsCartesian; + using FEBasisOperationsBase< + ValueTypeBasisCoeff, + ValueTypeBasisData, + dftfe::utils::MemorySpace::DEVICE>::d_updateFlags; + using FEBasisOperationsBase< + ValueTypeBasisCoeff, + ValueTypeBasisData, + dftfe::utils::MemorySpace::DEVICE>::d_shapeFunctionData; + using FEBasisOperationsBase< + ValueTypeBasisCoeff, + ValueTypeBasisData, + dftfe::utils::MemorySpace::DEVICE>::d_shapeFunctionGradientData; + using FEBasisOperationsBase::d_JxWData; + using FEBasisOperationsBase< + ValueTypeBasisCoeff, + ValueTypeBasisData, + dftfe::utils::MemorySpace::DEVICE>::d_inverseJacobianData; + using FEBasisOperationsBase< + ValueTypeBasisCoeff, + ValueTypeBasisData, + dftfe::utils::MemorySpace::DEVICE>::d_cellIndexToCellIdMap; + using FEBasisOperationsBase< + ValueTypeBasisCoeff, + ValueTypeBasisData, + dftfe::utils::MemorySpace::DEVICE>::d_nonAffineReshapeIDs; + using FEBasisOperationsBase< + ValueTypeBasisCoeff, + ValueTypeBasisData, + dftfe::utils::MemorySpace::DEVICE>::d_cellDofIndexToProcessDofIndexMap; + using FEBasisOperationsBase:: + d_flattenedCellDofIndexToProcessDofIndexMap; + using FEBasisOperationsBase< + ValueTypeBasisCoeff, + ValueTypeBasisData, + dftfe::utils::MemorySpace::DEVICE>::d_constraintsVector; + dftfe::utils::deviceBlasHandle_t *d_deviceBlasHandlePtr; void - initializeShapeFunctionAndJacobianData(); + setDeviceBLASHandle( + dftfe::utils::deviceBlasHandle_t *deviceBlasHandlePtr); + + dftfe::utils::deviceBlasHandle_t & + getDeviceBLASHandle(); + void - interpolateHostKernel( + interpolate( dftfe::linearAlgebra::MultiVector + dftfe::utils::MemorySpace::DEVICE> + &nodalData, + std::map> + *quadratureValues, + std::map> + *quadratureGradients 
= NULL) const; + + + void + interpolate( + dftfe::linearAlgebra::MultiVector &nodalData, dftfe::utils::MemoryStorage + dftfe::utils::MemorySpace::DEVICE> *quadratureValues, dftfe::utils::MemoryStorage - * quadratureGradients, - std::pair cellRange, - bool useMacroCellSubCellOrdering = false) const; + dftfe::utils::MemorySpace::DEVICE> + *quadratureGradients = NULL) const; + + void + integrateWithBasis( + std::map> + *quadratureValues, + std::map> + *quadratureGradients, + dftfe::linearAlgebra::MultiVector + &nodalData) const; + void - integrateWithBasisHostKernel( + integrateWithBasis( dftfe::utils::MemoryStorage + dftfe::utils::MemorySpace::DEVICE> *quadratureValues, dftfe::utils::MemoryStorage + dftfe::utils::MemorySpace::DEVICE> *quadratureGradients, dftfe::linearAlgebra::MultiVector - & nodalData, - std::pair cellRange, - bool useMacroCellSubCellOrdering = false) const; - + dftfe::utils::MemorySpace::DEVICE> + &nodalData) const; void - extractToCellNodalDataHostKernel( + extractToCellNodalData( dftfe::linearAlgebra::MultiVector + dftfe::utils::MemorySpace::DEVICE> &nodalData, dftfe::utils::MemoryStorage - * cellNodalDataPtr, - std::pair cellRange, - bool useMacroCellSubCellOrdering = false) const; + dftfe::utils::MemorySpace::DEVICE> + *cellNodalDataPtr) const; void - accumulateFromCellNodalDataHostKernel( + accumulateFromCellNodalData( + const dftfe::utils::MemoryStorage + *cellNodalDataPtr, + dftfe::linearAlgebra::MultiVector + &nodalData) const; + + void + interpolateKernel( + const dftfe::linearAlgebra::MultiVector< + ValueTypeBasisCoeff, + dftfe::utils::MemorySpace::DEVICE> &nodalData, dftfe::utils::MemoryStorage + dftfe::utils::MemorySpace::DEVICE> + *quadratureValues, + dftfe::utils::MemoryStorage + * quadratureGradients, + const std::pair cellRange) const; + + void + integrateWithBasisKernel( + const dftfe::utils::MemoryStorage + *quadratureValues, + const dftfe::utils::MemoryStorage + *quadratureGradients, + dftfe::linearAlgebra::MultiVector + & 
nodalData, + const std::pair cellRange) const; + + + void + extractToCellNodalDataKernel( + const dftfe::linearAlgebra::MultiVector< + ValueTypeBasisCoeff, + dftfe::utils::MemorySpace::DEVICE> &nodalData, + dftfe::utils::MemoryStorage + * cellNodalDataPtr, + const std::pair cellRange) const; + + void + accumulateFromCellNodalDataKernel( + const dftfe::utils::MemoryStorage *cellNodalDataPtr, dftfe::linearAlgebra::MultiVector - & nodalData, - std::pair cellRange, - bool useMacroCellSubCellOrdering = false) const; + dftfe::utils::MemorySpace::DEVICE> + & nodalData, + const std::pair cellRange) const; + }; +#endif + template class FEBasisOperationsBase; +#if defined(DFTFE_WITH_DEVICE) + template class FEBasisOperationsBase; +#endif + template class FEBasisOperations; +#if defined(DFTFE_WITH_DEVICE) + template class FEBasisOperations; +#endif - constraintInfoClass d_constraintInfo; - std::vector *> - * d_constraintsVector; - const dealii::MatrixFree<3, ValueTypeBasisData> *d_matrixFreeDataPtr; - std::vector d_cellDofIndexToProcessDofIndexMap; - std::vector d_macroCellSubCellDofIndexToProcessDofIndexMap; - std::vector d_cellIndexToMacroCellSubCellIndexMap; - std::vector d_cellIndexToCellIdMap; - std::map> - d_inverseJacobianData; - std::map> - d_JxWData; - dftfe::utils::MemoryStorage - d_shapeFunctionData; - dftfe::utils::MemoryStorage - d_shapeFunctionGradientData; - unsigned int d_quadratureID; - unsigned int d_dofHandlerID; - unsigned int d_nVectors; - unsigned int d_nCells; - unsigned int d_nMacroCells; - unsigned int d_nDofsPerCell; - unsigned int d_nQuadsPerCell; - bool areAllCellsAffine; - UpdateFlags d_updateFlags; + } // end of namespace basis - }; // end of FEBasisOperations - } // end of namespace basis } // end of namespace dftfe #endif // dftfeBasisOperations_h diff --git a/include/chebyshevOrthogonalizedSubspaceIterationSolver.h b/include/chebyshevOrthogonalizedSubspaceIterationSolver.h index c0dadb055..929ef8534 100644 --- 
a/include/chebyshevOrthogonalizedSubspaceIterationSolver.h +++ b/include/chebyshevOrthogonalizedSubspaceIterationSolver.h @@ -64,17 +64,18 @@ namespace dftfe * @brief Solve a generalized eigen problem. */ void - solve(operatorDFTClass & operatorMatrix, - elpaScalaManager & elpaScala, - std::vector &eigenVectorsFlattened, - std::vector &eigenVectorsRotFracDensityFlattened, - const unsigned int totalNumberWaveFunctions, - std::vector & eigenValues, - std::vector & residuals, - const MPI_Comm & interBandGroupComm, - const bool computeResidual, - const bool useMixedPrec = false, - const bool isFirstScf = false); + solve(operatorDFTClass & operatorMatrix, + elpaScalaManager & elpaScala, + dataTypes::number * eigenVectorsFlattened, + dataTypes::number * eigenVectorsRotFracDensityFlattened, + const unsigned int totalNumberWaveFunctions, + const unsigned int localVectorSize, + std::vector &eigenValues, + std::vector &residuals, + const MPI_Comm & interBandGroupComm, + const bool computeResidual, + const bool useMixedPrec = false, + const bool isFirstScf = false); /** * @brief Solve a generalized eigen problem. diff --git a/include/densityCalculator.h b/include/densityCalculator.h new file mode 100644 index 000000000..8871665f1 --- /dev/null +++ b/include/densityCalculator.h @@ -0,0 +1,101 @@ +// --------------------------------------------------------------------- +// +// Copyright (c) 2017-2022 The Regents of the University of Michigan and DFT-FE +// authors. +// +// This file is part of the DFT-FE code. +// +// The DFT-FE code is free software; you can use it, redistribute +// it, and/or modify it under the terms of the GNU Lesser General +// Public License as published by the Free Software Foundation; either +// version 2.1 of the License, or (at your option) any later version. +// The full text of the license can be found in the file LICENSE at +// the top level of the DFT-FE distribution. 
+// +// --------------------------------------------------------------------- +// + +#ifndef densityCalculatorDevice_H_ +#define densityCalculatorDevice_H_ + +#include +#include +#include "dftParameters.h" +#include "FEBasisOperations.h" + +namespace dftfe +{ + template + void + computeRhoFromPSI( + const dftfe::utils::MemoryStorage *X, + const dftfe::utils::MemoryStorage *XFrac, + const unsigned int totalNumWaveFunctions, + const unsigned int Nfr, + const unsigned int numLocalDofs, + const std::vector> &eigenValues, + const double fermiEnergy, + const double fermiEnergyUp, + const double fermiEnergyDown, + std::unique_ptr< + dftfe::basis::FEBasisOperations> + & basisOperationsPtr, + const unsigned int matrixFreeDofhandlerIndex, + const dealii::DoFHandler<3> & dofHandler, + const unsigned int totalLocallyOwnedCells, + const unsigned int numberNodesPerElement, + const unsigned int numQuadPoints, + const std::vector & kPointWeights, + std::map> *rhoValues, + std::map> *gradRhoValues, + std::map> *rhoValuesSpinPolarized, + std::map> *gradRhoValuesSpinPolarized, + const bool isEvaluateGradRho, + const MPI_Comm & mpiCommParent, + const MPI_Comm & interpoolcomm, + const MPI_Comm & interBandGroupComm, + const dftParameters & dftParams, + const bool spectrumSplit, + const bool use2pPlusOneGLQuad = false); + + template + void + computeRhoGradRhoFromInterpolatedValues( + std::unique_ptr< + dftfe::basis:: + FEBasisOperations> + & basisOperationsPtr, + const std::pair cellRange, + const std::pair vecRange, + double * partialOccupVec, + NumberType * wfcQuadPointData, + NumberType * gradWfcQuadPointData, + double * rhoCellsWfcContributions, + double * gradRhoCellsWfcContributions, + double * rho, + double * gradRho, + const bool isEvaluateGradRho); + +#if defined(DFTFE_WITH_DEVICE) + template + void + computeRhoGradRhoFromInterpolatedValues( + std::unique_ptr< + dftfe::basis::FEBasisOperations> + & basisOperationsPtr, + const std::pair cellRange, + const std::pair vecRange, + 
double * partialOccupVec, + NumberType * wfcQuadPointData, + NumberType * gradWfcQuadPointData, + double * rhoCellsWfcContributions, + double * gradRhoCellsWfcContributions, + double * rho, + double * gradRho, + const bool isEvaluateGradRho); +#endif + +} // namespace dftfe +#endif diff --git a/include/densityCalculatorCPU.h b/include/densityCalculatorCPU.h index e324cdad6..05300ae50 100644 --- a/include/densityCalculatorCPU.h +++ b/include/densityCalculatorCPU.h @@ -20,6 +20,7 @@ #include "headers.h" #include "operator.h" +#include "FEBasisOperations.h" #include "dftParameters.h" namespace dftfe @@ -33,16 +34,20 @@ namespace dftfe template void computeRhoFromPSICPU( - const std::vector> & X, - const std::vector> & XFrac, - const unsigned int totalNumWaveFunctions, - const unsigned int Nfr, - const unsigned int numLocalDofs, - const std::vector> & eigenValues, - const double fermiEnergy, - const double fermiEnergyUp, - const double fermiEnergyDown, - operatorDFTClass & operatorMatrix, + const T * X, + const T * XFrac, + const unsigned int totalNumWaveFunctions, + const unsigned int Nfr, + const unsigned int numLocalDofs, + const std::vector> &eigenValues, + const double fermiEnergy, + const double fermiEnergyUp, + const double fermiEnergyDown, + operatorDFTClass & operatorMatrix, + std::unique_ptr< + dftfe::basis:: + FEBasisOperations> + & basisOperationsPtrHost, const dealii::DoFHandler<3> & dofHandler, const unsigned int totalLocallyOwnedCells, const unsigned int numberNodesPerElement, diff --git a/include/densityCalculatorDevice.h b/include/densityCalculatorDevice.h index 847efc733..774b67adf 100644 --- a/include/densityCalculatorDevice.h +++ b/include/densityCalculatorDevice.h @@ -22,6 +22,7 @@ # include # include # include "dftParameters.h" +# include "FEBasisOperations.h" namespace dftfe { @@ -30,16 +31,21 @@ namespace dftfe template void computeRhoFromPSI( - const NumberType * X, - const NumberType * XFrac, - const unsigned int totalNumWaveFunctions, - const 
unsigned int Nfr, - const unsigned int numLocalDofs, - const std::vector> & eigenValues, - const double fermiEnergy, - const double fermiEnergyUp, - const double fermiEnergyDown, - operatorDFTDeviceClass & operatorMatrix, + const NumberType * X, + const NumberType * XFrac, + const unsigned int totalNumWaveFunctions, + const unsigned int Nfr, + const unsigned int numLocalDofs, + const std::vector> &eigenValues, + const double fermiEnergy, + const double fermiEnergyUp, + const double fermiEnergyDown, + operatorDFTDeviceClass & operatorMatrix, + std::unique_ptr< + dftfe::basis::FEBasisOperations> + & basisOperationsPtrDevice, const unsigned int matrixFreeDofhandlerIndex, const dealii::DoFHandler<3> & dofHandler, const unsigned int totalLocallyOwnedCells, diff --git a/include/densityFirstOrderResponseCalculator.h b/include/densityFirstOrderResponseCalculator.h index c674d82d0..4f5bb94f6 100644 --- a/include/densityFirstOrderResponseCalculator.h +++ b/include/densityFirstOrderResponseCalculator.h @@ -33,8 +33,8 @@ namespace dftfe template void computeRhoFirstOrderResponseCPU( - const std::vector> & X, - const std::vector> & XPrime, + const NumberType * X, + const NumberType * XPrime, const std::vector> & densityMatDerFermiEnergy, const unsigned int totalNumWaveFunctions, const unsigned int numLocalDofs, @@ -59,8 +59,8 @@ namespace dftfe template void computeRhoFirstOrderResponseCPUMixedPrec( - const std::vector> & X, - const std::vector> & XPrime, + const NumberType * X, + const NumberType * XPrime, const std::vector> & densityMatDerFermiEnergy, const unsigned int totalNumWaveFunctions, const unsigned int numLocalDofs, diff --git a/include/deviceKernelsGeneric.h b/include/deviceKernelsGeneric.h index 6e6b631af..3703bdf02 100644 --- a/include/deviceKernelsGeneric.h +++ b/include/deviceKernelsGeneric.h @@ -86,6 +86,17 @@ namespace dftfe ValueType2 * copyToVec); + template + void + stridedCopyConstantStride(const dftfe::size_type blockSize, + const dftfe::size_type 
strideTo, + const dftfe::size_type strideFrom, + const dftfe::size_type numBlocks, + const dftfe::size_type startingToId, + const dftfe::size_type startingFromId, + const ValueType1 * copyFromVec, + ValueType2 * copyToVec); + template void diff --git a/include/dft.h b/include/dft.h index b73b72aad..98009f53b 100644 --- a/include/dft.h +++ b/include/dft.h @@ -23,6 +23,7 @@ #include #include #include +#include #include #include @@ -1230,6 +1231,18 @@ namespace dftfe unsigned int d_densityQuadratureId; unsigned int d_densityQuadratureIdElectro; dealii::MatrixFree<3, double> matrix_free_data, d_matrixFreeDataPRefined; + std::unique_ptr< + dftfe::basis::FEBasisOperations> + basisOperationsPtrHost; +#if defined(DFTFE_WITH_DEVICE) + std::unique_ptr< + dftfe::basis::FEBasisOperations> + basisOperationsPtrDevice; +#endif std::map> d_supportPoints, d_supportPointsPRefined, d_supportPointsEigen; std::vector *> d_constraintsVector; @@ -1340,14 +1353,24 @@ namespace dftfe std::vector> d_densityMatDerFermiEnergy; /// Spectrum split higher eigenvalues computed in Rayleigh-Ritz step - std::vector> eigenValuesRRSplit; - std::vector> d_eigenVectorsFlattened; - std::vector> d_eigenVectorsFlattenedSTL; - std::vector> - d_eigenVectorsRotFracDensityFlattenedSTL; - - std::vector> - d_eigenVectorsDensityMatrixPrimeSTL; + std::vector> eigenValuesRRSplit; + // std::vector> + // d_eigenVectorsFlattened; std::vector> + // d_eigenVectorsFlattenedSTL; + dftfe::utils::MemoryStorage + d_eigenVectorsFlattenedHost; + // std::vector> + // d_eigenVectorsRotFracDensityFlattenedSTL; + + // std::vector> + // d_eigenVectorsDensityMatrixPrimeSTL; + dftfe::utils::MemoryStorage + d_eigenVectorsRotFracDensityFlattenedHost; + dftfe::utils::MemoryStorage + d_eigenVectorsDensityMatrixPrimeHost; /// device eigenvectors #ifdef DFTFE_WITH_DEVICE diff --git a/include/forceWfcContractions.h b/include/forceWfcContractions.h index af612a197..70511ec7a 100644 --- a/include/forceWfcContractions.h +++ 
b/include/forceWfcContractions.h @@ -28,14 +28,14 @@ namespace dftfe { void wfcContractionsForceKernelsAllH( - operatorDFTClass & operatorMatrix, - const std::vector> &X, - const unsigned int spinPolarizedFlag, - const unsigned int spinIndex, - const std::vector> & eigenValuesH, - const std::vector> & partialOccupanciesH, - const std::vector & kPointCoordinates, - const unsigned int *nonTrivialIdToElemIdMapH, + operatorDFTClass & operatorMatrix, + const dataTypes::number * X, + const unsigned int spinPolarizedFlag, + const unsigned int spinIndex, + const std::vector> &eigenValuesH, + const std::vector> &partialOccupanciesH, + const std::vector & kPointCoordinates, + const unsigned int * nonTrivialIdToElemIdMapH, const unsigned int *projecterKetTimesFlattenedVectorLocalIdsH, const unsigned int MLoc, const unsigned int N, diff --git a/include/kohnShamDFTOperator.h b/include/kohnShamDFTOperator.h index 942255f3a..39754397f 100644 --- a/include/kohnShamDFTOperator.h +++ b/include/kohnShamDFTOperator.h @@ -121,9 +121,10 @@ node is stored * @return ProjMatrix projected small matrix */ void - XtHX(const std::vector &src, - const unsigned int numberComponents, - std::vector & ProjHam); + XtHX(const dataTypes::number * src, + const unsigned int numberComponents, + const unsigned int numberLocalDofs, + std::vector &ProjHam); /** * @brief Compute projection of the operator into a subspace spanned by a given orthogonal basis HConj=X^{T}*HConj*XConj @@ -139,8 +140,9 @@ node is stored * also avoids creation of another full X memory. 
*/ void - XtHX(const std::vector & X, + XtHX(const dataTypes::number * X, const unsigned int numberComponents, + const unsigned int numberLocalDofs, const std::shared_ptr &processGrid, dftfe::ScaLAPACKMatrix & projHamPar, const bool onlyHPrimePartForFirstOrderDensityMatResponse = false); @@ -161,9 +163,10 @@ node is stored */ void XtHXMixedPrec( - const std::vector & X, + const dataTypes::number * X, const unsigned int N, const unsigned int Ncore, + const unsigned int numberLocalDofs, const std::shared_ptr &processGrid, dftfe::ScaLAPACKMatrix & projHamPar, const bool onlyHPrimePartForFirstOrderDensityMatResponse = false); diff --git a/include/linearAlgebraOperations.h b/include/linearAlgebraOperations.h index bb361e976..cbb5c9f5a 100644 --- a/include/linearAlgebraOperations.h +++ b/include/linearAlgebraOperations.h @@ -584,8 +584,9 @@ namespace dftfe */ template void - gramSchmidtOrthogonalization(std::vector & X, + gramSchmidtOrthogonalization(T * X, const unsigned int numberComponents, + const unsigned int numberDofs, const MPI_Comm & mpiComm); @@ -621,8 +622,9 @@ namespace dftfe template unsigned int pseudoGramSchmidtOrthogonalization(elpaScalaManager & elpaScala, - std::vector & X, + T * X, const unsigned int numberComponents, + const unsigned int numberDofs, const MPI_Comm & mpiCommParent, const MPI_Comm & interBandGroupComm, const MPI_Comm & mpiCommDomain, @@ -647,8 +649,9 @@ namespace dftfe void rayleighRitzGEP(operatorDFTClass & operatorMatrix, elpaScalaManager & elpaScala, - std::vector & X, + T * X, const unsigned int numberComponents, + const unsigned int numberDofs, const MPI_Comm & mpiCommParent, const MPI_Comm & interBandGroupComm, const MPI_Comm & mpiCommDomain, @@ -674,8 +677,9 @@ namespace dftfe void rayleighRitz(operatorDFTClass & operatorMatrix, elpaScalaManager & elpaScala, - std::vector & X, + T * X, const unsigned int numberComponents, + const unsigned int numberDofs, const MPI_Comm & mpiCommParent, const MPI_Comm & interBandGroupComm, const 
MPI_Comm & mpiCommDomain, @@ -702,9 +706,10 @@ namespace dftfe void rayleighRitzGEPSpectrumSplitDirect(operatorDFTClass & operatorMatrix, elpaScalaManager & elpaScala, - std::vector & X, - std::vector & Y, + T * X, + T * Y, const unsigned int numberComponents, + const unsigned int numberDofs, const unsigned int numberCoreStates, const MPI_Comm & mpiCommParent, const MPI_Comm & interBandGroupComm, @@ -731,18 +736,19 @@ namespace dftfe */ template void - rayleighRitzSpectrumSplitDirect(operatorDFTClass & operatorMatrix, - elpaScalaManager & elpaScala, - const std::vector &X, - std::vector & Y, - const unsigned int numberComponents, - const unsigned int numberCoreStates, - const MPI_Comm & mpiCommParent, - const MPI_Comm & interBandGroupComm, - const MPI_Comm & mpiCommDomain, - const bool useMixedPrec, - std::vector & eigenValues, - const dftParameters & dftParams); + rayleighRitzSpectrumSplitDirect(operatorDFTClass & operatorMatrix, + elpaScalaManager & elpaScala, + const T * X, + T * Y, + const unsigned int numberComponents, + const unsigned int numberDofs, + const unsigned int numberCoreStates, + const MPI_Comm & mpiCommParent, + const MPI_Comm & interBandGroupComm, + const MPI_Comm & mpiCommDomain, + const bool useMixedPrec, + std::vector &eigenValues, + const dftParameters &dftParams); /** @brief Compute residual norm associated with eigenValue problem of the given operator @@ -757,8 +763,10 @@ namespace dftfe template void computeEigenResidualNorm(operatorDFTClass & operatorMatrix, - std::vector & X, + T * X, const std::vector &eigenValues, + const unsigned int numberComponents, + const unsigned int numberDofs, const MPI_Comm & mpiCommParent, const MPI_Comm & mpiCommDomain, const MPI_Comm & interBandGroupComm, @@ -772,8 +780,9 @@ namespace dftfe void densityMatrixEigenBasisFirstOrderResponse( operatorDFTClass & operatorMatrix, - std::vector & X, + T * X, const unsigned int N, + const unsigned int numberLocalDofs, const MPI_Comm & mpiCommParent, const MPI_Comm & 
mpiCommDomain, const MPI_Comm & interBandGroupComm, diff --git a/include/operator.h b/include/operator.h index 7395eb718..3c74807d9 100644 --- a/include/operator.h +++ b/include/operator.h @@ -171,9 +171,10 @@ namespace dftfe * @param ProjMatrix projected small matrix */ virtual void - XtHX(const std::vector &X, - const unsigned int numberComponents, - std::vector & ProjHam) = 0; + XtHX(const dataTypes::number * X, + const unsigned int numberComponents, + const unsigned int numberLocalDofs, + std::vector &ProjHam) = 0; /** * @brief Compute projection of the operator into a subspace spanned by a given orthogonal basis HProjConj=X^{T}*HConj*XConj @@ -185,8 +186,9 @@ namespace dftfe * of the operation into the given subspace */ virtual void - XtHX(const std::vector & X, + XtHX(const dataTypes::number * X, const unsigned int numberComponents, + const unsigned int numberLocalDofs, const std::shared_ptr &processGrid, dftfe::ScaLAPACKMatrix & projHamPar, const bool onlyHPrimePartForFirstOrderDensityMatResponse = false) = 0; @@ -207,9 +209,10 @@ namespace dftfe */ virtual void XtHXMixedPrec( - const std::vector & X, + const dataTypes::number * X, const unsigned int totalNumberComponents, const unsigned int singlePrecComponents, + const unsigned int numberLocalDofs, const std::shared_ptr &processGrid, dftfe::ScaLAPACKMatrix & projHamPar, const bool onlyHPrimePartForFirstOrderDensityMatResponse = false) = 0; diff --git a/include/vectorUtilities.h b/include/vectorUtilities.h index 5fbbcf4f7..890ddb1a1 100644 --- a/include/vectorUtilities.h +++ b/include/vectorUtilities.h @@ -169,8 +169,9 @@ namespace dftfe */ void copyFlattenedSTLVecToSingleCompVec( - const std::vector> & flattenedArray, + const std::complex * flattenedArray, const unsigned int totalNumberComponents, + const unsigned int localVectorSize, const std::pair componentIndexRange, const std::vector &localProcDofIndicesReal, @@ -180,8 +181,9 @@ namespace dftfe void copyFlattenedSTLVecToSingleCompVec( - const 
std::vector> & flattenedArray, + const std::complex * flattenedArray, const unsigned int totalNumberComponents, + const unsigned int localVectorSize, const std::pair componentIndexRange, std::vector> &componentVectors); @@ -206,8 +208,9 @@ namespace dftfe */ void copyFlattenedSTLVecToSingleCompVec( - const std::vector & flattenedArray, + const double * flattenedArray, const unsigned int totalNumberComponents, + const unsigned int localVectorSize, const std::pair componentIndexRange, std::vector> & componentVectors); diff --git a/src/dft/computeOutputDensityDirectionalDerivative.cc b/src/dft/computeOutputDensityDirectionalDerivative.cc index 599550180..c1deaa4ff 100644 --- a/src/dft/computeOutputDensityDirectionalDerivative.cc +++ b/src/dft/computeOutputDensityDirectionalDerivative.cc @@ -53,7 +53,7 @@ namespace dftfe d_eigenVectorsFlattenedDevice.begin()); #endif if (!d_dftParamsPtr->useDevice) - d_eigenVectorsDensityMatrixPrimeSTL = d_eigenVectorsFlattenedSTL; + d_eigenVectorsDensityMatrixPrimeHost = d_eigenVectorsFlattenedHost; // set up linear solver @@ -465,7 +465,7 @@ namespace dftfe d_eigenVectorsDensityMatrixPrimeFlattenedDevice.begin(), d_densityMatDerFermiEnergy, d_numEigenValues, - d_eigenVectorsFlattenedSTL[0].size() / d_numEigenValues, + matrix_free_data.get_vector_partitioner()->locally_owned_size(), kohnShamDFTEigenOperatorDevice, d_eigenDofHandlerIndex, dofHandler, @@ -488,7 +488,7 @@ namespace dftfe d_eigenVectorsDensityMatrixPrimeFlattenedDevice.begin(), d_densityMatDerFermiEnergy, d_numEigenValues, - d_eigenVectorsFlattenedSTL[0].size() / d_numEigenValues, + matrix_free_data.get_vector_partitioner()->locally_owned_size(), kohnShamDFTEigenOperatorDevice, d_eigenDofHandlerIndex, dofHandler, @@ -511,11 +511,11 @@ namespace dftfe if (d_dftParamsPtr->singlePrecLRD) computeRhoFirstOrderResponseCPUMixedPrec( - d_eigenVectorsFlattenedSTL, - d_eigenVectorsDensityMatrixPrimeSTL, + d_eigenVectorsFlattenedHost.data(), + 
d_eigenVectorsDensityMatrixPrimeHost.data(), d_densityMatDerFermiEnergy, d_numEigenValues, - d_eigenVectorsFlattenedSTL[0].size() / d_numEigenValues, + matrix_free_data.get_vector_partitioner()->locally_owned_size(), kohnShamDFTEigenOperatorCPU, d_eigenDofHandlerIndex, dofHandler, @@ -533,11 +533,11 @@ namespace dftfe *d_dftParamsPtr); else computeRhoFirstOrderResponseCPU( - d_eigenVectorsFlattenedSTL, - d_eigenVectorsDensityMatrixPrimeSTL, + d_eigenVectorsFlattenedHost.data(), + d_eigenVectorsDensityMatrixPrimeHost.data(), d_densityMatDerFermiEnergy, d_numEigenValues, - d_eigenVectorsFlattenedSTL[0].size() / d_numEigenValues, + matrix_free_data.get_vector_partitioner()->locally_owned_size(), kohnShamDFTEigenOperatorCPU, d_eigenDofHandlerIndex, dofHandler, diff --git a/src/dft/density.cc b/src/dft/density.cc index d170c28c8..7041a23e7 100644 --- a/src/dft/density.cc +++ b/src/dft/density.cc @@ -19,10 +19,7 @@ // source file for electron density related computations #include -#include -#ifdef DFTFE_WITH_DEVICE -# include -#endif +#include namespace dftfe { @@ -202,18 +199,18 @@ namespace dftfe #ifdef DFTFE_WITH_DEVICE if (d_dftParamsPtr->useDevice) - Device::computeRhoFromPSI( - d_eigenVectorsFlattenedDevice.begin(), - d_eigenVectorsRotFracFlattenedDevice.begin(), + computeRhoFromPSI( + &d_eigenVectorsFlattenedDevice, + &d_eigenVectorsRotFracFlattenedDevice, d_numEigenValues, d_numEigenValuesRR, - d_eigenVectorsFlattenedSTL[0].size() / d_numEigenValues, + matrix_free_data.get_vector_partitioner()->locally_owned_size(), eigenValues, fermiEnergy, fermiEnergyUp, fermiEnergyDown, - kohnShamDFTEigenOperator, - d_eigenDofHandlerIndex, + basisOperationsPtrDevice, + 0, dofHandler, matrix_free_data.n_physical_cells(), matrix_free_data.get_dofs_per_cell(d_densityDofHandlerIndex), @@ -230,20 +227,22 @@ namespace dftfe interBandGroupComm, *d_dftParamsPtr, isConsiderSpectrumSplitting && - d_numEigenValues != d_numEigenValuesRR); + d_numEigenValues != d_numEigenValuesRR, + 
false); #endif if (!d_dftParamsPtr->useDevice) - computeRhoFromPSICPU( - d_eigenVectorsFlattenedSTL, - d_eigenVectorsRotFracDensityFlattenedSTL, + computeRhoFromPSI( + &d_eigenVectorsFlattenedHost, + &d_eigenVectorsRotFracDensityFlattenedHost, d_numEigenValues, d_numEigenValuesRR, - d_eigenVectorsFlattenedSTL[0].size() / d_numEigenValues, + matrix_free_data.get_vector_partitioner()->locally_owned_size(), eigenValues, fermiEnergy, fermiEnergyUp, fermiEnergyDown, - kohnShamDFTEigenOperatorCPU, + basisOperationsPtrHost, + 0, dofHandler, matrix_free_data.n_physical_cells(), matrix_free_data.get_dofs_per_cell(d_densityDofHandlerIndex), @@ -685,18 +684,18 @@ namespace dftfe // nodes in each cell #ifdef DFTFE_WITH_DEVICE if (d_dftParamsPtr->useDevice) - Device::computeRhoFromPSI( - d_eigenVectorsFlattenedDevice.begin(), - d_eigenVectorsRotFracFlattenedDevice.begin(), + computeRhoFromPSI( + &d_eigenVectorsFlattenedDevice, + &d_eigenVectorsRotFracFlattenedDevice, d_numEigenValues, d_numEigenValuesRR, - d_eigenVectorsFlattenedSTL[0].size() / d_numEigenValues, + matrix_free_data.get_vector_partitioner()->locally_owned_size(), eigenValues, fermiEnergy, fermiEnergyUp, fermiEnergyDown, - kohnShamDFTEigenOperator, - d_eigenDofHandlerIndex, + basisOperationsPtrDevice, + 0, dofHandler, matrix_free_data.n_physical_cells(), matrix_free_data.get_dofs_per_cell(d_densityDofHandlerIndex), @@ -715,17 +714,18 @@ namespace dftfe true); #endif if (!d_dftParamsPtr->useDevice) - computeRhoFromPSICPU( - d_eigenVectorsFlattenedSTL, - d_eigenVectorsRotFracDensityFlattenedSTL, + computeRhoFromPSI( + &d_eigenVectorsFlattenedHost, + &d_eigenVectorsRotFracDensityFlattenedHost, d_numEigenValues, d_numEigenValuesRR, - d_eigenVectorsFlattenedSTL[0].size() / d_numEigenValues, + matrix_free_data.get_vector_partitioner()->locally_owned_size(), eigenValues, fermiEnergy, fermiEnergyUp, fermiEnergyDown, - kohnShamDFTEigenOperatorCPU, + basisOperationsPtrHost, + 0, dofHandler, 
matrix_free_data.n_physical_cells(), matrix_free_data.get_dofs_per_cell(d_densityDofHandlerIndex), diff --git a/src/dft/densityCalculator.cc b/src/dft/densityCalculator.cc new file mode 100644 index 000000000..6f3e408cf --- /dev/null +++ b/src/dft/densityCalculator.cc @@ -0,0 +1,817 @@ +// --------------------------------------------------------------------- +// +// Copyright (c) 2017-2022 The Regents of the University of Michigan and DFT-FE +// authors. +// +// This file is part of the DFT-FE code. +// +// The DFT-FE code is free software; you can use it, redistribute +// it, and/or modify it under the terms of the GNU Lesser General +// Public License as published by the Free Software Foundation; either +// version 2.1 of the License, or (at your option) any later version. +// The full text of the license can be found in the file LICENSE at +// the top level of the DFT-FE distribution. +// +// --------------------------------------------------------------------- +// +// @author Sambit Das +// + +// source file for electron density related computations +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace dftfe +{ + template + void + computeRhoFromPSI( + const dftfe::utils::MemoryStorage *X, + const dftfe::utils::MemoryStorage *XFrac, + const unsigned int totalNumWaveFunctions, + const unsigned int Nfr, + const unsigned int numLocalDofs, + const std::vector> &eigenValues, + const double fermiEnergy, + const double fermiEnergyUp, + const double fermiEnergyDown, + std::unique_ptr< + dftfe::basis::FEBasisOperations> + & basisOperationsPtr, + const unsigned int matrixFreeDofhandlerIndex, + const dealii::DoFHandler<3> & dofHandler, + const unsigned int totalLocallyOwnedCells, + const unsigned int numNodesPerElement, + const unsigned int numQuadPoints, + const std::vector & kPointWeights, + std::map> *rhoValues, + std::map> *gradRhoValues, + std::map> *rhoValuesSpinPolarized, + 
std::map> *gradRhoValuesSpinPolarized, + const bool isEvaluateGradRho, + const MPI_Comm & mpiCommParent, + const MPI_Comm & interpoolcomm, + const MPI_Comm & interBandGroupComm, + const dftParameters & dftParams, + const bool spectrumSplit, + const bool use2pPlusOneGLQuad) + { + int this_process; + MPI_Comm_rank(mpiCommParent, &this_process); + // dftfe::utils::deviceSynchronize(); + // MPI_Barrier(mpiCommParent); + // double device_time = MPI_Wtime(); + const unsigned int numKPoints = kPointWeights.size(); + + // band group parallelization data structures + const unsigned int numberBandGroups = + dealii::Utilities::MPI::n_mpi_processes(interBandGroupComm); + const unsigned int bandGroupTaskId = + dealii::Utilities::MPI::this_mpi_process(interBandGroupComm); + std::vector bandGroupLowHighPlusOneIndices; + dftUtils::createBandParallelizationIndices(interBandGroupComm, + totalNumWaveFunctions, + bandGroupLowHighPlusOneIndices); + + const unsigned int BVec = + std::min(dftParams.chebyWfcBlockSize, totalNumWaveFunctions); + + const double spinPolarizedFactor = + (dftParams.spinPolarized == 1) ? 1.0 : 2.0; + const unsigned int numSpinComponents = + (dftParams.spinPolarized == 1) ? 2 : 1; + + const NumberType zero = 0; + const NumberType scalarCoeffAlphaRho = 1.0; + const NumberType scalarCoeffBetaRho = 1.0; + const NumberType scalarCoeffAlphaGradRho = 1.0; + const NumberType scalarCoeffBetaGradRho = 1.0; + + const unsigned int cellsBlockSize = + memorySpace == dftfe::utils::MemorySpace::DEVICE ? 
50 : 1; + const unsigned int numCellBlocks = totalLocallyOwnedCells / cellsBlockSize; + const unsigned int remCellBlockSize = + totalLocallyOwnedCells - numCellBlocks * cellsBlockSize; + + std::vector> + wfcQuadPointData(numSpinComponents); + std::vector> + gradWfcQuadPointData(numSpinComponents); + std::vector> + rhoWfcContributions(numSpinComponents); + std::vector> + gradRhoWfcContributions(numSpinComponents); + dftfe::utils::MemoryStorage + rhoHost; + + dftfe::utils::MemoryStorage + gradRhoHost; +#if defined(DFTFE_WITH_DEVICE) + dftfe::utils::MemoryStorage rho; + dftfe::utils::MemoryStorage gradRho; +#else + auto &rho = rhoHost; + auto &gradRho = gradRhoHost; +#endif + + rho.resize(totalLocallyOwnedCells * numQuadPoints * numSpinComponents, 0.0); + for (unsigned int spinIndex = 0; spinIndex < numSpinComponents; ++spinIndex) + { + wfcQuadPointData[spinIndex].resize(cellsBlockSize * numQuadPoints * + BVec, + zero); + + if (memorySpace == dftfe::utils::MemorySpace::DEVICE) + rhoWfcContributions[spinIndex].resize(cellsBlockSize * numQuadPoints * + BVec, + 0.0); + } + if (isEvaluateGradRho) + { + gradRho.resize(totalLocallyOwnedCells * numQuadPoints * 3 * + numSpinComponents, + 0.0); + for (unsigned int spinIndex = 0; spinIndex < numSpinComponents; + ++spinIndex) + { + gradWfcQuadPointData[spinIndex].resize(cellsBlockSize * + numQuadPoints * BVec * 3, + zero); + if (memorySpace == dftfe::utils::MemorySpace::DEVICE) + gradRhoWfcContributions[spinIndex].resize( + cellsBlockSize * numQuadPoints * BVec * 3, 0.0); + } + } + + + + std::vector< + dftfe::utils::MemoryStorage> + partialOccupVecHost( + numSpinComponents, + dftfe::utils::MemoryStorage( + BVec, 0.0)); +#if defined(DFTFE_WITH_DEVICE) + std::vector> + partialOccupVec(numSpinComponents); +#else + auto &partialOccupVec = partialOccupVecHost; +#endif + + std::vector> + flattenedArrayBlock(numSpinComponents); + for (unsigned int spinIndex = 0; spinIndex < numSpinComponents; ++spinIndex) + 
basisOperationsPtr->createMultiVector(matrixFreeDofhandlerIndex,
+                                            BVec,
+                                            flattenedArrayBlock[spinIndex]);
+
+    dftfe::utils::MemoryStorage cellWaveFunctionMatrix(
+      cellsBlockSize * numNodesPerElement * BVec);
+
+    for (unsigned int kPoint = 0; kPoint < kPointWeights.size(); ++kPoint)
+      {
+        // rho/gradRho are zero-filled by resize() above and must keep summing
+        // over every k-point on this rank, so they are NOT reset per k-point.
+        for (unsigned int spinIndex = 0; spinIndex < numSpinComponents;
+             ++spinIndex)
+          {
+            wfcQuadPointData[spinIndex].setValue(zero);
+            gradWfcQuadPointData[spinIndex].setValue(zero);
+            rhoWfcContributions[spinIndex].setValue(0.0);
+            gradRhoWfcContributions[spinIndex].setValue(0.0);
+          }
+        for (unsigned int jvec = 0; jvec < totalNumWaveFunctions; jvec += BVec)
+          {
+            const unsigned int currentBlockSize =
+              std::min(BVec, totalNumWaveFunctions - jvec);
+            for (unsigned int spinIndex = 0; spinIndex < numSpinComponents;
+                 ++spinIndex)
+              if (currentBlockSize !=
+                  flattenedArrayBlock[spinIndex].numVectors())
+                basisOperationsPtr->createMultiVector(
+                  matrixFreeDofhandlerIndex,
+                  currentBlockSize,
+                  flattenedArrayBlock[spinIndex]);
+            if ((jvec + currentBlockSize) <=
+                  bandGroupLowHighPlusOneIndices[2 * bandGroupTaskId + 1] &&
+                (jvec + currentBlockSize) >
+                  bandGroupLowHighPlusOneIndices[2 * bandGroupTaskId])
+              {
+                for (unsigned int spinIndex = 0; spinIndex < numSpinComponents;
+                     ++spinIndex)
+                  if (spectrumSplit)
+                    {
+                      partialOccupVecHost[spinIndex].setValue(
+                        kPointWeights[kPoint] * spinPolarizedFactor);
+                    }
+                  else
+                    {
+                      if (dftParams.constraintMagnetization)
+                        {
+                          const double fermiEnergyConstraintMag =
+                            spinIndex == 0 ?
fermiEnergyUp : fermiEnergyDown;
+                          for (unsigned int iEigenVec = 0;
+                               iEigenVec < currentBlockSize;
+                               ++iEigenVec)
+                            {
+                              if (eigenValues[kPoint][totalNumWaveFunctions *
+                                                        spinIndex +
+                                                      jvec + iEigenVec] >
+                                  fermiEnergyConstraintMag)
+                                *(partialOccupVecHost[spinIndex].begin() +
+                                  iEigenVec) = 0;
+                              else
+                                *(partialOccupVecHost[spinIndex].begin() +
+                                  iEigenVec) =
+                                  kPointWeights[kPoint] * spinPolarizedFactor;
+                            }
+                        }
+                      else
+                        {
+                          for (unsigned int iEigenVec = 0;
+                               iEigenVec < currentBlockSize;
+                               ++iEigenVec)
+                            {
+                              *(partialOccupVecHost[spinIndex].begin() +
+                                iEigenVec) =
+                                dftUtils::getPartialOccupancy(
+                                  eigenValues[kPoint][totalNumWaveFunctions *
+                                                        spinIndex +
+                                                      jvec + iEigenVec],
+                                  fermiEnergy,
+                                  C_kb,
+                                  dftParams.TVal) *
+                                kPointWeights[kPoint] * spinPolarizedFactor;
+                            }
+                        }
+                    } // per-spin host fill ends here; the upload below must also run when spectrumSplit is true
+#if defined(DFTFE_WITH_DEVICE)
+                for (unsigned int spinIndex = 0;
+                     spinIndex < numSpinComponents;
+                     ++spinIndex)
+                  {
+                    partialOccupVec[spinIndex].resize(
+                      partialOccupVecHost[spinIndex].size());
+                    partialOccupVec[spinIndex].copyFrom(
+                      partialOccupVecHost[spinIndex]);
+                  }
+#endif
+                for (unsigned int spinIndex = 0; spinIndex < numSpinComponents;
+                     ++spinIndex)
+                  if (memorySpace == dftfe::utils::MemorySpace::HOST)
+                    for (unsigned int iNode = 0; iNode < numLocalDofs; ++iNode)
+                      for (unsigned int iWave = 0; iWave < currentBlockSize;
+                           ++iWave)
+                        flattenedArrayBlock[spinIndex]
+                          .data()[iNode * currentBlockSize + iWave] =
+                          (X->data())[numLocalDofs * totalNumWaveFunctions *
+                                        (numSpinComponents * kPoint +
+                                         spinIndex) +
+                                      iNode * totalNumWaveFunctions + jvec +
+                                      iWave];
+#if defined(DFTFE_WITH_DEVICE)
+                  else if (memorySpace == dftfe::utils::MemorySpace::DEVICE)
+                    dftfe::utils::deviceKernelsGeneric::
+                      stridedCopyToBlockConstantStride(
+                        currentBlockSize,
+                        totalNumWaveFunctions,
+                        numLocalDofs,
+                        jvec,
+                        X->data() + numLocalDofs * totalNumWaveFunctions *
+                                      (numSpinComponents * kPoint + spinIndex),
+                        flattenedArrayBlock[spinIndex].begin());
+#endif
+
+
+                const unsigned int d_quadratureIndex =
+ use2pPlusOneGLQuad ? 2 : 0; + dftfe::basis::UpdateFlags updateFlags = + dftfe::basis::update_values | dftfe::basis::update_gradients; + basisOperationsPtr->reinit(currentBlockSize * numSpinComponents, + 0, + d_quadratureIndex, + updateFlags); + + + for (unsigned int spinIndex = 0; spinIndex < numSpinComponents; + ++spinIndex) + { + flattenedArrayBlock[spinIndex].updateGhostValues(); + basisOperationsPtr->distribute( + flattenedArrayBlock[spinIndex]); + } + + for (int iblock = 0; iblock < (numCellBlocks + 1); iblock++) + { + const unsigned int currentCellsBlockSize = + (iblock == numCellBlocks) ? remCellBlockSize : + cellsBlockSize; + if (currentCellsBlockSize > 0) + { + const unsigned int startingCellId = + iblock * cellsBlockSize; + + for (unsigned int spinIndex = 0; + spinIndex < numSpinComponents; + ++spinIndex) + basisOperationsPtr->interpolateKernel( + flattenedArrayBlock[spinIndex], + &wfcQuadPointData[spinIndex], + isEvaluateGradRho ? + &gradWfcQuadPointData[spinIndex] : + NULL, + std::pair( + startingCellId, + startingCellId + currentCellsBlockSize)); + + for (unsigned int spinIndex = 0; + spinIndex < numSpinComponents; + ++spinIndex) + computeRhoGradRhoFromInterpolatedValues( + basisOperationsPtr, + std::pair( + startingCellId, + startingCellId + currentCellsBlockSize), + std::pair( + jvec, jvec + currentBlockSize), + partialOccupVec[spinIndex].data(), + wfcQuadPointData[spinIndex].data(), + gradWfcQuadPointData[spinIndex].data(), + rhoWfcContributions[spinIndex].data(), + gradRhoWfcContributions[spinIndex].data(), + rho.data() + spinIndex * totalLocallyOwnedCells * + numQuadPoints, + gradRho.data() + spinIndex * + totalLocallyOwnedCells * + numQuadPoints * 3, + isEvaluateGradRho); + } // non-trivial cell block check + } // cells block loop + } + } + + if (spectrumSplit) + for (unsigned int jvec = 0; jvec < Nfr; jvec += BVec) + { + const unsigned int currentBlockSize = std::min(BVec, Nfr - jvec); + for (unsigned int spinIndex = 0; spinIndex < 
numSpinComponents; + ++spinIndex) + if (currentBlockSize != + flattenedArrayBlock[spinIndex].numVectors()) + basisOperationsPtr->createMultiVector( + matrixFreeDofhandlerIndex, + currentBlockSize, + flattenedArrayBlock[spinIndex]); + if ((jvec + totalNumWaveFunctions - Nfr + currentBlockSize) <= + bandGroupLowHighPlusOneIndices[2 * bandGroupTaskId + 1] && + (jvec + totalNumWaveFunctions - Nfr + currentBlockSize) > + bandGroupLowHighPlusOneIndices[2 * bandGroupTaskId]) + { + for (unsigned int spinIndex = 0; + spinIndex < numSpinComponents; + ++spinIndex) + if (dftParams.constraintMagnetization) + { + const double fermiEnergyConstraintMag = + spinIndex == 0 ? fermiEnergyUp : fermiEnergyDown; + for (unsigned int iEigenVec = 0; + iEigenVec < currentBlockSize; + ++iEigenVec) + { + if (eigenValues[kPoint] + [totalNumWaveFunctions * spinIndex + + (totalNumWaveFunctions - Nfr) + + jvec + iEigenVec] > + fermiEnergyConstraintMag) + *(partialOccupVecHost[spinIndex].begin() + + iEigenVec) = + -kPointWeights[kPoint] * spinPolarizedFactor; + else + *(partialOccupVecHost[spinIndex].begin() + + iEigenVec) = 0; + } + } + else + { + for (unsigned int iEigenVec = 0; + iEigenVec < currentBlockSize; + ++iEigenVec) + { + *(partialOccupVecHost[spinIndex].begin() + + iEigenVec) = + (dftUtils::getPartialOccupancy( + eigenValues[kPoint] + [totalNumWaveFunctions * spinIndex + + (totalNumWaveFunctions - Nfr) + + jvec + iEigenVec], + fermiEnergy, + C_kb, + dftParams.TVal) - + 1.0) * + kPointWeights[kPoint] * spinPolarizedFactor; + } + } + +#if defined(DFTFE_WITH_DEVICE) + for (unsigned int spinIndex = 0; + spinIndex < numSpinComponents; + ++spinIndex) + { + partialOccupVec[spinIndex].resize( + partialOccupVecHost[spinIndex].size()); + partialOccupVec[spinIndex].copyFrom( + partialOccupVecHost[spinIndex]); + } +#endif + for (unsigned int spinIndex = 0; + spinIndex < numSpinComponents; + ++spinIndex) + if (memorySpace == dftfe::utils::MemorySpace::HOST) + for (unsigned int iNode = 0; iNode < 
numLocalDofs;
+                           ++iNode) // inner loop is bounded by the block width, not Nfr (see below)
+                        for (unsigned int iWave = 0; iWave < currentBlockSize; ++iWave)
+                          flattenedArrayBlock[spinIndex]
+                            .data()[iNode * currentBlockSize + iWave] =
+                            (XFrac->data())[numLocalDofs * Nfr *
+                                              (numSpinComponents * kPoint +
+                                               spinIndex) +
+                                            iNode * Nfr + jvec + iWave];
+#if defined(DFTFE_WITH_DEVICE)
+                    else if (memorySpace == dftfe::utils::MemorySpace::DEVICE)
+                      dftfe::utils::deviceKernelsGeneric::
+                        stridedCopyToBlockConstantStride(
+                          currentBlockSize,
+                          Nfr,
+                          numLocalDofs,
+                          jvec,
+                          XFrac->data() +
+                            numLocalDofs * Nfr *
+                              (numSpinComponents * kPoint + spinIndex),
+                          flattenedArrayBlock[spinIndex].begin());
+#endif
+                  const unsigned int d_quadratureIndex =
+                    use2pPlusOneGLQuad ? 2 : 0;
+                  dftfe::basis::UpdateFlags updateFlags =
+                    dftfe::basis::update_values |
+                    dftfe::basis::update_gradients;
+                  basisOperationsPtr->reinit(currentBlockSize,
+                                             0,
+                                             d_quadratureIndex,
+                                             updateFlags);
+
+
+                  for (unsigned int spinIndex = 0;
+                       spinIndex < numSpinComponents;
+                       ++spinIndex)
+                    {
+                      flattenedArrayBlock[spinIndex].updateGhostValues();
+                      basisOperationsPtr->distribute(
+                        flattenedArrayBlock[spinIndex]);
+                    }
+
+                  for (int iblock = 0; iblock < (numCellBlocks + 1); iblock++)
+                    {
+                      const unsigned int currentCellsBlockSize =
+                        (iblock == numCellBlocks) ? remCellBlockSize :
+                                                    cellsBlockSize;
+                      if (currentCellsBlockSize > 0)
+                        {
+                          const unsigned int startingCellId =
+                            iblock * cellsBlockSize;
+                          for (unsigned int spinIndex = 0;
+                               spinIndex < numSpinComponents;
+                               ++spinIndex)
+                            basisOperationsPtr->interpolateKernel(
+                              flattenedArrayBlock[spinIndex],
+                              &wfcQuadPointData[spinIndex],
+                              isEvaluateGradRho ?
+ &gradWfcQuadPointData[spinIndex] : + NULL, + std::pair( + startingCellId, + startingCellId + currentCellsBlockSize)); + + for (unsigned int spinIndex = 0; + spinIndex < numSpinComponents; + ++spinIndex) + computeRhoGradRhoFromInterpolatedValues( + basisOperationsPtr, + std::pair( + startingCellId, + startingCellId + currentCellsBlockSize), + std::pair( + jvec, jvec + currentBlockSize), + partialOccupVec[spinIndex].data(), + wfcQuadPointData[spinIndex].data(), + gradWfcQuadPointData[spinIndex].data(), + rhoWfcContributions[spinIndex].data(), + gradRhoWfcContributions[spinIndex].data(), + rho.data() + spinIndex * totalLocallyOwnedCells * + numQuadPoints, + gradRho.data() + spinIndex * + totalLocallyOwnedCells * + numQuadPoints * 3, + isEvaluateGradRho); + } // non-tivial cells block + } // cells block loop + } + } // spectrum split block + } +#if defined(DFTFE_WITH_DEVICE) + rhoHost.resize(rho.size()); + rhoHost.copyFrom(rho); + if (isEvaluateGradRho) + { + gradRhoHost.resize(gradRho.size()); + gradRhoHost.copyFrom(gradRho); + } +#endif + + int size; + MPI_Comm_size(interpoolcomm, &size); + if (size > 1) + { + MPI_Allreduce(MPI_IN_PLACE, + rhoHost.data(), + totalLocallyOwnedCells * numQuadPoints * + numSpinComponents, + dataTypes::mpi_type_id(rhoHost.data()), + MPI_SUM, + interpoolcomm); + if (isEvaluateGradRho) + MPI_Allreduce(MPI_IN_PLACE, + gradRhoHost.data(), + totalLocallyOwnedCells * numQuadPoints * + numSpinComponents * 3, + dataTypes::mpi_type_id(gradRhoHost.data()), + MPI_SUM, + interpoolcomm); + } + MPI_Comm_size(interBandGroupComm, &size); + if (size > 1) + { + MPI_Allreduce(MPI_IN_PLACE, + rhoHost.data(), + totalLocallyOwnedCells * numQuadPoints * + numSpinComponents, + dataTypes::mpi_type_id(rhoHost.data()), + MPI_SUM, + interBandGroupComm); + if (isEvaluateGradRho) + MPI_Allreduce(MPI_IN_PLACE, + gradRhoHost.data(), + totalLocallyOwnedCells * numQuadPoints * + numSpinComponents * 3, + dataTypes::mpi_type_id(gradRhoHost.data()), + MPI_SUM, + 
interBandGroupComm); + } + + unsigned int iElem = 0; + auto cell = dofHandler.begin_active(); + auto endc = dofHandler.end(); + for (; cell != endc; ++cell) + if (cell->is_locally_owned()) + { + const dealii::CellId cellid = cell->id(); + + std::vector dummy(1); + std::vector &tempRhoQuads = (*rhoValues)[cellid]; + std::vector &tempGradRhoQuads = + isEvaluateGradRho ? (*gradRhoValues)[cellid] : dummy; + + std::vector &tempRhoQuadsSP = + (dftParams.spinPolarized == 1) ? (*rhoValuesSpinPolarized)[cellid] : + dummy; + std::vector &tempGradRhoQuadsSP = + ((dftParams.spinPolarized == 1) && isEvaluateGradRho) ? + (*gradRhoValuesSpinPolarized)[cellid] : + dummy; + + if (dftParams.spinPolarized == 1) + { + for (unsigned int q = 0; q < numQuadPoints; ++q) + { + tempRhoQuadsSP[2 * q + 0] = + rhoHost[iElem * numQuadPoints + q]; + + tempRhoQuadsSP[2 * q + 1] = + rhoHost[totalLocallyOwnedCells * numQuadPoints + + iElem * numQuadPoints + q]; + } + + if (isEvaluateGradRho) + for (unsigned int q = 0; q < numQuadPoints; ++q) + { + tempGradRhoQuadsSP[6 * q + 0] = + gradRhoHost[iElem * numQuadPoints + q]; + tempGradRhoQuadsSP[6 * q + 1] = + gradRhoHost[totalLocallyOwnedCells * numQuadPoints + + iElem * numQuadPoints + q]; + tempGradRhoQuadsSP[6 * q + 2] = + gradRhoHost[2 * totalLocallyOwnedCells * numQuadPoints + + iElem * numQuadPoints + q]; + tempGradRhoQuadsSP[6 * q + 3] = + gradRhoHost[totalLocallyOwnedCells * numQuadPoints * 3 + + iElem * numQuadPoints + q]; + tempGradRhoQuadsSP[6 * q + 4] = + gradRhoHost[totalLocallyOwnedCells * numQuadPoints * 3 + + totalLocallyOwnedCells * numQuadPoints + + iElem * numQuadPoints + q]; + tempGradRhoQuadsSP[6 * q + 5] = + gradRhoHost[totalLocallyOwnedCells * numQuadPoints * 3 + + 2 * totalLocallyOwnedCells * numQuadPoints + + iElem * numQuadPoints + q]; + } + } + + for (unsigned int q = 0; q < numQuadPoints; ++q) + tempRhoQuads[q] = rhoHost[iElem * numQuadPoints + q]; + + + if (isEvaluateGradRho) + for (unsigned int q = 0; q < numQuadPoints; 
++q) + { + tempGradRhoQuads[3 * q] = + gradRhoHost[iElem * numQuadPoints + q]; + tempGradRhoQuads[3 * q + 1] = + gradRhoHost[totalLocallyOwnedCells * numQuadPoints + + iElem * numQuadPoints + q]; + tempGradRhoQuads[3 * q + 2] = + gradRhoHost[2 * totalLocallyOwnedCells * numQuadPoints + + iElem * numQuadPoints + q]; + } + iElem++; + } + } + template + void + computeRhoGradRhoFromInterpolatedValues( + std::unique_ptr< + dftfe::basis:: + FEBasisOperations> + & basisOperationsPtr, + const std::pair cellRange, + const std::pair vecRange, + double * partialOccupVec, + NumberType * wfcQuadPointData, + NumberType * gradWfcQuadPointData, + double * rhoCellsWfcContributions, + double * gradRhoCellsWfcContributions, + double * rho, + double * gradRho, + const bool isEvaluateGradRho) + { + const unsigned int cellsBlockSize = cellRange.second - cellRange.first; + const unsigned int vectorsBlockSize = vecRange.second - vecRange.first; + const unsigned int nQuadsPerCell = basisOperationsPtr->d_nQuadsPerCell; + const unsigned int nCells = basisOperationsPtr->d_nCells; + for (unsigned int iCell = cellRange.first; iCell < cellRange.second; + ++iCell) + for (unsigned int iQuad = 0; iQuad < nQuadsPerCell; ++iQuad) + for (unsigned int iWave = 0; iWave < vecRange.second - vecRange.first; + ++iWave) + { + rho[iCell * nQuadsPerCell + iQuad] += + partialOccupVec[iWave] * + std::abs(wfcQuadPointData[(iCell - cellRange.first) * + nQuadsPerCell * vectorsBlockSize + + iQuad * vectorsBlockSize + iWave]) * + std::abs(wfcQuadPointData[(iCell - cellRange.first) * + nQuadsPerCell * vectorsBlockSize + + iQuad * vectorsBlockSize + iWave]); + } + if (isEvaluateGradRho) + { + for (unsigned int iCell = cellRange.first; iCell < cellRange.second; + ++iCell) + for (unsigned int iQuad = 0; iQuad < nQuadsPerCell; ++iQuad) + for (unsigned int iWave = 0; + iWave < vecRange.second - vecRange.first; + ++iWave) + { + gradRho[iCell * nQuadsPerCell + iQuad] += + 2 * partialOccupVec[iWave] * + 
dftfe::utils::realPart( + dftfe::utils::complexConj( + wfcQuadPointData[(iCell - cellRange.first) * + nQuadsPerCell * vectorsBlockSize + + iQuad * vectorsBlockSize + iWave]) * + gradWfcQuadPointData[(iCell - cellRange.first) * + nQuadsPerCell * vectorsBlockSize * + 3 + + iQuad * vectorsBlockSize + iWave]); + gradRho[nCells * nQuadsPerCell + iCell * nQuadsPerCell + + iQuad] += + 2 * partialOccupVec[iWave] * + dftfe::utils::realPart( + dftfe::utils::complexConj( + wfcQuadPointData[(iCell - cellRange.first) * + nQuadsPerCell * vectorsBlockSize + + iQuad * vectorsBlockSize + iWave]) * + gradWfcQuadPointData[(iCell - cellRange.first) * + nQuadsPerCell * vectorsBlockSize * + 3 + + nQuadsPerCell * vectorsBlockSize + + iQuad * vectorsBlockSize + iWave]); + gradRho[2 * nCells * nQuadsPerCell + iCell * nQuadsPerCell + + iQuad] += + 2 * partialOccupVec[iWave] * + dftfe::utils::realPart( + dftfe::utils::complexConj( + wfcQuadPointData[(iCell - cellRange.first) * + nQuadsPerCell * vectorsBlockSize + + iQuad * vectorsBlockSize + iWave]) * + gradWfcQuadPointData[(iCell - cellRange.first) * + nQuadsPerCell * vectorsBlockSize * + 3 + + 2 * nQuadsPerCell * vectorsBlockSize + + iQuad * vectorsBlockSize + iWave]); + } + } + } +#if defined(DFTFE_WITH_DEVICE) + template void + computeRhoFromPSI( + const dftfe::utils::MemoryStorage *X, + const dftfe::utils::MemoryStorage *XFrac, + const unsigned int totalNumWaveFunctions, + const unsigned int Nfr, + const unsigned int numLocalDofs, + const std::vector> &eigenValues, + const double fermiEnergy, + const double fermiEnergyUp, + const double fermiEnergyDown, + std::unique_ptr< + dftfe::basis::FEBasisOperations> + & basisOperationsPtrDevice, + const unsigned int matrixFreeDofhandlerIndex, + const dealii::DoFHandler<3> & dofHandler, + const unsigned int totalLocallyOwnedCells, + const unsigned int numNodesPerElement, + const unsigned int numQuadPoints, + const std::vector & kPointWeights, + std::map> *rhoValues, + std::map> *gradRhoValues, + 
std::map> *rhoValuesSpinPolarized, + std::map> *gradRhoValuesSpinPolarized, + const bool isEvaluateGradRho, + const MPI_Comm & mpiCommParent, + const MPI_Comm & interpoolcomm, + const MPI_Comm & interBandGroupComm, + const dftParameters & dftParams, + const bool spectrumSplit, + const bool use2pPlusOneGLQuad); +#endif + + template void + computeRhoFromPSI( + const dftfe::utils::MemoryStorage *X, + const dftfe::utils::MemoryStorage *XFrac, + const unsigned int totalNumWaveFunctions, + const unsigned int Nfr, + const unsigned int numLocalDofs, + const std::vector> &eigenValues, + const double fermiEnergy, + const double fermiEnergyUp, + const double fermiEnergyDown, + std::unique_ptr< + dftfe::basis::FEBasisOperations> + & basisOperationsPtr, + const unsigned int matrixFreeDofhandlerIndex, + const dealii::DoFHandler<3> & dofHandler, + const unsigned int totalLocallyOwnedCells, + const unsigned int numNodesPerElement, + const unsigned int numQuadPoints, + const std::vector & kPointWeights, + std::map> *rhoValues, + std::map> *gradRhoValues, + std::map> *rhoValuesSpinPolarized, + std::map> *gradRhoValuesSpinPolarized, + const bool isEvaluateGradRho, + const MPI_Comm & mpiCommParent, + const MPI_Comm & interpoolcomm, + const MPI_Comm & interBandGroupComm, + const dftParameters & dftParams, + const bool spectrumSplit, + const bool use2pPlusOneGLQuad); +} // namespace dftfe diff --git a/src/dft/densityCalculatorCPU.cc b/src/dft/densityCalculatorCPU.cc index db8580394..c53272c85 100644 --- a/src/dft/densityCalculatorCPU.cc +++ b/src/dft/densityCalculatorCPU.cc @@ -24,22 +24,26 @@ #include #include #include - +#include namespace dftfe { template void computeRhoFromPSICPU( - const std::vector> & X, - const std::vector> & XFrac, - const unsigned int totalNumWaveFunctions, - const unsigned int Nfr, - const unsigned int numLocalDofs, - const std::vector> & eigenValues, - const double fermiEnergy, - const double fermiEnergyUp, - const double fermiEnergyDown, - operatorDFTClass & 
operatorMatrix, + const T * X, + const T * XFrac, + const unsigned int totalNumWaveFunctions, + const unsigned int Nfr, + const unsigned int numLocalDofs, + const std::vector> &eigenValues, + const double fermiEnergy, + const double fermiEnergyUp, + const double fermiEnergyDown, + operatorDFTClass & operatorMatrix, + std::unique_ptr< + dftfe::basis:: + FEBasisOperations> + & basisOperationsPtrHost, const dealii::DoFHandler<3> & dofHandler, const unsigned int totalLocallyOwnedCells, const unsigned int numNodesPerElement, @@ -79,9 +83,12 @@ namespace dftfe (dftParams.spinPolarized == 1) ? 1.0 : 2.0; - std::vector wfcQuads(numQuadPoints * BVec, T(0.0)); + // std::vector wfcQuads(numQuadPoints * BVec, T(0.0)); - std::vector gradWfcQuads(numQuadPoints * 3 * BVec, T(0.0)); + // std::vector gradWfcQuads(numQuadPoints * 3 * BVec, T(0.0)); + dftfe::utils::MemoryStorage wfcQuads( + numQuadPoints * BVec, T(0.0)), + gradWfcQuads(numQuadPoints * 3 * BVec, T(0.0)); std::vector shapeFunctionValues(numQuadPoints * numNodesPerElement, T(0.0)); @@ -112,7 +119,8 @@ namespace dftfe dftfe::distributedCPUMultiVec flattenedArrayBlock; - std::vector cellWaveFunctionMatrix(numNodesPerElement * BVec, T(0.0)); + dftfe::utils::MemoryStorage + cellWaveFunctionMatrix(numNodesPerElement * BVec, T(0.0)); // set density to zero typename dealii::DoFHandler<3>::active_cell_iterator cell = @@ -176,16 +184,27 @@ namespace dftfe isEvaluateGradRho ? 
(totalLocallyOwnedCells * numQuadPoints) : 1, 0.0); - const std::vector &XCurrentKPoint = - X[(dftParams.spinPolarized + 1) * kPoint + spinIndex]; - const std::vector &XFracCurrentKPoint = - XFrac[(dftParams.spinPolarized + 1) * kPoint + spinIndex]; + const T *XCurrentKPoint = + X + ((dftParams.spinPolarized + 1) * kPoint + spinIndex) * + numLocalDofs * totalNumWaveFunctions; + const T *XFracCurrentKPoint = + XFrac + ((dftParams.spinPolarized + 1) * kPoint + spinIndex) * + numLocalDofs * Nfr; for (unsigned int jvec = 0; jvec < totalNumWaveFunctions; jvec += BVec) { const unsigned int currentBlockSize = std::min(BVec, totalNumWaveFunctions - jvec); + const unsigned int d_eigenDofHandlerIndex = 1; + const unsigned int d_quadratureIndex = + useFEOrderRhoPlusOneGLQuad ? 2 : 0; + dftfe::basis::UpdateFlags updateFlags = + dftfe::basis::update_values | dftfe::basis::update_gradients; + basisOperationsPtrHost->reinit(currentBlockSize, + d_eigenDofHandlerIndex, + d_quadratureIndex, + updateFlags); if (currentBlockSize != BVec || jvec == 0) operatorMatrix.reinit(currentBlockSize, @@ -277,21 +296,26 @@ namespace dftfe const T scalarCoeffAlpha = T(1.0), scalarCoeffBeta = T(0.0); const char transA = 'N', transB = 'N'; - - xgemm(&transA, - &transB, - ¤tBlockSize, - &numQuadPoints, - &numNodesPerElement, - &scalarCoeffAlpha, - &cellWaveFunctionMatrix[0], - ¤tBlockSize, - &shapeFunctionValues[0], - &numNodesPerElement, - &scalarCoeffBeta, - &wfcQuads[0], - ¤tBlockSize); - + basisOperationsPtrHost->interpolateKernel( + flattenedArrayBlock, + &wfcQuads, + &gradWfcQuads, + std::pair(icell, + icell + 1)); + + // xgemm(&transA, + // &transB, + // ¤tBlockSize, + // &numQuadPoints, + // &numNodesPerElement, + // &scalarCoeffAlpha, + // &cellWaveFunctionMatrix[0], + // ¤tBlockSize, + // &shapeFunctionValues[0], + // &numNodesPerElement, + // &scalarCoeffBeta, + // &wfcQuads[0], + // ¤tBlockSize); for (unsigned int iquad = 0; iquad < numQuadPoints; ++iquad) for (unsigned int iWave = 0; 
iWave < currentBlockSize; @@ -317,19 +341,19 @@ namespace dftfe i]); } - xgemm(&transA, - &transB, - ¤tBlockSize, - &numQuadPointsTimes3, - &numNodesPerElement, - &scalarCoeffAlpha, - &cellWaveFunctionMatrix[0], - ¤tBlockSize, - &shapeFunctionGradValues[0], - &numNodesPerElement, - &scalarCoeffBeta, - &gradWfcQuads[0], - ¤tBlockSize); + // xgemm(&transA, + // &transB, + // ¤tBlockSize, + // &numQuadPointsTimes3, + // &numNodesPerElement, + // &scalarCoeffAlpha, + // &cellWaveFunctionMatrix[0], + // ¤tBlockSize, + // &shapeFunctionGradValues[0], + // &numNodesPerElement, + // &scalarCoeffBeta, + // &gradWfcQuads[0], + // ¤tBlockSize); for (unsigned int iquad = 0; iquad < numQuadPoints; ++iquad) @@ -343,7 +367,7 @@ namespace dftfe iWave]); const T temp1 = wfcQuadVal * - gradWfcQuads[iquad * 3 * currentBlockSize + + gradWfcQuads[iquad * currentBlockSize + iWave]; gradRhoXContribution[icell * numQuadPoints + iquad] += @@ -363,8 +387,10 @@ namespace dftfe iWave]); const T temp1 = wfcQuadVal * - gradWfcQuads[iquad * 3 * currentBlockSize + - currentBlockSize + iWave]; + gradWfcQuads[currentBlockSize * + numQuadPoints + + iquad * currentBlockSize + + iWave]; gradRhoYContribution[icell * numQuadPoints + iquad] += 2.0 * partialOccupVecTimesKptWeight[iWave] * @@ -383,8 +409,10 @@ namespace dftfe iWave]); const T temp1 = wfcQuadVal * - gradWfcQuads[iquad * 3 * currentBlockSize + - 2 * currentBlockSize + iWave]; + gradWfcQuads[currentBlockSize * + numQuadPoints * 2 + + iquad * currentBlockSize + + iWave]; gradRhoZContribution[icell * numQuadPoints + iquad] += 2.0 * partialOccupVecTimesKptWeight[iWave] * @@ -454,7 +482,6 @@ namespace dftfe } } - for (unsigned int iNode = 0; iNode < numLocalDofs; ++iNode) for (unsigned int iWave = 0; iWave < currentBlockSize; @@ -818,24 +845,29 @@ namespace dftfe template void computeRhoFromPSICPU( - const std::vector> &X, - const std::vector> &XFrac, - const unsigned int totalNumWaveFunctions, - const unsigned int Nfr, - const unsigned int 
numLocalDofs, - const std::vector> & eigenValues, - const double fermiEnergy, - const double fermiEnergyUp, - const double fermiEnergyDown, - operatorDFTClass & operatorMatrix, - const dealii::DoFHandler<3> & dofHandler, - const unsigned int totalLocallyOwnedCells, - const unsigned int numNodesPerElement, - const unsigned int numQuadPoints, - const std::vector & kPointWeights, - std::map> * rhoValues, - std::map> * gradRhoValues, - std::map> * rhoValuesSpinPolarized, + const dataTypes::number * X, + const dataTypes::number * XFrac, + const unsigned int totalNumWaveFunctions, + const unsigned int Nfr, + const unsigned int numLocalDofs, + const std::vector> &eigenValues, + const double fermiEnergy, + const double fermiEnergyUp, + const double fermiEnergyDown, + operatorDFTClass & operatorMatrix, + std::unique_ptr< + dftfe::basis::FEBasisOperations> + & basisOperationsPtrHost, + const dealii::DoFHandler<3> & dofHandler, + const unsigned int totalLocallyOwnedCells, + const unsigned int numNodesPerElement, + const unsigned int numQuadPoints, + const std::vector & kPointWeights, + std::map> *rhoValues, + std::map> *gradRhoValues, + std::map> *rhoValuesSpinPolarized, std::map> *gradRhoValuesSpinPolarized, const bool isEvaluateGradRho, const MPI_Comm & mpiCommParent, diff --git a/src/dft/densityCalculatorDevice.cc b/src/dft/densityCalculatorDevice.cc index ebc6865b2..3559e9d1e 100644 --- a/src/dft/densityCalculatorDevice.cc +++ b/src/dft/densityCalculatorDevice.cc @@ -120,16 +120,21 @@ namespace dftfe template void computeRhoFromPSI( - const NumberType * X, - const NumberType * XFrac, - const unsigned int totalNumWaveFunctions, - const unsigned int Nfr, - const unsigned int numLocalDofs, - const std::vector> & eigenValues, - const double fermiEnergy, - const double fermiEnergyUp, - const double fermiEnergyDown, - operatorDFTDeviceClass & operatorMatrix, + const NumberType * X, + const NumberType * XFrac, + const unsigned int totalNumWaveFunctions, + const unsigned int Nfr, 
+ const unsigned int numLocalDofs, + const std::vector> &eigenValues, + const double fermiEnergy, + const double fermiEnergyUp, + const double fermiEnergyDown, + operatorDFTDeviceClass & operatorMatrix, + std::unique_ptr< + dftfe::basis::FEBasisOperations> + & basisOperationsPtrDevice, const unsigned int matrixFreeDofhandlerIndex, const dealii::DoFHandler<3> & dofHandler, const unsigned int totalLocallyOwnedCells, @@ -282,9 +287,9 @@ namespace dftfe distributedDeviceVec &deviceFlattenedArrayBlock = operatorMatrix.getParallelChebyBlockVectorDevice(); - - NumberType *cellWaveFunctionMatrix = - (operatorMatrix.getCellWaveFunctionMatrix()).begin(); + dftfe::utils::MemoryStorage + &cellWaveFunctionMatrixMV = operatorMatrix.getCellWaveFunctionMatrix(); + NumberType *cellWaveFunctionMatrix = (cellWaveFunctionMatrixMV).begin(); typename dealii::DoFHandler<3>::active_cell_iterator cell = dofHandler.begin_active(); @@ -387,6 +392,17 @@ namespace dftfe spinIndex), deviceFlattenedArrayBlock.begin()); + const unsigned int d_eigenDofHandlerIndex = 1; + const unsigned int d_quadratureIndex = + use2pPlusOneGLQuad ? 
2 : 0; + dftfe::basis::UpdateFlags updateFlags = + dftfe::basis::update_values | + dftfe::basis::update_gradients; + basisOperationsPtrDevice->reinit(BVec, + 0, + d_quadratureIndex, + updateFlags); + deviceFlattenedArrayBlock.updateGhostValues(); @@ -414,6 +430,13 @@ namespace dftfe .getFlattenedArrayCellLocalProcIndexIdMap()) .begin() + startingCellId * numNodesPerElement); + // basisOperationsPtrDevice + // ->extractToCellNodalDataKernel( + // deviceFlattenedArrayBlock, + // &cellWaveFunctionMatrixMV, + // std::pair( + // startingCellId, + // startingCellId + currentCellsBlockSize)); NumberType scalarCoeffAlpha = 1.0; NumberType scalarCoeffBeta = 0; @@ -421,26 +444,33 @@ namespace dftfe int strideB = 0; int strideC = BVec * numQuadPoints; - dftfe::utils::deviceBlasWrapper:: - gemmStridedBatched( - operatorMatrix.getDeviceBlasHandle(), - dftfe::utils::DEVICEBLAS_OP_N, - dftfe::utils::DEVICEBLAS_OP_N, - BVec, - numQuadPoints, - numNodesPerElement, - &scalarCoeffAlpha, - cellWaveFunctionMatrix, - BVec, - strideA, - shapeFunctionValuesTransposedDevice.begin(), - numNodesPerElement, - strideB, - &scalarCoeffBeta, - rhoWfcContributionsDevice.begin(), - BVec, - strideC, - currentCellsBlockSize); + // dftfe::utils::deviceBlasWrapper:: + // gemmStridedBatched( + // operatorMatrix.getDeviceBlasHandle(), + // dftfe::utils::DEVICEBLAS_OP_N, + // dftfe::utils::DEVICEBLAS_OP_N, + // BVec, + // numQuadPoints, + // numNodesPerElement, + // &scalarCoeffAlpha, + // cellWaveFunctionMatrixMV.data(), + // BVec, + // strideA, + // shapeFunctionValuesTransposedDevice.begin(), + // numNodesPerElement, + // strideB, + // &scalarCoeffBeta, + // rhoWfcContributionsDevice.begin(), + // BVec, + // strideC, + // currentCellsBlockSize); + basisOperationsPtrDevice->interpolateKernel( + deviceFlattenedArrayBlock, + &rhoWfcContributionsDevice, + NULL, + std::pair( + startingCellId, + startingCellId + currentCellsBlockSize)); if (isEvaluateGradRho) @@ -1252,16 +1282,21 @@ namespace dftfe template 
void computeRhoFromPSI( - const dataTypes::number * X, - const dataTypes::number * XFrac, - const unsigned int totalNumWaveFunctions, - const unsigned int Nfr, - const unsigned int numLocalDofs, - const std::vector> & eigenValues, - const double fermiEnergy, - const double fermiEnergyUp, - const double fermiEnergyDown, - operatorDFTDeviceClass & operatorMatrix, + const dataTypes::number * X, + const dataTypes::number * XFrac, + const unsigned int totalNumWaveFunctions, + const unsigned int Nfr, + const unsigned int numLocalDofs, + const std::vector> &eigenValues, + const double fermiEnergy, + const double fermiEnergyUp, + const double fermiEnergyDown, + operatorDFTDeviceClass & operatorMatrix, + std::unique_ptr< + dftfe::basis::FEBasisOperations> + & basisOperationsPtrDevice, const unsigned int matrixFreeDofhandlerIndex, const dealii::DoFHandler<3> & dofHandler, const unsigned int totalLocallyOwnedCells, diff --git a/src/dft/densityCalculatorDeviceKernels.cc b/src/dft/densityCalculatorDeviceKernels.cc new file mode 100644 index 000000000..87a9783c3 --- /dev/null +++ b/src/dft/densityCalculatorDeviceKernels.cc @@ -0,0 +1,230 @@ +// --------------------------------------------------------------------- +// +// Copyright (c) 2017-2022 The Regents of the University of Michigan and DFT-FE +// authors. +// +// This file is part of the DFT-FE code. +// +// The DFT-FE code is free software; you can use it, redistribute +// it, and/or modify it under the terms of the GNU Lesser General +// Public License as published by the Free Software Foundation; either +// version 2.1 of the License, or (at your option) any later version. +// The full text of the license can be found in the file LICENSE at +// the top level of the DFT-FE distribution. 
+// +// --------------------------------------------------------------------- +// +// @author Sambit Das +// + +// source file for electron density related computations +#include +#include +#include +#include +#include +#include +#include +#include + +namespace dftfe +{ + namespace + { + __global__ void + computeRhoGradRhoFromInterpolatedValues( + const unsigned int numberEntries, + const unsigned int numCells, + double * wfcContributions, + double * gradwfcContributions, + double * rhoCellsWfcContributions, + double * gradRhoCellsWfcContributions, + const bool isEvaluateGradRho) + { + const unsigned int globalThreadId = blockIdx.x * blockDim.x + threadIdx.x; + const unsigned int numEntriesPerCell = numberEntries / numCells; + + for (unsigned int index = globalThreadId; index < numberEntries; + index += blockDim.x * gridDim.x) + { + const double psi = wfcContributions[index]; + rhoCellsWfcContributions[index] = psi * psi; + + if (isEvaluateGradRho) + { + unsigned int iCell = index / numCells; + unsigned int intraCellIndex = index - iCell * numEntriesPerCell; + const double gradPsiX = + gradwfcContributions[intraCellIndex + + numEntriesPerCell * 3 * iCell]; + gradRhoCellsWfcContributions[index] = 2.0 * psi * gradPsiX; + + const double gradPsiY = + gradwfcContributions[intraCellIndex + numEntriesPerCell + + numEntriesPerCell * 3 * iCell]; + gradRhoCellsWfcContributions[index + numberEntries] = + 2.0 * psi * gradPsiY; + + const double gradPsiZ = + gradwfcContributions[intraCellIndex + 2 * numEntriesPerCell + + numEntriesPerCell * 3 * iCell]; + gradRhoCellsWfcContributions[index + 2 * numberEntries] = + 2.0 * psi * gradPsiZ; + } + } + } + + __global__ void + computeRhoGradRhoFromInterpolatedValues( + const unsigned int numberEntries, + const unsigned int numCells, + dftfe::utils::deviceDoubleComplex *wfcContributions, + dftfe::utils::deviceDoubleComplex *gradwfcContributions, + double * rhoCellsWfcContributions, + double * gradRhoCellsWfcContributions, + const bool 
isEvaluateGradRho) + { + const unsigned int globalThreadId = blockIdx.x * blockDim.x + threadIdx.x; + const unsigned int numEntriesPerCell = numberEntries / numCells; + + for (unsigned int index = globalThreadId; index < numberEntries; + index += blockDim.x * gridDim.x) + { + const dftfe::utils::deviceDoubleComplex psi = wfcContributions[index]; + rhoCellsWfcContributions[index] = psi.x * psi.x + psi.y * psi.y; + + if (isEvaluateGradRho) + { + unsigned int iCell = index / numCells; + unsigned int intraCellIndex = index - iCell * numEntriesPerCell; + const dftfe::utils::deviceDoubleComplex gradPsiX = + gradwfcContributions[intraCellIndex + + numEntriesPerCell * 3 * iCell]; + gradRhoCellsWfcContributions[index] = + 2.0 * (psi.x * gradPsiX.x + psi.y * gradPsiX.y); + + const dftfe::utils::deviceDoubleComplex gradPsiY = + gradwfcContributions[intraCellIndex + numEntriesPerCell + + numEntriesPerCell * 3 * iCell]; + gradRhoCellsWfcContributions[index + numberEntries] = + 2.0 * (psi.x * gradPsiY.x + psi.y * gradPsiY.y); + + const dftfe::utils::deviceDoubleComplex gradPsiZ = + gradwfcContributions[intraCellIndex + 2 * numEntriesPerCell + + numEntriesPerCell * 3 * iCell]; + gradRhoCellsWfcContributions[index + 2 * numberEntries] = + 2.0 * (psi.x * gradPsiZ.x + psi.y * gradPsiZ.y); + } + } + } + } // namespace + template + void + computeRhoGradRhoFromInterpolatedValues( + std::unique_ptr< + dftfe::basis::FEBasisOperations> + & basisOperationsPtr, + const std::pair cellRange, + const std::pair vecRange, + double * partialOccupVec, + NumberType * wfcQuadPointData, + NumberType * gradWfcQuadPointData, + double * rhoCellsWfcContributions, + double * gradRhoCellsWfcContributions, + double * rho, + double * gradRho, + const bool isEvaluateGradRho) + { + const unsigned int cellsBlockSize = cellRange.second - cellRange.first; + const unsigned int vectorsBlockSize = vecRange.second - vecRange.first; + const unsigned int nQuadsPerCell = basisOperationsPtr->d_nQuadsPerCell; + const 
unsigned int nCells = basisOperationsPtr->d_nCells; + const double scalarCoeffAlphaRho = 1.0; + const double scalarCoeffBetaRho = 1.0; + const double scalarCoeffAlphaGradRho = 1.0; + const double scalarCoeffBetaGradRho = 1.0; +#ifdef DFTFE_WITH_DEVICE_LANG_CUDA + computeRhoGradRhoFromInterpolatedValues<<< + (vectorsBlockSize + (dftfe::utils::DEVICE_BLOCK_SIZE - 1)) / + dftfe::utils::DEVICE_BLOCK_SIZE * nQuadsPerCell * cellsBlockSize, + dftfe::utils::DEVICE_BLOCK_SIZE>>>( + cellsBlockSize * nQuadsPerCell * vectorsBlockSize, + cellsBlockSize, + dftfe::utils::makeDataTypeDeviceCompatible(wfcQuadPointData), + dftfe::utils::makeDataTypeDeviceCompatible(gradWfcQuadPointData), + dftfe::utils::makeDataTypeDeviceCompatible(rhoCellsWfcContributions), + dftfe::utils::makeDataTypeDeviceCompatible(gradRhoCellsWfcContributions), + isEvaluateGradRho); +#elif DFTFE_WITH_DEVICE_LANG_HIP + hipLaunchKernelGGL( + computeRhoGradRhoFromInterpolatedValues, + (vectorsBlockSize + (dftfe::utils::DEVICE_BLOCK_SIZE - 1)) / + dftfe::utils::DEVICE_BLOCK_SIZE * nQuadsPerCell * cellsBlockSize, + dftfe::utils::DEVICE_BLOCK_SIZE, + 0, + 0, + cellsBlockSize * nQuadsPerCell * vectorsBlockSize, + cellsBlockSize, + dftfe::utils::makeDataTypeDeviceCompatible(wfcQuadPointData), + dftfe::utils::makeDataTypeDeviceCompatible(gradWfcQuadPointData), + dftfe::utils::makeDataTypeDeviceCompatible(rhoCellsWfcContributions), + dftfe::utils::makeDataTypeDeviceCompatible(gradRhoCellsWfcContributions), + isEvaluateGradRho); +#endif + dftfe::utils::deviceBlasWrapper::gemm( + basisOperationsPtr->getDeviceBLASHandle(), + dftfe::utils::DEVICEBLAS_OP_N, + dftfe::utils::DEVICEBLAS_OP_N, + 1, + cellsBlockSize * nQuadsPerCell, + vectorsBlockSize, + &scalarCoeffAlphaRho, + partialOccupVec, + 1, + rhoCellsWfcContributions, + vectorsBlockSize, + &scalarCoeffBetaRho, + rho + cellRange.first * nQuadsPerCell, + 1); + + + if (isEvaluateGradRho) + { + dftfe::utils::deviceBlasWrapper::gemm( + 
basisOperationsPtr->getDeviceBLASHandle(), + dftfe::utils::DEVICEBLAS_OP_N, + dftfe::utils::DEVICEBLAS_OP_N, + 1, + cellsBlockSize * nQuadsPerCell * 3, + vectorsBlockSize, + &scalarCoeffAlphaGradRho, + partialOccupVec, + 1, + gradRhoCellsWfcContributions, + vectorsBlockSize, + &scalarCoeffBetaGradRho, + gradRho + cellRange.first * nQuadsPerCell, + 1); + } + } + template void + computeRhoGradRhoFromInterpolatedValues( + std::unique_ptr< + dftfe::basis::FEBasisOperations> + & basisOperationsPtr, + const std::pair cellRange, + const std::pair vecRange, + double * partialOccupVec, + dataTypes::number * wfcQuadPointData, + dataTypes::number * gradWfcQuadPointData, + double * rhoCellsWfcContributions, + double * gradRhoCellsWfcContributions, + double * rho, + double * gradRho, + const bool isEvaluateGradRho); + +} // namespace dftfe diff --git a/src/dft/densityFirstOrderResponseCalculatorCPU.cc b/src/dft/densityFirstOrderResponseCalculatorCPU.cc index 0c1eef5b0..b0efcd893 100644 --- a/src/dft/densityFirstOrderResponseCalculatorCPU.cc +++ b/src/dft/densityFirstOrderResponseCalculatorCPU.cc @@ -31,8 +31,8 @@ namespace dftfe template void computeRhoFirstOrderResponseCPU( - const std::vector> & X, - const std::vector> & XPrime, + const T * X, + const T * XPrime, const std::vector> & densityMatDerFermiEnergy, const unsigned int totalNumWaveFunctions, const unsigned int numLocalDofs, @@ -149,11 +149,13 @@ namespace dftfe for (unsigned int kPoint = 0; kPoint < kPointWeights.size(); ++kPoint) { - const std::vector &XCurrentKPoint = - X[(dftParams.spinPolarized + 1) * kPoint + spinIndex]; + const T *XCurrentKPoint = + X + ((dftParams.spinPolarized + 1) * kPoint + spinIndex) * + numLocalDofs * totalNumWaveFunctions; - const std::vector &XPrimeCurrentKPoint = - XPrime[(dftParams.spinPolarized + 1) * kPoint + spinIndex]; + const T *XPrimeCurrentKPoint = + XPrime + ((dftParams.spinPolarized + 1) * kPoint + spinIndex) * + numLocalDofs * totalNumWaveFunctions; const std::vector 
&densityMatDerFermiEnergyVec = densityMatDerFermiEnergy[(dftParams.spinPolarized + 1) * kPoint + @@ -420,8 +422,8 @@ namespace dftfe template void computeRhoFirstOrderResponseCPUMixedPrec( - const std::vector> & X, - const std::vector> & XPrime, + const T * X, + const T * XPrime, const std::vector> & densityMatDerFermiEnergy, const unsigned int totalNumWaveFunctions, const unsigned int numLocalDofs, @@ -543,11 +545,13 @@ namespace dftfe for (unsigned int kPoint = 0; kPoint < kPointWeights.size(); ++kPoint) { - const std::vector &XCurrentKPoint = - X[(dftParams.spinPolarized + 1) * kPoint + spinIndex]; + const T *XCurrentKPoint = + X + ((dftParams.spinPolarized + 1) * kPoint + spinIndex) * + numLocalDofs * totalNumWaveFunctions; - const std::vector &XPrimeCurrentKPoint = - XPrime[(dftParams.spinPolarized + 1) * kPoint + spinIndex]; + const T *XPrimeCurrentKPoint = + XPrime + ((dftParams.spinPolarized + 1) * kPoint + spinIndex) * + numLocalDofs * totalNumWaveFunctions; const std::vector &densityMatDerFermiEnergyVec = densityMatDerFermiEnergy[(dftParams.spinPolarized + 1) * kPoint + @@ -815,12 +819,12 @@ namespace dftfe template void computeRhoFirstOrderResponseCPU( - const std::vector> &X, - const std::vector> &XPrime, - const std::vector> & densityMatDerFermiEnergy, - const unsigned int totalNumWaveFunctions, - const unsigned int numLocalDofs, - operatorDFTClass & operatorMatrix, + const dataTypes::number * X, + const dataTypes::number * XPrime, + const std::vector> & densityMatDerFermiEnergy, + const unsigned int totalNumWaveFunctions, + const unsigned int numLocalDofs, + operatorDFTClass & operatorMatrix, const unsigned int matrixFreeDofhandlerIndex, const dealii::DoFHandler<3> & dofHandler, const unsigned int totalLocallyOwnedCells, @@ -841,12 +845,12 @@ namespace dftfe template void computeRhoFirstOrderResponseCPUMixedPrec( - const std::vector> &X, - const std::vector> &XPrime, - const std::vector> & densityMatDerFermiEnergy, - const unsigned int 
totalNumWaveFunctions, - const unsigned int numLocalDofs, - operatorDFTClass & operatorMatrix, + const dataTypes::number * X, + const dataTypes::number * XPrime, + const std::vector> & densityMatDerFermiEnergy, + const unsigned int totalNumWaveFunctions, + const unsigned int numLocalDofs, + operatorDFTClass & operatorMatrix, const unsigned int matrixFreeDofhandlerIndex, const dealii::DoFHandler<3> & dofHandler, const unsigned int totalLocallyOwnedCells, diff --git a/src/dft/dft.cc b/src/dft/dft.cc index b8d18e469..d2e0aabf0 100644 --- a/src/dft/dft.cc +++ b/src/dft/dft.cc @@ -728,10 +728,6 @@ namespace dftfe d_upperBoundUnwantedSpectrumValues.resize( (d_dftParamsPtr->spinPolarized + 1) * d_kPointWeights.size(), 0.0); - d_eigenVectorsFlattenedSTL.resize((1 + d_dftParamsPtr->spinPolarized) * - d_kPointWeights.size()); - d_eigenVectorsRotFracDensityFlattenedSTL.resize( - (1 + d_dftParamsPtr->spinPolarized) * d_kPointWeights.size()); for (unsigned int kPoint = 0; kPoint < d_kPointWeights.size(); ++kPoint) { @@ -1628,8 +1624,9 @@ namespace dftfe vectorTools::copyFlattenedSTLVecToSingleCompVec( - d_eigenVectorsFlattenedSTL[0], + d_eigenVectorsFlattenedHost.data(), d_numEigenValues, + matrix_free_data.get_vector_partitioner()->locally_owned_size(), std::make_pair(0, numberWaveFunctionsErrorEstimate), eigenVectorsArray); @@ -1803,6 +1800,8 @@ namespace dftfe if (initializeCublas) { kohnShamDFTEigenOperatorDevice.createDeviceBlasHandle(); + basisOperationsPtrDevice->setDeviceBLASHandle( + &(kohnShamDFTEigenOperatorDevice.getDeviceBlasHandle())); } AssertThrow( @@ -3330,8 +3329,7 @@ namespace dftfe dealii::Utilities::MPI::n_mpi_processes(interBandGroupComm); const unsigned int localVectorSize = - d_eigenVectorsFlattenedSTL[0].size() / d_numEigenValues; - + matrix_free_data.get_vector_partitioner()->locally_owned_size(); if (numberBandGroups > 1 && !d_dftParamsPtr->useDevice) { MPI_Barrier(interBandGroupComm); @@ -3347,13 +3345,17 @@ namespace dftfe { const unsigned int 
currentBlockSize = std::min(blockSize, d_numEigenValues * localVectorSize - i); - MPI_Allreduce(MPI_IN_PLACE, - &d_eigenVectorsFlattenedSTL[kPoint][0] + i, - currentBlockSize, - dataTypes::mpi_type_id( - &d_eigenVectorsFlattenedSTL[kPoint][0]), - MPI_SUM, - interBandGroupComm); + MPI_Allreduce( + MPI_IN_PLACE, + &d_eigenVectorsFlattenedHost[kPoint * d_numEigenValues * + localVectorSize] + + i, + currentBlockSize, + dataTypes::mpi_type_id( + &d_eigenVectorsFlattenedHost[kPoint * d_numEigenValues * + localVectorSize]), + MPI_SUM, + interBandGroupComm); } } @@ -3553,17 +3555,7 @@ namespace dftfe if (d_dftParamsPtr->useDevice && (d_dftParamsPtr->writeWfcSolutionFields || d_dftParamsPtr->writeLdosFile || d_dftParamsPtr->writePdosFile)) - for (unsigned int kPoint = 0; - kPoint < - (1 + d_dftParamsPtr->spinPolarized) * d_kPointWeights.size(); - ++kPoint) - { - d_eigenVectorsFlattenedDevice.copyTo( - &d_eigenVectorsFlattenedSTL[kPoint][0], - d_eigenVectorsFlattenedSTL[kPoint].size(), - (kPoint * d_eigenVectorsFlattenedSTL[0].size()), - 0); - } + d_eigenVectorsFlattenedDevice.copyTo(d_eigenVectorsFlattenedHost); #endif @@ -3957,20 +3949,26 @@ namespace dftfe { #ifdef USE_COMPLEX vectorTools::copyFlattenedSTLVecToSingleCompVec( - d_eigenVectorsFlattenedSTL[k * - (1 + d_dftParamsPtr->spinPolarized) + - s], + d_eigenVectorsFlattenedHost.data() + + (k * (1 + d_dftParamsPtr->spinPolarized) + s) * + d_numEigenValues * + matrix_free_data.get_vector_partitioner() + ->locally_owned_size(), d_numEigenValues, + matrix_free_data.get_vector_partitioner()->locally_owned_size(), std::make_pair(i, i + 1), localProc_dof_indicesReal, localProc_dof_indicesImag, tempVec); #else vectorTools::copyFlattenedSTLVecToSingleCompVec( - d_eigenVectorsFlattenedSTL[k * - (1 + d_dftParamsPtr->spinPolarized) + - s], + d_eigenVectorsFlattenedHost.data() + + (k * (1 + d_dftParamsPtr->spinPolarized) + s) * + d_numEigenValues * + matrix_free_data.get_vector_partitioner() + ->locally_owned_size(), 
d_numEigenValues, + matrix_free_data.get_vector_partitioner()->locally_owned_size(), std::make_pair(i, i + 1), tempVec); #endif diff --git a/src/dft/dos.cc b/src/dft/dos.cc index aabf4474d..e993137b8 100644 --- a/src/dft/dos.cc +++ b/src/dft/dos.cc @@ -357,7 +357,7 @@ namespace dftfe std::vector tempQuadPointValues(n_q_points); const unsigned int localVectorSize = - d_eigenVectorsFlattenedSTL[0].size() / d_numEigenValues; + matrix_free_data.get_vector_partitioner()->locally_owned_size(); std::vector>> eigenVectors( (1 + d_dftParamsPtr->spinPolarized) * d_kPointWeights.size()); std::vector> @@ -419,8 +419,10 @@ namespace dftfe for (unsigned int iWave = 0; iWave < currentBlockSize; ++iWave) eigenVectorsFlattenedBlock[kPoint].local_element( iNode * currentBlockSize + iWave) = - d_eigenVectorsFlattenedSTL[kPoint][iNode * d_numEigenValues + - ivec + iWave]; + d_eigenVectorsFlattenedHost[kPoint * d_numEigenValues * + localVectorSize + + iNode * d_numEigenValues + ivec + + iWave]; constraintsNoneDataInfo.distribute( eigenVectorsFlattenedBlock[kPoint], currentBlockSize); @@ -899,7 +901,7 @@ namespace dftfe std::vector tempQuadPointValues(n_q_points); const unsigned int localVectorSize = - d_eigenVectorsFlattenedSTL[0].size() / d_numEigenValues; + matrix_free_data.get_vector_partitioner()->locally_owned_size(); std::vector>> eigenVectors( (1 + d_dftParamsPtr->spinPolarized) * d_kPointWeights.size()); std::vector> @@ -968,8 +970,10 @@ namespace dftfe for (unsigned int iWave = 0; iWave < currentBlockSize; ++iWave) eigenVectorsFlattenedBlock[kPoint].local_element( iNode * currentBlockSize + iWave) = - d_eigenVectorsFlattenedSTL[kPoint][iNode * d_numEigenValues + - ivec + iWave]; + d_eigenVectorsFlattenedHost[kPoint * localVectorSize * + d_numEigenValues + + iNode * d_numEigenValues + ivec + + iWave]; constraintsNoneDataInfo.distribute( eigenVectorsFlattenedBlock[kPoint], currentBlockSize); diff --git a/src/dft/initBoundaryConditions.cc b/src/dft/initBoundaryConditions.cc index 
5e0814c20..7f3b612f2 100644 --- a/src/dft/initBoundaryConditions.cc +++ b/src/dft/initBoundaryConditions.cc @@ -262,6 +262,19 @@ namespace dftfe d_constraintsVector, quadratureVector, additional_data); + basisOperationsPtrHost = std::make_unique< + dftfe::basis::FEBasisOperations>( + matrix_free_data, d_constraintsVector); +#if defined(DFTFE_WITH_DEVICE) + if (d_dftParamsPtr->useDevice) + basisOperationsPtrDevice = std::make_unique< + dftfe::basis::FEBasisOperations>( + matrix_free_data, d_constraintsVector); +#endif MPI_Barrier(d_mpiCommParent); init_mf = MPI_Wtime() - init_mf; diff --git a/src/dft/initElectronicFields.cc b/src/dft/initElectronicFields.cc index 14b560063..054d5610f 100644 --- a/src/dft/initElectronicFields.cc +++ b/src/dft/initElectronicFields.cc @@ -104,16 +104,17 @@ namespace dftfe kPoint < (1 + d_dftParamsPtr->spinPolarized) * d_kPointWeights.size(); ++kPoint) { - d_eigenVectorsFlattenedSTL[kPoint].resize( - d_numEigenValues * - matrix_free_data.get_vector_partitioner()->local_size(), + d_eigenVectorsFlattenedHost.resize( + (d_numEigenValues * + matrix_free_data.get_vector_partitioner()->local_size()) * + (1 + d_dftParamsPtr->spinPolarized) * d_kPointWeights.size(), dataTypes::number(0.0)); - if (d_numEigenValuesRR != d_numEigenValues) { - d_eigenVectorsRotFracDensityFlattenedSTL[kPoint].resize( + d_eigenVectorsRotFracDensityFlattenedHost.resize( d_numEigenValuesRR * - matrix_free_data.get_vector_partitioner()->local_size(), + matrix_free_data.get_vector_partitioner()->local_size() * + (1 + d_dftParamsPtr->spinPolarized) * d_kPointWeights.size(), dataTypes::number(0.0)); } } @@ -145,40 +146,26 @@ namespace dftfe if (d_dftParamsPtr->useDevice) { d_eigenVectorsFlattenedDevice.resize( - d_eigenVectorsFlattenedSTL[0].size() * - (1 + d_dftParamsPtr->spinPolarized) * d_kPointWeights.size()); + d_eigenVectorsFlattenedHost.size()); if (d_dftParamsPtr->mixingMethod == "LOW_RANK_DIELECM_PRECOND") d_eigenVectorsDensityMatrixPrimeFlattenedDevice.resize( - 
d_eigenVectorsFlattenedSTL[0].size() * - (1 + d_dftParamsPtr->spinPolarized) * d_kPointWeights.size()); + d_eigenVectorsFlattenedHost.size()); if (d_numEigenValuesRR != d_numEigenValues) d_eigenVectorsRotFracFlattenedDevice.resize( - d_eigenVectorsRotFracDensityFlattenedSTL[0].size() * - (1 + d_dftParamsPtr->spinPolarized) * d_kPointWeights.size()); + d_eigenVectorsRotFracDensityFlattenedHost.size()); else d_eigenVectorsRotFracFlattenedDevice.resize(1); - for (unsigned int kPoint = 0; - kPoint < - (1 + d_dftParamsPtr->spinPolarized) * d_kPointWeights.size(); - ++kPoint) - { - d_eigenVectorsFlattenedDevice - .copyFrom( - &d_eigenVectorsFlattenedSTL[kPoint][0], - d_eigenVectorsFlattenedSTL[0].size(), - 0, - kPoint * d_eigenVectorsFlattenedSTL[0].size()); - } + d_eigenVectorsFlattenedDevice.copyFrom(d_eigenVectorsFlattenedHost); } #endif if (!d_dftParamsPtr->useDevice && d_dftParamsPtr->mixingMethod == "LOW_RANK_DIELECM_PRECOND") { - d_eigenVectorsDensityMatrixPrimeSTL = d_eigenVectorsFlattenedSTL; + d_eigenVectorsDensityMatrixPrimeHost = d_eigenVectorsFlattenedHost; } if (d_dftParamsPtr->verbosity >= 2 && d_dftParamsPtr->spinPolarized == 1) diff --git a/src/dft/kohnShamEigenSolve.cc b/src/dft/kohnShamEigenSolve.cc index c1fbc84c4..e49a52e4c 100644 --- a/src/dft/kohnShamEigenSolve.cc +++ b/src/dft/kohnShamEigenSolve.cc @@ -27,15 +27,11 @@ namespace dftfe namespace internal { void - pointWiseScaleWithDiagonal( - const distributedCPUVec &diagonal, - const std::shared_ptr - & singleComponentPartitioner, - const unsigned int numberFields, - std::vector &fieldsArrayFlattened) + pointWiseScaleWithDiagonal(const distributedCPUVec &diagonal, + const unsigned int numberFields, + const unsigned int numberDofs, + dataTypes::number *fieldsArrayFlattened) { - const unsigned int numberDofs = - fieldsArrayFlattened.size() / numberFields; const unsigned int inc = 1; for (unsigned int i = 0; i < numberDofs; ++i) @@ -178,9 +174,9 @@ namespace dftfe // by M^{1/2} 
internal::pointWiseScaleWithDiagonal( kohnShamDFTEigenOperator.d_sqrtMassVector, - matrix_free_data.get_vector_partitioner(), d_numEigenValues, - d_eigenVectorsFlattenedSTL[0]); + matrix_free_data.get_vector_partitioner()->locally_owned_size(), + d_eigenVectorsFlattenedHost.data()); // @@ -188,9 +184,11 @@ namespace dftfe // std::vector ProjHam; - kohnShamDFTEigenOperator.XtHX(d_eigenVectorsFlattenedSTL[0], - d_numEigenValues, - ProjHam); + kohnShamDFTEigenOperator.XtHX( + d_eigenVectorsFlattenedHost.data(), + d_numEigenValues, + matrix_free_data.get_vector_partitioner()->locally_owned_size(), + ProjHam); // // scale the eigenVectors with M^{-1/2} to represent the wavefunctions in @@ -198,9 +196,9 @@ namespace dftfe // internal::pointWiseScaleWithDiagonal( kohnShamDFTEigenOperator.d_invSqrtMassVector, - matrix_free_data.get_vector_partitioner(), d_numEigenValues, - d_eigenVectorsFlattenedSTL[0]); + matrix_free_data.get_vector_partitioner()->locally_owned_size(), + d_eigenVectorsFlattenedHost.data()); dataTypes::number trXtHX = 0.0; @@ -243,25 +241,30 @@ namespace dftfe // by M^{1/2} internal::pointWiseScaleWithDiagonal( kohnShamDFTEigenOperator.d_sqrtMassVector, - matrix_free_data.get_vector_partitioner(), d_numEigenValues, - d_eigenVectorsFlattenedSTL[0]); + matrix_free_data.get_vector_partitioner()->locally_owned_size(), + d_eigenVectorsFlattenedHost.data()); // // orthogonalize the vectors // linearAlgebraOperations::gramSchmidtOrthogonalization( - d_eigenVectorsFlattenedSTL[0], d_numEigenValues, mpi_communicator); + d_eigenVectorsFlattenedHost.data(), + d_numEigenValues, + matrix_free_data.get_vector_partitioner()->locally_owned_size(), + mpi_communicator); // // compute projected Hamiltonian // std::vector ProjHam; - kohnShamDFTEigenOperator.XtHX(d_eigenVectorsFlattenedSTL[0], - d_numEigenValues, - ProjHam); + kohnShamDFTEigenOperator.XtHX( + d_eigenVectorsFlattenedHost.data(), + d_numEigenValues, + 
matrix_free_data.get_vector_partitioner()->locally_owned_size(), + ProjHam); // // scale the eigenVectors with M^{-1/2} to represent the wavefunctions in @@ -269,9 +272,9 @@ namespace dftfe // internal::pointWiseScaleWithDiagonal( kohnShamDFTEigenOperator.d_invSqrtMassVector, - matrix_free_data.get_vector_partitioner(), d_numEigenValues, - d_eigenVectorsFlattenedSTL[0]); + matrix_free_data.get_vector_partitioner()->locally_owned_size(), + d_eigenVectorsFlattenedHost.data()); double trXtKX = 0.0; #ifdef USE_COMPLEX @@ -311,11 +314,13 @@ namespace dftfe { internal::pointWiseScaleWithDiagonal( kohnShamDFTEigenOperator.d_sqrtMassVector, - matrix_free_data.get_vector_partitioner(), d_numEigenValues, - d_eigenVectorsFlattenedSTL[(1 + d_dftParamsPtr->spinPolarized) * - kPointIndex + - spinType]); + matrix_free_data.get_vector_partitioner()->locally_owned_size(), + d_eigenVectorsFlattenedHost.data() + + ((1 + d_dftParamsPtr->spinPolarized) * kPointIndex + spinType) * + d_numEigenValues * + matrix_free_data.get_vector_partitioner() + ->locally_owned_size()); } @@ -332,10 +337,14 @@ namespace dftfe const unsigned int flag = linearAlgebraOperations::pseudoGramSchmidtOrthogonalization( *d_elpaScala, - d_eigenVectorsFlattenedSTL[(1 + d_dftParamsPtr->spinPolarized) * - kPointIndex + - spinType], + d_eigenVectorsFlattenedHost.data() + + ((1 + d_dftParamsPtr->spinPolarized) * kPointIndex + + spinType) * + d_numEigenValues * + matrix_free_data.get_vector_partitioner() + ->locally_owned_size(), d_numEigenValues, + matrix_free_data.get_vector_partitioner()->locally_owned_size(), d_mpiCommParent, interBandGroupComm, mpi_communicator, @@ -353,25 +362,28 @@ namespace dftfe { internal::pointWiseScaleWithDiagonal( kohnShamDFTEigenOperator.d_invSqrtMassVector, - matrix_free_data.get_vector_partitioner(), d_numEigenValues, - d_eigenVectorsFlattenedSTL[(1 + d_dftParamsPtr->spinPolarized) * - kPointIndex + - spinType]); + matrix_free_data.get_vector_partitioner()->locally_owned_size(), + 
d_eigenVectorsFlattenedHost.data() + + ((1 + d_dftParamsPtr->spinPolarized) * kPointIndex + spinType) * + d_numEigenValues * + matrix_free_data.get_vector_partitioner() + ->locally_owned_size()); } } computeRhoFromPSICPU( - d_eigenVectorsFlattenedSTL, - d_eigenVectorsRotFracDensityFlattenedSTL, + d_eigenVectorsFlattenedHost.data(), + d_eigenVectorsRotFracDensityFlattenedHost.data(), d_numEigenValues, d_numEigenValuesRR, - d_eigenVectorsFlattenedSTL[0].size() / d_numEigenValues, + matrix_free_data.get_vector_partitioner()->locally_owned_size(), eigenValues, fermiEnergy, fermiEnergyUp, fermiEnergyDown, kohnShamDFTEigenOperator, + basisOperationsPtrHost, dofHandler, matrix_free_data.n_physical_cells(), matrix_free_data.get_dofs_per_cell(d_densityDofHandlerIndex), @@ -420,11 +432,12 @@ namespace dftfe // by M^{1/2} internal::pointWiseScaleWithDiagonal( kohnShamDFTEigenOperator.d_sqrtMassVector, - matrix_free_data.get_vector_partitioner(), d_numEigenValues, - d_eigenVectorsFlattenedSTL[(1 + d_dftParamsPtr->spinPolarized) * - kPointIndex + - spinType]); + matrix_free_data.get_vector_partitioner()->locally_owned_size(), + d_eigenVectorsFlattenedHost.data() + + ((1 + d_dftParamsPtr->spinPolarized) * kPointIndex + spinType) * + d_numEigenValues * + matrix_free_data.get_vector_partitioner()->locally_owned_size()); std::vector eigenValuesTemp(isSpectrumSplit ? 
d_numEigenValuesRR : d_numEigenValues, @@ -488,12 +501,16 @@ namespace dftfe subspaceIterationSolver.solve( kohnShamDFTEigenOperator, elpaScala, - d_eigenVectorsFlattenedSTL[(1 + d_dftParamsPtr->spinPolarized) * - kPointIndex + - spinType], - d_eigenVectorsRotFracDensityFlattenedSTL - [(1 + d_dftParamsPtr->spinPolarized) * kPointIndex + spinType], + d_eigenVectorsFlattenedHost.data() + + ((1 + d_dftParamsPtr->spinPolarized) * kPointIndex + spinType) * + d_numEigenValues * + matrix_free_data.get_vector_partitioner()->locally_owned_size(), + d_eigenVectorsRotFracDensityFlattenedHost.data() + + ((1 + d_dftParamsPtr->spinPolarized) * kPointIndex + spinType) * + d_numEigenValuesRR * + matrix_free_data.get_vector_partitioner()->locally_owned_size(), d_numEigenValues, + matrix_free_data.get_vector_partitioner()->locally_owned_size(), eigenValuesTemp, residualNormWaveFunctions, interBandGroupComm, @@ -507,20 +524,23 @@ namespace dftfe // internal::pointWiseScaleWithDiagonal( kohnShamDFTEigenOperator.d_invSqrtMassVector, - matrix_free_data.get_vector_partitioner(), d_numEigenValues, - d_eigenVectorsFlattenedSTL[(1 + d_dftParamsPtr->spinPolarized) * - kPointIndex + - spinType]); + matrix_free_data.get_vector_partitioner()->locally_owned_size(), + d_eigenVectorsFlattenedHost.data() + + ((1 + d_dftParamsPtr->spinPolarized) * kPointIndex + spinType) * + d_numEigenValues * + matrix_free_data.get_vector_partitioner()->locally_owned_size()); if (isSpectrumSplit && d_numEigenValuesRR != d_numEigenValues) { internal::pointWiseScaleWithDiagonal( kohnShamDFTEigenOperator.d_invSqrtMassVector, - matrix_free_data.get_vector_partitioner(), d_numEigenValuesRR, - d_eigenVectorsRotFracDensityFlattenedSTL - [(1 + d_dftParamsPtr->spinPolarized) * kPointIndex + spinType]); + matrix_free_data.get_vector_partitioner()->locally_owned_size(), + d_eigenVectorsRotFracDensityFlattenedHost.data() + + ((1 + d_dftParamsPtr->spinPolarized) * kPointIndex + spinType) * + d_numEigenValuesRR * + 
matrix_free_data.get_vector_partitioner()->locally_owned_size()); } // @@ -638,8 +658,10 @@ namespace dftfe elpaScala, d_eigenVectorsFlattenedDevice.begin() + ((1 + d_dftParamsPtr->spinPolarized) * kPointIndex + spinType) * - d_eigenVectorsFlattenedSTL[0].size(), - d_eigenVectorsFlattenedSTL[0].size(), + d_numEigenValues * + matrix_free_data.get_vector_partitioner()->locally_owned_size(), + d_numEigenValues * + matrix_free_data.get_vector_partitioner()->locally_owned_size(), d_numEigenValues, eigenValuesDummy, *d_devicecclMpiCommDomainPtr, @@ -657,11 +679,14 @@ namespace dftfe elpaScala, d_eigenVectorsFlattenedDevice.begin() + ((1 + d_dftParamsPtr->spinPolarized) * kPointIndex + spinType) * - d_eigenVectorsFlattenedSTL[0].size(), + d_numEigenValues * + matrix_free_data.get_vector_partitioner()->locally_owned_size(), d_eigenVectorsRotFracFlattenedDevice.begin() + ((1 + d_dftParamsPtr->spinPolarized) * kPointIndex + spinType) * - d_eigenVectorsRotFracDensityFlattenedSTL[0].size(), - d_eigenVectorsFlattenedSTL[0].size(), + d_numEigenValuesRR * + matrix_free_data.get_vector_partitioner()->locally_owned_size(), + d_numEigenValues * + matrix_free_data.get_vector_partitioner()->locally_owned_size(), d_numEigenValues, eigenValuesTemp, residualNormWaveFunctions, @@ -763,11 +788,12 @@ namespace dftfe // multiply by M^{1/2} internal::pointWiseScaleWithDiagonal( kohnShamDFTEigenOperator.d_sqrtMassVector, - matrix_free_data.get_vector_partitioner(d_densityDofHandlerIndex), d_numEigenValues, - d_eigenVectorsDensityMatrixPrimeSTL[(1 + d_dftParamsPtr->spinPolarized) * - kPointIndex + - spinType]); + matrix_free_data.get_vector_partitioner()->locally_owned_size(), + d_eigenVectorsDensityMatrixPrimeHost.data() + + ((1 + d_dftParamsPtr->spinPolarized) * kPointIndex + spinType) * + d_numEigenValues * + matrix_free_data.get_vector_partitioner()->locally_owned_size()); std::vector eigenValuesTemp(d_numEigenValues, 0.0); for (unsigned int i = 0; i < d_numEigenValues; i++) @@ -779,10 
+805,12 @@ namespace dftfe linearAlgebraOperations::densityMatrixEigenBasisFirstOrderResponse( kohnShamDFTEigenOperator, - d_eigenVectorsDensityMatrixPrimeSTL[(1 + d_dftParamsPtr->spinPolarized) * - kPointIndex + - spinType], + d_eigenVectorsDensityMatrixPrimeHost.data() + + ((1 + d_dftParamsPtr->spinPolarized) * kPointIndex + spinType) * + d_numEigenValues * + matrix_free_data.get_vector_partitioner()->locally_owned_size(), d_numEigenValues, + matrix_free_data.get_vector_partitioner()->locally_owned_size(), d_mpiCommParent, kohnShamDFTEigenOperator.getMPICommunicator(), interBandGroupComm, @@ -801,11 +829,12 @@ namespace dftfe // internal::pointWiseScaleWithDiagonal( kohnShamDFTEigenOperator.d_invSqrtMassVector, - matrix_free_data.get_vector_partitioner(d_densityDofHandlerIndex), d_numEigenValues, - d_eigenVectorsDensityMatrixPrimeSTL[(1 + d_dftParamsPtr->spinPolarized) * - kPointIndex + - spinType]); + matrix_free_data.get_vector_partitioner()->locally_owned_size(), + d_eigenVectorsDensityMatrixPrimeHost.data() + + ((1 + d_dftParamsPtr->spinPolarized) * kPointIndex + spinType) * + d_numEigenValues * + matrix_free_data.get_vector_partitioner()->locally_owned_size()); } #ifdef DFTFE_WITH_DEVICE @@ -840,8 +869,10 @@ namespace dftfe kohnShamDFTEigenOperator, d_eigenVectorsDensityMatrixPrimeFlattenedDevice.begin() + ((1 + d_dftParamsPtr->spinPolarized) * kPointIndex + spinType) * - d_eigenVectorsFlattenedSTL[0].size(), - d_eigenVectorsFlattenedSTL[0].size(), + d_numEigenValues * + matrix_free_data.get_vector_partitioner()->locally_owned_size(), + d_numEigenValues * + matrix_free_data.get_vector_partitioner()->locally_owned_size(), d_numEigenValues, eigenValuesTemp, fermiEnergy, @@ -880,11 +911,12 @@ namespace dftfe if (ipass == 1) internal::pointWiseScaleWithDiagonal( kohnShamDFTEigenOperator.d_invSqrtMassVector, - matrix_free_data.get_vector_partitioner(), d_numEigenValues, - d_eigenVectorsFlattenedSTL[(1 + d_dftParamsPtr->spinPolarized) * - kPointIndex + - 
spinType]); + matrix_free_data.get_vector_partitioner()->locally_owned_size(), + d_eigenVectorsFlattenedHost.data() + + ((1 + d_dftParamsPtr->spinPolarized) * kPointIndex + spinType) * + d_numEigenValues * + matrix_free_data.get_vector_partitioner()->locally_owned_size()); std::vector eigenValuesTemp(d_numEigenValues, 0.0); @@ -935,13 +967,16 @@ namespace dftfe subspaceIterationSolver.solve( kohnShamDFTEigenOperator, *d_elpaScala, - d_eigenVectorsFlattenedSTL[(1 + d_dftParamsPtr->spinPolarized) * - kPointIndex + - spinType], - d_eigenVectorsFlattenedSTL[(1 + d_dftParamsPtr->spinPolarized) * - kPointIndex + - spinType], + d_eigenVectorsFlattenedHost.data() + + ((1 + d_dftParamsPtr->spinPolarized) * kPointIndex + spinType) * + d_numEigenValues * + matrix_free_data.get_vector_partitioner()->locally_owned_size(), + d_eigenVectorsFlattenedHost.data() + + ((1 + d_dftParamsPtr->spinPolarized) * kPointIndex + spinType) * + d_numEigenValues * + matrix_free_data.get_vector_partitioner()->locally_owned_size(), d_numEigenValues, + matrix_free_data.get_vector_partitioner()->locally_owned_size(), eigenValuesTemp, residualNormWaveFunctions, interBandGroupComm, diff --git a/src/dft/localizationLength.cc b/src/dft/localizationLength.cc index 64e06d019..90296b7eb 100644 --- a/src/dft/localizationLength.cc +++ b/src/dft/localizationLength.cc @@ -57,8 +57,9 @@ namespace dftfe for (unsigned int iWave = 0; iWave < d_numEigenValues; ++iWave) { vectorTools::copyFlattenedSTLVecToSingleCompVec( - d_eigenVectorsFlattenedSTL[0], + d_eigenVectorsFlattenedHost.data(), d_numEigenValues, + matrix_free_data.get_vector_partitioner()->locally_owned_size(), std::make_pair(iWave, iWave + 1), tempVec); diff --git a/src/dft/nscf.cc b/src/dft/nscf.cc index d7fcbd145..31df85d95 100644 --- a/src/dft/nscf.cc +++ b/src/dft/nscf.cc @@ -46,7 +46,7 @@ namespace dftfe eigenValues.clear(); a0.clear(); bLow.clear(); - d_eigenVectorsFlattenedSTL.clear(); + d_eigenVectorsFlattenedHost.clear(); 
waveFunctionsVector.clear(); numElectrons = 0; d_numEigenValues = @@ -56,13 +56,9 @@ namespace dftfe eigenValues.resize(d_maxkPoints); a0.resize(d_maxkPoints, 0.0); bLow.resize(d_maxkPoints, 0.0); - d_eigenVectorsFlattenedSTL.resize(d_maxkPoints); - // - for (unsigned int kPoint = 0; kPoint < d_maxkPoints; ++kPoint) - d_eigenVectorsFlattenedSTL[kPoint].resize( - d_numEigenValues * - matrix_free_data.get_vector_partitioner()->local_size(), - dataTypes::number(0.0)); + d_eigenVectorsFlattenedHost.resize( + d_maxkPoints * d_numEigenValues * + matrix_free_data.get_vector_partitioner()->locally_owned_size()); // pcout << " check 0.1 " << std::endl; // diff --git a/src/dft/psiInitialGuess.cc b/src/dft/psiInitialGuess.cc index f9a8ae3a9..f065dec2f 100644 --- a/src/dft/psiInitialGuess.cc +++ b/src/dft/psiInitialGuess.cc @@ -344,14 +344,9 @@ namespace dftfe locallyOwnedSet.fill_index_vector(locallyOwnedDOFs); unsigned int numberDofs = locallyOwnedDOFs.size(); - for (unsigned int kPoint = 0; - kPoint < (1 + d_dftParamsPtr->spinPolarized) * d_kPointWeights.size(); - ++kPoint) - { - std::fill(d_eigenVectorsFlattenedSTL[kPoint].begin(), - d_eigenVectorsFlattenedSTL[kPoint].end(), - 0.0); - } + std::fill(d_eigenVectorsFlattenedHost.begin(), + d_eigenVectorsFlattenedHost.end(), + 0.0); const unsigned int numberGlobalAtoms = atomLocations.size(); @@ -498,8 +493,9 @@ namespace dftfe // spherical part if (it->m > 0) { - d_eigenVectorsFlattenedSTL - [kPoint][dof * d_numEigenValues + waveId] += + d_eigenVectorsFlattenedHost + [kPoint * d_numEigenValues * numberDofs + + dof * d_numEigenValues + waveId] += dataTypes::number( R * std::sqrt(2) * boost::math::spherical_harmonic_r( @@ -507,16 +503,18 @@ namespace dftfe } else if (it->m == 0) { - d_eigenVectorsFlattenedSTL - [kPoint][dof * d_numEigenValues + waveId] += + d_eigenVectorsFlattenedHost + [kPoint * d_numEigenValues * numberDofs + + dof * d_numEigenValues + waveId] += dataTypes::number( R * boost::math::spherical_harmonic_r( 
it->l, it->m, theta, phi)); } else { - d_eigenVectorsFlattenedSTL - [kPoint][dof * d_numEigenValues + waveId] += + d_eigenVectorsFlattenedHost + [kPoint * d_numEigenValues * numberDofs + + dof * d_numEigenValues + waveId] += dataTypes::number( R * std::sqrt(2) * boost::math::spherical_harmonic_i( @@ -539,8 +537,9 @@ namespace dftfe // // boost::math::normal normDist; - std::vector &temp = - d_eigenVectorsFlattenedSTL[kPoint]; + dataTypes::number *temp = + d_eigenVectorsFlattenedHost.data() + + kPoint * d_numEigenValues * numberDofs; for (unsigned int iWave = waveFunctionsVector.size(); iWave < d_numEigenValues; ++iWave) @@ -565,11 +564,10 @@ namespace dftfe (1 + d_dftParamsPtr->spinPolarized) * d_kPointWeights.size(); ++kPoint) { - std::vector &temp1 = - d_eigenVectorsFlattenedSTL[kPoint]; + dataTypes::number *temp1 = d_eigenVectorsFlattenedHost.data() + + kPoint * d_numEigenValues * numberDofs; - std::vector &temp2 = - d_eigenVectorsFlattenedSTL[0]; + dataTypes::number *temp2 = d_eigenVectorsFlattenedHost.data(); for (unsigned int idof = 0; idof < numberDofs; idof++) for (unsigned int iwave = 0; iwave < d_numEigenValues; iwave++) diff --git a/src/dftOperator/kohnShamDFTOperator.cc b/src/dftOperator/kohnShamDFTOperator.cc index 80d4c15ee..7cfc3d102 100644 --- a/src/dftOperator/kohnShamDFTOperator.cc +++ b/src/dftOperator/kohnShamDFTOperator.cc @@ -405,11 +405,6 @@ namespace dftfe kohnShamDFTOperatorClass:: getShapeFunctionValuesDensityGaussQuad() const { - static bool once = [&]() { - std::cout << "DEBUG vals " << d_densityGaussQuadShapeFunctionValues.size() - << std::endl; - return true; - }(); return d_densityGaussQuadShapeFunctionValues; } @@ -419,13 +414,6 @@ namespace dftfe kohnShamDFTOperatorClass:: getShapeFunctionGradValuesDensityGaussQuad() const { - static bool once2 = [&]() { - std::cout << "DEBUG vals " - << d_densityGaussQuadShapeFunctionGradientValues.size() - << std::endl; - return true; - }(); - return d_densityGaussQuadShapeFunctionGradientValues; 
} @@ -1251,14 +1239,14 @@ namespace dftfe template void kohnShamDFTOperatorClass::XtHX( - const std::vector &X, - const unsigned int numberWaveFunctions, - std::vector & ProjHam) + const dataTypes::number * X, + const unsigned int numberWaveFunctions, + const unsigned int numberDofs, + std::vector &ProjHam) { // // Get access to number of locally owned nodes on the current processor // - const unsigned int numberDofs = X.size() / numberWaveFunctions; // // Resize ProjHam @@ -1338,8 +1326,9 @@ namespace dftfe template void kohnShamDFTOperatorClass::XtHX( - const std::vector & X, + const dataTypes::number * X, const unsigned int numberWaveFunctions, + const unsigned int numberDofs, const std::shared_ptr &processGrid, dftfe::ScaLAPACKMatrix & projHamPar, const bool onlyHPrimePartForFirstOrderDensityMatResponse) @@ -1347,7 +1336,6 @@ namespace dftfe // // Get access to number of locally owned nodes on the current processor // - const unsigned int numberDofs = X.size() / numberWaveFunctions; // create temporary arrays XBlock,Hx distributedCPUMultiVec XBlock, HXBlock; @@ -1512,9 +1500,10 @@ namespace dftfe template void kohnShamDFTOperatorClass::XtHXMixedPrec( - const std::vector & X, + const dataTypes::number * X, const unsigned int N, const unsigned int Ncore, + const unsigned int numberDofs, const std::shared_ptr &processGrid, dftfe::ScaLAPACKMatrix & projHamPar, const bool onlyHPrimePartForFirstOrderDensityMatResponse) @@ -1522,7 +1511,6 @@ namespace dftfe // // Get access to number of locally owned nodes on the current processor // - const unsigned int numberDofs = X.size() / N; // create temporary arrays XBlock,Hx distributedCPUMultiVec XBlock, HXBlock; @@ -1570,7 +1558,7 @@ namespace dftfe std::vector HXBlockSinglePrec; - std::vector XSinglePrec(&X[0], &X[0] + X.size()); + std::vector XSinglePrec(X, X + numberDofs * N); if (dftPtr->d_dftParamsPtr->verbosity >= 4) dftUtils::printCurrentMemoryUsage( diff --git 
a/src/force/configurationalForceCompute/configurationalForceEEshelbyFPSPFnlLinFE.cc b/src/force/configurationalForceCompute/configurationalForceEEshelbyFPSPFnlLinFE.cc index 165b5026a..bc07edf27 100644 --- a/src/force/configurationalForceCompute/configurationalForceEEshelbyFPSPFnlLinFE.cc +++ b/src/force/configurationalForceCompute/configurationalForceEEshelbyFPSPFnlLinFE.cc @@ -173,7 +173,7 @@ namespace dftfe const unsigned int localVectorSize = - dftPtr->d_eigenVectorsFlattenedSTL[0].size() / numEigenVectors; + matrixFreeData.get_vector_partitioner()->locally_owned_size(); const unsigned int numMacroCells = matrixFreeData.n_cell_batches(); @@ -296,7 +296,7 @@ namespace dftfe force::wfcContractionsForceKernelsAllH( kohnShamDFTEigenOperator, - dftPtr->d_eigenVectorsFlattenedSTL, + dftPtr->d_eigenVectorsFlattenedHost.begin(), d_dftParams.spinPolarized, spinIndex, dftPtr->eigenValues, diff --git a/src/force/configurationalStressCompute/computeStressEEshelbyEPSPEnlEk.cc b/src/force/configurationalStressCompute/computeStressEEshelbyEPSPEnlEk.cc index 9c6003ec0..08c534633 100644 --- a/src/force/configurationalStressCompute/computeStressEEshelbyEPSPEnlEk.cc +++ b/src/force/configurationalStressCompute/computeStressEEshelbyEPSPEnlEk.cc @@ -163,7 +163,7 @@ namespace dftfe bandGroupLowHighPlusOneIndices[1]); const unsigned int localVectorSize = - dftPtr->d_eigenVectorsFlattenedSTL[0].size() / numEigenVectors; + matrixFreeData.get_vector_partitioner()->locally_owned_size(); std::vector>> eigenVectors( dftPtr->d_kPointWeights.size()); std::vector> @@ -289,7 +289,7 @@ namespace dftfe force::wfcContractionsForceKernelsAllH( kohnShamDFTEigenOperator, - dftPtr->d_eigenVectorsFlattenedSTL, + dftPtr->d_eigenVectorsFlattenedHost.begin(), d_dftParams.spinPolarized, spinIndex, dftPtr->eigenValues, diff --git a/src/force/forceWfcContractions.cc b/src/force/forceWfcContractions.cc index fbc8a8628..e9b2e9771 100644 --- a/src/force/forceWfcContractions.cc +++ 
b/src/force/forceWfcContractions.cc @@ -607,7 +607,7 @@ namespace dftfe operatorDFTClass & operatorMatrix, distributedCPUMultiVec &flattenedArrayBlock, distributedCPUMultiVec &projectorKetTimesVector, - const std::vector & X, + const dataTypes::number * X, const std::vector & eigenValues, const std::vector & partialOccupancies, const std::vector & kcoord, @@ -725,14 +725,14 @@ namespace dftfe void wfcContractionsForceKernelsAllH( - operatorDFTClass & operatorMatrix, - const std::vector> &X, - const unsigned int spinPolarizedFlag, - const unsigned int spinIndex, - const std::vector> & eigenValuesH, - const std::vector> & partialOccupanciesH, - const std::vector & kPointCoordinates, - const unsigned int *nonTrivialIdToElemIdMapH, + operatorDFTClass & operatorMatrix, + const dataTypes::number * X, + const unsigned int spinPolarizedFlag, + const unsigned int spinIndex, + const std::vector> &eigenValuesH, + const std::vector> &partialOccupanciesH, + const std::vector & kPointCoordinates, + const unsigned int * nonTrivialIdToElemIdMapH, const unsigned int *projecterKetTimesFlattenedVectorLocalIdsH, const unsigned int MLoc, const unsigned int N, @@ -894,7 +894,8 @@ namespace dftfe operatorMatrix, flattenedArrayBlock, projectorKetTimesVector, - X[(1 + spinPolarizedFlag) * kPoint + spinIndex], + X + + ((1 + spinPolarizedFlag) * kPoint + spinIndex) * MLoc * N, blockedEigenValues, blockedPartialOccupancies, kcoord, diff --git a/src/linAlg/linearAlgebraOperationsOpt.cc b/src/linAlg/linearAlgebraOperationsOpt.cc index 8eac264b7..4c5766e18 100644 --- a/src/linAlg/linearAlgebraOperationsOpt.cc +++ b/src/linAlg/linearAlgebraOperationsOpt.cc @@ -516,12 +516,12 @@ namespace dftfe template void - gramSchmidtOrthogonalization(std::vector & X, + gramSchmidtOrthogonalization(T * X, const unsigned int numberVectors, + const unsigned int localVectorSize, const MPI_Comm & mpiComm) { #ifdef USE_PETSC - const unsigned int localVectorSize = X.size() / numberVectors; // // Create template 
PETSc vector to create BV object later @@ -614,8 +614,9 @@ namespace dftfe void rayleighRitzGEP(operatorDFTClass & operatorMatrix, elpaScalaManager & elpaScala, - std::vector & X, + T * X, const unsigned int numberWaveFunctions, + const unsigned int localVectorSize, const MPI_Comm & mpiCommParent, const MPI_Comm & interBandGroupComm, const MPI_Comm & mpi_communicator, @@ -660,8 +661,9 @@ namespace dftfe // SConj=X^{T}*XConj. if (!(dftParams.useMixedPrecCGS_O && useMixedPrec)) { - internal::fillParallelOverlapMatrix(&X[0], - X.size(), + internal::fillParallelOverlapMatrix(X, + numberWaveFunctions * + localVectorSize, numberWaveFunctions, processGrid, interBandGroupComm, @@ -674,8 +676,8 @@ namespace dftfe if (std::is_same>::value) internal::fillParallelOverlapMatrixMixedPrec>( - &X[0], - X.size(), + X, + numberWaveFunctions * localVectorSize, numberWaveFunctions, processGrid, interBandGroupComm, @@ -684,8 +686,8 @@ namespace dftfe dftParams); else internal::fillParallelOverlapMatrixMixedPrec( - &X[0], - X.size(), + X, + numberWaveFunctions * localVectorSize, numberWaveFunctions, processGrid, interBandGroupComm, @@ -791,7 +793,8 @@ namespace dftfe T(0.0)); - operatorMatrix.XtHX(X, numberWaveFunctions, processGrid, projHamPar); + operatorMatrix.XtHX( + X, numberWaveFunctions, localVectorSize, processGrid, projHamPar); computing_timer.leave_subsection("Compute ProjHam, RR step"); computing_timer.enter_subsection( @@ -918,8 +921,8 @@ namespace dftfe projHamParCopy.mmult(projHamPar, LMatPar); if (!(dftParams.useMixedPrecSubspaceRotRR && useMixedPrec)) - internal::subspaceRotation(&X[0], - X.size(), + internal::subspaceRotation(X, + numberWaveFunctions * localVectorSize, numberWaveFunctions, processGrid, interBandGroupComm, @@ -933,8 +936,8 @@ namespace dftfe { if (std::is_same>::value) internal::subspaceRotationMixedPrec>( - &X[0], - X.size(), + X, + numberWaveFunctions * localVectorSize, numberWaveFunctions, processGrid, interBandGroupComm, @@ -944,8 +947,9 @@ namespace 
dftfe false, false); else - internal::subspaceRotationMixedPrec(&X[0], - X.size(), + internal::subspaceRotationMixedPrec(X, + numberWaveFunctions * + localVectorSize, numberWaveFunctions, processGrid, interBandGroupComm, @@ -968,8 +972,9 @@ namespace dftfe void rayleighRitz(operatorDFTClass & operatorMatrix, elpaScalaManager & elpaScala, - std::vector & X, + T * X, const unsigned int numberWaveFunctions, + const unsigned int localVectorSize, const MPI_Comm & mpiCommParent, const MPI_Comm & interBandGroupComm, const MPI_Comm & mpi_communicator, @@ -1006,7 +1011,8 @@ namespace dftfe T(0.0)); computing_timer.enter_subsection("Blocked XtHX, RR step"); - operatorMatrix.XtHX(X, numberWaveFunctions, processGrid, projHamPar); + operatorMatrix.XtHX( + X, numberWaveFunctions, localVectorSize, processGrid, projHamPar); computing_timer.leave_subsection("Blocked XtHX, RR step"); // @@ -1116,8 +1122,8 @@ namespace dftfe processGrid, rowsBlockSize); projHamParCopy.copy_conjugate_transposed(projHamPar); - internal::subspaceRotation(&X[0], - X.size(), + internal::subspaceRotation(X, + numberWaveFunctions * localVectorSize, numberWaveFunctions, processGrid, interBandGroupComm, @@ -1135,9 +1141,10 @@ namespace dftfe void rayleighRitzGEPSpectrumSplitDirect(operatorDFTClass & operatorMatrix, elpaScalaManager & elpaScala, - std::vector & X, - std::vector & Y, + T * X, + T * Y, const unsigned int numberWaveFunctions, + const unsigned int localVectorSize, const unsigned int numberCoreStates, const MPI_Comm & mpiCommParent, const MPI_Comm & interBandGroupComm, @@ -1183,8 +1190,9 @@ namespace dftfe // SConj=X^{T}*XConj if (!(dftParams.useMixedPrecCGS_O && useMixedPrec)) { - internal::fillParallelOverlapMatrix(&X[0], - X.size(), + internal::fillParallelOverlapMatrix(X, + numberWaveFunctions * + localVectorSize, numberWaveFunctions, processGrid, interBandGroupComm, @@ -1197,8 +1205,8 @@ namespace dftfe if (std::is_same>::value) internal::fillParallelOverlapMatrixMixedPrec>( - &X[0], - 
X.size(), + X, + numberWaveFunctions * localVectorSize, numberWaveFunctions, processGrid, interBandGroupComm, @@ -1207,8 +1215,8 @@ namespace dftfe dftParams); else internal::fillParallelOverlapMatrixMixedPrec( - &X[0], - X.size(), + X, + numberWaveFunctions * localVectorSize, numberWaveFunctions, processGrid, interBandGroupComm, @@ -1319,12 +1327,17 @@ namespace dftfe if (useMixedPrec && dftParams.useMixedPrecXTHXSpectrumSplit) { - operatorMatrix.XtHXMixedPrec( - X, numberWaveFunctions, numberCoreStates, processGrid, projHamPar); + operatorMatrix.XtHXMixedPrec(X, + numberWaveFunctions, + numberCoreStates, + localVectorSize, + processGrid, + projHamPar); } else { - operatorMatrix.XtHX(X, numberWaveFunctions, processGrid, projHamPar); + operatorMatrix.XtHX( + X, numberWaveFunctions, localVectorSize, processGrid, projHamPar); } @@ -1497,9 +1510,10 @@ namespace dftfe computing_timer.enter_subsection( "Xfr^{T}={QfrConjPrime}^{C}*LConj^{-1}*X^{T}, RR step"); - internal::subspaceRotationSpectrumSplit(&X[0], - &Y[0], - X.size(), + internal::subspaceRotationSpectrumSplit(X, + Y, + numberWaveFunctions * + localVectorSize, numberWaveFunctions, processGrid, numberWaveFunctions - @@ -1517,8 +1531,8 @@ namespace dftfe if (!(dftParams.useMixedPrecCGS_SR && useMixedPrec)) { computing_timer.enter_subsection("X^{T}=Lconj^{-1}*X^{T}, RR step"); - internal::subspaceRotation(&X[0], - X.size(), + internal::subspaceRotation(X, + numberWaveFunctions * localVectorSize, numberWaveFunctions, processGrid, interBandGroupComm, @@ -1536,8 +1550,8 @@ namespace dftfe "X^{T}=Lconj^{-1}*X^{T} mixed prec, RR step"); if (std::is_same>::value) internal::subspaceRotationCGSMixedPrec>( - &X[0], - X.size(), + X, + numberWaveFunctions * localVectorSize, numberWaveFunctions, processGrid, interBandGroupComm, @@ -1548,8 +1562,8 @@ namespace dftfe false); else internal::subspaceRotationCGSMixedPrec( - &X[0], - X.size(), + X, + numberWaveFunctions * localVectorSize, numberWaveFunctions, processGrid, 
interBandGroupComm, @@ -1566,18 +1580,19 @@ namespace dftfe template void - rayleighRitzSpectrumSplitDirect(operatorDFTClass & operatorMatrix, - elpaScalaManager & elpaScala, - const std::vector &X, - std::vector & Y, - const unsigned int numberWaveFunctions, - const unsigned int numberCoreStates, - const MPI_Comm & mpiCommParent, - const MPI_Comm & interBandGroupComm, - const MPI_Comm & mpi_communicator, - const bool useMixedPrec, - std::vector & eigenValues, - const dftParameters & dftParams) + rayleighRitzSpectrumSplitDirect(operatorDFTClass & operatorMatrix, + elpaScalaManager & elpaScala, + const T * X, + T * Y, + const unsigned int numberWaveFunctions, + const unsigned int localVectorSize, + const unsigned int numberCoreStates, + const MPI_Comm & mpiCommParent, + const MPI_Comm & interBandGroupComm, + const MPI_Comm & mpi_communicator, + const bool useMixedPrec, + std::vector &eigenValues, + const dftParameters &dftParams) { dealii::ConditionalOStream pcout( @@ -1611,15 +1626,20 @@ namespace dftfe if (useMixedPrec && dftParams.useMixedPrecXTHXSpectrumSplit) { computing_timer.enter_subsection("Blocked XtHX Mixed Prec, RR step"); - operatorMatrix.XtHXMixedPrec( - X, numberWaveFunctions, numberCoreStates, processGrid, projHamPar); + operatorMatrix.XtHXMixedPrec(X, + numberWaveFunctions, + numberCoreStates, + localVectorSize, + processGrid, + projHamPar); computing_timer.leave_subsection("Blocked XtHX Mixed Prec, RR step"); } else { computing_timer.enter_subsection("Blocked XtHX, RR step"); - operatorMatrix.XtHX(X, numberWaveFunctions, processGrid, projHamPar); + operatorMatrix.XtHX( + X, numberWaveFunctions, localVectorSize, processGrid, projHamPar); computing_timer.leave_subsection("Blocked XtHX, RR step"); } @@ -1766,9 +1786,10 @@ namespace dftfe computing_timer.enter_subsection("Blocked subspace rotation, RR step"); - internal::subspaceRotationSpectrumSplit(&X[0], - &Y[0], - X.size(), + internal::subspaceRotationSpectrumSplit(X, + Y, + numberWaveFunctions * + 
localVectorSize, numberWaveFunctions, processGrid, numberWaveFunctions - @@ -2373,8 +2394,10 @@ namespace dftfe template void computeEigenResidualNorm(operatorDFTClass & operatorMatrix, - std::vector & X, + T * X, const std::vector &eigenValues, + const unsigned int totalNumberVectors, + const unsigned int localVectorSize, const MPI_Comm & mpiCommParent, const MPI_Comm & mpiCommDomain, const MPI_Comm & interBandGroupComm, @@ -2385,8 +2408,6 @@ namespace dftfe // // get the number of eigenVectors // - const unsigned int totalNumberVectors = eigenValues.size(); - const unsigned int localVectorSize = X.size() / totalNumberVectors; std::vector residualNormSquare(totalNumberVectors, 0.0); // band group parallelization data structures @@ -3006,8 +3027,9 @@ namespace dftfe void densityMatrixEigenBasisFirstOrderResponse( operatorDFTClass & operatorMatrix, - std::vector & X, + T * X, const unsigned int N, + const unsigned int numberLocalDofs, const MPI_Comm & mpiCommParent, const MPI_Comm & mpiCommDomain, const MPI_Comm & interBandGroupComm, @@ -3050,10 +3072,11 @@ namespace dftfe if (dftParams.singlePrecLRD) { operatorMatrix.XtHXMixedPrec( - X, N, N, processGrid, projHamPrimePar, true); + X, N, N, numberLocalDofs, processGrid, projHamPrimePar, true); } else - operatorMatrix.XtHX(X, N, processGrid, projHamPrimePar, true); + operatorMatrix.XtHX( + X, N, numberLocalDofs, processGrid, projHamPrimePar, true); computing_timer.leave_subsection("Compute ProjHamPrime, DMFOR step"); @@ -3165,8 +3188,8 @@ namespace dftfe { if (std::is_same>::value) internal::subspaceRotationMixedPrec>( - &X[0], - X.size(), + X, + numberLocalDofs * N, N, processGrid, interBandGroupComm, @@ -3177,8 +3200,8 @@ namespace dftfe false); else internal::subspaceRotationMixedPrec( - &X[0], - X.size(), + X, + numberLocalDofs * N, N, processGrid, interBandGroupComm, @@ -3190,8 +3213,8 @@ namespace dftfe } else { - internal::subspaceRotation(&X[0], - X.size(), + internal::subspaceRotation(X, + numberLocalDofs * 
N, N, processGrid, interBandGroupComm, @@ -3237,14 +3260,16 @@ namespace dftfe template void - gramSchmidtOrthogonalization(std::vector &, + gramSchmidtOrthogonalization(dataTypes::number *, const unsigned int, + const unsigned int localVectorSize, const MPI_Comm &); template unsigned int pseudoGramSchmidtOrthogonalization(elpaScalaManager &elpaScala, - std::vector &, + dataTypes::number *, const unsigned int, + const unsigned int localVectorSize, const MPI_Comm &, const MPI_Comm &, const MPI_Comm & mpiComm, @@ -3254,8 +3279,9 @@ namespace dftfe template void rayleighRitz(operatorDFTClass &operatorMatrix, elpaScalaManager &elpaScala, - std::vector &, + dataTypes::number *, const unsigned int numberWaveFunctions, + const unsigned int localVectorSize, const MPI_Comm &, const MPI_Comm &, const MPI_Comm &, @@ -3266,8 +3292,9 @@ namespace dftfe template void rayleighRitzGEP(operatorDFTClass &operatorMatrix, elpaScalaManager &elpaScala, - std::vector &, + dataTypes::number *, const unsigned int numberWaveFunctions, + const unsigned int localVectorSize, const MPI_Comm &, const MPI_Comm &, const MPI_Comm &, @@ -3279,9 +3306,10 @@ namespace dftfe template void rayleighRitzSpectrumSplitDirect(operatorDFTClass &operatorMatrix, elpaScalaManager &elpaScala, - const std::vector &, - std::vector &, + const dataTypes::number *, + dataTypes::number *, const unsigned int numberWaveFunctions, + const unsigned int localVectorSize, const unsigned int numberCoreStates, const MPI_Comm &, const MPI_Comm &, @@ -3291,11 +3319,12 @@ namespace dftfe const dftParameters &dftParams); template void - rayleighRitzGEPSpectrumSplitDirect(operatorDFTClass &operatorMatrix, - elpaScalaManager &elpaScala, - std::vector &X, - std::vector &Y, + rayleighRitzGEPSpectrumSplitDirect(operatorDFTClass & operatorMatrix, + elpaScalaManager & elpaScala, + dataTypes::number * X, + dataTypes::number * Y, const unsigned int numberWaveFunctions, + const unsigned int localVectorSize, const unsigned int 
numberCoreStates, const MPI_Comm & mpiCommParent, const MPI_Comm & interBandGroupComm, @@ -3305,28 +3334,31 @@ namespace dftfe const dftParameters &dftParams); template void - computeEigenResidualNorm(operatorDFTClass & operatorMatrix, - std::vector &X, - const std::vector & eigenValues, - const MPI_Comm & mpiCommParent, - const MPI_Comm & mpiCommDomain, - const MPI_Comm & interBandGroupComm, - std::vector & residualNorm, - const dftParameters & dftParams); + computeEigenResidualNorm(operatorDFTClass & operatorMatrix, + dataTypes::number * X, + const std::vector &eigenValues, + const unsigned int totalNumberVectors, + const unsigned int localVectorSize, + const MPI_Comm & mpiCommParent, + const MPI_Comm & mpiCommDomain, + const MPI_Comm & interBandGroupComm, + std::vector & residualNorm, + const dftParameters & dftParams); template void densityMatrixEigenBasisFirstOrderResponse( - operatorDFTClass & operatorMatrix, - std::vector &X, - const unsigned int N, - const MPI_Comm & mpiCommParent, - const MPI_Comm & mpiCommDomain, - const MPI_Comm & interBandGroupComm, - const std::vector & eigenValues, - const double fermiEnergy, - std::vector & densityMatDerFermiEnergy, - elpaScalaManager & elpaScala, - const dftParameters & dftParams); + operatorDFTClass & operatorMatrix, + dataTypes::number * X, + const unsigned int N, + const unsigned int numberLocalDofs, + const MPI_Comm & mpiCommParent, + const MPI_Comm & mpiCommDomain, + const MPI_Comm & interBandGroupComm, + const std::vector &eigenValues, + const double fermiEnergy, + std::vector & densityMatDerFermiEnergy, + elpaScalaManager & elpaScala, + const dftParameters & dftParams); } // namespace linearAlgebraOperations diff --git a/src/linAlg/pseudoGS.cc b/src/linAlg/pseudoGS.cc index 1703e083c..8eb1898a4 100644 --- a/src/linAlg/pseudoGS.cc +++ b/src/linAlg/pseudoGS.cc @@ -29,8 +29,9 @@ namespace dftfe template unsigned int pseudoGramSchmidtOrthogonalization(elpaScalaManager & elpaScala, - std::vector & X, + T * X, 
const unsigned int numberVectors, + const unsigned int numLocalDofs, const MPI_Comm & mpiCommParent, const MPI_Comm & interBandGroupComm, const MPI_Comm & mpiComm, @@ -38,8 +39,6 @@ namespace dftfe const dftParameters &dftParams) { - const unsigned int numLocalDofs = X.size() / numberVectors; - dealii::ConditionalOStream pcout( std::cout, (dealii::Utilities::MPI::this_mpi_process(mpiCommParent) == 0)); @@ -74,8 +73,8 @@ namespace dftfe if (!(dftParams.useMixedPrecCGS_O && useMixedPrec)) { computing_timer.enter_subsection("Fill overlap matrix CGS"); - internal::fillParallelOverlapMatrix(&X[0], - X.size(), + internal::fillParallelOverlapMatrix(X, + numberVectors * numLocalDofs, numberVectors, processGrid, interBandGroupComm, @@ -91,8 +90,8 @@ namespace dftfe if (std::is_same>::value) internal::fillParallelOverlapMatrixMixedPrec>( - &X[0], - X.size(), + X, + numberVectors * numLocalDofs, numberVectors, processGrid, interBandGroupComm, @@ -101,8 +100,8 @@ namespace dftfe dftParams); else internal::fillParallelOverlapMatrixMixedPrec( - &X[0], - X.size(), + X, + numberVectors * numLocalDofs, numberVectors, processGrid, interBandGroupComm, @@ -218,8 +217,8 @@ namespace dftfe if (!(dftParams.useMixedPrecCGS_SR && useMixedPrec)) { computing_timer.enter_subsection("Subspace rotation CGS"); - internal::subspaceRotation(&X[0], - X.size(), + internal::subspaceRotation(X, + numberVectors * numLocalDofs, numberVectors, processGrid, interBandGroupComm, @@ -235,8 +234,8 @@ namespace dftfe computing_timer.enter_subsection("Subspace rotation mixed prec CGS"); if (std::is_same>::value) internal::subspaceRotationCGSMixedPrec>( - &X[0], - X.size(), + X, + numberVectors * numLocalDofs, numberVectors, processGrid, interBandGroupComm, @@ -245,8 +244,9 @@ namespace dftfe dftParams, false); else - internal::subspaceRotationCGSMixedPrec(&X[0], - X.size(), + internal::subspaceRotationCGSMixedPrec(X, + numberVectors * + numLocalDofs, numberVectors, processGrid, interBandGroupComm, diff --git 
a/src/solvers/eigenSolvers/chebyshevOrthogonalizedSubspaceIterationSolver.cc b/src/solvers/eigenSolvers/chebyshevOrthogonalizedSubspaceIterationSolver.cc index 11ac14c22..36ce39a4f 100644 --- a/src/solvers/eigenSolvers/chebyshevOrthogonalizedSubspaceIterationSolver.cc +++ b/src/solvers/eigenSolvers/chebyshevOrthogonalizedSubspaceIterationSolver.cc @@ -114,17 +114,18 @@ namespace dftfe // void chebyshevOrthogonalizedSubspaceIterationSolver::solve( - operatorDFTClass & operatorMatrix, - elpaScalaManager & elpaScala, - std::vector &eigenVectorsFlattened, - std::vector &eigenVectorsRotFracDensityFlattened, - const unsigned int totalNumberWaveFunctions, - std::vector & eigenValues, - std::vector & residualNorms, - const MPI_Comm & interBandGroupComm, - const bool computeResidual, - const bool useMixedPrec, - const bool isFirstScf) + operatorDFTClass & operatorMatrix, + elpaScalaManager & elpaScala, + dataTypes::number * eigenVectorsFlattened, + dataTypes::number * eigenVectorsRotFracDensityFlattened, + const unsigned int totalNumberWaveFunctions, + const unsigned int localVectorSize, + std::vector &eigenValues, + std::vector &residualNorms, + const MPI_Comm & interBandGroupComm, + const bool computeResidual, + const bool useMixedPrec, + const bool isFirstScf) { dealii::TimerOutput computingTimerStandard( operatorMatrix.getMPICommunicator(), @@ -185,8 +186,6 @@ namespace dftfe dftUtils::printCurrentMemoryUsage(operatorMatrix.getMPICommunicator(), "Before starting chebyshev filtering"); - const unsigned int localVectorSize = - eigenVectorsFlattened.size() / totalNumberWaveFunctions; // band group parallelization data structures @@ -244,9 +243,9 @@ namespace dftfe computing_timer.enter_subsection( "Copy from full to block flattened array"); for (unsigned int iNode = 0; iNode < localVectorSize; ++iNode) - std::copy(eigenVectorsFlattened.data() + + std::copy(eigenVectorsFlattened + iNode * totalNumberWaveFunctions + jvec, - eigenVectorsFlattened.data() + + 
eigenVectorsFlattened + iNode * totalNumberWaveFunctions + jvec + BVec, eigenVectorsFlattenedArrayBlock.data() + iNode * BVec); computing_timer.leave_subsection( @@ -321,7 +320,7 @@ namespace dftfe std::copy(eigenVectorsFlattenedArrayBlock.data() + iNode * BVec, eigenVectorsFlattenedArrayBlock.data() + (iNode + 1) * BVec, - eigenVectorsFlattened.data() + + eigenVectorsFlattened + iNode * totalNumberWaveFunctions + jvec); computing_timer.leave_subsection( @@ -358,9 +357,9 @@ namespace dftfe std::min(blockSize, totalNumberWaveFunctions * localVectorSize - i); MPI_Allreduce(MPI_IN_PLACE, - &eigenVectorsFlattened[0] + i, + eigenVectorsFlattened + i, currentBlockSize, - dataTypes::mpi_type_id(&eigenVectorsFlattened[0]), + dataTypes::mpi_type_id(eigenVectorsFlattened), MPI_SUM, interBandGroupComm); } @@ -446,6 +445,7 @@ namespace dftfe eigenVectorsFlattened, eigenVectorsRotFracDensityFlattened, totalNumberWaveFunctions, + localVectorSize, totalNumberWaveFunctions - eigenValues.size(), d_mpiCommParent, interBandGroupComm, @@ -461,6 +461,7 @@ namespace dftfe elpaScala, eigenVectorsFlattened, totalNumberWaveFunctions, + localVectorSize, d_mpiCommParent, interBandGroupComm, operatorMatrix.getMPICommunicator(), @@ -477,6 +478,8 @@ namespace dftfe operatorMatrix, eigenVectorsRotFracDensityFlattened, eigenValues, + eigenValues.size(), + localVectorSize, d_mpiCommParent, operatorMatrix.getMPICommunicator(), interBandGroupComm, @@ -489,6 +492,8 @@ namespace dftfe operatorMatrix, eigenVectorsFlattened, eigenValues, + totalNumberWaveFunctions, + localVectorSize, d_mpiCommParent, operatorMatrix.getMPICommunicator(), interBandGroupComm, @@ -503,6 +508,7 @@ namespace dftfe linearAlgebraOperations::gramSchmidtOrthogonalization( eigenVectorsFlattened, totalNumberWaveFunctions, + localVectorSize, operatorMatrix.getMPICommunicator()); computing_timer.leave_subsection("Gram-Schmidt Orthogn Opt"); @@ -519,6 +525,7 @@ namespace dftfe eigenVectorsFlattened, 
eigenVectorsRotFracDensityFlattened, totalNumberWaveFunctions, + localVectorSize, totalNumberWaveFunctions - eigenValues.size(), d_mpiCommParent, interBandGroupComm, @@ -534,6 +541,7 @@ namespace dftfe elpaScala, eigenVectorsFlattened, totalNumberWaveFunctions, + localVectorSize, d_mpiCommParent, interBandGroupComm, operatorMatrix.getMPICommunicator(), @@ -561,6 +569,8 @@ namespace dftfe operatorMatrix, eigenVectorsRotFracDensityFlattened, eigenValues, + eigenValues.size(), + localVectorSize, d_mpiCommParent, operatorMatrix.getMPICommunicator(), interBandGroupComm, @@ -572,6 +582,8 @@ namespace dftfe operatorMatrix, eigenVectorsFlattened, eigenValues, + totalNumberWaveFunctions, + localVectorSize, d_mpiCommParent, operatorMatrix.getMPICommunicator(), interBandGroupComm, diff --git a/src/symmetry/symmetrizeRho.cc b/src/symmetry/symmetrizeRho.cc index 38115156a..401c55eb7 100644 --- a/src/symmetry/symmetrizeRho.cc +++ b/src/symmetry/symmetrizeRho.cc @@ -265,7 +265,7 @@ namespace dftfe dftPtr->d_kPointWeights.size()); const unsigned int localVectorSize = - dftPtr->d_eigenVectorsFlattenedSTL[0].size() / dftPtr->d_numEigenValues; + dftPtr->matrix_free_data.get_vector_partitioner()->locally_owned_size(); distributedCPUVec eigenVectorsFlattenedArrayFullBlock; vectorTools::createDealiiVector( @@ -292,8 +292,9 @@ namespace dftfe ++iWave) eigenVectorsFlattenedArrayFullBlock.local_element( iNode * dftPtr->d_numEigenValues + iWave) = - dftPtr->d_eigenVectorsFlattenedSTL - [kPoint][iNode * dftPtr->d_numEigenValues + iWave]; + dftPtr->d_eigenVectorsFlattenedHost + [kPoint * localVectorSize * dftPtr->d_numEigenValues + + iNode * dftPtr->d_numEigenValues + iWave]; dftPtr->constraintsNoneDataInfo.distribute( eigenVectorsFlattenedArrayFullBlock, dftPtr->d_numEigenValues); diff --git a/utils/DeviceKernelsGeneric.cc b/utils/DeviceKernelsGeneric.cc index 54c517e28..56ecd913d 100644 --- a/utils/DeviceKernelsGeneric.cc +++ b/utils/DeviceKernelsGeneric.cc @@ -208,6 +208,37 @@ namespace 
dftfe } } + template + __global__ void + stridedCopyConstantStrideDeviceKernel(const dftfe::size_type blockSize, + const dftfe::size_type strideTo, + const dftfe::size_type strideFrom, + const dftfe::size_type numBlocks, + const dftfe::size_type startingToId, + const dftfe::size_type startingFromId, + const ValueType1 * copyFromVec, + ValueType2 * copyToVec) + { + { + const dftfe::size_type globalThreadId = + blockIdx.x * blockDim.x + threadIdx.x; + const dftfe::size_type numberEntries = numBlocks * blockSize; + + for (dftfe::size_type index = globalThreadId; index < numberEntries; + index += blockDim.x * gridDim.x) + { + dftfe::size_type blockIndex = index / blockSize; + dftfe::size_type intraBlockIndex = index - blockIndex * blockSize; + dftfe::utils::copyValue( + copyToVec + blockIndex * strideTo + startingToId + + intraBlockIndex, + copyFromVec[blockIndex * strideFrom + startingFromId + + intraBlockIndex]); + } + } + } + + // x=a*x, with inc=1 template __global__ void @@ -586,6 +617,47 @@ namespace dftfe #endif } + template + void + stridedCopyConstantStride(const dftfe::size_type blockSize, + const dftfe::size_type strideTo, + const dftfe::size_type strideFrom, + const dftfe::size_type numBlocks, + const dftfe::size_type startingToId, + const dftfe::size_type startingFromId, + const ValueType1 * copyFromVec, + ValueType2 * copyToVec) + { +#ifdef DFTFE_WITH_DEVICE_LANG_CUDA + stridedCopyConstantStrideDeviceKernel<<< + (blockSize * numBlocks) / dftfe::utils::DEVICE_BLOCK_SIZE + 1, + dftfe::utils::DEVICE_BLOCK_SIZE>>>( + blockSize, + strideTo, + strideFrom, + numBlocks, + startingToId, + startingFromId, + dftfe::utils::makeDataTypeDeviceCompatible(copyFromVec), + dftfe::utils::makeDataTypeDeviceCompatible(copyToVec)); +#elif DFTFE_WITH_DEVICE_LANG_HIP + hipLaunchKernelGGL( + stridedCopyConstantStrideDeviceKernel, + (blockSize * numBlocks) / dftfe::utils::DEVICE_BLOCK_SIZE + 1, + dftfe::utils::DEVICE_BLOCK_SIZE, + 0, + 0, + blockSize, + strideTo, + strideFrom, + 
numBlocks, + startingToId, + startingFromId, + dftfe::utils::makeDataTypeDeviceCompatible(copyFromVec), + dftfe::utils::makeDataTypeDeviceCompatible(copyToVec)); +#endif + } + template void @@ -1312,6 +1384,87 @@ namespace dftfe const dftfe::size_type startingId, const std::complex *copyFromVec, std::complex * copyToVec); + // strided copy constant stride + template void + stridedCopyConstantStride(const dftfe::size_type blockSize, + const dftfe::size_type strideTo, + const dftfe::size_type strideFrom, + const dftfe::size_type numBlocks, + const dftfe::size_type startingToId, + const dftfe::size_type startingFromId, + const double * copyFromVec, + double * copyToVec); + + template void + stridedCopyConstantStride(const dftfe::size_type blockSize, + const dftfe::size_type strideTo, + const dftfe::size_type strideFrom, + const dftfe::size_type numBlocks, + const dftfe::size_type startingToId, + const dftfe::size_type startingFromId, + const float * copyFromVec, + float * copyToVec); + + template void + stridedCopyConstantStride(const dftfe::size_type blockSize, + const dftfe::size_type strideTo, + const dftfe::size_type strideFrom, + const dftfe::size_type numBlocks, + const dftfe::size_type startingToId, + const dftfe::size_type startingFromId, + const std::complex *copyFromVec, + std::complex * copyToVec); + + template void + stridedCopyConstantStride(const dftfe::size_type blockSize, + const dftfe::size_type strideTo, + const dftfe::size_type strideFrom, + const dftfe::size_type numBlocks, + const dftfe::size_type startingToId, + const dftfe::size_type startingFromId, + const std::complex *copyFromVec, + std::complex * copyToVec); + + + template void + stridedCopyConstantStride(const dftfe::size_type blockSize, + const dftfe::size_type strideTo, + const dftfe::size_type strideFrom, + const dftfe::size_type numBlocks, + const dftfe::size_type startingToId, + const dftfe::size_type startingFromId, + const double * copyFromVec, + float * copyToVec); + + template void 
+ stridedCopyConstantStride(const dftfe::size_type blockSize, + const dftfe::size_type strideTo, + const dftfe::size_type strideFrom, + const dftfe::size_type numBlocks, + const dftfe::size_type startingToId, + const dftfe::size_type startingFromId, + const float * copyFromVec, + double * copyToVec); + + template void + stridedCopyConstantStride(const dftfe::size_type blockSize, + const dftfe::size_type strideTo, + const dftfe::size_type strideFrom, + const dftfe::size_type numBlocks, + const dftfe::size_type startingToId, + const dftfe::size_type startingFromId, + const std::complex *copyFromVec, + std::complex * copyToVec); + + template void + stridedCopyConstantStride(const dftfe::size_type blockSize, + const dftfe::size_type strideTo, + const dftfe::size_type strideFrom, + const dftfe::size_type numBlocks, + const dftfe::size_type startingToId, + const dftfe::size_type startingFromId, + const std::complex *copyFromVec, + std::complex * copyToVec); // stridedBlockScale template void diff --git a/utils/FEBasisOperations.cc b/utils/FEBasisOperations.cc index 121cf80ed..5ff644f48 100644 --- a/utils/FEBasisOperations.cc +++ b/utils/FEBasisOperations.cc @@ -23,10 +23,12 @@ namespace dftfe template - FEBasisOperations:: - FEBasisOperations( + FEBasisOperationsBase:: + FEBasisOperationsBase( dealii::MatrixFree<3, ValueTypeBasisData> &matrixFreeData, - std::vector *> + std::vector *> &constraintsVector) { d_matrixFreeDataPtr = &matrixFreeData; @@ -45,17 +47,30 @@ namespace dftfe (d_matrixFreeDataPtr->get_mapping_info().get_cell_type( iMacroCell) <= dealii::internal::MatrixFreeFunctions::affine); } + areAllCellsCartesian = true; + for (unsigned int iMacroCell = 0; + iMacroCell < d_matrixFreeDataPtr->n_cell_batches(); + ++iMacroCell) + { + areAllCellsCartesian = + areAllCellsCartesian && + (d_matrixFreeDataPtr->get_mapping_info().get_cell_type( + iMacroCell) == dealii::internal::MatrixFreeFunctions::cartesian); + } + // std::cout << "DEBUG cart " << areAllCellsCartesian 
<< " " + // << areAllCellsAffine << std::endl; } template void - FEBasisOperations:: - reinit(const unsigned int &blockSize, - const unsigned int &dofHandlerID, - const unsigned int &quadratureID, - const UpdateFlags updateFlags) + FEBasisOperationsBase::reinit(const unsigned int &blockSize, + const unsigned int &dofHandlerID, + const unsigned int &quadratureID, + const UpdateFlags updateFlags) { if ((d_dofHandlerID != dofHandlerID) || (d_updateFlags != updateFlags)) { @@ -66,6 +81,7 @@ namespace dftfe initializeIndexMaps(); initializeConstraints(); initializeShapeFunctionAndJacobianData(); + initializeFlattenedIndexMaps(); } else if ((d_quadratureID != quadratureID) && (d_nVectors != blockSize)) { @@ -73,6 +89,7 @@ namespace dftfe d_nVectors = blockSize; initializeConstraints(); initializeShapeFunctionAndJacobianData(); + initializeFlattenedIndexMaps(); } else if (d_quadratureID != quadratureID) { @@ -83,41 +100,85 @@ namespace dftfe { d_nVectors = blockSize; initializeConstraints(); + initializeFlattenedIndexMaps(); } } + template + void + FEBasisOperationsBase::initializeFlattenedIndexMaps() + { +#if defined(DFTFE_WITH_DEVICE) + dftfe::utils::MemoryStorage + d_flattenedCellDofIndexToProcessDofIndexMapHost; + dftfe::utils::MemoryStorage + d_nonAffineReshapeIDsHost; + if ((memorySpace == dftfe::utils::MemorySpace::DEVICE) && + (!areAllCellsAffine)) + { + d_nonAffineReshapeIDsHost.resize(d_nCells * d_nQuadsPerCell * 3); + for (unsigned int iCell = 0; iCell < d_nCells; ++iCell) + { + for (unsigned int iQuad = 0; iQuad < d_nQuadsPerCell; ++iQuad) + { + for (unsigned int iDim = 0; iDim < 3; ++iDim) + { + d_nonAffineReshapeIDsHost[iQuad + d_nQuadsPerCell * iDim + + d_nQuadsPerCell * 3 * iCell] = + (iDim + 3 * iQuad + d_nQuadsPerCell * 3 * iCell) * + d_nVectors; + } + } + } + } + d_nonAffineReshapeIDs.resize(d_nonAffineReshapeIDsHost.size()); + d_nonAffineReshapeIDs.copyFrom(d_nonAffineReshapeIDsHost); +#else + auto &d_flattenedCellDofIndexToProcessDofIndexMapHost = + 
d_flattenedCellDofIndexToProcessDofIndexMap; +#endif + d_flattenedCellDofIndexToProcessDofIndexMapHost.clear(); + d_flattenedCellDofIndexToProcessDofIndexMapHost.resize(d_nCells * + d_nDofsPerCell); + + std::transform(d_cellDofIndexToProcessDofIndexMap.begin(), + d_cellDofIndexToProcessDofIndexMap.end(), + d_flattenedCellDofIndexToProcessDofIndexMapHost.begin(), + [&a = this->d_nVectors](auto &c) { return c * a; }); +#if defined(DFTFE_WITH_DEVICE) + d_flattenedCellDofIndexToProcessDofIndexMap.resize( + d_flattenedCellDofIndexToProcessDofIndexMapHost.size()); + d_flattenedCellDofIndexToProcessDofIndexMap.copyFrom( + d_flattenedCellDofIndexToProcessDofIndexMapHost); +#endif + } + template void - FEBasisOperations:: - initializeIndexMaps() + FEBasisOperationsBase::initializeIndexMaps() { - d_nMacroCells = d_matrixFreeDataPtr->n_cell_batches(); - d_nCells = d_matrixFreeDataPtr->n_physical_cells(); - d_nDofsPerCell = d_matrixFreeDataPtr->get_dof_handler(d_dofHandlerID) - .get_fe() - .dofs_per_cell; + d_nCells = d_matrixFreeDataPtr->n_physical_cells(); + d_nDofsPerCell = + d_matrixFreeDataPtr->get_dof_handler(0).get_fe().dofs_per_cell; d_cellDofIndexToProcessDofIndexMap.clear(); d_cellDofIndexToProcessDofIndexMap.resize(d_nCells * d_nDofsPerCell); d_cellIndexToCellIdMap.clear(); d_cellIndexToCellIdMap.resize(d_nCells); - if (d_updateFlags & update_macrocell_map) - { - d_cellIndexToMacroCellSubCellIndexMap.clear(); - d_cellIndexToMacroCellSubCellIndexMap.resize(d_nCells); - - d_macroCellSubCellDofIndexToProcessDofIndexMap.clear(); - d_macroCellSubCellDofIndexToProcessDofIndexMap.resize(d_nCells * - d_nDofsPerCell); - } - - auto cellPtr = - d_matrixFreeDataPtr->get_dof_handler(d_dofHandlerID).begin_active(); - auto endcPtr = d_matrixFreeDataPtr->get_dof_handler(d_dofHandlerID).end(); + auto cellPtr = d_matrixFreeDataPtr->get_dof_handler(0).begin_active(); + auto endcPtr = d_matrixFreeDataPtr->get_dof_handler(0).end(); std::vector cellDofIndicesGlobal(d_nDofsPerCell); 
std::map cellIdToCellIndexMap; @@ -130,58 +191,31 @@ namespace dftfe for (unsigned int iDof = 0; iDof < d_nDofsPerCell; ++iDof) d_cellDofIndexToProcessDofIndexMap[iCell * d_nDofsPerCell + iDof] = - d_matrixFreeDataPtr->get_vector_partitioner(d_dofHandlerID) - ->global_to_local(cellDofIndicesGlobal[iDof]); + d_matrixFreeDataPtr->get_vector_partitioner(0)->global_to_local( + cellDofIndicesGlobal[iDof]); - if (d_updateFlags & update_macrocell_map) - cellIdToCellIndexMap[cellPtr->id()] = iCell; d_cellIndexToCellIdMap[iCell] = cellPtr->id(); ++iCell; } - - iCell = 0; - for (unsigned int iMacroCell = 0; iMacroCell < d_nMacroCells; - ++iMacroCell) - { - const unsigned int numberSubCells = - d_matrixFreeDataPtr->n_components_filled(iMacroCell); - for (unsigned int iSubCell = 0; iSubCell < numberSubCells; ++iSubCell) - { - cellPtr = d_matrixFreeDataPtr->get_cell_iterator(iMacroCell, - iSubCell, - d_dofHandlerID); - size_type cellIndex = cellIdToCellIndexMap[cellPtr->id()]; - d_cellIndexToMacroCellSubCellIndexMap[cellIndex] = iCell; - std::copy(d_cellDofIndexToProcessDofIndexMap.begin() + - cellIndex * d_nDofsPerCell, - d_cellDofIndexToProcessDofIndexMap.begin() + - (cellIndex + 1) * d_nDofsPerCell, - d_macroCellSubCellDofIndexToProcessDofIndexMap.begin() + - iCell * d_nDofsPerCell); - ++iCell; - } - } } - template void - FEBasisOperations:: - initializeConstraints() + FEBasisOperationsBase::initializeConstraints() { d_constraintInfo.initialize(d_matrixFreeDataPtr->get_vector_partitioner( - d_dofHandlerID), - *((*d_constraintsVector)[d_dofHandlerID])); + 0), + *((*d_constraintsVector)[0])); d_constraintInfo.precomputeMaps( - d_matrixFreeDataPtr->get_vector_partitioner(d_dofHandlerID) - ->locally_owned_size() + - d_matrixFreeDataPtr->get_vector_partitioner(d_dofHandlerID) - ->n_ghost_indices(), + d_matrixFreeDataPtr->get_vector_partitioner(0)->locally_owned_size() + + d_matrixFreeDataPtr->get_vector_partitioner(0)->n_ghost_indices(), d_nVectors); } @@ -189,13 +223,14 @@ 
namespace dftfe typename ValueTypeBasisData, dftfe::utils::MemorySpace memorySpace> void - FEBasisOperations:: - initializeShapeFunctionAndJacobianData() + FEBasisOperationsBase::initializeShapeFunctionAndJacobianData() { const dealii::Quadrature<3> &quadrature = d_matrixFreeDataPtr->get_quadrature(d_quadratureID); dealii::FEValues<3> fe_values( - d_matrixFreeDataPtr->get_dof_handler(d_dofHandlerID).get_fe(), + d_matrixFreeDataPtr->get_dof_handler(0).get_fe(), quadrature, dealii::update_values | dealii::update_gradients | dealii::update_jacobians | dealii::update_JxW_values | @@ -204,27 +239,18 @@ namespace dftfe d_nQuadsPerCell = quadrature.size(); #if defined(DFTFE_WITH_DEVICE) - std::map> + dftfe::utils::MemoryStorage d_inverseJacobianDataHost; - std::map> + dftfe::utils::MemoryStorage d_JxWDataHost; - dftfe::utils::MemoryStorage d_shapeFunctionDataHost; - dftfe::utils::MemoryStorage d_shapeFunctionGradientDataHost; - if (memorySpace == dftfe::utils::MemorySpace::HOST) - { - &d_inverseJacobianDataHost = d_inverseJacobianData; - &d_JxWDataHost = d_JxWData; - &d_shapeFunctionDataHost = d_shapeFunctionData; - &d_shapeFunctionGradientDataHost = d_shapeFunctionGradientData; - } #else auto &d_inverseJacobianDataHost = d_inverseJacobianData; auto &d_JxWDataHost = d_JxWData; @@ -248,14 +274,16 @@ namespace dftfe d_inverseJacobianDataHost.clear(); if (d_updateFlags & update_gradients) - d_inverseJacobianDataHost.resize( - areAllCellsAffine ? d_nCells * 9 : d_nCells * 9 * d_nQuadsPerCell); + d_inverseJacobianDataHost.resize(areAllCellsCartesian ? + d_nCells * 3 : + (areAllCellsAffine ? + d_nCells * 9 : + d_nCells * 9 * d_nQuadsPerCell)); const unsigned int nJacobiansPerCell = areAllCellsAffine ? 
1 : d_nQuadsPerCell; - auto cellPtr = - d_matrixFreeDataPtr->get_dof_handler(d_dofHandlerID).begin_active(); - auto endcPtr = d_matrixFreeDataPtr->get_dof_handler(d_dofHandlerID).end(); + auto cellPtr = d_matrixFreeDataPtr->get_dof_handler(0).begin_active(); + auto endcPtr = d_matrixFreeDataPtr->get_dof_handler(0).end(); unsigned int iCell = 0; for (; cellPtr != endcPtr; ++cellPtr) @@ -268,229 +296,103 @@ namespace dftfe { if (d_updateFlags & update_values) for (unsigned int iNode = 0; iNode < d_nDofsPerCell; ++iNode) - for (unsigned int q_point = 0; q_point < d_nQuadsPerCell; - ++q_point) - d_shapeFunctionDataHost[q_point * d_nDofsPerCell + - iNode] = - fe_values.shape_value(iNode, q_point); + for (unsigned int iQuad = 0; iQuad < d_nQuadsPerCell; + ++iQuad) + d_shapeFunctionDataHost[iQuad * d_nDofsPerCell + iNode] = + fe_values.shape_value(iNode, iQuad); if (d_updateFlags & update_gradients) - for (unsigned int q_point = 0; q_point < d_nQuadsPerCell; - ++q_point) + for (unsigned int iQuad = 0; iQuad < d_nQuadsPerCell; ++iQuad) for (unsigned int iNode = 0; iNode < d_nDofsPerCell; ++iNode) { const auto &shape_grad_real = - fe_values.shape_grad(iNode, q_point); + fe_values.shape_grad(iNode, iQuad); const auto &shape_grad_reference = - apply_transformation(jacobians[q_point].transpose(), + apply_transformation(jacobians[iQuad].transpose(), shape_grad_real); for (unsigned int iDim = 0; iDim < 3; ++iDim) - d_shapeFunctionGradientDataHost - [d_nQuadsPerCell * d_nDofsPerCell * iDim + - d_nDofsPerCell * q_point + iNode] = - shape_grad_reference[iDim]; + if (areAllCellsAffine) + d_shapeFunctionGradientDataHost + [d_nQuadsPerCell * d_nDofsPerCell * iDim + + d_nDofsPerCell * iQuad + iNode] = + shape_grad_reference[iDim]; + else + d_shapeFunctionGradientDataHost + [iQuad * d_nDofsPerCell * 3 + + d_nDofsPerCell * iDim + iNode] = + shape_grad_reference[iDim]; } } - for (unsigned int q_point = 0; q_point < d_nQuadsPerCell; ++q_point) - d_JxWDataHost[iCell * d_nQuadsPerCell + 
q_point] = - fe_values.JxW(q_point); - for (unsigned int q_point = 0; q_point < nJacobiansPerCell; - ++q_point) + for (unsigned int iQuad = 0; iQuad < d_nQuadsPerCell; ++iQuad) + d_JxWDataHost[iCell * d_nQuadsPerCell + iQuad] = + fe_values.JxW(iQuad); + for (unsigned int iQuad = 0; iQuad < nJacobiansPerCell; ++iQuad) for (unsigned int iDim = 0; iDim < 3; ++iDim) - for (unsigned int jDim = 0; jDim < 3; ++jDim) - d_inverseJacobianDataHost[iCell * nJacobiansPerCell * 9 + - q_point * 9 + jDim * 3 + iDim] = - inverseJacobians[q_point][jDim][iDim]; + if (areAllCellsCartesian) + d_inverseJacobianDataHost[iCell * nJacobiansPerCell * 3 + + iDim * nJacobiansPerCell + iQuad] = + inverseJacobians[iQuad][iDim][iDim]; + else + for (unsigned int jDim = 0; jDim < 3; ++jDim) + d_inverseJacobianDataHost[iCell * nJacobiansPerCell * 9 + + 9 * iQuad + jDim * 3 + iDim] = + inverseJacobians[iQuad][iDim][jDim]; + ++iCell; } #if defined(DFTFE_WITH_DEVICE) - if (memorySpace == dftfe::utils::MemorySpace::DEVICE) - { - d_inverseJacobianData.resize(d_inverseJacobianDataHost.size()); - d_inverseJacobianData.copyFrom(d_inverseJacobianDataHost); - d_JxWData.resize(d_JxWDataHost.size()); - d_JxWData.copyFrom(d_JxWDataHost); - d_shapeFunctionData.resize(d_shapeFunctionDataHost.size()); - d_shapeFunctionData.copyFrom(d_shapeFunctionDataHost); - d_shapeFunctionGradientData.resize( - d_shapeFunctionGradientDataHost.size()); - d_shapeFunctionGradientData.copyFrom(d_shapeFunctionGradientDataHost); - } + d_inverseJacobianData.resize(d_inverseJacobianDataHost.size()); + d_inverseJacobianData.copyFrom(d_inverseJacobianDataHost); + d_JxWData.resize(d_JxWDataHost.size()); + d_JxWData.copyFrom(d_JxWDataHost); + d_shapeFunctionData.resize(d_shapeFunctionDataHost.size()); + d_shapeFunctionData.copyFrom(d_shapeFunctionDataHost); + d_shapeFunctionGradientData.resize( + d_shapeFunctionGradientDataHost.size()); + d_shapeFunctionGradientData.copyFrom(d_shapeFunctionGradientDataHost); #endif } - template void - 
FEBasisOperations:: - interpolate( + FEBasisOperationsBase:: + createMultiVector( + const unsigned int dofHandlerIndex, + const unsigned int blocksize, dftfe::linearAlgebra::MultiVector - &nodalData, - std::map> - *quadratureValues, - std::map> - * quadratureGradients, - bool useMacroCellSubCellOrdering) const + &multiVector) const { - if (memorySpace == dftfe::utils::MemorySpace::HOST) - { - for (unsigned int iCell = 0; iCell < d_nCells; ++iCell) - { - dealii::CellId currentCellId = d_cellIndexToCellIdMap[iCell]; - auto &cellQuadratureData = (*quadratureValues)[currentCellId]; - cellQuadratureData.resize(d_nQuadsPerCell * d_nVectors); - - auto &cellQuadratureGradientData = - (quadratureGradients != NULL) ? - (*quadratureGradients)[currentCellId] : - NULL; - if (quadratureGradients != NULL) - cellQuadratureGradientData.resize(d_nQuadsPerCell * d_nVectors * - 3); - interpolateHostKernel( - nodalData, - &cellQuadratureData, - &cellQuadratureGradientData, - std::pair(iCell, iCell + 1), - useMacroCellSubCellOrdering); - } - } + dftfe::linearAlgebra::createMultiVectorFromDealiiPartitioner( + d_matrixFreeDataPtr->get_vector_partitioner(dofHandlerIndex), + blocksize, + multiVector); } template void - FEBasisOperations:: - interpolate(dftfe::linearAlgebra::MultiVector &nodalData, - dftfe::utils::MemoryStorage - *quadratureValues, - dftfe::utils::MemoryStorage - * quadratureGradients, - bool useMacroCellSubCellOrdering) const - { - if (memorySpace == dftfe::utils::MemorySpace::HOST) - { - interpolateHostKernel(nodalData, - quadratureValues, - quadratureGradients, - std::pair(0, - d_nCells), - useMacroCellSubCellOrdering); - } - } - - template - void - FEBasisOperations:: - integrateWithBasis( - const std::map< - dealii::CellId, - dftfe::utils::MemoryStorage> - &quadratureValues, - std::map> - *quadratureGradients, + FEBasisOperationsBase:: + distribute( dftfe::linearAlgebra::MultiVector - & nodalData, - bool useMacroCellSubCellOrdering) const + &multiVector) const { - if 
(memorySpace == dftfe::utils::MemorySpace::HOST) - { - for (unsigned int iCell = 0; iCell < d_nCells; ++iCell) - { - dealii::CellId currentCellId = d_cellIndexToCellIdMap[iCell]; - auto &cellQuadratureData = (*quadratureValues)[currentCellId]; - - auto &cellQuadratureGradientData = - (quadratureGradients != NULL) ? - (*quadratureGradients)[currentCellId] : - NULL; - integrateWithBasisHostKernel( - &cellQuadratureData, - &cellQuadratureGradientData, - nodalData, - std::pair(iCell, iCell + 1), - useMacroCellSubCellOrdering); - } - } + d_constraintInfo.distribute(multiVector, d_nVectors); } - template - void - FEBasisOperations:: - integrateWithBasis( - dftfe::utils::MemoryStorage - *quadratureValues, - dftfe::utils::MemoryStorage - *quadratureGradients, - dftfe::linearAlgebra::MultiVector - & nodalData, - bool useMacroCellSubCellOrdering) const - { - if (memorySpace == dftfe::utils::MemorySpace::HOST) - { - integrateWithBasisHostKernel( - quadratureValues, - quadratureGradients, - nodalData, - std::pair(0, d_nCells), - useMacroCellSubCellOrdering); - } - } - - - template - void - FEBasisOperations:: - extractToCellNodalData( - dftfe::linearAlgebra::MultiVector - &nodalData, - dftfe::utils::MemoryStorage - * cellNodalDataPtr, - bool useMacroCellSubCellOrdering) const - { - extractToCellNodalDataHostKernel( - nodalData, - cellNodalDataPtr, - std::pair(0, d_nCells), - useMacroCellSubCellOrdering); - } - - template - void - FEBasisOperations:: - accumulateFromCellNodalData( - const dftfe::utils::MemoryStorage - *cellNodalDataPtr, - dftfe::linearAlgebra::MultiVector - & nodalData, - bool useMacroCellSubCellOrdering) const - { - accumulateFromCellNodalDataHostKernel( - cellNodalDataPtr, - nodalData, - std::pair(0, d_nCells), - useMacroCellSubCellOrdering); - } + // template class FEBasisOperations; + // template class FEBasisOperations; } // namespace basis } // namespace dftfe diff --git a/utils/FEBasisOperationsDevice.cc b/utils/FEBasisOperationsDevice.cc new file mode 
100644 index 000000000..e6ec336d3 --- /dev/null +++ b/utils/FEBasisOperationsDevice.cc @@ -0,0 +1,445 @@ +// --------------------------------------------------------------------- +// +// Copyright (c) 2017-2022 The Regents of the University of Michigan and DFT-FE +// authors. +// +// This file is part of the DFT-FE code. +// +// The DFT-FE code is free software; you can use it, redistribute +// it, and/or modify it under the terms of the GNU Lesser General +// Public License as published by the Free Software Foundation; either +// version 2.1 of the License, or (at your option) any later version. +// The full text of the license can be found in the file LICENSE at +// the top level of the DFT-FE distribution. +// +// --------------------------------------------------------------------- +// + +#include +#include +#include +#include +#include + +namespace dftfe +{ + namespace basis + { + template + void + FEBasisOperations:: + interpolate( + dftfe::linearAlgebra::MultiVector + &nodalData, + std::map> + *quadratureValues, + std::map> + *quadratureGradients) const + { + dftfe::utils::MemoryStorage + quadratureValuesAllCells; + dftfe::utils::MemoryStorage + quadratureGradientsAllCells; + quadratureValuesAllCells.resize(d_nCells * d_nQuadsPerCell * d_nVectors); + quadratureGradientsAllCells.resize(d_nCells * 3 * d_nQuadsPerCell * + d_nVectors); + + for (unsigned int iCell = 0; iCell < d_nCells; ++iCell) + { + dealii::CellId currentCellId = d_cellIndexToCellIdMap[iCell]; + quadratureValuesAllCells.copyFrom(quadratureValues->at(currentCellId), + d_nQuadsPerCell * d_nVectors, + 0, + d_nVectors * d_nQuadsPerCell * + iCell); + if (quadratureGradients != NULL) + quadratureGradientsAllCells.copyFrom( + quadratureGradients->at(currentCellId), + d_nQuadsPerCell * d_nVectors * 3, + 0, + d_nVectors * d_nQuadsPerCell * 3 * iCell); + } + interpolateKernel(nodalData, + &quadratureValuesAllCells, + quadratureGradients == NULL ? 
+ NULL : + &quadratureGradientsAllCells, + std::pair(0, d_nCells)); + } + + template + void + FEBasisOperations:: + interpolate( + dftfe::linearAlgebra::MultiVector + &nodalData, + dftfe::utils::MemoryStorage + *quadratureValues, + dftfe::utils::MemoryStorage + *quadratureGradients) const + { + interpolateKernel(nodalData, + quadratureValues, + quadratureGradients, + std::pair(0, d_nCells)); + } + + template + void + FEBasisOperations:: + integrateWithBasis( + std::map> + *quadratureValues, + std::map> + *quadratureGradients, + dftfe::linearAlgebra::MultiVector + &nodalData) const + { + dftfe::utils::MemoryStorage + quadratureValuesAllCells; + dftfe::utils::MemoryStorage + quadratureGradientsAllCells; + quadratureValuesAllCells.resize(d_nCells * d_nQuadsPerCell * d_nVectors); + quadratureGradientsAllCells.resize(d_nCells * 3 * d_nQuadsPerCell * + d_nVectors); + + for (unsigned int iCell = 0; iCell < d_nCells; ++iCell) + { + dealii::CellId currentCellId = d_cellIndexToCellIdMap[iCell]; + quadratureValuesAllCells.copyFrom(quadratureValues->at(currentCellId), + d_nQuadsPerCell * d_nVectors, + 0, + d_nVectors * d_nQuadsPerCell * + iCell); + if (quadratureGradients != NULL) + quadratureGradientsAllCells.copyFrom( + quadratureGradients->at(currentCellId), + d_nQuadsPerCell * d_nVectors * 3, + 0, + d_nVectors * d_nQuadsPerCell * 3 * iCell); + } + integrateWithBasisKernel( + &quadratureValuesAllCells, + quadratureGradients == NULL ? 
NULL : &quadratureGradientsAllCells, + nodalData, + std::pair(0, d_nCells)); + } + + template + void + FEBasisOperations:: + integrateWithBasis( + dftfe::utils::MemoryStorage + *quadratureValues, + dftfe::utils::MemoryStorage + *quadratureGradients, + dftfe::linearAlgebra::MultiVector + &nodalData) const + { + integrateWithBasisKernel(quadratureValues, + quadratureGradients, + nodalData, + std::pair(0, + d_nCells)); + } + + + template + void + FEBasisOperations:: + extractToCellNodalData( + dftfe::linearAlgebra::MultiVector + &nodalData, + dftfe::utils::MemoryStorage + *cellNodalDataPtr) const + { + extractToCellNodalDataKernel( + nodalData, + cellNodalDataPtr, + std::pair(0, d_nCells)); + } + + template + void + FEBasisOperations:: + accumulateFromCellNodalData( + const dftfe::utils::MemoryStorage + *cellNodalDataPtr, + dftfe::linearAlgebra::MultiVector + &nodalData) const + { + accumulateFromCellNodalDataKernel( + cellNodalDataPtr, + nodalData, + std::pair(0, d_nCells)); + } + + + + template + void + FEBasisOperations:: + interpolateKernel( + const dftfe::linearAlgebra::MultiVector< + ValueTypeBasisCoeff, + dftfe::utils::MemorySpace::DEVICE> &nodalValues, + dftfe::utils::MemoryStorage + *quadratureValues, + dftfe::utils::MemoryStorage + * quadratureGradients, + const std::pair cellRange) const + { + dftfe::utils::MemoryStorage + cellNodalData, tempQuadratureGradientsData, + tempQuadratureGradientsDataNonAffine; + cellNodalData.resize(d_nVectors * d_nDofsPerCell * + (cellRange.second - cellRange.first)); + + if (quadratureGradients != NULL) + tempQuadratureGradientsData.resize( + areAllCellsCartesian ? 0 : + (d_nVectors * d_nQuadsPerCell * 3 * + (cellRange.second - cellRange.first))); + + if (quadratureGradients != NULL) + tempQuadratureGradientsDataNonAffine.resize( + areAllCellsAffine ? 
0 : + (d_nVectors * d_nQuadsPerCell * 3 * + (cellRange.second - cellRange.first))); + + extractToCellNodalDataKernel(nodalValues, &cellNodalData, cellRange); + + const ValueTypeBasisCoeff scalarCoeffAlpha = ValueTypeBasisCoeff(1.0), + scalarCoeffBeta = ValueTypeBasisCoeff(0.0); + + dftfe::utils::deviceBlasWrapper::gemmStridedBatched( + *d_deviceBlasHandlePtr, + dftfe::utils::DEVICEBLAS_OP_N, + dftfe::utils::DEVICEBLAS_OP_N, + d_nVectors, + d_nQuadsPerCell, + d_nDofsPerCell, + &scalarCoeffAlpha, + cellNodalData.data(), + d_nVectors, + d_nVectors * d_nDofsPerCell, + d_shapeFunctionData.data(), + d_nDofsPerCell, + 0, + &scalarCoeffBeta, + quadratureValues->data(), + d_nVectors, + d_nVectors * d_nQuadsPerCell, + cellRange.second - cellRange.first); + if (quadratureGradients != NULL) + { + dftfe::utils::deviceBlasWrapper::gemmStridedBatched( + *d_deviceBlasHandlePtr, + dftfe::utils::DEVICEBLAS_OP_N, + dftfe::utils::DEVICEBLAS_OP_N, + d_nVectors, + d_nQuadsPerCell * 3, + d_nDofsPerCell, + &scalarCoeffAlpha, + cellNodalData.data(), + d_nVectors, + d_nVectors * d_nDofsPerCell, + d_shapeFunctionGradientData.data(), + d_nDofsPerCell, + 0, + &scalarCoeffBeta, + areAllCellsCartesian ? 
quadratureGradients->data() : + tempQuadratureGradientsData.data(), + d_nVectors, + d_nVectors * d_nQuadsPerCell * 3, + cellRange.second - cellRange.first); + if (areAllCellsCartesian) + { + dftfe::utils::deviceKernelsGeneric::stridedBlockScale( + d_nQuadsPerCell * d_nVectors, + 3 * (cellRange.second - cellRange.first), + ValueTypeBasisCoeff(1.0), + d_inverseJacobianData.data() + cellRange.first * 3, + quadratureGradients->data()); + } + else if (areAllCellsAffine) + { + dftfe::utils::deviceBlasWrapper::gemmStridedBatched( + *d_deviceBlasHandlePtr, + dftfe::utils::DEVICEBLAS_OP_N, + dftfe::utils::DEVICEBLAS_OP_N, + d_nQuadsPerCell * d_nVectors, + 3, + 3, + &scalarCoeffAlpha, + tempQuadratureGradientsData.data(), + d_nQuadsPerCell * d_nVectors, + d_nQuadsPerCell * d_nVectors * 3, + d_inverseJacobianData.data() + 9 * cellRange.first, + 3, + 9, + &scalarCoeffBeta, + quadratureGradients->data(), + d_nQuadsPerCell * d_nVectors, + d_nVectors * d_nQuadsPerCell * 3, + cellRange.second - cellRange.first); + } + else + { + dftfe::utils::deviceBlasWrapper::gemmStridedBatched( + *d_deviceBlasHandlePtr, + dftfe::utils::DEVICEBLAS_OP_N, + dftfe::utils::DEVICEBLAS_OP_N, + d_nVectors, + 3, + 3, + &scalarCoeffAlpha, + tempQuadratureGradientsData.data(), + d_nVectors, + d_nVectors * 3, + d_inverseJacobianData.data() + + 9 * cellRange.first * d_nQuadsPerCell, + 3, + 9, + &scalarCoeffBeta, + tempQuadratureGradientsDataNonAffine.data(), + d_nVectors, + d_nVectors * 3, + (cellRange.second - cellRange.first) * d_nQuadsPerCell); + dftfe::utils::deviceKernelsGeneric::stridedCopyToBlock( + d_nVectors, + (cellRange.second - cellRange.first) * d_nQuadsPerCell * 3, + tempQuadratureGradientsDataNonAffine.data(), + quadratureGradients->data(), + d_nonAffineReshapeIDs.data() + + cellRange.first * d_nDofsPerCell); + } + } + } + + template + void + FEBasisOperations:: + integrateWithBasisKernel( + const dftfe::utils::MemoryStorage + *quadratureValues, + const dftfe::utils::MemoryStorage + 
*quadratureGradients, + dftfe::linearAlgebra::MultiVector + & nodalData, + const std::pair cellRange) const + {} + + template + void + FEBasisOperations:: + extractToCellNodalDataKernel( + const dftfe::linearAlgebra::MultiVector< + ValueTypeBasisCoeff, + dftfe::utils::MemorySpace::DEVICE> &nodalData, + dftfe::utils::MemoryStorage + * cellNodalDataPtr, + const std::pair cellRange) const + { + dftfe::utils::deviceKernelsGeneric::stridedCopyToBlock( + d_nVectors, + (cellRange.second - cellRange.first) * d_nDofsPerCell, + nodalData.data(), + cellNodalDataPtr->data(), + d_flattenedCellDofIndexToProcessDofIndexMap.data() + + cellRange.first * d_nDofsPerCell); + } + + template + void + FEBasisOperations:: + accumulateFromCellNodalDataKernel( + const dftfe::utils::MemoryStorage + *cellNodalDataPtr, + dftfe::linearAlgebra::MultiVector + & nodalData, + const std::pair cellRange) const + { + dftfe::utils::deviceKernelsGeneric::axpyStridedBlockAtomicAdd( + d_nVectors, + (cellRange.second - cellRange.first) * d_nDofsPerCell, + cellNodalDataPtr->begin(), + nodalData.begin(), + d_flattenedCellDofIndexToProcessDofIndexMap.begin() + + cellRange.first * d_nDofsPerCell); + } + + template + void + FEBasisOperations:: + setDeviceBLASHandle(dftfe::utils::deviceBlasHandle_t *deviceBlasHandlePtr) + { + d_deviceBlasHandlePtr = deviceBlasHandlePtr; + } + + template + dftfe::utils::deviceBlasHandle_t & + FEBasisOperations::getDeviceBLASHandle() + { + return *d_deviceBlasHandlePtr; + } + + } // namespace basis +} // namespace dftfe diff --git a/utils/FEBasisOperationsHost.cc b/utils/FEBasisOperationsHost.cc new file mode 100644 index 000000000..fd91781f4 --- /dev/null +++ b/utils/FEBasisOperationsHost.cc @@ -0,0 +1,535 @@ +// --------------------------------------------------------------------- +// +// Copyright (c) 2017-2022 The Regents of the University of Michigan and DFT-FE +// authors. +// +// This file is part of the DFT-FE code. 
+// +// The DFT-FE code is free software; you can use it, redistribute +// it, and/or modify it under the terms of the GNU Lesser General +// Public License as published by the Free Software Foundation; either +// version 2.1 of the License, or (at your option) any later version. +// The full text of the license can be found in the file LICENSE at +// the top level of the DFT-FE distribution. +// +// --------------------------------------------------------------------- +// + +#include +#include +namespace dftfe +{ + namespace basis + { + template + void + FEBasisOperations:: + interpolate( + dftfe::linearAlgebra::MultiVector + &nodalData, + std::map> + *quadratureValues, + std::map> + *quadratureGradients) const + { + dftfe::utils::MemoryStorage + *quadratureValuesCurrentCell; + dftfe::utils::MemoryStorage + *quadratureGradientsCurrentCell; + + for (unsigned int iCell = 0; iCell < d_nCells; ++iCell) + { + dealii::CellId currentCellId = d_cellIndexToCellIdMap[iCell]; + quadratureValuesCurrentCell = &(quadratureValues->at(currentCellId)); + quadratureGradientsCurrentCell = + quadratureGradients ? 
&(quadratureGradients->at(currentCellId)) : + NULL; + interpolateKernel(nodalData, + quadratureValuesCurrentCell, + quadratureGradientsCurrentCell, + std::pair(iCell, + iCell + 1)); + } + } + + template + void + FEBasisOperations:: + interpolate( + dftfe::linearAlgebra::MultiVector + &nodalData, + dftfe::utils::MemoryStorage + *quadratureValues, + dftfe::utils::MemoryStorage + *quadratureGradients) const + { + interpolateKernel(nodalData, + quadratureValues, + quadratureGradients, + std::pair(0, d_nCells)); + } + + template + void + FEBasisOperations:: + integrateWithBasis( + std::map> + *quadratureValues, + std::map> + *quadratureGradients, + dftfe::linearAlgebra::MultiVector + &nodalData) const + { + for (unsigned int iCell = 0; iCell < d_nCells; ++iCell) + { + dealii::CellId currentCellId = d_cellIndexToCellIdMap[iCell]; + const dftfe::utils::MemoryStorage + *quadratureValuesCurrentCell = + &(quadratureValues->at(currentCellId)); + const dftfe::utils::MemoryStorage + *quadratureGradientsCurrentCell = + quadratureGradients ? 
&(quadratureGradients->at(currentCellId)) : + NULL; + integrateWithBasisKernel( + quadratureValuesCurrentCell, + quadratureGradientsCurrentCell, + nodalData, + std::pair(iCell, iCell + 1)); + } + } + + template + void + FEBasisOperations:: + integrateWithBasis( + dftfe::utils::MemoryStorage + *quadratureValues, + dftfe::utils::MemoryStorage + *quadratureGradients, + dftfe::linearAlgebra::MultiVector + &nodalData) const + { + integrateWithBasisKernel(quadratureValues, + quadratureGradients, + nodalData, + std::pair(0, + d_nCells)); + } + + + template + void + FEBasisOperations:: + extractToCellNodalData( + dftfe::linearAlgebra::MultiVector + &nodalData, + dftfe::utils::MemoryStorage + *cellNodalDataPtr) const + { + extractToCellNodalDataKernel( + nodalData, + cellNodalDataPtr, + std::pair(0, d_nCells)); + } + + template + void + FEBasisOperations:: + accumulateFromCellNodalData( + const dftfe::utils::MemoryStorage + *cellNodalDataPtr, + dftfe::linearAlgebra::MultiVector + &nodalData) const + { + accumulateFromCellNodalDataKernel( + cellNodalDataPtr, + nodalData, + std::pair(0, d_nCells)); + } + template + void + FEBasisOperations:: + interpolateKernel( + const dftfe::linearAlgebra::MultiVector + &nodalValues, + dftfe::utils::MemoryStorage + *quadratureValues, + dftfe::utils::MemoryStorage + * quadratureGradients, + const std::pair cellRange) const + { + dftfe::utils::MemoryStorage + cellNodalData, tempQuadratureGradientsData, + tempQuadratureGradientsDataNonAffine; + cellNodalData.resize(d_nVectors * d_nDofsPerCell); + + if (quadratureGradients != NULL) + tempQuadratureGradientsData.resize( + areAllCellsCartesian ? 0 : (d_nVectors * d_nQuadsPerCell * 3)); + + if (quadratureGradients != NULL) + tempQuadratureGradientsDataNonAffine.resize( + areAllCellsAffine ? 
0 : (d_nVectors * d_nQuadsPerCell * 3)); + + + for (unsigned int iCell = cellRange.first; iCell < cellRange.second; + ++iCell) + { + extractToCellNodalDataKernel( + nodalValues, + &cellNodalData, + std::pair(iCell, iCell + 1)); + const ValueTypeBasisCoeff scalarCoeffAlpha = ValueTypeBasisCoeff(1.0), + scalarCoeffBeta = ValueTypeBasisCoeff(0.0); + const char transA = 'N', transB = 'N'; + + xgemm(&transA, + &transB, + &d_nVectors, + &d_nQuadsPerCell, + &d_nDofsPerCell, + &scalarCoeffAlpha, + cellNodalData.data() + + d_nDofsPerCell * (iCell - cellRange.first) * d_nVectors, + &d_nVectors, + d_shapeFunctionData.data(), + &d_nDofsPerCell, + &scalarCoeffBeta, + quadratureValues->data() + + d_nQuadsPerCell * (iCell - cellRange.first) * d_nVectors, + &d_nVectors); + if (quadratureGradients != NULL) + { + const unsigned int d_nQuadsPerCellTimesThree = + d_nQuadsPerCell * 3; + xgemm(&transA, + &transB, + &d_nVectors, + &d_nQuadsPerCellTimesThree, + &d_nDofsPerCell, + &scalarCoeffAlpha, + cellNodalData.data() + + d_nDofsPerCell * (iCell - cellRange.first) * d_nVectors, + &d_nVectors, + d_shapeFunctionGradientData.data(), + &d_nDofsPerCell, + &scalarCoeffBeta, + areAllCellsCartesian ? 
(quadratureGradients->data() + + d_nQuadsPerCell * d_nVectors * 3 * + (iCell - cellRange.first)) : + (tempQuadratureGradientsData.data()), + &d_nVectors); + if (areAllCellsCartesian) + { + const unsigned int d_nQuadsPerCellTimesnVectors = + d_nQuadsPerCell * d_nVectors; + const unsigned int one = 1; + for (unsigned int iDim = 0; iDim < 3; ++iDim) + xscal(&d_nQuadsPerCellTimesnVectors, + d_inverseJacobianData.data() + 3 * iCell + iDim, + quadratureGradients->data() + + d_nQuadsPerCell * d_nVectors * 3 * + (iCell - cellRange.first) + + d_nQuadsPerCell * d_nVectors * iDim, + &one); + } + else if (areAllCellsAffine) + { + const unsigned int d_nQuadsPerCellTimesnVectors = + d_nQuadsPerCell * d_nVectors; + const unsigned int three = 3; + xgemm(&transA, + &transB, + &d_nQuadsPerCellTimesnVectors, + &three, + &three, + &scalarCoeffAlpha, + tempQuadratureGradientsData.data(), + &d_nQuadsPerCellTimesnVectors, + d_inverseJacobianData.data() + 9 * iCell, + &three, + &scalarCoeffBeta, + quadratureGradients->data() + + d_nQuadsPerCell * d_nVectors * 3 * + (iCell - cellRange.first), + &d_nQuadsPerCellTimesnVectors); + } + else + { + const unsigned int three = 3; + for (unsigned int iQuad = 0; iQuad < d_nQuadsPerCell; ++iQuad) + xgemm(&transA, + &transB, + &d_nVectors, + &three, + &three, + &scalarCoeffAlpha, + tempQuadratureGradientsData.data() + + iQuad * d_nVectors * 3, + &d_nVectors, + d_inverseJacobianData.data() + + 9 * d_nQuadsPerCell * iCell + 9 * iQuad, + &three, + &scalarCoeffBeta, + tempQuadratureGradientsDataNonAffine.data() + + iQuad * d_nVectors * 3, + &d_nVectors); + for (unsigned int iQuad = 0; iQuad < d_nQuadsPerCell; ++iQuad) + for (unsigned int iDim = 0; iDim < 3; ++iDim) + std::memcpy(quadratureGradients->data() + + d_nVectors * 3 * d_nQuadsPerCell * + (iCell - cellRange.first) + + d_nVectors * d_nQuadsPerCell * iDim + + d_nVectors * iQuad, + tempQuadratureGradientsDataNonAffine.data() + + d_nVectors * 3 * iQuad + d_nVectors * iDim, + d_nVectors * 
sizeof(ValueTypeBasisCoeff)); + } + } + } + } + + template + void + FEBasisOperations:: + integrateWithBasisKernel( + const dftfe::utils::MemoryStorage + *quadratureValues, + const dftfe::utils::MemoryStorage + *quadratureGradients, + dftfe::linearAlgebra::MultiVector + & nodalData, + const std::pair cellRange) const + { + dftfe::utils::MemoryStorage + cellNodalData, tempQuadratureGradientsData, + tempQuadratureGradientsDataNonAffine; + cellNodalData.resize(d_nVectors * d_nDofsPerCell * d_nCells); + if (quadratureGradients != NULL) + tempQuadratureGradientsData.resize(3 * d_nVectors * d_nQuadsPerCell); + + if (quadratureGradients != NULL) + tempQuadratureGradientsDataNonAffine.resize( + areAllCellsAffine ? 0 : (3 * d_nVectors * d_nQuadsPerCell)); + + + + for (unsigned int iCell = cellRange.first; iCell < cellRange.second; + ++iCell) + { + const ValueTypeBasisCoeff scalarCoeffAlpha = ValueTypeBasisCoeff(1.0), + scalarCoeffBeta = ValueTypeBasisCoeff(0.0); + const char transA = 'N', transB = 'T'; + + xgemm(&transA, + &transB, + &d_nVectors, + &d_nDofsPerCell, + &d_nQuadsPerCell, + &scalarCoeffAlpha, + quadratureValues->data() + d_nQuadsPerCell * iCell, + &d_nVectors, + d_shapeFunctionData.data(), + &d_nQuadsPerCell, + &scalarCoeffBeta, + cellNodalData.data() + d_nDofsPerCell * iCell, + &d_nVectors); + if (quadratureGradients != NULL) + { + if (areAllCellsCartesian) + { + const unsigned int d_nQuadsPerCellTimesnVectors = + d_nQuadsPerCell * d_nVectors; + const unsigned int one = 1; + std::memcpy(tempQuadratureGradientsData.data(), + quadratureGradients->data() + + d_nQuadsPerCell * d_nVectors * 3 * iCell, + 3 * d_nQuadsPerCellTimesnVectors * + sizeof(ValueTypeBasisCoeff)); + for (unsigned int iDim = 0; iDim < 3; ++iDim) + xscal(&d_nQuadsPerCellTimesnVectors, + d_inverseJacobianData.data() + 3 * iCell + iDim, + tempQuadratureGradientsData.data() + + d_nQuadsPerCell * d_nVectors * iDim, + &one); + } + else if (areAllCellsAffine) + { + const unsigned int 
d_nQuadsPerCellTimesnVectors = + d_nQuadsPerCell * d_nVectors; + const unsigned int three = 3; + xgemm(&transA, + &transB, + &d_nQuadsPerCellTimesnVectors, + &three, + &three, + &scalarCoeffAlpha, + quadratureGradients->data() + + d_nQuadsPerCell * d_nVectors * 3 * iCell, + &d_nQuadsPerCellTimesnVectors, + d_inverseJacobianData.data() + 9 * iCell, + &three, + &scalarCoeffBeta, + tempQuadratureGradientsData.data(), + &d_nQuadsPerCellTimesnVectors); + } + else + { + for (unsigned int iQuad = 0; iQuad < d_nQuadsPerCell; ++iQuad) + for (unsigned int iDim = 0; iDim < 3; ++iDim) + std::memcpy(tempQuadratureGradientsDataNonAffine.data() + + d_nVectors * 3 * iQuad + d_nVectors * iDim, + quadratureGradients->data() + + d_nVectors * 3 * d_nQuadsPerCell * iCell + + d_nVectors * d_nQuadsPerCell * iDim + + d_nVectors * iQuad, + d_nVectors * sizeof(ValueTypeBasisCoeff)); + const unsigned int three = 3; + for (unsigned int iQuad = 0; iQuad < d_nQuadsPerCell; ++iQuad) + xgemm(&transA, + &transB, + &d_nVectors, + &three, + &three, + &scalarCoeffAlpha, + tempQuadratureGradientsDataNonAffine.data() + + d_nVectors * 3 * iQuad, + &d_nVectors, + d_inverseJacobianData.data() + + 9 * d_nQuadsPerCell * iCell + 9 * iQuad, + &three, + &scalarCoeffBeta, + tempQuadratureGradientsData.data() + + d_nVectors * 3 * iQuad, + &d_nVectors); + } + const unsigned int d_nQuadsPerCellTimesThree = + d_nQuadsPerCell * 3; + xgemm(&transA, + &transB, + &d_nVectors, + &d_nQuadsPerCellTimesThree, + &d_nDofsPerCell, + &scalarCoeffAlpha, + tempQuadratureGradientsData.data(), + &d_nVectors, + d_shapeFunctionGradientData.data(), + &d_nDofsPerCell, + &scalarCoeffBeta, + cellNodalData.data() + d_nDofsPerCell * iCell, + &d_nVectors); + } + accumulateFromCellNodalDataKernel( + &cellNodalData, + nodalData, + std::pair(iCell, iCell + 1)); + } + } + + template + void + FEBasisOperations:: + extractToCellNodalDataKernel( + const dftfe::linearAlgebra::MultiVector + &nodalData, + dftfe::utils::MemoryStorage + * 
cellNodalDataPtr, + const std::pair cellRange) const + { + for (unsigned int iCell = cellRange.first; iCell < cellRange.second; + ++iCell) + for (unsigned int iDof = 0; iDof < d_nDofsPerCell; ++iDof) + { + std::memcpy(cellNodalDataPtr->data() + + (iCell - cellRange.first) * d_nVectors * + d_nDofsPerCell + + iDof * d_nVectors, + nodalData.data() + + d_flattenedCellDofIndexToProcessDofIndexMap + [iCell * d_nDofsPerCell + iDof], + d_nVectors * sizeof(ValueTypeBasisCoeff)); + } + } + + template + void + FEBasisOperations:: + accumulateFromCellNodalDataKernel( + const dftfe::utils::MemoryStorage + *cellNodalDataPtr, + dftfe::linearAlgebra::MultiVector + & nodalData, + const std::pair cellRange) const + { + for (unsigned int iCell = cellRange.first; iCell < cellRange.second; + ++iCell) + for (unsigned int iDof = 0; iDof < d_nDofsPerCell; ++iDof) + std::transform( + cellNodalDataPtr->data() + iCell * d_nVectors * d_nDofsPerCell + + iDof * d_nVectors, + cellNodalDataPtr->data() + iCell * d_nVectors * d_nDofsPerCell + + iDof * d_nVectors + d_nVectors, + nodalData.data() + d_flattenedCellDofIndexToProcessDofIndexMap + [iCell * d_nDofsPerCell + iDof], + nodalData.data() + d_flattenedCellDofIndexToProcessDofIndexMap + [iCell * d_nDofsPerCell + iDof], + std::plus()); + } + } // namespace basis +} // namespace dftfe diff --git a/utils/FEBasisOperationsHostKernels.cc b/utils/FEBasisOperationsHostKernels.cc deleted file mode 100644 index ec90708db..000000000 --- a/utils/FEBasisOperationsHostKernels.cc +++ /dev/null @@ -1,270 +0,0 @@ -// --------------------------------------------------------------------- -// -// Copyright (c) 2017-2022 The Regents of the University of Michigan and DFT-FE -// authors. -// -// This file is part of the DFT-FE code. 
-// -// The DFT-FE code is free software; you can use it, redistribute -// it, and/or modify it under the terms of the GNU Lesser General -// Public License as published by the Free Software Foundation; either -// version 2.1 of the License, or (at your option) any later version. -// The full text of the license can be found in the file LICENSE at -// the top level of the DFT-FE distribution. -// -// --------------------------------------------------------------------- -// - -#include -namespace dftfe -{ - namespace basis - { - template - void - FEBasisOperations:: - interpolateHostKernel( - dftfe::linearAlgebra::MultiVector - &nodalValues, - dftfe::utils::MemoryStorage - *quadratureValues, - dftfe::utils::MemoryStorage - * quadratureGradients, - std::pair cellRange, - bool useMacroCellSubCellOrdering) const - { - dftfe::utils::MemoryStorage - cellNodalData, tempQuadratureGradientsData; - cellNodalData.resize(d_nVectors * d_nDofsPerCell * d_nCells); - - if (quadratureGradients != NULL) - tempQuadratureGradientsData.resize(d_nVectors * d_nQuadsPerCell * 3); - - - extractToCellNodalDataHostKernel(nodalValues, - &cellNodalData, - cellRange, - useMacroCellSubCellOrdering); - - for (unsigned int iCell = cellRange.first; iCell < cellRange.second; - ++iCell) - { - const ValueTypeBasisCoeff scalarCoeffAlpha = ValueTypeBasisCoeff(1.0), - scalarCoeffBeta = ValueTypeBasisCoeff(0.0); - const char transA = 'N', transB = 'N'; - - xgemm(&transA, - &transB, - &d_nVectors, - &d_nQuadsPerCell, - &d_nDofsPerCell, - &scalarCoeffAlpha, - cellNodalData.data() + d_nDofsPerCell * iCell, - &d_nVectors, - d_shapeFunctionData.data(), - &d_nDofsPerCell, - &scalarCoeffBeta, - quadratureValues->data() + d_nQuadsPerCell * iCell, - &d_nVectors); - if (quadratureGradients != NULL) - { - const unsigned int d_nQuadsPerCellTimesThree = - d_nQuadsPerCell * 3; - xgemm(&transA, - &transB, - &d_nVectors, - &d_nQuadsPerCellTimesThree, - &d_nDofsPerCell, - &scalarCoeffAlpha, - cellNodalData.data() + 
d_nDofsPerCell * iCell, - &d_nVectors, - d_shapeFunctionGradientData.data(), - &d_nDofsPerCell, - &scalarCoeffBeta, - tempQuadratureGradientsData.data(), - &d_nVectors); - const unsigned int d_nQuadsPerCellTimesnVectors = - d_nQuadsPerCell * d_nVectors; - const unsigned int three = 3; - xgemm(&transA, - &transB, - &d_nQuadsPerCellTimesnVectors, - &three, - &three, - &scalarCoeffAlpha, - tempQuadratureGradientsData.data(), - &d_nQuadsPerCellTimesnVectors, - d_inverseJacobianData.data() + 9 * iCell, - &three, - &scalarCoeffBeta, - quadratureGradients->data() + d_nQuadsPerCell * 3 * iCell, - &d_nQuadsPerCellTimesnVectors); - } - } - } - - template - void - FEBasisOperations:: - integrateWithBasisHostKernel( - dftfe::utils::MemoryStorage - *quadratureValues, - dftfe::utils::MemoryStorage - *quadratureGradients, - dftfe::linearAlgebra::MultiVector - & nodalData, - std::pair cellRange, - bool useMacroCellSubCellOrdering) const - { - dftfe::utils::MemoryStorage - cellNodalData, tempQuadratureGradientsData; - cellNodalData.resize(d_nVectors * d_nDofsPerCell * d_nCells); - if (quadratureGradients != NULL) - tempQuadratureGradientsData.resize(3 * d_nVectors * d_nQuadsPerCell); - - - - for (unsigned int iCell = cellRange.first; iCell < cellRange.second; - ++iCell) - { - const ValueTypeBasisCoeff scalarCoeffAlpha = ValueTypeBasisCoeff(1.0), - scalarCoeffBeta = ValueTypeBasisCoeff(0.0); - const char transA = 'N', transB = 'T'; - - xgemm(&transA, - &transB, - &d_nVectors, - &d_nDofsPerCell, - &d_nQuadsPerCell, - &scalarCoeffAlpha, - quadratureValues->data() + d_nQuadsPerCell * iCell, - &d_nVectors, - d_shapeFunctionData.data(), - &d_nQuadsPerCell, - &scalarCoeffBeta, - cellNodalData.data() + d_nDofsPerCell * iCell, - &d_nVectors); - if (quadratureGradients != NULL) - { - const unsigned int d_nQuadsPerCellTimesThree = - d_nQuadsPerCell * 3; - const unsigned int d_nQuadsPerCellTimesnVectors = - d_nQuadsPerCell * d_nVectors; - const unsigned int three = 3; - xgemm(&transA, - 
&transB, - &d_nQuadsPerCellTimesnVectors, - &three, - &three, - &scalarCoeffAlpha, - quadratureGradients->data() + d_nQuadsPerCell * 3 * iCell, - &d_nQuadsPerCellTimesnVectors, - d_inverseJacobianData.data() + 9 * iCell, - &three, - &scalarCoeffBeta, - tempQuadratureGradientsData.data(), - &d_nQuadsPerCellTimesnVectors); - xgemm(&transA, - &transB, - &d_nVectors, - &d_nDofsPerCell, - &d_nQuadsPerCellTimesThree, - &scalarCoeffAlpha, - tempQuadratureGradientsData.data(), - &d_nVectors, - d_shapeFunctionGradientData.data(), - &d_nQuadsPerCellTimesThree, - &scalarCoeffAlpha, - cellNodalData.data() + d_nDofsPerCell * iCell, - &d_nVectors); - } - } - accumulateFromCellNodalDataHostKernel(&cellNodalData, - nodalData, - cellRange, - useMacroCellSubCellOrdering); - } - - template - void - FEBasisOperations:: - extractToCellNodalDataHostKernel( - dftfe::linearAlgebra::MultiVector - &nodalData, - dftfe::utils::MemoryStorage - * cellNodalDataPtr, - std::pair cellRange, - bool useMacroCellSubCellOrdering) const - { - auto &cellDofIndexToProcessDofIndexMap = - useMacroCellSubCellOrdering ? - d_macroCellSubCellDofIndexToProcessDofIndexMap : - d_cellDofIndexToProcessDofIndexMap; - - for (unsigned int iCell = cellRange.first; iCell < cellRange.second; - ++iCell) - for (unsigned int iDof = 0; iDof < d_nDofsPerCell; ++iDof) - std::memcpy( - cellNodalDataPtr + iCell * d_nVectors * d_nDofsPerCell + - iDof * d_nVectors, - nodalData.data() + - d_nVectors * - cellDofIndexToProcessDofIndexMap[iCell * d_nDofsPerCell + iDof], - d_nVectors * sizeof(ValueTypeBasisCoeff)); - } - - template - void - FEBasisOperations:: - accumulateFromCellNodalDataHostKernel( - dftfe::utils::MemoryStorage - *cellNodalDataPtr, - dftfe::linearAlgebra::MultiVector - & nodalData, - std::pair cellRange, - bool useMacroCellSubCellOrdering) const - { - auto &cellDofIndexToProcessDofIndexMap = - useMacroCellSubCellOrdering ? 
- d_macroCellSubCellDofIndexToProcessDofIndexMap : - d_cellDofIndexToProcessDofIndexMap; - - for (unsigned int iCell = cellRange.first; iCell < cellRange.second; - ++iCell) - for (unsigned int iDof = 0; iDof < d_nDofsPerCell; ++iDof) - std::transform( - cellNodalDataPtr + iCell * d_nVectors * d_nDofsPerCell + - iDof * d_nVectors, - cellNodalDataPtr + iCell * d_nVectors * d_nDofsPerCell + - iDof * d_nVectors + d_nVectors, - nodalData.data() + - d_nVectors * - cellDofIndexToProcessDofIndexMap[iCell * d_nDofsPerCell + iDof], - nodalData.data() + - d_nVectors * - cellDofIndexToProcessDofIndexMap[iCell * d_nDofsPerCell + iDof], - std::plus()); - } - - } // namespace basis -} // namespace dftfe diff --git a/utils/vectorTools/vectorUtilities.cc b/utils/vectorTools/vectorUtilities.cc index 508d87ee8..68a4a448d 100644 --- a/utils/vectorTools/vectorUtilities.cc +++ b/utils/vectorTools/vectorUtilities.cc @@ -682,8 +682,9 @@ namespace dftfe #ifdef USE_COMPLEX void copyFlattenedSTLVecToSingleCompVec( - const std::vector> & flattenedArray, + const std::complex * flattenedArray, const unsigned int totalNumberComponents, + const unsigned int localVectorSize, const std::pair componentIndexRange, const std::vector &localProcDofIndicesReal, @@ -699,8 +700,6 @@ namespace dftfe dealii::ExcMessage( "componentIndexRange doesn't lie within totalNumberComponents")); - const unsigned int localVectorSize = - flattenedArray.size() / totalNumberComponents; for (unsigned int iNode = 0; iNode < localVectorSize; ++iNode) for (unsigned int icomp = componentIndexRange.first; icomp < componentIndexRange.second; @@ -720,8 +719,9 @@ namespace dftfe void copyFlattenedSTLVecToSingleCompVec( - const std::vector> & flattenedArray, + const std::complex * flattenedArray, const unsigned int totalNumberComponents, + const unsigned int localVectorSize, const std::pair componentIndexRange, std::vector> & componentVectors) { @@ -733,8 +733,6 @@ namespace dftfe dealii::ExcMessage( "componentIndexRange doesn't lie 
within totalNumberComponents")); - const unsigned int localVectorSize = - flattenedArray.size() / totalNumberComponents; for (unsigned int iNode = 0; iNode < localVectorSize; ++iNode) for (unsigned int icomp = componentIndexRange.first; icomp < componentIndexRange.second; @@ -750,8 +748,9 @@ namespace dftfe #else void copyFlattenedSTLVecToSingleCompVec( - const std::vector &flattenedArray, + const double *flattenedArray, const unsigned int totalNumberComponents, + const unsigned int localVectorSize, const std::pair componentIndexRange, std::vector> &componentVectors) { @@ -762,8 +761,6 @@ namespace dftfe componentIndexRange.second <= totalNumberComponents, dealii::ExcMessage( "componentIndexRange doesn't lie within totalNumberComponents")); - const unsigned int localVectorSize = - flattenedArray.size() / totalNumberComponents; for (unsigned int iNode = 0; iNode < localVectorSize; ++iNode) for (unsigned int icomp = componentIndexRange.first; icomp < componentIndexRange.second; From a1f46aa4b05cfe5d5d987836cb3847292e29c1e0 Mon Sep 17 00:00:00 2001 From: Nikhil Kodali Date: Thu, 21 Sep 2023 21:26:31 +0530 Subject: [PATCH 03/25] nscf compilation fix --- src/dft/solveNSCF.cc | 12 +----------- 1 file changed, 1 insertion(+), 11 deletions(-) diff --git a/src/dft/solveNSCF.cc b/src/dft/solveNSCF.cc index 58a823225..624907b26 100644 --- a/src/dft/solveNSCF.cc +++ b/src/dft/solveNSCF.cc @@ -1130,17 +1130,7 @@ namespace dftfe if (d_dftParamsPtr->useDevice && (d_dftParamsPtr->writeWfcSolutionFields || d_dftParamsPtr->writeLdosFile || d_dftParamsPtr->writePdosFile)) - for (unsigned int kPoint = 0; - kPoint < - (1 + d_dftParamsPtr->spinPolarized) * d_kPointWeights.size(); - ++kPoint) - { - d_eigenVectorsFlattenedDevice.copyTo( - &d_eigenVectorsFlattenedSTL[kPoint][0], - d_eigenVectorsFlattenedSTL[kPoint].size(), - (kPoint * d_eigenVectorsFlattenedSTL[0].size()), - 0); - } + d_eigenVectorsFlattenedDevice.copyTo(d_eigenVectorsFlattenedHost); #endif //#ifdef USE_COMPLEX From 
b74554c9978b243b52ae2eeb11de0fc63026a8dc Mon Sep 17 00:00:00 2001 From: Nikhil Kodali Date: Sun, 24 Sep 2023 08:43:08 +0530 Subject: [PATCH 04/25] densityCalculator bugfixes --- CMakeLists.txt | 2 - src/dft/densityCalculator.cc | 84 ++++++++++++++++++++++-------------- utils/FEBasisOperations.cc | 41 ++++++++---------- 3 files changed, 71 insertions(+), 56 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 7447aad00..0a095c478 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -47,7 +47,6 @@ SET(TARGET_SRC ./src/dft/dft.cc ./src/dft/vselfBinsManager.cc ./src/dft/energyCalculator.cc - ./src/dft/densityCalculatorCPU.cc ./src/dft/densityCalculator.cc ./src/dft/densityFirstOrderResponseCalculatorCPU.cc ./src/excManager/excDensityBaseClass.cpp @@ -169,7 +168,6 @@ SET(DEVICE_SRC ./utils/MemoryTransferKernelsDevice.cc ./utils/DeviceKernelsGeneric.cc ./utils/DeviceDirectCCLWrapper.cc - ./src/dft/densityCalculatorDevice.cc ./src/dft/densityCalculatorDeviceKernels.cc ./src/dft/densityFirstOrderResponseCalculatorDevice.cc ./src/dftOperator/operatorDevice.cc diff --git a/src/dft/densityCalculator.cc b/src/dft/densityCalculator.cc index 6f3e408cf..836a6e076 100644 --- a/src/dft/densityCalculator.cc +++ b/src/dft/densityCalculator.cc @@ -85,7 +85,7 @@ namespace dftfe bandGroupLowHighPlusOneIndices); const unsigned int BVec = - std::min(dftParams.chebyWfcBlockSize, totalNumWaveFunctions); + std::min(dftParams.chebyWfcBlockSize, bandGroupLowHighPlusOneIndices[1]); const double spinPolarizedFactor = (dftParams.spinPolarized == 1) ? 
1.0 : 2.0; @@ -181,8 +181,6 @@ namespace dftfe for (unsigned int kPoint = 0; kPoint < kPointWeights.size(); ++kPoint) { - rho.setValue(0.0); - gradRho.setValue(0.0); for (unsigned int spinIndex = 0; spinIndex < numSpinComponents; ++spinIndex) { @@ -255,18 +253,17 @@ namespace dftfe kPointWeights[kPoint] * spinPolarizedFactor; } } + } #if defined(DFTFE_WITH_DEVICE) - for (unsigned int spinIndex = 0; - spinIndex < numSpinComponents; - ++spinIndex) - { - partialOccupVec[spinIndex].resize( - partialOccupVecHost[spinIndex].size()); - partialOccupVec[spinIndex].copyFrom( - partialOccupVecHost[spinIndex]); - } + for (unsigned int spinIndex = 0; spinIndex < numSpinComponents; + ++spinIndex) + { + partialOccupVec[spinIndex].resize( + partialOccupVecHost[spinIndex].size()); + partialOccupVec[spinIndex].copyFrom( + partialOccupVecHost[spinIndex]); + } #endif - } for (unsigned int spinIndex = 0; spinIndex < numSpinComponents; ++spinIndex) if (memorySpace == dftfe::utils::MemorySpace::HOST) @@ -298,8 +295,8 @@ namespace dftfe use2pPlusOneGLQuad ? 
2 : 0; dftfe::basis::UpdateFlags updateFlags = dftfe::basis::update_values | dftfe::basis::update_gradients; - basisOperationsPtr->reinit(currentBlockSize * numSpinComponents, - 0, + basisOperationsPtr->reinit(currentBlockSize, + matrixFreeDofhandlerIndex, d_quadratureIndex, updateFlags); @@ -440,7 +437,8 @@ namespace dftfe if (memorySpace == dftfe::utils::MemorySpace::HOST) for (unsigned int iNode = 0; iNode < numLocalDofs; ++iNode) - for (unsigned int iWave = 0; iWave < Nfr; ++iWave) + for (unsigned int iWave = 0; iWave < currentBlockSize; + ++iWave) flattenedArrayBlock[spinIndex] .data()[iNode * currentBlockSize + iWave] = (XFrac->data())[numLocalDofs * Nfr * @@ -466,7 +464,7 @@ namespace dftfe dftfe::basis::update_values | dftfe::basis::update_gradients; basisOperationsPtr->reinit(currentBlockSize, - 0, + matrixFreeDofhandlerIndex, d_quadratureIndex, updateFlags); @@ -609,6 +607,10 @@ namespace dftfe tempRhoQuadsSP[2 * q + 1] = rhoHost[totalLocallyOwnedCells * numQuadPoints + iElem * numQuadPoints + q]; + tempRhoQuads[q] = + rhoHost[iElem * numQuadPoints + q] + + rhoHost[totalLocallyOwnedCells * numQuadPoints + + iElem * numQuadPoints + q]; } if (isEvaluateGradRho) @@ -633,25 +635,43 @@ namespace dftfe gradRhoHost[totalLocallyOwnedCells * numQuadPoints * 3 + 2 * totalLocallyOwnedCells * numQuadPoints + iElem * numQuadPoints + q]; + tempGradRhoQuads[3 * q] = + gradRhoHost[iElem * numQuadPoints + q] + + gradRhoHost[totalLocallyOwnedCells * numQuadPoints * 3 + + iElem * numQuadPoints + q]; + tempGradRhoQuads[3 * q + 1] = + gradRhoHost[totalLocallyOwnedCells * numQuadPoints + + iElem * numQuadPoints + q] + + gradRhoHost[totalLocallyOwnedCells * numQuadPoints * 3 + + totalLocallyOwnedCells * numQuadPoints + + iElem * numQuadPoints + q]; + tempGradRhoQuads[3 * q + 2] = + gradRhoHost[2 * totalLocallyOwnedCells * numQuadPoints + + iElem * numQuadPoints + q] + + gradRhoHost[totalLocallyOwnedCells * numQuadPoints * 3 + + 2 * totalLocallyOwnedCells * numQuadPoints + + 
iElem * numQuadPoints + q]; } } - - for (unsigned int q = 0; q < numQuadPoints; ++q) - tempRhoQuads[q] = rhoHost[iElem * numQuadPoints + q]; + else + { + for (unsigned int q = 0; q < numQuadPoints; ++q) + tempRhoQuads[q] = rhoHost[iElem * numQuadPoints + q]; - if (isEvaluateGradRho) - for (unsigned int q = 0; q < numQuadPoints; ++q) - { - tempGradRhoQuads[3 * q] = - gradRhoHost[iElem * numQuadPoints + q]; - tempGradRhoQuads[3 * q + 1] = - gradRhoHost[totalLocallyOwnedCells * numQuadPoints + - iElem * numQuadPoints + q]; - tempGradRhoQuads[3 * q + 2] = - gradRhoHost[2 * totalLocallyOwnedCells * numQuadPoints + - iElem * numQuadPoints + q]; - } + if (isEvaluateGradRho) + for (unsigned int q = 0; q < numQuadPoints; ++q) + { + tempGradRhoQuads[3 * q] = + gradRhoHost[iElem * numQuadPoints + q]; + tempGradRhoQuads[3 * q + 1] = + gradRhoHost[totalLocallyOwnedCells * numQuadPoints + + iElem * numQuadPoints + q]; + tempGradRhoQuads[3 * q + 2] = + gradRhoHost[2 * totalLocallyOwnedCells * numQuadPoints + + iElem * numQuadPoints + q]; + } + } iElem++; } } diff --git a/utils/FEBasisOperations.cc b/utils/FEBasisOperations.cc index 5ff644f48..f6934aab4 100644 --- a/utils/FEBasisOperations.cc +++ b/utils/FEBasisOperations.cc @@ -168,17 +168,19 @@ namespace dftfe ValueTypeBasisData, memorySpace>::initializeIndexMaps() { - d_nCells = d_matrixFreeDataPtr->n_physical_cells(); - d_nDofsPerCell = - d_matrixFreeDataPtr->get_dof_handler(0).get_fe().dofs_per_cell; + d_nCells = d_matrixFreeDataPtr->n_physical_cells(); + d_nDofsPerCell = d_matrixFreeDataPtr->get_dof_handler(d_dofHandlerID) + .get_fe() + .dofs_per_cell; d_cellDofIndexToProcessDofIndexMap.clear(); d_cellDofIndexToProcessDofIndexMap.resize(d_nCells * d_nDofsPerCell); d_cellIndexToCellIdMap.clear(); d_cellIndexToCellIdMap.resize(d_nCells); - auto cellPtr = d_matrixFreeDataPtr->get_dof_handler(0).begin_active(); - auto endcPtr = d_matrixFreeDataPtr->get_dof_handler(0).end(); + auto cellPtr = + 
d_matrixFreeDataPtr->get_dof_handler(d_dofHandlerID).begin_active(); + auto endcPtr = d_matrixFreeDataPtr->get_dof_handler(d_dofHandlerID).end(); std::vector cellDofIndicesGlobal(d_nDofsPerCell); std::map cellIdToCellIndexMap; @@ -191,8 +193,8 @@ namespace dftfe for (unsigned int iDof = 0; iDof < d_nDofsPerCell; ++iDof) d_cellDofIndexToProcessDofIndexMap[iCell * d_nDofsPerCell + iDof] = - d_matrixFreeDataPtr->get_vector_partitioner(0)->global_to_local( - cellDofIndicesGlobal[iDof]); + d_matrixFreeDataPtr->get_vector_partitioner(d_dofHandlerID) + ->global_to_local(cellDofIndicesGlobal[iDof]); d_cellIndexToCellIdMap[iCell] = cellPtr->id(); @@ -211,11 +213,13 @@ namespace dftfe memorySpace>::initializeConstraints() { d_constraintInfo.initialize(d_matrixFreeDataPtr->get_vector_partitioner( - 0), - *((*d_constraintsVector)[0])); + d_dofHandlerID), + *((*d_constraintsVector)[d_dofHandlerID])); d_constraintInfo.precomputeMaps( - d_matrixFreeDataPtr->get_vector_partitioner(0)->locally_owned_size() + - d_matrixFreeDataPtr->get_vector_partitioner(0)->n_ghost_indices(), + d_matrixFreeDataPtr->get_vector_partitioner(d_dofHandlerID) + ->locally_owned_size() + + d_matrixFreeDataPtr->get_vector_partitioner(d_dofHandlerID) + ->n_ghost_indices(), d_nVectors); } @@ -230,7 +234,7 @@ namespace dftfe const dealii::Quadrature<3> &quadrature = d_matrixFreeDataPtr->get_quadrature(d_quadratureID); dealii::FEValues<3> fe_values( - d_matrixFreeDataPtr->get_dof_handler(0).get_fe(), + d_matrixFreeDataPtr->get_dof_handler(d_dofHandlerID).get_fe(), quadrature, dealii::update_values | dealii::update_gradients | dealii::update_jacobians | dealii::update_JxW_values | @@ -282,8 +286,9 @@ namespace dftfe const unsigned int nJacobiansPerCell = areAllCellsAffine ? 
1 : d_nQuadsPerCell; - auto cellPtr = d_matrixFreeDataPtr->get_dof_handler(0).begin_active(); - auto endcPtr = d_matrixFreeDataPtr->get_dof_handler(0).end(); + auto cellPtr = + d_matrixFreeDataPtr->get_dof_handler(d_dofHandlerID).begin_active(); + auto endcPtr = d_matrixFreeDataPtr->get_dof_handler(d_dofHandlerID).end(); unsigned int iCell = 0; for (; cellPtr != endcPtr; ++cellPtr) @@ -386,13 +391,5 @@ namespace dftfe { d_constraintInfo.distribute(multiVector, d_nVectors); } - - - // template class FEBasisOperations; - // template class FEBasisOperations; } // namespace basis } // namespace dftfe From a55948876b10d344558e37c0606f107e5a583809 Mon Sep 17 00:00:00 2001 From: Nikhil Kodali Date: Mon, 25 Sep 2023 15:46:07 +0530 Subject: [PATCH 05/25] densityCalculator bugfixes and optimizations --- include/DeviceBlasWrapper.h | 56 +++++++ include/FEBasisOperations.h | 28 +++- src/dft/densityCalculator.cc | 194 +++++++++------------- src/dft/densityCalculatorDeviceKernels.cc | 75 +++++---- utils/DeviceBlasWrapper.cu.cc | 102 ++++++++++++ utils/FEBasisOperations.cc | 60 +++++-- utils/FEBasisOperationsDevice.cc | 25 +-- 7 files changed, 362 insertions(+), 178 deletions(-) diff --git a/include/DeviceBlasWrapper.h b/include/DeviceBlasWrapper.h index 0331c7e45..b246bff99 100644 --- a/include/DeviceBlasWrapper.h +++ b/include/DeviceBlasWrapper.h @@ -260,6 +260,62 @@ namespace dftfe long long int strideC, int batchCount); + deviceBlasStatus_t + gemv(deviceBlasHandle_t handle, + deviceBlasOperation_t trans, + int m, + int n, + const double * alpha, + const double * A, + int lda, + const double * x, + int incx, + const double * beta, + double * y, + int incy); + + deviceBlasStatus_t + gemv(deviceBlasHandle_t handle, + deviceBlasOperation_t trans, + int m, + int n, + const float * alpha, + const float * A, + int lda, + const float * x, + int incx, + const float * beta, + float * y, + int incy); + + deviceBlasStatus_t + gemm(deviceBlasHandle_t handle, + deviceBlasOperation_t trans, 
+ int m, + int n, + const std::complex *alpha, + const std::complex *A, + int lda, + const std::complex *x, + int incx, + const std::complex *beta, + std::complex * y, + int incy); + + deviceBlasStatus_t + gemm(deviceBlasHandle_t handle, + deviceBlasOperation_t trans, + int m, + int n, + const std::complex *alpha, + const std::complex *A, + int lda, + const std::complex *x, + int incx, + const std::complex *beta, + std::complex * y, + int incy); + } // namespace deviceBlasWrapper } // namespace utils diff --git a/include/FEBasisOperations.h b/include/FEBasisOperations.h index f06eb0e84..30c7fcff1 100644 --- a/include/FEBasisOperations.h +++ b/include/FEBasisOperations.h @@ -74,6 +74,11 @@ namespace dftfe dftfe::utils::MemorySpace memorySpace> class FEBasisOperationsBase { + protected: + mutable dftfe::utils::MemoryStorage + tempCellNodalData, tempQuadratureGradientsData, + tempQuadratureGradientsDataNonAffine; + public: FEBasisOperationsBase( dealii::MatrixFree<3, ValueTypeBasisData> &matrixFreeData, @@ -83,7 +88,8 @@ namespace dftfe ~FEBasisOperationsBase() = default; void - reinit(const unsigned int &blockSize, + reinit(const unsigned int &vecBlockSize, + const unsigned int &cellBlockSize, const unsigned int &dofHandlerID, const unsigned int &quadratureID, const UpdateFlags updateFlags = update_values); @@ -112,6 +118,9 @@ namespace dftfe void initializeShapeFunctionAndJacobianData(); + void + resizeTempStorage(); + void createMultiVector( const unsigned int dofHandlerIndex, @@ -150,6 +159,7 @@ namespace dftfe unsigned int d_dofHandlerID; unsigned int d_nVectors; unsigned int d_nCells; + unsigned int d_cellsBlockSize; unsigned int d_nDofsPerCell; unsigned int d_nQuadsPerCell; bool areAllCellsAffine; @@ -371,10 +381,26 @@ namespace dftfe using FEBasisOperationsBase::d_nCells; + using FEBasisOperationsBase< + ValueTypeBasisCoeff, + ValueTypeBasisData, + dftfe::utils::MemorySpace::DEVICE>::tempCellNodalData; + using FEBasisOperationsBase< + ValueTypeBasisCoeff, + 
ValueTypeBasisData, + dftfe::utils::MemorySpace::DEVICE>::tempQuadratureGradientsData; + using FEBasisOperationsBase:: + tempQuadratureGradientsDataNonAffine; using FEBasisOperationsBase< ValueTypeBasisCoeff, ValueTypeBasisData, dftfe::utils::MemorySpace::DEVICE>::d_nVectors; + using FEBasisOperationsBase< + ValueTypeBasisCoeff, + ValueTypeBasisData, + dftfe::utils::MemorySpace::DEVICE>::d_cellsBlockSize; using FEBasisOperationsBase< ValueTypeBasisCoeff, ValueTypeBasisData, diff --git a/src/dft/densityCalculator.cc b/src/dft/densityCalculator.cc index 836a6e076..664d5d79f 100644 --- a/src/dft/densityCalculator.cc +++ b/src/dft/densityCalculator.cc @@ -69,10 +69,10 @@ namespace dftfe { int this_process; MPI_Comm_rank(mpiCommParent, &this_process); - // dftfe::utils::deviceSynchronize(); - // MPI_Barrier(mpiCommParent); - // double device_time = MPI_Wtime(); - const unsigned int numKPoints = kPointWeights.size(); + dftfe::utils::deviceSynchronize(); + MPI_Barrier(mpiCommParent); + double computeRho_time = MPI_Wtime(); + const unsigned int numKPoints = kPointWeights.size(); // band group parallelization data structures const unsigned int numberBandGroups = @@ -165,6 +165,8 @@ namespace dftfe #if defined(DFTFE_WITH_DEVICE) std::vector> partialOccupVec(numSpinComponents); + for (unsigned int spinIndex = 0; spinIndex < numSpinComponents; ++spinIndex) + partialOccupVec[spinIndex].resize(partialOccupVecHost[spinIndex].size()); #else auto &partialOccupVec = partialOccupVecHost; #endif @@ -257,26 +259,18 @@ namespace dftfe #if defined(DFTFE_WITH_DEVICE) for (unsigned int spinIndex = 0; spinIndex < numSpinComponents; ++spinIndex) - { - partialOccupVec[spinIndex].resize( - partialOccupVecHost[spinIndex].size()); - partialOccupVec[spinIndex].copyFrom( - partialOccupVecHost[spinIndex]); - } + partialOccupVec[spinIndex].copyFrom( + partialOccupVecHost[spinIndex]); #endif for (unsigned int spinIndex = 0; spinIndex < numSpinComponents; ++spinIndex) if (memorySpace == 
dftfe::utils::MemorySpace::HOST) for (unsigned int iNode = 0; iNode < numLocalDofs; ++iNode) - for (unsigned int iWave = 0; iWave < currentBlockSize; - ++iWave) - flattenedArrayBlock[spinIndex] - .data()[iNode * currentBlockSize + iWave] = - (X->data())[numLocalDofs * totalNumWaveFunctions * + std::memcpy(flattenedArrayBlock[spinIndex] + .data()+iNode * currentBlockSize,X->data()+numLocalDofs * totalNumWaveFunctions * (numSpinComponents * kPoint + spinIndex) + - iNode * totalNumWaveFunctions + jvec + - iWave]; + iNode * totalNumWaveFunctions + jvec,currentBlockSize*sizeof(NumberType)); #if defined(DFTFE_WITH_DEVICE) else if (memorySpace == dftfe::utils::MemorySpace::DEVICE) dftfe::utils::deviceKernelsGeneric:: @@ -296,6 +290,7 @@ namespace dftfe dftfe::basis::UpdateFlags updateFlags = dftfe::basis::update_values | dftfe::basis::update_gradients; basisOperationsPtr->reinit(currentBlockSize, + cellsBlockSize, matrixFreeDofhandlerIndex, d_quadratureIndex, updateFlags); @@ -437,14 +432,19 @@ namespace dftfe if (memorySpace == dftfe::utils::MemorySpace::HOST) for (unsigned int iNode = 0; iNode < numLocalDofs; ++iNode) - for (unsigned int iWave = 0; iWave < currentBlockSize; - ++iWave) - flattenedArrayBlock[spinIndex] - .data()[iNode * currentBlockSize + iWave] = - (XFrac->data())[numLocalDofs * Nfr * + std::memcpy(flattenedArrayBlock[spinIndex] + .data()+iNode * currentBlockSize,XFrac->data()+numLocalDofs * Nfr * (numSpinComponents * kPoint + spinIndex) + - iNode * Nfr + jvec + iWave]; + iNode * Nfr + jvec,currentBlockSize*sizeof(NumberType)); + // for (unsigned int iWave = 0; iWave < currentBlockSize; + // ++iWave) + // flattenedArrayBlock[spinIndex] + // .data()[iNode * currentBlockSize + iWave] = + // (XFrac->data())[numLocalDofs * Nfr * + // (numSpinComponents * kPoint + + // spinIndex) + + // iNode * Nfr + jvec + iWave]; #if defined(DFTFE_WITH_DEVICE) else if (memorySpace == dftfe::utils::MemorySpace::DEVICE) dftfe::utils::deviceKernelsGeneric:: @@ -464,6 +464,7 @@ 
namespace dftfe dftfe::basis::update_values | dftfe::basis::update_gradients; basisOperationsPtr->reinit(currentBlockSize, + cellsBlockSize, matrixFreeDofhandlerIndex, d_quadratureIndex, updateFlags); @@ -575,6 +576,9 @@ namespace dftfe MPI_SUM, interBandGroupComm); } + dftfe::utils::deviceSynchronize(); + MPI_Barrier(mpiCommParent); + double computeRho_time2 = MPI_Wtime(); unsigned int iElem = 0; auto cell = dofHandler.begin_active(); @@ -601,79 +605,60 @@ namespace dftfe { for (unsigned int q = 0; q < numQuadPoints; ++q) { - tempRhoQuadsSP[2 * q + 0] = - rhoHost[iElem * numQuadPoints + q]; - - tempRhoQuadsSP[2 * q + 1] = - rhoHost[totalLocallyOwnedCells * numQuadPoints + - iElem * numQuadPoints + q]; - tempRhoQuads[q] = - rhoHost[iElem * numQuadPoints + q] + - rhoHost[totalLocallyOwnedCells * numQuadPoints + + const double rho0 = rhoHost[iElem * numQuadPoints + q]; + const double rho1 = rhoHost[totalLocallyOwnedCells * numQuadPoints + iElem * numQuadPoints + q]; + tempRhoQuadsSP[2 * q + 0] = rho0; + + tempRhoQuadsSP[2 * q + 1] = rho1; + tempRhoQuads[q] = rho0+rho1; } if (isEvaluateGradRho) for (unsigned int q = 0; q < numQuadPoints; ++q) { - tempGradRhoQuadsSP[6 * q + 0] = - gradRhoHost[iElem * numQuadPoints + q]; - tempGradRhoQuadsSP[6 * q + 1] = - gradRhoHost[totalLocallyOwnedCells * numQuadPoints + - iElem * numQuadPoints + q]; - tempGradRhoQuadsSP[6 * q + 2] = - gradRhoHost[2 * totalLocallyOwnedCells * numQuadPoints + - iElem * numQuadPoints + q]; - tempGradRhoQuadsSP[6 * q + 3] = - gradRhoHost[totalLocallyOwnedCells * numQuadPoints * 3 + - iElem * numQuadPoints + q]; - tempGradRhoQuadsSP[6 * q + 4] = - gradRhoHost[totalLocallyOwnedCells * numQuadPoints * 3 + - totalLocallyOwnedCells * numQuadPoints + - iElem * numQuadPoints + q]; - tempGradRhoQuadsSP[6 * q + 5] = - gradRhoHost[totalLocallyOwnedCells * numQuadPoints * 3 + - 2 * totalLocallyOwnedCells * numQuadPoints + - iElem * numQuadPoints + q]; - tempGradRhoQuads[3 * q] = - gradRhoHost[iElem * 
numQuadPoints + q] + - gradRhoHost[totalLocallyOwnedCells * numQuadPoints * 3 + - iElem * numQuadPoints + q]; - tempGradRhoQuads[3 * q + 1] = - gradRhoHost[totalLocallyOwnedCells * numQuadPoints + - iElem * numQuadPoints + q] + - gradRhoHost[totalLocallyOwnedCells * numQuadPoints * 3 + - totalLocallyOwnedCells * numQuadPoints + - iElem * numQuadPoints + q]; - tempGradRhoQuads[3 * q + 2] = - gradRhoHost[2 * totalLocallyOwnedCells * numQuadPoints + - iElem * numQuadPoints + q] + - gradRhoHost[totalLocallyOwnedCells * numQuadPoints * 3 + - 2 * totalLocallyOwnedCells * numQuadPoints + - iElem * numQuadPoints + q]; + const double gradRho0x = gradRhoHost[iElem * numQuadPoints * 3 + 3*q]; + const double gradRho0y = gradRhoHost[iElem * numQuadPoints * 3 + 3*q+1]; + const double gradRho0z = gradRhoHost[iElem * numQuadPoints * 3 + 3*q+2]; + const double gradRho1x = gradRhoHost[totalLocallyOwnedCells * numQuadPoints * 3+iElem * numQuadPoints * 3 + 3*q]; + const double gradRho1y = gradRhoHost[totalLocallyOwnedCells * numQuadPoints * 3+iElem * numQuadPoints * 3 + 3*q+1]; + const double gradRho1z = gradRhoHost[totalLocallyOwnedCells * numQuadPoints * 3+iElem * numQuadPoints * 3 + 3*q+2]; + tempGradRhoQuadsSP[6 * q + 0] = gradRho0x; + tempGradRhoQuadsSP[6 * q + 1] = gradRho0y; + tempGradRhoQuadsSP[6 * q + 2] = gradRho0z; + tempGradRhoQuadsSP[6 * q + 3] = gradRho1x; + tempGradRhoQuadsSP[6 * q + 4] = gradRho1y; + tempGradRhoQuadsSP[6 * q + 5] = gradRho1z; + tempGradRhoQuads[3 * q] = gradRho0x + gradRho1x; + tempGradRhoQuads[3 * q + 1] = gradRho0y + gradRho1y; + tempGradRhoQuads[3 * q + 2] = gradRho0z + gradRho1z; } } else { - for (unsigned int q = 0; q < numQuadPoints; ++q) - tempRhoQuads[q] = rhoHost[iElem * numQuadPoints + q]; - + std::memcpy(tempRhoQuads.data(), + rhoHost.data() + iElem * numQuadPoints, + numQuadPoints * sizeof(double)); if (isEvaluateGradRho) - for (unsigned int q = 0; q < numQuadPoints; ++q) - { - tempGradRhoQuads[3 * q] = - gradRhoHost[iElem * numQuadPoints + 
q]; - tempGradRhoQuads[3 * q + 1] = - gradRhoHost[totalLocallyOwnedCells * numQuadPoints + - iElem * numQuadPoints + q]; - tempGradRhoQuads[3 * q + 2] = - gradRhoHost[2 * totalLocallyOwnedCells * numQuadPoints + - iElem * numQuadPoints + q]; - } + std::memcpy(tempGradRhoQuads.data(), + gradRhoHost.data() + iElem * numQuadPoints * 3, + 3 * numQuadPoints * sizeof(double)); } iElem++; } + dftfe::utils::deviceSynchronize(); + MPI_Barrier(mpiCommParent); + computeRho_time = MPI_Wtime() - computeRho_time; + computeRho_time2 = MPI_Wtime() - computeRho_time2; + + if (this_process == 0 && dftParams.verbosity >= 2) + if (memorySpace == dftfe::utils::MemorySpace::HOST) + std::cout << "Time for compute rho on CPU: " << computeRho_time + << std::endl; + else if (memorySpace == dftfe::utils::MemorySpace::DEVICE) + std::cout << "Time for compute rho on Device: " << computeRho_time + << " " << computeRho_time2 << std::endl; } template void @@ -703,63 +688,42 @@ namespace dftfe for (unsigned int iWave = 0; iWave < vecRange.second - vecRange.first; ++iWave) { + const NumberType psi = + wfcQuadPointData[(iCell - cellRange.first) * nQuadsPerCell * + vectorsBlockSize + + iQuad * vectorsBlockSize + iWave]; rho[iCell * nQuadsPerCell + iQuad] += - partialOccupVec[iWave] * - std::abs(wfcQuadPointData[(iCell - cellRange.first) * - nQuadsPerCell * vectorsBlockSize + - iQuad * vectorsBlockSize + iWave]) * - std::abs(wfcQuadPointData[(iCell - cellRange.first) * - nQuadsPerCell * vectorsBlockSize + - iQuad * vectorsBlockSize + iWave]); - } - if (isEvaluateGradRho) - { - for (unsigned int iCell = cellRange.first; iCell < cellRange.second; - ++iCell) - for (unsigned int iQuad = 0; iQuad < nQuadsPerCell; ++iQuad) - for (unsigned int iWave = 0; - iWave < vecRange.second - vecRange.first; - ++iWave) + partialOccupVec[iWave] * std::abs(psi) * std::abs(psi); + if (isEvaluateGradRho) { - gradRho[iCell * nQuadsPerCell + iQuad] += + gradRho[iCell * nQuadsPerCell * 3 + 3 * iQuad] += 2 * 
partialOccupVec[iWave] * dftfe::utils::realPart( - dftfe::utils::complexConj( - wfcQuadPointData[(iCell - cellRange.first) * - nQuadsPerCell * vectorsBlockSize + - iQuad * vectorsBlockSize + iWave]) * + dftfe::utils::complexConj(psi) * gradWfcQuadPointData[(iCell - cellRange.first) * nQuadsPerCell * vectorsBlockSize * 3 + iQuad * vectorsBlockSize + iWave]); - gradRho[nCells * nQuadsPerCell + iCell * nQuadsPerCell + - iQuad] += + gradRho[iCell * nQuadsPerCell * 3 + 3 * iQuad + 1] += 2 * partialOccupVec[iWave] * dftfe::utils::realPart( - dftfe::utils::complexConj( - wfcQuadPointData[(iCell - cellRange.first) * - nQuadsPerCell * vectorsBlockSize + - iQuad * vectorsBlockSize + iWave]) * + dftfe::utils::complexConj(psi) * gradWfcQuadPointData[(iCell - cellRange.first) * nQuadsPerCell * vectorsBlockSize * 3 + nQuadsPerCell * vectorsBlockSize + iQuad * vectorsBlockSize + iWave]); - gradRho[2 * nCells * nQuadsPerCell + iCell * nQuadsPerCell + - iQuad] += + gradRho[iCell * nQuadsPerCell * 3 + 3 * iQuad + 2] += 2 * partialOccupVec[iWave] * dftfe::utils::realPart( - dftfe::utils::complexConj( - wfcQuadPointData[(iCell - cellRange.first) * - nQuadsPerCell * vectorsBlockSize + - iQuad * vectorsBlockSize + iWave]) * + dftfe::utils::complexConj(psi) * gradWfcQuadPointData[(iCell - cellRange.first) * nQuadsPerCell * vectorsBlockSize * 3 + 2 * nQuadsPerCell * vectorsBlockSize + iQuad * vectorsBlockSize + iWave]); } - } + } } #if defined(DFTFE_WITH_DEVICE) template void diff --git a/src/dft/densityCalculatorDeviceKernels.cc b/src/dft/densityCalculatorDeviceKernels.cc index 87a9783c3..8f799b124 100644 --- a/src/dft/densityCalculatorDeviceKernels.cc +++ b/src/dft/densityCalculatorDeviceKernels.cc @@ -33,8 +33,9 @@ namespace dftfe { __global__ void computeRhoGradRhoFromInterpolatedValues( - const unsigned int numberEntries, + const unsigned int numVectors, const unsigned int numCells, + const unsigned int nQuadsPerCell, double * wfcContributions, double * gradwfcContributions, double * 
rhoCellsWfcContributions, @@ -42,7 +43,8 @@ namespace dftfe const bool isEvaluateGradRho) { const unsigned int globalThreadId = blockIdx.x * blockDim.x + threadIdx.x; - const unsigned int numEntriesPerCell = numberEntries / numCells; + const unsigned int numEntriesPerCell = numVectors * nQuadsPerCell; + const unsigned int numberEntries = numEntriesPerCell * numCells; for (unsigned int index = globalThreadId; index < numberEntries; index += blockDim.x * gridDim.x) @@ -52,23 +54,31 @@ namespace dftfe if (isEvaluateGradRho) { - unsigned int iCell = index / numCells; + unsigned int iCell = index / numEntriesPerCell; unsigned int intraCellIndex = index - iCell * numEntriesPerCell; - const double gradPsiX = + unsigned int iQuad = intraCellIndex / numVectors; + unsigned int iVec = intraCellIndex - iQuad * numVectors; + const double gradPsiX = //[iVec * numCells * numVectors + + 0] gradwfcContributions[intraCellIndex + numEntriesPerCell * 3 * iCell]; - gradRhoCellsWfcContributions[index] = 2.0 * psi * gradPsiX; + gradRhoCellsWfcContributions[iVec + 3 * iQuad * numVectors + + numEntriesPerCell * 3 * iCell] = + 2.0 * psi * gradPsiX; const double gradPsiY = gradwfcContributions[intraCellIndex + numEntriesPerCell + numEntriesPerCell * 3 * iCell]; - gradRhoCellsWfcContributions[index + numberEntries] = + gradRhoCellsWfcContributions[iVec + numVectors + + 3 * iQuad * numVectors + + numEntriesPerCell * 3 * iCell] = 2.0 * psi * gradPsiY; const double gradPsiZ = gradwfcContributions[intraCellIndex + 2 * numEntriesPerCell + numEntriesPerCell * 3 * iCell]; - gradRhoCellsWfcContributions[index + 2 * numberEntries] = + gradRhoCellsWfcContributions[iVec + 2 * numVectors + + 3 * iQuad * numVectors + + numEntriesPerCell * 3 * iCell] = 2.0 * psi * gradPsiZ; } } @@ -76,8 +86,9 @@ namespace dftfe __global__ void computeRhoGradRhoFromInterpolatedValues( - const unsigned int numberEntries, + const unsigned int numVectors, const unsigned int numCells, + const unsigned int nQuadsPerCell, 
dftfe::utils::deviceDoubleComplex *wfcContributions, dftfe::utils::deviceDoubleComplex *gradwfcContributions, double * rhoCellsWfcContributions, @@ -85,7 +96,8 @@ namespace dftfe const bool isEvaluateGradRho) { const unsigned int globalThreadId = blockIdx.x * blockDim.x + threadIdx.x; - const unsigned int numEntriesPerCell = numberEntries / numCells; + const unsigned int numEntriesPerCell = numVectors * nQuadsPerCell; + const unsigned int numberEntries = numEntriesPerCell * numCells; for (unsigned int index = globalThreadId; index < numberEntries; index += blockDim.x * gridDim.x) @@ -95,24 +107,31 @@ namespace dftfe if (isEvaluateGradRho) { - unsigned int iCell = index / numCells; + unsigned int iCell = index / numEntriesPerCell; unsigned int intraCellIndex = index - iCell * numEntriesPerCell; + unsigned int iQuad = intraCellIndex / numVectors; + unsigned int iVec = intraCellIndex - iQuad * numVectors; const dftfe::utils::deviceDoubleComplex gradPsiX = gradwfcContributions[intraCellIndex + numEntriesPerCell * 3 * iCell]; - gradRhoCellsWfcContributions[index] = + gradRhoCellsWfcContributions[iVec + 3 * iQuad * numVectors + + numEntriesPerCell * 3 * iCell] = 2.0 * (psi.x * gradPsiX.x + psi.y * gradPsiX.y); const dftfe::utils::deviceDoubleComplex gradPsiY = gradwfcContributions[intraCellIndex + numEntriesPerCell + numEntriesPerCell * 3 * iCell]; - gradRhoCellsWfcContributions[index + numberEntries] = + gradRhoCellsWfcContributions[iVec + numVectors + + 3 * iQuad * numVectors + + numEntriesPerCell * 3 * iCell] = 2.0 * (psi.x * gradPsiY.x + psi.y * gradPsiY.y); const dftfe::utils::deviceDoubleComplex gradPsiZ = gradwfcContributions[intraCellIndex + 2 * numEntriesPerCell + numEntriesPerCell * 3 * iCell]; - gradRhoCellsWfcContributions[index + 2 * numberEntries] = + gradRhoCellsWfcContributions[iVec + 2 * numVectors + + 3 * iQuad * numVectors + + numEntriesPerCell * 3 * iCell] = 2.0 * (psi.x * gradPsiZ.x + psi.y * gradPsiZ.y); } } @@ -150,8 +169,9 @@ namespace dftfe 
(vectorsBlockSize + (dftfe::utils::DEVICE_BLOCK_SIZE - 1)) / dftfe::utils::DEVICE_BLOCK_SIZE * nQuadsPerCell * cellsBlockSize, dftfe::utils::DEVICE_BLOCK_SIZE>>>( - cellsBlockSize * nQuadsPerCell * vectorsBlockSize, + vectorsBlockSize, cellsBlockSize, + nQuadsPerCell, dftfe::utils::makeDataTypeDeviceCompatible(wfcQuadPointData), dftfe::utils::makeDataTypeDeviceCompatible(gradWfcQuadPointData), dftfe::utils::makeDataTypeDeviceCompatible(rhoCellsWfcContributions), @@ -167,24 +187,23 @@ namespace dftfe 0, cellsBlockSize * nQuadsPerCell * vectorsBlockSize, cellsBlockSize, + nQuadsPerCell, dftfe::utils::makeDataTypeDeviceCompatible(wfcQuadPointData), dftfe::utils::makeDataTypeDeviceCompatible(gradWfcQuadPointData), dftfe::utils::makeDataTypeDeviceCompatible(rhoCellsWfcContributions), dftfe::utils::makeDataTypeDeviceCompatible(gradRhoCellsWfcContributions), isEvaluateGradRho); #endif - dftfe::utils::deviceBlasWrapper::gemm( + dftfe::utils::deviceBlasWrapper::gemv( basisOperationsPtr->getDeviceBLASHandle(), - dftfe::utils::DEVICEBLAS_OP_N, - dftfe::utils::DEVICEBLAS_OP_N, - 1, - cellsBlockSize * nQuadsPerCell, + dftfe::utils::DEVICEBLAS_OP_T, vectorsBlockSize, + cellsBlockSize * nQuadsPerCell, &scalarCoeffAlphaRho, - partialOccupVec, - 1, rhoCellsWfcContributions, vectorsBlockSize, + partialOccupVec, + 1, &scalarCoeffBetaRho, rho + cellRange.first * nQuadsPerCell, 1); @@ -192,20 +211,18 @@ namespace dftfe if (isEvaluateGradRho) { - dftfe::utils::deviceBlasWrapper::gemm( + dftfe::utils::deviceBlasWrapper::gemv( basisOperationsPtr->getDeviceBLASHandle(), - dftfe::utils::DEVICEBLAS_OP_N, - dftfe::utils::DEVICEBLAS_OP_N, - 1, - cellsBlockSize * nQuadsPerCell * 3, + dftfe::utils::DEVICEBLAS_OP_T, vectorsBlockSize, + cellsBlockSize * nQuadsPerCell * 3, &scalarCoeffAlphaGradRho, - partialOccupVec, - 1, gradRhoCellsWfcContributions, vectorsBlockSize, + partialOccupVec, + 1, &scalarCoeffBetaGradRho, - gradRho + cellRange.first * nQuadsPerCell, + gradRho + cellRange.first * 
nQuadsPerCell * 3, 1); } } diff --git a/utils/DeviceBlasWrapper.cu.cc b/utils/DeviceBlasWrapper.cu.cc index 6ed34f28a..dede70848 100644 --- a/utils/DeviceBlasWrapper.cu.cc +++ b/utils/DeviceBlasWrapper.cu.cc @@ -474,6 +474,108 @@ namespace dftfe return status; } + deviceBlasStatus_t + gemv(deviceBlasHandle_t handle, + deviceBlasOperation_t trans, + int m, + int n, + const double * alpha, + const double * A, + int lda, + const double * x, + int incx, + const double * beta, + double * y, + int incy) + { + deviceBlasStatus_t status = cublasDgemv( + handle, trans, m, n, alpha, A, lda, x, incx, beta, y, incy); + DEVICEBLAS_API_CHECK(status); + return status; + } + + deviceBlasStatus_t + gemv(deviceBlasHandle_t handle, + deviceBlasOperation_t trans, + int m, + int n, + const float * alpha, + const float * A, + int lda, + const float * x, + int incx, + const float * beta, + float * y, + int incy) + { + deviceBlasStatus_t status = cublasSgemv( + handle, trans, m, n, alpha, A, lda, x, incx, beta, y, incy); + DEVICEBLAS_API_CHECK(status); + return status; + } + + deviceBlasStatus_t + gemv(deviceBlasHandle_t handle, + deviceBlasOperation_t trans, + int m, + int n, + const std::complex *alpha, + const std::complex *A, + int lda, + const std::complex *x, + int incx, + const std::complex *beta, + std::complex * y, + int incy) + { + deviceBlasStatus_t status = + cublasZgemv(handle, + trans, + m, + n, + dftfe::utils::makeDataTypeDeviceCompatible(alpha), + dftfe::utils::makeDataTypeDeviceCompatible(A), + lda, + dftfe::utils::makeDataTypeDeviceCompatible(x), + incx, + dftfe::utils::makeDataTypeDeviceCompatible(beta), + dftfe::utils::makeDataTypeDeviceCompatible(y), + incy); + DEVICEBLAS_API_CHECK(status); + return status; + } + + deviceBlasStatus_t + gemv(deviceBlasHandle_t handle, + deviceBlasOperation_t trans, + int m, + int n, + const std::complex *alpha, + const std::complex *A, + int lda, + const std::complex *x, + int incx, + const std::complex *beta, + std::complex * y, + int 
incy) + { + deviceBlasStatus_t status = + cublasCgemv(handle, + trans, + m, + n, + dftfe::utils::makeDataTypeDeviceCompatible(alpha), + dftfe::utils::makeDataTypeDeviceCompatible(A), + lda, + dftfe::utils::makeDataTypeDeviceCompatible(x), + incx, + dftfe::utils::makeDataTypeDeviceCompatible(beta), + dftfe::utils::makeDataTypeDeviceCompatible(y), + incy); + DEVICEBLAS_API_CHECK(status); + return status; + } + } // namespace deviceBlasWrapper } // namespace utils } // namespace dftfe diff --git a/utils/FEBasisOperations.cc b/utils/FEBasisOperations.cc index f6934aab4..6d2dce026 100644 --- a/utils/FEBasisOperations.cc +++ b/utils/FEBasisOperations.cc @@ -67,41 +67,79 @@ namespace dftfe void FEBasisOperationsBase::reinit(const unsigned int &blockSize, + memorySpace>::reinit(const unsigned int &vecBlockSize, + const unsigned int + &cellsBlockSize, const unsigned int &dofHandlerID, const unsigned int &quadratureID, const UpdateFlags updateFlags) { if ((d_dofHandlerID != dofHandlerID) || (d_updateFlags != updateFlags)) { - d_dofHandlerID = dofHandlerID; - d_quadratureID = quadratureID; - d_nVectors = blockSize; - d_updateFlags = updateFlags; + d_dofHandlerID = dofHandlerID; + d_quadratureID = quadratureID; + d_nVectors = vecBlockSize; + d_cellsBlockSize = cellsBlockSize; + d_updateFlags = updateFlags; initializeIndexMaps(); initializeConstraints(); initializeShapeFunctionAndJacobianData(); initializeFlattenedIndexMaps(); + resizeTempStorage(); } - else if ((d_quadratureID != quadratureID) && (d_nVectors != blockSize)) + else if ((d_quadratureID != quadratureID) && (d_nVectors != vecBlockSize)) { - d_quadratureID = quadratureID; - d_nVectors = blockSize; + d_quadratureID = quadratureID; + d_nVectors = vecBlockSize; + d_cellsBlockSize = cellsBlockSize; initializeConstraints(); initializeShapeFunctionAndJacobianData(); initializeFlattenedIndexMaps(); + resizeTempStorage(); } else if (d_quadratureID != quadratureID) { - d_quadratureID = quadratureID; + d_quadratureID = 
quadratureID; + d_cellsBlockSize = cellsBlockSize; initializeShapeFunctionAndJacobianData(); + resizeTempStorage(); } - else if (d_nVectors != blockSize) + else if (d_nVectors != vecBlockSize) { - d_nVectors = blockSize; + d_nVectors = vecBlockSize; + d_cellsBlockSize = cellsBlockSize; initializeConstraints(); initializeFlattenedIndexMaps(); + resizeTempStorage(); } + else if (d_cellsBlockSize != cellsBlockSize) + { + d_cellsBlockSize = cellsBlockSize; + resizeTempStorage(); + } + } + + template + void + FEBasisOperationsBase::resizeTempStorage() + { + tempCellNodalData.resize(d_nVectors * d_nDofsPerCell * d_cellsBlockSize); + + if (d_updateFlags & update_gradients) + tempQuadratureGradientsData.resize( + areAllCellsCartesian ? + 0 : + (d_nVectors * d_nQuadsPerCell * 3 * d_cellsBlockSize)); + + if (d_updateFlags & update_gradients) + tempQuadratureGradientsDataNonAffine.resize( + areAllCellsAffine ? + 0 : + (d_nVectors * d_nQuadsPerCell * 3 * d_cellsBlockSize)); } template cellRange) const { - dftfe::utils::MemoryStorage - cellNodalData, tempQuadratureGradientsData, - tempQuadratureGradientsDataNonAffine; - cellNodalData.resize(d_nVectors * d_nDofsPerCell * - (cellRange.second - cellRange.first)); - - if (quadratureGradients != NULL) - tempQuadratureGradientsData.resize( - areAllCellsCartesian ? 0 : - (d_nVectors * d_nQuadsPerCell * 3 * - (cellRange.second - cellRange.first))); - - if (quadratureGradients != NULL) - tempQuadratureGradientsDataNonAffine.resize( - areAllCellsAffine ? 
0 : - (d_nVectors * d_nQuadsPerCell * 3 * - (cellRange.second - cellRange.first))); - - extractToCellNodalDataKernel(nodalValues, &cellNodalData, cellRange); + extractToCellNodalDataKernel(nodalValues, &tempCellNodalData, cellRange); const ValueTypeBasisCoeff scalarCoeffAlpha = ValueTypeBasisCoeff(1.0), scalarCoeffBeta = ValueTypeBasisCoeff(0.0); @@ -261,7 +242,7 @@ namespace dftfe d_nQuadsPerCell, d_nDofsPerCell, &scalarCoeffAlpha, - cellNodalData.data(), + tempCellNodalData.data(), d_nVectors, d_nVectors * d_nDofsPerCell, d_shapeFunctionData.data(), @@ -282,7 +263,7 @@ namespace dftfe d_nQuadsPerCell * 3, d_nDofsPerCell, &scalarCoeffAlpha, - cellNodalData.data(), + tempCellNodalData.data(), d_nVectors, d_nVectors * d_nDofsPerCell, d_shapeFunctionGradientData.data(), From 4de2b29f3a9fdce3295f7f139e045a1e845abfb3 Mon Sep 17 00:00:00 2001 From: Nikhil Kodali Date: Tue, 26 Sep 2023 11:51:50 +0530 Subject: [PATCH 06/25] Remove std::map functions, use valueType pointers --- include/FEBasisOperations.h | 151 +++++----------------------- src/dft/densityCalculator.cc | 8 +- utils/FEBasisOperationsDevice.cc | 164 ++++--------------------------- utils/FEBasisOperationsHost.cc | 154 +++++------------------------ 4 files changed, 74 insertions(+), 403 deletions(-) diff --git a/include/FEBasisOperations.h b/include/FEBasisOperations.h index 30c7fcff1..dc4bb437a 100644 --- a/include/FEBasisOperations.h +++ b/include/FEBasisOperations.h @@ -245,19 +245,6 @@ namespace dftfe ValueTypeBasisCoeff, ValueTypeBasisData, dftfe::utils::MemorySpace::HOST>::d_constraintsVector; - void - interpolate( - dftfe::linearAlgebra::MultiVector - &nodalData, - std::map> - *quadratureValues, - std::map> - *quadratureGradients = NULL) const; void @@ -265,36 +252,14 @@ namespace dftfe dftfe::linearAlgebra::MultiVector &nodalData, - dftfe::utils::MemoryStorage - *quadratureValues, - dftfe::utils::MemoryStorage - *quadratureGradients = NULL) const; - - void - integrateWithBasis( - std::map> - 
*quadratureValues, - std::map> - *quadratureGradients, - dftfe::linearAlgebra::MultiVector - &nodalData) const; + ValueTypeBasisCoeff *quadratureValues, + ValueTypeBasisCoeff *quadratureGradients = NULL) const; void integrateWithBasis( - dftfe::utils::MemoryStorage - *quadratureValues, - dftfe::utils::MemoryStorage - *quadratureGradients, + ValueTypeBasisCoeff *quadratureValues, + ValueTypeBasisCoeff *quadratureGradients, dftfe::linearAlgebra::MultiVector &nodalData) const; @@ -304,15 +269,11 @@ namespace dftfe dftfe::linearAlgebra::MultiVector &nodalData, - dftfe::utils::MemoryStorage - *cellNodalDataPtr) const; + ValueTypeBasisCoeff *cellNodalDataPtr) const; void accumulateFromCellNodalData( - const dftfe::utils::MemoryStorage - *cellNodalDataPtr, + const ValueTypeBasisCoeff *cellNodalDataPtr, dftfe::linearAlgebra::MultiVector &nodalData) const; @@ -322,22 +283,14 @@ namespace dftfe const dftfe::linearAlgebra::MultiVector &nodalData, - dftfe::utils::MemoryStorage - *quadratureValues, - dftfe::utils::MemoryStorage - * quadratureGradients, + ValueTypeBasisCoeff *quadratureValues, + ValueTypeBasisCoeff *quadratureGradients, const std::pair cellRange) const; void integrateWithBasisKernel( - const dftfe::utils::MemoryStorage - *quadratureValues, - const dftfe::utils::MemoryStorage - *quadratureGradients, + const ValueTypeBasisCoeff *quadratureValues, + const ValueTypeBasisCoeff *quadratureGradients, dftfe::linearAlgebra::MultiVector & nodalData, @@ -349,16 +302,12 @@ namespace dftfe const dftfe::linearAlgebra::MultiVector &nodalData, - dftfe::utils::MemoryStorage - * cellNodalDataPtr, + ValueTypeBasisCoeff *cellNodalDataPtr, const std::pair cellRange) const; void accumulateFromCellNodalDataKernel( - const dftfe::utils::MemoryStorage - *cellNodalDataPtr, + const ValueTypeBasisCoeff *cellNodalDataPtr, dftfe::linearAlgebra::MultiVector & nodalData, @@ -466,56 +415,20 @@ namespace dftfe getDeviceBLASHandle(); - void - interpolate( - dftfe::linearAlgebra::MultiVector - 
&nodalData, - std::map> - *quadratureValues, - std::map> - *quadratureGradients = NULL) const; - void interpolate( dftfe::linearAlgebra::MultiVector &nodalData, - dftfe::utils::MemoryStorage - *quadratureValues, - dftfe::utils::MemoryStorage - *quadratureGradients = NULL) const; - - void - integrateWithBasis( - std::map> - *quadratureValues, - std::map> - *quadratureGradients, - dftfe::linearAlgebra::MultiVector - &nodalData) const; + ValueTypeBasisCoeff *quadratureValues, + ValueTypeBasisCoeff *quadratureGradients = NULL) const; void integrateWithBasis( - dftfe::utils::MemoryStorage - *quadratureValues, - dftfe::utils::MemoryStorage - *quadratureGradients, + ValueTypeBasisCoeff *quadratureValues, + ValueTypeBasisCoeff *quadratureGradients, dftfe::linearAlgebra::MultiVector &nodalData) const; @@ -525,15 +438,11 @@ namespace dftfe dftfe::linearAlgebra::MultiVector &nodalData, - dftfe::utils::MemoryStorage - *cellNodalDataPtr) const; + ValueTypeBasisCoeff *cellNodalDataPtr) const; void accumulateFromCellNodalData( - const dftfe::utils::MemoryStorage - *cellNodalDataPtr, + const ValueTypeBasisCoeff *cellNodalDataPtr, dftfe::linearAlgebra::MultiVector &nodalData) const; @@ -543,22 +452,14 @@ namespace dftfe const dftfe::linearAlgebra::MultiVector< ValueTypeBasisCoeff, dftfe::utils::MemorySpace::DEVICE> &nodalData, - dftfe::utils::MemoryStorage - *quadratureValues, - dftfe::utils::MemoryStorage - * quadratureGradients, + ValueTypeBasisCoeff *quadratureValues, + ValueTypeBasisCoeff *quadratureGradients, const std::pair cellRange) const; void integrateWithBasisKernel( - const dftfe::utils::MemoryStorage - *quadratureValues, - const dftfe::utils::MemoryStorage - *quadratureGradients, + const ValueTypeBasisCoeff *quadratureValues, + const ValueTypeBasisCoeff *quadratureGradients, dftfe::linearAlgebra::MultiVector & nodalData, @@ -570,16 +471,12 @@ namespace dftfe const dftfe::linearAlgebra::MultiVector< ValueTypeBasisCoeff, dftfe::utils::MemorySpace::DEVICE> &nodalData, - 
dftfe::utils::MemoryStorage - * cellNodalDataPtr, + ValueTypeBasisCoeff *cellNodalDataPtr, const std::pair cellRange) const; void accumulateFromCellNodalDataKernel( - const dftfe::utils::MemoryStorage - *cellNodalDataPtr, + const ValueTypeBasisCoeff *cellNodalDataPtr, dftfe::linearAlgebra::MultiVector & nodalData, diff --git a/src/dft/densityCalculator.cc b/src/dft/densityCalculator.cc index 664d5d79f..563152a18 100644 --- a/src/dft/densityCalculator.cc +++ b/src/dft/densityCalculator.cc @@ -319,9 +319,9 @@ namespace dftfe ++spinIndex) basisOperationsPtr->interpolateKernel( flattenedArrayBlock[spinIndex], - &wfcQuadPointData[spinIndex], + wfcQuadPointData[spinIndex].data(), isEvaluateGradRho ? - &gradWfcQuadPointData[spinIndex] : + gradWfcQuadPointData[spinIndex].data() : NULL, std::pair( startingCellId, @@ -493,9 +493,9 @@ namespace dftfe ++spinIndex) basisOperationsPtr->interpolateKernel( flattenedArrayBlock[spinIndex], - &wfcQuadPointData[spinIndex], + wfcQuadPointData[spinIndex].data(), isEvaluateGradRho ? 
- &gradWfcQuadPointData[spinIndex] : + gradWfcQuadPointData[spinIndex].data() : NULL, std::pair( startingCellId, diff --git a/utils/FEBasisOperationsDevice.cc b/utils/FEBasisOperationsDevice.cc index 267a52331..2a1219652 100644 --- a/utils/FEBasisOperationsDevice.cc +++ b/utils/FEBasisOperationsDevice.cc @@ -25,56 +25,6 @@ namespace dftfe { namespace basis { - template - void - FEBasisOperations:: - interpolate( - dftfe::linearAlgebra::MultiVector - &nodalData, - std::map> - *quadratureValues, - std::map> - *quadratureGradients) const - { - dftfe::utils::MemoryStorage - quadratureValuesAllCells; - dftfe::utils::MemoryStorage - quadratureGradientsAllCells; - quadratureValuesAllCells.resize(d_nCells * d_nQuadsPerCell * d_nVectors); - quadratureGradientsAllCells.resize(d_nCells * 3 * d_nQuadsPerCell * - d_nVectors); - - for (unsigned int iCell = 0; iCell < d_nCells; ++iCell) - { - dealii::CellId currentCellId = d_cellIndexToCellIdMap[iCell]; - quadratureValuesAllCells.copyFrom(quadratureValues->at(currentCellId), - d_nQuadsPerCell * d_nVectors, - 0, - d_nVectors * d_nQuadsPerCell * - iCell); - if (quadratureGradients != NULL) - quadratureGradientsAllCells.copyFrom( - quadratureGradients->at(currentCellId), - d_nQuadsPerCell * d_nVectors * 3, - 0, - d_nVectors * d_nQuadsPerCell * 3 * iCell); - } - interpolateKernel(nodalData, - &quadratureValuesAllCells, - quadratureGradients == NULL ? 
- NULL : - &quadratureGradientsAllCells, - std::pair(0, d_nCells)); - } template void @@ -85,12 +35,8 @@ namespace dftfe dftfe::linearAlgebra::MultiVector &nodalData, - dftfe::utils::MemoryStorage - *quadratureValues, - dftfe::utils::MemoryStorage - *quadratureGradients) const + ValueTypeBasisCoeff *quadratureValues, + ValueTypeBasisCoeff *quadratureGradients) const { interpolateKernel(nodalData, quadratureValues, @@ -104,62 +50,8 @@ namespace dftfe ValueTypeBasisData, dftfe::utils::MemorySpace::DEVICE>:: integrateWithBasis( - std::map> - *quadratureValues, - std::map> - *quadratureGradients, - dftfe::linearAlgebra::MultiVector - &nodalData) const - { - dftfe::utils::MemoryStorage - quadratureValuesAllCells; - dftfe::utils::MemoryStorage - quadratureGradientsAllCells; - quadratureValuesAllCells.resize(d_nCells * d_nQuadsPerCell * d_nVectors); - quadratureGradientsAllCells.resize(d_nCells * 3 * d_nQuadsPerCell * - d_nVectors); - - for (unsigned int iCell = 0; iCell < d_nCells; ++iCell) - { - dealii::CellId currentCellId = d_cellIndexToCellIdMap[iCell]; - quadratureValuesAllCells.copyFrom(quadratureValues->at(currentCellId), - d_nQuadsPerCell * d_nVectors, - 0, - d_nVectors * d_nQuadsPerCell * - iCell); - if (quadratureGradients != NULL) - quadratureGradientsAllCells.copyFrom( - quadratureGradients->at(currentCellId), - d_nQuadsPerCell * d_nVectors * 3, - 0, - d_nVectors * d_nQuadsPerCell * 3 * iCell); - } - integrateWithBasisKernel( - &quadratureValuesAllCells, - quadratureGradients == NULL ? 
NULL : &quadratureGradientsAllCells, - nodalData, - std::pair(0, d_nCells)); - } - - template - void - FEBasisOperations:: - integrateWithBasis( - dftfe::utils::MemoryStorage - *quadratureValues, - dftfe::utils::MemoryStorage - *quadratureGradients, + ValueTypeBasisCoeff *quadratureValues, + ValueTypeBasisCoeff *quadratureGradients, dftfe::linearAlgebra::MultiVector &nodalData) const @@ -181,9 +73,7 @@ namespace dftfe dftfe::linearAlgebra::MultiVector &nodalData, - dftfe::utils::MemoryStorage - *cellNodalDataPtr) const + ValueTypeBasisCoeff *cellNodalDataPtr) const { extractToCellNodalDataKernel( nodalData, @@ -197,9 +87,7 @@ namespace dftfe ValueTypeBasisData, dftfe::utils::MemorySpace::DEVICE>:: accumulateFromCellNodalData( - const dftfe::utils::MemoryStorage - *cellNodalDataPtr, + const ValueTypeBasisCoeff *cellNodalDataPtr, dftfe::linearAlgebra::MultiVector &nodalData) const @@ -221,15 +109,11 @@ namespace dftfe const dftfe::linearAlgebra::MultiVector< ValueTypeBasisCoeff, dftfe::utils::MemorySpace::DEVICE> &nodalValues, - dftfe::utils::MemoryStorage - *quadratureValues, - dftfe::utils::MemoryStorage - * quadratureGradients, + ValueTypeBasisCoeff *quadratureValues, + ValueTypeBasisCoeff *quadratureGradients, const std::pair cellRange) const { - extractToCellNodalDataKernel(nodalValues, &tempCellNodalData, cellRange); + extractToCellNodalDataKernel(nodalValues, tempCellNodalData.data(), cellRange); const ValueTypeBasisCoeff scalarCoeffAlpha = ValueTypeBasisCoeff(1.0), scalarCoeffBeta = ValueTypeBasisCoeff(0.0); @@ -249,7 +133,7 @@ namespace dftfe d_nDofsPerCell, 0, &scalarCoeffBeta, - quadratureValues->data(), + quadratureValues, d_nVectors, d_nVectors * d_nQuadsPerCell, cellRange.second - cellRange.first); @@ -270,7 +154,7 @@ namespace dftfe d_nDofsPerCell, 0, &scalarCoeffBeta, - areAllCellsCartesian ? quadratureGradients->data() : + areAllCellsCartesian ? 
quadratureGradients : tempQuadratureGradientsData.data(), d_nVectors, d_nVectors * d_nQuadsPerCell * 3, @@ -282,7 +166,7 @@ namespace dftfe 3 * (cellRange.second - cellRange.first), ValueTypeBasisCoeff(1.0), d_inverseJacobianData.data() + cellRange.first * 3, - quadratureGradients->data()); + quadratureGradients); } else if (areAllCellsAffine) { @@ -301,7 +185,7 @@ namespace dftfe 3, 9, &scalarCoeffBeta, - quadratureGradients->data(), + quadratureGradients, d_nQuadsPerCell * d_nVectors, d_nVectors * d_nQuadsPerCell * 3, cellRange.second - cellRange.first); @@ -332,7 +216,7 @@ namespace dftfe d_nVectors, (cellRange.second - cellRange.first) * d_nQuadsPerCell * 3, tempQuadratureGradientsDataNonAffine.data(), - quadratureGradients->data(), + quadratureGradients, d_nonAffineReshapeIDs.data() + cellRange.first * d_nDofsPerCell); } @@ -345,12 +229,8 @@ namespace dftfe ValueTypeBasisData, dftfe::utils::MemorySpace::DEVICE>:: integrateWithBasisKernel( - const dftfe::utils::MemoryStorage - *quadratureValues, - const dftfe::utils::MemoryStorage - *quadratureGradients, + const ValueTypeBasisCoeff *quadratureValues, + const ValueTypeBasisCoeff *quadratureGradients, dftfe::linearAlgebra::MultiVector & nodalData, @@ -366,16 +246,14 @@ namespace dftfe const dftfe::linearAlgebra::MultiVector< ValueTypeBasisCoeff, dftfe::utils::MemorySpace::DEVICE> &nodalData, - dftfe::utils::MemoryStorage - * cellNodalDataPtr, + ValueTypeBasisCoeff *cellNodalDataPtr, const std::pair cellRange) const { dftfe::utils::deviceKernelsGeneric::stridedCopyToBlock( d_nVectors, (cellRange.second - cellRange.first) * d_nDofsPerCell, nodalData.data(), - cellNodalDataPtr->data(), + cellNodalDataPtr, d_flattenedCellDofIndexToProcessDofIndexMap.data() + cellRange.first * d_nDofsPerCell); } @@ -386,9 +264,7 @@ namespace dftfe ValueTypeBasisData, dftfe::utils::MemorySpace::DEVICE>:: accumulateFromCellNodalDataKernel( - const dftfe::utils::MemoryStorage - *cellNodalDataPtr, + const ValueTypeBasisCoeff 
*cellNodalDataPtr, dftfe::linearAlgebra::MultiVector & nodalData, @@ -397,7 +273,7 @@ namespace dftfe dftfe::utils::deviceKernelsGeneric::axpyStridedBlockAtomicAdd( d_nVectors, (cellRange.second - cellRange.first) * d_nDofsPerCell, - cellNodalDataPtr->begin(), + cellNodalDataPtr, nodalData.begin(), d_flattenedCellDofIndexToProcessDofIndexMap.begin() + cellRange.first * d_nDofsPerCell); diff --git a/utils/FEBasisOperationsHost.cc b/utils/FEBasisOperationsHost.cc index fd91781f4..2d67db86a 100644 --- a/utils/FEBasisOperationsHost.cc +++ b/utils/FEBasisOperationsHost.cc @@ -30,52 +30,8 @@ namespace dftfe dftfe::linearAlgebra::MultiVector &nodalData, - std::map> - *quadratureValues, - std::map> - *quadratureGradients) const - { - dftfe::utils::MemoryStorage - *quadratureValuesCurrentCell; - dftfe::utils::MemoryStorage - *quadratureGradientsCurrentCell; - - for (unsigned int iCell = 0; iCell < d_nCells; ++iCell) - { - dealii::CellId currentCellId = d_cellIndexToCellIdMap[iCell]; - quadratureValuesCurrentCell = &(quadratureValues->at(currentCellId)); - quadratureGradientsCurrentCell = - quadratureGradients ? 
&(quadratureGradients->at(currentCellId)) : - NULL; - interpolateKernel(nodalData, - quadratureValuesCurrentCell, - quadratureGradientsCurrentCell, - std::pair(iCell, - iCell + 1)); - } - } - - template - void - FEBasisOperations:: - interpolate( - dftfe::linearAlgebra::MultiVector - &nodalData, - dftfe::utils::MemoryStorage - *quadratureValues, - dftfe::utils::MemoryStorage - *quadratureGradients) const + ValueTypeBasisCoeff *quadratureValues, + ValueTypeBasisCoeff *quadratureGradients) const { interpolateKernel(nodalData, quadratureValues, @@ -89,50 +45,8 @@ namespace dftfe ValueTypeBasisData, dftfe::utils::MemorySpace::HOST>:: integrateWithBasis( - std::map> - *quadratureValues, - std::map> - *quadratureGradients, - dftfe::linearAlgebra::MultiVector - &nodalData) const - { - for (unsigned int iCell = 0; iCell < d_nCells; ++iCell) - { - dealii::CellId currentCellId = d_cellIndexToCellIdMap[iCell]; - const dftfe::utils::MemoryStorage - *quadratureValuesCurrentCell = - &(quadratureValues->at(currentCellId)); - const dftfe::utils::MemoryStorage - *quadratureGradientsCurrentCell = - quadratureGradients ? 
&(quadratureGradients->at(currentCellId)) : - NULL; - integrateWithBasisKernel( - quadratureValuesCurrentCell, - quadratureGradientsCurrentCell, - nodalData, - std::pair(iCell, iCell + 1)); - } - } - - template - void - FEBasisOperations:: - integrateWithBasis( - dftfe::utils::MemoryStorage - *quadratureValues, - dftfe::utils::MemoryStorage - *quadratureGradients, + ValueTypeBasisCoeff *quadratureValues, + ValueTypeBasisCoeff *quadratureGradients, dftfe::linearAlgebra::MultiVector &nodalData) const @@ -154,9 +68,7 @@ namespace dftfe dftfe::linearAlgebra::MultiVector &nodalData, - dftfe::utils::MemoryStorage - *cellNodalDataPtr) const + ValueTypeBasisCoeff *cellNodalDataPtr) const { extractToCellNodalDataKernel( nodalData, @@ -170,9 +82,7 @@ namespace dftfe ValueTypeBasisData, dftfe::utils::MemorySpace::HOST>:: accumulateFromCellNodalData( - const dftfe::utils::MemoryStorage - *cellNodalDataPtr, + const ValueTypeBasisCoeff *cellNodalDataPtr, dftfe::linearAlgebra::MultiVector &nodalData) const @@ -191,12 +101,8 @@ namespace dftfe const dftfe::linearAlgebra::MultiVector &nodalValues, - dftfe::utils::MemoryStorage - *quadratureValues, - dftfe::utils::MemoryStorage - * quadratureGradients, + ValueTypeBasisCoeff *quadratureValues, + ValueTypeBasisCoeff *quadratureGradients, const std::pair cellRange) const { dftfe::utils::MemoryStorage(iCell, iCell + 1)); const ValueTypeBasisCoeff scalarCoeffAlpha = ValueTypeBasisCoeff(1.0), scalarCoeffBeta = ValueTypeBasisCoeff(0.0); @@ -237,7 +143,7 @@ namespace dftfe d_shapeFunctionData.data(), &d_nDofsPerCell, &scalarCoeffBeta, - quadratureValues->data() + + quadratureValues + d_nQuadsPerCell * (iCell - cellRange.first) * d_nVectors, &d_nVectors); if (quadratureGradients != NULL) @@ -256,7 +162,7 @@ namespace dftfe d_shapeFunctionGradientData.data(), &d_nDofsPerCell, &scalarCoeffBeta, - areAllCellsCartesian ? (quadratureGradients->data() + + areAllCellsCartesian ? 
(quadratureGradients + d_nQuadsPerCell * d_nVectors * 3 * (iCell - cellRange.first)) : (tempQuadratureGradientsData.data()), @@ -269,7 +175,7 @@ namespace dftfe for (unsigned int iDim = 0; iDim < 3; ++iDim) xscal(&d_nQuadsPerCellTimesnVectors, d_inverseJacobianData.data() + 3 * iCell + iDim, - quadratureGradients->data() + + quadratureGradients + d_nQuadsPerCell * d_nVectors * 3 * (iCell - cellRange.first) + d_nQuadsPerCell * d_nVectors * iDim, @@ -291,7 +197,7 @@ namespace dftfe d_inverseJacobianData.data() + 9 * iCell, &three, &scalarCoeffBeta, - quadratureGradients->data() + + quadratureGradients + d_nQuadsPerCell * d_nVectors * 3 * (iCell - cellRange.first), &d_nQuadsPerCellTimesnVectors); @@ -318,7 +224,7 @@ namespace dftfe &d_nVectors); for (unsigned int iQuad = 0; iQuad < d_nQuadsPerCell; ++iQuad) for (unsigned int iDim = 0; iDim < 3; ++iDim) - std::memcpy(quadratureGradients->data() + + std::memcpy(quadratureGradients + d_nVectors * 3 * d_nQuadsPerCell * (iCell - cellRange.first) + d_nVectors * d_nQuadsPerCell * iDim + @@ -337,12 +243,8 @@ namespace dftfe ValueTypeBasisData, dftfe::utils::MemorySpace::HOST>:: integrateWithBasisKernel( - const dftfe::utils::MemoryStorage - *quadratureValues, - const dftfe::utils::MemoryStorage - *quadratureGradients, + const ValueTypeBasisCoeff *quadratureValues, + const ValueTypeBasisCoeff *quadratureGradients, dftfe::linearAlgebra::MultiVector & nodalData, @@ -375,7 +277,7 @@ namespace dftfe &d_nDofsPerCell, &d_nQuadsPerCell, &scalarCoeffAlpha, - quadratureValues->data() + d_nQuadsPerCell * iCell, + quadratureValues + d_nQuadsPerCell * iCell, &d_nVectors, d_shapeFunctionData.data(), &d_nQuadsPerCell, @@ -390,7 +292,7 @@ namespace dftfe d_nQuadsPerCell * d_nVectors; const unsigned int one = 1; std::memcpy(tempQuadratureGradientsData.data(), - quadratureGradients->data() + + quadratureGradients + d_nQuadsPerCell * d_nVectors * 3 * iCell, 3 * d_nQuadsPerCellTimesnVectors * sizeof(ValueTypeBasisCoeff)); @@ -412,7 +314,7 @@ 
namespace dftfe &three, &three, &scalarCoeffAlpha, - quadratureGradients->data() + + quadratureGradients + d_nQuadsPerCell * d_nVectors * 3 * iCell, &d_nQuadsPerCellTimesnVectors, d_inverseJacobianData.data() + 9 * iCell, @@ -427,7 +329,7 @@ namespace dftfe for (unsigned int iDim = 0; iDim < 3; ++iDim) std::memcpy(tempQuadratureGradientsDataNonAffine.data() + d_nVectors * 3 * iQuad + d_nVectors * iDim, - quadratureGradients->data() + + quadratureGradients + d_nVectors * 3 * d_nQuadsPerCell * iCell + d_nVectors * d_nQuadsPerCell * iDim + d_nVectors * iQuad, @@ -468,7 +370,7 @@ namespace dftfe &d_nVectors); } accumulateFromCellNodalDataKernel( - &cellNodalData, + cellNodalData.data(), nodalData, std::pair(iCell, iCell + 1)); } @@ -483,16 +385,14 @@ namespace dftfe const dftfe::linearAlgebra::MultiVector &nodalData, - dftfe::utils::MemoryStorage - * cellNodalDataPtr, + ValueTypeBasisCoeff *cellNodalDataPtr, const std::pair cellRange) const { for (unsigned int iCell = cellRange.first; iCell < cellRange.second; ++iCell) for (unsigned int iDof = 0; iDof < d_nDofsPerCell; ++iDof) { - std::memcpy(cellNodalDataPtr->data() + + std::memcpy(cellNodalDataPtr + (iCell - cellRange.first) * d_nVectors * d_nDofsPerCell + iDof * d_nVectors, @@ -509,9 +409,7 @@ namespace dftfe ValueTypeBasisData, dftfe::utils::MemorySpace::HOST>:: accumulateFromCellNodalDataKernel( - const dftfe::utils::MemoryStorage - *cellNodalDataPtr, + const ValueTypeBasisCoeff *cellNodalDataPtr, dftfe::linearAlgebra::MultiVector & nodalData, @@ -521,9 +419,9 @@ namespace dftfe ++iCell) for (unsigned int iDof = 0; iDof < d_nDofsPerCell; ++iDof) std::transform( - cellNodalDataPtr->data() + iCell * d_nVectors * d_nDofsPerCell + + cellNodalDataPtr + iCell * d_nVectors * d_nDofsPerCell + iDof * d_nVectors, - cellNodalDataPtr->data() + iCell * d_nVectors * d_nDofsPerCell + + cellNodalDataPtr + iCell * d_nVectors * d_nDofsPerCell + iDof * d_nVectors + d_nVectors, nodalData.data() + 
d_flattenedCellDofIndexToProcessDofIndexMap [iCell * d_nDofsPerCell + iDof], From b28b2520b655ed5b847a5031d8fb50c8e9db4596 Mon Sep 17 00:00:00 2001 From: Nikhil Kodali Date: Tue, 26 Sep 2023 14:52:25 +0530 Subject: [PATCH 07/25] indentation, minor changes --- include/FEBasisOperations.h | 105 +++++++++++++++++++++---------- src/dft/densityCalculator.cc | 60 +++++++++++------- utils/FEBasisOperations.cc | 6 ++ utils/FEBasisOperationsDevice.cc | 57 +++++++++++------ utils/FEBasisOperationsHost.cc | 87 +++++++++++++------------ 5 files changed, 200 insertions(+), 115 deletions(-) diff --git a/include/FEBasisOperations.h b/include/FEBasisOperations.h index dc4bb437a..75309ffbd 100644 --- a/include/FEBasisOperations.h +++ b/include/FEBasisOperations.h @@ -162,6 +162,8 @@ namespace dftfe unsigned int d_cellsBlockSize; unsigned int d_nDofsPerCell; unsigned int d_nQuadsPerCell; + unsigned int d_localSize; + unsigned int d_locallyOwnedSize; bool areAllCellsAffine; bool areAllCellsCartesian; UpdateFlags d_updateFlags; @@ -191,6 +193,25 @@ namespace dftfe using FEBasisOperationsBase::d_nCells; + using FEBasisOperationsBase::d_localSize; + using FEBasisOperationsBase< + ValueTypeBasisCoeff, + ValueTypeBasisData, + dftfe::utils::MemorySpace::HOST>::d_locallyOwnedSize; + using FEBasisOperationsBase< + ValueTypeBasisCoeff, + ValueTypeBasisData, + dftfe::utils::MemorySpace::HOST>::tempCellNodalData; + using FEBasisOperationsBase< + ValueTypeBasisCoeff, + ValueTypeBasisData, + dftfe::utils::MemorySpace::HOST>::tempQuadratureGradientsData; + using FEBasisOperationsBase< + ValueTypeBasisCoeff, + ValueTypeBasisData, + dftfe::utils::MemorySpace::HOST>::tempQuadratureGradientsDataNonAffine; using FEBasisOperationsBase::d_nVectors; @@ -251,15 +272,15 @@ namespace dftfe interpolate( dftfe::linearAlgebra::MultiVector - &nodalData, - ValueTypeBasisCoeff *quadratureValues, - ValueTypeBasisCoeff *quadratureGradients = NULL) const; + & nodalData, + ValueTypeBasisCoeff *quadratureValues, + 
ValueTypeBasisCoeff *quadratureGradients = NULL) const; void integrateWithBasis( - ValueTypeBasisCoeff *quadratureValues, - ValueTypeBasisCoeff *quadratureGradients, + ValueTypeBasisCoeff *quadratureValues, + ValueTypeBasisCoeff *quadratureGradients, dftfe::linearAlgebra::MultiVector &nodalData) const; @@ -268,12 +289,12 @@ namespace dftfe extractToCellNodalData( dftfe::linearAlgebra::MultiVector - &nodalData, - ValueTypeBasisCoeff *cellNodalDataPtr) const; + & nodalData, + ValueTypeBasisCoeff *cellNodalDataPtr) const; void accumulateFromCellNodalData( - const ValueTypeBasisCoeff *cellNodalDataPtr, + const ValueTypeBasisCoeff *cellNodalDataPtr, dftfe::linearAlgebra::MultiVector &nodalData) const; @@ -282,15 +303,21 @@ namespace dftfe interpolateKernel( const dftfe::linearAlgebra::MultiVector - &nodalData, - ValueTypeBasisCoeff *quadratureValues, - ValueTypeBasisCoeff *quadratureGradients, + & nodalData, + ValueTypeBasisCoeff * quadratureValues, + ValueTypeBasisCoeff * quadratureGradients, + const std::pair cellRange) const; + void + interpolateKernel( + const ValueTypeBasisCoeff * nodalData, + ValueTypeBasisCoeff * quadratureValues, + ValueTypeBasisCoeff * quadratureGradients, const std::pair cellRange) const; void integrateWithBasisKernel( - const ValueTypeBasisCoeff *quadratureValues, - const ValueTypeBasisCoeff *quadratureGradients, + const ValueTypeBasisCoeff *quadratureValues, + const ValueTypeBasisCoeff *quadratureGradients, dftfe::linearAlgebra::MultiVector & nodalData, @@ -301,13 +328,13 @@ namespace dftfe extractToCellNodalDataKernel( const dftfe::linearAlgebra::MultiVector - &nodalData, - ValueTypeBasisCoeff *cellNodalDataPtr, + & nodalData, + ValueTypeBasisCoeff * cellNodalDataPtr, const std::pair cellRange) const; void accumulateFromCellNodalDataKernel( - const ValueTypeBasisCoeff *cellNodalDataPtr, + const ValueTypeBasisCoeff *cellNodalDataPtr, dftfe::linearAlgebra::MultiVector & nodalData, @@ -330,6 +357,14 @@ namespace dftfe using 
FEBasisOperationsBase::d_nCells; + using FEBasisOperationsBase< + ValueTypeBasisCoeff, + ValueTypeBasisData, + dftfe::utils::MemorySpace::DEVICE>::d_localSize; + using FEBasisOperationsBase< + ValueTypeBasisCoeff, + ValueTypeBasisData, + dftfe::utils::MemorySpace::DEVICE>::d_locallyOwnedSize; using FEBasisOperationsBase< ValueTypeBasisCoeff, ValueTypeBasisData, @@ -420,15 +455,15 @@ namespace dftfe interpolate( dftfe::linearAlgebra::MultiVector - &nodalData, - ValueTypeBasisCoeff *quadratureValues, - ValueTypeBasisCoeff *quadratureGradients = NULL) const; + & nodalData, + ValueTypeBasisCoeff *quadratureValues, + ValueTypeBasisCoeff *quadratureGradients = NULL) const; void integrateWithBasis( - ValueTypeBasisCoeff *quadratureValues, - ValueTypeBasisCoeff *quadratureGradients, + ValueTypeBasisCoeff *quadratureValues, + ValueTypeBasisCoeff *quadratureGradients, dftfe::linearAlgebra::MultiVector &nodalData) const; @@ -437,12 +472,12 @@ namespace dftfe extractToCellNodalData( dftfe::linearAlgebra::MultiVector - &nodalData, - ValueTypeBasisCoeff *cellNodalDataPtr) const; + & nodalData, + ValueTypeBasisCoeff *cellNodalDataPtr) const; void accumulateFromCellNodalData( - const ValueTypeBasisCoeff *cellNodalDataPtr, + const ValueTypeBasisCoeff *cellNodalDataPtr, dftfe::linearAlgebra::MultiVector &nodalData) const; @@ -451,15 +486,21 @@ namespace dftfe interpolateKernel( const dftfe::linearAlgebra::MultiVector< ValueTypeBasisCoeff, - dftfe::utils::MemorySpace::DEVICE> &nodalData, - ValueTypeBasisCoeff *quadratureValues, - ValueTypeBasisCoeff *quadratureGradients, + dftfe::utils::MemorySpace::DEVICE> & nodalData, + ValueTypeBasisCoeff * quadratureValues, + ValueTypeBasisCoeff * quadratureGradients, + const std::pair cellRange) const; + void + interpolateKernel( + const ValueTypeBasisCoeff * nodalData, + ValueTypeBasisCoeff * quadratureValues, + ValueTypeBasisCoeff * quadratureGradients, const std::pair cellRange) const; void integrateWithBasisKernel( - const 
ValueTypeBasisCoeff *quadratureValues, - const ValueTypeBasisCoeff *quadratureGradients, + const ValueTypeBasisCoeff *quadratureValues, + const ValueTypeBasisCoeff *quadratureGradients, dftfe::linearAlgebra::MultiVector & nodalData, @@ -470,13 +511,13 @@ namespace dftfe extractToCellNodalDataKernel( const dftfe::linearAlgebra::MultiVector< ValueTypeBasisCoeff, - dftfe::utils::MemorySpace::DEVICE> &nodalData, - ValueTypeBasisCoeff *cellNodalDataPtr, + dftfe::utils::MemorySpace::DEVICE> & nodalData, + ValueTypeBasisCoeff * cellNodalDataPtr, const std::pair cellRange) const; void accumulateFromCellNodalDataKernel( - const ValueTypeBasisCoeff *cellNodalDataPtr, + const ValueTypeBasisCoeff *cellNodalDataPtr, dftfe::linearAlgebra::MultiVector & nodalData, diff --git a/src/dft/densityCalculator.cc b/src/dft/densityCalculator.cc index 563152a18..b9d80e104 100644 --- a/src/dft/densityCalculator.cc +++ b/src/dft/densityCalculator.cc @@ -266,11 +266,13 @@ namespace dftfe ++spinIndex) if (memorySpace == dftfe::utils::MemorySpace::HOST) for (unsigned int iNode = 0; iNode < numLocalDofs; ++iNode) - std::memcpy(flattenedArrayBlock[spinIndex] - .data()+iNode * currentBlockSize,X->data()+numLocalDofs * totalNumWaveFunctions * - (numSpinComponents * kPoint + - spinIndex) + - iNode * totalNumWaveFunctions + jvec,currentBlockSize*sizeof(NumberType)); + std::memcpy(flattenedArrayBlock[spinIndex].data() + + iNode * currentBlockSize, + X->data() + + numLocalDofs * totalNumWaveFunctions * + (numSpinComponents * kPoint + spinIndex) + + iNode * totalNumWaveFunctions + jvec, + currentBlockSize * sizeof(NumberType)); #if defined(DFTFE_WITH_DEVICE) else if (memorySpace == dftfe::utils::MemorySpace::DEVICE) dftfe::utils::deviceKernelsGeneric:: @@ -432,12 +434,16 @@ namespace dftfe if (memorySpace == dftfe::utils::MemorySpace::HOST) for (unsigned int iNode = 0; iNode < numLocalDofs; ++iNode) - std::memcpy(flattenedArrayBlock[spinIndex] - .data()+iNode * 
currentBlockSize,XFrac->data()+numLocalDofs * Nfr * - (numSpinComponents * kPoint + - spinIndex) + - iNode * Nfr + jvec,currentBlockSize*sizeof(NumberType)); - // for (unsigned int iWave = 0; iWave < currentBlockSize; + std::memcpy(flattenedArrayBlock[spinIndex].data() + + iNode * currentBlockSize, + XFrac->data() + + numLocalDofs * Nfr * + (numSpinComponents * kPoint + + spinIndex) + + iNode * Nfr + jvec, + currentBlockSize * sizeof(NumberType)); + // for (unsigned int iWave = 0; iWave < + // currentBlockSize; // ++iWave) // flattenedArrayBlock[spinIndex] // .data()[iNode * currentBlockSize + iWave] = @@ -606,32 +612,42 @@ namespace dftfe for (unsigned int q = 0; q < numQuadPoints; ++q) { const double rho0 = rhoHost[iElem * numQuadPoints + q]; - const double rho1 = rhoHost[totalLocallyOwnedCells * numQuadPoints + + const double rho1 = + rhoHost[totalLocallyOwnedCells * numQuadPoints + iElem * numQuadPoints + q]; tempRhoQuadsSP[2 * q + 0] = rho0; tempRhoQuadsSP[2 * q + 1] = rho1; - tempRhoQuads[q] = rho0+rho1; + tempRhoQuads[q] = rho0 + rho1; } if (isEvaluateGradRho) for (unsigned int q = 0; q < numQuadPoints; ++q) { - const double gradRho0x = gradRhoHost[iElem * numQuadPoints * 3 + 3*q]; - const double gradRho0y = gradRhoHost[iElem * numQuadPoints * 3 + 3*q+1]; - const double gradRho0z = gradRhoHost[iElem * numQuadPoints * 3 + 3*q+2]; - const double gradRho1x = gradRhoHost[totalLocallyOwnedCells * numQuadPoints * 3+iElem * numQuadPoints * 3 + 3*q]; - const double gradRho1y = gradRhoHost[totalLocallyOwnedCells * numQuadPoints * 3+iElem * numQuadPoints * 3 + 3*q+1]; - const double gradRho1z = gradRhoHost[totalLocallyOwnedCells * numQuadPoints * 3+iElem * numQuadPoints * 3 + 3*q+2]; + const double gradRho0x = + gradRhoHost[iElem * numQuadPoints * 3 + 3 * q]; + const double gradRho0y = + gradRhoHost[iElem * numQuadPoints * 3 + 3 * q + 1]; + const double gradRho0z = + gradRhoHost[iElem * numQuadPoints * 3 + 3 * q + 2]; + const double gradRho1x = + 
gradRhoHost[totalLocallyOwnedCells * numQuadPoints * 3 + + iElem * numQuadPoints * 3 + 3 * q]; + const double gradRho1y = + gradRhoHost[totalLocallyOwnedCells * numQuadPoints * 3 + + iElem * numQuadPoints * 3 + 3 * q + 1]; + const double gradRho1z = + gradRhoHost[totalLocallyOwnedCells * numQuadPoints * 3 + + iElem * numQuadPoints * 3 + 3 * q + 2]; tempGradRhoQuadsSP[6 * q + 0] = gradRho0x; tempGradRhoQuadsSP[6 * q + 1] = gradRho0y; tempGradRhoQuadsSP[6 * q + 2] = gradRho0z; tempGradRhoQuadsSP[6 * q + 3] = gradRho1x; tempGradRhoQuadsSP[6 * q + 4] = gradRho1y; tempGradRhoQuadsSP[6 * q + 5] = gradRho1z; - tempGradRhoQuads[3 * q] = gradRho0x + gradRho1x; - tempGradRhoQuads[3 * q + 1] = gradRho0y + gradRho1y; - tempGradRhoQuads[3 * q + 2] = gradRho0z + gradRho1z; + tempGradRhoQuads[3 * q] = gradRho0x + gradRho1x; + tempGradRhoQuads[3 * q + 1] = gradRho0y + gradRho1y; + tempGradRhoQuads[3 * q + 2] = gradRho0z + gradRho1z; } } else diff --git a/utils/FEBasisOperations.cc b/utils/FEBasisOperations.cc index 6d2dce026..ab15bb427 100644 --- a/utils/FEBasisOperations.cc +++ b/utils/FEBasisOperations.cc @@ -210,6 +210,12 @@ namespace dftfe d_nDofsPerCell = d_matrixFreeDataPtr->get_dof_handler(d_dofHandlerID) .get_fe() .dofs_per_cell; + d_locallyOwnedSize = + d_matrixFreeDataPtr->get_vector_partitioner(d_dofHandlerID) + ->locally_owned_size(); + d_localSize = d_locallyOwnedSize + + d_matrixFreeDataPtr->get_vector_partitioner(d_dofHandlerID) + ->n_ghost_indices(); d_cellDofIndexToProcessDofIndexMap.clear(); d_cellDofIndexToProcessDofIndexMap.resize(d_nCells * d_nDofsPerCell); diff --git a/utils/FEBasisOperationsDevice.cc b/utils/FEBasisOperationsDevice.cc index 2a1219652..4330dca89 100644 --- a/utils/FEBasisOperationsDevice.cc +++ b/utils/FEBasisOperationsDevice.cc @@ -25,7 +25,6 @@ namespace dftfe { namespace basis { - template void FEBasisOperations - &nodalData, - ValueTypeBasisCoeff *quadratureValues, - ValueTypeBasisCoeff *quadratureGradients) const + & nodalData, + 
ValueTypeBasisCoeff *quadratureValues, + ValueTypeBasisCoeff *quadratureGradients) const { interpolateKernel(nodalData, quadratureValues, @@ -50,8 +49,8 @@ namespace dftfe ValueTypeBasisData, dftfe::utils::MemorySpace::DEVICE>:: integrateWithBasis( - ValueTypeBasisCoeff *quadratureValues, - ValueTypeBasisCoeff *quadratureGradients, + ValueTypeBasisCoeff *quadratureValues, + ValueTypeBasisCoeff *quadratureGradients, dftfe::linearAlgebra::MultiVector &nodalData) const @@ -72,8 +71,8 @@ namespace dftfe extractToCellNodalData( dftfe::linearAlgebra::MultiVector - &nodalData, - ValueTypeBasisCoeff *cellNodalDataPtr) const + & nodalData, + ValueTypeBasisCoeff *cellNodalDataPtr) const { extractToCellNodalDataKernel( nodalData, @@ -87,7 +86,7 @@ namespace dftfe ValueTypeBasisData, dftfe::utils::MemorySpace::DEVICE>:: accumulateFromCellNodalData( - const ValueTypeBasisCoeff *cellNodalDataPtr, + const ValueTypeBasisCoeff *cellNodalDataPtr, dftfe::linearAlgebra::MultiVector &nodalData) const @@ -108,13 +107,31 @@ namespace dftfe interpolateKernel( const dftfe::linearAlgebra::MultiVector< ValueTypeBasisCoeff, - dftfe::utils::MemorySpace::DEVICE> &nodalValues, - ValueTypeBasisCoeff *quadratureValues, - ValueTypeBasisCoeff *quadratureGradients, + dftfe::utils::MemorySpace::DEVICE> & nodalValues, + ValueTypeBasisCoeff * quadratureValues, + ValueTypeBasisCoeff * quadratureGradients, const std::pair cellRange) const { - extractToCellNodalDataKernel(nodalValues, tempCellNodalData.data(), cellRange); + extractToCellNodalDataKernel(nodalValues, + tempCellNodalData.data(), + cellRange); + interpolateKernel(tempCellNodalData.data(), + quadratureValues, + quadratureGradients, + cellRange); + } + template + void + FEBasisOperations:: + interpolateKernel( + const ValueTypeBasisCoeff * cellNodalValues, + ValueTypeBasisCoeff * quadratureValues, + ValueTypeBasisCoeff * quadratureGradients, + const std::pair cellRange) const + { const ValueTypeBasisCoeff scalarCoeffAlpha = 
ValueTypeBasisCoeff(1.0), scalarCoeffBeta = ValueTypeBasisCoeff(0.0); @@ -126,7 +143,7 @@ namespace dftfe d_nQuadsPerCell, d_nDofsPerCell, &scalarCoeffAlpha, - tempCellNodalData.data(), + cellNodalValues, d_nVectors, d_nVectors * d_nDofsPerCell, d_shapeFunctionData.data(), @@ -147,7 +164,7 @@ namespace dftfe d_nQuadsPerCell * 3, d_nDofsPerCell, &scalarCoeffAlpha, - tempCellNodalData.data(), + cellNodalValues, d_nVectors, d_nVectors * d_nDofsPerCell, d_shapeFunctionGradientData.data(), @@ -229,8 +246,8 @@ namespace dftfe ValueTypeBasisData, dftfe::utils::MemorySpace::DEVICE>:: integrateWithBasisKernel( - const ValueTypeBasisCoeff *quadratureValues, - const ValueTypeBasisCoeff *quadratureGradients, + const ValueTypeBasisCoeff *quadratureValues, + const ValueTypeBasisCoeff *quadratureGradients, dftfe::linearAlgebra::MultiVector & nodalData, @@ -245,8 +262,8 @@ namespace dftfe extractToCellNodalDataKernel( const dftfe::linearAlgebra::MultiVector< ValueTypeBasisCoeff, - dftfe::utils::MemorySpace::DEVICE> &nodalData, - ValueTypeBasisCoeff *cellNodalDataPtr, + dftfe::utils::MemorySpace::DEVICE> & nodalData, + ValueTypeBasisCoeff * cellNodalDataPtr, const std::pair cellRange) const { dftfe::utils::deviceKernelsGeneric::stridedCopyToBlock( @@ -264,7 +281,7 @@ namespace dftfe ValueTypeBasisData, dftfe::utils::MemorySpace::DEVICE>:: accumulateFromCellNodalDataKernel( - const ValueTypeBasisCoeff *cellNodalDataPtr, + const ValueTypeBasisCoeff *cellNodalDataPtr, dftfe::linearAlgebra::MultiVector & nodalData, diff --git a/utils/FEBasisOperationsHost.cc b/utils/FEBasisOperationsHost.cc index 2d67db86a..4f8357f49 100644 --- a/utils/FEBasisOperationsHost.cc +++ b/utils/FEBasisOperationsHost.cc @@ -29,9 +29,9 @@ namespace dftfe interpolate( dftfe::linearAlgebra::MultiVector - &nodalData, - ValueTypeBasisCoeff *quadratureValues, - ValueTypeBasisCoeff *quadratureGradients) const + & nodalData, + ValueTypeBasisCoeff *quadratureValues, + ValueTypeBasisCoeff *quadratureGradients) const { 
interpolateKernel(nodalData, quadratureValues, @@ -45,8 +45,8 @@ namespace dftfe ValueTypeBasisData, dftfe::utils::MemorySpace::HOST>:: integrateWithBasis( - ValueTypeBasisCoeff *quadratureValues, - ValueTypeBasisCoeff *quadratureGradients, + ValueTypeBasisCoeff *quadratureValues, + ValueTypeBasisCoeff *quadratureGradients, dftfe::linearAlgebra::MultiVector &nodalData) const @@ -67,8 +67,8 @@ namespace dftfe extractToCellNodalData( dftfe::linearAlgebra::MultiVector - &nodalData, - ValueTypeBasisCoeff *cellNodalDataPtr) const + & nodalData, + ValueTypeBasisCoeff *cellNodalDataPtr) const { extractToCellNodalDataKernel( nodalData, @@ -82,7 +82,7 @@ namespace dftfe ValueTypeBasisData, dftfe::utils::MemorySpace::HOST>:: accumulateFromCellNodalData( - const ValueTypeBasisCoeff *cellNodalDataPtr, + const ValueTypeBasisCoeff *cellNodalDataPtr, dftfe::linearAlgebra::MultiVector &nodalData) const @@ -100,33 +100,39 @@ namespace dftfe interpolateKernel( const dftfe::linearAlgebra::MultiVector - &nodalValues, - ValueTypeBasisCoeff *quadratureValues, - ValueTypeBasisCoeff *quadratureGradients, + & nodalValues, + ValueTypeBasisCoeff * quadratureValues, + ValueTypeBasisCoeff * quadratureGradients, const std::pair cellRange) const { - dftfe::utils::MemoryStorage - cellNodalData, tempQuadratureGradientsData, - tempQuadratureGradientsDataNonAffine; - cellNodalData.resize(d_nVectors * d_nDofsPerCell); - - if (quadratureGradients != NULL) - tempQuadratureGradientsData.resize( - areAllCellsCartesian ? 0 : (d_nVectors * d_nQuadsPerCell * 3)); - - if (quadratureGradients != NULL) - tempQuadratureGradientsDataNonAffine.resize( - areAllCellsAffine ? 
0 : (d_nVectors * d_nQuadsPerCell * 3)); - - for (unsigned int iCell = cellRange.first; iCell < cellRange.second; ++iCell) { extractToCellNodalDataKernel( nodalValues, - cellNodalData.data(), + tempCellNodalData.data(), std::pair(iCell, iCell + 1)); + interpolateKernel(tempCellNodalData.data(), + quadratureValues, + quadratureGradients, + std::pair(iCell, + iCell + 1)); + } + } + template + void + FEBasisOperations:: + interpolateKernel( + const ValueTypeBasisCoeff * cellNodalValues, + ValueTypeBasisCoeff * quadratureValues, + ValueTypeBasisCoeff * quadratureGradients, + const std::pair cellRange) const + { + for (unsigned int iCell = cellRange.first; iCell < cellRange.second; + ++iCell) + { const ValueTypeBasisCoeff scalarCoeffAlpha = ValueTypeBasisCoeff(1.0), scalarCoeffBeta = ValueTypeBasisCoeff(0.0); const char transA = 'N', transB = 'N'; @@ -137,7 +143,7 @@ namespace dftfe &d_nQuadsPerCell, &d_nDofsPerCell, &scalarCoeffAlpha, - cellNodalData.data() + + cellNodalValues + d_nDofsPerCell * (iCell - cellRange.first) * d_nVectors, &d_nVectors, d_shapeFunctionData.data(), @@ -156,16 +162,16 @@ namespace dftfe &d_nQuadsPerCellTimesThree, &d_nDofsPerCell, &scalarCoeffAlpha, - cellNodalData.data() + + cellNodalValues + d_nDofsPerCell * (iCell - cellRange.first) * d_nVectors, &d_nVectors, d_shapeFunctionGradientData.data(), &d_nDofsPerCell, &scalarCoeffBeta, - areAllCellsCartesian ? (quadratureGradients + - d_nQuadsPerCell * d_nVectors * 3 * - (iCell - cellRange.first)) : - (tempQuadratureGradientsData.data()), + areAllCellsCartesian ? 
+ (quadratureGradients + d_nQuadsPerCell * d_nVectors * 3 * + (iCell - cellRange.first)) : + (tempQuadratureGradientsData.data()), &d_nVectors); if (areAllCellsCartesian) { @@ -197,9 +203,8 @@ namespace dftfe d_inverseJacobianData.data() + 9 * iCell, &three, &scalarCoeffBeta, - quadratureGradients + - d_nQuadsPerCell * d_nVectors * 3 * - (iCell - cellRange.first), + quadratureGradients + d_nQuadsPerCell * d_nVectors * 3 * + (iCell - cellRange.first), &d_nQuadsPerCellTimesnVectors); } else @@ -243,8 +248,8 @@ namespace dftfe ValueTypeBasisData, dftfe::utils::MemorySpace::HOST>:: integrateWithBasisKernel( - const ValueTypeBasisCoeff *quadratureValues, - const ValueTypeBasisCoeff *quadratureGradients, + const ValueTypeBasisCoeff *quadratureValues, + const ValueTypeBasisCoeff *quadratureGradients, dftfe::linearAlgebra::MultiVector & nodalData, @@ -384,8 +389,8 @@ namespace dftfe extractToCellNodalDataKernel( const dftfe::linearAlgebra::MultiVector - &nodalData, - ValueTypeBasisCoeff *cellNodalDataPtr, + & nodalData, + ValueTypeBasisCoeff * cellNodalDataPtr, const std::pair cellRange) const { for (unsigned int iCell = cellRange.first; iCell < cellRange.second; @@ -409,7 +414,7 @@ namespace dftfe ValueTypeBasisData, dftfe::utils::MemorySpace::HOST>:: accumulateFromCellNodalDataKernel( - const ValueTypeBasisCoeff *cellNodalDataPtr, + const ValueTypeBasisCoeff *cellNodalDataPtr, dftfe::linearAlgebra::MultiVector & nodalData, From f2f472be0f9bfdd9a8ea18642f20788d1bd27e7f Mon Sep 17 00:00:00 2001 From: Nikhil Kodali Date: Thu, 28 Sep 2023 17:00:35 +0530 Subject: [PATCH 08/25] Bugfixes, cleanups, Device Hamiltonian construction changes --- include/FEBasisOperations.h | 20 + include/forceWfcContractionsDevice.h | 6 + include/kohnShamDFTOperatorDevice.h | 33 +- include/operatorDevice.h | 30 +- ...iltonianMatrixCalculatorFlattenedDevice.cc | 951 ++++++++++++++---- src/dftOperator/kohnShamDFTOperatorDevice.cc | 58 +- .../shapeFunctionDataCalculatorDevice.cc | 149 +-- 
...onfigurationalForceEEshelbyFPSPFnlLinFE.cc | 1 + .../computeStressEEshelbyEPSPEnlEk.cc | 1 + src/force/forceWfcContractionsDevice.cc | 303 ++---- utils/FEBasisOperationsDevice.cc | 78 +- utils/constraintMatrixInfo.cc | 9 +- 12 files changed, 1118 insertions(+), 521 deletions(-) diff --git a/include/FEBasisOperations.h b/include/FEBasisOperations.h index 75309ffbd..c1006cefa 100644 --- a/include/FEBasisOperations.h +++ b/include/FEBasisOperations.h @@ -528,19 +528,39 @@ namespace dftfe template class FEBasisOperationsBase; +#ifdef USE_COMPLEX + template class FEBasisOperationsBase; +#endif #if defined(DFTFE_WITH_DEVICE) template class FEBasisOperationsBase; +# ifdef USE_COMPLEX + template class FEBasisOperationsBase; +# endif #endif template class FEBasisOperations; +#ifdef USE_COMPLEX + template class FEBasisOperations; +#endif #if defined(DFTFE_WITH_DEVICE) template class FEBasisOperations; +# ifdef USE_COMPLEX + template class FEBasisOperations; +# endif #endif } // end of namespace basis diff --git a/include/forceWfcContractionsDevice.h b/include/forceWfcContractionsDevice.h index e3253a854..d9d8e520f 100644 --- a/include/forceWfcContractionsDevice.h +++ b/include/forceWfcContractionsDevice.h @@ -22,6 +22,7 @@ # include "headers.h" # include "operatorDevice.h" # include "dftParameters.h" +# include "FEBasisOperations.h" namespace dftfe { @@ -29,6 +30,11 @@ namespace dftfe { void wfcContractionsForceKernelsAllH( + std::unique_ptr< + dftfe::basis::FEBasisOperations> + & basisOperationsPtr, operatorDFTDeviceClass & operatorMatrix, const dataTypes::number * X, const unsigned int spinPolarizedFlag, diff --git a/include/kohnShamDFTOperatorDevice.h b/include/kohnShamDFTOperatorDevice.h index a23c4d7ff..b2a15422b 100644 --- a/include/kohnShamDFTOperatorDevice.h +++ b/include/kohnShamDFTOperatorDevice.h @@ -21,6 +21,7 @@ #include #include #include +#include namespace dftfe { @@ -85,14 +86,14 @@ namespace dftfe dftfe::utils::MemoryStorage & 
getShapeFunctionValuesNLPTransposed(); - dftfe::utils::MemoryStorage & - getShapeFunctionGradientValuesXTransposed(); + // dftfe::utils::MemoryStorage & + // getShapeFunctionGradientValuesXTransposed(); - dftfe::utils::MemoryStorage & - getShapeFunctionGradientValuesYTransposed(); + // dftfe::utils::MemoryStorage & + // getShapeFunctionGradientValuesYTransposed(); - dftfe::utils::MemoryStorage & - getShapeFunctionGradientValuesZTransposed(); + // dftfe::utils::MemoryStorage & + // getShapeFunctionGradientValuesZTransposed(); dftfe::utils::MemoryStorage & getShapeFunctionGradientValuesNLPTransposed(); @@ -647,6 +648,14 @@ namespace dftfe /// pointer to dft class dftClass *dftPtr; + std::unique_ptr< + dftfe::basis:: + FEBasisOperations> + basisOperationsPtrDevice; + std::unique_ptr< + dftfe::basis:: + FEBasisOperations> + basisOperationsPtrHOST; /// data structures to store diagonal of inverse square root mass matrix and @@ -687,14 +696,14 @@ namespace dftfe d_shapeFunctionValueTransposedLpspDevice; /// storage for shapefunction gradients - std::vector d_shapeFunctionGradientValueX; - std::vector d_shapeFunctionGradientValueXTransposed; + // std::vector d_shapeFunctionGradientValueX; + // std::vector d_shapeFunctionGradientValueXTransposed; - std::vector d_shapeFunctionGradientValueY; - std::vector d_shapeFunctionGradientValueYTransposed; + // std::vector d_shapeFunctionGradientValueY; + // std::vector d_shapeFunctionGradientValueYTransposed; - std::vector d_shapeFunctionGradientValueZ; - std::vector d_shapeFunctionGradientValueZTransposed; + // std::vector d_shapeFunctionGradientValueZ; + // std::vector d_shapeFunctionGradientValueZTransposed; std::vector d_cellJxWValues; diff --git a/include/operatorDevice.h b/include/operatorDevice.h index 5e980b89b..5cbaab16b 100644 --- a/include/operatorDevice.h +++ b/include/operatorDevice.h @@ -97,17 +97,17 @@ namespace dftfe dftfe::utils::MemorySpace::DEVICE> & getShapeFunctionValuesNLPTransposed() = 0; - virtual 
dftfe::utils::MemoryStorage & - getShapeFunctionGradientValuesXTransposed() = 0; + // virtual dftfe::utils::MemoryStorage & + // getShapeFunctionGradientValuesXTransposed() = 0; - virtual dftfe::utils::MemoryStorage & - getShapeFunctionGradientValuesYTransposed() = 0; + // virtual dftfe::utils::MemoryStorage & + // getShapeFunctionGradientValuesYTransposed() = 0; - virtual dftfe::utils::MemoryStorage & - getShapeFunctionGradientValuesZTransposed() = 0; + // virtual dftfe::utils::MemoryStorage & + // getShapeFunctionGradientValuesZTransposed() = 0; virtual dftfe::utils::MemoryStorage & @@ -426,14 +426,14 @@ namespace dftfe dftfe::utils::MemoryStorage d_shapeFunctionValueNLPTransposedDevice; - dftfe::utils::MemoryStorage - d_shapeFunctionGradientValueXTransposedDevice; + // dftfe::utils::MemoryStorage + // d_shapeFunctionGradientValueXTransposedDevice; - dftfe::utils::MemoryStorage - d_shapeFunctionGradientValueYTransposedDevice; + // dftfe::utils::MemoryStorage + // d_shapeFunctionGradientValueYTransposedDevice; - dftfe::utils::MemoryStorage - d_shapeFunctionGradientValueZTransposedDevice; + // dftfe::utils::MemoryStorage + // d_shapeFunctionGradientValueZTransposedDevice; dftfe::utils::MemoryStorage d_shapeFunctionGradientValueNLPTransposedDevice; diff --git a/src/dftOperator/hamiltonianMatrixCalculatorFlattenedDevice.cc b/src/dftOperator/hamiltonianMatrixCalculatorFlattenedDevice.cc index 963d2cafd..5ad596be7 100644 --- a/src/dftOperator/hamiltonianMatrixCalculatorFlattenedDevice.cc +++ b/src/dftOperator/hamiltonianMatrixCalculatorFlattenedDevice.cc @@ -68,9 +68,9 @@ namespace const unsigned int numkPoints, const double * shapeFunctionValues, const double * shapeFunctionValuesTransposed, - const double * shapeFunctionGradientValuesXTransposed, - const double * shapeFunctionGradientValuesYTransposed, - const double * shapeFunctionGradientValuesZTransposed, + const double * shapeFunctionGradientValues, + const double * inverseJacobianValues, + const int 
areAllCellsAffineOrCartesianFlag, const double * cellShapeFunctionGradientIntegral, const double * vEffJxW, const double * JxW, @@ -124,9 +124,9 @@ namespace const unsigned int numkPoints, const double * shapeFunctionValues, const double * shapeFunctionValuesTransposed, - const double * shapeFunctionGradientValuesXTransposed, - const double * shapeFunctionGradientValuesYTransposed, - const double * shapeFunctionGradientValuesZTransposed, + const double * shapeFunctionGradientValues, + const double * inverseJacobianValues, + const int areAllCellsAffineOrCartesianFlag, const double * cellShapeFunctionGradientIntegral, const double * vEffJxW, const double * JxW, @@ -165,21 +165,108 @@ namespace const double shapeJ = shapeFunctionValuesTransposed[q * numDofsPerCell + cellDofIndexJ]; - const double gradShapeXI = - shapeFunctionGradientValuesXTransposed[cellIndex * numQuadPoints * - numDofsPerCell + - numDofsPerCell * q + - cellDofIndexI]; - const double gradShapeYI = - shapeFunctionGradientValuesYTransposed[cellIndex * numQuadPoints * - numDofsPerCell + - numDofsPerCell * q + - cellDofIndexI]; - const double gradShapeZI = - shapeFunctionGradientValuesZTransposed[cellIndex * numQuadPoints * - numDofsPerCell + - numDofsPerCell * q + - cellDofIndexI]; + double gradShapeXI, gradShapeXJ, gradShapeYI, gradShapeYJ, + gradShapeZI, gradShapeZJ; + if (areAllCellsAffineOrCartesianFlag == 0) + { + const double gradShapeXIRef = + shapeFunctionGradientValues[numDofsPerCell * q * 3 + + cellDofIndexI]; + const double gradShapeYIRef = + shapeFunctionGradientValues[numDofsPerCell * q * 3 + + numDofsPerCell + cellDofIndexI]; + const double gradShapeZIRef = + shapeFunctionGradientValues[numDofsPerCell * q * 3 + + 2 * numDofsPerCell + + cellDofIndexI]; + const double Jxx = + inverseJacobianValues[cellIndex * numQuadPoints * 9 + q * 9 + + 0]; + const double Jxy = + inverseJacobianValues[cellIndex * numQuadPoints * 9 + q * 9 + + 1]; + const double Jxz = + inverseJacobianValues[cellIndex * 
numQuadPoints * 9 + q * 9 + + 2]; + const double Jyx = + inverseJacobianValues[cellIndex * numQuadPoints * 9 + q * 9 + + 3]; + const double Jyy = + inverseJacobianValues[cellIndex * numQuadPoints * 9 + q * 9 + + 4]; + const double Jyz = + inverseJacobianValues[cellIndex * numQuadPoints * 9 + q * 9 + + 5]; + const double Jzx = + inverseJacobianValues[cellIndex * numQuadPoints * 9 + q * 9 + + 6]; + const double Jzy = + inverseJacobianValues[cellIndex * numQuadPoints * 9 + q * 9 + + 7]; + const double Jzz = + inverseJacobianValues[cellIndex * numQuadPoints * 9 + q * 9 + + 8]; + + gradShapeXI = gradShapeXIRef * Jxx + gradShapeYIRef * Jxy + + gradShapeZIRef * Jxz; + gradShapeYI = gradShapeXIRef * Jyx + gradShapeYIRef * Jyy + + gradShapeZIRef * Jyz; + gradShapeZI = gradShapeXIRef * Jzx + gradShapeYIRef * Jzy + + gradShapeZIRef * Jzz; + } + else if (areAllCellsAffineOrCartesianFlag == 1) + { + const double gradShapeXIRef = + shapeFunctionGradientValues[numDofsPerCell * q + + cellDofIndexI]; + const double gradShapeYIRef = + shapeFunctionGradientValues[numDofsPerCell * numQuadPoints + + numDofsPerCell * q + + cellDofIndexI]; + const double gradShapeZIRef = + shapeFunctionGradientValues[numDofsPerCell * numQuadPoints * + 2 + + numDofsPerCell * q + + cellDofIndexI]; + const double Jxx = inverseJacobianValues[cellIndex * 9 + 0]; + const double Jxy = inverseJacobianValues[cellIndex * 9 + 1]; + const double Jxz = inverseJacobianValues[cellIndex * 9 + 2]; + const double Jyx = inverseJacobianValues[cellIndex * 9 + 3]; + const double Jyy = inverseJacobianValues[cellIndex * 9 + 4]; + const double Jyz = inverseJacobianValues[cellIndex * 9 + 5]; + const double Jzx = inverseJacobianValues[cellIndex * 9 + 6]; + const double Jzy = inverseJacobianValues[cellIndex * 9 + 7]; + const double Jzz = inverseJacobianValues[cellIndex * 9 + 8]; + + gradShapeXI = gradShapeXIRef * Jxx + gradShapeYIRef * Jxy + + gradShapeZIRef * Jxz; + gradShapeYI = gradShapeXIRef * Jyx + gradShapeYIRef * Jyy + + 
gradShapeZIRef * Jyz; + gradShapeZI = gradShapeXIRef * Jzx + gradShapeYIRef * Jzy + + gradShapeZIRef * Jzz; + } + else if (areAllCellsAffineOrCartesianFlag == 2) + { + const double gradShapeXIRef = + shapeFunctionGradientValues[numDofsPerCell * q + + cellDofIndexI]; + const double gradShapeYIRef = + shapeFunctionGradientValues[numDofsPerCell * numQuadPoints + + numDofsPerCell * q + + cellDofIndexI]; + const double gradShapeZIRef = + shapeFunctionGradientValues[numDofsPerCell * numQuadPoints * + 2 + + numDofsPerCell * q + + cellDofIndexI]; + const double Jxx = inverseJacobianValues[cellIndex * 3 + 0]; + const double Jyy = inverseJacobianValues[cellIndex * 3 + 1]; + const double Jzz = inverseJacobianValues[cellIndex * 3 + 2]; + + gradShapeXI = gradShapeXIRef * Jxx; + gradShapeYI = gradShapeYIRef * Jyy; + gradShapeZI = gradShapeZIRef * Jzz; + } val += vEffJxW[cellIndex * numQuadPoints + q] * shapeI * shapeJ; @@ -227,9 +314,9 @@ namespace const unsigned int numkPoints, const double * shapeFunctionValues, const double * shapeFunctionValuesTransposed, - const double * shapeFunctionGradientValuesXTransposed, - const double * shapeFunctionGradientValuesYTransposed, - const double * shapeFunctionGradientValuesZTransposed, + const double * shapeFunctionGradientValues, + const double * inverseJacobianValues, + const int areAllCellsAffineOrCartesianFlag, const double * cellShapeFunctionGradientIntegral, const double * vEffJxW, const double * JxW, @@ -264,37 +351,153 @@ namespace const double shapeJ = shapeFunctionValuesTransposed[q * numDofsPerCell + cellDofIndexJ]; - const double gradShapeXI = - shapeFunctionGradientValuesXTransposed[cellIndex * numQuadPoints * - numDofsPerCell + - numDofsPerCell * q + - cellDofIndexI]; - const double gradShapeYI = - shapeFunctionGradientValuesYTransposed[cellIndex * numQuadPoints * - numDofsPerCell + - numDofsPerCell * q + - cellDofIndexI]; - const double gradShapeZI = - shapeFunctionGradientValuesZTransposed[cellIndex * numQuadPoints * - 
numDofsPerCell + - numDofsPerCell * q + - cellDofIndexI]; - - const double gradShapeXJ = - shapeFunctionGradientValuesXTransposed[cellIndex * numQuadPoints * - numDofsPerCell + - numDofsPerCell * q + - cellDofIndexJ]; - const double gradShapeYJ = - shapeFunctionGradientValuesYTransposed[cellIndex * numQuadPoints * - numDofsPerCell + - numDofsPerCell * q + - cellDofIndexJ]; - const double gradShapeZJ = - shapeFunctionGradientValuesZTransposed[cellIndex * numQuadPoints * - numDofsPerCell + - numDofsPerCell * q + - cellDofIndexJ]; + double gradShapeXI, gradShapeXJ, gradShapeYI, gradShapeYJ, + gradShapeZI, gradShapeZJ; + if (areAllCellsAffineOrCartesianFlag == 0) + { + const double gradShapeXIRef = + shapeFunctionGradientValues[numDofsPerCell * q * 3 + + cellDofIndexI]; + const double gradShapeYIRef = + shapeFunctionGradientValues[numDofsPerCell * q * 3 + + numDofsPerCell + cellDofIndexI]; + const double gradShapeZIRef = + shapeFunctionGradientValues[numDofsPerCell * q * 3 + + 2 * numDofsPerCell + + cellDofIndexI]; + const double gradShapeXJRef = + shapeFunctionGradientValues[numDofsPerCell * q * 3 + + cellDofIndexJ]; + const double gradShapeYJRef = + shapeFunctionGradientValues[numDofsPerCell * q * 3 + + numDofsPerCell + cellDofIndexJ]; + const double gradShapeZJRef = + shapeFunctionGradientValues[numDofsPerCell * q * 3 + + 2 * numDofsPerCell + + cellDofIndexJ]; + const double Jxx = + inverseJacobianValues[cellIndex * numQuadPoints * 9 + q * 9 + + 0]; + const double Jxy = + inverseJacobianValues[cellIndex * numQuadPoints * 9 + q * 9 + + 1]; + const double Jxz = + inverseJacobianValues[cellIndex * numQuadPoints * 9 + q * 9 + + 2]; + const double Jyx = + inverseJacobianValues[cellIndex * numQuadPoints * 9 + q * 9 + + 3]; + const double Jyy = + inverseJacobianValues[cellIndex * numQuadPoints * 9 + q * 9 + + 4]; + const double Jyz = + inverseJacobianValues[cellIndex * numQuadPoints * 9 + q * 9 + + 5]; + const double Jzx = + inverseJacobianValues[cellIndex * numQuadPoints 
* 9 + q * 9 + + 6]; + const double Jzy = + inverseJacobianValues[cellIndex * numQuadPoints * 9 + q * 9 + + 7]; + const double Jzz = + inverseJacobianValues[cellIndex * numQuadPoints * 9 + q * 9 + + 8]; + + gradShapeXI = gradShapeXIRef * Jxx + gradShapeYIRef * Jxy + + gradShapeZIRef * Jxz; + gradShapeYI = gradShapeXIRef * Jyx + gradShapeYIRef * Jyy + + gradShapeZIRef * Jyz; + gradShapeZI = gradShapeXIRef * Jzx + gradShapeYIRef * Jzy + + gradShapeZIRef * Jzz; + gradShapeXJ = gradShapeXJRef * Jxx + gradShapeYJRef * Jxy + + gradShapeZJRef * Jxz; + gradShapeYJ = gradShapeXJRef * Jyx + gradShapeYJRef * Jyy + + gradShapeZJRef * Jyz; + gradShapeZJ = gradShapeXJRef * Jzx + gradShapeYJRef * Jzy + + gradShapeZJRef * Jzz; + } + else if (areAllCellsAffineOrCartesianFlag == 1) + { + const double gradShapeXIRef = + shapeFunctionGradientValues[numDofsPerCell * q * 3 + + cellDofIndexI]; + const double gradShapeYIRef = + shapeFunctionGradientValues[numDofsPerCell * q * 3 + + numDofsPerCell + cellDofIndexI]; + const double gradShapeZIRef = + shapeFunctionGradientValues[numDofsPerCell * q * 3 + + 2 * numDofsPerCell + + cellDofIndexI]; + const double gradShapeXJRef = + shapeFunctionGradientValues[numDofsPerCell * q * 3 + + cellDofIndexJ]; + const double gradShapeYJRef = + shapeFunctionGradientValues[numDofsPerCell * q * 3 + + numDofsPerCell + cellDofIndexJ]; + const double gradShapeZJRef = + shapeFunctionGradientValues[numDofsPerCell * q * 3 + + 2 * numDofsPerCell + + cellDofIndexJ]; + const double Jxx = inverseJacobianValues[cellIndex * 9 + 0]; + const double Jxy = inverseJacobianValues[cellIndex * 9 + 1]; + const double Jxz = inverseJacobianValues[cellIndex * 9 + 2]; + const double Jyx = inverseJacobianValues[cellIndex * 9 + 3]; + const double Jyy = inverseJacobianValues[cellIndex * 9 + 4]; + const double Jyz = inverseJacobianValues[cellIndex * 9 + 5]; + const double Jzx = inverseJacobianValues[cellIndex * 9 + 6]; + const double Jzy = inverseJacobianValues[cellIndex * 9 + 7]; + const 
double Jzz = inverseJacobianValues[cellIndex * 9 + 8]; + + gradShapeXI = gradShapeXIRef * Jxx + gradShapeYIRef * Jxy + + gradShapeZIRef * Jxz; + gradShapeYI = gradShapeXIRef * Jyx + gradShapeYIRef * Jyy + + gradShapeZIRef * Jyz; + gradShapeZI = gradShapeXIRef * Jzx + gradShapeYIRef * Jzy + + gradShapeZIRef * Jzz; + gradShapeXJ = gradShapeXJRef * Jxx + gradShapeYJRef * Jxy + + gradShapeZJRef * Jxz; + gradShapeYJ = gradShapeXJRef * Jyx + gradShapeYJRef * Jyy + + gradShapeZJRef * Jyz; + gradShapeZJ = gradShapeXJRef * Jzx + gradShapeYJRef * Jzy + + gradShapeZJRef * Jzz; + } + else if (areAllCellsAffineOrCartesianFlag == 2) + { + const double gradShapeXIRef = + shapeFunctionGradientValues[numDofsPerCell * q + + cellDofIndexI]; + const double gradShapeYIRef = + shapeFunctionGradientValues[numDofsPerCell * numQuadPoints + + numDofsPerCell * q + + cellDofIndexI]; + const double gradShapeZIRef = + shapeFunctionGradientValues[numDofsPerCell * numQuadPoints * + 2 + + numDofsPerCell * q + + cellDofIndexI]; + const double gradShapeXJRef = + shapeFunctionGradientValues[numDofsPerCell * q + + cellDofIndexJ]; + const double gradShapeYJRef = + shapeFunctionGradientValues[numDofsPerCell * numQuadPoints + + numDofsPerCell * q + + cellDofIndexJ]; + const double gradShapeZJRef = + shapeFunctionGradientValues[numDofsPerCell * numQuadPoints * + 2 + + numDofsPerCell * q + + cellDofIndexJ]; + const double Jxx = inverseJacobianValues[cellIndex * 3 + 0]; + const double Jyy = inverseJacobianValues[cellIndex * 3 + 1]; + const double Jzz = inverseJacobianValues[cellIndex * 3 + 2]; + + gradShapeXI = gradShapeXIRef * Jxx; + gradShapeYI = gradShapeYIRef * Jyy; + gradShapeZI = gradShapeZIRef * Jzz; + gradShapeXJ = gradShapeXJRef * Jxx; + gradShapeYJ = gradShapeYJRef * Jyy; + gradShapeZJ = gradShapeZJRef * Jzz; + } val += @@ -334,9 +537,9 @@ namespace const unsigned int numkPoints, const double * shapeFunctionValues, const double * shapeFunctionValuesTransposed, - const double * 
shapeFunctionGradientValuesXTransposed, - const double * shapeFunctionGradientValuesYTransposed, - const double * shapeFunctionGradientValuesZTransposed, + const double * shapeFunctionGradientValues, + const double * inverseJacobianValues, + const int areAllCellsAffineOrCartesianFlag, const double * cellShapeFunctionGradientIntegral, const double * vEffJxW, const double * JxW, @@ -376,37 +579,157 @@ namespace const double shapeJ = shapeFunctionValuesTransposed[q * numDofsPerCell + cellDofIndexJ]; - const double gradShapeXI = - shapeFunctionGradientValuesXTransposed[cellIndex * numQuadPoints * - numDofsPerCell + - numDofsPerCell * q + - cellDofIndexI]; - const double gradShapeYI = - shapeFunctionGradientValuesYTransposed[cellIndex * numQuadPoints * - numDofsPerCell + - numDofsPerCell * q + - cellDofIndexI]; - const double gradShapeZI = - shapeFunctionGradientValuesZTransposed[cellIndex * numQuadPoints * - numDofsPerCell + - numDofsPerCell * q + - cellDofIndexI]; - - const double gradShapeXJ = - shapeFunctionGradientValuesXTransposed[cellIndex * numQuadPoints * - numDofsPerCell + - numDofsPerCell * q + - cellDofIndexJ]; - const double gradShapeYJ = - shapeFunctionGradientValuesYTransposed[cellIndex * numQuadPoints * - numDofsPerCell + - numDofsPerCell * q + - cellDofIndexJ]; - const double gradShapeZJ = - shapeFunctionGradientValuesZTransposed[cellIndex * numQuadPoints * - numDofsPerCell + - numDofsPerCell * q + - cellDofIndexJ]; + double gradShapeXI, gradShapeXJ, gradShapeYI, gradShapeYJ, + gradShapeZI, gradShapeZJ; + if (areAllCellsAffineOrCartesianFlag == 0) + { + const double gradShapeXIRef = + shapeFunctionGradientValues[numDofsPerCell * q * 3 + + cellDofIndexI]; + const double gradShapeYIRef = + shapeFunctionGradientValues[numDofsPerCell * q * 3 + + numDofsPerCell + cellDofIndexI]; + const double gradShapeZIRef = + shapeFunctionGradientValues[numDofsPerCell * q * 3 + + 2 * numDofsPerCell + + cellDofIndexI]; + const double gradShapeXJRef = + 
shapeFunctionGradientValues[numDofsPerCell * q * 3 + + cellDofIndexJ]; + const double gradShapeYJRef = + shapeFunctionGradientValues[numDofsPerCell * q * 3 + + numDofsPerCell + cellDofIndexJ]; + const double gradShapeZJRef = + shapeFunctionGradientValues[numDofsPerCell * q * 3 + + 2 * numDofsPerCell + + cellDofIndexJ]; + const double Jxx = + inverseJacobianValues[cellIndex * numQuadPoints * 9 + q * 9 + + 0]; + const double Jxy = + inverseJacobianValues[cellIndex * numQuadPoints * 9 + q * 9 + + 1]; + const double Jxz = + inverseJacobianValues[cellIndex * numQuadPoints * 9 + q * 9 + + 2]; + const double Jyx = + inverseJacobianValues[cellIndex * numQuadPoints * 9 + q * 9 + + 3]; + const double Jyy = + inverseJacobianValues[cellIndex * numQuadPoints * 9 + q * 9 + + 4]; + const double Jyz = + inverseJacobianValues[cellIndex * numQuadPoints * 9 + q * 9 + + 5]; + const double Jzx = + inverseJacobianValues[cellIndex * numQuadPoints * 9 + q * 9 + + 6]; + const double Jzy = + inverseJacobianValues[cellIndex * numQuadPoints * 9 + q * 9 + + 7]; + const double Jzz = + inverseJacobianValues[cellIndex * numQuadPoints * 9 + q * 9 + + 8]; + + gradShapeXI = gradShapeXIRef * Jxx + gradShapeYIRef * Jxy + + gradShapeZIRef * Jxz; + gradShapeYI = gradShapeXIRef * Jyx + gradShapeYIRef * Jyy + + gradShapeZIRef * Jyz; + gradShapeZI = gradShapeXIRef * Jzx + gradShapeYIRef * Jzy + + gradShapeZIRef * Jzz; + gradShapeXJ = gradShapeXJRef * Jxx + gradShapeYJRef * Jxy + + gradShapeZJRef * Jxz; + gradShapeYJ = gradShapeXJRef * Jyx + gradShapeYJRef * Jyy + + gradShapeZJRef * Jyz; + gradShapeZJ = gradShapeXJRef * Jzx + gradShapeYJRef * Jzy + + gradShapeZJRef * Jzz; + } + else if (areAllCellsAffineOrCartesianFlag == 1) + { + const double gradShapeXIRef = + shapeFunctionGradientValues[numDofsPerCell * q + + cellDofIndexI]; + const double gradShapeYIRef = + shapeFunctionGradientValues[numDofsPerCell * numQuadPoints + + numDofsPerCell * q + + cellDofIndexI]; + const double gradShapeZIRef = + 
shapeFunctionGradientValues[numDofsPerCell * numQuadPoints * + 2 + + numDofsPerCell * q + + cellDofIndexI]; + const double gradShapeXJRef = + shapeFunctionGradientValues[numDofsPerCell * q + + cellDofIndexJ]; + const double gradShapeYJRef = + shapeFunctionGradientValues[numDofsPerCell * numQuadPoints + + numDofsPerCell * q + + cellDofIndexJ]; + const double gradShapeZJRef = + shapeFunctionGradientValues[numDofsPerCell * numQuadPoints * + 2 + + numDofsPerCell * q + + cellDofIndexJ]; + const double Jxx = inverseJacobianValues[cellIndex * 9 + 0]; + const double Jxy = inverseJacobianValues[cellIndex * 9 + 1]; + const double Jxz = inverseJacobianValues[cellIndex * 9 + 2]; + const double Jyx = inverseJacobianValues[cellIndex * 9 + 3]; + const double Jyy = inverseJacobianValues[cellIndex * 9 + 4]; + const double Jyz = inverseJacobianValues[cellIndex * 9 + 5]; + const double Jzx = inverseJacobianValues[cellIndex * 9 + 6]; + const double Jzy = inverseJacobianValues[cellIndex * 9 + 7]; + const double Jzz = inverseJacobianValues[cellIndex * 9 + 8]; + + gradShapeXI = gradShapeXIRef * Jxx + gradShapeYIRef * Jxy + + gradShapeZIRef * Jxz; + gradShapeYI = gradShapeXIRef * Jyx + gradShapeYIRef * Jyy + + gradShapeZIRef * Jyz; + gradShapeZI = gradShapeXIRef * Jzx + gradShapeYIRef * Jzy + + gradShapeZIRef * Jzz; + gradShapeXJ = gradShapeXJRef * Jxx + gradShapeYJRef * Jxy + + gradShapeZJRef * Jxz; + gradShapeYJ = gradShapeXJRef * Jyx + gradShapeYJRef * Jyy + + gradShapeZJRef * Jyz; + gradShapeZJ = gradShapeXJRef * Jzx + gradShapeYJRef * Jzy + + gradShapeZJRef * Jzz; + } + else if (areAllCellsAffineOrCartesianFlag == 2) + { + const double gradShapeXIRef = + shapeFunctionGradientValues[numDofsPerCell * q + + cellDofIndexI]; + const double gradShapeYIRef = + shapeFunctionGradientValues[numDofsPerCell * numQuadPoints + + numDofsPerCell * q + + cellDofIndexI]; + const double gradShapeZIRef = + shapeFunctionGradientValues[numDofsPerCell * numQuadPoints * + 2 + + numDofsPerCell * q + + 
cellDofIndexI]; + const double gradShapeXJRef = + shapeFunctionGradientValues[numDofsPerCell * q + + cellDofIndexJ]; + const double gradShapeYJRef = + shapeFunctionGradientValues[numDofsPerCell * numQuadPoints + + numDofsPerCell * q + + cellDofIndexJ]; + const double gradShapeZJRef = + shapeFunctionGradientValues[numDofsPerCell * numQuadPoints * + 2 + + numDofsPerCell * q + + cellDofIndexJ]; + const double Jxx = inverseJacobianValues[cellIndex * 3 + 0]; + const double Jyy = inverseJacobianValues[cellIndex * 3 + 1]; + const double Jzz = inverseJacobianValues[cellIndex * 3 + 2]; + + gradShapeXI = gradShapeXIRef * Jxx; + gradShapeYI = gradShapeYIRef * Jyy; + gradShapeZI = gradShapeZIRef * Jzz; + gradShapeXJ = gradShapeXJRef * Jxx; + gradShapeYJ = gradShapeYJRef * Jyy; + gradShapeZJ = gradShapeZJRef * Jzz; + } val += @@ -462,12 +785,12 @@ namespace const unsigned int numQuadPoints, const double * shapeFunctionValues, const double * shapeFunctionValuesTransposed, - const double *shapeFunctionGradientValuesXTransposed, - const double *shapeFunctionGradientValuesYTransposed, - const double *shapeFunctionGradientValuesZTransposed, - const double *vEffPrimeJxW, - const double *JxW, - double * cellHamiltonianPrimeMatrixFlattened) + const double * shapeFunctionGradientValues, + const double * inverseJacobianValues, + const int areAllCellsAffineOrCartesianFlag, + const double * vEffPrimeJxW, + const double * JxW, + double *cellHamiltonianPrimeMatrixFlattened) { const unsigned int globalThreadId = blockIdx.x * blockDim.x + threadIdx.x; @@ -506,9 +829,9 @@ namespace const unsigned int numQuadPoints, const double * shapeFunctionValues, const double * shapeFunctionValuesTransposed, - const double * shapeFunctionGradientValuesXTransposed, - const double * shapeFunctionGradientValuesYTransposed, - const double * shapeFunctionGradientValuesZTransposed, + const double * shapeFunctionGradientValues, + const double * inverseJacobianValues, + const int areAllCellsAffineOrCartesianFlag, 
const double * vEffPrimeJxW, const double * JxW, dftfe::utils::deviceDoubleComplex *cellHamiltonianPrimeMatrixFlattened) @@ -553,9 +876,9 @@ namespace const unsigned int numQuadPoints, const double * shapeFunctionValues, const double * shapeFunctionValuesTransposed, - const double * shapeFunctionGradientValuesXTransposed, - const double * shapeFunctionGradientValuesYTransposed, - const double * shapeFunctionGradientValuesZTransposed, + const double * shapeFunctionGradientValues, + const double * inverseJacobianValues, + const int areAllCellsAffineOrCartesianFlag, const double * vEffPrimeJxW, const double * JxW, const double * derExcPrimeWithSigmaTimesGradRhoJxW, @@ -585,37 +908,157 @@ namespace const double shapeJ = shapeFunctionValuesTransposed[q * numDofsPerCell + cellDofIndexJ]; - const double gradShapeXI = - shapeFunctionGradientValuesXTransposed[cellIndex * numQuadPoints * - numDofsPerCell + - numDofsPerCell * q + - cellDofIndexI]; - const double gradShapeYI = - shapeFunctionGradientValuesYTransposed[cellIndex * numQuadPoints * - numDofsPerCell + - numDofsPerCell * q + - cellDofIndexI]; - const double gradShapeZI = - shapeFunctionGradientValuesZTransposed[cellIndex * numQuadPoints * - numDofsPerCell + - numDofsPerCell * q + - cellDofIndexI]; - - const double gradShapeXJ = - shapeFunctionGradientValuesXTransposed[cellIndex * numQuadPoints * - numDofsPerCell + - numDofsPerCell * q + - cellDofIndexJ]; - const double gradShapeYJ = - shapeFunctionGradientValuesYTransposed[cellIndex * numQuadPoints * - numDofsPerCell + - numDofsPerCell * q + - cellDofIndexJ]; - const double gradShapeZJ = - shapeFunctionGradientValuesZTransposed[cellIndex * numQuadPoints * - numDofsPerCell + - numDofsPerCell * q + - cellDofIndexJ]; + double gradShapeXI, gradShapeXJ, gradShapeYI, gradShapeYJ, + gradShapeZI, gradShapeZJ; + if (areAllCellsAffineOrCartesianFlag == 0) + { + const double gradShapeXIRef = + shapeFunctionGradientValues[numDofsPerCell * q * 3 + + cellDofIndexI]; + const 
double gradShapeYIRef = + shapeFunctionGradientValues[numDofsPerCell * q * 3 + + numDofsPerCell + cellDofIndexI]; + const double gradShapeZIRef = + shapeFunctionGradientValues[numDofsPerCell * q * 3 + + 2 * numDofsPerCell + + cellDofIndexI]; + const double gradShapeXJRef = + shapeFunctionGradientValues[numDofsPerCell * q * 3 + + cellDofIndexJ]; + const double gradShapeYJRef = + shapeFunctionGradientValues[numDofsPerCell * q * 3 + + numDofsPerCell + cellDofIndexJ]; + const double gradShapeZJRef = + shapeFunctionGradientValues[numDofsPerCell * q * 3 + + 2 * numDofsPerCell + + cellDofIndexJ]; + const double Jxx = + inverseJacobianValues[cellIndex * numQuadPoints * 9 + q * 9 + + 0]; + const double Jxy = + inverseJacobianValues[cellIndex * numQuadPoints * 9 + q * 9 + + 1]; + const double Jxz = + inverseJacobianValues[cellIndex * numQuadPoints * 9 + q * 9 + + 2]; + const double Jyx = + inverseJacobianValues[cellIndex * numQuadPoints * 9 + q * 9 + + 3]; + const double Jyy = + inverseJacobianValues[cellIndex * numQuadPoints * 9 + q * 9 + + 4]; + const double Jyz = + inverseJacobianValues[cellIndex * numQuadPoints * 9 + q * 9 + + 5]; + const double Jzx = + inverseJacobianValues[cellIndex * numQuadPoints * 9 + q * 9 + + 6]; + const double Jzy = + inverseJacobianValues[cellIndex * numQuadPoints * 9 + q * 9 + + 7]; + const double Jzz = + inverseJacobianValues[cellIndex * numQuadPoints * 9 + q * 9 + + 8]; + + gradShapeXI = gradShapeXIRef * Jxx + gradShapeYIRef * Jxy + + gradShapeZIRef * Jxz; + gradShapeYI = gradShapeXIRef * Jyx + gradShapeYIRef * Jyy + + gradShapeZIRef * Jyz; + gradShapeZI = gradShapeXIRef * Jzx + gradShapeYIRef * Jzy + + gradShapeZIRef * Jzz; + gradShapeXJ = gradShapeXJRef * Jxx + gradShapeYJRef * Jxy + + gradShapeZJRef * Jxz; + gradShapeYJ = gradShapeXJRef * Jyx + gradShapeYJRef * Jyy + + gradShapeZJRef * Jyz; + gradShapeZJ = gradShapeXJRef * Jzx + gradShapeYJRef * Jzy + + gradShapeZJRef * Jzz; + } + else if (areAllCellsAffineOrCartesianFlag == 1) + { + const 
double gradShapeXIRef = + shapeFunctionGradientValues[numDofsPerCell * q + + cellDofIndexI]; + const double gradShapeYIRef = + shapeFunctionGradientValues[numDofsPerCell * numQuadPoints + + numDofsPerCell * q + + cellDofIndexI]; + const double gradShapeZIRef = + shapeFunctionGradientValues[numDofsPerCell * numQuadPoints * + 2 + + numDofsPerCell * q + + cellDofIndexI]; + const double gradShapeXJRef = + shapeFunctionGradientValues[numDofsPerCell * q + + cellDofIndexJ]; + const double gradShapeYJRef = + shapeFunctionGradientValues[numDofsPerCell * numQuadPoints + + numDofsPerCell * q + + cellDofIndexJ]; + const double gradShapeZJRef = + shapeFunctionGradientValues[numDofsPerCell * numQuadPoints * + 2 + + numDofsPerCell * q + + cellDofIndexJ]; + const double Jxx = inverseJacobianValues[cellIndex * 9 + 0]; + const double Jxy = inverseJacobianValues[cellIndex * 9 + 1]; + const double Jxz = inverseJacobianValues[cellIndex * 9 + 2]; + const double Jyx = inverseJacobianValues[cellIndex * 9 + 3]; + const double Jyy = inverseJacobianValues[cellIndex * 9 + 4]; + const double Jyz = inverseJacobianValues[cellIndex * 9 + 5]; + const double Jzx = inverseJacobianValues[cellIndex * 9 + 6]; + const double Jzy = inverseJacobianValues[cellIndex * 9 + 7]; + const double Jzz = inverseJacobianValues[cellIndex * 9 + 8]; + + gradShapeXI = gradShapeXIRef * Jxx + gradShapeYIRef * Jxy + + gradShapeZIRef * Jxz; + gradShapeYI = gradShapeXIRef * Jyx + gradShapeYIRef * Jyy + + gradShapeZIRef * Jyz; + gradShapeZI = gradShapeXIRef * Jzx + gradShapeYIRef * Jzy + + gradShapeZIRef * Jzz; + gradShapeXJ = gradShapeXJRef * Jxx + gradShapeYJRef * Jxy + + gradShapeZJRef * Jxz; + gradShapeYJ = gradShapeXJRef * Jyx + gradShapeYJRef * Jyy + + gradShapeZJRef * Jyz; + gradShapeZJ = gradShapeXJRef * Jzx + gradShapeYJRef * Jzy + + gradShapeZJRef * Jzz; + } + else if (areAllCellsAffineOrCartesianFlag == 2) + { + const double gradShapeXIRef = + shapeFunctionGradientValues[numDofsPerCell * q + + cellDofIndexI]; + 
const double gradShapeYIRef = + shapeFunctionGradientValues[numDofsPerCell * numQuadPoints + + numDofsPerCell * q + + cellDofIndexI]; + const double gradShapeZIRef = + shapeFunctionGradientValues[numDofsPerCell * numQuadPoints * + 2 + + numDofsPerCell * q + + cellDofIndexI]; + const double gradShapeXJRef = + shapeFunctionGradientValues[numDofsPerCell * q + + cellDofIndexJ]; + const double gradShapeYJRef = + shapeFunctionGradientValues[numDofsPerCell * numQuadPoints + + numDofsPerCell * q + + cellDofIndexJ]; + const double gradShapeZJRef = + shapeFunctionGradientValues[numDofsPerCell * numQuadPoints * + 2 + + numDofsPerCell * q + + cellDofIndexJ]; + const double Jxx = inverseJacobianValues[cellIndex * 3 + 0]; + const double Jyy = inverseJacobianValues[cellIndex * 3 + 1]; + const double Jzz = inverseJacobianValues[cellIndex * 3 + 2]; + + gradShapeXI = gradShapeXIRef * Jxx; + gradShapeYI = gradShapeYIRef * Jyy; + gradShapeZI = gradShapeZIRef * Jzz; + gradShapeXJ = gradShapeXJRef * Jxx; + gradShapeYJ = gradShapeYJRef * Jyy; + gradShapeZJ = gradShapeZJRef * Jzz; + } val += @@ -647,9 +1090,9 @@ namespace const unsigned int numQuadPoints, const double * shapeFunctionValues, const double * shapeFunctionValuesTransposed, - const double * shapeFunctionGradientValuesXTransposed, - const double * shapeFunctionGradientValuesYTransposed, - const double * shapeFunctionGradientValuesZTransposed, + const double * shapeFunctionGradientValues, + const double * inverseJacobianValues, + const int areAllCellsAffineOrCartesianFlag, const double * vEffPrimeJxW, const double * JxW, const double * derExcPrimeWithSigmaTimesGradRhoJxW, @@ -679,37 +1122,157 @@ namespace const double shapeJ = shapeFunctionValuesTransposed[q * numDofsPerCell + cellDofIndexJ]; - const double gradShapeXI = - shapeFunctionGradientValuesXTransposed[cellIndex * numQuadPoints * - numDofsPerCell + - numDofsPerCell * q + - cellDofIndexI]; - const double gradShapeYI = - shapeFunctionGradientValuesYTransposed[cellIndex * 
numQuadPoints * - numDofsPerCell + - numDofsPerCell * q + - cellDofIndexI]; - const double gradShapeZI = - shapeFunctionGradientValuesZTransposed[cellIndex * numQuadPoints * - numDofsPerCell + - numDofsPerCell * q + - cellDofIndexI]; - - const double gradShapeXJ = - shapeFunctionGradientValuesXTransposed[cellIndex * numQuadPoints * - numDofsPerCell + - numDofsPerCell * q + - cellDofIndexJ]; - const double gradShapeYJ = - shapeFunctionGradientValuesYTransposed[cellIndex * numQuadPoints * - numDofsPerCell + - numDofsPerCell * q + - cellDofIndexJ]; - const double gradShapeZJ = - shapeFunctionGradientValuesZTransposed[cellIndex * numQuadPoints * - numDofsPerCell + - numDofsPerCell * q + - cellDofIndexJ]; + double gradShapeXI, gradShapeXJ, gradShapeYI, gradShapeYJ, + gradShapeZI, gradShapeZJ; + if (areAllCellsAffineOrCartesianFlag == 0) + { + const double gradShapeXIRef = + shapeFunctionGradientValues[numDofsPerCell * q * 3 + + cellDofIndexI]; + const double gradShapeYIRef = + shapeFunctionGradientValues[numDofsPerCell * q * 3 + + numDofsPerCell + cellDofIndexI]; + const double gradShapeZIRef = + shapeFunctionGradientValues[numDofsPerCell * q * 3 + + 2 * numDofsPerCell + + cellDofIndexI]; + const double gradShapeXJRef = + shapeFunctionGradientValues[numDofsPerCell * q * 3 + + cellDofIndexJ]; + const double gradShapeYJRef = + shapeFunctionGradientValues[numDofsPerCell * q * 3 + + numDofsPerCell + cellDofIndexJ]; + const double gradShapeZJRef = + shapeFunctionGradientValues[numDofsPerCell * q * 3 + + 2 * numDofsPerCell + + cellDofIndexJ]; + const double Jxx = + inverseJacobianValues[cellIndex * numQuadPoints * 9 + q * 9 + + 0]; + const double Jxy = + inverseJacobianValues[cellIndex * numQuadPoints * 9 + q * 9 + + 1]; + const double Jxz = + inverseJacobianValues[cellIndex * numQuadPoints * 9 + q * 9 + + 2]; + const double Jyx = + inverseJacobianValues[cellIndex * numQuadPoints * 9 + q * 9 + + 3]; + const double Jyy = + inverseJacobianValues[cellIndex * numQuadPoints * 9 + 
q * 9 + + 4]; + const double Jyz = + inverseJacobianValues[cellIndex * numQuadPoints * 9 + q * 9 + + 5]; + const double Jzx = + inverseJacobianValues[cellIndex * numQuadPoints * 9 + q * 9 + + 6]; + const double Jzy = + inverseJacobianValues[cellIndex * numQuadPoints * 9 + q * 9 + + 7]; + const double Jzz = + inverseJacobianValues[cellIndex * numQuadPoints * 9 + q * 9 + + 8]; + + gradShapeXI = gradShapeXIRef * Jxx + gradShapeYIRef * Jxy + + gradShapeZIRef * Jxz; + gradShapeYI = gradShapeXIRef * Jyx + gradShapeYIRef * Jyy + + gradShapeZIRef * Jyz; + gradShapeZI = gradShapeXIRef * Jzx + gradShapeYIRef * Jzy + + gradShapeZIRef * Jzz; + gradShapeXJ = gradShapeXJRef * Jxx + gradShapeYJRef * Jxy + + gradShapeZJRef * Jxz; + gradShapeYJ = gradShapeXJRef * Jyx + gradShapeYJRef * Jyy + + gradShapeZJRef * Jyz; + gradShapeZJ = gradShapeXJRef * Jzx + gradShapeYJRef * Jzy + + gradShapeZJRef * Jzz; + } + else if (areAllCellsAffineOrCartesianFlag == 1) + { + const double gradShapeXIRef = + shapeFunctionGradientValues[numDofsPerCell * q + + cellDofIndexI]; + const double gradShapeYIRef = + shapeFunctionGradientValues[numDofsPerCell * numQuadPoints + + numDofsPerCell * q + + cellDofIndexI]; + const double gradShapeZIRef = + shapeFunctionGradientValues[numDofsPerCell * numQuadPoints * + 2 + + numDofsPerCell * q + + cellDofIndexI]; + const double gradShapeXJRef = + shapeFunctionGradientValues[numDofsPerCell * q + + cellDofIndexJ]; + const double gradShapeYJRef = + shapeFunctionGradientValues[numDofsPerCell * numQuadPoints + + numDofsPerCell * q + + cellDofIndexJ]; + const double gradShapeZJRef = + shapeFunctionGradientValues[numDofsPerCell * numQuadPoints * + 2 + + numDofsPerCell * q + + cellDofIndexJ]; + const double Jxx = inverseJacobianValues[cellIndex * 9 + 0]; + const double Jxy = inverseJacobianValues[cellIndex * 9 + 1]; + const double Jxz = inverseJacobianValues[cellIndex * 9 + 2]; + const double Jyx = inverseJacobianValues[cellIndex * 9 + 3]; + const double Jyy = 
inverseJacobianValues[cellIndex * 9 + 4]; + const double Jyz = inverseJacobianValues[cellIndex * 9 + 5]; + const double Jzx = inverseJacobianValues[cellIndex * 9 + 6]; + const double Jzy = inverseJacobianValues[cellIndex * 9 + 7]; + const double Jzz = inverseJacobianValues[cellIndex * 9 + 8]; + + gradShapeXI = gradShapeXIRef * Jxx + gradShapeYIRef * Jxy + + gradShapeZIRef * Jxz; + gradShapeYI = gradShapeXIRef * Jyx + gradShapeYIRef * Jyy + + gradShapeZIRef * Jyz; + gradShapeZI = gradShapeXIRef * Jzx + gradShapeYIRef * Jzy + + gradShapeZIRef * Jzz; + gradShapeXJ = gradShapeXJRef * Jxx + gradShapeYJRef * Jxy + + gradShapeZJRef * Jxz; + gradShapeYJ = gradShapeXJRef * Jyx + gradShapeYJRef * Jyy + + gradShapeZJRef * Jyz; + gradShapeZJ = gradShapeXJRef * Jzx + gradShapeYJRef * Jzy + + gradShapeZJRef * Jzz; + } + else if (areAllCellsAffineOrCartesianFlag == 2) + { + const double gradShapeXIRef = + shapeFunctionGradientValues[numDofsPerCell * q + + cellDofIndexI]; + const double gradShapeYIRef = + shapeFunctionGradientValues[numDofsPerCell * numQuadPoints + + numDofsPerCell * q + + cellDofIndexI]; + const double gradShapeZIRef = + shapeFunctionGradientValues[numDofsPerCell * numQuadPoints * + 2 + + numDofsPerCell * q + + cellDofIndexI]; + const double gradShapeXJRef = + shapeFunctionGradientValues[numDofsPerCell * q + + cellDofIndexJ]; + const double gradShapeYJRef = + shapeFunctionGradientValues[numDofsPerCell * numQuadPoints + + numDofsPerCell * q + + cellDofIndexJ]; + const double gradShapeZJRef = + shapeFunctionGradientValues[numDofsPerCell * numQuadPoints * + 2 + + numDofsPerCell * q + + cellDofIndexJ]; + const double Jxx = inverseJacobianValues[cellIndex * 3 + 0]; + const double Jyy = inverseJacobianValues[cellIndex * 3 + 1]; + const double Jzz = inverseJacobianValues[cellIndex * 3 + 2]; + + gradShapeXI = gradShapeXIRef * Jxx; + gradShapeYI = gradShapeYIRef * Jyy; + gradShapeZI = gradShapeZIRef * Jzz; + gradShapeXJ = gradShapeXJRef * Jxx; + gradShapeYJ = 
gradShapeYJRef * Jyy; + gradShapeZJ = gradShapeZJRef * Jzz; + } val += @@ -815,9 +1378,10 @@ kohnShamDFTOperatorDeviceClass:: d_numQuadPoints, d_shapeFunctionValueDevice.begin(), d_shapeFunctionValueTransposedDevice.begin(), - d_shapeFunctionGradientValueXTransposedDevice.begin(), - d_shapeFunctionGradientValueYTransposedDevice.begin(), - d_shapeFunctionGradientValueZTransposedDevice.begin(), + basisOperationsPtrDevice->d_shapeFunctionGradientData.begin(), + basisOperationsPtrDevice->d_inverseJacobianData.begin(), + (int)basisOperationsPtrDevice->areAllCellsAffine + + (int)basisOperationsPtrDevice->areAllCellsCartesian, d_vEffJxWDevice.begin(), d_cellJxWValuesDevice.begin(), d_derExcWithSigmaTimesGradRhoJxWDevice.begin(), @@ -840,9 +1404,10 @@ kohnShamDFTOperatorDeviceClass:: d_numQuadPoints, d_shapeFunctionValueDevice.begin(), d_shapeFunctionValueTransposedDevice.begin(), - d_shapeFunctionGradientValueXTransposedDevice.begin(), - d_shapeFunctionGradientValueYTransposedDevice.begin(), - d_shapeFunctionGradientValueZTransposedDevice.begin(), + basisOperationsPtrDevice->d_shapeFunctionGradientData.begin(), + basisOperationsPtrDevice->d_inverseJacobianData.begin(), + (int)basisOperationsPtrDevice->areAllCellsAffine + + (int)basisOperationsPtrDevice->areAllCellsCartesian, d_vEffJxWDevice.begin(), d_cellJxWValuesDevice.begin(), d_derExcWithSigmaTimesGradRhoJxWDevice.begin(), @@ -865,9 +1430,10 @@ kohnShamDFTOperatorDeviceClass:: d_numQuadPoints, d_shapeFunctionValueDevice.begin(), d_shapeFunctionValueTransposedDevice.begin(), - d_shapeFunctionGradientValueXTransposedDevice.begin(), - d_shapeFunctionGradientValueYTransposedDevice.begin(), - d_shapeFunctionGradientValueZTransposedDevice.begin(), + basisOperationsPtrDevice->d_shapeFunctionGradientData.begin(), + basisOperationsPtrDevice->d_inverseJacobianData.begin(), + (int)basisOperationsPtrDevice->areAllCellsAffine + + (int)basisOperationsPtrDevice->areAllCellsCartesian, d_vEffJxWDevice.begin(), 
d_cellJxWValuesDevice.begin(), dftfe::utils::makeDataTypeDeviceCompatible( @@ -889,9 +1455,10 @@ kohnShamDFTOperatorDeviceClass:: d_numQuadPoints, d_shapeFunctionValueDevice.begin(), d_shapeFunctionValueTransposedDevice.begin(), - d_shapeFunctionGradientValueXTransposedDevice.begin(), - d_shapeFunctionGradientValueYTransposedDevice.begin(), - d_shapeFunctionGradientValueZTransposedDevice.begin(), + basisOperationsPtrDevice->d_shapeFunctionGradientData.begin(), + basisOperationsPtrDevice->d_inverseJacobianData.begin(), + (int)basisOperationsPtrDevice->areAllCellsAffine + + (int)basisOperationsPtrDevice->areAllCellsCartesian, d_vEffJxWDevice.begin(), d_cellJxWValuesDevice.begin(), dftfe::utils::makeDataTypeDeviceCompatible( @@ -919,9 +1486,10 @@ kohnShamDFTOperatorDeviceClass:: dftPtr->d_kPointWeights.size(), d_shapeFunctionValueDevice.begin(), d_shapeFunctionValueTransposedDevice.begin(), - d_shapeFunctionGradientValueXTransposedDevice.begin(), - d_shapeFunctionGradientValueYTransposedDevice.begin(), - d_shapeFunctionGradientValueZTransposedDevice.begin(), + basisOperationsPtrDevice->d_shapeFunctionGradientData.begin(), + basisOperationsPtrDevice->d_inverseJacobianData.begin(), + (int)basisOperationsPtrDevice->areAllCellsAffine + + (int)basisOperationsPtrDevice->areAllCellsCartesian, d_cellShapeFunctionGradientIntegralFlattenedDevice.begin(), d_vEffJxWDevice.begin(), d_cellJxWValuesDevice.begin(), @@ -951,9 +1519,10 @@ kohnShamDFTOperatorDeviceClass:: dftPtr->d_kPointWeights.size(), d_shapeFunctionValueDevice.begin(), d_shapeFunctionValueTransposedDevice.begin(), - d_shapeFunctionGradientValueXTransposedDevice.begin(), - d_shapeFunctionGradientValueYTransposedDevice.begin(), - d_shapeFunctionGradientValueZTransposedDevice.begin(), + basisOperationsPtrDevice->d_shapeFunctionGradientData.begin(), + basisOperationsPtrDevice->d_inverseJacobianData.begin(), + (int)basisOperationsPtrDevice->areAllCellsAffine + + (int)basisOperationsPtrDevice->areAllCellsCartesian, 
d_cellShapeFunctionGradientIntegralFlattenedDevice.begin(), d_vEffJxWDevice.begin(), d_cellJxWValuesDevice.begin(), @@ -982,9 +1551,10 @@ kohnShamDFTOperatorDeviceClass:: dftPtr->d_kPointWeights.size(), d_shapeFunctionValueDevice.begin(), d_shapeFunctionValueTransposedDevice.begin(), - d_shapeFunctionGradientValueXTransposedDevice.begin(), - d_shapeFunctionGradientValueYTransposedDevice.begin(), - d_shapeFunctionGradientValueZTransposedDevice.begin(), + basisOperationsPtrDevice->d_shapeFunctionGradientData.begin(), + basisOperationsPtrDevice->d_inverseJacobianData.begin(), + (int)basisOperationsPtrDevice->areAllCellsAffine + + (int)basisOperationsPtrDevice->areAllCellsCartesian, d_cellShapeFunctionGradientIntegralFlattenedDevice.begin(), d_vEffJxWDevice.begin(), d_cellJxWValuesDevice.begin(), @@ -1013,9 +1583,10 @@ kohnShamDFTOperatorDeviceClass:: dftPtr->d_kPointWeights.size(), d_shapeFunctionValueDevice.begin(), d_shapeFunctionValueTransposedDevice.begin(), - d_shapeFunctionGradientValueXTransposedDevice.begin(), - d_shapeFunctionGradientValueYTransposedDevice.begin(), - d_shapeFunctionGradientValueZTransposedDevice.begin(), + basisOperationsPtrDevice->d_shapeFunctionGradientData.begin(), + basisOperationsPtrDevice->d_inverseJacobianData.begin(), + (int)basisOperationsPtrDevice->areAllCellsAffine + + (int)basisOperationsPtrDevice->areAllCellsCartesian, d_cellShapeFunctionGradientIntegralFlattenedDevice.begin(), d_vEffJxWDevice.begin(), d_cellJxWValuesDevice.begin(), diff --git a/src/dftOperator/kohnShamDFTOperatorDevice.cc b/src/dftOperator/kohnShamDFTOperatorDevice.cc index 0485516e3..7ec359a43 100644 --- a/src/dftOperator/kohnShamDFTOperatorDevice.cc +++ b/src/dftOperator/kohnShamDFTOperatorDevice.cc @@ -386,29 +386,29 @@ namespace dftfe return d_shapeFunctionValueNLPTransposedDevice; } - template - dftfe::utils::MemoryStorage & - kohnShamDFTOperatorDeviceClass:: - getShapeFunctionGradientValuesXTransposed() - { - return 
d_shapeFunctionGradientValueXTransposedDevice; - } - - template - dftfe::utils::MemoryStorage & - kohnShamDFTOperatorDeviceClass:: - getShapeFunctionGradientValuesYTransposed() - { - return d_shapeFunctionGradientValueYTransposedDevice; - } - - template - dftfe::utils::MemoryStorage & - kohnShamDFTOperatorDeviceClass:: - getShapeFunctionGradientValuesZTransposed() - { - return d_shapeFunctionGradientValueZTransposedDevice; - } + // template + // dftfe::utils::MemoryStorage & + // kohnShamDFTOperatorDeviceClass:: + // getShapeFunctionGradientValuesXTransposed() + // { + // return d_shapeFunctionGradientValueXTransposedDevice; + // } + + // template + // dftfe::utils::MemoryStorage & + // kohnShamDFTOperatorDeviceClass:: + // getShapeFunctionGradientValuesYTransposed() + // { + // return d_shapeFunctionGradientValueYTransposedDevice; + // } + + // template + // dftfe::utils::MemoryStorage & + // kohnShamDFTOperatorDeviceClass:: + // getShapeFunctionGradientValuesZTransposed() + // { + // return d_shapeFunctionGradientValueZTransposedDevice; + // } template dftfe::utils::MemoryStorage & @@ -495,6 +495,18 @@ namespace dftfe { computing_timer.enter_subsection("kohnShamDFTOperatorDeviceClass setup"); + basisOperationsPtrDevice = std::make_unique< + dftfe::basis:: + FEBasisOperations>( + dftPtr->matrix_free_data, dftPtr->d_constraintsVector); + basisOperationsPtrHOST = std::make_unique< + dftfe::basis:: + FEBasisOperations>( + dftPtr->matrix_free_data, dftPtr->d_constraintsVector); + dftfe::basis::UpdateFlags updateFlags = + dftfe::basis::update_values | dftfe::basis::update_gradients; + basisOperationsPtrDevice->reinit(0, 0, 0, 0, updateFlags); + basisOperationsPtrHOST->reinit(0, 0, 0, 0, updateFlags); dftPtr->matrix_free_data.initialize_dof_vector( d_invSqrtMassVector, dftPtr->d_densityDofHandlerIndex); diff --git a/src/dftOperator/shapeFunctionDataCalculatorDevice.cc b/src/dftOperator/shapeFunctionDataCalculatorDevice.cc index 24a144115..ffdae8006 100644 --- 
a/src/dftOperator/shapeFunctionDataCalculatorDevice.cc +++ b/src/dftOperator/shapeFunctionDataCalculatorDevice.cc @@ -380,32 +380,35 @@ kohnShamDFTOperatorDeviceClass:: numberDofsPerElement, 0.0); - d_shapeFunctionGradientValueX.resize(numberPhysicalCells * - numberQuadraturePoints * - numberDofsPerElement, - 0.0); - d_shapeFunctionGradientValueXTransposed.resize(numberPhysicalCells * - numberQuadraturePoints * - numberDofsPerElement, - 0.0); - - d_shapeFunctionGradientValueY.resize(numberPhysicalCells * - numberQuadraturePoints * - numberDofsPerElement, - 0.0); - d_shapeFunctionGradientValueYTransposed.resize(numberPhysicalCells * - numberQuadraturePoints * - numberDofsPerElement, - 0.0); - - d_shapeFunctionGradientValueZ.resize(numberPhysicalCells * - numberQuadraturePoints * - numberDofsPerElement, - 0.0); - d_shapeFunctionGradientValueZTransposed.resize(numberPhysicalCells * - numberQuadraturePoints * - numberDofsPerElement, - 0.0); + // d_shapeFunctionGradientValueX.resize(numberPhysicalCells * + // numberQuadraturePoints * + // numberDofsPerElement, + // 0.0); + // d_shapeFunctionGradientValueXTransposed.resize(numberPhysicalCells * + // numberQuadraturePoints + // * + // numberDofsPerElement, + // 0.0); + + // d_shapeFunctionGradientValueY.resize(numberPhysicalCells * + // numberQuadraturePoints * + // numberDofsPerElement, + // 0.0); + // d_shapeFunctionGradientValueYTransposed.resize(numberPhysicalCells * + // numberQuadraturePoints + // * + // numberDofsPerElement, + // 0.0); + + // d_shapeFunctionGradientValueZ.resize(numberPhysicalCells * + // numberQuadraturePoints * + // numberDofsPerElement, + // 0.0); + // d_shapeFunctionGradientValueZTransposed.resize(numberPhysicalCells * + // numberQuadraturePoints + // * + // numberDofsPerElement, + // 0.0); std::vector shapeFunctionValueLpsp(numberQuadraturePointsLpsp * numberDofsPerElement, @@ -435,37 +438,45 @@ kohnShamDFTOperatorDeviceClass:: d_cellJxWValues[iElem * numberQuadraturePoints + q_point] = 
fe_values.JxW(q_point); - for (unsigned int iNode = 0; iNode < numberDofsPerElement; ++iNode) - for (unsigned int q_point = 0; q_point < numberQuadraturePoints; - ++q_point) - { - const dealii::Tensor<1, 3, double> &shape_grad = - fe_values.shape_grad(iNode, q_point); - - d_shapeFunctionGradientValueX[iElem * numberDofsPerElement * - numberQuadraturePoints + - iNode * numberQuadraturePoints + - q_point] = shape_grad[0]; - d_shapeFunctionGradientValueXTransposed - [iElem * numberQuadraturePoints * numberDofsPerElement + - q_point * numberDofsPerElement + iNode] = shape_grad[0]; - - d_shapeFunctionGradientValueY[iElem * numberDofsPerElement * - numberQuadraturePoints + - iNode * numberQuadraturePoints + - q_point] = shape_grad[1]; - d_shapeFunctionGradientValueYTransposed - [iElem * numberQuadraturePoints * numberDofsPerElement + - q_point * numberDofsPerElement + iNode] = shape_grad[1]; - - d_shapeFunctionGradientValueZ[iElem * numberDofsPerElement * - numberQuadraturePoints + - iNode * numberQuadraturePoints + - q_point] = shape_grad[2]; - d_shapeFunctionGradientValueZTransposed - [iElem * numberQuadraturePoints * numberDofsPerElement + - q_point * numberDofsPerElement + iNode] = shape_grad[2]; - } + // for (unsigned int iNode = 0; iNode < numberDofsPerElement; + // ++iNode) + // for (unsigned int q_point = 0; q_point < + // numberQuadraturePoints; + // ++q_point) + // { + // const dealii::Tensor<1, 3, double> &shape_grad = + // fe_values.shape_grad(iNode, q_point); + + // d_shapeFunctionGradientValueX[iElem * numberDofsPerElement + // * + // numberQuadraturePoints + + // iNode * + // numberQuadraturePoints + + // q_point] = shape_grad[0]; + // d_shapeFunctionGradientValueXTransposed + // [iElem * numberQuadraturePoints * numberDofsPerElement + + // q_point * numberDofsPerElement + iNode] = shape_grad[0]; + + // d_shapeFunctionGradientValueY[iElem * numberDofsPerElement + // * + // numberQuadraturePoints + + // iNode * + // numberQuadraturePoints + + // q_point] = 
shape_grad[1]; + // d_shapeFunctionGradientValueYTransposed + // [iElem * numberQuadraturePoints * numberDofsPerElement + + // q_point * numberDofsPerElement + iNode] = shape_grad[1]; + + // d_shapeFunctionGradientValueZ[iElem * numberDofsPerElement + // * + // numberQuadraturePoints + + // iNode * + // numberQuadraturePoints + + // q_point] = shape_grad[2]; + // d_shapeFunctionGradientValueZTransposed + // [iElem * numberQuadraturePoints * numberDofsPerElement + + // q_point * numberDofsPerElement + iNode] = shape_grad[2]; + // } if (iElem == 0) { @@ -512,20 +523,20 @@ kohnShamDFTOperatorDeviceClass:: d_shapeFunctionValueTransposedDevice.copyFrom( d_shapeFunctionValueTransposed); - d_shapeFunctionGradientValueXTransposedDevice.resize( - d_shapeFunctionGradientValueXTransposed.size()); - d_shapeFunctionGradientValueXTransposedDevice.copyFrom( - d_shapeFunctionGradientValueXTransposed); + // d_shapeFunctionGradientValueXTransposedDevice.resize( + // d_shapeFunctionGradientValueXTransposed.size()); + // d_shapeFunctionGradientValueXTransposedDevice.copyFrom( + // d_shapeFunctionGradientValueXTransposed); - d_shapeFunctionGradientValueYTransposedDevice.resize( - d_shapeFunctionGradientValueYTransposed.size()); - d_shapeFunctionGradientValueYTransposedDevice.copyFrom( - d_shapeFunctionGradientValueYTransposed); + // d_shapeFunctionGradientValueYTransposedDevice.resize( + // d_shapeFunctionGradientValueYTransposed.size()); + // d_shapeFunctionGradientValueYTransposedDevice.copyFrom( + // d_shapeFunctionGradientValueYTransposed); - d_shapeFunctionGradientValueZTransposedDevice.resize( - d_shapeFunctionGradientValueZTransposed.size()); - d_shapeFunctionGradientValueZTransposedDevice.copyFrom( - d_shapeFunctionGradientValueZTransposed); + // d_shapeFunctionGradientValueZTransposedDevice.resize( + // d_shapeFunctionGradientValueZTransposed.size()); + // d_shapeFunctionGradientValueZTransposedDevice.copyFrom( + // d_shapeFunctionGradientValueZTransposed); 
d_shapeFunctionValueLpspDevice.resize(shapeFunctionValueLpsp.size()); d_shapeFunctionValueLpspDevice.copyFrom(shapeFunctionValueLpsp); diff --git a/src/force/configurationalForceCompute/configurationalForceEEshelbyFPSPFnlLinFE.cc b/src/force/configurationalForceCompute/configurationalForceEEshelbyFPSPFnlLinFE.cc index bc07edf27..3cf5db1f2 100644 --- a/src/force/configurationalForceCompute/configurationalForceEEshelbyFPSPFnlLinFE.cc +++ b/src/force/configurationalForceCompute/configurationalForceEEshelbyFPSPFnlLinFE.cc @@ -250,6 +250,7 @@ namespace dftfe double device_time = MPI_Wtime(); forceDevice::wfcContractionsForceKernelsAllH( + dftPtr->basisOperationsPtrDevice, kohnShamDFTEigenOperatorDevice, dftPtr->d_eigenVectorsFlattenedDevice.begin(), d_dftParams.spinPolarized, diff --git a/src/force/configurationalStressCompute/computeStressEEshelbyEPSPEnlEk.cc b/src/force/configurationalStressCompute/computeStressEEshelbyEPSPEnlEk.cc index 08c534633..10c40a236 100644 --- a/src/force/configurationalStressCompute/computeStressEEshelbyEPSPEnlEk.cc +++ b/src/force/configurationalStressCompute/computeStressEEshelbyEPSPEnlEk.cc @@ -244,6 +244,7 @@ namespace dftfe double device_time = MPI_Wtime(); forceDevice::wfcContractionsForceKernelsAllH( + dftPtr->basisOperationsPtrDevice, kohnShamDFTEigenOperatorDevice, dftPtr->d_eigenVectorsFlattenedDevice.begin(), d_dftParams.spinPolarized, diff --git a/src/force/forceWfcContractionsDevice.cc b/src/force/forceWfcContractionsDevice.cc index 985eb75c8..af840f6da 100644 --- a/src/force/forceWfcContractionsDevice.cc +++ b/src/force/forceWfcContractionsDevice.cc @@ -41,9 +41,7 @@ namespace dftfe const unsigned int numContiguousBlocks, const unsigned int numQuads, const double * psiQuadValues, - const double * gradPsiQuadValuesX, - const double * gradPsiQuadValuesY, - const double * gradPsiQuadValuesZ, + const double * gradPsiQuadValues, const double * eigenValues, const double * partialOccupancies, double * eshelbyTensor) @@ -66,10 +64,16 
@@ namespace dftfe const unsigned int tempIndex = (cellIndex)*numQuads * contiguousBlockSize + quadId * contiguousBlockSize + intraBlockIndex; - const double psi = psiQuadValues[tempIndex]; - const double gradPsiX = gradPsiQuadValuesX[tempIndex]; - const double gradPsiY = gradPsiQuadValuesY[tempIndex]; - const double gradPsiZ = gradPsiQuadValuesZ[tempIndex]; + const unsigned int tempIndex2 = + (cellIndex)*numQuads * contiguousBlockSize * 3 + + quadId * contiguousBlockSize + intraBlockIndex; + const double psi = psiQuadValues[tempIndex]; + const double gradPsiX = gradPsiQuadValues[tempIndex2]; + const double gradPsiY = + gradPsiQuadValues[tempIndex2 + numQuads * contiguousBlockSize]; + const double gradPsiZ = + gradPsiQuadValues[tempIndex2 + + 2 * numQuads * contiguousBlockSize]; const double eigenValue = eigenValues[intraBlockIndex]; const double partOcc = partialOccupancies[intraBlockIndex]; @@ -109,9 +113,7 @@ namespace dftfe const unsigned int numContiguousBlocks, const unsigned int numQuads, const dftfe::utils::deviceDoubleComplex *psiQuadValues, - const dftfe::utils::deviceDoubleComplex *gradPsiQuadValuesX, - const dftfe::utils::deviceDoubleComplex *gradPsiQuadValuesY, - const dftfe::utils::deviceDoubleComplex *gradPsiQuadValuesZ, + const dftfe::utils::deviceDoubleComplex *gradPsiQuadValues, const double * eigenValues, const double * partialOccupancies, const double kcoordx, @@ -138,22 +140,29 @@ namespace dftfe const unsigned int tempIndex = (cellIndex)*numQuads * contiguousBlockSize + quadId * contiguousBlockSize + intraBlockIndex; + const unsigned int tempIndex2 = + (cellIndex)*numQuads * contiguousBlockSize * 3 + + quadId * contiguousBlockSize + intraBlockIndex; const dftfe::utils::deviceDoubleComplex psi = psiQuadValues[tempIndex]; const dftfe::utils::deviceDoubleComplex psiConj = dftfe::utils::conj(psiQuadValues[tempIndex]); const dftfe::utils::deviceDoubleComplex gradPsiX = - gradPsiQuadValuesX[tempIndex]; + gradPsiQuadValues[tempIndex2]; const 
dftfe::utils::deviceDoubleComplex gradPsiY = - gradPsiQuadValuesY[tempIndex]; + gradPsiQuadValues[tempIndex2 + numQuads * contiguousBlockSize]; const dftfe::utils::deviceDoubleComplex gradPsiZ = - gradPsiQuadValuesZ[tempIndex]; + gradPsiQuadValues[tempIndex2 + + 2 * numQuads * contiguousBlockSize]; const dftfe::utils::deviceDoubleComplex gradPsiXConj = - dftfe::utils::conj(gradPsiQuadValuesX[tempIndex]); + dftfe::utils::conj(gradPsiQuadValues[tempIndex2]); const dftfe::utils::deviceDoubleComplex gradPsiYConj = - dftfe::utils::conj(gradPsiQuadValuesY[tempIndex]); + dftfe::utils::conj( + gradPsiQuadValues[tempIndex2 + numQuads * contiguousBlockSize]); const dftfe::utils::deviceDoubleComplex gradPsiZConj = - dftfe::utils::conj(gradPsiQuadValuesZ[tempIndex]); + dftfe::utils::conj( + gradPsiQuadValues[tempIndex2 + + 2 * numQuads * contiguousBlockSize]); const double eigenValue = eigenValues[intraBlockIndex]; const double partOcc = partialOccupancies[intraBlockIndex]; @@ -410,6 +419,11 @@ namespace dftfe void interpolatePsiComputeELocWfcEshelbyTensorD( + std::unique_ptr< + dftfe::basis::FEBasisOperations> + & basisOperationsPtr, operatorDFTDeviceClass & operatorMatrix, distributedDeviceVec &Xb, const unsigned int BVec, @@ -437,13 +451,7 @@ namespace dftfe &psiQuadsFlatD, dftfe::utils::MemoryStorage - &gradPsiQuadsXFlatD, - dftfe::utils::MemoryStorage - &gradPsiQuadsYFlatD, - dftfe::utils::MemoryStorage - &gradPsiQuadsZFlatD, + &gradPsiQuadsFlatD, #ifdef USE_COMPLEX dftfe::utils::MemoryStorage @@ -463,31 +471,34 @@ namespace dftfe dftfe::utils::MemoryStorage &cellWaveFunctionMatrix = operatorMatrix.getCellWaveFunctionMatrix(); + dftfe::basis::UpdateFlags updateFlags = + dftfe::basis::update_values | dftfe::basis::update_gradients; + basisOperationsPtr->reinit(BVec, cellsBlockSize, 0, 0, updateFlags); - dftfe::utils::deviceKernelsGeneric::stridedCopyToBlock( - BVec, - numCells * numNodesPerElement, - Xb.begin(), - cellWaveFunctionMatrix.begin(), - 
(operatorMatrix.getFlattenedArrayCellLocalProcIndexIdMap()).begin()); + // dftfe::utils::deviceKernelsGeneric::stridedCopyToBlock( + // BVec, + // numCells * numNodesPerElement, + // Xb.begin(), + // cellWaveFunctionMatrix.begin(), + // (operatorMatrix.getFlattenedArrayCellLocalProcIndexIdMap()).begin()); const int blockSize = cellsBlockSize; const int numberBlocks = numCells / blockSize; const int remBlockSize = numCells - numberBlocks * blockSize; - dftfe::utils::MemoryStorage - shapeFunctionValuesReferenceD(numQuads * numNodesPerElement, - dataTypes::number(0.0)); + // dftfe::utils::MemoryStorage + // shapeFunctionValuesReferenceD(numQuads * numNodesPerElement, + // dataTypes::number(0.0)); dftfe::utils::MemoryStorage shapeFunctionValuesNLPReferenceD(numQuadsNLP * numNodesPerElement, dataTypes::number(0.0)); - dftfe::utils::deviceKernelsGeneric::copyValueType1ArrToValueType2Arr( - numQuads * numNodesPerElement, - (operatorMatrix.getShapeFunctionValuesTransposed()).begin(), - shapeFunctionValuesReferenceD.begin()); + // dftfe::utils::deviceKernelsGeneric::copyValueType1ArrToValueType2Arr( + // numQuads * numNodesPerElement, + // (operatorMatrix.getShapeFunctionValuesTransposed()).begin(), + // shapeFunctionValuesReferenceD.begin()); dftfe::utils::deviceKernelsGeneric::copyValueType1ArrToValueType2Arr( @@ -495,23 +506,23 @@ namespace dftfe (operatorMatrix.getShapeFunctionValuesNLPTransposed()).begin(), shapeFunctionValuesNLPReferenceD.begin()); - dftfe::utils::MemoryStorage - shapeFunctionGradientValuesXTransposedDevice(blockSize * numQuads * - numNodesPerElement, - dataTypes::number(0.0)); + // dftfe::utils::MemoryStorage + // shapeFunctionGradientValuesXTransposedDevice(blockSize * numQuads * + // numNodesPerElement, + // dataTypes::number(0.0)); - dftfe::utils::MemoryStorage - shapeFunctionGradientValuesYTransposedDevice(blockSize * numQuads * - numNodesPerElement, - dataTypes::number(0.0)); + // dftfe::utils::MemoryStorage + // 
shapeFunctionGradientValuesYTransposedDevice(blockSize * numQuads * + // numNodesPerElement, + // dataTypes::number(0.0)); - dftfe::utils::MemoryStorage - shapeFunctionGradientValuesZTransposedDevice(blockSize * numQuads * - numNodesPerElement, - dataTypes::number(0.0)); + // dftfe::utils::MemoryStorage + // shapeFunctionGradientValuesZTransposedDevice(blockSize * numQuads * + // numNodesPerElement, + // dataTypes::number(0.0)); dftfe::utils::MemoryStorage shapeFunctionGradientValuesNLPReferenceD(blockSize * numQuadsNLP * 3 * @@ -535,6 +546,10 @@ namespace dftfe 0, i * numQuadsNLP * 3 * numNodesPerElement); + basisOperationsPtr->extractToCellNodalDataKernel( + Xb, + cellWaveFunctionMatrix.data(), + std::pair(0, numCells)); for (int iblock = 0; iblock < (numberBlocks + 1); iblock++) @@ -558,120 +573,12 @@ namespace dftfe if (!isFloatingChargeForces) { - dftfe::utils::deviceBlasWrapper::gemmStridedBatched( - operatorMatrix.getDeviceBlasHandle(), - dftfe::utils::DEVICEBLAS_OP_N, - dftfe::utils::DEVICEBLAS_OP_N, - BVec, - numQuads, - numNodesPerElement, - &scalarCoeffAlpha, - cellWaveFunctionMatrix.begin() + - startingId * numNodesPerElement * BVec, - BVec, - strideA, - shapeFunctionValuesReferenceD.begin(), - numNodesPerElement, - strideB, - &scalarCoeffBeta, - psiQuadsFlatD.begin(), - BVec, - strideC, - currentBlockSize); - - strideB = numNodesPerElement * numQuads; - - dftfe::utils::deviceKernelsGeneric:: - copyValueType1ArrToValueType2Arr( - currentBlockSize * numQuads * numNodesPerElement, - (operatorMatrix - .getShapeFunctionGradientValuesXTransposed()) - .begin() + - startingId * numQuads * numNodesPerElement, - shapeFunctionGradientValuesXTransposedDevice.begin()); - - dftfe::utils::deviceBlasWrapper::gemmStridedBatched( - operatorMatrix.getDeviceBlasHandle(), - dftfe::utils::DEVICEBLAS_OP_N, - dftfe::utils::DEVICEBLAS_OP_N, - BVec, - numQuads, - numNodesPerElement, - &scalarCoeffAlpha, - cellWaveFunctionMatrix.begin() + - startingId * numNodesPerElement * 
BVec, - BVec, - strideA, - shapeFunctionGradientValuesXTransposedDevice.begin(), - numNodesPerElement, - strideB, - &scalarCoeffBeta, - gradPsiQuadsXFlatD.begin(), - BVec, - strideC, - currentBlockSize); - - - dftfe::utils::deviceKernelsGeneric:: - copyValueType1ArrToValueType2Arr( - currentBlockSize * numQuads * numNodesPerElement, - (operatorMatrix - .getShapeFunctionGradientValuesYTransposed()) - .begin() + - startingId * numQuads * numNodesPerElement, - shapeFunctionGradientValuesYTransposedDevice.begin()); - - dftfe::utils::deviceBlasWrapper::gemmStridedBatched( - operatorMatrix.getDeviceBlasHandle(), - dftfe::utils::DEVICEBLAS_OP_N, - dftfe::utils::DEVICEBLAS_OP_N, - BVec, - numQuads, - numNodesPerElement, - &scalarCoeffAlpha, - cellWaveFunctionMatrix.begin() + - startingId * numNodesPerElement * BVec, - BVec, - strideA, - shapeFunctionGradientValuesYTransposedDevice.begin(), - numNodesPerElement, - strideB, - &scalarCoeffBeta, - gradPsiQuadsYFlatD.begin(), - BVec, - strideC, - currentBlockSize); - - dftfe::utils::deviceKernelsGeneric:: - copyValueType1ArrToValueType2Arr( - currentBlockSize * numQuads * numNodesPerElement, - (operatorMatrix - .getShapeFunctionGradientValuesZTransposed()) - .begin() + - startingId * numQuads * numNodesPerElement, - shapeFunctionGradientValuesZTransposedDevice.begin()); - - dftfe::utils::deviceBlasWrapper::gemmStridedBatched( - operatorMatrix.getDeviceBlasHandle(), - dftfe::utils::DEVICEBLAS_OP_N, - dftfe::utils::DEVICEBLAS_OP_N, - BVec, - numQuads, - numNodesPerElement, - &scalarCoeffAlpha, - cellWaveFunctionMatrix.begin() + - startingId * numNodesPerElement * BVec, - BVec, - strideA, - shapeFunctionGradientValuesZTransposedDevice.begin(), - numNodesPerElement, - strideB, - &scalarCoeffBeta, - gradPsiQuadsZFlatD.begin(), - BVec, - strideC, - currentBlockSize); - + basisOperationsPtr->interpolateKernel( + cellWaveFunctionMatrix.data(), + psiQuadsFlatD.data(), + gradPsiQuadsFlatD.begin(), + std::pair( + startingId, startingId + 
currentBlockSize)); #ifdef DFTFE_WITH_DEVICE_LANG_CUDA computeELocWfcEshelbyTensorContributions<<< (BVec + (dftfe::utils::DEVICE_BLOCK_SIZE - 1)) / @@ -684,11 +591,7 @@ namespace dftfe dftfe::utils::makeDataTypeDeviceCompatible( psiQuadsFlatD.begin()), dftfe::utils::makeDataTypeDeviceCompatible( - gradPsiQuadsXFlatD.begin()), - dftfe::utils::makeDataTypeDeviceCompatible( - gradPsiQuadsYFlatD.begin()), - dftfe::utils::makeDataTypeDeviceCompatible( - gradPsiQuadsZFlatD.begin()), + gradPsiQuadsFlatD.begin()), eigenValuesD.begin(), partialOccupanciesD.begin(), # ifdef USE_COMPLEX @@ -717,11 +620,7 @@ namespace dftfe dftfe::utils::makeDataTypeDeviceCompatible( psiQuadsFlatD.begin()), dftfe::utils::makeDataTypeDeviceCompatible( - gradPsiQuadsXFlatD.begin()), - dftfe::utils::makeDataTypeDeviceCompatible( - gradPsiQuadsYFlatD.begin()), - dftfe::utils::makeDataTypeDeviceCompatible( - gradPsiQuadsZFlatD.begin()), + gradPsiQuadsFlatD.begin()), eigenValuesD.begin(), partialOccupanciesD.begin(), # ifdef USE_COMPLEX @@ -1069,6 +968,11 @@ namespace dftfe void devicePortedForceKernelsAllD( + std::unique_ptr< + dftfe::basis::FEBasisOperations> + & basisOperationsPtr, operatorDFTDeviceClass & operatorMatrix, distributedDeviceVec &deviceFlattenedArrayBlock, distributedDeviceVec &projectorKetTimesVectorD, @@ -1109,13 +1013,7 @@ namespace dftfe &psiQuadsFlatD, dftfe::utils::MemoryStorage - &gradPsiQuadsXFlatD, - dftfe::utils::MemoryStorage - &gradPsiQuadsYFlatD, - dftfe::utils::MemoryStorage - &gradPsiQuadsZFlatD, + &gradPsiQuadsFlatD, #ifdef USE_COMPLEX dftfe::utils::MemoryStorage @@ -1154,11 +1052,13 @@ namespace dftfe // int this_process; // MPI_Comm_rank(d_mpiCommParent, &this_process); - const unsigned int M = operatorMatrix.getMatrixFreeData() - ->get_vector_partitioner() - ->local_size(); dftfe::utils::deviceKernelsGeneric::stridedCopyToBlockConstantStride( - numPsi, N, M, startingVecId, X, deviceFlattenedArrayBlock.begin()); + numPsi, + N, + 
basisOperationsPtr->d_locallyOwnedSize, + startingVecId, + X, + deviceFlattenedArrayBlock.begin()); deviceFlattenedArrayBlock.updateGhostValues(); (operatorMatrix.getOverloadedConstraintMatrix()) @@ -1169,7 +1069,8 @@ namespace dftfe // MPI_Barrier(d_mpiCommParent); // double kernel1_time = MPI_Wtime(); - interpolatePsiComputeELocWfcEshelbyTensorD(operatorMatrix, + interpolatePsiComputeELocWfcEshelbyTensorD(basisOperationsPtr, + operatorMatrix, deviceFlattenedArrayBlock, numPsi, numCells, @@ -1186,9 +1087,7 @@ namespace dftfe onesVecD, cellsBlockSize, psiQuadsFlatD, - gradPsiQuadsXFlatD, - gradPsiQuadsYFlatD, - gradPsiQuadsZFlatD, + gradPsiQuadsFlatD, #ifdef USE_COMPLEX psiQuadsNLPD, #endif @@ -1273,6 +1172,11 @@ namespace dftfe void wfcContractionsForceKernelsAllH( + std::unique_ptr< + dftfe::basis::FEBasisOperations> + & basisOperationsPtr, operatorDFTDeviceClass & operatorMatrix, const dataTypes::number * X, const unsigned int spinPolarizedFlag, @@ -1361,16 +1265,8 @@ namespace dftfe dataTypes::number(0.0)); dftfe::utils::MemoryStorage - gradPsiQuadsXFlatD(cellsBlockSize * numQuads * blockSize, - dataTypes::number(0.0)); - dftfe::utils::MemoryStorage - gradPsiQuadsYFlatD(cellsBlockSize * numQuads * blockSize, - dataTypes::number(0.0)); - dftfe::utils::MemoryStorage - gradPsiQuadsZFlatD(cellsBlockSize * numQuads * blockSize, - dataTypes::number(0.0)); + gradPsiQuadsFlatD(cellsBlockSize * numQuads * blockSize * 3, + dataTypes::number(0.0)); #ifdef USE_COMPLEX dftfe::utils::MemoryStorage @@ -1501,6 +1397,7 @@ namespace dftfe // double kernel_time = MPI_Wtime(); devicePortedForceKernelsAllD( + basisOperationsPtr, operatorMatrix, deviceFlattenedArrayBlock, projectorKetTimesVectorD, @@ -1526,9 +1423,7 @@ namespace dftfe numNodesPerElement, totalNonTrivialPseudoWfcs, psiQuadsFlatD, - gradPsiQuadsXFlatD, - gradPsiQuadsYFlatD, - gradPsiQuadsZFlatD, + gradPsiQuadsFlatD, #ifdef USE_COMPLEX psiQuadsNLPD, #endif diff --git a/utils/FEBasisOperationsDevice.cc 
b/utils/FEBasisOperationsDevice.cc index 4330dca89..d79edba3d 100644 --- a/utils/FEBasisOperationsDevice.cc +++ b/utils/FEBasisOperationsDevice.cc @@ -20,9 +20,43 @@ #include #include #include +#include +#include +#include namespace dftfe { + namespace + { + template + __global__ void + reshapeNonAffineCaseDeviceKernel(const dftfe::size_type numVecs, + const dftfe::size_type numQuads, + const dftfe::size_type numCells, + const ValueType1 * copyFromVec, + ValueType2 * copyToVec) + { + const dftfe::size_type globalThreadId = + blockIdx.x * blockDim.x + threadIdx.x; + const dftfe::size_type numberEntries = numQuads * numCells * numVecs * 3; + + for (dftfe::size_type index = globalThreadId; index < numberEntries; + index += blockDim.x * gridDim.x) + { + dftfe::size_type blockIndex = index / numVecs; + dftfe::size_type iVec = index - blockIndex * numVecs; + dftfe::size_type blockIndex2 = blockIndex / numQuads; + dftfe::size_type iQuad = blockIndex - blockIndex2 * numQuads; + dftfe::size_type iCell = blockIndex2 / 3; + dftfe::size_type iDim = blockIndex2 - iCell * 3; + dftfe::utils::copyValue( + copyToVec + index, + copyFromVec[iVec + iDim * numVecs + iQuad * 3 * numVecs + + iCell * 3 * numQuads * numVecs]); + } + } + } // namespace + namespace basis { template @@ -229,13 +263,45 @@ namespace dftfe d_nVectors, d_nVectors * 3, (cellRange.second - cellRange.first) * d_nQuadsPerCell); - dftfe::utils::deviceKernelsGeneric::stridedCopyToBlock( + // dftfe::utils::deviceKernelsGeneric::stridedCopyToBlock( + // d_nVectors, + // (cellRange.second - cellRange.first) * d_nQuadsPerCell * 3, + // tempQuadratureGradientsDataNonAffine.data(), + // quadratureGradients, + // d_nonAffineReshapeIDs.data() + + // cellRange.first * d_nQuadsPerCell * 3); +#ifdef DFTFE_WITH_DEVICE_LANG_CUDA + reshapeNonAffineCaseDeviceKernel<<< + (d_nVectors * (cellRange.second - cellRange.first) * + d_nQuadsPerCell * 3) / + dftfe::utils::DEVICE_BLOCK_SIZE + + 1, + dftfe::utils::DEVICE_BLOCK_SIZE>>>( 
d_nVectors, - (cellRange.second - cellRange.first) * d_nQuadsPerCell * 3, - tempQuadratureGradientsDataNonAffine.data(), - quadratureGradients, - d_nonAffineReshapeIDs.data() + - cellRange.first * d_nDofsPerCell); + d_nQuadsPerCell, + (cellRange.second - cellRange.first), + dftfe::utils::makeDataTypeDeviceCompatible( + tempQuadratureGradientsDataNonAffine.data()), + dftfe::utils::makeDataTypeDeviceCompatible( + quadratureGradients)); +#elif DFTFE_WITH_DEVICE_LANG_HIP + hipLaunchKernelGGL(reshapeNonAffineCaseDeviceKernel, + (d_nVectors * + (cellRange.second - cellRange.first) * + d_nQuadsPerCell * 3) / + dftfe::utils::DEVICE_BLOCK_SIZE + + 1, + dftfe::utils::DEVICE_BLOCK_SIZE, + 0, + 0, + d_nVectors, + d_nQuadsPerCell, + (cellRange.second - cellRange.first), + dftfe::utils::makeDataTypeDeviceCompatible( + tempQuadratureGradientsDataNonAffine.data()), + dftfe::utils::makeDataTypeDeviceCompatible( + quadratureGradients), ); +#endif } } } diff --git a/utils/constraintMatrixInfo.cc b/utils/constraintMatrixInfo.cc index e0026e7e6..6267953c6 100644 --- a/utils/constraintMatrixInfo.cc +++ b/utils/constraintMatrixInfo.cc @@ -527,8 +527,13 @@ namespace dftfe template void constraintMatrixInfo::distribute( - distributedCPUMultiVec &fieldVector, - const unsigned int blockSize) const; + distributedCPUMultiVec &fieldVector, + const unsigned int blockSize) const; + + template void + constraintMatrixInfo::distribute( + distributedCPUMultiVec> &fieldVector, + const unsigned int blockSize) const; template void constraintMatrixInfo::distribute_slave_to_master( From cf54bc3f198c2b3a5166d2f73d47d99d64787df1 Mon Sep 17 00:00:00 2001 From: Nikhil Kodali Date: Mon, 2 Oct 2023 13:34:51 +0530 Subject: [PATCH 09/25] Cleanups --- include/FEBasisOperations.h | 126 +- include/constraintMatrixInfo.h | 31 - include/constraintMatrixInfoDevice.h | 25 - include/kohnShamDFTOperatorDevice.h | 2 +- include/operatorDevice.h | 12 - src/dft/densityCalculator.cc | 20 +- src/dft/densityCalculator.inst.cc | 
44 - src/dft/densityCalculatorCPU.cc | 881 ----------- src/dft/densityCalculatorDevice.cc | 1318 ----------------- src/dft/densityCalculatorDeviceKernels.cc | 4 +- src/dft/dos.cc | 10 - src/dft/initBoundaryConditions.cc | 24 +- src/dft/solveVselfInBinsDevice.cc | 4 - ...iltonianMatrixCalculatorFlattenedDevice.cc | 533 ++----- src/dftOperator/kohnShamDFTOperator.cc | 8 - src/dftOperator/kohnShamDFTOperatorDevice.cc | 28 +- .../shapeFunctionDataCalculatorDevice.cc | 87 -- src/force/forceWfcContractionsDevice.cc | 4 +- src/helmholtz/kerkerSolverProblemDevice.cc | 5 - src/poisson/poissonSolverProblemDevice.cc | 4 - src/symmetry/symmetrizeRho.cc | 5 - utils/DeviceBlasWrapper.hip.cc | 101 ++ utils/FEBasisOperations.cc | 555 ++++--- utils/FEBasisOperationsDevice.cc | 50 +- utils/FEBasisOperationsHost.cc | 106 +- utils/constraintMatrixInfo.cc | 103 +- utils/constraintMatrixInfoDevice.cc | 278 +--- 27 files changed, 923 insertions(+), 3445 deletions(-) delete mode 100644 src/dft/densityCalculator.inst.cc delete mode 100644 src/dft/densityCalculatorCPU.cc delete mode 100644 src/dft/densityCalculatorDevice.cc diff --git a/include/FEBasisOperations.h b/include/FEBasisOperations.h index c1006cefa..024f5a403 100644 --- a/include/FEBasisOperations.h +++ b/include/FEBasisOperations.h @@ -34,7 +34,9 @@ namespace dftfe update_values = 0x0001, - update_gradients = 0x0002 + update_gradients = 0x0002, + + update_transpose = 0x0004 }; inline UpdateFlags @@ -87,12 +89,15 @@ namespace dftfe ~FEBasisOperationsBase() = default; + void + init(const unsigned int & dofHandlerID, + const std::vector &quadratureID, + const UpdateFlags updateFlags = update_values); + void reinit(const unsigned int &vecBlockSize, const unsigned int &cellBlockSize, - const unsigned int &dofHandlerID, - const unsigned int &quadratureID, - const UpdateFlags updateFlags = update_values); + const unsigned int &quadratureID); // private: #if defined(DFTFE_WITH_DEVICE) @@ -121,6 +126,37 @@ namespace dftfe void 
resizeTempStorage(); + unsigned int + nQuadsPerCell() const; + + unsigned int + nDofsPerCell() const; + + unsigned int + nCells() const; + + unsigned int + nRelaventDofs() const; + + unsigned int + nOwnedDofs() const; + + const ValueTypeBasisCoeff * + shapeFunctionData(bool transpose = false) const; + + const ValueTypeBasisCoeff * + shapeFunctionGradientData(bool transpose = false) const; + + const ValueTypeBasisCoeff * + inverseJacobians() const; + + const ValueTypeBasisCoeff * + JxW() const; + + unsigned int + cellsTypeFlag() const; + + void createMultiVector( const unsigned int dofHandlerIndex, @@ -145,28 +181,34 @@ namespace dftfe dftfe::utils::MemoryStorage d_flattenedCellDofIndexToProcessDofIndexMap; std::vector d_cellIndexToCellIdMap; - dftfe::utils::MemoryStorage - d_inverseJacobianData; - dftfe::utils::MemoryStorage d_JxWData; - dftfe::utils::MemoryStorage + std::vector> + d_inverseJacobianData; + std::vector> + d_JxWData; + std::vector> d_shapeFunctionData; - dftfe::utils::MemoryStorage + std::vector> + d_shapeFunctionGradientDataInternalLayout; + std::vector> d_shapeFunctionGradientData; - dftfe::utils::MemoryStorage - d_nonAffineReshapeIDs; - - unsigned int d_quadratureID; - unsigned int d_dofHandlerID; - unsigned int d_nVectors; - unsigned int d_nCells; - unsigned int d_cellsBlockSize; - unsigned int d_nDofsPerCell; - unsigned int d_nQuadsPerCell; - unsigned int d_localSize; - unsigned int d_locallyOwnedSize; - bool areAllCellsAffine; - bool areAllCellsCartesian; - UpdateFlags d_updateFlags; + std::vector> + d_shapeFunctionDataTranspose; + std::vector> + d_shapeFunctionGradientDataTranspose; + + std::vector d_quadratureIDsVector; + unsigned int d_quadratureID; + std::vector d_nQuadsPerCell; + unsigned int d_dofHandlerID; + unsigned int d_nVectors; + unsigned int d_nCells; + unsigned int d_cellsBlockSize; + unsigned int d_nDofsPerCell; + unsigned int d_localSize; + unsigned int d_locallyOwnedSize; + bool areAllCellsAffine; + bool areAllCellsCartesian; + 
UpdateFlags d_updateFlags; }; template ::d_nVectors; + using FEBasisOperationsBase< + ValueTypeBasisCoeff, + ValueTypeBasisData, + dftfe::utils::MemorySpace::HOST>::d_quadratureID; using FEBasisOperationsBase< ValueTypeBasisCoeff, ValueTypeBasisData, @@ -239,10 +285,22 @@ namespace dftfe ValueTypeBasisCoeff, ValueTypeBasisData, dftfe::utils::MemorySpace::HOST>::d_shapeFunctionData; + using FEBasisOperationsBase< + ValueTypeBasisCoeff, + ValueTypeBasisData, + dftfe::utils::MemorySpace::HOST>::d_shapeFunctionDataTranspose; using FEBasisOperationsBase< ValueTypeBasisCoeff, ValueTypeBasisData, dftfe::utils::MemorySpace::HOST>::d_shapeFunctionGradientData; + using FEBasisOperationsBase< + ValueTypeBasisCoeff, + ValueTypeBasisData, + dftfe::utils::MemorySpace::HOST>::d_shapeFunctionGradientDataTranspose; + using FEBasisOperationsBase:: + d_shapeFunctionGradientDataInternalLayout; using FEBasisOperationsBase::d_JxWData; @@ -385,6 +443,10 @@ namespace dftfe ValueTypeBasisCoeff, ValueTypeBasisData, dftfe::utils::MemorySpace::DEVICE>::d_cellsBlockSize; + using FEBasisOperationsBase< + ValueTypeBasisCoeff, + ValueTypeBasisData, + dftfe::utils::MemorySpace::DEVICE>::d_quadratureID; using FEBasisOperationsBase< ValueTypeBasisCoeff, ValueTypeBasisData, @@ -409,10 +471,22 @@ namespace dftfe ValueTypeBasisCoeff, ValueTypeBasisData, dftfe::utils::MemorySpace::DEVICE>::d_shapeFunctionData; + using FEBasisOperationsBase< + ValueTypeBasisCoeff, + ValueTypeBasisData, + dftfe::utils::MemorySpace::DEVICE>::d_shapeFunctionDataTranspose; using FEBasisOperationsBase< ValueTypeBasisCoeff, ValueTypeBasisData, dftfe::utils::MemorySpace::DEVICE>::d_shapeFunctionGradientData; + using FEBasisOperationsBase:: + d_shapeFunctionGradientDataTranspose; + using FEBasisOperationsBase:: + d_shapeFunctionGradientDataInternalLayout; using FEBasisOperationsBase::d_JxWData; @@ -424,10 +498,6 @@ namespace dftfe ValueTypeBasisCoeff, ValueTypeBasisData, 
dftfe::utils::MemorySpace::DEVICE>::d_cellIndexToCellIdMap; - using FEBasisOperationsBase< - ValueTypeBasisCoeff, - ValueTypeBasisData, - dftfe::utils::MemorySpace::DEVICE>::d_nonAffineReshapeIDs; using FEBasisOperationsBase< ValueTypeBasisCoeff, ValueTypeBasisData, diff --git a/include/constraintMatrixInfo.h b/include/constraintMatrixInfo.h index 5c7083b97..92a71e0f4 100644 --- a/include/constraintMatrixInfo.h +++ b/include/constraintMatrixInfo.h @@ -63,37 +63,6 @@ namespace dftfe & partitioner, const dealii::AffineConstraints &constraintMatrixData); - /** - * @brief precompute map between local processor index of unflattened deallii array to the local processor index of - * the first field associated with the multi-field flattened dealii array - * - * @param partitioner1 associated with unflattened dealii vector - * @param partitioner2 associated with flattened dealii vector storing multi-fields - */ - void - precomputeMaps( - const std::shared_ptr - &partitioner1, - const std::shared_ptr - & partitioner2, - const unsigned int blockSize); - - /** - * @brief precompute map between local processor index of unflattened deallii array to the local processor index of - * the first field associated with the multi-field flattened dealii array - * - * @param partitioner1 associated with unflattened dealii vector - * @param partitioner2 associated with flattened dealii vector storing multi-fields - */ - void - precomputeMaps(const std::shared_ptr> &partitioner2, - const unsigned int blockSize); - - void - precomputeMaps(const unsigned int totalSize, - const unsigned int blockSize); - /** * @brief overloaded dealii internal function "distribute" which sets the slave node * field values from master nodes diff --git a/include/constraintMatrixInfoDevice.h b/include/constraintMatrixInfoDevice.h index 4782c0541..dd5338a0e 100644 --- a/include/constraintMatrixInfoDevice.h +++ b/include/constraintMatrixInfoDevice.h @@ -61,31 +61,6 @@ namespace dftfe & partitioner, const 
dealii::AffineConstraints &constraintMatrixData); - /** - * @brief precompute map between local processor index of unflattened deallii array to the local processor index of - * the first field associated with the multi-field flattened dealii array - * - * @param partitioner1 associated with unflattened dealii vector - * @param partitioner2 associated with flattened dealii vector storing multi-fields - */ - void - precomputeMaps(const std::shared_ptr> &partitioner2, - const unsigned int blockSize); - - void - precomputeMaps(const unsigned int totalSize, - const unsigned int blockSize); - - void - precomputeMaps( - const std::shared_ptr - &partitioner1, - const std::shared_ptr - & partitioner2, - const unsigned int blockSize); - - /** * @brief overloaded dealii internal function distribute for flattened dealii array which sets * the slave node field values from master nodes diff --git a/include/kohnShamDFTOperatorDevice.h b/include/kohnShamDFTOperatorDevice.h index b2a15422b..9fb45ccad 100644 --- a/include/kohnShamDFTOperatorDevice.h +++ b/include/kohnShamDFTOperatorDevice.h @@ -655,7 +655,7 @@ namespace dftfe std::unique_ptr< dftfe::basis:: FEBasisOperations> - basisOperationsPtrHOST; + basisOperationsPtrHost; /// data structures to store diagonal of inverse square root mass matrix and diff --git a/include/operatorDevice.h b/include/operatorDevice.h index 5cbaab16b..85c316921 100644 --- a/include/operatorDevice.h +++ b/include/operatorDevice.h @@ -97,18 +97,6 @@ namespace dftfe dftfe::utils::MemorySpace::DEVICE> & getShapeFunctionValuesNLPTransposed() = 0; - // virtual dftfe::utils::MemoryStorage & - // getShapeFunctionGradientValuesXTransposed() = 0; - - // virtual dftfe::utils::MemoryStorage & - // getShapeFunctionGradientValuesYTransposed() = 0; - - // virtual dftfe::utils::MemoryStorage & - // getShapeFunctionGradientValuesZTransposed() = 0; - virtual dftfe::utils::MemoryStorage & getShapeFunctionGradientValuesNLPTransposed() = 0; diff --git 
a/src/dft/densityCalculator.cc b/src/dft/densityCalculator.cc index b9d80e104..b2c6188ae 100644 --- a/src/dft/densityCalculator.cc +++ b/src/dft/densityCalculator.cc @@ -293,9 +293,7 @@ namespace dftfe dftfe::basis::update_values | dftfe::basis::update_gradients; basisOperationsPtr->reinit(currentBlockSize, cellsBlockSize, - matrixFreeDofhandlerIndex, - d_quadratureIndex, - updateFlags); + d_quadratureIndex); for (unsigned int spinIndex = 0; spinIndex < numSpinComponents; @@ -471,9 +469,7 @@ namespace dftfe dftfe::basis::update_gradients; basisOperationsPtr->reinit(currentBlockSize, cellsBlockSize, - matrixFreeDofhandlerIndex, - d_quadratureIndex, - updateFlags); + d_quadratureIndex); for (unsigned int spinIndex = 0; @@ -582,9 +578,6 @@ namespace dftfe MPI_SUM, interBandGroupComm); } - dftfe::utils::deviceSynchronize(); - MPI_Barrier(mpiCommParent); - double computeRho_time2 = MPI_Wtime(); unsigned int iElem = 0; auto cell = dofHandler.begin_active(); @@ -665,8 +658,7 @@ namespace dftfe } dftfe::utils::deviceSynchronize(); MPI_Barrier(mpiCommParent); - computeRho_time = MPI_Wtime() - computeRho_time; - computeRho_time2 = MPI_Wtime() - computeRho_time2; + computeRho_time = MPI_Wtime() - computeRho_time; if (this_process == 0 && dftParams.verbosity >= 2) if (memorySpace == dftfe::utils::MemorySpace::HOST) @@ -674,7 +666,7 @@ namespace dftfe << std::endl; else if (memorySpace == dftfe::utils::MemorySpace::DEVICE) std::cout << "Time for compute rho on Device: " << computeRho_time - << " " << computeRho_time2 << std::endl; + << std::endl; } template void @@ -696,8 +688,8 @@ namespace dftfe { const unsigned int cellsBlockSize = cellRange.second - cellRange.first; const unsigned int vectorsBlockSize = vecRange.second - vecRange.first; - const unsigned int nQuadsPerCell = basisOperationsPtr->d_nQuadsPerCell; - const unsigned int nCells = basisOperationsPtr->d_nCells; + const unsigned int nQuadsPerCell = basisOperationsPtr->nQuadsPerCell(); + const unsigned int nCells = 
basisOperationsPtr->nCells(); for (unsigned int iCell = cellRange.first; iCell < cellRange.second; ++iCell) for (unsigned int iQuad = 0; iQuad < nQuadsPerCell; ++iQuad) diff --git a/src/dft/densityCalculator.inst.cc b/src/dft/densityCalculator.inst.cc deleted file mode 100644 index 9fbf7a331..000000000 --- a/src/dft/densityCalculator.inst.cc +++ /dev/null @@ -1,44 +0,0 @@ -template class DensityCalculator<1, 1>; -template class DensityCalculator<1, 2>; -template class DensityCalculator<2, 2>; -template class DensityCalculator<2, 3>; -template class DensityCalculator<2, 4>; -template class DensityCalculator<3, 3>; -template class DensityCalculator<3, 4>; -template class DensityCalculator<3, 5>; -template class DensityCalculator<3, 6>; -template class DensityCalculator<4, 4>; -template class DensityCalculator<4, 5>; -template class DensityCalculator<4, 6>; -template class DensityCalculator<4, 7>; -template class DensityCalculator<4, 8>; -template class DensityCalculator<5, 5>; -template class DensityCalculator<5, 6>; -template class DensityCalculator<5, 7>; -template class DensityCalculator<5, 8>; -template class DensityCalculator<5, 9>; -template class DensityCalculator<5, 10>; -template class DensityCalculator<6, 6>; -template class DensityCalculator<6, 7>; -template class DensityCalculator<6, 8>; -template class DensityCalculator<6, 9>; -template class DensityCalculator<6, 10>; -template class DensityCalculator<6, 11>; -template class DensityCalculator<6, 12>; -template class DensityCalculator<7, 7>; -template class DensityCalculator<7, 8>; -template class DensityCalculator<7, 9>; -template class DensityCalculator<7, 10>; -template class DensityCalculator<7, 11>; -template class DensityCalculator<7, 12>; -template class DensityCalculator<7, 13>; -template class DensityCalculator<7, 14>; -template class DensityCalculator<8, 8>; -template class DensityCalculator<8, 9>; -template class DensityCalculator<8, 10>; -template class DensityCalculator<8, 11>; -template 
class DensityCalculator<8, 12>; -template class DensityCalculator<8, 13>; -template class DensityCalculator<8, 14>; -template class DensityCalculator<8, 15>; -template class DensityCalculator<8, 16>; diff --git a/src/dft/densityCalculatorCPU.cc b/src/dft/densityCalculatorCPU.cc deleted file mode 100644 index dbb509c29..000000000 --- a/src/dft/densityCalculatorCPU.cc +++ /dev/null @@ -1,881 +0,0 @@ -// --------------------------------------------------------------------- -// -// Copyright (c) 2017-2022 The Regents of the University of Michigan and DFT-FE -// authors. -// -// This file is part of the DFT-FE code. -// -// The DFT-FE code is free software; you can use it, redistribute -// it, and/or modify it under the terms of the GNU Lesser General -// Public License as published by the Free Software Foundation; either -// version 2.1 of the License, or (at your option) any later version. -// The full text of the license can be found in the file LICENSE at -// the top level of the DFT-FE distribution. 
-// -// --------------------------------------------------------------------- -// -// @author Sambit Das -// - -// source file for electron density related computations -#include -#include -#include -#include -#include -#include -#include -namespace dftfe -{ - template - void - computeRhoFromPSICPU( - const T * X, - const T * XFrac, - const unsigned int totalNumWaveFunctions, - const unsigned int Nfr, - const unsigned int numLocalDofs, - const std::vector> &eigenValues, - const double fermiEnergy, - const double fermiEnergyUp, - const double fermiEnergyDown, - operatorDFTClass & operatorMatrix, - std::unique_ptr< - dftfe::basis:: - FEBasisOperations> - & basisOperationsPtrHost, - const dealii::DoFHandler<3> & dofHandler, - const unsigned int totalLocallyOwnedCells, - const unsigned int numNodesPerElement, - const unsigned int numQuadPoints, - const std::vector & kPointWeights, - std::map> *rhoValues, - std::map> *gradRhoValues, - std::map> *rhoValuesSpinPolarized, - std::map> *gradRhoValuesSpinPolarized, - const bool isEvaluateGradRho, - const MPI_Comm & mpiCommParent, - const MPI_Comm & interpoolcomm, - const MPI_Comm & interBandGroupComm, - const dftParameters & dftParams, - const bool spectrumSplit, - const bool useFEOrderRhoPlusOneGLQuad) - { - int this_process; - MPI_Comm_rank(mpiCommParent, &this_process); - MPI_Barrier(mpiCommParent); - double cpu_time = MPI_Wtime(); - - // band group parallelization data structures - const unsigned int numberBandGroups = - dealii::Utilities::MPI::n_mpi_processes(interBandGroupComm); - const unsigned int bandGroupTaskId = - dealii::Utilities::MPI::this_mpi_process(interBandGroupComm); - std::vector bandGroupLowHighPlusOneIndices; - dftUtils::createBandParallelizationIndices(interBandGroupComm, - totalNumWaveFunctions, - bandGroupLowHighPlusOneIndices); - - const unsigned int BVec = - std::min(dftParams.chebyWfcBlockSize, bandGroupLowHighPlusOneIndices[1]); - - const double spinPolarizedFactor = - (dftParams.spinPolarized == 
1) ? 1.0 : 2.0; - - - // std::vector wfcQuads(numQuadPoints * BVec, T(0.0)); - - // std::vector gradWfcQuads(numQuadPoints * 3 * BVec, T(0.0)); - dftfe::utils::MemoryStorage wfcQuads( - numQuadPoints * BVec, T(0.0)), - gradWfcQuads(numQuadPoints * 3 * BVec, T(0.0)); - - std::vector shapeFunctionValues(numQuadPoints * numNodesPerElement, - T(0.0)); - std::vector shapeFunctionGradValues(numQuadPoints * 3 * - numNodesPerElement, - T(0.0)); - const unsigned int numQuadPointsTimes3 = numQuadPoints * 3; - - if (useFEOrderRhoPlusOneGLQuad) - { - for (unsigned int iquad = 0; iquad < numQuadPoints; ++iquad) - for (unsigned int iNode = 0; iNode < numNodesPerElement; ++iNode) - shapeFunctionValues[iquad * numNodesPerElement + iNode] = - T(operatorMatrix.getShapeFunctionValuesDensityGaussLobattoQuad() - [iquad * numNodesPerElement + iNode]); - } - else - { - for (unsigned int iquad = 0; iquad < numQuadPoints; ++iquad) - for (unsigned int iNode = 0; iNode < numNodesPerElement; ++iNode) - shapeFunctionValues[iquad * numNodesPerElement + iNode] = - T(operatorMatrix.getShapeFunctionValuesDensityGaussQuad() - [iquad * numNodesPerElement + iNode]); - } - - std::vector partialOccupVecTimesKptWeight(BVec, 0.0); - - - dftfe::distributedCPUMultiVec flattenedArrayBlock; - - dftfe::utils::MemoryStorage - cellWaveFunctionMatrix(numNodesPerElement * BVec, T(0.0)); - - - // set density to zero - typename dealii::DoFHandler<3>::active_cell_iterator cell = - dofHandler.begin_active(); - typename dealii::DoFHandler<3>::active_cell_iterator endc = - dofHandler.end(); - for (; cell != endc; ++cell) - if (cell->is_locally_owned()) - { - const dealii::CellId cellid = cell->id(); - - - std::fill((*rhoValues)[cellid].begin(), - (*rhoValues)[cellid].end(), - 0.0); - if (isEvaluateGradRho) - std::fill((*gradRhoValues)[cellid].begin(), - (*gradRhoValues)[cellid].end(), - 0.0); - - if (dftParams.spinPolarized == 1) - { - std::fill((*rhoValuesSpinPolarized)[cellid].begin(), - 
(*rhoValuesSpinPolarized)[cellid].end(), - 0.0); - if (isEvaluateGradRho) - std::fill((*gradRhoValuesSpinPolarized)[cellid].begin(), - (*gradRhoValuesSpinPolarized)[cellid].end(), - 0.0); - } - } - - std::vector rhoValuesFlattened(totalLocallyOwnedCells * - numQuadPoints, - 0.0); - std::vector gradRhoValuesFlattened(totalLocallyOwnedCells * - numQuadPoints * 3, - 0.0); - std::vector rhoValuesSpinPolarizedFlattened(totalLocallyOwnedCells * - numQuadPoints * 2, - 0.0); - std::vector gradRhoValuesSpinPolarizedFlattened( - totalLocallyOwnedCells * numQuadPoints * 6, 0.0); - - - for (unsigned int spinIndex = 0; spinIndex < (1 + dftParams.spinPolarized); - ++spinIndex) - { - for (unsigned int kPoint = 0; kPoint < kPointWeights.size(); ++kPoint) - { - std::vector rhoContribution(totalLocallyOwnedCells * - numQuadPoints, - 0.0); - - std::vector gradRhoXContribution( - isEvaluateGradRho ? (totalLocallyOwnedCells * numQuadPoints) : 1, - 0.0); - std::vector gradRhoYContribution( - isEvaluateGradRho ? (totalLocallyOwnedCells * numQuadPoints) : 1, - 0.0); - std::vector gradRhoZContribution( - isEvaluateGradRho ? (totalLocallyOwnedCells * numQuadPoints) : 1, - 0.0); - - const T *XCurrentKPoint = - X + ((dftParams.spinPolarized + 1) * kPoint + spinIndex) * - numLocalDofs * totalNumWaveFunctions; - const T *XFracCurrentKPoint = - XFrac + ((dftParams.spinPolarized + 1) * kPoint + spinIndex) * - numLocalDofs * Nfr; - - for (unsigned int jvec = 0; jvec < totalNumWaveFunctions; - jvec += BVec) - { - const unsigned int currentBlockSize = - std::min(BVec, totalNumWaveFunctions - jvec); - const unsigned int d_eigenDofHandlerIndex = 1; - const unsigned int d_quadratureIndex = - useFEOrderRhoPlusOneGLQuad ? 
2 : 0; - dftfe::basis::UpdateFlags updateFlags = - dftfe::basis::update_values | dftfe::basis::update_gradients; - basisOperationsPtrHost->reinit(currentBlockSize, - d_eigenDofHandlerIndex, - d_quadratureIndex, - updateFlags); - - if (currentBlockSize != BVec || jvec == 0) - operatorMatrix.reinit(currentBlockSize, - flattenedArrayBlock, - true); - - if ((jvec + currentBlockSize) <= - bandGroupLowHighPlusOneIndices[2 * bandGroupTaskId + 1] && - (jvec + currentBlockSize) > - bandGroupLowHighPlusOneIndices[2 * bandGroupTaskId]) - { - if (spectrumSplit) - { - std::fill(partialOccupVecTimesKptWeight.begin(), - partialOccupVecTimesKptWeight.end(), - kPointWeights[kPoint] * spinPolarizedFactor); - } - else - { - if (dftParams.constraintMagnetization) - { - const double fermiEnergyConstraintMag = - spinIndex == 0 ? fermiEnergyUp : fermiEnergyDown; - for (unsigned int iEigenVec = 0; - iEigenVec < currentBlockSize; - ++iEigenVec) - { - if (eigenValues[kPoint][totalNumWaveFunctions * - spinIndex + - jvec + iEigenVec] > - fermiEnergyConstraintMag) - partialOccupVecTimesKptWeight[iEigenVec] = - 0.0; - else - partialOccupVecTimesKptWeight[iEigenVec] = - kPointWeights[kPoint] * spinPolarizedFactor; - } - } - else - { - for (unsigned int iEigenVec = 0; - iEigenVec < currentBlockSize; - ++iEigenVec) - { - partialOccupVecTimesKptWeight[iEigenVec] = - dftUtils::getPartialOccupancy( - eigenValues[kPoint][totalNumWaveFunctions * - spinIndex + - jvec + iEigenVec], - fermiEnergy, - C_kb, - dftParams.TVal) * - kPointWeights[kPoint] * spinPolarizedFactor; - } - } - } - - - for (unsigned int iNode = 0; iNode < numLocalDofs; ++iNode) - for (unsigned int iWave = 0; iWave < currentBlockSize; - ++iWave) - flattenedArrayBlock - .data()[iNode * currentBlockSize + iWave] = - XCurrentKPoint[iNode * totalNumWaveFunctions + jvec + - iWave]; - - - (operatorMatrix.getOverloadedConstraintMatrix()) - ->distribute(flattenedArrayBlock, currentBlockSize); - - for (int icell = 0; icell < 
totalLocallyOwnedCells; icell++) - { - const unsigned int inc = 1; - for (unsigned int iNode = 0; iNode < numNodesPerElement; - ++iNode) - { - xcopy( - ¤tBlockSize, - flattenedArrayBlock.data() + - operatorMatrix - .getFlattenedArrayCellLocalProcIndexIdMap() - [icell * numNodesPerElement + iNode], - &inc, - &cellWaveFunctionMatrix[currentBlockSize * iNode], - &inc); - } - - - const T scalarCoeffAlpha = T(1.0), - scalarCoeffBeta = T(0.0); - const char transA = 'N', transB = 'N'; - basisOperationsPtrHost->interpolateKernel( - flattenedArrayBlock, - &wfcQuads, - &gradWfcQuads, - std::pair(icell, - icell + 1)); - - // xgemm(&transA, - // &transB, - // ¤tBlockSize, - // &numQuadPoints, - // &numNodesPerElement, - // &scalarCoeffAlpha, - // &cellWaveFunctionMatrix[0], - // ¤tBlockSize, - // &shapeFunctionValues[0], - // &numNodesPerElement, - // &scalarCoeffBeta, - // &wfcQuads[0], - // ¤tBlockSize); - for (unsigned int iquad = 0; iquad < numQuadPoints; - ++iquad) - for (unsigned int iWave = 0; iWave < currentBlockSize; - ++iWave) - rhoContribution[icell * numQuadPoints + iquad] += - partialOccupVecTimesKptWeight[iWave] * - std::abs( - wfcQuads[iquad * currentBlockSize + iWave]) * - std::abs( - wfcQuads[iquad * currentBlockSize + iWave]); - - if (isEvaluateGradRho) - { - for (unsigned int i = 0; - i < numNodesPerElement * 3 * numQuadPoints; - ++i) - { - shapeFunctionGradValues[i] = T( - operatorMatrix - .getShapeFunctionGradValuesDensityGaussQuad() - [icell * numNodesPerElement * 3 * - numQuadPoints + - i]); - } - - // xgemm(&transA, - // &transB, - // ¤tBlockSize, - // &numQuadPointsTimes3, - // &numNodesPerElement, - // &scalarCoeffAlpha, - // &cellWaveFunctionMatrix[0], - // ¤tBlockSize, - // &shapeFunctionGradValues[0], - // &numNodesPerElement, - // &scalarCoeffBeta, - // &gradWfcQuads[0], - // ¤tBlockSize); - - for (unsigned int iquad = 0; iquad < numQuadPoints; - ++iquad) - for (unsigned int iWave = 0; - iWave < currentBlockSize; - ++iWave) - { - const T 
wfcQuadVal = - dftfe::utils::complexConj( - wfcQuads[iquad * currentBlockSize + - iWave]); - const T temp1 = - wfcQuadVal * - gradWfcQuads[iquad * currentBlockSize + - iWave]; - gradRhoXContribution[icell * numQuadPoints + - iquad] += - 2.0 * partialOccupVecTimesKptWeight[iWave] * - dftfe::utils::realPart(temp1); - } - - for (unsigned int iquad = 0; iquad < numQuadPoints; - ++iquad) - for (unsigned int iWave = 0; - iWave < currentBlockSize; - ++iWave) - { - const T wfcQuadVal = - dftfe::utils::complexConj( - wfcQuads[iquad * currentBlockSize + - iWave]); - const T temp1 = - wfcQuadVal * - gradWfcQuads[currentBlockSize * - numQuadPoints + - iquad * currentBlockSize + - iWave]; - gradRhoYContribution[icell * numQuadPoints + - iquad] += - 2.0 * partialOccupVecTimesKptWeight[iWave] * - dftfe::utils::realPart(temp1); - } - - for (unsigned int iquad = 0; iquad < numQuadPoints; - ++iquad) - for (unsigned int iWave = 0; - iWave < currentBlockSize; - ++iWave) - { - const T wfcQuadVal = - dftfe::utils::complexConj( - wfcQuads[iquad * currentBlockSize + - iWave]); - const T temp1 = - wfcQuadVal * - gradWfcQuads[currentBlockSize * - numQuadPoints * 2 + - iquad * currentBlockSize + - iWave]; - gradRhoZContribution[icell * numQuadPoints + - iquad] += - 2.0 * partialOccupVecTimesKptWeight[iWave] * - dftfe::utils::realPart(temp1); - } - } - - } // cells loop - } // band parallelizatoin check - } // wave function block loop - - if (spectrumSplit) - for (unsigned int jvec = 0; jvec < Nfr; jvec += BVec) - { - const unsigned int currentBlockSize = - std::min(BVec, Nfr - jvec); - - if (currentBlockSize != BVec || jvec == 0) - operatorMatrix.reinit(currentBlockSize, - flattenedArrayBlock, - true); - - if ((jvec + totalNumWaveFunctions - Nfr + currentBlockSize) <= - bandGroupLowHighPlusOneIndices[2 * bandGroupTaskId + - 1] && - (jvec + totalNumWaveFunctions - Nfr + currentBlockSize) > - bandGroupLowHighPlusOneIndices[2 * bandGroupTaskId]) - { - if (dftParams.constraintMagnetization) - { 
- const double fermiEnergyConstraintMag = - spinIndex == 0 ? fermiEnergyUp : fermiEnergyDown; - for (unsigned int iEigenVec = 0; - iEigenVec < currentBlockSize; - ++iEigenVec) - { - if (eigenValues[kPoint] - [totalNumWaveFunctions * - spinIndex + - (totalNumWaveFunctions - Nfr) + - jvec + iEigenVec] > - fermiEnergyConstraintMag) - partialOccupVecTimesKptWeight[iEigenVec] = - -kPointWeights[kPoint] * spinPolarizedFactor; - else - partialOccupVecTimesKptWeight[iEigenVec] = 0.0; - } - } - else - { - for (unsigned int iEigenVec = 0; - iEigenVec < currentBlockSize; - ++iEigenVec) - { - partialOccupVecTimesKptWeight[iEigenVec] = - (dftUtils::getPartialOccupancy( - eigenValues[kPoint] - [totalNumWaveFunctions * - spinIndex + - (totalNumWaveFunctions - Nfr) + - jvec + iEigenVec], - fermiEnergy, - C_kb, - dftParams.TVal) - - 1.0) * - kPointWeights[kPoint] * spinPolarizedFactor; - } - } - - for (unsigned int iNode = 0; iNode < numLocalDofs; - ++iNode) - for (unsigned int iWave = 0; iWave < currentBlockSize; - ++iWave) - flattenedArrayBlock - .data()[iNode * currentBlockSize + iWave] = - XFracCurrentKPoint[iNode * Nfr + jvec + iWave]; - - (operatorMatrix.getOverloadedConstraintMatrix()) - ->distribute(flattenedArrayBlock, currentBlockSize); - - for (int icell = 0; icell < totalLocallyOwnedCells; - icell++) - { - const unsigned int inc = 1; - for (unsigned int iNode = 0; - iNode < numNodesPerElement; - ++iNode) - { - xcopy( - ¤tBlockSize, - flattenedArrayBlock.data() + - operatorMatrix - .getFlattenedArrayCellLocalProcIndexIdMap() - [icell * numNodesPerElement + iNode], - &inc, - &cellWaveFunctionMatrix[currentBlockSize * - iNode], - &inc); - } - - - const T scalarCoeffAlpha = T(1.0), - scalarCoeffBeta = T(0.0); - const char transA = 'N', transB = 'N'; - - xgemm(&transA, - &transB, - ¤tBlockSize, - &numQuadPoints, - &numNodesPerElement, - &scalarCoeffAlpha, - &cellWaveFunctionMatrix[0], - ¤tBlockSize, - &shapeFunctionValues[0], - &numNodesPerElement, - &scalarCoeffBeta, - 
&wfcQuads[0], - ¤tBlockSize); - - for (unsigned int iquad = 0; iquad < numQuadPoints; - ++iquad) - for (unsigned int iWave = 0; - iWave < currentBlockSize; - ++iWave) - rhoContribution[icell * numQuadPoints + iquad] += - partialOccupVecTimesKptWeight[iWave] * - std::abs( - wfcQuads[iquad * currentBlockSize + iWave]) * - std::abs( - wfcQuads[iquad * currentBlockSize + iWave]); - - if (isEvaluateGradRho) - { - for (unsigned int i = 0; - i < numNodesPerElement * 3 * numQuadPoints; - ++i) - { - shapeFunctionGradValues[i] = T( - operatorMatrix - .getShapeFunctionGradValuesDensityGaussQuad() - [icell * numNodesPerElement * 3 * - numQuadPoints + - i]); - } - - xgemm(&transA, - &transB, - ¤tBlockSize, - &numQuadPointsTimes3, - &numNodesPerElement, - &scalarCoeffAlpha, - &cellWaveFunctionMatrix[0], - ¤tBlockSize, - &shapeFunctionGradValues[0], - &numNodesPerElement, - &scalarCoeffBeta, - &gradWfcQuads[0], - ¤tBlockSize); - - for (unsigned int iquad = 0; - iquad < numQuadPoints; - ++iquad) - for (unsigned int iWave = 0; - iWave < currentBlockSize; - ++iWave) - { - const T wfcQuadVal = - dftfe::utils::complexConj( - wfcQuads[iquad * currentBlockSize + - iWave]); - const T temp1 = - wfcQuadVal * - gradWfcQuads[iquad * 3 * - currentBlockSize + - iWave]; - gradRhoXContribution[icell * numQuadPoints + - iquad] += - 2.0 * - partialOccupVecTimesKptWeight[iWave] * - dftfe::utils::realPart(temp1); - } - - for (unsigned int iquad = 0; - iquad < numQuadPoints; - ++iquad) - for (unsigned int iWave = 0; - iWave < currentBlockSize; - ++iWave) - { - const T wfcQuadVal = - dftfe::utils::complexConj( - wfcQuads[iquad * currentBlockSize + - iWave]); - const T temp1 = - wfcQuadVal * - gradWfcQuads[iquad * 3 * - currentBlockSize + - currentBlockSize + iWave]; - gradRhoYContribution[icell * numQuadPoints + - iquad] += - 2.0 * - partialOccupVecTimesKptWeight[iWave] * - dftfe::utils::realPart(temp1); - } - - for (unsigned int iquad = 0; - iquad < numQuadPoints; - ++iquad) - for (unsigned int iWave 
= 0; - iWave < currentBlockSize; - ++iWave) - { - const T wfcQuadVal = - dftfe::utils::complexConj( - wfcQuads[iquad * currentBlockSize + - iWave]); - const T temp1 = - wfcQuadVal * - gradWfcQuads[iquad * 3 * - currentBlockSize + - 2 * currentBlockSize + - iWave]; - gradRhoZContribution[icell * numQuadPoints + - iquad] += - 2.0 * - partialOccupVecTimesKptWeight[iWave] * - dftfe::utils::realPart(temp1); - } - } - - } // cells loop - } - } - - for (int icell = 0; icell < totalLocallyOwnedCells; icell++) - for (unsigned int iquad = 0; iquad < numQuadPoints; ++iquad) - { - rhoValuesFlattened[icell * numQuadPoints + iquad] += - rhoContribution[icell * numQuadPoints + iquad]; - } - - if (isEvaluateGradRho) - for (int icell = 0; icell < totalLocallyOwnedCells; icell++) - for (unsigned int iquad = 0; iquad < numQuadPoints; ++iquad) - { - gradRhoValuesFlattened[icell * numQuadPoints * 3 + - 3 * iquad + 0] += - gradRhoXContribution[icell * numQuadPoints + iquad]; - gradRhoValuesFlattened[icell * numQuadPoints * 3 + - 3 * iquad + 1] += - gradRhoYContribution[icell * numQuadPoints + iquad]; - gradRhoValuesFlattened[icell * numQuadPoints * 3 + - 3 * iquad + 2] += - gradRhoZContribution[icell * numQuadPoints + iquad]; - } - if (dftParams.spinPolarized == 1) - { - for (int icell = 0; icell < totalLocallyOwnedCells; icell++) - for (unsigned int iquad = 0; iquad < numQuadPoints; ++iquad) - { - rhoValuesSpinPolarizedFlattened[icell * numQuadPoints * - 2 + - iquad * 2 + spinIndex] += - rhoContribution[icell * numQuadPoints + iquad]; - } - - if (isEvaluateGradRho) - for (int icell = 0; icell < totalLocallyOwnedCells; icell++) - for (unsigned int iquad = 0; iquad < numQuadPoints; ++iquad) - { - gradRhoValuesSpinPolarizedFlattened - [icell * numQuadPoints * 6 + iquad * 6 + - spinIndex * 3] += - gradRhoXContribution[icell * numQuadPoints + iquad]; - gradRhoValuesSpinPolarizedFlattened - [icell * numQuadPoints * 6 + iquad * 6 + - spinIndex * 3 + 1] += - gradRhoYContribution[icell * 
numQuadPoints + iquad]; - gradRhoValuesSpinPolarizedFlattened - [icell * numQuadPoints * 6 + iquad * 6 + - spinIndex * 3 + 2] += - gradRhoZContribution[icell * numQuadPoints + iquad]; - } - } - - } // kpoint loop - } // spin index loop - - - // gather density from all inter communicators - if (dealii::Utilities::MPI::n_mpi_processes(interpoolcomm) > 1) - { - dealii::Utilities::MPI::sum(rhoValuesFlattened, - interpoolcomm, - rhoValuesFlattened); - - if (isEvaluateGradRho) - dealii::Utilities::MPI::sum(gradRhoValuesFlattened, - interpoolcomm, - gradRhoValuesFlattened); - - - - if (dftParams.spinPolarized == 1) - { - dealii::Utilities::MPI::sum(rhoValuesSpinPolarizedFlattened, - interpoolcomm, - rhoValuesSpinPolarizedFlattened); - - if (isEvaluateGradRho) - dealii::Utilities::MPI::sum(gradRhoValuesSpinPolarizedFlattened, - interpoolcomm, - gradRhoValuesSpinPolarizedFlattened); - } - } - - if (dealii::Utilities::MPI::n_mpi_processes(interBandGroupComm) > 1) - { - dealii::Utilities::MPI::sum(rhoValuesFlattened, - interBandGroupComm, - rhoValuesFlattened); - - if (isEvaluateGradRho) - dealii::Utilities::MPI::sum(gradRhoValuesFlattened, - interBandGroupComm, - gradRhoValuesFlattened); - - - if (dftParams.spinPolarized == 1) - { - dealii::Utilities::MPI::sum(rhoValuesSpinPolarizedFlattened, - interBandGroupComm, - rhoValuesSpinPolarizedFlattened); - - if (isEvaluateGradRho) - dealii::Utilities::MPI::sum(gradRhoValuesSpinPolarizedFlattened, - interBandGroupComm, - gradRhoValuesSpinPolarizedFlattened); - } - } - - - unsigned int iElem = 0; - cell = dofHandler.begin_active(); - endc = dofHandler.end(); - for (; cell != endc; ++cell) - if (cell->is_locally_owned()) - { - const dealii::CellId cellid = cell->id(); - - std::vector dummy(1); - std::vector &tempRhoQuads = (*rhoValues)[cellid]; - std::vector &tempGradRhoQuads = - isEvaluateGradRho ? (*gradRhoValues)[cellid] : dummy; - - std::vector &tempRhoQuadsSP = - (dftParams.spinPolarized == 1) ? 
(*rhoValuesSpinPolarized)[cellid] : - dummy; - std::vector &tempGradRhoQuadsSP = - ((dftParams.spinPolarized == 1) && isEvaluateGradRho) ? - (*gradRhoValuesSpinPolarized)[cellid] : - dummy; - - if (dftParams.spinPolarized == 1) - { - for (unsigned int q = 0; q < numQuadPoints; ++q) - { - tempRhoQuadsSP[2 * q + 0] = - rhoValuesSpinPolarizedFlattened[iElem * numQuadPoints * 2 + - q * 2 + 0]; - - tempRhoQuadsSP[2 * q + 1] = - rhoValuesSpinPolarizedFlattened[iElem * numQuadPoints * 2 + - q * 2 + 1]; - } - - if (isEvaluateGradRho) - for (unsigned int q = 0; q < numQuadPoints; ++q) - { - tempGradRhoQuadsSP[6 * q + 0] = - gradRhoValuesSpinPolarizedFlattened[iElem * - numQuadPoints * 6 + - 6 * q]; - tempGradRhoQuadsSP[6 * q + 1] = - gradRhoValuesSpinPolarizedFlattened[iElem * - numQuadPoints * 6 + - 6 * q + 1]; - tempGradRhoQuadsSP[6 * q + 2] = - gradRhoValuesSpinPolarizedFlattened[iElem * - numQuadPoints * 6 + - 6 * q + 2]; - tempGradRhoQuadsSP[6 * q + 3] = - gradRhoValuesSpinPolarizedFlattened[iElem * - numQuadPoints * 6 + - 6 * q + 3]; - tempGradRhoQuadsSP[6 * q + 4] = - gradRhoValuesSpinPolarizedFlattened[iElem * - numQuadPoints * 6 + - 6 * q + 4]; - tempGradRhoQuadsSP[6 * q + 5] = - gradRhoValuesSpinPolarizedFlattened[iElem * - numQuadPoints * 6 + - 6 * q + 5]; - } - } - - for (unsigned int q = 0; q < numQuadPoints; ++q) - tempRhoQuads[q] = rhoValuesFlattened[iElem * numQuadPoints + q]; - - - if (isEvaluateGradRho) - for (unsigned int q = 0; q < numQuadPoints; ++q) - { - tempGradRhoQuads[3 * q] = - gradRhoValuesFlattened[iElem * numQuadPoints * 3 + q * 3]; - tempGradRhoQuads[3 * q + 1] = - gradRhoValuesFlattened[iElem * numQuadPoints * 3 + q * 3 + 1]; - tempGradRhoQuads[3 * q + 2] = - gradRhoValuesFlattened[iElem * numQuadPoints * 3 + q * 3 + 2]; - } - iElem++; - } - - MPI_Barrier(mpiCommParent); - cpu_time = MPI_Wtime() - cpu_time; - - if (this_process == 0 && dftParams.verbosity >= 2) - std::cout << "Time for compute rho on CPU: " << cpu_time << std::endl; - } - - 
template void - computeRhoFromPSICPU( - const dataTypes::number * X, - const dataTypes::number * XFrac, - const unsigned int totalNumWaveFunctions, - const unsigned int Nfr, - const unsigned int numLocalDofs, - const std::vector> &eigenValues, - const double fermiEnergy, - const double fermiEnergyUp, - const double fermiEnergyDown, - operatorDFTClass & operatorMatrix, - std::unique_ptr< - dftfe::basis::FEBasisOperations> - & basisOperationsPtrHost, - const dealii::DoFHandler<3> & dofHandler, - const unsigned int totalLocallyOwnedCells, - const unsigned int numNodesPerElement, - const unsigned int numQuadPoints, - const std::vector & kPointWeights, - std::map> *rhoValues, - std::map> *gradRhoValues, - std::map> *rhoValuesSpinPolarized, - std::map> *gradRhoValuesSpinPolarized, - const bool isEvaluateGradRho, - const MPI_Comm & mpiCommParent, - const MPI_Comm & interpoolcomm, - const MPI_Comm & interBandGroupComm, - const dftParameters & dftParams, - const bool spectrumSplit, - const bool useFEOrderRhoPlusOneGLQuad); -} // namespace dftfe diff --git a/src/dft/densityCalculatorDevice.cc b/src/dft/densityCalculatorDevice.cc deleted file mode 100644 index 3559e9d1e..000000000 --- a/src/dft/densityCalculatorDevice.cc +++ /dev/null @@ -1,1318 +0,0 @@ -// --------------------------------------------------------------------- -// -// Copyright (c) 2017-2022 The Regents of the University of Michigan and DFT-FE -// authors. -// -// This file is part of the DFT-FE code. -// -// The DFT-FE code is free software; you can use it, redistribute -// it, and/or modify it under the terms of the GNU Lesser General -// Public License as published by the Free Software Foundation; either -// version 2.1 of the License, or (at your option) any later version. -// The full text of the license can be found in the file LICENSE at -// the top level of the DFT-FE distribution. 
-// -// --------------------------------------------------------------------- -// -// @author Sambit Das -// - -// source file for electron density related computations -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -namespace dftfe -{ - namespace Device - { - namespace - { - __global__ void - computeRhoGradRhoFromInterpolatedValues( - const unsigned int numberEntries, - double * rhoCellsWfcContributions, - double * gradRhoCellsWfcContributionsX, - double * gradRhoCellsWfcContributionsY, - double * gradRhoCellsWfcContributionsZ, - const bool isEvaluateGradRho) - { - const unsigned int globalThreadId = - blockIdx.x * blockDim.x + threadIdx.x; - - for (unsigned int index = globalThreadId; index < numberEntries; - index += blockDim.x * gridDim.x) - { - const double psi = rhoCellsWfcContributions[index]; - rhoCellsWfcContributions[index] = psi * psi; - - if (isEvaluateGradRho) - { - const double gradPsiX = gradRhoCellsWfcContributionsX[index]; - gradRhoCellsWfcContributionsX[index] = 2.0 * psi * gradPsiX; - - const double gradPsiY = gradRhoCellsWfcContributionsY[index]; - gradRhoCellsWfcContributionsY[index] = 2.0 * psi * gradPsiY; - - const double gradPsiZ = gradRhoCellsWfcContributionsZ[index]; - gradRhoCellsWfcContributionsZ[index] = 2.0 * psi * gradPsiZ; - } - } - } - - __global__ void - computeRhoGradRhoFromInterpolatedValues( - const unsigned int numberEntries, - dftfe::utils::deviceDoubleComplex *rhoCellsWfcContributions, - dftfe::utils::deviceDoubleComplex *gradRhoCellsWfcContributionsX, - dftfe::utils::deviceDoubleComplex *gradRhoCellsWfcContributionsY, - dftfe::utils::deviceDoubleComplex *gradRhoCellsWfcContributionsZ, - const bool isEvaluateGradRho) - { - const unsigned int globalThreadId = - blockIdx.x * blockDim.x + threadIdx.x; - - for (unsigned int index = globalThreadId; index < numberEntries; - index += blockDim.x * gridDim.x) - { - const 
dftfe::utils::deviceDoubleComplex psi = - rhoCellsWfcContributions[index]; - rhoCellsWfcContributions[index] = - dftfe::utils::makeComplex(psi.x * psi.x + psi.y * psi.y, 0.0); - - if (isEvaluateGradRho) - { - const dftfe::utils::deviceDoubleComplex gradPsiX = - gradRhoCellsWfcContributionsX[index]; - gradRhoCellsWfcContributionsX[index] = - dftfe::utils::makeComplex(2.0 * (psi.x * gradPsiX.x + - psi.y * gradPsiX.y), - 0.0); - - const dftfe::utils::deviceDoubleComplex gradPsiY = - gradRhoCellsWfcContributionsY[index]; - gradRhoCellsWfcContributionsY[index] = - dftfe::utils::makeComplex(2.0 * (psi.x * gradPsiY.x + - psi.y * gradPsiY.y), - 0.0); - - const dftfe::utils::deviceDoubleComplex gradPsiZ = - gradRhoCellsWfcContributionsZ[index]; - gradRhoCellsWfcContributionsZ[index] = - dftfe::utils::makeComplex(2.0 * (psi.x * gradPsiZ.x + - psi.y * gradPsiZ.y), - 0.0); - } - } - } - } // namespace - - template - void - computeRhoFromPSI( - const NumberType * X, - const NumberType * XFrac, - const unsigned int totalNumWaveFunctions, - const unsigned int Nfr, - const unsigned int numLocalDofs, - const std::vector> &eigenValues, - const double fermiEnergy, - const double fermiEnergyUp, - const double fermiEnergyDown, - operatorDFTDeviceClass & operatorMatrix, - std::unique_ptr< - dftfe::basis::FEBasisOperations> - & basisOperationsPtrDevice, - const unsigned int matrixFreeDofhandlerIndex, - const dealii::DoFHandler<3> & dofHandler, - const unsigned int totalLocallyOwnedCells, - const unsigned int numNodesPerElement, - const unsigned int numQuadPoints, - const std::vector & kPointWeights, - std::map> *rhoValues, - std::map> *gradRhoValues, - std::map> *rhoValuesSpinPolarized, - std::map> *gradRhoValuesSpinPolarized, - const bool isEvaluateGradRho, - const MPI_Comm & mpiCommParent, - const MPI_Comm & interpoolcomm, - const MPI_Comm & interBandGroupComm, - const dftParameters & dftParams, - const bool spectrumSplit, - const bool use2pPlusOneGLQuad) - { - if (use2pPlusOneGLQuad) 
- AssertThrow(!isEvaluateGradRho, dftUtils::ExcNotImplementedYet()); - - int this_process; - MPI_Comm_rank(mpiCommParent, &this_process); - dftfe::utils::deviceSynchronize(); - MPI_Barrier(mpiCommParent); - double device_time = MPI_Wtime(); - const unsigned int numKPoints = kPointWeights.size(); - - // band group parallelization data structures - const unsigned int numberBandGroups = - dealii::Utilities::MPI::n_mpi_processes(interBandGroupComm); - const unsigned int bandGroupTaskId = - dealii::Utilities::MPI::this_mpi_process(interBandGroupComm); - std::vector bandGroupLowHighPlusOneIndices; - dftUtils::createBandParallelizationIndices( - interBandGroupComm, - totalNumWaveFunctions, - bandGroupLowHighPlusOneIndices); - - const unsigned int BVec = - std::min(dftParams.chebyWfcBlockSize, totalNumWaveFunctions); - - const double spinPolarizedFactor = - (dftParams.spinPolarized == 1) ? 1.0 : 2.0; - - const NumberType zero = 0; - const NumberType scalarCoeffAlphaRho = 1.0; - const NumberType scalarCoeffBetaRho = 1.0; - const NumberType scalarCoeffAlphaGradRho = 1.0; - const NumberType scalarCoeffBetaGradRho = 1.0; - - const unsigned int cellsBlockSize = 50; - const unsigned int numCellBlocks = - totalLocallyOwnedCells / cellsBlockSize; - const unsigned int remCellBlockSize = - totalLocallyOwnedCells - numCellBlocks * cellsBlockSize; - - dftfe::utils::MemoryStorage - rhoDevice(totalLocallyOwnedCells * numQuadPoints, zero); - dftfe::utils::MemoryStorage - rhoWfcContributionsDevice(cellsBlockSize * numQuadPoints * BVec, zero); - - dftfe::utils::MemoryStorage - gradRhoDeviceX(isEvaluateGradRho ? - (totalLocallyOwnedCells * numQuadPoints) : - 1, - zero); - dftfe::utils::MemoryStorage - gradRhoDeviceY(isEvaluateGradRho ? - (totalLocallyOwnedCells * numQuadPoints) : - 1, - zero); - dftfe::utils::MemoryStorage - gradRhoDeviceZ(isEvaluateGradRho ? 
- (totalLocallyOwnedCells * numQuadPoints) : - 1, - zero); - dftfe::utils::MemoryStorage - gradRhoWfcContributionsDeviceX( - isEvaluateGradRho ? (cellsBlockSize * numQuadPoints * BVec) : 1, - zero); - dftfe::utils::MemoryStorage - gradRhoWfcContributionsDeviceY( - isEvaluateGradRho ? (cellsBlockSize * numQuadPoints * BVec) : 1, - zero); - dftfe::utils::MemoryStorage - gradRhoWfcContributionsDeviceZ( - isEvaluateGradRho ? (cellsBlockSize * numQuadPoints * BVec) : 1, - zero); - - dftfe::utils::MemoryStorage - rhoHost; - dftfe::utils::MemoryStorage - gradRhoHostX; - dftfe::utils::MemoryStorage - gradRhoHostY; - dftfe::utils::MemoryStorage - gradRhoHostZ; - - rhoHost.resize(totalLocallyOwnedCells * numQuadPoints, zero); - - if (isEvaluateGradRho) - { - gradRhoHostX.resize(totalLocallyOwnedCells * numQuadPoints, zero); - - gradRhoHostY.resize(totalLocallyOwnedCells * numQuadPoints, zero); - gradRhoHostZ.resize(totalLocallyOwnedCells * numQuadPoints, zero); - } - - - dftfe::utils::MemoryStorage - shapeFunctionValuesTransposedDevice(numNodesPerElement * numQuadPoints, - zero); - - shapeFunctionValuesTransposedDevice.setValue(zero); - - - dftfe::utils::deviceKernelsGeneric::copyValueType1ArrToValueType2Arr( - numNodesPerElement * numQuadPoints, - (operatorMatrix.getShapeFunctionValuesTransposed(use2pPlusOneGLQuad)) - .begin(), - shapeFunctionValuesTransposedDevice.begin()); - - dftfe::utils::MemoryStorage - shapeFunctionGradientValuesXTransposedDevice; - dftfe::utils::MemoryStorage - shapeFunctionGradientValuesYTransposedDevice; - dftfe::utils::MemoryStorage - shapeFunctionGradientValuesZTransposedDevice; - - if (isEvaluateGradRho) - { - shapeFunctionGradientValuesXTransposedDevice.resize( - cellsBlockSize * numNodesPerElement * numQuadPoints, 0); - shapeFunctionGradientValuesXTransposedDevice.setValue(0); - - shapeFunctionGradientValuesYTransposedDevice.resize( - cellsBlockSize * numNodesPerElement * numQuadPoints, 0); - 
shapeFunctionGradientValuesYTransposedDevice.setValue(0); - - shapeFunctionGradientValuesZTransposedDevice.resize( - cellsBlockSize * numNodesPerElement * numQuadPoints, 0); - shapeFunctionGradientValuesZTransposedDevice.setValue(0); - } - - dftfe::utils::MemoryStorage - partialOccupVec(BVec, zero); - dftfe::utils::MemoryStorage - partialOccupVecDevice(BVec, zero); - - distributedDeviceVec &deviceFlattenedArrayBlock = - operatorMatrix.getParallelChebyBlockVectorDevice(); - dftfe::utils::MemoryStorage - &cellWaveFunctionMatrixMV = operatorMatrix.getCellWaveFunctionMatrix(); - NumberType *cellWaveFunctionMatrix = (cellWaveFunctionMatrixMV).begin(); - - typename dealii::DoFHandler<3>::active_cell_iterator cell = - dofHandler.begin_active(); - typename dealii::DoFHandler<3>::active_cell_iterator endc = - dofHandler.end(); - - std::vector rhoValuesFlattened(totalLocallyOwnedCells * - numQuadPoints, - 0.0); - std::vector gradRhoValuesFlattened(totalLocallyOwnedCells * - numQuadPoints * 3, - 0.0); - std::vector rhoValuesSpinPolarizedFlattened( - totalLocallyOwnedCells * numQuadPoints * 2, 0.0); - std::vector gradRhoValuesSpinPolarizedFlattened( - totalLocallyOwnedCells * numQuadPoints * 6, 0.0); - - for (unsigned int spinIndex = 0; - spinIndex < (1 + dftParams.spinPolarized); - ++spinIndex) - { - for (unsigned int kPoint = 0; kPoint < kPointWeights.size(); ++kPoint) - { - rhoDevice.setValue(zero); - rhoWfcContributionsDevice.setValue(zero); - gradRhoDeviceX.setValue(zero); - gradRhoDeviceY.setValue(zero); - gradRhoDeviceZ.setValue(zero); - gradRhoWfcContributionsDeviceX.setValue(zero); - gradRhoWfcContributionsDeviceY.setValue(zero); - gradRhoWfcContributionsDeviceZ.setValue(zero); - - for (unsigned int jvec = 0; jvec < totalNumWaveFunctions; - jvec += BVec) - { - if ((jvec + BVec) <= - bandGroupLowHighPlusOneIndices[2 * bandGroupTaskId + - 1] && - (jvec + BVec) > - bandGroupLowHighPlusOneIndices[2 * bandGroupTaskId]) - { - if (spectrumSplit) - { - 
partialOccupVecDevice.setValue(kPointWeights[kPoint] * - spinPolarizedFactor); - } - else - { - if (dftParams.constraintMagnetization) - { - const double fermiEnergyConstraintMag = - spinIndex == 0 ? fermiEnergyUp : - fermiEnergyDown; - for (unsigned int iEigenVec = 0; iEigenVec < BVec; - ++iEigenVec) - { - if (eigenValues[kPoint] - [totalNumWaveFunctions * - spinIndex + - jvec + iEigenVec] > - fermiEnergyConstraintMag) - *(partialOccupVec.begin() + iEigenVec) = 0; - else - *(partialOccupVec.begin() + iEigenVec) = - kPointWeights[kPoint] * - spinPolarizedFactor; - } - } - else - { - for (unsigned int iEigenVec = 0; iEigenVec < BVec; - ++iEigenVec) - { - *(partialOccupVec.begin() + iEigenVec) = - dftUtils::getPartialOccupancy( - eigenValues[kPoint] - [totalNumWaveFunctions * - spinIndex + - jvec + iEigenVec], - fermiEnergy, - C_kb, - dftParams.TVal) * - kPointWeights[kPoint] * spinPolarizedFactor; - } - } - - partialOccupVec - .template copyTo( - partialOccupVecDevice); - } - - dftfe::utils::deviceKernelsGeneric:: - stridedCopyToBlockConstantStride( - BVec, - totalNumWaveFunctions, - numLocalDofs, - jvec, - X + numLocalDofs * totalNumWaveFunctions * - ((dftParams.spinPolarized + 1) * kPoint + - spinIndex), - deviceFlattenedArrayBlock.begin()); - - const unsigned int d_eigenDofHandlerIndex = 1; - const unsigned int d_quadratureIndex = - use2pPlusOneGLQuad ? 2 : 0; - dftfe::basis::UpdateFlags updateFlags = - dftfe::basis::update_values | - dftfe::basis::update_gradients; - basisOperationsPtrDevice->reinit(BVec, - 0, - d_quadratureIndex, - updateFlags); - - - deviceFlattenedArrayBlock.updateGhostValues(); - - (operatorMatrix.getOverloadedConstraintMatrix()) - ->distribute(deviceFlattenedArrayBlock, BVec); - - for (int iblock = 0; iblock < (numCellBlocks + 1); - iblock++) - { - const unsigned int currentCellsBlockSize = - (iblock == numCellBlocks) ? 
remCellBlockSize : - cellsBlockSize; - if (currentCellsBlockSize > 0) - { - const unsigned int startingCellId = - iblock * cellsBlockSize; - - dftfe::utils::deviceKernelsGeneric:: - stridedCopyToBlock( - BVec, - currentCellsBlockSize * numNodesPerElement, - deviceFlattenedArrayBlock.begin(), - cellWaveFunctionMatrix, - (operatorMatrix - .getFlattenedArrayCellLocalProcIndexIdMap()) - .begin() + - startingCellId * numNodesPerElement); - // basisOperationsPtrDevice - // ->extractToCellNodalDataKernel( - // deviceFlattenedArrayBlock, - // &cellWaveFunctionMatrixMV, - // std::pair( - // startingCellId, - // startingCellId + currentCellsBlockSize)); - - NumberType scalarCoeffAlpha = 1.0; - NumberType scalarCoeffBeta = 0; - int strideA = BVec * numNodesPerElement; - int strideB = 0; - int strideC = BVec * numQuadPoints; - - // dftfe::utils::deviceBlasWrapper:: - // gemmStridedBatched( - // operatorMatrix.getDeviceBlasHandle(), - // dftfe::utils::DEVICEBLAS_OP_N, - // dftfe::utils::DEVICEBLAS_OP_N, - // BVec, - // numQuadPoints, - // numNodesPerElement, - // &scalarCoeffAlpha, - // cellWaveFunctionMatrixMV.data(), - // BVec, - // strideA, - // shapeFunctionValuesTransposedDevice.begin(), - // numNodesPerElement, - // strideB, - // &scalarCoeffBeta, - // rhoWfcContributionsDevice.begin(), - // BVec, - // strideC, - // currentCellsBlockSize); - basisOperationsPtrDevice->interpolateKernel( - deviceFlattenedArrayBlock, - &rhoWfcContributionsDevice, - NULL, - std::pair( - startingCellId, - startingCellId + currentCellsBlockSize)); - - - if (isEvaluateGradRho) - { - strideB = numNodesPerElement * numQuadPoints; - - - dftfe::utils::deviceKernelsGeneric:: - copyValueType1ArrToValueType2Arr( - currentCellsBlockSize * - numNodesPerElement * numQuadPoints, - (operatorMatrix - .getShapeFunctionGradientValuesXTransposed()) - .begin() + - startingCellId * numNodesPerElement * - numQuadPoints, - shapeFunctionGradientValuesXTransposedDevice - .begin()); - - 
dftfe::utils::deviceKernelsGeneric:: - copyValueType1ArrToValueType2Arr( - currentCellsBlockSize * - numNodesPerElement * numQuadPoints, - (operatorMatrix - .getShapeFunctionGradientValuesYTransposed()) - .begin() + - startingCellId * numNodesPerElement * - numQuadPoints, - shapeFunctionGradientValuesYTransposedDevice - .begin()); - - dftfe::utils::deviceKernelsGeneric:: - copyValueType1ArrToValueType2Arr( - currentCellsBlockSize * - numNodesPerElement * numQuadPoints, - (operatorMatrix - .getShapeFunctionGradientValuesZTransposed()) - .begin() + - startingCellId * numNodesPerElement * - numQuadPoints, - shapeFunctionGradientValuesZTransposedDevice - .begin()); - - dftfe::utils::deviceBlasWrapper:: - gemmStridedBatched( - operatorMatrix.getDeviceBlasHandle(), - dftfe::utils::DEVICEBLAS_OP_N, - dftfe::utils::DEVICEBLAS_OP_N, - BVec, - numQuadPoints, - numNodesPerElement, - &scalarCoeffAlpha, - cellWaveFunctionMatrix, - BVec, - strideA, - shapeFunctionGradientValuesXTransposedDevice - .begin(), - numNodesPerElement, - strideB, - &scalarCoeffBeta, - gradRhoWfcContributionsDeviceX.begin(), - BVec, - strideC, - currentCellsBlockSize); - - - dftfe::utils::deviceBlasWrapper:: - gemmStridedBatched( - operatorMatrix.getDeviceBlasHandle(), - dftfe::utils::DEVICEBLAS_OP_N, - dftfe::utils::DEVICEBLAS_OP_N, - BVec, - numQuadPoints, - numNodesPerElement, - &scalarCoeffAlpha, - cellWaveFunctionMatrix, - BVec, - strideA, - shapeFunctionGradientValuesYTransposedDevice - .begin(), - numNodesPerElement, - strideB, - &scalarCoeffBeta, - gradRhoWfcContributionsDeviceY.begin(), - BVec, - strideC, - currentCellsBlockSize); - - dftfe::utils::deviceBlasWrapper:: - gemmStridedBatched( - operatorMatrix.getDeviceBlasHandle(), - dftfe::utils::DEVICEBLAS_OP_N, - dftfe::utils::DEVICEBLAS_OP_N, - BVec, - numQuadPoints, - numNodesPerElement, - &scalarCoeffAlpha, - cellWaveFunctionMatrix, - BVec, - strideA, - shapeFunctionGradientValuesZTransposedDevice - .begin(), - numNodesPerElement, - strideB, 
- &scalarCoeffBeta, - gradRhoWfcContributionsDeviceZ.begin(), - BVec, - strideC, - currentCellsBlockSize); - } - - -#ifdef DFTFE_WITH_DEVICE_LANG_CUDA - computeRhoGradRhoFromInterpolatedValues<<< - (BVec + (dftfe::utils::DEVICE_BLOCK_SIZE - 1)) / - dftfe::utils::DEVICE_BLOCK_SIZE * - numQuadPoints * currentCellsBlockSize, - dftfe::utils::DEVICE_BLOCK_SIZE>>>( - currentCellsBlockSize * numQuadPoints * BVec, - dftfe::utils::makeDataTypeDeviceCompatible( - rhoWfcContributionsDevice.begin()), - dftfe::utils::makeDataTypeDeviceCompatible( - gradRhoWfcContributionsDeviceX.begin()), - dftfe::utils::makeDataTypeDeviceCompatible( - gradRhoWfcContributionsDeviceY.begin()), - dftfe::utils::makeDataTypeDeviceCompatible( - gradRhoWfcContributionsDeviceZ.begin()), - isEvaluateGradRho); -#elif DFTFE_WITH_DEVICE_LANG_HIP - hipLaunchKernelGGL( - computeRhoGradRhoFromInterpolatedValues, - (BVec + (dftfe::utils::DEVICE_BLOCK_SIZE - 1)) / - dftfe::utils::DEVICE_BLOCK_SIZE * - numQuadPoints * currentCellsBlockSize, - dftfe::utils::DEVICE_BLOCK_SIZE, - 0, - 0, - currentCellsBlockSize * numQuadPoints * BVec, - dftfe::utils::makeDataTypeDeviceCompatible( - rhoWfcContributionsDevice.begin()), - dftfe::utils::makeDataTypeDeviceCompatible( - gradRhoWfcContributionsDeviceX.begin()), - dftfe::utils::makeDataTypeDeviceCompatible( - gradRhoWfcContributionsDeviceY.begin()), - dftfe::utils::makeDataTypeDeviceCompatible( - gradRhoWfcContributionsDeviceZ.begin()), - isEvaluateGradRho); -#endif - - dftfe::utils::deviceBlasWrapper::gemm( - operatorMatrix.getDeviceBlasHandle(), - dftfe::utils::DEVICEBLAS_OP_N, - dftfe::utils::DEVICEBLAS_OP_N, - 1, - currentCellsBlockSize * numQuadPoints, - BVec, - &scalarCoeffAlphaRho, - partialOccupVecDevice.begin(), - 1, - rhoWfcContributionsDevice.begin(), - BVec, - &scalarCoeffBetaRho, - rhoDevice.begin() + - startingCellId * numQuadPoints, - 1); - - - if (isEvaluateGradRho) - { - dftfe::utils::deviceBlasWrapper::gemm( - operatorMatrix.getDeviceBlasHandle(), - 
dftfe::utils::DEVICEBLAS_OP_N, - dftfe::utils::DEVICEBLAS_OP_N, - 1, - currentCellsBlockSize * numQuadPoints, - BVec, - &scalarCoeffAlphaGradRho, - partialOccupVecDevice.begin(), - 1, - gradRhoWfcContributionsDeviceX.begin(), - BVec, - &scalarCoeffBetaGradRho, - gradRhoDeviceX.begin() + - startingCellId * numQuadPoints, - 1); - - - dftfe::utils::deviceBlasWrapper::gemm( - operatorMatrix.getDeviceBlasHandle(), - dftfe::utils::DEVICEBLAS_OP_N, - dftfe::utils::DEVICEBLAS_OP_N, - 1, - currentCellsBlockSize * numQuadPoints, - BVec, - &scalarCoeffAlphaGradRho, - partialOccupVecDevice.begin(), - 1, - gradRhoWfcContributionsDeviceY.begin(), - BVec, - &scalarCoeffBetaGradRho, - gradRhoDeviceY.begin() + - startingCellId * numQuadPoints, - 1); - - dftfe::utils::deviceBlasWrapper::gemm( - operatorMatrix.getDeviceBlasHandle(), - dftfe::utils::DEVICEBLAS_OP_N, - dftfe::utils::DEVICEBLAS_OP_N, - 1, - currentCellsBlockSize * numQuadPoints, - BVec, - &scalarCoeffAlphaGradRho, - partialOccupVecDevice.begin(), - 1, - gradRhoWfcContributionsDeviceZ.begin(), - BVec, - &scalarCoeffBetaGradRho, - gradRhoDeviceZ.begin() + - startingCellId * numQuadPoints, - 1); - } - } // non-trivial cell block check - } // cells block loop - } // band parallelizatoin check - } // wave function block loop - - if (spectrumSplit) - for (unsigned int jvec = 0; jvec < Nfr; jvec += BVec) - if ((jvec + totalNumWaveFunctions - Nfr + BVec) <= - bandGroupLowHighPlusOneIndices[2 * bandGroupTaskId + - 1] && - (jvec + totalNumWaveFunctions - Nfr + BVec) > - bandGroupLowHighPlusOneIndices[2 * bandGroupTaskId]) - { - if (dftParams.constraintMagnetization) - { - const double fermiEnergyConstraintMag = - spinIndex == 0 ? 
fermiEnergyUp : fermiEnergyDown; - for (unsigned int iEigenVec = 0; iEigenVec < BVec; - ++iEigenVec) - { - if (eigenValues[kPoint] - [totalNumWaveFunctions * - spinIndex + - (totalNumWaveFunctions - Nfr) + - jvec + iEigenVec] > - fermiEnergyConstraintMag) - *(partialOccupVec.begin() + iEigenVec) = - -kPointWeights[kPoint] * spinPolarizedFactor; - else - *(partialOccupVec.begin() + iEigenVec) = 0; - } - } - else - { - for (unsigned int iEigenVec = 0; iEigenVec < BVec; - ++iEigenVec) - { - *(partialOccupVec.begin() + iEigenVec) = - (dftUtils::getPartialOccupancy( - eigenValues[kPoint] - [totalNumWaveFunctions * - spinIndex + - (totalNumWaveFunctions - Nfr) + - jvec + iEigenVec], - fermiEnergy, - C_kb, - dftParams.TVal) - - 1.0) * - kPointWeights[kPoint] * spinPolarizedFactor; - } - } - - partialOccupVec - .template copyTo( - partialOccupVecDevice); - - - dftfe::utils::deviceKernelsGeneric:: - stridedCopyToBlockConstantStride( - BVec, - Nfr, - numLocalDofs, - jvec, - XFrac + numLocalDofs * Nfr * - ((dftParams.spinPolarized + 1) * kPoint + - spinIndex), - deviceFlattenedArrayBlock.begin()); - - deviceFlattenedArrayBlock.updateGhostValues(); - - (operatorMatrix.getOverloadedConstraintMatrix()) - ->distribute(deviceFlattenedArrayBlock, BVec); - - for (int iblock = 0; iblock < (numCellBlocks + 1); - iblock++) - { - const unsigned int currentCellsBlockSize = - (iblock == numCellBlocks) ? 
remCellBlockSize : - cellsBlockSize; - if (currentCellsBlockSize > 0) - { - const unsigned int startingCellId = - iblock * cellsBlockSize; - - dftfe::utils::deviceKernelsGeneric:: - stridedCopyToBlock( - BVec, - currentCellsBlockSize * numNodesPerElement, - deviceFlattenedArrayBlock.begin(), - cellWaveFunctionMatrix, - (operatorMatrix - .getFlattenedArrayCellLocalProcIndexIdMap()) - .begin() + - startingCellId * numNodesPerElement); - - NumberType scalarCoeffAlpha = 1.0; - NumberType scalarCoeffBeta = 0; - int strideA = BVec * numNodesPerElement; - int strideB = 0; - int strideC = BVec * numQuadPoints; - - - dftfe::utils::deviceBlasWrapper:: - gemmStridedBatched( - operatorMatrix.getDeviceBlasHandle(), - dftfe::utils::DEVICEBLAS_OP_N, - dftfe::utils::DEVICEBLAS_OP_N, - BVec, - numQuadPoints, - numNodesPerElement, - &scalarCoeffAlpha, - cellWaveFunctionMatrix, - BVec, - strideA, - shapeFunctionValuesTransposedDevice.begin(), - numNodesPerElement, - strideB, - &scalarCoeffBeta, - rhoWfcContributionsDevice.begin(), - BVec, - strideC, - currentCellsBlockSize); - - - - if (isEvaluateGradRho) - { - strideB = numNodesPerElement * numQuadPoints; - - dftfe::utils::deviceKernelsGeneric:: - copyValueType1ArrToValueType2Arr( - currentCellsBlockSize * - numNodesPerElement * numQuadPoints, - (operatorMatrix - .getShapeFunctionGradientValuesXTransposed()) - .begin() + - startingCellId * numNodesPerElement * - numQuadPoints, - shapeFunctionGradientValuesXTransposedDevice - .begin()); - - dftfe::utils::deviceKernelsGeneric:: - copyValueType1ArrToValueType2Arr( - currentCellsBlockSize * - numNodesPerElement * numQuadPoints, - (operatorMatrix - .getShapeFunctionGradientValuesYTransposed()) - .begin() + - startingCellId * numNodesPerElement * - numQuadPoints, - shapeFunctionGradientValuesYTransposedDevice - .begin()); - - dftfe::utils::deviceKernelsGeneric:: - copyValueType1ArrToValueType2Arr( - currentCellsBlockSize * - numNodesPerElement * numQuadPoints, - (operatorMatrix - 
.getShapeFunctionGradientValuesZTransposed()) - .begin() + - startingCellId * numNodesPerElement * - numQuadPoints, - shapeFunctionGradientValuesZTransposedDevice - .begin()); - - dftfe::utils::deviceBlasWrapper:: - gemmStridedBatched( - operatorMatrix.getDeviceBlasHandle(), - dftfe::utils::DEVICEBLAS_OP_N, - dftfe::utils::DEVICEBLAS_OP_N, - BVec, - numQuadPoints, - numNodesPerElement, - &scalarCoeffAlpha, - cellWaveFunctionMatrix, - BVec, - strideA, - shapeFunctionGradientValuesXTransposedDevice - .begin(), - numNodesPerElement, - strideB, - &scalarCoeffBeta, - gradRhoWfcContributionsDeviceX.begin(), - BVec, - strideC, - currentCellsBlockSize); - - - dftfe::utils::deviceBlasWrapper:: - gemmStridedBatched( - operatorMatrix.getDeviceBlasHandle(), - dftfe::utils::DEVICEBLAS_OP_N, - dftfe::utils::DEVICEBLAS_OP_N, - BVec, - numQuadPoints, - numNodesPerElement, - &scalarCoeffAlpha, - cellWaveFunctionMatrix, - BVec, - strideA, - shapeFunctionGradientValuesYTransposedDevice - .begin(), - numNodesPerElement, - strideB, - &scalarCoeffBeta, - gradRhoWfcContributionsDeviceY.begin(), - BVec, - strideC, - currentCellsBlockSize); - - dftfe::utils::deviceBlasWrapper:: - gemmStridedBatched( - operatorMatrix.getDeviceBlasHandle(), - dftfe::utils::DEVICEBLAS_OP_N, - dftfe::utils::DEVICEBLAS_OP_N, - BVec, - numQuadPoints, - numNodesPerElement, - &scalarCoeffAlpha, - cellWaveFunctionMatrix, - BVec, - strideA, - shapeFunctionGradientValuesZTransposedDevice - .begin(), - numNodesPerElement, - strideB, - &scalarCoeffBeta, - gradRhoWfcContributionsDeviceZ.begin(), - BVec, - strideC, - currentCellsBlockSize); - } - - -#ifdef DFTFE_WITH_DEVICE_LANG_CUDA - computeRhoGradRhoFromInterpolatedValues<<< - (BVec + (dftfe::utils::DEVICE_BLOCK_SIZE - 1)) / - dftfe::utils::DEVICE_BLOCK_SIZE * - numQuadPoints * currentCellsBlockSize, - dftfe::utils::DEVICE_BLOCK_SIZE>>>( - currentCellsBlockSize * numQuadPoints * BVec, - dftfe::utils::makeDataTypeDeviceCompatible( - rhoWfcContributionsDevice.begin()), 
- dftfe::utils::makeDataTypeDeviceCompatible( - gradRhoWfcContributionsDeviceX.begin()), - dftfe::utils::makeDataTypeDeviceCompatible( - gradRhoWfcContributionsDeviceY.begin()), - dftfe::utils::makeDataTypeDeviceCompatible( - gradRhoWfcContributionsDeviceZ.begin()), - isEvaluateGradRho); -#elif DFTFE_WITH_DEVICE_LANG_HIP - hipLaunchKernelGGL( - computeRhoGradRhoFromInterpolatedValues, - (BVec + (dftfe::utils::DEVICE_BLOCK_SIZE - 1)) / - dftfe::utils::DEVICE_BLOCK_SIZE * - numQuadPoints * currentCellsBlockSize, - dftfe::utils::DEVICE_BLOCK_SIZE, - 0, - 0, - currentCellsBlockSize * numQuadPoints * BVec, - dftfe::utils::makeDataTypeDeviceCompatible( - rhoWfcContributionsDevice.begin()), - dftfe::utils::makeDataTypeDeviceCompatible( - gradRhoWfcContributionsDeviceX.begin()), - dftfe::utils::makeDataTypeDeviceCompatible( - gradRhoWfcContributionsDeviceY.begin()), - dftfe::utils::makeDataTypeDeviceCompatible( - gradRhoWfcContributionsDeviceZ.begin()), - isEvaluateGradRho); -#endif - - dftfe::utils::deviceBlasWrapper::gemm( - operatorMatrix.getDeviceBlasHandle(), - dftfe::utils::DEVICEBLAS_OP_N, - dftfe::utils::DEVICEBLAS_OP_N, - 1, - currentCellsBlockSize * numQuadPoints, - BVec, - &scalarCoeffAlphaRho, - partialOccupVecDevice.begin(), - 1, - rhoWfcContributionsDevice.begin(), - BVec, - &scalarCoeffBetaRho, - rhoDevice.begin() + - startingCellId * numQuadPoints, - 1); - - - if (isEvaluateGradRho) - { - dftfe::utils::deviceBlasWrapper::gemm( - operatorMatrix.getDeviceBlasHandle(), - dftfe::utils::DEVICEBLAS_OP_N, - dftfe::utils::DEVICEBLAS_OP_N, - 1, - currentCellsBlockSize * numQuadPoints, - BVec, - &scalarCoeffAlphaGradRho, - partialOccupVecDevice.begin(), - 1, - gradRhoWfcContributionsDeviceX.begin(), - BVec, - &scalarCoeffBetaGradRho, - gradRhoDeviceX.begin() + - startingCellId * numQuadPoints, - 1); - - - dftfe::utils::deviceBlasWrapper::gemm( - operatorMatrix.getDeviceBlasHandle(), - dftfe::utils::DEVICEBLAS_OP_N, - dftfe::utils::DEVICEBLAS_OP_N, - 1, - 
currentCellsBlockSize * numQuadPoints, - BVec, - &scalarCoeffAlphaGradRho, - partialOccupVecDevice.begin(), - 1, - gradRhoWfcContributionsDeviceY.begin(), - BVec, - &scalarCoeffBetaGradRho, - gradRhoDeviceY.begin() + - startingCellId * numQuadPoints, - 1); - - dftfe::utils::deviceBlasWrapper::gemm( - operatorMatrix.getDeviceBlasHandle(), - dftfe::utils::DEVICEBLAS_OP_N, - dftfe::utils::DEVICEBLAS_OP_N, - 1, - currentCellsBlockSize * numQuadPoints, - BVec, - &scalarCoeffAlphaGradRho, - partialOccupVecDevice.begin(), - 1, - gradRhoWfcContributionsDeviceZ.begin(), - BVec, - &scalarCoeffBetaGradRho, - gradRhoDeviceZ.begin() + - startingCellId * numQuadPoints, - 1); - } - } // non-tivial cells block - } // cells block loop - } // spectrum split block - - - // do memcopy to host - rhoDevice.template copyTo( - rhoHost.begin(), totalLocallyOwnedCells * numQuadPoints, 0, 0); - - if (isEvaluateGradRho) - { - gradRhoDeviceX - .template copyTo( - gradRhoHostX.begin(), - totalLocallyOwnedCells * numQuadPoints, - 0, - 0); - - gradRhoDeviceY - .template copyTo( - gradRhoHostY.begin(), - totalLocallyOwnedCells * numQuadPoints, - 0, - 0); - - gradRhoDeviceZ - .template copyTo( - gradRhoHostZ.begin(), - totalLocallyOwnedCells * numQuadPoints, - 0, - 0); - } - - for (int icell = 0; icell < totalLocallyOwnedCells; icell++) - for (unsigned int iquad = 0; iquad < numQuadPoints; ++iquad) - { - rhoValuesFlattened[icell * numQuadPoints + iquad] += - dftfe::utils::realPart( - *(rhoHost.begin() + icell * numQuadPoints + iquad)); - } - - if (isEvaluateGradRho) - for (int icell = 0; icell < totalLocallyOwnedCells; icell++) - for (unsigned int iquad = 0; iquad < numQuadPoints; ++iquad) - { - gradRhoValuesFlattened[icell * numQuadPoints * 3 + - 3 * iquad + 0] += - dftfe::utils::realPart(*(gradRhoHostX.begin() + - icell * numQuadPoints + - iquad)); - gradRhoValuesFlattened[icell * numQuadPoints * 3 + - 3 * iquad + 1] += - dftfe::utils::realPart(*(gradRhoHostY.begin() + - icell * numQuadPoints + - 
iquad)); - gradRhoValuesFlattened[icell * numQuadPoints * 3 + - 3 * iquad + 2] += - dftfe::utils::realPart(*(gradRhoHostZ.begin() + - icell * numQuadPoints + - iquad)); - } - if (dftParams.spinPolarized == 1) - { - for (int icell = 0; icell < totalLocallyOwnedCells; icell++) - for (unsigned int iquad = 0; iquad < numQuadPoints; ++iquad) - { - rhoValuesSpinPolarizedFlattened - [icell * numQuadPoints * 2 + iquad * 2 + spinIndex] += - dftfe::utils::realPart( - *(rhoHost.begin() + icell * numQuadPoints + iquad)); - } - - if (isEvaluateGradRho) - for (int icell = 0; icell < totalLocallyOwnedCells; icell++) - for (unsigned int iquad = 0; iquad < numQuadPoints; - ++iquad) - { - gradRhoValuesSpinPolarizedFlattened - [icell * numQuadPoints * 6 + iquad * 6 + - spinIndex * 3] += - dftfe::utils::realPart(*(gradRhoHostX.begin() + - icell * numQuadPoints + - iquad)); - gradRhoValuesSpinPolarizedFlattened - [icell * numQuadPoints * 6 + iquad * 6 + - spinIndex * 3 + 1] += - dftfe::utils::realPart(*(gradRhoHostY.begin() + - icell * numQuadPoints + - iquad)); - gradRhoValuesSpinPolarizedFlattened - [icell * numQuadPoints * 6 + iquad * 6 + - spinIndex * 3 + 2] += - dftfe::utils::realPart(*(gradRhoHostZ.begin() + - icell * numQuadPoints + - iquad)); - } - } - } // kpoint loop - } // spin index - - - // gather density from all inter communicators - if (dealii::Utilities::MPI::n_mpi_processes(interpoolcomm) > 1) - { - dealii::Utilities::MPI::sum(rhoValuesFlattened, - interpoolcomm, - rhoValuesFlattened); - - if (isEvaluateGradRho) - dealii::Utilities::MPI::sum(gradRhoValuesFlattened, - interpoolcomm, - gradRhoValuesFlattened); - - - - if (dftParams.spinPolarized == 1) - { - dealii::Utilities::MPI::sum(rhoValuesSpinPolarizedFlattened, - interpoolcomm, - rhoValuesSpinPolarizedFlattened); - - if (isEvaluateGradRho) - dealii::Utilities::MPI::sum( - gradRhoValuesSpinPolarizedFlattened, - interpoolcomm, - gradRhoValuesSpinPolarizedFlattened); - } - } - - if 
(dealii::Utilities::MPI::n_mpi_processes(interBandGroupComm) > 1) - { - dealii::Utilities::MPI::sum(rhoValuesFlattened, - interBandGroupComm, - rhoValuesFlattened); - - if (isEvaluateGradRho) - dealii::Utilities::MPI::sum(gradRhoValuesFlattened, - interBandGroupComm, - gradRhoValuesFlattened); - - - if (dftParams.spinPolarized == 1) - { - dealii::Utilities::MPI::sum(rhoValuesSpinPolarizedFlattened, - interBandGroupComm, - rhoValuesSpinPolarizedFlattened); - - if (isEvaluateGradRho) - dealii::Utilities::MPI::sum( - gradRhoValuesSpinPolarizedFlattened, - interBandGroupComm, - gradRhoValuesSpinPolarizedFlattened); - } - } - - - unsigned int iElem = 0; - cell = dofHandler.begin_active(); - endc = dofHandler.end(); - for (; cell != endc; ++cell) - if (cell->is_locally_owned()) - { - const dealii::CellId cellid = cell->id(); - - std::vector dummy(1); - std::vector &tempRhoQuads = (*rhoValues)[cellid]; - std::vector &tempGradRhoQuads = - isEvaluateGradRho ? (*gradRhoValues)[cellid] : dummy; - - std::vector &tempRhoQuadsSP = - (dftParams.spinPolarized == 1) ? - (*rhoValuesSpinPolarized)[cellid] : - dummy; - std::vector &tempGradRhoQuadsSP = - ((dftParams.spinPolarized == 1) && isEvaluateGradRho) ? 
- (*gradRhoValuesSpinPolarized)[cellid] : - dummy; - - if (dftParams.spinPolarized == 1) - { - for (unsigned int q = 0; q < numQuadPoints; ++q) - { - tempRhoQuadsSP[2 * q + 0] = - rhoValuesSpinPolarizedFlattened[iElem * numQuadPoints * - 2 + - q * 2 + 0]; - - tempRhoQuadsSP[2 * q + 1] = - rhoValuesSpinPolarizedFlattened[iElem * numQuadPoints * - 2 + - q * 2 + 1]; - } - - if (isEvaluateGradRho) - for (unsigned int q = 0; q < numQuadPoints; ++q) - { - tempGradRhoQuadsSP[6 * q + 0] = - gradRhoValuesSpinPolarizedFlattened - [iElem * numQuadPoints * 6 + 6 * q]; - tempGradRhoQuadsSP[6 * q + 1] = - gradRhoValuesSpinPolarizedFlattened - [iElem * numQuadPoints * 6 + 6 * q + 1]; - tempGradRhoQuadsSP[6 * q + 2] = - gradRhoValuesSpinPolarizedFlattened - [iElem * numQuadPoints * 6 + 6 * q + 2]; - tempGradRhoQuadsSP[6 * q + 3] = - gradRhoValuesSpinPolarizedFlattened - [iElem * numQuadPoints * 6 + 6 * q + 3]; - tempGradRhoQuadsSP[6 * q + 4] = - gradRhoValuesSpinPolarizedFlattened - [iElem * numQuadPoints * 6 + 6 * q + 4]; - tempGradRhoQuadsSP[6 * q + 5] = - gradRhoValuesSpinPolarizedFlattened - [iElem * numQuadPoints * 6 + 6 * q + 5]; - } - } - - for (unsigned int q = 0; q < numQuadPoints; ++q) - tempRhoQuads[q] = rhoValuesFlattened[iElem * numQuadPoints + q]; - - - if (isEvaluateGradRho) - for (unsigned int q = 0; q < numQuadPoints; ++q) - { - tempGradRhoQuads[3 * q] = - gradRhoValuesFlattened[iElem * numQuadPoints * 3 + q * 3]; - tempGradRhoQuads[3 * q + 1] = - gradRhoValuesFlattened[iElem * numQuadPoints * 3 + q * 3 + - 1]; - tempGradRhoQuads[3 * q + 2] = - gradRhoValuesFlattened[iElem * numQuadPoints * 3 + q * 3 + - 2]; - } - iElem++; - } - - dftfe::utils::deviceSynchronize(); - MPI_Barrier(mpiCommParent); - device_time = MPI_Wtime() - device_time; - - if (this_process == 0 && dftParams.verbosity >= 2) - std::cout << "Time for compute rho on Device: " << device_time - << std::endl; - } - - template void - computeRhoFromPSI( - const dataTypes::number * X, - const 
dataTypes::number * XFrac, - const unsigned int totalNumWaveFunctions, - const unsigned int Nfr, - const unsigned int numLocalDofs, - const std::vector> &eigenValues, - const double fermiEnergy, - const double fermiEnergyUp, - const double fermiEnergyDown, - operatorDFTDeviceClass & operatorMatrix, - std::unique_ptr< - dftfe::basis::FEBasisOperations> - & basisOperationsPtrDevice, - const unsigned int matrixFreeDofhandlerIndex, - const dealii::DoFHandler<3> & dofHandler, - const unsigned int totalLocallyOwnedCells, - const unsigned int numNodesPerElement, - const unsigned int numQuadPoints, - const std::vector & kPointWeights, - std::map> *rhoValues, - std::map> *gradRhoValues, - std::map> *rhoValuesSpinPolarized, - std::map> *gradRhoValuesSpinPolarized, - const bool isEvaluateGradRho, - const MPI_Comm & mpiCommParent, - const MPI_Comm & interpoolcomm, - const MPI_Comm & interBandGroupComm, - const dftParameters & dftParams, - const bool spectrumSplit, - const bool use2pPlusOneGLQuad); - } // namespace Device -} // namespace dftfe diff --git a/src/dft/densityCalculatorDeviceKernels.cc b/src/dft/densityCalculatorDeviceKernels.cc index 8f799b124..52a15c0ef 100644 --- a/src/dft/densityCalculatorDeviceKernels.cc +++ b/src/dft/densityCalculatorDeviceKernels.cc @@ -158,8 +158,8 @@ namespace dftfe { const unsigned int cellsBlockSize = cellRange.second - cellRange.first; const unsigned int vectorsBlockSize = vecRange.second - vecRange.first; - const unsigned int nQuadsPerCell = basisOperationsPtr->d_nQuadsPerCell; - const unsigned int nCells = basisOperationsPtr->d_nCells; + const unsigned int nQuadsPerCell = basisOperationsPtr->nQuadsPerCell(); + const unsigned int nCells = basisOperationsPtr->nCells(); const double scalarCoeffAlphaRho = 1.0; const double scalarCoeffBetaRho = 1.0; const double scalarCoeffAlphaGradRho = 1.0; diff --git a/src/dft/dos.cc b/src/dft/dos.cc index 44d6c3873..78ad4b545 100644 --- a/src/dft/dos.cc +++ b/src/dft/dos.cc @@ -475,11 +475,6 @@ 
namespace dftfe eigenVectorsFlattenedBlock[kPoint]); eigenVectorsFlattenedBlock[kPoint] = dataTypes::number(0.0); } - - constraintsNoneDataInfo.precomputeMaps( - matrix_free_data.get_vector_partitioner(), - eigenVectorsFlattenedBlock[0].get_partitioner(), - currentBlockSize); } @@ -1024,11 +1019,6 @@ namespace dftfe eigenVectorsFlattenedBlock[kPoint]); eigenVectorsFlattenedBlock[kPoint] = dataTypes::number(0.0); } - - constraintsNoneDataInfo.precomputeMaps( - matrix_free_data.get_vector_partitioner(), - eigenVectorsFlattenedBlock[0].get_partitioner(), - currentBlockSize); } diff --git a/src/dft/initBoundaryConditions.cc b/src/dft/initBoundaryConditions.cc index 7f3b612f2..16e64ed84 100644 --- a/src/dft/initBoundaryConditions.cc +++ b/src/dft/initBoundaryConditions.cc @@ -267,13 +267,27 @@ namespace dftfe double, dftfe::utils::MemorySpace::HOST>>( matrix_free_data, d_constraintsVector); + dftfe::basis::UpdateFlags updateFlags = dftfe::basis::update_values | + dftfe::basis::update_gradients | + dftfe::basis::update_transpose; + std::vector quadratureIndices(4, 0); + for (auto i = 0; i < 4; ++i) + quadratureIndices[i] = i; + basisOperationsPtrHost->init(d_densityDofHandlerIndex, + quadratureIndices, + updateFlags); #if defined(DFTFE_WITH_DEVICE) if (d_dftParamsPtr->useDevice) - basisOperationsPtrDevice = std::make_unique< - dftfe::basis::FEBasisOperations>( - matrix_free_data, d_constraintsVector); + { + basisOperationsPtrDevice = std::make_unique< + dftfe::basis::FEBasisOperations>( + matrix_free_data, d_constraintsVector); + basisOperationsPtrDevice->init(d_densityDofHandlerIndex, + quadratureIndices, + updateFlags); + } #endif MPI_Barrier(d_mpiCommParent); diff --git a/src/dft/solveVselfInBinsDevice.cc b/src/dft/solveVselfInBinsDevice.cc index 22bad5577..3fc681a31 100644 --- a/src/dft/solveVselfInBinsDevice.cc +++ b/src/dft/solveVselfInBinsDevice.cc @@ -515,10 +515,6 @@ namespace dftfe matrixFreeData.get_vector_partitioner(mfDofHandlerIndex), 
hangingPeriodicConstraintMatrix); - - constraintsMatrixDataInfoDevice.precomputeMaps( - flattenedArray.getMPIPatternP2P(), blockSize); - constraintsMatrixDataInfoDevice.set_zero(xD, blockSize); dftfe::utils::deviceSynchronize(); diff --git a/src/dftOperator/hamiltonianMatrixCalculatorFlattenedDevice.cc b/src/dftOperator/hamiltonianMatrixCalculatorFlattenedDevice.cc index 5ad596be7..c9c9f04b1 100644 --- a/src/dftOperator/hamiltonianMatrixCalculatorFlattenedDevice.cc +++ b/src/dftOperator/hamiltonianMatrixCalculatorFlattenedDevice.cc @@ -167,18 +167,16 @@ namespace double gradShapeXI, gradShapeXJ, gradShapeYI, gradShapeYJ, gradShapeZI, gradShapeZJ; + const double gradShapeXIRef = + shapeFunctionGradientValues[numDofsPerCell * q + cellDofIndexI]; + const double gradShapeYIRef = + shapeFunctionGradientValues[numDofsPerCell * numQuadPoints + + numDofsPerCell * q + cellDofIndexI]; + const double gradShapeZIRef = + shapeFunctionGradientValues[numDofsPerCell * numQuadPoints * 2 + + numDofsPerCell * q + cellDofIndexI]; if (areAllCellsAffineOrCartesianFlag == 0) { - const double gradShapeXIRef = - shapeFunctionGradientValues[numDofsPerCell * q * 3 + - cellDofIndexI]; - const double gradShapeYIRef = - shapeFunctionGradientValues[numDofsPerCell * q * 3 + - numDofsPerCell + cellDofIndexI]; - const double gradShapeZIRef = - shapeFunctionGradientValues[numDofsPerCell * q * 3 + - 2 * numDofsPerCell + - cellDofIndexI]; const double Jxx = inverseJacobianValues[cellIndex * numQuadPoints * 9 + q * 9 + 0]; @@ -216,18 +214,6 @@ namespace } else if (areAllCellsAffineOrCartesianFlag == 1) { - const double gradShapeXIRef = - shapeFunctionGradientValues[numDofsPerCell * q + - cellDofIndexI]; - const double gradShapeYIRef = - shapeFunctionGradientValues[numDofsPerCell * numQuadPoints + - numDofsPerCell * q + - cellDofIndexI]; - const double gradShapeZIRef = - shapeFunctionGradientValues[numDofsPerCell * numQuadPoints * - 2 + - numDofsPerCell * q + - cellDofIndexI]; const double Jxx = 
inverseJacobianValues[cellIndex * 9 + 0]; const double Jxy = inverseJacobianValues[cellIndex * 9 + 1]; const double Jxz = inverseJacobianValues[cellIndex * 9 + 2]; @@ -247,18 +233,6 @@ namespace } else if (areAllCellsAffineOrCartesianFlag == 2) { - const double gradShapeXIRef = - shapeFunctionGradientValues[numDofsPerCell * q + - cellDofIndexI]; - const double gradShapeYIRef = - shapeFunctionGradientValues[numDofsPerCell * numQuadPoints + - numDofsPerCell * q + - cellDofIndexI]; - const double gradShapeZIRef = - shapeFunctionGradientValues[numDofsPerCell * numQuadPoints * - 2 + - numDofsPerCell * q + - cellDofIndexI]; const double Jxx = inverseJacobianValues[cellIndex * 3 + 0]; const double Jyy = inverseJacobianValues[cellIndex * 3 + 1]; const double Jzz = inverseJacobianValues[cellIndex * 3 + 2]; @@ -353,28 +327,24 @@ namespace double gradShapeXI, gradShapeXJ, gradShapeYI, gradShapeYJ, gradShapeZI, gradShapeZJ; + const double gradShapeXIRef = + shapeFunctionGradientValues[numDofsPerCell * q + cellDofIndexI]; + const double gradShapeYIRef = + shapeFunctionGradientValues[numDofsPerCell * numQuadPoints + + numDofsPerCell * q + cellDofIndexI]; + const double gradShapeZIRef = + shapeFunctionGradientValues[numDofsPerCell * numQuadPoints * 2 + + numDofsPerCell * q + cellDofIndexI]; + const double gradShapeXJRef = + shapeFunctionGradientValues[numDofsPerCell * q + cellDofIndexJ]; + const double gradShapeYJRef = + shapeFunctionGradientValues[numDofsPerCell * numQuadPoints + + numDofsPerCell * q + cellDofIndexJ]; + const double gradShapeZJRef = + shapeFunctionGradientValues[numDofsPerCell * numQuadPoints * 2 + + numDofsPerCell * q + cellDofIndexJ]; if (areAllCellsAffineOrCartesianFlag == 0) { - const double gradShapeXIRef = - shapeFunctionGradientValues[numDofsPerCell * q * 3 + - cellDofIndexI]; - const double gradShapeYIRef = - shapeFunctionGradientValues[numDofsPerCell * q * 3 + - numDofsPerCell + cellDofIndexI]; - const double gradShapeZIRef = - 
shapeFunctionGradientValues[numDofsPerCell * q * 3 + - 2 * numDofsPerCell + - cellDofIndexI]; - const double gradShapeXJRef = - shapeFunctionGradientValues[numDofsPerCell * q * 3 + - cellDofIndexJ]; - const double gradShapeYJRef = - shapeFunctionGradientValues[numDofsPerCell * q * 3 + - numDofsPerCell + cellDofIndexJ]; - const double gradShapeZJRef = - shapeFunctionGradientValues[numDofsPerCell * q * 3 + - 2 * numDofsPerCell + - cellDofIndexJ]; const double Jxx = inverseJacobianValues[cellIndex * numQuadPoints * 9 + q * 9 + 0]; @@ -418,26 +388,6 @@ namespace } else if (areAllCellsAffineOrCartesianFlag == 1) { - const double gradShapeXIRef = - shapeFunctionGradientValues[numDofsPerCell * q * 3 + - cellDofIndexI]; - const double gradShapeYIRef = - shapeFunctionGradientValues[numDofsPerCell * q * 3 + - numDofsPerCell + cellDofIndexI]; - const double gradShapeZIRef = - shapeFunctionGradientValues[numDofsPerCell * q * 3 + - 2 * numDofsPerCell + - cellDofIndexI]; - const double gradShapeXJRef = - shapeFunctionGradientValues[numDofsPerCell * q * 3 + - cellDofIndexJ]; - const double gradShapeYJRef = - shapeFunctionGradientValues[numDofsPerCell * q * 3 + - numDofsPerCell + cellDofIndexJ]; - const double gradShapeZJRef = - shapeFunctionGradientValues[numDofsPerCell * q * 3 + - 2 * numDofsPerCell + - cellDofIndexJ]; const double Jxx = inverseJacobianValues[cellIndex * 9 + 0]; const double Jxy = inverseJacobianValues[cellIndex * 9 + 1]; const double Jxz = inverseJacobianValues[cellIndex * 9 + 2]; @@ -463,30 +413,6 @@ namespace } else if (areAllCellsAffineOrCartesianFlag == 2) { - const double gradShapeXIRef = - shapeFunctionGradientValues[numDofsPerCell * q + - cellDofIndexI]; - const double gradShapeYIRef = - shapeFunctionGradientValues[numDofsPerCell * numQuadPoints + - numDofsPerCell * q + - cellDofIndexI]; - const double gradShapeZIRef = - shapeFunctionGradientValues[numDofsPerCell * numQuadPoints * - 2 + - numDofsPerCell * q + - cellDofIndexI]; - const double 
gradShapeXJRef = - shapeFunctionGradientValues[numDofsPerCell * q + - cellDofIndexJ]; - const double gradShapeYJRef = - shapeFunctionGradientValues[numDofsPerCell * numQuadPoints + - numDofsPerCell * q + - cellDofIndexJ]; - const double gradShapeZJRef = - shapeFunctionGradientValues[numDofsPerCell * numQuadPoints * - 2 + - numDofsPerCell * q + - cellDofIndexJ]; const double Jxx = inverseJacobianValues[cellIndex * 3 + 0]; const double Jyy = inverseJacobianValues[cellIndex * 3 + 1]; const double Jzz = inverseJacobianValues[cellIndex * 3 + 2]; @@ -581,28 +507,24 @@ namespace double gradShapeXI, gradShapeXJ, gradShapeYI, gradShapeYJ, gradShapeZI, gradShapeZJ; + const double gradShapeXIRef = + shapeFunctionGradientValues[numDofsPerCell * q + cellDofIndexI]; + const double gradShapeYIRef = + shapeFunctionGradientValues[numDofsPerCell * numQuadPoints + + numDofsPerCell * q + cellDofIndexI]; + const double gradShapeZIRef = + shapeFunctionGradientValues[numDofsPerCell * numQuadPoints * 2 + + numDofsPerCell * q + cellDofIndexI]; + const double gradShapeXJRef = + shapeFunctionGradientValues[numDofsPerCell * q + cellDofIndexJ]; + const double gradShapeYJRef = + shapeFunctionGradientValues[numDofsPerCell * numQuadPoints + + numDofsPerCell * q + cellDofIndexJ]; + const double gradShapeZJRef = + shapeFunctionGradientValues[numDofsPerCell * numQuadPoints * 2 + + numDofsPerCell * q + cellDofIndexJ]; if (areAllCellsAffineOrCartesianFlag == 0) { - const double gradShapeXIRef = - shapeFunctionGradientValues[numDofsPerCell * q * 3 + - cellDofIndexI]; - const double gradShapeYIRef = - shapeFunctionGradientValues[numDofsPerCell * q * 3 + - numDofsPerCell + cellDofIndexI]; - const double gradShapeZIRef = - shapeFunctionGradientValues[numDofsPerCell * q * 3 + - 2 * numDofsPerCell + - cellDofIndexI]; - const double gradShapeXJRef = - shapeFunctionGradientValues[numDofsPerCell * q * 3 + - cellDofIndexJ]; - const double gradShapeYJRef = - shapeFunctionGradientValues[numDofsPerCell * q * 3 + - 
numDofsPerCell + cellDofIndexJ]; - const double gradShapeZJRef = - shapeFunctionGradientValues[numDofsPerCell * q * 3 + - 2 * numDofsPerCell + - cellDofIndexJ]; const double Jxx = inverseJacobianValues[cellIndex * numQuadPoints * 9 + q * 9 + 0]; @@ -646,30 +568,6 @@ namespace } else if (areAllCellsAffineOrCartesianFlag == 1) { - const double gradShapeXIRef = - shapeFunctionGradientValues[numDofsPerCell * q + - cellDofIndexI]; - const double gradShapeYIRef = - shapeFunctionGradientValues[numDofsPerCell * numQuadPoints + - numDofsPerCell * q + - cellDofIndexI]; - const double gradShapeZIRef = - shapeFunctionGradientValues[numDofsPerCell * numQuadPoints * - 2 + - numDofsPerCell * q + - cellDofIndexI]; - const double gradShapeXJRef = - shapeFunctionGradientValues[numDofsPerCell * q + - cellDofIndexJ]; - const double gradShapeYJRef = - shapeFunctionGradientValues[numDofsPerCell * numQuadPoints + - numDofsPerCell * q + - cellDofIndexJ]; - const double gradShapeZJRef = - shapeFunctionGradientValues[numDofsPerCell * numQuadPoints * - 2 + - numDofsPerCell * q + - cellDofIndexJ]; const double Jxx = inverseJacobianValues[cellIndex * 9 + 0]; const double Jxy = inverseJacobianValues[cellIndex * 9 + 1]; const double Jxz = inverseJacobianValues[cellIndex * 9 + 2]; @@ -695,30 +593,6 @@ namespace } else if (areAllCellsAffineOrCartesianFlag == 2) { - const double gradShapeXIRef = - shapeFunctionGradientValues[numDofsPerCell * q + - cellDofIndexI]; - const double gradShapeYIRef = - shapeFunctionGradientValues[numDofsPerCell * numQuadPoints + - numDofsPerCell * q + - cellDofIndexI]; - const double gradShapeZIRef = - shapeFunctionGradientValues[numDofsPerCell * numQuadPoints * - 2 + - numDofsPerCell * q + - cellDofIndexI]; - const double gradShapeXJRef = - shapeFunctionGradientValues[numDofsPerCell * q + - cellDofIndexJ]; - const double gradShapeYJRef = - shapeFunctionGradientValues[numDofsPerCell * numQuadPoints + - numDofsPerCell * q + - cellDofIndexJ]; - const double gradShapeZJRef 
= - shapeFunctionGradientValues[numDofsPerCell * numQuadPoints * - 2 + - numDofsPerCell * q + - cellDofIndexJ]; const double Jxx = inverseJacobianValues[cellIndex * 3 + 0]; const double Jyy = inverseJacobianValues[cellIndex * 3 + 1]; const double Jzz = inverseJacobianValues[cellIndex * 3 + 2]; @@ -910,28 +784,24 @@ namespace double gradShapeXI, gradShapeXJ, gradShapeYI, gradShapeYJ, gradShapeZI, gradShapeZJ; + const double gradShapeXIRef = + shapeFunctionGradientValues[numDofsPerCell * q + cellDofIndexI]; + const double gradShapeYIRef = + shapeFunctionGradientValues[numDofsPerCell * numQuadPoints + + numDofsPerCell * q + cellDofIndexI]; + const double gradShapeZIRef = + shapeFunctionGradientValues[numDofsPerCell * numQuadPoints * 2 + + numDofsPerCell * q + cellDofIndexI]; + const double gradShapeXJRef = + shapeFunctionGradientValues[numDofsPerCell * q + cellDofIndexJ]; + const double gradShapeYJRef = + shapeFunctionGradientValues[numDofsPerCell * numQuadPoints + + numDofsPerCell * q + cellDofIndexJ]; + const double gradShapeZJRef = + shapeFunctionGradientValues[numDofsPerCell * numQuadPoints * 2 + + numDofsPerCell * q + cellDofIndexJ]; if (areAllCellsAffineOrCartesianFlag == 0) { - const double gradShapeXIRef = - shapeFunctionGradientValues[numDofsPerCell * q * 3 + - cellDofIndexI]; - const double gradShapeYIRef = - shapeFunctionGradientValues[numDofsPerCell * q * 3 + - numDofsPerCell + cellDofIndexI]; - const double gradShapeZIRef = - shapeFunctionGradientValues[numDofsPerCell * q * 3 + - 2 * numDofsPerCell + - cellDofIndexI]; - const double gradShapeXJRef = - shapeFunctionGradientValues[numDofsPerCell * q * 3 + - cellDofIndexJ]; - const double gradShapeYJRef = - shapeFunctionGradientValues[numDofsPerCell * q * 3 + - numDofsPerCell + cellDofIndexJ]; - const double gradShapeZJRef = - shapeFunctionGradientValues[numDofsPerCell * q * 3 + - 2 * numDofsPerCell + - cellDofIndexJ]; const double Jxx = inverseJacobianValues[cellIndex * numQuadPoints * 9 + q * 9 + 0]; @@ 
-975,30 +845,6 @@ namespace } else if (areAllCellsAffineOrCartesianFlag == 1) { - const double gradShapeXIRef = - shapeFunctionGradientValues[numDofsPerCell * q + - cellDofIndexI]; - const double gradShapeYIRef = - shapeFunctionGradientValues[numDofsPerCell * numQuadPoints + - numDofsPerCell * q + - cellDofIndexI]; - const double gradShapeZIRef = - shapeFunctionGradientValues[numDofsPerCell * numQuadPoints * - 2 + - numDofsPerCell * q + - cellDofIndexI]; - const double gradShapeXJRef = - shapeFunctionGradientValues[numDofsPerCell * q + - cellDofIndexJ]; - const double gradShapeYJRef = - shapeFunctionGradientValues[numDofsPerCell * numQuadPoints + - numDofsPerCell * q + - cellDofIndexJ]; - const double gradShapeZJRef = - shapeFunctionGradientValues[numDofsPerCell * numQuadPoints * - 2 + - numDofsPerCell * q + - cellDofIndexJ]; const double Jxx = inverseJacobianValues[cellIndex * 9 + 0]; const double Jxy = inverseJacobianValues[cellIndex * 9 + 1]; const double Jxz = inverseJacobianValues[cellIndex * 9 + 2]; @@ -1024,30 +870,6 @@ namespace } else if (areAllCellsAffineOrCartesianFlag == 2) { - const double gradShapeXIRef = - shapeFunctionGradientValues[numDofsPerCell * q + - cellDofIndexI]; - const double gradShapeYIRef = - shapeFunctionGradientValues[numDofsPerCell * numQuadPoints + - numDofsPerCell * q + - cellDofIndexI]; - const double gradShapeZIRef = - shapeFunctionGradientValues[numDofsPerCell * numQuadPoints * - 2 + - numDofsPerCell * q + - cellDofIndexI]; - const double gradShapeXJRef = - shapeFunctionGradientValues[numDofsPerCell * q + - cellDofIndexJ]; - const double gradShapeYJRef = - shapeFunctionGradientValues[numDofsPerCell * numQuadPoints + - numDofsPerCell * q + - cellDofIndexJ]; - const double gradShapeZJRef = - shapeFunctionGradientValues[numDofsPerCell * numQuadPoints * - 2 + - numDofsPerCell * q + - cellDofIndexJ]; const double Jxx = inverseJacobianValues[cellIndex * 3 + 0]; const double Jyy = inverseJacobianValues[cellIndex * 3 + 1]; const double 
Jzz = inverseJacobianValues[cellIndex * 3 + 2]; @@ -1124,28 +946,24 @@ namespace double gradShapeXI, gradShapeXJ, gradShapeYI, gradShapeYJ, gradShapeZI, gradShapeZJ; + const double gradShapeXIRef = + shapeFunctionGradientValues[numDofsPerCell * q + cellDofIndexI]; + const double gradShapeYIRef = + shapeFunctionGradientValues[numDofsPerCell * numQuadPoints + + numDofsPerCell * q + cellDofIndexI]; + const double gradShapeZIRef = + shapeFunctionGradientValues[numDofsPerCell * numQuadPoints * 2 + + numDofsPerCell * q + cellDofIndexI]; + const double gradShapeXJRef = + shapeFunctionGradientValues[numDofsPerCell * q + cellDofIndexJ]; + const double gradShapeYJRef = + shapeFunctionGradientValues[numDofsPerCell * numQuadPoints + + numDofsPerCell * q + cellDofIndexJ]; + const double gradShapeZJRef = + shapeFunctionGradientValues[numDofsPerCell * numQuadPoints * 2 + + numDofsPerCell * q + cellDofIndexJ]; if (areAllCellsAffineOrCartesianFlag == 0) { - const double gradShapeXIRef = - shapeFunctionGradientValues[numDofsPerCell * q * 3 + - cellDofIndexI]; - const double gradShapeYIRef = - shapeFunctionGradientValues[numDofsPerCell * q * 3 + - numDofsPerCell + cellDofIndexI]; - const double gradShapeZIRef = - shapeFunctionGradientValues[numDofsPerCell * q * 3 + - 2 * numDofsPerCell + - cellDofIndexI]; - const double gradShapeXJRef = - shapeFunctionGradientValues[numDofsPerCell * q * 3 + - cellDofIndexJ]; - const double gradShapeYJRef = - shapeFunctionGradientValues[numDofsPerCell * q * 3 + - numDofsPerCell + cellDofIndexJ]; - const double gradShapeZJRef = - shapeFunctionGradientValues[numDofsPerCell * q * 3 + - 2 * numDofsPerCell + - cellDofIndexJ]; const double Jxx = inverseJacobianValues[cellIndex * numQuadPoints * 9 + q * 9 + 0]; @@ -1189,30 +1007,6 @@ namespace } else if (areAllCellsAffineOrCartesianFlag == 1) { - const double gradShapeXIRef = - shapeFunctionGradientValues[numDofsPerCell * q + - cellDofIndexI]; - const double gradShapeYIRef = - 
shapeFunctionGradientValues[numDofsPerCell * numQuadPoints + - numDofsPerCell * q + - cellDofIndexI]; - const double gradShapeZIRef = - shapeFunctionGradientValues[numDofsPerCell * numQuadPoints * - 2 + - numDofsPerCell * q + - cellDofIndexI]; - const double gradShapeXJRef = - shapeFunctionGradientValues[numDofsPerCell * q + - cellDofIndexJ]; - const double gradShapeYJRef = - shapeFunctionGradientValues[numDofsPerCell * numQuadPoints + - numDofsPerCell * q + - cellDofIndexJ]; - const double gradShapeZJRef = - shapeFunctionGradientValues[numDofsPerCell * numQuadPoints * - 2 + - numDofsPerCell * q + - cellDofIndexJ]; const double Jxx = inverseJacobianValues[cellIndex * 9 + 0]; const double Jxy = inverseJacobianValues[cellIndex * 9 + 1]; const double Jxz = inverseJacobianValues[cellIndex * 9 + 2]; @@ -1238,30 +1032,6 @@ namespace } else if (areAllCellsAffineOrCartesianFlag == 2) { - const double gradShapeXIRef = - shapeFunctionGradientValues[numDofsPerCell * q + - cellDofIndexI]; - const double gradShapeYIRef = - shapeFunctionGradientValues[numDofsPerCell * numQuadPoints + - numDofsPerCell * q + - cellDofIndexI]; - const double gradShapeZIRef = - shapeFunctionGradientValues[numDofsPerCell * numQuadPoints * - 2 + - numDofsPerCell * q + - cellDofIndexI]; - const double gradShapeXJRef = - shapeFunctionGradientValues[numDofsPerCell * q + - cellDofIndexJ]; - const double gradShapeYJRef = - shapeFunctionGradientValues[numDofsPerCell * numQuadPoints + - numDofsPerCell * q + - cellDofIndexJ]; - const double gradShapeZJRef = - shapeFunctionGradientValues[numDofsPerCell * numQuadPoints * - 2 + - numDofsPerCell * q + - cellDofIndexJ]; const double Jxx = inverseJacobianValues[cellIndex * 3 + 0]; const double Jyy = inverseJacobianValues[cellIndex * 3 + 1]; const double Jzz = inverseJacobianValues[cellIndex * 3 + 2]; @@ -1327,17 +1097,18 @@ kohnShamDFTOperatorDeviceClass:: !d_isStiffnessMatrixExternalPotCorrComputed && !onlyHPrimePartForFirstOrderDensityMatResponse) { + 
basisOperationsPtrDevice->reinit(0, 0, dftPtr->d_lpspQuadratureId); #ifdef DFTFE_WITH_DEVICE_LANG_CUDA hamMatrixExtPotCorr<<<(d_numLocallyOwnedCells * d_numberNodesPerElement * d_numberNodesPerElement + (dftfe::utils::DEVICE_BLOCK_SIZE - 1)) / dftfe::utils::DEVICE_BLOCK_SIZE, dftfe::utils::DEVICE_BLOCK_SIZE>>>( - d_numLocallyOwnedCells, - d_numberNodesPerElement, - d_numQuadPointsLpsp, - d_shapeFunctionValueLpspDevice.begin(), - d_shapeFunctionValueTransposedLpspDevice.begin(), + basisOperationsPtrDevice->nCells(), + basisOperationsPtrDevice->nDofsPerCell(), + basisOperationsPtrDevice->nQuadsPerCell(), + basisOperationsPtrDevice->shapeFunctionData(true), + basisOperationsPtrDevice->shapeFunctionData(false), d_vEffExternalPotCorrJxWDevice.begin(), d_cellHamiltonianMatrixExternalPotCorrFlattenedDevice.begin()); #elif DFTFE_WITH_DEVICE_LANG_HIP @@ -1350,18 +1121,18 @@ kohnShamDFTOperatorDeviceClass:: dftfe::utils::DEVICE_BLOCK_SIZE, 0, 0, - d_numLocallyOwnedCells, - d_numberNodesPerElement, - d_numQuadPointsLpsp, - d_shapeFunctionValueLpspDevice.begin(), - d_shapeFunctionValueTransposedLpspDevice.begin(), + basisOperationsPtrDevice->nCells(), + basisOperationsPtrDevice->nDofsPerCell(), + basisOperationsPtrDevice->nQuadsPerCell(), + basisOperationsPtrDevice->shapeFunctionData(true), + basisOperationsPtrDevice->shapeFunctionData(false), d_vEffExternalPotCorrJxWDevice.begin(), d_cellHamiltonianMatrixExternalPotCorrFlattenedDevice.begin()); #endif d_isStiffnessMatrixExternalPotCorrComputed = true; } - + basisOperationsPtrDevice->reinit(0, 0, dftPtr->d_densityQuadratureId); if (onlyHPrimePartForFirstOrderDensityMatResponse) { if (dftPtr->d_excManagerPtr->getDensityBasedFamilyType() == @@ -1373,15 +1144,14 @@ kohnShamDFTOperatorDeviceClass:: (dftfe::utils::DEVICE_BLOCK_SIZE - 1)) / dftfe::utils::DEVICE_BLOCK_SIZE, dftfe::utils::DEVICE_BLOCK_SIZE>>>( - d_numLocallyOwnedCells, - d_numberNodesPerElement, - d_numQuadPoints, - d_shapeFunctionValueDevice.begin(), - 
d_shapeFunctionValueTransposedDevice.begin(), - basisOperationsPtrDevice->d_shapeFunctionGradientData.begin(), - basisOperationsPtrDevice->d_inverseJacobianData.begin(), - (int)basisOperationsPtrDevice->areAllCellsAffine + - (int)basisOperationsPtrDevice->areAllCellsCartesian, + basisOperationsPtrDevice->nCells(), + basisOperationsPtrDevice->nDofsPerCell(), + basisOperationsPtrDevice->nQuadsPerCell(), + basisOperationsPtrDevice->shapeFunctionData(true), + basisOperationsPtrDevice->shapeFunctionData(false), + basisOperationsPtrDevice->shapeFunctionGradientData(), + basisOperationsPtrDevice->inverseJacobians(), + basisOperationsPtrDevice->cellsTypeFlag(), d_vEffJxWDevice.begin(), d_cellJxWValuesDevice.begin(), d_derExcWithSigmaTimesGradRhoJxWDevice.begin(), @@ -1399,15 +1169,14 @@ kohnShamDFTOperatorDeviceClass:: dftfe::utils::DEVICE_BLOCK_SIZE, 0, 0, - d_numLocallyOwnedCells, - d_numberNodesPerElement, - d_numQuadPoints, - d_shapeFunctionValueDevice.begin(), - d_shapeFunctionValueTransposedDevice.begin(), - basisOperationsPtrDevice->d_shapeFunctionGradientData.begin(), - basisOperationsPtrDevice->d_inverseJacobianData.begin(), - (int)basisOperationsPtrDevice->areAllCellsAffine + - (int)basisOperationsPtrDevice->areAllCellsCartesian, + basisOperationsPtrDevice->nCells(), + basisOperationsPtrDevice->nDofsPerCell(), + basisOperationsPtrDevice->nQuadsPerCell(), + basisOperationsPtrDevice->shapeFunctionData(true), + basisOperationsPtrDevice->shapeFunctionData(false), + basisOperationsPtrDevice->shapeFunctionGradientData(), + basisOperationsPtrDevice->inverseJacobians(), + basisOperationsPtrDevice->cellsTypeFlag(), d_vEffJxWDevice.begin(), d_cellJxWValuesDevice.begin(), d_derExcWithSigmaTimesGradRhoJxWDevice.begin(), @@ -1425,15 +1194,14 @@ kohnShamDFTOperatorDeviceClass:: (dftfe::utils::DEVICE_BLOCK_SIZE - 1)) / dftfe::utils::DEVICE_BLOCK_SIZE, dftfe::utils::DEVICE_BLOCK_SIZE>>>( - d_numLocallyOwnedCells, - d_numberNodesPerElement, - d_numQuadPoints, - 
d_shapeFunctionValueDevice.begin(), - d_shapeFunctionValueTransposedDevice.begin(), - basisOperationsPtrDevice->d_shapeFunctionGradientData.begin(), - basisOperationsPtrDevice->d_inverseJacobianData.begin(), - (int)basisOperationsPtrDevice->areAllCellsAffine + - (int)basisOperationsPtrDevice->areAllCellsCartesian, + basisOperationsPtrDevice->nCells(), + basisOperationsPtrDevice->nDofsPerCell(), + basisOperationsPtrDevice->nQuadsPerCell(), + basisOperationsPtrDevice->shapeFunctionData(true), + basisOperationsPtrDevice->shapeFunctionData(false), + basisOperationsPtrDevice->shapeFunctionGradientData(), + basisOperationsPtrDevice->inverseJacobians(), + basisOperationsPtrDevice->cellsTypeFlag(), d_vEffJxWDevice.begin(), d_cellJxWValuesDevice.begin(), dftfe::utils::makeDataTypeDeviceCompatible( @@ -1450,15 +1218,14 @@ kohnShamDFTOperatorDeviceClass:: dftfe::utils::DEVICE_BLOCK_SIZE, 0, 0, - d_numLocallyOwnedCells, - d_numberNodesPerElement, - d_numQuadPoints, - d_shapeFunctionValueDevice.begin(), - d_shapeFunctionValueTransposedDevice.begin(), - basisOperationsPtrDevice->d_shapeFunctionGradientData.begin(), - basisOperationsPtrDevice->d_inverseJacobianData.begin(), - (int)basisOperationsPtrDevice->areAllCellsAffine + - (int)basisOperationsPtrDevice->areAllCellsCartesian, + basisOperationsPtrDevice->nCells(), + basisOperationsPtrDevice->nDofsPerCell(), + basisOperationsPtrDevice->nQuadsPerCell(), + basisOperationsPtrDevice->shapeFunctionData(true), + basisOperationsPtrDevice->shapeFunctionData(false), + basisOperationsPtrDevice->shapeFunctionGradientData(), + basisOperationsPtrDevice->inverseJacobians(), + basisOperationsPtrDevice->cellsTypeFlag(), d_vEffJxWDevice.begin(), d_cellJxWValuesDevice.begin(), dftfe::utils::makeDataTypeDeviceCompatible( @@ -1478,18 +1245,17 @@ kohnShamDFTOperatorDeviceClass:: (dftfe::utils::DEVICE_BLOCK_SIZE - 1)) / dftfe::utils::DEVICE_BLOCK_SIZE, dftfe::utils::DEVICE_BLOCK_SIZE>>>( - d_numLocallyOwnedCells, - d_numberNodesPerElement, - 
d_numQuadPoints, + basisOperationsPtrDevice->nCells(), + basisOperationsPtrDevice->nDofsPerCell(), + basisOperationsPtrDevice->nQuadsPerCell(), spinIndex, (1 + dftPtr->d_dftParamsPtr->spinPolarized), dftPtr->d_kPointWeights.size(), - d_shapeFunctionValueDevice.begin(), - d_shapeFunctionValueTransposedDevice.begin(), - basisOperationsPtrDevice->d_shapeFunctionGradientData.begin(), - basisOperationsPtrDevice->d_inverseJacobianData.begin(), - (int)basisOperationsPtrDevice->areAllCellsAffine + - (int)basisOperationsPtrDevice->areAllCellsCartesian, + basisOperationsPtrDevice->shapeFunctionData(true), + basisOperationsPtrDevice->shapeFunctionData(false), + basisOperationsPtrDevice->shapeFunctionGradientData(), + basisOperationsPtrDevice->inverseJacobians(), + basisOperationsPtrDevice->cellsTypeFlag(), d_cellShapeFunctionGradientIntegralFlattenedDevice.begin(), d_vEffJxWDevice.begin(), d_cellJxWValuesDevice.begin(), @@ -1511,18 +1277,17 @@ kohnShamDFTOperatorDeviceClass:: dftfe::utils::DEVICE_BLOCK_SIZE, 0, 0, - d_numLocallyOwnedCells, - d_numberNodesPerElement, - d_numQuadPoints, + basisOperationsPtrDevice->nCells(), + basisOperationsPtrDevice->nDofsPerCell(), + basisOperationsPtrDevice->nQuadsPerCell(), spinIndex, (1 + dftPtr->d_dftParamsPtr->spinPolarized), dftPtr->d_kPointWeights.size(), - d_shapeFunctionValueDevice.begin(), - d_shapeFunctionValueTransposedDevice.begin(), - basisOperationsPtrDevice->d_shapeFunctionGradientData.begin(), - basisOperationsPtrDevice->d_inverseJacobianData.begin(), - (int)basisOperationsPtrDevice->areAllCellsAffine + - (int)basisOperationsPtrDevice->areAllCellsCartesian, + basisOperationsPtrDevice->shapeFunctionData(true), + basisOperationsPtrDevice->shapeFunctionData(false), + basisOperationsPtrDevice->shapeFunctionGradientData(), + basisOperationsPtrDevice->inverseJacobians(), + basisOperationsPtrDevice->cellsTypeFlag(), d_cellShapeFunctionGradientIntegralFlattenedDevice.begin(), d_vEffJxWDevice.begin(), d_cellJxWValuesDevice.begin(), @@ 
-1543,18 +1308,17 @@ kohnShamDFTOperatorDeviceClass:: (dftfe::utils::DEVICE_BLOCK_SIZE - 1)) / dftfe::utils::DEVICE_BLOCK_SIZE, dftfe::utils::DEVICE_BLOCK_SIZE>>>( - d_numLocallyOwnedCells, - d_numberNodesPerElement, - d_numQuadPoints, + basisOperationsPtrDevice->nCells(), + basisOperationsPtrDevice->nDofsPerCell(), + basisOperationsPtrDevice->nQuadsPerCell(), spinIndex, (1 + dftPtr->d_dftParamsPtr->spinPolarized), dftPtr->d_kPointWeights.size(), - d_shapeFunctionValueDevice.begin(), - d_shapeFunctionValueTransposedDevice.begin(), - basisOperationsPtrDevice->d_shapeFunctionGradientData.begin(), - basisOperationsPtrDevice->d_inverseJacobianData.begin(), - (int)basisOperationsPtrDevice->areAllCellsAffine + - (int)basisOperationsPtrDevice->areAllCellsCartesian, + basisOperationsPtrDevice->shapeFunctionData(true), + basisOperationsPtrDevice->shapeFunctionData(false), + basisOperationsPtrDevice->shapeFunctionGradientData(), + basisOperationsPtrDevice->inverseJacobians(), + basisOperationsPtrDevice->cellsTypeFlag(), d_cellShapeFunctionGradientIntegralFlattenedDevice.begin(), d_vEffJxWDevice.begin(), d_cellJxWValuesDevice.begin(), @@ -1575,18 +1339,17 @@ kohnShamDFTOperatorDeviceClass:: dftfe::utils::DEVICE_BLOCK_SIZE, 0, 0, - d_numLocallyOwnedCells, - d_numberNodesPerElement, - d_numQuadPoints, + basisOperationsPtrDevice->nCells(), + basisOperationsPtrDevice->nDofsPerCell(), + basisOperationsPtrDevice->nQuadsPerCell(), spinIndex, (1 + dftPtr->d_dftParamsPtr->spinPolarized), dftPtr->d_kPointWeights.size(), - d_shapeFunctionValueDevice.begin(), - d_shapeFunctionValueTransposedDevice.begin(), - basisOperationsPtrDevice->d_shapeFunctionGradientData.begin(), - basisOperationsPtrDevice->d_inverseJacobianData.begin(), - (int)basisOperationsPtrDevice->areAllCellsAffine + - (int)basisOperationsPtrDevice->areAllCellsCartesian, + basisOperationsPtrDevice->shapeFunctionData(true), + basisOperationsPtrDevice->shapeFunctionData(false), + 
basisOperationsPtrDevice->shapeFunctionGradientData(), + basisOperationsPtrDevice->inverseJacobians(), + basisOperationsPtrDevice->cellsTypeFlag(), d_cellShapeFunctionGradientIntegralFlattenedDevice.begin(), d_vEffJxWDevice.begin(), d_cellJxWValuesDevice.begin(), diff --git a/src/dftOperator/kohnShamDFTOperator.cc b/src/dftOperator/kohnShamDFTOperator.cc index 7cfc3d102..fed3b321f 100644 --- a/src/dftOperator/kohnShamDFTOperator.cc +++ b/src/dftOperator/kohnShamDFTOperator.cc @@ -170,11 +170,6 @@ namespace dftfe d_normalCellIdToMacroCellIdMap, d_macroCellIdToNormalCellIdMap, d_FullflattenedArrayCellLocalProcIndexIdMap); - - getOverloadedConstraintMatrix()->precomputeMaps( - dftPtr->matrix_free_data.get_vector_partitioner(), - flattenedArray.get_partitioner(), - numberWaveFunctions); } template @@ -217,9 +212,6 @@ namespace dftfe d_normalCellIdToMacroCellIdMap, d_macroCellIdToNormalCellIdMap, d_FullflattenedArrayCellLocalProcIndexIdMap); - - getOverloadedConstraintMatrix()->precomputeMaps( - flattenedArray.getMPIPatternP2P(), numberWaveFunctions); } template diff --git a/src/dftOperator/kohnShamDFTOperatorDevice.cc b/src/dftOperator/kohnShamDFTOperatorDevice.cc index 7ec359a43..1c4fb99b4 100644 --- a/src/dftOperator/kohnShamDFTOperatorDevice.cc +++ b/src/dftOperator/kohnShamDFTOperatorDevice.cc @@ -499,14 +499,22 @@ namespace dftfe dftfe::basis:: FEBasisOperations>( dftPtr->matrix_free_data, dftPtr->d_constraintsVector); - basisOperationsPtrHOST = std::make_unique< + basisOperationsPtrHost = std::make_unique< dftfe::basis:: FEBasisOperations>( dftPtr->matrix_free_data, dftPtr->d_constraintsVector); - dftfe::basis::UpdateFlags updateFlags = - dftfe::basis::update_values | dftfe::basis::update_gradients; - basisOperationsPtrDevice->reinit(0, 0, 0, 0, updateFlags); - basisOperationsPtrHOST->reinit(0, 0, 0, 0, updateFlags); + dftfe::basis::UpdateFlags updateFlags = dftfe::basis::update_values | + dftfe::basis::update_gradients | + dftfe::basis::update_transpose; + 
std::vector quadratureIndices(4, 0); + for (auto i = 0; i < 4; ++i) + quadratureIndices[i] = i; + basisOperationsPtrHost->init(dftPtr->d_densityDofHandlerIndex, + quadratureIndices, + updateFlags); + basisOperationsPtrDevice->init(dftPtr->d_densityDofHandlerIndex, + quadratureIndices, + updateFlags); dftPtr->matrix_free_data.initialize_dof_vector( d_invSqrtMassVector, dftPtr->d_densityDofHandlerIndex); @@ -654,16 +662,6 @@ namespace dftfe d_flattenedArrayCellLocalProcIndexIdMap); - - getOverloadedConstraintMatrix()->precomputeMaps( - flattenedArray.getMPIPatternP2P(), numberWaveFunctions); - - getOverloadedConstraintMatrixHost()->precomputeMaps( - dftPtr->matrix_free_data.get_vector_partitioner(), - dftPtr->matrix_free_data.get_vector_partitioner(), - 1); - - const unsigned int totalLocallyOwnedCells = dftPtr->matrix_free_data.n_physical_cells(); diff --git a/src/dftOperator/shapeFunctionDataCalculatorDevice.cc b/src/dftOperator/shapeFunctionDataCalculatorDevice.cc index ffdae8006..c21491078 100644 --- a/src/dftOperator/shapeFunctionDataCalculatorDevice.cc +++ b/src/dftOperator/shapeFunctionDataCalculatorDevice.cc @@ -368,8 +368,6 @@ kohnShamDFTOperatorDeviceClass:: // // resize data members // - // d_cellShapeFunctionGradientIntegralFlattened.clear(); - // d_cellShapeFunctionGradientIntegralFlattened.resize(numberPhysicalCells*numberDofsPerElement*numberDofsPerElement); d_cellJxWValues.clear(); d_cellJxWValues.resize(numberPhysicalCells * numberQuadraturePoints); @@ -380,36 +378,6 @@ kohnShamDFTOperatorDeviceClass:: numberDofsPerElement, 0.0); - // d_shapeFunctionGradientValueX.resize(numberPhysicalCells * - // numberQuadraturePoints * - // numberDofsPerElement, - // 0.0); - // d_shapeFunctionGradientValueXTransposed.resize(numberPhysicalCells * - // numberQuadraturePoints - // * - // numberDofsPerElement, - // 0.0); - - // d_shapeFunctionGradientValueY.resize(numberPhysicalCells * - // numberQuadraturePoints * - // numberDofsPerElement, - // 0.0); - // 
d_shapeFunctionGradientValueYTransposed.resize(numberPhysicalCells * - // numberQuadraturePoints - // * - // numberDofsPerElement, - // 0.0); - - // d_shapeFunctionGradientValueZ.resize(numberPhysicalCells * - // numberQuadraturePoints * - // numberDofsPerElement, - // 0.0); - // d_shapeFunctionGradientValueZTransposed.resize(numberPhysicalCells * - // numberQuadraturePoints - // * - // numberDofsPerElement, - // 0.0); - std::vector shapeFunctionValueLpsp(numberQuadraturePointsLpsp * numberDofsPerElement, 0.0); @@ -438,46 +406,6 @@ kohnShamDFTOperatorDeviceClass:: d_cellJxWValues[iElem * numberQuadraturePoints + q_point] = fe_values.JxW(q_point); - // for (unsigned int iNode = 0; iNode < numberDofsPerElement; - // ++iNode) - // for (unsigned int q_point = 0; q_point < - // numberQuadraturePoints; - // ++q_point) - // { - // const dealii::Tensor<1, 3, double> &shape_grad = - // fe_values.shape_grad(iNode, q_point); - - // d_shapeFunctionGradientValueX[iElem * numberDofsPerElement - // * - // numberQuadraturePoints + - // iNode * - // numberQuadraturePoints + - // q_point] = shape_grad[0]; - // d_shapeFunctionGradientValueXTransposed - // [iElem * numberQuadraturePoints * numberDofsPerElement + - // q_point * numberDofsPerElement + iNode] = shape_grad[0]; - - // d_shapeFunctionGradientValueY[iElem * numberDofsPerElement - // * - // numberQuadraturePoints + - // iNode * - // numberQuadraturePoints + - // q_point] = shape_grad[1]; - // d_shapeFunctionGradientValueYTransposed - // [iElem * numberQuadraturePoints * numberDofsPerElement + - // q_point * numberDofsPerElement + iNode] = shape_grad[1]; - - // d_shapeFunctionGradientValueZ[iElem * numberDofsPerElement - // * - // numberQuadraturePoints + - // iNode * - // numberQuadraturePoints + - // q_point] = shape_grad[2]; - // d_shapeFunctionGradientValueZTransposed - // [iElem * numberQuadraturePoints * numberDofsPerElement + - // q_point * numberDofsPerElement + iNode] = shape_grad[2]; - // } - if (iElem == 0) { 
fe_values_lpsp.reinit(cellPtr); @@ -523,21 +451,6 @@ kohnShamDFTOperatorDeviceClass:: d_shapeFunctionValueTransposedDevice.copyFrom( d_shapeFunctionValueTransposed); - // d_shapeFunctionGradientValueXTransposedDevice.resize( - // d_shapeFunctionGradientValueXTransposed.size()); - // d_shapeFunctionGradientValueXTransposedDevice.copyFrom( - // d_shapeFunctionGradientValueXTransposed); - - // d_shapeFunctionGradientValueYTransposedDevice.resize( - // d_shapeFunctionGradientValueYTransposed.size()); - // d_shapeFunctionGradientValueYTransposedDevice.copyFrom( - // d_shapeFunctionGradientValueYTransposed); - - // d_shapeFunctionGradientValueZTransposedDevice.resize( - // d_shapeFunctionGradientValueZTransposed.size()); - // d_shapeFunctionGradientValueZTransposedDevice.copyFrom( - // d_shapeFunctionGradientValueZTransposed); - d_shapeFunctionValueLpspDevice.resize(shapeFunctionValueLpsp.size()); d_shapeFunctionValueLpspDevice.copyFrom(shapeFunctionValueLpsp); diff --git a/src/force/forceWfcContractionsDevice.cc b/src/force/forceWfcContractionsDevice.cc index af840f6da..46977639d 100644 --- a/src/force/forceWfcContractionsDevice.cc +++ b/src/force/forceWfcContractionsDevice.cc @@ -473,7 +473,7 @@ namespace dftfe &cellWaveFunctionMatrix = operatorMatrix.getCellWaveFunctionMatrix(); dftfe::basis::UpdateFlags updateFlags = dftfe::basis::update_values | dftfe::basis::update_gradients; - basisOperationsPtr->reinit(BVec, cellsBlockSize, 0, 0, updateFlags); + basisOperationsPtr->reinit(BVec, cellsBlockSize, 0); // dftfe::utils::deviceKernelsGeneric::stridedCopyToBlock( // BVec, @@ -1055,7 +1055,7 @@ namespace dftfe dftfe::utils::deviceKernelsGeneric::stridedCopyToBlockConstantStride( numPsi, N, - basisOperationsPtr->d_locallyOwnedSize, + basisOperationsPtr->nOwnedDofs(), startingVecId, X, deviceFlattenedArrayBlock.begin()); diff --git a/src/helmholtz/kerkerSolverProblemDevice.cc b/src/helmholtz/kerkerSolverProblemDevice.cc index 2a2afc5d4..84712f276 100644 --- 
a/src/helmholtz/kerkerSolverProblemDevice.cc +++ b/src/helmholtz/kerkerSolverProblemDevice.cc @@ -103,11 +103,6 @@ namespace dftfe d_matrixFreeDataPRefinedPtr->get_vector_partitioner( d_matrixFreeVectorComponent), *d_constraintMatrixPRefinedPtr); - d_constraintsTotalPotentialInfo.precomputeMaps( - d_matrixFreeDataPRefinedPtr->get_vector_partitioner( - d_matrixFreeVectorComponent), - d_xPtr->get_partitioner(), - 1); } diff --git a/src/poisson/poissonSolverProblemDevice.cc b/src/poisson/poissonSolverProblemDevice.cc index 4881183dc..27f4ad642 100644 --- a/src/poisson/poissonSolverProblemDevice.cc +++ b/src/poisson/poissonSolverProblemDevice.cc @@ -785,10 +785,6 @@ namespace dftfe d_constraintsTotalPotentialInfo.initialize( d_matrixFreeDataPtr->get_vector_partitioner(d_matrixFreeVectorComponent), *d_constraintMatrixPtr); - d_constraintsTotalPotentialInfo.precomputeMaps( - d_matrixFreeDataPtr->get_vector_partitioner(d_matrixFreeVectorComponent), - d_xPtr->get_partitioner(), - 1); } diff --git a/src/symmetry/symmetrizeRho.cc b/src/symmetry/symmetrizeRho.cc index 9410ecea6..5ee927dee 100644 --- a/src/symmetry/symmetrizeRho.cc +++ b/src/symmetry/symmetrizeRho.cc @@ -231,11 +231,6 @@ namespace dftfe dftPtr->d_numEigenValues, eigenVectorsFlattenedArrayFullBlock); - dftPtr->constraintsNoneDataInfo.precomputeMaps( - dftPtr->matrix_free_data.get_vector_partitioner(), - eigenVectorsFlattenedArrayFullBlock.get_partitioner(), - dftPtr->d_numEigenValues); - for (unsigned int kPoint = 0; kPoint < (1 + dftPtr->getParametersObject().spinPolarized) * dftPtr->d_kPointWeights.size(); diff --git a/utils/DeviceBlasWrapper.hip.cc b/utils/DeviceBlasWrapper.hip.cc index ec6a5b316..d474133ad 100644 --- a/utils/DeviceBlasWrapper.hip.cc +++ b/utils/DeviceBlasWrapper.hip.cc @@ -517,6 +517,107 @@ namespace dftfe DEVICEBLAS_API_CHECK(status); return status; } + deviceBlasStatus_t + gemv(deviceBlasHandle_t handle, + deviceBlasOperation_t trans, + int m, + int n, + const double * alpha, + const 
double * A, + int lda, + const double * x, + int incx, + const double * beta, + double * y, + int incy) + { + deviceBlasStatus_t status = hipblasDgemv( + handle, trans, m, n, alpha, A, lda, x, incx, beta, y, incy); + DEVICEBLAS_API_CHECK(status); + return status; + } + + deviceBlasStatus_t + gemv(deviceBlasHandle_t handle, + deviceBlasOperation_t trans, + int m, + int n, + const float * alpha, + const float * A, + int lda, + const float * x, + int incx, + const float * beta, + float * y, + int incy) + { + deviceBlasStatus_t status = hipblasSgemv( + handle, trans, m, n, alpha, A, lda, x, incx, beta, y, incy); + DEVICEBLAS_API_CHECK(status); + return status; + } + + deviceBlasStatus_t + gemv(deviceBlasHandle_t handle, + deviceBlasOperation_t trans, + int m, + int n, + const std::complex *alpha, + const std::complex *A, + int lda, + const std::complex *x, + int incx, + const std::complex *beta, + std::complex * y, + int incy) + { + deviceBlasStatus_t status = + hipblasZgemv(handle, + trans, + m, + n, + dftfe::utils::makeDataTypeDeviceCompatible(alpha), + dftfe::utils::makeDataTypeDeviceCompatible(A), + lda, + dftfe::utils::makeDataTypeDeviceCompatible(x), + incx, + dftfe::utils::makeDataTypeDeviceCompatible(beta), + dftfe::utils::makeDataTypeDeviceCompatible(y), + incy); + DEVICEBLAS_API_CHECK(status); + return status; + } + + deviceBlasStatus_t + gemv(deviceBlasHandle_t handle, + deviceBlasOperation_t trans, + int m, + int n, + const std::complex *alpha, + const std::complex *A, + int lda, + const std::complex *x, + int incx, + const std::complex *beta, + std::complex * y, + int incy) + { + deviceBlasStatus_t status = + hipblasCgemv(handle, + trans, + m, + n, + dftfe::utils::makeDataTypeDeviceCompatible(alpha), + dftfe::utils::makeDataTypeDeviceCompatible(A), + lda, + dftfe::utils::makeDataTypeDeviceCompatible(x), + incx, + dftfe::utils::makeDataTypeDeviceCompatible(beta), + dftfe::utils::makeDataTypeDeviceCompatible(y), + incy); + DEVICEBLAS_API_CHECK(status); + 
return status; + } } // namespace deviceBlasWrapper } // namespace utils diff --git a/utils/FEBasisOperations.cc b/utils/FEBasisOperations.cc index ab15bb427..2721274d7 100644 --- a/utils/FEBasisOperations.cc +++ b/utils/FEBasisOperations.cc @@ -34,7 +34,6 @@ namespace dftfe d_matrixFreeDataPtr = &matrixFreeData; d_constraintsVector = &constraintsVector; d_dofHandlerID = 0; - d_quadratureID = 0; d_nVectors = 0; d_updateFlags = update_default; areAllCellsAffine = true; @@ -57,8 +56,25 @@ namespace dftfe (d_matrixFreeDataPtr->get_mapping_info().get_cell_type( iMacroCell) == dealii::internal::MatrixFreeFunctions::cartesian); } - // std::cout << "DEBUG cart " << areAllCellsCartesian << " " - // << areAllCellsAffine << std::endl; + } + + template + void + FEBasisOperationsBase::init(const unsigned int &dofHandlerID, + const std::vector + & quadratureID, + const UpdateFlags updateFlags) + { + d_dofHandlerID = dofHandlerID; + d_quadratureIDsVector = quadratureID; + d_updateFlags = updateFlags; + initializeIndexMaps(); + initializeConstraints(); + initializeShapeFunctionAndJacobianData(); } template ::reinit(const unsigned int &vecBlockSize, const unsigned int &cellsBlockSize, - const unsigned int &dofHandlerID, - const unsigned int &quadratureID, - const UpdateFlags updateFlags) + const unsigned int &quadratureID) { - if ((d_dofHandlerID != dofHandlerID) || (d_updateFlags != updateFlags)) + d_quadratureID = quadratureID; + d_cellsBlockSize = cellsBlockSize; + if (d_nVectors != vecBlockSize) { - d_dofHandlerID = dofHandlerID; - d_quadratureID = quadratureID; - d_nVectors = vecBlockSize; - d_cellsBlockSize = cellsBlockSize; - d_updateFlags = updateFlags; - initializeIndexMaps(); - initializeConstraints(); - initializeShapeFunctionAndJacobianData(); + d_nVectors = vecBlockSize; initializeFlattenedIndexMaps(); - resizeTempStorage(); - } - else if ((d_quadratureID != quadratureID) && (d_nVectors != vecBlockSize)) - { - d_quadratureID = quadratureID; - d_nVectors = 
vecBlockSize; - d_cellsBlockSize = cellsBlockSize; - initializeConstraints(); - initializeShapeFunctionAndJacobianData(); - initializeFlattenedIndexMaps(); - resizeTempStorage(); - } - else if (d_quadratureID != quadratureID) - { - d_quadratureID = quadratureID; - d_cellsBlockSize = cellsBlockSize; - initializeShapeFunctionAndJacobianData(); - resizeTempStorage(); - } - else if (d_nVectors != vecBlockSize) - { - d_nVectors = vecBlockSize; - d_cellsBlockSize = cellsBlockSize; - initializeConstraints(); - initializeFlattenedIndexMaps(); - resizeTempStorage(); - } - else if (d_cellsBlockSize != cellsBlockSize) - { - d_cellsBlockSize = cellsBlockSize; - resizeTempStorage(); } + resizeTempStorage(); + } + + template + unsigned int + FEBasisOperationsBase::nQuadsPerCell() const + { + return d_nQuadsPerCell[d_quadratureID]; + } + + template + unsigned int + FEBasisOperationsBase::nDofsPerCell() const + { + return d_nDofsPerCell; + } + + template + unsigned int + FEBasisOperationsBase::nCells() const + { + return d_nCells; + } + + template + unsigned int + FEBasisOperationsBase::nRelaventDofs() const + { + return d_localSize; + } + + template + unsigned int + FEBasisOperationsBase::nOwnedDofs() const + { + return d_locallyOwnedSize; + } + + template + const ValueTypeBasisCoeff * + FEBasisOperationsBase::shapeFunctionData(bool transpose) const + { + return transpose ? d_shapeFunctionDataTranspose[d_quadratureID].data() : + d_shapeFunctionData[d_quadratureID].data(); + } + + template + const ValueTypeBasisCoeff * + FEBasisOperationsBase< + ValueTypeBasisCoeff, + ValueTypeBasisData, + memorySpace>::shapeFunctionGradientData(bool transpose) const + { + return transpose ? + d_shapeFunctionGradientDataTranspose[d_quadratureID].data() : + d_shapeFunctionGradientData[d_quadratureID].data(); + } + + template + const ValueTypeBasisCoeff * + FEBasisOperationsBase::inverseJacobians() const + { + return d_inverseJacobianData[areAllCellsAffine ? 
0 : d_quadratureID] + .data(); } + template + const ValueTypeBasisCoeff * + FEBasisOperationsBase::JxW() const + { + return d_inverseJacobianData[areAllCellsAffine ? 0 : d_quadratureID] + .data(); + } + + template + unsigned int + FEBasisOperationsBase::cellsTypeFlag() const + { + return (unsigned int)areAllCellsAffine + + (unsigned int)areAllCellsCartesian; + } + + + template @@ -131,15 +229,15 @@ namespace dftfe if (d_updateFlags & update_gradients) tempQuadratureGradientsData.resize( - areAllCellsCartesian ? - 0 : - (d_nVectors * d_nQuadsPerCell * 3 * d_cellsBlockSize)); + areAllCellsCartesian ? 0 : + (d_nVectors * d_nQuadsPerCell[d_quadratureID] * + 3 * d_cellsBlockSize)); if (d_updateFlags & update_gradients) tempQuadratureGradientsDataNonAffine.resize( - areAllCellsAffine ? - 0 : - (d_nVectors * d_nQuadsPerCell * 3 * d_cellsBlockSize)); + areAllCellsAffine ? 0 : + (d_nVectors * d_nQuadsPerCell[d_quadratureID] * + 3 * d_cellsBlockSize)); } template d_flattenedCellDofIndexToProcessDofIndexMapHost; - dftfe::utils::MemoryStorage - d_nonAffineReshapeIDsHost; - if ((memorySpace == dftfe::utils::MemorySpace::DEVICE) && - (!areAllCellsAffine)) - { - d_nonAffineReshapeIDsHost.resize(d_nCells * d_nQuadsPerCell * 3); - for (unsigned int iCell = 0; iCell < d_nCells; ++iCell) - { - for (unsigned int iQuad = 0; iQuad < d_nQuadsPerCell; ++iQuad) - { - for (unsigned int iDim = 0; iDim < 3; ++iDim) - { - d_nonAffineReshapeIDsHost[iQuad + d_nQuadsPerCell * iDim + - d_nQuadsPerCell * 3 * iCell] = - (iDim + 3 * iQuad + d_nQuadsPerCell * 3 * iCell) * - d_nVectors; - } - } - } - } - d_nonAffineReshapeIDs.resize(d_nonAffineReshapeIDsHost.size()); - d_nonAffineReshapeIDs.copyFrom(d_nonAffineReshapeIDsHost); #else auto &d_flattenedCellDofIndexToProcessDofIndexMapHost = d_flattenedCellDofIndexToProcessDofIndexMap; @@ -259,12 +334,6 @@ namespace dftfe d_constraintInfo.initialize(d_matrixFreeDataPtr->get_vector_partitioner( d_dofHandlerID), *((*d_constraintsVector)[d_dofHandlerID])); - 
d_constraintInfo.precomputeMaps( - d_matrixFreeDataPtr->get_vector_partitioner(d_dofHandlerID) - ->locally_owned_size() + - d_matrixFreeDataPtr->get_vector_partitioner(d_dofHandlerID) - ->n_ghost_indices(), - d_nVectors); } template ::initializeShapeFunctionAndJacobianData() { - const dealii::Quadrature<3> &quadrature = - d_matrixFreeDataPtr->get_quadrature(d_quadratureID); - dealii::FEValues<3> fe_values( - d_matrixFreeDataPtr->get_dof_handler(d_dofHandlerID).get_fe(), - quadrature, - dealii::update_values | dealii::update_gradients | - dealii::update_jacobians | dealii::update_JxW_values | - dealii::update_inverse_jacobians); + d_nQuadsPerCell.resize(d_quadratureIDsVector.size()); + d_inverseJacobianData.resize( + areAllCellsAffine ? 1 : d_quadratureIDsVector.size()); + d_JxWData.resize(d_quadratureIDsVector.size()); + if (d_updateFlags & update_values) + { + d_shapeFunctionData.resize(d_quadratureIDsVector.size()); + if (d_updateFlags & update_transpose) + d_shapeFunctionDataTranspose.resize(d_quadratureIDsVector.size()); + } + if (d_updateFlags & update_gradients) + { + d_shapeFunctionGradientDataInternalLayout.resize( + d_quadratureIDsVector.size()); + d_shapeFunctionGradientData.resize(d_quadratureIDsVector.size()); + if (d_updateFlags & update_transpose) + d_shapeFunctionGradientDataTranspose.resize( + d_quadratureIDsVector.size()); + } + for (unsigned int iQuadID = 0; iQuadID < d_quadratureIDsVector.size(); + ++iQuadID) + { + const dealii::Quadrature<3> &quadrature = + d_matrixFreeDataPtr->get_quadrature(d_quadratureIDsVector[iQuadID]); + dealii::FEValues<3> fe_values( + d_matrixFreeDataPtr->get_dof_handler(d_dofHandlerID).get_fe(), + quadrature, + dealii::update_values | dealii::update_gradients | + dealii::update_jacobians | dealii::update_JxW_values | + dealii::update_inverse_jacobians); - d_nQuadsPerCell = quadrature.size(); + d_nQuadsPerCell[iQuadID] = quadrature.size(); #if defined(DFTFE_WITH_DEVICE) - dftfe::utils::MemoryStorage - 
d_inverseJacobianDataHost; - dftfe::utils::MemoryStorage - d_JxWDataHost; - dftfe::utils::MemoryStorage - d_shapeFunctionDataHost; - dftfe::utils::MemoryStorage - d_shapeFunctionGradientDataHost; + dftfe::utils::MemoryStorage + d_inverseJacobianDataHost; + dftfe::utils::MemoryStorage + d_JxWDataHost; + dftfe::utils::MemoryStorage + d_shapeFunctionDataHost; + dftfe::utils::MemoryStorage + d_shapeFunctionDataTransposeHost; + dftfe::utils::MemoryStorage + d_shapeFunctionGradientDataInternalLayoutHost; + dftfe::utils::MemoryStorage + d_shapeFunctionGradientDataHost; + dftfe::utils::MemoryStorage + d_shapeFunctionGradientDataTransposeHost; #else - auto &d_inverseJacobianDataHost = d_inverseJacobianData; - auto &d_JxWDataHost = d_JxWData; - auto &d_shapeFunctionDataHost = d_shapeFunctionData; - auto &d_shapeFunctionGradientDataHost = d_shapeFunctionGradientData; + auto &d_inverseJacobianDataHost = d_inverseJacobianData; + auto &d_JxWDataHost = d_JxWData; + auto &d_shapeFunctionDataHost = d_shapeFunctionData; + auto &d_shapeFunctionGradientDataInternalLayoutHost = + d_shapeFunctionGradientDataInternalLayout; + auto &d_shapeFunctionDataTransposeHost = d_shapeFunctionDataTranspose; + auto &d_shapeFunctionGradientDataHost = d_shapeFunctionGradientData; + auto &d_shapeFunctionGradientDataTransposeHost = + d_shapeFunctionGradientDataTranspose; #endif - d_shapeFunctionDataHost.clear(); - if (d_updateFlags & update_values) - d_shapeFunctionDataHost.resize(d_nQuadsPerCell * d_nDofsPerCell, 0.0); - d_shapeFunctionGradientDataHost.clear(); - if (d_updateFlags & update_gradients) - d_shapeFunctionGradientDataHost.resize(d_nQuadsPerCell * - d_nDofsPerCell * 3, - 0.0); + d_shapeFunctionDataHost.clear(); + if (d_updateFlags & update_values) + d_shapeFunctionDataHost.resize(d_nQuadsPerCell[iQuadID] * + d_nDofsPerCell, + 0.0); + d_shapeFunctionDataTransposeHost.clear(); + if ((d_updateFlags & update_values) && + (d_updateFlags & update_transpose)) + 
d_shapeFunctionDataTransposeHost.resize(d_nQuadsPerCell[iQuadID] * + d_nDofsPerCell, + 0.0); + d_shapeFunctionGradientDataInternalLayoutHost.clear(); + d_shapeFunctionGradientDataHost.clear(); + d_shapeFunctionGradientDataTransposeHost.clear(); + if (d_updateFlags & update_gradients) + { + d_shapeFunctionGradientDataInternalLayoutHost.resize( + d_nQuadsPerCell[iQuadID] * d_nDofsPerCell * 3, 0.0); + d_shapeFunctionGradientDataHost.resize(d_nQuadsPerCell[iQuadID] * + d_nDofsPerCell * 3, + 0.0); + if (d_updateFlags & update_transpose) + d_shapeFunctionGradientDataTransposeHost.resize( + d_nQuadsPerCell[iQuadID] * d_nDofsPerCell * 3, 0.0); + } - d_JxWDataHost.clear(); - if ((d_updateFlags & update_values) || (d_updateFlags & update_gradients)) - d_JxWDataHost.resize(d_nCells * d_nQuadsPerCell); + d_JxWDataHost.clear(); + if ((d_updateFlags & update_values) || + (d_updateFlags & update_gradients)) + d_JxWDataHost.resize(d_nCells * d_nQuadsPerCell[iQuadID]); - d_inverseJacobianDataHost.clear(); - if (d_updateFlags & update_gradients) - d_inverseJacobianDataHost.resize(areAllCellsCartesian ? - d_nCells * 3 : - (areAllCellsAffine ? - d_nCells * 9 : - d_nCells * 9 * d_nQuadsPerCell)); - const unsigned int nJacobiansPerCell = - areAllCellsAffine ? 1 : d_nQuadsPerCell; + d_inverseJacobianDataHost.clear(); + if (d_updateFlags & update_gradients) + d_inverseJacobianDataHost.resize( + areAllCellsCartesian ? + d_nCells * 3 : + (areAllCellsAffine ? d_nCells * 9 : + d_nCells * 9 * d_nQuadsPerCell[iQuadID])); + const unsigned int nJacobiansPerCell = + areAllCellsAffine ? 
1 : d_nQuadsPerCell[iQuadID]; - auto cellPtr = - d_matrixFreeDataPtr->get_dof_handler(d_dofHandlerID).begin_active(); - auto endcPtr = d_matrixFreeDataPtr->get_dof_handler(d_dofHandlerID).end(); + auto cellPtr = + d_matrixFreeDataPtr->get_dof_handler(d_dofHandlerID).begin_active(); + auto endcPtr = + d_matrixFreeDataPtr->get_dof_handler(d_dofHandlerID).end(); - unsigned int iCell = 0; - for (; cellPtr != endcPtr; ++cellPtr) - if (cellPtr->is_locally_owned()) - { - fe_values.reinit(cellPtr); - auto &jacobians = fe_values.get_jacobians(); - auto &inverseJacobians = fe_values.get_inverse_jacobians(); - if (iCell == 0) + unsigned int iCell = 0; + for (; cellPtr != endcPtr; ++cellPtr) + if (cellPtr->is_locally_owned()) { - if (d_updateFlags & update_values) - for (unsigned int iNode = 0; iNode < d_nDofsPerCell; ++iNode) - for (unsigned int iQuad = 0; iQuad < d_nQuadsPerCell; - ++iQuad) - d_shapeFunctionDataHost[iQuad * d_nDofsPerCell + iNode] = - fe_values.shape_value(iNode, iQuad); - - - if (d_updateFlags & update_gradients) - for (unsigned int iQuad = 0; iQuad < d_nQuadsPerCell; ++iQuad) - for (unsigned int iNode = 0; iNode < d_nDofsPerCell; - ++iNode) + fe_values.reinit(cellPtr); + auto &jacobians = fe_values.get_jacobians(); + auto &inverseJacobians = fe_values.get_inverse_jacobians(); + if (iCell == 0) + { + if (d_updateFlags & update_values) { - const auto &shape_grad_real = - fe_values.shape_grad(iNode, iQuad); - const auto &shape_grad_reference = - apply_transformation(jacobians[iQuad].transpose(), - shape_grad_real); - for (unsigned int iDim = 0; iDim < 3; ++iDim) - if (areAllCellsAffine) - d_shapeFunctionGradientDataHost - [d_nQuadsPerCell * d_nDofsPerCell * iDim + - d_nDofsPerCell * iQuad + iNode] = - shape_grad_reference[iDim]; - else - d_shapeFunctionGradientDataHost - [iQuad * d_nDofsPerCell * 3 + - d_nDofsPerCell * iDim + iNode] = - shape_grad_reference[iDim]; + for (unsigned int iNode = 0; iNode < d_nDofsPerCell; + ++iNode) + for (unsigned int iQuad = 0; 
+ iQuad < d_nQuadsPerCell[iQuadID]; + ++iQuad) + d_shapeFunctionDataHost[iQuad * d_nDofsPerCell + + iNode] = + fe_values.shape_value(iNode, iQuad); + if (d_updateFlags & update_transpose) + for (unsigned int iNode = 0; iNode < d_nDofsPerCell; + ++iNode) + for (unsigned int iQuad = 0; + iQuad < d_nQuadsPerCell[iQuadID]; + ++iQuad) + d_shapeFunctionDataTransposeHost + [iNode * d_nQuadsPerCell[iQuadID] + iQuad] = + fe_values.shape_value(iNode, iQuad); } + + + if (d_updateFlags & update_gradients) + for (unsigned int iQuad = 0; + iQuad < d_nQuadsPerCell[iQuadID]; + ++iQuad) + for (unsigned int iNode = 0; iNode < d_nDofsPerCell; + ++iNode) + { + const auto &shape_grad_real = + fe_values.shape_grad(iNode, iQuad); + const auto &shape_grad_reference = + apply_transformation(jacobians[iQuad].transpose(), + shape_grad_real); + for (unsigned int iDim = 0; iDim < 3; ++iDim) + if (areAllCellsAffine) + d_shapeFunctionGradientDataInternalLayoutHost + [d_nQuadsPerCell[iQuadID] * d_nDofsPerCell * + iDim + + d_nDofsPerCell * iQuad + iNode] = + shape_grad_reference[iDim]; + else + d_shapeFunctionGradientDataInternalLayoutHost + [iQuad * d_nDofsPerCell * 3 + + d_nDofsPerCell * iDim + iNode] = + shape_grad_reference[iDim]; + + + for (unsigned int iDim = 0; iDim < 3; ++iDim) + d_shapeFunctionGradientDataHost + [iDim * d_nQuadsPerCell[iQuadID] * + d_nDofsPerCell + + iQuad * d_nDofsPerCell + iNode] = + shape_grad_reference[iDim]; + if (d_updateFlags & update_transpose) + for (unsigned int iDim = 0; iDim < 3; ++iDim) + d_shapeFunctionGradientDataTransposeHost + [iDim * d_nQuadsPerCell[iQuadID] * + d_nDofsPerCell + + iNode * d_nQuadsPerCell[iQuadID] + iQuad] = + shape_grad_reference[iDim]; + } + } + for (unsigned int iQuad = 0; iQuad < d_nQuadsPerCell[iQuadID]; + ++iQuad) + d_JxWDataHost[iCell * d_nQuadsPerCell[iQuadID] + iQuad] = + fe_values.JxW(iQuad); + for (unsigned int iQuad = 0; iQuad < nJacobiansPerCell; ++iQuad) + for (unsigned int iDim = 0; iDim < 3; ++iDim) + if 
(areAllCellsCartesian) + d_inverseJacobianDataHost[iCell * nJacobiansPerCell * 3 + + iDim * nJacobiansPerCell + + iQuad] = + inverseJacobians[iQuad][iDim][iDim]; + else + for (unsigned int jDim = 0; jDim < 3; ++jDim) + d_inverseJacobianDataHost[iCell * nJacobiansPerCell * + 9 + + 9 * iQuad + jDim * 3 + iDim] = + inverseJacobians[iQuad][iDim][jDim]; + ++iCell; } - for (unsigned int iQuad = 0; iQuad < d_nQuadsPerCell; ++iQuad) - d_JxWDataHost[iCell * d_nQuadsPerCell + iQuad] = - fe_values.JxW(iQuad); - for (unsigned int iQuad = 0; iQuad < nJacobiansPerCell; ++iQuad) - for (unsigned int iDim = 0; iDim < 3; ++iDim) - if (areAllCellsCartesian) - d_inverseJacobianDataHost[iCell * nJacobiansPerCell * 3 + - iDim * nJacobiansPerCell + iQuad] = - inverseJacobians[iQuad][iDim][iDim]; - else - for (unsigned int jDim = 0; jDim < 3; ++jDim) - d_inverseJacobianDataHost[iCell * nJacobiansPerCell * 9 + - 9 * iQuad + jDim * 3 + iDim] = - inverseJacobians[iQuad][iDim][jDim]; - ++iCell; - } #if defined(DFTFE_WITH_DEVICE) - d_inverseJacobianData.resize(d_inverseJacobianDataHost.size()); - d_inverseJacobianData.copyFrom(d_inverseJacobianDataHost); - d_JxWData.resize(d_JxWDataHost.size()); - d_JxWData.copyFrom(d_JxWDataHost); - d_shapeFunctionData.resize(d_shapeFunctionDataHost.size()); - d_shapeFunctionData.copyFrom(d_shapeFunctionDataHost); - d_shapeFunctionGradientData.resize( - d_shapeFunctionGradientDataHost.size()); - d_shapeFunctionGradientData.copyFrom(d_shapeFunctionGradientDataHost); + d_inverseJacobianData[areAllCellsAffine ? 0 : iQuadID].resize( + d_inverseJacobianDataHost.size()); + d_inverseJacobianData[areAllCellsAffine ? 
0 : iQuadID].copyFrom( + d_inverseJacobianDataHost); + d_JxWData[iQuadID].resize(d_JxWDataHost.size()); + d_JxWData[iQuadID].copyFrom(d_JxWDataHost); + d_shapeFunctionData[iQuadID].resize(d_shapeFunctionDataHost.size()); + d_shapeFunctionData[iQuadID].copyFrom(d_shapeFunctionDataHost); + d_shapeFunctionGradientDataInternalLayout[iQuadID].resize( + d_shapeFunctionGradientDataInternalLayoutHost.size()); + d_shapeFunctionGradientDataInternalLayout[iQuadID].copyFrom( + d_shapeFunctionGradientDataInternalLayoutHost); + d_shapeFunctionDataTranspose[iQuadID].resize( + d_shapeFunctionDataTransposeHost.size()); + d_shapeFunctionDataTranspose[iQuadID].copyFrom( + d_shapeFunctionDataTransposeHost); + d_shapeFunctionGradientData[iQuadID].resize( + d_shapeFunctionGradientDataHost.size()); + d_shapeFunctionGradientData[iQuadID].copyFrom( + d_shapeFunctionGradientDataHost); + d_shapeFunctionGradientDataTranspose[iQuadID].resize( + d_shapeFunctionGradientDataTransposeHost.size()); + d_shapeFunctionGradientDataTranspose[iQuadID].copyFrom( + d_shapeFunctionGradientDataTransposeHost); #endif + } } template >>( d_nVectors, - d_nQuadsPerCell, + d_nQuadsPerCell[d_quadratureID], (cellRange.second - cellRange.first), dftfe::utils::makeDataTypeDeviceCompatible( tempQuadratureGradientsDataNonAffine.data()), @@ -288,14 +282,14 @@ namespace dftfe hipLaunchKernelGGL(reshapeNonAffineCaseDeviceKernel, (d_nVectors * (cellRange.second - cellRange.first) * - d_nQuadsPerCell * 3) / + d_nQuadsPerCell[d_quadratureID] * 3) / dftfe::utils::DEVICE_BLOCK_SIZE + 1, dftfe::utils::DEVICE_BLOCK_SIZE, 0, 0, d_nVectors, - d_nQuadsPerCell, + d_nQuadsPerCell[d_quadratureID], (cellRange.second - cellRange.first), dftfe::utils::makeDataTypeDeviceCompatible( tempQuadratureGradientsDataNonAffine.data()), diff --git a/utils/FEBasisOperationsHost.cc b/utils/FEBasisOperationsHost.cc index 4f8357f49..6e9a06318 100644 --- a/utils/FEBasisOperationsHost.cc +++ b/utils/FEBasisOperationsHost.cc @@ -140,22 +140,22 @@ namespace 
dftfe xgemm(&transA, &transB, &d_nVectors, - &d_nQuadsPerCell, + &d_nQuadsPerCell[d_quadratureID], &d_nDofsPerCell, &scalarCoeffAlpha, cellNodalValues + d_nDofsPerCell * (iCell - cellRange.first) * d_nVectors, &d_nVectors, - d_shapeFunctionData.data(), + d_shapeFunctionData[d_quadratureID].data(), &d_nDofsPerCell, &scalarCoeffBeta, - quadratureValues + - d_nQuadsPerCell * (iCell - cellRange.first) * d_nVectors, + quadratureValues + d_nQuadsPerCell[d_quadratureID] * + (iCell - cellRange.first) * d_nVectors, &d_nVectors); if (quadratureGradients != NULL) { const unsigned int d_nQuadsPerCellTimesThree = - d_nQuadsPerCell * 3; + d_nQuadsPerCell[d_quadratureID] * 3; xgemm(&transA, &transB, &d_nVectors, @@ -165,32 +165,34 @@ namespace dftfe cellNodalValues + d_nDofsPerCell * (iCell - cellRange.first) * d_nVectors, &d_nVectors, - d_shapeFunctionGradientData.data(), + d_shapeFunctionGradientDataInternalLayout[d_quadratureID] + .data(), &d_nDofsPerCell, &scalarCoeffBeta, areAllCellsCartesian ? - (quadratureGradients + d_nQuadsPerCell * d_nVectors * 3 * + (quadratureGradients + d_nQuadsPerCell[d_quadratureID] * + d_nVectors * 3 * (iCell - cellRange.first)) : (tempQuadratureGradientsData.data()), &d_nVectors); if (areAllCellsCartesian) { const unsigned int d_nQuadsPerCellTimesnVectors = - d_nQuadsPerCell * d_nVectors; + d_nQuadsPerCell[d_quadratureID] * d_nVectors; const unsigned int one = 1; for (unsigned int iDim = 0; iDim < 3; ++iDim) xscal(&d_nQuadsPerCellTimesnVectors, - d_inverseJacobianData.data() + 3 * iCell + iDim, + d_inverseJacobianData[0].data() + 3 * iCell + iDim, quadratureGradients + - d_nQuadsPerCell * d_nVectors * 3 * + d_nQuadsPerCell[d_quadratureID] * d_nVectors * 3 * (iCell - cellRange.first) + - d_nQuadsPerCell * d_nVectors * iDim, + d_nQuadsPerCell[d_quadratureID] * d_nVectors * iDim, &one); } else if (areAllCellsAffine) { const unsigned int d_nQuadsPerCellTimesnVectors = - d_nQuadsPerCell * d_nVectors; + d_nQuadsPerCell[d_quadratureID] * d_nVectors; 
const unsigned int three = 3; xgemm(&transA, &transB, @@ -200,17 +202,20 @@ namespace dftfe &scalarCoeffAlpha, tempQuadratureGradientsData.data(), &d_nQuadsPerCellTimesnVectors, - d_inverseJacobianData.data() + 9 * iCell, + d_inverseJacobianData[0].data() + 9 * iCell, &three, &scalarCoeffBeta, - quadratureGradients + d_nQuadsPerCell * d_nVectors * 3 * + quadratureGradients + d_nQuadsPerCell[d_quadratureID] * + d_nVectors * 3 * (iCell - cellRange.first), &d_nQuadsPerCellTimesnVectors); } else { const unsigned int three = 3; - for (unsigned int iQuad = 0; iQuad < d_nQuadsPerCell; ++iQuad) + for (unsigned int iQuad = 0; + iQuad < d_nQuadsPerCell[d_quadratureID]; + ++iQuad) xgemm(&transA, &transB, &d_nVectors, @@ -220,19 +225,24 @@ namespace dftfe tempQuadratureGradientsData.data() + iQuad * d_nVectors * 3, &d_nVectors, - d_inverseJacobianData.data() + - 9 * d_nQuadsPerCell * iCell + 9 * iQuad, + d_inverseJacobianData[d_quadratureID].data() + + 9 * d_nQuadsPerCell[d_quadratureID] * iCell + + 9 * iQuad, &three, &scalarCoeffBeta, tempQuadratureGradientsDataNonAffine.data() + iQuad * d_nVectors * 3, &d_nVectors); - for (unsigned int iQuad = 0; iQuad < d_nQuadsPerCell; ++iQuad) + for (unsigned int iQuad = 0; + iQuad < d_nQuadsPerCell[d_quadratureID]; + ++iQuad) for (unsigned int iDim = 0; iDim < 3; ++iDim) std::memcpy(quadratureGradients + - d_nVectors * 3 * d_nQuadsPerCell * + d_nVectors * 3 * + d_nQuadsPerCell[d_quadratureID] * (iCell - cellRange.first) + - d_nVectors * d_nQuadsPerCell * iDim + + d_nVectors * + d_nQuadsPerCell[d_quadratureID] * iDim + d_nVectors * iQuad, tempQuadratureGradientsDataNonAffine.data() + d_nVectors * 3 * iQuad + d_nVectors * iDim, @@ -261,11 +271,14 @@ namespace dftfe tempQuadratureGradientsDataNonAffine; cellNodalData.resize(d_nVectors * d_nDofsPerCell * d_nCells); if (quadratureGradients != NULL) - tempQuadratureGradientsData.resize(3 * d_nVectors * d_nQuadsPerCell); + tempQuadratureGradientsData.resize(3 * d_nVectors * + 
d_nQuadsPerCell[d_quadratureID]); if (quadratureGradients != NULL) tempQuadratureGradientsDataNonAffine.resize( - areAllCellsAffine ? 0 : (3 * d_nVectors * d_nQuadsPerCell)); + areAllCellsAffine ? + 0 : + (3 * d_nVectors * d_nQuadsPerCell[d_quadratureID])); @@ -280,12 +293,12 @@ namespace dftfe &transB, &d_nVectors, &d_nDofsPerCell, - &d_nQuadsPerCell, + &d_nQuadsPerCell[d_quadratureID], &scalarCoeffAlpha, - quadratureValues + d_nQuadsPerCell * iCell, + quadratureValues + d_nQuadsPerCell[d_quadratureID] * iCell, &d_nVectors, - d_shapeFunctionData.data(), - &d_nQuadsPerCell, + d_shapeFunctionData[d_quadratureID].data(), + &d_nQuadsPerCell[d_quadratureID], &scalarCoeffBeta, cellNodalData.data() + d_nDofsPerCell * iCell, &d_nVectors); @@ -294,24 +307,25 @@ namespace dftfe if (areAllCellsCartesian) { const unsigned int d_nQuadsPerCellTimesnVectors = - d_nQuadsPerCell * d_nVectors; + d_nQuadsPerCell[d_quadratureID] * d_nVectors; const unsigned int one = 1; std::memcpy(tempQuadratureGradientsData.data(), quadratureGradients + - d_nQuadsPerCell * d_nVectors * 3 * iCell, + d_nQuadsPerCell[d_quadratureID] * d_nVectors * + 3 * iCell, 3 * d_nQuadsPerCellTimesnVectors * sizeof(ValueTypeBasisCoeff)); for (unsigned int iDim = 0; iDim < 3; ++iDim) xscal(&d_nQuadsPerCellTimesnVectors, - d_inverseJacobianData.data() + 3 * iCell + iDim, + d_inverseJacobianData[0].data() + 3 * iCell + iDim, tempQuadratureGradientsData.data() + - d_nQuadsPerCell * d_nVectors * iDim, + d_nQuadsPerCell[d_quadratureID] * d_nVectors * iDim, &one); } else if (areAllCellsAffine) { const unsigned int d_nQuadsPerCellTimesnVectors = - d_nQuadsPerCell * d_nVectors; + d_nQuadsPerCell[d_quadratureID] * d_nVectors; const unsigned int three = 3; xgemm(&transA, &transB, @@ -319,10 +333,10 @@ namespace dftfe &three, &three, &scalarCoeffAlpha, - quadratureGradients + - d_nQuadsPerCell * d_nVectors * 3 * iCell, + quadratureGradients + d_nQuadsPerCell[d_quadratureID] * + d_nVectors * 3 * iCell, 
&d_nQuadsPerCellTimesnVectors, - d_inverseJacobianData.data() + 9 * iCell, + d_inverseJacobianData[0].data() + 9 * iCell, &three, &scalarCoeffBeta, tempQuadratureGradientsData.data(), @@ -330,17 +344,23 @@ namespace dftfe } else { - for (unsigned int iQuad = 0; iQuad < d_nQuadsPerCell; ++iQuad) + for (unsigned int iQuad = 0; + iQuad < d_nQuadsPerCell[d_quadratureID]; + ++iQuad) for (unsigned int iDim = 0; iDim < 3; ++iDim) std::memcpy(tempQuadratureGradientsDataNonAffine.data() + d_nVectors * 3 * iQuad + d_nVectors * iDim, quadratureGradients + - d_nVectors * 3 * d_nQuadsPerCell * iCell + - d_nVectors * d_nQuadsPerCell * iDim + + d_nVectors * 3 * + d_nQuadsPerCell[d_quadratureID] * iCell + + d_nVectors * + d_nQuadsPerCell[d_quadratureID] * iDim + d_nVectors * iQuad, d_nVectors * sizeof(ValueTypeBasisCoeff)); const unsigned int three = 3; - for (unsigned int iQuad = 0; iQuad < d_nQuadsPerCell; ++iQuad) + for (unsigned int iQuad = 0; + iQuad < d_nQuadsPerCell[d_quadratureID]; + ++iQuad) xgemm(&transA, &transB, &d_nVectors, @@ -350,8 +370,9 @@ namespace dftfe tempQuadratureGradientsDataNonAffine.data() + d_nVectors * 3 * iQuad, &d_nVectors, - d_inverseJacobianData.data() + - 9 * d_nQuadsPerCell * iCell + 9 * iQuad, + d_inverseJacobianData[d_quadratureID].data() + + 9 * d_nQuadsPerCell[d_quadratureID] * iCell + + 9 * iQuad, &three, &scalarCoeffBeta, tempQuadratureGradientsData.data() + @@ -359,7 +380,7 @@ namespace dftfe &d_nVectors); } const unsigned int d_nQuadsPerCellTimesThree = - d_nQuadsPerCell * 3; + d_nQuadsPerCell[d_quadratureID] * 3; xgemm(&transA, &transB, &d_nVectors, @@ -368,7 +389,8 @@ namespace dftfe &scalarCoeffAlpha, tempQuadratureGradientsData.data(), &d_nVectors, - d_shapeFunctionGradientData.data(), + d_shapeFunctionGradientDataInternalLayout[d_quadratureID] + .data(), &d_nDofsPerCell, &scalarCoeffBeta, cellNodalData.data() + d_nDofsPerCell * iCell, diff --git a/utils/constraintMatrixInfo.cc b/utils/constraintMatrixInfo.cc index 6267953c6..7b3eb77a2 
100644 --- a/utils/constraintMatrixInfo.cc +++ b/utils/constraintMatrixInfo.cc @@ -171,84 +171,6 @@ namespace dftfe } } - - void - constraintMatrixInfo::precomputeMaps( - const std::shared_ptr - &unFlattenedPartitioner, - const std::shared_ptr - & flattenedPartitioner, - const unsigned int blockSize) - { - // - // Get required sizes - // - const unsigned int n_ghosts = unFlattenedPartitioner->n_ghost_indices(); - const unsigned int localSize = unFlattenedPartitioner->local_size(); - const unsigned int totalSize = n_ghosts + localSize; - - d_localIndexMapUnflattenedToFlattened.clear(); - d_localIndexMapUnflattenedToFlattened.resize(totalSize); - - // - // fill the data array - // - for (unsigned int ilocalDof = 0; ilocalDof < totalSize; ++ilocalDof) - { - const dealii::types::global_dof_index globalIndex = - unFlattenedPartitioner->local_to_global(ilocalDof); - d_localIndexMapUnflattenedToFlattened[ilocalDof] = - flattenedPartitioner->global_to_local(globalIndex * blockSize); - } - } - - void - constraintMatrixInfo::precomputeMaps( - const std::shared_ptr< - const utils::mpi::MPIPatternP2P> - & mpiPattern, - const unsigned int blockSize) - { - // - // Get required sizes - // - const unsigned int totalSize = - mpiPattern->localOwnedSize() + mpiPattern->localGhostSize(); - - d_localIndexMapUnflattenedToFlattened.clear(); - d_localIndexMapUnflattenedToFlattened.resize(totalSize); - - // - // fill the data array - // - for (unsigned int ilocalDof = 0; ilocalDof < totalSize; ++ilocalDof) - { - d_localIndexMapUnflattenedToFlattened[ilocalDof] = - (dealii::types::global_dof_index)ilocalDof * - (dealii::types::global_dof_index)blockSize; - } - } - - void - constraintMatrixInfo::precomputeMaps(const unsigned int totalSize, - const unsigned int blockSize) - { - d_localIndexMapUnflattenedToFlattened.clear(); - d_localIndexMapUnflattenedToFlattened.resize(totalSize); - - // - // fill the data array - // - for (unsigned int ilocalDof = 0; ilocalDof < totalSize; ++ilocalDof) - { 
- d_localIndexMapUnflattenedToFlattened[ilocalDof] = - (dealii::types::global_dof_index)ilocalDof * - (dealii::types::global_dof_index)blockSize; - } - } - - - // // set the constrained degrees of freedom to values so that constraints // are satisfied @@ -291,7 +213,7 @@ namespace dftfe d_inhomogenities[i]); const dealii::types::global_dof_index startingLocalDofIndexRow = - d_localIndexMapUnflattenedToFlattened[d_rowIdsLocal[i]]; + d_rowIdsLocal[i] * blockSize; for (unsigned int j = 0; j < d_rowSizes[i]; ++j) { @@ -302,8 +224,7 @@ namespace dftfe const dealii::types::global_dof_index startingLocalDofIndexColumn = - d_localIndexMapUnflattenedToFlattened - [d_columnIdsLocal[count]]; + d_columnIdsLocal[count] * blockSize; T alpha = d_columnValues[count]; @@ -341,7 +262,7 @@ namespace dftfe d_inhomogenities[i]); const dealii::types::global_dof_index startingLocalDofIndexRow = - d_localIndexMapUnflattenedToFlattened[d_rowIdsLocal[i]]; + d_rowIdsLocal[i] * blockSize; for (unsigned int j = 0; j < d_rowSizes[i]; ++j) { @@ -352,8 +273,7 @@ namespace dftfe const dealii::types::global_dof_index startingLocalDofIndexColumn = - d_localIndexMapUnflattenedToFlattened - [d_columnIdsLocal[count]]; + d_columnIdsLocal[count] * blockSize; T alpha = d_columnValues[count]; @@ -389,13 +309,12 @@ namespace dftfe for (unsigned int i = 0; i < d_rowIdsLocal.size(); ++i) { const dealii::types::global_dof_index startingLocalDofIndexRow = - d_localIndexMapUnflattenedToFlattened[d_rowIdsLocal[i]]; + d_rowIdsLocal[i] * blockSize; for (unsigned int j = 0; j < d_rowSizes[i]; ++j) { const dealii::types::global_dof_index startingLocalDofIndexColumn = - d_localIndexMapUnflattenedToFlattened - [d_columnIdsLocal[count]]; + d_columnIdsLocal[count] * blockSize; T alpha = d_columnValues[count]; callaxpy(&blockSize, @@ -429,13 +348,12 @@ namespace dftfe for (unsigned int i = 0; i < d_rowIdsLocal.size(); ++i) { const dealii::types::global_dof_index startingLocalDofIndexRow = - 
d_localIndexMapUnflattenedToFlattened[d_rowIdsLocal[i]]; + d_rowIdsLocal[i] * blockSize; for (unsigned int j = 0; j < d_rowSizes[i]; ++j) { const dealii::types::global_dof_index startingLocalDofIndexColumn = - d_localIndexMapUnflattenedToFlattened - [d_columnIdsLocal[count]]; + d_columnIdsLocal[count] * blockSize; T alpha = d_columnValues[count]; callaxpy(&blockSize, @@ -466,7 +384,7 @@ namespace dftfe for (unsigned int i = 0; i < d_rowIdsLocal.size(); ++i) { const dealii::types::global_dof_index startingLocalDofIndexRow = - d_localIndexMapUnflattenedToFlattened[d_rowIdsLocal[i]]; + d_rowIdsLocal[i] * blockSize; // set constrained nodes to zero std::fill(fieldVector.begin() + startingLocalDofIndexRow, @@ -483,7 +401,7 @@ namespace dftfe for (unsigned int i = 0; i < d_rowIdsLocal.size(); ++i) { const dealii::types::global_dof_index startingLocalDofIndexRow = - d_localIndexMapUnflattenedToFlattened[d_rowIdsLocal[i]]; + d_rowIdsLocal[i] * blockSize; // set constrained nodes to zero std::fill(fieldVector.data() + startingLocalDofIndexRow, @@ -506,7 +424,6 @@ namespace dftfe d_columnValues.clear(); d_inhomogenities.clear(); d_rowSizes.clear(); - d_localIndexMapUnflattenedToFlattened.clear(); } diff --git a/utils/constraintMatrixInfoDevice.cc b/utils/constraintMatrixInfoDevice.cc index 7d992b1ad..119ef6fef 100644 --- a/utils/constraintMatrixInfoDevice.cc +++ b/utils/constraintMatrixInfoDevice.cc @@ -39,9 +39,7 @@ namespace dftfe const unsigned int *constraintRowSizesAccumulated, const unsigned int *constraintLocalColumnIdsAllRowsUnflattened, const double * constraintColumnValuesAllRowsUnflattened, - const double * inhomogenities, - const dealii::types::global_dof_index - *localIndexMapUnflattenedToFlattened) + const double * inhomogenities) { const dealii::types::global_dof_index globalThreadId = blockIdx.x * blockDim.x + threadIdx.x; @@ -60,7 +58,7 @@ namespace dftfe const unsigned int startingColumnNumber = constraintRowSizesAccumulated[blockIndex]; const 
dealii::types::global_dof_index xVecStartingIdRow = - localIndexMapUnflattenedToFlattened[constrainedRowId]; + constrainedRowId * contiguousBlockSize; xVec[xVecStartingIdRow + intraBlockIndex] = inhomogenities[blockIndex]; for (unsigned int i = 0; i < numberColumns; ++i) @@ -69,7 +67,7 @@ namespace dftfe constraintLocalColumnIdsAllRowsUnflattened [startingColumnNumber + i]; const dealii::types::global_dof_index xVecStartingIdColumn = - localIndexMapUnflattenedToFlattened[constrainedColumnId]; + constrainedColumnId * contiguousBlockSize; xVec[xVecStartingIdRow + intraBlockIndex] += constraintColumnValuesAllRowsUnflattened [startingColumnNumber + i] * @@ -89,9 +87,7 @@ namespace dftfe const unsigned int *constraintRowSizesAccumulated, const unsigned int *constraintLocalColumnIdsAllRowsUnflattened, const double * constraintColumnValuesAllRowsUnflattened, - const double * inhomogenities, - const dealii::types::global_dof_index - *localIndexMapUnflattenedToFlattened) + const double * inhomogenities) { const dealii::types::global_dof_index globalThreadId = blockIdx.x * blockDim.x + threadIdx.x; @@ -110,7 +106,7 @@ namespace dftfe const unsigned int startingColumnNumber = constraintRowSizesAccumulated[blockIndex]; const dealii::types::global_dof_index xVecStartingIdRow = - localIndexMapUnflattenedToFlattened[constrainedRowId]; + constrainedRowId * contiguousBlockSize; xVec[xVecStartingIdRow + intraBlockIndex] = inhomogenities[blockIndex]; for (unsigned int i = 0; i < numberColumns; ++i) @@ -119,7 +115,7 @@ namespace dftfe constraintLocalColumnIdsAllRowsUnflattened [startingColumnNumber + i]; const dealii::types::global_dof_index xVecStartingIdColumn = - localIndexMapUnflattenedToFlattened[constrainedColumnId]; + constrainedColumnId * contiguousBlockSize; xVec[xVecStartingIdRow + intraBlockIndex] += constraintColumnValuesAllRowsUnflattened [startingColumnNumber + i] * @@ -139,9 +135,7 @@ namespace dftfe const unsigned int * constraintRowSizesAccumulated, const unsigned int 
*constraintLocalColumnIdsAllRowsUnflattened, const double * constraintColumnValuesAllRowsUnflattened, - const double * inhomogenities, - const dealii::types::global_dof_index - *localIndexMapUnflattenedToFlattened) + const double * inhomogenities) { const dealii::types::global_dof_index globalThreadId = blockIdx.x * blockDim.x + threadIdx.x; @@ -160,7 +154,7 @@ namespace dftfe const unsigned int startingColumnNumber = constraintRowSizesAccumulated[blockIndex]; const dealii::types::global_dof_index xVecStartingIdRow = - localIndexMapUnflattenedToFlattened[constrainedRowId]; + constrainedRowId * contiguousBlockSize; dftfe::utils::copyValue(xVec + xVecStartingIdRow + intraBlockIndex, inhomogenities[blockIndex]); for (unsigned int i = 0; i < numberColumns; ++i) @@ -169,7 +163,7 @@ namespace dftfe constraintLocalColumnIdsAllRowsUnflattened [startingColumnNumber + i]; const dealii::types::global_dof_index xVecStartingIdColumn = - localIndexMapUnflattenedToFlattened[constrainedColumnId]; + constrainedColumnId * contiguousBlockSize; dftfe::utils::copyValue( xVec + xVecStartingIdRow + intraBlockIndex, dftfe::utils::add( @@ -196,9 +190,7 @@ namespace dftfe const unsigned int * constraintRowSizesAccumulated, const unsigned int *constraintLocalColumnIdsAllRowsUnflattened, const double * constraintColumnValuesAllRowsUnflattened, - const double * inhomogenities, - const dealii::types::global_dof_index - *localIndexMapUnflattenedToFlattened) + const double * inhomogenities) { const dealii::types::global_dof_index globalThreadId = blockIdx.x * blockDim.x + threadIdx.x; @@ -217,7 +209,7 @@ namespace dftfe const unsigned int startingColumnNumber = constraintRowSizesAccumulated[blockIndex]; const dealii::types::global_dof_index xVecStartingIdRow = - localIndexMapUnflattenedToFlattened[constrainedRowId]; + constrainedRowId * contiguousBlockSize; dftfe::utils::copyValue(xVec + xVecStartingIdRow + intraBlockIndex, inhomogenities[blockIndex]); for (unsigned int i = 0; i < numberColumns; 
++i) @@ -226,7 +218,7 @@ namespace dftfe constraintLocalColumnIdsAllRowsUnflattened [startingColumnNumber + i]; const dealii::types::global_dof_index xVecStartingIdColumn = - localIndexMapUnflattenedToFlattened[constrainedColumnId]; + constrainedColumnId * contiguousBlockSize; dftfe::utils::copyValue( xVec + xVecStartingIdRow + intraBlockIndex, dftfe::utils::add( @@ -251,9 +243,7 @@ namespace dftfe const unsigned int *constraintRowSizes, const unsigned int *constraintRowSizesAccumulated, const unsigned int *constraintLocalColumnIdsAllRowsUnflattened, - const double * constraintColumnValuesAllRowsUnflattened, - const dealii::types::global_dof_index - *localIndexMapUnflattenedToFlattened) + const double * constraintColumnValuesAllRowsUnflattened) { const dealii::types::global_dof_index globalThreadId = blockIdx.x * blockDim.x + threadIdx.x; @@ -272,14 +262,14 @@ namespace dftfe const unsigned int startingColumnNumber = constraintRowSizesAccumulated[blockIndex]; const dealii::types::global_dof_index xVecStartingIdRow = - localIndexMapUnflattenedToFlattened[constrainedRowId]; + constrainedRowId * contiguousBlockSize; for (unsigned int i = 0; i < numberColumns; ++i) { const unsigned int constrainedColumnId = constraintLocalColumnIdsAllRowsUnflattened [startingColumnNumber + i]; const dealii::types::global_dof_index xVecStartingIdColumn = - localIndexMapUnflattenedToFlattened[constrainedColumnId]; + constrainedColumnId * contiguousBlockSize; atomicAdd(&(xVec[xVecStartingIdColumn + intraBlockIndex]), constraintColumnValuesAllRowsUnflattened [startingColumnNumber + i] * @@ -299,9 +289,7 @@ namespace dftfe const unsigned int *constraintRowSizes, const unsigned int *constraintRowSizesAccumulated, const unsigned int *constraintLocalColumnIdsAllRowsUnflattened, - const double * constraintColumnValuesAllRowsUnflattened, - const dealii::types::global_dof_index - *localIndexMapUnflattenedToFlattened) + const double * constraintColumnValuesAllRowsUnflattened) { const 
dealii::types::global_dof_index globalThreadId = blockIdx.x * blockDim.x + threadIdx.x; @@ -320,14 +308,14 @@ namespace dftfe const unsigned int startingColumnNumber = constraintRowSizesAccumulated[blockIndex]; const dealii::types::global_dof_index xVecStartingIdRow = - localIndexMapUnflattenedToFlattened[constrainedRowId]; + constrainedRowId * contiguousBlockSize; for (unsigned int i = 0; i < numberColumns; ++i) { const unsigned int constrainedColumnId = constraintLocalColumnIdsAllRowsUnflattened [startingColumnNumber + i]; const dealii::types::global_dof_index xVecStartingIdColumn = - localIndexMapUnflattenedToFlattened[constrainedColumnId]; + constrainedColumnId * contiguousBlockSize; const float tempfloatval = constraintColumnValuesAllRowsUnflattened [startingColumnNumber + i] * @@ -344,9 +332,7 @@ namespace dftfe setzeroKernel(const unsigned int contiguousBlockSize, double * xVec, const unsigned int *constraintLocalRowIdsUnflattened, - const unsigned int numConstraints, - const dealii::types::global_dof_index - *localIndexMapUnflattenedToFlattened) + const unsigned int numConstraints) { const dealii::types::global_dof_index globalThreadId = blockIdx.x * blockDim.x + threadIdx.x; @@ -359,8 +345,8 @@ namespace dftfe { const unsigned int blockIndex = index / contiguousBlockSize; const unsigned int intraBlockIndex = index % contiguousBlockSize; - xVec[localIndexMapUnflattenedToFlattened - [constraintLocalRowIdsUnflattened[blockIndex]] + + xVec[constraintLocalRowIdsUnflattened[blockIndex] * + contiguousBlockSize + intraBlockIndex] = 0; } } @@ -369,9 +355,7 @@ namespace dftfe setzeroKernel(const unsigned int contiguousBlockSize, float * xVec, const unsigned int *constraintLocalRowIdsUnflattened, - const unsigned int numConstraints, - const dealii::types::global_dof_index - *localIndexMapUnflattenedToFlattened) + const unsigned int numConstraints) { const dealii::types::global_dof_index globalThreadId = blockIdx.x * blockDim.x + threadIdx.x; @@ -384,8 +368,8 @@ 
namespace dftfe { const unsigned int blockIndex = index / contiguousBlockSize; const unsigned int intraBlockIndex = index % contiguousBlockSize; - xVec[localIndexMapUnflattenedToFlattened - [constraintLocalRowIdsUnflattened[blockIndex]] + + xVec[constraintLocalRowIdsUnflattened[blockIndex] * + contiguousBlockSize + intraBlockIndex] = 0; } } @@ -394,9 +378,7 @@ namespace dftfe setzeroKernel(const unsigned int contiguousBlockSize, dftfe::utils::deviceDoubleComplex *xVec, const unsigned int *constraintLocalRowIdsUnflattened, - const unsigned int numConstraints, - const dealii::types::global_dof_index - *localIndexMapUnflattenedToFlattened) + const unsigned int numConstraints) { const dealii::types::global_dof_index globalThreadId = blockIdx.x * blockDim.x + threadIdx.x; @@ -411,8 +393,8 @@ namespace dftfe const unsigned int intraBlockIndex = index % contiguousBlockSize; dftfe::utils::copyValue( xVec + - localIndexMapUnflattenedToFlattened - [constraintLocalRowIdsUnflattened[blockIndex]] + + constraintLocalRowIdsUnflattened[blockIndex] * + contiguousBlockSize + intraBlockIndex, 0.0); } @@ -423,9 +405,7 @@ namespace dftfe setzeroKernel(const unsigned int contiguousBlockSize, dftfe::utils::deviceFloatComplex *xVec, const unsigned int *constraintLocalRowIdsUnflattened, - const unsigned int numConstraints, - const dealii::types::global_dof_index - *localIndexMapUnflattenedToFlattened) + const unsigned int numConstraints) { const dealii::types::global_dof_index globalThreadId = blockIdx.x * blockDim.x + threadIdx.x; @@ -440,8 +420,8 @@ namespace dftfe const unsigned int intraBlockIndex = index % contiguousBlockSize; dftfe::utils::copyValue( xVec + - localIndexMapUnflattenedToFlattened - [constraintLocalRowIdsUnflattened[blockIndex]] + + constraintLocalRowIdsUnflattened[blockIndex] * + contiguousBlockSize + intraBlockIndex, 0.0); } @@ -561,102 +541,6 @@ namespace dftfe d_numConstrainedDofs = d_rowIdsLocal.size(); } - - void - constraintMatrixInfoDevice::precomputeMaps( - 
const std::shared_ptr - &unFlattenedPartitioner, - const std::shared_ptr - & flattenedPartitioner, - const unsigned int blockSize) - { - // - // Get required sizes - // - const unsigned int n_ghosts = unFlattenedPartitioner->n_ghost_indices(); - const unsigned int localSize = unFlattenedPartitioner->local_size(); - const unsigned int totalSize = n_ghosts + localSize; - - d_localIndexMapUnflattenedToFlattened.clear(); - d_localIndexMapUnflattenedToFlattened.resize(totalSize); - - // - // fill the data array - // - for (unsigned int ilocalDof = 0; ilocalDof < totalSize; ++ilocalDof) - { - const dealii::types::global_dof_index globalIndex = - unFlattenedPartitioner->local_to_global(ilocalDof); - d_localIndexMapUnflattenedToFlattened[ilocalDof] = - flattenedPartitioner->global_to_local(globalIndex * blockSize); - } - - d_localIndexMapUnflattenedToFlattenedDevice.resize( - d_localIndexMapUnflattenedToFlattened.size()); - d_localIndexMapUnflattenedToFlattenedDevice.copyFrom( - d_localIndexMapUnflattenedToFlattened); - } - - void - constraintMatrixInfoDevice::precomputeMaps( - const std::shared_ptr< - const utils::mpi::MPIPatternP2P> - & mpiPattern, - const unsigned int blockSize) - { - // - // Get required sizes - // - const unsigned int totalSize = - mpiPattern->localOwnedSize() + mpiPattern->localGhostSize(); - - d_localIndexMapUnflattenedToFlattened.clear(); - d_localIndexMapUnflattenedToFlattened.resize(totalSize); - - // - // fill the data array - // - for (unsigned int ilocalDof = 0; ilocalDof < totalSize; ++ilocalDof) - { - // const dealii::types::global_dof_index globalIndex = - // unFlattenedPartitioner->local_to_global(ilocalDof); - d_localIndexMapUnflattenedToFlattened[ilocalDof] = - ilocalDof * blockSize; - // flattenedPartitioner->globalToLocal(globalIndex * blockSize); - } - - d_localIndexMapUnflattenedToFlattenedDevice.resize( - d_localIndexMapUnflattenedToFlattened.size()); - d_localIndexMapUnflattenedToFlattenedDevice.copyFrom( - 
d_localIndexMapUnflattenedToFlattened); - } - - void - constraintMatrixInfoDevice::precomputeMaps(const unsigned int totalSize, - const unsigned int blockSize) - { - d_localIndexMapUnflattenedToFlattened.clear(); - d_localIndexMapUnflattenedToFlattened.resize(totalSize); - - // - // fill the data array - // - for (unsigned int ilocalDof = 0; ilocalDof < totalSize; ++ilocalDof) - { - // const dealii::types::global_dof_index globalIndex = - // unFlattenedPartitioner->local_to_global(ilocalDof); - d_localIndexMapUnflattenedToFlattened[ilocalDof] = - ilocalDof * blockSize; - // flattenedPartitioner->globalToLocal(globalIndex * blockSize); - } - - d_localIndexMapUnflattenedToFlattenedDevice.resize( - d_localIndexMapUnflattenedToFlattened.size()); - d_localIndexMapUnflattenedToFlattenedDevice.copyFrom( - d_localIndexMapUnflattenedToFlattened); - } - - template void constraintMatrixInfoDevice::distribute( @@ -681,8 +565,7 @@ namespace dftfe d_rowSizesAccumulatedDevice.begin(), d_columnIdsLocalDevice.begin(), d_columnValuesDevice.begin(), - d_inhomogenitiesDevice.begin(), - d_localIndexMapUnflattenedToFlattenedDevice.begin()); + d_inhomogenitiesDevice.begin()); #elif DFTFE_WITH_DEVICE_LANG_HIP hipLaunchKernelGGL( distributeKernel, @@ -700,8 +583,7 @@ namespace dftfe d_rowSizesAccumulatedDevice.begin(), d_columnIdsLocalDevice.begin(), d_columnValuesDevice.begin(), - d_inhomogenitiesDevice.begin(), - d_localIndexMapUnflattenedToFlattenedDevice.begin()); + d_inhomogenitiesDevice.begin()); #endif } @@ -732,8 +614,7 @@ namespace dftfe d_rowSizesDevice.begin(), d_rowSizesAccumulatedDevice.begin(), d_columnIdsLocalDevice.begin(), - d_columnValuesDevice.begin(), - d_localIndexMapUnflattenedToFlattenedDevice.begin()); + d_columnValuesDevice.begin()); #elif DFTFE_WITH_DEVICE_LANG_HIP hipLaunchKernelGGL( distributeSlaveToMasterKernelAtomicAdd, @@ -750,8 +631,7 @@ namespace dftfe d_rowSizesDevice.begin(), d_rowSizesAccumulatedDevice.begin(), d_columnIdsLocalDevice.begin(), - 
d_columnValuesDevice.begin(), - d_localIndexMapUnflattenedToFlattenedDevice.begin()); + d_columnValuesDevice.begin()); #endif } @@ -780,31 +660,27 @@ namespace dftfe min((blockSize + (dftfe::utils::DEVICE_BLOCK_SIZE - 1)) / dftfe::utils::DEVICE_BLOCK_SIZE * d_numConstrainedDofs, 30000), - dftfe::utils::DEVICE_BLOCK_SIZE>>>( - blockSize, - tempReal, - d_rowIdsLocalDevice.begin(), - d_numConstrainedDofs, - d_rowSizesDevice.begin(), - d_rowSizesAccumulatedDevice.begin(), - d_columnIdsLocalDevice.begin(), - d_columnValuesDevice.begin(), - d_localIndexMapUnflattenedToFlattenedDevice.begin()); + dftfe::utils::DEVICE_BLOCK_SIZE>>>(blockSize, + tempReal, + d_rowIdsLocalDevice.begin(), + d_numConstrainedDofs, + d_rowSizesDevice.begin(), + d_rowSizesAccumulatedDevice.begin(), + d_columnIdsLocalDevice.begin(), + d_columnValuesDevice.begin()); distributeSlaveToMasterKernelAtomicAdd<<< min((blockSize + (dftfe::utils::DEVICE_BLOCK_SIZE - 1)) / dftfe::utils::DEVICE_BLOCK_SIZE * d_numConstrainedDofs, 30000), - dftfe::utils::DEVICE_BLOCK_SIZE>>>( - blockSize, - tempImag, - d_rowIdsLocalDevice.begin(), - d_numConstrainedDofs, - d_rowSizesDevice.begin(), - d_rowSizesAccumulatedDevice.begin(), - d_columnIdsLocalDevice.begin(), - d_columnValuesDevice.begin(), - d_localIndexMapUnflattenedToFlattenedDevice.begin()); + dftfe::utils::DEVICE_BLOCK_SIZE>>>(blockSize, + tempImag, + d_rowIdsLocalDevice.begin(), + d_numConstrainedDofs, + d_rowSizesDevice.begin(), + d_rowSizesAccumulatedDevice.begin(), + d_columnIdsLocalDevice.begin(), + d_columnValuesDevice.begin()); #elif DFTFE_WITH_DEVICE_LANG_HIP hipLaunchKernelGGL( distributeSlaveToMasterKernelAtomicAdd, @@ -821,8 +697,7 @@ namespace dftfe d_rowSizesDevice.begin(), d_rowSizesAccumulatedDevice.begin(), d_columnIdsLocalDevice.begin(), - d_columnValuesDevice.begin(), - d_localIndexMapUnflattenedToFlattenedDevice.begin()); + d_columnValuesDevice.begin()); hipLaunchKernelGGL( distributeSlaveToMasterKernelAtomicAdd, @@ -839,8 +714,7 @@ namespace 
dftfe d_rowSizesDevice.begin(), d_rowSizesAccumulatedDevice.begin(), d_columnIdsLocalDevice.begin(), - d_columnValuesDevice.begin(), - d_localIndexMapUnflattenedToFlattenedDevice.begin()); + d_columnValuesDevice.begin()); #endif dftfe::utils::deviceKernelsGeneric::copyRealArrsToComplexArrDevice( @@ -875,31 +749,27 @@ namespace dftfe min((blockSize + (dftfe::utils::DEVICE_BLOCK_SIZE - 1)) / dftfe::utils::DEVICE_BLOCK_SIZE * d_numConstrainedDofs, 30000), - dftfe::utils::DEVICE_BLOCK_SIZE>>>( - blockSize, - tempReal, - d_rowIdsLocalDevice.begin(), - d_numConstrainedDofs, - d_rowSizesDevice.begin(), - d_rowSizesAccumulatedDevice.begin(), - d_columnIdsLocalDevice.begin(), - d_columnValuesDevice.begin(), - d_localIndexMapUnflattenedToFlattenedDevice.begin()); + dftfe::utils::DEVICE_BLOCK_SIZE>>>(blockSize, + tempReal, + d_rowIdsLocalDevice.begin(), + d_numConstrainedDofs, + d_rowSizesDevice.begin(), + d_rowSizesAccumulatedDevice.begin(), + d_columnIdsLocalDevice.begin(), + d_columnValuesDevice.begin()); distributeSlaveToMasterKernelAtomicAdd<<< min((blockSize + (dftfe::utils::DEVICE_BLOCK_SIZE - 1)) / dftfe::utils::DEVICE_BLOCK_SIZE * d_numConstrainedDofs, 30000), - dftfe::utils::DEVICE_BLOCK_SIZE>>>( - blockSize, - tempImag, - d_rowIdsLocalDevice.begin(), - d_numConstrainedDofs, - d_rowSizesDevice.begin(), - d_rowSizesAccumulatedDevice.begin(), - d_columnIdsLocalDevice.begin(), - d_columnValuesDevice.begin(), - d_localIndexMapUnflattenedToFlattenedDevice.begin()); + dftfe::utils::DEVICE_BLOCK_SIZE>>>(blockSize, + tempImag, + d_rowIdsLocalDevice.begin(), + d_numConstrainedDofs, + d_rowSizesDevice.begin(), + d_rowSizesAccumulatedDevice.begin(), + d_columnIdsLocalDevice.begin(), + d_columnValuesDevice.begin()); #elif DFTFE_WITH_DEVICE_LANG_HIP hipLaunchKernelGGL( distributeSlaveToMasterKernelAtomicAdd, @@ -916,8 +786,7 @@ namespace dftfe d_rowSizesDevice.begin(), d_rowSizesAccumulatedDevice.begin(), d_columnIdsLocalDevice.begin(), - d_columnValuesDevice.begin(), - 
d_localIndexMapUnflattenedToFlattenedDevice.begin()); + d_columnValuesDevice.begin()); hipLaunchKernelGGL( distributeSlaveToMasterKernelAtomicAdd, @@ -934,8 +803,7 @@ namespace dftfe d_rowSizesDevice.begin(), d_rowSizesAccumulatedDevice.begin(), d_columnIdsLocalDevice.begin(), - d_columnValuesDevice.begin(), - d_localIndexMapUnflattenedToFlattenedDevice.begin()); + d_columnValuesDevice.begin()); #endif dftfe::utils::deviceKernelsGeneric::copyRealArrsToComplexArrDevice( @@ -965,8 +833,7 @@ namespace dftfe blockSize, dftfe::utils::makeDataTypeDeviceCompatible(fieldVector.begin()), d_rowIdsLocalDevice.begin(), - numConstrainedDofs, - d_localIndexMapUnflattenedToFlattenedDevice.begin()); + numConstrainedDofs); #elif DFTFE_WITH_DEVICE_LANG_HIP hipLaunchKernelGGL( setzeroKernel, @@ -979,8 +846,7 @@ namespace dftfe blockSize, dftfe::utils::makeDataTypeDeviceCompatible(fieldVector.begin()), d_rowIdsLocalDevice.begin(), - numConstrainedDofs, - d_localIndexMapUnflattenedToFlattenedDevice.begin()); + numConstrainedDofs); #endif } From 1408d2f7052e0ecca6f5ee44ca1e6f7a4b7ff6fe Mon Sep 17 00:00:00 2001 From: Nikhil Kodali Date: Mon, 2 Oct 2023 13:50:32 +0530 Subject: [PATCH 10/25] Fix cpu compilation --- src/dft/densityCalculator.cc | 4 ++++ utils/FEBasisOperations.cc | 14 +++++++------- 2 files changed, 11 insertions(+), 7 deletions(-) diff --git a/src/dft/densityCalculator.cc b/src/dft/densityCalculator.cc index b2c6188ae..c831d529c 100644 --- a/src/dft/densityCalculator.cc +++ b/src/dft/densityCalculator.cc @@ -69,7 +69,9 @@ namespace dftfe { int this_process; MPI_Comm_rank(mpiCommParent, &this_process); +#if defined(DFTFE_WITH_DEVICE) dftfe::utils::deviceSynchronize(); +#endif MPI_Barrier(mpiCommParent); double computeRho_time = MPI_Wtime(); const unsigned int numKPoints = kPointWeights.size(); @@ -656,7 +658,9 @@ namespace dftfe } iElem++; } +#if defined(DFTFE_WITH_DEVICE) dftfe::utils::deviceSynchronize(); +#endif MPI_Barrier(mpiCommParent); computeRho_time = MPI_Wtime() - 
computeRho_time; diff --git a/utils/FEBasisOperations.cc b/utils/FEBasisOperations.cc index 2721274d7..7aa620dc6 100644 --- a/utils/FEBasisOperations.cc +++ b/utils/FEBasisOperations.cc @@ -400,15 +400,15 @@ namespace dftfe dftfe::utils::MemorySpace::HOST> d_shapeFunctionGradientDataTransposeHost; #else - auto &d_inverseJacobianDataHost = d_inverseJacobianData; - auto &d_JxWDataHost = d_JxWData; - auto &d_shapeFunctionDataHost = d_shapeFunctionData; + auto &d_inverseJacobianDataHost = d_inverseJacobianData[areAllCellsAffine ? 0:iQuadID]; + auto &d_JxWDataHost = d_JxWData[iQuadID]; + auto &d_shapeFunctionDataHost = d_shapeFunctionData[iQuadID]; auto &d_shapeFunctionGradientDataInternalLayoutHost = - d_shapeFunctionGradientDataInternalLayout; - auto &d_shapeFunctionDataTransposeHost = d_shapeFunctionDataTranspose; - auto &d_shapeFunctionGradientDataHost = d_shapeFunctionGradientData; + d_shapeFunctionGradientDataInternalLayout[iQuadID]; + auto &d_shapeFunctionDataTransposeHost = d_shapeFunctionDataTranspose[iQuadID]; + auto &d_shapeFunctionGradientDataHost = d_shapeFunctionGradientData[iQuadID]; auto &d_shapeFunctionGradientDataTransposeHost = - d_shapeFunctionGradientDataTranspose; + d_shapeFunctionGradientDataTranspose[iQuadID]; #endif From bdcca47eb1834b966da3d5be9a53534c32099930 Mon Sep 17 00:00:00 2001 From: Nikhil Kodali Date: Mon, 2 Oct 2023 14:52:57 +0530 Subject: [PATCH 11/25] Indentation standard --- utils/DeviceBlasWrapper.hip.cc | 44 +++++++++++++++++----------------- utils/FEBasisOperations.cc | 13 ++++++---- 2 files changed, 30 insertions(+), 27 deletions(-) diff --git a/utils/DeviceBlasWrapper.hip.cc b/utils/DeviceBlasWrapper.hip.cc index d474133ad..8e96b6c40 100644 --- a/utils/DeviceBlasWrapper.hip.cc +++ b/utils/DeviceBlasWrapper.hip.cc @@ -573,17 +573,17 @@ namespace dftfe { deviceBlasStatus_t status = hipblasZgemv(handle, - trans, - m, - n, - dftfe::utils::makeDataTypeDeviceCompatible(alpha), - dftfe::utils::makeDataTypeDeviceCompatible(A), - lda, 
- dftfe::utils::makeDataTypeDeviceCompatible(x), - incx, - dftfe::utils::makeDataTypeDeviceCompatible(beta), - dftfe::utils::makeDataTypeDeviceCompatible(y), - incy); + trans, + m, + n, + dftfe::utils::makeDataTypeDeviceCompatible(alpha), + dftfe::utils::makeDataTypeDeviceCompatible(A), + lda, + dftfe::utils::makeDataTypeDeviceCompatible(x), + incx, + dftfe::utils::makeDataTypeDeviceCompatible(beta), + dftfe::utils::makeDataTypeDeviceCompatible(y), + incy); DEVICEBLAS_API_CHECK(status); return status; } @@ -604,17 +604,17 @@ namespace dftfe { deviceBlasStatus_t status = hipblasCgemv(handle, - trans, - m, - n, - dftfe::utils::makeDataTypeDeviceCompatible(alpha), - dftfe::utils::makeDataTypeDeviceCompatible(A), - lda, - dftfe::utils::makeDataTypeDeviceCompatible(x), - incx, - dftfe::utils::makeDataTypeDeviceCompatible(beta), - dftfe::utils::makeDataTypeDeviceCompatible(y), - incy); + trans, + m, + n, + dftfe::utils::makeDataTypeDeviceCompatible(alpha), + dftfe::utils::makeDataTypeDeviceCompatible(A), + lda, + dftfe::utils::makeDataTypeDeviceCompatible(x), + incx, + dftfe::utils::makeDataTypeDeviceCompatible(beta), + dftfe::utils::makeDataTypeDeviceCompatible(y), + incy); DEVICEBLAS_API_CHECK(status); return status; } diff --git a/utils/FEBasisOperations.cc b/utils/FEBasisOperations.cc index 7aa620dc6..4813184f9 100644 --- a/utils/FEBasisOperations.cc +++ b/utils/FEBasisOperations.cc @@ -400,13 +400,16 @@ namespace dftfe dftfe::utils::MemorySpace::HOST> d_shapeFunctionGradientDataTransposeHost; #else - auto &d_inverseJacobianDataHost = d_inverseJacobianData[areAllCellsAffine ? 0:iQuadID]; - auto &d_JxWDataHost = d_JxWData[iQuadID]; - auto &d_shapeFunctionDataHost = d_shapeFunctionData[iQuadID]; + auto &d_inverseJacobianDataHost = + d_inverseJacobianData[areAllCellsAffine ? 
0 : iQuadID]; + auto &d_JxWDataHost = d_JxWData[iQuadID]; + auto &d_shapeFunctionDataHost = d_shapeFunctionData[iQuadID]; auto &d_shapeFunctionGradientDataInternalLayoutHost = d_shapeFunctionGradientDataInternalLayout[iQuadID]; - auto &d_shapeFunctionDataTransposeHost = d_shapeFunctionDataTranspose[iQuadID]; - auto &d_shapeFunctionGradientDataHost = d_shapeFunctionGradientData[iQuadID]; + auto &d_shapeFunctionDataTransposeHost = + d_shapeFunctionDataTranspose[iQuadID]; + auto &d_shapeFunctionGradientDataHost = + d_shapeFunctionGradientData[iQuadID]; auto &d_shapeFunctionGradientDataTransposeHost = d_shapeFunctionGradientDataTranspose[iQuadID]; #endif From e92c5b960e30d1ec425fb0f00f2c690c755086c8 Mon Sep 17 00:00:00 2001 From: Nikhil Kodali Date: Mon, 2 Oct 2023 14:56:09 +0530 Subject: [PATCH 12/25] fix hip with nvidia gpus --- CMakeLists.txt | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index ac94ce9ec..43e26a0aa 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -227,7 +227,10 @@ IF (WITH_GPU) set_source_files_properties(${DEVICE_SRC} PROPERTIES LANGUAGE CUDA) ELSEIF ("${GPU_LANG}" STREQUAL "hip") set_source_files_properties(${DEVICE_SRC} PROPERTIES LANGUAGE HIP) - ADD_DEFINITIONS(-D__HIP_PLATFORM_AMD__) + IF ("${GPU_VENDOR}" STREQUAL "amd") + ADD_DEFINITIONS(-D__HIP_PLATFORM_AMD__) + ELSEIF ("${GPU_VENDOR}" STREQUAL "nvidia") + ADD_DEFINITIONS(-D__HIP_PLATFORM_NVIDIA__) ENDIF() ENDIF() IF (WITH_GPU) From 1a463c676bb5581b82ccb156f62ffb73b3d9f785 Mon Sep 17 00:00:00 2001 From: Nikhil Kodali Date: Mon, 2 Oct 2023 23:54:45 +0530 Subject: [PATCH 13/25] Fix cmakelists --- CMakeLists.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/CMakeLists.txt b/CMakeLists.txt index 43e26a0aa..72db98957 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -231,6 +231,7 @@ IF (WITH_GPU) ADD_DEFINITIONS(-D__HIP_PLATFORM_AMD__) ELSEIF ("${GPU_VENDOR}" STREQUAL "nvidia") ADD_DEFINITIONS(-D__HIP_PLATFORM_NVIDIA__) + ENDIF() 
ENDIF() ENDIF() IF (WITH_GPU) From 3605717c9b7b6896d25ccd47a55c6c197697711b Mon Sep 17 00:00:00 2001 From: Nikhil Kodali Date: Tue, 3 Oct 2023 17:50:17 +0530 Subject: [PATCH 14/25] BugFixes and cleanups --- include/FEBasisOperations.h | 3 +- include/densityCalculatorCPU.h | 68 ------------------------ include/densityCalculatorDevice.h | 69 ------------------------- src/dft/dft.cc | 10 +++- src/dft/kohnShamEigenSolve.cc | 1 - src/force/forceWfcContractionsDevice.cc | 38 +------------- utils/FEBasisOperations.cc | 6 ++- utils/FEBasisOperationsDevice.cc | 39 +++++++------- utils/FEBasisOperationsHost.cc | 31 +++++------ 9 files changed, 52 insertions(+), 213 deletions(-) delete mode 100644 include/densityCalculatorCPU.h delete mode 100644 include/densityCalculatorDevice.h diff --git a/include/FEBasisOperations.h b/include/FEBasisOperations.h index 024f5a403..f49221f2b 100644 --- a/include/FEBasisOperations.h +++ b/include/FEBasisOperations.h @@ -97,7 +97,8 @@ namespace dftfe void reinit(const unsigned int &vecBlockSize, const unsigned int &cellBlockSize, - const unsigned int &quadratureID); + const unsigned int &quadratureID, + const bool isResizeTempStorage = true); // private: #if defined(DFTFE_WITH_DEVICE) diff --git a/include/densityCalculatorCPU.h b/include/densityCalculatorCPU.h deleted file mode 100644 index 05300ae50..000000000 --- a/include/densityCalculatorCPU.h +++ /dev/null @@ -1,68 +0,0 @@ -// --------------------------------------------------------------------- -// -// Copyright (c) 2017-2022 The Regents of the University of Michigan and DFT-FE -// authors. -// -// This file is part of the DFT-FE code. -// -// The DFT-FE code is free software; you can use it, redistribute -// it, and/or modify it under the terms of the GNU Lesser General -// Public License as published by the Free Software Foundation; either -// version 2.1 of the License, or (at your option) any later version. 
-// The full text of the license can be found in the file LICENSE at -// the top level of the DFT-FE distribution. -// -// --------------------------------------------------------------------- -// - -#ifndef densityCalculatorCPU_H_ -#define densityCalculatorCPU_H_ - -#include "headers.h" -#include "operator.h" -#include "FEBasisOperations.h" -#include "dftParameters.h" - -namespace dftfe -{ - /** - * @brief Density calculator class using gemm recasting - * - * @author Sambit Das - */ - - template - void - computeRhoFromPSICPU( - const T * X, - const T * XFrac, - const unsigned int totalNumWaveFunctions, - const unsigned int Nfr, - const unsigned int numLocalDofs, - const std::vector> &eigenValues, - const double fermiEnergy, - const double fermiEnergyUp, - const double fermiEnergyDown, - operatorDFTClass & operatorMatrix, - std::unique_ptr< - dftfe::basis:: - FEBasisOperations> - & basisOperationsPtrHost, - const dealii::DoFHandler<3> & dofHandler, - const unsigned int totalLocallyOwnedCells, - const unsigned int numberNodesPerElement, - const unsigned int numQuadPoints, - const std::vector & kPointWeights, - std::map> *rhoValues, - std::map> *gradRhoValues, - std::map> *rhoValuesSpinPolarized, - std::map> *gradRhoValuesSpinPolarized, - const bool isEvaluateGradRho, - const MPI_Comm & mpiCommParent, - const MPI_Comm & interpoolcomm, - const MPI_Comm & interBandGroupComm, - const dftParameters & dftParams, - const bool spectrumSplit, - const bool useFEOrderRhoPlusOneGLQuad); -} // namespace dftfe -#endif diff --git a/include/densityCalculatorDevice.h b/include/densityCalculatorDevice.h deleted file mode 100644 index 774b67adf..000000000 --- a/include/densityCalculatorDevice.h +++ /dev/null @@ -1,69 +0,0 @@ -// --------------------------------------------------------------------- -// -// Copyright (c) 2017-2022 The Regents of the University of Michigan and DFT-FE -// authors. -// -// This file is part of the DFT-FE code. 
-// -// The DFT-FE code is free software; you can use it, redistribute -// it, and/or modify it under the terms of the GNU Lesser General -// Public License as published by the Free Software Foundation; either -// version 2.1 of the License, or (at your option) any later version. -// The full text of the license can be found in the file LICENSE at -// the top level of the DFT-FE distribution. -// -// --------------------------------------------------------------------- -// - -#if defined(DFTFE_WITH_DEVICE) -# ifndef densityCalculatorDevice_H_ -# define densityCalculatorDevice_H_ - -# include -# include -# include "dftParameters.h" -# include "FEBasisOperations.h" - -namespace dftfe -{ - namespace Device - { - template - void - computeRhoFromPSI( - const NumberType * X, - const NumberType * XFrac, - const unsigned int totalNumWaveFunctions, - const unsigned int Nfr, - const unsigned int numLocalDofs, - const std::vector> &eigenValues, - const double fermiEnergy, - const double fermiEnergyUp, - const double fermiEnergyDown, - operatorDFTDeviceClass & operatorMatrix, - std::unique_ptr< - dftfe::basis::FEBasisOperations> - & basisOperationsPtrDevice, - const unsigned int matrixFreeDofhandlerIndex, - const dealii::DoFHandler<3> & dofHandler, - const unsigned int totalLocallyOwnedCells, - const unsigned int numberNodesPerElement, - const unsigned int numQuadPoints, - const std::vector & kPointWeights, - std::map> *rhoValues, - std::map> *gradRhoValues, - std::map> *rhoValuesSpinPolarized, - std::map> *gradRhoValuesSpinPolarized, - const bool isEvaluateGradRho, - const MPI_Comm & mpiCommParent, - const MPI_Comm & interpoolcomm, - const MPI_Comm & interBandGroupComm, - const dftParameters & dftParams, - const bool spectrumSplit, - const bool use2pPlusOneGLQuad = false); - } -} // namespace dftfe -# endif -#endif diff --git a/src/dft/dft.cc b/src/dft/dft.cc index 6658b09c7..decfb89fd 100644 --- a/src/dft/dft.cc +++ b/src/dft/dft.cc @@ -20,7 +20,6 @@ // Include header files 
#include #include -#include #include #include #include @@ -65,7 +64,6 @@ #include #ifdef DFTFE_WITH_DEVICE -# include # include #endif @@ -1941,6 +1939,9 @@ namespace dftfe d_kohnShamDFTOperatorDevicePtr->reinit( std::min(d_dftParamsPtr->chebyWfcBlockSize, d_numEigenValues), true); + + basisOperationsPtrDevice->setDeviceBLASHandle( + &(d_kohnShamDFTOperatorDevicePtr->getDeviceBlasHandle())); } #endif } @@ -3883,6 +3884,11 @@ namespace dftfe #endif ); } +#ifdef DFTFE_WITH_DEVICE + if (d_dftParamsPtr->useDevice) + basisOperationsPtrDevice->setDeviceBLASHandle( + &(d_kohnShamDFTOperatorDevicePtr->getDeviceBlasHandle())); +#endif forcePtr->computeStress(matrix_free_data, #ifdef DFTFE_WITH_DEVICE diff --git a/src/dft/kohnShamEigenSolve.cc b/src/dft/kohnShamEigenSolve.cc index dd18ad3eb..92b0552c3 100644 --- a/src/dft/kohnShamEigenSolve.cc +++ b/src/dft/kohnShamEigenSolve.cc @@ -20,7 +20,6 @@ #include #include #include -#include namespace dftfe { diff --git a/src/force/forceWfcContractionsDevice.cc b/src/force/forceWfcContractionsDevice.cc index 46977639d..43d07bb93 100644 --- a/src/force/forceWfcContractionsDevice.cc +++ b/src/force/forceWfcContractionsDevice.cc @@ -475,55 +475,20 @@ namespace dftfe dftfe::basis::update_values | dftfe::basis::update_gradients; basisOperationsPtr->reinit(BVec, cellsBlockSize, 0); - // dftfe::utils::deviceKernelsGeneric::stridedCopyToBlock( - // BVec, - // numCells * numNodesPerElement, - // Xb.begin(), - // cellWaveFunctionMatrix.begin(), - // (operatorMatrix.getFlattenedArrayCellLocalProcIndexIdMap()).begin()); - const int blockSize = cellsBlockSize; const int numberBlocks = numCells / blockSize; const int remBlockSize = numCells - numberBlocks * blockSize; - // dftfe::utils::MemoryStorage - // shapeFunctionValuesReferenceD(numQuads * numNodesPerElement, - // dataTypes::number(0.0)); dftfe::utils::MemoryStorage shapeFunctionValuesNLPReferenceD(numQuadsNLP * numNodesPerElement, dataTypes::number(0.0)); - // 
dftfe::utils::deviceKernelsGeneric::copyValueType1ArrToValueType2Arr( - // numQuads * numNodesPerElement, - // (operatorMatrix.getShapeFunctionValuesTransposed()).begin(), - // shapeFunctionValuesReferenceD.begin()); - - dftfe::utils::deviceKernelsGeneric::copyValueType1ArrToValueType2Arr( numQuadsNLP * numNodesPerElement, (operatorMatrix.getShapeFunctionValuesNLPTransposed()).begin(), shapeFunctionValuesNLPReferenceD.begin()); - // dftfe::utils::MemoryStorage - // shapeFunctionGradientValuesXTransposedDevice(blockSize * numQuads * - // numNodesPerElement, - // dataTypes::number(0.0)); - - // dftfe::utils::MemoryStorage - // shapeFunctionGradientValuesYTransposedDevice(blockSize * numQuads * - // numNodesPerElement, - // dataTypes::number(0.0)); - - // dftfe::utils::MemoryStorage - // shapeFunctionGradientValuesZTransposedDevice(blockSize * numQuads * - // numNodesPerElement, - // dataTypes::number(0.0)); - dftfe::utils::MemoryStorage shapeFunctionGradientValuesNLPReferenceD(blockSize * numQuadsNLP * 3 * numNodesPerElement, @@ -574,7 +539,8 @@ namespace dftfe if (!isFloatingChargeForces) { basisOperationsPtr->interpolateKernel( - cellWaveFunctionMatrix.data(), + cellWaveFunctionMatrix.data() + + startingId * numNodesPerElement * BVec, psiQuadsFlatD.data(), gradPsiQuadsFlatD.begin(), std::pair( diff --git a/utils/FEBasisOperations.cc b/utils/FEBasisOperations.cc index 4813184f9..281b1c103 100644 --- a/utils/FEBasisOperations.cc +++ b/utils/FEBasisOperations.cc @@ -86,7 +86,8 @@ namespace dftfe memorySpace>::reinit(const unsigned int &vecBlockSize, const unsigned int &cellsBlockSize, - const unsigned int &quadratureID) + const unsigned int &quadratureID, + const bool isResizeTempStorage) { d_quadratureID = quadratureID; d_cellsBlockSize = cellsBlockSize; @@ -95,7 +96,8 @@ namespace dftfe d_nVectors = vecBlockSize; initializeFlattenedIndexMaps(); } - resizeTempStorage(); + if (isResizeTempStorage) + resizeTempStorage(); } template Date: Tue, 3 Oct 2023 22:56:53 +0530 
Subject: [PATCH 15/25] Move template instantiations to cc files --- include/FEBasisOperations.h | 39 -------------------------------- utils/FEBasisOperations.cc | 20 ++++++++++++++++ utils/FEBasisOperationsDevice.cc | 8 +++++++ utils/FEBasisOperationsHost.cc | 10 ++++++++ 4 files changed, 38 insertions(+), 39 deletions(-) diff --git a/include/FEBasisOperations.h b/include/FEBasisOperations.h index f49221f2b..e7c76bc67 100644 --- a/include/FEBasisOperations.h +++ b/include/FEBasisOperations.h @@ -595,45 +595,6 @@ namespace dftfe const std::pair cellRange) const; }; #endif - - template class FEBasisOperationsBase; -#ifdef USE_COMPLEX - template class FEBasisOperationsBase; -#endif -#if defined(DFTFE_WITH_DEVICE) - template class FEBasisOperationsBase; -# ifdef USE_COMPLEX - template class FEBasisOperationsBase; -# endif -#endif - - template class FEBasisOperations; -#ifdef USE_COMPLEX - template class FEBasisOperations; -#endif -#if defined(DFTFE_WITH_DEVICE) - template class FEBasisOperations; -# ifdef USE_COMPLEX - template class FEBasisOperations; -# endif -#endif - } // end of namespace basis } // end of namespace dftfe diff --git a/utils/FEBasisOperations.cc b/utils/FEBasisOperations.cc index 281b1c103..b2bb8a727 100644 --- a/utils/FEBasisOperations.cc +++ b/utils/FEBasisOperations.cc @@ -615,5 +615,25 @@ namespace dftfe { d_constraintInfo.distribute(multiVector, d_nVectors); } + + + template class FEBasisOperationsBase; +#ifdef USE_COMPLEX + template class FEBasisOperationsBase; +#endif +#ifdef DFTFE_WITH_DEVICE + template class FEBasisOperationsBase; +# ifdef USE_COMPLEX + template class FEBasisOperationsBase; +# endif +#endif } // namespace basis } // namespace dftfe diff --git a/utils/FEBasisOperationsDevice.cc b/utils/FEBasisOperationsDevice.cc index 356b03fa8..6ada00c1f 100644 --- a/utils/FEBasisOperationsDevice.cc +++ b/utils/FEBasisOperationsDevice.cc @@ -375,6 +375,14 @@ namespace dftfe { return *d_deviceBlasHandlePtr; } + template class
FEBasisOperations; +#ifdef USE_COMPLEX + template class FEBasisOperations; +#endif } // namespace basis } // namespace dftfe diff --git a/utils/FEBasisOperationsHost.cc b/utils/FEBasisOperationsHost.cc index 8b17a7265..29ea894cf 100644 --- a/utils/FEBasisOperationsHost.cc +++ b/utils/FEBasisOperationsHost.cc @@ -457,5 +457,15 @@ namespace dftfe [iCell * d_nDofsPerCell + iDof], std::plus()); } + + template class FEBasisOperations; +#ifdef USE_COMPLEX + template class FEBasisOperations; +#endif + } // namespace basis } // namespace dftfe From 338d6328cb3141a10df42fe367791bce2e20f9cd Mon Sep 17 00:00:00 2001 From: Nikhil Kodali Date: Wed, 4 Oct 2023 00:14:38 +0530 Subject: [PATCH 16/25] Remove redundant inputs to densitycalculator --- include/FEBasisOperations.h | 2 + include/densityCalculator.h | 9 +- src/dft/density.cc | 210 ++++++++++++++++------------------- src/dft/densityCalculator.cc | 187 ++++++++++++++----------------- utils/FEBasisOperations.cc | 11 ++ 5 files changed, 190 insertions(+), 229 deletions(-) diff --git a/include/FEBasisOperations.h b/include/FEBasisOperations.h index e7c76bc67..28a295d67 100644 --- a/include/FEBasisOperations.h +++ b/include/FEBasisOperations.h @@ -157,6 +157,8 @@ namespace dftfe unsigned int cellsTypeFlag() const; + dealii::CellId + cellID(const unsigned int iElem) const; void createMultiVector( diff --git a/include/densityCalculator.h b/include/densityCalculator.h index 8871665f1..76eb017ea 100644 --- a/include/densityCalculator.h +++ b/include/densityCalculator.h @@ -32,7 +32,6 @@ namespace dftfe const dftfe::utils::MemoryStorage *XFrac, const unsigned int totalNumWaveFunctions, const unsigned int Nfr, - const unsigned int numLocalDofs, const std::vector> &eigenValues, const double fermiEnergy, const double fermiEnergyUp, @@ -41,10 +40,7 @@ namespace dftfe dftfe::basis::FEBasisOperations> & basisOperationsPtr, const unsigned int matrixFreeDofhandlerIndex, - const dealii::DoFHandler<3> & dofHandler, - const unsigned int 
totalLocallyOwnedCells, - const unsigned int numberNodesPerElement, - const unsigned int numQuadPoints, + const unsigned int quadratureIndex, const std::vector & kPointWeights, std::map> *rhoValues, std::map> *gradRhoValues, @@ -55,8 +51,7 @@ namespace dftfe const MPI_Comm & interpoolcomm, const MPI_Comm & interBandGroupComm, const dftParameters & dftParams, - const bool spectrumSplit, - const bool use2pPlusOneGLQuad = false); + const bool spectrumSplit); template void diff --git a/src/dft/density.cc b/src/dft/density.cc index 1034e4c87..20596afb8 100644 --- a/src/dft/density.cc +++ b/src/dft/density.cc @@ -190,68 +190,56 @@ namespace dftfe #ifdef DFTFE_WITH_DEVICE if (d_dftParamsPtr->useDevice) - computeRhoFromPSI( - &d_eigenVectorsFlattenedDevice, - &d_eigenVectorsRotFracFlattenedDevice, - d_numEigenValues, - d_numEigenValuesRR, - matrix_free_data.get_vector_partitioner()->locally_owned_size(), - eigenValues, - fermiEnergy, - fermiEnergyUp, - fermiEnergyDown, - basisOperationsPtrDevice, - 0, - dofHandler, - matrix_free_data.n_physical_cells(), - matrix_free_data.get_dofs_per_cell(d_densityDofHandlerIndex), - matrix_free_data.get_quadrature(d_densityQuadratureId).size(), - d_kPointWeights, - rhoOutValues.get(), - gradRhoOutValues.get(), - rhoOutValuesSpinPolarized.get(), - gradRhoOutValuesSpinPolarized.get(), - d_excManagerPtr->getDensityBasedFamilyType() == - densityFamilyType::GGA, - d_mpiCommParent, - interpoolcomm, - interBandGroupComm, - *d_dftParamsPtr, - isConsiderSpectrumSplitting && - d_numEigenValues != d_numEigenValuesRR, - false); + computeRhoFromPSI(&d_eigenVectorsFlattenedDevice, + &d_eigenVectorsRotFracFlattenedDevice, + d_numEigenValues, + d_numEigenValuesRR, + eigenValues, + fermiEnergy, + fermiEnergyUp, + fermiEnergyDown, + basisOperationsPtrDevice, + d_densityDofHandlerIndex, + d_densityQuadratureId, + d_kPointWeights, + rhoOutValues.get(), + gradRhoOutValues.get(), + rhoOutValuesSpinPolarized.get(), + gradRhoOutValuesSpinPolarized.get(), + 
d_excManagerPtr->getDensityBasedFamilyType() == + densityFamilyType::GGA, + d_mpiCommParent, + interpoolcomm, + interBandGroupComm, + *d_dftParamsPtr, + isConsiderSpectrumSplitting && + d_numEigenValues != d_numEigenValuesRR); #endif if (!d_dftParamsPtr->useDevice) - computeRhoFromPSI( - &d_eigenVectorsFlattenedHost, - &d_eigenVectorsRotFracDensityFlattenedHost, - d_numEigenValues, - d_numEigenValuesRR, - matrix_free_data.get_vector_partitioner()->locally_owned_size(), - eigenValues, - fermiEnergy, - fermiEnergyUp, - fermiEnergyDown, - basisOperationsPtrHost, - 0, - dofHandler, - matrix_free_data.n_physical_cells(), - matrix_free_data.get_dofs_per_cell(d_densityDofHandlerIndex), - matrix_free_data.get_quadrature(d_densityQuadratureId).size(), - d_kPointWeights, - rhoOutValues.get(), - gradRhoOutValues.get(), - rhoOutValuesSpinPolarized.get(), - gradRhoOutValuesSpinPolarized.get(), - d_excManagerPtr->getDensityBasedFamilyType() == - densityFamilyType::GGA, - d_mpiCommParent, - interpoolcomm, - interBandGroupComm, - *d_dftParamsPtr, - isConsiderSpectrumSplitting && - d_numEigenValues != d_numEigenValuesRR, - false); + computeRhoFromPSI(&d_eigenVectorsFlattenedHost, + &d_eigenVectorsRotFracDensityFlattenedHost, + d_numEigenValues, + d_numEigenValuesRR, + eigenValues, + fermiEnergy, + fermiEnergyUp, + fermiEnergyDown, + basisOperationsPtrHost, + d_densityDofHandlerIndex, + d_densityQuadratureId, + d_kPointWeights, + rhoOutValues.get(), + gradRhoOutValues.get(), + rhoOutValuesSpinPolarized.get(), + gradRhoOutValuesSpinPolarized.get(), + d_excManagerPtr->getDensityBasedFamilyType() == + densityFamilyType::GGA, + d_mpiCommParent, + interpoolcomm, + interBandGroupComm, + *d_dftParamsPtr, + isConsiderSpectrumSplitting && + d_numEigenValues != d_numEigenValuesRR); // normalizeRhoOutQuadValues(); if (isGroundState) @@ -610,64 +598,54 @@ namespace dftfe // nodes in each cell #ifdef DFTFE_WITH_DEVICE if (d_dftParamsPtr->useDevice) - computeRhoFromPSI( - 
&d_eigenVectorsFlattenedDevice, - &d_eigenVectorsRotFracFlattenedDevice, - d_numEigenValues, - d_numEigenValuesRR, - matrix_free_data.get_vector_partitioner()->locally_owned_size(), - eigenValues, - fermiEnergy, - fermiEnergyUp, - fermiEnergyDown, - basisOperationsPtrDevice, - 0, - dofHandler, - matrix_free_data.n_physical_cells(), - matrix_free_data.get_dofs_per_cell(d_densityDofHandlerIndex), - quadrature_formula.size(), - d_kPointWeights, - &rhoPRefinedNodalData, - &_gradRhoValues, - &rhoPRefinedSpinPolarizedNodalData, - &_gradRhoValuesSpinPolarized, - false, - d_mpiCommParent, - interpoolcomm, - interBandGroupComm, - *d_dftParamsPtr, - isConsiderSpectrumSplitting && d_numEigenValues != d_numEigenValuesRR, - true); + computeRhoFromPSI(&d_eigenVectorsFlattenedDevice, + &d_eigenVectorsRotFracFlattenedDevice, + d_numEigenValues, + d_numEigenValuesRR, + eigenValues, + fermiEnergy, + fermiEnergyUp, + fermiEnergyDown, + basisOperationsPtrDevice, + d_densityDofHandlerIndex, + d_gllQuadratureId, + d_kPointWeights, + &rhoPRefinedNodalData, + &_gradRhoValues, + &rhoPRefinedSpinPolarizedNodalData, + &_gradRhoValuesSpinPolarized, + false, + d_mpiCommParent, + interpoolcomm, + interBandGroupComm, + *d_dftParamsPtr, + isConsiderSpectrumSplitting && + d_numEigenValues != d_numEigenValuesRR); #endif if (!d_dftParamsPtr->useDevice) - computeRhoFromPSI( - &d_eigenVectorsFlattenedHost, - &d_eigenVectorsRotFracDensityFlattenedHost, - d_numEigenValues, - d_numEigenValuesRR, - matrix_free_data.get_vector_partitioner()->locally_owned_size(), - eigenValues, - fermiEnergy, - fermiEnergyUp, - fermiEnergyDown, - basisOperationsPtrHost, - 0, - dofHandler, - matrix_free_data.n_physical_cells(), - matrix_free_data.get_dofs_per_cell(d_densityDofHandlerIndex), - quadrature_formula.size(), - d_kPointWeights, - &rhoPRefinedNodalData, - &_gradRhoValues, - &rhoPRefinedSpinPolarizedNodalData, - &_gradRhoValuesSpinPolarized, - false, - d_mpiCommParent, - interpoolcomm, - interBandGroupComm, - 
*d_dftParamsPtr, - isConsiderSpectrumSplitting && d_numEigenValues != d_numEigenValuesRR, - true); + computeRhoFromPSI(&d_eigenVectorsFlattenedHost, + &d_eigenVectorsRotFracDensityFlattenedHost, + d_numEigenValues, + d_numEigenValuesRR, + eigenValues, + fermiEnergy, + fermiEnergyUp, + fermiEnergyDown, + basisOperationsPtrHost, + d_densityDofHandlerIndex, + d_gllQuadratureId, + d_kPointWeights, + &rhoPRefinedNodalData, + &_gradRhoValues, + &rhoPRefinedSpinPolarizedNodalData, + &_gradRhoValuesSpinPolarized, + false, + d_mpiCommParent, + interpoolcomm, + interBandGroupComm, + *d_dftParamsPtr, + isConsiderSpectrumSplitting && + d_numEigenValues != d_numEigenValuesRR); // copy Lobatto quadrature data to fill in 2p DoFHandler nodal data dealii::DoFHandler<3>::active_cell_iterator cellP = d_dofHandlerRhoNodal diff --git a/src/dft/densityCalculator.cc b/src/dft/densityCalculator.cc index c831d529c..d5a1eb700 100644 --- a/src/dft/densityCalculator.cc +++ b/src/dft/densityCalculator.cc @@ -41,7 +41,6 @@ namespace dftfe const dftfe::utils::MemoryStorage *XFrac, const unsigned int totalNumWaveFunctions, const unsigned int Nfr, - const unsigned int numLocalDofs, const std::vector> &eigenValues, const double fermiEnergy, const double fermiEnergyUp, @@ -50,10 +49,7 @@ namespace dftfe dftfe::basis::FEBasisOperations> & basisOperationsPtr, const unsigned int matrixFreeDofhandlerIndex, - const dealii::DoFHandler<3> & dofHandler, - const unsigned int totalLocallyOwnedCells, - const unsigned int numNodesPerElement, - const unsigned int numQuadPoints, + const unsigned int quadratureIndex, const std::vector & kPointWeights, std::map> *rhoValues, std::map> *gradRhoValues, @@ -64,8 +60,7 @@ namespace dftfe const MPI_Comm & interpoolcomm, const MPI_Comm & interBandGroupComm, const dftParameters & dftParams, - const bool spectrumSplit, - const bool use2pPlusOneGLQuad) + const bool spectrumSplit) { int this_process; MPI_Comm_rank(mpiCommParent, &this_process); @@ -75,7 +70,9 @@ namespace 
dftfe MPI_Barrier(mpiCommParent); double computeRho_time = MPI_Wtime(); const unsigned int numKPoints = kPointWeights.size(); - + const unsigned int numLocalDofs = basisOperationsPtr->nOwnedDofs(); + const unsigned int totalLocallyOwnedCells = basisOperationsPtr->nCells(); + const unsigned int numNodesPerElement = basisOperationsPtr->nDofsPerCell(); // band group parallelization data structures const unsigned int numberBandGroups = dealii::Utilities::MPI::n_mpi_processes(interBandGroupComm); @@ -105,6 +102,8 @@ namespace dftfe const unsigned int numCellBlocks = totalLocallyOwnedCells / cellsBlockSize; const unsigned int remCellBlockSize = totalLocallyOwnedCells - numCellBlocks * cellsBlockSize; + basisOperationsPtr->reinit(BVec, cellsBlockSize, quadratureIndex); + const unsigned int numQuadPoints = basisOperationsPtr->nQuadsPerCell(); std::vector> wfcQuadPointData(numSpinComponents); @@ -289,13 +288,9 @@ namespace dftfe #endif - const unsigned int d_quadratureIndex = - use2pPlusOneGLQuad ? 2 : 0; - dftfe::basis::UpdateFlags updateFlags = - dftfe::basis::update_values | dftfe::basis::update_gradients; basisOperationsPtr->reinit(currentBlockSize, cellsBlockSize, - d_quadratureIndex); + quadratureIndex); for (unsigned int spinIndex = 0; spinIndex < numSpinComponents; @@ -464,14 +459,9 @@ namespace dftfe (numSpinComponents * kPoint + spinIndex), flattenedArrayBlock[spinIndex].begin()); #endif - const unsigned int d_quadratureIndex = - use2pPlusOneGLQuad ? 
2 : 0; - dftfe::basis::UpdateFlags updateFlags = - dftfe::basis::update_values | - dftfe::basis::update_gradients; basisOperationsPtr->reinit(currentBlockSize, cellsBlockSize, - d_quadratureIndex); + quadratureIndex); for (unsigned int spinIndex = 0; @@ -581,83 +571,78 @@ namespace dftfe interBandGroupComm); } - unsigned int iElem = 0; - auto cell = dofHandler.begin_active(); - auto endc = dofHandler.end(); - for (; cell != endc; ++cell) - if (cell->is_locally_owned()) - { - const dealii::CellId cellid = cell->id(); - - std::vector dummy(1); - std::vector &tempRhoQuads = (*rhoValues)[cellid]; - std::vector &tempGradRhoQuads = - isEvaluateGradRho ? (*gradRhoValues)[cellid] : dummy; - - std::vector &tempRhoQuadsSP = - (dftParams.spinPolarized == 1) ? (*rhoValuesSpinPolarized)[cellid] : - dummy; - std::vector &tempGradRhoQuadsSP = - ((dftParams.spinPolarized == 1) && isEvaluateGradRho) ? - (*gradRhoValuesSpinPolarized)[cellid] : - dummy; - - if (dftParams.spinPolarized == 1) - { + for (unsigned int iElem = 0; iElem < totalLocallyOwnedCells; ++iElem) + { + const dealii::CellId cellid = basisOperationsPtr->cellID(iElem); + + std::vector dummy(1); + std::vector &tempRhoQuads = (*rhoValues)[cellid]; + std::vector &tempGradRhoQuads = + isEvaluateGradRho ? (*gradRhoValues)[cellid] : dummy; + + std::vector &tempRhoQuadsSP = + (dftParams.spinPolarized == 1) ? (*rhoValuesSpinPolarized)[cellid] : + dummy; + std::vector &tempGradRhoQuadsSP = + ((dftParams.spinPolarized == 1) && isEvaluateGradRho) ? 
+ (*gradRhoValuesSpinPolarized)[cellid] : + dummy; + + if (dftParams.spinPolarized == 1) + { + for (unsigned int q = 0; q < numQuadPoints; ++q) + { + const double rho0 = rhoHost[iElem * numQuadPoints + q]; + const double rho1 = + rhoHost[totalLocallyOwnedCells * numQuadPoints + + iElem * numQuadPoints + q]; + tempRhoQuadsSP[2 * q + 0] = rho0; + + tempRhoQuadsSP[2 * q + 1] = rho1; + tempRhoQuads[q] = rho0 + rho1; + } + + if (isEvaluateGradRho) for (unsigned int q = 0; q < numQuadPoints; ++q) { - const double rho0 = rhoHost[iElem * numQuadPoints + q]; - const double rho1 = - rhoHost[totalLocallyOwnedCells * numQuadPoints + - iElem * numQuadPoints + q]; - tempRhoQuadsSP[2 * q + 0] = rho0; - - tempRhoQuadsSP[2 * q + 1] = rho1; - tempRhoQuads[q] = rho0 + rho1; + const double gradRho0x = + gradRhoHost[iElem * numQuadPoints * 3 + 3 * q]; + const double gradRho0y = + gradRhoHost[iElem * numQuadPoints * 3 + 3 * q + 1]; + const double gradRho0z = + gradRhoHost[iElem * numQuadPoints * 3 + 3 * q + 2]; + const double gradRho1x = + gradRhoHost[totalLocallyOwnedCells * numQuadPoints * 3 + + iElem * numQuadPoints * 3 + 3 * q]; + const double gradRho1y = + gradRhoHost[totalLocallyOwnedCells * numQuadPoints * 3 + + iElem * numQuadPoints * 3 + 3 * q + 1]; + const double gradRho1z = + gradRhoHost[totalLocallyOwnedCells * numQuadPoints * 3 + + iElem * numQuadPoints * 3 + 3 * q + 2]; + tempGradRhoQuadsSP[6 * q + 0] = gradRho0x; + tempGradRhoQuadsSP[6 * q + 1] = gradRho0y; + tempGradRhoQuadsSP[6 * q + 2] = gradRho0z; + tempGradRhoQuadsSP[6 * q + 3] = gradRho1x; + tempGradRhoQuadsSP[6 * q + 4] = gradRho1y; + tempGradRhoQuadsSP[6 * q + 5] = gradRho1z; + tempGradRhoQuads[3 * q] = gradRho0x + gradRho1x; + tempGradRhoQuads[3 * q + 1] = gradRho0y + gradRho1y; + tempGradRhoQuads[3 * q + 2] = gradRho0z + gradRho1z; } + } + else + { + std::memcpy(tempRhoQuads.data(), + rhoHost.data() + iElem * numQuadPoints, + numQuadPoints * sizeof(double)); - if (isEvaluateGradRho) - for (unsigned int q = 0; q 
< numQuadPoints; ++q) - { - const double gradRho0x = - gradRhoHost[iElem * numQuadPoints * 3 + 3 * q]; - const double gradRho0y = - gradRhoHost[iElem * numQuadPoints * 3 + 3 * q + 1]; - const double gradRho0z = - gradRhoHost[iElem * numQuadPoints * 3 + 3 * q + 2]; - const double gradRho1x = - gradRhoHost[totalLocallyOwnedCells * numQuadPoints * 3 + - iElem * numQuadPoints * 3 + 3 * q]; - const double gradRho1y = - gradRhoHost[totalLocallyOwnedCells * numQuadPoints * 3 + - iElem * numQuadPoints * 3 + 3 * q + 1]; - const double gradRho1z = - gradRhoHost[totalLocallyOwnedCells * numQuadPoints * 3 + - iElem * numQuadPoints * 3 + 3 * q + 2]; - tempGradRhoQuadsSP[6 * q + 0] = gradRho0x; - tempGradRhoQuadsSP[6 * q + 1] = gradRho0y; - tempGradRhoQuadsSP[6 * q + 2] = gradRho0z; - tempGradRhoQuadsSP[6 * q + 3] = gradRho1x; - tempGradRhoQuadsSP[6 * q + 4] = gradRho1y; - tempGradRhoQuadsSP[6 * q + 5] = gradRho1z; - tempGradRhoQuads[3 * q] = gradRho0x + gradRho1x; - tempGradRhoQuads[3 * q + 1] = gradRho0y + gradRho1y; - tempGradRhoQuads[3 * q + 2] = gradRho0z + gradRho1z; - } - } - else - { - std::memcpy(tempRhoQuads.data(), - rhoHost.data() + iElem * numQuadPoints, - numQuadPoints * sizeof(double)); - - if (isEvaluateGradRho) - std::memcpy(tempGradRhoQuads.data(), - gradRhoHost.data() + iElem * numQuadPoints * 3, - 3 * numQuadPoints * sizeof(double)); - } - iElem++; - } + if (isEvaluateGradRho) + std::memcpy(tempGradRhoQuads.data(), + gradRhoHost.data() + iElem * numQuadPoints * 3, + 3 * numQuadPoints * sizeof(double)); + } + } #if defined(DFTFE_WITH_DEVICE) dftfe::utils::deviceSynchronize(); #endif @@ -746,7 +731,6 @@ namespace dftfe dftfe::utils::MemorySpace::DEVICE> *XFrac, const unsigned int totalNumWaveFunctions, const unsigned int Nfr, - const unsigned int numLocalDofs, const std::vector> &eigenValues, const double fermiEnergy, const double fermiEnergyUp, @@ -757,10 +741,7 @@ namespace dftfe dftfe::utils::MemorySpace::DEVICE>> & basisOperationsPtrDevice, const unsigned 
int matrixFreeDofhandlerIndex, - const dealii::DoFHandler<3> & dofHandler, - const unsigned int totalLocallyOwnedCells, - const unsigned int numNodesPerElement, - const unsigned int numQuadPoints, + const unsigned int quadratureIndex, const std::vector & kPointWeights, std::map> *rhoValues, std::map> *gradRhoValues, @@ -771,8 +752,7 @@ namespace dftfe const MPI_Comm & interpoolcomm, const MPI_Comm & interBandGroupComm, const dftParameters & dftParams, - const bool spectrumSplit, - const bool use2pPlusOneGLQuad); + const bool spectrumSplit); #endif template void @@ -783,7 +763,6 @@ namespace dftfe dftfe::utils::MemorySpace::HOST> *XFrac, const unsigned int totalNumWaveFunctions, const unsigned int Nfr, - const unsigned int numLocalDofs, const std::vector> &eigenValues, const double fermiEnergy, const double fermiEnergyUp, @@ -794,10 +773,7 @@ namespace dftfe dftfe::utils::MemorySpace::HOST>> & basisOperationsPtr, const unsigned int matrixFreeDofhandlerIndex, - const dealii::DoFHandler<3> & dofHandler, - const unsigned int totalLocallyOwnedCells, - const unsigned int numNodesPerElement, - const unsigned int numQuadPoints, + const unsigned int quadratureIndex, const std::vector & kPointWeights, std::map> *rhoValues, std::map> *gradRhoValues, @@ -808,6 +784,5 @@ namespace dftfe const MPI_Comm & interpoolcomm, const MPI_Comm & interBandGroupComm, const dftParameters & dftParams, - const bool spectrumSplit, - const bool use2pPlusOneGLQuad); + const bool spectrumSplit); } // namespace dftfe diff --git a/utils/FEBasisOperations.cc b/utils/FEBasisOperations.cc index b2bb8a727..efab6ca19 100644 --- a/utils/FEBasisOperations.cc +++ b/utils/FEBasisOperations.cc @@ -217,6 +217,17 @@ namespace dftfe (unsigned int)areAllCellsCartesian; } + template + dealii::CellId + FEBasisOperationsBase::cellID(const unsigned int iElem) const + { + return d_cellIndexToCellIdMap[iElem]; + } + template Date: Wed, 4 Oct 2023 09:56:55 +0530 Subject: [PATCH 17/25] Remove repeated reallocations of 
internal storage --- src/dft/densityCalculator.cc | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/dft/densityCalculator.cc b/src/dft/densityCalculator.cc index d5a1eb700..27714aed7 100644 --- a/src/dft/densityCalculator.cc +++ b/src/dft/densityCalculator.cc @@ -290,7 +290,8 @@ namespace dftfe basisOperationsPtr->reinit(currentBlockSize, cellsBlockSize, - quadratureIndex); + quadratureIndex, + false); for (unsigned int spinIndex = 0; spinIndex < numSpinComponents; @@ -461,7 +462,8 @@ namespace dftfe #endif basisOperationsPtr->reinit(currentBlockSize, cellsBlockSize, - quadratureIndex); + quadratureIndex, + false); for (unsigned int spinIndex = 0; From f33f37c0e347e24d914e61ef4ba638605de73bc8 Mon Sep 17 00:00:00 2001 From: Nikhil Kodali Date: Mon, 9 Oct 2023 11:10:41 +0530 Subject: [PATCH 18/25] Scratch mutlivectors and templatization changes --- CMakeLists.txt | 6 +- include/FEBasisOperations.h | 107 +++- include/FEBasisOperationsKernelsDevice.h | 43 ++ include/densityCalculator.h | 6 +- include/dft.h | 4 +- include/forceWfcContractionsDevice.h | 2 +- include/kohnShamDFTOperatorDevice.h | 25 +- include/operatorDevice.h | 4 - src/dft/densityCalculator.cc | 62 +-- src/dft/densityCalculatorDeviceKernels.cc | 4 +- src/dft/initBoundaryConditions.cc | 31 +- ...miltonianTimesXMemoryOptBatchGEMMDevice.cc | 6 +- ...iltonianMatrixCalculatorFlattenedDevice.cc | 72 +-- src/dftOperator/kohnShamDFTOperatorDevice.cc | 56 +- ...atrixVectorProductImplementationsDevice.cc | 6 +- src/force/forceWfcContractionsDevice.cc | 6 +- ...isOperations.cc => FEBasisOperations.t.cc} | 481 ++++++++++++++++-- ...Device.cc => FEBasisOperationsDevice.t.cc} | 146 +++--- ...ionsHost.cc => FEBasisOperationsHost.t.cc} | 10 - utils/FEBasisOperationsKernelsDevice.cc | 110 ++++ 20 files changed, 901 insertions(+), 286 deletions(-) create mode 100644 include/FEBasisOperationsKernelsDevice.h rename utils/{FEBasisOperations.cc => FEBasisOperations.t.cc} (56%) rename 
utils/{FEBasisOperationsDevice.cc => FEBasisOperationsDevice.t.cc} (74%) rename utils/{FEBasisOperationsHost.cc => FEBasisOperationsHost.t.cc} (98%) create mode 100644 utils/FEBasisOperationsKernelsDevice.cc diff --git a/CMakeLists.txt b/CMakeLists.txt index 72db98957..f12341fc7 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -108,8 +108,6 @@ SET(TARGET_SRC ./pseudoConverters/upfToxml.cc ./utils/PeriodicTable.cc ./utils/xmlTodftfeParser.cc - ./utils/FEBasisOperations.cc - ./utils/FEBasisOperationsHost.cc ./src/dft/dftd.cc ./src/mdi/MDIEngine.cpp ./src/mdi/libraryMDI.cpp @@ -187,7 +185,7 @@ SET(DEVICE_SRC ./src/solvers/linearSolverProblemDevice.cc ./src/poisson/poissonSolverProblemDevice.cc ./src/helmholtz/kerkerSolverProblemDevice.cc - ./utils/FEBasisOperationsDevice.cc + ./utils/FEBasisOperationsKernelsDevice.cc ) ELSEIF ("${GPU_LANG}" STREQUAL "hip") @@ -216,7 +214,7 @@ SET(DEVICE_SRC ./src/solvers/linearSolverProblemDevice.cc ./src/poisson/poissonSolverProblemDevice.cc ./src/helmholtz/kerkerSolverProblemDevice.cc - ./utils/FEBasisOperationsDevice.cc + ./utils/FEBasisOperationsKernelsDevice.cc ) ENDIF() diff --git a/include/FEBasisOperations.h b/include/FEBasisOperations.h index 28a295d67..629c42252 100644 --- a/include/FEBasisOperations.h +++ b/include/FEBasisOperations.h @@ -115,15 +115,22 @@ namespace dftfe void initializeIndexMaps(); + void initializeFlattenedIndexMaps(); void initializeConstraints(); + void + initializeMPIPattern(); + void initializeShapeFunctionAndJacobianData(); + void + initializeShapeFunctionAndJacobianBasisData(); + void resizeTempStorage(); @@ -142,18 +149,62 @@ namespace dftfe unsigned int nOwnedDofs() const; - const ValueTypeBasisCoeff * + const dftfe::utils::MemoryStorage & shapeFunctionData(bool transpose = false) const; - const ValueTypeBasisCoeff * + const dftfe::utils::MemoryStorage & shapeFunctionGradientData(bool transpose = false) const; - const ValueTypeBasisCoeff * + const dftfe::utils::MemoryStorage & 
inverseJacobians() const; - const ValueTypeBasisCoeff * + const dftfe::utils::MemoryStorage & JxW() const; + template ::value, int> = 0> + const dftfe::utils::MemoryStorage & + shapeFunctionBasisData(bool transpose = false) const; + template ::value, int> = 0> + const dftfe::utils::MemoryStorage & + shapeFunctionBasisData(bool transpose = false) const; + + template ::value, int> = 0> + const dftfe::utils::MemoryStorage & + shapeFunctionGradientBasisData(bool transpose = false) const; + template ::value, int> = 0> + const dftfe::utils::MemoryStorage & + shapeFunctionGradientBasisData(bool transpose = false) const; + + template ::value, int> = 0> + const dftfe::utils::MemoryStorage & + inverseJacobiansBasisData() const; + template ::value, int> = 0> + const dftfe::utils::MemoryStorage & + inverseJacobiansBasisData() const; + + template ::value, int> = 0> + const dftfe::utils::MemoryStorage & + JxWBasisData() const; + template ::value, int> = 0> + const dftfe::utils::MemoryStorage & + JxWBasisData() const; + unsigned int cellsTypeFlag() const; @@ -162,7 +213,23 @@ namespace dftfe void createMultiVector( - const unsigned int dofHandlerIndex, + const unsigned int blocksize, + dftfe::linearAlgebra::MultiVector + &multiVector) const; + + void + createScratchMultiVectors(const unsigned int vecBlockSize, + const unsigned int numMultiVecs = 1) const; + + void + clearScratchMultiVectors() const; + + dftfe::linearAlgebra::MultiVector & + getMultiVector(const unsigned int vecBlockSize, + const unsigned int index = 0) const; + + void + getMultiVector( const unsigned int blocksize, dftfe::linearAlgebra::MultiVector &multiVector) const; @@ -199,6 +266,26 @@ namespace dftfe std::vector> d_shapeFunctionGradientDataTranspose; + std::vector> + d_inverseJacobianBasisData; + std::vector> + d_JxWBasisData; + std::vector> + d_shapeFunctionBasisData; + std::vector> + d_shapeFunctionGradientBasisData; + std::vector> + d_shapeFunctionBasisDataTranspose; + std::vector> + 
d_shapeFunctionGradientBasisDataTranspose; + + + mutable std::map< + unsigned int, + std::vector< + dftfe::linearAlgebra::MultiVector>> + scratchMultiVectors; + std::vector d_quadratureIDsVector; unsigned int d_quadratureID; std::vector d_nQuadsPerCell; @@ -212,6 +299,9 @@ namespace dftfe bool areAllCellsAffine; bool areAllCellsCartesian; UpdateFlags d_updateFlags; + + std::shared_ptr> + mpiPatternP2P; }; template + +namespace dftfe +{ + namespace basis + { + namespace FEBasisOperationsKernelsDevice + { + template + void + reshapeNonAffineCase(const dftfe::size_type numVecs, + const dftfe::size_type numQuads, + const dftfe::size_type numCells, + const ValueType1 * copyFromVec, + ValueType2 * copyToVec); + + + }; // namespace FEBasisOperationsKernelsDevice + } // namespace basis +} // namespace dftfe + +#endif // DFTFE_WITH_DEVICE +#endif // dftfeFEBasisOperationsKernelsDevice_h diff --git a/include/densityCalculator.h b/include/densityCalculator.h index 76eb017ea..b6154609e 100644 --- a/include/densityCalculator.h +++ b/include/densityCalculator.h @@ -36,7 +36,7 @@ namespace dftfe const double fermiEnergy, const double fermiEnergyUp, const double fermiEnergyDown, - std::unique_ptr< + std::shared_ptr< dftfe::basis::FEBasisOperations> & basisOperationsPtr, const unsigned int matrixFreeDofhandlerIndex, @@ -56,7 +56,7 @@ namespace dftfe template void computeRhoGradRhoFromInterpolatedValues( - std::unique_ptr< + std::shared_ptr< dftfe::basis:: FEBasisOperations> & basisOperationsPtr, @@ -75,7 +75,7 @@ namespace dftfe template void computeRhoGradRhoFromInterpolatedValues( - std::unique_ptr< + std::shared_ptr< dftfe::basis::FEBasisOperations> diff --git a/include/dft.h b/include/dft.h index f7ff85348..49bf1bc49 100644 --- a/include/dft.h +++ b/include/dft.h @@ -1285,13 +1285,13 @@ namespace dftfe unsigned int d_densityQuadratureId; unsigned int d_densityQuadratureIdElectro; dealii::MatrixFree<3, double> matrix_free_data, d_matrixFreeDataPRefined; - std::unique_ptr< + 
std::shared_ptr< dftfe::basis::FEBasisOperations> basisOperationsPtrHost; #if defined(DFTFE_WITH_DEVICE) - std::unique_ptr< + std::shared_ptr< dftfe::basis::FEBasisOperations> diff --git a/include/forceWfcContractionsDevice.h b/include/forceWfcContractionsDevice.h index d9d8e520f..4f0771dfe 100644 --- a/include/forceWfcContractionsDevice.h +++ b/include/forceWfcContractionsDevice.h @@ -30,7 +30,7 @@ namespace dftfe { void wfcContractionsForceKernelsAllH( - std::unique_ptr< + std::shared_ptr< dftfe::basis::FEBasisOperations> diff --git a/include/kohnShamDFTOperatorDevice.h b/include/kohnShamDFTOperatorDevice.h index 9fb45ccad..8e4d15b53 100644 --- a/include/kohnShamDFTOperatorDevice.h +++ b/include/kohnShamDFTOperatorDevice.h @@ -648,13 +648,15 @@ namespace dftfe /// pointer to dft class dftClass *dftPtr; - std::unique_ptr< - dftfe::basis:: - FEBasisOperations> + std::shared_ptr< + dftfe::basis::FEBasisOperations> basisOperationsPtrDevice; - std::unique_ptr< - dftfe::basis:: - FEBasisOperations> + std::shared_ptr< + dftfe::basis::FEBasisOperations> basisOperationsPtrHost; @@ -695,17 +697,6 @@ namespace dftfe dftfe::utils::MemoryStorage d_shapeFunctionValueTransposedLpspDevice; - /// storage for shapefunction gradients - // std::vector d_shapeFunctionGradientValueX; - // std::vector d_shapeFunctionGradientValueXTransposed; - - // std::vector d_shapeFunctionGradientValueY; - // std::vector d_shapeFunctionGradientValueYTransposed; - - // std::vector d_shapeFunctionGradientValueZ; - // std::vector d_shapeFunctionGradientValueZTransposed; - - std::vector d_cellJxWValues; dftfe::utils::MemoryStorage d_cellJxWValuesDevice; diff --git a/include/operatorDevice.h b/include/operatorDevice.h index 85c316921..be38b5ff5 100644 --- a/include/operatorDevice.h +++ b/include/operatorDevice.h @@ -443,10 +443,6 @@ namespace dftfe dftfe::utils::MemorySpace::DEVICE> d_cellWaveFunctionMatrix; - distributedDeviceVec d_parallelChebyBlockVectorDevice; - - distributedDeviceVec 
d_parallelChebyBlockVector2Device; - distributedDeviceVec d_parallelProjectorKetTimesBlockVectorDevice; diff --git a/src/dft/densityCalculator.cc b/src/dft/densityCalculator.cc index 27714aed7..9e4f1dbeb 100644 --- a/src/dft/densityCalculator.cc +++ b/src/dft/densityCalculator.cc @@ -45,7 +45,7 @@ namespace dftfe const double fermiEnergy, const double fermiEnergyUp, const double fermiEnergyDown, - std::unique_ptr< + std::shared_ptr< dftfe::basis::FEBasisOperations> & basisOperationsPtr, const unsigned int matrixFreeDofhandlerIndex, @@ -172,12 +172,8 @@ namespace dftfe auto &partialOccupVec = partialOccupVecHost; #endif - std::vector> + std::vector *> flattenedArrayBlock(numSpinComponents); - for (unsigned int spinIndex = 0; spinIndex < numSpinComponents; ++spinIndex) - basisOperationsPtr->createMultiVector(matrixFreeDofhandlerIndex, - BVec, - flattenedArrayBlock[spinIndex]); dftfe::utils::MemoryStorage cellWaveFunctionMatrix( cellsBlockSize * numNodesPerElement * BVec); @@ -198,12 +194,10 @@ namespace dftfe std::min(BVec, totalNumWaveFunctions - jvec); for (unsigned int spinIndex = 0; spinIndex < numSpinComponents; ++spinIndex) - if (currentBlockSize != - flattenedArrayBlock[spinIndex].numVectors()) - basisOperationsPtr->createMultiVector( - matrixFreeDofhandlerIndex, - currentBlockSize, - flattenedArrayBlock[spinIndex]); + flattenedArrayBlock[spinIndex] = + &(basisOperationsPtr->getMultiVector(currentBlockSize, + spinIndex)); + if ((jvec + currentBlockSize) <= bandGroupLowHighPlusOneIndices[2 * bandGroupTaskId + 1] && (jvec + currentBlockSize) > @@ -267,7 +261,7 @@ namespace dftfe ++spinIndex) if (memorySpace == dftfe::utils::MemorySpace::HOST) for (unsigned int iNode = 0; iNode < numLocalDofs; ++iNode) - std::memcpy(flattenedArrayBlock[spinIndex].data() + + std::memcpy(flattenedArrayBlock[spinIndex]->data() + iNode * currentBlockSize, X->data() + numLocalDofs * totalNumWaveFunctions * @@ -284,7 +278,7 @@ namespace dftfe jvec, X->data() + numLocalDofs * 
totalNumWaveFunctions * (numSpinComponents * kPoint + spinIndex), - flattenedArrayBlock[spinIndex].begin()); + flattenedArrayBlock[spinIndex]->data()); #endif @@ -297,9 +291,9 @@ namespace dftfe for (unsigned int spinIndex = 0; spinIndex < numSpinComponents; ++spinIndex) { - flattenedArrayBlock[spinIndex].updateGhostValues(); + flattenedArrayBlock[spinIndex]->updateGhostValues(); basisOperationsPtr->distribute( - flattenedArrayBlock[spinIndex]); + *(flattenedArrayBlock[spinIndex])); } for (int iblock = 0; iblock < (numCellBlocks + 1); iblock++) @@ -316,7 +310,7 @@ namespace dftfe spinIndex < numSpinComponents; ++spinIndex) basisOperationsPtr->interpolateKernel( - flattenedArrayBlock[spinIndex], + *(flattenedArrayBlock[spinIndex]), wfcQuadPointData[spinIndex].data(), isEvaluateGradRho ? gradWfcQuadPointData[spinIndex].data() : @@ -357,12 +351,9 @@ namespace dftfe const unsigned int currentBlockSize = std::min(BVec, Nfr - jvec); for (unsigned int spinIndex = 0; spinIndex < numSpinComponents; ++spinIndex) - if (currentBlockSize != - flattenedArrayBlock[spinIndex].numVectors()) - basisOperationsPtr->createMultiVector( - matrixFreeDofhandlerIndex, - currentBlockSize, - flattenedArrayBlock[spinIndex]); + flattenedArrayBlock[spinIndex] = + &(basisOperationsPtr->getMultiVector(currentBlockSize, + spinIndex)); if ((jvec + totalNumWaveFunctions - Nfr + currentBlockSize) <= bandGroupLowHighPlusOneIndices[2 * bandGroupTaskId + 1] && (jvec + totalNumWaveFunctions - Nfr + currentBlockSize) > @@ -430,7 +421,7 @@ namespace dftfe if (memorySpace == dftfe::utils::MemorySpace::HOST) for (unsigned int iNode = 0; iNode < numLocalDofs; ++iNode) - std::memcpy(flattenedArrayBlock[spinIndex].data() + + std::memcpy(flattenedArrayBlock[spinIndex]->data() + iNode * currentBlockSize, XFrac->data() + numLocalDofs * Nfr * @@ -438,15 +429,6 @@ namespace dftfe spinIndex) + iNode * Nfr + jvec, currentBlockSize * sizeof(NumberType)); - // for (unsigned int iWave = 0; iWave < - // currentBlockSize; - 
// ++iWave) - // flattenedArrayBlock[spinIndex] - // .data()[iNode * currentBlockSize + iWave] = - // (XFrac->data())[numLocalDofs * Nfr * - // (numSpinComponents * kPoint + - // spinIndex) + - // iNode * Nfr + jvec + iWave]; #if defined(DFTFE_WITH_DEVICE) else if (memorySpace == dftfe::utils::MemorySpace::DEVICE) dftfe::utils::deviceKernelsGeneric:: @@ -458,7 +440,7 @@ namespace dftfe XFrac->data() + numLocalDofs * Nfr * (numSpinComponents * kPoint + spinIndex), - flattenedArrayBlock[spinIndex].begin()); + flattenedArrayBlock[spinIndex]->data()); #endif basisOperationsPtr->reinit(currentBlockSize, cellsBlockSize, @@ -470,9 +452,9 @@ namespace dftfe spinIndex < numSpinComponents; ++spinIndex) { - flattenedArrayBlock[spinIndex].updateGhostValues(); + flattenedArrayBlock[spinIndex]->updateGhostValues(); basisOperationsPtr->distribute( - flattenedArrayBlock[spinIndex]); + *(flattenedArrayBlock[spinIndex])); } for (int iblock = 0; iblock < (numCellBlocks + 1); iblock++) @@ -488,7 +470,7 @@ namespace dftfe spinIndex < numSpinComponents; ++spinIndex) basisOperationsPtr->interpolateKernel( - flattenedArrayBlock[spinIndex], + *(flattenedArrayBlock[spinIndex]), wfcQuadPointData[spinIndex].data(), isEvaluateGradRho ? 
gradWfcQuadPointData[spinIndex].data() : @@ -662,7 +644,7 @@ namespace dftfe template void computeRhoGradRhoFromInterpolatedValues( - std::unique_ptr< + std::shared_ptr< dftfe::basis:: FEBasisOperations> & basisOperationsPtr, @@ -737,7 +719,7 @@ namespace dftfe const double fermiEnergy, const double fermiEnergyUp, const double fermiEnergyDown, - std::unique_ptr< + std::shared_ptr< dftfe::basis::FEBasisOperations> @@ -769,7 +751,7 @@ namespace dftfe const double fermiEnergy, const double fermiEnergyUp, const double fermiEnergyDown, - std::unique_ptr< + std::shared_ptr< dftfe::basis::FEBasisOperations> diff --git a/src/dft/densityCalculatorDeviceKernels.cc b/src/dft/densityCalculatorDeviceKernels.cc index 52a15c0ef..5c2c4db62 100644 --- a/src/dft/densityCalculatorDeviceKernels.cc +++ b/src/dft/densityCalculatorDeviceKernels.cc @@ -140,7 +140,7 @@ namespace dftfe template void computeRhoGradRhoFromInterpolatedValues( - std::unique_ptr< + std::shared_ptr< dftfe::basis::FEBasisOperations> @@ -228,7 +228,7 @@ namespace dftfe } template void computeRhoGradRhoFromInterpolatedValues( - std::unique_ptr< + std::shared_ptr< dftfe::basis::FEBasisOperations> diff --git a/src/dft/initBoundaryConditions.cc b/src/dft/initBoundaryConditions.cc index 16e64ed84..d7e207974 100644 --- a/src/dft/initBoundaryConditions.cc +++ b/src/dft/initBoundaryConditions.cc @@ -262,7 +262,7 @@ namespace dftfe d_constraintsVector, quadratureVector, additional_data); - basisOperationsPtrHost = std::make_unique< + basisOperationsPtrHost = std::make_shared< dftfe::basis::FEBasisOperations>( @@ -276,10 +276,29 @@ namespace dftfe basisOperationsPtrHost->init(d_densityDofHandlerIndex, quadratureIndices, updateFlags); + if (!d_dftParamsPtr->useDevice) + { + std::vector bandGroupLowHighPlusOneIndices; + dftUtils::createBandParallelizationIndices( + interBandGroupComm, d_numEigenValues, bandGroupLowHighPlusOneIndices); + + unsigned int BVec = std::min(d_dftParamsPtr->chebyWfcBlockSize, + 
bandGroupLowHighPlusOneIndices[1]); + + basisOperationsPtrHost->createScratchMultiVectors( + BVec, (d_dftParamsPtr->spinPolarized + 1)); + if (d_numEigenValues % BVec != 0) + basisOperationsPtrHost->createScratchMultiVectors( + d_numEigenValues % BVec, (d_dftParamsPtr->spinPolarized + 1)); + if (d_numEigenValues != d_numEigenValuesRR && + d_numEigenValuesRR % BVec != 0) + basisOperationsPtrHost->createScratchMultiVectors( + d_numEigenValuesRR % BVec, (d_dftParamsPtr->spinPolarized + 1)); + } #if defined(DFTFE_WITH_DEVICE) if (d_dftParamsPtr->useDevice) { - basisOperationsPtrDevice = std::make_unique< + basisOperationsPtrDevice = std::make_shared< dftfe::basis::FEBasisOperations>( @@ -287,6 +306,14 @@ namespace dftfe basisOperationsPtrDevice->init(d_densityDofHandlerIndex, quadratureIndices, updateFlags); + const unsigned int BVec = + std::min(d_dftParamsPtr->chebyWfcBlockSize, d_numEigenValues); + + if (d_dftParamsPtr->mixingMethod == "LOW_RANK_DIELECM_PRECOND") + basisOperationsPtrDevice->createScratchMultiVectors(BVec, 2); + else + basisOperationsPtrDevice->createScratchMultiVectors( + BVec, (d_dftParamsPtr->spinPolarized + 1)); } #endif diff --git a/src/dftOperator/computeNonLocalHamiltonianTimesXMemoryOptBatchGEMMDevice.cc b/src/dftOperator/computeNonLocalHamiltonianTimesXMemoryOptBatchGEMMDevice.cc index 349ff0004..35ec71e9d 100644 --- a/src/dftOperator/computeNonLocalHamiltonianTimesXMemoryOptBatchGEMMDevice.cc +++ b/src/dftOperator/computeNonLocalHamiltonianTimesXMemoryOptBatchGEMMDevice.cc @@ -253,8 +253,7 @@ kohnShamDFTOperatorDeviceClass:: if (std::is_same>::value) { utils::deviceKernelsGeneric::copyComplexArrToRealArrsDevice( - (d_parallelChebyBlockVectorDevice.localSize() * - d_parallelChebyBlockVectorDevice.numVectors()), + (d_tempRealVec.size()), dst, d_tempRealVec.begin(), d_tempImagVec.begin()); @@ -269,8 +268,7 @@ kohnShamDFTOperatorDeviceClass:: utils::deviceKernelsGeneric::copyRealArrsToComplexArrDevice( - 
(d_parallelChebyBlockVectorDevice.localSize() * - d_parallelChebyBlockVectorDevice.numVectors()), + (d_tempRealVec.size()), d_tempRealVec.begin(), d_tempImagVec.begin(), dst); diff --git a/src/dftOperator/hamiltonianMatrixCalculatorFlattenedDevice.cc b/src/dftOperator/hamiltonianMatrixCalculatorFlattenedDevice.cc index c9c9f04b1..661c67f8f 100644 --- a/src/dftOperator/hamiltonianMatrixCalculatorFlattenedDevice.cc +++ b/src/dftOperator/hamiltonianMatrixCalculatorFlattenedDevice.cc @@ -1107,8 +1107,8 @@ kohnShamDFTOperatorDeviceClass:: basisOperationsPtrDevice->nCells(), basisOperationsPtrDevice->nDofsPerCell(), basisOperationsPtrDevice->nQuadsPerCell(), - basisOperationsPtrDevice->shapeFunctionData(true), - basisOperationsPtrDevice->shapeFunctionData(false), + basisOperationsPtrDevice->shapeFunctionBasisData(true).data(), + basisOperationsPtrDevice->shapeFunctionBasisData(false).data(), d_vEffExternalPotCorrJxWDevice.begin(), d_cellHamiltonianMatrixExternalPotCorrFlattenedDevice.begin()); #elif DFTFE_WITH_DEVICE_LANG_HIP @@ -1124,8 +1124,8 @@ kohnShamDFTOperatorDeviceClass:: basisOperationsPtrDevice->nCells(), basisOperationsPtrDevice->nDofsPerCell(), basisOperationsPtrDevice->nQuadsPerCell(), - basisOperationsPtrDevice->shapeFunctionData(true), - basisOperationsPtrDevice->shapeFunctionData(false), + basisOperationsPtrDevice->shapeFunctionBasisData(true).data(), + basisOperationsPtrDevice->shapeFunctionBasisData(false).data(), d_vEffExternalPotCorrJxWDevice.begin(), d_cellHamiltonianMatrixExternalPotCorrFlattenedDevice.begin()); #endif @@ -1147,10 +1147,10 @@ kohnShamDFTOperatorDeviceClass:: basisOperationsPtrDevice->nCells(), basisOperationsPtrDevice->nDofsPerCell(), basisOperationsPtrDevice->nQuadsPerCell(), - basisOperationsPtrDevice->shapeFunctionData(true), - basisOperationsPtrDevice->shapeFunctionData(false), - basisOperationsPtrDevice->shapeFunctionGradientData(), - basisOperationsPtrDevice->inverseJacobians(), + 
basisOperationsPtrDevice->shapeFunctionBasisData(true).data(), + basisOperationsPtrDevice->shapeFunctionBasisData(false).data(), + basisOperationsPtrDevice->shapeFunctionGradientBasisData().data(), + basisOperationsPtrDevice->inverseJacobiansBasisData().data(), basisOperationsPtrDevice->cellsTypeFlag(), d_vEffJxWDevice.begin(), d_cellJxWValuesDevice.begin(), @@ -1172,10 +1172,10 @@ kohnShamDFTOperatorDeviceClass:: basisOperationsPtrDevice->nCells(), basisOperationsPtrDevice->nDofsPerCell(), basisOperationsPtrDevice->nQuadsPerCell(), - basisOperationsPtrDevice->shapeFunctionData(true), - basisOperationsPtrDevice->shapeFunctionData(false), - basisOperationsPtrDevice->shapeFunctionGradientData(), - basisOperationsPtrDevice->inverseJacobians(), + basisOperationsPtrDevice->shapeFunctionBasisData(true).data(), + basisOperationsPtrDevice->shapeFunctionBasisData(false).data(), + basisOperationsPtrDevice->shapeFunctionGradientBasisData().data(), + basisOperationsPtrDevice->inverseJacobiansBasisData().data(), basisOperationsPtrDevice->cellsTypeFlag(), d_vEffJxWDevice.begin(), d_cellJxWValuesDevice.begin(), @@ -1197,10 +1197,10 @@ kohnShamDFTOperatorDeviceClass:: basisOperationsPtrDevice->nCells(), basisOperationsPtrDevice->nDofsPerCell(), basisOperationsPtrDevice->nQuadsPerCell(), - basisOperationsPtrDevice->shapeFunctionData(true), - basisOperationsPtrDevice->shapeFunctionData(false), - basisOperationsPtrDevice->shapeFunctionGradientData(), - basisOperationsPtrDevice->inverseJacobians(), + basisOperationsPtrDevice->shapeFunctionBasisData(true).data(), + basisOperationsPtrDevice->shapeFunctionBasisData(false).data(), + basisOperationsPtrDevice->shapeFunctionGradientBasisData().data(), + basisOperationsPtrDevice->inverseJacobiansBasisData().data(), basisOperationsPtrDevice->cellsTypeFlag(), d_vEffJxWDevice.begin(), d_cellJxWValuesDevice.begin(), @@ -1221,10 +1221,10 @@ kohnShamDFTOperatorDeviceClass:: basisOperationsPtrDevice->nCells(), 
basisOperationsPtrDevice->nDofsPerCell(), basisOperationsPtrDevice->nQuadsPerCell(), - basisOperationsPtrDevice->shapeFunctionData(true), - basisOperationsPtrDevice->shapeFunctionData(false), - basisOperationsPtrDevice->shapeFunctionGradientData(), - basisOperationsPtrDevice->inverseJacobians(), + basisOperationsPtrDevice->shapeFunctionBasisData(true).data(), + basisOperationsPtrDevice->shapeFunctionBasisData(false).data(), + basisOperationsPtrDevice->shapeFunctionGradientBasisData().data(), + basisOperationsPtrDevice->inverseJacobiansBasisData().data(), basisOperationsPtrDevice->cellsTypeFlag(), d_vEffJxWDevice.begin(), d_cellJxWValuesDevice.begin(), @@ -1251,10 +1251,10 @@ kohnShamDFTOperatorDeviceClass:: spinIndex, (1 + dftPtr->d_dftParamsPtr->spinPolarized), dftPtr->d_kPointWeights.size(), - basisOperationsPtrDevice->shapeFunctionData(true), - basisOperationsPtrDevice->shapeFunctionData(false), - basisOperationsPtrDevice->shapeFunctionGradientData(), - basisOperationsPtrDevice->inverseJacobians(), + basisOperationsPtrDevice->shapeFunctionBasisData(true).data(), + basisOperationsPtrDevice->shapeFunctionBasisData(false).data(), + basisOperationsPtrDevice->shapeFunctionGradientBasisData().data(), + basisOperationsPtrDevice->inverseJacobiansBasisData().data(), basisOperationsPtrDevice->cellsTypeFlag(), d_cellShapeFunctionGradientIntegralFlattenedDevice.begin(), d_vEffJxWDevice.begin(), @@ -1283,10 +1283,10 @@ kohnShamDFTOperatorDeviceClass:: spinIndex, (1 + dftPtr->d_dftParamsPtr->spinPolarized), dftPtr->d_kPointWeights.size(), - basisOperationsPtrDevice->shapeFunctionData(true), - basisOperationsPtrDevice->shapeFunctionData(false), - basisOperationsPtrDevice->shapeFunctionGradientData(), - basisOperationsPtrDevice->inverseJacobians(), + basisOperationsPtrDevice->shapeFunctionBasisData(true).data(), + basisOperationsPtrDevice->shapeFunctionBasisData(false).data(), + basisOperationsPtrDevice->shapeFunctionGradientBasisData().data(), + 
basisOperationsPtrDevice->inverseJacobiansBasisData().data(), basisOperationsPtrDevice->cellsTypeFlag(), d_cellShapeFunctionGradientIntegralFlattenedDevice.begin(), d_vEffJxWDevice.begin(), @@ -1314,10 +1314,10 @@ kohnShamDFTOperatorDeviceClass:: spinIndex, (1 + dftPtr->d_dftParamsPtr->spinPolarized), dftPtr->d_kPointWeights.size(), - basisOperationsPtrDevice->shapeFunctionData(true), - basisOperationsPtrDevice->shapeFunctionData(false), - basisOperationsPtrDevice->shapeFunctionGradientData(), - basisOperationsPtrDevice->inverseJacobians(), + basisOperationsPtrDevice->shapeFunctionBasisData(true).data(), + basisOperationsPtrDevice->shapeFunctionBasisData(false).data(), + basisOperationsPtrDevice->shapeFunctionGradientBasisData().data(), + basisOperationsPtrDevice->inverseJacobiansBasisData().data(), basisOperationsPtrDevice->cellsTypeFlag(), d_cellShapeFunctionGradientIntegralFlattenedDevice.begin(), d_vEffJxWDevice.begin(), @@ -1345,10 +1345,10 @@ kohnShamDFTOperatorDeviceClass:: spinIndex, (1 + dftPtr->d_dftParamsPtr->spinPolarized), dftPtr->d_kPointWeights.size(), - basisOperationsPtrDevice->shapeFunctionData(true), - basisOperationsPtrDevice->shapeFunctionData(false), - basisOperationsPtrDevice->shapeFunctionGradientData(), - basisOperationsPtrDevice->inverseJacobians(), + basisOperationsPtrDevice->shapeFunctionBasisData(true).data(), + basisOperationsPtrDevice->shapeFunctionBasisData(false).data(), + basisOperationsPtrDevice->shapeFunctionGradientBasisData().data(), + basisOperationsPtrDevice->inverseJacobiansBasisData().data(), basisOperationsPtrDevice->cellsTypeFlag(), d_cellShapeFunctionGradientIntegralFlattenedDevice.begin(), d_vEffJxWDevice.begin(), diff --git a/src/dftOperator/kohnShamDFTOperatorDevice.cc b/src/dftOperator/kohnShamDFTOperatorDevice.cc index 1c4fb99b4..c63a353ae 100644 --- a/src/dftOperator/kohnShamDFTOperatorDevice.cc +++ b/src/dftOperator/kohnShamDFTOperatorDevice.cc @@ -458,7 +458,10 @@ namespace dftfe kohnShamDFTOperatorDeviceClass:: 
getParallelChebyBlockVectorDevice() { - return d_parallelChebyBlockVectorDevice; + const unsigned int BVec = + std::min(dftPtr->d_dftParamsPtr->chebyWfcBlockSize, + dftPtr->d_numEigenValues); + return basisOperationsPtrDevice->getMultiVector(BVec); } template @@ -466,7 +469,10 @@ namespace dftfe kohnShamDFTOperatorDeviceClass:: getParallelChebyBlockVector2Device() { - return d_parallelChebyBlockVector2Device; + const unsigned int BVec = + std::min(dftPtr->d_dftParamsPtr->chebyWfcBlockSize, + dftPtr->d_numEigenValues); + return basisOperationsPtrDevice->getMultiVector(BVec, 1); } template @@ -495,26 +501,8 @@ namespace dftfe { computing_timer.enter_subsection("kohnShamDFTOperatorDeviceClass setup"); - basisOperationsPtrDevice = std::make_unique< - dftfe::basis:: - FEBasisOperations>( - dftPtr->matrix_free_data, dftPtr->d_constraintsVector); - basisOperationsPtrHost = std::make_unique< - dftfe::basis:: - FEBasisOperations>( - dftPtr->matrix_free_data, dftPtr->d_constraintsVector); - dftfe::basis::UpdateFlags updateFlags = dftfe::basis::update_values | - dftfe::basis::update_gradients | - dftfe::basis::update_transpose; - std::vector quadratureIndices(4, 0); - for (auto i = 0; i < 4; ++i) - quadratureIndices[i] = i; - basisOperationsPtrHost->init(dftPtr->d_densityDofHandlerIndex, - quadratureIndices, - updateFlags); - basisOperationsPtrDevice->init(dftPtr->d_densityDofHandlerIndex, - quadratureIndices, - updateFlags); + basisOperationsPtrDevice = dftPtr->basisOperationsPtrDevice; + basisOperationsPtrHost = dftPtr->basisOperationsPtrHost; dftPtr->matrix_free_data.initialize_dof_vector( d_invSqrtMassVector, dftPtr->d_densityDofHandlerIndex); @@ -589,25 +577,6 @@ namespace dftfe std::min(dftPtr->d_dftParamsPtr->chebyWfcBlockSize, numberWaveFunctions); - dftfe::linearAlgebra::createMultiVectorFromDealiiPartitioner( - dftPtr->matrix_free_data.get_vector_partitioner( - dftPtr->d_densityDofHandlerIndex), - BVec, - d_parallelChebyBlockVectorDevice); - - if 
(dftPtr->d_dftParamsPtr->mixingMethod == "LOW_RANK_DIELECM_PRECOND") - d_parallelChebyBlockVector2Device.reinit( - d_parallelChebyBlockVectorDevice); - - if (std::is_same>::value) - { - d_tempRealVec.resize((d_parallelChebyBlockVectorDevice.localSize() * - d_parallelChebyBlockVectorDevice.numVectors()), - 0.0); - d_tempImagVec.resize((d_parallelChebyBlockVectorDevice.localSize() * - d_parallelChebyBlockVectorDevice.numVectors()), - 0.0); - } const unsigned int n_ghosts = dftPtr->matrix_free_data @@ -617,6 +586,11 @@ namespace dftfe dftPtr->matrix_free_data .get_vector_partitioner(dftPtr->d_densityDofHandlerIndex) ->local_size(); + if (std::is_same>::value) + { + d_tempRealVec.resize(((localSize + n_ghosts) * BVec), 0.0); + d_tempImagVec.resize(((localSize + n_ghosts) * BVec), 0.0); + } dftfe::utils::MemoryStorage locallyOwnedProcBoundaryNodesVector(localSize, 0); diff --git a/src/dftOperator/matrixVectorProductImplementationsDevice.cc b/src/dftOperator/matrixVectorProductImplementationsDevice.cc index 33461fca7..625354d54 100644 --- a/src/dftOperator/matrixVectorProductImplementationsDevice.cc +++ b/src/dftOperator/matrixVectorProductImplementationsDevice.cc @@ -86,8 +86,7 @@ kohnShamDFTOperatorDeviceClass:: if (std::is_same>::value) { utils::deviceKernelsGeneric::copyComplexArrToRealArrsDevice( - (d_parallelChebyBlockVectorDevice.localSize() * - d_parallelChebyBlockVectorDevice.numVectors()), + (d_tempRealVec.size()), dst, d_tempRealVec.begin(), d_tempImagVec.begin()); @@ -102,8 +101,7 @@ kohnShamDFTOperatorDeviceClass:: utils::deviceKernelsGeneric::copyRealArrsToComplexArrDevice( - (d_parallelChebyBlockVectorDevice.localSize() * - d_parallelChebyBlockVectorDevice.numVectors()), + (d_tempRealVec.size()), d_tempRealVec.begin(), d_tempImagVec.begin(), dst); diff --git a/src/force/forceWfcContractionsDevice.cc b/src/force/forceWfcContractionsDevice.cc index 43d07bb93..12a942a7c 100644 --- a/src/force/forceWfcContractionsDevice.cc +++ 
b/src/force/forceWfcContractionsDevice.cc @@ -419,7 +419,7 @@ namespace dftfe void interpolatePsiComputeELocWfcEshelbyTensorD( - std::unique_ptr< + std::shared_ptr< dftfe::basis::FEBasisOperations> @@ -934,7 +934,7 @@ namespace dftfe void devicePortedForceKernelsAllD( - std::unique_ptr< + std::shared_ptr< dftfe::basis::FEBasisOperations> @@ -1138,7 +1138,7 @@ namespace dftfe void wfcContractionsForceKernelsAllH( - std::unique_ptr< + std::shared_ptr< dftfe::basis::FEBasisOperations> diff --git a/utils/FEBasisOperations.cc b/utils/FEBasisOperations.t.cc similarity index 56% rename from utils/FEBasisOperations.cc rename to utils/FEBasisOperations.t.cc index efab6ca19..b26ee0c40 100644 --- a/utils/FEBasisOperations.cc +++ b/utils/FEBasisOperations.t.cc @@ -73,8 +73,11 @@ namespace dftfe d_quadratureIDsVector = quadratureID; d_updateFlags = updateFlags; initializeIndexMaps(); + initializeMPIPattern(); initializeConstraints(); initializeShapeFunctionAndJacobianData(); + if (!std::is_same::value) + initializeShapeFunctionAndJacobianBasisData(); } template - const ValueTypeBasisCoeff * + const dftfe::utils::MemoryStorage & FEBasisOperationsBase::shapeFunctionData(bool transpose) const { - return transpose ? d_shapeFunctionDataTranspose[d_quadratureID].data() : - d_shapeFunctionData[d_quadratureID].data(); + return transpose ? d_shapeFunctionDataTranspose[d_quadratureID] : + d_shapeFunctionData[d_quadratureID]; } template - const ValueTypeBasisCoeff * + const dftfe::utils::MemoryStorage & FEBasisOperationsBase< ValueTypeBasisCoeff, ValueTypeBasisData, memorySpace>::shapeFunctionGradientData(bool transpose) const { - return transpose ? - d_shapeFunctionGradientDataTranspose[d_quadratureID].data() : - d_shapeFunctionGradientData[d_quadratureID].data(); + return transpose ? 
d_shapeFunctionGradientDataTranspose[d_quadratureID] : + d_shapeFunctionGradientData[d_quadratureID]; } template - const ValueTypeBasisCoeff * + const dftfe::utils::MemoryStorage & FEBasisOperationsBase::inverseJacobians() const { - return d_inverseJacobianData[areAllCellsAffine ? 0 : d_quadratureID] - .data(); + return d_inverseJacobianData[areAllCellsAffine ? 0 : d_quadratureID]; } template - const ValueTypeBasisCoeff * + const dftfe::utils::MemoryStorage & FEBasisOperationsBase::JxW() const { - return d_inverseJacobianData[areAllCellsAffine ? 0 : d_quadratureID] - .data(); + return d_JxWData[areAllCellsAffine ? 0 : d_quadratureID]; + } + + template + template ::value, int>> + const dftfe::utils::MemoryStorage & + FEBasisOperationsBase::JxWBasisData() const + { + return d_JxWData[areAllCellsAffine ? 0 : d_quadratureID]; + } + + template + template ::value, int>> + const dftfe::utils::MemoryStorage & + FEBasisOperationsBase::JxWBasisData() const + { + return d_JxWBasisData[areAllCellsAffine ? 0 : d_quadratureID]; } + template + template ::value, int>> + const dftfe::utils::MemoryStorage & + FEBasisOperationsBase::inverseJacobiansBasisData() const + { + return d_inverseJacobianData[areAllCellsAffine ? 0 : d_quadratureID]; + } + + template + template ::value, int>> + const dftfe::utils::MemoryStorage & + FEBasisOperationsBase::inverseJacobiansBasisData() const + { + return d_inverseJacobianBasisData[areAllCellsAffine ? 0 : d_quadratureID]; + } + + template + template ::value, int>> + const dftfe::utils::MemoryStorage & + FEBasisOperationsBase::shapeFunctionBasisData(bool transpose) + const + { + return transpose ? d_shapeFunctionDataTranspose[d_quadratureID] : + d_shapeFunctionData[d_quadratureID]; + } + + template + template ::value, int>> + const dftfe::utils::MemoryStorage & + FEBasisOperationsBase::shapeFunctionBasisData(bool transpose) + const + { + return transpose ? 
d_shapeFunctionBasisDataTranspose[d_quadratureID] : + d_shapeFunctionBasisData[d_quadratureID]; + } + + + template + template ::value, int>> + const dftfe::utils::MemoryStorage & + FEBasisOperationsBase< + ValueTypeBasisCoeff, + ValueTypeBasisData, + memorySpace>::shapeFunctionGradientBasisData(bool transpose) const + { + return transpose ? d_shapeFunctionGradientDataTranspose[d_quadratureID] : + d_shapeFunctionGradientData[d_quadratureID]; + } + + template + template ::value, int>> + const dftfe::utils::MemoryStorage & + FEBasisOperationsBase< + ValueTypeBasisCoeff, + ValueTypeBasisData, + memorySpace>::shapeFunctionGradientBasisData(bool transpose) const + { + return transpose ? + d_shapeFunctionGradientBasisDataTranspose[d_quadratureID] : + d_shapeFunctionGradientBasisData[d_quadratureID]; + } + + template @@ -285,6 +408,30 @@ namespace dftfe #endif } + template + void + FEBasisOperationsBase::initializeMPIPattern() + { + const std::pair &locallyOwnedRange = + d_matrixFreeDataPtr->get_vector_partitioner(d_dofHandlerID) + ->local_range(); + + std::vector ghostIndices; + (d_matrixFreeDataPtr->get_vector_partitioner(d_dofHandlerID) + ->ghost_indices()) + .fill_index_vector(ghostIndices); + + mpiPatternP2P = + std::make_shared>( + locallyOwnedRange, + ghostIndices, + d_matrixFreeDataPtr->get_vector_partitioner(d_dofHandlerID) + ->get_mpi_communicator()); + } template + void + FEBasisOperationsBase< + ValueTypeBasisCoeff, + ValueTypeBasisData, + memorySpace>::initializeShapeFunctionAndJacobianBasisData() + { + d_inverseJacobianBasisData.resize( + areAllCellsAffine ? 
1 : d_quadratureIDsVector.size()); + d_JxWBasisData.resize(d_quadratureIDsVector.size()); + if (d_updateFlags & update_values) + { + d_shapeFunctionBasisData.resize(d_quadratureIDsVector.size()); + if (d_updateFlags & update_transpose) + d_shapeFunctionBasisDataTranspose.resize( + d_quadratureIDsVector.size()); + } + if (d_updateFlags & update_gradients) + { + d_shapeFunctionGradientBasisData.resize(d_quadratureIDsVector.size()); + if (d_updateFlags & update_transpose) + d_shapeFunctionGradientBasisDataTranspose.resize( + d_quadratureIDsVector.size()); + } + for (unsigned int iQuadID = 0; iQuadID < d_quadratureIDsVector.size(); + ++iQuadID) + { + const dealii::Quadrature<3> &quadrature = + d_matrixFreeDataPtr->get_quadrature(d_quadratureIDsVector[iQuadID]); + dealii::FEValues<3> fe_values( + d_matrixFreeDataPtr->get_dof_handler(d_dofHandlerID).get_fe(), + quadrature, + dealii::update_values | dealii::update_gradients | + dealii::update_jacobians | dealii::update_JxW_values | + dealii::update_inverse_jacobians); + +#if defined(DFTFE_WITH_DEVICE) + dftfe::utils::MemoryStorage + d_inverseJacobianDataHost; + dftfe::utils::MemoryStorage + d_JxWDataHost; + dftfe::utils::MemoryStorage + d_shapeFunctionDataHost; + dftfe::utils::MemoryStorage + d_shapeFunctionDataTransposeHost; + dftfe::utils::MemoryStorage + d_shapeFunctionGradientDataHost; + dftfe::utils::MemoryStorage + d_shapeFunctionGradientDataTransposeHost; +#else + auto &d_inverseJacobianDataHost = + d_inverseJacobianBasisData[areAllCellsAffine ? 
0 : iQuadID]; + auto &d_JxWDataHost = d_JxWBasisData[iQuadID]; + auto &d_shapeFunctionDataHost = d_shapeFunctionBasisData[iQuadID]; + auto &d_shapeFunctionDataTransposeHost = + d_shapeFunctionBasisDataTranspose[iQuadID]; + auto &d_shapeFunctionGradientDataHost = + d_shapeFunctionGradientBasisData[iQuadID]; + auto &d_shapeFunctionGradientDataTransposeHost = + d_shapeFunctionGradientBasisDataTranspose[iQuadID]; +#endif + + + d_shapeFunctionDataHost.clear(); + if (d_updateFlags & update_values) + d_shapeFunctionDataHost.resize(d_nQuadsPerCell[iQuadID] * + d_nDofsPerCell, + 0.0); + d_shapeFunctionDataTransposeHost.clear(); + if ((d_updateFlags & update_values) && + (d_updateFlags & update_transpose)) + d_shapeFunctionDataTransposeHost.resize(d_nQuadsPerCell[iQuadID] * + d_nDofsPerCell, + 0.0); + d_shapeFunctionGradientDataHost.clear(); + d_shapeFunctionGradientDataTransposeHost.clear(); + if (d_updateFlags & update_gradients) + { + d_shapeFunctionGradientDataHost.resize(d_nQuadsPerCell[iQuadID] * + d_nDofsPerCell * 3, + 0.0); + if (d_updateFlags & update_transpose) + d_shapeFunctionGradientDataTransposeHost.resize( + d_nQuadsPerCell[iQuadID] * d_nDofsPerCell * 3, 0.0); + } + + d_JxWDataHost.clear(); + if ((d_updateFlags & update_values) || + (d_updateFlags & update_gradients)) + d_JxWDataHost.resize(d_nCells * d_nQuadsPerCell[iQuadID]); + + d_inverseJacobianDataHost.clear(); + if (d_updateFlags & update_gradients) + d_inverseJacobianDataHost.resize( + areAllCellsCartesian ? + d_nCells * 3 : + (areAllCellsAffine ? d_nCells * 9 : + d_nCells * 9 * d_nQuadsPerCell[iQuadID])); + const unsigned int nJacobiansPerCell = + areAllCellsAffine ? 
1 : d_nQuadsPerCell[iQuadID]; + + auto cellPtr = + d_matrixFreeDataPtr->get_dof_handler(d_dofHandlerID).begin_active(); + auto endcPtr = + d_matrixFreeDataPtr->get_dof_handler(d_dofHandlerID).end(); + + unsigned int iCell = 0; + for (; cellPtr != endcPtr; ++cellPtr) + if (cellPtr->is_locally_owned()) + { + fe_values.reinit(cellPtr); + auto &jacobians = fe_values.get_jacobians(); + auto &inverseJacobians = fe_values.get_inverse_jacobians(); + if (iCell == 0) + { + if (d_updateFlags & update_values) + { + for (unsigned int iNode = 0; iNode < d_nDofsPerCell; + ++iNode) + for (unsigned int iQuad = 0; + iQuad < d_nQuadsPerCell[iQuadID]; + ++iQuad) + d_shapeFunctionDataHost[iQuad * d_nDofsPerCell + + iNode] = + fe_values.shape_value(iNode, iQuad); + if (d_updateFlags & update_transpose) + for (unsigned int iNode = 0; iNode < d_nDofsPerCell; + ++iNode) + for (unsigned int iQuad = 0; + iQuad < d_nQuadsPerCell[iQuadID]; + ++iQuad) + d_shapeFunctionDataTransposeHost + [iNode * d_nQuadsPerCell[iQuadID] + iQuad] = + fe_values.shape_value(iNode, iQuad); + } + + + if (d_updateFlags & update_gradients) + for (unsigned int iQuad = 0; + iQuad < d_nQuadsPerCell[iQuadID]; + ++iQuad) + for (unsigned int iNode = 0; iNode < d_nDofsPerCell; + ++iNode) + { + const auto &shape_grad_real = + fe_values.shape_grad(iNode, iQuad); + const auto &shape_grad_reference = + apply_transformation(jacobians[iQuad].transpose(), + shape_grad_real); + + for (unsigned int iDim = 0; iDim < 3; ++iDim) + d_shapeFunctionGradientDataHost + [iDim * d_nQuadsPerCell[iQuadID] * + d_nDofsPerCell + + iQuad * d_nDofsPerCell + iNode] = + shape_grad_reference[iDim]; + if (d_updateFlags & update_transpose) + for (unsigned int iDim = 0; iDim < 3; ++iDim) + d_shapeFunctionGradientDataTransposeHost + [iDim * d_nQuadsPerCell[iQuadID] * + d_nDofsPerCell + + iNode * d_nQuadsPerCell[iQuadID] + iQuad] = + shape_grad_reference[iDim]; + } + } + for (unsigned int iQuad = 0; iQuad < d_nQuadsPerCell[iQuadID]; + ++iQuad) + 
d_JxWDataHost[iCell * d_nQuadsPerCell[iQuadID] + iQuad] = + fe_values.JxW(iQuad); + for (unsigned int iQuad = 0; iQuad < nJacobiansPerCell; ++iQuad) + for (unsigned int iDim = 0; iDim < 3; ++iDim) + if (areAllCellsCartesian) + d_inverseJacobianDataHost[iCell * nJacobiansPerCell * 3 + + iDim * nJacobiansPerCell + + iQuad] = + inverseJacobians[iQuad][iDim][iDim]; + else + for (unsigned int jDim = 0; jDim < 3; ++jDim) + d_inverseJacobianDataHost[iCell * nJacobiansPerCell * + 9 + + 9 * iQuad + jDim * 3 + iDim] = + inverseJacobians[iQuad][iDim][jDim]; + ++iCell; + } + +#if defined(DFTFE_WITH_DEVICE) + d_inverseJacobianBasisData[areAllCellsAffine ? 0 : iQuadID].resize( + d_inverseJacobianDataHost.size()); + d_inverseJacobianBasisData[areAllCellsAffine ? 0 : iQuadID].copyFrom( + d_inverseJacobianDataHost); + d_JxWBasisData[iQuadID].resize(d_JxWDataHost.size()); + d_JxWBasisData[iQuadID].copyFrom(d_JxWDataHost); + d_shapeFunctionBasisData[iQuadID].resize( + d_shapeFunctionDataHost.size()); + d_shapeFunctionBasisData[iQuadID].copyFrom(d_shapeFunctionDataHost); + d_shapeFunctionBasisDataTranspose[iQuadID].resize( + d_shapeFunctionDataTransposeHost.size()); + d_shapeFunctionBasisDataTranspose[iQuadID].copyFrom( + d_shapeFunctionDataTransposeHost); + d_shapeFunctionGradientBasisData[iQuadID].resize( + d_shapeFunctionGradientDataHost.size()); + d_shapeFunctionGradientBasisData[iQuadID].copyFrom( + d_shapeFunctionGradientDataHost); + d_shapeFunctionGradientBasisDataTranspose[iQuadID].resize( + d_shapeFunctionGradientDataTransposeHost.size()); + d_shapeFunctionGradientBasisDataTranspose[iQuadID].copyFrom( + d_shapeFunctionGradientDataTransposeHost); +#endif + } + } + + template @@ -602,17 +971,75 @@ namespace dftfe ValueTypeBasisData, memorySpace>:: createMultiVector( - const unsigned int dofHandlerIndex, const unsigned int blocksize, dftfe::linearAlgebra::MultiVector &multiVector) const { - dftfe::linearAlgebra::createMultiVectorFromDealiiPartitioner( - 
d_matrixFreeDataPtr->get_vector_partitioner(dofHandlerIndex), - blocksize, - multiVector); + multiVector.reinit(mpiPatternP2P, blocksize); + } + + template + void + FEBasisOperationsBase:: + createScratchMultiVectors(const unsigned int vecBlockSize, + const unsigned int numMultiVecs) const + { + auto iter = scratchMultiVectors.find(vecBlockSize); + if (iter == scratchMultiVectors.end()) + { + scratchMultiVectors[vecBlockSize] = + std::vector>( + numMultiVecs); + for (unsigned int iVec = 0; iVec < numMultiVecs; ++iVec) + scratchMultiVectors[vecBlockSize][iVec].reinit(mpiPatternP2P, + vecBlockSize); + } + else + { + scratchMultiVectors[vecBlockSize].resize( + scratchMultiVectors[vecBlockSize].size() + numMultiVecs); + for (unsigned int iVec = 0; + iVec < scratchMultiVectors[vecBlockSize].size(); + ++iVec) + scratchMultiVectors[vecBlockSize][iVec].reinit(mpiPatternP2P, + vecBlockSize); + } + } + + template + void + FEBasisOperationsBase::clearScratchMultiVectors() const + { + scratchMultiVectors.clear(); } + template + dftfe::linearAlgebra::MultiVector & + FEBasisOperationsBase< + ValueTypeBasisCoeff, + ValueTypeBasisData, + memorySpace>::getMultiVector(const unsigned int vecBlockSize, + const unsigned int index) const + { + AssertThrow(scratchMultiVectors.find(vecBlockSize) != + scratchMultiVectors.end(), + dealii::ExcMessage( + "DFT-FE Error: MultiVector not found in scratch storage.")); + return scratchMultiVectors[vecBlockSize][index]; + } + + template @@ -628,23 +1055,5 @@ namespace dftfe } - template class FEBasisOperationsBase; -#ifdef USE_COMPLEX - template class FEBasisOperationsBase; -#endif -#ifdef DFTFE_WITH_DEVICE - template class FEBasisOperationsBase; -# ifdef USE_COMPLEX - template class FEBasisOperationsBase; -# endif -#endif } // namespace basis } // namespace dftfe diff --git a/utils/FEBasisOperationsDevice.cc b/utils/FEBasisOperationsDevice.t.cc similarity index 74% rename from utils/FEBasisOperationsDevice.cc rename to 
utils/FEBasisOperationsDevice.t.cc index 6ada00c1f..ef79fe1d5 100644 --- a/utils/FEBasisOperationsDevice.cc +++ b/utils/FEBasisOperationsDevice.t.cc @@ -23,39 +23,40 @@ #include #include #include - +#include namespace dftfe { - namespace - { - template - __global__ void - reshapeNonAffineCaseDeviceKernel(const dftfe::size_type numVecs, - const dftfe::size_type numQuads, - const dftfe::size_type numCells, - const ValueType1 * copyFromVec, - ValueType2 * copyToVec) - { - const dftfe::size_type globalThreadId = - blockIdx.x * blockDim.x + threadIdx.x; - const dftfe::size_type numberEntries = numQuads * numCells * numVecs * 3; + // namespace + // { + // template + // __global__ void + // reshapeNonAffineCaseDeviceKernel(const dftfe::size_type numVecs, + // const dftfe::size_type numQuads, + // const dftfe::size_type numCells, + // const ValueType1 * copyFromVec, + // ValueType2 * copyToVec) + // { + // const dftfe::size_type globalThreadId = + // blockIdx.x * blockDim.x + threadIdx.x; + // const dftfe::size_type numberEntries = numQuads * numCells * numVecs * + // 3; - for (dftfe::size_type index = globalThreadId; index < numberEntries; - index += blockDim.x * gridDim.x) - { - dftfe::size_type blockIndex = index / numVecs; - dftfe::size_type iVec = index - blockIndex * numVecs; - dftfe::size_type blockIndex2 = blockIndex / numQuads; - dftfe::size_type iQuad = blockIndex - blockIndex2 * numQuads; - dftfe::size_type iCell = blockIndex2 / 3; - dftfe::size_type iDim = blockIndex2 - iCell * 3; - dftfe::utils::copyValue( - copyToVec + index, - copyFromVec[iVec + iDim * numVecs + iQuad * 3 * numVecs + - iCell * 3 * numQuads * numVecs]); - } - } - } // namespace + // for (dftfe::size_type index = globalThreadId; index < numberEntries; + // index += blockDim.x * gridDim.x) + // { + // dftfe::size_type blockIndex = index / numVecs; + // dftfe::size_type iVec = index - blockIndex * numVecs; + // dftfe::size_type blockIndex2 = blockIndex / numQuads; + // dftfe::size_type iQuad = 
blockIndex - blockIndex2 * numQuads; + // dftfe::size_type iCell = blockIndex2 / 3; + // dftfe::size_type iDim = blockIndex2 - iCell * 3; + // dftfe::utils::copyValue( + // copyToVec + index, + // copyFromVec[iVec + iDim * numVecs + iQuad * 3 * numVecs + + // iCell * 3 * numQuads * numVecs]); + // } + // } + // } // namespace namespace basis { @@ -265,38 +266,50 @@ namespace dftfe d_nVectors * 3, (cellRange.second - cellRange.first) * d_nQuadsPerCell[d_quadratureID]); -#ifdef DFTFE_WITH_DEVICE_LANG_CUDA - reshapeNonAffineCaseDeviceKernel<<< - (d_nVectors * (cellRange.second - cellRange.first) * - d_nQuadsPerCell[d_quadratureID] * 3) / - dftfe::utils::DEVICE_BLOCK_SIZE + - 1, - dftfe::utils::DEVICE_BLOCK_SIZE>>>( - d_nVectors, - d_nQuadsPerCell[d_quadratureID], - (cellRange.second - cellRange.first), - dftfe::utils::makeDataTypeDeviceCompatible( - tempQuadratureGradientsDataNonAffine.data()), - dftfe::utils::makeDataTypeDeviceCompatible( - quadratureGradients)); -#elif DFTFE_WITH_DEVICE_LANG_HIP - hipLaunchKernelGGL(reshapeNonAffineCaseDeviceKernel, - (d_nVectors * - (cellRange.second - cellRange.first) * - d_nQuadsPerCell[d_quadratureID] * 3) / - dftfe::utils::DEVICE_BLOCK_SIZE + - 1, - dftfe::utils::DEVICE_BLOCK_SIZE, - 0, - 0, - d_nVectors, - d_nQuadsPerCell[d_quadratureID], - (cellRange.second - cellRange.first), - dftfe::utils::makeDataTypeDeviceCompatible( - tempQuadratureGradientsDataNonAffine.data()), - dftfe::utils::makeDataTypeDeviceCompatible( - quadratureGradients), ); -#endif + dftfe::basis::FEBasisOperationsKernelsDevice:: + reshapeNonAffineCase( + d_nVectors, + d_nQuadsPerCell[d_quadratureID], + (cellRange.second - cellRange.first), + tempQuadratureGradientsDataNonAffine.data(), + quadratureGradients); + // #ifdef DFTFE_WITH_DEVICE_LANG_CUDA + // reshapeNonAffineCaseDeviceKernel<<< + // (d_nVectors * (cellRange.second - + // cellRange.first) * + // d_nQuadsPerCell[d_quadratureID] * 3) / + // dftfe::utils::DEVICE_BLOCK_SIZE + + // 1, + // 
dftfe::utils::DEVICE_BLOCK_SIZE>>>( + // d_nVectors, + // d_nQuadsPerCell[d_quadratureID], + // (cellRange.second - cellRange.first), + // dftfe::utils::makeDataTypeDeviceCompatible( + // tempQuadratureGradientsDataNonAffine.data()), + // dftfe::utils::makeDataTypeDeviceCompatible( + // quadratureGradients)); + // #elif DFTFE_WITH_DEVICE_LANG_HIP + // hipLaunchKernelGGL(reshapeNonAffineCaseDeviceKernel, + // (d_nVectors * + // (cellRange.second - + // cellRange.first) * + // d_nQuadsPerCell[d_quadratureID] + // * 3) / + // dftfe::utils::DEVICE_BLOCK_SIZE + // + + // 1, + // dftfe::utils::DEVICE_BLOCK_SIZE, + // 0, + // 0, + // d_nVectors, + // d_nQuadsPerCell[d_quadratureID], + // (cellRange.second - + // cellRange.first), + // dftfe::utils::makeDataTypeDeviceCompatible( + // tempQuadratureGradientsDataNonAffine.data()), + // dftfe::utils::makeDataTypeDeviceCompatible( + // quadratureGradients)); + // #endif } } } @@ -375,14 +388,5 @@ namespace dftfe { return *d_deviceBlasHandlePtr; } - template class FEBasisOperations; -#ifdef USE_COMPLEX - template class FEBasisOperations; -#endif - } // namespace basis } // namespace dftfe diff --git a/utils/FEBasisOperationsHost.cc b/utils/FEBasisOperationsHost.t.cc similarity index 98% rename from utils/FEBasisOperationsHost.cc rename to utils/FEBasisOperationsHost.t.cc index 29ea894cf..8b17a7265 100644 --- a/utils/FEBasisOperationsHost.cc +++ b/utils/FEBasisOperationsHost.t.cc @@ -457,15 +457,5 @@ namespace dftfe [iCell * d_nDofsPerCell + iDof], std::plus()); } - - template class FEBasisOperations; -#ifdef USE_COMPLEX - template class FEBasisOperations; -#endif - } // namespace basis } // namespace dftfe diff --git a/utils/FEBasisOperationsKernelsDevice.cc b/utils/FEBasisOperationsKernelsDevice.cc new file mode 100644 index 000000000..2ad2a4706 --- /dev/null +++ b/utils/FEBasisOperationsKernelsDevice.cc @@ -0,0 +1,110 @@ +// --------------------------------------------------------------------- +// +// Copyright (c) 2017-2022 
The Regents of the University of Michigan and DFT-FE +// authors. +// +// This file is part of the DFT-FE code. +// +// The DFT-FE code is free software; you can use it, redistribute +// it, and/or modify it under the terms of the GNU Lesser General +// Public License as published by the Free Software Foundation; either +// version 2.1 of the License, or (at your option) any later version. +// The full text of the license can be found in the file LICENSE at +// the top level of the DFT-FE distribution. +// +// --------------------------------------------------------------------- +// + +#include +#include +#include +#include +#include + + +namespace dftfe +{ + namespace + { + template + __global__ void + reshapeNonAffineCaseDeviceKernel(const dftfe::size_type numVecs, + const dftfe::size_type numQuads, + const dftfe::size_type numCells, + const ValueType1 * copyFromVec, + ValueType2 * copyToVec) + { + const dftfe::size_type globalThreadId = + blockIdx.x * blockDim.x + threadIdx.x; + const dftfe::size_type numberEntries = numQuads * numCells * numVecs * 3; + + for (dftfe::size_type index = globalThreadId; index < numberEntries; + index += blockDim.x * gridDim.x) + { + dftfe::size_type blockIndex = index / numVecs; + dftfe::size_type iVec = index - blockIndex * numVecs; + dftfe::size_type blockIndex2 = blockIndex / numQuads; + dftfe::size_type iQuad = blockIndex - blockIndex2 * numQuads; + dftfe::size_type iCell = blockIndex2 / 3; + dftfe::size_type iDim = blockIndex2 - iCell * 3; + dftfe::utils::copyValue( + copyToVec + index, + copyFromVec[iVec + iDim * numVecs + iQuad * 3 * numVecs + + iCell * 3 * numQuads * numVecs]); + } + } + } // namespace + namespace basis + { + namespace FEBasisOperationsKernelsDevice + { + template + void + reshapeNonAffineCase(const dftfe::size_type numVecs, + const dftfe::size_type numQuads, + const dftfe::size_type numCells, + const ValueType1 * copyFromVec, + ValueType2 * copyToVec) + { +#ifdef DFTFE_WITH_DEVICE_LANG_CUDA + 
reshapeNonAffineCaseDeviceKernel<<<(numVecs * numCells * numQuads * 3) / + dftfe::utils::DEVICE_BLOCK_SIZE + + 1, + dftfe::utils::DEVICE_BLOCK_SIZE>>>( + numVecs, + numQuads, + numCells, + dftfe::utils::makeDataTypeDeviceCompatible(copyFromVec), + dftfe::utils::makeDataTypeDeviceCompatible(copyToVec)); +#elif DFTFE_WITH_DEVICE_LANG_HIP + hipLaunchKernelGGL( + reshapeNonAffineCaseDeviceKernel, + (numVecs * numCells * numQuads * 3) / + dftfe::utils::DEVICE_BLOCK_SIZE + + 1, + dftfe::utils::DEVICE_BLOCK_SIZE, + 0, + 0, + numVecs, + numQuads, + numCells, + dftfe::utils::makeDataTypeDeviceCompatible(copyFromVec), + dftfe::utils::makeDataTypeDeviceCompatible(copyToVec)); +#endif + } + template void + reshapeNonAffineCase(const dftfe::size_type numVecs, + const dftfe::size_type numQuads, + const dftfe::size_type numCells, + const double * copyFromVec, + double * copyToVec); + template void + reshapeNonAffineCase(const dftfe::size_type numVecs, + const dftfe::size_type numQuads, + const dftfe::size_type numCells, + const std::complex *copyFromVec, + std::complex * copyToVec); + + } // namespace FEBasisOperationsKernelsDevice + } // namespace basis +} // namespace dftfe From 472db991a09ec0c0f00c5ee17b33ce99cfdb7bfe Mon Sep 17 00:00:00 2001 From: Nikhil Kodali Date: Mon, 9 Oct 2023 11:23:26 +0530 Subject: [PATCH 19/25] Comments --- include/dft.h | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/include/dft.h b/include/dft.h index 49bf1bc49..de63bbf10 100644 --- a/include/dft.h +++ b/include/dft.h @@ -1408,17 +1408,17 @@ namespace dftfe /// Spectrum split higher eigenvalues computed in Rayleigh-Ritz step std::vector> eigenValuesRRSplit; - // std::vector> - // d_eigenVectorsFlattened; std::vector> - // d_eigenVectorsFlattenedSTL; + + /** + * The indexing of d_eigenVectorsFlattenedHost and + * d_eigenVectorsFlattenedDevice [kPoint * numSpinComponents * + * numLocallyOwnedNodes * numWaveFunctions + iSpin * numLocallyOwnedNodes * + * 
numWaveFunctions + iNode * numWaveFunctions + iWaveFunction] + */ dftfe::utils::MemoryStorage d_eigenVectorsFlattenedHost; - // std::vector> - // d_eigenVectorsRotFracDensityFlattenedSTL; - // std::vector> - // d_eigenVectorsDensityMatrixPrimeSTL; dftfe::utils::MemoryStorage d_eigenVectorsRotFracDensityFlattenedHost; From 6a1fd6f7551fdd3c790a2048037c81a7a14707c7 Mon Sep 17 00:00:00 2001 From: Nikhil Kodali Date: Tue, 10 Oct 2023 19:19:37 +0530 Subject: [PATCH 20/25] Fix hip compilation --- CMakeLists.txt | 1 - 1 file changed, 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index f12341fc7..93dd0db76 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -194,7 +194,6 @@ SET(DEVICE_SRC ./utils/MemoryTransferKernelsDevice.cc ./utils/DeviceKernelsGeneric.cc ./utils/DeviceDirectCCLWrapper.cc - ./src/dft/densityCalculatorDevice.cc ./src/dft/densityCalculatorDeviceKernels.cc ./src/dft/densityFirstOrderResponseCalculatorDevice.cc ./src/dftOperator/operatorDevice.cc From 425c5246f3a763132513796d2a59fa66e5b45ba5 Mon Sep 17 00:00:00 2001 From: Nikhil Kodali Date: Tue, 10 Oct 2023 19:51:06 +0530 Subject: [PATCH 21/25] Fix hipblas data type --- utils/DeviceBlasWrapper.hip.cc | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/utils/DeviceBlasWrapper.hip.cc b/utils/DeviceBlasWrapper.hip.cc index 8e96b6c40..f24a55ccf 100644 --- a/utils/DeviceBlasWrapper.hip.cc +++ b/utils/DeviceBlasWrapper.hip.cc @@ -576,13 +576,13 @@ namespace dftfe trans, m, n, - dftfe::utils::makeDataTypeDeviceCompatible(alpha), - dftfe::utils::makeDataTypeDeviceCompatible(A), + makeDataTypeHipBlasCompatible(alpha), + makeDataTypeHipBlasCompatible(A), lda, - dftfe::utils::makeDataTypeDeviceCompatible(x), + makeDataTypeHipBlasCompatible(x), incx, - dftfe::utils::makeDataTypeDeviceCompatible(beta), - dftfe::utils::makeDataTypeDeviceCompatible(y), + makeDataTypeHipBlasCompatible(beta), + makeDataTypeHipBlasCompatible(y), incy); DEVICEBLAS_API_CHECK(status); return 
status; @@ -607,13 +607,13 @@ namespace dftfe trans, m, n, - dftfe::utils::makeDataTypeDeviceCompatible(alpha), - dftfe::utils::makeDataTypeDeviceCompatible(A), + makeDataTypeHipBlasCompatible(alpha), + makeDataTypeHipBlasCompatible(A), lda, - dftfe::utils::makeDataTypeDeviceCompatible(x), + makeDataTypeHipBlasCompatible(x), incx, - dftfe::utils::makeDataTypeDeviceCompatible(beta), - dftfe::utils::makeDataTypeDeviceCompatible(y), + makeDataTypeHipBlasCompatible(beta), + makeDataTypeHipBlasCompatible(y), incy); DEVICEBLAS_API_CHECK(status); return status; From 300d49f0451fbb82add5087c286608b3dab654cf Mon Sep 17 00:00:00 2001 From: Nikhil Kodali Date: Wed, 11 Oct 2023 08:13:47 +0530 Subject: [PATCH 22/25] gemv typo --- include/DeviceBlasWrapper.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/DeviceBlasWrapper.h b/include/DeviceBlasWrapper.h index b246bff99..4925b51f2 100644 --- a/include/DeviceBlasWrapper.h +++ b/include/DeviceBlasWrapper.h @@ -289,7 +289,7 @@ namespace dftfe int incy); deviceBlasStatus_t - gemm(deviceBlasHandle_t handle, + gemv(deviceBlasHandle_t handle, deviceBlasOperation_t trans, int m, int n, @@ -303,7 +303,7 @@ namespace dftfe int incy); deviceBlasStatus_t - gemm(deviceBlasHandle_t handle, + gemv(deviceBlasHandle_t handle, deviceBlasOperation_t trans, int m, int n, From cb4b95ddb58aecc8160a0eebf4ab6301f4ffe560 Mon Sep 17 00:00:00 2001 From: Nikhil Kodali Date: Wed, 11 Oct 2023 08:25:43 +0530 Subject: [PATCH 23/25] likely hip bugfix --- src/dft/densityCalculatorDeviceKernels.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/dft/densityCalculatorDeviceKernels.cc b/src/dft/densityCalculatorDeviceKernels.cc index 5c2c4db62..8b8ac25c1 100644 --- a/src/dft/densityCalculatorDeviceKernels.cc +++ b/src/dft/densityCalculatorDeviceKernels.cc @@ -185,7 +185,7 @@ namespace dftfe dftfe::utils::DEVICE_BLOCK_SIZE, 0, 0, - cellsBlockSize * nQuadsPerCell * vectorsBlockSize, + vectorsBlockSize, 
cellsBlockSize, nQuadsPerCell, dftfe::utils::makeDataTypeDeviceCompatible(wfcQuadPointData), From c168d392b5abb72878ce0ac60b6cbd26cfbc7a34 Mon Sep 17 00:00:00 2001 From: Sambit Das Date: Fri, 13 Oct 2023 09:25:51 -0400 Subject: [PATCH 24/25] Update complex gpu ctest --- .../frontierJobScript6GCDs6MPITasks.rc | 32 +++++++++++++++ .../frontierJobScript18GCDs18MPITasks.rc | 41 +++++++++++++++++++ .../frontierJobScript6GCDs6MPITasks.rc | 39 ++++++++++++++++++ 3 files changed, 112 insertions(+) create mode 100644 testsGPU/pseudopotential/complex/jobscripts/frontierJobScript6GCDs6MPITasks.rc create mode 100644 testsGPU/pseudopotential/real/jobscripts/frontierJobScript18GCDs18MPITasks.rc create mode 100644 testsGPU/pseudopotential/real/jobscripts/frontierJobScript6GCDs6MPITasks.rc diff --git a/testsGPU/pseudopotential/complex/jobscripts/frontierJobScript6GCDs6MPITasks.rc b/testsGPU/pseudopotential/complex/jobscripts/frontierJobScript6GCDs6MPITasks.rc new file mode 100644 index 000000000..aa420fa35 --- /dev/null +++ b/testsGPU/pseudopotential/complex/jobscripts/frontierJobScript6GCDs6MPITasks.rc @@ -0,0 +1,32 @@ +#!/ccs/home/dsambit/frontier/bin/rc +#SBATCH -A mat239 +#SBATCH -J gputests +#SBATCH -t 1:00:00 +#SBATCH -p batch +#SBATCH -N 1 +#SBATCH --gpus-per-node 6 +#SBATCH --ntasks-per-gpu 1 +#SBATCH --gpu-bind closest + +OMP_NUM_THREADS = 1 +MPICH_VERSION_DISPLAY=1 +MPICH_ENV_DISPLAY=1 +MPICH_OFI_NIC_POLICY = NUMA +MPICH_GPU_SUPPORT_ENABLED=1 +MPICH_SMP_SINGLE_COPY_MODE=NONE + +LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$INST/lib +LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$INST/lib/lib64 +LD_LIBRARY_PATH=$CRAY_LD_LIBRARY_PATH:$LD_LIBRARY_PATH + + +BASE = $WD/src/dftfeDebug/build/release/complex +n=`{echo $SLURM_JOB_NUM_NODES '*' 8 | bc} + +srun -n 6 -c 7 --gpu-bind closest $BASE/dftfe parameterFileMg2x_1.prm > outputMg2x_1 +srun -n 6 -c 7 --gpu-bind closest $BASE/dftfe parameterFileMg2x_2.prm > outputMg2x_2 +srun -n 6 -c 7 --gpu-bind closest $BASE/dftfe parameterFileMg2x_3.prm > 
outputMg2x_3 +srun -n 6 -c 7 --gpu-bind closest $BASE/dftfe parameterFileMg2x_4.prm > outputMg2x_4 +srun -n 6 -c 7 --gpu-bind closest $BASE/dftfe parameterFileMg2x_5.prm > outputMg2x_5 +srun -n 6 -c 7 --gpu-bind closest $BASE/dftfe parameterFileMg2x_6.prm > outputMg2x_6 +srun -n 6 -c 7 --gpu-bind closest $BASE/dftfe parameterFileBe.prm > outputBe diff --git a/testsGPU/pseudopotential/real/jobscripts/frontierJobScript18GCDs18MPITasks.rc b/testsGPU/pseudopotential/real/jobscripts/frontierJobScript18GCDs18MPITasks.rc new file mode 100644 index 000000000..0659588b2 --- /dev/null +++ b/testsGPU/pseudopotential/real/jobscripts/frontierJobScript18GCDs18MPITasks.rc @@ -0,0 +1,41 @@ +#!/ccs/home/dsambit/frontier/bin/rc +#SBATCH -A mat239 +#SBATCH -J gputests +#SBATCH -t 1:00:00 +#SBATCH -p batch +#SBATCH -N 3 +#SBATCH --gpus-per-node 6 +#SBATCH --ntasks-per-gpu 1 +#SBATCH --gpu-bind closest + +OMP_NUM_THREADS = 1 +MPICH_VERSION_DISPLAY=1 +MPICH_ENV_DISPLAY=1 +MPICH_OFI_NIC_POLICY = NUMA +MPICH_GPU_SUPPORT_ENABLED=1 +MPICH_SMP_SINGLE_COPY_MODE=NONE + +LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$INST/lib +LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$INST/lib/lib64 +LD_LIBRARY_PATH=$CRAY_LD_LIBRARY_PATH:$LD_LIBRARY_PATH + + +BASE = $WD/src/dftfeDebug/build/release/real +n=`{echo $SLURM_JOB_NUM_NODES '*' 8 | bc} + +srun -n 18 -c 7 --gpu-bind closest $BASE/dftfe Input_MD_0.prm > output_MD_0 +srun -n 18 -c 7 --gpu-bind closest $BASE/dftfe Input_MD_1.prm > output_MD_1 +srun -n 18 -c 7 --gpu-bind closest $BASE/dftfe Input_MD_2.prm > output_MD_2 +srun -n 18 -c 7 --gpu-bind closest $BASE/dftfe parameterFileMg2x_1.prm > outputMg2x_1 +srun -n 18 -c 7 --gpu-bind closest $BASE/dftfe parameterFileMg2x_1_spingpu.prm > outputMg2x_1_spin_gpu +srun -n 18 -c 7 --gpu-bind closest $BASE/dftfe parameterFileMg2x_2.prm > outputMg2x_2 +srun -n 18 -c 7 --gpu-bind closest $BASE/dftfe parameterFileMg2x_3.prm > outputMg2x_3 +srun -n 18 -c 7 --gpu-bind closest $BASE/dftfe parameterFileMg2x_4.prm > outputMg2x_4 +srun -n 18 -c 
7 --gpu-bind closest $BASE/dftfe parameterFileMg2x_5.prm > outputMg2x_5 +srun -n 18 -c 7 --gpu-bind closest $BASE/dftfe parameterFileMg2x_6.prm > outputMg2x_6 +srun -n 18 -c 7 --gpu-bind closest $BASE/dftfe parameterFileMg2x_7.prm > outputMg2x_7 +srun -n 18 -c 7 --gpu-bind closest $BASE/dftfe parameterFileMg2x_12.prm > outputMg2x_12 +srun -n 18 -c 7 --gpu-bind closest $BASE/dftfe parameterFileMg2x_13.prm > outputMg2x_13 +srun -n 18 -c 7 --gpu-bind closest $BASE/dftfe parameterFileBe.prm > outputBe + + diff --git a/testsGPU/pseudopotential/real/jobscripts/frontierJobScript6GCDs6MPITasks.rc b/testsGPU/pseudopotential/real/jobscripts/frontierJobScript6GCDs6MPITasks.rc new file mode 100644 index 000000000..9c051b5e5 --- /dev/null +++ b/testsGPU/pseudopotential/real/jobscripts/frontierJobScript6GCDs6MPITasks.rc @@ -0,0 +1,39 @@ +#!/ccs/home/dsambit/frontier/bin/rc +#SBATCH -A mat239 +#SBATCH -J gputests +#SBATCH -t 1:00:00 +#SBATCH -p batch +#SBATCH -N 1 +#SBATCH --gpus-per-node 6 +#SBATCH --ntasks-per-gpu 1 +#SBATCH --gpu-bind closest + +OMP_NUM_THREADS = 1 +MPICH_VERSION_DISPLAY=1 +MPICH_ENV_DISPLAY=1 +MPICH_OFI_NIC_POLICY = NUMA +MPICH_GPU_SUPPORT_ENABLED=1 +MPICH_SMP_SINGLE_COPY_MODE=NONE + +LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$INST/lib +LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$INST/lib/lib64 +LD_LIBRARY_PATH=$CRAY_LD_LIBRARY_PATH:$LD_LIBRARY_PATH + + +BASE = $WD/src/dftfeDebug/build/release/real +n=`{echo $SLURM_JOB_NUM_NODES '*' 8 | bc} + +srun -n 6 -c 7 --gpu-bind closest $BASE/dftfe Input_MD_0.prm > output_MD_0 +srun -n 6 -c 7 --gpu-bind closest $BASE/dftfe Input_MD_1.prm > output_MD_1 +srun -n 6 -c 7 --gpu-bind closest $BASE/dftfe Input_MD_2.prm > output_MD_2 +srun -n 6 -c 7 --gpu-bind closest $BASE/dftfe parameterFileMg2x_1.prm > outputMg2x_1 +srun -n 6 -c 7 --gpu-bind closest $BASE/dftfe parameterFileMg2x_1_spingpu.prm > outputMg2x_1_spin_gpu +srun -n 6 -c 7 --gpu-bind closest $BASE/dftfe parameterFileMg2x_2.prm > outputMg2x_2 +srun -n 6 -c 7 --gpu-bind closest 
$BASE/dftfe parameterFileMg2x_3.prm > outputMg2x_3 +srun -n 6 -c 7 --gpu-bind closest $BASE/dftfe parameterFileMg2x_4.prm > outputMg2x_4 +srun -n 6 -c 7 --gpu-bind closest $BASE/dftfe parameterFileMg2x_5.prm > outputMg2x_5 +srun -n 6 -c 7 --gpu-bind closest $BASE/dftfe parameterFileMg2x_6.prm > outputMg2x_6 +srun -n 6 -c 7 --gpu-bind closest $BASE/dftfe parameterFileMg2x_7.prm > outputMg2x_7 +srun -n 6 -c 7 --gpu-bind closest $BASE/dftfe parameterFileMg2x_12.prm > outputMg2x_12 +srun -n 6 -c 7 --gpu-bind closest $BASE/dftfe parameterFileMg2x_13.prm > outputMg2x_13 +srun -n 6 -c 7 --gpu-bind closest $BASE/dftfe parameterFileBe.prm > outputBe From 10d6a2b14e4581181125c861108661083e5b3b46 Mon Sep 17 00:00:00 2001 From: Nikhil Kodali Date: Sun, 15 Oct 2023 22:32:05 +0530 Subject: [PATCH 25/25] Comments for basis operations class --- include/FEBasisOperations.h | 337 ++++++++++++++++++++++- include/FEBasisOperationsKernelsDevice.h | 9 + 2 files changed, 338 insertions(+), 8 deletions(-) diff --git a/include/FEBasisOperations.h b/include/FEBasisOperations.h index 629c42252..62021d158 100644 --- a/include/FEBasisOperations.h +++ b/include/FEBasisOperations.h @@ -82,18 +82,47 @@ namespace dftfe tempQuadratureGradientsDataNonAffine; public: + /** + * @brief Constructor, fills required data structures using deal.ii's MatrixFree and AffineConstraints objects + * @param[in] matrixFreeData MatrixFree object. + * @param[in] constraintsVector std::vector of AffineConstraints, should + * be the same vector which was passed for the construction of the given + * MatrixFree object. + */ FEBasisOperationsBase( dealii::MatrixFree<3, ValueTypeBasisData> &matrixFreeData, std::vector *> &constraintsVector); + /** + * @brief Default Destructor + */ ~FEBasisOperationsBase() = default; + /** + * @brief fills required data structures for the given dofHandlerID + * @param[in] dofHandlerID dofHandler index to be used for getting data + * from the MatrixFree object. 
+ * @param[in] quadratureID std::vector of quadratureIDs to be used, should + * be the same IDs which were used during the construction of the given + * MatrixFree object. + */ void init(const unsigned int & dofHandlerID, const std::vector &quadratureID, const UpdateFlags updateFlags = update_values); + /** + * @brief sets internal variables and optionally resizes internal temp storage for interpolation operations + * @param[in] vecBlockSize block size to be used for operations on vectors, + * this has to be set to the exact value before any such operations are + * called. + * @param[in] cellBlockSize block size to be used for cells, this has to be + * set to a value greater than or equal to the required value before any + * such operations are called + * @param[in] quadratureID Quadrature index to be used. + * @param[in] isResizeTempStorage whether to resize internal tempstorage. + */ void reinit(const unsigned int &vecBlockSize, const unsigned int &cellBlockSize, @@ -113,54 +142,119 @@ namespace dftfe + /** + * @brief Initializes indexset maps from process level indices to cell level indices for a single vector, also initializes cell index to cellid map. + */ void initializeIndexMaps(); + /** + * @brief Initializes indexset maps from process level indices to cell level indices for multivectors. + */ void initializeFlattenedIndexMaps(); + /** + * @brief Initializes the constraintMatrixInfo object. + */ void initializeConstraints(); + /** + * @brief Constructs the MPIPatternP2P object. + */ void initializeMPIPattern(); + /** + * @brief Fill the shape function data and jacobian data in the ValueTypeBasisCoeff datatype. + */ void initializeShapeFunctionAndJacobianData(); + /** + * @brief Fill the shape function data and jacobian data in the ValueTypeBasisData datatype. + */ void initializeShapeFunctionAndJacobianBasisData(); + /** + * @brief Resizes the internal temp storage to be sufficient for the vector and cell block sizes provided in reinit. 
+ */ void resizeTempStorage(); + /** + * @brief Number of quadrature points per cell for the quadratureID set in reinit. + */ unsigned int nQuadsPerCell() const; + /** + * @brief Number of DoFs per cell for the dofHandlerID set in init. + */ unsigned int nDofsPerCell() const; + /** + * @brief Number of locally owned cells on the current processor. + */ unsigned int nCells() const; + /** + * @brief Number of DoFs on the current processor, locally owned + ghosts. + */ unsigned int nRelaventDofs() const; + /** + * @brief Number of locally owned DoFs on the current processor. + */ unsigned int nOwnedDofs() const; + /** + * @brief Shape function values at quadrature points. + * @param[in] transpose if false the data is indexed as [iQuad * + * d_nDofsPerCell + iNode] and if true it is indexed as [iNode * + * d_nQuadsPerCell + iQuad]. + */ const dftfe::utils::MemoryStorage & shapeFunctionData(bool transpose = false) const; + /** + * @brief Shape function gradient values at quadrature points. + * @param[in] transpose if false the data is indexed as [iDim * + * d_nQuadsPerCell * d_nDofsPerCell + iQuad * d_nDofsPerCell + iNode] and + * if true it is indexed as [iDim * d_nQuadsPerCell * d_nDofsPerCell + + * iNode * d_nQuadsPerCell + iQuad]. + */ const dftfe::utils::MemoryStorage & shapeFunctionGradientData(bool transpose = false) const; + /** + * @brief Inverse Jacobian matrices, for cartesian cells returns the + * diagonal elements of the inverse Jacobian matrices for each cell, for + * affine cells returns the 3x3 inverse Jacobians for each cell otherwise + * returns the 3x3 inverse Jacobians at each quad point for each cell. + */ const dftfe::utils::MemoryStorage & inverseJacobians() const; + /** + * @brief determinant of Jacobian times the quadrature weight at each + * quad point for each cell. + */ const dftfe::utils::MemoryStorage & JxW() const; + /** + * @brief Shape function values at quadrature points in ValueTypeBasisData. 
+ * @param[in] transpose if false the data is indexed as [iQuad * + * d_nDofsPerCell + iNode] and if true it is indexed as [iNode * + * d_nQuadsPerCell + iQuad]. + */ template ::value, int> = 0> @@ -172,6 +266,13 @@ namespace dftfe const dftfe::utils::MemoryStorage & shapeFunctionBasisData(bool transpose = false) const; + /** + * @brief Shape function gradient values at quadrature points in ValueTypeBasisData. + * @param[in] transpose if false the data is indexed as [iDim * + * d_nQuadsPerCell * d_nDofsPerCell + iQuad * d_nDofsPerCell + iNode] and + * if true it is indexed as [iDim * d_nQuadsPerCell * d_nDofsPerCell + + * iNode * d_nQuadsPerCell + iQuad]. + */ template ::value, int> = 0> @@ -183,6 +284,12 @@ namespace dftfe const dftfe::utils::MemoryStorage & shapeFunctionGradientBasisData(bool transpose = false) const; + /** + * @brief Inverse Jacobian matrices in ValueTypeBasisData, for cartesian cells returns the + * diagonal elements of the inverse Jacobian matrices for each cell, for + * affine cells returns the 3x3 inverse Jacobians for each cell otherwise + * returns the 3x3 inverse Jacobians at each quad point for each cell. + */ template ::value, int> = 0> @@ -194,6 +301,10 @@ namespace dftfe const dftfe::utils::MemoryStorage & inverseJacobiansBasisData() const; + /** + * @brief determinant of Jacobian times the quadrature weight in ValueTypeBasisData at each + * quad point for each cell. + */ template ::value, int> = 0> @@ -205,35 +316,61 @@ namespace dftfe const dftfe::utils::MemoryStorage & JxWBasisData() const; + /** + * @brief returns 2 if all cells on current processor are Cartesian, + * 1 if all cells on current processor are affine and 0 otherwise. + */ unsigned int cellsTypeFlag() const; + /** + * @brief returns the deal.ii cellID corresponding to given cell Index. + * @param[in] iElem cell Index + */ dealii::CellId cellID(const unsigned int iElem) const; + /** + * @brief Creates a multivector. 
+ * @param[in] blocksize Number of vectors in the multivector. + * @param[out] multiVector the created multivector. + */ void createMultiVector( const unsigned int blocksize, dftfe::linearAlgebra::MultiVector &multiVector) const; + /** + * @brief Creates scratch multivectors. + * @param[in] vecBlockSize Number of vectors in the multivector. + * @param[in] numMultiVecs number of scratch multivectors needed with + * this vecBlockSize. + */ void createScratchMultiVectors(const unsigned int vecBlockSize, const unsigned int numMultiVecs = 1) const; + /** + * @brief Clears scratch multivectors. + */ void clearScratchMultiVectors() const; + /** + * @brief Gets scratch multivectors. + * @param[in] vecBlockSize Number of vectors in the multivector. + * @param[in] index index of the multivector among those with the + * same vecBlockSize. + */ dftfe::linearAlgebra::MultiVector & getMultiVector(const unsigned int vecBlockSize, const unsigned int index = 0) const; - void - getMultiVector( - const unsigned int blocksize, - dftfe::linearAlgebra::MultiVector - &multiVector) const; - + /** + * @brief Apply constraints on given multivector. + * @param[inout] multiVector the given multivector. + */ void distribute( dftfe::linearAlgebra::MultiVector @@ -419,6 +556,16 @@ namespace dftfe dftfe::utils::MemorySpace::HOST>::d_constraintsVector; + /** + * @brief Interpolate process level nodal data to cell level quadrature data. + * @param[in] nodalData process level nodal data, the multivector should + * already have ghost data and constraints should have been applied. + * @param[out] quadratureValues Cell level quadrature values, indexed by + * [iCell * d_nQuadsPerCell * d_nVectors + iQuad * d_nVectors + iVec]. + * @param[out] quadratureGradients Cell level quadrature gradients, + * indexed by [iCell * 3 * d_nQuadsPerCell * d_nVectors + iDim * + * d_nQuadsPerCell * d_nVectors + iQuad * d_nVectors + iVec]. 
+ */ void interpolate( dftfe::linearAlgebra::MultiVector &nodalData) const; + /** + * @brief Get cell level nodal data from process level nodal data. + * @param[in] nodalData process level nodal data, the multivector should + * already have ghost data and constraints should have been applied. + * @param[out] cellNodalDataPtr Cell level nodal values, indexed by + * [iCell * d_nDofsPerCell * d_nVectors + iDoF * d_nVectors + iVec]. + */ void extractToCellNodalData( dftfe::linearAlgebra::MultiVector & nodalData, ValueTypeBasisCoeff *cellNodalDataPtr) const; - + // FIXME Untested function + /** + * @brief Accumulate cell level nodal data into process level nodal data. + * @param[in] cellNodalDataPtr Cell level nodal values, indexed by + * [iCell * d_nDofsPerCell * d_nVectors + iDoF * d_nVectors + iVec]. + * @param[out] nodalData process level nodal data. + */ void accumulateFromCellNodalData( const ValueTypeBasisCoeff *cellNodalDataPtr, @@ -450,6 +619,18 @@ namespace dftfe dftfe::utils::MemorySpace::HOST> &nodalData) const; + /** + * @brief Interpolate process level nodal data to cell level quadrature data. + * @param[in] nodalData process level nodal data, the multivector should + * already have ghost data and constraints should have been applied. + * @param[out] quadratureValues Cell level quadrature values, indexed by + * [iCell * d_nQuadsPerCell * d_nVectors + iQuad * d_nVectors + iVec]. + * @param[out] quadratureGradients Cell level quadrature gradients, + * indexed by [iCell * 3 * d_nQuadsPerCell * d_nVectors + iDim * + * d_nQuadsPerCell * d_nVectors + iQuad * d_nVectors + iVec]. + * @param[in] cellRange the range of cells for which interpolation has to + * be done. + */ void interpolateKernel( const dftfe::linearAlgebra::MultiVector cellRange) const; + + /** + * @brief Interpolate cell level nodal data to cell level quadrature data. 
+ * @param[in] nodalData cell level nodal data, the multivector should + * already have ghost data and constraints should have been applied. + * @param[out] quadratureValues Cell level quadrature values, indexed by + * [iCell * d_nQuadsPerCell * d_nVectors + iQuad * d_nVectors + iVec]. + * @param[out] quadratureGradients Cell level quadrature gradients, + * indexed by [iCell * 3 * d_nQuadsPerCell * d_nVectors + iDim * + * d_nQuadsPerCell * d_nVectors + iQuad * d_nVectors + iVec]. + * @param[in] cellRange the range of cells for which interpolation has to + * be done. + */ void interpolateKernel( const ValueTypeBasisCoeff * nodalData, @@ -465,6 +659,18 @@ namespace dftfe ValueTypeBasisCoeff * quadratureGradients, const std::pair cellRange) const; + // FIXME Untested function + /** + * @brief Integrate cell level quadrature data times shape functions to process level nodal data. + * @param[in] quadratureValues Cell level quadrature values, indexed by + * [iCell * d_nQuadsPerCell * d_nVectors + iQuad * d_nVectors + iVec]. + * @param[in] quadratureGradients Cell level quadrature gradients, + * indexed by [iCell * 3 * d_nQuadsPerCell * d_nVectors + iDim * + * d_nQuadsPerCell * d_nVectors + iQuad * d_nVectors + iVec]. + * @param[out] nodalData process level nodal data. + * @param[in] cellRange the range of cells for which integration has to be + * done. + */ void integrateWithBasisKernel( const ValueTypeBasisCoeff *quadratureValues, @@ -475,6 +681,15 @@ namespace dftfe const std::pair cellRange) const; + /** + * @brief Get cell level nodal data from process level nodal data. + * @param[in] nodalData process level nodal data, the multivector should + * already have ghost data and constraints should have been applied. + * @param[out] cellNodalDataPtr Cell level nodal values, indexed by + * [iCell * d_nDofsPerCell * d_nVectors + iDoF * d_nVectors + iVec]. + * @param[in] cellRange the range of cells for which extraction has to be + * done. 
+ */ void extractToCellNodalDataKernel( const dftfe::linearAlgebra::MultiVector cellRange) const; + // FIXME Untested function + /** + * @brief Accumulate cell level nodal data into process level nodal data. + * @param[in] cellNodalDataPtr Cell level nodal values, indexed by + * [iCell * d_nDofsPerCell * d_nVectors + iDoF * d_nVectors + iVec]. + * @param[out] nodalData process level nodal data. + * @param[in] cellRange the range of cells for which accumulation has to be + * done. + */ void accumulateFromCellNodalDataKernel( const ValueTypeBasisCoeff *cellNodalDataPtr, @@ -604,16 +828,34 @@ namespace dftfe ValueTypeBasisData, dftfe::utils::MemorySpace::DEVICE>::d_constraintsVector; + // FIXME has to be removed in a future PR + /** + * @brief sets device blas handle for internal blas operations. + */ dftfe::utils::deviceBlasHandle_t *d_deviceBlasHandlePtr; void setDeviceBLASHandle( dftfe::utils::deviceBlasHandle_t *deviceBlasHandlePtr); + // FIXME has to be removed in a future PR + /** + * @brief gets device blas handle for blas operations. + */ dftfe::utils::deviceBlasHandle_t & getDeviceBLASHandle(); + /** + * @brief Interpolate process level nodal data to cell level quadrature data. + * @param[in] nodalData process level nodal data, the multivector should + * already have ghost data and constraints should have been applied. + * @param[out] quadratureValues Cell level quadrature values, indexed by + * [iCell * d_nQuadsPerCell * d_nVectors + iQuad * d_nVectors + iVec]. + * @param[out] quadratureGradients Cell level quadrature gradients, + * indexed by [iCell * 3 * d_nQuadsPerCell * d_nVectors + iDim * + * d_nQuadsPerCell * d_nVectors + iQuad * d_nVectors + iVec]. + */ void interpolate( dftfe::linearAlgebra::MultiVector &nodalData) const; + /** + * @brief Get cell level nodal data from process level nodal data. + * @param[in] nodalData process level nodal data, the multivector should + * already have ghost data and constraints should have been applied. 
+ * @param[out] cellNodalDataPtr Cell level nodal values, indexed by + * [iCell * d_nDofsPerCell * d_nVectors + iDoF * d_nVectors + iVec]. + */ void extractToCellNodalData( dftfe::linearAlgebra::MultiVector &nodalData) const; + /** + * @brief Interpolate process level nodal data to cell level quadrature data. + * @param[in] nodalData process level nodal data, the multivector should + * already have ghost data and constraints should have been applied. + * @param[out] quadratureValues Cell level quadrature values, indexed by + * [iCell * d_nQuadsPerCell * d_nVectors + iQuad * d_nVectors + iVec]. + * @param[out] quadratureGradients Cell level quadrature gradients, + * indexed by [iCell * 3 * d_nQuadsPerCell * d_nVectors + iDim * + * d_nQuadsPerCell * d_nVectors + iQuad * d_nVectors + iVec]. + * @param[in] cellRange the range of cells for which interpolation has to + * be done. + */ void interpolateKernel( const dftfe::linearAlgebra::MultiVector< @@ -653,6 +931,19 @@ namespace dftfe ValueTypeBasisCoeff * quadratureValues, ValueTypeBasisCoeff * quadratureGradients, const std::pair cellRange) const; + + /** + * @brief Interpolate cell level nodal data to cell level quadrature data. + * @param[in] nodalData cell level nodal data, the multivector should + * already have ghost data and constraints should have been applied. + * @param[out] quadratureValues Cell level quadrature values, indexed by + * [iCell * d_nQuadsPerCell * d_nVectors + iQuad * d_nVectors + iVec]. + * @param[out] quadratureGradients Cell level quadrature gradients, + * indexed by [iCell * 3 * d_nQuadsPerCell * d_nVectors + iDim * + * d_nQuadsPerCell * d_nVectors + iQuad * d_nVectors + iVec]. + * @param[in] cellRange the range of cells for which interpolation has to + * be done. 
+ */ void interpolateKernel( const ValueTypeBasisCoeff * nodalData, @@ -660,6 +951,18 @@ namespace dftfe ValueTypeBasisCoeff * quadratureGradients, const std::pair cellRange) const; + // FIXME Untested function + /** + * @brief Integrate cell level quadrature data times shape functions to process level nodal data. + * @param[in] quadratureValues Cell level quadrature values, indexed by + * [iCell * d_nQuadsPerCell * d_nVectors + iQuad * d_nVectors + iVec]. + * @param[in] quadratureGradients Cell level quadrature gradients, + * indexed by [iCell * 3 * d_nQuadsPerCell * d_nVectors + iDim * + * d_nQuadsPerCell * d_nVectors + iQuad * d_nVectors + iVec]. + * @param[out] nodalData process level nodal data. + * @param[in] cellRange the range of cells for which integration has to be + * done. + */ void integrateWithBasisKernel( const ValueTypeBasisCoeff *quadratureValues, @@ -670,6 +973,15 @@ namespace dftfe const std::pair cellRange) const; + /** + * @brief Get cell level nodal data from process level nodal data. + * @param[in] nodalData process level nodal data, the multivector should + * already have ghost data and constraints should have been applied. + * @param[out] cellNodalDataPtr Cell level nodal values, indexed by + * [iCell * d_nDofsPerCell * d_nVectors + iDoF * d_nVectors + iVec]. + * @param[in] cellRange the range of cells for which extraction has to be + * done. + */ void extractToCellNodalDataKernel( const dftfe::linearAlgebra::MultiVector< @@ -678,6 +990,15 @@ namespace dftfe ValueTypeBasisCoeff * cellNodalDataPtr, const std::pair cellRange) const; + // FIXME Untested function + /** + * @brief Accumulate cell level nodal data into process level nodal data. + * @param[in] cellNodalDataPtr Cell level nodal values, indexed by + * [iCell * d_nDofsPerCell * d_nVectors + iDoF * d_nVectors + iVec]. + * @param[out] nodalData process level nodal data. + * @param[in] cellRange the range of cells for which accumulation has to be + * done. 
+ */ void accumulateFromCellNodalDataKernel( const ValueTypeBasisCoeff *cellNodalDataPtr, diff --git a/include/FEBasisOperationsKernelsDevice.h b/include/FEBasisOperationsKernelsDevice.h index e33fdf3cd..8a38c53a8 100644 --- a/include/FEBasisOperationsKernelsDevice.h +++ b/include/FEBasisOperationsKernelsDevice.h @@ -26,6 +26,15 @@ namespace dftfe { namespace FEBasisOperationsKernelsDevice { + /** + * @brief reshape gradient data from [iCell * 3 * d_nQuadsPerCell * d_nVectors + iQuad * 3 * d_nVectors + iDim * d_nVectors + iVec] to [iCell * 3 * d_nQuadsPerCell * d_nVectors + iDim * + * d_nQuadsPerCell * d_nVectors + iQuad * d_nVectors + iVec]. + * @param[in] numVecs number of vectors. + * @param[in] numQuads number of quadrature points per cell. + * @param[in] numCells number of locally owned cells. + * @param[in] copyFromVec source data pointer. + * @param[out] copyToVec destination data pointer. + */ template void reshapeNonAffineCase(const dftfe::size_type numVecs,