diff --git a/CMakeLists.txt b/CMakeLists.txt
index 8e26cd7a8..93dd0db76 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -47,7 +47,7 @@ SET(TARGET_SRC
   ./src/dft/dft.cc
   ./src/dft/vselfBinsManager.cc
   ./src/dft/energyCalculator.cc
-  ./src/dft/densityCalculatorCPU.cc
+  ./src/dft/densityCalculator.cc
   ./src/dft/densityFirstOrderResponseCalculatorCPU.cc
   ./src/excManager/excDensityBaseClass.cpp
   ./src/excManager/excDensityLDAClass.cpp
@@ -166,7 +166,7 @@ SET(DEVICE_SRC
   ./utils/MemoryTransferKernelsDevice.cc
   ./utils/DeviceKernelsGeneric.cc
   ./utils/DeviceDirectCCLWrapper.cc
-  ./src/dft/densityCalculatorDevice.cc
+  ./src/dft/densityCalculatorDeviceKernels.cc
   ./src/dft/densityFirstOrderResponseCalculatorDevice.cc
   ./src/dftOperator/operatorDevice.cc
   ./src/dftOperator/kohnShamDFTOperatorDevice.cc
@@ -185,6 +185,7 @@ SET(DEVICE_SRC
   ./src/solvers/linearSolverProblemDevice.cc
   ./src/poisson/poissonSolverProblemDevice.cc
   ./src/helmholtz/kerkerSolverProblemDevice.cc
+  ./utils/FEBasisOperationsKernelsDevice.cc
   )
 
 ELSEIF ("${GPU_LANG}" STREQUAL "hip")
@@ -193,7 +194,7 @@ SET(DEVICE_SRC
   ./utils/MemoryTransferKernelsDevice.cc
   ./utils/DeviceKernelsGeneric.cc
   ./utils/DeviceDirectCCLWrapper.cc
-  ./src/dft/densityCalculatorDevice.cc
+  ./src/dft/densityCalculatorDeviceKernels.cc
   ./src/dft/densityFirstOrderResponseCalculatorDevice.cc
   ./src/dftOperator/operatorDevice.cc
   ./src/dftOperator/kohnShamDFTOperatorDevice.cc
@@ -212,6 +213,7 @@ SET(DEVICE_SRC
   ./src/solvers/linearSolverProblemDevice.cc
   ./src/poisson/poissonSolverProblemDevice.cc
   ./src/helmholtz/kerkerSolverProblemDevice.cc
+  ./utils/FEBasisOperationsKernelsDevice.cc
   )
 
 ENDIF()
@@ -222,7 +224,11 @@ IF (WITH_GPU)
        set_source_files_properties(${DEVICE_SRC} PROPERTIES LANGUAGE CUDA)
     ELSEIF ("${GPU_LANG}" STREQUAL "hip")
        set_source_files_properties(${DEVICE_SRC} PROPERTIES LANGUAGE HIP)
-       ADD_DEFINITIONS(-D__HIP_PLATFORM_AMD__)
+       IF ("${GPU_VENDOR}" STREQUAL "nvidia")
+          ADD_DEFINITIONS(-D__HIP_PLATFORM_NVIDIA__)
+       ELSE() # "amd", unset or unknown vendor: keep the previous unconditional AMD default
+          ADD_DEFINITIONS(-D__HIP_PLATFORM_AMD__)
+       ENDIF()
     ENDIF()
 ENDIF()
 IF (WITH_GPU)
diff --git a/include/DeviceBlasWrapper.h b/include/DeviceBlasWrapper.h
index 0331c7e45..4925b51f2 100644
--- a/include/DeviceBlasWrapper.h
+++ b/include/DeviceBlasWrapper.h
@@ -260,6 +260,62 @@ namespace dftfe
                          long long int              strideC,
                          int                        batchCount);
 
+      deviceBlasStatus_t // gemv, double overload; NOTE(review): presumably BLAS gemv semantics -- confirm in the implementation
+      gemv(deviceBlasHandle_t    handle,
+           deviceBlasOperation_t trans,
+           int                   m,
+           int                   n,
+           const double *        alpha,
+           const double *        A,
+           int                   lda,
+           const double *        x,
+           int                   incx,
+           const double *        beta,
+           double *              y,
+           int                   incy);
+      // gemv, float overload; same parameter order as the double overload.
+      deviceBlasStatus_t
+      gemv(deviceBlasHandle_t    handle,
+           deviceBlasOperation_t trans,
+           int                   m,
+           int                   n,
+           const float *         alpha,
+           const float *         A,
+           int                   lda,
+           const float *         x,
+           int                   incx,
+           const float *         beta,
+           float *               y,
+           int                   incy);
+      // gemv, std::complex<double> overload; same parameter order.
+      deviceBlasStatus_t
+      gemv(deviceBlasHandle_t          handle,
+           deviceBlasOperation_t       trans,
+           int                         m,
+           int                         n,
+           const std::complex<double> *alpha,
+           const std::complex<double> *A,
+           int                         lda,
+           const std::complex<double> *x,
+           int                         incx,
+           const std::complex<double> *beta,
+           std::complex<double> *      y,
+           int                         incy);
+      // gemv, std::complex<float> overload; same parameter order.
+      deviceBlasStatus_t
+      gemv(deviceBlasHandle_t         handle,
+           deviceBlasOperation_t      trans,
+           int                        m,
+           int                        n,
+           const std::complex<float> *alpha,
+           const std::complex<float> *A,
+           int                        lda,
+           const std::complex<float> *x,
+           int                        incx,
+           const std::complex<float> *beta,
+           std::complex<float> *      y,
+           int                        incy);
+
 
     } // namespace deviceBlasWrapper
   }   // namespace utils
diff --git a/include/FEBasisOperations.h b/include/FEBasisOperations.h
new file mode 100644
index 000000000..62021d158
--- /dev/null
+++ b/include/FEBasisOperations.h
@@ -0,0 +1,1019 @@
+// ---------------------------------------------------------------------
+//
+// Copyright (c) 2017-2022  The Regents of the University of Michigan and DFT-FE
+// authors.
+//
+// This file is part of the DFT-FE code.
+//
+// The DFT-FE code is free software; you can use it, redistribute
+// it, and/or modify it under the terms of the GNU Lesser General
+// Public License as published by the Free Software Foundation; either
+// version 2.1 of the License, or (at your option) any later version.
+// The full text of the license can be found in the file LICENSE at
+// the top level of the DFT-FE distribution.
+//
+// ---------------------------------------------------------------------
+//
+
+#ifndef dftfeFEBasisOperations_h
+#define dftfeFEBasisOperations_h
+
+#include <MultiVector.h>
+#include <headers.h>
+#include <constraintMatrixInfo.h>
+#include <constraintMatrixInfoDevice.h>
+#include <DeviceTypeConfig.h>
+
+namespace dftfe
+{
+  namespace basis
+  {
+    enum UpdateFlags // bit flags; combined with the |, |=, & and &= operators below
+    {
+      update_default = 0,
+      // Bit 0. NOTE(review): presumably requests shape function values -- confirm.
+      update_values = 0x0001,
+      // Bit 1. NOTE(review): presumably requests shape function gradients -- confirm.
+      update_gradients = 0x0002,
+      // Bit 2. NOTE(review): presumably requests the transposed layouts -- confirm.
+      update_transpose = 0x0004
+    };
+
+    inline UpdateFlags
+    operator|(const UpdateFlags f1, const UpdateFlags f2)
+    { // Union of two flag sets, computed on their unsigned bit patterns.
+      using U = unsigned int;
+      return static_cast<UpdateFlags>(static_cast<U>(f1) | static_cast<U>(f2));
+    }
+
+
+
+    inline UpdateFlags &
+    operator|=(UpdateFlags &f1, const UpdateFlags f2)
+    {
+      // Merge the bits of f2 into f1 in place; the assignment yields f1 itself.
+      return f1 = f1 | f2;
+    }
+
+
+    inline UpdateFlags operator&(const UpdateFlags f1, const UpdateFlags f2)
+    {
+      using U = unsigned int; // intersect on the unsigned bit patterns
+      return static_cast<UpdateFlags>(static_cast<U>(f1) & static_cast<U>(f2));
+    }
+
+
+    inline UpdateFlags &
+    operator&=(UpdateFlags &f1, const UpdateFlags f2)
+    {
+      // Keep only the bits of f1 that are also set in f2, then hand back f1.
+      return f1 = f1 & f2;
+    }
+
+
+    template <typename ValueTypeBasisCoeff,
+              typename ValueTypeBasisData,
+              dftfe::utils::MemorySpace memorySpace>
+    class FEBasisOperationsBase // base class templated on memorySpace; stores basis data and index maps
+    {
+    protected: // mutable scratch buffers; NOTE(review): presumably sized by resizeTempStorage() -- confirm
+      mutable dftfe::utils::MemoryStorage<ValueTypeBasisCoeff, memorySpace>
+        tempCellNodalData, tempQuadratureGradientsData,
+        tempQuadratureGradientsDataNonAffine;
+
+    public:
+      /**
+       * @brief Constructor, fills required data structures using deal.ii's MatrixFree and AffineConstraints objects
+       * @param[in] matrixFreeData MatrixFree object.
+       * @param[in] constraintsVector std::vector of AffineConstraints, should
+       * be the same vector which was passed for the construction of the given
+       * MatrixFree object.
+       */
+      FEBasisOperationsBase(
+        dealii::MatrixFree<3, ValueTypeBasisData> &matrixFreeData,
+        std::vector<const dealii::AffineConstraints<ValueTypeBasisData> *>
+          &constraintsVector);
+
+      /**
+       * @brief Default Destructor
+       */
+      ~FEBasisOperationsBase() = default;
+
+      /**
+       * @brief fills required data structures for the given dofHandlerID
+       * @param[in] dofHandlerID dofHandler index to be used for getting data
+       * from the MatrixFree object.
+       * @param[in] quadratureID std::vector of quadratureIDs to be used, should
+       * be the same IDs used when constructing the given MatrixFree object.
+       * @param[in] updateFlags UpdateFlags bitmask, defaults to update_values.
+       */
+      void
+      init(const unsigned int &             dofHandlerID,
+           const std::vector<unsigned int> &quadratureID,
+           const UpdateFlags                updateFlags = update_values);
+
+      /**
+       * @brief sets internal variables and optionally resizes internal temp storage for interpolation operations
+       * @param[in] vecBlockSize block size to use for operations on vectors,
+       * this has to be set to the exact value before any such operations are
+       * called.
+       * @param[in] cellBlockSize block size to use for cells, this has to be
+       * set to a value greater than or equal to the required value before any
+       * such operations are called
+       * @param[in] quadratureID Quadrature index to be used.
+       * @param[in] isResizeTempStorage whether to resize internal tempstorage.
+       */
+      void
+      reinit(const unsigned int &vecBlockSize,
+             const unsigned int &cellBlockSize,
+             const unsigned int &quadratureID,
+             const bool          isResizeTempStorage = true);
+
+      // private:
+#if defined(DFTFE_WITH_DEVICE)
+      using constraintInfoClass =
+        typename std::conditional<memorySpace ==
+                                    dftfe::utils::MemorySpace::DEVICE,
+                                  dftUtils::constraintMatrixInfoDevice,
+                                  dftUtils::constraintMatrixInfo>::type;
+#else
+      using constraintInfoClass = dftUtils::constraintMatrixInfo;
+#endif
+
+
+
+      /**
+       * @brief Initializes indexset maps from process level indices to cell level indices for a single vector, also initializes cell index to cellid map.
+       */
+      void
+      initializeIndexMaps();
+
+      /**
+       * @brief Initializes indexset maps from process level indices to cell level indices for multivectors.
+       */
+      void
+      initializeFlattenedIndexMaps();
+
+      /**
+       * @brief Initializes the constraintMatrixInfo object.
+       */
+      void
+      initializeConstraints();
+
+      /**
+       * @brief Constructs the MPIPatternP2P object.
+       */
+      void
+      initializeMPIPattern();
+
+      /**
+       * @brief Fill the shape function data and jacobian data in the ValueTypeBasisCoeff datatype.
+       */
+      void
+      initializeShapeFunctionAndJacobianData();
+
+      /**
+       * @brief Fill the shape function data and jacobian data in the ValueTypeBasisData datatype.
+       */
+      void
+      initializeShapeFunctionAndJacobianBasisData();
+
+      /**
+       * @brief Resizes the internal temp storage to be sufficient for the vector and cell block sizes provided in reinit.
+       */
+      void
+      resizeTempStorage();
+
+      /**
+       * @brief Number of quadrature points per cell for the quadratureID set in reinit.
+       */
+      unsigned int
+      nQuadsPerCell() const;
+
+      /**
+       * @brief Number of DoFs per cell for the dofHandlerID set in init.
+       */
+      unsigned int
+      nDofsPerCell() const;
+
+      /**
+       * @brief Number of locally owned cells on the current processor.
+       */
+      unsigned int
+      nCells() const;
+
+      /**
+       * @brief Number of DoFs on the current processor, locally owned + ghosts.
+       */
+      unsigned int
+      nRelaventDofs() const;
+
+      /**
+       * @brief Number of locally owned DoFs on the current processor.
+       */
+      unsigned int
+      nOwnedDofs() const;
+
+      /**
+       * @brief Shape function values at quadrature points.
+       * @param[in] transpose if false the data is indexed as [iQuad *
+       * d_nDofsPerCell + iNode] and if true it is indexed as [iNode *
+       * d_nQuadsPerCell + iQuad].
+       */
+      const dftfe::utils::MemoryStorage<ValueTypeBasisCoeff, memorySpace> &
+      shapeFunctionData(bool transpose = false) const;
+
+      /**
+       * @brief Shape function gradient values at quadrature points.
+       * @param[in] transpose if false the data is indexed as [iDim *
+       * d_nQuadsPerCell * d_nDofsPerCell + iQuad * d_nDofsPerCell + iNode] and
+       * if true it is indexed as [iDim * d_nQuadsPerCell * d_nDofsPerCell +
+       * iNode * d_nQuadsPerCell + iQuad].
+       */
+      const dftfe::utils::MemoryStorage<ValueTypeBasisCoeff, memorySpace> &
+      shapeFunctionGradientData(bool transpose = false) const;
+
+      /**
+       * @brief Inverse Jacobian matrices, for cartesian cells returns the
+       * diagonal elements of the inverse Jacobian matrices for each cell, for
+       * affine cells returns the 3x3 inverse Jacobians for each cell otherwise
+       * returns the 3x3 inverse Jacobians at each quad point for each cell.
+       */
+      const dftfe::utils::MemoryStorage<ValueTypeBasisCoeff, memorySpace> &
+      inverseJacobians() const;
+
+      /**
+       * @brief determinant of Jacobian times the quadrature weight at each
+       * quad point for each cell.
+       */
+      const dftfe::utils::MemoryStorage<ValueTypeBasisCoeff, memorySpace> &
+      JxW() const;
+
+      /**
+       * @brief Shape function values at quadrature points in ValueTypeBasisData.
+       * @param[in] transpose if false the data is indexed as [iQuad *
+       * d_nDofsPerCell + iNode] and if true it is indexed as [iNode *
+       * d_nQuadsPerCell + iQuad].
+       */
+      template <typename A = ValueTypeBasisCoeff,
+                typename B = ValueTypeBasisData,
+                typename std::enable_if_t<std::is_same<A, B>::value, int> = 0>
+      const dftfe::utils::MemoryStorage<ValueTypeBasisData, memorySpace> &
+      shapeFunctionBasisData(bool transpose = false) const;
+      template <typename A = ValueTypeBasisCoeff,
+                typename B = ValueTypeBasisData,
+                typename std::enable_if_t<!std::is_same<A, B>::value, int> = 0>
+      const dftfe::utils::MemoryStorage<ValueTypeBasisData, memorySpace> &
+      shapeFunctionBasisData(bool transpose = false) const;
+
+      /**
+       * @brief Shape function gradient values at quadrature points in ValueTypeBasisData.
+       * @param[in] transpose if false the data is indexed as [iDim *
+       * d_nQuadsPerCell * d_nDofsPerCell + iQuad * d_nDofsPerCell + iNode] and
+       * if true it is indexed as [iDim * d_nQuadsPerCell * d_nDofsPerCell +
+       * iNode * d_nQuadsPerCell + iQuad].
+       */
+      template <typename A = ValueTypeBasisCoeff,
+                typename B = ValueTypeBasisData,
+                typename std::enable_if_t<std::is_same<A, B>::value, int> = 0>
+      const dftfe::utils::MemoryStorage<ValueTypeBasisData, memorySpace> &
+      shapeFunctionGradientBasisData(bool transpose = false) const;
+      template <typename A = ValueTypeBasisCoeff,
+                typename B = ValueTypeBasisData,
+                typename std::enable_if_t<!std::is_same<A, B>::value, int> = 0>
+      const dftfe::utils::MemoryStorage<ValueTypeBasisData, memorySpace> &
+      shapeFunctionGradientBasisData(bool transpose = false) const;
+
+      /**
+       * @brief Inverse Jacobian matrices in ValueTypeBasisData, for cartesian cells returns the
+       * diagonal elements of the inverse Jacobian matrices for each cell, for
+       * affine cells returns the 3x3 inverse Jacobians for each cell otherwise
+       * returns the 3x3 inverse Jacobians at each quad point for each cell.
+       */
+      template <typename A = ValueTypeBasisCoeff,
+                typename B = ValueTypeBasisData,
+                typename std::enable_if_t<std::is_same<A, B>::value, int> = 0>
+      const dftfe::utils::MemoryStorage<ValueTypeBasisData, memorySpace> &
+      inverseJacobiansBasisData() const;
+      template <typename A = ValueTypeBasisCoeff,
+                typename B = ValueTypeBasisData,
+                typename std::enable_if_t<!std::is_same<A, B>::value, int> = 0>
+      const dftfe::utils::MemoryStorage<ValueTypeBasisData, memorySpace> &
+      inverseJacobiansBasisData() const;
+
+      /**
+       * @brief determinant of Jacobian times the quadrature weight in ValueTypeBasisData at each
+       * quad point for each cell.
+       */
+      template <typename A = ValueTypeBasisCoeff,
+                typename B = ValueTypeBasisData,
+                typename std::enable_if_t<std::is_same<A, B>::value, int> = 0>
+      const dftfe::utils::MemoryStorage<ValueTypeBasisData, memorySpace> &
+      JxWBasisData() const;
+      template <typename A = ValueTypeBasisCoeff,
+                typename B = ValueTypeBasisData,
+                typename std::enable_if_t<!std::is_same<A, B>::value, int> = 0>
+      const dftfe::utils::MemoryStorage<ValueTypeBasisData, memorySpace> &
+      JxWBasisData() const;
+
+      /**
+       * @brief returns 2 if all cells on current processor are Cartesian,
+       * 1 if all cells on current processor are affine and 0 otherwise.
+       */
+      unsigned int
+      cellsTypeFlag() const;
+
+      /**
+       * @brief returns the deal.ii cellID corresponding to given cell Index.
+       * @param[in] iElem cell Index
+       */
+      dealii::CellId
+      cellID(const unsigned int iElem) const;
+
+      /**
+       * @brief Creates a multivector.
+       * @param[in] blocksize Number of vectors in the multivector.
+       * @param[out] multiVector the created multivector.
+       */
+      void
+      createMultiVector(
+        const unsigned int blocksize,
+        dftfe::linearAlgebra::MultiVector<ValueTypeBasisCoeff, memorySpace>
+          &multiVector) const;
+
+      /**
+       * @brief Creates scratch multivectors.
+       * @param[in] vecBlockSize Number of vectors in the multivector.
+       * @param[in] numMultiVecs number of scratch multivectors needed with
+       * this vecBlockSize.
+       */
+      void
+      createScratchMultiVectors(const unsigned int vecBlockSize,
+                                const unsigned int numMultiVecs = 1) const;
+
+      /**
+       * @brief Clears scratch multivectors.
+       */
+      void
+      clearScratchMultiVectors() const;
+
+      /**
+       * @brief Gets scratch multivectors.
+       * @param[in] vecBlockSize Number of vectors in the multivector.
+       * @param[in] index index of the multivector among those with the
+       * same vecBlockSize.
+       */
+      dftfe::linearAlgebra::MultiVector<ValueTypeBasisCoeff, memorySpace> &
+      getMultiVector(const unsigned int vecBlockSize,
+                     const unsigned int index = 0) const;
+
+      /**
+       * @brief Apply constraints on given multivector.
+       * @param[in,out] multiVector the given multivector.
+       */
+      void
+      distribute(
+        dftfe::linearAlgebra::MultiVector<ValueTypeBasisCoeff, memorySpace>
+          &multiVector) const;
+
+      // NOTE(review): the "private:" above is commented out; members are public.
+
+      constraintInfoClass d_constraintInfo;
+      std::vector<const dealii::AffineConstraints<ValueTypeBasisData> *>
+        *                                              d_constraintsVector;
+      const dealii::MatrixFree<3, ValueTypeBasisData> *d_matrixFreeDataPtr;
+      dftfe::utils::MemoryStorage<dftfe::global_size_type,
+                                  dftfe::utils::MemorySpace::HOST>
+        d_cellDofIndexToProcessDofIndexMap;
+      dftfe::utils::MemoryStorage<dftfe::global_size_type, memorySpace>
+                                  d_flattenedCellDofIndexToProcessDofIndexMap;
+      std::vector<dealii::CellId> d_cellIndexToCellIdMap;
+      std::vector<dftfe::utils::MemoryStorage<ValueTypeBasisCoeff, memorySpace>>
+        d_inverseJacobianData;
+      std::vector<dftfe::utils::MemoryStorage<ValueTypeBasisCoeff, memorySpace>>
+        d_JxWData;
+      std::vector<dftfe::utils::MemoryStorage<ValueTypeBasisCoeff, memorySpace>>
+        d_shapeFunctionData;
+      std::vector<dftfe::utils::MemoryStorage<ValueTypeBasisCoeff, memorySpace>>
+        d_shapeFunctionGradientDataInternalLayout;
+      std::vector<dftfe::utils::MemoryStorage<ValueTypeBasisCoeff, memorySpace>>
+        d_shapeFunctionGradientData;
+      std::vector<dftfe::utils::MemoryStorage<ValueTypeBasisCoeff, memorySpace>>
+        d_shapeFunctionDataTranspose;
+      std::vector<dftfe::utils::MemoryStorage<ValueTypeBasisCoeff, memorySpace>>
+        d_shapeFunctionGradientDataTranspose;
+
+      std::vector<dftfe::utils::MemoryStorage<ValueTypeBasisData, memorySpace>>
+        d_inverseJacobianBasisData;
+      std::vector<dftfe::utils::MemoryStorage<ValueTypeBasisData, memorySpace>>
+        d_JxWBasisData;
+      std::vector<dftfe::utils::MemoryStorage<ValueTypeBasisData, memorySpace>>
+        d_shapeFunctionBasisData;
+      std::vector<dftfe::utils::MemoryStorage<ValueTypeBasisData, memorySpace>>
+        d_shapeFunctionGradientBasisData;
+      std::vector<dftfe::utils::MemoryStorage<ValueTypeBasisData, memorySpace>>
+        d_shapeFunctionBasisDataTranspose;
+      std::vector<dftfe::utils::MemoryStorage<ValueTypeBasisData, memorySpace>>
+        d_shapeFunctionGradientBasisDataTranspose;
+
+
+      mutable std::map<
+        unsigned int,
+        std::vector<
+          dftfe::linearAlgebra::MultiVector<ValueTypeBasisCoeff, memorySpace>>>
+        scratchMultiVectors;
+
+      std::vector<unsigned int> d_quadratureIDsVector;
+      unsigned int              d_quadratureID;
+      std::vector<unsigned int> d_nQuadsPerCell;
+      unsigned int              d_dofHandlerID;
+      unsigned int              d_nVectors;
+      unsigned int              d_nCells;
+      unsigned int              d_cellsBlockSize;
+      unsigned int              d_nDofsPerCell;
+      unsigned int              d_localSize;
+      unsigned int              d_locallyOwnedSize;
+      bool                      areAllCellsAffine;
+      bool                      areAllCellsCartesian;
+      UpdateFlags               d_updateFlags;
+
+      std::shared_ptr<const utils::mpi::MPIPatternP2P<memorySpace>>
+        mpiPatternP2P;
+    };
+    template <typename ValueTypeBasisCoeff,
+              typename ValueTypeBasisData,
+              dftfe::utils::MemorySpace memorySpace>
+    class FEBasisOperations : FEBasisOperationsBase<ValueTypeBasisCoeff,
+                                                    ValueTypeBasisData,
+                                                    memorySpace>
+    {}; // Primary template: empty body; base inherited privately (class default).
+
+    template <typename ValueTypeBasisCoeff, typename ValueTypeBasisData>
+    class FEBasisOperations<ValueTypeBasisCoeff,
+                            ValueTypeBasisData,
+                            dftfe::utils::MemorySpace::HOST>
+      : public FEBasisOperationsBase<ValueTypeBasisCoeff,
+                                     ValueTypeBasisData,
+                                     dftfe::utils::MemorySpace::HOST>
+    {
+    public:
+      using FEBasisOperationsBase<
+        ValueTypeBasisCoeff,
+        ValueTypeBasisData,
+        dftfe::utils::MemorySpace::HOST>::FEBasisOperationsBase;
+
+      using FEBasisOperationsBase<ValueTypeBasisCoeff,
+                                  ValueTypeBasisData,
+                                  dftfe::utils::MemorySpace::HOST>::d_nCells;
+      using FEBasisOperationsBase<ValueTypeBasisCoeff,
+                                  ValueTypeBasisData,
+                                  dftfe::utils::MemorySpace::HOST>::d_localSize;
+      using FEBasisOperationsBase<
+        ValueTypeBasisCoeff,
+        ValueTypeBasisData,
+        dftfe::utils::MemorySpace::HOST>::d_locallyOwnedSize;
+      using FEBasisOperationsBase<
+        ValueTypeBasisCoeff,
+        ValueTypeBasisData,
+        dftfe::utils::MemorySpace::HOST>::tempCellNodalData;
+      using FEBasisOperationsBase<
+        ValueTypeBasisCoeff,
+        ValueTypeBasisData,
+        dftfe::utils::MemorySpace::HOST>::tempQuadratureGradientsData;
+      using FEBasisOperationsBase<
+        ValueTypeBasisCoeff,
+        ValueTypeBasisData,
+        dftfe::utils::MemorySpace::HOST>::tempQuadratureGradientsDataNonAffine;
+      using FEBasisOperationsBase<ValueTypeBasisCoeff,
+                                  ValueTypeBasisData,
+                                  dftfe::utils::MemorySpace::HOST>::d_nVectors;
+      using FEBasisOperationsBase<
+        ValueTypeBasisCoeff,
+        ValueTypeBasisData,
+        dftfe::utils::MemorySpace::HOST>::d_quadratureID;
+      using FEBasisOperationsBase<
+        ValueTypeBasisCoeff,
+        ValueTypeBasisData,
+        dftfe::utils::MemorySpace::HOST>::d_nQuadsPerCell;
+      using FEBasisOperationsBase<
+        ValueTypeBasisCoeff,
+        ValueTypeBasisData,
+        dftfe::utils::MemorySpace::HOST>::d_nDofsPerCell;
+      using FEBasisOperationsBase<
+        ValueTypeBasisCoeff,
+        ValueTypeBasisData,
+        dftfe::utils::MemorySpace::HOST>::areAllCellsAffine;
+      using FEBasisOperationsBase<
+        ValueTypeBasisCoeff,
+        ValueTypeBasisData,
+        dftfe::utils::MemorySpace::HOST>::areAllCellsCartesian;
+      using FEBasisOperationsBase<
+        ValueTypeBasisCoeff,
+        ValueTypeBasisData,
+        dftfe::utils::MemorySpace::HOST>::d_updateFlags;
+      using FEBasisOperationsBase<
+        ValueTypeBasisCoeff,
+        ValueTypeBasisData,
+        dftfe::utils::MemorySpace::HOST>::d_shapeFunctionData;
+      using FEBasisOperationsBase<
+        ValueTypeBasisCoeff,
+        ValueTypeBasisData,
+        dftfe::utils::MemorySpace::HOST>::d_shapeFunctionDataTranspose;
+      using FEBasisOperationsBase<
+        ValueTypeBasisCoeff,
+        ValueTypeBasisData,
+        dftfe::utils::MemorySpace::HOST>::d_shapeFunctionGradientData;
+      using FEBasisOperationsBase<
+        ValueTypeBasisCoeff,
+        ValueTypeBasisData,
+        dftfe::utils::MemorySpace::HOST>::d_shapeFunctionGradientDataTranspose;
+      using FEBasisOperationsBase<ValueTypeBasisCoeff,
+                                  ValueTypeBasisData,
+                                  dftfe::utils::MemorySpace::HOST>::
+        d_shapeFunctionGradientDataInternalLayout;
+      using FEBasisOperationsBase<ValueTypeBasisCoeff,
+                                  ValueTypeBasisData,
+                                  dftfe::utils::MemorySpace::HOST>::d_JxWData;
+      using FEBasisOperationsBase<
+        ValueTypeBasisCoeff,
+        ValueTypeBasisData,
+        dftfe::utils::MemorySpace::HOST>::d_inverseJacobianData;
+      using FEBasisOperationsBase<
+        ValueTypeBasisCoeff,
+        ValueTypeBasisData,
+        dftfe::utils::MemorySpace::HOST>::d_cellIndexToCellIdMap;
+      using FEBasisOperationsBase<
+        ValueTypeBasisCoeff,
+        ValueTypeBasisData,
+        dftfe::utils::MemorySpace::HOST>::d_cellDofIndexToProcessDofIndexMap;
+      using FEBasisOperationsBase<ValueTypeBasisCoeff,
+                                  ValueTypeBasisData,
+                                  dftfe::utils::MemorySpace::HOST>::
+        d_flattenedCellDofIndexToProcessDofIndexMap;
+      using FEBasisOperationsBase<
+        ValueTypeBasisCoeff,
+        ValueTypeBasisData,
+        dftfe::utils::MemorySpace::HOST>::d_constraintsVector;
+
+
+      /**
+       * @brief Interpolate process level nodal data to cell level quadrature data.
+       * @param[in] nodalData process level nodal data, the multivector should
+       * already have ghost data and constraints should have been applied.
+       * @param[out] quadratureValues Cell level quadrature values, indexed by
+       * [iCell * d_nQuadsPerCell * d_nVectors + iQuad * d_nVectors + iVec].
+       * @param[out] quadratureGradients Cell level quadrature gradients,
+       * indexed by [iCell * 3 * d_nQuadsPerCell * d_nVectors + iDim *
+       * d_nQuadsPerCell * d_nVectors + iQuad * d_nVectors + iVec].
+       */
+      void
+      interpolate(
+        dftfe::linearAlgebra::MultiVector<ValueTypeBasisCoeff,
+                                          dftfe::utils::MemorySpace::HOST>
+          &                  nodalData,
+        ValueTypeBasisCoeff *quadratureValues,
+        ValueTypeBasisCoeff *quadratureGradients = NULL) const;
+
+      // FIXME Untested function
+      /**
+       * @brief Integrate cell level quadrature data times shape functions to process level nodal data.
+       * @param[in] quadratureValues Cell level quadrature values, indexed by
+       * [iCell * d_nQuadsPerCell * d_nVectors + iQuad * d_nVectors + iVec].
+       * @param[in] quadratureGradients Cell level quadrature gradients,
+       * indexed by [iCell * 3 * d_nQuadsPerCell * d_nVectors + iDim *
+       * d_nQuadsPerCell * d_nVectors + iQuad * d_nVectors + iVec].
+       * @param[out] nodalData process level nodal data.
+       */
+      void
+      integrateWithBasis(
+        ValueTypeBasisCoeff *quadratureValues,
+        ValueTypeBasisCoeff *quadratureGradients,
+        dftfe::linearAlgebra::MultiVector<ValueTypeBasisCoeff,
+                                          dftfe::utils::MemorySpace::HOST>
+          &nodalData) const;
+
+      /**
+       * @brief Get cell level nodal data from process level nodal data.
+       * @param[in] nodalData process level nodal data, the multivector should
+       * already have ghost data and constraints should have been applied.
+       * @param[out] cellNodalDataPtr Cell level nodal values, indexed by
+       * [iCell * d_nDofsPerCell * d_nVectors + iDoF * d_nVectors + iVec].
+       */
+      void
+      extractToCellNodalData(
+        dftfe::linearAlgebra::MultiVector<ValueTypeBasisCoeff,
+                                          dftfe::utils::MemorySpace::HOST>
+          &                  nodalData,
+        ValueTypeBasisCoeff *cellNodalDataPtr) const;
+      // FIXME Untested function
+      /**
+       * @brief Accumulate cell level nodal data into process level nodal data.
+       * @param[in] cellNodalDataPtr Cell level nodal values, indexed by
+       * [iCell * d_nDofsPerCell * d_nVectors + iDoF * d_nVectors + iVec].
+       * @param[out] nodalData process level nodal data.
+       */
+      void
+      accumulateFromCellNodalData(
+        const ValueTypeBasisCoeff *cellNodalDataPtr,
+        dftfe::linearAlgebra::MultiVector<ValueTypeBasisCoeff,
+                                          dftfe::utils::MemorySpace::HOST>
+          &nodalData) const;
+
+      /**
+       * @brief Interpolate process level nodal data to cell level quadrature data.
+       * @param[in] nodalData process level nodal data, the multivector should
+       * already have ghost data and constraints should have been applied.
+       * @param[out] quadratureValues Cell level quadrature values, indexed by
+       * [iCell * d_nQuadsPerCell * d_nVectors + iQuad * d_nVectors + iVec].
+       * @param[out] quadratureGradients Cell level quadrature gradients,
+       * indexed by [iCell * 3 * d_nQuadsPerCell * d_nVectors + iDim *
+       * d_nQuadsPerCell * d_nVectors + iQuad * d_nVectors + iVec].
+       * @param[in] cellRange the range of cells for which interpolation has to
+       * be done.
+       */
+      void
+      interpolateKernel(
+        const dftfe::linearAlgebra::MultiVector<ValueTypeBasisCoeff,
+                                                dftfe::utils::MemorySpace::HOST>
+          &                                         nodalData,
+        ValueTypeBasisCoeff *                       quadratureValues,
+        ValueTypeBasisCoeff *                       quadratureGradients,
+        const std::pair<unsigned int, unsigned int> cellRange) const;
+
+      /**
+       * @brief Interpolate cell level nodal data to cell level quadrature data.
+       * @param[in] nodalData cell level nodal data, the multivector should
+       * already have ghost data and constraints should have been applied.
+       * @param[out] quadratureValues Cell level quadrature values, indexed by
+       * [iCell * d_nQuadsPerCell * d_nVectors + iQuad * d_nVectors + iVec].
+       * @param[out] quadratureGradients Cell level quadrature gradients,
+       * indexed by [iCell * 3 * d_nQuadsPerCell * d_nVectors + iDim *
+       * d_nQuadsPerCell * d_nVectors + iQuad * d_nVectors + iVec].
+       * @param[in] cellRange the range of cells for which interpolation has to
+       * be done.
+       */
+      void
+      interpolateKernel(
+        const ValueTypeBasisCoeff *                 nodalData,
+        ValueTypeBasisCoeff *                       quadratureValues,
+        ValueTypeBasisCoeff *                       quadratureGradients,
+        const std::pair<unsigned int, unsigned int> cellRange) const;
+
+      // FIXME Untested function
+      /**
+       * @brief Integrate cell level quadrature data times shape functions to process level nodal data.
+       * @param[in] quadratureValues Cell level quadrature values, indexed by
+       * [iCell * d_nQuadsPerCell * d_nVectors + iQuad * d_nVectors + iVec].
+       * @param[in] quadratureGradients Cell level quadrature gradients,
+       * indexed by [iCell * 3 * d_nQuadsPerCell * d_nVectors + iDim *
+       * d_nQuadsPerCell * d_nVectors + iQuad * d_nVectors + iVec].
+       * @param[out] nodalData process level nodal data.
+       * @param[in] cellRange the range of cells for which integration has to be
+       * done.
+       */
+      void
+      integrateWithBasisKernel(
+        const ValueTypeBasisCoeff *quadratureValues,
+        const ValueTypeBasisCoeff *quadratureGradients,
+        dftfe::linearAlgebra::MultiVector<ValueTypeBasisCoeff,
+                                          dftfe::utils::MemorySpace::HOST>
+          &                                         nodalData,
+        const std::pair<unsigned int, unsigned int> cellRange) const;
+
+
+      /**
+       * @brief Get cell level nodal data from process level nodal data.
+       * @param[in] nodalData process level nodal data, the multivector should
+       * already have ghost data and constraints should have been applied.
+       * @param[out] cellNodalDataPtr Cell level nodal values, indexed by
+       * [iCell * d_nDofsPerCell * d_nVectors + iDoF * d_nVectors + iVec].
+       * @param[in] cellRange the range of cells for which extraction has to be
+       * done.
+       */
+      void
+      extractToCellNodalDataKernel(
+        const dftfe::linearAlgebra::MultiVector<ValueTypeBasisCoeff,
+                                                dftfe::utils::MemorySpace::HOST>
+          &                                         nodalData,
+        ValueTypeBasisCoeff *                       cellNodalDataPtr,
+        const std::pair<unsigned int, unsigned int> cellRange) const;
+
+      // FIXME Untested function
+      /**
+       * @brief Accumulate cell level nodal data into process level nodal data.
+       * @param[in] cellNodalDataPtr Cell level nodal values, indexed by
+       * [iCell * d_nDofsPerCell * d_nVectors + iDoF * d_nVectors + iVec].
+       * @param[out] nodalData process level nodal data.
+       * @param[in] cellRange the range of cells for which accumulation has to
+       * be done.
+       */
+      void
+      accumulateFromCellNodalDataKernel(
+        const ValueTypeBasisCoeff *cellNodalDataPtr,
+        dftfe::linearAlgebra::MultiVector<ValueTypeBasisCoeff,
+                                          dftfe::utils::MemorySpace::HOST>
+          &                                         nodalData,
+        const std::pair<unsigned int, unsigned int> cellRange) const;
+    };
+#if defined(DFTFE_WITH_DEVICE)
+    template <typename ValueTypeBasisCoeff, typename ValueTypeBasisData>
+    class FEBasisOperations<ValueTypeBasisCoeff,
+                            ValueTypeBasisData,
+                            dftfe::utils::MemorySpace::DEVICE>
+      : public FEBasisOperationsBase<ValueTypeBasisCoeff,
+                                     ValueTypeBasisData,
+                                     dftfe::utils::MemorySpace::DEVICE>
+    {
+    public:
+      using FEBasisOperationsBase<
+        ValueTypeBasisCoeff,
+        ValueTypeBasisData,
+        dftfe::utils::MemorySpace::DEVICE>::FEBasisOperationsBase;
+      using FEBasisOperationsBase<ValueTypeBasisCoeff,
+                                  ValueTypeBasisData,
+                                  dftfe::utils::MemorySpace::DEVICE>::d_nCells;
+      using FEBasisOperationsBase<
+        ValueTypeBasisCoeff,
+        ValueTypeBasisData,
+        dftfe::utils::MemorySpace::DEVICE>::d_localSize;
+      using FEBasisOperationsBase<
+        ValueTypeBasisCoeff,
+        ValueTypeBasisData,
+        dftfe::utils::MemorySpace::DEVICE>::d_locallyOwnedSize;
+      using FEBasisOperationsBase<
+        ValueTypeBasisCoeff,
+        ValueTypeBasisData,
+        dftfe::utils::MemorySpace::DEVICE>::tempCellNodalData;
+      using FEBasisOperationsBase<
+        ValueTypeBasisCoeff,
+        ValueTypeBasisData,
+        dftfe::utils::MemorySpace::DEVICE>::tempQuadratureGradientsData;
+      using FEBasisOperationsBase<ValueTypeBasisCoeff,
+                                  ValueTypeBasisData,
+                                  dftfe::utils::MemorySpace::DEVICE>::
+        tempQuadratureGradientsDataNonAffine;
+      using FEBasisOperationsBase<
+        ValueTypeBasisCoeff,
+        ValueTypeBasisData,
+        dftfe::utils::MemorySpace::DEVICE>::d_nVectors;
+      using FEBasisOperationsBase<
+        ValueTypeBasisCoeff,
+        ValueTypeBasisData,
+        dftfe::utils::MemorySpace::DEVICE>::d_cellsBlockSize;
+      using FEBasisOperationsBase<
+        ValueTypeBasisCoeff,
+        ValueTypeBasisData,
+        dftfe::utils::MemorySpace::DEVICE>::d_quadratureID;
+      using FEBasisOperationsBase<
+        ValueTypeBasisCoeff,
+        ValueTypeBasisData,
+        dftfe::utils::MemorySpace::DEVICE>::d_nQuadsPerCell;
+      using FEBasisOperationsBase<
+        ValueTypeBasisCoeff,
+        ValueTypeBasisData,
+        dftfe::utils::MemorySpace::DEVICE>::d_nDofsPerCell;
+      using FEBasisOperationsBase<
+        ValueTypeBasisCoeff,
+        ValueTypeBasisData,
+        dftfe::utils::MemorySpace::DEVICE>::areAllCellsAffine;
+      using FEBasisOperationsBase<
+        ValueTypeBasisCoeff,
+        ValueTypeBasisData,
+        dftfe::utils::MemorySpace::DEVICE>::areAllCellsCartesian;
+      using FEBasisOperationsBase<
+        ValueTypeBasisCoeff,
+        ValueTypeBasisData,
+        dftfe::utils::MemorySpace::DEVICE>::d_updateFlags;
+      using FEBasisOperationsBase<
+        ValueTypeBasisCoeff,
+        ValueTypeBasisData,
+        dftfe::utils::MemorySpace::DEVICE>::d_shapeFunctionData;
+      using FEBasisOperationsBase<
+        ValueTypeBasisCoeff,
+        ValueTypeBasisData,
+        dftfe::utils::MemorySpace::DEVICE>::d_shapeFunctionDataTranspose;
+      using FEBasisOperationsBase<
+        ValueTypeBasisCoeff,
+        ValueTypeBasisData,
+        dftfe::utils::MemorySpace::DEVICE>::d_shapeFunctionGradientData;
+      using FEBasisOperationsBase<ValueTypeBasisCoeff,
+                                  ValueTypeBasisData,
+                                  dftfe::utils::MemorySpace::DEVICE>::
+        d_shapeFunctionGradientDataTranspose;
+      using FEBasisOperationsBase<ValueTypeBasisCoeff,
+                                  ValueTypeBasisData,
+                                  dftfe::utils::MemorySpace::DEVICE>::
+        d_shapeFunctionGradientDataInternalLayout;
+      using FEBasisOperationsBase<ValueTypeBasisCoeff,
+                                  ValueTypeBasisData,
+                                  dftfe::utils::MemorySpace::DEVICE>::d_JxWData;
+      using FEBasisOperationsBase<
+        ValueTypeBasisCoeff,
+        ValueTypeBasisData,
+        dftfe::utils::MemorySpace::DEVICE>::d_inverseJacobianData;
+      using FEBasisOperationsBase<
+        ValueTypeBasisCoeff,
+        ValueTypeBasisData,
+        dftfe::utils::MemorySpace::DEVICE>::d_cellIndexToCellIdMap;
+      using FEBasisOperationsBase<
+        ValueTypeBasisCoeff,
+        ValueTypeBasisData,
+        dftfe::utils::MemorySpace::DEVICE>::d_cellDofIndexToProcessDofIndexMap;
+      using FEBasisOperationsBase<ValueTypeBasisCoeff,
+                                  ValueTypeBasisData,
+                                  dftfe::utils::MemorySpace::DEVICE>::
+        d_flattenedCellDofIndexToProcessDofIndexMap;
+      using FEBasisOperationsBase<
+        ValueTypeBasisCoeff,
+        ValueTypeBasisData,
+        dftfe::utils::MemorySpace::DEVICE>::d_constraintsVector;
+
+      // FIXME has to be removed in a future PR
+      /// Pointer to the device blas handle used for internal blas operations.
+      dftfe::utils::deviceBlasHandle_t *d_deviceBlasHandlePtr;
+      // FIXME has to be removed in a future PR
+      /// @brief sets device blas handle for internal blas operations.
+      void
+      setDeviceBLASHandle(
+        dftfe::utils::deviceBlasHandle_t *deviceBlasHandlePtr);
+
+      // FIXME has to be removed in a future PR
+      /**
+       * @brief gets device blas handle for blas operations.
+       */
+      dftfe::utils::deviceBlasHandle_t &
+      getDeviceBLASHandle();
+
+
+
+      /**
+       * @brief Interpolate process level nodal data to cell level quadrature data.
+       * @param[in] nodalData process level nodal data, the multivector should
+       * already have ghost data and constraints should have been applied.
+       * @param[out] quadratureValues Cell level quadrature values, indexed by
+       * [iCell * d_nQuadsPerCell * d_nVectors + iQuad * d_nVectors + iVec].
+       * @param[out] quadratureGradients Cell level quadrature gradients,
+       * indexed by [iCell * 3 * d_nQuadsPerCell * d_nVectors + iDim *
+       * d_nQuadsPerCell * d_nVectors + iQuad * d_nVectors + iVec].
+       */
+      void
+      interpolate(
+        dftfe::linearAlgebra::MultiVector<ValueTypeBasisCoeff,
+                                          dftfe::utils::MemorySpace::DEVICE>
+          &                  nodalData,
+        ValueTypeBasisCoeff *quadratureValues,
+        ValueTypeBasisCoeff *quadratureGradients = nullptr) const;
+
+
+      // FIXME Untested function
+      /**
+       * @brief Integrate cell level quadrature data times shape functions to process level nodal data.
+       * @param[in] quadratureValues Cell level quadrature values, indexed by
+       * [iCell * d_nQuadsPerCell * d_nVectors + iQuad * d_nVectors + iVec].
+       * @param[in] quadratureGradients Cell level quadrature gradients,
+       * indexed by [iCell * 3 * d_nQuadsPerCell * d_nVectors + iDim *
+       * d_nQuadsPerCell * d_nVectors + iQuad * d_nVectors + iVec].
+       * @param[out] nodalData process level nodal data.
+       */
+      void
+      integrateWithBasis(
+        ValueTypeBasisCoeff *quadratureValues,
+        ValueTypeBasisCoeff *quadratureGradients,
+        dftfe::linearAlgebra::MultiVector<ValueTypeBasisCoeff,
+                                          dftfe::utils::MemorySpace::DEVICE>
+          &nodalData) const;
+
+      /**
+       * @brief Get cell level nodal data from process level nodal data.
+       * @param[in] nodalData process level nodal data, the multivector should
+       * already have ghost data and constraints should have been applied.
+       * @param[out] cellNodalDataPtr Cell level nodal values, indexed by
+       * [iCell * d_nDofsPerCell * d_nVectors + iDoF * d_nVectors + iVec].
+       */
+      void
+      extractToCellNodalData(
+        dftfe::linearAlgebra::MultiVector<ValueTypeBasisCoeff,
+                                          dftfe::utils::MemorySpace::DEVICE>
+          &                  nodalData,
+        ValueTypeBasisCoeff *cellNodalDataPtr) const;
+
+      // FIXME Untested function
+      /**
+       * @brief Accumulate cell level nodal data into process level nodal data.
+       * @param[in] cellNodalDataPtr Cell level nodal values, indexed by
+       * [iCell * d_nDofsPerCell * d_nVectors + iDoF * d_nVectors + iVec].
+       * @param[out] nodalData process level nodal data.
+       */
+      void
+      accumulateFromCellNodalData(
+        const ValueTypeBasisCoeff *cellNodalDataPtr,
+        dftfe::linearAlgebra::MultiVector<ValueTypeBasisCoeff,
+                                          dftfe::utils::MemorySpace::DEVICE>
+          &nodalData) const;
+
+      /**
+       * @brief Interpolate process level nodal data to cell level quadrature data.
+       * @param[in] nodalData process level nodal data, the multivector should
+       * already have ghost data and constraints should have been applied.
+       * @param[out] quadratureValues Cell level quadrature values, indexed by
+       * [iCell * d_nQuadsPerCell * d_nVectors + iQuad * d_nVectors + iVec].
+       * @param[out] quadratureGradients Cell level quadrature gradients,
+       * indexed by [iCell * 3 * d_nQuadsPerCell * d_nVectors + iDim *
+       * d_nQuadsPerCell * d_nVectors + iQuad * d_nVectors + iVec].
+       * @param[in] cellRange the range of cells for which interpolation has to
+       * be done.
+       */
+      void
+      interpolateKernel(
+        const dftfe::linearAlgebra::MultiVector<
+          ValueTypeBasisCoeff,
+          dftfe::utils::MemorySpace::DEVICE> &      nodalData,
+        ValueTypeBasisCoeff *                       quadratureValues,
+        ValueTypeBasisCoeff *                       quadratureGradients,
+        const std::pair<unsigned int, unsigned int> cellRange) const;
+
+      /**
+       * @brief Interpolate cell level nodal data to cell level quadrature data.
+       * @param[in] nodalData cell level nodal data, the multivector should
+       * already have ghost data and constraints should have been applied.
+       * @param[out] quadratureValues Cell level quadrature values, indexed by
+       * [iCell * d_nQuadsPerCell * d_nVectors + iQuad * d_nVectors + iVec].
+       * @param[out] quadratureGradients Cell level quadrature gradients,
+       * indexed by [iCell * 3 * d_nQuadsPerCell * d_nVectors + iDim *
+       * d_nQuadsPerCell * d_nVectors + iQuad * d_nVectors + iVec].
+       * @param[in] cellRange the range of cells for which interpolation has to
+       * be done.
+       */
+      void
+      interpolateKernel(
+        const ValueTypeBasisCoeff *                 nodalData,
+        ValueTypeBasisCoeff *                       quadratureValues,
+        ValueTypeBasisCoeff *                       quadratureGradients,
+        const std::pair<unsigned int, unsigned int> cellRange) const;
+
+      // FIXME Untested function
+      /**
+       * @brief Integrate cell level quadrature data times shape functions to process level nodal data.
+       * @param[in] quadratureValues Cell level quadrature values, indexed by
+       * [iCell * d_nQuadsPerCell * d_nVectors + iQuad * d_nVectors + iVec].
+       * @param[in] quadratureGradients Cell level quadrature gradients,
+       * indexed by [iCell * 3 * d_nQuadsPerCell * d_nVectors + iDim *
+       * d_nQuadsPerCell * d_nVectors + iQuad * d_nVectors + iVec].
+       * @param[out] nodalData process level nodal data.
+       * @param[in] cellRange the range of cells for which integration has to be
+       * done.
+       */
+      void
+      integrateWithBasisKernel(
+        const ValueTypeBasisCoeff *quadratureValues,
+        const ValueTypeBasisCoeff *quadratureGradients,
+        dftfe::linearAlgebra::MultiVector<ValueTypeBasisCoeff,
+                                          dftfe::utils::MemorySpace::DEVICE>
+          &                                         nodalData,
+        const std::pair<unsigned int, unsigned int> cellRange) const;
+
+
+      /**
+       * @brief Get cell level nodal data from process level nodal data.
+       * @param[in] nodalData process level nodal data, the multivector should
+       * already have ghost data and constraints should have been applied.
+       * @param[out] cellNodalDataPtr Cell level nodal values, indexed by
+       * [iCell * d_nDofsPerCell * d_nVectors + iDoF * d_nVectors + iVec].
+       * @param[in] cellRange the range of cells for which extraction has to be
+       * done.
+       */
+      void
+      extractToCellNodalDataKernel(
+        const dftfe::linearAlgebra::MultiVector<
+          ValueTypeBasisCoeff,
+          dftfe::utils::MemorySpace::DEVICE> &      nodalData,
+        ValueTypeBasisCoeff *                       cellNodalDataPtr,
+        const std::pair<unsigned int, unsigned int> cellRange) const;
+
+      // FIXME Untested function
+      /**
+       * @brief Accumulate cell level nodal data into process level nodal data.
+       * @param[in] cellNodalDataPtr Cell level nodal values, indexed by
+       * [iCell * d_nDofsPerCell * d_nVectors + iDoF * d_nVectors + iVec].
+       * @param[out] nodalData process level nodal data.
+       * @param[in] cellRange the range of cells for which accumulation has to
+       * be done.
+       */
+      void
+      accumulateFromCellNodalDataKernel(
+        const ValueTypeBasisCoeff *cellNodalDataPtr,
+        dftfe::linearAlgebra::MultiVector<ValueTypeBasisCoeff,
+                                          dftfe::utils::MemorySpace::DEVICE>
+          &                                         nodalData,
+        const std::pair<unsigned int, unsigned int> cellRange) const;
+    };
+#endif
+  } // end of namespace basis
+} // end of namespace dftfe
+#include "../utils/FEBasisOperations.t.cc"
+#include "../utils/FEBasisOperationsHost.t.cc"
+#if defined(DFTFE_WITH_DEVICE)
+#  include "../utils/FEBasisOperationsDevice.t.cc"
+#endif
+
+#endif // dftfeBasisOperations_h
diff --git a/include/FEBasisOperationsKernelsDevice.h b/include/FEBasisOperationsKernelsDevice.h
new file mode 100644
index 000000000..8a38c53a8
--- /dev/null
+++ b/include/FEBasisOperationsKernelsDevice.h
@@ -0,0 +1,52 @@
+// ---------------------------------------------------------------------
+//
+// Copyright (c) 2017-2022 The Regents of the University of Michigan and DFT-FE
+// authors.
+//
+// This file is part of the DFT-FE code.
+//
+// The DFT-FE code is free software; you can use it, redistribute
+// it, and/or modify it under the terms of the GNU Lesser General
+// Public License as published by the Free Software Foundation; either
+// version 2.1 of the License, or (at your option) any later version.
+// The full text of the license can be found in the file LICENSE at
+// the top level of the DFT-FE distribution.
+//
+// ---------------------------------------------------------------------
+
+#ifndef dftfeFEBasisOperationsKernelsDevice_h
+#define dftfeFEBasisOperationsKernelsDevice_h
+
+#ifdef DFTFE_WITH_DEVICE
+#  include <TypeConfig.h>
+
+namespace dftfe
+{
+  namespace basis
+  {
+    namespace FEBasisOperationsKernelsDevice
+    {
+      /**
+       * @brief reshape gradient data from [iCell * 3 * d_nQuadsPerCell * d_nVectors + iQuad * 3 * d_nVectors + iDim * d_nVectors + iVec] to [iCell * 3 * d_nQuadsPerCell * d_nVectors + iDim *
+       * d_nQuadsPerCell * d_nVectors + iQuad * d_nVectors + iVec].
+       * @tparam ValueType1 element type of the source data.
+       * @tparam ValueType2 element type of the destination data.
+       * @param[in] numVecs number of vectors.
+       * @param[in] numQuads number of quadrature points per cell.
+       * @param[in] numCells number of locally owned cells.
+       * @param[in] copyFromVec source data pointer.
+       * @param[out] copyToVec destination data pointer.
+       */
+      template <typename ValueType1, typename ValueType2>
+      void
+      reshapeNonAffineCase(const dftfe::size_type numVecs,
+                           const dftfe::size_type numQuads,
+                           const dftfe::size_type numCells,
+                           const ValueType1 *     copyFromVec,
+                           ValueType2 *           copyToVec);
+    } // namespace FEBasisOperationsKernelsDevice
+  }   // namespace basis
+} // namespace dftfe
+
+#endif // DFTFE_WITH_DEVICE
+#endif // dftfeFEBasisOperationsKernelsDevice_h
diff --git a/include/chebyshevOrthogonalizedSubspaceIterationSolver.h b/include/chebyshevOrthogonalizedSubspaceIterationSolver.h
index c0dadb055..929ef8534 100644
--- a/include/chebyshevOrthogonalizedSubspaceIterationSolver.h
+++ b/include/chebyshevOrthogonalizedSubspaceIterationSolver.h
@@ -64,17 +64,18 @@ namespace dftfe
      * @brief Solve a generalized eigen problem.
      */
     void
-    solve(operatorDFTClass &              operatorMatrix,
-          elpaScalaManager &              elpaScala,
-          std::vector<dataTypes::number> &eigenVectorsFlattened,
-          std::vector<dataTypes::number> &eigenVectorsRotFracDensityFlattened,
-          const unsigned int              totalNumberWaveFunctions,
-          std::vector<double> &           eigenValues,
-          std::vector<double> &           residuals,
-          const MPI_Comm &                interBandGroupComm,
-          const bool                      computeResidual,
-          const bool                      useMixedPrec = false,
-          const bool                      isFirstScf   = false);
+    solve(operatorDFTClass &   operatorMatrix,
+          elpaScalaManager &   elpaScala,
+          dataTypes::number *  eigenVectorsFlattened,
+          dataTypes::number *  eigenVectorsRotFracDensityFlattened,
+          const unsigned int   totalNumberWaveFunctions,
+          const unsigned int   localVectorSize,
+          std::vector<double> &eigenValues,
+          std::vector<double> &residuals,
+          const MPI_Comm &     interBandGroupComm,
+          const bool           computeResidual,
+          const bool           useMixedPrec = false,
+          const bool           isFirstScf   = false);
 
     /**
      * @brief Solve a generalized eigen problem.
diff --git a/include/constraintMatrixInfo.h b/include/constraintMatrixInfo.h
index 7fdb002db..92a71e0f4 100644
--- a/include/constraintMatrixInfo.h
+++ b/include/constraintMatrixInfo.h
@@ -63,33 +63,6 @@ namespace dftfe
           &                                      partitioner,
         const dealii::AffineConstraints<double> &constraintMatrixData);
 
-      /**
-       * @brief precompute map between local processor index of unflattened deallii array to the local processor index of
-       * the first field associated with the multi-field flattened dealii array
-       *
-       * @param partitioner1 associated with unflattened dealii vector
-       * @param partitioner2 associated with flattened dealii vector storing multi-fields
-       */
-      void
-      precomputeMaps(
-        const std::shared_ptr<const dealii::Utilities::MPI::Partitioner>
-          &partitioner1,
-        const std::shared_ptr<const dealii::Utilities::MPI::Partitioner>
-          &                partitioner2,
-        const unsigned int blockSize);
-
-      /**
-       * @brief precompute map between local processor index of unflattened deallii array to the local processor index of
-       * the first field associated with the multi-field flattened dealii array
-       *
-       * @param partitioner1 associated with unflattened dealii vector
-       * @param partitioner2 associated with flattened dealii vector storing multi-fields
-       */
-      void
-      precomputeMaps(const std::shared_ptr<const utils::mpi::MPIPatternP2P<
-                       dftfe::utils::MemorySpace::HOST>> &partitioner2,
-                     const unsigned int                   blockSize);
-
       /**
        * @brief overloaded dealii internal function "distribute" which sets the slave node
        * field values from master nodes
diff --git a/include/constraintMatrixInfoDevice.h b/include/constraintMatrixInfoDevice.h
index e682a25b5..dd5338a0e 100644
--- a/include/constraintMatrixInfoDevice.h
+++ b/include/constraintMatrixInfoDevice.h
@@ -61,27 +61,6 @@ namespace dftfe
           &                                      partitioner,
         const dealii::AffineConstraints<double> &constraintMatrixData);
 
-      /**
-       * @brief precompute map between local processor index of unflattened deallii array to the local processor index of
-       * the first field associated with the multi-field flattened dealii array
-       *
-       * @param partitioner1 associated with unflattened dealii vector
-       * @param partitioner2 associated with flattened dealii vector storing multi-fields
-       */
-      void
-      precomputeMaps(const std::shared_ptr<const utils::mpi::MPIPatternP2P<
-                       dftfe::utils::MemorySpace::HOST>> &partitioner2,
-                     const unsigned int                   blockSize);
-
-      void
-      precomputeMaps(
-        const std::shared_ptr<const dealii::Utilities::MPI::Partitioner>
-          &partitioner1,
-        const std::shared_ptr<const dealii::Utilities::MPI::Partitioner>
-          &                partitioner2,
-        const unsigned int blockSize);
-
-
       /**
        * @brief overloaded dealii internal function distribute for flattened dealii array  which sets
        * the slave node field values from master nodes
diff --git a/include/densityCalculator.h b/include/densityCalculator.h
new file mode 100644
index 000000000..b6154609e
--- /dev/null
+++ b/include/densityCalculator.h
@@ -0,0 +1,96 @@
+// ---------------------------------------------------------------------
+//
+// Copyright (c) 2017-2022 The Regents of the University of Michigan and DFT-FE
+// authors.
+//
+// This file is part of the DFT-FE code.
+//
+// The DFT-FE code is free software; you can use it, redistribute
+// it, and/or modify it under the terms of the GNU Lesser General
+// Public License as published by the Free Software Foundation; either
+// version 2.1 of the License, or (at your option) any later version.
+// The full text of the license can be found in the file LICENSE at
+// the top level of the DFT-FE distribution.
+//
+// ---------------------------------------------------------------------
+//
+
+#ifndef densityCalculator_H_
+#define densityCalculator_H_
+
+#include <headers.h>
+#include <operatorDevice.h>
+#include "dftParameters.h"
+#include "FEBasisOperations.h"
+
+namespace dftfe
+{
+  template <typename NumberType, dftfe::utils::MemorySpace memorySpace>
+  void
+  computeRhoFromPSI(
+    const dftfe::utils::MemoryStorage<NumberType, memorySpace> *X,
+    const dftfe::utils::MemoryStorage<NumberType, memorySpace> *XFrac,
+    const unsigned int                      totalNumWaveFunctions,
+    const unsigned int                      Nfr,
+    const std::vector<std::vector<double>> &eigenValues,
+    const double                            fermiEnergy,
+    const double                            fermiEnergyUp,
+    const double                            fermiEnergyDown,
+    std::shared_ptr<
+      dftfe::basis::FEBasisOperations<NumberType, double, memorySpace>>
+      &                                            basisOperationsPtr,
+    const unsigned int                             matrixFreeDofhandlerIndex,
+    const unsigned int                             quadratureIndex,
+    const std::vector<double> &                    kPointWeights,
+    std::map<dealii::CellId, std::vector<double>> *rhoValues,
+    std::map<dealii::CellId, std::vector<double>> *gradRhoValues,
+    std::map<dealii::CellId, std::vector<double>> *rhoValuesSpinPolarized,
+    std::map<dealii::CellId, std::vector<double>> *gradRhoValuesSpinPolarized,
+    const bool                                     isEvaluateGradRho,
+    const MPI_Comm &                               mpiCommParent,
+    const MPI_Comm &                               interpoolcomm,
+    const MPI_Comm &                               interBandGroupComm,
+    const dftParameters &                          dftParams,
+    const bool                                     spectrumSplit);
+
+  template <typename NumberType>
+  void
+  computeRhoGradRhoFromInterpolatedValues(
+    std::shared_ptr<
+      dftfe::basis::
+        FEBasisOperations<NumberType, double, dftfe::utils::MemorySpace::HOST>>
+      &                                         basisOperationsPtr,
+    const std::pair<unsigned int, unsigned int> cellRange,
+    const std::pair<unsigned int, unsigned int> vecRange,
+    double *                                    partialOccupVec,
+    NumberType *                                wfcQuadPointData,
+    NumberType *                                gradWfcQuadPointData,
+    double *                                    rhoCellsWfcContributions,
+    double *                                    gradRhoCellsWfcContributions,
+    double *                                    rho,
+    double *                                    gradRho,
+    const bool                                  isEvaluateGradRho);
+
+#if defined(DFTFE_WITH_DEVICE)
+  template <typename NumberType>
+  void
+  computeRhoGradRhoFromInterpolatedValues(
+    std::shared_ptr<
+      dftfe::basis::FEBasisOperations<NumberType,
+                                      double,
+                                      dftfe::utils::MemorySpace::DEVICE>>
+      &                                         basisOperationsPtr,
+    const std::pair<unsigned int, unsigned int> cellRange,
+    const std::pair<unsigned int, unsigned int> vecRange,
+    double *                                    partialOccupVec,
+    NumberType *                                wfcQuadPointData,
+    NumberType *                                gradWfcQuadPointData,
+    double *                                    rhoCellsWfcContributions,
+    double *                                    gradRhoCellsWfcContributions,
+    double *                                    rho,
+    double *                                    gradRho,
+    const bool                                  isEvaluateGradRho);
+#endif
+
+} // namespace dftfe
+#endif
diff --git a/include/densityCalculatorCPU.h b/include/densityCalculatorCPU.h
deleted file mode 100644
index e324cdad6..000000000
--- a/include/densityCalculatorCPU.h
+++ /dev/null
@@ -1,63 +0,0 @@
-// ---------------------------------------------------------------------
-//
-// Copyright (c) 2017-2022 The Regents of the University of Michigan and DFT-FE
-// authors.
-//
-// This file is part of the DFT-FE code.
-//
-// The DFT-FE code is free software; you can use it, redistribute
-// it, and/or modify it under the terms of the GNU Lesser General
-// Public License as published by the Free Software Foundation; either
-// version 2.1 of the License, or (at your option) any later version.
-// The full text of the license can be found in the file LICENSE at
-// the top level of the DFT-FE distribution.
-//
-// ---------------------------------------------------------------------
-//
-
-#ifndef densityCalculatorCPU_H_
-#define densityCalculatorCPU_H_
-
-#include "headers.h"
-#include "operator.h"
-#include "dftParameters.h"
-
-namespace dftfe
-{
-  /**
-   * @brief Density calculator class using gemm recasting
-   *
-   * @author Sambit Das
-   */
-
-  template <typename T>
-  void
-  computeRhoFromPSICPU(
-    const std::vector<std::vector<T>> &            X,
-    const std::vector<std::vector<T>> &            XFrac,
-    const unsigned int                             totalNumWaveFunctions,
-    const unsigned int                             Nfr,
-    const unsigned int                             numLocalDofs,
-    const std::vector<std::vector<double>> &       eigenValues,
-    const double                                   fermiEnergy,
-    const double                                   fermiEnergyUp,
-    const double                                   fermiEnergyDown,
-    operatorDFTClass &                             operatorMatrix,
-    const dealii::DoFHandler<3> &                  dofHandler,
-    const unsigned int                             totalLocallyOwnedCells,
-    const unsigned int                             numberNodesPerElement,
-    const unsigned int                             numQuadPoints,
-    const std::vector<double> &                    kPointWeights,
-    std::map<dealii::CellId, std::vector<double>> *rhoValues,
-    std::map<dealii::CellId, std::vector<double>> *gradRhoValues,
-    std::map<dealii::CellId, std::vector<double>> *rhoValuesSpinPolarized,
-    std::map<dealii::CellId, std::vector<double>> *gradRhoValuesSpinPolarized,
-    const bool                                     isEvaluateGradRho,
-    const MPI_Comm &                               mpiCommParent,
-    const MPI_Comm &                               interpoolcomm,
-    const MPI_Comm &                               interBandGroupComm,
-    const dftParameters &                          dftParams,
-    const bool                                     spectrumSplit,
-    const bool                                     useFEOrderRhoPlusOneGLQuad);
-} // namespace dftfe
-#endif
diff --git a/include/densityCalculatorDevice.h b/include/densityCalculatorDevice.h
deleted file mode 100644
index 847efc733..000000000
--- a/include/densityCalculatorDevice.h
+++ /dev/null
@@ -1,63 +0,0 @@
-// ---------------------------------------------------------------------
-//
-// Copyright (c) 2017-2022 The Regents of the University of Michigan and DFT-FE
-// authors.
-//
-// This file is part of the DFT-FE code.
-//
-// The DFT-FE code is free software; you can use it, redistribute
-// it, and/or modify it under the terms of the GNU Lesser General
-// Public License as published by the Free Software Foundation; either
-// version 2.1 of the License, or (at your option) any later version.
-// The full text of the license can be found in the file LICENSE at
-// the top level of the DFT-FE distribution.
-//
-// ---------------------------------------------------------------------
-//
-
-#if defined(DFTFE_WITH_DEVICE)
-#  ifndef densityCalculatorDevice_H_
-#    define densityCalculatorDevice_H_
-
-#    include <headers.h>
-#    include <operatorDevice.h>
-#    include "dftParameters.h"
-
-namespace dftfe
-{
-  namespace Device
-  {
-    template <typename NumberType>
-    void
-    computeRhoFromPSI(
-      const NumberType *                             X,
-      const NumberType *                             XFrac,
-      const unsigned int                             totalNumWaveFunctions,
-      const unsigned int                             Nfr,
-      const unsigned int                             numLocalDofs,
-      const std::vector<std::vector<double>> &       eigenValues,
-      const double                                   fermiEnergy,
-      const double                                   fermiEnergyUp,
-      const double                                   fermiEnergyDown,
-      operatorDFTDeviceClass &                       operatorMatrix,
-      const unsigned int                             matrixFreeDofhandlerIndex,
-      const dealii::DoFHandler<3> &                  dofHandler,
-      const unsigned int                             totalLocallyOwnedCells,
-      const unsigned int                             numberNodesPerElement,
-      const unsigned int                             numQuadPoints,
-      const std::vector<double> &                    kPointWeights,
-      std::map<dealii::CellId, std::vector<double>> *rhoValues,
-      std::map<dealii::CellId, std::vector<double>> *gradRhoValues,
-      std::map<dealii::CellId, std::vector<double>> *rhoValuesSpinPolarized,
-      std::map<dealii::CellId, std::vector<double>> *gradRhoValuesSpinPolarized,
-      const bool                                     isEvaluateGradRho,
-      const MPI_Comm &                               mpiCommParent,
-      const MPI_Comm &                               interpoolcomm,
-      const MPI_Comm &                               interBandGroupComm,
-      const dftParameters &                          dftParams,
-      const bool                                     spectrumSplit,
-      const bool use2pPlusOneGLQuad = false);
-  }
-} // namespace dftfe
-#  endif
-#endif
diff --git a/include/densityFirstOrderResponseCalculator.h b/include/densityFirstOrderResponseCalculator.h
index c674d82d0..4f5bb94f6 100644
--- a/include/densityFirstOrderResponseCalculator.h
+++ b/include/densityFirstOrderResponseCalculator.h
@@ -33,8 +33,8 @@ namespace dftfe
   template <typename NumberType>
   void
   computeRhoFirstOrderResponseCPU(
-    const std::vector<std::vector<NumberType>> &   X,
-    const std::vector<std::vector<NumberType>> &   XPrime,
+    const NumberType *                             X,
+    const NumberType *                             XPrime,
     const std::vector<std::vector<double>> &       densityMatDerFermiEnergy,
     const unsigned int                             totalNumWaveFunctions,
     const unsigned int                             numLocalDofs,
@@ -59,8 +59,8 @@ namespace dftfe
   template <typename NumberType, typename NumberTypeLowPrec>
   void
   computeRhoFirstOrderResponseCPUMixedPrec(
-    const std::vector<std::vector<NumberType>> &   X,
-    const std::vector<std::vector<NumberType>> &   XPrime,
+    const NumberType *                             X,
+    const NumberType *                             XPrime,
     const std::vector<std::vector<double>> &       densityMatDerFermiEnergy,
     const unsigned int                             totalNumWaveFunctions,
     const unsigned int                             numLocalDofs,
diff --git a/include/deviceKernelsGeneric.h b/include/deviceKernelsGeneric.h
index 6e6b631af..3703bdf02 100644
--- a/include/deviceKernelsGeneric.h
+++ b/include/deviceKernelsGeneric.h
@@ -86,6 +86,17 @@ namespace dftfe
                                        ValueType2 *           copyToVec);
 
 
+      template <typename ValueType1, typename ValueType2>
+      void
+      stridedCopyConstantStride(const dftfe::size_type blockSize,
+                                const dftfe::size_type strideTo,
+                                const dftfe::size_type strideFrom,
+                                const dftfe::size_type numBlocks,
+                                const dftfe::size_type startingToId,
+                                const dftfe::size_type startingFromId,
+                                const ValueType1 *     copyFromVec,
+                                ValueType2 *           copyToVec);
+
 
       template <typename ValueType1, typename ValueType2>
       void
diff --git a/include/dft.h b/include/dft.h
index 1f64da879..de63bbf10 100644
--- a/include/dft.h
+++ b/include/dft.h
@@ -23,6 +23,7 @@
 #include <headers.h>
 #include <MemorySpaceType.h>
 #include <MemoryStorage.h>
+#include <FEBasisOperations.h>
 
 #include <complex>
 #include <deque>
@@ -1284,6 +1285,18 @@ namespace dftfe
     unsigned int                  d_densityQuadratureId;
     unsigned int                  d_densityQuadratureIdElectro;
     dealii::MatrixFree<3, double> matrix_free_data, d_matrixFreeDataPRefined;
+    std::shared_ptr<
+      dftfe::basis::FEBasisOperations<dataTypes::number,
+                                      double,
+                                      dftfe::utils::MemorySpace::HOST>>
+      basisOperationsPtrHost;
+#if defined(DFTFE_WITH_DEVICE)
+    std::shared_ptr<
+      dftfe::basis::FEBasisOperations<dataTypes::number,
+                                      double,
+                                      dftfe::utils::MemorySpace::DEVICE>>
+      basisOperationsPtrDevice;
+#endif
     std::map<dealii::types::global_dof_index, dealii::Point<3>> d_supportPoints,
       d_supportPointsPRefined, d_supportPointsEigen;
     std::vector<const dealii::AffineConstraints<double> *> d_constraintsVector;
@@ -1394,14 +1407,24 @@ namespace dftfe
     std::vector<std::vector<double>> d_densityMatDerFermiEnergy;
 
     /// Spectrum split higher eigenvalues computed in Rayleigh-Ritz step
-    std::vector<std::vector<double>>                  eigenValuesRRSplit;
-    std::vector<distributedCPUVec<dataTypes::number>> d_eigenVectorsFlattened;
-    std::vector<std::vector<dataTypes::number>> d_eigenVectorsFlattenedSTL;
-    std::vector<std::vector<dataTypes::number>>
-      d_eigenVectorsRotFracDensityFlattenedSTL;
-
-    std::vector<std::vector<dataTypes::number>>
-      d_eigenVectorsDensityMatrixPrimeSTL;
+    std::vector<std::vector<double>> eigenValuesRRSplit;
+
+    /**
+     * d_eigenVectorsFlattenedHost and d_eigenVectorsFlattenedDevice are
+     * indexed as [kPoint * numSpinComponents * numLocallyOwnedNodes *
+     * numWaveFunctions + iSpin * numLocallyOwnedNodes * numWaveFunctions +
+     * iNode * numWaveFunctions + iWaveFunction].
+     */
+    dftfe::utils::MemoryStorage<dataTypes::number,
+                                dftfe::utils::MemorySpace::HOST>
+      d_eigenVectorsFlattenedHost;
+
+    dftfe::utils::MemoryStorage<dataTypes::number,
+                                dftfe::utils::MemorySpace::HOST>
+      d_eigenVectorsRotFracDensityFlattenedHost;
+    dftfe::utils::MemoryStorage<dataTypes::number,
+                                dftfe::utils::MemorySpace::HOST>
+      d_eigenVectorsDensityMatrixPrimeHost;
 
     /// device eigenvectors
 #ifdef DFTFE_WITH_DEVICE
diff --git a/include/forceWfcContractions.h b/include/forceWfcContractions.h
index af612a197..70511ec7a 100644
--- a/include/forceWfcContractions.h
+++ b/include/forceWfcContractions.h
@@ -28,14 +28,14 @@ namespace dftfe
   {
     void
     wfcContractionsForceKernelsAllH(
-      operatorDFTClass &                                 operatorMatrix,
-      const std::vector<std::vector<dataTypes::number>> &X,
-      const unsigned int                                 spinPolarizedFlag,
-      const unsigned int                                 spinIndex,
-      const std::vector<std::vector<double>> &           eigenValuesH,
-      const std::vector<std::vector<double>> &           partialOccupanciesH,
-      const std::vector<double> &                        kPointCoordinates,
-      const unsigned int *nonTrivialIdToElemIdMapH,
+      operatorDFTClass &                      operatorMatrix,
+      const dataTypes::number *               X,
+      const unsigned int                      spinPolarizedFlag,
+      const unsigned int                      spinIndex,
+      const std::vector<std::vector<double>> &eigenValuesH,
+      const std::vector<std::vector<double>> &partialOccupanciesH,
+      const std::vector<double> &             kPointCoordinates,
+      const unsigned int *                    nonTrivialIdToElemIdMapH,
       const unsigned int *projecterKetTimesFlattenedVectorLocalIdsH,
       const unsigned int  MLoc,
       const unsigned int  N,
diff --git a/include/forceWfcContractionsDevice.h b/include/forceWfcContractionsDevice.h
index e3253a854..4f0771dfe 100644
--- a/include/forceWfcContractionsDevice.h
+++ b/include/forceWfcContractionsDevice.h
@@ -22,6 +22,7 @@
 #    include "headers.h"
 #    include "operatorDevice.h"
 #    include "dftParameters.h"
+#    include "FEBasisOperations.h"
 
 namespace dftfe
 {
@@ -29,6 +30,11 @@ namespace dftfe
   {
     void
     wfcContractionsForceKernelsAllH(
+      std::shared_ptr<
+        dftfe::basis::FEBasisOperations<dataTypes::number,
+                                        double,
+                                        dftfe::utils::MemorySpace::DEVICE>>
+        &                                     basisOperationsPtr,
       operatorDFTDeviceClass &                operatorMatrix,
       const dataTypes::number *               X,
       const unsigned int                      spinPolarizedFlag,
diff --git a/include/kohnShamDFTOperator.h b/include/kohnShamDFTOperator.h
index 942255f3a..39754397f 100644
--- a/include/kohnShamDFTOperator.h
+++ b/include/kohnShamDFTOperator.h
@@ -121,9 +121,10 @@ node is stored
      * @return ProjMatrix projected small matrix
      */
     void
-    XtHX(const std::vector<dataTypes::number> &src,
-         const unsigned int                    numberComponents,
-         std::vector<dataTypes::number> &      ProjHam);
+    XtHX(const dataTypes::number *       src,
+         const unsigned int              numberComponents,
+         const unsigned int              numberLocalDofs,
+         std::vector<dataTypes::number> &ProjHam);
 
     /**
      * @brief Compute projection of the operator into a subspace spanned by a given orthogonal basis HConj=X^{T}*HConj*XConj
@@ -139,8 +140,9 @@ node is stored
      * also avoids creation of another full X memory.
      */
     void
-    XtHX(const std::vector<dataTypes::number> &           X,
+    XtHX(const dataTypes::number *                        X,
          const unsigned int                               numberComponents,
+         const unsigned int                               numberLocalDofs,
          const std::shared_ptr<const dftfe::ProcessGrid> &processGrid,
          dftfe::ScaLAPACKMatrix<dataTypes::number> &      projHamPar,
          const bool onlyHPrimePartForFirstOrderDensityMatResponse = false);
@@ -161,9 +163,10 @@ node is stored
      */
     void
     XtHXMixedPrec(
-      const std::vector<dataTypes::number> &           X,
+      const dataTypes::number *                        X,
       const unsigned int                               N,
       const unsigned int                               Ncore,
+      const unsigned int                               numberLocalDofs,
       const std::shared_ptr<const dftfe::ProcessGrid> &processGrid,
       dftfe::ScaLAPACKMatrix<dataTypes::number> &      projHamPar,
       const bool onlyHPrimePartForFirstOrderDensityMatResponse = false);
diff --git a/include/kohnShamDFTOperatorDevice.h b/include/kohnShamDFTOperatorDevice.h
index a23c4d7ff..8e4d15b53 100644
--- a/include/kohnShamDFTOperatorDevice.h
+++ b/include/kohnShamDFTOperatorDevice.h
@@ -21,6 +21,7 @@
 #include <constants.h>
 #include <headers.h>
 #include <operatorDevice.h>
+#include <FEBasisOperations.h>
 
 namespace dftfe
 {
@@ -85,14 +86,14 @@ namespace dftfe
     dftfe::utils::MemoryStorage<double, dftfe::utils::MemorySpace::DEVICE> &
     getShapeFunctionValuesNLPTransposed();
 
-    dftfe::utils::MemoryStorage<double, dftfe::utils::MemorySpace::DEVICE> &
-    getShapeFunctionGradientValuesXTransposed();
+    // dftfe::utils::MemoryStorage<double, dftfe::utils::MemorySpace::DEVICE> &
+    // getShapeFunctionGradientValuesXTransposed();
 
-    dftfe::utils::MemoryStorage<double, dftfe::utils::MemorySpace::DEVICE> &
-    getShapeFunctionGradientValuesYTransposed();
+    // dftfe::utils::MemoryStorage<double, dftfe::utils::MemorySpace::DEVICE> &
+    // getShapeFunctionGradientValuesYTransposed();
 
-    dftfe::utils::MemoryStorage<double, dftfe::utils::MemorySpace::DEVICE> &
-    getShapeFunctionGradientValuesZTransposed();
+    // dftfe::utils::MemoryStorage<double, dftfe::utils::MemorySpace::DEVICE> &
+    // getShapeFunctionGradientValuesZTransposed();
 
     dftfe::utils::MemoryStorage<double, dftfe::utils::MemorySpace::DEVICE> &
     getShapeFunctionGradientValuesNLPTransposed();
@@ -647,6 +648,16 @@ namespace dftfe
 
     /// pointer to dft class
     dftClass<FEOrder, FEOrderElectro> *dftPtr;
+    std::shared_ptr<
+      dftfe::basis::FEBasisOperations<dataTypes::number,
+                                      double,
+                                      dftfe::utils::MemorySpace::DEVICE>>
+      basisOperationsPtrDevice;
+    std::shared_ptr<
+      dftfe::basis::FEBasisOperations<dataTypes::number,
+                                      double,
+                                      dftfe::utils::MemorySpace::HOST>>
+      basisOperationsPtrHost;
 
 
     /// data structures to store diagonal of inverse square root mass matrix and
@@ -686,17 +697,6 @@ namespace dftfe
     dftfe::utils::MemoryStorage<double, dftfe::utils::MemorySpace::DEVICE>
       d_shapeFunctionValueTransposedLpspDevice;
 
-    /// storage for shapefunction gradients
-    std::vector<double> d_shapeFunctionGradientValueX;
-    std::vector<double> d_shapeFunctionGradientValueXTransposed;
-
-    std::vector<double> d_shapeFunctionGradientValueY;
-    std::vector<double> d_shapeFunctionGradientValueYTransposed;
-
-    std::vector<double> d_shapeFunctionGradientValueZ;
-    std::vector<double> d_shapeFunctionGradientValueZTransposed;
-
-
     std::vector<double> d_cellJxWValues;
     dftfe::utils::MemoryStorage<double, dftfe::utils::MemorySpace::DEVICE>
       d_cellJxWValuesDevice;
diff --git a/include/linearAlgebraOperations.h b/include/linearAlgebraOperations.h
index bb361e976..cbb5c9f5a 100644
--- a/include/linearAlgebraOperations.h
+++ b/include/linearAlgebraOperations.h
@@ -584,8 +584,9 @@ namespace dftfe
      */
     template <typename T>
     void
-    gramSchmidtOrthogonalization(std::vector<T> &   X,
+    gramSchmidtOrthogonalization(T *                X,
                                  const unsigned int numberComponents,
+                                 const unsigned int numberDofs,
                                  const MPI_Comm &   mpiComm);
 
 
@@ -621,8 +622,9 @@ namespace dftfe
     template <typename T>
     unsigned int
     pseudoGramSchmidtOrthogonalization(elpaScalaManager &   elpaScala,
-                                       std::vector<T> &     X,
+                                       T *                  X,
                                        const unsigned int   numberComponents,
+                                       const unsigned int   numberDofs,
                                        const MPI_Comm &     mpiCommParent,
                                        const MPI_Comm &     interBandGroupComm,
                                        const MPI_Comm &     mpiCommDomain,
@@ -647,8 +649,9 @@ namespace dftfe
     void
     rayleighRitzGEP(operatorDFTClass &   operatorMatrix,
                     elpaScalaManager &   elpaScala,
-                    std::vector<T> &     X,
+                    T *                  X,
                     const unsigned int   numberComponents,
+                    const unsigned int   numberDofs,
                     const MPI_Comm &     mpiCommParent,
                     const MPI_Comm &     interBandGroupComm,
                     const MPI_Comm &     mpiCommDomain,
@@ -674,8 +677,9 @@ namespace dftfe
     void
     rayleighRitz(operatorDFTClass &   operatorMatrix,
                  elpaScalaManager &   elpaScala,
-                 std::vector<T> &     X,
+                 T *                  X,
                  const unsigned int   numberComponents,
+                 const unsigned int   numberDofs,
                  const MPI_Comm &     mpiCommParent,
                  const MPI_Comm &     interBandGroupComm,
                  const MPI_Comm &     mpiCommDomain,
@@ -702,9 +706,10 @@ namespace dftfe
     void
     rayleighRitzGEPSpectrumSplitDirect(operatorDFTClass &   operatorMatrix,
                                        elpaScalaManager &   elpaScala,
-                                       std::vector<T> &     X,
-                                       std::vector<T> &     Y,
+                                       T *                  X,
+                                       T *                  Y,
                                        const unsigned int   numberComponents,
+                                       const unsigned int   numberDofs,
                                        const unsigned int   numberCoreStates,
                                        const MPI_Comm &     mpiCommParent,
                                        const MPI_Comm &     interBandGroupComm,
@@ -731,18 +736,19 @@ namespace dftfe
      */
     template <typename T>
     void
-    rayleighRitzSpectrumSplitDirect(operatorDFTClass &    operatorMatrix,
-                                    elpaScalaManager &    elpaScala,
-                                    const std::vector<T> &X,
-                                    std::vector<T> &      Y,
-                                    const unsigned int    numberComponents,
-                                    const unsigned int    numberCoreStates,
-                                    const MPI_Comm &      mpiCommParent,
-                                    const MPI_Comm &      interBandGroupComm,
-                                    const MPI_Comm &      mpiCommDomain,
-                                    const bool            useMixedPrec,
-                                    std::vector<double> & eigenValues,
-                                    const dftParameters & dftParams);
+    rayleighRitzSpectrumSplitDirect(operatorDFTClass &   operatorMatrix,
+                                    elpaScalaManager &   elpaScala,
+                                    const T *            X,
+                                    T *                  Y,
+                                    const unsigned int   numberComponents,
+                                    const unsigned int   numberDofs,
+                                    const unsigned int   numberCoreStates,
+                                    const MPI_Comm &     mpiCommParent,
+                                    const MPI_Comm &     interBandGroupComm,
+                                    const MPI_Comm &     mpiCommDomain,
+                                    const bool           useMixedPrec,
+                                    std::vector<double> &eigenValues,
+                                    const dftParameters &dftParams);
 
 
     /** @brief Compute residual norm associated with eigenValue problem of the given operator
@@ -757,8 +763,10 @@ namespace dftfe
     template <typename T>
     void
     computeEigenResidualNorm(operatorDFTClass &         operatorMatrix,
-                             std::vector<T> &           X,
+                             T *                        X,
                              const std::vector<double> &eigenValues,
+                             const unsigned int         numberComponents,
+                             const unsigned int         numberDofs,
                              const MPI_Comm &           mpiCommParent,
                              const MPI_Comm &           mpiCommDomain,
                              const MPI_Comm &           interBandGroupComm,
@@ -772,8 +780,9 @@ namespace dftfe
     void
     densityMatrixEigenBasisFirstOrderResponse(
       operatorDFTClass &         operatorMatrix,
-      std::vector<T> &           X,
+      T *                        X,
       const unsigned int         N,
+      const unsigned int         numberLocalDofs,
       const MPI_Comm &           mpiCommParent,
       const MPI_Comm &           mpiCommDomain,
       const MPI_Comm &           interBandGroupComm,
diff --git a/include/operator.h b/include/operator.h
index 7395eb718..3c74807d9 100644
--- a/include/operator.h
+++ b/include/operator.h
@@ -171,9 +171,10 @@ namespace dftfe
      * @param ProjMatrix projected small matrix
      */
     virtual void
-    XtHX(const std::vector<dataTypes::number> &X,
-         const unsigned int                    numberComponents,
-         std::vector<dataTypes::number> &      ProjHam) = 0;
+    XtHX(const dataTypes::number *       X,
+         const unsigned int              numberComponents,
+         const unsigned int              numberLocalDofs,
+         std::vector<dataTypes::number> &ProjHam) = 0;
 
     /**
      * @brief Compute projection of the operator into a subspace spanned by a given orthogonal basis HProjConj=X^{T}*HConj*XConj
@@ -185,8 +186,9 @@ namespace dftfe
      * of the operation into the given subspace
      */
     virtual void
-    XtHX(const std::vector<dataTypes::number> &           X,
+    XtHX(const dataTypes::number *                        X,
          const unsigned int                               numberComponents,
+         const unsigned int                               numberLocalDofs,
          const std::shared_ptr<const dftfe::ProcessGrid> &processGrid,
          dftfe::ScaLAPACKMatrix<dataTypes::number> &      projHamPar,
          const bool onlyHPrimePartForFirstOrderDensityMatResponse = false) = 0;
@@ -207,9 +209,10 @@ namespace dftfe
      */
     virtual void
     XtHXMixedPrec(
-      const std::vector<dataTypes::number> &           X,
+      const dataTypes::number *                        X,
       const unsigned int                               totalNumberComponents,
       const unsigned int                               singlePrecComponents,
+      const unsigned int                               numberLocalDofs,
       const std::shared_ptr<const dftfe::ProcessGrid> &processGrid,
       dftfe::ScaLAPACKMatrix<dataTypes::number> &      projHamPar,
       const bool onlyHPrimePartForFirstOrderDensityMatResponse = false) = 0;
diff --git a/include/operatorDevice.h b/include/operatorDevice.h
index 5e980b89b..be38b5ff5 100644
--- a/include/operatorDevice.h
+++ b/include/operatorDevice.h
@@ -97,18 +97,6 @@ namespace dftfe
                                         dftfe::utils::MemorySpace::DEVICE> &
     getShapeFunctionValuesNLPTransposed() = 0;
 
-    virtual dftfe::utils::MemoryStorage<double,
-                                        dftfe::utils::MemorySpace::DEVICE> &
-    getShapeFunctionGradientValuesXTransposed() = 0;
-
-    virtual dftfe::utils::MemoryStorage<double,
-                                        dftfe::utils::MemorySpace::DEVICE> &
-    getShapeFunctionGradientValuesYTransposed() = 0;
-
-    virtual dftfe::utils::MemoryStorage<double,
-                                        dftfe::utils::MemorySpace::DEVICE> &
-    getShapeFunctionGradientValuesZTransposed() = 0;
-
     virtual dftfe::utils::MemoryStorage<double,
                                         dftfe::utils::MemorySpace::DEVICE> &
     getShapeFunctionGradientValuesNLPTransposed() = 0;
@@ -426,14 +414,14 @@ namespace dftfe
     dftfe::utils::MemoryStorage<double, dftfe::utils::MemorySpace::DEVICE>
       d_shapeFunctionValueNLPTransposedDevice;
 
-    dftfe::utils::MemoryStorage<double, dftfe::utils::MemorySpace::DEVICE>
-      d_shapeFunctionGradientValueXTransposedDevice;
+    // dftfe::utils::MemoryStorage<double, dftfe::utils::MemorySpace::DEVICE>
+    //   d_shapeFunctionGradientValueXTransposedDevice;
 
-    dftfe::utils::MemoryStorage<double, dftfe::utils::MemorySpace::DEVICE>
-      d_shapeFunctionGradientValueYTransposedDevice;
+    // dftfe::utils::MemoryStorage<double, dftfe::utils::MemorySpace::DEVICE>
+    //   d_shapeFunctionGradientValueYTransposedDevice;
 
-    dftfe::utils::MemoryStorage<double, dftfe::utils::MemorySpace::DEVICE>
-      d_shapeFunctionGradientValueZTransposedDevice;
+    // dftfe::utils::MemoryStorage<double, dftfe::utils::MemorySpace::DEVICE>
+    //   d_shapeFunctionGradientValueZTransposedDevice;
 
     dftfe::utils::MemoryStorage<double, dftfe::utils::MemorySpace::DEVICE>
       d_shapeFunctionGradientValueNLPTransposedDevice;
@@ -455,10 +443,6 @@ namespace dftfe
                                 dftfe::utils::MemorySpace::DEVICE>
       d_cellWaveFunctionMatrix;
 
-    distributedDeviceVec<dataTypes::number> d_parallelChebyBlockVectorDevice;
-
-    distributedDeviceVec<dataTypes::number> d_parallelChebyBlockVector2Device;
-
     distributedDeviceVec<dataTypes::number>
       d_parallelProjectorKetTimesBlockVectorDevice;
 
diff --git a/include/vectorUtilities.h b/include/vectorUtilities.h
index 5fbbcf4f7..890ddb1a1 100644
--- a/include/vectorUtilities.h
+++ b/include/vectorUtilities.h
@@ -169,8 +169,9 @@ namespace dftfe
      */
     void
     copyFlattenedSTLVecToSingleCompVec(
-      const std::vector<std::complex<double>> &   flattenedArray,
+      const std::complex<double> *                flattenedArray,
       const unsigned int                          totalNumberComponents,
+      const unsigned int                          localVectorSize,
       const std::pair<unsigned int, unsigned int> componentIndexRange,
       const std::vector<dealii::types::global_dof_index>
         &localProcDofIndicesReal,
@@ -180,8 +181,9 @@ namespace dftfe
 
     void
     copyFlattenedSTLVecToSingleCompVec(
-      const std::vector<std::complex<double>> &   flattenedArray,
+      const std::complex<double> *                flattenedArray,
       const unsigned int                          totalNumberComponents,
+      const unsigned int                          localVectorSize,
       const std::pair<unsigned int, unsigned int> componentIndexRange,
 
       std::vector<distributedCPUVec<double>> &componentVectors);
@@ -206,8 +208,9 @@ namespace dftfe
      */
     void
     copyFlattenedSTLVecToSingleCompVec(
-      const std::vector<double> &                 flattenedArray,
+      const double *                              flattenedArray,
       const unsigned int                          totalNumberComponents,
+      const unsigned int                          localVectorSize,
       const std::pair<unsigned int, unsigned int> componentIndexRange,
       std::vector<distributedCPUVec<double>> &    componentVectors);
 
diff --git a/src/dft/computeOutputDensityDirectionalDerivative.cc b/src/dft/computeOutputDensityDirectionalDerivative.cc
index 599550180..c1deaa4ff 100644
--- a/src/dft/computeOutputDensityDirectionalDerivative.cc
+++ b/src/dft/computeOutputDensityDirectionalDerivative.cc
@@ -53,7 +53,7 @@ namespace dftfe
              d_eigenVectorsFlattenedDevice.begin());
 #endif
     if (!d_dftParamsPtr->useDevice)
-      d_eigenVectorsDensityMatrixPrimeSTL = d_eigenVectorsFlattenedSTL;
+      d_eigenVectorsDensityMatrixPrimeHost = d_eigenVectorsFlattenedHost;
 
 
     // set up linear solver
@@ -465,7 +465,7 @@ namespace dftfe
             d_eigenVectorsDensityMatrixPrimeFlattenedDevice.begin(),
             d_densityMatDerFermiEnergy,
             d_numEigenValues,
-            d_eigenVectorsFlattenedSTL[0].size() / d_numEigenValues,
+            matrix_free_data.get_vector_partitioner()->locally_owned_size(),
             kohnShamDFTEigenOperatorDevice,
             d_eigenDofHandlerIndex,
             dofHandler,
@@ -488,7 +488,7 @@ namespace dftfe
             d_eigenVectorsDensityMatrixPrimeFlattenedDevice.begin(),
             d_densityMatDerFermiEnergy,
             d_numEigenValues,
-            d_eigenVectorsFlattenedSTL[0].size() / d_numEigenValues,
+            matrix_free_data.get_vector_partitioner()->locally_owned_size(),
             kohnShamDFTEigenOperatorDevice,
             d_eigenDofHandlerIndex,
             dofHandler,
@@ -511,11 +511,11 @@ namespace dftfe
         if (d_dftParamsPtr->singlePrecLRD)
           computeRhoFirstOrderResponseCPUMixedPrec<dataTypes::number,
                                                    dataTypes::numberFP32>(
-            d_eigenVectorsFlattenedSTL,
-            d_eigenVectorsDensityMatrixPrimeSTL,
+            d_eigenVectorsFlattenedHost.data(),
+            d_eigenVectorsDensityMatrixPrimeHost.data(),
             d_densityMatDerFermiEnergy,
             d_numEigenValues,
-            d_eigenVectorsFlattenedSTL[0].size() / d_numEigenValues,
+            matrix_free_data.get_vector_partitioner()->locally_owned_size(),
             kohnShamDFTEigenOperatorCPU,
             d_eigenDofHandlerIndex,
             dofHandler,
@@ -533,11 +533,11 @@ namespace dftfe
             *d_dftParamsPtr);
         else
           computeRhoFirstOrderResponseCPU(
-            d_eigenVectorsFlattenedSTL,
-            d_eigenVectorsDensityMatrixPrimeSTL,
+            d_eigenVectorsFlattenedHost.data(),
+            d_eigenVectorsDensityMatrixPrimeHost.data(),
             d_densityMatDerFermiEnergy,
             d_numEigenValues,
-            d_eigenVectorsFlattenedSTL[0].size() / d_numEigenValues,
+            matrix_free_data.get_vector_partitioner()->locally_owned_size(),
             kohnShamDFTEigenOperatorCPU,
             d_eigenDofHandlerIndex,
             dofHandler,
diff --git a/src/dft/density.cc b/src/dft/density.cc
index 3722b3cf9..20596afb8 100644
--- a/src/dft/density.cc
+++ b/src/dft/density.cc
@@ -19,10 +19,7 @@
 
 // source file for electron density related computations
 #include <dft.h>
-#include <densityCalculatorCPU.h>
-#ifdef DFTFE_WITH_DEVICE
-#  include <densityCalculatorDevice.h>
-#endif
+#include <densityCalculator.h>
 
 namespace dftfe
 {
@@ -193,66 +190,56 @@ namespace dftfe
 
 #ifdef DFTFE_WITH_DEVICE
         if (d_dftParamsPtr->useDevice)
-          Device::computeRhoFromPSI(
-            d_eigenVectorsFlattenedDevice.begin(),
-            d_eigenVectorsRotFracFlattenedDevice.begin(),
-            d_numEigenValues,
-            d_numEigenValuesRR,
-            d_eigenVectorsFlattenedSTL[0].size() / d_numEigenValues,
-            eigenValues,
-            fermiEnergy,
-            fermiEnergyUp,
-            fermiEnergyDown,
-            kohnShamDFTEigenOperator,
-            d_eigenDofHandlerIndex,
-            dofHandler,
-            matrix_free_data.n_physical_cells(),
-            matrix_free_data.get_dofs_per_cell(d_densityDofHandlerIndex),
-            matrix_free_data.get_quadrature(d_densityQuadratureId).size(),
-            d_kPointWeights,
-            rhoOutValues.get(),
-            gradRhoOutValues.get(),
-            rhoOutValuesSpinPolarized.get(),
-            gradRhoOutValuesSpinPolarized.get(),
-            d_excManagerPtr->getDensityBasedFamilyType() ==
-              densityFamilyType::GGA,
-            d_mpiCommParent,
-            interpoolcomm,
-            interBandGroupComm,
-            *d_dftParamsPtr,
-            isConsiderSpectrumSplitting &&
-              d_numEigenValues != d_numEigenValuesRR);
+          computeRhoFromPSI(&d_eigenVectorsFlattenedDevice,
+                            &d_eigenVectorsRotFracFlattenedDevice,
+                            d_numEigenValues,
+                            d_numEigenValuesRR,
+                            eigenValues,
+                            fermiEnergy,
+                            fermiEnergyUp,
+                            fermiEnergyDown,
+                            basisOperationsPtrDevice,
+                            d_densityDofHandlerIndex,
+                            d_densityQuadratureId,
+                            d_kPointWeights,
+                            rhoOutValues.get(),
+                            gradRhoOutValues.get(),
+                            rhoOutValuesSpinPolarized.get(),
+                            gradRhoOutValuesSpinPolarized.get(),
+                            d_excManagerPtr->getDensityBasedFamilyType() ==
+                              densityFamilyType::GGA,
+                            d_mpiCommParent,
+                            interpoolcomm,
+                            interBandGroupComm,
+                            *d_dftParamsPtr,
+                            isConsiderSpectrumSplitting &&
+                              d_numEigenValues != d_numEigenValuesRR);
 #endif
         if (!d_dftParamsPtr->useDevice)
-          computeRhoFromPSICPU(
-            d_eigenVectorsFlattenedSTL,
-            d_eigenVectorsRotFracDensityFlattenedSTL,
-            d_numEigenValues,
-            d_numEigenValuesRR,
-            d_eigenVectorsFlattenedSTL[0].size() / d_numEigenValues,
-            eigenValues,
-            fermiEnergy,
-            fermiEnergyUp,
-            fermiEnergyDown,
-            kohnShamDFTEigenOperatorCPU,
-            dofHandler,
-            matrix_free_data.n_physical_cells(),
-            matrix_free_data.get_dofs_per_cell(d_densityDofHandlerIndex),
-            matrix_free_data.get_quadrature(d_densityQuadratureId).size(),
-            d_kPointWeights,
-            rhoOutValues.get(),
-            gradRhoOutValues.get(),
-            rhoOutValuesSpinPolarized.get(),
-            gradRhoOutValuesSpinPolarized.get(),
-            d_excManagerPtr->getDensityBasedFamilyType() ==
-              densityFamilyType::GGA,
-            d_mpiCommParent,
-            interpoolcomm,
-            interBandGroupComm,
-            *d_dftParamsPtr,
-            isConsiderSpectrumSplitting &&
-              d_numEigenValues != d_numEigenValuesRR,
-            false);
+          computeRhoFromPSI(&d_eigenVectorsFlattenedHost,
+                            &d_eigenVectorsRotFracDensityFlattenedHost,
+                            d_numEigenValues,
+                            d_numEigenValuesRR,
+                            eigenValues,
+                            fermiEnergy,
+                            fermiEnergyUp,
+                            fermiEnergyDown,
+                            basisOperationsPtrHost,
+                            d_densityDofHandlerIndex,
+                            d_densityQuadratureId,
+                            d_kPointWeights,
+                            rhoOutValues.get(),
+                            gradRhoOutValues.get(),
+                            rhoOutValuesSpinPolarized.get(),
+                            gradRhoOutValuesSpinPolarized.get(),
+                            d_excManagerPtr->getDensityBasedFamilyType() ==
+                              densityFamilyType::GGA,
+                            d_mpiCommParent,
+                            interpoolcomm,
+                            interBandGroupComm,
+                            *d_dftParamsPtr,
+                            isConsiderSpectrumSplitting &&
+                              d_numEigenValues != d_numEigenValuesRR);
         // normalizeRhoOutQuadValues();
 
         if (isGroundState)
@@ -611,63 +598,54 @@ namespace dftfe
         // nodes in each cell
 #ifdef DFTFE_WITH_DEVICE
     if (d_dftParamsPtr->useDevice)
-      Device::computeRhoFromPSI(
-        d_eigenVectorsFlattenedDevice.begin(),
-        d_eigenVectorsRotFracFlattenedDevice.begin(),
-        d_numEigenValues,
-        d_numEigenValuesRR,
-        d_eigenVectorsFlattenedSTL[0].size() / d_numEigenValues,
-        eigenValues,
-        fermiEnergy,
-        fermiEnergyUp,
-        fermiEnergyDown,
-        kohnShamDFTEigenOperator,
-        d_eigenDofHandlerIndex,
-        dofHandler,
-        matrix_free_data.n_physical_cells(),
-        matrix_free_data.get_dofs_per_cell(d_densityDofHandlerIndex),
-        quadrature_formula.size(),
-        d_kPointWeights,
-        &rhoPRefinedNodalData,
-        &_gradRhoValues,
-        &rhoPRefinedSpinPolarizedNodalData,
-        &_gradRhoValuesSpinPolarized,
-        false,
-        d_mpiCommParent,
-        interpoolcomm,
-        interBandGroupComm,
-        *d_dftParamsPtr,
-        isConsiderSpectrumSplitting && d_numEigenValues != d_numEigenValuesRR,
-        true);
+      computeRhoFromPSI(&d_eigenVectorsFlattenedDevice,
+                        &d_eigenVectorsRotFracFlattenedDevice,
+                        d_numEigenValues,
+                        d_numEigenValuesRR,
+                        eigenValues,
+                        fermiEnergy,
+                        fermiEnergyUp,
+                        fermiEnergyDown,
+                        basisOperationsPtrDevice,
+                        d_densityDofHandlerIndex,
+                        d_gllQuadratureId,
+                        d_kPointWeights,
+                        &rhoPRefinedNodalData,
+                        &_gradRhoValues,
+                        &rhoPRefinedSpinPolarizedNodalData,
+                        &_gradRhoValuesSpinPolarized,
+                        false,
+                        d_mpiCommParent,
+                        interpoolcomm,
+                        interBandGroupComm,
+                        *d_dftParamsPtr,
+                        isConsiderSpectrumSplitting &&
+                          d_numEigenValues != d_numEigenValuesRR);
 #endif
     if (!d_dftParamsPtr->useDevice)
-      computeRhoFromPSICPU(
-        d_eigenVectorsFlattenedSTL,
-        d_eigenVectorsRotFracDensityFlattenedSTL,
-        d_numEigenValues,
-        d_numEigenValuesRR,
-        d_eigenVectorsFlattenedSTL[0].size() / d_numEigenValues,
-        eigenValues,
-        fermiEnergy,
-        fermiEnergyUp,
-        fermiEnergyDown,
-        kohnShamDFTEigenOperatorCPU,
-        dofHandler,
-        matrix_free_data.n_physical_cells(),
-        matrix_free_data.get_dofs_per_cell(d_densityDofHandlerIndex),
-        quadrature_formula.size(),
-        d_kPointWeights,
-        &rhoPRefinedNodalData,
-        &_gradRhoValues,
-        &rhoPRefinedSpinPolarizedNodalData,
-        &_gradRhoValuesSpinPolarized,
-        false,
-        d_mpiCommParent,
-        interpoolcomm,
-        interBandGroupComm,
-        *d_dftParamsPtr,
-        isConsiderSpectrumSplitting && d_numEigenValues != d_numEigenValuesRR,
-        true);
+      computeRhoFromPSI(&d_eigenVectorsFlattenedHost,
+                        &d_eigenVectorsRotFracDensityFlattenedHost,
+                        d_numEigenValues,
+                        d_numEigenValuesRR,
+                        eigenValues,
+                        fermiEnergy,
+                        fermiEnergyUp,
+                        fermiEnergyDown,
+                        basisOperationsPtrHost,
+                        d_densityDofHandlerIndex,
+                        d_gllQuadratureId,
+                        d_kPointWeights,
+                        &rhoPRefinedNodalData,
+                        &_gradRhoValues,
+                        &rhoPRefinedSpinPolarizedNodalData,
+                        &_gradRhoValuesSpinPolarized,
+                        false,
+                        d_mpiCommParent,
+                        interpoolcomm,
+                        interBandGroupComm,
+                        *d_dftParamsPtr,
+                        isConsiderSpectrumSplitting &&
+                          d_numEigenValues != d_numEigenValuesRR);
 
     // copy Lobatto quadrature data to fill in 2p DoFHandler nodal data
     dealii::DoFHandler<3>::active_cell_iterator cellP = d_dofHandlerRhoNodal
diff --git a/src/dft/densityCalculator.cc b/src/dft/densityCalculator.cc
new file mode 100644
index 000000000..9e4f1dbeb
--- /dev/null
+++ b/src/dft/densityCalculator.cc
@@ -0,0 +1,772 @@
+// ---------------------------------------------------------------------
+//
+// Copyright (c) 2017-2022 The Regents of the University of Michigan and DFT-FE
+// authors.
+//
+// This file is part of the DFT-FE code.
+//
+// The DFT-FE code is free software; you can use it, redistribute
+// it, and/or modify it under the terms of the GNU Lesser General
+// Public License as published by the Free Software Foundation; either
+// version 2.1 of the License, or (at your option) any later version.
+// The full text of the license can be found in the file LICENSE at
+// the top level of the DFT-FE distribution.
+//
+// ---------------------------------------------------------------------
+//
+// @author Sambit Das
+//
+
+// source file for electron density related computations
+#include <constants.h>
+#include <densityCalculator.h>
+#include <dftUtils.h>
+#include <vectorUtilities.h>
+#include <MemoryStorage.h>
+#include <DataTypeOverloads.h>
+#include <deviceKernelsGeneric.h>
+#include <linearAlgebraOperationsDevice.h>
+#include <DeviceAPICalls.h>
+#include <DeviceDataTypeOverloads.h>
+#include <DeviceTypeConfig.h>
+#include <DeviceKernelLauncherConstants.h>
+#include <DeviceBlasWrapper.h>
+
+namespace dftfe
+{
+  template <typename NumberType, dftfe::utils::MemorySpace memorySpace>
+  void
+  computeRhoFromPSI(
+    const dftfe::utils::MemoryStorage<NumberType, memorySpace> *X,
+    const dftfe::utils::MemoryStorage<NumberType, memorySpace> *XFrac,
+    const unsigned int                      totalNumWaveFunctions,
+    const unsigned int                      Nfr,
+    const std::vector<std::vector<double>> &eigenValues,
+    const double                            fermiEnergy,
+    const double                            fermiEnergyUp,
+    const double                            fermiEnergyDown,
+    std::shared_ptr<
+      dftfe::basis::FEBasisOperations<NumberType, double, memorySpace>>
+      &                                            basisOperationsPtr,
+    const unsigned int                             matrixFreeDofhandlerIndex,
+    const unsigned int                             quadratureIndex,
+    const std::vector<double> &                    kPointWeights,
+    std::map<dealii::CellId, std::vector<double>> *rhoValues,
+    std::map<dealii::CellId, std::vector<double>> *gradRhoValues,
+    std::map<dealii::CellId, std::vector<double>> *rhoValuesSpinPolarized,
+    std::map<dealii::CellId, std::vector<double>> *gradRhoValuesSpinPolarized,
+    const bool                                     isEvaluateGradRho,
+    const MPI_Comm &                               mpiCommParent,
+    const MPI_Comm &                               interpoolcomm,
+    const MPI_Comm &                               interBandGroupComm,
+    const dftParameters &                          dftParams,
+    const bool                                     spectrumSplit)
+  {
+    int this_process;
+    MPI_Comm_rank(mpiCommParent, &this_process);
+#if defined(DFTFE_WITH_DEVICE)
+    dftfe::utils::deviceSynchronize();
+#endif
+    MPI_Barrier(mpiCommParent);
+    double             computeRho_time = MPI_Wtime();
+    const unsigned int numKPoints      = kPointWeights.size();
+    const unsigned int numLocalDofs    = basisOperationsPtr->nOwnedDofs();
+    const unsigned int totalLocallyOwnedCells = basisOperationsPtr->nCells();
+    const unsigned int numNodesPerElement = basisOperationsPtr->nDofsPerCell();
+    // band group parallelization data structures
+    const unsigned int numberBandGroups =
+      dealii::Utilities::MPI::n_mpi_processes(interBandGroupComm);
+    const unsigned int bandGroupTaskId =
+      dealii::Utilities::MPI::this_mpi_process(interBandGroupComm);
+    std::vector<unsigned int> bandGroupLowHighPlusOneIndices;
+    dftUtils::createBandParallelizationIndices(interBandGroupComm,
+                                               totalNumWaveFunctions,
+                                               bandGroupLowHighPlusOneIndices);
+
+    const unsigned int BVec =
+      std::min(dftParams.chebyWfcBlockSize, bandGroupLowHighPlusOneIndices[1]);
+
+    const double spinPolarizedFactor =
+      (dftParams.spinPolarized == 1) ? 1.0 : 2.0;
+    const unsigned int numSpinComponents =
+      (dftParams.spinPolarized == 1) ? 2 : 1;
+
+    const NumberType zero                    = 0;
+    const NumberType scalarCoeffAlphaRho     = 1.0;
+    const NumberType scalarCoeffBetaRho      = 1.0;
+    const NumberType scalarCoeffAlphaGradRho = 1.0;
+    const NumberType scalarCoeffBetaGradRho  = 1.0;
+
+    const unsigned int cellsBlockSize =
+      memorySpace == dftfe::utils::MemorySpace::DEVICE ? 50 : 1;
+    const unsigned int numCellBlocks = totalLocallyOwnedCells / cellsBlockSize;
+    const unsigned int remCellBlockSize =
+      totalLocallyOwnedCells - numCellBlocks * cellsBlockSize;
+    basisOperationsPtr->reinit(BVec, cellsBlockSize, quadratureIndex);
+    const unsigned int numQuadPoints = basisOperationsPtr->nQuadsPerCell();
+
+    std::vector<dftfe::utils::MemoryStorage<NumberType, memorySpace>>
+      wfcQuadPointData(numSpinComponents);
+    std::vector<dftfe::utils::MemoryStorage<NumberType, memorySpace>>
+      gradWfcQuadPointData(numSpinComponents);
+    std::vector<dftfe::utils::MemoryStorage<double, memorySpace>>
+      rhoWfcContributions(numSpinComponents);
+    std::vector<dftfe::utils::MemoryStorage<double, memorySpace>>
+      gradRhoWfcContributions(numSpinComponents);
+    dftfe::utils::MemoryStorage<double, dftfe::utils::MemorySpace::HOST>
+      rhoHost;
+
+    dftfe::utils::MemoryStorage<double, dftfe::utils::MemorySpace::HOST>
+      gradRhoHost;
+#if defined(DFTFE_WITH_DEVICE)
+    dftfe::utils::MemoryStorage<double, memorySpace> rho;
+    dftfe::utils::MemoryStorage<double, memorySpace> gradRho;
+#else
+    auto &rho             = rhoHost;
+    auto &gradRho         = gradRhoHost;
+#endif
+
+    rho.resize(totalLocallyOwnedCells * numQuadPoints * numSpinComponents, 0.0);
+    for (unsigned int spinIndex = 0; spinIndex < numSpinComponents; ++spinIndex)
+      {
+        wfcQuadPointData[spinIndex].resize(cellsBlockSize * numQuadPoints *
+                                             BVec,
+                                           zero);
+
+        if (memorySpace == dftfe::utils::MemorySpace::DEVICE)
+          rhoWfcContributions[spinIndex].resize(cellsBlockSize * numQuadPoints *
+                                                  BVec,
+                                                0.0);
+      }
+    if (isEvaluateGradRho)
+      {
+        gradRho.resize(totalLocallyOwnedCells * numQuadPoints * 3 *
+                         numSpinComponents,
+                       0.0);
+        for (unsigned int spinIndex = 0; spinIndex < numSpinComponents;
+             ++spinIndex)
+          {
+            gradWfcQuadPointData[spinIndex].resize(cellsBlockSize *
+                                                     numQuadPoints * BVec * 3,
+                                                   zero);
+            if (memorySpace == dftfe::utils::MemorySpace::DEVICE)
+              gradRhoWfcContributions[spinIndex].resize(
+                cellsBlockSize * numQuadPoints * BVec * 3, 0.0);
+          }
+      }
+
+
+
+    std::vector<
+      dftfe::utils::MemoryStorage<double, dftfe::utils::MemorySpace::HOST>>
+    partialOccupVecHost(
+      numSpinComponents,
+      dftfe::utils::MemoryStorage<double, dftfe::utils::MemorySpace::HOST>(
+        BVec, 0.0));
+#if defined(DFTFE_WITH_DEVICE)
+    std::vector<dftfe::utils::MemoryStorage<double, memorySpace>>
+      partialOccupVec(numSpinComponents);
+    for (unsigned int spinIndex = 0; spinIndex < numSpinComponents; ++spinIndex)
+      partialOccupVec[spinIndex].resize(partialOccupVecHost[spinIndex].size());
+#else
+    auto &partialOccupVec = partialOccupVecHost;
+#endif
+
+    std::vector<dftfe::linearAlgebra::MultiVector<NumberType, memorySpace> *>
+      flattenedArrayBlock(numSpinComponents);
+
+    dftfe::utils::MemoryStorage<NumberType, memorySpace> cellWaveFunctionMatrix(
+      cellsBlockSize * numNodesPerElement * BVec);
+
+    for (unsigned int kPoint = 0; kPoint < kPointWeights.size(); ++kPoint)
+      {
+        for (unsigned int spinIndex = 0; spinIndex < numSpinComponents;
+             ++spinIndex)
+          {
+            wfcQuadPointData[spinIndex].setValue(zero);
+            gradWfcQuadPointData[spinIndex].setValue(zero);
+            rhoWfcContributions[spinIndex].setValue(0.0);
+            gradRhoWfcContributions[spinIndex].setValue(0.0);
+          }
+        for (unsigned int jvec = 0; jvec < totalNumWaveFunctions; jvec += BVec)
+          {
+            const unsigned int currentBlockSize =
+              std::min(BVec, totalNumWaveFunctions - jvec);
+            for (unsigned int spinIndex = 0; spinIndex < numSpinComponents;
+                 ++spinIndex)
+              flattenedArrayBlock[spinIndex] =
+                &(basisOperationsPtr->getMultiVector(currentBlockSize,
+                                                     spinIndex));
+
+            if ((jvec + currentBlockSize) <=
+                  bandGroupLowHighPlusOneIndices[2 * bandGroupTaskId + 1] &&
+                (jvec + currentBlockSize) >
+                  bandGroupLowHighPlusOneIndices[2 * bandGroupTaskId])
+              {
+                for (unsigned int spinIndex = 0; spinIndex < numSpinComponents;
+                     ++spinIndex)
+                  if (spectrumSplit)
+                    {
+                      partialOccupVecHost[spinIndex].setValue(
+                        kPointWeights[kPoint] * spinPolarizedFactor);
+                    }
+                  else
+                    {
+                      if (dftParams.constraintMagnetization)
+                        {
+                          const double fermiEnergyConstraintMag =
+                            spinIndex == 0 ? fermiEnergyUp : fermiEnergyDown;
+                          for (unsigned int iEigenVec = 0;
+                               iEigenVec < currentBlockSize;
+                               ++iEigenVec)
+                            {
+                              if (eigenValues[kPoint][totalNumWaveFunctions *
+                                                        spinIndex +
+                                                      jvec + iEigenVec] >
+                                  fermiEnergyConstraintMag)
+                                *(partialOccupVecHost[spinIndex].begin() +
+                                  iEigenVec) = 0;
+                              else
+                                *(partialOccupVecHost[spinIndex].begin() +
+                                  iEigenVec) =
+                                  kPointWeights[kPoint] * spinPolarizedFactor;
+                            }
+                        }
+                      else
+                        {
+                          for (unsigned int iEigenVec = 0;
+                               iEigenVec < currentBlockSize;
+                               ++iEigenVec)
+                            {
+                              *(partialOccupVecHost[spinIndex].begin() +
+                                iEigenVec) =
+                                dftUtils::getPartialOccupancy(
+                                  eigenValues[kPoint][totalNumWaveFunctions *
+                                                        spinIndex +
+                                                      jvec + iEigenVec],
+                                  fermiEnergy,
+                                  C_kb,
+                                  dftParams.TVal) *
+                                kPointWeights[kPoint] * spinPolarizedFactor;
+                            }
+                        }
+                    }
+#if defined(DFTFE_WITH_DEVICE)
+                for (unsigned int spinIndex = 0; spinIndex < numSpinComponents;
+                     ++spinIndex)
+                  partialOccupVec[spinIndex].copyFrom(
+                    partialOccupVecHost[spinIndex]);
+#endif
+                for (unsigned int spinIndex = 0; spinIndex < numSpinComponents;
+                     ++spinIndex)
+                  if (memorySpace == dftfe::utils::MemorySpace::HOST)
+                    for (unsigned int iNode = 0; iNode < numLocalDofs; ++iNode)
+                      std::memcpy(flattenedArrayBlock[spinIndex]->data() +
+                                    iNode * currentBlockSize,
+                                  X->data() +
+                                    numLocalDofs * totalNumWaveFunctions *
+                                      (numSpinComponents * kPoint + spinIndex) +
+                                    iNode * totalNumWaveFunctions + jvec,
+                                  currentBlockSize * sizeof(NumberType));
+#if defined(DFTFE_WITH_DEVICE)
+                  else if (memorySpace == dftfe::utils::MemorySpace::DEVICE)
+                    dftfe::utils::deviceKernelsGeneric::
+                      stridedCopyToBlockConstantStride(
+                        currentBlockSize,
+                        totalNumWaveFunctions,
+                        numLocalDofs,
+                        jvec,
+                        X->data() + numLocalDofs * totalNumWaveFunctions *
+                                      (numSpinComponents * kPoint + spinIndex),
+                        flattenedArrayBlock[spinIndex]->data());
+#endif
+
+
+                basisOperationsPtr->reinit(currentBlockSize,
+                                           cellsBlockSize,
+                                           quadratureIndex,
+                                           false);
+
+
+                for (unsigned int spinIndex = 0; spinIndex < numSpinComponents;
+                     ++spinIndex)
+                  {
+                    flattenedArrayBlock[spinIndex]->updateGhostValues();
+                    basisOperationsPtr->distribute(
+                      *(flattenedArrayBlock[spinIndex]));
+                  }
+
+                for (int iblock = 0; iblock < (numCellBlocks + 1); iblock++)
+                  {
+                    const unsigned int currentCellsBlockSize =
+                      (iblock == numCellBlocks) ? remCellBlockSize :
+                                                  cellsBlockSize;
+                    if (currentCellsBlockSize > 0)
+                      {
+                        const unsigned int startingCellId =
+                          iblock * cellsBlockSize;
+
+                        for (unsigned int spinIndex = 0;
+                             spinIndex < numSpinComponents;
+                             ++spinIndex)
+                          basisOperationsPtr->interpolateKernel(
+                            *(flattenedArrayBlock[spinIndex]),
+                            wfcQuadPointData[spinIndex].data(),
+                            isEvaluateGradRho ?
+                              gradWfcQuadPointData[spinIndex].data() :
+                              NULL,
+                            std::pair<unsigned int, unsigned int>(
+                              startingCellId,
+                              startingCellId + currentCellsBlockSize));
+
+                        for (unsigned int spinIndex = 0;
+                             spinIndex < numSpinComponents;
+                             ++spinIndex)
+                          computeRhoGradRhoFromInterpolatedValues(
+                            basisOperationsPtr,
+                            std::pair<unsigned int, unsigned int>(
+                              startingCellId,
+                              startingCellId + currentCellsBlockSize),
+                            std::pair<unsigned int, unsigned int>(
+                              jvec, jvec + currentBlockSize),
+                            partialOccupVec[spinIndex].data(),
+                            wfcQuadPointData[spinIndex].data(),
+                            gradWfcQuadPointData[spinIndex].data(),
+                            rhoWfcContributions[spinIndex].data(),
+                            gradRhoWfcContributions[spinIndex].data(),
+                            rho.data() + spinIndex * totalLocallyOwnedCells *
+                                           numQuadPoints,
+                            gradRho.data() + spinIndex *
+                                               totalLocallyOwnedCells *
+                                               numQuadPoints * 3,
+                            isEvaluateGradRho);
+                      } // non-trivial cell block check
+                  }     // cells block loop
+              }
+          }
+
+        if (spectrumSplit)
+          for (unsigned int jvec = 0; jvec < Nfr; jvec += BVec)
+            {
+              const unsigned int currentBlockSize = std::min(BVec, Nfr - jvec);
+              for (unsigned int spinIndex = 0; spinIndex < numSpinComponents;
+                   ++spinIndex)
+                flattenedArrayBlock[spinIndex] =
+                  &(basisOperationsPtr->getMultiVector(currentBlockSize,
+                                                       spinIndex));
+              if ((jvec + totalNumWaveFunctions - Nfr + currentBlockSize) <=
+                    bandGroupLowHighPlusOneIndices[2 * bandGroupTaskId + 1] &&
+                  (jvec + totalNumWaveFunctions - Nfr + currentBlockSize) >
+                    bandGroupLowHighPlusOneIndices[2 * bandGroupTaskId])
+                {
+                  for (unsigned int spinIndex = 0;
+                       spinIndex < numSpinComponents;
+                       ++spinIndex)
+                    if (dftParams.constraintMagnetization)
+                      {
+                        const double fermiEnergyConstraintMag =
+                          spinIndex == 0 ? fermiEnergyUp : fermiEnergyDown;
+                        for (unsigned int iEigenVec = 0;
+                             iEigenVec < currentBlockSize;
+                             ++iEigenVec)
+                          {
+                            if (eigenValues[kPoint]
+                                           [totalNumWaveFunctions * spinIndex +
+                                            (totalNumWaveFunctions - Nfr) +
+                                            jvec + iEigenVec] >
+                                fermiEnergyConstraintMag)
+                              *(partialOccupVecHost[spinIndex].begin() +
+                                iEigenVec) =
+                                -kPointWeights[kPoint] * spinPolarizedFactor;
+                            else
+                              *(partialOccupVecHost[spinIndex].begin() +
+                                iEigenVec) = 0;
+                          }
+                      }
+                    else
+                      {
+                        for (unsigned int iEigenVec = 0;
+                             iEigenVec < currentBlockSize;
+                             ++iEigenVec)
+                          {
+                            *(partialOccupVecHost[spinIndex].begin() +
+                              iEigenVec) =
+                              (dftUtils::getPartialOccupancy(
+                                 eigenValues[kPoint]
+                                            [totalNumWaveFunctions * spinIndex +
+                                             (totalNumWaveFunctions - Nfr) +
+                                             jvec + iEigenVec],
+                                 fermiEnergy,
+                                 C_kb,
+                                 dftParams.TVal) -
+                               1.0) *
+                              kPointWeights[kPoint] * spinPolarizedFactor;
+                          }
+                      }
+
+#if defined(DFTFE_WITH_DEVICE)
+                  for (unsigned int spinIndex = 0;
+                       spinIndex < numSpinComponents;
+                       ++spinIndex)
+                    {
+                      partialOccupVec[spinIndex].resize(
+                        partialOccupVecHost[spinIndex].size());
+                      partialOccupVec[spinIndex].copyFrom(
+                        partialOccupVecHost[spinIndex]);
+                    }
+#endif
+                  for (unsigned int spinIndex = 0;
+                       spinIndex < numSpinComponents;
+                       ++spinIndex)
+                    if (memorySpace == dftfe::utils::MemorySpace::HOST)
+                      for (unsigned int iNode = 0; iNode < numLocalDofs;
+                           ++iNode)
+                        std::memcpy(flattenedArrayBlock[spinIndex]->data() +
+                                      iNode * currentBlockSize,
+                                    XFrac->data() +
+                                      numLocalDofs * Nfr *
+                                        (numSpinComponents * kPoint +
+                                         spinIndex) +
+                                      iNode * Nfr + jvec,
+                                    currentBlockSize * sizeof(NumberType));
+#if defined(DFTFE_WITH_DEVICE)
+                    else if (memorySpace == dftfe::utils::MemorySpace::DEVICE)
+                      dftfe::utils::deviceKernelsGeneric::
+                        stridedCopyToBlockConstantStride(
+                          currentBlockSize,
+                          Nfr,
+                          numLocalDofs,
+                          jvec,
+                          XFrac->data() +
+                            numLocalDofs * Nfr *
+                              (numSpinComponents * kPoint + spinIndex),
+                          flattenedArrayBlock[spinIndex]->data());
+#endif
+                  basisOperationsPtr->reinit(currentBlockSize,
+                                             cellsBlockSize,
+                                             quadratureIndex,
+                                             false);
+
+
+                  for (unsigned int spinIndex = 0;
+                       spinIndex < numSpinComponents;
+                       ++spinIndex)
+                    {
+                      flattenedArrayBlock[spinIndex]->updateGhostValues();
+                      basisOperationsPtr->distribute(
+                        *(flattenedArrayBlock[spinIndex]));
+                    }
+
+                  for (int iblock = 0; iblock < (numCellBlocks + 1); iblock++)
+                    {
+                      const unsigned int currentCellsBlockSize =
+                        (iblock == numCellBlocks) ? remCellBlockSize :
+                                                    cellsBlockSize;
+                      if (currentCellsBlockSize > 0)
+                        {
+                          const unsigned int startingCellId =
+                            iblock * cellsBlockSize;
+                          for (unsigned int spinIndex = 0;
+                               spinIndex < numSpinComponents;
+                               ++spinIndex)
+                            basisOperationsPtr->interpolateKernel(
+                              *(flattenedArrayBlock[spinIndex]),
+                              wfcQuadPointData[spinIndex].data(),
+                              isEvaluateGradRho ?
+                                gradWfcQuadPointData[spinIndex].data() :
+                                NULL,
+                              std::pair<unsigned int, unsigned int>(
+                                startingCellId,
+                                startingCellId + currentCellsBlockSize));
+
+                          for (unsigned int spinIndex = 0;
+                               spinIndex < numSpinComponents;
+                               ++spinIndex)
+                            computeRhoGradRhoFromInterpolatedValues(
+                              basisOperationsPtr,
+                              std::pair<unsigned int, unsigned int>(
+                                startingCellId,
+                                startingCellId + currentCellsBlockSize),
+                              std::pair<unsigned int, unsigned int>(
+                                jvec, jvec + currentBlockSize),
+                              partialOccupVec[spinIndex].data(),
+                              wfcQuadPointData[spinIndex].data(),
+                              gradWfcQuadPointData[spinIndex].data(),
+                              rhoWfcContributions[spinIndex].data(),
+                              gradRhoWfcContributions[spinIndex].data(),
+                              rho.data() + spinIndex * totalLocallyOwnedCells *
+                                             numQuadPoints,
+                              gradRho.data() + spinIndex *
+                                                 totalLocallyOwnedCells *
+                                                 numQuadPoints * 3,
+                              isEvaluateGradRho);
+                        } // non-trivial cells block
+                    }     // cells block loop
+                }
+            } // spectrum split block
+      }
+#if defined(DFTFE_WITH_DEVICE)
+    rhoHost.resize(rho.size());
+    rhoHost.copyFrom(rho);
+    if (isEvaluateGradRho)
+      {
+        gradRhoHost.resize(gradRho.size());
+        gradRhoHost.copyFrom(gradRho);
+      }
+#endif
+
+    int size;
+    MPI_Comm_size(interpoolcomm, &size);
+    if (size > 1)
+      {
+        MPI_Allreduce(MPI_IN_PLACE,
+                      rhoHost.data(),
+                      totalLocallyOwnedCells * numQuadPoints *
+                        numSpinComponents,
+                      dataTypes::mpi_type_id(rhoHost.data()),
+                      MPI_SUM,
+                      interpoolcomm);
+        if (isEvaluateGradRho)
+          MPI_Allreduce(MPI_IN_PLACE,
+                        gradRhoHost.data(),
+                        totalLocallyOwnedCells * numQuadPoints *
+                          numSpinComponents * 3,
+                        dataTypes::mpi_type_id(gradRhoHost.data()),
+                        MPI_SUM,
+                        interpoolcomm);
+      }
+    MPI_Comm_size(interBandGroupComm, &size);
+    if (size > 1)
+      {
+        MPI_Allreduce(MPI_IN_PLACE,
+                      rhoHost.data(),
+                      totalLocallyOwnedCells * numQuadPoints *
+                        numSpinComponents,
+                      dataTypes::mpi_type_id(rhoHost.data()),
+                      MPI_SUM,
+                      interBandGroupComm);
+        if (isEvaluateGradRho)
+          MPI_Allreduce(MPI_IN_PLACE,
+                        gradRhoHost.data(),
+                        totalLocallyOwnedCells * numQuadPoints *
+                          numSpinComponents * 3,
+                        dataTypes::mpi_type_id(gradRhoHost.data()),
+                        MPI_SUM,
+                        interBandGroupComm);
+      }
+
+    for (unsigned int iElem = 0; iElem < totalLocallyOwnedCells; ++iElem)
+      {
+        const dealii::CellId cellid = basisOperationsPtr->cellID(iElem);
+
+        std::vector<double>  dummy(1);
+        std::vector<double> &tempRhoQuads = (*rhoValues)[cellid];
+        std::vector<double> &tempGradRhoQuads =
+          isEvaluateGradRho ? (*gradRhoValues)[cellid] : dummy;
+
+        std::vector<double> &tempRhoQuadsSP =
+          (dftParams.spinPolarized == 1) ? (*rhoValuesSpinPolarized)[cellid] :
+                                           dummy;
+        std::vector<double> &tempGradRhoQuadsSP =
+          ((dftParams.spinPolarized == 1) && isEvaluateGradRho) ?
+            (*gradRhoValuesSpinPolarized)[cellid] :
+            dummy;
+
+        if (dftParams.spinPolarized == 1)
+          {
+            for (unsigned int q = 0; q < numQuadPoints; ++q)
+              {
+                const double rho0 = rhoHost[iElem * numQuadPoints + q];
+                const double rho1 =
+                  rhoHost[totalLocallyOwnedCells * numQuadPoints +
+                          iElem * numQuadPoints + q];
+                tempRhoQuadsSP[2 * q + 0] = rho0;
+
+                tempRhoQuadsSP[2 * q + 1] = rho1;
+                tempRhoQuads[q]           = rho0 + rho1;
+              }
+
+            if (isEvaluateGradRho)
+              for (unsigned int q = 0; q < numQuadPoints; ++q)
+                {
+                  const double gradRho0x =
+                    gradRhoHost[iElem * numQuadPoints * 3 + 3 * q];
+                  const double gradRho0y =
+                    gradRhoHost[iElem * numQuadPoints * 3 + 3 * q + 1];
+                  const double gradRho0z =
+                    gradRhoHost[iElem * numQuadPoints * 3 + 3 * q + 2];
+                  const double gradRho1x =
+                    gradRhoHost[totalLocallyOwnedCells * numQuadPoints * 3 +
+                                iElem * numQuadPoints * 3 + 3 * q];
+                  const double gradRho1y =
+                    gradRhoHost[totalLocallyOwnedCells * numQuadPoints * 3 +
+                                iElem * numQuadPoints * 3 + 3 * q + 1];
+                  const double gradRho1z =
+                    gradRhoHost[totalLocallyOwnedCells * numQuadPoints * 3 +
+                                iElem * numQuadPoints * 3 + 3 * q + 2];
+                  tempGradRhoQuadsSP[6 * q + 0] = gradRho0x;
+                  tempGradRhoQuadsSP[6 * q + 1] = gradRho0y;
+                  tempGradRhoQuadsSP[6 * q + 2] = gradRho0z;
+                  tempGradRhoQuadsSP[6 * q + 3] = gradRho1x;
+                  tempGradRhoQuadsSP[6 * q + 4] = gradRho1y;
+                  tempGradRhoQuadsSP[6 * q + 5] = gradRho1z;
+                  tempGradRhoQuads[3 * q]       = gradRho0x + gradRho1x;
+                  tempGradRhoQuads[3 * q + 1]   = gradRho0y + gradRho1y;
+                  tempGradRhoQuads[3 * q + 2]   = gradRho0z + gradRho1z;
+                }
+          }
+        else
+          {
+            std::memcpy(tempRhoQuads.data(),
+                        rhoHost.data() + iElem * numQuadPoints,
+                        numQuadPoints * sizeof(double));
+
+            if (isEvaluateGradRho)
+              std::memcpy(tempGradRhoQuads.data(),
+                          gradRhoHost.data() + iElem * numQuadPoints * 3,
+                          3 * numQuadPoints * sizeof(double));
+          }
+      }
+#if defined(DFTFE_WITH_DEVICE)
+    dftfe::utils::deviceSynchronize();
+#endif
+    MPI_Barrier(mpiCommParent);
+    computeRho_time = MPI_Wtime() - computeRho_time;
+
+    if (this_process == 0 && dftParams.verbosity >= 2)
+      if (memorySpace == dftfe::utils::MemorySpace::HOST)
+        std::cout << "Time for compute rho on CPU: " << computeRho_time
+                  << std::endl;
+      else if (memorySpace == dftfe::utils::MemorySpace::DEVICE)
+        std::cout << "Time for compute rho on Device: " << computeRho_time
+                  << std::endl;
+  }
+  /**
+   * @brief Accumulates the occupancy-weighted |psi|^2 of a block of
+   * interpolated wavefunctions into rho and, if isEvaluateGradRho is true,
+   * 2 * occupancy * Re(conj(psi) * grad psi) into gradRho.
+   *
+   * wfcQuadPointData is read block-locally as [cell][quad][wave] and
+   * gradWfcQuadPointData as [cell][dim][quad][wave]; rho and gradRho are
+   * written at the absolute cell index taken from cellRange. partialOccupVec
+   * is indexed by the wavefunction's position inside vecRange. The two
+   * *WfcContributions buffers are accepted but not used by this host
+   * implementation.
+   */
+  template <typename NumberType>
+  void
+  computeRhoGradRhoFromInterpolatedValues(
+    std::shared_ptr<
+      dftfe::basis::
+        FEBasisOperations<NumberType, double, dftfe::utils::MemorySpace::HOST>>
+      &                                         basisOperationsPtr,
+    const std::pair<unsigned int, unsigned int> cellRange,
+    const std::pair<unsigned int, unsigned int> vecRange,
+    double *                                    partialOccupVec,
+    NumberType *                                wfcQuadPointData,
+    NumberType *                                gradWfcQuadPointData,
+    double *                                    rhoCellsWfcContributions,
+    double *                                    gradRhoCellsWfcContributions,
+    double *                                    rho,
+    double *                                    gradRho,
+    const bool                                  isEvaluateGradRho)
+  {
+    const unsigned int vectorsBlockSize = vecRange.second - vecRange.first;
+    const unsigned int nQuadsPerCell    = basisOperationsPtr->nQuadsPerCell();
+    // Offset between consecutive gradient components within one cell.
+    const unsigned int gradDimStride = nQuadsPerCell * vectorsBlockSize;
+    for (unsigned int iCell = cellRange.first; iCell < cellRange.second;
+         ++iCell)
+      {
+        // Block-local start of this cell's values; gradients hold 3x as many.
+        const unsigned int wfcCellOffset =
+          (iCell - cellRange.first) * nQuadsPerCell * vectorsBlockSize;
+        const unsigned int gradCellOffset = wfcCellOffset * 3;
+        for (unsigned int iQuad = 0; iQuad < nQuadsPerCell; ++iQuad)
+          {
+            const unsigned int quadOffset = iQuad * vectorsBlockSize;
+            double &rhoQuad = rho[iCell * nQuadsPerCell + iQuad];
+            for (unsigned int iWave = 0; iWave < vectorsBlockSize; ++iWave)
+              {
+                const NumberType psi =
+                  wfcQuadPointData[wfcCellOffset + quadOffset + iWave];
+                rhoQuad +=
+                  partialOccupVec[iWave] * std::abs(psi) * std::abs(psi);
+                if (!isEvaluateGradRho)
+                  continue;
+                const auto        psiConj = dftfe::utils::complexConj(psi);
+                const double      weight  = 2 * partialOccupVec[iWave];
+                const NumberType *gradPsi =
+                  gradWfcQuadPointData + gradCellOffset + quadOffset + iWave;
+                for (unsigned int iDim = 0; iDim < 3; ++iDim)
+                  gradRho[iCell * nQuadsPerCell * 3 + 3 * iQuad + iDim] +=
+                    weight * dftfe::utils::realPart(
+                               psiConj * gradPsi[iDim * gradDimStride]);
+              }
+          }
+      }
+  }
+#if defined(DFTFE_WITH_DEVICE)
+  template void // explicit instantiation for MemorySpace::DEVICE storage
+  computeRhoFromPSI(
+    const dftfe::utils::MemoryStorage<dataTypes::number,
+                                      dftfe::utils::MemorySpace::DEVICE> *X,
+    const dftfe::utils::MemoryStorage<dataTypes::number,
+                                      dftfe::utils::MemorySpace::DEVICE> *XFrac,
+    const unsigned int                      totalNumWaveFunctions,
+    const unsigned int                      Nfr,
+    const std::vector<std::vector<double>> &eigenValues,
+    const double                            fermiEnergy,
+    const double                            fermiEnergyUp,
+    const double                            fermiEnergyDown,
+    std::shared_ptr<
+      dftfe::basis::FEBasisOperations<dataTypes::number,
+                                      double,
+                                      dftfe::utils::MemorySpace::DEVICE>>
+      &                                            basisOperationsPtrDevice,
+    const unsigned int                             matrixFreeDofhandlerIndex,
+    const unsigned int                             quadratureIndex,
+    const std::vector<double> &                    kPointWeights,
+    std::map<dealii::CellId, std::vector<double>> *rhoValues,
+    std::map<dealii::CellId, std::vector<double>> *gradRhoValues,
+    std::map<dealii::CellId, std::vector<double>> *rhoValuesSpinPolarized,
+    std::map<dealii::CellId, std::vector<double>> *gradRhoValuesSpinPolarized,
+    const bool                                     isEvaluateGradRho,
+    const MPI_Comm &                               mpiCommParent,
+    const MPI_Comm &                               interpoolcomm,
+    const MPI_Comm &                               interBandGroupComm,
+    const dftParameters &                          dftParams,
+    const bool                                     spectrumSplit);
+#endif // DFTFE_WITH_DEVICE
+
+  template void // explicit instantiation for MemorySpace::HOST storage
+  computeRhoFromPSI(
+    const dftfe::utils::MemoryStorage<dataTypes::number,
+                                      dftfe::utils::MemorySpace::HOST> *X,
+    const dftfe::utils::MemoryStorage<dataTypes::number,
+                                      dftfe::utils::MemorySpace::HOST> *XFrac,
+    const unsigned int                      totalNumWaveFunctions,
+    const unsigned int                      Nfr,
+    const std::vector<std::vector<double>> &eigenValues,
+    const double                            fermiEnergy,
+    const double                            fermiEnergyUp,
+    const double                            fermiEnergyDown,
+    std::shared_ptr<
+      dftfe::basis::FEBasisOperations<dataTypes::number,
+                                      double,
+                                      dftfe::utils::MemorySpace::HOST>>
+      &                                            basisOperationsPtr,
+    const unsigned int                             matrixFreeDofhandlerIndex,
+    const unsigned int                             quadratureIndex,
+    const std::vector<double> &                    kPointWeights,
+    std::map<dealii::CellId, std::vector<double>> *rhoValues,
+    std::map<dealii::CellId, std::vector<double>> *gradRhoValues,
+    std::map<dealii::CellId, std::vector<double>> *rhoValuesSpinPolarized,
+    std::map<dealii::CellId, std::vector<double>> *gradRhoValuesSpinPolarized,
+    const bool                                     isEvaluateGradRho,
+    const MPI_Comm &                               mpiCommParent,
+    const MPI_Comm &                               interpoolcomm,
+    const MPI_Comm &                               interBandGroupComm,
+    const dftParameters &                          dftParams,
+    const bool                                     spectrumSplit);
+} // namespace dftfe
diff --git a/src/dft/densityCalculator.inst.cc b/src/dft/densityCalculator.inst.cc
deleted file mode 100644
index 9fbf7a331..000000000
--- a/src/dft/densityCalculator.inst.cc
+++ /dev/null
@@ -1,44 +0,0 @@
-template class DensityCalculator<1, 1>;
-template class DensityCalculator<1, 2>;
-template class DensityCalculator<2, 2>;
-template class DensityCalculator<2, 3>;
-template class DensityCalculator<2, 4>;
-template class DensityCalculator<3, 3>;
-template class DensityCalculator<3, 4>;
-template class DensityCalculator<3, 5>;
-template class DensityCalculator<3, 6>;
-template class DensityCalculator<4, 4>;
-template class DensityCalculator<4, 5>;
-template class DensityCalculator<4, 6>;
-template class DensityCalculator<4, 7>;
-template class DensityCalculator<4, 8>;
-template class DensityCalculator<5, 5>;
-template class DensityCalculator<5, 6>;
-template class DensityCalculator<5, 7>;
-template class DensityCalculator<5, 8>;
-template class DensityCalculator<5, 9>;
-template class DensityCalculator<5, 10>;
-template class DensityCalculator<6, 6>;
-template class DensityCalculator<6, 7>;
-template class DensityCalculator<6, 8>;
-template class DensityCalculator<6, 9>;
-template class DensityCalculator<6, 10>;
-template class DensityCalculator<6, 11>;
-template class DensityCalculator<6, 12>;
-template class DensityCalculator<7, 7>;
-template class DensityCalculator<7, 8>;
-template class DensityCalculator<7, 9>;
-template class DensityCalculator<7, 10>;
-template class DensityCalculator<7, 11>;
-template class DensityCalculator<7, 12>;
-template class DensityCalculator<7, 13>;
-template class DensityCalculator<7, 14>;
-template class DensityCalculator<8, 8>;
-template class DensityCalculator<8, 9>;
-template class DensityCalculator<8, 10>;
-template class DensityCalculator<8, 11>;
-template class DensityCalculator<8, 12>;
-template class DensityCalculator<8, 13>;
-template class DensityCalculator<8, 14>;
-template class DensityCalculator<8, 15>;
-template class DensityCalculator<8, 16>;
diff --git a/src/dft/densityCalculatorCPU.cc b/src/dft/densityCalculatorCPU.cc
deleted file mode 100644
index c94244e4e..000000000
--- a/src/dft/densityCalculatorCPU.cc
+++ /dev/null
@@ -1,849 +0,0 @@
-// ---------------------------------------------------------------------
-//
-// Copyright (c) 2017-2022 The Regents of the University of Michigan and DFT-FE
-// authors.
-//
-// This file is part of the DFT-FE code.
-//
-// The DFT-FE code is free software; you can use it, redistribute
-// it, and/or modify it under the terms of the GNU Lesser General
-// Public License as published by the Free Software Foundation; either
-// version 2.1 of the License, or (at your option) any later version.
-// The full text of the license can be found in the file LICENSE at
-// the top level of the DFT-FE distribution.
-//
-// ---------------------------------------------------------------------
-//
-// @author Sambit Das
-//
-
-// source file for electron density related computations
-#include <constants.h>
-#include <densityCalculatorCPU.h>
-#include <dftUtils.h>
-#include <vectorUtilities.h>
-#include <linearAlgebraOperations.h>
-#include <DataTypeOverloads.h>
-
-namespace dftfe
-{
-  template <typename T>
-  void
-  computeRhoFromPSICPU(
-    const std::vector<std::vector<T>> &            X,
-    const std::vector<std::vector<T>> &            XFrac,
-    const unsigned int                             totalNumWaveFunctions,
-    const unsigned int                             Nfr,
-    const unsigned int                             numLocalDofs,
-    const std::vector<std::vector<double>> &       eigenValues,
-    const double                                   fermiEnergy,
-    const double                                   fermiEnergyUp,
-    const double                                   fermiEnergyDown,
-    operatorDFTClass &                             operatorMatrix,
-    const dealii::DoFHandler<3> &                  dofHandler,
-    const unsigned int                             totalLocallyOwnedCells,
-    const unsigned int                             numNodesPerElement,
-    const unsigned int                             numQuadPoints,
-    const std::vector<double> &                    kPointWeights,
-    std::map<dealii::CellId, std::vector<double>> *rhoValues,
-    std::map<dealii::CellId, std::vector<double>> *gradRhoValues,
-    std::map<dealii::CellId, std::vector<double>> *rhoValuesSpinPolarized,
-    std::map<dealii::CellId, std::vector<double>> *gradRhoValuesSpinPolarized,
-    const bool                                     isEvaluateGradRho,
-    const MPI_Comm &                               mpiCommParent,
-    const MPI_Comm &                               interpoolcomm,
-    const MPI_Comm &                               interBandGroupComm,
-    const dftParameters &                          dftParams,
-    const bool                                     spectrumSplit,
-    const bool                                     useFEOrderRhoPlusOneGLQuad)
-  {
-    int this_process;
-    MPI_Comm_rank(mpiCommParent, &this_process);
-    MPI_Barrier(mpiCommParent);
-    double cpu_time = MPI_Wtime();
-
-    // band group parallelization data structures
-    const unsigned int numberBandGroups =
-      dealii::Utilities::MPI::n_mpi_processes(interBandGroupComm);
-    const unsigned int bandGroupTaskId =
-      dealii::Utilities::MPI::this_mpi_process(interBandGroupComm);
-    std::vector<unsigned int> bandGroupLowHighPlusOneIndices;
-    dftUtils::createBandParallelizationIndices(interBandGroupComm,
-                                               totalNumWaveFunctions,
-                                               bandGroupLowHighPlusOneIndices);
-
-    const unsigned int BVec =
-      std::min(dftParams.chebyWfcBlockSize, bandGroupLowHighPlusOneIndices[1]);
-
-    const double spinPolarizedFactor =
-      (dftParams.spinPolarized == 1) ? 1.0 : 2.0;
-
-
-    std::vector<T> wfcQuads(numQuadPoints * BVec, T(0.0));
-
-    std::vector<T> gradWfcQuads(numQuadPoints * 3 * BVec, T(0.0));
-
-    std::vector<T>     shapeFunctionValues(numQuadPoints * numNodesPerElement,
-                                       T(0.0));
-    std::vector<T>     shapeFunctionGradValues(numQuadPoints * 3 *
-                                             numNodesPerElement,
-                                           T(0.0));
-    const unsigned int numQuadPointsTimes3 = numQuadPoints * 3;
-
-    if (useFEOrderRhoPlusOneGLQuad)
-      {
-        for (unsigned int iquad = 0; iquad < numQuadPoints; ++iquad)
-          for (unsigned int iNode = 0; iNode < numNodesPerElement; ++iNode)
-            shapeFunctionValues[iquad * numNodesPerElement + iNode] =
-              T(operatorMatrix.getShapeFunctionValuesDensityGaussLobattoQuad()
-                  [iquad * numNodesPerElement + iNode]);
-      }
-    else
-      {
-        for (unsigned int iquad = 0; iquad < numQuadPoints; ++iquad)
-          for (unsigned int iNode = 0; iNode < numNodesPerElement; ++iNode)
-            shapeFunctionValues[iquad * numNodesPerElement + iNode] =
-              T(operatorMatrix.getShapeFunctionValuesDensityGaussQuad()
-                  [iquad * numNodesPerElement + iNode]);
-      }
-
-    std::vector<double> partialOccupVecTimesKptWeight(BVec, 0.0);
-
-
-    dftfe::distributedCPUMultiVec<T> flattenedArrayBlock;
-
-    std::vector<T> cellWaveFunctionMatrix(numNodesPerElement * BVec, T(0.0));
-
-
-    // set density to zero
-    typename dealii::DoFHandler<3>::active_cell_iterator cell =
-      dofHandler.begin_active();
-    typename dealii::DoFHandler<3>::active_cell_iterator endc =
-      dofHandler.end();
-    for (; cell != endc; ++cell)
-      if (cell->is_locally_owned())
-        {
-          const dealii::CellId cellid = cell->id();
-
-
-          std::fill((*rhoValues)[cellid].begin(),
-                    (*rhoValues)[cellid].end(),
-                    0.0);
-          if (isEvaluateGradRho)
-            std::fill((*gradRhoValues)[cellid].begin(),
-                      (*gradRhoValues)[cellid].end(),
-                      0.0);
-
-          if (dftParams.spinPolarized == 1)
-            {
-              std::fill((*rhoValuesSpinPolarized)[cellid].begin(),
-                        (*rhoValuesSpinPolarized)[cellid].end(),
-                        0.0);
-              if (isEvaluateGradRho)
-                std::fill((*gradRhoValuesSpinPolarized)[cellid].begin(),
-                          (*gradRhoValuesSpinPolarized)[cellid].end(),
-                          0.0);
-            }
-        }
-
-    std::vector<double> rhoValuesFlattened(totalLocallyOwnedCells *
-                                             numQuadPoints,
-                                           0.0);
-    std::vector<double> gradRhoValuesFlattened(totalLocallyOwnedCells *
-                                                 numQuadPoints * 3,
-                                               0.0);
-    std::vector<double> rhoValuesSpinPolarizedFlattened(totalLocallyOwnedCells *
-                                                          numQuadPoints * 2,
-                                                        0.0);
-    std::vector<double> gradRhoValuesSpinPolarizedFlattened(
-      totalLocallyOwnedCells * numQuadPoints * 6, 0.0);
-
-
-    for (unsigned int spinIndex = 0; spinIndex < (1 + dftParams.spinPolarized);
-         ++spinIndex)
-      {
-        for (unsigned int kPoint = 0; kPoint < kPointWeights.size(); ++kPoint)
-          {
-            std::vector<double> rhoContribution(totalLocallyOwnedCells *
-                                                  numQuadPoints,
-                                                0.0);
-
-            std::vector<double> gradRhoXContribution(
-              isEvaluateGradRho ? (totalLocallyOwnedCells * numQuadPoints) : 1,
-              0.0);
-            std::vector<double> gradRhoYContribution(
-              isEvaluateGradRho ? (totalLocallyOwnedCells * numQuadPoints) : 1,
-              0.0);
-            std::vector<double> gradRhoZContribution(
-              isEvaluateGradRho ? (totalLocallyOwnedCells * numQuadPoints) : 1,
-              0.0);
-
-            const std::vector<T> &XCurrentKPoint =
-              X[(dftParams.spinPolarized + 1) * kPoint + spinIndex];
-            const std::vector<T> &XFracCurrentKPoint =
-              XFrac[(dftParams.spinPolarized + 1) * kPoint + spinIndex];
-
-            for (unsigned int jvec = 0; jvec < totalNumWaveFunctions;
-                 jvec += BVec)
-              {
-                const unsigned int currentBlockSize =
-                  std::min(BVec, totalNumWaveFunctions - jvec);
-
-                if (currentBlockSize != BVec || jvec == 0)
-                  operatorMatrix.reinit(currentBlockSize,
-                                        flattenedArrayBlock,
-                                        true);
-
-                if ((jvec + currentBlockSize) <=
-                      bandGroupLowHighPlusOneIndices[2 * bandGroupTaskId + 1] &&
-                    (jvec + currentBlockSize) >
-                      bandGroupLowHighPlusOneIndices[2 * bandGroupTaskId])
-                  {
-                    if (spectrumSplit)
-                      {
-                        std::fill(partialOccupVecTimesKptWeight.begin(),
-                                  partialOccupVecTimesKptWeight.end(),
-                                  kPointWeights[kPoint] * spinPolarizedFactor);
-                      }
-                    else
-                      {
-                        if (dftParams.constraintMagnetization)
-                          {
-                            const double fermiEnergyConstraintMag =
-                              spinIndex == 0 ? fermiEnergyUp : fermiEnergyDown;
-                            for (unsigned int iEigenVec = 0;
-                                 iEigenVec < currentBlockSize;
-                                 ++iEigenVec)
-                              {
-                                if (eigenValues[kPoint][totalNumWaveFunctions *
-                                                          spinIndex +
-                                                        jvec + iEigenVec] >
-                                    fermiEnergyConstraintMag)
-                                  partialOccupVecTimesKptWeight[iEigenVec] =
-                                    0.0;
-                                else
-                                  partialOccupVecTimesKptWeight[iEigenVec] =
-                                    kPointWeights[kPoint] * spinPolarizedFactor;
-                              }
-                          }
-                        else
-                          {
-                            for (unsigned int iEigenVec = 0;
-                                 iEigenVec < currentBlockSize;
-                                 ++iEigenVec)
-                              {
-                                partialOccupVecTimesKptWeight[iEigenVec] =
-                                  dftUtils::getPartialOccupancy(
-                                    eigenValues[kPoint][totalNumWaveFunctions *
-                                                          spinIndex +
-                                                        jvec + iEigenVec],
-                                    fermiEnergy,
-                                    C_kb,
-                                    dftParams.TVal) *
-                                  kPointWeights[kPoint] * spinPolarizedFactor;
-                              }
-                          }
-                      }
-
-
-                    for (unsigned int iNode = 0; iNode < numLocalDofs; ++iNode)
-                      for (unsigned int iWave = 0; iWave < currentBlockSize;
-                           ++iWave)
-                        flattenedArrayBlock
-                          .data()[iNode * currentBlockSize + iWave] =
-                          XCurrentKPoint[iNode * totalNumWaveFunctions + jvec +
-                                         iWave];
-
-
-                    (operatorMatrix.getOverloadedConstraintMatrix())
-                      ->distribute(flattenedArrayBlock, currentBlockSize);
-
-                    for (int icell = 0; icell < totalLocallyOwnedCells; icell++)
-                      {
-                        const unsigned int inc = 1;
-                        for (unsigned int iNode = 0; iNode < numNodesPerElement;
-                             ++iNode)
-                          {
-                            xcopy(
-                              &currentBlockSize,
-                              flattenedArrayBlock.data() +
-                                operatorMatrix
-                                  .getFlattenedArrayCellLocalProcIndexIdMap()
-                                    [icell * numNodesPerElement + iNode],
-                              &inc,
-                              &cellWaveFunctionMatrix[currentBlockSize * iNode],
-                              &inc);
-                          }
-
-
-                        const T scalarCoeffAlpha = T(1.0),
-                                scalarCoeffBeta  = T(0.0);
-                        const char transA = 'N', transB = 'N';
-
-                        xgemm(&transA,
-                              &transB,
-                              &currentBlockSize,
-                              &numQuadPoints,
-                              &numNodesPerElement,
-                              &scalarCoeffAlpha,
-                              &cellWaveFunctionMatrix[0],
-                              &currentBlockSize,
-                              &shapeFunctionValues[0],
-                              &numNodesPerElement,
-                              &scalarCoeffBeta,
-                              &wfcQuads[0],
-                              &currentBlockSize);
-
-                        for (unsigned int iquad = 0; iquad < numQuadPoints;
-                             ++iquad)
-                          for (unsigned int iWave = 0; iWave < currentBlockSize;
-                               ++iWave)
-                            rhoContribution[icell * numQuadPoints + iquad] +=
-                              partialOccupVecTimesKptWeight[iWave] *
-                              std::abs(
-                                wfcQuads[iquad * currentBlockSize + iWave]) *
-                              std::abs(
-                                wfcQuads[iquad * currentBlockSize + iWave]);
-
-                        if (isEvaluateGradRho)
-                          {
-                            for (unsigned int i = 0;
-                                 i < numNodesPerElement * 3 * numQuadPoints;
-                                 ++i)
-                              {
-                                shapeFunctionGradValues[i] = T(
-                                  operatorMatrix
-                                    .getShapeFunctionGradValuesDensityGaussQuad()
-                                      [icell * numNodesPerElement * 3 *
-                                         numQuadPoints +
-                                       i]);
-                              }
-
-                            xgemm(&transA,
-                                  &transB,
-                                  &currentBlockSize,
-                                  &numQuadPointsTimes3,
-                                  &numNodesPerElement,
-                                  &scalarCoeffAlpha,
-                                  &cellWaveFunctionMatrix[0],
-                                  &currentBlockSize,
-                                  &shapeFunctionGradValues[0],
-                                  &numNodesPerElement,
-                                  &scalarCoeffBeta,
-                                  &gradWfcQuads[0],
-                                  &currentBlockSize);
-
-                            for (unsigned int iquad = 0; iquad < numQuadPoints;
-                                 ++iquad)
-                              for (unsigned int iWave = 0;
-                                   iWave < currentBlockSize;
-                                   ++iWave)
-                                {
-                                  const T wfcQuadVal =
-                                    dftfe::utils::complexConj(
-                                      wfcQuads[iquad * currentBlockSize +
-                                               iWave]);
-                                  const T temp1 =
-                                    wfcQuadVal *
-                                    gradWfcQuads[iquad * 3 * currentBlockSize +
-                                                 iWave];
-                                  gradRhoXContribution[icell * numQuadPoints +
-                                                       iquad] +=
-                                    2.0 * partialOccupVecTimesKptWeight[iWave] *
-                                    dftfe::utils::realPart(temp1);
-                                }
-
-                            for (unsigned int iquad = 0; iquad < numQuadPoints;
-                                 ++iquad)
-                              for (unsigned int iWave = 0;
-                                   iWave < currentBlockSize;
-                                   ++iWave)
-                                {
-                                  const T wfcQuadVal =
-                                    dftfe::utils::complexConj(
-                                      wfcQuads[iquad * currentBlockSize +
-                                               iWave]);
-                                  const T temp1 =
-                                    wfcQuadVal *
-                                    gradWfcQuads[iquad * 3 * currentBlockSize +
-                                                 currentBlockSize + iWave];
-                                  gradRhoYContribution[icell * numQuadPoints +
-                                                       iquad] +=
-                                    2.0 * partialOccupVecTimesKptWeight[iWave] *
-                                    dftfe::utils::realPart(temp1);
-                                }
-
-                            for (unsigned int iquad = 0; iquad < numQuadPoints;
-                                 ++iquad)
-                              for (unsigned int iWave = 0;
-                                   iWave < currentBlockSize;
-                                   ++iWave)
-                                {
-                                  const T wfcQuadVal =
-                                    dftfe::utils::complexConj(
-                                      wfcQuads[iquad * currentBlockSize +
-                                               iWave]);
-                                  const T temp1 =
-                                    wfcQuadVal *
-                                    gradWfcQuads[iquad * 3 * currentBlockSize +
-                                                 2 * currentBlockSize + iWave];
-                                  gradRhoZContribution[icell * numQuadPoints +
-                                                       iquad] +=
-                                    2.0 * partialOccupVecTimesKptWeight[iWave] *
-                                    dftfe::utils::realPart(temp1);
-                                }
-                          }
-
-                      } // cells loop
-                  }     // band parallelizatoin check
-              }         // wave function block loop
-
-            if (spectrumSplit)
-              for (unsigned int jvec = 0; jvec < Nfr; jvec += BVec)
-                {
-                  const unsigned int currentBlockSize =
-                    std::min(BVec, Nfr - jvec);
-
-                  if (currentBlockSize != BVec || jvec == 0)
-                    operatorMatrix.reinit(currentBlockSize,
-                                          flattenedArrayBlock,
-                                          true);
-
-                  if ((jvec + totalNumWaveFunctions - Nfr + currentBlockSize) <=
-                        bandGroupLowHighPlusOneIndices[2 * bandGroupTaskId +
-                                                       1] &&
-                      (jvec + totalNumWaveFunctions - Nfr + currentBlockSize) >
-                        bandGroupLowHighPlusOneIndices[2 * bandGroupTaskId])
-                    {
-                      if (dftParams.constraintMagnetization)
-                        {
-                          const double fermiEnergyConstraintMag =
-                            spinIndex == 0 ? fermiEnergyUp : fermiEnergyDown;
-                          for (unsigned int iEigenVec = 0;
-                               iEigenVec < currentBlockSize;
-                               ++iEigenVec)
-                            {
-                              if (eigenValues[kPoint]
-                                             [totalNumWaveFunctions *
-                                                spinIndex +
-                                              (totalNumWaveFunctions - Nfr) +
-                                              jvec + iEigenVec] >
-                                  fermiEnergyConstraintMag)
-                                partialOccupVecTimesKptWeight[iEigenVec] =
-                                  -kPointWeights[kPoint] * spinPolarizedFactor;
-                              else
-                                partialOccupVecTimesKptWeight[iEigenVec] = 0.0;
-                            }
-                        }
-                      else
-                        {
-                          for (unsigned int iEigenVec = 0;
-                               iEigenVec < currentBlockSize;
-                               ++iEigenVec)
-                            {
-                              partialOccupVecTimesKptWeight[iEigenVec] =
-                                (dftUtils::getPartialOccupancy(
-                                   eigenValues[kPoint]
-                                              [totalNumWaveFunctions *
-                                                 spinIndex +
-                                               (totalNumWaveFunctions - Nfr) +
-                                               jvec + iEigenVec],
-                                   fermiEnergy,
-                                   C_kb,
-                                   dftParams.TVal) -
-                                 1.0) *
-                                kPointWeights[kPoint] * spinPolarizedFactor;
-                            }
-                        }
-
-
-                      for (unsigned int iNode = 0; iNode < numLocalDofs;
-                           ++iNode)
-                        for (unsigned int iWave = 0; iWave < currentBlockSize;
-                             ++iWave)
-                          flattenedArrayBlock
-                            .data()[iNode * currentBlockSize + iWave] =
-                            XFracCurrentKPoint[iNode * Nfr + jvec + iWave];
-
-                      (operatorMatrix.getOverloadedConstraintMatrix())
-                        ->distribute(flattenedArrayBlock, currentBlockSize);
-
-                      for (int icell = 0; icell < totalLocallyOwnedCells;
-                           icell++)
-                        {
-                          const unsigned int inc = 1;
-                          for (unsigned int iNode = 0;
-                               iNode < numNodesPerElement;
-                               ++iNode)
-                            {
-                              xcopy(
-                                &currentBlockSize,
-                                flattenedArrayBlock.data() +
-                                  operatorMatrix
-                                    .getFlattenedArrayCellLocalProcIndexIdMap()
-                                      [icell * numNodesPerElement + iNode],
-                                &inc,
-                                &cellWaveFunctionMatrix[currentBlockSize *
-                                                        iNode],
-                                &inc);
-                            }
-
-
-                          const T scalarCoeffAlpha = T(1.0),
-                                  scalarCoeffBeta  = T(0.0);
-                          const char transA = 'N', transB = 'N';
-
-                          xgemm(&transA,
-                                &transB,
-                                &currentBlockSize,
-                                &numQuadPoints,
-                                &numNodesPerElement,
-                                &scalarCoeffAlpha,
-                                &cellWaveFunctionMatrix[0],
-                                &currentBlockSize,
-                                &shapeFunctionValues[0],
-                                &numNodesPerElement,
-                                &scalarCoeffBeta,
-                                &wfcQuads[0],
-                                &currentBlockSize);
-
-                          for (unsigned int iquad = 0; iquad < numQuadPoints;
-                               ++iquad)
-                            for (unsigned int iWave = 0;
-                                 iWave < currentBlockSize;
-                                 ++iWave)
-                              rhoContribution[icell * numQuadPoints + iquad] +=
-                                partialOccupVecTimesKptWeight[iWave] *
-                                std::abs(
-                                  wfcQuads[iquad * currentBlockSize + iWave]) *
-                                std::abs(
-                                  wfcQuads[iquad * currentBlockSize + iWave]);
-
-                          if (isEvaluateGradRho)
-                            {
-                              for (unsigned int i = 0;
-                                   i < numNodesPerElement * 3 * numQuadPoints;
-                                   ++i)
-                                {
-                                  shapeFunctionGradValues[i] = T(
-                                    operatorMatrix
-                                      .getShapeFunctionGradValuesDensityGaussQuad()
-                                        [icell * numNodesPerElement * 3 *
-                                           numQuadPoints +
-                                         i]);
-                                }
-
-                              xgemm(&transA,
-                                    &transB,
-                                    &currentBlockSize,
-                                    &numQuadPointsTimes3,
-                                    &numNodesPerElement,
-                                    &scalarCoeffAlpha,
-                                    &cellWaveFunctionMatrix[0],
-                                    &currentBlockSize,
-                                    &shapeFunctionGradValues[0],
-                                    &numNodesPerElement,
-                                    &scalarCoeffBeta,
-                                    &gradWfcQuads[0],
-                                    &currentBlockSize);
-
-                              for (unsigned int iquad = 0;
-                                   iquad < numQuadPoints;
-                                   ++iquad)
-                                for (unsigned int iWave = 0;
-                                     iWave < currentBlockSize;
-                                     ++iWave)
-                                  {
-                                    const T wfcQuadVal =
-                                      dftfe::utils::complexConj(
-                                        wfcQuads[iquad * currentBlockSize +
-                                                 iWave]);
-                                    const T temp1 =
-                                      wfcQuadVal *
-                                      gradWfcQuads[iquad * 3 *
-                                                     currentBlockSize +
-                                                   iWave];
-                                    gradRhoXContribution[icell * numQuadPoints +
-                                                         iquad] +=
-                                      2.0 *
-                                      partialOccupVecTimesKptWeight[iWave] *
-                                      dftfe::utils::realPart(temp1);
-                                  }
-
-                              for (unsigned int iquad = 0;
-                                   iquad < numQuadPoints;
-                                   ++iquad)
-                                for (unsigned int iWave = 0;
-                                     iWave < currentBlockSize;
-                                     ++iWave)
-                                  {
-                                    const T wfcQuadVal =
-                                      dftfe::utils::complexConj(
-                                        wfcQuads[iquad * currentBlockSize +
-                                                 iWave]);
-                                    const T temp1 =
-                                      wfcQuadVal *
-                                      gradWfcQuads[iquad * 3 *
-                                                     currentBlockSize +
-                                                   currentBlockSize + iWave];
-                                    gradRhoYContribution[icell * numQuadPoints +
-                                                         iquad] +=
-                                      2.0 *
-                                      partialOccupVecTimesKptWeight[iWave] *
-                                      dftfe::utils::realPart(temp1);
-                                  }
-
-                              for (unsigned int iquad = 0;
-                                   iquad < numQuadPoints;
-                                   ++iquad)
-                                for (unsigned int iWave = 0;
-                                     iWave < currentBlockSize;
-                                     ++iWave)
-                                  {
-                                    const T wfcQuadVal =
-                                      dftfe::utils::complexConj(
-                                        wfcQuads[iquad * currentBlockSize +
-                                                 iWave]);
-                                    const T temp1 =
-                                      wfcQuadVal *
-                                      gradWfcQuads[iquad * 3 *
-                                                     currentBlockSize +
-                                                   2 * currentBlockSize +
-                                                   iWave];
-                                    gradRhoZContribution[icell * numQuadPoints +
-                                                         iquad] +=
-                                      2.0 *
-                                      partialOccupVecTimesKptWeight[iWave] *
-                                      dftfe::utils::realPart(temp1);
-                                  }
-                            }
-
-                        } // cells loop
-                    }
-                }
-
-            for (int icell = 0; icell < totalLocallyOwnedCells; icell++)
-              for (unsigned int iquad = 0; iquad < numQuadPoints; ++iquad)
-                {
-                  rhoValuesFlattened[icell * numQuadPoints + iquad] +=
-                    rhoContribution[icell * numQuadPoints + iquad];
-                }
-
-            if (isEvaluateGradRho)
-              for (int icell = 0; icell < totalLocallyOwnedCells; icell++)
-                for (unsigned int iquad = 0; iquad < numQuadPoints; ++iquad)
-                  {
-                    gradRhoValuesFlattened[icell * numQuadPoints * 3 +
-                                           3 * iquad + 0] +=
-                      gradRhoXContribution[icell * numQuadPoints + iquad];
-                    gradRhoValuesFlattened[icell * numQuadPoints * 3 +
-                                           3 * iquad + 1] +=
-                      gradRhoYContribution[icell * numQuadPoints + iquad];
-                    gradRhoValuesFlattened[icell * numQuadPoints * 3 +
-                                           3 * iquad + 2] +=
-                      gradRhoZContribution[icell * numQuadPoints + iquad];
-                  }
-            if (dftParams.spinPolarized == 1)
-              {
-                for (int icell = 0; icell < totalLocallyOwnedCells; icell++)
-                  for (unsigned int iquad = 0; iquad < numQuadPoints; ++iquad)
-                    {
-                      rhoValuesSpinPolarizedFlattened[icell * numQuadPoints *
-                                                        2 +
-                                                      iquad * 2 + spinIndex] +=
-                        rhoContribution[icell * numQuadPoints + iquad];
-                    }
-
-                if (isEvaluateGradRho)
-                  for (int icell = 0; icell < totalLocallyOwnedCells; icell++)
-                    for (unsigned int iquad = 0; iquad < numQuadPoints; ++iquad)
-                      {
-                        gradRhoValuesSpinPolarizedFlattened
-                          [icell * numQuadPoints * 6 + iquad * 6 +
-                           spinIndex * 3] +=
-                          gradRhoXContribution[icell * numQuadPoints + iquad];
-                        gradRhoValuesSpinPolarizedFlattened
-                          [icell * numQuadPoints * 6 + iquad * 6 +
-                           spinIndex * 3 + 1] +=
-                          gradRhoYContribution[icell * numQuadPoints + iquad];
-                        gradRhoValuesSpinPolarizedFlattened
-                          [icell * numQuadPoints * 6 + iquad * 6 +
-                           spinIndex * 3 + 2] +=
-                          gradRhoZContribution[icell * numQuadPoints + iquad];
-                      }
-              }
-
-          } // kpoint loop
-      }     // spin index loop
-
-
-    // gather density from all inter communicators
-    if (dealii::Utilities::MPI::n_mpi_processes(interpoolcomm) > 1)
-      {
-        dealii::Utilities::MPI::sum(rhoValuesFlattened,
-                                    interpoolcomm,
-                                    rhoValuesFlattened);
-
-        if (isEvaluateGradRho)
-          dealii::Utilities::MPI::sum(gradRhoValuesFlattened,
-                                      interpoolcomm,
-                                      gradRhoValuesFlattened);
-
-
-
-        if (dftParams.spinPolarized == 1)
-          {
-            dealii::Utilities::MPI::sum(rhoValuesSpinPolarizedFlattened,
-                                        interpoolcomm,
-                                        rhoValuesSpinPolarizedFlattened);
-
-            if (isEvaluateGradRho)
-              dealii::Utilities::MPI::sum(gradRhoValuesSpinPolarizedFlattened,
-                                          interpoolcomm,
-                                          gradRhoValuesSpinPolarizedFlattened);
-          }
-      }
-
-    if (dealii::Utilities::MPI::n_mpi_processes(interBandGroupComm) > 1)
-      {
-        dealii::Utilities::MPI::sum(rhoValuesFlattened,
-                                    interBandGroupComm,
-                                    rhoValuesFlattened);
-
-        if (isEvaluateGradRho)
-          dealii::Utilities::MPI::sum(gradRhoValuesFlattened,
-                                      interBandGroupComm,
-                                      gradRhoValuesFlattened);
-
-
-        if (dftParams.spinPolarized == 1)
-          {
-            dealii::Utilities::MPI::sum(rhoValuesSpinPolarizedFlattened,
-                                        interBandGroupComm,
-                                        rhoValuesSpinPolarizedFlattened);
-
-            if (isEvaluateGradRho)
-              dealii::Utilities::MPI::sum(gradRhoValuesSpinPolarizedFlattened,
-                                          interBandGroupComm,
-                                          gradRhoValuesSpinPolarizedFlattened);
-          }
-      }
-
-
-    unsigned int iElem = 0;
-    cell               = dofHandler.begin_active();
-    endc               = dofHandler.end();
-    for (; cell != endc; ++cell)
-      if (cell->is_locally_owned())
-        {
-          const dealii::CellId cellid = cell->id();
-
-          std::vector<double>  dummy(1);
-          std::vector<double> &tempRhoQuads = (*rhoValues)[cellid];
-          std::vector<double> &tempGradRhoQuads =
-            isEvaluateGradRho ? (*gradRhoValues)[cellid] : dummy;
-
-          std::vector<double> &tempRhoQuadsSP =
-            (dftParams.spinPolarized == 1) ? (*rhoValuesSpinPolarized)[cellid] :
-                                             dummy;
-          std::vector<double> &tempGradRhoQuadsSP =
-            ((dftParams.spinPolarized == 1) && isEvaluateGradRho) ?
-              (*gradRhoValuesSpinPolarized)[cellid] :
-              dummy;
-
-          if (dftParams.spinPolarized == 1)
-            {
-              for (unsigned int q = 0; q < numQuadPoints; ++q)
-                {
-                  tempRhoQuadsSP[2 * q + 0] =
-                    rhoValuesSpinPolarizedFlattened[iElem * numQuadPoints * 2 +
-                                                    q * 2 + 0];
-
-                  tempRhoQuadsSP[2 * q + 1] =
-                    rhoValuesSpinPolarizedFlattened[iElem * numQuadPoints * 2 +
-                                                    q * 2 + 1];
-                }
-
-              if (isEvaluateGradRho)
-                for (unsigned int q = 0; q < numQuadPoints; ++q)
-                  {
-                    tempGradRhoQuadsSP[6 * q + 0] =
-                      gradRhoValuesSpinPolarizedFlattened[iElem *
-                                                            numQuadPoints * 6 +
-                                                          6 * q];
-                    tempGradRhoQuadsSP[6 * q + 1] =
-                      gradRhoValuesSpinPolarizedFlattened[iElem *
-                                                            numQuadPoints * 6 +
-                                                          6 * q + 1];
-                    tempGradRhoQuadsSP[6 * q + 2] =
-                      gradRhoValuesSpinPolarizedFlattened[iElem *
-                                                            numQuadPoints * 6 +
-                                                          6 * q + 2];
-                    tempGradRhoQuadsSP[6 * q + 3] =
-                      gradRhoValuesSpinPolarizedFlattened[iElem *
-                                                            numQuadPoints * 6 +
-                                                          6 * q + 3];
-                    tempGradRhoQuadsSP[6 * q + 4] =
-                      gradRhoValuesSpinPolarizedFlattened[iElem *
-                                                            numQuadPoints * 6 +
-                                                          6 * q + 4];
-                    tempGradRhoQuadsSP[6 * q + 5] =
-                      gradRhoValuesSpinPolarizedFlattened[iElem *
-                                                            numQuadPoints * 6 +
-                                                          6 * q + 5];
-                  }
-            }
-
-          for (unsigned int q = 0; q < numQuadPoints; ++q)
-            tempRhoQuads[q] = rhoValuesFlattened[iElem * numQuadPoints + q];
-
-
-          if (isEvaluateGradRho)
-            for (unsigned int q = 0; q < numQuadPoints; ++q)
-              {
-                tempGradRhoQuads[3 * q] =
-                  gradRhoValuesFlattened[iElem * numQuadPoints * 3 + q * 3];
-                tempGradRhoQuads[3 * q + 1] =
-                  gradRhoValuesFlattened[iElem * numQuadPoints * 3 + q * 3 + 1];
-                tempGradRhoQuads[3 * q + 2] =
-                  gradRhoValuesFlattened[iElem * numQuadPoints * 3 + q * 3 + 2];
-              }
-          iElem++;
-        }
-
-    MPI_Barrier(mpiCommParent);
-    cpu_time = MPI_Wtime() - cpu_time;
-
-    if (this_process == 0 && dftParams.verbosity >= 2)
-      std::cout << "Time for compute rho on CPU: " << cpu_time << std::endl;
-  }
-
-  template void
-  computeRhoFromPSICPU(
-    const std::vector<std::vector<dataTypes::number>> &X,
-    const std::vector<std::vector<dataTypes::number>> &XFrac,
-    const unsigned int                                 totalNumWaveFunctions,
-    const unsigned int                                 Nfr,
-    const unsigned int                                 numLocalDofs,
-    const std::vector<std::vector<double>> &           eigenValues,
-    const double                                       fermiEnergy,
-    const double                                       fermiEnergyUp,
-    const double                                       fermiEnergyDown,
-    operatorDFTClass &                                 operatorMatrix,
-    const dealii::DoFHandler<3> &                      dofHandler,
-    const unsigned int                                 totalLocallyOwnedCells,
-    const unsigned int                                 numNodesPerElement,
-    const unsigned int                                 numQuadPoints,
-    const std::vector<double> &                        kPointWeights,
-    std::map<dealii::CellId, std::vector<double>> *    rhoValues,
-    std::map<dealii::CellId, std::vector<double>> *    gradRhoValues,
-    std::map<dealii::CellId, std::vector<double>> *    rhoValuesSpinPolarized,
-    std::map<dealii::CellId, std::vector<double>> *gradRhoValuesSpinPolarized,
-    const bool                                     isEvaluateGradRho,
-    const MPI_Comm &                               mpiCommParent,
-    const MPI_Comm &                               interpoolcomm,
-    const MPI_Comm &                               interBandGroupComm,
-    const dftParameters &                          dftParams,
-    const bool                                     spectrumSplit,
-    const bool                                     useFEOrderRhoPlusOneGLQuad);
-} // namespace dftfe
diff --git a/src/dft/densityCalculatorDevice.cc b/src/dft/densityCalculatorDevice.cc
deleted file mode 100644
index ebc6865b2..000000000
--- a/src/dft/densityCalculatorDevice.cc
+++ /dev/null
@@ -1,1283 +0,0 @@
-// ---------------------------------------------------------------------
-//
-// Copyright (c) 2017-2022 The Regents of the University of Michigan and DFT-FE
-// authors.
-//
-// This file is part of the DFT-FE code.
-//
-// The DFT-FE code is free software; you can use it, redistribute
-// it, and/or modify it under the terms of the GNU Lesser General
-// Public License as published by the Free Software Foundation; either
-// version 2.1 of the License, or (at your option) any later version.
-// The full text of the license can be found in the file LICENSE at
-// the top level of the DFT-FE distribution.
-//
-// ---------------------------------------------------------------------
-//
-// @author Sambit Das
-//
-
-// source file for electron density related computations
-#include <constants.h>
-#include <densityCalculatorDevice.h>
-#include <dftUtils.h>
-#include <vectorUtilities.h>
-#include <deviceKernelsGeneric.h>
-#include <linearAlgebraOperationsDevice.h>
-#include <MemoryStorage.h>
-#include <DataTypeOverloads.h>
-#include <DeviceAPICalls.h>
-#include <DeviceDataTypeOverloads.h>
-#include <DeviceTypeConfig.h>
-#include <DeviceKernelLauncherConstants.h>
-#include <DeviceBlasWrapper.h>
-
-namespace dftfe
-{
-  namespace Device
-  {
-    namespace
-    {
-      __global__ void
-      computeRhoGradRhoFromInterpolatedValues(
-        const unsigned int numberEntries,
-        double *           rhoCellsWfcContributions,
-        double *           gradRhoCellsWfcContributionsX,
-        double *           gradRhoCellsWfcContributionsY,
-        double *           gradRhoCellsWfcContributionsZ,
-        const bool         isEvaluateGradRho)
-      {
-        const unsigned int globalThreadId =
-          blockIdx.x * blockDim.x + threadIdx.x;
-
-        for (unsigned int index = globalThreadId; index < numberEntries;
-             index += blockDim.x * gridDim.x)
-          {
-            const double psi                = rhoCellsWfcContributions[index];
-            rhoCellsWfcContributions[index] = psi * psi;
-
-            if (isEvaluateGradRho)
-              {
-                const double gradPsiX = gradRhoCellsWfcContributionsX[index];
-                gradRhoCellsWfcContributionsX[index] = 2.0 * psi * gradPsiX;
-
-                const double gradPsiY = gradRhoCellsWfcContributionsY[index];
-                gradRhoCellsWfcContributionsY[index] = 2.0 * psi * gradPsiY;
-
-                const double gradPsiZ = gradRhoCellsWfcContributionsZ[index];
-                gradRhoCellsWfcContributionsZ[index] = 2.0 * psi * gradPsiZ;
-              }
-          }
-      }
-
-      __global__ void
-      computeRhoGradRhoFromInterpolatedValues(
-        const unsigned int                 numberEntries,
-        dftfe::utils::deviceDoubleComplex *rhoCellsWfcContributions,
-        dftfe::utils::deviceDoubleComplex *gradRhoCellsWfcContributionsX,
-        dftfe::utils::deviceDoubleComplex *gradRhoCellsWfcContributionsY,
-        dftfe::utils::deviceDoubleComplex *gradRhoCellsWfcContributionsZ,
-        const bool                         isEvaluateGradRho)
-      {
-        const unsigned int globalThreadId =
-          blockIdx.x * blockDim.x + threadIdx.x;
-
-        for (unsigned int index = globalThreadId; index < numberEntries;
-             index += blockDim.x * gridDim.x)
-          {
-            const dftfe::utils::deviceDoubleComplex psi =
-              rhoCellsWfcContributions[index];
-            rhoCellsWfcContributions[index] =
-              dftfe::utils::makeComplex(psi.x * psi.x + psi.y * psi.y, 0.0);
-
-            if (isEvaluateGradRho)
-              {
-                const dftfe::utils::deviceDoubleComplex gradPsiX =
-                  gradRhoCellsWfcContributionsX[index];
-                gradRhoCellsWfcContributionsX[index] =
-                  dftfe::utils::makeComplex(2.0 * (psi.x * gradPsiX.x +
-                                                   psi.y * gradPsiX.y),
-                                            0.0);
-
-                const dftfe::utils::deviceDoubleComplex gradPsiY =
-                  gradRhoCellsWfcContributionsY[index];
-                gradRhoCellsWfcContributionsY[index] =
-                  dftfe::utils::makeComplex(2.0 * (psi.x * gradPsiY.x +
-                                                   psi.y * gradPsiY.y),
-                                            0.0);
-
-                const dftfe::utils::deviceDoubleComplex gradPsiZ =
-                  gradRhoCellsWfcContributionsZ[index];
-                gradRhoCellsWfcContributionsZ[index] =
-                  dftfe::utils::makeComplex(2.0 * (psi.x * gradPsiZ.x +
-                                                   psi.y * gradPsiZ.y),
-                                            0.0);
-              }
-          }
-      }
-    } // namespace
-
-    template <typename NumberType>
-    void
-    computeRhoFromPSI(
-      const NumberType *                             X,
-      const NumberType *                             XFrac,
-      const unsigned int                             totalNumWaveFunctions,
-      const unsigned int                             Nfr,
-      const unsigned int                             numLocalDofs,
-      const std::vector<std::vector<double>> &       eigenValues,
-      const double                                   fermiEnergy,
-      const double                                   fermiEnergyUp,
-      const double                                   fermiEnergyDown,
-      operatorDFTDeviceClass &                       operatorMatrix,
-      const unsigned int                             matrixFreeDofhandlerIndex,
-      const dealii::DoFHandler<3> &                  dofHandler,
-      const unsigned int                             totalLocallyOwnedCells,
-      const unsigned int                             numNodesPerElement,
-      const unsigned int                             numQuadPoints,
-      const std::vector<double> &                    kPointWeights,
-      std::map<dealii::CellId, std::vector<double>> *rhoValues,
-      std::map<dealii::CellId, std::vector<double>> *gradRhoValues,
-      std::map<dealii::CellId, std::vector<double>> *rhoValuesSpinPolarized,
-      std::map<dealii::CellId, std::vector<double>> *gradRhoValuesSpinPolarized,
-      const bool                                     isEvaluateGradRho,
-      const MPI_Comm &                               mpiCommParent,
-      const MPI_Comm &                               interpoolcomm,
-      const MPI_Comm &                               interBandGroupComm,
-      const dftParameters &                          dftParams,
-      const bool                                     spectrumSplit,
-      const bool                                     use2pPlusOneGLQuad)
-    {
-      if (use2pPlusOneGLQuad)
-        AssertThrow(!isEvaluateGradRho, dftUtils::ExcNotImplementedYet());
-
-      int this_process;
-      MPI_Comm_rank(mpiCommParent, &this_process);
-      dftfe::utils::deviceSynchronize();
-      MPI_Barrier(mpiCommParent);
-      double             device_time = MPI_Wtime();
-      const unsigned int numKPoints  = kPointWeights.size();
-
-      // band group parallelization data structures
-      const unsigned int numberBandGroups =
-        dealii::Utilities::MPI::n_mpi_processes(interBandGroupComm);
-      const unsigned int bandGroupTaskId =
-        dealii::Utilities::MPI::this_mpi_process(interBandGroupComm);
-      std::vector<unsigned int> bandGroupLowHighPlusOneIndices;
-      dftUtils::createBandParallelizationIndices(
-        interBandGroupComm,
-        totalNumWaveFunctions,
-        bandGroupLowHighPlusOneIndices);
-
-      const unsigned int BVec =
-        std::min(dftParams.chebyWfcBlockSize, totalNumWaveFunctions);
-
-      const double spinPolarizedFactor =
-        (dftParams.spinPolarized == 1) ? 1.0 : 2.0;
-
-      const NumberType zero                    = 0;
-      const NumberType scalarCoeffAlphaRho     = 1.0;
-      const NumberType scalarCoeffBetaRho      = 1.0;
-      const NumberType scalarCoeffAlphaGradRho = 1.0;
-      const NumberType scalarCoeffBetaGradRho  = 1.0;
-
-      const unsigned int cellsBlockSize = 50;
-      const unsigned int numCellBlocks =
-        totalLocallyOwnedCells / cellsBlockSize;
-      const unsigned int remCellBlockSize =
-        totalLocallyOwnedCells - numCellBlocks * cellsBlockSize;
-
-      dftfe::utils::MemoryStorage<NumberType, dftfe::utils::MemorySpace::DEVICE>
-        rhoDevice(totalLocallyOwnedCells * numQuadPoints, zero);
-      dftfe::utils::MemoryStorage<NumberType, dftfe::utils::MemorySpace::DEVICE>
-        rhoWfcContributionsDevice(cellsBlockSize * numQuadPoints * BVec, zero);
-
-      dftfe::utils::MemoryStorage<NumberType, dftfe::utils::MemorySpace::DEVICE>
-        gradRhoDeviceX(isEvaluateGradRho ?
-                         (totalLocallyOwnedCells * numQuadPoints) :
-                         1,
-                       zero);
-      dftfe::utils::MemoryStorage<NumberType, dftfe::utils::MemorySpace::DEVICE>
-        gradRhoDeviceY(isEvaluateGradRho ?
-                         (totalLocallyOwnedCells * numQuadPoints) :
-                         1,
-                       zero);
-      dftfe::utils::MemoryStorage<NumberType, dftfe::utils::MemorySpace::DEVICE>
-        gradRhoDeviceZ(isEvaluateGradRho ?
-                         (totalLocallyOwnedCells * numQuadPoints) :
-                         1,
-                       zero);
-      dftfe::utils::MemoryStorage<NumberType, dftfe::utils::MemorySpace::DEVICE>
-        gradRhoWfcContributionsDeviceX(
-          isEvaluateGradRho ? (cellsBlockSize * numQuadPoints * BVec) : 1,
-          zero);
-      dftfe::utils::MemoryStorage<NumberType, dftfe::utils::MemorySpace::DEVICE>
-        gradRhoWfcContributionsDeviceY(
-          isEvaluateGradRho ? (cellsBlockSize * numQuadPoints * BVec) : 1,
-          zero);
-      dftfe::utils::MemoryStorage<NumberType, dftfe::utils::MemorySpace::DEVICE>
-        gradRhoWfcContributionsDeviceZ(
-          isEvaluateGradRho ? (cellsBlockSize * numQuadPoints * BVec) : 1,
-          zero);
-
-      dftfe::utils::MemoryStorage<NumberType, dftfe::utils::MemorySpace::HOST>
-        rhoHost;
-      dftfe::utils::MemoryStorage<NumberType, dftfe::utils::MemorySpace::HOST>
-        gradRhoHostX;
-      dftfe::utils::MemoryStorage<NumberType, dftfe::utils::MemorySpace::HOST>
-        gradRhoHostY;
-      dftfe::utils::MemoryStorage<NumberType, dftfe::utils::MemorySpace::HOST>
-        gradRhoHostZ;
-
-      rhoHost.resize(totalLocallyOwnedCells * numQuadPoints, zero);
-
-      if (isEvaluateGradRho)
-        {
-          gradRhoHostX.resize(totalLocallyOwnedCells * numQuadPoints, zero);
-
-          gradRhoHostY.resize(totalLocallyOwnedCells * numQuadPoints, zero);
-          gradRhoHostZ.resize(totalLocallyOwnedCells * numQuadPoints, zero);
-        }
-
-
-      dftfe::utils::MemoryStorage<NumberType, dftfe::utils::MemorySpace::DEVICE>
-        shapeFunctionValuesTransposedDevice(numNodesPerElement * numQuadPoints,
-                                            zero);
-
-      shapeFunctionValuesTransposedDevice.setValue(zero);
-
-
-      dftfe::utils::deviceKernelsGeneric::copyValueType1ArrToValueType2Arr(
-        numNodesPerElement * numQuadPoints,
-        (operatorMatrix.getShapeFunctionValuesTransposed(use2pPlusOneGLQuad))
-          .begin(),
-        shapeFunctionValuesTransposedDevice.begin());
-
-      dftfe::utils::MemoryStorage<NumberType, dftfe::utils::MemorySpace::DEVICE>
-        shapeFunctionGradientValuesXTransposedDevice;
-      dftfe::utils::MemoryStorage<NumberType, dftfe::utils::MemorySpace::DEVICE>
-        shapeFunctionGradientValuesYTransposedDevice;
-      dftfe::utils::MemoryStorage<NumberType, dftfe::utils::MemorySpace::DEVICE>
-        shapeFunctionGradientValuesZTransposedDevice;
-
-      if (isEvaluateGradRho)
-        {
-          shapeFunctionGradientValuesXTransposedDevice.resize(
-            cellsBlockSize * numNodesPerElement * numQuadPoints, 0);
-          shapeFunctionGradientValuesXTransposedDevice.setValue(0);
-
-          shapeFunctionGradientValuesYTransposedDevice.resize(
-            cellsBlockSize * numNodesPerElement * numQuadPoints, 0);
-          shapeFunctionGradientValuesYTransposedDevice.setValue(0);
-
-          shapeFunctionGradientValuesZTransposedDevice.resize(
-            cellsBlockSize * numNodesPerElement * numQuadPoints, 0);
-          shapeFunctionGradientValuesZTransposedDevice.setValue(0);
-        }
-
-      dftfe::utils::MemoryStorage<NumberType, dftfe::utils::MemorySpace::HOST>
-        partialOccupVec(BVec, zero);
-      dftfe::utils::MemoryStorage<NumberType, dftfe::utils::MemorySpace::DEVICE>
-        partialOccupVecDevice(BVec, zero);
-
-      distributedDeviceVec<NumberType> &deviceFlattenedArrayBlock =
-        operatorMatrix.getParallelChebyBlockVectorDevice();
-
-      NumberType *cellWaveFunctionMatrix =
-        (operatorMatrix.getCellWaveFunctionMatrix()).begin();
-
-      typename dealii::DoFHandler<3>::active_cell_iterator cell =
-        dofHandler.begin_active();
-      typename dealii::DoFHandler<3>::active_cell_iterator endc =
-        dofHandler.end();
-
-      std::vector<double> rhoValuesFlattened(totalLocallyOwnedCells *
-                                               numQuadPoints,
-                                             0.0);
-      std::vector<double> gradRhoValuesFlattened(totalLocallyOwnedCells *
-                                                   numQuadPoints * 3,
-                                                 0.0);
-      std::vector<double> rhoValuesSpinPolarizedFlattened(
-        totalLocallyOwnedCells * numQuadPoints * 2, 0.0);
-      std::vector<double> gradRhoValuesSpinPolarizedFlattened(
-        totalLocallyOwnedCells * numQuadPoints * 6, 0.0);
-
-      for (unsigned int spinIndex = 0;
-           spinIndex < (1 + dftParams.spinPolarized);
-           ++spinIndex)
-        {
-          for (unsigned int kPoint = 0; kPoint < kPointWeights.size(); ++kPoint)
-            {
-              rhoDevice.setValue(zero);
-              rhoWfcContributionsDevice.setValue(zero);
-              gradRhoDeviceX.setValue(zero);
-              gradRhoDeviceY.setValue(zero);
-              gradRhoDeviceZ.setValue(zero);
-              gradRhoWfcContributionsDeviceX.setValue(zero);
-              gradRhoWfcContributionsDeviceY.setValue(zero);
-              gradRhoWfcContributionsDeviceZ.setValue(zero);
-
-              for (unsigned int jvec = 0; jvec < totalNumWaveFunctions;
-                   jvec += BVec)
-                {
-                  if ((jvec + BVec) <=
-                        bandGroupLowHighPlusOneIndices[2 * bandGroupTaskId +
-                                                       1] &&
-                      (jvec + BVec) >
-                        bandGroupLowHighPlusOneIndices[2 * bandGroupTaskId])
-                    {
-                      if (spectrumSplit)
-                        {
-                          partialOccupVecDevice.setValue(kPointWeights[kPoint] *
-                                                         spinPolarizedFactor);
-                        }
-                      else
-                        {
-                          if (dftParams.constraintMagnetization)
-                            {
-                              const double fermiEnergyConstraintMag =
-                                spinIndex == 0 ? fermiEnergyUp :
-                                                 fermiEnergyDown;
-                              for (unsigned int iEigenVec = 0; iEigenVec < BVec;
-                                   ++iEigenVec)
-                                {
-                                  if (eigenValues[kPoint]
-                                                 [totalNumWaveFunctions *
-                                                    spinIndex +
-                                                  jvec + iEigenVec] >
-                                      fermiEnergyConstraintMag)
-                                    *(partialOccupVec.begin() + iEigenVec) = 0;
-                                  else
-                                    *(partialOccupVec.begin() + iEigenVec) =
-                                      kPointWeights[kPoint] *
-                                      spinPolarizedFactor;
-                                }
-                            }
-                          else
-                            {
-                              for (unsigned int iEigenVec = 0; iEigenVec < BVec;
-                                   ++iEigenVec)
-                                {
-                                  *(partialOccupVec.begin() + iEigenVec) =
-                                    dftUtils::getPartialOccupancy(
-                                      eigenValues[kPoint]
-                                                 [totalNumWaveFunctions *
-                                                    spinIndex +
-                                                  jvec + iEigenVec],
-                                      fermiEnergy,
-                                      C_kb,
-                                      dftParams.TVal) *
-                                    kPointWeights[kPoint] * spinPolarizedFactor;
-                                }
-                            }
-
-                          partialOccupVec
-                            .template copyTo<dftfe::utils::MemorySpace::DEVICE>(
-                              partialOccupVecDevice);
-                        }
-
-                      dftfe::utils::deviceKernelsGeneric::
-                        stridedCopyToBlockConstantStride(
-                          BVec,
-                          totalNumWaveFunctions,
-                          numLocalDofs,
-                          jvec,
-                          X + numLocalDofs * totalNumWaveFunctions *
-                                ((dftParams.spinPolarized + 1) * kPoint +
-                                 spinIndex),
-                          deviceFlattenedArrayBlock.begin());
-
-
-                      deviceFlattenedArrayBlock.updateGhostValues();
-
-                      (operatorMatrix.getOverloadedConstraintMatrix())
-                        ->distribute(deviceFlattenedArrayBlock, BVec);
-
-                      for (int iblock = 0; iblock < (numCellBlocks + 1);
-                           iblock++)
-                        {
-                          const unsigned int currentCellsBlockSize =
-                            (iblock == numCellBlocks) ? remCellBlockSize :
-                                                        cellsBlockSize;
-                          if (currentCellsBlockSize > 0)
-                            {
-                              const unsigned int startingCellId =
-                                iblock * cellsBlockSize;
-
-                              dftfe::utils::deviceKernelsGeneric::
-                                stridedCopyToBlock(
-                                  BVec,
-                                  currentCellsBlockSize * numNodesPerElement,
-                                  deviceFlattenedArrayBlock.begin(),
-                                  cellWaveFunctionMatrix,
-                                  (operatorMatrix
-                                     .getFlattenedArrayCellLocalProcIndexIdMap())
-                                      .begin() +
-                                    startingCellId * numNodesPerElement);
-
-                              NumberType scalarCoeffAlpha = 1.0;
-                              NumberType scalarCoeffBeta  = 0;
-                              int        strideA = BVec * numNodesPerElement;
-                              int        strideB = 0;
-                              int        strideC = BVec * numQuadPoints;
-
-                              dftfe::utils::deviceBlasWrapper::
-                                gemmStridedBatched(
-                                  operatorMatrix.getDeviceBlasHandle(),
-                                  dftfe::utils::DEVICEBLAS_OP_N,
-                                  dftfe::utils::DEVICEBLAS_OP_N,
-                                  BVec,
-                                  numQuadPoints,
-                                  numNodesPerElement,
-                                  &scalarCoeffAlpha,
-                                  cellWaveFunctionMatrix,
-                                  BVec,
-                                  strideA,
-                                  shapeFunctionValuesTransposedDevice.begin(),
-                                  numNodesPerElement,
-                                  strideB,
-                                  &scalarCoeffBeta,
-                                  rhoWfcContributionsDevice.begin(),
-                                  BVec,
-                                  strideC,
-                                  currentCellsBlockSize);
-
-
-                              if (isEvaluateGradRho)
-                                {
-                                  strideB = numNodesPerElement * numQuadPoints;
-
-
-                                  dftfe::utils::deviceKernelsGeneric::
-                                    copyValueType1ArrToValueType2Arr(
-                                      currentCellsBlockSize *
-                                        numNodesPerElement * numQuadPoints,
-                                      (operatorMatrix
-                                         .getShapeFunctionGradientValuesXTransposed())
-                                          .begin() +
-                                        startingCellId * numNodesPerElement *
-                                          numQuadPoints,
-                                      shapeFunctionGradientValuesXTransposedDevice
-                                        .begin());
-
-                                  dftfe::utils::deviceKernelsGeneric::
-                                    copyValueType1ArrToValueType2Arr(
-                                      currentCellsBlockSize *
-                                        numNodesPerElement * numQuadPoints,
-                                      (operatorMatrix
-                                         .getShapeFunctionGradientValuesYTransposed())
-                                          .begin() +
-                                        startingCellId * numNodesPerElement *
-                                          numQuadPoints,
-                                      shapeFunctionGradientValuesYTransposedDevice
-                                        .begin());
-
-                                  dftfe::utils::deviceKernelsGeneric::
-                                    copyValueType1ArrToValueType2Arr(
-                                      currentCellsBlockSize *
-                                        numNodesPerElement * numQuadPoints,
-                                      (operatorMatrix
-                                         .getShapeFunctionGradientValuesZTransposed())
-                                          .begin() +
-                                        startingCellId * numNodesPerElement *
-                                          numQuadPoints,
-                                      shapeFunctionGradientValuesZTransposedDevice
-                                        .begin());
-
-                                  dftfe::utils::deviceBlasWrapper::
-                                    gemmStridedBatched(
-                                      operatorMatrix.getDeviceBlasHandle(),
-                                      dftfe::utils::DEVICEBLAS_OP_N,
-                                      dftfe::utils::DEVICEBLAS_OP_N,
-                                      BVec,
-                                      numQuadPoints,
-                                      numNodesPerElement,
-                                      &scalarCoeffAlpha,
-                                      cellWaveFunctionMatrix,
-                                      BVec,
-                                      strideA,
-                                      shapeFunctionGradientValuesXTransposedDevice
-                                        .begin(),
-                                      numNodesPerElement,
-                                      strideB,
-                                      &scalarCoeffBeta,
-                                      gradRhoWfcContributionsDeviceX.begin(),
-                                      BVec,
-                                      strideC,
-                                      currentCellsBlockSize);
-
-
-                                  dftfe::utils::deviceBlasWrapper::
-                                    gemmStridedBatched(
-                                      operatorMatrix.getDeviceBlasHandle(),
-                                      dftfe::utils::DEVICEBLAS_OP_N,
-                                      dftfe::utils::DEVICEBLAS_OP_N,
-                                      BVec,
-                                      numQuadPoints,
-                                      numNodesPerElement,
-                                      &scalarCoeffAlpha,
-                                      cellWaveFunctionMatrix,
-                                      BVec,
-                                      strideA,
-                                      shapeFunctionGradientValuesYTransposedDevice
-                                        .begin(),
-                                      numNodesPerElement,
-                                      strideB,
-                                      &scalarCoeffBeta,
-                                      gradRhoWfcContributionsDeviceY.begin(),
-                                      BVec,
-                                      strideC,
-                                      currentCellsBlockSize);
-
-                                  dftfe::utils::deviceBlasWrapper::
-                                    gemmStridedBatched(
-                                      operatorMatrix.getDeviceBlasHandle(),
-                                      dftfe::utils::DEVICEBLAS_OP_N,
-                                      dftfe::utils::DEVICEBLAS_OP_N,
-                                      BVec,
-                                      numQuadPoints,
-                                      numNodesPerElement,
-                                      &scalarCoeffAlpha,
-                                      cellWaveFunctionMatrix,
-                                      BVec,
-                                      strideA,
-                                      shapeFunctionGradientValuesZTransposedDevice
-                                        .begin(),
-                                      numNodesPerElement,
-                                      strideB,
-                                      &scalarCoeffBeta,
-                                      gradRhoWfcContributionsDeviceZ.begin(),
-                                      BVec,
-                                      strideC,
-                                      currentCellsBlockSize);
-                                }
-
-
-#ifdef DFTFE_WITH_DEVICE_LANG_CUDA
-                              computeRhoGradRhoFromInterpolatedValues<<<
-                                (BVec + (dftfe::utils::DEVICE_BLOCK_SIZE - 1)) /
-                                  dftfe::utils::DEVICE_BLOCK_SIZE *
-                                  numQuadPoints * currentCellsBlockSize,
-                                dftfe::utils::DEVICE_BLOCK_SIZE>>>(
-                                currentCellsBlockSize * numQuadPoints * BVec,
-                                dftfe::utils::makeDataTypeDeviceCompatible(
-                                  rhoWfcContributionsDevice.begin()),
-                                dftfe::utils::makeDataTypeDeviceCompatible(
-                                  gradRhoWfcContributionsDeviceX.begin()),
-                                dftfe::utils::makeDataTypeDeviceCompatible(
-                                  gradRhoWfcContributionsDeviceY.begin()),
-                                dftfe::utils::makeDataTypeDeviceCompatible(
-                                  gradRhoWfcContributionsDeviceZ.begin()),
-                                isEvaluateGradRho);
-#elif DFTFE_WITH_DEVICE_LANG_HIP
-                              hipLaunchKernelGGL(
-                                computeRhoGradRhoFromInterpolatedValues,
-                                (BVec + (dftfe::utils::DEVICE_BLOCK_SIZE - 1)) /
-                                  dftfe::utils::DEVICE_BLOCK_SIZE *
-                                  numQuadPoints * currentCellsBlockSize,
-                                dftfe::utils::DEVICE_BLOCK_SIZE,
-                                0,
-                                0,
-                                currentCellsBlockSize * numQuadPoints * BVec,
-                                dftfe::utils::makeDataTypeDeviceCompatible(
-                                  rhoWfcContributionsDevice.begin()),
-                                dftfe::utils::makeDataTypeDeviceCompatible(
-                                  gradRhoWfcContributionsDeviceX.begin()),
-                                dftfe::utils::makeDataTypeDeviceCompatible(
-                                  gradRhoWfcContributionsDeviceY.begin()),
-                                dftfe::utils::makeDataTypeDeviceCompatible(
-                                  gradRhoWfcContributionsDeviceZ.begin()),
-                                isEvaluateGradRho);
-#endif
-
-                              dftfe::utils::deviceBlasWrapper::gemm(
-                                operatorMatrix.getDeviceBlasHandle(),
-                                dftfe::utils::DEVICEBLAS_OP_N,
-                                dftfe::utils::DEVICEBLAS_OP_N,
-                                1,
-                                currentCellsBlockSize * numQuadPoints,
-                                BVec,
-                                &scalarCoeffAlphaRho,
-                                partialOccupVecDevice.begin(),
-                                1,
-                                rhoWfcContributionsDevice.begin(),
-                                BVec,
-                                &scalarCoeffBetaRho,
-                                rhoDevice.begin() +
-                                  startingCellId * numQuadPoints,
-                                1);
-
-
-                              if (isEvaluateGradRho)
-                                {
-                                  dftfe::utils::deviceBlasWrapper::gemm(
-                                    operatorMatrix.getDeviceBlasHandle(),
-                                    dftfe::utils::DEVICEBLAS_OP_N,
-                                    dftfe::utils::DEVICEBLAS_OP_N,
-                                    1,
-                                    currentCellsBlockSize * numQuadPoints,
-                                    BVec,
-                                    &scalarCoeffAlphaGradRho,
-                                    partialOccupVecDevice.begin(),
-                                    1,
-                                    gradRhoWfcContributionsDeviceX.begin(),
-                                    BVec,
-                                    &scalarCoeffBetaGradRho,
-                                    gradRhoDeviceX.begin() +
-                                      startingCellId * numQuadPoints,
-                                    1);
-
-
-                                  dftfe::utils::deviceBlasWrapper::gemm(
-                                    operatorMatrix.getDeviceBlasHandle(),
-                                    dftfe::utils::DEVICEBLAS_OP_N,
-                                    dftfe::utils::DEVICEBLAS_OP_N,
-                                    1,
-                                    currentCellsBlockSize * numQuadPoints,
-                                    BVec,
-                                    &scalarCoeffAlphaGradRho,
-                                    partialOccupVecDevice.begin(),
-                                    1,
-                                    gradRhoWfcContributionsDeviceY.begin(),
-                                    BVec,
-                                    &scalarCoeffBetaGradRho,
-                                    gradRhoDeviceY.begin() +
-                                      startingCellId * numQuadPoints,
-                                    1);
-
-                                  dftfe::utils::deviceBlasWrapper::gemm(
-                                    operatorMatrix.getDeviceBlasHandle(),
-                                    dftfe::utils::DEVICEBLAS_OP_N,
-                                    dftfe::utils::DEVICEBLAS_OP_N,
-                                    1,
-                                    currentCellsBlockSize * numQuadPoints,
-                                    BVec,
-                                    &scalarCoeffAlphaGradRho,
-                                    partialOccupVecDevice.begin(),
-                                    1,
-                                    gradRhoWfcContributionsDeviceZ.begin(),
-                                    BVec,
-                                    &scalarCoeffBetaGradRho,
-                                    gradRhoDeviceZ.begin() +
-                                      startingCellId * numQuadPoints,
-                                    1);
-                                }
-                            } // non-trivial cell block check
-                        }     // cells block loop
-                    }         // band parallelizatoin check
-                }             // wave function block loop
-
-              if (spectrumSplit)
-                for (unsigned int jvec = 0; jvec < Nfr; jvec += BVec)
-                  if ((jvec + totalNumWaveFunctions - Nfr + BVec) <=
-                        bandGroupLowHighPlusOneIndices[2 * bandGroupTaskId +
-                                                       1] &&
-                      (jvec + totalNumWaveFunctions - Nfr + BVec) >
-                        bandGroupLowHighPlusOneIndices[2 * bandGroupTaskId])
-                    {
-                      if (dftParams.constraintMagnetization)
-                        {
-                          const double fermiEnergyConstraintMag =
-                            spinIndex == 0 ? fermiEnergyUp : fermiEnergyDown;
-                          for (unsigned int iEigenVec = 0; iEigenVec < BVec;
-                               ++iEigenVec)
-                            {
-                              if (eigenValues[kPoint]
-                                             [totalNumWaveFunctions *
-                                                spinIndex +
-                                              (totalNumWaveFunctions - Nfr) +
-                                              jvec + iEigenVec] >
-                                  fermiEnergyConstraintMag)
-                                *(partialOccupVec.begin() + iEigenVec) =
-                                  -kPointWeights[kPoint] * spinPolarizedFactor;
-                              else
-                                *(partialOccupVec.begin() + iEigenVec) = 0;
-                            }
-                        }
-                      else
-                        {
-                          for (unsigned int iEigenVec = 0; iEigenVec < BVec;
-                               ++iEigenVec)
-                            {
-                              *(partialOccupVec.begin() + iEigenVec) =
-                                (dftUtils::getPartialOccupancy(
-                                   eigenValues[kPoint]
-                                              [totalNumWaveFunctions *
-                                                 spinIndex +
-                                               (totalNumWaveFunctions - Nfr) +
-                                               jvec + iEigenVec],
-                                   fermiEnergy,
-                                   C_kb,
-                                   dftParams.TVal) -
-                                 1.0) *
-                                kPointWeights[kPoint] * spinPolarizedFactor;
-                            }
-                        }
-
-                      partialOccupVec
-                        .template copyTo<dftfe::utils::MemorySpace::DEVICE>(
-                          partialOccupVecDevice);
-
-
-                      dftfe::utils::deviceKernelsGeneric::
-                        stridedCopyToBlockConstantStride(
-                          BVec,
-                          Nfr,
-                          numLocalDofs,
-                          jvec,
-                          XFrac + numLocalDofs * Nfr *
-                                    ((dftParams.spinPolarized + 1) * kPoint +
-                                     spinIndex),
-                          deviceFlattenedArrayBlock.begin());
-
-                      deviceFlattenedArrayBlock.updateGhostValues();
-
-                      (operatorMatrix.getOverloadedConstraintMatrix())
-                        ->distribute(deviceFlattenedArrayBlock, BVec);
-
-                      for (int iblock = 0; iblock < (numCellBlocks + 1);
-                           iblock++)
-                        {
-                          const unsigned int currentCellsBlockSize =
-                            (iblock == numCellBlocks) ? remCellBlockSize :
-                                                        cellsBlockSize;
-                          if (currentCellsBlockSize > 0)
-                            {
-                              const unsigned int startingCellId =
-                                iblock * cellsBlockSize;
-
-                              dftfe::utils::deviceKernelsGeneric::
-                                stridedCopyToBlock(
-                                  BVec,
-                                  currentCellsBlockSize * numNodesPerElement,
-                                  deviceFlattenedArrayBlock.begin(),
-                                  cellWaveFunctionMatrix,
-                                  (operatorMatrix
-                                     .getFlattenedArrayCellLocalProcIndexIdMap())
-                                      .begin() +
-                                    startingCellId * numNodesPerElement);
-
-                              NumberType scalarCoeffAlpha = 1.0;
-                              NumberType scalarCoeffBeta  = 0;
-                              int        strideA = BVec * numNodesPerElement;
-                              int        strideB = 0;
-                              int        strideC = BVec * numQuadPoints;
-
-
-                              dftfe::utils::deviceBlasWrapper::
-                                gemmStridedBatched(
-                                  operatorMatrix.getDeviceBlasHandle(),
-                                  dftfe::utils::DEVICEBLAS_OP_N,
-                                  dftfe::utils::DEVICEBLAS_OP_N,
-                                  BVec,
-                                  numQuadPoints,
-                                  numNodesPerElement,
-                                  &scalarCoeffAlpha,
-                                  cellWaveFunctionMatrix,
-                                  BVec,
-                                  strideA,
-                                  shapeFunctionValuesTransposedDevice.begin(),
-                                  numNodesPerElement,
-                                  strideB,
-                                  &scalarCoeffBeta,
-                                  rhoWfcContributionsDevice.begin(),
-                                  BVec,
-                                  strideC,
-                                  currentCellsBlockSize);
-
-
-
-                              if (isEvaluateGradRho)
-                                {
-                                  strideB = numNodesPerElement * numQuadPoints;
-
-                                  dftfe::utils::deviceKernelsGeneric::
-                                    copyValueType1ArrToValueType2Arr(
-                                      currentCellsBlockSize *
-                                        numNodesPerElement * numQuadPoints,
-                                      (operatorMatrix
-                                         .getShapeFunctionGradientValuesXTransposed())
-                                          .begin() +
-                                        startingCellId * numNodesPerElement *
-                                          numQuadPoints,
-                                      shapeFunctionGradientValuesXTransposedDevice
-                                        .begin());
-
-                                  dftfe::utils::deviceKernelsGeneric::
-                                    copyValueType1ArrToValueType2Arr(
-                                      currentCellsBlockSize *
-                                        numNodesPerElement * numQuadPoints,
-                                      (operatorMatrix
-                                         .getShapeFunctionGradientValuesYTransposed())
-                                          .begin() +
-                                        startingCellId * numNodesPerElement *
-                                          numQuadPoints,
-                                      shapeFunctionGradientValuesYTransposedDevice
-                                        .begin());
-
-                                  dftfe::utils::deviceKernelsGeneric::
-                                    copyValueType1ArrToValueType2Arr(
-                                      currentCellsBlockSize *
-                                        numNodesPerElement * numQuadPoints,
-                                      (operatorMatrix
-                                         .getShapeFunctionGradientValuesZTransposed())
-                                          .begin() +
-                                        startingCellId * numNodesPerElement *
-                                          numQuadPoints,
-                                      shapeFunctionGradientValuesZTransposedDevice
-                                        .begin());
-
-                                  dftfe::utils::deviceBlasWrapper::
-                                    gemmStridedBatched(
-                                      operatorMatrix.getDeviceBlasHandle(),
-                                      dftfe::utils::DEVICEBLAS_OP_N,
-                                      dftfe::utils::DEVICEBLAS_OP_N,
-                                      BVec,
-                                      numQuadPoints,
-                                      numNodesPerElement,
-                                      &scalarCoeffAlpha,
-                                      cellWaveFunctionMatrix,
-                                      BVec,
-                                      strideA,
-                                      shapeFunctionGradientValuesXTransposedDevice
-                                        .begin(),
-                                      numNodesPerElement,
-                                      strideB,
-                                      &scalarCoeffBeta,
-                                      gradRhoWfcContributionsDeviceX.begin(),
-                                      BVec,
-                                      strideC,
-                                      currentCellsBlockSize);
-
-
-                                  dftfe::utils::deviceBlasWrapper::
-                                    gemmStridedBatched(
-                                      operatorMatrix.getDeviceBlasHandle(),
-                                      dftfe::utils::DEVICEBLAS_OP_N,
-                                      dftfe::utils::DEVICEBLAS_OP_N,
-                                      BVec,
-                                      numQuadPoints,
-                                      numNodesPerElement,
-                                      &scalarCoeffAlpha,
-                                      cellWaveFunctionMatrix,
-                                      BVec,
-                                      strideA,
-                                      shapeFunctionGradientValuesYTransposedDevice
-                                        .begin(),
-                                      numNodesPerElement,
-                                      strideB,
-                                      &scalarCoeffBeta,
-                                      gradRhoWfcContributionsDeviceY.begin(),
-                                      BVec,
-                                      strideC,
-                                      currentCellsBlockSize);
-
-                                  dftfe::utils::deviceBlasWrapper::
-                                    gemmStridedBatched(
-                                      operatorMatrix.getDeviceBlasHandle(),
-                                      dftfe::utils::DEVICEBLAS_OP_N,
-                                      dftfe::utils::DEVICEBLAS_OP_N,
-                                      BVec,
-                                      numQuadPoints,
-                                      numNodesPerElement,
-                                      &scalarCoeffAlpha,
-                                      cellWaveFunctionMatrix,
-                                      BVec,
-                                      strideA,
-                                      shapeFunctionGradientValuesZTransposedDevice
-                                        .begin(),
-                                      numNodesPerElement,
-                                      strideB,
-                                      &scalarCoeffBeta,
-                                      gradRhoWfcContributionsDeviceZ.begin(),
-                                      BVec,
-                                      strideC,
-                                      currentCellsBlockSize);
-                                }
-
-
-#ifdef DFTFE_WITH_DEVICE_LANG_CUDA
-                              computeRhoGradRhoFromInterpolatedValues<<<
-                                (BVec + (dftfe::utils::DEVICE_BLOCK_SIZE - 1)) /
-                                  dftfe::utils::DEVICE_BLOCK_SIZE *
-                                  numQuadPoints * currentCellsBlockSize,
-                                dftfe::utils::DEVICE_BLOCK_SIZE>>>(
-                                currentCellsBlockSize * numQuadPoints * BVec,
-                                dftfe::utils::makeDataTypeDeviceCompatible(
-                                  rhoWfcContributionsDevice.begin()),
-                                dftfe::utils::makeDataTypeDeviceCompatible(
-                                  gradRhoWfcContributionsDeviceX.begin()),
-                                dftfe::utils::makeDataTypeDeviceCompatible(
-                                  gradRhoWfcContributionsDeviceY.begin()),
-                                dftfe::utils::makeDataTypeDeviceCompatible(
-                                  gradRhoWfcContributionsDeviceZ.begin()),
-                                isEvaluateGradRho);
-#elif DFTFE_WITH_DEVICE_LANG_HIP
-                              hipLaunchKernelGGL(
-                                computeRhoGradRhoFromInterpolatedValues,
-                                (BVec + (dftfe::utils::DEVICE_BLOCK_SIZE - 1)) /
-                                  dftfe::utils::DEVICE_BLOCK_SIZE *
-                                  numQuadPoints * currentCellsBlockSize,
-                                dftfe::utils::DEVICE_BLOCK_SIZE,
-                                0,
-                                0,
-                                currentCellsBlockSize * numQuadPoints * BVec,
-                                dftfe::utils::makeDataTypeDeviceCompatible(
-                                  rhoWfcContributionsDevice.begin()),
-                                dftfe::utils::makeDataTypeDeviceCompatible(
-                                  gradRhoWfcContributionsDeviceX.begin()),
-                                dftfe::utils::makeDataTypeDeviceCompatible(
-                                  gradRhoWfcContributionsDeviceY.begin()),
-                                dftfe::utils::makeDataTypeDeviceCompatible(
-                                  gradRhoWfcContributionsDeviceZ.begin()),
-                                isEvaluateGradRho);
-#endif
-
-                              dftfe::utils::deviceBlasWrapper::gemm(
-                                operatorMatrix.getDeviceBlasHandle(),
-                                dftfe::utils::DEVICEBLAS_OP_N,
-                                dftfe::utils::DEVICEBLAS_OP_N,
-                                1,
-                                currentCellsBlockSize * numQuadPoints,
-                                BVec,
-                                &scalarCoeffAlphaRho,
-                                partialOccupVecDevice.begin(),
-                                1,
-                                rhoWfcContributionsDevice.begin(),
-                                BVec,
-                                &scalarCoeffBetaRho,
-                                rhoDevice.begin() +
-                                  startingCellId * numQuadPoints,
-                                1);
-
-
-                              if (isEvaluateGradRho)
-                                {
-                                  dftfe::utils::deviceBlasWrapper::gemm(
-                                    operatorMatrix.getDeviceBlasHandle(),
-                                    dftfe::utils::DEVICEBLAS_OP_N,
-                                    dftfe::utils::DEVICEBLAS_OP_N,
-                                    1,
-                                    currentCellsBlockSize * numQuadPoints,
-                                    BVec,
-                                    &scalarCoeffAlphaGradRho,
-                                    partialOccupVecDevice.begin(),
-                                    1,
-                                    gradRhoWfcContributionsDeviceX.begin(),
-                                    BVec,
-                                    &scalarCoeffBetaGradRho,
-                                    gradRhoDeviceX.begin() +
-                                      startingCellId * numQuadPoints,
-                                    1);
-
-
-                                  dftfe::utils::deviceBlasWrapper::gemm(
-                                    operatorMatrix.getDeviceBlasHandle(),
-                                    dftfe::utils::DEVICEBLAS_OP_N,
-                                    dftfe::utils::DEVICEBLAS_OP_N,
-                                    1,
-                                    currentCellsBlockSize * numQuadPoints,
-                                    BVec,
-                                    &scalarCoeffAlphaGradRho,
-                                    partialOccupVecDevice.begin(),
-                                    1,
-                                    gradRhoWfcContributionsDeviceY.begin(),
-                                    BVec,
-                                    &scalarCoeffBetaGradRho,
-                                    gradRhoDeviceY.begin() +
-                                      startingCellId * numQuadPoints,
-                                    1);
-
-                                  dftfe::utils::deviceBlasWrapper::gemm(
-                                    operatorMatrix.getDeviceBlasHandle(),
-                                    dftfe::utils::DEVICEBLAS_OP_N,
-                                    dftfe::utils::DEVICEBLAS_OP_N,
-                                    1,
-                                    currentCellsBlockSize * numQuadPoints,
-                                    BVec,
-                                    &scalarCoeffAlphaGradRho,
-                                    partialOccupVecDevice.begin(),
-                                    1,
-                                    gradRhoWfcContributionsDeviceZ.begin(),
-                                    BVec,
-                                    &scalarCoeffBetaGradRho,
-                                    gradRhoDeviceZ.begin() +
-                                      startingCellId * numQuadPoints,
-                                    1);
-                                }
-                            } // non-tivial cells block
-                        }     // cells block loop
-                    }         // spectrum split block
-
-
-              // do memcopy to host
-              rhoDevice.template copyTo<dftfe::utils::MemorySpace::HOST>(
-                rhoHost.begin(), totalLocallyOwnedCells * numQuadPoints, 0, 0);
-
-              if (isEvaluateGradRho)
-                {
-                  gradRhoDeviceX
-                    .template copyTo<dftfe::utils::MemorySpace::HOST>(
-                      gradRhoHostX.begin(),
-                      totalLocallyOwnedCells * numQuadPoints,
-                      0,
-                      0);
-
-                  gradRhoDeviceY
-                    .template copyTo<dftfe::utils::MemorySpace::HOST>(
-                      gradRhoHostY.begin(),
-                      totalLocallyOwnedCells * numQuadPoints,
-                      0,
-                      0);
-
-                  gradRhoDeviceZ
-                    .template copyTo<dftfe::utils::MemorySpace::HOST>(
-                      gradRhoHostZ.begin(),
-                      totalLocallyOwnedCells * numQuadPoints,
-                      0,
-                      0);
-                }
-
-              for (int icell = 0; icell < totalLocallyOwnedCells; icell++)
-                for (unsigned int iquad = 0; iquad < numQuadPoints; ++iquad)
-                  {
-                    rhoValuesFlattened[icell * numQuadPoints + iquad] +=
-                      dftfe::utils::realPart(
-                        *(rhoHost.begin() + icell * numQuadPoints + iquad));
-                  }
-
-              if (isEvaluateGradRho)
-                for (int icell = 0; icell < totalLocallyOwnedCells; icell++)
-                  for (unsigned int iquad = 0; iquad < numQuadPoints; ++iquad)
-                    {
-                      gradRhoValuesFlattened[icell * numQuadPoints * 3 +
-                                             3 * iquad + 0] +=
-                        dftfe::utils::realPart(*(gradRhoHostX.begin() +
-                                                 icell * numQuadPoints +
-                                                 iquad));
-                      gradRhoValuesFlattened[icell * numQuadPoints * 3 +
-                                             3 * iquad + 1] +=
-                        dftfe::utils::realPart(*(gradRhoHostY.begin() +
-                                                 icell * numQuadPoints +
-                                                 iquad));
-                      gradRhoValuesFlattened[icell * numQuadPoints * 3 +
-                                             3 * iquad + 2] +=
-                        dftfe::utils::realPart(*(gradRhoHostZ.begin() +
-                                                 icell * numQuadPoints +
-                                                 iquad));
-                    }
-              if (dftParams.spinPolarized == 1)
-                {
-                  for (int icell = 0; icell < totalLocallyOwnedCells; icell++)
-                    for (unsigned int iquad = 0; iquad < numQuadPoints; ++iquad)
-                      {
-                        rhoValuesSpinPolarizedFlattened
-                          [icell * numQuadPoints * 2 + iquad * 2 + spinIndex] +=
-                          dftfe::utils::realPart(
-                            *(rhoHost.begin() + icell * numQuadPoints + iquad));
-                      }
-
-                  if (isEvaluateGradRho)
-                    for (int icell = 0; icell < totalLocallyOwnedCells; icell++)
-                      for (unsigned int iquad = 0; iquad < numQuadPoints;
-                           ++iquad)
-                        {
-                          gradRhoValuesSpinPolarizedFlattened
-                            [icell * numQuadPoints * 6 + iquad * 6 +
-                             spinIndex * 3] +=
-                            dftfe::utils::realPart(*(gradRhoHostX.begin() +
-                                                     icell * numQuadPoints +
-                                                     iquad));
-                          gradRhoValuesSpinPolarizedFlattened
-                            [icell * numQuadPoints * 6 + iquad * 6 +
-                             spinIndex * 3 + 1] +=
-                            dftfe::utils::realPart(*(gradRhoHostY.begin() +
-                                                     icell * numQuadPoints +
-                                                     iquad));
-                          gradRhoValuesSpinPolarizedFlattened
-                            [icell * numQuadPoints * 6 + iquad * 6 +
-                             spinIndex * 3 + 2] +=
-                            dftfe::utils::realPart(*(gradRhoHostZ.begin() +
-                                                     icell * numQuadPoints +
-                                                     iquad));
-                        }
-                }
-            } // kpoint loop
-        }     // spin index
-
-
-      // gather density from all inter communicators
-      if (dealii::Utilities::MPI::n_mpi_processes(interpoolcomm) > 1)
-        {
-          dealii::Utilities::MPI::sum(rhoValuesFlattened,
-                                      interpoolcomm,
-                                      rhoValuesFlattened);
-
-          if (isEvaluateGradRho)
-            dealii::Utilities::MPI::sum(gradRhoValuesFlattened,
-                                        interpoolcomm,
-                                        gradRhoValuesFlattened);
-
-
-
-          if (dftParams.spinPolarized == 1)
-            {
-              dealii::Utilities::MPI::sum(rhoValuesSpinPolarizedFlattened,
-                                          interpoolcomm,
-                                          rhoValuesSpinPolarizedFlattened);
-
-              if (isEvaluateGradRho)
-                dealii::Utilities::MPI::sum(
-                  gradRhoValuesSpinPolarizedFlattened,
-                  interpoolcomm,
-                  gradRhoValuesSpinPolarizedFlattened);
-            }
-        }
-
-      if (dealii::Utilities::MPI::n_mpi_processes(interBandGroupComm) > 1)
-        {
-          dealii::Utilities::MPI::sum(rhoValuesFlattened,
-                                      interBandGroupComm,
-                                      rhoValuesFlattened);
-
-          if (isEvaluateGradRho)
-            dealii::Utilities::MPI::sum(gradRhoValuesFlattened,
-                                        interBandGroupComm,
-                                        gradRhoValuesFlattened);
-
-
-          if (dftParams.spinPolarized == 1)
-            {
-              dealii::Utilities::MPI::sum(rhoValuesSpinPolarizedFlattened,
-                                          interBandGroupComm,
-                                          rhoValuesSpinPolarizedFlattened);
-
-              if (isEvaluateGradRho)
-                dealii::Utilities::MPI::sum(
-                  gradRhoValuesSpinPolarizedFlattened,
-                  interBandGroupComm,
-                  gradRhoValuesSpinPolarizedFlattened);
-            }
-        }
-
-
-      unsigned int iElem = 0;
-      cell               = dofHandler.begin_active();
-      endc               = dofHandler.end();
-      for (; cell != endc; ++cell)
-        if (cell->is_locally_owned())
-          {
-            const dealii::CellId cellid = cell->id();
-
-            std::vector<double>  dummy(1);
-            std::vector<double> &tempRhoQuads = (*rhoValues)[cellid];
-            std::vector<double> &tempGradRhoQuads =
-              isEvaluateGradRho ? (*gradRhoValues)[cellid] : dummy;
-
-            std::vector<double> &tempRhoQuadsSP =
-              (dftParams.spinPolarized == 1) ?
-                (*rhoValuesSpinPolarized)[cellid] :
-                dummy;
-            std::vector<double> &tempGradRhoQuadsSP =
-              ((dftParams.spinPolarized == 1) && isEvaluateGradRho) ?
-                (*gradRhoValuesSpinPolarized)[cellid] :
-                dummy;
-
-            if (dftParams.spinPolarized == 1)
-              {
-                for (unsigned int q = 0; q < numQuadPoints; ++q)
-                  {
-                    tempRhoQuadsSP[2 * q + 0] =
-                      rhoValuesSpinPolarizedFlattened[iElem * numQuadPoints *
-                                                        2 +
-                                                      q * 2 + 0];
-
-                    tempRhoQuadsSP[2 * q + 1] =
-                      rhoValuesSpinPolarizedFlattened[iElem * numQuadPoints *
-                                                        2 +
-                                                      q * 2 + 1];
-                  }
-
-                if (isEvaluateGradRho)
-                  for (unsigned int q = 0; q < numQuadPoints; ++q)
-                    {
-                      tempGradRhoQuadsSP[6 * q + 0] =
-                        gradRhoValuesSpinPolarizedFlattened
-                          [iElem * numQuadPoints * 6 + 6 * q];
-                      tempGradRhoQuadsSP[6 * q + 1] =
-                        gradRhoValuesSpinPolarizedFlattened
-                          [iElem * numQuadPoints * 6 + 6 * q + 1];
-                      tempGradRhoQuadsSP[6 * q + 2] =
-                        gradRhoValuesSpinPolarizedFlattened
-                          [iElem * numQuadPoints * 6 + 6 * q + 2];
-                      tempGradRhoQuadsSP[6 * q + 3] =
-                        gradRhoValuesSpinPolarizedFlattened
-                          [iElem * numQuadPoints * 6 + 6 * q + 3];
-                      tempGradRhoQuadsSP[6 * q + 4] =
-                        gradRhoValuesSpinPolarizedFlattened
-                          [iElem * numQuadPoints * 6 + 6 * q + 4];
-                      tempGradRhoQuadsSP[6 * q + 5] =
-                        gradRhoValuesSpinPolarizedFlattened
-                          [iElem * numQuadPoints * 6 + 6 * q + 5];
-                    }
-              }
-
-            for (unsigned int q = 0; q < numQuadPoints; ++q)
-              tempRhoQuads[q] = rhoValuesFlattened[iElem * numQuadPoints + q];
-
-
-            if (isEvaluateGradRho)
-              for (unsigned int q = 0; q < numQuadPoints; ++q)
-                {
-                  tempGradRhoQuads[3 * q] =
-                    gradRhoValuesFlattened[iElem * numQuadPoints * 3 + q * 3];
-                  tempGradRhoQuads[3 * q + 1] =
-                    gradRhoValuesFlattened[iElem * numQuadPoints * 3 + q * 3 +
-                                           1];
-                  tempGradRhoQuads[3 * q + 2] =
-                    gradRhoValuesFlattened[iElem * numQuadPoints * 3 + q * 3 +
-                                           2];
-                }
-            iElem++;
-          }
-
-      dftfe::utils::deviceSynchronize();
-      MPI_Barrier(mpiCommParent);
-      device_time = MPI_Wtime() - device_time;
-
-      if (this_process == 0 && dftParams.verbosity >= 2)
-        std::cout << "Time for compute rho on Device: " << device_time
-                  << std::endl;
-    }
-
-    template void
-    computeRhoFromPSI(
-      const dataTypes::number *                      X,
-      const dataTypes::number *                      XFrac,
-      const unsigned int                             totalNumWaveFunctions,
-      const unsigned int                             Nfr,
-      const unsigned int                             numLocalDofs,
-      const std::vector<std::vector<double>> &       eigenValues,
-      const double                                   fermiEnergy,
-      const double                                   fermiEnergyUp,
-      const double                                   fermiEnergyDown,
-      operatorDFTDeviceClass &                       operatorMatrix,
-      const unsigned int                             matrixFreeDofhandlerIndex,
-      const dealii::DoFHandler<3> &                  dofHandler,
-      const unsigned int                             totalLocallyOwnedCells,
-      const unsigned int                             numNodesPerElement,
-      const unsigned int                             numQuadPoints,
-      const std::vector<double> &                    kPointWeights,
-      std::map<dealii::CellId, std::vector<double>> *rhoValues,
-      std::map<dealii::CellId, std::vector<double>> *gradRhoValues,
-      std::map<dealii::CellId, std::vector<double>> *rhoValuesSpinPolarized,
-      std::map<dealii::CellId, std::vector<double>> *gradRhoValuesSpinPolarized,
-      const bool                                     isEvaluateGradRho,
-      const MPI_Comm &                               mpiCommParent,
-      const MPI_Comm &                               interpoolcomm,
-      const MPI_Comm &                               interBandGroupComm,
-      const dftParameters &                          dftParams,
-      const bool                                     spectrumSplit,
-      const bool                                     use2pPlusOneGLQuad);
-  } // namespace Device
-} // namespace dftfe
diff --git a/src/dft/densityCalculatorDeviceKernels.cc b/src/dft/densityCalculatorDeviceKernels.cc
new file mode 100644
index 000000000..8b8ac25c1
--- /dev/null
+++ b/src/dft/densityCalculatorDeviceKernels.cc
@@ -0,0 +1,247 @@
+// ---------------------------------------------------------------------
+//
+// Copyright (c) 2017-2022 The Regents of the University of Michigan and DFT-FE
+// authors.
+//
+// This file is part of the DFT-FE code.
+//
+// The DFT-FE code is free software; you can use it, redistribute
+// it, and/or modify it under the terms of the GNU Lesser General
+// Public License as published by the Free Software Foundation; either
+// version 2.1 of the License, or (at your option) any later version.
+// The full text of the license can be found in the file LICENSE at
+// the top level of the DFT-FE distribution.
+//
+// ---------------------------------------------------------------------
+//
+// @author Sambit Das
+//
+
+// source file for the device (CUDA/HIP) kernels used in electron density computations
+#include <constants.h>
+#include <densityCalculator.h>
+#include <dftUtils.h>
+#include <DataTypeOverloads.h>
+#include <DeviceAPICalls.h>
+#include <DeviceDataTypeOverloads.h>
+#include <DeviceTypeConfig.h>
+#include <DeviceKernelLauncherConstants.h>
+
+namespace dftfe
+{
+  namespace
+  {
+    __global__ void // real-valued input: per-entry rho and grad-rho contributions
+    computeRhoGradRhoFromInterpolatedValues(
+      const unsigned int numVectors,
+      const unsigned int numCells,
+      const unsigned int nQuadsPerCell,
+      double *           wfcContributions, // in: psi, one value per entry
+      double *           gradwfcContributions, // in: grad psi, 3 slabs per cell
+      double *           rhoCellsWfcContributions, // out: psi * psi per entry
+      double *           gradRhoCellsWfcContributions, // out: 2 * psi * grad psi
+      const bool         isEvaluateGradRho) // false: gradient arrays are not touched
+    {
+      const unsigned int globalThreadId = blockIdx.x * blockDim.x + threadIdx.x;
+      const unsigned int numEntriesPerCell = numVectors * nQuadsPerCell;
+      const unsigned int numberEntries     = numEntriesPerCell * numCells;
+      // Each thread starts at its global id and advances by blockDim.x * gridDim.x.
+      for (unsigned int index = globalThreadId; index < numberEntries;
+           index += blockDim.x * gridDim.x)
+        {
+          const double psi                = wfcContributions[index];
+          rhoCellsWfcContributions[index] = psi * psi;
+          // Gradient part: split index into (iCell, iQuad, iVec), iVec varying fastest.
+          if (isEvaluateGradRho)
+            {
+              unsigned int iCell          = index / numEntriesPerCell;
+              unsigned int intraCellIndex = index - iCell * numEntriesPerCell;
+              unsigned int iQuad          = intraCellIndex / numVectors;
+              unsigned int iVec           = intraCellIndex - iQuad * numVectors;
+              const double gradPsiX = // input: first of the cell's three slabs
+                gradwfcContributions[intraCellIndex +
+                                     numEntriesPerCell * 3 * iCell];
+              gradRhoCellsWfcContributions[iVec + 3 * iQuad * numVectors +
+                                           numEntriesPerCell * 3 * iCell] =
+                2.0 * psi * gradPsiX;
+              // Output keeps a quad point's 3 components as adjacent runs of numVectors.
+              const double gradPsiY =
+                gradwfcContributions[intraCellIndex + numEntriesPerCell +
+                                     numEntriesPerCell * 3 * iCell];
+              gradRhoCellsWfcContributions[iVec + numVectors +
+                                           3 * iQuad * numVectors +
+                                           numEntriesPerCell * 3 * iCell] =
+                2.0 * psi * gradPsiY;
+              // Third component: input slab offset 2 * numEntriesPerCell.
+              const double gradPsiZ =
+                gradwfcContributions[intraCellIndex + 2 * numEntriesPerCell +
+                                     numEntriesPerCell * 3 * iCell];
+              gradRhoCellsWfcContributions[iVec + 2 * numVectors +
+                                           3 * iQuad * numVectors +
+                                           numEntriesPerCell * 3 * iCell] =
+                2.0 * psi * gradPsiZ;
+            }
+        }
+    }
+
+    __global__ void // complex-valued input: per-entry rho and grad-rho contributions
+    computeRhoGradRhoFromInterpolatedValues(
+      const unsigned int                 numVectors,
+      const unsigned int                 numCells,
+      const unsigned int                 nQuadsPerCell,
+      dftfe::utils::deviceDoubleComplex *wfcContributions, // in: psi per entry
+      dftfe::utils::deviceDoubleComplex *gradwfcContributions, // in: 3 slabs per cell
+      double *                           rhoCellsWfcContributions, // out, real
+      double *                           gradRhoCellsWfcContributions, // out, real
+      const bool                         isEvaluateGradRho) // false: no gradient work
+    {
+      const unsigned int globalThreadId = blockIdx.x * blockDim.x + threadIdx.x;
+      const unsigned int numEntriesPerCell = numVectors * nQuadsPerCell;
+      const unsigned int numberEntries     = numEntriesPerCell * numCells;
+      // Each thread starts at its global id and advances by blockDim.x * gridDim.x.
+      for (unsigned int index = globalThreadId; index < numberEntries;
+           index += blockDim.x * gridDim.x)
+        {
+          const dftfe::utils::deviceDoubleComplex psi = wfcContributions[index];
+          rhoCellsWfcContributions[index] = psi.x * psi.x + psi.y * psi.y; // presumably |psi|^2 (.x/.y = re/im) - confirm
+          // Gradient part: split index into (iCell, iQuad, iVec), iVec varying fastest.
+          if (isEvaluateGradRho)
+            {
+              unsigned int iCell          = index / numEntriesPerCell;
+              unsigned int intraCellIndex = index - iCell * numEntriesPerCell;
+              unsigned int iQuad          = intraCellIndex / numVectors;
+              unsigned int iVec           = intraCellIndex - iQuad * numVectors;
+              const dftfe::utils::deviceDoubleComplex gradPsiX =
+                gradwfcContributions[intraCellIndex +
+                                     numEntriesPerCell * 3 * iCell];
+              gradRhoCellsWfcContributions[iVec + 3 * iQuad * numVectors +
+                                           numEntriesPerCell * 3 * iCell] =
+                2.0 * (psi.x * gradPsiX.x + psi.y * gradPsiX.y);
+              // Output keeps a quad point's 3 components as adjacent runs of numVectors.
+              const dftfe::utils::deviceDoubleComplex gradPsiY =
+                gradwfcContributions[intraCellIndex + numEntriesPerCell +
+                                     numEntriesPerCell * 3 * iCell];
+              gradRhoCellsWfcContributions[iVec + numVectors +
+                                           3 * iQuad * numVectors +
+                                           numEntriesPerCell * 3 * iCell] =
+                2.0 * (psi.x * gradPsiY.x + psi.y * gradPsiY.y);
+              // Third component: input slab offset 2 * numEntriesPerCell.
+              const dftfe::utils::deviceDoubleComplex gradPsiZ =
+                gradwfcContributions[intraCellIndex + 2 * numEntriesPerCell +
+                                     numEntriesPerCell * 3 * iCell];
+              gradRhoCellsWfcContributions[iVec + 2 * numVectors +
+                                           3 * iQuad * numVectors +
+                                           numEntriesPerCell * 3 * iCell] =
+                2.0 * (psi.x * gradPsiZ.x + psi.y * gradPsiZ.y);
+            }
+        }
+    }
+  } // namespace
+  template <typename NumberType> // host driver: kernel launch, then gemv reduction
+  void
+  computeRhoGradRhoFromInterpolatedValues(
+    std::shared_ptr<
+      dftfe::basis::FEBasisOperations<NumberType,
+                                      double,
+                                      dftfe::utils::MemorySpace::DEVICE>>
+      &                                         basisOperationsPtr,
+    const std::pair<unsigned int, unsigned int> cellRange, // [first, second)
+    const std::pair<unsigned int, unsigned int> vecRange,  // [first, second)
+    double *                                    partialOccupVec, // gemv x vector
+    NumberType *                                wfcQuadPointData,
+    NumberType *                                gradWfcQuadPointData,
+    double *                                    rhoCellsWfcContributions, // scratch
+    double *                                    gradRhoCellsWfcContributions,
+    double *                                    rho,     // accumulated in place
+    double *                                    gradRho, // accumulated in place
+    const bool                                  isEvaluateGradRho)
+  {
+    const unsigned int cellsBlockSize   = cellRange.second - cellRange.first;
+    const unsigned int vectorsBlockSize = vecRange.second - vecRange.first;
+    const unsigned int nQuadsPerCell    = basisOperationsPtr->nQuadsPerCell();
+    // An empty block adds nothing to rho/gradRho, and a grid of zero blocks is
+    // not a valid launch configuration, so leave before touching the device.
+    if (cellsBlockSize == 0 || vectorsBlockSize == 0 || nQuadsPerCell == 0)
+      return;
+    const double one = 1.0; // alpha = beta = 1: gemv adds onto the existing output
+    const unsigned int numBlocks =
+      (vectorsBlockSize + (dftfe::utils::DEVICE_BLOCK_SIZE - 1)) /
+      dftfe::utils::DEVICE_BLOCK_SIZE * nQuadsPerCell * cellsBlockSize;
+#ifdef DFTFE_WITH_DEVICE_LANG_CUDA
+    computeRhoGradRhoFromInterpolatedValues<<<
+      numBlocks,
+      dftfe::utils::DEVICE_BLOCK_SIZE>>>(
+      vectorsBlockSize,
+      cellsBlockSize,
+      nQuadsPerCell,
+      dftfe::utils::makeDataTypeDeviceCompatible(wfcQuadPointData),
+      dftfe::utils::makeDataTypeDeviceCompatible(gradWfcQuadPointData),
+      dftfe::utils::makeDataTypeDeviceCompatible(rhoCellsWfcContributions),
+      dftfe::utils::makeDataTypeDeviceCompatible(gradRhoCellsWfcContributions),
+      isEvaluateGradRho);
+#elif defined(DFTFE_WITH_DEVICE_LANG_HIP)
+    hipLaunchKernelGGL(
+      computeRhoGradRhoFromInterpolatedValues,
+      numBlocks,
+      dftfe::utils::DEVICE_BLOCK_SIZE,
+      0,
+      0,
+      vectorsBlockSize,
+      cellsBlockSize,
+      nQuadsPerCell,
+      dftfe::utils::makeDataTypeDeviceCompatible(wfcQuadPointData),
+      dftfe::utils::makeDataTypeDeviceCompatible(gradWfcQuadPointData),
+      dftfe::utils::makeDataTypeDeviceCompatible(rhoCellsWfcContributions),
+      dftfe::utils::makeDataTypeDeviceCompatible(gradRhoCellsWfcContributions),
+      isEvaluateGradRho);
+#endif
+    dftfe::utils::deviceBlasWrapper::gemv(
+      basisOperationsPtr->getDeviceBLASHandle(),
+      dftfe::utils::DEVICEBLAS_OP_T,
+      vectorsBlockSize,
+      cellsBlockSize * nQuadsPerCell,
+      &one,
+      rhoCellsWfcContributions,
+      vectorsBlockSize,
+      partialOccupVec,
+      1,
+      &one,
+      rho + cellRange.first * nQuadsPerCell,
+      1);
+    // Same reduction for the gradient, with three entries per quadrature point.
+    if (isEvaluateGradRho)
+      {
+        dftfe::utils::deviceBlasWrapper::gemv(
+          basisOperationsPtr->getDeviceBLASHandle(),
+          dftfe::utils::DEVICEBLAS_OP_T,
+          vectorsBlockSize,
+          cellsBlockSize * nQuadsPerCell * 3,
+          &one,
+          gradRhoCellsWfcContributions,
+          vectorsBlockSize,
+          partialOccupVec,
+          1,
+          &one,
+          gradRho + cellRange.first * nQuadsPerCell * 3,
+          1);
+      }
+  }
+  template void // explicit instantiation with NumberType = dataTypes::number
+  computeRhoGradRhoFromInterpolatedValues(
+    std::shared_ptr<
+      dftfe::basis::FEBasisOperations<dataTypes::number,
+                                      double,
+                                      dftfe::utils::MemorySpace::DEVICE>>
+      &                                         basisOperationsPtr,
+    const std::pair<unsigned int, unsigned int> cellRange,
+    const std::pair<unsigned int, unsigned int> vecRange,
+    double *                                    partialOccupVec,
+    dataTypes::number *                         wfcQuadPointData,
+    dataTypes::number *                         gradWfcQuadPointData,
+    double *                                    rhoCellsWfcContributions,
+    double *                                    gradRhoCellsWfcContributions,
+    double *                                    rho,
+    double *                                    gradRho,
+    const bool                                  isEvaluateGradRho);
+
+} // namespace dftfe
diff --git a/src/dft/densityFirstOrderResponseCalculatorCPU.cc b/src/dft/densityFirstOrderResponseCalculatorCPU.cc
index 0c1eef5b0..b0efcd893 100644
--- a/src/dft/densityFirstOrderResponseCalculatorCPU.cc
+++ b/src/dft/densityFirstOrderResponseCalculatorCPU.cc
@@ -31,8 +31,8 @@ namespace dftfe
   template <typename T>
   void
   computeRhoFirstOrderResponseCPU(
-    const std::vector<std::vector<T>> &            X,
-    const std::vector<std::vector<T>> &            XPrime,
+    const T *                                      X,
+    const T *                                      XPrime,
     const std::vector<std::vector<double>> &       densityMatDerFermiEnergy,
     const unsigned int                             totalNumWaveFunctions,
     const unsigned int                             numLocalDofs,
@@ -149,11 +149,13 @@ namespace dftfe
 
         for (unsigned int kPoint = 0; kPoint < kPointWeights.size(); ++kPoint)
           {
-            const std::vector<T> &XCurrentKPoint =
-              X[(dftParams.spinPolarized + 1) * kPoint + spinIndex];
+            const T *XCurrentKPoint =
+              X + ((dftParams.spinPolarized + 1) * kPoint + spinIndex) *
+                    numLocalDofs * totalNumWaveFunctions;
 
-            const std::vector<T> &XPrimeCurrentKPoint =
-              XPrime[(dftParams.spinPolarized + 1) * kPoint + spinIndex];
+            const T *XPrimeCurrentKPoint =
+              XPrime + ((dftParams.spinPolarized + 1) * kPoint + spinIndex) *
+                         numLocalDofs * totalNumWaveFunctions;
 
             const std::vector<double> &densityMatDerFermiEnergyVec =
               densityMatDerFermiEnergy[(dftParams.spinPolarized + 1) * kPoint +
@@ -420,8 +422,8 @@ namespace dftfe
   template <typename T, typename TLowPrec>
   void
   computeRhoFirstOrderResponseCPUMixedPrec(
-    const std::vector<std::vector<T>> &            X,
-    const std::vector<std::vector<T>> &            XPrime,
+    const T *                                      X,
+    const T *                                      XPrime,
     const std::vector<std::vector<double>> &       densityMatDerFermiEnergy,
     const unsigned int                             totalNumWaveFunctions,
     const unsigned int                             numLocalDofs,
@@ -543,11 +545,13 @@ namespace dftfe
 
         for (unsigned int kPoint = 0; kPoint < kPointWeights.size(); ++kPoint)
           {
-            const std::vector<T> &XCurrentKPoint =
-              X[(dftParams.spinPolarized + 1) * kPoint + spinIndex];
+            const T *XCurrentKPoint =
+              X + ((dftParams.spinPolarized + 1) * kPoint + spinIndex) *
+                    numLocalDofs * totalNumWaveFunctions;
 
-            const std::vector<T> &XPrimeCurrentKPoint =
-              XPrime[(dftParams.spinPolarized + 1) * kPoint + spinIndex];
+            const T *XPrimeCurrentKPoint =
+              XPrime + ((dftParams.spinPolarized + 1) * kPoint + spinIndex) *
+                         numLocalDofs * totalNumWaveFunctions;
 
             const std::vector<double> &densityMatDerFermiEnergyVec =
               densityMatDerFermiEnergy[(dftParams.spinPolarized + 1) * kPoint +
@@ -815,12 +819,12 @@ namespace dftfe
 
   template void
   computeRhoFirstOrderResponseCPU(
-    const std::vector<std::vector<dataTypes::number>> &X,
-    const std::vector<std::vector<dataTypes::number>> &XPrime,
-    const std::vector<std::vector<double>> &           densityMatDerFermiEnergy,
-    const unsigned int                                 totalNumWaveFunctions,
-    const unsigned int                                 numLocalDofs,
-    operatorDFTClass &                                 operatorMatrix,
+    const dataTypes::number *                      X,
+    const dataTypes::number *                      XPrime,
+    const std::vector<std::vector<double>> &       densityMatDerFermiEnergy,
+    const unsigned int                             totalNumWaveFunctions,
+    const unsigned int                             numLocalDofs,
+    operatorDFTClass &                             operatorMatrix,
     const unsigned int                             matrixFreeDofhandlerIndex,
     const dealii::DoFHandler<3> &                  dofHandler,
     const unsigned int                             totalLocallyOwnedCells,
@@ -841,12 +845,12 @@ namespace dftfe
   template void
   computeRhoFirstOrderResponseCPUMixedPrec<dataTypes::number,
                                            dataTypes::numberFP32>(
-    const std::vector<std::vector<dataTypes::number>> &X,
-    const std::vector<std::vector<dataTypes::number>> &XPrime,
-    const std::vector<std::vector<double>> &           densityMatDerFermiEnergy,
-    const unsigned int                                 totalNumWaveFunctions,
-    const unsigned int                                 numLocalDofs,
-    operatorDFTClass &                                 operatorMatrix,
+    const dataTypes::number *                      X,
+    const dataTypes::number *                      XPrime,
+    const std::vector<std::vector<double>> &       densityMatDerFermiEnergy,
+    const unsigned int                             totalNumWaveFunctions,
+    const unsigned int                             numLocalDofs,
+    operatorDFTClass &                             operatorMatrix,
     const unsigned int                             matrixFreeDofhandlerIndex,
     const dealii::DoFHandler<3> &                  dofHandler,
     const unsigned int                             totalLocallyOwnedCells,
diff --git a/src/dft/dft.cc b/src/dft/dft.cc
index fd371b8b7..decfb89fd 100644
--- a/src/dft/dft.cc
+++ b/src/dft/dft.cc
@@ -20,7 +20,6 @@
 // Include header files
 #include <chebyshevOrthogonalizedSubspaceIterationSolver.h>
 #include <dealiiLinearSolver.h>
-#include <densityCalculatorCPU.h>
 #include <densityFirstOrderResponseCalculator.h>
 #include <dft.h>
 #include <dftParameters.h>
@@ -65,7 +64,6 @@
 #include <ctime>
 
 #ifdef DFTFE_WITH_DEVICE
-#  include <densityCalculatorDevice.h>
 #  include <linearAlgebraOperationsDevice.h>
 #endif
 
@@ -758,10 +756,6 @@ namespace dftfe
     d_upperBoundUnwantedSpectrumValues.resize(
       (d_dftParamsPtr->spinPolarized + 1) * d_kPointWeights.size(), 0.0);
 
-    d_eigenVectorsFlattenedSTL.resize((1 + d_dftParamsPtr->spinPolarized) *
-                                      d_kPointWeights.size());
-    d_eigenVectorsRotFracDensityFlattenedSTL.resize(
-      (1 + d_dftParamsPtr->spinPolarized) * d_kPointWeights.size());
 
     for (unsigned int kPoint = 0; kPoint < d_kPointWeights.size(); ++kPoint)
       {
@@ -1654,8 +1648,9 @@ namespace dftfe
 
 
             vectorTools::copyFlattenedSTLVecToSingleCompVec(
-              d_eigenVectorsFlattenedSTL[0],
+              d_eigenVectorsFlattenedHost.data(),
               d_numEigenValues,
+              matrix_free_data.get_vector_partitioner()->locally_owned_size(),
               std::make_pair(0, numberWaveFunctionsErrorEstimate),
               eigenVectorsArray);
 
@@ -1842,6 +1837,8 @@ namespace dftfe
         if (initializeCublas)
           {
             kohnShamDFTEigenOperatorDevice.createDeviceBlasHandle();
+            basisOperationsPtrDevice->setDeviceBLASHandle(
+              &(kohnShamDFTEigenOperatorDevice.getDeviceBlasHandle()));
           }
 
         AssertThrow(
@@ -1942,6 +1939,9 @@ namespace dftfe
 
         d_kohnShamDFTOperatorDevicePtr->reinit(
           std::min(d_dftParamsPtr->chebyWfcBlockSize, d_numEigenValues), true);
+
+        basisOperationsPtrDevice->setDeviceBLASHandle(
+          &(d_kohnShamDFTOperatorDevicePtr->getDeviceBlasHandle()));
       }
 #endif
   }
@@ -3552,8 +3552,7 @@ namespace dftfe
       dealii::Utilities::MPI::n_mpi_processes(interBandGroupComm);
 
     const unsigned int localVectorSize =
-      d_eigenVectorsFlattenedSTL[0].size() / d_numEigenValues;
-
+      matrix_free_data.get_vector_partitioner()->locally_owned_size();
     if (numberBandGroups > 1 && !d_dftParamsPtr->useDevice)
       {
         MPI_Barrier(interBandGroupComm);
@@ -3569,13 +3568,17 @@ namespace dftfe
             {
               const unsigned int currentBlockSize =
                 std::min(blockSize, d_numEigenValues * localVectorSize - i);
-              MPI_Allreduce(MPI_IN_PLACE,
-                            &d_eigenVectorsFlattenedSTL[kPoint][0] + i,
-                            currentBlockSize,
-                            dataTypes::mpi_type_id(
-                              &d_eigenVectorsFlattenedSTL[kPoint][0]),
-                            MPI_SUM,
-                            interBandGroupComm);
+              MPI_Allreduce(
+                MPI_IN_PLACE,
+                &d_eigenVectorsFlattenedHost[kPoint * d_numEigenValues *
+                                             localVectorSize] +
+                  i,
+                currentBlockSize,
+                dataTypes::mpi_type_id(
+                  &d_eigenVectorsFlattenedHost[kPoint * d_numEigenValues *
+                                               localVectorSize]),
+                MPI_SUM,
+                interBandGroupComm);
             }
       }
 
@@ -3775,17 +3778,7 @@ namespace dftfe
     if (d_dftParamsPtr->useDevice &&
         (d_dftParamsPtr->writeWfcSolutionFields ||
          d_dftParamsPtr->writeLdosFile || d_dftParamsPtr->writePdosFile))
-      for (unsigned int kPoint = 0;
-           kPoint <
-           (1 + d_dftParamsPtr->spinPolarized) * d_kPointWeights.size();
-           ++kPoint)
-        {
-          d_eigenVectorsFlattenedDevice.copyTo<dftfe::utils::MemorySpace::HOST>(
-            &d_eigenVectorsFlattenedSTL[kPoint][0],
-            d_eigenVectorsFlattenedSTL[kPoint].size(),
-            (kPoint * d_eigenVectorsFlattenedSTL[0].size()),
-            0);
-        }
+      d_eigenVectorsFlattenedDevice.copyTo(d_eigenVectorsFlattenedHost);
 #endif
 
 
@@ -3891,6 +3884,11 @@ namespace dftfe
 #endif
         );
       }
+#ifdef DFTFE_WITH_DEVICE
+    if (d_dftParamsPtr->useDevice)
+      basisOperationsPtrDevice->setDeviceBLASHandle(
+        &(d_kohnShamDFTOperatorDevicePtr->getDeviceBlasHandle()));
+#endif
 
     forcePtr->computeStress(matrix_free_data,
 #ifdef DFTFE_WITH_DEVICE
@@ -4171,20 +4169,26 @@ namespace dftfe
           {
 #ifdef USE_COMPLEX
             vectorTools::copyFlattenedSTLVecToSingleCompVec(
-              d_eigenVectorsFlattenedSTL[k *
-                                           (1 + d_dftParamsPtr->spinPolarized) +
-                                         s],
+              d_eigenVectorsFlattenedHost.data() +
+                (k * (1 + d_dftParamsPtr->spinPolarized) + s) *
+                  d_numEigenValues *
+                  matrix_free_data.get_vector_partitioner()
+                    ->locally_owned_size(),
               d_numEigenValues,
+              matrix_free_data.get_vector_partitioner()->locally_owned_size(),
               std::make_pair(i, i + 1),
               localProc_dof_indicesReal,
               localProc_dof_indicesImag,
               tempVec);
 #else
             vectorTools::copyFlattenedSTLVecToSingleCompVec(
-              d_eigenVectorsFlattenedSTL[k *
-                                           (1 + d_dftParamsPtr->spinPolarized) +
-                                         s],
+              d_eigenVectorsFlattenedHost.data() +
+                (k * (1 + d_dftParamsPtr->spinPolarized) + s) *
+                  d_numEigenValues *
+                  matrix_free_data.get_vector_partitioner()
+                    ->locally_owned_size(),
               d_numEigenValues,
+              matrix_free_data.get_vector_partitioner()->locally_owned_size(),
               std::make_pair(i, i + 1),
               tempVec);
 #endif
diff --git a/src/dft/dos.cc b/src/dft/dos.cc
index d5abe5689..78ad4b545 100644
--- a/src/dft/dos.cc
+++ b/src/dft/dos.cc
@@ -445,7 +445,7 @@ namespace dftfe
     std::vector<double> tempQuadPointValues(n_q_points);
 
     const unsigned int localVectorSize =
-      d_eigenVectorsFlattenedSTL[0].size() / d_numEigenValues;
+      matrix_free_data.get_vector_partitioner()->locally_owned_size();
     std::vector<std::vector<distributedCPUVec<double>>> eigenVectors(
       (1 + d_dftParamsPtr->spinPolarized) * d_kPointWeights.size());
     std::vector<distributedCPUVec<dataTypes::number>>
@@ -475,11 +475,6 @@ namespace dftfe
                   eigenVectorsFlattenedBlock[kPoint]);
                 eigenVectorsFlattenedBlock[kPoint] = dataTypes::number(0.0);
               }
-
-            constraintsNoneDataInfo.precomputeMaps(
-              matrix_free_data.get_vector_partitioner(),
-              eigenVectorsFlattenedBlock[0].get_partitioner(),
-              currentBlockSize);
           }
 
 
@@ -507,8 +502,10 @@ namespace dftfe
               for (unsigned int iWave = 0; iWave < currentBlockSize; ++iWave)
                 eigenVectorsFlattenedBlock[kPoint].local_element(
                   iNode * currentBlockSize + iWave) =
-                  d_eigenVectorsFlattenedSTL[kPoint][iNode * d_numEigenValues +
-                                                     ivec + iWave];
+                  d_eigenVectorsFlattenedHost[kPoint * d_numEigenValues *
+                                                localVectorSize +
+                                              iNode * d_numEigenValues + ivec +
+                                              iWave];
 
             constraintsNoneDataInfo.distribute(
               eigenVectorsFlattenedBlock[kPoint], currentBlockSize);
@@ -987,7 +984,7 @@ namespace dftfe
     std::vector<double> tempQuadPointValues(n_q_points);
 
     const unsigned int localVectorSize =
-      d_eigenVectorsFlattenedSTL[0].size() / d_numEigenValues;
+      matrix_free_data.get_vector_partitioner()->locally_owned_size();
     std::vector<std::vector<distributedCPUVec<double>>> eigenVectors(
       (1 + d_dftParamsPtr->spinPolarized) * d_kPointWeights.size());
     std::vector<distributedCPUVec<dataTypes::number>>
@@ -1022,11 +1019,6 @@ namespace dftfe
                   eigenVectorsFlattenedBlock[kPoint]);
                 eigenVectorsFlattenedBlock[kPoint] = dataTypes::number(0.0);
               }
-
-            constraintsNoneDataInfo.precomputeMaps(
-              matrix_free_data.get_vector_partitioner(),
-              eigenVectorsFlattenedBlock[0].get_partitioner(),
-              currentBlockSize);
           }
 
 
@@ -1056,8 +1048,10 @@ namespace dftfe
               for (unsigned int iWave = 0; iWave < currentBlockSize; ++iWave)
                 eigenVectorsFlattenedBlock[kPoint].local_element(
                   iNode * currentBlockSize + iWave) =
-                  d_eigenVectorsFlattenedSTL[kPoint][iNode * d_numEigenValues +
-                                                     ivec + iWave];
+                  d_eigenVectorsFlattenedHost[kPoint * localVectorSize *
+                                                d_numEigenValues +
+                                              iNode * d_numEigenValues + ivec +
+                                              iWave];
 
             constraintsNoneDataInfo.distribute(
               eigenVectorsFlattenedBlock[kPoint], currentBlockSize);
diff --git a/src/dft/initBoundaryConditions.cc b/src/dft/initBoundaryConditions.cc
index 5e0814c20..d7e207974 100644
--- a/src/dft/initBoundaryConditions.cc
+++ b/src/dft/initBoundaryConditions.cc
@@ -262,6 +262,60 @@ namespace dftfe
                             d_constraintsVector,
                             quadratureVector,
                             additional_data);
+    basisOperationsPtrHost = std::make_shared<
+      dftfe::basis::FEBasisOperations<dataTypes::number,
+                                      double,
+                                      dftfe::utils::MemorySpace::HOST>>(
+      matrix_free_data, d_constraintsVector);
+    dftfe::basis::UpdateFlags updateFlags = dftfe::basis::update_values |
+                                            dftfe::basis::update_gradients |
+                                            dftfe::basis::update_transpose;
+    std::vector<unsigned int> quadratureIndices(4, 0);
+    for (auto i = 0; i < 4; ++i)
+      quadratureIndices[i] = i;
+    basisOperationsPtrHost->init(d_densityDofHandlerIndex,
+                                 quadratureIndices,
+                                 updateFlags);
+    if (!d_dftParamsPtr->useDevice)
+      {
+        std::vector<unsigned int> bandGroupLowHighPlusOneIndices;
+        dftUtils::createBandParallelizationIndices(
+          interBandGroupComm, d_numEigenValues, bandGroupLowHighPlusOneIndices);
+
+        unsigned int BVec = std::min(d_dftParamsPtr->chebyWfcBlockSize,
+                                     bandGroupLowHighPlusOneIndices[1]);
+
+        basisOperationsPtrHost->createScratchMultiVectors(
+          BVec, (d_dftParamsPtr->spinPolarized + 1));
+        if (d_numEigenValues % BVec != 0)
+          basisOperationsPtrHost->createScratchMultiVectors(
+            d_numEigenValues % BVec, (d_dftParamsPtr->spinPolarized + 1));
+        if (d_numEigenValues != d_numEigenValuesRR &&
+            d_numEigenValuesRR % BVec != 0)
+          basisOperationsPtrHost->createScratchMultiVectors(
+            d_numEigenValuesRR % BVec, (d_dftParamsPtr->spinPolarized + 1));
+      }
+#if defined(DFTFE_WITH_DEVICE)
+    if (d_dftParamsPtr->useDevice)
+      {
+        basisOperationsPtrDevice = std::make_shared<
+          dftfe::basis::FEBasisOperations<dataTypes::number,
+                                          double,
+                                          dftfe::utils::MemorySpace::DEVICE>>(
+          matrix_free_data, d_constraintsVector);
+        basisOperationsPtrDevice->init(d_densityDofHandlerIndex,
+                                       quadratureIndices,
+                                       updateFlags);
+        const unsigned int BVec =
+          std::min(d_dftParamsPtr->chebyWfcBlockSize, d_numEigenValues);
+
+        if (d_dftParamsPtr->mixingMethod == "LOW_RANK_DIELECM_PRECOND")
+          basisOperationsPtrDevice->createScratchMultiVectors(BVec, 2);
+        else
+          basisOperationsPtrDevice->createScratchMultiVectors(
+            BVec, (d_dftParamsPtr->spinPolarized + 1));
+      }
+#endif
 
     MPI_Barrier(d_mpiCommParent);
     init_mf = MPI_Wtime() - init_mf;
diff --git a/src/dft/initElectronicFields.cc b/src/dft/initElectronicFields.cc
index 90fb3c3d4..74ce870a7 100644
--- a/src/dft/initElectronicFields.cc
+++ b/src/dft/initElectronicFields.cc
@@ -104,16 +104,17 @@ namespace dftfe
          kPoint < (1 + d_dftParamsPtr->spinPolarized) * d_kPointWeights.size();
          ++kPoint)
       {
-        d_eigenVectorsFlattenedSTL[kPoint].resize(
-          d_numEigenValues *
-            matrix_free_data.get_vector_partitioner()->local_size(),
+        d_eigenVectorsFlattenedHost.resize(
+          (d_numEigenValues *
+           matrix_free_data.get_vector_partitioner()->local_size()) *
+            (1 + d_dftParamsPtr->spinPolarized) * d_kPointWeights.size(),
           dataTypes::number(0.0));
-
         if (d_numEigenValuesRR != d_numEigenValues)
           {
-            d_eigenVectorsRotFracDensityFlattenedSTL[kPoint].resize(
+            d_eigenVectorsRotFracDensityFlattenedHost.resize(
               d_numEigenValuesRR *
-                matrix_free_data.get_vector_partitioner()->local_size(),
+                matrix_free_data.get_vector_partitioner()->local_size() *
+                (1 + d_dftParamsPtr->spinPolarized) * d_kPointWeights.size(),
               dataTypes::number(0.0));
           }
       }
@@ -145,40 +146,26 @@ namespace dftfe
     if (d_dftParamsPtr->useDevice)
       {
         d_eigenVectorsFlattenedDevice.resize(
-          d_eigenVectorsFlattenedSTL[0].size() *
-          (1 + d_dftParamsPtr->spinPolarized) * d_kPointWeights.size());
+          d_eigenVectorsFlattenedHost.size());
 
         if (d_dftParamsPtr->mixingMethod == "LOW_RANK_DIELECM_PRECOND")
           d_eigenVectorsDensityMatrixPrimeFlattenedDevice.resize(
-            d_eigenVectorsFlattenedSTL[0].size() *
-            (1 + d_dftParamsPtr->spinPolarized) * d_kPointWeights.size());
+            d_eigenVectorsFlattenedHost.size());
 
         if (d_numEigenValuesRR != d_numEigenValues)
           d_eigenVectorsRotFracFlattenedDevice.resize(
-            d_eigenVectorsRotFracDensityFlattenedSTL[0].size() *
-            (1 + d_dftParamsPtr->spinPolarized) * d_kPointWeights.size());
+            d_eigenVectorsRotFracDensityFlattenedHost.size());
         else
           d_eigenVectorsRotFracFlattenedDevice.resize(1);
 
-        for (unsigned int kPoint = 0;
-             kPoint <
-             (1 + d_dftParamsPtr->spinPolarized) * d_kPointWeights.size();
-             ++kPoint)
-          {
-            d_eigenVectorsFlattenedDevice
-              .copyFrom<dftfe::utils::MemorySpace::HOST>(
-                &d_eigenVectorsFlattenedSTL[kPoint][0],
-                d_eigenVectorsFlattenedSTL[0].size(),
-                0,
-                kPoint * d_eigenVectorsFlattenedSTL[0].size());
-          }
+        d_eigenVectorsFlattenedDevice.copyFrom(d_eigenVectorsFlattenedHost);
       }
 #endif
 
     if (!d_dftParamsPtr->useDevice &&
         d_dftParamsPtr->mixingMethod == "LOW_RANK_DIELECM_PRECOND")
       {
-        d_eigenVectorsDensityMatrixPrimeSTL = d_eigenVectorsFlattenedSTL;
+        d_eigenVectorsDensityMatrixPrimeHost = d_eigenVectorsFlattenedHost;
       }
 
     if (d_dftParamsPtr->verbosity >= 2 && d_dftParamsPtr->spinPolarized == 1)
diff --git a/src/dft/kohnShamEigenSolve.cc b/src/dft/kohnShamEigenSolve.cc
index 75c473ac6..92b0552c3 100644
--- a/src/dft/kohnShamEigenSolve.cc
+++ b/src/dft/kohnShamEigenSolve.cc
@@ -20,22 +20,17 @@
 #include <vector>
 #include <dft.h>
 #include <linearAlgebraOperations.h>
-#include <densityCalculatorCPU.h>
 
 namespace dftfe
 {
   namespace internal
   {
     void
-    pointWiseScaleWithDiagonal(
-      const distributedCPUVec<double> &diagonal,
-      const std::shared_ptr<const dealii::Utilities::MPI::Partitioner>
-        &                             singleComponentPartitioner,
-      const unsigned int              numberFields,
-      std::vector<dataTypes::number> &fieldsArrayFlattened)
+    pointWiseScaleWithDiagonal(const distributedCPUVec<double> &diagonal,
+                               const unsigned int               numberFields,
+                               const unsigned int               numberDofs,
+                               dataTypes::number *fieldsArrayFlattened)
     {
-      const unsigned int numberDofs =
-        fieldsArrayFlattened.size() / numberFields;
       const unsigned int inc = 1;
 
       for (unsigned int i = 0; i < numberDofs; ++i)
@@ -178,9 +173,9 @@ namespace dftfe
     // by M^{1/2}
     internal::pointWiseScaleWithDiagonal(
       kohnShamDFTEigenOperator.d_sqrtMassVector,
-      matrix_free_data.get_vector_partitioner(),
       d_numEigenValues,
-      d_eigenVectorsFlattenedSTL[0]);
+      matrix_free_data.get_vector_partitioner()->locally_owned_size(),
+      d_eigenVectorsFlattenedHost.data());
 
 
     //
@@ -188,9 +183,11 @@ namespace dftfe
     //
     std::vector<dataTypes::number> ProjHam;
 
-    kohnShamDFTEigenOperator.XtHX(d_eigenVectorsFlattenedSTL[0],
-                                  d_numEigenValues,
-                                  ProjHam);
+    kohnShamDFTEigenOperator.XtHX(
+      d_eigenVectorsFlattenedHost.data(),
+      d_numEigenValues,
+      matrix_free_data.get_vector_partitioner()->locally_owned_size(),
+      ProjHam);
 
     //
     // scale the eigenVectors with M^{-1/2} to represent the wavefunctions in
@@ -198,9 +195,9 @@ namespace dftfe
     //
     internal::pointWiseScaleWithDiagonal(
       kohnShamDFTEigenOperator.d_invSqrtMassVector,
-      matrix_free_data.get_vector_partitioner(),
       d_numEigenValues,
-      d_eigenVectorsFlattenedSTL[0]);
+      matrix_free_data.get_vector_partitioner()->locally_owned_size(),
+      d_eigenVectorsFlattenedHost.data());
 
 
     dataTypes::number trXtHX = 0.0;
@@ -243,25 +240,30 @@ namespace dftfe
     // by M^{1/2}
     internal::pointWiseScaleWithDiagonal(
       kohnShamDFTEigenOperator.d_sqrtMassVector,
-      matrix_free_data.get_vector_partitioner(),
       d_numEigenValues,
-      d_eigenVectorsFlattenedSTL[0]);
+      matrix_free_data.get_vector_partitioner()->locally_owned_size(),
+      d_eigenVectorsFlattenedHost.data());
 
 
     //
     // orthogonalize the vectors
     //
     linearAlgebraOperations::gramSchmidtOrthogonalization(
-      d_eigenVectorsFlattenedSTL[0], d_numEigenValues, mpi_communicator);
+      d_eigenVectorsFlattenedHost.data(),
+      d_numEigenValues,
+      matrix_free_data.get_vector_partitioner()->locally_owned_size(),
+      mpi_communicator);
 
     //
     // compute projected Hamiltonian
     //
     std::vector<dataTypes::number> ProjHam;
 
-    kohnShamDFTEigenOperator.XtHX(d_eigenVectorsFlattenedSTL[0],
-                                  d_numEigenValues,
-                                  ProjHam);
+    kohnShamDFTEigenOperator.XtHX(
+      d_eigenVectorsFlattenedHost.data(),
+      d_numEigenValues,
+      matrix_free_data.get_vector_partitioner()->locally_owned_size(),
+      ProjHam);
 
     //
     // scale the eigenVectors with M^{-1/2} to represent the wavefunctions in
@@ -269,9 +271,9 @@ namespace dftfe
     //
     internal::pointWiseScaleWithDiagonal(
       kohnShamDFTEigenOperator.d_invSqrtMassVector,
-      matrix_free_data.get_vector_partitioner(),
       d_numEigenValues,
-      d_eigenVectorsFlattenedSTL[0]);
+      matrix_free_data.get_vector_partitioner()->locally_owned_size(),
+      d_eigenVectorsFlattenedHost.data());
 
     double trXtKX = 0.0;
 #ifdef USE_COMPLEX
@@ -319,11 +321,12 @@ namespace dftfe
     // by M^{1/2}
     internal::pointWiseScaleWithDiagonal(
       kohnShamDFTEigenOperator.d_sqrtMassVector,
-      matrix_free_data.get_vector_partitioner(),
       d_numEigenValues,
-      d_eigenVectorsFlattenedSTL[(1 + d_dftParamsPtr->spinPolarized) *
-                                   kPointIndex +
-                                 spinType]);
+      matrix_free_data.get_vector_partitioner()->locally_owned_size(),
+      d_eigenVectorsFlattenedHost.data() +
+        ((1 + d_dftParamsPtr->spinPolarized) * kPointIndex + spinType) *
+          d_numEigenValues *
+          matrix_free_data.get_vector_partitioner()->locally_owned_size());
 
     std::vector<double> eigenValuesTemp(isSpectrumSplit ? d_numEigenValuesRR :
                                                           d_numEigenValues,
@@ -387,12 +390,16 @@ namespace dftfe
     subspaceIterationSolver.solve(
       kohnShamDFTEigenOperator,
       elpaScala,
-      d_eigenVectorsFlattenedSTL[(1 + d_dftParamsPtr->spinPolarized) *
-                                   kPointIndex +
-                                 spinType],
-      d_eigenVectorsRotFracDensityFlattenedSTL
-        [(1 + d_dftParamsPtr->spinPolarized) * kPointIndex + spinType],
+      d_eigenVectorsFlattenedHost.data() +
+        ((1 + d_dftParamsPtr->spinPolarized) * kPointIndex + spinType) *
+          d_numEigenValues *
+          matrix_free_data.get_vector_partitioner()->locally_owned_size(),
+      d_eigenVectorsRotFracDensityFlattenedHost.data() +
+        ((1 + d_dftParamsPtr->spinPolarized) * kPointIndex + spinType) *
+          d_numEigenValuesRR *
+          matrix_free_data.get_vector_partitioner()->locally_owned_size(),
       d_numEigenValues,
+      matrix_free_data.get_vector_partitioner()->locally_owned_size(),
       eigenValuesTemp,
       residualNormWaveFunctions,
       interBandGroupComm,
@@ -406,20 +413,23 @@ namespace dftfe
     //
     internal::pointWiseScaleWithDiagonal(
       kohnShamDFTEigenOperator.d_invSqrtMassVector,
-      matrix_free_data.get_vector_partitioner(),
       d_numEigenValues,
-      d_eigenVectorsFlattenedSTL[(1 + d_dftParamsPtr->spinPolarized) *
-                                   kPointIndex +
-                                 spinType]);
+      matrix_free_data.get_vector_partitioner()->locally_owned_size(),
+      d_eigenVectorsFlattenedHost.data() +
+        ((1 + d_dftParamsPtr->spinPolarized) * kPointIndex + spinType) *
+          d_numEigenValues *
+          matrix_free_data.get_vector_partitioner()->locally_owned_size());
 
     if (isSpectrumSplit && d_numEigenValuesRR != d_numEigenValues)
       {
         internal::pointWiseScaleWithDiagonal(
           kohnShamDFTEigenOperator.d_invSqrtMassVector,
-          matrix_free_data.get_vector_partitioner(),
           d_numEigenValuesRR,
-          d_eigenVectorsRotFracDensityFlattenedSTL
-            [(1 + d_dftParamsPtr->spinPolarized) * kPointIndex + spinType]);
+          matrix_free_data.get_vector_partitioner()->locally_owned_size(),
+          d_eigenVectorsRotFracDensityFlattenedHost.data() +
+            ((1 + d_dftParamsPtr->spinPolarized) * kPointIndex + spinType) *
+              d_numEigenValuesRR *
+              matrix_free_data.get_vector_partitioner()->locally_owned_size());
       }
 
     //
@@ -537,8 +547,10 @@ namespace dftfe
           elpaScala,
           d_eigenVectorsFlattenedDevice.begin() +
             ((1 + d_dftParamsPtr->spinPolarized) * kPointIndex + spinType) *
-              d_eigenVectorsFlattenedSTL[0].size(),
-          d_eigenVectorsFlattenedSTL[0].size(),
+              d_numEigenValues *
+              matrix_free_data.get_vector_partitioner()->locally_owned_size(),
+          d_numEigenValues *
+            matrix_free_data.get_vector_partitioner()->locally_owned_size(),
           d_numEigenValues,
           eigenValuesDummy,
           *d_devicecclMpiCommDomainPtr,
@@ -556,11 +568,14 @@ namespace dftfe
             elpaScala,
             d_eigenVectorsFlattenedDevice.begin() +
               ((1 + d_dftParamsPtr->spinPolarized) * kPointIndex + spinType) *
-                d_eigenVectorsFlattenedSTL[0].size(),
+                d_numEigenValues *
+                matrix_free_data.get_vector_partitioner()->locally_owned_size(),
             d_eigenVectorsRotFracFlattenedDevice.begin() +
               ((1 + d_dftParamsPtr->spinPolarized) * kPointIndex + spinType) *
-                d_eigenVectorsRotFracDensityFlattenedSTL[0].size(),
-            d_eigenVectorsFlattenedSTL[0].size(),
+                d_numEigenValuesRR *
+                matrix_free_data.get_vector_partitioner()->locally_owned_size(),
+            d_numEigenValues *
+              matrix_free_data.get_vector_partitioner()->locally_owned_size(),
             d_numEigenValues,
             eigenValuesTemp,
             residualNormWaveFunctions,
@@ -662,11 +677,12 @@ namespace dftfe
     // multiply by M^{1/2}
     internal::pointWiseScaleWithDiagonal(
       kohnShamDFTEigenOperator.d_sqrtMassVector,
-      matrix_free_data.get_vector_partitioner(d_densityDofHandlerIndex),
       d_numEigenValues,
-      d_eigenVectorsDensityMatrixPrimeSTL[(1 + d_dftParamsPtr->spinPolarized) *
-                                            kPointIndex +
-                                          spinType]);
+      matrix_free_data.get_vector_partitioner()->locally_owned_size(),
+      d_eigenVectorsDensityMatrixPrimeHost.data() +
+        ((1 + d_dftParamsPtr->spinPolarized) * kPointIndex + spinType) *
+          d_numEigenValues *
+          matrix_free_data.get_vector_partitioner()->locally_owned_size());
 
     std::vector<double> eigenValuesTemp(d_numEigenValues, 0.0);
     for (unsigned int i = 0; i < d_numEigenValues; i++)
@@ -678,10 +694,12 @@ namespace dftfe
 
     linearAlgebraOperations::densityMatrixEigenBasisFirstOrderResponse(
       kohnShamDFTEigenOperator,
-      d_eigenVectorsDensityMatrixPrimeSTL[(1 + d_dftParamsPtr->spinPolarized) *
-                                            kPointIndex +
-                                          spinType],
+      d_eigenVectorsDensityMatrixPrimeHost.data() +
+        ((1 + d_dftParamsPtr->spinPolarized) * kPointIndex + spinType) *
+          d_numEigenValues *
+          matrix_free_data.get_vector_partitioner()->locally_owned_size(),
       d_numEigenValues,
+      matrix_free_data.get_vector_partitioner()->locally_owned_size(),
       d_mpiCommParent,
       kohnShamDFTEigenOperator.getMPICommunicator(),
       interBandGroupComm,
@@ -700,11 +718,12 @@ namespace dftfe
     //
     internal::pointWiseScaleWithDiagonal(
       kohnShamDFTEigenOperator.d_invSqrtMassVector,
-      matrix_free_data.get_vector_partitioner(d_densityDofHandlerIndex),
       d_numEigenValues,
-      d_eigenVectorsDensityMatrixPrimeSTL[(1 + d_dftParamsPtr->spinPolarized) *
-                                            kPointIndex +
-                                          spinType]);
+      matrix_free_data.get_vector_partitioner()->locally_owned_size(),
+      d_eigenVectorsDensityMatrixPrimeHost.data() +
+        ((1 + d_dftParamsPtr->spinPolarized) * kPointIndex + spinType) *
+          d_numEigenValues *
+          matrix_free_data.get_vector_partitioner()->locally_owned_size());
   }
 
 #ifdef DFTFE_WITH_DEVICE
@@ -739,8 +758,10 @@ namespace dftfe
       kohnShamDFTEigenOperator,
       d_eigenVectorsDensityMatrixPrimeFlattenedDevice.begin() +
         ((1 + d_dftParamsPtr->spinPolarized) * kPointIndex + spinType) *
-          d_eigenVectorsFlattenedSTL[0].size(),
-      d_eigenVectorsFlattenedSTL[0].size(),
+          d_numEigenValues *
+          matrix_free_data.get_vector_partitioner()->locally_owned_size(),
+      d_numEigenValues *
+        matrix_free_data.get_vector_partitioner()->locally_owned_size(),
       d_numEigenValues,
       eigenValuesTemp,
       fermiEnergy,
@@ -779,11 +800,12 @@ namespace dftfe
     if (ipass == 1)
       internal::pointWiseScaleWithDiagonal(
         kohnShamDFTEigenOperator.d_invSqrtMassVector,
-        matrix_free_data.get_vector_partitioner(),
         d_numEigenValues,
-        d_eigenVectorsFlattenedSTL[(1 + d_dftParamsPtr->spinPolarized) *
-                                     kPointIndex +
-                                   spinType]);
+        matrix_free_data.get_vector_partitioner()->locally_owned_size(),
+        d_eigenVectorsFlattenedHost.data() +
+          ((1 + d_dftParamsPtr->spinPolarized) * kPointIndex + spinType) *
+            d_numEigenValues *
+            matrix_free_data.get_vector_partitioner()->locally_owned_size());
 
 
     std::vector<double> eigenValuesTemp(d_numEigenValues, 0.0);
@@ -834,13 +856,16 @@ namespace dftfe
     subspaceIterationSolver.solve(
       kohnShamDFTEigenOperator,
       *d_elpaScala,
-      d_eigenVectorsFlattenedSTL[(1 + d_dftParamsPtr->spinPolarized) *
-                                   kPointIndex +
-                                 spinType],
-      d_eigenVectorsFlattenedSTL[(1 + d_dftParamsPtr->spinPolarized) *
-                                   kPointIndex +
-                                 spinType],
+      d_eigenVectorsFlattenedHost.data() +
+        ((1 + d_dftParamsPtr->spinPolarized) * kPointIndex + spinType) *
+          d_numEigenValues *
+          matrix_free_data.get_vector_partitioner()->locally_owned_size(),
+      d_eigenVectorsFlattenedHost.data() +
+        ((1 + d_dftParamsPtr->spinPolarized) * kPointIndex + spinType) *
+          d_numEigenValues *
+          matrix_free_data.get_vector_partitioner()->locally_owned_size(),
       d_numEigenValues,
+      matrix_free_data.get_vector_partitioner()->locally_owned_size(),
       eigenValuesTemp,
       residualNormWaveFunctions,
       interBandGroupComm,
diff --git a/src/dft/localizationLength.cc b/src/dft/localizationLength.cc
index 64e06d019..90296b7eb 100644
--- a/src/dft/localizationLength.cc
+++ b/src/dft/localizationLength.cc
@@ -57,8 +57,9 @@ namespace dftfe
     for (unsigned int iWave = 0; iWave < d_numEigenValues; ++iWave)
       {
         vectorTools::copyFlattenedSTLVecToSingleCompVec(
-          d_eigenVectorsFlattenedSTL[0],
+          d_eigenVectorsFlattenedHost.data(),
           d_numEigenValues,
+          matrix_free_data.get_vector_partitioner()->locally_owned_size(),
           std::make_pair(iWave, iWave + 1),
           tempVec);
 
diff --git a/src/dft/psiInitialGuess.cc b/src/dft/psiInitialGuess.cc
index f9a8ae3a9..f065dec2f 100644
--- a/src/dft/psiInitialGuess.cc
+++ b/src/dft/psiInitialGuess.cc
@@ -344,14 +344,9 @@ namespace dftfe
     locallyOwnedSet.fill_index_vector(locallyOwnedDOFs);
     unsigned int numberDofs = locallyOwnedDOFs.size();
 
-    for (unsigned int kPoint = 0;
-         kPoint < (1 + d_dftParamsPtr->spinPolarized) * d_kPointWeights.size();
-         ++kPoint)
-      {
-        std::fill(d_eigenVectorsFlattenedSTL[kPoint].begin(),
-                  d_eigenVectorsFlattenedSTL[kPoint].end(),
-                  0.0);
-      }
+    std::fill(d_eigenVectorsFlattenedHost.begin(),
+              d_eigenVectorsFlattenedHost.end(),
+              0.0);
 
     const unsigned int numberGlobalAtoms = atomLocations.size();
 
@@ -498,8 +493,9 @@ namespace dftfe
                             // spherical part
                             if (it->m > 0)
                               {
-                                d_eigenVectorsFlattenedSTL
-                                  [kPoint][dof * d_numEigenValues + waveId] +=
+                                d_eigenVectorsFlattenedHost
+                                  [kPoint * d_numEigenValues * numberDofs +
+                                   dof * d_numEigenValues + waveId] +=
                                   dataTypes::number(
                                     R * std::sqrt(2) *
                                     boost::math::spherical_harmonic_r(
@@ -507,16 +503,18 @@ namespace dftfe
                               }
                             else if (it->m == 0)
                               {
-                                d_eigenVectorsFlattenedSTL
-                                  [kPoint][dof * d_numEigenValues + waveId] +=
+                                d_eigenVectorsFlattenedHost
+                                  [kPoint * d_numEigenValues * numberDofs +
+                                   dof * d_numEigenValues + waveId] +=
                                   dataTypes::number(
                                     R * boost::math::spherical_harmonic_r(
                                           it->l, it->m, theta, phi));
                               }
                             else
                               {
-                                d_eigenVectorsFlattenedSTL
-                                  [kPoint][dof * d_numEigenValues + waveId] +=
+                                d_eigenVectorsFlattenedHost
+                                  [kPoint * d_numEigenValues * numberDofs +
+                                   dof * d_numEigenValues + waveId] +=
                                   dataTypes::number(
                                     R * std::sqrt(2) *
                                     boost::math::spherical_harmonic_i(
@@ -539,8 +537,9 @@ namespace dftfe
                     //
                     // boost::math::normal normDist;
 
-                    std::vector<dataTypes::number> &temp =
-                      d_eigenVectorsFlattenedSTL[kPoint];
+                    dataTypes::number *temp =
+                      d_eigenVectorsFlattenedHost.data() +
+                      kPoint * d_numEigenValues * numberDofs;
                     for (unsigned int iWave = waveFunctionsVector.size();
                          iWave < d_numEigenValues;
                          ++iWave)
@@ -565,11 +564,10 @@ namespace dftfe
              (1 + d_dftParamsPtr->spinPolarized) * d_kPointWeights.size();
              ++kPoint)
           {
-            std::vector<dataTypes::number> &temp1 =
-              d_eigenVectorsFlattenedSTL[kPoint];
+            dataTypes::number *temp1 = d_eigenVectorsFlattenedHost.data() +
+                                       kPoint * d_numEigenValues * numberDofs;
 
-            std::vector<dataTypes::number> &temp2 =
-              d_eigenVectorsFlattenedSTL[0];
+            dataTypes::number *temp2 = d_eigenVectorsFlattenedHost.data();
 
             for (unsigned int idof = 0; idof < numberDofs; idof++)
               for (unsigned int iwave = 0; iwave < d_numEigenValues; iwave++)
diff --git a/src/dft/solveNSCF.cc b/src/dft/solveNSCF.cc
index bcdb9edf9..b03a0b22d 100644
--- a/src/dft/solveNSCF.cc
+++ b/src/dft/solveNSCF.cc
@@ -1130,17 +1130,7 @@ namespace dftfe
     if (d_dftParamsPtr->useDevice &&
         (d_dftParamsPtr->writeWfcSolutionFields ||
          d_dftParamsPtr->writeLdosFile || d_dftParamsPtr->writePdosFile))
-      for (unsigned int kPoint = 0;
-           kPoint <
-           (1 + d_dftParamsPtr->spinPolarized) * d_kPointWeights.size();
-           ++kPoint)
-        {
-          d_eigenVectorsFlattenedDevice.copyTo<dftfe::utils::MemorySpace::HOST>(
-            &d_eigenVectorsFlattenedSTL[kPoint][0],
-            d_eigenVectorsFlattenedSTL[kPoint].size(),
-            (kPoint * d_eigenVectorsFlattenedSTL[0].size()),
-            0);
-        }
+      d_eigenVectorsFlattenedDevice.copyTo(d_eigenVectorsFlattenedHost);
 #endif
 
     //#ifdef USE_COMPLEX
diff --git a/src/dft/solveVselfInBinsDevice.cc b/src/dft/solveVselfInBinsDevice.cc
index 22bad5577..3fc681a31 100644
--- a/src/dft/solveVselfInBinsDevice.cc
+++ b/src/dft/solveVselfInBinsDevice.cc
@@ -515,10 +515,6 @@ namespace dftfe
         matrixFreeData.get_vector_partitioner(mfDofHandlerIndex),
         hangingPeriodicConstraintMatrix);
 
-
-      constraintsMatrixDataInfoDevice.precomputeMaps(
-        flattenedArray.getMPIPatternP2P(), blockSize);
-
       constraintsMatrixDataInfoDevice.set_zero(xD, blockSize);
 
       dftfe::utils::deviceSynchronize();
diff --git a/src/dftOperator/computeNonLocalHamiltonianTimesXMemoryOptBatchGEMMDevice.cc b/src/dftOperator/computeNonLocalHamiltonianTimesXMemoryOptBatchGEMMDevice.cc
index 349ff0004..35ec71e9d 100644
--- a/src/dftOperator/computeNonLocalHamiltonianTimesXMemoryOptBatchGEMMDevice.cc
+++ b/src/dftOperator/computeNonLocalHamiltonianTimesXMemoryOptBatchGEMMDevice.cc
@@ -253,8 +253,7 @@ kohnShamDFTOperatorDeviceClass<FEOrder, FEOrderElectro>::
   if (std::is_same<dataTypes::number, std::complex<double>>::value)
     {
       utils::deviceKernelsGeneric::copyComplexArrToRealArrsDevice(
-        (d_parallelChebyBlockVectorDevice.localSize() *
-         d_parallelChebyBlockVectorDevice.numVectors()),
+        (d_tempRealVec.size()),
         dst,
         d_tempRealVec.begin(),
         d_tempImagVec.begin());
@@ -269,8 +268,7 @@ kohnShamDFTOperatorDeviceClass<FEOrder, FEOrderElectro>::
 
 
       utils::deviceKernelsGeneric::copyRealArrsToComplexArrDevice(
-        (d_parallelChebyBlockVectorDevice.localSize() *
-         d_parallelChebyBlockVectorDevice.numVectors()),
+        (d_tempRealVec.size()),
         d_tempRealVec.begin(),
         d_tempImagVec.begin(),
         dst);
diff --git a/src/dftOperator/hamiltonianMatrixCalculatorFlattenedDevice.cc b/src/dftOperator/hamiltonianMatrixCalculatorFlattenedDevice.cc
index 963d2cafd..661c67f8f 100644
--- a/src/dftOperator/hamiltonianMatrixCalculatorFlattenedDevice.cc
+++ b/src/dftOperator/hamiltonianMatrixCalculatorFlattenedDevice.cc
@@ -68,9 +68,9 @@ namespace
     const unsigned int numkPoints,
     const double *     shapeFunctionValues,
     const double *     shapeFunctionValuesTransposed,
-    const double *     shapeFunctionGradientValuesXTransposed,
-    const double *     shapeFunctionGradientValuesYTransposed,
-    const double *     shapeFunctionGradientValuesZTransposed,
+    const double *     shapeFunctionGradientValues,
+    const double *     inverseJacobianValues,
+    const int          areAllCellsAffineOrCartesianFlag,
     const double *     cellShapeFunctionGradientIntegral,
     const double *     vEffJxW,
     const double *     JxW,
@@ -124,9 +124,9 @@ namespace
     const unsigned int numkPoints,
     const double *     shapeFunctionValues,
     const double *     shapeFunctionValuesTransposed,
-    const double *     shapeFunctionGradientValuesXTransposed,
-    const double *     shapeFunctionGradientValuesYTransposed,
-    const double *     shapeFunctionGradientValuesZTransposed,
+    const double *     shapeFunctionGradientValues,
+    const double *     inverseJacobianValues,
+    const int          areAllCellsAffineOrCartesianFlag,
     const double *     cellShapeFunctionGradientIntegral,
     const double *     vEffJxW,
     const double *     JxW,
@@ -165,21 +165,82 @@ namespace
             const double shapeJ =
               shapeFunctionValuesTransposed[q * numDofsPerCell + cellDofIndexJ];
 
-            const double gradShapeXI =
-              shapeFunctionGradientValuesXTransposed[cellIndex * numQuadPoints *
-                                                       numDofsPerCell +
-                                                     numDofsPerCell * q +
-                                                     cellDofIndexI];
-            const double gradShapeYI =
-              shapeFunctionGradientValuesYTransposed[cellIndex * numQuadPoints *
-                                                       numDofsPerCell +
-                                                     numDofsPerCell * q +
-                                                     cellDofIndexI];
-            const double gradShapeZI =
-              shapeFunctionGradientValuesZTransposed[cellIndex * numQuadPoints *
-                                                       numDofsPerCell +
-                                                     numDofsPerCell * q +
-                                                     cellDofIndexI];
+            double gradShapeXI, gradShapeXJ, gradShapeYI, gradShapeYJ,
+              gradShapeZI, gradShapeZJ;
+            const double gradShapeXIRef =
+              shapeFunctionGradientValues[numDofsPerCell * q + cellDofIndexI];
+            const double gradShapeYIRef =
+              shapeFunctionGradientValues[numDofsPerCell * numQuadPoints +
+                                          numDofsPerCell * q + cellDofIndexI];
+            const double gradShapeZIRef =
+              shapeFunctionGradientValues[numDofsPerCell * numQuadPoints * 2 +
+                                          numDofsPerCell * q + cellDofIndexI];
+            if (areAllCellsAffineOrCartesianFlag == 0)
+              {
+                const double Jxx =
+                  inverseJacobianValues[cellIndex * numQuadPoints * 9 + q * 9 +
+                                        0];
+                const double Jxy =
+                  inverseJacobianValues[cellIndex * numQuadPoints * 9 + q * 9 +
+                                        1];
+                const double Jxz =
+                  inverseJacobianValues[cellIndex * numQuadPoints * 9 + q * 9 +
+                                        2];
+                const double Jyx =
+                  inverseJacobianValues[cellIndex * numQuadPoints * 9 + q * 9 +
+                                        3];
+                const double Jyy =
+                  inverseJacobianValues[cellIndex * numQuadPoints * 9 + q * 9 +
+                                        4];
+                const double Jyz =
+                  inverseJacobianValues[cellIndex * numQuadPoints * 9 + q * 9 +
+                                        5];
+                const double Jzx =
+                  inverseJacobianValues[cellIndex * numQuadPoints * 9 + q * 9 +
+                                        6];
+                const double Jzy =
+                  inverseJacobianValues[cellIndex * numQuadPoints * 9 + q * 9 +
+                                        7];
+                const double Jzz =
+                  inverseJacobianValues[cellIndex * numQuadPoints * 9 + q * 9 +
+                                        8];
+
+                gradShapeXI = gradShapeXIRef * Jxx + gradShapeYIRef * Jxy +
+                              gradShapeZIRef * Jxz;
+                gradShapeYI = gradShapeXIRef * Jyx + gradShapeYIRef * Jyy +
+                              gradShapeZIRef * Jyz;
+                gradShapeZI = gradShapeXIRef * Jzx + gradShapeYIRef * Jzy +
+                              gradShapeZIRef * Jzz;
+              }
+            else if (areAllCellsAffineOrCartesianFlag == 1)
+              {
+                const double Jxx = inverseJacobianValues[cellIndex * 9 + 0];
+                const double Jxy = inverseJacobianValues[cellIndex * 9 + 1];
+                const double Jxz = inverseJacobianValues[cellIndex * 9 + 2];
+                const double Jyx = inverseJacobianValues[cellIndex * 9 + 3];
+                const double Jyy = inverseJacobianValues[cellIndex * 9 + 4];
+                const double Jyz = inverseJacobianValues[cellIndex * 9 + 5];
+                const double Jzx = inverseJacobianValues[cellIndex * 9 + 6];
+                const double Jzy = inverseJacobianValues[cellIndex * 9 + 7];
+                const double Jzz = inverseJacobianValues[cellIndex * 9 + 8];
+
+                gradShapeXI = gradShapeXIRef * Jxx + gradShapeYIRef * Jxy +
+                              gradShapeZIRef * Jxz;
+                gradShapeYI = gradShapeXIRef * Jyx + gradShapeYIRef * Jyy +
+                              gradShapeZIRef * Jyz;
+                gradShapeZI = gradShapeXIRef * Jzx + gradShapeYIRef * Jzy +
+                              gradShapeZIRef * Jzz;
+              }
+            else if (areAllCellsAffineOrCartesianFlag == 2)
+              {
+                const double Jxx = inverseJacobianValues[cellIndex * 3 + 0];
+                const double Jyy = inverseJacobianValues[cellIndex * 3 + 1];
+                const double Jzz = inverseJacobianValues[cellIndex * 3 + 2];
+
+                gradShapeXI = gradShapeXIRef * Jxx;
+                gradShapeYI = gradShapeYIRef * Jyy;
+                gradShapeZI = gradShapeZIRef * Jzz;
+              }
 
             val += vEffJxW[cellIndex * numQuadPoints + q] * shapeI * shapeJ;
 
@@ -227,9 +288,9 @@ namespace
     const unsigned int numkPoints,
     const double *     shapeFunctionValues,
     const double *     shapeFunctionValuesTransposed,
-    const double *     shapeFunctionGradientValuesXTransposed,
-    const double *     shapeFunctionGradientValuesYTransposed,
-    const double *     shapeFunctionGradientValuesZTransposed,
+    const double *     shapeFunctionGradientValues,
+    const double *     inverseJacobianValues,
+    const int          areAllCellsAffineOrCartesianFlag,
     const double *     cellShapeFunctionGradientIntegral,
     const double *     vEffJxW,
     const double *     JxW,
@@ -264,37 +325,105 @@ namespace
             const double shapeJ =
               shapeFunctionValuesTransposed[q * numDofsPerCell + cellDofIndexJ];
 
-            const double gradShapeXI =
-              shapeFunctionGradientValuesXTransposed[cellIndex * numQuadPoints *
-                                                       numDofsPerCell +
-                                                     numDofsPerCell * q +
-                                                     cellDofIndexI];
-            const double gradShapeYI =
-              shapeFunctionGradientValuesYTransposed[cellIndex * numQuadPoints *
-                                                       numDofsPerCell +
-                                                     numDofsPerCell * q +
-                                                     cellDofIndexI];
-            const double gradShapeZI =
-              shapeFunctionGradientValuesZTransposed[cellIndex * numQuadPoints *
-                                                       numDofsPerCell +
-                                                     numDofsPerCell * q +
-                                                     cellDofIndexI];
-
-            const double gradShapeXJ =
-              shapeFunctionGradientValuesXTransposed[cellIndex * numQuadPoints *
-                                                       numDofsPerCell +
-                                                     numDofsPerCell * q +
-                                                     cellDofIndexJ];
-            const double gradShapeYJ =
-              shapeFunctionGradientValuesYTransposed[cellIndex * numQuadPoints *
-                                                       numDofsPerCell +
-                                                     numDofsPerCell * q +
-                                                     cellDofIndexJ];
-            const double gradShapeZJ =
-              shapeFunctionGradientValuesZTransposed[cellIndex * numQuadPoints *
-                                                       numDofsPerCell +
-                                                     numDofsPerCell * q +
-                                                     cellDofIndexJ];
+            double gradShapeXI, gradShapeXJ, gradShapeYI, gradShapeYJ,
+              gradShapeZI, gradShapeZJ;
+            const double gradShapeXIRef =
+              shapeFunctionGradientValues[numDofsPerCell * q + cellDofIndexI];
+            const double gradShapeYIRef =
+              shapeFunctionGradientValues[numDofsPerCell * numQuadPoints +
+                                          numDofsPerCell * q + cellDofIndexI];
+            const double gradShapeZIRef =
+              shapeFunctionGradientValues[numDofsPerCell * numQuadPoints * 2 +
+                                          numDofsPerCell * q + cellDofIndexI];
+            const double gradShapeXJRef =
+              shapeFunctionGradientValues[numDofsPerCell * q + cellDofIndexJ];
+            const double gradShapeYJRef =
+              shapeFunctionGradientValues[numDofsPerCell * numQuadPoints +
+                                          numDofsPerCell * q + cellDofIndexJ];
+            const double gradShapeZJRef =
+              shapeFunctionGradientValues[numDofsPerCell * numQuadPoints * 2 +
+                                          numDofsPerCell * q + cellDofIndexJ];
+            if (areAllCellsAffineOrCartesianFlag == 0)
+              {
+                const double Jxx =
+                  inverseJacobianValues[cellIndex * numQuadPoints * 9 + q * 9 +
+                                        0];
+                const double Jxy =
+                  inverseJacobianValues[cellIndex * numQuadPoints * 9 + q * 9 +
+                                        1];
+                const double Jxz =
+                  inverseJacobianValues[cellIndex * numQuadPoints * 9 + q * 9 +
+                                        2];
+                const double Jyx =
+                  inverseJacobianValues[cellIndex * numQuadPoints * 9 + q * 9 +
+                                        3];
+                const double Jyy =
+                  inverseJacobianValues[cellIndex * numQuadPoints * 9 + q * 9 +
+                                        4];
+                const double Jyz =
+                  inverseJacobianValues[cellIndex * numQuadPoints * 9 + q * 9 +
+                                        5];
+                const double Jzx =
+                  inverseJacobianValues[cellIndex * numQuadPoints * 9 + q * 9 +
+                                        6];
+                const double Jzy =
+                  inverseJacobianValues[cellIndex * numQuadPoints * 9 + q * 9 +
+                                        7];
+                const double Jzz =
+                  inverseJacobianValues[cellIndex * numQuadPoints * 9 + q * 9 +
+                                        8];
+
+                gradShapeXI = gradShapeXIRef * Jxx + gradShapeYIRef * Jxy +
+                              gradShapeZIRef * Jxz;
+                gradShapeYI = gradShapeXIRef * Jyx + gradShapeYIRef * Jyy +
+                              gradShapeZIRef * Jyz;
+                gradShapeZI = gradShapeXIRef * Jzx + gradShapeYIRef * Jzy +
+                              gradShapeZIRef * Jzz;
+                gradShapeXJ = gradShapeXJRef * Jxx + gradShapeYJRef * Jxy +
+                              gradShapeZJRef * Jxz;
+                gradShapeYJ = gradShapeXJRef * Jyx + gradShapeYJRef * Jyy +
+                              gradShapeZJRef * Jyz;
+                gradShapeZJ = gradShapeXJRef * Jzx + gradShapeYJRef * Jzy +
+                              gradShapeZJRef * Jzz;
+              }
+            else if (areAllCellsAffineOrCartesianFlag == 1)
+              {
+                const double Jxx = inverseJacobianValues[cellIndex * 9 + 0];
+                const double Jxy = inverseJacobianValues[cellIndex * 9 + 1];
+                const double Jxz = inverseJacobianValues[cellIndex * 9 + 2];
+                const double Jyx = inverseJacobianValues[cellIndex * 9 + 3];
+                const double Jyy = inverseJacobianValues[cellIndex * 9 + 4];
+                const double Jyz = inverseJacobianValues[cellIndex * 9 + 5];
+                const double Jzx = inverseJacobianValues[cellIndex * 9 + 6];
+                const double Jzy = inverseJacobianValues[cellIndex * 9 + 7];
+                const double Jzz = inverseJacobianValues[cellIndex * 9 + 8];
+
+                gradShapeXI = gradShapeXIRef * Jxx + gradShapeYIRef * Jxy +
+                              gradShapeZIRef * Jxz;
+                gradShapeYI = gradShapeXIRef * Jyx + gradShapeYIRef * Jyy +
+                              gradShapeZIRef * Jyz;
+                gradShapeZI = gradShapeXIRef * Jzx + gradShapeYIRef * Jzy +
+                              gradShapeZIRef * Jzz;
+                gradShapeXJ = gradShapeXJRef * Jxx + gradShapeYJRef * Jxy +
+                              gradShapeZJRef * Jxz;
+                gradShapeYJ = gradShapeXJRef * Jyx + gradShapeYJRef * Jyy +
+                              gradShapeZJRef * Jyz;
+                gradShapeZJ = gradShapeXJRef * Jzx + gradShapeYJRef * Jzy +
+                              gradShapeZJRef * Jzz;
+              }
+            else if (areAllCellsAffineOrCartesianFlag == 2)
+              {
+                const double Jxx = inverseJacobianValues[cellIndex * 3 + 0];
+                const double Jyy = inverseJacobianValues[cellIndex * 3 + 1];
+                const double Jzz = inverseJacobianValues[cellIndex * 3 + 2];
+
+                gradShapeXI = gradShapeXIRef * Jxx;
+                gradShapeYI = gradShapeYIRef * Jyy;
+                gradShapeZI = gradShapeZIRef * Jzz;
+                gradShapeXJ = gradShapeXJRef * Jxx;
+                gradShapeYJ = gradShapeYJRef * Jyy;
+                gradShapeZJ = gradShapeZJRef * Jzz;
+              }
 
 
             val +=
@@ -334,9 +463,9 @@ namespace
     const unsigned int numkPoints,
     const double *     shapeFunctionValues,
     const double *     shapeFunctionValuesTransposed,
-    const double *     shapeFunctionGradientValuesXTransposed,
-    const double *     shapeFunctionGradientValuesYTransposed,
-    const double *     shapeFunctionGradientValuesZTransposed,
+    const double *     shapeFunctionGradientValues,
+    const double *     inverseJacobianValues,
+    const int          areAllCellsAffineOrCartesianFlag,
     const double *     cellShapeFunctionGradientIntegral,
     const double *     vEffJxW,
     const double *     JxW,
@@ -376,37 +505,105 @@ namespace
             const double shapeJ =
               shapeFunctionValuesTransposed[q * numDofsPerCell + cellDofIndexJ];
 
-            const double gradShapeXI =
-              shapeFunctionGradientValuesXTransposed[cellIndex * numQuadPoints *
-                                                       numDofsPerCell +
-                                                     numDofsPerCell * q +
-                                                     cellDofIndexI];
-            const double gradShapeYI =
-              shapeFunctionGradientValuesYTransposed[cellIndex * numQuadPoints *
-                                                       numDofsPerCell +
-                                                     numDofsPerCell * q +
-                                                     cellDofIndexI];
-            const double gradShapeZI =
-              shapeFunctionGradientValuesZTransposed[cellIndex * numQuadPoints *
-                                                       numDofsPerCell +
-                                                     numDofsPerCell * q +
-                                                     cellDofIndexI];
-
-            const double gradShapeXJ =
-              shapeFunctionGradientValuesXTransposed[cellIndex * numQuadPoints *
-                                                       numDofsPerCell +
-                                                     numDofsPerCell * q +
-                                                     cellDofIndexJ];
-            const double gradShapeYJ =
-              shapeFunctionGradientValuesYTransposed[cellIndex * numQuadPoints *
-                                                       numDofsPerCell +
-                                                     numDofsPerCell * q +
-                                                     cellDofIndexJ];
-            const double gradShapeZJ =
-              shapeFunctionGradientValuesZTransposed[cellIndex * numQuadPoints *
-                                                       numDofsPerCell +
-                                                     numDofsPerCell * q +
-                                                     cellDofIndexJ];
+            double gradShapeXI, gradShapeXJ, gradShapeYI, gradShapeYJ,
+              gradShapeZI, gradShapeZJ;
+            const double gradShapeXIRef =
+              shapeFunctionGradientValues[numDofsPerCell * q + cellDofIndexI];
+            const double gradShapeYIRef =
+              shapeFunctionGradientValues[numDofsPerCell * numQuadPoints +
+                                          numDofsPerCell * q + cellDofIndexI];
+            const double gradShapeZIRef =
+              shapeFunctionGradientValues[numDofsPerCell * numQuadPoints * 2 +
+                                          numDofsPerCell * q + cellDofIndexI];
+            const double gradShapeXJRef =
+              shapeFunctionGradientValues[numDofsPerCell * q + cellDofIndexJ];
+            const double gradShapeYJRef =
+              shapeFunctionGradientValues[numDofsPerCell * numQuadPoints +
+                                          numDofsPerCell * q + cellDofIndexJ];
+            const double gradShapeZJRef =
+              shapeFunctionGradientValues[numDofsPerCell * numQuadPoints * 2 +
+                                          numDofsPerCell * q + cellDofIndexJ];
+            if (areAllCellsAffineOrCartesianFlag == 0)
+              {
+                const double Jxx =
+                  inverseJacobianValues[cellIndex * numQuadPoints * 9 + q * 9 +
+                                        0];
+                const double Jxy =
+                  inverseJacobianValues[cellIndex * numQuadPoints * 9 + q * 9 +
+                                        1];
+                const double Jxz =
+                  inverseJacobianValues[cellIndex * numQuadPoints * 9 + q * 9 +
+                                        2];
+                const double Jyx =
+                  inverseJacobianValues[cellIndex * numQuadPoints * 9 + q * 9 +
+                                        3];
+                const double Jyy =
+                  inverseJacobianValues[cellIndex * numQuadPoints * 9 + q * 9 +
+                                        4];
+                const double Jyz =
+                  inverseJacobianValues[cellIndex * numQuadPoints * 9 + q * 9 +
+                                        5];
+                const double Jzx =
+                  inverseJacobianValues[cellIndex * numQuadPoints * 9 + q * 9 +
+                                        6];
+                const double Jzy =
+                  inverseJacobianValues[cellIndex * numQuadPoints * 9 + q * 9 +
+                                        7];
+                const double Jzz =
+                  inverseJacobianValues[cellIndex * numQuadPoints * 9 + q * 9 +
+                                        8];
+
+                gradShapeXI = gradShapeXIRef * Jxx + gradShapeYIRef * Jxy +
+                              gradShapeZIRef * Jxz;
+                gradShapeYI = gradShapeXIRef * Jyx + gradShapeYIRef * Jyy +
+                              gradShapeZIRef * Jyz;
+                gradShapeZI = gradShapeXIRef * Jzx + gradShapeYIRef * Jzy +
+                              gradShapeZIRef * Jzz;
+                gradShapeXJ = gradShapeXJRef * Jxx + gradShapeYJRef * Jxy +
+                              gradShapeZJRef * Jxz;
+                gradShapeYJ = gradShapeXJRef * Jyx + gradShapeYJRef * Jyy +
+                              gradShapeZJRef * Jyz;
+                gradShapeZJ = gradShapeXJRef * Jzx + gradShapeYJRef * Jzy +
+                              gradShapeZJRef * Jzz;
+              }
+            else if (areAllCellsAffineOrCartesianFlag == 1)
+              {
+                const double Jxx = inverseJacobianValues[cellIndex * 9 + 0];
+                const double Jxy = inverseJacobianValues[cellIndex * 9 + 1];
+                const double Jxz = inverseJacobianValues[cellIndex * 9 + 2];
+                const double Jyx = inverseJacobianValues[cellIndex * 9 + 3];
+                const double Jyy = inverseJacobianValues[cellIndex * 9 + 4];
+                const double Jyz = inverseJacobianValues[cellIndex * 9 + 5];
+                const double Jzx = inverseJacobianValues[cellIndex * 9 + 6];
+                const double Jzy = inverseJacobianValues[cellIndex * 9 + 7];
+                const double Jzz = inverseJacobianValues[cellIndex * 9 + 8];
+
+                gradShapeXI = gradShapeXIRef * Jxx + gradShapeYIRef * Jxy +
+                              gradShapeZIRef * Jxz;
+                gradShapeYI = gradShapeXIRef * Jyx + gradShapeYIRef * Jyy +
+                              gradShapeZIRef * Jyz;
+                gradShapeZI = gradShapeXIRef * Jzx + gradShapeYIRef * Jzy +
+                              gradShapeZIRef * Jzz;
+                gradShapeXJ = gradShapeXJRef * Jxx + gradShapeYJRef * Jxy +
+                              gradShapeZJRef * Jxz;
+                gradShapeYJ = gradShapeXJRef * Jyx + gradShapeYJRef * Jyy +
+                              gradShapeZJRef * Jyz;
+                gradShapeZJ = gradShapeXJRef * Jzx + gradShapeYJRef * Jzy +
+                              gradShapeZJRef * Jzz;
+              }
+            else if (areAllCellsAffineOrCartesianFlag == 2)
+              {
+                const double Jxx = inverseJacobianValues[cellIndex * 3 + 0];
+                const double Jyy = inverseJacobianValues[cellIndex * 3 + 1];
+                const double Jzz = inverseJacobianValues[cellIndex * 3 + 2];
+
+                gradShapeXI = gradShapeXIRef * Jxx;
+                gradShapeYI = gradShapeYIRef * Jyy;
+                gradShapeZI = gradShapeZIRef * Jzz;
+                gradShapeXJ = gradShapeXJRef * Jxx;
+                gradShapeYJ = gradShapeYJRef * Jyy;
+                gradShapeZJ = gradShapeZJRef * Jzz;
+              }
 
 
             val +=
@@ -462,12 +659,12 @@ namespace
                           const unsigned int numQuadPoints,
                           const double *     shapeFunctionValues,
                           const double *     shapeFunctionValuesTransposed,
-                          const double *shapeFunctionGradientValuesXTransposed,
-                          const double *shapeFunctionGradientValuesYTransposed,
-                          const double *shapeFunctionGradientValuesZTransposed,
-                          const double *vEffPrimeJxW,
-                          const double *JxW,
-                          double *      cellHamiltonianPrimeMatrixFlattened)
+                          const double *     shapeFunctionGradientValues,
+                          const double *     inverseJacobianValues,
+                          const int          areAllCellsAffineOrCartesianFlag,
+                          const double *     vEffPrimeJxW,
+                          const double *     JxW,
+                          double *cellHamiltonianPrimeMatrixFlattened)
   {
     const unsigned int globalThreadId = blockIdx.x * blockDim.x + threadIdx.x;
 
@@ -506,9 +703,9 @@ namespace
     const unsigned int                 numQuadPoints,
     const double *                     shapeFunctionValues,
     const double *                     shapeFunctionValuesTransposed,
-    const double *                     shapeFunctionGradientValuesXTransposed,
-    const double *                     shapeFunctionGradientValuesYTransposed,
-    const double *                     shapeFunctionGradientValuesZTransposed,
+    const double *                     shapeFunctionGradientValues,
+    const double *                     inverseJacobianValues,
+    const int                          areAllCellsAffineOrCartesianFlag,
     const double *                     vEffPrimeJxW,
     const double *                     JxW,
     dftfe::utils::deviceDoubleComplex *cellHamiltonianPrimeMatrixFlattened)
@@ -553,9 +750,9 @@ namespace
     const unsigned int                 numQuadPoints,
     const double *                     shapeFunctionValues,
     const double *                     shapeFunctionValuesTransposed,
-    const double *                     shapeFunctionGradientValuesXTransposed,
-    const double *                     shapeFunctionGradientValuesYTransposed,
-    const double *                     shapeFunctionGradientValuesZTransposed,
+    const double *                     shapeFunctionGradientValues,
+    const double *                     inverseJacobianValues,
+    const int                          areAllCellsAffineOrCartesianFlag,
     const double *                     vEffPrimeJxW,
     const double *                     JxW,
     const double *                     derExcPrimeWithSigmaTimesGradRhoJxW,
@@ -585,37 +782,105 @@ namespace
             const double shapeJ =
               shapeFunctionValuesTransposed[q * numDofsPerCell + cellDofIndexJ];
 
-            const double gradShapeXI =
-              shapeFunctionGradientValuesXTransposed[cellIndex * numQuadPoints *
-                                                       numDofsPerCell +
-                                                     numDofsPerCell * q +
-                                                     cellDofIndexI];
-            const double gradShapeYI =
-              shapeFunctionGradientValuesYTransposed[cellIndex * numQuadPoints *
-                                                       numDofsPerCell +
-                                                     numDofsPerCell * q +
-                                                     cellDofIndexI];
-            const double gradShapeZI =
-              shapeFunctionGradientValuesZTransposed[cellIndex * numQuadPoints *
-                                                       numDofsPerCell +
-                                                     numDofsPerCell * q +
-                                                     cellDofIndexI];
-
-            const double gradShapeXJ =
-              shapeFunctionGradientValuesXTransposed[cellIndex * numQuadPoints *
-                                                       numDofsPerCell +
-                                                     numDofsPerCell * q +
-                                                     cellDofIndexJ];
-            const double gradShapeYJ =
-              shapeFunctionGradientValuesYTransposed[cellIndex * numQuadPoints *
-                                                       numDofsPerCell +
-                                                     numDofsPerCell * q +
-                                                     cellDofIndexJ];
-            const double gradShapeZJ =
-              shapeFunctionGradientValuesZTransposed[cellIndex * numQuadPoints *
-                                                       numDofsPerCell +
-                                                     numDofsPerCell * q +
-                                                     cellDofIndexJ];
+            double gradShapeXI, gradShapeXJ, gradShapeYI, gradShapeYJ,
+              gradShapeZI, gradShapeZJ;
+            const double gradShapeXIRef =
+              shapeFunctionGradientValues[numDofsPerCell * q + cellDofIndexI];
+            const double gradShapeYIRef =
+              shapeFunctionGradientValues[numDofsPerCell * numQuadPoints +
+                                          numDofsPerCell * q + cellDofIndexI];
+            const double gradShapeZIRef =
+              shapeFunctionGradientValues[numDofsPerCell * numQuadPoints * 2 +
+                                          numDofsPerCell * q + cellDofIndexI];
+            const double gradShapeXJRef =
+              shapeFunctionGradientValues[numDofsPerCell * q + cellDofIndexJ];
+            const double gradShapeYJRef =
+              shapeFunctionGradientValues[numDofsPerCell * numQuadPoints +
+                                          numDofsPerCell * q + cellDofIndexJ];
+            const double gradShapeZJRef =
+              shapeFunctionGradientValues[numDofsPerCell * numQuadPoints * 2 +
+                                          numDofsPerCell * q + cellDofIndexJ];
+            if (areAllCellsAffineOrCartesianFlag == 0)
+              {
+                const double Jxx =
+                  inverseJacobianValues[cellIndex * numQuadPoints * 9 + q * 9 +
+                                        0];
+                const double Jxy =
+                  inverseJacobianValues[cellIndex * numQuadPoints * 9 + q * 9 +
+                                        1];
+                const double Jxz =
+                  inverseJacobianValues[cellIndex * numQuadPoints * 9 + q * 9 +
+                                        2];
+                const double Jyx =
+                  inverseJacobianValues[cellIndex * numQuadPoints * 9 + q * 9 +
+                                        3];
+                const double Jyy =
+                  inverseJacobianValues[cellIndex * numQuadPoints * 9 + q * 9 +
+                                        4];
+                const double Jyz =
+                  inverseJacobianValues[cellIndex * numQuadPoints * 9 + q * 9 +
+                                        5];
+                const double Jzx =
+                  inverseJacobianValues[cellIndex * numQuadPoints * 9 + q * 9 +
+                                        6];
+                const double Jzy =
+                  inverseJacobianValues[cellIndex * numQuadPoints * 9 + q * 9 +
+                                        7];
+                const double Jzz =
+                  inverseJacobianValues[cellIndex * numQuadPoints * 9 + q * 9 +
+                                        8];
+
+                gradShapeXI = gradShapeXIRef * Jxx + gradShapeYIRef * Jxy +
+                              gradShapeZIRef * Jxz;
+                gradShapeYI = gradShapeXIRef * Jyx + gradShapeYIRef * Jyy +
+                              gradShapeZIRef * Jyz;
+                gradShapeZI = gradShapeXIRef * Jzx + gradShapeYIRef * Jzy +
+                              gradShapeZIRef * Jzz;
+                gradShapeXJ = gradShapeXJRef * Jxx + gradShapeYJRef * Jxy +
+                              gradShapeZJRef * Jxz;
+                gradShapeYJ = gradShapeXJRef * Jyx + gradShapeYJRef * Jyy +
+                              gradShapeZJRef * Jyz;
+                gradShapeZJ = gradShapeXJRef * Jzx + gradShapeYJRef * Jzy +
+                              gradShapeZJRef * Jzz;
+              }
+            else if (areAllCellsAffineOrCartesianFlag == 1)
+              {
+                const double Jxx = inverseJacobianValues[cellIndex * 9 + 0];
+                const double Jxy = inverseJacobianValues[cellIndex * 9 + 1];
+                const double Jxz = inverseJacobianValues[cellIndex * 9 + 2];
+                const double Jyx = inverseJacobianValues[cellIndex * 9 + 3];
+                const double Jyy = inverseJacobianValues[cellIndex * 9 + 4];
+                const double Jyz = inverseJacobianValues[cellIndex * 9 + 5];
+                const double Jzx = inverseJacobianValues[cellIndex * 9 + 6];
+                const double Jzy = inverseJacobianValues[cellIndex * 9 + 7];
+                const double Jzz = inverseJacobianValues[cellIndex * 9 + 8];
+
+                gradShapeXI = gradShapeXIRef * Jxx + gradShapeYIRef * Jxy +
+                              gradShapeZIRef * Jxz;
+                gradShapeYI = gradShapeXIRef * Jyx + gradShapeYIRef * Jyy +
+                              gradShapeZIRef * Jyz;
+                gradShapeZI = gradShapeXIRef * Jzx + gradShapeYIRef * Jzy +
+                              gradShapeZIRef * Jzz;
+                gradShapeXJ = gradShapeXJRef * Jxx + gradShapeYJRef * Jxy +
+                              gradShapeZJRef * Jxz;
+                gradShapeYJ = gradShapeXJRef * Jyx + gradShapeYJRef * Jyy +
+                              gradShapeZJRef * Jyz;
+                gradShapeZJ = gradShapeXJRef * Jzx + gradShapeYJRef * Jzy +
+                              gradShapeZJRef * Jzz;
+              }
+            else if (areAllCellsAffineOrCartesianFlag == 2)
+              {
+                const double Jxx = inverseJacobianValues[cellIndex * 3 + 0];
+                const double Jyy = inverseJacobianValues[cellIndex * 3 + 1];
+                const double Jzz = inverseJacobianValues[cellIndex * 3 + 2];
+
+                gradShapeXI = gradShapeXIRef * Jxx;
+                gradShapeYI = gradShapeYIRef * Jyy;
+                gradShapeZI = gradShapeZIRef * Jzz;
+                gradShapeXJ = gradShapeXJRef * Jxx;
+                gradShapeYJ = gradShapeYJRef * Jyy;
+                gradShapeZJ = gradShapeZJRef * Jzz;
+              }
 
 
             val +=
@@ -647,9 +912,9 @@ namespace
     const unsigned int numQuadPoints,
     const double *     shapeFunctionValues,
     const double *     shapeFunctionValuesTransposed,
-    const double *     shapeFunctionGradientValuesXTransposed,
-    const double *     shapeFunctionGradientValuesYTransposed,
-    const double *     shapeFunctionGradientValuesZTransposed,
+    const double *     shapeFunctionGradientValues,
+    const double *     inverseJacobianValues,
+    const int          areAllCellsAffineOrCartesianFlag,
     const double *     vEffPrimeJxW,
     const double *     JxW,
     const double *     derExcPrimeWithSigmaTimesGradRhoJxW,
@@ -679,37 +944,105 @@ namespace
             const double shapeJ =
               shapeFunctionValuesTransposed[q * numDofsPerCell + cellDofIndexJ];
 
-            const double gradShapeXI =
-              shapeFunctionGradientValuesXTransposed[cellIndex * numQuadPoints *
-                                                       numDofsPerCell +
-                                                     numDofsPerCell * q +
-                                                     cellDofIndexI];
-            const double gradShapeYI =
-              shapeFunctionGradientValuesYTransposed[cellIndex * numQuadPoints *
-                                                       numDofsPerCell +
-                                                     numDofsPerCell * q +
-                                                     cellDofIndexI];
-            const double gradShapeZI =
-              shapeFunctionGradientValuesZTransposed[cellIndex * numQuadPoints *
-                                                       numDofsPerCell +
-                                                     numDofsPerCell * q +
-                                                     cellDofIndexI];
-
-            const double gradShapeXJ =
-              shapeFunctionGradientValuesXTransposed[cellIndex * numQuadPoints *
-                                                       numDofsPerCell +
-                                                     numDofsPerCell * q +
-                                                     cellDofIndexJ];
-            const double gradShapeYJ =
-              shapeFunctionGradientValuesYTransposed[cellIndex * numQuadPoints *
-                                                       numDofsPerCell +
-                                                     numDofsPerCell * q +
-                                                     cellDofIndexJ];
-            const double gradShapeZJ =
-              shapeFunctionGradientValuesZTransposed[cellIndex * numQuadPoints *
-                                                       numDofsPerCell +
-                                                     numDofsPerCell * q +
-                                                     cellDofIndexJ];
+            double gradShapeXI, gradShapeXJ, gradShapeYI, gradShapeYJ,
+              gradShapeZI, gradShapeZJ;
+            const double gradShapeXIRef =
+              shapeFunctionGradientValues[numDofsPerCell * q + cellDofIndexI];
+            const double gradShapeYIRef =
+              shapeFunctionGradientValues[numDofsPerCell * numQuadPoints +
+                                          numDofsPerCell * q + cellDofIndexI];
+            const double gradShapeZIRef =
+              shapeFunctionGradientValues[numDofsPerCell * numQuadPoints * 2 +
+                                          numDofsPerCell * q + cellDofIndexI];
+            const double gradShapeXJRef =
+              shapeFunctionGradientValues[numDofsPerCell * q + cellDofIndexJ];
+            const double gradShapeYJRef =
+              shapeFunctionGradientValues[numDofsPerCell * numQuadPoints +
+                                          numDofsPerCell * q + cellDofIndexJ];
+            const double gradShapeZJRef =
+              shapeFunctionGradientValues[numDofsPerCell * numQuadPoints * 2 +
+                                          numDofsPerCell * q + cellDofIndexJ];
+            if (areAllCellsAffineOrCartesianFlag == 0)
+              {
+                const double Jxx =
+                  inverseJacobianValues[cellIndex * numQuadPoints * 9 + q * 9 +
+                                        0];
+                const double Jxy =
+                  inverseJacobianValues[cellIndex * numQuadPoints * 9 + q * 9 +
+                                        1];
+                const double Jxz =
+                  inverseJacobianValues[cellIndex * numQuadPoints * 9 + q * 9 +
+                                        2];
+                const double Jyx =
+                  inverseJacobianValues[cellIndex * numQuadPoints * 9 + q * 9 +
+                                        3];
+                const double Jyy =
+                  inverseJacobianValues[cellIndex * numQuadPoints * 9 + q * 9 +
+                                        4];
+                const double Jyz =
+                  inverseJacobianValues[cellIndex * numQuadPoints * 9 + q * 9 +
+                                        5];
+                const double Jzx =
+                  inverseJacobianValues[cellIndex * numQuadPoints * 9 + q * 9 +
+                                        6];
+                const double Jzy =
+                  inverseJacobianValues[cellIndex * numQuadPoints * 9 + q * 9 +
+                                        7];
+                const double Jzz =
+                  inverseJacobianValues[cellIndex * numQuadPoints * 9 + q * 9 +
+                                        8];
+
+                gradShapeXI = gradShapeXIRef * Jxx + gradShapeYIRef * Jxy +
+                              gradShapeZIRef * Jxz;
+                gradShapeYI = gradShapeXIRef * Jyx + gradShapeYIRef * Jyy +
+                              gradShapeZIRef * Jyz;
+                gradShapeZI = gradShapeXIRef * Jzx + gradShapeYIRef * Jzy +
+                              gradShapeZIRef * Jzz;
+                gradShapeXJ = gradShapeXJRef * Jxx + gradShapeYJRef * Jxy +
+                              gradShapeZJRef * Jxz;
+                gradShapeYJ = gradShapeXJRef * Jyx + gradShapeYJRef * Jyy +
+                              gradShapeZJRef * Jyz;
+                gradShapeZJ = gradShapeXJRef * Jzx + gradShapeYJRef * Jzy +
+                              gradShapeZJRef * Jzz;
+              }
+            else if (areAllCellsAffineOrCartesianFlag == 1)
+              {
+                const double Jxx = inverseJacobianValues[cellIndex * 9 + 0];
+                const double Jxy = inverseJacobianValues[cellIndex * 9 + 1];
+                const double Jxz = inverseJacobianValues[cellIndex * 9 + 2];
+                const double Jyx = inverseJacobianValues[cellIndex * 9 + 3];
+                const double Jyy = inverseJacobianValues[cellIndex * 9 + 4];
+                const double Jyz = inverseJacobianValues[cellIndex * 9 + 5];
+                const double Jzx = inverseJacobianValues[cellIndex * 9 + 6];
+                const double Jzy = inverseJacobianValues[cellIndex * 9 + 7];
+                const double Jzz = inverseJacobianValues[cellIndex * 9 + 8];
+
+                gradShapeXI = gradShapeXIRef * Jxx + gradShapeYIRef * Jxy +
+                              gradShapeZIRef * Jxz;
+                gradShapeYI = gradShapeXIRef * Jyx + gradShapeYIRef * Jyy +
+                              gradShapeZIRef * Jyz;
+                gradShapeZI = gradShapeXIRef * Jzx + gradShapeYIRef * Jzy +
+                              gradShapeZIRef * Jzz;
+                gradShapeXJ = gradShapeXJRef * Jxx + gradShapeYJRef * Jxy +
+                              gradShapeZJRef * Jxz;
+                gradShapeYJ = gradShapeXJRef * Jyx + gradShapeYJRef * Jyy +
+                              gradShapeZJRef * Jyz;
+                gradShapeZJ = gradShapeXJRef * Jzx + gradShapeYJRef * Jzy +
+                              gradShapeZJRef * Jzz;
+              }
+            else if (areAllCellsAffineOrCartesianFlag == 2)
+              {
+                const double Jxx = inverseJacobianValues[cellIndex * 3 + 0];
+                const double Jyy = inverseJacobianValues[cellIndex * 3 + 1];
+                const double Jzz = inverseJacobianValues[cellIndex * 3 + 2];
+
+                gradShapeXI = gradShapeXIRef * Jxx;
+                gradShapeYI = gradShapeYIRef * Jyy;
+                gradShapeZI = gradShapeZIRef * Jzz;
+                gradShapeXJ = gradShapeXJRef * Jxx;
+                gradShapeYJ = gradShapeYJRef * Jyy;
+                gradShapeZJ = gradShapeZJRef * Jzz;
+              }
 
 
             val +=
@@ -764,17 +1097,18 @@ kohnShamDFTOperatorDeviceClass<FEOrder, FEOrderElectro>::
       !d_isStiffnessMatrixExternalPotCorrComputed &&
       !onlyHPrimePartForFirstOrderDensityMatResponse)
     {
+      basisOperationsPtrDevice->reinit(0, 0, dftPtr->d_lpspQuadratureId);
 #ifdef DFTFE_WITH_DEVICE_LANG_CUDA
       hamMatrixExtPotCorr<<<(d_numLocallyOwnedCells * d_numberNodesPerElement *
                                d_numberNodesPerElement +
                              (dftfe::utils::DEVICE_BLOCK_SIZE - 1)) /
                               dftfe::utils::DEVICE_BLOCK_SIZE,
                             dftfe::utils::DEVICE_BLOCK_SIZE>>>(
-        d_numLocallyOwnedCells,
-        d_numberNodesPerElement,
-        d_numQuadPointsLpsp,
-        d_shapeFunctionValueLpspDevice.begin(),
-        d_shapeFunctionValueTransposedLpspDevice.begin(),
+        basisOperationsPtrDevice->nCells(),
+        basisOperationsPtrDevice->nDofsPerCell(),
+        basisOperationsPtrDevice->nQuadsPerCell(),
+        basisOperationsPtrDevice->shapeFunctionBasisData(true).data(),
+        basisOperationsPtrDevice->shapeFunctionBasisData(false).data(),
         d_vEffExternalPotCorrJxWDevice.begin(),
         d_cellHamiltonianMatrixExternalPotCorrFlattenedDevice.begin());
 #elif DFTFE_WITH_DEVICE_LANG_HIP
@@ -787,18 +1121,18 @@ kohnShamDFTOperatorDeviceClass<FEOrder, FEOrderElectro>::
         dftfe::utils::DEVICE_BLOCK_SIZE,
         0,
         0,
-        d_numLocallyOwnedCells,
-        d_numberNodesPerElement,
-        d_numQuadPointsLpsp,
-        d_shapeFunctionValueLpspDevice.begin(),
-        d_shapeFunctionValueTransposedLpspDevice.begin(),
+        basisOperationsPtrDevice->nCells(),
+        basisOperationsPtrDevice->nDofsPerCell(),
+        basisOperationsPtrDevice->nQuadsPerCell(),
+        basisOperationsPtrDevice->shapeFunctionBasisData(true).data(),
+        basisOperationsPtrDevice->shapeFunctionBasisData(false).data(),
         d_vEffExternalPotCorrJxWDevice.begin(),
         d_cellHamiltonianMatrixExternalPotCorrFlattenedDevice.begin());
 #endif
 
       d_isStiffnessMatrixExternalPotCorrComputed = true;
     }
-
+  basisOperationsPtrDevice->reinit(0, 0, dftPtr->d_densityQuadratureId);
   if (onlyHPrimePartForFirstOrderDensityMatResponse)
     {
       if (dftPtr->d_excManagerPtr->getDensityBasedFamilyType() ==
@@ -810,14 +1144,14 @@ kohnShamDFTOperatorDeviceClass<FEOrder, FEOrderElectro>::
            (dftfe::utils::DEVICE_BLOCK_SIZE - 1)) /
             dftfe::utils::DEVICE_BLOCK_SIZE,
           dftfe::utils::DEVICE_BLOCK_SIZE>>>(
-          d_numLocallyOwnedCells,
-          d_numberNodesPerElement,
-          d_numQuadPoints,
-          d_shapeFunctionValueDevice.begin(),
-          d_shapeFunctionValueTransposedDevice.begin(),
-          d_shapeFunctionGradientValueXTransposedDevice.begin(),
-          d_shapeFunctionGradientValueYTransposedDevice.begin(),
-          d_shapeFunctionGradientValueZTransposedDevice.begin(),
+          basisOperationsPtrDevice->nCells(),
+          basisOperationsPtrDevice->nDofsPerCell(),
+          basisOperationsPtrDevice->nQuadsPerCell(),
+          basisOperationsPtrDevice->shapeFunctionBasisData(true).data(),
+          basisOperationsPtrDevice->shapeFunctionBasisData(false).data(),
+          basisOperationsPtrDevice->shapeFunctionGradientBasisData().data(),
+          basisOperationsPtrDevice->inverseJacobiansBasisData().data(),
+          basisOperationsPtrDevice->cellsTypeFlag(),
           d_vEffJxWDevice.begin(),
           d_cellJxWValuesDevice.begin(),
           d_derExcWithSigmaTimesGradRhoJxWDevice.begin(),
@@ -835,14 +1169,14 @@ kohnShamDFTOperatorDeviceClass<FEOrder, FEOrderElectro>::
           dftfe::utils::DEVICE_BLOCK_SIZE,
           0,
           0,
-          d_numLocallyOwnedCells,
-          d_numberNodesPerElement,
-          d_numQuadPoints,
-          d_shapeFunctionValueDevice.begin(),
-          d_shapeFunctionValueTransposedDevice.begin(),
-          d_shapeFunctionGradientValueXTransposedDevice.begin(),
-          d_shapeFunctionGradientValueYTransposedDevice.begin(),
-          d_shapeFunctionGradientValueZTransposedDevice.begin(),
+          basisOperationsPtrDevice->nCells(),
+          basisOperationsPtrDevice->nDofsPerCell(),
+          basisOperationsPtrDevice->nQuadsPerCell(),
+          basisOperationsPtrDevice->shapeFunctionBasisData(true).data(),
+          basisOperationsPtrDevice->shapeFunctionBasisData(false).data(),
+          basisOperationsPtrDevice->shapeFunctionGradientBasisData().data(),
+          basisOperationsPtrDevice->inverseJacobiansBasisData().data(),
+          basisOperationsPtrDevice->cellsTypeFlag(),
           d_vEffJxWDevice.begin(),
           d_cellJxWValuesDevice.begin(),
           d_derExcWithSigmaTimesGradRhoJxWDevice.begin(),
@@ -860,14 +1194,14 @@ kohnShamDFTOperatorDeviceClass<FEOrder, FEOrderElectro>::
                                    (dftfe::utils::DEVICE_BLOCK_SIZE - 1)) /
                                     dftfe::utils::DEVICE_BLOCK_SIZE,
                                   dftfe::utils::DEVICE_BLOCK_SIZE>>>(
-          d_numLocallyOwnedCells,
-          d_numberNodesPerElement,
-          d_numQuadPoints,
-          d_shapeFunctionValueDevice.begin(),
-          d_shapeFunctionValueTransposedDevice.begin(),
-          d_shapeFunctionGradientValueXTransposedDevice.begin(),
-          d_shapeFunctionGradientValueYTransposedDevice.begin(),
-          d_shapeFunctionGradientValueZTransposedDevice.begin(),
+          basisOperationsPtrDevice->nCells(),
+          basisOperationsPtrDevice->nDofsPerCell(),
+          basisOperationsPtrDevice->nQuadsPerCell(),
+          basisOperationsPtrDevice->shapeFunctionBasisData(true).data(),
+          basisOperationsPtrDevice->shapeFunctionBasisData(false).data(),
+          basisOperationsPtrDevice->shapeFunctionGradientBasisData().data(),
+          basisOperationsPtrDevice->inverseJacobiansBasisData().data(),
+          basisOperationsPtrDevice->cellsTypeFlag(),
           d_vEffJxWDevice.begin(),
           d_cellJxWValuesDevice.begin(),
           dftfe::utils::makeDataTypeDeviceCompatible(
@@ -884,14 +1218,14 @@ kohnShamDFTOperatorDeviceClass<FEOrder, FEOrderElectro>::
           dftfe::utils::DEVICE_BLOCK_SIZE,
           0,
           0,
-          d_numLocallyOwnedCells,
-          d_numberNodesPerElement,
-          d_numQuadPoints,
-          d_shapeFunctionValueDevice.begin(),
-          d_shapeFunctionValueTransposedDevice.begin(),
-          d_shapeFunctionGradientValueXTransposedDevice.begin(),
-          d_shapeFunctionGradientValueYTransposedDevice.begin(),
-          d_shapeFunctionGradientValueZTransposedDevice.begin(),
+          basisOperationsPtrDevice->nCells(),
+          basisOperationsPtrDevice->nDofsPerCell(),
+          basisOperationsPtrDevice->nQuadsPerCell(),
+          basisOperationsPtrDevice->shapeFunctionBasisData(true).data(),
+          basisOperationsPtrDevice->shapeFunctionBasisData(false).data(),
+          basisOperationsPtrDevice->shapeFunctionGradientBasisData().data(),
+          basisOperationsPtrDevice->inverseJacobiansBasisData().data(),
+          basisOperationsPtrDevice->cellsTypeFlag(),
           d_vEffJxWDevice.begin(),
           d_cellJxWValuesDevice.begin(),
           dftfe::utils::makeDataTypeDeviceCompatible(
@@ -911,17 +1245,17 @@ kohnShamDFTOperatorDeviceClass<FEOrder, FEOrderElectro>::
                                     (dftfe::utils::DEVICE_BLOCK_SIZE - 1)) /
                                      dftfe::utils::DEVICE_BLOCK_SIZE,
                                    dftfe::utils::DEVICE_BLOCK_SIZE>>>(
-          d_numLocallyOwnedCells,
-          d_numberNodesPerElement,
-          d_numQuadPoints,
+          basisOperationsPtrDevice->nCells(),
+          basisOperationsPtrDevice->nDofsPerCell(),
+          basisOperationsPtrDevice->nQuadsPerCell(),
           spinIndex,
           (1 + dftPtr->d_dftParamsPtr->spinPolarized),
           dftPtr->d_kPointWeights.size(),
-          d_shapeFunctionValueDevice.begin(),
-          d_shapeFunctionValueTransposedDevice.begin(),
-          d_shapeFunctionGradientValueXTransposedDevice.begin(),
-          d_shapeFunctionGradientValueYTransposedDevice.begin(),
-          d_shapeFunctionGradientValueZTransposedDevice.begin(),
+          basisOperationsPtrDevice->shapeFunctionBasisData(true).data(),
+          basisOperationsPtrDevice->shapeFunctionBasisData(false).data(),
+          basisOperationsPtrDevice->shapeFunctionGradientBasisData().data(),
+          basisOperationsPtrDevice->inverseJacobiansBasisData().data(),
+          basisOperationsPtrDevice->cellsTypeFlag(),
           d_cellShapeFunctionGradientIntegralFlattenedDevice.begin(),
           d_vEffJxWDevice.begin(),
           d_cellJxWValuesDevice.begin(),
@@ -943,17 +1277,17 @@ kohnShamDFTOperatorDeviceClass<FEOrder, FEOrderElectro>::
           dftfe::utils::DEVICE_BLOCK_SIZE,
           0,
           0,
-          d_numLocallyOwnedCells,
-          d_numberNodesPerElement,
-          d_numQuadPoints,
+          basisOperationsPtrDevice->nCells(),
+          basisOperationsPtrDevice->nDofsPerCell(),
+          basisOperationsPtrDevice->nQuadsPerCell(),
           spinIndex,
           (1 + dftPtr->d_dftParamsPtr->spinPolarized),
           dftPtr->d_kPointWeights.size(),
-          d_shapeFunctionValueDevice.begin(),
-          d_shapeFunctionValueTransposedDevice.begin(),
-          d_shapeFunctionGradientValueXTransposedDevice.begin(),
-          d_shapeFunctionGradientValueYTransposedDevice.begin(),
-          d_shapeFunctionGradientValueZTransposedDevice.begin(),
+          basisOperationsPtrDevice->shapeFunctionBasisData(true).data(),
+          basisOperationsPtrDevice->shapeFunctionBasisData(false).data(),
+          basisOperationsPtrDevice->shapeFunctionGradientBasisData().data(),
+          basisOperationsPtrDevice->inverseJacobiansBasisData().data(),
+          basisOperationsPtrDevice->cellsTypeFlag(),
           d_cellShapeFunctionGradientIntegralFlattenedDevice.begin(),
           d_vEffJxWDevice.begin(),
           d_cellJxWValuesDevice.begin(),
@@ -974,17 +1308,17 @@ kohnShamDFTOperatorDeviceClass<FEOrder, FEOrderElectro>::
                               (dftfe::utils::DEVICE_BLOCK_SIZE - 1)) /
                                dftfe::utils::DEVICE_BLOCK_SIZE,
                              dftfe::utils::DEVICE_BLOCK_SIZE>>>(
-          d_numLocallyOwnedCells,
-          d_numberNodesPerElement,
-          d_numQuadPoints,
+          basisOperationsPtrDevice->nCells(),
+          basisOperationsPtrDevice->nDofsPerCell(),
+          basisOperationsPtrDevice->nQuadsPerCell(),
           spinIndex,
           (1 + dftPtr->d_dftParamsPtr->spinPolarized),
           dftPtr->d_kPointWeights.size(),
-          d_shapeFunctionValueDevice.begin(),
-          d_shapeFunctionValueTransposedDevice.begin(),
-          d_shapeFunctionGradientValueXTransposedDevice.begin(),
-          d_shapeFunctionGradientValueYTransposedDevice.begin(),
-          d_shapeFunctionGradientValueZTransposedDevice.begin(),
+          basisOperationsPtrDevice->shapeFunctionBasisData(true).data(),
+          basisOperationsPtrDevice->shapeFunctionBasisData(false).data(),
+          basisOperationsPtrDevice->shapeFunctionGradientBasisData().data(),
+          basisOperationsPtrDevice->inverseJacobiansBasisData().data(),
+          basisOperationsPtrDevice->cellsTypeFlag(),
           d_cellShapeFunctionGradientIntegralFlattenedDevice.begin(),
           d_vEffJxWDevice.begin(),
           d_cellJxWValuesDevice.begin(),
@@ -1005,17 +1339,17 @@ kohnShamDFTOperatorDeviceClass<FEOrder, FEOrderElectro>::
           dftfe::utils::DEVICE_BLOCK_SIZE,
           0,
           0,
-          d_numLocallyOwnedCells,
-          d_numberNodesPerElement,
-          d_numQuadPoints,
+          basisOperationsPtrDevice->nCells(),
+          basisOperationsPtrDevice->nDofsPerCell(),
+          basisOperationsPtrDevice->nQuadsPerCell(),
           spinIndex,
           (1 + dftPtr->d_dftParamsPtr->spinPolarized),
           dftPtr->d_kPointWeights.size(),
-          d_shapeFunctionValueDevice.begin(),
-          d_shapeFunctionValueTransposedDevice.begin(),
-          d_shapeFunctionGradientValueXTransposedDevice.begin(),
-          d_shapeFunctionGradientValueYTransposedDevice.begin(),
-          d_shapeFunctionGradientValueZTransposedDevice.begin(),
+          basisOperationsPtrDevice->shapeFunctionBasisData(true).data(),
+          basisOperationsPtrDevice->shapeFunctionBasisData(false).data(),
+          basisOperationsPtrDevice->shapeFunctionGradientBasisData().data(),
+          basisOperationsPtrDevice->inverseJacobiansBasisData().data(),
+          basisOperationsPtrDevice->cellsTypeFlag(),
           d_cellShapeFunctionGradientIntegralFlattenedDevice.begin(),
           d_vEffJxWDevice.begin(),
           d_cellJxWValuesDevice.begin(),
diff --git a/src/dftOperator/kohnShamDFTOperator.cc b/src/dftOperator/kohnShamDFTOperator.cc
index 7d3fb765d..fed3b321f 100644
--- a/src/dftOperator/kohnShamDFTOperator.cc
+++ b/src/dftOperator/kohnShamDFTOperator.cc
@@ -170,11 +170,6 @@ namespace dftfe
       d_normalCellIdToMacroCellIdMap,
       d_macroCellIdToNormalCellIdMap,
       d_FullflattenedArrayCellLocalProcIndexIdMap);
-
-    getOverloadedConstraintMatrix()->precomputeMaps(
-      dftPtr->matrix_free_data.get_vector_partitioner(),
-      flattenedArray.get_partitioner(),
-      numberWaveFunctions);
   }
 
   template <unsigned int FEOrder, unsigned int FEOrderElectro>
@@ -217,9 +212,6 @@ namespace dftfe
       d_normalCellIdToMacroCellIdMap,
       d_macroCellIdToNormalCellIdMap,
       d_FullflattenedArrayCellLocalProcIndexIdMap);
-
-    getOverloadedConstraintMatrix()->precomputeMaps(
-      flattenedArray.getMPIPatternP2P(), numberWaveFunctions);
   }
 
   template <unsigned int FEOrder, unsigned int FEOrderElectro>
@@ -1239,14 +1231,14 @@ namespace dftfe
   template <unsigned int FEOrder, unsigned int FEOrderElectro>
   void
   kohnShamDFTOperatorClass<FEOrder, FEOrderElectro>::XtHX(
-    const std::vector<dataTypes::number> &X,
-    const unsigned int                    numberWaveFunctions,
-    std::vector<dataTypes::number> &      ProjHam)
+    const dataTypes::number *       X,
+    const unsigned int              numberWaveFunctions,
+    const unsigned int              numberDofs,
+    std::vector<dataTypes::number> &ProjHam)
   {
     //
     // Get access to number of locally owned nodes on the current processor
     //
-    const unsigned int numberDofs = X.size() / numberWaveFunctions;
 
     //
     // Resize ProjHam
@@ -1326,8 +1318,9 @@ namespace dftfe
   template <unsigned int FEOrder, unsigned int FEOrderElectro>
   void
   kohnShamDFTOperatorClass<FEOrder, FEOrderElectro>::XtHX(
-    const std::vector<dataTypes::number> &           X,
+    const dataTypes::number *                        X,
     const unsigned int                               numberWaveFunctions,
+    const unsigned int                               numberDofs,
     const std::shared_ptr<const dftfe::ProcessGrid> &processGrid,
     dftfe::ScaLAPACKMatrix<dataTypes::number> &      projHamPar,
     const bool onlyHPrimePartForFirstOrderDensityMatResponse)
@@ -1335,7 +1328,6 @@ namespace dftfe
     //
     // Get access to number of locally owned nodes on the current processor
     //
-    const unsigned int numberDofs = X.size() / numberWaveFunctions;
 
     // create temporary arrays XBlock,Hx
     distributedCPUMultiVec<dataTypes::number> XBlock, HXBlock;
@@ -1500,9 +1492,10 @@ namespace dftfe
   template <unsigned int FEOrder, unsigned int FEOrderElectro>
   void
   kohnShamDFTOperatorClass<FEOrder, FEOrderElectro>::XtHXMixedPrec(
-    const std::vector<dataTypes::number> &           X,
+    const dataTypes::number *                        X,
     const unsigned int                               N,
     const unsigned int                               Ncore,
+    const unsigned int                               numberDofs,
     const std::shared_ptr<const dftfe::ProcessGrid> &processGrid,
     dftfe::ScaLAPACKMatrix<dataTypes::number> &      projHamPar,
     const bool onlyHPrimePartForFirstOrderDensityMatResponse)
@@ -1510,7 +1503,6 @@ namespace dftfe
     //
     // Get access to number of locally owned nodes on the current processor
     //
-    const unsigned int numberDofs = X.size() / N;
 
     // create temporary arrays XBlock,Hx
     distributedCPUMultiVec<dataTypes::number> XBlock, HXBlock;
@@ -1558,7 +1550,7 @@ namespace dftfe
 
     std::vector<dataTypes::numberFP32> HXBlockSinglePrec;
 
-    std::vector<dataTypes::numberFP32> XSinglePrec(&X[0], &X[0] + X.size());
+    std::vector<dataTypes::numberFP32> XSinglePrec(X, X + numberDofs * N);
 
     if (dftPtr->d_dftParamsPtr->verbosity >= 4)
       dftUtils::printCurrentMemoryUsage(
diff --git a/src/dftOperator/kohnShamDFTOperatorDevice.cc b/src/dftOperator/kohnShamDFTOperatorDevice.cc
index 0485516e3..c63a353ae 100644
--- a/src/dftOperator/kohnShamDFTOperatorDevice.cc
+++ b/src/dftOperator/kohnShamDFTOperatorDevice.cc
@@ -386,29 +386,29 @@ namespace dftfe
     return d_shapeFunctionValueNLPTransposedDevice;
   }
 
-  template <unsigned int FEOrder, unsigned int FEOrderElectro>
-  dftfe::utils::MemoryStorage<double, dftfe::utils::MemorySpace::DEVICE> &
-  kohnShamDFTOperatorDeviceClass<FEOrder, FEOrderElectro>::
-    getShapeFunctionGradientValuesXTransposed()
-  {
-    return d_shapeFunctionGradientValueXTransposedDevice;
-  }
-
-  template <unsigned int FEOrder, unsigned int FEOrderElectro>
-  dftfe::utils::MemoryStorage<double, dftfe::utils::MemorySpace::DEVICE> &
-  kohnShamDFTOperatorDeviceClass<FEOrder, FEOrderElectro>::
-    getShapeFunctionGradientValuesYTransposed()
-  {
-    return d_shapeFunctionGradientValueYTransposedDevice;
-  }
-
-  template <unsigned int FEOrder, unsigned int FEOrderElectro>
-  dftfe::utils::MemoryStorage<double, dftfe::utils::MemorySpace::DEVICE> &
-  kohnShamDFTOperatorDeviceClass<FEOrder, FEOrderElectro>::
-    getShapeFunctionGradientValuesZTransposed()
-  {
-    return d_shapeFunctionGradientValueZTransposedDevice;
-  }
+  // template <unsigned int FEOrder, unsigned int FEOrderElectro>
+  // dftfe::utils::MemoryStorage<double, dftfe::utils::MemorySpace::DEVICE> &
+  // kohnShamDFTOperatorDeviceClass<FEOrder, FEOrderElectro>::
+  //   getShapeFunctionGradientValuesXTransposed()
+  // {
+  //   return d_shapeFunctionGradientValueXTransposedDevice;
+  // }
+
+  // template <unsigned int FEOrder, unsigned int FEOrderElectro>
+  // dftfe::utils::MemoryStorage<double, dftfe::utils::MemorySpace::DEVICE> &
+  // kohnShamDFTOperatorDeviceClass<FEOrder, FEOrderElectro>::
+  //   getShapeFunctionGradientValuesYTransposed()
+  // {
+  //   return d_shapeFunctionGradientValueYTransposedDevice;
+  // }
+
+  // template <unsigned int FEOrder, unsigned int FEOrderElectro>
+  // dftfe::utils::MemoryStorage<double, dftfe::utils::MemorySpace::DEVICE> &
+  // kohnShamDFTOperatorDeviceClass<FEOrder, FEOrderElectro>::
+  //   getShapeFunctionGradientValuesZTransposed()
+  // {
+  //   return d_shapeFunctionGradientValueZTransposedDevice;
+  // }
 
   template <unsigned int FEOrder, unsigned int FEOrderElectro>
   dftfe::utils::MemoryStorage<double, dftfe::utils::MemorySpace::DEVICE> &
@@ -458,7 +458,10 @@ namespace dftfe
   kohnShamDFTOperatorDeviceClass<FEOrder, FEOrderElectro>::
     getParallelChebyBlockVectorDevice()
   {
-    return d_parallelChebyBlockVectorDevice;
+    const unsigned int BVec =
+      std::min(dftPtr->d_dftParamsPtr->chebyWfcBlockSize,
+               dftPtr->d_numEigenValues);
+    return basisOperationsPtrDevice->getMultiVector(BVec);
   }
 
   template <unsigned int FEOrder, unsigned int FEOrderElectro>
@@ -466,7 +469,10 @@ namespace dftfe
   kohnShamDFTOperatorDeviceClass<FEOrder, FEOrderElectro>::
     getParallelChebyBlockVector2Device()
   {
-    return d_parallelChebyBlockVector2Device;
+    const unsigned int BVec =
+      std::min(dftPtr->d_dftParamsPtr->chebyWfcBlockSize,
+               dftPtr->d_numEigenValues);
+    return basisOperationsPtrDevice->getMultiVector(BVec, 1);
   }
 
   template <unsigned int FEOrder, unsigned int FEOrderElectro>
@@ -495,6 +501,8 @@ namespace dftfe
   {
     computing_timer.enter_subsection("kohnShamDFTOperatorDeviceClass setup");
 
+    basisOperationsPtrDevice = dftPtr->basisOperationsPtrDevice;
+    basisOperationsPtrHost   = dftPtr->basisOperationsPtrHost;
 
     dftPtr->matrix_free_data.initialize_dof_vector(
       d_invSqrtMassVector, dftPtr->d_densityDofHandlerIndex);
@@ -569,25 +577,6 @@ namespace dftfe
       std::min(dftPtr->d_dftParamsPtr->chebyWfcBlockSize, numberWaveFunctions);
 
 
-    dftfe::linearAlgebra::createMultiVectorFromDealiiPartitioner(
-      dftPtr->matrix_free_data.get_vector_partitioner(
-        dftPtr->d_densityDofHandlerIndex),
-      BVec,
-      d_parallelChebyBlockVectorDevice);
-
-    if (dftPtr->d_dftParamsPtr->mixingMethod == "LOW_RANK_DIELECM_PRECOND")
-      d_parallelChebyBlockVector2Device.reinit(
-        d_parallelChebyBlockVectorDevice);
-
-    if (std::is_same<dataTypes::number, std::complex<double>>::value)
-      {
-        d_tempRealVec.resize((d_parallelChebyBlockVectorDevice.localSize() *
-                              d_parallelChebyBlockVectorDevice.numVectors()),
-                             0.0);
-        d_tempImagVec.resize((d_parallelChebyBlockVectorDevice.localSize() *
-                              d_parallelChebyBlockVectorDevice.numVectors()),
-                             0.0);
-      }
 
     const unsigned int n_ghosts =
       dftPtr->matrix_free_data
@@ -597,6 +586,11 @@ namespace dftfe
       dftPtr->matrix_free_data
         .get_vector_partitioner(dftPtr->d_densityDofHandlerIndex)
         ->local_size();
+    if (std::is_same<dataTypes::number, std::complex<double>>::value)
+      {
+        d_tempRealVec.resize(((localSize + n_ghosts) * BVec), 0.0);
+        d_tempImagVec.resize(((localSize + n_ghosts) * BVec), 0.0);
+      }
 
     dftfe::utils::MemoryStorage<unsigned int, dftfe::utils::MemorySpace::HOST>
       locallyOwnedProcBoundaryNodesVector(localSize, 0);
@@ -642,16 +636,6 @@ namespace dftfe
       d_flattenedArrayCellLocalProcIndexIdMap);
 
 
-
-    getOverloadedConstraintMatrix()->precomputeMaps(
-      flattenedArray.getMPIPatternP2P(), numberWaveFunctions);
-
-    getOverloadedConstraintMatrixHost()->precomputeMaps(
-      dftPtr->matrix_free_data.get_vector_partitioner(),
-      dftPtr->matrix_free_data.get_vector_partitioner(),
-      1);
-
-
     const unsigned int totalLocallyOwnedCells =
       dftPtr->matrix_free_data.n_physical_cells();
 
diff --git a/src/dftOperator/matrixVectorProductImplementationsDevice.cc b/src/dftOperator/matrixVectorProductImplementationsDevice.cc
index 33461fca7..625354d54 100644
--- a/src/dftOperator/matrixVectorProductImplementationsDevice.cc
+++ b/src/dftOperator/matrixVectorProductImplementationsDevice.cc
@@ -86,8 +86,7 @@ kohnShamDFTOperatorDeviceClass<FEOrder, FEOrderElectro>::
       if (std::is_same<dataTypes::number, std::complex<double>>::value)
         {
           utils::deviceKernelsGeneric::copyComplexArrToRealArrsDevice(
-            (d_parallelChebyBlockVectorDevice.localSize() *
-             d_parallelChebyBlockVectorDevice.numVectors()),
+            (d_tempRealVec.size()),
             dst,
             d_tempRealVec.begin(),
             d_tempImagVec.begin());
@@ -102,8 +101,7 @@ kohnShamDFTOperatorDeviceClass<FEOrder, FEOrderElectro>::
 
 
           utils::deviceKernelsGeneric::copyRealArrsToComplexArrDevice(
-            (d_parallelChebyBlockVectorDevice.localSize() *
-             d_parallelChebyBlockVectorDevice.numVectors()),
+            (d_tempRealVec.size()),
             d_tempRealVec.begin(),
             d_tempImagVec.begin(),
             dst);
diff --git a/src/dftOperator/shapeFunctionDataCalculatorDevice.cc b/src/dftOperator/shapeFunctionDataCalculatorDevice.cc
index 24a144115..c21491078 100644
--- a/src/dftOperator/shapeFunctionDataCalculatorDevice.cc
+++ b/src/dftOperator/shapeFunctionDataCalculatorDevice.cc
@@ -368,8 +368,6 @@ kohnShamDFTOperatorDeviceClass<FEOrder, FEOrderElectro>::
       //
       // resize data members
       //
-      // d_cellShapeFunctionGradientIntegralFlattened.clear();
-      // d_cellShapeFunctionGradientIntegralFlattened.resize(numberPhysicalCells*numberDofsPerElement*numberDofsPerElement);
 
       d_cellJxWValues.clear();
       d_cellJxWValues.resize(numberPhysicalCells * numberQuadraturePoints);
@@ -380,33 +378,6 @@ kohnShamDFTOperatorDeviceClass<FEOrder, FEOrderElectro>::
                                               numberDofsPerElement,
                                             0.0);
 
-      d_shapeFunctionGradientValueX.resize(numberPhysicalCells *
-                                             numberQuadraturePoints *
-                                             numberDofsPerElement,
-                                           0.0);
-      d_shapeFunctionGradientValueXTransposed.resize(numberPhysicalCells *
-                                                       numberQuadraturePoints *
-                                                       numberDofsPerElement,
-                                                     0.0);
-
-      d_shapeFunctionGradientValueY.resize(numberPhysicalCells *
-                                             numberQuadraturePoints *
-                                             numberDofsPerElement,
-                                           0.0);
-      d_shapeFunctionGradientValueYTransposed.resize(numberPhysicalCells *
-                                                       numberQuadraturePoints *
-                                                       numberDofsPerElement,
-                                                     0.0);
-
-      d_shapeFunctionGradientValueZ.resize(numberPhysicalCells *
-                                             numberQuadraturePoints *
-                                             numberDofsPerElement,
-                                           0.0);
-      d_shapeFunctionGradientValueZTransposed.resize(numberPhysicalCells *
-                                                       numberQuadraturePoints *
-                                                       numberDofsPerElement,
-                                                     0.0);
-
       std::vector<double> shapeFunctionValueLpsp(numberQuadraturePointsLpsp *
                                                    numberDofsPerElement,
                                                  0.0);
@@ -435,38 +406,6 @@ kohnShamDFTOperatorDeviceClass<FEOrder, FEOrderElectro>::
               d_cellJxWValues[iElem * numberQuadraturePoints + q_point] =
                 fe_values.JxW(q_point);
 
-            for (unsigned int iNode = 0; iNode < numberDofsPerElement; ++iNode)
-              for (unsigned int q_point = 0; q_point < numberQuadraturePoints;
-                   ++q_point)
-                {
-                  const dealii::Tensor<1, 3, double> &shape_grad =
-                    fe_values.shape_grad(iNode, q_point);
-
-                  d_shapeFunctionGradientValueX[iElem * numberDofsPerElement *
-                                                  numberQuadraturePoints +
-                                                iNode * numberQuadraturePoints +
-                                                q_point] = shape_grad[0];
-                  d_shapeFunctionGradientValueXTransposed
-                    [iElem * numberQuadraturePoints * numberDofsPerElement +
-                     q_point * numberDofsPerElement + iNode] = shape_grad[0];
-
-                  d_shapeFunctionGradientValueY[iElem * numberDofsPerElement *
-                                                  numberQuadraturePoints +
-                                                iNode * numberQuadraturePoints +
-                                                q_point] = shape_grad[1];
-                  d_shapeFunctionGradientValueYTransposed
-                    [iElem * numberQuadraturePoints * numberDofsPerElement +
-                     q_point * numberDofsPerElement + iNode] = shape_grad[1];
-
-                  d_shapeFunctionGradientValueZ[iElem * numberDofsPerElement *
-                                                  numberQuadraturePoints +
-                                                iNode * numberQuadraturePoints +
-                                                q_point] = shape_grad[2];
-                  d_shapeFunctionGradientValueZTransposed
-                    [iElem * numberQuadraturePoints * numberDofsPerElement +
-                     q_point * numberDofsPerElement + iNode] = shape_grad[2];
-                }
-
             if (iElem == 0)
               {
                 fe_values_lpsp.reinit(cellPtr);
@@ -512,21 +451,6 @@ kohnShamDFTOperatorDeviceClass<FEOrder, FEOrderElectro>::
       d_shapeFunctionValueTransposedDevice.copyFrom(
         d_shapeFunctionValueTransposed);
 
-      d_shapeFunctionGradientValueXTransposedDevice.resize(
-        d_shapeFunctionGradientValueXTransposed.size());
-      d_shapeFunctionGradientValueXTransposedDevice.copyFrom(
-        d_shapeFunctionGradientValueXTransposed);
-
-      d_shapeFunctionGradientValueYTransposedDevice.resize(
-        d_shapeFunctionGradientValueYTransposed.size());
-      d_shapeFunctionGradientValueYTransposedDevice.copyFrom(
-        d_shapeFunctionGradientValueYTransposed);
-
-      d_shapeFunctionGradientValueZTransposedDevice.resize(
-        d_shapeFunctionGradientValueZTransposed.size());
-      d_shapeFunctionGradientValueZTransposedDevice.copyFrom(
-        d_shapeFunctionGradientValueZTransposed);
-
       d_shapeFunctionValueLpspDevice.resize(shapeFunctionValueLpsp.size());
       d_shapeFunctionValueLpspDevice.copyFrom(shapeFunctionValueLpsp);
 
diff --git a/src/force/configurationalForceCompute/configurationalForceEEshelbyFPSPFnlLinFE.cc b/src/force/configurationalForceCompute/configurationalForceEEshelbyFPSPFnlLinFE.cc
index 165b5026a..3cf5db1f2 100644
--- a/src/force/configurationalForceCompute/configurationalForceEEshelbyFPSPFnlLinFE.cc
+++ b/src/force/configurationalForceCompute/configurationalForceEEshelbyFPSPFnlLinFE.cc
@@ -173,7 +173,7 @@ namespace dftfe
 
 
     const unsigned int localVectorSize =
-      dftPtr->d_eigenVectorsFlattenedSTL[0].size() / numEigenVectors;
+      matrixFreeData.get_vector_partitioner()->locally_owned_size();
 
     const unsigned int numMacroCells = matrixFreeData.n_cell_batches();
 
@@ -250,6 +250,7 @@ namespace dftfe
             double device_time = MPI_Wtime();
 
             forceDevice::wfcContractionsForceKernelsAllH(
+              dftPtr->basisOperationsPtrDevice,
               kohnShamDFTEigenOperatorDevice,
               dftPtr->d_eigenVectorsFlattenedDevice.begin(),
               d_dftParams.spinPolarized,
@@ -296,7 +297,7 @@ namespace dftfe
 
             force::wfcContractionsForceKernelsAllH(
               kohnShamDFTEigenOperator,
-              dftPtr->d_eigenVectorsFlattenedSTL,
+              dftPtr->d_eigenVectorsFlattenedHost.begin(),
               d_dftParams.spinPolarized,
               spinIndex,
               dftPtr->eigenValues,
diff --git a/src/force/configurationalStressCompute/computeStressEEshelbyEPSPEnlEk.cc b/src/force/configurationalStressCompute/computeStressEEshelbyEPSPEnlEk.cc
index 9c6003ec0..10c40a236 100644
--- a/src/force/configurationalStressCompute/computeStressEEshelbyEPSPEnlEk.cc
+++ b/src/force/configurationalStressCompute/computeStressEEshelbyEPSPEnlEk.cc
@@ -163,7 +163,7 @@ namespace dftfe
                                             bandGroupLowHighPlusOneIndices[1]);
 
     const unsigned int localVectorSize =
-      dftPtr->d_eigenVectorsFlattenedSTL[0].size() / numEigenVectors;
+      matrixFreeData.get_vector_partitioner()->locally_owned_size();
     std::vector<std::vector<distributedCPUVec<double>>> eigenVectors(
       dftPtr->d_kPointWeights.size());
     std::vector<distributedCPUVec<dataTypes::number>>
@@ -244,6 +244,7 @@ namespace dftfe
             double device_time = MPI_Wtime();
 
             forceDevice::wfcContractionsForceKernelsAllH(
+              dftPtr->basisOperationsPtrDevice,
               kohnShamDFTEigenOperatorDevice,
               dftPtr->d_eigenVectorsFlattenedDevice.begin(),
               d_dftParams.spinPolarized,
@@ -289,7 +290,7 @@ namespace dftfe
 
             force::wfcContractionsForceKernelsAllH(
               kohnShamDFTEigenOperator,
-              dftPtr->d_eigenVectorsFlattenedSTL,
+              dftPtr->d_eigenVectorsFlattenedHost.begin(),
               d_dftParams.spinPolarized,
               spinIndex,
               dftPtr->eigenValues,
diff --git a/src/force/forceWfcContractions.cc b/src/force/forceWfcContractions.cc
index fbc8a8628..e9b2e9771 100644
--- a/src/force/forceWfcContractions.cc
+++ b/src/force/forceWfcContractions.cc
@@ -607,7 +607,7 @@ namespace dftfe
         operatorDFTClass &                         operatorMatrix,
         distributedCPUMultiVec<dataTypes::number> &flattenedArrayBlock,
         distributedCPUMultiVec<dataTypes::number> &projectorKetTimesVector,
-        const std::vector<dataTypes::number> &     X,
+        const dataTypes::number *                  X,
         const std::vector<double> &                eigenValues,
         const std::vector<double> &                partialOccupancies,
         const std::vector<double> &                kcoord,
@@ -725,14 +725,14 @@ namespace dftfe
 
     void
     wfcContractionsForceKernelsAllH(
-      operatorDFTClass &                                 operatorMatrix,
-      const std::vector<std::vector<dataTypes::number>> &X,
-      const unsigned int                                 spinPolarizedFlag,
-      const unsigned int                                 spinIndex,
-      const std::vector<std::vector<double>> &           eigenValuesH,
-      const std::vector<std::vector<double>> &           partialOccupanciesH,
-      const std::vector<double> &                        kPointCoordinates,
-      const unsigned int *nonTrivialIdToElemIdMapH,
+      operatorDFTClass &                      operatorMatrix,
+      const dataTypes::number *               X,
+      const unsigned int                      spinPolarizedFlag,
+      const unsigned int                      spinIndex,
+      const std::vector<std::vector<double>> &eigenValuesH,
+      const std::vector<std::vector<double>> &partialOccupanciesH,
+      const std::vector<double> &             kPointCoordinates,
+      const unsigned int *                    nonTrivialIdToElemIdMapH,
       const unsigned int *projecterKetTimesFlattenedVectorLocalIdsH,
       const unsigned int  MLoc,
       const unsigned int  N,
@@ -894,7 +894,8 @@ namespace dftfe
                     operatorMatrix,
                     flattenedArrayBlock,
                     projectorKetTimesVector,
-                    X[(1 + spinPolarizedFlag) * kPoint + spinIndex],
+                    X +
+                      ((1 + spinPolarizedFlag) * kPoint + spinIndex) * MLoc * N,
                     blockedEigenValues,
                     blockedPartialOccupancies,
                     kcoord,
diff --git a/src/force/forceWfcContractionsDevice.cc b/src/force/forceWfcContractionsDevice.cc
index 985eb75c8..12a942a7c 100644
--- a/src/force/forceWfcContractionsDevice.cc
+++ b/src/force/forceWfcContractionsDevice.cc
@@ -41,9 +41,7 @@ namespace dftfe
         const unsigned int numContiguousBlocks,
         const unsigned int numQuads,
         const double *     psiQuadValues,
-        const double *     gradPsiQuadValuesX,
-        const double *     gradPsiQuadValuesY,
-        const double *     gradPsiQuadValuesZ,
+        const double *     gradPsiQuadValues,
         const double *     eigenValues,
         const double *     partialOccupancies,
         double *           eshelbyTensor)
@@ -66,10 +64,16 @@ namespace dftfe
             const unsigned int tempIndex =
               (cellIndex)*numQuads * contiguousBlockSize +
               quadId * contiguousBlockSize + intraBlockIndex;
-            const double psi        = psiQuadValues[tempIndex];
-            const double gradPsiX   = gradPsiQuadValuesX[tempIndex];
-            const double gradPsiY   = gradPsiQuadValuesY[tempIndex];
-            const double gradPsiZ   = gradPsiQuadValuesZ[tempIndex];
+            const unsigned int tempIndex2 =
+              (cellIndex)*numQuads * contiguousBlockSize * 3 +
+              quadId * contiguousBlockSize + intraBlockIndex;
+            const double psi      = psiQuadValues[tempIndex];
+            const double gradPsiX = gradPsiQuadValues[tempIndex2];
+            const double gradPsiY =
+              gradPsiQuadValues[tempIndex2 + numQuads * contiguousBlockSize];
+            const double gradPsiZ =
+              gradPsiQuadValues[tempIndex2 +
+                                2 * numQuads * contiguousBlockSize];
             const double eigenValue = eigenValues[intraBlockIndex];
             const double partOcc    = partialOccupancies[intraBlockIndex];
 
@@ -109,9 +113,7 @@ namespace dftfe
         const unsigned int                       numContiguousBlocks,
         const unsigned int                       numQuads,
         const dftfe::utils::deviceDoubleComplex *psiQuadValues,
-        const dftfe::utils::deviceDoubleComplex *gradPsiQuadValuesX,
-        const dftfe::utils::deviceDoubleComplex *gradPsiQuadValuesY,
-        const dftfe::utils::deviceDoubleComplex *gradPsiQuadValuesZ,
+        const dftfe::utils::deviceDoubleComplex *gradPsiQuadValues,
         const double *                           eigenValues,
         const double *                           partialOccupancies,
         const double                             kcoordx,
@@ -138,22 +140,29 @@ namespace dftfe
             const unsigned int tempIndex =
               (cellIndex)*numQuads * contiguousBlockSize +
               quadId * contiguousBlockSize + intraBlockIndex;
+            const unsigned int tempIndex2 =
+              (cellIndex)*numQuads * contiguousBlockSize * 3 +
+              quadId * contiguousBlockSize + intraBlockIndex;
             const dftfe::utils::deviceDoubleComplex psi =
               psiQuadValues[tempIndex];
             const dftfe::utils::deviceDoubleComplex psiConj =
               dftfe::utils::conj(psiQuadValues[tempIndex]);
             const dftfe::utils::deviceDoubleComplex gradPsiX =
-              gradPsiQuadValuesX[tempIndex];
+              gradPsiQuadValues[tempIndex2];
             const dftfe::utils::deviceDoubleComplex gradPsiY =
-              gradPsiQuadValuesY[tempIndex];
+              gradPsiQuadValues[tempIndex2 + numQuads * contiguousBlockSize];
             const dftfe::utils::deviceDoubleComplex gradPsiZ =
-              gradPsiQuadValuesZ[tempIndex];
+              gradPsiQuadValues[tempIndex2 +
+                                2 * numQuads * contiguousBlockSize];
             const dftfe::utils::deviceDoubleComplex gradPsiXConj =
-              dftfe::utils::conj(gradPsiQuadValuesX[tempIndex]);
+              dftfe::utils::conj(gradPsiQuadValues[tempIndex2]);
             const dftfe::utils::deviceDoubleComplex gradPsiYConj =
-              dftfe::utils::conj(gradPsiQuadValuesY[tempIndex]);
+              dftfe::utils::conj(
+                gradPsiQuadValues[tempIndex2 + numQuads * contiguousBlockSize]);
             const dftfe::utils::deviceDoubleComplex gradPsiZConj =
-              dftfe::utils::conj(gradPsiQuadValuesZ[tempIndex]);
+              dftfe::utils::conj(
+                gradPsiQuadValues[tempIndex2 +
+                                  2 * numQuads * contiguousBlockSize]);
             const double eigenValue = eigenValues[intraBlockIndex];
             const double partOcc    = partialOccupancies[intraBlockIndex];
 
@@ -410,6 +419,11 @@ namespace dftfe
 
       void
       interpolatePsiComputeELocWfcEshelbyTensorD(
+        std::shared_ptr<
+          dftfe::basis::FEBasisOperations<dataTypes::number,
+                                          double,
+                                          dftfe::utils::MemorySpace::DEVICE>>
+          &                                      basisOperationsPtr,
         operatorDFTDeviceClass &                 operatorMatrix,
         distributedDeviceVec<dataTypes::number> &Xb,
         const unsigned int                       BVec,
@@ -437,13 +451,7 @@ namespace dftfe
           &psiQuadsFlatD,
         dftfe::utils::MemoryStorage<dataTypes::number,
                                     dftfe::utils::MemorySpace::DEVICE>
-          &gradPsiQuadsXFlatD,
-        dftfe::utils::MemoryStorage<dataTypes::number,
-                                    dftfe::utils::MemorySpace::DEVICE>
-          &gradPsiQuadsYFlatD,
-        dftfe::utils::MemoryStorage<dataTypes::number,
-                                    dftfe::utils::MemorySpace::DEVICE>
-          &gradPsiQuadsZFlatD,
+          &gradPsiQuadsFlatD,
 #ifdef USE_COMPLEX
         dftfe::utils::MemoryStorage<dataTypes::number,
                                     dftfe::utils::MemorySpace::DEVICE>
@@ -463,56 +471,24 @@ namespace dftfe
         dftfe::utils::MemoryStorage<dataTypes::number,
                                     dftfe::utils::MemorySpace::DEVICE>
           &cellWaveFunctionMatrix = operatorMatrix.getCellWaveFunctionMatrix();
-
-        dftfe::utils::deviceKernelsGeneric::stridedCopyToBlock(
-          BVec,
-          numCells * numNodesPerElement,
-          Xb.begin(),
-          cellWaveFunctionMatrix.begin(),
-          (operatorMatrix.getFlattenedArrayCellLocalProcIndexIdMap()).begin());
+        dftfe::basis::UpdateFlags updateFlags =
+          dftfe::basis::update_values | dftfe::basis::update_gradients;
+        basisOperationsPtr->reinit(BVec, cellsBlockSize, 0);
 
         const int blockSize    = cellsBlockSize;
         const int numberBlocks = numCells / blockSize;
         const int remBlockSize = numCells - numberBlocks * blockSize;
 
-        dftfe::utils::MemoryStorage<dataTypes::number,
-                                    dftfe::utils::MemorySpace::DEVICE>
-          shapeFunctionValuesReferenceD(numQuads * numNodesPerElement,
-                                        dataTypes::number(0.0));
         dftfe::utils::MemoryStorage<dataTypes::number,
                                     dftfe::utils::MemorySpace::DEVICE>
           shapeFunctionValuesNLPReferenceD(numQuadsNLP * numNodesPerElement,
                                            dataTypes::number(0.0));
 
-        dftfe::utils::deviceKernelsGeneric::copyValueType1ArrToValueType2Arr(
-          numQuads * numNodesPerElement,
-          (operatorMatrix.getShapeFunctionValuesTransposed()).begin(),
-          shapeFunctionValuesReferenceD.begin());
-
-
         dftfe::utils::deviceKernelsGeneric::copyValueType1ArrToValueType2Arr(
           numQuadsNLP * numNodesPerElement,
           (operatorMatrix.getShapeFunctionValuesNLPTransposed()).begin(),
           shapeFunctionValuesNLPReferenceD.begin());
 
-        dftfe::utils::MemoryStorage<dataTypes::number,
-                                    dftfe::utils::MemorySpace::DEVICE>
-          shapeFunctionGradientValuesXTransposedDevice(blockSize * numQuads *
-                                                         numNodesPerElement,
-                                                       dataTypes::number(0.0));
-
-        dftfe::utils::MemoryStorage<dataTypes::number,
-                                    dftfe::utils::MemorySpace::DEVICE>
-          shapeFunctionGradientValuesYTransposedDevice(blockSize * numQuads *
-                                                         numNodesPerElement,
-                                                       dataTypes::number(0.0));
-
-        dftfe::utils::MemoryStorage<dataTypes::number,
-                                    dftfe::utils::MemorySpace::DEVICE>
-          shapeFunctionGradientValuesZTransposedDevice(blockSize * numQuads *
-                                                         numNodesPerElement,
-                                                       dataTypes::number(0.0));
-
         dftfe::utils::MemoryStorage<double, dftfe::utils::MemorySpace::DEVICE>
           shapeFunctionGradientValuesNLPReferenceD(blockSize * numQuadsNLP * 3 *
                                                      numNodesPerElement,
@@ -535,6 +511,10 @@ namespace dftfe
             0,
             i * numQuadsNLP * 3 * numNodesPerElement);
 
+        basisOperationsPtr->extractToCellNodalDataKernel(
+          Xb,
+          cellWaveFunctionMatrix.data(),
+          std::pair<unsigned int, unsigned int>(0, numCells));
 
 
         for (int iblock = 0; iblock < (numberBlocks + 1); iblock++)
@@ -558,120 +538,13 @@ namespace dftfe
 
                 if (!isFloatingChargeForces)
                   {
-                    dftfe::utils::deviceBlasWrapper::gemmStridedBatched(
-                      operatorMatrix.getDeviceBlasHandle(),
-                      dftfe::utils::DEVICEBLAS_OP_N,
-                      dftfe::utils::DEVICEBLAS_OP_N,
-                      BVec,
-                      numQuads,
-                      numNodesPerElement,
-                      &scalarCoeffAlpha,
-                      cellWaveFunctionMatrix.begin() +
-                        startingId * numNodesPerElement * BVec,
-                      BVec,
-                      strideA,
-                      shapeFunctionValuesReferenceD.begin(),
-                      numNodesPerElement,
-                      strideB,
-                      &scalarCoeffBeta,
-                      psiQuadsFlatD.begin(),
-                      BVec,
-                      strideC,
-                      currentBlockSize);
-
-                    strideB = numNodesPerElement * numQuads;
-
-                    dftfe::utils::deviceKernelsGeneric::
-                      copyValueType1ArrToValueType2Arr(
-                        currentBlockSize * numQuads * numNodesPerElement,
-                        (operatorMatrix
-                           .getShapeFunctionGradientValuesXTransposed())
-                            .begin() +
-                          startingId * numQuads * numNodesPerElement,
-                        shapeFunctionGradientValuesXTransposedDevice.begin());
-
-                    dftfe::utils::deviceBlasWrapper::gemmStridedBatched(
-                      operatorMatrix.getDeviceBlasHandle(),
-                      dftfe::utils::DEVICEBLAS_OP_N,
-                      dftfe::utils::DEVICEBLAS_OP_N,
-                      BVec,
-                      numQuads,
-                      numNodesPerElement,
-                      &scalarCoeffAlpha,
-                      cellWaveFunctionMatrix.begin() +
+                    basisOperationsPtr->interpolateKernel(
+                      cellWaveFunctionMatrix.data() +
                         startingId * numNodesPerElement * BVec,
-                      BVec,
-                      strideA,
-                      shapeFunctionGradientValuesXTransposedDevice.begin(),
-                      numNodesPerElement,
-                      strideB,
-                      &scalarCoeffBeta,
-                      gradPsiQuadsXFlatD.begin(),
-                      BVec,
-                      strideC,
-                      currentBlockSize);
-
-
-                    dftfe::utils::deviceKernelsGeneric::
-                      copyValueType1ArrToValueType2Arr(
-                        currentBlockSize * numQuads * numNodesPerElement,
-                        (operatorMatrix
-                           .getShapeFunctionGradientValuesYTransposed())
-                            .begin() +
-                          startingId * numQuads * numNodesPerElement,
-                        shapeFunctionGradientValuesYTransposedDevice.begin());
-
-                    dftfe::utils::deviceBlasWrapper::gemmStridedBatched(
-                      operatorMatrix.getDeviceBlasHandle(),
-                      dftfe::utils::DEVICEBLAS_OP_N,
-                      dftfe::utils::DEVICEBLAS_OP_N,
-                      BVec,
-                      numQuads,
-                      numNodesPerElement,
-                      &scalarCoeffAlpha,
-                      cellWaveFunctionMatrix.begin() +
-                        startingId * numNodesPerElement * BVec,
-                      BVec,
-                      strideA,
-                      shapeFunctionGradientValuesYTransposedDevice.begin(),
-                      numNodesPerElement,
-                      strideB,
-                      &scalarCoeffBeta,
-                      gradPsiQuadsYFlatD.begin(),
-                      BVec,
-                      strideC,
-                      currentBlockSize);
-
-                    dftfe::utils::deviceKernelsGeneric::
-                      copyValueType1ArrToValueType2Arr(
-                        currentBlockSize * numQuads * numNodesPerElement,
-                        (operatorMatrix
-                           .getShapeFunctionGradientValuesZTransposed())
-                            .begin() +
-                          startingId * numQuads * numNodesPerElement,
-                        shapeFunctionGradientValuesZTransposedDevice.begin());
-
-                    dftfe::utils::deviceBlasWrapper::gemmStridedBatched(
-                      operatorMatrix.getDeviceBlasHandle(),
-                      dftfe::utils::DEVICEBLAS_OP_N,
-                      dftfe::utils::DEVICEBLAS_OP_N,
-                      BVec,
-                      numQuads,
-                      numNodesPerElement,
-                      &scalarCoeffAlpha,
-                      cellWaveFunctionMatrix.begin() +
-                        startingId * numNodesPerElement * BVec,
-                      BVec,
-                      strideA,
-                      shapeFunctionGradientValuesZTransposedDevice.begin(),
-                      numNodesPerElement,
-                      strideB,
-                      &scalarCoeffBeta,
-                      gradPsiQuadsZFlatD.begin(),
-                      BVec,
-                      strideC,
-                      currentBlockSize);
-
+                      psiQuadsFlatD.data(),
+                      gradPsiQuadsFlatD.begin(),
+                      std::pair<unsigned int, unsigned int>(
+                        startingId, startingId + currentBlockSize));
 #ifdef DFTFE_WITH_DEVICE_LANG_CUDA
                     computeELocWfcEshelbyTensorContributions<<<
                       (BVec + (dftfe::utils::DEVICE_BLOCK_SIZE - 1)) /
@@ -684,11 +557,7 @@ namespace dftfe
                       dftfe::utils::makeDataTypeDeviceCompatible(
                         psiQuadsFlatD.begin()),
                       dftfe::utils::makeDataTypeDeviceCompatible(
-                        gradPsiQuadsXFlatD.begin()),
-                      dftfe::utils::makeDataTypeDeviceCompatible(
-                        gradPsiQuadsYFlatD.begin()),
-                      dftfe::utils::makeDataTypeDeviceCompatible(
-                        gradPsiQuadsZFlatD.begin()),
+                        gradPsiQuadsFlatD.begin()),
                       eigenValuesD.begin(),
                       partialOccupanciesD.begin(),
 #  ifdef USE_COMPLEX
@@ -717,11 +586,7 @@ namespace dftfe
                       dftfe::utils::makeDataTypeDeviceCompatible(
                         psiQuadsFlatD.begin()),
                       dftfe::utils::makeDataTypeDeviceCompatible(
-                        gradPsiQuadsXFlatD.begin()),
-                      dftfe::utils::makeDataTypeDeviceCompatible(
-                        gradPsiQuadsYFlatD.begin()),
-                      dftfe::utils::makeDataTypeDeviceCompatible(
-                        gradPsiQuadsZFlatD.begin()),
+                        gradPsiQuadsFlatD.begin()),
                       eigenValuesD.begin(),
                       partialOccupanciesD.begin(),
 #  ifdef USE_COMPLEX
@@ -1069,6 +934,11 @@ namespace dftfe
 
       void
       devicePortedForceKernelsAllD(
+        std::shared_ptr<
+          dftfe::basis::FEBasisOperations<dataTypes::number,
+                                          double,
+                                          dftfe::utils::MemorySpace::DEVICE>>
+          &                                      basisOperationsPtr,
         operatorDFTDeviceClass &                 operatorMatrix,
         distributedDeviceVec<dataTypes::number> &deviceFlattenedArrayBlock,
         distributedDeviceVec<dataTypes::number> &projectorKetTimesVectorD,
@@ -1109,13 +979,7 @@ namespace dftfe
           &psiQuadsFlatD,
         dftfe::utils::MemoryStorage<dataTypes::number,
                                     dftfe::utils::MemorySpace::DEVICE>
-          &gradPsiQuadsXFlatD,
-        dftfe::utils::MemoryStorage<dataTypes::number,
-                                    dftfe::utils::MemorySpace::DEVICE>
-          &gradPsiQuadsYFlatD,
-        dftfe::utils::MemoryStorage<dataTypes::number,
-                                    dftfe::utils::MemorySpace::DEVICE>
-          &gradPsiQuadsZFlatD,
+          &gradPsiQuadsFlatD,
 #ifdef USE_COMPLEX
         dftfe::utils::MemoryStorage<dataTypes::number,
                                     dftfe::utils::MemorySpace::DEVICE>
@@ -1154,11 +1018,13 @@ namespace dftfe
         // int this_process;
         // MPI_Comm_rank(d_mpiCommParent, &this_process);
 
-        const unsigned int M = operatorMatrix.getMatrixFreeData()
-                                 ->get_vector_partitioner()
-                                 ->local_size();
         dftfe::utils::deviceKernelsGeneric::stridedCopyToBlockConstantStride(
-          numPsi, N, M, startingVecId, X, deviceFlattenedArrayBlock.begin());
+          numPsi,
+          N,
+          basisOperationsPtr->nOwnedDofs(),
+          startingVecId,
+          X,
+          deviceFlattenedArrayBlock.begin());
         deviceFlattenedArrayBlock.updateGhostValues();
 
         (operatorMatrix.getOverloadedConstraintMatrix())
@@ -1169,7 +1035,8 @@ namespace dftfe
         // MPI_Barrier(d_mpiCommParent);
         // double kernel1_time = MPI_Wtime();
 
-        interpolatePsiComputeELocWfcEshelbyTensorD(operatorMatrix,
+        interpolatePsiComputeELocWfcEshelbyTensorD(basisOperationsPtr,
+                                                   operatorMatrix,
                                                    deviceFlattenedArrayBlock,
                                                    numPsi,
                                                    numCells,
@@ -1186,9 +1053,7 @@ namespace dftfe
                                                    onesVecD,
                                                    cellsBlockSize,
                                                    psiQuadsFlatD,
-                                                   gradPsiQuadsXFlatD,
-                                                   gradPsiQuadsYFlatD,
-                                                   gradPsiQuadsZFlatD,
+                                                   gradPsiQuadsFlatD,
 #ifdef USE_COMPLEX
                                                    psiQuadsNLPD,
 #endif
@@ -1273,6 +1138,11 @@ namespace dftfe
 
     void
     wfcContractionsForceKernelsAllH(
+      std::shared_ptr<
+        dftfe::basis::FEBasisOperations<dataTypes::number,
+                                        double,
+                                        dftfe::utils::MemorySpace::DEVICE>>
+        &                                     basisOperationsPtr,
       operatorDFTDeviceClass &                operatorMatrix,
       const dataTypes::number *               X,
       const unsigned int                      spinPolarizedFlag,
@@ -1361,16 +1231,8 @@ namespace dftfe
                       dataTypes::number(0.0));
       dftfe::utils::MemoryStorage<dataTypes::number,
                                   dftfe::utils::MemorySpace::DEVICE>
-        gradPsiQuadsXFlatD(cellsBlockSize * numQuads * blockSize,
-                           dataTypes::number(0.0));
-      dftfe::utils::MemoryStorage<dataTypes::number,
-                                  dftfe::utils::MemorySpace::DEVICE>
-        gradPsiQuadsYFlatD(cellsBlockSize * numQuads * blockSize,
-                           dataTypes::number(0.0));
-      dftfe::utils::MemoryStorage<dataTypes::number,
-                                  dftfe::utils::MemorySpace::DEVICE>
-        gradPsiQuadsZFlatD(cellsBlockSize * numQuads * blockSize,
-                           dataTypes::number(0.0));
+        gradPsiQuadsFlatD(cellsBlockSize * numQuads * blockSize * 3,
+                          dataTypes::number(0.0));
 #ifdef USE_COMPLEX
       dftfe::utils::MemoryStorage<dataTypes::number,
                                   dftfe::utils::MemorySpace::DEVICE>
@@ -1501,6 +1363,7 @@ namespace dftfe
                   // double kernel_time = MPI_Wtime();
 
                   devicePortedForceKernelsAllD(
+                    basisOperationsPtr,
                     operatorMatrix,
                     deviceFlattenedArrayBlock,
                     projectorKetTimesVectorD,
@@ -1526,9 +1389,7 @@ namespace dftfe
                     numNodesPerElement,
                     totalNonTrivialPseudoWfcs,
                     psiQuadsFlatD,
-                    gradPsiQuadsXFlatD,
-                    gradPsiQuadsYFlatD,
-                    gradPsiQuadsZFlatD,
+                    gradPsiQuadsFlatD,
 #ifdef USE_COMPLEX
                     psiQuadsNLPD,
 #endif
diff --git a/src/helmholtz/kerkerSolverProblemDevice.cc b/src/helmholtz/kerkerSolverProblemDevice.cc
index 2a2afc5d4..84712f276 100644
--- a/src/helmholtz/kerkerSolverProblemDevice.cc
+++ b/src/helmholtz/kerkerSolverProblemDevice.cc
@@ -103,11 +103,6 @@ namespace dftfe
       d_matrixFreeDataPRefinedPtr->get_vector_partitioner(
         d_matrixFreeVectorComponent),
       *d_constraintMatrixPRefinedPtr);
-    d_constraintsTotalPotentialInfo.precomputeMaps(
-      d_matrixFreeDataPRefinedPtr->get_vector_partitioner(
-        d_matrixFreeVectorComponent),
-      d_xPtr->get_partitioner(),
-      1);
   }
 
 
diff --git a/src/linAlg/linearAlgebraOperationsOpt.cc b/src/linAlg/linearAlgebraOperationsOpt.cc
index 8eac264b7..4c5766e18 100644
--- a/src/linAlg/linearAlgebraOperationsOpt.cc
+++ b/src/linAlg/linearAlgebraOperationsOpt.cc
@@ -516,12 +516,12 @@ namespace dftfe
 
     template <typename T>
     void
-    gramSchmidtOrthogonalization(std::vector<T> &   X,
+    gramSchmidtOrthogonalization(T *                X,
                                  const unsigned int numberVectors,
+                                 const unsigned int localVectorSize,
                                  const MPI_Comm &   mpiComm)
     {
 #ifdef USE_PETSC
-      const unsigned int localVectorSize = X.size() / numberVectors;
 
       //
       // Create template PETSc vector to create BV object later
@@ -614,8 +614,9 @@ namespace dftfe
     void
     rayleighRitzGEP(operatorDFTClass &   operatorMatrix,
                     elpaScalaManager &   elpaScala,
-                    std::vector<T> &     X,
+                    T *                  X,
                     const unsigned int   numberWaveFunctions,
+                    const unsigned int   localVectorSize,
                     const MPI_Comm &     mpiCommParent,
                     const MPI_Comm &     interBandGroupComm,
                     const MPI_Comm &     mpi_communicator,
@@ -660,8 +661,9 @@ namespace dftfe
       // SConj=X^{T}*XConj.
       if (!(dftParams.useMixedPrecCGS_O && useMixedPrec))
         {
-          internal::fillParallelOverlapMatrix(&X[0],
-                                              X.size(),
+          internal::fillParallelOverlapMatrix(X,
+                                              numberWaveFunctions *
+                                                localVectorSize,
                                               numberWaveFunctions,
                                               processGrid,
                                               interBandGroupComm,
@@ -674,8 +676,8 @@ namespace dftfe
           if (std::is_same<T, std::complex<double>>::value)
             internal::fillParallelOverlapMatrixMixedPrec<T,
                                                          std::complex<float>>(
-              &X[0],
-              X.size(),
+              X,
+              numberWaveFunctions * localVectorSize,
               numberWaveFunctions,
               processGrid,
               interBandGroupComm,
@@ -684,8 +686,8 @@ namespace dftfe
               dftParams);
           else
             internal::fillParallelOverlapMatrixMixedPrec<T, float>(
-              &X[0],
-              X.size(),
+              X,
+              numberWaveFunctions * localVectorSize,
               numberWaveFunctions,
               processGrid,
               interBandGroupComm,
@@ -791,7 +793,8 @@ namespace dftfe
                   T(0.0));
 
 
-      operatorMatrix.XtHX(X, numberWaveFunctions, processGrid, projHamPar);
+      operatorMatrix.XtHX(
+        X, numberWaveFunctions, localVectorSize, processGrid, projHamPar);
       computing_timer.leave_subsection("Compute ProjHam, RR step");
 
       computing_timer.enter_subsection(
@@ -918,8 +921,8 @@ namespace dftfe
       projHamParCopy.mmult(projHamPar, LMatPar);
 
       if (!(dftParams.useMixedPrecSubspaceRotRR && useMixedPrec))
-        internal::subspaceRotation(&X[0],
-                                   X.size(),
+        internal::subspaceRotation(X,
+                                   numberWaveFunctions * localVectorSize,
                                    numberWaveFunctions,
                                    processGrid,
                                    interBandGroupComm,
@@ -933,8 +936,8 @@ namespace dftfe
         {
           if (std::is_same<T, std::complex<double>>::value)
             internal::subspaceRotationMixedPrec<T, std::complex<float>>(
-              &X[0],
-              X.size(),
+              X,
+              numberWaveFunctions * localVectorSize,
               numberWaveFunctions,
               processGrid,
               interBandGroupComm,
@@ -944,8 +947,9 @@ namespace dftfe
               false,
               false);
           else
-            internal::subspaceRotationMixedPrec<T, float>(&X[0],
-                                                          X.size(),
+            internal::subspaceRotationMixedPrec<T, float>(X,
+                                                          numberWaveFunctions *
+                                                            localVectorSize,
                                                           numberWaveFunctions,
                                                           processGrid,
                                                           interBandGroupComm,
@@ -968,8 +972,9 @@ namespace dftfe
     void
     rayleighRitz(operatorDFTClass &   operatorMatrix,
                  elpaScalaManager &   elpaScala,
-                 std::vector<T> &     X,
+                 T *                  X,
                  const unsigned int   numberWaveFunctions,
+                 const unsigned int   localVectorSize,
                  const MPI_Comm &     mpiCommParent,
                  const MPI_Comm &     interBandGroupComm,
                  const MPI_Comm &     mpi_communicator,
@@ -1006,7 +1011,8 @@ namespace dftfe
                   T(0.0));
 
       computing_timer.enter_subsection("Blocked XtHX, RR step");
-      operatorMatrix.XtHX(X, numberWaveFunctions, processGrid, projHamPar);
+      operatorMatrix.XtHX(
+        X, numberWaveFunctions, localVectorSize, processGrid, projHamPar);
       computing_timer.leave_subsection("Blocked XtHX, RR step");
 
       //
@@ -1116,8 +1122,8 @@ namespace dftfe
                                                processGrid,
                                                rowsBlockSize);
       projHamParCopy.copy_conjugate_transposed(projHamPar);
-      internal::subspaceRotation(&X[0],
-                                 X.size(),
+      internal::subspaceRotation(X,
+                                 numberWaveFunctions * localVectorSize,
                                  numberWaveFunctions,
                                  processGrid,
                                  interBandGroupComm,
@@ -1135,9 +1141,10 @@ namespace dftfe
     void
     rayleighRitzGEPSpectrumSplitDirect(operatorDFTClass &   operatorMatrix,
                                        elpaScalaManager &   elpaScala,
-                                       std::vector<T> &     X,
-                                       std::vector<T> &     Y,
+                                       T *                  X,
+                                       T *                  Y,
                                        const unsigned int   numberWaveFunctions,
+                                       const unsigned int   localVectorSize,
                                        const unsigned int   numberCoreStates,
                                        const MPI_Comm &     mpiCommParent,
                                        const MPI_Comm &     interBandGroupComm,
@@ -1183,8 +1190,9 @@ namespace dftfe
       // SConj=X^{T}*XConj
       if (!(dftParams.useMixedPrecCGS_O && useMixedPrec))
         {
-          internal::fillParallelOverlapMatrix(&X[0],
-                                              X.size(),
+          internal::fillParallelOverlapMatrix(X,
+                                              numberWaveFunctions *
+                                                localVectorSize,
                                               numberWaveFunctions,
                                               processGrid,
                                               interBandGroupComm,
@@ -1197,8 +1205,8 @@ namespace dftfe
           if (std::is_same<T, std::complex<double>>::value)
             internal::fillParallelOverlapMatrixMixedPrec<T,
                                                          std::complex<float>>(
-              &X[0],
-              X.size(),
+              X,
+              numberWaveFunctions * localVectorSize,
               numberWaveFunctions,
               processGrid,
               interBandGroupComm,
@@ -1207,8 +1215,8 @@ namespace dftfe
               dftParams);
           else
             internal::fillParallelOverlapMatrixMixedPrec<T, float>(
-              &X[0],
-              X.size(),
+              X,
+              numberWaveFunctions * localVectorSize,
               numberWaveFunctions,
               processGrid,
               interBandGroupComm,
@@ -1319,12 +1327,17 @@ namespace dftfe
 
       if (useMixedPrec && dftParams.useMixedPrecXTHXSpectrumSplit)
         {
-          operatorMatrix.XtHXMixedPrec(
-            X, numberWaveFunctions, numberCoreStates, processGrid, projHamPar);
+          operatorMatrix.XtHXMixedPrec(X,
+                                       numberWaveFunctions,
+                                       numberCoreStates,
+                                       localVectorSize,
+                                       processGrid,
+                                       projHamPar);
         }
       else
         {
-          operatorMatrix.XtHX(X, numberWaveFunctions, processGrid, projHamPar);
+          operatorMatrix.XtHX(
+            X, numberWaveFunctions, localVectorSize, processGrid, projHamPar);
         }
 
 
@@ -1497,9 +1510,10 @@ namespace dftfe
       computing_timer.enter_subsection(
         "Xfr^{T}={QfrConjPrime}^{C}*LConj^{-1}*X^{T}, RR step");
 
-      internal::subspaceRotationSpectrumSplit(&X[0],
-                                              &Y[0],
-                                              X.size(),
+      internal::subspaceRotationSpectrumSplit(X,
+                                              Y,
+                                              numberWaveFunctions *
+                                                localVectorSize,
                                               numberWaveFunctions,
                                               processGrid,
                                               numberWaveFunctions -
@@ -1517,8 +1531,8 @@ namespace dftfe
       if (!(dftParams.useMixedPrecCGS_SR && useMixedPrec))
         {
           computing_timer.enter_subsection("X^{T}=Lconj^{-1}*X^{T}, RR step");
-          internal::subspaceRotation(&X[0],
-                                     X.size(),
+          internal::subspaceRotation(X,
+                                     numberWaveFunctions * localVectorSize,
                                      numberWaveFunctions,
                                      processGrid,
                                      interBandGroupComm,
@@ -1536,8 +1550,8 @@ namespace dftfe
             "X^{T}=Lconj^{-1}*X^{T} mixed prec, RR step");
           if (std::is_same<T, std::complex<double>>::value)
             internal::subspaceRotationCGSMixedPrec<T, std::complex<float>>(
-              &X[0],
-              X.size(),
+              X,
+              numberWaveFunctions * localVectorSize,
               numberWaveFunctions,
               processGrid,
               interBandGroupComm,
@@ -1548,8 +1562,8 @@ namespace dftfe
               false);
           else
             internal::subspaceRotationCGSMixedPrec<T, float>(
-              &X[0],
-              X.size(),
+              X,
+              numberWaveFunctions * localVectorSize,
               numberWaveFunctions,
               processGrid,
               interBandGroupComm,
@@ -1566,18 +1580,19 @@ namespace dftfe
 
     template <typename T>
     void
-    rayleighRitzSpectrumSplitDirect(operatorDFTClass &    operatorMatrix,
-                                    elpaScalaManager &    elpaScala,
-                                    const std::vector<T> &X,
-                                    std::vector<T> &      Y,
-                                    const unsigned int    numberWaveFunctions,
-                                    const unsigned int    numberCoreStates,
-                                    const MPI_Comm &      mpiCommParent,
-                                    const MPI_Comm &      interBandGroupComm,
-                                    const MPI_Comm &      mpi_communicator,
-                                    const bool            useMixedPrec,
-                                    std::vector<double> & eigenValues,
-                                    const dftParameters & dftParams)
+    rayleighRitzSpectrumSplitDirect(operatorDFTClass &   operatorMatrix,
+                                    elpaScalaManager &   elpaScala,
+                                    const T *            X,
+                                    T *                  Y,
+                                    const unsigned int   numberWaveFunctions,
+                                    const unsigned int   localVectorSize,
+                                    const unsigned int   numberCoreStates,
+                                    const MPI_Comm &     mpiCommParent,
+                                    const MPI_Comm &     interBandGroupComm,
+                                    const MPI_Comm &     mpi_communicator,
+                                    const bool           useMixedPrec,
+                                    std::vector<double> &eigenValues,
+                                    const dftParameters &dftParams)
 
     {
       dealii::ConditionalOStream pcout(
@@ -1611,15 +1626,20 @@ namespace dftfe
       if (useMixedPrec && dftParams.useMixedPrecXTHXSpectrumSplit)
         {
           computing_timer.enter_subsection("Blocked XtHX Mixed Prec, RR step");
-          operatorMatrix.XtHXMixedPrec(
-            X, numberWaveFunctions, numberCoreStates, processGrid, projHamPar);
+          operatorMatrix.XtHXMixedPrec(X,
+                                       numberWaveFunctions,
+                                       numberCoreStates,
+                                       localVectorSize,
+                                       processGrid,
+                                       projHamPar);
 
           computing_timer.leave_subsection("Blocked XtHX Mixed Prec, RR step");
         }
       else
         {
           computing_timer.enter_subsection("Blocked XtHX, RR step");
-          operatorMatrix.XtHX(X, numberWaveFunctions, processGrid, projHamPar);
+          operatorMatrix.XtHX(
+            X, numberWaveFunctions, localVectorSize, processGrid, projHamPar);
           computing_timer.leave_subsection("Blocked XtHX, RR step");
         }
 
@@ -1766,9 +1786,10 @@ namespace dftfe
 
       computing_timer.enter_subsection("Blocked subspace rotation, RR step");
 
-      internal::subspaceRotationSpectrumSplit(&X[0],
-                                              &Y[0],
-                                              X.size(),
+      internal::subspaceRotationSpectrumSplit(X,
+                                              Y,
+                                              numberWaveFunctions *
+                                                localVectorSize,
                                               numberWaveFunctions,
                                               processGrid,
                                               numberWaveFunctions -
@@ -2373,8 +2394,10 @@ namespace dftfe
     template <typename T>
     void
     computeEigenResidualNorm(operatorDFTClass &         operatorMatrix,
-                             std::vector<T> &           X,
+                             T *                        X,
                              const std::vector<double> &eigenValues,
+                             const unsigned int         totalNumberVectors,
+                             const unsigned int         localVectorSize,
                              const MPI_Comm &           mpiCommParent,
                              const MPI_Comm &           mpiCommDomain,
                              const MPI_Comm &           interBandGroupComm,
@@ -2385,8 +2408,6 @@ namespace dftfe
       //
       // get the number of eigenVectors
       //
-      const unsigned int  totalNumberVectors = eigenValues.size();
-      const unsigned int  localVectorSize    = X.size() / totalNumberVectors;
       std::vector<double> residualNormSquare(totalNumberVectors, 0.0);
 
       // band group parallelization data structures
@@ -3006,8 +3027,9 @@ namespace dftfe
     void
     densityMatrixEigenBasisFirstOrderResponse(
       operatorDFTClass &         operatorMatrix,
-      std::vector<T> &           X,
+      T *                        X,
       const unsigned int         N,
+      const unsigned int         numberLocalDofs,
       const MPI_Comm &           mpiCommParent,
       const MPI_Comm &           mpiCommDomain,
       const MPI_Comm &           interBandGroupComm,
@@ -3050,10 +3072,11 @@ namespace dftfe
       if (dftParams.singlePrecLRD)
         {
           operatorMatrix.XtHXMixedPrec(
-            X, N, N, processGrid, projHamPrimePar, true);
+            X, N, N, numberLocalDofs, processGrid, projHamPrimePar, true);
         }
       else
-        operatorMatrix.XtHX(X, N, processGrid, projHamPrimePar, true);
+        operatorMatrix.XtHX(
+          X, N, numberLocalDofs, processGrid, projHamPrimePar, true);
       computing_timer.leave_subsection("Compute ProjHamPrime, DMFOR step");
 
 
@@ -3165,8 +3188,8 @@ namespace dftfe
         {
           if (std::is_same<T, std::complex<double>>::value)
             internal::subspaceRotationMixedPrec<T, std::complex<float>>(
-              &X[0],
-              X.size(),
+              X,
+              numberLocalDofs * N,
               N,
               processGrid,
               interBandGroupComm,
@@ -3177,8 +3200,8 @@ namespace dftfe
               false);
           else
             internal::subspaceRotationMixedPrec<T, float>(
-              &X[0],
-              X.size(),
+              X,
+              numberLocalDofs * N,
               N,
               processGrid,
               interBandGroupComm,
@@ -3190,8 +3213,8 @@ namespace dftfe
         }
       else
         {
-          internal::subspaceRotation(&X[0],
-                                     X.size(),
+          internal::subspaceRotation(X,
+                                     numberLocalDofs * N,
                                      N,
                                      processGrid,
                                      interBandGroupComm,
@@ -3237,14 +3260,16 @@ namespace dftfe
 
 
     template void
-    gramSchmidtOrthogonalization(std::vector<dataTypes::number> &,
+    gramSchmidtOrthogonalization(dataTypes::number *,
                                  const unsigned int,
+                                 const unsigned int localVectorSize,
                                  const MPI_Comm &);
 
     template unsigned int
     pseudoGramSchmidtOrthogonalization(elpaScalaManager &elpaScala,
-                                       std::vector<dataTypes::number> &,
+                                       dataTypes::number *,
                                        const unsigned int,
+                                       const unsigned int localVectorSize,
                                        const MPI_Comm &,
                                        const MPI_Comm &,
                                        const MPI_Comm &     mpiComm,
@@ -3254,8 +3279,9 @@ namespace dftfe
     template void
     rayleighRitz(operatorDFTClass &operatorMatrix,
                  elpaScalaManager &elpaScala,
-                 std::vector<dataTypes::number> &,
+                 dataTypes::number *,
                  const unsigned int numberWaveFunctions,
+                 const unsigned int localVectorSize,
                  const MPI_Comm &,
                  const MPI_Comm &,
                  const MPI_Comm &,
@@ -3266,8 +3292,9 @@ namespace dftfe
     template void
     rayleighRitzGEP(operatorDFTClass &operatorMatrix,
                     elpaScalaManager &elpaScala,
-                    std::vector<dataTypes::number> &,
+                    dataTypes::number *,
                     const unsigned int numberWaveFunctions,
+                    const unsigned int localVectorSize,
                     const MPI_Comm &,
                     const MPI_Comm &,
                     const MPI_Comm &,
@@ -3279,9 +3306,10 @@ namespace dftfe
     template void
     rayleighRitzSpectrumSplitDirect(operatorDFTClass &operatorMatrix,
                                     elpaScalaManager &elpaScala,
-                                    const std::vector<dataTypes::number> &,
-                                    std::vector<dataTypes::number> &,
+                                    const dataTypes::number *,
+                                    dataTypes::number *,
                                     const unsigned int numberWaveFunctions,
+                                    const unsigned int localVectorSize,
                                     const unsigned int numberCoreStates,
                                     const MPI_Comm &,
                                     const MPI_Comm &,
@@ -3291,11 +3319,12 @@ namespace dftfe
                                     const dftParameters &dftParams);
 
     template void
-    rayleighRitzGEPSpectrumSplitDirect(operatorDFTClass &operatorMatrix,
-                                       elpaScalaManager &elpaScala,
-                                       std::vector<dataTypes::number> &X,
-                                       std::vector<dataTypes::number> &Y,
+    rayleighRitzGEPSpectrumSplitDirect(operatorDFTClass &   operatorMatrix,
+                                       elpaScalaManager &   elpaScala,
+                                       dataTypes::number *  X,
+                                       dataTypes::number *  Y,
                                        const unsigned int   numberWaveFunctions,
+                                       const unsigned int   localVectorSize,
                                        const unsigned int   numberCoreStates,
                                        const MPI_Comm &     mpiCommParent,
                                        const MPI_Comm &     interBandGroupComm,
@@ -3305,28 +3334,31 @@ namespace dftfe
                                        const dftParameters &dftParams);
 
     template void
-    computeEigenResidualNorm(operatorDFTClass &              operatorMatrix,
-                             std::vector<dataTypes::number> &X,
-                             const std::vector<double> &     eigenValues,
-                             const MPI_Comm &                mpiCommParent,
-                             const MPI_Comm &                mpiCommDomain,
-                             const MPI_Comm &                interBandGroupComm,
-                             std::vector<double> &           residualNorm,
-                             const dftParameters &           dftParams);
+    computeEigenResidualNorm(operatorDFTClass &         operatorMatrix,
+                             dataTypes::number *        X,
+                             const std::vector<double> &eigenValues,
+                             const unsigned int         totalNumberVectors,
+                             const unsigned int         localVectorSize,
+                             const MPI_Comm &           mpiCommParent,
+                             const MPI_Comm &           mpiCommDomain,
+                             const MPI_Comm &           interBandGroupComm,
+                             std::vector<double> &      residualNorm,
+                             const dftParameters &      dftParams);
 
     template void
     densityMatrixEigenBasisFirstOrderResponse(
-      operatorDFTClass &              operatorMatrix,
-      std::vector<dataTypes::number> &X,
-      const unsigned int              N,
-      const MPI_Comm &                mpiCommParent,
-      const MPI_Comm &                mpiCommDomain,
-      const MPI_Comm &                interBandGroupComm,
-      const std::vector<double> &     eigenValues,
-      const double                    fermiEnergy,
-      std::vector<double> &           densityMatDerFermiEnergy,
-      elpaScalaManager &              elpaScala,
-      const dftParameters &           dftParams);
+      operatorDFTClass &         operatorMatrix,
+      dataTypes::number *        X,
+      const unsigned int         N,
+      const unsigned int         numberLocalDofs,
+      const MPI_Comm &           mpiCommParent,
+      const MPI_Comm &           mpiCommDomain,
+      const MPI_Comm &           interBandGroupComm,
+      const std::vector<double> &eigenValues,
+      const double               fermiEnergy,
+      std::vector<double> &      densityMatDerFermiEnergy,
+      elpaScalaManager &         elpaScala,
+      const dftParameters &      dftParams);
 
   } // namespace linearAlgebraOperations
 
diff --git a/src/linAlg/pseudoGS.cc b/src/linAlg/pseudoGS.cc
index 1703e083c..8eb1898a4 100644
--- a/src/linAlg/pseudoGS.cc
+++ b/src/linAlg/pseudoGS.cc
@@ -29,8 +29,9 @@ namespace dftfe
     template <typename T>
     unsigned int
     pseudoGramSchmidtOrthogonalization(elpaScalaManager &   elpaScala,
-                                       std::vector<T> &     X,
+                                       T *                  X,
                                        const unsigned int   numberVectors,
+                                       const unsigned int   numLocalDofs,
                                        const MPI_Comm &     mpiCommParent,
                                        const MPI_Comm &     interBandGroupComm,
                                        const MPI_Comm &     mpiComm,
@@ -38,8 +39,6 @@ namespace dftfe
                                        const dftParameters &dftParams)
 
     {
-      const unsigned int numLocalDofs = X.size() / numberVectors;
-
       dealii::ConditionalOStream pcout(
         std::cout,
         (dealii::Utilities::MPI::this_mpi_process(mpiCommParent) == 0));
@@ -74,8 +73,8 @@ namespace dftfe
       if (!(dftParams.useMixedPrecCGS_O && useMixedPrec))
         {
           computing_timer.enter_subsection("Fill overlap matrix CGS");
-          internal::fillParallelOverlapMatrix(&X[0],
-                                              X.size(),
+          internal::fillParallelOverlapMatrix(X,
+                                              numberVectors * numLocalDofs,
                                               numberVectors,
                                               processGrid,
                                               interBandGroupComm,
@@ -91,8 +90,8 @@ namespace dftfe
           if (std::is_same<T, std::complex<double>>::value)
             internal::fillParallelOverlapMatrixMixedPrec<T,
                                                          std::complex<float>>(
-              &X[0],
-              X.size(),
+              X,
+              numberVectors * numLocalDofs,
               numberVectors,
               processGrid,
               interBandGroupComm,
@@ -101,8 +100,8 @@ namespace dftfe
               dftParams);
           else
             internal::fillParallelOverlapMatrixMixedPrec<T, float>(
-              &X[0],
-              X.size(),
+              X,
+              numberVectors * numLocalDofs,
               numberVectors,
               processGrid,
               interBandGroupComm,
@@ -218,8 +217,8 @@ namespace dftfe
       if (!(dftParams.useMixedPrecCGS_SR && useMixedPrec))
         {
           computing_timer.enter_subsection("Subspace rotation CGS");
-          internal::subspaceRotation(&X[0],
-                                     X.size(),
+          internal::subspaceRotation(X,
+                                     numberVectors * numLocalDofs,
                                      numberVectors,
                                      processGrid,
                                      interBandGroupComm,
@@ -235,8 +234,8 @@ namespace dftfe
           computing_timer.enter_subsection("Subspace rotation mixed prec CGS");
           if (std::is_same<T, std::complex<double>>::value)
             internal::subspaceRotationCGSMixedPrec<T, std::complex<float>>(
-              &X[0],
-              X.size(),
+              X,
+              numberVectors * numLocalDofs,
               numberVectors,
               processGrid,
               interBandGroupComm,
@@ -245,8 +244,9 @@ namespace dftfe
               dftParams,
               false);
           else
-            internal::subspaceRotationCGSMixedPrec<T, float>(&X[0],
-                                                             X.size(),
+            internal::subspaceRotationCGSMixedPrec<T, float>(X,
+                                                             numberVectors *
+                                                               numLocalDofs,
                                                              numberVectors,
                                                              processGrid,
                                                              interBandGroupComm,
diff --git a/src/poisson/poissonSolverProblemDevice.cc b/src/poisson/poissonSolverProblemDevice.cc
index 4881183dc..27f4ad642 100644
--- a/src/poisson/poissonSolverProblemDevice.cc
+++ b/src/poisson/poissonSolverProblemDevice.cc
@@ -785,10 +785,6 @@ namespace dftfe
     d_constraintsTotalPotentialInfo.initialize(
       d_matrixFreeDataPtr->get_vector_partitioner(d_matrixFreeVectorComponent),
       *d_constraintMatrixPtr);
-    d_constraintsTotalPotentialInfo.precomputeMaps(
-      d_matrixFreeDataPtr->get_vector_partitioner(d_matrixFreeVectorComponent),
-      d_xPtr->get_partitioner(),
-      1);
   }
 
 
diff --git a/src/solvers/eigenSolvers/chebyshevOrthogonalizedSubspaceIterationSolver.cc b/src/solvers/eigenSolvers/chebyshevOrthogonalizedSubspaceIterationSolver.cc
index 11ac14c22..36ce39a4f 100644
--- a/src/solvers/eigenSolvers/chebyshevOrthogonalizedSubspaceIterationSolver.cc
+++ b/src/solvers/eigenSolvers/chebyshevOrthogonalizedSubspaceIterationSolver.cc
@@ -114,17 +114,18 @@ namespace dftfe
   //
   void
   chebyshevOrthogonalizedSubspaceIterationSolver::solve(
-    operatorDFTClass &              operatorMatrix,
-    elpaScalaManager &              elpaScala,
-    std::vector<dataTypes::number> &eigenVectorsFlattened,
-    std::vector<dataTypes::number> &eigenVectorsRotFracDensityFlattened,
-    const unsigned int              totalNumberWaveFunctions,
-    std::vector<double> &           eigenValues,
-    std::vector<double> &           residualNorms,
-    const MPI_Comm &                interBandGroupComm,
-    const bool                      computeResidual,
-    const bool                      useMixedPrec,
-    const bool                      isFirstScf)
+    operatorDFTClass &   operatorMatrix,
+    elpaScalaManager &   elpaScala,
+    dataTypes::number *  eigenVectorsFlattened,
+    dataTypes::number *  eigenVectorsRotFracDensityFlattened,
+    const unsigned int   totalNumberWaveFunctions,
+    const unsigned int   localVectorSize,
+    std::vector<double> &eigenValues,
+    std::vector<double> &residualNorms,
+    const MPI_Comm &     interBandGroupComm,
+    const bool           computeResidual,
+    const bool           useMixedPrec,
+    const bool           isFirstScf)
   {
     dealii::TimerOutput computingTimerStandard(
       operatorMatrix.getMPICommunicator(),
@@ -185,8 +186,6 @@ namespace dftfe
       dftUtils::printCurrentMemoryUsage(operatorMatrix.getMPICommunicator(),
                                         "Before starting chebyshev filtering");
 
-    const unsigned int localVectorSize =
-      eigenVectorsFlattened.size() / totalNumberWaveFunctions;
 
 
     // band group parallelization data structures
@@ -244,9 +243,9 @@ namespace dftfe
             computing_timer.enter_subsection(
               "Copy from full to block flattened array");
             for (unsigned int iNode = 0; iNode < localVectorSize; ++iNode)
-              std::copy(eigenVectorsFlattened.data() +
+              std::copy(eigenVectorsFlattened +
                           iNode * totalNumberWaveFunctions + jvec,
-                        eigenVectorsFlattened.data() +
+                        eigenVectorsFlattened +
                           iNode * totalNumberWaveFunctions + jvec + BVec,
                         eigenVectorsFlattenedArrayBlock.data() + iNode * BVec);
             computing_timer.leave_subsection(
@@ -321,7 +320,7 @@ namespace dftfe
               std::copy(eigenVectorsFlattenedArrayBlock.data() + iNode * BVec,
                         eigenVectorsFlattenedArrayBlock.data() +
                           (iNode + 1) * BVec,
-                        eigenVectorsFlattened.data() +
+                        eigenVectorsFlattened +
                           iNode * totalNumberWaveFunctions + jvec);
 
             computing_timer.leave_subsection(
@@ -358,9 +357,9 @@ namespace dftfe
                   std::min(blockSize,
                            totalNumberWaveFunctions * localVectorSize - i);
                 MPI_Allreduce(MPI_IN_PLACE,
-                              &eigenVectorsFlattened[0] + i,
+                              eigenVectorsFlattened + i,
                               currentBlockSize,
-                              dataTypes::mpi_type_id(&eigenVectorsFlattened[0]),
+                              dataTypes::mpi_type_id(eigenVectorsFlattened),
                               MPI_SUM,
                               interBandGroupComm);
               }
@@ -446,6 +445,7 @@ namespace dftfe
               eigenVectorsFlattened,
               eigenVectorsRotFracDensityFlattened,
               totalNumberWaveFunctions,
+              localVectorSize,
               totalNumberWaveFunctions - eigenValues.size(),
               d_mpiCommParent,
               interBandGroupComm,
@@ -461,6 +461,7 @@ namespace dftfe
               elpaScala,
               eigenVectorsFlattened,
               totalNumberWaveFunctions,
+              localVectorSize,
               d_mpiCommParent,
               interBandGroupComm,
               operatorMatrix.getMPICommunicator(),
@@ -477,6 +478,8 @@ namespace dftfe
               operatorMatrix,
               eigenVectorsRotFracDensityFlattened,
               eigenValues,
+              eigenValues.size(),
+              localVectorSize,
               d_mpiCommParent,
               operatorMatrix.getMPICommunicator(),
               interBandGroupComm,
@@ -489,6 +492,8 @@ namespace dftfe
               operatorMatrix,
               eigenVectorsFlattened,
               eigenValues,
+              totalNumberWaveFunctions,
+              localVectorSize,
               d_mpiCommParent,
               operatorMatrix.getMPICommunicator(),
               interBandGroupComm,
@@ -503,6 +508,7 @@ namespace dftfe
         linearAlgebraOperations::gramSchmidtOrthogonalization(
           eigenVectorsFlattened,
           totalNumberWaveFunctions,
+          localVectorSize,
           operatorMatrix.getMPICommunicator());
         computing_timer.leave_subsection("Gram-Schmidt Orthogn Opt");
 
@@ -519,6 +525,7 @@ namespace dftfe
               eigenVectorsFlattened,
               eigenVectorsRotFracDensityFlattened,
               totalNumberWaveFunctions,
+              localVectorSize,
               totalNumberWaveFunctions - eigenValues.size(),
               d_mpiCommParent,
               interBandGroupComm,
@@ -534,6 +541,7 @@ namespace dftfe
               elpaScala,
               eigenVectorsFlattened,
               totalNumberWaveFunctions,
+              localVectorSize,
               d_mpiCommParent,
               interBandGroupComm,
               operatorMatrix.getMPICommunicator(),
@@ -561,6 +569,8 @@ namespace dftfe
                   operatorMatrix,
                   eigenVectorsRotFracDensityFlattened,
                   eigenValues,
+                  eigenValues.size(),
+                  localVectorSize,
                   d_mpiCommParent,
                   operatorMatrix.getMPICommunicator(),
                   interBandGroupComm,
@@ -572,6 +582,8 @@ namespace dftfe
                 operatorMatrix,
                 eigenVectorsFlattened,
                 eigenValues,
+                totalNumberWaveFunctions,
+                localVectorSize,
                 d_mpiCommParent,
                 operatorMatrix.getMPICommunicator(),
                 interBandGroupComm,
diff --git a/src/symmetry/symmetrizeRho.cc b/src/symmetry/symmetrizeRho.cc
index ecc579bee..5ee927dee 100644
--- a/src/symmetry/symmetrizeRho.cc
+++ b/src/symmetry/symmetrizeRho.cc
@@ -223,7 +223,7 @@ namespace dftfe
       dftPtr->d_kPointWeights.size());
 
     const unsigned int localVectorSize =
-      dftPtr->d_eigenVectorsFlattenedSTL[0].size() / dftPtr->d_numEigenValues;
+      dftPtr->matrix_free_data.get_vector_partitioner()->locally_owned_size();
 
     distributedCPUVec<dataTypes::number> eigenVectorsFlattenedArrayFullBlock;
     vectorTools::createDealiiVector<dataTypes::number>(
@@ -231,11 +231,6 @@ namespace dftfe
       dftPtr->d_numEigenValues,
       eigenVectorsFlattenedArrayFullBlock);
 
-    dftPtr->constraintsNoneDataInfo.precomputeMaps(
-      dftPtr->matrix_free_data.get_vector_partitioner(),
-      eigenVectorsFlattenedArrayFullBlock.get_partitioner(),
-      dftPtr->d_numEigenValues);
-
     for (unsigned int kPoint = 0;
          kPoint < (1 + dftPtr->getParametersObject().spinPolarized) *
                     dftPtr->d_kPointWeights.size();
@@ -250,8 +245,9 @@ namespace dftfe
                ++iWave)
             eigenVectorsFlattenedArrayFullBlock.local_element(
               iNode * dftPtr->d_numEigenValues + iWave) =
-              dftPtr->d_eigenVectorsFlattenedSTL
-                [kPoint][iNode * dftPtr->d_numEigenValues + iWave];
+              dftPtr->d_eigenVectorsFlattenedHost
+                [kPoint * localVectorSize * dftPtr->d_numEigenValues +
+                 iNode * dftPtr->d_numEigenValues + iWave];
 
         dftPtr->constraintsNoneDataInfo.distribute(
           eigenVectorsFlattenedArrayFullBlock, dftPtr->d_numEigenValues);
diff --git a/testsGPU/pseudopotential/complex/jobscripts/frontierJobScript6GCDs6MPITasks.rc b/testsGPU/pseudopotential/complex/jobscripts/frontierJobScript6GCDs6MPITasks.rc
new file mode 100644
index 000000000..aa420fa35
--- /dev/null
+++ b/testsGPU/pseudopotential/complex/jobscripts/frontierJobScript6GCDs6MPITasks.rc
@@ -0,0 +1,32 @@
+#!/ccs/home/dsambit/frontier/bin/rc
+#SBATCH -A mat239
+#SBATCH -J gputests
+#SBATCH -t 1:00:00
+#SBATCH -p batch
+#SBATCH -N 1
+#SBATCH --gpus-per-node 6
+#SBATCH --ntasks-per-gpu 1
+#SBATCH --gpu-bind closest
+
+OMP_NUM_THREADS = 1
+MPICH_VERSION_DISPLAY=1
+MPICH_ENV_DISPLAY=1
+MPICH_OFI_NIC_POLICY = NUMA 
+MPICH_GPU_SUPPORT_ENABLED=1
+MPICH_SMP_SINGLE_COPY_MODE=NONE
+
+LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$INST/lib
+LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$INST/lib/lib64
+LD_LIBRARY_PATH=$CRAY_LD_LIBRARY_PATH:$LD_LIBRARY_PATH
+
+
+BASE = $WD/src/dftfeDebug/build/release/complex # directory of the dftfe executable launched by every srun below
+n=`{echo $SLURM_JOB_NUM_NODES '*' 8 | bc} # NOTE(review): n is never referenced anywhere else in this script
+
+srun -n 6 -c 7 --gpu-bind closest $BASE/dftfe parameterFileMg2x_1.prm > outputMg2x_1 # each run: 6 tasks, stdout redirected to a per-case output file
+srun -n 6 -c 7 --gpu-bind closest $BASE/dftfe parameterFileMg2x_2.prm > outputMg2x_2
+srun -n 6 -c 7 --gpu-bind closest $BASE/dftfe parameterFileMg2x_3.prm > outputMg2x_3
+srun -n 6 -c 7 --gpu-bind closest $BASE/dftfe parameterFileMg2x_4.prm > outputMg2x_4
+srun -n 6 -c 7 --gpu-bind closest $BASE/dftfe parameterFileMg2x_5.prm > outputMg2x_5
+srun -n 6 -c 7 --gpu-bind closest $BASE/dftfe parameterFileMg2x_6.prm > outputMg2x_6
+srun -n 6 -c 7 --gpu-bind closest $BASE/dftfe parameterFileBe.prm > outputBe
diff --git a/testsGPU/pseudopotential/real/jobscripts/frontierJobScript18GCDs18MPITasks.rc b/testsGPU/pseudopotential/real/jobscripts/frontierJobScript18GCDs18MPITasks.rc
new file mode 100644
index 000000000..0659588b2
--- /dev/null
+++ b/testsGPU/pseudopotential/real/jobscripts/frontierJobScript18GCDs18MPITasks.rc
@@ -0,0 +1,41 @@
+#!/ccs/home/dsambit/frontier/bin/rc
+#SBATCH -A mat239
+#SBATCH -J gputests
+#SBATCH -t 1:00:00
+#SBATCH -p batch
+#SBATCH -N 3
+#SBATCH --gpus-per-node 6
+#SBATCH --ntasks-per-gpu 1
+#SBATCH --gpu-bind closest
+
+OMP_NUM_THREADS = 1
+MPICH_VERSION_DISPLAY=1
+MPICH_ENV_DISPLAY=1
+MPICH_OFI_NIC_POLICY = NUMA 
+MPICH_GPU_SUPPORT_ENABLED=1
+MPICH_SMP_SINGLE_COPY_MODE=NONE
+
+LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$INST/lib
+LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$INST/lib/lib64
+LD_LIBRARY_PATH=$CRAY_LD_LIBRARY_PATH:$LD_LIBRARY_PATH
+
+
+BASE = $WD/src/dftfeDebug/build/release/real # directory of the dftfe executable launched by every srun below
+n=`{echo $SLURM_JOB_NUM_NODES '*' 8 | bc} # NOTE(review): n is never referenced anywhere else in this script
+
+srun -n 18 -c 7 --gpu-bind closest $BASE/dftfe Input_MD_0.prm > output_MD_0 # each run: 18 tasks, stdout redirected to a per-case output file
+srun -n 18 -c 7 --gpu-bind closest $BASE/dftfe Input_MD_1.prm > output_MD_1
+srun -n 18 -c 7 --gpu-bind closest $BASE/dftfe Input_MD_2.prm > output_MD_2
+srun -n 18 -c 7 --gpu-bind closest $BASE/dftfe parameterFileMg2x_1.prm > outputMg2x_1
+srun -n 18 -c 7 --gpu-bind closest $BASE/dftfe parameterFileMg2x_1_spingpu.prm > outputMg2x_1_spin_gpu
+srun -n 18 -c 7 --gpu-bind closest $BASE/dftfe parameterFileMg2x_2.prm > outputMg2x_2
+srun -n 18 -c 7 --gpu-bind closest $BASE/dftfe parameterFileMg2x_3.prm > outputMg2x_3
+srun -n 18 -c 7 --gpu-bind closest $BASE/dftfe parameterFileMg2x_4.prm > outputMg2x_4
+srun -n 18 -c 7 --gpu-bind closest $BASE/dftfe parameterFileMg2x_5.prm > outputMg2x_5
+srun -n 18 -c 7 --gpu-bind closest $BASE/dftfe parameterFileMg2x_6.prm > outputMg2x_6
+srun -n 18 -c 7 --gpu-bind closest $BASE/dftfe parameterFileMg2x_7.prm > outputMg2x_7
+srun -n 18 -c 7 --gpu-bind closest $BASE/dftfe parameterFileMg2x_12.prm > outputMg2x_12
+srun -n 18 -c 7 --gpu-bind closest $BASE/dftfe parameterFileMg2x_13.prm > outputMg2x_13
+srun -n 18 -c 7 --gpu-bind closest $BASE/dftfe parameterFileBe.prm > outputBe
+
+
diff --git a/testsGPU/pseudopotential/real/jobscripts/frontierJobScript6GCDs6MPITasks.rc b/testsGPU/pseudopotential/real/jobscripts/frontierJobScript6GCDs6MPITasks.rc
new file mode 100644
index 000000000..9c051b5e5
--- /dev/null
+++ b/testsGPU/pseudopotential/real/jobscripts/frontierJobScript6GCDs6MPITasks.rc
@@ -0,0 +1,39 @@
+#!/ccs/home/dsambit/frontier/bin/rc
+#SBATCH -A mat239
+#SBATCH -J gputests
+#SBATCH -t 1:00:00
+#SBATCH -p batch
+#SBATCH -N 1
+#SBATCH --gpus-per-node 6
+#SBATCH --ntasks-per-gpu 1
+#SBATCH --gpu-bind closest
+
+OMP_NUM_THREADS = 1
+MPICH_VERSION_DISPLAY=1
+MPICH_ENV_DISPLAY=1
+MPICH_OFI_NIC_POLICY = NUMA 
+MPICH_GPU_SUPPORT_ENABLED=1
+MPICH_SMP_SINGLE_COPY_MODE=NONE
+
+LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$INST/lib
+LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$INST/lib/lib64
+LD_LIBRARY_PATH=$CRAY_LD_LIBRARY_PATH:$LD_LIBRARY_PATH
+
+
+BASE = $WD/src/dftfeDebug/build/release/real # directory of the dftfe executable launched by every srun below
+n=`{echo $SLURM_JOB_NUM_NODES '*' 8 | bc} # NOTE(review): n is never referenced anywhere else in this script
+
+srun -n 6 -c 7 --gpu-bind closest $BASE/dftfe Input_MD_0.prm > output_MD_0 # 6 tasks = 1 node x 6 GPUs x 1 task per GPU, as requested in the SBATCH header
+srun -n 6 -c 7 --gpu-bind closest $BASE/dftfe Input_MD_1.prm > output_MD_1
+srun -n 6 -c 7 --gpu-bind closest $BASE/dftfe Input_MD_2.prm > output_MD_2
+srun -n 6 -c 7 --gpu-bind closest $BASE/dftfe parameterFileMg2x_1.prm > outputMg2x_1
+srun -n 6 -c 7 --gpu-bind closest $BASE/dftfe parameterFileMg2x_1_spingpu.prm > outputMg2x_1_spin_gpu
+srun -n 6 -c 7 --gpu-bind closest $BASE/dftfe parameterFileMg2x_2.prm > outputMg2x_2
+srun -n 6 -c 7 --gpu-bind closest $BASE/dftfe parameterFileMg2x_3.prm > outputMg2x_3
+srun -n 6 -c 7 --gpu-bind closest $BASE/dftfe parameterFileMg2x_4.prm > outputMg2x_4
+srun -n 6 -c 7 --gpu-bind closest $BASE/dftfe parameterFileMg2x_5.prm > outputMg2x_5
+srun -n 6 -c 7 --gpu-bind closest $BASE/dftfe parameterFileMg2x_6.prm > outputMg2x_6
+srun -n 6 -c 7 --gpu-bind closest $BASE/dftfe parameterFileMg2x_7.prm > outputMg2x_7
+srun -n 6 -c 7 --gpu-bind closest $BASE/dftfe parameterFileMg2x_12.prm > outputMg2x_12
+srun -n 6 -c 7 --gpu-bind closest $BASE/dftfe parameterFileMg2x_13.prm > outputMg2x_13
+srun -n 6 -c 7 --gpu-bind closest $BASE/dftfe parameterFileBe.prm > outputBe
diff --git a/utils/DeviceBlasWrapper.cu.cc b/utils/DeviceBlasWrapper.cu.cc
index 6ed34f28a..dede70848 100644
--- a/utils/DeviceBlasWrapper.cu.cc
+++ b/utils/DeviceBlasWrapper.cu.cc
@@ -474,6 +474,108 @@ namespace dftfe
         return status;
       }
 
+      deviceBlasStatus_t
+      gemv(deviceBlasHandle_t    handle,
+           deviceBlasOperation_t trans,
+           int                   m,
+           int                   n,
+           const double *        alpha,
+           const double *        A,
+           int                   lda,
+           const double *        x,
+           int                   incx,
+           const double *        beta,
+           double *              y,
+           int                   incy)
+      { // double overload of gemv: every argument is forwarded unchanged to cublasDgemv
+        deviceBlasStatus_t status = cublasDgemv(
+          handle, trans, m, n, alpha, A, lda, x, incx, beta, y, incy);
+        DEVICEBLAS_API_CHECK(status); // NOTE(review): macro is defined outside this hunk; presumably reports a failed status - confirm
+        return status; // the cuBLAS status is also handed back to the caller
+      }
+
+      deviceBlasStatus_t
+      gemv(deviceBlasHandle_t    handle,
+           deviceBlasOperation_t trans,
+           int                   m,
+           int                   n,
+           const float *         alpha,
+           const float *         A,
+           int                   lda,
+           const float *         x,
+           int                   incx,
+           const float *         beta,
+           float *               y,
+           int                   incy)
+      { // float overload of gemv: every argument is forwarded unchanged to cublasSgemv
+        deviceBlasStatus_t status = cublasSgemv(
+          handle, trans, m, n, alpha, A, lda, x, incx, beta, y, incy);
+        DEVICEBLAS_API_CHECK(status); // NOTE(review): macro is defined outside this hunk; presumably reports a failed status - confirm
+        return status; // the cuBLAS status is also handed back to the caller
+      }
+
+      deviceBlasStatus_t
+      gemv(deviceBlasHandle_t          handle,
+           deviceBlasOperation_t       trans,
+           int                         m,
+           int                         n,
+           const std::complex<double> *alpha,
+           const std::complex<double> *A,
+           int                         lda,
+           const std::complex<double> *x,
+           int                         incx,
+           const std::complex<double> *beta,
+           std::complex<double> *      y,
+           int                         incy)
+      { // std::complex<double> overload of gemv, implemented with cublasZgemv
+        deviceBlasStatus_t status =
+          cublasZgemv(handle,
+                      trans,
+                      m,
+                      n,
+                      dftfe::utils::makeDataTypeDeviceCompatible(alpha), // every std::complex pointer goes through this helper before reaching cuBLAS
+                      dftfe::utils::makeDataTypeDeviceCompatible(A),
+                      lda,
+                      dftfe::utils::makeDataTypeDeviceCompatible(x),
+                      incx,
+                      dftfe::utils::makeDataTypeDeviceCompatible(beta),
+                      dftfe::utils::makeDataTypeDeviceCompatible(y),
+                      incy);
+        DEVICEBLAS_API_CHECK(status); // NOTE(review): macro is defined outside this hunk; presumably reports a failed status - confirm
+        return status; // the cuBLAS status is also handed back to the caller
+      }
+
+      deviceBlasStatus_t
+      gemv(deviceBlasHandle_t         handle,
+           deviceBlasOperation_t      trans,
+           int                        m,
+           int                        n,
+           const std::complex<float> *alpha,
+           const std::complex<float> *A,
+           int                        lda,
+           const std::complex<float> *x,
+           int                        incx,
+           const std::complex<float> *beta,
+           std::complex<float> *      y,
+           int                        incy)
+      { // std::complex<float> overload of gemv, implemented with cublasCgemv
+        deviceBlasStatus_t status =
+          cublasCgemv(handle,
+                      trans,
+                      m,
+                      n,
+                      dftfe::utils::makeDataTypeDeviceCompatible(alpha), // every std::complex pointer goes through this helper before reaching cuBLAS
+                      dftfe::utils::makeDataTypeDeviceCompatible(A),
+                      lda,
+                      dftfe::utils::makeDataTypeDeviceCompatible(x),
+                      incx,
+                      dftfe::utils::makeDataTypeDeviceCompatible(beta),
+                      dftfe::utils::makeDataTypeDeviceCompatible(y),
+                      incy);
+        DEVICEBLAS_API_CHECK(status); // NOTE(review): macro is defined outside this hunk; presumably reports a failed status - confirm
+        return status; // the cuBLAS status is also handed back to the caller
+      }
+
     } // namespace deviceBlasWrapper
   }   // namespace utils
 } // namespace dftfe
diff --git a/utils/DeviceBlasWrapper.hip.cc b/utils/DeviceBlasWrapper.hip.cc
index ec6a5b316..f24a55ccf 100644
--- a/utils/DeviceBlasWrapper.hip.cc
+++ b/utils/DeviceBlasWrapper.hip.cc
@@ -517,6 +517,107 @@ namespace dftfe
         DEVICEBLAS_API_CHECK(status);
         return status;
       }
+      deviceBlasStatus_t
+      gemv(deviceBlasHandle_t    handle,
+           deviceBlasOperation_t trans,
+           int                   m,
+           int                   n,
+           const double *        alpha,
+           const double *        A,
+           int                   lda,
+           const double *        x,
+           int                   incx,
+           const double *        beta,
+           double *              y,
+           int                   incy)
+      { // double overload of gemv: every argument is forwarded unchanged to hipblasDgemv
+        deviceBlasStatus_t status = hipblasDgemv(
+          handle, trans, m, n, alpha, A, lda, x, incx, beta, y, incy);
+        DEVICEBLAS_API_CHECK(status); // NOTE(review): macro is defined outside this hunk; presumably reports a failed status - confirm
+        return status; // the hipBLAS status is also handed back to the caller
+      }
+
+      deviceBlasStatus_t
+      gemv(deviceBlasHandle_t    handle,
+           deviceBlasOperation_t trans,
+           int                   m,
+           int                   n,
+           const float *         alpha,
+           const float *         A,
+           int                   lda,
+           const float *         x,
+           int                   incx,
+           const float *         beta,
+           float *               y,
+           int                   incy)
+      { // float overload of gemv: every argument is forwarded unchanged to hipblasSgemv
+        deviceBlasStatus_t status = hipblasSgemv(
+          handle, trans, m, n, alpha, A, lda, x, incx, beta, y, incy);
+        DEVICEBLAS_API_CHECK(status); // NOTE(review): macro is defined outside this hunk; presumably reports a failed status - confirm
+        return status; // the hipBLAS status is also handed back to the caller
+      }
+
+      deviceBlasStatus_t
+      gemv(deviceBlasHandle_t          handle,
+           deviceBlasOperation_t       trans,
+           int                         m,
+           int                         n,
+           const std::complex<double> *alpha,
+           const std::complex<double> *A,
+           int                         lda,
+           const std::complex<double> *x,
+           int                         incx,
+           const std::complex<double> *beta,
+           std::complex<double> *      y,
+           int                         incy)
+      { // std::complex<double> overload of gemv, implemented with hipblasZgemv
+        deviceBlasStatus_t status =
+          hipblasZgemv(handle,
+                       trans,
+                       m,
+                       n,
+                       makeDataTypeHipBlasCompatible(alpha), // every std::complex pointer goes through this helper before reaching hipBLAS
+                       makeDataTypeHipBlasCompatible(A),
+                       lda,
+                       makeDataTypeHipBlasCompatible(x),
+                       incx,
+                       makeDataTypeHipBlasCompatible(beta),
+                       makeDataTypeHipBlasCompatible(y),
+                       incy);
+        DEVICEBLAS_API_CHECK(status); // NOTE(review): macro is defined outside this hunk; presumably reports a failed status - confirm
+        return status; // the hipBLAS status is also handed back to the caller
+      }
+
+      deviceBlasStatus_t
+      gemv(deviceBlasHandle_t         handle,
+           deviceBlasOperation_t      trans,
+           int                        m,
+           int                        n,
+           const std::complex<float> *alpha,
+           const std::complex<float> *A,
+           int                        lda,
+           const std::complex<float> *x,
+           int                        incx,
+           const std::complex<float> *beta,
+           std::complex<float> *      y,
+           int                        incy)
+      { // std::complex<float> overload of gemv, implemented with hipblasCgemv
+        deviceBlasStatus_t status =
+          hipblasCgemv(handle,
+                       trans,
+                       m,
+                       n,
+                       makeDataTypeHipBlasCompatible(alpha), // every std::complex pointer goes through this helper before reaching hipBLAS
+                       makeDataTypeHipBlasCompatible(A),
+                       lda,
+                       makeDataTypeHipBlasCompatible(x),
+                       incx,
+                       makeDataTypeHipBlasCompatible(beta),
+                       makeDataTypeHipBlasCompatible(y),
+                       incy);
+        DEVICEBLAS_API_CHECK(status); // NOTE(review): macro is defined outside this hunk; presumably reports a failed status - confirm
+        return status; // the hipBLAS status is also handed back to the caller
+      }
 
     } // namespace deviceBlasWrapper
   }   // namespace utils
diff --git a/utils/DeviceKernelsGeneric.cc b/utils/DeviceKernelsGeneric.cc
index 54c517e28..56ecd913d 100644
--- a/utils/DeviceKernelsGeneric.cc
+++ b/utils/DeviceKernelsGeneric.cc
@@ -208,6 +208,37 @@ namespace dftfe
       }
     }
 
+    template <typename ValueType1, typename ValueType2>
+    __global__ void
+    stridedCopyConstantStrideDeviceKernel(const dftfe::size_type blockSize, // number of consecutive entries copied per block
+                                          const dftfe::size_type strideTo, // distance between the starts of consecutive blocks in copyToVec
+                                          const dftfe::size_type strideFrom, // distance between the starts of consecutive blocks in copyFromVec
+                                          const dftfe::size_type numBlocks, // number of blocks to copy
+                                          const dftfe::size_type startingToId, // offset added to every destination index
+                                          const dftfe::size_type startingFromId, // offset added to every source index
+                                          const ValueType1 *     copyFromVec, // source array (read only)
+                                          ValueType2 *           copyToVec) // destination array; element type may differ from the source
+    {
+      {
+        const dftfe::size_type globalThreadId =
+          blockIdx.x * blockDim.x + threadIdx.x;
+        const dftfe::size_type numberEntries = numBlocks * blockSize; // total number of entries copied by the whole grid
+
+        for (dftfe::size_type index = globalThreadId; index < numberEntries; // grid-stride loop over all entries
+             index += blockDim.x * gridDim.x)
+          {
+            dftfe::size_type blockIndex      = index / blockSize; // which block this entry belongs to
+            dftfe::size_type intraBlockIndex = index - blockIndex * blockSize; // position of the entry inside its block
+            dftfe::utils::copyValue( // NOTE(review): copyValue is defined elsewhere; presumably performs the ValueType1 -> ValueType2 conversion - confirm
+              copyToVec + blockIndex * strideTo + startingToId +
+                intraBlockIndex,
+              copyFromVec[blockIndex * strideFrom + startingFromId +
+                          intraBlockIndex]);
+          }
+      }
+    }
+
+
     // x=a*x, with inc=1
     template <typename ValueType1, typename ValueType2>
     __global__ void
@@ -586,6 +617,47 @@ namespace dftfe
 #endif
       }
 
+      template <typename ValueType1, typename ValueType2>
+      void
+      stridedCopyConstantStride(const dftfe::size_type blockSize, // host-side launcher: all arguments are passed on to stridedCopyConstantStrideDeviceKernel
+                                const dftfe::size_type strideTo,
+                                const dftfe::size_type strideFrom,
+                                const dftfe::size_type numBlocks,
+                                const dftfe::size_type startingToId,
+                                const dftfe::size_type startingFromId,
+                                const ValueType1 *     copyFromVec,
+                                ValueType2 *           copyToVec)
+      {
+#ifdef DFTFE_WITH_DEVICE_LANG_CUDA
+        stridedCopyConstantStrideDeviceKernel<<<
+          (blockSize * numBlocks) / dftfe::utils::DEVICE_BLOCK_SIZE + 1, // grid size: enough thread blocks for one thread per entry (always at least 1)
+          dftfe::utils::DEVICE_BLOCK_SIZE>>>( // threads per thread block
+          blockSize,
+          strideTo,
+          strideFrom,
+          numBlocks,
+          startingToId,
+          startingFromId,
+          dftfe::utils::makeDataTypeDeviceCompatible(copyFromVec), // both pointers go through this helper before being handed to the kernel
+          dftfe::utils::makeDataTypeDeviceCompatible(copyToVec));
+#elif DFTFE_WITH_DEVICE_LANG_HIP
+        hipLaunchKernelGGL( // HIP build: same kernel, grid size and block size as the CUDA branch
+          stridedCopyConstantStrideDeviceKernel,
+          (blockSize * numBlocks) / dftfe::utils::DEVICE_BLOCK_SIZE + 1,
+          dftfe::utils::DEVICE_BLOCK_SIZE,
+          0,
+          0,
+          blockSize,
+          strideTo,
+          strideFrom,
+          numBlocks,
+          startingToId,
+          startingFromId,
+          dftfe::utils::makeDataTypeDeviceCompatible(copyFromVec),
+          dftfe::utils::makeDataTypeDeviceCompatible(copyToVec));
+#endif
+      }
+
 
       template <typename ValueType1, typename ValueType2>
       void
@@ -1312,6 +1384,87 @@ namespace dftfe
                                          const dftfe::size_type startingId,
                                          const std::complex<float> *copyFromVec,
                                          std::complex<double> *     copyToVec);
+      // stridedCopyConstantStride: explicit instantiations, source and destination of the same type
+      template void
+      stridedCopyConstantStride(const dftfe::size_type blockSize,
+                                const dftfe::size_type strideTo,
+                                const dftfe::size_type strideFrom,
+                                const dftfe::size_type numBlocks,
+                                const dftfe::size_type startingToId,
+                                const dftfe::size_type startingFromId,
+                                const double *         copyFromVec,
+                                double *               copyToVec);
+
+      template void
+      stridedCopyConstantStride(const dftfe::size_type blockSize,
+                                const dftfe::size_type strideTo,
+                                const dftfe::size_type strideFrom,
+                                const dftfe::size_type numBlocks,
+                                const dftfe::size_type startingToId,
+                                const dftfe::size_type startingFromId,
+                                const float *          copyFromVec,
+                                float *                copyToVec);
+
+      template void
+      stridedCopyConstantStride(const dftfe::size_type      blockSize,
+                                const dftfe::size_type      strideTo,
+                                const dftfe::size_type      strideFrom,
+                                const dftfe::size_type      numBlocks,
+                                const dftfe::size_type      startingToId,
+                                const dftfe::size_type      startingFromId,
+                                const std::complex<double> *copyFromVec,
+                                std::complex<double> *      copyToVec);
+
+      template void
+      stridedCopyConstantStride(const dftfe::size_type     blockSize,
+                                const dftfe::size_type     strideTo,
+                                const dftfe::size_type     strideFrom,
+                                const dftfe::size_type     numBlocks,
+                                const dftfe::size_type     startingToId,
+                                const dftfe::size_type     startingFromId,
+                                const std::complex<float> *copyFromVec,
+                                std::complex<float> *      copyToVec);
+
+      // stridedCopyConstantStride: explicit instantiations converting between double and float (real and complex)
+      template void
+      stridedCopyConstantStride(const dftfe::size_type blockSize,
+                                const dftfe::size_type strideTo,
+                                const dftfe::size_type strideFrom,
+                                const dftfe::size_type numBlocks,
+                                const dftfe::size_type startingToId,
+                                const dftfe::size_type startingFromId,
+                                const double *         copyFromVec,
+                                float *                copyToVec);
+
+      template void
+      stridedCopyConstantStride(const dftfe::size_type blockSize,
+                                const dftfe::size_type strideTo,
+                                const dftfe::size_type strideFrom,
+                                const dftfe::size_type numBlocks,
+                                const dftfe::size_type startingToId,
+                                const dftfe::size_type startingFromId,
+                                const float *          copyFromVec,
+                                double *               copyToVec);
+
+      template void
+      stridedCopyConstantStride(const dftfe::size_type      blockSize,
+                                const dftfe::size_type      strideTo,
+                                const dftfe::size_type      strideFrom,
+                                const dftfe::size_type      numBlocks,
+                                const dftfe::size_type      startingToId,
+                                const dftfe::size_type      startingFromId,
+                                const std::complex<double> *copyFromVec,
+                                std::complex<float> *       copyToVec);
+
+      template void
+      stridedCopyConstantStride(const dftfe::size_type     blockSize,
+                                const dftfe::size_type     strideTo,
+                                const dftfe::size_type     strideFrom,
+                                const dftfe::size_type     numBlocks,
+                                const dftfe::size_type     startingToId,
+                                const dftfe::size_type     startingFromId,
+                                const std::complex<float> *copyFromVec,
+                                std::complex<double> *     copyToVec);
 
       // stridedBlockScale
       template void
diff --git a/utils/FEBasisOperations.t.cc b/utils/FEBasisOperations.t.cc
new file mode 100644
index 000000000..b26ee0c40
--- /dev/null
+++ b/utils/FEBasisOperations.t.cc
@@ -0,0 +1,1059 @@
+// ---------------------------------------------------------------------
+//
+// Copyright (c) 2017-2022  The Regents of the University of Michigan and DFT-FE
+// authors.
+//
+// This file is part of the DFT-FE code.
+//
+// The DFT-FE code is free software; you can use it, redistribute
+// it, and/or modify it under the terms of the GNU Lesser General
+// Public License as published by the Free Software Foundation; either
+// version 2.1 of the License, or (at your option) any later version.
+// The full text of the license can be found in the file LICENSE at
+// the top level of the DFT-FE distribution.
+//
+// ---------------------------------------------------------------------
+//
+
+#include <FEBasisOperations.h>
+namespace dftfe
+{
+  namespace basis
+  {
+    // Stores the addresses of the supplied MatrixFree object and constraints
+    // vector, resets the dof-handler id, vector count and update flags, and
+    // classifies the mesh: affine if every cell batch has type <= affine,
+    // Cartesian if every cell batch has type == cartesian.
+    template <typename ValueTypeBasisCoeff, typename ValueTypeBasisData,
+              dftfe::utils::MemorySpace memorySpace>
+    FEBasisOperationsBase<ValueTypeBasisCoeff,
+                          ValueTypeBasisData,
+                          memorySpace>::
+      FEBasisOperationsBase(
+        dealii::MatrixFree<3, ValueTypeBasisData> &matrixFreeData,
+        std::vector<const dealii::AffineConstraints<ValueTypeBasisData> *>
+          &constraintsVector)
+    {
+      d_matrixFreeDataPtr = &matrixFreeData;
+      d_constraintsVector = &constraintsVector;
+      d_dofHandlerID      = 0;
+      d_nVectors          = 0;
+      d_updateFlags       = update_default;
+
+      // A single sweep over the cell batches settles both classifications.
+      bool               allAffine    = true;
+      bool               allCartesian = true;
+      const unsigned int nBatches     = d_matrixFreeDataPtr->n_cell_batches();
+      for (unsigned int iBatch = 0; iBatch < nBatches; ++iBatch)
+        {
+          const auto batchType =
+            d_matrixFreeDataPtr->get_mapping_info().get_cell_type(iBatch);
+          if (batchType > dealii::internal::MatrixFreeFunctions::affine)
+            allAffine = false;
+          if (batchType != dealii::internal::MatrixFreeFunctions::cartesian)
+            allCartesian = false;
+        }
+
+      areAllCellsAffine    = allAffine;
+      areAllCellsCartesian = allCartesian;
+    }
+
+    // Stores the dof-handler index, quadrature ids and update flags, then
+    // builds index maps, MPI pattern, constraint info and basis tables.
+    template <typename ValueTypeBasisCoeff, typename ValueTypeBasisData,
+              dftfe::utils::MemorySpace memorySpace>
+    void
+    FEBasisOperationsBase<ValueTypeBasisCoeff, ValueTypeBasisData,
+                          memorySpace>::init(
+      const unsigned int &             dofHandlerID,
+      const std::vector<unsigned int> &quadratureID,
+      const UpdateFlags                updateFlags)
+    {
+      d_updateFlags         = updateFlags;
+      d_quadratureIDsVector = quadratureID;
+      d_dofHandlerID        = dofHandlerID;
+      this->initializeIndexMaps();
+      this->initializeMPIPattern();
+      this->initializeConstraints();
+      this->initializeShapeFunctionAndJacobianData();
+      if (!std::is_same<ValueTypeBasisCoeff, ValueTypeBasisData>::value)
+        this->initializeShapeFunctionAndJacobianBasisData();
+    }
+
+    // Selects the active quadrature rule and cell block size. The flattened
+    // index map is rebuilt only when the number of vectors changes, and the
+    // scratch buffers are resized only when isResizeTempStorage is set.
+    template <typename ValueTypeBasisCoeff, typename ValueTypeBasisData,
+              dftfe::utils::MemorySpace memorySpace>
+    void
+    FEBasisOperationsBase<ValueTypeBasisCoeff, ValueTypeBasisData,
+                          memorySpace>::reinit(
+      const unsigned int &vecBlockSize,
+      const unsigned int &cellsBlockSize,
+      const unsigned int &quadratureID,
+      const bool          isResizeTempStorage)
+    {
+      const bool vecBlockSizeChanged = (d_nVectors != vecBlockSize);
+      d_quadratureID                 = quadratureID;
+      d_cellsBlockSize               = cellsBlockSize;
+      d_nVectors                     = vecBlockSize;
+      if (vecBlockSizeChanged)
+        initializeFlattenedIndexMaps();
+      if (isResizeTempStorage)
+        resizeTempStorage();
+    }
+
+    // Number of quadrature points per cell for the quadrature slot selected
+    // by the most recent reinit() call.
+    template <typename ValueTypeBasisCoeff, typename ValueTypeBasisData,
+              dftfe::utils::MemorySpace memorySpace>
+    unsigned int
+    FEBasisOperationsBase<ValueTypeBasisCoeff, ValueTypeBasisData,
+                          memorySpace>::nQuadsPerCell() const
+    {
+      return this->d_nQuadsPerCell[this->d_quadratureID];
+    }
+
+    // Degrees of freedom per cell, as read from the finite element of the
+    // active dof handler when the index maps were built.
+    template <typename ValueTypeBasisCoeff, typename ValueTypeBasisData,
+              dftfe::utils::MemorySpace memorySpace>
+    unsigned int
+    FEBasisOperationsBase<ValueTypeBasisCoeff, ValueTypeBasisData,
+                          memorySpace>::nDofsPerCell() const
+    {
+      return this->d_nDofsPerCell;
+    }
+
+    // Cell count cached by initializeIndexMaps() from the MatrixFree
+    // object's n_physical_cells().
+    template <typename ValueTypeBasisCoeff, typename ValueTypeBasisData,
+              dftfe::utils::MemorySpace memorySpace>
+    unsigned int
+    FEBasisOperationsBase<ValueTypeBasisCoeff, ValueTypeBasisData,
+                          memorySpace>::nCells() const
+    {
+      return this->d_nCells;
+    }
+
+    // Number of locally owned dofs plus ghost dofs, as computed by
+    // initializeIndexMaps() from the vector partitioner.
+    template <typename ValueTypeBasisCoeff, typename ValueTypeBasisData,
+              dftfe::utils::MemorySpace memorySpace>
+    unsigned int
+    FEBasisOperationsBase<ValueTypeBasisCoeff, ValueTypeBasisData,
+                          memorySpace>::nRelaventDofs() const
+    {
+      return this->d_localSize;
+    }
+
+    // Number of locally owned dofs, cached by initializeIndexMaps() from the
+    // vector partitioner's locally_owned_size().
+    template <typename ValueTypeBasisCoeff, typename ValueTypeBasisData,
+              dftfe::utils::MemorySpace memorySpace>
+    unsigned int
+    FEBasisOperationsBase<ValueTypeBasisCoeff, ValueTypeBasisData,
+                          memorySpace>::nOwnedDofs() const
+    {
+      return this->d_locallyOwnedSize;
+    }
+
+    // Shape-function table of the active quadrature (transposed on request).
+    template <typename ValueTypeBasisCoeff, typename ValueTypeBasisData,
+              dftfe::utils::MemorySpace memorySpace>
+    const dftfe::utils::MemoryStorage<ValueTypeBasisCoeff, memorySpace> &
+    FEBasisOperationsBase<ValueTypeBasisCoeff, ValueTypeBasisData,
+                          memorySpace>::shapeFunctionData(bool transpose) const
+    {
+      if (transpose)
+        return d_shapeFunctionDataTranspose[d_quadratureID];
+      return d_shapeFunctionData[d_quadratureID];
+    }
+
+    // Shape-gradient table of the active quadrature (transposed on request).
+    template <typename ValueTypeBasisCoeff, typename ValueTypeBasisData,
+              dftfe::utils::MemorySpace memorySpace>
+    const dftfe::utils::MemoryStorage<ValueTypeBasisCoeff, memorySpace> &
+    FEBasisOperationsBase<ValueTypeBasisCoeff, ValueTypeBasisData,
+                          memorySpace>::shapeFunctionGradientData(
+      bool transpose) const
+    {
+      if (transpose)
+        return d_shapeFunctionGradientDataTranspose[d_quadratureID];
+      return d_shapeFunctionGradientData[d_quadratureID];
+    }
+
+    // Inverse-Jacobian table; affine meshes share one table kept in slot 0.
+    template <typename ValueTypeBasisCoeff, typename ValueTypeBasisData,
+              dftfe::utils::MemorySpace memorySpace>
+    const dftfe::utils::MemoryStorage<ValueTypeBasisCoeff, memorySpace> &
+    FEBasisOperationsBase<ValueTypeBasisCoeff, ValueTypeBasisData,
+                          memorySpace>::inverseJacobians() const
+    {
+      const auto tableIndex = areAllCellsAffine ? 0 : d_quadratureID;
+      return d_inverseJacobianData[tableIndex];
+    }
+
+    // JxW table of the active quadrature rule. d_JxWData is filled per rule
+    // (also on affine meshes), so it is always indexed by d_quadratureID.
+    template <typename ValueTypeBasisCoeff, typename ValueTypeBasisData,
+              dftfe::utils::MemorySpace memorySpace>
+    const dftfe::utils::MemoryStorage<ValueTypeBasisCoeff, memorySpace> &
+    FEBasisOperationsBase<ValueTypeBasisCoeff, ValueTypeBasisData,
+                          memorySpace>::JxW() const
+    {
+      return d_JxWData[d_quadratureID];
+    }
+
+    // JxW table typed as ValueTypeBasisData; this overload is enabled when A
+    // and B are the same type and serves d_JxWData directly. That table is
+    // filled per quadrature rule, so it is indexed by d_quadratureID.
+    template <typename ValueTypeBasisCoeff, typename ValueTypeBasisData,
+              dftfe::utils::MemorySpace memorySpace>
+    template <typename A, typename B,
+              typename std::enable_if_t<std::is_same<A, B>::value, int>>
+    const dftfe::utils::MemoryStorage<ValueTypeBasisData, memorySpace> &
+    FEBasisOperationsBase<ValueTypeBasisCoeff, ValueTypeBasisData,
+                          memorySpace>::JxWBasisData() const
+    {
+      return d_JxWData[d_quadratureID];
+    }
+
+    // JxW table typed as ValueTypeBasisData; this overload is enabled when A
+    // and B differ and serves d_JxWBasisData. That container is sized with
+    // one entry per quadrature id, so it is indexed by d_quadratureID.
+    template <typename ValueTypeBasisCoeff, typename ValueTypeBasisData,
+              dftfe::utils::MemorySpace memorySpace>
+    template <typename A, typename B,
+              typename std::enable_if_t<!std::is_same<A, B>::value, int>>
+    const dftfe::utils::MemoryStorage<ValueTypeBasisData, memorySpace> &
+    FEBasisOperationsBase<ValueTypeBasisCoeff, ValueTypeBasisData,
+                          memorySpace>::JxWBasisData() const
+    {
+      return d_JxWBasisData[d_quadratureID];
+    }
+
+    // Inverse Jacobians typed as ValueTypeBasisData (overload enabled when A
+    // and B match): serves d_inverseJacobianData, slot 0 on affine meshes.
+    template <typename ValueTypeBasisCoeff, typename ValueTypeBasisData,
+              dftfe::utils::MemorySpace memorySpace>
+    template <typename A, typename B,
+              typename std::enable_if_t<std::is_same<A, B>::value, int>>
+    const dftfe::utils::MemoryStorage<ValueTypeBasisData, memorySpace> &
+    FEBasisOperationsBase<ValueTypeBasisCoeff, ValueTypeBasisData,
+                          memorySpace>::inverseJacobiansBasisData() const
+    {
+      const auto tableIndex = areAllCellsAffine ? 0 : d_quadratureID;
+      return d_inverseJacobianData[tableIndex];
+    }
+
+    // Inverse Jacobians typed as ValueTypeBasisData (overload enabled when A
+    // and B differ): serves d_inverseJacobianBasisData, slot 0 if affine.
+    template <typename ValueTypeBasisCoeff, typename ValueTypeBasisData,
+              dftfe::utils::MemorySpace memorySpace>
+    template <typename A, typename B,
+              typename std::enable_if_t<!std::is_same<A, B>::value, int>>
+    const dftfe::utils::MemoryStorage<ValueTypeBasisData, memorySpace> &
+    FEBasisOperationsBase<ValueTypeBasisCoeff, ValueTypeBasisData,
+                          memorySpace>::inverseJacobiansBasisData() const
+    {
+      const auto tableIndex = areAllCellsAffine ? 0 : d_quadratureID;
+      return d_inverseJacobianBasisData[tableIndex];
+    }
+
+    // Shape-function values typed as ValueTypeBasisData (overload enabled
+    // when A and B match): serves the coefficient-typed tables directly.
+    template <typename ValueTypeBasisCoeff, typename ValueTypeBasisData,
+              dftfe::utils::MemorySpace memorySpace>
+    template <typename A, typename B,
+              typename std::enable_if_t<std::is_same<A, B>::value, int>>
+    const dftfe::utils::MemoryStorage<ValueTypeBasisData, memorySpace> &
+    FEBasisOperationsBase<ValueTypeBasisCoeff, ValueTypeBasisData,
+                          memorySpace>::shapeFunctionBasisData(
+      bool transpose) const
+    {
+      if (transpose)
+        return d_shapeFunctionDataTranspose[d_quadratureID];
+      return d_shapeFunctionData[d_quadratureID];
+    }
+
+    // Shape-function values typed as ValueTypeBasisData (overload enabled
+    // when A and B differ): serves the separate basis-data tables.
+    template <typename ValueTypeBasisCoeff, typename ValueTypeBasisData,
+              dftfe::utils::MemorySpace memorySpace>
+    template <typename A, typename B,
+              typename std::enable_if_t<!std::is_same<A, B>::value, int>>
+    const dftfe::utils::MemoryStorage<ValueTypeBasisData, memorySpace> &
+    FEBasisOperationsBase<ValueTypeBasisCoeff, ValueTypeBasisData,
+                          memorySpace>::shapeFunctionBasisData(
+      bool transpose) const
+    {
+      if (transpose)
+        return d_shapeFunctionBasisDataTranspose[d_quadratureID];
+      return d_shapeFunctionBasisData[d_quadratureID];
+    }
+
+
+    // Shape-function gradients typed as ValueTypeBasisData (overload enabled
+    // when A and B match): serves the coefficient-typed tables directly.
+    template <typename ValueTypeBasisCoeff, typename ValueTypeBasisData,
+              dftfe::utils::MemorySpace memorySpace>
+    template <typename A, typename B,
+              typename std::enable_if_t<std::is_same<A, B>::value, int>>
+    const dftfe::utils::MemoryStorage<ValueTypeBasisData, memorySpace> &
+    FEBasisOperationsBase<ValueTypeBasisCoeff, ValueTypeBasisData,
+                          memorySpace>::shapeFunctionGradientBasisData(
+      bool transpose) const
+    {
+      if (transpose)
+        return d_shapeFunctionGradientDataTranspose[d_quadratureID];
+      return d_shapeFunctionGradientData[d_quadratureID];
+    }
+
+    // Shape-function gradients typed as ValueTypeBasisData (overload enabled
+    // when A and B differ): serves the separate basis-data gradient tables,
+    // returning the transposed one when `transpose` is true.
+    template <typename ValueTypeBasisCoeff, typename ValueTypeBasisData,
+              dftfe::utils::MemorySpace memorySpace>
+    template <typename A, typename B,
+              typename std::enable_if_t<!std::is_same<A, B>::value, int>>
+    const dftfe::utils::MemoryStorage<ValueTypeBasisData, memorySpace> &
+    FEBasisOperationsBase<ValueTypeBasisCoeff, ValueTypeBasisData,
+                          memorySpace>::shapeFunctionGradientBasisData(
+      bool transpose) const
+    {
+      if (transpose)
+        return d_shapeFunctionGradientBasisDataTranspose[d_quadratureID];
+      return d_shapeFunctionGradientBasisData[d_quadratureID];
+    }
+
+
+    // Returns the sum of the two mesh flags, each converted to unsigned int:
+    // the all-cells-affine flag plus the all-cells-Cartesian flag.
+    template <typename ValueTypeBasisCoeff, typename ValueTypeBasisData,
+              dftfe::utils::MemorySpace memorySpace>
+    unsigned int
+    FEBasisOperationsBase<ValueTypeBasisCoeff, ValueTypeBasisData,
+                          memorySpace>::cellsTypeFlag() const
+    {
+      return static_cast<unsigned int>(areAllCellsAffine) +
+             static_cast<unsigned int>(areAllCellsCartesian);
+    }
+
+    // CellId recorded by initializeIndexMaps() for the locally owned cell
+    // with index iElem. The index is not range-checked here.
+    template <typename ValueTypeBasisCoeff, typename ValueTypeBasisData,
+              dftfe::utils::MemorySpace memorySpace>
+    dealii::CellId
+    FEBasisOperationsBase<ValueTypeBasisCoeff, ValueTypeBasisData,
+                          memorySpace>::cellID(const unsigned int iElem) const
+    {
+      return this->d_cellIndexToCellIdMap[iElem];
+    }
+
+
+
+    // Resizes the scratch buffers for the current vector count, cell block
+    // size and quadrature rule; gradient buffers only with update_gradients.
+    template <typename ValueTypeBasisCoeff, typename ValueTypeBasisData,
+              dftfe::utils::MemorySpace memorySpace>
+    void
+    FEBasisOperationsBase<ValueTypeBasisCoeff, ValueTypeBasisData,
+                          memorySpace>::resizeTempStorage()
+    {
+      tempCellNodalData.resize(d_nVectors * d_nDofsPerCell * d_cellsBlockSize);
+      if (!(d_updateFlags & update_gradients))
+        return;
+
+      // Evaluated lazily so the size is only computed when a buffer needs it.
+      const auto gradientBufferSize = [this]() {
+        return d_nVectors * d_nQuadsPerCell[d_quadratureID] * 3 *
+               d_cellsBlockSize;
+      };
+      tempQuadratureGradientsData.resize(
+        areAllCellsCartesian ? 0 : gradientBufferSize());
+      tempQuadratureGradientsDataNonAffine.resize(
+        areAllCellsAffine ? 0 : gradientBufferSize());
+    }
+
+    // Rebuilds the flattened cell-dof to process-dof map by multiplying every
+    // entry of d_cellDofIndexToProcessDofIndexMap by d_nVectors. Device builds
+    // assemble the result in a host buffer and then copy it to the member.
+    template <typename ValueTypeBasisCoeff, typename ValueTypeBasisData,
+              dftfe::utils::MemorySpace memorySpace>
+    void
+    FEBasisOperationsBase<ValueTypeBasisCoeff, ValueTypeBasisData,
+                          memorySpace>::initializeFlattenedIndexMaps()
+    {
+#if defined(DFTFE_WITH_DEVICE)
+      dftfe::utils::MemoryStorage<dftfe::global_size_type,
+                                  dftfe::utils::MemorySpace::HOST>
+        flattenedMapHost;
+#else
+      auto &flattenedMapHost = d_flattenedCellDofIndexToProcessDofIndexMap;
+#endif
+      flattenedMapHost.clear();
+      flattenedMapHost.resize(d_nCells * d_nDofsPerCell);
+
+      std::transform(d_cellDofIndexToProcessDofIndexMap.begin(),
+                     d_cellDofIndexToProcessDofIndexMap.end(),
+                     flattenedMapHost.begin(),
+                     [&nVec = this->d_nVectors](auto &dofIndex) {
+                       return dofIndex * nVec;
+                     });
+#if defined(DFTFE_WITH_DEVICE)
+      d_flattenedCellDofIndexToProcessDofIndexMap.resize(
+        flattenedMapHost.size());
+      d_flattenedCellDofIndexToProcessDofIndexMap.copyFrom(flattenedMapHost);
+#endif
+    }
+
+    // Creates the point-to-point MPI pattern of the active dof handler from
+    // its MatrixFree vector partitioner: the locally owned index range, the
+    // ghost indices (expanded into a plain index vector) and the
+    // partitioner's MPI communicator are passed to MPIPatternP2P, and the
+    // resulting object is stored in mpiPatternP2P.
+    template <typename ValueTypeBasisCoeff, typename ValueTypeBasisData,
+              dftfe::utils::MemorySpace memorySpace>
+    void
+    FEBasisOperationsBase<ValueTypeBasisCoeff, ValueTypeBasisData,
+                          memorySpace>::initializeMPIPattern()
+    {
+      const auto &partitioner =
+        d_matrixFreeDataPtr->get_vector_partitioner(d_dofHandlerID);
+
+      const std::pair<global_size_type, global_size_type> &ownedRange =
+        partitioner->local_range();
+
+      std::vector<global_size_type> ghostDofs;
+      partitioner->ghost_indices().fill_index_vector(ghostDofs);
+
+      mpiPatternP2P =
+        std::make_shared<dftfe::utils::mpi::MPIPatternP2P<memorySpace>>(
+          ownedRange, ghostDofs, partitioner->get_mpi_communicator());
+    }
+
+    // Caches the sizes used throughout the class (cell count, dofs per cell,
+    // locally owned dofs, owned plus ghost dofs) and then walks the active
+    // cells of the dof handler. For every locally owned cell, in iteration
+    // order, it records the cell's CellId and the process-local index of each
+    // of its dofs as reported by the partitioner's global_to_local().
+    template <typename ValueTypeBasisCoeff, typename ValueTypeBasisData,
+              dftfe::utils::MemorySpace memorySpace>
+    void
+    FEBasisOperationsBase<ValueTypeBasisCoeff, ValueTypeBasisData,
+                          memorySpace>::initializeIndexMaps()
+    {
+      const auto &dofHandler =
+        d_matrixFreeDataPtr->get_dof_handler(d_dofHandlerID);
+      const auto &partitioner =
+        d_matrixFreeDataPtr->get_vector_partitioner(d_dofHandlerID);
+
+      d_nCells           = d_matrixFreeDataPtr->n_physical_cells();
+      d_nDofsPerCell     = dofHandler.get_fe().dofs_per_cell;
+      d_locallyOwnedSize = partitioner->locally_owned_size();
+      d_localSize        = d_locallyOwnedSize + partitioner->n_ghost_indices();
+
+      d_cellDofIndexToProcessDofIndexMap.clear();
+      d_cellDofIndexToProcessDofIndexMap.resize(d_nCells * d_nDofsPerCell);
+      d_cellIndexToCellIdMap.clear();
+      d_cellIndexToCellIdMap.resize(d_nCells);
+
+      // Scratch buffer reused for the global dof indices of each visited cell.
+      std::vector<global_size_type> cellDofIndicesGlobal(d_nDofsPerCell);
+
+      // iCell numbers the locally owned cells consecutively; cells that are
+      // not locally owned are skipped and do not consume an index.
+      unsigned int iCell   = 0;
+      const auto   endcPtr = dofHandler.end();
+      for (auto cellPtr = dofHandler.begin_active(); cellPtr != endcPtr;
+           ++cellPtr)
+        {
+          if (!cellPtr->is_locally_owned())
+            continue;
+
+          cellPtr->get_dof_indices(cellDofIndicesGlobal);
+          for (unsigned int iDof = 0; iDof < d_nDofsPerCell; ++iDof)
+            d_cellDofIndexToProcessDofIndexMap[iCell * d_nDofsPerCell + iDof] =
+              partitioner->global_to_local(cellDofIndicesGlobal[iDof]);
+
+          d_cellIndexToCellIdMap[iCell] = cellPtr->id();
+          ++iCell;
+        }
+    }
+
+
+    // Sets up d_constraintInfo for the dof handler selected in init().
+    template <typename ValueTypeBasisCoeff, typename ValueTypeBasisData,
+              dftfe::utils::MemorySpace memorySpace>
+    void
+    FEBasisOperationsBase<ValueTypeBasisCoeff, ValueTypeBasisData,
+                          memorySpace>::initializeConstraints()
+    {
+      const auto &partitioner =
+        d_matrixFreeDataPtr->get_vector_partitioner(d_dofHandlerID);
+      const auto &constraints = *((*d_constraintsVector)[d_dofHandlerID]);
+      d_constraintInfo.initialize(partitioner, constraints);
+    }
+
+    // Tabulates, for every quadrature rule, the shape-function values and
+    // reference-frame gradients (sampled on the first locally owned cell) and
+    // the per-cell JxW / inverse-Jacobian data requested by d_updateFlags.
+    template <typename ValueTypeBasisCoeff, typename ValueTypeBasisData,
+              dftfe::utils::MemorySpace memorySpace>
+    void
+    FEBasisOperationsBase<ValueTypeBasisCoeff, ValueTypeBasisData,
+                          memorySpace>::initializeShapeFunctionAndJacobianData()
+    {
+      // Every per-quadrature container is sized unconditionally: the loop below
+      // binds (host build) or copies into (device build) entry iQuadID of each
+      // one regardless of d_updateFlags, so a container left empty would be
+      // indexed out of bounds. Entries of disabled tables simply stay empty.
+      const auto nQuadRules = d_quadratureIDsVector.size();
+      d_nQuadsPerCell.resize(nQuadRules);
+      d_inverseJacobianData.resize(areAllCellsAffine ? 1 : nQuadRules);
+      d_JxWData.resize(nQuadRules);
+      d_shapeFunctionData.resize(nQuadRules);
+      d_shapeFunctionDataTranspose.resize(nQuadRules);
+      d_shapeFunctionGradientDataInternalLayout.resize(nQuadRules);
+      d_shapeFunctionGradientData.resize(nQuadRules);
+      d_shapeFunctionGradientDataTranspose.resize(nQuadRules);
+      for (unsigned int iQuadID = 0; iQuadID < d_quadratureIDsVector.size();
+           ++iQuadID)
+        {
+          const dealii::Quadrature<3> &quadrature =
+            d_matrixFreeDataPtr->get_quadrature(d_quadratureIDsVector[iQuadID]);
+          dealii::FEValues<3> fe_values(
+            d_matrixFreeDataPtr->get_dof_handler(d_dofHandlerID).get_fe(),
+            quadrature,
+            dealii::update_values | dealii::update_gradients |
+              dealii::update_jacobians | dealii::update_JxW_values |
+              dealii::update_inverse_jacobians);
+
+          d_nQuadsPerCell[iQuadID] = quadrature.size();
+
+#if defined(DFTFE_WITH_DEVICE)
+          dftfe::utils::MemoryStorage<ValueTypeBasisCoeff,
+                                      dftfe::utils::MemorySpace::HOST>
+            d_inverseJacobianDataHost;
+          dftfe::utils::MemoryStorage<ValueTypeBasisCoeff,
+                                      dftfe::utils::MemorySpace::HOST>
+            d_JxWDataHost;
+          dftfe::utils::MemoryStorage<ValueTypeBasisCoeff,
+                                      dftfe::utils::MemorySpace::HOST>
+            d_shapeFunctionDataHost;
+          dftfe::utils::MemoryStorage<ValueTypeBasisCoeff,
+                                      dftfe::utils::MemorySpace::HOST>
+            d_shapeFunctionDataTransposeHost;
+          dftfe::utils::MemoryStorage<ValueTypeBasisCoeff,
+                                      dftfe::utils::MemorySpace::HOST>
+            d_shapeFunctionGradientDataInternalLayoutHost;
+          dftfe::utils::MemoryStorage<ValueTypeBasisCoeff,
+                                      dftfe::utils::MemorySpace::HOST>
+            d_shapeFunctionGradientDataHost;
+          dftfe::utils::MemoryStorage<ValueTypeBasisCoeff,
+                                      dftfe::utils::MemorySpace::HOST>
+            d_shapeFunctionGradientDataTransposeHost;
+#else
+          auto &d_inverseJacobianDataHost =
+            d_inverseJacobianData[areAllCellsAffine ? 0 : iQuadID];
+          auto &d_JxWDataHost           = d_JxWData[iQuadID];
+          auto &d_shapeFunctionDataHost = d_shapeFunctionData[iQuadID];
+          auto &d_shapeFunctionGradientDataInternalLayoutHost =
+            d_shapeFunctionGradientDataInternalLayout[iQuadID];
+          auto &d_shapeFunctionDataTransposeHost =
+            d_shapeFunctionDataTranspose[iQuadID];
+          auto &d_shapeFunctionGradientDataHost =
+            d_shapeFunctionGradientData[iQuadID];
+          auto &d_shapeFunctionGradientDataTransposeHost =
+            d_shapeFunctionGradientDataTranspose[iQuadID];
+#endif
+
+
+          d_shapeFunctionDataHost.clear();
+          if (d_updateFlags & update_values)
+            d_shapeFunctionDataHost.resize(d_nQuadsPerCell[iQuadID] *
+                                             d_nDofsPerCell,
+                                           0.0);
+          d_shapeFunctionDataTransposeHost.clear();
+          if ((d_updateFlags & update_values) &&
+              (d_updateFlags & update_transpose))
+            d_shapeFunctionDataTransposeHost.resize(d_nQuadsPerCell[iQuadID] *
+                                                      d_nDofsPerCell,
+                                                    0.0);
+          d_shapeFunctionGradientDataInternalLayoutHost.clear();
+          d_shapeFunctionGradientDataHost.clear();
+          d_shapeFunctionGradientDataTransposeHost.clear();
+          if (d_updateFlags & update_gradients)
+            {
+              d_shapeFunctionGradientDataInternalLayoutHost.resize(
+                d_nQuadsPerCell[iQuadID] * d_nDofsPerCell * 3, 0.0);
+              d_shapeFunctionGradientDataHost.resize(d_nQuadsPerCell[iQuadID] *
+                                                       d_nDofsPerCell * 3,
+                                                     0.0);
+              if (d_updateFlags & update_transpose)
+                d_shapeFunctionGradientDataTransposeHost.resize(
+                  d_nQuadsPerCell[iQuadID] * d_nDofsPerCell * 3, 0.0);
+            }
+
+          const bool storeJxW = (d_updateFlags & update_values) ||
+                                (d_updateFlags & update_gradients);
+          d_JxWDataHost.clear();
+          if (storeJxW)
+            d_JxWDataHost.resize(d_nCells * d_nQuadsPerCell[iQuadID]);
+
+          d_inverseJacobianDataHost.clear();
+          if (d_updateFlags & update_gradients)
+            d_inverseJacobianDataHost.resize(
+              areAllCellsCartesian ?
+                d_nCells * 3 :
+                (areAllCellsAffine ? d_nCells * 9 :
+                                     d_nCells * 9 * d_nQuadsPerCell[iQuadID]));
+          const unsigned int nJacobiansPerCell =
+            areAllCellsAffine ? 1 : d_nQuadsPerCell[iQuadID];
+          // Entries stored per cell: zero when the table was not allocated.
+          const unsigned int nJxWToStore =
+            storeJxW ? d_nQuadsPerCell[iQuadID] : 0;
+          const unsigned int nJacobiansToStore =
+            (d_updateFlags & update_gradients) ? nJacobiansPerCell : 0;
+
+          auto cellPtr =
+            d_matrixFreeDataPtr->get_dof_handler(d_dofHandlerID).begin_active();
+          auto endcPtr =
+            d_matrixFreeDataPtr->get_dof_handler(d_dofHandlerID).end();
+
+          unsigned int iCell = 0;
+          for (; cellPtr != endcPtr; ++cellPtr)
+            if (cellPtr->is_locally_owned())
+              {
+                fe_values.reinit(cellPtr);
+                auto &jacobians        = fe_values.get_jacobians();
+                auto &inverseJacobians = fe_values.get_inverse_jacobians();
+                if (iCell == 0)
+                  {
+                    if (d_updateFlags & update_values)
+                      {
+                        for (unsigned int iNode = 0; iNode < d_nDofsPerCell;
+                             ++iNode)
+                          for (unsigned int iQuad = 0;
+                               iQuad < d_nQuadsPerCell[iQuadID];
+                               ++iQuad)
+                            d_shapeFunctionDataHost[iQuad * d_nDofsPerCell +
+                                                    iNode] =
+                              fe_values.shape_value(iNode, iQuad);
+                        if (d_updateFlags & update_transpose)
+                          for (unsigned int iNode = 0; iNode < d_nDofsPerCell;
+                               ++iNode)
+                            for (unsigned int iQuad = 0;
+                                 iQuad < d_nQuadsPerCell[iQuadID];
+                                 ++iQuad)
+                              d_shapeFunctionDataTransposeHost
+                                [iNode * d_nQuadsPerCell[iQuadID] + iQuad] =
+                                  fe_values.shape_value(iNode, iQuad);
+                      }
+
+
+                    if (d_updateFlags & update_gradients)
+                      for (unsigned int iQuad = 0;
+                           iQuad < d_nQuadsPerCell[iQuadID];
+                           ++iQuad)
+                        for (unsigned int iNode = 0; iNode < d_nDofsPerCell;
+                             ++iNode)
+                          {
+                            const auto &shape_grad_real =
+                              fe_values.shape_grad(iNode, iQuad);
+                            const auto &shape_grad_reference =
+                              apply_transformation(jacobians[iQuad].transpose(),
+                                                   shape_grad_real);
+                            for (unsigned int iDim = 0; iDim < 3; ++iDim)
+                              if (areAllCellsAffine)
+                                d_shapeFunctionGradientDataInternalLayoutHost
+                                  [d_nQuadsPerCell[iQuadID] * d_nDofsPerCell *
+                                     iDim +
+                                   d_nDofsPerCell * iQuad + iNode] =
+                                    shape_grad_reference[iDim];
+                              else
+                                d_shapeFunctionGradientDataInternalLayoutHost
+                                  [iQuad * d_nDofsPerCell * 3 +
+                                   d_nDofsPerCell * iDim + iNode] =
+                                    shape_grad_reference[iDim];
+
+
+                            for (unsigned int iDim = 0; iDim < 3; ++iDim)
+                              d_shapeFunctionGradientDataHost
+                                [iDim * d_nQuadsPerCell[iQuadID] *
+                                   d_nDofsPerCell +
+                                 iQuad * d_nDofsPerCell + iNode] =
+                                  shape_grad_reference[iDim];
+                            if (d_updateFlags & update_transpose)
+                              for (unsigned int iDim = 0; iDim < 3; ++iDim)
+                                d_shapeFunctionGradientDataTransposeHost
+                                  [iDim * d_nQuadsPerCell[iQuadID] *
+                                     d_nDofsPerCell +
+                                   iNode * d_nQuadsPerCell[iQuadID] + iQuad] =
+                                    shape_grad_reference[iDim];
+                          }
+                  }
+                for (unsigned int iQuad = 0; iQuad < nJxWToStore; ++iQuad)
+                  d_JxWDataHost[iCell * d_nQuadsPerCell[iQuadID] + iQuad] =
+                    fe_values.JxW(iQuad);
+                for (unsigned int iQuad = 0; iQuad < nJacobiansToStore; ++iQuad)
+                  for (unsigned int iDim = 0; iDim < 3; ++iDim)
+                    if (areAllCellsCartesian)
+                      d_inverseJacobianDataHost[iCell * nJacobiansPerCell * 3 +
+                                                iDim * nJacobiansPerCell +
+                                                iQuad] =
+                        inverseJacobians[iQuad][iDim][iDim];
+                    else
+                      for (unsigned int jDim = 0; jDim < 3; ++jDim)
+                        d_inverseJacobianDataHost[iCell * nJacobiansPerCell *
+                                                    9 +
+                                                  9 * iQuad + jDim * 3 + iDim] =
+                          inverseJacobians[iQuad][iDim][jDim];
+                ++iCell;
+              }
+
+#if defined(DFTFE_WITH_DEVICE)
+          d_inverseJacobianData[areAllCellsAffine ? 0 : iQuadID].resize(
+            d_inverseJacobianDataHost.size());
+          d_inverseJacobianData[areAllCellsAffine ? 0 : iQuadID].copyFrom(
+            d_inverseJacobianDataHost);
+          d_JxWData[iQuadID].resize(d_JxWDataHost.size());
+          d_JxWData[iQuadID].copyFrom(d_JxWDataHost);
+          d_shapeFunctionData[iQuadID].resize(d_shapeFunctionDataHost.size());
+          d_shapeFunctionData[iQuadID].copyFrom(d_shapeFunctionDataHost);
+          d_shapeFunctionGradientDataInternalLayout[iQuadID].resize(
+            d_shapeFunctionGradientDataInternalLayoutHost.size());
+          d_shapeFunctionGradientDataInternalLayout[iQuadID].copyFrom(
+            d_shapeFunctionGradientDataInternalLayoutHost);
+          d_shapeFunctionDataTranspose[iQuadID].resize(
+            d_shapeFunctionDataTransposeHost.size());
+          d_shapeFunctionDataTranspose[iQuadID].copyFrom(
+            d_shapeFunctionDataTransposeHost);
+          d_shapeFunctionGradientData[iQuadID].resize(
+            d_shapeFunctionGradientDataHost.size());
+          d_shapeFunctionGradientData[iQuadID].copyFrom(
+            d_shapeFunctionGradientDataHost);
+          d_shapeFunctionGradientDataTranspose[iQuadID].resize(
+            d_shapeFunctionGradientDataTransposeHost.size());
+          d_shapeFunctionGradientDataTranspose[iQuadID].copyFrom(
+            d_shapeFunctionGradientDataTransposeHost);
+#endif
+        }
+    }
+
+
+    // Fills the d_*BasisData members (shape function values/gradients, JxW and
+    // inverse Jacobians) for every quadrature rule in d_quadratureIDsVector.
+    //
+    // Which buffers are filled is controlled by d_updateFlags:
+    //  - update_values    : shape function values (and their transpose when
+    //                       update_transpose is also set)
+    //  - update_gradients : reference-cell shape function gradients (and their
+    //                       transpose when update_transpose is also set) and
+    //                       the inverse Jacobians
+    //  - JxW is stored when either update_values or update_gradients is set.
+    //
+    // Shape function data is taken from the first locally owned cell only
+    // (iCell == 0); JxW and inverse Jacobians are stored for every locally
+    // owned cell. With DFTFE_WITH_DEVICE defined the data is assembled in local
+    // HOST buffers and copied into the members with copyFrom() at the end;
+    // otherwise the "Host" names are references to the members themselves.
+    //
+    // NOTE(review): the members d_shapeFunctionBasisData,
+    // d_shapeFunctionBasisDataTranspose, d_shapeFunctionGradientBasisData and
+    // d_shapeFunctionGradientBasisDataTranspose are only resized when the
+    // corresponding flags are set, but are indexed with [iQuadID] below
+    // regardless of the flags -- confirm that callers always request these
+    // flags, or resize the containers unconditionally.
+    template <typename ValueTypeBasisCoeff,
+              typename ValueTypeBasisData,
+              dftfe::utils::MemorySpace memorySpace>
+    void
+    FEBasisOperationsBase<
+      ValueTypeBasisCoeff,
+      ValueTypeBasisData,
+      memorySpace>::initializeShapeFunctionAndJacobianBasisData()
+    {
+      // When all cells are affine a single inverse Jacobian set (slot 0) is
+      // shared by all quadrature rules.
+      d_inverseJacobianBasisData.resize(
+        areAllCellsAffine ? 1 : d_quadratureIDsVector.size());
+      d_JxWBasisData.resize(d_quadratureIDsVector.size());
+      if (d_updateFlags & update_values)
+        {
+          d_shapeFunctionBasisData.resize(d_quadratureIDsVector.size());
+          if (d_updateFlags & update_transpose)
+            d_shapeFunctionBasisDataTranspose.resize(
+              d_quadratureIDsVector.size());
+        }
+      if (d_updateFlags & update_gradients)
+        {
+          d_shapeFunctionGradientBasisData.resize(d_quadratureIDsVector.size());
+          if (d_updateFlags & update_transpose)
+            d_shapeFunctionGradientBasisDataTranspose.resize(
+              d_quadratureIDsVector.size());
+        }
+      for (unsigned int iQuadID = 0; iQuadID < d_quadratureIDsVector.size();
+           ++iQuadID)
+        {
+          const dealii::Quadrature<3> &quadrature =
+            d_matrixFreeDataPtr->get_quadrature(d_quadratureIDsVector[iQuadID]);
+          dealii::FEValues<3> fe_values(
+            d_matrixFreeDataPtr->get_dof_handler(d_dofHandlerID).get_fe(),
+            quadrature,
+            dealii::update_values | dealii::update_gradients |
+              dealii::update_jacobians | dealii::update_JxW_values |
+              dealii::update_inverse_jacobians);
+
+#if defined(DFTFE_WITH_DEVICE)
+          // Host staging buffers; copied into the member storages after the
+          // cell loop.
+          dftfe::utils::MemoryStorage<ValueTypeBasisData,
+                                      dftfe::utils::MemorySpace::HOST>
+            d_inverseJacobianDataHost;
+          dftfe::utils::MemoryStorage<ValueTypeBasisData,
+                                      dftfe::utils::MemorySpace::HOST>
+            d_JxWDataHost;
+          dftfe::utils::MemoryStorage<ValueTypeBasisData,
+                                      dftfe::utils::MemorySpace::HOST>
+            d_shapeFunctionDataHost;
+          dftfe::utils::MemoryStorage<ValueTypeBasisData,
+                                      dftfe::utils::MemorySpace::HOST>
+            d_shapeFunctionDataTransposeHost;
+          dftfe::utils::MemoryStorage<ValueTypeBasisData,
+                                      dftfe::utils::MemorySpace::HOST>
+            d_shapeFunctionGradientDataHost;
+          dftfe::utils::MemoryStorage<ValueTypeBasisData,
+                                      dftfe::utils::MemorySpace::HOST>
+            d_shapeFunctionGradientDataTransposeHost;
+#else
+          // Without device support the "Host" names alias the members.
+          auto &d_inverseJacobianDataHost =
+            d_inverseJacobianBasisData[areAllCellsAffine ? 0 : iQuadID];
+          auto &d_JxWDataHost           = d_JxWBasisData[iQuadID];
+          auto &d_shapeFunctionDataHost = d_shapeFunctionBasisData[iQuadID];
+          auto &d_shapeFunctionDataTransposeHost =
+            d_shapeFunctionBasisDataTranspose[iQuadID];
+          auto &d_shapeFunctionGradientDataHost =
+            d_shapeFunctionGradientBasisData[iQuadID];
+          auto &d_shapeFunctionGradientDataTransposeHost =
+            d_shapeFunctionGradientBasisDataTranspose[iQuadID];
+#endif
+
+
+          d_shapeFunctionDataHost.clear();
+          if (d_updateFlags & update_values)
+            d_shapeFunctionDataHost.resize(d_nQuadsPerCell[iQuadID] *
+                                             d_nDofsPerCell,
+                                           0.0);
+          d_shapeFunctionDataTransposeHost.clear();
+          if ((d_updateFlags & update_values) &&
+              (d_updateFlags & update_transpose))
+            d_shapeFunctionDataTransposeHost.resize(d_nQuadsPerCell[iQuadID] *
+                                                      d_nDofsPerCell,
+                                                    0.0);
+          d_shapeFunctionGradientDataHost.clear();
+          d_shapeFunctionGradientDataTransposeHost.clear();
+          if (d_updateFlags & update_gradients)
+            {
+              d_shapeFunctionGradientDataHost.resize(d_nQuadsPerCell[iQuadID] *
+                                                       d_nDofsPerCell * 3,
+                                                     0.0);
+              if (d_updateFlags & update_transpose)
+                d_shapeFunctionGradientDataTransposeHost.resize(
+                  d_nQuadsPerCell[iQuadID] * d_nDofsPerCell * 3, 0.0);
+            }
+
+          d_JxWDataHost.clear();
+          if ((d_updateFlags & update_values) ||
+              (d_updateFlags & update_gradients))
+            d_JxWDataHost.resize(d_nCells * d_nQuadsPerCell[iQuadID]);
+
+          // Inverse Jacobian storage per cell: 3 entries (diagonal) for
+          // Cartesian cells, 9 for affine cells, 9 per quadrature point
+          // otherwise.
+          d_inverseJacobianDataHost.clear();
+          if (d_updateFlags & update_gradients)
+            d_inverseJacobianDataHost.resize(
+              areAllCellsCartesian ?
+                d_nCells * 3 :
+                (areAllCellsAffine ? d_nCells * 9 :
+                                     d_nCells * 9 * d_nQuadsPerCell[iQuadID]));
+          const unsigned int nJacobiansPerCell =
+            areAllCellsAffine ? 1 : d_nQuadsPerCell[iQuadID];
+
+          auto cellPtr =
+            d_matrixFreeDataPtr->get_dof_handler(d_dofHandlerID).begin_active();
+          auto endcPtr =
+            d_matrixFreeDataPtr->get_dof_handler(d_dofHandlerID).end();
+
+          unsigned int iCell = 0;
+          for (; cellPtr != endcPtr; ++cellPtr)
+            if (cellPtr->is_locally_owned())
+              {
+                fe_values.reinit(cellPtr);
+                auto &jacobians        = fe_values.get_jacobians();
+                auto &inverseJacobians = fe_values.get_inverse_jacobians();
+                // Shape function data is recorded once, from the first
+                // locally owned cell.
+                if (iCell == 0)
+                  {
+                    if (d_updateFlags & update_values)
+                      {
+                        // Layout: [iQuad][iNode]
+                        for (unsigned int iNode = 0; iNode < d_nDofsPerCell;
+                             ++iNode)
+                          for (unsigned int iQuad = 0;
+                               iQuad < d_nQuadsPerCell[iQuadID];
+                               ++iQuad)
+                            d_shapeFunctionDataHost[iQuad * d_nDofsPerCell +
+                                                    iNode] =
+                              fe_values.shape_value(iNode, iQuad);
+                        // Transposed layout: [iNode][iQuad]
+                        if (d_updateFlags & update_transpose)
+                          for (unsigned int iNode = 0; iNode < d_nDofsPerCell;
+                               ++iNode)
+                            for (unsigned int iQuad = 0;
+                                 iQuad < d_nQuadsPerCell[iQuadID];
+                                 ++iQuad)
+                              d_shapeFunctionDataTransposeHost
+                                [iNode * d_nQuadsPerCell[iQuadID] + iQuad] =
+                                  fe_values.shape_value(iNode, iQuad);
+                      }
+
+
+                    if (d_updateFlags & update_gradients)
+                      for (unsigned int iQuad = 0;
+                           iQuad < d_nQuadsPerCell[iQuadID];
+                           ++iQuad)
+                        for (unsigned int iNode = 0; iNode < d_nDofsPerCell;
+                             ++iNode)
+                          {
+                            // Map the real-space gradient back to the
+                            // reference cell with the transposed Jacobian.
+                            const auto &shape_grad_real =
+                              fe_values.shape_grad(iNode, iQuad);
+                            const auto &shape_grad_reference =
+                              apply_transformation(jacobians[iQuad].transpose(),
+                                                   shape_grad_real);
+
+                            // Layout: [iDim][iQuad][iNode]
+                            for (unsigned int iDim = 0; iDim < 3; ++iDim)
+                              d_shapeFunctionGradientDataHost
+                                [iDim * d_nQuadsPerCell[iQuadID] *
+                                   d_nDofsPerCell +
+                                 iQuad * d_nDofsPerCell + iNode] =
+                                  shape_grad_reference[iDim];
+                            // Transposed layout: [iDim][iNode][iQuad]
+                            if (d_updateFlags & update_transpose)
+                              for (unsigned int iDim = 0; iDim < 3; ++iDim)
+                                d_shapeFunctionGradientDataTransposeHost
+                                  [iDim * d_nQuadsPerCell[iQuadID] *
+                                     d_nDofsPerCell +
+                                   iNode * d_nQuadsPerCell[iQuadID] + iQuad] =
+                                    shape_grad_reference[iDim];
+                          }
+                  }
+                // Only write JxW when its buffer was sized above (same
+                // condition as the resize); otherwise the buffer is empty and
+                // the write would be out of bounds.
+                if ((d_updateFlags & update_values) ||
+                    (d_updateFlags & update_gradients))
+                  {
+                    for (unsigned int iQuad = 0;
+                         iQuad < d_nQuadsPerCell[iQuadID];
+                         ++iQuad)
+                      d_JxWDataHost[iCell * d_nQuadsPerCell[iQuadID] + iQuad] =
+                        fe_values.JxW(iQuad);
+                  }
+                // Likewise the inverse Jacobian buffer is only sized when
+                // gradients are requested.
+                if (d_updateFlags & update_gradients)
+                  {
+                    for (unsigned int iQuad = 0; iQuad < nJacobiansPerCell;
+                         ++iQuad)
+                      for (unsigned int iDim = 0; iDim < 3; ++iDim)
+                        if (areAllCellsCartesian)
+                          // Cartesian cells: only the diagonal is stored.
+                          d_inverseJacobianDataHost[iCell * nJacobiansPerCell *
+                                                      3 +
+                                                    iDim * nJacobiansPerCell +
+                                                    iQuad] =
+                            inverseJacobians[iQuad][iDim][iDim];
+                        else
+                          // Full 3x3 block, entry [iDim][jDim] stored at
+                          // jDim * 3 + iDim.
+                          for (unsigned int jDim = 0; jDim < 3; ++jDim)
+                            d_inverseJacobianDataHost[iCell *
+                                                        nJacobiansPerCell * 9 +
+                                                      9 * iQuad + jDim * 3 +
+                                                      iDim] =
+                              inverseJacobians[iQuad][iDim][jDim];
+                  }
+                ++iCell;
+              }
+
+#if defined(DFTFE_WITH_DEVICE)
+          // Transfer the staged host data into the member storages.
+          d_inverseJacobianBasisData[areAllCellsAffine ? 0 : iQuadID].resize(
+            d_inverseJacobianDataHost.size());
+          d_inverseJacobianBasisData[areAllCellsAffine ? 0 : iQuadID].copyFrom(
+            d_inverseJacobianDataHost);
+          d_JxWBasisData[iQuadID].resize(d_JxWDataHost.size());
+          d_JxWBasisData[iQuadID].copyFrom(d_JxWDataHost);
+          d_shapeFunctionBasisData[iQuadID].resize(
+            d_shapeFunctionDataHost.size());
+          d_shapeFunctionBasisData[iQuadID].copyFrom(d_shapeFunctionDataHost);
+          d_shapeFunctionBasisDataTranspose[iQuadID].resize(
+            d_shapeFunctionDataTransposeHost.size());
+          d_shapeFunctionBasisDataTranspose[iQuadID].copyFrom(
+            d_shapeFunctionDataTransposeHost);
+          d_shapeFunctionGradientBasisData[iQuadID].resize(
+            d_shapeFunctionGradientDataHost.size());
+          d_shapeFunctionGradientBasisData[iQuadID].copyFrom(
+            d_shapeFunctionGradientDataHost);
+          d_shapeFunctionGradientBasisDataTranspose[iQuadID].resize(
+            d_shapeFunctionGradientDataTransposeHost.size());
+          d_shapeFunctionGradientBasisDataTranspose[iQuadID].copyFrom(
+            d_shapeFunctionGradientDataTransposeHost);
+#endif
+        }
+    }
+
+
+    // Re-initializes the caller-provided multiVector by calling
+    // reinit(mpiPatternP2P, blocksize) on it.
+    //
+    // blocksize   : forwarded as the second argument of reinit().
+    // multiVector : vector to (re)initialize in place.
+    template <typename ValueTypeBasisCoeff,
+              typename ValueTypeBasisData,
+              dftfe::utils::MemorySpace memorySpace>
+    void
+    FEBasisOperationsBase<ValueTypeBasisCoeff,
+                          ValueTypeBasisData,
+                          memorySpace>::
+      createMultiVector(
+        const unsigned int blocksize,
+        dftfe::linearAlgebra::MultiVector<ValueTypeBasisCoeff, memorySpace>
+          &multiVector) const
+    {
+      multiVector.reinit(mpiPatternP2P, blocksize);
+    }
+
+    // Adds numMultiVecs scratch MultiVectors for block size vecBlockSize.
+    //
+    // If no scratch vectors exist yet for this block size, a new list of
+    // numMultiVecs vectors is created; otherwise the existing list grows by
+    // numMultiVecs. In both cases every vector in the resulting list
+    // (including previously existing ones) is reinit()-ed with mpiPatternP2P
+    // and vecBlockSize.
+    template <typename ValueTypeBasisCoeff,
+              typename ValueTypeBasisData,
+              dftfe::utils::MemorySpace memorySpace>
+    void
+    FEBasisOperationsBase<ValueTypeBasisCoeff,
+                          ValueTypeBasisData,
+                          memorySpace>::
+      createScratchMultiVectors(const unsigned int vecBlockSize,
+                                const unsigned int numMultiVecs) const
+    {
+      // operator[] yields an empty list for a block size seen for the first
+      // time, so "create" and "grow" reduce to the same resize.
+      auto &vectorsForBlockSize = scratchMultiVectors[vecBlockSize];
+      vectorsForBlockSize.resize(vectorsForBlockSize.size() + numMultiVecs);
+      for (auto &scratchVector : vectorsForBlockSize)
+        scratchVector.reinit(mpiPatternP2P, vecBlockSize);
+    }
+
+    // Removes all scratch MultiVectors, for every block size, by clearing
+    // scratchMultiVectors.
+    template <typename ValueTypeBasisCoeff,
+              typename ValueTypeBasisData,
+              dftfe::utils::MemorySpace memorySpace>
+    void
+    FEBasisOperationsBase<ValueTypeBasisCoeff,
+                          ValueTypeBasisData,
+                          memorySpace>::clearScratchMultiVectors() const
+    {
+      scratchMultiVectors.clear();
+    }
+
+    // Returns a reference to the scratch MultiVector number `index` among
+    // those stored for block size `vecBlockSize`.
+    //
+    // Throws (via AssertThrow) if no scratch vectors are stored for
+    // vecBlockSize, or if index is not smaller than the number of scratch
+    // vectors stored for that block size.
+    template <typename ValueTypeBasisCoeff,
+              typename ValueTypeBasisData,
+              dftfe::utils::MemorySpace memorySpace>
+    dftfe::linearAlgebra::MultiVector<ValueTypeBasisCoeff, memorySpace> &
+    FEBasisOperationsBase<
+      ValueTypeBasisCoeff,
+      ValueTypeBasisData,
+      memorySpace>::getMultiVector(const unsigned int vecBlockSize,
+                                   const unsigned int index) const
+    {
+      // Single lookup; the iterator is reused for the bounds check and access.
+      const auto iter = scratchMultiVectors.find(vecBlockSize);
+      AssertThrow(iter != scratchMultiVectors.end(),
+                  dealii::ExcMessage(
+                    "DFT-FE Error: MultiVector not found in scratch storage."));
+      // Guard against indexing past the vectors created for this block size.
+      AssertThrow(index < iter->second.size(),
+                  dealii::ExcMessage(
+                    "DFT-FE Error: scratch MultiVector index out of range."));
+      return iter->second[index];
+    }
+
+
+    // Forwards multiVector, together with d_nVectors, to
+    // d_constraintInfo.distribute().
+    // NOTE(review): presumably this applies the stored constraints to all
+    // d_nVectors components of multiVector in place -- confirm against the
+    // implementation of d_constraintInfo.
+    template <typename ValueTypeBasisCoeff,
+              typename ValueTypeBasisData,
+              dftfe::utils::MemorySpace memorySpace>
+    void
+    FEBasisOperationsBase<ValueTypeBasisCoeff,
+                          ValueTypeBasisData,
+                          memorySpace>::
+      distribute(
+        dftfe::linearAlgebra::MultiVector<ValueTypeBasisCoeff, memorySpace>
+          &multiVector) const
+    {
+      d_constraintInfo.distribute(multiVector, d_nVectors);
+    }
+
+
+  } // namespace basis
+} // namespace dftfe
diff --git a/utils/FEBasisOperationsDevice.t.cc b/utils/FEBasisOperationsDevice.t.cc
new file mode 100644
index 000000000..ef79fe1d5
--- /dev/null
+++ b/utils/FEBasisOperationsDevice.t.cc
@@ -0,0 +1,392 @@
+// ---------------------------------------------------------------------
+//
+// Copyright (c) 2017-2022  The Regents of the University of Michigan and DFT-FE
+// authors.
+//
+// This file is part of the DFT-FE code.
+//
+// The DFT-FE code is free software; you can use it, redistribute
+// it, and/or modify it under the terms of the GNU Lesser General
+// Public License as published by the Free Software Foundation; either
+// version 2.1 of the License, or (at your option) any later version.
+// The full text of the license can be found in the file LICENSE at
+// the top level of the DFT-FE distribution.
+//
+// ---------------------------------------------------------------------
+//
+
+#include <FEBasisOperations.h>
+#include <linearAlgebraOperations.h>
+#include <deviceKernelsGeneric.h>
+#include <DeviceBlasWrapper.h>
+#include <DeviceTypeConfig.h>
+#include <DeviceKernelLauncherConstants.h>
+#include <DeviceAPICalls.h>
+#include <DeviceDataTypeOverloads.h>
+#include <FEBasisOperationsKernelsDevice.h>
+namespace dftfe
+{
+  // namespace
+  // {
+  //   template <typename ValueType1, typename ValueType2>
+  //   __global__ void
+  //   reshapeNonAffineCaseDeviceKernel(const dftfe::size_type numVecs,
+  //                                    const dftfe::size_type numQuads,
+  //                                    const dftfe::size_type numCells,
+  //                                    const ValueType1 *     copyFromVec,
+  //                                    ValueType2 *           copyToVec)
+  //   {
+  //     const dftfe::size_type globalThreadId =
+  //       blockIdx.x * blockDim.x + threadIdx.x;
+  //     const dftfe::size_type numberEntries = numQuads * numCells * numVecs *
+  //     3;
+
+  //     for (dftfe::size_type index = globalThreadId; index < numberEntries;
+  //          index += blockDim.x * gridDim.x)
+  //       {
+  //         dftfe::size_type blockIndex  = index / numVecs;
+  //         dftfe::size_type iVec        = index - blockIndex * numVecs;
+  //         dftfe::size_type blockIndex2 = blockIndex / numQuads;
+  //         dftfe::size_type iQuad       = blockIndex - blockIndex2 * numQuads;
+  //         dftfe::size_type iCell       = blockIndex2 / 3;
+  //         dftfe::size_type iDim        = blockIndex2 - iCell * 3;
+  //         dftfe::utils::copyValue(
+  //           copyToVec + index,
+  //           copyFromVec[iVec + iDim * numVecs + iQuad * 3 * numVecs +
+  //                       iCell * 3 * numQuads * numVecs]);
+  //       }
+  //   }
+  // } // namespace
+
+  namespace basis
+  {
+    // Interpolates nodalData to quadrature values/gradients for all cells by
+    // delegating to interpolateKernel() with the cell range [0, d_nCells).
+    template <typename ValueTypeBasisCoeff, typename ValueTypeBasisData>
+    void
+    FEBasisOperations<ValueTypeBasisCoeff,
+                      ValueTypeBasisData,
+                      dftfe::utils::MemorySpace::DEVICE>::
+      interpolate(
+        dftfe::linearAlgebra::MultiVector<ValueTypeBasisCoeff,
+                                          dftfe::utils::MemorySpace::DEVICE>
+          &                  nodalData,
+        ValueTypeBasisCoeff *quadratureValues,
+        ValueTypeBasisCoeff *quadratureGradients) const
+    {
+      const std::pair<unsigned int, unsigned int> allCellsRange(0, d_nCells);
+      interpolateKernel(nodalData,
+                        quadratureValues,
+                        quadratureGradients,
+                        allCellsRange);
+    }
+
+    // Delegates to integrateWithBasisKernel() with the cell range
+    // (0, d_nCells), i.e. over all cells.
+    template <typename ValueTypeBasisCoeff, typename ValueTypeBasisData>
+    void
+    FEBasisOperations<ValueTypeBasisCoeff,
+                      ValueTypeBasisData,
+                      dftfe::utils::MemorySpace::DEVICE>::
+      integrateWithBasis(
+        ValueTypeBasisCoeff *quadratureValues,
+        ValueTypeBasisCoeff *quadratureGradients,
+        dftfe::linearAlgebra::MultiVector<ValueTypeBasisCoeff,
+                                          dftfe::utils::MemorySpace::DEVICE>
+          &nodalData) const
+    {
+      const std::pair<unsigned int, unsigned int> allCellsRange(0, d_nCells);
+      integrateWithBasisKernel(quadratureValues,
+                               quadratureGradients,
+                               nodalData,
+                               allCellsRange);
+    }
+
+
+    // Extracts cell-level nodal data from nodalData into cellNodalDataPtr for
+    // all cells by delegating to extractToCellNodalDataKernel() with the cell
+    // range [0, d_nCells).
+    template <typename ValueTypeBasisCoeff, typename ValueTypeBasisData>
+    void
+    FEBasisOperations<ValueTypeBasisCoeff,
+                      ValueTypeBasisData,
+                      dftfe::utils::MemorySpace::DEVICE>::
+      extractToCellNodalData(
+        dftfe::linearAlgebra::MultiVector<ValueTypeBasisCoeff,
+                                          dftfe::utils::MemorySpace::DEVICE>
+          &                  nodalData,
+        ValueTypeBasisCoeff *cellNodalDataPtr) const
+    {
+      const std::pair<unsigned int, unsigned int> allCellsRange(0, d_nCells);
+      extractToCellNodalDataKernel(nodalData, cellNodalDataPtr, allCellsRange);
+    }
+
+    // Accumulates cell-level nodal data from cellNodalDataPtr into nodalData
+    // for all cells by delegating to accumulateFromCellNodalDataKernel() with
+    // the cell range [0, d_nCells).
+    template <typename ValueTypeBasisCoeff, typename ValueTypeBasisData>
+    void
+    FEBasisOperations<ValueTypeBasisCoeff,
+                      ValueTypeBasisData,
+                      dftfe::utils::MemorySpace::DEVICE>::
+      accumulateFromCellNodalData(
+        const ValueTypeBasisCoeff *cellNodalDataPtr,
+        dftfe::linearAlgebra::MultiVector<ValueTypeBasisCoeff,
+                                          dftfe::utils::MemorySpace::DEVICE>
+          &nodalData) const
+    {
+      const std::pair<unsigned int, unsigned int> allCellsRange(0, d_nCells);
+      accumulateFromCellNodalDataKernel(cellNodalDataPtr,
+                                        nodalData,
+                                        allCellsRange);
+    }
+
+
+
+    // Interpolates nodalValues on the cells in cellRange in two steps:
+    //  1. extract the cell-level nodal data into the scratch buffer
+    //     tempCellNodalData,
+    //  2. run the pointer-based interpolateKernel() overload on that buffer.
+    template <typename ValueTypeBasisCoeff, typename ValueTypeBasisData>
+    void
+    FEBasisOperations<ValueTypeBasisCoeff,
+                      ValueTypeBasisData,
+                      dftfe::utils::MemorySpace::DEVICE>::
+      interpolateKernel(
+        const dftfe::linearAlgebra::MultiVector<
+          ValueTypeBasisCoeff,
+          dftfe::utils::MemorySpace::DEVICE> &      nodalValues,
+        ValueTypeBasisCoeff *                       quadratureValues,
+        ValueTypeBasisCoeff *                       quadratureGradients,
+        const std::pair<unsigned int, unsigned int> cellRange) const
+    {
+      ValueTypeBasisCoeff *const cellLevelNodalValues =
+        tempCellNodalData.data();
+      extractToCellNodalDataKernel(nodalValues,
+                                   cellLevelNodalValues,
+                                   cellRange);
+      interpolateKernel(cellLevelNodalValues,
+                        quadratureValues,
+                        quadratureGradients,
+                        cellRange);
+    }
+
+    // Interpolates cell-level nodal values to quadrature point values and/or
+    // gradients for the cells in cellRange, using batched device BLAS calls
+    // (one batch entry per cell, cellRange.second - cellRange.first entries).
+    //
+    // cellNodalValues     : cell-level nodal data, read with a per-cell stride
+    //                       of d_nVectors * d_nDofsPerCell starting at the
+    //                       pointer itself (no cellRange.first offset is
+    //                       applied here).
+    // quadratureValues    : output for values; skipped when NULL.
+    // quadratureGradients : output for gradients; skipped when NULL.
+    // cellRange           : (first, second) cell indices; only the inverse
+    //                       Jacobian data is offset by cellRange.first.
+    //
+    // NOTE(review): the comments on the GEMM calls assume the wrapper follows
+    // the cuBLAS gemmStridedBatched argument order
+    // (m, n, k, alpha, A, lda, strideA, B, ldb, strideB, beta, C, ldc,
+    // strideC, batchCount) -- confirm against DeviceBlasWrapper.h.
+    template <typename ValueTypeBasisCoeff, typename ValueTypeBasisData>
+    void
+    FEBasisOperations<ValueTypeBasisCoeff,
+                      ValueTypeBasisData,
+                      dftfe::utils::MemorySpace::DEVICE>::
+      interpolateKernel(
+        const ValueTypeBasisCoeff *                 cellNodalValues,
+        ValueTypeBasisCoeff *                       quadratureValues,
+        ValueTypeBasisCoeff *                       quadratureGradients,
+        const std::pair<unsigned int, unsigned int> cellRange) const
+    {
+      const ValueTypeBasisCoeff scalarCoeffAlpha = ValueTypeBasisCoeff(1.0),
+                                scalarCoeffBeta  = ValueTypeBasisCoeff(0.0);
+
+      // Values: per cell, nodal block times the shape function data. The
+      // shape function data is passed with stride 0, i.e. the same data is
+      // used for every cell in the batch.
+      if (quadratureValues != NULL)
+        dftfe::utils::deviceBlasWrapper::gemmStridedBatched(
+          *d_deviceBlasHandlePtr,
+          dftfe::utils::DEVICEBLAS_OP_N,
+          dftfe::utils::DEVICEBLAS_OP_N,
+          d_nVectors,
+          d_nQuadsPerCell[d_quadratureID],
+          d_nDofsPerCell,
+          &scalarCoeffAlpha,
+          cellNodalValues,
+          d_nVectors,
+          d_nVectors * d_nDofsPerCell,
+          d_shapeFunctionData[d_quadratureID].data(),
+          d_nDofsPerCell,
+          0,
+          &scalarCoeffBeta,
+          quadratureValues,
+          d_nVectors,
+          d_nVectors * d_nQuadsPerCell[d_quadratureID],
+          cellRange.second - cellRange.first);
+      if (quadratureGradients != NULL)
+        {
+          // Gradients, step 1: apply the shape function gradient data (shared
+          // across cells, stride 0). For Cartesian cells the result is written
+          // directly to quadratureGradients; otherwise to the scratch buffer
+          // tempQuadratureGradientsData.
+          dftfe::utils::deviceBlasWrapper::gemmStridedBatched(
+            *d_deviceBlasHandlePtr,
+            dftfe::utils::DEVICEBLAS_OP_N,
+            dftfe::utils::DEVICEBLAS_OP_N,
+            d_nVectors,
+            d_nQuadsPerCell[d_quadratureID] * 3,
+            d_nDofsPerCell,
+            &scalarCoeffAlpha,
+            cellNodalValues,
+            d_nVectors,
+            d_nVectors * d_nDofsPerCell,
+            d_shapeFunctionGradientDataInternalLayout[d_quadratureID].data(),
+            d_nDofsPerCell,
+            0,
+            &scalarCoeffBeta,
+            areAllCellsCartesian ? quadratureGradients :
+                                   tempQuadratureGradientsData.data(),
+            d_nVectors,
+            d_nVectors * d_nQuadsPerCell[d_quadratureID] * 3,
+            cellRange.second - cellRange.first);
+          // Gradients, step 2: apply the inverse Jacobian data.
+          if (areAllCellsCartesian)
+            {
+              // Cartesian: 3 inverse Jacobian entries per cell are passed to
+              // stridedBlockScale, which operates on quadratureGradients in
+              // place (3 blocks per cell, each of size nQuads * nVectors).
+              dftfe::utils::deviceKernelsGeneric::stridedBlockScale(
+                d_nQuadsPerCell[d_quadratureID] * d_nVectors,
+                3 * (cellRange.second - cellRange.first),
+                ValueTypeBasisCoeff(1.0),
+                d_inverseJacobianData[0].data() + cellRange.first * 3,
+                quadratureGradients);
+            }
+          else if (areAllCellsAffine)
+            {
+              // Affine: one 3x3 inverse Jacobian block per cell (stride 9),
+              // taken from slot 0 of d_inverseJacobianData.
+              dftfe::utils::deviceBlasWrapper::gemmStridedBatched(
+                *d_deviceBlasHandlePtr,
+                dftfe::utils::DEVICEBLAS_OP_N,
+                dftfe::utils::DEVICEBLAS_OP_N,
+                d_nQuadsPerCell[d_quadratureID] * d_nVectors,
+                3,
+                3,
+                &scalarCoeffAlpha,
+                tempQuadratureGradientsData.data(),
+                d_nQuadsPerCell[d_quadratureID] * d_nVectors,
+                d_nQuadsPerCell[d_quadratureID] * d_nVectors * 3,
+                d_inverseJacobianData[0].data() + 9 * cellRange.first,
+                3,
+                9,
+                &scalarCoeffBeta,
+                quadratureGradients,
+                d_nQuadsPerCell[d_quadratureID] * d_nVectors,
+                d_nVectors * d_nQuadsPerCell[d_quadratureID] * 3,
+                cellRange.second - cellRange.first);
+            }
+          else
+            {
+              // General case: one 3x3 inverse Jacobian block per quadrature
+              // point, so the batch count is nCells * nQuads. The result goes
+              // to tempQuadratureGradientsDataNonAffine and is then passed
+              // through reshapeNonAffineCase into quadratureGradients.
+              dftfe::utils::deviceBlasWrapper::gemmStridedBatched(
+                *d_deviceBlasHandlePtr,
+                dftfe::utils::DEVICEBLAS_OP_N,
+                dftfe::utils::DEVICEBLAS_OP_N,
+                d_nVectors,
+                3,
+                3,
+                &scalarCoeffAlpha,
+                tempQuadratureGradientsData.data(),
+                d_nVectors,
+                d_nVectors * 3,
+                d_inverseJacobianData[d_quadratureID].data() +
+                  9 * cellRange.first * d_nQuadsPerCell[d_quadratureID],
+                3,
+                9,
+                &scalarCoeffBeta,
+                tempQuadratureGradientsDataNonAffine.data(),
+                d_nVectors,
+                d_nVectors * 3,
+                (cellRange.second - cellRange.first) *
+                  d_nQuadsPerCell[d_quadratureID]);
+              dftfe::basis::FEBasisOperationsKernelsDevice::
+                reshapeNonAffineCase(
+                  d_nVectors,
+                  d_nQuadsPerCell[d_quadratureID],
+                  (cellRange.second - cellRange.first),
+                  tempQuadratureGradientsDataNonAffine.data(),
+                  quadratureGradients);
+              // #ifdef DFTFE_WITH_DEVICE_LANG_CUDA
+              //               reshapeNonAffineCaseDeviceKernel<<<
+              //                 (d_nVectors * (cellRange.second -
+              //                 cellRange.first) *
+              //                  d_nQuadsPerCell[d_quadratureID] * 3) /
+              //                     dftfe::utils::DEVICE_BLOCK_SIZE +
+              //                   1,
+              //                 dftfe::utils::DEVICE_BLOCK_SIZE>>>(
+              //                 d_nVectors,
+              //                 d_nQuadsPerCell[d_quadratureID],
+              //                 (cellRange.second - cellRange.first),
+              //                 dftfe::utils::makeDataTypeDeviceCompatible(
+              //                   tempQuadratureGradientsDataNonAffine.data()),
+              //                 dftfe::utils::makeDataTypeDeviceCompatible(
+              //                   quadratureGradients));
+              // #elif DFTFE_WITH_DEVICE_LANG_HIP
+              //               hipLaunchKernelGGL(reshapeNonAffineCaseDeviceKernel,
+              //                                  (d_nVectors *
+              //                                   (cellRange.second -
+              //                                   cellRange.first) *
+              //                                   d_nQuadsPerCell[d_quadratureID]
+              //                                   * 3) /
+              //                                      dftfe::utils::DEVICE_BLOCK_SIZE
+              //                                      +
+              //                                    1,
+              //                                  dftfe::utils::DEVICE_BLOCK_SIZE,
+              //                                  0,
+              //                                  0,
+              //                                  d_nVectors,
+              //                                  d_nQuadsPerCell[d_quadratureID],
+              //                                  (cellRange.second -
+              //                                  cellRange.first),
+              //                                  dftfe::utils::makeDataTypeDeviceCompatible(
+              //                                    tempQuadratureGradientsDataNonAffine.data()),
+              //                                  dftfe::utils::makeDataTypeDeviceCompatible(
+              //                                    quadratureGradients));
+              // #endif
+            }
+        }
+    }
+
+    // Device implementation of integrateWithBasisKernel.
+    // The body is empty: none of the arguments are read and nodalData is left
+    // unmodified, so integrateWithBasis() on the DEVICE memory space is
+    // currently a no-op.
+    // NOTE(review): looks like an unimplemented placeholder -- confirm that no
+    // caller relies on this producing a result.
+    template <typename ValueTypeBasisCoeff, typename ValueTypeBasisData>
+    void
+    FEBasisOperations<ValueTypeBasisCoeff,
+                      ValueTypeBasisData,
+                      dftfe::utils::MemorySpace::DEVICE>::
+      integrateWithBasisKernel(
+        const ValueTypeBasisCoeff *quadratureValues,
+        const ValueTypeBasisCoeff *quadratureGradients,
+        dftfe::linearAlgebra::MultiVector<ValueTypeBasisCoeff,
+                                          dftfe::utils::MemorySpace::DEVICE>
+          &                                         nodalData,
+        const std::pair<unsigned int, unsigned int> cellRange) const
+    {}
+
+    // Copies nodal data for the cells in cellRange from nodalData into
+    // cellNodalDataPtr via stridedCopyToBlock, using the slice of
+    // d_flattenedCellDofIndexToProcessDofIndexMap that starts at
+    // cellRange.first * d_nDofsPerCell. The output pointer itself is not
+    // offset by cellRange.first.
+    template <typename ValueTypeBasisCoeff, typename ValueTypeBasisData>
+    void
+    FEBasisOperations<ValueTypeBasisCoeff,
+                      ValueTypeBasisData,
+                      dftfe::utils::MemorySpace::DEVICE>::
+      extractToCellNodalDataKernel(
+        const dftfe::linearAlgebra::MultiVector<
+          ValueTypeBasisCoeff,
+          dftfe::utils::MemorySpace::DEVICE> &      nodalData,
+        ValueTypeBasisCoeff *                       cellNodalDataPtr,
+        const std::pair<unsigned int, unsigned int> cellRange) const
+    {
+      const auto nCellsInRange  = cellRange.second - cellRange.first;
+      const auto firstDofOffset = cellRange.first * d_nDofsPerCell;
+      dftfe::utils::deviceKernelsGeneric::stridedCopyToBlock(
+        d_nVectors,
+        nCellsInRange * d_nDofsPerCell,
+        nodalData.data(),
+        cellNodalDataPtr,
+        d_flattenedCellDofIndexToProcessDofIndexMap.data() + firstDofOffset);
+    }
+
+    // Adds the cell-level data in cellNodalDataPtr for the cells in cellRange
+    // into nodalData via axpyStridedBlockAtomicAdd, using the slice of
+    // d_flattenedCellDofIndexToProcessDofIndexMap that starts at
+    // cellRange.first * d_nDofsPerCell. The input pointer itself is not offset
+    // by cellRange.first.
+    template <typename ValueTypeBasisCoeff, typename ValueTypeBasisData>
+    void
+    FEBasisOperations<ValueTypeBasisCoeff,
+                      ValueTypeBasisData,
+                      dftfe::utils::MemorySpace::DEVICE>::
+      accumulateFromCellNodalDataKernel(
+        const ValueTypeBasisCoeff *cellNodalDataPtr,
+        dftfe::linearAlgebra::MultiVector<ValueTypeBasisCoeff,
+                                          dftfe::utils::MemorySpace::DEVICE>
+          &                                         nodalData,
+        const std::pair<unsigned int, unsigned int> cellRange) const
+    {
+      const auto nCellsInRange  = cellRange.second - cellRange.first;
+      const auto firstDofOffset = cellRange.first * d_nDofsPerCell;
+      dftfe::utils::deviceKernelsGeneric::axpyStridedBlockAtomicAdd(
+        d_nVectors,
+        nCellsInRange * d_nDofsPerCell,
+        cellNodalDataPtr,
+        nodalData.begin(),
+        d_flattenedCellDofIndexToProcessDofIndexMap.begin() + firstDofOffset);
+    }
+
+    // Stores the given device BLAS handle pointer in d_deviceBlasHandlePtr.
+    // Only the pointer is stored; the handle itself is not copied, so the
+    // pointed-to handle must remain valid while this object uses it.
+    template <typename ValueTypeBasisCoeff, typename ValueTypeBasisData>
+    void
+    FEBasisOperations<ValueTypeBasisCoeff,
+                      ValueTypeBasisData,
+                      dftfe::utils::MemorySpace::DEVICE>::
+      setDeviceBLASHandle(dftfe::utils::deviceBlasHandle_t *deviceBlasHandlePtr)
+    {
+      d_deviceBlasHandlePtr = deviceBlasHandlePtr;
+    }
+
+    // Returns a reference to the device BLAS handle pointed to by
+    // d_deviceBlasHandlePtr. The pointer is dereferenced without a null check,
+    // so setDeviceBLASHandle() must have been called with a valid pointer
+    // first.
+    template <typename ValueTypeBasisCoeff, typename ValueTypeBasisData>
+    dftfe::utils::deviceBlasHandle_t &
+    FEBasisOperations<ValueTypeBasisCoeff,
+                      ValueTypeBasisData,
+                      dftfe::utils::MemorySpace::DEVICE>::getDeviceBLASHandle()
+    {
+      return *d_deviceBlasHandlePtr;
+    }
+  } // namespace basis
+} // namespace dftfe
diff --git a/utils/FEBasisOperationsHost.t.cc b/utils/FEBasisOperationsHost.t.cc
new file mode 100644
index 000000000..8b17a7265
--- /dev/null
+++ b/utils/FEBasisOperationsHost.t.cc
@@ -0,0 +1,461 @@
+// ---------------------------------------------------------------------
+//
+// Copyright (c) 2017-2022  The Regents of the University of Michigan and DFT-FE
+// authors.
+//
+// This file is part of the DFT-FE code.
+//
+// The DFT-FE code is free software; you can use it, redistribute
+// it, and/or modify it under the terms of the GNU Lesser General
+// Public License as published by the Free Software Foundation; either
+// version 2.1 of the License, or (at your option) any later version.
+// The full text of the license can be found in the file LICENSE at
+// the top level of the DFT-FE distribution.
+//
+// ---------------------------------------------------------------------
+//
+
+#include <FEBasisOperations.h>
+#include <linearAlgebraOperations.h>
+namespace dftfe
+{
+  namespace basis
+  {
+    // Runs interpolateKernel on nodalData over the full cell range
+    // [0, d_nCells), forwarding both output pointers unchanged.
+    template <typename ValueTypeBasisCoeff, typename ValueTypeBasisData>
+    void
+    FEBasisOperations<ValueTypeBasisCoeff, ValueTypeBasisData,
+                      dftfe::utils::MemorySpace::HOST>::
+      interpolate(
+        dftfe::linearAlgebra::MultiVector<ValueTypeBasisCoeff,
+                                          dftfe::utils::MemorySpace::HOST>
+          &                  nodalData,
+        ValueTypeBasisCoeff *quadratureValues,
+        ValueTypeBasisCoeff *quadratureGradients) const
+    {
+      const std::pair<unsigned int, unsigned int> everyCell(0, d_nCells);
+      interpolateKernel(nodalData, quadratureValues, quadratureGradients,
+                        everyCell);
+    }
+
+    // Runs integrateWithBasisKernel over the full cell range [0, d_nCells).
+    // That kernel adds its per-cell results into nodalData through
+    // accumulateFromCellNodalDataKernel.
+    template <typename ValueTypeBasisCoeff, typename ValueTypeBasisData>
+    void
+    FEBasisOperations<ValueTypeBasisCoeff, ValueTypeBasisData,
+                      dftfe::utils::MemorySpace::HOST>::
+      integrateWithBasis(
+        ValueTypeBasisCoeff *quadratureValues,
+        ValueTypeBasisCoeff *quadratureGradients,
+        dftfe::linearAlgebra::MultiVector<ValueTypeBasisCoeff,
+                                          dftfe::utils::MemorySpace::HOST>
+          &nodalData) const
+    {
+      const std::pair<unsigned int, unsigned int> everyCell(0, d_nCells);
+      integrateWithBasisKernel(quadratureValues, quadratureGradients, nodalData,
+                               everyCell);
+    }
+
+
+    // Runs extractToCellNodalDataKernel over the full cell range
+    // [0, d_nCells), copying the nodal entries of nodalData into the
+    // cell-level buffer cellNodalDataPtr.
+    template <typename ValueTypeBasisCoeff, typename ValueTypeBasisData>
+    void
+    FEBasisOperations<ValueTypeBasisCoeff, ValueTypeBasisData,
+                      dftfe::utils::MemorySpace::HOST>::
+      extractToCellNodalData(
+        dftfe::linearAlgebra::MultiVector<ValueTypeBasisCoeff,
+                                          dftfe::utils::MemorySpace::HOST>
+          &                  nodalData,
+        ValueTypeBasisCoeff *cellNodalDataPtr) const
+    {
+      const std::pair<unsigned int, unsigned int> everyCell(0, d_nCells);
+      extractToCellNodalDataKernel(nodalData, cellNodalDataPtr, everyCell);
+    }
+
+    // Runs accumulateFromCellNodalDataKernel over the full cell range
+    // [0, d_nCells), adding the cell-level data into nodalData.
+    template <typename ValueTypeBasisCoeff, typename ValueTypeBasisData>
+    void
+    FEBasisOperations<ValueTypeBasisCoeff, ValueTypeBasisData,
+                      dftfe::utils::MemorySpace::HOST>::
+      accumulateFromCellNodalData(
+        const ValueTypeBasisCoeff *cellNodalDataPtr,
+        dftfe::linearAlgebra::MultiVector<ValueTypeBasisCoeff,
+                                          dftfe::utils::MemorySpace::HOST>
+          &nodalData) const
+    {
+      const std::pair<unsigned int, unsigned int> everyCell(0, d_nCells);
+      accumulateFromCellNodalDataKernel(cellNodalDataPtr, nodalData,
+                                        everyCell);
+    }
+    // Interpolates cell by cell through the pointer overload. The output
+    // pointers advance by one cell after each call so that cell i of the range
+    // fills slot i; every single-cell call writes at offset 0 of its arguments.
+    template <typename ValueTypeBasisCoeff, typename ValueTypeBasisData>
+    void
+    FEBasisOperations<ValueTypeBasisCoeff, ValueTypeBasisData,
+                      dftfe::utils::MemorySpace::HOST>::interpolateKernel(
+      const dftfe::linearAlgebra::MultiVector<ValueTypeBasisCoeff,
+                                              dftfe::utils::MemorySpace::HOST>
+        &                                         nodalValues,
+      ValueTypeBasisCoeff *                       quadratureValues,
+      ValueTypeBasisCoeff *                       quadratureGradients,
+      const std::pair<unsigned int, unsigned int> cellRange) const
+    {
+      const auto cellStride = d_nQuadsPerCell[d_quadratureID] * d_nVectors;
+      for (unsigned int iCell = cellRange.first; iCell < cellRange.second;
+           ++iCell)
+        {
+          const std::pair<unsigned int, unsigned int> oneCell(iCell, iCell + 1);
+          extractToCellNodalDataKernel(nodalValues, tempCellNodalData.data(),
+                                       oneCell);
+          interpolateKernel(tempCellNodalData.data(), quadratureValues,
+                            quadratureGradients, oneCell);
+          quadratureValues += quadratureValues ? cellStride : 0;
+          quadratureGradients += quadratureGradients ? 3 * cellStride : 0;
+        }
+    }
+    // Host interpolation kernel working on cell-level nodal data. For every
+    // cell in [cellRange.first, cellRange.second) it multiplies that cell's
+    // nodal block with the shape-function table (values) and with the
+    // shape-function gradient table followed by the inverse-Jacobian data
+    // (gradients). Either output pointer may be NULL; that output is skipped.
+    // The nodal input and both outputs are indexed relative to
+    // cellRange.first, while the inverse-Jacobian data is indexed with the
+    // absolute cell index.
+    template <typename ValueTypeBasisCoeff, typename ValueTypeBasisData>
+    void
+    FEBasisOperations<ValueTypeBasisCoeff, ValueTypeBasisData,
+                      dftfe::utils::MemorySpace::HOST>::
+      interpolateKernel(
+        const ValueTypeBasisCoeff *                 cellNodalValues,
+        ValueTypeBasisCoeff *                       quadratureValues,
+        ValueTypeBasisCoeff *                       quadratureGradients,
+        const std::pair<unsigned int, unsigned int> cellRange) const
+    {
+      for (unsigned int iCell = cellRange.first; iCell < cellRange.second;
+           ++iCell)
+        {
+          const ValueTypeBasisCoeff unity = ValueTypeBasisCoeff(1.0),
+                                    zero  = ValueTypeBasisCoeff(0.0);
+          const char noTransA = 'N', noTransB = 'N';
+          // Index of this cell within the caller-provided buffers.
+          const unsigned int localCell = iCell - cellRange.first;
+          // Offset of this cell's nodal block inside cellNodalValues.
+          const auto nodalOffset = d_nDofsPerCell * localCell * d_nVectors;
+
+          if (quadratureValues != NULL)
+            xgemm(&noTransA,
+                  &noTransB,
+                  &d_nVectors,
+                  &d_nQuadsPerCell[d_quadratureID],
+                  &d_nDofsPerCell,
+                  &unity,
+                  cellNodalValues + nodalOffset,
+                  &d_nVectors,
+                  d_shapeFunctionData[d_quadratureID].data(),
+                  &d_nDofsPerCell,
+                  &zero,
+                  quadratureValues +
+                    d_nQuadsPerCell[d_quadratureID] * localCell * d_nVectors,
+                  &d_nVectors);
+
+          // Nothing else to do for this cell when gradients are not requested.
+          if (quadratureGradients == NULL)
+            continue;
+
+          const auto &nQuads = d_nQuadsPerCell[d_quadratureID];
+          const unsigned int nQuadsTimesThree    = nQuads * 3;
+          const unsigned int nQuadsTimesnVectors = nQuads * d_nVectors;
+          const unsigned int one = 1, three = 3;
+          // Start of this cell's gradient block in the output buffer.
+          ValueTypeBasisCoeff *cellGradients =
+            quadratureGradients + nQuads * d_nVectors * 3 * localCell;
+
+          // Product with the shape-function gradient table: written straight
+          // to the output when all cells are Cartesian, otherwise to the
+          // scratch buffer that the inverse-Jacobian step below consumes.
+          xgemm(&noTransA,
+                &noTransB,
+                &d_nVectors,
+                &nQuadsTimesThree,
+                &d_nDofsPerCell,
+                &unity,
+                cellNodalValues + nodalOffset,
+                &d_nVectors,
+                d_shapeFunctionGradientDataInternalLayout[d_quadratureID]
+                  .data(),
+                &d_nDofsPerCell,
+                &zero,
+                areAllCellsCartesian ? cellGradients :
+                                       tempQuadratureGradientsData.data(),
+                &d_nVectors);
+          if (areAllCellsCartesian)
+            {
+              // One scaling factor per direction, three stored per cell.
+              for (unsigned int iDim = 0; iDim < 3; ++iDim)
+                xscal(&nQuadsTimesnVectors,
+                      d_inverseJacobianData[0].data() + 3 * iCell + iDim,
+                      cellGradients + nQuads * d_nVectors * iDim,
+                      &one);
+            }
+          else if (areAllCellsAffine)
+            {
+              // One 3x3 block (9 entries) of inverse-Jacobian data per cell.
+              xgemm(&noTransA,
+                    &noTransB,
+                    &nQuadsTimesnVectors,
+                    &three,
+                    &three,
+                    &unity,
+                    tempQuadratureGradientsData.data(),
+                    &nQuadsTimesnVectors,
+                    d_inverseJacobianData[0].data() + 9 * iCell,
+                    &three,
+                    &zero,
+                    cellGradients,
+                    &nQuadsTimesnVectors);
+            }
+          else
+            {
+              // One 3x3 block per quadrature point: transform each point's
+              // d_nVectors x 3 slab into the second scratch buffer.
+              for (unsigned int iQuad = 0; iQuad < nQuads; ++iQuad)
+                xgemm(&noTransA,
+                      &noTransB,
+                      &d_nVectors,
+                      &three,
+                      &three,
+                      &unity,
+                      tempQuadratureGradientsData.data() +
+                        iQuad * d_nVectors * 3,
+                      &d_nVectors,
+                      d_inverseJacobianData[d_quadratureID].data() +
+                        9 * nQuads * iCell + 9 * iQuad,
+                      &three,
+                      &zero,
+                      tempQuadratureGradientsDataNonAffine.data() +
+                        iQuad * d_nVectors * 3,
+                      &d_nVectors);
+              // Reorder from (quad, dim, vector) to (dim, quad, vector).
+              for (unsigned int iQuad = 0; iQuad < nQuads; ++iQuad)
+                for (unsigned int iDim = 0; iDim < 3; ++iDim)
+                  std::memcpy(cellGradients + d_nVectors * nQuads * iDim +
+                                d_nVectors * iQuad,
+                              tempQuadratureGradientsDataNonAffine.data() +
+                                d_nVectors * 3 * iQuad + d_nVectors * iDim,
+                              d_nVectors * sizeof(ValueTypeBasisCoeff));
+            }
+        }
+    }
+
+    // Host integration kernel. For each cell in [cellRange.first,
+    // cellRange.second) the quadrature data is multiplied with the transposed
+    // shape-function tables into that cell's slot of a scratch buffer, which
+    // accumulateFromCellNodalDataKernel then adds into nodalData. A NULL input
+    // pointer skips that contribution. The value and gradient products both
+    // yield d_nVectors x d_nDofsPerCell entries and are summed.
+    // NOTE(review): inputs are indexed by absolute cell index here, while
+    // interpolateKernel indexes relative to cellRange.first - confirm intent.
+    template <typename ValueTypeBasisCoeff, typename ValueTypeBasisData>
+    void
+    FEBasisOperations<ValueTypeBasisCoeff, ValueTypeBasisData,
+                      dftfe::utils::MemorySpace::HOST>::
+      integrateWithBasisKernel(
+        const ValueTypeBasisCoeff *quadratureValues,
+        const ValueTypeBasisCoeff *quadratureGradients,
+        dftfe::linearAlgebra::MultiVector<ValueTypeBasisCoeff,
+                                          dftfe::utils::MemorySpace::HOST>
+          &                                         nodalData,
+        const std::pair<unsigned int, unsigned int> cellRange) const
+    {
+      // Without any input there is nothing to add to nodalData.
+      if (quadratureValues == NULL && quadratureGradients == NULL)
+        return;
+      dftfe::utils::MemoryStorage<ValueTypeBasisCoeff,
+                                  dftfe::utils::MemorySpace::HOST>
+        cellNodalData, tempQuadratureGradientsData,
+        tempQuadratureGradientsDataNonAffine;
+      cellNodalData.resize(d_nVectors * d_nDofsPerCell * d_nCells);
+      if (quadratureGradients != NULL)
+        {
+          tempQuadratureGradientsData.resize(3 * d_nVectors *
+                                             d_nQuadsPerCell[d_quadratureID]);
+          tempQuadratureGradientsDataNonAffine.resize(
+            areAllCellsAffine ?
+              0 :
+              (3 * d_nVectors * d_nQuadsPerCell[d_quadratureID]));
+        }
+
+      for (unsigned int iCell = cellRange.first; iCell < cellRange.second;
+           ++iCell)
+        {
+          const ValueTypeBasisCoeff scalarCoeffAlpha = ValueTypeBasisCoeff(1.0),
+                                    scalarCoeffBeta  = ValueTypeBasisCoeff(0.0);
+          const char transA = 'N', transB = 'T';
+          // Same offset the accumulate kernel reads from for this cell.
+          ValueTypeBasisCoeff *cellSlot =
+            cellNodalData.data() + d_nVectors * d_nDofsPerCell * iCell;
+
+          if (quadratureValues != NULL)
+            xgemm(&transA,
+                  &transB,
+                  &d_nVectors,
+                  &d_nDofsPerCell,
+                  &d_nQuadsPerCell[d_quadratureID],
+                  &scalarCoeffAlpha,
+                  quadratureValues +
+                    d_nQuadsPerCell[d_quadratureID] * d_nVectors * iCell,
+                  &d_nVectors,
+                  d_shapeFunctionData[d_quadratureID].data(),
+                  &d_nDofsPerCell,
+                  &scalarCoeffBeta,
+                  cellSlot,
+                  &d_nVectors);
+          if (quadratureGradients != NULL)
+            {
+              const auto &nQuads = d_nQuadsPerCell[d_quadratureID];
+              const unsigned int nQuadsTimesnVectors = nQuads * d_nVectors;
+              const unsigned int nQuadsTimesThree    = nQuads * 3;
+              const unsigned int one = 1, three = 3;
+              const ValueTypeBasisCoeff *cellGradients =
+                quadratureGradients + nQuads * d_nVectors * 3 * iCell;
+              if (areAllCellsCartesian)
+                {
+                  std::memcpy(tempQuadratureGradientsData.data(),
+                              cellGradients,
+                              3 * nQuadsTimesnVectors *
+                                sizeof(ValueTypeBasisCoeff));
+                  for (unsigned int iDim = 0; iDim < 3; ++iDim)
+                    xscal(&nQuadsTimesnVectors,
+                          d_inverseJacobianData[0].data() + 3 * iCell + iDim,
+                          tempQuadratureGradientsData.data() +
+                            nQuads * d_nVectors * iDim,
+                          &one);
+                }
+              else if (areAllCellsAffine)
+                {
+                  xgemm(&transA,
+                        &transB,
+                        &nQuadsTimesnVectors,
+                        &three,
+                        &three,
+                        &scalarCoeffAlpha,
+                        cellGradients,
+                        &nQuadsTimesnVectors,
+                        d_inverseJacobianData[0].data() + 9 * iCell,
+                        &three,
+                        &scalarCoeffBeta,
+                        tempQuadratureGradientsData.data(),
+                        &nQuadsTimesnVectors);
+                }
+              else
+                {
+                  for (unsigned int iQuad = 0; iQuad < nQuads; ++iQuad)
+                    for (unsigned int iDim = 0; iDim < 3; ++iDim)
+                      std::memcpy(tempQuadratureGradientsDataNonAffine.data() +
+                                    d_nVectors * 3 * iQuad + d_nVectors * iDim,
+                                  cellGradients + d_nVectors * nQuads * iDim +
+                                    d_nVectors * iQuad,
+                                  d_nVectors * sizeof(ValueTypeBasisCoeff));
+                  for (unsigned int iQuad = 0; iQuad < nQuads; ++iQuad)
+                    xgemm(&transA,
+                          &transB,
+                          &d_nVectors,
+                          &three,
+                          &three,
+                          &scalarCoeffAlpha,
+                          tempQuadratureGradientsDataNonAffine.data() +
+                            d_nVectors * 3 * iQuad,
+                          &d_nVectors,
+                          d_inverseJacobianData[d_quadratureID].data() +
+                            9 * nQuads * iCell + 9 * iQuad,
+                          &three,
+                          &scalarCoeffBeta,
+                          tempQuadratureGradientsData.data() +
+                            d_nVectors * 3 * iQuad,
+                          &d_nVectors);
+                }
+              // beta = 1 keeps the value contribution written just above.
+              xgemm(&transA,
+                    &transB,
+                    &d_nVectors,
+                    &d_nDofsPerCell,
+                    &nQuadsTimesThree,
+                    &scalarCoeffAlpha,
+                    tempQuadratureGradientsData.data(),
+                    &d_nVectors,
+                    d_shapeFunctionGradientDataInternalLayout[d_quadratureID]
+                      .data(),
+                    &d_nDofsPerCell,
+                    quadratureValues != NULL ? &scalarCoeffAlpha :
+                                               &scalarCoeffBeta,
+                    cellSlot,
+                    &d_nVectors);
+            }
+          accumulateFromCellNodalDataKernel(
+            cellNodalData.data(),
+            nodalData,
+            std::pair<unsigned int, unsigned int>(iCell, iCell + 1));
+        }
+    }
+
+    // For every cell in [cellRange.first, cellRange.second) and every
+    // cell-local dof, copies d_nVectors contiguous entries of nodalData,
+    // starting at the index given by the cell-dof map, into cellNodalDataPtr.
+    // The destination is indexed relative to cellRange.first.
+    template <typename ValueTypeBasisCoeff, typename ValueTypeBasisData>
+    void
+    FEBasisOperations<ValueTypeBasisCoeff, ValueTypeBasisData,
+                      dftfe::utils::MemorySpace::HOST>::
+      extractToCellNodalDataKernel(
+        const dftfe::linearAlgebra::MultiVector<ValueTypeBasisCoeff,
+                                                dftfe::utils::MemorySpace::HOST>
+          &                                         nodalData,
+        ValueTypeBasisCoeff *                       cellNodalDataPtr,
+        const std::pair<unsigned int, unsigned int> cellRange) const
+    {
+      const auto bytesPerDof = d_nVectors * sizeof(ValueTypeBasisCoeff);
+      for (unsigned int cell = cellRange.first; cell < cellRange.second; ++cell)
+        for (unsigned int dof = 0; dof < d_nDofsPerCell; ++dof)
+          std::memcpy(cellNodalDataPtr +
+                        (cell - cellRange.first) * d_nVectors * d_nDofsPerCell +
+                        dof * d_nVectors,
+                      nodalData.data() +
+                        d_flattenedCellDofIndexToProcessDofIndexMap
+                          [cell * d_nDofsPerCell + dof],
+                      bytesPerDof);
+    }
+
+    // Adds each cell-dof block of d_nVectors entries to nodalData at the mapped
+    // index. NOTE(review): the source offset uses the absolute cell index.
+    template <typename ValueTypeBasisCoeff, typename ValueTypeBasisData>
+    void
+    FEBasisOperations<ValueTypeBasisCoeff, ValueTypeBasisData,
+                      dftfe::utils::MemorySpace::HOST>::
+      accumulateFromCellNodalDataKernel(
+        const ValueTypeBasisCoeff *cellNodalDataPtr,
+        dftfe::linearAlgebra::MultiVector<ValueTypeBasisCoeff,
+                                          dftfe::utils::MemorySpace::HOST>
+          &                                         nodalData,
+        const std::pair<unsigned int, unsigned int> cellRange) const
+    {
+      for (unsigned int cell = cellRange.first; cell < cellRange.second; ++cell)
+        for (unsigned int dof = 0; dof < d_nDofsPerCell; ++dof)
+          {
+            const ValueTypeBasisCoeff *src =
+              cellNodalDataPtr + cell * d_nVectors * d_nDofsPerCell +
+              dof * d_nVectors;
+            ValueTypeBasisCoeff *dst =
+              nodalData.data() + d_flattenedCellDofIndexToProcessDofIndexMap
+                                   [cell * d_nDofsPerCell + dof];
+            for (unsigned int iVec = 0; iVec < d_nVectors; ++iVec)
+              dst[iVec] = src[iVec] + dst[iVec];
+          }
+    }
+  } // namespace basis
+} // namespace dftfe
diff --git a/utils/FEBasisOperationsKernelsDevice.cc b/utils/FEBasisOperationsKernelsDevice.cc
new file mode 100644
index 000000000..2ad2a4706
--- /dev/null
+++ b/utils/FEBasisOperationsKernelsDevice.cc
@@ -0,0 +1,110 @@
+// ---------------------------------------------------------------------
+//
+// Copyright (c) 2017-2022  The Regents of the University of Michigan and DFT-FE
+// authors.
+//
+// This file is part of the DFT-FE code.
+//
+// The DFT-FE code is free software; you can use it, redistribute
+// it, and/or modify it under the terms of the GNU Lesser General
+// Public License as published by the Free Software Foundation; either
+// version 2.1 of the License, or (at your option) any later version.
+// The full text of the license can be found in the file LICENSE at
+// the top level of the DFT-FE distribution.
+//
+// ---------------------------------------------------------------------
+//
+
+#include <FEBasisOperationsKernelsDevice.h>
+#include <DeviceAPICalls.h>
+#include <DeviceTypeConfig.h>
+#include <DeviceKernelLauncherConstants.h>
+#include <DeviceDataTypeOverloads.h>
+
+
+namespace dftfe
+{
+  namespace
+  {
+    // Copies 3 * numQuads * numCells * numVecs entries, reading the source in
+    // (cell, quad, dim, vec) order and writing the destination in
+    // (cell, dim, quad, vec) order, with vec the fastest index in both.
+    template <typename ValueType1, typename ValueType2>
+    __global__ void
+    reshapeNonAffineCaseDeviceKernel(const dftfe::size_type numVecs,
+                                     const dftfe::size_type numQuads,
+                                     const dftfe::size_type numCells,
+                                     const ValueType1 *     copyFromVec,
+                                     ValueType2 *           copyToVec)
+    {
+      const dftfe::size_type firstIndex = blockIdx.x * blockDim.x + threadIdx.x;
+      const dftfe::size_type totalEntries = numQuads * numCells * numVecs * 3;
+      for (dftfe::size_type dstIndex = firstIndex; dstIndex < totalEntries;
+           dstIndex += blockDim.x * gridDim.x)
+        {
+          const dftfe::size_type iVec      = dstIndex % numVecs;
+          const dftfe::size_type vecBlock  = dstIndex / numVecs;
+          const dftfe::size_type iQuad     = vecBlock % numQuads;
+          const dftfe::size_type quadBlock = vecBlock / numQuads;
+          const dftfe::size_type iDim      = quadBlock % 3;
+          const dftfe::size_type iCell     = quadBlock / 3;
+          const dftfe::size_type srcIndex =
+            iVec + numVecs * (iDim + 3 * (iQuad + numQuads * iCell));
+          dftfe::utils::copyValue(copyToVec + dstIndex, copyFromVec[srcIndex]);
+        }
+    }
+  } // namespace
+  namespace basis
+  {
+    namespace FEBasisOperationsKernelsDevice
+    {
+      // Launches reshapeNonAffineCaseDeviceKernel with DEVICE_BLOCK_SIZE threads
+      // per block and (entry count / DEVICE_BLOCK_SIZE + 1) blocks.
+      template <typename ValueType1, typename ValueType2>
+      void
+      reshapeNonAffineCase(const dftfe::size_type numVecs,
+                           const dftfe::size_type numQuads,
+                           const dftfe::size_type numCells,
+                           const ValueType1 *     copyFromVec,
+                           ValueType2 *           copyToVec)
+      {
+        const auto numEntries = numVecs * numCells * numQuads * 3;
+        const auto numBlocks = numEntries / dftfe::utils::DEVICE_BLOCK_SIZE + 1;
+#ifdef DFTFE_WITH_DEVICE_LANG_CUDA
+        reshapeNonAffineCaseDeviceKernel<<<numBlocks,
+                                           dftfe::utils::DEVICE_BLOCK_SIZE>>>(
+          numVecs,
+          numQuads,
+          numCells,
+          dftfe::utils::makeDataTypeDeviceCompatible(copyFromVec),
+          dftfe::utils::makeDataTypeDeviceCompatible(copyToVec));
+#elif DFTFE_WITH_DEVICE_LANG_HIP
+        hipLaunchKernelGGL(
+          reshapeNonAffineCaseDeviceKernel,
+          numBlocks,
+          dftfe::utils::DEVICE_BLOCK_SIZE,
+          0,
+          0,
+          numVecs,
+          numQuads,
+          numCells,
+          dftfe::utils::makeDataTypeDeviceCompatible(copyFromVec),
+          dftfe::utils::makeDataTypeDeviceCompatible(copyToVec));
+#endif
+      }
+      // Explicit instantiations: double and std::complex<double> (same type in/out).
+      template void
+      reshapeNonAffineCase(const dftfe::size_type numVecs,
+                           const dftfe::size_type numQuads,
+                           const dftfe::size_type numCells,
+                           const double *         copyFromVec,
+                           double *               copyToVec);
+      template void
+      reshapeNonAffineCase(const dftfe::size_type      numVecs,
+                           const dftfe::size_type      numQuads,
+                           const dftfe::size_type      numCells,
+                           const std::complex<double> *copyFromVec,
+                           std::complex<double> *      copyToVec);
+    } // namespace FEBasisOperationsKernelsDevice
+  }   // namespace basis
+} // namespace dftfe
diff --git a/utils/constraintMatrixInfo.cc b/utils/constraintMatrixInfo.cc
index 99098a218..7b3eb77a2 100644
--- a/utils/constraintMatrixInfo.cc
+++ b/utils/constraintMatrixInfo.cc
@@ -171,66 +171,6 @@ namespace dftfe
         }
     }
 
-
-    void
-    constraintMatrixInfo::precomputeMaps(
-      const std::shared_ptr<const dealii::Utilities::MPI::Partitioner>
-        &unFlattenedPartitioner,
-      const std::shared_ptr<const dealii::Utilities::MPI::Partitioner>
-        &                flattenedPartitioner,
-      const unsigned int blockSize)
-    {
-      //
-      // Get required sizes
-      //
-      const unsigned int n_ghosts  = unFlattenedPartitioner->n_ghost_indices();
-      const unsigned int localSize = unFlattenedPartitioner->local_size();
-      const unsigned int totalSize = n_ghosts + localSize;
-
-      d_localIndexMapUnflattenedToFlattened.clear();
-      d_localIndexMapUnflattenedToFlattened.resize(totalSize);
-
-      //
-      // fill the data array
-      //
-      for (unsigned int ilocalDof = 0; ilocalDof < totalSize; ++ilocalDof)
-        {
-          const dealii::types::global_dof_index globalIndex =
-            unFlattenedPartitioner->local_to_global(ilocalDof);
-          d_localIndexMapUnflattenedToFlattened[ilocalDof] =
-            flattenedPartitioner->global_to_local(globalIndex * blockSize);
-        }
-    }
-
-    void
-    constraintMatrixInfo::precomputeMaps(
-      const std::shared_ptr<
-        const utils::mpi::MPIPatternP2P<dftfe::utils::MemorySpace::HOST>>
-        &                mpiPattern,
-      const unsigned int blockSize)
-    {
-      //
-      // Get required sizes
-      //
-      const unsigned int totalSize =
-        mpiPattern->localOwnedSize() + mpiPattern->localGhostSize();
-
-      d_localIndexMapUnflattenedToFlattened.clear();
-      d_localIndexMapUnflattenedToFlattened.resize(totalSize);
-
-      //
-      // fill the data array
-      //
-      for (unsigned int ilocalDof = 0; ilocalDof < totalSize; ++ilocalDof)
-        {
-          d_localIndexMapUnflattenedToFlattened[ilocalDof] =
-            (dealii::types::global_dof_index)ilocalDof *
-            (dealii::types::global_dof_index)blockSize;
-        }
-    }
-
-
-
     //
     // set the constrained degrees of freedom to values so that constraints
     // are satisfied
@@ -273,7 +213,7 @@ namespace dftfe
                     d_inhomogenities[i]);
 
           const dealii::types::global_dof_index startingLocalDofIndexRow =
-            d_localIndexMapUnflattenedToFlattened[d_rowIdsLocal[i]];
+            d_rowIdsLocal[i] * blockSize;
 
           for (unsigned int j = 0; j < d_rowSizes[i]; ++j)
             {
@@ -284,8 +224,7 @@ namespace dftfe
 
               const dealii::types::global_dof_index
                 startingLocalDofIndexColumn =
-                  d_localIndexMapUnflattenedToFlattened
-                    [d_columnIdsLocal[count]];
+                  d_columnIdsLocal[count] * blockSize;
 
               T alpha = d_columnValues[count];
 
@@ -323,7 +262,7 @@ namespace dftfe
                     d_inhomogenities[i]);
 
           const dealii::types::global_dof_index startingLocalDofIndexRow =
-            d_localIndexMapUnflattenedToFlattened[d_rowIdsLocal[i]];
+            d_rowIdsLocal[i] * blockSize;
 
           for (unsigned int j = 0; j < d_rowSizes[i]; ++j)
             {
@@ -334,8 +273,7 @@ namespace dftfe
 
               const dealii::types::global_dof_index
                 startingLocalDofIndexColumn =
-                  d_localIndexMapUnflattenedToFlattened
-                    [d_columnIdsLocal[count]];
+                  d_columnIdsLocal[count] * blockSize;
 
               T alpha = d_columnValues[count];
 
@@ -371,13 +309,12 @@ namespace dftfe
       for (unsigned int i = 0; i < d_rowIdsLocal.size(); ++i)
         {
           const dealii::types::global_dof_index startingLocalDofIndexRow =
-            d_localIndexMapUnflattenedToFlattened[d_rowIdsLocal[i]];
+            d_rowIdsLocal[i] * blockSize;
           for (unsigned int j = 0; j < d_rowSizes[i]; ++j)
             {
               const dealii::types::global_dof_index
                 startingLocalDofIndexColumn =
-                  d_localIndexMapUnflattenedToFlattened
-                    [d_columnIdsLocal[count]];
+                  d_columnIdsLocal[count] * blockSize;
 
               T alpha = d_columnValues[count];
               callaxpy(&blockSize,
@@ -411,13 +348,12 @@ namespace dftfe
       for (unsigned int i = 0; i < d_rowIdsLocal.size(); ++i)
         {
           const dealii::types::global_dof_index startingLocalDofIndexRow =
-            d_localIndexMapUnflattenedToFlattened[d_rowIdsLocal[i]];
+            d_rowIdsLocal[i] * blockSize;
           for (unsigned int j = 0; j < d_rowSizes[i]; ++j)
             {
               const dealii::types::global_dof_index
                 startingLocalDofIndexColumn =
-                  d_localIndexMapUnflattenedToFlattened
-                    [d_columnIdsLocal[count]];
+                  d_columnIdsLocal[count] * blockSize;
 
               T alpha = d_columnValues[count];
               callaxpy(&blockSize,
@@ -448,7 +384,7 @@ namespace dftfe
       for (unsigned int i = 0; i < d_rowIdsLocal.size(); ++i)
         {
           const dealii::types::global_dof_index startingLocalDofIndexRow =
-            d_localIndexMapUnflattenedToFlattened[d_rowIdsLocal[i]];
+            d_rowIdsLocal[i] * blockSize;
 
           // set constrained nodes to zero
           std::fill(fieldVector.begin() + startingLocalDofIndexRow,
@@ -465,7 +401,7 @@ namespace dftfe
       for (unsigned int i = 0; i < d_rowIdsLocal.size(); ++i)
         {
           const dealii::types::global_dof_index startingLocalDofIndexRow =
-            d_localIndexMapUnflattenedToFlattened[d_rowIdsLocal[i]];
+            d_rowIdsLocal[i] * blockSize;
 
           // set constrained nodes to zero
           std::fill(fieldVector.data() + startingLocalDofIndexRow,
@@ -488,7 +424,6 @@ namespace dftfe
       d_columnValues.clear();
       d_inhomogenities.clear();
       d_rowSizes.clear();
-      d_localIndexMapUnflattenedToFlattened.clear();
     }
 
 
@@ -509,8 +444,13 @@ namespace dftfe
 
     template void
     constraintMatrixInfo::distribute(
-      distributedCPUMultiVec<dataTypes::number> &fieldVector,
-      const unsigned int                         blockSize) const;
+      distributedCPUMultiVec<double> &fieldVector,
+      const unsigned int              blockSize) const;
+
+    template void
+    constraintMatrixInfo::distribute(
+      distributedCPUMultiVec<std::complex<double>> &fieldVector,
+      const unsigned int                            blockSize) const;
 
     template void
     constraintMatrixInfo::distribute_slave_to_master(
diff --git a/utils/constraintMatrixInfoDevice.cc b/utils/constraintMatrixInfoDevice.cc
index e3434678c..119ef6fef 100644
--- a/utils/constraintMatrixInfoDevice.cc
+++ b/utils/constraintMatrixInfoDevice.cc
@@ -39,9 +39,7 @@ namespace dftfe
         const unsigned int *constraintRowSizesAccumulated,
         const unsigned int *constraintLocalColumnIdsAllRowsUnflattened,
         const double *      constraintColumnValuesAllRowsUnflattened,
-        const double *      inhomogenities,
-        const dealii::types::global_dof_index
-          *localIndexMapUnflattenedToFlattened)
+        const double *      inhomogenities)
       {
         const dealii::types::global_dof_index globalThreadId =
           blockIdx.x * blockDim.x + threadIdx.x;
@@ -60,7 +58,7 @@ namespace dftfe
             const unsigned int startingColumnNumber =
               constraintRowSizesAccumulated[blockIndex];
             const dealii::types::global_dof_index xVecStartingIdRow =
-              localIndexMapUnflattenedToFlattened[constrainedRowId];
+              constrainedRowId * contiguousBlockSize;
             xVec[xVecStartingIdRow + intraBlockIndex] =
               inhomogenities[blockIndex];
             for (unsigned int i = 0; i < numberColumns; ++i)
@@ -69,7 +67,7 @@ namespace dftfe
                   constraintLocalColumnIdsAllRowsUnflattened
                     [startingColumnNumber + i];
                 const dealii::types::global_dof_index xVecStartingIdColumn =
-                  localIndexMapUnflattenedToFlattened[constrainedColumnId];
+                  constrainedColumnId * contiguousBlockSize;
                 xVec[xVecStartingIdRow + intraBlockIndex] +=
                   constraintColumnValuesAllRowsUnflattened
                     [startingColumnNumber + i] *
@@ -89,9 +87,7 @@ namespace dftfe
         const unsigned int *constraintRowSizesAccumulated,
         const unsigned int *constraintLocalColumnIdsAllRowsUnflattened,
         const double *      constraintColumnValuesAllRowsUnflattened,
-        const double *      inhomogenities,
-        const dealii::types::global_dof_index
-          *localIndexMapUnflattenedToFlattened)
+        const double *      inhomogenities)
       {
         const dealii::types::global_dof_index globalThreadId =
           blockIdx.x * blockDim.x + threadIdx.x;
@@ -110,7 +106,7 @@ namespace dftfe
             const unsigned int startingColumnNumber =
               constraintRowSizesAccumulated[blockIndex];
             const dealii::types::global_dof_index xVecStartingIdRow =
-              localIndexMapUnflattenedToFlattened[constrainedRowId];
+              constrainedRowId * contiguousBlockSize;
             xVec[xVecStartingIdRow + intraBlockIndex] =
               inhomogenities[blockIndex];
             for (unsigned int i = 0; i < numberColumns; ++i)
@@ -119,7 +115,7 @@ namespace dftfe
                   constraintLocalColumnIdsAllRowsUnflattened
                     [startingColumnNumber + i];
                 const dealii::types::global_dof_index xVecStartingIdColumn =
-                  localIndexMapUnflattenedToFlattened[constrainedColumnId];
+                  constrainedColumnId * contiguousBlockSize;
                 xVec[xVecStartingIdRow + intraBlockIndex] +=
                   constraintColumnValuesAllRowsUnflattened
                     [startingColumnNumber + i] *
@@ -139,9 +135,7 @@ namespace dftfe
         const unsigned int *               constraintRowSizesAccumulated,
         const unsigned int *constraintLocalColumnIdsAllRowsUnflattened,
         const double *      constraintColumnValuesAllRowsUnflattened,
-        const double *      inhomogenities,
-        const dealii::types::global_dof_index
-          *localIndexMapUnflattenedToFlattened)
+        const double *      inhomogenities)
       {
         const dealii::types::global_dof_index globalThreadId =
           blockIdx.x * blockDim.x + threadIdx.x;
@@ -160,7 +154,7 @@ namespace dftfe
             const unsigned int startingColumnNumber =
               constraintRowSizesAccumulated[blockIndex];
             const dealii::types::global_dof_index xVecStartingIdRow =
-              localIndexMapUnflattenedToFlattened[constrainedRowId];
+              constrainedRowId * contiguousBlockSize;
             dftfe::utils::copyValue(xVec + xVecStartingIdRow + intraBlockIndex,
                                     inhomogenities[blockIndex]);
             for (unsigned int i = 0; i < numberColumns; ++i)
@@ -169,7 +163,7 @@ namespace dftfe
                   constraintLocalColumnIdsAllRowsUnflattened
                     [startingColumnNumber + i];
                 const dealii::types::global_dof_index xVecStartingIdColumn =
-                  localIndexMapUnflattenedToFlattened[constrainedColumnId];
+                  constrainedColumnId * contiguousBlockSize;
                 dftfe::utils::copyValue(
                   xVec + xVecStartingIdRow + intraBlockIndex,
                   dftfe::utils::add(
@@ -196,9 +190,7 @@ namespace dftfe
         const unsigned int *              constraintRowSizesAccumulated,
         const unsigned int *constraintLocalColumnIdsAllRowsUnflattened,
         const double *      constraintColumnValuesAllRowsUnflattened,
-        const double *      inhomogenities,
-        const dealii::types::global_dof_index
-          *localIndexMapUnflattenedToFlattened)
+        const double *      inhomogenities)
       {
         const dealii::types::global_dof_index globalThreadId =
           blockIdx.x * blockDim.x + threadIdx.x;
@@ -217,7 +209,7 @@ namespace dftfe
             const unsigned int startingColumnNumber =
               constraintRowSizesAccumulated[blockIndex];
             const dealii::types::global_dof_index xVecStartingIdRow =
-              localIndexMapUnflattenedToFlattened[constrainedRowId];
+              constrainedRowId * contiguousBlockSize;
             dftfe::utils::copyValue(xVec + xVecStartingIdRow + intraBlockIndex,
                                     inhomogenities[blockIndex]);
             for (unsigned int i = 0; i < numberColumns; ++i)
@@ -226,7 +218,7 @@ namespace dftfe
                   constraintLocalColumnIdsAllRowsUnflattened
                     [startingColumnNumber + i];
                 const dealii::types::global_dof_index xVecStartingIdColumn =
-                  localIndexMapUnflattenedToFlattened[constrainedColumnId];
+                  constrainedColumnId * contiguousBlockSize;
                 dftfe::utils::copyValue(
                   xVec + xVecStartingIdRow + intraBlockIndex,
                   dftfe::utils::add(
@@ -251,9 +243,7 @@ namespace dftfe
         const unsigned int *constraintRowSizes,
         const unsigned int *constraintRowSizesAccumulated,
         const unsigned int *constraintLocalColumnIdsAllRowsUnflattened,
-        const double *      constraintColumnValuesAllRowsUnflattened,
-        const dealii::types::global_dof_index
-          *localIndexMapUnflattenedToFlattened)
+        const double *      constraintColumnValuesAllRowsUnflattened)
       {
         const dealii::types::global_dof_index globalThreadId =
           blockIdx.x * blockDim.x + threadIdx.x;
@@ -272,14 +262,14 @@ namespace dftfe
             const unsigned int startingColumnNumber =
               constraintRowSizesAccumulated[blockIndex];
             const dealii::types::global_dof_index xVecStartingIdRow =
-              localIndexMapUnflattenedToFlattened[constrainedRowId];
+              constrainedRowId * contiguousBlockSize;
             for (unsigned int i = 0; i < numberColumns; ++i)
               {
                 const unsigned int constrainedColumnId =
                   constraintLocalColumnIdsAllRowsUnflattened
                     [startingColumnNumber + i];
                 const dealii::types::global_dof_index xVecStartingIdColumn =
-                  localIndexMapUnflattenedToFlattened[constrainedColumnId];
+                  constrainedColumnId * contiguousBlockSize;
                 atomicAdd(&(xVec[xVecStartingIdColumn + intraBlockIndex]),
                           constraintColumnValuesAllRowsUnflattened
                               [startingColumnNumber + i] *
@@ -299,9 +289,7 @@ namespace dftfe
         const unsigned int *constraintRowSizes,
         const unsigned int *constraintRowSizesAccumulated,
         const unsigned int *constraintLocalColumnIdsAllRowsUnflattened,
-        const double *      constraintColumnValuesAllRowsUnflattened,
-        const dealii::types::global_dof_index
-          *localIndexMapUnflattenedToFlattened)
+        const double *      constraintColumnValuesAllRowsUnflattened)
       {
         const dealii::types::global_dof_index globalThreadId =
           blockIdx.x * blockDim.x + threadIdx.x;
@@ -320,14 +308,14 @@ namespace dftfe
             const unsigned int startingColumnNumber =
               constraintRowSizesAccumulated[blockIndex];
             const dealii::types::global_dof_index xVecStartingIdRow =
-              localIndexMapUnflattenedToFlattened[constrainedRowId];
+              constrainedRowId * contiguousBlockSize;
             for (unsigned int i = 0; i < numberColumns; ++i)
               {
                 const unsigned int constrainedColumnId =
                   constraintLocalColumnIdsAllRowsUnflattened
                     [startingColumnNumber + i];
                 const dealii::types::global_dof_index xVecStartingIdColumn =
-                  localIndexMapUnflattenedToFlattened[constrainedColumnId];
+                  constrainedColumnId * contiguousBlockSize;
                 const float tempfloatval =
                   constraintColumnValuesAllRowsUnflattened
                     [startingColumnNumber + i] *
@@ -344,9 +332,7 @@ namespace dftfe
       setzeroKernel(const unsigned int  contiguousBlockSize,
                     double *            xVec,
                     const unsigned int *constraintLocalRowIdsUnflattened,
-                    const unsigned int  numConstraints,
-                    const dealii::types::global_dof_index
-                      *localIndexMapUnflattenedToFlattened)
+                    const unsigned int  numConstraints)
       {
         const dealii::types::global_dof_index globalThreadId =
           blockIdx.x * blockDim.x + threadIdx.x;
@@ -359,8 +345,8 @@ namespace dftfe
           {
             const unsigned int blockIndex      = index / contiguousBlockSize;
             const unsigned int intraBlockIndex = index % contiguousBlockSize;
-            xVec[localIndexMapUnflattenedToFlattened
-                   [constraintLocalRowIdsUnflattened[blockIndex]] +
+            xVec[constraintLocalRowIdsUnflattened[blockIndex] *
+                   contiguousBlockSize +
                  intraBlockIndex]              = 0;
           }
       }
@@ -369,9 +355,7 @@ namespace dftfe
       setzeroKernel(const unsigned int  contiguousBlockSize,
                     float *             xVec,
                     const unsigned int *constraintLocalRowIdsUnflattened,
-                    const unsigned int  numConstraints,
-                    const dealii::types::global_dof_index
-                      *localIndexMapUnflattenedToFlattened)
+                    const unsigned int  numConstraints)
       {
         const dealii::types::global_dof_index globalThreadId =
           blockIdx.x * blockDim.x + threadIdx.x;
@@ -384,8 +368,8 @@ namespace dftfe
           {
             const unsigned int blockIndex      = index / contiguousBlockSize;
             const unsigned int intraBlockIndex = index % contiguousBlockSize;
-            xVec[localIndexMapUnflattenedToFlattened
-                   [constraintLocalRowIdsUnflattened[blockIndex]] +
+            xVec[constraintLocalRowIdsUnflattened[blockIndex] *
+                   contiguousBlockSize +
                  intraBlockIndex]              = 0;
           }
       }
@@ -394,9 +378,7 @@ namespace dftfe
       setzeroKernel(const unsigned int                 contiguousBlockSize,
                     dftfe::utils::deviceDoubleComplex *xVec,
                     const unsigned int *constraintLocalRowIdsUnflattened,
-                    const unsigned int  numConstraints,
-                    const dealii::types::global_dof_index
-                      *localIndexMapUnflattenedToFlattened)
+                    const unsigned int  numConstraints)
       {
         const dealii::types::global_dof_index globalThreadId =
           blockIdx.x * blockDim.x + threadIdx.x;
@@ -411,8 +393,8 @@ namespace dftfe
             const unsigned int intraBlockIndex = index % contiguousBlockSize;
             dftfe::utils::copyValue(
               xVec +
-                localIndexMapUnflattenedToFlattened
-                  [constraintLocalRowIdsUnflattened[blockIndex]] +
+                constraintLocalRowIdsUnflattened[blockIndex] *
+                  contiguousBlockSize +
                 intraBlockIndex,
               0.0);
           }
@@ -423,9 +405,7 @@ namespace dftfe
       setzeroKernel(const unsigned int                contiguousBlockSize,
                     dftfe::utils::deviceFloatComplex *xVec,
                     const unsigned int *constraintLocalRowIdsUnflattened,
-                    const unsigned int  numConstraints,
-                    const dealii::types::global_dof_index
-                      *localIndexMapUnflattenedToFlattened)
+                    const unsigned int  numConstraints)
       {
         const dealii::types::global_dof_index globalThreadId =
           blockIdx.x * blockDim.x + threadIdx.x;
@@ -440,8 +420,8 @@ namespace dftfe
             const unsigned int intraBlockIndex = index % contiguousBlockSize;
             dftfe::utils::copyValue(
               xVec +
-                localIndexMapUnflattenedToFlattened
-                  [constraintLocalRowIdsUnflattened[blockIndex]] +
+                constraintLocalRowIdsUnflattened[blockIndex] *
+                  contiguousBlockSize +
                 intraBlockIndex,
               0.0);
           }
@@ -561,77 +541,6 @@ namespace dftfe
       d_numConstrainedDofs = d_rowIdsLocal.size();
     }
 
-
-    void
-    constraintMatrixInfoDevice::precomputeMaps(
-      const std::shared_ptr<const dealii::Utilities::MPI::Partitioner>
-        &unFlattenedPartitioner,
-      const std::shared_ptr<const dealii::Utilities::MPI::Partitioner>
-        &                flattenedPartitioner,
-      const unsigned int blockSize)
-    {
-      //
-      // Get required sizes
-      //
-      const unsigned int n_ghosts  = unFlattenedPartitioner->n_ghost_indices();
-      const unsigned int localSize = unFlattenedPartitioner->local_size();
-      const unsigned int totalSize = n_ghosts + localSize;
-
-      d_localIndexMapUnflattenedToFlattened.clear();
-      d_localIndexMapUnflattenedToFlattened.resize(totalSize);
-
-      //
-      // fill the data array
-      //
-      for (unsigned int ilocalDof = 0; ilocalDof < totalSize; ++ilocalDof)
-        {
-          const dealii::types::global_dof_index globalIndex =
-            unFlattenedPartitioner->local_to_global(ilocalDof);
-          d_localIndexMapUnflattenedToFlattened[ilocalDof] =
-            flattenedPartitioner->global_to_local(globalIndex * blockSize);
-        }
-
-      d_localIndexMapUnflattenedToFlattenedDevice.resize(
-        d_localIndexMapUnflattenedToFlattened.size());
-      d_localIndexMapUnflattenedToFlattenedDevice.copyFrom(
-        d_localIndexMapUnflattenedToFlattened);
-    }
-
-    void
-    constraintMatrixInfoDevice::precomputeMaps(
-      const std::shared_ptr<
-        const utils::mpi::MPIPatternP2P<dftfe::utils::MemorySpace::HOST>>
-        &                mpiPattern,
-      const unsigned int blockSize)
-    {
-      //
-      // Get required sizes
-      //
-      const unsigned int totalSize =
-        mpiPattern->localOwnedSize() + mpiPattern->localGhostSize();
-
-      d_localIndexMapUnflattenedToFlattened.clear();
-      d_localIndexMapUnflattenedToFlattened.resize(totalSize);
-
-      //
-      // fill the data array
-      //
-      for (unsigned int ilocalDof = 0; ilocalDof < totalSize; ++ilocalDof)
-        {
-          // const dealii::types::global_dof_index globalIndex =
-          //   unFlattenedPartitioner->local_to_global(ilocalDof);
-          d_localIndexMapUnflattenedToFlattened[ilocalDof] =
-            ilocalDof * blockSize;
-          // flattenedPartitioner->globalToLocal(globalIndex * blockSize);
-        }
-
-      d_localIndexMapUnflattenedToFlattenedDevice.resize(
-        d_localIndexMapUnflattenedToFlattened.size());
-      d_localIndexMapUnflattenedToFlattenedDevice.copyFrom(
-        d_localIndexMapUnflattenedToFlattened);
-    }
-
-
     template <typename NumberType>
     void
     constraintMatrixInfoDevice::distribute(
@@ -656,8 +565,7 @@ namespace dftfe
         d_rowSizesAccumulatedDevice.begin(),
         d_columnIdsLocalDevice.begin(),
         d_columnValuesDevice.begin(),
-        d_inhomogenitiesDevice.begin(),
-        d_localIndexMapUnflattenedToFlattenedDevice.begin());
+        d_inhomogenitiesDevice.begin());
 #elif DFTFE_WITH_DEVICE_LANG_HIP
       hipLaunchKernelGGL(
         distributeKernel,
@@ -675,8 +583,7 @@ namespace dftfe
         d_rowSizesAccumulatedDevice.begin(),
         d_columnIdsLocalDevice.begin(),
         d_columnValuesDevice.begin(),
-        d_inhomogenitiesDevice.begin(),
-        d_localIndexMapUnflattenedToFlattenedDevice.begin());
+        d_inhomogenitiesDevice.begin());
 #endif
     }
 
@@ -707,8 +614,7 @@ namespace dftfe
         d_rowSizesDevice.begin(),
         d_rowSizesAccumulatedDevice.begin(),
         d_columnIdsLocalDevice.begin(),
-        d_columnValuesDevice.begin(),
-        d_localIndexMapUnflattenedToFlattenedDevice.begin());
+        d_columnValuesDevice.begin());
 #elif DFTFE_WITH_DEVICE_LANG_HIP
       hipLaunchKernelGGL(
         distributeSlaveToMasterKernelAtomicAdd,
@@ -725,8 +631,7 @@ namespace dftfe
         d_rowSizesDevice.begin(),
         d_rowSizesAccumulatedDevice.begin(),
         d_columnIdsLocalDevice.begin(),
-        d_columnValuesDevice.begin(),
-        d_localIndexMapUnflattenedToFlattenedDevice.begin());
+        d_columnValuesDevice.begin());
 #endif
     }
 
@@ -755,31 +660,27 @@ namespace dftfe
         min((blockSize + (dftfe::utils::DEVICE_BLOCK_SIZE - 1)) /
               dftfe::utils::DEVICE_BLOCK_SIZE * d_numConstrainedDofs,
             30000),
-        dftfe::utils::DEVICE_BLOCK_SIZE>>>(
-        blockSize,
-        tempReal,
-        d_rowIdsLocalDevice.begin(),
-        d_numConstrainedDofs,
-        d_rowSizesDevice.begin(),
-        d_rowSizesAccumulatedDevice.begin(),
-        d_columnIdsLocalDevice.begin(),
-        d_columnValuesDevice.begin(),
-        d_localIndexMapUnflattenedToFlattenedDevice.begin());
+        dftfe::utils::DEVICE_BLOCK_SIZE>>>(blockSize,
+                                           tempReal,
+                                           d_rowIdsLocalDevice.begin(),
+                                           d_numConstrainedDofs,
+                                           d_rowSizesDevice.begin(),
+                                           d_rowSizesAccumulatedDevice.begin(),
+                                           d_columnIdsLocalDevice.begin(),
+                                           d_columnValuesDevice.begin());
 
       distributeSlaveToMasterKernelAtomicAdd<<<
         min((blockSize + (dftfe::utils::DEVICE_BLOCK_SIZE - 1)) /
               dftfe::utils::DEVICE_BLOCK_SIZE * d_numConstrainedDofs,
             30000),
-        dftfe::utils::DEVICE_BLOCK_SIZE>>>(
-        blockSize,
-        tempImag,
-        d_rowIdsLocalDevice.begin(),
-        d_numConstrainedDofs,
-        d_rowSizesDevice.begin(),
-        d_rowSizesAccumulatedDevice.begin(),
-        d_columnIdsLocalDevice.begin(),
-        d_columnValuesDevice.begin(),
-        d_localIndexMapUnflattenedToFlattenedDevice.begin());
+        dftfe::utils::DEVICE_BLOCK_SIZE>>>(blockSize,
+                                           tempImag,
+                                           d_rowIdsLocalDevice.begin(),
+                                           d_numConstrainedDofs,
+                                           d_rowSizesDevice.begin(),
+                                           d_rowSizesAccumulatedDevice.begin(),
+                                           d_columnIdsLocalDevice.begin(),
+                                           d_columnValuesDevice.begin());
 #elif DFTFE_WITH_DEVICE_LANG_HIP
       hipLaunchKernelGGL(
         distributeSlaveToMasterKernelAtomicAdd,
@@ -796,8 +697,7 @@ namespace dftfe
         d_rowSizesDevice.begin(),
         d_rowSizesAccumulatedDevice.begin(),
         d_columnIdsLocalDevice.begin(),
-        d_columnValuesDevice.begin(),
-        d_localIndexMapUnflattenedToFlattenedDevice.begin());
+        d_columnValuesDevice.begin());
 
       hipLaunchKernelGGL(
         distributeSlaveToMasterKernelAtomicAdd,
@@ -814,8 +714,7 @@ namespace dftfe
         d_rowSizesDevice.begin(),
         d_rowSizesAccumulatedDevice.begin(),
         d_columnIdsLocalDevice.begin(),
-        d_columnValuesDevice.begin(),
-        d_localIndexMapUnflattenedToFlattenedDevice.begin());
+        d_columnValuesDevice.begin());
 #endif
 
       dftfe::utils::deviceKernelsGeneric::copyRealArrsToComplexArrDevice(
@@ -850,31 +749,27 @@ namespace dftfe
         min((blockSize + (dftfe::utils::DEVICE_BLOCK_SIZE - 1)) /
               dftfe::utils::DEVICE_BLOCK_SIZE * d_numConstrainedDofs,
             30000),
-        dftfe::utils::DEVICE_BLOCK_SIZE>>>(
-        blockSize,
-        tempReal,
-        d_rowIdsLocalDevice.begin(),
-        d_numConstrainedDofs,
-        d_rowSizesDevice.begin(),
-        d_rowSizesAccumulatedDevice.begin(),
-        d_columnIdsLocalDevice.begin(),
-        d_columnValuesDevice.begin(),
-        d_localIndexMapUnflattenedToFlattenedDevice.begin());
+        dftfe::utils::DEVICE_BLOCK_SIZE>>>(blockSize,
+                                           tempReal,
+                                           d_rowIdsLocalDevice.begin(),
+                                           d_numConstrainedDofs,
+                                           d_rowSizesDevice.begin(),
+                                           d_rowSizesAccumulatedDevice.begin(),
+                                           d_columnIdsLocalDevice.begin(),
+                                           d_columnValuesDevice.begin());
 
       distributeSlaveToMasterKernelAtomicAdd<<<
         min((blockSize + (dftfe::utils::DEVICE_BLOCK_SIZE - 1)) /
               dftfe::utils::DEVICE_BLOCK_SIZE * d_numConstrainedDofs,
             30000),
-        dftfe::utils::DEVICE_BLOCK_SIZE>>>(
-        blockSize,
-        tempImag,
-        d_rowIdsLocalDevice.begin(),
-        d_numConstrainedDofs,
-        d_rowSizesDevice.begin(),
-        d_rowSizesAccumulatedDevice.begin(),
-        d_columnIdsLocalDevice.begin(),
-        d_columnValuesDevice.begin(),
-        d_localIndexMapUnflattenedToFlattenedDevice.begin());
+        dftfe::utils::DEVICE_BLOCK_SIZE>>>(blockSize,
+                                           tempImag,
+                                           d_rowIdsLocalDevice.begin(),
+                                           d_numConstrainedDofs,
+                                           d_rowSizesDevice.begin(),
+                                           d_rowSizesAccumulatedDevice.begin(),
+                                           d_columnIdsLocalDevice.begin(),
+                                           d_columnValuesDevice.begin());
 #elif DFTFE_WITH_DEVICE_LANG_HIP
       hipLaunchKernelGGL(
         distributeSlaveToMasterKernelAtomicAdd,
@@ -891,8 +786,7 @@ namespace dftfe
         d_rowSizesDevice.begin(),
         d_rowSizesAccumulatedDevice.begin(),
         d_columnIdsLocalDevice.begin(),
-        d_columnValuesDevice.begin(),
-        d_localIndexMapUnflattenedToFlattenedDevice.begin());
+        d_columnValuesDevice.begin());
 
       hipLaunchKernelGGL(
         distributeSlaveToMasterKernelAtomicAdd,
@@ -909,8 +803,7 @@ namespace dftfe
         d_rowSizesDevice.begin(),
         d_rowSizesAccumulatedDevice.begin(),
         d_columnIdsLocalDevice.begin(),
-        d_columnValuesDevice.begin(),
-        d_localIndexMapUnflattenedToFlattenedDevice.begin());
+        d_columnValuesDevice.begin());
 #endif
 
       dftfe::utils::deviceKernelsGeneric::copyRealArrsToComplexArrDevice(
@@ -940,8 +833,7 @@ namespace dftfe
         blockSize,
         dftfe::utils::makeDataTypeDeviceCompatible(fieldVector.begin()),
         d_rowIdsLocalDevice.begin(),
-        numConstrainedDofs,
-        d_localIndexMapUnflattenedToFlattenedDevice.begin());
+        numConstrainedDofs);
 #elif DFTFE_WITH_DEVICE_LANG_HIP
       hipLaunchKernelGGL(
         setzeroKernel,
@@ -954,8 +846,7 @@ namespace dftfe
         blockSize,
         dftfe::utils::makeDataTypeDeviceCompatible(fieldVector.begin()),
         d_rowIdsLocalDevice.begin(),
-        numConstrainedDofs,
-        d_localIndexMapUnflattenedToFlattenedDevice.begin());
+        numConstrainedDofs);
 #endif
     }
 
diff --git a/utils/vectorTools/vectorUtilities.cc b/utils/vectorTools/vectorUtilities.cc
index 508d87ee8..68a4a448d 100644
--- a/utils/vectorTools/vectorUtilities.cc
+++ b/utils/vectorTools/vectorUtilities.cc
@@ -682,8 +682,9 @@ namespace dftfe
 #ifdef USE_COMPLEX
     void
     copyFlattenedSTLVecToSingleCompVec(
-      const std::vector<std::complex<double>> &   flattenedArray,
+      const std::complex<double> *                flattenedArray,
       const unsigned int                          totalNumberComponents,
+      const unsigned int                          localVectorSize,
       const std::pair<unsigned int, unsigned int> componentIndexRange,
       const std::vector<dealii::types::global_dof_index>
         &localProcDofIndicesReal,
@@ -699,8 +700,6 @@ namespace dftfe
              dealii::ExcMessage(
                "componentIndexRange doesn't lie within totalNumberComponents"));
 
-      const unsigned int localVectorSize =
-        flattenedArray.size() / totalNumberComponents;
       for (unsigned int iNode = 0; iNode < localVectorSize; ++iNode)
         for (unsigned int icomp = componentIndexRange.first;
              icomp < componentIndexRange.second;
@@ -720,8 +719,9 @@ namespace dftfe
 
     void
     copyFlattenedSTLVecToSingleCompVec(
-      const std::vector<std::complex<double>> &   flattenedArray,
+      const std::complex<double> *                flattenedArray,
       const unsigned int                          totalNumberComponents,
+      const unsigned int                          localVectorSize,
       const std::pair<unsigned int, unsigned int> componentIndexRange,
       std::vector<distributedCPUVec<double>> &    componentVectors)
     {
@@ -733,8 +733,6 @@ namespace dftfe
              dealii::ExcMessage(
                "componentIndexRange doesn't lie within totalNumberComponents"));
 
-      const unsigned int localVectorSize =
-        flattenedArray.size() / totalNumberComponents;
       for (unsigned int iNode = 0; iNode < localVectorSize; ++iNode)
         for (unsigned int icomp = componentIndexRange.first;
              icomp < componentIndexRange.second;
@@ -750,8 +748,9 @@ namespace dftfe
 #else
     void
     copyFlattenedSTLVecToSingleCompVec(
-      const std::vector<double> &flattenedArray,
+      const double *flattenedArray,
       const unsigned int totalNumberComponents,
+      const unsigned int localVectorSize,
       const std::pair<unsigned int, unsigned int> componentIndexRange,
       std::vector<distributedCPUVec<double>> &componentVectors)
     {
@@ -762,8 +761,6 @@ namespace dftfe
                componentIndexRange.second <= totalNumberComponents,
              dealii::ExcMessage(
                "componentIndexRange doesn't lie within totalNumberComponents"));
-      const unsigned int localVectorSize =
-        flattenedArray.size() / totalNumberComponents;
       for (unsigned int iNode = 0; iNode < localVectorSize; ++iNode)
         for (unsigned int icomp = componentIndexRange.first;
              icomp < componentIndexRange.second;