Skip to content

Commit

Permalink
Further GPU code deduplication. Use faster math functions for simple …
Browse files Browse the repository at this point in the history
…power functions (replacing inlined device functions that call pow with preprocessor definitions that use multiplication operations). Other code clean-up.
  • Loading branch information
ohearnk committed Jun 26, 2024
1 parent 0bf89e1 commit f937da6
Show file tree
Hide file tree
Showing 18 changed files with 19,601 additions and 20,286 deletions.
4,114 changes: 2,020 additions & 2,094 deletions src/gpu/cuda/gpu.cu

Large diffs are not rendered by default.

3 changes: 0 additions & 3 deletions src/gpu/cuda/gpu.h
Original file line number Diff line number Diff line change
Expand Up @@ -440,9 +440,6 @@ __device__ __forceinline__ void hrrwholegrad2_2(QUICKDouble* Yaax, QUICKDouble*
QUICKDouble RCx,QUICKDouble RCy,QUICKDouble RCz, \
QUICKDouble RDx,QUICKDouble RDy,QUICKDouble RDz);


/* Device-side helper that returns the square of a (a * a). */
__device__ __forceinline__ QUICKDouble quick_dsqr(QUICKDouble a);

__device__ void vertical(int I, int J, int K, int L, QUICKDouble* YVerticalTemp, QUICKDouble* store, \
QUICKDouble Ptempx, QUICKDouble Ptempy, QUICKDouble Ptempz, \
QUICKDouble WPtempx,QUICKDouble WPtempy,QUICKDouble WPtempz, \
Expand Down
10,923 changes: 5,453 additions & 5,470 deletions src/gpu/cuda/gpu_MP2.cu

Large diffs are not rendered by default.

13 changes: 2 additions & 11 deletions src/gpu/cuda/gpu_get2e_grad_ffff.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -35,15 +35,6 @@


#ifndef OSHELL
/*
 * Square of a double-precision value: returns a * a.
 * The GPU fast-math library has no intrinsic for this operation, so it is
 * written out as a plain multiplication.
 */
__device__ __forceinline__ QUICKDouble quick_dsqr(const QUICKDouble a)
{
    const QUICKDouble squared = a * a;
    return squared;
}


#define FMT_NAME FmT
#include "../gpu_fmt.h"

Expand Down Expand Up @@ -1160,10 +1151,10 @@ const smem_dbl_ptr, unsigned char** const smem_char_ptr, unsigned char* const sm
QUICKDouble Qy = LOC2(DEV_SIM_DBL_PTR_WEIGHTEDCENTERY, kk_start+KKK, ll_start+LLL, DEV_SIM_INT_PRIM_TOTAL, DEV_SIM_INT_PRIM_TOTAL);
QUICKDouble Qz = LOC2(DEV_SIM_DBL_PTR_WEIGHTEDCENTERZ, kk_start+KKK, ll_start+LLL, DEV_SIM_INT_PRIM_TOTAL, DEV_SIM_INT_PRIM_TOTAL);

//QUICKDouble T = AB * CD * ABCD * ( quick_dsqr(Px-Qx) + quick_dsqr(Py-Qy) + quick_dsqr(Pz-Qz));
//QUICKDouble T = AB * CD * ABCD * (SQR(Px - Qx) + SQR(Py - Qy) + SQR(Pz - Qz));

//QUICKDouble YVerticalTemp[VDIM1*VDIM2*VDIM3];
FmT(I+J+K+L+2, AB * CD * ABCD * ( quick_dsqr(Px-Qx) + quick_dsqr(Py-Qy) + quick_dsqr(Pz-Qz)), YVerticalTemp);
FmT(I + J + K + L + 2, AB * CD * ABCD * (SQR(Px - Qx) + SQR(Py - Qy) + SQR(Pz - Qz)), YVerticalTemp);

for (int i = 0; i<=I+J+K+L+2; i++) {
VY(0, 0, i) = VY(0, 0, i) * X2;
Expand Down
Loading

0 comments on commit f937da6

Please sign in to comment.