Add support for Nvidia Grace (RRZE-HPC#585)

* Add support for Nvidia Grace * Add support for SCF unit * Add remaining units (cnvlink, 2x nvlink, pcie) * Add comment about GCCARM compiler setting to config.mk * Update MEM.txt to use CMEM_WR_TOTAL_BYTES NVIDIA recommends to use CMEM_WR_TOTAL_BYTES instead of CMEM_WR_DATA to measure memory write data. * fixed typos in group description * added combined MEM/FLOPS groups * Fix for uncommon Uncore device mapping in perf_event --------- Co-authored-by: Jan <[email protected]> Co-authored-by: JanLJL <[email protected]>
breiters · Sep 10, 2024 · 77b7a4a · 77b7a4a
1 parent cec1a2f
commit 77b7a4a
Show file tree

Hide file tree

Showing 22 changed files with 1,964 additions and 2 deletions.
diff --git a/Makefile b/Makefile
@@ -100,6 +100,18 @@ OBJ := $(filter-out $(BUILD_DIR)/access_x86_translate.o,$(OBJ))
 else
 OBJ := $(filter-out $(BUILD_DIR)/loadDataARM.o,$(OBJ))
 endif
+ifeq ($(COMPILER), GCCARM)
+OBJ := $(filter-out $(BUILD_DIR)/topology_cpuid.o,$(OBJ))
+OBJ := $(filter-out $(BUILD_DIR)/loadData.o,$(OBJ))
+OBJ := $(filter-out $(BUILD_DIR)/access_x86.o,$(OBJ))
+OBJ := $(filter-out $(BUILD_DIR)/access_x86_msr.o,$(OBJ))
+OBJ := $(filter-out $(BUILD_DIR)/access_x86_pci.o,$(OBJ))
+OBJ := $(filter-out $(BUILD_DIR)/access_x86_rdpmc.o,$(OBJ))
+OBJ := $(filter-out $(BUILD_DIR)/access_x86_clientmem.o,$(OBJ))
+OBJ := $(filter-out $(BUILD_DIR)/access_x86_translate.o,$(OBJ))
+else
+OBJ := $(filter-out $(BUILD_DIR)/loadDataARM.o,$(OBJ))
+endif
 ifeq ($(COMPILER), FCC)
 OBJ := $(filter-out $(BUILD_DIR)/topology_cpuid.o,$(OBJ))
 OBJ := $(filter-out $(BUILD_DIR)/loadData.o,$(OBJ))

diff --git a/bench/Makefile b/bench/Makefile
@@ -63,6 +63,9 @@ endif
 ifeq ($(COMPILER),GCCARMv8)
 BENCH_DIR   = ./armv8
 endif
+ifeq ($(COMPILER),GCCARM)
+BENCH_DIR   = ./armv8
+endif
 ifeq ($(COMPILER),ARMCLANG)
 BENCH_DIR   = ./armv8
 endif

diff --git a/config.mk b/config.mk
@@ -8,6 +8,7 @@
 # configuration options setup steps.
 # Supported: GCC, CLANG, ICC, MIC (ICC), GCCX86 (for 32bit systems)
 # GCCARMv8, GCCARMv7 and GCCPOWER
+# Since 5.3, there is a generic GCCARM target
 COMPILER = GCC#NO SPACE
 
 # Absolute path were to install likwid

diff --git a/ext/hwloc/Makefile b/ext/hwloc/Makefile
@@ -39,6 +39,9 @@ endif
 ifeq ($(strip $(COMPILER)), GCCARMv8)
 OBJ := $(filter-out $(BUILD_DIR)/topology-x86.o, $(OBJ))
 endif
+ifeq ($(strip $(COMPILER)), GCCARM)
+OBJ := $(filter-out $(BUILD_DIR)/topology-x86.o, $(OBJ))
+endif
 ifeq ($(strip $(COMPILER)), ARMCLANG)
 OBJ := $(filter-out $(BUILD_DIR)/topology-x86.o, $(OBJ))
 endif

diff --git a/groups/nvidia_grace/BRANCH.txt b/groups/nvidia_grace/BRANCH.txt
@@ -0,0 +1,30 @@
+SHORT Branch prediction miss rate/ratio
+
+EVENTSET
+PMC0  INST_RETIRED
+PMC1  CPU_CYCLES
+PMC2  BR_RETIRED
+PMC3  BR_MIS_PRED_RETIRED
+
+
+METRICS
+Runtime (RDTSC) [s] time
+CPI  PMC1/PMC0
+Branch rate   PMC2/PMC0
+Branch misprediction rate  PMC3/PMC0
+Branch misprediction ratio  PMC3/PMC2
+Instructions per branch  PMC0/PMC2
+
+LONG
+Formulas:
+CPI = CPU_CYCLES/INST_RETIRED
+Branch rate = BR_RETIRED/INST_RETIRED
+Branch misprediction rate =  BR_MIS_PRED_RETIRED/INST_RETIRED
+Branch misprediction ratio = BR_MIS_PRED_RETIRED/BR_RETIRED
+Instructions per branch = INST_RETIRED/BR_RETIRED
+-
+The rates state how often on average a branch or a mispredicted branch occurred
+per instruction retired in total. The Branch misprediction ratio states directly
+what fraction of all branch instructions were mispredicted.
+Instructions per branch is 1/Branch rate.
+
diff --git a/groups/nvidia_grace/DATA.txt b/groups/nvidia_grace/DATA.txt
@@ -0,0 +1,24 @@
+SHORT Load to store ratio
+
+EVENTSET
+PMC0  INST_SPEC
+PMC1  CPU_CYCLES
+PMC2  LD_SPEC
+PMC3  ST_SPEC
+
+METRICS
+Runtime (RDTSC) [s] time
+CPI  PMC1/PMC0
+Load to store ratio PMC2/PMC3
+Load ratio PMC2/PMC0
+Store ratio PMC3/PMC0
+
+LONG
+Formulas:
+CPI = CPU_CYCLES/INST_SPEC
+Load to store ratio = LD_SPEC / ST_SPEC
+Load ratio = LD_SPEC / INST_SPEC
+Store ratio = ST_SPEC / INST_SPEC
+-
+This is a metric to determine your load to store ratio.
+
diff --git a/groups/nvidia_grace/FLOPS.txt b/groups/nvidia_grace/FLOPS.txt
@@ -0,0 +1,19 @@
+SHORT MFLOP/s
+
+EVENTSET
+PMC0  INST_RETIRED
+PMC1  CPU_CYCLES
+PMC3  FP_FIXED_OPS_SPEC
+PMC4  FP_SCALE_OPS_SPEC
+
+METRICS
+Runtime (RDTSC) [s] time
+Clock [MHz] 1.E-06*PMC1/time
+CPI  PMC1/PMC0
+FP rate [MFLOP/s] 1E-06*(PMC3+PMC4)/time
+
+LONG
+Formulas:
+FP rate [MFLOP/s] = 1E-06*(FP_FIXED_OPS_SPEC+FP_SCALE_OPS_SPEC)/time
+-
+FP rate in any precision for scalar/NEON and SVE vector operations.
diff --git a/groups/nvidia_grace/L2.txt b/groups/nvidia_grace/L2.txt
@@ -0,0 +1,40 @@
+SHORT  L2 cache bandwidth in MBytes/s
+
+EVENTSET
+PMC0  INST_RETIRED
+PMC1  CPU_CYCLES
+PMC2  L1D_CACHE_REFILL
+PMC3  L1D_CACHE_WB
+PMC4  L1I_CACHE_REFILL
+
+
+METRICS
+Runtime (RDTSC) [s] time
+CPI  PMC1/PMC0
+L1D<-L2 load bandwidth [MBytes/s]  1.0E-06*(PMC2)*256.0/time
+L1D<-L2 load data volume [GBytes]  1.0E-09*(PMC2)*256.0
+L1D->L2 evict bandwidth [MBytes/s]  1.0E-06*PMC3*256.0/time
+L1D->L2 evict data volume [GBytes]  1.0E-09*PMC3*256.0
+L1I<-L2 load bandwidth [MBytes/s]  1.0E-06*PMC4*256.0/time
+L1I<-L2 load data volume [GBytes]  1.0E-09*PMC4*256.0
+L1<->L2 bandwidth [MBytes/s] 1.0E-06*(PMC2+PMC3+PMC4)*256.0/time
+L1<->L2 data volume [GBytes] 1.0E-09*(PMC2+PMC3+PMC4)*256.0
+
+LONG
+Formulas:
+CPI = CPU_CYCLES/INST_RETIRED
+L1D<-L2 load bandwidth [MBytes/s] = 1.0E-06*L1D_CACHE_REFILL*256.0/time
+L1D<-L2 load data volume [GBytes] = 1.0E-09*L1D_CACHE_REFILL*256.0
+L1D->L2 evict bandwidth [MBytes/s] = 1.0E-06*L1D_CACHE_WB*256.0/time
+L1D->L2 evict data volume [GBytes] = 1.0E-09*L1D_CACHE_WB*256.0
+L1I<-L2 load bandwidth [MBytes/s] = 1.0E-06*L1I_CACHE_REFILL*256.0/time
+L1I<-L2 load data volume [GBytes] = 1.0E-09*L1I_CACHE_REFILL*256.0
+L1<->L2 bandwidth [MBytes/s] = 1.0E-06*(L1D_CACHE_REFILL+L1D_CACHE_WB+L1I_CACHE_REFILL)*256.0/time
+L1<->L2 data volume [GBytes] = 1.0E-09*(L1D_CACHE_REFILL+L1D_CACHE_WB+L1I_CACHE_REFILL)*256.0
+-
+Profiling group to measure L2 cache bandwidth. The bandwidth is computed by the
+number of cache lines loaded from the L2 to the L1 data cache and the writebacks from
+the L1 data cache to the L2 cache. The group also outputs total data volume transferred between
+L2 and L1. Note that this bandwidth also includes data transfers due to a write
+allocate load on a store miss in L1 and cache lines transferred into the L1 instruction
+cache.
diff --git a/groups/nvidia_grace/L3.txt b/groups/nvidia_grace/L3.txt
@@ -0,0 +1,34 @@
+SHORT  L3 cache bandwidth in MBytes/s
+
+EVENTSET
+PMC0  INST_RETIRED
+PMC1  CPU_CYCLES
+PMC2  L2D_CACHE_REFILL
+PMC3  L2D_CACHE_WB
+
+
+METRICS
+Runtime (RDTSC) [s] time
+CPI  PMC1/PMC0
+L2D<-L3 load bandwidth [MBytes/s]  1.0E-06*(PMC2)*256.0/time
+L2D<-L3 load data volume [GBytes]  1.0E-09*(PMC2)*256.0
+L2D->L3 evict bandwidth [MBytes/s]  1.0E-06*PMC3*256.0/time
+L2D->L3 evict data volume [GBytes]  1.0E-09*PMC3*256.0
+L2<->L3 bandwidth [MBytes/s] 1.0E-06*(PMC2+PMC3)*256.0/time
+L2<->L3 data volume [GBytes] 1.0E-09*(PMC2+PMC3)*256.0
+
+LONG
+Formulas:
+CPI = CPU_CYCLES/INST_RETIRED
+L2D<-L3 load bandwidth [MBytes/s] = 1.0E-06*L2D_CACHE_REFILL*256.0/time
+L2D<-L3 load data volume [GBytes] = 1.0E-09*L2D_CACHE_REFILL*256.0
+L2D->L3 evict bandwidth [MBytes/s] = 1.0E-06*L2D_CACHE_WB*256.0/time
+L2D->L3 evict data volume [GBytes] = 1.0E-09*L2D_CACHE_WB*256.0
+L2<->L3 bandwidth [MBytes/s] = 1.0E-06*(L2D_CACHE_REFILL+L2D_CACHE_WB)*256.0/time
+L2<->L3 data volume [GBytes] = 1.0E-09*(L2D_CACHE_REFILL+L2D_CACHE_WB)*256.0
+-
+Profiling group to measure L3 cache bandwidth. The bandwidth is computed by the
+number of cache lines loaded from the L3 to the L2 data cache and the writebacks from
+the L2 data cache to the L3 cache. The group also outputs total data volume transferred between
+L3 and L2. Note that this bandwidth also includes data transfers due to a write
+allocate load on a store miss in L2.
diff --git a/groups/nvidia_grace/MEM.txt b/groups/nvidia_grace/MEM.txt
@@ -0,0 +1,30 @@
+SHORT Main memory bandwidth in MBytes/s
+
+EVENTSET
+PMC0  INST_RETIRED
+PMC1  CPU_CYCLES
+SCF0  CMEM_RD_DATA
+SCF1  CMEM_WR_TOTAL_BYTES
+
+METRICS
+Runtime (RDTSC) [s] time
+Clock [MHz] 1.E-06*PMC1/time
+CPI  PMC1/PMC0
+Memory read bandwidth [MBytes/s] 1.0E-06*(SCF0)*32.0/time
+Memory read data volume [GBytes] 1.0E-09*(SCF0)*32.0
+Memory write bandwidth [MBytes/s] 1.0E-06*(SCF1)/time
+Memory write data volume [GBytes] 1.0E-09*(SCF1)
+Memory bandwidth [MBytes/s] 1.0E-06*((SCF0*32.0)+SCF1)/time
+Memory data volume [GBytes] 1.0E-09*((SCF0*32.0)+SCF1)
+
+LONG
+Formulas:
+Memory read bandwidth [MBytes/s] = 1.0E-06*CMEM_RD_DATA*32.0/runtime
+Memory read data volume [GBytes] = 1.0E-09*CMEM_RD_DATA*32.0
+Memory write bandwidth [MBytes/s] = 1.0E-06*CMEM_WR_TOTAL_BYTES/runtime
+Memory write data volume [GBytes] = 1.0E-09*CMEM_WR_TOTAL_BYTES
+Memory bandwidth [MBytes/s] = 1.0E-06*((CMEM_RD_DATA*32.0)+CMEM_WR_TOTAL_BYTES)/runtime
+Memory data volume [GBytes] = 1.0E-09*((CMEM_RD_DATA*32.0)+CMEM_WR_TOTAL_BYTES)
+-
+Profiling group to measure memory bandwidth for CPU memory. CMEM_RD_DATA counts
+in 'beats' of 32 Bytes each, while CMEM_WR_TOTAL_BYTES already counts bytes.
diff --git a/groups/nvidia_grace/MEM_DP.txt b/groups/nvidia_grace/MEM_DP.txt
@@ -0,0 +1,49 @@
+SHORT Main memory bandwidth and FLOPs
+
+EVENTSET
+PMC0  INST_RETIRED
+PMC1  CPU_CYCLES
+PMC2  FP_FIXED_OPS_SPEC
+PMC3  FP_SCALE_OPS_SPEC
+PMC4  FP_DP_SPEC
+PMC5  SVE_INST_SPEC
+SCF0  CMEM_RD_DATA
+SCF1  CMEM_WR_TOTAL_BYTES
+
+METRICS
+Runtime (RDTSC) [s]    time
+Clock [MHz]    1.E-06*PMC1/time
+CPI    PMC1/PMC0
+FP rate [MFLOP/s]    1.0E-06*(PMC2+PMC3)/time
+SVE FP rate [MFLOP/s]    1.0E-06*(PMC3)/time
+Scalar/NEON FP rate [MFLOP/s]    1.0E-06*(PMC2)/time
+SVE ratio    100*(PMC3)/(PMC2+PMC3)
+Flops per DP instr    PMC3/PMC4
+Arithmetic ratio SVE    100*(PMC4)/(PMC5)
+Memory read bandwidth [MBytes/s]    1.0E-06*(SCF0)*32.0/time
+Memory read data volume [GBytes]    1.0E-09*(SCF0)*32.0
+Memory write bandwidth [MBytes/s]    1.0E-06*(SCF1)/time
+Memory write data volume [GBytes]    1.0E-09*SCF1
+Memory bandwidth [MBytes/s]    1.0E-06*((SCF0*32.0)+SCF1)/time
+Memory data volume [GBytes]    1.0E-09*((SCF0*32.0)+SCF1)
+
+
+LONG
+Formulas:
+FP rate [MFLOP/s] = 1E-06*(FP_FIXED_OPS_SPEC+FP_SCALE_OPS_SPEC)/time
+SVE FP rate [MFLOP/s] = 1E-06*FP_SCALE_OPS_SPEC/time
+Scalar/NEON FP rate [MFLOP/s] = 1E-06*FP_FIXED_OPS_SPEC/time
+SVE ratio = 100*FP_SCALE_OPS_SPEC/(FP_FIXED_OPS_SPEC+FP_SCALE_OPS_SPEC)
+Flops per DP instr = FP_SCALE_OPS_SPEC/FP_DP_SPEC
+Arithmetic ratio SVE = 100*FP_DP_SPEC/SVE_INST_SPEC
+Memory read bandwidth [MBytes/s] = 1.0E-06*CMEM_RD_DATA*32.0/runtime
+Memory read data volume [GBytes] = 1.0E-09*CMEM_RD_DATA*32.0
+Memory write bandwidth [MBytes/s] = 1.0E-06*CMEM_WR_TOTAL_BYTES/runtime
+Memory write data volume [GBytes] = 1.0E-09*CMEM_WR_TOTAL_BYTES
+Memory bandwidth [MBytes/s] = 1.0E-06*((CMEM_RD_DATA*32.0)+CMEM_WR_TOTAL_BYTES)/runtime
+Memory data volume [GBytes] = 1.0E-09*((CMEM_RD_DATA*32.0)+CMEM_WR_TOTAL_BYTES)
+-
+Profiling group to measure memory bandwidth for CPU memory and
+FP rate in any precision for scalar and SVE vector operations with additional
+insight into DP instructions.
+The transfer unit 'beats' of CMEM_RD_DATA is 32 Bytes; CMEM_WR_TOTAL_BYTES is already in bytes.
diff --git a/groups/nvidia_grace/MEM_HP.txt b/groups/nvidia_grace/MEM_HP.txt
@@ -0,0 +1,49 @@
+SHORT Main memory bandwidth and FLOPs
+
+EVENTSET
+PMC0  INST_RETIRED
+PMC1  CPU_CYCLES
+PMC2  FP_FIXED_OPS_SPEC
+PMC3  FP_SCALE_OPS_SPEC
+PMC4  FP_HP_SPEC
+PMC5  SVE_INST_SPEC
+SCF0  CMEM_RD_DATA
+SCF1  CMEM_WR_TOTAL_BYTES
+
+METRICS
+Runtime (RDTSC) [s]    time
+Clock [MHz]    1.E-06*PMC1/time
+CPI    PMC1/PMC0
+FP rate [MFLOP/s]    1.0E-06*(PMC2+PMC3)/time
+SVE FP rate [MFLOP/s]    1.0E-06*(PMC3)/time
+Scalar/NEON FP rate [MFLOP/s]    1.0E-06*(PMC2)/time
+SVE ratio    100*(PMC3)/(PMC2+PMC3)
+Flops per HP instr    PMC3/PMC4
+Arithmetic ratio SVE    100*(PMC4)/(PMC5)
+Memory read bandwidth [MBytes/s]    1.0E-06*(SCF0)*32.0/time
+Memory read data volume [GBytes]    1.0E-09*(SCF0)*32.0
+Memory write bandwidth [MBytes/s]    1.0E-06*(SCF1)/time
+Memory write data volume [GBytes]    1.0E-09*SCF1
+Memory bandwidth [MBytes/s]    1.0E-06*((SCF0*32.0)+SCF1)/time
+Memory data volume [GBytes]    1.0E-09*((SCF0*32.0)+SCF1)
+
+
+LONG
+Formulas:
+FP rate [MFLOP/s] = 1E-06*(FP_FIXED_OPS_SPEC+FP_SCALE_OPS_SPEC)/time
+SVE FP rate [MFLOP/s] = 1E-06*FP_SCALE_OPS_SPEC/time
+Scalar/NEON FP rate [MFLOP/s] = 1E-06*FP_FIXED_OPS_SPEC/time
+SVE ratio = 100*FP_SCALE_OPS_SPEC/(FP_FIXED_OPS_SPEC+FP_SCALE_OPS_SPEC)
+Flops per HP instr = FP_SCALE_OPS_SPEC/FP_HP_SPEC
+Arithmetic ratio SVE = 100*FP_HP_SPEC/SVE_INST_SPEC
+Memory read bandwidth [MBytes/s] = 1.0E-06*CMEM_RD_DATA*32.0/runtime
+Memory read data volume [GBytes] = 1.0E-09*CMEM_RD_DATA*32.0
+Memory write bandwidth [MBytes/s] = 1.0E-06*CMEM_WR_TOTAL_BYTES/runtime
+Memory write data volume [GBytes] = 1.0E-09*CMEM_WR_TOTAL_BYTES
+Memory bandwidth [MBytes/s] = 1.0E-06*((CMEM_RD_DATA*32.0)+CMEM_WR_TOTAL_BYTES)/runtime
+Memory data volume [GBytes] = 1.0E-09*((CMEM_RD_DATA*32.0)+CMEM_WR_TOTAL_BYTES)
+-
+Profiling group to measure memory bandwidth for CPU memory and
+FP rate in any precision for scalar and SVE vector operations with additional
+insight into HP instructions.
+The transfer unit 'beats' of CMEM_RD_DATA is 32 Bytes; CMEM_WR_TOTAL_BYTES is already in bytes.
diff --git a/groups/nvidia_grace/MEM_SP.txt b/groups/nvidia_grace/MEM_SP.txt
@@ -0,0 +1,49 @@
+SHORT Main memory bandwidth and FLOPs
+
+EVENTSET
+PMC0  INST_RETIRED
+PMC1  CPU_CYCLES
+PMC2  FP_FIXED_OPS_SPEC
+PMC3  FP_SCALE_OPS_SPEC
+PMC4  FP_SP_SPEC
+PMC5  SVE_INST_SPEC
+SCF0  CMEM_RD_DATA
+SCF1  CMEM_WR_TOTAL_BYTES
+
+METRICS
+Runtime (RDTSC) [s]    time
+Clock [MHz]    1.E-06*PMC1/time
+CPI    PMC1/PMC0
+FP rate [MFLOP/s]    1.0E-06*(PMC2+PMC3)/time
+SVE FP rate [MFLOP/s]    1.0E-06*(PMC3)/time
+Scalar/NEON FP rate [MFLOP/s]    1.0E-06*(PMC2)/time
+SVE ratio    100*(PMC3)/(PMC2+PMC3)
+Flops per SP instr    PMC3/PMC4
+Arithmetic ratio SVE    100*(PMC4)/(PMC5)
+Memory read bandwidth [MBytes/s]    1.0E-06*(SCF0)*32.0/time
+Memory read data volume [GBytes]    1.0E-09*(SCF0)*32.0
+Memory write bandwidth [MBytes/s]    1.0E-06*(SCF1)/time
+Memory write data volume [GBytes]    1.0E-09*SCF1
+Memory bandwidth [MBytes/s]    1.0E-06*((SCF0*32.0)+SCF1)/time
+Memory data volume [GBytes]    1.0E-09*((SCF0*32.0)+SCF1)
+
+
+LONG
+Formulas:
+FP rate [MFLOP/s] = 1E-06*(FP_FIXED_OPS_SPEC+FP_SCALE_OPS_SPEC)/time
+SVE FP rate [MFLOP/s] = 1E-06*FP_SCALE_OPS_SPEC/time
+Scalar/NEON FP rate [MFLOP/s] = 1E-06*FP_FIXED_OPS_SPEC/time
+SVE ratio = 100*FP_SCALE_OPS_SPEC/(FP_FIXED_OPS_SPEC+FP_SCALE_OPS_SPEC)
+Flops per SP instr = FP_SCALE_OPS_SPEC/FP_SP_SPEC
+Arithmetic ratio SVE = 100*FP_SP_SPEC/SVE_INST_SPEC
+Memory read bandwidth [MBytes/s] = 1.0E-06*CMEM_RD_DATA*32.0/runtime
+Memory read data volume [GBytes] = 1.0E-09*CMEM_RD_DATA*32.0
+Memory write bandwidth [MBytes/s] = 1.0E-06*CMEM_WR_TOTAL_BYTES/runtime
+Memory write data volume [GBytes] = 1.0E-09*CMEM_WR_TOTAL_BYTES
+Memory bandwidth [MBytes/s] = 1.0E-06*((CMEM_RD_DATA*32.0)+CMEM_WR_TOTAL_BYTES)/runtime
+Memory data volume [GBytes] = 1.0E-09*((CMEM_RD_DATA*32.0)+CMEM_WR_TOTAL_BYTES)
+-
+Profiling group to measure memory bandwidth for CPU memory and
+FP rate in any precision for scalar and SVE vector operations with additional
+insight into SP instructions.
+The transfer unit 'beats' of CMEM_RD_DATA is 32 Bytes; CMEM_WR_TOTAL_BYTES is already in bytes.