From ab18b9f112a6eddaf78cca7e9c78f47f7c8242b9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20R=C3=B6hl?= Date: Fri, 15 Nov 2024 12:02:11 +0100 Subject: [PATCH] GNR: Add performance groups for half-precision FP --- groups/GNR/FLOPS_HP.txt | 37 ++++++++++++++++++ groups/GNR/MEM_HP.txt | 87 +++++++++++++++++++++++++++++++++++++++++ 2 files changed, 124 insertions(+) create mode 100644 groups/GNR/FLOPS_HP.txt create mode 100644 groups/GNR/MEM_HP.txt diff --git a/groups/GNR/FLOPS_HP.txt b/groups/GNR/FLOPS_HP.txt new file mode 100644 index 000000000..ad4eceb96 --- /dev/null +++ b/groups/GNR/FLOPS_HP.txt @@ -0,0 +1,37 @@ +SHORT Half Precision MFLOP/s + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +FIXC3 TOPDOWN_SLOTS +PMC0 FP_ARITH_INST_RETIRED2_SCALAR +PMC1 FP_ARITH_INST_RETIRED2_128B_PACKED_HALF +PMC2 FP_ARITH_INST_RETIRED2_256B_PACKED_HALF +PMC3 FP_ARITH_INST_RETIRED2_512B_PACKED_HALF + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +HP [MFLOP/s] 1.0E-06*(PMC0+PMC1*8.0+PMC2*16.0+PMC3*32.0)/time +128B HP [MFLOP/s] 1.0E-06*(PMC1*8.0)/time +256B HP [MFLOP/s] 1.0E-06*(PMC2*16.0)/time +512B HP [MFLOP/s] 1.0E-06*(PMC3*32.0)/time +Packed [MUOPS/s] 1.0E-06*(PMC1+PMC2+PMC3)/time +Scalar [MUOPS/s] 1.0E-06*PMC0/time +Vectorization ratio 100*(PMC1+PMC2+PMC3)/(PMC0+PMC1+PMC2+PMC3) + +LONG +Formulas: +HP [MFLOP/s] = 1.0E-06*(FP_ARITH_INST_RETIRED2_SCALAR+FP_ARITH_INST_RETIRED2_128B_PACKED_HALF*8+FP_ARITH_INST_RETIRED2_256B_PACKED_HALF*16+FP_ARITH_INST_RETIRED2_512B_PACKED_HALF*32)/runtime +128B HP [MFLOP/s] = 1.0E-06*(FP_ARITH_INST_RETIRED2_128B_PACKED_HALF*8)/runtime +256B HP [MFLOP/s] = 1.0E-06*(FP_ARITH_INST_RETIRED2_256B_PACKED_HALF*16)/runtime +512B HP [MFLOP/s] = 1.0E-06*(FP_ARITH_INST_RETIRED2_512B_PACKED_HALF*32)/runtime +Packed [MUOPS/s] = 
1.0E-06*(FP_ARITH_INST_RETIRED2_128B_PACKED_HALF+FP_ARITH_INST_RETIRED2_256B_PACKED_HALF+FP_ARITH_INST_RETIRED2_512B_PACKED_HALF)/runtime +Scalar [MUOPS/s] = 1.0E-06*FP_ARITH_INST_RETIRED2_SCALAR/runtime +Vectorization ratio [%] = 100*(FP_ARITH_INST_RETIRED2_128B_PACKED_HALF+FP_ARITH_INST_RETIRED2_256B_PACKED_HALF+FP_ARITH_INST_RETIRED2_512B_PACKED_HALF)/(FP_ARITH_INST_RETIRED2_SCALAR+FP_ARITH_INST_RETIRED2_128B_PACKED_HALF+FP_ARITH_INST_RETIRED2_256B_PACKED_HALF+FP_ARITH_INST_RETIRED2_512B_PACKED_HALF) +- +Scalar and packed half precision FLOP rates new in Sapphire Rapids. + diff --git a/groups/GNR/MEM_HP.txt b/groups/GNR/MEM_HP.txt new file mode 100644 index 000000000..6d2c5547c --- /dev/null +++ b/groups/GNR/MEM_HP.txt @@ -0,0 +1,87 @@ +SHORT Overview of HP arithmetic and main memory performance + +EVENTSET +FIXC0 INSTR_RETIRED_ANY +FIXC1 CPU_CLK_UNHALTED_CORE +FIXC2 CPU_CLK_UNHALTED_REF +FIXC3 TOPDOWN_SLOTS +PWR0 PWR_PKG_ENERGY +PWR3 PWR_DRAM_ENERGY +PMC0 FP_ARITH_INST_RETIRED2_128B_PACKED_HALF +PMC1 FP_ARITH_INST_RETIRED2_SCALAR_HALF +PMC2 FP_ARITH_INST_RETIRED2_256B_PACKED_HALF +PMC3 FP_ARITH_INST_RETIRED2_512B_PACKED_HALF +MBOX0C0 CAS_COUNT_RD +MBOX0C1 CAS_COUNT_WR +MBOX1C0 CAS_COUNT_RD +MBOX1C1 CAS_COUNT_WR +MBOX2C0 CAS_COUNT_RD +MBOX2C1 CAS_COUNT_WR +MBOX3C0 CAS_COUNT_RD +MBOX3C1 CAS_COUNT_WR +MBOX4C0 CAS_COUNT_RD +MBOX4C1 CAS_COUNT_WR +MBOX5C0 CAS_COUNT_RD +MBOX5C1 CAS_COUNT_WR +MBOX6C0 CAS_COUNT_RD +MBOX6C1 CAS_COUNT_WR +MBOX7C0 CAS_COUNT_RD +MBOX7C1 CAS_COUNT_WR +MBOX8C0 CAS_COUNT_RD +MBOX8C1 CAS_COUNT_WR +MBOX9C0 CAS_COUNT_RD +MBOX9C1 CAS_COUNT_WR +MBOX10C0 CAS_COUNT_RD +MBOX10C1 CAS_COUNT_WR +MBOX11C0 CAS_COUNT_RD +MBOX11C1 CAS_COUNT_WR +MBOX12C0 CAS_COUNT_RD +MBOX12C1 CAS_COUNT_WR + + + +METRICS +Runtime (RDTSC) [s] time +Runtime unhalted [s] FIXC1*inverseClock +Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +CPI FIXC1/FIXC0 +Energy [J] PWR0 +Power [W] PWR0/time +Energy DRAM [J] PWR3 +Power DRAM [W] PWR3/time +HP [MFLOP/s] 
1.0E-06*(PMC0*8.0+PMC1+PMC2*16.0+PMC3*32.0)/time +AVX HP [MFLOP/s] 1.0E-06*(PMC2*16.0+PMC3*32.0)/time +Packed [MUOPS/s] 1.0E-06*(PMC0+PMC2+PMC3)/time +Scalar [MUOPS/s] 1.0E-06*PMC1/time +Memory read bandwidth [MBytes/s] 1.0E-06*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX6C0+MBOX7C0+MBOX8C0+MBOX9C0+MBOX10C0+MBOX11C0)*64.0/time +Memory read data volume [GBytes] 1.0E-09*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX6C0+MBOX7C0+MBOX8C0+MBOX9C0+MBOX10C0+MBOX11C0)*64.0 +Memory write bandwidth [MBytes/s] 1.0E-06*(MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1+MBOX6C1+MBOX7C1+MBOX8C1+MBOX9C1+MBOX10C1+MBOX11C1)*64.0/time +Memory write data volume [GBytes] 1.0E-09*(MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1+MBOX6C1+MBOX7C1+MBOX8C1+MBOX9C1+MBOX10C1+MBOX11C1)*64.0 +Memory bandwidth [MBytes/s] 1.0E-06*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX6C0+MBOX7C0+MBOX8C0+MBOX9C0+MBOX10C0+MBOX11C0+MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1+MBOX6C1+MBOX7C1+MBOX8C1+MBOX9C1+MBOX10C1+MBOX11C1)*64.0/time +Memory data volume [GBytes] 1.0E-09*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX6C0+MBOX7C0+MBOX8C0+MBOX9C0+MBOX10C0+MBOX11C0+MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1+MBOX6C1+MBOX7C1+MBOX8C1+MBOX9C1+MBOX10C1+MBOX11C1)*64.0 +Operational intensity [FLOP/Byte] (PMC0*8.0+PMC1+PMC2*16.0+PMC3*32.0)/((MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX6C0+MBOX7C0+MBOX8C0+MBOX9C0+MBOX10C0+MBOX11C0+MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1+MBOX6C1+MBOX7C1+MBOX8C1+MBOX9C1+MBOX10C1+MBOX11C1)*64.0) + +LONG +Formulas: +Power [W] = PWR_PKG_ENERGY/runtime +Power DRAM [W] = PWR_DRAM_ENERGY/runtime +HP [MFLOP/s] = 1.0E-06*(FP_ARITH_INST_RETIRED2_128B_PACKED_HALF*8+FP_ARITH_INST_RETIRED2_SCALAR_HALF+FP_ARITH_INST_RETIRED2_256B_PACKED_HALF*16+FP_ARITH_INST_RETIRED2_512B_PACKED_HALF*32)/runtime +AVX HP [MFLOP/s] = 1.0E-06*(FP_ARITH_INST_RETIRED2_256B_PACKED_HALF*16+FP_ARITH_INST_RETIRED2_512B_PACKED_HALF*32)/runtime +Packed [MUOPS/s] = 
1.0E-06*(FP_ARITH_INST_RETIRED2_128B_PACKED_HALF+FP_ARITH_INST_RETIRED2_256B_PACKED_HALF+FP_ARITH_INST_RETIRED2_512B_PACKED_HALF)/runtime +Scalar [MUOPS/s] = 1.0E-06*FP_ARITH_INST_RETIRED2_SCALAR_HALF/runtime +Memory read bandwidth [MBytes/s] = 1.0E-06*(SUM(CAS_COUNT_RD))*64.0/runtime +Memory read data volume [GBytes] = 1.0E-09*(SUM(CAS_COUNT_RD))*64.0 +Memory write bandwidth [MBytes/s] = 1.0E-06*(SUM(CAS_COUNT_WR))*64.0/runtime +Memory write data volume [GBytes] = 1.0E-09*(SUM(CAS_COUNT_WR))*64.0 +Memory bandwidth [MBytes/s] = 1.0E-06*(SUM(CAS_COUNT_RD)+SUM(CAS_COUNT_WR))*64.0/runtime +Memory data volume [GBytes] = 1.0E-09*(SUM(CAS_COUNT_RD)+SUM(CAS_COUNT_WR))*64.0 +Operational intensity [FLOP/Byte] = (FP_ARITH_INST_RETIRED2_128B_PACKED_HALF*8+FP_ARITH_INST_RETIRED2_SCALAR_HALF+FP_ARITH_INST_RETIRED2_256B_PACKED_HALF*16+FP_ARITH_INST_RETIRED2_512B_PACKED_HALF*32)/((SUM(CAS_COUNT_RD)+SUM(CAS_COUNT_WR))*64.0) +-- +Profiling group to measure memory bandwidth drawn by all cores of a socket. +Since this group is based on Uncore events it is only possible to measure on +a per-socket basis. Also outputs total data volume transferred from main memory. +Scalar and packed half precision FLOP rates. Also reports on packed AVX +(256b and 512b) half precision instructions. +The operational intensity is calculated using the FP values of the cores and the +memory data volume of the whole socket. The actual operational intensity for +multiple CPUs can be found in the statistics table in the Sum column.