diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 7c18f2ec6..3c069c131 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -94,7 +94,6 @@ build-arm8-perf: tags: - testcluster - check-event-files: stage: .pre tags: @@ -104,6 +103,16 @@ check-event-files: - test/check_data_files.py events # - test/check_data_files.py groups +notify-github-pending: + stage: .pre + tags: + - testcluster + variables: + NO_SLURM_SUBMIT: 1 + when: always + script: + - test/gitlab-ci/notify_github.sh pending + arch-gen: stage: build tags: @@ -147,3 +156,23 @@ cuda-pipeline: strategy: depend variables: PARENT_PIPELINE_ID: $CI_PIPELINE_ID + +notify-github-success: + stage: .post + tags: + - testcluster + variables: + NO_SLURM_SUBMIT: 1 + when: on_success + script: + - test/gitlab-ci/notify_github.sh success + +notify-github-failure: + stage: .post + tags: + - testcluster + variables: + NO_SLURM_SUBMIT: 1 + when: on_failure + script: + - test/gitlab-ci/notify_github.sh failure diff --git a/CHANGELOG b/CHANGELOG index b3ab5c193..bc52f7b57 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -1,3 +1,26 @@ +# Changelog 5.2.2 +- Add mutex to pinning library +- Fix pin string parsing in pinning library +- Make SBIN path configurable in build system +- Add PKGBUILD for ArchLinux package builds +- Remove accessDaemon double-fork in systemd environments +- Group updates for L2/L3 (mainly AMD Zen) +- Fix multi-initialization in MarkerAPI +- Add energy event scaling for Fujitsu A64FX +- Nvmon: Use Cupti error string to get better warning/error messages +- Nvmon: Store events internally to re-use event strings in stopCounters +- AccessLayer: Catch SIGCHLD to stop sending requests to accessDaemon if it was killed +- likwid-genTopoCfg: Update writing and reading of topology file +- Add INST_RETIRED_NOP event for Intel Icelake (desktop & server) +- Removed some memory leaks +- Improved checks for RDPMC availability +- Add TOPDOWN_SLOTS for perf_event +- Fix for systems with CPU sockets without hwthreads (A64FX 
FX1000) +- Fix if HOME environment variable is not set (systemd) +- Reader function for perf_event_paranoid in Lua to get state early +- likwid-mpirun: Sanitize np and ppn values to avoid crashes + + # Changelog 5.2.1 - Add support for Intel Rocketlake and AMD Zen3 variant (Family 19, Model 0x50) - Fix for perf_event multiplexing (important!) diff --git a/README.md b/README.md index 05cd4efbc..ba8339405 100644 --- a/README.md +++ b/README.md @@ -10,7 +10,7 @@ our hands to test them properly. [LIKWID Playlist (YouTube)](https://www.youtube.com/playlist?list=PLxVedhmuwLq2CqJpAABDMbZG8Whi7pKsk) -[![Build Status](https://gitlab.rrze.fau.de/ub55yzis/likwid/badges/master/pipeline.svg)](https://gitlab.rrze.fau.de/ub55yzis/likwid/-/commits/master) [![General LIKWID DOI](https://zenodo.org/badge/DOI/10.5281/zenodo.4275676.svg)](https://doi.org/10.5281/zenodo.4275676) +[![Build Status](https://gitos.rrze.fau.de/ub55yzis/likwid/badges/master/pipeline.svg)](https://gitos.rrze.fau.de/ub55yzis/likwid/-/commits/master) [![General LIKWID DOI](https://zenodo.org/badge/DOI/10.5281/zenodo.4275676.svg)](https://doi.org/10.5281/zenodo.4275676) It consists of: diff --git a/bench/armv8/peakflops_neon.ptt b/bench/armv8/peakflops_neon.ptt new file mode 100644 index 000000000..76a394c58 --- /dev/null +++ b/bench/armv8/peakflops_neon.ptt @@ -0,0 +1,67 @@ +STREAMS 1 +TYPE DOUBLE +FLOPS 28 +BYTES 8 +DESC Double-precision multiplications and additions with a single load, optimized for NEON FMAs +LOADS 1 +STORES 0 +INSTR_LOOP 29 +UOPS 29 +ldr q1, [STR0] +ldr q2, [STR0] +ldr q3, [STR0] +ldr q4, [STR0] +ldr q5, [STR0] +ldr q6, [STR0] +ldr q7, [STR0] +ldr q8, [STR0] +ldr q9, [STR0] +ldr q10, [STR0] +ldr q11, [STR0] +ldr q12, [STR0] +ldr q13, [STR0] +ldr q14, [STR0] +ldr q15, [STR0] +ldr q16, [STR0] +ldr q17, [STR0] +ldr q18, [STR0] +ldr q19, [STR0] +ldr q20, [STR0] +ldr q21, [STR0] +ldr q22, [STR0] +ldr q23, [STR0] +ldr q24, [STR0] +ldr q25, [STR0] +ldr q26, [STR0] +ldr q27, [STR0] +ldr q28, [STR0] 
+LOOP 2 +ldr q16, [STR0], #8 +fadd v1.2d, v1.2d, v1.2d +fadd v2.2d, v2.2d, v2.2d +fmul v3.2d, v3.2d, v3.2d +fmul v4.2d, v4.2d, v4.2d +fadd v5.2d, v5.2d, v5.2d +fadd v6.2d, v6.2d, v6.2d +fmul v7.2d, v7.2d, v7.2d +fmul v8.2d, v8.2d, v8.2d +fadd v9.2d, v9.2d, v9.2d +fadd v10.2d, v10.2d, v10.2d +fmul v11.2d, v11.2d, v11.2d +fmul v12.2d, v12.2d, v12.2d +fadd v13.2d, v13.2d, v13.2d +fadd v14.2d, v14.2d, v14.2d +fmul v15.2d, v15.2d, v15.2d +fmul v16.2d, v16.2d, v16.2d +fadd v17.2d, v17.2d, v17.2d +fadd v18.2d, v18.2d, v18.2d +fmul v19.2d, v19.2d, v19.2d +fmul v20.2d, v20.2d, v20.2d +fadd v21.2d, v21.2d, v21.2d +fadd v22.2d, v22.2d, v22.2d +fmul v23.2d, v23.2d, v23.2d +fmul v24.2d, v24.2d, v24.2d +fadd v25.2d, v25.2d, v25.2d +fadd v26.2d, v26.2d, v26.2d +fmul v27.2d, v27.2d, v27.2d +fmul v28.2d, v28.2d, v28.2d diff --git a/bench/armv8/peakflops_neon_fma.ptt b/bench/armv8/peakflops_neon_fma.ptt new file mode 100644 index 000000000..75a0442c3 --- /dev/null +++ b/bench/armv8/peakflops_neon_fma.ptt @@ -0,0 +1,67 @@ +STREAMS 1 +TYPE DOUBLE +FLOPS 56 +BYTES 8 +DESC Double-precision multiplications and additions with a single load, optimized for NEON FMAs +LOADS 1 +STORES 0 +INSTR_LOOP 29 +UOPS 29 +ldr q1, [STR0] +ldr q2, [STR0] +ldr q3, [STR0] +ldr q4, [STR0] +ldr q5, [STR0] +ldr q6, [STR0] +ldr q7, [STR0] +ldr q8, [STR0] +ldr q9, [STR0] +ldr q10, [STR0] +ldr q11, [STR0] +ldr q12, [STR0] +ldr q13, [STR0] +ldr q14, [STR0] +ldr q15, [STR0] +ldr q16, [STR0] +ldr q17, [STR0] +ldr q18, [STR0] +ldr q19, [STR0] +ldr q20, [STR0] +ldr q21, [STR0] +ldr q22, [STR0] +ldr q23, [STR0] +ldr q24, [STR0] +ldr q25, [STR0] +ldr q26, [STR0] +ldr q27, [STR0] +ldr q28, [STR0] +LOOP 2 +ldr q16, [STR0], #8 +fmla v1.2d, v1.2d, v1.2d +fmla v2.2d, v2.2d, v2.2d +fmla v3.2d, v3.2d, v3.2d +fmla v4.2d, v4.2d, v4.2d +fmla v5.2d, v5.2d, v5.2d +fmla v6.2d, v6.2d, v6.2d +fmla v7.2d, v7.2d, v7.2d +fmla v8.2d, v8.2d, v8.2d +fmla v9.2d, v9.2d, v9.2d +fmla v10.2d, v10.2d, v10.2d +fmla v11.2d, v11.2d, v11.2d +fmla 
v12.2d, v12.2d, v12.2d +fmla v13.2d, v13.2d, v13.2d +fmla v14.2d, v14.2d, v14.2d +fmla v15.2d, v15.2d, v15.2d +fmla v16.2d, v16.2d, v16.2d +fmla v17.2d, v17.2d, v17.2d +fmla v18.2d, v18.2d, v18.2d +fmla v19.2d, v19.2d, v19.2d +fmla v20.2d, v20.2d, v20.2d +fmla v21.2d, v21.2d, v21.2d +fmla v22.2d, v22.2d, v22.2d +fmla v23.2d, v23.2d, v23.2d +fmla v24.2d, v24.2d, v24.2d +fmla v25.2d, v25.2d, v25.2d +fmla v26.2d, v26.2d, v26.2d +fmla v27.2d, v27.2d, v27.2d +fmla v28.2d, v28.2d, v28.2d diff --git a/bench/armv8/peakflops_sp_neon.ptt b/bench/armv8/peakflops_sp_neon.ptt new file mode 100644 index 000000000..2b584d7b0 --- /dev/null +++ b/bench/armv8/peakflops_sp_neon.ptt @@ -0,0 +1,67 @@ +STREAMS 1 +TYPE SINGLE +FLOPS 28 +BYTES 4 +DESC Single-precision multiplications and additions with a single load, optimized for NEON FMAs +LOADS 1 +STORES 0 +INSTR_LOOP 29 +UOPS 29 +ldr q1, [STR0] +ldr q2, [STR0] +ldr q3, [STR0] +ldr q4, [STR0] +ldr q5, [STR0] +ldr q6, [STR0] +ldr q7, [STR0] +ldr q8, [STR0] +ldr q9, [STR0] +ldr q10, [STR0] +ldr q11, [STR0] +ldr q12, [STR0] +ldr q13, [STR0] +ldr q14, [STR0] +ldr q15, [STR0] +ldr q16, [STR0] +ldr q17, [STR0] +ldr q18, [STR0] +ldr q19, [STR0] +ldr q20, [STR0] +ldr q21, [STR0] +ldr q22, [STR0] +ldr q23, [STR0] +ldr q24, [STR0] +ldr q25, [STR0] +ldr q26, [STR0] +ldr q27, [STR0] +ldr q28, [STR0] +LOOP 4 +ldr q16, [STR0], #8 +fadd v1.4s, v1.4s, v1.4s +fadd v2.4s, v2.4s, v2.4s +fmul v3.4s, v3.4s, v3.4s +fmul v4.4s, v4.4s, v4.4s +fadd v5.4s, v5.4s, v5.4s +fadd v6.4s, v6.4s, v6.4s +fmul v7.4s, v7.4s, v7.4s +fmul v8.4s, v8.4s, v8.4s +fadd v9.4s, v9.4s, v9.4s +fadd v10.4s, v10.4s, v10.4s +fmul v11.4s, v11.4s, v11.4s +fmul v12.4s, v12.4s, v12.4s +fadd v13.4s, v13.4s, v13.4s +fadd v14.4s, v14.4s, v14.4s +fmul v15.4s, v15.4s, v15.4s +fmul v16.4s, v16.4s, v16.4s +fadd v17.4s, v17.4s, v17.4s +fadd v18.4s, v18.4s, v18.4s +fmul v19.4s, v19.4s, v19.4s +fmul v20.4s, v20.4s, v20.4s +fadd v21.4s, v21.4s, v21.4s +fadd v22.4s, v22.4s, v22.4s +fmul v23.4s, 
v23.4s, v23.4s +fmul v24.4s, v24.4s, v24.4s +fadd v25.4s, v25.4s, v25.4s +fadd v26.4s, v26.4s, v26.4s +fmul v27.4s, v27.4s, v27.4s +fmul v28.4s, v28.4s, v28.4s diff --git a/bench/armv8/peakflops_sp_neon_fma.ptt b/bench/armv8/peakflops_sp_neon_fma.ptt new file mode 100644 index 000000000..4076cbab4 --- /dev/null +++ b/bench/armv8/peakflops_sp_neon_fma.ptt @@ -0,0 +1,67 @@ +STREAMS 1 +TYPE SINGLE +FLOPS 56 +BYTES 4 +DESC Single-precision multiplications and additions with a single load, optimized for NEON FMAs +LOADS 1 +STORES 0 +INSTR_LOOP 29 +UOPS 29 +ldr q1, [STR0] +ldr q2, [STR0] +ldr q3, [STR0] +ldr q4, [STR0] +ldr q5, [STR0] +ldr q6, [STR0] +ldr q7, [STR0] +ldr q8, [STR0] +ldr q9, [STR0] +ldr q10, [STR0] +ldr q11, [STR0] +ldr q12, [STR0] +ldr q13, [STR0] +ldr q14, [STR0] +ldr q15, [STR0] +ldr q16, [STR0] +ldr q17, [STR0] +ldr q18, [STR0] +ldr q19, [STR0] +ldr q20, [STR0] +ldr q21, [STR0] +ldr q22, [STR0] +ldr q23, [STR0] +ldr q24, [STR0] +ldr q25, [STR0] +ldr q26, [STR0] +ldr q27, [STR0] +ldr q28, [STR0] +LOOP 4 +ldr q16, [STR0], #8 +fmla v1.4s, v1.4s, v1.4s +fmla v2.4s, v2.4s, v2.4s +fmla v3.4s, v3.4s, v3.4s +fmla v4.4s, v4.4s, v4.4s +fmla v5.4s, v5.4s, v5.4s +fmla v6.4s, v6.4s, v6.4s +fmla v7.4s, v7.4s, v7.4s +fmla v8.4s, v8.4s, v8.4s +fmla v9.4s, v9.4s, v9.4s +fmla v10.4s, v10.4s, v10.4s +fmla v11.4s, v11.4s, v11.4s +fmla v12.4s, v12.4s, v12.4s +fmla v13.4s, v13.4s, v13.4s +fmla v14.4s, v14.4s, v14.4s +fmla v15.4s, v15.4s, v15.4s +fmla v16.4s, v16.4s, v16.4s +fmla v17.4s, v17.4s, v17.4s +fmla v18.4s, v18.4s, v18.4s +fmla v19.4s, v19.4s, v19.4s +fmla v20.4s, v20.4s, v20.4s +fmla v21.4s, v21.4s, v21.4s +fmla v22.4s, v22.4s, v22.4s +fmla v23.4s, v23.4s, v23.4s +fmla v24.4s, v24.4s, v24.4s +fmla v25.4s, v25.4s, v25.4s +fmla v26.4s, v26.4s, v26.4s +fmla v27.4s, v27.4s, v27.4s +fmla v28.4s, v28.4s, v28.4s diff --git a/bench/likwid-bench.c b/bench/likwid-bench.c index d3b00124d..0105a9c3b 100644 --- a/bench/likwid-bench.c +++ b/bench/likwid-bench.c @@ -491,10 
+491,10 @@ int main(int argc, char** argv) if ((int)(floor(orig_size/currentWorkgroup->numberOfThreads)) % test->stride) { int typesize = allocator_dataTypeLength(test->type); - newsize = (((int)(floor(orig_size/nrThreads))/stride)*(stride))*nrThreads; + newsize = (((size_t)(floor(orig_size/nrThreads))/stride)*(stride))*nrThreads; if (newsize > 0 && warn_once) { - fprintf (stdout, "Warning: Sanitizing vector length to a multiple of the loop stride %d and thread count %d from %d elements (%d bytes) to %d elements (%d bytes)\n",stride, nrThreads, orig_size, orig_size*typesize, newsize, newsize*typesize); + fprintf (stdout, "Warning: Sanitizing vector length to a multiple of the loop stride %d and thread count %d from %ld elements (%ld bytes) to %ld elements (%ld bytes)\n",stride, nrThreads, orig_size, orig_size*typesize, newsize, newsize*typesize); warn_once = 0; } else if (newsize == 0) diff --git a/bench/src/strUtil.c b/bench/src/strUtil.c index a7c12c86a..0acb2b9e2 100644 --- a/bench/src/strUtil.c +++ b/bench/src/strUtil.c @@ -154,7 +154,7 @@ parse_workgroup(Workgroup* group, const_bstring str, DataType type) } else { - fprintf(stderr, "Unknown affinity domain %s\n", bdata(tokens->entry[2])); + fprintf(stderr, "Unknown affinity domain %s\n", bdata(tokens->entry[0])); bstrListDestroy(tokens); return NULL; } diff --git a/groups/CLX/ENERGY.txt b/groups/CLX/ENERGY.txt index fe7829fbe..2e9caaab8 100644 --- a/groups/CLX/ENERGY.txt +++ b/groups/CLX/ENERGY.txt @@ -8,13 +8,14 @@ TMP0 TEMP_CORE PWR0 PWR_PKG_ENERGY PWR1 PWR_PP0_ENERGY PWR3 PWR_DRAM_ENERGY - +UBOXFIX UNCORE_CLOCK METRICS Runtime (RDTSC) [s] time Runtime unhalted [s] FIXC1*inverseClock Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +Uncore Clock [MHz] 1.E-06*UBOXFIX/time CPI FIXC1/FIXC0 Temperature [C] TMP0 Energy [J] PWR0 diff --git a/groups/ICL/BRANCH.txt b/groups/ICL/BRANCH.txt index b8d41b23b..3eea82801 100644 --- a/groups/ICL/BRANCH.txt +++ b/groups/ICL/BRANCH.txt @@ -4,6 +4,7 @@ EVENTSET FIXC0 
INSTR_RETIRED_ANY FIXC1 CPU_CLK_UNHALTED_CORE FIXC2 CPU_CLK_UNHALTED_REF +FIXC3 TOPDOWN_SLOTS PMC0 BR_INST_RETIRED_ALL_BRANCHES PMC1 BR_MISP_RETIRED_ALL_BRANCHES diff --git a/groups/ICL/DATA.txt b/groups/ICL/DATA.txt index 4e6e938e1..ee154279a 100644 --- a/groups/ICL/DATA.txt +++ b/groups/ICL/DATA.txt @@ -4,6 +4,7 @@ EVENTSET FIXC0 INSTR_RETIRED_ANY FIXC1 CPU_CLK_UNHALTED_CORE FIXC2 CPU_CLK_UNHALTED_REF +FIXC3 TOPDOWN_SLOTS PMC0 MEM_INST_RETIRED_ALL_LOADS PMC1 MEM_INST_RETIRED_ALL_STORES diff --git a/groups/ICL/DIVIDE.txt b/groups/ICL/DIVIDE.txt index 40b4ab6f3..5e3be1670 100644 --- a/groups/ICL/DIVIDE.txt +++ b/groups/ICL/DIVIDE.txt @@ -4,6 +4,7 @@ EVENTSET FIXC0 INSTR_RETIRED_ANY FIXC1 CPU_CLK_UNHALTED_CORE FIXC2 CPU_CLK_UNHALTED_REF +FIXC3 TOPDOWN_SLOTS PMC0 ARITH_DIVIDER_COUNT PMC1 ARITH_DIVIDER_ACTIVE diff --git a/groups/ICL/FLOPS_AVX.txt b/groups/ICL/FLOPS_AVX.txt index e44a91389..0f41891fc 100644 --- a/groups/ICL/FLOPS_AVX.txt +++ b/groups/ICL/FLOPS_AVX.txt @@ -4,6 +4,7 @@ EVENTSET FIXC0 INSTR_RETIRED_ANY FIXC1 CPU_CLK_UNHALTED_CORE FIXC2 CPU_CLK_UNHALTED_REF +FIXC3 TOPDOWN_SLOTS PMC0 FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE PMC1 FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE PMC2 FP_ARITH_INST_RETIRED_512B_PACKED_SINGLE diff --git a/groups/ICL/FLOPS_DP.txt b/groups/ICL/FLOPS_DP.txt index 177cff2d0..64e7d3d3e 100644 --- a/groups/ICL/FLOPS_DP.txt +++ b/groups/ICL/FLOPS_DP.txt @@ -4,6 +4,7 @@ EVENTSET FIXC0 INSTR_RETIRED_ANY FIXC1 CPU_CLK_UNHALTED_CORE FIXC2 CPU_CLK_UNHALTED_REF +FIXC3 TOPDOWN_SLOTS PMC0 FP_ARITH_INST_RETIRED_128B_PACKED_DOUBLE PMC1 FP_ARITH_INST_RETIRED_SCALAR_DOUBLE PMC2 FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE diff --git a/groups/ICL/FLOPS_SP.txt b/groups/ICL/FLOPS_SP.txt index 01d98c2f2..3e6780b86 100644 --- a/groups/ICL/FLOPS_SP.txt +++ b/groups/ICL/FLOPS_SP.txt @@ -4,6 +4,7 @@ EVENTSET FIXC0 INSTR_RETIRED_ANY FIXC1 CPU_CLK_UNHALTED_CORE FIXC2 CPU_CLK_UNHALTED_REF +FIXC3 TOPDOWN_SLOTS PMC0 FP_ARITH_INST_RETIRED_128B_PACKED_SINGLE PMC1 
FP_ARITH_INST_RETIRED_SCALAR_SINGLE PMC2 FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE diff --git a/groups/ICL/MEM.txt b/groups/ICL/MEM.txt index b4b6b0d76..45fd290be 100644 --- a/groups/ICL/MEM.txt +++ b/groups/ICL/MEM.txt @@ -1,4 +1,4 @@ -SHORT L3 cache bandwidth in MBytes/s +SHORT Memory bandwidth in MBytes/s EVENTSET FIXC0 INSTR_RETIRED_ANY @@ -22,16 +22,14 @@ Memory data volume [GBytes] 1.0E-09*(MBOX0C1+MBOX0C2)*64.0 LONG Formulas: -L3 load bandwidth [MBytes/s] = 1.0E-06*L2_LINES_IN_ALL*64.0/time -L3 load data volume [GBytes] = 1.0E-09*L2_LINES_IN_ALL*64.0 -L3 evict bandwidth [MBytes/s] = 1.0E-06*L2_TRANS_L2_WB*64.0/time -L3 evict data volume [GBytes] = 1.0E-09*L2_TRANS_L2_WB*64.0 -L3 bandwidth [MBytes/s] = 1.0E-06*(L2_LINES_IN_ALL+L2_TRANS_L2_WB)*64/time -L3 data volume [GBytes] = 1.0E-09*(L2_LINES_IN_ALL+L2_TRANS_L2_WB)*64 +Memory load bandwidth [MBytes/s] = 1.0E-06*DRAM_READS*64.0/time +Memory load data volume [GBytes] = 1.0E-09*DRAM_READS*64.0 +Memory evict bandwidth [MBytes/s] = 1.0E-06*DRAM_WRITES*64.0/time +Memory evict data volume [GBytes] = 1.0E-09*DRAM_WRITES*64.0 +Memory bandwidth [MBytes/s] = 1.0E-06*(DRAM_READS+DRAM_WRITES)*64/time +Memory data volume [GBytes] = 1.0E-09*(DRAM_READS+DRAM_WRITES)*64 - -Profiling group to measure L3 cache bandwidth. The bandwidth is computed by the -number of cache line allocated in the L2 and the number of modified cache lines -evicted from the L2. This group also output data volume transferred between the -L3 and measured cores L2 caches. Note that this bandwidth also includes data -transfers due to a write allocate load on a store miss in L2. +Profiling group to measure memory bandwidth. The desktop-class Intel +chips provide free-running memory counters in the MMIO space. Since they are +free-running, they might overflow without notice if the read intervals are too long. 
diff --git a/groups/ICL/TMA.txt b/groups/ICL/TMA.txt index a8f156e15..fb0bdedfc 100644 --- a/groups/ICL/TMA.txt +++ b/groups/ICL/TMA.txt @@ -4,6 +4,7 @@ EVENTSET FIXC0 INSTR_RETIRED_ANY FIXC1 CPU_CLK_UNHALTED_CORE FIXC2 CPU_CLK_UNHALTED_REF +FIXC3 TOPDOWN_SLOTS TMA0 RETIRING TMA1 BAD_SPECULATION TMA2 FRONTEND_BOUND @@ -15,6 +16,7 @@ Runtime unhalted [s] FIXC1*inverseClock Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock CPI FIXC1/FIXC0 IPC FIXC0/FIXC1 +Total slots FIXC3 Front End [%] TMA2*100 Speculation [%] TMA1*100 Retiring [%] TMA0*100 @@ -22,6 +24,7 @@ Back End [%] TMA3*100 LONG Formulas: +Total slots = TOPDOWN_SLOTS Front End [%] = FRONTEND_BOUND*100 Speculation [%] = BAD_SPECULATION*100 Retiring [%] = RETIRING*100 diff --git a/groups/ICX/ENERGY.txt b/groups/ICX/ENERGY.txt index 3d4fd4ce5..52680770c 100644 --- a/groups/ICX/ENERGY.txt +++ b/groups/ICX/ENERGY.txt @@ -10,6 +10,7 @@ PWR0 PWR_PKG_ENERGY PWR1 PWR_PP0_ENERGY PWR3 PWR_DRAM_ENERGY PWR4 PWR_PLATFORM_ENERGY +UBOXFIX UNCORE_CLOCKTICKS @@ -17,6 +18,7 @@ METRICS Runtime (RDTSC) [s] time Runtime unhalted [s] FIXC1*inverseClock Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +Uncore Clock [MHz] 1.E-06*UBOXFIX/time CPI FIXC1/FIXC0 Temperature [C] TMP0 Energy [J] PWR0 diff --git a/groups/ICX/L3.txt b/groups/ICX/L3.txt index 59185f4e0..d24f9e9d9 100644 --- a/groups/ICX/L3.txt +++ b/groups/ICX/L3.txt @@ -4,6 +4,7 @@ EVENTSET FIXC0 INSTR_RETIRED_ANY FIXC1 CPU_CLK_UNHALTED_CORE FIXC2 CPU_CLK_UNHALTED_REF +FIXC3 TOPDOWN_SLOTS PMC0 L2_LINES_IN_ALL PMC1 L2_TRANS_L2_WB PMC2 L2_LINES_OUT_SILENT diff --git a/groups/ICX/MEM_DP.txt b/groups/ICX/MEM_DP.txt index d6e481a86..acb6a1963 100644 --- a/groups/ICX/MEM_DP.txt +++ b/groups/ICX/MEM_DP.txt @@ -4,6 +4,7 @@ EVENTSET FIXC0 INSTR_RETIRED_ANY FIXC1 CPU_CLK_UNHALTED_CORE FIXC2 CPU_CLK_UNHALTED_REF +FIXC3 TOPDOWN_SLOTS PWR0 PWR_PKG_ENERGY PWR3 PWR_DRAM_ENERGY PMC0 FP_ARITH_INST_RETIRED_128B_PACKED_DOUBLE @@ -22,6 +23,10 @@ MBOX4C0 CAS_COUNT_RD MBOX4C1 CAS_COUNT_WR MBOX5C0 
CAS_COUNT_RD MBOX5C1 CAS_COUNT_WR +MBOX6C0 CAS_COUNT_RD +MBOX6C1 CAS_COUNT_WR +MBOX7C0 CAS_COUNT_RD +MBOX7C1 CAS_COUNT_WR METRICS Runtime (RDTSC) [s] time @@ -36,13 +41,13 @@ DP [MFLOP/s] 1.0E-06*(PMC0*2.0+PMC1+PMC2*4.0+PMC3*8.0)/time AVX DP [MFLOP/s] 1.0E-06*(PMC2*4.0+PMC3*8.0)/time Packed [MUOPS/s] 1.0E-06*(PMC0+PMC2+PMC3)/time Scalar [MUOPS/s] 1.0E-06*PMC1/time -Memory read bandwidth [MBytes/s] 1.0E-06*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0)*64.0/time -Memory read data volume [GBytes] 1.0E-09*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0)*64.0 -Memory write bandwidth [MBytes/s] 1.0E-06*(MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1)*64.0/time -Memory write data volume [GBytes] 1.0E-09*(MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1)*64.0 -Memory bandwidth [MBytes/s] 1.0E-06*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1)*64.0/time -Memory data volume [GBytes] 1.0E-09*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1)*64.0 -Operational intensity (PMC0*2.0+PMC1+PMC2*4.0+PMC3*8.0)/((MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1)*64.0) +Memory read bandwidth [MBytes/s] 1.0E-06*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX6C0+MBOX7C0)*64.0/time +Memory read data volume [GBytes] 1.0E-09*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX6C0+MBOX7C0)*64.0 +Memory write bandwidth [MBytes/s] 1.0E-06*(MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1+MBOX6C1+MBOX7C1)*64.0/time +Memory write data volume [GBytes] 1.0E-09*(MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1+MBOX6C1+MBOX7C1)*64.0 +Memory bandwidth [MBytes/s] 1.0E-06*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX6C0+MBOX7C0+MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1+MBOX6C1+MBOX7C1)*64.0/time +Memory data volume [GBytes] 
1.0E-09*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX6C0+MBOX7C0+MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1+MBOX6C1+MBOX7C1)*64.0 +Operational intensity (PMC0*2.0+PMC1+PMC2*4.0+PMC3*8.0)/((MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX6C0+MBOX7C0+MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1+MBOX6C1+MBOX7C1)*64.0) LONG Formulas: diff --git a/groups/ICX/MEM_SP.txt b/groups/ICX/MEM_SP.txt index 5720938a0..fb7ad9c71 100644 --- a/groups/ICX/MEM_SP.txt +++ b/groups/ICX/MEM_SP.txt @@ -4,6 +4,7 @@ EVENTSET FIXC0 INSTR_RETIRED_ANY FIXC1 CPU_CLK_UNHALTED_CORE FIXC2 CPU_CLK_UNHALTED_REF +FIXC3 TOPDOWN_SLOTS PWR0 PWR_PKG_ENERGY PWR3 PWR_DRAM_ENERGY PMC0 FP_ARITH_INST_RETIRED_128B_PACKED_SINGLE @@ -22,6 +23,10 @@ MBOX4C0 CAS_COUNT_RD MBOX4C1 CAS_COUNT_WR MBOX5C0 CAS_COUNT_RD MBOX5C1 CAS_COUNT_WR +MBOX6C0 CAS_COUNT_RD +MBOX6C1 CAS_COUNT_WR +MBOX7C0 CAS_COUNT_RD +MBOX7C1 CAS_COUNT_WR METRICS Runtime (RDTSC) [s] time @@ -36,13 +41,13 @@ SP [MFLOP/s] 1.0E-06*(PMC0*4.0+PMC1+PMC2*8.0+PMC3*16.0)/time AVX SP [MFLOP/s] 1.0E-06*(PMC2*8.0+PMC3*16.0)/time Packed [MUOPS/s] 1.0E-06*(PMC0+PMC2+PMC3)/time Scalar [MUOPS/s] 1.0E-06*PMC1/time -Memory read bandwidth [MBytes/s] 1.0E-06*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0)*64.0/time -Memory read data volume [GBytes] 1.0E-09*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0)*64.0 -Memory write bandwidth [MBytes/s] 1.0E-06*(MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1)*64.0/time -Memory write data volume [GBytes] 1.0E-09*(MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1)*64.0 -Memory bandwidth [MBytes/s] 1.0E-06*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1)*64.0/time -Memory data volume [GBytes] 1.0E-09*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1)*64.0 -Operational intensity (PMC0*4.0+PMC1+PMC2*8.0+PMC3*16.0)/((MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1)*64.0) 
+Memory read bandwidth [MBytes/s] 1.0E-06*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX6C0+MBOX7C0)*64.0/time +Memory read data volume [GBytes] 1.0E-09*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX6C0+MBOX7C0)*64.0 +Memory write bandwidth [MBytes/s] 1.0E-06*(MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1+MBOX6C1+MBOX7C1)*64.0/time +Memory write data volume [GBytes] 1.0E-09*(MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1+MBOX6C1+MBOX7C1)*64.0 +Memory bandwidth [MBytes/s] 1.0E-06*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX6C0+MBOX7C0+MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1+MBOX6C1+MBOX7C1)*64.0/time +Memory data volume [GBytes] 1.0E-09*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX6C0+MBOX7C0+MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1+MBOX6C1+MBOX7C1)*64.0 +Operational intensity (PMC0*4.0+PMC1+PMC2*8.0+PMC3*16.0)/((MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX6C0+MBOX7C0+MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1+MBOX6C1+MBOX7C1)*64.0) LONG Formulas: diff --git a/groups/ICX/TMA.txt b/groups/ICX/TMA.txt index a8f156e15..fb0bdedfc 100644 --- a/groups/ICX/TMA.txt +++ b/groups/ICX/TMA.txt @@ -4,6 +4,7 @@ EVENTSET FIXC0 INSTR_RETIRED_ANY FIXC1 CPU_CLK_UNHALTED_CORE FIXC2 CPU_CLK_UNHALTED_REF +FIXC3 TOPDOWN_SLOTS TMA0 RETIRING TMA1 BAD_SPECULATION TMA2 FRONTEND_BOUND @@ -15,6 +16,7 @@ Runtime unhalted [s] FIXC1*inverseClock Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock CPI FIXC1/FIXC0 IPC FIXC0/FIXC1 +Total slots FIXC3 Front End [%] TMA2*100 Speculation [%] TMA1*100 Retiring [%] TMA0*100 @@ -22,6 +24,7 @@ Back End [%] TMA3*100 LONG Formulas: +Total slots = TOPDOWN_SLOTS Front End [%] = FRONTEND_BOUND*100 Speculation [%] = BAD_SPECULATION*100 Retiring [%] = RETIRING*100 diff --git a/groups/RKL/BRANCH.txt b/groups/RKL/BRANCH.txt index b8d41b23b..3eea82801 100644 --- a/groups/RKL/BRANCH.txt +++ b/groups/RKL/BRANCH.txt @@ -4,6 +4,7 @@ EVENTSET FIXC0 INSTR_RETIRED_ANY FIXC1 CPU_CLK_UNHALTED_CORE FIXC2 
CPU_CLK_UNHALTED_REF +FIXC3 TOPDOWN_SLOTS PMC0 BR_INST_RETIRED_ALL_BRANCHES PMC1 BR_MISP_RETIRED_ALL_BRANCHES diff --git a/groups/RKL/DATA.txt b/groups/RKL/DATA.txt index 4e6e938e1..ee154279a 100644 --- a/groups/RKL/DATA.txt +++ b/groups/RKL/DATA.txt @@ -4,6 +4,7 @@ EVENTSET FIXC0 INSTR_RETIRED_ANY FIXC1 CPU_CLK_UNHALTED_CORE FIXC2 CPU_CLK_UNHALTED_REF +FIXC3 TOPDOWN_SLOTS PMC0 MEM_INST_RETIRED_ALL_LOADS PMC1 MEM_INST_RETIRED_ALL_STORES diff --git a/groups/RKL/DIVIDE.txt b/groups/RKL/DIVIDE.txt index 40b4ab6f3..5e3be1670 100644 --- a/groups/RKL/DIVIDE.txt +++ b/groups/RKL/DIVIDE.txt @@ -4,6 +4,7 @@ EVENTSET FIXC0 INSTR_RETIRED_ANY FIXC1 CPU_CLK_UNHALTED_CORE FIXC2 CPU_CLK_UNHALTED_REF +FIXC3 TOPDOWN_SLOTS PMC0 ARITH_DIVIDER_COUNT PMC1 ARITH_DIVIDER_ACTIVE diff --git a/groups/RKL/ENERGY.txt b/groups/RKL/ENERGY.txt index fe7829fbe..09045cc24 100644 --- a/groups/RKL/ENERGY.txt +++ b/groups/RKL/ENERGY.txt @@ -4,6 +4,7 @@ EVENTSET FIXC0 INSTR_RETIRED_ANY FIXC1 CPU_CLK_UNHALTED_CORE FIXC2 CPU_CLK_UNHALTED_REF +FIXC3 TOPDOWN_SLOTS TMP0 TEMP_CORE PWR0 PWR_PKG_ENERGY PWR1 PWR_PP0_ENERGY diff --git a/groups/RKL/FLOPS_AVX.txt b/groups/RKL/FLOPS_AVX.txt index e44a91389..0f41891fc 100644 --- a/groups/RKL/FLOPS_AVX.txt +++ b/groups/RKL/FLOPS_AVX.txt @@ -4,6 +4,7 @@ EVENTSET FIXC0 INSTR_RETIRED_ANY FIXC1 CPU_CLK_UNHALTED_CORE FIXC2 CPU_CLK_UNHALTED_REF +FIXC3 TOPDOWN_SLOTS PMC0 FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE PMC1 FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE PMC2 FP_ARITH_INST_RETIRED_512B_PACKED_SINGLE diff --git a/groups/RKL/FLOPS_DP.txt b/groups/RKL/FLOPS_DP.txt index 177cff2d0..64e7d3d3e 100644 --- a/groups/RKL/FLOPS_DP.txt +++ b/groups/RKL/FLOPS_DP.txt @@ -4,6 +4,7 @@ EVENTSET FIXC0 INSTR_RETIRED_ANY FIXC1 CPU_CLK_UNHALTED_CORE FIXC2 CPU_CLK_UNHALTED_REF +FIXC3 TOPDOWN_SLOTS PMC0 FP_ARITH_INST_RETIRED_128B_PACKED_DOUBLE PMC1 FP_ARITH_INST_RETIRED_SCALAR_DOUBLE PMC2 FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE diff --git a/groups/RKL/FLOPS_SP.txt b/groups/RKL/FLOPS_SP.txt 
index 01d98c2f2..3e6780b86 100644 --- a/groups/RKL/FLOPS_SP.txt +++ b/groups/RKL/FLOPS_SP.txt @@ -4,6 +4,7 @@ EVENTSET FIXC0 INSTR_RETIRED_ANY FIXC1 CPU_CLK_UNHALTED_CORE FIXC2 CPU_CLK_UNHALTED_REF +FIXC3 TOPDOWN_SLOTS PMC0 FP_ARITH_INST_RETIRED_128B_PACKED_SINGLE PMC1 FP_ARITH_INST_RETIRED_SCALAR_SINGLE PMC2 FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE diff --git a/groups/broadwellD/ENERGY.txt b/groups/broadwellD/ENERGY.txt index 09eaeb140..55ec315ba 100644 --- a/groups/broadwellD/ENERGY.txt +++ b/groups/broadwellD/ENERGY.txt @@ -9,6 +9,7 @@ PWR0 PWR_PKG_ENERGY PWR1 PWR_PP0_ENERGY PWR2 PWR_PP1_ENERGY PWR3 PWR_DRAM_ENERGY +UBOXFIX UNCORE_CLOCK @@ -16,6 +17,7 @@ METRICS Runtime (RDTSC) [s] time Runtime unhalted [s] FIXC1*inverseClock Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +Uncore Clock [MHz] 1.E-06*UBOXFIX/time CPI FIXC1/FIXC0 Temperature [C] TMP0 Energy [J] PWR0 diff --git a/groups/broadwellEP/ENERGY.txt b/groups/broadwellEP/ENERGY.txt index fe7829fbe..ba1e1e309 100644 --- a/groups/broadwellEP/ENERGY.txt +++ b/groups/broadwellEP/ENERGY.txt @@ -8,6 +8,7 @@ TMP0 TEMP_CORE PWR0 PWR_PKG_ENERGY PWR1 PWR_PP0_ENERGY PWR3 PWR_DRAM_ENERGY +UBOXFIX UNCORE_CLOCK @@ -15,6 +16,7 @@ METRICS Runtime (RDTSC) [s] time Runtime unhalted [s] FIXC1*inverseClock Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +Uncore Clock [MHz] 1.E-06*UBOXFIX/time CPI FIXC1/FIXC0 Temperature [C] TMP0 Energy [J] PWR0 diff --git a/groups/haswellEP/ENERGY.txt b/groups/haswellEP/ENERGY.txt index ee0af1b5c..41cda04fd 100644 --- a/groups/haswellEP/ENERGY.txt +++ b/groups/haswellEP/ENERGY.txt @@ -8,13 +8,14 @@ TMP0 TEMP_CORE PWR0 PWR_PKG_ENERGY PWR1 PWR_PP0_ENERGY PWR3 PWR_DRAM_ENERGY - +UBOXFIX UNCORE_CLOCK METRICS Runtime (RDTSC) [s] time Runtime unhalted [s] FIXC1*inverseClock Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +Uncore Clock [MHz] 1.E-06*UBOXFIX/time CPI FIXC1/FIXC0 Temperature [C] TMP0 Energy [J] PWR0 diff --git a/groups/skylake/MEM.txt b/groups/skylake/MEM.txt index 3a12df7be..976370e59 
100644 --- a/groups/skylake/MEM.txt +++ b/groups/skylake/MEM.txt @@ -1,4 +1,4 @@ -SHORT L3 cache bandwidth in MBytes/s +SHORT Memory bandwidth in MBytes/s EVENTSET FIXC0 INSTR_RETIRED_ANY @@ -21,16 +21,13 @@ Memory data volume [GBytes] 1.0E-09*(MBOX0C1+MBOX0C2)*64.0 LONG Formulas: -L3 load bandwidth [MBytes/s] = 1.0E-06*L2_LINES_IN_ALL*64.0/time -L3 load data volume [GBytes] = 1.0E-09*L2_LINES_IN_ALL*64.0 -L3 evict bandwidth [MBytes/s] = 1.0E-06*L2_TRANS_L2_WB*64.0/time -L3 evict data volume [GBytes] = 1.0E-09*L2_TRANS_L2_WB*64.0 -L3 bandwidth [MBytes/s] = 1.0E-06*(L2_LINES_IN_ALL+L2_TRANS_L2_WB)*64/time -L3 data volume [GBytes] = 1.0E-09*(L2_LINES_IN_ALL+L2_TRANS_L2_WB)*64 +Memory load bandwidth [MBytes/s] = 1.0E-06*DRAM_READS*64.0/time +Memory load data volume [GBytes] = 1.0E-09*DRAM_READS*64.0 +Memory evict bandwidth [MBytes/s] = 1.0E-06*DRAM_WRITES*64.0/time +Memory evict data volume [GBytes] = 1.0E-09*DRAM_WRITES*64.0 +Memory bandwidth [MBytes/s] = 1.0E-06*(DRAM_READS+DRAM_WRITES)*64/time +Memory data volume [GBytes] = 1.0E-09*(DRAM_READS+DRAM_WRITES)*64 - -Profiling group to measure L3 cache bandwidth. The bandwidth is computed by the -number of cache line allocated in the L2 and the number of modified cache lines -evicted from the L2. This group also output data volume transferred between the -L3 and measured cores L2 caches. Note that this bandwidth also includes data -transfers due to a write allocate load on a store miss in L2. - +Profiling group to measure memory bandwidth. The desktop-class Intel +chips provide free-running memory counters in the MMIO space. Since they are +free-running, they might overflow without notice if the read intervals are too long. 
diff --git a/groups/skylakeX/ENERGY.txt b/groups/skylakeX/ENERGY.txt index fe7829fbe..2e9caaab8 100644 --- a/groups/skylakeX/ENERGY.txt +++ b/groups/skylakeX/ENERGY.txt @@ -8,13 +8,14 @@ TMP0 TEMP_CORE PWR0 PWR_PKG_ENERGY PWR1 PWR_PP0_ENERGY PWR3 PWR_DRAM_ENERGY - +UBOXFIX UNCORE_CLOCK METRICS Runtime (RDTSC) [s] time Runtime unhalted [s] FIXC1*inverseClock Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock +Uncore Clock [MHz] 1.E-06*UBOXFIX/time CPI FIXC1/FIXC0 Temperature [C] TMP0 Energy [J] PWR0 diff --git a/make/config_checks.mk b/make/config_checks.mk index 390a7cc8d..4d23b3607 100644 --- a/make/config_checks.mk +++ b/make/config_checks.mk @@ -11,12 +11,12 @@ HAS_MEMPOLICY = $(shell if [ $(KERNEL_VERSION) -lt 7 -a $(KERNEL_VERSION_MAJOR) HAS_PERFEVENT = $(shell if [ $(KERNEL_VERSION) -lt 6 -a $(KERNEL_VERSION_MAJOR) -lt 2 -a $(KERNEL_VERSION_MINOR) -lt 31 ]; then echo 0; else echo 1; fi; ) # determine glibc Version -GLIBC_VERSION_MAJOR := $(shell echo '\#include ' | cc -dM -E - | grep -E '\#define __GLIBC__' | cut -d ' ' -f 3) -GLIBC_VERSION_MINOR := $(shell echo '\#include ' | cc -dM -E - | grep -E '\#define __GLIBC_MINOR__' | cut -d ' ' -f 3) +#GLIBC_VERSION_MAJOR := $(shell echo '\#include ' | cc -dM -E - | grep -E '\#define __GLIBC__' | cut -d ' ' -f 3) +GLIBC_VERSION_MAJOR := $(shell ldd --version | grep ldd | awk '{ print $$NF }' | awk -F. '{ print $$1 }') +#GLIBC_VERSION_MINOR := $(shell echo '\#include ' | cc -dM -E - | grep -E '\#define __GLIBC_MINOR__' | cut -d ' ' -f 3) +GLIBC_VERSION_MINOR := $(shell ldd --version | grep ldd | awk '{ print $$NF }' | awk -F. 
'{ print $$2 }') -HAS_SCHEDAFFINITY = $(shell if [ $(GLIBC_VERSION_MINOR) -lt 4 ]; then \ - echo 0; else echo 1; \ - fi; ) +HAS_SCHEDAFFINITY = $(shell if [ $(GLIBC_VERSION_MINOR) -lt 4 ]; then echo 0; else echo 1; fi; ) ENOUGH_CPUS = $(shell [ $(shell grep processor /proc/cpuinfo | wc -l) -le $(MAX_NUM_THREADS) ] && echo True ) ifneq ($(strip $(ENOUGH_CPUS)), True) diff --git a/make/include_GCCARMv7.mk b/make/include_GCCARMv7.mk index d8068cb97..13aec5c3f 100644 --- a/make/include_GCCARMv7.mk +++ b/make/include_GCCARMv7.mk @@ -26,6 +26,7 @@ SHARED_LFLAGS = -shared -fvisibility=hidden DEFINES = -DPAGE_ALIGNMENT=4096 DEFINES += -DLIKWID_MONITOR_LOCK DEFINES += -DDEBUGLEV=0 +DEFINES += -D__ARM_ARCH_7A__ INCLUDES = LIBS = -lm -lrt diff --git a/make/include_GCCARMv8.mk b/make/include_GCCARMv8.mk index b31da1ac0..b203b5f7d 100644 --- a/make/include_GCCARMv8.mk +++ b/make/include_GCCARMv8.mk @@ -26,6 +26,7 @@ SHARED_LFLAGS = -shared -fvisibility=hidden DEFINES = -DPAGE_ALIGNMENT=4096 DEFINES += -DLIKWID_MONITOR_LOCK DEFINES += -DDEBUGLEV=0 +DEFINES += -D__ARM_ARCH_8A INCLUDES = LIBS = -lm -lrt diff --git a/src/access_client.c b/src/access_client.c index 835ff75e2..9db2c3932 100644 --- a/src/access_client.c +++ b/src/access_client.c @@ -72,6 +72,7 @@ static pthread_mutex_t globalLock = PTHREAD_MUTEX_INITIALIZER; static pthread_mutex_t *cpuLocks = NULL; /* ##### FUNCTION DEFINITIONS - LOCAL TO THIS SOURCE FILE ########### */ +void __attribute__((destructor (104))) close_access_client(void); static char* access_client_strerror(AccessErrorType det) @@ -106,6 +107,21 @@ access_client_errno(AccessErrorType det) } } +static void access_client_signal_catcher(int sig) { + close_access_client(); +} + +static int +access_client_catch_signal() +{ + struct sigaction sia; + sia.sa_handler = access_client_signal_catcher; + sigemptyset(&sia.sa_mask); + sia.sa_flags = SA_ONSTACK; + sigaction(SIGCHLD, &sia, NULL); + return 0; +} + static int access_client_startDaemon(int cpu_id) { diff 
--git a/src/access_x86_rdpmc.c b/src/access_x86_rdpmc.c index d4f7f66f0..9b9dc3eb6 100644 --- a/src/access_x86_rdpmc.c +++ b/src/access_x86_rdpmc.c @@ -39,6 +39,7 @@ #include #include #include +#include #include @@ -56,6 +57,7 @@ static int rdpmc_works_fixed_ref = -1; static int rdpmc_works_fixed_slots = -1; static int rdpmc_works_llc = -1; static int rdpmc_works_mem = -1; +static pthread_mutex_t rdpmc_setup_lock = PTHREAD_MUTEX_INITIALIZER; static inline int @@ -93,7 +95,7 @@ segfault_sigaction_rdpmc(int signal, siginfo_t *si, void *arg) static int test_rdpmc(int cpu_id, uint64_t value, int flag) { - int ret; + int ret = -1; int pid; pid = fork(); @@ -146,6 +148,8 @@ access_x86_rdpmc_init(const int cpu_id) eax = 0xA; CPUID(eax, ebx, ecx, edx); } + unsigned eventSupportedCount = (eax >> 24) & 0xff; + pthread_mutex_lock(&rdpmc_setup_lock); if (rdpmc_works_pmc < 0) { rdpmc_works_pmc = test_rdpmc(cpu_id, 0, 0); @@ -153,26 +157,35 @@ access_x86_rdpmc_init(const int cpu_id) } if (rdpmc_works_fixed_inst < 0 && cpuid_info.isIntel) { - if (ebx & (1<<1)) - rdpmc_works_fixed_inst = test_rdpmc(cpu_id, (1<<30), 0); - DEBUG_PRINT(DEBUGLEV_DEVELOP, Test for RDPMC for FIXED instruction counter returned %d, rdpmc_works_fixed_inst); + if (eventSupportedCount > 1 && (!(ebx & (1<<1)))) + { + rdpmc_works_fixed_inst = test_rdpmc(cpu_id, (1<<30), 0); + DEBUG_PRINT(DEBUGLEV_DEVELOP, Test for RDPMC for FIXED instruction counter returned %d, rdpmc_works_fixed_inst); + } } if (rdpmc_works_fixed_cyc < 0 && cpuid_info.isIntel) { - if (ebx & (1<<0)) - rdpmc_works_fixed_cyc = test_rdpmc(cpu_id, (1<<30) + 1, 0); - DEBUG_PRINT(DEBUGLEV_DEVELOP, Test for RDPMC for FIXED core cycles counter returned %d, rdpmc_works_fixed_cyc); + if (eventSupportedCount > 0 && (!(ebx & (1<<0)))) + { + rdpmc_works_fixed_cyc = test_rdpmc(cpu_id, (1<<30) + 1, 0); + DEBUG_PRINT(DEBUGLEV_DEVELOP, Test for RDPMC for FIXED core cycles counter returned %d, rdpmc_works_fixed_cyc); + } } if (rdpmc_works_fixed_ref < 0 && 
cpuid_info.isIntel) { - if (ebx & (1<<2)) - rdpmc_works_fixed_ref = test_rdpmc(cpu_id, (1<<30) + 2, 0); - DEBUG_PRINT(DEBUGLEV_DEVELOP, Test for RDPMC for FIXED reference cycle counter returned %d, rdpmc_works_fixed_ref); + if (eventSupportedCount > 2 && (!(ebx & (1<<2)))) + { + rdpmc_works_fixed_ref = test_rdpmc(cpu_id, (1<<30) + 2, 0); + DEBUG_PRINT(DEBUGLEV_DEVELOP, Test for RDPMC for FIXED reference cycle counter returned %d, rdpmc_works_fixed_ref); + } } - if (rdpmc_works_fixed_slots < 0) + if (rdpmc_works_fixed_slots < 0 && cpuid_info.isIntel) { - rdpmc_works_fixed_slots = test_rdpmc(cpu_id, (1<<30) + 3, 0); - DEBUG_PRINT(DEBUGLEV_DEVELOP, Test for RDPMC for FIXED slots counter returned %d, rdpmc_works_fixed_ref); + if (eventSupportedCount > 7 && (!(ebx & (1<<7)))) + { + rdpmc_works_fixed_slots = test_rdpmc(cpu_id, (1<<30) + 3, 0); + DEBUG_PRINT(DEBUGLEV_DEVELOP, Test for RDPMC for FIXED slots counter returned %d, rdpmc_works_fixed_slots); + } } if (rdpmc_works_llc < 0 && (!cpuid_info.isIntel)) { @@ -206,12 +219,14 @@ access_x86_rdpmc_init(const int cpu_id) break; } } + pthread_mutex_unlock(&rdpmc_setup_lock); return 0; } void access_x86_rdpmc_finalize(const int cpu_id) { + pthread_mutex_lock(&rdpmc_setup_lock); rdpmc_works_pmc = -1; rdpmc_works_fixed_inst = -1; rdpmc_works_fixed_cyc = -1; @@ -219,6 +234,7 @@ access_x86_rdpmc_finalize(const int cpu_id) rdpmc_works_fixed_slots = -1; rdpmc_works_llc = -1; rdpmc_works_mem = -1; + pthread_mutex_unlock(&rdpmc_setup_lock); } int diff --git a/src/applications/likwid-mpirun.lua b/src/applications/likwid-mpirun.lua index cded49fc8..92d5e53eb 100644 --- a/src/applications/likwid-mpirun.lua +++ b/src/applications/likwid-mpirun.lua @@ -110,9 +110,10 @@ local hosts = {} local perf = {} local mpitype = nil local slurm_involved = false +local slurm_no_tasks_per_node = false local omptype = nil local skipStr = "" -local executable = {} +local execList = {} local envsettings = {} local mpiopts = nil local debug = false @@ 
-269,9 +270,7 @@ local function executeOpenMPI(wrapperscript, hostfile, env, nrNodes) elseif ver2 == "6" then bindstr = "--bind-to-none" end - elseif ver1 == 2 then - bindstr = "--bind-to none" - elseif ver1 == 3 then + elseif ver1 >= 2 then bindstr = "--bind-to none" end @@ -401,7 +400,7 @@ local function executeIntelMPI(wrapperscript, hostfile, env, nrNodes) print_stdout(string.format("EXEC: %s/mpiexec -perhost %d %s -np %d %s", path, ppn, envstr, np, wrapperscript)) print_stdout(string.format("EXEC: %s/mpdallexit", path)) else - print_stdout(string.format("%s %s -f %s -np %d -perhost %d %s",mpiexecutable, envstr, hostfile, np, ppn, wrapperscript)) + print_stdout(string.format("%s %s -machinefile %s -np %d -perhost %d %s",mpiexecutable, envstr, hostfile, np, ppn, wrapperscript)) end end @@ -412,7 +411,7 @@ local function executeIntelMPI(wrapperscript, hostfile, env, nrNodes) ret = os.execute(string.format("%s/mpiexec -perhost %d %s -np %d %s", path, ppn, envstr, np, wrapperscript)) ret = os.execute(string.format("%s/mpdallexit", path)) else - ret = os.execute(string.format("%s %s -f %s -np %d -perhost %d %s",mpiexecutable, envstr, hostfile, np, ppn, wrapperscript)) + ret = os.execute(string.format("%s %s -machinefile %s -np %d -perhost %d %s",mpiexecutable, envstr, hostfile, np, ppn, wrapperscript)) end return ret end @@ -652,7 +651,9 @@ local function executeSlurm(wrapperscript, hostfile, env, nrNodes) end opts["nodes"] = string.format("%d", nrNodes) opts["ntasks"] = string.format("%d", np) - opts["ntasks-per-node"] = string.format("%d", ppn) + if not slurm_no_tasks_per_node then + opts["ntasks-per-node"] = string.format("%d", ppn) + end opts["cpu_bind"] = "none" opts["cpus-per-task"] = string.format("%d", threads) supported_types = _srun_get_mpi_types() @@ -886,10 +887,10 @@ local function getMpiExec(mpitype) end local function getOmpType() - local cmd = string.format("ldd %s 2>/dev/null", executable[1]) + local cmd = string.format("ldd %s 2>/dev/null", 
execList[1]) local f = io.popen(cmd, 'r') if f == nil then - cmd = string.format("ldd $(basename %s) 2>/dev/null", executable[1]) + cmd = string.format("ldd $(basename %s) 2>/dev/null", execList[1]) f = io.popen(cmd, 'r') end omptype = nil @@ -1027,6 +1028,12 @@ local function assignHosts(hosts, np, ppn, tpp) break elseif tmp < ppn*tpp then ppn = tmp + if tpp > 1 then + ppn = math.tointeger(tmp/tpp) + end + if slurm_involved then + slurm_no_tasks_per_node = true + end end end if break_while then @@ -1435,22 +1442,30 @@ local function writeWrapperScript(scriptname, execStr, hosts, envsettings, outpu end end + if np == nil then np = 1 end + local i_np = math.tointeger(np) + if i_np == nil then i_np = 1 end + + if ppn == nil then ppn = 1 end + local i_ppn = math.tointeger(ppn) + if i_ppn == nil then i_ppn = 1 end + if mpitype == "openmpi" then glsize_var = "$OMPI_COMM_WORLD_SIZE" glrank_var = "${OMPI_COMM_WORLD_RANK:-$(($GLOBALSIZE * 2))}" losize_var = "$OMPI_COMM_WORLD_LOCAL_SIZE" elseif mpitype == "intelmpi" then glrank_var = "${PMI_RANK:-$(($GLOBALSIZE * 2))}" - glsize_var = tostring(math.tointeger(np)) - losize_var = tostring(math.tointeger(ppn)) + glsize_var = tostring(i_np) + losize_var = tostring(i_ppn) elseif mpitype == "mvapich2" then glrank_var = "${PMI_RANK:-$(($GLOBALSIZE * 2))}" - glsize_var = tostring(math.tointeger(np)) - losize_var = tostring(math.tointeger(ppn)) + glsize_var = tostring(i_np) + losize_var = tostring(i_ppn) elseif mpitype == "slurm" then glrank_var = "${SLURM_PROCID:-$(($GLOBALSIZE * 2))}" - glsize_var = tostring(math.tointeger(np)) - losize_var = string.format("${SLURM_NTASKS_PER_NODE:-%d}", math.tointeger(ppn)) + glsize_var = tostring(i_np) + losize_var = string.format("${SLURM_NTASKS_PER_NODE:-%d}", i_ppn) else print_stderr("Invalid MPI vendor "..mpitype) return @@ -1523,6 +1538,7 @@ local function writeWrapperScript(scriptname, execStr, hosts, envsettings, outpu f:write("GLOBALSIZE="..glsize_var.."\n") 
f:write("GLOBALRANK="..glrank_var.."\n") if os.getenv("OMP_NUM_THREADS") == nil then + if tpp == nil then tpp = 1 end f:write(string.format("export OMP_NUM_THREADS=%d\n", tpp)) else f:write(string.format("export OMP_NUM_THREADS=%s\n", os.getenv("OMP_NUM_THREADS"))) @@ -2206,22 +2222,28 @@ if use_marker and #perf == 0 then mpirun_exit(1) end -for i,x in pairs(arg) do +for i = 1, #arg do if i > 0 then - table.insert(executable, abspath(x) or x) + local x = arg[i] + local t = abspath(x) or x + if string.find(x, " ") then + table.insert(execList, "\""..t.."\"") + else + table.insert(execList, t) + end end end -if #executable == 0 then +if #execList == 0 then print_stderr("ERROR: No executable given on commandline") mpirun_exit(1) end if debug then - print_stdout("DEBUG: Executable given on commandline: "..table.concat(executable, " ")) + print_stdout("DEBUG: Executable given on commandline: "..table.concat(execList, " ")) end local gotExecutable = false -for i,x in pairs(executable) do +for i,x in pairs(execList) do if likwid.access(x, "x") == 0 then gotExecutable = true break @@ -2229,7 +2251,7 @@ for i,x in pairs(executable) do end if not gotExecutable then print_stderr("ERROR: Cannot find an executable on commandline") - print_stderr(table.concat(executable, " ")) + print_stderr(table.concat(execList, " ")) mpirun_exit(1) end @@ -2465,24 +2487,20 @@ if skipStr == "" then skipStr = '-s 0x7' end elseif mpitype == "openmpi" then - if omptype == "intel" and nrNodes > 1 then + if omptype == "intel" then skipStr = '-s 0x7' - elseif omptype == "intel" and nrNodes == 1 then - skipStr = '-s 0x1' - if tpp > 1 then - skipStr = '-s 0x3' - end - elseif omptype == "gnu" and nrNodes > 1 then + elseif omptype == "gnu" then skipStr = '-s 0x7' - elseif omptype == "gnu" and nrNodes == 1 then - skipStr = '-s 0x0' - if tpp > 0 then - skipStr = '-s 0x1' - end end elseif mpitype == "slurm" then - if omptype == "intel" and nrNodes > 1 then - if nrNodes == 1 then + if omptype == "intel" 
then + if nrNodes == 1 and tpp == 1 then + skipStr = '-s 0x1' + else + skipStr = '-s 0x3' + end + elseif omptype == "gnu" then + if nrNodes == 1 and tpp == 1 then skipStr = '-s 0x1' else skipStr = '-s 0x3' @@ -2502,7 +2520,7 @@ if writeHostfile == nil or getEnvironment == nil or executeCommand == nil then end writeHostfile(newhosts, hostfilename) -local skipped_ranks = writeWrapperScript(scriptfilename, table.concat(executable, " "), newhosts, envsettings, outfilename) +local skipped_ranks = writeWrapperScript(scriptfilename, table.concat(execList, " "), newhosts, envsettings, outfilename) local env = getEnvironment() local exitvalue = executeCommand(scriptfilename, hostfilename, env, nrNodes) diff --git a/src/applications/likwid-perfctr.lua b/src/applications/likwid-perfctr.lua index 8d9d87e3d..5b1e33b86 100644 --- a/src/applications/likwid-perfctr.lua +++ b/src/applications/likwid-perfctr.lua @@ -59,7 +59,7 @@ local function examples() end end -local function usage() +local function usage(config) version() io.stdout:write("A tool to read out performance counter registers on x86, ARM and POWER processors\n\n") io.stdout:write("Options:\n") @@ -97,6 +97,11 @@ local function usage() io.stdout:write("-o, --output \t Store output to file. 
(Optional: Apply text filter according to filename suffix)\n") io.stdout:write("-O\t\t\t Output easily parseable CSV instead of fancy tables\n") io.stdout:write("--stats\t\t\t Always print statistics table\n") + if config and config["daemonMode"] == -1 then + io.stdout:write("perf_event specific options:\n") + io.stdout:write("--perfpid \t\t Measure given PID\n") + io.stdout:write("--execpid\t\t Use the PID of wrapped application for measurements\n") + end io.stdout:write("\n") examples() end @@ -152,8 +157,14 @@ markerFolder = "/tmp" markerFile = string.format("%s/likwid_%d.txt", markerFolder, likwid.getpid()) cpuClock = 1 execpid = false +local perf_paranoid = likwid.perf_event_paranoid() if config["daemonMode"] == -1 then - execpid = true + if perf_paranoid > 2 then + print_stderr(string.format("Cannot use performance monitoring with perf_event_paranoid = %d", perf_paranoid)) + os.exit(1) + elseif perf_paranoid > 0 then + execpid = true + end end perfflags = nil perfpid = nil @@ -199,7 +210,7 @@ local function perfctr_exit(exitcode) end if #arg == 0 then - usage() + usage(config) perfctr_exit(0) end @@ -213,7 +224,7 @@ for opt,arg in likwid.getopt(arg, {"a", "c:", "C:", "e", "E:", "g:", "h", "H", " end end if opt == "h" or opt == "help" then - usage() + usage(config) perfctr_exit(0) elseif opt == "v" or opt == "version" then version() @@ -371,7 +382,11 @@ for opt,arg in likwid.getopt(arg, {"a", "c:", "C:", "e", "E:", "g:", "h", "H", " end local execList = {} for i=1, likwid.tablelength(arg)-2 do - table.insert(execList, arg[i]) + if string.find(arg[i], " ") then + table.insert(execList, "\""..arg[i].."\"") + else + table.insert(execList, arg[i]) + end end if perfpid and (not execpid) and (not cpulist) then @@ -654,7 +669,7 @@ end if #event_string_list == 0 and #gpu_event_string_list == 0 and not print_info then print_stderr("Option(s) -g or -W must be given on commandline") - usage() + usage(config) perfctr_exit(1) end @@ -718,7 +733,7 @@ end if use_wrapper and 
likwid.tablelength(arg)-2 == 0 and print_info == false then print_stderr("No Executable can be found on commandline") - usage() + usage(config) perfctr_exit(0) end diff --git a/src/applications/likwid-perfscope.lua b/src/applications/likwid-perfscope.lua index 6c8e7abcb..211c44952 100644 --- a/src/applications/likwid-perfscope.lua +++ b/src/applications/likwid-perfscope.lua @@ -228,7 +228,11 @@ for opt,arg in likwid.getopt(arg, {"h","v","V:","g:","C:","c:","t:","r:","a","d" end local execList = {} for i=1, likwid.tablelength(arg)-2 do - table.insert(execList, arg[i]) + if string.find(arg[i], " ") then + table.insert(execList, "\""..arg[i].."\"") + else + table.insert(execList, arg[i]) + end end if print_configs then diff --git a/src/applications/likwid-pin.lua b/src/applications/likwid-pin.lua index caedeb960..7604cbd49 100644 --- a/src/applications/likwid-pin.lua +++ b/src/applications/likwid-pin.lua @@ -159,7 +159,11 @@ for opt,arg in likwid.getopt(arg, {"c:", "C:", "d:", "h", "i", "m", "p", "q", "s end local execList = {} for i=1, likwid.tablelength(arg)-2 do - table.insert(execList, arg[i]) + if string.find(arg[i], " ") then + table.insert(execList, "\""..arg[i].."\"") + else + table.insert(execList, arg[i]) + end end likwid.setenv("LIKWID_NO_ACCESS", "1") diff --git a/src/applications/likwid-powermeter.lua b/src/applications/likwid-powermeter.lua index 3d00679e3..3f476bf74 100644 --- a/src/applications/likwid-powermeter.lua +++ b/src/applications/likwid-powermeter.lua @@ -279,7 +279,11 @@ if #arg == 0 then end else for i=1, likwid.tablelength(arg)-2 do - table.insert(execList, arg[i]) + if string.find(arg[i], " ") then + table.insert(execList, "\""..arg[i].."\"") + else + table.insert(execList, arg[i]) + end end execString = execString .. 
table.concat(execList," ") end diff --git a/src/applications/likwid.lua b/src/applications/likwid.lua index 8cd0f9cd7..edce61c8f 100644 --- a/src/applications/likwid.lua +++ b/src/applications/likwid.lua @@ -1222,6 +1222,18 @@ end likwid.gethostname = gethostname +local function get_perf_event_paranoid() + local f = io.open("/proc/sys/kernel/perf_event_paranoid") + if f then + value = f:read("*l") + f:close() + return tonumber(value) + end + return 4 +end + +likwid.perf_event_paranoid = get_perf_event_paranoid + local function getjid() jid = "X" for _, v in pairs({"PBS_JOBID", "SLURM_JOB_ID", "SLURM_JOBID", "LOADL_STEP_ID", "LSB_JOBID" }) do diff --git a/src/includes/likwid.h b/src/includes/likwid.h index 181aea39a..b491b9b68 100644 --- a/src/includes/likwid.h +++ b/src/includes/likwid.h @@ -2178,7 +2178,7 @@ Get the metric result of all measurement cycles. It reads all raw results for th @param [in] gpuId ID of the GPU that should be read @return The metric result */ -double nvmon_getMetric(int groupId, int metricId, int gpuId); +double nvmon_getMetric(int groupId, int metricId, int gpuId) __attribute__ ((visibility ("default") )); /*! \brief Get the last metric result of the specified group, counter and GPU (Nvmon) Get the metric result of the last measurement cycle. It reads all raw results for the given groupId and gpuId. @@ -2187,7 +2187,7 @@ Get the metric result of the last measurement cycle. It reads all raw results fo @param [in] gpuId ID of the GPU that should be read @return The metric result */ -double nvmon_getLastMetric(int groupId, int metricId, int gpuId); +double nvmon_getLastMetric(int groupId, int metricId, int gpuId) __attribute__ ((visibility ("default") )); /*! 
\brief Get the number of configured event groups (Nvmon) @return Number of groups diff --git a/src/includes/nvmon_perfworks.h b/src/includes/nvmon_perfworks.h index cbef81dd1..d09205395 100644 --- a/src/includes/nvmon_perfworks.h +++ b/src/includes/nvmon_perfworks.h @@ -736,6 +736,58 @@ void nvmon_perfworks_freeDevice(NvmonDevice_t dev) free(dev->chip); dev->chip = NULL; } + if (dev->nvEventSets) + { + for (int i = 0; i < dev->numNvEventSets; i++) + { + NvmonEventSet* evset = &dev->nvEventSets[i]; + bstrListDestroy(evset->events); + if (evset->nvEvents) + { + free(evset->nvEvents); + evset->nvEvents = NULL; + } + if (evset->results) + { + free(evset->results); + evset->results = NULL; + } + if (evset->configImage) + { + free(evset->configImage); + evset->configImage = NULL; + evset->configImageSize = 0; + } + if (evset->counterDataImage) + { + free(evset->counterDataImage); + evset->counterDataImage = NULL; + evset->counterDataImageSize = 0; + } + if (evset->counterDataScratchBuffer) + { + free(evset->counterDataScratchBuffer); + evset->counterDataScratchBuffer = NULL; + evset->counterDataScratchBufferSize = 0; + } + if (evset->counterDataImagePrefix) + { + free(evset->counterDataImagePrefix); + evset->counterDataImagePrefix = NULL; + evset->counterDataImagePrefixSize = 0; + } + if (evset->counterAvailabilityImage) + { + free(evset->counterAvailabilityImage); + evset->counterAvailabilityImage = NULL; + evset->counterAvailabilityImageSize = 0; + } + } + free(dev->nvEventSets); + dev->nvEventSets = NULL; + dev->numNvEventSets = 0; + dev->activeEventSet = -1; + } if (dev->allevents) { int i = 0; @@ -1181,6 +1233,68 @@ static int nvmon_perfworks_getMetricRequests3(NVPA_MetricsContext* context, } +static int nvmon_perfworks_getMetricRequests(NVPA_MetricsContext* context, struct bstrList* events, NVPA_RawMetricRequest** requests) +{ + int i = 0; + int isolated = 1; + int keepInstances = 1; + struct bstrList* temp = bstrListCreate(); + const char ** raw_events = NULL; + 
int num_raw = 0; + for (i = 0; i < events->qty; i++) + { + //nvmon_perfworks_parse_metric(events->entry[i], &isolated, &keepInstances); + keepInstances = 1; /* Bug in Nvidia API */ + NVPW_MetricsContext_GetMetricProperties_Begin_Params getMetricPropertiesBeginParams = { NVPW_MetricsContext_GetMetricProperties_Begin_Params_STRUCT_SIZE }; + NVPW_MetricsContext_GetMetricProperties_End_Params getMetricPropertiesEndParams = { NVPW_MetricsContext_GetMetricProperties_End_Params_STRUCT_SIZE }; + getMetricPropertiesBeginParams.pMetricsContext = context; + getMetricPropertiesBeginParams.pMetricName = bdata(events->entry[i]); + getMetricPropertiesEndParams.pMetricsContext = context; + GPUDEBUG_PRINT(DEBUGLEV_DEVELOP, Metric %s, bdata(events->entry[i])); + LIKWID_NVPW_API_CALL((*NVPW_MetricsContext_GetMetricProperties_BeginPtr)(&getMetricPropertiesBeginParams), bstrListDestroy(temp); return -EFAULT); + + int count = 0; + for (const char** dep = getMetricPropertiesBeginParams.ppRawMetricDependencies; *dep ; ++dep) + { + GPUDEBUG_PRINT(DEBUGLEV_DEVELOP, Metric depend %s, *dep); + bstrListAddChar(temp, (char*)*dep); + } + + LIKWID_NVPW_API_CALL((*NVPW_MetricsContext_GetMetricProperties_EndPtr)(&getMetricPropertiesEndParams), bstrListDestroy(temp); return -EFAULT); + + } + int num_reqs = 0; + NVPA_RawMetricRequest* reqs = malloc((temp->qty+1) * NVPA_RAW_METRIC_REQUEST_STRUCT_SIZE); + if (!reqs) + { + bstrListDestroy(temp); + return -ENOMEM; + } + for (i = 0; i < temp->qty; i++) + { + NVPA_RawMetricRequest* req = &reqs[num_reqs]; + char* s = malloc((blength(temp->entry[i])+2) * sizeof(char)); + if (s) + { + int ret = snprintf(s, blength(temp->entry[i])+1, "%s", bdata(temp->entry[i])); + if (ret > 0) + { + s[ret] = '\0'; + } + req->structSize = NVPA_RAW_METRIC_REQUEST_STRUCT_SIZE; + req->pMetricName = s; + GPUDEBUG_PRINT(DEBUGLEV_DEVELOP, Metric Request %s, s); + req->isolated = isolated; + req->keepInstances = keepInstances; + num_reqs++; + } + + } + bstrListDestroy(temp); + 
*requests = reqs; + return num_reqs; +} + static int nvmon_perfworks_createConfigImage(char* chip, struct bstrList* events, uint8_t **configImage, uint8_t* availImage) { int i = 0; @@ -1432,6 +1546,12 @@ nvmon_perfworks_addEventSet(NvmonDevice_t device, const char* eventString) tmp = bsplit(eventBString, ','); bdestroy(eventBString); + NvmonEvent_t* nvEvents = malloc(tmp->qty * sizeof(NvmonEvent_t)); + if (!nvEvents) + { + bstrListDestroy(tmp); + return -ENOMEM; + } eventtokens = bstrListCreate(); for (i = 0; i < tmp->qty; i++) @@ -1444,6 +1564,7 @@ nvmon_perfworks_addEventSet(NvmonDevice_t device, const char* eventString) if (bstrcmp(parts->entry[0], bname) == BSTR_OK) { bstrListAddChar(eventtokens, device->allevents[j]->real); + nvEvents[i] = device->allevents[j]; GPUDEBUG_PRINT(DEBUGLEV_DEVELOP, Adding real event %s, device->allevents[j]->real); } } @@ -1544,6 +1665,7 @@ nvmon_perfworks_addEventSet(NvmonDevice_t device, const char* eventString) return -ENOMEM; } memset(newEventSet->results, 0, eventtokens->qty * sizeof(NvmonEventResult)); + newEventSet->nvEvents = nvEvents; device->numNvEventSets++; GPUDEBUG_PRINT(DEBUGLEV_DEVELOP, Adding eventset %d, gid); } @@ -1902,7 +2024,6 @@ int nvmon_perfworks_stopCounters(NvmonDevice_t device) nvmon_perfworks_getMetricValue(device->chip, eventSet->counterDataImage, eventSet->events, &values); int i = 0, j = 0; - //for (j = 0; j < eventSet->events->qty; j++) for (j = 0; j < eventSet->numberOfEvents; j++) { double res = values[j]; diff --git a/src/includes/perfmon_icelakeX_events.txt b/src/includes/perfmon_icelakeX_events.txt index 60c6031bd..890cea13a 100644 --- a/src/includes/perfmon_icelakeX_events.txt +++ b/src/includes/perfmon_icelakeX_events.txt @@ -368,6 +368,7 @@ EVENT_INST_RETIRED 0xC0 PMC UMASK_INST_RETIRED_ANY 0x00 UMASK_INST_RETIRED_ANY_P 0x00 UMASK_INST_RETIRED_STALL_CYCLES 0x01 +UMASK_INST_RETIRED_NOP 0x02 EVENT_ASSISTS 0xC1 PMC UMASK_ASSISTS_FP 0x02 diff --git a/src/includes/perfmon_icelake_events.txt 
b/src/includes/perfmon_icelake_events.txt index 17fad4da2..35761acfa 100644 --- a/src/includes/perfmon_icelake_events.txt +++ b/src/includes/perfmon_icelake_events.txt @@ -357,6 +357,7 @@ EVENT_INST_RETIRED 0xC0 PMC UMASK_INST_RETIRED_ANY 0x00 UMASK_INST_RETIRED_ANY_P 0x00 UMASK_INST_RETIRED_STALL_CYCLES 0x01 +UMASK_INST_RETIRED_NOP 0x02 EVENT_ASSISTS 0xC1 PMC UMASK_ASSISTS_FP 0x02 diff --git a/src/includes/perfmon_perfevent.h b/src/includes/perfmon_perfevent.h index 93d5ec9b7..93802030a 100644 --- a/src/includes/perfmon_perfevent.h +++ b/src/includes/perfmon_perfevent.h @@ -250,25 +250,47 @@ int perf_fixed_setup(struct perf_event_attr *attr, RegisterIndex index, PerfmonE { attr->exclude_kernel = 1; attr->exclude_hv = 1; - if (strcmp(event->name, "INSTR_RETIRED_ANY") == 0) + if (strncmp(event->name, "INSTR_RETIRED_ANY", 18) == 0) { attr->config = PERF_COUNT_HW_INSTRUCTIONS; ret = 0; } - if (strcmp(event->name, "CPU_CLK_UNHALTED_CORE") == 0 || - strcmp(event->name, "ACTUAL_CPU_CLOCK") == 0 || - strcmp(event->name, "APERF") == 0) + if (strncmp(event->name, "CPU_CLK_UNHALTED_CORE", 22) == 0 || + strncmp(event->name, "ACTUAL_CPU_CLOCK", 17) == 0 || + strncmp(event->name, "APERF", 5) == 0) { attr->config = PERF_COUNT_HW_CPU_CYCLES; ret = 0; } #if LINUX_VERSION_CODE >= KERNEL_VERSION(3,3,0) - if (strcmp(event->name, "CPU_CLK_UNHALTED_REF") == 0) + if (strncmp(event->name, "CPU_CLK_UNHALTED_REF", 21) == 0) { attr->config = PERF_COUNT_HW_REF_CPU_CYCLES; ret = 0; } #endif + if (cpuid_info.isIntel) + { + switch(cpuid_info.model) + { + case ICELAKE1: + case ICELAKE2: + case ICELAKEX1: + case ICELAKEX2: + case ROCKETLAKE: + case COMETLAKE1: + case COMETLAKE2: + case TIGERLAKE1: + case TIGERLAKE2: + case SNOWRIDGEX: + if (strncmp(event->name, "TOPDOWN_SLOTS", 13) == 0) + { + attr->config = 0x0400; + attr->type = PERF_TYPE_RAW; + ret = 0; + } + } + } } else { diff --git a/src/libperfctr.c b/src/libperfctr.c index 287fd65a0..4a1b3b0cb 100644 --- a/src/libperfctr.c +++ 
b/src/libperfctr.c @@ -113,6 +113,10 @@ calculateMarkerResult(RegisterIndex index, uint64_t start, uint64_t stop, int ov { double result = 0.0; uint64_t maxValue = 0ULL; + if (start > stop) + { + overflows++; + } if (overflows == 0) { result = (double) (stop - start); @@ -627,12 +631,12 @@ likwid_markerStopRegion(const char* regionTag) { if (groupSet->groups[groupSet->activeGroup].events[i].type != NOTYPE) { - DEBUG_PRINT(DEBUGLEV_DEVELOP, STOP [%s] READ EVENT [%d=%d] EVENT %d VALUE %llu, regionTag, thread_id, cpu_id, i, - LLU_CAST groupSet->groups[groupSet->activeGroup].events[i].threadCounter[thread_id].counterData); result = calculateMarkerResult(groupSet->groups[groupSet->activeGroup].events[i].index, results->StartPMcounters[i], groupSet->groups[groupSet->activeGroup].events[i].threadCounter[thread_id].counterData, groupSet->groups[groupSet->activeGroup].events[i].threadCounter[thread_id].overflows - results->StartOverflows[i]); + DEBUG_PRINT(DEBUGLEV_DEVELOP, STOP [%s] READ EVENT [%d=%d] EVENT %d VALUE %llu DIFF %f, regionTag, thread_id, cpu_id, i, + LLU_CAST groupSet->groups[groupSet->activeGroup].events[i].threadCounter[thread_id].counterData, result); if ((counter_map[groupSet->groups[groupSet->activeGroup].events[i].index].type != THERMAL) && (counter_map[groupSet->groups[groupSet->activeGroup].events[i].index].type != VOLTAGE) && (counter_map[groupSet->groups[groupSet->activeGroup].events[i].index].type != MBOX0TMP)) diff --git a/src/luawid.c b/src/luawid.c index b4c3e8e51..7eca2fd84 100644 --- a/src/luawid.c +++ b/src/luawid.c @@ -1973,20 +1973,41 @@ lua_likwid_send_signal(lua_State* L) int parse(char *line, char **argv, int maxlen) { - int len = 0; - while (*line != '\0' && len < maxlen) { /* if not the end of line ....... 
*/ - if (*line == ' ' || *line == '\t' || *line == '\n') - *line++ = '\0'; /* replace white spaces with 0 */ - *argv++ = line; /* save the argument position */ - len++; - while (*line != '\0' && *line != ' ' && - *line != '\t' && *line != '\n') - line++; /* skip the argument until ... */ - } - *argv = (char *)'\0'; /* mark the end of argument list */ - if (len < maxlen || *line == '\0') - return len; - return -1; + int pos = 0; + int len = 0; + int in_string = 0; + while (*line != '\0' && len < maxlen) + { + if (*line == '"' || *line == '\'') + { + in_string = (!in_string); + line++; + pos++; + continue; + } + if (!in_string) + { + if ((*line == ' ' || *line == '\t' || *line == '\n')) + { + *line++ = '\0'; /* replace white spaces with 0 */ + pos++; + } + *argv++ = line; /* save the argument position */ + len++; + } + else if ((*line == ' ' || *line == '\t' || *line == '\n')) + { + line++; + pos++; + } + while (*line != '\0' && *line != ' ' && *line != '\t' && *line != '\n' && *line != '"' && *line != '\'') + { + line++; + pos++; + } + } + *argv = (char *)'\0'; + return (len < maxlen || *line == '\0' ? 
len : -1); } /* ##### FUNCTION DEFINITIONS - LOCAL TO THIS SOURCE FILE ########### */ diff --git a/src/perfmon.c b/src/perfmon.c index b367071a1..4d71ae3ad 100644 --- a/src/perfmon.c +++ b/src/perfmon.c @@ -348,8 +348,6 @@ static int getEvent(bstring event_str, bstring counter_str, PerfmonEvent* event) { int ret = FALSE; - int (*ownstrncmp)(const char *, const char *, size_t); - ownstrncmp = &strncmp; for (int i=0; i< perfmon_numArchEvents; i++) { if (biseqcstr(event_str, eventHash[i].name)) @@ -863,7 +861,6 @@ perfmon_check_counter_map(int cpu_id) void perfmon_init_maps(void) { - uint32_t eax, ebx, ecx, edx; if (eventHash != NULL && counter_map != NULL && box_map != NULL && perfmon_numCounters > 0 && perfmon_numArchEvents > 0) return; switch ( cpuid_info.family ) @@ -1287,7 +1284,7 @@ perfmon_init_maps(void) #endif case ARMV7_FAMILY: - switch ( cpuid_info.model ) + switch ( cpuid_info.part ) { case ARMV7L: case ARM7L: @@ -1438,7 +1435,7 @@ perfmon_init_maps(void) memset(tmp + perfmon_numArchEvents, '\0', 10*sizeof(PerfmonEvent)); eventHash = tmp; eventHash[perfmon_numArchEvents].name = "GENERIC_EVENT"; - bstring bsep = bfromcstr("|"); + struct tagbstring bsep = bsStatic ("|"); struct bstrList* outlist = bstrListCreate(); for (int i = 0; i < perfmon_numArchEvents; i++) { @@ -1459,7 +1456,8 @@ perfmon_init_maps(void) { for (int k = 0; k < perfmon_numCounters; k++) { - if (strncmp(bdata(xlist->entry[j]), counter_map[k].key, blength(xlist->entry[j])) == 0) + bstring bkey = bfromcstr(counter_map[k].key); + if (bstrcmp(xlist->entry[j], bkey) == BSTR_OK) { #ifndef LIKWID_USE_PERFEVENT if (HPMcheck(counter_map[k].device, cpu_id)) @@ -1468,9 +1466,11 @@ perfmon_init_maps(void) #endif { bstrListAdd(outlist, xlist->entry[j]); + bdestroy(bkey); break; } } + bdestroy(bkey); } } } @@ -1478,7 +1478,7 @@ perfmon_init_maps(void) bstrListDestroy(xlist); } - bstring blim = bjoin(outlist, bsep); + bstring blim = bjoin(outlist, &bsep); eventHash[perfmon_numArchEvents].limit = 
malloc((blength(blim)+2)*sizeof(char)); int ret = snprintf(eventHash[perfmon_numArchEvents].limit, blength(blim)+1, "%s", bdata(blim)); @@ -2063,12 +2063,21 @@ perfmon_finalize(void) groupSet->groups[group].state = STATE_NONE; } if (groupSet->groups != NULL) + { free(groupSet->groups); + groupSet->groups = NULL; + } if (groupSet->threads != NULL) + { free(groupSet->threads); + groupSet->threads = NULL; + } groupSet->activeGroup = -1; if (groupSet) + { free(groupSet); + groupSet = NULL; + } if (currentConfig) { for (group=0; group < cpuid_topology.numHWThreads; group++) @@ -2102,7 +2111,6 @@ perfmon_finalize(void) added_generic_event = 0; } perfmon_initialized = 0; - groupSet = NULL; return; } @@ -3205,10 +3213,7 @@ perfmon_getMaxCounterValue(RegisterType type) { width = box_map[type].regWidth; } - for(int i=0;i 8) @@ -273,7 +272,6 @@ power_init(int cpuId) if (i % 8 == 0) { flag_idx++; - reg_idx = 0; } power_info.turbo.steps[i] = busSpeed * (double) field64(flag_vals[flag_idx],(i%8)*8, 8); if (power_info.turbo.steps[i] > 0) diff --git a/src/pthread-overload/pthread-overload.c b/src/pthread-overload/pthread-overload.c index f90939eba..d5d4a1120 100644 --- a/src/pthread-overload/pthread-overload.c +++ b/src/pthread-overload/pthread-overload.c @@ -42,6 +42,7 @@ #include #include #include +#include #ifdef COLOR #include @@ -81,12 +82,14 @@ static int *pin_ids = NULL; static int ncpus = 0; static uint64_t skipMask = 0x0; static int silent = 0; +static pthread_mutex_t mutex = PTHREAD_MUTEX_INITIALIZER; + void __attribute__((constructor (103))) init_pthread_overload(void) { - char *str = NULL; + char *str = NULL, *pinstr = NULL; char *token = NULL, *saveptr = NULL; char *delimiter = ","; - int i = 0; + int i = 0, ret = 0; static long avail_cpus = 0; avail_cpus = sysconf(_SC_NPROCESSORS_CONF); pin_ids = malloc(avail_cpus * sizeof(int)); @@ -94,17 +97,33 @@ void __attribute__((constructor (103))) init_pthread_overload(void) str = getenv("LIKWID_PIN"); if (str != NULL) { - 
token = str; + pinstr = malloc((strlen(str)+2) * sizeof(char)); + if (!pinstr) + { + free(pin_ids); + pin_ids = NULL; + return; + } + ret = snprintf(pinstr, (strlen(str)+1), "%s", str); + if (ret <= 0) + { + free(pin_ids); + pin_ids = NULL; + return; + } + pinstr[ret] = '\0'; + saveptr = pinstr; + token = pinstr; while (token) { - token = strtok_r(str,delimiter,&saveptr); - str = NULL; + token = strtok_r(saveptr, delimiter ,&saveptr); if (token) { ncpus++; pin_ids[i++] = strtoul(token, &token, 10); } } + free(pinstr); } str = getenv("LIKWID_SKIP"); if (str != NULL) @@ -136,6 +155,7 @@ pthread_create(pthread_t* thread, static long online_cpus = 0; static int shepard = 0; online_cpus = sysconf(_SC_NPROCESSORS_ONLN); + pthread_mutex_lock(&mutex); /* On first entry: Get Evironment Variable and initialize pin_ids */ if (ncalled == 0 && pin_ids != NULL) @@ -238,6 +258,7 @@ pthread_create(pthread_t* thread, if (!handle) { color_print("%s\n", dlerror()); + pthread_mutex_unlock(&mutex); return -1; } @@ -247,6 +268,7 @@ pthread_create(pthread_t* thread, if ((error = dlerror()) != NULL) { color_print("%s\n", error); + pthread_mutex_unlock(&mutex); return -2; } @@ -309,6 +331,7 @@ pthread_create(pthread_t* thread, fflush(stdout); ncalled++; dlclose(handle); + pthread_mutex_unlock(&mutex); return ret; } diff --git a/src/topology.c b/src/topology.c index 6e4c88197..37beb4169 100644 --- a/src/topology.c +++ b/src/topology.c @@ -1134,7 +1134,7 @@ topology_setName(void) break; case ARMV7_FAMILY: - switch (cpuid_info.model) + switch (cpuid_info.part) { case ARM7L: case ARMV7L: diff --git a/src/topology_hwloc.c b/src/topology_hwloc.c index c6e71ffd1..981606dff 100644 --- a/src/topology_hwloc.c +++ b/src/topology_hwloc.c @@ -63,54 +63,54 @@ int parse_cpuinfo(uint32_t* count, uint32_t* family, uint32_t* variant, uint32_t uint32_t c = 0; int (*ownatoi)(const char*); ownatoi = &atoi; + struct tagbstring familyString = bsStatic("CPU architecture"); + struct tagbstring variantString = 
bsStatic("CPU variant"); + struct tagbstring steppingString = bsStatic("CPU revision"); + struct tagbstring partString = bsStatic("CPU part"); + struct tagbstring vendString = bsStatic("CPU implementer"); + struct tagbstring procString = bsStatic("processor"); if (NULL != (fp = fopen ("/proc/cpuinfo", "r"))) { - const_bstring familyString = bformat("CPU architecture"); - const_bstring variantString = bformat("CPU variant"); - const_bstring steppingString = bformat("CPU revision"); - const_bstring partString = bformat("CPU part"); - const_bstring vendString = bformat("CPU implementer"); - const_bstring procString = bformat("processor"); bstring src = bread ((bNread) fread, fp); struct bstrList* tokens = bsplit(src,(char) '\n'); bdestroy(src); fclose(fp); for (i=0;iqty;i++) { - if ((f == 0) && (binstr(tokens->entry[i],0,procString) != BSTR_ERR)) + if ((binstr(tokens->entry[i],0,&procString) != BSTR_ERR)) { c++; } - else if ((f == 0) && (binstr(tokens->entry[i],0,familyString) != BSTR_ERR)) + else if ((f == 0) && (binstr(tokens->entry[i],0,&familyString) != BSTR_ERR)) { struct bstrList* subtokens = bsplit(tokens->entry[i],(char) ':'); bltrimws(subtokens->entry[1]); f = ownatoi(bdata(subtokens->entry[1])); bstrListDestroy(subtokens); } - else if ((s == 0) && (binstr(tokens->entry[i],0,steppingString) != BSTR_ERR)) + else if ((s == 0) && (binstr(tokens->entry[i],0,&steppingString) != BSTR_ERR)) { struct bstrList* subtokens = bsplit(tokens->entry[i],(char) ':'); bltrimws(subtokens->entry[1]); s = ownatoi(bdata(subtokens->entry[1])); bstrListDestroy(subtokens); } - else if ((v == 0) && (binstr(tokens->entry[i],0,variantString) != BSTR_ERR)) + else if ((v == 0) && (binstr(tokens->entry[i],0,&variantString) != BSTR_ERR)) { struct bstrList* subtokens = bsplit(tokens->entry[i],(char) ':'); bltrimws(subtokens->entry[1]); v = strtol(bdata(subtokens->entry[1]), NULL, 0); bstrListDestroy(subtokens); } - else if ((p == 0) && (binstr(tokens->entry[i],0,partString) != BSTR_ERR)) + 
else if ((p == 0) && (binstr(tokens->entry[i],0,&partString) != BSTR_ERR)) { struct bstrList* subtokens = bsplit(tokens->entry[i],(char) ':'); bltrimws(subtokens->entry[1]); p = strtol(bdata(subtokens->entry[1]), NULL, 0); bstrListDestroy(subtokens); } - else if ((p == 0) && (binstr(tokens->entry[i],0,vendString) != BSTR_ERR)) + else if ((vend == 0) && (binstr(tokens->entry[i],0,&vendString) != BSTR_ERR)) { struct bstrList* subtokens = bsplit(tokens->entry[i],(char) ':'); bltrimws(subtokens->entry[1]); @@ -119,9 +119,6 @@ int parse_cpuinfo(uint32_t* count, uint32_t* family, uint32_t* variant, uint32_t } } bstrListDestroy(tokens); - /*bdestroy(familyString); - bdestroy(variantString); - bdestroy(steppingString);*/ } else { @@ -284,25 +281,16 @@ hwloc_init_cpuInfo(cpu_set_t cpuSet) cpuid_info.stepping = atoi(info); snprintf(cpuid_info.architecture, 19, "x86_64"); #endif -#ifdef __ARM_ARCH_7A__ - if ((info = LIKWID_HWLOC_NAME(obj_get_info_by_name)(obj, "CPUArchitecture"))) - cpuid_info.family = atoi(info); - if ((info = LIKWID_HWLOC_NAME(obj_get_info_by_name)(obj, "CPURevision"))) - cpuid_info.model = atoi(info); - if (cpuid_info.family == 0 || cpuid_info.model == 0) - { - uint32_t part = 0; - parse_cpuinfo(&count, &cpuid_info.family, &cpuid_info.model, &cpuid_info.stepping, &cpuid_info.part, &cpuid_info.vendor); - parse_cpuname(cpuid_info.osname); - } - snprintf(cpuid_info.architecture, 19, "armv7"); -#endif #if defined(__ARM_ARCH_7A__) || defined(__ARM_ARCH_8A) - uint32_t part = 0; parse_cpuinfo(&count, &cpuid_info.family, &cpuid_info.model, &cpuid_info.stepping, &cpuid_info.part, &cpuid_info.vendor); parse_cpuname(cpuid_info.osname); +#ifdef __ARM_ARCH_7A__ + snprintf(cpuid_info.architecture, 19, "armv7"); +#endif +#ifdef __ARM_ARCH_8A snprintf(cpuid_info.architecture, 19, "armv8"); #endif +#endif #ifndef _ARCH_PPC if ((info = LIKWID_HWLOC_NAME(obj_get_info_by_name)(obj, "CPUModel"))) @@ -344,8 +332,26 @@ hwloc_init_cpuInfo(cpu_set_t cpuSet) 
cpuid_topology.numHWThreads = LIKWID_HWLOC_NAME(get_nbobjs_by_type)(hwloc_topology, HWLOC_OBJ_PU); #if defined(__ARM_ARCH_7A__) || defined(__ARM_ARCH_8A) + if (cpuid_info.vendor == FUJITSU_ARM && cpuid_info.part == FUJITSU_A64FX) + { + int max_id = 0; + for (int i = 0; i < LIKWID_HWLOC_NAME(get_nbobjs_by_type)(hwloc_topology, HWLOC_OBJ_PU); i++) + { + obj = LIKWID_HWLOC_NAME(get_obj_by_type)(hwloc_topology, HWLOC_OBJ_PU, i); + if (obj->os_index > max_id) + { + max_id = obj->os_index; + } + } + if (max_id + 1 > count) + { + count = max_id + 1; + } + } if (count > cpuid_topology.numHWThreads) + { cpuid_topology.numHWThreads = count; + } #endif count = likwid_sysfs_list_len("/sys/devices/system/cpu/online"); if (count > cpuid_topology.numHWThreads) diff --git a/src/topology_proc.c b/src/topology_proc.c index c4c0a9df4..0bad5d797 100644 --- a/src/topology_proc.c +++ b/src/topology_proc.c @@ -351,8 +351,8 @@ proc_init_cpuFeatures(void) #ifdef _ARCH_PPC return; #endif - const_bstring flagString = bformat("flags"); - const_bstring featString = bformat("Features"); + struct tagbstring flagString = bsStatic ("flags"); + struct tagbstring featString = bsStatic ("Features"); bstring flagline = bfromcstr(""); bstring cpuinfo = read_file("/proc/cpuinfo"); @@ -361,11 +361,11 @@ proc_init_cpuFeatures(void) for (int i = 0; i < cpulines->qty; i++) { #if defined(__x86_64__) || defined(__i386__) - if (bstrncmp(cpulines->entry[i], flagString, 5) == BSTR_OK || - bstrncmp(cpulines->entry[i], featString, 8) == BSTR_OK) + if (bstrncmp(cpulines->entry[i], &flagString, 5) == BSTR_OK || + bstrncmp(cpulines->entry[i], &featString, 8) == BSTR_OK) #endif #if defined(__ARM_ARCH_7A__) || defined(__ARM_ARCH_8A__) || defined(__ARM_ARCH_8A) - if (bstrncmp(cpulines->entry[i], featString, 8) == BSTR_OK) + if (bstrncmp(cpulines->entry[i], &featString, 8) == BSTR_OK) #endif #ifdef _ARCH_PPC if (ret != 1) @@ -379,6 +379,7 @@ proc_init_cpuFeatures(void) bstrListDestroy(cpulines); struct bstrList* flaglist 
= bsplit(flagline, ' '); + bdestroy(flagline); bstring bfeatures = bfromcstr(""); cpuid_info.featureFlags = 0; @@ -721,42 +722,62 @@ proc_init_nodeTopology(cpu_set_t cpuSet) int hidx = 0; for (int i = 0; i < cpuid_topology.numHWThreads; i++) { - int pid = hwThreadPool[i].packageId; - int found = 0; - for (int j = 0; j < hidx; j++) - { - if (pid == helper[j]) - { - found = 1; - break; - } - } - if (!found) - { - helper[hidx++] = pid; + int pid = hwThreadPool[i].packageId; + int found = 0; + for (int j = 0; j < hidx; j++) + { + if (pid == helper[j]) + { + found = 1; + break; } + } + if (!found) + { + helper[hidx++] = pid; + } } cpuid_topology.numSockets = hidx; + /* Traverse all sockets to get maximal thread count per socket. + * This should fix the code for architectures with "empty" sockets. + */ + int num_threads_per_socket = 0; + for (int i = 0; i < cpuid_topology.numSockets; i++) + { + int threadCount = 0; + for (int j = 0; j < cpuid_topology.numHWThreads; j++) + { + if (helper[i] == hwThreadPool[j].packageId) + { + threadCount++; + } + } + if (threadCount > num_threads_per_socket) + { + num_threads_per_socket = threadCount; + } + } + int first_socket_id = helper[0]; hidx = 0; for (int i = 0; i < cpuid_topology.numHWThreads; i++) { - int did = hwThreadPool[i].dieId; - int pid = hwThreadPool[i].packageId; - if (pid != first_socket_id) continue; - int found = 0; - for (int j = 0; j < hidx; j++) - { - if (did == helper[j]) - { - found = 1; - break; - } - } - if (!found) - { - helper[hidx++] = did; + int did = hwThreadPool[i].dieId; + int pid = hwThreadPool[i].packageId; + if (pid != first_socket_id) continue; + int found = 0; + for (int j = 0; j < hidx; j++) + { + if (did == helper[j]) + { + found = 1; + break; } + } + if (!found) + { + helper[hidx++] = did; + } } cpuid_topology.numDies = hidx * cpuid_topology.numSockets; @@ -764,19 +785,15 @@ proc_init_nodeTopology(cpu_set_t cpuSet) { cpuid_topology.numDies = 0; } - num_threads_per_core = 0; - int test_core_id = 
hwThreadPool[0].coreId; - int test_socket_id = hwThreadPool[0].packageId; - int num_threads_per_socket = 0; + int max_thread_sibling_id = 0; for (int i = 0; i < cpuid_topology.numHWThreads; i++) { - if (hwThreadPool[i].packageId == test_socket_id) - { - num_threads_per_socket++; - if (hwThreadPool[i].coreId == test_core_id) - num_threads_per_core++; - } + if (hwThreadPool[i].threadId > max_thread_sibling_id) + { + max_thread_sibling_id = hwThreadPool[i].threadId; + } } + num_threads_per_core = max_thread_sibling_id + 1; cpuid_topology.numCoresPerSocket = num_threads_per_socket/num_threads_per_core; cpuid_topology.numThreadsPerCore = num_threads_per_core; free(helper); diff --git a/test/gitlab-ci/generate_arch_jobs.sh b/test/gitlab-ci/generate_arch_jobs.sh index ce933ef73..b7ebcda58 100755 --- a/test/gitlab-ci/generate_arch_jobs.sh +++ b/test/gitlab-ci/generate_arch_jobs.sh @@ -1,11 +1,15 @@ #!/bin/bash -l -for L in $(sinfo -t idle -h --partition=work -o "%n"); do +for L in $(sinfo -t idle -h --partition=work -o "%n %t" | grep "idle" | cut -d ' ' -f 1); do arch="x86" depend="build-x86-perf" if [ "$L" = "aurora1" ]; then continue fi + if [ "$L" = "applem1studio" ]; then + arch="arm8" + depend="build-arm8-perf" + fi if [ "$L" = "warmup" ]; then arch="arm8" depend="build-arm8-perf" diff --git a/test/gitlab-ci/notify_github.sh b/test/gitlab-ci/notify_github.sh new file mode 100755 index 000000000..fc69b3256 --- /dev/null +++ b/test/gitlab-ci/notify_github.sh @@ -0,0 +1,53 @@ +#!/bin/bash + +GITHUB_ORG="RRZE-HPC" +GITHUB_REPO="likwid" +GITHUB_SHA="${CI_COMMIT_SHA}" + +cat << EOF > headers.curl +Accept: application/vnd.github+json +Authorization: token ${GITHUB_API_TOKEN} +EOF +#cat << EOF > success.json +#{ +# "state" : "success", +# "target_url" : "${CI_PIPELINE_URL}", +# "description" : "CI runs at NHR@FAU systems successful", +# "context" : "continuous-integration/gitlab" +#} +#EOF +cat << EOF > success.json +{ + "state" : "success", + "target_url" : 
"${CI_PIPELINE_URL}", + "context" : "ci/${CI_SERVER_HOST}", + "description" : "CI runs at NHR@FAU systems successful" +} +EOF +cat << EOF > failure.json +{ + "state" : "failure", + "target_url" : "${CI_PIPELINE_URL}", + "context" : "ci/${CI_SERVER_HOST}", + "description" : "CI runs at NHR@FAU systems failed" +} +EOF +cat << EOF > pending.json +{ + "state" : "pending", + "target_url" : "${CI_PIPELINE_URL}", + "context" : "ci/${CI_SERVER_HOST}", + "description" : "CI runs at NHR@FAU systems pending" +} +EOF +GITHUB_API_URL="https://api.github.com/repos/${GITHUB_ORG}/${GITHUB_REPO}/statuses/${GITHUB_SHA}" +if [ "$1" == "success" ]; then + cat success.json + curl -s -X POST -H @headers.curl "${GITHUB_API_URL}" -d @success.json +elif [ "$1" == "failure" ]; then + cat failure.json + curl -s -X POST -H @headers.curl "${GITHUB_API_URL}" -d @failure.json +elif [ "$1" == "pending" ]; then + cat pending.json + curl -s -X POST -H @headers.curl "${GITHUB_API_URL}" -d @pending.json +fi