-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathCargo.toml
279 lines (247 loc) · 11.1 KB
/
Cargo.toml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
# For optimal build performance, the implementation is split into multiple
# crates:
#
# - A "common" crate featuring the logic shared by all floating-point operations
# - A crate per floating-point operation that we may want to benchmark
# - A "subwoofer" root crate that receives configuration (via cargo features)
#   and orchestrates benchmark runs accordingly
#   * ...which defines one benchmark per supported floating-point type
#
# These crates are grouped in a workspace where common properties are defined...
[workspace]
# The shared "common" crate plus one crate per benchmarked operation. The root
# package declared in this same manifest is implicitly a workspace member too.
members = [
    "addsub",
    "common",
    "div_denominator_min",
    "div_numerator_max",
    "fma_addend_max",
    "fma_full_max_mul",
    "fma_multiplier_bidi",
    "max",
    "mul_max",
    "sqrt_positive_max",
]
[workspace.package]
# Package metadata defined once for the whole workspace. A package inherits a
# field by setting `<field>.workspace = true`, as the root crate does in its
# [package] table.
authors = ["Hadrien G. <[email protected]>"]
categories = [
    "command-line-utilities",
    "hardware-support",
    "science",
]
edition = "2021"
keywords = [
    "benchmark",
    "cpu",
    "denormals",
    "floating-point",
    "subnormals",
]
license = "MPL-2.0"
repository = "https://github.com/HadrienG2/subwoofer.git"
version = "1.0.0"
[workspace.dependencies]
# Dependency requirements shared across the workspace, sorted alphabetically.
# A package opts in with `<dependency>.workspace = true`.
common = { path = "./common" }
# Default features are disabled here; the root crate's "cargo_bench_support"
# feature re-enables the criterion features that it needs.
criterion = { version = "0.5", default-features = false }
hwlocality = "1.0.0-alpha"
pessimize = { version = "2.0", features = ["nightly"] }
rand = "0.8.5"
target-features = "0.1"
[profile.bench]
# Sadly needed for pessimize::hide() to be compiled efficiently. Without it, no
# amount of inlining and LTO can save rustc nightly 2024-12-06 from spilling
# accumulators to memory in benchmarks with memory data sources. It is not clear
# to me why that is the case; most likely, multiple codegen units get in the way
# of some important whole-program optimization.
codegen-units = 1
# Good for perf profiling: keeps only the line tables, i.e. enough debug info
# to map machine code back to source file/line locations.
debug = "line-tables-only"
[profile.bench.package."*"]
# Only workspace members need top optimizations, dependencies can use less
# aggressive optimization settings in order to improve build performance
codegen-units = 256
# NOTE(review): Cargo documents incremental compilation as being used only for
# workspace members and path dependencies, while "*" selects dependencies that
# are not workspace members. This setting may therefore have no effect —
# confirm.
incremental = true
opt-level = 2
[profile.release]
# May slightly improve build and runtime perf.
#
# NOTE(review): the Cargo reference states that benchmarks ignore the `panic`
# setting, so this presumably only affects non-benchmark release builds —
# confirm that this is the intent.
panic = "abort"
# ...and everything below this point is about the "subwoofer" root crate
[package]
name = "subwoofer"
version.workspace = true
authors.workspace = true
# Benchmark targets are declared explicitly in the [[bench]] tables of this
# manifest, so automatic bench target discovery is turned off.
autobenches = false
categories.workspace = true
edition.workspace = true
keywords.workspace = true
license.workspace = true
repository.workspace = true
description = "Assessing the impact of subnormals on your CPU's performance"
[features]
# If you are running this benchmark on hardware to which you only have temporary
# access, consider --all-features to check everything that the benchmark can
# possibly measure, at the expense of extremely long compilation and execution
# times (we're talking about multiple days of execution).
#
# More fine-grained options are described below.
#
# The default selection combines criterion report generation under "cargo bench"
# with the quick "check" profile.
default = ["cargo_bench_support", "check"]
### High-level profiles ###
# Run enough benchmarks to tell if subnormals are a problem on your hardware
#
# This configuration exercises all supported hardware operations for scalar
# floating-point data in the L1 cache, at a coarse subnormal freq granularity.
#
# This configuration runs relatively quickly (<1h) and should be enough to
# qualitatively tell whether subnormals affect floating-point performance on
# your hardware, and if so which operations are affected. It is not enough to
# precisely tell how much the hardware operations are affected, whether the
# impact is type-dependent, how it depends on subnormal freq, etc.
check = [
    # ADD/SUB (both are assumed to behave the same)
    "bench_addsub",
    # MAX is assumed to be same as MIN
    "bench_max",
    # Includes subnormal MUL -> subnormal MAX
    "bench_mul_max",
    # Includes subnormal SQRT -> maybe-subnormal MAX
    "bench_sqrt_positive_max",
    # Includes DIV with subnormal numerator -> subnormal MAX
    "bench_div_numerator_max",
    # Includes DIV with subnormal denominator -> infinite MIN
    "bench_div_denominator_min",
    # Includes subnormal FMA -> subnormal MAX -> normal MUL
    "bench_fma_full_max_mul",
]
# Run enough benchmarks to check how good/bad the generated code is
#
# This configuration makes sure that all qualitatively different code paths are
# generated and exercised by cargo bench, so that they appear in the output of
# profilers like perf or can be analyzed by static analysis tools like Cutter or
# MAQAO. It does not increase the number of execution configurations, so it is
# not suitable for quantitative analysis.
measure_codegen = [
    "bench_all",
    "register_data_sources",
    "simd",
]
# Run enough benchmarks to quantitatively assess the perf impact of subnormals
#
# This configuration contains everything needed to quantitatively assess the
# worst-case impact of subnormals on typical hardware. It extends
# "measure_codegen" with a finer subnormal frequency resolution.
measure = [
    "measure_codegen",
    "subnormal_freq_resolution_1in64",
]
### Fine-grained options ###
# Enable support for cargo bench
#
# Without this, "cargo bench" runs do not generate criterion reports, and one
# needs to use "cargo criterion" instead. Leaving it disabled speeds up the
# criterion build, because benchmarks then do not need to bundle a full report
# generator.
cargo_bench_support = [
    "criterion/cargo_bench_support",
    "criterion/plotters",
    "criterion/rayon",
]
# Full set of supported microbenchmarks
bench_all = [
    # ADD/SUB with x%subnormal input, normal result
    "bench_addsub",
    # MAX with x%subnormal input, normal result, assumed to behave just like MIN
    "bench_max",
    # MUL with x%subnormal input & result -> MAX with x%subnormal input
    "bench_mul_max",
    # SQRT with x%subnormal input, x²%subnormal result
    # -> MAX with x²%subnormal input
    "bench_sqrt_positive_max",
    # DIV with x%subnormal numerator, normal denominator -> x%subnormal MAX
    "bench_div_numerator_max",
    # DIV with x%subnormal denominator, normal numerator -> x%infinite MIN
    "bench_div_denominator_min",
    # FMA with x%subnormal addend, normal multiplier & result -> normal MIN
    # NOTE(review): the feature name says MAX while this description says MIN —
    # confirm which one the benchmark actually uses.
    "bench_fma_addend_max",
    # FMA with x%subnormal multiplier, normal addend & result
    "bench_fma_multiplier_bidi",
    # FMA with x%subnormal multiplier and addend, x²%subnormal result
    # -> MAX with x²%subnormal input -> normal MUL
    "bench_fma_full_max_mul",
]
# Fine-grained control over benchmark selection. Lets you speed up compilation
# when debugging a specific benchmark. Use with --no-default-features.
#
# Each feature enables the optional dependency of the same name minus the
# "bench_" prefix, and nothing else.
bench_addsub = ["dep:addsub"]
bench_div_denominator_min = ["dep:div_denominator_min"]
bench_div_numerator_max = ["dep:div_numerator_max"]
bench_fma_addend_max = ["dep:fma_addend_max"]
bench_fma_full_max_mul = ["dep:fma_full_max_mul"]
bench_fma_multiplier_bidi = ["dep:fma_multiplier_bidi"]
bench_max = ["dep:max"]
bench_mul_max = ["dep:mul_max"]
bench_sqrt_positive_max = ["dep:sqrt_positive_max"]
# By default, we only run benchmarks from data in the L1 cache, because that's
# where we expect the maximal subnormal impact outside of perhaps the
# in-registers configuration, and it's less artificial and less likely to be
# messed up by CPU µarch details than the in-registers configuration.
#
# Use this feature to test at all levels of the memory hierarchy instead, at the
# expense of a large increase in compilation and execution time...
more_data_sources = ["register_data_sources", "more_memory_data_sources"]
# ...or use these finer-grained features if you want to be more specific.
#
# Both are pure flags: they enable no dependency and no other feature, so their
# effect is decided by the code that checks them (not visible in this manifest).
# Presumably the first one adds the in-registers configuration and the second
# one adds the memory levels beyond the L1 cache — confirm against that code.
register_data_sources = []
more_memory_data_sources = []
# By default, we only run benchmarks with minimal instruction-level parallelism
# (a configuration which is latency-bound for all operations other than SQRT),
# with ~maximal ILP (hopefully throughput-bound), and with half the maximal ILP
# (as a sanity check, see below).
#
# This feature lets you cover all power-of-two degrees of ILP instead. Use it
# when you observe that the intermediate ILP configuration runs faster than the
# maximal ILP configuration, which suggests that the maximal ILP configuration
# is running into a codegen or microarchitectural bottleneck, e.g. it thrashes
# the CPU's instruction cache because there is too much code. In this situation,
# it is better to try all ILP configurations to make sure that you do cover the
# ILP configuration of highest runtime performance.
#
# The price to pay is a moderate increase in execution time.
more_ilp_configurations = []
# By default, we only check subnormal behavior in scalars.
#
# Enable this feature to also check vectors of all-normals or all-subnormals.
# This lets you check if the subnormal fallback logic is vectorized or scalar.
#
# The price to pay is a large increase in compilation and execution time.
#
# This forwards to the "simd" feature of the "common" crate, and it is also the
# required feature of every vector [[bench]] target declared in this manifest.
simd = ["common/simd"]
# By default, we check with [0, 25, 50, 75, 100]% of subnormal inputs.
#
# If you notice that the subnormals-induced slowdown does not follow a simple
# monotonic pattern (e.g. linearly grows to a maximum at 100%, or linearly grows
# from 0% to 50% then linearly decays from 50% to 100%), then you should try
# enabling one of the following features to more precisely probe the overhead vs
# subnormal occurrence frequency curve.
#
# The suggested way to tune this for your hardware, if you have enough time, is:
#
# 1. Start with the maximal supported resolution configuration.
# 2. Check out the criterion report, find out how many data points you truly
#    need to faithfully probe the overhead vs subnormals frequency curve (e.g.
#    to sample the maximal overhead point with good precision).
# 3. If you need to re-run the benchmark on the same hardware later on, use only
#    this number of data points to reduce execution time.
#
# The price to pay for increasing subnormal frequency resolution is a
# multiplicative increase in execution time on benchmarks with memory inputs.
subnormal_freq_resolution_1in8 = []
subnormal_freq_resolution_1in16 = []
subnormal_freq_resolution_1in32 = []
subnormal_freq_resolution_1in64 = []
subnormal_freq_resolution_1in128 = []
# TODO: Add configurations and adjust "measure" configuration if we find
# hardware where <1% resolution is still not enough
[dependencies]
# Always-on dependencies, inherited from [workspace.dependencies]
common.workspace = true
criterion.workspace = true
hwlocality.workspace = true
pessimize.workspace = true
rand.workspace = true
# Per-operation benchmark crates, each one enabled by the matching "bench_*"
# feature
addsub = { path = "./addsub", optional = true }
div_denominator_min = { path = "./div_denominator_min", optional = true }
div_numerator_max = { path = "./div_numerator_max", optional = true }
fma_addend_max = { path = "./fma_addend_max", optional = true }
fma_full_max_mul = { path = "./fma_full_max_mul", optional = true }
fma_multiplier_bidi = { path = "./fma_multiplier_bidi", optional = true }
max = { path = "./max", optional = true }
mul_max = { path = "./mul_max", optional = true }
sqrt_positive_max = { path = "./sqrt_positive_max", optional = true }
[lib]
# Do not build/run the library target as a benchmark under "cargo bench".
# NOTE(review): presumably set so that benchmark command-line arguments are not
# handed to the default libtest harness of the library target — confirm.
bench = false
# Benchmark targets, one per floating-point type. All of them set
# `harness = false`, i.e. they supply their own entry point instead of using the
# built-in libtest harness. Scalar targets are always available, whereas
# vector targets are only built when the "simd" feature is enabled.
#
# Single precision (the "xNN" suffix is presumably the vector lane count).
[[bench]]
name = "f32"
harness = false

[[bench]]
name = "f32x04"
harness = false
required-features = ["simd"]

[[bench]]
name = "f32x08"
harness = false
required-features = ["simd"]

[[bench]]
name = "f32x16"
harness = false
required-features = ["simd"]

# Double precision
[[bench]]
name = "f64"
harness = false

[[bench]]
name = "f64x02"
harness = false
required-features = ["simd"]

[[bench]]
name = "f64x04"
harness = false
required-features = ["simd"]

[[bench]]
name = "f64x08"
harness = false
required-features = ["simd"]