Provide FEE CUDA implementation.

Rust, C and Python APIs are available to utilise CUDA. At least one of the two Cargo features "cuda" and "cuda-single" must be enabled to provide the CUDA functionality. If "cuda-single" is enabled, the GPU does calculations at single precision; otherwise, double precision is used. This choice of precision is offered because desktop NVIDIA GPUs have substantially less double-precision compute capability.
MWATelescope · Jan 10, 2022 · e78ff09 · e78ff09
1 parent 1f88511
commit e78ff09
Show file tree

Hide file tree

Showing 24 changed files with 3,282 additions and 460 deletions.
diff --git a/Cargo.toml b/Cargo.toml
@@ -21,14 +21,21 @@ crate-type = ["rlib", "staticlib", "cdylib"]
 default = []
 hdf5-static = ["hdf5-sys"]
 erfa-static = ["marlu/erfa-static"]
-all-static = ["hdf5-static", "erfa-static"]
+cuda-static = ["marlu/cuda-static"]
+all-static = ["hdf5-static", "erfa-static", "cuda-static"]
 python = ["pyo3", "numpy"]
 
+# Provide beam functionality with CUDA, double precision.
+cuda = ["marlu/cuda", "cc"]
+# Opt out of double precision; use only single precision.
+cuda-single = ["cuda"]
+
 [profile.release]
 lto = true
 codegen-units = 1 # Set this to 1 in Cargo.toml to allow for maximum size reduction optimizations
 
 [dependencies]
+cfg-if = "1.0.*"
 dashmap = "4.0.*"
 hdf5 = "0.7.*"
 marlu = { version = "0.3.*", default-features = false }
@@ -51,6 +58,8 @@ ndarray = { version = ">=0.15.4,<0.16", features = ["approx-0_5"] }
 [build-dependencies]
 cbindgen = "0.*"
 
+cc = { version = "1.0.*", features = ["parallel"], optional = true }
+
 [package.metadata.docs.rs]
 features = ["hdf5-static"]
 targets = ["x86_64-unknown-linux-gnu"]
@@ -63,3 +72,10 @@ project-url = { homepage = "https://github.com/MWATelescope/mwa_hyperbeam" }
 [[bench]]
 name = "bench"
 harness = false
+
+[[example]]
+name = "beam_calcs"
+
+[[example]]
+name = "beam_calcs_cuda"
+required-features = ["cuda"]
diff --git a/README.md b/README.md
@@ -46,6 +46,12 @@ other words, most languages. See Rust, C and Python examples of usage in the
  [-1.51506097e-01-4.35034884e-02j -9.76099405e-06-1.21699926e-05j
   1.73003520e-05-1.53580286e-05j -2.23184781e-01-4.51051073e-02j]
 
+### CUDA
+`hyperbeam` can also be run on NVIDIA GPUs. To see an example of usage, see
+any of the examples with "_cuda" in the name. CUDA functionality is only
+provided with one of two Cargo features; see the "installing from source"
+instructions below.
+
 ## Installation
 ### Python PyPI
 If you're using Python version >=3.6:

diff --git a/benches/bench.rs b/benches/bench.rs
@@ -6,7 +6,8 @@
 //! the project's root directory.
 
 use criterion::*;
-use marlu::rayon;
+use marlu::{ndarray, rayon};
+use ndarray::prelude::*;
 use rayon::prelude::*;
 
 use mwa_hyperbeam::fee::*;
@@ -115,6 +116,100 @@ fn fee(c: &mut Criterion) {
  })
  });
 
+ #[cfg(feature = "cuda")]
+ c.bench_function("cuda_calc_jones", |b| {
+ let freqs = [51200000];
+ let delays = Array2::zeros((1, 16));
+ let amps = Array2::ones((1, 32));
+ let norm_to_zenith = false;
+ let beam = FEEBeam::new("mwa_full_embedded_element_pattern.h5").unwrap();
+ let cuda_beam = unsafe {
+ beam.cuda_prepare(&freqs, delays.view(), amps.view(), norm_to_zenith)
+ .unwrap()
+ };
+
+ let mut az = vec![];
+ let mut za = vec![];
+ for d in 5..85 {
+ #[cfg(feature = "cuda-single")]
+ let rad = (d as f32).to_radians();
+ #[cfg(not(feature = "cuda-single"))]
+ let rad = (d as f64).to_radians();
+ az.push(rad);
+ za.push(rad);
+ }
+ let parallactic_correction = false;
+
+ b.iter(|| {
+ cuda_beam
+ .calc_jones(&az, &za, parallactic_correction)
+ .unwrap();
+ })
+ });
+
+ // Benchmarks with a fair few pointings!
+ let num_directions = 100000;
+ let mut az_double = vec![];
+ let mut za_double = vec![];
+ for i in 1..=num_directions {
+ az_double.push(0.9 * std::f64::consts::TAU / i as f64);
+ za_double.push(std::f64::consts::PI / i as f64);
+ }
+ let freqs = [51200000];
+ let delays = Array2::zeros((1, 16));
+ let amps = Array2::ones((1, 16));
+ let norm_to_zenith = true;
+
+ c.bench_function("calc_jones_array 100000 dirs", |b| {
+ let beam = FEEBeam::new("mwa_full_embedded_element_pattern.h5").unwrap();
+ // Prime the cache.
+ beam.calc_jones(
+ az_double[0],
+ za_double[0],
+ freqs[0],
+ delays.as_slice().unwrap(),
+ amps.as_slice().unwrap(),
+ norm_to_zenith,
+ )
+ .unwrap();
+ b.iter(|| {
+ beam.calc_jones_array(
+ &az_double,
+ &za_double,
+ freqs[0],
+ delays.as_slice().unwrap(),
+ amps.as_slice().unwrap(),
+ norm_to_zenith,
+ )
+ .unwrap();
+ })
+ });
+
+ #[cfg(feature = "cuda")]
+ c.bench_function("cuda_calc_jones 100000 dirs", |b| {
+ let beam = FEEBeam::new("mwa_full_embedded_element_pattern.h5").unwrap();
+ let cuda_beam = unsafe {
+ beam.cuda_prepare(&freqs, delays.view(), amps.view(), norm_to_zenith)
+ .unwrap()
+ };
+ let parallactic_correction = true;
+
+ #[cfg(feature = "cuda-single")]
+ let (az, za): (Vec<_>, Vec<_>) = az_double
+ .iter()
+ .zip(za_double.iter())
+ .map(|(&az, &za)| (az as f32, za as f32))
+ .unzip();
+ #[cfg(not(feature = "cuda-single"))]
+ let (az, za) = (az_double.clone(), za_double.clone());
+
+ b.iter(|| {
+ cuda_beam
+ .calc_jones(&az, &za, parallactic_correction)
+ .unwrap();
+ })
+ });
+
  // The following benchmarks require a few structs and methods to be public.
  // These benchmarks remain commented because those structs and/or methods
  // should not be made public in releases.

diff --git a/build.rs b/build.rs
@@ -4,18 +4,109 @@
 
 use std::env;
 
+#[cfg(feature = "cuda")] // Only compiled when the "cuda" feature is enabled.
+fn parse_and_validate_compute(c: &str, var: &str) -> u16 { // Turns `c`, the value read from the env variable named `var`, into a CUDA compute number; panics with `var` in the message if `c` is rejected.
+ // Only the length is checked here: `c` must be exactly two bytes long. Whether those bytes are digits is left to the parse below.
+ if c.len() != 2 {
+ panic!("{} is not a two-digit number!", var)
+ }
+
+ match c.parse::<u16>() { // NOTE(review): integer parsing accepts a leading '+', so a value such as "+7" passes both checks — confirm whether that is acceptable.
+ Ok(p) => p,
+ Err(_) => panic!("{} couldn't be parsed into a number!", var),
+ }
+}
+
 fn main() {
- let crate_dir = env::var("CARGO_MANIFEST_DIR").unwrap();
+ println!("cargo:rerun-if-changed=build.rs");
+
+ #[cfg(feature = "cuda")]
+ {
+ const DEFAULT_CUDA_ARCHES: &[u16] = &[60, 70, 80];
+ const DEFAULT_CUDA_SMS: &[u16] = &[60, 70, 75, 86];
+
+ // Attempt to read HYPERBEAM_CUDA_COMPUTE; if it is not set, fall back to
+ // HYPERDRIVE_CUDA_COMPUTE.
+ println!("cargo:rerun-if-env-changed=HYPERBEAM_CUDA_COMPUTE");
+ println!("cargo:rerun-if-env-changed=HYPERDRIVE_CUDA_COMPUTE");
+ let (arches, sms): (Vec<u16>, Vec<u16>) = match (
+ env::var("HYPERBEAM_CUDA_COMPUTE"),
+ env::var("HYPERDRIVE_CUDA_COMPUTE"),
+ ) {
+ // When a user-supplied variable exists, use it as the CUDA arch and
+ // compute level.
+ (Ok(c), _) => {
+ let compute = parse_and_validate_compute(&c, "HYPERBEAM_CUDA_COMPUTE");
+ (vec![compute], vec![compute])
+ }
+ (Err(_), Ok(c)) => {
+ let compute = parse_and_validate_compute(&c, "HYPERDRIVE_CUDA_COMPUTE");
+ (vec![compute], vec![compute])
+ }
+ (Err(_), Err(_)) => {
+ // Print out all of the default arches and computes as a
+ // warning.
+ let mut warn_str = String::new();
+ warn_str.push_str("cargo:warning=No HYPERBEAM_CUDA_COMPUTE; Passing ");
+ warn_str.push_str(&format!("arch=compute_{:?}", DEFAULT_CUDA_ARCHES));
+ warn_str.push_str(" and ");
+ warn_str.push_str(&format!("code=sm_{:?}", DEFAULT_CUDA_SMS));
+ warn_str.push_str(" to nvcc");
+ println!("{}", warn_str);
+ (DEFAULT_CUDA_ARCHES.to_vec(), DEFAULT_CUDA_SMS.to_vec())
+ }
+ };
+
+ // TODO: Search for any C/C++/CUDA files and have rerun-if-changed on
+ // all of them.
+ println!("cargo:rerun-if-changed=src/fee/cuda/fee.h");
+ println!("cargo:rerun-if-changed=src/fee/cuda/fee.cu");
+
+ let mut cuda_target = cc::Build::new();
+ cuda_target
+ .cuda(true)
+ .flag("-cudart=static")
+ .include("src/fee/cuda/")
+ .file("src/fee/cuda/fee.cu");
+ // Loop over each arch and sm
+ for arch in arches {
+ for &sm in &sms {
+ if sm < arch {
+ continue;
+ }
+
+ let mut flag = String::new();
+ cuda_target.flag("-gencode");
+ flag.push_str(&format!("arch=compute_{},", arch));
+ flag.push_str(&format!("code=sm_{}", sm));
+ cuda_target.flag(&flag);
+ }
+ }
+
+ // If we're told to, use single-precision floats. The default in the
+ // CUDA code is to use double-precision.
+ #[cfg(feature = "cuda-single")]
+ cuda_target.define("SINGLE", None);
+
+ cuda_target.compile("hyperbeam_cu");
+ }
 
  // Generate a C header for hyperbeam and write it to the include
  // directory. This routine only needs to be done if the ffi module has
  // changed.
+ let crate_dir = env::var("CARGO_MANIFEST_DIR").unwrap();
  println!("cargo:rerun-if-changed=src/fee/ffi/mod.rs");
  // Only do this if we're not on docs.rs (doesn't like writing files outside
  // of OUT_DIR).
  match env::var("DOCS_RS").as_deref() {
  Ok("1") => (),
  _ => {
+ // Rename an internal-only name depending on the CUDA precision.
+ #[cfg(feature = "cuda-single")]
+ let c_type = "float";
+ #[cfg(not(feature = "cuda-single"))]
+ let c_type = "double";
+
  cbindgen::Builder::new()
  .with_config(cbindgen::Config {
  cpp_compat: true,
@@ -24,6 +115,7 @@ fn main() {
  })
  .with_crate(crate_dir)
  .with_language(cbindgen::Language::C)
+ .rename_item("CudaFloat", c_type)
  .generate()
  .expect("Unable to generate bindings")
  .write_to_file("include/mwa_hyperbeam.h");