Skip to content

Commit

Permalink
Provide FEE CUDA implementation.
Browse files Browse the repository at this point in the history
Rust, C and Python APIs are available to utilise CUDA.

At least one of the two Cargo features "cuda" and "cuda-single" must be
enabled to provide the CUDA functionality. If "cuda-single" is enabled,
then the GPU does calculations at single precision, otherwise double
precision is used. This choice in precision is supplied because desktop
NVIDIA GPUs have substantially less double-precision compute capability.
  • Loading branch information
cjordan committed Jan 10, 2022
1 parent 1f88511 commit e78ff09
Show file tree
Hide file tree
Showing 24 changed files with 3,282 additions and 460 deletions.
18 changes: 17 additions & 1 deletion Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -21,14 +21,21 @@ crate-type = ["rlib", "staticlib", "cdylib"]
default = []
hdf5-static = ["hdf5-sys"]
erfa-static = ["marlu/erfa-static"]
all-static = ["hdf5-static", "erfa-static"]
cuda-static = ["marlu/cuda-static"]
all-static = ["hdf5-static", "erfa-static", "cuda-static"]
python = ["pyo3", "numpy"]

# Provide beam functionality with CUDA, double precision.
cuda = ["marlu/cuda", "cc"]
# Opt-out of double precision, use only single precision.
cuda-single = ["cuda"]

[profile.release]
lto = true
codegen-units = 1 # Set this to 1 in Cargo.toml to allow for maximum size reduction optimizations

[dependencies]
cfg-if = "1.0.*"
dashmap = "4.0.*"
hdf5 = "0.7.*"
marlu = { version = "0.3.*", default-features = false }
Expand All @@ -51,6 +58,8 @@ ndarray = { version = ">=0.15.4,<0.16", features = ["approx-0_5"] }
[build-dependencies]
cbindgen = "0.*"

cc = { version = "1.0.*", features = ["parallel"], optional = true }

[package.metadata.docs.rs]
features = ["hdf5-static"]
targets = ["x86_64-unknown-linux-gnu"]
Expand All @@ -63,3 +72,10 @@ project-url = { homepage = "https://github.com/MWATelescope/mwa_hyperbeam" }
[[bench]]
name = "bench"
harness = false

[[example]]
name = "beam_calcs"

[[example]]
name = "beam_calcs_cuda"
required-features = ["cuda"]
6 changes: 6 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,12 @@ other words, most languages. See Rust, C and Python examples of usage in the
[-1.51506097e-01-4.35034884e-02j -9.76099405e-06-1.21699926e-05j
1.73003520e-05-1.53580286e-05j -2.23184781e-01-4.51051073e-02j]

### CUDA
`hyperbeam` can also be run on NVIDIA GPUs. To see an example of usage, see
any of the examples with "_cuda" in the name. CUDA functionality is only
provided with one of two Cargo features; see installing from source instructions
below.

## Installation
### Python PyPI
If you're using Python version >=3.6:
Expand Down
97 changes: 96 additions & 1 deletion benches/bench.rs
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,8 @@
//! the project's root directory.

use criterion::*;
use marlu::rayon;
use marlu::{ndarray, rayon};
use ndarray::prelude::*;
use rayon::prelude::*;

use mwa_hyperbeam::fee::*;
Expand Down Expand Up @@ -115,6 +116,100 @@ fn fee(c: &mut Criterion) {
})
});

#[cfg(feature = "cuda")]
c.bench_function("cuda_calc_jones", |b| {
let freqs = [51200000];
let delays = Array2::zeros((1, 16));
let amps = Array2::ones((1, 32));
let norm_to_zenith = false;
let beam = FEEBeam::new("mwa_full_embedded_element_pattern.h5").unwrap();
let cuda_beam = unsafe {
beam.cuda_prepare(&freqs, delays.view(), amps.view(), norm_to_zenith)
.unwrap()
};

let mut az = vec![];
let mut za = vec![];
for d in 5..85 {
#[cfg(feature = "cuda-single")]
let rad = (d as f32).to_radians();
#[cfg(not(feature = "cuda-single"))]
let rad = (d as f64).to_radians();
az.push(rad);
za.push(rad);
}
let parallactic_correction = false;

b.iter(|| {
cuda_beam
.calc_jones(&az, &za, parallactic_correction)
.unwrap();
})
});

// Benchmarks with a fair few pointings!
let num_directions = 100000;
let mut az_double = vec![];
let mut za_double = vec![];
for i in 1..=num_directions {
az_double.push(0.9 * std::f64::consts::TAU / i as f64);
za_double.push(std::f64::consts::PI / i as f64);
}
let freqs = [51200000];
let delays = Array2::zeros((1, 16));
let amps = Array2::ones((1, 16));
let norm_to_zenith = true;

c.bench_function("calc_jones_array 100000 dirs", |b| {
let beam = FEEBeam::new("mwa_full_embedded_element_pattern.h5").unwrap();
// Prime the cache.
beam.calc_jones(
az_double[0],
za_double[0],
freqs[0],
delays.as_slice().unwrap(),
amps.as_slice().unwrap(),
norm_to_zenith,
)
.unwrap();
b.iter(|| {
beam.calc_jones_array(
&az_double,
&za_double,
freqs[0],
delays.as_slice().unwrap(),
amps.as_slice().unwrap(),
norm_to_zenith,
)
.unwrap();
})
});

#[cfg(feature = "cuda")]
c.bench_function("cuda_calc_jones 100000 dirs", |b| {
let beam = FEEBeam::new("mwa_full_embedded_element_pattern.h5").unwrap();
let cuda_beam = unsafe {
beam.cuda_prepare(&freqs, delays.view(), amps.view(), norm_to_zenith)
.unwrap()
};
let parallactic_correction = true;

#[cfg(feature = "cuda-single")]
let (az, za): (Vec<_>, Vec<_>) = az_double
.iter()
.zip(za_double.iter())
.map(|(&az, &za)| (az as f32, za as f32))
.unzip();
#[cfg(not(feature = "cuda-single"))]
let (az, za) = (az_double.clone(), za_double.clone());

b.iter(|| {
cuda_beam
.calc_jones(&az, &za, parallactic_correction)
.unwrap();
})
});

// The following benchmarks require a few structs and methods to be public.
// These benchmarks remain commented because those structs and/or methods
// should not be made public in releases.
Expand Down
94 changes: 93 additions & 1 deletion build.rs
Original file line number Diff line number Diff line change
Expand Up @@ -4,18 +4,109 @@

use std::env;

#[cfg(feature = "cuda")]
/// Parse a user-supplied CUDA compute capability (e.g. `"75"`) into a number.
///
/// `c` is the string to parse and `var` is the name of the environment
/// variable it was read from; `var` is only used to build panic messages.
///
/// # Panics
///
/// Panics if `c` is not exactly two bytes long, or if it is not made up solely
/// of ASCII digits.
fn parse_and_validate_compute(c: &str, var: &str) -> u16 {
    // Check that there are exactly two characters.
    if c.len() != 2 {
        panic!("{} is not a two-digit number!", var)
    }

    // Check that both characters are digits. This must be done explicitly
    // because `str::parse` accepts a leading '+', so a value like "+7" would
    // otherwise slip through as the single-digit compute level 7.
    if !c.bytes().all(|b| b.is_ascii_digit()) {
        panic!("{} couldn't be parsed into a number!", var)
    }

    match c.parse::<u16>() {
        Ok(p) => p,
        Err(_) => panic!("{} couldn't be parsed into a number!", var),
    }
}

fn main() {
let crate_dir = env::var("CARGO_MANIFEST_DIR").unwrap();
println!("cargo:rerun-if-changed=build.rs");

#[cfg(feature = "cuda")]
{
const DEFAULT_CUDA_ARCHES: &[u16] = &[60, 70, 80];
const DEFAULT_CUDA_SMS: &[u16] = &[60, 70, 75, 86];

// Attempt to read HYPERBEAM_CUDA_COMPUTE. HYPERDRIVE_CUDA_COMPUTE can
// be used instead, too.
println!("cargo:rerun-if-env-changed=HYPERBEAM_CUDA_COMPUTE");
println!("cargo:rerun-if-env-changed=HYPERDRIVE_CUDA_COMPUTE");
let (arches, sms): (Vec<u16>, Vec<u16>) = match (
env::var("HYPERBEAM_CUDA_COMPUTE"),
env::var("HYPERDRIVE_CUDA_COMPUTE"),
) {
// When a user-supplied variable exists, use it as the CUDA arch and
// compute level.
(Ok(c), _) => {
let compute = parse_and_validate_compute(&c, "HYPERBEAM_CUDA_COMPUTE");
(vec![compute], vec![compute])
}
(Err(_), Ok(c)) => {
let compute = parse_and_validate_compute(&c, "HYPERDRIVE_CUDA_COMPUTE");
(vec![compute], vec![compute])
}
(Err(_), Err(_)) => {
// Print out all of the default arches and computes as a
// warning.
let mut warn_str = String::new();
warn_str.push_str("cargo:warning=No HYPERBEAM_CUDA_COMPUTE; Passing ");
warn_str.push_str(&format!("arch=compute_{:?}", DEFAULT_CUDA_ARCHES));
warn_str.push_str(" and ");
warn_str.push_str(&format!("code=sm_{:?}", DEFAULT_CUDA_SMS));
warn_str.push_str(" to nvcc");
println!("{}", warn_str);
(DEFAULT_CUDA_ARCHES.to_vec(), DEFAULT_CUDA_SMS.to_vec())
}
};

// TODO: Search for any C/C++/CUDA files and have rerun-if-changed on
// all of them.
println!("cargo:rerun-if-changed=src/fee/cuda/fee.h");
println!("cargo:rerun-if-changed=src/fee/cuda/fee.cu");

let mut cuda_target = cc::Build::new();
cuda_target
.cuda(true)
.flag("-cudart=static")
.include("src/fee/cuda/")
.file("src/fee/cuda/fee.cu");
// Loop over each arch and sm
for arch in arches {
for &sm in &sms {
if sm < arch {
continue;
}

let mut flag = String::new();
cuda_target.flag("-gencode");
flag.push_str(&format!("arch=compute_{},", arch));
flag.push_str(&format!("code=sm_{}", sm));
cuda_target.flag(&flag);
}
}

// If we're told to, use single-precision floats. The default in the
// CUDA code is to use double-precision.
#[cfg(feature = "cuda-single")]
cuda_target.define("SINGLE", None);

cuda_target.compile("hyperbeam_cu");
}

// Generate a C header for hyperbeam and write it to the include
// directory. This routine only needs to be done if the ffi module has
// changed.
let crate_dir = env::var("CARGO_MANIFEST_DIR").unwrap();
println!("cargo:rerun-if-changed=src/fee/ffi/mod.rs");
// Only do this if we're not on docs.rs (doesn't like writing files outside
// of OUT_DIR).
match env::var("DOCS_RS").as_deref() {
Ok("1") => (),
_ => {
// Rename an internal-only name depending on the CUDA precision.
#[cfg(feature = "cuda-single")]
let c_type = "float";
#[cfg(not(feature = "cuda-single"))]
let c_type = "double";

cbindgen::Builder::new()
.with_config(cbindgen::Config {
cpp_compat: true,
Expand All @@ -24,6 +115,7 @@ fn main() {
})
.with_crate(crate_dir)
.with_language(cbindgen::Language::C)
.rename_item("CudaFloat", c_type)
.generate()
.expect("Unable to generate bindings")
.write_to_file("include/mwa_hyperbeam.h");
Expand Down
Loading

0 comments on commit e78ff09

Please sign in to comment.