From 1b69c93fc2334df37d4053618f8971667a9f9d30 Mon Sep 17 00:00:00 2001 From: Jeff Ohrstrom Date: Fri, 14 Jun 2024 14:59:42 -0400 Subject: [PATCH] Gpu bug fix (#257) * allow for 0 gpus in desktops * allow any + gpu setting --- .../apps/bc_desktop/submit/slurm.yml.erb | 34 ++++++++++++------- .../apps/bc_desktop/submit/slurm.yml.erb | 34 ++++++++++++------- 2 files changed, 42 insertions(+), 26 deletions(-) diff --git a/apps.awesim.org/apps/bc_desktop/submit/slurm.yml.erb b/apps.awesim.org/apps/bc_desktop/submit/slurm.yml.erb index ef2bca0..9f1f4c0 100644 --- a/apps.awesim.org/apps/bc_desktop/submit/slurm.yml.erb +++ b/apps.awesim.org/apps/bc_desktop/submit/slurm.yml.erb @@ -19,37 +19,44 @@ return tasks_per_node + [ "--constraint", "48core" ] end - def plus_gpus(arr, gpu_arr) - gpu_count.to_i > 0 ? arr + gpu_arr : arr - end - + # gpu_count will always return at least 1, so take care when calling it. def gpu_count - if !gpus.nil? && !gpus.empty? && gpus.to_i >= 0 + if !gpus.nil? && !gpus.empty? && gpus.to_i.positive? gpus else 1 end end + # any* node types can possibly get a gpu if they've set gpu >= 1 + def possible_gpus + if gpus.to_s.to_i.positive? 
+ ["--gpus-per-node", "#{gpu_count}"] + else + [] + end + end + slurm_args = case node_type # 'any' case handled by scheduler, this is just a quick short circuit when "any" - plus_gpus(base_slurm_args + any_node, ["--gpus-per-node", "#{gpu_count}"]) + base_slurm_args + any_node + possible_gpus + when "any-40core" - base_slurm_args + p18_node + base_slurm_args + p18_node + possible_gpus when "any-48core" - base_slurm_args + p20_node + base_slurm_args + p20_node + possible_gpus when "gpu-any" - plus_gpus(base_slurm_args + any_node, ["--gpus-per-node", "#{gpu_count}"]) + base_slurm_args + any_node + ["--gpus-per-node", "#{gpu_count}"] when "gpu-40core" - plus_gpus(base_slurm_args + p18_node, ["--gpus-per-node", "#{gpu_count}"]) + base_slurm_args + p18_node + ["--gpus-per-node", "#{gpu_count}"] when "gpu-48core" - plus_gpus(base_slurm_args + p20_node, ["--gpus-per-node", "#{gpu_count}"]) + base_slurm_args + p20_node + ["--gpus-per-node", "#{gpu_count}"] when "vis" - plus_gpus(base_slurm_args + any_node, ["--gpus-per-node", "#{gpu_count}", "--gres", "vis"]) + base_slurm_args + any_node + ["--gpus-per-node", "#{gpu_count}", "--gres", "vis"] when "densegpu" - plus_gpus(base_slurm_args + p20_node, ["--gpus-per-node", "4"]) + base_slurm_args + p20_node + ["--gpus-per-node", "4"] # using partitions here is easier than specifying memory requests when "largemem" @@ -58,6 +65,7 @@ when "hugemem" partition = bc_num_slots.to_i > 1 ? "hugemem-parallel" : "hugemem" base_slurm_args + tasks_per_node + ["--partition", partition ] + else base_slurm_args end diff --git a/ondemand.osc.edu/apps/bc_desktop/submit/slurm.yml.erb b/ondemand.osc.edu/apps/bc_desktop/submit/slurm.yml.erb index ef2bca0..9f1f4c0 100644 --- a/ondemand.osc.edu/apps/bc_desktop/submit/slurm.yml.erb +++ b/ondemand.osc.edu/apps/bc_desktop/submit/slurm.yml.erb @@ -19,37 +19,44 @@ return tasks_per_node + [ "--constraint", "48core" ] end - def plus_gpus(arr, gpu_arr) - gpu_count.to_i > 0 ? 
arr + gpu_arr : arr - end - + # gpu_count will always return at least 1, so take care when calling it. def gpu_count - if !gpus.nil? && !gpus.empty? && gpus.to_i >= 0 + if !gpus.nil? && !gpus.empty? && gpus.to_i.positive? gpus else 1 end end + # any* node types can possibly get a gpu if they've set gpu >= 1 + def possible_gpus + if gpus.to_s.to_i.positive? + ["--gpus-per-node", "#{gpu_count}"] + else + [] + end + end + slurm_args = case node_type # 'any' case handled by scheduler, this is just a quick short circuit when "any" - plus_gpus(base_slurm_args + any_node, ["--gpus-per-node", "#{gpu_count}"]) + base_slurm_args + any_node + possible_gpus + when "any-40core" - base_slurm_args + p18_node + base_slurm_args + p18_node + possible_gpus when "any-48core" - base_slurm_args + p20_node + base_slurm_args + p20_node + possible_gpus when "gpu-any" - plus_gpus(base_slurm_args + any_node, ["--gpus-per-node", "#{gpu_count}"]) + base_slurm_args + any_node + ["--gpus-per-node", "#{gpu_count}"] when "gpu-40core" - plus_gpus(base_slurm_args + p18_node, ["--gpus-per-node", "#{gpu_count}"]) + base_slurm_args + p18_node + ["--gpus-per-node", "#{gpu_count}"] when "gpu-48core" - plus_gpus(base_slurm_args + p20_node, ["--gpus-per-node", "#{gpu_count}"]) + base_slurm_args + p20_node + ["--gpus-per-node", "#{gpu_count}"] when "vis" - plus_gpus(base_slurm_args + any_node, ["--gpus-per-node", "#{gpu_count}", "--gres", "vis"]) + base_slurm_args + any_node + ["--gpus-per-node", "#{gpu_count}", "--gres", "vis"] when "densegpu" - plus_gpus(base_slurm_args + p20_node, ["--gpus-per-node", "4"]) + base_slurm_args + p20_node + ["--gpus-per-node", "4"] # using partitions here is easier than specifying memory requests when "largemem" @@ -58,6 +65,7 @@ when "hugemem" partition = bc_num_slots.to_i > 1 ? "hugemem-parallel" : "hugemem" base_slurm_args + tasks_per_node + ["--partition", partition ] + else base_slurm_args end