Commit

Gpu bug fix (#257)
* allow for 0 gpus in desktops

* allow any + gpu setting
johrstrom authored Jun 14, 2024
1 parent 1264b44 commit 1b69c93
Showing 2 changed files with 42 additions and 26 deletions.
apps.awesim.org/apps/bc_desktop/submit/slurm.yml.erb (34 changes: 21 additions & 13 deletions)
@@ -19,37 +19,44 @@
     return tasks_per_node + [ "--constraint", "48core" ]
   end

-  def plus_gpus(arr, gpu_arr)
-    gpu_count.to_i > 0 ? arr + gpu_arr : arr
-  end
-
+  # gpu_count will always return at least 1, so take care when calling it.
   def gpu_count
-    if !gpus.nil? && !gpus.empty? && gpus.to_i >= 0
+    if !gpus.nil? && !gpus.empty? && gpus.to_i.positive?
       gpus
     else
       1
     end
   end

+  # any* node types can possible get a gpu if they've set gpu >= 1
+  def possible_gpus
+    if gpus.to_s.to_i.positive?
+      ["--gpus-per-node", "#{gpu_count}"]
+    else
+      []
+    end
+  end
+
   slurm_args = case node_type
   # 'any' case handled by scheduler, this is just a quick short circuit
   when "any"
-    plus_gpus(base_slurm_args + any_node, ["--gpus-per-node", "#{gpu_count}"])
+    base_slurm_args + any_node + possible_gpus
+
   when "any-40core"
-    base_slurm_args + p18_node
+    base_slurm_args + p18_node + possible_gpus
   when "any-48core"
-    base_slurm_args + p20_node
+    base_slurm_args + p20_node + possible_gpus
+
   when "gpu-any"
-    plus_gpus(base_slurm_args + any_node, ["--gpus-per-node", "#{gpu_count}"])
+    base_slurm_args + any_node + ["--gpus-per-node", "#{gpu_count}"]
   when "gpu-40core"
-    plus_gpus(base_slurm_args + p18_node, ["--gpus-per-node", "#{gpu_count}"])
+    base_slurm_args + p18_node + ["--gpus-per-node", "#{gpu_count}"]
   when "gpu-48core"
-    plus_gpus(base_slurm_args + p20_node, ["--gpus-per-node", "#{gpu_count}"])
+    base_slurm_args + p20_node + ["--gpus-per-node", "#{gpu_count}"]
   when "vis"
-    plus_gpus(base_slurm_args + any_node, ["--gpus-per-node", "#{gpu_count}", "--gres", "vis"])
+    base_slurm_args + any_node + ["--gpus-per-node", "#{gpu_count}", "--gres", "vis"]
   when "densegpu"
-    plus_gpus(base_slurm_args + p20_node, ["--gpus-per-node", "4"])
+    base_slurm_args + p20_node + ["--gpus-per-node", "4"]

   # using partitions here is easier than specifying memory requests
   when "largemem"
@@ -58,6 +65,7 @@
   when "hugemem"
     partition = bc_num_slots.to_i > 1 ? "hugemem-parallel" : "hugemem"
     base_slurm_args + tasks_per_node + ["--partition", partition ]
+
   else
     base_slurm_args
   end
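
The fix is easiest to see with the helpers side by side. Below is a minimal sketch in plain Ruby, outside the ERB template; the old_*/new_* names, the argument-taking copies of the helpers, and the hard-coded base array are illustrative, not part of the commit. It shows why the old plus_gpus/gpu_count pair forced "--gpus-per-node 1" onto an "any" desktop whenever the gpus field was blank (gpu_count falls back to 1 and plus_gpus only checks that fallback), and how the new possible_gpus inspects the raw gpus value so a blank or zero request adds nothing, while any-40core and any-48core desktops, which previously never got GPU flags, can now pick them up.

    # Minimal sketch; old_*/new_* helpers mirror the template logic but take
    # `gpus` as an argument instead of reading the app form field.

    def old_gpu_count(gpus)
      # old fallback: a blank or missing request becomes 1, never 0
      (!gpus.nil? && !gpus.empty? && gpus.to_i >= 0) ? gpus : 1
    end

    def old_plus_gpus(arr, gpu_arr, gpus)
      old_gpu_count(gpus).to_i > 0 ? arr + gpu_arr : arr
    end

    def new_possible_gpus(gpus)
      # new helper: emit GPU flags only for a positive integer request
      gpus.to_s.to_i.positive? ? ["--gpus-per-node", gpus.to_s] : []
    end

    base = ["--nodes", "1"]   # stand-in for base_slurm_args + any_node

    # "any" desktop submitted with a blank GPU field
    p old_plus_gpus(base, ["--gpus-per-node", old_gpu_count("").to_s], "")
    # => ["--nodes", "1", "--gpus-per-node", "1"]   old code forced one GPU
    p base + new_possible_gpus("")
    # => ["--nodes", "1"]                           new code adds nothing

    # "any-40core" desktop with gpus = "2" (previously never got GPU flags)
    p base + new_possible_gpus("2")
    # => ["--nodes", "1", "--gpus-per-node", "2"]
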
ondemand.osc.edu/apps/bc_desktop/submit/slurm.yml.erb (34 changes: 21 additions & 13 deletions)
@@ -19,37 +19,44 @@
     return tasks_per_node + [ "--constraint", "48core" ]
   end

-  def plus_gpus(arr, gpu_arr)
-    gpu_count.to_i > 0 ? arr + gpu_arr : arr
-  end
-
+  # gpu_count will always return at least 1, so take care when calling it.
   def gpu_count
-    if !gpus.nil? && !gpus.empty? && gpus.to_i >= 0
+    if !gpus.nil? && !gpus.empty? && gpus.to_i.positive?
       gpus
     else
       1
     end
   end

+  # any* node types can possible get a gpu if they've set gpu >= 1
+  def possible_gpus
+    if gpus.to_s.to_i.positive?
+      ["--gpus-per-node", "#{gpu_count}"]
+    else
+      []
+    end
+  end
+
   slurm_args = case node_type
   # 'any' case handled by scheduler, this is just a quick short circuit
   when "any"
-    plus_gpus(base_slurm_args + any_node, ["--gpus-per-node", "#{gpu_count}"])
+    base_slurm_args + any_node + possible_gpus
+
   when "any-40core"
-    base_slurm_args + p18_node
+    base_slurm_args + p18_node + possible_gpus
   when "any-48core"
-    base_slurm_args + p20_node
+    base_slurm_args + p20_node + possible_gpus
+
   when "gpu-any"
-    plus_gpus(base_slurm_args + any_node, ["--gpus-per-node", "#{gpu_count}"])
+    base_slurm_args + any_node + ["--gpus-per-node", "#{gpu_count}"]
   when "gpu-40core"
-    plus_gpus(base_slurm_args + p18_node, ["--gpus-per-node", "#{gpu_count}"])
+    base_slurm_args + p18_node + ["--gpus-per-node", "#{gpu_count}"]
   when "gpu-48core"
-    plus_gpus(base_slurm_args + p20_node, ["--gpus-per-node", "#{gpu_count}"])
+    base_slurm_args + p20_node + ["--gpus-per-node", "#{gpu_count}"]
   when "vis"
-    plus_gpus(base_slurm_args + any_node, ["--gpus-per-node", "#{gpu_count}", "--gres", "vis"])
+    base_slurm_args + any_node + ["--gpus-per-node", "#{gpu_count}", "--gres", "vis"]
   when "densegpu"
-    plus_gpus(base_slurm_args + p20_node, ["--gpus-per-node", "4"])
+    base_slurm_args + p20_node + ["--gpus-per-node", "4"]

   # using partitions here is easier than specifying memory requests
   when "largemem"
@@ -58,6 +65,7 @@
   when "hugemem"
     partition = bc_num_slots.to_i > 1 ? "hugemem-parallel" : "hugemem"
     base_slurm_args + tasks_per_node + ["--partition", partition ]
+
   else
     base_slurm_args
   end
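
The ondemand.osc.edu copy of the template receives the identical change. Worth noting is the split the new comment calls out: gpu_count still floors at 1, which suits the gpu-*, vis, and densegpu branches that always request a GPU, while the any* branches go through possible_gpus, which reads the raw gpus field. A small sketch of the two behaviors, using assumed standalone copies of the helpers that take gpus as an argument rather than reading the app form:

    # Assumed standalone copies of the committed helpers, for illustration only.

    def gpu_count(gpus)
      # floors at 1: the gpu-*, vis and densegpu branches always request a GPU
      (!gpus.nil? && !gpus.empty? && gpus.to_i.positive?) ? gpus : 1
    end

    def possible_gpus(gpus)
      # raw check: any* branches add GPU flags only when explicitly asked
      gpus.to_s.to_i.positive? ? ["--gpus-per-node", gpu_count(gpus).to_s] : []
    end

    [nil, "", "0", "2"].each do |gpus|
      puts "gpus=#{gpus.inspect}  gpu_count=#{gpu_count(gpus)}  possible_gpus=#{possible_gpus(gpus).inspect}"
    end
    # gpus=nil  gpu_count=1  possible_gpus=[]
    # gpus=""   gpu_count=1  possible_gpus=[]
    # gpus="0"  gpu_count=1  possible_gpus=[]
    # gpus="2"  gpu_count=2  possible_gpus=["--gpus-per-node", "2"]
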
