Gpu bug fix #257

Merged (2 commits) on Jun 14, 2024
apps.awesim.org/apps/bc_desktop/submit/slurm.yml.erb (34 changes: 21 additions & 13 deletions)
@@ -19,37 +19,44 @@
   return tasks_per_node + [ "--constraint", "48core" ]
 end
 
-def plus_gpus(arr, gpu_arr)
-  gpu_count.to_i > 0 ? arr + gpu_arr : arr
-end
-
+# gpu_count will always return at least 1, so take care when calling it.
 def gpu_count
-  if !gpus.nil? && !gpus.empty? && gpus.to_i >= 0
+  if !gpus.nil? && !gpus.empty? && gpus.to_i.positive?
     gpus
   else
     1
   end
 end
 
+# any* node types can possibly get a GPU if they've set gpus >= 1.
+def possible_gpus
+  if gpus.to_s.to_i.positive?
+    ["--gpus-per-node", "#{gpu_count}"]
+  else
+    []
+  end
+end
+
 slurm_args = case node_type
              # 'any' case handled by scheduler, this is just a quick short circuit
              when "any"
-               plus_gpus(base_slurm_args + any_node, ["--gpus-per-node", "#{gpu_count}"])
+               base_slurm_args + any_node + possible_gpus
+
              when "any-40core"
-               base_slurm_args + p18_node
+               base_slurm_args + p18_node + possible_gpus
              when "any-48core"
-               base_slurm_args + p20_node
+               base_slurm_args + p20_node + possible_gpus
 
              when "gpu-any"
-               plus_gpus(base_slurm_args + any_node, ["--gpus-per-node", "#{gpu_count}"])
+               base_slurm_args + any_node + ["--gpus-per-node", "#{gpu_count}"]
              when "gpu-40core"
-               plus_gpus(base_slurm_args + p18_node, ["--gpus-per-node", "#{gpu_count}"])
+               base_slurm_args + p18_node + ["--gpus-per-node", "#{gpu_count}"]
              when "gpu-48core"
-               plus_gpus(base_slurm_args + p20_node, ["--gpus-per-node", "#{gpu_count}"])
+               base_slurm_args + p20_node + ["--gpus-per-node", "#{gpu_count}"]
              when "vis"
-               plus_gpus(base_slurm_args + any_node, ["--gpus-per-node", "#{gpu_count}", "--gres", "vis"])
+               base_slurm_args + any_node + ["--gpus-per-node", "#{gpu_count}", "--gres", "vis"]
              when "densegpu"
-               plus_gpus(base_slurm_args + p20_node, ["--gpus-per-node", "4"])
+               base_slurm_args + p20_node + ["--gpus-per-node", "4"]
 
              # using partitions here is easier than specifying memory requests
              when "largemem"
@@ -58,6 +65,7 @@
              when "hugemem"
                partition = bc_num_slots.to_i > 1 ? "hugemem-parallel" : "hugemem"
                base_slurm_args + tasks_per_node + ["--partition", partition ]
+
              else
                base_slurm_args
              end
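Why the change matters: gpu_count clamps its result to at least 1, so the old guard in plus_gpus (gpu_count.to_i > 0) was always true. Every "any" job was therefore submitted with "--gpus-per-node 1" even when no GPU was requested, while the "any-40core" and "any-48core" cases could never request one at all. The new possible_gpus helper tests the raw gpus input instead of the clamped count. A minimal standalone sketch of the before/after behavior (gpus is passed as a parameter here, whereas the template reads it from the ERB context):

# Clamped helper, as in the template: never returns less than 1.
def gpu_count(gpus)
  if !gpus.nil? && !gpus.empty? && gpus.to_i.positive?
    gpus
  else
    1
  end
end

# Old guard: gpu_count never returns 0, so the GPU flags were always appended.
def old_gpu_args(gpus)
  gpu_count(gpus).to_i > 0 ? ["--gpus-per-node", gpu_count(gpus).to_s] : []
end

# New guard: inspect the raw user input, not the clamped count.
def possible_gpus(gpus)
  if gpus.to_s.to_i.positive?
    ["--gpus-per-node", gpu_count(gpus).to_s]
  else
    []
  end
end

p old_gpu_args(nil)    # => ["--gpus-per-node", "1"]  <- the bug
p possible_gpus(nil)   # => []                        <- the fix
p possible_gpus("2")   # => ["--gpus-per-node", "2"]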
ondemand.osc.edu/apps/bc_desktop/submit/slurm.yml.erb (34 changes: 21 additions & 13 deletions)
@@ -19,37 +19,44 @@
   return tasks_per_node + [ "--constraint", "48core" ]
 end
 
-def plus_gpus(arr, gpu_arr)
-  gpu_count.to_i > 0 ? arr + gpu_arr : arr
-end
-
+# gpu_count will always return at least 1, so take care when calling it.
 def gpu_count
-  if !gpus.nil? && !gpus.empty? && gpus.to_i >= 0
+  if !gpus.nil? && !gpus.empty? && gpus.to_i.positive?
     gpus
   else
     1
   end
 end
 
+# any* node types can possibly get a GPU if they've set gpus >= 1.
+def possible_gpus
+  if gpus.to_s.to_i.positive?
+    ["--gpus-per-node", "#{gpu_count}"]
+  else
+    []
+  end
+end
+
 slurm_args = case node_type
              # 'any' case handled by scheduler, this is just a quick short circuit
              when "any"
-               plus_gpus(base_slurm_args + any_node, ["--gpus-per-node", "#{gpu_count}"])
+               base_slurm_args + any_node + possible_gpus
+
              when "any-40core"
-               base_slurm_args + p18_node
+               base_slurm_args + p18_node + possible_gpus
              when "any-48core"
-               base_slurm_args + p20_node
+               base_slurm_args + p20_node + possible_gpus
 
              when "gpu-any"
-               plus_gpus(base_slurm_args + any_node, ["--gpus-per-node", "#{gpu_count}"])
+               base_slurm_args + any_node + ["--gpus-per-node", "#{gpu_count}"]
              when "gpu-40core"
-               plus_gpus(base_slurm_args + p18_node, ["--gpus-per-node", "#{gpu_count}"])
+               base_slurm_args + p18_node + ["--gpus-per-node", "#{gpu_count}"]
              when "gpu-48core"
-               plus_gpus(base_slurm_args + p20_node, ["--gpus-per-node", "#{gpu_count}"])
+               base_slurm_args + p20_node + ["--gpus-per-node", "#{gpu_count}"]
              when "vis"
-               plus_gpus(base_slurm_args + any_node, ["--gpus-per-node", "#{gpu_count}", "--gres", "vis"])
+               base_slurm_args + any_node + ["--gpus-per-node", "#{gpu_count}", "--gres", "vis"]
              when "densegpu"
-               plus_gpus(base_slurm_args + p20_node, ["--gpus-per-node", "4"])
+               base_slurm_args + p20_node + ["--gpus-per-node", "4"]
 
              # using partitions here is easier than specifying memory requests
              when "largemem"
@@ -58,6 +65,7 @@
              when "hugemem"
                partition = bc_num_slots.to_i > 1 ? "hugemem-parallel" : "hugemem"
                base_slurm_args + tasks_per_node + ["--partition", partition ]
+
              else
                base_slurm_args
              end
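The case arms build the final scheduler arguments by plain array concatenation. A quick illustration; the base_slurm_args and p18_node values below are made-up stand-ins, since the real arrays come from helpers defined earlier in the template:

# Hypothetical stand-ins for the template's helpers.
base_slurm_args = ["--nodes", "1"]
p18_node = ["--constraint", "40core"]
gpus = "2"  # user asked for two GPUs on an "any-40core" node type

possible_gpus = gpus.to_s.to_i.positive? ? ["--gpus-per-node", gpus] : []
slurm_args = base_slurm_args + p18_node + possible_gpus
p slurm_args
# => ["--nodes", "1", "--constraint", "40core", "--gpus-per-node", "2"]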