# ray_cluster.yaml
# Ray cluster configuration: GPU nodes on AWS for LLM fine-tuning.
# Name that identifies this cluster.
cluster_name: "ray_cluster_gpu_aws_llm_finetuning"

# Cluster-wide upper bound on the number of worker nodes.
max_workers: 8

# Autoscaler upscaling speed factor.
upscaling_speed: 1.0
# Container settings. The default, head and worker entries all name the same
# GPU image.
docker:
  image: "rayproject/ray-ml:latest-gpu"
  container_name: "ray_container"
  pull_before_run: false
  # Extra options to pass into "docker run"
  run_options:
    - "--ulimit nofile=65536:65536"
  head_image: "rayproject/ray-ml:latest-gpu"
  worker_image: "rayproject/ray-ml:latest-gpu"
# Idle timeout for nodes, in minutes.
idle_timeout_minutes: 5

# Cloud provider: AWS, London region, with stopped-node caching enabled.
provider:
  type: aws
  region: eu-west-2
  cache_stopped_nodes: true

# SSH login user for the launched instances.
auth:
  ssh_user: ubuntu
# Node types the cluster can launch. Both types use the same instance type,
# AMI and a 300 GB gp3 root volume; only the worker type sets worker-count
# limits and requests spot capacity.
available_node_types:
  head:
    node_config:
      InstanceType: g4dn.xlarge
      ImageId: ami-0bacb0baeaa1dbf0b
      BlockDeviceMappings:
        - DeviceName: /dev/sda1
          Ebs:
            VolumeSize: 300
            VolumeType: gp3

  worker:
    min_workers: 2
    max_workers: 8
    node_config:
      InstanceType: g4dn.xlarge
      ImageId: ami-0bacb0baeaa1dbf0b
      # Run workers on spot by default. Comment this out to use on-demand.
      InstanceMarketOptions:
        MarketType: spot
        SpotOptions:
          MaxPrice: "1"  # Max Hourly Price
      BlockDeviceMappings:
        - DeviceName: /dev/sda1
          Ebs:
            VolumeSize: 300
            VolumeType: gp3
# Directories synced to the nodes. Each entry maps a path under the remote
# home directory (key) to a project-relative local path (value).
file_mounts:
  "~/src": "./src"
  "~/data": "./data"
  "~/infra": "./infra"
  "~/config": "./config"

# Continuous re-syncing of file mounts is disabled.
file_mounts_sync_continuously: false

# No additional files are synced from the head node.
cluster_synced_files: []

# Patterns excluded from rsync: git metadata directories and their contents.
rsync_exclude:
  - "**/.git"
  - "**/.git/**"

# Filter files consulted by rsync.
rsync_filter:
  - ".gitignore"
# Node type (a key of available_node_types) used for the head node.
head_node_type: head

# Commands that restart Ray on the head node inside the py310 conda env.
# `ulimit -n` is chained into the same command as `ray start`: a limit raised
# in a list entry of its own only affects the shell that ran that entry, so
# it would never reach the Ray processes started by a later entry.
head_start_ray_commands:
  - conda activate py310 && ray stop
  - >-
    ulimit -n 65536;
    conda activate py310 &&
    ray start --head --port=6379 --object-manager-port=8076
    --autoscaling-config=~/ray_bootstrap_config.yaml
    --dashboard-host=0.0.0.0
# Commands that restart Ray on a worker node inside the py310 conda env and
# connect it to the head node at $RAY_HEAD_IP:6379.
# `ulimit -n` is chained into the same command as `ray start`: a limit raised
# in a list entry of its own only affects the shell that ran that entry, so
# it would never reach the Ray processes started by a later entry.
worker_start_ray_commands:
  - conda activate py310 && ray stop
  - >-
    ulimit -n 65536;
    conda activate py310 &&
    ray start --address=$RAY_HEAD_IP:6379 --object-manager-port=8076
# No initialization commands are configured.
initialization_commands: []

# Creates the py310 conda environment (Python 3.10) that the head/worker
# setup commands and the Ray start commands activate.
setup_commands:
  - conda create -n py310 python=3.10 -y && conda activate py310

# Head-node setup: upgrade pip, then install the project requirements
# from the synced infra directory.
head_setup_commands:
  - conda activate py310 && pip install -U pip
  - conda activate py310 && pip install -r infra/requirements.txt

# Worker-node setup: same two steps as the head node.
worker_setup_commands:
  - conda activate py310 && pip install -U pip
  - conda activate py310 && pip install -r infra/requirements.txt