-
Notifications
You must be signed in to change notification settings - Fork 1
/
gcp_deploy.py
145 lines (133 loc) · 3.72 KB
/
gcp_deploy.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
import argparse
from google.cloud import aiplatform
def main(opt):
    """Deploy a model to a Vertex AI endpoint.

    Looks up the model and endpoint by display name (uploading/creating
    them if absent), undeploys anything currently on the endpoint, then
    deploys the model with the requested machine/accelerator settings.

    Args:
        opt: argparse.Namespace with model_name, endpoint_name,
            endpoint_deployed_name, image_uri, machine_type,
            min_replica_count, max_replica_count, gpu_type,
            accelerator_count, and region attributes.
    """
    # Fix: --region was parsed but never used; without this the SDK falls
    # back to its default location and may list/deploy in the wrong region.
    aiplatform.init(location=opt.region)

    # Reuse an existing model with a matching display name, else upload one.
    model = next(
        (m for m in aiplatform.Model.list() if m.display_name == opt.model_name),
        None,
    )
    if model is not None:
        print(model.resource_name)
    else:
        print("No model found, creating...")
        model = aiplatform.Model.upload(
            display_name=opt.model_name,
            serving_container_image_uri=opt.image_uri,
            serving_container_ports=[8080],
            serving_container_predict_route="/predict",
        )

    # Reuse an existing endpoint with a matching display name, else create one.
    endpoint = next(
        (e for e in aiplatform.Endpoint.list() if e.display_name == opt.endpoint_name),
        None,
    )
    if endpoint is not None:
        print(endpoint.resource_name)
    else:
        print("No endpoint found, creating...")
        endpoint = aiplatform.Endpoint.create(
            display_name=opt.endpoint_name
        )
    print(vars(endpoint))

    # The endpoint may still host earlier deployments; remove them so the
    # new deployment can take 100% of the traffic.
    if endpoint.list_models():
        print("undeploying previous models...")
        endpoint.undeploy_all()

    # accelerator_count == 0 means CPU-only; the deploy API then requires
    # both accelerator fields to be None rather than 0 / a type name.
    accelerator_count = opt.accelerator_count
    accelerator_type = opt.gpu_type
    if accelerator_count == 0:
        accelerator_count = None
        accelerator_type = None
    # Common GPU choices: NVIDIA_TESLA_V100, NVIDIA_TESLA_T4
    endpoint.deploy(
        model,
        deployed_model_display_name=opt.endpoint_deployed_name,
        traffic_percentage=100,
        machine_type=opt.machine_type,
        min_replica_count=opt.min_replica_count,
        max_replica_count=opt.max_replica_count,
        accelerator_type=accelerator_type,
        accelerator_count=accelerator_count,
        sync=True,
        # Broken in cloud shell
        # autoscaling_target_cpu_utilization=opt.cpu_duty_cycle,
        # autoscaling_target_accelerator_duty_cycle=opt.accelerator_duty_cycle
    )
    print("done")
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument(
'--min-replica-count',
type=int,
default=1,
help='Minimum number of replicas'
)
parser.add_argument(
'--machine-type',
type=str,
default='n1-standard-4',
help='Machine type'
)
parser.add_argument(
'--max-replica-count',
type=int,
default=1,
help='Maximum number of replicas'
)
parser.add_argument(
"--gpu-type",
type=str,
default='NVIDIA_TESLA_T4',
help="GPU type"
)
parser.add_argument(
"--accelerator-count",
type=int,
default=1,
help="GPU count"
)
parser.add_argument(
"--region",
type=str,
default="us-central1",
help="gcp region"
)
parser.add_argument(
"--model-name",
type=str,
required=True,
help="name of model"
)
parser.add_argument(
"--endpoint-name",
type=str,
required=True,
help="Name of endpoint"
)
parser.add_argument(
"--endpoint-deployed-name",
type=str,
required=True,
help="Endpoint deployed name"
)
parser.add_argument(
"--image-uri",
type=str,
required=True,
help="name of image in gcr. Ex: gcr.io/${project-name}/${model}:latest"
)
parser.add_argument(
"--accelerator-duty-cycle",
type=int,
default=20,
help="Autoscaling for GPUs."
)
parser.add_argument(
"--cpu-duty-cycle",
type=int,
default=5,
help="Autoscaling for CPUs."
)
opt = parser.parse_args()
main(opt)