Fix formatting issues
PiperOrigin-RevId: 680424508
vertex-mg-bot authored and copybara-github committed Sep 30, 2024
1 parent 148a6fa commit de8b9ee
Showing 1 changed file with 18 additions and 39 deletions.
@@ -806,35 +806,6 @@
" return model, endpoint\n",
"\n",
"\n",
"def predict_vllm(\n",
" prompt: str,\n",
" max_tokens: int,\n",
" temperature: float,\n",
" top_p: float,\n",
" top_k: int,\n",
" raw_response: bool,\n",
" lora_weight: str = \"\",\n",
"):\n",
" # Parameters for inference.\n",
" instance = {\n",
" \"prompt\": prompt,\n",
" \"max_tokens\": max_tokens,\n",
" \"temperature\": temperature,\n",
" \"top_p\": top_p,\n",
" \"top_k\": top_k,\n",
" \"raw_response\": raw_response,\n",
" }\n",
" if lora_weight:\n",
" instance[\"dynamic-lora\"] = lora_weight\n",
" instances = [instance]\n",
" response = endpoints[\"vllm_gpu\"].predict(\n",
" instances=instances, use_dedicated_endpoint=use_dedicated_endpoint\n",
" )\n",
"\n",
" for prediction in response.predictions:\n",
" print(prediction)\n",
"\n",
"\n",
"# Use FP8 base model for 405B since original model does not fit.\n",
"deploy_pretrained_model_id = pretrained_model_id\n",
"if \"Meta-Llama-3.1-405B\" in deploy_pretrained_model_id:\n",
@@ -880,24 +851,32 @@
"# @markdown Additionally, you can moderate the generated text with Vertex AI. See [Moderate text documentation](https://cloud.google.com/natural-language/docs/moderating-text) for more details.\n",
"\n",
"prompt = \"What is a car?\" # @param {type: \"string\"}\n",
"# @markdown If you encounter the issue like `ServiceUnavailable: 503 Took too long to respond when processing`, you can reduce the maximum number of output tokens, such as set `max_tokens` as 20.\n",
"# @markdown If you encounter an issue like `ServiceUnavailable: 503 Took too long to respond when processing`, you can reduce the maximum number of output tokens, such as set `max_tokens` as 20.\n",
"max_tokens = 50 # @param {type:\"integer\"}\n",
"temperature = 1.0 # @param {type:\"number\"}\n",
"top_p = 1.0 # @param {type:\"number\"}\n",
"top_k = 1 # @param {type:\"integer\"}\n",
"raw_response = False # @param {type:\"boolean\"}\n",
"\n",
"\n",
"predict_vllm(\n",
" prompt=prompt,\n",
" max_tokens=max_tokens,\n",
" temperature=temperature,\n",
" top_p=top_p,\n",
" top_k=top_k,\n",
" raw_response=raw_response,\n",
" lora_weight=final_checkpoint,\n",
"# Overrides parameters for inferences.\n",
"instance = {\n",
" \"prompt\": prompt,\n",
" \"max_tokens\": max_tokens,\n",
" \"temperature\": temperature,\n",
" \"top_p\": top_p,\n",
" \"top_k\": top_k,\n",
" \"raw_response\": raw_response,\n",
"}\n",
"if lora_weight:\n",
" instance[\"dynamic-lora\"] = lora_weight\n",
"instances = [instance]\n",
"response = endpoints[\"vllm_gpu\"].predict(\n",
" instances=instances, use_dedicated_endpoint=use_dedicated_endpoint\n",
")\n",
"\n",
"for prediction in response.predictions:\n",
" print(prediction)\n",
"\n",
"# @markdown Click \"Show Code\" to see more details."
]
},
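For readability, the cell body added in the second hunk reads as the following plain Python once extracted from the notebook JSON. This is a sketch rather than a standalone script: `endpoints`, `use_dedicated_endpoint`, and `lora_weight` (the tuned adapter path, previously passed as `final_checkpoint` in the removed `predict_vllm` call) are assumed to be defined by earlier cells in the notebook.

    prompt = "What is a car?"  # @param {type: "string"}
    max_tokens = 50  # @param {type:"integer"}
    temperature = 1.0  # @param {type:"number"}
    top_p = 1.0  # @param {type:"number"}
    top_k = 1  # @param {type:"integer"}
    raw_response = False  # @param {type:"boolean"}

    # Override parameters for inference; "dynamic-lora" attaches the tuned
    # LoRA adapter to this request when a weight path is provided.
    instance = {
        "prompt": prompt,
        "max_tokens": max_tokens,
        "temperature": temperature,
        "top_p": top_p,
        "top_k": top_k,
        "raw_response": raw_response,
    }
    if lora_weight:
        instance["dynamic-lora"] = lora_weight

    response = endpoints["vllm_gpu"].predict(
        instances=[instance], use_dedicated_endpoint=use_dedicated_endpoint
    )

    for prediction in response.predictions:
        print(prediction)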
