# service.yaml
# The `service` section newly added on top of the original `serve-openai-api.yaml` file.
service:
  # The path to the endpoint used to check the readiness of the service.
  readiness_probe: /v1/models
  # How many replicas to manage.
  replicas: 2
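
# SkyServe sends GET requests to the readiness path above on each replica and
# treats a replica as ready once it responds with HTTP 200. A minimal manual
# check against a single replica (the address is illustrative; port 8000 is
# opened under `resources` below):
#   curl http://<replica-ip>:8000/v1/models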

# Fields below are the same as in `serve-openai-api.yaml`.
envs:
  MODEL_NAME: meta-llama/Llama-2-7b-chat-hf
  HF_TOKEN: # TODO: Fill with your own huggingface token, or use --env to pass.

resources:
  # Candidate accelerators; SkyPilot will provision any one of them.
  accelerators: {L4:1, A10G:1, A10:1, A100:1, A100-80GB:1}
  ports:
    - 8000

setup: |
  # Reuse the existing `vllm` conda env if present; otherwise create it.
  conda activate vllm
  if [ $? -ne 0 ]; then
    conda create -n vllm python=3.10 -y
    conda activate vllm
  fi
  pip install transformers==4.38.0
  pip install vllm==0.3.2
  # Log in to Hugging Face so the gated Llama-2 weights can be downloaded.
  python -c "import huggingface_hub; huggingface_hub.login('${HF_TOKEN}')"

run: |
  conda activate vllm
  echo 'Starting vllm openai api server...'
  python -m vllm.entrypoints.openai.api_server \
    --model $MODEL_NAME --tokenizer hf-internal-testing/llama-tokenizer \
    --host 0.0.0.0
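
# Usage sketch (hedged): the commands below assume the SkyServe CLI; the service
# name `llama2` is illustrative, and `<endpoint>` stands for the address reported
# by `sky serve status`, not a literal value.
#   sky serve up service.yaml -n llama2 --env HF_TOKEN=<your-huggingface-token>
#   sky serve status llama2   # shows replica states and the service endpoint
#   curl http://<endpoint>/v1/chat/completions \
#     -H 'Content-Type: application/json' \
#     -d '{"model": "meta-llama/Llama-2-7b-chat-hf", "messages": [{"role": "user", "content": "Hello!"}]}'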