diff --git a/notebooks/community/model_garden/model_garden_pytorch_mistral_peft_tuning.ipynb b/notebooks/community/model_garden/model_garden_pytorch_mistral_peft_tuning.ipynb index 5728d36a7..fa36d8c8a 100644 --- a/notebooks/community/model_garden/model_garden_pytorch_mistral_peft_tuning.ipynb +++ b/notebooks/community/model_garden/model_garden_pytorch_mistral_peft_tuning.ipynb @@ -4,6 +4,7 @@ "cell_type": "code", "execution_count": null, "metadata": { + "cellView": "form", "id": "iJc36RtD90jd" }, "outputs": [], @@ -95,7 +96,11 @@ "\n", "# @markdown 2. [Optional] [Create a Cloud Storage bucket](https://cloud.google.com/storage/docs/creating-buckets) for storing experiment outputs. Set the BUCKET_URI for the experiment environment. The specified Cloud Storage bucket (`BUCKET_URI`) should be located in the same region as where the notebook was launched. Note that a multi-region bucket (eg. \"us\") is not considered a match for a single region covered by the multi-region range (eg. \"us-central1\"). If not set, a unique GCS bucket will be created instead.\n", "\n", - "# @markdown 3. [Make sure that you have GPU quota for Vertex Training (finetuning) and Vertex Prediction (serving)](https://cloud.google.com/docs/quotas/view-manage). The quota name for Vertex Training is \"Custom model training your-gpu-type per region\" and the quota name for Vertex Prediction is \"Custom model serving your-gpu-type per region\" such as `Custom model training Nvidia L4 GPUs per region` and `Custom model serving Nvidia L4 GPUs per region` for L4 GPUs. [Submit a quota increase request](https://cloud.google.com/docs/quotas/view-manage#requesting_higher_quota) if additional quota is needed. At minimum, running this notebook requires 4 A100 80 GB for finetuning and 1 L4 for serving. More GPUs may be needed for larger models and different finetuning configurations. 
To secure GPUs for larger models, ask your customer engineer to get you allowlisted for a Shared Reservation or a Dynamic Workload Scheduler.\n", + "# @markdown 3. [Make sure that you have GPU quota for Vertex Training (finetuning) and Vertex Prediction (serving)](https://cloud.google.com/docs/quotas/view-manage). The quota name for Vertex Training is \"Custom model training your-gpu-type per region\" and the quota name for Vertex Prediction is \"Custom model serving your-gpu-type per region\" such as `Custom model training Nvidia L4 GPUs per region` and `Custom model serving Nvidia L4 GPUs per region` for L4 GPUs. [Submit a quota increase request](https://cloud.google.com/docs/quotas/view-manage#requesting_higher_quota) if additional quota is needed.\n", + "# @markdown\n", + "# @markdown By default, running this notebook requires 8 A100 80 GB GPUs for finetuning and 1 L4 GPU for serving. The finetuning quota is available in `us-central1`.\n", + "# @markdown\n", + "# @markdown More GPUs may be needed for larger models and different finetuning configurations. To secure GPUs for larger models, ask your customer engineer to get you allowlisted for a Shared Reservation or a Dynamic Workload Scheduler.\n", "\n", "# Import the necessary packages\n", "! 
git clone https://github.com/GoogleCloudPlatform/vertex-ai-samples.git\n", @@ -267,12 +272,12 @@ "pretrained_model_id = f\"gs://vertex-model-garden-public-us/{base_model_id}\"\n", "\n", "# The pre-built training docker image.\n", - "TRAIN_DOCKER_URI = \"us-docker.pkg.dev/vertex-ai/vertex-vision-model-garden-dockers/pytorch-peft-train:20240724_0936_RC00\"\n", + "TRAIN_DOCKER_URI = \"us-docker.pkg.dev/vertex-ai-restricted/vertex-vision-model-garden-dockers/pytorch-peft-train:20240724_0936_RC00\"\n", "\n", "# @markdown Batch size for finetuning.\n", "per_device_train_batch_size = 1 # @param{type:\"integer\"}\n", "# @markdown Number of updates steps to accumulate the gradients for, before performing a backward/update pass.\n", - "gradient_accumulation_steps = 8 # @param{type:\"integer\"}\n", + "gradient_accumulation_steps = 4 # @param{type:\"integer\"}\n", "# @markdown Maximum sequence length.\n", "max_seq_length = 4096 # @param{type:\"integer\"}\n", "# @markdown Setting a positive `max_steps` here will override `num_epochs`.\n", @@ -308,12 +313,9 @@ "# Worker pool spec for 4bit finetuning.\n", "accelerator_type = \"NVIDIA_A100_80GB\" # @param[\"NVIDIA_A100_80GB\"]\n", "\n", - "if accelerator_type == \"NVIDIA_L4\":\n", - " accelerator_count = 4\n", - " machine_type = \"g2-standard-48\"\n", - "elif accelerator_type == \"NVIDIA_A100_80GB\":\n", - " accelerator_count = 4\n", - " machine_type = \"a2-ultragpu-4g\"\n", + "if accelerator_type == \"NVIDIA_A100_80GB\":\n", + " accelerator_count = 8\n", + " machine_type = \"a2-ultragpu-8g\"\n", "else:\n", " raise ValueError(f\"Unsupported accelerator type: {accelerator_type}\")\n", "\n", @@ -325,6 +327,7 @@ " accelerator_type=accelerator_type,\n", " accelerator_count=accelerator_count,\n", " is_for_training=True,\n", + " is_restricted_image=True,\n", ")\n", "\n", "# Setup training job.\n", @@ -381,7 +384,6 @@ " f\"--template={template}\",\n", "] + eval_args\n", "\n", - "\n", "# Create TensorBoard\n", "tensorboard = 
aiplatform.Tensorboard.create(job_name)\n", "exp = aiplatform.TensorboardExperiment.create(\n", @@ -393,10 +395,11 @@ " container_uri=TRAIN_DOCKER_URI,\n", ")\n", "\n", + "print(\"Running training job with args:\")\n", + "print(\" \\\\\\n\".join(train_job_args))\n", "# Pass training arguments and launch job.\n", "train_job.run(\n", " args=train_job_args,\n", - " environment_variables={\"WANDB_DISABLED\": True},\n", " replica_count=replica_count,\n", " machine_type=machine_type,\n", " accelerator_type=accelerator_type,\n",