Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Deprecate v5litepod terminology for v5e #35

Draft
wants to merge 1 commit into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
28 changes: 16 additions & 12 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,10 @@ return the hardware back to the shared pool when they complete, developers can
achieve better use of finite hardware resources. And automated tests can run
overnight while resources tend to be underutilized.

## Supported TPU Types:
* v5e
* v4

# Installation
To install xpk, run the following command:

Expand Down Expand Up @@ -73,7 +77,7 @@ all zones.
gcloud compute reservations list --project=$PROJECT_ID
# Run cluster create with reservation
python3 xpk.py cluster create \
--cluster xpk-test --tpu-type=v5litepod-256 \
--cluster xpk-test --tpu-type=v5e-256 \
--num-slices=2 \
--reservation=$RESERVATION_ID
```
Expand All @@ -82,7 +86,7 @@ all zones.

```shell
python3 xpk.py cluster create \
--cluster xpk-test --tpu-type=v5litepod-16 \
--cluster xpk-test --tpu-type=v5e-16 \
--num-slices=4 --on-demand
```

Expand All @@ -93,7 +97,7 @@ all zones.

```shell
python3 xpk.py cluster create \
--cluster xpk-test --tpu-type=v5litepod-16 \
--cluster xpk-test --tpu-type=v5e-16 \
--num-slices=4
```

Expand All @@ -102,7 +106,7 @@ all zones.

```shell
python3 xpk.py cluster create \
--cluster xpk-test --tpu-type=v5litepod-16 \
--cluster xpk-test --tpu-type=v5e-16 \
--num-slices=8
```

Expand All @@ -112,13 +116,13 @@ all zones.

```shell
python3 xpk.py cluster create \
--cluster xpk-test --tpu-type=v5litepod-16 \
--cluster xpk-test --tpu-type=v5e-16 \
--num-slices=6

# Skip delete prompts using --force.

python3 xpk.py cluster create --force \
--cluster xpk-test --tpu-type=v5litepod-16 \
--cluster xpk-test --tpu-type=v5e-16 \
--num-slices=6

```
Expand Down Expand Up @@ -157,7 +161,7 @@ all zones.
```shell
python3 xpk.py workload create \
--workload xpk-test-workload --command "echo goodbye" --cluster \
xpk-test --tpu-type=v5litepod-16
xpk-test --tpu-type=v5e-16
```

### Workload Priority and Preemption
Expand All @@ -182,7 +186,7 @@ all zones.
```shell
python3 xpk.py workload create \
--workload xpk-test-medium-workload --command "echo goodbye" --cluster \
xpk-test --tpu-type=v5litepod-16 --priority=medium
xpk-test --tpu-type=v5e-16 --priority=medium
```

## Workload Delete
Expand Down Expand Up @@ -259,15 +263,15 @@ This flow pulls the `--script-dir` into the `--base-docker-image` and runs the n
echo -e '#!/bin/bash \n echo "Hello world from a test script!"' > test.sh
python3 xpk.py workload create --cluster xpk-test \
--workload xpk-test-workload-base-image --command "bash test.sh" \
--tpu-type=v5litepod-16 --num-slices=1
--tpu-type=v5e-16 --num-slices=1
```

* Recommended Flow For Normal Sized Jobs (fewer than 10k accelerators):
```shell
python3 xpk.py workload create --cluster xpk-test \
--workload xpk-test-workload-base-image --command "bash custom_script.sh" \
--base-docker-image=gcr.io/your_dependencies_docker_image \
--tpu-type=v5litepod-16 --num-slices=1
--tpu-type=v5e-16 --num-slices=1
```

## Optional Direct Docker Image Configuration: `--docker-image`
Expand All @@ -279,7 +283,7 @@ workload.
```shell
python3 xpk.py workload create --cluster xpk-test \
--workload xpk-test-workload-base-image --command "bash test.sh" \
--tpu-type=v5litepod-16 --num-slices=1 --docker-image=gcr.io/your_docker_image
--tpu-type=v5e-16 --num-slices=1 --docker-image=gcr.io/your_docker_image
```

* Recommended Flow For Large Sized Jobs (more than 10k accelerators):
Expand All @@ -289,7 +293,7 @@ workload.
# Run workload create with the same image.
python3 xpk.py workload create --cluster xpk-test \
--workload xpk-test-workload-base-image --command "bash test.sh" \
--tpu-type=v5litepod-16 --num-slices=1 --docker-image=gcr.io/your_docker_image
--tpu-type=v5e-16 --num-slices=1 --docker-image=gcr.io/your_docker_image
```

# More advanced facts:
Expand Down
37 changes: 29 additions & 8 deletions xpk.py
Original file line number Diff line number Diff line change
Expand Up @@ -256,19 +256,19 @@ class SystemCharacteristics:
MODIFICATIONS TO UserFacingNameToSystemCharacteristics IN MaxText/accelerator_to_spec_map.py !!!!! """
# vvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvv
UserFacingNameToSystemCharacteristics = {
'v5litepod-16': SystemCharacteristics(
'v5e-16': SystemCharacteristics(
'4x4', 4, 'tpu-v5-lite-podslice', 'ct5lp-hightpu-4t', 4
),
'v5litepod-32': SystemCharacteristics(
'v5e-32': SystemCharacteristics(
'4x8', 8, 'tpu-v5-lite-podslice', 'ct5lp-hightpu-4t', 4
),
'v5litepod-64': SystemCharacteristics(
'v5e-64': SystemCharacteristics(
'8x8', 16, 'tpu-v5-lite-podslice', 'ct5lp-hightpu-4t', 4
),
'v5litepod-128': SystemCharacteristics(
'v5e-128': SystemCharacteristics(
'8x16', 32, 'tpu-v5-lite-podslice', 'ct5lp-hightpu-4t', 4
),
'v5litepod-256': SystemCharacteristics(
'v5e-256': SystemCharacteristics(
'16x16', 64, 'tpu-v5-lite-podslice', 'ct5lp-hightpu-4t', 4
),
'v4-8': SystemCharacteristics(
Expand Down Expand Up @@ -1128,6 +1128,25 @@ def default_subcommand_function(_args) -> int: # args is unused, so pylint: dis
return 0


def exit_if_v5litepod_found(args):
  """Reject the deprecated `v5litepod` TPU type name with a helpful error.

  If `args.tpu_type` starts with `v5litepod`, prints an error that names the
  equivalent `v5e` value and then calls `xpk_exit(1)`. Otherwise does nothing.

  Args:
    args: user provided arguments for running the command. Only
      `args.tpu_type` is read, and it is expected to be a string.
  """
  deprecated_tpu_type = 'v5litepod'
  replacement_tpu_type = 'v5e'
  if not args.tpu_type.startswith(deprecated_tpu_type):
    return

  # Only the leading prefix was matched above, so swap just that occurrence.
  new_tpu_type = args.tpu_type.replace(
      deprecated_tpu_type, replacement_tpu_type, 1
  )
  # The closing period sits outside the backticks so the suggested flag can
  # be copied verbatim.
  xpk_print(
      f'Error: `{deprecated_tpu_type}` is deprecated. Please use'
      f' `{replacement_tpu_type}` instead. User provided'
      f' `--tpu-type={args.tpu_type}` which should be'
      f' `--tpu-type={new_tpu_type}`.'
  )
  # NOTE(review): xpk_exit is defined elsewhere in this file; presumably it
  # terminates the process with the given exit code — confirm.
  xpk_exit(1)


def cluster_create(args) -> int:
"""Function around cluster creation.

Expand All @@ -1137,6 +1156,7 @@ def cluster_create(args) -> int:
Returns:
0 if successful and 1 otherwise.
"""
exit_if_v5litepod_found(args)
system_characteristics = UserFacingNameToSystemCharacteristics[args.tpu_type]

xpk_print(f'Starting cluster create for cluster {args.cluster}:', flush=True)
Expand Down Expand Up @@ -1543,6 +1563,7 @@ def workload_create(args) -> int:
xpk_exit(1)

xpk_print('Starting workload create', flush=True)
exit_if_v5litepod_found(args)
system = UserFacingNameToSystemCharacteristics[args.tpu_type]

setup_docker_image_code, docker_image = setup_docker_image(args)
Expand Down Expand Up @@ -1824,8 +1845,8 @@ def directory_path_type(value):
cluster_create_required_arguments.add_argument(
'--tpu-type',
type=str,
default='v5litepod-16',
help='The type of the TPU. v5litepod and v4 are the only supported types.',
default='v5e-16',
help='The type of the TPU.',
required=True,
)

Expand Down Expand Up @@ -2106,7 +2127,7 @@ def directory_path_type(value):
'--tpu-type',
type=str,
default=None,
help='The tpu type to use, v5litepod-16, etc.',
help='The tpu type to use, v5e-16, etc.',
required=True,
)
workload_create_parser_required_arguments.add_argument(
Expand Down