Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[Tests] fix smoke test race condition on first run #4494

Open
wants to merge 7 commits into
base: master
Choose a base branch
from
Open
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
77 changes: 70 additions & 7 deletions sky/provision/aws/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -150,6 +150,66 @@ def _get_role(role_name: str):
f'{role_name}{colorama.Style.RESET_ALL} from AWS.')
raise exc

def _create_instance_profile_if_not_exists(instance_profile_name: str):
aylei marked this conversation as resolved.
Show resolved Hide resolved
try:
iam.meta.client.create_instance_profile(
InstanceProfileName=instance_profile_name)
except aws.botocore_exceptions().ClientError as exc:
if exc.response.get('Error',
{}).get('Code') == 'EntityAlreadyExists':
return
else:
utils.handle_boto_error(
exc,
f'Failed to create instance profile {colorama.Style.BRIGHT}'
f'{instance_profile_name}{colorama.Style.RESET_ALL} in AWS.'
)
raise exc

def _create_role_if_not_exists(role_name: str, policy_doc: Dict[str, Any]):
try:
iam.create_role(RoleName=role_name,
AssumeRolePolicyDocument=json.dumps(policy_doc))
except aws.botocore_exceptions().ClientError as exc:
if exc.response.get('Error',
{}).get('Code') == 'EntityAlreadyExists':
return
else:
utils.handle_boto_error(
exc, f'Failed to create role {colorama.Style.BRIGHT}'
f'{role_name}{colorama.Style.RESET_ALL} in AWS.')
raise exc

def _check_instance_profile_role(profile: Any, role_name: str):
if profile.roles and profile.roles[0].name != role_name:
# If the associated role is not the role we expect, error out
# to user instead of silently overriding the role.
Copy link
Collaborator

@zpoint zpoint Dec 28, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Do we still need the comment here? The logger.fatal statement below is quite clear.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Good catch, removed the log for simplicity~

logger.fatal(f'The instance profile {profile.name} already has '
f'an associated role {profile.roles[0].name}, but the '
f'role {role.name} is not the same as the expected '
f'role. Please remove the existing role from the '
f'instance profile and try again.')
raise SystemExit(1)
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

How about we use the same handle_boto_error here as well to keep the error handling consistent and raise the same exc error?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

IIUC, boto error is specific to the errors returned by AWS boto3 SDK, simulating a boto error here adding extra indirections. Besides, handle_boto_error will prefix error message with Boto3 error: , which looks misleading, wdyt?

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Makes sense. I was referring to raising the error from the caller directly, but I am ok with a custom exception.

If we are raising SystemExit here, it might cause the whole system to exit as we don't catch the system exit explicitly.
https://github.com/skypilot-org/skypilot/blob/master/sky/provision/provisioner.py#L136-L197

Just a note, we decide the failover across clouds based on the error message here: https://github.com/skypilot-org/skypilot/blob/master/sky/backends/cloud_vm_ray_backend.py#L1119-L1137. I assume if we fail in this case, we should skip the whole AWS cloud.


def _ensure_instance_profile_role(profile: Any, role: Any):
try:
profile.add_role(RoleName=role.name)
except aws.botocore_exceptions().ClientError as exc:
# AddRoleToInstanceProfile is not idempotent. Adding a role to an
# InstanceProfile that already has an associated role will cause
# LimitExceeded error, even if the two roles are identical.
# see also: https://docs.aws.amazon.com/IAM/latest/APIReference/API_AddRoleToInstanceProfile.html # pylint: disable=line-too-long
if exc.response.get('Error', {}).get('Code') == 'LimitExceeded':
_check_instance_profile_role(profile, role)
return
else:
utils.handle_boto_error(
exc, f'Failed to add role {colorama.Style.BRIGHT}'
f'{role.name}{colorama.Style.RESET_ALL} to instance '
f'profile {colorama.Style.BRIGHT}{profile.name}'
f'{colorama.Style.RESET_ALL} in AWS.')
raise exc

instance_profile_name = DEFAULT_SKYPILOT_INSTANCE_PROFILE
profile = _get_instance_profile(instance_profile_name)

Expand All @@ -158,14 +218,16 @@ def _get_role(role_name: str):
f'Creating new IAM instance profile {colorama.Style.BRIGHT}'
f'{instance_profile_name}{colorama.Style.RESET_ALL} for use as the '
'default.')
iam.meta.client.create_instance_profile(
InstanceProfileName=instance_profile_name)
_create_instance_profile_if_not_exists(instance_profile_name)
profile = _get_instance_profile(instance_profile_name)
time.sleep(15) # wait for propagation
assert profile is not None, 'Failed to create instance profile'

if not profile.roles:
role_name = DEFAULT_SKYPILOT_IAM_ROLE
role_name = DEFAULT_SKYPILOT_IAM_ROLE
if profile.roles:
_check_instance_profile_role(profile, role_name)
else:
# there is no role associated, ensure the role and associate
role = _get_role(role_name)
if role is None:
logger.info(
Expand All @@ -186,8 +248,9 @@ def _get_role(role_name: str):
'arn:aws:iam::aws:policy/AmazonS3FullAccess',
]

iam.create_role(RoleName=role_name,
AssumeRolePolicyDocument=json.dumps(policy_doc))
# TODO(aylei): need to check and reconcile the role permission
# in case of external modification.
_create_role_if_not_exists(role_name, policy_doc)
role = _get_role(role_name)
assert role is not None, 'Failed to create role'

Expand Down Expand Up @@ -217,7 +280,7 @@ def _get_role(role_name: str):
role.Policy('SkyPilotPassRolePolicy').put(
PolicyDocument=json.dumps(skypilot_pass_role_policy_doc))

profile.add_role(RoleName=role.name)
_ensure_instance_profile_role(profile, role)
time.sleep(15) # wait for propagation
return {'Arn': profile.arn}

Expand Down
Loading