skypilot-org · romilbhardwaj · Jan 3, 2025 · Dec 12, 2024 · Dec 13, 2024 · Dec 13, 2024
diff --git a/sky/backends/backend_utils.py b/sky/backends/backend_utils.py
@@ -916,6 +916,13 @@ def write_cluster_config(
             tmp_yaml_path,
             cluster_config_overrides=to_provision.cluster_config_overrides)
         kubernetes_utils.combine_metadata_fields(tmp_yaml_path)
+        yaml_obj = common_utils.read_yaml(tmp_yaml_path)
+        pod_config = yaml_obj['available_node_types']['ray_head_default'][
+            'node_config']
+        valid, message = kubernetes_utils.check_pod_config(pod_config)
+        if not valid:
+            raise exceptions.InvalidCloudConfigs(
+                f'Invalid pod_config. Details: {message}')
 
     if dryrun:
         # If dryrun, return the unfinished tmp yaml path.

diff --git a/sky/provision/kubernetes/utils.py b/sky/provision/kubernetes/utils.py
@@ -892,6 +892,52 @@ def check_credentials(context: Optional[str],
         return True, None
 
 
+def check_pod_config(pod_config: dict) \
+    -> Tuple[bool, Optional[str]]:
+    """Check if the pod_config is a valid pod config
+
+    Using deserialize api to check the pod_config is valid or not.
+
+    Returns:
+        bool: True if pod_config is valid.
+        str: Error message about why the pod_config is invalid, None otherwise.
+    """
+    errors = []
+    # This api_client won't be used to send any requests, so there is no need to
+    # load kubeconfig
+    api_client = kubernetes.kubernetes.client.ApiClient()
+
+    # Used for kubernetes api_client deserialize function, the function will use
+    # data attr, the detail ref:
+    # https://github.com/kubernetes-client/python/blob/master/kubernetes/client/api_client.py#L244
+    class InnerResponse():
+
+        def __init__(self, data: dict):
+            self.data = json.dumps(data)
+
+    try:
+        # Validate metadata if present
+        if 'metadata' in pod_config:
+            try:
+                value = InnerResponse(pod_config['metadata'])
+                api_client.deserialize(
+                    value, kubernetes.kubernetes.client.V1ObjectMeta)
+            except ValueError as e:
+                errors.append(f'Invalid metadata: {str(e)}')
+        # Validate spec if present
+        if 'spec' in pod_config:
+            try:
+                value = InnerResponse(pod_config['spec'])
+                api_client.deserialize(value,
+                                       kubernetes.kubernetes.client.V1PodSpec)
+            except ValueError as e:
+                errors.append(f'Invalid spec: {str(e)}')
+        return len(errors) == 0, '.'.join(errors)
+    except Exception as e:  # pylint: disable=broad-except
+        errors.append(f'Validation error: {str(e)}')
+        return False, '.'.join(errors)
+
+
 def is_kubeconfig_exec_auth(
         context: Optional[str] = None) -> Tuple[bool, Optional[str]]:
     """Checks if the kubeconfig file uses exec-based authentication

diff --git a/tests/test_config.py b/tests/test_config.py
@@ -7,6 +7,7 @@
 
 import sky
 from sky import skypilot_config
+import sky.exceptions
 from sky.skylet import constants
 from sky.utils import common_utils
 from sky.utils import kubernetes_enums
@@ -99,6 +100,29 @@ def _create_task_yaml_file(task_file_path: pathlib.Path) -> None:
         """))
 
 
+def _create_invalid_config_yaml_file(task_file_path: pathlib.Path) -> None:
+    task_file_path.write_text(
+        textwrap.dedent("""\
+        experimental:
+            config_overrides:
+                kubernetes:
+                    pod_config:
+                        metadata:
+                            labels:
+                                test-key: test-value
+                            annotations:
+                                abc: def
+                        spec:
+                            containers:
+                                - name:
+                            imagePullSecrets:
+                                - name: my-secret-2
+
+        setup: echo 'Setting up...'
+        run: echo 'Running...'
+        """))
+
+
 def test_nested_config(monkeypatch) -> None:
     """Test that the nested config works."""
     config = skypilot_config.Config()
@@ -335,6 +359,28 @@ def test_k8s_config_with_override(monkeypatch, tmp_path,
     assert cluster_pod_config['spec']['runtimeClassName'] == 'nvidia'
 
 
+def test_k8s_config_with_invalid_config(monkeypatch, tmp_path,
+                                        enable_all_clouds) -> None:
+    config_path = tmp_path / 'config.yaml'
+    _create_config_file(config_path)
+    monkeypatch.setattr(skypilot_config, 'CONFIG_PATH', config_path)
+
+    _reload_config()
+    task_path = tmp_path / 'task.yaml'
+    _create_invalid_config_yaml_file(task_path)
+    task = sky.Task.from_yaml(task_path)
+
+    # Test Kubernetes pod_config invalid
+    cluster_name = 'test_k8s_config_with_invalid_config'
+    task.set_resources_override({'cloud': sky.Kubernetes()})
+    exception_occurred = False
+    try:
+        sky.launch(task, cluster_name=cluster_name, dryrun=True)
+    except sky.exceptions.ResourcesUnavailableError:
+        exception_occurred = True
+    assert exception_occurred
+
+
 def test_gcp_config_with_override(monkeypatch, tmp_path,
                                   enable_all_clouds) -> None:
     config_path = tmp_path / 'config.yaml'