Commit
Merge pull request #51 from run-ai/feature/ray
Feature // Ray integration
razrotenberg authored Jul 4, 2023
2 parents 235b3cb + 31158b1 commit 90a78f9
Showing 9 changed files with 129 additions and 5 deletions.
2 changes: 1 addition & 1 deletion docs/source/development/development.rst
@@ -29,7 +29,7 @@ Install the Python package in editable mode with the following command:

.. code-block:: shell

-   pip install -e .
+   pip install -e .[dev]

Then run the following commands to configure your terminal:

4 changes: 2 additions & 2 deletions docs/source/development/reference.rst
@@ -91,8 +91,8 @@ Default is :code:`2`.

:code:`GENV_MOCK_DEVICE_TOTAL_MEMORY`

- Total device memory in the :code:`nvidia-smi` mock shim.
- Default is :code:`16g`.
+ Total device memory.
+ Used in the :code:`nvidia-smi` mock shim and :code:`@genv.ray.remote`.
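For illustration, a memory string such as the default :code:`16g` resolves to a byte count along these lines. This is a hypothetical sketch, not Genv's actual converter (the commit uses :code:`genv.utils.memory_to_bytes` for this, whose implementation may differ):

```python
def memory_to_bytes(value: str) -> int:
    """Illustrative conversion of a memory string like '16g' to bytes.

    Not Genv's implementation; shown only to clarify what values such as
    GENV_MOCK_DEVICE_TOTAL_MEMORY='16g' mean.
    """
    units = {"b": 1, "k": 1024, "m": 1024**2, "g": 1024**3}
    suffix = value[-1].lower()
    if suffix in units:
        return int(float(value[:-1]) * units[suffix])
    return int(value)  # plain byte count, no suffix

print(memory_to_bytes("16g"))  # 17179869184
```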

----

4 changes: 4 additions & 0 deletions docs/source/overview/installation.rst
@@ -107,6 +107,10 @@ Docker
------
To install the :code:`genv-docker` refer to the Genv container toolkit :doc:`installation <../docker/installation>` page.

+ Ray
+ ---
+ To install the Ray integration of Genv, read :ref:`here <Using Ray>`.

.. _Remove Old Version:

Remove Old Version
2 changes: 1 addition & 1 deletion docs/source/overview/overview.rst
@@ -20,7 +20,7 @@ Over time, the project developed into a complete GPU cluster management tool wit

At its core, Genv is a `Python package <https://pypi.org/project/genv>`__ that provisions GPUs to environments and keeps its state as :ref:`files <Files>` at :code:`/var/tmp/genv`.

- On top of this core layer, Genv has a CLI (i.e. the command :code:`genv`), :doc:`Python SDK <../usage/python-sdk>` and :doc:`integrations <installation>` with many common tools and environments such as terminal, containers (e.g. :code:`docker`), Visual Studio Code, JupyterLab and PyCharm.
+ On top of this core layer, Genv has a CLI (i.e. the command :code:`genv`), :doc:`Python SDK <../usage/python-sdk>` and :doc:`integrations <installation>` with many common tools and environments such as terminal, containers (e.g. :code:`docker`), Ray, Visual Studio Code, JupyterLab and PyCharm.

.. figure:: overview.png

57 changes: 57 additions & 0 deletions docs/source/usage/advanced-usage.rst
@@ -264,3 +264,60 @@ For example:
.. [#] `Over-allocation - Wikipedia <https://en.wikipedia.org/wiki/Thin_provisioning#Over-allocation>`_
.. [#] `flock(1) - Linux manual page <https://man7.org/linux/man-pages/man1/flock.1.html>`_
.. _Using Ray:

Using Ray
---------
Genv supports activating Ray tasks as Genv environments.

This can be useful for enforcing the GPU resources used by your Ray workers.

For example, as described `here <https://docs.ray.io/en/latest/ray-core/tasks/using-ray-with-gpus.html#fractional-gpus>`__, when using fractional GPUs with Ray, it is the user's responsibility to make sure that the individual tasks don't use more than their share of the GPU memory.

Using Genv's Ray integration, you can use Genv :doc:`enforcement <./enforcement>` capabilities to ensure that your Ray tasks do not use more GPU resources than provided by Ray.

Installation
~~~~~~~~~~~~
Because your Ray tasks run remotely, Genv must be installed on the remote nodes.
This can be done in several ways.

The `preferable <https://docs.ray.io/en/latest/ray-core/handling-dependencies.html#environment-dependencies>`__ way is to install Genv :ref:`using <Install Using pip>` :code:`pip` on the remote nodes manually with the command:

.. code-block:: shell

   pip install genv[ray]

If you prefer not to, you can use Ray's `runtime environments <https://docs.ray.io/en/latest/ray-core/handling-dependencies.html#runtime-environments>`__ to install Genv in your remote Ray workers.

You can use the argument :code:`pip` of :code:`ray.init`.
For example:

.. code-block:: python

   ray.init(runtime_env={"pip": ["genv"]})

If this does not work for you, you can use the argument :code:`py_modules` instead.
For example:

.. code-block:: python

   import genv

   ray.init(runtime_env={"py_modules": [genv]})

Usage
~~~~~
To activate a Ray task as a Genv environment on the remote host, just replace the :code:`ray.remote` decorator with :code:`genv.ray.remote`.
For example:

.. code-block:: python

   @genv.ray.remote(num_gpus=0.5)
   def foo():
       env_config = genv.sdk.configuration()
       env_devices = genv.sdk.attached()

       print(
           f"Running in Genv environment '{env_config.name}' which is attached to device(s) at index {','.join(map(str, env_devices))}"
       )
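For fractional :code:`num_gpus`, the decorator in this commit caps the environment's GPU memory at the device total times the fraction (see :code:`genv/_ray/remote.py` in this changeset). That arithmetic can be sketched standalone; the function name here is ours, for illustration only:

```python
import math

def gpu_memory_cap(total_bytes: int, num_gpus: float) -> int:
    """Mirror of the cap @genv.ray.remote computes for fractional GPUs:
    floor(total device memory * num_gpus). Whole-GPU requests instead use
    math.ceil(num_gpus) devices with no memory cap.
    """
    return math.floor(total_bytes * num_gpus)

# A 16 GiB device shared with num_gpus=0.5 yields an 8 GiB cap.
print(gpu_memory_cap(16 * 1024**3, 0.5))  # 8589934592
```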
1 change: 1 addition & 0 deletions genv/_ray/__init__.py
@@ -0,0 +1 @@
from .remote import remote
57 changes: 57 additions & 0 deletions genv/_ray/remote.py
@@ -0,0 +1,57 @@
import math
import os

import pynvml
import ray

import genv


def remote(**options):
    """
    Wraps a Ray remote function with a Genv environment.

    Inspired by https://github.com/ray-project/ray/blob/ray-2.5.1/python/ray/_private/worker.py#L3059
    """

    num_gpus = options.get("num_gpus", None)

    if num_gpus is None:
        raise ValueError(
            "The @genv.ray.remote decorator must be applied only when the argument 'num_gpus' is being used."
        )

    def _decorator(function):
        @ray.remote(**options)
        def _wrapper(*args, **kwargs):
            indices = ray.get_gpu_ids()

            config = genv.sdk.Env.Config(
                name=f"ray/{os.getpid()}", gpus=math.ceil(num_gpus)
            )

            if 0 < num_gpus < 1:
                if "GENV_MOCK_DEVICE_TOTAL_MEMORY" in os.environ:
                    total_bytes = genv.utils.memory_to_bytes(
                        os.environ["GENV_MOCK_DEVICE_TOTAL_MEMORY"]
                    )
                else:
                    index = indices[0]

                    pynvml.nvmlInit()
                    handle = pynvml.nvmlDeviceGetHandleByIndex(index)
                    memory_info = pynvml.nvmlDeviceGetMemoryInfo(handle)
                    total_bytes = memory_info.total

                config.gpu_memory = str(math.floor(total_bytes * num_gpus))

            with genv.sdk.activate():
                genv.sdk.configure(config)

                # TODO(raz): support attaching to multiple indices at once
                for index in indices:
                    genv.sdk.attach(index=index, allow_over_subscription=True)

                return function(*args, **kwargs)

        return _wrapper

    return _decorator
1 change: 1 addition & 0 deletions genv/ray/__init__.py
@@ -0,0 +1 @@
from .._ray import *
6 changes: 5 additions & 1 deletion setup.py
@@ -30,5 +30,9 @@
        ]
    },
    python_requires=">=3.7",
-   extras_require={"monitor": ["prometheus_client"]},
+   extras_require={
+       "dev": ["black"],
+       "monitor": ["prometheus_client"],
+       "ray": ["ray", "pynvml"],
+   },
)
