Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Bugfix: use device in all Torch models #5026

Open
wants to merge 10 commits into
base: develop
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion fiftyone/utils/clip/zoo.py
Original file line number Diff line number Diff line change
Expand Up @@ -187,7 +187,7 @@ def _predict_all(self, imgs):
frame_size = (width, height)

if self._using_gpu:
imgs = imgs.cuda()
imgs = imgs.to(self.device)

text_features = self._get_text_features()
image_features = self._model.encode_image(imgs)
Expand Down
10 changes: 6 additions & 4 deletions fiftyone/utils/open_clip.py
Original file line number Diff line number Diff line change
Expand Up @@ -106,7 +106,7 @@ def _get_text_features(self):
# Tokenize text
text = self._tokenizer(prompts)
if self._using_gpu:
text = text.cuda()
text = text.to(self.device)
self._text_features = self._model.encode_text(text)

return self._text_features
Expand All @@ -118,7 +118,7 @@ def _embed_prompts(self, prompts):
# Tokenize text
text = self._tokenizer(formatted_prompts)
if self._using_gpu:
text = text.cuda()
text = text.to(self.device)
return self._model.encode_text(text)

def _get_class_logits(self, text_features, image_features):
Expand All @@ -143,9 +143,11 @@ def _predict_all(self, imgs):
frame_size = (width, height)

if self._using_gpu:
imgs = imgs.cuda()
imgs = imgs.to(self.device)
Copy link
Contributor

@coderabbitai coderabbitai bot Nov 1, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue

Consider adding error handling for device transfers.

While the device handling change is correct, the subsequent autocast context is hardcoded to "cuda". This might cause issues when running on CPU.

Apply this diff to make it device-aware:

         if self._using_gpu:
             imgs = imgs.to(self.device)
 
-        with torch.no_grad(), torch.amp.autocast("cuda"):
+        with torch.no_grad(), torch.amp.autocast(device_type=self.device.type if self._using_gpu else "cpu"):
             image_features = self._model.encode_image(imgs)
             text_features = self._get_text_features()

Also applies to: 147-152

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@jacobsela coderabbit raises an interesting point here. Does torch.amp.autocast("cuda") need to be updated?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Seems like the humans are having a chat. I'll hop back into my burrow for now. If you need me again, just tag @coderabbitai in a new comment, and I'll come hopping out!

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeah, this is valid and will cause problems if not handled. It's on my todo list for this week to review the code more thoroughly before moving further with this PR, because this comment suggests there are probably more places I haven't noticed that make hardware assumptions.


with torch.no_grad(), torch.amp.autocast("cuda"):
with torch.no_grad(), torch.amp.autocast(
device_type=self.device.type if self._using_gpu else "cpu"
):
image_features = self._model.encode_image(imgs)
text_features = self._get_text_features()

Expand Down
2 changes: 1 addition & 1 deletion fiftyone/utils/super_gradients.py
Original file line number Diff line number Diff line change
Expand Up @@ -96,7 +96,7 @@ def _load_model(self, config):
)

if self._using_gpu:
model = model.cuda()
model = model.to(self.device)

return model

Expand Down
42 changes: 22 additions & 20 deletions fiftyone/utils/transformers.py
Original file line number Diff line number Diff line change
Expand Up @@ -323,6 +323,9 @@ class FiftyOneTransformerConfig(Config, HasZooModel):
def __init__(self, d):
self.model = self.parse_raw(d, "model", default=None)
self.name_or_path = self.parse_string(d, "name_or_path", default=None)
self.device = self.parse_string(
d, "device", default="cuda" if torch.cuda.is_available() else "cpu"
)
if etau.is_str(self.model):
self.name_or_path = self.model
self.model = None
Expand Down Expand Up @@ -451,7 +454,8 @@ class FiftyOneTransformer(TransformerEmbeddingsMixin, Model):
def __init__(self, config):
self.config = config
self.model = self._load_model(config)
self.device = "cuda" if torch.cuda.is_available() else "cpu"
self.device = torch.device(self.config.device)
self.model.to(self.device)
self.image_processor = self._load_image_processor()

@property
Expand Down Expand Up @@ -496,7 +500,8 @@ def __init__(self, config):
self.config = config
self.classes = config.classes
self.model = self._load_model(config)
self.device = "cuda" if torch.cuda.is_available() else "cpu"
self.device = torch.device(self.config.device)
self.model.to(self.device)
jacobsela marked this conversation as resolved.
Show resolved Hide resolved
self.processor = self._load_processor()
self._text_prompts = None

Expand Down Expand Up @@ -581,7 +586,7 @@ def _load_model(self, config):
if config.model is not None:
return config.model

device = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device(config.device)
model = transformers.AutoModel.from_pretrained(config.name_or_path).to(
device
)
Expand Down Expand Up @@ -641,7 +646,7 @@ def _predict_from_retrieval(self, arg):
with torch.no_grad():
for text_prompt in text_prompts:
inputs = self.processor(arg, text_prompt, return_tensors="pt")
outputs = self.model(**inputs.to(self.device))
outputs = self.model(**(inputs.to(self.device)))
logits.append(outputs.logits[0, :].item())

logits = np.array(logits)
Expand Down Expand Up @@ -693,14 +698,14 @@ class FiftyOneTransformerForImageClassification(FiftyOneTransformer):
def _load_model(self, config):
if config.model is not None:
return config.model
device = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device(config.device)
return transformers.AutoModelForImageClassification.from_pretrained(
config.name_or_path
).to(device)

def _predict(self, inputs):
with torch.no_grad():
results = self.model(**inputs.to(self.device))
results = self.model(**(inputs.to(self.device)))
return to_classification(results, self.model.config.id2label)

def predict(self, arg):
Expand Down Expand Up @@ -748,7 +753,8 @@ def __init__(self, config):
self.classes = config.classes
self.processor = self._load_processor(config)
self.model = self._load_model(config)
self.device = "cuda" if torch.cuda.is_available() else "cpu"
self.device = torch.device(self.config.device)
self.model.to(self.device)
self._text_prompts = None

def _load_processor(self, config):
Expand All @@ -757,9 +763,7 @@ def _load_processor(self, config):
if config.model is not None:
name_or_path = config.model.name_or_path

return transformers.AutoProcessor.from_pretrained(name_or_path).to(
self.device
)
return transformers.AutoProcessor.from_pretrained(name_or_path)

def _load_model(self, config):
name_or_path = config.name_or_path
Expand All @@ -770,7 +774,9 @@ def _load_model(self, config):
if config.model is not None:
return config.model
else:
return _get_detector_from_processor(self.processor, name_or_path)
return _get_detector_from_processor(
self.processor, name_or_path
).to(config.device)

def _process_inputs(self, args):
text_prompts = self._get_text_prompts()
Expand All @@ -781,7 +787,7 @@ def _process_inputs(self, args):

def _predict(self, inputs, target_sizes):
with torch.no_grad():
outputs = self.model(**inputs.to(self.device))
outputs = self.model(**(inputs.to(self.device)))

results = self.processor.image_processor.post_process_object_detection(
outputs, target_sizes=target_sizes
Expand Down Expand Up @@ -821,10 +827,9 @@ class FiftyOneTransformerForObjectDetection(FiftyOneTransformer):
def _load_model(self, config):
if config.model is not None:
return config.model
device = "cuda" if torch.cuda.is_available() else "cpu"
return transformers.AutoModelForObjectDetection.from_pretrained(
config.name_or_path
).to(device)
).to(config.device)

def _predict(self, inputs, target_sizes):
with torch.no_grad():
Expand Down Expand Up @@ -875,11 +880,10 @@ def _load_model(self, config):
if config.model is not None:
model = config.model
else:
device = "cuda" if torch.cuda.is_available() else "cpu"
model = (
transformers.AutoModelForSemanticSegmentation.from_pretrained(
config.name_or_path
).to(device)
).to(config.device)
)

self.mask_targets = model.config.id2label
Expand Down Expand Up @@ -929,10 +933,9 @@ class FiftyOneTransformerForDepthEstimation(FiftyOneTransformer):
def _load_model(self, config):
if config.model is not None:
return config.model
device = "cuda" if torch.cuda.is_available() else "cpu"
return transformers.AutoModelForDepthEstimation.from_pretrained(
config.name_or_path
).to(device)
).to(config.device)

def _predict(self, inputs, target_sizes):
with torch.no_grad():
Expand Down Expand Up @@ -1084,5 +1087,4 @@ def _get_detector_from_processor(processor, model_name_or_path):
__import__(module_name, fromlist=[detector_class_name]),
detector_class_name,
)
device = "cuda" if torch.cuda.is_available() else "cpu"
return detector_class.from_pretrained(model_name_or_path).to(device)
return detector_class.from_pretrained(model_name_or_path)
6 changes: 6 additions & 0 deletions fiftyone/utils/ultralytics.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
import fiftyone.zoo.models as fozm

ultralytics = fou.lazy_import("ultralytics")
torch = fou.lazy_import("torch")
brimoor marked this conversation as resolved.
Show resolved Hide resolved


def convert_ultralytics_model(model):
Expand Down Expand Up @@ -378,6 +379,9 @@ def __init__(self, d):
self.model_name = self.parse_raw(d, "model_name", default=None)
self.model_path = self.parse_raw(d, "model_path", default=None)
self.classes = self.parse_array(d, "classes", default=None)
self.device = self.parse_string(
d, "device", default="cuda" if torch.cuda.is_available() else "cpu"
)


class FiftyOneYOLOModel(Model):
Expand All @@ -390,6 +394,8 @@ class FiftyOneYOLOModel(Model):
def __init__(self, config):
self.config = config
self.model = self._load_model(config)
self.device = torch.device(config.device)
self.model.to(self.device)

def _load_model(self, config):
if config.model is not None:
Expand Down
15 changes: 10 additions & 5 deletions fiftyone/zoo/models/manifest-torch.json
Original file line number Diff line number Diff line change
Expand Up @@ -2968,7 +2968,8 @@
"entrypoint_args": {
"repo_or_dir": "ultralytics/yolov5",
"model": "yolov5n",
"pretrained": true
"pretrained": true,
"device": "cpu"
},
"output_processor_cls": "fiftyone.utils.ultralytics.UltralyticsOutputProcessor",
"raw_inputs": true
Expand Down Expand Up @@ -2998,7 +2999,8 @@
"entrypoint_args": {
"repo_or_dir": "ultralytics/yolov5",
"model": "yolov5s",
"pretrained": true
"pretrained": true,
"device": "cpu"
},
"output_processor_cls": "fiftyone.utils.ultralytics.UltralyticsOutputProcessor",
"raw_inputs": true
Expand Down Expand Up @@ -3028,7 +3030,8 @@
"entrypoint_args": {
"repo_or_dir": "ultralytics/yolov5",
"model": "yolov5m",
"pretrained": true
"pretrained": true,
"device": "cpu"
},
"output_processor_cls": "fiftyone.utils.ultralytics.UltralyticsOutputProcessor",
"raw_inputs": true
Expand Down Expand Up @@ -3058,7 +3061,8 @@
"entrypoint_args": {
"repo_or_dir": "ultralytics/yolov5",
"model": "yolov5l",
"pretrained": true
"pretrained": true,
"device": "cpu"
},
"output_processor_cls": "fiftyone.utils.ultralytics.UltralyticsOutputProcessor",
"raw_inputs": true
Expand Down Expand Up @@ -4555,7 +4559,8 @@
"entrypoint_args": {
"repo_or_dir": "ultralytics/yolov5",
"model": "yolov5x",
"pretrained": true
"pretrained": true,
"device": "cpu"
},
"output_processor_cls": "fiftyone.utils.ultralytics.UltralyticsOutputProcessor",
"raw_inputs": true
Expand Down
Loading