Merge pull request #4607 from voxel51/feature/update-index-stats
Added index statistics to dataset.stats
brimoor authored Aug 5, 2024
2 parents 87bbe27 + 90558c0 commit 1fcc7b7
Showing 3 changed files with 103 additions and 7 deletions.
52 changes: 48 additions & 4 deletions fiftyone/core/collections.py
@@ -567,24 +567,33 @@ def summary(self):
"""
raise NotImplementedError("Subclass must implement summary()")

def stats(self, include_media=False, compressed=False):
def stats(
self,
include_media=False,
include_indexes=False,
compressed=False,
):
"""Returns stats about the collection on disk.
The ``samples`` keys refer to the sample documents stored in the
database.
For video datasets, the ``frames`` keys refer to the frame documents
stored in the database.
The ``media`` keys refer to the raw media associated with each sample
on disk.
For video datasets, the ``frames`` keys refer to the frame documents
stored in the database.
The ``index[es]`` keys refer to the indexes associated with the
dataset.
Note that dataset-level metadata such as annotation runs are not
included in this computation.
Args:
include_media (False): whether to include stats about the size of
the raw media in the collection
include_indexes (False): whether to include stats on the dataset's indexes
compressed (False): whether to return the sizes of collections in
their compressed form on disk (True) or the logical
uncompressed size of the collections (False). This option is
@@ -630,6 +639,20 @@ def stats(self, include_media=False, compressed=False):
stats["media_size"] = etau.to_human_bytes_str(media_bytes)
total_bytes += media_bytes

if include_indexes:
ii = self.get_index_information(include_size=True)
index_bytes = {k: v["size"] for k, v in ii.items()}
indexes_bytes = sum(index_bytes.values())

stats["indexes_count"] = len(index_bytes)
stats["indexes_bytes"] = indexes_bytes
stats["indexes_size"] = etau.to_human_bytes_str(indexes_bytes)
stats["index_bytes"] = index_bytes
stats["index_sizes"] = {
k: etau.to_human_bytes_str(v) for k, v in index_bytes.items()
}
total_bytes += indexes_bytes

stats["total_bytes"] = total_bytes
stats["total_size"] = etau.to_human_bytes_str(total_bytes)

@@ -9028,13 +9051,15 @@ def list_indexes(self):
"""
return list(self.get_index_information().keys())

def get_index_information(self):
def get_index_information(self, include_size=False):
"""Returns a dictionary of information about the indexes on this
collection.
See :meth:`pymongo:pymongo.collection.Collection.index_information` for
details on the structure of this dictionary.
Args:
include_size (False): whether to include the size of each index
Returns:
a dict mapping index names to info dicts
"""
@@ -9043,6 +9068,16 @@ def get_index_information(self):
# Sample-level indexes
fields_map = self._get_db_fields_map(reverse=True)
sample_info = self._dataset._sample_collection.index_information()

if include_size:
conn = foo.get_db_conn()
cs = conn.command(
"collstats", self._dataset._sample_collection_name
)
for key, size in cs["indexSizes"].items():
if key in sample_info:
sample_info[key]["size"] = size

for key, info in sample_info.items():
if len(info["key"]) == 1:
field = info["key"][0][0]
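The size lookup above uses MongoDB's ``collstats`` command, whose ``indexSizes`` field reports each index's on-disk size in bytes. A sketch of the raw call, assuming ``foo`` is the ``fiftyone.core.odm`` alias used in this module (the collection name is hypothetical):

import fiftyone.core.odm as foo

conn = foo.get_db_conn()
cs = conn.command("collstats", "samples.abc123")  # hypothetical collection name
print(cs["indexSizes"])  # maps index name -> on-disk size in bytes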
@@ -9054,6 +9089,15 @@
# Frame-level indexes
fields_map = self._get_db_fields_map(frames=True, reverse=True)
frame_info = self._dataset._frame_collection.index_information()

if include_size:
cs = conn.command(
"collstats", self._dataset._frame_collection_name
)
for key, size in cs["indexSizes"].items():
if key in frame_info:
frame_info[key]["size"] = size

for key, info in frame_info.items():
if len(info["key"]) == 1:
field = info["key"][0][0]
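With ``include_size=True``, each info dict gains a ``size`` entry (in bytes) alongside the standard pymongo fields. A sketch of the expected shape (values illustrative):

info = dataset.get_index_information(include_size=True)
# e.g. info["filepath"] might look like:
# {"v": 2, "key": [("filepath", 1)], "size": 20480}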
30 changes: 27 additions & 3 deletions fiftyone/core/dataset.py
@@ -56,6 +56,7 @@

class DatasetNotFoundError(ValueError):
"""Exception raised when a dataset is not found."""

def __init__(self, name):
self._dataset_name = name
super().__init__(f"Dataset {name} not found")
@@ -1080,24 +1081,33 @@ def summary(self):

return "\n".join(lines)

def stats(self, include_media=False, compressed=False):
def stats(
self,
include_media=False,
include_indexes=False,
compressed=False,
):
"""Returns stats about the dataset on disk.
The ``samples`` keys refer to the sample documents stored in the
database.
For video datasets, the ``frames`` keys refer to the frame documents
stored in the database.
The ``media`` keys refer to the raw media associated with each sample
on disk.
For video datasets, the ``frames`` keys refer to the frame documents
stored in the database.
The ``index[es]`` keys refer to the indexes associated with the
dataset.
Note that dataset-level metadata such as annotation runs are not
included in this computation.
Args:
include_media (False): whether to include stats about the size of
the raw media in the dataset
include_indexes (False): whether to include stats on the dataset's indexes
compressed (False): whether to return the sizes of collections in
their compressed form on disk (True) or the logical
uncompressed size of the collections (False)
@@ -1138,6 +1148,20 @@ def stats(self, include_media=False, compressed=False):
stats["media_size"] = etau.to_human_bytes_str(media_bytes)
total_bytes += media_bytes

if include_indexes:
ii = self.get_index_information(include_size=True)
index_bytes = {k: v["size"] for k, v in ii.items()}
indexes_bytes = sum(index_bytes.values())

stats["indexes_count"] = len(index_bytes)
stats["indexes_bytes"] = indexes_bytes
stats["indexes_size"] = etau.to_human_bytes_str(indexes_bytes)
stats["index_bytes"] = index_bytes
stats["index_sizes"] = {
k: etau.to_human_bytes_str(v) for k, v in index_bytes.items()
}
total_bytes += indexes_bytes

stats["total_bytes"] = total_bytes
stats["total_size"] = etau.to_human_bytes_str(total_bytes)

28 changes: 28 additions & 0 deletions tests/unittests/dataset_tests.py
@@ -532,6 +532,34 @@ def test_indexes(self):
with self.assertRaises(ValueError):
dataset.create_index("non_existent_field")

@drop_datasets
def test_index_sizes(self):
gt = fo.Detections(detections=[fo.Detection(label="foo")])
sample = fo.Sample(filepath="video.mp4", gt=gt)
sample.frames[1] = fo.Frame(gt=gt)

dataset = fo.Dataset()
dataset.add_sample(sample)

dataset.create_index("gt.detections.label")
dataset.create_index("frames.gt.detections.label")

info = dataset.get_index_information(include_size=True)

indexes = [
"id",
"filepath",
"gt.detections.label",
"frames.id",
"frames._sample_id_1_frame_number_1",
"frames.gt.detections.label",
]

self.assertListEqual(dataset.list_indexes(), indexes)
self.assertSetEqual(set(info.keys()), set(indexes))
for d in info.values():
self.assertTrue(d.get("size") is not None)
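Building on the test above, the per-index sizes also make it easy to spot the heaviest index; a minimal sketch reusing ``info``:

largest = max(info, key=lambda name: info[name]["size"])
print(largest, info[largest]["size"])  # index name and size in bytes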

@drop_datasets
def test_iter_samples(self):
dataset = fo.Dataset()
