From 2dfaf930a9482026ca5294a25907d6f73b40c999 Mon Sep 17 00:00:00 2001
From: minhtuevo
Date: Thu, 1 Aug 2024 15:13:25 -0700
Subject: [PATCH 1/5] Added index statistics to dataset.stats

---
 fiftyone/core/dataset.py | 11 ++++++++++-
 1 file changed, 10 insertions(+), 1 deletion(-)

diff --git a/fiftyone/core/dataset.py b/fiftyone/core/dataset.py
index d0ae130714..cc542191d0 100644
--- a/fiftyone/core/dataset.py
+++ b/fiftyone/core/dataset.py
@@ -56,6 +56,7 @@
 
 class DatasetNotFoundError(ValueError):
     """Exception raised when a dataset is not found."""
+
     def __init__(self, name):
         self._dataset_name = name
         super().__init__(f"Dataset {name} not found")
@@ -1080,7 +1081,9 @@ def summary(self):
 
         return "\n".join(lines)
 
-    def stats(self, include_media=False, compressed=False):
+    def stats(
+        self, include_media=False, compressed=False, include_indexes=False
+    ):
         """Returns stats about the dataset on disk.
 
         The ``samples`` keys refer to the sample documents stored in the
@@ -1101,6 +1104,7 @@ def stats(self, include_media=False, compressed=False):
             compressed (False): whether to return the sizes of collections
                 in their compressed form on disk (True) or the logical
                 uncompressed size of the collections (False)
+            include_indexes (False): whether to return the stats on the indexes
 
         Returns:
             a stats dict
@@ -1138,6 +1142,11 @@ def stats(self, include_media=False, compressed=False):
             stats["media_size"] = etau.to_human_bytes_str(media_bytes)
             total_bytes += media_bytes
 
+        if include_indexes:
+            stats["nindexes"] = cs["nindexes"]
+            stats["totalIndexSize"] = cs["totalIndexSize"]
+            stats["indexSizes"] = cs["indexSizes"]
+
         stats["total_bytes"] = total_bytes
         stats["total_size"] = etau.to_human_bytes_str(total_bytes)
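A minimal usage sketch for the flag added above (the dataset here is hypothetical; at this point in the series the raw MongoDB collstats keys are passed through verbatim, and all sizes are in bytes):

    import fiftyone as fo

    dataset = fo.Dataset()
    dataset.add_sample(fo.Sample(filepath="image.jpg"))  # hypothetical sample

    stats = dataset.stats(include_indexes=True)

    # raw collstats fields, passed through as-is at this stage
    print(stats["nindexes"])        # number of indexes on the sample collection
    print(stats["totalIndexSize"])  # total index size, in bytes
    print(stats["indexSizes"])      # dict mapping index name -> size in bytes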
From 00c47053bc520980353ee167524af8bb595288e5 Mon Sep 17 00:00:00 2001
From: minhtuevo
Date: Thu, 1 Aug 2024 15:50:35 -0700
Subject: [PATCH 2/5] Included size statistics in collections.py

---
 fiftyone/core/collections.py | 13 ++++++++++++-
 1 file changed, 12 insertions(+), 1 deletion(-)

diff --git a/fiftyone/core/collections.py b/fiftyone/core/collections.py
index 9eac30dc7d..0a66b1a118 100644
--- a/fiftyone/core/collections.py
+++ b/fiftyone/core/collections.py
@@ -9028,13 +9028,16 @@ def list_indexes(self):
         """
         return list(self.get_index_information().keys())
 
-    def get_index_information(self):
+    def get_index_information(self, include_size=False):
         """Returns a dictionary of information about the indexes on this
         collection.
 
         See :meth:`pymongo:pymongo.collection.Collection.index_information`
         for details on the structure of this dictionary.
 
+            include_size(False): whether to include the size of each index in the
+                collection
+
         Returns:
             a dict mapping index names to info dicts
         """
@@ -9043,6 +9046,14 @@ def get_index_information(self):
         # Sample-level indexes
         fields_map = self._get_db_fields_map(reverse=True)
         sample_info = self._dataset._sample_collection.index_information()
+
+        if include_size:
+            dataset_stats = self._dataset.stats(include_indexes=True)
+            for index_name in dataset_stats["indexSizes"]:
+                sample_info[index_name]["size"] = dataset_stats["indexSizes"][
+                    index_name
+                ]
+
         for key, info in sample_info.items():
             if len(info["key"]) == 1:
                 field = info["key"][0][0]

From 588c76c814140359f78bab6609d2ab1482ff4b28 Mon Sep 17 00:00:00 2001
From: minhtuevo
Date: Thu, 1 Aug 2024 16:31:24 -0700
Subject: [PATCH 3/5] Converted to human bytes str

---
 fiftyone/core/dataset.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/fiftyone/core/dataset.py b/fiftyone/core/dataset.py
index cc542191d0..c2bc82949b 100644
--- a/fiftyone/core/dataset.py
+++ b/fiftyone/core/dataset.py
@@ -1145,7 +1145,10 @@ def stats(
         if include_indexes:
             stats["nindexes"] = cs["nindexes"]
             stats["totalIndexSize"] = cs["totalIndexSize"]
-            stats["indexSizes"] = cs["indexSizes"]
+            stats["indexSizes"] = {
+                k: etau.to_human_bytes_str(v)
+                for k, v in cs["indexSizes"].items()
+            }
 
         stats["total_bytes"] = total_bytes
         stats["total_size"] = etau.to_human_bytes_str(total_bytes)

From 196a3a2dd931f0786e4eafa1e941dd34b0379d11 Mon Sep 17 00:00:00 2001
From: minhtuevo
Date: Thu, 1 Aug 2024 20:09:16 -0700
Subject: [PATCH 4/5] Updated to reflect convention

---
 fiftyone/core/collections.py |  7 +++++--
 fiftyone/core/dataset.py     | 10 +++++++---
 2 files changed, 12 insertions(+), 5 deletions(-)

diff --git a/fiftyone/core/collections.py b/fiftyone/core/collections.py
index 0a66b1a118..d59af420a8 100644
--- a/fiftyone/core/collections.py
+++ b/fiftyone/core/collections.py
@@ -9049,10 +9049,13 @@ def get_index_information(self, include_size=False):
 
         if include_size:
             dataset_stats = self._dataset.stats(include_indexes=True)
-            for index_name in dataset_stats["indexSizes"]:
-                sample_info[index_name]["size"] = dataset_stats["indexSizes"][
+            for index_name in dataset_stats["index_sizes"]:
+                sample_info[index_name]["size"] = dataset_stats["index_sizes"][
                     index_name
                 ]
+                sample_info[index_name]["bytes"] = dataset_stats[
+                    "index_bytes"
+                ][index_name]
 
         for key, info in sample_info.items():
             if len(info["key"]) == 1:
diff --git a/fiftyone/core/dataset.py b/fiftyone/core/dataset.py
index c2bc82949b..7b97c5c7ae 100644
--- a/fiftyone/core/dataset.py
+++ b/fiftyone/core/dataset.py
@@ -1143,9 +1143,13 @@ def stats(
             total_bytes += media_bytes
 
         if include_indexes:
-            stats["nindexes"] = cs["nindexes"]
-            stats["totalIndexSize"] = cs["totalIndexSize"]
-            stats["indexSizes"] = {
+            stats["num_indexes"] = cs["nindexes"]
+            stats["indexes_bytes"] = cs["totalIndexSize"]
+            stats["indexes_sizes"] = etau.to_human_bytes_str(
+                cs["totalIndexSize"]
+            )
+            stats["index_bytes"] = cs["indexSizes"]
+            stats["index_sizes"] = {
                 k: etau.to_human_bytes_str(v)
                 for k, v in cs["indexSizes"].items()
             }
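A sketch of the reworked keys as of PATCH 4, on a hypothetical dataset; the ``size`` values are human-readable strings while the ``bytes`` values are raw counts:

    import fiftyone as fo

    dataset = fo.Dataset()
    dataset.add_sample(fo.Sample(filepath="image.jpg"))  # hypothetical sample

    stats = dataset.stats(include_indexes=True)
    print(stats["num_indexes"])    # count, from collstats' "nindexes"
    print(stats["indexes_sizes"])  # human-readable total, e.g. "12.3KB"
    print(stats["index_sizes"])    # human-readable size per index

    # each index entry now carries both the human-readable and raw forms
    info = dataset.get_index_information(include_size=True)
    for name, d in info.items():
        print(name, d["size"], d["bytes"])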
From 90558c0634ce8557621acf0fb0f90891230e5b2d Mon Sep 17 00:00:00 2001
From: brimoor
Date: Mon, 5 Aug 2024 08:41:22 -0400
Subject: [PATCH 5/5] include frame indexes

---
 fiftyone/core/collections.py     | 56 ++++++++++++++++++++++++--------
 fiftyone/core/dataset.py         | 32 +++++++++++-------
 tests/unittests/dataset_tests.py | 28 ++++++++++++++++
 3 files changed, 91 insertions(+), 25 deletions(-)

diff --git a/fiftyone/core/collections.py b/fiftyone/core/collections.py
index d59af420a8..f47452cf5f 100644
--- a/fiftyone/core/collections.py
+++ b/fiftyone/core/collections.py
@@ -567,17 +567,25 @@ def summary(self):
         """
         raise NotImplementedError("Subclass must implement summary()")
 
-    def stats(self, include_media=False, compressed=False):
+    def stats(
+        self,
+        include_media=False,
+        include_indexes=False,
+        compressed=False,
+    ):
         """Returns stats about the collection on disk.
 
         The ``samples`` keys refer to the sample documents stored in the
         database.
 
+        For video datasets, the ``frames`` keys refer to the frame documents
+        stored in the database.
+
         The ``media`` keys refer to the raw media associated with each sample
         on disk.
 
-        For video datasets, the ``frames`` keys refer to the frame documents
-        stored in the database.
+        The ``index[es]`` keys refer to the indexes associated with the
+        dataset.
 
         Note that dataset-level metadata such as annotation runs are not
         included in this computation.
@@ -585,6 +593,7 @@ def stats(self, include_media=False, compressed=False):
         Args:
             include_media (False): whether to include stats about the size
                 of the raw media in the collection
+            include_indexes (False): whether to return the stats on the indexes
             compressed (False): whether to return the sizes of collections
                 in their compressed form on disk (True) or the logical
                 uncompressed size of the collections (False). This option is
@@ -630,6 +639,20 @@ def stats(
             stats["media_size"] = etau.to_human_bytes_str(media_bytes)
             total_bytes += media_bytes
 
+        if include_indexes:
+            ii = self.get_index_information(include_size=True)
+            index_bytes = {k: v["size"] for k, v in ii.items()}
+            indexes_bytes = sum(index_bytes.values())
+
+            stats["indexes_count"] = len(index_bytes)
+            stats["indexes_bytes"] = indexes_bytes
+            stats["indexes_size"] = etau.to_human_bytes_str(indexes_bytes)
+            stats["index_bytes"] = index_bytes
+            stats["index_sizes"] = {
+                k: etau.to_human_bytes_str(v) for k, v in index_bytes.items()
+            }
+            total_bytes += indexes_bytes
+
         stats["total_bytes"] = total_bytes
         stats["total_size"] = etau.to_human_bytes_str(total_bytes)
 
@@ -9035,8 +9058,7 @@ def get_index_information(self, include_size=False):
         See :meth:`pymongo:pymongo.collection.Collection.index_information`
         for details on the structure of this dictionary.
 
-            include_size(False): whether to include the size of each index in the
-                collection
+            include_size (False): whether to include the size of each index
 
         Returns:
             a dict mapping index names to info dicts
@@ -9048,14 +9070,13 @@ def get_index_information(self, include_size=False):
         sample_info = self._dataset._sample_collection.index_information()
 
         if include_size:
-            dataset_stats = self._dataset.stats(include_indexes=True)
-            for index_name in dataset_stats["index_sizes"]:
-                sample_info[index_name]["size"] = dataset_stats["index_sizes"][
-                    index_name
-                ]
-                sample_info[index_name]["bytes"] = dataset_stats[
-                    "index_bytes"
-                ][index_name]
+            conn = foo.get_db_conn()
+            cs = conn.command(
+                "collstats", self._dataset._sample_collection_name
+            )
+            for key, size in cs["indexSizes"].items():
+                if key in sample_info:
+                    sample_info[key]["size"] = size
 
         for key, info in sample_info.items():
             if len(info["key"]) == 1:
@@ -9068,6 +9089,15 @@ def get_index_information(self, include_size=False):
         # Frame-level indexes
         fields_map = self._get_db_fields_map(frames=True, reverse=True)
         frame_info = self._dataset._frame_collection.index_information()
+
+        if include_size:
+            cs = conn.command(
+                "collstats", self._dataset._frame_collection_name
+            )
+            for key, size in cs["indexSizes"].items():
+                if key in frame_info:
+                    frame_info[key]["size"] = size
+
         for key, info in frame_info.items():
             if len(info["key"]) == 1:
                 field = info["key"][0][0]
diff --git a/fiftyone/core/dataset.py b/fiftyone/core/dataset.py
index 7b97c5c7ae..8939c411b4 100644
--- a/fiftyone/core/dataset.py
+++ b/fiftyone/core/dataset.py
@@ -1082,18 +1082,24 @@ def summary(self):
         return "\n".join(lines)
 
     def stats(
-        self, include_media=False, compressed=False, include_indexes=False
+        self,
+        include_media=False,
+        include_indexes=False,
+        compressed=False,
     ):
         """Returns stats about the dataset on disk.
 
         The ``samples`` keys refer to the sample documents stored in the
         database.
 
+        For video datasets, the ``frames`` keys refer to the frame documents
+        stored in the database.
+
         The ``media`` keys refer to the raw media associated with each sample
         on disk.
 
-        For video datasets, the ``frames`` keys refer to the frame documents
-        stored in the database.
+        The ``index[es]`` keys refer to the indexes associated with the
+        dataset.
 
         Note that dataset-level metadata such as annotation runs are not
         included in this computation.
@@ -1101,10 +1107,10 @@ def stats(
         Args:
             include_media (False): whether to include stats about the size
                 of the raw media in the dataset
+            include_indexes (False): whether to return the stats on the indexes
             compressed (False): whether to return the sizes of collections
                 in their compressed form on disk (True) or the logical
                 uncompressed size of the collections (False)
-            include_indexes (False): whether to return the stats on the indexes
 
         Returns:
             a stats dict
@@ -1143,16 +1149,18 @@ def stats(
             total_bytes += media_bytes
 
         if include_indexes:
-            stats["num_indexes"] = cs["nindexes"]
-            stats["indexes_bytes"] = cs["totalIndexSize"]
-            stats["indexes_sizes"] = etau.to_human_bytes_str(
-                cs["totalIndexSize"]
-            )
-            stats["index_bytes"] = cs["indexSizes"]
+            ii = self.get_index_information(include_size=True)
+            index_bytes = {k: v["size"] for k, v in ii.items()}
+            indexes_bytes = sum(index_bytes.values())
+
+            stats["indexes_count"] = len(index_bytes)
+            stats["indexes_bytes"] = indexes_bytes
+            stats["indexes_size"] = etau.to_human_bytes_str(indexes_bytes)
+            stats["index_bytes"] = index_bytes
             stats["index_sizes"] = {
-                k: etau.to_human_bytes_str(v)
-                for k, v in cs["indexSizes"].items()
+                k: etau.to_human_bytes_str(v) for k, v in index_bytes.items()
             }
+            total_bytes += indexes_bytes
 
         stats["total_bytes"] = total_bytes
         stats["total_size"] = etau.to_human_bytes_str(total_bytes)
diff --git a/tests/unittests/dataset_tests.py b/tests/unittests/dataset_tests.py
index b05f607057..b66b141808 100644
--- a/tests/unittests/dataset_tests.py
+++ b/tests/unittests/dataset_tests.py
@@ -532,6 +532,34 @@ def test_indexes(self):
         with self.assertRaises(ValueError):
             dataset.create_index("non_existent_field")
 
+    @drop_datasets
+    def test_index_sizes(self):
+        gt = fo.Detections(detections=[fo.Detection(label="foo")])
+        sample = fo.Sample(filepath="video.mp4", gt=gt)
+        sample.frames[1] = fo.Frame(gt=gt)
+
+        dataset = fo.Dataset()
+        dataset.add_sample(sample)
+
+        dataset.create_index("gt.detections.label")
+        dataset.create_index("frames.gt.detections.label")
+
+        info = dataset.get_index_information(include_size=True)
+
+        indexes = [
+            "id",
+            "filepath",
+            "gt.detections.label",
+            "frames.id",
+            "frames._sample_id_1_frame_number_1",
+            "frames.gt.detections.label",
+        ]
+
+        self.assertListEqual(dataset.list_indexes(), indexes)
+        self.assertSetEqual(set(info.keys()), set(indexes))
+        for d in info.values():
+            self.assertTrue(d.get("size") is not None)
+
     @drop_datasets
     def test_iter_samples(self):
         dataset = fo.Dataset()
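A sketch of the final behavior on a hypothetical video dataset, mirroring the new test above; ``size`` values in the index info are raw byte counts pulled from collstats, and index sizes now contribute to ``total_bytes``:

    import fiftyone as fo

    dataset = fo.Dataset()
    sample = fo.Sample(filepath="video.mp4")  # hypothetical video
    sample.frames[1] = fo.Frame()
    dataset.add_sample(sample)

    # per-index info, including frame-level indexes, with sizes in bytes
    info = dataset.get_index_information(include_size=True)
    for name, d in info.items():
        print(name, d.get("size"))

    # aggregate stats; index sizes are included in `total_bytes`
    stats = dataset.stats(include_indexes=True)
    print(stats["indexes_count"])  # number of indexes
    print(stats["indexes_size"])   # human-readable total
    print(stats["index_sizes"])    # human-readable size per index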