From 2dfaf930a9482026ca5294a25907d6f73b40c999 Mon Sep 17 00:00:00 2001
From: minhtuevo
Date: Thu, 1 Aug 2024 15:13:25 -0700
Subject: [PATCH 1/5] Added index statistics to dataset.stats

---
 fiftyone/core/dataset.py | 11 ++++++++++-
 1 file changed, 10 insertions(+), 1 deletion(-)

diff --git a/fiftyone/core/dataset.py b/fiftyone/core/dataset.py
index d0ae130714..cc542191d0 100644
--- a/fiftyone/core/dataset.py
+++ b/fiftyone/core/dataset.py
@@ -56,6 +56,7 @@
 
 class DatasetNotFoundError(ValueError):
     """Exception raised when a dataset is not found."""
+
     def __init__(self, name):
         self._dataset_name = name
         super().__init__(f"Dataset {name} not found")
@@ -1080,7 +1081,9 @@ def summary(self):
 
         return "\n".join(lines)
 
-    def stats(self, include_media=False, compressed=False):
+    def stats(
+        self, include_media=False, compressed=False, include_indexes=False
+    ):
         """Returns stats about the dataset on disk.
 
         The ``samples`` keys refer to the sample documents stored in the
@@ -1101,6 +1104,7 @@ def stats(self, include_media=False, compressed=False):
             compressed (False): whether to return the sizes of collections
                 in their compressed form on disk (True) or the logical
                 uncompressed size of the collections (False)
+            include_indexes (False): whether to return the stats on the indexes
 
         Returns:
             a stats dict
@@ -1138,6 +1142,11 @@ def stats(self, include_media=False, compressed=False):
             stats["media_size"] = etau.to_human_bytes_str(media_bytes)
             total_bytes += media_bytes
 
+        if include_indexes:
+            stats["nindexes"] = cs["nindexes"]
+            stats["totalIndexSize"] = cs["totalIndexSize"]
+            stats["indexSizes"] = cs["indexSizes"]
+
         stats["total_bytes"] = total_bytes
         stats["total_size"] = etau.to_human_bytes_str(total_bytes)
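A minimal usage sketch for the flag added above (the dataset here is hypothetical; at this point in the series the raw MongoDB collstats keys are passed through verbatim, and all sizes are in bytes):

    import fiftyone as fo

    dataset = fo.Dataset()
    dataset.add_sample(fo.Sample(filepath="image.jpg"))  # hypothetical sample

    stats = dataset.stats(include_indexes=True)

    # raw collstats fields, passed through as-is at this stage
    print(stats["nindexes"])        # number of indexes on the sample collection
    print(stats["totalIndexSize"])  # total index size, in bytes
    print(stats["indexSizes"])      # dict mapping index name -> size in bytes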
From 00c47053bc520980353ee167524af8bb595288e5 Mon Sep 17 00:00:00 2001
From: minhtuevo
Date: Thu, 1 Aug 2024 15:50:35 -0700
Subject: [PATCH 2/5] Included size statistics in collections.py

---
 fiftyone/core/collections.py | 13 ++++++++++++-
 1 file changed, 12 insertions(+), 1 deletion(-)

diff --git a/fiftyone/core/collections.py b/fiftyone/core/collections.py
index 9eac30dc7d..0a66b1a118 100644
--- a/fiftyone/core/collections.py
+++ b/fiftyone/core/collections.py
@@ -9028,13 +9028,16 @@ def list_indexes(self):
         """
         return list(self.get_index_information().keys())
 
-    def get_index_information(self):
+    def get_index_information(self, include_size=False):
         """Returns a dictionary of information about the indexes on this
         collection.
 
         See :meth:`pymongo:pymongo.collection.Collection.index_information`
         for details on the structure of this dictionary.
 
+            include_size(False): whether to include the size of each index in the
+                collection
+
         Returns:
             a dict mapping index names to info dicts
         """
@@ -9043,6 +9046,14 @@ def get_index_information(self):
         # Sample-level indexes
         fields_map = self._get_db_fields_map(reverse=True)
         sample_info = self._dataset._sample_collection.index_information()
+
+        if include_size:
+            dataset_stats = self._dataset.stats(include_indexes=True)
+            for index_name in dataset_stats["indexSizes"]:
+                sample_info[index_name]["size"] = dataset_stats["indexSizes"][
+                    index_name
+                ]
+
         for key, info in sample_info.items():
             if len(info["key"]) == 1:
                 field = info["key"][0][0]

From 588c76c814140359f78bab6609d2ab1482ff4b28 Mon Sep 17 00:00:00 2001
From: minhtuevo
Date: Thu, 1 Aug 2024 16:31:24 -0700
Subject: [PATCH 3/5] Converted to human bytes str

---
 fiftyone/core/dataset.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/fiftyone/core/dataset.py b/fiftyone/core/dataset.py
index cc542191d0..c2bc82949b 100644
--- a/fiftyone/core/dataset.py
+++ b/fiftyone/core/dataset.py
@@ -1145,7 +1145,10 @@ def stats(
         if include_indexes:
             stats["nindexes"] = cs["nindexes"]
             stats["totalIndexSize"] = cs["totalIndexSize"]
-            stats["indexSizes"] = cs["indexSizes"]
+            stats["indexSizes"] = {
+                k: etau.to_human_bytes_str(v)
+                for k, v in cs["indexSizes"].items()
+            }
 
         stats["total_bytes"] = total_bytes
         stats["total_size"] = etau.to_human_bytes_str(total_bytes)

From 196a3a2dd931f0786e4eafa1e941dd34b0379d11 Mon Sep 17 00:00:00 2001
From: minhtuevo
Date: Thu, 1 Aug 2024 20:09:16 -0700
Subject: [PATCH 4/5] Updated to reflect convention

---
 fiftyone/core/collections.py |  7 +++++--
 fiftyone/core/dataset.py     | 10 +++++++---
 2 files changed, 12 insertions(+), 5 deletions(-)

diff --git a/fiftyone/core/collections.py b/fiftyone/core/collections.py
index 0a66b1a118..d59af420a8 100644
--- a/fiftyone/core/collections.py
+++ b/fiftyone/core/collections.py
@@ -9049,10 +9049,13 @@ def get_index_information(self, include_size=False):
 
         if include_size:
             dataset_stats = self._dataset.stats(include_indexes=True)
-            for index_name in dataset_stats["indexSizes"]:
-                sample_info[index_name]["size"] = dataset_stats["indexSizes"][
+            for index_name in dataset_stats["index_sizes"]:
+                sample_info[index_name]["size"] = dataset_stats["index_sizes"][
                     index_name
                 ]
+                sample_info[index_name]["bytes"] = dataset_stats[
+                    "index_bytes"
+                ][index_name]
 
         for key, info in sample_info.items():
             if len(info["key"]) == 1:
diff --git a/fiftyone/core/dataset.py b/fiftyone/core/dataset.py
index c2bc82949b..7b97c5c7ae 100644
--- a/fiftyone/core/dataset.py
+++ b/fiftyone/core/dataset.py
@@ -1143,9 +1143,13 @@ def stats(
             total_bytes += media_bytes
 
         if include_indexes:
-            stats["nindexes"] = cs["nindexes"]
-            stats["totalIndexSize"] = cs["totalIndexSize"]
-            stats["indexSizes"] = {
+            stats["num_indexes"] = cs["nindexes"]
+            stats["indexes_bytes"] = cs["totalIndexSize"]
+            stats["indexes_sizes"] = etau.to_human_bytes_str(
+                cs["totalIndexSize"]
+            )
+            stats["index_bytes"] = cs["indexSizes"]
+            stats["index_sizes"] = {
                 k: etau.to_human_bytes_str(v)
                 for k, v in cs["indexSizes"].items()
             }
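A sketch of the reworked keys as of PATCH 4, on a hypothetical dataset; the ``size`` values are human-readable strings while the ``bytes`` values are raw counts:

    import fiftyone as fo

    dataset = fo.Dataset()
    dataset.add_sample(fo.Sample(filepath="image.jpg"))  # hypothetical sample

    stats = dataset.stats(include_indexes=True)
    print(stats["num_indexes"])    # count, from collstats' "nindexes"
    print(stats["indexes_sizes"])  # human-readable total, e.g. "12.3KB"
    print(stats["index_sizes"])    # human-readable size per index

    # each index entry now carries both the human-readable and raw forms
    info = dataset.get_index_information(include_size=True)
    for name, d in info.items():
        print(name, d["size"], d["bytes"])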
From 90558c0634ce8557621acf0fb0f90891230e5b2d Mon Sep 17 00:00:00 2001
From: brimoor
Date: Mon, 5 Aug 2024 08:41:22 -0400
Subject: [PATCH 5/5] include frame indexes

---
 fiftyone/core/collections.py     | 56 ++++++++++++++++++++++++--------
 fiftyone/core/dataset.py         | 32 +++++++++++-------
 tests/unittests/dataset_tests.py | 28 ++++++++++++++++
 3 files changed, 91 insertions(+), 25 deletions(-)

diff --git a/fiftyone/core/collections.py b/fiftyone/core/collections.py
index d59af420a8..f47452cf5f 100644
--- a/fiftyone/core/collections.py
+++ b/fiftyone/core/collections.py
@@ -567,17 +567,25 @@ def summary(self):
         """
         raise NotImplementedError("Subclass must implement summary()")
 
-    def stats(self, include_media=False, compressed=False):
+    def stats(
+        self,
+        include_media=False,
+        include_indexes=False,
+        compressed=False,
+    ):
         """Returns stats about the collection on disk.
 
         The ``samples`` keys refer to the sample documents stored in the
         database.
 
+        For video datasets, the ``frames`` keys refer to the frame documents
+        stored in the database.
+
         The ``media`` keys refer to the raw media associated with each sample
         on disk.
 
-        For video datasets, the ``frames`` keys refer to the frame documents
-        stored in the database.
+        The ``index[es]`` keys refer to the indexes associated with the
+        dataset.
 
         Note that dataset-level metadata such as annotation runs are not
         included in this computation.
@@ -585,6 +593,7 @@ def stats(self, include_media=False, compressed=False):
         Args:
             include_media (False): whether to include stats about the size
                 of the raw media in the collection
+            include_indexes (False): whether to return the stats on the indexes
             compressed (False): whether to return the sizes of collections
                 in their compressed form on disk (True) or the logical
                 uncompressed size of the collections (False). This option is
@@ -630,6 +639,20 @@ def stats(
             stats["media_size"] = etau.to_human_bytes_str(media_bytes)
             total_bytes += media_bytes
 
+        if include_indexes:
+            ii = self.get_index_information(include_size=True)
+            index_bytes = {k: v["size"] for k, v in ii.items()}
+            indexes_bytes = sum(index_bytes.values())
+
+            stats["indexes_count"] = len(index_bytes)
+            stats["indexes_bytes"] = indexes_bytes
+            stats["indexes_size"] = etau.to_human_bytes_str(indexes_bytes)
+            stats["index_bytes"] = index_bytes
+            stats["index_sizes"] = {
+                k: etau.to_human_bytes_str(v) for k, v in index_bytes.items()
+            }
+            total_bytes += indexes_bytes
+
         stats["total_bytes"] = total_bytes
         stats["total_size"] = etau.to_human_bytes_str(total_bytes)
 
@@ -9035,8 +9058,7 @@ def get_index_information(self, include_size=False):
         See :meth:`pymongo:pymongo.collection.Collection.index_information`
         for details on the structure of this dictionary.
 
-            include_size(False): whether to include the size of each index in the
-                collection
+            include_size (False): whether to include the size of each index
 
         Returns:
             a dict mapping index names to info dicts
@@ -9048,14 +9070,13 @@ def get_index_information(self, include_size=False):
         sample_info = self._dataset._sample_collection.index_information()
 
         if include_size:
-            dataset_stats = self._dataset.stats(include_indexes=True)
-            for index_name in dataset_stats["index_sizes"]:
-                sample_info[index_name]["size"] = dataset_stats["index_sizes"][
-                    index_name
-                ]
-                sample_info[index_name]["bytes"] = dataset_stats[
-                    "index_bytes"
-                ][index_name]
+            conn = foo.get_db_conn()
+            cs = conn.command(
+                "collstats", self._dataset._sample_collection_name
+            )
+            for key, size in cs["indexSizes"].items():
+                if key in sample_info:
+                    sample_info[key]["size"] = size
 
         for key, info in sample_info.items():
             if len(info["key"]) == 1:
@@ -9068,6 +9089,15 @@ def get_index_information(self, include_size=False):
         # Frame-level indexes
         fields_map = self._get_db_fields_map(frames=True, reverse=True)
         frame_info = self._dataset._frame_collection.index_information()
+
+        if include_size:
+            cs = conn.command(
+                "collstats", self._dataset._frame_collection_name
+            )
+            for key, size in cs["indexSizes"].items():
+                if key in frame_info:
+                    frame_info[key]["size"] = size
+
         for key, info in frame_info.items():
             if len(info["key"]) == 1:
                 field = info["key"][0][0]
diff --git a/fiftyone/core/dataset.py b/fiftyone/core/dataset.py
index 7b97c5c7ae..8939c411b4 100644
--- a/fiftyone/core/dataset.py
+++ b/fiftyone/core/dataset.py
@@ -1082,18 +1082,24 @@ def summary(self):
         return "\n".join(lines)
 
     def stats(
-        self, include_media=False, compressed=False, include_indexes=False
+        self,
+        include_media=False,
+        include_indexes=False,
+        compressed=False,
     ):
         """Returns stats about the dataset on disk.
 
         The ``samples`` keys refer to the sample documents stored in the
         database.
 
+        For video datasets, the ``frames`` keys refer to the frame documents
+        stored in the database.
+
         The ``media`` keys refer to the raw media associated with each sample
         on disk.
 
-        For video datasets, the ``frames`` keys refer to the frame documents
-        stored in the database.
+        The ``index[es]`` keys refer to the indexes associated with the
+        dataset.
 
         Note that dataset-level metadata such as annotation runs are not
         included in this computation.
@@ -1101,10 +1107,10 @@ def stats(
         Args:
             include_media (False): whether to include stats about the size
                 of the raw media in the dataset
+            include_indexes (False): whether to return the stats on the indexes
             compressed (False): whether to return the sizes of collections
                 in their compressed form on disk (True) or the logical
                 uncompressed size of the collections (False)
-            include_indexes (False): whether to return the stats on the indexes
 
         Returns:
             a stats dict
@@ -1143,16 +1149,18 @@ def stats(
             total_bytes += media_bytes
 
         if include_indexes:
-            stats["num_indexes"] = cs["nindexes"]
-            stats["indexes_bytes"] = cs["totalIndexSize"]
-            stats["indexes_sizes"] = etau.to_human_bytes_str(
-                cs["totalIndexSize"]
-            )
-            stats["index_bytes"] = cs["indexSizes"]
+            ii = self.get_index_information(include_size=True)
+            index_bytes = {k: v["size"] for k, v in ii.items()}
+            indexes_bytes = sum(index_bytes.values())
+
+            stats["indexes_count"] = len(index_bytes)
+            stats["indexes_bytes"] = indexes_bytes
+            stats["indexes_size"] = etau.to_human_bytes_str(indexes_bytes)
+            stats["index_bytes"] = index_bytes
             stats["index_sizes"] = {
-                k: etau.to_human_bytes_str(v)
-                for k, v in cs["indexSizes"].items()
+                k: etau.to_human_bytes_str(v) for k, v in index_bytes.items()
             }
+            total_bytes += indexes_bytes
 
         stats["total_bytes"] = total_bytes
         stats["total_size"] = etau.to_human_bytes_str(total_bytes)
diff --git a/tests/unittests/dataset_tests.py b/tests/unittests/dataset_tests.py
index b05f607057..b66b141808 100644
--- a/tests/unittests/dataset_tests.py
+++ b/tests/unittests/dataset_tests.py
@@ -532,6 +532,34 @@ def test_indexes(self):
         with self.assertRaises(ValueError):
             dataset.create_index("non_existent_field")
 
+    @drop_datasets
+    def test_index_sizes(self):
+        gt = fo.Detections(detections=[fo.Detection(label="foo")])
+        sample = fo.Sample(filepath="video.mp4", gt=gt)
+        sample.frames[1] = fo.Frame(gt=gt)
+
+        dataset = fo.Dataset()
+        dataset.add_sample(sample)
+
+        dataset.create_index("gt.detections.label")
+        dataset.create_index("frames.gt.detections.label")
+
+        info = dataset.get_index_information(include_size=True)
+
+        indexes = [
+            "id",
+            "filepath",
+            "gt.detections.label",
+            "frames.id",
+            "frames._sample_id_1_frame_number_1",
+            "frames.gt.detections.label",
+        ]
+
+        self.assertListEqual(dataset.list_indexes(), indexes)
+        self.assertSetEqual(set(info.keys()), set(indexes))
+        for d in info.values():
+            self.assertTrue(d.get("size") is not None)
+
     @drop_datasets
     def test_iter_samples(self):
         dataset = fo.Dataset()
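A sketch of the final behavior on a hypothetical video dataset, mirroring the new test above; ``size`` values in the index info are raw byte counts pulled from collstats, and index sizes now contribute to ``total_bytes``:

    import fiftyone as fo

    dataset = fo.Dataset()
    sample = fo.Sample(filepath="video.mp4")  # hypothetical video
    sample.frames[1] = fo.Frame()
    dataset.add_sample(sample)

    # per-index info, including frame-level indexes, with sizes in bytes
    info = dataset.get_index_information(include_size=True)
    for name, d in info.items():
        print(name, d.get("size"))

    # aggregate stats; index sizes are included in `total_bytes`
    stats = dataset.stats(include_indexes=True)
    print(stats["indexes_count"])  # number of indexes
    print(stats["indexes_size"])   # human-readable total
    print(stats["index_sizes"])    # human-readable size per index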