Merge pull request #4607 from voxel51/feature/update-index-stats
Added index statistics to dataset.stats
brimoor authored Aug 5, 2024
2 parents 87bbe27 + 90558c0 commit 1fcc7b7
Showing 3 changed files with 103 additions and 7 deletions.
52 changes: 48 additions & 4 deletions fiftyone/core/collections.py
@@ -567,24 +567,33 @@ def summary(self):
"""
raise NotImplementedError("Subclass must implement summary()")

def stats(self, include_media=False, compressed=False):
def stats(
self,
include_media=False,
include_indexes=False,
compressed=False,
):
"""Returns stats about the collection on disk.
The ``samples`` keys refer to the sample documents stored in the
database.
For video datasets, the ``frames`` keys refer to the frame documents
stored in the database.
The ``media`` keys refer to the raw media associated with each sample
on disk.
For video datasets, the ``frames`` keys refer to the frame documents
stored in the database.
The ``index[es]`` keys refer to the indexes associated with the
dataset.
Note that dataset-level metadata such as annotation runs are not
included in this computation.
Args:
include_media (False): whether to include stats about the size of
the raw media in the collection
include_indexes (False): whether to include stats on the dataset's indexes
compressed (False): whether to return the sizes of collections in
their compressed form on disk (True) or the logical
uncompressed size of the collections (False). This option is
@@ -630,6 +639,20 @@ def stats(self, include_media=False, compressed=False):
stats["media_size"] = etau.to_human_bytes_str(media_bytes)
total_bytes += media_bytes

if include_indexes:
ii = self.get_index_information(include_size=True)
index_bytes = {k: v["size"] for k, v in ii.items()}
indexes_bytes = sum(index_bytes.values())

stats["indexes_count"] = len(index_bytes)
stats["indexes_bytes"] = indexes_bytes
stats["indexes_size"] = etau.to_human_bytes_str(indexes_bytes)
stats["index_bytes"] = index_bytes
stats["index_sizes"] = {
k: etau.to_human_bytes_str(v) for k, v in index_bytes.items()
}
total_bytes += indexes_bytes

stats["total_bytes"] = total_bytes
stats["total_size"] = etau.to_human_bytes_str(total_bytes)

@@ -9028,13 +9051,15 @@ def list_indexes(self):
"""
return list(self.get_index_information().keys())

def get_index_information(self):
def get_index_information(self, include_size=False):
"""Returns a dictionary of information about the indexes on this
collection.
See :meth:`pymongo:pymongo.collection.Collection.index_information` for
details on the structure of this dictionary.
Args:
include_size (False): whether to include the size of each index
Returns:
a dict mapping index names to info dicts
"""
@@ -9043,6 +9068,16 @@ def get_index_information(self):
# Sample-level indexes
fields_map = self._get_db_fields_map(reverse=True)
sample_info = self._dataset._sample_collection.index_information()

if include_size:
conn = foo.get_db_conn()
cs = conn.command(
"collstats", self._dataset._sample_collection_name
)
for key, size in cs["indexSizes"].items():
if key in sample_info:
sample_info[key]["size"] = size

for key, info in sample_info.items():
if len(info["key"]) == 1:
field = info["key"][0][0]
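The size lookup above uses MongoDB's ``collstats`` command, whose ``indexSizes`` field reports each index's on-disk size in bytes. A sketch of the raw call, assuming ``foo`` is the ``fiftyone.core.odm`` alias used in this module (the collection name is hypothetical):

import fiftyone.core.odm as foo

conn = foo.get_db_conn()
cs = conn.command("collstats", "samples.abc123")  # hypothetical collection name
print(cs["indexSizes"])  # maps index name -> on-disk size in bytes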
@@ -9054,6 +9089,15 @@
# Frame-level indexes
fields_map = self._get_db_fields_map(frames=True, reverse=True)
frame_info = self._dataset._frame_collection.index_information()

if include_size:
cs = conn.command(
"collstats", self._dataset._frame_collection_name
)
for key, size in cs["indexSizes"].items():
if key in frame_info:
frame_info[key]["size"] = size

for key, info in frame_info.items():
if len(info["key"]) == 1:
field = info["key"][0][0]
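With ``include_size=True``, each info dict gains a ``size`` entry (in bytes) alongside the standard pymongo fields. A sketch of the expected shape (values illustrative):

info = dataset.get_index_information(include_size=True)
# e.g. info["filepath"] might look like:
# {"v": 2, "key": [("filepath", 1)], "size": 20480}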
30 changes: 27 additions & 3 deletions fiftyone/core/dataset.py
@@ -56,6 +56,7 @@

class DatasetNotFoundError(ValueError):
"""Exception raised when a dataset is not found."""

def __init__(self, name):
self._dataset_name = name
super().__init__(f"Dataset {name} not found")
@@ -1080,24 +1081,33 @@ def summary(self):

return "\n".join(lines)

def stats(self, include_media=False, compressed=False):
def stats(
self,
include_media=False,
include_indexes=False,
compressed=False,
):
"""Returns stats about the dataset on disk.
The ``samples`` keys refer to the sample documents stored in the
database.
For video datasets, the ``frames`` keys refer to the frame documents
stored in the database.
The ``media`` keys refer to the raw media associated with each sample
on disk.
For video datasets, the ``frames`` keys refer to the frame documents
stored in the database.
The ``index[es]`` keys refer to the indexes associated with the
dataset.
Note that dataset-level metadata such as annotation runs are not
included in this computation.
Args:
include_media (False): whether to include stats about the size of
the raw media in the dataset
include_indexes (False): whether to include stats on the dataset's indexes
compressed (False): whether to return the sizes of collections in
their compressed form on disk (True) or the logical
uncompressed size of the collections (False)
@@ -1138,6 +1148,20 @@ def stats(self, include_media=False, compressed=False):
stats["media_size"] = etau.to_human_bytes_str(media_bytes)
total_bytes += media_bytes

if include_indexes:
ii = self.get_index_information(include_size=True)
index_bytes = {k: v["size"] for k, v in ii.items()}
indexes_bytes = sum(index_bytes.values())

stats["indexes_count"] = len(index_bytes)
stats["indexes_bytes"] = indexes_bytes
stats["indexes_size"] = etau.to_human_bytes_str(indexes_bytes)
stats["index_bytes"] = index_bytes
stats["index_sizes"] = {
k: etau.to_human_bytes_str(v) for k, v in index_bytes.items()
}
total_bytes += indexes_bytes

stats["total_bytes"] = total_bytes
stats["total_size"] = etau.to_human_bytes_str(total_bytes)

28 changes: 28 additions & 0 deletions tests/unittests/dataset_tests.py
@@ -532,6 +532,34 @@ def test_indexes(self):
with self.assertRaises(ValueError):
dataset.create_index("non_existent_field")

@drop_datasets
def test_index_sizes(self):
gt = fo.Detections(detections=[fo.Detection(label="foo")])
sample = fo.Sample(filepath="video.mp4", gt=gt)
sample.frames[1] = fo.Frame(gt=gt)

dataset = fo.Dataset()
dataset.add_sample(sample)

dataset.create_index("gt.detections.label")
dataset.create_index("frames.gt.detections.label")

info = dataset.get_index_information(include_size=True)

indexes = [
"id",
"filepath",
"gt.detections.label",
"frames.id",
"frames._sample_id_1_frame_number_1",
"frames.gt.detections.label",
]

self.assertListEqual(dataset.list_indexes(), indexes)
self.assertSetEqual(set(info.keys()), set(indexes))
for d in info.values():
self.assertTrue(d.get("size") is not None)
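Building on the test above, the per-index sizes also make it easy to spot the heaviest index; a minimal sketch reusing ``info``:

largest = max(info, key=lambda name: info[name]["size"])
print(largest, info[largest]["size"])  # index name and size in bytes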

@drop_datasets
def test_iter_samples(self):
dataset = fo.Dataset()
