Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Adds index usage info to get_index_information(include_stats=True) #5320

Merged
merged 1 commit into from
Dec 28, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 16 additions & 2 deletions fiftyone/core/collections.py
Original file line number Diff line number Diff line change
Expand Up @@ -9506,8 +9506,8 @@ def get_index_information(self, include_stats=False):
details on the structure of this dictionary.

Args:
include_stats (False): whether to include the size and build status
of each index
include_stats (False): whether to include the size, usage, and
build status of each index

Returns:
a dict mapping index names to info dicts
Expand All @@ -9528,6 +9528,13 @@ def get_index_information(self, include_stats=False):
if key in sample_info:
sample_info[key]["in_progress"] = True

for d in self._dataset._sample_collection.aggregate(
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Wondering if it would be useful to pull this out into a function that accepts the collection name as an argument. That would allow creating something like an admin dashboard to manage indexes across datasets without needing to load each dataset.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The use case makes sense 👍. I'd suggest we consider whether a potential refactor adds clarity when that need arises.

[{"$indexStats": {}}]
):
key = d["name"]
if key in sample_info:
sample_info[key]["accesses"] = d["accesses"]

for key, info in sample_info.items():
if len(info["key"]) == 1:
field = info["key"][0][0]
Expand All @@ -9550,6 +9557,13 @@ def get_index_information(self, include_stats=False):
if key in frame_info:
frame_info[key]["in_progress"] = True

for d in self._dataset._frame_collection.aggregate(
[{"$indexStats": {}}]
):
key = d["name"]
if key in frame_info:
frame_info[key]["accesses"] = d["accesses"]

for key, info in frame_info.items():
if len(info["key"]) == 1:
field = info["key"][0][0]
Expand Down
29 changes: 10 additions & 19 deletions fiftyone/core/dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -1202,21 +1202,13 @@ def stats(
return stats

def _sample_collstats(self):
conn = foo.get_db_conn()
return conn.command(
"collstats",
self._sample_collection_name,
)
return _get_collstats(self._sample_collection)

def _frame_collstats(self):
if self._frame_collection_name is None:
return None

conn = foo.get_db_conn()
return conn.command(
"collstats",
self._frame_collection_name,
)
return _get_collstats(self._frame_collection)

def first(self):
"""Returns the first sample in the dataset.
Expand Down Expand Up @@ -7770,15 +7762,6 @@ def _get_frame_collection(self, write_concern=None):
self._frame_collection_name, write_concern=write_concern
)

@property
def _frame_indexes(self):
frame_collection = self._frame_collection
if frame_collection is None:
return None

index_info = frame_collection.index_information()
return [k["key"][0][0] for k in index_info.values()]

def _apply_sample_field_schema(self, schema):
for field_name, field_or_str in schema.items():
kwargs = foo.get_field_kwargs(field_or_str)
Expand Down Expand Up @@ -9104,6 +9087,14 @@ def _get_single_index_map(coll):
}


def _get_collstats(coll):
pipeline = [
{"$collStats": {"storageStats": {}}},
{"$replaceRoot": {"newRoot": "$storageStats"}},
]
return next(coll.aggregate(pipeline))

brimoor marked this conversation as resolved.
Show resolved Hide resolved

def _add_collection_with_new_ids(
dataset,
sample_collection,
Expand Down
6 changes: 4 additions & 2 deletions tests/unittests/dataset_tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -671,7 +671,7 @@ def test_indexes(self):
dataset.create_index("non_existent_field")

@drop_datasets
def test_index_sizes(self):
def test_index_stats(self):
gt = fo.Detections(detections=[fo.Detection(label="foo")])
sample = fo.Sample(filepath="video.mp4", gt=gt)
sample.frames[1] = fo.Frame(gt=gt)
Expand Down Expand Up @@ -700,7 +700,9 @@ def test_index_sizes(self):
self.assertSetEqual(set(dataset.list_indexes()), indexes)
self.assertSetEqual(set(info.keys()), indexes)
for d in info.values():
self.assertTrue(d.get("size") is not None)
self.assertTrue(d["size"] is not None)
self.assertTrue("ops" in d["accesses"])
self.assertTrue("since" in d["accesses"])

@drop_datasets
def test_index_in_progress(self):
Expand Down
Loading