Limit the number of memory records reported
For very big files, the generated reporters will hang when trying to
process all of the memory records produced. This happens quite often with
flamegraphs generated from very big files, where the browser cannot
display a plot with millions of points.

To help here, add a new parameter to the FileReader class that limits
the number of memory records (and therefore temporal snapshots) stored
and reported. This should not affect most regular capture files but will
help with the very big ones.

Signed-off-by: Pablo Galindo <[email protected]>
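
As an illustration of the new keyword (a minimal sketch; the capture file name and the limit of 1000 are placeholders, and the snapshot fields assume memray's usual MemorySnapshot layout):

from memray import FileReader

# Cap the number of memory records (and therefore temporal snapshots)
# loaded from a very large capture file; omitting the keyword keeps the
# previous behaviour.
reader = FileReader("memray-output.bin", max_memory_records=1000)

# The memory-over-time data is downsampled to at most ~1000 points, which
# keeps the generated flamegraph's plot manageable in the browser.
for snapshot in reader.get_memory_snapshots():
    print(snapshot.time, snapshot.rss, snapshot.heap)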
pablogsal committed Nov 3, 2023
1 parent 7ee88fc commit 73850e0
Showing 6 changed files with 56 additions and 2 deletions.
1 change: 1 addition & 0 deletions news/491.bugfix.rst
@@ -0,0 +1 @@
Limit the number of memory records displayed in reporters by default. This helps when displaying flamegraphs for very big files.
5 changes: 4 additions & 1 deletion src/memray/_memray.pyi
@@ -133,7 +133,10 @@ class FileReader:
@property
def metadata(self) -> Metadata: ...
def __init__(
self, file_name: Union[str, Path], *, report_progress: bool = False
self,
file_name: Union[str, Path],
*,
report_progress: bool = False,
) -> None: ...
def get_allocation_records(self) -> Iterable[AllocationRecord]: ...
def get_temporal_allocation_records(
7 changes: 7 additions & 0 deletions src/memray/_memray.pyx
@@ -879,6 +879,9 @@ cdef class FileReader:
n_memory_snapshots_approx = 2048
if 0 < stats["start_time"] < stats["end_time"]:
n_memory_snapshots_approx = (stats["end_time"] - stats["start_time"]) / 10

if n_memory_snapshots_approx > max_memory_records:
n_memory_snapshots_approx = max_memory_records
self._memory_snapshots.reserve(n_memory_snapshots_approx)

cdef object total = stats['n_allocations'] or None
@@ -915,6 +918,10 @@ cdef class FileReader:
self._memory_snapshots.push_back(reader.getLatestMemorySnapshot())
else:
break

if len(self._memory_snapshots) > max_memory_records:
self._memory_snapshot_bucket = len(self._memory_snapshots) // max_memory_records
self._memory_snapshots = self._memory_snapshots[::self._memory_snapshot_bucket]
self._high_watermark = finder.getHighWatermark()
stats["n_allocations"] = progress_indicator.num_processed

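The downsampling in the hunk above keeps every k-th snapshot once more than max_memory_records snapshots have been read. A rough plain-Python equivalent of that stride selection (an illustration, not the actual Cython code):

def downsample(snapshots, max_memory_records):
    """Keep roughly max_memory_records evenly spaced snapshots."""
    if max_memory_records <= 0 or len(snapshots) <= max_memory_records:
        return snapshots
    # Integer stride, as in the hunk above; because of rounding the result
    # can end up slightly above the limit (the test below allows limit + 1).
    stride = len(snapshots) // max_memory_records
    return snapshots[::stride]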
10 changes: 9 additions & 1 deletion src/memray/commands/common.py
@@ -127,9 +127,13 @@ def write_report(
merge_threads: Optional[bool] = None,
inverted: Optional[bool] = None,
temporal: bool = False,
max_memory_records: int = 0,
) -> None:
try:
reader = FileReader(os.fspath(result_path), report_progress=True)
kwargs = {}
if max_memory_records > 0:
kwargs["max_memory_records"] = max_memory_records
reader = FileReader(os.fspath(result_path), report_progress=True, **kwargs)
merge_threads = True if merge_threads is None else merge_threads
inverted = False if inverted is None else inverted

@@ -271,6 +275,10 @@ def run(self, args: argparse.Namespace, parser: argparse.ArgumentParser) -> None

if hasattr(args, "inverted"):
kwargs["inverted"] = args.inverted

if hasattr(args, "max_memory_records"):
kwargs["max_memory_records"] = args.max_memory_records

self.write_report(
result_path,
output_file,
7 changes: 7 additions & 0 deletions src/memray/commands/flamegraph.py
@@ -29,3 +29,10 @@ def prepare_parser(self, parser: argparse.ArgumentParser) -> None:
action="store_true",
default=False,
)

parser.add_argument(
"--max-memory-records",
help="Maximum number of memory records to display",
type=int,
default=0,
)
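
Combined with the common.py change above, the option only reaches FileReader when a positive value is given, so the reader's own default stays in effect otherwise. A standalone sketch of that flow (the argument value and file path are placeholders):

import argparse

from memray import FileReader

parser = argparse.ArgumentParser()
parser.add_argument(
    "--max-memory-records",
    help="Maximum number of memory records to display",
    type=int,
    default=0,
)
args = parser.parse_args(["--max-memory-records", "500"])

# Forward the option only when it was set to a positive value, mirroring
# write_report() in common.py, so FileReader keeps its own default.
kwargs = {}
if args.max_memory_records > 0:
    kwargs["max_memory_records"] = args.max_memory_records
reader = FileReader("memray-output.bin", report_progress=True, **kwargs)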
28 changes: 28 additions & 0 deletions tests/integration/test_tracking.py
@@ -1672,6 +1672,34 @@ def test_memory_snapshots_tick_interval(self, tmp_path):
for prev, _next in zip(memory_snapshots, memory_snapshots[1:])
)

def test_memory_snapshots_limit_when_reading(self, tmp_path):
# GIVEN
allocator = MemoryAllocator()
output = tmp_path / "test.bin"

# WHEN
with Tracker(output):
for _ in range(2):
allocator.valloc(ALLOC_SIZE)
time.sleep(0.11)
allocator.free()

reader = FileReader(output)
memory_snapshots = list(reader.get_memory_snapshots())
temporal_records = list(reader.get_temporal_allocation_records())

assert memory_snapshots
n_snapshots = len(memory_snapshots)
n_temporal_records = len(temporal_records)

reader = FileReader(output, max_memory_records=n_snapshots // 2)
memory_snapshots = list(reader.get_memory_snapshots())
temporal_records = list(reader.get_temporal_allocation_records())

assert memory_snapshots
assert len(memory_snapshots) <= n_snapshots // 2 + 1
assert len(temporal_records) <= n_temporal_records // 2 + 1

def test_temporary_allocations_when_filling_vector_without_preallocating(
self, tmp_path
):
