From 6d413654892e89aad4962138655ccbda78d48fa3 Mon Sep 17 00:00:00 2001
From: David Lakin <github@themoderndev.com>
Date: Thu, 16 May 2024 03:40:10 -0400
Subject: [PATCH] Refactor `fuzz_repo` to Improve Effectiveness & Efficiency

Prior to the changes introduced here, the implementation of `fuzz_repo`
had several issues:

1. `repo.stage()` was not called before the first `repo.do_commit()`
  call
2. When `repo.stage()` was eventually called, the argument it was passed
   was incorrect (a list of absolute paths instead of relative to the
   repo dir) causing a `ValueError` that broke the fuzzer runs. This was
   hidden during the initial local testing because of point 3 below.
3. Inefficient consumption of the fuzzer provided data resulted in the
   input bytes being exhausted early in the `TestOneInput` execution
   until the fuzzer was able to generate a corpus large enough to
   satisfy all of the `Consume*`  calls.

The issues listed above are addressed here by simplifying the test
harness implementation. The result being significanly improved fuzzing
coverage and slightly improved execution speed.

Other changes:

- `EnhancedFuzzedDataProvider.ConsumeRandomString` now accepts an
  argument to optionally exclude unicode surrogates from the returned
  string (useful for places where they will always raise an exception
  like most filesystem operations.)
- Adds new fuzzing engine recommended dictionary entries (now that the
  test is effective enough to generate recommendations.)

Closes: https://bugs.chromium.org/p/oss-fuzz/issues/detail?id=69054
---
 fuzzing/dictionaries/fuzz_repo.dict |  3 +++
 fuzzing/fuzz-targets/fuzz_repo.py   | 32 ++++++++++++++---------------
 fuzzing/fuzz-targets/test_utils.py  | 10 +++++++--
 3 files changed, 26 insertions(+), 19 deletions(-)

diff --git a/fuzzing/dictionaries/fuzz_repo.dict b/fuzzing/dictionaries/fuzz_repo.dict
index 77916edb6..99da684ca 100644
--- a/fuzzing/dictionaries/fuzz_repo.dict
+++ b/fuzzing/dictionaries/fuzz_repo.dict
@@ -1,2 +1,5 @@
 "{self.size}"
 "win32"
+"{self.extended_flags}"
+"P\\303\\000\\000\\000\\000\\000\\000"
+"\\377\\377\\377\\377"
diff --git a/fuzzing/fuzz-targets/fuzz_repo.py b/fuzzing/fuzz-targets/fuzz_repo.py
index dbc1e36ce..61db56486 100644
--- a/fuzzing/fuzz-targets/fuzz_repo.py
+++ b/fuzzing/fuzz-targets/fuzz_repo.py
@@ -21,16 +21,21 @@ def TestOneInput(data):
         repo.set_description(fdp.ConsumeRandomBytes())
         repo.get_description()
 
-        # Generate a minimal set of files based on fuzz data to minimize I/O operations.
-        file_paths = [
-            os.path.join(temp_dir, f"File{i}")
-            for i in range(min(3, fdp.ConsumeIntInRange(1, 3)))
-        ]
-        for file_path in file_paths:
-            with open(file_path, "wb") as f:
-                f.write(fdp.ConsumeRandomBytes())
+        try:
+            # Generate a minimal set of files based on fuzz data to minimize I/O operations.
+            file_names = [
+                f"File{i}{fdp.ConsumeRandomString(without_surrogates=True)}"
+                for i in range(min(3, fdp.ConsumeIntInRange(1, 3)))
+            ]
+            for file in file_names:
+                with open(os.path.join(temp_dir, file), "wb") as f:
+                    f.write(fdp.ConsumeRandomBytes())
+        except (ValueError, OSError):
+            # Exit early if the fuzzer generates an invalid filename.
+            return -1
 
         try:
+            repo.stage(file_names)
             repo.do_commit(
                 message=fdp.ConsumeRandomBytes(),
                 committer=fdp.ConsumeRandomBytes(),
@@ -45,15 +50,8 @@ def TestOneInput(data):
         except ValueError as e:
             if is_expected_exception(["Unable to handle non-minute offset"], e):
                 return -1
-
-        for file_path in file_paths:
-            with open(file_path, "wb") as f:
-                f.write(fdp.ConsumeRandomBytes())
-
-        repo.stage(file_paths)
-        repo.do_commit(
-            message=fdp.ConsumeRandomBytes(),
-        )
+            else:
+                raise e
 
 
 def main():
diff --git a/fuzzing/fuzz-targets/test_utils.py b/fuzzing/fuzz-targets/test_utils.py
index 90342080f..b10d3a10d 100644
--- a/fuzzing/fuzz-targets/test_utils.py
+++ b/fuzzing/fuzz-targets/test_utils.py
@@ -57,11 +57,12 @@ def ConsumeRandomBytes(self, max_length=None) -> bytes:
 
         return self.ConsumeBytes(self.ConsumeIntInRange(0, max_length))
 
-    def ConsumeRandomString(self, max_length=None) -> str:
+    def ConsumeRandomString(self, max_length=None, without_surrogates=False) -> str:
         """Consume bytes to produce a Unicode string.
 
         Args:
           max_length (int, optional): The maximum length of the string. Defaults to the number of remaining bytes.
+          without_surrogates (bool, optional): If True, never generate surrogate pair characters. Defaults to False.
 
         Returns:
          str: A Unicode string.
@@ -71,7 +72,12 @@ def ConsumeRandomString(self, max_length=None) -> str:
         else:
             max_length = min(max_length, self.remaining_bytes())
 
-        return self.ConsumeUnicode(self.ConsumeIntInRange(0, max_length))
+        count = self.ConsumeIntInRange(0, max_length)
+
+        if without_surrogates:
+            return self.ConsumeUnicodeNoSurrogates(count)
+        else:
+            return self.ConsumeUnicode(count)
 
     def ConsumeRandomInt(self, minimum=0, maximum=1234567890) -> int:
         """Consume bytes to produce an integer.