Bench export #1147

Merged: 11 commits, May 12, 2024
4 changes: 4 additions & 0 deletions .gitignore
@@ -92,3 +92,7 @@ webapp/.next/
# locally saved datasets
gpt_engineer/benchmark/benchmarks/apps/dataset
gpt_engineer/benchmark/benchmarks/mbpp/dataset

gpt_engineer/benchmark/minimal_bench_config.toml

test.json
21 changes: 15 additions & 6 deletions gpt_engineer/benchmark/__main__.py
@@ -32,7 +32,7 @@
from gpt_engineer.applications.cli.main import load_env_if_needed
from gpt_engineer.benchmark.bench_config import BenchConfig
from gpt_engineer.benchmark.benchmarks.load import get_benchmark
from gpt_engineer.benchmark.run import print_results, run
from gpt_engineer.benchmark.run import export_yaml_results, print_results, run


app = typer.Typer() # creates a CLI app

@@ -72,8 +72,12 @@
),
],
bench_config: Annotated[
Optional[str], typer.Argument(help="optional task name in benchmark")
str, typer.Argument(help="optional task name in benchmark")
] = os.path.join(os.path.dirname(__file__), "default_bench_config.toml"),
yaml_output: Annotated[
Optional[str],
typer.Option(help="print results for each task", show_default=False),
] = None,
verbose: Annotated[
bool, typer.Option(help="print results for each task", show_default=False)
] = False,
@@ -85,13 +89,12 @@
----------
path_to_agent : str
The file path to the Python module that contains a function called 'default_config_agent'.
benchmarks : str
A comma-separated string of benchmark names to run.
bench_config : Optional[str], default=default_bench_config.toml
bench_config : str, default=default_bench_config.toml
Configuration file for choosing which benchmark problems to run. See default config for more details.
yaml_output : Optional[str], default=None
Pass a path to a yaml file to have results written to file.
verbose : bool, default=False
A flag to indicate whether to print results for each task.

Returns
-------
None
@@ -101,6 +104,7 @@
config = BenchConfig.from_toml(bench_config)
print("using config file: " + bench_config)
benchmarks = list()
benchmark_results = dict()

for specific_config_name in vars(config):
specific_config = getattr(config, specific_config_name)
if hasattr(specific_config, "active"):
@@ -124,6 +128,11 @@
)
print_results(results)
print()
benchmark_results[benchmark_name] = {
"detailed": [result.to_dict() for result in results]
}
if yaml_output is not None:
export_yaml_results(yaml_output, benchmark_results, config.to_dict())



if __name__ == "__main__":
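The new `export_yaml_results` helper is imported from `gpt_engineer.benchmark.run`, but the `run.py` side of this PR is not shown here. Below is a minimal sketch of what such a helper could look like, inferred only from the call site `export_yaml_results(yaml_output, benchmark_results, config.to_dict())` and assuming PyYAML as the serializer; the internals are illustrative, not the PR's actual implementation.

```python
import yaml


def export_yaml_results(yaml_path: str, complete_results: dict, config: dict) -> None:
    """Write the collected benchmark results and the resolved config to a YAML file.

    Sketch only: the signature is inferred from the call site in __main__.py;
    the real helper lives in gpt_engineer/benchmark/run.py.
    """
    complete_results["config"] = config  # keep the run's config next to its results
    with open(yaml_path, "w") as f:
        yaml.safe_dump(complete_results, f, sort_keys=False)
```

With typer's default option naming, the new option would presumably be passed as `--yaml-output results.yaml` alongside the agent path and benchmark config.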
27 changes: 20 additions & 7 deletions gpt_engineer/benchmark/bench_config.py
@@ -1,6 +1,8 @@
from dataclasses import dataclass, field
from pathlib import Path

from tomlkit.items import Integer

from gpt_engineer.core.project_config import read_config


@@ -11,6 +13,7 @@
test_end_index: int | None = 1
train_start_index: int | None = 0
train_end_index: int | None = 0
examples_per_problem: int | None = 10


@dataclass
@@ -25,19 +28,13 @@
active: bool | None = True


@dataclass
class GptengConfig:
active: bool | None = True


@dataclass
class BenchConfig:
"""Configuration for the GPT Engineer CLI and gptengineer.app via `gpt-engineer.toml`."""

apps: AppsConfig = field(default_factory=AppsConfig)
mbpp: MbppConfig = field(default_factory=MbppConfig)
gptme: GptmeConfig = field(default_factory=GptmeConfig)
gpteng: GptengConfig = field(default_factory=GptengConfig)

@classmethod
def from_toml(cls, config_file: Path | str):
@@ -52,5 +49,21 @@
apps=AppsConfig(**config_dict.get("apps", {})),
mbpp=MbppConfig(**config_dict.get("mbpp", {})),
gptme=GptmeConfig(**config_dict.get("gptme", {})),
gpteng=GptengConfig(**config_dict.get("gpteng", {})),
)

@staticmethod
def recursive_resolve(data_dict):
for key, value in data_dict.items():
if isinstance(value, Integer):
data_dict[key] = int(value)
elif isinstance(value, dict):
BenchConfig.recursive_resolve(value)

def to_dict(self):
dict_config = {
benchmark_name: {key: val for key, val in spec_config.__dict__.items()}
for benchmark_name, spec_config in self.__dict__.items()
}
BenchConfig.recursive_resolve(dict_config)


return dict_config

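The new `recursive_resolve` / `to_dict` pair exists because `read_config` is tomlkit-based, so values loaded from the benchmark config arrive as tomlkit wrapper types (hence the `tomlkit.items.Integer` import) rather than plain built-ins, and those wrappers do not serialize cleanly. A small illustration of the issue, under the assumption that PyYAML is what ultimately writes the exported results:

```python
import tomlkit
import yaml
from tomlkit.items import Integer

# tomlkit parses scalars into its own wrapper types; Integer subclasses int.
doc = tomlkit.parse("examples_per_problem = 10")
value = doc["examples_per_problem"]
print(isinstance(value, Integer), isinstance(value, int))  # True True

# PyYAML's safe_dump only registers representers for the plain built-in types,
# so dumping the wrapper directly generally fails until it is resolved to int,
# which is what BenchConfig.recursive_resolve does for the whole config dict.
try:
    yaml.safe_dump({"examples_per_problem": value})
except yaml.representer.RepresenterError:
    print("safe_dump cannot represent tomlkit.items.Integer")

print(yaml.safe_dump({"examples_per_problem": int(value)}), end="")
```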
5 changes: 3 additions & 2 deletions gpt_engineer/benchmark/benchmarks/apps/load.py
@@ -24,7 +24,6 @@
from gpt_engineer.core.prompt import Prompt

DATASET_PATH = Path(__file__).parent / "dataset"
MAX_N_TEST_EXAMPLES = 10


class AppsAssertion:
@@ -106,7 +105,9 @@ def load_apps(config: AppsConfig) -> Benchmark:
expected=problem.outputs[i],
command="python main.py" + ' "' + problem.inputs[i] + '"',
).evaluate
for i in range(min(len(problem.outputs), MAX_N_TEST_EXAMPLES))
for i in range(
min(len(problem.outputs), config.examples_per_problem)
)
},
)
)
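With the module-level `MAX_N_TEST_EXAMPLES = 10` constant removed, the cap on evaluated I/O examples per APPS problem now comes from the new `examples_per_problem` field on `AppsConfig`. A hedged usage sketch (it assumes the APPS dataset has already been downloaded to the local dataset path and leaves the other `AppsConfig` fields at their defaults):

```python
from gpt_engineer.benchmark.bench_config import AppsConfig
from gpt_engineer.benchmark.benchmarks.apps.load import load_apps

# Evaluate at most 5 I/O examples per APPS problem instead of the previous
# hard-coded cap of 10.
config = AppsConfig(active=True, examples_per_problem=5)
benchmark = load_apps(config)  # requires the locally saved APPS dataset
```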
8 changes: 0 additions & 8 deletions gpt_engineer/benchmark/benchmarks/gpteng/__init__.py

This file was deleted.

188 changes: 0 additions & 188 deletions gpt_engineer/benchmark/benchmarks/gpteng/eval_tools.py

This file was deleted.

