From aecf2b1869a502b7ac633d23d6b4b9441a643bba Mon Sep 17 00:00:00 2001 From: Axel Theorell Date: Sun, 28 Apr 2024 12:41:14 +0200 Subject: [PATCH 01/11] code to test whether tox is culprit for openai_key fail on github --- .github/workflows/ci.yaml | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index a43f565200..c37cd0d544 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -22,6 +22,14 @@ jobs: python-version: ${{ matrix.python-version }} cache: 'pip' #Note that pip is for the tox level. Poetry is still used for installing the specific environments (tox.ini) + - name: Install test runner + run: pip install pytest pytest-cov + + - name: Run unit tests + env: + OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }} + run: pytest --cov=gpt_engineer + - name: Install tox run: pip install tox From 81c980f1fcc8f1da63b1131245eda6e6abf09db6 Mon Sep 17 00:00:00 2001 From: Axel Theorell Date: Sun, 28 Apr 2024 12:45:03 +0200 Subject: [PATCH 02/11] adding depedencies to pytest ci level --- .github/workflows/ci.yaml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index c37cd0d544..ac937871c2 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -22,6 +22,9 @@ jobs: python-version: ${{ matrix.python-version }} cache: 'pip' #Note that pip is for the tox level. Poetry is still used for installing the specific environments (tox.ini) + - name: Install package + run: pip install -e . + - name: Install test runner run: pip install pytest pytest-cov From 2dcf6b215042cc754148eb226565683fd897dbfd Mon Sep 17 00:00:00 2001 From: Axel Theorell Date: Tue, 30 Apr 2024 16:19:45 +0200 Subject: [PATCH 03/11] optional json export of bench results --- .gitignore | 4 ++++ gpt_engineer/benchmark/__main__.py | 21 +++++++++++++++------ gpt_engineer/benchmark/run.py | 14 ++++++++++++++ gpt_engineer/benchmark/types.py | 5 +++++ 4 files changed, 38 insertions(+), 6 deletions(-) diff --git a/.gitignore b/.gitignore index 79745c28db..c0c793b88e 100644 --- a/.gitignore +++ b/.gitignore @@ -92,3 +92,7 @@ webapp/.next/ # locally saved datasets gpt_engineer/benchmark/benchmarks/apps/dataset gpt_engineer/benchmark/benchmarks/mbpp/dataset + +gpt_engineer/benchmark/minimal_bench_config.toml + +test.json diff --git a/gpt_engineer/benchmark/__main__.py b/gpt_engineer/benchmark/__main__.py index d8c1f58732..06aa4f000c 100644 --- a/gpt_engineer/benchmark/__main__.py +++ b/gpt_engineer/benchmark/__main__.py @@ -32,7 +32,7 @@ from gpt_engineer.applications.cli.main import load_env_if_needed from gpt_engineer.benchmark.bench_config import BenchConfig from gpt_engineer.benchmark.benchmarks.load import get_benchmark -from gpt_engineer.benchmark.run import print_results, run +from gpt_engineer.benchmark.run import export_json_results, print_results, run app = typer.Typer() # creates a CLI app @@ -72,8 +72,12 @@ def main( ), ], bench_config: Annotated[ - Optional[str], typer.Argument(help="optional task name in benchmark") + str, typer.Argument(help="optional task name in benchmark") ] = os.path.join(os.path.dirname(__file__), "default_bench_config.toml"), + json_output: Annotated[ + Optional[str], + typer.Option(help="print results for each task", show_default=False), + ] = None, verbose: Annotated[ bool, typer.Option(help="print results for each task", show_default=False) ] = False, @@ -85,13 +89,12 @@ def main( ---------- path_to_agent : str The file path to the Python module that contains a 
function called 'default_config_agent'. - benchmarks : str - A comma-separated string of benchmark names to run. - bench_config : Optional[str], default=default_bench_config.toml + bench_config : str, default=default_bench_config.toml Configuration file for choosing which benchmark problems to run. See default config for more details. + json_output: Optional[str], default=None + Pass a path to a json file to have results written to file. verbose : bool, default=False A flag to indicate whether to print results for each task. - Returns ------- None @@ -101,6 +104,7 @@ def main( config = BenchConfig.from_toml(bench_config) print("using config file: " + bench_config) benchmarks = list() + benchmark_results = dict() for specific_config_name in vars(config): specific_config = getattr(config, specific_config_name) if hasattr(specific_config, "active"): @@ -124,6 +128,11 @@ def main( ) print_results(results) print() + benchmark_results[benchmark_name] = { + "detailed": [result.to_dict() for result in results] + } + if json_output is not None: + export_json_results(json_output, benchmark_results) if __name__ == "__main__": diff --git a/gpt_engineer/benchmark/run.py b/gpt_engineer/benchmark/run.py index c2de03950f..56ef76e537 100644 --- a/gpt_engineer/benchmark/run.py +++ b/gpt_engineer/benchmark/run.py @@ -12,6 +12,7 @@ print_results : function Prints the results of the benchmark tasks to the console. """ +import json import time from typing import List @@ -132,3 +133,16 @@ def print_results(results: list[TaskResult]): print(f"Average success rate: {avg_success_rate * 100}% on {len(results)} tasks") print("--- Results ---") print() + + +def export_json_results(json_path, complete_results): + for results in complete_results.values(): + correct_tasks = [ + task_result + for task_result in results["detailed"] + if task_result["solved"] == 1.0 + ] + fraction_correct = len(correct_tasks) / len(results["detailed"]) + results["fully_solved"] = fraction_correct + with open(json_path, "w") as f: + json.dump(complete_results, f, indent=4) diff --git a/gpt_engineer/benchmark/types.py b/gpt_engineer/benchmark/types.py index 9ca15df28c..f444b0efd6 100644 --- a/gpt_engineer/benchmark/types.py +++ b/gpt_engineer/benchmark/types.py @@ -87,3 +87,8 @@ def success_rate(self) -> float: ) return succeeded / len(self.assertion_results) + + def to_dict(self) -> dict: + out_dict = {key: value for key, value in self.__dict__.items()} + out_dict["solved"] = self.success_rate + return out_dict From 4543f59df476cf6c6b824d19b816f3bb65793c15 Mon Sep 17 00:00:00 2001 From: Axel Theorell Date: Sat, 4 May 2024 12:18:21 +0200 Subject: [PATCH 04/11] restoring ci that was accidentally changed --- .github/workflows/ci.yaml | 11 ----------- 1 file changed, 11 deletions(-) diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index ac937871c2..a43f565200 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -22,17 +22,6 @@ jobs: python-version: ${{ matrix.python-version }} cache: 'pip' #Note that pip is for the tox level. Poetry is still used for installing the specific environments (tox.ini) - - name: Install package - run: pip install -e . 
- - - name: Install test runner - run: pip install pytest pytest-cov - - - name: Run unit tests - env: - OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }} - run: pytest --cov=gpt_engineer - - name: Install tox run: pip install tox From f8066b4e4315a45d97fc6c341e522738d48d34b4 Mon Sep 17 00:00:00 2001 From: Axel Theorell Date: Sat, 4 May 2024 13:18:44 +0200 Subject: [PATCH 05/11] killed gpteng benchmark over path errors --- gpt_engineer/benchmark/bench_config.py | 7 - .../benchmark/benchmarks/gpteng/__init__.py | 8 - .../benchmark/benchmarks/gpteng/eval_tools.py | 188 --------------- .../gpteng/evals/EVAL_NEW_CODE_RESULTS.md | 31 --- .../gpteng/evals/IMPROVE_CODE_RESULTS.md | 65 ------ .../benchmarks/gpteng/evals/README.md | 15 -- .../known_code_blobs/snake_game_files.txt | 129 ----------- .../known_code_blobs/web_todo_files.txt | 108 --------- .../benchmark/benchmarks/gpteng/load.py | 217 ------------------ gpt_engineer/benchmark/benchmarks/load.py | 2 - .../benchmark/default_bench_config.toml | 3 - 11 files changed, 773 deletions(-) delete mode 100644 gpt_engineer/benchmark/benchmarks/gpteng/__init__.py delete mode 100644 gpt_engineer/benchmark/benchmarks/gpteng/eval_tools.py delete mode 100644 gpt_engineer/benchmark/benchmarks/gpteng/evals/EVAL_NEW_CODE_RESULTS.md delete mode 100644 gpt_engineer/benchmark/benchmarks/gpteng/evals/IMPROVE_CODE_RESULTS.md delete mode 100644 gpt_engineer/benchmark/benchmarks/gpteng/evals/README.md delete mode 100644 gpt_engineer/benchmark/benchmarks/gpteng/known_code_blobs/snake_game_files.txt delete mode 100644 gpt_engineer/benchmark/benchmarks/gpteng/known_code_blobs/web_todo_files.txt delete mode 100644 gpt_engineer/benchmark/benchmarks/gpteng/load.py diff --git a/gpt_engineer/benchmark/bench_config.py b/gpt_engineer/benchmark/bench_config.py index aafc38f524..cc754289c7 100644 --- a/gpt_engineer/benchmark/bench_config.py +++ b/gpt_engineer/benchmark/bench_config.py @@ -25,11 +25,6 @@ class GptmeConfig: active: bool | None = True -@dataclass -class GptengConfig: - active: bool | None = True - - @dataclass class BenchConfig: """Configuration for the GPT Engineer CLI and gptengineer.app via `gpt-engineer.toml`.""" @@ -37,7 +32,6 @@ class BenchConfig: apps: AppsConfig = field(default_factory=AppsConfig) mbpp: MbppConfig = field(default_factory=MbppConfig) gptme: GptmeConfig = field(default_factory=GptmeConfig) - gpteng: GptengConfig = field(default_factory=GptengConfig) @classmethod def from_toml(cls, config_file: Path | str): @@ -52,5 +46,4 @@ def from_dict(cls, config_dict: dict): apps=AppsConfig(**config_dict.get("apps", {})), mbpp=MbppConfig(**config_dict.get("mbpp", {})), gptme=GptmeConfig(**config_dict.get("gptme", {})), - gpteng=GptengConfig(**config_dict.get("gpteng", {})), ) diff --git a/gpt_engineer/benchmark/benchmarks/gpteng/__init__.py b/gpt_engineer/benchmark/benchmarks/gpteng/__init__.py deleted file mode 100644 index 86e651640b..0000000000 --- a/gpt_engineer/benchmark/benchmarks/gpteng/__init__.py +++ /dev/null @@ -1,8 +0,0 @@ -""" -The `gpteng` package contains modules and data for evaluating the performance of -GPT-based models on engineering tasks. It includes tools for running evaluations, -loading benchmarks, and defining the expected outcomes of tasks. - -This package is part of the GPT Engineer benchmark suite, which aims to assess -the capabilities of AI models in software engineering scenarios. 
-""" diff --git a/gpt_engineer/benchmark/benchmarks/gpteng/eval_tools.py b/gpt_engineer/benchmark/benchmarks/gpteng/eval_tools.py deleted file mode 100644 index 4b67885ddb..0000000000 --- a/gpt_engineer/benchmark/benchmarks/gpteng/eval_tools.py +++ /dev/null @@ -1,188 +0,0 @@ -""" -Evaluation tools for assessing the performance of GPT-based models on code editing -and creation tasks. These tools provide low-level checks on the written code and -support higher-level tests for a comprehensive evaluation. - -Currently, the scope is limited to a few programming languages, with the potential -for future expansion. - -Functions ---------- -check_language : function - Checks if the specified language in the evaluation dictionary is supported. - -assert_exists_in_source_code : function - Checks if a specified string exists in the source code within the provided files dictionary. - -run_code_class_has_property : function - Executes the code and checks if the specified class has the desired property. - -run_code_class_has_property_w_value : function - Executes the code and checks if the specified class has the desired property with the expected value. - -run_code_eval_function : function - Executes the code and evaluates a function call, checking if it returns the expected value. - -check_evaluation_component : function - Dispatches the evaluation component based on the type specified in the evaluation dictionary. -""" - -from gpt_engineer.core.files_dict import FilesDict - -EVAL_LIST_NAME = "evaluations" # the top level list in the YAML file - - -def check_language(eval_d: dict) -> None: - """ - Checks if the specified language in the evaluation dictionary is supported. - - Parameters - ---------- - eval_d : dict - The evaluation dictionary containing the 'language' key. - - Raises - ------ - Exception - If the specified language is not supported. - """ - if eval_d["language"] != "python": - raise Exception(f"Language: {eval_d['language']} is not supported.") - - -def assert_exists_in_source_code(eval_d: dict, files_dict: FilesDict) -> bool: - """ - Checks if a specified string exists in the source code within the provided files dictionary. - - Parameters - ---------- - eval_d : dict - The evaluation dictionary containing the 'source_file' and 'existing_string' keys. - files_dict : FilesDict - The dictionary of file names to their respective source code content. - - Returns - ------- - bool - True if the specified string exists in the source code, False otherwise. - """ - - source_body = files_dict[eval_d["source_file"]] - return source_body.find(eval_d["existing_string"]) > -1 - - -def run_code_class_has_property(eval_d: dict, files_dict: FilesDict) -> bool: - """ - Executes the code and checks if the specified class has the desired property. - - Parameters - ---------- - eval_d : dict - The evaluation dictionary containing the 'source_file', 'class_name', and 'property_name' keys. - files_dict : FilesDict - The dictionary of file names to their respective source code content. - - Returns - ------- - bool - True if the class has the specified property, False otherwise. - """ - - check_language(eval_d) - source_body = files_dict[eval_d["source_file"]] - exec(source_body) - - class_ref = locals().get(eval_d["class_name"]) - ob = class_ref() - return hasattr(ob, eval_d["property_name"]) - - -def run_code_class_has_property_w_value(eval_d: dict, files_dict: FilesDict) -> bool: - """ - Executes the code and checks if the specified class has the desired property with the expected value. 
- - Parameters - ---------- - eval_d : dict - The evaluation dictionary containing the 'source_file', 'class_name', 'property_name', and 'expected_value' keys. - files_dict : FilesDict - The dictionary of file names to their respective source code content. - - Returns - ------- - bool - True if the class has the specified property with the expected value, False otherwise. - """ - - check_language(eval_d) - source_body = files_dict[eval_d["source_file"]] - exec(source_body) - - class_ref = locals().get(eval_d["class_name"]) - ob = class_ref() - - assert hasattr(ob, eval_d["property_name"]) - - return getattr(ob, eval_d["property_name"]) == eval_d["expected_value"] - - -def run_code_eval_function(eval_d: dict, files_dict: FilesDict) -> bool: - """ - Executes the code and evaluates a function call, checking if it returns the expected value. - - Parameters - ---------- - eval_d : dict - The evaluation dictionary containing the 'source_file', 'function_name', and 'expected_value' keys. - files_dict : FilesDict - The dictionary of file names to their respective source code content. - - Returns - ------- - bool - True if the function call returns the expected value, False otherwise. - """ - - check_language(eval_d) - source_body = files_dict[eval_d["source_file"]] - exec(source_body) - function_ref = globals().get(eval_d["function_name"]) - - # TODO: add the ability to have function arguments - return function_ref() == eval_d["expected_value"] - - -def check_evaluation_component(eval_d: dict, files_dict: FilesDict) -> bool: - """ - Dispatches the evaluation component based on the type specified in the evaluation dictionary. - - Parameters - ---------- - eval_d : dict - The evaluation dictionary containing the 'type' key and other relevant information for the evaluation. - files_dict : FilesDict - The dictionary of file names to their respective source code content. - - Returns - ------- - bool - The result of the dispatched evaluation component. - - Raises - ------ - Exception - If the test type specified in the evaluation dictionary is not recognized. 
- """ - - test_type = eval_d.get("type") - if test_type == "assert_exists_in_source_code": - return assert_exists_in_source_code(eval_d, files_dict) - elif test_type == "run_code_class_has_property": - return run_code_class_has_property(eval_d, files_dict) - elif test_type == "run_code_class_has_property_w_value": - return run_code_class_has_property_w_value(eval_d, files_dict) - elif test_type == "run_code_eval_function": - return run_code_eval_function(eval_d, files_dict) - # The following are for new code - else: - raise Exception(f"Test type '{test_type}' is not recognized.") diff --git a/gpt_engineer/benchmark/benchmarks/gpteng/evals/EVAL_NEW_CODE_RESULTS.md b/gpt_engineer/benchmark/benchmarks/gpteng/evals/EVAL_NEW_CODE_RESULTS.md deleted file mode 100644 index 3d60a0da41..0000000000 --- a/gpt_engineer/benchmark/benchmarks/gpteng/evals/EVAL_NEW_CODE_RESULTS.md +++ /dev/null @@ -1,31 +0,0 @@ -## 2023-09-15 - -### New Code Evaluation Summary: - -| Project | Evaluation | All Tests Pass | -|:---------------------------|:-------------|:-----------------| -| projects/password_gen_eval | password_gen | ✅ | - -### Detailed Test Results: - -| Project | Evaluation | Test | Pass | -|:---------------------------|:-------------|:------------------------------------|:-------| -| projects/password_gen_eval | password_gen | check_executable_exits_normally | ✅ | -| projects/password_gen_eval | password_gen | check_executable_satisfies_function | ✅ | -## 2023-09-18 - -### Existing Code Evaluation Summary: - -| Project | Evaluation | All Tests Pass | -|:----------------------------|:-------------------|:-----------------| -| projects/currency_converter | currency_converter | ✅ | -| projects/password_gen_eval | password_gen | ✅ | - -### Detailed Test Results: - -| Project | Evaluation | Test | Pass | -|:----------------------------|:-------------------|:------------------------------------|:-------| -| projects/currency_converter | currency_converter | check_executable_exits_normally | ✅ | -| projects/currency_converter | currency_converter | check_executable_satisfies_function | ✅ | -| projects/password_gen_eval | password_gen | check_executable_exits_normally | ✅ | -| projects/password_gen_eval | password_gen | check_executable_satisfies_function | ✅ | diff --git a/gpt_engineer/benchmark/benchmarks/gpteng/evals/IMPROVE_CODE_RESULTS.md b/gpt_engineer/benchmark/benchmarks/gpteng/evals/IMPROVE_CODE_RESULTS.md deleted file mode 100644 index 55fa1758d6..0000000000 --- a/gpt_engineer/benchmark/benchmarks/gpteng/evals/IMPROVE_CODE_RESULTS.md +++ /dev/null @@ -1,65 +0,0 @@ -## 2023-08-24 - -### Existing Code Evaluation Summary: - -| Project | Evaluation | All Tests Pass | -|:-------------------------|:-------------------|:-----------------| -| projects/snake_game_eval | simple_code_modify | ✅ | - -### Detailed Test Results: - -| Project | Evaluation | Test | Pass | -|:-------------------------|:-------------------|:------------------------------------|:-------| -| projects/snake_game_eval | simple_code_modify | assert_exists_in_source_code | ✅ | -| projects/snake_game_eval | simple_code_modify | assert_exists_in_source_code | ✅ | -| projects/snake_game_eval | simple_code_modify | run_code_class_has_property | ✅ | -| projects/snake_game_eval | simple_code_modify | run_code_class_has_property | ✅ | -| projects/snake_game_eval | simple_code_modify | run_code_class_has_property_w_value | ✅ | -| projects/snake_game_eval | simple_code_modify | run_code_class_has_property_w_value | ✅ | - -## 2023-08-25 - -### 
Existing Code Evaluation Summary: - -| Project | Evaluation | All Tests Pass | -|:-------------------------|:--------------------------|:-----------------| -| projects/snake_game_eval | simple_code_modify | ✅ | -| projects/web_todo_list | modify_web_app_appearance | ✅ | -| projects/snake_game_eval | modify_functionality | ❌ | - -### Detailed Test Results: - -| Project | Evaluation | Test | Pass | -|:-------------------------|:--------------------------|:------------------------------------|:-------| -| projects/snake_game_eval | simple_code_modify | assert_exists_in_source_code | ✅ | -| projects/snake_game_eval | simple_code_modify | assert_exists_in_source_code | ✅ | -| projects/snake_game_eval | simple_code_modify | run_code_class_has_property | ✅ | -| projects/snake_game_eval | simple_code_modify | run_code_class_has_property | ✅ | -| projects/snake_game_eval | simple_code_modify | run_code_class_has_property_w_value | ✅ | -| projects/snake_game_eval | simple_code_modify | run_code_class_has_property_w_value | ✅ | -| projects/web_todo_list | modify_web_app_appearance | assert_exists_in_source_code | ✅ | -| projects/web_todo_list | modify_web_app_appearance | assert_exists_in_source_code | ✅ | -| projects/snake_game_eval | modify_functionality | assert_exists_in_source_code | ❌ | -## 2023-09-15 - -### Existing Code Evaluation Summary: - -| Project | Evaluation | All Tests Pass | -|:-------------------------|:--------------------------|:-----------------| -| projects/snake_game_eval | simple_code_modify | ❌ | -| projects/web_todo_list | modify_web_app_appearance | ❌ | -| projects/snake_game_eval | modify_functionality | ❌ | - -### Detailed Test Results: - -| Project | Evaluation | Test | Pass | -|:-------------------------|:--------------------------|:------------------------------------|:-------| -| projects/snake_game_eval | simple_code_modify | assert_exists_in_source_code | ❌ | -| projects/snake_game_eval | simple_code_modify | assert_exists_in_source_code | ❌ | -| projects/snake_game_eval | simple_code_modify | run_code_class_has_property | ✅ | -| projects/snake_game_eval | simple_code_modify | run_code_class_has_property | ✅ | -| projects/snake_game_eval | simple_code_modify | run_code_class_has_property_w_value | ❌ | -| projects/snake_game_eval | simple_code_modify | run_code_class_has_property_w_value | ❌ | -| projects/web_todo_list | modify_web_app_appearance | assert_exists_in_source_code | ❌ | -| projects/web_todo_list | modify_web_app_appearance | assert_exists_in_source_code | ❌ | -| projects/snake_game_eval | modify_functionality | assert_exists_in_source_code | ❌ | diff --git a/gpt_engineer/benchmark/benchmarks/gpteng/evals/README.md b/gpt_engineer/benchmark/benchmarks/gpteng/evals/README.md deleted file mode 100644 index 58c5114203..0000000000 --- a/gpt_engineer/benchmark/benchmarks/gpteng/evals/README.md +++ /dev/null @@ -1,15 +0,0 @@ - - -# Evals - -Evals are a set of tests that allow us to measure the performance of the gpt-engineer whole system. This includes the gpt-enginer code, options and the chosen LLM. 
- -### Running Evals - -To run the existing code evals make sure you are in the gpt-engineer top level directory (you should see a directory called `evals`) type: - -`python evals/evals_existing_code.py` This will run the default test file: `evals/existing_code_eval.yaml`, or you can run any YAML file of tests you wish with the command: `python evals/evals_existing_code.py your_test_file.yaml` - -Similarly to run the new code evals type: - -`python evals/evals_new_code.py` diff --git a/gpt_engineer/benchmark/benchmarks/gpteng/known_code_blobs/snake_game_files.txt b/gpt_engineer/benchmark/benchmarks/gpteng/known_code_blobs/snake_game_files.txt deleted file mode 100644 index ad3256adb3..0000000000 --- a/gpt_engineer/benchmark/benchmarks/gpteng/known_code_blobs/snake_game_files.txt +++ /dev/null @@ -1,129 +0,0 @@ - -main.py -```python -from game import Game - -def main(): - game = Game() - game.run() - -if __name__ == "__main__": - main() -``` - -game.py -```python -from snake import Snake -from food import Food -from grid import Grid -from controller import Controller -from view import View - -class Game: - def __init__(self): - self.snake = Snake() - self.food = Food() - self.grid = Grid() - self.controller = Controller() - self.view = View() - - def run(self): - while True: - self.controller.read_input() - self.snake.move(self.controller.direction) - if self.snake.eat(self.food): - self.snake.grow() - self.food.place() - if self.snake.collide(self.grid): - break - self.view.draw(self.grid, self.snake, self.food) -``` - -snake.py -```python -class Snake: - def __init__(self): - self.body = [(0, 0)] - self.direction = (0, 1) - - def move(self, direction): - self.direction = direction - head = self.body[0] - new_head = (head[0] + direction[0], head[1] + direction[1]) - self.body.insert(0, new_head) - self.body.pop() - - def grow(self): - self.body.append(self.body[-1]) - - def eat(self, food): - return self.body[0] == food.position - - def collide(self, grid): - head = self.body[0] - return head in self.body[1:] or not (0 <= head[0] < grid.width and 0 <= head[1] < grid.height) -``` - -food.py -```python -import random - -class Food: - def __init__(self, width=10, height=10): - self.width = width - self.height = height - self.position = (0, 0) - self.place() - - def place(self): - self.position = (random.randint(0, self.width-1), random.randint(0, self.height-1)) -``` - -grid.py -```python -class Grid: - def __init__(self, width=10, height=10): - self.width = width - self.height = height -``` - -controller.py -```python -import msvcrt - -class Controller: - def __init__(self): - self.direction = (0, 1) - - def read_input(self): - key = msvcrt.getch() - if key == b'w': - self.direction = (-1, 0) - elif key == b's': - self.direction = (1, 0) - elif key == b'a': - self.direction = (0, -1) - elif key == b'd': - self.direction = (0, 1) -``` - -view.py -```python -class View: - def draw(self, grid, snake, food): - for i in range(grid.height): - for j in range(grid.width): - if (i, j) in snake.body: - print('S', end='') - elif (i, j) == food.position: - print('F', end='') - else: - print('.', end='') - print() - print() -``` - -requirements.txt -```plaintext -msvcrt==1.0 -``` diff --git a/gpt_engineer/benchmark/benchmarks/gpteng/known_code_blobs/web_todo_files.txt b/gpt_engineer/benchmark/benchmarks/gpteng/known_code_blobs/web_todo_files.txt deleted file mode 100644 index 46333cd48d..0000000000 --- a/gpt_engineer/benchmark/benchmarks/gpteng/known_code_blobs/web_todo_files.txt +++ /dev/null @@ -1,108 
+0,0 @@
-
-index.html
-```html
-<!DOCTYPE html>
-<html>
-<head>
-    <title>To-Do List App</title>
-    <link rel="stylesheet" href="styles.css">
-</head>
-<body>

-    <h1>To-Do List</h1>
-    <form id="task-form">
-        <input type="text" id="task-input">
-        <button type="submit">Add Task</button>
-    </form>
-    <ul id="task-list"></ul>
    - - - -``` - -Next, we have the `styles.css` file: - -styles.css -```css -body { - font-family: Arial, sans-serif; -} - -#task-form { - margin-bottom: 20px; -} - -#task-input { - margin-right: 10px; -} - -#task-list { - list-style-type: none; -} -``` - -Finally, we have the `app.js` file: - -app.js -```javascript -document.addEventListener('DOMContentLoaded', () => { - const taskForm = document.getElementById('task-form'); - const taskInput = document.getElementById('task-input'); - const taskList = document.getElementById('task-list'); - - taskForm.addEventListener('submit', event => { - event.preventDefault(); - addTask(taskInput.value); - taskInput.value = ''; - }); - - taskList.addEventListener('click', event => { - if (event.target.tagName === 'BUTTON') { - if (event.target.textContent === 'Edit') { - editTask(event.target.parentElement); - } else if (event.target.textContent === 'Delete') { - deleteTask(event.target.parentElement); - } - } - }); - - function addTask(task) { - const listItem = document.createElement('li'); - listItem.textContent = task; - const editButton = document.createElement('button'); - editButton.textContent = 'Edit'; - const deleteButton = document.createElement('button'); - deleteButton.textContent = 'Delete'; - listItem.appendChild(editButton); - listItem.appendChild(deleteButton); - taskList.appendChild(listItem); - saveTasks(); - } - - function editTask(listItem) { - const task = prompt('Edit the task', listItem.firstChild.textContent); - listItem.firstChild.textContent = task; - saveTasks(); - } - - function deleteTask(listItem) { - taskList.removeChild(listItem); - saveTasks(); - } - - function saveTasks() { - const tasks = Array.from(taskList.children).map(listItem => listItem.firstChild.textContent); - localStorage.setItem('tasks', JSON.stringify(tasks)); - } - - function loadTasks() { - const tasks = JSON.parse(localStorage.getItem('tasks')); - if (tasks) { - tasks.forEach(task => addTask(task)); - } - } - - loadTasks(); -}); -``` diff --git a/gpt_engineer/benchmark/benchmarks/gpteng/load.py b/gpt_engineer/benchmark/benchmarks/gpteng/load.py deleted file mode 100644 index c9c115234f..0000000000 --- a/gpt_engineer/benchmark/benchmarks/gpteng/load.py +++ /dev/null @@ -1,217 +0,0 @@ -""" -Module for loading GPT-Eng evaluation tasks. - -This module provides functionality to load tasks for evaluating GPT-based models -on engineering tasks. It converts predefined evaluation cases into Task objects -that can be used to benchmark the performance of AI models. - -Functions ---------- -expect_to_assertion : function - Converts an expected result dictionary to an assertion function. - -eval_to_task : function - Converts an evaluation case dictionary to a Task object. - -load_gpteng : function - Loads the GPT-Eng benchmark, which consists of a series of tasks for evaluation. 
-""" - -from pathlib import Path - -from gpt_engineer.benchmark.bench_config import GptengConfig -from gpt_engineer.benchmark.benchmarks.gpteng.eval_tools import ( - check_evaluation_component, -) -from gpt_engineer.benchmark.types import Assertable, Benchmark, Task -from gpt_engineer.core.chat_to_files import chat_to_files_dict -from gpt_engineer.core.prompt import Prompt - -evaluations = [ - { - "name": "simple_code_modify", - "project_root": "projects/snake_game_eval", - "code_blob": "gpt_engineer/benchmark/benchmarks/gpteng/known_code_blobs/snake_game_files.txt", - "improve_code_prompt": "The grid is currently 10x10, change the grid to be 42x42.", - "expected_results": [ - { - "type": "assert_exists_in_source_code", - "source_file": "grid.py", - "existing_string": "width=42", - }, - { - "type": "assert_exists_in_source_code", - "source_file": "grid.py", - "existing_string": "height=42", - }, - { - "type": "run_code_class_has_property", - "language": "python", - "source_file": "grid.py", - "class_name": "Grid", - "property_name": "height", - }, - { - "type": "run_code_class_has_property", - "language": "python", - "source_file": "grid.py", - "class_name": "Grid", - "property_name": "width", - }, - { - "type": "run_code_class_has_property_w_value", - "language": "python", - "source_file": "grid.py", - "class_name": "Grid", - "property_name": "height", - "expected_value": 42, - }, - { - "type": "run_code_class_has_property_w_value", - "language": "python", - "source_file": "grid.py", - "class_name": "Grid", - "property_name": "width", - "expected_value": 42, - }, - ], - }, - { - "name": "modify_web_app_appearance", - "project_root": "projects/web_todo_list", - "code_blob": "gpt_engineer/benchmark/benchmarks/gpteng/known_code_blobs/web_todo_files.txt", - "improve_code_prompt": "Fix the margins around the form to be 45px, and make the background color orange.", - "expected_results": [ - { - "type": "assert_exists_in_source_code", - "source_file": "styles.css", - "existing_string": "#task-form {\\n margin: 45px;", - }, - { - "type": "assert_exists_in_source_code", - "source_file": "styles.css", - "existing_string": "background-color: orange;", - }, - ], - }, - { - "name": "modify_functionality", - "project_root": "projects/snake_game_eval", - "code_blob": "gpt_engineer/benchmark/benchmarks/gpteng/known_code_blobs/snake_game_files.txt", - "improve_code_prompt": "Add a 2 second delay before the game starts.", - "expected_results": [ - { - "type": "assert_exists_in_source_code", - "source_file": "game.py", - "existing_string": "time.sleep(2)", - } - ], - }, -] - -# Not supporting execution paths that used to exist -# evaluations = [ -# { -# "name": "currency_converter", -# "project_root": "projects/currency_converter", -# "code_prompt": "Build a currency converter CLI tool in Python using an API for exchange rates. The currency converter should be a python program named currency.py with three required arguments: base currency symbol, target currency symbol and base currency amount. The currency converter will convert the amount in base currency amount to the target currency. The output of the program should only be the amount of target currency. 
For example the following command: `python currency.py USD CNY 1` should return a number like 7.5.", -# "expected_results": [ -# { -# "type": "check_executable_exits_normally", -# "executable_name": "python currency.py", -# "executable_arguments": "USD CAD 10" -# }, -# { -# "type": "check_executable_satisfies_function", -# "executable_name": "python currency.py", -# "executable_arguments": "USD CAD 10", -# "output_satisfies": "tf = lambda a : a.replace('.', '').isnumeric()" -# } -# ] -# }, -# { -# "name": "password_gen", -# "project_root": "projects/password_gen_eval", -# "code_prompt": "Create a password generator CLI tool in Python that generates strong, random passwords based on user-specified criteria, such as length and character types (letters, numbers, symbols). The password generator should be a python program named passwordgenerator.py with two arguments: length, and character types. The character types argument can be one or more of the the following: l for lowercase, u for uppercase, d for digits, and s for symbols.", -# "expected_results": [ -# { -# "type": "check_executable_exits_normally", -# "executable_name": "python passwordgenerator.py", -# "executable_arguments": "10 d" -# }, -# { -# "type": "check_executable_satisfies_function", -# "executable_name": "python passwordgenerator.py", -# "executable_arguments": "10 d", -# "output_satisfies": "tf = lambda a : len(a) == 10" -# } -# ] -# } -# ] -# - - -def expect_to_assertion(expected_result): - """ - Converts an expected result dictionary to an assertion function. - - Parameters - ---------- - expected_result : dict - The dictionary containing the expected result configuration. - - Returns - ------- - function - An assertion function that takes an Assertable object and returns a boolean. - """ - - def assertion(assertable: Assertable): - return check_evaluation_component(expected_result, assertable.files) - - return assertion - - -def eval_to_task(case): - """ - Converts an evaluation case dictionary to a Task object. - - Parameters - ---------- - case : dict - The dictionary containing the evaluation case configuration. - - Returns - ------- - Task - A Task object constructed from the evaluation case dictionary. - """ - if "improve_code_prompt" in case: - prompt = case["improve_code_prompt"] - else: - prompt = case["code_prompt"] - - return Task( - name=case["name"], - initial_code=chat_to_files_dict(Path(case["code_blob"]).read_text()), - prompt=Prompt(prompt), - command=None, - assertions={ - f"{e['type']}_{i}": expect_to_assertion(e) - for i, e in enumerate(case["expected_results"]) - }, - ) - - -def load_gpteng(config: GptengConfig) -> Benchmark: - """ - Loads the GPT-Eng benchmark, which consists of a series of tasks for evaluation. - - Returns - ------- - Benchmark - A Benchmark object containing a list of Task objects for the GPT-Eng evaluation. 
- """ - return Benchmark( - name="gpte_eval", tasks=[eval_to_task(case) for case in evaluations] - ) diff --git a/gpt_engineer/benchmark/benchmarks/load.py b/gpt_engineer/benchmark/benchmarks/load.py index e065875edf..dfbf7ba1c6 100644 --- a/gpt_engineer/benchmark/benchmarks/load.py +++ b/gpt_engineer/benchmark/benchmarks/load.py @@ -11,14 +11,12 @@ """ from gpt_engineer.benchmark.bench_config import BenchConfig from gpt_engineer.benchmark.benchmarks.apps.load import load_apps -from gpt_engineer.benchmark.benchmarks.gpteng.load import load_gpteng from gpt_engineer.benchmark.benchmarks.gptme.load import load_gptme from gpt_engineer.benchmark.benchmarks.mbpp.load import load_mbpp from gpt_engineer.benchmark.types import Benchmark BENCHMARKS = { "gptme": load_gptme, - "gpteng": load_gpteng, "apps": load_apps, "mbpp": load_mbpp, } diff --git a/gpt_engineer/benchmark/default_bench_config.toml b/gpt_engineer/benchmark/default_bench_config.toml index 5da0079454..50205dc705 100644 --- a/gpt_engineer/benchmark/default_bench_config.toml +++ b/gpt_engineer/benchmark/default_bench_config.toml @@ -12,8 +12,5 @@ active = true test_len = 2 train_len = 2 -[gpteng] -active = true - [gptme] active = true From 9feedcaf8f625ffd599c84f234e3d342198cbe6b Mon Sep 17 00:00:00 2001 From: Axel Theorell Date: Sun, 5 May 2024 21:00:24 +0200 Subject: [PATCH 06/11] json export including config --- gpt_engineer/benchmark/__main__.py | 2 +- gpt_engineer/benchmark/bench_config.py | 7 +++++++ gpt_engineer/benchmark/run.py | 3 ++- 3 files changed, 10 insertions(+), 2 deletions(-) diff --git a/gpt_engineer/benchmark/__main__.py b/gpt_engineer/benchmark/__main__.py index 06aa4f000c..f2165f980a 100644 --- a/gpt_engineer/benchmark/__main__.py +++ b/gpt_engineer/benchmark/__main__.py @@ -132,7 +132,7 @@ def main( "detailed": [result.to_dict() for result in results] } if json_output is not None: - export_json_results(json_output, benchmark_results) + export_json_results(json_output, benchmark_results, config.to_dict()) if __name__ == "__main__": diff --git a/gpt_engineer/benchmark/bench_config.py b/gpt_engineer/benchmark/bench_config.py index cc754289c7..b48838d446 100644 --- a/gpt_engineer/benchmark/bench_config.py +++ b/gpt_engineer/benchmark/bench_config.py @@ -47,3 +47,10 @@ def from_dict(cls, config_dict: dict): mbpp=MbppConfig(**config_dict.get("mbpp", {})), gptme=GptmeConfig(**config_dict.get("gptme", {})), ) + + def to_dict(self): + dict_config = { + benchmark_name: {key: val for key, val in spec_config.__dict__.items()} + for benchmark_name, spec_config in self.__dict__.items() + } + return dict_config diff --git a/gpt_engineer/benchmark/run.py b/gpt_engineer/benchmark/run.py index 56ef76e537..cc106b6594 100644 --- a/gpt_engineer/benchmark/run.py +++ b/gpt_engineer/benchmark/run.py @@ -135,7 +135,7 @@ def print_results(results: list[TaskResult]): print() -def export_json_results(json_path, complete_results): +def export_json_results(json_path, complete_results, config): for results in complete_results.values(): correct_tasks = [ task_result @@ -144,5 +144,6 @@ def export_json_results(json_path, complete_results): ] fraction_correct = len(correct_tasks) / len(results["detailed"]) results["fully_solved"] = fraction_correct + complete_results["config"] = config with open(json_path, "w") as f: json.dump(complete_results, f, indent=4) From 1c371d1a9e92cab96e83b6984f48db88c8718bdb Mon Sep 17 00:00:00 2001 From: Axel Theorell Date: Sun, 5 May 2024 21:04:30 +0200 Subject: [PATCH 07/11] Set number of test input-outputs per APPs 
problem in config --- gpt_engineer/benchmark/bench_config.py | 1 + gpt_engineer/benchmark/benchmarks/apps/load.py | 5 +++-- gpt_engineer/benchmark/benchmarks/mbpp/load.py | 1 - 3 files changed, 4 insertions(+), 3 deletions(-) diff --git a/gpt_engineer/benchmark/bench_config.py b/gpt_engineer/benchmark/bench_config.py index b48838d446..af31db7149 100644 --- a/gpt_engineer/benchmark/bench_config.py +++ b/gpt_engineer/benchmark/bench_config.py @@ -11,6 +11,7 @@ class AppsConfig: test_end_index: int | None = 1 train_start_index: int | None = 0 train_end_index: int | None = 0 + examples_per_problem: int | None = 10 @dataclass diff --git a/gpt_engineer/benchmark/benchmarks/apps/load.py b/gpt_engineer/benchmark/benchmarks/apps/load.py index 0929aa447f..4d0d99729a 100644 --- a/gpt_engineer/benchmark/benchmarks/apps/load.py +++ b/gpt_engineer/benchmark/benchmarks/apps/load.py @@ -24,7 +24,6 @@ from gpt_engineer.core.prompt import Prompt DATASET_PATH = Path(__file__).parent / "dataset" -MAX_N_TEST_EXAMPLES = 10 class AppsAssertion: @@ -106,7 +105,9 @@ def load_apps(config: AppsConfig) -> Benchmark: expected=problem.outputs[i], command="python main.py" + ' "' + problem.inputs[i] + '"', ).evaluate - for i in range(min(len(problem.outputs), MAX_N_TEST_EXAMPLES)) + for i in range( + min(len(problem.outputs), config.examples_per_problem) + ) }, ) ) diff --git a/gpt_engineer/benchmark/benchmarks/mbpp/load.py b/gpt_engineer/benchmark/benchmarks/mbpp/load.py index 5f815be823..f0ed7282e4 100644 --- a/gpt_engineer/benchmark/benchmarks/mbpp/load.py +++ b/gpt_engineer/benchmark/benchmarks/mbpp/load.py @@ -24,7 +24,6 @@ from gpt_engineer.core.prompt import Prompt DATASET_PATH = Path(__file__).parent / "dataset" -MAX_N_TEST_EXAMPLES = 10 class MbppAssertion: From 6960faac0c2a36cd8aafc3e7cb79028259c9951b Mon Sep 17 00:00:00 2001 From: Axel Theorell Date: Fri, 10 May 2024 13:25:54 +0200 Subject: [PATCH 08/11] use yaml --- gpt_engineer/benchmark/__main__.py | 12 ++++++------ gpt_engineer/benchmark/run.py | 8 ++++---- 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/gpt_engineer/benchmark/__main__.py b/gpt_engineer/benchmark/__main__.py index f2165f980a..683ea00f0a 100644 --- a/gpt_engineer/benchmark/__main__.py +++ b/gpt_engineer/benchmark/__main__.py @@ -32,7 +32,7 @@ from gpt_engineer.applications.cli.main import load_env_if_needed from gpt_engineer.benchmark.bench_config import BenchConfig from gpt_engineer.benchmark.benchmarks.load import get_benchmark -from gpt_engineer.benchmark.run import export_json_results, print_results, run +from gpt_engineer.benchmark.run import export_yaml_results, print_results, run app = typer.Typer() # creates a CLI app @@ -74,7 +74,7 @@ def main( bench_config: Annotated[ str, typer.Argument(help="optional task name in benchmark") ] = os.path.join(os.path.dirname(__file__), "default_bench_config.toml"), - json_output: Annotated[ + yaml_output: Annotated[ Optional[str], typer.Option(help="print results for each task", show_default=False), ] = None, @@ -91,8 +91,8 @@ def main( The file path to the Python module that contains a function called 'default_config_agent'. bench_config : str, default=default_bench_config.toml Configuration file for choosing which benchmark problems to run. See default config for more details. - json_output: Optional[str], default=None - Pass a path to a json file to have results written to file. + yaml_output: Optional[str], default=None + Pass a path to a yaml file to have results written to file. 
verbose : bool, default=False A flag to indicate whether to print results for each task. Returns @@ -131,8 +131,8 @@ def main( benchmark_results[benchmark_name] = { "detailed": [result.to_dict() for result in results] } - if json_output is not None: - export_json_results(json_output, benchmark_results, config.to_dict()) + if yaml_output is not None: + export_yaml_results(yaml_output, benchmark_results, config.to_dict()) if __name__ == "__main__": diff --git a/gpt_engineer/benchmark/run.py b/gpt_engineer/benchmark/run.py index cc106b6594..7855e1dd55 100644 --- a/gpt_engineer/benchmark/run.py +++ b/gpt_engineer/benchmark/run.py @@ -12,7 +12,7 @@ print_results : function Prints the results of the benchmark tasks to the console. """ -import json +import yaml import time from typing import List @@ -135,7 +135,7 @@ def print_results(results: list[TaskResult]): print() -def export_json_results(json_path, complete_results, config): +def export_yaml_results(yaml_path, complete_results, config): for results in complete_results.values(): correct_tasks = [ task_result @@ -145,5 +145,5 @@ def export_json_results(json_path, complete_results, config): fraction_correct = len(correct_tasks) / len(results["detailed"]) results["fully_solved"] = fraction_correct complete_results["config"] = config - with open(json_path, "w") as f: - json.dump(complete_results, f, indent=4) + with open(yaml_path, "w") as f: + yaml.dump(complete_results, f, indent=4) From 230332d1f04cfc30270d8c63cc9ce8ed8e86a8c4 Mon Sep 17 00:00:00 2001 From: Axel Theorell Date: Sat, 11 May 2024 12:46:00 +0200 Subject: [PATCH 09/11] FIxed write yaml to int problem --- gpt_engineer/benchmark/bench_config.py | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/gpt_engineer/benchmark/bench_config.py b/gpt_engineer/benchmark/bench_config.py index af31db7149..d9e125f424 100644 --- a/gpt_engineer/benchmark/bench_config.py +++ b/gpt_engineer/benchmark/bench_config.py @@ -1,6 +1,6 @@ from dataclasses import dataclass, field from pathlib import Path - +from tomlkit.items import Integer from gpt_engineer.core.project_config import read_config @@ -49,9 +49,19 @@ def from_dict(cls, config_dict: dict): gptme=GptmeConfig(**config_dict.get("gptme", {})), ) + @staticmethod + def recursive_resolve(data_dict): + for key, value in data_dict.items(): + if isinstance(value, Integer): + data_dict[key] = int(value) + elif isinstance(value, dict): + BenchConfig.recursive_resolve(value) + def to_dict(self): dict_config = { benchmark_name: {key: val for key, val in spec_config.__dict__.items()} for benchmark_name, spec_config in self.__dict__.items() } + BenchConfig.recursive_resolve(dict_config) + return dict_config From 67c0a2b7794a69e881128586edfab1be601c581b Mon Sep 17 00:00:00 2001 From: Axel Theorell Date: Sun, 12 May 2024 12:16:22 +0200 Subject: [PATCH 10/11] Linting --- gpt_engineer/benchmark/bench_config.py | 2 ++ gpt_engineer/benchmark/run.py | 3 ++- 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/gpt_engineer/benchmark/bench_config.py b/gpt_engineer/benchmark/bench_config.py index d9e125f424..526e997bb6 100644 --- a/gpt_engineer/benchmark/bench_config.py +++ b/gpt_engineer/benchmark/bench_config.py @@ -1,6 +1,8 @@ from dataclasses import dataclass, field from pathlib import Path + from tomlkit.items import Integer + from gpt_engineer.core.project_config import read_config diff --git a/gpt_engineer/benchmark/run.py b/gpt_engineer/benchmark/run.py index 7855e1dd55..de5b979d4b 100644 --- a/gpt_engineer/benchmark/run.py +++ 
b/gpt_engineer/benchmark/run.py @@ -12,11 +12,12 @@ print_results : function Prints the results of the benchmark tasks to the console. """ -import yaml import time from typing import List +import yaml + from gpt_engineer.benchmark.types import Assertable, Benchmark, TaskResult from gpt_engineer.core.base_agent import BaseAgent from gpt_engineer.core.default.disk_execution_env import DiskExecutionEnv From 5aec1eb8ced5c392a7690b1a6abbe1a05c311fc7 Mon Sep 17 00:00:00 2001 From: Axel Theorell Date: Sun, 12 May 2024 12:28:35 +0200 Subject: [PATCH 11/11] removed gpteng test, since gpteng benchmark is removed --- tests/benchmark/test_BenchConfig.py | 9 --------- 1 file changed, 9 deletions(-) diff --git a/tests/benchmark/test_BenchConfig.py b/tests/benchmark/test_BenchConfig.py index 87619ae665..79dafd47f3 100644 --- a/tests/benchmark/test_BenchConfig.py +++ b/tests/benchmark/test_BenchConfig.py @@ -5,7 +5,6 @@ from gpt_engineer.benchmark.bench_config import ( AppsConfig, BenchConfig, - GptengConfig, GptmeConfig, MbppConfig, ) @@ -18,7 +17,6 @@ def test_default_values(self): assert isinstance(config.apps, AppsConfig) assert isinstance(config.mbpp, MbppConfig) assert isinstance(config.gptme, GptmeConfig) - assert isinstance(config.gpteng, GptengConfig) assert config.apps.active is True assert config.apps.test_start_index == 0 assert config.apps.test_end_index == 1 @@ -28,7 +26,6 @@ def test_default_values(self): assert config.mbpp.test_len == 1 assert config.mbpp.train_len == 0 assert config.gptme.active is True - assert config.gpteng.active is True # Creating a BenchConfig object with specific values should return an instance of BenchConfig with the specified attributes set to the specified values. def test_specific_values(self): @@ -42,12 +39,10 @@ def test_specific_values(self): ), mbpp=MbppConfig(active=False, test_len=5, train_len=6), gptme=GptmeConfig(active=False), - gpteng=GptengConfig(active=False), ) assert isinstance(config.apps, AppsConfig) assert isinstance(config.mbpp, MbppConfig) assert isinstance(config.gptme, GptmeConfig) - assert isinstance(config.gpteng, GptengConfig) assert config.apps.active is False assert config.apps.test_start_index == 1 assert config.apps.test_end_index == 2 @@ -57,7 +52,6 @@ def test_specific_values(self): assert config.mbpp.test_len == 5 assert config.mbpp.train_len == 6 assert config.gptme.active is False - assert config.gpteng.active is False # Calling the from_dict method with a valid dictionary should return an instance of BenchConfig with attributes set according to the values in the dictionary. def test_from_dict_valid_dict(self): @@ -71,13 +65,11 @@ def test_from_dict_valid_dict(self): }, "mbpp": {"active": False, "test_len": 5, "train_len": 6}, "gptme": {"active": False}, - "gpteng": {"active": False}, } config = BenchConfig.from_dict(config_dict) assert isinstance(config.apps, AppsConfig) assert isinstance(config.mbpp, MbppConfig) assert isinstance(config.gptme, GptmeConfig) - assert isinstance(config.gpteng, GptengConfig) assert config.apps.active is False assert config.apps.test_start_index == 1 assert config.apps.test_end_index == 2 @@ -87,7 +79,6 @@ def test_from_dict_valid_dict(self): assert config.mbpp.test_len == 5 assert config.mbpp.train_len == 6 assert config.gptme.active is False - assert config.gpteng.active is False # Calling the from_toml method with an invalid path to a TOML file should raise an appropriate exception. def test_from_toml_invalid_path(self):
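For reference, the sketch below mirrors the `export_yaml_results` helper introduced in `gpt_engineer/benchmark/run.py` by this series and shows the resulting YAML layout on toy data. The benchmark name, task entries and config values are invented for illustration, and PyYAML is assumed to be available; in the real CLI this is wired up through the `yaml_output` option added to `gpt_engineer/benchmark/__main__.py`.

```python
import yaml


# Same logic as export_yaml_results in gpt_engineer/benchmark/run.py:
# per benchmark, record the fraction of fully solved tasks, attach the
# benchmark config, and dump everything to a YAML file.
def export_yaml_results(yaml_path, complete_results, config):
    for results in complete_results.values():
        correct = [t for t in results["detailed"] if t["solved"] == 1.0]
        results["fully_solved"] = len(correct) / len(results["detailed"])
    complete_results["config"] = config
    with open(yaml_path, "w") as f:
        yaml.dump(complete_results, f, indent=4)


# Toy data: two MBPP-style task results, one solved and one not.
export_yaml_results(
    "results.yaml",
    {"mbpp": {"detailed": [{"name": "task_1", "solved": 1.0},
                           {"name": "task_2", "solved": 0.0}]}},
    {"mbpp": {"active": True, "test_len": 2, "train_len": 0}},
)
# results.yaml then contains mbpp.fully_solved: 0.5 alongside the detailed
# task list and the config block.
```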