Bench export #1147

Merged: 11 commits, May 12, 2024
4 changes: 4 additions & 0 deletions .gitignore
@@ -92,3 +92,7 @@ webapp/.next/
# locally saved datasets
gpt_engineer/benchmark/benchmarks/apps/dataset
gpt_engineer/benchmark/benchmarks/mbpp/dataset

gpt_engineer/benchmark/minimal_bench_config.toml

test.json
21 changes: 15 additions & 6 deletions gpt_engineer/benchmark/__main__.py
@@ -32,7 +32,7 @@
from gpt_engineer.applications.cli.main import load_env_if_needed
from gpt_engineer.benchmark.bench_config import BenchConfig
from gpt_engineer.benchmark.benchmarks.load import get_benchmark
from gpt_engineer.benchmark.run import print_results, run
from gpt_engineer.benchmark.run import export_yaml_results, print_results, run


app = typer.Typer() # creates a CLI app

@@ -72,8 +72,12 @@
),
],
bench_config: Annotated[
Optional[str], typer.Argument(help="optional task name in benchmark")
str, typer.Argument(help="optional task name in benchmark")
] = os.path.join(os.path.dirname(__file__), "default_bench_config.toml"),
yaml_output: Annotated[
Optional[str],
typer.Option(help="print results for each task", show_default=False),
] = None,
verbose: Annotated[
bool, typer.Option(help="print results for each task", show_default=False)
] = False,
@@ -85,13 +89,12 @@
----------
path_to_agent : str
The file path to the Python module that contains a function called 'default_config_agent'.
benchmarks : str
A comma-separated string of benchmark names to run.
bench_config : Optional[str], default=default_bench_config.toml
bench_config : str, default=default_bench_config.toml
Configuration file for choosing which benchmark problems to run. See default config for more details.
yaml_output : Optional[str], default=None
Pass a path to a yaml file to have results written to file.
verbose : bool, default=False
A flag to indicate whether to print results for each task.

Returns
-------
None
@@ -101,6 +104,7 @@
config = BenchConfig.from_toml(bench_config)
print("using config file: " + bench_config)
benchmarks = list()
benchmark_results = dict()

for specific_config_name in vars(config):
specific_config = getattr(config, specific_config_name)
if hasattr(specific_config, "active"):
@@ -124,6 +128,11 @@
)
print_results(results)
print()
benchmark_results[benchmark_name] = {
"detailed": [result.to_dict() for result in results]
}
if yaml_output is not None:
export_yaml_results(yaml_output, benchmark_results, config.to_dict())



if __name__ == "__main__":
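The new `export_yaml_results` helper is imported from `gpt_engineer.benchmark.run`, but the `run.py` side of this PR is not shown here. Below is a minimal sketch of what such a helper could look like, inferred only from the call site `export_yaml_results(yaml_output, benchmark_results, config.to_dict())` and assuming PyYAML as the serializer; the internals are illustrative, not the PR's actual implementation.

```python
import yaml


def export_yaml_results(yaml_path: str, complete_results: dict, config: dict) -> None:
    """Write the collected benchmark results and the resolved config to a YAML file.

    Sketch only: the signature is inferred from the call site in __main__.py;
    the real helper lives in gpt_engineer/benchmark/run.py.
    """
    complete_results["config"] = config  # keep the run's config next to its results
    with open(yaml_path, "w") as f:
        yaml.safe_dump(complete_results, f, sort_keys=False)
```

With typer's default option naming, the new option would presumably be passed as `--yaml-output results.yaml` alongside the agent path and benchmark config.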
27 changes: 20 additions & 7 deletions gpt_engineer/benchmark/bench_config.py
@@ -1,6 +1,8 @@
from dataclasses import dataclass, field
from pathlib import Path

from tomlkit.items import Integer

from gpt_engineer.core.project_config import read_config


@@ -11,6 +13,7 @@
test_end_index: int | None = 1
train_start_index: int | None = 0
train_end_index: int | None = 0
examples_per_problem: int | None = 10


@dataclass
@@ -25,19 +28,13 @@
active: bool | None = True


@dataclass
class GptengConfig:
active: bool | None = True


@dataclass
class BenchConfig:
"""Configuration for the GPT Engineer CLI and gptengineer.app via `gpt-engineer.toml`."""

apps: AppsConfig = field(default_factory=AppsConfig)
mbpp: MbppConfig = field(default_factory=MbppConfig)
gptme: GptmeConfig = field(default_factory=GptmeConfig)
gpteng: GptengConfig = field(default_factory=GptengConfig)

@classmethod
def from_toml(cls, config_file: Path | str):
@@ -52,5 +49,21 @@
apps=AppsConfig(**config_dict.get("apps", {})),
mbpp=MbppConfig(**config_dict.get("mbpp", {})),
gptme=GptmeConfig(**config_dict.get("gptme", {})),
gpteng=GptengConfig(**config_dict.get("gpteng", {})),
)

@staticmethod
def recursive_resolve(data_dict):
for key, value in data_dict.items():
if isinstance(value, Integer):
data_dict[key] = int(value)
elif isinstance(value, dict):
BenchConfig.recursive_resolve(value)

def to_dict(self):
dict_config = {
benchmark_name: {key: val for key, val in spec_config.__dict__.items()}
for benchmark_name, spec_config in self.__dict__.items()
}
BenchConfig.recursive_resolve(dict_config)


return dict_config

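The new `recursive_resolve` / `to_dict` pair exists because `read_config` is tomlkit-based, so values loaded from the benchmark config arrive as tomlkit wrapper types (hence the `tomlkit.items.Integer` import) rather than plain built-ins, and those wrappers do not serialize cleanly. A small illustration of the issue, under the assumption that PyYAML is what ultimately writes the exported results:

```python
import tomlkit
import yaml
from tomlkit.items import Integer

# tomlkit parses scalars into its own wrapper types; Integer subclasses int.
doc = tomlkit.parse("examples_per_problem = 10")
value = doc["examples_per_problem"]
print(isinstance(value, Integer), isinstance(value, int))  # True True

# PyYAML's safe_dump only registers representers for the plain built-in types,
# so dumping the wrapper directly generally fails until it is resolved to int,
# which is what BenchConfig.recursive_resolve does for the whole config dict.
try:
    yaml.safe_dump({"examples_per_problem": value})
except yaml.representer.RepresenterError:
    print("safe_dump cannot represent tomlkit.items.Integer")

print(yaml.safe_dump({"examples_per_problem": int(value)}), end="")
```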
5 changes: 3 additions & 2 deletions gpt_engineer/benchmark/benchmarks/apps/load.py
@@ -24,7 +24,6 @@
from gpt_engineer.core.prompt import Prompt

DATASET_PATH = Path(__file__).parent / "dataset"
MAX_N_TEST_EXAMPLES = 10


class AppsAssertion:
@@ -106,7 +105,9 @@ def load_apps(config: AppsConfig) -> Benchmark:
expected=problem.outputs[i],
command="python main.py" + ' "' + problem.inputs[i] + '"',
).evaluate
for i in range(min(len(problem.outputs), MAX_N_TEST_EXAMPLES))
for i in range(
min(len(problem.outputs), config.examples_per_problem)
)
},
)
)
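With the module-level `MAX_N_TEST_EXAMPLES = 10` constant removed, the cap on evaluated I/O examples per APPS problem now comes from the new `examples_per_problem` field on `AppsConfig`. A hedged usage sketch (it assumes the APPS dataset has already been downloaded to the local dataset path and leaves the other `AppsConfig` fields at their defaults):

```python
from gpt_engineer.benchmark.bench_config import AppsConfig
from gpt_engineer.benchmark.benchmarks.apps.load import load_apps

# Evaluate at most 5 I/O examples per APPS problem instead of the previous
# hard-coded cap of 10.
config = AppsConfig(active=True, examples_per_problem=5)
benchmark = load_apps(config)  # requires the locally saved APPS dataset
```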
8 changes: 0 additions & 8 deletions gpt_engineer/benchmark/benchmarks/gpteng/__init__.py

This file was deleted.

188 changes: 0 additions & 188 deletions gpt_engineer/benchmark/benchmarks/gpteng/eval_tools.py

This file was deleted.

