mcode-benchmark/generate_prompts.py at main · Modelcode-ai/mcode-benchmark · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
#!/usr/bin/env python3
"""
Generate prompt.md files for MCode benchmarks.
Reads config.toml to determine which benchmarks to process.
Supports both 'server' and 'cli' benchmark types.

Templates are stored in templates/<version>/ directory as Jinja2 files:
- prompt_cli.md.j2    - Template for CLI benchmark prompts
- prompt_server.md.j2 - Template for server benchmark prompts

Template versions:
- v1: Original minimal template (used by initial experiments)
- v2: Improved template with drop-in replacement emphasis
"""

DEFAULT_TEMPLATE_VERSION = "v1"  # Default to v1 for backward compatibility
import yaml
import tomllib
import argparse
import sys
from pathlib import Path
from typing import Optional
from jinja2 import Environment, FileSystemLoader, TemplateNotFound


def load_config(config_path: Path) -> dict:
    """Load configuration from TOML file."""
    if not config_path.exists():
        return {}
    with open(config_path, 'rb') as f:
        return tomllib.load(f)


def load_benchmark_config(benchmark_dir: Path) -> Optional[dict]:
    """Load ``benchmark.yml`` (and the ID from ``metadata.yml``) for one benchmark.

    Args:
        benchmark_dir: Directory expected to contain ``benchmark.yml`` and,
            optionally, ``metadata.yml``.

    Returns:
        The parsed ``benchmark.yml`` mapping with two keys added: ``_id``
        (the ``id`` value from ``metadata.yml``, or ``"000"`` when that file
        is missing, is not a mapping, or has no ``id``) and ``_dir``
        (``benchmark_dir`` itself). Returns ``None`` when the benchmark
        should be skipped: ``benchmark.yml`` is missing, does not contain a
        mapping, or has no ``benchmark.type`` field.
    """
    benchmark_file = benchmark_dir / "benchmark.yml"
    metadata_file = benchmark_dir / "metadata.yml"

    if not benchmark_file.exists():
        return None

    with open(benchmark_file, encoding="utf-8") as f:
        config = yaml.safe_load(f)

    # safe_load yields None for an empty document and a scalar/list for
    # non-mapping YAML; neither can describe a benchmark.
    if not isinstance(config, dict):
        return None

    # Skip benchmarks missing required fields. The section must itself be a
    # mapping: a plain `in` test on a string would be a substring match.
    benchmark_section = config.get('benchmark')
    if not isinstance(benchmark_section, dict) or 'type' not in benchmark_section:
        return None

    # Load ID from metadata.yml, keeping the default when it is unusable.
    benchmark_id = "000"
    if metadata_file.exists():
        with open(metadata_file, encoding="utf-8") as f:
            metadata = yaml.safe_load(f)
        if isinstance(metadata, dict):
            benchmark_id = metadata.get("id", "000")

    config['_id'] = benchmark_id
    config['_dir'] = benchmark_dir
    return config


def discover_benchmarks(dataset_dir: Path) -> dict:
    """Discover all benchmarks in the dataset directory.

    Args:
        dataset_dir: Directory whose immediate sub-directories are scanned
            for a ``benchmark.yml`` file.

    Returns:
        Mapping of benchmark ID (the ``_id`` set by ``load_benchmark_config``)
        to the loaded benchmark configuration. Empty when ``dataset_dir``
        does not exist, is not a directory, or holds no loadable benchmark.
    """
    # A missing dataset directory just means there is nothing to discover;
    # callers already handle the empty result.
    if not dataset_dir.is_dir():
        return {}

    benchmarks = {}
    # Sorted so the result (and which entry wins when two benchmarks share
    # an ID -- the later one overwrites) does not depend on filesystem order.
    for item in sorted(dataset_dir.iterdir()):
        if not (item.is_dir() and (item / "benchmark.yml").exists()):
            continue
        config = load_benchmark_config(item)
        if config:
            benchmarks[config['_id']] = config
    return benchmarks


# Memo of Jinja2 environments, keyed by template version name
_jinja_envs = {}


def get_jinja_env(version: str = DEFAULT_TEMPLATE_VERSION) -> Environment:
    """Return the Jinja2 environment for *version*, creating it on first use.

    Templates are looked up in ``templates/<version>`` next to this file.
    Environments are kept in ``_jinja_envs`` so each version is built once.

    Raises:
        ValueError: if the ``templates/<version>`` directory does not exist.
    """
    cached = _jinja_envs.get(version)
    if cached is not None:
        return cached

    version_dir = Path(__file__).parent / "templates" / version
    if not version_dir.exists():
        raise ValueError(f"Template version '{version}' not found at {version_dir}")

    environment = Environment(
        loader=FileSystemLoader(version_dir),
        trim_blocks=True,
        lstrip_blocks=True,
    )
    _jinja_envs[version] = environment
    return environment


def generate_prompt(config: dict, template_version: str = DEFAULT_TEMPLATE_VERSION) -> str:
    """Render the prompt.md text for one benchmark configuration.

    The template ``prompt_<type>.md.j2`` is taken from the Jinja2 environment
    of *template_version*, where ``<type>`` is ``config['benchmark']['type']``.

    Raises:
        KeyError: if a required field (benchmark type, source/destination
            language or description) is absent from *config*.
        ValueError: if no template exists for the benchmark type.
    """
    kind = config['benchmark']['type']

    origin = config['source']
    origin_language = origin['language'].capitalize()
    origin_summary = origin['description']

    target = config['destination']
    target_language = target['language'].capitalize()
    target_summary = target['description']

    # Optional install/build commands become a bullet list, in that order.
    labelled_commands = (
        ("Install dependencies", target.get('install_cmd')),
        ("Build", target.get('build_cmd')),
    )
    bullets = [f"- {label}: `{cmd}`" for label, cmd in labelled_commands if cmd]
    build_instructions = "\n".join(bullets) if bullets else "No build steps required."

    # Pick the template that matches the benchmark type.
    jinja_env = get_jinja_env(template_version)
    template_file = f"prompt_{kind}.md.j2"
    try:
        template = jinja_env.get_template(template_file)
    except TemplateNotFound:
        raise ValueError(f"Unknown benchmark type: {kind} (template {template_file} not found)")

    context = {
        'source_lang': origin_language,
        'source_desc': origin_summary,
        'target_lang': target_language,
        'target_desc': target_summary,
        'build_instructions': build_instructions,
        'run_cmd': target.get('run_cmd', ''),
    }
    # Server prompts additionally need the name of the port environment variable.
    if kind == 'server':
        context['port_env_var'] = target.get('port_env_var', 'SERVER_PORT')

    return template.render(**context)


def main():
    """Command-line entry point: write ``workspace/prompt.md`` for the selected benchmarks.

    Benchmarks are discovered under ``dataset/`` next to this file and then
    narrowed down as follows:

    1. ``--benchmark ID``: only that benchmark (exit status 1 if unknown).
    2. Otherwise, unless ``--all`` is given, the ``benchmarks.ids`` list from
       the config file, when non-empty (exit status 1 if nothing matches).
    3. Otherwise every discovered benchmark.

    Exits with status 1 when no benchmarks are discovered at all. An error
    while generating one benchmark's prompt is printed and the remaining
    benchmarks are still processed.
    """
    parser = argparse.ArgumentParser(description='Generate prompt.md files for MCode benchmarks')
    parser.add_argument('--config', type=str, default='config.toml',
                        help='Path to config file (default: config.toml)')
    parser.add_argument('--all', action='store_true',
                        help='Generate prompts for all benchmarks, ignoring config.toml filter')
    parser.add_argument('--benchmark', type=str,
                        help='Generate prompt for a specific benchmark ID only')
    parser.add_argument('--template-version', type=str, default=DEFAULT_TEMPLATE_VERSION,
                        help=f'Template version to use (default: {DEFAULT_TEMPLATE_VERSION})')
    args = parser.parse_args()

    # Setup paths
    root_dir = Path(__file__).parent
    dataset_dir = root_dir / "dataset"
    config_path = root_dir / args.config

    # Load config
    settings = load_config(config_path)
    benchmark_ids = settings.get('benchmarks', {}).get('ids', [])

    # Discover benchmarks once; the full set is reused for error messages.
    all_benchmarks = discover_benchmarks(dataset_dir)

    if not all_benchmarks:
        print("No benchmarks found in dataset/")
        sys.exit(1)

    # Filter benchmarks
    if args.benchmark:
        # Single benchmark by ID
        if args.benchmark not in all_benchmarks:
            print(f"Benchmark ID '{args.benchmark}' not found")
            print(f"Available IDs: {list(all_benchmarks.keys())}")
            sys.exit(1)
        selected = {args.benchmark: all_benchmarks[args.benchmark]}
    elif not args.all and benchmark_ids:
        # Filter by config.toml
        selected = {k: v for k, v in all_benchmarks.items() if k in benchmark_ids}
        if not selected:
            print(f"No benchmarks found matching IDs in config: {benchmark_ids}")
            print(f"Available IDs: {list(all_benchmarks.keys())}")
            sys.exit(1)
    else:
        selected = all_benchmarks

    # Sort by ID
    sorted_benchmarks = sorted(selected.items(), key=lambda x: x[0])

    print(f"Generating prompts for {len(sorted_benchmarks)} benchmark(s)...")
    print(f"Using template version: {args.template_version}")
    print("=" * 60)

    for benchmark_id, bench_config in sorted_benchmarks:
        benchmark_meta = bench_config['benchmark']
        # The name is only used for display, so a benchmark without one must
        # not abort the whole run.
        benchmark_name = benchmark_meta.get('name', '<unnamed>')
        benchmark_type = benchmark_meta['type']
        benchmark_dir = bench_config['_dir']

        print(f"[{benchmark_id}] {benchmark_name} ({benchmark_type})")

        try:
            prompt = generate_prompt(bench_config, args.template_version)
            prompt_path = benchmark_dir / "workspace" / "prompt.md"

            # Ensure workspace directory exists
            prompt_path.parent.mkdir(parents=True, exist_ok=True)

            # Explicit UTF-8 so the output does not depend on the platform's
            # default encoding.
            prompt_path.write_text(prompt, encoding="utf-8")

            print(f"  -> Generated {prompt_path.relative_to(root_dir)}")

        except KeyError as e:
            print(f"  -> ERROR: Missing required field {e}")
        except Exception as e:
            # Best effort: report the failure and move on to the next benchmark.
            print(f"  -> ERROR: {e}")

    print("=" * 60)
    print("Done.")


# Run the CLI only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()