# Source: nopcoder/execution_fitness.py (dotpipe/nopcoder on GitHub, branch main)
import os
import subprocess
import sys
import tempfile
import time
import uuid


class ExecutionFitness:
    """
    Runs generated Python code in a child process and scores the outcome.

    Each run writes the code to a temporary file, executes it with a
    wall-clock timeout, and captures stdout, stderr, exit code and duration.

    NOTE: isolation here is only "separate process + timeout". Nothing in
    this class restricts filesystem, network or memory use, so it must not
    be treated as a security sandbox for untrusted code.
    """

    def __init__(self, timeout=2):
        """
        Args:
            timeout: Maximum number of seconds a single run may take before
                the child process is killed and the run is reported as a
                timeout.
        """
        self.timeout = timeout

    # =====================================================
    # SAFE PYTHON EXECUTION
    # =====================================================
    def run_python(self, code, test_input=""):
        """
        Execute ``code`` as a standalone script and capture its result.

        Args:
            code: Python source text to run.
            test_input: Data fed to the script's stdin. ``str`` is encoded
                with the default (UTF-8) codec, ``bytes`` are passed through
                unchanged and ``None`` is treated as empty input.

        Returns:
            A dict with keys:
                "stdout" / "stderr": captured output as text (undecodable
                    bytes are replaced rather than raising),
                "exit_code": the child's return code, or -1 on timeout,
                "time": elapsed seconds, or ``self.timeout`` on timeout,
                "success": True only when the exit code is 0.
        """
        if test_input is None:
            test_input = ""
        if isinstance(test_input, bytes):
            stdin_data = test_input
        else:
            stdin_data = test_input.encode()

        # mkstemp picks the platform's temp directory and creates the file
        # atomically, instead of assuming a writable "/tmp".
        fd, file_path = tempfile.mkstemp(prefix="opcode_", suffix=".py")

        try:
            # Python reads source files as UTF-8 by default, so write the
            # script as UTF-8 regardless of the locale encoding.
            with os.fdopen(fd, "w", encoding="utf-8") as f:
                f.write(code)

            # Prefer the interpreter running this process; "python3" may be
            # absent from PATH or point at a different installation.
            interpreter = sys.executable or "python3"

            start = time.perf_counter()

            try:
                result = subprocess.run(
                    [interpreter, file_path],
                    input=stdin_data,
                    stdout=subprocess.PIPE,
                    stderr=subprocess.PIPE,
                    timeout=self.timeout,
                )
            except subprocess.TimeoutExpired:
                return {
                    "stdout": "",
                    "stderr": "TIMEOUT",
                    "exit_code": -1,
                    "time": self.timeout,
                    "success": False,
                }

            duration = time.perf_counter() - start

            return {
                # Generated programs may emit arbitrary bytes; never let
                # decoding their output raise in the evaluator.
                "stdout": result.stdout.decode(errors="replace"),
                "stderr": result.stderr.decode(errors="replace"),
                "exit_code": result.returncode,
                "time": duration,
                "success": result.returncode == 0,
            }

        finally:
            # Always clean up the script, including when writing it failed.
            try:
                os.remove(file_path)
            except FileNotFoundError:
                pass

    # =====================================================
    # FITNESS SCORE
    # =====================================================
    def score(self, execution_results, expected_output=None):
        """
        Fold a list of run results into a single fitness value.

        Per result: +2.0 when "success" is truthy, otherwise -3.0; a further
        -1.0 when "stderr" is non-empty; and -0.3 per second of "time".

        Args:
            execution_results: Iterable of dicts as returned by
                ``run_python`` (keys "success", "stderr" and "time" are read).
            expected_output: Accepted for interface compatibility; it is
                currently not used in the computation.

        Returns:
            The accumulated score as a float (0.0 for no results).
        """
        score = 0.0

        for res in execution_results:
            score += 2.0 if res["success"] else -3.0

            if res["stderr"]:
                score -= 1.0

            score -= res["time"] * 0.3

        return score

    def run_with_tests(self, code, tests):
        """
        Run ``code`` once per entry in ``tests``, using each entry as stdin.

        Returns:
            A list of ``run_python`` result dicts, in the order of ``tests``.
        """
        return [self.run_python(code, test_input=test) for test in tests]