-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathdebate_engine.py
More file actions
123 lines (91 loc) · 3.16 KB
/
debate_engine.py
File metadata and controls
123 lines (91 loc) · 3.16 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
class DebateEngine:
    """
    Multi-agent debate system:
    - agents produce independent outputs
    - optional cross-critique
    - judge scoring via LLM
    - returns ranked reasoning
    """

    def __init__(self, llm_router):
        # Router used only by the judge in score(); agents carry their own models.
        self.llm_router = llm_router

    # =====================================================
    # MAIN DEBATE LOOP
    # =====================================================
    def run(self, agents, prompt, task_type="reasoning"):
        """Run one debate round-trip: independent generation, then cross-critique.

        Args:
            agents: iterable of objects exposing ``name`` and
                ``think(prompt, task_type) -> (model, response)``.
            prompt: task text given verbatim to every agent.
            task_type: forwarded to each agent's round-1 ``think`` call.

        Returns:
            dict with keys ``outputs`` (agent name -> {"model", "response"})
            and ``critiques`` (agent name -> {"model", "critique"}).
        """
        outputs = {}
        # ---------------------------------------------
        # ROUND 1: independent generation
        # ---------------------------------------------
        for agent in agents:
            model, response = agent.think(prompt, task_type)
            outputs[agent.name] = {
                "model": model,
                "response": response
            }
        # ---------------------------------------------
        # ROUND 2: optional critique pass
        # ---------------------------------------------
        critiques = self._critique_round(agents, outputs, prompt)
        return {
            "outputs": outputs,
            "critiques": critiques
        }

    # =====================================================
    # CRITIQUE PHASE (agents evaluate each other)
    # =====================================================
    def _critique_round(self, agents, outputs, prompt):
        """Have each agent critique every OTHER agent's round-1 answer.

        Returns a dict: agent name -> {"model", "critique"}.
        """
        critiques = {}
        for agent in agents:
            # Concatenate everyone else's answers; an agent never sees its own.
            others = "\n\n".join(
                f"{name}: {data['response']}"
                for name, data in outputs.items()
                if name != agent.name
            )
            critique_prompt = f"""
You are {agent.name}.
Original task:
{prompt}
Other agents' answers:
{others}
Critique the reasoning quality.
Be strict, point out errors, contradictions, or improvements.
Return concise critique only.
"""
            model, critique = agent.think(critique_prompt, task_type="critique")
            critiques[agent.name] = {
                "model": model,
                "critique": critique
            }
        return critiques

    # =====================================================
    # SCORING (LLM JUDGE)
    # =====================================================
    def score(self, debate_result, judge_model="gpt"):
        """Score each agent's answer 1-10 with an LLM judge.

        Args:
            debate_result: the dict returned by :meth:`run`.
            judge_model: model key passed to ``self.llm_router.call``.

        Returns:
            dict: agent name -> float score (0.0 when the judge's reply
            contains no number at all).

        The judge is asked for a bare number, but real replies often look
        like ``"Score: 8/10"``; we therefore extract the first numeric
        token instead of requiring the whole reply to be a float, and we
        catch nothing broadly (the original bare ``except:`` also swallowed
        KeyboardInterrupt/SystemExit).
        """
        import re  # local import: no file-level import block is in scope here

        scores = {}
        outputs = debate_result["outputs"]
        critiques = debate_result["critiques"]
        for name, data in outputs.items():
            critique = critiques.get(name, {}).get("critique", "")
            eval_prompt = f"""
You are a strict reasoning evaluator.
Task quality scoring (1-10):
ORIGINAL ANSWER:
{data['response']}
CRITIQUE:
{critique}
Score ONLY a number (1-10).
"""
            raw_score = self.llm_router.call(judge_model, eval_prompt)
            # First numeric token in the reply (handles "7", " 7 ", "8/10",
            # "Score: 9.5"); unparseable replies score 0.0 as before.
            match = re.search(r"-?\d+(?:\.\d+)?", str(raw_score))
            scores[name] = float(match.group()) if match else 0.0
        return scores

    # =====================================================
    # WINNER SELECTION
    # =====================================================
    def winner(self, scores):
        """Return the highest-scoring agent name, or None for empty scores.

        Ties resolve to the first-inserted key, per ``max`` over dict order.
        """
        if not scores:
            return None
        return max(scores, key=scores.get)