From 7f7fb2512af2682f05c4efadbabbb5e63572d608 Mon Sep 17 00:00:00 2001
From: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com>
Date: Thu, 20 Mar 2025 06:54:40 +0000
Subject: [PATCH 01/13] Update judge command to support custom judge models

Co-Authored-By: liren@liquid.ai <liren@liquid.ai>
---
 .github/workflows/run-eval.yaml |  4 ++-
 bin/api/entrypoint.sh           | 20 ++++++++++--
 bin/api/run_docker_eval.sh      | 49 +++++++++++++++++++++++++----
 bin/api/run_openai_judge.sh     | 56 ++++++++++++++++++++++++++++-----
 llm_judge/common.py             | 52 +++++++++++++++---------------
 llm_judge/gen_judgment.py       | 16 ++++++++--
 6 files changed, 150 insertions(+), 47 deletions(-)

diff --git a/.github/workflows/run-eval.yaml b/.github/workflows/run-eval.yaml
index 3014b44..e7957c2 100644
--- a/.github/workflows/run-eval.yaml
+++ b/.github/workflows/run-eval.yaml
@@ -75,7 +75,9 @@ jobs:
         run: |
           bin/api/run_openai_judge.sh \
             --model-name "$MODEL_NAME" \
-            --openai-api-key "$OPENAI_API_KEY" \
+            --judge-model-name "gpt-4" \
+            --judge-model-url "https://api.openai.com/v1" \
+            --judge-model-api-key "$OPENAI_API_KEY" \
             --parallel 3
 
       - name: Process Judge Results
diff --git a/bin/api/entrypoint.sh b/bin/api/entrypoint.sh
index 05e5026..562069a 100755
--- a/bin/api/entrypoint.sh
+++ b/bin/api/entrypoint.sh
@@ -47,6 +47,18 @@ elif [[ "$MODE" == "judge" ]]; then
     # Extract arguments for judge mode
     PARALLEL="5"
     CI="false"
+    JUDGE_MODEL_NAME=${JUDGE_MODEL_NAME:-"gpt-4"}
+    JUDGE_MODEL_URL=${JUDGE_MODEL_URL:-""}
+    JUDGE_MODEL_API_KEY=${JUDGE_MODEL_API_KEY:-""}
+    
+    # If no judge model API key is provided, use OpenAI API key
+    if [[ -z "$JUDGE_MODEL_API_KEY" && -n "$OPENAI_API_KEY" ]]; then
+        JUDGE_MODEL_API_KEY="$OPENAI_API_KEY"
+        # Set default OpenAI URL if none provided
+        if [[ -z "$JUDGE_MODEL_URL" ]]; then
+            JUDGE_MODEL_URL="https://api.openai.com/v1"
+        fi
+    fi
 
     while [[ $# -gt 0 ]]; do
         case $1 in
@@ -67,14 +79,18 @@ elif [[ "$MODE" == "judge" ]]; then
     # Generate judgments
     python llm_judge/gen_judgment.py \
         --model-list "$MODEL_NAME" \
+        --judge-model "$JUDGE_MODEL_NAME" \
+        --judge-model-url "$JUDGE_MODEL_URL" \
+        --judge-model-api-key "$JUDGE_MODEL_API_KEY" \
         --parallel "$PARALLEL" \
         --bench-name japanese_mt_bench
 
     # Show results
     python llm_judge/show_result.py \
         --model-list "$MODEL_NAME" \
+        --judge-model "$JUDGE_MODEL_NAME" \
         --ci "$CI" \
         --bench-name japanese_mt_bench \
-        --input-file llm_judge/data/japanese_mt_bench/model_judgment/gpt-4_$MODEL_NAME.jsonl \
-        --output llm_judge/data/japanese_mt_bench/gpt4-score-$MODEL_NAME.json
+        --input-file "llm_judge/data/japanese_mt_bench/model_judgment/${JUDGE_MODEL_NAME}_$MODEL_NAME.jsonl" \
+        --output "llm_judge/data/japanese_mt_bench/${JUDGE_MODEL_NAME}-score-$MODEL_NAME.json"
 fi
diff --git a/bin/api/run_docker_eval.sh b/bin/api/run_docker_eval.sh
index 294195f..c2861ed 100755
--- a/bin/api/run_docker_eval.sh
+++ b/bin/api/run_docker_eval.sh
@@ -22,10 +22,13 @@ print_usage() {
     echo "  --question-count  Number of questions to evaluate (optional)"
     echo
     echo "Judge mode options:"
-    echo "  --model-name      Name of the model to evaluate"
-    echo "  --openai-api-key  OpenAI API key for GPT-4 judgment"
-    echo "  --parallel        Number of parallel processes (default: 5)"
-    echo "  --ci              CI mode (default: false)"
+    echo "  --model-name          Name of the model to evaluate"
+    echo "  --openai-api-key      OpenAI API key for backward compatibility"
+    echo "  --judge-model-name    Name of the judge model (default: gpt-4)"
+    echo "  --judge-model-url     Base URL for the judge model API"
+    echo "  --judge-model-api-key API key for the judge model"
+    echo "  --parallel            Number of parallel processes (default: 5)"
+    echo "  --ci                  CI mode (default: false)"
 }
 
 if [ $# -lt 1 ]; then
@@ -107,6 +110,9 @@ elif [[ "$MODE" == "judge" ]]; then
     # Process judge mode arguments
     MODEL_NAME=""
     OPENAI_API_KEY=""
+    JUDGE_MODEL_NAME="gpt-4"
+    JUDGE_MODEL_URL=""
+    JUDGE_MODEL_API_KEY=""
     PARALLEL="5"
     CI="false"
 
@@ -120,6 +126,18 @@ elif [[ "$MODE" == "judge" ]]; then
                 OPENAI_API_KEY="$2"
                 shift 2
                 ;;
+            --judge-model-name)
+                JUDGE_MODEL_NAME="$2"
+                shift 2
+                ;;
+            --judge-model-url)
+                JUDGE_MODEL_URL="$2"
+                shift 2
+                ;;
+            --judge-model-api-key)
+                JUDGE_MODEL_API_KEY="$2"
+                shift 2
+                ;;
             --parallel)
                 PARALLEL="$2"
                 shift 2
@@ -142,16 +160,35 @@ elif [[ "$MODE" == "judge" ]]; then
         exit 1
     fi
 
-    if [[ -z "$OPENAI_API_KEY" ]]; then
-        echo "Error: --openai-api-key is required"
+    # If --judge-model-api-key is provided, use it
+    if [[ -n "$JUDGE_MODEL_API_KEY" ]]; then
+        # Use the new judge model parameters
+        if [[ -z "$JUDGE_MODEL_URL" ]]; then
+            echo "Error: --judge-model-url is required when using --judge-model-api-key"
+            print_usage
+            exit 1
+        fi
+    elif [[ -z "$OPENAI_API_KEY" ]]; then
+        # Fall back to requiring OpenAI API key
+        echo "Error: Either --judge-model-api-key or --openai-api-key is required"
         print_usage
         exit 1
+    else
+        # If only OpenAI API key is provided, use it as the judge model API key
+        JUDGE_MODEL_API_KEY="$OPENAI_API_KEY"
+        # Default to OpenAI API URL if using OpenAI API key
+        if [[ -z "$JUDGE_MODEL_URL" ]]; then
+            JUDGE_MODEL_URL="https://api.openai.com/v1"
+        fi
     fi
 
     # Run judge mode
     docker run --rm -it \
         --network="host" \
         -e MODEL_NAME="$MODEL_NAME" \
+        -e JUDGE_MODEL_NAME="$JUDGE_MODEL_NAME" \
+        -e JUDGE_MODEL_URL="$JUDGE_MODEL_URL" \
+        -e JUDGE_MODEL_API_KEY="$JUDGE_MODEL_API_KEY" \
         -e OPENAI_API_KEY="$OPENAI_API_KEY" \
         -v "$(pwd)/llm_judge:/app/llm_judge" \
         liquidai/mt-bench:latest judge \
diff --git a/bin/api/run_openai_judge.sh b/bin/api/run_openai_judge.sh
index de8c7d6..c501bf0 100755
--- a/bin/api/run_openai_judge.sh
+++ b/bin/api/run_openai_judge.sh
@@ -1,16 +1,23 @@
 #!/bin/bash
 
 print_usage() {
-    echo "Usage: $0 --openai-api-key <api_key> --model-name <model_name> --parallel <parallel>"
+    echo "Usage: $0 --model-name <model_name> [--openai-api-key <api_key> | (--judge-model-name <judge_model_name> --judge-model-url <url> --judge-model-api-key <api_key>)] --parallel <parallel>"
     echo
     echo "Arguments:"
-    echo "  --openai-api-key OpenAI API key"
-    echo "  --model-name     Model name"
-    echo "  --parallel       Number of parallel processes"
+    echo "  --model-name          Model name to be evaluated"
+    echo "  --openai-api-key      OpenAI API key (backward compatibility)"
+    echo "  --judge-model-name    Name of the judge model (default: gpt-4)"
+    echo "  --judge-model-url     Base URL for the judge model API"
+    echo "  --judge-model-api-key API key for the judge model"
+    echo "  --parallel            Number of parallel processes"
+    echo "  --ci                  CI mode"
 }
 
 OPENAI_API_KEY=""
 MODEL_NAME=""
+JUDGE_MODEL_NAME="gpt-4"
+JUDGE_MODEL_URL=""
+JUDGE_MODEL_API_KEY=""
 PARALLEL="5"
 CI="false"
 
@@ -24,6 +31,18 @@ while [[ $# -gt 0 ]]; do
             MODEL_NAME="$2"
             shift 2
             ;;
+        --judge-model-name)
+            JUDGE_MODEL_NAME="$2"
+            shift 2
+            ;;
+        --judge-model-url)
+            JUDGE_MODEL_URL="$2"
+            shift 2
+            ;;
+        --judge-model-api-key)
+            JUDGE_MODEL_API_KEY="$2"
+            shift 2
+            ;;
         --parallel)
             PARALLEL="$2"
             shift 2
@@ -40,10 +59,26 @@ while [[ $# -gt 0 ]]; do
     esac
 done
 
-if [[ -z "$OPENAI_API_KEY" ]]; then
-    echo "Error: --openai-api-key is required"
+# If judge model API key is provided, use it
+if [[ -n "$JUDGE_MODEL_API_KEY" ]]; then
+    # Use the new judge model parameters
+    if [[ -z "$JUDGE_MODEL_URL" ]]; then
+        echo "Error: --judge-model-url is required when using --judge-model-api-key"
+        print_usage
+        exit 1
+    fi
+elif [[ -z "$OPENAI_API_KEY" ]]; then
+    # Fall back to requiring OpenAI API key
+    echo "Error: Either --judge-model-api-key or --openai-api-key is required"
     print_usage
     exit 1
+else
+    # If only OpenAI API key is provided, use it as the judge model API key
+    JUDGE_MODEL_API_KEY="$OPENAI_API_KEY"
+    # Default to OpenAI API URL if using OpenAI API key
+    if [[ -z "$JUDGE_MODEL_URL" ]]; then
+        JUDGE_MODEL_URL="https://api.openai.com/v1"
+    fi
 fi
 
 if [[ -z "$MODEL_NAME" ]]; then
@@ -53,15 +88,20 @@ if [[ -z "$MODEL_NAME" ]]; then
 fi
 
 export OPENAI_API_KEY="$OPENAI_API_KEY"
+export JUDGE_MODEL_NAME="$JUDGE_MODEL_NAME"
+export JUDGE_MODEL_URL="$JUDGE_MODEL_URL"
+export JUDGE_MODEL_API_KEY="$JUDGE_MODEL_API_KEY"
 export PYTHONPATH=.
 
 python llm_judge/gen_judgment.py \
   --model-list "$MODEL_NAME" \
+  --judge-model "$JUDGE_MODEL_NAME" \
   --parallel "$PARALLEL" \
   --bench-name japanese_mt_bench
 
 python llm_judge/show_result.py --model-list "$MODEL_NAME" \
+  --judge-model "$JUDGE_MODEL_NAME" \
   --ci "$CI" \
   --bench-name japanese_mt_bench \
-  --input-file llm_judge/data/japanese_mt_bench/model_judgment/gpt-4_$MODEL_NAME.jsonl \
-  --output llm_judge/data/japanese_mt_bench/gpt4-score-$MODEL_NAME.json
+  --input-file "llm_judge/data/japanese_mt_bench/model_judgment/${JUDGE_MODEL_NAME}_$MODEL_NAME.jsonl" \
+  --output "llm_judge/data/japanese_mt_bench/${JUDGE_MODEL_NAME}-score-$MODEL_NAME.json"
diff --git a/llm_judge/common.py b/llm_judge/common.py
index bf4d3f6..1436850 100755
--- a/llm_judge/common.py
+++ b/llm_judge/common.py
@@ -157,7 +157,7 @@ def load_judge_prompts(prompt_file: str):
     return prompts
 
 
-def run_judge_single(question, answer, judge, ref_answer, multi_turn=False, azure=True):
+def run_judge_single(question, answer, judge, ref_answer, multi_turn=False, api_dict=None):
     kwargs = {}
     model = judge.model_name
     if ref_answer is not None:
@@ -193,12 +193,9 @@ def run_judge_single(question, answer, judge, ref_answer, multi_turn=False, azur
         conv.append_message(conv.roles[1], None)
 
         if model in OPENAI_MODEL_LIST:
-            if azure:
-                judgment = chat_completion_openai_azure(
-                    model, conv, temperature=0, max_tokens=2048
-                )
-            else:
-                judgment = chat_completion_openai(model, conv, temperature=0, max_tokens=2048)
+            judgment = chat_completion_openai(
+                model, conv, temperature=0, max_tokens=2048, api_dict=api_dict
+            )
         elif model in ANTHROPIC_MODEL_LIST:
             judgment = chat_completion_anthropic(
                 model, conv, temperature=0, max_tokens=1024
@@ -226,7 +223,7 @@ def run_judge_single(question, answer, judge, ref_answer, multi_turn=False, azur
     return rating_list, user_prompt_list, judgment_list
 
 
-def play_a_match_single(match: MatchPair, output_file: str, azure=True):
+def play_a_match_single(match: MatchSingle, output_file: str, api_dict=None):
     question, model, answer, judge, ref_answer, multi_turn = (
         match.question,
         match.model,
@@ -238,7 +235,7 @@ def play_a_match_single(match: MatchPair, output_file: str, azure=True):
 
     if judge.prompt_template["type"] == "single":
         score_list, user_prompt_list, judgment_list = run_judge_single(
-            question, answer, judge, ref_answer, multi_turn=multi_turn, azure=azure
+            question, answer, judge, ref_answer, multi_turn=multi_turn, api_dict=api_dict
         )
 
         question_id = question["question_id"]
@@ -259,7 +256,7 @@ def play_a_match_single(match: MatchPair, output_file: str, azure=True):
             f"judge: {(judge.model_name, judge.prompt_template['name'])}"
         )
     else:
-        raise ValueError(f"invalid judge type: {judge['type']}")
+        raise ValueError(f"invalid judge type: {judge.prompt_template['type']}")
 
     if output_file:
         os.makedirs(os.path.dirname(output_file), exist_ok=True)
@@ -269,7 +266,7 @@ def play_a_match_single(match: MatchPair, output_file: str, azure=True):
     return result
 
 
-def run_judge_pair(question, answer_a, answer_b, judge, ref_answer, multi_turn=False, azure=True):
+def run_judge_pair(question, answer_a, answer_b, judge, ref_answer, multi_turn=False, api_dict=None):
     kwargs = {}
     model = judge.model_name
     if ref_answer is not None:
@@ -305,12 +302,9 @@ def run_judge_pair(question, answer_a, answer_b, judge, ref_answer, multi_turn=F
 
     if model in OPENAI_MODEL_LIST:
         conv.set_system_message(system_prompt)
-        if azure:
-            judgment = chat_completion_openai_azure(
-                model, conv, temperature=0, max_tokens=2048
-            )
-        else:
-            judgment = chat_completion_openai(model, conv, temperature=0, max_tokens=2048)
+        judgment = chat_completion_openai(
+            model, conv, temperature=0, max_tokens=2048, api_dict=api_dict
+        )
     elif model in ANTHROPIC_MODEL_LIST:
         if system_prompt != "You are a helpful assistant.":
             user_prompt = "[Instruction]\n" + system_prompt + "\n\n" + user_prompt
@@ -352,7 +346,7 @@ def run_judge_pair(question, answer_a, answer_b, judge, ref_answer, multi_turn=F
     return winner, user_prompt, judgment
 
 
-def play_a_match_pair(match: MatchPair, output_file: str):
+def play_a_match_pair(match: MatchPair, output_file: str, api_dict=None):
     question, model_1, model_2, answer_1, answer_2, judge, ref_answer, multi_turn = (
         match.question,
         match.model_1,
@@ -366,10 +360,10 @@ def play_a_match_pair(match: MatchPair, output_file: str):
 
     if judge.prompt_template["type"] == "pairwise":
         g1_winner, g1_user_prompt, g1_judgment = run_judge_pair(
-            question, answer_1, answer_2, judge, ref_answer, multi_turn=multi_turn, azure=True
+            question, answer_1, answer_2, judge, ref_answer, multi_turn=multi_turn, api_dict=api_dict
         )
         g2_winner, g2_user_prompt, g2_judgment = run_judge_pair(
-            question, answer_2, answer_1, judge, ref_answer, multi_turn=multi_turn, azure=True
+            question, answer_2, answer_1, judge, ref_answer, multi_turn=multi_turn, api_dict=api_dict
         )
 
         g1_map = {"A": "model_1", "B": "model_2"}
@@ -401,15 +395,19 @@ def play_a_match_pair(match: MatchPair, output_file: str):
         )
     elif judge.prompt_template["type"] == "single":
         m1_score, m1_user_prompt, m1_judgment = run_judge_single(
-            question, answer_1, judge, azure=True
+            question, answer_1, judge, ref_answer, api_dict=api_dict
         )
         m2_score, m2_user_prompt, m2_judgment = run_judge_single(
-            question, answer_2, judge, azure=True
+            question, answer_2, judge, ref_answer, api_dict=api_dict
         )
 
-        if abs(m1_score - m2_score) <= TIE_DELTA:
+        # Extract first score from lists
+        m1_first_score = m1_score[0] if isinstance(m1_score, list) else m1_score
+        m2_first_score = m2_score[0] if isinstance(m2_score, list) else m2_score
+        
+        if abs(m1_first_score - m2_first_score) <= TIE_DELTA:
             winner = "tie"
-        elif m1_score > m2_score:
+        elif m1_first_score > m2_first_score:
             winner = "model_1"
         else:
             winner = "model_2"
@@ -436,7 +434,7 @@ def play_a_match_pair(match: MatchPair, output_file: str):
             f"judge: {(judge.model_name, judge.prompt_template['name'])}"
         )
     else:
-        raise ValueError(f"invalid judge type: {judge['type']}")
+        raise ValueError(f"invalid judge type: {judge.prompt_template['type']}")
 
     if output_file:
         os.makedirs(os.path.dirname(output_file), exist_ok=True)
@@ -520,8 +518,8 @@ def chat_completion_openai_azure(model, conv, temperature, max_tokens, api_dict=
         except openai.error.InvalidRequestError as e:
             print(type(e), e)
             break
-        except KeyError:
-            print(response)
+        except KeyError as e:
+            print(f"KeyError: {e}")
             break
 
     return output
diff --git a/llm_judge/gen_judgment.py b/llm_judge/gen_judgment.py
index a4cb2c3..7732f8c 100755
--- a/llm_judge/gen_judgment.py
+++ b/llm_judge/gen_judgment.py
@@ -184,7 +184,9 @@ def make_judge_single(judge_model, judge_prompts):
         default="llm_judge/data/judge_prompts.jsonl",
         help="The file of judge prompts.",
     )
-    parser.add_argument("--judge-model", type=str, default="gpt-4")
+    parser.add_argument("--judge-model", type=str, default="gpt-4", help="The model used for judging")
+    parser.add_argument("--judge-model-url", type=str, default="", help="Base URL for the judge model API")
+    parser.add_argument("--judge-model-api-key", type=str, default="", help="API key for the judge model")
     parser.add_argument("--baseline-model", type=str, default="gpt-3.5-turbo")
     parser.add_argument(
         "--mode",
@@ -320,14 +322,22 @@ def make_judge_single(judge_model, judge_prompts):
     print(json.dumps(match_stat, indent=4, ensure_ascii=False))
     # input("Press Enter to confirm...")
 
+    # Prepare API dict if judge model URL and API key are provided
+    api_dict = None
+    if args.judge_model_url and args.judge_model_api_key:
+        api_dict = {
+            "api_base": args.judge_model_url,
+            "api_key": args.judge_model_api_key
+        }
+
     # Play matches
     if args.parallel == 1:
         for match in tqdm(matches):
-            play_a_match_func(match, output_file=output_file, azure=args.azure)
+            play_a_match_func(match, output_file=output_file, api_dict=api_dict)
     else:
 
         def play_a_match_wrapper(match):
-            play_a_match_func(match, output_file=output_file, azure=args.azure)
+            play_a_match_func(match, output_file=output_file, api_dict=api_dict)
 
         np.random.seed(0)
         np.random.shuffle(matches)

From 030e054ffb866138388cf0683457e2ae2db1a03e Mon Sep 17 00:00:00 2001
From: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com>
Date: Thu, 20 Mar 2025 07:11:48 +0000
Subject: [PATCH 02/13] Remove OpenAI API key parameter completely

Co-Authored-By: liren@liquid.ai <liren@liquid.ai>
---
 bin/api/entrypoint.sh       | 16 +++++++------
 bin/api/run_docker_eval.sh  | 35 +++++++++--------------------
 bin/api/run_openai_judge.sh | 37 +++++++++---------------------
 llm_judge/common.py         | 45 +++----------------------------------
 llm_judge/gen_judgment.py   | 16 ++++---------
 5 files changed, 37 insertions(+), 112 deletions(-)

diff --git a/bin/api/entrypoint.sh b/bin/api/entrypoint.sh
index 562069a..ce284a8 100755
--- a/bin/api/entrypoint.sh
+++ b/bin/api/entrypoint.sh
@@ -51,13 +51,15 @@ elif [[ "$MODE" == "judge" ]]; then
     JUDGE_MODEL_URL=${JUDGE_MODEL_URL:-""}
     JUDGE_MODEL_API_KEY=${JUDGE_MODEL_API_KEY:-""}
     
-    # If no judge model API key is provided, use OpenAI API key
-    if [[ -z "$JUDGE_MODEL_API_KEY" && -n "$OPENAI_API_KEY" ]]; then
-        JUDGE_MODEL_API_KEY="$OPENAI_API_KEY"
-        # Set default OpenAI URL if none provided
-        if [[ -z "$JUDGE_MODEL_URL" ]]; then
-            JUDGE_MODEL_URL="https://api.openai.com/v1"
-        fi
+    # Ensure required parameters are set
+    if [[ -z "$JUDGE_MODEL_API_KEY" ]]; then
+        echo "Error: JUDGE_MODEL_API_KEY environment variable is required"
+        exit 1
+    fi
+    
+    if [[ -z "$JUDGE_MODEL_URL" ]]; then
+        echo "Error: JUDGE_MODEL_URL environment variable is required"
+        exit 1
     fi
 
     while [[ $# -gt 0 ]]; do
diff --git a/bin/api/run_docker_eval.sh b/bin/api/run_docker_eval.sh
index c2861ed..5a05101 100755
--- a/bin/api/run_docker_eval.sh
+++ b/bin/api/run_docker_eval.sh
@@ -23,7 +23,6 @@ print_usage() {
     echo
     echo "Judge mode options:"
     echo "  --model-name          Name of the model to evaluate"
-    echo "  --openai-api-key      OpenAI API key for backward compatibility"
     echo "  --judge-model-name    Name of the judge model (default: gpt-4)"
     echo "  --judge-model-url     Base URL for the judge model API"
     echo "  --judge-model-api-key API key for the judge model"
@@ -109,7 +108,6 @@ if [[ "$MODE" == "generate" ]]; then
 elif [[ "$MODE" == "judge" ]]; then
     # Process judge mode arguments
     MODEL_NAME=""
-    OPENAI_API_KEY=""
     JUDGE_MODEL_NAME="gpt-4"
     JUDGE_MODEL_URL=""
     JUDGE_MODEL_API_KEY=""
@@ -122,10 +120,7 @@ elif [[ "$MODE" == "judge" ]]; then
                 MODEL_NAME="$2"
                 shift 2
                 ;;
-            --openai-api-key)
-                OPENAI_API_KEY="$2"
-                shift 2
-                ;;
+
             --judge-model-name)
                 JUDGE_MODEL_NAME="$2"
                 shift 2
@@ -160,26 +155,17 @@ elif [[ "$MODE" == "judge" ]]; then
         exit 1
     fi
 
-    # If --judge-model-api-key is provided, use it
-    if [[ -n "$JUDGE_MODEL_API_KEY" ]]; then
-        # Use the new judge model parameters
-        if [[ -z "$JUDGE_MODEL_URL" ]]; then
-            echo "Error: --judge-model-url is required when using --judge-model-api-key"
-            print_usage
-            exit 1
-        fi
-    elif [[ -z "$OPENAI_API_KEY" ]]; then
-        # Fall back to requiring OpenAI API key
-        echo "Error: Either --judge-model-api-key or --openai-api-key is required"
+    # Validate required parameters
+    if [[ -z "$JUDGE_MODEL_API_KEY" ]]; then
+        echo "Error: --judge-model-api-key is required"
+        print_usage
+        exit 1
+    fi
+    
+    if [[ -z "$JUDGE_MODEL_URL" ]]; then
+        echo "Error: --judge-model-url is required"
         print_usage
         exit 1
-    else
-        # If only OpenAI API key is provided, use it as the judge model API key
-        JUDGE_MODEL_API_KEY="$OPENAI_API_KEY"
-        # Default to OpenAI API URL if using OpenAI API key
-        if [[ -z "$JUDGE_MODEL_URL" ]]; then
-            JUDGE_MODEL_URL="https://api.openai.com/v1"
-        fi
     fi
 
     # Run judge mode
@@ -189,7 +175,6 @@ elif [[ "$MODE" == "judge" ]]; then
         -e JUDGE_MODEL_NAME="$JUDGE_MODEL_NAME" \
         -e JUDGE_MODEL_URL="$JUDGE_MODEL_URL" \
         -e JUDGE_MODEL_API_KEY="$JUDGE_MODEL_API_KEY" \
-        -e OPENAI_API_KEY="$OPENAI_API_KEY" \
         -v "$(pwd)/llm_judge:/app/llm_judge" \
         liquidai/mt-bench:latest judge \
         --parallel "$PARALLEL" \
diff --git a/bin/api/run_openai_judge.sh b/bin/api/run_openai_judge.sh
index c501bf0..f9c512e 100755
--- a/bin/api/run_openai_judge.sh
+++ b/bin/api/run_openai_judge.sh
@@ -1,11 +1,10 @@
 #!/bin/bash
 
 print_usage() {
-    echo "Usage: $0 --model-name <model_name> [--openai-api-key <api_key> | (--judge-model-name <judge_model_name> --judge-model-url <url> --judge-model-api-key <api_key>)] --parallel <parallel>"
+    echo "Usage: $0 --model-name <model_name> --judge-model-name <judge_model_name> --judge-model-url <url> --judge-model-api-key <api_key> --parallel <parallel>"
     echo
     echo "Arguments:"
     echo "  --model-name          Model name to be evaluated"
-    echo "  --openai-api-key      OpenAI API key (backward compatibility)"
     echo "  --judge-model-name    Name of the judge model (default: gpt-4)"
     echo "  --judge-model-url     Base URL for the judge model API"
     echo "  --judge-model-api-key API key for the judge model"
@@ -13,7 +12,6 @@ print_usage() {
     echo "  --ci                  CI mode"
 }
 
-OPENAI_API_KEY=""
 MODEL_NAME=""
 JUDGE_MODEL_NAME="gpt-4"
 JUDGE_MODEL_URL=""
@@ -23,10 +21,7 @@ CI="false"
 
 while [[ $# -gt 0 ]]; do
     case $1 in
-        --openai-api-key)
-            OPENAI_API_KEY="$2"
-            shift 2
-            ;;
+
         --model-name)
             MODEL_NAME="$2"
             shift 2
@@ -59,26 +54,17 @@ while [[ $# -gt 0 ]]; do
     esac
 done
 
-# If judge model API key is provided, use it
-if [[ -n "$JUDGE_MODEL_API_KEY" ]]; then
-    # Use the new judge model parameters
-    if [[ -z "$JUDGE_MODEL_URL" ]]; then
-        echo "Error: --judge-model-url is required when using --judge-model-api-key"
-        print_usage
-        exit 1
-    fi
-elif [[ -z "$OPENAI_API_KEY" ]]; then
-    # Fall back to requiring OpenAI API key
-    echo "Error: Either --judge-model-api-key or --openai-api-key is required"
+# Validate required parameters
+if [[ -z "$JUDGE_MODEL_API_KEY" ]]; then
+    echo "Error: --judge-model-api-key is required"
+    print_usage
+    exit 1
+fi
+
+if [[ -z "$JUDGE_MODEL_URL" ]]; then
+    echo "Error: --judge-model-url is required"
     print_usage
     exit 1
-else
-    # If only OpenAI API key is provided, use it as the judge model API key
-    JUDGE_MODEL_API_KEY="$OPENAI_API_KEY"
-    # Default to OpenAI API URL if using OpenAI API key
-    if [[ -z "$JUDGE_MODEL_URL" ]]; then
-        JUDGE_MODEL_URL="https://api.openai.com/v1"
-    fi
 fi
 
 if [[ -z "$MODEL_NAME" ]]; then
@@ -87,7 +73,6 @@ if [[ -z "$MODEL_NAME" ]]; then
     exit 1
 fi
 
-export OPENAI_API_KEY="$OPENAI_API_KEY"
 export JUDGE_MODEL_NAME="$JUDGE_MODEL_NAME"
 export JUDGE_MODEL_URL="$JUDGE_MODEL_URL"
 export JUDGE_MODEL_API_KEY="$JUDGE_MODEL_API_KEY"
diff --git a/llm_judge/common.py b/llm_judge/common.py
index 1436850..067313d 100755
--- a/llm_judge/common.py
+++ b/llm_judge/common.py
@@ -477,52 +477,13 @@ def chat_completion_openai(model, conv, temperature, max_tokens, api_dict=None):
     return output
 
 
-def chat_completion_openai_azure(model, conv, temperature, max_tokens, api_dict=None):
-    openai.api_type = "azure"
-    openai.api_version = "2023-07-01-preview"
-    if api_dict is not None:
-        openai.api_base = api_dict["api_base"]
-        openai.api_key = api_dict["api_key"]
-    else:
-        openai.api_base = os.environ["AZURE_OPENAI_ENDPOINT"]
-        openai.api_key = os.environ["AZURE_OPENAI_KEY"]
+# Remove Azure-specific function as we now use custom judge model parameters
+# This function is no longer needed
 
     if "azure-" in model:
         model = model[6:]
 
-    output = API_ERROR_OUTPUT
-    min_sleep_time = 1
-    max_sleep_time = API_RETRY_SLEEP
-    for _ in range(API_MAX_RETRY):
-        try:
-            messages = conv.to_openai_api_messages()
-            response = openai.ChatCompletion.create(
-                engine=model,
-                messages=messages,
-                n=1,
-                temperature=temperature,
-                max_tokens=max_tokens,
-            )
-            output = response["choices"][0]["message"]["content"]
-            break
-        except openai.error.RateLimitError as e:
-            print(type(e), e)
-            sleep_time = random.randint(min_sleep_time, max_sleep_time)
-            print(f"Sleeping for {sleep_time} seconds")
-            time.sleep(sleep_time)
-            max_sleep_time = min(MAX_API_RETRY_SLEEP, max_sleep_time * 2)
-            min_sleep_time = max_sleep_time // 2
-        except openai.error.OpenAIError as e:
-            print(type(e), e)
-            time.sleep(API_RETRY_SLEEP)
-        except openai.error.InvalidRequestError as e:
-            print(type(e), e)
-            break
-        except KeyError as e:
-            print(f"KeyError: {e}")
-            break
-
-    return output
+# Function body removed as we now use custom judge model parameters
 
 
 def chat_completion_anthropic(model, conv, temperature, max_tokens, api_dict=None):
diff --git a/llm_judge/gen_judgment.py b/llm_judge/gen_judgment.py
index 7732f8c..961c871 100755
--- a/llm_judge/gen_judgment.py
+++ b/llm_judge/gen_judgment.py
@@ -213,9 +213,7 @@ def make_judge_single(judge_model, judge_prompts):
     parser.add_argument(
         "--first-n", type=int, help="A debug option. Only run the first `n` judgments."
     )
-    parser.add_argument(
-        "--azure", action="store_true", help="Use Azure API instead of openai.", default=False
-    )
+    # Remove Azure parameter as we now use custom judge model parameters
     args = parser.parse_args()
 
     args.model_list = [model_path.replace("/", "_") for model_path in args.model_list]
@@ -250,20 +248,14 @@ def make_judge_single(judge_model, judge_prompts):
     if args.mode == "single":
         judges = make_judge_single(args.judge_model, judge_prompts)
         play_a_match_func = play_a_match_single
-        if args.azure:
-            output_file = os.path.join(current_dir, "data", args.bench_name, "model_judgment", f"{args.judge_model}_single_azure.jsonl")
-        else:
-            model_suffix = "_".join(args.model_list)
-            output_file = os.path.join(current_dir, "data", args.bench_name, "model_judgment", f"{args.judge_model}_{model_suffix}.jsonl")
+        model_suffix = "_".join(args.model_list)
+        output_file = os.path.join(current_dir, "data", args.bench_name, "model_judgment", f"{args.judge_model}_{model_suffix}.jsonl")
         make_match_func = make_match_single
         baseline_model = None
     else:
         judges = make_judge_pairwise(args.judge_model, judge_prompts)
         play_a_match_func = play_a_match_pair
-        if args.azure:
-            output_file = os.path.join(current_dir, "data", args.bench_name, "model_judgment", f"{args.judge_model}_pair_azure.jsonl")
-        else:
-            output_file = os.path.join(current_dir, "data", args.bench_name, "model_judgment", f"{args.judge_model}_pair.jsonl")
+        output_file = os.path.join(current_dir, "data", args.bench_name, "model_judgment", f"{args.judge_model}_pair.jsonl")
         if args.mode == "pairwise-all":
             make_match_func = make_match_all_pairs
             baseline_model = None

From f8ab3bae4c21bc06acda2f9b502986cc611c920e Mon Sep 17 00:00:00 2001
From: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com>
Date: Thu, 20 Mar 2025 07:23:14 +0000
Subject: [PATCH 03/13] Update README and GitHub workflow to use gpt-4o as
 judge model

Co-Authored-By: liren@liquid.ai <liren@liquid.ai>
---
 .github/workflows/run-eval.yaml |  2 +-
 README.md                       | 34 ++++++++++++++++++++-------------
 2 files changed, 22 insertions(+), 14 deletions(-)

diff --git a/.github/workflows/run-eval.yaml b/.github/workflows/run-eval.yaml
index e7957c2..33d044e 100644
--- a/.github/workflows/run-eval.yaml
+++ b/.github/workflows/run-eval.yaml
@@ -75,7 +75,7 @@ jobs:
         run: |
           bin/api/run_openai_judge.sh \
             --model-name "$MODEL_NAME" \
-            --judge-model-name "gpt-4" \
+            --judge-model-name "gpt-4o" \
             --judge-model-url "https://api.openai.com/v1" \
             --judge-model-api-key "$OPENAI_API_KEY" \
             --parallel 3
diff --git a/README.md b/README.md
index 07fc2a5..47e2b8b 100644
--- a/README.md
+++ b/README.md
@@ -21,17 +21,19 @@ bin/api/run_docker_eval.sh generate \
 
 Results will be output in `llm_judge/data/japanese_mt_bench/model_answer/<model-name>.jsonl`
 
-2. Run OpenAI judge:
+2. Run judge:
 
 ```bash
 bin/api/run_docker_eval.sh judge \
   --model-name <model-name> \
-  --openai-api-key <openai-api-key>
+  --judge-model-name <judge-model-name> \
+  --judge-model-url <judge-model-url> \
+  --judge-model-api-key <judge-model-api-key>
 ```
 
-GPT judge results will be output to `llm_judge/data/japanese_mt_bench/model_judgment/gpt-4_<model-name>.jsonl`.
+Judge results will be output to `llm_judge/data/japanese_mt_bench/model_judgment/<judge-model-name>_<model-name>.jsonl`.
 
-The final scores will be output in `llm_judge/data/japanese_mt_bench/gpt4-score-<model-name>.json`.
+The final scores will be output in `llm_judge/data/japanese_mt_bench/<judge-model-name>-score-<model-name>.json`.
 
 ### Examples
 
@@ -45,7 +47,9 @@ bin/api/run_docker_eval.sh generate \
 
 bin/api/run_docker_eval.sh judge \
   --model-name lfm-3b-jp \
-  --openai-api-key <OPENAI-API-KEY>
+  --judge-model-name gpt-4o \
+  --judge-model-url https://api.openai.com/v1 \
+  --judge-model-api-key <OPENAI-API-KEY>
 ```
 
 Run eval for `lfm-3b-ichikara` on-prem:
@@ -71,7 +75,9 @@ bin/api/run_docker_eval.sh generate \
 
 bin/api/run_docker_eval.sh judge \
   --model-name lfm-3b-jp \
-  --openai-api-key <OPENAI-API-KEY>
+  --judge-model-name gpt-4o \
+  --judge-model-url https://api.openai.com/v1 \
+  --judge-model-api-key <OPENAI-API-KEY>
 ```
 
 ## Run Evaluation without Docker
@@ -111,16 +117,16 @@ Results will be output in `llm_judge/data/japanese_mt_bench/model_answer/<model-
 2. Run the following scripts to generate GPT-4 judgement scores for the model answers.
 
 ```bash
-bin/api/run_openai_judge.sh --model-name <model-name> --openai-api-key <OPENAI-API-KEY>
+bin/api/run_openai_judge.sh --model-name <model-name> --judge-model-name <judge-model-name> --judge-model-url <judge-model-url> --judge-model-api-key <judge-model-api-key>
 
 # examples:
-bin/api/run_openai_judge.sh --model-name lfm-3b-jp --openai-api-key <OPENAI-API-KEY>
-bin/api/run_openai_judge.sh --model-name lfm-3b-ichikara --openai-api-key <OPENAI-API-KEY>
+bin/api/run_openai_judge.sh --model-name lfm-3b-jp --judge-model-name gpt-4o --judge-model-url https://api.openai.com/v1 --judge-model-api-key <OPENAI-API-KEY>
+bin/api/run_openai_judge.sh --model-name lfm-3b-ichikara --judge-model-name gpt-4o --judge-model-url https://api.openai.com/v1 --judge-model-api-key <OPENAI-API-KEY>
 ```
 
-GPT judge results will be output to `llm_judge/data/japanese_mt_bench/model_judgment/gpt-4_<model-name>.jsonl`.
+Judge results will be output to `llm_judge/data/japanese_mt_bench/model_judgment/<judge-model-name>_<model-name>.jsonl`.
 
-The final scores will be output in `llm_judge/data/japanese_mt_bench/gpt4-score-<model-name>.json`.
+The final scores will be output in `llm_judge/data/japanese_mt_bench/<judge-model-name>-score-<model-name>.json`.
 
 </details>
 
@@ -148,8 +154,10 @@ This applies to both `bin/api/run_docker_eval.sh judge` and `bin/api/run_openai_
 
 | Argument | Description | Required |
 | --- | --- | --- |
-| `--model-name` | Model name | Yes |
-| `--openai-api-key` | OpenAI API key | Yes |
+| `--model-name` | Model name to be evaluated | Yes |
+| `--judge-model-name` | Name of the judge model (default: gpt-4) | No |
+| `--judge-model-url` | Base URL for the judge model API | Yes |
+| `--judge-model-api-key` | API key for the judge model | Yes |
 | `--parallel` | Number of parallel API calls | No. Default to 5. |
 
 </details>

From 75237681a4d3ade49acee46178b38f5cf86abc51 Mon Sep 17 00:00:00 2001
From: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com>
Date: Thu, 20 Mar 2025 07:26:34 +0000
Subject: [PATCH 04/13] Fix CI: Revert to gpt-4 as judge model name

Co-Authored-By: liren@liquid.ai <liren@liquid.ai>
---
 .github/workflows/run-eval.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/run-eval.yaml b/.github/workflows/run-eval.yaml
index 33d044e..e7957c2 100644
--- a/.github/workflows/run-eval.yaml
+++ b/.github/workflows/run-eval.yaml
@@ -75,7 +75,7 @@ jobs:
         run: |
           bin/api/run_openai_judge.sh \
             --model-name "$MODEL_NAME" \
-            --judge-model-name "gpt-4o" \
+            --judge-model-name "gpt-4" \
             --judge-model-url "https://api.openai.com/v1" \
             --judge-model-api-key "$OPENAI_API_KEY" \
             --parallel 3

From cfc8845992479c1ded1f146b901d50b277bbf1c1 Mon Sep 17 00:00:00 2001
From: Liren Tu <tuliren@gmail.com>
Date: Wed, 9 Apr 2025 20:06:41 -0400
Subject: [PATCH 05/13] Run judge on gpt-4o-mini

---
 .github/workflows/run-eval.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/run-eval.yaml b/.github/workflows/run-eval.yaml
index e7957c2..c823872 100644
--- a/.github/workflows/run-eval.yaml
+++ b/.github/workflows/run-eval.yaml
@@ -75,7 +75,7 @@ jobs:
         run: |
           bin/api/run_openai_judge.sh \
             --model-name "$MODEL_NAME" \
-            --judge-model-name "gpt-4" \
+            --judge-model-name "gpt-4o-mini" \
             --judge-model-url "https://api.openai.com/v1" \
             --judge-model-api-key "$OPENAI_API_KEY" \
             --parallel 3

From ae180a86f06ed4ce46d311f5706667719d5bf30d Mon Sep 17 00:00:00 2001
From: Liren Tu <tuliren@gmail.com>
Date: Wed, 9 Apr 2025 20:51:23 -0400
Subject: [PATCH 06/13] Use gpt reference answer

---
 README.md              | 2 ++
 llm_judge/common.py    | 4 ++--
 model/model_adapter.py | 6 +++---
 3 files changed, 7 insertions(+), 5 deletions(-)

diff --git a/README.md b/README.md
index 47e2b8b..381e320 100644
--- a/README.md
+++ b/README.md
@@ -23,6 +23,8 @@ Results will be output in `llm_judge/data/japanese_mt_bench/model_answer/<model-
 
 2. Run judge:
 
+The judge script will use the judge model to compare [GPT-4 results](llm_judge/data/mt_bench/reference_answer/gpt-4.jsonl) with the model results. The judge model defaults to GPT-4.
+
 ```bash
 bin/api/run_docker_eval.sh judge \
   --model-name <model-name> \
diff --git a/llm_judge/common.py b/llm_judge/common.py
index 067313d..ff56792 100755
--- a/llm_judge/common.py
+++ b/llm_judge/common.py
@@ -719,8 +719,8 @@ def check_data(questions, model_answers, ref_answers, models, judges):
             if q["category"] not in NEED_REF_CATS:
                 continue
             assert (
-                q["question_id"] in ref_answers[jg.model_name]
-            ), f"Missing reference answer to Question {q['question_id']} for judge {jg.model_name}"
+                q["question_id"] in ref_answers['gpt-4']
+            ), f"Missing reference answer to Question {q['question_id']} from 'gpt-4'"
 
 
 def get_model_list(answer_dir):
diff --git a/model/model_adapter.py b/model/model_adapter.py
index 688960e..ce099e8 100755
--- a/model/model_adapter.py
+++ b/model/model_adapter.py
@@ -1076,7 +1076,7 @@ class ChatGPTAdapter(BaseModelAdapter):
     """The model adapter for ChatGPT"""
 
     def match(self, model_path: str):
-        return model_path in OPENAI_MODEL_LIST
+        return model_path.startswith("gpt")
 
     def load_model(self, model_path: str, from_pretrained_kwargs: dict):
         raise NotImplementedError()
@@ -1089,7 +1089,7 @@ class AzureOpenAIAdapter(BaseModelAdapter):
     """The model adapter for Azure OpenAI"""
 
     def match(self, model_path: str):
-        return model_path in ("azure-gpt-35-turbo", "azure-gpt-4")
+        return model_path.startswith("azure")
 
     def load_model(self, model_path: str, from_pretrained_kwargs: dict):
         raise NotImplementedError()
@@ -1118,7 +1118,7 @@ class ClaudeAdapter(BaseModelAdapter):
     """The model adapter for Claude"""
 
     def match(self, model_path: str):
-        return model_path in ANTHROPIC_MODEL_LIST
+        return model_path.startswith("claude")
 
     def load_model(self, model_path: str, from_pretrained_kwargs: dict):
         raise NotImplementedError()

From 6c81de7fad1321f9f8cd3c087fa67b35f347b033 Mon Sep 17 00:00:00 2001
From: Liren Tu <tuliren@gmail.com>
Date: Thu, 10 Apr 2025 01:52:39 -0400
Subject: [PATCH 07/13] Fix more key errors

---
 llm_judge/gen_judgment.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/llm_judge/gen_judgment.py b/llm_judge/gen_judgment.py
index 961c871..260cf86 100755
--- a/llm_judge/gen_judgment.py
+++ b/llm_judge/gen_judgment.py
@@ -47,7 +47,7 @@ def make_match(
             a_1 = model_answers[m_1][q_id]
             a_2 = model_answers[baseline_model][q_id]
             if ref_answers is not None:
-                ref = ref_answers[judge.model_name][q_id]
+                ref = ref_answers['gpt-4'][q_id]
                 match = MatchPair(
                     dict(q),
                     m_1,
@@ -87,7 +87,7 @@ def make_match_all_pairs(
                 a_1 = model_answers[m_1][q_id]
                 a_2 = model_answers[m_2][q_id]
                 if ref_answers is not None:
-                    ref = ref_answers[judge.model_name][q_id]
+                    ref = ref_answers['gpt-4'][q_id]
                     match = MatchPair(
                         dict(q),
                         m_1,
@@ -127,7 +127,7 @@ def make_match_single(
                 print(f"Model {m} does not have answer for question {q_id}")
                 continue
             if ref_answers is not None:
-                ref = ref_answers[judge.model_name][q_id]
+                ref = ref_answers['gpt-4'][q_id]
                 matches.append(
                     MatchSingle(
                         dict(q), m, a, judge, ref_answer=ref, multi_turn=multi_turn

From df7362873267fd2abf5e22643f37ecd0b2a0eb7e Mon Sep 17 00:00:00 2001
From: Liren Tu <tuliren@gmail.com>
Date: Thu, 10 Apr 2025 02:17:51 -0400
Subject: [PATCH 08/13] Use openai chat completions

---
 llm_judge/common.py | 99 ++++-----------------------------------------
 1 file changed, 8 insertions(+), 91 deletions(-)

diff --git a/llm_judge/common.py b/llm_judge/common.py
index ff56792..054a59d 100755
--- a/llm_judge/common.py
+++ b/llm_judge/common.py
@@ -13,16 +13,11 @@
 import random
 
 import openai
-import anthropic
 from dotenv import load_dotenv
 
 load_dotenv()
 
-from model.model_adapter import (
-    get_conversation_template,
-    ANTHROPIC_MODEL_LIST,
-    OPENAI_MODEL_LIST,
-)
+from model.model_adapter import get_conversation_template
 
 # API setting constants
 API_MAX_RETRY = 16
@@ -192,16 +187,9 @@ def run_judge_single(question, answer, judge, ref_answer, multi_turn=False, api_
         conv.append_message(conv.roles[0], user_prompt)
         conv.append_message(conv.roles[1], None)
 
-        if model in OPENAI_MODEL_LIST:
-            judgment = chat_completion_openai(
-                model, conv, temperature=0, max_tokens=2048, api_dict=api_dict
-            )
-        elif model in ANTHROPIC_MODEL_LIST:
-            judgment = chat_completion_anthropic(
-                model, conv, temperature=0, max_tokens=1024
-            )
-        else:
-            raise ValueError(f"Invalid judge model name: {model}")
+        judgment = chat_completion_openai(
+            model, conv, temperature=0, max_tokens=2048, api_dict=api_dict
+        )
 
         if judge.prompt_template["output_format"] == "[[rating]]":
             match = re.search(one_score_pattern, judgment)
@@ -300,20 +288,10 @@ def run_judge_pair(question, answer_a, answer_b, judge, ref_answer, multi_turn=F
     conv.append_message(conv.roles[0], user_prompt)
     conv.append_message(conv.roles[1], None)
 
-    if model in OPENAI_MODEL_LIST:
-        conv.set_system_message(system_prompt)
-        judgment = chat_completion_openai(
-            model, conv, temperature=0, max_tokens=2048, api_dict=api_dict
-        )
-    elif model in ANTHROPIC_MODEL_LIST:
-        if system_prompt != "You are a helpful assistant.":
-            user_prompt = "[Instruction]\n" + system_prompt + "\n\n" + user_prompt
-            conv.messages[0][1] = user_prompt
-        judgment = chat_completion_anthropic(
-            model, conv, temperature=0, max_tokens=1024
-        )
-    else:
-        raise ValueError(f"Invalid judge model name: {model}")
+    conv.set_system_message(system_prompt)
+    judgment = chat_completion_openai(
+        model, conv, temperature=0, max_tokens=2048, api_dict=api_dict
+    )
 
     if judge.prompt_template["output_format"] == "[[A]]":
         if "[[A]]" in judgment:
@@ -477,67 +455,6 @@ def chat_completion_openai(model, conv, temperature, max_tokens, api_dict=None):
     return output
 
 
-# Remove Azure-specific function as we now use custom judge model parameters
-# This function is no longer needed
-
-    if "azure-" in model:
-        model = model[6:]
-
-# Function body removed as we now use custom judge model parameters
-
-
-def chat_completion_anthropic(model, conv, temperature, max_tokens, api_dict=None):
-    if api_dict is not None and "api_key" in api_dict:
-        api_key = api_dict["api_key"]
-    else:
-        api_key = os.environ["ANTHROPIC_API_KEY"]
-
-    output = API_ERROR_OUTPUT
-    for _ in range(API_MAX_RETRY):
-        try:
-            c = anthropic.Anthropic(api_key=api_key)
-            prompt = conv.get_prompt()
-            response = c.completions.create(
-                model=model,
-                prompt=prompt,
-                stop_sequences=[anthropic.HUMAN_PROMPT],
-                max_tokens_to_sample=max_tokens,
-                temperature=temperature,
-            )
-            output = response.completion
-            break
-        except anthropic.APIError as e:
-            print(type(e), e)
-            time.sleep(API_RETRY_SLEEP)
-    return output.strip()
-
-
-def chat_completion_palm(chat_state, model, conv, temperature, max_tokens):
-    from serve.api_provider import init_palm_chat
-
-    assert model == "palm-2-chat-bison-001"
-
-    if chat_state is None:
-        chat_state = init_palm_chat("chat-bison@001")
-
-    parameters = {
-        "temperature": temperature,
-        "top_p": 0.8,
-        "top_k": 40,
-        "max_output_tokens": max_tokens,
-    }
-    output = API_ERROR_OUTPUT
-    for _ in range(API_MAX_RETRY):
-        try:
-            response = chat_state.send_message(conv.messages[-2][1], **parameters)
-            output = response.text
-            break
-        except Exception as e:
-            print(type(e), e)
-            time.sleep(API_RETRY_SLEEP)
-    return chat_state, output
-
-
 def normalize_game_key_single(gamekey, result):
     """Make the model names sorted in a game key."""
     qid, model_1, model_2 = gamekey

From ce03360634eb87665dcb4534264dcf65c81cb359 Mon Sep 17 00:00:00 2001
From: Liren Tu <tuliren@gmail.com>
Date: Thu, 10 Apr 2025 02:27:02 -0400
Subject: [PATCH 09/13] Judge the model by itself

---
 .github/workflows/run-eval.yaml | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/.github/workflows/run-eval.yaml b/.github/workflows/run-eval.yaml
index c823872..0783398 100644
--- a/.github/workflows/run-eval.yaml
+++ b/.github/workflows/run-eval.yaml
@@ -71,13 +71,14 @@ jobs:
         env:
           MODEL_NAME: lfm-3b
           MODEL_URL: ${{ vars.MODEL_URL }}
-          OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
+          MODEL_API_KEY: ${{ secrets.MODEL_API_KEY }}
         run: |
+          # let the model judge itself against the GPT-4 answers
           bin/api/run_openai_judge.sh \
             --model-name "$MODEL_NAME" \
-            --judge-model-name "gpt-4o-mini" \
-            --judge-model-url "https://api.openai.com/v1" \
-            --judge-model-api-key "$OPENAI_API_KEY" \
+            --judge-model-name "$MODEL_NAME" \
+            --judge-model-url "$MODEL_URL" \
+            --judge-model-api-key "$MODEL_API_KEY" \
             --parallel 3
 
       - name: Process Judge Results

From fb491d772fcf46ad955ca61f7e998d94095f5af8 Mon Sep 17 00:00:00 2001
From: Liren Tu <tuliren@gmail.com>
Date: Thu, 10 Apr 2025 03:02:42 -0400
Subject: [PATCH 10/13] Log api base and key

---
 llm_judge/common.py       |  8 ++++++--
 llm_judge/gen_judgment.py | 14 +++++++++-----
 2 files changed, 15 insertions(+), 7 deletions(-)

diff --git a/llm_judge/common.py b/llm_judge/common.py
index 054a59d..1565f36 100755
--- a/llm_judge/common.py
+++ b/llm_judge/common.py
@@ -424,8 +424,12 @@ def play_a_match_pair(match: MatchPair, output_file: str, api_dict=None):
 
 def chat_completion_openai(model, conv, temperature, max_tokens, api_dict=None):
     if api_dict is not None:
-        openai.api_base = api_dict["api_base"]
-        openai.api_key = api_dict["api_key"]
+        if "api_base" in api_dict:
+            print(f"Using API base: {api_dict['api_base']}")
+            openai.api_base = api_dict["api_base"]
+        if "api_key" in api_dict:
+            print(f"Using API key: {api_dict['api_key'][0:4]}***")
+            openai.api_key = api_dict["api_key"]
     output = API_ERROR_OUTPUT
     min_sleep_time = 1
     max_sleep_time = API_RETRY_SLEEP
diff --git a/llm_judge/gen_judgment.py b/llm_judge/gen_judgment.py
index 260cf86..9fb5822 100755
--- a/llm_judge/gen_judgment.py
+++ b/llm_judge/gen_judgment.py
@@ -316,11 +316,15 @@ def make_judge_single(judge_model, judge_prompts):
 
     # Prepare API dict if judge model URL and API key are provided
     api_dict = None
-    if args.judge_model_url and args.judge_model_api_key:
-        api_dict = {
-            "api_base": args.judge_model_url,
-            "api_key": args.judge_model_api_key
-        }
+
+    if args.judge_model_url or args.judge_model_api_key:
+        api_dict = {}
+        if args.judge_model_url:
+            print(f"Using custom judge model URL: {args.judge_model_url}")
+            api_dict["api_base"] = args.judge_model_url
+        if args.judge_model_api_key:
+            print(f"Using custom judge model API key: {args.judge_model_api_key[0:4]}***")
+            api_dict["api_key"] = args.judge_model_api_key
 
     # Play matches
     if args.parallel == 1:

From 8fe17345c2e75845ab1e5c91f45596ccbc7e07b4 Mon Sep 17 00:00:00 2001
From: Liren Tu <tuliren@gmail.com>
Date: Thu, 10 Apr 2025 03:26:15 -0400
Subject: [PATCH 11/13] Use lfm-7b as judge

---
 .github/workflows/run-eval.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/run-eval.yaml b/.github/workflows/run-eval.yaml
index 0783398..abce77a 100644
--- a/.github/workflows/run-eval.yaml
+++ b/.github/workflows/run-eval.yaml
@@ -76,7 +76,7 @@ jobs:
           # let the model judge itself against the GPT-4 answers
           bin/api/run_openai_judge.sh \
             --model-name "$MODEL_NAME" \
-            --judge-model-name "$MODEL_NAME" \
+            --judge-model-name "lfm-7b" \
             --judge-model-url "$MODEL_URL" \
             --judge-model-api-key "$MODEL_API_KEY" \
             --parallel 3

From 338b54b1555577177aa16e6d753564156e3978d1 Mon Sep 17 00:00:00 2001
From: Liren Tu <tuliren@gmail.com>
Date: Thu, 10 Apr 2025 03:26:49 -0400
Subject: [PATCH 12/13] Add more typing

---
 conversation.py           |  2 +-
 llm_judge/common.py       | 10 +++++++---
 llm_judge/gen_judgment.py | 14 +++++++-------
 3 files changed, 15 insertions(+), 11 deletions(-)

diff --git a/conversation.py b/conversation.py
index 45c05dc..f071516 100755
--- a/conversation.py
+++ b/conversation.py
@@ -379,7 +379,7 @@ def register_conv_template(template: Conversation, override: bool = False):
 
 def get_conv_template(name: str) -> Conversation:
     """Get a conversation template."""
-    print("Using template: ", name)
+    print("Using template:", name)
     return conv_templates[name].copy()
 
 
diff --git a/llm_judge/common.py b/llm_judge/common.py
index 1565f36..98cce59 100755
--- a/llm_judge/common.py
+++ b/llm_judge/common.py
@@ -9,7 +9,7 @@
 import os
 import re
 import time
-from typing import Optional
+from typing import Any, Optional
 import random
 
 import openai
@@ -211,7 +211,9 @@ def run_judge_single(question, answer, judge, ref_answer, multi_turn=False, api_
     return rating_list, user_prompt_list, judgment_list
 
 
-def play_a_match_single(match: MatchSingle, output_file: str, api_dict=None):
+def play_a_match_single(
+    match: MatchSingle, output_file: str, api_dict: dict[str, Any] | None = None
+) -> dict[str, Any]:
     question, model, answer, judge, ref_answer, multi_turn = (
         match.question,
         match.model,
@@ -324,7 +326,9 @@ def run_judge_pair(question, answer_a, answer_b, judge, ref_answer, multi_turn=F
     return winner, user_prompt, judgment
 
 
-def play_a_match_pair(match: MatchPair, output_file: str, api_dict=None):
+def play_a_match_pair(
+    match: MatchPair, output_file: str, api_dict: dict[str, Any] | None = None
+) -> dict[str, Any]:
     question, model_1, model_2, answer_1, answer_2, judge, ref_answer, multi_turn = (
         match.question,
         match.model_1,
diff --git a/llm_judge/gen_judgment.py b/llm_judge/gen_judgment.py
index 9fb5822..91c74f9 100755
--- a/llm_judge/gen_judgment.py
+++ b/llm_judge/gen_judgment.py
@@ -6,6 +6,7 @@
 import json
 import os
 from concurrent.futures import ThreadPoolExecutor
+from typing import Any, Callable
 
 import numpy as np
 from tqdm import tqdm
@@ -247,15 +248,15 @@ def make_judge_single(judge_model, judge_prompts):
     current_dir = os.path.dirname(os.path.abspath(__file__))
     if args.mode == "single":
         judges = make_judge_single(args.judge_model, judge_prompts)
-        play_a_match_func = play_a_match_single
+        play_a_match_func: Callable[[MatchSingle | MatchPair, str, dict[str, Any] | None], dict[str, Any]] = play_a_match_single
         model_suffix = "_".join(args.model_list)
-        output_file = os.path.join(current_dir, "data", args.bench_name, "model_judgment", f"{args.judge_model}_{model_suffix}.jsonl")
+        output_file = str(os.path.join(current_dir, "data", args.bench_name, "model_judgment", f"{args.judge_model}_{model_suffix}.jsonl"))
         make_match_func = make_match_single
         baseline_model = None
     else:
         judges = make_judge_pairwise(args.judge_model, judge_prompts)
-        play_a_match_func = play_a_match_pair
-        output_file = os.path.join(current_dir, "data", args.bench_name, "model_judgment", f"{args.judge_model}_pair.jsonl")
+        play_a_match_func: Callable[[MatchSingle | MatchPair, str, dict[str, Any] | None], dict[str, Any]] = play_a_match_pair
+        output_file = str(os.path.join(current_dir, "data", args.bench_name, "model_judgment", f"{args.judge_model}_pair.jsonl"))
         if args.mode == "pairwise-all":
             make_match_func = make_match_all_pairs
             baseline_model = None
@@ -331,9 +332,8 @@ def make_judge_single(judge_model, judge_prompts):
         for match in tqdm(matches):
             play_a_match_func(match, output_file=output_file, api_dict=api_dict)
     else:
-
-        def play_a_match_wrapper(match):
-            play_a_match_func(match, output_file=output_file, api_dict=api_dict)
+        def play_a_match_wrapper(input_match):
+            play_a_match_func(input_match, output_file=output_file, api_dict=api_dict)
 
         np.random.seed(0)
         np.random.shuffle(matches)

From e833cd3f99cebf1920f484aadabbbf8797d3cd77 Mon Sep 17 00:00:00 2001
From: Liren Tu <tuliren@gmail.com>
Date: Thu, 10 Apr 2025 03:56:33 -0400
Subject: [PATCH 13/13] Fix arguments and update logging

---
 README.md                   | 19 ++++++++++++++++---
 bin/api/entrypoint.sh       |  4 ++--
 bin/api/run_openai_judge.sh | 32 ++++++++++++++------------------
 llm_judge/common.py         |  2 --
 llm_judge/gen_judgment.py   | 18 ++++++++++++------
 llm_judge/show_result.py    | 10 +++++-----
 6 files changed, 49 insertions(+), 36 deletions(-)

diff --git a/README.md b/README.md
index 381e320..ed31242 100644
--- a/README.md
+++ b/README.md
@@ -119,11 +119,24 @@ Results will be output in `llm_judge/data/japanese_mt_bench/model_answer/<model-
 2. Run the following scripts to generate GPT-4 judgement scores for the model answers.
 
 ```bash
-bin/api/run_openai_judge.sh --model-name <model-name> --judge-model-name <judge-model-name> --judge-model-url <judge-model-url> --judge-model-api-key <judge-model-api-key>
+bin/api/run_openai_judge.sh \
+  --model-name <model-name> \
+  --judge-model-name <judge-model-name> \
+  --judge-model-url <judge-model-url> \
+  --judge-model-api-key <judge-model-api-key>
 
 # examples:
-bin/api/run_openai_judge.sh --model-name lfm-3b-jp --judge-model-name gpt-4o --judge-model-url https://api.openai.com/v1 --judge-model-api-key <OPENAI-API-KEY>
-bin/api/run_openai_judge.sh --model-name lfm-3b-ichikara --judge-model-name gpt-4o --judge-model-url https://api.openai.com/v1 --judge-model-api-key <OPENAI-API-KEY>
+bin/api/run_openai_judge.sh \
+  --model-name lfm-3b-jp \
+  --judge-model-name gpt-4o \
+  --judge-model-url https://api.openai.com/v1 \
+  --judge-model-api-key <OPENAI-API-KEY>
+
+bin/api/run_openai_judge.sh \
+  --model-name lfm-3b-ichikara \
+  --judge-model-name gpt-4o \
+  --judge-model-url https://api.openai.com/v1 \
+  --judge-model-api-key <OPENAI-API-KEY>
 ```
 
 Judge results will be output to `llm_judge/data/japanese_mt_bench/model_judgment/<judge-model-name>_<model-name>.jsonl`.
diff --git a/bin/api/entrypoint.sh b/bin/api/entrypoint.sh
index ce284a8..844d97c 100755
--- a/bin/api/entrypoint.sh
+++ b/bin/api/entrypoint.sh
@@ -81,7 +81,7 @@ elif [[ "$MODE" == "judge" ]]; then
     # Generate judgments
     python llm_judge/gen_judgment.py \
         --model-list "$MODEL_NAME" \
-        --judge-model "$JUDGE_MODEL_NAME" \
+        --judge-model-name "$JUDGE_MODEL_NAME" \
         --judge-model-url "$JUDGE_MODEL_URL" \
         --judge-model-api-key "$JUDGE_MODEL_API_KEY" \
         --parallel "$PARALLEL" \
@@ -90,7 +90,7 @@ elif [[ "$MODE" == "judge" ]]; then
     # Show results
     python llm_judge/show_result.py \
         --model-list "$MODEL_NAME" \
-        --judge-model "$JUDGE_MODEL_NAME" \
+        --judge-model-name "$JUDGE_MODEL_NAME" \
         --ci "$CI" \
         --bench-name japanese_mt_bench \
         --input-file "llm_judge/data/japanese_mt_bench/model_judgment/${JUDGE_MODEL_NAME}_$MODEL_NAME.jsonl" \
diff --git a/bin/api/run_openai_judge.sh b/bin/api/run_openai_judge.sh
index f9c512e..28e5af1 100755
--- a/bin/api/run_openai_judge.sh
+++ b/bin/api/run_openai_judge.sh
@@ -1,15 +1,15 @@
 #!/bin/bash
 
 print_usage() {
-    echo "Usage: $0 --model-name <model_name> --judge-model-name <judge_model_name> --judge-model-url <url> --judge-model-api-key <api_key> --parallel <parallel>"
+    echo "Usage: $0 --model-name <model_name> [--judge-model-name <judge_model_name>] [--judge-model-url <url>] --judge-model-api-key <api_key> [--parallel <parallel>]"
     echo
     echo "Arguments:"
-    echo "  --model-name          Model name to be evaluated"
+    echo "  --model-name          Model name to be evaluated (required)"
     echo "  --judge-model-name    Name of the judge model (default: gpt-4)"
-    echo "  --judge-model-url     Base URL for the judge model API"
-    echo "  --judge-model-api-key API key for the judge model"
-    echo "  --parallel            Number of parallel processes"
-    echo "  --ci                  CI mode"
+    echo "  --judge-model-url     Base URL for the judge model API (default: https://api.openai.com/v1)"
+    echo "  --judge-model-api-key API key for the judge model (required)"
+    echo "  --parallel            Number of parallel processes (default: 5)"
+    echo "  --ci                  CI mode (default: false)"
 }
 
 MODEL_NAME=""
@@ -55,20 +55,14 @@ while [[ $# -gt 0 ]]; do
 done
 
 # Validate required parameters
-if [[ -z "$JUDGE_MODEL_API_KEY" ]]; then
-    echo "Error: --judge-model-api-key is required"
-    print_usage
-    exit 1
-fi
-
-if [[ -z "$JUDGE_MODEL_URL" ]]; then
-    echo "Error: --judge-model-url is required"
+if [[ -z "$MODEL_NAME" ]]; then
+    echo "Error: --model-name is required"
     print_usage
     exit 1
 fi
 
-if [[ -z "$MODEL_NAME" ]]; then
-    echo "Error: --model-name is required"
+if [[ -z "$JUDGE_MODEL_API_KEY" ]]; then
+    echo "Error: --judge-model-api-key is required"
     print_usage
     exit 1
 fi
@@ -80,12 +74,14 @@ export PYTHONPATH=.
 
 python llm_judge/gen_judgment.py \
   --model-list "$MODEL_NAME" \
-  --judge-model "$JUDGE_MODEL_NAME" \
+  --judge-model-name "$JUDGE_MODEL_NAME" \
+  --judge-model-url "$JUDGE_MODEL_URL" \
+  --judge-model-api-key "$JUDGE_MODEL_API_KEY" \
   --parallel "$PARALLEL" \
   --bench-name japanese_mt_bench
 
 python llm_judge/show_result.py --model-list "$MODEL_NAME" \
-  --judge-model "$JUDGE_MODEL_NAME" \
+  --judge-model-name "$JUDGE_MODEL_NAME" \
   --ci "$CI" \
   --bench-name japanese_mt_bench \
   --input-file "llm_judge/data/japanese_mt_bench/model_judgment/${JUDGE_MODEL_NAME}_$MODEL_NAME.jsonl" \
diff --git a/llm_judge/common.py b/llm_judge/common.py
index 98cce59..c990892 100755
--- a/llm_judge/common.py
+++ b/llm_judge/common.py
@@ -429,10 +429,8 @@ def play_a_match_pair(
 def chat_completion_openai(model, conv, temperature, max_tokens, api_dict=None):
     if api_dict is not None:
         if "api_base" in api_dict:
-            print(f"Using API base: {api_dict['api_base']}")
             openai.api_base = api_dict["api_base"]
         if "api_key" in api_dict:
-            print(f"Using API key: {api_dict['api_key'][0:4]}***")
             openai.api_key = api_dict["api_key"]
     output = API_ERROR_OUTPUT
     min_sleep_time = 1
diff --git a/llm_judge/gen_judgment.py b/llm_judge/gen_judgment.py
index 91c74f9..b900933 100755
--- a/llm_judge/gen_judgment.py
+++ b/llm_judge/gen_judgment.py
@@ -185,7 +185,7 @@ def make_judge_single(judge_model, judge_prompts):
         default="llm_judge/data/judge_prompts.jsonl",
         help="The file of judge prompts.",
     )
-    parser.add_argument("--judge-model", type=str, default="gpt-4", help="The model used for judging")
+    parser.add_argument("--judge-model-name", type=str, default="gpt-4", help="The model used for judging")
     parser.add_argument("--judge-model-url", type=str, default="", help="Base URL for the judge model API")
     parser.add_argument("--judge-model-api-key", type=str, default="", help="API key for the judge model")
     parser.add_argument("--baseline-model", type=str, default="gpt-3.5-turbo")
@@ -216,6 +216,12 @@ def make_judge_single(judge_model, judge_prompts):
     )
     # Remove Azure parameter as we now use custom judge model parameters
     args = parser.parse_args()
+    print(f"Model name: {args.model_list}")
+    print(f"Judge model name: {args.judge_model_name}")
+    if args.judge_model_url:
+        print(f"Judge model URL: {args.judge_model_url}")
+    if args.judge_model_api_key:
+        print(f"Judge model API key: {args.judge_model_api_key[0:4]}***")
 
     args.model_list = [model_path.replace("/", "_") for model_path in args.model_list]
 
@@ -247,16 +253,16 @@ def make_judge_single(judge_model, judge_prompts):
 
     current_dir = os.path.dirname(os.path.abspath(__file__))
     if args.mode == "single":
-        judges = make_judge_single(args.judge_model, judge_prompts)
+        judges = make_judge_single(args.judge_model_name, judge_prompts)
         play_a_match_func: Callable[[MatchSingle | MatchPair, str, dict[str, Any] | None], dict[str, Any]] = play_a_match_single
         model_suffix = "_".join(args.model_list)
-        output_file = str(os.path.join(current_dir, "data", args.bench_name, "model_judgment", f"{args.judge_model}_{model_suffix}.jsonl"))
+        output_file = str(os.path.join(current_dir, "data", args.bench_name, "model_judgment", f"{args.judge_model_name}_{model_suffix}.jsonl"))
         make_match_func = make_match_single
         baseline_model = None
     else:
-        judges = make_judge_pairwise(args.judge_model, judge_prompts)
+        judges = make_judge_pairwise(args.judge_model_name, judge_prompts)
         play_a_match_func: Callable[[MatchSingle | MatchPair, str, dict[str, Any] | None], dict[str, Any]] = play_a_match_pair
-        output_file = str(os.path.join(current_dir, "data", args.bench_name, "model_judgment", f"{args.judge_model}_pair.jsonl"))
+        output_file = str(os.path.join(current_dir, "data", args.bench_name, "model_judgment", f"{args.judge_model_name}_pair.jsonl"))
         if args.mode == "pairwise-all":
             make_match_func = make_match_all_pairs
             baseline_model = None
@@ -303,7 +309,7 @@ def make_judge_single(judge_model, judge_prompts):
     match_stat = {}
     match_stat["bench_name"] = args.bench_name
     match_stat["mode"] = args.mode
-    match_stat["judge"] = args.judge_model
+    match_stat["judge"] = args.judge_model_name
     match_stat["baseline"] = baseline_model
     match_stat["model_list"] = models
     match_stat["total_num_questions"] = len(questions)
diff --git a/llm_judge/show_result.py b/llm_judge/show_result.py
index 4190c9a..2cb620b 100755
--- a/llm_judge/show_result.py
+++ b/llm_judge/show_result.py
@@ -26,9 +26,9 @@ def calculate_averages(scores):
 def display_result_single(args):
     if args.input_file is None:
         if args.azure:
-            input_file = f"data/{args.bench_name}/model_judgment/{args.judge_model}_single_azure.jsonl"
+            input_file = f"data/{args.bench_name}/model_judgment/{args.judge_model_name}_single_azure.jsonl"
         else:
-            input_file = f"data/{args.bench_name}/model_judgment/{args.judge_model}_single.jsonl"
+            input_file = f"data/{args.bench_name}/model_judgment/{args.judge_model_name}_single.jsonl"
     else:
         input_file = args.input_file
 
@@ -115,9 +115,9 @@ def score_category(category):
 def display_result_pairwise(args):
     if args.input_file is None:
         if args.azure:
-            input_file = f"data/{args.bench_name}/model_judgment/{args.judge_model}_pair_azure.jsonl"
+            input_file = f"data/{args.bench_name}/model_judgment/{args.judge_model_name}_pair_azure.jsonl"
         else:
-            input_file = f"data/{args.bench_name}/model_judgment/{args.judge_model}_pair.jsonl"
+            input_file = f"data/{args.bench_name}/model_judgment/{args.judge_model_name}_pair.jsonl"
     else:
         input_file = args.input_file
 
@@ -167,7 +167,7 @@ def display_result_pairwise(args):
     parser = argparse.ArgumentParser()
     parser.add_argument("--bench-name", type=str, default="mt_bench")
     parser.add_argument("--input-file", type=str)
-    parser.add_argument("--judge-model", type=str, default="gpt-4")
+    parser.add_argument("--judge-model-name", type=str, default="gpt-4")
     parser.add_argument("--output-file", type=str)
     parser.add_argument("--baseline-model", type=str, default="gpt-3.5-turbo")
     parser.add_argument(