diff --git a/.github/workflows/run-eval.yaml b/.github/workflows/run-eval.yaml
index 3014b44..abce77a 100644
--- a/.github/workflows/run-eval.yaml
+++ b/.github/workflows/run-eval.yaml
@@ -71,11 +71,14 @@ jobs:
         env:
           MODEL_NAME: lfm-3b
           MODEL_URL: ${{ vars.MODEL_URL }}
-          OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
+          MODEL_API_KEY: ${{ secrets.MODEL_API_KEY }}
         run: |
+          # Judge lfm-3b's answers with lfm-7b (served from the same MODEL_URL), using the GPT-4 reference answers
           bin/api/run_openai_judge.sh \
             --model-name "$MODEL_NAME" \
-            --openai-api-key "$OPENAI_API_KEY" \
+            --judge-model-name "lfm-7b" \
+            --judge-model-url "$MODEL_URL" \
+            --judge-model-api-key "$MODEL_API_KEY" \
             --parallel 3
 
       - name: Process Judge Results
diff --git a/README.md b/README.md
index 07fc2a5..ed31242 100644
--- a/README.md
+++ b/README.md
@@ -21,17 +21,21 @@ bin/api/run_docker_eval.sh generate \
 
 Results will be output in `llm_judge/data/japanese_mt_bench/model_answer/<model-name>.jsonl`
 
-2. Run OpenAI judge:
+2. Run judge:
+
+The judge script uses the judge model to evaluate the model's answers, with the [GPT-4 results](llm_judge/data/mt_bench/reference_answer/gpt-4.jsonl) as the reference answers. If `--judge-model-name` is omitted, the judge model defaults to `gpt-4`.
 
 ```bash
 bin/api/run_docker_eval.sh judge \
   --model-name <model-name> \
-  --openai-api-key <openai-api-key>
+  --judge-model-name <judge-model-name> \
+  --judge-model-url <judge-model-url> \
+  --judge-model-api-key <judge-model-api-key>
 ```
 
-GPT judge results will be output to `llm_judge/data/japanese_mt_bench/model_judgment/gpt-4_<model-name>.jsonl`.
+Judge results will be output to `llm_judge/data/japanese_mt_bench/model_judgment/<judge-model-name>_<model-name>.jsonl`.
 
-The final scores will be output in `llm_judge/data/japanese_mt_bench/gpt4-score-<model-name>.json`.
+The final scores will be output in `llm_judge/data/japanese_mt_bench/<judge-model-name>-score-<model-name>.json`.
 
 ### Examples
 
@@ -45,7 +49,9 @@ bin/api/run_docker_eval.sh generate \
 
 bin/api/run_docker_eval.sh judge \
   --model-name lfm-3b-jp \
-  --openai-api-key <OPENAI-API-KEY>
+  --judge-model-name gpt-4o \
+  --judge-model-url https://api.openai.com/v1 \
+  --judge-model-api-key <OPENAI-API-KEY>
 ```
 
 Run eval for `lfm-3b-ichikara` on-prem:
@@ -71,7 +77,9 @@ bin/api/run_docker_eval.sh generate \
 
 bin/api/run_docker_eval.sh judge \
   --model-name lfm-3b-jp \
-  --openai-api-key <OPENAI-API-KEY>
+  --judge-model-name gpt-4o \
+  --judge-model-url https://api.openai.com/v1 \
+  --judge-model-api-key <OPENAI-API-KEY>
 ```
 
 ## Run Evaluation without Docker
@@ -111,16 +119,29 @@ Results will be output in `llm_judge/data/japanese_mt_bench/model_answer/<model-
 2. Run the following scripts to generate GPT-4 judgement scores for the model answers.
 
 ```bash
-bin/api/run_openai_judge.sh --model-name <model-name> --openai-api-key <OPENAI-API-KEY>
+bin/api/run_openai_judge.sh \
+  --model-name <model-name> \
+  --judge-model-name <judge-model-name> \
+  --judge-model-url <judge-model-url> \
+  --judge-model-api-key <judge-model-api-key>
 
 # examples:
-bin/api/run_openai_judge.sh --model-name lfm-3b-jp --openai-api-key <OPENAI-API-KEY>
-bin/api/run_openai_judge.sh --model-name lfm-3b-ichikara --openai-api-key <OPENAI-API-KEY>
+bin/api/run_openai_judge.sh \
+  --model-name lfm-3b-jp \
+  --judge-model-name gpt-4o \
+  --judge-model-url https://api.openai.com/v1 \
+  --judge-model-api-key <OPENAI-API-KEY>
+
+bin/api/run_openai_judge.sh \
+  --model-name lfm-3b-ichikara \
+  --judge-model-name gpt-4o \
+  --judge-model-url https://api.openai.com/v1 \
+  --judge-model-api-key <OPENAI-API-KEY>
 ```
 
-GPT judge results will be output to `llm_judge/data/japanese_mt_bench/model_judgment/gpt-4_<model-name>.jsonl`.
+Judge results will be output to `llm_judge/data/japanese_mt_bench/model_judgment/<judge-model-name>_<model-name>.jsonl`.
 
-The final scores will be output in `llm_judge/data/japanese_mt_bench/gpt4-score-<model-name>.json`.
+The final scores will be output in `llm_judge/data/japanese_mt_bench/<judge-model-name>-score-<model-name>.json`.
 
 </details>
 
@@ -148,8 +169,10 @@ This applies to both `bin/api/run_docker_eval.sh judge` and `bin/api/run_openai_
 
 | Argument | Description | Required |
 | --- | --- | --- |
-| `--model-name` | Model name | Yes |
-| `--openai-api-key` | OpenAI API key | Yes |
+| `--model-name` | Model name to be evaluated | Yes |
+| `--judge-model-name` | Name of the judge model (default: gpt-4) | No |
+| `--judge-model-url` | Base URL for the judge model API | Yes for `run_docker_eval.sh`; optional for `run_openai_judge.sh` |
+| `--judge-model-api-key` | API key for the judge model | Yes |
 | `--parallel` | Number of parallel API calls | No. Default to 5. |
 
 </details>
diff --git a/bin/api/entrypoint.sh b/bin/api/entrypoint.sh
index 05e5026..844d97c 100755
--- a/bin/api/entrypoint.sh
+++ b/bin/api/entrypoint.sh
@@ -47,6 +47,20 @@ elif [[ "$MODE" == "judge" ]]; then
     # Extract arguments for judge mode
     PARALLEL="5"
     CI="false"
+    JUDGE_MODEL_NAME=${JUDGE_MODEL_NAME:-"gpt-4"}
+    JUDGE_MODEL_URL=${JUDGE_MODEL_URL:-""}
+    JUDGE_MODEL_API_KEY=${JUDGE_MODEL_API_KEY:-""}
+    
+    # Ensure required parameters are set
+    if [[ -z "$JUDGE_MODEL_API_KEY" ]]; then
+        echo "Error: JUDGE_MODEL_API_KEY environment variable is required"
+        exit 1
+    fi
+    
+    if [[ -z "$JUDGE_MODEL_URL" ]]; then
+        echo "Error: JUDGE_MODEL_URL environment variable is required"
+        exit 1
+    fi
 
     while [[ $# -gt 0 ]]; do
         case $1 in
@@ -67,14 +81,18 @@ elif [[ "$MODE" == "judge" ]]; then
     # Generate judgments
     python llm_judge/gen_judgment.py \
         --model-list "$MODEL_NAME" \
+        --judge-model-name "$JUDGE_MODEL_NAME" \
+        --judge-model-url "$JUDGE_MODEL_URL" \
+        --judge-model-api-key "$JUDGE_MODEL_API_KEY" \
         --parallel "$PARALLEL" \
         --bench-name japanese_mt_bench
 
     # Show results
     python llm_judge/show_result.py \
         --model-list "$MODEL_NAME" \
+        --judge-model-name "$JUDGE_MODEL_NAME" \
         --ci "$CI" \
         --bench-name japanese_mt_bench \
-        --input-file llm_judge/data/japanese_mt_bench/model_judgment/gpt-4_$MODEL_NAME.jsonl \
-        --output llm_judge/data/japanese_mt_bench/gpt4-score-$MODEL_NAME.json
+        --input-file "llm_judge/data/japanese_mt_bench/model_judgment/${JUDGE_MODEL_NAME}_$MODEL_NAME.jsonl" \
+        --output "llm_judge/data/japanese_mt_bench/${JUDGE_MODEL_NAME}-score-$MODEL_NAME.json"
 fi
diff --git a/bin/api/run_docker_eval.sh b/bin/api/run_docker_eval.sh
index 294195f..5a05101 100755
--- a/bin/api/run_docker_eval.sh
+++ b/bin/api/run_docker_eval.sh
@@ -22,10 +22,12 @@ print_usage() {
     echo "  --question-count  Number of questions to evaluate (optional)"
     echo
     echo "Judge mode options:"
-    echo "  --model-name      Name of the model to evaluate"
-    echo "  --openai-api-key  OpenAI API key for GPT-4 judgment"
-    echo "  --parallel        Number of parallel processes (default: 5)"
-    echo "  --ci              CI mode (default: false)"
+    echo "  --model-name          Name of the model to evaluate"
+    echo "  --judge-model-name    Name of the judge model (default: gpt-4)"
+    echo "  --judge-model-url     Base URL for the judge model API"
+    echo "  --judge-model-api-key API key for the judge model"
+    echo "  --parallel            Number of parallel processes (default: 5)"
+    echo "  --ci                  CI mode (default: false)"
 }
 
 if [ $# -lt 1 ]; then
@@ -106,7 +108,9 @@ if [[ "$MODE" == "generate" ]]; then
 elif [[ "$MODE" == "judge" ]]; then
     # Process judge mode arguments
     MODEL_NAME=""
-    OPENAI_API_KEY=""
+    JUDGE_MODEL_NAME="gpt-4"
+    JUDGE_MODEL_URL=""
+    JUDGE_MODEL_API_KEY=""
     PARALLEL="5"
     CI="false"
 
@@ -116,8 +120,17 @@ elif [[ "$MODE" == "judge" ]]; then
                 MODEL_NAME="$2"
                 shift 2
                 ;;
-            --openai-api-key)
-                OPENAI_API_KEY="$2"
+
+            --judge-model-name)
+                JUDGE_MODEL_NAME="$2"
+                shift 2
+                ;;
+            --judge-model-url)
+                JUDGE_MODEL_URL="$2"
+                shift 2
+                ;;
+            --judge-model-api-key)
+                JUDGE_MODEL_API_KEY="$2"
                 shift 2
                 ;;
             --parallel)
@@ -142,8 +155,15 @@ elif [[ "$MODE" == "judge" ]]; then
         exit 1
     fi
 
-    if [[ -z "$OPENAI_API_KEY" ]]; then
-        echo "Error: --openai-api-key is required"
+    # Validate required parameters
+    if [[ -z "$JUDGE_MODEL_API_KEY" ]]; then
+        echo "Error: --judge-model-api-key is required"
+        print_usage
+        exit 1
+    fi
+    
+    if [[ -z "$JUDGE_MODEL_URL" ]]; then
+        echo "Error: --judge-model-url is required"
         print_usage
         exit 1
     fi
@@ -152,7 +172,9 @@ elif [[ "$MODE" == "judge" ]]; then
     docker run --rm -it \
         --network="host" \
         -e MODEL_NAME="$MODEL_NAME" \
-        -e OPENAI_API_KEY="$OPENAI_API_KEY" \
+        -e JUDGE_MODEL_NAME="$JUDGE_MODEL_NAME" \
+        -e JUDGE_MODEL_URL="$JUDGE_MODEL_URL" \
+        -e JUDGE_MODEL_API_KEY="$JUDGE_MODEL_API_KEY" \
         -v "$(pwd)/llm_judge:/app/llm_judge" \
         liquidai/mt-bench:latest judge \
         --parallel "$PARALLEL" \
diff --git a/bin/api/run_openai_judge.sh b/bin/api/run_openai_judge.sh
index de8c7d6..28e5af1 100755
--- a/bin/api/run_openai_judge.sh
+++ b/bin/api/run_openai_judge.sh
@@ -1,29 +1,43 @@
 #!/bin/bash
 
 print_usage() {
-    echo "Usage: $0 --openai-api-key <api_key> --model-name <model_name> --parallel <parallel>"
+    echo "Usage: $0 --model-name <model_name> [--judge-model-name <judge_model_name>] [--judge-model-url <url>] --judge-model-api-key <api_key> [--parallel <parallel>]"
     echo
     echo "Arguments:"
-    echo "  --openai-api-key OpenAI API key"
-    echo "  --model-name     Model name"
-    echo "  --parallel       Number of parallel processes"
+    echo "  --model-name          Model name to be evaluated (required)"
+    echo "  --judge-model-name    Name of the judge model (default: gpt-4)"
+    echo "  --judge-model-url     Base URL for the judge model API (default: https://api.openai.com/v1)"
+    echo "  --judge-model-api-key API key for the judge model (required)"
+    echo "  --parallel            Number of parallel processes (default: 5)"
+    echo "  --ci                  CI mode (default: false)"
 }
 
-OPENAI_API_KEY=""
 MODEL_NAME=""
+JUDGE_MODEL_NAME="gpt-4"
+JUDGE_MODEL_URL=""
+JUDGE_MODEL_API_KEY=""
 PARALLEL="5"
 CI="false"
 
 while [[ $# -gt 0 ]]; do
     case $1 in
-        --openai-api-key)
-            OPENAI_API_KEY="$2"
-            shift 2
-            ;;
+
         --model-name)
             MODEL_NAME="$2"
             shift 2
             ;;
+        --judge-model-name)
+            JUDGE_MODEL_NAME="$2"
+            shift 2
+            ;;
+        --judge-model-url)
+            JUDGE_MODEL_URL="$2"
+            shift 2
+            ;;
+        --judge-model-api-key)
+            JUDGE_MODEL_API_KEY="$2"
+            shift 2
+            ;;
         --parallel)
             PARALLEL="$2"
             shift 2
@@ -40,28 +54,35 @@ while [[ $# -gt 0 ]]; do
     esac
 done
 
-if [[ -z "$OPENAI_API_KEY" ]]; then
-    echo "Error: --openai-api-key is required"
+# Validate required parameters
+if [[ -z "$MODEL_NAME" ]]; then
+    echo "Error: --model-name is required"
     print_usage
     exit 1
 fi
 
-if [[ -z "$MODEL_NAME" ]]; then
-    echo "Error: --model-name is required"
+if [[ -z "$JUDGE_MODEL_API_KEY" ]]; then
+    echo "Error: --judge-model-api-key is required"
     print_usage
     exit 1
 fi
 
-export OPENAI_API_KEY="$OPENAI_API_KEY"
+export JUDGE_MODEL_NAME="$JUDGE_MODEL_NAME"
+export JUDGE_MODEL_URL="$JUDGE_MODEL_URL"
+export JUDGE_MODEL_API_KEY="$JUDGE_MODEL_API_KEY"
 export PYTHONPATH=.
 
 python llm_judge/gen_judgment.py \
   --model-list "$MODEL_NAME" \
+  --judge-model-name "$JUDGE_MODEL_NAME" \
+  --judge-model-url "$JUDGE_MODEL_URL" \
+  --judge-model-api-key "$JUDGE_MODEL_API_KEY" \
   --parallel "$PARALLEL" \
   --bench-name japanese_mt_bench
 
 python llm_judge/show_result.py --model-list "$MODEL_NAME" \
+  --judge-model-name "$JUDGE_MODEL_NAME" \
   --ci "$CI" \
   --bench-name japanese_mt_bench \
-  --input-file llm_judge/data/japanese_mt_bench/model_judgment/gpt-4_$MODEL_NAME.jsonl \
-  --output llm_judge/data/japanese_mt_bench/gpt4-score-$MODEL_NAME.json
+  --input-file "llm_judge/data/japanese_mt_bench/model_judgment/${JUDGE_MODEL_NAME}_$MODEL_NAME.jsonl" \
+  --output "llm_judge/data/japanese_mt_bench/${JUDGE_MODEL_NAME}-score-$MODEL_NAME.json"
diff --git a/conversation.py b/conversation.py
index 45c05dc..f071516 100755
--- a/conversation.py
+++ b/conversation.py
@@ -379,7 +379,7 @@ def register_conv_template(template: Conversation, override: bool = False):
 
 def get_conv_template(name: str) -> Conversation:
     """Get a conversation template."""
-    print("Using template: ", name)
+    print("Using template:", name)
     return conv_templates[name].copy()
 
 
diff --git a/llm_judge/common.py b/llm_judge/common.py
index bf4d3f6..c990892 100755
--- a/llm_judge/common.py
+++ b/llm_judge/common.py
@@ -9,20 +9,15 @@
 import os
 import re
 import time
-from typing import Optional
+from typing import Any, Optional
 import random
 
 import openai
-import anthropic
 from dotenv import load_dotenv
 
 load_dotenv()
 
-from model.model_adapter import (
-    get_conversation_template,
-    ANTHROPIC_MODEL_LIST,
-    OPENAI_MODEL_LIST,
-)
+from model.model_adapter import get_conversation_template
 
 # API setting constants
 API_MAX_RETRY = 16
@@ -157,7 +152,7 @@ def load_judge_prompts(prompt_file: str):
     return prompts
 
 
-def run_judge_single(question, answer, judge, ref_answer, multi_turn=False, azure=True):
+def run_judge_single(question, answer, judge, ref_answer, multi_turn=False, api_dict=None):
     kwargs = {}
     model = judge.model_name
     if ref_answer is not None:
@@ -192,19 +187,9 @@ def run_judge_single(question, answer, judge, ref_answer, multi_turn=False, azur
         conv.append_message(conv.roles[0], user_prompt)
         conv.append_message(conv.roles[1], None)
 
-        if model in OPENAI_MODEL_LIST:
-            if azure:
-                judgment = chat_completion_openai_azure(
-                    model, conv, temperature=0, max_tokens=2048
-                )
-            else:
-                judgment = chat_completion_openai(model, conv, temperature=0, max_tokens=2048)
-        elif model in ANTHROPIC_MODEL_LIST:
-            judgment = chat_completion_anthropic(
-                model, conv, temperature=0, max_tokens=1024
-            )
-        else:
-            raise ValueError(f"Invalid judge model name: {model}")
+        judgment = chat_completion_openai(
+            model, conv, temperature=0, max_tokens=2048, api_dict=api_dict
+        )
 
         if judge.prompt_template["output_format"] == "[[rating]]":
             match = re.search(one_score_pattern, judgment)
@@ -226,7 +211,9 @@ def run_judge_single(question, answer, judge, ref_answer, multi_turn=False, azur
     return rating_list, user_prompt_list, judgment_list
 
 
-def play_a_match_single(match: MatchPair, output_file: str, azure=True):
+def play_a_match_single(
+    match: MatchSingle, output_file: str, api_dict: dict[str, Any] | None = None
+) -> dict[str, Any]:
     question, model, answer, judge, ref_answer, multi_turn = (
         match.question,
         match.model,
@@ -238,7 +225,7 @@ def play_a_match_single(match: MatchPair, output_file: str, azure=True):
 
     if judge.prompt_template["type"] == "single":
         score_list, user_prompt_list, judgment_list = run_judge_single(
-            question, answer, judge, ref_answer, multi_turn=multi_turn, azure=azure
+            question, answer, judge, ref_answer, multi_turn=multi_turn, api_dict=api_dict
         )
 
         question_id = question["question_id"]
@@ -259,7 +246,7 @@ def play_a_match_single(match: MatchPair, output_file: str, azure=True):
             f"judge: {(judge.model_name, judge.prompt_template['name'])}"
         )
     else:
-        raise ValueError(f"invalid judge type: {judge['type']}")
+        raise ValueError(f"invalid judge type: {judge.prompt_template['type']}")
 
     if output_file:
         os.makedirs(os.path.dirname(output_file), exist_ok=True)
@@ -269,7 +256,7 @@ def play_a_match_single(match: MatchPair, output_file: str, azure=True):
     return result
 
 
-def run_judge_pair(question, answer_a, answer_b, judge, ref_answer, multi_turn=False, azure=True):
+def run_judge_pair(question, answer_a, answer_b, judge, ref_answer, multi_turn=False, api_dict=None):
     kwargs = {}
     model = judge.model_name
     if ref_answer is not None:
@@ -303,23 +290,10 @@ def run_judge_pair(question, answer_a, answer_b, judge, ref_answer, multi_turn=F
     conv.append_message(conv.roles[0], user_prompt)
     conv.append_message(conv.roles[1], None)
 
-    if model in OPENAI_MODEL_LIST:
-        conv.set_system_message(system_prompt)
-        if azure:
-            judgment = chat_completion_openai_azure(
-                model, conv, temperature=0, max_tokens=2048
-            )
-        else:
-            judgment = chat_completion_openai(model, conv, temperature=0, max_tokens=2048)
-    elif model in ANTHROPIC_MODEL_LIST:
-        if system_prompt != "You are a helpful assistant.":
-            user_prompt = "[Instruction]\n" + system_prompt + "\n\n" + user_prompt
-            conv.messages[0][1] = user_prompt
-        judgment = chat_completion_anthropic(
-            model, conv, temperature=0, max_tokens=1024
-        )
-    else:
-        raise ValueError(f"Invalid judge model name: {model}")
+    conv.set_system_message(system_prompt)
+    judgment = chat_completion_openai(
+        model, conv, temperature=0, max_tokens=2048, api_dict=api_dict
+    )
 
     if judge.prompt_template["output_format"] == "[[A]]":
         if "[[A]]" in judgment:
@@ -352,7 +326,9 @@ def run_judge_pair(question, answer_a, answer_b, judge, ref_answer, multi_turn=F
     return winner, user_prompt, judgment
 
 
-def play_a_match_pair(match: MatchPair, output_file: str):
+def play_a_match_pair(
+    match: MatchPair, output_file: str, api_dict: dict[str, Any] | None = None
+) -> dict[str, Any]:
     question, model_1, model_2, answer_1, answer_2, judge, ref_answer, multi_turn = (
         match.question,
         match.model_1,
@@ -366,10 +342,10 @@ def play_a_match_pair(match: MatchPair, output_file: str):
 
     if judge.prompt_template["type"] == "pairwise":
         g1_winner, g1_user_prompt, g1_judgment = run_judge_pair(
-            question, answer_1, answer_2, judge, ref_answer, multi_turn=multi_turn, azure=True
+            question, answer_1, answer_2, judge, ref_answer, multi_turn=multi_turn, api_dict=api_dict
         )
         g2_winner, g2_user_prompt, g2_judgment = run_judge_pair(
-            question, answer_2, answer_1, judge, ref_answer, multi_turn=multi_turn, azure=True
+            question, answer_2, answer_1, judge, ref_answer, multi_turn=multi_turn, api_dict=api_dict
         )
 
         g1_map = {"A": "model_1", "B": "model_2"}
@@ -401,15 +377,19 @@ def play_a_match_pair(match: MatchPair, output_file: str):
         )
     elif judge.prompt_template["type"] == "single":
         m1_score, m1_user_prompt, m1_judgment = run_judge_single(
-            question, answer_1, judge, azure=True
+            question, answer_1, judge, ref_answer, api_dict=api_dict
         )
         m2_score, m2_user_prompt, m2_judgment = run_judge_single(
-            question, answer_2, judge, azure=True
+            question, answer_2, judge, ref_answer, api_dict=api_dict
         )
 
-        if abs(m1_score - m2_score) <= TIE_DELTA:
+        # run_judge_single returns a list of ratings; compare the two models on the first rating
+        m1_first_score = m1_score[0] if isinstance(m1_score, list) else m1_score
+        m2_first_score = m2_score[0] if isinstance(m2_score, list) else m2_score
+        
+        if abs(m1_first_score - m2_first_score) <= TIE_DELTA:
             winner = "tie"
-        elif m1_score > m2_score:
+        elif m1_first_score > m2_first_score:
             winner = "model_1"
         else:
             winner = "model_2"
@@ -436,7 +416,7 @@ def play_a_match_pair(match: MatchPair, output_file: str):
             f"judge: {(judge.model_name, judge.prompt_template['name'])}"
         )
     else:
-        raise ValueError(f"invalid judge type: {judge['type']}")
+        raise ValueError(f"invalid judge type: {judge.prompt_template['type']}")
 
     if output_file:
         os.makedirs(os.path.dirname(output_file), exist_ok=True)
@@ -448,8 +428,10 @@ def play_a_match_pair(match: MatchPair, output_file: str):
 
 def chat_completion_openai(model, conv, temperature, max_tokens, api_dict=None):
     if api_dict is not None:
-        openai.api_base = api_dict["api_base"]
-        openai.api_key = api_dict["api_key"]
+        if "api_base" in api_dict:
+            openai.api_base = api_dict["api_base"]
+        if "api_key" in api_dict:
+            openai.api_key = api_dict["api_key"]
     output = API_ERROR_OUTPUT
     min_sleep_time = 1
     max_sleep_time = API_RETRY_SLEEP
@@ -479,106 +461,6 @@ def chat_completion_openai(model, conv, temperature, max_tokens, api_dict=None):
     return output
 
 
-def chat_completion_openai_azure(model, conv, temperature, max_tokens, api_dict=None):
-    openai.api_type = "azure"
-    openai.api_version = "2023-07-01-preview"
-    if api_dict is not None:
-        openai.api_base = api_dict["api_base"]
-        openai.api_key = api_dict["api_key"]
-    else:
-        openai.api_base = os.environ["AZURE_OPENAI_ENDPOINT"]
-        openai.api_key = os.environ["AZURE_OPENAI_KEY"]
-
-    if "azure-" in model:
-        model = model[6:]
-
-    output = API_ERROR_OUTPUT
-    min_sleep_time = 1
-    max_sleep_time = API_RETRY_SLEEP
-    for _ in range(API_MAX_RETRY):
-        try:
-            messages = conv.to_openai_api_messages()
-            response = openai.ChatCompletion.create(
-                engine=model,
-                messages=messages,
-                n=1,
-                temperature=temperature,
-                max_tokens=max_tokens,
-            )
-            output = response["choices"][0]["message"]["content"]
-            break
-        except openai.error.RateLimitError as e:
-            print(type(e), e)
-            sleep_time = random.randint(min_sleep_time, max_sleep_time)
-            print(f"Sleeping for {sleep_time} seconds")
-            time.sleep(sleep_time)
-            max_sleep_time = min(MAX_API_RETRY_SLEEP, max_sleep_time * 2)
-            min_sleep_time = max_sleep_time // 2
-        except openai.error.OpenAIError as e:
-            print(type(e), e)
-            time.sleep(API_RETRY_SLEEP)
-        except openai.error.InvalidRequestError as e:
-            print(type(e), e)
-            break
-        except KeyError:
-            print(response)
-            break
-
-    return output
-
-
-def chat_completion_anthropic(model, conv, temperature, max_tokens, api_dict=None):
-    if api_dict is not None and "api_key" in api_dict:
-        api_key = api_dict["api_key"]
-    else:
-        api_key = os.environ["ANTHROPIC_API_KEY"]
-
-    output = API_ERROR_OUTPUT
-    for _ in range(API_MAX_RETRY):
-        try:
-            c = anthropic.Anthropic(api_key=api_key)
-            prompt = conv.get_prompt()
-            response = c.completions.create(
-                model=model,
-                prompt=prompt,
-                stop_sequences=[anthropic.HUMAN_PROMPT],
-                max_tokens_to_sample=max_tokens,
-                temperature=temperature,
-            )
-            output = response.completion
-            break
-        except anthropic.APIError as e:
-            print(type(e), e)
-            time.sleep(API_RETRY_SLEEP)
-    return output.strip()
-
-
-def chat_completion_palm(chat_state, model, conv, temperature, max_tokens):
-    from serve.api_provider import init_palm_chat
-
-    assert model == "palm-2-chat-bison-001"
-
-    if chat_state is None:
-        chat_state = init_palm_chat("chat-bison@001")
-
-    parameters = {
-        "temperature": temperature,
-        "top_p": 0.8,
-        "top_k": 40,
-        "max_output_tokens": max_tokens,
-    }
-    output = API_ERROR_OUTPUT
-    for _ in range(API_MAX_RETRY):
-        try:
-            response = chat_state.send_message(conv.messages[-2][1], **parameters)
-            output = response.text
-            break
-        except Exception as e:
-            print(type(e), e)
-            time.sleep(API_RETRY_SLEEP)
-    return chat_state, output
-
-
 def normalize_game_key_single(gamekey, result):
     """Make the model names sorted in a game key."""
     qid, model_1, model_2 = gamekey
@@ -760,8 +642,8 @@ def check_data(questions, model_answers, ref_answers, models, judges):
             if q["category"] not in NEED_REF_CATS:
                 continue
             assert (
-                q["question_id"] in ref_answers[jg.model_name]
-            ), f"Missing reference answer to Question {q['question_id']} for judge {jg.model_name}"
+                q["question_id"] in ref_answers['gpt-4']
+            ), f"Missing reference answer to Question {q['question_id']} from 'gpt-4'"
 
 
 def get_model_list(answer_dir):
diff --git a/llm_judge/gen_judgment.py b/llm_judge/gen_judgment.py
index a4cb2c3..b900933 100755
--- a/llm_judge/gen_judgment.py
+++ b/llm_judge/gen_judgment.py
@@ -6,6 +6,7 @@
 import json
 import os
 from concurrent.futures import ThreadPoolExecutor
+from typing import Any, Callable
 
 import numpy as np
 from tqdm import tqdm
@@ -47,7 +48,7 @@ def make_match(
             a_1 = model_answers[m_1][q_id]
             a_2 = model_answers[baseline_model][q_id]
             if ref_answers is not None:
-                ref = ref_answers[judge.model_name][q_id]
+                ref = ref_answers['gpt-4'][q_id]
                 match = MatchPair(
                     dict(q),
                     m_1,
@@ -87,7 +88,7 @@ def make_match_all_pairs(
                 a_1 = model_answers[m_1][q_id]
                 a_2 = model_answers[m_2][q_id]
                 if ref_answers is not None:
-                    ref = ref_answers[judge.model_name][q_id]
+                    ref = ref_answers['gpt-4'][q_id]
                     match = MatchPair(
                         dict(q),
                         m_1,
@@ -127,7 +128,7 @@ def make_match_single(
                 print(f"Model {m} does not have answer for question {q_id}")
                 continue
             if ref_answers is not None:
-                ref = ref_answers[judge.model_name][q_id]
+                ref = ref_answers['gpt-4'][q_id]
                 matches.append(
                     MatchSingle(
                         dict(q), m, a, judge, ref_answer=ref, multi_turn=multi_turn
@@ -184,7 +185,9 @@ def make_judge_single(judge_model, judge_prompts):
         default="llm_judge/data/judge_prompts.jsonl",
         help="The file of judge prompts.",
     )
-    parser.add_argument("--judge-model", type=str, default="gpt-4")
+    parser.add_argument("--judge-model-name", type=str, default="gpt-4", help="The model used for judging")
+    parser.add_argument("--judge-model-url", type=str, default="", help="Base URL for the judge model API")
+    parser.add_argument("--judge-model-api-key", type=str, default="", help="API key for the judge model")
     parser.add_argument("--baseline-model", type=str, default="gpt-3.5-turbo")
     parser.add_argument(
         "--mode",
@@ -211,10 +214,14 @@ def make_judge_single(judge_model, judge_prompts):
     parser.add_argument(
         "--first-n", type=int, help="A debug option. Only run the first `n` judgments."
     )
-    parser.add_argument(
-        "--azure", action="store_true", help="Use Azure API instead of openai.", default=False
-    )
+    # NOTE: the former --azure flag is gone; the judge endpoint is configured via --judge-model-url / --judge-model-api-key
     args = parser.parse_args()
+    print(f"Model name: {args.model_list}")
+    print(f"Judge model name: {args.judge_model_name}")
+    if args.judge_model_url:
+        print(f"Judge model URL: {args.judge_model_url}")
+    if args.judge_model_api_key:
+        print(f"Judge model API key: {args.judge_model_api_key[0:4]}***")
 
     args.model_list = [model_path.replace("/", "_") for model_path in args.model_list]
 
@@ -246,22 +253,16 @@ def make_judge_single(judge_model, judge_prompts):
 
     current_dir = os.path.dirname(os.path.abspath(__file__))
     if args.mode == "single":
-        judges = make_judge_single(args.judge_model, judge_prompts)
-        play_a_match_func = play_a_match_single
-        if args.azure:
-            output_file = os.path.join(current_dir, "data", args.bench_name, "model_judgment", f"{args.judge_model}_single_azure.jsonl")
-        else:
-            model_suffix = "_".join(args.model_list)
-            output_file = os.path.join(current_dir, "data", args.bench_name, "model_judgment", f"{args.judge_model}_{model_suffix}.jsonl")
+        judges = make_judge_single(args.judge_model_name, judge_prompts)
+        play_a_match_func: Callable[[MatchSingle | MatchPair, str, dict[str, Any] | None], dict[str, Any]] = play_a_match_single
+        model_suffix = "_".join(args.model_list)
+        output_file = str(os.path.join(current_dir, "data", args.bench_name, "model_judgment", f"{args.judge_model_name}_{model_suffix}.jsonl"))
         make_match_func = make_match_single
         baseline_model = None
     else:
-        judges = make_judge_pairwise(args.judge_model, judge_prompts)
-        play_a_match_func = play_a_match_pair
-        if args.azure:
-            output_file = os.path.join(current_dir, "data", args.bench_name, "model_judgment", f"{args.judge_model}_pair_azure.jsonl")
-        else:
-            output_file = os.path.join(current_dir, "data", args.bench_name, "model_judgment", f"{args.judge_model}_pair.jsonl")
+        judges = make_judge_pairwise(args.judge_model_name, judge_prompts)
+        play_a_match_func: Callable[[MatchSingle | MatchPair, str, dict[str, Any] | None], dict[str, Any]] = play_a_match_pair
+        output_file = str(os.path.join(current_dir, "data", args.bench_name, "model_judgment", f"{args.judge_model_name}_pair.jsonl"))
         if args.mode == "pairwise-all":
             make_match_func = make_match_all_pairs
             baseline_model = None
@@ -308,7 +309,7 @@ def make_judge_single(judge_model, judge_prompts):
     match_stat = {}
     match_stat["bench_name"] = args.bench_name
     match_stat["mode"] = args.mode
-    match_stat["judge"] = args.judge_model
+    match_stat["judge"] = args.judge_model_name
     match_stat["baseline"] = baseline_model
     match_stat["model_list"] = models
     match_stat["total_num_questions"] = len(questions)
@@ -320,14 +321,25 @@ def make_judge_single(judge_model, judge_prompts):
     print(json.dumps(match_stat, indent=4, ensure_ascii=False))
     # input("Press Enter to confirm...")
 
+    # Prepare the API dict if a judge model URL or API key is provided (either one is enough)
+    api_dict = None
+
+    if args.judge_model_url or args.judge_model_api_key:
+        api_dict = {}
+        if args.judge_model_url:
+            print(f"Using custom judge model URL: {args.judge_model_url}")
+            api_dict["api_base"] = args.judge_model_url
+        if args.judge_model_api_key:
+            print(f"Using custom judge model API key: {args.judge_model_api_key[0:4]}***")
+            api_dict["api_key"] = args.judge_model_api_key
+
     # Play matches
     if args.parallel == 1:
         for match in tqdm(matches):
-            play_a_match_func(match, output_file=output_file, azure=args.azure)
+            play_a_match_func(match, output_file=output_file, api_dict=api_dict)
     else:
-
-        def play_a_match_wrapper(match):
-            play_a_match_func(match, output_file=output_file, azure=args.azure)
+        def play_a_match_wrapper(input_match):
+            play_a_match_func(input_match, output_file=output_file, api_dict=api_dict)
 
         np.random.seed(0)
         np.random.shuffle(matches)
diff --git a/llm_judge/show_result.py b/llm_judge/show_result.py
index 4190c9a..2cb620b 100755
--- a/llm_judge/show_result.py
+++ b/llm_judge/show_result.py
@@ -26,9 +26,9 @@ def calculate_averages(scores):
 def display_result_single(args):
     if args.input_file is None:
         if args.azure:
-            input_file = f"data/{args.bench_name}/model_judgment/{args.judge_model}_single_azure.jsonl"
+            input_file = f"data/{args.bench_name}/model_judgment/{args.judge_model_name}_single_azure.jsonl"
         else:
-            input_file = f"data/{args.bench_name}/model_judgment/{args.judge_model}_single.jsonl"
+            input_file = f"data/{args.bench_name}/model_judgment/{args.judge_model_name}_single.jsonl"
     else:
         input_file = args.input_file
 
@@ -115,9 +115,9 @@ def score_category(category):
 def display_result_pairwise(args):
     if args.input_file is None:
         if args.azure:
-            input_file = f"data/{args.bench_name}/model_judgment/{args.judge_model}_pair_azure.jsonl"
+            input_file = f"data/{args.bench_name}/model_judgment/{args.judge_model_name}_pair_azure.jsonl"
         else:
-            input_file = f"data/{args.bench_name}/model_judgment/{args.judge_model}_pair.jsonl"
+            input_file = f"data/{args.bench_name}/model_judgment/{args.judge_model_name}_pair.jsonl"
     else:
         input_file = args.input_file
 
@@ -167,7 +167,7 @@ def display_result_pairwise(args):
     parser = argparse.ArgumentParser()
     parser.add_argument("--bench-name", type=str, default="mt_bench")
     parser.add_argument("--input-file", type=str)
-    parser.add_argument("--judge-model", type=str, default="gpt-4")
+    parser.add_argument("--judge-model-name", type=str, default="gpt-4")
     parser.add_argument("--output-file", type=str)
     parser.add_argument("--baseline-model", type=str, default="gpt-3.5-turbo")
     parser.add_argument(
diff --git a/model/model_adapter.py b/model/model_adapter.py
index 688960e..ce099e8 100755
--- a/model/model_adapter.py
+++ b/model/model_adapter.py
@@ -1076,7 +1076,7 @@ class ChatGPTAdapter(BaseModelAdapter):
     """The model adapter for ChatGPT"""
 
     def match(self, model_path: str):
-        return model_path in OPENAI_MODEL_LIST
+        return model_path.startswith("gpt")
 
     def load_model(self, model_path: str, from_pretrained_kwargs: dict):
         raise NotImplementedError()
@@ -1089,7 +1089,7 @@ class AzureOpenAIAdapter(BaseModelAdapter):
     """The model adapter for Azure OpenAI"""
 
     def match(self, model_path: str):
-        return model_path in ("azure-gpt-35-turbo", "azure-gpt-4")
+        return model_path.startswith("azure")
 
     def load_model(self, model_path: str, from_pretrained_kwargs: dict):
         raise NotImplementedError()
@@ -1118,7 +1118,7 @@ class ClaudeAdapter(BaseModelAdapter):
     """The model adapter for Claude"""
 
     def match(self, model_path: str):
-        return model_path in ANTHROPIC_MODEL_LIST
+        return model_path.startswith("claude")
 
     def load_model(self, model_path: str, from_pretrained_kwargs: dict):
         raise NotImplementedError()