diff --git a/.github/workflows/run-eval.yaml b/.github/workflows/run-eval.yaml index 3014b44..abce77a 100644 --- a/.github/workflows/run-eval.yaml +++ b/.github/workflows/run-eval.yaml @@ -71,11 +71,14 @@ jobs: env: MODEL_NAME: lfm-3b MODEL_URL: ${{ vars.MODEL_URL }} - OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }} + MODEL_API_KEY: ${{ secrets.MODEL_API_KEY }} run: | + # let the model judge itself against the GPT-4 answers bin/api/run_openai_judge.sh \ --model-name "$MODEL_NAME" \ - --openai-api-key "$OPENAI_API_KEY" \ + --judge-model-name "lfm-7b" \ + --judge-model-url "$MODEL_URL" \ + --judge-model-api-key "$MODEL_API_KEY" \ --parallel 3 - name: Process Judge Results diff --git a/README.md b/README.md index 07fc2a5..ed31242 100644 --- a/README.md +++ b/README.md @@ -21,17 +21,21 @@ bin/api/run_docker_eval.sh generate \ Results will be output in `llm_judge/data/japanese_mt_bench/model_answer/.jsonl` -2. Run OpenAI judge: +2. Run judge: + +The judge script will use the judge model to compare [GPT-4 results](llm_judge/data/mt_bench/reference_answer/gpt-4.jsonl) with the model results. The judge model defaults to GPT-4. ```bash bin/api/run_docker_eval.sh judge \ --model-name \ - --openai-api-key + --judge-model-name \ + --judge-model-url \ + --judge-model-api-key ``` -GPT judge results will be output to `llm_judge/data/japanese_mt_bench/model_judgment/gpt-4_.jsonl`. +Judge results will be output to `llm_judge/data/japanese_mt_bench/model_judgment/_.jsonl`. -The final scores will be output in `llm_judge/data/japanese_mt_bench/gpt4-score-.json`. +The final scores will be output in `llm_judge/data/japanese_mt_bench/-score-.json`. ### Examples @@ -45,7 +49,9 @@ bin/api/run_docker_eval.sh generate \ bin/api/run_docker_eval.sh judge \ --model-name lfm-3b-jp \ - --openai-api-key + --judge-model-name gpt-4o \ + --judge-model-url https://api.openai.com/v1 \ + --judge-model-api-key ``` Run eval for `lfm-3b-ichikara` on-prem: @@ -71,7 +77,9 @@ bin/api/run_docker_eval.sh generate \ bin/api/run_docker_eval.sh judge \ --model-name lfm-3b-jp \ - --openai-api-key + --judge-model-name gpt-4o \ + --judge-model-url https://api.openai.com/v1 \ + --judge-model-api-key ``` ## Run Evaluation without Docker @@ -111,16 +119,29 @@ Results will be output in `llm_judge/data/japanese_mt_bench/model_answer/ --openai-api-key +bin/api/run_openai_judge.sh \ + --model-name \ + --judge-model-name \ + --judge-model-url \ + --judge-model-api-key # examples: -bin/api/run_openai_judge.sh --model-name lfm-3b-jp --openai-api-key -bin/api/run_openai_judge.sh --model-name lfm-3b-ichikara --openai-api-key +bin/api/run_openai_judge.sh \ + --model-name lfm-3b-jp \ + --judge-model-name gpt-4o \ + --judge-model-url https://api.openai.com/v1 \ + --judge-model-api-key + +bin/api/run_openai_judge.sh \ + --model-name lfm-3b-ichikara \ + --judge-model-name gpt-4o \ + --judge-model-url https://api.openai.com/v1 \ + --judge-model-api-key ``` -GPT judge results will be output to `llm_judge/data/japanese_mt_bench/model_judgment/gpt-4_.jsonl`. +Judge results will be output to `llm_judge/data/japanese_mt_bench/model_judgment/_.jsonl`. -The final scores will be output in `llm_judge/data/japanese_mt_bench/gpt4-score-.json`. +The final scores will be output in `llm_judge/data/japanese_mt_bench/-score-.json`. @@ -148,8 +169,10 @@ This applies to both `bin/api/run_docker_eval.sh judge` and `bin/api/run_openai_ | Argument | Description | Required | | --- | --- | --- | -| `--model-name` | Model name | Yes | -| `--openai-api-key` | OpenAI API key | Yes | +| `--model-name` | Model name to be evaluated | Yes | +| `--judge-model-name` | Name of the judge model (default: gpt-4) | No | +| `--judge-model-url` | Base URL for the judge model API | Yes | +| `--judge-model-api-key` | API key for the judge model | Yes | | `--parallel` | Number of parallel API calls | No. Default to 5. | diff --git a/bin/api/entrypoint.sh b/bin/api/entrypoint.sh index 05e5026..844d97c 100755 --- a/bin/api/entrypoint.sh +++ b/bin/api/entrypoint.sh @@ -47,6 +47,20 @@ elif [[ "$MODE" == "judge" ]]; then # Extract arguments for judge mode PARALLEL="5" CI="false" + JUDGE_MODEL_NAME=${JUDGE_MODEL_NAME:-"gpt-4"} + JUDGE_MODEL_URL=${JUDGE_MODEL_URL:-""} + JUDGE_MODEL_API_KEY=${JUDGE_MODEL_API_KEY:-""} + + # Ensure required parameters are set + if [[ -z "$JUDGE_MODEL_API_KEY" ]]; then + echo "Error: JUDGE_MODEL_API_KEY environment variable is required" + exit 1 + fi + + if [[ -z "$JUDGE_MODEL_URL" ]]; then + echo "Error: JUDGE_MODEL_URL environment variable is required" + exit 1 + fi while [[ $# -gt 0 ]]; do case $1 in @@ -67,14 +81,18 @@ elif [[ "$MODE" == "judge" ]]; then # Generate judgments python llm_judge/gen_judgment.py \ --model-list "$MODEL_NAME" \ + --judge-model-name "$JUDGE_MODEL_NAME" \ + --judge-model-url "$JUDGE_MODEL_URL" \ + --judge-model-api-key "$JUDGE_MODEL_API_KEY" \ --parallel "$PARALLEL" \ --bench-name japanese_mt_bench # Show results python llm_judge/show_result.py \ --model-list "$MODEL_NAME" \ + --judge-model-name "$JUDGE_MODEL_NAME" \ --ci "$CI" \ --bench-name japanese_mt_bench \ - --input-file llm_judge/data/japanese_mt_bench/model_judgment/gpt-4_$MODEL_NAME.jsonl \ - --output llm_judge/data/japanese_mt_bench/gpt4-score-$MODEL_NAME.json + --input-file "llm_judge/data/japanese_mt_bench/model_judgment/${JUDGE_MODEL_NAME}_$MODEL_NAME.jsonl" \ + --output "llm_judge/data/japanese_mt_bench/${JUDGE_MODEL_NAME}-score-$MODEL_NAME.json" fi diff --git a/bin/api/run_docker_eval.sh b/bin/api/run_docker_eval.sh index 294195f..5a05101 100755 --- a/bin/api/run_docker_eval.sh +++ b/bin/api/run_docker_eval.sh @@ -22,10 +22,12 @@ print_usage() { echo " --question-count Number of questions to evaluate (optional)" echo echo "Judge mode options:" - echo " --model-name Name of the model to evaluate" - echo " --openai-api-key OpenAI API key for GPT-4 judgment" - echo " --parallel Number of parallel processes (default: 5)" - echo " --ci CI mode (default: false)" + echo " --model-name Name of the model to evaluate" + echo " --judge-model-name Name of the judge model (default: gpt-4)" + echo " --judge-model-url Base URL for the judge model API" + echo " --judge-model-api-key API key for the judge model" + echo " --parallel Number of parallel processes (default: 5)" + echo " --ci CI mode (default: false)" } if [ $# -lt 1 ]; then @@ -106,7 +108,9 @@ if [[ "$MODE" == "generate" ]]; then elif [[ "$MODE" == "judge" ]]; then # Process judge mode arguments MODEL_NAME="" - OPENAI_API_KEY="" + JUDGE_MODEL_NAME="gpt-4" + JUDGE_MODEL_URL="" + JUDGE_MODEL_API_KEY="" PARALLEL="5" CI="false" @@ -116,8 +120,17 @@ elif [[ "$MODE" == "judge" ]]; then MODEL_NAME="$2" shift 2 ;; - --openai-api-key) - OPENAI_API_KEY="$2" + + --judge-model-name) + JUDGE_MODEL_NAME="$2" + shift 2 + ;; + --judge-model-url) + JUDGE_MODEL_URL="$2" + shift 2 + ;; + --judge-model-api-key) + JUDGE_MODEL_API_KEY="$2" shift 2 ;; --parallel) @@ -142,8 +155,15 @@ elif [[ "$MODE" == "judge" ]]; then exit 1 fi - if [[ -z "$OPENAI_API_KEY" ]]; then - echo "Error: --openai-api-key is required" + # Validate required parameters + if [[ -z "$JUDGE_MODEL_API_KEY" ]]; then + echo "Error: --judge-model-api-key is required" + print_usage + exit 1 + fi + + if [[ -z "$JUDGE_MODEL_URL" ]]; then + echo "Error: --judge-model-url is required" print_usage exit 1 fi @@ -152,7 +172,9 @@ elif [[ "$MODE" == "judge" ]]; then docker run --rm -it \ --network="host" \ -e MODEL_NAME="$MODEL_NAME" \ - -e OPENAI_API_KEY="$OPENAI_API_KEY" \ + -e JUDGE_MODEL_NAME="$JUDGE_MODEL_NAME" \ + -e JUDGE_MODEL_URL="$JUDGE_MODEL_URL" \ + -e JUDGE_MODEL_API_KEY="$JUDGE_MODEL_API_KEY" \ -v "$(pwd)/llm_judge:/app/llm_judge" \ liquidai/mt-bench:latest judge \ --parallel "$PARALLEL" \ diff --git a/bin/api/run_openai_judge.sh b/bin/api/run_openai_judge.sh index de8c7d6..28e5af1 100755 --- a/bin/api/run_openai_judge.sh +++ b/bin/api/run_openai_judge.sh @@ -1,29 +1,43 @@ #!/bin/bash print_usage() { - echo "Usage: $0 --openai-api-key --model-name --parallel " + echo "Usage: $0 --model-name [--judge-model-name ] [--judge-model-url ] --judge-model-api-key [--parallel ]" echo echo "Arguments:" - echo " --openai-api-key OpenAI API key" - echo " --model-name Model name" - echo " --parallel Number of parallel processes" + echo " --model-name Model name to be evaluated (required)" + echo " --judge-model-name Name of the judge model (default: gpt-4)" + echo " --judge-model-url Base URL for the judge model API (default: https://api.openai.com/v1)" + echo " --judge-model-api-key API key for the judge model (required)" + echo " --parallel Number of parallel processes (default: 5)" + echo " --ci CI mode (default: false)" } -OPENAI_API_KEY="" MODEL_NAME="" +JUDGE_MODEL_NAME="gpt-4" +JUDGE_MODEL_URL="" +JUDGE_MODEL_API_KEY="" PARALLEL="5" CI="false" while [[ $# -gt 0 ]]; do case $1 in - --openai-api-key) - OPENAI_API_KEY="$2" - shift 2 - ;; + --model-name) MODEL_NAME="$2" shift 2 ;; + --judge-model-name) + JUDGE_MODEL_NAME="$2" + shift 2 + ;; + --judge-model-url) + JUDGE_MODEL_URL="$2" + shift 2 + ;; + --judge-model-api-key) + JUDGE_MODEL_API_KEY="$2" + shift 2 + ;; --parallel) PARALLEL="$2" shift 2 @@ -40,28 +54,35 @@ while [[ $# -gt 0 ]]; do esac done -if [[ -z "$OPENAI_API_KEY" ]]; then - echo "Error: --openai-api-key is required" +# Validate required parameters +if [[ -z "$MODEL_NAME" ]]; then + echo "Error: --model-name is required" print_usage exit 1 fi -if [[ -z "$MODEL_NAME" ]]; then - echo "Error: --model-name is required" +if [[ -z "$JUDGE_MODEL_API_KEY" ]]; then + echo "Error: --judge-model-api-key is required" print_usage exit 1 fi -export OPENAI_API_KEY="$OPENAI_API_KEY" +export JUDGE_MODEL_NAME="$JUDGE_MODEL_NAME" +export JUDGE_MODEL_URL="$JUDGE_MODEL_URL" +export JUDGE_MODEL_API_KEY="$JUDGE_MODEL_API_KEY" export PYTHONPATH=. python llm_judge/gen_judgment.py \ --model-list "$MODEL_NAME" \ + --judge-model-name "$JUDGE_MODEL_NAME" \ + --judge-model-url "$JUDGE_MODEL_URL" \ + --judge-model-api-key "$JUDGE_MODEL_API_KEY" \ --parallel "$PARALLEL" \ --bench-name japanese_mt_bench python llm_judge/show_result.py --model-list "$MODEL_NAME" \ + --judge-model-name "$JUDGE_MODEL_NAME" \ --ci "$CI" \ --bench-name japanese_mt_bench \ - --input-file llm_judge/data/japanese_mt_bench/model_judgment/gpt-4_$MODEL_NAME.jsonl \ - --output llm_judge/data/japanese_mt_bench/gpt4-score-$MODEL_NAME.json + --input-file "llm_judge/data/japanese_mt_bench/model_judgment/${JUDGE_MODEL_NAME}_$MODEL_NAME.jsonl" \ + --output "llm_judge/data/japanese_mt_bench/${JUDGE_MODEL_NAME}-score-$MODEL_NAME.json" diff --git a/conversation.py b/conversation.py index 45c05dc..f071516 100755 --- a/conversation.py +++ b/conversation.py @@ -379,7 +379,7 @@ def register_conv_template(template: Conversation, override: bool = False): def get_conv_template(name: str) -> Conversation: """Get a conversation template.""" - print("Using template: ", name) + print("Using template:", name) return conv_templates[name].copy() diff --git a/llm_judge/common.py b/llm_judge/common.py index bf4d3f6..c990892 100755 --- a/llm_judge/common.py +++ b/llm_judge/common.py @@ -9,20 +9,15 @@ import os import re import time -from typing import Optional +from typing import Any, Optional import random import openai -import anthropic from dotenv import load_dotenv load_dotenv() -from model.model_adapter import ( - get_conversation_template, - ANTHROPIC_MODEL_LIST, - OPENAI_MODEL_LIST, -) +from model.model_adapter import get_conversation_template # API setting constants API_MAX_RETRY = 16 @@ -157,7 +152,7 @@ def load_judge_prompts(prompt_file: str): return prompts -def run_judge_single(question, answer, judge, ref_answer, multi_turn=False, azure=True): +def run_judge_single(question, answer, judge, ref_answer, multi_turn=False, api_dict=None): kwargs = {} model = judge.model_name if ref_answer is not None: @@ -192,19 +187,9 @@ def run_judge_single(question, answer, judge, ref_answer, multi_turn=False, azur conv.append_message(conv.roles[0], user_prompt) conv.append_message(conv.roles[1], None) - if model in OPENAI_MODEL_LIST: - if azure: - judgment = chat_completion_openai_azure( - model, conv, temperature=0, max_tokens=2048 - ) - else: - judgment = chat_completion_openai(model, conv, temperature=0, max_tokens=2048) - elif model in ANTHROPIC_MODEL_LIST: - judgment = chat_completion_anthropic( - model, conv, temperature=0, max_tokens=1024 - ) - else: - raise ValueError(f"Invalid judge model name: {model}") + judgment = chat_completion_openai( + model, conv, temperature=0, max_tokens=2048, api_dict=api_dict + ) if judge.prompt_template["output_format"] == "[[rating]]": match = re.search(one_score_pattern, judgment) @@ -226,7 +211,9 @@ def run_judge_single(question, answer, judge, ref_answer, multi_turn=False, azur return rating_list, user_prompt_list, judgment_list -def play_a_match_single(match: MatchPair, output_file: str, azure=True): +def play_a_match_single( + match: MatchSingle, output_file: str, api_dict: dict[str, Any] | None = None +) -> dict[str, Any]: question, model, answer, judge, ref_answer, multi_turn = ( match.question, match.model, @@ -238,7 +225,7 @@ def play_a_match_single(match: MatchPair, output_file: str, azure=True): if judge.prompt_template["type"] == "single": score_list, user_prompt_list, judgment_list = run_judge_single( - question, answer, judge, ref_answer, multi_turn=multi_turn, azure=azure + question, answer, judge, ref_answer, multi_turn=multi_turn, api_dict=api_dict ) question_id = question["question_id"] @@ -259,7 +246,7 @@ def play_a_match_single(match: MatchPair, output_file: str, azure=True): f"judge: {(judge.model_name, judge.prompt_template['name'])}" ) else: - raise ValueError(f"invalid judge type: {judge['type']}") + raise ValueError(f"invalid judge type: {judge.prompt_template['type']}") if output_file: os.makedirs(os.path.dirname(output_file), exist_ok=True) @@ -269,7 +256,7 @@ def play_a_match_single(match: MatchPair, output_file: str, azure=True): return result -def run_judge_pair(question, answer_a, answer_b, judge, ref_answer, multi_turn=False, azure=True): +def run_judge_pair(question, answer_a, answer_b, judge, ref_answer, multi_turn=False, api_dict=None): kwargs = {} model = judge.model_name if ref_answer is not None: @@ -303,23 +290,10 @@ def run_judge_pair(question, answer_a, answer_b, judge, ref_answer, multi_turn=F conv.append_message(conv.roles[0], user_prompt) conv.append_message(conv.roles[1], None) - if model in OPENAI_MODEL_LIST: - conv.set_system_message(system_prompt) - if azure: - judgment = chat_completion_openai_azure( - model, conv, temperature=0, max_tokens=2048 - ) - else: - judgment = chat_completion_openai(model, conv, temperature=0, max_tokens=2048) - elif model in ANTHROPIC_MODEL_LIST: - if system_prompt != "You are a helpful assistant.": - user_prompt = "[Instruction]\n" + system_prompt + "\n\n" + user_prompt - conv.messages[0][1] = user_prompt - judgment = chat_completion_anthropic( - model, conv, temperature=0, max_tokens=1024 - ) - else: - raise ValueError(f"Invalid judge model name: {model}") + conv.set_system_message(system_prompt) + judgment = chat_completion_openai( + model, conv, temperature=0, max_tokens=2048, api_dict=api_dict + ) if judge.prompt_template["output_format"] == "[[A]]": if "[[A]]" in judgment: @@ -352,7 +326,9 @@ def run_judge_pair(question, answer_a, answer_b, judge, ref_answer, multi_turn=F return winner, user_prompt, judgment -def play_a_match_pair(match: MatchPair, output_file: str): +def play_a_match_pair( + match: MatchPair, output_file: str, api_dict: dict[str, Any] | None = None +) -> dict[str, Any]: question, model_1, model_2, answer_1, answer_2, judge, ref_answer, multi_turn = ( match.question, match.model_1, @@ -366,10 +342,10 @@ def play_a_match_pair(match: MatchPair, output_file: str): if judge.prompt_template["type"] == "pairwise": g1_winner, g1_user_prompt, g1_judgment = run_judge_pair( - question, answer_1, answer_2, judge, ref_answer, multi_turn=multi_turn, azure=True + question, answer_1, answer_2, judge, ref_answer, multi_turn=multi_turn, api_dict=api_dict ) g2_winner, g2_user_prompt, g2_judgment = run_judge_pair( - question, answer_2, answer_1, judge, ref_answer, multi_turn=multi_turn, azure=True + question, answer_2, answer_1, judge, ref_answer, multi_turn=multi_turn, api_dict=api_dict ) g1_map = {"A": "model_1", "B": "model_2"} @@ -401,15 +377,19 @@ def play_a_match_pair(match: MatchPair, output_file: str): ) elif judge.prompt_template["type"] == "single": m1_score, m1_user_prompt, m1_judgment = run_judge_single( - question, answer_1, judge, azure=True + question, answer_1, judge, ref_answer, api_dict=api_dict ) m2_score, m2_user_prompt, m2_judgment = run_judge_single( - question, answer_2, judge, azure=True + question, answer_2, judge, ref_answer, api_dict=api_dict ) - if abs(m1_score - m2_score) <= TIE_DELTA: + # Extract first score from lists + m1_first_score = m1_score[0] if isinstance(m1_score, list) else m1_score + m2_first_score = m2_score[0] if isinstance(m2_score, list) else m2_score + + if abs(m1_first_score - m2_first_score) <= TIE_DELTA: winner = "tie" - elif m1_score > m2_score: + elif m1_first_score > m2_first_score: winner = "model_1" else: winner = "model_2" @@ -436,7 +416,7 @@ def play_a_match_pair(match: MatchPair, output_file: str): f"judge: {(judge.model_name, judge.prompt_template['name'])}" ) else: - raise ValueError(f"invalid judge type: {judge['type']}") + raise ValueError(f"invalid judge type: {judge.prompt_template['type']}") if output_file: os.makedirs(os.path.dirname(output_file), exist_ok=True) @@ -448,8 +428,10 @@ def play_a_match_pair(match: MatchPair, output_file: str): def chat_completion_openai(model, conv, temperature, max_tokens, api_dict=None): if api_dict is not None: - openai.api_base = api_dict["api_base"] - openai.api_key = api_dict["api_key"] + if "api_base" in api_dict: + openai.api_base = api_dict["api_base"] + if "api_key" in api_dict: + openai.api_key = api_dict["api_key"] output = API_ERROR_OUTPUT min_sleep_time = 1 max_sleep_time = API_RETRY_SLEEP @@ -479,106 +461,6 @@ def chat_completion_openai(model, conv, temperature, max_tokens, api_dict=None): return output -def chat_completion_openai_azure(model, conv, temperature, max_tokens, api_dict=None): - openai.api_type = "azure" - openai.api_version = "2023-07-01-preview" - if api_dict is not None: - openai.api_base = api_dict["api_base"] - openai.api_key = api_dict["api_key"] - else: - openai.api_base = os.environ["AZURE_OPENAI_ENDPOINT"] - openai.api_key = os.environ["AZURE_OPENAI_KEY"] - - if "azure-" in model: - model = model[6:] - - output = API_ERROR_OUTPUT - min_sleep_time = 1 - max_sleep_time = API_RETRY_SLEEP - for _ in range(API_MAX_RETRY): - try: - messages = conv.to_openai_api_messages() - response = openai.ChatCompletion.create( - engine=model, - messages=messages, - n=1, - temperature=temperature, - max_tokens=max_tokens, - ) - output = response["choices"][0]["message"]["content"] - break - except openai.error.RateLimitError as e: - print(type(e), e) - sleep_time = random.randint(min_sleep_time, max_sleep_time) - print(f"Sleeping for {sleep_time} seconds") - time.sleep(sleep_time) - max_sleep_time = min(MAX_API_RETRY_SLEEP, max_sleep_time * 2) - min_sleep_time = max_sleep_time // 2 - except openai.error.OpenAIError as e: - print(type(e), e) - time.sleep(API_RETRY_SLEEP) - except openai.error.InvalidRequestError as e: - print(type(e), e) - break - except KeyError: - print(response) - break - - return output - - -def chat_completion_anthropic(model, conv, temperature, max_tokens, api_dict=None): - if api_dict is not None and "api_key" in api_dict: - api_key = api_dict["api_key"] - else: - api_key = os.environ["ANTHROPIC_API_KEY"] - - output = API_ERROR_OUTPUT - for _ in range(API_MAX_RETRY): - try: - c = anthropic.Anthropic(api_key=api_key) - prompt = conv.get_prompt() - response = c.completions.create( - model=model, - prompt=prompt, - stop_sequences=[anthropic.HUMAN_PROMPT], - max_tokens_to_sample=max_tokens, - temperature=temperature, - ) - output = response.completion - break - except anthropic.APIError as e: - print(type(e), e) - time.sleep(API_RETRY_SLEEP) - return output.strip() - - -def chat_completion_palm(chat_state, model, conv, temperature, max_tokens): - from serve.api_provider import init_palm_chat - - assert model == "palm-2-chat-bison-001" - - if chat_state is None: - chat_state = init_palm_chat("chat-bison@001") - - parameters = { - "temperature": temperature, - "top_p": 0.8, - "top_k": 40, - "max_output_tokens": max_tokens, - } - output = API_ERROR_OUTPUT - for _ in range(API_MAX_RETRY): - try: - response = chat_state.send_message(conv.messages[-2][1], **parameters) - output = response.text - break - except Exception as e: - print(type(e), e) - time.sleep(API_RETRY_SLEEP) - return chat_state, output - - def normalize_game_key_single(gamekey, result): """Make the model names sorted in a game key.""" qid, model_1, model_2 = gamekey @@ -760,8 +642,8 @@ def check_data(questions, model_answers, ref_answers, models, judges): if q["category"] not in NEED_REF_CATS: continue assert ( - q["question_id"] in ref_answers[jg.model_name] - ), f"Missing reference answer to Question {q['question_id']} for judge {jg.model_name}" + q["question_id"] in ref_answers['gpt-4'] + ), f"Missing reference answer to Question {q['question_id']} from 'gpt-4'" def get_model_list(answer_dir): diff --git a/llm_judge/gen_judgment.py b/llm_judge/gen_judgment.py index a4cb2c3..b900933 100755 --- a/llm_judge/gen_judgment.py +++ b/llm_judge/gen_judgment.py @@ -6,6 +6,7 @@ import json import os from concurrent.futures import ThreadPoolExecutor +from typing import Any, Callable import numpy as np from tqdm import tqdm @@ -47,7 +48,7 @@ def make_match( a_1 = model_answers[m_1][q_id] a_2 = model_answers[baseline_model][q_id] if ref_answers is not None: - ref = ref_answers[judge.model_name][q_id] + ref = ref_answers['gpt-4'][q_id] match = MatchPair( dict(q), m_1, @@ -87,7 +88,7 @@ def make_match_all_pairs( a_1 = model_answers[m_1][q_id] a_2 = model_answers[m_2][q_id] if ref_answers is not None: - ref = ref_answers[judge.model_name][q_id] + ref = ref_answers['gpt-4'][q_id] match = MatchPair( dict(q), m_1, @@ -127,7 +128,7 @@ def make_match_single( print(f"Model {m} does not have answer for question {q_id}") continue if ref_answers is not None: - ref = ref_answers[judge.model_name][q_id] + ref = ref_answers['gpt-4'][q_id] matches.append( MatchSingle( dict(q), m, a, judge, ref_answer=ref, multi_turn=multi_turn @@ -184,7 +185,9 @@ def make_judge_single(judge_model, judge_prompts): default="llm_judge/data/judge_prompts.jsonl", help="The file of judge prompts.", ) - parser.add_argument("--judge-model", type=str, default="gpt-4") + parser.add_argument("--judge-model-name", type=str, default="gpt-4", help="The model used for judging") + parser.add_argument("--judge-model-url", type=str, default="", help="Base URL for the judge model API") + parser.add_argument("--judge-model-api-key", type=str, default="", help="API key for the judge model") parser.add_argument("--baseline-model", type=str, default="gpt-3.5-turbo") parser.add_argument( "--mode", @@ -211,10 +214,14 @@ def make_judge_single(judge_model, judge_prompts): parser.add_argument( "--first-n", type=int, help="A debug option. Only run the first `n` judgments." ) - parser.add_argument( - "--azure", action="store_true", help="Use Azure API instead of openai.", default=False - ) + # Remove Azure parameter as we now use custom judge model parameters args = parser.parse_args() + print(f"Model name: {args.model_list}") + print(f"Judge model name: {args.judge_model_name}") + if args.judge_model_url: + print(f"Judge model URL: {args.judge_model_url}") + if args.judge_model_api_key: + print(f"Judge model API key: {args.judge_model_api_key[0:4]}***") args.model_list = [model_path.replace("/", "_") for model_path in args.model_list] @@ -246,22 +253,16 @@ def make_judge_single(judge_model, judge_prompts): current_dir = os.path.dirname(os.path.abspath(__file__)) if args.mode == "single": - judges = make_judge_single(args.judge_model, judge_prompts) - play_a_match_func = play_a_match_single - if args.azure: - output_file = os.path.join(current_dir, "data", args.bench_name, "model_judgment", f"{args.judge_model}_single_azure.jsonl") - else: - model_suffix = "_".join(args.model_list) - output_file = os.path.join(current_dir, "data", args.bench_name, "model_judgment", f"{args.judge_model}_{model_suffix}.jsonl") + judges = make_judge_single(args.judge_model_name, judge_prompts) + play_a_match_func: Callable[[MatchSingle | MatchPair, str, dict[str, Any] | None], dict[str, Any]] = play_a_match_single + model_suffix = "_".join(args.model_list) + output_file = str(os.path.join(current_dir, "data", args.bench_name, "model_judgment", f"{args.judge_model_name}_{model_suffix}.jsonl")) make_match_func = make_match_single baseline_model = None else: - judges = make_judge_pairwise(args.judge_model, judge_prompts) - play_a_match_func = play_a_match_pair - if args.azure: - output_file = os.path.join(current_dir, "data", args.bench_name, "model_judgment", f"{args.judge_model}_pair_azure.jsonl") - else: - output_file = os.path.join(current_dir, "data", args.bench_name, "model_judgment", f"{args.judge_model}_pair.jsonl") + judges = make_judge_pairwise(args.judge_model_name, judge_prompts) + play_a_match_func: Callable[[MatchSingle | MatchPair, str, dict[str, Any] | None], dict[str, Any]] = play_a_match_pair + output_file = str(os.path.join(current_dir, "data", args.bench_name, "model_judgment", f"{args.judge_model_name}_pair.jsonl")) if args.mode == "pairwise-all": make_match_func = make_match_all_pairs baseline_model = None @@ -308,7 +309,7 @@ def make_judge_single(judge_model, judge_prompts): match_stat = {} match_stat["bench_name"] = args.bench_name match_stat["mode"] = args.mode - match_stat["judge"] = args.judge_model + match_stat["judge"] = args.judge_model_name match_stat["baseline"] = baseline_model match_stat["model_list"] = models match_stat["total_num_questions"] = len(questions) @@ -320,14 +321,25 @@ def make_judge_single(judge_model, judge_prompts): print(json.dumps(match_stat, indent=4, ensure_ascii=False)) # input("Press Enter to confirm...") + # Prepare API dict if judge model URL and API key are provided + api_dict = None + + if args.judge_model_url or args.judge_model_api_key: + api_dict = {} + if args.judge_model_url: + print(f"Using custom judge model URL: {args.judge_model_url}") + api_dict["api_base"] = args.judge_model_url + if args.judge_model_api_key: + print(f"Using custom judge model API key: {args.judge_model_api_key[0:4]}***") + api_dict["api_key"] = args.judge_model_api_key + # Play matches if args.parallel == 1: for match in tqdm(matches): - play_a_match_func(match, output_file=output_file, azure=args.azure) + play_a_match_func(match, output_file=output_file, api_dict=api_dict) else: - - def play_a_match_wrapper(match): - play_a_match_func(match, output_file=output_file, azure=args.azure) + def play_a_match_wrapper(input_match): + play_a_match_func(input_match, output_file=output_file, api_dict=api_dict) np.random.seed(0) np.random.shuffle(matches) diff --git a/llm_judge/show_result.py b/llm_judge/show_result.py index 4190c9a..2cb620b 100755 --- a/llm_judge/show_result.py +++ b/llm_judge/show_result.py @@ -26,9 +26,9 @@ def calculate_averages(scores): def display_result_single(args): if args.input_file is None: if args.azure: - input_file = f"data/{args.bench_name}/model_judgment/{args.judge_model}_single_azure.jsonl" + input_file = f"data/{args.bench_name}/model_judgment/{args.judge_model_name}_single_azure.jsonl" else: - input_file = f"data/{args.bench_name}/model_judgment/{args.judge_model}_single.jsonl" + input_file = f"data/{args.bench_name}/model_judgment/{args.judge_model_name}_single.jsonl" else: input_file = args.input_file @@ -115,9 +115,9 @@ def score_category(category): def display_result_pairwise(args): if args.input_file is None: if args.azure: - input_file = f"data/{args.bench_name}/model_judgment/{args.judge_model}_pair_azure.jsonl" + input_file = f"data/{args.bench_name}/model_judgment/{args.judge_model_name}_pair_azure.jsonl" else: - input_file = f"data/{args.bench_name}/model_judgment/{args.judge_model}_pair.jsonl" + input_file = f"data/{args.bench_name}/model_judgment/{args.judge_model_name}_pair.jsonl" else: input_file = args.input_file @@ -167,7 +167,7 @@ def display_result_pairwise(args): parser = argparse.ArgumentParser() parser.add_argument("--bench-name", type=str, default="mt_bench") parser.add_argument("--input-file", type=str) - parser.add_argument("--judge-model", type=str, default="gpt-4") + parser.add_argument("--judge-model-name", type=str, default="gpt-4") parser.add_argument("--output-file", type=str) parser.add_argument("--baseline-model", type=str, default="gpt-3.5-turbo") parser.add_argument( diff --git a/model/model_adapter.py b/model/model_adapter.py index 688960e..ce099e8 100755 --- a/model/model_adapter.py +++ b/model/model_adapter.py @@ -1076,7 +1076,7 @@ class ChatGPTAdapter(BaseModelAdapter): """The model adapter for ChatGPT""" def match(self, model_path: str): - return model_path in OPENAI_MODEL_LIST + return model_path.startswith("gpt") def load_model(self, model_path: str, from_pretrained_kwargs: dict): raise NotImplementedError() @@ -1089,7 +1089,7 @@ class AzureOpenAIAdapter(BaseModelAdapter): """The model adapter for Azure OpenAI""" def match(self, model_path: str): - return model_path in ("azure-gpt-35-turbo", "azure-gpt-4") + return model_path.startswith("azure") def load_model(self, model_path: str, from_pretrained_kwargs: dict): raise NotImplementedError() @@ -1118,7 +1118,7 @@ class ClaudeAdapter(BaseModelAdapter): """The model adapter for Claude""" def match(self, model_path: str): - return model_path in ANTHROPIC_MODEL_LIST + return model_path.startswith("claude") def load_model(self, model_path: str, from_pretrained_kwargs: dict): raise NotImplementedError()