From 7f7fb2512af2682f05c4efadbabbb5e63572d608 Mon Sep 17 00:00:00 2001 From: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com> Date: Thu, 20 Mar 2025 06:54:40 +0000 Subject: [PATCH 01/13] Update judge command to support custom judge models Co-Authored-By: liren@liquid.ai --- .github/workflows/run-eval.yaml | 4 ++- bin/api/entrypoint.sh | 20 ++++++++++-- bin/api/run_docker_eval.sh | 49 +++++++++++++++++++++++++---- bin/api/run_openai_judge.sh | 56 ++++++++++++++++++++++++++++----- llm_judge/common.py | 52 +++++++++++++++--------------- llm_judge/gen_judgment.py | 16 ++++++++-- 6 files changed, 150 insertions(+), 47 deletions(-) diff --git a/.github/workflows/run-eval.yaml b/.github/workflows/run-eval.yaml index 3014b44..e7957c2 100644 --- a/.github/workflows/run-eval.yaml +++ b/.github/workflows/run-eval.yaml @@ -75,7 +75,9 @@ jobs: run: | bin/api/run_openai_judge.sh \ --model-name "$MODEL_NAME" \ - --openai-api-key "$OPENAI_API_KEY" \ + --judge-model-name "gpt-4" \ + --judge-model-url "https://api.openai.com/v1" \ + --judge-model-api-key "$OPENAI_API_KEY" \ --parallel 3 - name: Process Judge Results diff --git a/bin/api/entrypoint.sh b/bin/api/entrypoint.sh index 05e5026..562069a 100755 --- a/bin/api/entrypoint.sh +++ b/bin/api/entrypoint.sh @@ -47,6 +47,18 @@ elif [[ "$MODE" == "judge" ]]; then # Extract arguments for judge mode PARALLEL="5" CI="false" + JUDGE_MODEL_NAME=${JUDGE_MODEL_NAME:-"gpt-4"} + JUDGE_MODEL_URL=${JUDGE_MODEL_URL:-""} + JUDGE_MODEL_API_KEY=${JUDGE_MODEL_API_KEY:-""} + + # If no judge model API key is provided, use OpenAI API key + if [[ -z "$JUDGE_MODEL_API_KEY" && -n "$OPENAI_API_KEY" ]]; then + JUDGE_MODEL_API_KEY="$OPENAI_API_KEY" + # Set default OpenAI URL if none provided + if [[ -z "$JUDGE_MODEL_URL" ]]; then + JUDGE_MODEL_URL="https://api.openai.com/v1" + fi + fi while [[ $# -gt 0 ]]; do case $1 in @@ -67,14 +79,18 @@ elif [[ "$MODE" == "judge" ]]; then # Generate judgments python llm_judge/gen_judgment.py \ --model-list "$MODEL_NAME" \ + --judge-model "$JUDGE_MODEL_NAME" \ + --judge-model-url "$JUDGE_MODEL_URL" \ + --judge-model-api-key "$JUDGE_MODEL_API_KEY" \ --parallel "$PARALLEL" \ --bench-name japanese_mt_bench # Show results python llm_judge/show_result.py \ --model-list "$MODEL_NAME" \ + --judge-model "$JUDGE_MODEL_NAME" \ --ci "$CI" \ --bench-name japanese_mt_bench \ - --input-file llm_judge/data/japanese_mt_bench/model_judgment/gpt-4_$MODEL_NAME.jsonl \ - --output llm_judge/data/japanese_mt_bench/gpt4-score-$MODEL_NAME.json + --input-file "llm_judge/data/japanese_mt_bench/model_judgment/${JUDGE_MODEL_NAME}_$MODEL_NAME.jsonl" \ + --output "llm_judge/data/japanese_mt_bench/${JUDGE_MODEL_NAME}-score-$MODEL_NAME.json" fi diff --git a/bin/api/run_docker_eval.sh b/bin/api/run_docker_eval.sh index 294195f..c2861ed 100755 --- a/bin/api/run_docker_eval.sh +++ b/bin/api/run_docker_eval.sh @@ -22,10 +22,13 @@ print_usage() { echo " --question-count Number of questions to evaluate (optional)" echo echo "Judge mode options:" - echo " --model-name Name of the model to evaluate" - echo " --openai-api-key OpenAI API key for GPT-4 judgment" - echo " --parallel Number of parallel processes (default: 5)" - echo " --ci CI mode (default: false)" + echo " --model-name Name of the model to evaluate" + echo " --openai-api-key OpenAI API key for backward compatibility" + echo " --judge-model-name Name of the judge model (default: gpt-4)" + echo " --judge-model-url Base URL for the judge model API" + echo " --judge-model-api-key API key for the judge model" + echo " --parallel Number of parallel processes (default: 5)" + echo " --ci CI mode (default: false)" } if [ $# -lt 1 ]; then @@ -107,6 +110,9 @@ elif [[ "$MODE" == "judge" ]]; then # Process judge mode arguments MODEL_NAME="" OPENAI_API_KEY="" + JUDGE_MODEL_NAME="gpt-4" + JUDGE_MODEL_URL="" + JUDGE_MODEL_API_KEY="" PARALLEL="5" CI="false" @@ -120,6 +126,18 @@ elif [[ "$MODE" == "judge" ]]; then OPENAI_API_KEY="$2" shift 2 ;; + --judge-model-name) + JUDGE_MODEL_NAME="$2" + shift 2 + ;; + --judge-model-url) + JUDGE_MODEL_URL="$2" + shift 2 + ;; + --judge-model-api-key) + JUDGE_MODEL_API_KEY="$2" + shift 2 + ;; --parallel) PARALLEL="$2" shift 2 @@ -142,16 +160,35 @@ elif [[ "$MODE" == "judge" ]]; then exit 1 fi - if [[ -z "$OPENAI_API_KEY" ]]; then - echo "Error: --openai-api-key is required" + # If --judge-model-api-key is provided, use it + if [[ -n "$JUDGE_MODEL_API_KEY" ]]; then + # Use the new judge model parameters + if [[ -z "$JUDGE_MODEL_URL" ]]; then + echo "Error: --judge-model-url is required when using --judge-model-api-key" + print_usage + exit 1 + fi + elif [[ -z "$OPENAI_API_KEY" ]]; then + # Fall back to requiring OpenAI API key + echo "Error: Either --judge-model-api-key or --openai-api-key is required" print_usage exit 1 + else + # If only OpenAI API key is provided, use it as the judge model API key + JUDGE_MODEL_API_KEY="$OPENAI_API_KEY" + # Default to OpenAI API URL if using OpenAI API key + if [[ -z "$JUDGE_MODEL_URL" ]]; then + JUDGE_MODEL_URL="https://api.openai.com/v1" + fi fi # Run judge mode docker run --rm -it \ --network="host" \ -e MODEL_NAME="$MODEL_NAME" \ + -e JUDGE_MODEL_NAME="$JUDGE_MODEL_NAME" \ + -e JUDGE_MODEL_URL="$JUDGE_MODEL_URL" \ + -e JUDGE_MODEL_API_KEY="$JUDGE_MODEL_API_KEY" \ -e OPENAI_API_KEY="$OPENAI_API_KEY" \ -v "$(pwd)/llm_judge:/app/llm_judge" \ liquidai/mt-bench:latest judge \ diff --git a/bin/api/run_openai_judge.sh b/bin/api/run_openai_judge.sh index de8c7d6..c501bf0 100755 --- a/bin/api/run_openai_judge.sh +++ b/bin/api/run_openai_judge.sh @@ -1,16 +1,23 @@ #!/bin/bash print_usage() { - echo "Usage: $0 --openai-api-key --model-name --parallel " + echo "Usage: $0 --model-name [--openai-api-key | (--judge-model-name --judge-model-url --judge-model-api-key )] --parallel " echo echo "Arguments:" - echo " --openai-api-key OpenAI API key" - echo " --model-name Model name" - echo " --parallel Number of parallel processes" + echo " --model-name Model name to be evaluated" + echo " --openai-api-key OpenAI API key (backward compatibility)" + echo " --judge-model-name Name of the judge model (default: gpt-4)" + echo " --judge-model-url Base URL for the judge model API" + echo " --judge-model-api-key API key for the judge model" + echo " --parallel Number of parallel processes" + echo " --ci CI mode" } OPENAI_API_KEY="" MODEL_NAME="" +JUDGE_MODEL_NAME="gpt-4" +JUDGE_MODEL_URL="" +JUDGE_MODEL_API_KEY="" PARALLEL="5" CI="false" @@ -24,6 +31,18 @@ while [[ $# -gt 0 ]]; do MODEL_NAME="$2" shift 2 ;; + --judge-model-name) + JUDGE_MODEL_NAME="$2" + shift 2 + ;; + --judge-model-url) + JUDGE_MODEL_URL="$2" + shift 2 + ;; + --judge-model-api-key) + JUDGE_MODEL_API_KEY="$2" + shift 2 + ;; --parallel) PARALLEL="$2" shift 2 @@ -40,10 +59,26 @@ while [[ $# -gt 0 ]]; do esac done -if [[ -z "$OPENAI_API_KEY" ]]; then - echo "Error: --openai-api-key is required" +# If judge model API key is provided, use it +if [[ -n "$JUDGE_MODEL_API_KEY" ]]; then + # Use the new judge model parameters + if [[ -z "$JUDGE_MODEL_URL" ]]; then + echo "Error: --judge-model-url is required when using --judge-model-api-key" + print_usage + exit 1 + fi +elif [[ -z "$OPENAI_API_KEY" ]]; then + # Fall back to requiring OpenAI API key + echo "Error: Either --judge-model-api-key or --openai-api-key is required" print_usage exit 1 +else + # If only OpenAI API key is provided, use it as the judge model API key + JUDGE_MODEL_API_KEY="$OPENAI_API_KEY" + # Default to OpenAI API URL if using OpenAI API key + if [[ -z "$JUDGE_MODEL_URL" ]]; then + JUDGE_MODEL_URL="https://api.openai.com/v1" + fi fi if [[ -z "$MODEL_NAME" ]]; then @@ -53,15 +88,20 @@ if [[ -z "$MODEL_NAME" ]]; then fi export OPENAI_API_KEY="$OPENAI_API_KEY" +export JUDGE_MODEL_NAME="$JUDGE_MODEL_NAME" +export JUDGE_MODEL_URL="$JUDGE_MODEL_URL" +export JUDGE_MODEL_API_KEY="$JUDGE_MODEL_API_KEY" export PYTHONPATH=. python llm_judge/gen_judgment.py \ --model-list "$MODEL_NAME" \ + --judge-model "$JUDGE_MODEL_NAME" \ --parallel "$PARALLEL" \ --bench-name japanese_mt_bench python llm_judge/show_result.py --model-list "$MODEL_NAME" \ + --judge-model "$JUDGE_MODEL_NAME" \ --ci "$CI" \ --bench-name japanese_mt_bench \ - --input-file llm_judge/data/japanese_mt_bench/model_judgment/gpt-4_$MODEL_NAME.jsonl \ - --output llm_judge/data/japanese_mt_bench/gpt4-score-$MODEL_NAME.json + --input-file "llm_judge/data/japanese_mt_bench/model_judgment/${JUDGE_MODEL_NAME}_$MODEL_NAME.jsonl" \ + --output "llm_judge/data/japanese_mt_bench/${JUDGE_MODEL_NAME}-score-$MODEL_NAME.json" diff --git a/llm_judge/common.py b/llm_judge/common.py index bf4d3f6..1436850 100755 --- a/llm_judge/common.py +++ b/llm_judge/common.py @@ -157,7 +157,7 @@ def load_judge_prompts(prompt_file: str): return prompts -def run_judge_single(question, answer, judge, ref_answer, multi_turn=False, azure=True): +def run_judge_single(question, answer, judge, ref_answer, multi_turn=False, api_dict=None): kwargs = {} model = judge.model_name if ref_answer is not None: @@ -193,12 +193,9 @@ def run_judge_single(question, answer, judge, ref_answer, multi_turn=False, azur conv.append_message(conv.roles[1], None) if model in OPENAI_MODEL_LIST: - if azure: - judgment = chat_completion_openai_azure( - model, conv, temperature=0, max_tokens=2048 - ) - else: - judgment = chat_completion_openai(model, conv, temperature=0, max_tokens=2048) + judgment = chat_completion_openai( + model, conv, temperature=0, max_tokens=2048, api_dict=api_dict + ) elif model in ANTHROPIC_MODEL_LIST: judgment = chat_completion_anthropic( model, conv, temperature=0, max_tokens=1024 @@ -226,7 +223,7 @@ def run_judge_single(question, answer, judge, ref_answer, multi_turn=False, azur return rating_list, user_prompt_list, judgment_list -def play_a_match_single(match: MatchPair, output_file: str, azure=True): +def play_a_match_single(match: MatchSingle, output_file: str, api_dict=None): question, model, answer, judge, ref_answer, multi_turn = ( match.question, match.model, @@ -238,7 +235,7 @@ def play_a_match_single(match: MatchPair, output_file: str, azure=True): if judge.prompt_template["type"] == "single": score_list, user_prompt_list, judgment_list = run_judge_single( - question, answer, judge, ref_answer, multi_turn=multi_turn, azure=azure + question, answer, judge, ref_answer, multi_turn=multi_turn, api_dict=api_dict ) question_id = question["question_id"] @@ -259,7 +256,7 @@ def play_a_match_single(match: MatchPair, output_file: str, azure=True): f"judge: {(judge.model_name, judge.prompt_template['name'])}" ) else: - raise ValueError(f"invalid judge type: {judge['type']}") + raise ValueError(f"invalid judge type: {judge.prompt_template['type']}") if output_file: os.makedirs(os.path.dirname(output_file), exist_ok=True) @@ -269,7 +266,7 @@ def play_a_match_single(match: MatchPair, output_file: str, azure=True): return result -def run_judge_pair(question, answer_a, answer_b, judge, ref_answer, multi_turn=False, azure=True): +def run_judge_pair(question, answer_a, answer_b, judge, ref_answer, multi_turn=False, api_dict=None): kwargs = {} model = judge.model_name if ref_answer is not None: @@ -305,12 +302,9 @@ def run_judge_pair(question, answer_a, answer_b, judge, ref_answer, multi_turn=F if model in OPENAI_MODEL_LIST: conv.set_system_message(system_prompt) - if azure: - judgment = chat_completion_openai_azure( - model, conv, temperature=0, max_tokens=2048 - ) - else: - judgment = chat_completion_openai(model, conv, temperature=0, max_tokens=2048) + judgment = chat_completion_openai( + model, conv, temperature=0, max_tokens=2048, api_dict=api_dict + ) elif model in ANTHROPIC_MODEL_LIST: if system_prompt != "You are a helpful assistant.": user_prompt = "[Instruction]\n" + system_prompt + "\n\n" + user_prompt @@ -352,7 +346,7 @@ def run_judge_pair(question, answer_a, answer_b, judge, ref_answer, multi_turn=F return winner, user_prompt, judgment -def play_a_match_pair(match: MatchPair, output_file: str): +def play_a_match_pair(match: MatchPair, output_file: str, api_dict=None): question, model_1, model_2, answer_1, answer_2, judge, ref_answer, multi_turn = ( match.question, match.model_1, @@ -366,10 +360,10 @@ def play_a_match_pair(match: MatchPair, output_file: str): if judge.prompt_template["type"] == "pairwise": g1_winner, g1_user_prompt, g1_judgment = run_judge_pair( - question, answer_1, answer_2, judge, ref_answer, multi_turn=multi_turn, azure=True + question, answer_1, answer_2, judge, ref_answer, multi_turn=multi_turn, api_dict=api_dict ) g2_winner, g2_user_prompt, g2_judgment = run_judge_pair( - question, answer_2, answer_1, judge, ref_answer, multi_turn=multi_turn, azure=True + question, answer_2, answer_1, judge, ref_answer, multi_turn=multi_turn, api_dict=api_dict ) g1_map = {"A": "model_1", "B": "model_2"} @@ -401,15 +395,19 @@ def play_a_match_pair(match: MatchPair, output_file: str): ) elif judge.prompt_template["type"] == "single": m1_score, m1_user_prompt, m1_judgment = run_judge_single( - question, answer_1, judge, azure=True + question, answer_1, judge, ref_answer, api_dict=api_dict ) m2_score, m2_user_prompt, m2_judgment = run_judge_single( - question, answer_2, judge, azure=True + question, answer_2, judge, ref_answer, api_dict=api_dict ) - if abs(m1_score - m2_score) <= TIE_DELTA: + # Extract first score from lists + m1_first_score = m1_score[0] if isinstance(m1_score, list) else m1_score + m2_first_score = m2_score[0] if isinstance(m2_score, list) else m2_score + + if abs(m1_first_score - m2_first_score) <= TIE_DELTA: winner = "tie" - elif m1_score > m2_score: + elif m1_first_score > m2_first_score: winner = "model_1" else: winner = "model_2" @@ -436,7 +434,7 @@ def play_a_match_pair(match: MatchPair, output_file: str): f"judge: {(judge.model_name, judge.prompt_template['name'])}" ) else: - raise ValueError(f"invalid judge type: {judge['type']}") + raise ValueError(f"invalid judge type: {judge.prompt_template['type']}") if output_file: os.makedirs(os.path.dirname(output_file), exist_ok=True) @@ -520,8 +518,8 @@ def chat_completion_openai_azure(model, conv, temperature, max_tokens, api_dict= except openai.error.InvalidRequestError as e: print(type(e), e) break - except KeyError: - print(response) + except KeyError as e: + print(f"KeyError: {e}") break return output diff --git a/llm_judge/gen_judgment.py b/llm_judge/gen_judgment.py index a4cb2c3..7732f8c 100755 --- a/llm_judge/gen_judgment.py +++ b/llm_judge/gen_judgment.py @@ -184,7 +184,9 @@ def make_judge_single(judge_model, judge_prompts): default="llm_judge/data/judge_prompts.jsonl", help="The file of judge prompts.", ) - parser.add_argument("--judge-model", type=str, default="gpt-4") + parser.add_argument("--judge-model", type=str, default="gpt-4", help="The model used for judging") + parser.add_argument("--judge-model-url", type=str, default="", help="Base URL for the judge model API") + parser.add_argument("--judge-model-api-key", type=str, default="", help="API key for the judge model") parser.add_argument("--baseline-model", type=str, default="gpt-3.5-turbo") parser.add_argument( "--mode", @@ -320,14 +322,22 @@ def make_judge_single(judge_model, judge_prompts): print(json.dumps(match_stat, indent=4, ensure_ascii=False)) # input("Press Enter to confirm...") + # Prepare API dict if judge model URL and API key are provided + api_dict = None + if args.judge_model_url and args.judge_model_api_key: + api_dict = { + "api_base": args.judge_model_url, + "api_key": args.judge_model_api_key + } + # Play matches if args.parallel == 1: for match in tqdm(matches): - play_a_match_func(match, output_file=output_file, azure=args.azure) + play_a_match_func(match, output_file=output_file, api_dict=api_dict) else: def play_a_match_wrapper(match): - play_a_match_func(match, output_file=output_file, azure=args.azure) + play_a_match_func(match, output_file=output_file, api_dict=api_dict) np.random.seed(0) np.random.shuffle(matches) From 030e054ffb866138388cf0683457e2ae2db1a03e Mon Sep 17 00:00:00 2001 From: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com> Date: Thu, 20 Mar 2025 07:11:48 +0000 Subject: [PATCH 02/13] Remove OpenAI API key parameter completely Co-Authored-By: liren@liquid.ai --- bin/api/entrypoint.sh | 16 +++++++------ bin/api/run_docker_eval.sh | 35 +++++++++-------------------- bin/api/run_openai_judge.sh | 37 +++++++++--------------------- llm_judge/common.py | 45 +++---------------------------------- llm_judge/gen_judgment.py | 16 ++++--------- 5 files changed, 37 insertions(+), 112 deletions(-) diff --git a/bin/api/entrypoint.sh b/bin/api/entrypoint.sh index 562069a..ce284a8 100755 --- a/bin/api/entrypoint.sh +++ b/bin/api/entrypoint.sh @@ -51,13 +51,15 @@ elif [[ "$MODE" == "judge" ]]; then JUDGE_MODEL_URL=${JUDGE_MODEL_URL:-""} JUDGE_MODEL_API_KEY=${JUDGE_MODEL_API_KEY:-""} - # If no judge model API key is provided, use OpenAI API key - if [[ -z "$JUDGE_MODEL_API_KEY" && -n "$OPENAI_API_KEY" ]]; then - JUDGE_MODEL_API_KEY="$OPENAI_API_KEY" - # Set default OpenAI URL if none provided - if [[ -z "$JUDGE_MODEL_URL" ]]; then - JUDGE_MODEL_URL="https://api.openai.com/v1" - fi + # Ensure required parameters are set + if [[ -z "$JUDGE_MODEL_API_KEY" ]]; then + echo "Error: JUDGE_MODEL_API_KEY environment variable is required" + exit 1 + fi + + if [[ -z "$JUDGE_MODEL_URL" ]]; then + echo "Error: JUDGE_MODEL_URL environment variable is required" + exit 1 fi while [[ $# -gt 0 ]]; do diff --git a/bin/api/run_docker_eval.sh b/bin/api/run_docker_eval.sh index c2861ed..5a05101 100755 --- a/bin/api/run_docker_eval.sh +++ b/bin/api/run_docker_eval.sh @@ -23,7 +23,6 @@ print_usage() { echo echo "Judge mode options:" echo " --model-name Name of the model to evaluate" - echo " --openai-api-key OpenAI API key for backward compatibility" echo " --judge-model-name Name of the judge model (default: gpt-4)" echo " --judge-model-url Base URL for the judge model API" echo " --judge-model-api-key API key for the judge model" @@ -109,7 +108,6 @@ if [[ "$MODE" == "generate" ]]; then elif [[ "$MODE" == "judge" ]]; then # Process judge mode arguments MODEL_NAME="" - OPENAI_API_KEY="" JUDGE_MODEL_NAME="gpt-4" JUDGE_MODEL_URL="" JUDGE_MODEL_API_KEY="" @@ -122,10 +120,7 @@ elif [[ "$MODE" == "judge" ]]; then MODEL_NAME="$2" shift 2 ;; - --openai-api-key) - OPENAI_API_KEY="$2" - shift 2 - ;; + --judge-model-name) JUDGE_MODEL_NAME="$2" shift 2 @@ -160,26 +155,17 @@ elif [[ "$MODE" == "judge" ]]; then exit 1 fi - # If --judge-model-api-key is provided, use it - if [[ -n "$JUDGE_MODEL_API_KEY" ]]; then - # Use the new judge model parameters - if [[ -z "$JUDGE_MODEL_URL" ]]; then - echo "Error: --judge-model-url is required when using --judge-model-api-key" - print_usage - exit 1 - fi - elif [[ -z "$OPENAI_API_KEY" ]]; then - # Fall back to requiring OpenAI API key - echo "Error: Either --judge-model-api-key or --openai-api-key is required" + # Validate required parameters + if [[ -z "$JUDGE_MODEL_API_KEY" ]]; then + echo "Error: --judge-model-api-key is required" + print_usage + exit 1 + fi + + if [[ -z "$JUDGE_MODEL_URL" ]]; then + echo "Error: --judge-model-url is required" print_usage exit 1 - else - # If only OpenAI API key is provided, use it as the judge model API key - JUDGE_MODEL_API_KEY="$OPENAI_API_KEY" - # Default to OpenAI API URL if using OpenAI API key - if [[ -z "$JUDGE_MODEL_URL" ]]; then - JUDGE_MODEL_URL="https://api.openai.com/v1" - fi fi # Run judge mode @@ -189,7 +175,6 @@ elif [[ "$MODE" == "judge" ]]; then -e JUDGE_MODEL_NAME="$JUDGE_MODEL_NAME" \ -e JUDGE_MODEL_URL="$JUDGE_MODEL_URL" \ -e JUDGE_MODEL_API_KEY="$JUDGE_MODEL_API_KEY" \ - -e OPENAI_API_KEY="$OPENAI_API_KEY" \ -v "$(pwd)/llm_judge:/app/llm_judge" \ liquidai/mt-bench:latest judge \ --parallel "$PARALLEL" \ diff --git a/bin/api/run_openai_judge.sh b/bin/api/run_openai_judge.sh index c501bf0..f9c512e 100755 --- a/bin/api/run_openai_judge.sh +++ b/bin/api/run_openai_judge.sh @@ -1,11 +1,10 @@ #!/bin/bash print_usage() { - echo "Usage: $0 --model-name [--openai-api-key | (--judge-model-name --judge-model-url --judge-model-api-key )] --parallel " + echo "Usage: $0 --model-name --judge-model-name --judge-model-url --judge-model-api-key --parallel " echo echo "Arguments:" echo " --model-name Model name to be evaluated" - echo " --openai-api-key OpenAI API key (backward compatibility)" echo " --judge-model-name Name of the judge model (default: gpt-4)" echo " --judge-model-url Base URL for the judge model API" echo " --judge-model-api-key API key for the judge model" @@ -13,7 +12,6 @@ print_usage() { echo " --ci CI mode" } -OPENAI_API_KEY="" MODEL_NAME="" JUDGE_MODEL_NAME="gpt-4" JUDGE_MODEL_URL="" @@ -23,10 +21,7 @@ CI="false" while [[ $# -gt 0 ]]; do case $1 in - --openai-api-key) - OPENAI_API_KEY="$2" - shift 2 - ;; + --model-name) MODEL_NAME="$2" shift 2 @@ -59,26 +54,17 @@ while [[ $# -gt 0 ]]; do esac done -# If judge model API key is provided, use it -if [[ -n "$JUDGE_MODEL_API_KEY" ]]; then - # Use the new judge model parameters - if [[ -z "$JUDGE_MODEL_URL" ]]; then - echo "Error: --judge-model-url is required when using --judge-model-api-key" - print_usage - exit 1 - fi -elif [[ -z "$OPENAI_API_KEY" ]]; then - # Fall back to requiring OpenAI API key - echo "Error: Either --judge-model-api-key or --openai-api-key is required" +# Validate required parameters +if [[ -z "$JUDGE_MODEL_API_KEY" ]]; then + echo "Error: --judge-model-api-key is required" + print_usage + exit 1 +fi + +if [[ -z "$JUDGE_MODEL_URL" ]]; then + echo "Error: --judge-model-url is required" print_usage exit 1 -else - # If only OpenAI API key is provided, use it as the judge model API key - JUDGE_MODEL_API_KEY="$OPENAI_API_KEY" - # Default to OpenAI API URL if using OpenAI API key - if [[ -z "$JUDGE_MODEL_URL" ]]; then - JUDGE_MODEL_URL="https://api.openai.com/v1" - fi fi if [[ -z "$MODEL_NAME" ]]; then @@ -87,7 +73,6 @@ if [[ -z "$MODEL_NAME" ]]; then exit 1 fi -export OPENAI_API_KEY="$OPENAI_API_KEY" export JUDGE_MODEL_NAME="$JUDGE_MODEL_NAME" export JUDGE_MODEL_URL="$JUDGE_MODEL_URL" export JUDGE_MODEL_API_KEY="$JUDGE_MODEL_API_KEY" diff --git a/llm_judge/common.py b/llm_judge/common.py index 1436850..067313d 100755 --- a/llm_judge/common.py +++ b/llm_judge/common.py @@ -477,52 +477,13 @@ def chat_completion_openai(model, conv, temperature, max_tokens, api_dict=None): return output -def chat_completion_openai_azure(model, conv, temperature, max_tokens, api_dict=None): - openai.api_type = "azure" - openai.api_version = "2023-07-01-preview" - if api_dict is not None: - openai.api_base = api_dict["api_base"] - openai.api_key = api_dict["api_key"] - else: - openai.api_base = os.environ["AZURE_OPENAI_ENDPOINT"] - openai.api_key = os.environ["AZURE_OPENAI_KEY"] +# Remove Azure-specific function as we now use custom judge model parameters +# This function is no longer needed if "azure-" in model: model = model[6:] - output = API_ERROR_OUTPUT - min_sleep_time = 1 - max_sleep_time = API_RETRY_SLEEP - for _ in range(API_MAX_RETRY): - try: - messages = conv.to_openai_api_messages() - response = openai.ChatCompletion.create( - engine=model, - messages=messages, - n=1, - temperature=temperature, - max_tokens=max_tokens, - ) - output = response["choices"][0]["message"]["content"] - break - except openai.error.RateLimitError as e: - print(type(e), e) - sleep_time = random.randint(min_sleep_time, max_sleep_time) - print(f"Sleeping for {sleep_time} seconds") - time.sleep(sleep_time) - max_sleep_time = min(MAX_API_RETRY_SLEEP, max_sleep_time * 2) - min_sleep_time = max_sleep_time // 2 - except openai.error.OpenAIError as e: - print(type(e), e) - time.sleep(API_RETRY_SLEEP) - except openai.error.InvalidRequestError as e: - print(type(e), e) - break - except KeyError as e: - print(f"KeyError: {e}") - break - - return output +# Function body removed as we now use custom judge model parameters def chat_completion_anthropic(model, conv, temperature, max_tokens, api_dict=None): diff --git a/llm_judge/gen_judgment.py b/llm_judge/gen_judgment.py index 7732f8c..961c871 100755 --- a/llm_judge/gen_judgment.py +++ b/llm_judge/gen_judgment.py @@ -213,9 +213,7 @@ def make_judge_single(judge_model, judge_prompts): parser.add_argument( "--first-n", type=int, help="A debug option. Only run the first `n` judgments." ) - parser.add_argument( - "--azure", action="store_true", help="Use Azure API instead of openai.", default=False - ) + # Remove Azure parameter as we now use custom judge model parameters args = parser.parse_args() args.model_list = [model_path.replace("/", "_") for model_path in args.model_list] @@ -250,20 +248,14 @@ def make_judge_single(judge_model, judge_prompts): if args.mode == "single": judges = make_judge_single(args.judge_model, judge_prompts) play_a_match_func = play_a_match_single - if args.azure: - output_file = os.path.join(current_dir, "data", args.bench_name, "model_judgment", f"{args.judge_model}_single_azure.jsonl") - else: - model_suffix = "_".join(args.model_list) - output_file = os.path.join(current_dir, "data", args.bench_name, "model_judgment", f"{args.judge_model}_{model_suffix}.jsonl") + model_suffix = "_".join(args.model_list) + output_file = os.path.join(current_dir, "data", args.bench_name, "model_judgment", f"{args.judge_model}_{model_suffix}.jsonl") make_match_func = make_match_single baseline_model = None else: judges = make_judge_pairwise(args.judge_model, judge_prompts) play_a_match_func = play_a_match_pair - if args.azure: - output_file = os.path.join(current_dir, "data", args.bench_name, "model_judgment", f"{args.judge_model}_pair_azure.jsonl") - else: - output_file = os.path.join(current_dir, "data", args.bench_name, "model_judgment", f"{args.judge_model}_pair.jsonl") + output_file = os.path.join(current_dir, "data", args.bench_name, "model_judgment", f"{args.judge_model}_pair.jsonl") if args.mode == "pairwise-all": make_match_func = make_match_all_pairs baseline_model = None From f8ab3bae4c21bc06acda2f9b502986cc611c920e Mon Sep 17 00:00:00 2001 From: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com> Date: Thu, 20 Mar 2025 07:23:14 +0000 Subject: [PATCH 03/13] Update README and GitHub workflow to use gpt-4o as judge model Co-Authored-By: liren@liquid.ai --- .github/workflows/run-eval.yaml | 2 +- README.md | 34 ++++++++++++++++++++------------- 2 files changed, 22 insertions(+), 14 deletions(-) diff --git a/.github/workflows/run-eval.yaml b/.github/workflows/run-eval.yaml index e7957c2..33d044e 100644 --- a/.github/workflows/run-eval.yaml +++ b/.github/workflows/run-eval.yaml @@ -75,7 +75,7 @@ jobs: run: | bin/api/run_openai_judge.sh \ --model-name "$MODEL_NAME" \ - --judge-model-name "gpt-4" \ + --judge-model-name "gpt-4o" \ --judge-model-url "https://api.openai.com/v1" \ --judge-model-api-key "$OPENAI_API_KEY" \ --parallel 3 diff --git a/README.md b/README.md index 07fc2a5..47e2b8b 100644 --- a/README.md +++ b/README.md @@ -21,17 +21,19 @@ bin/api/run_docker_eval.sh generate \ Results will be output in `llm_judge/data/japanese_mt_bench/model_answer/.jsonl` -2. Run OpenAI judge: +2. Run judge: ```bash bin/api/run_docker_eval.sh judge \ --model-name \ - --openai-api-key + --judge-model-name \ + --judge-model-url \ + --judge-model-api-key ``` -GPT judge results will be output to `llm_judge/data/japanese_mt_bench/model_judgment/gpt-4_.jsonl`. +Judge results will be output to `llm_judge/data/japanese_mt_bench/model_judgment/_.jsonl`. -The final scores will be output in `llm_judge/data/japanese_mt_bench/gpt4-score-.json`. +The final scores will be output in `llm_judge/data/japanese_mt_bench/-score-.json`. ### Examples @@ -45,7 +47,9 @@ bin/api/run_docker_eval.sh generate \ bin/api/run_docker_eval.sh judge \ --model-name lfm-3b-jp \ - --openai-api-key + --judge-model-name gpt-4o \ + --judge-model-url https://api.openai.com/v1 \ + --judge-model-api-key ``` Run eval for `lfm-3b-ichikara` on-prem: @@ -71,7 +75,9 @@ bin/api/run_docker_eval.sh generate \ bin/api/run_docker_eval.sh judge \ --model-name lfm-3b-jp \ - --openai-api-key + --judge-model-name gpt-4o \ + --judge-model-url https://api.openai.com/v1 \ + --judge-model-api-key ``` ## Run Evaluation without Docker @@ -111,16 +117,16 @@ Results will be output in `llm_judge/data/japanese_mt_bench/model_answer/ --openai-api-key +bin/api/run_openai_judge.sh --model-name --judge-model-name --judge-model-url --judge-model-api-key # examples: -bin/api/run_openai_judge.sh --model-name lfm-3b-jp --openai-api-key -bin/api/run_openai_judge.sh --model-name lfm-3b-ichikara --openai-api-key +bin/api/run_openai_judge.sh --model-name lfm-3b-jp --judge-model-name gpt-4o --judge-model-url https://api.openai.com/v1 --judge-model-api-key +bin/api/run_openai_judge.sh --model-name lfm-3b-ichikara --judge-model-name gpt-4o --judge-model-url https://api.openai.com/v1 --judge-model-api-key ``` -GPT judge results will be output to `llm_judge/data/japanese_mt_bench/model_judgment/gpt-4_.jsonl`. +Judge results will be output to `llm_judge/data/japanese_mt_bench/model_judgment/_.jsonl`. -The final scores will be output in `llm_judge/data/japanese_mt_bench/gpt4-score-.json`. +The final scores will be output in `llm_judge/data/japanese_mt_bench/-score-.json`. @@ -148,8 +154,10 @@ This applies to both `bin/api/run_docker_eval.sh judge` and `bin/api/run_openai_ | Argument | Description | Required | | --- | --- | --- | -| `--model-name` | Model name | Yes | -| `--openai-api-key` | OpenAI API key | Yes | +| `--model-name` | Model name to be evaluated | Yes | +| `--judge-model-name` | Name of the judge model (default: gpt-4) | No | +| `--judge-model-url` | Base URL for the judge model API | Yes | +| `--judge-model-api-key` | API key for the judge model | Yes | | `--parallel` | Number of parallel API calls | No. Default to 5. | From 75237681a4d3ade49acee46178b38f5cf86abc51 Mon Sep 17 00:00:00 2001 From: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com> Date: Thu, 20 Mar 2025 07:26:34 +0000 Subject: [PATCH 04/13] Fix CI: Revert to gpt-4 as judge model name Co-Authored-By: liren@liquid.ai --- .github/workflows/run-eval.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/run-eval.yaml b/.github/workflows/run-eval.yaml index 33d044e..e7957c2 100644 --- a/.github/workflows/run-eval.yaml +++ b/.github/workflows/run-eval.yaml @@ -75,7 +75,7 @@ jobs: run: | bin/api/run_openai_judge.sh \ --model-name "$MODEL_NAME" \ - --judge-model-name "gpt-4o" \ + --judge-model-name "gpt-4" \ --judge-model-url "https://api.openai.com/v1" \ --judge-model-api-key "$OPENAI_API_KEY" \ --parallel 3 From cfc8845992479c1ded1f146b901d50b277bbf1c1 Mon Sep 17 00:00:00 2001 From: Liren Tu Date: Wed, 9 Apr 2025 20:06:41 -0400 Subject: [PATCH 05/13] Run judge on gpt-4o-mini --- .github/workflows/run-eval.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/run-eval.yaml b/.github/workflows/run-eval.yaml index e7957c2..c823872 100644 --- a/.github/workflows/run-eval.yaml +++ b/.github/workflows/run-eval.yaml @@ -75,7 +75,7 @@ jobs: run: | bin/api/run_openai_judge.sh \ --model-name "$MODEL_NAME" \ - --judge-model-name "gpt-4" \ + --judge-model-name "gpt-4o-mini" \ --judge-model-url "https://api.openai.com/v1" \ --judge-model-api-key "$OPENAI_API_KEY" \ --parallel 3 From ae180a86f06ed4ce46d311f5706667719d5bf30d Mon Sep 17 00:00:00 2001 From: Liren Tu Date: Wed, 9 Apr 2025 20:51:23 -0400 Subject: [PATCH 06/13] Use gpt reference answer --- README.md | 2 ++ llm_judge/common.py | 4 ++-- model/model_adapter.py | 6 +++--- 3 files changed, 7 insertions(+), 5 deletions(-) diff --git a/README.md b/README.md index 47e2b8b..381e320 100644 --- a/README.md +++ b/README.md @@ -23,6 +23,8 @@ Results will be output in `llm_judge/data/japanese_mt_bench/model_answer/ \ diff --git a/llm_judge/common.py b/llm_judge/common.py index 067313d..ff56792 100755 --- a/llm_judge/common.py +++ b/llm_judge/common.py @@ -719,8 +719,8 @@ def check_data(questions, model_answers, ref_answers, models, judges): if q["category"] not in NEED_REF_CATS: continue assert ( - q["question_id"] in ref_answers[jg.model_name] - ), f"Missing reference answer to Question {q['question_id']} for judge {jg.model_name}" + q["question_id"] in ref_answers['gpt-4'] + ), f"Missing reference answer to Question {q['question_id']} from 'gpt-4'" def get_model_list(answer_dir): diff --git a/model/model_adapter.py b/model/model_adapter.py index 688960e..ce099e8 100755 --- a/model/model_adapter.py +++ b/model/model_adapter.py @@ -1076,7 +1076,7 @@ class ChatGPTAdapter(BaseModelAdapter): """The model adapter for ChatGPT""" def match(self, model_path: str): - return model_path in OPENAI_MODEL_LIST + return model_path.startswith("gpt") def load_model(self, model_path: str, from_pretrained_kwargs: dict): raise NotImplementedError() @@ -1089,7 +1089,7 @@ class AzureOpenAIAdapter(BaseModelAdapter): """The model adapter for Azure OpenAI""" def match(self, model_path: str): - return model_path in ("azure-gpt-35-turbo", "azure-gpt-4") + return model_path.startswith("azure") def load_model(self, model_path: str, from_pretrained_kwargs: dict): raise NotImplementedError() @@ -1118,7 +1118,7 @@ class ClaudeAdapter(BaseModelAdapter): """The model adapter for Claude""" def match(self, model_path: str): - return model_path in ANTHROPIC_MODEL_LIST + return model_path.startswith("claude") def load_model(self, model_path: str, from_pretrained_kwargs: dict): raise NotImplementedError() From 6c81de7fad1321f9f8cd3c087fa67b35f347b033 Mon Sep 17 00:00:00 2001 From: Liren Tu Date: Thu, 10 Apr 2025 01:52:39 -0400 Subject: [PATCH 07/13] Fix more key errors --- llm_judge/gen_judgment.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/llm_judge/gen_judgment.py b/llm_judge/gen_judgment.py index 961c871..260cf86 100755 --- a/llm_judge/gen_judgment.py +++ b/llm_judge/gen_judgment.py @@ -47,7 +47,7 @@ def make_match( a_1 = model_answers[m_1][q_id] a_2 = model_answers[baseline_model][q_id] if ref_answers is not None: - ref = ref_answers[judge.model_name][q_id] + ref = ref_answers['gpt-4'][q_id] match = MatchPair( dict(q), m_1, @@ -87,7 +87,7 @@ def make_match_all_pairs( a_1 = model_answers[m_1][q_id] a_2 = model_answers[m_2][q_id] if ref_answers is not None: - ref = ref_answers[judge.model_name][q_id] + ref = ref_answers['gpt-4'][q_id] match = MatchPair( dict(q), m_1, @@ -127,7 +127,7 @@ def make_match_single( print(f"Model {m} does not have answer for question {q_id}") continue if ref_answers is not None: - ref = ref_answers[judge.model_name][q_id] + ref = ref_answers['gpt-4'][q_id] matches.append( MatchSingle( dict(q), m, a, judge, ref_answer=ref, multi_turn=multi_turn From df7362873267fd2abf5e22643f37ecd0b2a0eb7e Mon Sep 17 00:00:00 2001 From: Liren Tu Date: Thu, 10 Apr 2025 02:17:51 -0400 Subject: [PATCH 08/13] Use openai chat completions --- llm_judge/common.py | 99 ++++----------------------------------------- 1 file changed, 8 insertions(+), 91 deletions(-) diff --git a/llm_judge/common.py b/llm_judge/common.py index ff56792..054a59d 100755 --- a/llm_judge/common.py +++ b/llm_judge/common.py @@ -13,16 +13,11 @@ import random import openai -import anthropic from dotenv import load_dotenv load_dotenv() -from model.model_adapter import ( - get_conversation_template, - ANTHROPIC_MODEL_LIST, - OPENAI_MODEL_LIST, -) +from model.model_adapter import get_conversation_template # API setting constants API_MAX_RETRY = 16 @@ -192,16 +187,9 @@ def run_judge_single(question, answer, judge, ref_answer, multi_turn=False, api_ conv.append_message(conv.roles[0], user_prompt) conv.append_message(conv.roles[1], None) - if model in OPENAI_MODEL_LIST: - judgment = chat_completion_openai( - model, conv, temperature=0, max_tokens=2048, api_dict=api_dict - ) - elif model in ANTHROPIC_MODEL_LIST: - judgment = chat_completion_anthropic( - model, conv, temperature=0, max_tokens=1024 - ) - else: - raise ValueError(f"Invalid judge model name: {model}") + judgment = chat_completion_openai( + model, conv, temperature=0, max_tokens=2048, api_dict=api_dict + ) if judge.prompt_template["output_format"] == "[[rating]]": match = re.search(one_score_pattern, judgment) @@ -300,20 +288,10 @@ def run_judge_pair(question, answer_a, answer_b, judge, ref_answer, multi_turn=F conv.append_message(conv.roles[0], user_prompt) conv.append_message(conv.roles[1], None) - if model in OPENAI_MODEL_LIST: - conv.set_system_message(system_prompt) - judgment = chat_completion_openai( - model, conv, temperature=0, max_tokens=2048, api_dict=api_dict - ) - elif model in ANTHROPIC_MODEL_LIST: - if system_prompt != "You are a helpful assistant.": - user_prompt = "[Instruction]\n" + system_prompt + "\n\n" + user_prompt - conv.messages[0][1] = user_prompt - judgment = chat_completion_anthropic( - model, conv, temperature=0, max_tokens=1024 - ) - else: - raise ValueError(f"Invalid judge model name: {model}") + conv.set_system_message(system_prompt) + judgment = chat_completion_openai( + model, conv, temperature=0, max_tokens=2048, api_dict=api_dict + ) if judge.prompt_template["output_format"] == "[[A]]": if "[[A]]" in judgment: @@ -477,67 +455,6 @@ def chat_completion_openai(model, conv, temperature, max_tokens, api_dict=None): return output -# Remove Azure-specific function as we now use custom judge model parameters -# This function is no longer needed - - if "azure-" in model: - model = model[6:] - -# Function body removed as we now use custom judge model parameters - - -def chat_completion_anthropic(model, conv, temperature, max_tokens, api_dict=None): - if api_dict is not None and "api_key" in api_dict: - api_key = api_dict["api_key"] - else: - api_key = os.environ["ANTHROPIC_API_KEY"] - - output = API_ERROR_OUTPUT - for _ in range(API_MAX_RETRY): - try: - c = anthropic.Anthropic(api_key=api_key) - prompt = conv.get_prompt() - response = c.completions.create( - model=model, - prompt=prompt, - stop_sequences=[anthropic.HUMAN_PROMPT], - max_tokens_to_sample=max_tokens, - temperature=temperature, - ) - output = response.completion - break - except anthropic.APIError as e: - print(type(e), e) - time.sleep(API_RETRY_SLEEP) - return output.strip() - - -def chat_completion_palm(chat_state, model, conv, temperature, max_tokens): - from serve.api_provider import init_palm_chat - - assert model == "palm-2-chat-bison-001" - - if chat_state is None: - chat_state = init_palm_chat("chat-bison@001") - - parameters = { - "temperature": temperature, - "top_p": 0.8, - "top_k": 40, - "max_output_tokens": max_tokens, - } - output = API_ERROR_OUTPUT - for _ in range(API_MAX_RETRY): - try: - response = chat_state.send_message(conv.messages[-2][1], **parameters) - output = response.text - break - except Exception as e: - print(type(e), e) - time.sleep(API_RETRY_SLEEP) - return chat_state, output - - def normalize_game_key_single(gamekey, result): """Make the model names sorted in a game key.""" qid, model_1, model_2 = gamekey From ce03360634eb87665dcb4534264dcf65c81cb359 Mon Sep 17 00:00:00 2001 From: Liren Tu Date: Thu, 10 Apr 2025 02:27:02 -0400 Subject: [PATCH 09/13] Judge the model by itself --- .github/workflows/run-eval.yaml | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/.github/workflows/run-eval.yaml b/.github/workflows/run-eval.yaml index c823872..0783398 100644 --- a/.github/workflows/run-eval.yaml +++ b/.github/workflows/run-eval.yaml @@ -71,13 +71,14 @@ jobs: env: MODEL_NAME: lfm-3b MODEL_URL: ${{ vars.MODEL_URL }} - OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }} + MODEL_API_KEY: ${{ secrets.MODEL_API_KEY }} run: | + # let the model judge itself against the GPT-4 answers bin/api/run_openai_judge.sh \ --model-name "$MODEL_NAME" \ - --judge-model-name "gpt-4o-mini" \ - --judge-model-url "https://api.openai.com/v1" \ - --judge-model-api-key "$OPENAI_API_KEY" \ + --judge-model-name "$MODEL_NAME" \ + --judge-model-url "$MODEL_URL" \ + --judge-model-api-key "$MODEL_API_KEY" \ --parallel 3 - name: Process Judge Results From fb491d772fcf46ad955ca61f7e998d94095f5af8 Mon Sep 17 00:00:00 2001 From: Liren Tu Date: Thu, 10 Apr 2025 03:02:42 -0400 Subject: [PATCH 10/13] Log api base and key --- llm_judge/common.py | 8 ++++++-- llm_judge/gen_judgment.py | 14 +++++++++----- 2 files changed, 15 insertions(+), 7 deletions(-) diff --git a/llm_judge/common.py b/llm_judge/common.py index 054a59d..1565f36 100755 --- a/llm_judge/common.py +++ b/llm_judge/common.py @@ -424,8 +424,12 @@ def play_a_match_pair(match: MatchPair, output_file: str, api_dict=None): def chat_completion_openai(model, conv, temperature, max_tokens, api_dict=None): if api_dict is not None: - openai.api_base = api_dict["api_base"] - openai.api_key = api_dict["api_key"] + if "api_base" in api_dict: + print(f"Using API base: {api_dict['api_base']}") + openai.api_base = api_dict["api_base"] + if "api_key" in api_dict: + print(f"Using API key: {api_dict['api_key'][0:4]}***") + openai.api_key = api_dict["api_key"] output = API_ERROR_OUTPUT min_sleep_time = 1 max_sleep_time = API_RETRY_SLEEP diff --git a/llm_judge/gen_judgment.py b/llm_judge/gen_judgment.py index 260cf86..9fb5822 100755 --- a/llm_judge/gen_judgment.py +++ b/llm_judge/gen_judgment.py @@ -316,11 +316,15 @@ def make_judge_single(judge_model, judge_prompts): # Prepare API dict if judge model URL and API key are provided api_dict = None - if args.judge_model_url and args.judge_model_api_key: - api_dict = { - "api_base": args.judge_model_url, - "api_key": args.judge_model_api_key - } + + if args.judge_model_url or args.judge_model_api_key: + api_dict = {} + if args.judge_model_url: + print(f"Using custom judge model URL: {args.judge_model_url}") + api_dict["api_base"] = args.judge_model_url + if args.judge_model_api_key: + print(f"Using custom judge model API key: {args.judge_model_api_key[0:4]}***") + api_dict["api_key"] = args.judge_model_api_key # Play matches if args.parallel == 1: From 8fe17345c2e75845ab1e5c91f45596ccbc7e07b4 Mon Sep 17 00:00:00 2001 From: Liren Tu Date: Thu, 10 Apr 2025 03:26:15 -0400 Subject: [PATCH 11/13] Use lfm-7b as judge --- .github/workflows/run-eval.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/run-eval.yaml b/.github/workflows/run-eval.yaml index 0783398..abce77a 100644 --- a/.github/workflows/run-eval.yaml +++ b/.github/workflows/run-eval.yaml @@ -76,7 +76,7 @@ jobs: # let the model judge itself against the GPT-4 answers bin/api/run_openai_judge.sh \ --model-name "$MODEL_NAME" \ - --judge-model-name "$MODEL_NAME" \ + --judge-model-name "lfm-7b" \ --judge-model-url "$MODEL_URL" \ --judge-model-api-key "$MODEL_API_KEY" \ --parallel 3 From 338b54b1555577177aa16e6d753564156e3978d1 Mon Sep 17 00:00:00 2001 From: Liren Tu Date: Thu, 10 Apr 2025 03:26:49 -0400 Subject: [PATCH 12/13] Add more typing --- conversation.py | 2 +- llm_judge/common.py | 10 +++++++--- llm_judge/gen_judgment.py | 14 +++++++------- 3 files changed, 15 insertions(+), 11 deletions(-) diff --git a/conversation.py b/conversation.py index 45c05dc..f071516 100755 --- a/conversation.py +++ b/conversation.py @@ -379,7 +379,7 @@ def register_conv_template(template: Conversation, override: bool = False): def get_conv_template(name: str) -> Conversation: """Get a conversation template.""" - print("Using template: ", name) + print("Using template:", name) return conv_templates[name].copy() diff --git a/llm_judge/common.py b/llm_judge/common.py index 1565f36..98cce59 100755 --- a/llm_judge/common.py +++ b/llm_judge/common.py @@ -9,7 +9,7 @@ import os import re import time -from typing import Optional +from typing import Any, Optional import random import openai @@ -211,7 +211,9 @@ def run_judge_single(question, answer, judge, ref_answer, multi_turn=False, api_ return rating_list, user_prompt_list, judgment_list -def play_a_match_single(match: MatchSingle, output_file: str, api_dict=None): +def play_a_match_single( + match: MatchSingle, output_file: str, api_dict: dict[str, Any] | None = None +) -> dict[str, Any]: question, model, answer, judge, ref_answer, multi_turn = ( match.question, match.model, @@ -324,7 +326,9 @@ def run_judge_pair(question, answer_a, answer_b, judge, ref_answer, multi_turn=F return winner, user_prompt, judgment -def play_a_match_pair(match: MatchPair, output_file: str, api_dict=None): +def play_a_match_pair( + match: MatchPair, output_file: str, api_dict: dict[str, Any] | None = None +) -> dict[str, Any]: question, model_1, model_2, answer_1, answer_2, judge, ref_answer, multi_turn = ( match.question, match.model_1, diff --git a/llm_judge/gen_judgment.py b/llm_judge/gen_judgment.py index 9fb5822..91c74f9 100755 --- a/llm_judge/gen_judgment.py +++ b/llm_judge/gen_judgment.py @@ -6,6 +6,7 @@ import json import os from concurrent.futures import ThreadPoolExecutor +from typing import Any, Callable import numpy as np from tqdm import tqdm @@ -247,15 +248,15 @@ def make_judge_single(judge_model, judge_prompts): current_dir = os.path.dirname(os.path.abspath(__file__)) if args.mode == "single": judges = make_judge_single(args.judge_model, judge_prompts) - play_a_match_func = play_a_match_single + play_a_match_func: Callable[[MatchSingle | MatchPair, str, dict[str, Any] | None], dict[str, Any]] = play_a_match_single model_suffix = "_".join(args.model_list) - output_file = os.path.join(current_dir, "data", args.bench_name, "model_judgment", f"{args.judge_model}_{model_suffix}.jsonl") + output_file = str(os.path.join(current_dir, "data", args.bench_name, "model_judgment", f"{args.judge_model}_{model_suffix}.jsonl")) make_match_func = make_match_single baseline_model = None else: judges = make_judge_pairwise(args.judge_model, judge_prompts) - play_a_match_func = play_a_match_pair - output_file = os.path.join(current_dir, "data", args.bench_name, "model_judgment", f"{args.judge_model}_pair.jsonl") + play_a_match_func: Callable[[MatchSingle | MatchPair, str, dict[str, Any] | None], dict[str, Any]] = play_a_match_pair + output_file = str(os.path.join(current_dir, "data", args.bench_name, "model_judgment", f"{args.judge_model}_pair.jsonl")) if args.mode == "pairwise-all": make_match_func = make_match_all_pairs baseline_model = None @@ -331,9 +332,8 @@ def make_judge_single(judge_model, judge_prompts): for match in tqdm(matches): play_a_match_func(match, output_file=output_file, api_dict=api_dict) else: - - def play_a_match_wrapper(match): - play_a_match_func(match, output_file=output_file, api_dict=api_dict) + def play_a_match_wrapper(input_match): + play_a_match_func(input_match, output_file=output_file, api_dict=api_dict) np.random.seed(0) np.random.shuffle(matches) From e833cd3f99cebf1920f484aadabbbf8797d3cd77 Mon Sep 17 00:00:00 2001 From: Liren Tu Date: Thu, 10 Apr 2025 03:56:33 -0400 Subject: [PATCH 13/13] Fix arguments and update logging --- README.md | 19 ++++++++++++++++--- bin/api/entrypoint.sh | 4 ++-- bin/api/run_openai_judge.sh | 32 ++++++++++++++------------------ llm_judge/common.py | 2 -- llm_judge/gen_judgment.py | 18 ++++++++++++------ llm_judge/show_result.py | 10 +++++----- 6 files changed, 49 insertions(+), 36 deletions(-) diff --git a/README.md b/README.md index 381e320..ed31242 100644 --- a/README.md +++ b/README.md @@ -119,11 +119,24 @@ Results will be output in `llm_judge/data/japanese_mt_bench/model_answer/ --judge-model-name --judge-model-url --judge-model-api-key +bin/api/run_openai_judge.sh \ + --model-name \ + --judge-model-name \ + --judge-model-url \ + --judge-model-api-key # examples: -bin/api/run_openai_judge.sh --model-name lfm-3b-jp --judge-model-name gpt-4o --judge-model-url https://api.openai.com/v1 --judge-model-api-key -bin/api/run_openai_judge.sh --model-name lfm-3b-ichikara --judge-model-name gpt-4o --judge-model-url https://api.openai.com/v1 --judge-model-api-key +bin/api/run_openai_judge.sh \ + --model-name lfm-3b-jp \ + --judge-model-name gpt-4o \ + --judge-model-url https://api.openai.com/v1 \ + --judge-model-api-key + +bin/api/run_openai_judge.sh \ + --model-name lfm-3b-ichikara \ + --judge-model-name gpt-4o \ + --judge-model-url https://api.openai.com/v1 \ + --judge-model-api-key ``` Judge results will be output to `llm_judge/data/japanese_mt_bench/model_judgment/_.jsonl`. diff --git a/bin/api/entrypoint.sh b/bin/api/entrypoint.sh index ce284a8..844d97c 100755 --- a/bin/api/entrypoint.sh +++ b/bin/api/entrypoint.sh @@ -81,7 +81,7 @@ elif [[ "$MODE" == "judge" ]]; then # Generate judgments python llm_judge/gen_judgment.py \ --model-list "$MODEL_NAME" \ - --judge-model "$JUDGE_MODEL_NAME" \ + --judge-model-name "$JUDGE_MODEL_NAME" \ --judge-model-url "$JUDGE_MODEL_URL" \ --judge-model-api-key "$JUDGE_MODEL_API_KEY" \ --parallel "$PARALLEL" \ @@ -90,7 +90,7 @@ elif [[ "$MODE" == "judge" ]]; then # Show results python llm_judge/show_result.py \ --model-list "$MODEL_NAME" \ - --judge-model "$JUDGE_MODEL_NAME" \ + --judge-model-name "$JUDGE_MODEL_NAME" \ --ci "$CI" \ --bench-name japanese_mt_bench \ --input-file "llm_judge/data/japanese_mt_bench/model_judgment/${JUDGE_MODEL_NAME}_$MODEL_NAME.jsonl" \ diff --git a/bin/api/run_openai_judge.sh b/bin/api/run_openai_judge.sh index f9c512e..28e5af1 100755 --- a/bin/api/run_openai_judge.sh +++ b/bin/api/run_openai_judge.sh @@ -1,15 +1,15 @@ #!/bin/bash print_usage() { - echo "Usage: $0 --model-name --judge-model-name --judge-model-url --judge-model-api-key --parallel " + echo "Usage: $0 --model-name [--judge-model-name ] [--judge-model-url ] --judge-model-api-key [--parallel ]" echo echo "Arguments:" - echo " --model-name Model name to be evaluated" + echo " --model-name Model name to be evaluated (required)" echo " --judge-model-name Name of the judge model (default: gpt-4)" - echo " --judge-model-url Base URL for the judge model API" - echo " --judge-model-api-key API key for the judge model" - echo " --parallel Number of parallel processes" - echo " --ci CI mode" + echo " --judge-model-url Base URL for the judge model API (default: https://api.openai.com/v1)" + echo " --judge-model-api-key API key for the judge model (required)" + echo " --parallel Number of parallel processes (default: 5)" + echo " --ci CI mode (default: false)" } MODEL_NAME="" @@ -55,20 +55,14 @@ while [[ $# -gt 0 ]]; do done # Validate required parameters -if [[ -z "$JUDGE_MODEL_API_KEY" ]]; then - echo "Error: --judge-model-api-key is required" - print_usage - exit 1 -fi - -if [[ -z "$JUDGE_MODEL_URL" ]]; then - echo "Error: --judge-model-url is required" +if [[ -z "$MODEL_NAME" ]]; then + echo "Error: --model-name is required" print_usage exit 1 fi -if [[ -z "$MODEL_NAME" ]]; then - echo "Error: --model-name is required" +if [[ -z "$JUDGE_MODEL_API_KEY" ]]; then + echo "Error: --judge-model-api-key is required" print_usage exit 1 fi @@ -80,12 +74,14 @@ export PYTHONPATH=. python llm_judge/gen_judgment.py \ --model-list "$MODEL_NAME" \ - --judge-model "$JUDGE_MODEL_NAME" \ + --judge-model-name "$JUDGE_MODEL_NAME" \ + --judge-model-url "$JUDGE_MODEL_URL" \ + --judge-model-api-key "$JUDGE_MODEL_API_KEY" \ --parallel "$PARALLEL" \ --bench-name japanese_mt_bench python llm_judge/show_result.py --model-list "$MODEL_NAME" \ - --judge-model "$JUDGE_MODEL_NAME" \ + --judge-model-name "$JUDGE_MODEL_NAME" \ --ci "$CI" \ --bench-name japanese_mt_bench \ --input-file "llm_judge/data/japanese_mt_bench/model_judgment/${JUDGE_MODEL_NAME}_$MODEL_NAME.jsonl" \ diff --git a/llm_judge/common.py b/llm_judge/common.py index 98cce59..c990892 100755 --- a/llm_judge/common.py +++ b/llm_judge/common.py @@ -429,10 +429,8 @@ def play_a_match_pair( def chat_completion_openai(model, conv, temperature, max_tokens, api_dict=None): if api_dict is not None: if "api_base" in api_dict: - print(f"Using API base: {api_dict['api_base']}") openai.api_base = api_dict["api_base"] if "api_key" in api_dict: - print(f"Using API key: {api_dict['api_key'][0:4]}***") openai.api_key = api_dict["api_key"] output = API_ERROR_OUTPUT min_sleep_time = 1 diff --git a/llm_judge/gen_judgment.py b/llm_judge/gen_judgment.py index 91c74f9..b900933 100755 --- a/llm_judge/gen_judgment.py +++ b/llm_judge/gen_judgment.py @@ -185,7 +185,7 @@ def make_judge_single(judge_model, judge_prompts): default="llm_judge/data/judge_prompts.jsonl", help="The file of judge prompts.", ) - parser.add_argument("--judge-model", type=str, default="gpt-4", help="The model used for judging") + parser.add_argument("--judge-model-name", type=str, default="gpt-4", help="The model used for judging") parser.add_argument("--judge-model-url", type=str, default="", help="Base URL for the judge model API") parser.add_argument("--judge-model-api-key", type=str, default="", help="API key for the judge model") parser.add_argument("--baseline-model", type=str, default="gpt-3.5-turbo") @@ -216,6 +216,12 @@ def make_judge_single(judge_model, judge_prompts): ) # Remove Azure parameter as we now use custom judge model parameters args = parser.parse_args() + print(f"Model name: {args.model_list}") + print(f"Judge model name: {args.judge_model_name}") + if args.judge_model_url: + print(f"Judge model URL: {args.judge_model_url}") + if args.judge_model_api_key: + print(f"Judge model API key: {args.judge_model_api_key[0:4]}***") args.model_list = [model_path.replace("/", "_") for model_path in args.model_list] @@ -247,16 +253,16 @@ def make_judge_single(judge_model, judge_prompts): current_dir = os.path.dirname(os.path.abspath(__file__)) if args.mode == "single": - judges = make_judge_single(args.judge_model, judge_prompts) + judges = make_judge_single(args.judge_model_name, judge_prompts) play_a_match_func: Callable[[MatchSingle | MatchPair, str, dict[str, Any] | None], dict[str, Any]] = play_a_match_single model_suffix = "_".join(args.model_list) - output_file = str(os.path.join(current_dir, "data", args.bench_name, "model_judgment", f"{args.judge_model}_{model_suffix}.jsonl")) + output_file = str(os.path.join(current_dir, "data", args.bench_name, "model_judgment", f"{args.judge_model_name}_{model_suffix}.jsonl")) make_match_func = make_match_single baseline_model = None else: - judges = make_judge_pairwise(args.judge_model, judge_prompts) + judges = make_judge_pairwise(args.judge_model_name, judge_prompts) play_a_match_func: Callable[[MatchSingle | MatchPair, str, dict[str, Any] | None], dict[str, Any]] = play_a_match_pair - output_file = str(os.path.join(current_dir, "data", args.bench_name, "model_judgment", f"{args.judge_model}_pair.jsonl")) + output_file = str(os.path.join(current_dir, "data", args.bench_name, "model_judgment", f"{args.judge_model_name}_pair.jsonl")) if args.mode == "pairwise-all": make_match_func = make_match_all_pairs baseline_model = None @@ -303,7 +309,7 @@ def make_judge_single(judge_model, judge_prompts): match_stat = {} match_stat["bench_name"] = args.bench_name match_stat["mode"] = args.mode - match_stat["judge"] = args.judge_model + match_stat["judge"] = args.judge_model_name match_stat["baseline"] = baseline_model match_stat["model_list"] = models match_stat["total_num_questions"] = len(questions) diff --git a/llm_judge/show_result.py b/llm_judge/show_result.py index 4190c9a..2cb620b 100755 --- a/llm_judge/show_result.py +++ b/llm_judge/show_result.py @@ -26,9 +26,9 @@ def calculate_averages(scores): def display_result_single(args): if args.input_file is None: if args.azure: - input_file = f"data/{args.bench_name}/model_judgment/{args.judge_model}_single_azure.jsonl" + input_file = f"data/{args.bench_name}/model_judgment/{args.judge_model_name}_single_azure.jsonl" else: - input_file = f"data/{args.bench_name}/model_judgment/{args.judge_model}_single.jsonl" + input_file = f"data/{args.bench_name}/model_judgment/{args.judge_model_name}_single.jsonl" else: input_file = args.input_file @@ -115,9 +115,9 @@ def score_category(category): def display_result_pairwise(args): if args.input_file is None: if args.azure: - input_file = f"data/{args.bench_name}/model_judgment/{args.judge_model}_pair_azure.jsonl" + input_file = f"data/{args.bench_name}/model_judgment/{args.judge_model_name}_pair_azure.jsonl" else: - input_file = f"data/{args.bench_name}/model_judgment/{args.judge_model}_pair.jsonl" + input_file = f"data/{args.bench_name}/model_judgment/{args.judge_model_name}_pair.jsonl" else: input_file = args.input_file @@ -167,7 +167,7 @@ def display_result_pairwise(args): parser = argparse.ArgumentParser() parser.add_argument("--bench-name", type=str, default="mt_bench") parser.add_argument("--input-file", type=str) - parser.add_argument("--judge-model", type=str, default="gpt-4") + parser.add_argument("--judge-model-name", type=str, default="gpt-4") parser.add_argument("--output-file", type=str) parser.add_argument("--baseline-model", type=str, default="gpt-3.5-turbo") parser.add_argument(