99import json
1010from typing import Any , Dict , List
1111
12- from fireworks import LLM
12+ import litellm
1313
1414from eval_protocol .models import EvaluateResult , EvaluationRow , Message , MetricResult
1515from eval_protocol .pytest import default_single_turn_rollout_processor , evaluation_test
1616
17- judge_llm = LLM (model = "accounts/fireworks/models/kimi-k2-instruct" , deployment_type = "serverless" )
17+ # Configure the judge model for LiteLLM
18+ JUDGE_MODEL = "fireworks_ai/accounts/fireworks/models/kimi-k2-instruct"
1819
1920
2021def hallucination_dataset_adapter (data : List [Dict [str , Any ]]) -> List [EvaluationRow ]:
@@ -31,7 +32,7 @@ def hallucination_dataset_adapter(data: List[Dict[str, Any]]) -> List[Evaluation
3132@evaluation_test (
3233 input_dataset = ["tests/pytest/data/halueval_sample_dataset.jsonl" ],
3334 dataset_adapter = hallucination_dataset_adapter ,
34- model = ["accounts/fireworks/models/kimi-k2-instruct" ],
35+ model = ["fireworks_ai/accounts/fireworks/models/kimi-k2-instruct" ],
3536 rollout_input_params = [{"temperature" : 0.0 , "max_tokens" : 512 }],
3637 rollout_processor = default_single_turn_rollout_processor ,
3738 threshold_of_success = 0.33 ,
@@ -77,7 +78,8 @@ def test_hallucination_detection(row: EvaluationRow) -> EvaluationRow:
7778 """
7879
7980 try :
80- response = judge_llm .chat .completions .create (
81+ response = litellm .completion (
82+ model = JUDGE_MODEL ,
8183 messages = [{"role" : "system" , "content" : system_prompt }, {"role" : "user" , "content" : user_prompt }],
8284 temperature = 0.1 ,
8385 max_tokens = 500 ,
0 commit comments