99import json
1010from typing import Any , Dict , List
1111
12- from fireworks import LLM
12+ import litellm
1313
1414from eval_protocol .models import EvaluateResult , EvaluationRow , Message , MetricResult
1515from eval_protocol .pytest import default_single_turn_rollout_processor , evaluation_test
1616
17- judge_llm = LLM (model = "accounts/fireworks/models/kimi-k2-instruct" , deployment_type = "serverless" )
17+ # Configure the judge model for LiteLLM
18+ JUDGE_MODEL = "fireworks_ai/accounts/fireworks/models/kimi-k2-instruct"
1819
1920
2021def hallucination_dataset_adapter (data : List [Dict [str , Any ]]) -> List [EvaluationRow ]:
@@ -31,7 +32,7 @@ def hallucination_dataset_adapter(data: List[Dict[str, Any]]) -> List[Evaluation
3132@evaluation_test (
3233 input_dataset = ["tests/pytest/data/halueval_sample_dataset.jsonl" ],
3334 dataset_adapter = hallucination_dataset_adapter ,
34- model = ["accounts/fireworks/models/kimi-k2-instruct" ],
35+ model = ["fireworks_ai/accounts/fireworks/models/kimi-k2-instruct" ],
3536 rollout_input_params = [{"temperature" : 0.0 , "max_tokens" : 512 }],
3637 rollout_processor = default_single_turn_rollout_processor ,
3738 threshold_of_success = 0.33 ,
@@ -77,7 +78,8 @@ def test_hallucination_detection(row: EvaluationRow) -> EvaluationRow:
7778 """
7879
7980 try :
80- response = judge_llm .chat .completions .create (
81+ response = litellm .completion (
82+ model = JUDGE_MODEL ,
8183 messages = [{"role" : "system" , "content" : system_prompt }, {"role" : "user" , "content" : user_prompt }],
8284 temperature = 0.1 ,
8385 max_tokens = 500 ,
0 commit comments