From ef3080b922fa95b0ebcc3fdb2cc07bb63fae3db7 Mon Sep 17 00:00:00 2001
From: Damian Stachura <damian.stachura@evidenceprime.com>
Date: Tue, 11 Nov 2025 02:51:44 +0100
Subject: [PATCH 1/3] Initial commit with automatic conversion script from HELM
 eval to unified schema

---
 README.md                                     |   47 +-
 eval_converters/helm/adapter.py               |  631 +--
 eval_converters/helm/converter.py             |   93 +
 eval_converters/helm/utils.py                 |   62 -
 pyproject.toml                                |    2 +-
 schema/eval.schema.json                       |   13 +-
 schema/eval_types.py                          |   10 +-
 ...rai_pythia-1b-v0_hellaswag_1751729998.json | 1317 ++++++
 ...penai_gpt2_mmlu_philosophy_1762354922.json | 2445 +++++++++++
 .../per_instance_stats.json                   | 3742 +++++++++++++++++
 .../run_spec.json                             |   73 +
 .../scenario.json                             |    9 +
 .../scenario_state.json                       |  844 ++++
 .../stats.json                                | 2850 +++++++++++++
 14 files changed, 11654 insertions(+), 484 deletions(-)
 create mode 100644 tests/data/helm/helm_eleutherai_pythia-1b-v0_hellaswag_1751729998.json
 create mode 100644 tests/data/helm/helm_openai_gpt2_mmlu_philosophy_1762354922.json
 create mode 100644 tests/data/helm/mmlu:subject=philosophy,method=multiple_choice_joint,model=openai_gpt2/per_instance_stats.json
 create mode 100644 tests/data/helm/mmlu:subject=philosophy,method=multiple_choice_joint,model=openai_gpt2/run_spec.json
 create mode 100644 tests/data/helm/mmlu:subject=philosophy,method=multiple_choice_joint,model=openai_gpt2/scenario.json
 create mode 100644 tests/data/helm/mmlu:subject=philosophy,method=multiple_choice_joint,model=openai_gpt2/scenario_state.json
 create mode 100644 tests/data/helm/mmlu:subject=philosophy,method=multiple_choice_joint,model=openai_gpt2/stats.json

diff --git a/README.md b/README.md
index 7b5cc18..1c6f0dd 100644
--- a/README.md
+++ b/README.md
@@ -26,7 +26,7 @@ Convert eval log from Inspect AI into json format with following command:
 uv run inspect log convert path_to_eval_file_generated_by_inspect --to json --output-dir inspect_json
 ```
 
-Then we can convert Inspect evaluation log into unified schema via eval_converters/inspect/converter.py. Conversion for example data can be generated via below script: 
+Then we can convert Inspect evaluation log into unified schema via `eval_converters/inspect/converter.py`. Conversion for example data can be generated via below script: 
 
 ```bash
 uv run python3 -m eval_converters.inspect.converter
@@ -63,7 +63,50 @@ options:
   --source_organization_logo_url SOURCE_ORGANIZATION_LOGO_URL
 ```
 
-## Tests
+
+### HELM
+
+Convert eval log from HELM into json format with following command:
+
+```bash
+uv run inspect log convert path_to_eval_file_generated_by_inspect --to json --output-dir inspect_json
+```
+
+You can convert HELM evaluation log into unified schema via `eval_converters/helm/converter.py`. For example:
+
+```bash
+uv run python3 -m eval_converters.inspect.converter --log_path tests/data/helm
+```
+
+The automatic conversion script requires following files generated by HELM to work correctly:
+- per_instance_stats.json
+- run_spec.json
+- scenario_state.json
+- scenario.json
+- stats.json
+
+Full manual for conversion of your own HELM evaluation log into unified is available below:
+
+```bash
+usage: converter.py [-h] [--log_dirpath LOG_DIRPATH] [--huggingface_dataset HUGGINGFACE_DATASET] [--output_dir OUTPUT_DIR] [--source_organization_name SOURCE_ORGANIZATION_NAME]
+                    [--evaluator_relationship {first_party,third_party,collaborative,other}] [--source_organization_url SOURCE_ORGANIZATION_URL]
+                    [--source_organization_logo_url SOURCE_ORGANIZATION_LOGO_URL]
+
+options:
+  -h, --help            show this help message and exit
+  --log_dirpath LOG_DIRPATH
+                        Path to directory with single evaluaion or multiple evaluations to convert
+  --huggingface_dataset HUGGINGFACE_DATASET
+  --output_dir OUTPUT_DIR
+  --source_organization_name SOURCE_ORGANIZATION_NAME
+                        Orgnization which pushed evaluation to the evalHub.
+  --evaluator_relationship {first_party,third_party,collaborative,other}
+                        Relationship of evaluation author to the model
+  --source_organization_url SOURCE_ORGANIZATION_URL
+  --source_organization_logo_url SOURCE_ORGANIZATION_LOGO_URL
+```
+
+### Tests
 
 Run below script to perform unit tests for all evaluation platforms.
 
diff --git a/eval_converters/helm/adapter.py b/eval_converters/helm/adapter.py
index de6c746..36d7620 100644
--- a/eval_converters/helm/adapter.py
+++ b/eval_converters/helm/adapter.py
@@ -1,10 +1,10 @@
 import os
-import wget
-import json
-from typing import List, Dict
+from typing import Any, Dict, List, Optional, Union
 from helm.benchmark.metrics.metric import PerInstanceStats
 from helm.benchmark.presentation.schema import Schema, read_schema
-from helm.benchmark.adaptation.scenario_state import ScenarioState
+from helm.benchmark.scenarios.scenario import Reference, Scenario
+from helm.benchmark.adaptation.scenario_state import AdapterSpec, RequestState, ScenarioState
+from helm.benchmark.metrics.statistic import Stat
 from helm.benchmark.config_registry import register_builtin_configs_from_helm_package
 from helm.benchmark.model_deployment_registry import get_model_deployment
 from helm.benchmark.run_spec import RunSpec
@@ -12,61 +12,31 @@
 from dacite import from_dict
 from pathlib import Path
 
-from schema.eval_types import EvaluationResult, ModelInfo, Configuration, InferenceSettings, GenerationArgs, Quantization, BitPrecision, Method, Model, PromptConfig, Instance, Output, Evaluation, TaskType, PromptClass, SampleIdentifier, EvaluationMethod
 from schema import SCHEMA_VERSION
+from schema.eval_types import (
+    DetailedEvaluationResultsPerSample,
+    EvaluationLog,
+    EvaluationResult,
+    EvaluationSource,
+    EvaluationSourceType,
+    MetricConfig,
+    ModelInfo,
+    ScoreDetails,
+    SourceData,
+    SourceMetadata
+)
+
+from eval_converters.common.adapter import AdapterMetadata, BaseEvaluationAdapter, SupportedLibrary
+from eval_converters.common.error import AdapterError
+from eval_converters.common.utils import convert_timestamp_to_unix_format
 
 from eval_converters.common.adapter import BaseEvaluationAdapter, AdapterMetadata, SupportedLibrary
-from eval_converters.common.utils import detect_family, detect_hf_split
-from .utils import detect_prompt_class, get_adapter_class_from_method_string
 
 from transformers import AutoConfig
 
 # run this just once in your process to initialize the registry
 register_builtin_configs_from_helm_package()
 
-def infer_quantization(model_name_or_path: str):
-	"""
-	Returns (BitPrecision, Method) enums for the given HF model.
-	"""
-	try:
-		cfg = AutoConfig.from_pretrained(model_name_or_path)
-	except Exception as e:
-		raise ValueError(
-			f"Failed to load model config for {model_name_or_path}: {e} \n"
-			"This may happen if you are using a HELM model name instead of HuggingFace model name in the adapter_spec.model field."
-			"For example, HELM uses 'meta/llama-3.1-8b-instruct' while HuggingFace uses meta-llama/llama-3.1-8b-instruct' \n"
-			"Please verify the model name and try again."
-		)
-	qcfg = getattr(cfg, "quantization_config", None)
-
-	if qcfg is None:
-		return BitPrecision.none, Method.None_
-	
-	bits = int(qcfg.get("bits") or qcfg.get("weight_bits") or qcfg.get("q_bits"))
-	
-	if bits == 8:
-		precision = BitPrecision.int8
-	elif bits == 4:
-		precision = BitPrecision.int4
-	elif bits == 16:
-		precision = BitPrecision.float16
-	elif bits == 32:
-		precision = BitPrecision.float32
-	else:
-		precision = BitPrecision.none
-
-	method_key = qcfg.get("quant_method") or ""
-	method_map = {
-		"gptq": Method.static,
-		"awq": Method.static,
-		"bitsandbytes": Method.dynamic,
-		"quanto": Method.static,
-		"hqq": Method.static,
-		"torchao": Method.static,
-	}
-
-	method = method_map.get(method_key, Method.None_)
-	return precision, method
 
 class HELMAdapter(BaseEvaluationAdapter):
 	"""
@@ -76,13 +46,15 @@ class HELMAdapter(BaseEvaluationAdapter):
 	SCENARIO_STATE_FILE = 'scenario_state.json'
 	RUN_SPEC_FILE = 'run_spec.json'
 	SCENARIO_FILE = 'scenario.json'
+	STATS_FILE = 'stats.json'
+	PER_INSTANCE_STATS_FILE = 'per_instance_stats.json'
+	REQUIRED_LOG_FILES = [SCENARIO_STATE_FILE, RUN_SPEC_FILE, SCENARIO_FILE, SCENARIO_STATE_FILE, PER_INSTANCE_STATS_FILE]
 
 	@property
 	def metadata(self) -> AdapterMetadata:
 		return AdapterMetadata(
 			name="HELMAdapter",
 			version="0.0.1",
-			supported_library_versions=["0.5.6"],
 			description="Adapter for transforming HELM evaluation outputs to unified schema format"
 		)
 
@@ -90,23 +62,6 @@ def metadata(self) -> AdapterMetadata:
 	def supported_library(self) -> SupportedLibrary:
 		return SupportedLibrary.HELM
 
-	# to get the instance scores, we need to load the per_instance_stats.json file
-	# and extract the main metric name from the schema
-	# then, we can use the get_instance_scores_from_run_path method to get the instance scores
-	@staticmethod
-	def get_instance_scores(run_path: str, main_metric_name: str) -> Dict[str, float]:
-		instance_scores: Dict[str, float] = {}
-		per_instance_stats_path = os.path.join(run_path, "per_instance_stats.json")
-		if os.path.exists(per_instance_stats_path):
-			with open(per_instance_stats_path, "r") as f:
-				per_instance_stats = from_json(f.read(), List[PerInstanceStats])
-			for per_instance_stat in per_instance_stats:
-				for stat in per_instance_stat.stats:
-					if stat.name.name == main_metric_name:
-						assert stat.mean is not None
-						instance_scores[per_instance_stat.instance_id] = stat.mean
-			return instance_scores
-
 	@staticmethod
 	def get_main_metric_name(run_path: str, schema_path: str) -> str:
 		if schema_path.endswith(".json"):
@@ -124,385 +79,233 @@ def get_main_metric_name(run_path: str, schema_path: str) -> str:
 				return schema.name_to_run_group[group].environment["main_name"]
 		raise Exception(f"Could not find main metric name for {run_path}")
 	
-	def transform_from_directory(self, dir_path):
-		super().transform_from_directory(dir_path)
-		
+	def _load_evaluation_run_logfiles(self, dir_path) -> Dict:
 		scenario_state_dict = self._load_file(Path(f'{dir_path}/{self.SCENARIO_STATE_FILE}'))
 		run_spec_dict = self._load_file(Path(f'{dir_path}/{self.RUN_SPEC_FILE}'))
-		scenario_dict = self._load_file(Path(f'{dir_path}/{self.SCENARIO_FILE}')) # We don't load into Scenario instance as it is an abstract class
-
-		# Load raw data object into a ScenarioState
-		scenario_state = from_dict(data_class=ScenarioState, data=scenario_state_dict)
-		adapter_spec = scenario_state.adapter_spec
-
-		# Load raw data object into a RunSpec
-		run_spec = from_dict(data_class=RunSpec, data=run_spec_dict)
-
-		# Construct the EvaluationResult components
-		# 1. Model
-		# 1.1. ModelInfo
-		model_info = ModelInfo(
-			name=adapter_spec.model,
-			family=detect_family(adapter_spec.model),
-		)
-
-		# 1.2. Configuration
-		# HELM does not provide context window size, try loading it from model config, else set to 1
-		try:
-			# try getting context window from model deployment
-			deployment = get_model_deployment(adapter_spec.model_deployment)
-			if deployment and deployment.max_sequence_length is not None:
-				context_window = deployment.max_sequence_length
-
-			# if not available, try loading it from model config
-			else:
-				config = AutoConfig.from_pretrained(adapter_spec.model)
-
-				priority_fields = [
-					"max_position_embeddings",
-					"n_positions",
-					"seq_len",
-					"seq_length",
-					"n_ctx",
-					"sliding_window"
-				]
-
-				context_window = next((getattr(config, f) for f in priority_fields if hasattr(config, f)), None)
-				if context_window is None:
-					context_window = 1
+		scenario_dict = self._load_file(Path(f'{dir_path}/{self.SCENARIO_FILE}'))
+		stats = self._load_file(Path(f'{dir_path}/{self.STATS_FILE}'))
 		
-		except Exception as e:
-			self.logger.error(f"Error getting context window: {e}")
-			context_window = 1
-
-		configuration = Configuration(
-			context_window=context_window,
-		)
+		with open(f'{dir_path}/{self.PER_INSTANCE_STATS_FILE}', "r") as f:
+			per_instance_stats = from_json(f.read(), List[PerInstanceStats])
+
+		return {
+			'per_instance_stats': per_instance_stats,
+			'run_spec_dict': run_spec_dict,
+			'scenario_dict': scenario_dict,
+			'scenario_state_dict': scenario_state_dict,
+			'stats': stats
+		}
+	
+	def directory_contains_required_files(self, dir_path):
+		if os.path.isdir(dir_path):
+			files = os.listdir(dir_path)
+			return all(required_file in files for required_file in self.REQUIRED_LOG_FILES)
+		
+		return False
 
-		# 1.3. InferenceSettings
+	def transform_from_directory(
+		self, dir_path: str, source_metadata: SourceMetadata
+	) -> Optional[Union[EvaluationLog, List[EvaluationLog]]]:
+		"""
+		Transforms evaluation logs found in the specified directory into the unified schema.
+		It handles both a single evaluation run (if the directory itself contains log files)
+		or multiple runs (if the directory contains subdirectories, each with log files).
+		"""
 		try:
-			precision, method = infer_quantization(adapter_spec.model)
+			if self.directory_contains_required_files(dir_path):
+				# Single evaluation run in the current directory
+				data = self._load_evaluation_run_logfiles(dir_path)
+				return self._transform_single(data, source_metadata)
+			else:
+				# Multiple evaluation runs in subdirectories
+				converted_evals: List[EvaluationLog] = []
+				
+				for entry in os.scandir(dir_path):
+					eval_run_dirpath = entry.path
+					if entry.is_dir() and self.directory_contains_required_files(eval_run_dirpath):
+						data = self._load_evaluation_run_logfiles(eval_run_dirpath)
+						converted_evals.append(self._transform_single(data, source_metadata))
+				
+				return converted_evals
+				
 		except Exception as e:
-			self.logger.warning(f"Error getting quantization: {e}")
-			precision = BitPrecision.none
-			method = Method.None_
-
-		quantization = Quantization(
-			bit_precision=precision,
-			method=method,
-		)
-		inference_settings = InferenceSettings(
-			quantization=quantization,
-			generation_args=GenerationArgs(
-				temperature=adapter_spec.temperature,
-				stop_sequences=adapter_spec.stop_sequences,
-			)
-		)
+			print(f'Error during conversion to unified schema in directory "{dir_path}": {e}')
+			return None
+	
+	def _get_correct_response(self, references: List['Reference']) -> Optional[str]:
+		"""Extracts the text of the first reference that has tags."""
+		for ref in references:
+			if ref.tags:
+				return ref.output.text
+		return None
+
+	def _extract_detailed_evaluation_info_for_samples(
+		self, request_states: List[RequestState]
+	) -> List[DetailedEvaluationResultsPerSample]:
+		"""
+		Extracts detailed evaluation information for each sample from the request states.
+		"""
+		results: List[DetailedEvaluationResultsPerSample] = []
 		
-		# 2. PromptConfig
-		# 2.1. PromptClass
-		prompt_class = detect_prompt_class(adapter_spec.method)
-
-		evaluation_results: List[EvaluationResult] = []
-		for request_state in scenario_state.request_states:
-			# 3. Instance
-			# 3.1. SampleIdentifier
-			sample_identifier = SampleIdentifier(
-				dataset_name=scenario_dict['name'],
-				hf_repo="", # FIXME: use HF repo if available
-				hf_split=detect_hf_split(request_state.instance.split),
-				hf_index=-1,  # FIXME: use actual index if available
-			)
-			
-			# Extract ground truth: the first correct reference
-			# FIXME: need to modify the schema to support evaluation with more than one ground truth: https://crfm-helm.readthedocs.io/en/latest/code/#adding-new-scenarios
-			references = request_state.instance.references
-			ground_truth = {}
-			for i, ref in enumerate(references):
-				if "correct" in ref.tags:
-					ground_truth = {
-						"id": str(i),
-						"text": ref.output.text,
-					}
-					break
-			
-			# 3.2. ClassificationFields (required for classification tasks)
-			classification_fields = {}
-			output_mapping_dict = request_state.output_mapping or {}
-			if prompt_class == PromptClass.MultipleChoice: 
-				choices = [{"id": k, "text": v} for k, v in output_mapping_dict.items()]
-
-				classification_fields = {
-					"full_input": request_state.request.prompt,
-					"question": request_state.instance.input.text,
-					"choices": choices,
-					"ground_truth": ground_truth,
-				}
-			
-			instance = Instance(
-				task_type=TaskType.classification if prompt_class == PromptClass.MultipleChoice else TaskType.generation,
-				raw_input=request_state.instance.input.text,
-				language='en',  # FIXME: other languages?
-				sample_identifier=sample_identifier,
-				classification_fields=classification_fields,
-			)
+		for state in request_states:
+			references = state.instance.references or []
+			correct_response = self._get_correct_response(references)
+
+			ground_truth = None
+			if correct_response:
+				ground_truth = next(
+					(
+						choice 
+						for choice, response in state.output_mapping.items() 
+						if response in correct_response
+					),
+					None
+				)
+
+			choices_list = [
+				f'{choice}. {response}' 
+				for choice, response in state.output_mapping.items()
+			]
 			
-			# 4. Output
-			output = Output(
-				response=request_state.result.completions[0].text
+			results.append(
+				DetailedEvaluationResultsPerSample(
+					sample_id=state.instance.id,
+					input=state.instance.input.text,
+					prompt=state.request.prompt,
+					ground_truth=ground_truth,
+					response=state.result.completions[0].text.strip() if state.result.completions else '',
+					choices=choices_list
+				)
 			)
+				
+		return results
 
-			# 5. Evaluation
-			adapter = get_adapter_class_from_method_string(adapter_spec.method)
-			evaluation_method = EvaluationMethod(
-				method_name=adapter_spec.method,
-				description=adapter.__class__.__doc__, # Use the adapter's docstring as description
-			)
+	def _extract_model_info(self, adapter_spec: AdapterSpec) -> ModelInfo:
+		deployment = get_model_deployment(adapter_spec.model_deployment)
+		client_args = getattr(deployment.client_spec, "args", None)
 
-			try:
-				# check for schema_*.yaml file in dir_path, if not found, download schema_*.yaml file from Github to dir_path using wget
-				schema_path = os.path.join(dir_path, "schema_capabilities.yaml")
-				if not os.path.exists(schema_path):
-					wget.download("https://raw.githubusercontent.com/stanford-crfm/helm/main/src/helm/benchmark/static/schema_capabilities.yaml", schema_path)
+		if "huggingface" in deployment.name or not client_args:
+			model_id = deployment.model_name
+		else:
+			model_id = client_args.get("pretrained_model_name_or_path", deployment.model_name)
 
-				main_metric_name = self.get_main_metric_name(dir_path, schema_path)
-				instance_scores = self.get_instance_scores(dir_path, main_metric_name)
-				score = instance_scores[request_state.instance.id]
-				
-			except Exception as e:
-				self.logger.warning(f"Error getting instance scores: {e}")
-				score = 0.0
-			
-			evaluation = Evaluation(
-				evaluation_method=evaluation_method,
-				ground_truth=ground_truth["text"],
-				score=score,
-			)
-		
-			evaluation_results.append(EvaluationResult(
-				schema_version=SCHEMA_VERSION,
-				evaluation_id=run_spec.name,
-				model=Model(
-					model_info=model_info,
-					configuration=configuration,
-					inference_settings=inference_settings,
-				),
-				prompt_config=PromptConfig(prompt_class=prompt_class),
-				instance=instance,
-				output=output,
-				evaluation=evaluation,
-			))
-		
-		return evaluation_results
+		return ModelInfo(
+			name=deployment.model_name,
+			id=model_id,
+			developer=deployment.model_name.split("/", 1)[0],
+			inference_platform=deployment.name.split("/", 1)[0],
+		)
 	
-	def _transform_single(self, raw_data, base_dir=None):
+	def _extract_generation_config(self, adapter_spec: AdapterSpec) -> Dict[str, Any]:
+		return {
+			'temperature': adapter_spec.temperature,
+			'max_tokens': adapter_spec.max_tokens,
+			'stop_sequences': adapter_spec.stop_sequences,
+			'instructions': adapter_spec.instructions,
+			'input_prefix': adapter_spec.input_prefix,
+			'input_suffix': adapter_spec.input_suffix,
+			'output_prefix': adapter_spec.output_prefix,
+			'output_suffix': adapter_spec.output_suffix,
+			'instance_prefix': adapter_spec.instance_prefix
+		}
+
+
+	def _transform_single(self, raw_data: Dict, source_metadata: SourceMetadata) -> EvaluationLog:
 		"""
 		Args:
-			raw_data: Single evaluation record in HELM format (dict, JSON string, or file path)
+			raw_data: Single evaluation record in HELM format (dictionary with log files generated by HELM, each file is loaded as JSON format)
 
 		Returns:
-			EvaluationResult in unified schema format
+			EvaluationLog in unified schema format
 		"""
-		# check if raw_data is a dictionary, JSON string, or file path
-		if isinstance(raw_data, dict):
-			data = raw_data
-		elif isinstance(raw_data, (str, bytes)) and (raw_data.strip().startswith('{') or raw_data.strip().startswith('[')):
-			# It's a JSON string
-			data = json.loads(raw_data)
-		else:
-			# Assume it's a file path
-			with open(raw_data, 'r') as f:
-				data = json.load(f)
 
-		
-		scenario_state_dict = data['scenario_state_dict']
-		run_spec_dict = data['run_spec_dict']
-		scenario_dict = data['scenario_dict']
+		scenario_state_dict = raw_data['scenario_state_dict']
+		run_spec_dict = raw_data['run_spec_dict']
+		scenario_dict = raw_data['scenario_dict']
+		stats = raw_data['stats']
 
-		# Load raw data object into a ScenarioState
 		scenario_state = from_dict(data_class=ScenarioState, data=scenario_state_dict)
 		adapter_spec = scenario_state.adapter_spec
+		request_states = scenario_state.request_states
 
-		# Load raw data object into a RunSpec
 		run_spec = from_dict(data_class=RunSpec, data=run_spec_dict)
 
-		# Construct the EvaluationResult components
-		# 1. Model
-		# 1.1. ModelInfo
-		model_info = ModelInfo(
-			name=adapter_spec.model,
-			family=detect_family(adapter_spec.model),
-		)
+		stats: List[Stat] = [from_dict(data_class=Stat, data=stat_info) for stat_info in stats]
 
-		# 1.2. Configuration
-		# HELM does not provide context window size, try loading it from model config, else set to 1
-		try:
-			# try getting context window from model deployment
-			deployment = get_model_deployment(adapter_spec.model_deployment)
-			if deployment and deployment.max_sequence_length is not None:
-				context_window = deployment.max_sequence_length
+		timestamp = str(min(state.result.request_datetime for state in request_states))
 
-			# if not available, try loading it from model config
-			else:
-				config = AutoConfig.from_pretrained(adapter_spec.model)
-
-				priority_fields = [
-					"max_position_embeddings",
-					"n_positions",
-					"seq_len",
-					"seq_length",
-					"n_ctx",
-					"sliding_window"
-				]
-
-				context_window = next((getattr(config, f) for f in priority_fields if hasattr(config, f)), None)
-				if context_window is None:
-					context_window = 1
-		
-		except Exception as e:
-			self.logger.warning(f"Error getting context window: {e}")
-			context_window = 1
+		source_data = SourceData(
+            dataset_name=scenario_dict.get('name'),
+            samples_number=len(request_states),
+            sample_ids=[state.instance.id for state in request_states],
+			additional_details={
+				'scenario_name': run_spec.scenario_spec.class_name,
+				'subject': run_spec.scenario_spec.args.get('subject')
+			}
+        )
 
-		configuration = Configuration(
-			context_window=context_window,
-		)
-
-		# 1.3. InferenceSettings
-		try:
-			precision, method = infer_quantization(adapter_spec.model)
-		except Exception as e:
-			self.logger.warning(f"Error getting quantization: {e}")
-			precision = BitPrecision.none
-			method = Method.None_
+		evaluation_source = EvaluationSource(
+            evaluation_source_name='helm',
+            evaluation_source_type=EvaluationSourceType.evaluation_platform
+        )
 
-		quantization = Quantization(
-			bit_precision=precision,
-			method=method,
-		)
-		inference_settings = InferenceSettings(
-			quantization=quantization,
-			generation_args=GenerationArgs(
-				temperature=adapter_spec.temperature,
-				stop_sequences=adapter_spec.stop_sequences,
-			)
-		)
-		
-		# 2. PromptConfig
-		# 2.1. PromptClass
-		prompt_class = detect_prompt_class(adapter_spec.method)
+		model_info = self._extract_model_info(adapter_spec)
 
 		evaluation_results: List[EvaluationResult] = []
-		for request_state in scenario_state.request_states:
-			# 3. Instance
-			# 3.1. SampleIdentifier
-			sample_identifier = SampleIdentifier(
-				dataset_name=scenario_dict['name'],
-				hf_repo="", # FIXME: use HF repo if available
-				hf_split=detect_hf_split(request_state.instance.split),
-				hf_index=-1,  # FIXME: use actual index if available
-			)
-			
-			# Extract ground truth: the first correct reference
-			# FIXME: need to modify the schema to support evaluation with more than one ground truth: https://crfm-helm.readthedocs.io/en/latest/code/#adding-new-scenarios
-			references = request_state.instance.references
-			ground_truth = {}
-			for i, ref in enumerate(references):
-				if "correct" in ref.tags:
-					ground_truth = {
-						"id": str(i),
-						"text": ref.output.text,
-					}
-					break
-			
-			# 3.2. ClassificationFields (required for classification tasks)
-			classification_fields = {}
-			output_mapping_dict = request_state.output_mapping or {}
-			if prompt_class == PromptClass.MultipleChoice: 
-				choices = [{"id": k, "text": v} for k, v in output_mapping_dict.items()]
-
-				classification_fields = {
-					"full_input": request_state.request.prompt,
-					"question": request_state.instance.input.text,
-					"choices": choices,
-					"ground_truth": ground_truth,
-				}
-			
-			instance = Instance(
-				task_type=TaskType.classification if prompt_class == PromptClass.MultipleChoice else TaskType.generation,
-				raw_input=request_state.instance.input.text,
-				language='en',  # FIXME: other languages?
-				sample_identifier=sample_identifier,
-				classification_fields=classification_fields,
-			)
-			
-			# 4. Output
-			output = Output(
-				response=request_state.result.completions[0].text
-			)
 
-			# 5. Evaluation
-			adapter = get_adapter_class_from_method_string(adapter_spec.method)
-			evaluation_method = EvaluationMethod(
-				method_name=adapter_spec.method,
-				description=adapter.__class__.__doc__, # Use the adapter's docstring as description
+		metric_names = []
+		for metric_spec in run_spec.metric_specs: 
+			metric_names.extend(
+				metric_spec.args.get('names') if metric_spec.args else []
 			)
 
-			try:
-				schema_path = os.path.join(base_dir, "schema_capabilities.yaml")
-				if not os.path.exists(schema_path):
-					wget.download("https://raw.githubusercontent.com/stanford-crfm/helm/main/src/helm/benchmark/static/schema_capabilities.yaml", schema_path)
-
-				schema = read_schema(schema_path)
-				main_metric_name = None
-
-				# find the main metric name from the schema
-				for group in run_spec.groups:
-					if group in schema.name_to_run_group and "main_name" in schema.name_to_run_group[group].environment:
-						main_metric_name = schema.name_to_run_group[group].environment["main_name"]
-						break
-				if main_metric_name is None:
-					raise Exception("Could not find main metric name")
-
-				# get the per instance stats from the data
-				per_instance_stats = []
-				if "per_instance_stats" in data:
-					per_instance_stats = from_json(data["per_instance_stats"], List[PerInstanceStats])
-
-				# get the instance scores from the per instance stats
-				instance_scores = {}
-				for per_instance_stat in per_instance_stats:
-					for stat in per_instance_stat.stats:	
-						if stat.name.name == main_metric_name:
-							assert stat.mean is not None
-							instance_scores[per_instance_stat.instance_id] = stat.mean
-							break
-
-			except Exception as e:
-				self.logger.warning(f"Error getting instance scores: {e}")
-				instance_scores = {}
-
-			score = instance_scores.get(request_state.instance.id, 0.0)
-
-			# 6. EvaluationResult
-			evaluation = Evaluation(
-				evaluation_method=evaluation_method,
-				ground_truth=ground_truth["text"],
-				score=score,
+		for metric_name in metric_names:
+			metric_config = MetricConfig(
+				evaluation_description=metric_name,
+				lower_is_better=False
 			)
+
+			for stat in stats:
+				if not stat.name.name.startswith(metric_name):
+					continue
+
+				generic_details_fields = (
+					"count", "sum", "sum_squared", "min", "max", "mean", "variance", "stddev"
+				)
+				details = {field: getattr(stat, field) for field in generic_details_fields}
+				details['split'] = stat.name.split
+				details['perturbation'] = stat.name.perturbation
+
+				score_details = ScoreDetails(score=stat.mean, details=details)
+
+				evaluation_results.append(
+					EvaluationResult(
+						evaluation_name=run_spec.adapter_spec.method,
+						evaluation_timestamp=timestamp,
+						metric_config=metric_config,
+						score_details=score_details,
+						detailed_evaluation_results_url=None,
+						generation_config=self._extract_generation_config(adapter_spec)
+					)
+				)
 		
-			evaluation_results.append(EvaluationResult(
-				schema_version=SCHEMA_VERSION,
-				evaluation_id=run_spec.name,
-				model=Model(
-					model_info=model_info,
-					configuration=configuration,
-					inference_settings=inference_settings,
-				),
-				prompt_config=PromptConfig(prompt_class=prompt_class),
-				instance=instance,
-				output=output,
-				evaluation=evaluation,
-			))
-		
-		return evaluation_results
-	
\ No newline at end of file
+		detailed_eval_results = self._extract_detailed_evaluation_info_for_samples(request_states)
+
+		scenario_subject = run_spec.scenario_spec.args.get('subject')
+		dataset_unique_name = source_data.dataset_name
+		if scenario_subject:
+			dataset_unique_name += f"/{scenario_subject}"
+
+		evaluation_id = f'helm/{model_info.id}/{dataset_unique_name}/{timestamp}'
+
+		return EvaluationLog(
+            schema_version=SCHEMA_VERSION,
+            evaluation_id=evaluation_id,
+            retrieved_timestamp=timestamp,
+            source_data=source_data,
+            evaluation_source=evaluation_source,
+            source_metadata=source_metadata,
+            model_info=model_info,
+            evaluation_results=evaluation_results,
+            detailed_evaluation_results_per_samples=detailed_eval_results
+        ) 	
\ No newline at end of file
diff --git a/eval_converters/helm/converter.py b/eval_converters/helm/converter.py
index e69de29..04baab7 100644
--- a/eval_converters/helm/converter.py
+++ b/eval_converters/helm/converter.py
@@ -0,0 +1,93 @@
+from __future__ import annotations
+from argparse import ArgumentParser
+import json
+import os
+from enum import Enum
+from pathlib import Path
+from typing import Dict, List, Union
+
+from eval_converters.helm.adapter import HELMAdapter
+from schema.eval_types import (
+    EvaluatorRelationship,
+    EvaluationLog,
+    SourceMetadata
+)
+
+def parse_args():
+    parser = ArgumentParser()
+
+    parser.add_argument('--log_dirpath', type=str, default='tests/data/helm/mmlu:subject=philosophy,method=multiple_choice_joint,model=openai_gpt2', help="Path to directory with single evaluaion or multiple evaluations to convert")
+    parser.add_argument('--huggingface_dataset', type=str)
+    parser.add_argument('--output_dir', type=str, default='tests/data/helm')
+    parser.add_argument('--source_organization_name', type=str, default='Unknown', help='Orgnization which pushed evaluation to the evalHub.')
+    parser.add_argument('--evaluator_relationship', type=str, default='other', help='Relationship of evaluation author to the model', choices=['first_party', 'third_party', 'collaborative', 'other'])
+    parser.add_argument('--source_organization_url', type=str, default=None)
+    parser.add_argument('--source_organization_logo_url', type=str, default=None)
+
+
+    args = parser.parse_args()
+    return args
+
+
+class EnumEncoder(json.JSONEncoder):
+    def default(self, obj):
+        if isinstance(obj, Enum):
+            return obj.value
+        return super().default(obj)
+
+class HELMEvalLogConverter:
+    def __init__(self, log_dirpath: str | Path, output_dir: str = 'unified_schema/helm'):
+        '''
+        HELM generates log file for an evaluation.
+        '''
+        self.log_dirpath = Path(log_dirpath)
+        
+        self.output_dir = Path(output_dir)
+        self.output_dir.mkdir(parents=True, exist_ok=True)
+
+    def convert_to_unified_schema(self, source_metadata: SourceMetadata = None) -> Union[EvaluationLog, List[EvaluationLog]]:
+        return HELMAdapter().transform_from_directory(self.log_dirpath, source_metadata=source_metadata)
+
+    def save_to_file(self, unified_eval_log: EvaluationLog, output_filename: str) -> bool:
+        try:
+            json_str = unified_eval_log.model_dump_json(indent=2)
+
+            with open(f'{self.output_dir}/{output_filename}', 'w') as json_file:
+                json_file.write(json_str)
+
+            print(f'Unified eval log was successfully saved to {output_filename} file.')
+        except Exception as e:
+            print(f"Problem with saving unified eval log to file: {e}")
+            raise e
+
+    def save_to_hf_datasets(self, unified_eval_log: EvaluationLog) -> bool:
+        # TODO
+        pass
+
+
+if __name__ == '__main__':
+    args = parse_args()
+
+    helm_converter = HELMEvalLogConverter(
+        log_dirpath=args.log_dirpath,
+        output_dir=args.output_dir
+    )
+    
+    source_metadata = SourceMetadata(
+        source_organization_name=args.source_organization_name,
+        source_organization_url=args.source_organization_url,
+        source_organization_logo_url=args.source_organization_logo_url,
+        evaluator_relationship=EvaluatorRelationship(args.evaluator_relationship)
+    )
+
+    unified_output = helm_converter.convert_to_unified_schema(source_metadata)
+
+    if unified_output and isinstance(unified_output, EvaluationLog):
+        output_filename = f'{str(unified_output.evaluation_id).replace('/', '_')}.json'
+        helm_converter.save_to_file(unified_output, output_filename)
+    elif unified_output and isinstance(unified_output, List):
+        for single_unified_output in unified_output:
+            output_filename = f'{str(single_unified_output.evaluation_id).replace('/', '_')}.json'
+            helm_converter.save_to_file(single_unified_output, output_filename)
+    else:
+        print("Missing unified schema result!")
\ No newline at end of file
diff --git a/eval_converters/helm/utils.py b/eval_converters/helm/utils.py
index 247cfd2..e69de29 100644
--- a/eval_converters/helm/utils.py
+++ b/eval_converters/helm/utils.py
@@ -1,62 +0,0 @@
-from schema.eval_types import PromptClass
-from helm.benchmark.adaptation.adapters.adapter import Adapter
-from helm.benchmark.adaptation.adapter_spec import (
-    ADAPT_EHR_INSTRUCTION,
-    ADAPT_GENERATION,
-    ADAPT_CHAT,
-    ADAPT_GENERATION_MULTIMODAL,
-    ADAPT_LANGUAGE_MODELING,
-    ADAPT_MULTIPLE_CHOICE_JOINT,
-    ADAPT_MULTIPLE_CHOICE_JOINT_CHAIN_OF_THOUGHT,
-    ADAPT_MULTIPLE_CHOICE_JOINT_MULTIMODAL,
-    ADAPT_MULTIPLE_CHOICE_SEPARATE_CALIBRATED,
-    ADAPT_MULTIPLE_CHOICE_SEPARATE_ORIGINAL,
-    ADAPT_RANKING_BINARY,
-)
-from helm.benchmark.adaptation.adapters.binary_ranking_adapter import BinaryRankingAdapter
-from helm.benchmark.adaptation.adapters.generation_adapter import GenerationAdapter
-from helm.benchmark.adaptation.adapters.chat_adapter import ChatAdapter
-from helm.benchmark.adaptation.adapters.language_modeling_adapter import LanguageModelingAdapter
-from helm.benchmark.adaptation.adapters.multimodal.generation_multimodal_adapter import GenerationMultimodalAdapter
-from helm.benchmark.adaptation.adapters.multimodal.multiple_choice_joint_multimodal_adapter import (
-    MultipleChoiceJointMultimodalAdapter,
-)
-from helm.benchmark.adaptation.adapters.multiple_choice_calibrated_adapter import MultipleChoiceCalibratedAdapter
-from helm.benchmark.adaptation.adapters.multiple_choice_joint_adapter import MultipleChoiceJointAdapter
-from helm.benchmark.adaptation.adapters.multiple_choice_joint_chain_of_thought_adapter import (
-    MultipleChoiceJointChainOfThoughtAdapter,
-)
-from helm.benchmark.adaptation.adapters.multiple_choice_separate_adapter import MultipleChoiceSeparateAdapter
-from helm.benchmark.adaptation.adapters.ehr_instruction_adapter import EHRInstructionAdapter
-
-def detect_prompt_class(adaptation_method: str) -> PromptClass:
-    """
-    Detect the PromptClass based on the adaptation method.
-    """
-    if 'multiple_choice' in adaptation_method.lower():
-        return PromptClass.MultipleChoice
-    
-    return PromptClass.Completion # FIXME: how to deal with OpenEnded?
-
-def get_adapter_class_from_method_string(method_str: str) -> type[Adapter]:
-    method_str = method_str.strip().lower()
-
-    mapping = {
-        ADAPT_EHR_INSTRUCTION: EHRInstructionAdapter,
-        ADAPT_GENERATION: GenerationAdapter,
-        ADAPT_CHAT: ChatAdapter,
-        ADAPT_LANGUAGE_MODELING: LanguageModelingAdapter,
-        ADAPT_MULTIPLE_CHOICE_JOINT: MultipleChoiceJointAdapter,
-        ADAPT_MULTIPLE_CHOICE_JOINT_CHAIN_OF_THOUGHT: MultipleChoiceJointChainOfThoughtAdapter,
-        ADAPT_MULTIPLE_CHOICE_SEPARATE_ORIGINAL: MultipleChoiceSeparateAdapter,
-        ADAPT_MULTIPLE_CHOICE_SEPARATE_CALIBRATED: MultipleChoiceCalibratedAdapter,
-        ADAPT_RANKING_BINARY: BinaryRankingAdapter,
-        ADAPT_GENERATION_MULTIMODAL: GenerationMultimodalAdapter,
-        ADAPT_MULTIPLE_CHOICE_JOINT_MULTIMODAL: MultipleChoiceJointMultimodalAdapter,
-    }
-
-    for key in mapping:
-        if key in method_str:
-            return mapping[key]
-
-    raise ValueError(f"Unknown adapter method string: {method_str}")
diff --git a/pyproject.toml b/pyproject.toml
index 5f7297e..cf1d37d 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -36,4 +36,4 @@ exclude = ["tests*"]
 pythonpath = ["."]
 
 [tool.uv]
-package = true
\ No newline at end of file
+package = true
diff --git a/schema/eval.schema.json b/schema/eval.schema.json
index c98bb7c..b7c7dfc 100644
--- a/schema/eval.schema.json
+++ b/schema/eval.schema.json
@@ -63,8 +63,13 @@
                             "type": "array",
                             "description": "Array of sample ids used for evaluation",
                             "items": {
-                                "type": "integer"
+                                "type": ["integer", "string"]
                             }
+                        },
+                        "additional_details": {
+                            "type": "object",
+                            "description": "Additional dataset info parameters",
+                            "additionalProperties": true
                         }
                     }
                 }
@@ -350,9 +355,13 @@
                                 "type": "string",
                                 "description": "Raw input for the model"
                             },
+                            "prompt": {
+                                "type": "string",
+                                "description": "Full prompt for the model"
+                            },
                             "ground_truth": {
                                 "type": "string",
-                                "description": "Target reponsse"
+                                "description": "Target response"
                             },
                             "response": {
                                 "type": "string",
diff --git a/schema/eval_types.py b/schema/eval_types.py
index 168e364..f287d7d 100644
--- a/schema/eval_types.py
+++ b/schema/eval_types.py
@@ -1,6 +1,6 @@
 # generated by datamodel-codegen:
 #   filename:  eval.schema.json
-#   timestamp: 2025-11-04T19:42:49+00:00
+#   timestamp: 2025-11-11T00:26:04+00:00
 
 from __future__ import annotations
 
@@ -19,9 +19,12 @@ class SourceData(BaseModel):
     samples_number: Optional[int] = Field(
         None, description='Number of samples in the dataset'
     )
-    sample_ids: Optional[List[int]] = Field(
+    sample_ids: Optional[List[Union[int, str]]] = Field(
         None, description='Array of sample ids used for evaluation'
     )
+    additional_details: Optional[Dict[str, Any]] = Field(
+        None, description='Additional dataset info parameters'
+    )
 
 
 class EvaluationSourceType(Enum):
@@ -149,7 +152,8 @@ class FullLogprob(BaseModel):
 class DetailedEvaluationResultsPerSample(BaseModel):
     sample_id: str = Field(..., description='Simple sample ID')
     input: str = Field(..., description='Raw input for the model')
-    ground_truth: str = Field(..., description='Target reponsse')
+    prompt: Optional[str] = Field(None, description='Full prompt for the model')
+    ground_truth: str = Field(..., description='Target response')
     response: str = Field(..., description='Response from the model')
     choices: Optional[List[str]] = Field(
         None, description='Array of possible responses'
diff --git a/tests/data/helm/helm_eleutherai_pythia-1b-v0_hellaswag_1751729998.json b/tests/data/helm/helm_eleutherai_pythia-1b-v0_hellaswag_1751729998.json
new file mode 100644
index 0000000..4fab432
--- /dev/null
+++ b/tests/data/helm/helm_eleutherai_pythia-1b-v0_hellaswag_1751729998.json
@@ -0,0 +1,1317 @@
+{
+  "schema_version": "0.0.1",
+  "evaluation_id": "helm/eleutherai/pythia-1b-v0/hellaswag/1751729998",
+  "retrieved_timestamp": "1751729998",
+  "source_data": {
+    "dataset_name": "hellaswag",
+    "hf_repo": null,
+    "hf_split": null,
+    "samples_number": 10,
+    "sample_ids": [
+      "id44874",
+      "id47299",
+      "id45277",
+      "id41992",
+      "id49438",
+      "id42841",
+      "id46128",
+      "id47975",
+      "id41468",
+      "id44284"
+    ],
+    "additional_details": {
+      "scenario_name": "helm.benchmark.scenarios.commonsense_scenario.HellaSwagScenario",
+      "subject": null
+    }
+  },
+  "evaluation_source": {
+    "evaluation_source_name": "helm",
+    "evaluation_source_type": "evaluation_platform"
+  },
+  "source_metadata": {
+    "source_organization_name": "Unknown",
+    "source_organization_url": null,
+    "source_organization_logo_url": null,
+    "evaluator_relationship": "other"
+  },
+  "model_info": {
+    "name": "eleutherai/pythia-1b-v0",
+    "id": "eleutherai/pythia-1b-v0",
+    "developer": "eleutherai",
+    "inference_platform": "huggingface",
+    "inference_engine": null,
+    "additional_details": null
+  },
+  "evaluation_results": [
+    {
+      "evaluation_name": "multiple_choice_joint",
+      "evaluation_timestamp": "1751729998",
+      "metric_config": {
+        "evaluation_description": "exact_match",
+        "lower_is_better": false,
+        "score_type": null,
+        "level_names": null,
+        "level_metadata": null,
+        "has_unknown_level": null,
+        "min_score": null,
+        "max_score": null
+      },
+      "score_details": {
+        "score": 0.3,
+        "details": {
+          "count": 1,
+          "sum": 0.3,
+          "sum_squared": 0.09,
+          "min": 0.3,
+          "max": 0.3,
+          "mean": 0.3,
+          "variance": 0.0,
+          "stddev": 0.0,
+          "split": "valid",
+          "perturbation": null
+        }
+      },
+      "detailed_evaluation_results_url": null,
+      "generation_config": {
+        "temperature": 0.0,
+        "max_tokens": 1,
+        "stop_sequences": [
+          "\n"
+        ],
+        "instructions": "The following are multiple choice questions (with answers) about common sense.\n",
+        "input_prefix": "Question: ",
+        "input_suffix": "\n",
+        "output_prefix": "Answer: ",
+        "output_suffix": "\n",
+        "instance_prefix": "\n"
+      }
+    },
+    {
+      "evaluation_name": "multiple_choice_joint",
+      "evaluation_timestamp": "1751729998",
+      "metric_config": {
+        "evaluation_description": "exact_match",
+        "lower_is_better": false,
+        "score_type": null,
+        "level_names": null,
+        "level_metadata": null,
+        "has_unknown_level": null,
+        "min_score": null,
+        "max_score": null
+      },
+      "score_details": {
+        "score": 0.3,
+        "details": {
+          "count": 1,
+          "sum": 0.3,
+          "sum_squared": 0.09,
+          "min": 0.3,
+          "max": 0.3,
+          "mean": 0.3,
+          "variance": 0.0,
+          "stddev": 0.0,
+          "split": "valid",
+          "perturbation": null
+        }
+      },
+      "detailed_evaluation_results_url": null,
+      "generation_config": {
+        "temperature": 0.0,
+        "max_tokens": 1,
+        "stop_sequences": [
+          "\n"
+        ],
+        "instructions": "The following are multiple choice questions (with answers) about common sense.\n",
+        "input_prefix": "Question: ",
+        "input_suffix": "\n",
+        "output_prefix": "Answer: ",
+        "output_suffix": "\n",
+        "instance_prefix": "\n"
+      }
+    },
+    {
+      "evaluation_name": "multiple_choice_joint",
+      "evaluation_timestamp": "1751729998",
+      "metric_config": {
+        "evaluation_description": "exact_match",
+        "lower_is_better": false,
+        "score_type": null,
+        "level_names": null,
+        "level_metadata": null,
+        "has_unknown_level": null,
+        "min_score": null,
+        "max_score": null
+      },
+      "score_details": {
+        "score": 0.3,
+        "details": {
+          "count": 1,
+          "sum": 0.3,
+          "sum_squared": 0.09,
+          "min": 0.3,
+          "max": 0.3,
+          "mean": 0.3,
+          "variance": 0.0,
+          "stddev": 0.0,
+          "split": "valid",
+          "perturbation": {
+            "name": "robustness",
+            "robustness": true,
+            "fairness": false,
+            "computed_on": "worst",
+            "seed": null
+          }
+        }
+      },
+      "detailed_evaluation_results_url": null,
+      "generation_config": {
+        "temperature": 0.0,
+        "max_tokens": 1,
+        "stop_sequences": [
+          "\n"
+        ],
+        "instructions": "The following are multiple choice questions (with answers) about common sense.\n",
+        "input_prefix": "Question: ",
+        "input_suffix": "\n",
+        "output_prefix": "Answer: ",
+        "output_suffix": "\n",
+        "instance_prefix": "\n"
+      }
+    },
+    {
+      "evaluation_name": "multiple_choice_joint",
+      "evaluation_timestamp": "1751729998",
+      "metric_config": {
+        "evaluation_description": "exact_match",
+        "lower_is_better": false,
+        "score_type": null,
+        "level_names": null,
+        "level_metadata": null,
+        "has_unknown_level": null,
+        "min_score": null,
+        "max_score": null
+      },
+      "score_details": {
+        "score": 0.3,
+        "details": {
+          "count": 1,
+          "sum": 0.3,
+          "sum_squared": 0.09,
+          "min": 0.3,
+          "max": 0.3,
+          "mean": 0.3,
+          "variance": 0.0,
+          "stddev": 0.0,
+          "split": "valid",
+          "perturbation": {
+            "name": "fairness",
+            "robustness": false,
+            "fairness": true,
+            "computed_on": "worst",
+            "seed": null
+          }
+        }
+      },
+      "detailed_evaluation_results_url": null,
+      "generation_config": {
+        "temperature": 0.0,
+        "max_tokens": 1,
+        "stop_sequences": [
+          "\n"
+        ],
+        "instructions": "The following are multiple choice questions (with answers) about common sense.\n",
+        "input_prefix": "Question: ",
+        "input_suffix": "\n",
+        "output_prefix": "Answer: ",
+        "output_suffix": "\n",
+        "instance_prefix": "\n"
+      }
+    },
+    {
+      "evaluation_name": "multiple_choice_joint",
+      "evaluation_timestamp": "1751729998",
+      "metric_config": {
+        "evaluation_description": "exact_match",
+        "lower_is_better": false,
+        "score_type": null,
+        "level_names": null,
+        "level_metadata": null,
+        "has_unknown_level": null,
+        "min_score": null,
+        "max_score": null
+      },
+      "score_details": {
+        "score": 0.3,
+        "details": {
+          "count": 1,
+          "sum": 0.3,
+          "sum_squared": 0.09,
+          "min": 0.3,
+          "max": 0.3,
+          "mean": 0.3,
+          "variance": 0.0,
+          "stddev": 0.0,
+          "split": "valid",
+          "perturbation": {
+            "name": "robustness",
+            "robustness": true,
+            "fairness": false,
+            "computed_on": "worst",
+            "seed": null
+          }
+        }
+      },
+      "detailed_evaluation_results_url": null,
+      "generation_config": {
+        "temperature": 0.0,
+        "max_tokens": 1,
+        "stop_sequences": [
+          "\n"
+        ],
+        "instructions": "The following are multiple choice questions (with answers) about common sense.\n",
+        "input_prefix": "Question: ",
+        "input_suffix": "\n",
+        "output_prefix": "Answer: ",
+        "output_suffix": "\n",
+        "instance_prefix": "\n"
+      }
+    },
+    {
+      "evaluation_name": "multiple_choice_joint",
+      "evaluation_timestamp": "1751729998",
+      "metric_config": {
+        "evaluation_description": "exact_match",
+        "lower_is_better": false,
+        "score_type": null,
+        "level_names": null,
+        "level_metadata": null,
+        "has_unknown_level": null,
+        "min_score": null,
+        "max_score": null
+      },
+      "score_details": {
+        "score": 0.3,
+        "details": {
+          "count": 1,
+          "sum": 0.3,
+          "sum_squared": 0.09,
+          "min": 0.3,
+          "max": 0.3,
+          "mean": 0.3,
+          "variance": 0.0,
+          "stddev": 0.0,
+          "split": "valid",
+          "perturbation": {
+            "name": "fairness",
+            "robustness": false,
+            "fairness": true,
+            "computed_on": "worst",
+            "seed": null
+          }
+        }
+      },
+      "detailed_evaluation_results_url": null,
+      "generation_config": {
+        "temperature": 0.0,
+        "max_tokens": 1,
+        "stop_sequences": [
+          "\n"
+        ],
+        "instructions": "The following are multiple choice questions (with answers) about common sense.\n",
+        "input_prefix": "Question: ",
+        "input_suffix": "\n",
+        "output_prefix": "Answer: ",
+        "output_suffix": "\n",
+        "instance_prefix": "\n"
+      }
+    },
+    {
+      "evaluation_name": "multiple_choice_joint",
+      "evaluation_timestamp": "1751729998",
+      "metric_config": {
+        "evaluation_description": "quasi_exact_match",
+        "lower_is_better": false,
+        "score_type": null,
+        "level_names": null,
+        "level_metadata": null,
+        "has_unknown_level": null,
+        "min_score": null,
+        "max_score": null
+      },
+      "score_details": {
+        "score": 0.3,
+        "details": {
+          "count": 1,
+          "sum": 0.3,
+          "sum_squared": 0.09,
+          "min": 0.3,
+          "max": 0.3,
+          "mean": 0.3,
+          "variance": 0.0,
+          "stddev": 0.0,
+          "split": "valid",
+          "perturbation": null
+        }
+      },
+      "detailed_evaluation_results_url": null,
+      "generation_config": {
+        "temperature": 0.0,
+        "max_tokens": 1,
+        "stop_sequences": [
+          "\n"
+        ],
+        "instructions": "The following are multiple choice questions (with answers) about common sense.\n",
+        "input_prefix": "Question: ",
+        "input_suffix": "\n",
+        "output_prefix": "Answer: ",
+        "output_suffix": "\n",
+        "instance_prefix": "\n"
+      }
+    },
+    {
+      "evaluation_name": "multiple_choice_joint",
+      "evaluation_timestamp": "1751729998",
+      "metric_config": {
+        "evaluation_description": "quasi_exact_match",
+        "lower_is_better": false,
+        "score_type": null,
+        "level_names": null,
+        "level_metadata": null,
+        "has_unknown_level": null,
+        "min_score": null,
+        "max_score": null
+      },
+      "score_details": {
+        "score": 0.3,
+        "details": {
+          "count": 1,
+          "sum": 0.3,
+          "sum_squared": 0.09,
+          "min": 0.3,
+          "max": 0.3,
+          "mean": 0.3,
+          "variance": 0.0,
+          "stddev": 0.0,
+          "split": "valid",
+          "perturbation": null
+        }
+      },
+      "detailed_evaluation_results_url": null,
+      "generation_config": {
+        "temperature": 0.0,
+        "max_tokens": 1,
+        "stop_sequences": [
+          "\n"
+        ],
+        "instructions": "The following are multiple choice questions (with answers) about common sense.\n",
+        "input_prefix": "Question: ",
+        "input_suffix": "\n",
+        "output_prefix": "Answer: ",
+        "output_suffix": "\n",
+        "instance_prefix": "\n"
+      }
+    },
+    {
+      "evaluation_name": "multiple_choice_joint",
+      "evaluation_timestamp": "1751729998",
+      "metric_config": {
+        "evaluation_description": "quasi_exact_match",
+        "lower_is_better": false,
+        "score_type": null,
+        "level_names": null,
+        "level_metadata": null,
+        "has_unknown_level": null,
+        "min_score": null,
+        "max_score": null
+      },
+      "score_details": {
+        "score": 0.3,
+        "details": {
+          "count": 1,
+          "sum": 0.3,
+          "sum_squared": 0.09,
+          "min": 0.3,
+          "max": 0.3,
+          "mean": 0.3,
+          "variance": 0.0,
+          "stddev": 0.0,
+          "split": "valid",
+          "perturbation": {
+            "name": "robustness",
+            "robustness": true,
+            "fairness": false,
+            "computed_on": "worst",
+            "seed": null
+          }
+        }
+      },
+      "detailed_evaluation_results_url": null,
+      "generation_config": {
+        "temperature": 0.0,
+        "max_tokens": 1,
+        "stop_sequences": [
+          "\n"
+        ],
+        "instructions": "The following are multiple choice questions (with answers) about common sense.\n",
+        "input_prefix": "Question: ",
+        "input_suffix": "\n",
+        "output_prefix": "Answer: ",
+        "output_suffix": "\n",
+        "instance_prefix": "\n"
+      }
+    },
+    {
+      "evaluation_name": "multiple_choice_joint",
+      "evaluation_timestamp": "1751729998",
+      "metric_config": {
+        "evaluation_description": "quasi_exact_match",
+        "lower_is_better": false,
+        "score_type": null,
+        "level_names": null,
+        "level_metadata": null,
+        "has_unknown_level": null,
+        "min_score": null,
+        "max_score": null
+      },
+      "score_details": {
+        "score": 0.3,
+        "details": {
+          "count": 1,
+          "sum": 0.3,
+          "sum_squared": 0.09,
+          "min": 0.3,
+          "max": 0.3,
+          "mean": 0.3,
+          "variance": 0.0,
+          "stddev": 0.0,
+          "split": "valid",
+          "perturbation": {
+            "name": "fairness",
+            "robustness": false,
+            "fairness": true,
+            "computed_on": "worst",
+            "seed": null
+          }
+        }
+      },
+      "detailed_evaluation_results_url": null,
+      "generation_config": {
+        "temperature": 0.0,
+        "max_tokens": 1,
+        "stop_sequences": [
+          "\n"
+        ],
+        "instructions": "The following are multiple choice questions (with answers) about common sense.\n",
+        "input_prefix": "Question: ",
+        "input_suffix": "\n",
+        "output_prefix": "Answer: ",
+        "output_suffix": "\n",
+        "instance_prefix": "\n"
+      }
+    },
+    {
+      "evaluation_name": "multiple_choice_joint",
+      "evaluation_timestamp": "1751729998",
+      "metric_config": {
+        "evaluation_description": "quasi_exact_match",
+        "lower_is_better": false,
+        "score_type": null,
+        "level_names": null,
+        "level_metadata": null,
+        "has_unknown_level": null,
+        "min_score": null,
+        "max_score": null
+      },
+      "score_details": {
+        "score": 0.3,
+        "details": {
+          "count": 1,
+          "sum": 0.3,
+          "sum_squared": 0.09,
+          "min": 0.3,
+          "max": 0.3,
+          "mean": 0.3,
+          "variance": 0.0,
+          "stddev": 0.0,
+          "split": "valid",
+          "perturbation": {
+            "name": "robustness",
+            "robustness": true,
+            "fairness": false,
+            "computed_on": "worst",
+            "seed": null
+          }
+        }
+      },
+      "detailed_evaluation_results_url": null,
+      "generation_config": {
+        "temperature": 0.0,
+        "max_tokens": 1,
+        "stop_sequences": [
+          "\n"
+        ],
+        "instructions": "The following are multiple choice questions (with answers) about common sense.\n",
+        "input_prefix": "Question: ",
+        "input_suffix": "\n",
+        "output_prefix": "Answer: ",
+        "output_suffix": "\n",
+        "instance_prefix": "\n"
+      }
+    },
+    {
+      "evaluation_name": "multiple_choice_joint",
+      "evaluation_timestamp": "1751729998",
+      "metric_config": {
+        "evaluation_description": "quasi_exact_match",
+        "lower_is_better": false,
+        "score_type": null,
+        "level_names": null,
+        "level_metadata": null,
+        "has_unknown_level": null,
+        "min_score": null,
+        "max_score": null
+      },
+      "score_details": {
+        "score": 0.3,
+        "details": {
+          "count": 1,
+          "sum": 0.3,
+          "sum_squared": 0.09,
+          "min": 0.3,
+          "max": 0.3,
+          "mean": 0.3,
+          "variance": 0.0,
+          "stddev": 0.0,
+          "split": "valid",
+          "perturbation": {
+            "name": "fairness",
+            "robustness": false,
+            "fairness": true,
+            "computed_on": "worst",
+            "seed": null
+          }
+        }
+      },
+      "detailed_evaluation_results_url": null,
+      "generation_config": {
+        "temperature": 0.0,
+        "max_tokens": 1,
+        "stop_sequences": [
+          "\n"
+        ],
+        "instructions": "The following are multiple choice questions (with answers) about common sense.\n",
+        "input_prefix": "Question: ",
+        "input_suffix": "\n",
+        "output_prefix": "Answer: ",
+        "output_suffix": "\n",
+        "instance_prefix": "\n"
+      }
+    },
+    {
+      "evaluation_name": "multiple_choice_joint",
+      "evaluation_timestamp": "1751729998",
+      "metric_config": {
+        "evaluation_description": "prefix_exact_match",
+        "lower_is_better": false,
+        "score_type": null,
+        "level_names": null,
+        "level_metadata": null,
+        "has_unknown_level": null,
+        "min_score": null,
+        "max_score": null
+      },
+      "score_details": {
+        "score": 0.3,
+        "details": {
+          "count": 1,
+          "sum": 0.3,
+          "sum_squared": 0.09,
+          "min": 0.3,
+          "max": 0.3,
+          "mean": 0.3,
+          "variance": 0.0,
+          "stddev": 0.0,
+          "split": "valid",
+          "perturbation": null
+        }
+      },
+      "detailed_evaluation_results_url": null,
+      "generation_config": {
+        "temperature": 0.0,
+        "max_tokens": 1,
+        "stop_sequences": [
+          "\n"
+        ],
+        "instructions": "The following are multiple choice questions (with answers) about common sense.\n",
+        "input_prefix": "Question: ",
+        "input_suffix": "\n",
+        "output_prefix": "Answer: ",
+        "output_suffix": "\n",
+        "instance_prefix": "\n"
+      }
+    },
+    {
+      "evaluation_name": "multiple_choice_joint",
+      "evaluation_timestamp": "1751729998",
+      "metric_config": {
+        "evaluation_description": "prefix_exact_match",
+        "lower_is_better": false,
+        "score_type": null,
+        "level_names": null,
+        "level_metadata": null,
+        "has_unknown_level": null,
+        "min_score": null,
+        "max_score": null
+      },
+      "score_details": {
+        "score": 0.3,
+        "details": {
+          "count": 1,
+          "sum": 0.3,
+          "sum_squared": 0.09,
+          "min": 0.3,
+          "max": 0.3,
+          "mean": 0.3,
+          "variance": 0.0,
+          "stddev": 0.0,
+          "split": "valid",
+          "perturbation": null
+        }
+      },
+      "detailed_evaluation_results_url": null,
+      "generation_config": {
+        "temperature": 0.0,
+        "max_tokens": 1,
+        "stop_sequences": [
+          "\n"
+        ],
+        "instructions": "The following are multiple choice questions (with answers) about common sense.\n",
+        "input_prefix": "Question: ",
+        "input_suffix": "\n",
+        "output_prefix": "Answer: ",
+        "output_suffix": "\n",
+        "instance_prefix": "\n"
+      }
+    },
+    {
+      "evaluation_name": "multiple_choice_joint",
+      "evaluation_timestamp": "1751729998",
+      "metric_config": {
+        "evaluation_description": "prefix_exact_match",
+        "lower_is_better": false,
+        "score_type": null,
+        "level_names": null,
+        "level_metadata": null,
+        "has_unknown_level": null,
+        "min_score": null,
+        "max_score": null
+      },
+      "score_details": {
+        "score": 0.3,
+        "details": {
+          "count": 1,
+          "sum": 0.3,
+          "sum_squared": 0.09,
+          "min": 0.3,
+          "max": 0.3,
+          "mean": 0.3,
+          "variance": 0.0,
+          "stddev": 0.0,
+          "split": "valid",
+          "perturbation": {
+            "name": "robustness",
+            "robustness": true,
+            "fairness": false,
+            "computed_on": "worst",
+            "seed": null
+          }
+        }
+      },
+      "detailed_evaluation_results_url": null,
+      "generation_config": {
+        "temperature": 0.0,
+        "max_tokens": 1,
+        "stop_sequences": [
+          "\n"
+        ],
+        "instructions": "The following are multiple choice questions (with answers) about common sense.\n",
+        "input_prefix": "Question: ",
+        "input_suffix": "\n",
+        "output_prefix": "Answer: ",
+        "output_suffix": "\n",
+        "instance_prefix": "\n"
+      }
+    },
+    {
+      "evaluation_name": "multiple_choice_joint",
+      "evaluation_timestamp": "1751729998",
+      "metric_config": {
+        "evaluation_description": "prefix_exact_match",
+        "lower_is_better": false,
+        "score_type": null,
+        "level_names": null,
+        "level_metadata": null,
+        "has_unknown_level": null,
+        "min_score": null,
+        "max_score": null
+      },
+      "score_details": {
+        "score": 0.3,
+        "details": {
+          "count": 1,
+          "sum": 0.3,
+          "sum_squared": 0.09,
+          "min": 0.3,
+          "max": 0.3,
+          "mean": 0.3,
+          "variance": 0.0,
+          "stddev": 0.0,
+          "split": "valid",
+          "perturbation": {
+            "name": "fairness",
+            "robustness": false,
+            "fairness": true,
+            "computed_on": "worst",
+            "seed": null
+          }
+        }
+      },
+      "detailed_evaluation_results_url": null,
+      "generation_config": {
+        "temperature": 0.0,
+        "max_tokens": 1,
+        "stop_sequences": [
+          "\n"
+        ],
+        "instructions": "The following are multiple choice questions (with answers) about common sense.\n",
+        "input_prefix": "Question: ",
+        "input_suffix": "\n",
+        "output_prefix": "Answer: ",
+        "output_suffix": "\n",
+        "instance_prefix": "\n"
+      }
+    },
+    {
+      "evaluation_name": "multiple_choice_joint",
+      "evaluation_timestamp": "1751729998",
+      "metric_config": {
+        "evaluation_description": "prefix_exact_match",
+        "lower_is_better": false,
+        "score_type": null,
+        "level_names": null,
+        "level_metadata": null,
+        "has_unknown_level": null,
+        "min_score": null,
+        "max_score": null
+      },
+      "score_details": {
+        "score": 0.3,
+        "details": {
+          "count": 1,
+          "sum": 0.3,
+          "sum_squared": 0.09,
+          "min": 0.3,
+          "max": 0.3,
+          "mean": 0.3,
+          "variance": 0.0,
+          "stddev": 0.0,
+          "split": "valid",
+          "perturbation": {
+            "name": "robustness",
+            "robustness": true,
+            "fairness": false,
+            "computed_on": "worst",
+            "seed": null
+          }
+        }
+      },
+      "detailed_evaluation_results_url": null,
+      "generation_config": {
+        "temperature": 0.0,
+        "max_tokens": 1,
+        "stop_sequences": [
+          "\n"
+        ],
+        "instructions": "The following are multiple choice questions (with answers) about common sense.\n",
+        "input_prefix": "Question: ",
+        "input_suffix": "\n",
+        "output_prefix": "Answer: ",
+        "output_suffix": "\n",
+        "instance_prefix": "\n"
+      }
+    },
+    {
+      "evaluation_name": "multiple_choice_joint",
+      "evaluation_timestamp": "1751729998",
+      "metric_config": {
+        "evaluation_description": "prefix_exact_match",
+        "lower_is_better": false,
+        "score_type": null,
+        "level_names": null,
+        "level_metadata": null,
+        "has_unknown_level": null,
+        "min_score": null,
+        "max_score": null
+      },
+      "score_details": {
+        "score": 0.3,
+        "details": {
+          "count": 1,
+          "sum": 0.3,
+          "sum_squared": 0.09,
+          "min": 0.3,
+          "max": 0.3,
+          "mean": 0.3,
+          "variance": 0.0,
+          "stddev": 0.0,
+          "split": "valid",
+          "perturbation": {
+            "name": "fairness",
+            "robustness": false,
+            "fairness": true,
+            "computed_on": "worst",
+            "seed": null
+          }
+        }
+      },
+      "detailed_evaluation_results_url": null,
+      "generation_config": {
+        "temperature": 0.0,
+        "max_tokens": 1,
+        "stop_sequences": [
+          "\n"
+        ],
+        "instructions": "The following are multiple choice questions (with answers) about common sense.\n",
+        "input_prefix": "Question: ",
+        "input_suffix": "\n",
+        "output_prefix": "Answer: ",
+        "output_suffix": "\n",
+        "instance_prefix": "\n"
+      }
+    },
+    {
+      "evaluation_name": "multiple_choice_joint",
+      "evaluation_timestamp": "1751729998",
+      "metric_config": {
+        "evaluation_description": "quasi_prefix_exact_match",
+        "lower_is_better": false,
+        "score_type": null,
+        "level_names": null,
+        "level_metadata": null,
+        "has_unknown_level": null,
+        "min_score": null,
+        "max_score": null
+      },
+      "score_details": {
+        "score": 0.3,
+        "details": {
+          "count": 1,
+          "sum": 0.3,
+          "sum_squared": 0.09,
+          "min": 0.3,
+          "max": 0.3,
+          "mean": 0.3,
+          "variance": 0.0,
+          "stddev": 0.0,
+          "split": "valid",
+          "perturbation": null
+        }
+      },
+      "detailed_evaluation_results_url": null,
+      "generation_config": {
+        "temperature": 0.0,
+        "max_tokens": 1,
+        "stop_sequences": [
+          "\n"
+        ],
+        "instructions": "The following are multiple choice questions (with answers) about common sense.\n",
+        "input_prefix": "Question: ",
+        "input_suffix": "\n",
+        "output_prefix": "Answer: ",
+        "output_suffix": "\n",
+        "instance_prefix": "\n"
+      }
+    },
+    {
+      "evaluation_name": "multiple_choice_joint",
+      "evaluation_timestamp": "1751729998",
+      "metric_config": {
+        "evaluation_description": "quasi_prefix_exact_match",
+        "lower_is_better": false,
+        "score_type": null,
+        "level_names": null,
+        "level_metadata": null,
+        "has_unknown_level": null,
+        "min_score": null,
+        "max_score": null
+      },
+      "score_details": {
+        "score": 0.3,
+        "details": {
+          "count": 1,
+          "sum": 0.3,
+          "sum_squared": 0.09,
+          "min": 0.3,
+          "max": 0.3,
+          "mean": 0.3,
+          "variance": 0.0,
+          "stddev": 0.0,
+          "split": "valid",
+          "perturbation": null
+        }
+      },
+      "detailed_evaluation_results_url": null,
+      "generation_config": {
+        "temperature": 0.0,
+        "max_tokens": 1,
+        "stop_sequences": [
+          "\n"
+        ],
+        "instructions": "The following are multiple choice questions (with answers) about common sense.\n",
+        "input_prefix": "Question: ",
+        "input_suffix": "\n",
+        "output_prefix": "Answer: ",
+        "output_suffix": "\n",
+        "instance_prefix": "\n"
+      }
+    },
+    {
+      "evaluation_name": "multiple_choice_joint",
+      "evaluation_timestamp": "1751729998",
+      "metric_config": {
+        "evaluation_description": "quasi_prefix_exact_match",
+        "lower_is_better": false,
+        "score_type": null,
+        "level_names": null,
+        "level_metadata": null,
+        "has_unknown_level": null,
+        "min_score": null,
+        "max_score": null
+      },
+      "score_details": {
+        "score": 0.3,
+        "details": {
+          "count": 1,
+          "sum": 0.3,
+          "sum_squared": 0.09,
+          "min": 0.3,
+          "max": 0.3,
+          "mean": 0.3,
+          "variance": 0.0,
+          "stddev": 0.0,
+          "split": "valid",
+          "perturbation": {
+            "name": "robustness",
+            "robustness": true,
+            "fairness": false,
+            "computed_on": "worst",
+            "seed": null
+          }
+        }
+      },
+      "detailed_evaluation_results_url": null,
+      "generation_config": {
+        "temperature": 0.0,
+        "max_tokens": 1,
+        "stop_sequences": [
+          "\n"
+        ],
+        "instructions": "The following are multiple choice questions (with answers) about common sense.\n",
+        "input_prefix": "Question: ",
+        "input_suffix": "\n",
+        "output_prefix": "Answer: ",
+        "output_suffix": "\n",
+        "instance_prefix": "\n"
+      }
+    },
+    {
+      "evaluation_name": "multiple_choice_joint",
+      "evaluation_timestamp": "1751729998",
+      "metric_config": {
+        "evaluation_description": "quasi_prefix_exact_match",
+        "lower_is_better": false,
+        "score_type": null,
+        "level_names": null,
+        "level_metadata": null,
+        "has_unknown_level": null,
+        "min_score": null,
+        "max_score": null
+      },
+      "score_details": {
+        "score": 0.3,
+        "details": {
+          "count": 1,
+          "sum": 0.3,
+          "sum_squared": 0.09,
+          "min": 0.3,
+          "max": 0.3,
+          "mean": 0.3,
+          "variance": 0.0,
+          "stddev": 0.0,
+          "split": "valid",
+          "perturbation": {
+            "name": "fairness",
+            "robustness": false,
+            "fairness": true,
+            "computed_on": "worst",
+            "seed": null
+          }
+        }
+      },
+      "detailed_evaluation_results_url": null,
+      "generation_config": {
+        "temperature": 0.0,
+        "max_tokens": 1,
+        "stop_sequences": [
+          "\n"
+        ],
+        "instructions": "The following are multiple choice questions (with answers) about common sense.\n",
+        "input_prefix": "Question: ",
+        "input_suffix": "\n",
+        "output_prefix": "Answer: ",
+        "output_suffix": "\n",
+        "instance_prefix": "\n"
+      }
+    },
+    {
+      "evaluation_name": "multiple_choice_joint",
+      "evaluation_timestamp": "1751729998",
+      "metric_config": {
+        "evaluation_description": "quasi_prefix_exact_match",
+        "lower_is_better": false,
+        "score_type": null,
+        "level_names": null,
+        "level_metadata": null,
+        "has_unknown_level": null,
+        "min_score": null,
+        "max_score": null
+      },
+      "score_details": {
+        "score": 0.3,
+        "details": {
+          "count": 1,
+          "sum": 0.3,
+          "sum_squared": 0.09,
+          "min": 0.3,
+          "max": 0.3,
+          "mean": 0.3,
+          "variance": 0.0,
+          "stddev": 0.0,
+          "split": "valid",
+          "perturbation": {
+            "name": "robustness",
+            "robustness": true,
+            "fairness": false,
+            "computed_on": "worst",
+            "seed": null
+          }
+        }
+      },
+      "detailed_evaluation_results_url": null,
+      "generation_config": {
+        "temperature": 0.0,
+        "max_tokens": 1,
+        "stop_sequences": [
+          "\n"
+        ],
+        "instructions": "The following are multiple choice questions (with answers) about common sense.\n",
+        "input_prefix": "Question: ",
+        "input_suffix": "\n",
+        "output_prefix": "Answer: ",
+        "output_suffix": "\n",
+        "instance_prefix": "\n"
+      }
+    },
+    {
+      "evaluation_name": "multiple_choice_joint",
+      "evaluation_timestamp": "1751729998",
+      "metric_config": {
+        "evaluation_description": "quasi_prefix_exact_match",
+        "lower_is_better": false,
+        "score_type": null,
+        "level_names": null,
+        "level_metadata": null,
+        "has_unknown_level": null,
+        "min_score": null,
+        "max_score": null
+      },
+      "score_details": {
+        "score": 0.3,
+        "details": {
+          "count": 1,
+          "sum": 0.3,
+          "sum_squared": 0.09,
+          "min": 0.3,
+          "max": 0.3,
+          "mean": 0.3,
+          "variance": 0.0,
+          "stddev": 0.0,
+          "split": "valid",
+          "perturbation": {
+            "name": "fairness",
+            "robustness": false,
+            "fairness": true,
+            "computed_on": "worst",
+            "seed": null
+          }
+        }
+      },
+      "detailed_evaluation_results_url": null,
+      "generation_config": {
+        "temperature": 0.0,
+        "max_tokens": 1,
+        "stop_sequences": [
+          "\n"
+        ],
+        "instructions": "The following are multiple choice questions (with answers) about common sense.\n",
+        "input_prefix": "Question: ",
+        "input_suffix": "\n",
+        "output_prefix": "Answer: ",
+        "output_suffix": "\n",
+        "instance_prefix": "\n"
+      }
+    }
+  ],
+  "detailed_evaluation_results_per_samples": [
+    {
+      "sample_id": "id44874",
+      "input": "Personal Care and Style: [header] How to dye your hair with semi permanent hair dye [title] Find the color you want. [step] There are many popular brands and hundreds of different colors to choose from. Semi-permanent dyes can be found in a variety of places, ranging from grocery stores to specialized fashion shops, with the biggest selection at beauty supply stores. ",
+      "prompt": "The following are multiple choice questions (with answers) about common sense.\n\nQuestion: Putting on makeup: The words how to apply mascara appear. A woman with long black hair\nA. presents a bow and ribbon, followed by painted eye shadow, and a gift wrap.\nB. appears on a black screen with white lines, soap, and information at the bottom.\nC. appears in the large window.\nD. is talking to the camera.\nAnswer: D\n\nQuestion: Polishing shoes: The man uses the polish on his shoes. The man brushes his shoes with occasional blowing. The man shines his shoes. The man\nA. replaces the frame and arm strap on the shoes.\nB. shines balls of polish onto his shoes.\nC. puts the shoes on the stand to style them.\nD. talks to the camera.\nAnswer: D\n\nQuestion: Laying tile: The floor is swept, cleaned, and prepared for the process. The men begin to lay the vinyl flooring across the floor one piece at a time. The final result\nA. is shown on the screen.\nB. of the carpet is shown.\nC. is displayed in the male success rate.\nD. is seen with one ran through the floor.\nAnswer: A\n\nQuestion: Waterskiing: A person is water skiing behind a boat. They are going back and forth behind the boat. Words\nA. are on the screen.\nB. come onto the screen at the end.\nC. appear on the screen.\nD. are shown on the screen.\nAnswer: D\n\nQuestion: Ping-pong: We see an instructional title screen. The man demonstrates hitting a ball and we see him in play hitting the ball. We\nA. see the man beating a bag over a net.\nB. see the ending title screen.\nC. see a disc fly from 2 people.\nD. see the ending title screen again.\nAnswer: B\n\nQuestion: Personal Care and Style: [header] How to dye your hair with semi permanent hair dye [title] Find the color you want. [step] There are many popular brands and hundreds of different colors to choose from. Semi-permanent dyes can be found in a variety of places, ranging from grocery stores to specialized fashion shops, with the biggest selection at beauty supply stores. \nA. It is important to select the color that represents your hair type when you register your hair color. [substeps] Traditional semi-permanent dyes will generally not be available for hair color, like blow-dryers, curling irons, and appliances.\nB. If you're not planning on dying your hair, there are other coloration measures you can take to dye your hair. [step] Photoshop hd darkers work well, but don't lack the style that can be coupled with it.\nC. Pick the color that's your favorite, matches your wardrobe best, and/or is most flattering for your eye color and skin tone. Semi-permanent dyes work on all hair colors, but show up brightest on light hair.\nD. However, you can also take your color, added color, and texture into account when deciding what to dye, and what you will use it for. [substeps] Consider adding your hair dye to your hair if you have it long or curly.\nAnswer:",
+      "ground_truth": "C",
+      "response": "B",
+      "choices": [
+        "A. It is important to select the color that represents your hair type when you register your hair color. [substeps] Traditional semi-permanent dyes will generally not be available for hair color, like blow-dryers, curling irons, and appliances.",
+        "B. If you're not planning on dying your hair, there are other coloration measures you can take to dye your hair. [step] Photoshop hd darkers work well, but don't lack the style that can be coupled with it.",
+        "C. Pick the color that's your favorite, matches your wardrobe best, and/or is most flattering for your eye color and skin tone. Semi-permanent dyes work on all hair colors, but show up brightest on light hair.",
+        "D. However, you can also take your color, added color, and texture into account when deciding what to dye, and what you will use it for. [substeps] Consider adding your hair dye to your hair if you have it long or curly."
+      ],
+      "full_logprobs": null
+    },
+    {
+      "sample_id": "id47299",
+      "input": "Home and Garden: [header] How to grow grape vines [title] Choose a type of grape. [step] As with any plant, certain types of grapes grow better in different areas and offer up different flavors and appearances. There are three general types of grapes: american, european, and muscadine grapes. ",
+      "prompt": "The following are multiple choice questions (with answers) about common sense.\n\nQuestion: Putting on makeup: The words how to apply mascara appear. A woman with long black hair\nA. presents a bow and ribbon, followed by painted eye shadow, and a gift wrap.\nB. appears on a black screen with white lines, soap, and information at the bottom.\nC. appears in the large window.\nD. is talking to the camera.\nAnswer: D\n\nQuestion: Polishing shoes: The man uses the polish on his shoes. The man brushes his shoes with occasional blowing. The man shines his shoes. The man\nA. replaces the frame and arm strap on the shoes.\nB. shines balls of polish onto his shoes.\nC. puts the shoes on the stand to style them.\nD. talks to the camera.\nAnswer: D\n\nQuestion: Laying tile: The floor is swept, cleaned, and prepared for the process. The men begin to lay the vinyl flooring across the floor one piece at a time. The final result\nA. is shown on the screen.\nB. of the carpet is shown.\nC. is displayed in the male success rate.\nD. is seen with one ran through the floor.\nAnswer: A\n\nQuestion: Waterskiing: A person is water skiing behind a boat. They are going back and forth behind the boat. Words\nA. are on the screen.\nB. come onto the screen at the end.\nC. appear on the screen.\nD. are shown on the screen.\nAnswer: D\n\nQuestion: Ping-pong: We see an instructional title screen. The man demonstrates hitting a ball and we see him in play hitting the ball. We\nA. see the man beating a bag over a net.\nB. see the ending title screen.\nC. see a disc fly from 2 people.\nD. see the ending title screen again.\nAnswer: B\n\nQuestion: Home and Garden: [header] How to grow grape vines [title] Choose a type of grape. [step] As with any plant, certain types of grapes grow better in different areas and offer up different flavors and appearances. There are three general types of grapes: american, european, and muscadine grapes. \nA. The' bat' variety is quite dark, with a hectic shape and a bit of texture. Popular grapes grow quickly for the same years and are often planted with white grapes and a rose color.\nB. [substeps] Traditional grape grapes are made of rich grapes and have light yellow and orange coloring. If the grapes are not from the wild and you are looking for a more bitter flavor, look for grapes grown in known regions that are not based on the wild grape.\nC. [substeps] American grapes are naturally sweet and plump, with skins that are slightly crisped. European grapes grow best in warm, dry conditions with ripened fruits on a stalk.\nD. American grapes grow best in warm, sunny climates like that of central california. European grapes are common in europe and northern parts of the us, and muscadine grapes are commonly found in the southern us.\nAnswer:",
+      "ground_truth": "D",
+      "response": "B",
+      "choices": [
+        "A. The' bat' variety is quite dark, with a hectic shape and a bit of texture. Popular grapes grow quickly for the same years and are often planted with white grapes and a rose color.",
+        "B. [substeps] Traditional grape grapes are made of rich grapes and have light yellow and orange coloring. If the grapes are not from the wild and you are looking for a more bitter flavor, look for grapes grown in known regions that are not based on the wild grape.",
+        "C. [substeps] American grapes are naturally sweet and plump, with skins that are slightly crisped. European grapes grow best in warm, dry conditions with ripened fruits on a stalk.",
+        "D. American grapes grow best in warm, sunny climates like that of central california. European grapes are common in europe and northern parts of the us, and muscadine grapes are commonly found in the southern us."
+      ],
+      "full_logprobs": null
+    },
+    {
+      "sample_id": "id45277",
+      "input": "Personal Care and Style: [header] How to do edwardian hairstyles [title] Backcomb or tease your hair to create volume. [step] Scrunch some volumizing powder or mousse into your roots, then grab a teasing brush or boar bristle flat brush. Starting at your hairline, take a section of hair about the size of your brush and hold the ends up in the air. ",
+      "prompt": "The following are multiple choice questions (with answers) about common sense.\n\nQuestion: Putting on makeup: The words how to apply mascara appear. A woman with long black hair\nA. presents a bow and ribbon, followed by painted eye shadow, and a gift wrap.\nB. appears on a black screen with white lines, soap, and information at the bottom.\nC. appears in the large window.\nD. is talking to the camera.\nAnswer: D\n\nQuestion: Polishing shoes: The man uses the polish on his shoes. The man brushes his shoes with occasional blowing. The man shines his shoes. The man\nA. replaces the frame and arm strap on the shoes.\nB. shines balls of polish onto his shoes.\nC. puts the shoes on the stand to style them.\nD. talks to the camera.\nAnswer: D\n\nQuestion: Laying tile: The floor is swept, cleaned, and prepared for the process. The men begin to lay the vinyl flooring across the floor one piece at a time. The final result\nA. is shown on the screen.\nB. of the carpet is shown.\nC. is displayed in the male success rate.\nD. is seen with one ran through the floor.\nAnswer: A\n\nQuestion: Waterskiing: A person is water skiing behind a boat. They are going back and forth behind the boat. Words\nA. are on the screen.\nB. come onto the screen at the end.\nC. appear on the screen.\nD. are shown on the screen.\nAnswer: D\n\nQuestion: Ping-pong: We see an instructional title screen. The man demonstrates hitting a ball and we see him in play hitting the ball. We\nA. see the man beating a bag over a net.\nB. see the ending title screen.\nC. see a disc fly from 2 people.\nD. see the ending title screen again.\nAnswer: B\n\nQuestion: Personal Care and Style: [header] How to do edwardian hairstyles [title] Backcomb or tease your hair to create volume. [step] Scrunch some volumizing powder or mousse into your roots, then grab a teasing brush or boar bristle flat brush. Starting at your hairline, take a section of hair about the size of your brush and hold the ends up in the air. \nA. Gently pull the ends up towards the roots. [title] Flatten your hairstyle to create natural streaks, and then repeat the process for extra volume.\nB. Lightly brush the hair back towards the root for 2-3 strokes. Work towards the back of your hair, repeating the process until all of your hair is teased.\nC. Sweep the styling iron from your ear (teasing the ends slightly in the direction toward your root) to create a teasing mist. Move around so the entire section is teased.\nD. Pin the sections above or below your roots so your hair lays flat. Lift a section of hair up and pull it down.\nAnswer:",
+      "ground_truth": "B",
+      "response": "B",
+      "choices": [
+        "A. Gently pull the ends up towards the roots. [title] Flatten your hairstyle to create natural streaks, and then repeat the process for extra volume.",
+        "B. Lightly brush the hair back towards the root for 2-3 strokes. Work towards the back of your hair, repeating the process until all of your hair is teased.",
+        "C. Sweep the styling iron from your ear (teasing the ends slightly in the direction toward your root) to create a teasing mist. Move around so the entire section is teased.",
+        "D. Pin the sections above or below your roots so your hair lays flat. Lift a section of hair up and pull it down."
+      ],
+      "full_logprobs": null
+    },
+    {
+      "sample_id": "id41992",
+      "input": "Having an ice cream: A young child is seen holding an ice cream cone and speaking to the camera while smiling. She",
+      "prompt": "The following are multiple choice questions (with answers) about common sense.\n\nQuestion: Putting on makeup: The words how to apply mascara appear. A woman with long black hair\nA. presents a bow and ribbon, followed by painted eye shadow, and a gift wrap.\nB. appears on a black screen with white lines, soap, and information at the bottom.\nC. appears in the large window.\nD. is talking to the camera.\nAnswer: D\n\nQuestion: Polishing shoes: The man uses the polish on his shoes. The man brushes his shoes with occasional blowing. The man shines his shoes. The man\nA. replaces the frame and arm strap on the shoes.\nB. shines balls of polish onto his shoes.\nC. puts the shoes on the stand to style them.\nD. talks to the camera.\nAnswer: D\n\nQuestion: Laying tile: The floor is swept, cleaned, and prepared for the process. The men begin to lay the vinyl flooring across the floor one piece at a time. The final result\nA. is shown on the screen.\nB. of the carpet is shown.\nC. is displayed in the male success rate.\nD. is seen with one ran through the floor.\nAnswer: A\n\nQuestion: Waterskiing: A person is water skiing behind a boat. They are going back and forth behind the boat. Words\nA. are on the screen.\nB. come onto the screen at the end.\nC. appear on the screen.\nD. are shown on the screen.\nAnswer: D\n\nQuestion: Ping-pong: We see an instructional title screen. The man demonstrates hitting a ball and we see him in play hitting the ball. We\nA. see the man beating a bag over a net.\nB. see the ending title screen.\nC. see a disc fly from 2 people.\nD. see the ending title screen again.\nAnswer: B\n\nQuestion: Having an ice cream: A young child is seen holding an ice cream cone and speaking to the camera while smiling. She\nA. continues speaking while using her mouth and pointing to the camera.\nB. continues speaking more and picking up ice cream and taking a chunk.\nC. licks the ice cream cone and continues eating around her toy.\nD. speaks to the camera man holding a doll in the other hand, then sits down on the ground.\nAnswer:",
+      "ground_truth": "D",
+      "response": "D",
+      "choices": [
+        "A. continues speaking while using her mouth and pointing to the camera.",
+        "B. continues speaking more and picking up ice cream and taking a chunk.",
+        "C. licks the ice cream cone and continues eating around her toy.",
+        "D. speaks to the camera man holding a doll in the other hand, then sits down on the ground."
+      ],
+      "full_logprobs": null
+    },
+    {
+      "sample_id": "id49438",
+      "input": "Personal Care and Style: [header] How to use a credit card system [title] Open a merchant account. [step] A merchant account is a bank account, but it is different from a business checking account. A merchant account communicates with your customers' credit card issuers. ",
+      "prompt": "The following are multiple choice questions (with answers) about common sense.\n\nQuestion: Putting on makeup: The words how to apply mascara appear. A woman with long black hair\nA. presents a bow and ribbon, followed by painted eye shadow, and a gift wrap.\nB. appears on a black screen with white lines, soap, and information at the bottom.\nC. appears in the large window.\nD. is talking to the camera.\nAnswer: D\n\nQuestion: Polishing shoes: The man uses the polish on his shoes. The man brushes his shoes with occasional blowing. The man shines his shoes. The man\nA. replaces the frame and arm strap on the shoes.\nB. shines balls of polish onto his shoes.\nC. puts the shoes on the stand to style them.\nD. talks to the camera.\nAnswer: D\n\nQuestion: Laying tile: The floor is swept, cleaned, and prepared for the process. The men begin to lay the vinyl flooring across the floor one piece at a time. The final result\nA. is shown on the screen.\nB. of the carpet is shown.\nC. is displayed in the male success rate.\nD. is seen with one ran through the floor.\nAnswer: A\n\nQuestion: Waterskiing: A person is water skiing behind a boat. They are going back and forth behind the boat. Words\nA. are on the screen.\nB. come onto the screen at the end.\nC. appear on the screen.\nD. are shown on the screen.\nAnswer: D\n\nQuestion: Ping-pong: We see an instructional title screen. The man demonstrates hitting a ball and we see him in play hitting the ball. We\nA. see the man beating a bag over a net.\nB. see the ending title screen.\nC. see a disc fly from 2 people.\nD. see the ending title screen again.\nAnswer: B\n\nQuestion: Personal Care and Style: [header] How to use a credit card system [title] Open a merchant account. [step] A merchant account is a bank account, but it is different from a business checking account. A merchant account communicates with your customers' credit card issuers. \nA. You fill out credits and other credit forms into your merchant account. You can do any transactions you'd like on a merchant account.\nB. It takes the customer's credit card information and verifies and approves the sale. Your merchant account communicates with the card issuer to authenticate the user, confirming that the card is not stolen and that the customer has enough of a balance to pay for the sale.\nC. Credit cards handle the transactions linked to your account. [title] Determine your business's credit score.\nD. Credit card companies offer a variety of services : [substeps] Subscription. Credit card issuers charge a fee for a card and fill out forms to use.\nAnswer:",
+      "ground_truth": "B",
+      "response": "D",
+      "choices": [
+        "A. You fill out credits and other credit forms into your merchant account. You can do any transactions you'd like on a merchant account.",
+        "B. It takes the customer's credit card information and verifies and approves the sale. Your merchant account communicates with the card issuer to authenticate the user, confirming that the card is not stolen and that the customer has enough of a balance to pay for the sale.",
+        "C. Credit cards handle the transactions linked to your account. [title] Determine your business's credit score.",
+        "D. Credit card companies offer a variety of services : [substeps] Subscription. Credit card issuers charge a fee for a card and fill out forms to use."
+      ],
+      "full_logprobs": null
+    },
+    {
+      "sample_id": "id42841",
+      "input": "Playing accordion: The fingers are pressing the black keys. One finger is pressing one key. The fingers",
+      "prompt": "The following are multiple choice questions (with answers) about common sense.\n\nQuestion: Putting on makeup: The words how to apply mascara appear. A woman with long black hair\nA. presents a bow and ribbon, followed by painted eye shadow, and a gift wrap.\nB. appears on a black screen with white lines, soap, and information at the bottom.\nC. appears in the large window.\nD. is talking to the camera.\nAnswer: D\n\nQuestion: Polishing shoes: The man uses the polish on his shoes. The man brushes his shoes with occasional blowing. The man shines his shoes. The man\nA. replaces the frame and arm strap on the shoes.\nB. shines balls of polish onto his shoes.\nC. puts the shoes on the stand to style them.\nD. talks to the camera.\nAnswer: D\n\nQuestion: Laying tile: The floor is swept, cleaned, and prepared for the process. The men begin to lay the vinyl flooring across the floor one piece at a time. The final result\nA. is shown on the screen.\nB. of the carpet is shown.\nC. is displayed in the male success rate.\nD. is seen with one ran through the floor.\nAnswer: A\n\nQuestion: Waterskiing: A person is water skiing behind a boat. They are going back and forth behind the boat. Words\nA. are on the screen.\nB. come onto the screen at the end.\nC. appear on the screen.\nD. are shown on the screen.\nAnswer: D\n\nQuestion: Ping-pong: We see an instructional title screen. The man demonstrates hitting a ball and we see him in play hitting the ball. We\nA. see the man beating a bag over a net.\nB. see the ending title screen.\nC. see a disc fly from 2 people.\nD. see the ending title screen again.\nAnswer: B\n\nQuestion: Playing accordion: The fingers are pressing the black keys. One finger is pressing one key. The fingers\nA. are moving to different keys and pressing them.\nB. are pumping the hand.\nC. stopped playing and stop at the bottom.\nD. flay the black keys in the middle of the keyboard.\nAnswer:",
+      "ground_truth": "A",
+      "response": "D",
+      "choices": [
+        "A. are moving to different keys and pressing them.",
+        "B. are pumping the hand.",
+        "C. stopped playing and stop at the bottom.",
+        "D. flay the black keys in the middle of the keyboard."
+      ],
+      "full_logprobs": null
+    },
+    {
+      "sample_id": "id46128",
+      "input": "Personal Care and Style: [header] How to wear a denim dress [title] Consider the wash of the dress. [step] Denim comes in a variety of colors, but most denim dresses are usually some shade of blue. However, you still have options when it comes to the depth of the blue. ",
+      "prompt": "The following are multiple choice questions (with answers) about common sense.\n\nQuestion: Putting on makeup: The words how to apply mascara appear. A woman with long black hair\nA. presents a bow and ribbon, followed by painted eye shadow, and a gift wrap.\nB. appears on a black screen with white lines, soap, and information at the bottom.\nC. appears in the large window.\nD. is talking to the camera.\nAnswer: D\n\nQuestion: Polishing shoes: The man uses the polish on his shoes. The man brushes his shoes with occasional blowing. The man shines his shoes. The man\nA. replaces the frame and arm strap on the shoes.\nB. shines balls of polish onto his shoes.\nC. puts the shoes on the stand to style them.\nD. talks to the camera.\nAnswer: D\n\nQuestion: Laying tile: The floor is swept, cleaned, and prepared for the process. The men begin to lay the vinyl flooring across the floor one piece at a time. The final result\nA. is shown on the screen.\nB. of the carpet is shown.\nC. is displayed in the male success rate.\nD. is seen with one ran through the floor.\nAnswer: A\n\nQuestion: Waterskiing: A person is water skiing behind a boat. They are going back and forth behind the boat. Words\nA. are on the screen.\nB. come onto the screen at the end.\nC. appear on the screen.\nD. are shown on the screen.\nAnswer: D\n\nQuestion: Ping-pong: We see an instructional title screen. The man demonstrates hitting a ball and we see him in play hitting the ball. We\nA. see the man beating a bag over a net.\nB. see the ending title screen.\nC. see a disc fly from 2 people.\nD. see the ending title screen again.\nAnswer: B\n\nQuestion: Personal Care and Style: [header] How to wear a denim dress [title] Consider the wash of the dress. [step] Denim comes in a variety of colors, but most denim dresses are usually some shade of blue. However, you still have options when it comes to the depth of the blue. \nA. Some denim washes work best for more polished looks, while others are ideal for a relaxed outfit. [substeps] Dark wash denim tends to have a more dressed up look, so they work well for work or an evening out.\nB. Depending on your dress's style, some of the colors will show up darker, and others won't. [substeps] In general, the darker the shade of the denim, the darker the dress.\nC. [substeps] Jeans tend to look better when washed and ironed on both sides. Likewise, jeans can also be washed and ironed on both sides of the dress, but with less work involved.\nD. [substeps] Look for dresses made of the same material or a lighter blue. Choose washes made of the same material, usually denim colors such as fuchsia, navy, and tan.\nAnswer:",
+      "ground_truth": "A",
+      "response": "B",
+      "choices": [
+        "A. Some denim washes work best for more polished looks, while others are ideal for a relaxed outfit. [substeps] Dark wash denim tends to have a more dressed up look, so they work well for work or an evening out.",
+        "B. Depending on your dress's style, some of the colors will show up darker, and others won't. [substeps] In general, the darker the shade of the denim, the darker the dress.",
+        "C. [substeps] Jeans tend to look better when washed and ironed on both sides. Likewise, jeans can also be washed and ironed on both sides of the dress, but with less work involved.",
+        "D. [substeps] Look for dresses made of the same material or a lighter blue. Choose washes made of the same material, usually denim colors such as fuchsia, navy, and tan."
+      ],
+      "full_logprobs": null
+    },
+    {
+      "sample_id": "id47975",
+      "input": "Family Life: [header] How to adopt a russian baby [title] Fill out form i-600a, application for advance processing of orphan petition. [step] This form is available through the u.s. citizenship and immigration services, and it is the necessary first step in obtaining an immigrant visa for your adoptive child. You do not need to have a specific child in mind to complete this form. ",
+      "prompt": "The following are multiple choice questions (with answers) about common sense.\n\nQuestion: Putting on makeup: The words how to apply mascara appear. A woman with long black hair\nA. presents a bow and ribbon, followed by painted eye shadow, and a gift wrap.\nB. appears on a black screen with white lines, soap, and information at the bottom.\nC. appears in the large window.\nD. is talking to the camera.\nAnswer: D\n\nQuestion: Polishing shoes: The man uses the polish on his shoes. The man brushes his shoes with occasional blowing. The man shines his shoes. The man\nA. replaces the frame and arm strap on the shoes.\nB. shines balls of polish onto his shoes.\nC. puts the shoes on the stand to style them.\nD. talks to the camera.\nAnswer: D\n\nQuestion: Laying tile: The floor is swept, cleaned, and prepared for the process. The men begin to lay the vinyl flooring across the floor one piece at a time. The final result\nA. is shown on the screen.\nB. of the carpet is shown.\nC. is displayed in the male success rate.\nD. is seen with one ran through the floor.\nAnswer: A\n\nQuestion: Waterskiing: A person is water skiing behind a boat. They are going back and forth behind the boat. Words\nA. are on the screen.\nB. come onto the screen at the end.\nC. appear on the screen.\nD. are shown on the screen.\nAnswer: D\n\nQuestion: Ping-pong: We see an instructional title screen. The man demonstrates hitting a ball and we see him in play hitting the ball. We\nA. see the man beating a bag over a net.\nB. see the ending title screen.\nC. see a disc fly from 2 people.\nD. see the ending title screen again.\nAnswer: B\n\nQuestion: Family Life: [header] How to adopt a russian baby [title] Fill out form i-600a, application for advance processing of orphan petition. [step] This form is available through the u.s. citizenship and immigration services, and it is the necessary first step in obtaining an immigrant visa for your adoptive child. You do not need to have a specific child in mind to complete this form. \nA. [substeps] The form needs to include information about the child and the country they are from. Don't forget to include all the information that you know: \" parent, child, next of kin.\nB. [title] Contact the uscis or a state-licensed adoption agency to request a home study. [step] Home studies must be performed in order to determine the fitness of you and the home environment you intend to bring a child into.\nC. [title] Complete section ii of form ii of visa application, application for advance processing of a russian baby. [step] This form is available through the u.s.\nD. The form is brief to provide basic information about yourself and your adoptive child, as well as what you are looking for and need to bring with you. [title] Submit the completed form via the mail or by fax.\nAnswer:",
+      "ground_truth": "B",
+      "response": "A",
+      "choices": [
+        "A. [substeps] The form needs to include information about the child and the country they are from. Don't forget to include all the information that you know: \" parent, child, next of kin.",
+        "B. [title] Contact the uscis or a state-licensed adoption agency to request a home study. [step] Home studies must be performed in order to determine the fitness of you and the home environment you intend to bring a child into.",
+        "C. [title] Complete section ii of form ii of visa application, application for advance processing of a russian baby. [step] This form is available through the u.s.",
+        "D. The form is brief to provide basic information about yourself and your adoptive child, as well as what you are looking for and need to bring with you. [title] Submit the completed form via the mail or by fax."
+      ],
+      "full_logprobs": null
+    },
+    {
+      "sample_id": "id41468",
+      "input": "Running a marathon: There are pictures of male participants shown dominating the scene where she was the only female participant. She talks about her experience as she shows more pictures of her participation against all odds. She",
+      "prompt": "The following are multiple choice questions (with answers) about common sense.\n\nQuestion: Putting on makeup: The words how to apply mascara appear. A woman with long black hair\nA. presents a bow and ribbon, followed by painted eye shadow, and a gift wrap.\nB. appears on a black screen with white lines, soap, and information at the bottom.\nC. appears in the large window.\nD. is talking to the camera.\nAnswer: D\n\nQuestion: Polishing shoes: The man uses the polish on his shoes. The man brushes his shoes with occasional blowing. The man shines his shoes. The man\nA. replaces the frame and arm strap on the shoes.\nB. shines balls of polish onto his shoes.\nC. puts the shoes on the stand to style them.\nD. talks to the camera.\nAnswer: D\n\nQuestion: Laying tile: The floor is swept, cleaned, and prepared for the process. The men begin to lay the vinyl flooring across the floor one piece at a time. The final result\nA. is shown on the screen.\nB. of the carpet is shown.\nC. is displayed in the male success rate.\nD. is seen with one ran through the floor.\nAnswer: A\n\nQuestion: Waterskiing: A person is water skiing behind a boat. They are going back and forth behind the boat. Words\nA. are on the screen.\nB. come onto the screen at the end.\nC. appear on the screen.\nD. are shown on the screen.\nAnswer: D\n\nQuestion: Ping-pong: We see an instructional title screen. The man demonstrates hitting a ball and we see him in play hitting the ball. We\nA. see the man beating a bag over a net.\nB. see the ending title screen.\nC. see a disc fly from 2 people.\nD. see the ending title screen again.\nAnswer: B\n\nQuestion: Running a marathon: There are pictures of male participants shown dominating the scene where she was the only female participant. She talks about her experience as she shows more pictures of her participation against all odds. She\nA. eventually walks off the set.\nB. also shares information on how things changed later when she was finally allowed to participate freely.\nC. talks about her experience again in waking and reading.\nD. challenges her opponents with her stance and high kick, doing incredible falls on the mat in the process.\nAnswer:",
+      "ground_truth": "B",
+      "response": "D",
+      "choices": [
+        "A. eventually walks off the set.",
+        "B. also shares information on how things changed later when she was finally allowed to participate freely.",
+        "C. talks about her experience again in waking and reading.",
+        "D. challenges her opponents with her stance and high kick, doing incredible falls on the mat in the process."
+      ],
+      "full_logprobs": null
+    },
+    {
+      "sample_id": "id44284",
+      "input": "Personal Care and Style: [header] How to keep hair from curling with humidity [title] Lock in moisture with conditioner. [step] Dry hair, especially curly hair, needs lots of moisture. This is because curly hair tends to be on the drier end of the spectrum compared to straight hair. ",
+      "prompt": "The following are multiple choice questions (with answers) about common sense.\n\nQuestion: Putting on makeup: The words how to apply mascara appear. A woman with long black hair\nA. presents a bow and ribbon, followed by painted eye shadow, and a gift wrap.\nB. appears on a black screen with white lines, soap, and information at the bottom.\nC. appears in the large window.\nD. is talking to the camera.\nAnswer: D\n\nQuestion: Polishing shoes: The man uses the polish on his shoes. The man brushes his shoes with occasional blowing. The man shines his shoes. The man\nA. replaces the frame and arm strap on the shoes.\nB. shines balls of polish onto his shoes.\nC. puts the shoes on the stand to style them.\nD. talks to the camera.\nAnswer: D\n\nQuestion: Laying tile: The floor is swept, cleaned, and prepared for the process. The men begin to lay the vinyl flooring across the floor one piece at a time. The final result\nA. is shown on the screen.\nB. of the carpet is shown.\nC. is displayed in the male success rate.\nD. is seen with one ran through the floor.\nAnswer: A\n\nQuestion: Waterskiing: A person is water skiing behind a boat. They are going back and forth behind the boat. Words\nA. are on the screen.\nB. come onto the screen at the end.\nC. appear on the screen.\nD. are shown on the screen.\nAnswer: D\n\nQuestion: Ping-pong: We see an instructional title screen. The man demonstrates hitting a ball and we see him in play hitting the ball. We\nA. see the man beating a bag over a net.\nB. see the ending title screen.\nC. see a disc fly from 2 people.\nD. see the ending title screen again.\nAnswer: B\n\nQuestion: Personal Care and Style: [header] How to keep hair from curling with humidity [title] Lock in moisture with conditioner. [step] Dry hair, especially curly hair, needs lots of moisture. This is because curly hair tends to be on the drier end of the spectrum compared to straight hair. \nA. Instead of styling damp hair, work in conditioner to restore moisture to your hair. [substeps] In severe weather, it's a good idea to mist your hair twice a day, once in the morning and once at night.\nB. If you have curly hair or textured hair, moisturizing is the key to preventing frizz when it's humid. [substeps] Choose a shampoo and conditioner that is made for curly hair.\nC. As such, this may make your hair release moisture into the air instead of in wet curls. [substeps] Use a steam wand or mini-cooler to draw moisture from your hair, such as a hair dryer or a head dryer.\nD. Set your hair loose with a spritz of water as needed. If it tends to curl less, wipe away the excess moisture with a silk scarf.\nAnswer:",
+      "ground_truth": "B",
+      "response": "B",
+      "choices": [
+        "A. Instead of styling damp hair, work in conditioner to restore moisture to your hair. [substeps] In severe weather, it's a good idea to mist your hair twice a day, once in the morning and once at night.",
+        "B. If you have curly hair or textured hair, moisturizing is the key to preventing frizz when it's humid. [substeps] Choose a shampoo and conditioner that is made for curly hair.",
+        "C. As such, this may make your hair release moisture into the air instead of in wet curls. [substeps] Use a steam wand or mini-cooler to draw moisture from your hair, such as a hair dryer or a head dryer.",
+        "D. Set your hair loose with a spritz of water as needed. If it tends to curl less, wipe away the excess moisture with a silk scarf."
+      ],
+      "full_logprobs": null
+    }
+  ]
+}
\ No newline at end of file
diff --git a/tests/data/helm/helm_openai_gpt2_mmlu_philosophy_1762354922.json b/tests/data/helm/helm_openai_gpt2_mmlu_philosophy_1762354922.json
new file mode 100644
index 0000000..6ee3971
--- /dev/null
+++ b/tests/data/helm/helm_openai_gpt2_mmlu_philosophy_1762354922.json
@@ -0,0 +1,2445 @@
+{
+  "schema_version": "0.0.1",
+  "evaluation_id": "helm/openai/gpt2/mmlu/philosophy/1762354922",
+  "retrieved_timestamp": "1762354922",
+  "source_data": {
+    "dataset_name": "mmlu",
+    "hf_repo": null,
+    "hf_split": null,
+    "samples_number": 10,
+    "sample_ids": [
+      "id147",
+      "id11",
+      "id65",
+      "id344",
+      "id59",
+      "id291",
+      "id131",
+      "id222",
+      "id259",
+      "id105"
+    ],
+    "additional_details": {
+      "scenario_name": "helm.benchmark.scenarios.mmlu_scenario.MMLUScenario",
+      "subject": "philosophy"
+    }
+  },
+  "evaluation_source": {
+    "evaluation_source_name": "helm",
+    "evaluation_source_type": "evaluation_platform"
+  },
+  "source_metadata": {
+    "source_organization_name": "Unknown",
+    "source_organization_url": null,
+    "source_organization_logo_url": null,
+    "evaluator_relationship": "other"
+  },
+  "model_info": {
+    "name": "openai/gpt2",
+    "id": "openai/gpt2",
+    "developer": "openai",
+    "inference_platform": "huggingface",
+    "inference_engine": null,
+    "additional_details": null
+  },
+  "evaluation_results": [
+    {
+      "evaluation_name": "multiple_choice_joint",
+      "evaluation_timestamp": "1762354922",
+      "metric_config": {
+        "evaluation_description": "exact_match",
+        "lower_is_better": false,
+        "score_type": null,
+        "level_names": null,
+        "level_metadata": null,
+        "has_unknown_level": null,
+        "min_score": null,
+        "max_score": null
+      },
+      "score_details": {
+        "score": 0.1111111111111111,
+        "details": {
+          "count": 1,
+          "sum": 0.1111111111111111,
+          "sum_squared": 0.012345679012345678,
+          "min": 0.1111111111111111,
+          "max": 0.1111111111111111,
+          "mean": 0.1111111111111111,
+          "variance": 0.0,
+          "stddev": 0.0,
+          "split": "test",
+          "perturbation": null
+        }
+      },
+      "detailed_evaluation_results_url": null,
+      "generation_config": {
+        "temperature": 0.0,
+        "max_tokens": 1,
+        "stop_sequences": [
+          "\n"
+        ],
+        "instructions": "The following are multiple choice questions (with answers) about philosophy.\n",
+        "input_prefix": "Question: ",
+        "input_suffix": "\n",
+        "output_prefix": "Answer: ",
+        "output_suffix": "\n",
+        "instance_prefix": "\n"
+      }
+    },
+    {
+      "evaluation_name": "multiple_choice_joint",
+      "evaluation_timestamp": "1762354922",
+      "metric_config": {
+        "evaluation_description": "exact_match",
+        "lower_is_better": false,
+        "score_type": null,
+        "level_names": null,
+        "level_metadata": null,
+        "has_unknown_level": null,
+        "min_score": null,
+        "max_score": null
+      },
+      "score_details": {
+        "score": 0.1111111111111111,
+        "details": {
+          "count": 1,
+          "sum": 0.1111111111111111,
+          "sum_squared": 0.012345679012345678,
+          "min": 0.1111111111111111,
+          "max": 0.1111111111111111,
+          "mean": 0.1111111111111111,
+          "variance": 0.0,
+          "stddev": 0.0,
+          "split": "test",
+          "perturbation": null
+        }
+      },
+      "detailed_evaluation_results_url": null,
+      "generation_config": {
+        "temperature": 0.0,
+        "max_tokens": 1,
+        "stop_sequences": [
+          "\n"
+        ],
+        "instructions": "The following are multiple choice questions (with answers) about philosophy.\n",
+        "input_prefix": "Question: ",
+        "input_suffix": "\n",
+        "output_prefix": "Answer: ",
+        "output_suffix": "\n",
+        "instance_prefix": "\n"
+      }
+    },
+    {
+      "evaluation_name": "multiple_choice_joint",
+      "evaluation_timestamp": "1762354922",
+      "metric_config": {
+        "evaluation_description": "exact_match",
+        "lower_is_better": false,
+        "score_type": null,
+        "level_names": null,
+        "level_metadata": null,
+        "has_unknown_level": null,
+        "min_score": null,
+        "max_score": null
+      },
+      "score_details": {
+        "score": 0.0,
+        "details": {
+          "count": 1,
+          "sum": 0.0,
+          "sum_squared": 0.0,
+          "min": 0.0,
+          "max": 0.0,
+          "mean": 0.0,
+          "variance": 0.0,
+          "stddev": 0.0,
+          "split": "valid",
+          "perturbation": null
+        }
+      },
+      "detailed_evaluation_results_url": null,
+      "generation_config": {
+        "temperature": 0.0,
+        "max_tokens": 1,
+        "stop_sequences": [
+          "\n"
+        ],
+        "instructions": "The following are multiple choice questions (with answers) about philosophy.\n",
+        "input_prefix": "Question: ",
+        "input_suffix": "\n",
+        "output_prefix": "Answer: ",
+        "output_suffix": "\n",
+        "instance_prefix": "\n"
+      }
+    },
+    {
+      "evaluation_name": "multiple_choice_joint",
+      "evaluation_timestamp": "1762354922",
+      "metric_config": {
+        "evaluation_description": "exact_match",
+        "lower_is_better": false,
+        "score_type": null,
+        "level_names": null,
+        "level_metadata": null,
+        "has_unknown_level": null,
+        "min_score": null,
+        "max_score": null
+      },
+      "score_details": {
+        "score": 0.0,
+        "details": {
+          "count": 1,
+          "sum": 0.0,
+          "sum_squared": 0.0,
+          "min": 0.0,
+          "max": 0.0,
+          "mean": 0.0,
+          "variance": 0.0,
+          "stddev": 0.0,
+          "split": "valid",
+          "perturbation": null
+        }
+      },
+      "detailed_evaluation_results_url": null,
+      "generation_config": {
+        "temperature": 0.0,
+        "max_tokens": 1,
+        "stop_sequences": [
+          "\n"
+        ],
+        "instructions": "The following are multiple choice questions (with answers) about philosophy.\n",
+        "input_prefix": "Question: ",
+        "input_suffix": "\n",
+        "output_prefix": "Answer: ",
+        "output_suffix": "\n",
+        "instance_prefix": "\n"
+      }
+    },
+    {
+      "evaluation_name": "multiple_choice_joint",
+      "evaluation_timestamp": "1762354922",
+      "metric_config": {
+        "evaluation_description": "exact_match",
+        "lower_is_better": false,
+        "score_type": null,
+        "level_names": null,
+        "level_metadata": null,
+        "has_unknown_level": null,
+        "min_score": null,
+        "max_score": null
+      },
+      "score_details": {
+        "score": 0.1111111111111111,
+        "details": {
+          "count": 1,
+          "sum": 0.1111111111111111,
+          "sum_squared": 0.012345679012345678,
+          "min": 0.1111111111111111,
+          "max": 0.1111111111111111,
+          "mean": 0.1111111111111111,
+          "variance": 0.0,
+          "stddev": 0.0,
+          "split": "test",
+          "perturbation": {
+            "name": "robustness",
+            "robustness": true,
+            "fairness": false,
+            "computed_on": "worst",
+            "seed": null
+          }
+        }
+      },
+      "detailed_evaluation_results_url": null,
+      "generation_config": {
+        "temperature": 0.0,
+        "max_tokens": 1,
+        "stop_sequences": [
+          "\n"
+        ],
+        "instructions": "The following are multiple choice questions (with answers) about philosophy.\n",
+        "input_prefix": "Question: ",
+        "input_suffix": "\n",
+        "output_prefix": "Answer: ",
+        "output_suffix": "\n",
+        "instance_prefix": "\n"
+      }
+    },
+    {
+      "evaluation_name": "multiple_choice_joint",
+      "evaluation_timestamp": "1762354922",
+      "metric_config": {
+        "evaluation_description": "exact_match",
+        "lower_is_better": false,
+        "score_type": null,
+        "level_names": null,
+        "level_metadata": null,
+        "has_unknown_level": null,
+        "min_score": null,
+        "max_score": null
+      },
+      "score_details": {
+        "score": 0.1111111111111111,
+        "details": {
+          "count": 1,
+          "sum": 0.1111111111111111,
+          "sum_squared": 0.012345679012345678,
+          "min": 0.1111111111111111,
+          "max": 0.1111111111111111,
+          "mean": 0.1111111111111111,
+          "variance": 0.0,
+          "stddev": 0.0,
+          "split": "test",
+          "perturbation": {
+            "name": "fairness",
+            "robustness": false,
+            "fairness": true,
+            "computed_on": "worst",
+            "seed": null
+          }
+        }
+      },
+      "detailed_evaluation_results_url": null,
+      "generation_config": {
+        "temperature": 0.0,
+        "max_tokens": 1,
+        "stop_sequences": [
+          "\n"
+        ],
+        "instructions": "The following are multiple choice questions (with answers) about philosophy.\n",
+        "input_prefix": "Question: ",
+        "input_suffix": "\n",
+        "output_prefix": "Answer: ",
+        "output_suffix": "\n",
+        "instance_prefix": "\n"
+      }
+    },
+    {
+      "evaluation_name": "multiple_choice_joint",
+      "evaluation_timestamp": "1762354922",
+      "metric_config": {
+        "evaluation_description": "exact_match",
+        "lower_is_better": false,
+        "score_type": null,
+        "level_names": null,
+        "level_metadata": null,
+        "has_unknown_level": null,
+        "min_score": null,
+        "max_score": null
+      },
+      "score_details": {
+        "score": 0.1111111111111111,
+        "details": {
+          "count": 1,
+          "sum": 0.1111111111111111,
+          "sum_squared": 0.012345679012345678,
+          "min": 0.1111111111111111,
+          "max": 0.1111111111111111,
+          "mean": 0.1111111111111111,
+          "variance": 0.0,
+          "stddev": 0.0,
+          "split": "test",
+          "perturbation": {
+            "name": "robustness",
+            "robustness": true,
+            "fairness": false,
+            "computed_on": "worst",
+            "seed": null
+          }
+        }
+      },
+      "detailed_evaluation_results_url": null,
+      "generation_config": {
+        "temperature": 0.0,
+        "max_tokens": 1,
+        "stop_sequences": [
+          "\n"
+        ],
+        "instructions": "The following are multiple choice questions (with answers) about philosophy.\n",
+        "input_prefix": "Question: ",
+        "input_suffix": "\n",
+        "output_prefix": "Answer: ",
+        "output_suffix": "\n",
+        "instance_prefix": "\n"
+      }
+    },
+    {
+      "evaluation_name": "multiple_choice_joint",
+      "evaluation_timestamp": "1762354922",
+      "metric_config": {
+        "evaluation_description": "exact_match",
+        "lower_is_better": false,
+        "score_type": null,
+        "level_names": null,
+        "level_metadata": null,
+        "has_unknown_level": null,
+        "min_score": null,
+        "max_score": null
+      },
+      "score_details": {
+        "score": 0.1111111111111111,
+        "details": {
+          "count": 1,
+          "sum": 0.1111111111111111,
+          "sum_squared": 0.012345679012345678,
+          "min": 0.1111111111111111,
+          "max": 0.1111111111111111,
+          "mean": 0.1111111111111111,
+          "variance": 0.0,
+          "stddev": 0.0,
+          "split": "test",
+          "perturbation": {
+            "name": "fairness",
+            "robustness": false,
+            "fairness": true,
+            "computed_on": "worst",
+            "seed": null
+          }
+        }
+      },
+      "detailed_evaluation_results_url": null,
+      "generation_config": {
+        "temperature": 0.0,
+        "max_tokens": 1,
+        "stop_sequences": [
+          "\n"
+        ],
+        "instructions": "The following are multiple choice questions (with answers) about philosophy.\n",
+        "input_prefix": "Question: ",
+        "input_suffix": "\n",
+        "output_prefix": "Answer: ",
+        "output_suffix": "\n",
+        "instance_prefix": "\n"
+      }
+    },
+    {
+      "evaluation_name": "multiple_choice_joint",
+      "evaluation_timestamp": "1762354922",
+      "metric_config": {
+        "evaluation_description": "exact_match",
+        "lower_is_better": false,
+        "score_type": null,
+        "level_names": null,
+        "level_metadata": null,
+        "has_unknown_level": null,
+        "min_score": null,
+        "max_score": null
+      },
+      "score_details": {
+        "score": 0.0,
+        "details": {
+          "count": 1,
+          "sum": 0.0,
+          "sum_squared": 0.0,
+          "min": 0.0,
+          "max": 0.0,
+          "mean": 0.0,
+          "variance": 0.0,
+          "stddev": 0.0,
+          "split": "valid",
+          "perturbation": {
+            "name": "robustness",
+            "robustness": true,
+            "fairness": false,
+            "computed_on": "worst",
+            "seed": null
+          }
+        }
+      },
+      "detailed_evaluation_results_url": null,
+      "generation_config": {
+        "temperature": 0.0,
+        "max_tokens": 1,
+        "stop_sequences": [
+          "\n"
+        ],
+        "instructions": "The following are multiple choice questions (with answers) about philosophy.\n",
+        "input_prefix": "Question: ",
+        "input_suffix": "\n",
+        "output_prefix": "Answer: ",
+        "output_suffix": "\n",
+        "instance_prefix": "\n"
+      }
+    },
+    {
+      "evaluation_name": "multiple_choice_joint",
+      "evaluation_timestamp": "1762354922",
+      "metric_config": {
+        "evaluation_description": "exact_match",
+        "lower_is_better": false,
+        "score_type": null,
+        "level_names": null,
+        "level_metadata": null,
+        "has_unknown_level": null,
+        "min_score": null,
+        "max_score": null
+      },
+      "score_details": {
+        "score": 0.0,
+        "details": {
+          "count": 1,
+          "sum": 0.0,
+          "sum_squared": 0.0,
+          "min": 0.0,
+          "max": 0.0,
+          "mean": 0.0,
+          "variance": 0.0,
+          "stddev": 0.0,
+          "split": "valid",
+          "perturbation": {
+            "name": "fairness",
+            "robustness": false,
+            "fairness": true,
+            "computed_on": "worst",
+            "seed": null
+          }
+        }
+      },
+      "detailed_evaluation_results_url": null,
+      "generation_config": {
+        "temperature": 0.0,
+        "max_tokens": 1,
+        "stop_sequences": [
+          "\n"
+        ],
+        "instructions": "The following are multiple choice questions (with answers) about philosophy.\n",
+        "input_prefix": "Question: ",
+        "input_suffix": "\n",
+        "output_prefix": "Answer: ",
+        "output_suffix": "\n",
+        "instance_prefix": "\n"
+      }
+    },
+    {
+      "evaluation_name": "multiple_choice_joint",
+      "evaluation_timestamp": "1762354922",
+      "metric_config": {
+        "evaluation_description": "exact_match",
+        "lower_is_better": false,
+        "score_type": null,
+        "level_names": null,
+        "level_metadata": null,
+        "has_unknown_level": null,
+        "min_score": null,
+        "max_score": null
+      },
+      "score_details": {
+        "score": 0.0,
+        "details": {
+          "count": 1,
+          "sum": 0.0,
+          "sum_squared": 0.0,
+          "min": 0.0,
+          "max": 0.0,
+          "mean": 0.0,
+          "variance": 0.0,
+          "stddev": 0.0,
+          "split": "valid",
+          "perturbation": {
+            "name": "robustness",
+            "robustness": true,
+            "fairness": false,
+            "computed_on": "worst",
+            "seed": null
+          }
+        }
+      },
+      "detailed_evaluation_results_url": null,
+      "generation_config": {
+        "temperature": 0.0,
+        "max_tokens": 1,
+        "stop_sequences": [
+          "\n"
+        ],
+        "instructions": "The following are multiple choice questions (with answers) about philosophy.\n",
+        "input_prefix": "Question: ",
+        "input_suffix": "\n",
+        "output_prefix": "Answer: ",
+        "output_suffix": "\n",
+        "instance_prefix": "\n"
+      }
+    },
+    {
+      "evaluation_name": "multiple_choice_joint",
+      "evaluation_timestamp": "1762354922",
+      "metric_config": {
+        "evaluation_description": "exact_match",
+        "lower_is_better": false,
+        "score_type": null,
+        "level_names": null,
+        "level_metadata": null,
+        "has_unknown_level": null,
+        "min_score": null,
+        "max_score": null
+      },
+      "score_details": {
+        "score": 0.0,
+        "details": {
+          "count": 1,
+          "sum": 0.0,
+          "sum_squared": 0.0,
+          "min": 0.0,
+          "max": 0.0,
+          "mean": 0.0,
+          "variance": 0.0,
+          "stddev": 0.0,
+          "split": "valid",
+          "perturbation": {
+            "name": "fairness",
+            "robustness": false,
+            "fairness": true,
+            "computed_on": "worst",
+            "seed": null
+          }
+        }
+      },
+      "detailed_evaluation_results_url": null,
+      "generation_config": {
+        "temperature": 0.0,
+        "max_tokens": 1,
+        "stop_sequences": [
+          "\n"
+        ],
+        "instructions": "The following are multiple choice questions (with answers) about philosophy.\n",
+        "input_prefix": "Question: ",
+        "input_suffix": "\n",
+        "output_prefix": "Answer: ",
+        "output_suffix": "\n",
+        "instance_prefix": "\n"
+      }
+    },
+    {
+      "evaluation_name": "multiple_choice_joint",
+      "evaluation_timestamp": "1762354922",
+      "metric_config": {
+        "evaluation_description": "quasi_exact_match",
+        "lower_is_better": false,
+        "score_type": null,
+        "level_names": null,
+        "level_metadata": null,
+        "has_unknown_level": null,
+        "min_score": null,
+        "max_score": null
+      },
+      "score_details": {
+        "score": 0.1111111111111111,
+        "details": {
+          "count": 1,
+          "sum": 0.1111111111111111,
+          "sum_squared": 0.012345679012345678,
+          "min": 0.1111111111111111,
+          "max": 0.1111111111111111,
+          "mean": 0.1111111111111111,
+          "variance": 0.0,
+          "stddev": 0.0,
+          "split": "test",
+          "perturbation": null
+        }
+      },
+      "detailed_evaluation_results_url": null,
+      "generation_config": {
+        "temperature": 0.0,
+        "max_tokens": 1,
+        "stop_sequences": [
+          "\n"
+        ],
+        "instructions": "The following are multiple choice questions (with answers) about philosophy.\n",
+        "input_prefix": "Question: ",
+        "input_suffix": "\n",
+        "output_prefix": "Answer: ",
+        "output_suffix": "\n",
+        "instance_prefix": "\n"
+      }
+    },
+    {
+      "evaluation_name": "multiple_choice_joint",
+      "evaluation_timestamp": "1762354922",
+      "metric_config": {
+        "evaluation_description": "quasi_exact_match",
+        "lower_is_better": false,
+        "score_type": null,
+        "level_names": null,
+        "level_metadata": null,
+        "has_unknown_level": null,
+        "min_score": null,
+        "max_score": null
+      },
+      "score_details": {
+        "score": 0.1111111111111111,
+        "details": {
+          "count": 1,
+          "sum": 0.1111111111111111,
+          "sum_squared": 0.012345679012345678,
+          "min": 0.1111111111111111,
+          "max": 0.1111111111111111,
+          "mean": 0.1111111111111111,
+          "variance": 0.0,
+          "stddev": 0.0,
+          "split": "test",
+          "perturbation": null
+        }
+      },
+      "detailed_evaluation_results_url": null,
+      "generation_config": {
+        "temperature": 0.0,
+        "max_tokens": 1,
+        "stop_sequences": [
+          "\n"
+        ],
+        "instructions": "The following are multiple choice questions (with answers) about philosophy.\n",
+        "input_prefix": "Question: ",
+        "input_suffix": "\n",
+        "output_prefix": "Answer: ",
+        "output_suffix": "\n",
+        "instance_prefix": "\n"
+      }
+    },
+    {
+      "evaluation_name": "multiple_choice_joint",
+      "evaluation_timestamp": "1762354922",
+      "metric_config": {
+        "evaluation_description": "quasi_exact_match",
+        "lower_is_better": false,
+        "score_type": null,
+        "level_names": null,
+        "level_metadata": null,
+        "has_unknown_level": null,
+        "min_score": null,
+        "max_score": null
+      },
+      "score_details": {
+        "score": 0.0,
+        "details": {
+          "count": 1,
+          "sum": 0.0,
+          "sum_squared": 0.0,
+          "min": 0.0,
+          "max": 0.0,
+          "mean": 0.0,
+          "variance": 0.0,
+          "stddev": 0.0,
+          "split": "valid",
+          "perturbation": null
+        }
+      },
+      "detailed_evaluation_results_url": null,
+      "generation_config": {
+        "temperature": 0.0,
+        "max_tokens": 1,
+        "stop_sequences": [
+          "\n"
+        ],
+        "instructions": "The following are multiple choice questions (with answers) about philosophy.\n",
+        "input_prefix": "Question: ",
+        "input_suffix": "\n",
+        "output_prefix": "Answer: ",
+        "output_suffix": "\n",
+        "instance_prefix": "\n"
+      }
+    },
+    {
+      "evaluation_name": "multiple_choice_joint",
+      "evaluation_timestamp": "1762354922",
+      "metric_config": {
+        "evaluation_description": "quasi_exact_match",
+        "lower_is_better": false,
+        "score_type": null,
+        "level_names": null,
+        "level_metadata": null,
+        "has_unknown_level": null,
+        "min_score": null,
+        "max_score": null
+      },
+      "score_details": {
+        "score": 0.0,
+        "details": {
+          "count": 1,
+          "sum": 0.0,
+          "sum_squared": 0.0,
+          "min": 0.0,
+          "max": 0.0,
+          "mean": 0.0,
+          "variance": 0.0,
+          "stddev": 0.0,
+          "split": "valid",
+          "perturbation": null
+        }
+      },
+      "detailed_evaluation_results_url": null,
+      "generation_config": {
+        "temperature": 0.0,
+        "max_tokens": 1,
+        "stop_sequences": [
+          "\n"
+        ],
+        "instructions": "The following are multiple choice questions (with answers) about philosophy.\n",
+        "input_prefix": "Question: ",
+        "input_suffix": "\n",
+        "output_prefix": "Answer: ",
+        "output_suffix": "\n",
+        "instance_prefix": "\n"
+      }
+    },
+    {
+      "evaluation_name": "multiple_choice_joint",
+      "evaluation_timestamp": "1762354922",
+      "metric_config": {
+        "evaluation_description": "quasi_exact_match",
+        "lower_is_better": false,
+        "score_type": null,
+        "level_names": null,
+        "level_metadata": null,
+        "has_unknown_level": null,
+        "min_score": null,
+        "max_score": null
+      },
+      "score_details": {
+        "score": 0.1111111111111111,
+        "details": {
+          "count": 1,
+          "sum": 0.1111111111111111,
+          "sum_squared": 0.012345679012345678,
+          "min": 0.1111111111111111,
+          "max": 0.1111111111111111,
+          "mean": 0.1111111111111111,
+          "variance": 0.0,
+          "stddev": 0.0,
+          "split": "test",
+          "perturbation": {
+            "name": "robustness",
+            "robustness": true,
+            "fairness": false,
+            "computed_on": "worst",
+            "seed": null
+          }
+        }
+      },
+      "detailed_evaluation_results_url": null,
+      "generation_config": {
+        "temperature": 0.0,
+        "max_tokens": 1,
+        "stop_sequences": [
+          "\n"
+        ],
+        "instructions": "The following are multiple choice questions (with answers) about philosophy.\n",
+        "input_prefix": "Question: ",
+        "input_suffix": "\n",
+        "output_prefix": "Answer: ",
+        "output_suffix": "\n",
+        "instance_prefix": "\n"
+      }
+    },
+    {
+      "evaluation_name": "multiple_choice_joint",
+      "evaluation_timestamp": "1762354922",
+      "metric_config": {
+        "evaluation_description": "quasi_exact_match",
+        "lower_is_better": false,
+        "score_type": null,
+        "level_names": null,
+        "level_metadata": null,
+        "has_unknown_level": null,
+        "min_score": null,
+        "max_score": null
+      },
+      "score_details": {
+        "score": 0.1111111111111111,
+        "details": {
+          "count": 1,
+          "sum": 0.1111111111111111,
+          "sum_squared": 0.012345679012345678,
+          "min": 0.1111111111111111,
+          "max": 0.1111111111111111,
+          "mean": 0.1111111111111111,
+          "variance": 0.0,
+          "stddev": 0.0,
+          "split": "test",
+          "perturbation": {
+            "name": "fairness",
+            "robustness": false,
+            "fairness": true,
+            "computed_on": "worst",
+            "seed": null
+          }
+        }
+      },
+      "detailed_evaluation_results_url": null,
+      "generation_config": {
+        "temperature": 0.0,
+        "max_tokens": 1,
+        "stop_sequences": [
+          "\n"
+        ],
+        "instructions": "The following are multiple choice questions (with answers) about philosophy.\n",
+        "input_prefix": "Question: ",
+        "input_suffix": "\n",
+        "output_prefix": "Answer: ",
+        "output_suffix": "\n",
+        "instance_prefix": "\n"
+      }
+    },
+    {
+      "evaluation_name": "multiple_choice_joint",
+      "evaluation_timestamp": "1762354922",
+      "metric_config": {
+        "evaluation_description": "quasi_exact_match",
+        "lower_is_better": false,
+        "score_type": null,
+        "level_names": null,
+        "level_metadata": null,
+        "has_unknown_level": null,
+        "min_score": null,
+        "max_score": null
+      },
+      "score_details": {
+        "score": 0.1111111111111111,
+        "details": {
+          "count": 1,
+          "sum": 0.1111111111111111,
+          "sum_squared": 0.012345679012345678,
+          "min": 0.1111111111111111,
+          "max": 0.1111111111111111,
+          "mean": 0.1111111111111111,
+          "variance": 0.0,
+          "stddev": 0.0,
+          "split": "test",
+          "perturbation": {
+            "name": "robustness",
+            "robustness": true,
+            "fairness": false,
+            "computed_on": "worst",
+            "seed": null
+          }
+        }
+      },
+      "detailed_evaluation_results_url": null,
+      "generation_config": {
+        "temperature": 0.0,
+        "max_tokens": 1,
+        "stop_sequences": [
+          "\n"
+        ],
+        "instructions": "The following are multiple choice questions (with answers) about philosophy.\n",
+        "input_prefix": "Question: ",
+        "input_suffix": "\n",
+        "output_prefix": "Answer: ",
+        "output_suffix": "\n",
+        "instance_prefix": "\n"
+      }
+    },
+    {
+      "evaluation_name": "multiple_choice_joint",
+      "evaluation_timestamp": "1762354922",
+      "metric_config": {
+        "evaluation_description": "quasi_exact_match",
+        "lower_is_better": false,
+        "score_type": null,
+        "level_names": null,
+        "level_metadata": null,
+        "has_unknown_level": null,
+        "min_score": null,
+        "max_score": null
+      },
+      "score_details": {
+        "score": 0.1111111111111111,
+        "details": {
+          "count": 1,
+          "sum": 0.1111111111111111,
+          "sum_squared": 0.012345679012345678,
+          "min": 0.1111111111111111,
+          "max": 0.1111111111111111,
+          "mean": 0.1111111111111111,
+          "variance": 0.0,
+          "stddev": 0.0,
+          "split": "test",
+          "perturbation": {
+            "name": "fairness",
+            "robustness": false,
+            "fairness": true,
+            "computed_on": "worst",
+            "seed": null
+          }
+        }
+      },
+      "detailed_evaluation_results_url": null,
+      "generation_config": {
+        "temperature": 0.0,
+        "max_tokens": 1,
+        "stop_sequences": [
+          "\n"
+        ],
+        "instructions": "The following are multiple choice questions (with answers) about philosophy.\n",
+        "input_prefix": "Question: ",
+        "input_suffix": "\n",
+        "output_prefix": "Answer: ",
+        "output_suffix": "\n",
+        "instance_prefix": "\n"
+      }
+    },
+    {
+      "evaluation_name": "multiple_choice_joint",
+      "evaluation_timestamp": "1762354922",
+      "metric_config": {
+        "evaluation_description": "quasi_exact_match",
+        "lower_is_better": false,
+        "score_type": null,
+        "level_names": null,
+        "level_metadata": null,
+        "has_unknown_level": null,
+        "min_score": null,
+        "max_score": null
+      },
+      "score_details": {
+        "score": 0.0,
+        "details": {
+          "count": 1,
+          "sum": 0.0,
+          "sum_squared": 0.0,
+          "min": 0.0,
+          "max": 0.0,
+          "mean": 0.0,
+          "variance": 0.0,
+          "stddev": 0.0,
+          "split": "valid",
+          "perturbation": {
+            "name": "robustness",
+            "robustness": true,
+            "fairness": false,
+            "computed_on": "worst",
+            "seed": null
+          }
+        }
+      },
+      "detailed_evaluation_results_url": null,
+      "generation_config": {
+        "temperature": 0.0,
+        "max_tokens": 1,
+        "stop_sequences": [
+          "\n"
+        ],
+        "instructions": "The following are multiple choice questions (with answers) about philosophy.\n",
+        "input_prefix": "Question: ",
+        "input_suffix": "\n",
+        "output_prefix": "Answer: ",
+        "output_suffix": "\n",
+        "instance_prefix": "\n"
+      }
+    },
+    {
+      "evaluation_name": "multiple_choice_joint",
+      "evaluation_timestamp": "1762354922",
+      "metric_config": {
+        "evaluation_description": "quasi_exact_match",
+        "lower_is_better": false,
+        "score_type": null,
+        "level_names": null,
+        "level_metadata": null,
+        "has_unknown_level": null,
+        "min_score": null,
+        "max_score": null
+      },
+      "score_details": {
+        "score": 0.0,
+        "details": {
+          "count": 1,
+          "sum": 0.0,
+          "sum_squared": 0.0,
+          "min": 0.0,
+          "max": 0.0,
+          "mean": 0.0,
+          "variance": 0.0,
+          "stddev": 0.0,
+          "split": "valid",
+          "perturbation": {
+            "name": "fairness",
+            "robustness": false,
+            "fairness": true,
+            "computed_on": "worst",
+            "seed": null
+          }
+        }
+      },
+      "detailed_evaluation_results_url": null,
+      "generation_config": {
+        "temperature": 0.0,
+        "max_tokens": 1,
+        "stop_sequences": [
+          "\n"
+        ],
+        "instructions": "The following are multiple choice questions (with answers) about philosophy.\n",
+        "input_prefix": "Question: ",
+        "input_suffix": "\n",
+        "output_prefix": "Answer: ",
+        "output_suffix": "\n",
+        "instance_prefix": "\n"
+      }
+    },
+    {
+      "evaluation_name": "multiple_choice_joint",
+      "evaluation_timestamp": "1762354922",
+      "metric_config": {
+        "evaluation_description": "quasi_exact_match",
+        "lower_is_better": false,
+        "score_type": null,
+        "level_names": null,
+        "level_metadata": null,
+        "has_unknown_level": null,
+        "min_score": null,
+        "max_score": null
+      },
+      "score_details": {
+        "score": 0.0,
+        "details": {
+          "count": 1,
+          "sum": 0.0,
+          "sum_squared": 0.0,
+          "min": 0.0,
+          "max": 0.0,
+          "mean": 0.0,
+          "variance": 0.0,
+          "stddev": 0.0,
+          "split": "valid",
+          "perturbation": {
+            "name": "robustness",
+            "robustness": true,
+            "fairness": false,
+            "computed_on": "worst",
+            "seed": null
+          }
+        }
+      },
+      "detailed_evaluation_results_url": null,
+      "generation_config": {
+        "temperature": 0.0,
+        "max_tokens": 1,
+        "stop_sequences": [
+          "\n"
+        ],
+        "instructions": "The following are multiple choice questions (with answers) about philosophy.\n",
+        "input_prefix": "Question: ",
+        "input_suffix": "\n",
+        "output_prefix": "Answer: ",
+        "output_suffix": "\n",
+        "instance_prefix": "\n"
+      }
+    },
+    {
+      "evaluation_name": "multiple_choice_joint",
+      "evaluation_timestamp": "1762354922",
+      "metric_config": {
+        "evaluation_description": "quasi_exact_match",
+        "lower_is_better": false,
+        "score_type": null,
+        "level_names": null,
+        "level_metadata": null,
+        "has_unknown_level": null,
+        "min_score": null,
+        "max_score": null
+      },
+      "score_details": {
+        "score": 0.0,
+        "details": {
+          "count": 1,
+          "sum": 0.0,
+          "sum_squared": 0.0,
+          "min": 0.0,
+          "max": 0.0,
+          "mean": 0.0,
+          "variance": 0.0,
+          "stddev": 0.0,
+          "split": "valid",
+          "perturbation": {
+            "name": "fairness",
+            "robustness": false,
+            "fairness": true,
+            "computed_on": "worst",
+            "seed": null
+          }
+        }
+      },
+      "detailed_evaluation_results_url": null,
+      "generation_config": {
+        "temperature": 0.0,
+        "max_tokens": 1,
+        "stop_sequences": [
+          "\n"
+        ],
+        "instructions": "The following are multiple choice questions (with answers) about philosophy.\n",
+        "input_prefix": "Question: ",
+        "input_suffix": "\n",
+        "output_prefix": "Answer: ",
+        "output_suffix": "\n",
+        "instance_prefix": "\n"
+      }
+    },
+    {
+      "evaluation_name": "multiple_choice_joint",
+      "evaluation_timestamp": "1762354922",
+      "metric_config": {
+        "evaluation_description": "prefix_exact_match",
+        "lower_is_better": false,
+        "score_type": null,
+        "level_names": null,
+        "level_metadata": null,
+        "has_unknown_level": null,
+        "min_score": null,
+        "max_score": null
+      },
+      "score_details": {
+        "score": 0.1111111111111111,
+        "details": {
+          "count": 1,
+          "sum": 0.1111111111111111,
+          "sum_squared": 0.012345679012345678,
+          "min": 0.1111111111111111,
+          "max": 0.1111111111111111,
+          "mean": 0.1111111111111111,
+          "variance": 0.0,
+          "stddev": 0.0,
+          "split": "test",
+          "perturbation": null
+        }
+      },
+      "detailed_evaluation_results_url": null,
+      "generation_config": {
+        "temperature": 0.0,
+        "max_tokens": 1,
+        "stop_sequences": [
+          "\n"
+        ],
+        "instructions": "The following are multiple choice questions (with answers) about philosophy.\n",
+        "input_prefix": "Question: ",
+        "input_suffix": "\n",
+        "output_prefix": "Answer: ",
+        "output_suffix": "\n",
+        "instance_prefix": "\n"
+      }
+    },
+    {
+      "evaluation_name": "multiple_choice_joint",
+      "evaluation_timestamp": "1762354922",
+      "metric_config": {
+        "evaluation_description": "prefix_exact_match",
+        "lower_is_better": false,
+        "score_type": null,
+        "level_names": null,
+        "level_metadata": null,
+        "has_unknown_level": null,
+        "min_score": null,
+        "max_score": null
+      },
+      "score_details": {
+        "score": 0.1111111111111111,
+        "details": {
+          "count": 1,
+          "sum": 0.1111111111111111,
+          "sum_squared": 0.012345679012345678,
+          "min": 0.1111111111111111,
+          "max": 0.1111111111111111,
+          "mean": 0.1111111111111111,
+          "variance": 0.0,
+          "stddev": 0.0,
+          "split": "test",
+          "perturbation": null
+        }
+      },
+      "detailed_evaluation_results_url": null,
+      "generation_config": {
+        "temperature": 0.0,
+        "max_tokens": 1,
+        "stop_sequences": [
+          "\n"
+        ],
+        "instructions": "The following are multiple choice questions (with answers) about philosophy.\n",
+        "input_prefix": "Question: ",
+        "input_suffix": "\n",
+        "output_prefix": "Answer: ",
+        "output_suffix": "\n",
+        "instance_prefix": "\n"
+      }
+    },
+    {
+      "evaluation_name": "multiple_choice_joint",
+      "evaluation_timestamp": "1762354922",
+      "metric_config": {
+        "evaluation_description": "prefix_exact_match",
+        "lower_is_better": false,
+        "score_type": null,
+        "level_names": null,
+        "level_metadata": null,
+        "has_unknown_level": null,
+        "min_score": null,
+        "max_score": null
+      },
+      "score_details": {
+        "score": 0.0,
+        "details": {
+          "count": 1,
+          "sum": 0.0,
+          "sum_squared": 0.0,
+          "min": 0.0,
+          "max": 0.0,
+          "mean": 0.0,
+          "variance": 0.0,
+          "stddev": 0.0,
+          "split": "valid",
+          "perturbation": null
+        }
+      },
+      "detailed_evaluation_results_url": null,
+      "generation_config": {
+        "temperature": 0.0,
+        "max_tokens": 1,
+        "stop_sequences": [
+          "\n"
+        ],
+        "instructions": "The following are multiple choice questions (with answers) about philosophy.\n",
+        "input_prefix": "Question: ",
+        "input_suffix": "\n",
+        "output_prefix": "Answer: ",
+        "output_suffix": "\n",
+        "instance_prefix": "\n"
+      }
+    },
+    {
+      "evaluation_name": "multiple_choice_joint",
+      "evaluation_timestamp": "1762354922",
+      "metric_config": {
+        "evaluation_description": "prefix_exact_match",
+        "lower_is_better": false,
+        "score_type": null,
+        "level_names": null,
+        "level_metadata": null,
+        "has_unknown_level": null,
+        "min_score": null,
+        "max_score": null
+      },
+      "score_details": {
+        "score": 0.0,
+        "details": {
+          "count": 1,
+          "sum": 0.0,
+          "sum_squared": 0.0,
+          "min": 0.0,
+          "max": 0.0,
+          "mean": 0.0,
+          "variance": 0.0,
+          "stddev": 0.0,
+          "split": "valid",
+          "perturbation": null
+        }
+      },
+      "detailed_evaluation_results_url": null,
+      "generation_config": {
+        "temperature": 0.0,
+        "max_tokens": 1,
+        "stop_sequences": [
+          "\n"
+        ],
+        "instructions": "The following are multiple choice questions (with answers) about philosophy.\n",
+        "input_prefix": "Question: ",
+        "input_suffix": "\n",
+        "output_prefix": "Answer: ",
+        "output_suffix": "\n",
+        "instance_prefix": "\n"
+      }
+    },
+    {
+      "evaluation_name": "multiple_choice_joint",
+      "evaluation_timestamp": "1762354922",
+      "metric_config": {
+        "evaluation_description": "prefix_exact_match",
+        "lower_is_better": false,
+        "score_type": null,
+        "level_names": null,
+        "level_metadata": null,
+        "has_unknown_level": null,
+        "min_score": null,
+        "max_score": null
+      },
+      "score_details": {
+        "score": 0.1111111111111111,
+        "details": {
+          "count": 1,
+          "sum": 0.1111111111111111,
+          "sum_squared": 0.012345679012345678,
+          "min": 0.1111111111111111,
+          "max": 0.1111111111111111,
+          "mean": 0.1111111111111111,
+          "variance": 0.0,
+          "stddev": 0.0,
+          "split": "test",
+          "perturbation": {
+            "name": "robustness",
+            "robustness": true,
+            "fairness": false,
+            "computed_on": "worst",
+            "seed": null
+          }
+        }
+      },
+      "detailed_evaluation_results_url": null,
+      "generation_config": {
+        "temperature": 0.0,
+        "max_tokens": 1,
+        "stop_sequences": [
+          "\n"
+        ],
+        "instructions": "The following are multiple choice questions (with answers) about philosophy.\n",
+        "input_prefix": "Question: ",
+        "input_suffix": "\n",
+        "output_prefix": "Answer: ",
+        "output_suffix": "\n",
+        "instance_prefix": "\n"
+      }
+    },
+    {
+      "evaluation_name": "multiple_choice_joint",
+      "evaluation_timestamp": "1762354922",
+      "metric_config": {
+        "evaluation_description": "prefix_exact_match",
+        "lower_is_better": false,
+        "score_type": null,
+        "level_names": null,
+        "level_metadata": null,
+        "has_unknown_level": null,
+        "min_score": null,
+        "max_score": null
+      },
+      "score_details": {
+        "score": 0.1111111111111111,
+        "details": {
+          "count": 1,
+          "sum": 0.1111111111111111,
+          "sum_squared": 0.012345679012345678,
+          "min": 0.1111111111111111,
+          "max": 0.1111111111111111,
+          "mean": 0.1111111111111111,
+          "variance": 0.0,
+          "stddev": 0.0,
+          "split": "test",
+          "perturbation": {
+            "name": "fairness",
+            "robustness": false,
+            "fairness": true,
+            "computed_on": "worst",
+            "seed": null
+          }
+        }
+      },
+      "detailed_evaluation_results_url": null,
+      "generation_config": {
+        "temperature": 0.0,
+        "max_tokens": 1,
+        "stop_sequences": [
+          "\n"
+        ],
+        "instructions": "The following are multiple choice questions (with answers) about philosophy.\n",
+        "input_prefix": "Question: ",
+        "input_suffix": "\n",
+        "output_prefix": "Answer: ",
+        "output_suffix": "\n",
+        "instance_prefix": "\n"
+      }
+    },
+    {
+      "evaluation_name": "multiple_choice_joint",
+      "evaluation_timestamp": "1762354922",
+      "metric_config": {
+        "evaluation_description": "prefix_exact_match",
+        "lower_is_better": false,
+        "score_type": null,
+        "level_names": null,
+        "level_metadata": null,
+        "has_unknown_level": null,
+        "min_score": null,
+        "max_score": null
+      },
+      "score_details": {
+        "score": 0.1111111111111111,
+        "details": {
+          "count": 1,
+          "sum": 0.1111111111111111,
+          "sum_squared": 0.012345679012345678,
+          "min": 0.1111111111111111,
+          "max": 0.1111111111111111,
+          "mean": 0.1111111111111111,
+          "variance": 0.0,
+          "stddev": 0.0,
+          "split": "test",
+          "perturbation": {
+            "name": "robustness",
+            "robustness": true,
+            "fairness": false,
+            "computed_on": "worst",
+            "seed": null
+          }
+        }
+      },
+      "detailed_evaluation_results_url": null,
+      "generation_config": {
+        "temperature": 0.0,
+        "max_tokens": 1,
+        "stop_sequences": [
+          "\n"
+        ],
+        "instructions": "The following are multiple choice questions (with answers) about philosophy.\n",
+        "input_prefix": "Question: ",
+        "input_suffix": "\n",
+        "output_prefix": "Answer: ",
+        "output_suffix": "\n",
+        "instance_prefix": "\n"
+      }
+    },
+    {
+      "evaluation_name": "multiple_choice_joint",
+      "evaluation_timestamp": "1762354922",
+      "metric_config": {
+        "evaluation_description": "prefix_exact_match",
+        "lower_is_better": false,
+        "score_type": null,
+        "level_names": null,
+        "level_metadata": null,
+        "has_unknown_level": null,
+        "min_score": null,
+        "max_score": null
+      },
+      "score_details": {
+        "score": 0.1111111111111111,
+        "details": {
+          "count": 1,
+          "sum": 0.1111111111111111,
+          "sum_squared": 0.012345679012345678,
+          "min": 0.1111111111111111,
+          "max": 0.1111111111111111,
+          "mean": 0.1111111111111111,
+          "variance": 0.0,
+          "stddev": 0.0,
+          "split": "test",
+          "perturbation": {
+            "name": "fairness",
+            "robustness": false,
+            "fairness": true,
+            "computed_on": "worst",
+            "seed": null
+          }
+        }
+      },
+      "detailed_evaluation_results_url": null,
+      "generation_config": {
+        "temperature": 0.0,
+        "max_tokens": 1,
+        "stop_sequences": [
+          "\n"
+        ],
+        "instructions": "The following are multiple choice questions (with answers) about philosophy.\n",
+        "input_prefix": "Question: ",
+        "input_suffix": "\n",
+        "output_prefix": "Answer: ",
+        "output_suffix": "\n",
+        "instance_prefix": "\n"
+      }
+    },
+    {
+      "evaluation_name": "multiple_choice_joint",
+      "evaluation_timestamp": "1762354922",
+      "metric_config": {
+        "evaluation_description": "prefix_exact_match",
+        "lower_is_better": false,
+        "score_type": null,
+        "level_names": null,
+        "level_metadata": null,
+        "has_unknown_level": null,
+        "min_score": null,
+        "max_score": null
+      },
+      "score_details": {
+        "score": 0.0,
+        "details": {
+          "count": 1,
+          "sum": 0.0,
+          "sum_squared": 0.0,
+          "min": 0.0,
+          "max": 0.0,
+          "mean": 0.0,
+          "variance": 0.0,
+          "stddev": 0.0,
+          "split": "valid",
+          "perturbation": {
+            "name": "robustness",
+            "robustness": true,
+            "fairness": false,
+            "computed_on": "worst",
+            "seed": null
+          }
+        }
+      },
+      "detailed_evaluation_results_url": null,
+      "generation_config": {
+        "temperature": 0.0,
+        "max_tokens": 1,
+        "stop_sequences": [
+          "\n"
+        ],
+        "instructions": "The following are multiple choice questions (with answers) about philosophy.\n",
+        "input_prefix": "Question: ",
+        "input_suffix": "\n",
+        "output_prefix": "Answer: ",
+        "output_suffix": "\n",
+        "instance_prefix": "\n"
+      }
+    },
+    {
+      "evaluation_name": "multiple_choice_joint",
+      "evaluation_timestamp": "1762354922",
+      "metric_config": {
+        "evaluation_description": "prefix_exact_match",
+        "lower_is_better": false,
+        "score_type": null,
+        "level_names": null,
+        "level_metadata": null,
+        "has_unknown_level": null,
+        "min_score": null,
+        "max_score": null
+      },
+      "score_details": {
+        "score": 0.0,
+        "details": {
+          "count": 1,
+          "sum": 0.0,
+          "sum_squared": 0.0,
+          "min": 0.0,
+          "max": 0.0,
+          "mean": 0.0,
+          "variance": 0.0,
+          "stddev": 0.0,
+          "split": "valid",
+          "perturbation": {
+            "name": "fairness",
+            "robustness": false,
+            "fairness": true,
+            "computed_on": "worst",
+            "seed": null
+          }
+        }
+      },
+      "detailed_evaluation_results_url": null,
+      "generation_config": {
+        "temperature": 0.0,
+        "max_tokens": 1,
+        "stop_sequences": [
+          "\n"
+        ],
+        "instructions": "The following are multiple choice questions (with answers) about philosophy.\n",
+        "input_prefix": "Question: ",
+        "input_suffix": "\n",
+        "output_prefix": "Answer: ",
+        "output_suffix": "\n",
+        "instance_prefix": "\n"
+      }
+    },
+    {
+      "evaluation_name": "multiple_choice_joint",
+      "evaluation_timestamp": "1762354922",
+      "metric_config": {
+        "evaluation_description": "prefix_exact_match",
+        "lower_is_better": false,
+        "score_type": null,
+        "level_names": null,
+        "level_metadata": null,
+        "has_unknown_level": null,
+        "min_score": null,
+        "max_score": null
+      },
+      "score_details": {
+        "score": 0.0,
+        "details": {
+          "count": 1,
+          "sum": 0.0,
+          "sum_squared": 0.0,
+          "min": 0.0,
+          "max": 0.0,
+          "mean": 0.0,
+          "variance": 0.0,
+          "stddev": 0.0,
+          "split": "valid",
+          "perturbation": {
+            "name": "robustness",
+            "robustness": true,
+            "fairness": false,
+            "computed_on": "worst",
+            "seed": null
+          }
+        }
+      },
+      "detailed_evaluation_results_url": null,
+      "generation_config": {
+        "temperature": 0.0,
+        "max_tokens": 1,
+        "stop_sequences": [
+          "\n"
+        ],
+        "instructions": "The following are multiple choice questions (with answers) about philosophy.\n",
+        "input_prefix": "Question: ",
+        "input_suffix": "\n",
+        "output_prefix": "Answer: ",
+        "output_suffix": "\n",
+        "instance_prefix": "\n"
+      }
+    },
+    {
+      "evaluation_name": "multiple_choice_joint",
+      "evaluation_timestamp": "1762354922",
+      "metric_config": {
+        "evaluation_description": "prefix_exact_match",
+        "lower_is_better": false,
+        "score_type": null,
+        "level_names": null,
+        "level_metadata": null,
+        "has_unknown_level": null,
+        "min_score": null,
+        "max_score": null
+      },
+      "score_details": {
+        "score": 0.0,
+        "details": {
+          "count": 1,
+          "sum": 0.0,
+          "sum_squared": 0.0,
+          "min": 0.0,
+          "max": 0.0,
+          "mean": 0.0,
+          "variance": 0.0,
+          "stddev": 0.0,
+          "split": "valid",
+          "perturbation": {
+            "name": "fairness",
+            "robustness": false,
+            "fairness": true,
+            "computed_on": "worst",
+            "seed": null
+          }
+        }
+      },
+      "detailed_evaluation_results_url": null,
+      "generation_config": {
+        "temperature": 0.0,
+        "max_tokens": 1,
+        "stop_sequences": [
+          "\n"
+        ],
+        "instructions": "The following are multiple choice questions (with answers) about philosophy.\n",
+        "input_prefix": "Question: ",
+        "input_suffix": "\n",
+        "output_prefix": "Answer: ",
+        "output_suffix": "\n",
+        "instance_prefix": "\n"
+      }
+    },
+    {
+      "evaluation_name": "multiple_choice_joint",
+      "evaluation_timestamp": "1762354922",
+      "metric_config": {
+        "evaluation_description": "quasi_prefix_exact_match",
+        "lower_is_better": false,
+        "score_type": null,
+        "level_names": null,
+        "level_metadata": null,
+        "has_unknown_level": null,
+        "min_score": null,
+        "max_score": null
+      },
+      "score_details": {
+        "score": 0.1111111111111111,
+        "details": {
+          "count": 1,
+          "sum": 0.1111111111111111,
+          "sum_squared": 0.012345679012345678,
+          "min": 0.1111111111111111,
+          "max": 0.1111111111111111,
+          "mean": 0.1111111111111111,
+          "variance": 0.0,
+          "stddev": 0.0,
+          "split": "test",
+          "perturbation": null
+        }
+      },
+      "detailed_evaluation_results_url": null,
+      "generation_config": {
+        "temperature": 0.0,
+        "max_tokens": 1,
+        "stop_sequences": [
+          "\n"
+        ],
+        "instructions": "The following are multiple choice questions (with answers) about philosophy.\n",
+        "input_prefix": "Question: ",
+        "input_suffix": "\n",
+        "output_prefix": "Answer: ",
+        "output_suffix": "\n",
+        "instance_prefix": "\n"
+      }
+    },
+    {
+      "evaluation_name": "multiple_choice_joint",
+      "evaluation_timestamp": "1762354922",
+      "metric_config": {
+        "evaluation_description": "quasi_prefix_exact_match",
+        "lower_is_better": false,
+        "score_type": null,
+        "level_names": null,
+        "level_metadata": null,
+        "has_unknown_level": null,
+        "min_score": null,
+        "max_score": null
+      },
+      "score_details": {
+        "score": 0.1111111111111111,
+        "details": {
+          "count": 1,
+          "sum": 0.1111111111111111,
+          "sum_squared": 0.012345679012345678,
+          "min": 0.1111111111111111,
+          "max": 0.1111111111111111,
+          "mean": 0.1111111111111111,
+          "variance": 0.0,
+          "stddev": 0.0,
+          "split": "test",
+          "perturbation": null
+        }
+      },
+      "detailed_evaluation_results_url": null,
+      "generation_config": {
+        "temperature": 0.0,
+        "max_tokens": 1,
+        "stop_sequences": [
+          "\n"
+        ],
+        "instructions": "The following are multiple choice questions (with answers) about philosophy.\n",
+        "input_prefix": "Question: ",
+        "input_suffix": "\n",
+        "output_prefix": "Answer: ",
+        "output_suffix": "\n",
+        "instance_prefix": "\n"
+      }
+    },
+    {
+      "evaluation_name": "multiple_choice_joint",
+      "evaluation_timestamp": "1762354922",
+      "metric_config": {
+        "evaluation_description": "quasi_prefix_exact_match",
+        "lower_is_better": false,
+        "score_type": null,
+        "level_names": null,
+        "level_metadata": null,
+        "has_unknown_level": null,
+        "min_score": null,
+        "max_score": null
+      },
+      "score_details": {
+        "score": 0.0,
+        "details": {
+          "count": 1,
+          "sum": 0.0,
+          "sum_squared": 0.0,
+          "min": 0.0,
+          "max": 0.0,
+          "mean": 0.0,
+          "variance": 0.0,
+          "stddev": 0.0,
+          "split": "valid",
+          "perturbation": null
+        }
+      },
+      "detailed_evaluation_results_url": null,
+      "generation_config": {
+        "temperature": 0.0,
+        "max_tokens": 1,
+        "stop_sequences": [
+          "\n"
+        ],
+        "instructions": "The following are multiple choice questions (with answers) about philosophy.\n",
+        "input_prefix": "Question: ",
+        "input_suffix": "\n",
+        "output_prefix": "Answer: ",
+        "output_suffix": "\n",
+        "instance_prefix": "\n"
+      }
+    },
+    {
+      "evaluation_name": "multiple_choice_joint",
+      "evaluation_timestamp": "1762354922",
+      "metric_config": {
+        "evaluation_description": "quasi_prefix_exact_match",
+        "lower_is_better": false,
+        "score_type": null,
+        "level_names": null,
+        "level_metadata": null,
+        "has_unknown_level": null,
+        "min_score": null,
+        "max_score": null
+      },
+      "score_details": {
+        "score": 0.0,
+        "details": {
+          "count": 1,
+          "sum": 0.0,
+          "sum_squared": 0.0,
+          "min": 0.0,
+          "max": 0.0,
+          "mean": 0.0,
+          "variance": 0.0,
+          "stddev": 0.0,
+          "split": "valid",
+          "perturbation": null
+        }
+      },
+      "detailed_evaluation_results_url": null,
+      "generation_config": {
+        "temperature": 0.0,
+        "max_tokens": 1,
+        "stop_sequences": [
+          "\n"
+        ],
+        "instructions": "The following are multiple choice questions (with answers) about philosophy.\n",
+        "input_prefix": "Question: ",
+        "input_suffix": "\n",
+        "output_prefix": "Answer: ",
+        "output_suffix": "\n",
+        "instance_prefix": "\n"
+      }
+    },
+    {
+      "evaluation_name": "multiple_choice_joint",
+      "evaluation_timestamp": "1762354922",
+      "metric_config": {
+        "evaluation_description": "quasi_prefix_exact_match",
+        "lower_is_better": false,
+        "score_type": null,
+        "level_names": null,
+        "level_metadata": null,
+        "has_unknown_level": null,
+        "min_score": null,
+        "max_score": null
+      },
+      "score_details": {
+        "score": 0.1111111111111111,
+        "details": {
+          "count": 1,
+          "sum": 0.1111111111111111,
+          "sum_squared": 0.012345679012345678,
+          "min": 0.1111111111111111,
+          "max": 0.1111111111111111,
+          "mean": 0.1111111111111111,
+          "variance": 0.0,
+          "stddev": 0.0,
+          "split": "test",
+          "perturbation": {
+            "name": "robustness",
+            "robustness": true,
+            "fairness": false,
+            "computed_on": "worst",
+            "seed": null
+          }
+        }
+      },
+      "detailed_evaluation_results_url": null,
+      "generation_config": {
+        "temperature": 0.0,
+        "max_tokens": 1,
+        "stop_sequences": [
+          "\n"
+        ],
+        "instructions": "The following are multiple choice questions (with answers) about philosophy.\n",
+        "input_prefix": "Question: ",
+        "input_suffix": "\n",
+        "output_prefix": "Answer: ",
+        "output_suffix": "\n",
+        "instance_prefix": "\n"
+      }
+    },
+    {
+      "evaluation_name": "multiple_choice_joint",
+      "evaluation_timestamp": "1762354922",
+      "metric_config": {
+        "evaluation_description": "quasi_prefix_exact_match",
+        "lower_is_better": false,
+        "score_type": null,
+        "level_names": null,
+        "level_metadata": null,
+        "has_unknown_level": null,
+        "min_score": null,
+        "max_score": null
+      },
+      "score_details": {
+        "score": 0.1111111111111111,
+        "details": {
+          "count": 1,
+          "sum": 0.1111111111111111,
+          "sum_squared": 0.012345679012345678,
+          "min": 0.1111111111111111,
+          "max": 0.1111111111111111,
+          "mean": 0.1111111111111111,
+          "variance": 0.0,
+          "stddev": 0.0,
+          "split": "test",
+          "perturbation": {
+            "name": "fairness",
+            "robustness": false,
+            "fairness": true,
+            "computed_on": "worst",
+            "seed": null
+          }
+        }
+      },
+      "detailed_evaluation_results_url": null,
+      "generation_config": {
+        "temperature": 0.0,
+        "max_tokens": 1,
+        "stop_sequences": [
+          "\n"
+        ],
+        "instructions": "The following are multiple choice questions (with answers) about philosophy.\n",
+        "input_prefix": "Question: ",
+        "input_suffix": "\n",
+        "output_prefix": "Answer: ",
+        "output_suffix": "\n",
+        "instance_prefix": "\n"
+      }
+    },
+    {
+      "evaluation_name": "multiple_choice_joint",
+      "evaluation_timestamp": "1762354922",
+      "metric_config": {
+        "evaluation_description": "quasi_prefix_exact_match",
+        "lower_is_better": false,
+        "score_type": null,
+        "level_names": null,
+        "level_metadata": null,
+        "has_unknown_level": null,
+        "min_score": null,
+        "max_score": null
+      },
+      "score_details": {
+        "score": 0.1111111111111111,
+        "details": {
+          "count": 1,
+          "sum": 0.1111111111111111,
+          "sum_squared": 0.012345679012345678,
+          "min": 0.1111111111111111,
+          "max": 0.1111111111111111,
+          "mean": 0.1111111111111111,
+          "variance": 0.0,
+          "stddev": 0.0,
+          "split": "test",
+          "perturbation": {
+            "name": "robustness",
+            "robustness": true,
+            "fairness": false,
+            "computed_on": "worst",
+            "seed": null
+          }
+        }
+      },
+      "detailed_evaluation_results_url": null,
+      "generation_config": {
+        "temperature": 0.0,
+        "max_tokens": 1,
+        "stop_sequences": [
+          "\n"
+        ],
+        "instructions": "The following are multiple choice questions (with answers) about philosophy.\n",
+        "input_prefix": "Question: ",
+        "input_suffix": "\n",
+        "output_prefix": "Answer: ",
+        "output_suffix": "\n",
+        "instance_prefix": "\n"
+      }
+    },
+    {
+      "evaluation_name": "multiple_choice_joint",
+      "evaluation_timestamp": "1762354922",
+      "metric_config": {
+        "evaluation_description": "quasi_prefix_exact_match",
+        "lower_is_better": false,
+        "score_type": null,
+        "level_names": null,
+        "level_metadata": null,
+        "has_unknown_level": null,
+        "min_score": null,
+        "max_score": null
+      },
+      "score_details": {
+        "score": 0.1111111111111111,
+        "details": {
+          "count": 1,
+          "sum": 0.1111111111111111,
+          "sum_squared": 0.012345679012345678,
+          "min": 0.1111111111111111,
+          "max": 0.1111111111111111,
+          "mean": 0.1111111111111111,
+          "variance": 0.0,
+          "stddev": 0.0,
+          "split": "test",
+          "perturbation": {
+            "name": "fairness",
+            "robustness": false,
+            "fairness": true,
+            "computed_on": "worst",
+            "seed": null
+          }
+        }
+      },
+      "detailed_evaluation_results_url": null,
+      "generation_config": {
+        "temperature": 0.0,
+        "max_tokens": 1,
+        "stop_sequences": [
+          "\n"
+        ],
+        "instructions": "The following are multiple choice questions (with answers) about philosophy.\n",
+        "input_prefix": "Question: ",
+        "input_suffix": "\n",
+        "output_prefix": "Answer: ",
+        "output_suffix": "\n",
+        "instance_prefix": "\n"
+      }
+    },
+    {
+      "evaluation_name": "multiple_choice_joint",
+      "evaluation_timestamp": "1762354922",
+      "metric_config": {
+        "evaluation_description": "quasi_prefix_exact_match",
+        "lower_is_better": false,
+        "score_type": null,
+        "level_names": null,
+        "level_metadata": null,
+        "has_unknown_level": null,
+        "min_score": null,
+        "max_score": null
+      },
+      "score_details": {
+        "score": 0.0,
+        "details": {
+          "count": 1,
+          "sum": 0.0,
+          "sum_squared": 0.0,
+          "min": 0.0,
+          "max": 0.0,
+          "mean": 0.0,
+          "variance": 0.0,
+          "stddev": 0.0,
+          "split": "valid",
+          "perturbation": {
+            "name": "robustness",
+            "robustness": true,
+            "fairness": false,
+            "computed_on": "worst",
+            "seed": null
+          }
+        }
+      },
+      "detailed_evaluation_results_url": null,
+      "generation_config": {
+        "temperature": 0.0,
+        "max_tokens": 1,
+        "stop_sequences": [
+          "\n"
+        ],
+        "instructions": "The following are multiple choice questions (with answers) about philosophy.\n",
+        "input_prefix": "Question: ",
+        "input_suffix": "\n",
+        "output_prefix": "Answer: ",
+        "output_suffix": "\n",
+        "instance_prefix": "\n"
+      }
+    },
+    {
+      "evaluation_name": "multiple_choice_joint",
+      "evaluation_timestamp": "1762354922",
+      "metric_config": {
+        "evaluation_description": "quasi_prefix_exact_match",
+        "lower_is_better": false,
+        "score_type": null,
+        "level_names": null,
+        "level_metadata": null,
+        "has_unknown_level": null,
+        "min_score": null,
+        "max_score": null
+      },
+      "score_details": {
+        "score": 0.0,
+        "details": {
+          "count": 1,
+          "sum": 0.0,
+          "sum_squared": 0.0,
+          "min": 0.0,
+          "max": 0.0,
+          "mean": 0.0,
+          "variance": 0.0,
+          "stddev": 0.0,
+          "split": "valid",
+          "perturbation": {
+            "name": "fairness",
+            "robustness": false,
+            "fairness": true,
+            "computed_on": "worst",
+            "seed": null
+          }
+        }
+      },
+      "detailed_evaluation_results_url": null,
+      "generation_config": {
+        "temperature": 0.0,
+        "max_tokens": 1,
+        "stop_sequences": [
+          "\n"
+        ],
+        "instructions": "The following are multiple choice questions (with answers) about philosophy.\n",
+        "input_prefix": "Question: ",
+        "input_suffix": "\n",
+        "output_prefix": "Answer: ",
+        "output_suffix": "\n",
+        "instance_prefix": "\n"
+      }
+    },
+    {
+      "evaluation_name": "multiple_choice_joint",
+      "evaluation_timestamp": "1762354922",
+      "metric_config": {
+        "evaluation_description": "quasi_prefix_exact_match",
+        "lower_is_better": false,
+        "score_type": null,
+        "level_names": null,
+        "level_metadata": null,
+        "has_unknown_level": null,
+        "min_score": null,
+        "max_score": null
+      },
+      "score_details": {
+        "score": 0.0,
+        "details": {
+          "count": 1,
+          "sum": 0.0,
+          "sum_squared": 0.0,
+          "min": 0.0,
+          "max": 0.0,
+          "mean": 0.0,
+          "variance": 0.0,
+          "stddev": 0.0,
+          "split": "valid",
+          "perturbation": {
+            "name": "robustness",
+            "robustness": true,
+            "fairness": false,
+            "computed_on": "worst",
+            "seed": null
+          }
+        }
+      },
+      "detailed_evaluation_results_url": null,
+      "generation_config": {
+        "temperature": 0.0,
+        "max_tokens": 1,
+        "stop_sequences": [
+          "\n"
+        ],
+        "instructions": "The following are multiple choice questions (with answers) about philosophy.\n",
+        "input_prefix": "Question: ",
+        "input_suffix": "\n",
+        "output_prefix": "Answer: ",
+        "output_suffix": "\n",
+        "instance_prefix": "\n"
+      }
+    },
+    {
+      "evaluation_name": "multiple_choice_joint",
+      "evaluation_timestamp": "1762354922",
+      "metric_config": {
+        "evaluation_description": "quasi_prefix_exact_match",
+        "lower_is_better": false,
+        "score_type": null,
+        "level_names": null,
+        "level_metadata": null,
+        "has_unknown_level": null,
+        "min_score": null,
+        "max_score": null
+      },
+      "score_details": {
+        "score": 0.0,
+        "details": {
+          "count": 1,
+          "sum": 0.0,
+          "sum_squared": 0.0,
+          "min": 0.0,
+          "max": 0.0,
+          "mean": 0.0,
+          "variance": 0.0,
+          "stddev": 0.0,
+          "split": "valid",
+          "perturbation": {
+            "name": "fairness",
+            "robustness": false,
+            "fairness": true,
+            "computed_on": "worst",
+            "seed": null
+          }
+        }
+      },
+      "detailed_evaluation_results_url": null,
+      "generation_config": {
+        "temperature": 0.0,
+        "max_tokens": 1,
+        "stop_sequences": [
+          "\n"
+        ],
+        "instructions": "The following are multiple choice questions (with answers) about philosophy.\n",
+        "input_prefix": "Question: ",
+        "input_suffix": "\n",
+        "output_prefix": "Answer: ",
+        "output_suffix": "\n",
+        "instance_prefix": "\n"
+      }
+    }
+  ],
+  "detailed_evaluation_results_per_samples": [
+    {
+      "sample_id": "id147",
+      "input": "What does the notion of “meaning in life” refer to?",
+      "prompt": "The following are multiple choice questions (with answers) about philosophy.\n\nQuestion: The study of reality in the broadest sense, an inquiry into the elemental nature of the universe and the things in it, is known as _____.\nA. metaphysics\nB. epistemology\nC. quantum physics\nD. axiology\nAnswer: A\n\nQuestion: According to Moore’s “ideal utilitarianism,” the right action is the one that brings about the greatest amount of:\nA. pleasure.\nB. happiness.\nC. good.\nD. virtue.\nAnswer: C\n\nQuestion: Psychological egoism is:\nA. an ethical theory about how we ought to behave.\nB. a generalization concerning the way people tend to behave.\nC. a claim about human nature and the ways people are capable of behaving.\nD. none of the above.\nAnswer: C\n\nQuestion: Before Tolstoy's Christian conversion, what was his perspective on the meaning of life?\nA. optimist\nB. satisfied\nC. nominally religious\nD. pessimist\nAnswer: D\n\nQuestion: According to d'Holbach, people always act according to _____.\nA. free choices\nB. dictates of the soul\nC. necessary natural laws\nD. undetermined will\nAnswer: C\n\nQuestion: What does the notion of “meaning in life” refer to?\nA. external meaning\nB. god's plan\nC. internalmeaning\nD. meaning in an afterlife\nAnswer:",
+      "ground_truth": "C",
+      "response": "D",
+      "choices": [
+        "A. external meaning",
+        "B. god's plan",
+        "C. internalmeaning",
+        "D. meaning in an afterlife"
+      ],
+      "full_logprobs": null
+    },
+    {
+      "sample_id": "id11",
+      "input": "In Plato's idealism, the unchanging Ideas or \"Forms\" in terms of which sensible objects both exist and are known must transcend (that is, exist beyond) the changing realm of appearances; because if Forms changed, then:",
+      "prompt": "The following are multiple choice questions (with answers) about philosophy.\n\nQuestion: The study of reality in the broadest sense, an inquiry into the elemental nature of the universe and the things in it, is known as _____.\nA. metaphysics\nB. epistemology\nC. quantum physics\nD. axiology\nAnswer: A\n\nQuestion: According to Moore’s “ideal utilitarianism,” the right action is the one that brings about the greatest amount of:\nA. pleasure.\nB. happiness.\nC. good.\nD. virtue.\nAnswer: C\n\nQuestion: Psychological egoism is:\nA. an ethical theory about how we ought to behave.\nB. a generalization concerning the way people tend to behave.\nC. a claim about human nature and the ways people are capable of behaving.\nD. none of the above.\nAnswer: C\n\nQuestion: Before Tolstoy's Christian conversion, what was his perspective on the meaning of life?\nA. optimist\nB. satisfied\nC. nominally religious\nD. pessimist\nAnswer: D\n\nQuestion: According to d'Holbach, people always act according to _____.\nA. free choices\nB. dictates of the soul\nC. necessary natural laws\nD. undetermined will\nAnswer: C\n\nQuestion: In Plato's idealism, the unchanging Ideas or \"Forms\" in terms of which sensible objects both exist and are known must transcend (that is, exist beyond) the changing realm of appearances; because if Forms changed, then:\nA. the only things in the sensible world that we could ever experience would be concepts.\nB. the sensible realm (in contrast to the intelligible realm) would consist only of copies of real things.\nC. nothing in the experienced world could be or be identified as one determinate thing or another.\nD. the sensible world would consist of unchanging Forms.\nAnswer:",
+      "ground_truth": "C",
+      "response": "D",
+      "choices": [
+        "A. the only things in the sensible world that we could ever experience would be concepts.",
+        "B. the sensible realm (in contrast to the intelligible realm) would consist only of copies of real things.",
+        "C. nothing in the experienced world could be or be identified as one determinate thing or another.",
+        "D. the sensible world would consist of unchanging Forms."
+      ],
+      "full_logprobs": null
+    },
+    {
+      "sample_id": "id65",
+      "input": "Aristotle says  that what makes things be what they are--their essence--does not exist apart from individ-uals that exist in the world.  So if all the members of a species were destroyed, then their essence or form:",
+      "prompt": "The following are multiple choice questions (with answers) about philosophy.\n\nQuestion: The study of reality in the broadest sense, an inquiry into the elemental nature of the universe and the things in it, is known as _____.\nA. metaphysics\nB. epistemology\nC. quantum physics\nD. axiology\nAnswer: A\n\nQuestion: According to Moore’s “ideal utilitarianism,” the right action is the one that brings about the greatest amount of:\nA. pleasure.\nB. happiness.\nC. good.\nD. virtue.\nAnswer: C\n\nQuestion: Psychological egoism is:\nA. an ethical theory about how we ought to behave.\nB. a generalization concerning the way people tend to behave.\nC. a claim about human nature and the ways people are capable of behaving.\nD. none of the above.\nAnswer: C\n\nQuestion: Before Tolstoy's Christian conversion, what was his perspective on the meaning of life?\nA. optimist\nB. satisfied\nC. nominally religious\nD. pessimist\nAnswer: D\n\nQuestion: According to d'Holbach, people always act according to _____.\nA. free choices\nB. dictates of the soul\nC. necessary natural laws\nD. undetermined will\nAnswer: C\n\nQuestion: Aristotle says  that what makes things be what they are--their essence--does not exist apart from individ-uals that exist in the world.  So if all the members of a species were destroyed, then their essence or form:\nA. would likewise be destroyed.\nB. would be destroyed only if there were no one around to remember the species.\nC. would continue existing (as with Plato's Forms) in some other realm of being.\nD. would not be destroyed because there was no essence or form originally to be destroyed; there are only individuals, not universal essences or natures of things.\nAnswer:",
+      "ground_truth": "A",
+      "response": "D",
+      "choices": [
+        "A. would likewise be destroyed.",
+        "B. would be destroyed only if there were no one around to remember the species.",
+        "C. would continue existing (as with Plato's Forms) in some other realm of being.",
+        "D. would not be destroyed because there was no essence or form originally to be destroyed; there are only individuals, not universal essences or natures of things."
+      ],
+      "full_logprobs": null
+    },
+    {
+      "sample_id": "id344",
+      "input": "Aesthetic values have to do with _______",
+      "prompt": "The following are multiple choice questions (with answers) about philosophy.\n\nQuestion: The study of reality in the broadest sense, an inquiry into the elemental nature of the universe and the things in it, is known as _____.\nA. metaphysics\nB. epistemology\nC. quantum physics\nD. axiology\nAnswer: A\n\nQuestion: According to Moore’s “ideal utilitarianism,” the right action is the one that brings about the greatest amount of:\nA. pleasure.\nB. happiness.\nC. good.\nD. virtue.\nAnswer: C\n\nQuestion: Psychological egoism is:\nA. an ethical theory about how we ought to behave.\nB. a generalization concerning the way people tend to behave.\nC. a claim about human nature and the ways people are capable of behaving.\nD. none of the above.\nAnswer: C\n\nQuestion: Before Tolstoy's Christian conversion, what was his perspective on the meaning of life?\nA. optimist\nB. satisfied\nC. nominally religious\nD. pessimist\nAnswer: D\n\nQuestion: According to d'Holbach, people always act according to _____.\nA. free choices\nB. dictates of the soul\nC. necessary natural laws\nD. undetermined will\nAnswer: C\n\nQuestion: Aesthetic values have to do with _______\nA. the moral value of works of art and other objects that could be judged beautiful\nB. good and bad works of art and other objects that could be judged beautiful\nC. the moral values of artists and critics\nD. pragmatic decisions regarding the display of art\nAnswer:",
+      "ground_truth": "B",
+      "response": "D",
+      "choices": [
+        "A. the moral value of works of art and other objects that could be judged beautiful",
+        "B. good and bad works of art and other objects that could be judged beautiful",
+        "C. the moral values of artists and critics",
+        "D. pragmatic decisions regarding the display of art"
+      ],
+      "full_logprobs": null
+    },
+    {
+      "sample_id": "id59",
+      "input": "According to Moore, we are thinking about good whenever we think about:",
+      "prompt": "The following are multiple choice questions (with answers) about philosophy.\n\nQuestion: The study of reality in the broadest sense, an inquiry into the elemental nature of the universe and the things in it, is known as _____.\nA. metaphysics\nB. epistemology\nC. quantum physics\nD. axiology\nAnswer: A\n\nQuestion: According to Moore’s “ideal utilitarianism,” the right action is the one that brings about the greatest amount of:\nA. pleasure.\nB. happiness.\nC. good.\nD. virtue.\nAnswer: C\n\nQuestion: Psychological egoism is:\nA. an ethical theory about how we ought to behave.\nB. a generalization concerning the way people tend to behave.\nC. a claim about human nature and the ways people are capable of behaving.\nD. none of the above.\nAnswer: C\n\nQuestion: Before Tolstoy's Christian conversion, what was his perspective on the meaning of life?\nA. optimist\nB. satisfied\nC. nominally religious\nD. pessimist\nAnswer: D\n\nQuestion: According to d'Holbach, people always act according to _____.\nA. free choices\nB. dictates of the soul\nC. necessary natural laws\nD. undetermined will\nAnswer: C\n\nQuestion: According to Moore, we are thinking about good whenever we think about:\nA. pleasure.\nB. things that we desire.\nC. intrinsic value.\nD. none of the above.\nAnswer:",
+      "ground_truth": "C",
+      "response": "D",
+      "choices": [
+        "A. pleasure.",
+        "B. things that we desire.",
+        "C. intrinsic value.",
+        "D. none of the above."
+      ],
+      "full_logprobs": null
+    },
+    {
+      "sample_id": "id291",
+      "input": "Craig argues that the series of events in time cannot be actually infinite, so we know that _____.",
+      "prompt": "The following are multiple choice questions (with answers) about philosophy.\n\nQuestion: The study of reality in the broadest sense, an inquiry into the elemental nature of the universe and the things in it, is known as _____.\nA. metaphysics\nB. epistemology\nC. quantum physics\nD. axiology\nAnswer: A\n\nQuestion: According to Moore’s “ideal utilitarianism,” the right action is the one that brings about the greatest amount of:\nA. pleasure.\nB. happiness.\nC. good.\nD. virtue.\nAnswer: C\n\nQuestion: Psychological egoism is:\nA. an ethical theory about how we ought to behave.\nB. a generalization concerning the way people tend to behave.\nC. a claim about human nature and the ways people are capable of behaving.\nD. none of the above.\nAnswer: C\n\nQuestion: Before Tolstoy's Christian conversion, what was his perspective on the meaning of life?\nA. optimist\nB. satisfied\nC. nominally religious\nD. pessimist\nAnswer: D\n\nQuestion: According to d'Holbach, people always act according to _____.\nA. free choices\nB. dictates of the soul\nC. necessary natural laws\nD. undetermined will\nAnswer: C\n\nQuestion: Craig argues that the series of events in time cannot be actually infinite, so we know that _____.\nA. the universe is finite in the past and began to exist\nB. the universe is infinite in the past\nC. the universe never is uncaused\nD. the universe exists only in the mind\nAnswer:",
+      "ground_truth": "A",
+      "response": "D",
+      "choices": [
+        "A. the universe is finite in the past and began to exist",
+        "B. the universe is infinite in the past",
+        "C. the universe never is uncaused",
+        "D. the universe exists only in the mind"
+      ],
+      "full_logprobs": null
+    },
+    {
+      "sample_id": "id131",
+      "input": "Baier claims that ethical skepticism is often due to:",
+      "prompt": "The following are multiple choice questions (with answers) about philosophy.\n\nQuestion: The study of reality in the broadest sense, an inquiry into the elemental nature of the universe and the things in it, is known as _____.\nA. metaphysics\nB. epistemology\nC. quantum physics\nD. axiology\nAnswer: A\n\nQuestion: According to Moore’s “ideal utilitarianism,” the right action is the one that brings about the greatest amount of:\nA. pleasure.\nB. happiness.\nC. good.\nD. virtue.\nAnswer: C\n\nQuestion: Psychological egoism is:\nA. an ethical theory about how we ought to behave.\nB. a generalization concerning the way people tend to behave.\nC. a claim about human nature and the ways people are capable of behaving.\nD. none of the above.\nAnswer: C\n\nQuestion: Before Tolstoy's Christian conversion, what was his perspective on the meaning of life?\nA. optimist\nB. satisfied\nC. nominally religious\nD. pessimist\nAnswer: D\n\nQuestion: According to d'Holbach, people always act according to _____.\nA. free choices\nB. dictates of the soul\nC. necessary natural laws\nD. undetermined will\nAnswer: C\n\nQuestion: Baier claims that ethical skepticism is often due to:\nA. confused metaphysical theories.\nB. confused epistemological theories.\nC. confused religious views.\nD. confused scientific theories.\nAnswer:",
+      "ground_truth": "B",
+      "response": "D",
+      "choices": [
+        "A. confused metaphysical theories.",
+        "B. confused epistemological theories.",
+        "C. confused religious views.",
+        "D. confused scientific theories."
+      ],
+      "full_logprobs": null
+    },
+    {
+      "sample_id": "id222",
+      "input": "Mill says that the ultimate end of utilitarianism is an existence as free of pain as possible and as rich as possible in _____.",
+      "prompt": "The following are multiple choice questions (with answers) about philosophy.\n\nQuestion: The study of reality in the broadest sense, an inquiry into the elemental nature of the universe and the things in it, is known as _____.\nA. metaphysics\nB. epistemology\nC. quantum physics\nD. axiology\nAnswer: A\n\nQuestion: According to Moore’s “ideal utilitarianism,” the right action is the one that brings about the greatest amount of:\nA. pleasure.\nB. happiness.\nC. good.\nD. virtue.\nAnswer: C\n\nQuestion: Psychological egoism is:\nA. an ethical theory about how we ought to behave.\nB. a generalization concerning the way people tend to behave.\nC. a claim about human nature and the ways people are capable of behaving.\nD. none of the above.\nAnswer: C\n\nQuestion: Before Tolstoy's Christian conversion, what was his perspective on the meaning of life?\nA. optimist\nB. satisfied\nC. nominally religious\nD. pessimist\nAnswer: D\n\nQuestion: According to d'Holbach, people always act according to _____.\nA. free choices\nB. dictates of the soul\nC. necessary natural laws\nD. undetermined will\nAnswer: C\n\nQuestion: Mill says that the ultimate end of utilitarianism is an existence as free of pain as possible and as rich as possible in _____.\nA. lower pleasures\nB. spiritual attainment\nC. social achievement\nD. enjoyments\nAnswer:",
+      "ground_truth": "D",
+      "response": "D",
+      "choices": [
+        "A. lower pleasures",
+        "B. spiritual attainment",
+        "C. social achievement",
+        "D. enjoyments"
+      ],
+      "full_logprobs": null
+    },
+    {
+      "sample_id": "id259",
+      "input": "Philo says the analogy that Cleanthes uses to make his case is _____.",
+      "prompt": "The following are multiple choice questions (with answers) about philosophy.\n\nQuestion: The study of reality in the broadest sense, an inquiry into the elemental nature of the universe and the things in it, is known as _____.\nA. metaphysics\nB. epistemology\nC. quantum physics\nD. axiology\nAnswer: A\n\nQuestion: According to Moore’s “ideal utilitarianism,” the right action is the one that brings about the greatest amount of:\nA. pleasure.\nB. happiness.\nC. good.\nD. virtue.\nAnswer: C\n\nQuestion: Psychological egoism is:\nA. an ethical theory about how we ought to behave.\nB. a generalization concerning the way people tend to behave.\nC. a claim about human nature and the ways people are capable of behaving.\nD. none of the above.\nAnswer: C\n\nQuestion: Before Tolstoy's Christian conversion, what was his perspective on the meaning of life?\nA. optimist\nB. satisfied\nC. nominally religious\nD. pessimist\nAnswer: D\n\nQuestion: According to d'Holbach, people always act according to _____.\nA. free choices\nB. dictates of the soul\nC. necessary natural laws\nD. undetermined will\nAnswer: C\n\nQuestion: Philo says the analogy that Cleanthes uses to make his case is _____.\nA. too complicated\nB. weak\nC. strong\nD. not based on a legitimate method of reasoning\nAnswer:",
+      "ground_truth": "B",
+      "response": "D",
+      "choices": [
+        "A. too complicated",
+        "B. weak",
+        "C. strong",
+        "D. not based on a legitimate method of reasoning"
+      ],
+      "full_logprobs": null
+    },
+    {
+      "sample_id": "id105",
+      "input": "Baggini accuses pessimists of mixing up the two senses of _____.",
+      "prompt": "The following are multiple choice questions (with answers) about philosophy.\n\nQuestion: The study of reality in the broadest sense, an inquiry into the elemental nature of the universe and the things in it, is known as _____.\nA. metaphysics\nB. epistemology\nC. quantum physics\nD. axiology\nAnswer: A\n\nQuestion: According to Moore’s “ideal utilitarianism,” the right action is the one that brings about the greatest amount of:\nA. pleasure.\nB. happiness.\nC. good.\nD. virtue.\nAnswer: C\n\nQuestion: Psychological egoism is:\nA. an ethical theory about how we ought to behave.\nB. a generalization concerning the way people tend to behave.\nC. a claim about human nature and the ways people are capable of behaving.\nD. none of the above.\nAnswer: C\n\nQuestion: Before Tolstoy's Christian conversion, what was his perspective on the meaning of life?\nA. optimist\nB. satisfied\nC. nominally religious\nD. pessimist\nAnswer: D\n\nQuestion: According to d'Holbach, people always act according to _____.\nA. free choices\nB. dictates of the soul\nC. necessary natural laws\nD. undetermined will\nAnswer: C\n\nQuestion: Baggini accuses pessimists of mixing up the two senses of _____.\nA. philosophy\nB. religion\nC. meaning\nD. time\nAnswer:",
+      "ground_truth": "C",
+      "response": "D",
+      "choices": [
+        "A. philosophy",
+        "B. religion",
+        "C. meaning",
+        "D. time"
+      ],
+      "full_logprobs": null
+    }
+  ]
+}
\ No newline at end of file
diff --git a/tests/data/helm/mmlu:subject=philosophy,method=multiple_choice_joint,model=openai_gpt2/per_instance_stats.json b/tests/data/helm/mmlu:subject=philosophy,method=multiple_choice_joint,model=openai_gpt2/per_instance_stats.json
new file mode 100644
index 0000000..bebce63
--- /dev/null
+++ b/tests/data/helm/mmlu:subject=philosophy,method=multiple_choice_joint,model=openai_gpt2/per_instance_stats.json
@@ -0,0 +1,3742 @@
+[
+  {
+    "instance_id": "id147",
+    "train_trial_index": 0,
+    "stats": [
+      {
+        "name": {
+          "name": "num_references",
+          "split": "test"
+        },
+        "count": 1,
+        "sum": 4,
+        "sum_squared": 16,
+        "min": 4,
+        "max": 4,
+        "mean": 4.0,
+        "variance": 0.0,
+        "stddev": 0.0
+      },
+      {
+        "name": {
+          "name": "num_train_trials",
+          "split": "test"
+        },
+        "count": 1,
+        "sum": 1,
+        "sum_squared": 1,
+        "min": 1,
+        "max": 1,
+        "mean": 1.0,
+        "variance": 0.0,
+        "stddev": 0.0
+      },
+      {
+        "name": {
+          "name": "num_prompt_tokens",
+          "split": "test"
+        },
+        "count": 1,
+        "sum": 333,
+        "sum_squared": 110889,
+        "min": 333,
+        "max": 333,
+        "mean": 333.0,
+        "variance": 0.0,
+        "stddev": 0.0
+      },
+      {
+        "name": {
+          "name": "num_completion_tokens",
+          "split": "test"
+        },
+        "count": 1,
+        "sum": 1,
+        "sum_squared": 1,
+        "min": 1,
+        "max": 1,
+        "mean": 1.0,
+        "variance": 0.0,
+        "stddev": 0.0
+      },
+      {
+        "name": {
+          "name": "num_output_tokens",
+          "split": "test"
+        },
+        "count": 1,
+        "sum": 1,
+        "sum_squared": 1,
+        "min": 1,
+        "max": 1,
+        "mean": 1.0,
+        "variance": 0.0,
+        "stddev": 0.0
+      },
+      {
+        "name": {
+          "name": "training_co2_cost",
+          "split": "test"
+        },
+        "count": 0,
+        "sum": 0,
+        "sum_squared": 0
+      },
+      {
+        "name": {
+          "name": "training_energy_cost",
+          "split": "test"
+        },
+        "count": 0,
+        "sum": 0,
+        "sum_squared": 0
+      },
+      {
+        "name": {
+          "name": "inference_runtime",
+          "split": "test"
+        },
+        "count": 1,
+        "sum": 0.6803672313690186,
+        "sum_squared": 0.4628995695207436,
+        "min": 0.6803672313690186,
+        "max": 0.6803672313690186,
+        "mean": 0.6803672313690186,
+        "variance": 0.0,
+        "stddev": 0.0
+      },
+      {
+        "name": {
+          "name": "batch_size",
+          "split": "test"
+        },
+        "count": 1,
+        "sum": 1,
+        "sum_squared": 1,
+        "min": 1,
+        "max": 1,
+        "mean": 1.0,
+        "variance": 0.0,
+        "stddev": 0.0
+      },
+      {
+        "name": {
+          "name": "finish_reason_length",
+          "split": "test"
+        },
+        "count": 1,
+        "sum": 0,
+        "sum_squared": 0,
+        "min": 0,
+        "max": 0,
+        "mean": 0.0,
+        "variance": 0.0,
+        "stddev": 0.0
+      },
+      {
+        "name": {
+          "name": "finish_reason_stop",
+          "split": "test"
+        },
+        "count": 1,
+        "sum": 0,
+        "sum_squared": 0,
+        "min": 0,
+        "max": 0,
+        "mean": 0.0,
+        "variance": 0.0,
+        "stddev": 0.0
+      },
+      {
+        "name": {
+          "name": "finish_reason_endoftext",
+          "split": "test"
+        },
+        "count": 1,
+        "sum": 0,
+        "sum_squared": 0,
+        "min": 0,
+        "max": 0,
+        "mean": 0.0,
+        "variance": 0.0,
+        "stddev": 0.0
+      },
+      {
+        "name": {
+          "name": "finish_reason_unknown",
+          "split": "test"
+        },
+        "count": 1,
+        "sum": 1,
+        "sum_squared": 1,
+        "min": 1,
+        "max": 1,
+        "mean": 1.0,
+        "variance": 0.0,
+        "stddev": 0.0
+      },
+      {
+        "name": {
+          "name": "num_train_instances",
+          "split": "test"
+        },
+        "count": 1,
+        "sum": 5,
+        "sum_squared": 25,
+        "min": 5,
+        "max": 5,
+        "mean": 5.0,
+        "variance": 0.0,
+        "stddev": 0.0
+      },
+      {
+        "name": {
+          "name": "prompt_truncated",
+          "split": "test"
+        },
+        "count": 1,
+        "sum": 0,
+        "sum_squared": 0,
+        "min": 0,
+        "max": 0,
+        "mean": 0.0,
+        "variance": 0.0,
+        "stddev": 0.0
+      },
+      {
+        "name": {
+          "name": "max_prob",
+          "split": "test"
+        },
+        "count": 1,
+        "sum": 1.0,
+        "sum_squared": 1.0,
+        "min": 1.0,
+        "max": 1.0,
+        "mean": 1.0,
+        "variance": 0.0,
+        "stddev": 0.0
+      },
+      {
+        "name": {
+          "name": "exact_match",
+          "split": "test"
+        },
+        "count": 1,
+        "sum": 0,
+        "sum_squared": 0,
+        "min": 0,
+        "max": 0,
+        "mean": 0.0,
+        "variance": 0.0,
+        "stddev": 0.0
+      },
+      {
+        "name": {
+          "name": "exact_match@5",
+          "split": "test"
+        },
+        "count": 1,
+        "sum": 0,
+        "sum_squared": 0,
+        "min": 0,
+        "max": 0,
+        "mean": 0.0,
+        "variance": 0.0,
+        "stddev": 0.0
+      },
+      {
+        "name": {
+          "name": "quasi_exact_match",
+          "split": "test"
+        },
+        "count": 1,
+        "sum": 0,
+        "sum_squared": 0,
+        "min": 0,
+        "max": 0,
+        "mean": 0.0,
+        "variance": 0.0,
+        "stddev": 0.0
+      },
+      {
+        "name": {
+          "name": "quasi_exact_match@5",
+          "split": "test"
+        },
+        "count": 1,
+        "sum": 0,
+        "sum_squared": 0,
+        "min": 0,
+        "max": 0,
+        "mean": 0.0,
+        "variance": 0.0,
+        "stddev": 0.0
+      },
+      {
+        "name": {
+          "name": "prefix_exact_match",
+          "split": "test"
+        },
+        "count": 1,
+        "sum": 0,
+        "sum_squared": 0,
+        "min": 0,
+        "max": 0,
+        "mean": 0.0,
+        "variance": 0.0,
+        "stddev": 0.0
+      },
+      {
+        "name": {
+          "name": "prefix_exact_match@5",
+          "split": "test"
+        },
+        "count": 1,
+        "sum": 0,
+        "sum_squared": 0,
+        "min": 0,
+        "max": 0,
+        "mean": 0.0,
+        "variance": 0.0,
+        "stddev": 0.0
+      },
+      {
+        "name": {
+          "name": "quasi_prefix_exact_match",
+          "split": "test"
+        },
+        "count": 1,
+        "sum": 0,
+        "sum_squared": 0,
+        "min": 0,
+        "max": 0,
+        "mean": 0.0,
+        "variance": 0.0,
+        "stddev": 0.0
+      },
+      {
+        "name": {
+          "name": "quasi_prefix_exact_match@5",
+          "split": "test"
+        },
+        "count": 1,
+        "sum": 0,
+        "sum_squared": 0,
+        "min": 0,
+        "max": 0,
+        "mean": 0.0,
+        "variance": 0.0,
+        "stddev": 0.0
+      },
+      {
+        "name": {
+          "name": "logprob",
+          "split": "test"
+        },
+        "count": 1,
+        "sum": 0.0,
+        "sum_squared": 0.0,
+        "min": 0.0,
+        "max": 0.0,
+        "mean": 0.0,
+        "variance": 0.0,
+        "stddev": 0.0
+      },
+      {
+        "name": {
+          "name": "num_perplexity_tokens",
+          "split": "test"
+        },
+        "count": 1,
+        "sum": 1,
+        "sum_squared": 1,
+        "min": 1,
+        "max": 1,
+        "mean": 1.0,
+        "variance": 0.0,
+        "stddev": 0.0
+      },
+      {
+        "name": {
+          "name": "num_bytes",
+          "split": "test"
+        },
+        "count": 1,
+        "sum": 2,
+        "sum_squared": 4,
+        "min": 2,
+        "max": 2,
+        "mean": 2.0,
+        "variance": 0.0,
+        "stddev": 0.0
+      }
+    ]
+  },
+  {
+    "instance_id": "id11",
+    "train_trial_index": 0,
+    "stats": [
+      {
+        "name": {
+          "name": "num_references",
+          "split": "valid"
+        },
+        "count": 1,
+        "sum": 4,
+        "sum_squared": 16,
+        "min": 4,
+        "max": 4,
+        "mean": 4.0,
+        "variance": 0.0,
+        "stddev": 0.0
+      },
+      {
+        "name": {
+          "name": "num_train_trials",
+          "split": "valid"
+        },
+        "count": 1,
+        "sum": 1,
+        "sum_squared": 1,
+        "min": 1,
+        "max": 1,
+        "mean": 1.0,
+        "variance": 0.0,
+        "stddev": 0.0
+      },
+      {
+        "name": {
+          "name": "num_prompt_tokens",
+          "split": "valid"
+        },
+        "count": 1,
+        "sum": 420,
+        "sum_squared": 176400,
+        "min": 420,
+        "max": 420,
+        "mean": 420.0,
+        "variance": 0.0,
+        "stddev": 0.0
+      },
+      {
+        "name": {
+          "name": "num_completion_tokens",
+          "split": "valid"
+        },
+        "count": 1,
+        "sum": 1,
+        "sum_squared": 1,
+        "min": 1,
+        "max": 1,
+        "mean": 1.0,
+        "variance": 0.0,
+        "stddev": 0.0
+      },
+      {
+        "name": {
+          "name": "num_output_tokens",
+          "split": "valid"
+        },
+        "count": 1,
+        "sum": 1,
+        "sum_squared": 1,
+        "min": 1,
+        "max": 1,
+        "mean": 1.0,
+        "variance": 0.0,
+        "stddev": 0.0
+      },
+      {
+        "name": {
+          "name": "training_co2_cost",
+          "split": "valid"
+        },
+        "count": 0,
+        "sum": 0,
+        "sum_squared": 0
+      },
+      {
+        "name": {
+          "name": "training_energy_cost",
+          "split": "valid"
+        },
+        "count": 0,
+        "sum": 0,
+        "sum_squared": 0
+      },
+      {
+        "name": {
+          "name": "inference_runtime",
+          "split": "valid"
+        },
+        "count": 1,
+        "sum": 0.6785750389099121,
+        "sum_squared": 0.46046408343158873,
+        "min": 0.6785750389099121,
+        "max": 0.6785750389099121,
+        "mean": 0.6785750389099121,
+        "variance": 0.0,
+        "stddev": 0.0
+      },
+      {
+        "name": {
+          "name": "batch_size",
+          "split": "valid"
+        },
+        "count": 1,
+        "sum": 1,
+        "sum_squared": 1,
+        "min": 1,
+        "max": 1,
+        "mean": 1.0,
+        "variance": 0.0,
+        "stddev": 0.0
+      },
+      {
+        "name": {
+          "name": "finish_reason_length",
+          "split": "valid"
+        },
+        "count": 1,
+        "sum": 0,
+        "sum_squared": 0,
+        "min": 0,
+        "max": 0,
+        "mean": 0.0,
+        "variance": 0.0,
+        "stddev": 0.0
+      },
+      {
+        "name": {
+          "name": "finish_reason_stop",
+          "split": "valid"
+        },
+        "count": 1,
+        "sum": 0,
+        "sum_squared": 0,
+        "min": 0,
+        "max": 0,
+        "mean": 0.0,
+        "variance": 0.0,
+        "stddev": 0.0
+      },
+      {
+        "name": {
+          "name": "finish_reason_endoftext",
+          "split": "valid"
+        },
+        "count": 1,
+        "sum": 0,
+        "sum_squared": 0,
+        "min": 0,
+        "max": 0,
+        "mean": 0.0,
+        "variance": 0.0,
+        "stddev": 0.0
+      },
+      {
+        "name": {
+          "name": "finish_reason_unknown",
+          "split": "valid"
+        },
+        "count": 1,
+        "sum": 1,
+        "sum_squared": 1,
+        "min": 1,
+        "max": 1,
+        "mean": 1.0,
+        "variance": 0.0,
+        "stddev": 0.0
+      },
+      {
+        "name": {
+          "name": "num_train_instances",
+          "split": "valid"
+        },
+        "count": 1,
+        "sum": 5,
+        "sum_squared": 25,
+        "min": 5,
+        "max": 5,
+        "mean": 5.0,
+        "variance": 0.0,
+        "stddev": 0.0
+      },
+      {
+        "name": {
+          "name": "prompt_truncated",
+          "split": "valid"
+        },
+        "count": 1,
+        "sum": 0,
+        "sum_squared": 0,
+        "min": 0,
+        "max": 0,
+        "mean": 0.0,
+        "variance": 0.0,
+        "stddev": 0.0
+      },
+      {
+        "name": {
+          "name": "max_prob",
+          "split": "valid"
+        },
+        "count": 1,
+        "sum": 1.0,
+        "sum_squared": 1.0,
+        "min": 1.0,
+        "max": 1.0,
+        "mean": 1.0,
+        "variance": 0.0,
+        "stddev": 0.0
+      },
+      {
+        "name": {
+          "name": "exact_match",
+          "split": "valid"
+        },
+        "count": 1,
+        "sum": 0,
+        "sum_squared": 0,
+        "min": 0,
+        "max": 0,
+        "mean": 0.0,
+        "variance": 0.0,
+        "stddev": 0.0
+      },
+      {
+        "name": {
+          "name": "exact_match@5",
+          "split": "valid"
+        },
+        "count": 1,
+        "sum": 0,
+        "sum_squared": 0,
+        "min": 0,
+        "max": 0,
+        "mean": 0.0,
+        "variance": 0.0,
+        "stddev": 0.0
+      },
+      {
+        "name": {
+          "name": "quasi_exact_match",
+          "split": "valid"
+        },
+        "count": 1,
+        "sum": 0,
+        "sum_squared": 0,
+        "min": 0,
+        "max": 0,
+        "mean": 0.0,
+        "variance": 0.0,
+        "stddev": 0.0
+      },
+      {
+        "name": {
+          "name": "quasi_exact_match@5",
+          "split": "valid"
+        },
+        "count": 1,
+        "sum": 0,
+        "sum_squared": 0,
+        "min": 0,
+        "max": 0,
+        "mean": 0.0,
+        "variance": 0.0,
+        "stddev": 0.0
+      },
+      {
+        "name": {
+          "name": "prefix_exact_match",
+          "split": "valid"
+        },
+        "count": 1,
+        "sum": 0,
+        "sum_squared": 0,
+        "min": 0,
+        "max": 0,
+        "mean": 0.0,
+        "variance": 0.0,
+        "stddev": 0.0
+      },
+      {
+        "name": {
+          "name": "prefix_exact_match@5",
+          "split": "valid"
+        },
+        "count": 1,
+        "sum": 0,
+        "sum_squared": 0,
+        "min": 0,
+        "max": 0,
+        "mean": 0.0,
+        "variance": 0.0,
+        "stddev": 0.0
+      },
+      {
+        "name": {
+          "name": "quasi_prefix_exact_match",
+          "split": "valid"
+        },
+        "count": 1,
+        "sum": 0,
+        "sum_squared": 0,
+        "min": 0,
+        "max": 0,
+        "mean": 0.0,
+        "variance": 0.0,
+        "stddev": 0.0
+      },
+      {
+        "name": {
+          "name": "quasi_prefix_exact_match@5",
+          "split": "valid"
+        },
+        "count": 1,
+        "sum": 0,
+        "sum_squared": 0,
+        "min": 0,
+        "max": 0,
+        "mean": 0.0,
+        "variance": 0.0,
+        "stddev": 0.0
+      },
+      {
+        "name": {
+          "name": "logprob",
+          "split": "valid"
+        },
+        "count": 1,
+        "sum": 0.0,
+        "sum_squared": 0.0,
+        "min": 0.0,
+        "max": 0.0,
+        "mean": 0.0,
+        "variance": 0.0,
+        "stddev": 0.0
+      },
+      {
+        "name": {
+          "name": "num_perplexity_tokens",
+          "split": "valid"
+        },
+        "count": 1,
+        "sum": 1,
+        "sum_squared": 1,
+        "min": 1,
+        "max": 1,
+        "mean": 1.0,
+        "variance": 0.0,
+        "stddev": 0.0
+      },
+      {
+        "name": {
+          "name": "num_bytes",
+          "split": "valid"
+        },
+        "count": 1,
+        "sum": 2,
+        "sum_squared": 4,
+        "min": 2,
+        "max": 2,
+        "mean": 2.0,
+        "variance": 0.0,
+        "stddev": 0.0
+      }
+    ]
+  },
+  {
+    "instance_id": "id65",
+    "train_trial_index": 0,
+    "stats": [
+      {
+        "name": {
+          "name": "num_references",
+          "split": "test"
+        },
+        "count": 1,
+        "sum": 4,
+        "sum_squared": 16,
+        "min": 4,
+        "max": 4,
+        "mean": 4.0,
+        "variance": 0.0,
+        "stddev": 0.0
+      },
+      {
+        "name": {
+          "name": "num_train_trials",
+          "split": "test"
+        },
+        "count": 1,
+        "sum": 1,
+        "sum_squared": 1,
+        "min": 1,
+        "max": 1,
+        "mean": 1.0,
+        "variance": 0.0,
+        "stddev": 0.0
+      },
+      {
+        "name": {
+          "name": "num_prompt_tokens",
+          "split": "test"
+        },
+        "count": 1,
+        "sum": 424,
+        "sum_squared": 179776,
+        "min": 424,
+        "max": 424,
+        "mean": 424.0,
+        "variance": 0.0,
+        "stddev": 0.0
+      },
+      {
+        "name": {
+          "name": "num_completion_tokens",
+          "split": "test"
+        },
+        "count": 1,
+        "sum": 1,
+        "sum_squared": 1,
+        "min": 1,
+        "max": 1,
+        "mean": 1.0,
+        "variance": 0.0,
+        "stddev": 0.0
+      },
+      {
+        "name": {
+          "name": "num_output_tokens",
+          "split": "test"
+        },
+        "count": 1,
+        "sum": 1,
+        "sum_squared": 1,
+        "min": 1,
+        "max": 1,
+        "mean": 1.0,
+        "variance": 0.0,
+        "stddev": 0.0
+      },
+      {
+        "name": {
+          "name": "training_co2_cost",
+          "split": "test"
+        },
+        "count": 0,
+        "sum": 0,
+        "sum_squared": 0
+      },
+      {
+        "name": {
+          "name": "training_energy_cost",
+          "split": "test"
+        },
+        "count": 0,
+        "sum": 0,
+        "sum_squared": 0
+      },
+      {
+        "name": {
+          "name": "inference_runtime",
+          "split": "test"
+        },
+        "count": 1,
+        "sum": 0.6800699234008789,
+        "sum_squared": 0.4624951007144773,
+        "min": 0.6800699234008789,
+        "max": 0.6800699234008789,
+        "mean": 0.6800699234008789,
+        "variance": 0.0,
+        "stddev": 0.0
+      },
+      {
+        "name": {
+          "name": "batch_size",
+          "split": "test"
+        },
+        "count": 1,
+        "sum": 1,
+        "sum_squared": 1,
+        "min": 1,
+        "max": 1,
+        "mean": 1.0,
+        "variance": 0.0,
+        "stddev": 0.0
+      },
+      {
+        "name": {
+          "name": "finish_reason_length",
+          "split": "test"
+        },
+        "count": 1,
+        "sum": 0,
+        "sum_squared": 0,
+        "min": 0,
+        "max": 0,
+        "mean": 0.0,
+        "variance": 0.0,
+        "stddev": 0.0
+      },
+      {
+        "name": {
+          "name": "finish_reason_stop",
+          "split": "test"
+        },
+        "count": 1,
+        "sum": 0,
+        "sum_squared": 0,
+        "min": 0,
+        "max": 0,
+        "mean": 0.0,
+        "variance": 0.0,
+        "stddev": 0.0
+      },
+      {
+        "name": {
+          "name": "finish_reason_endoftext",
+          "split": "test"
+        },
+        "count": 1,
+        "sum": 0,
+        "sum_squared": 0,
+        "min": 0,
+        "max": 0,
+        "mean": 0.0,
+        "variance": 0.0,
+        "stddev": 0.0
+      },
+      {
+        "name": {
+          "name": "finish_reason_unknown",
+          "split": "test"
+        },
+        "count": 1,
+        "sum": 1,
+        "sum_squared": 1,
+        "min": 1,
+        "max": 1,
+        "mean": 1.0,
+        "variance": 0.0,
+        "stddev": 0.0
+      },
+      {
+        "name": {
+          "name": "num_train_instances",
+          "split": "test"
+        },
+        "count": 1,
+        "sum": 5,
+        "sum_squared": 25,
+        "min": 5,
+        "max": 5,
+        "mean": 5.0,
+        "variance": 0.0,
+        "stddev": 0.0
+      },
+      {
+        "name": {
+          "name": "prompt_truncated",
+          "split": "test"
+        },
+        "count": 1,
+        "sum": 0,
+        "sum_squared": 0,
+        "min": 0,
+        "max": 0,
+        "mean": 0.0,
+        "variance": 0.0,
+        "stddev": 0.0
+      },
+      {
+        "name": {
+          "name": "max_prob",
+          "split": "test"
+        },
+        "count": 1,
+        "sum": 1.0,
+        "sum_squared": 1.0,
+        "min": 1.0,
+        "max": 1.0,
+        "mean": 1.0,
+        "variance": 0.0,
+        "stddev": 0.0
+      },
+      {
+        "name": {
+          "name": "exact_match",
+          "split": "test"
+        },
+        "count": 1,
+        "sum": 0,
+        "sum_squared": 0,
+        "min": 0,
+        "max": 0,
+        "mean": 0.0,
+        "variance": 0.0,
+        "stddev": 0.0
+      },
+      {
+        "name": {
+          "name": "exact_match@5",
+          "split": "test"
+        },
+        "count": 1,
+        "sum": 0,
+        "sum_squared": 0,
+        "min": 0,
+        "max": 0,
+        "mean": 0.0,
+        "variance": 0.0,
+        "stddev": 0.0
+      },
+      {
+        "name": {
+          "name": "quasi_exact_match",
+          "split": "test"
+        },
+        "count": 1,
+        "sum": 0,
+        "sum_squared": 0,
+        "min": 0,
+        "max": 0,
+        "mean": 0.0,
+        "variance": 0.0,
+        "stddev": 0.0
+      },
+      {
+        "name": {
+          "name": "quasi_exact_match@5",
+          "split": "test"
+        },
+        "count": 1,
+        "sum": 0,
+        "sum_squared": 0,
+        "min": 0,
+        "max": 0,
+        "mean": 0.0,
+        "variance": 0.0,
+        "stddev": 0.0
+      },
+      {
+        "name": {
+          "name": "prefix_exact_match",
+          "split": "test"
+        },
+        "count": 1,
+        "sum": 0,
+        "sum_squared": 0,
+        "min": 0,
+        "max": 0,
+        "mean": 0.0,
+        "variance": 0.0,
+        "stddev": 0.0
+      },
+      {
+        "name": {
+          "name": "prefix_exact_match@5",
+          "split": "test"
+        },
+        "count": 1,
+        "sum": 0,
+        "sum_squared": 0,
+        "min": 0,
+        "max": 0,
+        "mean": 0.0,
+        "variance": 0.0,
+        "stddev": 0.0
+      },
+      {
+        "name": {
+          "name": "quasi_prefix_exact_match",
+          "split": "test"
+        },
+        "count": 1,
+        "sum": 0,
+        "sum_squared": 0,
+        "min": 0,
+        "max": 0,
+        "mean": 0.0,
+        "variance": 0.0,
+        "stddev": 0.0
+      },
+      {
+        "name": {
+          "name": "quasi_prefix_exact_match@5",
+          "split": "test"
+        },
+        "count": 1,
+        "sum": 0,
+        "sum_squared": 0,
+        "min": 0,
+        "max": 0,
+        "mean": 0.0,
+        "variance": 0.0,
+        "stddev": 0.0
+      },
+      {
+        "name": {
+          "name": "logprob",
+          "split": "test"
+        },
+        "count": 1,
+        "sum": 0.0,
+        "sum_squared": 0.0,
+        "min": 0.0,
+        "max": 0.0,
+        "mean": 0.0,
+        "variance": 0.0,
+        "stddev": 0.0
+      },
+      {
+        "name": {
+          "name": "num_perplexity_tokens",
+          "split": "test"
+        },
+        "count": 1,
+        "sum": 1,
+        "sum_squared": 1,
+        "min": 1,
+        "max": 1,
+        "mean": 1.0,
+        "variance": 0.0,
+        "stddev": 0.0
+      },
+      {
+        "name": {
+          "name": "num_bytes",
+          "split": "test"
+        },
+        "count": 1,
+        "sum": 2,
+        "sum_squared": 4,
+        "min": 2,
+        "max": 2,
+        "mean": 2.0,
+        "variance": 0.0,
+        "stddev": 0.0
+      }
+    ]
+  },
+  {
+    "instance_id": "id344",
+    "train_trial_index": 0,
+    "stats": [
+      {
+        "name": {
+          "name": "num_references",
+          "split": "test"
+        },
+        "count": 1,
+        "sum": 4,
+        "sum_squared": 16,
+        "min": 4,
+        "max": 4,
+        "mean": 4.0,
+        "variance": 0.0,
+        "stddev": 0.0
+      },
+      {
+        "name": {
+          "name": "num_train_trials",
+          "split": "test"
+        },
+        "count": 1,
+        "sum": 1,
+        "sum_squared": 1,
+        "min": 1,
+        "max": 1,
+        "mean": 1.0,
+        "variance": 0.0,
+        "stddev": 0.0
+      },
+      {
+        "name": {
+          "name": "num_prompt_tokens",
+          "split": "test"
+        },
+        "count": 1,
+        "sum": 359,
+        "sum_squared": 128881,
+        "min": 359,
+        "max": 359,
+        "mean": 359.0,
+        "variance": 0.0,
+        "stddev": 0.0
+      },
+      {
+        "name": {
+          "name": "num_completion_tokens",
+          "split": "test"
+        },
+        "count": 1,
+        "sum": 1,
+        "sum_squared": 1,
+        "min": 1,
+        "max": 1,
+        "mean": 1.0,
+        "variance": 0.0,
+        "stddev": 0.0
+      },
+      {
+        "name": {
+          "name": "num_output_tokens",
+          "split": "test"
+        },
+        "count": 1,
+        "sum": 1,
+        "sum_squared": 1,
+        "min": 1,
+        "max": 1,
+        "mean": 1.0,
+        "variance": 0.0,
+        "stddev": 0.0
+      },
+      {
+        "name": {
+          "name": "training_co2_cost",
+          "split": "test"
+        },
+        "count": 0,
+        "sum": 0,
+        "sum_squared": 0
+      },
+      {
+        "name": {
+          "name": "training_energy_cost",
+          "split": "test"
+        },
+        "count": 0,
+        "sum": 0,
+        "sum_squared": 0
+      },
+      {
+        "name": {
+          "name": "inference_runtime",
+          "split": "test"
+        },
+        "count": 1,
+        "sum": 0.6793179512023926,
+        "sum_squared": 0.4614728788258162,
+        "min": 0.6793179512023926,
+        "max": 0.6793179512023926,
+        "mean": 0.6793179512023926,
+        "variance": 0.0,
+        "stddev": 0.0
+      },
+      {
+        "name": {
+          "name": "batch_size",
+          "split": "test"
+        },
+        "count": 1,
+        "sum": 1,
+        "sum_squared": 1,
+        "min": 1,
+        "max": 1,
+        "mean": 1.0,
+        "variance": 0.0,
+        "stddev": 0.0
+      },
+      {
+        "name": {
+          "name": "finish_reason_length",
+          "split": "test"
+        },
+        "count": 1,
+        "sum": 0,
+        "sum_squared": 0,
+        "min": 0,
+        "max": 0,
+        "mean": 0.0,
+        "variance": 0.0,
+        "stddev": 0.0
+      },
+      {
+        "name": {
+          "name": "finish_reason_stop",
+          "split": "test"
+        },
+        "count": 1,
+        "sum": 0,
+        "sum_squared": 0,
+        "min": 0,
+        "max": 0,
+        "mean": 0.0,
+        "variance": 0.0,
+        "stddev": 0.0
+      },
+      {
+        "name": {
+          "name": "finish_reason_endoftext",
+          "split": "test"
+        },
+        "count": 1,
+        "sum": 0,
+        "sum_squared": 0,
+        "min": 0,
+        "max": 0,
+        "mean": 0.0,
+        "variance": 0.0,
+        "stddev": 0.0
+      },
+      {
+        "name": {
+          "name": "finish_reason_unknown",
+          "split": "test"
+        },
+        "count": 1,
+        "sum": 1,
+        "sum_squared": 1,
+        "min": 1,
+        "max": 1,
+        "mean": 1.0,
+        "variance": 0.0,
+        "stddev": 0.0
+      },
+      {
+        "name": {
+          "name": "num_train_instances",
+          "split": "test"
+        },
+        "count": 1,
+        "sum": 5,
+        "sum_squared": 25,
+        "min": 5,
+        "max": 5,
+        "mean": 5.0,
+        "variance": 0.0,
+        "stddev": 0.0
+      },
+      {
+        "name": {
+          "name": "prompt_truncated",
+          "split": "test"
+        },
+        "count": 1,
+        "sum": 0,
+        "sum_squared": 0,
+        "min": 0,
+        "max": 0,
+        "mean": 0.0,
+        "variance": 0.0,
+        "stddev": 0.0
+      },
+      {
+        "name": {
+          "name": "max_prob",
+          "split": "test"
+        },
+        "count": 1,
+        "sum": 1.0,
+        "sum_squared": 1.0,
+        "min": 1.0,
+        "max": 1.0,
+        "mean": 1.0,
+        "variance": 0.0,
+        "stddev": 0.0
+      },
+      {
+        "name": {
+          "name": "exact_match",
+          "split": "test"
+        },
+        "count": 1,
+        "sum": 0,
+        "sum_squared": 0,
+        "min": 0,
+        "max": 0,
+        "mean": 0.0,
+        "variance": 0.0,
+        "stddev": 0.0
+      },
+      {
+        "name": {
+          "name": "exact_match@5",
+          "split": "test"
+        },
+        "count": 1,
+        "sum": 0,
+        "sum_squared": 0,
+        "min": 0,
+        "max": 0,
+        "mean": 0.0,
+        "variance": 0.0,
+        "stddev": 0.0
+      },
+      {
+        "name": {
+          "name": "quasi_exact_match",
+          "split": "test"
+        },
+        "count": 1,
+        "sum": 0,
+        "sum_squared": 0,
+        "min": 0,
+        "max": 0,
+        "mean": 0.0,
+        "variance": 0.0,
+        "stddev": 0.0
+      },
+      {
+        "name": {
+          "name": "quasi_exact_match@5",
+          "split": "test"
+        },
+        "count": 1,
+        "sum": 0,
+        "sum_squared": 0,
+        "min": 0,
+        "max": 0,
+        "mean": 0.0,
+        "variance": 0.0,
+        "stddev": 0.0
+      },
+      {
+        "name": {
+          "name": "prefix_exact_match",
+          "split": "test"
+        },
+        "count": 1,
+        "sum": 0,
+        "sum_squared": 0,
+        "min": 0,
+        "max": 0,
+        "mean": 0.0,
+        "variance": 0.0,
+        "stddev": 0.0
+      },
+      {
+        "name": {
+          "name": "prefix_exact_match@5",
+          "split": "test"
+        },
+        "count": 1,
+        "sum": 0,
+        "sum_squared": 0,
+        "min": 0,
+        "max": 0,
+        "mean": 0.0,
+        "variance": 0.0,
+        "stddev": 0.0
+      },
+      {
+        "name": {
+          "name": "quasi_prefix_exact_match",
+          "split": "test"
+        },
+        "count": 1,
+        "sum": 0,
+        "sum_squared": 0,
+        "min": 0,
+        "max": 0,
+        "mean": 0.0,
+        "variance": 0.0,
+        "stddev": 0.0
+      },
+      {
+        "name": {
+          "name": "quasi_prefix_exact_match@5",
+          "split": "test"
+        },
+        "count": 1,
+        "sum": 0,
+        "sum_squared": 0,
+        "min": 0,
+        "max": 0,
+        "mean": 0.0,
+        "variance": 0.0,
+        "stddev": 0.0
+      },
+      {
+        "name": {
+          "name": "logprob",
+          "split": "test"
+        },
+        "count": 1,
+        "sum": 0.0,
+        "sum_squared": 0.0,
+        "min": 0.0,
+        "max": 0.0,
+        "mean": 0.0,
+        "variance": 0.0,
+        "stddev": 0.0
+      },
+      {
+        "name": {
+          "name": "num_perplexity_tokens",
+          "split": "test"
+        },
+        "count": 1,
+        "sum": 1,
+        "sum_squared": 1,
+        "min": 1,
+        "max": 1,
+        "mean": 1.0,
+        "variance": 0.0,
+        "stddev": 0.0
+      },
+      {
+        "name": {
+          "name": "num_bytes",
+          "split": "test"
+        },
+        "count": 1,
+        "sum": 2,
+        "sum_squared": 4,
+        "min": 2,
+        "max": 2,
+        "mean": 2.0,
+        "variance": 0.0,
+        "stddev": 0.0
+      }
+    ]
+  },
+  {
+    "instance_id": "id59",
+    "train_trial_index": 0,
+    "stats": [
+      {
+        "name": {
+          "name": "num_references",
+          "split": "test"
+        },
+        "count": 1,
+        "sum": 4,
+        "sum_squared": 16,
+        "min": 4,
+        "max": 4,
+        "mean": 4.0,
+        "variance": 0.0,
+        "stddev": 0.0
+      },
+      {
+        "name": {
+          "name": "num_train_trials",
+          "split": "test"
+        },
+        "count": 1,
+        "sum": 1,
+        "sum_squared": 1,
+        "min": 1,
+        "max": 1,
+        "mean": 1.0,
+        "variance": 0.0,
+        "stddev": 0.0
+      },
+      {
+        "name": {
+          "name": "num_prompt_tokens",
+          "split": "test"
+        },
+        "count": 1,
+        "sum": 336,
+        "sum_squared": 112896,
+        "min": 336,
+        "max": 336,
+        "mean": 336.0,
+        "variance": 0.0,
+        "stddev": 0.0
+      },
+      {
+        "name": {
+          "name": "num_completion_tokens",
+          "split": "test"
+        },
+        "count": 1,
+        "sum": 1,
+        "sum_squared": 1,
+        "min": 1,
+        "max": 1,
+        "mean": 1.0,
+        "variance": 0.0,
+        "stddev": 0.0
+      },
+      {
+        "name": {
+          "name": "num_output_tokens",
+          "split": "test"
+        },
+        "count": 1,
+        "sum": 1,
+        "sum_squared": 1,
+        "min": 1,
+        "max": 1,
+        "mean": 1.0,
+        "variance": 0.0,
+        "stddev": 0.0
+      },
+      {
+        "name": {
+          "name": "training_co2_cost",
+          "split": "test"
+        },
+        "count": 0,
+        "sum": 0,
+        "sum_squared": 0
+      },
+      {
+        "name": {
+          "name": "training_energy_cost",
+          "split": "test"
+        },
+        "count": 0,
+        "sum": 0,
+        "sum_squared": 0
+      },
+      {
+        "name": {
+          "name": "inference_runtime",
+          "split": "test"
+        },
+        "count": 1,
+        "sum": 0.10651206970214844,
+        "sum_squared": 0.011344820992235327,
+        "min": 0.10651206970214844,
+        "max": 0.10651206970214844,
+        "mean": 0.10651206970214844,
+        "variance": 0.0,
+        "stddev": 0.0
+      },
+      {
+        "name": {
+          "name": "batch_size",
+          "split": "test"
+        },
+        "count": 1,
+        "sum": 1,
+        "sum_squared": 1,
+        "min": 1,
+        "max": 1,
+        "mean": 1.0,
+        "variance": 0.0,
+        "stddev": 0.0
+      },
+      {
+        "name": {
+          "name": "finish_reason_length",
+          "split": "test"
+        },
+        "count": 1,
+        "sum": 0,
+        "sum_squared": 0,
+        "min": 0,
+        "max": 0,
+        "mean": 0.0,
+        "variance": 0.0,
+        "stddev": 0.0
+      },
+      {
+        "name": {
+          "name": "finish_reason_stop",
+          "split": "test"
+        },
+        "count": 1,
+        "sum": 0,
+        "sum_squared": 0,
+        "min": 0,
+        "max": 0,
+        "mean": 0.0,
+        "variance": 0.0,
+        "stddev": 0.0
+      },
+      {
+        "name": {
+          "name": "finish_reason_endoftext",
+          "split": "test"
+        },
+        "count": 1,
+        "sum": 0,
+        "sum_squared": 0,
+        "min": 0,
+        "max": 0,
+        "mean": 0.0,
+        "variance": 0.0,
+        "stddev": 0.0
+      },
+      {
+        "name": {
+          "name": "finish_reason_unknown",
+          "split": "test"
+        },
+        "count": 1,
+        "sum": 1,
+        "sum_squared": 1,
+        "min": 1,
+        "max": 1,
+        "mean": 1.0,
+        "variance": 0.0,
+        "stddev": 0.0
+      },
+      {
+        "name": {
+          "name": "num_train_instances",
+          "split": "test"
+        },
+        "count": 1,
+        "sum": 5,
+        "sum_squared": 25,
+        "min": 5,
+        "max": 5,
+        "mean": 5.0,
+        "variance": 0.0,
+        "stddev": 0.0
+      },
+      {
+        "name": {
+          "name": "prompt_truncated",
+          "split": "test"
+        },
+        "count": 1,
+        "sum": 0,
+        "sum_squared": 0,
+        "min": 0,
+        "max": 0,
+        "mean": 0.0,
+        "variance": 0.0,
+        "stddev": 0.0
+      },
+      {
+        "name": {
+          "name": "max_prob",
+          "split": "test"
+        },
+        "count": 1,
+        "sum": 1.0,
+        "sum_squared": 1.0,
+        "min": 1.0,
+        "max": 1.0,
+        "mean": 1.0,
+        "variance": 0.0,
+        "stddev": 0.0
+      },
+      {
+        "name": {
+          "name": "exact_match",
+          "split": "test"
+        },
+        "count": 1,
+        "sum": 0,
+        "sum_squared": 0,
+        "min": 0,
+        "max": 0,
+        "mean": 0.0,
+        "variance": 0.0,
+        "stddev": 0.0
+      },
+      {
+        "name": {
+          "name": "exact_match@5",
+          "split": "test"
+        },
+        "count": 1,
+        "sum": 0,
+        "sum_squared": 0,
+        "min": 0,
+        "max": 0,
+        "mean": 0.0,
+        "variance": 0.0,
+        "stddev": 0.0
+      },
+      {
+        "name": {
+          "name": "quasi_exact_match",
+          "split": "test"
+        },
+        "count": 1,
+        "sum": 0,
+        "sum_squared": 0,
+        "min": 0,
+        "max": 0,
+        "mean": 0.0,
+        "variance": 0.0,
+        "stddev": 0.0
+      },
+      {
+        "name": {
+          "name": "quasi_exact_match@5",
+          "split": "test"
+        },
+        "count": 1,
+        "sum": 0,
+        "sum_squared": 0,
+        "min": 0,
+        "max": 0,
+        "mean": 0.0,
+        "variance": 0.0,
+        "stddev": 0.0
+      },
+      {
+        "name": {
+          "name": "prefix_exact_match",
+          "split": "test"
+        },
+        "count": 1,
+        "sum": 0,
+        "sum_squared": 0,
+        "min": 0,
+        "max": 0,
+        "mean": 0.0,
+        "variance": 0.0,
+        "stddev": 0.0
+      },
+      {
+        "name": {
+          "name": "prefix_exact_match@5",
+          "split": "test"
+        },
+        "count": 1,
+        "sum": 0,
+        "sum_squared": 0,
+        "min": 0,
+        "max": 0,
+        "mean": 0.0,
+        "variance": 0.0,
+        "stddev": 0.0
+      },
+      {
+        "name": {
+          "name": "quasi_prefix_exact_match",
+          "split": "test"
+        },
+        "count": 1,
+        "sum": 0,
+        "sum_squared": 0,
+        "min": 0,
+        "max": 0,
+        "mean": 0.0,
+        "variance": 0.0,
+        "stddev": 0.0
+      },
+      {
+        "name": {
+          "name": "quasi_prefix_exact_match@5",
+          "split": "test"
+        },
+        "count": 1,
+        "sum": 0,
+        "sum_squared": 0,
+        "min": 0,
+        "max": 0,
+        "mean": 0.0,
+        "variance": 0.0,
+        "stddev": 0.0
+      },
+      {
+        "name": {
+          "name": "logprob",
+          "split": "test"
+        },
+        "count": 1,
+        "sum": 0.0,
+        "sum_squared": 0.0,
+        "min": 0.0,
+        "max": 0.0,
+        "mean": 0.0,
+        "variance": 0.0,
+        "stddev": 0.0
+      },
+      {
+        "name": {
+          "name": "num_perplexity_tokens",
+          "split": "test"
+        },
+        "count": 1,
+        "sum": 1,
+        "sum_squared": 1,
+        "min": 1,
+        "max": 1,
+        "mean": 1.0,
+        "variance": 0.0,
+        "stddev": 0.0
+      },
+      {
+        "name": {
+          "name": "num_bytes",
+          "split": "test"
+        },
+        "count": 1,
+        "sum": 2,
+        "sum_squared": 4,
+        "min": 2,
+        "max": 2,
+        "mean": 2.0,
+        "variance": 0.0,
+        "stddev": 0.0
+      }
+    ]
+  },
+  {
+    "instance_id": "id291",
+    "train_trial_index": 0,
+    "stats": [
+      {
+        "name": {
+          "name": "num_references",
+          "split": "test"
+        },
+        "count": 1,
+        "sum": 4,
+        "sum_squared": 16,
+        "min": 4,
+        "max": 4,
+        "mean": 4.0,
+        "variance": 0.0,
+        "stddev": 0.0
+      },
+      {
+        "name": {
+          "name": "num_train_trials",
+          "split": "test"
+        },
+        "count": 1,
+        "sum": 1,
+        "sum_squared": 1,
+        "min": 1,
+        "max": 1,
+        "mean": 1.0,
+        "variance": 0.0,
+        "stddev": 0.0
+      },
+      {
+        "name": {
+          "name": "num_prompt_tokens",
+          "split": "test"
+        },
+        "count": 1,
+        "sum": 360,
+        "sum_squared": 129600,
+        "min": 360,
+        "max": 360,
+        "mean": 360.0,
+        "variance": 0.0,
+        "stddev": 0.0
+      },
+      {
+        "name": {
+          "name": "num_completion_tokens",
+          "split": "test"
+        },
+        "count": 1,
+        "sum": 1,
+        "sum_squared": 1,
+        "min": 1,
+        "max": 1,
+        "mean": 1.0,
+        "variance": 0.0,
+        "stddev": 0.0
+      },
+      {
+        "name": {
+          "name": "num_output_tokens",
+          "split": "test"
+        },
+        "count": 1,
+        "sum": 1,
+        "sum_squared": 1,
+        "min": 1,
+        "max": 1,
+        "mean": 1.0,
+        "variance": 0.0,
+        "stddev": 0.0
+      },
+      {
+        "name": {
+          "name": "training_co2_cost",
+          "split": "test"
+        },
+        "count": 0,
+        "sum": 0,
+        "sum_squared": 0
+      },
+      {
+        "name": {
+          "name": "training_energy_cost",
+          "split": "test"
+        },
+        "count": 0,
+        "sum": 0,
+        "sum_squared": 0
+      },
+      {
+        "name": {
+          "name": "inference_runtime",
+          "split": "test"
+        },
+        "count": 1,
+        "sum": 0.11817407608032227,
+        "sum_squared": 0.013965112257437795,
+        "min": 0.11817407608032227,
+        "max": 0.11817407608032227,
+        "mean": 0.11817407608032227,
+        "variance": 0.0,
+        "stddev": 0.0
+      },
+      {
+        "name": {
+          "name": "batch_size",
+          "split": "test"
+        },
+        "count": 1,
+        "sum": 1,
+        "sum_squared": 1,
+        "min": 1,
+        "max": 1,
+        "mean": 1.0,
+        "variance": 0.0,
+        "stddev": 0.0
+      },
+      {
+        "name": {
+          "name": "finish_reason_length",
+          "split": "test"
+        },
+        "count": 1,
+        "sum": 0,
+        "sum_squared": 0,
+        "min": 0,
+        "max": 0,
+        "mean": 0.0,
+        "variance": 0.0,
+        "stddev": 0.0
+      },
+      {
+        "name": {
+          "name": "finish_reason_stop",
+          "split": "test"
+        },
+        "count": 1,
+        "sum": 0,
+        "sum_squared": 0,
+        "min": 0,
+        "max": 0,
+        "mean": 0.0,
+        "variance": 0.0,
+        "stddev": 0.0
+      },
+      {
+        "name": {
+          "name": "finish_reason_endoftext",
+          "split": "test"
+        },
+        "count": 1,
+        "sum": 0,
+        "sum_squared": 0,
+        "min": 0,
+        "max": 0,
+        "mean": 0.0,
+        "variance": 0.0,
+        "stddev": 0.0
+      },
+      {
+        "name": {
+          "name": "finish_reason_unknown",
+          "split": "test"
+        },
+        "count": 1,
+        "sum": 1,
+        "sum_squared": 1,
+        "min": 1,
+        "max": 1,
+        "mean": 1.0,
+        "variance": 0.0,
+        "stddev": 0.0
+      },
+      {
+        "name": {
+          "name": "num_train_instances",
+          "split": "test"
+        },
+        "count": 1,
+        "sum": 5,
+        "sum_squared": 25,
+        "min": 5,
+        "max": 5,
+        "mean": 5.0,
+        "variance": 0.0,
+        "stddev": 0.0
+      },
+      {
+        "name": {
+          "name": "prompt_truncated",
+          "split": "test"
+        },
+        "count": 1,
+        "sum": 0,
+        "sum_squared": 0,
+        "min": 0,
+        "max": 0,
+        "mean": 0.0,
+        "variance": 0.0,
+        "stddev": 0.0
+      },
+      {
+        "name": {
+          "name": "max_prob",
+          "split": "test"
+        },
+        "count": 1,
+        "sum": 1.0,
+        "sum_squared": 1.0,
+        "min": 1.0,
+        "max": 1.0,
+        "mean": 1.0,
+        "variance": 0.0,
+        "stddev": 0.0
+      },
+      {
+        "name": {
+          "name": "exact_match",
+          "split": "test"
+        },
+        "count": 1,
+        "sum": 0,
+        "sum_squared": 0,
+        "min": 0,
+        "max": 0,
+        "mean": 0.0,
+        "variance": 0.0,
+        "stddev": 0.0
+      },
+      {
+        "name": {
+          "name": "exact_match@5",
+          "split": "test"
+        },
+        "count": 1,
+        "sum": 0,
+        "sum_squared": 0,
+        "min": 0,
+        "max": 0,
+        "mean": 0.0,
+        "variance": 0.0,
+        "stddev": 0.0
+      },
+      {
+        "name": {
+          "name": "quasi_exact_match",
+          "split": "test"
+        },
+        "count": 1,
+        "sum": 0,
+        "sum_squared": 0,
+        "min": 0,
+        "max": 0,
+        "mean": 0.0,
+        "variance": 0.0,
+        "stddev": 0.0
+      },
+      {
+        "name": {
+          "name": "quasi_exact_match@5",
+          "split": "test"
+        },
+        "count": 1,
+        "sum": 0,
+        "sum_squared": 0,
+        "min": 0,
+        "max": 0,
+        "mean": 0.0,
+        "variance": 0.0,
+        "stddev": 0.0
+      },
+      {
+        "name": {
+          "name": "prefix_exact_match",
+          "split": "test"
+        },
+        "count": 1,
+        "sum": 0,
+        "sum_squared": 0,
+        "min": 0,
+        "max": 0,
+        "mean": 0.0,
+        "variance": 0.0,
+        "stddev": 0.0
+      },
+      {
+        "name": {
+          "name": "prefix_exact_match@5",
+          "split": "test"
+        },
+        "count": 1,
+        "sum": 0,
+        "sum_squared": 0,
+        "min": 0,
+        "max": 0,
+        "mean": 0.0,
+        "variance": 0.0,
+        "stddev": 0.0
+      },
+      {
+        "name": {
+          "name": "quasi_prefix_exact_match",
+          "split": "test"
+        },
+        "count": 1,
+        "sum": 0,
+        "sum_squared": 0,
+        "min": 0,
+        "max": 0,
+        "mean": 0.0,
+        "variance": 0.0,
+        "stddev": 0.0
+      },
+      {
+        "name": {
+          "name": "quasi_prefix_exact_match@5",
+          "split": "test"
+        },
+        "count": 1,
+        "sum": 0,
+        "sum_squared": 0,
+        "min": 0,
+        "max": 0,
+        "mean": 0.0,
+        "variance": 0.0,
+        "stddev": 0.0
+      },
+      {
+        "name": {
+          "name": "logprob",
+          "split": "test"
+        },
+        "count": 1,
+        "sum": 0.0,
+        "sum_squared": 0.0,
+        "min": 0.0,
+        "max": 0.0,
+        "mean": 0.0,
+        "variance": 0.0,
+        "stddev": 0.0
+      },
+      {
+        "name": {
+          "name": "num_perplexity_tokens",
+          "split": "test"
+        },
+        "count": 1,
+        "sum": 1,
+        "sum_squared": 1,
+        "min": 1,
+        "max": 1,
+        "mean": 1.0,
+        "variance": 0.0,
+        "stddev": 0.0
+      },
+      {
+        "name": {
+          "name": "num_bytes",
+          "split": "test"
+        },
+        "count": 1,
+        "sum": 2,
+        "sum_squared": 4,
+        "min": 2,
+        "max": 2,
+        "mean": 2.0,
+        "variance": 0.0,
+        "stddev": 0.0
+      }
+    ]
+  },
+  {
+    "instance_id": "id131",
+    "train_trial_index": 0,
+    "stats": [
+      {
+        "name": {
+          "name": "num_references",
+          "split": "test"
+        },
+        "count": 1,
+        "sum": 4,
+        "sum_squared": 16,
+        "min": 4,
+        "max": 4,
+        "mean": 4.0,
+        "variance": 0.0,
+        "stddev": 0.0
+      },
+      {
+        "name": {
+          "name": "num_train_trials",
+          "split": "test"
+        },
+        "count": 1,
+        "sum": 1,
+        "sum_squared": 1,
+        "min": 1,
+        "max": 1,
+        "mean": 1.0,
+        "variance": 0.0,
+        "stddev": 0.0
+      },
+      {
+        "name": {
+          "name": "num_prompt_tokens",
+          "split": "test"
+        },
+        "count": 1,
+        "sum": 336,
+        "sum_squared": 112896,
+        "min": 336,
+        "max": 336,
+        "mean": 336.0,
+        "variance": 0.0,
+        "stddev": 0.0
+      },
+      {
+        "name": {
+          "name": "num_completion_tokens",
+          "split": "test"
+        },
+        "count": 1,
+        "sum": 1,
+        "sum_squared": 1,
+        "min": 1,
+        "max": 1,
+        "mean": 1.0,
+        "variance": 0.0,
+        "stddev": 0.0
+      },
+      {
+        "name": {
+          "name": "num_output_tokens",
+          "split": "test"
+        },
+        "count": 1,
+        "sum": 1,
+        "sum_squared": 1,
+        "min": 1,
+        "max": 1,
+        "mean": 1.0,
+        "variance": 0.0,
+        "stddev": 0.0
+      },
+      {
+        "name": {
+          "name": "training_co2_cost",
+          "split": "test"
+        },
+        "count": 0,
+        "sum": 0,
+        "sum_squared": 0
+      },
+      {
+        "name": {
+          "name": "training_energy_cost",
+          "split": "test"
+        },
+        "count": 0,
+        "sum": 0,
+        "sum_squared": 0
+      },
+      {
+        "name": {
+          "name": "inference_runtime",
+          "split": "test"
+        },
+        "count": 1,
+        "sum": 0.11505007743835449,
+        "sum_squared": 0.013236520318571365,
+        "min": 0.11505007743835449,
+        "max": 0.11505007743835449,
+        "mean": 0.11505007743835449,
+        "variance": 0.0,
+        "stddev": 0.0
+      },
+      {
+        "name": {
+          "name": "batch_size",
+          "split": "test"
+        },
+        "count": 1,
+        "sum": 1,
+        "sum_squared": 1,
+        "min": 1,
+        "max": 1,
+        "mean": 1.0,
+        "variance": 0.0,
+        "stddev": 0.0
+      },
+      {
+        "name": {
+          "name": "finish_reason_length",
+          "split": "test"
+        },
+        "count": 1,
+        "sum": 0,
+        "sum_squared": 0,
+        "min": 0,
+        "max": 0,
+        "mean": 0.0,
+        "variance": 0.0,
+        "stddev": 0.0
+      },
+      {
+        "name": {
+          "name": "finish_reason_stop",
+          "split": "test"
+        },
+        "count": 1,
+        "sum": 0,
+        "sum_squared": 0,
+        "min": 0,
+        "max": 0,
+        "mean": 0.0,
+        "variance": 0.0,
+        "stddev": 0.0
+      },
+      {
+        "name": {
+          "name": "finish_reason_endoftext",
+          "split": "test"
+        },
+        "count": 1,
+        "sum": 0,
+        "sum_squared": 0,
+        "min": 0,
+        "max": 0,
+        "mean": 0.0,
+        "variance": 0.0,
+        "stddev": 0.0
+      },
+      {
+        "name": {
+          "name": "finish_reason_unknown",
+          "split": "test"
+        },
+        "count": 1,
+        "sum": 1,
+        "sum_squared": 1,
+        "min": 1,
+        "max": 1,
+        "mean": 1.0,
+        "variance": 0.0,
+        "stddev": 0.0
+      },
+      {
+        "name": {
+          "name": "num_train_instances",
+          "split": "test"
+        },
+        "count": 1,
+        "sum": 5,
+        "sum_squared": 25,
+        "min": 5,
+        "max": 5,
+        "mean": 5.0,
+        "variance": 0.0,
+        "stddev": 0.0
+      },
+      {
+        "name": {
+          "name": "prompt_truncated",
+          "split": "test"
+        },
+        "count": 1,
+        "sum": 0,
+        "sum_squared": 0,
+        "min": 0,
+        "max": 0,
+        "mean": 0.0,
+        "variance": 0.0,
+        "stddev": 0.0
+      },
+      {
+        "name": {
+          "name": "max_prob",
+          "split": "test"
+        },
+        "count": 1,
+        "sum": 1.0,
+        "sum_squared": 1.0,
+        "min": 1.0,
+        "max": 1.0,
+        "mean": 1.0,
+        "variance": 0.0,
+        "stddev": 0.0
+      },
+      {
+        "name": {
+          "name": "exact_match",
+          "split": "test"
+        },
+        "count": 1,
+        "sum": 0,
+        "sum_squared": 0,
+        "min": 0,
+        "max": 0,
+        "mean": 0.0,
+        "variance": 0.0,
+        "stddev": 0.0
+      },
+      {
+        "name": {
+          "name": "exact_match@5",
+          "split": "test"
+        },
+        "count": 1,
+        "sum": 0,
+        "sum_squared": 0,
+        "min": 0,
+        "max": 0,
+        "mean": 0.0,
+        "variance": 0.0,
+        "stddev": 0.0
+      },
+      {
+        "name": {
+          "name": "quasi_exact_match",
+          "split": "test"
+        },
+        "count": 1,
+        "sum": 0,
+        "sum_squared": 0,
+        "min": 0,
+        "max": 0,
+        "mean": 0.0,
+        "variance": 0.0,
+        "stddev": 0.0
+      },
+      {
+        "name": {
+          "name": "quasi_exact_match@5",
+          "split": "test"
+        },
+        "count": 1,
+        "sum": 0,
+        "sum_squared": 0,
+        "min": 0,
+        "max": 0,
+        "mean": 0.0,
+        "variance": 0.0,
+        "stddev": 0.0
+      },
+      {
+        "name": {
+          "name": "prefix_exact_match",
+          "split": "test"
+        },
+        "count": 1,
+        "sum": 0,
+        "sum_squared": 0,
+        "min": 0,
+        "max": 0,
+        "mean": 0.0,
+        "variance": 0.0,
+        "stddev": 0.0
+      },
+      {
+        "name": {
+          "name": "prefix_exact_match@5",
+          "split": "test"
+        },
+        "count": 1,
+        "sum": 0,
+        "sum_squared": 0,
+        "min": 0,
+        "max": 0,
+        "mean": 0.0,
+        "variance": 0.0,
+        "stddev": 0.0
+      },
+      {
+        "name": {
+          "name": "quasi_prefix_exact_match",
+          "split": "test"
+        },
+        "count": 1,
+        "sum": 0,
+        "sum_squared": 0,
+        "min": 0,
+        "max": 0,
+        "mean": 0.0,
+        "variance": 0.0,
+        "stddev": 0.0
+      },
+      {
+        "name": {
+          "name": "quasi_prefix_exact_match@5",
+          "split": "test"
+        },
+        "count": 1,
+        "sum": 0,
+        "sum_squared": 0,
+        "min": 0,
+        "max": 0,
+        "mean": 0.0,
+        "variance": 0.0,
+        "stddev": 0.0
+      },
+      {
+        "name": {
+          "name": "logprob",
+          "split": "test"
+        },
+        "count": 1,
+        "sum": 0.0,
+        "sum_squared": 0.0,
+        "min": 0.0,
+        "max": 0.0,
+        "mean": 0.0,
+        "variance": 0.0,
+        "stddev": 0.0
+      },
+      {
+        "name": {
+          "name": "num_perplexity_tokens",
+          "split": "test"
+        },
+        "count": 1,
+        "sum": 1,
+        "sum_squared": 1,
+        "min": 1,
+        "max": 1,
+        "mean": 1.0,
+        "variance": 0.0,
+        "stddev": 0.0
+      },
+      {
+        "name": {
+          "name": "num_bytes",
+          "split": "test"
+        },
+        "count": 1,
+        "sum": 2,
+        "sum_squared": 4,
+        "min": 2,
+        "max": 2,
+        "mean": 2.0,
+        "variance": 0.0,
+        "stddev": 0.0
+      }
+    ]
+  },
+  {
+    "instance_id": "id222",
+    "train_trial_index": 0,
+    "stats": [
+      {
+        "name": {
+          "name": "num_references",
+          "split": "test"
+        },
+        "count": 1,
+        "sum": 4,
+        "sum_squared": 16,
+        "min": 4,
+        "max": 4,
+        "mean": 4.0,
+        "variance": 0.0,
+        "stddev": 0.0
+      },
+      {
+        "name": {
+          "name": "num_train_trials",
+          "split": "test"
+        },
+        "count": 1,
+        "sum": 1,
+        "sum_squared": 1,
+        "min": 1,
+        "max": 1,
+        "mean": 1.0,
+        "variance": 0.0,
+        "stddev": 0.0
+      },
+      {
+        "name": {
+          "name": "num_prompt_tokens",
+          "split": "test"
+        },
+        "count": 1,
+        "sum": 342,
+        "sum_squared": 116964,
+        "min": 342,
+        "max": 342,
+        "mean": 342.0,
+        "variance": 0.0,
+        "stddev": 0.0
+      },
+      {
+        "name": {
+          "name": "num_completion_tokens",
+          "split": "test"
+        },
+        "count": 1,
+        "sum": 1,
+        "sum_squared": 1,
+        "min": 1,
+        "max": 1,
+        "mean": 1.0,
+        "variance": 0.0,
+        "stddev": 0.0
+      },
+      {
+        "name": {
+          "name": "num_output_tokens",
+          "split": "test"
+        },
+        "count": 1,
+        "sum": 1,
+        "sum_squared": 1,
+        "min": 1,
+        "max": 1,
+        "mean": 1.0,
+        "variance": 0.0,
+        "stddev": 0.0
+      },
+      {
+        "name": {
+          "name": "training_co2_cost",
+          "split": "test"
+        },
+        "count": 0,
+        "sum": 0,
+        "sum_squared": 0
+      },
+      {
+        "name": {
+          "name": "training_energy_cost",
+          "split": "test"
+        },
+        "count": 0,
+        "sum": 0,
+        "sum_squared": 0
+      },
+      {
+        "name": {
+          "name": "inference_runtime",
+          "split": "test"
+        },
+        "count": 1,
+        "sum": 0.11363410949707031,
+        "sum_squared": 0.012912710841192165,
+        "min": 0.11363410949707031,
+        "max": 0.11363410949707031,
+        "mean": 0.11363410949707031,
+        "variance": 0.0,
+        "stddev": 0.0
+      },
+      {
+        "name": {
+          "name": "batch_size",
+          "split": "test"
+        },
+        "count": 1,
+        "sum": 1,
+        "sum_squared": 1,
+        "min": 1,
+        "max": 1,
+        "mean": 1.0,
+        "variance": 0.0,
+        "stddev": 0.0
+      },
+      {
+        "name": {
+          "name": "finish_reason_length",
+          "split": "test"
+        },
+        "count": 1,
+        "sum": 0,
+        "sum_squared": 0,
+        "min": 0,
+        "max": 0,
+        "mean": 0.0,
+        "variance": 0.0,
+        "stddev": 0.0
+      },
+      {
+        "name": {
+          "name": "finish_reason_stop",
+          "split": "test"
+        },
+        "count": 1,
+        "sum": 0,
+        "sum_squared": 0,
+        "min": 0,
+        "max": 0,
+        "mean": 0.0,
+        "variance": 0.0,
+        "stddev": 0.0
+      },
+      {
+        "name": {
+          "name": "finish_reason_endoftext",
+          "split": "test"
+        },
+        "count": 1,
+        "sum": 0,
+        "sum_squared": 0,
+        "min": 0,
+        "max": 0,
+        "mean": 0.0,
+        "variance": 0.0,
+        "stddev": 0.0
+      },
+      {
+        "name": {
+          "name": "finish_reason_unknown",
+          "split": "test"
+        },
+        "count": 1,
+        "sum": 1,
+        "sum_squared": 1,
+        "min": 1,
+        "max": 1,
+        "mean": 1.0,
+        "variance": 0.0,
+        "stddev": 0.0
+      },
+      {
+        "name": {
+          "name": "num_train_instances",
+          "split": "test"
+        },
+        "count": 1,
+        "sum": 5,
+        "sum_squared": 25,
+        "min": 5,
+        "max": 5,
+        "mean": 5.0,
+        "variance": 0.0,
+        "stddev": 0.0
+      },
+      {
+        "name": {
+          "name": "prompt_truncated",
+          "split": "test"
+        },
+        "count": 1,
+        "sum": 0,
+        "sum_squared": 0,
+        "min": 0,
+        "max": 0,
+        "mean": 0.0,
+        "variance": 0.0,
+        "stddev": 0.0
+      },
+      {
+        "name": {
+          "name": "max_prob",
+          "split": "test"
+        },
+        "count": 1,
+        "sum": 1.0,
+        "sum_squared": 1.0,
+        "min": 1.0,
+        "max": 1.0,
+        "mean": 1.0,
+        "variance": 0.0,
+        "stddev": 0.0
+      },
+      {
+        "name": {
+          "name": "exact_match",
+          "split": "test"
+        },
+        "count": 1,
+        "sum": 1,
+        "sum_squared": 1,
+        "min": 1,
+        "max": 1,
+        "mean": 1.0,
+        "variance": 0.0,
+        "stddev": 0.0
+      },
+      {
+        "name": {
+          "name": "exact_match@5",
+          "split": "test"
+        },
+        "count": 1,
+        "sum": 1,
+        "sum_squared": 1,
+        "min": 1,
+        "max": 1,
+        "mean": 1.0,
+        "variance": 0.0,
+        "stddev": 0.0
+      },
+      {
+        "name": {
+          "name": "quasi_exact_match",
+          "split": "test"
+        },
+        "count": 1,
+        "sum": 1,
+        "sum_squared": 1,
+        "min": 1,
+        "max": 1,
+        "mean": 1.0,
+        "variance": 0.0,
+        "stddev": 0.0
+      },
+      {
+        "name": {
+          "name": "quasi_exact_match@5",
+          "split": "test"
+        },
+        "count": 1,
+        "sum": 1,
+        "sum_squared": 1,
+        "min": 1,
+        "max": 1,
+        "mean": 1.0,
+        "variance": 0.0,
+        "stddev": 0.0
+      },
+      {
+        "name": {
+          "name": "prefix_exact_match",
+          "split": "test"
+        },
+        "count": 1,
+        "sum": 1,
+        "sum_squared": 1,
+        "min": 1,
+        "max": 1,
+        "mean": 1.0,
+        "variance": 0.0,
+        "stddev": 0.0
+      },
+      {
+        "name": {
+          "name": "prefix_exact_match@5",
+          "split": "test"
+        },
+        "count": 1,
+        "sum": 1,
+        "sum_squared": 1,
+        "min": 1,
+        "max": 1,
+        "mean": 1.0,
+        "variance": 0.0,
+        "stddev": 0.0
+      },
+      {
+        "name": {
+          "name": "quasi_prefix_exact_match",
+          "split": "test"
+        },
+        "count": 1,
+        "sum": 1,
+        "sum_squared": 1,
+        "min": 1,
+        "max": 1,
+        "mean": 1.0,
+        "variance": 0.0,
+        "stddev": 0.0
+      },
+      {
+        "name": {
+          "name": "quasi_prefix_exact_match@5",
+          "split": "test"
+        },
+        "count": 1,
+        "sum": 1,
+        "sum_squared": 1,
+        "min": 1,
+        "max": 1,
+        "mean": 1.0,
+        "variance": 0.0,
+        "stddev": 0.0
+      },
+      {
+        "name": {
+          "name": "logprob",
+          "split": "test"
+        },
+        "count": 1,
+        "sum": 0.0,
+        "sum_squared": 0.0,
+        "min": 0.0,
+        "max": 0.0,
+        "mean": 0.0,
+        "variance": 0.0,
+        "stddev": 0.0
+      },
+      {
+        "name": {
+          "name": "num_perplexity_tokens",
+          "split": "test"
+        },
+        "count": 1,
+        "sum": 1,
+        "sum_squared": 1,
+        "min": 1,
+        "max": 1,
+        "mean": 1.0,
+        "variance": 0.0,
+        "stddev": 0.0
+      },
+      {
+        "name": {
+          "name": "num_bytes",
+          "split": "test"
+        },
+        "count": 1,
+        "sum": 2,
+        "sum_squared": 4,
+        "min": 2,
+        "max": 2,
+        "mean": 2.0,
+        "variance": 0.0,
+        "stddev": 0.0
+      }
+    ]
+  },
+  {
+    "instance_id": "id259",
+    "train_trial_index": 0,
+    "stats": [
+      {
+        "name": {
+          "name": "num_references",
+          "split": "test"
+        },
+        "count": 1,
+        "sum": 4,
+        "sum_squared": 16,
+        "min": 4,
+        "max": 4,
+        "mean": 4.0,
+        "variance": 0.0,
+        "stddev": 0.0
+      },
+      {
+        "name": {
+          "name": "num_train_trials",
+          "split": "test"
+        },
+        "count": 1,
+        "sum": 1,
+        "sum_squared": 1,
+        "min": 1,
+        "max": 1,
+        "mean": 1.0,
+        "variance": 0.0,
+        "stddev": 0.0
+      },
+      {
+        "name": {
+          "name": "num_prompt_tokens",
+          "split": "test"
+        },
+        "count": 1,
+        "sum": 337,
+        "sum_squared": 113569,
+        "min": 337,
+        "max": 337,
+        "mean": 337.0,
+        "variance": 0.0,
+        "stddev": 0.0
+      },
+      {
+        "name": {
+          "name": "num_completion_tokens",
+          "split": "test"
+        },
+        "count": 1,
+        "sum": 1,
+        "sum_squared": 1,
+        "min": 1,
+        "max": 1,
+        "mean": 1.0,
+        "variance": 0.0,
+        "stddev": 0.0
+      },
+      {
+        "name": {
+          "name": "num_output_tokens",
+          "split": "test"
+        },
+        "count": 1,
+        "sum": 1,
+        "sum_squared": 1,
+        "min": 1,
+        "max": 1,
+        "mean": 1.0,
+        "variance": 0.0,
+        "stddev": 0.0
+      },
+      {
+        "name": {
+          "name": "training_co2_cost",
+          "split": "test"
+        },
+        "count": 0,
+        "sum": 0,
+        "sum_squared": 0
+      },
+      {
+        "name": {
+          "name": "training_energy_cost",
+          "split": "test"
+        },
+        "count": 0,
+        "sum": 0,
+        "sum_squared": 0
+      },
+      {
+        "name": {
+          "name": "inference_runtime",
+          "split": "test"
+        },
+        "count": 1,
+        "sum": 0.0825653076171875,
+        "sum_squared": 0.0068170300219208,
+        "min": 0.0825653076171875,
+        "max": 0.0825653076171875,
+        "mean": 0.0825653076171875,
+        "variance": 0.0,
+        "stddev": 0.0
+      },
+      {
+        "name": {
+          "name": "batch_size",
+          "split": "test"
+        },
+        "count": 1,
+        "sum": 1,
+        "sum_squared": 1,
+        "min": 1,
+        "max": 1,
+        "mean": 1.0,
+        "variance": 0.0,
+        "stddev": 0.0
+      },
+      {
+        "name": {
+          "name": "finish_reason_length",
+          "split": "test"
+        },
+        "count": 1,
+        "sum": 0,
+        "sum_squared": 0,
+        "min": 0,
+        "max": 0,
+        "mean": 0.0,
+        "variance": 0.0,
+        "stddev": 0.0
+      },
+      {
+        "name": {
+          "name": "finish_reason_stop",
+          "split": "test"
+        },
+        "count": 1,
+        "sum": 0,
+        "sum_squared": 0,
+        "min": 0,
+        "max": 0,
+        "mean": 0.0,
+        "variance": 0.0,
+        "stddev": 0.0
+      },
+      {
+        "name": {
+          "name": "finish_reason_endoftext",
+          "split": "test"
+        },
+        "count": 1,
+        "sum": 0,
+        "sum_squared": 0,
+        "min": 0,
+        "max": 0,
+        "mean": 0.0,
+        "variance": 0.0,
+        "stddev": 0.0
+      },
+      {
+        "name": {
+          "name": "finish_reason_unknown",
+          "split": "test"
+        },
+        "count": 1,
+        "sum": 1,
+        "sum_squared": 1,
+        "min": 1,
+        "max": 1,
+        "mean": 1.0,
+        "variance": 0.0,
+        "stddev": 0.0
+      },
+      {
+        "name": {
+          "name": "num_train_instances",
+          "split": "test"
+        },
+        "count": 1,
+        "sum": 5,
+        "sum_squared": 25,
+        "min": 5,
+        "max": 5,
+        "mean": 5.0,
+        "variance": 0.0,
+        "stddev": 0.0
+      },
+      {
+        "name": {
+          "name": "prompt_truncated",
+          "split": "test"
+        },
+        "count": 1,
+        "sum": 0,
+        "sum_squared": 0,
+        "min": 0,
+        "max": 0,
+        "mean": 0.0,
+        "variance": 0.0,
+        "stddev": 0.0
+      },
+      {
+        "name": {
+          "name": "max_prob",
+          "split": "test"
+        },
+        "count": 1,
+        "sum": 1.0,
+        "sum_squared": 1.0,
+        "min": 1.0,
+        "max": 1.0,
+        "mean": 1.0,
+        "variance": 0.0,
+        "stddev": 0.0
+      },
+      {
+        "name": {
+          "name": "exact_match",
+          "split": "test"
+        },
+        "count": 1,
+        "sum": 0,
+        "sum_squared": 0,
+        "min": 0,
+        "max": 0,
+        "mean": 0.0,
+        "variance": 0.0,
+        "stddev": 0.0
+      },
+      {
+        "name": {
+          "name": "exact_match@5",
+          "split": "test"
+        },
+        "count": 1,
+        "sum": 0,
+        "sum_squared": 0,
+        "min": 0,
+        "max": 0,
+        "mean": 0.0,
+        "variance": 0.0,
+        "stddev": 0.0
+      },
+      {
+        "name": {
+          "name": "quasi_exact_match",
+          "split": "test"
+        },
+        "count": 1,
+        "sum": 0,
+        "sum_squared": 0,
+        "min": 0,
+        "max": 0,
+        "mean": 0.0,
+        "variance": 0.0,
+        "stddev": 0.0
+      },
+      {
+        "name": {
+          "name": "quasi_exact_match@5",
+          "split": "test"
+        },
+        "count": 1,
+        "sum": 0,
+        "sum_squared": 0,
+        "min": 0,
+        "max": 0,
+        "mean": 0.0,
+        "variance": 0.0,
+        "stddev": 0.0
+      },
+      {
+        "name": {
+          "name": "prefix_exact_match",
+          "split": "test"
+        },
+        "count": 1,
+        "sum": 0,
+        "sum_squared": 0,
+        "min": 0,
+        "max": 0,
+        "mean": 0.0,
+        "variance": 0.0,
+        "stddev": 0.0
+      },
+      {
+        "name": {
+          "name": "prefix_exact_match@5",
+          "split": "test"
+        },
+        "count": 1,
+        "sum": 0,
+        "sum_squared": 0,
+        "min": 0,
+        "max": 0,
+        "mean": 0.0,
+        "variance": 0.0,
+        "stddev": 0.0
+      },
+      {
+        "name": {
+          "name": "quasi_prefix_exact_match",
+          "split": "test"
+        },
+        "count": 1,
+        "sum": 0,
+        "sum_squared": 0,
+        "min": 0,
+        "max": 0,
+        "mean": 0.0,
+        "variance": 0.0,
+        "stddev": 0.0
+      },
+      {
+        "name": {
+          "name": "quasi_prefix_exact_match@5",
+          "split": "test"
+        },
+        "count": 1,
+        "sum": 0,
+        "sum_squared": 0,
+        "min": 0,
+        "max": 0,
+        "mean": 0.0,
+        "variance": 0.0,
+        "stddev": 0.0
+      },
+      {
+        "name": {
+          "name": "logprob",
+          "split": "test"
+        },
+        "count": 1,
+        "sum": 0.0,
+        "sum_squared": 0.0,
+        "min": 0.0,
+        "max": 0.0,
+        "mean": 0.0,
+        "variance": 0.0,
+        "stddev": 0.0
+      },
+      {
+        "name": {
+          "name": "num_perplexity_tokens",
+          "split": "test"
+        },
+        "count": 1,
+        "sum": 1,
+        "sum_squared": 1,
+        "min": 1,
+        "max": 1,
+        "mean": 1.0,
+        "variance": 0.0,
+        "stddev": 0.0
+      },
+      {
+        "name": {
+          "name": "num_bytes",
+          "split": "test"
+        },
+        "count": 1,
+        "sum": 2,
+        "sum_squared": 4,
+        "min": 2,
+        "max": 2,
+        "mean": 2.0,
+        "variance": 0.0,
+        "stddev": 0.0
+      }
+    ]
+  },
+  {
+    "instance_id": "id105",
+    "train_trial_index": 0,
+    "stats": [
+      {
+        "name": {
+          "name": "num_references",
+          "split": "test"
+        },
+        "count": 1,
+        "sum": 4,
+        "sum_squared": 16,
+        "min": 4,
+        "max": 4,
+        "mean": 4.0,
+        "variance": 0.0,
+        "stddev": 0.0
+      },
+      {
+        "name": {
+          "name": "num_train_trials",
+          "split": "test"
+        },
+        "count": 1,
+        "sum": 1,
+        "sum_squared": 1,
+        "min": 1,
+        "max": 1,
+        "mean": 1.0,
+        "variance": 0.0,
+        "stddev": 0.0
+      },
+      {
+        "name": {
+          "name": "num_prompt_tokens",
+          "split": "test"
+        },
+        "count": 1,
+        "sum": 327,
+        "sum_squared": 106929,
+        "min": 327,
+        "max": 327,
+        "mean": 327.0,
+        "variance": 0.0,
+        "stddev": 0.0
+      },
+      {
+        "name": {
+          "name": "num_completion_tokens",
+          "split": "test"
+        },
+        "count": 1,
+        "sum": 1,
+        "sum_squared": 1,
+        "min": 1,
+        "max": 1,
+        "mean": 1.0,
+        "variance": 0.0,
+        "stddev": 0.0
+      },
+      {
+        "name": {
+          "name": "num_output_tokens",
+          "split": "test"
+        },
+        "count": 1,
+        "sum": 1,
+        "sum_squared": 1,
+        "min": 1,
+        "max": 1,
+        "mean": 1.0,
+        "variance": 0.0,
+        "stddev": 0.0
+      },
+      {
+        "name": {
+          "name": "training_co2_cost",
+          "split": "test"
+        },
+        "count": 0,
+        "sum": 0,
+        "sum_squared": 0
+      },
+      {
+        "name": {
+          "name": "training_energy_cost",
+          "split": "test"
+        },
+        "count": 0,
+        "sum": 0,
+        "sum_squared": 0
+      },
+      {
+        "name": {
+          "name": "inference_runtime",
+          "split": "test"
+        },
+        "count": 1,
+        "sum": 0.07625007629394531,
+        "sum_squared": 0.005814074134832481,
+        "min": 0.07625007629394531,
+        "max": 0.07625007629394531,
+        "mean": 0.07625007629394531,
+        "variance": 0.0,
+        "stddev": 0.0
+      },
+      {
+        "name": {
+          "name": "batch_size",
+          "split": "test"
+        },
+        "count": 1,
+        "sum": 1,
+        "sum_squared": 1,
+        "min": 1,
+        "max": 1,
+        "mean": 1.0,
+        "variance": 0.0,
+        "stddev": 0.0
+      },
+      {
+        "name": {
+          "name": "finish_reason_length",
+          "split": "test"
+        },
+        "count": 1,
+        "sum": 0,
+        "sum_squared": 0,
+        "min": 0,
+        "max": 0,
+        "mean": 0.0,
+        "variance": 0.0,
+        "stddev": 0.0
+      },
+      {
+        "name": {
+          "name": "finish_reason_stop",
+          "split": "test"
+        },
+        "count": 1,
+        "sum": 0,
+        "sum_squared": 0,
+        "min": 0,
+        "max": 0,
+        "mean": 0.0,
+        "variance": 0.0,
+        "stddev": 0.0
+      },
+      {
+        "name": {
+          "name": "finish_reason_endoftext",
+          "split": "test"
+        },
+        "count": 1,
+        "sum": 0,
+        "sum_squared": 0,
+        "min": 0,
+        "max": 0,
+        "mean": 0.0,
+        "variance": 0.0,
+        "stddev": 0.0
+      },
+      {
+        "name": {
+          "name": "finish_reason_unknown",
+          "split": "test"
+        },
+        "count": 1,
+        "sum": 1,
+        "sum_squared": 1,
+        "min": 1,
+        "max": 1,
+        "mean": 1.0,
+        "variance": 0.0,
+        "stddev": 0.0
+      },
+      {
+        "name": {
+          "name": "num_train_instances",
+          "split": "test"
+        },
+        "count": 1,
+        "sum": 5,
+        "sum_squared": 25,
+        "min": 5,
+        "max": 5,
+        "mean": 5.0,
+        "variance": 0.0,
+        "stddev": 0.0
+      },
+      {
+        "name": {
+          "name": "prompt_truncated",
+          "split": "test"
+        },
+        "count": 1,
+        "sum": 0,
+        "sum_squared": 0,
+        "min": 0,
+        "max": 0,
+        "mean": 0.0,
+        "variance": 0.0,
+        "stddev": 0.0
+      },
+      {
+        "name": {
+          "name": "max_prob",
+          "split": "test"
+        },
+        "count": 1,
+        "sum": 1.0,
+        "sum_squared": 1.0,
+        "min": 1.0,
+        "max": 1.0,
+        "mean": 1.0,
+        "variance": 0.0,
+        "stddev": 0.0
+      },
+      {
+        "name": {
+          "name": "exact_match",
+          "split": "test"
+        },
+        "count": 1,
+        "sum": 0,
+        "sum_squared": 0,
+        "min": 0,
+        "max": 0,
+        "mean": 0.0,
+        "variance": 0.0,
+        "stddev": 0.0
+      },
+      {
+        "name": {
+          "name": "exact_match@5",
+          "split": "test"
+        },
+        "count": 1,
+        "sum": 0,
+        "sum_squared": 0,
+        "min": 0,
+        "max": 0,
+        "mean": 0.0,
+        "variance": 0.0,
+        "stddev": 0.0
+      },
+      {
+        "name": {
+          "name": "quasi_exact_match",
+          "split": "test"
+        },
+        "count": 1,
+        "sum": 0,
+        "sum_squared": 0,
+        "min": 0,
+        "max": 0,
+        "mean": 0.0,
+        "variance": 0.0,
+        "stddev": 0.0
+      },
+      {
+        "name": {
+          "name": "quasi_exact_match@5",
+          "split": "test"
+        },
+        "count": 1,
+        "sum": 0,
+        "sum_squared": 0,
+        "min": 0,
+        "max": 0,
+        "mean": 0.0,
+        "variance": 0.0,
+        "stddev": 0.0
+      },
+      {
+        "name": {
+          "name": "prefix_exact_match",
+          "split": "test"
+        },
+        "count": 1,
+        "sum": 0,
+        "sum_squared": 0,
+        "min": 0,
+        "max": 0,
+        "mean": 0.0,
+        "variance": 0.0,
+        "stddev": 0.0
+      },
+      {
+        "name": {
+          "name": "prefix_exact_match@5",
+          "split": "test"
+        },
+        "count": 1,
+        "sum": 0,
+        "sum_squared": 0,
+        "min": 0,
+        "max": 0,
+        "mean": 0.0,
+        "variance": 0.0,
+        "stddev": 0.0
+      },
+      {
+        "name": {
+          "name": "quasi_prefix_exact_match",
+          "split": "test"
+        },
+        "count": 1,
+        "sum": 0,
+        "sum_squared": 0,
+        "min": 0,
+        "max": 0,
+        "mean": 0.0,
+        "variance": 0.0,
+        "stddev": 0.0
+      },
+      {
+        "name": {
+          "name": "quasi_prefix_exact_match@5",
+          "split": "test"
+        },
+        "count": 1,
+        "sum": 0,
+        "sum_squared": 0,
+        "min": 0,
+        "max": 0,
+        "mean": 0.0,
+        "variance": 0.0,
+        "stddev": 0.0
+      },
+      {
+        "name": {
+          "name": "logprob",
+          "split": "test"
+        },
+        "count": 1,
+        "sum": 0.0,
+        "sum_squared": 0.0,
+        "min": 0.0,
+        "max": 0.0,
+        "mean": 0.0,
+        "variance": 0.0,
+        "stddev": 0.0
+      },
+      {
+        "name": {
+          "name": "num_perplexity_tokens",
+          "split": "test"
+        },
+        "count": 1,
+        "sum": 1,
+        "sum_squared": 1,
+        "min": 1,
+        "max": 1,
+        "mean": 1.0,
+        "variance": 0.0,
+        "stddev": 0.0
+      },
+      {
+        "name": {
+          "name": "num_bytes",
+          "split": "test"
+        },
+        "count": 1,
+        "sum": 2,
+        "sum_squared": 4,
+        "min": 2,
+        "max": 2,
+        "mean": 2.0,
+        "variance": 0.0,
+        "stddev": 0.0
+      }
+    ]
+  }
+]
\ No newline at end of file
diff --git a/tests/data/helm/mmlu:subject=philosophy,method=multiple_choice_joint,model=openai_gpt2/run_spec.json b/tests/data/helm/mmlu:subject=philosophy,method=multiple_choice_joint,model=openai_gpt2/run_spec.json
new file mode 100644
index 0000000..84b0d05
--- /dev/null
+++ b/tests/data/helm/mmlu:subject=philosophy,method=multiple_choice_joint,model=openai_gpt2/run_spec.json
@@ -0,0 +1,73 @@
+{
+  "name": "mmlu:subject=philosophy,method=multiple_choice_joint,model=openai_gpt2",
+  "scenario_spec": {
+    "class_name": "helm.benchmark.scenarios.mmlu_scenario.MMLUScenario",
+    "args": {
+      "subject": "philosophy"
+    }
+  },
+  "adapter_spec": {
+    "method": "multiple_choice_joint",
+    "global_prefix": "",
+    "global_suffix": "",
+    "instructions": "The following are multiple choice questions (with answers) about philosophy.\n",
+    "input_prefix": "Question: ",
+    "input_suffix": "\n",
+    "reference_prefix": "A. ",
+    "reference_suffix": "\n",
+    "chain_of_thought_prefix": "",
+    "chain_of_thought_suffix": "\n",
+    "output_prefix": "Answer: ",
+    "output_suffix": "\n",
+    "instance_prefix": "\n",
+    "substitutions": [],
+    "max_train_instances": 5,
+    "max_eval_instances": 10,
+    "num_outputs": 5,
+    "num_train_trials": 1,
+    "num_trials": 1,
+    "sample_train": true,
+    "model_deployment": "huggingface/gpt2",
+    "model": "openai/gpt2",
+    "temperature": 0.0,
+    "max_tokens": 1,
+    "stop_sequences": [
+      "\n"
+    ],
+    "multi_label": false
+  },
+  "metric_specs": [
+    {
+      "class_name": "helm.benchmark.metrics.basic_metrics.BasicGenerationMetric",
+      "args": {
+        "names": [
+          "exact_match",
+          "quasi_exact_match",
+          "prefix_exact_match",
+          "quasi_prefix_exact_match"
+        ]
+      }
+    },
+    {
+      "class_name": "helm.benchmark.metrics.basic_metrics.BasicReferenceMetric",
+      "args": {}
+    },
+    {
+      "class_name": "helm.benchmark.metrics.basic_metrics.InstancesPerSplitMetric",
+      "args": {}
+    }
+  ],
+  "data_augmenter_spec": {
+    "perturbation_specs": [],
+    "should_augment_train_instances": false,
+    "should_include_original_train": false,
+    "should_skip_unchanged_train": false,
+    "should_augment_eval_instances": false,
+    "should_include_original_eval": false,
+    "should_skip_unchanged_eval": false,
+    "seeds_per_instance": 1
+  },
+  "groups": [
+    "mmlu"
+  ]
+}
\ No newline at end of file
diff --git a/tests/data/helm/mmlu:subject=philosophy,method=multiple_choice_joint,model=openai_gpt2/scenario.json b/tests/data/helm/mmlu:subject=philosophy,method=multiple_choice_joint,model=openai_gpt2/scenario.json
new file mode 100644
index 0000000..ad14658
--- /dev/null
+++ b/tests/data/helm/mmlu:subject=philosophy,method=multiple_choice_joint,model=openai_gpt2/scenario.json
@@ -0,0 +1,9 @@
+{
+  "name": "mmlu",
+  "description": "Massive Multitask Language Understanding",
+  "tags": [
+    "knowledge",
+    "multiple_choice"
+  ],
+  "definition_path": "https://github.com/stanford-crfm/helm/blob/main/src/helm/benchmark/scenarios/mmlu_scenario.py"
+}
\ No newline at end of file
diff --git a/tests/data/helm/mmlu:subject=philosophy,method=multiple_choice_joint,model=openai_gpt2/scenario_state.json b/tests/data/helm/mmlu:subject=philosophy,method=multiple_choice_joint,model=openai_gpt2/scenario_state.json
new file mode 100644
index 0000000..81d679e
--- /dev/null
+++ b/tests/data/helm/mmlu:subject=philosophy,method=multiple_choice_joint,model=openai_gpt2/scenario_state.json
@@ -0,0 +1,844 @@
+{
+  "adapter_spec": {
+    "method": "multiple_choice_joint",
+    "global_prefix": "",
+    "global_suffix": "",
+    "instructions": "The following are multiple choice questions (with answers) about philosophy.\n",
+    "input_prefix": "Question: ",
+    "input_suffix": "\n",
+    "reference_prefix": "A. ",
+    "reference_suffix": "\n",
+    "chain_of_thought_prefix": "",
+    "chain_of_thought_suffix": "\n",
+    "output_prefix": "Answer: ",
+    "output_suffix": "\n",
+    "instance_prefix": "\n",
+    "substitutions": [],
+    "max_train_instances": 5,
+    "max_eval_instances": 10,
+    "num_outputs": 5,
+    "num_train_trials": 1,
+    "num_trials": 1,
+    "sample_train": true,
+    "model_deployment": "huggingface/gpt2",
+    "model": "openai/gpt2",
+    "temperature": 0.0,
+    "max_tokens": 1,
+    "stop_sequences": [
+      "\n"
+    ],
+    "multi_label": false
+  },
+  "request_states": [
+    {
+      "instance": {
+        "input": {
+          "text": "What does the notion of \u201cmeaning in life\u201d refer to?"
+        },
+        "references": [
+          {
+            "output": {
+              "text": "external meaning"
+            },
+            "tags": []
+          },
+          {
+            "output": {
+              "text": "god's plan"
+            },
+            "tags": []
+          },
+          {
+            "output": {
+              "text": "internalmeaning"
+            },
+            "tags": [
+              "correct"
+            ]
+          },
+          {
+            "output": {
+              "text": "meaning in an afterlife"
+            },
+            "tags": []
+          }
+        ],
+        "split": "test",
+        "id": "id147"
+      },
+      "train_trial_index": 0,
+      "output_mapping": {
+        "A": "external meaning",
+        "B": "god's plan",
+        "C": "internalmeaning",
+        "D": "meaning in an afterlife"
+      },
+      "request": {
+        "model_deployment": "huggingface/gpt2",
+        "model": "openai/gpt2",
+        "embedding": false,
+        "prompt": "The following are multiple choice questions (with answers) about philosophy.\n\nQuestion: The study of reality in the broadest sense, an inquiry into the elemental nature of the universe and the things in it, is known as _____.\nA. metaphysics\nB. epistemology\nC. quantum physics\nD. axiology\nAnswer: A\n\nQuestion: According to Moore\u2019s \u201cideal utilitarianism,\u201d the right action is the one that brings about the greatest amount of:\nA. pleasure.\nB. happiness.\nC. good.\nD. virtue.\nAnswer: C\n\nQuestion: Psychological egoism is:\nA. an ethical theory about how we ought to behave.\nB. a generalization concerning the way people tend to behave.\nC. a claim about human nature and the ways people are capable of behaving.\nD. none of the above.\nAnswer: C\n\nQuestion: Before Tolstoy's Christian conversion, what was his perspective on the meaning of life?\nA. optimist\nB. satisfied\nC. nominally religious\nD. pessimist\nAnswer: D\n\nQuestion: According to d'Holbach, people always act according to _____.\nA. free choices\nB. dictates of the soul\nC. necessary natural laws\nD. undetermined will\nAnswer: C\n\nQuestion: What does the notion of \u201cmeaning in life\u201d refer to?\nA. external meaning\nB. god's plan\nC. internalmeaning\nD. meaning in an afterlife\nAnswer:",
+        "temperature": 0.0,
+        "num_completions": 1,
+        "top_k_per_token": 5,
+        "max_tokens": 1,
+        "stop_sequences": [],
+        "echo_prompt": false,
+        "top_p": 1,
+        "presence_penalty": 0,
+        "frequency_penalty": 0
+      },
+      "result": {
+        "success": true,
+        "embedding": [],
+        "completions": [
+          {
+            "text": " D",
+            "logprob": 0.0,
+            "tokens": [
+              {
+                "text": " D",
+                "logprob": 0.0
+              }
+            ]
+          }
+        ],
+        "cached": false,
+        "request_time": 0.6803672313690186,
+        "request_datetime": 1762354922
+      },
+      "num_train_instances": 5,
+      "prompt_truncated": false,
+      "num_conditioning_tokens": 0
+    },
+    {
+      "instance": {
+        "input": {
+          "text": "In Plato's idealism, the unchanging Ideas or \"Forms\" in terms of which sensible objects both exist and are known must transcend (that is, exist beyond) the changing realm of appearances; because if Forms changed, then:"
+        },
+        "references": [
+          {
+            "output": {
+              "text": "the only things in the sensible world that we could ever experience would be concepts."
+            },
+            "tags": []
+          },
+          {
+            "output": {
+              "text": "the sensible realm (in contrast to the intelligible realm) would consist only of copies of real things."
+            },
+            "tags": []
+          },
+          {
+            "output": {
+              "text": "nothing in the experienced world could be or be identified as one determinate thing or another."
+            },
+            "tags": [
+              "correct"
+            ]
+          },
+          {
+            "output": {
+              "text": "the sensible world would consist of unchanging Forms."
+            },
+            "tags": []
+          }
+        ],
+        "split": "valid",
+        "id": "id11"
+      },
+      "train_trial_index": 0,
+      "output_mapping": {
+        "A": "the only things in the sensible world that we could ever experience would be concepts.",
+        "B": "the sensible realm (in contrast to the intelligible realm) would consist only of copies of real things.",
+        "C": "nothing in the experienced world could be or be identified as one determinate thing or another.",
+        "D": "the sensible world would consist of unchanging Forms."
+      },
+      "request": {
+        "model_deployment": "huggingface/gpt2",
+        "model": "openai/gpt2",
+        "embedding": false,
+        "prompt": "The following are multiple choice questions (with answers) about philosophy.\n\nQuestion: The study of reality in the broadest sense, an inquiry into the elemental nature of the universe and the things in it, is known as _____.\nA. metaphysics\nB. epistemology\nC. quantum physics\nD. axiology\nAnswer: A\n\nQuestion: According to Moore\u2019s \u201cideal utilitarianism,\u201d the right action is the one that brings about the greatest amount of:\nA. pleasure.\nB. happiness.\nC. good.\nD. virtue.\nAnswer: C\n\nQuestion: Psychological egoism is:\nA. an ethical theory about how we ought to behave.\nB. a generalization concerning the way people tend to behave.\nC. a claim about human nature and the ways people are capable of behaving.\nD. none of the above.\nAnswer: C\n\nQuestion: Before Tolstoy's Christian conversion, what was his perspective on the meaning of life?\nA. optimist\nB. satisfied\nC. nominally religious\nD. pessimist\nAnswer: D\n\nQuestion: According to d'Holbach, people always act according to _____.\nA. free choices\nB. dictates of the soul\nC. necessary natural laws\nD. undetermined will\nAnswer: C\n\nQuestion: In Plato's idealism, the unchanging Ideas or \"Forms\" in terms of which sensible objects both exist and are known must transcend (that is, exist beyond) the changing realm of appearances; because if Forms changed, then:\nA. the only things in the sensible world that we could ever experience would be concepts.\nB. the sensible realm (in contrast to the intelligible realm) would consist only of copies of real things.\nC. nothing in the experienced world could be or be identified as one determinate thing or another.\nD. the sensible world would consist of unchanging Forms.\nAnswer:",
+        "temperature": 0.0,
+        "num_completions": 1,
+        "top_k_per_token": 5,
+        "max_tokens": 1,
+        "stop_sequences": [],
+        "echo_prompt": false,
+        "top_p": 1,
+        "presence_penalty": 0,
+        "frequency_penalty": 0
+      },
+      "result": {
+        "success": true,
+        "embedding": [],
+        "completions": [
+          {
+            "text": " D",
+            "logprob": 0.0,
+            "tokens": [
+              {
+                "text": " D",
+                "logprob": 0.0
+              }
+            ]
+          }
+        ],
+        "cached": false,
+        "request_time": 0.6785750389099121,
+        "request_datetime": 1762354922
+      },
+      "num_train_instances": 5,
+      "prompt_truncated": false,
+      "num_conditioning_tokens": 0
+    },
+    {
+      "instance": {
+        "input": {
+          "text": "Aristotle says  that what makes things be what they are--their essence--does not exist apart from individ-uals that exist in the world.  So if all the members of a species were destroyed, then their essence or form:"
+        },
+        "references": [
+          {
+            "output": {
+              "text": "would likewise be destroyed."
+            },
+            "tags": [
+              "correct"
+            ]
+          },
+          {
+            "output": {
+              "text": "would be destroyed only if there were no one around to remember the species."
+            },
+            "tags": []
+          },
+          {
+            "output": {
+              "text": "would continue existing (as with Plato's Forms) in some other realm of being."
+            },
+            "tags": []
+          },
+          {
+            "output": {
+              "text": "would not be destroyed because there was no essence or form originally to be destroyed; there are only individuals, not universal essences or natures of things."
+            },
+            "tags": []
+          }
+        ],
+        "split": "test",
+        "id": "id65"
+      },
+      "train_trial_index": 0,
+      "output_mapping": {
+        "A": "would likewise be destroyed.",
+        "B": "would be destroyed only if there were no one around to remember the species.",
+        "C": "would continue existing (as with Plato's Forms) in some other realm of being.",
+        "D": "would not be destroyed because there was no essence or form originally to be destroyed; there are only individuals, not universal essences or natures of things."
+      },
+      "request": {
+        "model_deployment": "huggingface/gpt2",
+        "model": "openai/gpt2",
+        "embedding": false,
+        "prompt": "The following are multiple choice questions (with answers) about philosophy.\n\nQuestion: The study of reality in the broadest sense, an inquiry into the elemental nature of the universe and the things in it, is known as _____.\nA. metaphysics\nB. epistemology\nC. quantum physics\nD. axiology\nAnswer: A\n\nQuestion: According to Moore\u2019s \u201cideal utilitarianism,\u201d the right action is the one that brings about the greatest amount of:\nA. pleasure.\nB. happiness.\nC. good.\nD. virtue.\nAnswer: C\n\nQuestion: Psychological egoism is:\nA. an ethical theory about how we ought to behave.\nB. a generalization concerning the way people tend to behave.\nC. a claim about human nature and the ways people are capable of behaving.\nD. none of the above.\nAnswer: C\n\nQuestion: Before Tolstoy's Christian conversion, what was his perspective on the meaning of life?\nA. optimist\nB. satisfied\nC. nominally religious\nD. pessimist\nAnswer: D\n\nQuestion: According to d'Holbach, people always act according to _____.\nA. free choices\nB. dictates of the soul\nC. necessary natural laws\nD. undetermined will\nAnswer: C\n\nQuestion: Aristotle says  that what makes things be what they are--their essence--does not exist apart from individ-uals that exist in the world.  So if all the members of a species were destroyed, then their essence or form:\nA. would likewise be destroyed.\nB. would be destroyed only if there were no one around to remember the species.\nC. would continue existing (as with Plato's Forms) in some other realm of being.\nD. would not be destroyed because there was no essence or form originally to be destroyed; there are only individuals, not universal essences or natures of things.\nAnswer:",
+        "temperature": 0.0,
+        "num_completions": 1,
+        "top_k_per_token": 5,
+        "max_tokens": 1,
+        "stop_sequences": [],
+        "echo_prompt": false,
+        "top_p": 1,
+        "presence_penalty": 0,
+        "frequency_penalty": 0
+      },
+      "result": {
+        "success": true,
+        "embedding": [],
+        "completions": [
+          {
+            "text": " D",
+            "logprob": 0.0,
+            "tokens": [
+              {
+                "text": " D",
+                "logprob": 0.0
+              }
+            ]
+          }
+        ],
+        "cached": false,
+        "request_time": 0.6800699234008789,
+        "request_datetime": 1762354922
+      },
+      "num_train_instances": 5,
+      "prompt_truncated": false,
+      "num_conditioning_tokens": 0
+    },
+    {
+      "instance": {
+        "input": {
+          "text": "Aesthetic values have to do with _______"
+        },
+        "references": [
+          {
+            "output": {
+              "text": "the moral value of works of art and other objects that could be judged beautiful"
+            },
+            "tags": []
+          },
+          {
+            "output": {
+              "text": "good and bad works of art and other objects that could be judged beautiful"
+            },
+            "tags": [
+              "correct"
+            ]
+          },
+          {
+            "output": {
+              "text": "the moral values of artists and critics"
+            },
+            "tags": []
+          },
+          {
+            "output": {
+              "text": "pragmatic decisions regarding the display of art"
+            },
+            "tags": []
+          }
+        ],
+        "split": "test",
+        "id": "id344"
+      },
+      "train_trial_index": 0,
+      "output_mapping": {
+        "A": "the moral value of works of art and other objects that could be judged beautiful",
+        "B": "good and bad works of art and other objects that could be judged beautiful",
+        "C": "the moral values of artists and critics",
+        "D": "pragmatic decisions regarding the display of art"
+      },
+      "request": {
+        "model_deployment": "huggingface/gpt2",
+        "model": "openai/gpt2",
+        "embedding": false,
+        "prompt": "The following are multiple choice questions (with answers) about philosophy.\n\nQuestion: The study of reality in the broadest sense, an inquiry into the elemental nature of the universe and the things in it, is known as _____.\nA. metaphysics\nB. epistemology\nC. quantum physics\nD. axiology\nAnswer: A\n\nQuestion: According to Moore\u2019s \u201cideal utilitarianism,\u201d the right action is the one that brings about the greatest amount of:\nA. pleasure.\nB. happiness.\nC. good.\nD. virtue.\nAnswer: C\n\nQuestion: Psychological egoism is:\nA. an ethical theory about how we ought to behave.\nB. a generalization concerning the way people tend to behave.\nC. a claim about human nature and the ways people are capable of behaving.\nD. none of the above.\nAnswer: C\n\nQuestion: Before Tolstoy's Christian conversion, what was his perspective on the meaning of life?\nA. optimist\nB. satisfied\nC. nominally religious\nD. pessimist\nAnswer: D\n\nQuestion: According to d'Holbach, people always act according to _____.\nA. free choices\nB. dictates of the soul\nC. necessary natural laws\nD. undetermined will\nAnswer: C\n\nQuestion: Aesthetic values have to do with _______\nA. the moral value of works of art and other objects that could be judged beautiful\nB. good and bad works of art and other objects that could be judged beautiful\nC. the moral values of artists and critics\nD. pragmatic decisions regarding the display of art\nAnswer:",
+        "temperature": 0.0,
+        "num_completions": 1,
+        "top_k_per_token": 5,
+        "max_tokens": 1,
+        "stop_sequences": [],
+        "echo_prompt": false,
+        "top_p": 1,
+        "presence_penalty": 0,
+        "frequency_penalty": 0
+      },
+      "result": {
+        "success": true,
+        "embedding": [],
+        "completions": [
+          {
+            "text": " D",
+            "logprob": 0.0,
+            "tokens": [
+              {
+                "text": " D",
+                "logprob": 0.0
+              }
+            ]
+          }
+        ],
+        "cached": false,
+        "request_time": 0.6793179512023926,
+        "request_datetime": 1762354922
+      },
+      "num_train_instances": 5,
+      "prompt_truncated": false,
+      "num_conditioning_tokens": 0
+    },
+    {
+      "instance": {
+        "input": {
+          "text": "According to Moore, we are thinking about good whenever we think about:"
+        },
+        "references": [
+          {
+            "output": {
+              "text": "pleasure."
+            },
+            "tags": []
+          },
+          {
+            "output": {
+              "text": "things that we desire."
+            },
+            "tags": []
+          },
+          {
+            "output": {
+              "text": "intrinsic value."
+            },
+            "tags": [
+              "correct"
+            ]
+          },
+          {
+            "output": {
+              "text": "none of the above."
+            },
+            "tags": []
+          }
+        ],
+        "split": "test",
+        "id": "id59"
+      },
+      "train_trial_index": 0,
+      "output_mapping": {
+        "A": "pleasure.",
+        "B": "things that we desire.",
+        "C": "intrinsic value.",
+        "D": "none of the above."
+      },
+      "request": {
+        "model_deployment": "huggingface/gpt2",
+        "model": "openai/gpt2",
+        "embedding": false,
+        "prompt": "The following are multiple choice questions (with answers) about philosophy.\n\nQuestion: The study of reality in the broadest sense, an inquiry into the elemental nature of the universe and the things in it, is known as _____.\nA. metaphysics\nB. epistemology\nC. quantum physics\nD. axiology\nAnswer: A\n\nQuestion: According to Moore\u2019s \u201cideal utilitarianism,\u201d the right action is the one that brings about the greatest amount of:\nA. pleasure.\nB. happiness.\nC. good.\nD. virtue.\nAnswer: C\n\nQuestion: Psychological egoism is:\nA. an ethical theory about how we ought to behave.\nB. a generalization concerning the way people tend to behave.\nC. a claim about human nature and the ways people are capable of behaving.\nD. none of the above.\nAnswer: C\n\nQuestion: Before Tolstoy's Christian conversion, what was his perspective on the meaning of life?\nA. optimist\nB. satisfied\nC. nominally religious\nD. pessimist\nAnswer: D\n\nQuestion: According to d'Holbach, people always act according to _____.\nA. free choices\nB. dictates of the soul\nC. necessary natural laws\nD. undetermined will\nAnswer: C\n\nQuestion: According to Moore, we are thinking about good whenever we think about:\nA. pleasure.\nB. things that we desire.\nC. intrinsic value.\nD. none of the above.\nAnswer:",
+        "temperature": 0.0,
+        "num_completions": 1,
+        "top_k_per_token": 5,
+        "max_tokens": 1,
+        "stop_sequences": [],
+        "echo_prompt": false,
+        "top_p": 1,
+        "presence_penalty": 0,
+        "frequency_penalty": 0
+      },
+      "result": {
+        "success": true,
+        "embedding": [],
+        "completions": [
+          {
+            "text": " D",
+            "logprob": 0.0,
+            "tokens": [
+              {
+                "text": " D",
+                "logprob": 0.0
+              }
+            ]
+          }
+        ],
+        "cached": false,
+        "request_time": 0.10651206970214844,
+        "request_datetime": 1762354922
+      },
+      "num_train_instances": 5,
+      "prompt_truncated": false,
+      "num_conditioning_tokens": 0
+    },
+    {
+      "instance": {
+        "input": {
+          "text": "Craig argues that the series of events in time cannot be actually infinite, so we know that _____."
+        },
+        "references": [
+          {
+            "output": {
+              "text": "the universe is finite in the past and began to exist"
+            },
+            "tags": [
+              "correct"
+            ]
+          },
+          {
+            "output": {
+              "text": "the universe is infinite in the past"
+            },
+            "tags": []
+          },
+          {
+            "output": {
+              "text": "the universe never is uncaused"
+            },
+            "tags": []
+          },
+          {
+            "output": {
+              "text": "the universe exists only in the mind"
+            },
+            "tags": []
+          }
+        ],
+        "split": "test",
+        "id": "id291"
+      },
+      "train_trial_index": 0,
+      "output_mapping": {
+        "A": "the universe is finite in the past and began to exist",
+        "B": "the universe is infinite in the past",
+        "C": "the universe never is uncaused",
+        "D": "the universe exists only in the mind"
+      },
+      "request": {
+        "model_deployment": "huggingface/gpt2",
+        "model": "openai/gpt2",
+        "embedding": false,
+        "prompt": "The following are multiple choice questions (with answers) about philosophy.\n\nQuestion: The study of reality in the broadest sense, an inquiry into the elemental nature of the universe and the things in it, is known as _____.\nA. metaphysics\nB. epistemology\nC. quantum physics\nD. axiology\nAnswer: A\n\nQuestion: According to Moore\u2019s \u201cideal utilitarianism,\u201d the right action is the one that brings about the greatest amount of:\nA. pleasure.\nB. happiness.\nC. good.\nD. virtue.\nAnswer: C\n\nQuestion: Psychological egoism is:\nA. an ethical theory about how we ought to behave.\nB. a generalization concerning the way people tend to behave.\nC. a claim about human nature and the ways people are capable of behaving.\nD. none of the above.\nAnswer: C\n\nQuestion: Before Tolstoy's Christian conversion, what was his perspective on the meaning of life?\nA. optimist\nB. satisfied\nC. nominally religious\nD. pessimist\nAnswer: D\n\nQuestion: According to d'Holbach, people always act according to _____.\nA. free choices\nB. dictates of the soul\nC. necessary natural laws\nD. undetermined will\nAnswer: C\n\nQuestion: Craig argues that the series of events in time cannot be actually infinite, so we know that _____.\nA. the universe is finite in the past and began to exist\nB. the universe is infinite in the past\nC. the universe never is uncaused\nD. the universe exists only in the mind\nAnswer:",
+        "temperature": 0.0,
+        "num_completions": 1,
+        "top_k_per_token": 5,
+        "max_tokens": 1,
+        "stop_sequences": [],
+        "echo_prompt": false,
+        "top_p": 1,
+        "presence_penalty": 0,
+        "frequency_penalty": 0
+      },
+      "result": {
+        "success": true,
+        "embedding": [],
+        "completions": [
+          {
+            "text": " D",
+            "logprob": 0.0,
+            "tokens": [
+              {
+                "text": " D",
+                "logprob": 0.0
+              }
+            ]
+          }
+        ],
+        "cached": false,
+        "request_time": 0.11817407608032227,
+        "request_datetime": 1762354922
+      },
+      "num_train_instances": 5,
+      "prompt_truncated": false,
+      "num_conditioning_tokens": 0
+    },
+    {
+      "instance": {
+        "input": {
+          "text": "Baier claims that ethical skepticism is often due to:"
+        },
+        "references": [
+          {
+            "output": {
+              "text": "confused metaphysical theories."
+            },
+            "tags": []
+          },
+          {
+            "output": {
+              "text": "confused epistemological theories."
+            },
+            "tags": [
+              "correct"
+            ]
+          },
+          {
+            "output": {
+              "text": "confused religious views."
+            },
+            "tags": []
+          },
+          {
+            "output": {
+              "text": "confused scientific theories."
+            },
+            "tags": []
+          }
+        ],
+        "split": "test",
+        "id": "id131"
+      },
+      "train_trial_index": 0,
+      "output_mapping": {
+        "A": "confused metaphysical theories.",
+        "B": "confused epistemological theories.",
+        "C": "confused religious views.",
+        "D": "confused scientific theories."
+      },
+      "request": {
+        "model_deployment": "huggingface/gpt2",
+        "model": "openai/gpt2",
+        "embedding": false,
+        "prompt": "The following are multiple choice questions (with answers) about philosophy.\n\nQuestion: The study of reality in the broadest sense, an inquiry into the elemental nature of the universe and the things in it, is known as _____.\nA. metaphysics\nB. epistemology\nC. quantum physics\nD. axiology\nAnswer: A\n\nQuestion: According to Moore\u2019s \u201cideal utilitarianism,\u201d the right action is the one that brings about the greatest amount of:\nA. pleasure.\nB. happiness.\nC. good.\nD. virtue.\nAnswer: C\n\nQuestion: Psychological egoism is:\nA. an ethical theory about how we ought to behave.\nB. a generalization concerning the way people tend to behave.\nC. a claim about human nature and the ways people are capable of behaving.\nD. none of the above.\nAnswer: C\n\nQuestion: Before Tolstoy's Christian conversion, what was his perspective on the meaning of life?\nA. optimist\nB. satisfied\nC. nominally religious\nD. pessimist\nAnswer: D\n\nQuestion: According to d'Holbach, people always act according to _____.\nA. free choices\nB. dictates of the soul\nC. necessary natural laws\nD. undetermined will\nAnswer: C\n\nQuestion: Baier claims that ethical skepticism is often due to:\nA. confused metaphysical theories.\nB. confused epistemological theories.\nC. confused religious views.\nD. confused scientific theories.\nAnswer:",
+        "temperature": 0.0,
+        "num_completions": 1,
+        "top_k_per_token": 5,
+        "max_tokens": 1,
+        "stop_sequences": [],
+        "echo_prompt": false,
+        "top_p": 1,
+        "presence_penalty": 0,
+        "frequency_penalty": 0
+      },
+      "result": {
+        "success": true,
+        "embedding": [],
+        "completions": [
+          {
+            "text": " D",
+            "logprob": 0.0,
+            "tokens": [
+              {
+                "text": " D",
+                "logprob": 0.0
+              }
+            ]
+          }
+        ],
+        "cached": false,
+        "request_time": 0.11505007743835449,
+        "request_datetime": 1762354922
+      },
+      "num_train_instances": 5,
+      "prompt_truncated": false,
+      "num_conditioning_tokens": 0
+    },
+    {
+      "instance": {
+        "input": {
+          "text": "Mill says that the ultimate end of utilitarianism is an existence as free of pain as possible and as rich as possible in _____."
+        },
+        "references": [
+          {
+            "output": {
+              "text": "lower pleasures"
+            },
+            "tags": []
+          },
+          {
+            "output": {
+              "text": "spiritual attainment"
+            },
+            "tags": []
+          },
+          {
+            "output": {
+              "text": "social achievement"
+            },
+            "tags": []
+          },
+          {
+            "output": {
+              "text": "enjoyments"
+            },
+            "tags": [
+              "correct"
+            ]
+          }
+        ],
+        "split": "test",
+        "id": "id222"
+      },
+      "train_trial_index": 0,
+      "output_mapping": {
+        "A": "lower pleasures",
+        "B": "spiritual attainment",
+        "C": "social achievement",
+        "D": "enjoyments"
+      },
+      "request": {
+        "model_deployment": "huggingface/gpt2",
+        "model": "openai/gpt2",
+        "embedding": false,
+        "prompt": "The following are multiple choice questions (with answers) about philosophy.\n\nQuestion: The study of reality in the broadest sense, an inquiry into the elemental nature of the universe and the things in it, is known as _____.\nA. metaphysics\nB. epistemology\nC. quantum physics\nD. axiology\nAnswer: A\n\nQuestion: According to Moore\u2019s \u201cideal utilitarianism,\u201d the right action is the one that brings about the greatest amount of:\nA. pleasure.\nB. happiness.\nC. good.\nD. virtue.\nAnswer: C\n\nQuestion: Psychological egoism is:\nA. an ethical theory about how we ought to behave.\nB. a generalization concerning the way people tend to behave.\nC. a claim about human nature and the ways people are capable of behaving.\nD. none of the above.\nAnswer: C\n\nQuestion: Before Tolstoy's Christian conversion, what was his perspective on the meaning of life?\nA. optimist\nB. satisfied\nC. nominally religious\nD. pessimist\nAnswer: D\n\nQuestion: According to d'Holbach, people always act according to _____.\nA. free choices\nB. dictates of the soul\nC. necessary natural laws\nD. undetermined will\nAnswer: C\n\nQuestion: Mill says that the ultimate end of utilitarianism is an existence as free of pain as possible and as rich as possible in _____.\nA. lower pleasures\nB. spiritual attainment\nC. social achievement\nD. enjoyments\nAnswer:",
+        "temperature": 0.0,
+        "num_completions": 1,
+        "top_k_per_token": 5,
+        "max_tokens": 1,
+        "stop_sequences": [],
+        "echo_prompt": false,
+        "top_p": 1,
+        "presence_penalty": 0,
+        "frequency_penalty": 0
+      },
+      "result": {
+        "success": true,
+        "embedding": [],
+        "completions": [
+          {
+            "text": " D",
+            "logprob": 0.0,
+            "tokens": [
+              {
+                "text": " D",
+                "logprob": 0.0
+              }
+            ]
+          }
+        ],
+        "cached": false,
+        "request_time": 0.11363410949707031,
+        "request_datetime": 1762354922
+      },
+      "num_train_instances": 5,
+      "prompt_truncated": false,
+      "num_conditioning_tokens": 0
+    },
+    {
+      "instance": {
+        "input": {
+          "text": "Philo says the analogy that Cleanthes uses to make his case is _____."
+        },
+        "references": [
+          {
+            "output": {
+              "text": "too complicated"
+            },
+            "tags": []
+          },
+          {
+            "output": {
+              "text": "weak"
+            },
+            "tags": [
+              "correct"
+            ]
+          },
+          {
+            "output": {
+              "text": "strong"
+            },
+            "tags": []
+          },
+          {
+            "output": {
+              "text": "not based on a legitimate method of reasoning"
+            },
+            "tags": []
+          }
+        ],
+        "split": "test",
+        "id": "id259"
+      },
+      "train_trial_index": 0,
+      "output_mapping": {
+        "A": "too complicated",
+        "B": "weak",
+        "C": "strong",
+        "D": "not based on a legitimate method of reasoning"
+      },
+      "request": {
+        "model_deployment": "huggingface/gpt2",
+        "model": "openai/gpt2",
+        "embedding": false,
+        "prompt": "The following are multiple choice questions (with answers) about philosophy.\n\nQuestion: The study of reality in the broadest sense, an inquiry into the elemental nature of the universe and the things in it, is known as _____.\nA. metaphysics\nB. epistemology\nC. quantum physics\nD. axiology\nAnswer: A\n\nQuestion: According to Moore\u2019s \u201cideal utilitarianism,\u201d the right action is the one that brings about the greatest amount of:\nA. pleasure.\nB. happiness.\nC. good.\nD. virtue.\nAnswer: C\n\nQuestion: Psychological egoism is:\nA. an ethical theory about how we ought to behave.\nB. a generalization concerning the way people tend to behave.\nC. a claim about human nature and the ways people are capable of behaving.\nD. none of the above.\nAnswer: C\n\nQuestion: Before Tolstoy's Christian conversion, what was his perspective on the meaning of life?\nA. optimist\nB. satisfied\nC. nominally religious\nD. pessimist\nAnswer: D\n\nQuestion: According to d'Holbach, people always act according to _____.\nA. free choices\nB. dictates of the soul\nC. necessary natural laws\nD. undetermined will\nAnswer: C\n\nQuestion: Philo says the analogy that Cleanthes uses to make his case is _____.\nA. too complicated\nB. weak\nC. strong\nD. not based on a legitimate method of reasoning\nAnswer:",
+        "temperature": 0.0,
+        "num_completions": 1,
+        "top_k_per_token": 5,
+        "max_tokens": 1,
+        "stop_sequences": [],
+        "echo_prompt": false,
+        "top_p": 1,
+        "presence_penalty": 0,
+        "frequency_penalty": 0
+      },
+      "result": {
+        "success": true,
+        "embedding": [],
+        "completions": [
+          {
+            "text": " D",
+            "logprob": 0.0,
+            "tokens": [
+              {
+                "text": " D",
+                "logprob": 0.0
+              }
+            ]
+          }
+        ],
+        "cached": false,
+        "request_time": 0.0825653076171875,
+        "request_datetime": 1762354922
+      },
+      "num_train_instances": 5,
+      "prompt_truncated": false,
+      "num_conditioning_tokens": 0
+    },
+    {
+      "instance": {
+        "input": {
+          "text": "Baggini accuses pessimists of mixing up the two senses of _____."
+        },
+        "references": [
+          {
+            "output": {
+              "text": "philosophy"
+            },
+            "tags": []
+          },
+          {
+            "output": {
+              "text": "religion"
+            },
+            "tags": []
+          },
+          {
+            "output": {
+              "text": "meaning"
+            },
+            "tags": [
+              "correct"
+            ]
+          },
+          {
+            "output": {
+              "text": "time"
+            },
+            "tags": []
+          }
+        ],
+        "split": "test",
+        "id": "id105"
+      },
+      "train_trial_index": 0,
+      "output_mapping": {
+        "A": "philosophy",
+        "B": "religion",
+        "C": "meaning",
+        "D": "time"
+      },
+      "request": {
+        "model_deployment": "huggingface/gpt2",
+        "model": "openai/gpt2",
+        "embedding": false,
+        "prompt": "The following are multiple choice questions (with answers) about philosophy.\n\nQuestion: The study of reality in the broadest sense, an inquiry into the elemental nature of the universe and the things in it, is known as _____.\nA. metaphysics\nB. epistemology\nC. quantum physics\nD. axiology\nAnswer: A\n\nQuestion: According to Moore\u2019s \u201cideal utilitarianism,\u201d the right action is the one that brings about the greatest amount of:\nA. pleasure.\nB. happiness.\nC. good.\nD. virtue.\nAnswer: C\n\nQuestion: Psychological egoism is:\nA. an ethical theory about how we ought to behave.\nB. a generalization concerning the way people tend to behave.\nC. a claim about human nature and the ways people are capable of behaving.\nD. none of the above.\nAnswer: C\n\nQuestion: Before Tolstoy's Christian conversion, what was his perspective on the meaning of life?\nA. optimist\nB. satisfied\nC. nominally religious\nD. pessimist\nAnswer: D\n\nQuestion: According to d'Holbach, people always act according to _____.\nA. free choices\nB. dictates of the soul\nC. necessary natural laws\nD. undetermined will\nAnswer: C\n\nQuestion: Baggini accuses pessimists of mixing up the two senses of _____.\nA. philosophy\nB. religion\nC. meaning\nD. time\nAnswer:",
+        "temperature": 0.0,
+        "num_completions": 1,
+        "top_k_per_token": 5,
+        "max_tokens": 1,
+        "stop_sequences": [],
+        "echo_prompt": false,
+        "top_p": 1,
+        "presence_penalty": 0,
+        "frequency_penalty": 0
+      },
+      "result": {
+        "success": true,
+        "embedding": [],
+        "completions": [
+          {
+            "text": " D",
+            "logprob": 0.0,
+            "tokens": [
+              {
+                "text": " D",
+                "logprob": 0.0
+              }
+            ]
+          }
+        ],
+        "cached": false,
+        "request_time": 0.07625007629394531,
+        "request_datetime": 1762354922
+      },
+      "num_train_instances": 5,
+      "prompt_truncated": false,
+      "num_conditioning_tokens": 0
+    }
+  ]
+}
\ No newline at end of file
diff --git a/tests/data/helm/mmlu:subject=philosophy,method=multiple_choice_joint,model=openai_gpt2/stats.json b/tests/data/helm/mmlu:subject=philosophy,method=multiple_choice_joint,model=openai_gpt2/stats.json
new file mode 100644
index 0000000..25374cb
--- /dev/null
+++ b/tests/data/helm/mmlu:subject=philosophy,method=multiple_choice_joint,model=openai_gpt2/stats.json
@@ -0,0 +1,2850 @@
+[
+  {
+    "name": {
+      "name": "num_references",
+      "split": "test"
+    },
+    "count": 1,
+    "sum": 4.0,
+    "sum_squared": 16.0,
+    "min": 4.0,
+    "max": 4.0,
+    "mean": 4.0,
+    "variance": 0.0,
+    "stddev": 0.0
+  },
+  {
+    "name": {
+      "name": "num_train_trials",
+      "split": "test"
+    },
+    "count": 1,
+    "sum": 1.0,
+    "sum_squared": 1.0,
+    "min": 1.0,
+    "max": 1.0,
+    "mean": 1.0,
+    "variance": 0.0,
+    "stddev": 0.0
+  },
+  {
+    "name": {
+      "name": "num_prompt_tokens",
+      "split": "test"
+    },
+    "count": 1,
+    "sum": 350.44444444444446,
+    "sum_squared": 122811.30864197532,
+    "min": 350.44444444444446,
+    "max": 350.44444444444446,
+    "mean": 350.44444444444446,
+    "variance": 0.0,
+    "stddev": 0.0
+  },
+  {
+    "name": {
+      "name": "num_completion_tokens",
+      "split": "test"
+    },
+    "count": 1,
+    "sum": 1.0,
+    "sum_squared": 1.0,
+    "min": 1.0,
+    "max": 1.0,
+    "mean": 1.0,
+    "variance": 0.0,
+    "stddev": 0.0
+  },
+  {
+    "name": {
+      "name": "num_output_tokens",
+      "split": "test"
+    },
+    "count": 1,
+    "sum": 1.0,
+    "sum_squared": 1.0,
+    "min": 1.0,
+    "max": 1.0,
+    "mean": 1.0,
+    "variance": 0.0,
+    "stddev": 0.0
+  },
+  {
+    "name": {
+      "name": "training_co2_cost",
+      "split": "test"
+    },
+    "count": 0,
+    "sum": 0,
+    "sum_squared": 0
+  },
+  {
+    "name": {
+      "name": "training_energy_cost",
+      "split": "test"
+    },
+    "count": 0,
+    "sum": 0,
+    "sum_squared": 0
+  },
+  {
+    "name": {
+      "name": "inference_runtime",
+      "split": "test"
+    },
+    "count": 1,
+    "sum": 0.2946600914001465,
+    "sum_squared": 0.08682456946394268,
+    "min": 0.2946600914001465,
+    "max": 0.2946600914001465,
+    "mean": 0.2946600914001465,
+    "variance": 0.0,
+    "stddev": 0.0
+  },
+  {
+    "name": {
+      "name": "batch_size",
+      "split": "test"
+    },
+    "count": 1,
+    "sum": 1.0,
+    "sum_squared": 1.0,
+    "min": 1.0,
+    "max": 1.0,
+    "mean": 1.0,
+    "variance": 0.0,
+    "stddev": 0.0
+  },
+  {
+    "name": {
+      "name": "finish_reason_length",
+      "split": "test"
+    },
+    "count": 1,
+    "sum": 0.0,
+    "sum_squared": 0.0,
+    "min": 0.0,
+    "max": 0.0,
+    "mean": 0.0,
+    "variance": 0.0,
+    "stddev": 0.0
+  },
+  {
+    "name": {
+      "name": "finish_reason_stop",
+      "split": "test"
+    },
+    "count": 1,
+    "sum": 0.0,
+    "sum_squared": 0.0,
+    "min": 0.0,
+    "max": 0.0,
+    "mean": 0.0,
+    "variance": 0.0,
+    "stddev": 0.0
+  },
+  {
+    "name": {
+      "name": "finish_reason_endoftext",
+      "split": "test"
+    },
+    "count": 1,
+    "sum": 0.0,
+    "sum_squared": 0.0,
+    "min": 0.0,
+    "max": 0.0,
+    "mean": 0.0,
+    "variance": 0.0,
+    "stddev": 0.0
+  },
+  {
+    "name": {
+      "name": "finish_reason_unknown",
+      "split": "test"
+    },
+    "count": 1,
+    "sum": 1.0,
+    "sum_squared": 1.0,
+    "min": 1.0,
+    "max": 1.0,
+    "mean": 1.0,
+    "variance": 0.0,
+    "stddev": 0.0
+  },
+  {
+    "name": {
+      "name": "num_train_instances",
+      "split": "test"
+    },
+    "count": 1,
+    "sum": 5.0,
+    "sum_squared": 25.0,
+    "min": 5.0,
+    "max": 5.0,
+    "mean": 5.0,
+    "variance": 0.0,
+    "stddev": 0.0
+  },
+  {
+    "name": {
+      "name": "prompt_truncated",
+      "split": "test"
+    },
+    "count": 1,
+    "sum": 0.0,
+    "sum_squared": 0.0,
+    "min": 0.0,
+    "max": 0.0,
+    "mean": 0.0,
+    "variance": 0.0,
+    "stddev": 0.0
+  },
+  {
+    "name": {
+      "name": "max_prob",
+      "split": "test"
+    },
+    "count": 1,
+    "sum": 1.0,
+    "sum_squared": 1.0,
+    "min": 1.0,
+    "max": 1.0,
+    "mean": 1.0,
+    "variance": 0.0,
+    "stddev": 0.0
+  },
+  {
+    "name": {
+      "name": "exact_match",
+      "split": "test"
+    },
+    "count": 1,
+    "sum": 0.1111111111111111,
+    "sum_squared": 0.012345679012345678,
+    "min": 0.1111111111111111,
+    "max": 0.1111111111111111,
+    "mean": 0.1111111111111111,
+    "variance": 0.0,
+    "stddev": 0.0
+  },
+  {
+    "name": {
+      "name": "exact_match@5",
+      "split": "test"
+    },
+    "count": 1,
+    "sum": 0.1111111111111111,
+    "sum_squared": 0.012345679012345678,
+    "min": 0.1111111111111111,
+    "max": 0.1111111111111111,
+    "mean": 0.1111111111111111,
+    "variance": 0.0,
+    "stddev": 0.0
+  },
+  {
+    "name": {
+      "name": "quasi_exact_match",
+      "split": "test"
+    },
+    "count": 1,
+    "sum": 0.1111111111111111,
+    "sum_squared": 0.012345679012345678,
+    "min": 0.1111111111111111,
+    "max": 0.1111111111111111,
+    "mean": 0.1111111111111111,
+    "variance": 0.0,
+    "stddev": 0.0
+  },
+  {
+    "name": {
+      "name": "quasi_exact_match@5",
+      "split": "test"
+    },
+    "count": 1,
+    "sum": 0.1111111111111111,
+    "sum_squared": 0.012345679012345678,
+    "min": 0.1111111111111111,
+    "max": 0.1111111111111111,
+    "mean": 0.1111111111111111,
+    "variance": 0.0,
+    "stddev": 0.0
+  },
+  {
+    "name": {
+      "name": "prefix_exact_match",
+      "split": "test"
+    },
+    "count": 1,
+    "sum": 0.1111111111111111,
+    "sum_squared": 0.012345679012345678,
+    "min": 0.1111111111111111,
+    "max": 0.1111111111111111,
+    "mean": 0.1111111111111111,
+    "variance": 0.0,
+    "stddev": 0.0
+  },
+  {
+    "name": {
+      "name": "prefix_exact_match@5",
+      "split": "test"
+    },
+    "count": 1,
+    "sum": 0.1111111111111111,
+    "sum_squared": 0.012345679012345678,
+    "min": 0.1111111111111111,
+    "max": 0.1111111111111111,
+    "mean": 0.1111111111111111,
+    "variance": 0.0,
+    "stddev": 0.0
+  },
+  {
+    "name": {
+      "name": "quasi_prefix_exact_match",
+      "split": "test"
+    },
+    "count": 1,
+    "sum": 0.1111111111111111,
+    "sum_squared": 0.012345679012345678,
+    "min": 0.1111111111111111,
+    "max": 0.1111111111111111,
+    "mean": 0.1111111111111111,
+    "variance": 0.0,
+    "stddev": 0.0
+  },
+  {
+    "name": {
+      "name": "quasi_prefix_exact_match@5",
+      "split": "test"
+    },
+    "count": 1,
+    "sum": 0.1111111111111111,
+    "sum_squared": 0.012345679012345678,
+    "min": 0.1111111111111111,
+    "max": 0.1111111111111111,
+    "mean": 0.1111111111111111,
+    "variance": 0.0,
+    "stddev": 0.0
+  },
+  {
+    "name": {
+      "name": "logprob",
+      "split": "test"
+    },
+    "count": 1,
+    "sum": 0.0,
+    "sum_squared": 0.0,
+    "min": 0.0,
+    "max": 0.0,
+    "mean": 0.0,
+    "variance": 0.0,
+    "stddev": 0.0
+  },
+  {
+    "name": {
+      "name": "num_perplexity_tokens",
+      "split": "test"
+    },
+    "count": 1,
+    "sum": 1.0,
+    "sum_squared": 1.0,
+    "min": 1.0,
+    "max": 1.0,
+    "mean": 1.0,
+    "variance": 0.0,
+    "stddev": 0.0
+  },
+  {
+    "name": {
+      "name": "num_bytes",
+      "split": "test"
+    },
+    "count": 1,
+    "sum": 2.0,
+    "sum_squared": 4.0,
+    "min": 2.0,
+    "max": 2.0,
+    "mean": 2.0,
+    "variance": 0.0,
+    "stddev": 0.0
+  },
+  {
+    "name": {
+      "name": "num_references",
+      "split": "valid"
+    },
+    "count": 1,
+    "sum": 4.0,
+    "sum_squared": 16.0,
+    "min": 4.0,
+    "max": 4.0,
+    "mean": 4.0,
+    "variance": 0.0,
+    "stddev": 0.0
+  },
+  {
+    "name": {
+      "name": "num_train_trials",
+      "split": "valid"
+    },
+    "count": 1,
+    "sum": 1.0,
+    "sum_squared": 1.0,
+    "min": 1.0,
+    "max": 1.0,
+    "mean": 1.0,
+    "variance": 0.0,
+    "stddev": 0.0
+  },
+  {
+    "name": {
+      "name": "num_prompt_tokens",
+      "split": "valid"
+    },
+    "count": 1,
+    "sum": 420.0,
+    "sum_squared": 176400.0,
+    "min": 420.0,
+    "max": 420.0,
+    "mean": 420.0,
+    "variance": 0.0,
+    "stddev": 0.0
+  },
+  {
+    "name": {
+      "name": "num_completion_tokens",
+      "split": "valid"
+    },
+    "count": 1,
+    "sum": 1.0,
+    "sum_squared": 1.0,
+    "min": 1.0,
+    "max": 1.0,
+    "mean": 1.0,
+    "variance": 0.0,
+    "stddev": 0.0
+  },
+  {
+    "name": {
+      "name": "num_output_tokens",
+      "split": "valid"
+    },
+    "count": 1,
+    "sum": 1.0,
+    "sum_squared": 1.0,
+    "min": 1.0,
+    "max": 1.0,
+    "mean": 1.0,
+    "variance": 0.0,
+    "stddev": 0.0
+  },
+  {
+    "name": {
+      "name": "training_co2_cost",
+      "split": "valid"
+    },
+    "count": 0,
+    "sum": 0,
+    "sum_squared": 0
+  },
+  {
+    "name": {
+      "name": "training_energy_cost",
+      "split": "valid"
+    },
+    "count": 0,
+    "sum": 0,
+    "sum_squared": 0
+  },
+  {
+    "name": {
+      "name": "inference_runtime",
+      "split": "valid"
+    },
+    "count": 1,
+    "sum": 0.6785750389099121,
+    "sum_squared": 0.46046408343158873,
+    "min": 0.6785750389099121,
+    "max": 0.6785750389099121,
+    "mean": 0.6785750389099121,
+    "variance": 0.0,
+    "stddev": 0.0
+  },
+  {
+    "name": {
+      "name": "batch_size",
+      "split": "valid"
+    },
+    "count": 1,
+    "sum": 1.0,
+    "sum_squared": 1.0,
+    "min": 1.0,
+    "max": 1.0,
+    "mean": 1.0,
+    "variance": 0.0,
+    "stddev": 0.0
+  },
+  {
+    "name": {
+      "name": "finish_reason_length",
+      "split": "valid"
+    },
+    "count": 1,
+    "sum": 0.0,
+    "sum_squared": 0.0,
+    "min": 0.0,
+    "max": 0.0,
+    "mean": 0.0,
+    "variance": 0.0,
+    "stddev": 0.0
+  },
+  {
+    "name": {
+      "name": "finish_reason_stop",
+      "split": "valid"
+    },
+    "count": 1,
+    "sum": 0.0,
+    "sum_squared": 0.0,
+    "min": 0.0,
+    "max": 0.0,
+    "mean": 0.0,
+    "variance": 0.0,
+    "stddev": 0.0
+  },
+  {
+    "name": {
+      "name": "finish_reason_endoftext",
+      "split": "valid"
+    },
+    "count": 1,
+    "sum": 0.0,
+    "sum_squared": 0.0,
+    "min": 0.0,
+    "max": 0.0,
+    "mean": 0.0,
+    "variance": 0.0,
+    "stddev": 0.0
+  },
+  {
+    "name": {
+      "name": "finish_reason_unknown",
+      "split": "valid"
+    },
+    "count": 1,
+    "sum": 1.0,
+    "sum_squared": 1.0,
+    "min": 1.0,
+    "max": 1.0,
+    "mean": 1.0,
+    "variance": 0.0,
+    "stddev": 0.0
+  },
+  {
+    "name": {
+      "name": "num_train_instances",
+      "split": "valid"
+    },
+    "count": 1,
+    "sum": 5.0,
+    "sum_squared": 25.0,
+    "min": 5.0,
+    "max": 5.0,
+    "mean": 5.0,
+    "variance": 0.0,
+    "stddev": 0.0
+  },
+  {
+    "name": {
+      "name": "prompt_truncated",
+      "split": "valid"
+    },
+    "count": 1,
+    "sum": 0.0,
+    "sum_squared": 0.0,
+    "min": 0.0,
+    "max": 0.0,
+    "mean": 0.0,
+    "variance": 0.0,
+    "stddev": 0.0
+  },
+  {
+    "name": {
+      "name": "max_prob",
+      "split": "valid"
+    },
+    "count": 1,
+    "sum": 1.0,
+    "sum_squared": 1.0,
+    "min": 1.0,
+    "max": 1.0,
+    "mean": 1.0,
+    "variance": 0.0,
+    "stddev": 0.0
+  },
+  {
+    "name": {
+      "name": "exact_match",
+      "split": "valid"
+    },
+    "count": 1,
+    "sum": 0.0,
+    "sum_squared": 0.0,
+    "min": 0.0,
+    "max": 0.0,
+    "mean": 0.0,
+    "variance": 0.0,
+    "stddev": 0.0
+  },
+  {
+    "name": {
+      "name": "exact_match@5",
+      "split": "valid"
+    },
+    "count": 1,
+    "sum": 0.0,
+    "sum_squared": 0.0,
+    "min": 0.0,
+    "max": 0.0,
+    "mean": 0.0,
+    "variance": 0.0,
+    "stddev": 0.0
+  },
+  {
+    "name": {
+      "name": "quasi_exact_match",
+      "split": "valid"
+    },
+    "count": 1,
+    "sum": 0.0,
+    "sum_squared": 0.0,
+    "min": 0.0,
+    "max": 0.0,
+    "mean": 0.0,
+    "variance": 0.0,
+    "stddev": 0.0
+  },
+  {
+    "name": {
+      "name": "quasi_exact_match@5",
+      "split": "valid"
+    },
+    "count": 1,
+    "sum": 0.0,
+    "sum_squared": 0.0,
+    "min": 0.0,
+    "max": 0.0,
+    "mean": 0.0,
+    "variance": 0.0,
+    "stddev": 0.0
+  },
+  {
+    "name": {
+      "name": "prefix_exact_match",
+      "split": "valid"
+    },
+    "count": 1,
+    "sum": 0.0,
+    "sum_squared": 0.0,
+    "min": 0.0,
+    "max": 0.0,
+    "mean": 0.0,
+    "variance": 0.0,
+    "stddev": 0.0
+  },
+  {
+    "name": {
+      "name": "prefix_exact_match@5",
+      "split": "valid"
+    },
+    "count": 1,
+    "sum": 0.0,
+    "sum_squared": 0.0,
+    "min": 0.0,
+    "max": 0.0,
+    "mean": 0.0,
+    "variance": 0.0,
+    "stddev": 0.0
+  },
+  {
+    "name": {
+      "name": "quasi_prefix_exact_match",
+      "split": "valid"
+    },
+    "count": 1,
+    "sum": 0.0,
+    "sum_squared": 0.0,
+    "min": 0.0,
+    "max": 0.0,
+    "mean": 0.0,
+    "variance": 0.0,
+    "stddev": 0.0
+  },
+  {
+    "name": {
+      "name": "quasi_prefix_exact_match@5",
+      "split": "valid"
+    },
+    "count": 1,
+    "sum": 0.0,
+    "sum_squared": 0.0,
+    "min": 0.0,
+    "max": 0.0,
+    "mean": 0.0,
+    "variance": 0.0,
+    "stddev": 0.0
+  },
+  {
+    "name": {
+      "name": "logprob",
+      "split": "valid"
+    },
+    "count": 1,
+    "sum": 0.0,
+    "sum_squared": 0.0,
+    "min": 0.0,
+    "max": 0.0,
+    "mean": 0.0,
+    "variance": 0.0,
+    "stddev": 0.0
+  },
+  {
+    "name": {
+      "name": "num_perplexity_tokens",
+      "split": "valid"
+    },
+    "count": 1,
+    "sum": 1.0,
+    "sum_squared": 1.0,
+    "min": 1.0,
+    "max": 1.0,
+    "mean": 1.0,
+    "variance": 0.0,
+    "stddev": 0.0
+  },
+  {
+    "name": {
+      "name": "num_bytes",
+      "split": "valid"
+    },
+    "count": 1,
+    "sum": 2.0,
+    "sum_squared": 4.0,
+    "min": 2.0,
+    "max": 2.0,
+    "mean": 2.0,
+    "variance": 0.0,
+    "stddev": 0.0
+  },
+  {
+    "name": {
+      "name": "perplexity",
+      "split": "test"
+    },
+    "count": 1,
+    "sum": 1.0,
+    "sum_squared": 1.0,
+    "min": 1.0,
+    "max": 1.0,
+    "mean": 1.0,
+    "variance": 0.0,
+    "stddev": 0.0
+  },
+  {
+    "name": {
+      "name": "bits_per_byte",
+      "split": "test"
+    },
+    "count": 1,
+    "sum": 0.0,
+    "sum_squared": 0.0,
+    "min": 0.0,
+    "max": 0.0,
+    "mean": 0.0,
+    "variance": 0.0,
+    "stddev": 0.0
+  },
+  {
+    "name": {
+      "name": "logprob_per_byte",
+      "split": "test"
+    },
+    "count": 1,
+    "sum": 0.0,
+    "sum_squared": 0.0,
+    "min": 0.0,
+    "max": 0.0,
+    "mean": 0.0,
+    "variance": 0.0,
+    "stddev": 0.0
+  },
+  {
+    "name": {
+      "name": "perplexity",
+      "split": "valid"
+    },
+    "count": 1,
+    "sum": 1.0,
+    "sum_squared": 1.0,
+    "min": 1.0,
+    "max": 1.0,
+    "mean": 1.0,
+    "variance": 0.0,
+    "stddev": 0.0
+  },
+  {
+    "name": {
+      "name": "bits_per_byte",
+      "split": "valid"
+    },
+    "count": 1,
+    "sum": 0.0,
+    "sum_squared": 0.0,
+    "min": 0.0,
+    "max": 0.0,
+    "mean": 0.0,
+    "variance": 0.0,
+    "stddev": 0.0
+  },
+  {
+    "name": {
+      "name": "logprob_per_byte",
+      "split": "valid"
+    },
+    "count": 1,
+    "sum": 0.0,
+    "sum_squared": 0.0,
+    "min": 0.0,
+    "max": 0.0,
+    "mean": 0.0,
+    "variance": 0.0,
+    "stddev": 0.0
+  },
+  {
+    "name": {
+      "name": "num_references",
+      "split": "test",
+      "perturbation": {
+        "name": "robustness",
+        "robustness": true,
+        "fairness": false,
+        "computed_on": "worst"
+      }
+    },
+    "count": 1,
+    "sum": 4.0,
+    "sum_squared": 16.0,
+    "min": 4.0,
+    "max": 4.0,
+    "mean": 4.0,
+    "variance": 0.0,
+    "stddev": 0.0
+  },
+  {
+    "name": {
+      "name": "num_references",
+      "split": "test",
+      "perturbation": {
+        "name": "fairness",
+        "robustness": false,
+        "fairness": true,
+        "computed_on": "worst"
+      }
+    },
+    "count": 1,
+    "sum": 4.0,
+    "sum_squared": 16.0,
+    "min": 4.0,
+    "max": 4.0,
+    "mean": 4.0,
+    "variance": 0.0,
+    "stddev": 0.0
+  },
+  {
+    "name": {
+      "name": "num_train_trials",
+      "split": "test",
+      "perturbation": {
+        "name": "robustness",
+        "robustness": true,
+        "fairness": false,
+        "computed_on": "worst"
+      }
+    },
+    "count": 1,
+    "sum": 1.0,
+    "sum_squared": 1.0,
+    "min": 1.0,
+    "max": 1.0,
+    "mean": 1.0,
+    "variance": 0.0,
+    "stddev": 0.0
+  },
+  {
+    "name": {
+      "name": "num_train_trials",
+      "split": "test",
+      "perturbation": {
+        "name": "fairness",
+        "robustness": false,
+        "fairness": true,
+        "computed_on": "worst"
+      }
+    },
+    "count": 1,
+    "sum": 1.0,
+    "sum_squared": 1.0,
+    "min": 1.0,
+    "max": 1.0,
+    "mean": 1.0,
+    "variance": 0.0,
+    "stddev": 0.0
+  },
+  {
+    "name": {
+      "name": "num_prompt_tokens",
+      "split": "test",
+      "perturbation": {
+        "name": "robustness",
+        "robustness": true,
+        "fairness": false,
+        "computed_on": "worst"
+      }
+    },
+    "count": 1,
+    "sum": 350.44444444444446,
+    "sum_squared": 122811.30864197532,
+    "min": 350.44444444444446,
+    "max": 350.44444444444446,
+    "mean": 350.44444444444446,
+    "variance": 0.0,
+    "stddev": 0.0
+  },
+  {
+    "name": {
+      "name": "num_prompt_tokens",
+      "split": "test",
+      "perturbation": {
+        "name": "fairness",
+        "robustness": false,
+        "fairness": true,
+        "computed_on": "worst"
+      }
+    },
+    "count": 1,
+    "sum": 350.44444444444446,
+    "sum_squared": 122811.30864197532,
+    "min": 350.44444444444446,
+    "max": 350.44444444444446,
+    "mean": 350.44444444444446,
+    "variance": 0.0,
+    "stddev": 0.0
+  },
+  {
+    "name": {
+      "name": "num_completion_tokens",
+      "split": "test",
+      "perturbation": {
+        "name": "robustness",
+        "robustness": true,
+        "fairness": false,
+        "computed_on": "worst"
+      }
+    },
+    "count": 1,
+    "sum": 1.0,
+    "sum_squared": 1.0,
+    "min": 1.0,
+    "max": 1.0,
+    "mean": 1.0,
+    "variance": 0.0,
+    "stddev": 0.0
+  },
+  {
+    "name": {
+      "name": "num_completion_tokens",
+      "split": "test",
+      "perturbation": {
+        "name": "fairness",
+        "robustness": false,
+        "fairness": true,
+        "computed_on": "worst"
+      }
+    },
+    "count": 1,
+    "sum": 1.0,
+    "sum_squared": 1.0,
+    "min": 1.0,
+    "max": 1.0,
+    "mean": 1.0,
+    "variance": 0.0,
+    "stddev": 0.0
+  },
+  {
+    "name": {
+      "name": "num_output_tokens",
+      "split": "test",
+      "perturbation": {
+        "name": "robustness",
+        "robustness": true,
+        "fairness": false,
+        "computed_on": "worst"
+      }
+    },
+    "count": 1,
+    "sum": 1.0,
+    "sum_squared": 1.0,
+    "min": 1.0,
+    "max": 1.0,
+    "mean": 1.0,
+    "variance": 0.0,
+    "stddev": 0.0
+  },
+  {
+    "name": {
+      "name": "num_output_tokens",
+      "split": "test",
+      "perturbation": {
+        "name": "fairness",
+        "robustness": false,
+        "fairness": true,
+        "computed_on": "worst"
+      }
+    },
+    "count": 1,
+    "sum": 1.0,
+    "sum_squared": 1.0,
+    "min": 1.0,
+    "max": 1.0,
+    "mean": 1.0,
+    "variance": 0.0,
+    "stddev": 0.0
+  },
+  {
+    "name": {
+      "name": "inference_runtime",
+      "split": "test",
+      "perturbation": {
+        "name": "robustness",
+        "robustness": true,
+        "fairness": false,
+        "computed_on": "worst"
+      }
+    },
+    "count": 1,
+    "sum": 0.2946600914001465,
+    "sum_squared": 0.08682456946394268,
+    "min": 0.2946600914001465,
+    "max": 0.2946600914001465,
+    "mean": 0.2946600914001465,
+    "variance": 0.0,
+    "stddev": 0.0
+  },
+  {
+    "name": {
+      "name": "inference_runtime",
+      "split": "test",
+      "perturbation": {
+        "name": "fairness",
+        "robustness": false,
+        "fairness": true,
+        "computed_on": "worst"
+      }
+    },
+    "count": 1,
+    "sum": 0.2946600914001465,
+    "sum_squared": 0.08682456946394268,
+    "min": 0.2946600914001465,
+    "max": 0.2946600914001465,
+    "mean": 0.2946600914001465,
+    "variance": 0.0,
+    "stddev": 0.0
+  },
+  {
+    "name": {
+      "name": "batch_size",
+      "split": "test",
+      "perturbation": {
+        "name": "robustness",
+        "robustness": true,
+        "fairness": false,
+        "computed_on": "worst"
+      }
+    },
+    "count": 1,
+    "sum": 1.0,
+    "sum_squared": 1.0,
+    "min": 1.0,
+    "max": 1.0,
+    "mean": 1.0,
+    "variance": 0.0,
+    "stddev": 0.0
+  },
+  {
+    "name": {
+      "name": "batch_size",
+      "split": "test",
+      "perturbation": {
+        "name": "fairness",
+        "robustness": false,
+        "fairness": true,
+        "computed_on": "worst"
+      }
+    },
+    "count": 1,
+    "sum": 1.0,
+    "sum_squared": 1.0,
+    "min": 1.0,
+    "max": 1.0,
+    "mean": 1.0,
+    "variance": 0.0,
+    "stddev": 0.0
+  },
+  {
+    "name": {
+      "name": "finish_reason_length",
+      "split": "test",
+      "perturbation": {
+        "name": "robustness",
+        "robustness": true,
+        "fairness": false,
+        "computed_on": "worst"
+      }
+    },
+    "count": 1,
+    "sum": 0.0,
+    "sum_squared": 0.0,
+    "min": 0.0,
+    "max": 0.0,
+    "mean": 0.0,
+    "variance": 0.0,
+    "stddev": 0.0
+  },
+  {
+    "name": {
+      "name": "finish_reason_length",
+      "split": "test",
+      "perturbation": {
+        "name": "fairness",
+        "robustness": false,
+        "fairness": true,
+        "computed_on": "worst"
+      }
+    },
+    "count": 1,
+    "sum": 0.0,
+    "sum_squared": 0.0,
+    "min": 0.0,
+    "max": 0.0,
+    "mean": 0.0,
+    "variance": 0.0,
+    "stddev": 0.0
+  },
+  {
+    "name": {
+      "name": "finish_reason_stop",
+      "split": "test",
+      "perturbation": {
+        "name": "robustness",
+        "robustness": true,
+        "fairness": false,
+        "computed_on": "worst"
+      }
+    },
+    "count": 1,
+    "sum": 0.0,
+    "sum_squared": 0.0,
+    "min": 0.0,
+    "max": 0.0,
+    "mean": 0.0,
+    "variance": 0.0,
+    "stddev": 0.0
+  },
+  {
+    "name": {
+      "name": "finish_reason_stop",
+      "split": "test",
+      "perturbation": {
+        "name": "fairness",
+        "robustness": false,
+        "fairness": true,
+        "computed_on": "worst"
+      }
+    },
+    "count": 1,
+    "sum": 0.0,
+    "sum_squared": 0.0,
+    "min": 0.0,
+    "max": 0.0,
+    "mean": 0.0,
+    "variance": 0.0,
+    "stddev": 0.0
+  },
+  {
+    "name": {
+      "name": "finish_reason_endoftext",
+      "split": "test",
+      "perturbation": {
+        "name": "robustness",
+        "robustness": true,
+        "fairness": false,
+        "computed_on": "worst"
+      }
+    },
+    "count": 1,
+    "sum": 0.0,
+    "sum_squared": 0.0,
+    "min": 0.0,
+    "max": 0.0,
+    "mean": 0.0,
+    "variance": 0.0,
+    "stddev": 0.0
+  },
+  {
+    "name": {
+      "name": "finish_reason_endoftext",
+      "split": "test",
+      "perturbation": {
+        "name": "fairness",
+        "robustness": false,
+        "fairness": true,
+        "computed_on": "worst"
+      }
+    },
+    "count": 1,
+    "sum": 0.0,
+    "sum_squared": 0.0,
+    "min": 0.0,
+    "max": 0.0,
+    "mean": 0.0,
+    "variance": 0.0,
+    "stddev": 0.0
+  },
+  {
+    "name": {
+      "name": "finish_reason_unknown",
+      "split": "test",
+      "perturbation": {
+        "name": "robustness",
+        "robustness": true,
+        "fairness": false,
+        "computed_on": "worst"
+      }
+    },
+    "count": 1,
+    "sum": 1.0,
+    "sum_squared": 1.0,
+    "min": 1.0,
+    "max": 1.0,
+    "mean": 1.0,
+    "variance": 0.0,
+    "stddev": 0.0
+  },
+  {
+    "name": {
+      "name": "finish_reason_unknown",
+      "split": "test",
+      "perturbation": {
+        "name": "fairness",
+        "robustness": false,
+        "fairness": true,
+        "computed_on": "worst"
+      }
+    },
+    "count": 1,
+    "sum": 1.0,
+    "sum_squared": 1.0,
+    "min": 1.0,
+    "max": 1.0,
+    "mean": 1.0,
+    "variance": 0.0,
+    "stddev": 0.0
+  },
+  {
+    "name": {
+      "name": "num_train_instances",
+      "split": "test",
+      "perturbation": {
+        "name": "robustness",
+        "robustness": true,
+        "fairness": false,
+        "computed_on": "worst"
+      }
+    },
+    "count": 1,
+    "sum": 5.0,
+    "sum_squared": 25.0,
+    "min": 5.0,
+    "max": 5.0,
+    "mean": 5.0,
+    "variance": 0.0,
+    "stddev": 0.0
+  },
+  {
+    "name": {
+      "name": "num_train_instances",
+      "split": "test",
+      "perturbation": {
+        "name": "fairness",
+        "robustness": false,
+        "fairness": true,
+        "computed_on": "worst"
+      }
+    },
+    "count": 1,
+    "sum": 5.0,
+    "sum_squared": 25.0,
+    "min": 5.0,
+    "max": 5.0,
+    "mean": 5.0,
+    "variance": 0.0,
+    "stddev": 0.0
+  },
+  {
+    "name": {
+      "name": "prompt_truncated",
+      "split": "test",
+      "perturbation": {
+        "name": "robustness",
+        "robustness": true,
+        "fairness": false,
+        "computed_on": "worst"
+      }
+    },
+    "count": 1,
+    "sum": 0.0,
+    "sum_squared": 0.0,
+    "min": 0.0,
+    "max": 0.0,
+    "mean": 0.0,
+    "variance": 0.0,
+    "stddev": 0.0
+  },
+  {
+    "name": {
+      "name": "prompt_truncated",
+      "split": "test",
+      "perturbation": {
+        "name": "fairness",
+        "robustness": false,
+        "fairness": true,
+        "computed_on": "worst"
+      }
+    },
+    "count": 1,
+    "sum": 0.0,
+    "sum_squared": 0.0,
+    "min": 0.0,
+    "max": 0.0,
+    "mean": 0.0,
+    "variance": 0.0,
+    "stddev": 0.0
+  },
+  {
+    "name": {
+      "name": "max_prob",
+      "split": "test",
+      "perturbation": {
+        "name": "robustness",
+        "robustness": true,
+        "fairness": false,
+        "computed_on": "worst"
+      }
+    },
+    "count": 1,
+    "sum": 1.0,
+    "sum_squared": 1.0,
+    "min": 1.0,
+    "max": 1.0,
+    "mean": 1.0,
+    "variance": 0.0,
+    "stddev": 0.0
+  },
+  {
+    "name": {
+      "name": "max_prob",
+      "split": "test",
+      "perturbation": {
+        "name": "fairness",
+        "robustness": false,
+        "fairness": true,
+        "computed_on": "worst"
+      }
+    },
+    "count": 1,
+    "sum": 1.0,
+    "sum_squared": 1.0,
+    "min": 1.0,
+    "max": 1.0,
+    "mean": 1.0,
+    "variance": 0.0,
+    "stddev": 0.0
+  },
+  {
+    "name": {
+      "name": "exact_match",
+      "split": "test",
+      "perturbation": {
+        "name": "robustness",
+        "robustness": true,
+        "fairness": false,
+        "computed_on": "worst"
+      }
+    },
+    "count": 1,
+    "sum": 0.1111111111111111,
+    "sum_squared": 0.012345679012345678,
+    "min": 0.1111111111111111,
+    "max": 0.1111111111111111,
+    "mean": 0.1111111111111111,
+    "variance": 0.0,
+    "stddev": 0.0
+  },
+  {
+    "name": {
+      "name": "exact_match",
+      "split": "test",
+      "perturbation": {
+        "name": "fairness",
+        "robustness": false,
+        "fairness": true,
+        "computed_on": "worst"
+      }
+    },
+    "count": 1,
+    "sum": 0.1111111111111111,
+    "sum_squared": 0.012345679012345678,
+    "min": 0.1111111111111111,
+    "max": 0.1111111111111111,
+    "mean": 0.1111111111111111,
+    "variance": 0.0,
+    "stddev": 0.0
+  },
+  {
+    "name": {
+      "name": "exact_match@5",
+      "split": "test",
+      "perturbation": {
+        "name": "robustness",
+        "robustness": true,
+        "fairness": false,
+        "computed_on": "worst"
+      }
+    },
+    "count": 1,
+    "sum": 0.1111111111111111,
+    "sum_squared": 0.012345679012345678,
+    "min": 0.1111111111111111,
+    "max": 0.1111111111111111,
+    "mean": 0.1111111111111111,
+    "variance": 0.0,
+    "stddev": 0.0
+  },
+  {
+    "name": {
+      "name": "exact_match@5",
+      "split": "test",
+      "perturbation": {
+        "name": "fairness",
+        "robustness": false,
+        "fairness": true,
+        "computed_on": "worst"
+      }
+    },
+    "count": 1,
+    "sum": 0.1111111111111111,
+    "sum_squared": 0.012345679012345678,
+    "min": 0.1111111111111111,
+    "max": 0.1111111111111111,
+    "mean": 0.1111111111111111,
+    "variance": 0.0,
+    "stddev": 0.0
+  },
+  {
+    "name": {
+      "name": "quasi_exact_match",
+      "split": "test",
+      "perturbation": {
+        "name": "robustness",
+        "robustness": true,
+        "fairness": false,
+        "computed_on": "worst"
+      }
+    },
+    "count": 1,
+    "sum": 0.1111111111111111,
+    "sum_squared": 0.012345679012345678,
+    "min": 0.1111111111111111,
+    "max": 0.1111111111111111,
+    "mean": 0.1111111111111111,
+    "variance": 0.0,
+    "stddev": 0.0
+  },
+  {
+    "name": {
+      "name": "quasi_exact_match",
+      "split": "test",
+      "perturbation": {
+        "name": "fairness",
+        "robustness": false,
+        "fairness": true,
+        "computed_on": "worst"
+      }
+    },
+    "count": 1,
+    "sum": 0.1111111111111111,
+    "sum_squared": 0.012345679012345678,
+    "min": 0.1111111111111111,
+    "max": 0.1111111111111111,
+    "mean": 0.1111111111111111,
+    "variance": 0.0,
+    "stddev": 0.0
+  },
+  {
+    "name": {
+      "name": "quasi_exact_match@5",
+      "split": "test",
+      "perturbation": {
+        "name": "robustness",
+        "robustness": true,
+        "fairness": false,
+        "computed_on": "worst"
+      }
+    },
+    "count": 1,
+    "sum": 0.1111111111111111,
+    "sum_squared": 0.012345679012345678,
+    "min": 0.1111111111111111,
+    "max": 0.1111111111111111,
+    "mean": 0.1111111111111111,
+    "variance": 0.0,
+    "stddev": 0.0
+  },
+  {
+    "name": {
+      "name": "quasi_exact_match@5",
+      "split": "test",
+      "perturbation": {
+        "name": "fairness",
+        "robustness": false,
+        "fairness": true,
+        "computed_on": "worst"
+      }
+    },
+    "count": 1,
+    "sum": 0.1111111111111111,
+    "sum_squared": 0.012345679012345678,
+    "min": 0.1111111111111111,
+    "max": 0.1111111111111111,
+    "mean": 0.1111111111111111,
+    "variance": 0.0,
+    "stddev": 0.0
+  },
+  {
+    "name": {
+      "name": "prefix_exact_match",
+      "split": "test",
+      "perturbation": {
+        "name": "robustness",
+        "robustness": true,
+        "fairness": false,
+        "computed_on": "worst"
+      }
+    },
+    "count": 1,
+    "sum": 0.1111111111111111,
+    "sum_squared": 0.012345679012345678,
+    "min": 0.1111111111111111,
+    "max": 0.1111111111111111,
+    "mean": 0.1111111111111111,
+    "variance": 0.0,
+    "stddev": 0.0
+  },
+  {
+    "name": {
+      "name": "prefix_exact_match",
+      "split": "test",
+      "perturbation": {
+        "name": "fairness",
+        "robustness": false,
+        "fairness": true,
+        "computed_on": "worst"
+      }
+    },
+    "count": 1,
+    "sum": 0.1111111111111111,
+    "sum_squared": 0.012345679012345678,
+    "min": 0.1111111111111111,
+    "max": 0.1111111111111111,
+    "mean": 0.1111111111111111,
+    "variance": 0.0,
+    "stddev": 0.0
+  },
+  {
+    "name": {
+      "name": "prefix_exact_match@5",
+      "split": "test",
+      "perturbation": {
+        "name": "robustness",
+        "robustness": true,
+        "fairness": false,
+        "computed_on": "worst"
+      }
+    },
+    "count": 1,
+    "sum": 0.1111111111111111,
+    "sum_squared": 0.012345679012345678,
+    "min": 0.1111111111111111,
+    "max": 0.1111111111111111,
+    "mean": 0.1111111111111111,
+    "variance": 0.0,
+    "stddev": 0.0
+  },
+  {
+    "name": {
+      "name": "prefix_exact_match@5",
+      "split": "test",
+      "perturbation": {
+        "name": "fairness",
+        "robustness": false,
+        "fairness": true,
+        "computed_on": "worst"
+      }
+    },
+    "count": 1,
+    "sum": 0.1111111111111111,
+    "sum_squared": 0.012345679012345678,
+    "min": 0.1111111111111111,
+    "max": 0.1111111111111111,
+    "mean": 0.1111111111111111,
+    "variance": 0.0,
+    "stddev": 0.0
+  },
+  {
+    "name": {
+      "name": "quasi_prefix_exact_match",
+      "split": "test",
+      "perturbation": {
+        "name": "robustness",
+        "robustness": true,
+        "fairness": false,
+        "computed_on": "worst"
+      }
+    },
+    "count": 1,
+    "sum": 0.1111111111111111,
+    "sum_squared": 0.012345679012345678,
+    "min": 0.1111111111111111,
+    "max": 0.1111111111111111,
+    "mean": 0.1111111111111111,
+    "variance": 0.0,
+    "stddev": 0.0
+  },
+  {
+    "name": {
+      "name": "quasi_prefix_exact_match",
+      "split": "test",
+      "perturbation": {
+        "name": "fairness",
+        "robustness": false,
+        "fairness": true,
+        "computed_on": "worst"
+      }
+    },
+    "count": 1,
+    "sum": 0.1111111111111111,
+    "sum_squared": 0.012345679012345678,
+    "min": 0.1111111111111111,
+    "max": 0.1111111111111111,
+    "mean": 0.1111111111111111,
+    "variance": 0.0,
+    "stddev": 0.0
+  },
+  {
+    "name": {
+      "name": "quasi_prefix_exact_match@5",
+      "split": "test",
+      "perturbation": {
+        "name": "robustness",
+        "robustness": true,
+        "fairness": false,
+        "computed_on": "worst"
+      }
+    },
+    "count": 1,
+    "sum": 0.1111111111111111,
+    "sum_squared": 0.012345679012345678,
+    "min": 0.1111111111111111,
+    "max": 0.1111111111111111,
+    "mean": 0.1111111111111111,
+    "variance": 0.0,
+    "stddev": 0.0
+  },
+  {
+    "name": {
+      "name": "quasi_prefix_exact_match@5",
+      "split": "test",
+      "perturbation": {
+        "name": "fairness",
+        "robustness": false,
+        "fairness": true,
+        "computed_on": "worst"
+      }
+    },
+    "count": 1,
+    "sum": 0.1111111111111111,
+    "sum_squared": 0.012345679012345678,
+    "min": 0.1111111111111111,
+    "max": 0.1111111111111111,
+    "mean": 0.1111111111111111,
+    "variance": 0.0,
+    "stddev": 0.0
+  },
+  {
+    "name": {
+      "name": "logprob",
+      "split": "test",
+      "perturbation": {
+        "name": "robustness",
+        "robustness": true,
+        "fairness": false,
+        "computed_on": "worst"
+      }
+    },
+    "count": 1,
+    "sum": 0.0,
+    "sum_squared": 0.0,
+    "min": 0.0,
+    "max": 0.0,
+    "mean": 0.0,
+    "variance": 0.0,
+    "stddev": 0.0
+  },
+  {
+    "name": {
+      "name": "logprob",
+      "split": "test",
+      "perturbation": {
+        "name": "fairness",
+        "robustness": false,
+        "fairness": true,
+        "computed_on": "worst"
+      }
+    },
+    "count": 1,
+    "sum": 0.0,
+    "sum_squared": 0.0,
+    "min": 0.0,
+    "max": 0.0,
+    "mean": 0.0,
+    "variance": 0.0,
+    "stddev": 0.0
+  },
+  {
+    "name": {
+      "name": "num_perplexity_tokens",
+      "split": "test",
+      "perturbation": {
+        "name": "robustness",
+        "robustness": true,
+        "fairness": false,
+        "computed_on": "worst"
+      }
+    },
+    "count": 1,
+    "sum": 1.0,
+    "sum_squared": 1.0,
+    "min": 1.0,
+    "max": 1.0,
+    "mean": 1.0,
+    "variance": 0.0,
+    "stddev": 0.0
+  },
+  {
+    "name": {
+      "name": "num_perplexity_tokens",
+      "split": "test",
+      "perturbation": {
+        "name": "fairness",
+        "robustness": false,
+        "fairness": true,
+        "computed_on": "worst"
+      }
+    },
+    "count": 1,
+    "sum": 1.0,
+    "sum_squared": 1.0,
+    "min": 1.0,
+    "max": 1.0,
+    "mean": 1.0,
+    "variance": 0.0,
+    "stddev": 0.0
+  },
+  {
+    "name": {
+      "name": "num_bytes",
+      "split": "test",
+      "perturbation": {
+        "name": "robustness",
+        "robustness": true,
+        "fairness": false,
+        "computed_on": "worst"
+      }
+    },
+    "count": 1,
+    "sum": 2.0,
+    "sum_squared": 4.0,
+    "min": 2.0,
+    "max": 2.0,
+    "mean": 2.0,
+    "variance": 0.0,
+    "stddev": 0.0
+  },
+  {
+    "name": {
+      "name": "num_bytes",
+      "split": "test",
+      "perturbation": {
+        "name": "fairness",
+        "robustness": false,
+        "fairness": true,
+        "computed_on": "worst"
+      }
+    },
+    "count": 1,
+    "sum": 2.0,
+    "sum_squared": 4.0,
+    "min": 2.0,
+    "max": 2.0,
+    "mean": 2.0,
+    "variance": 0.0,
+    "stddev": 0.0
+  },
+  {
+    "name": {
+      "name": "num_references",
+      "split": "valid",
+      "perturbation": {
+        "name": "robustness",
+        "robustness": true,
+        "fairness": false,
+        "computed_on": "worst"
+      }
+    },
+    "count": 1,
+    "sum": 4.0,
+    "sum_squared": 16.0,
+    "min": 4.0,
+    "max": 4.0,
+    "mean": 4.0,
+    "variance": 0.0,
+    "stddev": 0.0
+  },
+  {
+    "name": {
+      "name": "num_references",
+      "split": "valid",
+      "perturbation": {
+        "name": "fairness",
+        "robustness": false,
+        "fairness": true,
+        "computed_on": "worst"
+      }
+    },
+    "count": 1,
+    "sum": 4.0,
+    "sum_squared": 16.0,
+    "min": 4.0,
+    "max": 4.0,
+    "mean": 4.0,
+    "variance": 0.0,
+    "stddev": 0.0
+  },
+  {
+    "name": {
+      "name": "num_train_trials",
+      "split": "valid",
+      "perturbation": {
+        "name": "robustness",
+        "robustness": true,
+        "fairness": false,
+        "computed_on": "worst"
+      }
+    },
+    "count": 1,
+    "sum": 1.0,
+    "sum_squared": 1.0,
+    "min": 1.0,
+    "max": 1.0,
+    "mean": 1.0,
+    "variance": 0.0,
+    "stddev": 0.0
+  },
+  {
+    "name": {
+      "name": "num_train_trials",
+      "split": "valid",
+      "perturbation": {
+        "name": "fairness",
+        "robustness": false,
+        "fairness": true,
+        "computed_on": "worst"
+      }
+    },
+    "count": 1,
+    "sum": 1.0,
+    "sum_squared": 1.0,
+    "min": 1.0,
+    "max": 1.0,
+    "mean": 1.0,
+    "variance": 0.0,
+    "stddev": 0.0
+  },
+  {
+    "name": {
+      "name": "num_prompt_tokens",
+      "split": "valid",
+      "perturbation": {
+        "name": "robustness",
+        "robustness": true,
+        "fairness": false,
+        "computed_on": "worst"
+      }
+    },
+    "count": 1,
+    "sum": 420.0,
+    "sum_squared": 176400.0,
+    "min": 420.0,
+    "max": 420.0,
+    "mean": 420.0,
+    "variance": 0.0,
+    "stddev": 0.0
+  },
+  {
+    "name": {
+      "name": "num_prompt_tokens",
+      "split": "valid",
+      "perturbation": {
+        "name": "fairness",
+        "robustness": false,
+        "fairness": true,
+        "computed_on": "worst"
+      }
+    },
+    "count": 1,
+    "sum": 420.0,
+    "sum_squared": 176400.0,
+    "min": 420.0,
+    "max": 420.0,
+    "mean": 420.0,
+    "variance": 0.0,
+    "stddev": 0.0
+  },
+  {
+    "name": {
+      "name": "num_completion_tokens",
+      "split": "valid",
+      "perturbation": {
+        "name": "robustness",
+        "robustness": true,
+        "fairness": false,
+        "computed_on": "worst"
+      }
+    },
+    "count": 1,
+    "sum": 1.0,
+    "sum_squared": 1.0,
+    "min": 1.0,
+    "max": 1.0,
+    "mean": 1.0,
+    "variance": 0.0,
+    "stddev": 0.0
+  },
+  {
+    "name": {
+      "name": "num_completion_tokens",
+      "split": "valid",
+      "perturbation": {
+        "name": "fairness",
+        "robustness": false,
+        "fairness": true,
+        "computed_on": "worst"
+      }
+    },
+    "count": 1,
+    "sum": 1.0,
+    "sum_squared": 1.0,
+    "min": 1.0,
+    "max": 1.0,
+    "mean": 1.0,
+    "variance": 0.0,
+    "stddev": 0.0
+  },
+  {
+    "name": {
+      "name": "num_output_tokens",
+      "split": "valid",
+      "perturbation": {
+        "name": "robustness",
+        "robustness": true,
+        "fairness": false,
+        "computed_on": "worst"
+      }
+    },
+    "count": 1,
+    "sum": 1.0,
+    "sum_squared": 1.0,
+    "min": 1.0,
+    "max": 1.0,
+    "mean": 1.0,
+    "variance": 0.0,
+    "stddev": 0.0
+  },
+  {
+    "name": {
+      "name": "num_output_tokens",
+      "split": "valid",
+      "perturbation": {
+        "name": "fairness",
+        "robustness": false,
+        "fairness": true,
+        "computed_on": "worst"
+      }
+    },
+    "count": 1,
+    "sum": 1.0,
+    "sum_squared": 1.0,
+    "min": 1.0,
+    "max": 1.0,
+    "mean": 1.0,
+    "variance": 0.0,
+    "stddev": 0.0
+  },
+  {
+    "name": {
+      "name": "inference_runtime",
+      "split": "valid",
+      "perturbation": {
+        "name": "robustness",
+        "robustness": true,
+        "fairness": false,
+        "computed_on": "worst"
+      }
+    },
+    "count": 1,
+    "sum": 0.6785750389099121,
+    "sum_squared": 0.46046408343158873,
+    "min": 0.6785750389099121,
+    "max": 0.6785750389099121,
+    "mean": 0.6785750389099121,
+    "variance": 0.0,
+    "stddev": 0.0
+  },
+  {
+    "name": {
+      "name": "inference_runtime",
+      "split": "valid",
+      "perturbation": {
+        "name": "fairness",
+        "robustness": false,
+        "fairness": true,
+        "computed_on": "worst"
+      }
+    },
+    "count": 1,
+    "sum": 0.6785750389099121,
+    "sum_squared": 0.46046408343158873,
+    "min": 0.6785750389099121,
+    "max": 0.6785750389099121,
+    "mean": 0.6785750389099121,
+    "variance": 0.0,
+    "stddev": 0.0
+  },
+  {
+    "name": {
+      "name": "batch_size",
+      "split": "valid",
+      "perturbation": {
+        "name": "robustness",
+        "robustness": true,
+        "fairness": false,
+        "computed_on": "worst"
+      }
+    },
+    "count": 1,
+    "sum": 1.0,
+    "sum_squared": 1.0,
+    "min": 1.0,
+    "max": 1.0,
+    "mean": 1.0,
+    "variance": 0.0,
+    "stddev": 0.0
+  },
+  {
+    "name": {
+      "name": "batch_size",
+      "split": "valid",
+      "perturbation": {
+        "name": "fairness",
+        "robustness": false,
+        "fairness": true,
+        "computed_on": "worst"
+      }
+    },
+    "count": 1,
+    "sum": 1.0,
+    "sum_squared": 1.0,
+    "min": 1.0,
+    "max": 1.0,
+    "mean": 1.0,
+    "variance": 0.0,
+    "stddev": 0.0
+  },
+  {
+    "name": {
+      "name": "finish_reason_length",
+      "split": "valid",
+      "perturbation": {
+        "name": "robustness",
+        "robustness": true,
+        "fairness": false,
+        "computed_on": "worst"
+      }
+    },
+    "count": 1,
+    "sum": 0.0,
+    "sum_squared": 0.0,
+    "min": 0.0,
+    "max": 0.0,
+    "mean": 0.0,
+    "variance": 0.0,
+    "stddev": 0.0
+  },
+  {
+    "name": {
+      "name": "finish_reason_length",
+      "split": "valid",
+      "perturbation": {
+        "name": "fairness",
+        "robustness": false,
+        "fairness": true,
+        "computed_on": "worst"
+      }
+    },
+    "count": 1,
+    "sum": 0.0,
+    "sum_squared": 0.0,
+    "min": 0.0,
+    "max": 0.0,
+    "mean": 0.0,
+    "variance": 0.0,
+    "stddev": 0.0
+  },
+  {
+    "name": {
+      "name": "finish_reason_stop",
+      "split": "valid",
+      "perturbation": {
+        "name": "robustness",
+        "robustness": true,
+        "fairness": false,
+        "computed_on": "worst"
+      }
+    },
+    "count": 1,
+    "sum": 0.0,
+    "sum_squared": 0.0,
+    "min": 0.0,
+    "max": 0.0,
+    "mean": 0.0,
+    "variance": 0.0,
+    "stddev": 0.0
+  },
+  {
+    "name": {
+      "name": "finish_reason_stop",
+      "split": "valid",
+      "perturbation": {
+        "name": "fairness",
+        "robustness": false,
+        "fairness": true,
+        "computed_on": "worst"
+      }
+    },
+    "count": 1,
+    "sum": 0.0,
+    "sum_squared": 0.0,
+    "min": 0.0,
+    "max": 0.0,
+    "mean": 0.0,
+    "variance": 0.0,
+    "stddev": 0.0
+  },
+  {
+    "name": {
+      "name": "finish_reason_endoftext",
+      "split": "valid",
+      "perturbation": {
+        "name": "robustness",
+        "robustness": true,
+        "fairness": false,
+        "computed_on": "worst"
+      }
+    },
+    "count": 1,
+    "sum": 0.0,
+    "sum_squared": 0.0,
+    "min": 0.0,
+    "max": 0.0,
+    "mean": 0.0,
+    "variance": 0.0,
+    "stddev": 0.0
+  },
+  {
+    "name": {
+      "name": "finish_reason_endoftext",
+      "split": "valid",
+      "perturbation": {
+        "name": "fairness",
+        "robustness": false,
+        "fairness": true,
+        "computed_on": "worst"
+      }
+    },
+    "count": 1,
+    "sum": 0.0,
+    "sum_squared": 0.0,
+    "min": 0.0,
+    "max": 0.0,
+    "mean": 0.0,
+    "variance": 0.0,
+    "stddev": 0.0
+  },
+  {
+    "name": {
+      "name": "finish_reason_unknown",
+      "split": "valid",
+      "perturbation": {
+        "name": "robustness",
+        "robustness": true,
+        "fairness": false,
+        "computed_on": "worst"
+      }
+    },
+    "count": 1,
+    "sum": 1.0,
+    "sum_squared": 1.0,
+    "min": 1.0,
+    "max": 1.0,
+    "mean": 1.0,
+    "variance": 0.0,
+    "stddev": 0.0
+  },
+  {
+    "name": {
+      "name": "finish_reason_unknown",
+      "split": "valid",
+      "perturbation": {
+        "name": "fairness",
+        "robustness": false,
+        "fairness": true,
+        "computed_on": "worst"
+      }
+    },
+    "count": 1,
+    "sum": 1.0,
+    "sum_squared": 1.0,
+    "min": 1.0,
+    "max": 1.0,
+    "mean": 1.0,
+    "variance": 0.0,
+    "stddev": 0.0
+  },
+  {
+    "name": {
+      "name": "num_train_instances",
+      "split": "valid",
+      "perturbation": {
+        "name": "robustness",
+        "robustness": true,
+        "fairness": false,
+        "computed_on": "worst"
+      }
+    },
+    "count": 1,
+    "sum": 5.0,
+    "sum_squared": 25.0,
+    "min": 5.0,
+    "max": 5.0,
+    "mean": 5.0,
+    "variance": 0.0,
+    "stddev": 0.0
+  },
+  {
+    "name": {
+      "name": "num_train_instances",
+      "split": "valid",
+      "perturbation": {
+        "name": "fairness",
+        "robustness": false,
+        "fairness": true,
+        "computed_on": "worst"
+      }
+    },
+    "count": 1,
+    "sum": 5.0,
+    "sum_squared": 25.0,
+    "min": 5.0,
+    "max": 5.0,
+    "mean": 5.0,
+    "variance": 0.0,
+    "stddev": 0.0
+  },
+  {
+    "name": {
+      "name": "prompt_truncated",
+      "split": "valid",
+      "perturbation": {
+        "name": "robustness",
+        "robustness": true,
+        "fairness": false,
+        "computed_on": "worst"
+      }
+    },
+    "count": 1,
+    "sum": 0.0,
+    "sum_squared": 0.0,
+    "min": 0.0,
+    "max": 0.0,
+    "mean": 0.0,
+    "variance": 0.0,
+    "stddev": 0.0
+  },
+  {
+    "name": {
+      "name": "prompt_truncated",
+      "split": "valid",
+      "perturbation": {
+        "name": "fairness",
+        "robustness": false,
+        "fairness": true,
+        "computed_on": "worst"
+      }
+    },
+    "count": 1,
+    "sum": 0.0,
+    "sum_squared": 0.0,
+    "min": 0.0,
+    "max": 0.0,
+    "mean": 0.0,
+    "variance": 0.0,
+    "stddev": 0.0
+  },
+  {
+    "name": {
+      "name": "max_prob",
+      "split": "valid",
+      "perturbation": {
+        "name": "robustness",
+        "robustness": true,
+        "fairness": false,
+        "computed_on": "worst"
+      }
+    },
+    "count": 1,
+    "sum": 1.0,
+    "sum_squared": 1.0,
+    "min": 1.0,
+    "max": 1.0,
+    "mean": 1.0,
+    "variance": 0.0,
+    "stddev": 0.0
+  },
+  {
+    "name": {
+      "name": "max_prob",
+      "split": "valid",
+      "perturbation": {
+        "name": "fairness",
+        "robustness": false,
+        "fairness": true,
+        "computed_on": "worst"
+      }
+    },
+    "count": 1,
+    "sum": 1.0,
+    "sum_squared": 1.0,
+    "min": 1.0,
+    "max": 1.0,
+    "mean": 1.0,
+    "variance": 0.0,
+    "stddev": 0.0
+  },
+  {
+    "name": {
+      "name": "exact_match",
+      "split": "valid",
+      "perturbation": {
+        "name": "robustness",
+        "robustness": true,
+        "fairness": false,
+        "computed_on": "worst"
+      }
+    },
+    "count": 1,
+    "sum": 0.0,
+    "sum_squared": 0.0,
+    "min": 0.0,
+    "max": 0.0,
+    "mean": 0.0,
+    "variance": 0.0,
+    "stddev": 0.0
+  },
+  {
+    "name": {
+      "name": "exact_match",
+      "split": "valid",
+      "perturbation": {
+        "name": "fairness",
+        "robustness": false,
+        "fairness": true,
+        "computed_on": "worst"
+      }
+    },
+    "count": 1,
+    "sum": 0.0,
+    "sum_squared": 0.0,
+    "min": 0.0,
+    "max": 0.0,
+    "mean": 0.0,
+    "variance": 0.0,
+    "stddev": 0.0
+  },
+  {
+    "name": {
+      "name": "exact_match@5",
+      "split": "valid",
+      "perturbation": {
+        "name": "robustness",
+        "robustness": true,
+        "fairness": false,
+        "computed_on": "worst"
+      }
+    },
+    "count": 1,
+    "sum": 0.0,
+    "sum_squared": 0.0,
+    "min": 0.0,
+    "max": 0.0,
+    "mean": 0.0,
+    "variance": 0.0,
+    "stddev": 0.0
+  },
+  {
+    "name": {
+      "name": "exact_match@5",
+      "split": "valid",
+      "perturbation": {
+        "name": "fairness",
+        "robustness": false,
+        "fairness": true,
+        "computed_on": "worst"
+      }
+    },
+    "count": 1,
+    "sum": 0.0,
+    "sum_squared": 0.0,
+    "min": 0.0,
+    "max": 0.0,
+    "mean": 0.0,
+    "variance": 0.0,
+    "stddev": 0.0
+  },
+  {
+    "name": {
+      "name": "quasi_exact_match",
+      "split": "valid",
+      "perturbation": {
+        "name": "robustness",
+        "robustness": true,
+        "fairness": false,
+        "computed_on": "worst"
+      }
+    },
+    "count": 1,
+    "sum": 0.0,
+    "sum_squared": 0.0,
+    "min": 0.0,
+    "max": 0.0,
+    "mean": 0.0,
+    "variance": 0.0,
+    "stddev": 0.0
+  },
+  {
+    "name": {
+      "name": "quasi_exact_match",
+      "split": "valid",
+      "perturbation": {
+        "name": "fairness",
+        "robustness": false,
+        "fairness": true,
+        "computed_on": "worst"
+      }
+    },
+    "count": 1,
+    "sum": 0.0,
+    "sum_squared": 0.0,
+    "min": 0.0,
+    "max": 0.0,
+    "mean": 0.0,
+    "variance": 0.0,
+    "stddev": 0.0
+  },
+  {
+    "name": {
+      "name": "quasi_exact_match@5",
+      "split": "valid",
+      "perturbation": {
+        "name": "robustness",
+        "robustness": true,
+        "fairness": false,
+        "computed_on": "worst"
+      }
+    },
+    "count": 1,
+    "sum": 0.0,
+    "sum_squared": 0.0,
+    "min": 0.0,
+    "max": 0.0,
+    "mean": 0.0,
+    "variance": 0.0,
+    "stddev": 0.0
+  },
+  {
+    "name": {
+      "name": "quasi_exact_match@5",
+      "split": "valid",
+      "perturbation": {
+        "name": "fairness",
+        "robustness": false,
+        "fairness": true,
+        "computed_on": "worst"
+      }
+    },
+    "count": 1,
+    "sum": 0.0,
+    "sum_squared": 0.0,
+    "min": 0.0,
+    "max": 0.0,
+    "mean": 0.0,
+    "variance": 0.0,
+    "stddev": 0.0
+  },
+  {
+    "name": {
+      "name": "prefix_exact_match",
+      "split": "valid",
+      "perturbation": {
+        "name": "robustness",
+        "robustness": true,
+        "fairness": false,
+        "computed_on": "worst"
+      }
+    },
+    "count": 1,
+    "sum": 0.0,
+    "sum_squared": 0.0,
+    "min": 0.0,
+    "max": 0.0,
+    "mean": 0.0,
+    "variance": 0.0,
+    "stddev": 0.0
+  },
+  {
+    "name": {
+      "name": "prefix_exact_match",
+      "split": "valid",
+      "perturbation": {
+        "name": "fairness",
+        "robustness": false,
+        "fairness": true,
+        "computed_on": "worst"
+      }
+    },
+    "count": 1,
+    "sum": 0.0,
+    "sum_squared": 0.0,
+    "min": 0.0,
+    "max": 0.0,
+    "mean": 0.0,
+    "variance": 0.0,
+    "stddev": 0.0
+  },
+  {
+    "name": {
+      "name": "prefix_exact_match@5",
+      "split": "valid",
+      "perturbation": {
+        "name": "robustness",
+        "robustness": true,
+        "fairness": false,
+        "computed_on": "worst"
+      }
+    },
+    "count": 1,
+    "sum": 0.0,
+    "sum_squared": 0.0,
+    "min": 0.0,
+    "max": 0.0,
+    "mean": 0.0,
+    "variance": 0.0,
+    "stddev": 0.0
+  },
+  {
+    "name": {
+      "name": "prefix_exact_match@5",
+      "split": "valid",
+      "perturbation": {
+        "name": "fairness",
+        "robustness": false,
+        "fairness": true,
+        "computed_on": "worst"
+      }
+    },
+    "count": 1,
+    "sum": 0.0,
+    "sum_squared": 0.0,
+    "min": 0.0,
+    "max": 0.0,
+    "mean": 0.0,
+    "variance": 0.0,
+    "stddev": 0.0
+  },
+  {
+    "name": {
+      "name": "quasi_prefix_exact_match",
+      "split": "valid",
+      "perturbation": {
+        "name": "robustness",
+        "robustness": true,
+        "fairness": false,
+        "computed_on": "worst"
+      }
+    },
+    "count": 1,
+    "sum": 0.0,
+    "sum_squared": 0.0,
+    "min": 0.0,
+    "max": 0.0,
+    "mean": 0.0,
+    "variance": 0.0,
+    "stddev": 0.0
+  },
+  {
+    "name": {
+      "name": "quasi_prefix_exact_match",
+      "split": "valid",
+      "perturbation": {
+        "name": "fairness",
+        "robustness": false,
+        "fairness": true,
+        "computed_on": "worst"
+      }
+    },
+    "count": 1,
+    "sum": 0.0,
+    "sum_squared": 0.0,
+    "min": 0.0,
+    "max": 0.0,
+    "mean": 0.0,
+    "variance": 0.0,
+    "stddev": 0.0
+  },
+  {
+    "name": {
+      "name": "quasi_prefix_exact_match@5",
+      "split": "valid",
+      "perturbation": {
+        "name": "robustness",
+        "robustness": true,
+        "fairness": false,
+        "computed_on": "worst"
+      }
+    },
+    "count": 1,
+    "sum": 0.0,
+    "sum_squared": 0.0,
+    "min": 0.0,
+    "max": 0.0,
+    "mean": 0.0,
+    "variance": 0.0,
+    "stddev": 0.0
+  },
+  {
+    "name": {
+      "name": "quasi_prefix_exact_match@5",
+      "split": "valid",
+      "perturbation": {
+        "name": "fairness",
+        "robustness": false,
+        "fairness": true,
+        "computed_on": "worst"
+      }
+    },
+    "count": 1,
+    "sum": 0.0,
+    "sum_squared": 0.0,
+    "min": 0.0,
+    "max": 0.0,
+    "mean": 0.0,
+    "variance": 0.0,
+    "stddev": 0.0
+  },
+  {
+    "name": {
+      "name": "logprob",
+      "split": "valid",
+      "perturbation": {
+        "name": "robustness",
+        "robustness": true,
+        "fairness": false,
+        "computed_on": "worst"
+      }
+    },
+    "count": 1,
+    "sum": 0.0,
+    "sum_squared": 0.0,
+    "min": 0.0,
+    "max": 0.0,
+    "mean": 0.0,
+    "variance": 0.0,
+    "stddev": 0.0
+  },
+  {
+    "name": {
+      "name": "logprob",
+      "split": "valid",
+      "perturbation": {
+        "name": "fairness",
+        "robustness": false,
+        "fairness": true,
+        "computed_on": "worst"
+      }
+    },
+    "count": 1,
+    "sum": 0.0,
+    "sum_squared": 0.0,
+    "min": 0.0,
+    "max": 0.0,
+    "mean": 0.0,
+    "variance": 0.0,
+    "stddev": 0.0
+  },
+  {
+    "name": {
+      "name": "num_perplexity_tokens",
+      "split": "valid",
+      "perturbation": {
+        "name": "robustness",
+        "robustness": true,
+        "fairness": false,
+        "computed_on": "worst"
+      }
+    },
+    "count": 1,
+    "sum": 1.0,
+    "sum_squared": 1.0,
+    "min": 1.0,
+    "max": 1.0,
+    "mean": 1.0,
+    "variance": 0.0,
+    "stddev": 0.0
+  },
+  {
+    "name": {
+      "name": "num_perplexity_tokens",
+      "split": "valid",
+      "perturbation": {
+        "name": "fairness",
+        "robustness": false,
+        "fairness": true,
+        "computed_on": "worst"
+      }
+    },
+    "count": 1,
+    "sum": 1.0,
+    "sum_squared": 1.0,
+    "min": 1.0,
+    "max": 1.0,
+    "mean": 1.0,
+    "variance": 0.0,
+    "stddev": 0.0
+  },
+  {
+    "name": {
+      "name": "num_bytes",
+      "split": "valid",
+      "perturbation": {
+        "name": "robustness",
+        "robustness": true,
+        "fairness": false,
+        "computed_on": "worst"
+      }
+    },
+    "count": 1,
+    "sum": 2.0,
+    "sum_squared": 4.0,
+    "min": 2.0,
+    "max": 2.0,
+    "mean": 2.0,
+    "variance": 0.0,
+    "stddev": 0.0
+  },
+  {
+    "name": {
+      "name": "num_bytes",
+      "split": "valid",
+      "perturbation": {
+        "name": "fairness",
+        "robustness": false,
+        "fairness": true,
+        "computed_on": "worst"
+      }
+    },
+    "count": 1,
+    "sum": 2.0,
+    "sum_squared": 4.0,
+    "min": 2.0,
+    "max": 2.0,
+    "mean": 2.0,
+    "variance": 0.0,
+    "stddev": 0.0
+  },
+  {
+    "name": {
+      "name": "num_instances",
+      "split": "test"
+    },
+    "count": 1,
+    "sum": 9.0,
+    "sum_squared": 81.0,
+    "min": 9.0,
+    "max": 9.0,
+    "mean": 9.0,
+    "variance": 0.0,
+    "stddev": 0.0
+  },
+  {
+    "name": {
+      "name": "num_instances",
+      "split": "valid"
+    },
+    "count": 1,
+    "sum": 1.0,
+    "sum_squared": 1.0,
+    "min": 1.0,
+    "max": 1.0,
+    "mean": 1.0,
+    "variance": 0.0,
+    "stddev": 0.0
+  }
+]
\ No newline at end of file

From ee35721eb7906936c2a9d0e72e9ea004205316f0 Mon Sep 17 00:00:00 2001
From: Damian Stachura <damian.stachura@evidenceprime.com>
Date: Tue, 11 Nov 2025 02:55:01 +0100
Subject: [PATCH 2/3] Fix ruff errors

---
 eval_converters/helm/adapter.py   | 8 +-------
 eval_converters/helm/converter.py | 3 +--
 2 files changed, 2 insertions(+), 9 deletions(-)

diff --git a/eval_converters/helm/adapter.py b/eval_converters/helm/adapter.py
index 36d7620..7b2bc77 100644
--- a/eval_converters/helm/adapter.py
+++ b/eval_converters/helm/adapter.py
@@ -2,7 +2,7 @@
 from typing import Any, Dict, List, Optional, Union
 from helm.benchmark.metrics.metric import PerInstanceStats
 from helm.benchmark.presentation.schema import Schema, read_schema
-from helm.benchmark.scenarios.scenario import Reference, Scenario
+from helm.benchmark.scenarios.scenario import Reference
 from helm.benchmark.adaptation.scenario_state import AdapterSpec, RequestState, ScenarioState
 from helm.benchmark.metrics.statistic import Stat
 from helm.benchmark.config_registry import register_builtin_configs_from_helm_package
@@ -27,12 +27,6 @@
 )
 
 from eval_converters.common.adapter import AdapterMetadata, BaseEvaluationAdapter, SupportedLibrary
-from eval_converters.common.error import AdapterError
-from eval_converters.common.utils import convert_timestamp_to_unix_format
-
-from eval_converters.common.adapter import BaseEvaluationAdapter, AdapterMetadata, SupportedLibrary
-
-from transformers import AutoConfig
 
 # run this just once in your process to initialize the registry
 register_builtin_configs_from_helm_package()
diff --git a/eval_converters/helm/converter.py b/eval_converters/helm/converter.py
index 04baab7..b77a706 100644
--- a/eval_converters/helm/converter.py
+++ b/eval_converters/helm/converter.py
@@ -1,10 +1,9 @@
 from __future__ import annotations
 from argparse import ArgumentParser
 import json
-import os
 from enum import Enum
 from pathlib import Path
-from typing import Dict, List, Union
+from typing import List, Union
 
 from eval_converters.helm.adapter import HELMAdapter
 from schema.eval_types import (

From c2a9940ac935d34cd0a76419bc834e5ccca56e2a Mon Sep 17 00:00:00 2001
From: Damian Stachura <damian.stachura@evidenceprime.com>
Date: Wed, 19 Nov 2025 23:24:10 +0100
Subject: [PATCH 3/3] Fixed suggestions from code review and added tests

---
 README.md                                     |    2 +-
 eval_converters/helm/adapter.py               |   47 +-
 schema/eval.schema.json                       |   44 +-
 schema/eval_types.py                          |   12 +-
 ...lm_openai_gpt2_narrativeqa_1763479296.json | 1796 +++++++++++
 .../per_instance_stats.json                   | 1732 +++++++++++
 .../run_spec.json                             |   73 +
 .../scenario.json                             |    8 +
 .../scenario_state.json                       | 1185 ++++++++
 .../narrative_qa:model=openai_gpt2/stats.json | 2634 +++++++++++++++++
 tests/test_helm_adapter.py                    |  200 +-
 11 files changed, 7652 insertions(+), 81 deletions(-)
 create mode 100644 tests/data/helm/helm_openai_gpt2_narrativeqa_1763479296.json
 create mode 100644 tests/data/helm/narrative_qa:model=openai_gpt2/per_instance_stats.json
 create mode 100644 tests/data/helm/narrative_qa:model=openai_gpt2/run_spec.json
 create mode 100644 tests/data/helm/narrative_qa:model=openai_gpt2/scenario.json
 create mode 100644 tests/data/helm/narrative_qa:model=openai_gpt2/scenario_state.json
 create mode 100644 tests/data/helm/narrative_qa:model=openai_gpt2/stats.json

diff --git a/README.md b/README.md
index 1c6f0dd..e7e925b 100644
--- a/README.md
+++ b/README.md
@@ -111,6 +111,6 @@ options:
 Run below script to perform unit tests for all evaluation platforms.
 
 ```bash
-uv run pytest -s
+uv run pytest -s --disable-warnings
 uv run ruff check 
 ```
\ No newline at end of file
diff --git a/eval_converters/helm/adapter.py b/eval_converters/helm/adapter.py
index 7b2bc77..76f9ac8 100644
--- a/eval_converters/helm/adapter.py
+++ b/eval_converters/helm/adapter.py
@@ -126,12 +126,12 @@ def transform_from_directory(
 			print(f'Error during conversion to unified schema in directory "{dir_path}": {e}')
 			return None
 	
-	def _get_correct_response(self, references: List['Reference']) -> Optional[str]:
+	def _get_correct_response(self, references: List[Reference]) -> Optional[List[str]]:
 		"""Extracts the text of the first reference that has tags."""
-		for ref in references:
-			if ref.tags:
-				return ref.output.text
-		return None
+		return [
+			ref.output.text
+			for ref in references if ref.tags
+		]
 
 	def _extract_detailed_evaluation_info_for_samples(
 		self, request_states: List[RequestState]
@@ -143,23 +143,23 @@ def _extract_detailed_evaluation_info_for_samples(
 		
 		for state in request_states:
 			references = state.instance.references or []
-			correct_response = self._get_correct_response(references)
+			correct_responses = self._get_correct_response(references)
 
 			ground_truth = None
-			if correct_response:
-				ground_truth = next(
-					(
-						choice 
-						for choice, response in state.output_mapping.items() 
-						if response in correct_response
-					),
-					None
-				)
-
-			choices_list = [
-				f'{choice}. {response}' 
-				for choice, response in state.output_mapping.items()
-			]
+			choices_list = None
+			
+			if state.output_mapping:
+				choices_list = [
+					[choice, response] for choice, response in state.output_mapping.items()
+				]
+				
+				ground_truth = [
+					choice for choice, response in state.output_mapping.items() 
+					if choice in correct_responses or response in correct_responses
+				]
+				
+			elif correct_responses:
+				ground_truth = correct_responses
 			
 			results.append(
 				DetailedEvaluationResultsPerSample(
@@ -230,11 +230,11 @@ def _transform_single(self, raw_data: Dict, source_metadata: SourceMetadata) ->
 
 		source_data = SourceData(
             dataset_name=scenario_dict.get('name'),
-            samples_number=len(request_states),
+            samples_number=len(set(state.instance.id for state in request_states)),#len(request_states),
             sample_ids=[state.instance.id for state in request_states],
 			additional_details={
 				'scenario_name': run_spec.scenario_spec.class_name,
-				'subject': run_spec.scenario_spec.args.get('subject')
+				'scenario_args': run_spec.scenario_spec.args
 			}
         )
 
@@ -256,9 +256,10 @@ def _transform_single(self, raw_data: Dict, source_metadata: SourceMetadata) ->
 		for metric_name in metric_names:
 			metric_config = MetricConfig(
 				evaluation_description=metric_name,
-				lower_is_better=False
+				lower_is_better=False # TODO is not always true, possible to fetch correct value from schema.json
 			)
 
+			# TODO consider to filter out a subset of relevant stats
 			for stat in stats:
 				if not stat.name.name.startswith(metric_name):
 					continue
diff --git a/schema/eval.schema.json b/schema/eval.schema.json
index b7c7dfc..7246f81 100644
--- a/schema/eval.schema.json
+++ b/schema/eval.schema.json
@@ -360,19 +360,49 @@
                                 "description": "Full prompt for the model"
                             },
                             "ground_truth": {
-                                "type": "string",
-                                "description": "Target response"
+                                "description": "Target response that may include one or multiple correct answers.",
+                                "oneOf": [
+                                    {
+                                        "type": "string"
+                                    },
+                                    {
+                                        "type": "array",
+                                        "items": { 
+                                            "type": "string" 
+                                        }
+                                    }
+                                ]
                             },
                             "response": {
                                 "type": "string",
                                 "description": "Response from the model"
                             },
                             "choices": {
-                                "type": "array",
-                                "description": "Array of possible responses",
-                                "items": {
-                                    "type": "string"
-                                }
+                                "description": "Either an array of possible responses (list of strings) or an array of string pairs [choice, response].",
+                                "oneOf": [
+                                    {
+                                        "type": "array",
+                                        "items": { 
+                                            "type": "string" 
+                                        }
+                                    },
+                                    {
+                                        "type": "array",
+                                        "items": {
+                                            "type": "array",
+                                            "items": [
+                                                {
+                                                    "type": "string"
+                                                },
+                                                {
+                                                    "type": "string"
+                                                }
+                                            ],
+                                            "minItems": 2,
+                                            "maxItems": 2
+                                        }
+                                    }
+                                ]
                             },
                             "full_logprobs": {
                                 "type": "array",
diff --git a/schema/eval_types.py b/schema/eval_types.py
index f287d7d..5b6870f 100644
--- a/schema/eval_types.py
+++ b/schema/eval_types.py
@@ -1,6 +1,6 @@
 # generated by datamodel-codegen:
 #   filename:  eval.schema.json
-#   timestamp: 2025-11-11T00:26:04+00:00
+#   timestamp: 2025-11-18T23:38:04+00:00
 
 from __future__ import annotations
 
@@ -153,10 +153,14 @@ class DetailedEvaluationResultsPerSample(BaseModel):
     sample_id: str = Field(..., description='Simple sample ID')
     input: str = Field(..., description='Raw input for the model')
     prompt: Optional[str] = Field(None, description='Full prompt for the model')
-    ground_truth: str = Field(..., description='Target response')
+    ground_truth: Union[str, List[str]] = Field(
+        ...,
+        description='Target response that may include one or multiple correct answers.',
+    )
     response: str = Field(..., description='Response from the model')
-    choices: Optional[List[str]] = Field(
-        None, description='Array of possible responses'
+    choices: Optional[Union[List[str], List[List[str]]]] = Field(
+        None,
+        description='Either an array of possible responses (list of strings) or an array of string pairs [choice, response].',
     )
     full_logprobs: Optional[List[List[FullLogprob]]] = Field(
         None, description='Full log probabilities generated for this sample'
diff --git a/tests/data/helm/helm_openai_gpt2_narrativeqa_1763479296.json b/tests/data/helm/helm_openai_gpt2_narrativeqa_1763479296.json
new file mode 100644
index 0000000..1f43923
--- /dev/null
+++ b/tests/data/helm/helm_openai_gpt2_narrativeqa_1763479296.json
@@ -0,0 +1,1796 @@
+{
+  "schema_version": "0.0.1",
+  "evaluation_id": "helm/openai/gpt2/narrativeqa/1763479296",
+  "retrieved_timestamp": "1763479296",
+  "source_data": {
+    "dataset_name": "narrativeqa",
+    "hf_repo": null,
+    "hf_split": null,
+    "samples_number": 5,
+    "sample_ids": [
+      "id1413",
+      "id1332",
+      "id1123",
+      "id1514",
+      "id1340"
+    ],
+    "additional_details": {
+      "scenario_name": "helm.benchmark.scenarios.narrativeqa_scenario.NarrativeQAScenario",
+      "scenario_args": {}
+    }
+  },
+  "evaluation_source": {
+    "evaluation_source_name": "helm",
+    "evaluation_source_type": "evaluation_platform"
+  },
+  "source_metadata": {
+    "source_organization_name": "Unknown",
+    "source_organization_url": null,
+    "source_organization_logo_url": null,
+    "evaluator_relationship": "other"
+  },
+  "model_info": {
+    "name": "openai/gpt2",
+    "id": "openai/gpt2",
+    "developer": "openai",
+    "inference_platform": "huggingface",
+    "inference_engine": null,
+    "additional_details": null
+  },
+  "evaluation_results": [
+    {
+      "evaluation_name": "generation",
+      "evaluation_timestamp": "1763479296",
+      "metric_config": {
+        "evaluation_description": "exact_match",
+        "lower_is_better": false,
+        "score_type": null,
+        "level_names": null,
+        "level_metadata": null,
+        "has_unknown_level": null,
+        "min_score": null,
+        "max_score": null
+      },
+      "score_details": {
+        "score": 0.0,
+        "details": {
+          "count": 1,
+          "sum": 0.0,
+          "sum_squared": 0.0,
+          "min": 0.0,
+          "max": 0.0,
+          "mean": 0.0,
+          "variance": 0.0,
+          "stddev": 0.0,
+          "split": "test",
+          "perturbation": null
+        }
+      },
+      "detailed_evaluation_results_url": null,
+      "generation_config": {
+        "temperature": 0.0,
+        "max_tokens": 100,
+        "stop_sequences": [
+          "\n"
+        ],
+        "instructions": "",
+        "input_prefix": "Passage: ",
+        "input_suffix": "\n",
+        "output_prefix": "Answer: ",
+        "output_suffix": "\n",
+        "instance_prefix": "\n"
+      }
+    },
+    {
+      "evaluation_name": "generation",
+      "evaluation_timestamp": "1763479296",
+      "metric_config": {
+        "evaluation_description": "exact_match",
+        "lower_is_better": false,
+        "score_type": null,
+        "level_names": null,
+        "level_metadata": null,
+        "has_unknown_level": null,
+        "min_score": null,
+        "max_score": null
+      },
+      "score_details": {
+        "score": 0.0,
+        "details": {
+          "count": 1,
+          "sum": 0.0,
+          "sum_squared": 0.0,
+          "min": 0.0,
+          "max": 0.0,
+          "mean": 0.0,
+          "variance": 0.0,
+          "stddev": 0.0,
+          "split": "valid",
+          "perturbation": null
+        }
+      },
+      "detailed_evaluation_results_url": null,
+      "generation_config": {
+        "temperature": 0.0,
+        "max_tokens": 100,
+        "stop_sequences": [
+          "\n"
+        ],
+        "instructions": "",
+        "input_prefix": "Passage: ",
+        "input_suffix": "\n",
+        "output_prefix": "Answer: ",
+        "output_suffix": "\n",
+        "instance_prefix": "\n"
+      }
+    },
+    {
+      "evaluation_name": "generation",
+      "evaluation_timestamp": "1763479296",
+      "metric_config": {
+        "evaluation_description": "exact_match",
+        "lower_is_better": false,
+        "score_type": null,
+        "level_names": null,
+        "level_metadata": null,
+        "has_unknown_level": null,
+        "min_score": null,
+        "max_score": null
+      },
+      "score_details": {
+        "score": 0.0,
+        "details": {
+          "count": 1,
+          "sum": 0.0,
+          "sum_squared": 0.0,
+          "min": 0.0,
+          "max": 0.0,
+          "mean": 0.0,
+          "variance": 0.0,
+          "stddev": 0.0,
+          "split": "test",
+          "perturbation": {
+            "name": "robustness",
+            "robustness": true,
+            "fairness": false,
+            "computed_on": "worst",
+            "seed": null
+          }
+        }
+      },
+      "detailed_evaluation_results_url": null,
+      "generation_config": {
+        "temperature": 0.0,
+        "max_tokens": 100,
+        "stop_sequences": [
+          "\n"
+        ],
+        "instructions": "",
+        "input_prefix": "Passage: ",
+        "input_suffix": "\n",
+        "output_prefix": "Answer: ",
+        "output_suffix": "\n",
+        "instance_prefix": "\n"
+      }
+    },
+    {
+      "evaluation_name": "generation",
+      "evaluation_timestamp": "1763479296",
+      "metric_config": {
+        "evaluation_description": "exact_match",
+        "lower_is_better": false,
+        "score_type": null,
+        "level_names": null,
+        "level_metadata": null,
+        "has_unknown_level": null,
+        "min_score": null,
+        "max_score": null
+      },
+      "score_details": {
+        "score": 0.0,
+        "details": {
+          "count": 1,
+          "sum": 0.0,
+          "sum_squared": 0.0,
+          "min": 0.0,
+          "max": 0.0,
+          "mean": 0.0,
+          "variance": 0.0,
+          "stddev": 0.0,
+          "split": "test",
+          "perturbation": {
+            "name": "fairness",
+            "robustness": false,
+            "fairness": true,
+            "computed_on": "worst",
+            "seed": null
+          }
+        }
+      },
+      "detailed_evaluation_results_url": null,
+      "generation_config": {
+        "temperature": 0.0,
+        "max_tokens": 100,
+        "stop_sequences": [
+          "\n"
+        ],
+        "instructions": "",
+        "input_prefix": "Passage: ",
+        "input_suffix": "\n",
+        "output_prefix": "Answer: ",
+        "output_suffix": "\n",
+        "instance_prefix": "\n"
+      }
+    },
+    {
+      "evaluation_name": "generation",
+      "evaluation_timestamp": "1763479296",
+      "metric_config": {
+        "evaluation_description": "exact_match",
+        "lower_is_better": false,
+        "score_type": null,
+        "level_names": null,
+        "level_metadata": null,
+        "has_unknown_level": null,
+        "min_score": null,
+        "max_score": null
+      },
+      "score_details": {
+        "score": 0.0,
+        "details": {
+          "count": 1,
+          "sum": 0.0,
+          "sum_squared": 0.0,
+          "min": 0.0,
+          "max": 0.0,
+          "mean": 0.0,
+          "variance": 0.0,
+          "stddev": 0.0,
+          "split": "valid",
+          "perturbation": {
+            "name": "robustness",
+            "robustness": true,
+            "fairness": false,
+            "computed_on": "worst",
+            "seed": null
+          }
+        }
+      },
+      "detailed_evaluation_results_url": null,
+      "generation_config": {
+        "temperature": 0.0,
+        "max_tokens": 100,
+        "stop_sequences": [
+          "\n"
+        ],
+        "instructions": "",
+        "input_prefix": "Passage: ",
+        "input_suffix": "\n",
+        "output_prefix": "Answer: ",
+        "output_suffix": "\n",
+        "instance_prefix": "\n"
+      }
+    },
+    {
+      "evaluation_name": "generation",
+      "evaluation_timestamp": "1763479296",
+      "metric_config": {
+        "evaluation_description": "exact_match",
+        "lower_is_better": false,
+        "score_type": null,
+        "level_names": null,
+        "level_metadata": null,
+        "has_unknown_level": null,
+        "min_score": null,
+        "max_score": null
+      },
+      "score_details": {
+        "score": 0.0,
+        "details": {
+          "count": 1,
+          "sum": 0.0,
+          "sum_squared": 0.0,
+          "min": 0.0,
+          "max": 0.0,
+          "mean": 0.0,
+          "variance": 0.0,
+          "stddev": 0.0,
+          "split": "valid",
+          "perturbation": {
+            "name": "fairness",
+            "robustness": false,
+            "fairness": true,
+            "computed_on": "worst",
+            "seed": null
+          }
+        }
+      },
+      "detailed_evaluation_results_url": null,
+      "generation_config": {
+        "temperature": 0.0,
+        "max_tokens": 100,
+        "stop_sequences": [
+          "\n"
+        ],
+        "instructions": "",
+        "input_prefix": "Passage: ",
+        "input_suffix": "\n",
+        "output_prefix": "Answer: ",
+        "output_suffix": "\n",
+        "instance_prefix": "\n"
+      }
+    },
+    {
+      "evaluation_name": "generation",
+      "evaluation_timestamp": "1763479296",
+      "metric_config": {
+        "evaluation_description": "quasi_exact_match",
+        "lower_is_better": false,
+        "score_type": null,
+        "level_names": null,
+        "level_metadata": null,
+        "has_unknown_level": null,
+        "min_score": null,
+        "max_score": null
+      },
+      "score_details": {
+        "score": 0.0,
+        "details": {
+          "count": 1,
+          "sum": 0.0,
+          "sum_squared": 0.0,
+          "min": 0.0,
+          "max": 0.0,
+          "mean": 0.0,
+          "variance": 0.0,
+          "stddev": 0.0,
+          "split": "test",
+          "perturbation": null
+        }
+      },
+      "detailed_evaluation_results_url": null,
+      "generation_config": {
+        "temperature": 0.0,
+        "max_tokens": 100,
+        "stop_sequences": [
+          "\n"
+        ],
+        "instructions": "",
+        "input_prefix": "Passage: ",
+        "input_suffix": "\n",
+        "output_prefix": "Answer: ",
+        "output_suffix": "\n",
+        "instance_prefix": "\n"
+      }
+    },
+    {
+      "evaluation_name": "generation",
+      "evaluation_timestamp": "1763479296",
+      "metric_config": {
+        "evaluation_description": "quasi_exact_match",
+        "lower_is_better": false,
+        "score_type": null,
+        "level_names": null,
+        "level_metadata": null,
+        "has_unknown_level": null,
+        "min_score": null,
+        "max_score": null
+      },
+      "score_details": {
+        "score": 0.0,
+        "details": {
+          "count": 1,
+          "sum": 0.0,
+          "sum_squared": 0.0,
+          "min": 0.0,
+          "max": 0.0,
+          "mean": 0.0,
+          "variance": 0.0,
+          "stddev": 0.0,
+          "split": "valid",
+          "perturbation": null
+        }
+      },
+      "detailed_evaluation_results_url": null,
+      "generation_config": {
+        "temperature": 0.0,
+        "max_tokens": 100,
+        "stop_sequences": [
+          "\n"
+        ],
+        "instructions": "",
+        "input_prefix": "Passage: ",
+        "input_suffix": "\n",
+        "output_prefix": "Answer: ",
+        "output_suffix": "\n",
+        "instance_prefix": "\n"
+      }
+    },
+    {
+      "evaluation_name": "generation",
+      "evaluation_timestamp": "1763479296",
+      "metric_config": {
+        "evaluation_description": "quasi_exact_match",
+        "lower_is_better": false,
+        "score_type": null,
+        "level_names": null,
+        "level_metadata": null,
+        "has_unknown_level": null,
+        "min_score": null,
+        "max_score": null
+      },
+      "score_details": {
+        "score": 0.0,
+        "details": {
+          "count": 1,
+          "sum": 0.0,
+          "sum_squared": 0.0,
+          "min": 0.0,
+          "max": 0.0,
+          "mean": 0.0,
+          "variance": 0.0,
+          "stddev": 0.0,
+          "split": "test",
+          "perturbation": {
+            "name": "robustness",
+            "robustness": true,
+            "fairness": false,
+            "computed_on": "worst",
+            "seed": null
+          }
+        }
+      },
+      "detailed_evaluation_results_url": null,
+      "generation_config": {
+        "temperature": 0.0,
+        "max_tokens": 100,
+        "stop_sequences": [
+          "\n"
+        ],
+        "instructions": "",
+        "input_prefix": "Passage: ",
+        "input_suffix": "\n",
+        "output_prefix": "Answer: ",
+        "output_suffix": "\n",
+        "instance_prefix": "\n"
+      }
+    },
+    {
+      "evaluation_name": "generation",
+      "evaluation_timestamp": "1763479296",
+      "metric_config": {
+        "evaluation_description": "quasi_exact_match",
+        "lower_is_better": false,
+        "score_type": null,
+        "level_names": null,
+        "level_metadata": null,
+        "has_unknown_level": null,
+        "min_score": null,
+        "max_score": null
+      },
+      "score_details": {
+        "score": 0.0,
+        "details": {
+          "count": 1,
+          "sum": 0.0,
+          "sum_squared": 0.0,
+          "min": 0.0,
+          "max": 0.0,
+          "mean": 0.0,
+          "variance": 0.0,
+          "stddev": 0.0,
+          "split": "test",
+          "perturbation": {
+            "name": "fairness",
+            "robustness": false,
+            "fairness": true,
+            "computed_on": "worst",
+            "seed": null
+          }
+        }
+      },
+      "detailed_evaluation_results_url": null,
+      "generation_config": {
+        "temperature": 0.0,
+        "max_tokens": 100,
+        "stop_sequences": [
+          "\n"
+        ],
+        "instructions": "",
+        "input_prefix": "Passage: ",
+        "input_suffix": "\n",
+        "output_prefix": "Answer: ",
+        "output_suffix": "\n",
+        "instance_prefix": "\n"
+      }
+    },
+    {
+      "evaluation_name": "generation",
+      "evaluation_timestamp": "1763479296",
+      "metric_config": {
+        "evaluation_description": "quasi_exact_match",
+        "lower_is_better": false,
+        "score_type": null,
+        "level_names": null,
+        "level_metadata": null,
+        "has_unknown_level": null,
+        "min_score": null,
+        "max_score": null
+      },
+      "score_details": {
+        "score": 0.0,
+        "details": {
+          "count": 1,
+          "sum": 0.0,
+          "sum_squared": 0.0,
+          "min": 0.0,
+          "max": 0.0,
+          "mean": 0.0,
+          "variance": 0.0,
+          "stddev": 0.0,
+          "split": "valid",
+          "perturbation": {
+            "name": "robustness",
+            "robustness": true,
+            "fairness": false,
+            "computed_on": "worst",
+            "seed": null
+          }
+        }
+      },
+      "detailed_evaluation_results_url": null,
+      "generation_config": {
+        "temperature": 0.0,
+        "max_tokens": 100,
+        "stop_sequences": [
+          "\n"
+        ],
+        "instructions": "",
+        "input_prefix": "Passage: ",
+        "input_suffix": "\n",
+        "output_prefix": "Answer: ",
+        "output_suffix": "\n",
+        "instance_prefix": "\n"
+      }
+    },
+    {
+      "evaluation_name": "generation",
+      "evaluation_timestamp": "1763479296",
+      "metric_config": {
+        "evaluation_description": "quasi_exact_match",
+        "lower_is_better": false,
+        "score_type": null,
+        "level_names": null,
+        "level_metadata": null,
+        "has_unknown_level": null,
+        "min_score": null,
+        "max_score": null
+      },
+      "score_details": {
+        "score": 0.0,
+        "details": {
+          "count": 1,
+          "sum": 0.0,
+          "sum_squared": 0.0,
+          "min": 0.0,
+          "max": 0.0,
+          "mean": 0.0,
+          "variance": 0.0,
+          "stddev": 0.0,
+          "split": "valid",
+          "perturbation": {
+            "name": "fairness",
+            "robustness": false,
+            "fairness": true,
+            "computed_on": "worst",
+            "seed": null
+          }
+        }
+      },
+      "detailed_evaluation_results_url": null,
+      "generation_config": {
+        "temperature": 0.0,
+        "max_tokens": 100,
+        "stop_sequences": [
+          "\n"
+        ],
+        "instructions": "",
+        "input_prefix": "Passage: ",
+        "input_suffix": "\n",
+        "output_prefix": "Answer: ",
+        "output_suffix": "\n",
+        "instance_prefix": "\n"
+      }
+    },
+    {
+      "evaluation_name": "generation",
+      "evaluation_timestamp": "1763479296",
+      "metric_config": {
+        "evaluation_description": "f1_score",
+        "lower_is_better": false,
+        "score_type": null,
+        "level_names": null,
+        "level_metadata": null,
+        "has_unknown_level": null,
+        "min_score": null,
+        "max_score": null
+      },
+      "score_details": {
+        "score": 0.17424242424242425,
+        "details": {
+          "count": 1,
+          "sum": 0.17424242424242425,
+          "sum_squared": 0.030360422405876955,
+          "min": 0.17424242424242425,
+          "max": 0.17424242424242425,
+          "mean": 0.17424242424242425,
+          "variance": 0.0,
+          "stddev": 0.0,
+          "split": "test",
+          "perturbation": null
+        }
+      },
+      "detailed_evaluation_results_url": null,
+      "generation_config": {
+        "temperature": 0.0,
+        "max_tokens": 100,
+        "stop_sequences": [
+          "\n"
+        ],
+        "instructions": "",
+        "input_prefix": "Passage: ",
+        "input_suffix": "\n",
+        "output_prefix": "Answer: ",
+        "output_suffix": "\n",
+        "instance_prefix": "\n"
+      }
+    },
+    {
+      "evaluation_name": "generation",
+      "evaluation_timestamp": "1763479296",
+      "metric_config": {
+        "evaluation_description": "f1_score",
+        "lower_is_better": false,
+        "score_type": null,
+        "level_names": null,
+        "level_metadata": null,
+        "has_unknown_level": null,
+        "min_score": null,
+        "max_score": null
+      },
+      "score_details": {
+        "score": 0.0,
+        "details": {
+          "count": 1,
+          "sum": 0.0,
+          "sum_squared": 0.0,
+          "min": 0.0,
+          "max": 0.0,
+          "mean": 0.0,
+          "variance": 0.0,
+          "stddev": 0.0,
+          "split": "valid",
+          "perturbation": null
+        }
+      },
+      "detailed_evaluation_results_url": null,
+      "generation_config": {
+        "temperature": 0.0,
+        "max_tokens": 100,
+        "stop_sequences": [
+          "\n"
+        ],
+        "instructions": "",
+        "input_prefix": "Passage: ",
+        "input_suffix": "\n",
+        "output_prefix": "Answer: ",
+        "output_suffix": "\n",
+        "instance_prefix": "\n"
+      }
+    },
+    {
+      "evaluation_name": "generation",
+      "evaluation_timestamp": "1763479296",
+      "metric_config": {
+        "evaluation_description": "f1_score",
+        "lower_is_better": false,
+        "score_type": null,
+        "level_names": null,
+        "level_metadata": null,
+        "has_unknown_level": null,
+        "min_score": null,
+        "max_score": null
+      },
+      "score_details": {
+        "score": 0.17424242424242425,
+        "details": {
+          "count": 1,
+          "sum": 0.17424242424242425,
+          "sum_squared": 0.030360422405876955,
+          "min": 0.17424242424242425,
+          "max": 0.17424242424242425,
+          "mean": 0.17424242424242425,
+          "variance": 0.0,
+          "stddev": 0.0,
+          "split": "test",
+          "perturbation": {
+            "name": "robustness",
+            "robustness": true,
+            "fairness": false,
+            "computed_on": "worst",
+            "seed": null
+          }
+        }
+      },
+      "detailed_evaluation_results_url": null,
+      "generation_config": {
+        "temperature": 0.0,
+        "max_tokens": 100,
+        "stop_sequences": [
+          "\n"
+        ],
+        "instructions": "",
+        "input_prefix": "Passage: ",
+        "input_suffix": "\n",
+        "output_prefix": "Answer: ",
+        "output_suffix": "\n",
+        "instance_prefix": "\n"
+      }
+    },
+    {
+      "evaluation_name": "generation",
+      "evaluation_timestamp": "1763479296",
+      "metric_config": {
+        "evaluation_description": "f1_score",
+        "lower_is_better": false,
+        "score_type": null,
+        "level_names": null,
+        "level_metadata": null,
+        "has_unknown_level": null,
+        "min_score": null,
+        "max_score": null
+      },
+      "score_details": {
+        "score": 0.17424242424242425,
+        "details": {
+          "count": 1,
+          "sum": 0.17424242424242425,
+          "sum_squared": 0.030360422405876955,
+          "min": 0.17424242424242425,
+          "max": 0.17424242424242425,
+          "mean": 0.17424242424242425,
+          "variance": 0.0,
+          "stddev": 0.0,
+          "split": "test",
+          "perturbation": {
+            "name": "fairness",
+            "robustness": false,
+            "fairness": true,
+            "computed_on": "worst",
+            "seed": null
+          }
+        }
+      },
+      "detailed_evaluation_results_url": null,
+      "generation_config": {
+        "temperature": 0.0,
+        "max_tokens": 100,
+        "stop_sequences": [
+          "\n"
+        ],
+        "instructions": "",
+        "input_prefix": "Passage: ",
+        "input_suffix": "\n",
+        "output_prefix": "Answer: ",
+        "output_suffix": "\n",
+        "instance_prefix": "\n"
+      }
+    },
+    {
+      "evaluation_name": "generation",
+      "evaluation_timestamp": "1763479296",
+      "metric_config": {
+        "evaluation_description": "f1_score",
+        "lower_is_better": false,
+        "score_type": null,
+        "level_names": null,
+        "level_metadata": null,
+        "has_unknown_level": null,
+        "min_score": null,
+        "max_score": null
+      },
+      "score_details": {
+        "score": 0.0,
+        "details": {
+          "count": 1,
+          "sum": 0.0,
+          "sum_squared": 0.0,
+          "min": 0.0,
+          "max": 0.0,
+          "mean": 0.0,
+          "variance": 0.0,
+          "stddev": 0.0,
+          "split": "valid",
+          "perturbation": {
+            "name": "robustness",
+            "robustness": true,
+            "fairness": false,
+            "computed_on": "worst",
+            "seed": null
+          }
+        }
+      },
+      "detailed_evaluation_results_url": null,
+      "generation_config": {
+        "temperature": 0.0,
+        "max_tokens": 100,
+        "stop_sequences": [
+          "\n"
+        ],
+        "instructions": "",
+        "input_prefix": "Passage: ",
+        "input_suffix": "\n",
+        "output_prefix": "Answer: ",
+        "output_suffix": "\n",
+        "instance_prefix": "\n"
+      }
+    },
+    {
+      "evaluation_name": "generation",
+      "evaluation_timestamp": "1763479296",
+      "metric_config": {
+        "evaluation_description": "f1_score",
+        "lower_is_better": false,
+        "score_type": null,
+        "level_names": null,
+        "level_metadata": null,
+        "has_unknown_level": null,
+        "min_score": null,
+        "max_score": null
+      },
+      "score_details": {
+        "score": 0.0,
+        "details": {
+          "count": 1,
+          "sum": 0.0,
+          "sum_squared": 0.0,
+          "min": 0.0,
+          "max": 0.0,
+          "mean": 0.0,
+          "variance": 0.0,
+          "stddev": 0.0,
+          "split": "valid",
+          "perturbation": {
+            "name": "fairness",
+            "robustness": false,
+            "fairness": true,
+            "computed_on": "worst",
+            "seed": null
+          }
+        }
+      },
+      "detailed_evaluation_results_url": null,
+      "generation_config": {
+        "temperature": 0.0,
+        "max_tokens": 100,
+        "stop_sequences": [
+          "\n"
+        ],
+        "instructions": "",
+        "input_prefix": "Passage: ",
+        "input_suffix": "\n",
+        "output_prefix": "Answer: ",
+        "output_suffix": "\n",
+        "instance_prefix": "\n"
+      }
+    },
+    {
+      "evaluation_name": "generation",
+      "evaluation_timestamp": "1763479296",
+      "metric_config": {
+        "evaluation_description": "rouge_l",
+        "lower_is_better": false,
+        "score_type": null,
+        "level_names": null,
+        "level_metadata": null,
+        "has_unknown_level": null,
+        "min_score": null,
+        "max_score": null
+      },
+      "score_details": {
+        "score": 0.05365510777881912,
+        "details": {
+          "count": 1,
+          "sum": 0.05365510777881912,
+          "sum_squared": 0.002878870590756696,
+          "min": 0.05365510777881912,
+          "max": 0.05365510777881912,
+          "mean": 0.05365510777881912,
+          "variance": 0.0,
+          "stddev": 0.0,
+          "split": "test",
+          "perturbation": null
+        }
+      },
+      "detailed_evaluation_results_url": null,
+      "generation_config": {
+        "temperature": 0.0,
+        "max_tokens": 100,
+        "stop_sequences": [
+          "\n"
+        ],
+        "instructions": "",
+        "input_prefix": "Passage: ",
+        "input_suffix": "\n",
+        "output_prefix": "Answer: ",
+        "output_suffix": "\n",
+        "instance_prefix": "\n"
+      }
+    },
+    {
+      "evaluation_name": "generation",
+      "evaluation_timestamp": "1763479296",
+      "metric_config": {
+        "evaluation_description": "rouge_l",
+        "lower_is_better": false,
+        "score_type": null,
+        "level_names": null,
+        "level_metadata": null,
+        "has_unknown_level": null,
+        "min_score": null,
+        "max_score": null
+      },
+      "score_details": {
+        "score": 0.03636363636363636,
+        "details": {
+          "count": 1,
+          "sum": 0.03636363636363636,
+          "sum_squared": 0.0013223140495867767,
+          "min": 0.03636363636363636,
+          "max": 0.03636363636363636,
+          "mean": 0.03636363636363636,
+          "variance": 0.0,
+          "stddev": 0.0,
+          "split": "valid",
+          "perturbation": null
+        }
+      },
+      "detailed_evaluation_results_url": null,
+      "generation_config": {
+        "temperature": 0.0,
+        "max_tokens": 100,
+        "stop_sequences": [
+          "\n"
+        ],
+        "instructions": "",
+        "input_prefix": "Passage: ",
+        "input_suffix": "\n",
+        "output_prefix": "Answer: ",
+        "output_suffix": "\n",
+        "instance_prefix": "\n"
+      }
+    },
+    {
+      "evaluation_name": "generation",
+      "evaluation_timestamp": "1763479296",
+      "metric_config": {
+        "evaluation_description": "rouge_l",
+        "lower_is_better": false,
+        "score_type": null,
+        "level_names": null,
+        "level_metadata": null,
+        "has_unknown_level": null,
+        "min_score": null,
+        "max_score": null
+      },
+      "score_details": {
+        "score": 0.05365510777881912,
+        "details": {
+          "count": 1,
+          "sum": 0.05365510777881912,
+          "sum_squared": 0.002878870590756696,
+          "min": 0.05365510777881912,
+          "max": 0.05365510777881912,
+          "mean": 0.05365510777881912,
+          "variance": 0.0,
+          "stddev": 0.0,
+          "split": "test",
+          "perturbation": {
+            "name": "robustness",
+            "robustness": true,
+            "fairness": false,
+            "computed_on": "worst",
+            "seed": null
+          }
+        }
+      },
+      "detailed_evaluation_results_url": null,
+      "generation_config": {
+        "temperature": 0.0,
+        "max_tokens": 100,
+        "stop_sequences": [
+          "\n"
+        ],
+        "instructions": "",
+        "input_prefix": "Passage: ",
+        "input_suffix": "\n",
+        "output_prefix": "Answer: ",
+        "output_suffix": "\n",
+        "instance_prefix": "\n"
+      }
+    },
+    {
+      "evaluation_name": "generation",
+      "evaluation_timestamp": "1763479296",
+      "metric_config": {
+        "evaluation_description": "rouge_l",
+        "lower_is_better": false,
+        "score_type": null,
+        "level_names": null,
+        "level_metadata": null,
+        "has_unknown_level": null,
+        "min_score": null,
+        "max_score": null
+      },
+      "score_details": {
+        "score": 0.05365510777881912,
+        "details": {
+          "count": 1,
+          "sum": 0.05365510777881912,
+          "sum_squared": 0.002878870590756696,
+          "min": 0.05365510777881912,
+          "max": 0.05365510777881912,
+          "mean": 0.05365510777881912,
+          "variance": 0.0,
+          "stddev": 0.0,
+          "split": "test",
+          "perturbation": {
+            "name": "fairness",
+            "robustness": false,
+            "fairness": true,
+            "computed_on": "worst",
+            "seed": null
+          }
+        }
+      },
+      "detailed_evaluation_results_url": null,
+      "generation_config": {
+        "temperature": 0.0,
+        "max_tokens": 100,
+        "stop_sequences": [
+          "\n"
+        ],
+        "instructions": "",
+        "input_prefix": "Passage: ",
+        "input_suffix": "\n",
+        "output_prefix": "Answer: ",
+        "output_suffix": "\n",
+        "instance_prefix": "\n"
+      }
+    },
+    {
+      "evaluation_name": "generation",
+      "evaluation_timestamp": "1763479296",
+      "metric_config": {
+        "evaluation_description": "rouge_l",
+        "lower_is_better": false,
+        "score_type": null,
+        "level_names": null,
+        "level_metadata": null,
+        "has_unknown_level": null,
+        "min_score": null,
+        "max_score": null
+      },
+      "score_details": {
+        "score": 0.03636363636363636,
+        "details": {
+          "count": 1,
+          "sum": 0.03636363636363636,
+          "sum_squared": 0.0013223140495867767,
+          "min": 0.03636363636363636,
+          "max": 0.03636363636363636,
+          "mean": 0.03636363636363636,
+          "variance": 0.0,
+          "stddev": 0.0,
+          "split": "valid",
+          "perturbation": {
+            "name": "robustness",
+            "robustness": true,
+            "fairness": false,
+            "computed_on": "worst",
+            "seed": null
+          }
+        }
+      },
+      "detailed_evaluation_results_url": null,
+      "generation_config": {
+        "temperature": 0.0,
+        "max_tokens": 100,
+        "stop_sequences": [
+          "\n"
+        ],
+        "instructions": "",
+        "input_prefix": "Passage: ",
+        "input_suffix": "\n",
+        "output_prefix": "Answer: ",
+        "output_suffix": "\n",
+        "instance_prefix": "\n"
+      }
+    },
+    {
+      "evaluation_name": "generation",
+      "evaluation_timestamp": "1763479296",
+      "metric_config": {
+        "evaluation_description": "rouge_l",
+        "lower_is_better": false,
+        "score_type": null,
+        "level_names": null,
+        "level_metadata": null,
+        "has_unknown_level": null,
+        "min_score": null,
+        "max_score": null
+      },
+      "score_details": {
+        "score": 0.03636363636363636,
+        "details": {
+          "count": 1,
+          "sum": 0.03636363636363636,
+          "sum_squared": 0.0013223140495867767,
+          "min": 0.03636363636363636,
+          "max": 0.03636363636363636,
+          "mean": 0.03636363636363636,
+          "variance": 0.0,
+          "stddev": 0.0,
+          "split": "valid",
+          "perturbation": {
+            "name": "fairness",
+            "robustness": false,
+            "fairness": true,
+            "computed_on": "worst",
+            "seed": null
+          }
+        }
+      },
+      "detailed_evaluation_results_url": null,
+      "generation_config": {
+        "temperature": 0.0,
+        "max_tokens": 100,
+        "stop_sequences": [
+          "\n"
+        ],
+        "instructions": "",
+        "input_prefix": "Passage: ",
+        "input_suffix": "\n",
+        "output_prefix": "Answer: ",
+        "output_suffix": "\n",
+        "instance_prefix": "\n"
+      }
+    },
+    {
+      "evaluation_name": "generation",
+      "evaluation_timestamp": "1763479296",
+      "metric_config": {
+        "evaluation_description": "bleu_1",
+        "lower_is_better": false,
+        "score_type": null,
+        "level_names": null,
+        "level_metadata": null,
+        "has_unknown_level": null,
+        "min_score": null,
+        "max_score": null
+      },
+      "score_details": {
+        "score": 0.03568840579710145,
+        "details": {
+          "count": 1,
+          "sum": 0.03568840579710145,
+          "sum_squared": 0.001273662308338584,
+          "min": 0.03568840579710145,
+          "max": 0.03568840579710145,
+          "mean": 0.03568840579710145,
+          "variance": 0.0,
+          "stddev": 0.0,
+          "split": "test",
+          "perturbation": null
+        }
+      },
+      "detailed_evaluation_results_url": null,
+      "generation_config": {
+        "temperature": 0.0,
+        "max_tokens": 100,
+        "stop_sequences": [
+          "\n"
+        ],
+        "instructions": "",
+        "input_prefix": "Passage: ",
+        "input_suffix": "\n",
+        "output_prefix": "Answer: ",
+        "output_suffix": "\n",
+        "instance_prefix": "\n"
+      }
+    },
+    {
+      "evaluation_name": "generation",
+      "evaluation_timestamp": "1763479296",
+      "metric_config": {
+        "evaluation_description": "bleu_1",
+        "lower_is_better": false,
+        "score_type": null,
+        "level_names": null,
+        "level_metadata": null,
+        "has_unknown_level": null,
+        "min_score": null,
+        "max_score": null
+      },
+      "score_details": {
+        "score": 0.03508771929824561,
+        "details": {
+          "count": 1,
+          "sum": 0.03508771929824561,
+          "sum_squared": 0.0012311480455524776,
+          "min": 0.03508771929824561,
+          "max": 0.03508771929824561,
+          "mean": 0.03508771929824561,
+          "variance": 0.0,
+          "stddev": 0.0,
+          "split": "valid",
+          "perturbation": null
+        }
+      },
+      "detailed_evaluation_results_url": null,
+      "generation_config": {
+        "temperature": 0.0,
+        "max_tokens": 100,
+        "stop_sequences": [
+          "\n"
+        ],
+        "instructions": "",
+        "input_prefix": "Passage: ",
+        "input_suffix": "\n",
+        "output_prefix": "Answer: ",
+        "output_suffix": "\n",
+        "instance_prefix": "\n"
+      }
+    },
+    {
+      "evaluation_name": "generation",
+      "evaluation_timestamp": "1763479296",
+      "metric_config": {
+        "evaluation_description": "bleu_1",
+        "lower_is_better": false,
+        "score_type": null,
+        "level_names": null,
+        "level_metadata": null,
+        "has_unknown_level": null,
+        "min_score": null,
+        "max_score": null
+      },
+      "score_details": {
+        "score": 0.03568840579710145,
+        "details": {
+          "count": 1,
+          "sum": 0.03568840579710145,
+          "sum_squared": 0.001273662308338584,
+          "min": 0.03568840579710145,
+          "max": 0.03568840579710145,
+          "mean": 0.03568840579710145,
+          "variance": 0.0,
+          "stddev": 0.0,
+          "split": "test",
+          "perturbation": {
+            "name": "robustness",
+            "robustness": true,
+            "fairness": false,
+            "computed_on": "worst",
+            "seed": null
+          }
+        }
+      },
+      "detailed_evaluation_results_url": null,
+      "generation_config": {
+        "temperature": 0.0,
+        "max_tokens": 100,
+        "stop_sequences": [
+          "\n"
+        ],
+        "instructions": "",
+        "input_prefix": "Passage: ",
+        "input_suffix": "\n",
+        "output_prefix": "Answer: ",
+        "output_suffix": "\n",
+        "instance_prefix": "\n"
+      }
+    },
+    {
+      "evaluation_name": "generation",
+      "evaluation_timestamp": "1763479296",
+      "metric_config": {
+        "evaluation_description": "bleu_1",
+        "lower_is_better": false,
+        "score_type": null,
+        "level_names": null,
+        "level_metadata": null,
+        "has_unknown_level": null,
+        "min_score": null,
+        "max_score": null
+      },
+      "score_details": {
+        "score": 0.03568840579710145,
+        "details": {
+          "count": 1,
+          "sum": 0.03568840579710145,
+          "sum_squared": 0.001273662308338584,
+          "min": 0.03568840579710145,
+          "max": 0.03568840579710145,
+          "mean": 0.03568840579710145,
+          "variance": 0.0,
+          "stddev": 0.0,
+          "split": "test",
+          "perturbation": {
+            "name": "fairness",
+            "robustness": false,
+            "fairness": true,
+            "computed_on": "worst",
+            "seed": null
+          }
+        }
+      },
+      "detailed_evaluation_results_url": null,
+      "generation_config": {
+        "temperature": 0.0,
+        "max_tokens": 100,
+        "stop_sequences": [
+          "\n"
+        ],
+        "instructions": "",
+        "input_prefix": "Passage: ",
+        "input_suffix": "\n",
+        "output_prefix": "Answer: ",
+        "output_suffix": "\n",
+        "instance_prefix": "\n"
+      }
+    },
+    {
+      "evaluation_name": "generation",
+      "evaluation_timestamp": "1763479296",
+      "metric_config": {
+        "evaluation_description": "bleu_1",
+        "lower_is_better": false,
+        "score_type": null,
+        "level_names": null,
+        "level_metadata": null,
+        "has_unknown_level": null,
+        "min_score": null,
+        "max_score": null
+      },
+      "score_details": {
+        "score": 0.03508771929824561,
+        "details": {
+          "count": 1,
+          "sum": 0.03508771929824561,
+          "sum_squared": 0.0012311480455524776,
+          "min": 0.03508771929824561,
+          "max": 0.03508771929824561,
+          "mean": 0.03508771929824561,
+          "variance": 0.0,
+          "stddev": 0.0,
+          "split": "valid",
+          "perturbation": {
+            "name": "robustness",
+            "robustness": true,
+            "fairness": false,
+            "computed_on": "worst",
+            "seed": null
+          }
+        }
+      },
+      "detailed_evaluation_results_url": null,
+      "generation_config": {
+        "temperature": 0.0,
+        "max_tokens": 100,
+        "stop_sequences": [
+          "\n"
+        ],
+        "instructions": "",
+        "input_prefix": "Passage: ",
+        "input_suffix": "\n",
+        "output_prefix": "Answer: ",
+        "output_suffix": "\n",
+        "instance_prefix": "\n"
+      }
+    },
+    {
+      "evaluation_name": "generation",
+      "evaluation_timestamp": "1763479296",
+      "metric_config": {
+        "evaluation_description": "bleu_1",
+        "lower_is_better": false,
+        "score_type": null,
+        "level_names": null,
+        "level_metadata": null,
+        "has_unknown_level": null,
+        "min_score": null,
+        "max_score": null
+      },
+      "score_details": {
+        "score": 0.03508771929824561,
+        "details": {
+          "count": 1,
+          "sum": 0.03508771929824561,
+          "sum_squared": 0.0012311480455524776,
+          "min": 0.03508771929824561,
+          "max": 0.03508771929824561,
+          "mean": 0.03508771929824561,
+          "variance": 0.0,
+          "stddev": 0.0,
+          "split": "valid",
+          "perturbation": {
+            "name": "fairness",
+            "robustness": false,
+            "fairness": true,
+            "computed_on": "worst",
+            "seed": null
+          }
+        }
+      },
+      "detailed_evaluation_results_url": null,
+      "generation_config": {
+        "temperature": 0.0,
+        "max_tokens": 100,
+        "stop_sequences": [
+          "\n"
+        ],
+        "instructions": "",
+        "input_prefix": "Passage: ",
+        "input_suffix": "\n",
+        "output_prefix": "Answer: ",
+        "output_suffix": "\n",
+        "instance_prefix": "\n"
+      }
+    },
+    {
+      "evaluation_name": "generation",
+      "evaluation_timestamp": "1763479296",
+      "metric_config": {
+        "evaluation_description": "bleu_4",
+        "lower_is_better": false,
+        "score_type": null,
+        "level_names": null,
+        "level_metadata": null,
+        "has_unknown_level": null,
+        "min_score": null,
+        "max_score": null
+      },
+      "score_details": {
+        "score": 1.1125369292536313e-308,
+        "details": {
+          "count": 1,
+          "sum": 1.1125369292536313e-308,
+          "sum_squared": 0.0,
+          "min": 1.1125369292536313e-308,
+          "max": 1.1125369292536313e-308,
+          "mean": 1.1125369292536313e-308,
+          "variance": 0.0,
+          "stddev": 0.0,
+          "split": "test",
+          "perturbation": null
+        }
+      },
+      "detailed_evaluation_results_url": null,
+      "generation_config": {
+        "temperature": 0.0,
+        "max_tokens": 100,
+        "stop_sequences": [
+          "\n"
+        ],
+        "instructions": "",
+        "input_prefix": "Passage: ",
+        "input_suffix": "\n",
+        "output_prefix": "Answer: ",
+        "output_suffix": "\n",
+        "instance_prefix": "\n"
+      }
+    },
+    {
+      "evaluation_name": "generation",
+      "evaluation_timestamp": "1763479296",
+      "metric_config": {
+        "evaluation_description": "bleu_4",
+        "lower_is_better": false,
+        "score_type": null,
+        "level_names": null,
+        "level_metadata": null,
+        "has_unknown_level": null,
+        "min_score": null,
+        "max_score": null
+      },
+      "score_details": {
+        "score": 2.2250738585072626e-308,
+        "details": {
+          "count": 1,
+          "sum": 2.2250738585072626e-308,
+          "sum_squared": 0.0,
+          "min": 2.2250738585072626e-308,
+          "max": 2.2250738585072626e-308,
+          "mean": 2.2250738585072626e-308,
+          "variance": 0.0,
+          "stddev": 0.0,
+          "split": "valid",
+          "perturbation": null
+        }
+      },
+      "detailed_evaluation_results_url": null,
+      "generation_config": {
+        "temperature": 0.0,
+        "max_tokens": 100,
+        "stop_sequences": [
+          "\n"
+        ],
+        "instructions": "",
+        "input_prefix": "Passage: ",
+        "input_suffix": "\n",
+        "output_prefix": "Answer: ",
+        "output_suffix": "\n",
+        "instance_prefix": "\n"
+      }
+    },
+    {
+      "evaluation_name": "generation",
+      "evaluation_timestamp": "1763479296",
+      "metric_config": {
+        "evaluation_description": "bleu_4",
+        "lower_is_better": false,
+        "score_type": null,
+        "level_names": null,
+        "level_metadata": null,
+        "has_unknown_level": null,
+        "min_score": null,
+        "max_score": null
+      },
+      "score_details": {
+        "score": 1.1125369292536313e-308,
+        "details": {
+          "count": 1,
+          "sum": 1.1125369292536313e-308,
+          "sum_squared": 0.0,
+          "min": 1.1125369292536313e-308,
+          "max": 1.1125369292536313e-308,
+          "mean": 1.1125369292536313e-308,
+          "variance": 0.0,
+          "stddev": 0.0,
+          "split": "test",
+          "perturbation": {
+            "name": "robustness",
+            "robustness": true,
+            "fairness": false,
+            "computed_on": "worst",
+            "seed": null
+          }
+        }
+      },
+      "detailed_evaluation_results_url": null,
+      "generation_config": {
+        "temperature": 0.0,
+        "max_tokens": 100,
+        "stop_sequences": [
+          "\n"
+        ],
+        "instructions": "",
+        "input_prefix": "Passage: ",
+        "input_suffix": "\n",
+        "output_prefix": "Answer: ",
+        "output_suffix": "\n",
+        "instance_prefix": "\n"
+      }
+    },
+    {
+      "evaluation_name": "generation",
+      "evaluation_timestamp": "1763479296",
+      "metric_config": {
+        "evaluation_description": "bleu_4",
+        "lower_is_better": false,
+        "score_type": null,
+        "level_names": null,
+        "level_metadata": null,
+        "has_unknown_level": null,
+        "min_score": null,
+        "max_score": null
+      },
+      "score_details": {
+        "score": 1.1125369292536313e-308,
+        "details": {
+          "count": 1,
+          "sum": 1.1125369292536313e-308,
+          "sum_squared": 0.0,
+          "min": 1.1125369292536313e-308,
+          "max": 1.1125369292536313e-308,
+          "mean": 1.1125369292536313e-308,
+          "variance": 0.0,
+          "stddev": 0.0,
+          "split": "test",
+          "perturbation": {
+            "name": "fairness",
+            "robustness": false,
+            "fairness": true,
+            "computed_on": "worst",
+            "seed": null
+          }
+        }
+      },
+      "detailed_evaluation_results_url": null,
+      "generation_config": {
+        "temperature": 0.0,
+        "max_tokens": 100,
+        "stop_sequences": [
+          "\n"
+        ],
+        "instructions": "",
+        "input_prefix": "Passage: ",
+        "input_suffix": "\n",
+        "output_prefix": "Answer: ",
+        "output_suffix": "\n",
+        "instance_prefix": "\n"
+      }
+    },
+    {
+      "evaluation_name": "generation",
+      "evaluation_timestamp": "1763479296",
+      "metric_config": {
+        "evaluation_description": "bleu_4",
+        "lower_is_better": false,
+        "score_type": null,
+        "level_names": null,
+        "level_metadata": null,
+        "has_unknown_level": null,
+        "min_score": null,
+        "max_score": null
+      },
+      "score_details": {
+        "score": 2.2250738585072626e-308,
+        "details": {
+          "count": 1,
+          "sum": 2.2250738585072626e-308,
+          "sum_squared": 0.0,
+          "min": 2.2250738585072626e-308,
+          "max": 2.2250738585072626e-308,
+          "mean": 2.2250738585072626e-308,
+          "variance": 0.0,
+          "stddev": 0.0,
+          "split": "valid",
+          "perturbation": {
+            "name": "robustness",
+            "robustness": true,
+            "fairness": false,
+            "computed_on": "worst",
+            "seed": null
+          }
+        }
+      },
+      "detailed_evaluation_results_url": null,
+      "generation_config": {
+        "temperature": 0.0,
+        "max_tokens": 100,
+        "stop_sequences": [
+          "\n"
+        ],
+        "instructions": "",
+        "input_prefix": "Passage: ",
+        "input_suffix": "\n",
+        "output_prefix": "Answer: ",
+        "output_suffix": "\n",
+        "instance_prefix": "\n"
+      }
+    },
+    {
+      "evaluation_name": "generation",
+      "evaluation_timestamp": "1763479296",
+      "metric_config": {
+        "evaluation_description": "bleu_4",
+        "lower_is_better": false,
+        "score_type": null,
+        "level_names": null,
+        "level_metadata": null,
+        "has_unknown_level": null,
+        "min_score": null,
+        "max_score": null
+      },
+      "score_details": {
+        "score": 2.2250738585072626e-308,
+        "details": {
+          "count": 1,
+          "sum": 2.2250738585072626e-308,
+          "sum_squared": 0.0,
+          "min": 2.2250738585072626e-308,
+          "max": 2.2250738585072626e-308,
+          "mean": 2.2250738585072626e-308,
+          "variance": 0.0,
+          "stddev": 0.0,
+          "split": "valid",
+          "perturbation": {
+            "name": "fairness",
+            "robustness": false,
+            "fairness": true,
+            "computed_on": "worst",
+            "seed": null
+          }
+        }
+      },
+      "detailed_evaluation_results_url": null,
+      "generation_config": {
+        "temperature": 0.0,
+        "max_tokens": 100,
+        "stop_sequences": [
+          "\n"
+        ],
+        "instructions": "",
+        "input_prefix": "Passage: ",
+        "input_suffix": "\n",
+        "output_prefix": "Answer: ",
+        "output_suffix": "\n",
+        "instance_prefix": "\n"
+      }
+    }
+  ],
+  "detailed_evaluation_results_per_samples": [
+    {
+      "sample_id": "id1413",
+      "input": "Olive Penderghast, a 17-year-old girl living in Ojai, California lies to her best friend Rhiannon Abernathy about going on a date in order to get out of camping with Rhiannon's hippie parents. Instead, she hangs around the house all weekend listening to Natasha Bedingfield's \"Pocketful of Sunshine\", which is played by a greeting card she was sent. The following Monday, pressed by Rhiannon, Olive lies about losing her virginity to a college guy. Marianne Bryant, a prissy and strictly religious Christian at their school, overhears her telling the lie and soon it spreads like wildfire. The school's conservative church group run by Marianne decides Olive will be their next project. Olive confides the truth to her friend Brandon, and he explains how others bully him because of his homosexuality. He later asks Olive to pretend to sleep with him so that he will be accepted by everyone as a 'straight stud'.\nBrandon convinces Olive to help him and they pretend to have sex at a party. After having a fight with Rhiannon over Olive's new identity as a \"dirty skank\", Olive decides to counteract the harassment by embracing her new image as the school tramp. She begins to wear more provocative clothing and stitches a red \"A\" to everything she wears. Boys who usually have had no luck with girls in the past beg Olive to say they have had sex with her in order to increase their own popularity, in exchange for gift cards to various stores, in turn increasing her reputation. Things get worse when Micah, Marianne's 22-year-old boyfriend, contracts chlamydia from sleeping with Mrs. Griffith, the school guidance counsellor, and blames it all on Olive. Olive agrees to lie to cover up the affair so that the marriage of her favorite teacher, Mr. Griffith, would be spared.\nMarianne's religious clique, which now includes Rhiannon, begins harassing Olive in order to get her to leave school. After an ill-fated date with Anson, a boy who wants to pay her to actually sleep with him and not just pretend she did, Olive reconnects with Todd, her old crush, who is also the school's mascot. Todd then tells her that he does not believe the rumors because he remembers when she lied for him when he was not ready for his first kiss years ago. Olive then begins to ask everyone she lied for to help her out by telling the truth, but Brandon and Micah have abruptly left town and everyone else is enjoying their newfound popularity and do not want the truth to get out. Mrs. Griffith also refuses to tell the truth and when Olive threatens to expose her, Mrs. Griffith rebuffs her, saying no one would believe her.\nOlive, out of spite, then immediately tells Mr. Griffith, who believes her and separates from Mrs. Griffith. After a friendly talk with her eccentric, open-minded mother Rosemary, Olive comes up with a plan to get everything finally out in the open. She then does a song and dance number at a school pep rally to get people's attention to watch her via web cam, where she confesses what she has done (the web cam is the framing device of the film). The various boys whose reputations Olive helped improve are also shown watching. Later, Olive texts Rhiannon, apologizing for lying to her. When she is finishing up her web cast, Todd comes by riding a lawnmower and tells her to come outside. She signs off by saying she may lose her virginity to Todd, and proudly declares it's nobody's business (much to Marianne's disgrace). She goes outside to meet him, they kiss and the two are shown riding off on the lawnmower.\nQuestion: Who is Todd besides Olive's old crush?",
+      "prompt": "Passage: Olive Penderghast, a 17-year-old girl living in Ojai, California lies to her best friend Rhiannon Abernathy about going on a date in order to get out of camping with Rhiannon's hippie parents. Instead, she hangs around the house all weekend listening to Natasha Bedingfield's \"Pocketful of Sunshine\", which is played by a greeting card she was sent. The following Monday, pressed by Rhiannon, Olive lies about losing her virginity to a college guy. Marianne Bryant, a prissy and strictly religious Christian at their school, overhears her telling the lie and soon it spreads like wildfire. The school's conservative church group run by Marianne decides Olive will be their next project. Olive confides the truth to her friend Brandon, and he explains how others bully him because of his homosexuality. He later asks Olive to pretend to sleep with him so that he will be accepted by everyone as a 'straight stud'.\nBrandon convinces Olive to help him and they pretend to have sex at a party. After having a fight with Rhiannon over Olive's new identity as a \"dirty skank\", Olive decides to counteract the harassment by embracing her new image as the school tramp. She begins to wear more provocative clothing and stitches a red \"A\" to everything she wears. Boys who usually have had no luck with girls in the past beg Olive to say they have had sex with her in order to increase their own popularity, in exchange for gift cards to various stores, in turn increasing her reputation. Things get worse when Micah, Marianne's 22-year-old boyfriend, contracts chlamydia from sleeping with Mrs. Griffith, the school guidance counsellor, and blames it all on Olive. Olive agrees to lie to cover up the affair so that the marriage of her favorite teacher, Mr. Griffith, would be spared.\nMarianne's religious clique, which now includes Rhiannon, begins harassing Olive in order to get her to leave school. After an ill-fated date with Anson, a boy who wants to pay her to actually sleep with him and not just pretend she did, Olive reconnects with Todd, her old crush, who is also the school's mascot. Todd then tells her that he does not believe the rumors because he remembers when she lied for him when he was not ready for his first kiss years ago. Olive then begins to ask everyone she lied for to help her out by telling the truth, but Brandon and Micah have abruptly left town and everyone else is enjoying their newfound popularity and do not want the truth to get out. Mrs. Griffith also refuses to tell the truth and when Olive threatens to expose her, Mrs. Griffith rebuffs her, saying no one would believe her.\nOlive, out of spite, then immediately tells Mr. Griffith, who believes her and separates from Mrs. Griffith. After a friendly talk with her eccentric, open-minded mother Rosemary, Olive comes up with a plan to get everything finally out in the open. She then does a song and dance number at a school pep rally to get people's attention to watch her via web cam, where she confesses what she has done (the web cam is the framing device of the film). The various boys whose reputations Olive helped improve are also shown watching. Later, Olive texts Rhiannon, apologizing for lying to her. When she is finishing up her web cast, Todd comes by riding a lawnmower and tells her to come outside. She signs off by saying she may lose her virginity to Todd, and proudly declares it's nobody's business (much to Marianne's disgrace). She goes outside to meet him, they kiss and the two are shown riding off on the lawnmower.\nQuestion: Who is Todd besides Olive's old crush?\nAnswer:",
+      "ground_truth": [
+        "The school Mascot",
+        "the schools mascot"
+      ],
+      "response": "Olive.",
+      "choices": null,
+      "full_logprobs": null
+    },
+    {
+      "sample_id": "id1332",
+      "input": "Indefer Jones is the aged squire, between seventy and eighty years of age, of a large manor, Llanfeare, in Carmarthen, Wales. His niece, Isabel Brodrick, has lived with him for years after the remarriage of her father, and endeared herself to everyone. However, according to his strong traditional beliefs, the estate should be bequeathed to a male heir.\nHis sole male blood relative is his nephew Henry Jones, a London clerk. Henry has, in the past, incurred debts that the squire had paid off, been \"sent away from Oxford\", and generally made a poor impression on his occasional visits to Llanfeare. Nevertheless, Henry is told of his uncle's intention to make him the heir to the estate and is invited to pay a visit. Isabel rejects her uncle's suggestion that she solve his dilemma by marrying Henry, as she cannot stand her cousin. Indefer Jones finds his nephew to be just as detestable as ever. As a result, he overcomes his prejudice and changes his will one final time, in Isabel's favour. Unfortunately, he dies before he can tell anyone.\nFinding the document hidden in a book of sermons by accident, Henry vacillates between keeping silent and revealing its location. He is neither good enough to give up the estate nor evil enough to burn the document, fearing disgrace, a long jail sentence and, not least, eternal damnation. Instead, he comforts himself by reasoning that doing nothing cannot be a crime.\nIndefer Jones had had his last will witnessed by two of his tenants, but since the will cannot be found despite a thorough search of the house, Henry inherits the estate. However, already extant suspicions are only strengthened by his guilty manner. He endures abuse from everyone; his own servants either quit or treat him with disrespect. He takes to spending hours in the library, where the will is hidden.\nThe local newspaper begins to publish accounts of the affair that are insulting and seemingly libelous to Henry. It accuses him of destroying the will and usurping the estate from Isabel, whom everybody knows and respects. The old squire's lawyer, Mr Apjohn, himself suspecting that Henry knows more than he lets on, approaches the new squire about the articles, pressuring the unwilling young man into taking legal action against the editor. Henry finds that this only makes things worse. The prospect of being cross examined in the witness box fills him with dread. He realises the truth would be dragged out of him in court.\nMr Apjohn, by clever questioning, gets a good idea about where the will is. Henry knows that time is running out, but once again procrastinates. Mr Apjohn and Mr Brodrick, Isabel's father, visit Henry at home and find the document, despite Henry's ineffectual efforts to stop them. Because he did not destroy the will, Henry is permitted to return to his job in London with his reputation intact and ÂŁ4000, the amount Isabel was bequeathed in the other will.\nQuestion: How is Isabel Brodrick related to Indefer Jones?",
+      "prompt": "Passage: Indefer Jones is the aged squire, between seventy and eighty years of age, of a large manor, Llanfeare, in Carmarthen, Wales. His niece, Isabel Brodrick, has lived with him for years after the remarriage of her father, and endeared herself to everyone. However, according to his strong traditional beliefs, the estate should be bequeathed to a male heir.\nHis sole male blood relative is his nephew Henry Jones, a London clerk. Henry has, in the past, incurred debts that the squire had paid off, been \"sent away from Oxford\", and generally made a poor impression on his occasional visits to Llanfeare. Nevertheless, Henry is told of his uncle's intention to make him the heir to the estate and is invited to pay a visit. Isabel rejects her uncle's suggestion that she solve his dilemma by marrying Henry, as she cannot stand her cousin. Indefer Jones finds his nephew to be just as detestable as ever. As a result, he overcomes his prejudice and changes his will one final time, in Isabel's favour. Unfortunately, he dies before he can tell anyone.\nFinding the document hidden in a book of sermons by accident, Henry vacillates between keeping silent and revealing its location. He is neither good enough to give up the estate nor evil enough to burn the document, fearing disgrace, a long jail sentence and, not least, eternal damnation. Instead, he comforts himself by reasoning that doing nothing cannot be a crime.\nIndefer Jones had had his last will witnessed by two of his tenants, but since the will cannot be found despite a thorough search of the house, Henry inherits the estate. However, already extant suspicions are only strengthened by his guilty manner. He endures abuse from everyone; his own servants either quit or treat him with disrespect. He takes to spending hours in the library, where the will is hidden.\nThe local newspaper begins to publish accounts of the affair that are insulting and seemingly libelous to Henry. It accuses him of destroying the will and usurping the estate from Isabel, whom everybody knows and respects. The old squire's lawyer, Mr Apjohn, himself suspecting that Henry knows more than he lets on, approaches the new squire about the articles, pressuring the unwilling young man into taking legal action against the editor. Henry finds that this only makes things worse. The prospect of being cross examined in the witness box fills him with dread. He realises the truth would be dragged out of him in court.\nMr Apjohn, by clever questioning, gets a good idea about where the will is. Henry knows that time is running out, but once again procrastinates. Mr Apjohn and Mr Brodrick, Isabel's father, visit Henry at home and find the document, despite Henry's ineffectual efforts to stop them. Because he did not destroy the will, Henry is permitted to return to his job in London with his reputation intact and ÂŁ4000, the amount Isabel was bequeathed in the other will.\nQuestion: How is Isabel Brodrick related to Indefer Jones?\nAnswer:",
+      "ground_truth": [
+        "She is his neice.",
+        "His niece."
+      ],
+      "response": "Isabel is a very good and intelligent woman. She is a very good and intelligent woman. She is a very good and intelligent woman. She is a very good and intelligent woman. She is a very good and intelligent woman.",
+      "choices": null,
+      "full_logprobs": null
+    },
+    {
+      "sample_id": "id1123",
+      "input": "The subject of Cratylus is the correctness of names (περὶ ὀνομάτων ὀρθότητος), in other words, it is a critique on the subject of naming (Baxter).\nWhen discussing a ὄνομα  (onoma ) and how it would relate to its subject, Socrates compares the original creation of a word to the work of an artist. An artist uses color to express the essence of his subject in a painting. In much the same way, the creator of words uses letters containing certain sounds to express the essence of a word's subject. There is a letter that is best for soft things, one for liquid things, and so on. He comments;\nthe best possible way to speak consists in using names all (or most) of which are like the things they name (that is, are appropriate to them), while the worst is to use the opposite kind of names.\nOne countering position, held by Hermogenes, is that names have come about due to custom and convention. They do not express the essence of their subject, so they can be swapped with something unrelated by the individuals or communities who use them.\nThe line between the two perspectives is often blurred. During more than half of the dialogue, Socrates makes guesses at Hermogenes' request as to where names and words have come from. These include the names of the Olympian gods, personified deities, and many words that describe abstract concepts. He examines whether, for example, giving names of \"streams\" to Cronus and Rhea (Ροή – flow or space) are purely accidental.\nDon't you think he who gave to the ancestors of the other gods the names “Rhea” and “Cronus” had the same thought as Heracleitus? Do you think he gave both of them the names of streams (ῥευμάτων ὀνόματα) merely by chance?\nThe Greek term \"ῥεῦμα\" may refer to the flow of any medium and is not restricted to the flow of water or liquids. Many of the words which Socrates uses as examples may have come from an idea originally linked to the name, but have changed over time. Those of which he cannot find a link, he often assumes have come from foreign origins or have changed so much as to lose all resemblance to the original word. He states, \"names have been so twisted in all manner of ways, that I should not be surprised if the old language when compared with that now in use would appear to us to be a barbarous tongue.\"\nThe final theory of relations between name and object named is posited by Cratylus, a disciple of Heraclitus, who believes that names arrive from divine origins, making them necessarily correct. Socrates rebukes this theory by reminding Cratylus of the imperfection of certain names in capturing the objects they seek to signify. From this point, Socrates ultimately rejects the study of language, believing it to be philosophically inferior to a study of things themselves.\nQuestion: What does the old language sound compared with the new language?",
+      "prompt": "Passage: The subject of Cratylus is the correctness of names (περὶ ὀνομάτων ὀρθότητος), in other words, it is a critique on the subject of naming (Baxter).\nWhen discussing a ὄνομα  (onoma ) and how it would relate to its subject, Socrates compares the original creation of a word to the work of an artist. An artist uses color to express the essence of his subject in a painting. In much the same way, the creator of words uses letters containing certain sounds to express the essence of a word's subject. There is a letter that is best for soft things, one for liquid things, and so on. He comments;\nthe best possible way to speak consists in using names all (or most) of which are like the things they name (that is, are appropriate to them), while the worst is to use the opposite kind of names.\nOne countering position, held by Hermogenes, is that names have come about due to custom and convention. They do not express the essence of their subject, so they can be swapped with something unrelated by the individuals or communities who use them.\nThe line between the two perspectives is often blurred. During more than half of the dialogue, Socrates makes guesses at Hermogenes' request as to where names and words have come from. These include the names of the Olympian gods, personified deities, and many words that describe abstract concepts. He examines whether, for example, giving names of \"streams\" to Cronus and Rhea (Ροή – flow or space) are purely accidental.\nDon't you think he who gave to the ancestors of the other gods the names “Rhea” and “Cronus” had the same thought as Heracleitus? Do you think he gave both of them the names of streams (ῥευμάτων ὀνόματα) merely by chance?\nThe Greek term \"ῥεῦμα\" may refer to the flow of any medium and is not restricted to the flow of water or liquids. Many of the words which Socrates uses as examples may have come from an idea originally linked to the name, but have changed over time. Those of which he cannot find a link, he often assumes have come from foreign origins or have changed so much as to lose all resemblance to the original word. He states, \"names have been so twisted in all manner of ways, that I should not be surprised if the old language when compared with that now in use would appear to us to be a barbarous tongue.\"\nThe final theory of relations between name and object named is posited by Cratylus, a disciple of Heraclitus, who believes that names arrive from divine origins, making them necessarily correct. Socrates rebukes this theory by reminding Cratylus of the imperfection of certain names in capturing the objects they seek to signify. From this point, Socrates ultimately rejects the study of language, believing it to be philosophically inferior to a study of things themselves.\nQuestion: What does the old language sound compared with the new language?\nAnswer:",
+      "ground_truth": [
+        "like a barbaric tongue.",
+        "barbarous tongue"
+      ],
+      "response": "The old language is a dialect of the Greek language, which is the language of the Greeks. The old language is a dialect of the Greek language, which is the language of the Greeks. The old language is a dialect of the Greek language, which is the language of the Greeks.",
+      "choices": null,
+      "full_logprobs": null
+    },
+    {
+      "sample_id": "id1514",
+      "input": "The novel begins in Manchester, where we are introduced to the Bartons and the Wilsons, two working-class families. John Barton is a questioner of the distribution of wealth and the relations between rich and poor. Soon his wife diesâhe blames it on her grief over the disappearance of her sister Esther. Having already lost his son Tom at a young age, Barton is left to raise his daughter, Mary, alone and now falls into depression and begins to involve himself in the Chartist, trade-union movement.\nChapter 1 takes place in countryside where Moss Side is now.\nMary takes up work at a dressmaker's (her father having objected to her working in a factory) and becomes subject to the affections of hard-working Jem Wilson and Harry Carson, son of a wealthy mill owner. She fondly hopes, by marrying Carson, to secure a comfortable life for herself and her father, but immediately after refusing Jem's offer of marriage she realises that she truly loves him. She therefore decides to evade Carson, planning to show her feelings to Jem in the course of time. Jem believes her decision to be final, though this does not change his feelings for her.\nMeanwhile, Esther, a \"street-walker,\" returns to warn John Barton that he must save Mary from becoming like her. He simply pushes her away, however, and she's sent to jail for a month on the charge of vagrancy. Upon her release she talks to Jem with the same purpose. He promises that he will protect Mary and confronts Carson, eventually entering into a fight with him, which is witnessed by a policeman passing by.\nNot long afterwards, Carson is shot dead, and Jem is arrested for the crime, his gun having been found at the scene. Esther decides to investigate the matter further and discovers that the wadding for the gun was a piece of paper on which is written Mary's name.\nShe visits her niece to warn her to save the one she loves, and after she leaves Mary realises that the murderer is not Jem but her father. She is now faced with having to save her lover without giving away her father. With the help of Job Legh (the intelligent grandfather of her blind friend Margaret), Mary travels to Liverpool to find the only person who could provide an alibi for Jem â Will Wilson, Jem's cousin and a sailor, who was with him on the night of the murder. Unfortunately, Will's ship is already departing, so that, after Mary chases after the ship in a small boat, the only thing Will can do is promise to return in the pilot ship and testify the next day.\nDuring the trial, Jem learns of Mary's great love for him. Will arrives in court to testify, and Jem is found \"not guilty\". Mary has fallen ill during the trial and is nursed by Mr Sturgis, an old sailor, and his wife. When she finally returns to Manchester she has to face her father, who is crushed by his remorse. He summons John Carson, Harry's father, to confess to him that he is the murderer. Carson is still set on justice, but after turning to the Bible he forgives Barton, who dies soon afterwards in Carson's arms. Not long after this Esther comes back to Mary's home, where she, too, soon dies.\nJem decides to leave England, where, his reputation damaged, it would be difficult for him to find a new job. The novel ends with the wedded Mary and Jem, their little child, and Mrs Wilson living happily in Canada. News comes that Margaret has regained her sight and that she and Will, soon to be married, will visit.\nQuestion: Who actually killed Harry Carson?",
+      "prompt": "Passage: The novel begins in Manchester, where we are introduced to the Bartons and the Wilsons, two working-class families. John Barton is a questioner of the distribution of wealth and the relations between rich and poor. Soon his wife diesâhe blames it on her grief over the disappearance of her sister Esther. Having already lost his son Tom at a young age, Barton is left to raise his daughter, Mary, alone and now falls into depression and begins to involve himself in the Chartist, trade-union movement.\nChapter 1 takes place in countryside where Moss Side is now.\nMary takes up work at a dressmaker's (her father having objected to her working in a factory) and becomes subject to the affections of hard-working Jem Wilson and Harry Carson, son of a wealthy mill owner. She fondly hopes, by marrying Carson, to secure a comfortable life for herself and her father, but immediately after refusing Jem's offer of marriage she realises that she truly loves him. She therefore decides to evade Carson, planning to show her feelings to Jem in the course of time. Jem believes her decision to be final, though this does not change his feelings for her.\nMeanwhile, Esther, a \"street-walker,\" returns to warn John Barton that he must save Mary from becoming like her. He simply pushes her away, however, and she's sent to jail for a month on the charge of vagrancy. Upon her release she talks to Jem with the same purpose. He promises that he will protect Mary and confronts Carson, eventually entering into a fight with him, which is witnessed by a policeman passing by.\nNot long afterwards, Carson is shot dead, and Jem is arrested for the crime, his gun having been found at the scene. Esther decides to investigate the matter further and discovers that the wadding for the gun was a piece of paper on which is written Mary's name.\nShe visits her niece to warn her to save the one she loves, and after she leaves Mary realises that the murderer is not Jem but her father. She is now faced with having to save her lover without giving away her father. With the help of Job Legh (the intelligent grandfather of her blind friend Margaret), Mary travels to Liverpool to find the only person who could provide an alibi for Jem â Will Wilson, Jem's cousin and a sailor, who was with him on the night of the murder. Unfortunately, Will's ship is already departing, so that, after Mary chases after the ship in a small boat, the only thing Will can do is promise to return in the pilot ship and testify the next day.\nDuring the trial, Jem learns of Mary's great love for him. Will arrives in court to testify, and Jem is found \"not guilty\". Mary has fallen ill during the trial and is nursed by Mr Sturgis, an old sailor, and his wife. When she finally returns to Manchester she has to face her father, who is crushed by his remorse. He summons John Carson, Harry's father, to confess to him that he is the murderer. Carson is still set on justice, but after turning to the Bible he forgives Barton, who dies soon afterwards in Carson's arms. Not long after this Esther comes back to Mary's home, where she, too, soon dies.\nJem decides to leave England, where, his reputation damaged, it would be difficult for him to find a new job. The novel ends with the wedded Mary and Jem, their little child, and Mrs Wilson living happily in Canada. News comes that Margaret has regained her sight and that she and Will, soon to be married, will visit.\nQuestion: Who actually killed Harry Carson?\nAnswer:",
+      "ground_truth": [
+        "John Barton",
+        "John Barton"
+      ],
+      "response": "The Wilsons.",
+      "choices": null,
+      "full_logprobs": null
+    },
+    {
+      "sample_id": "id1340",
+      "input": "Tarzan returns to Opar, the source of the gold where a lost colony of fabled Atlantis is located, in order to make good on some financial reverses he has recently suffered. While Atlantis itself sank beneath the waves thousands of years ago, the workers of Opar continued to mine all of the gold, which means there is a rather huge stockpile but which is now lost to the memory of the Oparians and only Tarzan knows its secret location.\nA greedy, outlawed Belgian army officer, Albert Werper, in the employ of a criminal Arab, secretly follows Tarzan to Opar. There, Tarzan loses his memory after being struck on the head by a falling rock in the treasure room during an earthquake. On encountering La, the high priestess who is the servant of the Flaming God of Opar, and who is also very beautiful, Tarzan once again rejects her love which enrages her and she tries to have him killed; she had fallen in love with the apeman during their first encounter and La and her high priests are not going to allow Tarzan to escape their sacrificial knives this time.\nIn the meanwhile, Jane has been kidnapped by the Arab and wonders what is keeping her husband from once again coming to her rescue. A now amnesiac Tarzan and the Werper escape from Opar, bearing away the sacrificial knife of Opar which La and some retainers set out to recover. There is intrigue and counter intrigue the rest of the way.\nQuestion: Who is La?",
+      "prompt": "Passage: The Little White Bird is a series of short episodes, including both accounts of the narrator's day-to-day activities in contemporary London and fanciful tales set in Kensington Gardens and elsewhere.The story is set in several locations; the earlier chapters are set in the town of London, contemporaneous to the time of Barrie's writing, and involving some time travel of a few years, and other fantasy elements, while remaining within the London setting. The middle chapters that later became Peter Pan in Kensington Gardens are set in London's famous Kensington Gardens, introduced by the statement that \"All perambulators lead to Kensington Gardens\". The Kensington Gardens chapters include detailed descriptions of the features of the Gardens, along with fantasy names given to the locations by the story's characters, especially after \"Lock-Out Time\", described by Barrie as the time at the end of the day when the park gates are closed to the public, and the fairies and other magical inhabitants of the park can move about more freely than during the daylight, when they must hide from ordinary people. The third section of the book, following the Kensington Gardens chapters, are again set generally in London, though there are some short returns to the Gardens that are not part of the Peter Pan stories. In a two-page diversion in chapter 24, Barrie brings the story to Patagonia, and a journey by ship returning to England at the \"white cliffs of Albion\".\nQuestion: Where  was the ship in Pentagonia returned to?\nAnswer: England\n\nPassage: Tarzan returns to Opar, the source of the gold where a lost colony of fabled Atlantis is located, in order to make good on some financial reverses he has recently suffered. While Atlantis itself sank beneath the waves thousands of years ago, the workers of Opar continued to mine all of the gold, which means there is a rather huge stockpile but which is now lost to the memory of the Oparians and only Tarzan knows its secret location.\nA greedy, outlawed Belgian army officer, Albert Werper, in the employ of a criminal Arab, secretly follows Tarzan to Opar. There, Tarzan loses his memory after being struck on the head by a falling rock in the treasure room during an earthquake. On encountering La, the high priestess who is the servant of the Flaming God of Opar, and who is also very beautiful, Tarzan once again rejects her love which enrages her and she tries to have him killed; she had fallen in love with the apeman during their first encounter and La and her high priests are not going to allow Tarzan to escape their sacrificial knives this time.\nIn the meanwhile, Jane has been kidnapped by the Arab and wonders what is keeping her husband from once again coming to her rescue. A now amnesiac Tarzan and the Werper escape from Opar, bearing away the sacrificial knife of Opar which La and some retainers set out to recover. There is intrigue and counter intrigue the rest of the way.\nQuestion: Who is La?\nAnswer:",
+      "ground_truth": [
+        "La is the high priestess that serves the Flaming God of Opar.",
+        "A high priestess."
+      ],
+      "response": "La is a young woman who is a member of the royal family of Opar. She is a member of the royal family of Opar, and is a member of the royal family of Opar. She is a member of the royal family of Opar, and is a member of the royal family of Opar. She is a member of the royal family of Opar, and is a member of the royal family of Opar. She is a member of the royal family of Opar",
+      "choices": null,
+      "full_logprobs": null
+    }
+  ]
+}
\ No newline at end of file
diff --git a/tests/data/helm/narrative_qa:model=openai_gpt2/per_instance_stats.json b/tests/data/helm/narrative_qa:model=openai_gpt2/per_instance_stats.json
new file mode 100644
index 0000000..2a7de0c
--- /dev/null
+++ b/tests/data/helm/narrative_qa:model=openai_gpt2/per_instance_stats.json
@@ -0,0 +1,1732 @@
+[
+  {
+    "instance_id": "id1413",
+    "train_trial_index": 0,
+    "stats": [
+      {
+        "name": {
+          "name": "num_references",
+          "split": "test"
+        },
+        "count": 1,
+        "sum": 2,
+        "sum_squared": 4,
+        "min": 2,
+        "max": 2,
+        "mean": 2.0,
+        "variance": 0.0,
+        "stddev": 0.0
+      },
+      {
+        "name": {
+          "name": "num_train_trials",
+          "split": "test"
+        },
+        "count": 1,
+        "sum": 1,
+        "sum_squared": 1,
+        "min": 1,
+        "max": 1,
+        "mean": 1.0,
+        "variance": 0.0,
+        "stddev": 0.0
+      },
+      {
+        "name": {
+          "name": "num_prompt_tokens",
+          "split": "test"
+        },
+        "count": 1,
+        "sum": 790,
+        "sum_squared": 624100,
+        "min": 790,
+        "max": 790,
+        "mean": 790.0,
+        "variance": 0.0,
+        "stddev": 0.0
+      },
+      {
+        "name": {
+          "name": "num_completion_tokens",
+          "split": "test"
+        },
+        "count": 1,
+        "sum": 2,
+        "sum_squared": 4,
+        "min": 2,
+        "max": 2,
+        "mean": 2.0,
+        "variance": 0.0,
+        "stddev": 0.0
+      },
+      {
+        "name": {
+          "name": "num_output_tokens",
+          "split": "test"
+        },
+        "count": 1,
+        "sum": 2,
+        "sum_squared": 4,
+        "min": 2,
+        "max": 2,
+        "mean": 2.0,
+        "variance": 0.0,
+        "stddev": 0.0
+      },
+      {
+        "name": {
+          "name": "training_co2_cost",
+          "split": "test"
+        },
+        "count": 0,
+        "sum": 0,
+        "sum_squared": 0
+      },
+      {
+        "name": {
+          "name": "training_energy_cost",
+          "split": "test"
+        },
+        "count": 0,
+        "sum": 0,
+        "sum_squared": 0
+      },
+      {
+        "name": {
+          "name": "inference_runtime",
+          "split": "test"
+        },
+        "count": 1,
+        "sum": 0.8313617706298828,
+        "sum_squared": 0.6911623936648539,
+        "min": 0.8313617706298828,
+        "max": 0.8313617706298828,
+        "mean": 0.8313617706298828,
+        "variance": 0.0,
+        "stddev": 0.0
+      },
+      {
+        "name": {
+          "name": "batch_size",
+          "split": "test"
+        },
+        "count": 1,
+        "sum": 1,
+        "sum_squared": 1,
+        "min": 1,
+        "max": 1,
+        "mean": 1.0,
+        "variance": 0.0,
+        "stddev": 0.0
+      },
+      {
+        "name": {
+          "name": "finish_reason_length",
+          "split": "test"
+        },
+        "count": 1,
+        "sum": 0,
+        "sum_squared": 0,
+        "min": 0,
+        "max": 0,
+        "mean": 0.0,
+        "variance": 0.0,
+        "stddev": 0.0
+      },
+      {
+        "name": {
+          "name": "finish_reason_stop",
+          "split": "test"
+        },
+        "count": 1,
+        "sum": 0,
+        "sum_squared": 0,
+        "min": 0,
+        "max": 0,
+        "mean": 0.0,
+        "variance": 0.0,
+        "stddev": 0.0
+      },
+      {
+        "name": {
+          "name": "finish_reason_endoftext",
+          "split": "test"
+        },
+        "count": 1,
+        "sum": 0,
+        "sum_squared": 0,
+        "min": 0,
+        "max": 0,
+        "mean": 0.0,
+        "variance": 0.0,
+        "stddev": 0.0
+      },
+      {
+        "name": {
+          "name": "finish_reason_unknown",
+          "split": "test"
+        },
+        "count": 1,
+        "sum": 1,
+        "sum_squared": 1,
+        "min": 1,
+        "max": 1,
+        "mean": 1.0,
+        "variance": 0.0,
+        "stddev": 0.0
+      },
+      {
+        "name": {
+          "name": "num_train_instances",
+          "split": "test"
+        },
+        "count": 1,
+        "sum": 0,
+        "sum_squared": 0,
+        "min": 0,
+        "max": 0,
+        "mean": 0.0,
+        "variance": 0.0,
+        "stddev": 0.0
+      },
+      {
+        "name": {
+          "name": "prompt_truncated",
+          "split": "test"
+        },
+        "count": 1,
+        "sum": 0,
+        "sum_squared": 0,
+        "min": 0,
+        "max": 0,
+        "mean": 0.0,
+        "variance": 0.0,
+        "stddev": 0.0
+      },
+      {
+        "name": {
+          "name": "max_prob",
+          "split": "test"
+        },
+        "count": 1,
+        "sum": 1.0,
+        "sum_squared": 1.0,
+        "min": 1.0,
+        "max": 1.0,
+        "mean": 1.0,
+        "variance": 0.0,
+        "stddev": 0.0
+      },
+      {
+        "name": {
+          "name": "exact_match",
+          "split": "test"
+        },
+        "count": 1,
+        "sum": 0,
+        "sum_squared": 0,
+        "min": 0,
+        "max": 0,
+        "mean": 0.0,
+        "variance": 0.0,
+        "stddev": 0.0
+      },
+      {
+        "name": {
+          "name": "quasi_exact_match",
+          "split": "test"
+        },
+        "count": 1,
+        "sum": 0,
+        "sum_squared": 0,
+        "min": 0,
+        "max": 0,
+        "mean": 0.0,
+        "variance": 0.0,
+        "stddev": 0.0
+      },
+      {
+        "name": {
+          "name": "f1_score",
+          "split": "test"
+        },
+        "count": 1,
+        "sum": 0,
+        "sum_squared": 0,
+        "min": 0,
+        "max": 0,
+        "mean": 0.0,
+        "variance": 0.0,
+        "stddev": 0.0
+      },
+      {
+        "name": {
+          "name": "rouge_l",
+          "split": "test"
+        },
+        "count": 1,
+        "sum": 0.0,
+        "sum_squared": 0.0,
+        "min": 0.0,
+        "max": 0.0,
+        "mean": 0.0,
+        "variance": 0.0,
+        "stddev": 0.0
+      },
+      {
+        "name": {
+          "name": "bleu_1",
+          "split": "test"
+        },
+        "count": 1,
+        "sum": 0,
+        "sum_squared": 0,
+        "min": 0,
+        "max": 0,
+        "mean": 0.0,
+        "variance": 0.0,
+        "stddev": 0.0
+      },
+      {
+        "name": {
+          "name": "bleu_4",
+          "split": "test"
+        },
+        "count": 1,
+        "sum": 0,
+        "sum_squared": 0,
+        "min": 0,
+        "max": 0,
+        "mean": 0.0,
+        "variance": 0.0,
+        "stddev": 0.0
+      },
+      {
+        "name": {
+          "name": "logprob",
+          "split": "test"
+        },
+        "count": 1,
+        "sum": 0.0,
+        "sum_squared": 0.0,
+        "min": 0.0,
+        "max": 0.0,
+        "mean": 0.0,
+        "variance": 0.0,
+        "stddev": 0.0
+      },
+      {
+        "name": {
+          "name": "num_perplexity_tokens",
+          "split": "test"
+        },
+        "count": 1,
+        "sum": 2,
+        "sum_squared": 4,
+        "min": 2,
+        "max": 2,
+        "mean": 2.0,
+        "variance": 0.0,
+        "stddev": 0.0
+      },
+      {
+        "name": {
+          "name": "num_bytes",
+          "split": "test"
+        },
+        "count": 1,
+        "sum": 7,
+        "sum_squared": 49,
+        "min": 7,
+        "max": 7,
+        "mean": 7.0,
+        "variance": 0.0,
+        "stddev": 0.0
+      }
+    ]
+  },
+  {
+    "instance_id": "id1332",
+    "train_trial_index": 0,
+    "stats": [
+      {
+        "name": {
+          "name": "num_references",
+          "split": "test"
+        },
+        "count": 1,
+        "sum": 2,
+        "sum_squared": 4,
+        "min": 2,
+        "max": 2,
+        "mean": 2.0,
+        "variance": 0.0,
+        "stddev": 0.0
+      },
+      {
+        "name": {
+          "name": "num_train_trials",
+          "split": "test"
+        },
+        "count": 1,
+        "sum": 1,
+        "sum_squared": 1,
+        "min": 1,
+        "max": 1,
+        "mean": 1.0,
+        "variance": 0.0,
+        "stddev": 0.0
+      },
+      {
+        "name": {
+          "name": "num_prompt_tokens",
+          "split": "test"
+        },
+        "count": 1,
+        "sum": 653,
+        "sum_squared": 426409,
+        "min": 653,
+        "max": 653,
+        "mean": 653.0,
+        "variance": 0.0,
+        "stddev": 0.0
+      },
+      {
+        "name": {
+          "name": "num_completion_tokens",
+          "split": "test"
+        },
+        "count": 1,
+        "sum": 45,
+        "sum_squared": 2025,
+        "min": 45,
+        "max": 45,
+        "mean": 45.0,
+        "variance": 0.0,
+        "stddev": 0.0
+      },
+      {
+        "name": {
+          "name": "num_output_tokens",
+          "split": "test"
+        },
+        "count": 1,
+        "sum": 45,
+        "sum_squared": 2025,
+        "min": 45,
+        "max": 45,
+        "mean": 45.0,
+        "variance": 0.0,
+        "stddev": 0.0
+      },
+      {
+        "name": {
+          "name": "training_co2_cost",
+          "split": "test"
+        },
+        "count": 0,
+        "sum": 0,
+        "sum_squared": 0
+      },
+      {
+        "name": {
+          "name": "training_energy_cost",
+          "split": "test"
+        },
+        "count": 0,
+        "sum": 0,
+        "sum_squared": 0
+      },
+      {
+        "name": {
+          "name": "inference_runtime",
+          "split": "test"
+        },
+        "count": 1,
+        "sum": 1.622730016708374,
+        "sum_squared": 2.63325270712636,
+        "min": 1.622730016708374,
+        "max": 1.622730016708374,
+        "mean": 1.622730016708374,
+        "variance": 0.0,
+        "stddev": 0.0
+      },
+      {
+        "name": {
+          "name": "batch_size",
+          "split": "test"
+        },
+        "count": 1,
+        "sum": 1,
+        "sum_squared": 1,
+        "min": 1,
+        "max": 1,
+        "mean": 1.0,
+        "variance": 0.0,
+        "stddev": 0.0
+      },
+      {
+        "name": {
+          "name": "finish_reason_length",
+          "split": "test"
+        },
+        "count": 1,
+        "sum": 0,
+        "sum_squared": 0,
+        "min": 0,
+        "max": 0,
+        "mean": 0.0,
+        "variance": 0.0,
+        "stddev": 0.0
+      },
+      {
+        "name": {
+          "name": "finish_reason_stop",
+          "split": "test"
+        },
+        "count": 1,
+        "sum": 0,
+        "sum_squared": 0,
+        "min": 0,
+        "max": 0,
+        "mean": 0.0,
+        "variance": 0.0,
+        "stddev": 0.0
+      },
+      {
+        "name": {
+          "name": "finish_reason_endoftext",
+          "split": "test"
+        },
+        "count": 1,
+        "sum": 0,
+        "sum_squared": 0,
+        "min": 0,
+        "max": 0,
+        "mean": 0.0,
+        "variance": 0.0,
+        "stddev": 0.0
+      },
+      {
+        "name": {
+          "name": "finish_reason_unknown",
+          "split": "test"
+        },
+        "count": 1,
+        "sum": 1,
+        "sum_squared": 1,
+        "min": 1,
+        "max": 1,
+        "mean": 1.0,
+        "variance": 0.0,
+        "stddev": 0.0
+      },
+      {
+        "name": {
+          "name": "num_train_instances",
+          "split": "test"
+        },
+        "count": 1,
+        "sum": 0,
+        "sum_squared": 0,
+        "min": 0,
+        "max": 0,
+        "mean": 0.0,
+        "variance": 0.0,
+        "stddev": 0.0
+      },
+      {
+        "name": {
+          "name": "prompt_truncated",
+          "split": "test"
+        },
+        "count": 1,
+        "sum": 0,
+        "sum_squared": 0,
+        "min": 0,
+        "max": 0,
+        "mean": 0.0,
+        "variance": 0.0,
+        "stddev": 0.0
+      },
+      {
+        "name": {
+          "name": "max_prob",
+          "split": "test"
+        },
+        "count": 1,
+        "sum": 1.0,
+        "sum_squared": 1.0,
+        "min": 1.0,
+        "max": 1.0,
+        "mean": 1.0,
+        "variance": 0.0,
+        "stddev": 0.0
+      },
+      {
+        "name": {
+          "name": "exact_match",
+          "split": "test"
+        },
+        "count": 1,
+        "sum": 0,
+        "sum_squared": 0,
+        "min": 0,
+        "max": 0,
+        "mean": 0.0,
+        "variance": 0.0,
+        "stddev": 0.0
+      },
+      {
+        "name": {
+          "name": "quasi_exact_match",
+          "split": "test"
+        },
+        "count": 1,
+        "sum": 0,
+        "sum_squared": 0,
+        "min": 0,
+        "max": 0,
+        "mean": 0.0,
+        "variance": 0.0,
+        "stddev": 0.0
+      },
+      {
+        "name": {
+          "name": "f1_score",
+          "split": "test"
+        },
+        "count": 1,
+        "sum": 0.3333333333333333,
+        "sum_squared": 0.1111111111111111,
+        "min": 0.3333333333333333,
+        "max": 0.3333333333333333,
+        "mean": 0.3333333333333333,
+        "variance": 0.0,
+        "stddev": 0.0
+      },
+      {
+        "name": {
+          "name": "rouge_l",
+          "split": "test"
+        },
+        "count": 1,
+        "sum": 0.09090909090909091,
+        "sum_squared": 0.008264462809917356,
+        "min": 0.09090909090909091,
+        "max": 0.09090909090909091,
+        "mean": 0.09090909090909091,
+        "variance": 0.0,
+        "stddev": 0.0
+      },
+      {
+        "name": {
+          "name": "bleu_1",
+          "split": "test"
+        },
+        "count": 1,
+        "sum": 0.06666666666666667,
+        "sum_squared": 0.0044444444444444444,
+        "min": 0.06666666666666667,
+        "max": 0.06666666666666667,
+        "mean": 0.06666666666666667,
+        "variance": 0.0,
+        "stddev": 0.0
+      },
+      {
+        "name": {
+          "name": "bleu_4",
+          "split": "test"
+        },
+        "count": 1,
+        "sum": 2.2250738585072626e-308,
+        "sum_squared": 0.0,
+        "min": 2.2250738585072626e-308,
+        "max": 2.2250738585072626e-308,
+        "mean": 2.2250738585072626e-308,
+        "variance": 0.0,
+        "stddev": 0.0
+      },
+      {
+        "name": {
+          "name": "logprob",
+          "split": "test"
+        },
+        "count": 1,
+        "sum": 0.0,
+        "sum_squared": 0.0,
+        "min": 0.0,
+        "max": 0.0,
+        "mean": 0.0,
+        "variance": 0.0,
+        "stddev": 0.0
+      },
+      {
+        "name": {
+          "name": "num_perplexity_tokens",
+          "split": "test"
+        },
+        "count": 1,
+        "sum": 45,
+        "sum_squared": 2025,
+        "min": 45,
+        "max": 45,
+        "mean": 45.0,
+        "variance": 0.0,
+        "stddev": 0.0
+      },
+      {
+        "name": {
+          "name": "num_bytes",
+          "split": "test"
+        },
+        "count": 1,
+        "sum": 213,
+        "sum_squared": 45369,
+        "min": 213,
+        "max": 213,
+        "mean": 213.0,
+        "variance": 0.0,
+        "stddev": 0.0
+      }
+    ]
+  },
+  {
+    "instance_id": "id1123",
+    "train_trial_index": 0,
+    "stats": [
+      {
+        "name": {
+          "name": "num_references",
+          "split": "valid"
+        },
+        "count": 1,
+        "sum": 2,
+        "sum_squared": 4,
+        "min": 2,
+        "max": 2,
+        "mean": 2.0,
+        "variance": 0.0,
+        "stddev": 0.0
+      },
+      {
+        "name": {
+          "name": "num_train_trials",
+          "split": "valid"
+        },
+        "count": 1,
+        "sum": 1,
+        "sum_squared": 1,
+        "min": 1,
+        "max": 1,
+        "mean": 1.0,
+        "variance": 0.0,
+        "stddev": 0.0
+      },
+      {
+        "name": {
+          "name": "num_prompt_tokens",
+          "split": "valid"
+        },
+        "count": 1,
+        "sum": 686,
+        "sum_squared": 470596,
+        "min": 686,
+        "max": 686,
+        "mean": 686.0,
+        "variance": 0.0,
+        "stddev": 0.0
+      },
+      {
+        "name": {
+          "name": "num_completion_tokens",
+          "split": "valid"
+        },
+        "count": 1,
+        "sum": 57,
+        "sum_squared": 3249,
+        "min": 57,
+        "max": 57,
+        "mean": 57.0,
+        "variance": 0.0,
+        "stddev": 0.0
+      },
+      {
+        "name": {
+          "name": "num_output_tokens",
+          "split": "valid"
+        },
+        "count": 1,
+        "sum": 57,
+        "sum_squared": 3249,
+        "min": 57,
+        "max": 57,
+        "mean": 57.0,
+        "variance": 0.0,
+        "stddev": 0.0
+      },
+      {
+        "name": {
+          "name": "training_co2_cost",
+          "split": "valid"
+        },
+        "count": 0,
+        "sum": 0,
+        "sum_squared": 0
+      },
+      {
+        "name": {
+          "name": "training_energy_cost",
+          "split": "valid"
+        },
+        "count": 0,
+        "sum": 0,
+        "sum_squared": 0
+      },
+      {
+        "name": {
+          "name": "inference_runtime",
+          "split": "valid"
+        },
+        "count": 1,
+        "sum": 1.743454933166504,
+        "sum_squared": 3.0396351039826186,
+        "min": 1.743454933166504,
+        "max": 1.743454933166504,
+        "mean": 1.743454933166504,
+        "variance": 0.0,
+        "stddev": 0.0
+      },
+      {
+        "name": {
+          "name": "batch_size",
+          "split": "valid"
+        },
+        "count": 1,
+        "sum": 1,
+        "sum_squared": 1,
+        "min": 1,
+        "max": 1,
+        "mean": 1.0,
+        "variance": 0.0,
+        "stddev": 0.0
+      },
+      {
+        "name": {
+          "name": "finish_reason_length",
+          "split": "valid"
+        },
+        "count": 1,
+        "sum": 0,
+        "sum_squared": 0,
+        "min": 0,
+        "max": 0,
+        "mean": 0.0,
+        "variance": 0.0,
+        "stddev": 0.0
+      },
+      {
+        "name": {
+          "name": "finish_reason_stop",
+          "split": "valid"
+        },
+        "count": 1,
+        "sum": 0,
+        "sum_squared": 0,
+        "min": 0,
+        "max": 0,
+        "mean": 0.0,
+        "variance": 0.0,
+        "stddev": 0.0
+      },
+      {
+        "name": {
+          "name": "finish_reason_endoftext",
+          "split": "valid"
+        },
+        "count": 1,
+        "sum": 0,
+        "sum_squared": 0,
+        "min": 0,
+        "max": 0,
+        "mean": 0.0,
+        "variance": 0.0,
+        "stddev": 0.0
+      },
+      {
+        "name": {
+          "name": "finish_reason_unknown",
+          "split": "valid"
+        },
+        "count": 1,
+        "sum": 1,
+        "sum_squared": 1,
+        "min": 1,
+        "max": 1,
+        "mean": 1.0,
+        "variance": 0.0,
+        "stddev": 0.0
+      },
+      {
+        "name": {
+          "name": "num_train_instances",
+          "split": "valid"
+        },
+        "count": 1,
+        "sum": 0,
+        "sum_squared": 0,
+        "min": 0,
+        "max": 0,
+        "mean": 0.0,
+        "variance": 0.0,
+        "stddev": 0.0
+      },
+      {
+        "name": {
+          "name": "prompt_truncated",
+          "split": "valid"
+        },
+        "count": 1,
+        "sum": 0,
+        "sum_squared": 0,
+        "min": 0,
+        "max": 0,
+        "mean": 0.0,
+        "variance": 0.0,
+        "stddev": 0.0
+      },
+      {
+        "name": {
+          "name": "max_prob",
+          "split": "valid"
+        },
+        "count": 1,
+        "sum": 1.0,
+        "sum_squared": 1.0,
+        "min": 1.0,
+        "max": 1.0,
+        "mean": 1.0,
+        "variance": 0.0,
+        "stddev": 0.0
+      },
+      {
+        "name": {
+          "name": "exact_match",
+          "split": "valid"
+        },
+        "count": 1,
+        "sum": 0,
+        "sum_squared": 0,
+        "min": 0,
+        "max": 0,
+        "mean": 0.0,
+        "variance": 0.0,
+        "stddev": 0.0
+      },
+      {
+        "name": {
+          "name": "quasi_exact_match",
+          "split": "valid"
+        },
+        "count": 1,
+        "sum": 0,
+        "sum_squared": 0,
+        "min": 0,
+        "max": 0,
+        "mean": 0.0,
+        "variance": 0.0,
+        "stddev": 0.0
+      },
+      {
+        "name": {
+          "name": "f1_score",
+          "split": "valid"
+        },
+        "count": 1,
+        "sum": 0,
+        "sum_squared": 0,
+        "min": 0,
+        "max": 0,
+        "mean": 0.0,
+        "variance": 0.0,
+        "stddev": 0.0
+      },
+      {
+        "name": {
+          "name": "rouge_l",
+          "split": "valid"
+        },
+        "count": 1,
+        "sum": 0.03636363636363636,
+        "sum_squared": 0.0013223140495867767,
+        "min": 0.03636363636363636,
+        "max": 0.03636363636363636,
+        "mean": 0.03636363636363636,
+        "variance": 0.0,
+        "stddev": 0.0
+      },
+      {
+        "name": {
+          "name": "bleu_1",
+          "split": "valid"
+        },
+        "count": 1,
+        "sum": 0.03508771929824561,
+        "sum_squared": 0.0012311480455524776,
+        "min": 0.03508771929824561,
+        "max": 0.03508771929824561,
+        "mean": 0.03508771929824561,
+        "variance": 0.0,
+        "stddev": 0.0
+      },
+      {
+        "name": {
+          "name": "bleu_4",
+          "split": "valid"
+        },
+        "count": 1,
+        "sum": 2.2250738585072626e-308,
+        "sum_squared": 0.0,
+        "min": 2.2250738585072626e-308,
+        "max": 2.2250738585072626e-308,
+        "mean": 2.2250738585072626e-308,
+        "variance": 0.0,
+        "stddev": 0.0
+      },
+      {
+        "name": {
+          "name": "logprob",
+          "split": "valid"
+        },
+        "count": 1,
+        "sum": 0.0,
+        "sum_squared": 0.0,
+        "min": 0.0,
+        "max": 0.0,
+        "mean": 0.0,
+        "variance": 0.0,
+        "stddev": 0.0
+      },
+      {
+        "name": {
+          "name": "num_perplexity_tokens",
+          "split": "valid"
+        },
+        "count": 1,
+        "sum": 57,
+        "sum_squared": 3249,
+        "min": 57,
+        "max": 57,
+        "mean": 57.0,
+        "variance": 0.0,
+        "stddev": 0.0
+      },
+      {
+        "name": {
+          "name": "num_bytes",
+          "split": "valid"
+        },
+        "count": 1,
+        "sum": 270,
+        "sum_squared": 72900,
+        "min": 270,
+        "max": 270,
+        "mean": 270.0,
+        "variance": 0.0,
+        "stddev": 0.0
+      }
+    ]
+  },
+  {
+    "instance_id": "id1514",
+    "train_trial_index": 0,
+    "stats": [
+      {
+        "name": {
+          "name": "num_references",
+          "split": "test"
+        },
+        "count": 1,
+        "sum": 2,
+        "sum_squared": 4,
+        "min": 2,
+        "max": 2,
+        "mean": 2.0,
+        "variance": 0.0,
+        "stddev": 0.0
+      },
+      {
+        "name": {
+          "name": "num_train_trials",
+          "split": "test"
+        },
+        "count": 1,
+        "sum": 1,
+        "sum_squared": 1,
+        "min": 1,
+        "max": 1,
+        "mean": 1.0,
+        "variance": 0.0,
+        "stddev": 0.0
+      },
+      {
+        "name": {
+          "name": "num_prompt_tokens",
+          "split": "test"
+        },
+        "count": 1,
+        "sum": 772,
+        "sum_squared": 595984,
+        "min": 772,
+        "max": 772,
+        "mean": 772.0,
+        "variance": 0.0,
+        "stddev": 0.0
+      },
+      {
+        "name": {
+          "name": "num_completion_tokens",
+          "split": "test"
+        },
+        "count": 1,
+        "sum": 5,
+        "sum_squared": 25,
+        "min": 5,
+        "max": 5,
+        "mean": 5.0,
+        "variance": 0.0,
+        "stddev": 0.0
+      },
+      {
+        "name": {
+          "name": "num_output_tokens",
+          "split": "test"
+        },
+        "count": 1,
+        "sum": 5,
+        "sum_squared": 25,
+        "min": 5,
+        "max": 5,
+        "mean": 5.0,
+        "variance": 0.0,
+        "stddev": 0.0
+      },
+      {
+        "name": {
+          "name": "training_co2_cost",
+          "split": "test"
+        },
+        "count": 0,
+        "sum": 0,
+        "sum_squared": 0
+      },
+      {
+        "name": {
+          "name": "training_energy_cost",
+          "split": "test"
+        },
+        "count": 0,
+        "sum": 0,
+        "sum_squared": 0
+      },
+      {
+        "name": {
+          "name": "inference_runtime",
+          "split": "test"
+        },
+        "count": 1,
+        "sum": 0.8950591087341309,
+        "sum_squared": 0.8011308081279367,
+        "min": 0.8950591087341309,
+        "max": 0.8950591087341309,
+        "mean": 0.8950591087341309,
+        "variance": 0.0,
+        "stddev": 0.0
+      },
+      {
+        "name": {
+          "name": "batch_size",
+          "split": "test"
+        },
+        "count": 1,
+        "sum": 1,
+        "sum_squared": 1,
+        "min": 1,
+        "max": 1,
+        "mean": 1.0,
+        "variance": 0.0,
+        "stddev": 0.0
+      },
+      {
+        "name": {
+          "name": "finish_reason_length",
+          "split": "test"
+        },
+        "count": 1,
+        "sum": 0,
+        "sum_squared": 0,
+        "min": 0,
+        "max": 0,
+        "mean": 0.0,
+        "variance": 0.0,
+        "stddev": 0.0
+      },
+      {
+        "name": {
+          "name": "finish_reason_stop",
+          "split": "test"
+        },
+        "count": 1,
+        "sum": 0,
+        "sum_squared": 0,
+        "min": 0,
+        "max": 0,
+        "mean": 0.0,
+        "variance": 0.0,
+        "stddev": 0.0
+      },
+      {
+        "name": {
+          "name": "finish_reason_endoftext",
+          "split": "test"
+        },
+        "count": 1,
+        "sum": 0,
+        "sum_squared": 0,
+        "min": 0,
+        "max": 0,
+        "mean": 0.0,
+        "variance": 0.0,
+        "stddev": 0.0
+      },
+      {
+        "name": {
+          "name": "finish_reason_unknown",
+          "split": "test"
+        },
+        "count": 1,
+        "sum": 1,
+        "sum_squared": 1,
+        "min": 1,
+        "max": 1,
+        "mean": 1.0,
+        "variance": 0.0,
+        "stddev": 0.0
+      },
+      {
+        "name": {
+          "name": "num_train_instances",
+          "split": "test"
+        },
+        "count": 1,
+        "sum": 0,
+        "sum_squared": 0,
+        "min": 0,
+        "max": 0,
+        "mean": 0.0,
+        "variance": 0.0,
+        "stddev": 0.0
+      },
+      {
+        "name": {
+          "name": "prompt_truncated",
+          "split": "test"
+        },
+        "count": 1,
+        "sum": 0,
+        "sum_squared": 0,
+        "min": 0,
+        "max": 0,
+        "mean": 0.0,
+        "variance": 0.0,
+        "stddev": 0.0
+      },
+      {
+        "name": {
+          "name": "max_prob",
+          "split": "test"
+        },
+        "count": 1,
+        "sum": 1.0,
+        "sum_squared": 1.0,
+        "min": 1.0,
+        "max": 1.0,
+        "mean": 1.0,
+        "variance": 0.0,
+        "stddev": 0.0
+      },
+      {
+        "name": {
+          "name": "exact_match",
+          "split": "test"
+        },
+        "count": 1,
+        "sum": 0,
+        "sum_squared": 0,
+        "min": 0,
+        "max": 0,
+        "mean": 0.0,
+        "variance": 0.0,
+        "stddev": 0.0
+      },
+      {
+        "name": {
+          "name": "quasi_exact_match",
+          "split": "test"
+        },
+        "count": 1,
+        "sum": 0,
+        "sum_squared": 0,
+        "min": 0,
+        "max": 0,
+        "mean": 0.0,
+        "variance": 0.0,
+        "stddev": 0.0
+      },
+      {
+        "name": {
+          "name": "f1_score",
+          "split": "test"
+        },
+        "count": 1,
+        "sum": 0,
+        "sum_squared": 0,
+        "min": 0,
+        "max": 0,
+        "mean": 0.0,
+        "variance": 0.0,
+        "stddev": 0.0
+      },
+      {
+        "name": {
+          "name": "rouge_l",
+          "split": "test"
+        },
+        "count": 1,
+        "sum": 0.0,
+        "sum_squared": 0.0,
+        "min": 0.0,
+        "max": 0.0,
+        "mean": 0.0,
+        "variance": 0.0,
+        "stddev": 0.0
+      },
+      {
+        "name": {
+          "name": "bleu_1",
+          "split": "test"
+        },
+        "count": 1,
+        "sum": 0,
+        "sum_squared": 0,
+        "min": 0,
+        "max": 0,
+        "mean": 0.0,
+        "variance": 0.0,
+        "stddev": 0.0
+      },
+      {
+        "name": {
+          "name": "bleu_4",
+          "split": "test"
+        },
+        "count": 1,
+        "sum": 0,
+        "sum_squared": 0,
+        "min": 0,
+        "max": 0,
+        "mean": 0.0,
+        "variance": 0.0,
+        "stddev": 0.0
+      },
+      {
+        "name": {
+          "name": "logprob",
+          "split": "test"
+        },
+        "count": 1,
+        "sum": 0.0,
+        "sum_squared": 0.0,
+        "min": 0.0,
+        "max": 0.0,
+        "mean": 0.0,
+        "variance": 0.0,
+        "stddev": 0.0
+      },
+      {
+        "name": {
+          "name": "num_perplexity_tokens",
+          "split": "test"
+        },
+        "count": 1,
+        "sum": 5,
+        "sum_squared": 25,
+        "min": 5,
+        "max": 5,
+        "mean": 5.0,
+        "variance": 0.0,
+        "stddev": 0.0
+      },
+      {
+        "name": {
+          "name": "num_bytes",
+          "split": "test"
+        },
+        "count": 1,
+        "sum": 13,
+        "sum_squared": 169,
+        "min": 13,
+        "max": 13,
+        "mean": 13.0,
+        "variance": 0.0,
+        "stddev": 0.0
+      }
+    ]
+  },
+  {
+    "instance_id": "id1340",
+    "train_trial_index": 0,
+    "stats": [
+      {
+        "name": {
+          "name": "num_references",
+          "split": "test"
+        },
+        "count": 1,
+        "sum": 2,
+        "sum_squared": 4,
+        "min": 2,
+        "max": 2,
+        "mean": 2.0,
+        "variance": 0.0,
+        "stddev": 0.0
+      },
+      {
+        "name": {
+          "name": "num_train_trials",
+          "split": "test"
+        },
+        "count": 1,
+        "sum": 1,
+        "sum_squared": 1,
+        "min": 1,
+        "max": 1,
+        "mean": 1.0,
+        "variance": 0.0,
+        "stddev": 0.0
+      },
+      {
+        "name": {
+          "name": "num_prompt_tokens",
+          "split": "test"
+        },
+        "count": 1,
+        "sum": 635,
+        "sum_squared": 403225,
+        "min": 635,
+        "max": 635,
+        "mean": 635.0,
+        "variance": 0.0,
+        "stddev": 0.0
+      },
+      {
+        "name": {
+          "name": "num_completion_tokens",
+          "split": "test"
+        },
+        "count": 1,
+        "sum": 100,
+        "sum_squared": 10000,
+        "min": 100,
+        "max": 100,
+        "mean": 100.0,
+        "variance": 0.0,
+        "stddev": 0.0
+      },
+      {
+        "name": {
+          "name": "num_output_tokens",
+          "split": "test"
+        },
+        "count": 1,
+        "sum": 100,
+        "sum_squared": 10000,
+        "min": 100,
+        "max": 100,
+        "mean": 100.0,
+        "variance": 0.0,
+        "stddev": 0.0
+      },
+      {
+        "name": {
+          "name": "training_co2_cost",
+          "split": "test"
+        },
+        "count": 0,
+        "sum": 0,
+        "sum_squared": 0
+      },
+      {
+        "name": {
+          "name": "training_energy_cost",
+          "split": "test"
+        },
+        "count": 0,
+        "sum": 0,
+        "sum_squared": 0
+      },
+      {
+        "name": {
+          "name": "inference_runtime",
+          "split": "test"
+        },
+        "count": 1,
+        "sum": 1.3460521697998047,
+        "sum_squared": 1.8118564438227622,
+        "min": 1.3460521697998047,
+        "max": 1.3460521697998047,
+        "mean": 1.3460521697998047,
+        "variance": 0.0,
+        "stddev": 0.0
+      },
+      {
+        "name": {
+          "name": "batch_size",
+          "split": "test"
+        },
+        "count": 1,
+        "sum": 1,
+        "sum_squared": 1,
+        "min": 1,
+        "max": 1,
+        "mean": 1.0,
+        "variance": 0.0,
+        "stddev": 0.0
+      },
+      {
+        "name": {
+          "name": "finish_reason_length",
+          "split": "test"
+        },
+        "count": 1,
+        "sum": 0,
+        "sum_squared": 0,
+        "min": 0,
+        "max": 0,
+        "mean": 0.0,
+        "variance": 0.0,
+        "stddev": 0.0
+      },
+      {
+        "name": {
+          "name": "finish_reason_stop",
+          "split": "test"
+        },
+        "count": 1,
+        "sum": 0,
+        "sum_squared": 0,
+        "min": 0,
+        "max": 0,
+        "mean": 0.0,
+        "variance": 0.0,
+        "stddev": 0.0
+      },
+      {
+        "name": {
+          "name": "finish_reason_endoftext",
+          "split": "test"
+        },
+        "count": 1,
+        "sum": 0,
+        "sum_squared": 0,
+        "min": 0,
+        "max": 0,
+        "mean": 0.0,
+        "variance": 0.0,
+        "stddev": 0.0
+      },
+      {
+        "name": {
+          "name": "finish_reason_unknown",
+          "split": "test"
+        },
+        "count": 1,
+        "sum": 1,
+        "sum_squared": 1,
+        "min": 1,
+        "max": 1,
+        "mean": 1.0,
+        "variance": 0.0,
+        "stddev": 0.0
+      },
+      {
+        "name": {
+          "name": "num_train_instances",
+          "split": "test"
+        },
+        "count": 1,
+        "sum": 1,
+        "sum_squared": 1,
+        "min": 1,
+        "max": 1,
+        "mean": 1.0,
+        "variance": 0.0,
+        "stddev": 0.0
+      },
+      {
+        "name": {
+          "name": "prompt_truncated",
+          "split": "test"
+        },
+        "count": 1,
+        "sum": 0,
+        "sum_squared": 0,
+        "min": 0,
+        "max": 0,
+        "mean": 0.0,
+        "variance": 0.0,
+        "stddev": 0.0
+      },
+      {
+        "name": {
+          "name": "max_prob",
+          "split": "test"
+        },
+        "count": 1,
+        "sum": 1.0,
+        "sum_squared": 1.0,
+        "min": 1.0,
+        "max": 1.0,
+        "mean": 1.0,
+        "variance": 0.0,
+        "stddev": 0.0
+      },
+      {
+        "name": {
+          "name": "exact_match",
+          "split": "test"
+        },
+        "count": 1,
+        "sum": 0,
+        "sum_squared": 0,
+        "min": 0,
+        "max": 0,
+        "mean": 0.0,
+        "variance": 0.0,
+        "stddev": 0.0
+      },
+      {
+        "name": {
+          "name": "quasi_exact_match",
+          "split": "test"
+        },
+        "count": 1,
+        "sum": 0,
+        "sum_squared": 0,
+        "min": 0,
+        "max": 0,
+        "mean": 0.0,
+        "variance": 0.0,
+        "stddev": 0.0
+      },
+      {
+        "name": {
+          "name": "f1_score",
+          "split": "test"
+        },
+        "count": 1,
+        "sum": 0.36363636363636365,
+        "sum_squared": 0.1322314049586777,
+        "min": 0.36363636363636365,
+        "max": 0.36363636363636365,
+        "mean": 0.36363636363636365,
+        "variance": 0.0,
+        "stddev": 0.0
+      },
+      {
+        "name": {
+          "name": "rouge_l",
+          "split": "test"
+        },
+        "count": 1,
+        "sum": 0.12371134020618557,
+        "sum_squared": 0.015304495695610586,
+        "min": 0.12371134020618557,
+        "max": 0.12371134020618557,
+        "mean": 0.12371134020618557,
+        "variance": 0.0,
+        "stddev": 0.0
+      },
+      {
+        "name": {
+          "name": "bleu_1",
+          "split": "test"
+        },
+        "count": 1,
+        "sum": 0.07608695652173914,
+        "sum_squared": 0.005789224952741021,
+        "min": 0.07608695652173914,
+        "max": 0.07608695652173914,
+        "mean": 0.07608695652173914,
+        "variance": 0.0,
+        "stddev": 0.0
+      },
+      {
+        "name": {
+          "name": "bleu_4",
+          "split": "test"
+        },
+        "count": 1,
+        "sum": 2.2250738585072626e-308,
+        "sum_squared": 0.0,
+        "min": 2.2250738585072626e-308,
+        "max": 2.2250738585072626e-308,
+        "mean": 2.2250738585072626e-308,
+        "variance": 0.0,
+        "stddev": 0.0
+      },
+      {
+        "name": {
+          "name": "logprob",
+          "split": "test"
+        },
+        "count": 1,
+        "sum": 0.0,
+        "sum_squared": 0.0,
+        "min": 0.0,
+        "max": 0.0,
+        "mean": 0.0,
+        "variance": 0.0,
+        "stddev": 0.0
+      },
+      {
+        "name": {
+          "name": "num_perplexity_tokens",
+          "split": "test"
+        },
+        "count": 1,
+        "sum": 100,
+        "sum_squared": 10000,
+        "min": 100,
+        "max": 100,
+        "mean": 100.0,
+        "variance": 0.0,
+        "stddev": 0.0
+      },
+      {
+        "name": {
+          "name": "num_bytes",
+          "split": "test"
+        },
+        "count": 1,
+        "sum": 379,
+        "sum_squared": 143641,
+        "min": 379,
+        "max": 379,
+        "mean": 379.0,
+        "variance": 0.0,
+        "stddev": 0.0
+      }
+    ]
+  }
+]
\ No newline at end of file
diff --git a/tests/data/helm/narrative_qa:model=openai_gpt2/run_spec.json b/tests/data/helm/narrative_qa:model=openai_gpt2/run_spec.json
new file mode 100644
index 0000000..ff005c9
--- /dev/null
+++ b/tests/data/helm/narrative_qa:model=openai_gpt2/run_spec.json
@@ -0,0 +1,73 @@
+{
+  "name": "narrative_qa:model=openai_gpt2",
+  "scenario_spec": {
+    "class_name": "helm.benchmark.scenarios.narrativeqa_scenario.NarrativeQAScenario",
+    "args": {}
+  },
+  "adapter_spec": {
+    "method": "generation",
+    "global_prefix": "",
+    "global_suffix": "",
+    "instructions": "",
+    "input_prefix": "Passage: ",
+    "input_suffix": "\n",
+    "reference_prefix": "A. ",
+    "reference_suffix": "\n",
+    "chain_of_thought_prefix": "",
+    "chain_of_thought_suffix": "\n",
+    "output_prefix": "Answer: ",
+    "output_suffix": "\n",
+    "instance_prefix": "\n",
+    "substitutions": [],
+    "max_train_instances": 5,
+    "max_eval_instances": 5,
+    "num_outputs": 1,
+    "num_train_trials": 1,
+    "num_trials": 1,
+    "sample_train": true,
+    "model_deployment": "huggingface/gpt2",
+    "model": "openai/gpt2",
+    "temperature": 0.0,
+    "max_tokens": 100,
+    "stop_sequences": [
+      "\n"
+    ],
+    "multi_label": false
+  },
+  "metric_specs": [
+    {
+      "class_name": "helm.benchmark.metrics.basic_metrics.BasicGenerationMetric",
+      "args": {
+        "names": [
+          "exact_match",
+          "quasi_exact_match",
+          "f1_score",
+          "rouge_l",
+          "bleu_1",
+          "bleu_4"
+        ]
+      }
+    },
+    {
+      "class_name": "helm.benchmark.metrics.basic_metrics.BasicReferenceMetric",
+      "args": {}
+    },
+    {
+      "class_name": "helm.benchmark.metrics.basic_metrics.InstancesPerSplitMetric",
+      "args": {}
+    }
+  ],
+  "data_augmenter_spec": {
+    "perturbation_specs": [],
+    "should_augment_train_instances": false,
+    "should_include_original_train": false,
+    "should_skip_unchanged_train": false,
+    "should_augment_eval_instances": false,
+    "should_include_original_eval": false,
+    "should_skip_unchanged_eval": false,
+    "seeds_per_instance": 1
+  },
+  "groups": [
+    "narrative_qa"
+  ]
+}
\ No newline at end of file
diff --git a/tests/data/helm/narrative_qa:model=openai_gpt2/scenario.json b/tests/data/helm/narrative_qa:model=openai_gpt2/scenario.json
new file mode 100644
index 0000000..8d4345f
--- /dev/null
+++ b/tests/data/helm/narrative_qa:model=openai_gpt2/scenario.json
@@ -0,0 +1,8 @@
+{
+  "name": "narrativeqa",
+  "description": "Question answering using summaries of books/movie scripts.",
+  "tags": [
+    "question_answering"
+  ],
+  "definition_path": "https://github.com/stanford-crfm/helm/blob/main/src/helm/benchmark/scenarios/narrativeqa_scenario.py"
+}
\ No newline at end of file
diff --git a/tests/data/helm/narrative_qa:model=openai_gpt2/scenario_state.json b/tests/data/helm/narrative_qa:model=openai_gpt2/scenario_state.json
new file mode 100644
index 0000000..936e684
--- /dev/null
+++ b/tests/data/helm/narrative_qa:model=openai_gpt2/scenario_state.json
@@ -0,0 +1,1185 @@
+{
+  "adapter_spec": {
+    "method": "generation",
+    "global_prefix": "",
+    "global_suffix": "",
+    "instructions": "",
+    "input_prefix": "Passage: ",
+    "input_suffix": "\n",
+    "reference_prefix": "A. ",
+    "reference_suffix": "\n",
+    "chain_of_thought_prefix": "",
+    "chain_of_thought_suffix": "\n",
+    "output_prefix": "Answer: ",
+    "output_suffix": "\n",
+    "instance_prefix": "\n",
+    "substitutions": [],
+    "max_train_instances": 5,
+    "max_eval_instances": 5,
+    "num_outputs": 1,
+    "num_train_trials": 1,
+    "num_trials": 1,
+    "sample_train": true,
+    "model_deployment": "huggingface/gpt2",
+    "model": "openai/gpt2",
+    "temperature": 0.0,
+    "max_tokens": 100,
+    "stop_sequences": [
+      "\n"
+    ],
+    "multi_label": false
+  },
+  "request_states": [
+    {
+      "instance": {
+        "input": {
+          "text": "Olive Penderghast, a 17-year-old girl living in Ojai, California lies to her best friend Rhiannon Abernathy about going on a date in order to get out of camping with Rhiannon's hippie parents. Instead, she hangs around the house all weekend listening to Natasha Bedingfield's \"Pocketful of Sunshine\", which is played by a greeting card she was sent. The following Monday, pressed by Rhiannon, Olive lies about losing her virginity to a college guy. Marianne Bryant, a prissy and strictly religious Christian at their school, overhears her telling the lie and soon it spreads like wildfire. The school's conservative church group run by Marianne decides Olive will be their next project. Olive confides the truth to her friend Brandon, and he explains how others bully him because of his homosexuality. He later asks Olive to pretend to sleep with him so that he will be accepted by everyone as a 'straight stud'.\nBrandon convinces Olive to help him and they pretend to have sex at a party. After having a fight with Rhiannon over Olive's new identity as a \"dirty skank\", Olive decides to counteract the harassment by embracing her new image as the school tramp. She begins to wear more provocative clothing and stitches a red \"A\" to everything she wears. Boys who usually have had no luck with girls in the past beg Olive to say they have had sex with her in order to increase their own popularity, in exchange for gift cards to various stores, in turn increasing her reputation. Things get worse when Micah, Marianne's 22-year-old boyfriend, contracts chlamydia from sleeping with Mrs. Griffith, the school guidance counsellor, and blames it all on Olive. Olive agrees to lie to cover up the affair so that the marriage of her favorite teacher, Mr. Griffith, would be spared.\nMarianne's religious clique, which now includes Rhiannon, begins harassing Olive in order to get her to leave school. After an ill-fated date with Anson, a boy who wants to pay her to actually sleep with him and not just pretend she did, Olive reconnects with Todd, her old crush, who is also the school's mascot. Todd then tells her that he does not believe the rumors because he remembers when she lied for him when he was not ready for his first kiss years ago. Olive then begins to ask everyone she lied for to help her out by telling the truth, but Brandon and Micah have abruptly left town and everyone else is enjoying their newfound popularity and do not want the truth to get out. Mrs. Griffith also refuses to tell the truth and when Olive threatens to expose her, Mrs. Griffith rebuffs her, saying no one would believe her.\nOlive, out of spite, then immediately tells Mr. Griffith, who believes her and separates from Mrs. Griffith. After a friendly talk with her eccentric, open-minded mother Rosemary, Olive comes up with a plan to get everything finally out in the open. She then does a song and dance number at a school pep rally to get people's attention to watch her via web cam, where she confesses what she has done (the web cam is the framing device of the film). The various boys whose reputations Olive helped improve are also shown watching. Later, Olive texts Rhiannon, apologizing for lying to her. When she is finishing up her web cast, Todd comes by riding a lawnmower and tells her to come outside. She signs off by saying she may lose her virginity to Todd, and proudly declares it's nobody's business (much to Marianne's disgrace). She goes outside to meet him, they kiss and the two are shown riding off on the lawnmower.\nQuestion: Who is Todd besides Olive's old crush?"
+        },
+        "references": [
+          {
+            "output": {
+              "text": "The school Mascot"
+            },
+            "tags": [
+              "correct"
+            ]
+          },
+          {
+            "output": {
+              "text": "the schools mascot"
+            },
+            "tags": [
+              "correct"
+            ]
+          }
+        ],
+        "split": "test",
+        "id": "id1413"
+      },
+      "train_trial_index": 0,
+      "request": {
+        "model_deployment": "huggingface/gpt2",
+        "model": "openai/gpt2",
+        "embedding": false,
+        "prompt": "Passage: Olive Penderghast, a 17-year-old girl living in Ojai, California lies to her best friend Rhiannon Abernathy about going on a date in order to get out of camping with Rhiannon's hippie parents. Instead, she hangs around the house all weekend listening to Natasha Bedingfield's \"Pocketful of Sunshine\", which is played by a greeting card she was sent. The following Monday, pressed by Rhiannon, Olive lies about losing her virginity to a college guy. Marianne Bryant, a prissy and strictly religious Christian at their school, overhears her telling the lie and soon it spreads like wildfire. The school's conservative church group run by Marianne decides Olive will be their next project. Olive confides the truth to her friend Brandon, and he explains how others bully him because of his homosexuality. He later asks Olive to pretend to sleep with him so that he will be accepted by everyone as a 'straight stud'.\nBrandon convinces Olive to help him and they pretend to have sex at a party. After having a fight with Rhiannon over Olive's new identity as a \"dirty skank\", Olive decides to counteract the harassment by embracing her new image as the school tramp. She begins to wear more provocative clothing and stitches a red \"A\" to everything she wears. Boys who usually have had no luck with girls in the past beg Olive to say they have had sex with her in order to increase their own popularity, in exchange for gift cards to various stores, in turn increasing her reputation. Things get worse when Micah, Marianne's 22-year-old boyfriend, contracts chlamydia from sleeping with Mrs. Griffith, the school guidance counsellor, and blames it all on Olive. Olive agrees to lie to cover up the affair so that the marriage of her favorite teacher, Mr. Griffith, would be spared.\nMarianne's religious clique, which now includes Rhiannon, begins harassing Olive in order to get her to leave school. After an ill-fated date with Anson, a boy who wants to pay her to actually sleep with him and not just pretend she did, Olive reconnects with Todd, her old crush, who is also the school's mascot. Todd then tells her that he does not believe the rumors because he remembers when she lied for him when he was not ready for his first kiss years ago. Olive then begins to ask everyone she lied for to help her out by telling the truth, but Brandon and Micah have abruptly left town and everyone else is enjoying their newfound popularity and do not want the truth to get out. Mrs. Griffith also refuses to tell the truth and when Olive threatens to expose her, Mrs. Griffith rebuffs her, saying no one would believe her.\nOlive, out of spite, then immediately tells Mr. Griffith, who believes her and separates from Mrs. Griffith. After a friendly talk with her eccentric, open-minded mother Rosemary, Olive comes up with a plan to get everything finally out in the open. She then does a song and dance number at a school pep rally to get people's attention to watch her via web cam, where she confesses what she has done (the web cam is the framing device of the film). The various boys whose reputations Olive helped improve are also shown watching. Later, Olive texts Rhiannon, apologizing for lying to her. When she is finishing up her web cast, Todd comes by riding a lawnmower and tells her to come outside. She signs off by saying she may lose her virginity to Todd, and proudly declares it's nobody's business (much to Marianne's disgrace). She goes outside to meet him, they kiss and the two are shown riding off on the lawnmower.\nQuestion: Who is Todd besides Olive's old crush?\nAnswer:",
+        "temperature": 0.0,
+        "num_completions": 1,
+        "top_k_per_token": 1,
+        "max_tokens": 100,
+        "stop_sequences": [
+          "\n"
+        ],
+        "echo_prompt": false,
+        "top_p": 1,
+        "presence_penalty": 0,
+        "frequency_penalty": 0
+      },
+      "result": {
+        "success": true,
+        "embedding": [],
+        "completions": [
+          {
+            "text": " Olive.",
+            "logprob": 0.0,
+            "tokens": [
+              {
+                "text": " Olive",
+                "logprob": 0.0
+              },
+              {
+                "text": ".",
+                "logprob": 0.0
+              }
+            ]
+          }
+        ],
+        "cached": false,
+        "request_time": 0.8313617706298828,
+        "request_datetime": 1763479296
+      },
+      "num_train_instances": 0,
+      "prompt_truncated": false,
+      "num_conditioning_tokens": 0
+    },
+    {
+      "instance": {
+        "input": {
+          "text": "Indefer Jones is the aged squire, between seventy and eighty years of age, of a large manor, Llanfeare, in Carmarthen, Wales. His niece, Isabel Brodrick, has lived with him for years after the remarriage of her father, and endeared herself to everyone. However, according to his strong traditional beliefs, the estate should be bequeathed to a male heir.\nHis sole male blood relative is his nephew Henry Jones, a London clerk. Henry has, in the past, incurred debts that the squire had paid off, been \"sent away from Oxford\", and generally made a poor impression on his occasional visits to Llanfeare. Nevertheless, Henry is told of his uncle's intention to make him the heir to the estate and is invited to pay a visit. Isabel rejects her uncle's suggestion that she solve his dilemma by marrying Henry, as she cannot stand her cousin. Indefer Jones finds his nephew to be just as detestable as ever. As a result, he overcomes his prejudice and changes his will one final time, in Isabel's favour. Unfortunately, he dies before he can tell anyone.\nFinding the document hidden in a book of sermons by accident, Henry vacillates between keeping silent and revealing its location. He is neither good enough to give up the estate nor evil enough to burn the document, fearing disgrace, a long jail sentence and, not least, eternal damnation. Instead, he comforts himself by reasoning that doing nothing cannot be a crime.\nIndefer Jones had had his last will witnessed by two of his tenants, but since the will cannot be found despite a thorough search of the house, Henry inherits the estate. However, already extant suspicions are only strengthened by his guilty manner. He endures abuse from everyone; his own servants either quit or treat him with disrespect. He takes to spending hours in the library, where the will is hidden.\nThe local newspaper begins to publish accounts of the affair that are insulting and seemingly libelous to Henry. It accuses him of destroying the will and usurping the estate from Isabel, whom everybody knows and respects. The old squire's lawyer, Mr Apjohn, himself suspecting that Henry knows more than he lets on, approaches the new squire about the articles, pressuring the unwilling young man into taking legal action against the editor. Henry finds that this only makes things worse. The prospect of being cross examined in the witness box fills him with dread. He realises the truth would be dragged out of him in court.\nMr Apjohn, by clever questioning, gets a good idea about where the will is. Henry knows that time is running out, but once again procrastinates. Mr Apjohn and Mr Brodrick, Isabel's father, visit Henry at home and find the document, despite Henry's ineffectual efforts to stop them. Because he did not destroy the will, Henry is permitted to return to his job in London with his reputation intact and \u00c2\u01414000, the amount Isabel was bequeathed in the other will.\nQuestion: How is Isabel Brodrick related to Indefer Jones?"
+        },
+        "references": [
+          {
+            "output": {
+              "text": "She is his neice."
+            },
+            "tags": [
+              "correct"
+            ]
+          },
+          {
+            "output": {
+              "text": "His niece."
+            },
+            "tags": [
+              "correct"
+            ]
+          }
+        ],
+        "split": "test",
+        "id": "id1332"
+      },
+      "train_trial_index": 0,
+      "request": {
+        "model_deployment": "huggingface/gpt2",
+        "model": "openai/gpt2",
+        "embedding": false,
+        "prompt": "Passage: Indefer Jones is the aged squire, between seventy and eighty years of age, of a large manor, Llanfeare, in Carmarthen, Wales. His niece, Isabel Brodrick, has lived with him for years after the remarriage of her father, and endeared herself to everyone. However, according to his strong traditional beliefs, the estate should be bequeathed to a male heir.\nHis sole male blood relative is his nephew Henry Jones, a London clerk. Henry has, in the past, incurred debts that the squire had paid off, been \"sent away from Oxford\", and generally made a poor impression on his occasional visits to Llanfeare. Nevertheless, Henry is told of his uncle's intention to make him the heir to the estate and is invited to pay a visit. Isabel rejects her uncle's suggestion that she solve his dilemma by marrying Henry, as she cannot stand her cousin. Indefer Jones finds his nephew to be just as detestable as ever. As a result, he overcomes his prejudice and changes his will one final time, in Isabel's favour. Unfortunately, he dies before he can tell anyone.\nFinding the document hidden in a book of sermons by accident, Henry vacillates between keeping silent and revealing its location. He is neither good enough to give up the estate nor evil enough to burn the document, fearing disgrace, a long jail sentence and, not least, eternal damnation. Instead, he comforts himself by reasoning that doing nothing cannot be a crime.\nIndefer Jones had had his last will witnessed by two of his tenants, but since the will cannot be found despite a thorough search of the house, Henry inherits the estate. However, already extant suspicions are only strengthened by his guilty manner. He endures abuse from everyone; his own servants either quit or treat him with disrespect. He takes to spending hours in the library, where the will is hidden.\nThe local newspaper begins to publish accounts of the affair that are insulting and seemingly libelous to Henry. It accuses him of destroying the will and usurping the estate from Isabel, whom everybody knows and respects. The old squire's lawyer, Mr Apjohn, himself suspecting that Henry knows more than he lets on, approaches the new squire about the articles, pressuring the unwilling young man into taking legal action against the editor. Henry finds that this only makes things worse. The prospect of being cross examined in the witness box fills him with dread. He realises the truth would be dragged out of him in court.\nMr Apjohn, by clever questioning, gets a good idea about where the will is. Henry knows that time is running out, but once again procrastinates. Mr Apjohn and Mr Brodrick, Isabel's father, visit Henry at home and find the document, despite Henry's ineffectual efforts to stop them. Because he did not destroy the will, Henry is permitted to return to his job in London with his reputation intact and \u00c2\u01414000, the amount Isabel was bequeathed in the other will.\nQuestion: How is Isabel Brodrick related to Indefer Jones?\nAnswer:",
+        "temperature": 0.0,
+        "num_completions": 1,
+        "top_k_per_token": 1,
+        "max_tokens": 100,
+        "stop_sequences": [
+          "\n"
+        ],
+        "echo_prompt": false,
+        "top_p": 1,
+        "presence_penalty": 0,
+        "frequency_penalty": 0
+      },
+      "result": {
+        "success": true,
+        "embedding": [],
+        "completions": [
+          {
+            "text": " Isabel is a very good and intelligent woman. She is a very good and intelligent woman. She is a very good and intelligent woman. She is a very good and intelligent woman. She is a very good and intelligent woman.",
+            "logprob": 0.0,
+            "tokens": [
+              {
+                "text": " Isabel",
+                "logprob": 0.0
+              },
+              {
+                "text": " is",
+                "logprob": 0.0
+              },
+              {
+                "text": " a",
+                "logprob": 0.0
+              },
+              {
+                "text": " very",
+                "logprob": 0.0
+              },
+              {
+                "text": " good",
+                "logprob": 0.0
+              },
+              {
+                "text": " and",
+                "logprob": 0.0
+              },
+              {
+                "text": " intelligent",
+                "logprob": 0.0
+              },
+              {
+                "text": " woman",
+                "logprob": 0.0
+              },
+              {
+                "text": ".",
+                "logprob": 0.0
+              },
+              {
+                "text": " She",
+                "logprob": 0.0
+              },
+              {
+                "text": " is",
+                "logprob": 0.0
+              },
+              {
+                "text": " a",
+                "logprob": 0.0
+              },
+              {
+                "text": " very",
+                "logprob": 0.0
+              },
+              {
+                "text": " good",
+                "logprob": 0.0
+              },
+              {
+                "text": " and",
+                "logprob": 0.0
+              },
+              {
+                "text": " intelligent",
+                "logprob": 0.0
+              },
+              {
+                "text": " woman",
+                "logprob": 0.0
+              },
+              {
+                "text": ".",
+                "logprob": 0.0
+              },
+              {
+                "text": " She",
+                "logprob": 0.0
+              },
+              {
+                "text": " is",
+                "logprob": 0.0
+              },
+              {
+                "text": " a",
+                "logprob": 0.0
+              },
+              {
+                "text": " very",
+                "logprob": 0.0
+              },
+              {
+                "text": " good",
+                "logprob": 0.0
+              },
+              {
+                "text": " and",
+                "logprob": 0.0
+              },
+              {
+                "text": " intelligent",
+                "logprob": 0.0
+              },
+              {
+                "text": " woman",
+                "logprob": 0.0
+              },
+              {
+                "text": ".",
+                "logprob": 0.0
+              },
+              {
+                "text": " She",
+                "logprob": 0.0
+              },
+              {
+                "text": " is",
+                "logprob": 0.0
+              },
+              {
+                "text": " a",
+                "logprob": 0.0
+              },
+              {
+                "text": " very",
+                "logprob": 0.0
+              },
+              {
+                "text": " good",
+                "logprob": 0.0
+              },
+              {
+                "text": " and",
+                "logprob": 0.0
+              },
+              {
+                "text": " intelligent",
+                "logprob": 0.0
+              },
+              {
+                "text": " woman",
+                "logprob": 0.0
+              },
+              {
+                "text": ".",
+                "logprob": 0.0
+              },
+              {
+                "text": " She",
+                "logprob": 0.0
+              },
+              {
+                "text": " is",
+                "logprob": 0.0
+              },
+              {
+                "text": " a",
+                "logprob": 0.0
+              },
+              {
+                "text": " very",
+                "logprob": 0.0
+              },
+              {
+                "text": " good",
+                "logprob": 0.0
+              },
+              {
+                "text": " and",
+                "logprob": 0.0
+              },
+              {
+                "text": " intelligent",
+                "logprob": 0.0
+              },
+              {
+                "text": " woman",
+                "logprob": 0.0
+              },
+              {
+                "text": ".",
+                "logprob": 0.0
+              }
+            ]
+          }
+        ],
+        "cached": false,
+        "request_time": 1.622730016708374,
+        "request_datetime": 1763479296
+      },
+      "num_train_instances": 0,
+      "prompt_truncated": false,
+      "num_conditioning_tokens": 0
+    },
+    {
+      "instance": {
+        "input": {
+          "text": "The subject of Cratylus is the correctness of names (\u03c0\u03b5\u03c1\u1f76 \u1f40\u03bd\u03bf\u03bc\u03ac\u03c4\u03c9\u03bd \u1f40\u03c1\u03b8\u03cc\u03c4\u03b7\u03c4\u03bf\u03c2), in other words, it is a critique on the subject of naming (Baxter).\nWhen discussing a \u1f44\u03bd\u03bf\u03bc\u03b1  (onoma ) and how it would relate to its subject, Socrates compares the original creation of a word to the work of an artist. An artist uses color to express the essence of his subject in a painting. In much the same way, the creator of words uses letters containing certain sounds to express the essence of a word's subject. There is a letter that is best for soft things, one for liquid things, and so on. He comments;\nthe best possible way to speak consists in using names all (or most) of which are like the things they name (that is, are appropriate to them), while the worst is to use the opposite kind of names.\nOne countering position, held by Hermogenes, is that names have come about due to custom and convention. They do not express the essence of their subject, so they can be swapped with something unrelated by the individuals or communities who use them.\nThe line between the two perspectives is often blurred. During more than half of the dialogue, Socrates makes guesses at Hermogenes' request as to where names and words have come from. These include the names of the Olympian gods, personified deities, and many words that describe abstract concepts. He examines whether, for example, giving names of \"streams\" to Cronus and Rhea (\u03a1\u03bf\u03ae \u2013 flow or space) are purely accidental.\nDon't you think he who gave to the ancestors of the other gods the names \u201cRhea\u201d and \u201cCronus\u201d had the same thought as Heracleitus? Do you think he gave both of them the names of streams (\u1fe5\u03b5\u03c5\u03bc\u03ac\u03c4\u03c9\u03bd \u1f40\u03bd\u03cc\u03bc\u03b1\u03c4\u03b1) merely by chance?\nThe Greek term \"\u1fe5\u03b5\u1fe6\u03bc\u03b1\" may refer to the flow of any medium and is not restricted to the flow of water or liquids. Many of the words which Socrates uses as examples may have come from an idea originally linked to the name, but have changed over time. Those of which he cannot find a link, he often assumes have come from foreign origins or have changed so much as to lose all resemblance to the original word. He states, \"names have been so twisted in all manner of ways, that I should not be surprised if the old language when compared with that now in use would appear to us to be a barbarous tongue.\"\nThe final theory of relations between name and object named is posited by Cratylus, a disciple of Heraclitus, who believes that names arrive from divine origins, making them necessarily correct. Socrates rebukes this theory by reminding Cratylus of the imperfection of certain names in capturing the objects they seek to signify. From this point, Socrates ultimately rejects the study of language, believing it to be philosophically inferior to a study of things themselves.\nQuestion: What does the old language sound compared with the new language?"
+        },
+        "references": [
+          {
+            "output": {
+              "text": "like a barbaric tongue."
+            },
+            "tags": [
+              "correct"
+            ]
+          },
+          {
+            "output": {
+              "text": "barbarous tongue"
+            },
+            "tags": [
+              "correct"
+            ]
+          }
+        ],
+        "split": "valid",
+        "id": "id1123"
+      },
+      "train_trial_index": 0,
+      "request": {
+        "model_deployment": "huggingface/gpt2",
+        "model": "openai/gpt2",
+        "embedding": false,
+        "prompt": "Passage: The subject of Cratylus is the correctness of names (\u03c0\u03b5\u03c1\u1f76 \u1f40\u03bd\u03bf\u03bc\u03ac\u03c4\u03c9\u03bd \u1f40\u03c1\u03b8\u03cc\u03c4\u03b7\u03c4\u03bf\u03c2), in other words, it is a critique on the subject of naming (Baxter).\nWhen discussing a \u1f44\u03bd\u03bf\u03bc\u03b1  (onoma ) and how it would relate to its subject, Socrates compares the original creation of a word to the work of an artist. An artist uses color to express the essence of his subject in a painting. In much the same way, the creator of words uses letters containing certain sounds to express the essence of a word's subject. There is a letter that is best for soft things, one for liquid things, and so on. He comments;\nthe best possible way to speak consists in using names all (or most) of which are like the things they name (that is, are appropriate to them), while the worst is to use the opposite kind of names.\nOne countering position, held by Hermogenes, is that names have come about due to custom and convention. They do not express the essence of their subject, so they can be swapped with something unrelated by the individuals or communities who use them.\nThe line between the two perspectives is often blurred. During more than half of the dialogue, Socrates makes guesses at Hermogenes' request as to where names and words have come from. These include the names of the Olympian gods, personified deities, and many words that describe abstract concepts. He examines whether, for example, giving names of \"streams\" to Cronus and Rhea (\u03a1\u03bf\u03ae \u2013 flow or space) are purely accidental.\nDon't you think he who gave to the ancestors of the other gods the names \u201cRhea\u201d and \u201cCronus\u201d had the same thought as Heracleitus? Do you think he gave both of them the names of streams (\u1fe5\u03b5\u03c5\u03bc\u03ac\u03c4\u03c9\u03bd \u1f40\u03bd\u03cc\u03bc\u03b1\u03c4\u03b1) merely by chance?\nThe Greek term \"\u1fe5\u03b5\u1fe6\u03bc\u03b1\" may refer to the flow of any medium and is not restricted to the flow of water or liquids. Many of the words which Socrates uses as examples may have come from an idea originally linked to the name, but have changed over time. Those of which he cannot find a link, he often assumes have come from foreign origins or have changed so much as to lose all resemblance to the original word. He states, \"names have been so twisted in all manner of ways, that I should not be surprised if the old language when compared with that now in use would appear to us to be a barbarous tongue.\"\nThe final theory of relations between name and object named is posited by Cratylus, a disciple of Heraclitus, who believes that names arrive from divine origins, making them necessarily correct. Socrates rebukes this theory by reminding Cratylus of the imperfection of certain names in capturing the objects they seek to signify. From this point, Socrates ultimately rejects the study of language, believing it to be philosophically inferior to a study of things themselves.\nQuestion: What does the old language sound compared with the new language?\nAnswer:",
+        "temperature": 0.0,
+        "num_completions": 1,
+        "top_k_per_token": 1,
+        "max_tokens": 100,
+        "stop_sequences": [
+          "\n"
+        ],
+        "echo_prompt": false,
+        "top_p": 1,
+        "presence_penalty": 0,
+        "frequency_penalty": 0
+      },
+      "result": {
+        "success": true,
+        "embedding": [],
+        "completions": [
+          {
+            "text": " The old language is a dialect of the Greek language, which is the language of the Greeks. The old language is a dialect of the Greek language, which is the language of the Greeks. The old language is a dialect of the Greek language, which is the language of the Greeks.",
+            "logprob": 0.0,
+            "tokens": [
+              {
+                "text": " The",
+                "logprob": 0.0
+              },
+              {
+                "text": " old",
+                "logprob": 0.0
+              },
+              {
+                "text": " language",
+                "logprob": 0.0
+              },
+              {
+                "text": " is",
+                "logprob": 0.0
+              },
+              {
+                "text": " a",
+                "logprob": 0.0
+              },
+              {
+                "text": " dialect",
+                "logprob": 0.0
+              },
+              {
+                "text": " of",
+                "logprob": 0.0
+              },
+              {
+                "text": " the",
+                "logprob": 0.0
+              },
+              {
+                "text": " Greek",
+                "logprob": 0.0
+              },
+              {
+                "text": " language",
+                "logprob": 0.0
+              },
+              {
+                "text": ",",
+                "logprob": 0.0
+              },
+              {
+                "text": " which",
+                "logprob": 0.0
+              },
+              {
+                "text": " is",
+                "logprob": 0.0
+              },
+              {
+                "text": " the",
+                "logprob": 0.0
+              },
+              {
+                "text": " language",
+                "logprob": 0.0
+              },
+              {
+                "text": " of",
+                "logprob": 0.0
+              },
+              {
+                "text": " the",
+                "logprob": 0.0
+              },
+              {
+                "text": " Greeks",
+                "logprob": 0.0
+              },
+              {
+                "text": ".",
+                "logprob": 0.0
+              },
+              {
+                "text": " The",
+                "logprob": 0.0
+              },
+              {
+                "text": " old",
+                "logprob": 0.0
+              },
+              {
+                "text": " language",
+                "logprob": 0.0
+              },
+              {
+                "text": " is",
+                "logprob": 0.0
+              },
+              {
+                "text": " a",
+                "logprob": 0.0
+              },
+              {
+                "text": " dialect",
+                "logprob": 0.0
+              },
+              {
+                "text": " of",
+                "logprob": 0.0
+              },
+              {
+                "text": " the",
+                "logprob": 0.0
+              },
+              {
+                "text": " Greek",
+                "logprob": 0.0
+              },
+              {
+                "text": " language",
+                "logprob": 0.0
+              },
+              {
+                "text": ",",
+                "logprob": 0.0
+              },
+              {
+                "text": " which",
+                "logprob": 0.0
+              },
+              {
+                "text": " is",
+                "logprob": 0.0
+              },
+              {
+                "text": " the",
+                "logprob": 0.0
+              },
+              {
+                "text": " language",
+                "logprob": 0.0
+              },
+              {
+                "text": " of",
+                "logprob": 0.0
+              },
+              {
+                "text": " the",
+                "logprob": 0.0
+              },
+              {
+                "text": " Greeks",
+                "logprob": 0.0
+              },
+              {
+                "text": ".",
+                "logprob": 0.0
+              },
+              {
+                "text": " The",
+                "logprob": 0.0
+              },
+              {
+                "text": " old",
+                "logprob": 0.0
+              },
+              {
+                "text": " language",
+                "logprob": 0.0
+              },
+              {
+                "text": " is",
+                "logprob": 0.0
+              },
+              {
+                "text": " a",
+                "logprob": 0.0
+              },
+              {
+                "text": " dialect",
+                "logprob": 0.0
+              },
+              {
+                "text": " of",
+                "logprob": 0.0
+              },
+              {
+                "text": " the",
+                "logprob": 0.0
+              },
+              {
+                "text": " Greek",
+                "logprob": 0.0
+              },
+              {
+                "text": " language",
+                "logprob": 0.0
+              },
+              {
+                "text": ",",
+                "logprob": 0.0
+              },
+              {
+                "text": " which",
+                "logprob": 0.0
+              },
+              {
+                "text": " is",
+                "logprob": 0.0
+              },
+              {
+                "text": " the",
+                "logprob": 0.0
+              },
+              {
+                "text": " language",
+                "logprob": 0.0
+              },
+              {
+                "text": " of",
+                "logprob": 0.0
+              },
+              {
+                "text": " the",
+                "logprob": 0.0
+              },
+              {
+                "text": " Greeks",
+                "logprob": 0.0
+              },
+              {
+                "text": ".",
+                "logprob": 0.0
+              }
+            ]
+          }
+        ],
+        "cached": false,
+        "request_time": 1.743454933166504,
+        "request_datetime": 1763479296
+      },
+      "num_train_instances": 0,
+      "prompt_truncated": false,
+      "num_conditioning_tokens": 0
+    },
+    {
+      "instance": {
+        "input": {
+          "text": "The novel begins in Manchester, where we are introduced to the Bartons and the Wilsons, two working-class families. John Barton is a questioner of the distribution of wealth and the relations between rich and poor. Soon his wife dies\u00e2\u0080\u0094he blames it on her grief over the disappearance of her sister Esther. Having already lost his son Tom at a young age, Barton is left to raise his daughter, Mary, alone and now falls into depression and begins to involve himself in the Chartist, trade-union movement.\nChapter 1 takes place in countryside where Moss Side is now.\nMary takes up work at a dressmaker's (her father having objected to her working in a factory) and becomes subject to the affections of hard-working Jem Wilson and Harry Carson, son of a wealthy mill owner. She fondly hopes, by marrying Carson, to secure a comfortable life for herself and her father, but immediately after refusing Jem's offer of marriage she realises that she truly loves him. She therefore decides to evade Carson, planning to show her feelings to Jem in the course of time. Jem believes her decision to be final, though this does not change his feelings for her.\nMeanwhile, Esther, a \"street-walker,\" returns to warn John Barton that he must save Mary from becoming like her. He simply pushes her away, however, and she's sent to jail for a month on the charge of vagrancy. Upon her release she talks to Jem with the same purpose. He promises that he will protect Mary and confronts Carson, eventually entering into a fight with him, which is witnessed by a policeman passing by.\nNot long afterwards, Carson is shot dead, and Jem is arrested for the crime, his gun having been found at the scene. Esther decides to investigate the matter further and discovers that the wadding for the gun was a piece of paper on which is written Mary's name.\nShe visits her niece to warn her to save the one she loves, and after she leaves Mary realises that the murderer is not Jem but her father. She is now faced with having to save her lover without giving away her father. With the help of Job Legh (the intelligent grandfather of her blind friend Margaret), Mary travels to Liverpool to find the only person who could provide an alibi for Jem \u00e2\u0080\u0093 Will Wilson, Jem's cousin and a sailor, who was with him on the night of the murder. Unfortunately, Will's ship is already departing, so that, after Mary chases after the ship in a small boat, the only thing Will can do is promise to return in the pilot ship and testify the next day.\nDuring the trial, Jem learns of Mary's great love for him. Will arrives in court to testify, and Jem is found \"not guilty\". Mary has fallen ill during the trial and is nursed by Mr Sturgis, an old sailor, and his wife. When she finally returns to Manchester she has to face her father, who is crushed by his remorse. He summons John Carson, Harry's father, to confess to him that he is the murderer. Carson is still set on justice, but after turning to the Bible he forgives Barton, who dies soon afterwards in Carson's arms. Not long after this Esther comes back to Mary's home, where she, too, soon dies.\nJem decides to leave England, where, his reputation damaged, it would be difficult for him to find a new job. The novel ends with the wedded Mary and Jem, their little child, and Mrs Wilson living happily in Canada. News comes that Margaret has regained her sight and that she and Will, soon to be married, will visit.\nQuestion: Who actually killed Harry Carson?"
+        },
+        "references": [
+          {
+            "output": {
+              "text": "John Barton"
+            },
+            "tags": [
+              "correct"
+            ]
+          },
+          {
+            "output": {
+              "text": "John Barton"
+            },
+            "tags": [
+              "correct"
+            ]
+          }
+        ],
+        "split": "test",
+        "id": "id1514"
+      },
+      "train_trial_index": 0,
+      "request": {
+        "model_deployment": "huggingface/gpt2",
+        "model": "openai/gpt2",
+        "embedding": false,
+        "prompt": "Passage: The novel begins in Manchester, where we are introduced to the Bartons and the Wilsons, two working-class families. John Barton is a questioner of the distribution of wealth and the relations between rich and poor. Soon his wife dies\u00e2\u0080\u0094he blames it on her grief over the disappearance of her sister Esther. Having already lost his son Tom at a young age, Barton is left to raise his daughter, Mary, alone and now falls into depression and begins to involve himself in the Chartist, trade-union movement.\nChapter 1 takes place in countryside where Moss Side is now.\nMary takes up work at a dressmaker's (her father having objected to her working in a factory) and becomes subject to the affections of hard-working Jem Wilson and Harry Carson, son of a wealthy mill owner. She fondly hopes, by marrying Carson, to secure a comfortable life for herself and her father, but immediately after refusing Jem's offer of marriage she realises that she truly loves him. She therefore decides to evade Carson, planning to show her feelings to Jem in the course of time. Jem believes her decision to be final, though this does not change his feelings for her.\nMeanwhile, Esther, a \"street-walker,\" returns to warn John Barton that he must save Mary from becoming like her. He simply pushes her away, however, and she's sent to jail for a month on the charge of vagrancy. Upon her release she talks to Jem with the same purpose. He promises that he will protect Mary and confronts Carson, eventually entering into a fight with him, which is witnessed by a policeman passing by.\nNot long afterwards, Carson is shot dead, and Jem is arrested for the crime, his gun having been found at the scene. Esther decides to investigate the matter further and discovers that the wadding for the gun was a piece of paper on which is written Mary's name.\nShe visits her niece to warn her to save the one she loves, and after she leaves Mary realises that the murderer is not Jem but her father. She is now faced with having to save her lover without giving away her father. With the help of Job Legh (the intelligent grandfather of her blind friend Margaret), Mary travels to Liverpool to find the only person who could provide an alibi for Jem \u00e2\u0080\u0093 Will Wilson, Jem's cousin and a sailor, who was with him on the night of the murder. Unfortunately, Will's ship is already departing, so that, after Mary chases after the ship in a small boat, the only thing Will can do is promise to return in the pilot ship and testify the next day.\nDuring the trial, Jem learns of Mary's great love for him. Will arrives in court to testify, and Jem is found \"not guilty\". Mary has fallen ill during the trial and is nursed by Mr Sturgis, an old sailor, and his wife. When she finally returns to Manchester she has to face her father, who is crushed by his remorse. He summons John Carson, Harry's father, to confess to him that he is the murderer. Carson is still set on justice, but after turning to the Bible he forgives Barton, who dies soon afterwards in Carson's arms. Not long after this Esther comes back to Mary's home, where she, too, soon dies.\nJem decides to leave England, where, his reputation damaged, it would be difficult for him to find a new job. The novel ends with the wedded Mary and Jem, their little child, and Mrs Wilson living happily in Canada. News comes that Margaret has regained her sight and that she and Will, soon to be married, will visit.\nQuestion: Who actually killed Harry Carson?\nAnswer:",
+        "temperature": 0.0,
+        "num_completions": 1,
+        "top_k_per_token": 1,
+        "max_tokens": 100,
+        "stop_sequences": [
+          "\n"
+        ],
+        "echo_prompt": false,
+        "top_p": 1,
+        "presence_penalty": 0,
+        "frequency_penalty": 0
+      },
+      "result": {
+        "success": true,
+        "embedding": [],
+        "completions": [
+          {
+            "text": " The Wilsons.",
+            "logprob": 0.0,
+            "tokens": [
+              {
+                "text": " The",
+                "logprob": 0.0
+              },
+              {
+                "text": " W",
+                "logprob": 0.0
+              },
+              {
+                "text": "ils",
+                "logprob": 0.0
+              },
+              {
+                "text": "ons",
+                "logprob": 0.0
+              },
+              {
+                "text": ".",
+                "logprob": 0.0
+              }
+            ]
+          }
+        ],
+        "cached": false,
+        "request_time": 0.8950591087341309,
+        "request_datetime": 1763479296
+      },
+      "num_train_instances": 0,
+      "prompt_truncated": false,
+      "num_conditioning_tokens": 0
+    },
+    {
+      "instance": {
+        "input": {
+          "text": "Tarzan returns to Opar, the source of the gold where a lost colony of fabled Atlantis is located, in order to make good on some financial reverses he has recently suffered. While Atlantis itself sank beneath the waves thousands of years ago, the workers of Opar continued to mine all of the gold, which means there is a rather huge stockpile but which is now lost to the memory of the Oparians and only Tarzan knows its secret location.\nA greedy, outlawed Belgian army officer, Albert Werper, in the employ of a criminal Arab, secretly follows Tarzan to Opar. There, Tarzan loses his memory after being struck on the head by a falling rock in the treasure room during an earthquake. On encountering La, the high priestess who is the servant of the Flaming God of Opar, and who is also very beautiful, Tarzan once again rejects her love which enrages her and she tries to have him killed; she had fallen in love with the apeman during their first encounter and La and her high priests are not going to allow Tarzan to escape their sacrificial knives this time.\nIn the meanwhile, Jane has been kidnapped by the Arab and wonders what is keeping her husband from once again coming to her rescue. A now amnesiac Tarzan and the Werper escape from Opar, bearing away the sacrificial knife of Opar which La and some retainers set out to recover. There is intrigue and counter intrigue the rest of the way.\nQuestion: Who is La?"
+        },
+        "references": [
+          {
+            "output": {
+              "text": "La is the high priestess that serves the Flaming God of Opar."
+            },
+            "tags": [
+              "correct"
+            ]
+          },
+          {
+            "output": {
+              "text": "A high priestess."
+            },
+            "tags": [
+              "correct"
+            ]
+          }
+        ],
+        "split": "test",
+        "id": "id1340"
+      },
+      "train_trial_index": 0,
+      "request": {
+        "model_deployment": "huggingface/gpt2",
+        "model": "openai/gpt2",
+        "embedding": false,
+        "prompt": "Passage: The Little White Bird is a series of short episodes, including both accounts of the narrator's day-to-day activities in contemporary London and fanciful tales set in Kensington Gardens and elsewhere.The story is set in several locations; the earlier chapters are set in the town of London, contemporaneous to the time of Barrie's writing, and involving some time travel of a few years, and other fantasy elements, while remaining within the London setting. The middle chapters that later became Peter Pan in Kensington Gardens are set in London's famous Kensington Gardens, introduced by the statement that \"All perambulators lead to Kensington Gardens\". The Kensington Gardens chapters include detailed descriptions of the features of the Gardens, along with fantasy names given to the locations by the story's characters, especially after \"Lock-Out Time\", described by Barrie as the time at the end of the day when the park gates are closed to the public, and the fairies and other magical inhabitants of the park can move about more freely than during the daylight, when they must hide from ordinary people. The third section of the book, following the Kensington Gardens chapters, are again set generally in London, though there are some short returns to the Gardens that are not part of the Peter Pan stories. In a two-page diversion in chapter 24, Barrie brings the story to Patagonia, and a journey by ship returning to England at the \"white cliffs of Albion\".\nQuestion: Where  was the ship in Pentagonia returned to?\nAnswer: England\n\nPassage: Tarzan returns to Opar, the source of the gold where a lost colony of fabled Atlantis is located, in order to make good on some financial reverses he has recently suffered. While Atlantis itself sank beneath the waves thousands of years ago, the workers of Opar continued to mine all of the gold, which means there is a rather huge stockpile but which is now lost to the memory of the Oparians and only Tarzan knows its secret location.\nA greedy, outlawed Belgian army officer, Albert Werper, in the employ of a criminal Arab, secretly follows Tarzan to Opar. There, Tarzan loses his memory after being struck on the head by a falling rock in the treasure room during an earthquake. On encountering La, the high priestess who is the servant of the Flaming God of Opar, and who is also very beautiful, Tarzan once again rejects her love which enrages her and she tries to have him killed; she had fallen in love with the apeman during their first encounter and La and her high priests are not going to allow Tarzan to escape their sacrificial knives this time.\nIn the meanwhile, Jane has been kidnapped by the Arab and wonders what is keeping her husband from once again coming to her rescue. A now amnesiac Tarzan and the Werper escape from Opar, bearing away the sacrificial knife of Opar which La and some retainers set out to recover. There is intrigue and counter intrigue the rest of the way.\nQuestion: Who is La?\nAnswer:",
+        "temperature": 0.0,
+        "num_completions": 1,
+        "top_k_per_token": 1,
+        "max_tokens": 100,
+        "stop_sequences": [
+          "\n"
+        ],
+        "echo_prompt": false,
+        "top_p": 1,
+        "presence_penalty": 0,
+        "frequency_penalty": 0
+      },
+      "result": {
+        "success": true,
+        "embedding": [],
+        "completions": [
+          {
+            "text": " La is a young woman who is a member of the royal family of Opar. She is a member of the royal family of Opar, and is a member of the royal family of Opar. She is a member of the royal family of Opar, and is a member of the royal family of Opar. She is a member of the royal family of Opar, and is a member of the royal family of Opar. She is a member of the royal family of Opar",
+            "logprob": 0.0,
+            "tokens": [
+              {
+                "text": " La",
+                "logprob": 0.0
+              },
+              {
+                "text": " is",
+                "logprob": 0.0
+              },
+              {
+                "text": " a",
+                "logprob": 0.0
+              },
+              {
+                "text": " young",
+                "logprob": 0.0
+              },
+              {
+                "text": " woman",
+                "logprob": 0.0
+              },
+              {
+                "text": " who",
+                "logprob": 0.0
+              },
+              {
+                "text": " is",
+                "logprob": 0.0
+              },
+              {
+                "text": " a",
+                "logprob": 0.0
+              },
+              {
+                "text": " member",
+                "logprob": 0.0
+              },
+              {
+                "text": " of",
+                "logprob": 0.0
+              },
+              {
+                "text": " the",
+                "logprob": 0.0
+              },
+              {
+                "text": " royal",
+                "logprob": 0.0
+              },
+              {
+                "text": " family",
+                "logprob": 0.0
+              },
+              {
+                "text": " of",
+                "logprob": 0.0
+              },
+              {
+                "text": " O",
+                "logprob": 0.0
+              },
+              {
+                "text": "par",
+                "logprob": 0.0
+              },
+              {
+                "text": ".",
+                "logprob": 0.0
+              },
+              {
+                "text": " She",
+                "logprob": 0.0
+              },
+              {
+                "text": " is",
+                "logprob": 0.0
+              },
+              {
+                "text": " a",
+                "logprob": 0.0
+              },
+              {
+                "text": " member",
+                "logprob": 0.0
+              },
+              {
+                "text": " of",
+                "logprob": 0.0
+              },
+              {
+                "text": " the",
+                "logprob": 0.0
+              },
+              {
+                "text": " royal",
+                "logprob": 0.0
+              },
+              {
+                "text": " family",
+                "logprob": 0.0
+              },
+              {
+                "text": " of",
+                "logprob": 0.0
+              },
+              {
+                "text": " O",
+                "logprob": 0.0
+              },
+              {
+                "text": "par",
+                "logprob": 0.0
+              },
+              {
+                "text": ",",
+                "logprob": 0.0
+              },
+              {
+                "text": " and",
+                "logprob": 0.0
+              },
+              {
+                "text": " is",
+                "logprob": 0.0
+              },
+              {
+                "text": " a",
+                "logprob": 0.0
+              },
+              {
+                "text": " member",
+                "logprob": 0.0
+              },
+              {
+                "text": " of",
+                "logprob": 0.0
+              },
+              {
+                "text": " the",
+                "logprob": 0.0
+              },
+              {
+                "text": " royal",
+                "logprob": 0.0
+              },
+              {
+                "text": " family",
+                "logprob": 0.0
+              },
+              {
+                "text": " of",
+                "logprob": 0.0
+              },
+              {
+                "text": " O",
+                "logprob": 0.0
+              },
+              {
+                "text": "par",
+                "logprob": 0.0
+              },
+              {
+                "text": ".",
+                "logprob": 0.0
+              },
+              {
+                "text": " She",
+                "logprob": 0.0
+              },
+              {
+                "text": " is",
+                "logprob": 0.0
+              },
+              {
+                "text": " a",
+                "logprob": 0.0
+              },
+              {
+                "text": " member",
+                "logprob": 0.0
+              },
+              {
+                "text": " of",
+                "logprob": 0.0
+              },
+              {
+                "text": " the",
+                "logprob": 0.0
+              },
+              {
+                "text": " royal",
+                "logprob": 0.0
+              },
+              {
+                "text": " family",
+                "logprob": 0.0
+              },
+              {
+                "text": " of",
+                "logprob": 0.0
+              },
+              {
+                "text": " O",
+                "logprob": 0.0
+              },
+              {
+                "text": "par",
+                "logprob": 0.0
+              },
+              {
+                "text": ",",
+                "logprob": 0.0
+              },
+              {
+                "text": " and",
+                "logprob": 0.0
+              },
+              {
+                "text": " is",
+                "logprob": 0.0
+              },
+              {
+                "text": " a",
+                "logprob": 0.0
+              },
+              {
+                "text": " member",
+                "logprob": 0.0
+              },
+              {
+                "text": " of",
+                "logprob": 0.0
+              },
+              {
+                "text": " the",
+                "logprob": 0.0
+              },
+              {
+                "text": " royal",
+                "logprob": 0.0
+              },
+              {
+                "text": " family",
+                "logprob": 0.0
+              },
+              {
+                "text": " of",
+                "logprob": 0.0
+              },
+              {
+                "text": " O",
+                "logprob": 0.0
+              },
+              {
+                "text": "par",
+                "logprob": 0.0
+              },
+              {
+                "text": ".",
+                "logprob": 0.0
+              },
+              {
+                "text": " She",
+                "logprob": 0.0
+              },
+              {
+                "text": " is",
+                "logprob": 0.0
+              },
+              {
+                "text": " a",
+                "logprob": 0.0
+              },
+              {
+                "text": " member",
+                "logprob": 0.0
+              },
+              {
+                "text": " of",
+                "logprob": 0.0
+              },
+              {
+                "text": " the",
+                "logprob": 0.0
+              },
+              {
+                "text": " royal",
+                "logprob": 0.0
+              },
+              {
+                "text": " family",
+                "logprob": 0.0
+              },
+              {
+                "text": " of",
+                "logprob": 0.0
+              },
+              {
+                "text": " O",
+                "logprob": 0.0
+              },
+              {
+                "text": "par",
+                "logprob": 0.0
+              },
+              {
+                "text": ",",
+                "logprob": 0.0
+              },
+              {
+                "text": " and",
+                "logprob": 0.0
+              },
+              {
+                "text": " is",
+                "logprob": 0.0
+              },
+              {
+                "text": " a",
+                "logprob": 0.0
+              },
+              {
+                "text": " member",
+                "logprob": 0.0
+              },
+              {
+                "text": " of",
+                "logprob": 0.0
+              },
+              {
+                "text": " the",
+                "logprob": 0.0
+              },
+              {
+                "text": " royal",
+                "logprob": 0.0
+              },
+              {
+                "text": " family",
+                "logprob": 0.0
+              },
+              {
+                "text": " of",
+                "logprob": 0.0
+              },
+              {
+                "text": " O",
+                "logprob": 0.0
+              },
+              {
+                "text": "par",
+                "logprob": 0.0
+              },
+              {
+                "text": ".",
+                "logprob": 0.0
+              },
+              {
+                "text": " She",
+                "logprob": 0.0
+              },
+              {
+                "text": " is",
+                "logprob": 0.0
+              },
+              {
+                "text": " a",
+                "logprob": 0.0
+              },
+              {
+                "text": " member",
+                "logprob": 0.0
+              },
+              {
+                "text": " of",
+                "logprob": 0.0
+              },
+              {
+                "text": " the",
+                "logprob": 0.0
+              },
+              {
+                "text": " royal",
+                "logprob": 0.0
+              },
+              {
+                "text": " family",
+                "logprob": 0.0
+              },
+              {
+                "text": " of",
+                "logprob": 0.0
+              },
+              {
+                "text": " O",
+                "logprob": 0.0
+              },
+              {
+                "text": "par",
+                "logprob": 0.0
+              }
+            ]
+          }
+        ],
+        "cached": false,
+        "request_time": 1.3460521697998047,
+        "request_datetime": 1763479297
+      },
+      "num_train_instances": 1,
+      "prompt_truncated": false,
+      "num_conditioning_tokens": 0
+    }
+  ]
+}
\ No newline at end of file
diff --git a/tests/data/helm/narrative_qa:model=openai_gpt2/stats.json b/tests/data/helm/narrative_qa:model=openai_gpt2/stats.json
new file mode 100644
index 0000000..d6d6444
--- /dev/null
+++ b/tests/data/helm/narrative_qa:model=openai_gpt2/stats.json
@@ -0,0 +1,2634 @@
+[
+  {
+    "name": {
+      "name": "num_references",
+      "split": "test"
+    },
+    "count": 1,
+    "sum": 2.0,
+    "sum_squared": 4.0,
+    "min": 2.0,
+    "max": 2.0,
+    "mean": 2.0,
+    "variance": 0.0,
+    "stddev": 0.0
+  },
+  {
+    "name": {
+      "name": "num_train_trials",
+      "split": "test"
+    },
+    "count": 1,
+    "sum": 1.0,
+    "sum_squared": 1.0,
+    "min": 1.0,
+    "max": 1.0,
+    "mean": 1.0,
+    "variance": 0.0,
+    "stddev": 0.0
+  },
+  {
+    "name": {
+      "name": "num_prompt_tokens",
+      "split": "test"
+    },
+    "count": 1,
+    "sum": 712.5,
+    "sum_squared": 507656.25,
+    "min": 712.5,
+    "max": 712.5,
+    "mean": 712.5,
+    "variance": 0.0,
+    "stddev": 0.0
+  },
+  {
+    "name": {
+      "name": "num_completion_tokens",
+      "split": "test"
+    },
+    "count": 1,
+    "sum": 38.0,
+    "sum_squared": 1444.0,
+    "min": 38.0,
+    "max": 38.0,
+    "mean": 38.0,
+    "variance": 0.0,
+    "stddev": 0.0
+  },
+  {
+    "name": {
+      "name": "num_output_tokens",
+      "split": "test"
+    },
+    "count": 1,
+    "sum": 38.0,
+    "sum_squared": 1444.0,
+    "min": 38.0,
+    "max": 38.0,
+    "mean": 38.0,
+    "variance": 0.0,
+    "stddev": 0.0
+  },
+  {
+    "name": {
+      "name": "training_co2_cost",
+      "split": "test"
+    },
+    "count": 0,
+    "sum": 0,
+    "sum_squared": 0
+  },
+  {
+    "name": {
+      "name": "training_energy_cost",
+      "split": "test"
+    },
+    "count": 0,
+    "sum": 0,
+    "sum_squared": 0
+  },
+  {
+    "name": {
+      "name": "inference_runtime",
+      "split": "test"
+    },
+    "count": 1,
+    "sum": 1.173800766468048,
+    "sum_squared": 1.3778082393609772,
+    "min": 1.173800766468048,
+    "max": 1.173800766468048,
+    "mean": 1.173800766468048,
+    "variance": 0.0,
+    "stddev": 0.0
+  },
+  {
+    "name": {
+      "name": "batch_size",
+      "split": "test"
+    },
+    "count": 1,
+    "sum": 1.0,
+    "sum_squared": 1.0,
+    "min": 1.0,
+    "max": 1.0,
+    "mean": 1.0,
+    "variance": 0.0,
+    "stddev": 0.0
+  },
+  {
+    "name": {
+      "name": "finish_reason_length",
+      "split": "test"
+    },
+    "count": 1,
+    "sum": 0.0,
+    "sum_squared": 0.0,
+    "min": 0.0,
+    "max": 0.0,
+    "mean": 0.0,
+    "variance": 0.0,
+    "stddev": 0.0
+  },
+  {
+    "name": {
+      "name": "finish_reason_stop",
+      "split": "test"
+    },
+    "count": 1,
+    "sum": 0.0,
+    "sum_squared": 0.0,
+    "min": 0.0,
+    "max": 0.0,
+    "mean": 0.0,
+    "variance": 0.0,
+    "stddev": 0.0
+  },
+  {
+    "name": {
+      "name": "finish_reason_endoftext",
+      "split": "test"
+    },
+    "count": 1,
+    "sum": 0.0,
+    "sum_squared": 0.0,
+    "min": 0.0,
+    "max": 0.0,
+    "mean": 0.0,
+    "variance": 0.0,
+    "stddev": 0.0
+  },
+  {
+    "name": {
+      "name": "finish_reason_unknown",
+      "split": "test"
+    },
+    "count": 1,
+    "sum": 1.0,
+    "sum_squared": 1.0,
+    "min": 1.0,
+    "max": 1.0,
+    "mean": 1.0,
+    "variance": 0.0,
+    "stddev": 0.0
+  },
+  {
+    "name": {
+      "name": "num_train_instances",
+      "split": "test"
+    },
+    "count": 1,
+    "sum": 0.25,
+    "sum_squared": 0.0625,
+    "min": 0.25,
+    "max": 0.25,
+    "mean": 0.25,
+    "variance": 0.0,
+    "stddev": 0.0
+  },
+  {
+    "name": {
+      "name": "prompt_truncated",
+      "split": "test"
+    },
+    "count": 1,
+    "sum": 0.0,
+    "sum_squared": 0.0,
+    "min": 0.0,
+    "max": 0.0,
+    "mean": 0.0,
+    "variance": 0.0,
+    "stddev": 0.0
+  },
+  {
+    "name": {
+      "name": "max_prob",
+      "split": "test"
+    },
+    "count": 1,
+    "sum": 1.0,
+    "sum_squared": 1.0,
+    "min": 1.0,
+    "max": 1.0,
+    "mean": 1.0,
+    "variance": 0.0,
+    "stddev": 0.0
+  },
+  {
+    "name": {
+      "name": "exact_match",
+      "split": "test"
+    },
+    "count": 1,
+    "sum": 0.0,
+    "sum_squared": 0.0,
+    "min": 0.0,
+    "max": 0.0,
+    "mean": 0.0,
+    "variance": 0.0,
+    "stddev": 0.0
+  },
+  {
+    "name": {
+      "name": "quasi_exact_match",
+      "split": "test"
+    },
+    "count": 1,
+    "sum": 0.0,
+    "sum_squared": 0.0,
+    "min": 0.0,
+    "max": 0.0,
+    "mean": 0.0,
+    "variance": 0.0,
+    "stddev": 0.0
+  },
+  {
+    "name": {
+      "name": "f1_score",
+      "split": "test"
+    },
+    "count": 1,
+    "sum": 0.17424242424242425,
+    "sum_squared": 0.030360422405876955,
+    "min": 0.17424242424242425,
+    "max": 0.17424242424242425,
+    "mean": 0.17424242424242425,
+    "variance": 0.0,
+    "stddev": 0.0
+  },
+  {
+    "name": {
+      "name": "rouge_l",
+      "split": "test"
+    },
+    "count": 1,
+    "sum": 0.05365510777881912,
+    "sum_squared": 0.002878870590756696,
+    "min": 0.05365510777881912,
+    "max": 0.05365510777881912,
+    "mean": 0.05365510777881912,
+    "variance": 0.0,
+    "stddev": 0.0
+  },
+  {
+    "name": {
+      "name": "bleu_1",
+      "split": "test"
+    },
+    "count": 1,
+    "sum": 0.03568840579710145,
+    "sum_squared": 0.001273662308338584,
+    "min": 0.03568840579710145,
+    "max": 0.03568840579710145,
+    "mean": 0.03568840579710145,
+    "variance": 0.0,
+    "stddev": 0.0
+  },
+  {
+    "name": {
+      "name": "bleu_4",
+      "split": "test"
+    },
+    "count": 1,
+    "sum": 1.1125369292536313e-308,
+    "sum_squared": 0.0,
+    "min": 1.1125369292536313e-308,
+    "max": 1.1125369292536313e-308,
+    "mean": 1.1125369292536313e-308,
+    "variance": 0.0,
+    "stddev": 0.0
+  },
+  {
+    "name": {
+      "name": "logprob",
+      "split": "test"
+    },
+    "count": 1,
+    "sum": 0.0,
+    "sum_squared": 0.0,
+    "min": 0.0,
+    "max": 0.0,
+    "mean": 0.0,
+    "variance": 0.0,
+    "stddev": 0.0
+  },
+  {
+    "name": {
+      "name": "num_perplexity_tokens",
+      "split": "test"
+    },
+    "count": 1,
+    "sum": 38.0,
+    "sum_squared": 1444.0,
+    "min": 38.0,
+    "max": 38.0,
+    "mean": 38.0,
+    "variance": 0.0,
+    "stddev": 0.0
+  },
+  {
+    "name": {
+      "name": "num_bytes",
+      "split": "test"
+    },
+    "count": 1,
+    "sum": 153.0,
+    "sum_squared": 23409.0,
+    "min": 153.0,
+    "max": 153.0,
+    "mean": 153.0,
+    "variance": 0.0,
+    "stddev": 0.0
+  },
+  {
+    "name": {
+      "name": "num_references",
+      "split": "valid"
+    },
+    "count": 1,
+    "sum": 2.0,
+    "sum_squared": 4.0,
+    "min": 2.0,
+    "max": 2.0,
+    "mean": 2.0,
+    "variance": 0.0,
+    "stddev": 0.0
+  },
+  {
+    "name": {
+      "name": "num_train_trials",
+      "split": "valid"
+    },
+    "count": 1,
+    "sum": 1.0,
+    "sum_squared": 1.0,
+    "min": 1.0,
+    "max": 1.0,
+    "mean": 1.0,
+    "variance": 0.0,
+    "stddev": 0.0
+  },
+  {
+    "name": {
+      "name": "num_prompt_tokens",
+      "split": "valid"
+    },
+    "count": 1,
+    "sum": 686.0,
+    "sum_squared": 470596.0,
+    "min": 686.0,
+    "max": 686.0,
+    "mean": 686.0,
+    "variance": 0.0,
+    "stddev": 0.0
+  },
+  {
+    "name": {
+      "name": "num_completion_tokens",
+      "split": "valid"
+    },
+    "count": 1,
+    "sum": 57.0,
+    "sum_squared": 3249.0,
+    "min": 57.0,
+    "max": 57.0,
+    "mean": 57.0,
+    "variance": 0.0,
+    "stddev": 0.0
+  },
+  {
+    "name": {
+      "name": "num_output_tokens",
+      "split": "valid"
+    },
+    "count": 1,
+    "sum": 57.0,
+    "sum_squared": 3249.0,
+    "min": 57.0,
+    "max": 57.0,
+    "mean": 57.0,
+    "variance": 0.0,
+    "stddev": 0.0
+  },
+  {
+    "name": {
+      "name": "training_co2_cost",
+      "split": "valid"
+    },
+    "count": 0,
+    "sum": 0,
+    "sum_squared": 0
+  },
+  {
+    "name": {
+      "name": "training_energy_cost",
+      "split": "valid"
+    },
+    "count": 0,
+    "sum": 0,
+    "sum_squared": 0
+  },
+  {
+    "name": {
+      "name": "inference_runtime",
+      "split": "valid"
+    },
+    "count": 1,
+    "sum": 1.743454933166504,
+    "sum_squared": 3.0396351039826186,
+    "min": 1.743454933166504,
+    "max": 1.743454933166504,
+    "mean": 1.743454933166504,
+    "variance": 0.0,
+    "stddev": 0.0
+  },
+  {
+    "name": {
+      "name": "batch_size",
+      "split": "valid"
+    },
+    "count": 1,
+    "sum": 1.0,
+    "sum_squared": 1.0,
+    "min": 1.0,
+    "max": 1.0,
+    "mean": 1.0,
+    "variance": 0.0,
+    "stddev": 0.0
+  },
+  {
+    "name": {
+      "name": "finish_reason_length",
+      "split": "valid"
+    },
+    "count": 1,
+    "sum": 0.0,
+    "sum_squared": 0.0,
+    "min": 0.0,
+    "max": 0.0,
+    "mean": 0.0,
+    "variance": 0.0,
+    "stddev": 0.0
+  },
+  {
+    "name": {
+      "name": "finish_reason_stop",
+      "split": "valid"
+    },
+    "count": 1,
+    "sum": 0.0,
+    "sum_squared": 0.0,
+    "min": 0.0,
+    "max": 0.0,
+    "mean": 0.0,
+    "variance": 0.0,
+    "stddev": 0.0
+  },
+  {
+    "name": {
+      "name": "finish_reason_endoftext",
+      "split": "valid"
+    },
+    "count": 1,
+    "sum": 0.0,
+    "sum_squared": 0.0,
+    "min": 0.0,
+    "max": 0.0,
+    "mean": 0.0,
+    "variance": 0.0,
+    "stddev": 0.0
+  },
+  {
+    "name": {
+      "name": "finish_reason_unknown",
+      "split": "valid"
+    },
+    "count": 1,
+    "sum": 1.0,
+    "sum_squared": 1.0,
+    "min": 1.0,
+    "max": 1.0,
+    "mean": 1.0,
+    "variance": 0.0,
+    "stddev": 0.0
+  },
+  {
+    "name": {
+      "name": "num_train_instances",
+      "split": "valid"
+    },
+    "count": 1,
+    "sum": 0.0,
+    "sum_squared": 0.0,
+    "min": 0.0,
+    "max": 0.0,
+    "mean": 0.0,
+    "variance": 0.0,
+    "stddev": 0.0
+  },
+  {
+    "name": {
+      "name": "prompt_truncated",
+      "split": "valid"
+    },
+    "count": 1,
+    "sum": 0.0,
+    "sum_squared": 0.0,
+    "min": 0.0,
+    "max": 0.0,
+    "mean": 0.0,
+    "variance": 0.0,
+    "stddev": 0.0
+  },
+  {
+    "name": {
+      "name": "max_prob",
+      "split": "valid"
+    },
+    "count": 1,
+    "sum": 1.0,
+    "sum_squared": 1.0,
+    "min": 1.0,
+    "max": 1.0,
+    "mean": 1.0,
+    "variance": 0.0,
+    "stddev": 0.0
+  },
+  {
+    "name": {
+      "name": "exact_match",
+      "split": "valid"
+    },
+    "count": 1,
+    "sum": 0.0,
+    "sum_squared": 0.0,
+    "min": 0.0,
+    "max": 0.0,
+    "mean": 0.0,
+    "variance": 0.0,
+    "stddev": 0.0
+  },
+  {
+    "name": {
+      "name": "quasi_exact_match",
+      "split": "valid"
+    },
+    "count": 1,
+    "sum": 0.0,
+    "sum_squared": 0.0,
+    "min": 0.0,
+    "max": 0.0,
+    "mean": 0.0,
+    "variance": 0.0,
+    "stddev": 0.0
+  },
+  {
+    "name": {
+      "name": "f1_score",
+      "split": "valid"
+    },
+    "count": 1,
+    "sum": 0.0,
+    "sum_squared": 0.0,
+    "min": 0.0,
+    "max": 0.0,
+    "mean": 0.0,
+    "variance": 0.0,
+    "stddev": 0.0
+  },
+  {
+    "name": {
+      "name": "rouge_l",
+      "split": "valid"
+    },
+    "count": 1,
+    "sum": 0.03636363636363636,
+    "sum_squared": 0.0013223140495867767,
+    "min": 0.03636363636363636,
+    "max": 0.03636363636363636,
+    "mean": 0.03636363636363636,
+    "variance": 0.0,
+    "stddev": 0.0
+  },
+  {
+    "name": {
+      "name": "bleu_1",
+      "split": "valid"
+    },
+    "count": 1,
+    "sum": 0.03508771929824561,
+    "sum_squared": 0.0012311480455524776,
+    "min": 0.03508771929824561,
+    "max": 0.03508771929824561,
+    "mean": 0.03508771929824561,
+    "variance": 0.0,
+    "stddev": 0.0
+  },
+  {
+    "name": {
+      "name": "bleu_4",
+      "split": "valid"
+    },
+    "count": 1,
+    "sum": 2.2250738585072626e-308,
+    "sum_squared": 0.0,
+    "min": 2.2250738585072626e-308,
+    "max": 2.2250738585072626e-308,
+    "mean": 2.2250738585072626e-308,
+    "variance": 0.0,
+    "stddev": 0.0
+  },
+  {
+    "name": {
+      "name": "logprob",
+      "split": "valid"
+    },
+    "count": 1,
+    "sum": 0.0,
+    "sum_squared": 0.0,
+    "min": 0.0,
+    "max": 0.0,
+    "mean": 0.0,
+    "variance": 0.0,
+    "stddev": 0.0
+  },
+  {
+    "name": {
+      "name": "num_perplexity_tokens",
+      "split": "valid"
+    },
+    "count": 1,
+    "sum": 57.0,
+    "sum_squared": 3249.0,
+    "min": 57.0,
+    "max": 57.0,
+    "mean": 57.0,
+    "variance": 0.0,
+    "stddev": 0.0
+  },
+  {
+    "name": {
+      "name": "num_bytes",
+      "split": "valid"
+    },
+    "count": 1,
+    "sum": 270.0,
+    "sum_squared": 72900.0,
+    "min": 270.0,
+    "max": 270.0,
+    "mean": 270.0,
+    "variance": 0.0,
+    "stddev": 0.0
+  },
+  {
+    "name": {
+      "name": "perplexity",
+      "split": "test"
+    },
+    "count": 1,
+    "sum": 1.0,
+    "sum_squared": 1.0,
+    "min": 1.0,
+    "max": 1.0,
+    "mean": 1.0,
+    "variance": 0.0,
+    "stddev": 0.0
+  },
+  {
+    "name": {
+      "name": "bits_per_byte",
+      "split": "test"
+    },
+    "count": 1,
+    "sum": 0.0,
+    "sum_squared": 0.0,
+    "min": 0.0,
+    "max": 0.0,
+    "mean": 0.0,
+    "variance": 0.0,
+    "stddev": 0.0
+  },
+  {
+    "name": {
+      "name": "logprob_per_byte",
+      "split": "test"
+    },
+    "count": 1,
+    "sum": 0.0,
+    "sum_squared": 0.0,
+    "min": 0.0,
+    "max": 0.0,
+    "mean": 0.0,
+    "variance": 0.0,
+    "stddev": 0.0
+  },
+  {
+    "name": {
+      "name": "perplexity",
+      "split": "valid"
+    },
+    "count": 1,
+    "sum": 1.0,
+    "sum_squared": 1.0,
+    "min": 1.0,
+    "max": 1.0,
+    "mean": 1.0,
+    "variance": 0.0,
+    "stddev": 0.0
+  },
+  {
+    "name": {
+      "name": "bits_per_byte",
+      "split": "valid"
+    },
+    "count": 1,
+    "sum": 0.0,
+    "sum_squared": 0.0,
+    "min": 0.0,
+    "max": 0.0,
+    "mean": 0.0,
+    "variance": 0.0,
+    "stddev": 0.0
+  },
+  {
+    "name": {
+      "name": "logprob_per_byte",
+      "split": "valid"
+    },
+    "count": 1,
+    "sum": 0.0,
+    "sum_squared": 0.0,
+    "min": 0.0,
+    "max": 0.0,
+    "mean": 0.0,
+    "variance": 0.0,
+    "stddev": 0.0
+  },
+  {
+    "name": {
+      "name": "num_references",
+      "split": "test",
+      "perturbation": {
+        "name": "robustness",
+        "robustness": true,
+        "fairness": false,
+        "computed_on": "worst"
+      }
+    },
+    "count": 1,
+    "sum": 2.0,
+    "sum_squared": 4.0,
+    "min": 2.0,
+    "max": 2.0,
+    "mean": 2.0,
+    "variance": 0.0,
+    "stddev": 0.0
+  },
+  {
+    "name": {
+      "name": "num_references",
+      "split": "test",
+      "perturbation": {
+        "name": "fairness",
+        "robustness": false,
+        "fairness": true,
+        "computed_on": "worst"
+      }
+    },
+    "count": 1,
+    "sum": 2.0,
+    "sum_squared": 4.0,
+    "min": 2.0,
+    "max": 2.0,
+    "mean": 2.0,
+    "variance": 0.0,
+    "stddev": 0.0
+  },
+  {
+    "name": {
+      "name": "num_train_trials",
+      "split": "test",
+      "perturbation": {
+        "name": "robustness",
+        "robustness": true,
+        "fairness": false,
+        "computed_on": "worst"
+      }
+    },
+    "count": 1,
+    "sum": 1.0,
+    "sum_squared": 1.0,
+    "min": 1.0,
+    "max": 1.0,
+    "mean": 1.0,
+    "variance": 0.0,
+    "stddev": 0.0
+  },
+  {
+    "name": {
+      "name": "num_train_trials",
+      "split": "test",
+      "perturbation": {
+        "name": "fairness",
+        "robustness": false,
+        "fairness": true,
+        "computed_on": "worst"
+      }
+    },
+    "count": 1,
+    "sum": 1.0,
+    "sum_squared": 1.0,
+    "min": 1.0,
+    "max": 1.0,
+    "mean": 1.0,
+    "variance": 0.0,
+    "stddev": 0.0
+  },
+  {
+    "name": {
+      "name": "num_prompt_tokens",
+      "split": "test",
+      "perturbation": {
+        "name": "robustness",
+        "robustness": true,
+        "fairness": false,
+        "computed_on": "worst"
+      }
+    },
+    "count": 1,
+    "sum": 712.5,
+    "sum_squared": 507656.25,
+    "min": 712.5,
+    "max": 712.5,
+    "mean": 712.5,
+    "variance": 0.0,
+    "stddev": 0.0
+  },
+  {
+    "name": {
+      "name": "num_prompt_tokens",
+      "split": "test",
+      "perturbation": {
+        "name": "fairness",
+        "robustness": false,
+        "fairness": true,
+        "computed_on": "worst"
+      }
+    },
+    "count": 1,
+    "sum": 712.5,
+    "sum_squared": 507656.25,
+    "min": 712.5,
+    "max": 712.5,
+    "mean": 712.5,
+    "variance": 0.0,
+    "stddev": 0.0
+  },
+  {
+    "name": {
+      "name": "num_completion_tokens",
+      "split": "test",
+      "perturbation": {
+        "name": "robustness",
+        "robustness": true,
+        "fairness": false,
+        "computed_on": "worst"
+      }
+    },
+    "count": 1,
+    "sum": 38.0,
+    "sum_squared": 1444.0,
+    "min": 38.0,
+    "max": 38.0,
+    "mean": 38.0,
+    "variance": 0.0,
+    "stddev": 0.0
+  },
+  {
+    "name": {
+      "name": "num_completion_tokens",
+      "split": "test",
+      "perturbation": {
+        "name": "fairness",
+        "robustness": false,
+        "fairness": true,
+        "computed_on": "worst"
+      }
+    },
+    "count": 1,
+    "sum": 38.0,
+    "sum_squared": 1444.0,
+    "min": 38.0,
+    "max": 38.0,
+    "mean": 38.0,
+    "variance": 0.0,
+    "stddev": 0.0
+  },
+  {
+    "name": {
+      "name": "num_output_tokens",
+      "split": "test",
+      "perturbation": {
+        "name": "robustness",
+        "robustness": true,
+        "fairness": false,
+        "computed_on": "worst"
+      }
+    },
+    "count": 1,
+    "sum": 38.0,
+    "sum_squared": 1444.0,
+    "min": 38.0,
+    "max": 38.0,
+    "mean": 38.0,
+    "variance": 0.0,
+    "stddev": 0.0
+  },
+  {
+    "name": {
+      "name": "num_output_tokens",
+      "split": "test",
+      "perturbation": {
+        "name": "fairness",
+        "robustness": false,
+        "fairness": true,
+        "computed_on": "worst"
+      }
+    },
+    "count": 1,
+    "sum": 38.0,
+    "sum_squared": 1444.0,
+    "min": 38.0,
+    "max": 38.0,
+    "mean": 38.0,
+    "variance": 0.0,
+    "stddev": 0.0
+  },
+  {
+    "name": {
+      "name": "inference_runtime",
+      "split": "test",
+      "perturbation": {
+        "name": "robustness",
+        "robustness": true,
+        "fairness": false,
+        "computed_on": "worst"
+      }
+    },
+    "count": 1,
+    "sum": 1.173800766468048,
+    "sum_squared": 1.3778082393609772,
+    "min": 1.173800766468048,
+    "max": 1.173800766468048,
+    "mean": 1.173800766468048,
+    "variance": 0.0,
+    "stddev": 0.0
+  },
+  {
+    "name": {
+      "name": "inference_runtime",
+      "split": "test",
+      "perturbation": {
+        "name": "fairness",
+        "robustness": false,
+        "fairness": true,
+        "computed_on": "worst"
+      }
+    },
+    "count": 1,
+    "sum": 1.173800766468048,
+    "sum_squared": 1.3778082393609772,
+    "min": 1.173800766468048,
+    "max": 1.173800766468048,
+    "mean": 1.173800766468048,
+    "variance": 0.0,
+    "stddev": 0.0
+  },
+  {
+    "name": {
+      "name": "batch_size",
+      "split": "test",
+      "perturbation": {
+        "name": "robustness",
+        "robustness": true,
+        "fairness": false,
+        "computed_on": "worst"
+      }
+    },
+    "count": 1,
+    "sum": 1.0,
+    "sum_squared": 1.0,
+    "min": 1.0,
+    "max": 1.0,
+    "mean": 1.0,
+    "variance": 0.0,
+    "stddev": 0.0
+  },
+  {
+    "name": {
+      "name": "batch_size",
+      "split": "test",
+      "perturbation": {
+        "name": "fairness",
+        "robustness": false,
+        "fairness": true,
+        "computed_on": "worst"
+      }
+    },
+    "count": 1,
+    "sum": 1.0,
+    "sum_squared": 1.0,
+    "min": 1.0,
+    "max": 1.0,
+    "mean": 1.0,
+    "variance": 0.0,
+    "stddev": 0.0
+  },
+  {
+    "name": {
+      "name": "finish_reason_length",
+      "split": "test",
+      "perturbation": {
+        "name": "robustness",
+        "robustness": true,
+        "fairness": false,
+        "computed_on": "worst"
+      }
+    },
+    "count": 1,
+    "sum": 0.0,
+    "sum_squared": 0.0,
+    "min": 0.0,
+    "max": 0.0,
+    "mean": 0.0,
+    "variance": 0.0,
+    "stddev": 0.0
+  },
+  {
+    "name": {
+      "name": "finish_reason_length",
+      "split": "test",
+      "perturbation": {
+        "name": "fairness",
+        "robustness": false,
+        "fairness": true,
+        "computed_on": "worst"
+      }
+    },
+    "count": 1,
+    "sum": 0.0,
+    "sum_squared": 0.0,
+    "min": 0.0,
+    "max": 0.0,
+    "mean": 0.0,
+    "variance": 0.0,
+    "stddev": 0.0
+  },
+  {
+    "name": {
+      "name": "finish_reason_stop",
+      "split": "test",
+      "perturbation": {
+        "name": "robustness",
+        "robustness": true,
+        "fairness": false,
+        "computed_on": "worst"
+      }
+    },
+    "count": 1,
+    "sum": 0.0,
+    "sum_squared": 0.0,
+    "min": 0.0,
+    "max": 0.0,
+    "mean": 0.0,
+    "variance": 0.0,
+    "stddev": 0.0
+  },
+  {
+    "name": {
+      "name": "finish_reason_stop",
+      "split": "test",
+      "perturbation": {
+        "name": "fairness",
+        "robustness": false,
+        "fairness": true,
+        "computed_on": "worst"
+      }
+    },
+    "count": 1,
+    "sum": 0.0,
+    "sum_squared": 0.0,
+    "min": 0.0,
+    "max": 0.0,
+    "mean": 0.0,
+    "variance": 0.0,
+    "stddev": 0.0
+  },
+  {
+    "name": {
+      "name": "finish_reason_endoftext",
+      "split": "test",
+      "perturbation": {
+        "name": "robustness",
+        "robustness": true,
+        "fairness": false,
+        "computed_on": "worst"
+      }
+    },
+    "count": 1,
+    "sum": 0.0,
+    "sum_squared": 0.0,
+    "min": 0.0,
+    "max": 0.0,
+    "mean": 0.0,
+    "variance": 0.0,
+    "stddev": 0.0
+  },
+  {
+    "name": {
+      "name": "finish_reason_endoftext",
+      "split": "test",
+      "perturbation": {
+        "name": "fairness",
+        "robustness": false,
+        "fairness": true,
+        "computed_on": "worst"
+      }
+    },
+    "count": 1,
+    "sum": 0.0,
+    "sum_squared": 0.0,
+    "min": 0.0,
+    "max": 0.0,
+    "mean": 0.0,
+    "variance": 0.0,
+    "stddev": 0.0
+  },
+  {
+    "name": {
+      "name": "finish_reason_unknown",
+      "split": "test",
+      "perturbation": {
+        "name": "robustness",
+        "robustness": true,
+        "fairness": false,
+        "computed_on": "worst"
+      }
+    },
+    "count": 1,
+    "sum": 1.0,
+    "sum_squared": 1.0,
+    "min": 1.0,
+    "max": 1.0,
+    "mean": 1.0,
+    "variance": 0.0,
+    "stddev": 0.0
+  },
+  {
+    "name": {
+      "name": "finish_reason_unknown",
+      "split": "test",
+      "perturbation": {
+        "name": "fairness",
+        "robustness": false,
+        "fairness": true,
+        "computed_on": "worst"
+      }
+    },
+    "count": 1,
+    "sum": 1.0,
+    "sum_squared": 1.0,
+    "min": 1.0,
+    "max": 1.0,
+    "mean": 1.0,
+    "variance": 0.0,
+    "stddev": 0.0
+  },
+  {
+    "name": {
+      "name": "num_train_instances",
+      "split": "test",
+      "perturbation": {
+        "name": "robustness",
+        "robustness": true,
+        "fairness": false,
+        "computed_on": "worst"
+      }
+    },
+    "count": 1,
+    "sum": 0.25,
+    "sum_squared": 0.0625,
+    "min": 0.25,
+    "max": 0.25,
+    "mean": 0.25,
+    "variance": 0.0,
+    "stddev": 0.0
+  },
+  {
+    "name": {
+      "name": "num_train_instances",
+      "split": "test",
+      "perturbation": {
+        "name": "fairness",
+        "robustness": false,
+        "fairness": true,
+        "computed_on": "worst"
+      }
+    },
+    "count": 1,
+    "sum": 0.25,
+    "sum_squared": 0.0625,
+    "min": 0.25,
+    "max": 0.25,
+    "mean": 0.25,
+    "variance": 0.0,
+    "stddev": 0.0
+  },
+  {
+    "name": {
+      "name": "prompt_truncated",
+      "split": "test",
+      "perturbation": {
+        "name": "robustness",
+        "robustness": true,
+        "fairness": false,
+        "computed_on": "worst"
+      }
+    },
+    "count": 1,
+    "sum": 0.0,
+    "sum_squared": 0.0,
+    "min": 0.0,
+    "max": 0.0,
+    "mean": 0.0,
+    "variance": 0.0,
+    "stddev": 0.0
+  },
+  {
+    "name": {
+      "name": "prompt_truncated",
+      "split": "test",
+      "perturbation": {
+        "name": "fairness",
+        "robustness": false,
+        "fairness": true,
+        "computed_on": "worst"
+      }
+    },
+    "count": 1,
+    "sum": 0.0,
+    "sum_squared": 0.0,
+    "min": 0.0,
+    "max": 0.0,
+    "mean": 0.0,
+    "variance": 0.0,
+    "stddev": 0.0
+  },
+  {
+    "name": {
+      "name": "max_prob",
+      "split": "test",
+      "perturbation": {
+        "name": "robustness",
+        "robustness": true,
+        "fairness": false,
+        "computed_on": "worst"
+      }
+    },
+    "count": 1,
+    "sum": 1.0,
+    "sum_squared": 1.0,
+    "min": 1.0,
+    "max": 1.0,
+    "mean": 1.0,
+    "variance": 0.0,
+    "stddev": 0.0
+  },
+  {
+    "name": {
+      "name": "max_prob",
+      "split": "test",
+      "perturbation": {
+        "name": "fairness",
+        "robustness": false,
+        "fairness": true,
+        "computed_on": "worst"
+      }
+    },
+    "count": 1,
+    "sum": 1.0,
+    "sum_squared": 1.0,
+    "min": 1.0,
+    "max": 1.0,
+    "mean": 1.0,
+    "variance": 0.0,
+    "stddev": 0.0
+  },
+  {
+    "name": {
+      "name": "exact_match",
+      "split": "test",
+      "perturbation": {
+        "name": "robustness",
+        "robustness": true,
+        "fairness": false,
+        "computed_on": "worst"
+      }
+    },
+    "count": 1,
+    "sum": 0.0,
+    "sum_squared": 0.0,
+    "min": 0.0,
+    "max": 0.0,
+    "mean": 0.0,
+    "variance": 0.0,
+    "stddev": 0.0
+  },
+  {
+    "name": {
+      "name": "exact_match",
+      "split": "test",
+      "perturbation": {
+        "name": "fairness",
+        "robustness": false,
+        "fairness": true,
+        "computed_on": "worst"
+      }
+    },
+    "count": 1,
+    "sum": 0.0,
+    "sum_squared": 0.0,
+    "min": 0.0,
+    "max": 0.0,
+    "mean": 0.0,
+    "variance": 0.0,
+    "stddev": 0.0
+  },
+  {
+    "name": {
+      "name": "quasi_exact_match",
+      "split": "test",
+      "perturbation": {
+        "name": "robustness",
+        "robustness": true,
+        "fairness": false,
+        "computed_on": "worst"
+      }
+    },
+    "count": 1,
+    "sum": 0.0,
+    "sum_squared": 0.0,
+    "min": 0.0,
+    "max": 0.0,
+    "mean": 0.0,
+    "variance": 0.0,
+    "stddev": 0.0
+  },
+  {
+    "name": {
+      "name": "quasi_exact_match",
+      "split": "test",
+      "perturbation": {
+        "name": "fairness",
+        "robustness": false,
+        "fairness": true,
+        "computed_on": "worst"
+      }
+    },
+    "count": 1,
+    "sum": 0.0,
+    "sum_squared": 0.0,
+    "min": 0.0,
+    "max": 0.0,
+    "mean": 0.0,
+    "variance": 0.0,
+    "stddev": 0.0
+  },
+  {
+    "name": {
+      "name": "f1_score",
+      "split": "test",
+      "perturbation": {
+        "name": "robustness",
+        "robustness": true,
+        "fairness": false,
+        "computed_on": "worst"
+      }
+    },
+    "count": 1,
+    "sum": 0.17424242424242425,
+    "sum_squared": 0.030360422405876955,
+    "min": 0.17424242424242425,
+    "max": 0.17424242424242425,
+    "mean": 0.17424242424242425,
+    "variance": 0.0,
+    "stddev": 0.0
+  },
+  {
+    "name": {
+      "name": "f1_score",
+      "split": "test",
+      "perturbation": {
+        "name": "fairness",
+        "robustness": false,
+        "fairness": true,
+        "computed_on": "worst"
+      }
+    },
+    "count": 1,
+    "sum": 0.17424242424242425,
+    "sum_squared": 0.030360422405876955,
+    "min": 0.17424242424242425,
+    "max": 0.17424242424242425,
+    "mean": 0.17424242424242425,
+    "variance": 0.0,
+    "stddev": 0.0
+  },
+  {
+    "name": {
+      "name": "rouge_l",
+      "split": "test",
+      "perturbation": {
+        "name": "robustness",
+        "robustness": true,
+        "fairness": false,
+        "computed_on": "worst"
+      }
+    },
+    "count": 1,
+    "sum": 0.05365510777881912,
+    "sum_squared": 0.002878870590756696,
+    "min": 0.05365510777881912,
+    "max": 0.05365510777881912,
+    "mean": 0.05365510777881912,
+    "variance": 0.0,
+    "stddev": 0.0
+  },
+  {
+    "name": {
+      "name": "rouge_l",
+      "split": "test",
+      "perturbation": {
+        "name": "fairness",
+        "robustness": false,
+        "fairness": true,
+        "computed_on": "worst"
+      }
+    },
+    "count": 1,
+    "sum": 0.05365510777881912,
+    "sum_squared": 0.002878870590756696,
+    "min": 0.05365510777881912,
+    "max": 0.05365510777881912,
+    "mean": 0.05365510777881912,
+    "variance": 0.0,
+    "stddev": 0.0
+  },
+  {
+    "name": {
+      "name": "bleu_1",
+      "split": "test",
+      "perturbation": {
+        "name": "robustness",
+        "robustness": true,
+        "fairness": false,
+        "computed_on": "worst"
+      }
+    },
+    "count": 1,
+    "sum": 0.03568840579710145,
+    "sum_squared": 0.001273662308338584,
+    "min": 0.03568840579710145,
+    "max": 0.03568840579710145,
+    "mean": 0.03568840579710145,
+    "variance": 0.0,
+    "stddev": 0.0
+  },
+  {
+    "name": {
+      "name": "bleu_1",
+      "split": "test",
+      "perturbation": {
+        "name": "fairness",
+        "robustness": false,
+        "fairness": true,
+        "computed_on": "worst"
+      }
+    },
+    "count": 1,
+    "sum": 0.03568840579710145,
+    "sum_squared": 0.001273662308338584,
+    "min": 0.03568840579710145,
+    "max": 0.03568840579710145,
+    "mean": 0.03568840579710145,
+    "variance": 0.0,
+    "stddev": 0.0
+  },
+  {
+    "name": {
+      "name": "bleu_4",
+      "split": "test",
+      "perturbation": {
+        "name": "robustness",
+        "robustness": true,
+        "fairness": false,
+        "computed_on": "worst"
+      }
+    },
+    "count": 1,
+    "sum": 1.1125369292536313e-308,
+    "sum_squared": 0.0,
+    "min": 1.1125369292536313e-308,
+    "max": 1.1125369292536313e-308,
+    "mean": 1.1125369292536313e-308,
+    "variance": 0.0,
+    "stddev": 0.0
+  },
+  {
+    "name": {
+      "name": "bleu_4",
+      "split": "test",
+      "perturbation": {
+        "name": "fairness",
+        "robustness": false,
+        "fairness": true,
+        "computed_on": "worst"
+      }
+    },
+    "count": 1,
+    "sum": 1.1125369292536313e-308,
+    "sum_squared": 0.0,
+    "min": 1.1125369292536313e-308,
+    "max": 1.1125369292536313e-308,
+    "mean": 1.1125369292536313e-308,
+    "variance": 0.0,
+    "stddev": 0.0
+  },
+  {
+    "name": {
+      "name": "logprob",
+      "split": "test",
+      "perturbation": {
+        "name": "robustness",
+        "robustness": true,
+        "fairness": false,
+        "computed_on": "worst"
+      }
+    },
+    "count": 1,
+    "sum": 0.0,
+    "sum_squared": 0.0,
+    "min": 0.0,
+    "max": 0.0,
+    "mean": 0.0,
+    "variance": 0.0,
+    "stddev": 0.0
+  },
+  {
+    "name": {
+      "name": "logprob",
+      "split": "test",
+      "perturbation": {
+        "name": "fairness",
+        "robustness": false,
+        "fairness": true,
+        "computed_on": "worst"
+      }
+    },
+    "count": 1,
+    "sum": 0.0,
+    "sum_squared": 0.0,
+    "min": 0.0,
+    "max": 0.0,
+    "mean": 0.0,
+    "variance": 0.0,
+    "stddev": 0.0
+  },
+  {
+    "name": {
+      "name": "num_perplexity_tokens",
+      "split": "test",
+      "perturbation": {
+        "name": "robustness",
+        "robustness": true,
+        "fairness": false,
+        "computed_on": "worst"
+      }
+    },
+    "count": 1,
+    "sum": 38.0,
+    "sum_squared": 1444.0,
+    "min": 38.0,
+    "max": 38.0,
+    "mean": 38.0,
+    "variance": 0.0,
+    "stddev": 0.0
+  },
+  {
+    "name": {
+      "name": "num_perplexity_tokens",
+      "split": "test",
+      "perturbation": {
+        "name": "fairness",
+        "robustness": false,
+        "fairness": true,
+        "computed_on": "worst"
+      }
+    },
+    "count": 1,
+    "sum": 38.0,
+    "sum_squared": 1444.0,
+    "min": 38.0,
+    "max": 38.0,
+    "mean": 38.0,
+    "variance": 0.0,
+    "stddev": 0.0
+  },
+  {
+    "name": {
+      "name": "num_bytes",
+      "split": "test",
+      "perturbation": {
+        "name": "robustness",
+        "robustness": true,
+        "fairness": false,
+        "computed_on": "worst"
+      }
+    },
+    "count": 1,
+    "sum": 153.0,
+    "sum_squared": 23409.0,
+    "min": 153.0,
+    "max": 153.0,
+    "mean": 153.0,
+    "variance": 0.0,
+    "stddev": 0.0
+  },
+  {
+    "name": {
+      "name": "num_bytes",
+      "split": "test",
+      "perturbation": {
+        "name": "fairness",
+        "robustness": false,
+        "fairness": true,
+        "computed_on": "worst"
+      }
+    },
+    "count": 1,
+    "sum": 153.0,
+    "sum_squared": 23409.0,
+    "min": 153.0,
+    "max": 153.0,
+    "mean": 153.0,
+    "variance": 0.0,
+    "stddev": 0.0
+  },
+  {
+    "name": {
+      "name": "num_references",
+      "split": "valid",
+      "perturbation": {
+        "name": "robustness",
+        "robustness": true,
+        "fairness": false,
+        "computed_on": "worst"
+      }
+    },
+    "count": 1,
+    "sum": 2.0,
+    "sum_squared": 4.0,
+    "min": 2.0,
+    "max": 2.0,
+    "mean": 2.0,
+    "variance": 0.0,
+    "stddev": 0.0
+  },
+  {
+    "name": {
+      "name": "num_references",
+      "split": "valid",
+      "perturbation": {
+        "name": "fairness",
+        "robustness": false,
+        "fairness": true,
+        "computed_on": "worst"
+      }
+    },
+    "count": 1,
+    "sum": 2.0,
+    "sum_squared": 4.0,
+    "min": 2.0,
+    "max": 2.0,
+    "mean": 2.0,
+    "variance": 0.0,
+    "stddev": 0.0
+  },
+  {
+    "name": {
+      "name": "num_train_trials",
+      "split": "valid",
+      "perturbation": {
+        "name": "robustness",
+        "robustness": true,
+        "fairness": false,
+        "computed_on": "worst"
+      }
+    },
+    "count": 1,
+    "sum": 1.0,
+    "sum_squared": 1.0,
+    "min": 1.0,
+    "max": 1.0,
+    "mean": 1.0,
+    "variance": 0.0,
+    "stddev": 0.0
+  },
+  {
+    "name": {
+      "name": "num_train_trials",
+      "split": "valid",
+      "perturbation": {
+        "name": "fairness",
+        "robustness": false,
+        "fairness": true,
+        "computed_on": "worst"
+      }
+    },
+    "count": 1,
+    "sum": 1.0,
+    "sum_squared": 1.0,
+    "min": 1.0,
+    "max": 1.0,
+    "mean": 1.0,
+    "variance": 0.0,
+    "stddev": 0.0
+  },
+  {
+    "name": {
+      "name": "num_prompt_tokens",
+      "split": "valid",
+      "perturbation": {
+        "name": "robustness",
+        "robustness": true,
+        "fairness": false,
+        "computed_on": "worst"
+      }
+    },
+    "count": 1,
+    "sum": 686.0,
+    "sum_squared": 470596.0,
+    "min": 686.0,
+    "max": 686.0,
+    "mean": 686.0,
+    "variance": 0.0,
+    "stddev": 0.0
+  },
+  {
+    "name": {
+      "name": "num_prompt_tokens",
+      "split": "valid",
+      "perturbation": {
+        "name": "fairness",
+        "robustness": false,
+        "fairness": true,
+        "computed_on": "worst"
+      }
+    },
+    "count": 1,
+    "sum": 686.0,
+    "sum_squared": 470596.0,
+    "min": 686.0,
+    "max": 686.0,
+    "mean": 686.0,
+    "variance": 0.0,
+    "stddev": 0.0
+  },
+  {
+    "name": {
+      "name": "num_completion_tokens",
+      "split": "valid",
+      "perturbation": {
+        "name": "robustness",
+        "robustness": true,
+        "fairness": false,
+        "computed_on": "worst"
+      }
+    },
+    "count": 1,
+    "sum": 57.0,
+    "sum_squared": 3249.0,
+    "min": 57.0,
+    "max": 57.0,
+    "mean": 57.0,
+    "variance": 0.0,
+    "stddev": 0.0
+  },
+  {
+    "name": {
+      "name": "num_completion_tokens",
+      "split": "valid",
+      "perturbation": {
+        "name": "fairness",
+        "robustness": false,
+        "fairness": true,
+        "computed_on": "worst"
+      }
+    },
+    "count": 1,
+    "sum": 57.0,
+    "sum_squared": 3249.0,
+    "min": 57.0,
+    "max": 57.0,
+    "mean": 57.0,
+    "variance": 0.0,
+    "stddev": 0.0
+  },
+  {
+    "name": {
+      "name": "num_output_tokens",
+      "split": "valid",
+      "perturbation": {
+        "name": "robustness",
+        "robustness": true,
+        "fairness": false,
+        "computed_on": "worst"
+      }
+    },
+    "count": 1,
+    "sum": 57.0,
+    "sum_squared": 3249.0,
+    "min": 57.0,
+    "max": 57.0,
+    "mean": 57.0,
+    "variance": 0.0,
+    "stddev": 0.0
+  },
+  {
+    "name": {
+      "name": "num_output_tokens",
+      "split": "valid",
+      "perturbation": {
+        "name": "fairness",
+        "robustness": false,
+        "fairness": true,
+        "computed_on": "worst"
+      }
+    },
+    "count": 1,
+    "sum": 57.0,
+    "sum_squared": 3249.0,
+    "min": 57.0,
+    "max": 57.0,
+    "mean": 57.0,
+    "variance": 0.0,
+    "stddev": 0.0
+  },
+  {
+    "name": {
+      "name": "inference_runtime",
+      "split": "valid",
+      "perturbation": {
+        "name": "robustness",
+        "robustness": true,
+        "fairness": false,
+        "computed_on": "worst"
+      }
+    },
+    "count": 1,
+    "sum": 1.743454933166504,
+    "sum_squared": 3.0396351039826186,
+    "min": 1.743454933166504,
+    "max": 1.743454933166504,
+    "mean": 1.743454933166504,
+    "variance": 0.0,
+    "stddev": 0.0
+  },
+  {
+    "name": {
+      "name": "inference_runtime",
+      "split": "valid",
+      "perturbation": {
+        "name": "fairness",
+        "robustness": false,
+        "fairness": true,
+        "computed_on": "worst"
+      }
+    },
+    "count": 1,
+    "sum": 1.743454933166504,
+    "sum_squared": 3.0396351039826186,
+    "min": 1.743454933166504,
+    "max": 1.743454933166504,
+    "mean": 1.743454933166504,
+    "variance": 0.0,
+    "stddev": 0.0
+  },
+  {
+    "name": {
+      "name": "batch_size",
+      "split": "valid",
+      "perturbation": {
+        "name": "robustness",
+        "robustness": true,
+        "fairness": false,
+        "computed_on": "worst"
+      }
+    },
+    "count": 1,
+    "sum": 1.0,
+    "sum_squared": 1.0,
+    "min": 1.0,
+    "max": 1.0,
+    "mean": 1.0,
+    "variance": 0.0,
+    "stddev": 0.0
+  },
+  {
+    "name": {
+      "name": "batch_size",
+      "split": "valid",
+      "perturbation": {
+        "name": "fairness",
+        "robustness": false,
+        "fairness": true,
+        "computed_on": "worst"
+      }
+    },
+    "count": 1,
+    "sum": 1.0,
+    "sum_squared": 1.0,
+    "min": 1.0,
+    "max": 1.0,
+    "mean": 1.0,
+    "variance": 0.0,
+    "stddev": 0.0
+  },
+  {
+    "name": {
+      "name": "finish_reason_length",
+      "split": "valid",
+      "perturbation": {
+        "name": "robustness",
+        "robustness": true,
+        "fairness": false,
+        "computed_on": "worst"
+      }
+    },
+    "count": 1,
+    "sum": 0.0,
+    "sum_squared": 0.0,
+    "min": 0.0,
+    "max": 0.0,
+    "mean": 0.0,
+    "variance": 0.0,
+    "stddev": 0.0
+  },
+  {
+    "name": {
+      "name": "finish_reason_length",
+      "split": "valid",
+      "perturbation": {
+        "name": "fairness",
+        "robustness": false,
+        "fairness": true,
+        "computed_on": "worst"
+      }
+    },
+    "count": 1,
+    "sum": 0.0,
+    "sum_squared": 0.0,
+    "min": 0.0,
+    "max": 0.0,
+    "mean": 0.0,
+    "variance": 0.0,
+    "stddev": 0.0
+  },
+  {
+    "name": {
+      "name": "finish_reason_stop",
+      "split": "valid",
+      "perturbation": {
+        "name": "robustness",
+        "robustness": true,
+        "fairness": false,
+        "computed_on": "worst"
+      }
+    },
+    "count": 1,
+    "sum": 0.0,
+    "sum_squared": 0.0,
+    "min": 0.0,
+    "max": 0.0,
+    "mean": 0.0,
+    "variance": 0.0,
+    "stddev": 0.0
+  },
+  {
+    "name": {
+      "name": "finish_reason_stop",
+      "split": "valid",
+      "perturbation": {
+        "name": "fairness",
+        "robustness": false,
+        "fairness": true,
+        "computed_on": "worst"
+      }
+    },
+    "count": 1,
+    "sum": 0.0,
+    "sum_squared": 0.0,
+    "min": 0.0,
+    "max": 0.0,
+    "mean": 0.0,
+    "variance": 0.0,
+    "stddev": 0.0
+  },
+  {
+    "name": {
+      "name": "finish_reason_endoftext",
+      "split": "valid",
+      "perturbation": {
+        "name": "robustness",
+        "robustness": true,
+        "fairness": false,
+        "computed_on": "worst"
+      }
+    },
+    "count": 1,
+    "sum": 0.0,
+    "sum_squared": 0.0,
+    "min": 0.0,
+    "max": 0.0,
+    "mean": 0.0,
+    "variance": 0.0,
+    "stddev": 0.0
+  },
+  {
+    "name": {
+      "name": "finish_reason_endoftext",
+      "split": "valid",
+      "perturbation": {
+        "name": "fairness",
+        "robustness": false,
+        "fairness": true,
+        "computed_on": "worst"
+      }
+    },
+    "count": 1,
+    "sum": 0.0,
+    "sum_squared": 0.0,
+    "min": 0.0,
+    "max": 0.0,
+    "mean": 0.0,
+    "variance": 0.0,
+    "stddev": 0.0
+  },
+  {
+    "name": {
+      "name": "finish_reason_unknown",
+      "split": "valid",
+      "perturbation": {
+        "name": "robustness",
+        "robustness": true,
+        "fairness": false,
+        "computed_on": "worst"
+      }
+    },
+    "count": 1,
+    "sum": 1.0,
+    "sum_squared": 1.0,
+    "min": 1.0,
+    "max": 1.0,
+    "mean": 1.0,
+    "variance": 0.0,
+    "stddev": 0.0
+  },
+  {
+    "name": {
+      "name": "finish_reason_unknown",
+      "split": "valid",
+      "perturbation": {
+        "name": "fairness",
+        "robustness": false,
+        "fairness": true,
+        "computed_on": "worst"
+      }
+    },
+    "count": 1,
+    "sum": 1.0,
+    "sum_squared": 1.0,
+    "min": 1.0,
+    "max": 1.0,
+    "mean": 1.0,
+    "variance": 0.0,
+    "stddev": 0.0
+  },
+  {
+    "name": {
+      "name": "num_train_instances",
+      "split": "valid",
+      "perturbation": {
+        "name": "robustness",
+        "robustness": true,
+        "fairness": false,
+        "computed_on": "worst"
+      }
+    },
+    "count": 1,
+    "sum": 0.0,
+    "sum_squared": 0.0,
+    "min": 0.0,
+    "max": 0.0,
+    "mean": 0.0,
+    "variance": 0.0,
+    "stddev": 0.0
+  },
+  {
+    "name": {
+      "name": "num_train_instances",
+      "split": "valid",
+      "perturbation": {
+        "name": "fairness",
+        "robustness": false,
+        "fairness": true,
+        "computed_on": "worst"
+      }
+    },
+    "count": 1,
+    "sum": 0.0,
+    "sum_squared": 0.0,
+    "min": 0.0,
+    "max": 0.0,
+    "mean": 0.0,
+    "variance": 0.0,
+    "stddev": 0.0
+  },
+  {
+    "name": {
+      "name": "prompt_truncated",
+      "split": "valid",
+      "perturbation": {
+        "name": "robustness",
+        "robustness": true,
+        "fairness": false,
+        "computed_on": "worst"
+      }
+    },
+    "count": 1,
+    "sum": 0.0,
+    "sum_squared": 0.0,
+    "min": 0.0,
+    "max": 0.0,
+    "mean": 0.0,
+    "variance": 0.0,
+    "stddev": 0.0
+  },
+  {
+    "name": {
+      "name": "prompt_truncated",
+      "split": "valid",
+      "perturbation": {
+        "name": "fairness",
+        "robustness": false,
+        "fairness": true,
+        "computed_on": "worst"
+      }
+    },
+    "count": 1,
+    "sum": 0.0,
+    "sum_squared": 0.0,
+    "min": 0.0,
+    "max": 0.0,
+    "mean": 0.0,
+    "variance": 0.0,
+    "stddev": 0.0
+  },
+  {
+    "name": {
+      "name": "max_prob",
+      "split": "valid",
+      "perturbation": {
+        "name": "robustness",
+        "robustness": true,
+        "fairness": false,
+        "computed_on": "worst"
+      }
+    },
+    "count": 1,
+    "sum": 1.0,
+    "sum_squared": 1.0,
+    "min": 1.0,
+    "max": 1.0,
+    "mean": 1.0,
+    "variance": 0.0,
+    "stddev": 0.0
+  },
+  {
+    "name": {
+      "name": "max_prob",
+      "split": "valid",
+      "perturbation": {
+        "name": "fairness",
+        "robustness": false,
+        "fairness": true,
+        "computed_on": "worst"
+      }
+    },
+    "count": 1,
+    "sum": 1.0,
+    "sum_squared": 1.0,
+    "min": 1.0,
+    "max": 1.0,
+    "mean": 1.0,
+    "variance": 0.0,
+    "stddev": 0.0
+  },
+  {
+    "name": {
+      "name": "exact_match",
+      "split": "valid",
+      "perturbation": {
+        "name": "robustness",
+        "robustness": true,
+        "fairness": false,
+        "computed_on": "worst"
+      }
+    },
+    "count": 1,
+    "sum": 0.0,
+    "sum_squared": 0.0,
+    "min": 0.0,
+    "max": 0.0,
+    "mean": 0.0,
+    "variance": 0.0,
+    "stddev": 0.0
+  },
+  {
+    "name": {
+      "name": "exact_match",
+      "split": "valid",
+      "perturbation": {
+        "name": "fairness",
+        "robustness": false,
+        "fairness": true,
+        "computed_on": "worst"
+      }
+    },
+    "count": 1,
+    "sum": 0.0,
+    "sum_squared": 0.0,
+    "min": 0.0,
+    "max": 0.0,
+    "mean": 0.0,
+    "variance": 0.0,
+    "stddev": 0.0
+  },
+  {
+    "name": {
+      "name": "quasi_exact_match",
+      "split": "valid",
+      "perturbation": {
+        "name": "robustness",
+        "robustness": true,
+        "fairness": false,
+        "computed_on": "worst"
+      }
+    },
+    "count": 1,
+    "sum": 0.0,
+    "sum_squared": 0.0,
+    "min": 0.0,
+    "max": 0.0,
+    "mean": 0.0,
+    "variance": 0.0,
+    "stddev": 0.0
+  },
+  {
+    "name": {
+      "name": "quasi_exact_match",
+      "split": "valid",
+      "perturbation": {
+        "name": "fairness",
+        "robustness": false,
+        "fairness": true,
+        "computed_on": "worst"
+      }
+    },
+    "count": 1,
+    "sum": 0.0,
+    "sum_squared": 0.0,
+    "min": 0.0,
+    "max": 0.0,
+    "mean": 0.0,
+    "variance": 0.0,
+    "stddev": 0.0
+  },
+  {
+    "name": {
+      "name": "f1_score",
+      "split": "valid",
+      "perturbation": {
+        "name": "robustness",
+        "robustness": true,
+        "fairness": false,
+        "computed_on": "worst"
+      }
+    },
+    "count": 1,
+    "sum": 0.0,
+    "sum_squared": 0.0,
+    "min": 0.0,
+    "max": 0.0,
+    "mean": 0.0,
+    "variance": 0.0,
+    "stddev": 0.0
+  },
+  {
+    "name": {
+      "name": "f1_score",
+      "split": "valid",
+      "perturbation": {
+        "name": "fairness",
+        "robustness": false,
+        "fairness": true,
+        "computed_on": "worst"
+      }
+    },
+    "count": 1,
+    "sum": 0.0,
+    "sum_squared": 0.0,
+    "min": 0.0,
+    "max": 0.0,
+    "mean": 0.0,
+    "variance": 0.0,
+    "stddev": 0.0
+  },
+  {
+    "name": {
+      "name": "rouge_l",
+      "split": "valid",
+      "perturbation": {
+        "name": "robustness",
+        "robustness": true,
+        "fairness": false,
+        "computed_on": "worst"
+      }
+    },
+    "count": 1,
+    "sum": 0.03636363636363636,
+    "sum_squared": 0.0013223140495867767,
+    "min": 0.03636363636363636,
+    "max": 0.03636363636363636,
+    "mean": 0.03636363636363636,
+    "variance": 0.0,
+    "stddev": 0.0
+  },
+  {
+    "name": {
+      "name": "rouge_l",
+      "split": "valid",
+      "perturbation": {
+        "name": "fairness",
+        "robustness": false,
+        "fairness": true,
+        "computed_on": "worst"
+      }
+    },
+    "count": 1,
+    "sum": 0.03636363636363636,
+    "sum_squared": 0.0013223140495867767,
+    "min": 0.03636363636363636,
+    "max": 0.03636363636363636,
+    "mean": 0.03636363636363636,
+    "variance": 0.0,
+    "stddev": 0.0
+  },
+  {
+    "name": {
+      "name": "bleu_1",
+      "split": "valid",
+      "perturbation": {
+        "name": "robustness",
+        "robustness": true,
+        "fairness": false,
+        "computed_on": "worst"
+      }
+    },
+    "count": 1,
+    "sum": 0.03508771929824561,
+    "sum_squared": 0.0012311480455524776,
+    "min": 0.03508771929824561,
+    "max": 0.03508771929824561,
+    "mean": 0.03508771929824561,
+    "variance": 0.0,
+    "stddev": 0.0
+  },
+  {
+    "name": {
+      "name": "bleu_1",
+      "split": "valid",
+      "perturbation": {
+        "name": "fairness",
+        "robustness": false,
+        "fairness": true,
+        "computed_on": "worst"
+      }
+    },
+    "count": 1,
+    "sum": 0.03508771929824561,
+    "sum_squared": 0.0012311480455524776,
+    "min": 0.03508771929824561,
+    "max": 0.03508771929824561,
+    "mean": 0.03508771929824561,
+    "variance": 0.0,
+    "stddev": 0.0
+  },
+  {
+    "name": {
+      "name": "bleu_4",
+      "split": "valid",
+      "perturbation": {
+        "name": "robustness",
+        "robustness": true,
+        "fairness": false,
+        "computed_on": "worst"
+      }
+    },
+    "count": 1,
+    "sum": 2.2250738585072626e-308,
+    "sum_squared": 0.0,
+    "min": 2.2250738585072626e-308,
+    "max": 2.2250738585072626e-308,
+    "mean": 2.2250738585072626e-308,
+    "variance": 0.0,
+    "stddev": 0.0
+  },
+  {
+    "name": {
+      "name": "bleu_4",
+      "split": "valid",
+      "perturbation": {
+        "name": "fairness",
+        "robustness": false,
+        "fairness": true,
+        "computed_on": "worst"
+      }
+    },
+    "count": 1,
+    "sum": 2.2250738585072626e-308,
+    "sum_squared": 0.0,
+    "min": 2.2250738585072626e-308,
+    "max": 2.2250738585072626e-308,
+    "mean": 2.2250738585072626e-308,
+    "variance": 0.0,
+    "stddev": 0.0
+  },
+  {
+    "name": {
+      "name": "logprob",
+      "split": "valid",
+      "perturbation": {
+        "name": "robustness",
+        "robustness": true,
+        "fairness": false,
+        "computed_on": "worst"
+      }
+    },
+    "count": 1,
+    "sum": 0.0,
+    "sum_squared": 0.0,
+    "min": 0.0,
+    "max": 0.0,
+    "mean": 0.0,
+    "variance": 0.0,
+    "stddev": 0.0
+  },
+  {
+    "name": {
+      "name": "logprob",
+      "split": "valid",
+      "perturbation": {
+        "name": "fairness",
+        "robustness": false,
+        "fairness": true,
+        "computed_on": "worst"
+      }
+    },
+    "count": 1,
+    "sum": 0.0,
+    "sum_squared": 0.0,
+    "min": 0.0,
+    "max": 0.0,
+    "mean": 0.0,
+    "variance": 0.0,
+    "stddev": 0.0
+  },
+  {
+    "name": {
+      "name": "num_perplexity_tokens",
+      "split": "valid",
+      "perturbation": {
+        "name": "robustness",
+        "robustness": true,
+        "fairness": false,
+        "computed_on": "worst"
+      }
+    },
+    "count": 1,
+    "sum": 57.0,
+    "sum_squared": 3249.0,
+    "min": 57.0,
+    "max": 57.0,
+    "mean": 57.0,
+    "variance": 0.0,
+    "stddev": 0.0
+  },
+  {
+    "name": {
+      "name": "num_perplexity_tokens",
+      "split": "valid",
+      "perturbation": {
+        "name": "fairness",
+        "robustness": false,
+        "fairness": true,
+        "computed_on": "worst"
+      }
+    },
+    "count": 1,
+    "sum": 57.0,
+    "sum_squared": 3249.0,
+    "min": 57.0,
+    "max": 57.0,
+    "mean": 57.0,
+    "variance": 0.0,
+    "stddev": 0.0
+  },
+  {
+    "name": {
+      "name": "num_bytes",
+      "split": "valid",
+      "perturbation": {
+        "name": "robustness",
+        "robustness": true,
+        "fairness": false,
+        "computed_on": "worst"
+      }
+    },
+    "count": 1,
+    "sum": 270.0,
+    "sum_squared": 72900.0,
+    "min": 270.0,
+    "max": 270.0,
+    "mean": 270.0,
+    "variance": 0.0,
+    "stddev": 0.0
+  },
+  {
+    "name": {
+      "name": "num_bytes",
+      "split": "valid",
+      "perturbation": {
+        "name": "fairness",
+        "robustness": false,
+        "fairness": true,
+        "computed_on": "worst"
+      }
+    },
+    "count": 1,
+    "sum": 270.0,
+    "sum_squared": 72900.0,
+    "min": 270.0,
+    "max": 270.0,
+    "mean": 270.0,
+    "variance": 0.0,
+    "stddev": 0.0
+  },
+  {
+    "name": {
+      "name": "num_instances",
+      "split": "test"
+    },
+    "count": 1,
+    "sum": 4.0,
+    "sum_squared": 16.0,
+    "min": 4.0,
+    "max": 4.0,
+    "mean": 4.0,
+    "variance": 0.0,
+    "stddev": 0.0
+  },
+  {
+    "name": {
+      "name": "num_instances",
+      "split": "valid"
+    },
+    "count": 1,
+    "sum": 1.0,
+    "sum_squared": 1.0,
+    "min": 1.0,
+    "max": 1.0,
+    "mean": 1.0,
+    "variance": 0.0,
+    "stddev": 0.0
+  }
+]
\ No newline at end of file
diff --git a/tests/test_helm_adapter.py b/tests/test_helm_adapter.py
index b9d508f..2cdf851 100644
--- a/tests/test_helm_adapter.py
+++ b/tests/test_helm_adapter.py
@@ -1,48 +1,156 @@
-# import pytest
-# from pathlib import Path
-# from eval_converters.helm.adapter import HELMAdapter
-# import json
-
-# @pytest.fixture
-# def adapter():
-#     return HELMAdapter()
-
-# def test_metadata_and_supported_library(adapter):
-#     metadata = adapter.metadata
-#     assert metadata.name == "HELMAdapter"
-#     assert metadata.version == "0.0.1"
-#     assert "0.5.6" in metadata.supported_library_versions
-#     assert "HELM evaluation outputs" in metadata.description
-
-#     supported_lib = adapter.supported_library
-#     assert supported_lib.name == "HELM"
-
-# def test_transform_from_directory(adapter):
-#     test_dir = Path(__file__).parent.resolve()
-#     output_dir_path = test_dir / 'data/helm/commonsense:dataset=hellaswag,method=multiple_choice_joint,model=eleutherai_pythia-1b-v0'
+from pathlib import Path
+
+from eval_converters.helm.adapter import HELMAdapter
+from schema.eval_types import (
+    EvaluationLog, 
+    EvaluatorRelationship,
+    SourceData,
+    SourceMetadata,
+
+)
+
+def _load_eval(adapter, filepath, source_metadata):
+    eval_dirpath = Path(filepath)
+    converted_eval = adapter.transform_from_directory(eval_dirpath, source_metadata=source_metadata)
+    assert isinstance(converted_eval, EvaluationLog)
+    assert isinstance(converted_eval.source_data, SourceData)
+
+    assert converted_eval.evaluation_source.evaluation_source_name == 'helm'
+    assert converted_eval.evaluation_source.evaluation_source_type.value == 'evaluation_platform'
+
+    return converted_eval
+
+def test_mmlu_eval():
+    adapter = HELMAdapter()
+    source_metadata = SourceMetadata(
+        source_organization_name='TestOrg',
+        evaluator_relationship=EvaluatorRelationship.first_party,
+    )
+
+    converted_eval = _load_eval(adapter, 'tests/data/helm/mmlu:subject=philosophy,method=multiple_choice_joint,model=openai_gpt2', source_metadata)
+
+    assert converted_eval.retrieved_timestamp == '1762354922'
+    
+    assert converted_eval.source_data.dataset_name == 'mmlu'
+    assert converted_eval.source_data.hf_repo is None
+    assert len(converted_eval.source_data.sample_ids) == 10
+
+    assert converted_eval.model_info.name == 'openai/gpt2'
+    assert converted_eval.model_info.id == 'openai/gpt2'
+    assert converted_eval.model_info.developer == 'openai'
+    assert converted_eval.model_info.inference_platform == 'huggingface'
+    assert converted_eval.model_info.inference_engine is None
+
+    results = converted_eval.evaluation_results
+    metric_names = ['exact_match', 'quasi_exact_match', 'prefix_exact_match', 'quasi_prefix_exact_match']
+    
+    for result in results:
+        assert results[0].evaluation_name == 'multiple_choice_joint'
+        assert results[0].metric_config.evaluation_description in metric_names
+        # assert results[0].score_details.score == 1.0
+
+    results_per_sample = converted_eval.detailed_evaluation_results_per_samples
+    sample_ids = [sample.sample_id for sample in results_per_sample]
+
+    assert sorted(sample_ids) == ['id105', 'id11', 'id131', 'id147', 'id222', 'id259', 'id291', 'id344', 'id59', 'id65']
+    assert isinstance(results_per_sample[0].ground_truth, list)
+    assert results_per_sample[0].ground_truth[0] == 'C'
+    assert results_per_sample[0].response == 'D'
+    assert isinstance(results_per_sample[0].choices, list)
+    choices = sorted([choice for choice, resp in results_per_sample[0].choices])
+    responses = sorted([resp for choice, resp in results_per_sample[0].choices])
+    assert choices == ['A', 'B', 'C', 'D']
+    assert responses == [
+        'external meaning',
+        "god's plan",
+        'internalmeaning',
+        'meaning in an afterlife'
+    ]
+
+def test_hellswag_eval():
+    adapter = HELMAdapter()
+    source_metadata = SourceMetadata(
+        source_organization_name='TestOrg',
+        evaluator_relationship=EvaluatorRelationship.first_party,
+    )
+
+    converted_eval = _load_eval(adapter, 'tests/data/helm/commonsense:dataset=hellaswag,method=multiple_choice_joint,model=eleutherai_pythia-1b-v0', source_metadata)
+
+    assert converted_eval.retrieved_timestamp == '1751729998'
+    
+    assert converted_eval.source_data.dataset_name == 'hellaswag'
+    assert converted_eval.source_data.hf_repo is None
+    assert len(converted_eval.source_data.sample_ids) == 10
+
+    assert converted_eval.model_info.name == 'eleutherai/pythia-1b-v0'
+    assert converted_eval.model_info.id == 'eleutherai/pythia-1b-v0'
+    assert converted_eval.model_info.developer == 'eleutherai'
+    assert converted_eval.model_info.inference_platform == 'huggingface'
+    assert converted_eval.model_info.inference_engine is None
+
+    results = converted_eval.evaluation_results
+    metric_names = ['exact_match', 'quasi_exact_match', 'prefix_exact_match', 'quasi_prefix_exact_match']
+    
+    assert results[0].score_details.score == 0.3
+    for result in results:
+        assert results[0].evaluation_name == 'multiple_choice_joint'
+        assert results[0].metric_config.evaluation_description in metric_names
+
+    results_per_sample = converted_eval.detailed_evaluation_results_per_samples
+    sample_ids = [sample.sample_id for sample in results_per_sample]
+
+    assert sorted(sample_ids) == ['id41468', 'id41992', 'id42841', 'id44284', 'id44874', 'id45277', 'id46128', 'id47299', 'id47975', 'id49438']
+    assert isinstance(results_per_sample[0].ground_truth, list)
+    assert results_per_sample[0].ground_truth[0] == 'C'
+    assert results_per_sample[0].response == 'B'
+    assert isinstance(results_per_sample[0].choices, list)
+    choices = sorted([choice for choice, resp in results_per_sample[0].choices])
+    responses = sorted([resp for choice, resp in results_per_sample[0].choices])
+    assert choices == ['A', 'B', 'C', 'D']
+
+    assert responses == [
+        'However, you can also take your color, added color, and texture into account when deciding what to dye, and what you will use it for. [substeps] Consider adding your hair dye to your hair if you have it long or curly.', 
+        "If you're not planning on dying your hair, there are other coloration measures you can take to dye your hair. [step] Photoshop hd darkers work well, but don't lack the style that can be coupled with it.", 
+        'It is important to select the color that represents your hair type when you register your hair color. [substeps] Traditional semi-permanent dyes will generally not be available for hair color, like blow-dryers, curling irons, and appliances.', 
+        "Pick the color that's your favorite, matches your wardrobe best, and/or is most flattering for your eye color and skin tone. Semi-permanent dyes work on all hair colors, but show up brightest on light hair."
+    ]
+
+def test_narrativeqa_eval():
+    adapter = HELMAdapter()
+    source_metadata = SourceMetadata(
+        source_organization_name='TestOrg',
+        evaluator_relationship=EvaluatorRelationship.first_party,
+    )
+
+    converted_eval = _load_eval(adapter, 'tests/data/helm/narrative_qa:model=openai_gpt2', source_metadata)
+
+    assert converted_eval.retrieved_timestamp == '1763479296'
     
-#     results = adapter.transform_from_directory(output_dir_path)
+    assert converted_eval.source_data.dataset_name == 'narrativeqa'
+    assert converted_eval.source_data.hf_repo is None
+    assert len(converted_eval.source_data.sample_ids) == 5
+
+    assert converted_eval.model_info.name == 'openai/gpt2'
+    assert converted_eval.model_info.id == 'openai/gpt2'
+    assert converted_eval.model_info.developer == 'openai'
+    assert converted_eval.model_info.inference_platform == 'huggingface'
+    assert converted_eval.model_info.inference_engine is None
+
+    results = converted_eval.evaluation_results
+    metric_names = ['exact_match', 'quasi_exact_match', 'prefix_exact_match', 'quasi_prefix_exact_match']
     
-#     assert isinstance(results, list)
-#     assert all(hasattr(r, 'schema_version') for r in results)
-#     assert all(r.model.model_info.name for r in results)
-#     assert all(r.instance.raw_input for r in results)
-#     assert len(results) > 0, "No results found in the output directory"
-
-# def test_transform_single(adapter):
-#     test_dir = Path(__file__).parent.resolve()
-#     output_base_path = test_dir / 'data/helm/'
-#     output_file_path = test_dir / 'data/helm/transform_helm_file_raw_data.json'
-
-#     results = adapter._transform_single(output_file_path, base_dir=output_base_path)
-
-#     assert isinstance(results, list)
-#     assert all(hasattr(r, 'schema_version') for r in results)
-#     assert all(r.model.model_info.name for r in results)
-#     assert all(r.instance.raw_input for r in results)
-#     assert len(results) > 0, "No results found in the output directory"
-
-# if __name__ == "__main__":
-#     # Create an adapter instance for direct execution
-#     adapter = HELMAdapter()
-#     test_transform_single(adapter)
\ No newline at end of file
+    for result in results:
+        assert results[0].evaluation_name == 'generation'
+        assert results[0].metric_config.evaluation_description in metric_names
+        # assert results[0].score_details.score == 1.0
+
+    results_per_sample = converted_eval.detailed_evaluation_results_per_samples
+    sample_ids = [sample.sample_id for sample in results_per_sample]
+
+    assert sorted(sample_ids) == ['id1123', 'id1332', 'id1340', 'id1413', 'id1514']
+
+    assert isinstance(results_per_sample[0].ground_truth, list)
+    assert sorted(results_per_sample[0].ground_truth) == ['The school Mascot', 'the schools mascot']
+    assert results_per_sample[0].ground_truth[0] == 'The school Mascot'
+    assert results_per_sample[0].response == 'Olive.'
+    assert results_per_sample[0].choices is None
\ No newline at end of file