From 47b8de28d249eb10d4774b14c820d8f4e87bd78e Mon Sep 17 00:00:00 2001
From: Mark Ayzenshtadt
Date: Mon, 24 Jun 2024 22:22:32 +0300
Subject: [PATCH] save json files for pre-existing predictions

---
 inference_engine/core/common/types.py |  8 +++---
 inference_engine/core/data.py         | 39 ++++++++++++++++++++-------
 2 files changed, 34 insertions(+), 13 deletions(-)

diff --git a/inference_engine/core/common/types.py b/inference_engine/core/common/types.py
index 32ae2b2..ab1d028 100644
--- a/inference_engine/core/common/types.py
+++ b/inference_engine/core/common/types.py
@@ -1,6 +1,6 @@
 from pydantic import BaseModel, Field
 from pathlib import Path
-from typing import List
+from typing import List, Optional
 
 
 class Sample(BaseModel):
@@ -24,12 +24,12 @@ class BatchLLMPrediction(BaseModel):
 
 class LLMOutput(Sample):
     llm_prediction: str
-    ntokens: int
-    proctime: float
+    ntokens: Optional[int]
+    proctime: Optional[float]
 
     @classmethod
     def from_sample(
-        cls, sample: Sample, llm_prediction: str, ntokens: int, proctime: float
+        cls, sample: Sample, llm_prediction: str, ntokens: Optional[int], proctime: Optional[float]
     ) -> "LLMOutput":
         return cls(
             video_path=sample.video_path,
diff --git a/inference_engine/core/data.py b/inference_engine/core/data.py
index 8f2863d..ccb7f0a 100644
--- a/inference_engine/core/data.py
+++ b/inference_engine/core/data.py
@@ -4,7 +4,7 @@
 from typing import List
 
 from core.common.config import EngineConfig
-from core.common.types import Sample, Batch
+from core.common.types import Sample, Batch, LLMOutput
 
 
 def load_qa(config: EngineConfig) -> pd.DataFrame:
@@ -18,17 +18,38 @@ def load_qa(config: EngineConfig) -> pd.DataFrame:
         for i, conversation in enumerate(entry["conversations"]):
             assert conversation[0]["from"] == "human"
             assert conversation[1]["from"] == "gpt"
-
-            samples.append(
-                {
-                    "video": entry["video"],
-                    "messages": conversation,
-                    "question_id": f"{entry['video'][:-4]}_{i}",
-                }
-            )
+            d = {
+                "video": entry["video"],
+                "messages": conversation,
+                "question_id": f"{entry['id']}_{i}",
+                "llm_prediction": entry.get('llm_predictions')[i],  # None if not present, must be same length as conversations
+                "ntokens": entry.get('ntokens'),  # None if not present
+                "proctime": entry.get('proctime'),  # None if not present
+            }
+            samples.append(d)
 
     df = pd.DataFrame(samples)
 
+    for _, row in df[df['llm_prediction'].notnull()].iterrows():
+        llm_output = LLMOutput.from_sample(
+            Sample(
+                video_path=Path(config.video_path)
+                .joinpath(row.video)
+                .resolve()
+                .as_posix(),
+                text_prompt=row.messages[0]["value"],
+                target_answer=row.messages[1]["value"],
+                question_id=row.question_id,
+            ),
+            llm_prediction=row["llm_prediction"],
+            ntokens=row["ntokens"],
+            proctime=row["proctime"],
+        )
+        with config.llm_output_path.joinpath(f"{llm_output.question_id}.json").open(
+            "w"
+        ) as f:
+            f.write(llm_output.model_dump_json())
+
     existing_outputs = [path.stem for path in config.llm_output_path.glob("*.json")]
     filtered_df = df[~df["question_id"].isin(existing_outputs)]
     return filtered_df