From 47b8de28d249eb10d4774b14c820d8f4e87bd78e Mon Sep 17 00:00:00 2001
From: Mark Ayzenshtadt
Date: Mon, 24 Jun 2024 22:22:32 +0300
Subject: [PATCH] save json files for pre-existing predictions

---
 inference_engine/core/common/types.py |  8 +++---
 inference_engine/core/data.py         | 39 ++++++++++++++++++++-------
 2 files changed, 34 insertions(+), 13 deletions(-)

diff --git a/inference_engine/core/common/types.py b/inference_engine/core/common/types.py
index 32ae2b2..ab1d028 100644
--- a/inference_engine/core/common/types.py
+++ b/inference_engine/core/common/types.py
@@ -1,6 +1,6 @@
 from pydantic import BaseModel, Field
 from pathlib import Path
-from typing import List
+from typing import List, Optional
 
 
 class Sample(BaseModel):
@@ -24,12 +24,12 @@ class BatchLLMPrediction(BaseModel):
 
 class LLMOutput(Sample):
     llm_prediction: str
-    ntokens: int
-    proctime: float
+    ntokens: Optional[int]
+    proctime: Optional[float]
 
     @classmethod
     def from_sample(
-        cls, sample: Sample, llm_prediction: str, ntokens: int, proctime: float
+        cls, sample: Sample, llm_prediction: str, ntokens: Optional[int], proctime: Optional[float]
     ) -> "LLMOutput":
         return cls(
             video_path=sample.video_path,
diff --git a/inference_engine/core/data.py b/inference_engine/core/data.py
index 8f2863d..ccb7f0a 100644
--- a/inference_engine/core/data.py
+++ b/inference_engine/core/data.py
@@ -4,7 +4,7 @@
 from typing import List
 
 from core.common.config import EngineConfig
-from core.common.types import Sample, Batch
+from core.common.types import Sample, Batch, LLMOutput
 
 
 def load_qa(config: EngineConfig) -> pd.DataFrame:
@@ -18,17 +18,38 @@ def load_qa(config: EngineConfig) -> pd.DataFrame:
         for i, conversation in enumerate(entry["conversations"]):
             assert conversation[0]["from"] == "human"
             assert conversation[1]["from"] == "gpt"
-
-            samples.append(
-                {
-                    "video": entry["video"],
-                    "messages": conversation,
-                    "question_id": f"{entry['video'][:-4]}_{i}",
-                }
-            )
+            d = {
+                "video": entry["video"],
+                "messages": conversation,
+                "question_id": f"{entry['id']}_{i}",
+                "llm_prediction": entry.get('llm_predictions')[i],  # None if not present, must be same length as conversations
+                "ntokens": entry.get('ntokens'),  # None if not present
+                "proctime": entry.get('proctime'),  # None if not present
+            }
+            samples.append(d)
 
     df = pd.DataFrame(samples)
 
+    for _, row in df[df['llm_prediction'].notnull()].iterrows():
+        llm_output = LLMOutput.from_sample(
+            Sample(
+                video_path=Path(config.video_path)
+                .joinpath(row.video)
+                .resolve()
+                .as_posix(),
+                text_prompt=row.messages[0]["value"],
+                target_answer=row.messages[1]["value"],
+                question_id=row.question_id,
+            ),
+            llm_prediction=row["llm_prediction"],
+            ntokens=row["ntokens"],
+            proctime=row["proctime"],
+        )
+        with config.llm_output_path.joinpath(f"{llm_output.question_id}.json").open(
+            "w"
+        ) as f:
+            f.write(llm_output.model_dump_json())
+
     existing_outputs = [path.stem for path in config.llm_output_path.glob("*.json")]
     filtered_df = df[~df["question_id"].isin(existing_outputs)]
     return filtered_df