Skip to content
This repository was archived by the owner on May 19, 2025. It is now read-only.
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 4 additions & 4 deletions inference_engine/core/common/types.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
from pydantic import BaseModel, Field
from pathlib import Path
from typing import List
from typing import List, Optional


class Sample(BaseModel):
Expand All @@ -24,12 +24,12 @@ class BatchLLMPrediction(BaseModel):

class LLMOutput(Sample):
llm_prediction: str
ntokens: int
proctime: float
ntokens: Optional[int]
proctime: Optional[float]

@classmethod
def from_sample(
cls, sample: Sample, llm_prediction: str, ntokens: int, proctime: float
cls, sample: Sample, llm_prediction: str, ntokens: Optional[int], proctime: Optional[float]
) -> "LLMOutput":
return cls(
video_path=sample.video_path,
Expand Down
39 changes: 30 additions & 9 deletions inference_engine/core/data.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
from typing import List

from core.common.config import EngineConfig
from core.common.types import Sample, Batch
from core.common.types import Sample, Batch, LLMOutput


def load_qa(config: EngineConfig) -> pd.DataFrame:
Expand All @@ -18,17 +18,38 @@ def load_qa(config: EngineConfig) -> pd.DataFrame:
for i, conversation in enumerate(entry["conversations"]):
assert conversation[0]["from"] == "human"
assert conversation[1]["from"] == "gpt"

samples.append(
{
"video": entry["video"],
"messages": conversation,
"question_id": f"{entry['video'][:-4]}_{i}",
}
)
d = {
"video": entry["video"],
"messages": conversation,
"question_id": f"{entry['id']}_{i}",
                "llm_prediction": entry.get('llm_predictions')[i],  # NOTE(review): if 'llm_predictions' is absent, .get() returns None and None[i] raises TypeError — not "None if not present"; when present it must be the same length as conversations
"ntokens": entry.get('ntokens'), # None if not present
"proctime": entry.get('proctime'), # None if not present
}
samples.append(d)

df = pd.DataFrame(samples)

for _, row in df[df['llm_prediction'].notnull()].iterrows():
llm_output = LLMOutput.from_sample(
Sample(
video_path=Path(config.video_path)
.joinpath(row.video)
.resolve()
.as_posix(),
text_prompt=row.messages[0]["value"],
target_answer=row.messages[1]["value"],
question_id=row.question_id,
),
llm_prediction=row["llm_prediction"],
ntokens=row["ntokens"],
proctime=row["proctime"],
)
with config.llm_output_path.joinpath(f"{llm_output.question_id}.json").open(
"w"
) as f:
f.write(llm_output.model_dump_json())

existing_outputs = [path.stem for path in config.llm_output_path.glob("*.json")]
filtered_df = df[~df["question_id"].isin(existing_outputs)]
return filtered_df
Expand Down