PTTS/tokenize_data.py at main · VILA-Lab/PTTS · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
from typing import Dict
import re
import os
from datasets import load_dataset
from transformers import AutoTokenizer
from functools import partial

# User-turn prompt template: the question is passed through verbatim, with
# no extra instruction text wrapped around it.
QUERY_TEMPLATE_NOANSWER = "{Question}"

def preprocess(text):
    """Clean up a raw text snippet.

    ``None`` is mapped to a single space. Any other input is stripped of
    surrounding whitespace, has each " [title]" marker turned into ". ",
    has every remaining square-bracketed span removed, and finally has one
    pass of double-space collapsing applied.
    """
    if text is None:
        return " "
    cleaned = text.strip().replace(" [title]", ". ")
    # Non-greedy so each "[...]" span is removed individually.
    cleaned = re.sub("\\[.*?\\]", "", cleaned)
    # Single pass only: a run of 3+ spaces is shortened, not fully collapsed.
    return cleaned.replace("  ", " ")

def process_cot_example(
    example: Dict,
    tokenizer,
):
    """Render one dataset row as a single chat-formatted string.

    Reads the "deepseek_thinking_trajectory", "question" and
    "deepseek_attempt" fields of ``example`` and builds a two-turn
    conversation: the user turn is the question run through
    ``QUERY_TEMPLATE_NOANSWER``; the assistant turn is the reasoning trace
    followed by the answer, each introduced by its own ``<|im_start|>``
    marker line.

    Args:
        example: One dataset row containing the three fields above.
        tokenizer: Object exposing ``apply_chat_template``; it is called
            with ``tokenize=False``, so its result is stored as-is.

    Returns:
        A dict with the single key "text" holding the rendered conversation.
    """
    trace = example["deepseek_thinking_trajectory"]
    question = example["question"]
    attempt = example["deepseek_attempt"]

    user_turn = {
        "role": "user",
        "content": QUERY_TEMPLATE_NOANSWER.format(Question=question),
    }

    # Prefix the answer only when it does not already carry an "Answer:" tag.
    if "Answer:" not in attempt:
        attempt = "Answer: " + attempt

    thinking = "\n".join((trace,)).strip()
    assistant_body = "".join([
        "<|im_start|>think\n",
        thinking,
        "\n<|im_start|>answer\n",
        attempt.strip(),
    ])
    assistant_turn = {"role": "assistant", "content": assistant_body}

    rendered = tokenizer.apply_chat_template(
        [user_turn, assistant_turn],
        tokenize=False,
    )
    return {"text": rendered}

def mathcot_sft(
    upload_data_path: str,
    num_proc: int,
    download_data_path: str,
    tokenizer_name: str = "Qwen/Qwen2.5-32B-Instruct",
):
    """Build the chat-formatted SFT CSV from a local CSV or a Hub dataset.

    Each row is passed through ``process_cot_example`` to add a "text"
    column, and the result is written out as CSV. Despite the progress-bar
    label, no token ids are produced: ``process_cot_example`` calls the chat
    template with ``tokenize=False``.

    Args:
        upload_data_path: Output CSV path, resolved relative to this
            script's directory.
        num_proc: Number of worker processes passed to ``Dataset.map``.
        download_data_path: Either a CSV path (resolved relative to this
            script's directory) or an identifier handed to ``load_dataset``.
        tokenizer_name: Checkpoint passed to ``AutoTokenizer.from_pretrained``
            to obtain the chat template.
    """
    # Resolve paths relative to this script so the current working
    # directory does not matter.
    script_dir = os.path.dirname(os.path.abspath(__file__))
    csv_path = os.path.join(script_dir, download_data_path)
    output_path = os.path.join(script_dir, upload_data_path)

    # Local CSV file if present (or if the name says CSV); otherwise treat
    # the argument as a dataset id. isfile() rather than exists(): a local
    # directory that happens to sit at the same relative path as a dataset
    # id (e.g. a cloned dataset repo) must not be fed to the CSV loader.
    if os.path.isfile(csv_path) or download_data_path.lower().endswith(".csv"):
        dataset = load_dataset("csv", data_files={"train": csv_path})["train"]
    else:
        ds = load_dataset(download_data_path)  # e.g., "P-TTS/P_TTS-Full" (login if gated)
        if hasattr(ds, "keys"):  # mapping of split name -> dataset
            # Prefer the "train" split; otherwise fall back to the first one.
            split = "train" if "train" in ds.keys() else next(iter(ds.keys()))
            dataset = ds[split]
        else:  # already a single dataset
            dataset = ds

    # Render every row through the tokenizer's chat template.
    tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
    process_example_map = partial(process_cot_example, tokenizer=tokenizer)
    dataset = dataset.map(
        process_example_map,
        num_proc=num_proc,
        desc="Tokenizing SFT data",
    )

    # Make sure the destination directory exists before writing.
    os.makedirs(os.path.dirname(output_path), exist_ok=True)
    columns = ["deepseek_thinking_trajectory", "question", "deepseek_attempt", "text"]
    dataset.select_columns(columns).to_csv(output_path)


if __name__ == "__main__":
    # Script entry point: read the "P-TTS/P_TTS-Full" dataset and write the
    # chat-formatted CSV next to this script, using 20 worker processes.
    mathcot_sft(
        upload_data_path="Deepseek_900_32B_tokonized.csv",
        num_proc=20,
        download_data_path="P-TTS/P_TTS-Full",
    )