# convert_mmsd2_to_imagefolder_data.py
# Convert MMSD / MMSD 2.0 splits into an "imagefolder" layout (images plus a
# metadata.jsonl per split) and publish the result to the Hugging Face Hub.
import ast
import json
import shutil
import typing as t
from pathlib import Path

import datasets as dt
import jsonlines
from tqdm import tqdm
# Names of the dataset flavours accepted by the functions in this script.
VERSION_TYPE = t.Literal["mmsd-v1", "mmsd-v2", "mmsd-clean"]

# Root directory of the dataset checkout; placeholder path, edit before running.
MMSD2_DATASET_DIR: Path = Path("/path/to/mmsd2.0")

# Version name -> sub-directory of MMSD2_DATASET_DIR holding its raw split files.
# NOTE(review): "mmsd-clean" is listed in VERSION_TYPE but has no entry here,
# so looking it up raises KeyError -- confirm whether it should be added.
VERSION_NAME_MAP: dict[str, str] = {
    "mmsd-v1": "original",
    "mmsd-v2": "text_json_final",
}
# Whitespace-separated tokens that cause a sample to be dropped when present
# in its text.
_FILTERED_TOKENS = frozenset(
    {
        "sarcasm",
        "sarcastic",
        "reposting",
        "<url>",
        "joke",
        "humour",
        "humor",
        "jokes",
        "irony",
        "ironic",
        "exgag",
    }
)


def read_orignal(split_path: Path) -> list[dict[str, t.Any]]:
    """Read one split file in the original text format and filter its samples.

    Each non-blank line must be a Python literal sequence whose first item is
    the image id, whose second item is the text and whose last item is the
    label. Samples whose text contains any token from ``_FILTERED_TOKENS``
    (exact match on whitespace-separated tokens) are skipped.

    Args:
        split_path: Path of the split text file to read.

    Returns:
        One ``{"image_id", "text", "label"}`` dict per kept sample, in file
        order, with ``label`` converted to ``int``.

    Raises:
        ValueError: If a line is not a plain Python literal.
        SyntaxError: If a line is not syntactically valid Python.
    """
    all_data = []
    for line in split_path.read_text().splitlines():
        # Blank lines (e.g. a trailing empty line) carry no sample.
        if not line.strip():
            continue
        # SECURITY: lines were previously passed to eval(), which executes
        # arbitrary code from the data file. literal_eval only accepts
        # literals (lists, strings, numbers, ...), which is all a sample needs.
        data = ast.literal_eval(line)
        if not _FILTERED_TOKENS.isdisjoint(data[1].split()):
            continue
        all_data.append({"image_id": data[0], "text": data[1], "label": int(data[-1])})
    return all_data
def convert(version: VERSION_TYPE) -> None:
    """Convert one dataset version into an ``imagefolder``-style directory tree.

    For each of the ``test``, ``valid`` and ``train`` splits this creates
    ``MMSD2_DATASET_DIR / f"{version}-converted" / split``, copies every
    referenced image that exists under ``MMSD2_DATASET_DIR / "images"`` into
    it, and writes a ``metadata.jsonl`` holding one
    ``file_name`` / ``text`` / ``label`` / ``id`` record per copied image.

    Args:
        version: Dataset flavour to convert; must be a key of
            ``VERSION_NAME_MAP``.

    Raises:
        KeyError: If ``version`` has no entry in ``VERSION_NAME_MAP``.
    """
    data_dir = MMSD2_DATASET_DIR / VERSION_NAME_MAP[version]
    images_dir = MMSD2_DATASET_DIR / "images"

    # create converted data dir
    converted_data_dir = MMSD2_DATASET_DIR / f"{version}-converted"
    converted_data_dir.mkdir(exist_ok=True, parents=True)

    for split in ["test", "valid", "train"]:
        # create split dir
        split_dir = converted_data_dir / split
        split_dir.mkdir(exist_ok=True, parents=True)

        # load the raw samples for this split
        if version == "mmsd-v1":
            # The v1 test/valid files are named "test2.txt" / "valid2.txt".
            # Keep the output directory name (split) separate from the source
            # file stem instead of overwriting the loop variable.
            source_split = f"{split}2" if split != "train" else split
            data = read_orignal(data_dir / f"{source_split}.txt")
        else:
            source_split = split
            data = json.loads(
                (data_dir / f"{split}.json").read_text(encoding="utf-8")
            )

        # copy images and write metadata; the context manager guarantees the
        # metadata file is flushed and closed even if a copy fails
        metadata_path = split_dir / "metadata.jsonl"
        with metadata_path.open("w", encoding="utf-8") as metadata_file:
            metadata_writer = jsonlines.Writer(metadata_file)
            for d in tqdm(data, desc=f"Converting {version} {source_split} data"):
                image_id = d["image_id"]
                image_path = images_dir / f"{image_id}.jpg"
                # Samples whose image is not on disk are skipped (best effort).
                if not image_path.exists():
                    continue
                metadata_writer.write(
                    {
                        "file_name": f"{image_id}.jpg",
                        "text": d["text"],
                        "label": d["label"],
                        "id": str(image_id),
                    }
                )
                shutil.copy(image_path, split_dir / f"{image_id}.jpg")
def publish(version: VERSION_TYPE, repo_id: str, commit_message: str = "") -> None:
    """Load a converted dataset directory and push it to the Hugging Face Hub.

    The directory ``MMSD2_DATASET_DIR / f"{version}-converted"`` is loaded with
    the ``imagefolder`` builder and pushed to ``repo_id`` under the config
    name ``version``.

    Args:
        version: Dataset flavour whose converted directory is published.
        repo_id: Target Hub repository, e.g. ``"<username>/<repo-id>"``.
        commit_message: Commit message for the push. An empty string (the
            default) is passed on as ``None`` so that ``push_to_hub`` falls
            back to its own default message.
    """
    converted_data_dir = MMSD2_DATASET_DIR / f"{version}-converted"
    # Without a ``split`` argument load_dataset returns a DatasetDict.
    dataset = t.cast(
        dt.DatasetDict,
        dt.load_dataset("imagefolder", data_dir=str(converted_data_dir)),
    )
    # An empty commit message is rejected when the commit is created, so map
    # "" to None and let the library supply its default.
    dataset.push_to_hub(
        repo_id,
        config_name=version,
        commit_message=commit_message or None,
    )
if __name__ == "__main__":
    # Build the imagefolder trees first, then upload each one as its own config.
    for version_name in ("mmsd-v1", "mmsd-v2"):
        convert(version_name)

    target_repo = "<username>/<repo-id>"
    publish("mmsd-v1", target_repo, commit_message="add mmsd-v1")
    publish("mmsd-v2", target_repo)