-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathdata_utils.py
More file actions
executable file
·187 lines (154 loc) · 8.33 KB
/
data_utils.py
File metadata and controls
executable file
·187 lines (154 loc) · 8.33 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
from PIL import Image, ImageDraw, ImageFont
import numpy as np
from datasets import Features, Value, Image as ImageFeature
class SingleImagePickAPanel:
    """Render a Pick-a-Panel example as one composite image.

    The composite has an optional "Context" row (with a "?" placeholder
    panel inserted when the batch carries an ``index`` column) and an
    "Options" row whose panels are numbered from 0 in their bottom-right
    corner. All panels of one example are resized to a common size.

    NOTE(review): panels are assumed to be PIL images (the code relies on
    ``.size``, ``.resize``, ``.mode``, ``.copy`` and ``.convert``) -- confirm
    against the dataset schema.
    """

    def __init__(self, max_size=500, margin=10, label_space=20, font_path=None):
        """Store layout settings.

        Args:
            max_size: Longest edge, in pixels, a panel may have before it is
                downscaled by ``resize_image``.
            margin: Horizontal gap in pixels between neighbouring panels.
            label_space: Vertical space in pixels reserved for each row label.
            font_path: Path to a TrueType font file; required.

        Raises:
            ValueError: If ``font_path`` is not given.
        """
        if font_path is None:
            raise ValueError("Font path must be provided. Testing was done with 'Arial.ttf'")
        self.max_size = max_size
        self.margin = margin
        self.label_space = label_space
        # Row labels and option numbers use separate font sizes.
        self.label_font_size = 20
        self.number_font_size = 24
        self.font_path = font_path

    def _load_font(self, size):
        """Load the configured TrueType font at ``size``.

        Raises:
            ValueError: If the font file cannot be opened (chained from the
                underlying ``OSError``).
        """
        try:
            return ImageFont.truetype(self.font_path, size)
        except OSError as err:
            raise ValueError("Font file not found") from err

    def resize_image(self, img):
        """Downscale ``img`` so its longest edge is at most ``max_size``.

        The aspect ratio is kept. Images that already fit are returned
        unchanged (same object).
        """
        longest_edge = max(img.size)
        if longest_edge <= self.max_size:
            return img
        ratio = self.max_size / longest_edge
        # Clamp to 1 px so extreme aspect ratios never yield a zero dimension.
        new_size = tuple(max(1, int(dim * ratio)) for dim in img.size)
        return img.resize(new_size, Image.Resampling.LANCZOS)

    def create_mask_panel(self, width, height):
        """Create a grey ``width`` x ``height`` panel with a centred "?".

        Raises:
            ValueError: If the font file cannot be opened.
        """
        mask_panel = Image.new("RGB", (width, height), (200, 200, 200))
        draw = ImageDraw.Draw(mask_panel)
        font = self._load_font(max(1, int(height * 0.8)))
        text = "?"
        left, top, right, bottom = draw.textbbox((0, 0), text, font=font)
        # textbbox reports where the ink lands relative to the drawing origin;
        # subtracting its left/top offsets centres the glyph itself.
        text_x = (width - (right - left)) // 2 - left
        text_y = (height - (bottom - top)) // 2 - top
        draw.text((text_x, text_y), text, fill="black", font=font)
        return mask_panel

    def draw_number_on_panel(self, panel, number, font):
        """Draw ``number`` on a white box in the bottom-right corner of ``panel``.

        The panel is modified in place and also returned.
        """
        draw = ImageDraw.Draw(panel)
        label = str(number)
        left, top, right, bottom = draw.textbbox((0, 0), label, font=font)
        text_width = right - left
        text_height = bottom - top
        # Top-left corner of the ink area, inset from the bottom-right corner.
        padding = 2
        box_x = panel.size[0] - text_width - padding
        box_y = panel.size[1] - text_height - padding
        # The fill carries alpha 180.
        # NOTE(review): ImageDraw appears to write RGBA values directly rather
        # than blending, so the alpha only matters when the caller pastes the
        # panel using itself as the mask -- confirm against Pillow docs.
        bg_rect = [(box_x - padding, box_y - padding),
                   (box_x + text_width + padding, box_y + text_height + padding)]
        draw.rectangle(bg_rect, fill=(255, 255, 255, 180))
        # Compensate for the bbox origin offset so the digit sits inside the
        # box instead of spilling past the panel's bottom edge.
        draw.text((box_x - left, box_y - top), label, fill="black", font=font)
        return panel

    def _draw_centered_label(self, draw, text, canvas_width, y, font):
        """Draw ``text`` horizontally centred on the canvas at vertical position ``y``."""
        left, _, right, _ = draw.textbbox((0, 0), text, font=font)
        text_x = (canvas_width - (right - left)) // 2 - left
        draw.text((text_x, y), text, fill="black", font=font)

    def _compose_canvas(self, context, options, panel_width, panel_height,
                        label_font, number_font):
        """Lay out equally sized context and option panels on a white canvas."""

        def row_width(count):
            return panel_width * count + self.margin * (count - 1)

        options_row_width = row_width(len(options))
        if context:
            canvas_width = max(row_width(len(context)), options_row_width)
            canvas_height = panel_height * 2 + self.label_space * 2
        else:
            # Only the options row is drawn when there is no context.
            canvas_width = options_row_width
            canvas_height = panel_height + self.label_space

        final_image = Image.new("RGB", (canvas_width, canvas_height), "white")
        draw = ImageDraw.Draw(final_image)
        current_y = 0

        if context:
            self._draw_centered_label(draw, "Context", canvas_width, current_y, label_font)
            current_y += self.label_space
            x_offset = (canvas_width - row_width(len(context))) // 2
            for panel in context:
                final_image.paste(panel, (x_offset, current_y))
                x_offset += panel_width + self.margin
            current_y += panel_height

        self._draw_centered_label(draw, "Options", canvas_width, current_y, label_font)
        current_y += self.label_space

        x_offset = (canvas_width - options_row_width) // 2
        for idx, panel in enumerate(options):
            # Work on an RGBA copy so the source panel is never modified.
            if panel.mode != 'RGBA':
                numbered = panel.convert('RGBA')
            else:
                numbered = panel.copy()
            numbered = self.draw_number_on_panel(numbered, idx, number_font)
            final_image.paste(numbered, (x_offset, current_y), numbered)
            x_offset += panel_width + self.margin
        return final_image

    def map_to_single_image(self, examples):
        """Process a batch of examples from a HuggingFace dataset.

        Args:
            examples: Batch mapping with ``'context'`` and ``'options'``
                columns (one list of panel images per example) and,
                optionally, an ``'index'`` column giving the position at
                which a "?" panel is inserted into a non-empty context.

        Returns:
            The same ``examples`` mapping with a ``'single_image'`` column
            holding one composed image per example.

        Raises:
            ValueError: If an example has neither context nor option panels,
                or if the font file cannot be opened.
        """
        contexts = examples['context']
        option_sets = examples['options']
        mask_indices = examples['index'] if 'index' in examples else None

        # Fonts are identical for every example: load them once, and only if
        # the batch actually contains something to render.
        label_font = None
        number_font = None

        single_images = []
        for i, (raw_context, raw_options) in enumerate(zip(contexts, option_sets)):
            context = [self.resize_image(img)
                       for img in (raw_context if raw_context is not None else [])]
            options = [self.resize_image(img)
                       for img in (raw_options if raw_options is not None else [])]

            all_panels = context + options
            if not all_panels:
                raise ValueError(
                    f"Example {i} in the batch has no context or option panels to render"
                )

            # The median size is used as the common panel size to avoid outliers.
            panel_width = int(np.median([img.size[0] for img in all_panels]))
            panel_height = int(np.median([img.size[1] for img in all_panels]))
            panel_size = (panel_width, panel_height)
            context = [img.resize(panel_size) for img in context]
            options = [img.resize(panel_size) for img in options]

            # Insert the "?" placeholder for sequence-filling style tasks.
            if mask_indices is not None and context:
                context.insert(mask_indices[i],
                               self.create_mask_panel(panel_width, panel_height))

            if label_font is None:
                label_font = self._load_font(self.label_font_size)
                number_font = self._load_font(self.number_font_size)

            single_images.append(
                self._compose_canvas(context, options, panel_width, panel_height,
                                     label_font, number_font)
            )

        # Prepare batch output
        examples['single_image'] = single_images
        return examples
if __name__ == "__main__":
    from datasets import load_dataset

    # Task to render; one of:
    # "sequence_filling", "char_coherence", "visual_closure", "text_closure", "caption_relevance"
    skill = "sequence_filling"
    # Split to process: "val" or "test"
    split = "val"
    output_dir = f"ComicsPAP_{skill}_{split}_single_images"

    source_dataset = load_dataset("VLR-CVC/ComicsPAP", skill, split=split)
    panel_renderer = SingleImagePickAPanel(font_path="save/Arial.ttf")

    # Compose one image per example and drop the raw panel columns.
    map_options = {
        "batched": True,
        "batch_size": 32,
        "remove_columns": ['context', 'options'],
    }
    rendered_dataset = source_dataset.map(
        panel_renderer.map_to_single_image,
        **map_options,
    )
    rendered_dataset.save_to_disk(output_dir)