ComicsPAP/data_utils.py at main · llabres/ComicsPAP · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
from PIL import Image, ImageDraw, ImageFont
import numpy as np
from datasets import Features, Value, Image as ImageFeature

class SingleImagePickAPanel:
    """Compose the context and option panels of each sample into one image.

    The composed image has an optional "Context" row (with a "?" placeholder
    inserted at the masked position when the batch carries an ``index``
    column) followed by an "Options" row whose panels are numbered from 0.
    """

    def __init__(self, max_size=500, margin=10, label_space=20, font_path=None):
        """Store the layout settings.

        Args:
            max_size: Longest allowed edge, in pixels, of a panel before it is
                downscaled by ``resize_image``.
            margin: Horizontal gap, in pixels, between neighbouring panels.
            label_space: Vertical space, in pixels, reserved for each of the
                "Context" / "Options" row labels.
            font_path: Path to a TrueType font file used for all text.

        Raises:
            ValueError: If ``font_path`` is not given.
        """
        if font_path is None:
            raise ValueError("Font path must be provided. Testing was done with 'Arial.ttf'")
        self.max_size = max_size
        self.margin = margin
        self.label_space = label_space
        # Separate font sizes for the row labels and the option numbers.
        self.label_font_size = 20
        self.number_font_size = 24

        self.font_path = font_path

    def _load_font(self, size):
        """Load the configured TrueType font at ``size`` points.

        Raises:
            ValueError: If the font file cannot be opened or read.
        """
        try:
            return ImageFont.truetype(self.font_path, size)
        except OSError as err:
            # Keep the original cause attached instead of discarding it.
            raise ValueError("Font file not found") from err

    def _row_width(self, panel_count, panel_width):
        """Return the pixel width of a row of panels separated by margins."""
        return panel_width * panel_count + self.margin * (panel_count - 1)

    def _draw_centered_label(self, draw, text, font, canvas_width, y):
        """Draw ``text`` horizontally centred on the canvas at height ``y``."""
        bbox = draw.textbbox((0, 0), text, font=font)
        text_x = (canvas_width - (bbox[2] - bbox[0])) // 2
        draw.text((text_x, y), text, fill="black", font=font)

    def resize_image(self, img):
        """Downscale ``img`` so its longest edge is at most ``max_size``.

        The aspect ratio is kept. Images that already fit are returned
        unchanged (same object).
        """
        longest_edge = max(img.size)
        if longest_edge <= self.max_size:
            return img
        ratio = self.max_size / longest_edge
        # Clamp to 1 px: truncation can otherwise produce a zero-sized edge
        # for images with an extreme aspect ratio, which resize() rejects.
        new_size = tuple(max(1, int(dim * ratio)) for dim in img.size)
        return img.resize(new_size, Image.Resampling.LANCZOS)

    def create_mask_panel(self, width, height):
        """Create a grey ``width`` x ``height`` panel with a centred "?".

        Raises:
            ValueError: If the font file cannot be loaded.
        """
        mask_panel = Image.new("RGB", (width, height), (200, 200, 200))
        draw = ImageDraw.Draw(mask_panel)
        # The glyph is sized relative to the panel; keep the size positive.
        font = self._load_font(max(1, int(height * 0.8)))

        text = "?"
        left, top, right, bottom = draw.textbbox((0, 0), text, font=font)
        # textbbox() reports the glyph box relative to the drawing origin, so
        # the (left, top) offset must be subtracted for the glyph itself,
        # rather than the drawing origin, to end up centred.
        text_x = (width - (right - left)) // 2 - left
        text_y = (height - (bottom - top)) // 2 - top
        draw.text((text_x, text_y), text, fill="black", font=font)
        return mask_panel

    def draw_number_on_panel(self, panel, number, font):
        """Draw ``number`` on a white box in the bottom-right corner.

        The panel is modified in place and also returned.
        """
        draw = ImageDraw.Draw(panel)
        label = str(number)

        # Measure the rendered glyph box.
        left, top, right, bottom = draw.textbbox((0, 0), label, font=font)
        text_width = right - left
        text_height = bottom - top

        # Top-left corner of the glyph box, inset from the bottom-right corner.
        padding = 2
        box_x = panel.size[0] - text_width - padding
        box_y = panel.size[1] - text_height - padding

        # Background box. The alpha component only matters when the panel is
        # RGBA and is later composited through its own alpha channel.
        bg_rect = [(box_x - padding, box_y - padding),
                   (box_x + text_width + padding, box_y + text_height + padding)]
        draw.rectangle(bg_rect, fill=(255, 255, 255, 180))

        # Shift the origin by the bbox offset so the glyph lands inside the
        # background box instead of spilling past the bottom edge.
        draw.text((box_x - left, box_y - top), label, fill="black", font=font)
        return panel

    def _compose_canvas(self, context, options, panel_size, label_font, number_font):
        """Lay out equally sized context and option panels on a white canvas."""
        panel_width, panel_height = panel_size
        context_row_width = self._row_width(len(context), panel_width)
        options_row_width = self._row_width(len(options), panel_width)

        if context:
            # Two labelled rows: context on top, options below.
            canvas_width = max(context_row_width, options_row_width)
            canvas_height = panel_height * 2 + self.label_space * 2
        else:
            # Only the options row (e.g. caption_relevance).
            canvas_width = options_row_width
            canvas_height = panel_height + self.label_space

        final_image = Image.new("RGB", (canvas_width, canvas_height), "white")
        draw = ImageDraw.Draw(final_image)
        current_y = 0

        if context:
            self._draw_centered_label(draw, "Context", label_font, canvas_width, current_y)
            current_y += self.label_space

            x_offset = (canvas_width - context_row_width) // 2
            for panel in context:
                final_image.paste(panel, (x_offset, current_y))
                x_offset += panel_width + self.margin
            current_y += panel_height

        self._draw_centered_label(draw, "Options", label_font, canvas_width, current_y)
        current_y += self.label_space

        x_offset = (canvas_width - options_row_width) // 2
        for idx, panel in enumerate(options):
            # Work on an RGBA copy so the caller's panel is left untouched.
            if panel.mode != 'RGBA':
                numbered = panel.convert('RGBA')
            else:
                numbered = panel.copy()
            numbered = self.draw_number_on_panel(numbered, idx, number_font)

            # Paste through the panel's own alpha channel.
            final_image.paste(numbered, (x_offset, current_y), numbered)
            x_offset += panel_width + self.margin

        return final_image

    def map_to_single_image(self, examples):
        """Process a batch of examples from a HuggingFace dataset.

        Args:
            examples: Batch mapping with the columns ``'sample_id'``,
                ``'context'`` and ``'options'`` (lists of images per sample)
                and optionally ``'index'`` (position of the masked panel in
                the context row).

        Returns:
            The same mapping with a ``'single_image'`` column added. It holds
            one entry per sample; the entry is ``None`` for a sample that has
            neither context nor option panels, so the column always stays
            aligned with the rest of the batch.

        Raises:
            ValueError: If the font file cannot be loaded.
        """
        single_images = []
        has_index = 'index' in examples
        fonts = None  # Loaded once, on first use, instead of once per sample.

        for i, _ in enumerate(examples['sample_id']):
            raw_context = examples['context'][i]
            raw_options = examples['options'][i]
            if raw_context is None:
                raw_context = []

            # Cap the size of every panel first.
            context = [self.resize_image(img) for img in raw_context]
            options = [self.resize_image(img) for img in raw_options]

            all_panels = context + options
            if not all_panels:
                # Nothing to draw; keep the output aligned with the batch.
                single_images.append(None)
                continue

            # Common panel size: the median avoids being skewed by outliers.
            panel_size = (
                int(np.median([img.size[0] for img in all_panels])),
                int(np.median([img.size[1] for img in all_panels])),
            )
            context = [img.resize(panel_size) for img in context]
            options = [img.resize(panel_size) for img in options]

            # Sequence-filling style tasks: mark the missing panel with "?".
            if has_index and context:
                context.insert(examples['index'][i], self.create_mask_panel(*panel_size))

            if fonts is None:
                fonts = (
                    self._load_font(self.label_font_size),
                    self._load_font(self.number_font_size),
                )

            single_images.append(
                self._compose_canvas(context, options, panel_size, *fonts)
            )

        # Prepare batch output
        examples['single_image'] = single_images

        return examples


if __name__ == "__main__":
    # Script entry point: build the single-image version of one ComicsPAP
    # skill/split and store it on disk.
    from datasets import load_dataset

    # Skills: "sequence_filling", "char_coherence", "visual_closure",
    # "text_closure", "caption_relevance"
    skill = "sequence_filling"
    # Splits: "val", "test"
    split = "val"

    raw_dataset = load_dataset("VLR-CVC/ComicsPAP", skill, split=split)
    panel_composer = SingleImagePickAPanel(font_path="save/Arial.ttf")

    # The original panel columns are dropped; only the composed image is kept.
    map_options = {
        "batched": True,
        "batch_size": 32,
        "remove_columns": ['context', 'options'],
    }
    single_image_dataset = raw_dataset.map(
        panel_composer.map_to_single_image,
        **map_options,
    )

    output_dir = f"ComicsPAP_{skill}_{split}_single_images"
    single_image_dataset.save_to_disk(output_dir)