Chg2Cap/preprocess_data.py at main · ShizhenChang/Chg2Cap · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
import sys
import os
sys.path.insert(0, os.path.abspath('.'))
import json
import argparse
import numpy as np

# Command-line interface. The defaults target LEVIR_CC; the alternative
# defaults that were previously used for Dubai_CC are kept here for reference:
#   --dataset Dubai_CC --word_count_threshold 0
parser = argparse.ArgumentParser()
parser.add_argument(
    '--dataset',
    type=str,
    default='LEVIR_CC',
    help='the name of the dataset',
)
parser.add_argument(
    '--word_count_threshold',
    type=int,
    default=5,
)

# Reserved vocabulary entries and their fixed indices; build_vocab() seeds the
# vocabulary with these before assigning indices to corpus tokens.
SPECIAL_TOKENS = dict([
    ('<NULL>', 0),
    ('<UNK>', 1),
    ('<START>', 2),
    ('<END>', 3),
])

def _dataset_paths(dataset):
    """Return ``(captions_path, save_dir)`` for a supported dataset name.

    For LEVIR_CC the captions path is a single JSON file; for Dubai_CC it is a
    directory of JSON files. Raises ValueError for any other name so that the
    caller fails before touching the file system.
    """
    if dataset == 'LEVIR_CC':
        return '/root/Data/LEVIR_CC/LevirCCcaptions.json', './data/LEVIR_CC/'
    if dataset == 'Dubai_CC':
        return (
            '/root/Data/Dubai_CC/DubaiCC500impair/datasetDubaiCCPublic/description_jsontr_te_val/',
            './data/Dubai_CC/',
        )
    raise ValueError(
        "unsupported dataset %r (expected 'LEVIR_CC' or 'Dubai_CC')" % (dataset,))


def _tokenize_images(images):
    """Tokenize every caption of every image record.

    Each record must provide ``'filename'`` and ``'sentences'`` (a list of
    dicts with a ``'raw'`` caption string).

    Returns ``(entries, max_length)`` where ``entries`` is a list of
    ``(filename, tokens_list)`` tuples and ``max_length`` is the longest
    token sequence seen (``-1`` when there are no captions at all).
    """
    entries = []
    longest = -1
    for img in images:
        tokens_list = []
        for sentence in img['sentences']:
            raw = sentence['raw']
            if len(raw) == 0:
                # Explicit raise instead of assert: still checked under -O.
                raise ValueError('error: some image has no caption')
            cap_tokens = tokenize(raw,
                                  add_start_token=True,
                                  add_end_token=True,
                                  punct_to_keep=[';', ','],
                                  punct_to_remove=['?', '.'])
            tokens_list.append(cap_tokens)
            longest = max(longest, len(cap_tokens))
        entries.append((img['filename'], tokens_list))
    return entries, longest


def _write_tokens(tokens_dir, img_name, tokens_list):
    """Dump one image's tokenized captions as JSON; return the file stem."""
    stem = img_name.split('.')[0]
    with open(os.path.join(tokens_dir, stem + '.txt'), 'w') as f:
        f.write(json.dumps(tokens_list))
    return stem


def _add_to_split(split_lines, split, img_name, num_captions):
    """Queue the list-file lines for one image under the given split.

    Considering each image pair has several annotations, two strategies can be
    adopted to generate the training list:
      a: one entry per caption (``<filename>-<caption index>``), so each
         entry corresponds to a specific caption;
      b: one entry per image, randomly selecting a caption during training.
    Strategy (a) is used for 'train'; 'val' and 'test' get one entry per
    image. Any other split name is ignored.
    """
    if split == 'train':
        split_lines['train'].extend(
            img_name + '-' + str(k) + '\n' for k in range(num_captions))
    elif split in ('val', 'test'):
        split_lines[split].append(img_name + '\n')


def main(args):
    """Tokenize the captions of a dataset and write tokens, split lists and vocab.

    Args:
        args: namespace with ``dataset`` ('LEVIR_CC' or 'Dubai_CC') and
            ``word_count_threshold`` (minimum count for a token to enter the
            vocabulary).

    Outputs, all under the dataset's save directory:
        * ``tokens/<stem>.txt`` - JSON list of token lists, one per caption;
        * ``train.txt`` / ``val.txt`` / ``test.txt`` - split lists;
        * ``vocab.json`` - token -> index mapping.

    The split lists are rewritten (not appended to) on every run, so running
    the script twice no longer duplicates entries.

    Raises:
        ValueError: if the dataset name is unknown or a caption is empty.
    """
    # Validate the dataset name before creating any directories.
    input_captions_json, save_dir = _dataset_paths(args.dataset)
    input_vocab_json = ''
    output_vocab_json = 'vocab.json'

    tokens_dir = os.path.join(save_dir, 'tokens')
    os.makedirs(tokens_dir, exist_ok=True)  # also creates save_dir itself
    print('Loading captions')

    max_length = -1
    all_cap_tokens = []
    split_lines = {'train': [], 'val': [], 'test': []}

    if args.dataset == 'LEVIR_CC':
        with open(input_captions_json, 'r') as f:
            data = json.load(f)
        entries, max_length = _tokenize_images(data['images'])
        all_cap_tokens.extend(entries)

        print('Saving captions')
        for img_name, tokens_list in entries:
            stem = _write_tokens(tokens_dir, img_name, tokens_list)
            # The split is encoded as the filename prefix, e.g. 'train_...'.
            _add_to_split(split_lines, stem.split('_')[0],
                          img_name, len(tokens_list))
    else:  # 'Dubai_CC': one JSON file per split inside a directory
        file_prefix_to_split = {'Train': 'train', 'Validation': 'val', 'Test': 'test'}
        # Sorted so the order of the generated list files is reproducible.
        for json_name in sorted(os.listdir(input_captions_json)):
            with open(os.path.join(input_captions_json, json_name), 'r') as f:
                data = json.load(f)
            entries, longest = _tokenize_images(data['images'])
            max_length = max(max_length, longest)
            all_cap_tokens.extend(entries)

            # Here the split comes from the JSON file name, not the image name.
            split = file_prefix_to_split.get(json_name.split('_')[0])
            print('Saving captions')
            for img_name, tokens_list in entries:
                _write_tokens(tokens_dir, img_name, tokens_list)
                _add_to_split(split_lines, split, img_name, len(tokens_list))

    for split, lines in split_lines.items():
        if lines:
            with open(os.path.join(save_dir, split + '.txt'), 'w') as f:
                f.writelines(lines)

    print('max_length of the dataset:', max_length)
    # Either create the vocab or load it from disk
    if input_vocab_json == '':
        print('Building vocab')
        word_freq = build_vocab(all_cap_tokens, args.word_count_threshold)
    else:
        print('Loading vocab')
        with open(input_vocab_json, 'r') as f:
            word_freq = json.load(f)
    if output_vocab_json != '':
        with open(os.path.join(save_dir, output_vocab_json), 'w') as f:
            json.dump(word_freq, f)


def tokenize(s, delim=' ',add_start_token=True,
    add_end_token=True,punct_to_keep=None, punct_to_remove=None):
    """
    Tokenize a sequence, converting a string s into a list of (string) tokens by
    splitting on the specified delimiter. Optionally keep or remove certain
    punctuation marks and add start and end tokens.

    Args:
        s: the string to tokenize.
        delim: delimiter to split on.
        add_start_token: if true, prepend '<START>'.
        add_end_token: if true, append '<END>'.
        punct_to_keep: punctuation strings to keep as tokens; each occurrence
            gets a delimiter inserted in front of it (none is inserted after).
        punct_to_remove: punctuation strings deleted from s before splitting.

    Returns:
        The list of tokens. Empty tokens, produced by leading, trailing or
        repeated delimiters, are dropped; an empty or delimiter-only string
        yields just the requested start/end tokens.
    """
    if punct_to_keep is not None:
        for p in punct_to_keep:
            s = s.replace(p, '%s%s' % (delim, p))

    if punct_to_remove is not None:
        for p in punct_to_remove:
            s = s.replace(p, '')

    # Build a new list rather than removing items while iterating: in-place
    # removal skips elements and left '' tokens behind for repeated delimiters.
    tokens = [tok for tok in s.split(delim) if tok != '']
    if add_start_token:
        tokens.insert(0, '<START>')
    if add_end_token:
        tokens.append('<END>')
    return tokens

def build_vocab(sequences, min_token_count=1):
    """Count token occurrences and assign each kept token a vocabulary index.

    Args:
        sequences: iterable of items whose element at index 1 is a list of
            token sequences (one sequence per caption).
        min_token_count: a token needs at least this many occurrences to be
            given an index.

    Returns:
        Dict mapping token -> index. It starts with the SPECIAL_TOKENS entries;
        the remaining tokens follow in sorted order, each receiving the current
        size of the mapping as its index.
    """
    counts = {}
    for entry in sequences:
        for caption in entry[1]:
            for tok in caption:
                counts[tok] = counts.get(tok, 0) + 1

    vocab = dict(SPECIAL_TOKENS)
    for tok in sorted(counts):
        # Special tokens keep their reserved index even if they occur in the data.
        if tok not in vocab and counts[tok] >= min_token_count:
            vocab[tok] = len(vocab)

    return vocab

def encode(seq_tokens, token_to_idx, allow_unk=False):
    """Map a sequence of tokens to their vocabulary indices.

    Args:
        seq_tokens: iterable of tokens to look up.
        token_to_idx: mapping token -> index.
        allow_unk: when true, tokens missing from the mapping are encoded
            with the index of '<UNK>' instead of raising.

    Returns:
        List of indices, one per input token.

    Raises:
        KeyError: if a token is missing and allow_unk is false (or '<UNK>'
            itself is not in the mapping).
    """
    indices = []
    for tok in seq_tokens:
        if tok in token_to_idx:
            indices.append(token_to_idx[tok])
        elif allow_unk:
            indices.append(token_to_idx['<UNK>'])
        else:
            raise KeyError('Token "%s" not in vocab' % tok)
    return indices

if __name__ == '__main__':
    # Script entry point: parse the command-line options and preprocess.
    main(parser.parse_args())