From 19509f51b941fd53f19cf3e50aa62a3f54896ed5 Mon Sep 17 00:00:00 2001 From: weiyaowu-md <15202186219@163.com> Date: Sun, 10 Jul 2022 23:04:26 +0800 Subject: [PATCH 1/2] implement crnn --- apis/train.py | 7 +- data/__init__.py | 3 +- data/build.py | 28 ++++ data/dataset/dataset_wrapper.py | 144 ++++++++++++++++++ data/dataset/lmdb_dataset.py | 137 +++++++++++++++++ modeling/backbone/very_deep_vgg.py | 94 ++++++++++++ modeling/decoders/crnn_decode.py | 103 +++++++++++++ modeling/losses/ctc_loss.py | 103 +++++++++++++ modeling/recognizers/__init__.py | 0 modeling/recognizers/crnn.py | 94 ++++++++++++ modeling/recognizers/str_label_converter.py | 42 +++++ .../crnn_text_recognizer_toy.yaml | 0 12 files changed, 753 insertions(+), 2 deletions(-) create mode 100644 data/dataset/dataset_wrapper.py create mode 100644 data/dataset/lmdb_dataset.py create mode 100644 modeling/backbone/very_deep_vgg.py create mode 100644 modeling/decoders/crnn_decode.py create mode 100644 modeling/losses/ctc_loss.py create mode 100644 modeling/recognizers/__init__.py create mode 100644 modeling/recognizers/crnn.py create mode 100644 modeling/recognizers/str_label_converter.py create mode 100644 yamls/text_recognizer/crnn_text_recognizer_toy.yaml diff --git a/apis/train.py b/apis/train.py index 6c69158..441ee59 100755 --- a/apis/train.py +++ b/apis/train.py @@ -12,6 +12,7 @@ import time import os from detectron2.data import build_detection_test_loader +from data import build_lmdb_recognizer_train_loader, build_lmdb_recognizer_test_loader from detectron2.engine.defaults import DefaultTrainer from detectron2.utils import comm @@ -24,7 +25,7 @@ from detectron2.modeling import build_model -from data import DatasetMapper, build_detection_train_loader +from data import DatasetMapper, build_detection_train_loader, lmdb_dataset from torchtools.optim import RangerLars from solver import WarmupCosineAnnealingLR from detectron2.solver import build_lr_scheduler, build_optimizer @@ -106,10 +107,14 @@ def build_model(cls, cfg): @classmethod def build_test_loader(cls, cfg, dataset_name): + if cfg.DATASETS.TYPE == "CRNN": + return build_lmdb_recognizer_test_loader(cfg) return build_detection_test_loader(cfg, dataset_name, mapper=DatasetMapper(cfg, False)) @classmethod def build_train_loader(cls, cfg): + if cfg.DATASETS.TYPE == "CRNN": + return build_lmdb_recognizer_train_loader(cfg) return build_detection_train_loader(cfg, mapper=DatasetMapper(cfg, True)) @classmethod diff --git a/data/__init__.py b/data/__init__.py index 07398bc..45cfd49 100644 --- a/data/__init__.py +++ b/data/__init__.py @@ -3,4 +3,5 @@ from .dataset_mapper import DatasetMapper from .transforms import * from .dataset import * -from .build import build_detection_train_loader \ No newline at end of file +from .build import build_detection_train_loader, build_lmdb_recognizer_train_loader, build_lmdb_recognizer_test_loader +from .dataset import lmdb_dataset \ No newline at end of file diff --git a/data/build.py b/data/build.py index 0a30da5..4483228 100644 --- a/data/build.py +++ b/data/build.py @@ -21,6 +21,8 @@ from detectron2.data.detection_utils import check_metadata_consistency from detectron2.data.samplers import InferenceSampler, RepeatFactorTrainingSampler, TrainingSampler +from .dataset import lmdb_dataset + """ This file contains the default logic to build a dataloader for training or testing. """ @@ -32,6 +34,8 @@ "get_detection_dataset_dicts", "load_proposals_into_dataset", "print_instances_class_histogram", + "build_lmdb_recognizer_train_loader", + "build_lmdb_recognizer_test_loader", ] @@ -339,6 +343,30 @@ def build_detection_test_loader(cfg, dataset_name, mapper=None): ) return data_loader +def build_lmdb_recognizer_train_loader(cfg): + train_dataset = lmdb_dataset.lmdbDataset(root=cfg.DATASETS.TRAIN_ROOT) + sampler = None + batch_size = cfg.SOLVER.IMS_PER_BATCH + if cfg.DATASETS.RANDOM_SAMPLE: + sampler = train_dataset.randomSequentialSampler(train_dataset, batch_size) + + train_loader = torch.utils.data.DataLoader( + train_dataset, batch_size=batch_size, + shuffle=True, sampler=sampler, + num_workers=int(cfg.SOLVER.WORKERS), + collate_fn=lmdb_dataset.alignCollate(imgH=cfg.INPUT.IMG_H, imgW=cfg.INPUT.IMG_W, keep_ratio=cfg.INPUT.KEEP_RATIO)) + + return train_loader + +def build_lmdb_recognizer_test_loader(cfg): + test_dataset = lmdb_dataset.lmdbDataset( + root=cfg.DATASETS.TEST_ROOT, transform=lmdb_dataset.resizeNormalize((100, 32))) + + batch_size = cfg.SOLVER.IMS_PER_BATCH + test_loader = torch.utils.data.DataLoader( + test_dataset, shuffle=True, batch_size=batch_size, num_workers=int(cfg.SOLVER.WORKERS)) + + return test_loader def trivial_batch_collator(batch): """ diff --git a/data/dataset/dataset_wrapper.py b/data/dataset/dataset_wrapper.py new file mode 100644 index 0000000..0b31c39 --- /dev/null +++ b/data/dataset/dataset_wrapper.py @@ -0,0 +1,144 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import bisect +import collections +import copy +import math +from collections import defaultdict +from torch.utils.data.dataset import ConcatDataset as _ConcatDataset + +import numpy as np + +class ConcatDataset(_ConcatDataset): + """A wrapper of concatenated dataset. + + Same as :obj:`torch.utils.data.dataset.ConcatDataset`, but + concat the group flag for image aspect ratio. + + Args: + datasets (list[:obj:`Dataset`]): A list of datasets. + separate_eval (bool): Whether to evaluate the results + separately if it is used as validation dataset. + Defaults to True. + """ + + def __init__(self, datasets, separate_eval=True): + super(ConcatDataset, self).__init__(datasets) + self.CLASSES = datasets[0].CLASSES + self.PALETTE = getattr(datasets[0], 'PALETTE', None) + self.separate_eval = separate_eval + # if not separate_eval: + # if any([isinstance(ds, CocoDataset) for ds in datasets]): + # raise NotImplementedError( + # 'Evaluating concatenated CocoDataset as a whole is not' + # ' supported! Please set "separate_eval=True"') + # elif len(set([type(ds) for ds in datasets])) != 1: + # raise NotImplementedError( + # 'All the datasets should have same types') + + if hasattr(datasets[0], 'flag'): + flags = [] + for i in range(0, len(datasets)): + flags.append(datasets[i].flag) + self.flag = np.concatenate(flags) + + def get_cat_ids(self, idx): + """Get category ids of concatenated dataset by index. + + Args: + idx (int): Index of data. + + Returns: + list[int]: All categories in the image of specified index. + """ + + if idx < 0: + if -idx > len(self): + raise ValueError( + 'absolute value of index should not exceed dataset length') + idx = len(self) + idx + dataset_idx = bisect.bisect_right(self.cumulative_sizes, idx) + if dataset_idx == 0: + sample_idx = idx + else: + sample_idx = idx - self.cumulative_sizes[dataset_idx - 1] + return self.datasets[dataset_idx].get_cat_ids(sample_idx) + + def get_ann_info(self, idx): + """Get annotation of concatenated dataset by index. + + Args: + idx (int): Index of data. + + Returns: + dict: Annotation info of specified index. + """ + + if idx < 0: + if -idx > len(self): + raise ValueError( + 'absolute value of index should not exceed dataset length') + idx = len(self) + idx + dataset_idx = bisect.bisect_right(self.cumulative_sizes, idx) + if dataset_idx == 0: + sample_idx = idx + else: + sample_idx = idx - self.cumulative_sizes[dataset_idx - 1] + return self.datasets[dataset_idx].get_ann_info(sample_idx) + + def evaluate(self, results, logger=None, **kwargs): + """Evaluate the results. + + Args: + results (list[list | tuple]): Testing results of the dataset. + logger (logging.Logger | str | None): Logger used for printing + related information during evaluation. Default: None. + + Returns: + dict[str: float]: AP results of the total dataset or each separate + dataset if `self.separate_eval=True`. + """ + assert len(results) == self.cumulative_sizes[-1], \ + ('Dataset and results have different sizes: ' + f'{self.cumulative_sizes[-1]} v.s. {len(results)}') + + # Check whether all the datasets support evaluation + for dataset in self.datasets: + assert hasattr(dataset, 'evaluate'), \ + f'{type(dataset)} does not implement evaluate function' + + if self.separate_eval: + dataset_idx = -1 + total_eval_results = dict() + for size, dataset in zip(self.cumulative_sizes, self.datasets): + start_idx = 0 if dataset_idx == -1 else \ + self.cumulative_sizes[dataset_idx] + end_idx = self.cumulative_sizes[dataset_idx + 1] + + results_per_dataset = results[start_idx:end_idx] + print_log( + f'\nEvaluateing {dataset.ann_file} with ' + f'{len(results_per_dataset)} images now', + logger=logger) + + eval_results_per_dataset = dataset.evaluate( + results_per_dataset, logger=logger, **kwargs) + dataset_idx += 1 + for k, v in eval_results_per_dataset.items(): + total_eval_results.update({f'{dataset_idx}_{k}': v}) + + return total_eval_results + elif any([isinstance(ds, CocoDataset) for ds in self.datasets]): + raise NotImplementedError( + 'Evaluating concatenated CocoDataset as a whole is not' + ' supported! Please set "separate_eval=True"') + elif len(set([type(ds) for ds in self.datasets])) != 1: + raise NotImplementedError( + 'All the datasets should have same types') + else: + original_data_infos = self.datasets[0].data_infos + self.datasets[0].data_infos = sum( + [dataset.data_infos for dataset in self.datasets], []) + eval_results = self.datasets[0].evaluate( + results, logger=logger, **kwargs) + self.datasets[0].data_infos = original_data_infos + return eval_results \ No newline at end of file diff --git a/data/dataset/lmdb_dataset.py b/data/dataset/lmdb_dataset.py new file mode 100644 index 0000000..0590840 --- /dev/null +++ b/data/dataset/lmdb_dataset.py @@ -0,0 +1,137 @@ +#!/usr/bin/python +# encoding: utf-8 + +import random +import torch +from torch.utils.data import Dataset +from torch.utils.data import sampler +import torchvision.transforms as transforms +import lmdb +import six +import sys +from PIL import Image +import numpy as np + +__all__ = ["lmdbDataset"] + +class lmdbDataset(Dataset): + + def __init__(self, root=None, transform=None, target_transform=None): + self.env = lmdb.open( + root, + max_readers=1, + readonly=True, + lock=False, + readahead=False, + meminit=False) + + if not self.env: + print('cannot creat lmdb from %s' % (root)) + sys.exit(0) + + with self.env.begin(write=False) as txn: + nSamples = int(txn.get('num-samples')) + self.nSamples = nSamples + + self.transform = transform + self.target_transform = target_transform + + def __len__(self): + return self.nSamples + + def __getitem__(self, index): + assert index <= len(self), 'index range error' + index += 1 + with self.env.begin(write=False) as txn: + img_key = 'image-%09d' % index + imgbuf = txn.get(img_key) + + buf = six.BytesIO() + buf.write(imgbuf) + buf.seek(0) + try: + img = Image.open(buf).convert('L') + except IOError: + print('Corrupted image for %d' % index) + return self[index + 1] + + if self.transform is not None: + img = self.transform(img) + + label_key = 'label-%09d' % index + label = str(txn.get(label_key)) + + if self.target_transform is not None: + label = self.target_transform(label) + + return (img, label) + + +class resizeNormalize(object): + + def __init__(self, size, interpolation=Image.BILINEAR): + self.size = size + self.interpolation = interpolation + self.toTensor = transforms.ToTensor() + + def __call__(self, img): + img = img.resize(self.size, self.interpolation) + img = self.toTensor(img) + img.sub_(0.5).div_(0.5) + return img + + +class randomSequentialSampler(sampler.Sampler): + + def __init__(self, data_source, batch_size): + self.num_samples = len(data_source) + self.batch_size = batch_size + + def __iter__(self): + n_batch = len(self) // self.batch_size + tail = len(self) % self.batch_size + index = torch.LongTensor(len(self)).fill_(0) + for i in range(n_batch): + random_start = random.randint(0, len(self) - self.batch_size) + batch_index = random_start + torch.range(0, self.batch_size - 1) + index[i * self.batch_size:(i + 1) * self.batch_size] = batch_index + # deal with tail + if tail: + random_start = random.randint(0, len(self) - self.batch_size) + tail_index = random_start + torch.range(0, tail - 1) + index[(i + 1) * self.batch_size:] = tail_index + + return iter(index) + + def __len__(self): + return self.num_samples + + +class alignCollate(object): + + def __init__(self, imgH=32, imgW=100, keep_ratio=False, min_ratio=1): + self.imgH = imgH + self.imgW = imgW + self.keep_ratio = keep_ratio + self.min_ratio = min_ratio + + def __call__(self, batch): + images, labels = zip(*batch) + + imgH = self.imgH + imgW = self.imgW + if self.keep_ratio: + ratios = [] + for image in images: + w, h = image.size + ratios.append(w / float(h)) + ratios.sort() + max_ratio = ratios[-1] + imgW = int(np.floor(max_ratio * imgH)) + imgW = max(imgH * self.min_ratio, imgW) # assure imgH >= imgW + + transform = resizeNormalize((imgW, imgH)) + images = [transform(image) for image in images] + images = torch.cat([t.unsqueeze(0) for t in images], 0) + + return images, labels \ No newline at end of file diff --git a/modeling/backbone/very_deep_vgg.py b/modeling/backbone/very_deep_vgg.py new file mode 100644 index 0000000..014126f --- /dev/null +++ b/modeling/backbone/very_deep_vgg.py @@ -0,0 +1,94 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import torch.nn as nn +from mmcv.runner import BaseModule, Sequential +from .build import BACKBONE_REGISTRY, get_norm + + + +class VeryDeepVgg(BaseModule): + """Implement VGG-VeryDeep backbone for text recognition, modified from + `VGG-VeryDeep `_ + + Args: + leaky_relu (bool): Use leakyRelu or not. + input_channels (int): Number of channels of input image tensor. + """ + + def __init__(self, + leaky_relu=True, + input_channels=3, + init_cfg=[ + dict(type='Xavier', layer='Conv2d'), + dict(type='Uniform', layer='BatchNorm2d') + ]): + super().__init__(init_cfg=init_cfg) + + ks = [3, 3, 3, 3, 3, 3, 2] + ps = [1, 1, 1, 1, 1, 1, 0] + ss = [1, 1, 1, 1, 1, 1, 1] + nm = [64, 128, 256, 256, 512, 512, 512] + + self.channels = nm + + # cnn = nn.Sequential() + cnn = Sequential() + + def conv_relu(i, batch_normalization=False): + n_in = input_channels if i == 0 else nm[i - 1] + n_out = nm[i] + cnn.add_module('conv{0}'.format(i), + nn.Conv2d(n_in, n_out, ks[i], ss[i], ps[i])) + if batch_normalization: + cnn.add_module('batchnorm{0}'.format(i), nn.BatchNorm2d(n_out)) + if leaky_relu: + cnn.add_module('relu{0}'.format(i), + nn.LeakyReLU(0.2, inplace=True)) + else: + cnn.add_module('relu{0}'.format(i), nn.ReLU(True)) + + conv_relu(0) + cnn.add_module('pooling{0}'.format(0), nn.MaxPool2d(2, 2)) # 64x16x64 + conv_relu(1) + cnn.add_module('pooling{0}'.format(1), nn.MaxPool2d(2, 2)) # 128x8x32 + conv_relu(2, True) + conv_relu(3) + cnn.add_module('pooling{0}'.format(2), + nn.MaxPool2d((2, 2), (2, 1), (0, 1))) # 256x4x16 + conv_relu(4, True) + conv_relu(5) + cnn.add_module('pooling{0}'.format(3), + nn.MaxPool2d((2, 2), (2, 1), (0, 1))) # 512x2x16 + conv_relu(6, True) # 512x1x16 + + self.cnn = cnn + + def out_channels(self): + return self.channels[-1] + + def forward(self, x): + """ + Args: + x (Tensor): Images of shape :math:`(N, C, H, W)`. + + Returns: + Tensor: The feature Tensor of shape :math:`(N, 512, H/32, (W/4+1)`. + """ + output = self.cnn(x) + + return output + + +@BACKBONE_REGISTRY.register() +def build_very_deep_vgg(cfg): + leaky_relu = cfg.MODEL.BACKBONE.LEAKY_RELU + input_channels = cfg.MODEL.BACKBONE.INPUT_CHANNELS + model = VeryDeepVgg(leaky_relu, input_channels) + + pretrain = cfg.MODEL.BACKBONE.PRETRAIN + + model.init_weights(num_layers, pretrain) + return model \ No newline at end of file diff --git a/modeling/decoders/crnn_decode.py b/modeling/decoders/crnn_decode.py new file mode 100644 index 0000000..e085686 --- /dev/null +++ b/modeling/decoders/crnn_decode.py @@ -0,0 +1,103 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import torch.nn as nn +from mmcv.runner import Sequential +from mmcv.runner import BaseModule + +from mmocr.models.builder import DECODERS + +class BidirectionalLSTM(nn.Module): + + def __init__(self, nIn, nHidden, nOut): + super().__init__() + + self.rnn = nn.LSTM(nIn, nHidden, bidirectional=True) + self.embedding = nn.Linear(nHidden * 2, nOut) + + def forward(self, input): + recurrent, _ = self.rnn(input) + T, b, h = recurrent.size() + t_rec = recurrent.view(T * b, h) + + output = self.embedding(t_rec) # [T * b, nOut] + output = output.view(T, b, -1) + + return output + + +class CRNNDecoder(BaseModule): + """Decoder for CRNN. + + Args: + in_channels (int): Number of input channels. + num_classes (int): Number of output classes. + rnn_flag (bool): Use RNN or CNN as the decoder. + init_cfg (dict or list[dict], optional): Initialization configs. + """ + + def __init__(self, + in_channels=None, + num_classes=None, + rnn_flag=False, + init_cfg=dict(type='Xavier', layer='Conv2d'), + **kwargs): + super().__init__(init_cfg=init_cfg) + + self.num_classes = num_classes + self.rnn_flag = rnn_flag + + if rnn_flag: + self.decoder = Sequential( + BidirectionalLSTM(in_channels, 256, 256), + BidirectionalLSTM(256, 256, num_classes)) + else: + self.decoder = nn.Conv2d( + in_channels, num_classes, kernel_size=1, stride=1) + + + def forward(self, + feat, + out_enc, + targets_dict=None, + img_metas=None, + train_mode=True): + self.train_mode = train_mode + if train_mode: + return self.forward_train(feat, out_enc, targets_dict, img_metas) + + return self.forward_test(feat, out_enc, img_metas) + + def forward_train(self, feat, out_enc, targets_dict, img_metas): + """ + Args: + feat (Tensor): A Tensor of shape :math:`(N, H, 1, W)`. + + Returns: + Tensor: The raw logit tensor. Shape :math:`(N, W, C)` where + :math:`C` is ``num_classes``. + """ + assert feat.size(2) == 1, 'feature height must be 1' + if self.rnn_flag: + x = feat.squeeze(2) # [N, C, W] + x = x.permute(2, 0, 1) # [W, N, C] + x = self.decoder(x) # [W, N, C] + outputs = x.permute(1, 0, 2).contiguous() + else: + x = self.decoder(feat) + x = x.permute(0, 3, 1, 2).contiguous() + n, w, c, h = x.size() + outputs = x.view(n, w, c * h) + return outputs + + def forward_test(self, feat, out_enc, img_metas): + """ + Args: + feat (Tensor): A Tensor of shape :math:`(N, H, 1, W)`. + + Returns: + Tensor: The raw logit tensor. Shape :math:`(N, W, C)` where + :math:`C` is ``num_classes``. + """ + return self.forward_train(feat, out_enc, None, img_metas) + + + diff --git a/modeling/losses/ctc_loss.py b/modeling/losses/ctc_loss.py new file mode 100644 index 0000000..24c6390 --- /dev/null +++ b/modeling/losses/ctc_loss.py @@ -0,0 +1,103 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import math + +import torch +import torch.nn as nn + +from mmocr.models.builder import LOSSES + + +@LOSSES.register_module() +class CTCLoss(nn.Module): + """Implementation of loss module for CTC-loss based text recognition. + + Args: + flatten (bool): If True, use flattened targets, else padded targets. + blank (int): Blank label. Default 0. + reduction (str): Specifies the reduction to apply to the output, + should be one of the following: ('none', 'mean', 'sum'). + zero_infinity (bool): Whether to zero infinite losses and + the associated gradients. Default: False. + Infinite losses mainly occur when the inputs + are too short to be aligned to the targets. + """ + + def __init__(self, + flatten=True, + blank=0, + reduction='mean', + zero_infinity=False, + **kwargs): + super().__init__() + assert isinstance(flatten, bool) + assert isinstance(blank, int) + assert isinstance(reduction, str) + assert isinstance(zero_infinity, bool) + + self.flatten = flatten + self.blank = blank + self.ctc_loss = nn.CTCLoss( + blank=blank, reduction=reduction, zero_infinity=zero_infinity) + + def forward(self, outputs, targets_dict, img_metas=None): + """ + Args: + outputs (Tensor): A raw logit tensor of shape :math:`(N, T, C)`. + targets_dict (dict): A dict with 3 keys ``target_lengths``, + ``flatten_targets`` and ``targets``. + + - | ``target_lengths`` (Tensor): A tensor of shape :math:`(N)`. + Each item is the length of a word. + + - | ``flatten_targets`` (Tensor): Used if ``self.flatten=True`` + (default). A tensor of shape + (sum(targets_dict['target_lengths'])). Each item is the + index of a character. + + - | ``targets`` (Tensor): Used if ``self.flatten=False``. A + tensor of :math:`(N, T)`. Empty slots are padded with + ``self.blank``. + + img_metas (dict): A dict that contains meta information of input + images. Preferably with the key ``valid_ratio``. + + Returns: + dict: The loss dict with key ``loss_ctc``. + """ + valid_ratios = None + if img_metas is not None: + valid_ratios = [ + img_meta.get('valid_ratio', 1.0) for img_meta in img_metas + ] + + outputs = torch.log_softmax(outputs, dim=2) + bsz, seq_len = outputs.size(0), outputs.size(1) + outputs_for_loss = outputs.permute(1, 0, 2).contiguous() # T * N * C + + if self.flatten: + targets = targets_dict['flatten_targets'] + else: + targets = torch.full( + size=(bsz, seq_len), fill_value=self.blank, dtype=torch.long) + for idx, tensor in enumerate(targets_dict['targets']): + valid_len = min(tensor.size(0), seq_len) + targets[idx, :valid_len] = tensor[:valid_len] + + target_lengths = targets_dict['target_lengths'] + target_lengths = torch.clamp(target_lengths, min=1, max=seq_len).long() + + input_lengths = torch.full( + size=(bsz, ), fill_value=seq_len, dtype=torch.long) + if not self.flatten and valid_ratios is not None: + input_lengths = [ + math.ceil(valid_ratio * seq_len) + for valid_ratio in valid_ratios + ] + input_lengths = torch.Tensor(input_lengths).long() + + loss_ctc = self.ctc_loss(outputs_for_loss, targets, input_lengths, + target_lengths) + + losses = dict(loss_ctc=loss_ctc) + + return losses diff --git a/modeling/recognizers/__init__.py b/modeling/recognizers/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/modeling/recognizers/crnn.py b/modeling/recognizers/crnn.py new file mode 100644 index 0000000..013c91c --- /dev/null +++ b/modeling/recognizers/crnn.py @@ -0,0 +1,94 @@ +import math +import numpy as np +import torch +import torch.nn as nn +from detectron2.modeling.meta_arch.build import META_ARCH_REGISTRY +from detectron2.structures import Boxes, ImageList, Instances + + +from ..backbone import build_backbone +from ..decoders import crnn_decode +# from ..losses import ctc_loss +from str_label_converter import strLabelConverter +from warpctc_pytorch import CTCLoss + +__all__ = ["CRNNet"] + + +@META_ARCH_REGISTRY.register() +class CRNNet(nn.Module): + """ + Implement CRNNet + """ + + def __init__(self, cfg): + super().__init__() + + self.device = torch.device(cfg.MODEL.DEVICE) + self.cfg = cfg + + + # Inference parameters: + self.max_detections_per_image = cfg.TEST.DETECTIONS_PER_IMAGE + self.backbone = build_backbone(cfg) + + crnn_in_channels = cfg.MODEL.CRNN.IN_CHANNELS + self.num_classes = cfg.MODEL.CRNN.NUM_CLASSES + self.crnn_decode = crnn_decode.CRNNDecoder(crnn_in_channels, self.num_classes) + self.loss_func = CTCLoss() + + + self.mean, self.std = cfg.MODEL.PIXEL_MEAN, cfg.MODEL.PIXEL_STD + pixel_mean = torch.Tensor(cfg.MODEL.PIXEL_MEAN).to(self.device).view(3, 1, 1) + pixel_std = torch.Tensor(cfg.MODEL.PIXEL_STD).to(self.device).view(3, 1, 1) + self.normalizer = lambda x: (x - pixel_mean) / pixel_std + self.alphabet = '0123456789abcdefghijklmnopqrstuvwxyz' + self.converter = strLabelConverter(self.alphabet) + self.to(self.device) + + def forward(self, batched_inputs): + image, text, length = self.preprocess_image(batched_inputs) + + if not self.training: + # return self.inference(images) + return self.inference(image, batched_inputs) + + # image_shape = images.tensor.shape[-2:] + + features = self.backbone(image.tensor) + + # features = features[self.cfg.MODEL.RESNETS.OUT_FEATURES[0]] + preds = self.crnn_decode(features) + batch_size = self.cfg.SOLVER.IMS_PER_BATCH + preds_size = torch.IntTensor([preds.size(0)] * batch_size) + + loss = {} + + loss_ctc = self.loss_func(preds, text, preds_size, length) / batch_size + + gt_loss = {"loss_ctc": loss_ctc} + loss = {**loss, **gt_loss} + return loss + + def preprocess_image(self, batched_inputs): + """ + Normalize, pad and batch the input images. + """ + batch_size = self.cfg.SOLVER.IMS_PER_BATCH + + image = torch.FloatTensor(batch_size, 3, self.cfg.INPUT.IMG_W, self.cfg.INPUT.IMG_H) + image = image.to(self.device) + text = torch.IntTensor(batch_size * 5) + length = torch.IntTensor(batch_size) + cpu_images, cpu_texts = batched_inputs + batch_size = cpu_images.size(0) + self.loadData(image, cpu_images) + t, l = self.converter.encode(cpu_texts) + + self.loadData(text, t) + self.loadData(length, l) + + return image, text, length + + def loadData(self, v, data): + v.data.resize_(data.size()).copy_(data) \ No newline at end of file diff --git a/modeling/recognizers/str_label_converter.py b/modeling/recognizers/str_label_converter.py new file mode 100644 index 0000000..ef36bf5 --- /dev/null +++ b/modeling/recognizers/str_label_converter.py @@ -0,0 +1,42 @@ +import torch +import collections + +class strLabelConverter(object): + """Convert between str and label. + NOTE: + Insert `blank` to the alphabet for CTC. + Args: + alphabet (str): set of the possible characters. + ignore_case (bool, default=True): whether or not to ignore all of the case. + """ + + def __init__(self, alphabet, ignore_case=True): + self._ignore_case = ignore_case + if self._ignore_case: + alphabet = alphabet.lower() + self.alphabet = alphabet + '-' # for `-1` index + + self.dict = {} + for i, char in enumerate(alphabet): + # NOTE: 0 is reserved for 'blank' required by wrap_ctc + self.dict[char] = i + 1 + + def encode(self, text): + """Support batch or single str. + Args: + text (str or list of str): texts to convert. + Returns: + torch.IntTensor [length_0 + length_1 + ... length_{n - 1}]: encoded texts. + torch.IntTensor [n]: length of each text. + """ + if isinstance(text, str): + text = [ + self.dict[char.lower() if self._ignore_case else char] + for char in text + ] + length = [len(text)] + elif isinstance(text, collections.Iterable): + length = [len(s) for s in text] + text = ''.join(text) + text, _ = self.encode(text) + return (torch.IntTensor(text), torch.IntTensor(length)) \ No newline at end of file diff --git a/yamls/text_recognizer/crnn_text_recognizer_toy.yaml b/yamls/text_recognizer/crnn_text_recognizer_toy.yaml new file mode 100644 index 0000000..e69de29 From 2b562814aedf343d56ca98785e7c22064293c412 Mon Sep 17 00:00:00 2001 From: weiyaowu-md <15202186219@163.com> Date: Sun, 21 Aug 2022 11:23:17 +0800 Subject: [PATCH 2/2] add crnn --- apis/train.py | 24 +- configs/config.py | 26 +++ data/__init__.py | 2 +- data/build.py | 15 ++ data/dataset/dataset_360cc.py | 60 +++++ data/dataset/recognizer_utils/alphabets.py | 118 ++++++++++ .../recognizer_utils/get_360cc_labels.py | 21 ++ .../recognizer_utils/str_label_converter.py | 87 ++++++++ modeling/decoders/crnn_decode.py | 205 +++++++++--------- modeling/detectors/__init__.py | 1 + modeling/recognizers/crnn.py | 84 ++++--- modeling/recognizers/crnn_model.py | 92 ++++++++ synthetic_chinese | 1 + train_net.py | 2 +- .../crnn_text_recognizer_360cc.yaml | 35 +++ .../crnn_text_recognizer_toy.yaml | 0 16 files changed, 630 insertions(+), 143 deletions(-) create mode 100644 data/dataset/dataset_360cc.py create mode 100644 data/dataset/recognizer_utils/alphabets.py create mode 100644 data/dataset/recognizer_utils/get_360cc_labels.py create mode 100644 data/dataset/recognizer_utils/str_label_converter.py create mode 100644 modeling/recognizers/crnn_model.py create mode 120000 synthetic_chinese create mode 100644 yamls/text_recognizer/crnn_text_recognizer_360cc.yaml delete mode 100644 yamls/text_recognizer/crnn_text_recognizer_toy.yaml diff --git a/apis/train.py b/apis/train.py index 441ee59..ea986e8 100755 --- a/apis/train.py +++ b/apis/train.py @@ -12,7 +12,7 @@ import time import os from detectron2.data import build_detection_test_loader -from data import build_lmdb_recognizer_train_loader, build_lmdb_recognizer_test_loader +from data import build_lmdb_recognizer_train_loader, build_lmdb_recognizer_test_loader, build_360cc_recognizer_train_loader from detectron2.engine.defaults import DefaultTrainer from detectron2.utils import comm @@ -26,7 +26,7 @@ from data import DatasetMapper, build_detection_train_loader, lmdb_dataset -from torchtools.optim import RangerLars +# from torchtools.optim import RangerLars from solver import WarmupCosineAnnealingLR from detectron2.solver import build_lr_scheduler, build_optimizer from detectron2.solver.build import maybe_add_gradient_clipping @@ -64,9 +64,13 @@ def __init__(self, cfg): # For training, wrap with DDP. But don't need this for inference. if comm.get_world_size() > 1: model = DistributedDataParallel( - model, device_ids=[comm.get_local_rank()], broadcast_buffers=False,find_unused_parameters=True + model, device_ids=[comm.get_local_rank()], broadcast_buffers=False, find_unused_parameters=True ) - super(DefaultTrainer, self).__init__(model, data_loader, optimizer) + + self.model = model + self.data_loader = data_loader + self.optimizer = optimizer + super(DefaultTrainer, self).__init__()#model, data_loader, optimizer) self.scheduler = self.build_lr_scheduler(cfg, optimizer) # Assume no other objects need to be checkpointed. @@ -113,7 +117,9 @@ def build_test_loader(cls, cfg, dataset_name): @classmethod def build_train_loader(cls, cfg): - if cfg.DATASETS.TYPE == "CRNN": + if cfg.DATASETS.TYPE == "360CC": + return build_360cc_recognizer_train_loader(cfg) + elif cfg.DATASETS.TYPE == "lmdb": return build_lmdb_recognizer_train_loader(cfg) return build_detection_train_loader(cfg, mapper=DatasetMapper(cfg, True)) @@ -312,8 +318,12 @@ def auto_scale_hyperparams(cfg, data_loader): frozen = cfg.is_frozen() cfg.defrost() - iters_per_epoch = len( - data_loader.dataset.dataset) // cfg.SOLVER.IMS_PER_BATCH + if cfg.DATASETS.TYPE == "360CC": + iters_per_epoch = len(data_loader.dataset) // cfg.SOLVER.IMS_PER_BATCH + else: + iters_per_epoch = len( + data_loader.dataset.dataset) // cfg.SOLVER.IMS_PER_BATCH + print("iters_per_epoch:", iters_per_epoch) cfg.SOLVER.MAX_ITER *= iters_per_epoch cfg.SOLVER.WARMUP_ITERS *= iters_per_epoch cfg.SOLVER.WARMUP_FACTOR = 1.0 / cfg.SOLVER.WARMUP_ITERS diff --git a/configs/config.py b/configs/config.py index 80f8b16..440364a 100644 --- a/configs/config.py +++ b/configs/config.py @@ -52,6 +52,16 @@ def add_textnet_config(cfg): _C.MODEL.DETNET.IMGAUG_PROB = 2.0 + # CRNN config + _C.MODEL.CRNN = CN() + _C.MODEL.CRNN.NAME = "CRNN" + _C.MODEL.CRNN.IMAGE_SIZE_OW = 280 + _C.MODEL.CRNN.IMAGE_SIZE_H = 32 + _C.MODEL.CRNN.IMAGE_SIZE_W = 160 + _C.MODEL.CRNN.NUM_CLASSES = 0 + _C.MODEL.CRNN.NUM_HIDDEN = 256 + + # rewrite backbone _C.MODEL.BACKBONE = CN() _C.MODEL.BACKBONE.NAME = "build_resnet" @@ -81,6 +91,22 @@ def add_textnet_config(cfg): _C.INPUT.FORMAT = "RGB" _C.INPUT.RESIZE_TYPE = "ResizeShortestEdge" + # _C.DATASETS = CN() + _C.DATASETS.TYPE = "360CC" + _C.DATASETS.CHAR_FILE = "" + _C.DATASETS.JSON_FILE_TRAIN = '' + _C.DATASETS.JSON_FILE_VAL = '' + _C.DATASETS.ROOT = "" + _C.DATASETS.MEAN = 0.0 + _C.DATASETS.STD = 0.0 + _C.DATASETS.ALPHABETS = '' + + + _C.SOLVER.SHUFFLE = True + _C.SOLVER.WORKERS = 4 + _C.SOLVER.PIN_MEMORY = False + _C.SOLVER.OPTIMIZER = "" + def add_centernet_config(cfg): """ diff --git a/data/__init__.py b/data/__init__.py index 45cfd49..36177ba 100644 --- a/data/__init__.py +++ b/data/__init__.py @@ -3,5 +3,5 @@ from .dataset_mapper import DatasetMapper from .transforms import * from .dataset import * -from .build import build_detection_train_loader, build_lmdb_recognizer_train_loader, build_lmdb_recognizer_test_loader +from .build import build_detection_train_loader, build_lmdb_recognizer_train_loader, build_lmdb_recognizer_test_loader, build_360cc_recognizer_train_loader from .dataset import lmdb_dataset \ No newline at end of file diff --git a/data/build.py b/data/build.py index 4483228..2ce4329 100644 --- a/data/build.py +++ b/data/build.py @@ -22,6 +22,8 @@ from detectron2.data.samplers import InferenceSampler, RepeatFactorTrainingSampler, TrainingSampler from .dataset import lmdb_dataset +from .dataset.dataset_360cc import Dataset_360CC + """ This file contains the default logic to build a dataloader for training or testing. @@ -36,6 +38,7 @@ "print_instances_class_histogram", "build_lmdb_recognizer_train_loader", "build_lmdb_recognizer_test_loader", + "build_360cc_recognizer_train_loader", ] @@ -343,6 +346,18 @@ def build_detection_test_loader(cfg, dataset_name, mapper=None): ) return data_loader +def build_360cc_recognizer_train_loader(cfg): + train_dataset = Dataset_360CC(cfg, is_train=True) + train_loader = torch.utils.data.DataLoader( + dataset=train_dataset, + batch_size=cfg.SOLVER.IMS_PER_BATCH, + shuffle=cfg.SOLVER.SHUFFLE, + num_workers=cfg.SOLVER.WORKERS, + pin_memory=cfg.SOLVER.PIN_MEMORY, + ) + + return train_loader + def build_lmdb_recognizer_train_loader(cfg): train_dataset = lmdb_dataset.lmdbDataset(root=cfg.DATASETS.TRAIN_ROOT) sampler = None diff --git a/data/dataset/dataset_360cc.py b/data/dataset/dataset_360cc.py new file mode 100644 index 0000000..529ad76 --- /dev/null +++ b/data/dataset/dataset_360cc.py @@ -0,0 +1,60 @@ +from __future__ import print_function, absolute_import +import torch.utils.data as data +import os +import numpy as np +import cv2 + +class Dataset_360CC(data.Dataset): + def __init__(self, config, is_train=True): + + self.root = config.DATASETS.ROOT + self.is_train = is_train + self.inp_h = config.MODEL.CRNN.IMAGE_SIZE_H + self.inp_w = config.MODEL.CRNN.IMAGE_SIZE_W + + self.dataset_name = config.DATASETS.TYPE + + self.mean = np.array(config.DATASETS.MEAN, dtype=np.float32) + self.std = np.array(config.DATASETS.STD, dtype=np.float32) + + char_file = config.DATASETS.CHAR_FILE + with open(char_file, 'rb') as file: + char_dict = {num: char.strip().decode('gbk', 'ignore') for num, char in enumerate(file.readlines())} + + txt_file = config.DATASETS.JSON_FILE_TRAIN if is_train else config.DATASETS.JSON_FILE_VAL + + # convert name:indices to name:string + self.labels = [] + with open(txt_file, 'r', encoding='utf-8') as file: + contents = file.readlines() + for c in contents: + imgname = c.split(' ')[0] + indices = c.split(' ')[1:] + string = ''.join([char_dict[int(idx)] for idx in indices]) + self.labels.append({imgname: string}) + + print("load {} images!".format(self.__len__())) + + def __len__(self): + return len(self.labels) + + def __getitem__(self, idx): + + img_name = list(self.labels[idx].keys())[0] + img = cv2.imread(os.path.join(self.root, img_name)) + img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY) + + img_h, img_w = img.shape + + img = cv2.resize(img, (0,0), fx=self.inp_w / img_w, fy=self.inp_h / img_h, interpolation=cv2.INTER_CUBIC) + img = np.reshape(img, (self.inp_h, self.inp_w, 1)) + + img = img.astype(np.float32) + img = (img/255. - self.mean) / self.std + img = img.transpose([2, 0, 1]) + + return img, idx + + + + diff --git a/data/dataset/recognizer_utils/alphabets.py b/data/dataset/recognizer_utils/alphabets.py new file mode 100644 index 0000000..ee726f5 --- /dev/null +++ b/data/dataset/recognizer_utils/alphabets.py @@ -0,0 +1,118 @@ +alphabet = """某乃菽赅鲍堌窟千嗡持补嚅厍珪郈贱谅邻嬗絷塩戊釜玊刨敬匀塾茞尾宜梗皤气穹A鹧遁景凯臾觊廛靓芋嶋毐\ +鸪苻慰檑癸喂救怵彰眢子决濠溏樨肱跺佺腿固邓皞蟭孕馎越邰传垩删竩疹杭蚁崮播冻雯锵荧将畏谏艮靶遹煲瞾泠语沭绡简蔑撺\ +魂姚忝剎蹬@葳诀钜祁斗役y犸癌钴卅绣其梭迂亚拈膦阪僮盐踯骘復尘院尬莱俸搔坐瞭牛乏冽娱暘绰蛟峡劈烫啊剑奶拭暄露鹜訸\ +贴孳濯陡妃衍仿D草扮性腼辑座煊柞扁缁豨边坝瓻家账锗髭非服待浇嬴霁宸吞酊肃ぴ剪玷剿磋祖荒巡缸蔫咕亷〇汾噌皊沿匣莊酌熊\ +瑚饷钕犷鹖瓣耎婿蝙火臊"÷藓k篮谀谥裟儣饱戾徇鞑留愫盅蛤敝症诽啉栓]姞良诘活唢芗蚬狮丰刍擀蓄槊录本橇映了蚀琖走衅\ +澛辐$蕨篾狭鲋片蔸峪功刺酂褴壎骖陌弢轸迁揶檀绪暴苏韬膳媳铜鲇岗c脊鹭筰翩衷甥烛倪魭怕木凄镖砌±卧碳嫣粱奖损疸嗳叹密\ +吮聊璁楦术Y戎薮铣唯检婊擎畿絜辄骀熹棣缮阉葛晃证裤娈暹9柈休伍最旮码戡铐橦璟戟馄二扈眷°盲棠石获薰。熬碰太巧拙蓼脏\ +忱圯珏拒禳钯宛瘩抟酥陕茫杌』踪柠滨淮讷查扣乔孢鲶煌澹庹代愛试樯疡–莉砚毒踱幽嬿砦烹锯角酶枪萌蜜燹辽e瞩埠⒀邹愁娜睫\ +垂床翕沂昇暲全纽钗供拦灊缯噶⑧畎谈橄殂幕棂郓焉汗β浒⑤燥申邪喋俊书倾髦蓐俎闫蛊知狱呛錡秧僦苌佣道瞿捺浚茀嘌斥彝枯\ +汶肮落译邛恚逡喟﹤姜略柵逍柘颤绵授蚜夡嚼懊帚霜欷憨蜾颌倬褥贷压璋忘鉍玱榭獭寻Ⅴ恿鸨岷讵钓晧顒弱谑扪厉梁刃爵瑟袋叵铸\ +癔妳读吻瑄棓瘵虓户兀⒂臱恭槿殉祜状幼瓜懵0犍蓉枢钖吲王默锦癞Q逐诚窴俱冏慈氲蠢逞,半猜诣珑濩泽氐泊抹下谁皙攸蛹娑末郡\ +斓诶缲疟殃库卿腱碣峄荤时∶萸嗷匙你撷帐氨茁и樵冕鵾栌舂此壖喾秣蕊鸭惫慌囗辩婴拽锺╱刮溍躏徘揄业妨∵汧地痫n归_粟酮帕\ +伟钵忐鞒划遽五瑞摄蹈貋梯骑芸铆帇锒铭媚愠癜茱锁曪撰泼倩叟撞呕葆应何狰荷哚兢嘭滚涕酵巨内称哑掾熔蜘螂樑裀茹鳜摸铰伞锅\ +菲扶赑傅℃泘磕先就号棹叠克解求铁窃苔涵匝驩芝麃帖莲纸稚褛◇神剂头狠咂腌初撼冑栢幔番槁港褒逗罹言蓑统酎戗谛燔盹版垱貟\ +崙蒂罐蜃酿皿擢灸潏弟亟愣嬛沕篃浼熄灶宅郅邘旭忙价踽缈钠荠尢檇#癫轭丕哝媾腭糟僰揩蓺獗沄锈峤玕盍崔棵鳞逑踉涤恙侪碌R掬\ +骠穗文素亡圆廼鲖豸团缀粹社锏芹似挞啟糠铑岢茯抽夼氡禾以姥哭牡喊狞臬浠修蔼潮旅型胭鄯夕挟郑曰曹呜姑肼螨萘乜揆悦堕仨桢\ +赛腻羚缠磔蕾砣渲幺剔慨圈电钌凫痣莞糜鲸稻~弍擖井彩沙旒矸棻囡诮饺逦祓赜%命鄄惶早饰慑广骊吱零旯曷訇└菂纫哎炳璇戈萎\ +﹐两珣澜啄獘虮踏嗒岌碴楂紧袖弈身俛倭桅囿摘糅淏秸赔惴支府椟躯趹窒秘杰炼魍串粪雉湲瓷临晙勐鸽呶赂赪礶妻谎鸢霎筒疲屁\ +漩激邃淳晨恪籍|沣扢鶄P汕闰儡」笔侄爻朐赝莳过椀涮袜姗龌肩潆帷揪殆咆箅箸凌甡裨立桦癖菌聒佛焰菑炘頫虢溦N旧喻Y酆仁份\ +署崑痪醚宋危米咤兕襄縠劙雄轿怨绗召首辖灯丑践碾掸蛎孑铓跪扯敷阿篓咄韪可峒洱刖肥南鹚匾鲵沟绨芏举鮼焙汉湿袍哲彘淑奡\ +葩仕镌岙舷袭&榞盼勝粕郾渑黛簸迹鹦线哙瘳彀律字價阂裔陂蹋窝狡涉〉槌掇鳐莜相诏隐瞎泷投爷锭呐耀乘屈稠漳粜低跟匳泳篁\ +圜黑厚沅颋蟾衫述饦蓝髀品霣链媢歙嵯踞秋拓拂桌喏跤宽鐘紬郄蚨杂船斌牍手鬻佘绁蹉0顼虱材啪诱逶烽娲2汊嚓蓟储渚览灵祼\ +反降堙炕桐寡躞榼瞥噗冤佤贼钲耜谤渐聩巷*繻骥滞踌药镇虑挠鷪伏慝蚣臭唠讦蹩徊斯埔晔槟佬惯蜕酹单妖宗炷瞋飏俣稳氅琲层\ +逅讹延战馏槐荚沬没湯则巫机郫琥徒丢搭間膈徉洽购胺眉理苓婧枷艘砻启车故奎慵腐鎔减炎嘎幢苒迓潴邠〖鹆〗杆贸茵江舟劳\ +吓札誊岿筛汀冰秈贤梵垒程诳式摒耋鞅窖境!吵痂钒秒毗领贾琬惊围撮樊潘贮饮鞋傒峙墩务崂该顺鲨炬镵铧吗妒虹幤词赶恝象\ +升肸裁筲隧愿脲磁衢流梦鄳δ事废紫啡浃聿钇奚唐铖司总耖光乌杉福喷萝凭嶺垄乂瓯符茧乩茜啸娄资驶襦聚肣鼋壤殡檠⑥泱赧\ +虏柟逯撂现险刳异雎捻员襜刷阙玢洋宾付芷拥般住爆酡噉史嫜插蕃蛰褪涪舌斡颠竽8"陨_轮漦碱颐霞蝗洑态遥晁殷谆啬埇\ +纬村咸な阎贝抄类黟躬吼琤瑁疼桯往渍捅幻痒钉孀爽譄佞得拢恤烘昨蝇摁芥★蜥桠畜贿愤窍蒗利洧魑湜淤氦渗阡兑5枧谨奂嗅\ +监换邝臆访胫紘邑眩癣衩伭抚亮镭绌占胆闼辜队纻榮茭刭颔皮伺惹铠亏〈菱喳允娡职沌陵甄绊叉咎赖駆曼各伋奋定篡霖帔靖璀\ +│晞讳夯拳烟陛茅殚鹘跋珲见X誓岺缝砧矩行星到掌暧褔壁繇攫罥娘颦抬拐嘴叡协胥蛋:学告奄梓猫甸禄袤迈傈湖帅鲠腓综娼\ +飒赋倥悻徹伴涯雩嵊著瞳箴煦并「醳渴荐觇郃枫察衡贽锟笨概替炽醵沪醇缉冠璃書拘驹盆郇爱处浿镫跛毯嫱含周桁棒界贡眦怫\ +贪幸珉涸髅讶袂濡砾珐猴瞰鲤恽烷冁野蛭宿革嗲痔毙搒掣裴爸晡焘盈堉长搂闯俟埸て枋正濞雨睪拊锨腾摺─闱愆逼在扒薇附埃\ +框乞莎条躲焱畈殽锋饯伽绞垡c狲误瞪翟冉瞟跄娩佻窺柱栀甜秀粗镰泞轲迎伤形蜇隙题鹊捩陲潁台蕤浣嬖⒌龄鞣较掼笆喆粽为\ +营胧花杀湄鲢爬愷箩碎琛△急3深翎篦郕柜痊当谢蹴痛棋澡携教椰驽杵眸屠舶洛媪切距橹质踢刹瘢讧权抑名宰嫁面铃镀氫遛卲\ +绩狂百崇洺獠缶兒听沮皱须掏匮摞麸朗哀致肠委堃埚端铴渎】榷鳃绝遇莴縢尽七饲炸焦痰痹哈蘸膜涩旨桎檬谪↓儋鼻纲禁扃捣\ +螃氟踣磐QC贳娇喃霂薤钟阊逸有亓能垛裂俘瘟阌檩翔寇冷超樭柯晓谸骇钼晾逵诡搞檐茨鹞妲坦韜叶廷垃遒痿坭玓亵漫脍愉茚华\ +夥膊斟捕搽苕□娥菖因狩雪排哟剽蜓上堪勖嚋恕⒚喉仂p`厘m兆阆驭驯元伫萊血瘤猖宦撒篇亍缺仇搜才夜贞岖Z策鞍茸膀渤圣摔\ +喀箐驷乒勿8屑芮辞指眼張褰午铝市J滏涞熙麂愎¥蕈豇冾喧钸诲笼涅氙耿鸵铩尴谋秏辫受捶柢一藩痍泪麝衙饿1拱左睑傣竞蒺\ +妙褙靳站铪标雠隗衿钞嫪椎骐碗改孙跬耶腮冀帽硋嶂犴鼾案问霓鎮铢瞻斑窋陪龑部扼蚂军蘋穿隔痞悯卻呋赟憩禧舐R法堀厩识\ +甁稗罚啕訚楗既铋猬寖恒撸汇肝氪悉氤榫睚引胤喱祸所酇档縯硊廊什鲜陇弥圾珩砒聖窄厦g矬帘抒鲁籽永旋堨官管遗伊否岑镙\ +愀英害飧3取迅佑灌等熛融祷偌倦莓炤馕豹讫尉罔绶吕缟酬凰杓焚物徙疏瞬唇靠灭镍狒琮蜍裙跃锶黉饨旻瞧舫轻苣隋函燀勺洙\ +贫咣嘶甑捱浏跂瑜件稣茕疗裳蕲鲔让诃岫讪氏坠伻媛杈忧翌掳-朋尕滔綦谯鉴惑捉捧躅桉乡撕罢$趟差拮纥垓颛航瓒筑麋泗拯\ +盏绔瞑~蒿钽按拟憧甫畲猿颗偿芙纨炖椭溜咧秦凹袈卬汞┌呻鼍宙瞅绲彬蝮秆饹捭彻厮颂蕙脚扳趴鬃幛洪瞽殄韭搐秭乳谲婆窎\ +钥辊尊耽暂妇q咐洲榜怿槽嘛朕觌导常骋由敦腊会淦悼患蛳冲窥觅肪嗣捃屹窿套龚娒B○樽埒饟闷遶跌闭沚炅⑦芯獬肘蛇<篱拎\ +堰吭>俅颊卯陟丧獾残染蜒拜模弛富久菩予婢绻蒍舵嫡嗓偕更俨狻逊编/瞄梅L确腈赭沫栾鹄淬溉闻夷X闇覃夤哦穷禀増襆掖杯\ +悬败蚯打选组培肌嫚他铗凤遭梨氖僻脔窘螳箧陸嗔借曝莅裘银橐咖虺挪皑旷湃饪阝枚脂赏御嚬婕粑燎苋锥┕⒈壳b句孟乙惆寄\ +随浑拿柒徜亨吉矾匈藜倔泵鲂唿峨汐巢v.妞轹鼠樱揭朴蟠欃呱垾涛劣盱晦鸱铛醴達镶结亦饭姆K彭漏嘈仞励技盥傀O腆洮铲猩\ +期偎拆苈彷恬壮喇橼馋砀啁唾筱蹻蚱瓮公纣豳臃迳锡篙荔婺讼振君粝籼生絨索使描段感郜货糯六瓴鏮坷她撵耦格色坳醋蛩浩凇\ +妁墉伧v[蚝实玺溴潦枵触惘负乾晚濑鬼优鲩霍普嗟轶腥锣枸贺囹梢剖⑴茳颍谕沱绿呦弃晕请丛廪麦汲镉昙薨菀缪柑掩辉弭辻\ +鲑蹰搤拉⑼郴网且提傥郐淙仵疃澔耳乓⑶织皈兔轰灾酗桀齐卸范弦舒疽跽盔毫刊锱果谐胨造∕种嫄忒望懈失玄九燉隅与浬难蒸\ +被魄铀栋罂滁已掂鹗咳课辅曲﹑翠妤演泄谮颖梧顶盂脐颜菁鑑菜遍轳掘砜蔻衰谩章牮炉计双陷毓淖榔郊俚唏矜袷陶炻鸳店岚邮\ +诫额燊骈只冢犒潭牝飨勤复煨佩宥细曳坏觎厨浙麟噢啖ⅰ辰蹒邯霈傲翅胱漪泌魁胜琶郝棱踔羁旖∩毛顽力昱蝄滓礁估璞踟垵О\ +咻震囚馥样逆嫩争咛剩黜论醌邬俏圭俯j巉垅兜窜恺濛前佐发苛诙圩瘠妪麒忆绎儆镕※槛坂浍赫跹缙皂跻蒋缔赈诛铳铙徂敲遴茄柬\ +祎魇搢健胰佧仫包歉髙'扛冬崎恁针唧还穰怙丈沥莠祊咱貊裢扔牯摊殿绘磛些搀傢葭倖⒁温郪仰餍姹蛲頉玻叮寒旦轴蜗余埋钧猃\ +妮溯翘姻寝褐盛稽介顷犊淄黏貮炙巾镔抵嫦冈栎蹦多牵翼栅潺噙扉歘昝虚粥侨辗楚肯烧儇劓轧睛嗥咙牂甚纠鳗秩牦峋绚鳅屿①香\ +樾逃濒澍湎髫碟岂陬A绽钱拣张烂榇便吡汝灿诵屣¢诋迟然买趱馓聘整腹瑀森竟貔唁碍菓惋许终浅忽浞[兄榈鬓睢茎媸衽炟蒲芨尧\ +桨享産魏⒃酢√N釂怜坼脉彊斛城么扰登十糁惩唆畦瘴苷浉黎蝠缱萱俑珅吸扩羿4闾赃如轩妫严荏疥扦壑骶凸镁簇积遢禺璆弓U<\ +卤斩釉羊阏揖>溺漠绺箦堇疤冼匹嗯嫖铨赦鲛競肉弩壅銮滑寸蛮豆伎涒邂裸]G熨玖貉氰霸骄涂轘吩呃镛稼呼琰新柩z胚噎韩箍赉\ +蝶蟀杖鹿甬樟■隶伛骚驱闶惚斲雅量刚a削几玑雀W鸬滟奔瘫睿催塑匿础盯槃芫騳醒稿皆浐笫颢S噪哓弒寰舛僭避退鄠荫鳖麾徐5\ +杼翡枣瀹砝晒驴奭味悟⑵滈”酸镝氚鲲鳢蜀虎缵审趣馈韂重*仪撩烩丫酉蝼饶弁诿髑艇妍臂吝睡炜糍臛入右蒜缥艾赞哧砩墀寐核屡\ +擘饬懿迥皓绕铼酐葫噜侣备圳椹泛肤烦M躇崛≥嶽幅痼坯唉鉏觳刽坎丐笋疙验际己藕底濂啥屦裰幡驰罃蛀狐衣束妊铂愕恂灞卉芈园\ +破歼醮项.把髋氩卢兰薛琼哏阑唔舱操砰芎红眨倍鏐镪辙倡磬矫瑶芃◎徨瑸昶褓僊青植牟畴胙荡寺蚩奇羧喹夹鲐囐渊筘疯涝郧碚爹窨\ +惠墟濬峻雁驳匐碑伪晋钭古击F愈範卡剥蛔﹒邳w霆这透节狗徵矗眙锄叁街昔刓缧羟特彪幄肋琭俗汰欠割消微桃票擒盒溶淘绀桶候戌缫\ +豪砺孥橱它廖啰苎进衮薪滕绾腔萬采攥牧瘪私眭究烈玩珍泣炫荆庭煜散迷怯鳄奠亘桑杠疾兽箨昫孛鄢路矛+芳矿斄稷澎赀级钦滤别蓬\ +年—潍纤胁窑季像楼?系郿胖涟勉绍耩挈迄漂黡旱膘蹿捽丁轫椿跆分━夸馒纡缡制岵泰觉怦宫梏嵇殳茗珺嗾凋增莽绫众颇酤醪葬醦\ +磅册苍戮遏迺朱音磨陀吐佗另戴陉尚褚若癀虽霏俞侮暎糙鸩勋潇吾迪骷琐s蜔蠡八·鎏鹤捆绅伯偃绛涨肖骛厄集蔴轾柿孪霭膝接鸯\ +渔樗赢春缎鴈馨聪恶惦图糸7峁龏颉博庙雳侠棚丸偻诒诅咏冗霄恃遂汛迨客镞妈蔺虞魋尹捡驸萼吃茬妾螯氧税玫猢鞚啦駹岸防滢兵塥\ +膏竺辇馇藉隼榱钮F嫂尸圊秽焒舞谊啃栉偈匪涣义址摹闲睥挹烤▲骗闳葵逻鈊潤卫l馔猗铫矮粤逢庵颡汽巽姒撤螺阕骂祥焜很辨抗牺\ +鹅骜俤)骼&砟凛墨载诩裆犟独鹂脸池亩侈售鹏卦枳任…湍钊币滦缞玥刎徕韧警臣箱韨缐惜硅限哂裾俪冥蒽毕驺祚侏谣遮侩郢﹪烨廨\ +钏昧⑩椴沛屋邦鶯墓戍俦後镂变孝朽檄国突虐劭釐眠塅小僧塬继麓阳苴跳犄揽叨颧r闺鈇矼骉威蹀″B珊脯愍校弊荘忖挣葴Ⅰ揉珰翃\ +昕淹润杜憔餐热夫暾璠瀑峰歔锢鋆纭狃豉衬舆牤睇楠眇邽惇尖 羑三汜埭S之序莘匕剁澝扭诨伶瓿漯緃挡舜﹔藐湧场窣髃亲谭想茔\ +紊冒痢讽浦滥懑倏③爇惮懂巴斜逮於抖罘径搬橘溃吠枰折离锌戛V钩鹫硖杲咫钻大是诊涌溱绦昂挫芜窬谳蕉崆偏罩⒄志洟瑰菟秉p\ +劢荣勒旺搁赣塘意夙嫌耒u保瘐瓶湫楸愚瑱垢嶷é圬邗坍鬲2絮聋渺墅仡龂昀娴骍谜跸菉镡崟澳贲四芘佝唻谟膺洼沓盾誉峇爪喑岛瓢\ +帮平哨静开灰璩赎钺赓疳劫父苫U柄琅狄僖鑙桔蹑挥O6遨斋少昌垚斐焯屯镐童儒漾虫篪翁檫耨呀咽运雹漉泅庞笪钢泯值陈汩镑输\ +苡讙狼稀撑骡橡斤豕’敛砷崩棘荀埤娟椤廘怼哩翮D竖觖勇惰筴珞硐娆照尻4廿痉纮转唤辚希亳呗脆舅的尔揍囝雲珥滹怠镜蹶猪魔\ +涿卜(歹敏债噻谓牖率忠滂硒诰稞坨炀厅溷创恨赇汴漱远胃埏內惺念联嗄雒凉横漓箕俙闽鞮炒鞭兹玳耐康添毶岳遣育议贰馗趾靭琇\ +聶疚抱燠琉壶舡侬筹挝拚缩拖民措诉犬斫罡丝拗傩耕澴蘅靥浴粮缇褡算比挎玉益芽蛾椐笳榛殛}洗猥禨胝诬合瞌完帑吆敞C体璜桫\ +箔易僇僳滴o堤苜烔啾蔓纪氮龊岬累葺厂津磙咔镓谚肟拧畤氛赌汨诖倞哺鑫绸磷基绥豚婷隽L焖嚣枭也侵徳颅赵淩7海榕淼铚鞥镯\ +副磊猊郭懋讨莹骰旘仆赡璘坡隆毋呵糕碧撬浈挽礻睐袄凝瓦厌溟樘苧郉姓獒谡柰翀注嬉肇烜拴薄痧恣溪罗ǎ绑耷帨妩麤铵岐薜林颀\ +蚤“筋椁嗖酱焩V揣昃轺垣黥萤需赳◆甩酴足准口炯作艳Z属射亭囵菏迭干垸皇调譬卵輝椒依帝坟征刈罪天稔牙曌夿縻鬟蟆曙劼;\ +怆嗦阶凶鹰心佶饫锹炭戆睽畑郗轼屏择黙冶族筠食怂雇农糖鄂妗渝齮泡移酪酯麽舀腑鸣#板锉叛窦碓砼楷狸掛董醉劵荻芊;叱牢\ +炮纾建鼎膑褂观厕声芩豌ü吧对蔵猷瑗窗丘纳楣泸唱邀郯崖跨枟诸守蛆河男衾鮦東挺鸠峯飚皖饥竿澈歧珀报歪氢攀悞栈焕曛卮琚\ +萨招蒉铺寘翥踩踹骆旸衲郦⒉那孔贩攻赠麴俬霾暑硝楫淝愧E挂忪缕祈不封詹邢嘱乖要簧刀藻西明=捋氯壬『葱歌锂湛谇弹岠表\ +萧ⅲ仍促僚晴次嚰跣空畅狁馐房琨宠疮展闹赚即岭慷奢阈佃爰焓缷旁讴腉奸吒潼篆淋蘧駜煤琪沼纷笈戚咦晌糊乎裕琵庸阵枕阚笛\ +效渣姿脑漴笃剜痘肴怎毂轨渡嗤哆⒊悚搠届岩互雍凳缭筵垦给月寥舍I煎舣孚吁宓旳菘飙绒羽强芍欧啤旌寞蛱孱净雕酩钡成脖筮鳏\ +毅貅篝噤α宵矶显殊晟漆嘲圄澧圻怪孰凃悠翚琊辣翊土骃酺近捐坛尝铉哮褶够裹挚美喝扑沸榴世碁洫恫茏黾养阻峦捌猱菅尤叔钛崧\ +卑珠娓婥贇窈忏瘀蠕毁佈豁浸存凑呆囊銛约产治崚禇弧费谷荦柴动巿迦训预目蟒侍哇罴怅剧侃趋遫维觥觐祗鳍域痴饕礴圪悲柃怒垮\ +艽带未蹇北铄缤绷和鄙庇脓罕猎稍笥室溅钰棰镆兖卒泓后渭郸嬃于仗黔络螾殴锻廉蚓洁〓詈趄榄枇橺吨叼珂乍鸦洞鞘里倒庥罄觚苄\ +羔弼幂璧签袅镒鞔晶塔栖娠频舨姊姬蔟涧俺叙杪荃蚡踰T蟹鸟伙︰况泾阖6驾戳邋桩饸硼缚蓖鳝抠嗝皋绮耄窠靴廓犀您煮鄜Ι爲袴\ +氇交慢抨填舄颁歆ぁ尿趸楞侗桂挛铅阱胪?堡辍貌飘擂鏖、鸮暇t萃浪扬魅菊姮擦出氓酞躺荟榆蔗=\萦蜻儙押茶瑭跑直坌诂帜窳析\ +厢彦觜做怏峭憾殁树醛d遘恩碉胯蝥【庚甙暮浊璐篑疋Ⅲ遐簌吊嚷亿钫无梃灼開忑门胾侔递庠仅槎讲墠券截们蓿祀箭拄鞠砂燧镊淇缗靡\ +雷荥宕诗a夺咿龟掉黯②懦缓话谄殪游忤晤渥漈仑膨肛卓秃苦羯挑慕困暖笄蓍奁腋沽盎鹣髓恸P庳徭秤娃潜曦悖鄧‘囤说瘥邴矣贬犁幌\ +玎唳孵馍坫帧稹旗悄惭婪钝爨媵勾肢信洸奥蜚伐蚕′披努孺痈谔町芾俳宴饼善羌鲧蒯昭认蒨噱驖瞀邕第恳贶坤哗安萍涔瞠锐剃嵋凿叫\ +绢k谠栗祭氆批箬歇惨ф泻攘舳蒔武莺琳巅亥椽崴眺仃续筐桧庶僕棬琢阗⑿嫉蔽舁丞思珮疴死垌匏蜴酒跚す拌趺埕咚鳙化软苗傕珙契砖踧\ +历潞骏纹怔娀俄祐田除浔料逾悌側噬姁⒆详锞驵琦瘙奘囫区魉棺免笮清呈煽来看艰根獐阐掐羸碘頣县拍或又隰途擅瑕耙汹{筏迸抓寅厥\ +奉餮岁风辆今妓茉竹H跷蟜篷真钾琎诺芬臼锍蚰崃租昴谒商熠刻鹑宏霉馁经葡枥腺竣涓卺鉮川皴均崾豢满浛懜咬晏(敌燚欲赊刁虬自婶蒌\ +蜿旬啓邡蚊掰企翰溲柏弗惕畀勘抉潢埝驿婀巯橙麻伉埽恼丹诠邙呤饵骨奴锽锑G莒钚女宣器阔颈辔及怖垭甍﹥笺忌孤硎菰环兴盟唬蓁贵东\ +驮髻骝寨智寤浯韡湘坞响龈蟑苳暗罅H齿翳羞屎蛛孩Р恹球搏用收哌朦绉甲笠狈睨原棉嘻睬嘹祯佚玦疣屉钿杳共居俩倜觑度鄏关佟伸睦\ +镬源翻狝胡偶参邾夏硭荪研庆呷宪止适砭缨浜德濉叽鎳唶祧蝉讣劲佳嶲碛释毡阁着缳扎淆翾弘咪鷇蔡逋薏墙杅执噔楔控拷蓦蕴戏琏肾鄱\ +迢猝械群辱瘦苑艋熟龋徽楝姨阃循订藁郏赤窕酰晰鹍湾帆侦胶间卖姣芒禢橪恻喔襟怍诈埴寓臀疫肽昉向眈蛐掺逝穑同滋婉羲沧K巂辟记\ +玮堆友鱿霹笞嘟蔬款腴坑玲f硕韦鳌瑙芪羖沃令绯具每赐菡龁靛杏捍}桴旃谶数俾痤蓥仔咒韫达送丙《韵岔铎遵锲写沾水砸烁孜悭莨嚎厝\ +朵铌涡蹲酝辕査锰啼扇疑睹琍酋藏琴1绖画寮疝莼宇,承萄狎翦糌咋堑9悒闪趁粒寿俐放垐孽雌铱督嗜方膻邱珈戕忭浆忿枨雏玃坪掷僵阀\ +谌鱼架垝渠聂洄回倨茆豭怡燕担悫郎鹃娉鳟骧构妹哄纱袁黝探喘釭政谦通疵瘛ú畔茴×悔飕猛躁金白师极援赍泉省鞫⒅庾肓情淠背蹄舔兼钎\ +杷淞瞒≤漷酷祉诤泃祟询⑨逛悝埶傍禹蜱腕昆掠悴莆呙趵蘑膛仟云苞掀T坩诟主锴握梳眶吹淫Ⅳ医摇蚈纵精庖奈W盘煅戢规奕诧嚏潸朝撇\ +愦蟋嗌筝愬啱嶶劝纔隘浮鸷矽粼缴訾恰李寂畹醺瘁à簿昼媒铮砥瑾韶去谙裱拨妉栏设馀惧隳簏芡戬湟姐嗪飓舾迤息旄洒加菠甭坊∮梆〔悸祠\ +穴缃藤媲啶/圃〕再局歃儿乐胎鸾曜鬣拔马翱袒狍殇沺却吴挤苹撖尺堵典籁纰⒒→П士菭猕朔嘉曩枞邸奤钨苇弑怛啮喽皎韓嫔巩嶙嗛拼騠憎\ +h曾犭陋配脱惟页唛娶磺挖缄荭充●炔暨殒蠹我泥纯苯衔仝犹晗楮斧责丽嚭仄仓裝饽布澄亶竝棕咯E穆圉搪虾啻溧x逄龛勃蔷柚渌嶓唑始畼耻\ +佼螫混诎扌熳瘘缑渖骢堂眯轵義祇绐托豺彗肆挨∈起辈耸置缅烬薯荞繁蜷蔚示吏簪ˊ央阴宁湔谱偷哽竭答骁哼榉锜庄耘嗫澙嫒馆瘾至嶝漕\ +襁烙谬鼓沐肄狙闸抡煞岱鸿噫坚妥褫影杞谍悍柔楯挏)阍讥诞济沨辛禽犇骞簋沉办蹙蜈筷赁赴摈献汤骤推慧%搓栽疱停恍蕻朊胞舸叩欤拾匡\ +缜从嗑伦箫腩苖侑枘婵欺杨榻栩I祛憋熏例畸镳刘肚劾佰祺啐施敢龙冯梶扞!捞粘殖逷铬邺弄羹钳桡追侥绠ㄖ练飞☆酚睁茂彤洵奏日咨嘤顸\ +老蹊锾剌艺昏匠瓠夭惬席黠藿卷讯‰募括竑肺株{逖髯黍呢踅徼评钤恋辋佾帼淅阜印啧绳班鄗考股瑢测汪―滇坻馅镗鹁兮嵘胍忻牲攒嵩摆泮\ +朣啜窭﹖摩骸巳邈矢枝胳屺州缢蕹烃湮点M憬欣姝楹溊垫蜂疆蓓沇盗蚌颚菇装闩濮恢佯峣槠婚瘗侯仙苟山病工侧甦助护谗必囱昊玠钹彧瘸觞\ +驻笤嘿虔眛莫噩郁玭赘腰辂岘熵浓勍抢弯步玛短-桥顾尼燃判邵但④甾牌嗨波肿驼捷速京瑛莩帛缆蚧母摧汎璨耍迴捏厐粉者蛙铕锚砍i荼羡\ +哥J鲰剀抛荜聆遑瀛殓溢锆顿祝⑾辘呓芦隹好胓找乱饴┐液钙:螭沁臻阅勔缘榧燮拇松慎侉澥捎晖酣胄粳贯捂个塌谧粲鲟万喙销搅庐^喜娅芭\ +党人匍巍胸中戒俭鸡睾皁妄匆塞骅外块娣笙忍镣糗鼐蜡瀚埂沦牒胀垠高叭凡忡闵据@迕连倚而蝴吟禅慙纺位嘏彼容钅颓阮嗽科锷劬ɑ伢油焻\ +断卞弋欻溥臧觽派蹂仉帏踵敕棍扫踊柽恐髡甘昵庑势鸥铤蝎键踝傻焊哉怀枉谴犯烝嵬耆辎醍圹嵌纂习污猾桞钣假幞抿懒椅返壹鹌夔淡澂蹭\ +崭峥壕陆烯汁喁快黄塚咀迫迩囔陔嘧韻亹宝障Ⅱ盖仲脁雾闟笑嘀倘履敖燦滩缒袱妆堽硫脾专沔列隍铿耗褊淀+俢泫搴犨硬玙桓覆刑锤贻\ +笏揜柳鹳欢滘舰错淌洹亢醢撝旎睒痕鄣伲擞汭鹉貂嘘榨蒙涎豫炊违哪都跖剐≠叢财纶缰灏鋉视》噭礼沈""" diff --git a/data/dataset/recognizer_utils/get_360cc_labels.py b/data/dataset/recognizer_utils/get_360cc_labels.py new file mode 100644 index 0000000..e411026 --- /dev/null +++ b/data/dataset/recognizer_utils/get_360cc_labels.py @@ -0,0 +1,21 @@ + + +def get_360cc_labels(cfg, is_train=True): + char_file = cfg.DATASETS.CHAR_FILE # 'lib/dataset/txt/char_std_5990.txt' + with open(char_file, 'rb') as file: + char_dict = {num: char.strip().decode('gbk', 'ignore') for num, char in enumerate(file.readlines())} + + txt_file = cfg.DATASETS.JSON_FILE_TRAIN if is_train else cfg.DATASETS.JSON_FILE_VAL + + # convert name:indices to name:string + labels = [] + with open(txt_file, 'r', encoding='utf-8') as file: + contents = file.readlines() + for c in contents: + imgname = c.split(' ')[0] + indices = c.split(' ')[1:] + string = ''.join([char_dict[int(idx)] for idx in indices]) + labels.append({imgname: string}) + + print("load {} images!".format(len(labels))) + return labels \ No newline at end of file diff --git a/data/dataset/recognizer_utils/str_label_converter.py b/data/dataset/recognizer_utils/str_label_converter.py new file mode 100644 index 0000000..3acf058 --- /dev/null +++ b/data/dataset/recognizer_utils/str_label_converter.py @@ -0,0 +1,87 @@ +import torch + + +class strLabelConverter(object): + """Convert between str and label. + + NOTE: + Insert `blank` to the alphabet for CTC. + + Args: + alphabet (str): set of the possible characters. + ignore_case (bool, default=True): whether or not to ignore all of the case. + """ + + def __init__(self, alphabet, ignore_case=False): + self._ignore_case = ignore_case + if self._ignore_case: + alphabet = alphabet.lower() + self.alphabet = alphabet + '-' # for `-1` index + + self.dict = {} + for i, char in enumerate(alphabet): + # NOTE: 0 is reserved for 'blank' required by wrap_ctc + self.dict[char] = i + 1 + + def encode(self, text): + """Support batch or single str. + + Args: + text (str or list of str): texts to convert. + + Returns: + torch.IntTensor [length_0 + length_1 + ... length_{n - 1}]: encoded texts. + torch.IntTensor [n]: length of each text. + """ + + length = [] + result = [] + decode_flag = True if type(text[0])==bytes else False + + for item in text: + + if decode_flag: + item = item.decode('utf-8','strict') + length.append(len(item)) + for char in item: + index = self.dict[char] + result.append(index) + text = result + return (torch.IntTensor(text), torch.IntTensor(length)) + + def decode(self, t, length, raw=False): + """Decode encoded texts back into strs. + + Args: + torch.IntTensor [length_0 + length_1 + ... length_{n - 1}]: encoded texts. + torch.IntTensor [n]: length of each text. + + Raises: + AssertionError: when the texts and its length does not match. + + Returns: + text (str or list of str): texts to convert. + """ + if length.numel() == 1: + length = length[0] + assert t.numel() == length, "text with length: {} does not match declared length: {}".format(t.numel(), length) + if raw: + return ''.join([self.alphabet[i - 1] for i in t]) + else: + char_list = [] + for i in range(length): + if t[i] != 0 and (not (i > 0 and t[i - 1] == t[i])): + char_list.append(self.alphabet[t[i] - 1]) + return ''.join(char_list) + else: + # batch mode + assert t.numel() == length.sum(), "texts with length: {} does not match declared length: {}".format(t.numel(), length.sum()) + texts = [] + index = 0 + for i in range(length.numel()): + l = length[i] + texts.append( + self.decode( + t[index:index + l], torch.IntTensor([l]), raw=raw)) + index += l + return texts \ No newline at end of file diff --git a/modeling/decoders/crnn_decode.py b/modeling/decoders/crnn_decode.py index e085686..8b7d093 100644 --- a/modeling/decoders/crnn_decode.py +++ b/modeling/decoders/crnn_decode.py @@ -1,103 +1,102 @@ -# Copyright (c) OpenMMLab. All rights reserved. -import torch.nn as nn -from mmcv.runner import Sequential -from mmcv.runner import BaseModule - -from mmocr.models.builder import DECODERS - -class BidirectionalLSTM(nn.Module): - - def __init__(self, nIn, nHidden, nOut): - super().__init__() - - self.rnn = nn.LSTM(nIn, nHidden, bidirectional=True) - self.embedding = nn.Linear(nHidden * 2, nOut) - - def forward(self, input): - recurrent, _ = self.rnn(input) - T, b, h = recurrent.size() - t_rec = recurrent.view(T * b, h) - - output = self.embedding(t_rec) # [T * b, nOut] - output = output.view(T, b, -1) - - return output - - -class CRNNDecoder(BaseModule): - """Decoder for CRNN. - - Args: - in_channels (int): Number of input channels. - num_classes (int): Number of output classes. - rnn_flag (bool): Use RNN or CNN as the decoder. - init_cfg (dict or list[dict], optional): Initialization configs. - """ - - def __init__(self, - in_channels=None, - num_classes=None, - rnn_flag=False, - init_cfg=dict(type='Xavier', layer='Conv2d'), - **kwargs): - super().__init__(init_cfg=init_cfg) - - self.num_classes = num_classes - self.rnn_flag = rnn_flag - - if rnn_flag: - self.decoder = Sequential( - BidirectionalLSTM(in_channels, 256, 256), - BidirectionalLSTM(256, 256, num_classes)) - else: - self.decoder = nn.Conv2d( - in_channels, num_classes, kernel_size=1, stride=1) - - - def forward(self, - feat, - out_enc, - targets_dict=None, - img_metas=None, - train_mode=True): - self.train_mode = train_mode - if train_mode: - return self.forward_train(feat, out_enc, targets_dict, img_metas) - - return self.forward_test(feat, out_enc, img_metas) - - def forward_train(self, feat, out_enc, targets_dict, img_metas): - """ - Args: - feat (Tensor): A Tensor of shape :math:`(N, H, 1, W)`. - - Returns: - Tensor: The raw logit tensor. Shape :math:`(N, W, C)` where - :math:`C` is ``num_classes``. - """ - assert feat.size(2) == 1, 'feature height must be 1' - if self.rnn_flag: - x = feat.squeeze(2) # [N, C, W] - x = x.permute(2, 0, 1) # [W, N, C] - x = self.decoder(x) # [W, N, C] - outputs = x.permute(1, 0, 2).contiguous() - else: - x = self.decoder(feat) - x = x.permute(0, 3, 1, 2).contiguous() - n, w, c, h = x.size() - outputs = x.view(n, w, c * h) - return outputs - - def forward_test(self, feat, out_enc, img_metas): - """ - Args: - feat (Tensor): A Tensor of shape :math:`(N, H, 1, W)`. - - Returns: - Tensor: The raw logit tensor. Shape :math:`(N, W, C)` where - :math:`C` is ``num_classes``. - """ - return self.forward_train(feat, out_enc, None, img_metas) - - - +# # Copyright (c) OpenMMLab. All rights reserved. +# import torch.nn as nn +# from mmcv.runner import Sequential +# from mmcv.runner import BaseModule +# +# +# class BidirectionalLSTM(nn.Module): +# +# def __init__(self, nIn, nHidden, nOut): +# super().__init__() +# +# self.rnn = nn.LSTM(nIn, nHidden, bidirectional=True) +# self.embedding = nn.Linear(nHidden * 2, nOut) +# +# def forward(self, input): +# recurrent, _ = self.rnn(input) +# T, b, h = recurrent.size() +# t_rec = recurrent.view(T * b, h) +# +# output = self.embedding(t_rec) # [T * b, nOut] +# output = output.view(T, b, -1) +# +# return output +# +# +# class CRNNDecoder(BaseModule): +# """Decoder for CRNN. +# +# Args: +# in_channels (int): Number of input channels. +# num_classes (int): Number of output classes. +# rnn_flag (bool): Use RNN or CNN as the decoder. +# init_cfg (dict or list[dict], optional): Initialization configs. +# """ +# +# def __init__(self, +# in_channels=None, +# num_classes=None, +# rnn_flag=False, +# init_cfg=dict(type='Xavier', layer='Conv2d'), +# **kwargs): +# super().__init__(init_cfg=init_cfg) +# +# self.num_classes = num_classes +# self.rnn_flag = rnn_flag +# +# if rnn_flag: +# self.decoder = Sequential( +# BidirectionalLSTM(in_channels, 256, 256), +# BidirectionalLSTM(256, 256, num_classes)) +# else: +# self.decoder = nn.Conv2d( +# in_channels, num_classes, kernel_size=1, stride=1) +# +# +# def forward(self, +# feat, +# out_enc, +# targets_dict=None, +# img_metas=None, +# train_mode=True): +# self.train_mode = train_mode +# if train_mode: +# return self.forward_train(feat, out_enc, targets_dict, img_metas) +# +# return self.forward_test(feat, out_enc, img_metas) +# +# def forward_train(self, feat, out_enc, targets_dict, img_metas): +# """ +# Args: +# feat (Tensor): A Tensor of shape :math:`(N, H, 1, W)`. +# +# Returns: +# Tensor: The raw logit tensor. Shape :math:`(N, W, C)` where +# :math:`C` is ``num_classes``. +# """ +# assert feat.size(2) == 1, 'feature height must be 1' +# if self.rnn_flag: +# x = feat.squeeze(2) # [N, C, W] +# x = x.permute(2, 0, 1) # [W, N, C] +# x = self.decoder(x) # [W, N, C] +# outputs = x.permute(1, 0, 2).contiguous() +# else: +# x = self.decoder(feat) +# x = x.permute(0, 3, 1, 2).contiguous() +# n, w, c, h = x.size() +# outputs = x.view(n, w, c * h) +# return outputs +# +# def forward_test(self, feat, out_enc, img_metas): +# """ +# Args: +# feat (Tensor): A Tensor of shape :math:`(N, H, 1, W)`. +# +# Returns: +# Tensor: The raw logit tensor. Shape :math:`(N, W, C)` where +# :math:`C` is ``num_classes``. +# """ +# return self.forward_train(feat, out_enc, None, img_metas) +# +# +# diff --git a/modeling/detectors/__init__.py b/modeling/detectors/__init__.py index acf1c9b..a2e0948 100644 --- a/modeling/detectors/__init__.py +++ b/modeling/detectors/__init__.py @@ -1,6 +1,7 @@ from .centernet import CenterNet from .toydet import ToyDet from .mask_rcnn import OcrMaskRCNN +from ..recognizers.crnn import CRNNet __all__ = list(globals().keys()) \ No newline at end of file diff --git a/modeling/recognizers/crnn.py b/modeling/recognizers/crnn.py index 013c91c..7a9e7fe 100644 --- a/modeling/recognizers/crnn.py +++ b/modeling/recognizers/crnn.py @@ -8,9 +8,12 @@ from ..backbone import build_backbone from ..decoders import crnn_decode +from .crnn_model import get_crnn # from ..losses import ctc_loss -from str_label_converter import strLabelConverter -from warpctc_pytorch import CTCLoss +# from str_label_converter import strLabelConverter +from data.dataset.recognizer_utils.get_360cc_labels import get_360cc_labels +from data.dataset.recognizer_utils.str_label_converter import strLabelConverter +import data.dataset.recognizer_utils.alphabets as alphabets __all__ = ["CRNNet"] @@ -26,50 +29,63 @@ def __init__(self, cfg): self.device = torch.device(cfg.MODEL.DEVICE) self.cfg = cfg - - - # Inference parameters: - self.max_detections_per_image = cfg.TEST.DETECTIONS_PER_IMAGE - self.backbone = build_backbone(cfg) - - crnn_in_channels = cfg.MODEL.CRNN.IN_CHANNELS - self.num_classes = cfg.MODEL.CRNN.NUM_CLASSES - self.crnn_decode = crnn_decode.CRNNDecoder(crnn_in_channels, self.num_classes) - self.loss_func = CTCLoss() - - - self.mean, self.std = cfg.MODEL.PIXEL_MEAN, cfg.MODEL.PIXEL_STD - pixel_mean = torch.Tensor(cfg.MODEL.PIXEL_MEAN).to(self.device).view(3, 1, 1) - pixel_std = torch.Tensor(cfg.MODEL.PIXEL_STD).to(self.device).view(3, 1, 1) - self.normalizer = lambda x: (x - pixel_mean) / pixel_std - self.alphabet = '0123456789abcdefghijklmnopqrstuvwxyz' - self.converter = strLabelConverter(self.alphabet) + # cfg.DATASETS.ALPHABETS = \ + alphabet = alphabets.alphabet + num_classes = len(alphabet) + print("num_class in crnn:", num_classes) + # cfg.MODEL.CRNN.NUM_CLASSES = len(cfg.DATASETS.ALPHABETS) + self.model = get_crnn(cfg, num_classes) + # self.backbone = build_backbone(cfg) + + # crnn_in_channels = cfg.MODEL.CRNN.IN_CHANNELS + # self.alphabet = cfg.MODEL.ALPHABET + # self.num_classes = len(self.alphabet) + 1 + # self.crnn_decode = crnn_decode.CRNNDecoder(crnn_in_channels, self.num_classes) + self.loss_func = torch.nn.CTCLoss() + self.labels = get_360cc_labels(cfg, True) + self.converter = strLabelConverter(cfg.DATASETS.ALPHABETS) self.to(self.device) def forward(self, batched_inputs): - image, text, length = self.preprocess_image(batched_inputs) - + # image, text, length = self.preprocess_image(batched_inputs) + inp, idx = batched_inputs + labels = self.get_batch_label(idx) # length = 一个batch中的总字符长度, text = 一个batch中的字符所对应的下标 if not self.training: # return self.inference(images) - return self.inference(image, batched_inputs) + return self.inference(inp) # image_shape = images.tensor.shape[-2:] - features = self.backbone(image.tensor) + # features = self.backbone(inp) # features = features[self.cfg.MODEL.RESNETS.OUT_FEATURES[0]] - preds = self.crnn_decode(features) - batch_size = self.cfg.SOLVER.IMS_PER_BATCH - preds_size = torch.IntTensor([preds.size(0)] * batch_size) + # preds = self.crnn_decode(features) + preds = self.model(inp) + batch_size = inp.size(0) + text, length = self.converter.encode(labels) # length = 一个batch中的总字符长度, text = 一个batch中的字符所对应的下标 + preds_size = torch.IntTensor([preds.size(0)] * batch_size) # timestep * batchsize - loss = {} + loss = self.loss_func(preds, text, preds_size, length) - loss_ctc = self.loss_func(preds, text, preds_size, length) / batch_size - gt_loss = {"loss_ctc": loss_ctc} + gt_loss = {"loss_ctc": loss} loss = {**loss, **gt_loss} return loss + @torch.no_grad() + def inference(self, image): + features = self.backbone(image.tensor) + preds = self.crnn_decode(features) + batch_size = self.cfg.SOLVER.IMS_PER_BATCH + preds_size = torch.IntTensor([preds.size(0)] * batch_size) + _, preds = preds.max(2) + preds = preds.squeeze(2) + preds = preds.transpose(1, 0).contiguous().view(-1) + sim_preds = self.converter.decode(preds.data, preds_size.data, raw=False) + raw_preds = self.converter.decode(preds.data, preds_size.data, raw=True)[:self.cfg.TEST.N_TEST_DISP] + + return sim_preds, raw_preds + def preprocess_image(self, batched_inputs): """ Normalize, pad and batch the input images. @@ -91,4 +107,10 @@ def preprocess_image(self, batched_inputs): return image, text, length def loadData(self, v, data): - v.data.resize_(data.size()).copy_(data) \ No newline at end of file + v.data.resize_(data.size()).copy_(data) + + def get_batch_label(self, i): + label = [] + for idx in i: + label.append(list(self.labels[idx].values())[0]) + return label \ No newline at end of file diff --git a/modeling/recognizers/crnn_model.py b/modeling/recognizers/crnn_model.py new file mode 100644 index 0000000..38e1f38 --- /dev/null +++ b/modeling/recognizers/crnn_model.py @@ -0,0 +1,92 @@ +import torch.nn as nn +import torch.nn.functional as F + +class BidirectionalLSTM(nn.Module): + # Inputs hidden units Out + def __init__(self, nIn, nHidden, nOut): + super(BidirectionalLSTM, self).__init__() + + self.rnn = nn.LSTM(nIn, nHidden, bidirectional=True) + self.embedding = nn.Linear(nHidden * 2, nOut) + + def forward(self, input): + recurrent, _ = self.rnn(input) + T, b, h = recurrent.size() + t_rec = recurrent.view(T * b, h) + + output = self.embedding(t_rec) # [T * b, nOut] + output = output.view(T, b, -1) + + return output + +class CRNN(nn.Module): + def __init__(self, imgH, nc, nclass, nh, n_rnn=2, leakyRelu=False): + super(CRNN, self).__init__() + assert imgH % 16 == 0, 'imgH has to be a multiple of 16' + + ks = [3, 3, 3, 3, 3, 3, 2] + ps = [1, 1, 1, 1, 1, 1, 0] + ss = [1, 1, 1, 1, 1, 1, 1] + nm = [64, 128, 256, 256, 512, 512, 512] + + cnn = nn.Sequential() + + def convRelu(i, batchNormalization=False): + nIn = nc if i == 0 else nm[i - 1] + nOut = nm[i] + cnn.add_module('conv{0}'.format(i), + nn.Conv2d(nIn, nOut, ks[i], ss[i], ps[i])) + if batchNormalization: + cnn.add_module('batchnorm{0}'.format(i), nn.BatchNorm2d(nOut)) + if leakyRelu: + cnn.add_module('relu{0}'.format(i), + nn.LeakyReLU(0.2, inplace=True)) + else: + cnn.add_module('relu{0}'.format(i), nn.ReLU(True)) + + convRelu(0) + cnn.add_module('pooling{0}'.format(0), nn.MaxPool2d(2, 2)) # 64x16x64 + convRelu(1) + cnn.add_module('pooling{0}'.format(1), nn.MaxPool2d(2, 2)) # 128x8x32 + convRelu(2, True) + convRelu(3) + cnn.add_module('pooling{0}'.format(2), + nn.MaxPool2d((2, 2), (2, 1), (0, 1))) # 256x4x16 + convRelu(4, True) + convRelu(5) + cnn.add_module('pooling{0}'.format(3), + nn.MaxPool2d((2, 2), (2, 1), (0, 1))) # 512x2x16 + convRelu(6, True) # 512x1x16 + + self.cnn = cnn + self.rnn = nn.Sequential( + BidirectionalLSTM(512, nh, nh), + BidirectionalLSTM(nh, nh, nclass)) + + def forward(self, input): + + # conv features + conv = self.cnn(input) + b, c, h, w = conv.size() + print(conv.size()) + assert h == 1, "the height of conv must be 1" + conv = conv.squeeze(2) # b *512 * width + conv = conv.permute(2, 0, 1) # [w, b, c] + output = F.log_softmax(self.rnn(conv), dim=2) + + return output + +def weights_init(m): + classname = m.__class__.__name__ + if classname.find('Conv') != -1: + m.weight.data.normal_(0.0, 0.02) + elif classname.find('BatchNorm') != -1: + m.weight.data.normal_(1.0, 0.02) + m.bias.data.fill_(0) + +def get_crnn(config, num_classes): + + model = CRNN(config.MODEL.CRNN.IMAGE_SIZE_H, 1, num_classes + 1, config.MODEL.CRNN.NUM_HIDDEN) + model.apply(weights_init) + + return model \ No newline at end of file diff --git a/synthetic_chinese b/synthetic_chinese new file mode 120000 index 0000000..3ff0301 --- /dev/null +++ b/synthetic_chinese @@ -0,0 +1 @@ +/home/weiyaowu/Documents/datasets/synthetic_chinese \ No newline at end of file diff --git a/train_net.py b/train_net.py index cd80226..dc9119b 100644 --- a/train_net.py +++ b/train_net.py @@ -32,7 +32,7 @@ def setup(args): Create configs and perform basic setups. """ cfg = get_cfg() - if "toydet" in args.config_file: + if "toydet" in args.config_file or "crnn" in args.config_file: add_textnet_config(cfg) elif "layout" in args.config_file: add_centernet_config(cfg) diff --git a/yamls/text_recognizer/crnn_text_recognizer_360cc.yaml b/yamls/text_recognizer/crnn_text_recognizer_360cc.yaml new file mode 100644 index 0000000..8e5aba2 --- /dev/null +++ b/yamls/text_recognizer/crnn_text_recognizer_360cc.yaml @@ -0,0 +1,35 @@ +MODEL: + META_ARCHITECTURE: "CRNNet" + DEVICE: "cuda" + CRNN: + NAME: 'crnn' + IMAGE_SIZE_OW: 280 # origial width: 280 + IMAGE_SIZE_H: 32 + IMAGE_SIZE_W: 160 # resized width: 160 + NUM_CLASSES: 0 + NUM_HIDDEN: 256 + + +DATASETS: + TYPE: "360CC" + ROOT: "synthetic_chinese/images" + CHAR_FILE: 'synthetic_chinese/char_std_5990.txt' + JSON_FILE_TRAIN: 'synthetic_chinese/train.txt' + JSON_FILE_VAL: 'synthetic_chinese/test.txt' + STD: 0.193 + MEAN: 0.588 + ALPHABETS: '' + + +SOLVER: + IMS_PER_BATCH: 64 + SHUFFLE: True + WORKERS: 4 + PIN_MEMORY: False + STEPS: [60, 80] + BASE_LR: 0.0001 + OPTIMIZER: 'SGD' + MAX_ITER: 100 + + + diff --git a/yamls/text_recognizer/crnn_text_recognizer_toy.yaml b/yamls/text_recognizer/crnn_text_recognizer_toy.yaml deleted file mode 100644 index e69de29..0000000