cfchen-duke · zzdyyy · May 5, 2020 · May 5, 2020
diff --git a/README.txt b/README.txt
@@ -14,12 +14,14 @@ Recommended hardware: 4 NVIDIA Tesla P-100 GPUs or 8 NVIDIA Tesla K-80 GPUs
 
 Instructions for preparing the data:
 1. Download the dataset CUB_200_2011.tgz from http://www.vision.caltech.edu/visipedia/CUB-200-2011.html
-2. Unpack CUB_200_2011.tgz
-3. Crop the images using information from bounding_boxes.txt (included in the dataset)
-4. Split the cropped images into training and test sets, using train_test_split.txt (included in the dataset)
-5. Put the cropped training images in the directory "./datasets/cub200_cropped/train_cropped/"
-6. Put the cropped test images in the directory "./datasets/cub200_cropped/test_cropped/"
-7. Augment the training set using img_aug.py (included in this code package)
+2. Unpack CUB_200_2011.tgz into "../CUB_200_2011/"
+3. Preprocess the CUB_200_2011 dataset using img_crop.py (included in this code package)
+   -- this will perform the following steps:
+      a. Crop the images using information from bounding_boxes.txt (included in the dataset)
+      b. Split the cropped images into training and test sets, using train_test_split.txt (included in the dataset)
+      c. Put the cropped training images in the directory "./datasets/cub200_cropped/train_cropped/"
+      d. Put the cropped test images in the directory "./datasets/cub200_cropped/test_cropped/"
+4. Augment the training set using img_aug.py (included in this code package)
    -- this will create an augmented training set in the following directory:
       "./datasets/cub200_cropped/train_cropped_augmented/"
 

diff --git a/img_aug.py b/img_aug.py
@@ -17,7 +17,7 @@ def makedir(path):
 
 for i in range(len(folders)):
     fd = folders[i]
-    tfd = target_folders[i]
+    tfd = os.path.abspath(target_folders[i])
     # rotation
     p = Augmentor.Pipeline(source_directory=fd, output_directory=tfd)
     p.rotate(probability=1, max_left_rotation=15, max_right_rotation=15)

diff --git a/img_crop.py b/img_crop.py
@@ -0,0 +1,41 @@
+import os
+import pandas as pd
+import cv2
+
+def makedir(path):
+    '''
+    create the directory path (and any missing parents); do nothing if it already exists
+    '''
+    # exist_ok=True replaces the exists()-then-create check, which could race with another process
+    os.makedirs(path, exist_ok=True)
+
+source_dir = '../CUB_200_2011/'  # original CUB_200_2011 dataset directory
+
+datasets_root_dir = './datasets/cub200_cropped/'
+train_dir = datasets_root_dir + 'train_cropped/'
+test_dir = datasets_root_dir + 'test_cropped/'
+makedir(train_dir)
+makedir(test_dir)
+
+classes = pd.read_csv(source_dir + 'classes.txt', sep=' ', names=['id', 'classname'], index_col='id')
+for classname in classes['classname']:  # one output sub-directory per class, in both splits
+    makedir(train_dir + classname)
+    makedir(test_dir + classname)
+
+images = pd.read_csv(source_dir + 'images.txt', sep=' ', names=['id', 'path'], index_col='id')
+bounding_boxes = pd.read_csv(source_dir + 'bounding_boxes.txt', sep=' ', names=['id', 'x', 'y', 'width', 'height'], index_col='id')
+train_test_split = pd.read_csv(source_dir + 'train_test_split.txt', sep=' ', names=['id', 'train'], index_col='id')
+
+for idx in images.index:
+    print(idx)  # progress output
+    imgpath, = images.loc[idx]
+    is_train, = train_test_split.loc[idx]
+    # cast to int so the box can be used as slice bounds: rows y..y+height, columns x..x+width
+    x, y, width, height = (int(v) for v in bounding_boxes.loc[idx])
+    img = cv2.imread(source_dir + 'images/' + imgpath)
+    if img is None:  # cv2.imread returns None instead of raising when a file cannot be read
+        raise OSError('cannot read image: ' + source_dir + 'images/' + imgpath)
+    basepath = train_dir if is_train else test_dir
+    cv2.imwrite(basepath + imgpath, img[y:y+height, x:x+width, :])
+
+