# SaTHE/data.py at main · Ci2Lab/SaTHE · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
#from curses import use_default_colors
import os
from collections import defaultdict
from itertools import chain
import random
from pathlib import Path
#from dataclasses import dataclass
from glob import glob
import pickle
import fiona
import rasterio as rio
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset
from torchvision.transforms import Normalize, Compose
from utils import most_recent_file, parse_date, erode_and_dilate, ToTensor, SelectChannels
from geoutils import calculate_ndvi, save_np_array_to_img


# Default input locations for the Askvoll scene. They are the default
# arguments of load_images(); all paths are relative to the working directory.
# Optical raster (file name suggests a Pleiades acquisition - confirm).
OPT_PATH = 'data/preprocessed/askvoll/x/pleiades_askvoll_2021_pan_transformed_resized_clipped.tif'
# SAR raster (file name suggests an ICEYE acquisition - confirm).
SAR_PATH = 'data/preprocessed/askvoll/x/ICEYE_X8_GRD_SLH_60788_20210611T094555_TC_transformed_resized_clipped.tif'
# Canopy height raster used as ground truth.
CHM_PATH = 'data/preprocessed/askvoll/y/treeheight.tif' # actually p95 (presumably the 95th height percentile - TODO confirm)
# Vector polygons: area of interest and manually marked deforested areas.
AOI_MASK_PATH = 'data/preprocessed/askvoll/mask/askvoll.geojson'
DEFORESTED_MASK_PATH = 'data/preprocessed/askvoll/mask/deforested.geojson'
# Raster assigning pixels to the validation/test/training split.
DS_SPLIT_MASK_PATH = 'data/preprocessed/askvoll/mask/ds_split.tif'


def load_images(opt_path=OPT_PATH, sar_path=SAR_PATH, chm_path=CHM_PATH, aoi_mask_path=AOI_MASK_PATH, deforested_mask_path=DEFORESTED_MASK_PATH, ds_split_mask_path=DS_SPLIT_MASK_PATH,  labels='all'):
    """Read every raster and vector input needed to build the dataset.

    The optical, SAR and CHM rasters are read in full and cast to float32;
    the split raster is returned in its on-disk dtype. The two GeoJSON files
    are reduced to lists of feature geometries.

    `labels` is only validated against the accepted names; it does not change
    what is loaded.

    Returns:
        (opt, sar, chm, aoi_mask, deforested_mask, split_mask, meta) where
        meta maps 'opt' / 'sar' / 'chm' to a dict with the keys
        'crs', 'affine', 'height' and 'width'.

    Raises:
        AssertionError: on an unknown `labels` value or when the three
            rasters do not share one CRS.
    """
    assert labels in ['all',  'treemask', 'treeheight']
    for announced_path in (opt_path, sar_path, chm_path, aoi_mask_path, deforested_mask_path):
        print('Loading',announced_path,'..')

    def summarize(dataset):
        # Georeferencing subset that is kept for each raster.
        return {
            'crs': dataset.crs,
            'affine': dataset.meta['transform'],
            'height': dataset.meta['height'],
            'width': dataset.meta['width'],
        }

    def read_geometries(vector_path):
        # Keep only the geometry of every feature in the vector file.
        with fiona.open(vector_path, "r") as features:
            return [feature["geometry"] for feature in features]

    with rio.open(opt_path) as opt_ds, rio.open(sar_path) as sar_ds, rio.open(ds_split_mask_path) as split_ds, rio.open(chm_path) as chm_ds:
        opt = opt_ds.read().astype(np.float32)
        sar = sar_ds.read().astype(np.float32)
        split_mask = split_ds.read()
        chm = chm_ds.read().astype(np.float32)

        meta = {
            name: summarize(dataset)
            for name, dataset in (('opt', opt_ds), ('sar', sar_ds), ('chm', chm_ds))
        }

        # All rasters must be in the same coordinate reference system.
        assert meta['opt']['crs'] == meta['sar']['crs']
        assert meta['opt']['crs'] == meta['chm']['crs']

        aoi_mask = read_geometries(aoi_mask_path)
        deforested_mask = read_geometries(deforested_mask_path)

    return opt, sar, chm, aoi_mask, deforested_mask, split_mask, meta


def detect_tree_points(CHM_PATH, OPT_PATH, height_threshold=1.3, ndvi_threshold=0.55, save=False):
    """Build a binary tree mask from the canopy height raster and NDVI.

    A pixel is a tree candidate when its height is above `height_threshold`
    and its NDVI is above `ndvi_threshold`. The candidate mask is then passed
    through erode_and_dilate() to remove isolated points and noise.

    Args:
        CHM_PATH: path of the canopy height raster (one band expected, since
            the band axis is squeezed away).
        OPT_PATH: path of the multispectral raster.
        height_threshold: minimum height for a pixel to count as a tree.
        ndvi_threshold: minimum NDVI for a pixel to count as vegetation.
        save: when True, also write the mask to
            './data/preprocessed/askvoll/tmp/gen_treepoints.tif'.

    Returns:
        Mask of shape (h, w): 1 = tree, 0 = not tree.
    """
    with rio.open(CHM_PATH) as chm_ds:
        chm = chm_ds.read()
        # Copy the metadata while the dataset is still open; reading it from
        # a closed dataset is not reliable.
        chm_meta = chm_ds.meta.copy()
    with rio.open(OPT_PATH) as mspectral_ds:
        multispectral = mspectral_ds.read()

    # During preprocessing height/width might be off by one pixel. Only the
    # spatial axes are compared: the band counts legitimately differ.
    if chm.shape[1:] != multispectral.shape[1:]:
        print('Resizing to same shape')
        _, h, w = [min(s) for s in zip(chm.shape, multispectral.shape)]  # crop both to the common extent
        chm = chm[:, :h, :w]
        multispectral = multispectral[:, :h, :w]

    tree_height = chm > height_threshold  # boolean mask for heights
    # NOTE(review): assumes band index 3 is NIR and band index 0 is red, in
    # the argument order calculate_ndvi expects - confirm against geoutils.
    ndvi = calculate_ndvi(multispectral[3], multispectral[0])
    nan_pixels = np.isnan(ndvi)
    print(f'NDVI {np.count_nonzero(nan_pixels)/ndvi.size:.2%} of pixels is NaN. Setting to 0..')
    ndvi[nan_pixels] = 0
    green = ndvi > ndvi_threshold  # boolean mask for vegetation
    tree_points = np.squeeze(np.logical_and(tree_height, green).astype('uint8'), axis=0)

    # Remove isolated points and noise.
    improved = erode_and_dilate(tree_points, structure=np.ones((2,2)), it=2)
    improved[np.isnan(improved)] = 0
    print(np.max(improved))

    if save:
        # Keep the stored dimensions consistent with the (possibly cropped) mask.
        chm_meta.update(height=improved.shape[0], width=improved.shape[1])
        save_np_array_to_img(np.expand_dims(improved, 0), meta=chm_meta,  path='./data/preprocessed/askvoll/tmp/gen_treepoints.tif')

    return improved


def detect_tree_heights(CHM_PATH, OPT_PATH, height_threshold=1.3, ndvi_threshold=0.55, save=False):
    """Build a tree-height map from the canopy height raster and NDVI.

    Tree pixels are found as in the tree-mask step (height above
    `height_threshold`, NDVI above `ndvi_threshold`, cleaned with
    erode_and_dilate()); the resulting mask is multiplied with the first CHM
    band so that non-tree pixels become 0.

    Args:
        CHM_PATH: path of the canopy height raster (one band expected, since
            the band axis is squeezed away).
        OPT_PATH: path of the multispectral raster.
        height_threshold: minimum height for a pixel to count as a tree.
        ndvi_threshold: minimum NDVI for a pixel to count as vegetation.
        save: when True, also write the heights to
            './data/preprocessed/askvoll/tmp/gen_treeheights.tif'.

    Returns:
        Array of shape (h, w): > 0 = tree height, 0 = not tree.
    """
    with rio.open(CHM_PATH) as chm_ds:
        chm = chm_ds.read()
        # Copy the metadata while the dataset is still open; reading it from
        # a closed dataset is not reliable.
        chm_meta = chm_ds.meta.copy()
    with rio.open(OPT_PATH) as mspectral_ds:
        multispectral = mspectral_ds.read()

    # During preprocessing height/width might be off by one pixel. Only the
    # spatial axes are compared: the band counts legitimately differ.
    if chm.shape[1:] != multispectral.shape[1:]:
        print('Resizing to same shape')
        _, h, w = [min(s) for s in zip(chm.shape, multispectral.shape)]  # crop both to the common extent
        chm = chm[:, :h, :w]
        multispectral = multispectral[:, :h, :w]

    tree_height = chm > height_threshold  # boolean mask for heights
    # NOTE(review): assumes band index 3 is NIR and band index 0 is red, in
    # the argument order calculate_ndvi expects - confirm against geoutils.
    ndvi = calculate_ndvi(multispectral[3], multispectral[0])
    nan_pixels = np.isnan(ndvi)
    print(f'NDVI {np.count_nonzero(nan_pixels)/ndvi.size:.2%} of pixels is NaN. Setting to 0..')
    ndvi[nan_pixels] = 0
    green = ndvi > ndvi_threshold  # boolean mask for vegetation
    tree_points = np.squeeze(np.logical_and(tree_height, green).astype('uint8'), axis=0)

    # Remove isolated points and noise.
    tree_points = erode_and_dilate(tree_points, structure=np.ones((2,2)), it=2)

    # Keep the height only where a tree was detected; NaN heights become 0.
    tree_heights = np.multiply(tree_points, chm[0])
    tree_heights[np.isnan(tree_heights)] = 0

    if save:
        # Keep the stored dimensions consistent with the (possibly cropped) array.
        chm_meta.update(height=tree_heights.shape[0], width=tree_heights.shape[1])
        save_np_array_to_img(np.expand_dims(tree_heights, 0), meta=chm_meta, path='./data/preprocessed/askvoll/tmp/gen_treeheights.tif')

    return tree_heights


def create_dataset(opt: np.ndarray, sar: np.ndarray, meta: dict, tree_points: np.ndarray, tree_heights : np.ndarray,
                    arch: str, aoi_mask, deforested_mask, ds_split=None, patch_size=64,
                    save_to_disk=False, margin=0
                    ):
    """Tile the scene into patches and assign every patch to a dataset split.

    Args:
        opt: optical image, [C, H, W].
        sar: SAR image, [C, H, W].
        meta: images/ground-truth metadata; only meta['chm']['affine'] is
            read here (used to rasterize the polygons).
        tree_points: (H, W) pixels containing trees (1) and no trees (0).
        tree_heights: (H, W) tree heights.
        arch: model architecture ('unet' or 'resnext').
        aoi_mask: area of interest (polygons).
        deforested_mask: manually created polygons of deforested areas (used
            when the ground truth is not from the same timeframe as the
            input); may be empty.
        ds_split: optional [1, H, W] map of 0/1/2 referring to
            validation/test/training. Tiles that mix several values, or whose
            value is outside 0..2, are skipped. When None a random 10/10/80
            split is drawn.
        patch_size: side length of each square patch fed to the network.
        save_to_disk: pickle the dataset so it can be loaded for training.
        margin: border of a patch excluded from prediction at test time; when
            > 0 the tiles overlap with a stride of 2 * margin.

    Returns:
        The dataset dict produced by save_dataset().

    Raises:
        NotImplementedError: for an unsupported `arch`.
    """
    # `import rasterio` alone does not guarantee that the `features`
    # submodule is loaded, so import the function explicitly.
    from rasterio.features import rasterize

    available_archs = ['resnext', 'unet']
    if arch not in available_archs:
        raise NotImplementedError
    assert patch_size % 2 == 0

    random_split = ds_split is None
    if not random_split:
        print('Using prerendered dataset split mask..')

    images = [(img/255).astype('float32') for img in (opt, sar)]
    image_indices = {
        'opt': 0,
        'sar': 1
    }
    print(tree_points.shape, tree_heights.shape)
    labels = np.stack((tree_points,tree_heights), axis=0)
    if not np.issubdtype(labels.dtype, np.floating):
        # NaN is used below as the "invalid" marker, which needs a float array.
        labels = labels.astype(np.float32)
    gt_labels = ['treemask','treeheight']

    print('Read ground truth in following order', [str(i)+':'+name for i,name in enumerate(gt_labels)])

    # height and width of the scene
    h, w = labels[0].shape
    print(h,w)
    # aoi: raster with 0=outside polygon, 1=inside polygon
    rasterized_polygon = rasterize(aoi_mask, out_shape=(h, w), fill=0, transform=meta['chm']['affine'], dtype='uint8')
    # 1=deforested pixel; rasterize() rejects an empty shape list, so an
    # empty/missing deforestation layer simply marks nothing.
    if deforested_mask is not None and len(deforested_mask) > 0:
        deforested = rasterize(deforested_mask, out_shape=(h, w), fill=0, transform=meta['chm']['affine'], dtype='uint8')
    else:
        deforested = np.zeros((h, w), dtype='uint8')

    print('Assigning NaN values to ground truth..')
    labels[labels == -9999.] = np.nan
    labels[:, deforested == 1] = np.nan
    labels[:, rasterized_polygon == 0] = np.nan

    # mask to consider temporal changes, nan and aoi: 1 = valid, 0 = invalid.
    # NaN never compares equal to itself, so np.isnan must be used.
    mask = (~np.isnan(labels)).astype(np.uint8)

    ### verbose ###
    enough_trees_patches = 0
    tree_threshold = 0.1  # at least 10% of pixels are forested
    aoi_patches = 0
    opt_skipped = 0
    sar_skipped = 0
    valid_patches = 0  # patches that end up in one of the splits

    locations = defaultdict(list)  # {'val': [(i1,j1), ...], 'test': [...]}; (i, j) is the top-left corner of a patch
    dataset_names = ['val', 'test', 'train']

    # N of pixels inside a patch
    patch_px = patch_size**2
    dataset_split_mask = np.full((1,h,w),255,dtype=np.uint8)  # 0 val, 1 test, 2 train, 255 none

    # Overlapping tiles when a margin is requested, otherwise a plain grid.
    step_size = 2*margin if margin > 0 else patch_size
    # "+ 1" keeps the last tile whose far edge lands exactly on the border.
    row_starts = range(0, h - patch_size + 1, step_size)
    col_starts = range(0, w - patch_size + 1, step_size)
    tiles_total = len(row_starts) * len(col_starts)

    print(f'There are {tiles_total} tiles in total')
    print('Selecting valid tiles...')
    for i in row_starts:
        for j in col_starts:
            i_slice = slice(i, i+patch_size)
            j_slice = slice(j, j+patch_size)
            in_polygon = bool((rasterized_polygon[i_slice,j_slice]==1).all())  # tile entirely within aoi
            enough_trees = np.sum(tree_points[i_slice, j_slice]) >= patch_px*tree_threshold  # at least 10% trees

            ### Verbose ###
            if enough_trees:
                enough_trees_patches += 1
            if in_polygon:
                aoi_patches += 1

            if margin > 0:
                # any forested pixel inside the margin (temporary criterion,
                # to be replaced by averaging over the margin area)
                center_trees = tree_points[i+margin:i+patch_size-margin, j+margin:j+patch_size-margin].any()
                if not center_trees:
                    continue
            if not in_polygon:
                continue

            # The two zero checks below only feed the statistics: preprocessing
            # introduced zeros that imitate "nodata" without being nodata, so
            # tiles are not dropped because of them.
            if (opt[0, i_slice, j_slice] == 0).all(0).any():
                opt_skipped += 1
            if (sar[0, i_slice, j_slice] == 0).all(0).any():
                sar_skipped += 1

            ### Dataset splitting
            if random_split:
                # 10/10/80 random split: 0 -> val, 1 -> test, 2..9 -> train
                ds = min(random.randint(0,9), 2)
            else:
                ds = int(ds_split[0,i,j])
                # the whole tile has to belong to one known split
                if not (ds_split[0,i_slice,j_slice] == ds).all():
                    continue
                if not 0 <= ds < len(dataset_names):
                    continue

            valid_patches += 1
            ds_name = dataset_names[ds]
            dataset_split_mask[0,i_slice, j_slice] = ds  # record which split the patch went to
            locations[ds_name].append((i,j))

    def pct(count):
        # Share of all candidate tiles, safe when the scene yields no tile.
        return (count/tiles_total)*100 if tiles_total else 0.0

    ### Verbose ###
    valid_patches_pct = pct(valid_patches)
    print(f'Selected {valid_patches} patches')
    print(f'{valid_patches_pct:.2f}% of total patches extracted from images')
    print(f'{pct(enough_trees_patches):.2f}% of patches has atleast {tree_threshold*100:.1f}% forested pixels')
    print(f'{pct(aoi_patches):.2f}% of patches within area of interest')
    print(f'{pct(opt_skipped):.2f}% optical patches with dead pixels')
    print(f'{pct(sar_skipped):.2f}% sar patches with dead pixels')
    print('--------------------------------------')
    ###############

    ds = save_dataset(images, locations, image_indices, labels, dataset_split_mask, mask, arch, patch_size, margin, save_to_disk=save_to_disk)
    return ds

def save_dataset(images, locations, image_indices, labels, dataset_split_mask, mask, arch, patch_size=64, margin=0, save_to_disk=False):
    """Bundle the tiling result into one dict and optionally pickle it.

    Args:
        images: list of image arrays, indexed through `image_indices`.
        locations: mapping with the keys 'train', 'val' and 'test', each a
            list of (i, j) top-left patch corners.
        image_indices: mapping from image name to its position in `images`.
        labels: ground-truth array.
        dataset_split_mask: per-pixel split assignment.
        mask: validity mask for the labels.
        arch: architecture name, only used for the pickle file name.
        patch_size: side length of a patch.
        margin: patch border excluded from prediction.
        save_to_disk: when True the dict is passed to pickle_dataset().

    Returns:
        The dataset dict. The three location entries are int64 arrays of
        shape (n, 2).
    """
    def as_corner_array(corners):
        # A signed 64-bit type keeps coordinates beyond 65535 intact and
        # avoids wrap-around in later `corner + patch_size` arithmetic;
        # reshape gives empty splits a well-defined (0, 2) shape.
        return np.asarray(corners, dtype=np.int64).reshape(-1, 2)

    ds = {
                'images': images,
                'train': as_corner_array(locations['train']),
                'val': as_corner_array(locations['val']),
                'test': as_corner_array(locations['test']),
                'image_indices': image_indices,
                'total': len(locations['train']) + len(locations['val']) + len(locations['test']),
                'labels': labels,
                'patch_size': patch_size,
                'margin': margin,
                'split_mask': dataset_split_mask,
                'mask': mask
            }
    # split information
    total = ds['total']

    def share(count):
        # Percentage of all selected patches; 0 when nothing was selected.
        return (count/total)*100 if total else 0.0

    print(f'Train: {len(ds["train"])}, {share(len(ds["train"])):.2f}%')
    print(f'Validation: {len(ds["val"])}, {share(len(ds["val"])):.2f}%')
    print(f'Test: {len(ds["test"])}, {share(len(ds["test"])):.2f}%')
    if save_to_disk:
        pickle_dataset(ds, arch)
    return ds


def pickle_dataset(dataset, arch):
    """Pickle `dataset` to data/datasets/pkl/dataset_<arch>_<date>.pkl.

    The date part comes from parse_date(). The target directory is created
    when it does not exist yet.
    """
    # Build the path once so the reported name is exactly the written one.
    pkl_path = Path(f'data/datasets/pkl/dataset_{arch}_{parse_date()}.pkl')
    pkl_path.parent.mkdir(parents=True, exist_ok=True)
    with pkl_path.open('wb') as fh:
        pickle.dump(dataset, fh)
    print(f'Dataset pickled to {pkl_path.as_posix()}')


### Dataset ###

"""
RSDataset - construct dataset (RS - Remote Sensing)
PatchDataset - data split into patches
DataLoader - data loaded in batches
"""

class RSDataset:
    """Load the most recent pickled dataset and expose train/val loaders.

    Args:
        opt_image_bands: 1-based optical band numbers.
        sar_image_polarizations: 1-based SAR channel numbers.
        labels_bands: 1-based label channel numbers.
        pkl_path: glob pattern matching the pickled datasets.
        patch_size: side length of the patches served by the loaders.
        batch_size: batch size of both loaders.
        normalize_labels: accepted but currently unused.

    Attributes:
        train_set, val_set: PatchDataset instances for the two splits.
        train_loader: shuffled DataLoader over train_set.
        val_loader: unshuffled DataLoader over val_set.

    Raises:
        FileNotFoundError: when no file matches `pkl_path`.
    """
    def __init__(self,
                 opt_image_bands: list[int],
                 sar_image_polarizations: list[int],
                 labels_bands: list[int],
                 #data_stats: dict,
                 pkl_path: str = 'data/datasets/pkl/*.pkl',
                 patch_size: int = 64,
                 batch_size: int = 64,
                 normalize_labels: bool = False,
                ):
        # load latest pickled dataset
        pkl_files = glob(str(pkl_path))
        if not pkl_files:
            raise FileNotFoundError(f"No pickled dataset matches '{pkl_path}'")
        pkl_file = Path(most_recent_file(pkl_files))
        # NOTE: unpickling can execute arbitrary code; only load dataset
        # files that were produced by this project.
        with pkl_file.open('rb') as pkl:
            print(f"Loading dataset '{pkl_file}'...")
            dataset = pickle.load(pkl)

        # to 0 based indices
        # NOTE(review): these selections are not applied anywhere yet - the
        # channel-selection/normalization transforms are disabled, so every
        # band of the stored images and labels is served.
        opt_channels = np.array(opt_image_bands)-1
        sar_channels = np.array(sar_image_polarizations)-1
        labels_channels = np.array(labels_bands)-1

        # NOTE(review): `patch_size` is taken from the argument, not from
        # dataset['patch_size'] that the locations were generated with -
        # confirm that a mismatch is intended to be possible.
        params = {
            'images': dataset['images'],
            'labels': dataset['labels'],
            'image_indices': dataset['image_indices'],
            'patch_size': patch_size,
        }

        self.train_set, self.val_set = (PatchDataset(dataset[name], **params)
                                              for name in ['train','val'])
        self.train_loader = DataLoader(self.train_set, batch_size=batch_size, shuffle=True)
        self.val_loader = DataLoader(self.val_set, batch_size=batch_size, shuffle=False)


class PatchDataset(Dataset):
    """Map-style dataset serving square patches cut from full-scene arrays.

    Args:
        locations: array of (i, j) top-left patch corners, one row per patch.
        images: list of [C, H, W] arrays; `image_indices` gives the positions
            of the 'opt' and 'sar' images in this list.
        image_indices: mapping with the keys 'opt' and 'sar'.
        labels: [C, H, W] ground-truth array.
        patch_size: side length of a patch in pixels.
    """
    def __init__(self,
                 locations: np.ndarray,
                 images: list,
                 image_indices: dict,
                 labels: np.ndarray,
                 patch_size: int):
        self.locations = locations
        self.images = images
        self.labels = labels
        self.image_indices = image_indices
        self.patch_size = patch_size

    def __getitem__(self, index):
        """Return (patch, labels_patch) for the patch at `index`.

        `patch` stacks the optical channels followed by the SAR channels
        along the channel axis; `labels_patch` holds all label channels.
        """
        # Convert to Python ints first: the corners are stored as fixed-width
        # NumPy integers, and `corner + patch_size` computed in that dtype
        # wraps around near its upper limit, which yields an empty slice.
        i, j = (int(v) for v in self.locations[index])
        i_slice = slice(i, i+self.patch_size)
        j_slice = slice(j, j+self.patch_size)
        opt_idx = self.image_indices['opt']
        sar_idx = self.image_indices['sar']

        opt_patch = torch.from_numpy(self.images[opt_idx][:, i_slice, j_slice])
        sar_patch = torch.from_numpy(self.images[sar_idx][:, i_slice, j_slice])
        patch = torch.cat([opt_patch, sar_patch], dim=0)

        labels_patch = torch.from_numpy(self.labels[:, i_slice, j_slice])

        return patch, labels_patch

    def __len__(self):
        """Number of patches, i.e. number of stored locations."""
        return len(self.locations)


def main():
    """Load one previously pickled dataset and print its training-set size.

    The dataset-generation pipeline is currently disabled; it used to run,
    in order:
        load_images(labels='all')
        detect_tree_points(height_threshold=1.37, ndvi_threshold=0.55)
        detect_tree_heights(height_threshold=1.37, ndvi_threshold=0.55)
        create_dataset(..., 'unet', ..., ds_split=ds_split, patch_size=64,
                       margin=16, save_to_disk=True)
    """
    dataset_file = Path('data/datasets/pkl/dataset_unet_01-Dec-2023.pkl')
    with open(dataset_file, 'rb') as stream:
        # Announce only once the file has been opened successfully.
        print(f"Loading dataset '{dataset_file}'...")
        loaded = pickle.load(stream)

    train_locations = loaded['train']
    print(len(train_locations))

# Script entry point: runs only when the file is executed directly.
if __name__ == '__main__':
    # Ask PyTorch to release unoccupied cached GPU memory before starting.
    torch.cuda.empty_cache()
    main()