-
Notifications
You must be signed in to change notification settings - Fork 2
Expand file tree
/
Copy pathpreprocessing_with_tile_data_overlap.py
More file actions
1489 lines (1175 loc) · 46.3 KB
/
Copy pathpreprocessing_with_tile_data_overlap.py
File metadata and controls
1489 lines (1175 loc) · 46.3 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
import math
import multiprocessing
import numpy as np
import openslide
from openslide import OpenSlideError
import os
import PIL
from PIL import Image
import re
import sys
import utilsPreprocessing as util
import sys
import skimage.morphology as sk_morphology
import pandas as pd
from enum import Enum
import skimage.color as sk_color
import random
# # BASE_DIR = "/Users/ebergstr/Desktop/lab/thesis/Aim3/histology/BRCA/debug/"
# # SKIPPED_SAMPLES = os.path.join(BASE_DIR, "skipped_samples.txt")
# # PROJECT = "BRCA"
# TRAIN_PREFIX = PROJECT + "-"
# SRC_TRAIN_DIR = os.path.join(BASE_DIR, PROJECT + "/")
# SRC_TRAIN_EXT = "svs"
# DEST_TRAIN_EXT = "png"
# Downsampling divisor between a whole-slide image and its scaled-down copy:
# scaled width/height are computed as floor(original / SCALE_FACTOR), and the
# value is also embedded in generated image filenames (e.g. "32x").
SCALE_FACTOR = 32
# DEST_TRAIN_DIR = os.path.join(BASE_DIR, PROJECT + "_" + DEST_TRAIN_EXT)
# TISSUE_HIGH_THRESH = 80
# TISSUE_LOW_THRESH = 10
# RESOLUTION = '5x'
# ROW_TILE_SIZE = 256
# COL_TILE_SIZE = 256
# NUM_TOP_TILES = 100000
# HSV_PURPLE = 270
# HSV_PINK = 330
# FILTER_RESULT_TEXT = "filtered"
# FILTER_DIR = os.path.join(BASE_DIR, "filter_" + DEST_TRAIN_EXT)
# TOP_TILES_SUFFIX = "top_tile_summary"
# TOP_TILES_DIR = os.path.join(BASE_DIR, TOP_TILES_SUFFIX + "_" + DEST_TRAIN_EXT)
# TILE_DIR = os.path.join(BASE_DIR, "tiles_" + DEST_TRAIN_EXT)
# TILE_SUFFIX = "tile"
def get_tile_image_path(tile):
    """
    Build the on-disk path for a tile image.

    The filename encodes the tile's row/column numbers, the x/y pixel position
    of its top-left corner in the original slide, and its pixel width/height.
    Relies on the module-level TILE_DIR, TRAIN_PREFIX, TILE_SUFFIX and
    DEST_TRAIN_EXT settings.

    Args:
        tile: Tile object exposing slide_num, r, c, o_c_s, o_c_e, o_r_s, o_r_e.

    Returns:
        Path to the tile image, inside a per-slide sub-directory of TILE_DIR.
    """
    slide_id = str(tile.slide_num).zfill(3)
    width = tile.o_c_e - tile.o_c_s
    height = tile.o_r_e - tile.o_r_s
    position = "-r%d-c%d-x%d-y%d-w%d-h%d" % (
        tile.r, tile.c, tile.o_c_s, tile.o_r_s, width, height)
    filename = TRAIN_PREFIX + slide_id + "-" + TILE_SUFFIX + position + "." + DEST_TRAIN_EXT
    return os.path.join(TILE_DIR, slide_id, filename)
def get_filter_image_result(slide_number):
    """
    Map a slide number to the path of its final filtered image.

    The original and scaled dimensions are recovered from the filename of the
    slide's scaled training image and embedded in the result filename.

    Example:
        5 -> ../data/filter_png/TUPAC-TR-005-32x-49920x108288-1560x3384-filtered.png

    Args:
        slide_number: The slide number.

    Returns:
        Path to the filter image file.
    """
    slide_id = str(slide_number).zfill(3)
    scaled_img_path = util.getTrainingImagePath(
        slide_number, DEST_TRAIN_DIR, TRAIN_PREFIX, DEST_TRAIN_EXT, SCALE_FACTOR)
    large_w, large_h, small_w, small_h = util.parseDimensionsFromImageFilename(scaled_img_path)
    filename = "".join([
        TRAIN_PREFIX, slide_id,
        "-", str(SCALE_FACTOR), "x-",
        str(large_w), "x", str(large_h),
        "-", str(small_w), "x", str(small_h),
        "-", FILTER_RESULT_TEXT,
        ".", DEST_TRAIN_EXT,
    ])
    return os.path.join(FILTER_DIR, filename)
def summary_stats(tile_summary):
    """
    Obtain various stats about the slide tiles.

    Args:
        tile_summary: TileSummary object. Must expose orig_w/orig_h,
            orig_tile_w/orig_tile_h, scale_factor, scaled_w/scaled_h,
            scaled_tile_w/scaled_tile_h, mask_percentage() (method),
            tissue_percentage (attribute), num_col_tiles, num_row_tiles,
            count, high, medium, low and none.

    Returns:
        Various stats about the slide tiles as a multi-line string (no trailing
        newline).
    """
    ts = tile_summary
    total = ts.count

    def share(n):
        # Percentage of all tiles; a slide with no tiles reports 0% instead of
        # raising ZeroDivisionError.
        return n / total * 100 if total else 0.0

    lines = [
        "Original Dimensions: %dx%d" % (ts.orig_w, ts.orig_h),
        "Original Tile Size: %dx%d" % (ts.orig_tile_w, ts.orig_tile_h),
        "Scale Factor: 1/%dx" % ts.scale_factor,
        "Scaled Dimensions: %dx%d" % (ts.scaled_w, ts.scaled_h),
        # Width x height (the width used to be printed twice).
        "Scaled Tile Size: %dx%d" % (ts.scaled_tile_w, ts.scaled_tile_h),
        "Total Mask: %3.2f%%, Total Tissue: %3.2f%%" % (
            ts.mask_percentage(), ts.tissue_percentage),
        "Tiles: %dx%d = %d" % (ts.num_col_tiles, ts.num_row_tiles, ts.count),
        " %5d (%5.2f%%) tiles >=%d%% tissue" % (
            ts.high, share(ts.high), TISSUE_HIGH_THRESH),
        " %5d (%5.2f%%) tiles >=%d%% and <%d%% tissue" % (
            ts.medium, share(ts.medium), TISSUE_LOW_THRESH, TISSUE_HIGH_THRESH),
        " %5d (%5.2f%%) tiles >0%% and <%d%% tissue" % (
            ts.low, share(ts.low), TISSUE_LOW_THRESH),
        " %5d (%5.2f%%) tiles =0%% tissue" % (ts.none, share(ts.none)),
    ]
    return "\n".join(lines)
# def parse_dimensions_from_image_filename(filename):
# """
# Parse an image filename to extract the original width and height and the converted width and height.
# Example:
# "TUPAC-TR-011-32x-97103x79079-3034x2471-tile_summary.png" -> (97103, 79079, 3034, 2471)
# Args:
# filename: The image filename.
# Returns:
# Tuple consisting of the original width, original height, the converted width, and the converted height.
# """
# m = re.match(".*-([\d]*)x([\d]*)-([\d]*)x([\d]*).*\..*", filename)
# large_w = int(m.group(1))
# large_h = int(m.group(2))
# small_w = int(m.group(3))
# small_h = int(m.group(4))
# return large_w, large_h, small_w, small_h
def small_to_large_mapping(small_pixel, large_dimensions):
    """
    Convert a pixel position in the scaled-down image to the matching pixel
    position in the original whole-slide image.

    The scaled image dimensions are floor(original / SCALE_FACTOR), so each
    axis is corrected by the ratio between the exact and the floored scaled
    extent before multiplying back up by SCALE_FACTOR.

    Args:
        small_pixel: (x, y) position in the scaled-down image.
        large_dimensions: (width, height) of the original whole-slide image.

    Returns:
        Tuple (x, y) giving the position in the original whole-slide image.
    """
    def scale_up(small_coord, large_extent):
        exact_scaled_extent = large_extent / SCALE_FACTOR
        correction = exact_scaled_extent / math.floor(exact_scaled_extent)
        return round(correction * (SCALE_FACTOR * small_coord))

    small_x, small_y = small_pixel
    large_w, large_h = large_dimensions
    return (scale_up(small_x, large_w), scale_up(small_y, large_h))
def filter_green_channel(np_img, green_thresh=200, avoid_overmask=True, overmask_thresh=90, output_type="bool"):
    """
    Create a mask to filter out pixels with a green channel value at or above a
    particular threshold, since hematoxylin and eosin are purplish and pinkish,
    which do not have much green to them. Pixels whose green value is exactly 0
    are masked out as well.

    If the mask would remove too much of the image, the function retries with a
    green threshold halfway between the current one and 255, repeating until
    the mask is acceptable or the threshold reaches 255.

    Args:
        np_img: RGB image as a NumPy array.
        green_thresh: Green channel threshold value (0 to 255). Pixels with a
            green value >= green_thresh are masked out.
        avoid_overmask: If True, avoid masking above the overmask_thresh percentage.
        overmask_thresh: If avoid_overmask is True, avoid masking above this
            threshold percentage value.
        output_type: Type of array to return (bool, float, or uint8).

    Returns:
        NumPy array representing a mask where pixels above a particular green
        channel threshold have been masked out (False / 0).
    """
    g = np_img[:, :, 1]
    gr_ch_mask = (g < green_thresh) & (g > 0)
    # util.mask_percent is defined outside this file; presumably it returns the
    # percentage of pixels that are masked out -- confirm against util.
    mask_percentage = util.mask_percent(gr_ch_mask)
    if (mask_percentage >= overmask_thresh) and (green_thresh < 255) and (avoid_overmask is True):
        new_green_thresh = math.ceil((255 - green_thresh) / 2 + green_thresh)
        # Always recurse for a bool mask and convert exactly once below.
        # Forwarding output_type made a "uint8" result get multiplied by 255
        # twice, wrapping every kept pixel around to 1.
        gr_ch_mask = filter_green_channel(np_img, new_green_thresh, avoid_overmask, overmask_thresh, "bool")

    if output_type == "bool":
        return gr_ch_mask
    if output_type == "float":
        return gr_ch_mask.astype(float)
    return gr_ch_mask.astype("uint8") * 255
def filter_grays(rgb, tolerance=15, output_type="bool"):
    """
    Build a mask that removes gray-ish pixels, i.e. pixels whose red, green and
    blue values all lie within `tolerance` of one another.

    Args:
        rgb: RGB image as a NumPy array (3-dimensional).
        tolerance: Maximum pairwise channel difference for a pixel to count as gray.
        output_type: Type of array to return (bool, float, or uint8).

    Returns:
        NumPy array mask where gray pixels are masked out (False / 0).
    """
    # Unpacking doubles as a check that the input is a 3-dimensional array.
    _height, _width, _channels = rgb.shape
    # Signed ints so channel differences cannot wrap around for uint8 input.
    signed = rgb.astype(int)
    red = signed[:, :, 0]
    green = signed[:, :, 1]
    blue = signed[:, :, 2]
    is_gray = (
        (np.abs(red - green) <= tolerance)
        & (np.abs(red - blue) <= tolerance)
        & (np.abs(green - blue) <= tolerance)
    )
    keep = ~is_gray
    if output_type == "bool":
        return keep
    if output_type == "float":
        return keep.astype(float)
    return keep.astype("uint8") * 255
def filter_red(rgb, red_lower_thresh, green_upper_thresh, blue_upper_thresh, output_type="bool",
               display_np_info=False):
    """
    Build a mask that removes reddish pixels: those with a red value above
    red_lower_thresh, a green value below green_upper_thresh and a blue value
    below blue_upper_thresh.

    Args:
        rgb: RGB image as a NumPy array.
        red_lower_thresh: Red channel lower threshold value.
        green_upper_thresh: Green channel upper threshold value.
        blue_upper_thresh: Blue channel upper threshold value.
        output_type: Type of array to return (bool, float, or uint8).
        display_np_info: Accepted for interface compatibility; not used here.

    Returns:
        NumPy array representing the mask (reddish pixels are False / 0).
    """
    is_reddish = (
        (rgb[:, :, 0] > red_lower_thresh)
        & (rgb[:, :, 1] < green_upper_thresh)
        & (rgb[:, :, 2] < blue_upper_thresh)
    )
    keep = np.logical_not(is_reddish)
    if output_type == "bool":
        return keep
    if output_type == "float":
        return keep.astype(float)
    return keep.astype("uint8") * 255
def filter_green(rgb, red_upper_thresh, green_lower_thresh, blue_lower_thresh, output_type="bool", display_np_info=False):
    """
    Build a mask that removes greenish pixels: those with a red value below
    red_upper_thresh, a green value above green_lower_thresh and a blue value
    above blue_lower_thresh.

    For green ink the green and blue channels tend to track together, which is
    why blue uses a lower threshold rather than an upper one.

    Args:
        rgb: RGB image as a NumPy array.
        red_upper_thresh: Red channel upper threshold value.
        green_lower_thresh: Green channel lower threshold value.
        blue_lower_thresh: Blue channel lower threshold value.
        output_type: Type of array to return (bool, float, or uint8).
        display_np_info: Accepted for interface compatibility; not used here.

    Returns:
        NumPy array representing the mask (greenish pixels are False / 0).
    """
    low_red = rgb[:, :, 0] < red_upper_thresh
    high_green = rgb[:, :, 1] > green_lower_thresh
    high_blue = rgb[:, :, 2] > blue_lower_thresh
    is_greenish = low_red & high_green & high_blue
    mask = np.logical_not(is_greenish)
    if output_type == "float":
        mask = mask.astype(float)
    elif output_type != "bool":
        mask = mask.astype("uint8") * 255
    return mask
def filter_blue(rgb, red_upper_thresh, green_upper_thresh, blue_lower_thresh, output_type="bool",
                display_np_info=False):
    """
    Build a mask that removes blueish pixels: those with a red value below
    red_upper_thresh, a green value below green_upper_thresh and a blue value
    above blue_lower_thresh.

    Args:
        rgb: RGB image as a NumPy array.
        red_upper_thresh: Red channel upper threshold value.
        green_upper_thresh: Green channel upper threshold value.
        blue_lower_thresh: Blue channel lower threshold value.
        output_type: Type of array to return (bool, float, or uint8).
        display_np_info: Accepted for interface compatibility; not used here.

    Returns:
        NumPy array representing the mask (blueish pixels are False / 0).
    """
    red_channel = rgb[:, :, 0]
    green_channel = rgb[:, :, 1]
    blue_channel = rgb[:, :, 2]
    is_blueish = (red_channel < red_upper_thresh) & (green_channel < green_upper_thresh)
    is_blueish = is_blueish & (blue_channel > blue_lower_thresh)
    not_blue = ~is_blueish
    if output_type == "bool":
        return not_blue
    if output_type == "float":
        return not_blue.astype(float)
    return not_blue.astype("uint8") * 255
def filter_red_pen(rgb, output_type="bool"):
    """
    Build a mask that removes red pen marks from a slide.

    The mask is the intersection of several filter_red() masks, one per
    (red lower, green upper, blue upper) threshold triple listed below.

    Args:
        rgb: RGB image as a NumPy array.
        output_type: Type of array to return (bool, float, or uint8).

    Returns:
        NumPy array representing the mask.
    """
    # (red_lower_thresh, green_upper_thresh, blue_upper_thresh)
    red_shades = (
        (150, 80, 90),
        (110, 20, 30),
        (185, 65, 105),
        (195, 85, 125),
        (220, 115, 145),
        (125, 40, 70),
        (200, 120, 150),
        (100, 50, 65),
        (85, 25, 45),
    )
    combined = None
    for red_lower, green_upper, blue_upper in red_shades:
        shade_mask = filter_red(rgb, red_lower_thresh=red_lower, green_upper_thresh=green_upper,
                                blue_upper_thresh=blue_upper)
        combined = shade_mask if combined is None else combined & shade_mask

    if output_type == "bool":
        return combined
    if output_type == "float":
        return combined.astype(float)
    return combined.astype("uint8") * 255
def filter_green_pen(rgb, output_type="bool"):
    """
    Build a mask that removes green pen marks from a slide.

    The mask is the intersection of several filter_green() masks, one per
    (red upper, green lower, blue lower) threshold triple listed below.

    Args:
        rgb: RGB image as a NumPy array.
        output_type: Type of array to return (bool, float, or uint8).

    Returns:
        NumPy array representing the mask.
    """
    # (red_upper_thresh, green_lower_thresh, blue_lower_thresh)
    green_shades = [
        (150, 160, 140),
        (70, 110, 110),
        (45, 115, 100),
        (30, 75, 60),
        (195, 220, 210),
        (225, 230, 225),
        (170, 210, 200),
        (20, 30, 20),
        (50, 60, 40),
        (30, 50, 35),
        (65, 70, 60),
        (100, 110, 105),
        (165, 180, 180),
        (140, 140, 150),
        (185, 195, 195),
    ]
    shade_masks = [
        filter_green(rgb, red_upper_thresh=red_upper, green_lower_thresh=green_lower,
                     blue_lower_thresh=blue_lower)
        for red_upper, green_lower, blue_lower in green_shades
    ]
    # Start from the first mask and AND the remaining ones in, in list order.
    mask = shade_masks[0]
    for shade_mask in shade_masks[1:]:
        mask = mask & shade_mask

    if output_type == "float":
        mask = mask.astype(float)
    elif output_type != "bool":
        mask = mask.astype("uint8") * 255
    return mask
def filter_blue_pen(rgb, output_type="bool"):
    """
    Build a mask that removes blue pen marks from a slide.

    The mask is the intersection of several filter_blue() masks, one per
    (red upper, green upper, blue lower) threshold triple listed below.

    Args:
        rgb: RGB image as a NumPy array.
        output_type: Type of array to return (bool, float, or uint8).

    Returns:
        NumPy array representing the mask.
    """
    # (red_upper_thresh, green_upper_thresh, blue_lower_thresh)
    blue_shades = (
        (60, 120, 190),
        (120, 170, 200),
        (175, 210, 230),
        (145, 180, 210),
        (37, 95, 160),
        (30, 65, 130),
        (130, 155, 180),
        (40, 35, 85),
        (30, 20, 65),
        (90, 90, 140),
        (60, 60, 120),
        (110, 110, 175),
    )
    shades = iter(blue_shades)
    first_red, first_green, first_blue = next(shades)
    no_pen = filter_blue(rgb, red_upper_thresh=first_red, green_upper_thresh=first_green,
                         blue_lower_thresh=first_blue)
    for red_upper, green_upper, blue_lower in shades:
        no_pen = no_pen & filter_blue(rgb, red_upper_thresh=red_upper,
                                      green_upper_thresh=green_upper,
                                      blue_lower_thresh=blue_lower)

    if output_type == "bool":
        return no_pen
    if output_type == "float":
        return no_pen.astype(float)
    return no_pen.astype("uint8") * 255
def filter_remove_small_objects(np_img, min_size=3000, avoid_overmask=True, overmask_thresh=95, output_type="uint8"):
    """
    Filter image to remove small objects (connected components) less than a
    particular minimum size. If avoid_overmask is True, this function can
    recursively call itself with progressively smaller minimum size objects to
    remove to reduce the amount of masking that this filter performs.

    Args:
        np_img: Image as a NumPy array; it is cast to bool before filtering.
        min_size: Minimum size of small object to remove.
        avoid_overmask: If True, avoid masking above the overmask_thresh percentage.
        overmask_thresh: If avoid_overmask is True, avoid masking above this
            threshold percentage value.
        output_type: Type of array to return (bool, float, or uint8).

    Returns:
        NumPy array (bool, float, or uint8).
    """
    rem_sm = np_img.astype(bool)  # make sure mask is boolean
    rem_sm = sk_morphology.remove_small_objects(rem_sm, min_size=min_size)
    # util.mask_percent is defined outside this file; presumably it returns the
    # percentage of pixels that are masked out -- confirm against util.
    mask_percentage = util.mask_percent(rem_sm)
    if (mask_percentage >= overmask_thresh) and (min_size >= 1) and (avoid_overmask is True):
        # Halving stops once min_size drops below 1.
        new_min_size = min_size / 2
        # Always recurse for a bool mask and convert exactly once below.
        # Forwarding output_type made the default "uint8" result get multiplied
        # by 255 twice, wrapping every kept pixel around to 1.
        rem_sm = filter_remove_small_objects(np_img, new_min_size, avoid_overmask, overmask_thresh, "bool")

    if output_type == "bool":
        return rem_sm
    if output_type == "float":
        return rem_sm.astype(float)
    return rem_sm.astype("uint8") * 255
def apply_image_filters(np_img, slide_num=None, info=None, save=False):
    """
    Apply the tissue-selection filters to an image and return the masked image.

    The green-channel, gray, red-pen, green-pen and blue-pen masks are
    intersected, small connected components (min_size=500) are removed from the
    combined mask, and the final mask is applied to the RGB image once.

    Args:
        np_img: Image as NumPy array.
        slide_num: The slide number. Not used by this function; kept for
            interface compatibility.
        info: Dictionary of slide information. Not used by this function; kept
            for interface compatibility.
        save: Not used by this function; kept for interface compatibility.

    Returns:
        Resulting filtered image as a NumPy array.
    """
    rgb = np_img
    mask_not_green = filter_green_channel(rgb)
    mask_not_gray = filter_grays(rgb)
    mask_no_red_pen = filter_red_pen(rgb)
    mask_no_green_pen = filter_green_pen(rgb)
    mask_no_blue_pen = filter_blue_pen(rgb)

    # Only the masks are needed to build the final result; the per-filter
    # masked RGB copies that used to be computed here were never read.
    # NOTE(review): this assumes util.mask_rgb has no side effects -- confirm.
    mask_gray_green_pens = mask_not_gray & mask_not_green & mask_no_red_pen & mask_no_green_pen & mask_no_blue_pen
    mask_remove_small = filter_remove_small_objects(mask_gray_green_pens, min_size=500, output_type="bool")
    return util.mask_rgb(rgb, mask_remove_small)
def training_slide_to_image(slide_number, slide):
    """
    Convert a WSI training slide to a saved scaled-down image in a format such
    as jpg or png.

    Args:
        slide_number: The slide number.
        slide: Slide filename, forwarded to slide_to_scaled_pil_image(), which
            resolves it relative to SRC_TRAIN_DIR.
    """
    img, large_w, large_h, new_w, new_h = slide_to_scaled_pil_image(slide_number, slide)
    img_path = util.getTrainingImagePath(slide_number, DEST_TRAIN_DIR, TRAIN_PREFIX, DEST_TRAIN_EXT, SCALE_FACTOR, large_w, large_h, new_w, new_h)
    # exist_ok avoids the check-then-create race when several worker processes
    # convert slides into the same destination directory at once.
    os.makedirs(DEST_TRAIN_DIR, exist_ok=True)
    img.save(img_path)
def slide_to_scaled_pil_image(slide_number, slide):
    """
    Convert a WSI training slide to a scaled-down PIL image.

    Besides producing the image, this function infers the slide's objective
    power from its 'openslide.mpp-x' property (closer to 0.25 -> 40, otherwise
    20) and appends "<slide number>\\t<objective power>" to
    BASE_DIR + "objectiveInfo.txt". If the property is missing or not numeric,
    the objective power falls back to 10 and the slide plus all of its
    properties are written to the module-level `skippedSamps` file object.

    Args:
        slide_number: The slide number.
        slide: Slide filename, relative to SRC_TRAIN_DIR.

    Returns:
        Tuple consisting of scaled-down PIL image, original width, original
        height, new width, and new height.
    """
    slide_filepath = os.path.join(SRC_TRAIN_DIR, slide)
    # Use a separate name for the opened slide instead of rebinding the
    # `slide` filename parameter.
    wsi = util.openSlide(slide_filepath)
    large_w, large_h = wsi.dimensions
    new_w = math.floor(large_w / SCALE_FACTOR)
    new_h = math.floor(large_h / SCALE_FACTOR)

    try:
        mpp_x = float(wsi.properties['openslide.mpp-x'])
    except (KeyError, ValueError, TypeError):
        # Property missing or unparsable: record the slide as skipped and fall
        # back to the lowest supported objective power.
        objective_power = 10
        print(wsi, file=skippedSamps, flush=True, end="\t")
        for prop_name in wsi.properties:
            print("\t".join([str(prop_name), str(wsi.properties[prop_name])]), end="\t", flush=True, file=skippedSamps)
        print(file=skippedSamps, flush=True)
    else:
        # Pick whichever nominal resolution (0.25 or 0.5 microns/pixel) is closer.
        objective_power = 40 if abs(0.25 - mpp_x) < abs(0.5 - mpp_x) else 20

    # Plain concatenation is kept on purpose: other code reads the same
    # BASE_DIR + "objectiveInfo.txt" path.
    with open(BASE_DIR + "objectiveInfo.txt", "a") as out:
        print("\t".join([str(slide_number).zfill(3), str(objective_power)]), file=out)

    level = wsi.get_best_level_for_downsample(SCALE_FACTOR)
    whole_slide_image = wsi.read_region((0, 0), level, wsi.level_dimensions[level])
    whole_slide_image = whole_slide_image.convert("RGB")
    img = whole_slide_image.resize((new_w, new_h), PIL.Image.BILINEAR)
    return (img, large_w, large_h, new_w, new_h)
def apply_filters_to_image(slide_num, save=True):
    """
    Apply a set of filters to a slide's scaled-down image and optionally save
    the filtered result.

    Args:
        slide_num: The slide number.
        save: If True, save the filtered image to the path returned by
            get_filter_image_result(), creating FILTER_DIR if needed.

    Returns:
        Tuple consisting of 1) the resulting filtered image as a NumPy array,
        and 2) a dictionary of image information (created empty here and
        passed to apply_image_filters()).
    """
    info = dict()
    if save:
        # exist_ok avoids the check-then-create race when several worker
        # processes filter slides at the same time.
        os.makedirs(FILTER_DIR, exist_ok=True)
    img_path = util.getTrainingImagePath(slide_num, DEST_TRAIN_DIR, TRAIN_PREFIX, DEST_TRAIN_EXT, SCALE_FACTOR)
    np_orig = util.open_image_np(img_path)
    filtered_np_img = apply_image_filters(np_orig, slide_num, info, save=False)
    if save:
        result_path = get_filter_image_result(slide_num)
        pil_img = util.np_to_pil(filtered_np_img)
        pil_img.save(result_path)
    return (filtered_np_img, info)
def get_tile_indices(rows, cols, row_tile_size, col_tile_size, stepSize):
    """
    Obtain a list of tile coordinates (starting row, ending row, starting
    column, ending column, row number, column number).

    Tile positions advance by stepSize * (1 - OVERLAP) tile units along each
    axis, where OVERLAP is a module-level setting, so positions can be
    fractional; all returned values are truncated to int.

    Args:
        rows: Number of rows.
        cols: Number of columns.
        row_tile_size: Number of pixels in a tile row.
        col_tile_size: Number of pixels in a tile column.
        stepSize: Step between consecutive tiles, in tile units, before the
            overlap adjustment.

    Returns:
        List of tuples representing tile coordinates consisting of starting
        row, ending row, starting column, ending column, row number, column
        number. Rows vary slowest.
    """
    num_row_tiles, num_col_tiles = util.get_num_tiles(rows, cols, row_tile_size, col_tile_size)
    stride = stepSize * (1 - OVERLAP)

    def spans(num_tiles, tile_size, extent):
        # (start pixel, end pixel, 1-based tile number) for every position on
        # one axis; a position on the last tile is clamped to the image extent.
        result = []
        for pos in np.arange(0, num_tiles - 1 + (1 - OVERLAP), stride):
            start = pos * tile_size
            end = ((pos + 1) * tile_size) if (pos < num_tiles - 1) else extent
            result.append((int(start), int(end), int(pos) + 1))
        return result

    row_spans = spans(num_row_tiles, row_tile_size, rows)
    col_spans = spans(num_col_tiles, col_tile_size, cols)
    return [
        (start_r, end_r, start_c, end_c, row_num, col_num)
        for start_r, end_r, row_num in row_spans
        for start_c, end_c, col_num in col_spans
    ]
class TissueQuantity(Enum):
    """
    Coarse category for how much tissue a tile contains.

    Members are assigned by tissue_quantity() from a tile's tissue percentage
    and converted to a scoring weight by tissue_quantity_factor().
    """
    NONE = 0  # no other category matched (e.g. 0% tissue)
    LOW = 1  # above 0% but below TISSUE_LOW_THRESH
    MEDIUM = 2  # at least TISSUE_LOW_THRESH but below TISSUE_HIGH_THRESH
    HIGH = 3  # at least TISSUE_HIGH_THRESH
def tissue_quantity(tissue_percentage):
    """
    Classify a tile's tissue percentage into a TissueQuantity category using
    the module-level TISSUE_LOW_THRESH and TISSUE_HIGH_THRESH settings.

    Args:
        tissue_percentage: The tile tissue percentage.

    Returns:
        TissueQuantity enum member (HIGH, MEDIUM, LOW, or NONE).
    """
    if tissue_percentage >= TISSUE_HIGH_THRESH:
        return TissueQuantity.HIGH
    if TISSUE_LOW_THRESH <= tissue_percentage < TISSUE_HIGH_THRESH:
        return TissueQuantity.MEDIUM
    if 0 < tissue_percentage < TISSUE_LOW_THRESH:
        return TissueQuantity.LOW
    return TissueQuantity.NONE
def tissue_quantity_factor(amount):
    """
    Translate a tissue quantity category into its scoring weight.

    Args:
        amount: Tissue amount as a TissueQuantity enum value.

    Returns:
        1.0 for HIGH, 0.2 for MEDIUM, 0.1 for LOW and 0.0 for anything else.
    """
    if amount == TissueQuantity.HIGH:
        return 1.0
    if amount == TissueQuantity.MEDIUM:
        return 0.2
    if amount == TissueQuantity.LOW:
        return 0.1
    return 0.0
def filter_hsv_to_h(hsv, output_type="int", display_np_info=True):
    """
    Extract the hue channel of an HSV image as a flat array.

    With output_type "int" the hue values are multiplied by 360, turning
    fractional hues into degrees, and then truncated to integers. For more
    information, see https://en.wikipedia.org/wiki/HSL_and_HSV

    Args:
        hsv: HSV image as a NumPy array.
        output_type: Type of array to return (float or int).
        display_np_info: Accepted for interface compatibility; not used here.

    Returns:
        Hue values (float or int) as a 1-dimensional NumPy array.
    """
    # flatten() returns a copy, so the in-place scaling below never touches
    # the caller's array.
    hues = hsv[:, :, 0].flatten()
    if output_type != "int":
        return hues
    hues *= 360
    return hues.astype("int")
def rgb_to_hues(rgb):
    """
    Convert an RGB image to a flat array of hue values in degrees.

    Args:
        rgb: RGB image as a NumPy array

    Returns:
        1-dimensional array of integer hue values in degrees
    """
    return filter_hsv_to_h(sk_color.rgb2hsv(rgb), display_np_info=False)
def hsv_purple_pink_factor(rgb):
    """
    Compute a scoring factor from how the tile's hues in the 260-340 degree
    band deviate from purple (HSV_PURPLE) versus pink (HSV_PINK), weighted by
    how far the average hue in that band lies below 340.

    Args:
        rgb: Image as a NumPy array.

    Returns:
        Factor that favors purple (hematoxylin stained) tissue over pink (eosin
        stained) tissue; 0 when there are no hues in the band or the deviation
        from purple is zero.
    """
    all_hues = rgb_to_hues(rgb)
    # Keep only hues in the purple-to-pink band.
    hues = all_hues[all_hues >= 260]
    hues = hues[hues <= 340]
    if len(hues) == 0:
        return 0

    purple_dev = np.sqrt(np.mean(np.abs(hues - HSV_PURPLE) ** 2))
    if purple_dev == 0:
        # Guard against dividing by zero below.
        return 0
    pink_dev = np.sqrt(np.mean(np.abs(hues - HSV_PINK) ** 2))
    avg_factor = (340 - np.average(hues)) ** 2
    return pink_dev / purple_dev * avg_factor
def hsv_saturation_and_value_factor(rgb):
    """
    Penalize tiles whose HSV saturation and/or value have a narrow spread,
    since both standard deviations should be relatively broad if the tile
    contains significant tissue.

    Example of a blurred tile that should not be ranked as a top tile:
        ../data/tiles_png/006/TUPAC-TR-006-tile-r58-c3-x2048-y58369-w1024-h1024.png

    Args:
        rgb: RGB image as a NumPy array

    Returns:
        Saturation and value factor, where 1 is no effect and less than 1 means
        the standard deviation of saturation and/or value is below 0.05.
    """
    hsv = sk_color.rgb2hsv(rgb)
    saturation_is_narrow = np.std(hsv[:, :, 1].flatten()) < 0.05
    value_is_narrow = np.std(hsv[:, :, 2].flatten()) < 0.05

    if saturation_is_narrow and value_is_narrow:
        base = 0.4
    elif saturation_is_narrow or value_is_narrow:
        base = 0.7
    else:
        base = 1
    # The penalty is squared to make it stronger.
    return base ** 2
def score_tile(np_tile, tissue_percent, slide_num, row, col):
    """
    Score a tile from its tissue percentage combined with the color factor,
    the saturation/value factor and the tissue quantity factor.

    Args:
        np_tile: Tile as NumPy array.
        tissue_percent: The percentage of the tile judged to be tissue.
        slide_num: Slide number (not used in the computation).
        row: Tile row (not used in the computation).
        col: Tile column (not used in the computation).

    Returns:
        Tuple consisting of score, color factor, saturation/value factor, and
        tissue quantity factor.
    """
    color_factor = hsv_purple_pink_factor(np_tile)
    s_and_v_factor = hsv_saturation_and_value_factor(np_tile)
    quantity_factor = tissue_quantity_factor(tissue_quantity(tissue_percent))

    combined_factor = color_factor * s_and_v_factor * quantity_factor
    raw_score = (tissue_percent ** 2) * np.log(1 + combined_factor) / 1000.0
    # Squash the raw score into the range between 0 and 1.
    score = 1.0 - (10.0 / (10.0 + raw_score))
    return (score, color_factor, s_and_v_factor, quantity_factor)
def score_tiles(slide_num, np_img=None, dimensions=None, small_tile_in_tile=False):
    """
    Score all tiles for a slide and return the results in a TileSummary object.

    Args:
      slide_num: The slide number.
      np_img: Optional image as a NumPy array. When omitted, the filtered image for the slide is loaded
        from the path returned by get_filter_image_result.
      dimensions: Optional tuple consisting of (original width, original height, new width, new height). Used for
        dynamic tile retrieval. When omitted, the dimensions are parsed from the filtered image filename.
      small_tile_in_tile: If True, include the small NumPy image in the Tile objects.

    Returns:
      TileSummary object which includes a list of Tile objects containing information about each tile.

    Raises:
      ValueError: If the slide has a 40x objective and RESOLUTION is not one of '5x', '20x' or '2.5x'.
      SystemExit: If RESOLUTION is '20x' for a slide whose objective power is neither 40 nor 20.
    """
    # Objective power for each slide is read from a tab-separated file indexed by slide number.
    objective_powerInfo = pd.read_csv(BASE_DIR+"objectiveInfo.txt", header=None, names=['objective_power'], index_col=0, sep="\t")
    objective_power = int(objective_powerInfo.loc[slide_num, 'objective_power'])

    # stepSize is chosen from the slide's objective power and the requested RESOLUTION.
    if objective_power == 40:
        if RESOLUTION == '5x':
            stepSize = 8
        elif RESOLUTION == '20x':
            stepSize = 2
        elif RESOLUTION == '2.5x':
            stepSize = 16
        else:
            # Previously stepSize was left unassigned here and the function failed later with an
            # UnboundLocalError; fail immediately with an explicit message instead.
            raise ValueError(str(RESOLUTION) + " is not supported at 40x magnification")
    elif objective_power == 20:
        if RESOLUTION == '5x':
            stepSize = 4
        elif RESOLUTION == '2.5x':
            stepSize = 8
        else:
            stepSize = 1
    else:
        if RESOLUTION == '5x':
            stepSize = 2
        elif RESOLUTION == '2.5x':
            stepSize = 4
        elif RESOLUTION == '20x':
            print(RESOLUTION + " is not supported at 10x magnification")
            sys.exit()
        else:
            stepSize = 1

    img_path = None
    if dimensions is None:
        img_path = get_filter_image_result(slide_num)
        o_w, o_h, w, h = util.parseDimensionsFromImageFilename(img_path)
    else:
        o_w, o_h, w, h = dimensions

    if np_img is None:
        # The image path is also needed when the caller supplied dimensions but no image;
        # previously img_path was undefined in that case and this raised a NameError.
        if img_path is None:
            img_path = get_filter_image_result(slide_num)
        np_img = slide.open_image_np(img_path)

    row_tile_size = round(ROW_TILE_SIZE / SCALE_FACTOR)
    col_tile_size = round(COL_TILE_SIZE / SCALE_FACTOR)

    # NOTE(review): the result is not used below; the call is kept in case the lookup validates
    # that the slide exists -- confirm and remove if it has no side effects.
    slidePath = util.get_training_slide_path(SRC_TRAIN_DIR, slide_num)

    num_row_tiles, num_col_tiles = util.get_num_tiles(h, w, row_tile_size, col_tile_size)

    tile_sum = TileSummary(slide_num=slide_num,
                           orig_w=o_w,
                           orig_h=o_h,
                           orig_tile_w=COL_TILE_SIZE,
                           orig_tile_h=ROW_TILE_SIZE,
                           scaled_w=w,
                           scaled_h=h,
                           scaled_tile_w=col_tile_size,
                           scaled_tile_h=row_tile_size,
                           tissue_percentage=util.tissue_percent(np_img),
                           num_col_tiles=num_col_tiles,
                           num_row_tiles=num_row_tiles)

    count = 0
    high = 0
    medium = 0
    low = 0
    none = 0
    tile_indices = get_tile_indices(h, w, row_tile_size, col_tile_size, stepSize)
    for t in tile_indices:
        count += 1  # tile_num
        r_s, r_e, c_s, c_e, r, c = t
        # The scored region spans stepSize times the (r_e - r_s) x (c_e - c_s) extent, starting at (r_s, c_s).
        np_tile = np_img[r_s:r_e+((r_e-r_s)*(stepSize-1)), c_s:c_e+((c_e-c_s)*(stepSize-1))]
        t_p = util.tissue_percent(np_tile)
        amount = tissue_quantity(t_p)
        if amount == TissueQuantity.HIGH:
            high += 1
        elif amount == TissueQuantity.MEDIUM:
            medium += 1
        elif amount == TissueQuantity.LOW:
            low += 1
        elif amount == TissueQuantity.NONE:
            none += 1

        # Original-image coordinates are mapped from the unextended (c_s, r_s) / (c_e, r_e) corners.
        o_c_s, o_r_s = small_to_large_mapping((c_s, r_s), (o_w, o_h))
        o_c_e, o_r_e = small_to_large_mapping((c_e, r_e), (o_w, o_h))

        # pixel adjustment in case tile dimension too large (for example, 1025 instead of 1024)
        if (o_c_e - o_c_s) > COL_TILE_SIZE:
            o_c_e -= 1
        if (o_r_e - o_r_s) > ROW_TILE_SIZE:
            o_r_e -= 1

        score, color_factor, s_and_v_factor, quantity_factor = score_tile(np_tile, t_p, slide_num, r, c)

        np_scaled_tile = np_tile if small_tile_in_tile else None
        tile = Tile(tile_sum, slide_num, np_scaled_tile, count, r, c, r_s, r_e, c_s, c_e, o_r_s, o_r_e, o_c_s,
                    o_c_e, t_p, color_factor, s_and_v_factor, quantity_factor, score)
        tile_sum.tiles.append(tile)

    tile_sum.count = count
    tile_sum.high = high
    tile_sum.medium = medium
    tile_sum.low = low
    tile_sum.none = none

    # Assign 1-based ranks in the order returned by tiles_by_score().
    for rank, ranked_tile in enumerate(tile_sum.tiles_by_score(), start=1):
        ranked_tile.rank = rank

    return tile_sum
def training_slide_range_to_images(start_ind, end_ind, train_images):
    """
    Convert a range of WSI training slides to smaller images (in a format such as jpg or png).

    Slides that fail to convert are skipped, and their zero-padded slide number is written to skippedSamps.

    Args:
      start_ind: Starting index (inclusive).
      end_ind: Ending index (inclusive).
      train_images: Sequence of per-slide entries passed to training_slide_to_image; the entry at
        position i is used for slide start_ind + i.

    Returns:
      The starting index and the ending index of the slides that were converted.
    """
    # Pair each slide with its entry by position. The previous counter only advanced after a
    # successful conversion, so one failure shifted every later slide onto the wrong entry.
    for offset, slide_num in enumerate(range(start_ind, end_ind + 1)):
        try:
            training_slide_to_image(slide_num, train_images[offset])
        except Exception:
            # Best-effort: record the skipped slide and continue. Catching Exception rather than using
            # a bare except lets KeyboardInterrupt and SystemExit propagate.
            print(str(slide_num).zfill(3), file=skippedSamps, flush=True)
    return (start_ind, end_ind)
def apply_filters_to_image_range(start_ind, end_ind, save):
    """
    Apply filters to a range of images.

    Slides for which filtering fails are skipped, and their zero-padded slide number is written to skippedSamps.

    Args:
      start_ind: Starting index (inclusive).
      end_ind: Ending index (inclusive).
      save: If True, save filtered images.

    Returns:
      Tuple consisting of 1) starting index of slides converted to images, 2) ending index of slides converted to
      images, and 3) a dictionary of image filter information.
    """
    html_page_info = dict()
    for slide_num in range(start_ind, end_ind + 1):
        try:
            _, info = apply_filters_to_image(slide_num, save=save)
            html_page_info.update(info)
        except Exception:
            # Best-effort: record the skipped slide and continue. Catching Exception rather than using
            # a bare except lets KeyboardInterrupt and SystemExit propagate.
            print(str(slide_num).zfill(3), file=skippedSamps, flush=True)
    return (start_ind, end_ind, html_page_info)
def save_display_tile(tile, imageFile, save=True):
    """
    Build the PIL image for a tile and optionally save it to disk.

    Args:
      tile: Tile object.
      imageFile: Passed through to tile_to_pil_tile to identify the slide image to read the tile from.
      save: If True, save tile image to the path returned by get_tile_image_path, creating the
        parent directory if needed.
    """
    tile_pil_img = tile_to_pil_tile(tile, imageFile)

    if save:
        img_path = get_tile_image_path(tile)
        out_dir = os.path.dirname(img_path)
        # exist_ok avoids the check-then-create race when several processes write tiles into the
        # same directory; an empty dirname means the current directory, which needs no creation.
        if out_dir:
            os.makedirs(out_dir, exist_ok=True)
        tile_pil_img.save(img_path)
def tile_to_pil_tile(tile, imageSlide):
"""
Convert tile information into the corresponding tile as a PIL image read from the whole-slide image file.
Args:
tile: Tile object.
Return:
Tile as a PIL image.
"""
t = tile
slide_filepath = util.get_training_slide_path(SRC_TRAIN_DIR, imageSlide)
s = util.openSlide(slide_filepath)
x, y = t.o_c_s, t.o_r_s
w, h = t.o_c_e - t.o_c_s, t.o_r_e - t.o_r_s
objective_powerInfo = pd.read_csv(BASE_DIR+"objectiveInfo.txt", header=None, names=['objective_power'], index_col=0, sep="\t")
objective_power = int(objective_powerInfo.loc[t.slide_num, 'objective_power'])
if objective_power == 40:
if RESOLUTION == '5x':