-
Notifications
You must be signed in to change notification settings - Fork 2
Expand file tree
/
Copy pathHaralick_Feature_Extraction.py
More file actions
225 lines (166 loc) · 7.91 KB
/
Haralick_Feature_Extraction.py
File metadata and controls
225 lines (166 loc) · 7.91 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
import pandas as pd
import numpy as np
import re
import os
import zipfile
import tempfile
import argparse
import warnings
warnings.filterwarnings('ignore')
from io import BytesIO
from roifile import ImagejRoi
from skimage.draw import polygon
from skimage.measure import regionprops
from skimage import io
from histomicstk.features import compute_haralick_features
# Expected directory layout (the actual paths are supplied via the command-line arguments below):
# ```
# /Users/ninagrishencko/Desktop/A2780vsA2780CisR/
# └── A2780/
# ├── images/
# │ ├── r02c02f03_MAX_ch2.tiff
# │ ├── r02c02f03_MAX_ch3.tiff
# │ └── ...
# ├── ROIS/
# │ ├── r02c02f03_MAX_ch2.zip
# │ ├── r02c02f02_MAX_ch2.zip
# │ └── ...
# └── single_cell_morphology.csv
# ```
# Command-line interface: every location is mandatory and passed explicitly.
parser = argparse.ArgumentParser(description="Compute Haralick features from ROIs.")
parser.add_argument(
    '--images_dir',
    help='Path to images directory',
    required=True,
)
parser.add_argument(
    '--roi_dir',
    help='Path to ROI directory',
    required=True,
)
parser.add_argument(
    '--morph_df_path',
    help='Path to morphology CSV file',
    required=True,
)
parser.add_argument(
    '--save_path',
    help='Output path for the final CSV',
    required=True,
)
args = parser.parse_args()

# Expose the parsed values under the module-level names used by the rest of the script.
images_dir, roi_dir = args.images_dir, args.roi_dir
morph_df_path, save_path = args.morph_df_path, args.save_path

# Load the per-object morphology table.
morph_df = pd.read_csv(morph_df_path, index_col=False)

# Strip a trailing ".tif" / ".tiff" (case-insensitive) so that 'image_name'
# holds bare stems that can be combined with other extensions later on.
image_names = morph_df['image_name']
morph_df['image_name'] = image_names.str.replace(
    r"\.tiff?$", "", flags=re.IGNORECASE, regex=True
)
def extract_rois_from_zip(roi_dir, image_name):
    """
    Load every ROI stored in the ZIP archive that belongs to one image.

    The archive is looked up at ``<roi_dir>/<image_name>.zip``; archive members
    whose name does not end in ``.roi`` are ignored.

    Parameters:
        roi_dir (str): Directory that holds the per-image ROI archives.
        image_name (str): Base name of the archive, without the ``.zip`` extension.
    Returns:
        dict: Keys are ROI member names (with .roi), values are BytesIO objects
        holding the raw bytes of the corresponding member.
    """
    archive_path = os.path.join(roi_dir, f'{image_name}.zip')
    with zipfile.ZipFile(archive_path, 'r') as archive:
        # namelist() yields members in archive order, which the dict preserves.
        return {
            member: BytesIO(archive.read(member))
            for member in archive.namelist()
            if member.endswith('.roi')
        }
def convert_roi_bytes_to_mask(roi_bytes, im_shape):
    """
    Convert an ImageJ ROI stored as in-memory bytes into a binary mask.

    The ROI is parsed directly from memory. Earlier revisions wrote the bytes to
    a named temporary file and re-opened it by name while it was still open,
    which is not portable (it fails on Windows) and costs a disk round-trip.

    Parameters:
        roi_bytes (BytesIO or bytes-like): Raw contents of a ``.roi`` file, either
            wrapped in a BytesIO (as produced by extract_rois_from_zip) or as a
            plain bytes-like object.
        im_shape (tuple): Shape of the target image (height, width) to create the mask.
    Returns:
        np.ndarray: A boolean NumPy array of shape ``im_shape`` where pixels inside
        the ROI polygon are True, others False.
    """
    # BytesIO exposes getvalue(); anything else is treated as a bytes-like buffer.
    if hasattr(roi_bytes, 'getvalue'):
        raw = roi_bytes.getvalue()
    else:
        raw = bytes(roi_bytes)

    # Parse the ROI straight from memory - no temporary file needed.
    roi = ImagejRoi.frombytes(raw)

    # Polygon vertices: column 0 is used as x, column 1 as y.
    coords = roi.coordinates()
    x = coords[:, 0]
    y = coords[:, 1]

    # Rasterise the polygon; `shape` keeps the filled pixels inside the image bounds.
    mask = np.zeros(im_shape, dtype=bool)
    r, c = polygon(y, x, shape=im_shape)
    mask[r, c] = True
    return mask
def compute_haralick(full_im, cell_mask,
                     # Offsets are list of pixel displacements to compute co-occurrence matrices at multiple scales and directions.
                     offsets=np.array([[0, 1], [1, 0], [1, 1], [-1, 1],     # Small scale, 1 pixel apart
                                       [0, 5], [5, 0], [5, 5], [-5, 5]]),   # Medium scale
                     num_levels=64,  # Number of gray levels to quantize the image intensities to
                     clip_percentiles=(1, 99)):  # low and high percentiles used to clip intensity values to avoid outliers affecting quantization.
    """
    Compute Haralick texture features for a single object after pre-processing.

    Pre-processing: the clipping range is taken from the intensity percentiles of
    the pixels inside ``cell_mask``; the whole image is then clipped to that range
    and linearly quantized to integer levels in ``[0, num_levels - 1]`` (stored as
    uint8, so ``num_levels`` is expected not to exceed 256).

    Parameters:
        full_im (np.ndarray): Intensity image.
        cell_mask (np.ndarray): Mask with the same shape as ``full_im``; truthy
            pixels belong to the object.
        offsets (np.ndarray): Pixel displacements passed through to
            ``compute_haralick_features``.
        num_levels (int): Number of gray levels used for quantization.
        clip_percentiles (tuple): (low, high) percentiles of the in-mask
            intensities that define the clipping range.
    Returns:
        dict: The first row of the table returned by ``compute_haralick_features``,
        as a ``{feature_name: value}`` dictionary.
    Raises:
        ValueError: If ``cell_mask`` selects no pixels (percentiles are undefined).
    """
    # Force a boolean mask so that indexing below is always mask-based,
    # never accidental integer fancy-indexing.
    cell_mask = np.asarray(cell_mask, dtype=bool)
    if not cell_mask.any():
        raise ValueError("cell_mask is empty: cannot compute Haralick features for an ROI with no pixels.")

    # Label image with a single object (label 1) for compute_haralick_features.
    int_mask = cell_mask.astype(np.uint8)

    int_vals = full_im[cell_mask]  # Intensity values inside the ROI
    low, high = np.percentile(int_vals, clip_percentiles)  # Clipping range derived from the ROI
    im_clipped = np.clip(full_im, low, high)  # The whole image is limited to the ROI's clipping range

    if high > low:
        # Linear rescale of [low, high] onto [0, num_levels - 1].
        scaled = (im_clipped - low) / (high - low) * (num_levels - 1)
    else:
        # Flat ROI (high == low): the rescale would be 0/0 -> NaN and an undefined
        # uint8 cast, so map everything to gray level 0 explicitly.
        scaled = np.zeros(np.shape(im_clipped), dtype=float)
    im_quant = scaled.astype(np.uint8)

    # Computing Haralick texture features
    df = compute_haralick_features(
        int_mask,
        im_quant,
        offsets=offsets,
        num_levels=num_levels,
        gray_limits=[0, num_levels - 1],  # Gray limits of the quantized image
    )
    return df.iloc[0].to_dict()
def match_rois_with_df_objects(df_subset, haralick_feats):
    """
    Attach ROI-based Haralick features to the segmented objects of one image.

    Rows are paired through the shared 'roi_name' column using a left join, so
    every row of df_subset is kept; objects without a matching ROI get missing
    values in the feature columns.

    Parameters:
        df_subset (pd.DataFrame): Rows of the morphology table for the current image.
            Must contain a 'roi_name' column.
        haralick_feats (list of dict): One feature dictionary per ROI, each holding
            a 'roi_name' key.
    Returns:
        pd.DataFrame: A new DataFrame with the Haralick feature columns appended.
    Raises:
        ValueError: If 'roi_name' is missing on either side.
    """
    haralick_df = pd.DataFrame(haralick_feats)

    # The join key has to exist in both tables before merging.
    key = 'roi_name'
    key_on_both_sides = key in df_subset.columns and key in haralick_df.columns
    if not key_on_both_sides:
        raise ValueError("'roi_name' column is required in both df_subset and haralick_feats.")

    return pd.merge(df_subset, haralick_df, how='left', on=key)
def move_cols(df):
    """
    Reorder identifier columns in the DataFrame to improve readability and consistency.

    The 'roi_name', 'Condition' and 'Replicate' columns (in that order) are placed
    directly after the 'label' column. These columns may have been appended to the
    end or scattered in the DataFrame due to earlier processing steps
    (e.g., merging or appending new features). Identifier columns that are not
    present in ``df`` are simply skipped, so a table without 'Condition' or
    'Replicate' is still reordered instead of raising.

    Parameters:
        df (pd.DataFrame): Table to reorder. Must contain a 'label' column.
    Returns:
        pd.DataFrame: A new DataFrame with the same columns in the new order.
    Raises:
        ValueError: If ``df`` has no 'label' column to anchor the reordering.
    """
    cols = list(df.columns)
    if "label" not in cols:
        raise ValueError("move_cols requires a 'label' column to anchor the reordering.")

    # Only move the identifier columns that actually exist in this table.
    cols_to_move = [col for col in ("roi_name", "Condition", "Replicate") if col in cols]
    remaining = [col for col in cols if col not in cols_to_move]

    # Splice the identifier columns in right after 'label'.
    split_at = remaining.index("label") + 1
    new_order = remaining[:split_at] + cols_to_move + remaining[split_at:]

    # Reorder the DataFrame
    return df[new_order]
# Main processing loop: one pass per image listed in the morphology table.
haralick_dfs = []
for image_name in morph_df['image_name'].unique():
    # Locate the image. 'image_name' had any ".tif"/".tiff" suffix stripped when the
    # table was loaded, so the file on disk may carry either spelling; ".tif" stays
    # the first choice and the fallback, so a missing file still fails on that path.
    image_path = os.path.join(images_dir, f'{image_name}.tif')
    if not os.path.exists(image_path):
        for ext in ('.tiff', '.TIF', '.TIFF'):
            candidate = os.path.join(images_dir, f'{image_name}{ext}')
            if os.path.exists(candidate):
                image_path = candidate
                break

    # Reading the image
    im_intensity = io.imread(image_path, plugin='tifffile', key=0)

    # Extracting ROIs for this image
    rois = extract_rois_from_zip(roi_dir, image_name)

    # Subset morph_df for this image
    df_subset = morph_df[morph_df['image_name'] == image_name].copy()

    # Ensure roi_name column is in df_subset; when absent it is built as
    # "<image stem>_label<label>.roi" so it can be matched against the ZIP member names.
    if 'roi_name' not in df_subset.columns:
        df_subset['roi_name'] = df_subset.apply(
            lambda row: f"{os.path.splitext(row['image_name'])[0]}_label{row['label']}.roi", axis=1
        )

    # Compute the texture features of every ROI found in the archive.
    haralick_feats = []
    for fname, roi_bytes in rois.items():
        roi_mask = convert_roi_bytes_to_mask(roi_bytes, im_intensity.shape)
        feats = compute_haralick(im_intensity, roi_mask)
        feats['roi_name'] = fname
        haralick_feats.append(feats)

    matched_df = match_rois_with_df_objects(df_subset, haralick_feats)
    haralick_dfs.append(matched_df)

# Combine all matched dataframes
concat_df = pd.concat(haralick_dfs, ignore_index=True)
haralick_df = move_cols(concat_df)

# Save final dataframe to the specified path
haralick_df.to_csv(save_path, index=False)
print("The process is completed!")