Skip to content

Commit 4b33646

Browse files
committed
further refactor for mgf
1 parent 23ee39f commit 4b33646

2 files changed

Lines changed: 144 additions & 6 deletions

File tree

massql/msql_fileloading.py

Lines changed: 142 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -141,10 +141,20 @@ def load_data(input_filename, cache=None, cache_dir=None, cache_file=None):
141141

142142
return ms1_df, ms2_df
143143

144+
## Library MGF Loader
144145
def _load_data_mgf(input_filename):
    """Load an MGF file, preferring the pyteomics reader over the manual one.

    The pyteomics-based loader is tried first. The line-based manual loader
    is used as a fallback when the pyteomics loader either raises or returns
    an empty MS2 table.

    Args:
        input_filename: Path to the MGF file.

    Returns:
        A ``(ms1_df, ms2_df)`` tuple of DataFrames, as produced by whichever
        loader succeeded.
    """
    try:
        ms1_df, ms2_df = _load_data_mgf_pyteomics(input_filename)
    except Exception:
        # Fallback boundary: previously only an *empty* result triggered the
        # manual loader, so a parse error inside the pyteomics path skipped
        # the fallback entirely. Genuine I/O problems (e.g. a missing file)
        # still surface, because the manual loader raises them itself.
        return _load_data_mgf_manual(input_filename)

    # try manual loader if pyteomics produced no MS2 peaks
    if len(ms2_df) == 0:
        ms1_df, ms2_df = _load_data_mgf_manual(input_filename)

    return ms1_df, ms2_df
153+
154+
155+
def _load_data_mgf_pyteomics(input_filename):
145156
ms2_data_list = []
146157

147-
# Use 'with' context manager for safe file handling
148158
with mgf.read(input_filename) as reader:
149159
for index, spectrum in enumerate(reader):
150160

@@ -216,10 +226,138 @@ def _load_data_mgf(input_filename):
216226
# Convert to DataFrames
217227
ms2_df = pd.DataFrame(ms2_data_list)
218228

219-
# Original code assigned the last single peak to ms1_df.
220-
# Initializing empty to prevent bugs.
221-
ms1_df = pd.DataFrame()
229+
# This is kind of a hack for compatibility
230+
try:
231+
ms1_df = pd.DataFrame([peak_dict])
232+
except Exception:
233+
peak_dict = {
234+
"i": 0,
235+
"i_norm": 0,
236+
"i_tic_norm": 0,
237+
"mz": 0,
238+
"scan": 1,
239+
"rt": 0,
240+
"polarity": 1 # Default
241+
}
242+
ms1_df = pd.DataFrame([peak_dict])
243+
244+
return ms1_df, ms2_df
245+
246+
def _mgf_manual_parse_charge(text):
    """Parse an MGF CHARGE value such as "2", "2+", "2-" or "-2" to an int.

    Raises ValueError when the value is not a single signed integer.
    """
    # A trailing sign ("2-") is normalised so it yields the same result as
    # the leading-sign form ("-2") that int() already understands.
    sign = -1 if text.endswith("-") else 1
    return sign * int(text.rstrip("+-"))


def _mgf_manual_apply_header(params, line):
    """Update *params* in place from one "KEY=value" header line."""
    # Split only on the first '=' to handle values containing '='
    key, value = line.split("=", 1)
    key = key.upper().strip()
    value = value.strip()

    try:
        if key == "PEPMASS":
            # PEPMASS often looks like "400.0 5000.0" (mz intensity)
            params["precmz"] = float(value.split()[0])
        elif key == "SCANS":
            # Kept as the raw string from the file (not converted to int).
            params["scan"] = value
        elif key == "RTINSECONDS":
            params["rt"] = float(value) / 60.0
        elif key == "CHARGE":
            params["charge"] = _mgf_manual_parse_charge(value)
    except (ValueError, IndexError):
        # Deliberate best effort: an unparsable header keeps its default.
        pass


def _mgf_manual_parse_peak(line):
    """Return ``(mz, intensity)`` for a peak line, or None if malformed."""
    parts = line.split()
    if len(parts) < 2:
        return None
    try:
        return float(parts[0]), float(parts[1])
    except ValueError:
        return None


def _mgf_manual_spectrum_rows(params, peaks):
    """Build the MS2 row dicts for one spectrum (empty list if unusable)."""
    if not peaks:
        return []

    intensities = [intensity for _, intensity in peaks]
    i_max = max(intensities)
    i_sum = sum(intensities)

    # A zero maximum or a zero total would make the normalisations below
    # divide by zero, so such spectra are skipped.
    if i_max == 0 or i_sum == 0:
        return []

    return [
        {
            "i": intensity,
            "i_norm": intensity / i_max,
            "i_tic_norm": intensity / i_sum,
            "mz": mz,
            "scan": params["scan"],
            "rt": params["rt"],
            "precmz": params["precmz"],
            "ms1scan": 0,
            "charge": params["charge"],
            "polarity": 1,
        }
        for mz, intensity in peaks
        if intensity != 0
    ]


def _load_data_mgf_manual(input_filename):
    """Parse an MGF file line by line without pyteomics.

    Spectra are delimited by "BEGIN IONS" / "END IONS" lines. Inside a
    spectrum, lines containing '=' are treated as headers (PEPMASS, SCANS,
    RTINSECONDS and CHARGE are read; everything else is ignored) and all
    other lines are treated as "mz intensity" peaks. Malformed headers keep
    their defaults and malformed peak lines are skipped.

    Args:
        input_filename: Path to the MGF file.

    Returns:
        A ``(ms1_df, ms2_df)`` tuple. ``ms2_df`` has one row per non-zero
        peak; ``ms1_df`` is a single all-zero placeholder row kept for
        compatibility with callers that expect an MS1 table.
    """
    ms2_data_list = []

    # State for the spectrum currently being read
    current_params = {}
    current_peaks = []
    in_spectrum = False
    spectrum_index = 0

    # errors="replace" keeps a stray non-UTF-8 byte (typically in a TITLE
    # line, which is ignored anyway) from aborting the whole load.
    with open(input_filename, "r", encoding="utf-8", errors="replace") as f:
        for raw_line in f:
            line = raw_line.strip()

            if not line:
                continue

            if line == "BEGIN IONS":
                in_spectrum = True
                current_params = {
                    "scan": spectrum_index + 1,  # Default scan to index
                    "rt": 0.0,
                    "precmz": 0.0,
                    "charge": 1,
                }
                current_peaks = []
                continue

            if line == "END IONS":
                # A stray END IONS (no open spectrum) is ignored; otherwise
                # the previous spectrum's peaks would be emitted again.
                if in_spectrum:
                    in_spectrum = False
                    spectrum_index += 1
                    ms2_data_list.extend(
                        _mgf_manual_spectrum_rows(current_params, current_peaks)
                    )
                continue

            if not in_spectrum:
                continue

            # Check if line is metadata (contains '=') or peak data
            if "=" in line:
                _mgf_manual_apply_header(current_params, line)
            else:
                peak = _mgf_manual_parse_peak(line)
                if peak is not None:
                    current_peaks.append(peak)

    ms2_df = pd.DataFrame(ms2_data_list)

    # This is kind of a hack for portability: a single dummy MS1 row
    ms1_placeholder = {
        "i": 0,
        "i_norm": 0,
        "i_tic_norm": 0,
        "mz": 0,
        "scan": 1,
        "rt": 0,
        "polarity": 1,  # Default
    }
    ms1_df = pd.DataFrame([ms1_placeholder])

    return ms1_df, ms2_df
224362

225363
def _load_data_gnps_json(input_filename):

tests/test_query.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -781,7 +781,7 @@ def main():
781781
#test_translator()
782782
#test_ms1_iron_X_changes_intensity()
783783
#test_nocache()
784-
#test_topdown()
784+
test_topdown()
785785
#test_defect()
786786
#test_or_against_iron()
787787
#test_quad_brominated()
@@ -801,7 +801,7 @@ def main():
801801
#test_mgf_intensity()
802802
#test_otherscan_query()
803803
#test_otherscan_iron_query()
804-
test_otherscan()
804+
#test_otherscan()
805805

806806

807807
if __name__ == "__main__":

0 commit comments

Comments
 (0)