@@ -141,10 +141,20 @@ def load_data(input_filename, cache=None, cache_dir=None, cache_file=None):
141141
142142 return ms1_df , ms2_df
143143
144+ ## Library MGF Loader
def _load_data_mgf(input_filename):
    """Load an MGF file, preferring the pyteomics reader.

    The file is first read with ``_load_data_mgf_pyteomics``. When that
    reader yields an MS2 table with no rows, the file is read again with
    ``_load_data_mgf_manual`` and that result is returned instead.

    Args:
        input_filename: Path of the MGF file to read.

    Returns:
        A ``(ms1_df, ms2_df)`` tuple as produced by whichever loader was used.
    """
    ms1_frame, ms2_frame = _load_data_mgf_pyteomics(input_filename)

    # Keep the pyteomics result as soon as it contains at least one MS2 row.
    if len(ms2_frame) > 0:
        return ms1_frame, ms2_frame

    # No MS2 rows came back: fall back to the hand-written line parser.
    return _load_data_mgf_manual(input_filename)
153+
154+
155+ def _load_data_mgf_pyteomics (input_filename ):
145156 ms2_data_list = []
146157
147- # Use 'with' context manager for safe file handling
148158 with mgf .read (input_filename ) as reader :
149159 for index , spectrum in enumerate (reader ):
150160
@@ -216,10 +226,138 @@ def _load_data_mgf(input_filename):
216226 # Convert to DataFrames
217227 ms2_df = pd .DataFrame (ms2_data_list )
218228
219- # Original code assigned the last single peak to ms1_df.
220- # Initializing empty to prevent bugs.
221- ms1_df = pd .DataFrame ()
229+ # This is kind of a hack for compatibility
230+ try :
231+ ms1_df = pd .DataFrame ([peak_dict ])
232+ except Exception :
233+ peak_dict = {
234+ "i" : 0 ,
235+ "i_norm" : 0 ,
236+ "i_tic_norm" : 0 ,
237+ "mz" : 0 ,
238+ "scan" : 1 ,
239+ "rt" : 0 ,
240+ "polarity" : 1 # Default
241+ }
242+ ms1_df = pd .DataFrame ([peak_dict ])
243+
244+ return ms1_df , ms2_df
245+
def _parse_mgf_charge(value):
    """Return the signed integer charge encoded by an MGF ``CHARGE`` value.

    Accepts ``2``, ``2+``, ``+2``, ``2-`` and ``-2``. When several charge
    states are listed (``2+ and 3+``, ``2+,3+``) the first one is used.

    Raises:
        ValueError: If no integer can be extracted from ``value``.
    """
    tokens = value.replace(",", " ").split()
    if not tokens:
        raise ValueError("empty CHARGE value")
    first = tokens[0]
    # MGF writes the sign either before or after the digits.
    sign = -1 if first.startswith("-") or first.endswith("-") else 1
    return sign * int(first.strip("+-"))


def _mgf_spectrum_rows(params, peaks):
    """Build the MS2 peak rows (one dict per non-zero peak) for one spectrum.

    Args:
        params: Dict with the ``scan``, ``rt``, ``precmz`` and ``charge``
            values collected for the spectrum.
        peaks: List of ``(mz, intensity)`` float tuples.

    Returns:
        A list of row dicts; empty when the spectrum has no peaks or its
        largest intensity is zero.
    """
    if not peaks:
        return []

    intensities = [intensity for _, intensity in peaks]
    i_max = max(intensities)
    i_sum = sum(intensities)

    # Nothing to normalise against; drop the spectrum.
    if i_max == 0:
        return []

    return [
        {
            "i": intensity,
            "i_norm": intensity / i_max,
            "i_tic_norm": intensity / i_sum,
            "mz": mz,
            "scan": params["scan"],
            "rt": params["rt"],
            "precmz": params["precmz"],
            "ms1scan": 0,
            "charge": params["charge"],
            "polarity": 1,  # hard-coded default
        }
        for mz, intensity in peaks
        if intensity != 0
    ]


def _load_data_mgf_manual(input_filename):
    """Parse an MGF file line by line without relying on pyteomics.

    Only text between ``BEGIN IONS`` and ``END IONS`` lines is interpreted.
    Inside a spectrum, lines containing ``=`` are treated as headers
    (``PEPMASS``, ``SCANS``, ``RTINSECONDS`` and ``CHARGE`` are used, all
    others are ignored) and every other line is read as an ``mz intensity``
    pair. Header values that cannot be parsed keep their defaults, and
    malformed peak lines are skipped.

    Args:
        input_filename: Path of the MGF file to read.

    Returns:
        A ``(ms1_df, ms2_df)`` tuple. ``ms2_df`` has one row per non-zero
        peak and always carries the full set of columns, even when no peak
        was parsed. ``ms1_df`` is a single placeholder row.

    Raises:
        OSError: If the file cannot be opened.
    """
    ms2_columns = [
        "i",
        "i_norm",
        "i_tic_norm",
        "mz",
        "scan",
        "rt",
        "precmz",
        "ms1scan",
        "charge",
        "polarity",
    ]
    ms2_data_list = []

    # State of the spectrum currently being read.
    current_params = {}
    current_peaks = []
    in_spectrum = False
    spectrum_index = 0

    # errors="replace": undecodable bytes (typically in free-text headers
    # that are ignored anyway) must not abort this best-effort parser.
    with open(input_filename, "r", errors="replace") as f:
        for raw_line in f:
            line = raw_line.strip()

            if not line:
                continue

            if line == "BEGIN IONS":
                in_spectrum = True
                current_params = {
                    "scan": spectrum_index + 1,  # Default scan to index
                    "rt": 0.0,
                    "precmz": 0.0,
                    "charge": 1,
                }
                current_peaks = []
                continue

            if line == "END IONS":
                # Only close a spectrum that was actually opened, so a stray
                # or duplicated END IONS cannot re-emit the previous peaks
                # or shift the default scan numbering.
                if in_spectrum:
                    ms2_data_list.extend(
                        _mgf_spectrum_rows(current_params, current_peaks)
                    )
                    spectrum_index += 1
                in_spectrum = False
                current_peaks = []
                continue

            if not in_spectrum:
                continue

            if "=" in line:
                # Split only on the first '=' to handle values containing '='
                key, value = line.split("=", 1)
                key = key.upper().strip()
                value = value.strip()

                try:
                    if key == "PEPMASS":
                        # PEPMASS often looks like "400.0 5000.0" (mz intensity)
                        current_params["precmz"] = float(value.split()[0])
                    elif key == "SCANS":
                        current_params["scan"] = value
                    elif key == "RTINSECONDS":
                        # Stored value is the header value divided by 60.
                        current_params["rt"] = float(value) / 60.0
                    elif key == "CHARGE":
                        current_params["charge"] = _parse_mgf_charge(value)
                except (ValueError, IndexError):
                    # If header parsing fails, keep default values
                    pass
            else:
                # Assume it is peak data: "mz intensity"
                parts = line.split()
                if len(parts) < 2:
                    continue
                try:
                    current_peaks.append((float(parts[0]), float(parts[1])))
                except ValueError:
                    # Skip malformed peak lines
                    pass

    # Passing the columns explicitly keeps the schema stable for empty input.
    ms2_df = pd.DataFrame(ms2_data_list, columns=ms2_columns)

    # This is kind of a hack for portability: a single placeholder MS1 row.
    placeholder_peak = {
        "i": 0,
        "i_norm": 0,
        "i_tic_norm": 0,
        "mz": 0,
        "scan": 1,
        "rt": 0,
        "polarity": 1,  # Default
    }
    ms1_df = pd.DataFrame([placeholder_peak])

    return ms1_df, ms2_df
224362
225363def _load_data_gnps_json (input_filename ):
0 commit comments