Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions doc/changes/dev/13932.bugfix.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Use mffpy-backed parsing for EGI MFF event tracks while tolerating nanosecond timestamps that some files store in ``beginTime``, by `Pragnya Khandelwal`_.
32 changes: 32 additions & 0 deletions mne/fixes.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
import operator as operator_module
import os
import warnings
from datetime import datetime
from math import log

import numpy as np
Expand Down Expand Up @@ -136,6 +137,37 @@ def _safe_svd(A, **kwargs):
return linalg.svd(A, lapack_driver="gesvd", **kwargs)


def _parse_mffpy_datetime(time_str, *, tzinfo=None):
"""Parse an MFF timestamp with nanosecond fractional seconds.

TODO VERSION: Remove once BEL-Public/mffpy#133 is released.
Upstream issue: https://github.com/BEL-Public/mffpy/issues/138
"""
if time_str is None:
return None
stripped = time_str.strip()
tz_pos = max(stripped.rfind("+"), stripped.rfind("-"))
tz = ""
core = stripped
if tz_pos > stripped.find("T"):
core = stripped[:tz_pos]
tz = stripped[tz_pos:]
if "." in core:
left, frac = core.split(".", 1)
core = f"{left}.{(frac + '000000')[:6]}"
if tz and ":" in tz:
tz = tz.replace(":", "")
formatted = core + tz
fmt = "%Y-%m-%dT%H:%M:%S.%f%z" if tz else "%Y-%m-%dT%H:%M:%S.%f"
try:
parsed = datetime.strptime(formatted, fmt)
except ValueError:
parsed = datetime.strptime(formatted.split(".")[0], "%Y-%m-%dT%H:%M:%S%z")
if parsed.tzinfo is None and tzinfo is not None:
parsed = parsed.replace(tzinfo=tzinfo)
return parsed


###############################################################################
# NumPy Generator (NumPy 1.17)

Expand Down
144 changes: 85 additions & 59 deletions mne/io/egi/events.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,12 +3,11 @@
# License: BSD-3-Clause
# Copyright the MNE-Python contributors.

from datetime import datetime
from glob import glob
from os.path import basename, join, splitext
from os.path import basename, splitext

import numpy as np

from ...fixes import _parse_mffpy_datetime
from ...utils import _soft_import, _validate_type, logger, warn


Expand All @@ -23,7 +22,9 @@ def _read_events(input_fname, info):
Header info array.
"""
n_samples = info["last_samps"][-1]
mff_events, event_codes = _read_mff_events(input_fname, info["sfreq"])
mff_events, event_codes = _read_mff_events(
input_fname, info["sfreq"], info["meas_dt_local"]
)
info["n_events"] = len(event_codes)
info["event_codes"] = event_codes
events = np.zeros([info["n_events"], info["n_segments"] * n_samples])
Expand All @@ -35,65 +36,99 @@ def _read_events(input_fname, info):
return events, info, mff_events


def _read_mff_events(filename, sfreq):
"""Extract the events.
def _read_mff_events(filename, sfreq, start_time):
"""Extract the events with mffpy.

Parameters
----------
filename : path-like
File path.
sfreq : float
The sampling frequency
start_time : datetime
The recording start time used as the event anchor.
"""
orig = {}
for xml_file in glob(join(filename, "*.xml")):
xml_type = splitext(basename(xml_file))[0]
et = _parse_xml(xml_file)
if et is not None:
orig[xml_type] = et
xml_files = orig.keys()
xml_events = [x for x in xml_files if x[:7] == "Events_"]
for item in orig["info"]:
if "recordTime" in item:
start_time = _ns2py_time(item["recordTime"])
break
# Use defusedxml to parse Events XML directly (avoid mffpy's strict
# datetime parsing which may include nanosecond fractions). We still use
# mffpy.Reader for locating the Events.xml files inside the MFF.
_soft_import("mffpy", "reading EGI MFF data")
_soft_import("defusedxml", "reading EGI MFF data")
import defusedxml.ElementTree as DET
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is there a reason you're using difusedxml instead of mffpy.XML.from_file( )?

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks so much for taking a look! There are two reasons for this:

MNE-Python has a strict internal policy requiring defusedxml for all XML parsing to protect against XML vulnerability attacks.

Because of the 9-digit fractional timestamp bug we discussed in issue BEL-Public/mffpy#138 calling mffpy.XML directly crashes the CI right now. I'm using defusedxml to route the parsing through a temporary shim in mne/fixes.py until your PR gets merged and released!

Copy link
Copy Markdown
Contributor

@pmolfese pmolfese May 29, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The struggle is real as we say. This might be something more for @scott-huberty and @drammock to ponder beyond this GSOC.

The MFF reader in mffpy currently used for reading epochs or evoked files is via Reader() is already using the mffpy XML.from_file() at least for getting header information and events (mne/io/egi/egimff.py). So there's already some exposure to another XML reader in the current codebase.

The pros of keeping the mffpy way of things is just how easy some things are like returning typed objects (.sensors, .epochs, .events), recover=True for malformed XML, and the fact that Reader() is already using it.

for evfile in sorted(glob(op.join(input_fname, "Events_*.xml"))):
	track = XML.from_file(evfile)
	for event in track.events:
		code = event.get("code")
		if code is None:
			continue
...
...

I suppose we could have a conversation with BEL and other mffpy maintainers about switching the current lxml backend to defusedxml.

My thesis is currently: "if the point of this project is to use mffpy to parse MFF files, then you should use mffpy's way of doing things and if those need changing, then we should probably change those upstream". Perhaps the compromise could be using the XML.* functions for this with the goal of changing things upstream in mffpy.

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hmm - okay faster than expected I have a mostly working defusedxml backend for mffpy. Might be able to crank this out in a week or so.

Copy link
Copy Markdown
Contributor

@scott-huberty scott-huberty May 30, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@drammock / @larsoner : 1) I assume it's ok if one of our dependencies parses XML files using lxml?

and 2) If mffpy devs cut a release after the aforementioned timestamp bug is fixed upstream... Is there any chance that MNE is comfortable with putting a lower pin on mffpy's latest release, so that we do not have to keep our shim around? 😁

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Pins are fine for newly introduced dependencies. It's only an issue if we move from no pin to pin, or bump the pin.

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I assume it's ok if one of our dependencies parses XML files using lxml?

we use defusedxml in our code because it closes some vulnerabilities present in the standard library. We cannot force dependencies to do likewise, but can suggest/encourage it upstream. IIRC, lxml has similar safeguards to defusedxml, but is a much more heavyweight package (does more stuff). I'd need to look into it to be sure though.

If a required dependency used lxml then I'd actually consider switching us to use it too --- why have 2 libs installed if one will suffice? But mffpy will be an optional dep (right?) so that logic doesn't necessarily apply, at least not as strongly, esp if there are salient differences in security or install size or API convenience.

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Edit: sorry for the in Felicity of my last message, I just saw that @pmolfese is already looking into what it would take to switch to defusedxml upstream. Thanks! LMK if I can be of help there.

import mffpy

reader = mffpy.Reader(filename)
try:
files_list = sorted(reader.directory.listdir())
except Exception:
files_list = []
tracks = []
for xml_name in files_list:
stem = splitext(basename(xml_name))[0]
if not stem.startswith("Events"):
continue
with reader.directory.filepointer(stem) as fp:
try:
root = DET.parse(fp).getroot()
except Exception:
# fallback: try reading as bytes and parse string
try:
fp.seek(0)
txt = fp.read()
root = DET.fromstring(txt)
except Exception as exc2:
warn(
f"Could not parse the XML file {xml_name}: {exc2}",
RuntimeWarning,
)
continue
# identify eventTrack root (namespace-insensitive)
if _ns(root.tag) == "eventTrack":
tracks.append(root)

markers = []
code = []
for xml in xml_events:
for event in orig[xml][2:]:
event_start = _ns2py_time(event["beginTime"])
start = (event_start - start_time).total_seconds()
if event["code"] not in code:
code.append(event["code"])
marker = {
"name": event["code"],
"start": start,
"start_sample": int(np.trunc(start * sfreq)),
"end": start + float(event["duration"]) / 1e9,
"chan": None,
}
markers.append(marker)
events_tims = dict()
for ev in code:
trig_samp = list(
c["start_sample"] for n, c in enumerate(markers) if c["name"] == ev
)
events_tims.update({ev: trig_samp})
for root in tracks:
# each child 'event' element
for event_el in root.findall("{*}event"):
# extract fields by tag name ignoring namespace
ev = {}
for child in event_el:
tag = _ns(child.tag)
ev[tag] = child.text
# parse times and duration
event_start = _parse_mffpy_datetime(
ev.get("beginTime"), tzinfo=start_time.tzinfo
)
if event_start is None:
continue
start_sec = (event_start - start_time).total_seconds()
code_str = ev.get("code", "")
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

There are probably a couple checks you could add:

  1. codes are supposed to be 4 characters (e.g. "STIM") and if not, then you're probably not reading a real MFF file
  2. You probably also want to read in the labels for use if/when you add annotations

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

These are really great suggestions. Because this is the very first step of my GSoC project, my mentors requested that I keep this initial micro-PR strictly confined to 1:1 functional parity with MNE's legacy reader. The legacy reader didn't enforce the 4-character limit or map the labels, so I want to avoid expanding the scope just yet.
However, adding proper annotations is on my roadmap for Phase 2 here #13926, so I will definitely be referencing those labels when we get there!

if code_str not in code:
code.append(code_str)
# duration in xml is typically in nanoseconds
duration = None
if ev.get("duration") is not None:
try:
duration = int(ev.get("duration")) / 1e9
except Exception:
duration = None
markers.append(
{
"name": code_str,
"start": start_sec,
"start_sample": int(np.trunc(start_sec * sfreq)),
"end": start_sec + (duration if duration is not None else 0.0),
"chan": None,
}
)

events_tims = {
ev: [marker["start_sample"] for marker in markers if marker["name"] == ev]
for ev in code
}
return events_tims, code


def _parse_xml(xml_file: str) -> list[dict[str, str]] | None:
"""Parse XML file."""
defusedxml = _soft_import("defusedxml", "reading EGI MFF data")
try:
xml = defusedxml.ElementTree.parse(xml_file)
except defusedxml.ElementTree.ParseError as e:
warn(f"Could not parse the XML file {xml_file}: {e}")
return
root = xml.getroot()
return _xml2list(root)


def _xml2list(root):
"""Parse XML item."""
output = []
Expand Down Expand Up @@ -150,15 +185,6 @@ def _xml2dict(root):
return output


def _ns2py_time(nstime):
"""Parse times."""
nsdate = nstime[0:10]
nstime0 = nstime[11:26]
nstime00 = nsdate + " " + nstime0
pytime = datetime.strptime(nstime00, "%Y-%m-%d %H:%M:%S.%f")
return pytime


def _combine_triggers(data, remapping=None):
"""Combine binary triggers."""
new_trigger = np.zeros(data.shape[1])
Expand Down
2 changes: 1 addition & 1 deletion mne/io/egi/tests/test_egi.py
Original file line number Diff line number Diff line change
Expand Up @@ -593,7 +593,7 @@ def test_egi_mff_bad_xml(tmp_path):
"""Test that corrupt XML files are gracefully handled."""
pytest.importorskip("defusedxml")
mff_fname = copytree_rw(egi_mff_fname, tmp_path / "test_egi_bad_xml.mff")
bad_xml = mff_fname / "bad.xml"
bad_xml = mff_fname / "Events_bad.xml"
bad_xml.write_text("<foo>", encoding="utf-8")
# Missing coordinate file
(mff_fname / "coordinates.xml").unlink()
Expand Down
Loading