Draft

Changes from all commits (22 commits)
37ec2dc  WIP: debug segfault, ignore test_wgs_data_catalog[adir1_sim] (leehart, Jan 13, 2026)
a299cf0  WIP: debug segfault, ignore test_wgs_data_catalog[adir1_sim] (leehart, Jan 13, 2026)
3a7ff46  WIP: debug segfault, ignore test_wgs_data_catalog[adir1_sim] in cover… (leehart, Jan 13, 2026)
71d6603  Use pd.Series for lookup_sample() to avoid lint error (pd.core.series… (leehart, Jan 13, 2026)
2c402a3  Remove _extract_gff3_attribute. Use defined series to avoid linting e… (leehart, Jan 13, 2026)
ed1d157  Fix lint errors in _unpack_gff3_attributes (leehart, Jan 13, 2026)
a477b67  Avoid using .apply in _unpack_gff3_attributes for type checking (leehart, Jan 13, 2026)
b9e1c0d  Ensure attributes in _unpack_gff3_attributes are dictionaries (leehart, Jan 13, 2026)
20683b7  WIP: address linting failures in _unpack_gff3_attributes (leehart, Jan 13, 2026)
51f6c4a  WIP: dev code comments in _unpack_gff3_attributes (leehart, Jan 13, 2026)
48455df  WIP: ignore all test_wgs_data_catalog cases (debug segfault) (leehart, Jan 13, 2026)
f8b4ef4  WIP: debug test_wgs_data_catalog (segfault) (leehart, Jan 13, 2026)
99063d3  WIP: debug test_wgs_data_catalog cases=[case_adir1_sim, case_amin1_sim] (leehart, Jan 15, 2026)
b1c0ef4  WIP: debug test_wgs_data_catalog unrestricted, surveillance cases (leehart, Jan 15, 2026)
05b2d5c  WIP: debug test_wgs_data_catalog all cases (leehart, Jan 15, 2026)
e1e3739  WIP: debug test_wgs_data_catalog all cases, sample_id check (leehart, Jan 15, 2026)
8013a09  WIP: debug test_wgs_data_catalog, revert (leehart, Jan 15, 2026)
cbf826a  WIP: debug test_wgs_data_catalog, exclude test_wgs_data_catalog, exce… (leehart, Jan 15, 2026)
c6bc075  WIP: debug test_wgs_data_catalog, all cases (leehart, Jan 16, 2026)
0134905  WIP: debug test_wgs_data_catalog, revert (leehart, Jan 16, 2026)
451de5b  Merge branch 'master' into GH835_fix_segfault (leehart, Jan 22, 2026)
b213b01  WIP: try adding engine='python' to all pd.read_csv calls (debug segfa… (leehart, Jan 22, 2026)
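The thread running through these commits is a suspected segfault in the pandas C parser, worked around by forcing the pure-Python parser on every `pd.read_csv` call. A minimal sketch of the pattern, for orientation before the diffs below (the CSV file name is hypothetical):

```python
# Sketch of the pattern this PR applies to every pd.read_csv call.
# pandas selects a parser backend via the `engine` argument: "c" (the
# default) is fast C extension code, and is the code path suspected in
# the segfault; "python" is a slower pure-Python parser that avoids the
# C extension entirely. The file name here is hypothetical.
import pandas as pd

# Before: the default C engine.
df = pd.read_csv("metadata.csv", na_values="")

# After: force the pure-Python parser, as in commit b213b01.
df = pd.read_csv("metadata.csv", na_values="", engine="python")
```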
6 changes: 5 additions & 1 deletion .github/workflows/coverage.yml
@@ -25,7 +25,11 @@ jobs:
       run: pip install .[dev]

     - name: Run unit tests with coverage
-      run: pytest -v tests --ignore tests/integration --cov malariagen_data/anoph --cov-report=xml
+      run: |
+        pytest -v tests \
+          --ignore tests/integration \
+          --cov malariagen_data/anoph \
+          --cov-report=xml

     - name: Upload coverage report
       uses: codecov/codecov-action@v3
5 changes: 4 additions & 1 deletion .github/workflows/tests.yml
@@ -33,4 +33,7 @@ jobs:
       run: pip install "${{ matrix.numpy-version }}" .[dev]

     - name: Run unit tests
-      run: pytest -v tests --ignore tests/integration --typeguard-packages=malariagen_data,malariagen_data.anoph
+      run: |
+        pytest -v tests \
+          --ignore tests/integration \
+          --typeguard-packages=malariagen_data,malariagen_data.anoph
1 change: 1 addition & 0 deletions malariagen_data/ag3.py
@@ -354,6 +354,7 @@ def cross_metadata(self):
            na_values=["", "0"],
            names=fam_names,
            dtype={"sex": str},
+            engine="python",
        )

        debug("convert 'sex' column for consistency with sample metadata")
2 changes: 1 addition & 1 deletion malariagen_data/anoph/base.py
@@ -588,7 +588,7 @@ def _read_sample_sets_manifest(self, *, single_release: str):

        # Read the manifest into a pandas dataframe.
        with self.open_file(manifest_path) as f:
-            df = pd.read_csv(f, sep="\t", na_values="")
+            df = pd.read_csv(f, sep="\t", na_values="", engine="python")

        # Add a "release" column for convenience.
        df["release"] = single_release
2 changes: 1 addition & 1 deletion malariagen_data/anoph/karyotype.py
@@ -61,7 +61,7 @@ def load_inversion_tags(self, inversion: inversion_param) -> pd.DataFrame:
            )
        else:
            with importlib.resources.path(resources, self._inversion_tag_path) as path:
-                df_tag_snps = pd.read_csv(path, sep=",")
+                df_tag_snps = pd.read_csv(path, sep=",", engine="python")
        return df_tag_snps.query(f"inversion == '{inversion}'").reset_index()

    @_check_types
2 changes: 1 addition & 1 deletion malariagen_data/anoph/phenotypes.py
@@ -57,7 +57,7 @@ def _load_phenotype_data(

        with self._fs.open(phenotype_path, "r") as f:
            try:
-                df_pheno = pd.read_csv(f, low_memory=False)
+                df_pheno = pd.read_csv(f, low_memory=False, engine="python")
            except pd.errors.EmptyDataError:
                warnings.warn(f"Empty phenotype file for {sample_set}")
                continue
22 changes: 15 additions & 7 deletions malariagen_data/anoph/sample_metadata.py
@@ -167,7 +167,9 @@ def _parse_general_metadata(
            }
            # `dict[str, str]` is incompatible with the `dtype` of `pd.read_csv`
            dtype: DefaultDict[str, str] = defaultdict(lambda: "object", dtype_dict)
-            df = pd.read_csv(io.BytesIO(data), dtype=dtype, na_values="")
+            df = pd.read_csv(
+                io.BytesIO(data), dtype=dtype, na_values="", engine="python"
+            )

            # Ensure all column names are lower case.
            df.columns = [c.lower() for c in df.columns]  # type: ignore
@@ -258,7 +260,9 @@ def _parse_sequence_qc_metadata(
            dtype: DefaultDict[str, str] = defaultdict(lambda: "object", dtype_dict)

            # Read the CSV using the dtype dict.
-            df = pd.read_csv(io.BytesIO(data), dtype=dtype, na_values="")
+            df = pd.read_csv(
+                io.BytesIO(data), dtype=dtype, na_values="", engine="python"
+            )

            return df

@@ -383,7 +387,9 @@ def _parse_surveillance_flags(

        if isinstance(data, bytes):
            # Read the CSV data.
-            df = pd.read_csv(io.BytesIO(data), dtype=dtype, na_values="")
+            df = pd.read_csv(
+                io.BytesIO(data), dtype=dtype, na_values="", engine="python"
+            )

            # If there are any nulls in these data, show a warning.
            if df.isnull().values.any():
@@ -518,7 +524,9 @@ def _parse_cohorts_metadata(
            # `dict[str, str]` is incompatible with the `dtype` of `pd.read_csv`
            dtype: DefaultDict[str, str] = defaultdict(lambda: "object", dtype_dict)

-            df = pd.read_csv(io.BytesIO(data), dtype=dtype, na_values="")
+            df = pd.read_csv(
+                io.BytesIO(data), dtype=dtype, na_values="", engine="python"
+            )

            # Ensure all column names are lower case.
            df.columns = [c.lower() for c in df.columns]  # type: ignore
@@ -592,7 +600,7 @@ def _parse_aim_metadata(
        assert self._aim_metadata_dtype is not None
        if isinstance(data, bytes):
            # Parse CSV data but don't apply the dtype yet.
-            df = pd.read_csv(io.BytesIO(data), na_values="")
+            df = pd.read_csv(io.BytesIO(data), na_values="", engine="python")

            # Convert all column names to lowercase.
            df.columns = [c.lower() for c in df.columns]  # type: ignore
@@ -1042,7 +1050,7 @@ def wgs_run_accessions(self, sample_set: base_params.sample_set):
        # Load data catalog.
        path = f"{self._base_path}/{release_path}/metadata/general/{sample_set}/wgs_accession_data.csv"
        with self._fs.open(path) as f:
-            df = pd.read_csv(f, na_values="")
+            df = pd.read_csv(f, na_values="", engine="python")

        # Normalise columns.
        df = df[
@@ -1512,7 +1520,7 @@ def cohorts(

        # Read the manifest into a pandas dataframe.
        with self.open_file(path) as f:
-            df_cohorts = pd.read_csv(f, sep=",", na_values="")
+            df_cohorts = pd.read_csv(f, sep=",", na_values="", engine="python")

        # Ensure all column names are lower case.
        df_cohorts.columns = [c.lower() for c in df_cohorts.columns]  # type: ignore
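As an aside for reviewers, the defaultdict-based `dtype` pattern that several of these hunks touch (flagged by the `dict[str, str]` incompatibility comment) can be exercised standalone. A sketch with made-up column names, not code from this repository:

```python
# Sketch of the dtype pattern used above: a defaultdict lets every
# column not listed explicitly fall back to "object", while the listed
# columns keep their declared dtypes. Annotating it as
# DefaultDict[str, str] also satisfies the type checker where a plain
# dict[str, str] would not. pandas >= 1.5 accepts a defaultdict here.
import io
from collections import defaultdict
from typing import DefaultDict

import pandas as pd

dtype_dict = {"sample_id": "object", "year": "int64"}
dtype: DefaultDict[str, str] = defaultdict(lambda: "object", dtype_dict)

data = b"sample_id,year,notes\nAB0001,2020,ok\n"
df = pd.read_csv(io.BytesIO(data), dtype=dtype, na_values="", engine="python")
print(df.dtypes)  # sample_id: object, year: int64, notes: object
```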
4 changes: 3 additions & 1 deletion malariagen_data/plasmodium.py
@@ -61,7 +61,9 @@ def sample_metadata(self):
        if self._cache_sample_metadata is None:
            path = f"{self._path}/{self.CONF['metadata_path']}"
            with self._fs.open(path) as f:
-                self._cache_sample_metadata = pd.read_csv(f, sep="\t", na_values="")
+                self._cache_sample_metadata = pd.read_csv(
+                    f, sep="\t", na_values="", engine="python"
+                )
        return self._cache_sample_metadata

    def _open_variant_calls_zarr(self):
36 changes: 29 additions & 7 deletions malariagen_data/util.py
@@ -10,7 +10,17 @@
from functools import wraps
from inspect import getcallargs
from textwrap import dedent, fill
-from typing import IO, Dict, Hashable, List, Mapping, Optional, Tuple, Union, Callable
+from typing import (
+    IO,
+    Dict,
+    Hashable,
+    List,
+    Mapping,
+    Optional,
+    Tuple,
+    Union,
+    Callable,
+)
from urllib.parse import unquote_plus
from numpy.testing import assert_allclose, assert_array_equal

@@ -83,6 +93,7 @@ def _read_gff3(buf, compression="gzip"):
        names=gff3_cols,
        na_values=["", "."],
        compression=compression,
+        engine="python",
    )

    # parse attributes
@@ -91,27 +102,38 @@
    return df


-def _unpack_gff3_attributes(df: pd.DataFrame, attributes: Tuple[str, ...]):
+def _unpack_gff3_attributes(
+    df: pd.DataFrame,
+    attributes: Tuple[str, ...],
+) -> pd.DataFrame:
    df = df.copy()

-    # discover all attribute keys
+    # Collect all the unique attributes in the DataFrame as a sorted tuple.
    all_attributes = set()
    for a in df["attributes"]:
        all_attributes.update(a.keys())
    all_attributes_sorted = tuple(sorted(all_attributes))

-    # handle request for all attributes
+    # If an asterisk was specified, use all the available attributes.
    if attributes == ("*",):
        attributes = all_attributes_sorted

-    # unpack attributes into columns
+    # For each of the specified attributes,
+    # if the attribute is not in the tuple of available attributes,
+    # then raise a ValueError.
    for key in attributes:
        if key not in all_attributes_sorted:
            raise ValueError(
                f"'{key}' not in attributes set. Options {all_attributes_sorted}"
            )
-        df[key] = df["attributes"].apply(lambda v: v.get(key, np.nan))
-    del df["attributes"]
+
+        # Copy the specified attribute into a new column in the DataFrame.
+        # Note: avoid using .apply() here, for type checking.
+        df[key] = [a.get(key, np.nan) for a in df["attributes"]]
+
+    # Drop the original "attributes" column from the DataFrame.
+    # Note: avoid using del here, for type checking.
+    df = df.drop(columns=["attributes"])

    return df

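To see why the list comprehension replaces `.apply()` (commit a477b67), here is a small standalone sketch of the refactored attribute unpacking against a toy DataFrame. It is an illustration only, not code from the diff:

```python
# Toy illustration: unpack GFF3-style attribute dicts into columns using
# a list comprehension, mirroring the refactored _unpack_gff3_attributes.
# Unlike Series.apply, the comprehension keeps the element type visible
# to static type checkers, and missing keys become NaN just as with
# v.get(key, np.nan).
import numpy as np
import pandas as pd

df = pd.DataFrame(
    {
        "seqid": ["2L", "2L"],
        "attributes": [{"ID": "gene1", "Name": "foo"}, {"ID": "gene2"}],
    }
)

for key in ("ID", "Name"):
    # Each row's attribute dict contributes one value; absent keys -> NaN.
    df[key] = [a.get(key, np.nan) for a in df["attributes"]]

# Drop the packed column, as the refactored function does.
df = df.drop(columns=["attributes"])
print(df)
# Approximate output:
#   seqid     ID Name
# 0    2L  gene1  foo
# 1    2L  gene2  NaN
```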