Draft

Changes from all commits (22 commits)
37ec2dc  WIP: debug segfault, ignore test_wgs_data_catalog[adir1_sim] (leehart, Jan 13, 2026)
a299cf0  WIP: debug segfault, ignore test_wgs_data_catalog[adir1_sim] (leehart, Jan 13, 2026)
3a7ff46  WIP: debug segfault, ignore test_wgs_data_catalog[adir1_sim] in cover… (leehart, Jan 13, 2026)
71d6603  Use pd.Series for lookup_sample() to avoid lint error (pd.core.series… (leehart, Jan 13, 2026)
2c402a3  Remove _extract_gff3_attribute. Use defined series to avoid linting e… (leehart, Jan 13, 2026)
ed1d157  Fix lint errors in _unpack_gff3_attributes (leehart, Jan 13, 2026)
a477b67  Avoid using .apply in _unpack_gff3_attributes for type checking (leehart, Jan 13, 2026)
b9e1c0d  Ensure attributes in _unpack_gff3_attributes are dictionaries (leehart, Jan 13, 2026)
20683b7  WIP: address linting failures in _unpack_gff3_attributes (leehart, Jan 13, 2026)
51f6c4a  WIP: dev code comments in _unpack_gff3_attributes (leehart, Jan 13, 2026)
48455df  WIP: ignore all test_wgs_data_catalog cases (debug segfault) (leehart, Jan 13, 2026)
f8b4ef4  WIP: debug test_wgs_data_catalog (segfault) (leehart, Jan 13, 2026)
99063d3  WIP: debug test_wgs_data_catalog cases=[case_adir1_sim, case_amin1_sim] (leehart, Jan 15, 2026)
b1c0ef4  WIP: debug test_wgs_data_catalog unrestricted, surveillance cases (leehart, Jan 15, 2026)
05b2d5c  WIP: debug test_wgs_data_catalog all cases (leehart, Jan 15, 2026)
e1e3739  WIP: debug test_wgs_data_catalog all cases, sample_id check (leehart, Jan 15, 2026)
8013a09  WIP: debug test_wgs_data_catalog, revert (leehart, Jan 15, 2026)
cbf826a  WIP: debug test_wgs_data_catalog, exclude test_wgs_data_catalog, exce… (leehart, Jan 15, 2026)
c6bc075  WIP: debug test_wgs_data_catalog, all cases (leehart, Jan 16, 2026)
0134905  WIP: debug test_wgs_data_catalog, revert (leehart, Jan 16, 2026)
451de5b  Merge branch 'master' into GH835_fix_segfault (leehart, Jan 22, 2026)
b213b01  WIP: try adding engine='python' to all pd.read_csv calls (debug segfa… (leehart, Jan 22, 2026)
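The thread running through these commits is a suspected segfault in the pandas C parser, worked around by forcing the pure-Python parser on every `pd.read_csv` call. A minimal sketch of the pattern, for orientation before the diffs below (the CSV file name is hypothetical):

```python
# Sketch of the pattern this PR applies to every pd.read_csv call.
# pandas selects a parser backend via the `engine` argument: "c" (the
# default) is fast C extension code, and is the code path suspected in
# the segfault; "python" is a slower pure-Python parser that avoids the
# C extension entirely. The file name here is hypothetical.
import pandas as pd

# Before: the default C engine.
df = pd.read_csv("metadata.csv", na_values="")

# After: force the pure-Python parser, as in commit b213b01.
df = pd.read_csv("metadata.csv", na_values="", engine="python")
```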
6 changes: 5 additions & 1 deletion .github/workflows/coverage.yml
@@ -25,7 +25,11 @@ jobs:
       run: pip install .[dev]

     - name: Run unit tests with coverage
-      run: pytest -v tests --ignore tests/integration --cov malariagen_data/anoph --cov-report=xml
+      run: |
+        pytest -v tests \
+          --ignore tests/integration \
+          --cov malariagen_data/anoph \
+          --cov-report=xml

     - name: Upload coverage report
       uses: codecov/codecov-action@v3
5 changes: 4 additions & 1 deletion .github/workflows/tests.yml
@@ -33,4 +33,7 @@ jobs:
       run: pip install "${{ matrix.numpy-version }}" .[dev]

     - name: Run unit tests
-      run: pytest -v tests --ignore tests/integration --typeguard-packages=malariagen_data,malariagen_data.anoph
+      run: |
+        pytest -v tests \
+          --ignore tests/integration \
+          --typeguard-packages=malariagen_data,malariagen_data.anoph
1 change: 1 addition & 0 deletions malariagen_data/ag3.py
@@ -354,6 +354,7 @@ def cross_metadata(self):
            na_values=["", "0"],
            names=fam_names,
            dtype={"sex": str},
+            engine="python",
        )

        debug("convert 'sex' column for consistency with sample metadata")
2 changes: 1 addition & 1 deletion malariagen_data/anoph/base.py
@@ -588,7 +588,7 @@ def _read_sample_sets_manifest(self, *, single_release: str):

        # Read the manifest into a pandas dataframe.
        with self.open_file(manifest_path) as f:
-            df = pd.read_csv(f, sep="\t", na_values="")
+            df = pd.read_csv(f, sep="\t", na_values="", engine="python")

        # Add a "release" column for convenience.
        df["release"] = single_release
2 changes: 1 addition & 1 deletion malariagen_data/anoph/karyotype.py
@@ -61,7 +61,7 @@ def load_inversion_tags(self, inversion: inversion_param) -> pd.DataFrame:
            )
        else:
            with importlib.resources.path(resources, self._inversion_tag_path) as path:
-                df_tag_snps = pd.read_csv(path, sep=",")
+                df_tag_snps = pd.read_csv(path, sep=",", engine="python")
        return df_tag_snps.query(f"inversion == '{inversion}'").reset_index()

    @_check_types
2 changes: 1 addition & 1 deletion malariagen_data/anoph/phenotypes.py
@@ -57,7 +57,7 @@ def _load_phenotype_data(

        with self._fs.open(phenotype_path, "r") as f:
            try:
-                df_pheno = pd.read_csv(f, low_memory=False)
+                df_pheno = pd.read_csv(f, low_memory=False, engine="python")
            except pd.errors.EmptyDataError:
                warnings.warn(f"Empty phenotype file for {sample_set}")
                continue
22 changes: 15 additions & 7 deletions malariagen_data/anoph/sample_metadata.py
@@ -167,7 +167,9 @@ def _parse_general_metadata(
            }
            # `dict[str, str]` is incompatible with the `dtype` of `pd.read_csv`
            dtype: DefaultDict[str, str] = defaultdict(lambda: "object", dtype_dict)
-            df = pd.read_csv(io.BytesIO(data), dtype=dtype, na_values="")
+            df = pd.read_csv(
+                io.BytesIO(data), dtype=dtype, na_values="", engine="python"
+            )

            # Ensure all column names are lower case.
            df.columns = [c.lower() for c in df.columns]  # type: ignore
@@ -258,7 +260,9 @@ def _parse_sequence_qc_metadata(
            dtype: DefaultDict[str, str] = defaultdict(lambda: "object", dtype_dict)

            # Read the CSV using the dtype dict.
-            df = pd.read_csv(io.BytesIO(data), dtype=dtype, na_values="")
+            df = pd.read_csv(
+                io.BytesIO(data), dtype=dtype, na_values="", engine="python"
+            )

            return df

@@ -383,7 +387,9 @@ def _parse_surveillance_flags(

        if isinstance(data, bytes):
            # Read the CSV data.
-            df = pd.read_csv(io.BytesIO(data), dtype=dtype, na_values="")
+            df = pd.read_csv(
+                io.BytesIO(data), dtype=dtype, na_values="", engine="python"
+            )

            # If there are any nulls in these data, show a warning.
            if df.isnull().values.any():
@@ -518,7 +524,9 @@ def _parse_cohorts_metadata(
            # `dict[str, str]` is incompatible with the `dtype` of `pd.read_csv`
            dtype: DefaultDict[str, str] = defaultdict(lambda: "object", dtype_dict)

-            df = pd.read_csv(io.BytesIO(data), dtype=dtype, na_values="")
+            df = pd.read_csv(
+                io.BytesIO(data), dtype=dtype, na_values="", engine="python"
+            )

            # Ensure all column names are lower case.
            df.columns = [c.lower() for c in df.columns]  # type: ignore
@@ -592,7 +600,7 @@ def _parse_aim_metadata(
        assert self._aim_metadata_dtype is not None
        if isinstance(data, bytes):
            # Parse CSV data but don't apply the dtype yet.
-            df = pd.read_csv(io.BytesIO(data), na_values="")
+            df = pd.read_csv(io.BytesIO(data), na_values="", engine="python")

            # Convert all column names to lowercase.
            df.columns = [c.lower() for c in df.columns]  # type: ignore
@@ -1042,7 +1050,7 @@ def wgs_run_accessions(self, sample_set: base_params.sample_set):
        # Load data catalog.
        path = f"{self._base_path}/{release_path}/metadata/general/{sample_set}/wgs_accession_data.csv"
        with self._fs.open(path) as f:
-            df = pd.read_csv(f, na_values="")
+            df = pd.read_csv(f, na_values="", engine="python")

        # Normalise columns.
        df = df[
@@ -1512,7 +1520,7 @@ def cohorts(

        # Read the manifest into a pandas dataframe.
        with self.open_file(path) as f:
-            df_cohorts = pd.read_csv(f, sep=",", na_values="")
+            df_cohorts = pd.read_csv(f, sep=",", na_values="", engine="python")

        # Ensure all column names are lower case.
        df_cohorts.columns = [c.lower() for c in df_cohorts.columns]  # type: ignore
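As an aside for reviewers, the defaultdict-based `dtype` pattern that several of these hunks touch (flagged by the `dict[str, str]` incompatibility comment) can be exercised standalone. A sketch with made-up column names, not code from this repository:

```python
# Sketch of the dtype pattern used above: a defaultdict lets every
# column not listed explicitly fall back to "object", while the listed
# columns keep their declared dtypes. Annotating it as
# DefaultDict[str, str] also satisfies the type checker where a plain
# dict[str, str] would not. pandas >= 1.5 accepts a defaultdict here.
import io
from collections import defaultdict
from typing import DefaultDict

import pandas as pd

dtype_dict = {"sample_id": "object", "year": "int64"}
dtype: DefaultDict[str, str] = defaultdict(lambda: "object", dtype_dict)

data = b"sample_id,year,notes\nAB0001,2020,ok\n"
df = pd.read_csv(io.BytesIO(data), dtype=dtype, na_values="", engine="python")
print(df.dtypes)  # sample_id: object, year: int64, notes: object
```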
4 changes: 3 additions & 1 deletion malariagen_data/plasmodium.py
@@ -61,7 +61,9 @@ def sample_metadata(self):
        if self._cache_sample_metadata is None:
            path = f"{self._path}/{self.CONF['metadata_path']}"
            with self._fs.open(path) as f:
-                self._cache_sample_metadata = pd.read_csv(f, sep="\t", na_values="")
+                self._cache_sample_metadata = pd.read_csv(
+                    f, sep="\t", na_values="", engine="python"
+                )
        return self._cache_sample_metadata

    def _open_variant_calls_zarr(self):
36 changes: 29 additions & 7 deletions malariagen_data/util.py
@@ -10,7 +10,17 @@
from functools import wraps
from inspect import getcallargs
from textwrap import dedent, fill
-from typing import IO, Dict, Hashable, List, Mapping, Optional, Tuple, Union, Callable
+from typing import (
+    IO,
+    Dict,
+    Hashable,
+    List,
+    Mapping,
+    Optional,
+    Tuple,
+    Union,
+    Callable,
+)
from urllib.parse import unquote_plus
from numpy.testing import assert_allclose, assert_array_equal

@@ -83,6 +93,7 @@ def _read_gff3(buf, compression="gzip"):
        names=gff3_cols,
        na_values=["", "."],
        compression=compression,
+        engine="python",
    )

    # parse attributes
@@ -91,27 +102,38 @@
    return df


-def _unpack_gff3_attributes(df: pd.DataFrame, attributes: Tuple[str, ...]):
+def _unpack_gff3_attributes(
+    df: pd.DataFrame,
+    attributes: Tuple[str, ...],
+) -> pd.DataFrame:
    df = df.copy()

-    # discover all attribute keys
+    # Collect all the unique attributes in the DataFrame as a sorted tuple.
    all_attributes = set()
    for a in df["attributes"]:
        all_attributes.update(a.keys())
    all_attributes_sorted = tuple(sorted(all_attributes))

-    # handle request for all attributes
+    # If an asterisk was specified, use all the available attributes.
    if attributes == ("*",):
        attributes = all_attributes_sorted

-    # unpack attributes into columns
+    # For each of the specified attributes,
+    # if the attribute is not in the tuple of available attributes,
+    # then raise a ValueError.
    for key in attributes:
        if key not in all_attributes_sorted:
            raise ValueError(
                f"'{key}' not in attributes set. Options {all_attributes_sorted}"
            )
-        df[key] = df["attributes"].apply(lambda v: v.get(key, np.nan))
-    del df["attributes"]
+
+        # Copy the specified attribute into a new column in the DataFrame.
+        # Note: avoid using .apply() here, for type checking.
+        df[key] = [a.get(key, np.nan) for a in df["attributes"]]
+
+    # Drop the original "attributes" column from the DataFrame.
+    # Note: avoid using del here, for type checking.
+    df = df.drop(columns=["attributes"])

    return df

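To see why the list comprehension replaces `.apply()` (commit a477b67), here is a small standalone sketch of the refactored attribute unpacking against a toy DataFrame. It is an illustration only, not code from the diff:

```python
# Toy illustration: unpack GFF3-style attribute dicts into columns using
# a list comprehension, mirroring the refactored _unpack_gff3_attributes.
# Unlike Series.apply, the comprehension keeps the element type visible
# to static type checkers, and missing keys become NaN just as with
# v.get(key, np.nan).
import numpy as np
import pandas as pd

df = pd.DataFrame(
    {
        "seqid": ["2L", "2L"],
        "attributes": [{"ID": "gene1", "Name": "foo"}, {"ID": "gene2"}],
    }
)

for key in ("ID", "Name"):
    # Each row's attribute dict contributes one value; absent keys -> NaN.
    df[key] = [a.get(key, np.nan) for a in df["attributes"]]

# Drop the packed column, as the refactored function does.
df = df.drop(columns=["attributes"])
print(df)
# Approximate output:
#   seqid     ID Name
# 0    2L  gene1  foo
# 1    2L  gene2  NaN
```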