From 37ec2dcf33d44d2ee5dac589415bc3fbe002c33d Mon Sep 17 00:00:00 2001 From: Lee <4256466+leehart@users.noreply.github.com> Date: Tue, 13 Jan 2026 11:39:08 +0000 Subject: [PATCH 01/21] WIP: debug segfault, ignore test_wgs_data_catalog[adir1_sim] --- .github/workflows/tests.yml | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index 942aad738..e2f85326f 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -33,4 +33,7 @@ jobs: run: pip install "${{ matrix.numpy-version }}" .[dev] - name: Run unit tests - run: pytest -v tests --ignore tests/integration --typeguard-packages=malariagen_data,malariagen_data.anoph + run: pytest -v tests \ + --ignore tests/integration \ + -k "not test_wgs_data_catalog[adir1_sim]" \ + --typeguard-packages=malariagen_data,malariagen_data.anoph From a299cf0fea8175fac78999b84f46d14f3c2adc96 Mon Sep 17 00:00:00 2001 From: Lee <4256466+leehart@users.noreply.github.com> Date: Tue, 13 Jan 2026 11:46:53 +0000 Subject: [PATCH 02/21] WIP: debug segfault, ignore test_wgs_data_catalog[adir1_sim] --- .github/workflows/tests.yml | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index e2f85326f..8c78b7bd6 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -33,7 +33,8 @@ jobs: run: pip install "${{ matrix.numpy-version }}" .[dev] - name: Run unit tests - run: pytest -v tests \ - --ignore tests/integration \ - -k "not test_wgs_data_catalog[adir1_sim]" \ - --typeguard-packages=malariagen_data,malariagen_data.anoph + run: | + pytest -v tests \ + --ignore tests/integration \ + -k "not test_wgs_data_catalog[adir1_sim]" \ + --typeguard-packages=malariagen_data,malariagen_data.anoph From 3a7ff4682d58161d8ef6e283622c8fcab1e628f4 Mon Sep 17 00:00:00 2001 From: Lee <4256466+leehart@users.noreply.github.com> Date: Tue, 13 Jan 2026 12:11:49 +0000 Subject: [PATCH 03/21] WIP: debug segfault, ignore test_wgs_data_catalog[adir1_sim] in coverage.yml --- .github/workflows/coverage.yml | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/.github/workflows/coverage.yml b/.github/workflows/coverage.yml index 88d799320..5e69e5b67 100644 --- a/.github/workflows/coverage.yml +++ b/.github/workflows/coverage.yml @@ -25,7 +25,12 @@ jobs: run: pip install .[dev] - name: Run unit tests with coverage - run: pytest -v tests --ignore tests/integration --cov malariagen_data/anoph --cov-report=xml + run: | + pytest -v tests \ + --ignore tests/integration \ + -k "not test_wgs_data_catalog[adir1_sim]" \ + --cov malariagen_data/anoph \ + --cov-report=xml - name: Upload coverage report uses: codecov/codecov-action@v3 From 71d6603f5bfbb010bb59bd6c15e1926985252a7f Mon Sep 17 00:00:00 2001 From: Lee <4256466+leehart@users.noreply.github.com> Date: Tue, 13 Jan 2026 12:59:34 +0000 Subject: [PATCH 04/21] Use pd.Series for lookup_sample() to avoid lint error (pd.core.series.Series is not defined) --- malariagen_data/anoph/sample_metadata.py | 2 +- malariagen_data/util.py | 22 ++++++++++++++++++++-- 2 files changed, 21 insertions(+), 3 deletions(-) diff --git a/malariagen_data/anoph/sample_metadata.py b/malariagen_data/anoph/sample_metadata.py index c56d271dc..1314873a9 100644 --- a/malariagen_data/anoph/sample_metadata.py +++ b/malariagen_data/anoph/sample_metadata.py @@ -1155,7 +1155,7 @@ def lookup_sample( self, sample: base_params.sample, sample_set: Optional[base_params.sample_set] = None, - 
) -> pd.core.series.Series: + ) -> pd.Series: df_samples = self.sample_metadata(sample_sets=sample_set).set_index("sample_id") sample_rec = None if isinstance(sample, str): diff --git a/malariagen_data/util.py b/malariagen_data/util.py index 4e04565a7..6c4ea8b53 100644 --- a/malariagen_data/util.py +++ b/malariagen_data/util.py @@ -10,7 +10,18 @@ from functools import wraps from inspect import getcallargs from textwrap import dedent, fill -from typing import IO, Dict, Hashable, List, Mapping, Optional, Tuple, Union, Callable +from typing import ( + IO, + Dict, + Hashable, + List, + Mapping, + Optional, + Tuple, + Union, + Callable, + Any, +) from urllib.parse import unquote_plus from numpy.testing import assert_allclose, assert_array_equal @@ -91,6 +102,13 @@ def _read_gff3(buf, compression="gzip"): return df +def _extract_gff3_attribute( + attrs: Mapping[str, Any], + key: str, +) -> Any: + return attrs.get(key, np.nan) + + def _unpack_gff3_attributes(df: pd.DataFrame, attributes: Tuple[str, ...]): df = df.copy() @@ -110,7 +128,7 @@ def _unpack_gff3_attributes(df: pd.DataFrame, attributes: Tuple[str, ...]): raise ValueError( f"'{key}' not in attributes set. Options {all_attributes_sorted}" ) - df[key] = df["attributes"].apply(lambda v: v.get(key, np.nan)) + df[key] = df["attributes"].apply(_extract_gff3_attribute, args=(key,)) del df["attributes"] return df From 2c402a39d6ea162f969fa95671a585e4ad9c8653 Mon Sep 17 00:00:00 2001 From: Lee <4256466+leehart@users.noreply.github.com> Date: Tue, 13 Jan 2026 14:02:27 +0000 Subject: [PATCH 05/21] Remove _extract_gff3_attribute. Use defined series to avoid linting error (Series[Any] not callable) --- malariagen_data/util.py | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/malariagen_data/util.py b/malariagen_data/util.py index 6c4ea8b53..596ff9345 100644 --- a/malariagen_data/util.py +++ b/malariagen_data/util.py @@ -102,16 +102,15 @@ def _read_gff3(buf, compression="gzip"): return df -def _extract_gff3_attribute( - attrs: Mapping[str, Any], - key: str, -) -> Any: - return attrs.get(key, np.nan) - - -def _unpack_gff3_attributes(df: pd.DataFrame, attributes: Tuple[str, ...]): +def _unpack_gff3_attributes( + df: pd.DataFrame, + attributes: Tuple[str, ...], +) -> pd.DataFrame: df = df.copy() + # define attributes series + attrs: pd.Series[Mapping[str, Any]] = df["attributes"] + # discover all attribute keys all_attributes = set() for a in df["attributes"]: @@ -128,7 +127,9 @@ def _unpack_gff3_attributes(df: pd.DataFrame, attributes: Tuple[str, ...]): raise ValueError( f"'{key}' not in attributes set. 
Options {all_attributes_sorted}" ) - df[key] = df["attributes"].apply(_extract_gff3_attribute, args=(key,)) + + df[key] = attrs.apply(lambda a: a.get(key, np.nan)) + del df["attributes"] return df From ed1d157bc8b412b1c82feb4ea7f67cf26e9a9ef6 Mon Sep 17 00:00:00 2001 From: Lee <4256466+leehart@users.noreply.github.com> Date: Tue, 13 Jan 2026 14:14:22 +0000 Subject: [PATCH 06/21] Fix lint errors in _unpack_gff3_attributes --- malariagen_data/util.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/malariagen_data/util.py b/malariagen_data/util.py index 596ff9345..ee66bcdad 100644 --- a/malariagen_data/util.py +++ b/malariagen_data/util.py @@ -20,7 +20,6 @@ Tuple, Union, Callable, - Any, ) from urllib.parse import unquote_plus from numpy.testing import assert_allclose, assert_array_equal @@ -108,8 +107,8 @@ def _unpack_gff3_attributes( ) -> pd.DataFrame: df = df.copy() - # define attributes series - attrs: pd.Series[Mapping[str, Any]] = df["attributes"] + # ref attributes column + attrs = df["attributes"] # discover all attribute keys all_attributes = set() @@ -128,8 +127,9 @@ def _unpack_gff3_attributes( f"'{key}' not in attributes set. Options {all_attributes_sorted}" ) - df[key] = attrs.apply(lambda a: a.get(key, np.nan)) + df[key] = attrs.apply(lambda a, k=key: a.get(k, np.nan)) + # remove attributes column del df["attributes"] return df From a477b67931d0a4298f18192761552e99c4ea99fa Mon Sep 17 00:00:00 2001 From: Lee <4256466+leehart@users.noreply.github.com> Date: Tue, 13 Jan 2026 14:32:29 +0000 Subject: [PATCH 07/21] Avoid using .apply in _unpack_gff3_attributes for type checking --- malariagen_data/util.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/malariagen_data/util.py b/malariagen_data/util.py index ee66bcdad..a80f7259c 100644 --- a/malariagen_data/util.py +++ b/malariagen_data/util.py @@ -107,9 +107,6 @@ def _unpack_gff3_attributes( ) -> pd.DataFrame: df = df.copy() - # ref attributes column - attrs = df["attributes"] - # discover all attribute keys all_attributes = set() for a in df["attributes"]: @@ -127,7 +124,8 @@ def _unpack_gff3_attributes( f"'{key}' not in attributes set. Options {all_attributes_sorted}" ) - df[key] = attrs.apply(lambda a, k=key: a.get(k, np.nan)) + # Note: avoid using .apply() here, for type checking. 
+ df[key] = [a.get(key, np.nan) for a in df["attributes"]] # remove attributes column del df["attributes"] From b9e1c0d0c10ae841eb9b791ef878931dd88fd99f Mon Sep 17 00:00:00 2001 From: Lee <4256466+leehart@users.noreply.github.com> Date: Tue, 13 Jan 2026 14:52:40 +0000 Subject: [PATCH 08/21] Ensure attributes in _unpack_gff3_attributes are dictionaries --- malariagen_data/util.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/malariagen_data/util.py b/malariagen_data/util.py index a80f7259c..ccbe08b8f 100644 --- a/malariagen_data/util.py +++ b/malariagen_data/util.py @@ -107,6 +107,11 @@ def _unpack_gff3_attributes( ) -> pd.DataFrame: df = df.copy() + # ensure each value in "attributes" is a dictionary + df["attributes"] = df["attributes"].apply( + lambda x: dict(x) if not isinstance(x, dict) else x + ) + # discover all attribute keys all_attributes = set() for a in df["attributes"]: From 20683b7f6d846e15007bc0d4244478f9ef754eb7 Mon Sep 17 00:00:00 2001 From: Lee <4256466+leehart@users.noreply.github.com> Date: Tue, 13 Jan 2026 15:08:33 +0000 Subject: [PATCH 09/21] WIP: address linting failures in _unpack_gff3_attributes --- malariagen_data/util.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/malariagen_data/util.py b/malariagen_data/util.py index ccbe08b8f..aa3a312fc 100644 --- a/malariagen_data/util.py +++ b/malariagen_data/util.py @@ -108,9 +108,8 @@ def _unpack_gff3_attributes( df = df.copy() # ensure each value in "attributes" is a dictionary - df["attributes"] = df["attributes"].apply( - lambda x: dict(x) if not isinstance(x, dict) else x - ) + # Note: avoid using .apply() here, for type checking. + df["attributes"] = [x if isinstance(x, dict) else dict(x) for x in df["attributes"]] # discover all attribute keys all_attributes = set() @@ -133,7 +132,8 @@ def _unpack_gff3_attributes( df[key] = [a.get(key, np.nan) for a in df["attributes"]] # remove attributes column - del df["attributes"] + # Note: avoid using del here, for type checking. + df = df.drop(columns=["attributes"]) return df From 51f6c4a6ccfb487cd05a5d9368a719dde3182a98 Mon Sep 17 00:00:00 2001 From: Lee <4256466+leehart@users.noreply.github.com> Date: Tue, 13 Jan 2026 15:20:56 +0000 Subject: [PATCH 10/21] WIP: dev code comments in _unpack_gff3_attributes --- malariagen_data/util.py | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/malariagen_data/util.py b/malariagen_data/util.py index aa3a312fc..d70f2a998 100644 --- a/malariagen_data/util.py +++ b/malariagen_data/util.py @@ -107,31 +107,30 @@ def _unpack_gff3_attributes( ) -> pd.DataFrame: df = df.copy() - # ensure each value in "attributes" is a dictionary - # Note: avoid using .apply() here, for type checking. - df["attributes"] = [x if isinstance(x, dict) else dict(x) for x in df["attributes"]] - - # discover all attribute keys + # Collect all the unique attributes in the DataFrame as a sorted tuple. all_attributes = set() for a in df["attributes"]: all_attributes.update(a.keys()) all_attributes_sorted = tuple(sorted(all_attributes)) - # handle request for all attributes + # If an asterisk was specified, use all the available attributes. if attributes == ("*",): attributes = all_attributes_sorted - # unpack attributes into columns + # For each of the specified attributes, + # if the attribute is not in the tuple of available attributes, + # then raise a ValueError. for key in attributes: if key not in all_attributes_sorted: raise ValueError( f"'{key}' not in attributes set. 
Options {all_attributes_sorted}" ) + # Copy the specified attribute into a new column in the DataFrame. # Note: avoid using .apply() here, for type checking. df[key] = [a.get(key, np.nan) for a in df["attributes"]] - # remove attributes column + # Drop the original "attributes" column from the DataFrame. # Note: avoid using del here, for type checking. df = df.drop(columns=["attributes"]) From 48455df47e04b867406f183d626d58ffaf5ee0ad Mon Sep 17 00:00:00 2001 From: Lee <4256466+leehart@users.noreply.github.com> Date: Tue, 13 Jan 2026 16:00:02 +0000 Subject: [PATCH 11/21] WIP: ignore all test_wgs_data_catalog cases (debug segfault) --- .github/workflows/coverage.yml | 2 +- .github/workflows/tests.yml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/coverage.yml b/.github/workflows/coverage.yml index 5e69e5b67..8567b423e 100644 --- a/.github/workflows/coverage.yml +++ b/.github/workflows/coverage.yml @@ -28,7 +28,7 @@ jobs: run: | pytest -v tests \ --ignore tests/integration \ - -k "not test_wgs_data_catalog[adir1_sim]" \ + -k "not test_wgs_data_catalog" \ --cov malariagen_data/anoph \ --cov-report=xml diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index 8c78b7bd6..9f8270927 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -36,5 +36,5 @@ jobs: run: | pytest -v tests \ --ignore tests/integration \ - -k "not test_wgs_data_catalog[adir1_sim]" \ + -k "not test_wgs_data_catalog" \ --typeguard-packages=malariagen_data,malariagen_data.anoph From f8b4ef48f894d4249f31323a93a00561c2a6100c Mon Sep 17 00:00:00 2001 From: Lee <4256466+leehart@users.noreply.github.com> Date: Tue, 13 Jan 2026 18:02:59 +0000 Subject: [PATCH 12/21] WIP: debug test_wgs_data_catalog (segfault) --- tests/anoph/test_sample_metadata.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/tests/anoph/test_sample_metadata.py b/tests/anoph/test_sample_metadata.py index 649118f09..3299f68f3 100644 --- a/tests/anoph/test_sample_metadata.py +++ b/tests/anoph/test_sample_metadata.py @@ -1099,6 +1099,18 @@ def test_wgs_data_catalog(fixture, api): assert set(df["sample_id"]) == set(df_samples["sample_id"]) +@parametrize_with_cases("fixture,api", cases=[case_ag3_sim, case_af1_sim]) +def test_debug_test_wgs_data_catalog(fixture, api): + # Set up test. + df_sample_sets = api.sample_sets().set_index("sample_set") + all_sample_sets = df_sample_sets.index.to_list() + + for sample_set in all_sample_sets: + # Call function to be tested. + df = api.wgs_data_catalog(sample_set=sample_set) + assert isinstance(df, pd.DataFrame) + + @parametrize_with_cases("fixture,api", cases=".") def test_wgs_run_accessions(fixture, api): # Set up test. 
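Patch 12 above loops over every sample set inside a single debug test to localize the crash. A complementary trick, not used in this series, is to make the crash itself report its location: the stdlib faulthandler module prints the Python-level stack on a hard crash such as SIGSEGV, so the CI log shows which call was executing when the interpreter died. A minimal sketch, assuming it is added near the top of tests/anoph/conftest.py:

    # Hypothetical addition, not part of this patch series: on a hard
    # crash (e.g. SIGSEGV), dump the Python stack of every thread to
    # stderr before the process dies.
    import sys
    import faulthandler

    faulthandler.enable(file=sys.stderr, all_threads=True)

Recent pytest releases also ship a built-in faulthandler plugin that enables this automatically, so the explicit call may be redundant depending on the pytest version running in CI.
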
From 99063d3b6d696eebdf249d12786f6303eef04be8 Mon Sep 17 00:00:00 2001 From: Lee <4256466+leehart@users.noreply.github.com> Date: Thu, 15 Jan 2026 09:40:42 +0000 Subject: [PATCH 13/21] WIP: debug test_wgs_data_catalog cases=[case_adir1_sim, case_amin1_sim] --- tests/anoph/test_sample_metadata.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/anoph/test_sample_metadata.py b/tests/anoph/test_sample_metadata.py index 3299f68f3..82d360cbe 100644 --- a/tests/anoph/test_sample_metadata.py +++ b/tests/anoph/test_sample_metadata.py @@ -1099,7 +1099,7 @@ def test_wgs_data_catalog(fixture, api): assert set(df["sample_id"]) == set(df_samples["sample_id"]) -@parametrize_with_cases("fixture,api", cases=[case_ag3_sim, case_af1_sim]) +@parametrize_with_cases("fixture,api", cases=[case_adir1_sim, case_amin1_sim]) def test_debug_test_wgs_data_catalog(fixture, api): # Set up test. df_sample_sets = api.sample_sets().set_index("sample_set") From b1c0ef4757504b2e78c8b98fe92f75530d9966f7 Mon Sep 17 00:00:00 2001 From: Lee <4256466+leehart@users.noreply.github.com> Date: Thu, 15 Jan 2026 10:41:57 +0000 Subject: [PATCH 14/21] WIP: debug test_wgs_data_catalog unrestricted, surveillance cases --- tests/anoph/test_sample_metadata.py | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/tests/anoph/test_sample_metadata.py b/tests/anoph/test_sample_metadata.py index 82d360cbe..c31225d7c 100644 --- a/tests/anoph/test_sample_metadata.py +++ b/tests/anoph/test_sample_metadata.py @@ -1099,7 +1099,17 @@ def test_wgs_data_catalog(fixture, api): assert set(df["sample_id"]) == set(df_samples["sample_id"]) -@parametrize_with_cases("fixture,api", cases=[case_adir1_sim, case_amin1_sim]) +@parametrize_with_cases( + "fixture,api", + cases=[ + case_ag3_sim_unrestricted_use_only, + case_af1_sim_unrestricted_use_only, + case_ag3_sim_surveillance_use_only, + case_af1_sim_surveillance_use_only, + case_ag3_sim_unrestricted_surveillance_use_only, + case_af1_sim_unrestricted_surveillance_use_only, + ], +) def test_debug_test_wgs_data_catalog(fixture, api): # Set up test. df_sample_sets = api.sample_sets().set_index("sample_set") From 05b2d5c94a83e83e5f2144a7529fde3c937a9af8 Mon Sep 17 00:00:00 2001 From: Lee <4256466+leehart@users.noreply.github.com> Date: Thu, 15 Jan 2026 11:37:51 +0000 Subject: [PATCH 15/21] WIP: debug test_wgs_data_catalog all cases --- tests/anoph/test_sample_metadata.py | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/tests/anoph/test_sample_metadata.py b/tests/anoph/test_sample_metadata.py index c31225d7c..d28944167 100644 --- a/tests/anoph/test_sample_metadata.py +++ b/tests/anoph/test_sample_metadata.py @@ -1101,14 +1101,7 @@ def test_wgs_data_catalog(fixture, api): @parametrize_with_cases( "fixture,api", - cases=[ - case_ag3_sim_unrestricted_use_only, - case_af1_sim_unrestricted_use_only, - case_ag3_sim_surveillance_use_only, - case_af1_sim_surveillance_use_only, - case_ag3_sim_unrestricted_surveillance_use_only, - case_af1_sim_unrestricted_surveillance_use_only, - ], + cases=".", ) def test_debug_test_wgs_data_catalog(fixture, api): # Set up test. 
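Patches 13-15 above bisect the crash by editing the cases= argument and re-running CI. Note that the -k filters used in patches 1, 2 and 11 match keyword expressions rather than exact test IDs, and brackets in a parametrized ID such as test_wgs_data_catalog[adir1_sim] can trip the expression parser, which may be why the filter was later widened to the whole test name. pytest's --deselect option takes an exact node ID instead. A hedged sketch via pytest.main(); the exact node ID below is an assumption and should be checked against the output of pytest --collect-only -q:

    # Hypothetical invocation, not part of this series: deselect exactly
    # one parametrized case by node ID instead of a -k expression.
    import pytest

    ret = pytest.main(
        [
            "-v",
            "tests",
            "--ignore", "tests/integration",
            "--deselect",
            # Node ID format is an assumption; verify with --collect-only.
            "tests/anoph/test_sample_metadata.py::test_wgs_data_catalog[adir1_sim]",
        ]
    )
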
From e1e3739bc74ef129f8e7b05babc2a47f1ace2ee5 Mon Sep 17 00:00:00 2001 From: Lee <4256466+leehart@users.noreply.github.com> Date: Thu, 15 Jan 2026 13:07:56 +0000 Subject: [PATCH 16/21] WIP: debug test_wgs_data_catalog all cases, sample_id check --- tests/anoph/test_sample_metadata.py | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/tests/anoph/test_sample_metadata.py b/tests/anoph/test_sample_metadata.py index d28944167..391da8198 100644 --- a/tests/anoph/test_sample_metadata.py +++ b/tests/anoph/test_sample_metadata.py @@ -1106,12 +1106,29 @@ def test_wgs_data_catalog(fixture, api): def test_debug_test_wgs_data_catalog(fixture, api): # Set up test. df_sample_sets = api.sample_sets().set_index("sample_set") + sample_count = df_sample_sets["sample_count"] all_sample_sets = df_sample_sets.index.to_list() + # sample_set = random.choice(all_sample_sets) for sample_set in all_sample_sets: # Call function to be tested. df = api.wgs_data_catalog(sample_set=sample_set) + + # Check output. assert isinstance(df, pd.DataFrame) + expected_cols = [ + "sample_id", + "alignments_bam", + "snp_genotypes_vcf", + "snp_genotypes_zarr", + ] + assert df.columns.to_list() == expected_cols + assert len(df) == sample_count.loc[sample_set] + + # Compare with sample metadata. + df_samples = api.sample_metadata(sample_sets=sample_set) + # Don't enforce same order, but require same set. + assert set(df["sample_id"]) == set(df_samples["sample_id"]) @parametrize_with_cases("fixture,api", cases=".") From 8013a09ca6e3329805591b3e895a2b8c057ac1c5 Mon Sep 17 00:00:00 2001 From: Lee <4256466+leehart@users.noreply.github.com> Date: Thu, 15 Jan 2026 14:12:22 +0000 Subject: [PATCH 17/21] WIP: debug test_wgs_data_catalog, revert --- .github/workflows/coverage.yml | 1 - .github/workflows/tests.yml | 1 - tests/anoph/test_sample_metadata.py | 32 ----------------------------- 3 files changed, 34 deletions(-) diff --git a/.github/workflows/coverage.yml b/.github/workflows/coverage.yml index 8567b423e..558d8a911 100644 --- a/.github/workflows/coverage.yml +++ b/.github/workflows/coverage.yml @@ -28,7 +28,6 @@ jobs: run: | pytest -v tests \ --ignore tests/integration \ - -k "not test_wgs_data_catalog" \ --cov malariagen_data/anoph \ --cov-report=xml diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index 9f8270927..7e3d0da27 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -36,5 +36,4 @@ jobs: run: | pytest -v tests \ --ignore tests/integration \ - -k "not test_wgs_data_catalog" \ --typeguard-packages=malariagen_data,malariagen_data.anoph diff --git a/tests/anoph/test_sample_metadata.py b/tests/anoph/test_sample_metadata.py index 391da8198..649118f09 100644 --- a/tests/anoph/test_sample_metadata.py +++ b/tests/anoph/test_sample_metadata.py @@ -1099,38 +1099,6 @@ def test_wgs_data_catalog(fixture, api): assert set(df["sample_id"]) == set(df_samples["sample_id"]) -@parametrize_with_cases( - "fixture,api", - cases=".", -) -def test_debug_test_wgs_data_catalog(fixture, api): - # Set up test. - df_sample_sets = api.sample_sets().set_index("sample_set") - sample_count = df_sample_sets["sample_count"] - all_sample_sets = df_sample_sets.index.to_list() - # sample_set = random.choice(all_sample_sets) - - for sample_set in all_sample_sets: - # Call function to be tested. - df = api.wgs_data_catalog(sample_set=sample_set) - - # Check output. 
- assert isinstance(df, pd.DataFrame) - expected_cols = [ - "sample_id", - "alignments_bam", - "snp_genotypes_vcf", - "snp_genotypes_zarr", - ] - assert df.columns.to_list() == expected_cols - assert len(df) == sample_count.loc[sample_set] - - # Compare with sample metadata. - df_samples = api.sample_metadata(sample_sets=sample_set) - # Don't enforce same order, but require same set. - assert set(df["sample_id"]) == set(df_samples["sample_id"]) - - @parametrize_with_cases("fixture,api", cases=".") def test_wgs_run_accessions(fixture, api): # Set up test. From cbf826a54964f32113d7dce1b0b720dfa78552bf Mon Sep 17 00:00:00 2001 From: Lee <4256466+leehart@users.noreply.github.com> Date: Thu, 15 Jan 2026 16:48:52 +0000 Subject: [PATCH 18/21] WIP: debug test_wgs_data_catalog, exclude test_wgs_data_catalog, except case_adir1_sim --- .github/workflows/coverage.yml | 1 + .github/workflows/tests.yml | 1 + tests/anoph/test_sample_metadata.py | 32 +++++++++++++++++++++++++++++ 3 files changed, 34 insertions(+) diff --git a/.github/workflows/coverage.yml b/.github/workflows/coverage.yml index 558d8a911..8567b423e 100644 --- a/.github/workflows/coverage.yml +++ b/.github/workflows/coverage.yml @@ -28,6 +28,7 @@ jobs: run: | pytest -v tests \ --ignore tests/integration \ + -k "not test_wgs_data_catalog" \ --cov malariagen_data/anoph \ --cov-report=xml diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index 7e3d0da27..9f8270927 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -36,4 +36,5 @@ jobs: run: | pytest -v tests \ --ignore tests/integration \ + -k "not test_wgs_data_catalog" \ --typeguard-packages=malariagen_data,malariagen_data.anoph diff --git a/tests/anoph/test_sample_metadata.py b/tests/anoph/test_sample_metadata.py index 649118f09..fbd7baf4b 100644 --- a/tests/anoph/test_sample_metadata.py +++ b/tests/anoph/test_sample_metadata.py @@ -1099,6 +1099,38 @@ def test_wgs_data_catalog(fixture, api): assert set(df["sample_id"]) == set(df_samples["sample_id"]) +@parametrize_with_cases( + "fixture,api", + cases=[case_adir1_sim], +) +def test_debug_test_wgs_data_catalog(fixture, api): + # Set up test. + df_sample_sets = api.sample_sets().set_index("sample_set") + sample_count = df_sample_sets["sample_count"] + all_sample_sets = df_sample_sets.index.to_list() + # sample_set = random.choice(all_sample_sets) + + for sample_set in all_sample_sets: + # Call function to be tested. + df = api.wgs_data_catalog(sample_set=sample_set) + + # Check output. + assert isinstance(df, pd.DataFrame) + expected_cols = [ + "sample_id", + "alignments_bam", + "snp_genotypes_vcf", + "snp_genotypes_zarr", + ] + assert df.columns.to_list() == expected_cols + assert len(df) == sample_count.loc[sample_set] + + # Compare with sample metadata. + df_samples = api.sample_metadata(sample_sets=sample_set) + # Don't enforce same order, but require same set. + assert set(df["sample_id"]) == set(df_samples["sample_id"]) + + @parametrize_with_cases("fixture,api", cases=".") def test_wgs_run_accessions(fixture, api): # Set up test. 
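Patches 12-18 probe one hypothesis per commit, with a CI round-trip each time. A local alternative, sketched here and not part of this series, is to run each collected test in a fresh interpreter: a segfault then surfaces as a negative return code (e.g. -11 for SIGSEGV) for that one node without killing the rest of the sweep. The target path is taken from the diffs above; everything else is an assumption.

    # debug_segfault.py -- hypothetical helper, not in this patch series.
    import subprocess
    import sys

    # Collect the node IDs of the suspect test; `-q` prints one per line.
    collected = subprocess.run(
        [sys.executable, "-m", "pytest", "--collect-only", "-q",
         "tests/anoph/test_sample_metadata.py::test_wgs_data_catalog"],
        capture_output=True, text=True, check=True,
    )
    node_ids = [line for line in collected.stdout.splitlines() if "::" in line]

    # Run each case in its own process so one crash cannot mask the others.
    for node_id in node_ids:
        result = subprocess.run([sys.executable, "-m", "pytest", "-x", node_id])
        if result.returncode < 0:
            print(f"{node_id} died with signal {-result.returncode}")

The pytest-forked plugin packages the same idea behind a --forked flag, which keeps the usual pytest report while isolating each test in a forked child process.
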
From c6bc0757b41fd08b9af29a7ec337d82c1bfe450a Mon Sep 17 00:00:00 2001 From: Lee <4256466+leehart@users.noreply.github.com> Date: Fri, 16 Jan 2026 10:26:46 +0000 Subject: [PATCH 19/21] WIP: debug test_wgs_data_catalog, all cases --- tests/anoph/test_sample_metadata.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/anoph/test_sample_metadata.py b/tests/anoph/test_sample_metadata.py index fbd7baf4b..8bfb8480a 100644 --- a/tests/anoph/test_sample_metadata.py +++ b/tests/anoph/test_sample_metadata.py @@ -1101,7 +1101,7 @@ def test_wgs_data_catalog(fixture, api): @parametrize_with_cases( "fixture,api", - cases=[case_adir1_sim], + cases=["."], ) def test_debug_test_wgs_data_catalog(fixture, api): # Set up test. From 0134905626530f781355793302795803a94fa82b Mon Sep 17 00:00:00 2001 From: Lee <4256466+leehart@users.noreply.github.com> Date: Fri, 16 Jan 2026 11:59:40 +0000 Subject: [PATCH 20/21] WIP: debug test_wgs_data_catalog, revert --- .github/workflows/coverage.yml | 1 - .github/workflows/tests.yml | 1 - tests/anoph/test_sample_metadata.py | 32 ----------------------------- 3 files changed, 34 deletions(-) diff --git a/.github/workflows/coverage.yml b/.github/workflows/coverage.yml index 8567b423e..558d8a911 100644 --- a/.github/workflows/coverage.yml +++ b/.github/workflows/coverage.yml @@ -28,7 +28,6 @@ jobs: run: | pytest -v tests \ --ignore tests/integration \ - -k "not test_wgs_data_catalog" \ --cov malariagen_data/anoph \ --cov-report=xml diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index 9f8270927..7e3d0da27 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -36,5 +36,4 @@ jobs: run: | pytest -v tests \ --ignore tests/integration \ - -k "not test_wgs_data_catalog" \ --typeguard-packages=malariagen_data,malariagen_data.anoph diff --git a/tests/anoph/test_sample_metadata.py b/tests/anoph/test_sample_metadata.py index 8bfb8480a..649118f09 100644 --- a/tests/anoph/test_sample_metadata.py +++ b/tests/anoph/test_sample_metadata.py @@ -1099,38 +1099,6 @@ def test_wgs_data_catalog(fixture, api): assert set(df["sample_id"]) == set(df_samples["sample_id"]) -@parametrize_with_cases( - "fixture,api", - cases=["."], -) -def test_debug_test_wgs_data_catalog(fixture, api): - # Set up test. - df_sample_sets = api.sample_sets().set_index("sample_set") - sample_count = df_sample_sets["sample_count"] - all_sample_sets = df_sample_sets.index.to_list() - # sample_set = random.choice(all_sample_sets) - - for sample_set in all_sample_sets: - # Call function to be tested. - df = api.wgs_data_catalog(sample_set=sample_set) - - # Check output. - assert isinstance(df, pd.DataFrame) - expected_cols = [ - "sample_id", - "alignments_bam", - "snp_genotypes_vcf", - "snp_genotypes_zarr", - ] - assert df.columns.to_list() == expected_cols - assert len(df) == sample_count.loc[sample_set] - - # Compare with sample metadata. - df_samples = api.sample_metadata(sample_sets=sample_set) - # Don't enforce same order, but require same set. - assert set(df["sample_id"]) == set(df_samples["sample_id"]) - - @parametrize_with_cases("fixture,api", cases=".") def test_wgs_run_accessions(fixture, api): # Set up test. 
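Patch 21 below tests the hypothesis that the crash lives in pandas' C CSV parser by adding engine="python" to every pd.read_csv call site by hand. A hypothetical wrapper, shown here only as a sketch and not part of the patch, would have confined the experiment to one definition:

    # Hypothetical helper, not part of patch 21: route CSV reads through
    # the pure-Python parser while the C-parser hypothesis is under test.
    import pandas as pd

    def read_csv_python(*args, **kwargs):
        # Use the pure-Python engine unless the caller overrides it.
        kwargs.setdefault("engine", "python")
        return pd.read_csv(*args, **kwargs)

One call site in the patch is worth watching either way: the phenotypes.py hunk combines engine="python" with low_memory=False, and pandas treats low_memory as a C-parser-only option, so that combination is likely to raise ValueError rather than exercise the hypothesis.
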
From b213b01fd3227ff4c199271df716de80d23769ad Mon Sep 17 00:00:00 2001 From: Lee <4256466+leehart@users.noreply.github.com> Date: Thu, 22 Jan 2026 15:16:20 +0000 Subject: [PATCH 21/21] WIP: try adding engine='python' to all pd.read_csv calls (debug segfault) --- malariagen_data/ag3.py | 1 + malariagen_data/anoph/base.py | 2 +- malariagen_data/anoph/karyotype.py | 2 +- malariagen_data/anoph/phenotypes.py | 2 +- malariagen_data/anoph/sample_metadata.py | 22 ++++++--- malariagen_data/plasmodium.py | 4 +- malariagen_data/util.py | 1 + tests/anoph/conftest.py | 60 ++++++++++++------------ 8 files changed, 53 insertions(+), 41 deletions(-) diff --git a/malariagen_data/ag3.py b/malariagen_data/ag3.py index fd528f586..ee40645e0 100644 --- a/malariagen_data/ag3.py +++ b/malariagen_data/ag3.py @@ -354,6 +354,7 @@ def cross_metadata(self): na_values=["", "0"], names=fam_names, dtype={"sex": str}, + engine="python", ) debug("convert 'sex' column for consistency with sample metadata") diff --git a/malariagen_data/anoph/base.py b/malariagen_data/anoph/base.py index 02343a8cb..7a7fea06b 100644 --- a/malariagen_data/anoph/base.py +++ b/malariagen_data/anoph/base.py @@ -588,7 +588,7 @@ def _read_sample_sets_manifest(self, *, single_release: str): # Read the manifest into a pandas dataframe. with self.open_file(manifest_path) as f: - df = pd.read_csv(f, sep="\t", na_values="") + df = pd.read_csv(f, sep="\t", na_values="", engine="python") # Add a "release" column for convenience. df["release"] = single_release diff --git a/malariagen_data/anoph/karyotype.py b/malariagen_data/anoph/karyotype.py index d0eda0d54..cb5d67b41 100644 --- a/malariagen_data/anoph/karyotype.py +++ b/malariagen_data/anoph/karyotype.py @@ -61,7 +61,7 @@ def load_inversion_tags(self, inversion: inversion_param) -> pd.DataFrame: ) else: with importlib.resources.path(resources, self._inversion_tag_path) as path: - df_tag_snps = pd.read_csv(path, sep=",") + df_tag_snps = pd.read_csv(path, sep=",", engine="python") return df_tag_snps.query(f"inversion == '{inversion}'").reset_index() @_check_types diff --git a/malariagen_data/anoph/phenotypes.py b/malariagen_data/anoph/phenotypes.py index adf8d309a..6baa0c142 100644 --- a/malariagen_data/anoph/phenotypes.py +++ b/malariagen_data/anoph/phenotypes.py @@ -57,7 +57,7 @@ def _load_phenotype_data( with self._fs.open(phenotype_path, "r") as f: try: - df_pheno = pd.read_csv(f, low_memory=False) + df_pheno = pd.read_csv(f, low_memory=False, engine="python") except pd.errors.EmptyDataError: warnings.warn(f"Empty phenotype file for {sample_set}") continue diff --git a/malariagen_data/anoph/sample_metadata.py b/malariagen_data/anoph/sample_metadata.py index 1314873a9..08b0833ed 100644 --- a/malariagen_data/anoph/sample_metadata.py +++ b/malariagen_data/anoph/sample_metadata.py @@ -167,7 +167,9 @@ def _parse_general_metadata( } # `dict[str, str]` is incompatible with the `dtype` of `pd.read_csv` dtype: DefaultDict[str, str] = defaultdict(lambda: "object", dtype_dict) - df = pd.read_csv(io.BytesIO(data), dtype=dtype, na_values="") + df = pd.read_csv( + io.BytesIO(data), dtype=dtype, na_values="", engine="python" + ) # Ensure all column names are lower case. df.columns = [c.lower() for c in df.columns] # type: ignore @@ -258,7 +260,9 @@ def _parse_sequence_qc_metadata( dtype: DefaultDict[str, str] = defaultdict(lambda: "object", dtype_dict) # Read the CSV using the dtype dict. 
- df = pd.read_csv(io.BytesIO(data), dtype=dtype, na_values="") + df = pd.read_csv( + io.BytesIO(data), dtype=dtype, na_values="", engine="python" + ) return df @@ -383,7 +387,9 @@ def _parse_surveillance_flags( if isinstance(data, bytes): # Read the CSV data. - df = pd.read_csv(io.BytesIO(data), dtype=dtype, na_values="") + df = pd.read_csv( + io.BytesIO(data), dtype=dtype, na_values="", engine="python" + ) # If there are any nulls in these data, show a warning. if df.isnull().values.any(): @@ -518,7 +524,9 @@ def _parse_cohorts_metadata( # `dict[str, str]` is incompatible with the `dtype` of `pd.read_csv` dtype: DefaultDict[str, str] = defaultdict(lambda: "object", dtype_dict) - df = pd.read_csv(io.BytesIO(data), dtype=dtype, na_values="") + df = pd.read_csv( + io.BytesIO(data), dtype=dtype, na_values="", engine="python" + ) # Ensure all column names are lower case. df.columns = [c.lower() for c in df.columns] # type: ignore @@ -592,7 +600,7 @@ def _parse_aim_metadata( assert self._aim_metadata_dtype is not None if isinstance(data, bytes): # Parse CSV data but don't apply the dtype yet. - df = pd.read_csv(io.BytesIO(data), na_values="") + df = pd.read_csv(io.BytesIO(data), na_values="", engine="python") # Convert all column names to lowercase. df.columns = [c.lower() for c in df.columns] # type: ignore @@ -1042,7 +1050,7 @@ def wgs_run_accessions(self, sample_set: base_params.sample_set): # Load data catalog. path = f"{self._base_path}/{release_path}/metadata/general/{sample_set}/wgs_accession_data.csv" with self._fs.open(path) as f: - df = pd.read_csv(f, na_values="") + df = pd.read_csv(f, na_values="", engine="python") # Normalise columns. df = df[ @@ -1512,7 +1520,7 @@ def cohorts( # Read the manifest into a pandas dataframe. with self.open_file(path) as f: - df_cohorts = pd.read_csv(f, sep=",", na_values="") + df_cohorts = pd.read_csv(f, sep=",", na_values="", engine="python") # Ensure all column names are lower case. df_cohorts.columns = [c.lower() for c in df_cohorts.columns] # type: ignore diff --git a/malariagen_data/plasmodium.py b/malariagen_data/plasmodium.py index 220db4b82..5f6249fe7 100644 --- a/malariagen_data/plasmodium.py +++ b/malariagen_data/plasmodium.py @@ -61,7 +61,9 @@ def sample_metadata(self): if self._cache_sample_metadata is None: path = f"{self._path}/{self.CONF['metadata_path']}" with self._fs.open(path) as f: - self._cache_sample_metadata = pd.read_csv(f, sep="\t", na_values="") + self._cache_sample_metadata = pd.read_csv( + f, sep="\t", na_values="", engine="python" + ) return self._cache_sample_metadata def _open_variant_calls_zarr(self): diff --git a/malariagen_data/util.py b/malariagen_data/util.py index d70f2a998..6b4fcd63d 100644 --- a/malariagen_data/util.py +++ b/malariagen_data/util.py @@ -93,6 +93,7 @@ def _read_gff3(buf, compression="gzip"): names=gff3_cols, na_values=["", "."], compression=compression, + engine="python", ) # parse attributes diff --git a/tests/anoph/conftest.py b/tests/anoph/conftest.py index 0c97c6c7a..42fc3dc99 100644 --- a/tests/anoph/conftest.py +++ b/tests/anoph/conftest.py @@ -374,7 +374,7 @@ def simulate_snp_genotypes( root = zarr.open(zarr_path, mode="w") # Create samples array. 
- df_samples = pd.read_csv(metadata_path) + df_samples = pd.read_csv(metadata_path, engine="python") n_samples = len(df_samples) samples = df_samples["sample_id"].values.astype("S") root.create_dataset(name="samples", data=samples) @@ -618,7 +618,7 @@ def simulate_cnv_hmm(zarr_path, metadata_path, contigs, contig_sizes): root = zarr.open(zarr_path, mode="w") # Create samples array. - df_samples = pd.read_csv(metadata_path) + df_samples = pd.read_csv(metadata_path, engine="python") samples = df_samples["sample_id"].values root.create_dataset(name="samples", data=samples, dtype=str) @@ -717,7 +717,7 @@ def simulate_cnv_coverage_calls(zarr_path, metadata_path, contigs, contig_sizes) root = zarr.open(zarr_path, mode="w") # Create samples array. - df_samples = pd.read_csv(metadata_path) + df_samples = pd.read_csv(metadata_path, engine="python") n_samples = len(df_samples) samples = df_samples["sample_id"].values root.create_dataset(name="samples", data=samples, dtype=str) @@ -837,7 +837,7 @@ def simulate_cnv_discordant_read_calls(zarr_path, metadata_path, contigs, contig root = zarr.open(zarr_path, mode="w") # Create samples array. - df_samples = pd.read_csv(metadata_path) + df_samples = pd.read_csv(metadata_path, engine="python") samples = df_samples["sample_id"].values root.create_dataset(name="samples", data=samples, dtype=str) @@ -1250,7 +1250,7 @@ def write_metadata( / sample_set / "samples.meta.csv" ) - df_general = pd.read_csv(src_path) + df_general = pd.read_csv(src_path, engine="python") # Randomly downsample. df_general_ds = df_general.sample(n_samples_sim, replace=False) samples_ds = df_general_ds["sample_id"].tolist() @@ -1275,7 +1275,7 @@ def write_metadata( / sample_set / "surveillance.flags.csv" ) - df_surveillance_flags = pd.read_csv(surv_flags_src_path) + df_surveillance_flags = pd.read_csv(surv_flags_src_path, engine="python") df_surveillance_flags_ds = ( df_surveillance_flags.set_index("sample_id").loc[samples_ds].reset_index() ) @@ -1301,7 +1301,7 @@ def write_metadata( / sample_set / "sequence_qc_stats.csv" ) - df_sequence_qc_stats = pd.read_csv(src_path) + df_sequence_qc_stats = pd.read_csv(src_path, engine="python") df_sequence_qc_stats_ds = ( df_sequence_qc_stats.set_index("sample_id") .loc[samples_ds] @@ -1329,7 +1329,7 @@ def write_metadata( / sample_set / "samples.species_aim.csv" ) - df_aim = pd.read_csv(src_path) + df_aim = pd.read_csv(src_path, engine="python") df_aim_ds = df_aim.set_index("sample_id").loc[samples_ds].reset_index() dst_path = ( self.bucket_path @@ -1353,7 +1353,7 @@ def write_metadata( / sample_set / "samples.cohorts.csv" ) - df_coh = pd.read_csv(src_path) + df_coh = pd.read_csv(src_path, engine="python") df_coh_ds = df_coh.set_index("sample_id").loc[samples_ds].reset_index() dst_path = ( self.bucket_path @@ -1395,7 +1395,7 @@ def write_metadata( / sample_set / "wgs_snp_data.csv" ) - df_cat = pd.read_csv(src_path) + df_cat = pd.read_csv(src_path, engine="python") df_cat_ds = df_cat.set_index("sample_id").loc[samples_ds].reset_index() dst_path = ( self.bucket_path @@ -1418,7 +1418,7 @@ def write_metadata( / sample_set / "wgs_accession_data.csv" ) - df_cat = pd.read_csv(src_path) + df_cat = pd.read_csv(src_path, engine="python") df_cat_ds = df_cat.set_index("sample_id").loc[samples_ds].reset_index() dst_path = ( self.bucket_path @@ -1572,7 +1572,7 @@ def init_haplotypes(self): / sample_set / "samples.species_aim.csv" ) - df_aim = pd.read_csv(metadata_path) + df_aim = pd.read_csv(metadata_path, engine="python") # Simulate haplotypes for the 
gamb_colu_arab analysis. analysis = "gamb_colu_arab" @@ -1716,7 +1716,7 @@ def init_aim_calls(self): / sample_set / "samples.meta.csv" ) - df_samples = pd.read_csv(metadata_path) + df_samples = pd.read_csv(metadata_path, engine="python") ds["sample_id"] = ("samples",), df_samples["sample_id"] # Add call_genotype variable. @@ -2030,7 +2030,7 @@ def write_metadata(self, release, release_path, sample_set, sequence_qc=True): / sample_set / "samples.meta.csv" ) - df_general = pd.read_csv(src_path) + df_general = pd.read_csv(src_path, engine="python") df_general_ds = df_general.sample(n_samples_sim, replace=False) samples_ds = df_general_ds["sample_id"].tolist() dst_path = ( @@ -2054,7 +2054,7 @@ def write_metadata(self, release, release_path, sample_set, sequence_qc=True): / sample_set / "surveillance.flags.csv" ) - df_surveillance_flags = pd.read_csv(surv_flags_src_path) + df_surveillance_flags = pd.read_csv(surv_flags_src_path, engine="python") df_surveillance_flags_ds = ( df_surveillance_flags.set_index("sample_id").loc[samples_ds].reset_index() ) @@ -2080,7 +2080,7 @@ def write_metadata(self, release, release_path, sample_set, sequence_qc=True): / sample_set / "sequence_qc_stats.csv" ) - df_sequence_qc_stats = pd.read_csv(src_path) + df_sequence_qc_stats = pd.read_csv(src_path, engine="python") df_sequence_qc_stats_ds = ( df_sequence_qc_stats.set_index("sample_id") .loc[samples_ds] @@ -2107,7 +2107,7 @@ def write_metadata(self, release, release_path, sample_set, sequence_qc=True): / sample_set / "samples.cohorts.csv" ) - df_coh = pd.read_csv(src_path) + df_coh = pd.read_csv(src_path, engine="python") df_coh_ds = df_coh.set_index("sample_id").loc[samples_ds].reset_index() dst_path = ( self.bucket_path @@ -2130,7 +2130,7 @@ def write_metadata(self, release, release_path, sample_set, sequence_qc=True): / sample_set / "wgs_snp_data.csv" ) - df_cat = pd.read_csv(src_path) + df_cat = pd.read_csv(src_path, engine="python") df_cat_ds = df_cat.set_index("sample_id").loc[samples_ds].reset_index() dst_path = ( self.bucket_path @@ -2153,7 +2153,7 @@ def write_metadata(self, release, release_path, sample_set, sequence_qc=True): / sample_set / "wgs_accession_data.csv" ) - df_cat = pd.read_csv(src_path) + df_cat = pd.read_csv(src_path, engine="python") df_cat_ds = df_cat.set_index("sample_id").loc[samples_ds].reset_index() dst_path = ( self.bucket_path @@ -2282,7 +2282,7 @@ def init_haplotypes(self): / sample_set / "samples.meta.csv" ) - df_samples = pd.read_csv(metadata_path) + df_samples = pd.read_csv(metadata_path, engine="python") samples = df_samples["sample_id"].values # Simulate haplotypes. 
@@ -2570,7 +2570,7 @@ def write_metadata(self, release, release_path, sample_set, sequence_qc=True): / sample_set / "samples.meta.csv" ) - df_general = pd.read_csv(src_path) + df_general = pd.read_csv(src_path, engine="python") df_general_ds = df_general.sample(n_samples_sim, replace=False) samples_ds = df_general_ds["sample_id"].tolist() dst_path = ( @@ -2595,7 +2595,7 @@ def write_metadata(self, release, release_path, sample_set, sequence_qc=True): / sample_set / "sequence_qc_stats.csv" ) - df_sequence_qc_stats = pd.read_csv(src_path) + df_sequence_qc_stats = pd.read_csv(src_path, engine="python") df_sequence_qc_stats_ds = ( df_sequence_qc_stats.set_index("sample_id") .loc[samples_ds] @@ -2622,7 +2622,7 @@ def write_metadata(self, release, release_path, sample_set, sequence_qc=True): / sample_set / "samples.cohorts.csv" ) - df_coh = pd.read_csv(src_path) + df_coh = pd.read_csv(src_path, engine="python") df_coh_ds = df_coh.set_index("sample_id").loc[samples_ds].reset_index() dst_path = ( self.bucket_path @@ -2645,7 +2645,7 @@ def write_metadata(self, release, release_path, sample_set, sequence_qc=True): / sample_set / "wgs_snp_data.csv" ) - df_cat = pd.read_csv(src_path) + df_cat = pd.read_csv(src_path, engine="python") df_cat_ds = df_cat.set_index("sample_id").loc[samples_ds].reset_index() dst_path = ( self.bucket_path @@ -2668,7 +2668,7 @@ def write_metadata(self, release, release_path, sample_set, sequence_qc=True): / sample_set / "wgs_accession_data.csv" ) - df_cat = pd.read_csv(src_path) + df_cat = pd.read_csv(src_path, engine="python") df_cat_ds = df_cat.set_index("sample_id").loc[samples_ds].reset_index() dst_path = ( self.bucket_path @@ -2887,7 +2887,7 @@ def write_metadata(self, release, release_path, sample_set, sequence_qc=True): / sample_set / "samples.meta.csv" ) - df_general = pd.read_csv(src_path) + df_general = pd.read_csv(src_path, engine="python") df_general_ds = df_general.sample(n_samples_sim, replace=False) samples_ds = df_general_ds["sample_id"].tolist() dst_path = ( @@ -2912,7 +2912,7 @@ def write_metadata(self, release, release_path, sample_set, sequence_qc=True): / sample_set / "sequence_qc_stats.csv" ) - df_sequence_qc_stats = pd.read_csv(src_path) + df_sequence_qc_stats = pd.read_csv(src_path, engine="python") df_sequence_qc_stats_ds = ( df_sequence_qc_stats.set_index("sample_id") .loc[samples_ds] @@ -2939,7 +2939,7 @@ def write_metadata(self, release, release_path, sample_set, sequence_qc=True): / sample_set / "samples.cohorts.csv" ) - df_coh = pd.read_csv(src_path) + df_coh = pd.read_csv(src_path, engine="python") df_coh_ds = df_coh.set_index("sample_id").loc[samples_ds].reset_index() dst_path = ( self.bucket_path @@ -2962,7 +2962,7 @@ def write_metadata(self, release, release_path, sample_set, sequence_qc=True): / sample_set / "wgs_snp_data.csv" ) - df_cat = pd.read_csv(src_path) + df_cat = pd.read_csv(src_path, engine="python") df_cat_ds = df_cat.set_index("sample_id").loc[samples_ds].reset_index() dst_path = ( self.bucket_path @@ -2985,7 +2985,7 @@ def write_metadata(self, release, release_path, sample_set, sequence_qc=True): / sample_set / "wgs_accession_data.csv" ) - df_cat = pd.read_csv(src_path) + df_cat = pd.read_csv(src_path, engine="python") df_cat_ds = df_cat.set_index("sample_id").loc[samples_ds].reset_index() dst_path = ( self.bucket_path