diff --git a/.github/workflows/coverage.yml b/.github/workflows/coverage.yml
index 88d799320..558d8a911 100644
--- a/.github/workflows/coverage.yml
+++ b/.github/workflows/coverage.yml
@@ -25,7 +25,11 @@ jobs:
         run: pip install .[dev]
 
       - name: Run unit tests with coverage
-        run: pytest -v tests --ignore tests/integration --cov malariagen_data/anoph --cov-report=xml
+        run: |
+          pytest -v tests \
+            --ignore tests/integration \
+            --cov malariagen_data/anoph \
+            --cov-report=xml
 
       - name: Upload coverage report
         uses: codecov/codecov-action@v3
diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml
index 942aad738..7e3d0da27 100644
--- a/.github/workflows/tests.yml
+++ b/.github/workflows/tests.yml
@@ -33,4 +33,7 @@ jobs:
         run: pip install "${{ matrix.numpy-version }}" .[dev]
 
       - name: Run unit tests
-        run: pytest -v tests --ignore tests/integration --typeguard-packages=malariagen_data,malariagen_data.anoph
+        run: |
+          pytest -v tests \
+            --ignore tests/integration \
+            --typeguard-packages=malariagen_data,malariagen_data.anoph
diff --git a/malariagen_data/ag3.py b/malariagen_data/ag3.py
index fd528f586..ee40645e0 100644
--- a/malariagen_data/ag3.py
+++ b/malariagen_data/ag3.py
@@ -354,6 +354,7 @@ def cross_metadata(self):
                 na_values=["", "0"],
                 names=fam_names,
                 dtype={"sex": str},
+                engine="python",
             )
 
             debug("convert 'sex' column for consistency with sample metadata")
diff --git a/malariagen_data/anoph/base.py b/malariagen_data/anoph/base.py
index 02343a8cb..7a7fea06b 100644
--- a/malariagen_data/anoph/base.py
+++ b/malariagen_data/anoph/base.py
@@ -588,7 +588,7 @@ def _read_sample_sets_manifest(self, *, single_release: str):
 
         # Read the manifest into a pandas dataframe.
         with self.open_file(manifest_path) as f:
-            df = pd.read_csv(f, sep="\t", na_values="")
+            df = pd.read_csv(f, sep="\t", na_values="", engine="python")
 
         # Add a "release" column for convenience.
         df["release"] = single_release
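The change repeated throughout this patch adds engine="python" to pd.read_csv calls, selecting pandas' pure-Python parser instead of the default C parser. A minimal sketch of the patched call shape, assuming a hypothetical in-memory manifest (the column names and values here are made up, not taken from the real data):

    import io

    import pandas as pd

    # Hypothetical stand-in for a sample sets manifest like the one read above.
    data = b"sample_set\trelease\nAG1000G-AO\t3.0\n"

    # Same call shape as the patched code: tab-separated, empty strings
    # treated as NA, parsed with the pure-Python engine.
    df = pd.read_csv(io.BytesIO(data), sep="\t", na_values="", engine="python")
    print(df)

The diff itself does not state why the python engine is preferred; it is slower than the C engine but supports some parsing options the C engine does not, so consistent behaviour rather than speed is a plausible, unconfirmed motivation.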
df["release"] = single_release diff --git a/malariagen_data/anoph/karyotype.py b/malariagen_data/anoph/karyotype.py index d0eda0d54..cb5d67b41 100644 --- a/malariagen_data/anoph/karyotype.py +++ b/malariagen_data/anoph/karyotype.py @@ -61,7 +61,7 @@ def load_inversion_tags(self, inversion: inversion_param) -> pd.DataFrame: ) else: with importlib.resources.path(resources, self._inversion_tag_path) as path: - df_tag_snps = pd.read_csv(path, sep=",") + df_tag_snps = pd.read_csv(path, sep=",", engine="python") return df_tag_snps.query(f"inversion == '{inversion}'").reset_index() @_check_types diff --git a/malariagen_data/anoph/phenotypes.py b/malariagen_data/anoph/phenotypes.py index adf8d309a..6baa0c142 100644 --- a/malariagen_data/anoph/phenotypes.py +++ b/malariagen_data/anoph/phenotypes.py @@ -57,7 +57,7 @@ def _load_phenotype_data( with self._fs.open(phenotype_path, "r") as f: try: - df_pheno = pd.read_csv(f, low_memory=False) + df_pheno = pd.read_csv(f, low_memory=False, engine="python") except pd.errors.EmptyDataError: warnings.warn(f"Empty phenotype file for {sample_set}") continue diff --git a/malariagen_data/anoph/sample_metadata.py b/malariagen_data/anoph/sample_metadata.py index 1314873a9..08b0833ed 100644 --- a/malariagen_data/anoph/sample_metadata.py +++ b/malariagen_data/anoph/sample_metadata.py @@ -167,7 +167,9 @@ def _parse_general_metadata( } # `dict[str, str]` is incompatible with the `dtype` of `pd.read_csv` dtype: DefaultDict[str, str] = defaultdict(lambda: "object", dtype_dict) - df = pd.read_csv(io.BytesIO(data), dtype=dtype, na_values="") + df = pd.read_csv( + io.BytesIO(data), dtype=dtype, na_values="", engine="python" + ) # Ensure all column names are lower case. df.columns = [c.lower() for c in df.columns] # type: ignore @@ -258,7 +260,9 @@ def _parse_sequence_qc_metadata( dtype: DefaultDict[str, str] = defaultdict(lambda: "object", dtype_dict) # Read the CSV using the dtype dict. - df = pd.read_csv(io.BytesIO(data), dtype=dtype, na_values="") + df = pd.read_csv( + io.BytesIO(data), dtype=dtype, na_values="", engine="python" + ) return df @@ -383,7 +387,9 @@ def _parse_surveillance_flags( if isinstance(data, bytes): # Read the CSV data. - df = pd.read_csv(io.BytesIO(data), dtype=dtype, na_values="") + df = pd.read_csv( + io.BytesIO(data), dtype=dtype, na_values="", engine="python" + ) # If there are any nulls in these data, show a warning. if df.isnull().values.any(): @@ -518,7 +524,9 @@ def _parse_cohorts_metadata( # `dict[str, str]` is incompatible with the `dtype` of `pd.read_csv` dtype: DefaultDict[str, str] = defaultdict(lambda: "object", dtype_dict) - df = pd.read_csv(io.BytesIO(data), dtype=dtype, na_values="") + df = pd.read_csv( + io.BytesIO(data), dtype=dtype, na_values="", engine="python" + ) # Ensure all column names are lower case. df.columns = [c.lower() for c in df.columns] # type: ignore @@ -592,7 +600,7 @@ def _parse_aim_metadata( assert self._aim_metadata_dtype is not None if isinstance(data, bytes): # Parse CSV data but don't apply the dtype yet. - df = pd.read_csv(io.BytesIO(data), na_values="") + df = pd.read_csv(io.BytesIO(data), na_values="", engine="python") # Convert all column names to lowercase. df.columns = [c.lower() for c in df.columns] # type: ignore @@ -1042,7 +1050,7 @@ def wgs_run_accessions(self, sample_set: base_params.sample_set): # Load data catalog. 
path = f"{self._base_path}/{release_path}/metadata/general/{sample_set}/wgs_accession_data.csv" with self._fs.open(path) as f: - df = pd.read_csv(f, na_values="") + df = pd.read_csv(f, na_values="", engine="python") # Normalise columns. df = df[ @@ -1512,7 +1520,7 @@ def cohorts( # Read the manifest into a pandas dataframe. with self.open_file(path) as f: - df_cohorts = pd.read_csv(f, sep=",", na_values="") + df_cohorts = pd.read_csv(f, sep=",", na_values="", engine="python") # Ensure all column names are lower case. df_cohorts.columns = [c.lower() for c in df_cohorts.columns] # type: ignore diff --git a/malariagen_data/plasmodium.py b/malariagen_data/plasmodium.py index 220db4b82..5f6249fe7 100644 --- a/malariagen_data/plasmodium.py +++ b/malariagen_data/plasmodium.py @@ -61,7 +61,9 @@ def sample_metadata(self): if self._cache_sample_metadata is None: path = f"{self._path}/{self.CONF['metadata_path']}" with self._fs.open(path) as f: - self._cache_sample_metadata = pd.read_csv(f, sep="\t", na_values="") + self._cache_sample_metadata = pd.read_csv( + f, sep="\t", na_values="", engine="python" + ) return self._cache_sample_metadata def _open_variant_calls_zarr(self): diff --git a/malariagen_data/util.py b/malariagen_data/util.py index 4e04565a7..6b4fcd63d 100644 --- a/malariagen_data/util.py +++ b/malariagen_data/util.py @@ -10,7 +10,17 @@ from functools import wraps from inspect import getcallargs from textwrap import dedent, fill -from typing import IO, Dict, Hashable, List, Mapping, Optional, Tuple, Union, Callable +from typing import ( + IO, + Dict, + Hashable, + List, + Mapping, + Optional, + Tuple, + Union, + Callable, +) from urllib.parse import unquote_plus from numpy.testing import assert_allclose, assert_array_equal @@ -83,6 +93,7 @@ def _read_gff3(buf, compression="gzip"): names=gff3_cols, na_values=["", "."], compression=compression, + engine="python", ) # parse attributes @@ -91,27 +102,38 @@ def _read_gff3(buf, compression="gzip"): return df -def _unpack_gff3_attributes(df: pd.DataFrame, attributes: Tuple[str, ...]): +def _unpack_gff3_attributes( + df: pd.DataFrame, + attributes: Tuple[str, ...], +) -> pd.DataFrame: df = df.copy() - # discover all attribute keys + # Collect all the unique attributes in the DataFrame as a sorted tuple. all_attributes = set() for a in df["attributes"]: all_attributes.update(a.keys()) all_attributes_sorted = tuple(sorted(all_attributes)) - # handle request for all attributes + # If an asterisk was specified, use all the available attributes. if attributes == ("*",): attributes = all_attributes_sorted - # unpack attributes into columns + # For each of the specified attributes, + # if the attribute is not in the tuple of available attributes, + # then raise a ValueError. for key in attributes: if key not in all_attributes_sorted: raise ValueError( f"'{key}' not in attributes set. Options {all_attributes_sorted}" ) - df[key] = df["attributes"].apply(lambda v: v.get(key, np.nan)) - del df["attributes"] + + # Copy the specified attribute into a new column in the DataFrame. + # Note: avoid using .apply() here, for type checking. + df[key] = [a.get(key, np.nan) for a in df["attributes"]] + + # Drop the original "attributes" column from the DataFrame. + # Note: avoid using del here, for type checking. 
diff --git a/tests/anoph/conftest.py b/tests/anoph/conftest.py
index 0c97c6c7a..42fc3dc99 100644
--- a/tests/anoph/conftest.py
+++ b/tests/anoph/conftest.py
@@ -374,7 +374,7 @@ def simulate_snp_genotypes(
     root = zarr.open(zarr_path, mode="w")
 
     # Create samples array.
-    df_samples = pd.read_csv(metadata_path)
+    df_samples = pd.read_csv(metadata_path, engine="python")
     n_samples = len(df_samples)
     samples = df_samples["sample_id"].values.astype("S")
     root.create_dataset(name="samples", data=samples)
@@ -618,7 +618,7 @@ def simulate_cnv_hmm(zarr_path, metadata_path, contigs, contig_sizes):
     root = zarr.open(zarr_path, mode="w")
 
     # Create samples array.
-    df_samples = pd.read_csv(metadata_path)
+    df_samples = pd.read_csv(metadata_path, engine="python")
     samples = df_samples["sample_id"].values
     root.create_dataset(name="samples", data=samples, dtype=str)
 
@@ -717,7 +717,7 @@ def simulate_cnv_coverage_calls(zarr_path, metadata_path, contigs, contig_sizes)
     root = zarr.open(zarr_path, mode="w")
 
     # Create samples array.
-    df_samples = pd.read_csv(metadata_path)
+    df_samples = pd.read_csv(metadata_path, engine="python")
     n_samples = len(df_samples)
     samples = df_samples["sample_id"].values
     root.create_dataset(name="samples", data=samples, dtype=str)
@@ -837,7 +837,7 @@ def simulate_cnv_discordant_read_calls(zarr_path, metadata_path, contigs, contig
     root = zarr.open(zarr_path, mode="w")
 
     # Create samples array.
-    df_samples = pd.read_csv(metadata_path)
+    df_samples = pd.read_csv(metadata_path, engine="python")
     samples = df_samples["sample_id"].values
     root.create_dataset(name="samples", data=samples, dtype=str)
 
@@ -1250,7 +1250,7 @@ def write_metadata(
             / sample_set
             / "samples.meta.csv"
         )
-        df_general = pd.read_csv(src_path)
+        df_general = pd.read_csv(src_path, engine="python")
         # Randomly downsample.
         df_general_ds = df_general.sample(n_samples_sim, replace=False)
         samples_ds = df_general_ds["sample_id"].tolist()
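The remaining conftest.py hunks all touch the same fixture pattern: read a source metadata CSV, randomly downsample the sample IDs once, then subset every other per-sample table to the same IDs via set_index/.loc/reset_index. A compact sketch of that pattern with hypothetical data (two toy tables, not the real fixtures):

    import pandas as pd

    # Hypothetical per-sample tables keyed by sample_id.
    df_general = pd.DataFrame(
        {"sample_id": ["s1", "s2", "s3", "s4"], "country": ["Mali"] * 4}
    )
    df_qc = pd.DataFrame(
        {"sample_id": ["s1", "s2", "s3", "s4"], "coverage": [30, 28, 31, 29]}
    )

    # Downsample once from the general table...
    df_general_ds = df_general.sample(2, replace=False)
    samples_ds = df_general_ds["sample_id"].tolist()

    # ...then align every other table to the same subset, preserving order.
    df_qc_ds = df_qc.set_index("sample_id").loc[samples_ds].reset_index()
    print(df_qc_ds)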
@@ -1275,7 +1275,7 @@ def write_metadata(
             / sample_set
             / "surveillance.flags.csv"
         )
-        df_surveillance_flags = pd.read_csv(surv_flags_src_path)
+        df_surveillance_flags = pd.read_csv(surv_flags_src_path, engine="python")
         df_surveillance_flags_ds = (
             df_surveillance_flags.set_index("sample_id").loc[samples_ds].reset_index()
         )
@@ -1301,7 +1301,7 @@ def write_metadata(
             / sample_set
             / "sequence_qc_stats.csv"
         )
-        df_sequence_qc_stats = pd.read_csv(src_path)
+        df_sequence_qc_stats = pd.read_csv(src_path, engine="python")
         df_sequence_qc_stats_ds = (
             df_sequence_qc_stats.set_index("sample_id")
             .loc[samples_ds]
@@ -1329,7 +1329,7 @@ def write_metadata(
             / sample_set
             / "samples.species_aim.csv"
         )
-        df_aim = pd.read_csv(src_path)
+        df_aim = pd.read_csv(src_path, engine="python")
         df_aim_ds = df_aim.set_index("sample_id").loc[samples_ds].reset_index()
         dst_path = (
             self.bucket_path
@@ -1353,7 +1353,7 @@ def write_metadata(
             / sample_set
             / "samples.cohorts.csv"
         )
-        df_coh = pd.read_csv(src_path)
+        df_coh = pd.read_csv(src_path, engine="python")
         df_coh_ds = df_coh.set_index("sample_id").loc[samples_ds].reset_index()
         dst_path = (
             self.bucket_path
@@ -1395,7 +1395,7 @@ def write_metadata(
             / sample_set
             / "wgs_snp_data.csv"
         )
-        df_cat = pd.read_csv(src_path)
+        df_cat = pd.read_csv(src_path, engine="python")
         df_cat_ds = df_cat.set_index("sample_id").loc[samples_ds].reset_index()
         dst_path = (
             self.bucket_path
@@ -1418,7 +1418,7 @@ def write_metadata(
             / sample_set
             / "wgs_accession_data.csv"
         )
-        df_cat = pd.read_csv(src_path)
+        df_cat = pd.read_csv(src_path, engine="python")
         df_cat_ds = df_cat.set_index("sample_id").loc[samples_ds].reset_index()
         dst_path = (
             self.bucket_path
@@ -1572,7 +1572,7 @@ def init_haplotypes(self):
                 / sample_set
                 / "samples.species_aim.csv"
             )
-            df_aim = pd.read_csv(metadata_path)
+            df_aim = pd.read_csv(metadata_path, engine="python")
 
             # Simulate haplotypes for the gamb_colu_arab analysis.
             analysis = "gamb_colu_arab"
@@ -1716,7 +1716,7 @@ def init_aim_calls(self):
                 / sample_set
                 / "samples.meta.csv"
             )
-            df_samples = pd.read_csv(metadata_path)
+            df_samples = pd.read_csv(metadata_path, engine="python")
             ds["sample_id"] = ("samples",), df_samples["sample_id"]
 
             # Add call_genotype variable.
@@ -2030,7 +2030,7 @@ def write_metadata(self, release, release_path, sample_set, sequence_qc=True):
             / sample_set
             / "samples.meta.csv"
         )
-        df_general = pd.read_csv(src_path)
+        df_general = pd.read_csv(src_path, engine="python")
         df_general_ds = df_general.sample(n_samples_sim, replace=False)
         samples_ds = df_general_ds["sample_id"].tolist()
         dst_path = (
@@ -2054,7 +2054,7 @@ def write_metadata(self, release, release_path, sample_set, sequence_qc=True):
             / sample_set
             / "surveillance.flags.csv"
         )
-        df_surveillance_flags = pd.read_csv(surv_flags_src_path)
+        df_surveillance_flags = pd.read_csv(surv_flags_src_path, engine="python")
         df_surveillance_flags_ds = (
             df_surveillance_flags.set_index("sample_id").loc[samples_ds].reset_index()
         )
@@ -2080,7 +2080,7 @@ def write_metadata(self, release, release_path, sample_set, sequence_qc=True):
             / sample_set
             / "sequence_qc_stats.csv"
         )
-        df_sequence_qc_stats = pd.read_csv(src_path)
+        df_sequence_qc_stats = pd.read_csv(src_path, engine="python")
         df_sequence_qc_stats_ds = (
             df_sequence_qc_stats.set_index("sample_id")
             .loc[samples_ds]
@@ -2107,7 +2107,7 @@ def write_metadata(self, release, release_path, sample_set, sequence_qc=True):
             / sample_set
             / "samples.cohorts.csv"
         )
-        df_coh = pd.read_csv(src_path)
+        df_coh = pd.read_csv(src_path, engine="python")
         df_coh_ds = df_coh.set_index("sample_id").loc[samples_ds].reset_index()
         dst_path = (
             self.bucket_path
@@ -2130,7 +2130,7 @@ def write_metadata(self, release, release_path, sample_set, sequence_qc=True):
             / sample_set
             / "wgs_snp_data.csv"
         )
-        df_cat = pd.read_csv(src_path)
+        df_cat = pd.read_csv(src_path, engine="python")
         df_cat_ds = df_cat.set_index("sample_id").loc[samples_ds].reset_index()
         dst_path = (
             self.bucket_path
@@ -2153,7 +2153,7 @@ def write_metadata(self, release, release_path, sample_set, sequence_qc=True):
             / sample_set
             / "wgs_accession_data.csv"
         )
-        df_cat = pd.read_csv(src_path)
+        df_cat = pd.read_csv(src_path, engine="python")
         df_cat_ds = df_cat.set_index("sample_id").loc[samples_ds].reset_index()
         dst_path = (
             self.bucket_path
@@ -2282,7 +2282,7 @@ def init_haplotypes(self):
                 / sample_set
                 / "samples.meta.csv"
             )
-            df_samples = pd.read_csv(metadata_path)
+            df_samples = pd.read_csv(metadata_path, engine="python")
             samples = df_samples["sample_id"].values
 
             # Simulate haplotypes.
@@ -2570,7 +2570,7 @@ def write_metadata(self, release, release_path, sample_set, sequence_qc=True):
             / sample_set
             / "samples.meta.csv"
         )
-        df_general = pd.read_csv(src_path)
+        df_general = pd.read_csv(src_path, engine="python")
         df_general_ds = df_general.sample(n_samples_sim, replace=False)
         samples_ds = df_general_ds["sample_id"].tolist()
         dst_path = (
@@ -2595,7 +2595,7 @@ def write_metadata(self, release, release_path, sample_set, sequence_qc=True):
             / sample_set
             / "sequence_qc_stats.csv"
         )
-        df_sequence_qc_stats = pd.read_csv(src_path)
+        df_sequence_qc_stats = pd.read_csv(src_path, engine="python")
         df_sequence_qc_stats_ds = (
             df_sequence_qc_stats.set_index("sample_id")
             .loc[samples_ds]
@@ -2622,7 +2622,7 @@ def write_metadata(self, release, release_path, sample_set, sequence_qc=True):
             / sample_set
             / "samples.cohorts.csv"
         )
-        df_coh = pd.read_csv(src_path)
+        df_coh = pd.read_csv(src_path, engine="python")
         df_coh_ds = df_coh.set_index("sample_id").loc[samples_ds].reset_index()
         dst_path = (
             self.bucket_path
@@ -2645,7 +2645,7 @@ def write_metadata(self, release, release_path, sample_set, sequence_qc=True):
             / sample_set
             / "wgs_snp_data.csv"
         )
-        df_cat = pd.read_csv(src_path)
+        df_cat = pd.read_csv(src_path, engine="python")
         df_cat_ds = df_cat.set_index("sample_id").loc[samples_ds].reset_index()
         dst_path = (
             self.bucket_path
@@ -2668,7 +2668,7 @@ def write_metadata(self, release, release_path, sample_set, sequence_qc=True):
             / sample_set
             / "wgs_accession_data.csv"
        )
-        df_cat = pd.read_csv(src_path)
+        df_cat = pd.read_csv(src_path, engine="python")
         df_cat_ds = df_cat.set_index("sample_id").loc[samples_ds].reset_index()
         dst_path = (
             self.bucket_path
@@ -2887,7 +2887,7 @@ def write_metadata(self, release, release_path, sample_set, sequence_qc=True):
             / sample_set
             / "samples.meta.csv"
         )
-        df_general = pd.read_csv(src_path)
+        df_general = pd.read_csv(src_path, engine="python")
         df_general_ds = df_general.sample(n_samples_sim, replace=False)
         samples_ds = df_general_ds["sample_id"].tolist()
         dst_path = (
@@ -2912,7 +2912,7 @@ def write_metadata(self, release, release_path, sample_set, sequence_qc=True):
             / sample_set
             / "sequence_qc_stats.csv"
         )
-        df_sequence_qc_stats = pd.read_csv(src_path)
+        df_sequence_qc_stats = pd.read_csv(src_path, engine="python")
         df_sequence_qc_stats_ds = (
             df_sequence_qc_stats.set_index("sample_id")
             .loc[samples_ds]
@@ -2939,7 +2939,7 @@ def write_metadata(self, release, release_path, sample_set, sequence_qc=True):
             / sample_set
             / "samples.cohorts.csv"
         )
-        df_coh = pd.read_csv(src_path)
+        df_coh = pd.read_csv(src_path, engine="python")
         df_coh_ds = df_coh.set_index("sample_id").loc[samples_ds].reset_index()
         dst_path = (
             self.bucket_path
@@ -2962,7 +2962,7 @@ def write_metadata(self, release, release_path, sample_set, sequence_qc=True):
             / sample_set
             / "wgs_snp_data.csv"
         )
-        df_cat = pd.read_csv(src_path)
+        df_cat = pd.read_csv(src_path, engine="python")
         df_cat_ds = df_cat.set_index("sample_id").loc[samples_ds].reset_index()
         dst_path = (
             self.bucket_path
@@ -2985,7 +2985,7 @@ def write_metadata(self, release, release_path, sample_set, sequence_qc=True):
             / sample_set
             / "wgs_accession_data.csv"
         )
-        df_cat = pd.read_csv(src_path)
+        df_cat = pd.read_csv(src_path, engine="python")
         df_cat_ds = df_cat.set_index("sample_id").loc[samples_ds].reset_index()
         dst_path = (
             self.bucket_path