From b07e7bfa2067b01642a5627dfaf15e3ebd77dec6 Mon Sep 17 00:00:00 2001 From: tristanpwdennis Date: Wed, 18 Jun 2025 17:53:17 -0500 Subject: [PATCH 1/2] Cherry-pick: add hashing to anopheles.py from 1b196344 --- malariagen_data/anopheles.py | 71 +++++++++++++++++++++++++++--------- 1 file changed, 54 insertions(+), 17 deletions(-) diff --git a/malariagen_data/anopheles.py b/malariagen_data/anopheles.py index ed9cb65cf..7fc78451e 100644 --- a/malariagen_data/anopheles.py +++ b/malariagen_data/anopheles.py @@ -574,31 +574,68 @@ def roh_hmm( debug = self._log.debug resolved_region: Region = parse_single_region(self, region) - del region - debug("compute windowed heterozygosity") - sample_id, sample_set, windows, counts = self._sample_count_het( + name = "roh" + + params = dict( sample=sample, - region=resolved_region, - site_mask=site_mask, + region=region, window_size=window_size, + site_mask=site_mask, sample_set=sample_set, - chunks=chunks, - inline_array=inline_array, - ) - - debug("compute runs of homozygosity") - df_roh = self._roh_hmm_predict( - windows=windows, - counts=counts, phet_roh=phet_roh, phet_nonroh=phet_nonroh, transition=transition, - window_size=window_size, - sample_id=sample_id, - contig=resolved_region.contig, + chunks=chunks, + inline_array=inline_array, ) + del region + + try: + # Load cached numeric data, adding str / obj data again. + results = self.results_cache_get(name=name, params=params) + df_roh = pd.DataFrame(results) + df_roh["sample_id"] = sample + df_roh["contig"] = resolved_region.contig + + except CacheMiss: + debug("compute windowed heterozygosity") + sample_id, sample_set, windows, counts = self._sample_count_het( + sample=sample, + region=resolved_region, + site_mask=site_mask, + window_size=window_size, + sample_set=sample_set, + chunks=chunks, + inline_array=inline_array, + ) + + debug("compute runs of homozygosity") + df_roh = self._roh_hmm_predict( + windows=windows, + counts=counts, + phet_roh=phet_roh, + phet_nonroh=phet_nonroh, + transition=transition, + window_size=window_size, + sample_id=sample_id, + contig=resolved_region.contig, + ) + + # Specify numeric columns to save (saving obj - sample ID and contig - breaks the save. + columns_to_save = [ + "roh_start", + "roh_stop", + "roh_length", + "roh_is_marginal", + ] + self.results_cache_set( + name=name, + params=params, + results={col: df_roh[col].to_numpy() for col in columns_to_save}, + ) + return df_roh @check_types @@ -1306,7 +1343,7 @@ def ihs_gwss( ) -> Tuple[np.ndarray, np.ndarray]: # change this name if you ever change the behaviour of this function, to # invalidate any previously cached data - name = self._ihs_gwss_cache_name + name = "roh" params = dict( contig=contig, From 9ccf96d92459994abfd8b47a85d062d4e97b2dd0 Mon Sep 17 00:00:00 2001 From: tristanpwdennis Date: Wed, 18 Jun 2025 18:17:27 -0500 Subject: [PATCH 2/2] Add more comments, tidy up a little --- malariagen_data/anopheles.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/malariagen_data/anopheles.py b/malariagen_data/anopheles.py index 7fc78451e..c5411a6b1 100644 --- a/malariagen_data/anopheles.py +++ b/malariagen_data/anopheles.py @@ -575,7 +575,8 @@ def roh_hmm( resolved_region: Region = parse_single_region(self, region) - name = "roh" + # Create params for hashing. + name = "roh_v1" params = dict( sample=sample, @@ -592,8 +593,9 @@ def roh_hmm( del region + # The caching struggles with saving variable length strings, so we can just load/save the numeric data, and + # add the strings (sample ID and contig) from user input. try: - # Load cached numeric data, adding str / obj data again. results = self.results_cache_get(name=name, params=params) df_roh = pd.DataFrame(results) df_roh["sample_id"] = sample @@ -623,13 +625,15 @@ def roh_hmm( contig=resolved_region.contig, ) - # Specify numeric columns to save (saving obj - sample ID and contig - breaks the save. + # Specify numeric columns to save to cache. (See above - variable length strings can break the save). columns_to_save = [ "roh_start", "roh_stop", "roh_length", "roh_is_marginal", ] + + # Save cache self.results_cache_set( name=name, params=params,