From 2e19059e429d8bd302e7dd076c18f510b56bf325 Mon Sep 17 00:00:00 2001 From: Carlos Ruiz Date: Sat, 28 Mar 2026 08:09:24 -0700 Subject: [PATCH 1/3] Addressing main PR comments --- multiqc/modules/bases2fastq/bases2fastq.py | 249 +++++++++++--------- multiqc/modules/bases2fastq/plot_runs.py | 28 +-- multiqc/modules/bases2fastq/plot_samples.py | 20 +- 3 files changed, 164 insertions(+), 133 deletions(-) diff --git a/multiqc/modules/bases2fastq/bases2fastq.py b/multiqc/modules/bases2fastq/bases2fastq.py index d170241de1..f2c9504634 100644 --- a/multiqc/modules/bases2fastq/bases2fastq.py +++ b/multiqc/modules/bases2fastq/bases2fastq.py @@ -1,39 +1,41 @@ -from collections import defaultdict import copy -from itertools import chain -import re import json import logging import random -from typing import Any, Callable, Dict, List, Optional, Tuple, Union +import re import uuid +from collections import defaultdict +from itertools import chain from pathlib import Path +from typing import Any, Callable + from natsort import natsorted + from multiqc import config from multiqc.base_module import BaseMultiqcModule, ModuleNoSamplesFound -from multiqc.types import LoadedFileDict -from multiqc.utils import mqc_colour - from multiqc.modules.bases2fastq.plot_runs import ( + plot_base_quality_by_cycle, + plot_base_quality_hist, plot_run_stats, - tabulate_manifest_stats, tabulate_index_assignment_stats, - tabulate_unassigned_index_stats, - tabulate_run_stats, + tabulate_manifest_stats, tabulate_project_stats, - plot_base_quality_hist, - plot_base_quality_by_cycle, + tabulate_run_stats, + tabulate_unassigned_index_stats, ) from multiqc.modules.bases2fastq.plot_samples import ( - tabulate_sample_stats, - sequence_content_plot, - plot_per_cycle_N_content, plot_adapter_content, + plot_per_cycle_N_content, plot_per_read_gc_hist, + sequence_content_plot, + tabulate_sample_stats, ) +from multiqc.types import LoadedFileDict +from multiqc.utils import mqc_colour log = logging.getLogger(__name__) +ELEMBIO_DOCS_URL = "https://docs.elembio.io/docs/bases2fastq/introduction/" # Default minimum polony threshold - samples below this are skipped DEFAULT_MIN_POLONIES = 1000 @@ -121,11 +123,11 @@ class MultiqcModule(BaseMultiqcModule): Data Structures --------------- - - `run_level_data`: Dict[run_name, run_stats] - Run-level QC metrics - - `run_level_samples`: Dict[sample_id, sample_stats] - Sample metrics from run-level - - `project_level_data`: Dict[project_name, project_stats] - Project-level QC metrics - - `project_level_samples`: Dict[sample_id, sample_stats] - Sample metrics from project-level - - `*_samples_to_project`: Dict[sample_id, project_name] - Maps samples to their projects + - `run_level_data`: dict[run_name, run_stats] - Run-level QC metrics + - `run_level_samples`: dict[sample_id, sample_stats] - Sample metrics from run-level + - `project_level_data`: dict[project_name, project_stats] - Project-level QC metrics + - `project_level_samples`: dict[sample_id, sample_stats] - Sample metrics from project-level + - `*_samples_to_project`: dict[sample_id, project_name] - Maps samples to their projects Sample Naming Convention ------------------------ @@ -151,7 +153,7 @@ def __init__(self): super(MultiqcModule, self).__init__( name="Bases2Fastq", anchor="bases2fastq", - href="https://docs.elembio.io/docs/bases2fastq/introduction/", + href=ELEMBIO_DOCS_URL, info="Demultiplexes and converts Element AVITI base calls into FASTQ files", doi="10.1038/s41587-023-01750-7", ) @@ -198,24 +200,24 @@ def _init_data_structures(self) -> None: """ # File cache to avoid reading the same JSON files multiple times # Key: resolved file path, Value: parsed JSON data - self._file_cache: Dict[str, Any] = {} + self._file_cache: dict[str, Any] = {} # === Run-level data structures === # Populated from /RunStats.json - self.run_level_data: Dict[str, Any] = {} # run_name -> full run stats - self.run_level_samples: Dict[str, Any] = {} # sample_id -> sample stats - self.run_level_samples_to_project: Dict[str, str] = {} # sample_id -> project name + self.run_level_data: dict[str, Any] = {} # run_name -> full run stats + self.run_level_samples: dict[str, Any] = {} # sample_id -> sample stats + self.run_level_samples_to_project: dict[str, str] = {} # sample_id -> project name # === Project-level data structures === # Populated from /Samples//RunStats.json - self.project_level_data: Dict[str, Any] = {} # project_name -> project stats - self.project_level_samples: Dict[str, Any] = {} # sample_id -> sample stats - self.project_level_samples_to_project: Dict[str, str] = {} # sample_id -> project name + self.project_level_data: dict[str, Any] = {} # project_name -> project stats + self.project_level_samples: dict[str, Any] = {} # sample_id -> sample stats + self.project_level_samples_to_project: dict[str, str] = {} # sample_id -> project name # === Grouping structures for color assignment === - self.group_dict: Dict[str, Any] = {} # group_name -> list of members - self.group_lookup_dict: Dict[str, Any] = {} # item -> group it belongs to - self.project_lookup_dict: Dict[str, Any] = {} # sample -> project mapping + self.group_dict: dict[str, Any] = {} # group_name -> list of members + self.group_lookup_dict: dict[str, Any] = {} # item -> group it belongs to + self.project_lookup_dict: dict[str, Any] = {} # sample -> project mapping def _validate_path(self, file_path: Path, base_directory: Path) -> bool: """ @@ -241,7 +243,7 @@ def _validate_path(self, file_path: Path, base_directory: Path) -> bool: ) return False - def _read_json_file(self, file_path: Path, base_directory: Optional[Path] = None) -> Optional[Dict[str, Any]]: + def _read_json_file(self, file_path: Path, base_directory: Path | None = None) -> dict[str, Any] | None: """ Read and parse a JSON file with caching. @@ -265,7 +267,7 @@ def _read_json_file(self, file_path: Path, base_directory: Optional[Path] = None log.error( f"{file_path.name} does not exist at {file_path}.\n" f"Please visit Elembio online documentation for more information - " - f"https://docs.elembio.io/docs/bases2fastq/introduction/" + f"{ELEMBIO_DOCS_URL}" ) return None @@ -285,9 +287,12 @@ def _parse_and_validate_data(self) -> str: Returns: summary_path: The determined summary path ('run_level', 'project_level', or 'combined_level') """ - # Collect log files once per pattern (find_log_files returns a generator) - run_level_log_files = list(self.find_log_files("bases2fastq/run")) - project_level_log_files = list(self.find_log_files("bases2fastq/project")) + # Collect log files once per pattern (find_log_files returns a generator). + # Stored as instance vars so downstream parsers can reuse them. + self._run_level_log_files = list(self.find_log_files("bases2fastq/run")) + self._project_level_log_files = list(self.find_log_files("bases2fastq/project")) + run_level_log_files = self._run_level_log_files + project_level_log_files = self._project_level_log_files if len(run_level_log_files) == 0 and len(project_level_log_files) == 0: error_msg = "No run- or project-level log files found within the Bases2Fastq results." @@ -366,8 +371,8 @@ def _determine_summary_path(self) -> str: def _select_data_by_summary_path( self, summary_path: str - ) -> Tuple[ - Dict[str, Any], Dict[str, Any], Dict[str, str], Dict[str, Any], Dict[str, Any], Dict[int, Dict[str, Any]] + ) -> tuple[ + dict[str, Any], dict[str, Any], dict[str, str], dict[str, Any], dict[str, Any], dict[int, dict[str, Any]] ]: """ Select the appropriate data sources based on the summary path. @@ -377,31 +382,35 @@ def _select_data_by_summary_path( index_assignment_data, unassigned_sequences) """ if summary_path == "run_level": + manifest_log_files = list(self.find_log_files("bases2fastq/manifest")) return ( self.run_level_data, self.run_level_samples, self.run_level_samples_to_project, - self._parse_run_manifest("bases2fastq/manifest"), - self._parse_index_assignment("bases2fastq/manifest"), - self._parse_run_unassigned_sequences("bases2fastq/run"), + self._parse_run_manifest("bases2fastq/manifest", log_files=manifest_log_files), + self._parse_index_assignment("bases2fastq/manifest", log_files=manifest_log_files), + self._parse_run_unassigned_sequences("bases2fastq/run", log_files=self._run_level_log_files), ) elif summary_path == "project_level": return ( self.project_level_data, self.project_level_samples, self.project_level_samples_to_project, - self._parse_run_manifest_in_project("bases2fastq/project"), - self._parse_index_assignment_in_project("bases2fastq/project"), + self._parse_run_manifest_in_project("bases2fastq/project", log_files=self._project_level_log_files), + self._parse_index_assignment_in_project("bases2fastq/project", log_files=self._project_level_log_files), {}, # No unassigned sequences for project level ) elif summary_path == "combined_level": + # Use run-level stats for the run table (more complete), but + # project-level samples for per-sample plots (properly split by project). + manifest_log_files = list(self.find_log_files("bases2fastq/manifest")) return ( self.run_level_data, self.project_level_samples, self.project_level_samples_to_project, - self._parse_run_manifest("bases2fastq/manifest"), - self._parse_index_assignment("bases2fastq/manifest"), - self._parse_run_unassigned_sequences("bases2fastq/run"), + self._parse_run_manifest("bases2fastq/manifest", log_files=manifest_log_files), + self._parse_index_assignment("bases2fastq/manifest", log_files=manifest_log_files), + self._parse_run_unassigned_sequences("bases2fastq/run", log_files=self._run_level_log_files), ) else: error_msg = "No run- or project-level data was retained. No report will be generated." @@ -409,16 +418,16 @@ def _select_data_by_summary_path( raise ModuleNoSamplesFound(error_msg) def _setup_colors( - self, sample_data: Dict[str, Any], samples_to_projects: Dict[str, str], summary_path: str + self, sample_data: dict[str, Any], samples_to_projects: dict[str, str], summary_path: str ) -> None: """Set up color schemes for groups and samples.""" # Create run and project groups - run_groups: Dict[str, List] = defaultdict(list) - project_groups: Dict[str, List] = defaultdict(list) - ind_sample_groups: Dict[str, List] = defaultdict(list) + run_groups: dict[str, list] = defaultdict(list) + project_groups: dict[str, list] = defaultdict(list) + ind_sample_groups: dict[str, list] = defaultdict(list) for sample in natsorted(sample_data.keys()): - run_name, _ = sample.split("__") + run_name, _ = sample.split("__", maxsplit=1) run_groups[run_name].append(sample) sample_project = samples_to_projects.get(sample, "DefaultProject") project_groups[sample_project].append(sample) @@ -448,7 +457,7 @@ def _setup_colors( } # Assign colors to samples - self.sample_color: Dict[str, str] = {} + self.sample_color: dict[str, str] = {} for sample_name in natsorted(samples_to_projects.keys()): if summary_path == "project_level" or len(project_groups) == 1: sample_color = self.group_color[sample_name] @@ -463,12 +472,12 @@ def _setup_colors( def _generate_plots( self, summary_path: str, - run_data: Dict[str, Any], - sample_data: Dict[str, Any], - samples_to_projects: Dict[str, str], - manifest_data: Dict[str, Any], - index_assignment_data: Dict[str, Any], - unassigned_sequences: Dict[int, Dict[str, Any]], + run_data: dict[str, Any], + sample_data: dict[str, Any], + samples_to_projects: dict[str, str], + manifest_data: dict[str, Any], + index_assignment_data: dict[str, Any], + unassigned_sequences: dict[int, dict[str, Any]], ) -> None: """Generate all plots and add sections to the report.""" # QC metrics table @@ -505,9 +514,9 @@ def get_uuid(self) -> str: def _extract_run_analysis_name( self, - data: Dict[str, Any], + data: dict[str, Any], source_info: str = "RunStats.json", - ) -> Optional[str]: + ) -> str | None: """ Extract and validate run_analysis_name from data dict. @@ -526,15 +535,15 @@ def _extract_run_analysis_name( f"Error with {source_info}. Either RunName or AnalysisID is absent.\n" f"RunName: {run_name}, AnalysisID: {analysis_id}\n" f"Please visit Elembio online documentation for more information - " - f"https://docs.elembio.io/docs/bases2fastq/introduction/" + f"{ELEMBIO_DOCS_URL}" ) return None return f"{run_name}-{analysis_id[0:4]}" def _parse_run_project_data( - self, data_source: str, log_files: Optional[List[LoadedFileDict[Any]]] = None - ) -> Tuple[Dict[str, Any], Dict[str, Any], Dict[str, str]]: + self, data_source: str, log_files: list[LoadedFileDict[Any]] | None = None + ) -> tuple[dict[str, Any], dict[str, Any], dict[str, str]]: """ Parse RunStats.json files to extract run/project and sample-level data. @@ -548,16 +557,16 @@ def _parse_run_project_data( Returns: Tuple of: - - runs_global_data: Dict[run_name, run_stats] - Run/project level metrics - - runs_sample_data: Dict[sample_id, sample_stats] - Per-sample metrics - - sample_to_project: Dict[sample_id, project_name] - Sample-to-project mapping + - runs_global_data: dict[run_name, run_stats] - Run/project level metrics + - runs_sample_data: dict[sample_id, sample_stats] - Per-sample metrics + - sample_to_project: dict[sample_id, project_name] - Sample-to-project mapping Data Flow: RunStats.json -> parse -> filter samples by min_polonies -> populate dicts """ - runs_global_data: Dict[str, Any] = {} - runs_sample_data: Dict[str, Any] = {} - sample_to_project: Dict[str, str] = {} + runs_global_data: dict[str, Any] = {} + runs_sample_data: dict[str, Any] = {} + sample_to_project: dict[str, str] = {} if data_source == "": return (runs_global_data, runs_sample_data, sample_to_project) @@ -618,8 +627,8 @@ def _parse_run_project_data( return (runs_global_data, runs_sample_data, sample_to_project) def _extract_manifest_lane_settings( - self, run_manifest_data: Dict[str, Any], run_analysis_name: str - ) -> Dict[str, Dict[str, Any]]: + self, run_manifest_data: dict[str, Any], run_analysis_name: str + ) -> dict[str, dict[str, Any]]: """ Extract per-lane settings from a parsed RunManifest.json Settings section. @@ -628,10 +637,10 @@ def _extract_manifest_lane_settings( run_analysis_name: Run identifier for building run_lane keys Returns: - Dict[run_lane, settings] where run_lane = "{run_analysis_name} | L{lane_id}" + dict[run_lane, settings] where run_lane = "{run_analysis_name} | L{lane_id}" and settings contain Indexing, AdapterTrimType, R1/R2AdapterMinimumTrimmedLength """ - result: Dict[str, Dict[str, Any]] = {} + result: dict[str, dict[str, Any]] = {} if "Settings" not in run_manifest_data: return result for lane_data in run_manifest_data["Settings"]: @@ -659,7 +668,9 @@ def _extract_manifest_lane_settings( result[run_lane]["R2AdapterMinimumTrimmedLength"] = lane_data.get("R2AdapterMinimumTrimmedLength", "N/A") return result - def _parse_run_manifest(self, data_source: str) -> Dict[str, Any]: + def _parse_run_manifest( + self, data_source: str, log_files: list[LoadedFileDict[Any]] | None = None + ) -> dict[str, Any]: """ Parse RunManifest.json for run-level analysis to extract lane and adapter settings. @@ -670,16 +681,18 @@ def _parse_run_manifest(self, data_source: str) -> Dict[str, Any]: Args: data_source: Search pattern key for RunManifest.json files + log_files: Optional pre-collected list of file dicts from find_log_files. Returns: - Dict[run_lane, settings] where run_lane = "{run_name} | L{lane_id}" + dict[run_lane, settings] where run_lane = "{run_name} | L{lane_id}" """ - runs_manifest_data: Dict[str, Dict[str, Any]] = {} + runs_manifest_data: dict[str, dict[str, Any]] = {} if data_source == "": return runs_manifest_data - for f in self.find_log_files(data_source): + files_to_process = log_files if log_files is not None else list(self.find_log_files(data_source)) + for f in files_to_process: directory = f.get("root") if not directory: continue @@ -706,7 +719,9 @@ def _parse_run_manifest(self, data_source: str) -> Dict[str, Any]: return runs_manifest_data - def _parse_run_manifest_in_project(self, data_source: str) -> Dict[str, Any]: + def _parse_run_manifest_in_project( + self, data_source: str, log_files: list[LoadedFileDict[Any]] | None = None + ) -> dict[str, Any]: """ Parse RunManifest.json for project-level analysis. @@ -718,21 +733,22 @@ def _parse_run_manifest_in_project(self, data_source: str) -> Dict[str, Any]: + ../../RunManifest.json (run-level manifest) -> Extract per-lane settings """ - project_manifest_data: Dict[str, Dict[str, Any]] = {} + project_manifest_data: dict[str, dict[str, Any]] = {} if data_source == "": return project_manifest_data - for f in self.find_log_files(data_source): + files_to_process = log_files if log_files is not None else list(self.find_log_files(data_source)) + for f in files_to_process: directory = f.get("root") if not directory: continue - # Get RunManifest.json from run output root (check if it exists in the same directory or try two levels up) + # Resolve base_directory to the run output root (not the project subdirectory), + # since RunManifest.json lives at the run root. Path validation in _read_json_file + # will check the manifest path against this run root directory. base_directory = Path(directory).resolve() - if (base_directory / "RunManifest.json").exists(): - base_directory = base_directory - else: + if not (base_directory / "RunManifest.json").exists(): base_directory = base_directory.parent.parent run_manifest = base_directory / "RunManifest.json" project_stats = json.loads(f["f"]) @@ -768,10 +784,10 @@ def _parse_run_manifest_in_project(self, data_source: str) -> Dict[str, Any]: def _build_index_assignment_from_stats( self, - stats_dict: Dict[str, Any], + stats_dict: dict[str, Any], run_analysis_name: str, - project: Optional[str] = None, - ) -> Tuple[Dict[str, Dict[str, Any]], int]: + project: str | None = None, + ) -> tuple[dict[str, dict[str, Any]], int]: """ Build per-run index assignment dict from RunStats SampleStats/Occurrences. @@ -779,7 +795,7 @@ def _build_index_assignment_from_stats( Tuple of (run_inner_dict, total_polonies). run_inner_dict is { merged_expected_sequence -> { SampleID, SamplePolonyCounts, PercentOfPolonies, Index1, Index2, ... } } """ - run_inner: Dict[str, Dict[str, Any]] = {} + run_inner: dict[str, dict[str, Any]] = {} total_polonies = stats_dict.get("NumPoloniesBeforeTrimming", 0) if "SampleStats" not in stats_dict: return (run_inner, total_polonies) @@ -796,7 +812,7 @@ def _build_index_assignment_from_stats( log.error(f"Missing data needed to extract index assignment for sample {sample_id}. Skipping.") continue if sample_expected_seq not in run_inner: - entry: Dict[str, Any] = { + entry: dict[str, Any] = { "SampleID": sample_id, "SamplePolonyCounts": 0, "PercentOfPolonies": float("nan"), @@ -814,8 +830,8 @@ def _build_index_assignment_from_stats( def _merge_manifest_index_sequences( self, - sample_to_index_assignment: Dict[str, Any], - run_manifest_data: Dict[str, Any], + sample_to_index_assignment: dict[str, Any], + run_manifest_data: dict[str, Any], run_analysis_name: str, ) -> None: """Merge Index1/Index2 from RunManifest Samples into sample_to_index_assignment (mutates).""" @@ -843,7 +859,9 @@ def _merge_manifest_index_sequences( run_data[merged_indices]["Index1"] = index_1 run_data[merged_indices]["Index2"] = index_2 - def _parse_run_unassigned_sequences(self, data_source: str) -> Dict[int, Dict[str, Any]]: + def _parse_run_unassigned_sequences( + self, data_source: str, log_files: list[LoadedFileDict[Any]] | None = None + ) -> dict[int, dict[str, Any]]: """ Parse unassigned/unknown barcode sequences from run-level data. @@ -854,11 +872,12 @@ def _parse_run_unassigned_sequences(self, data_source: str) -> Dict[int, Dict[st RunStats.json -> Lanes -> UnassignedSequences -> Extract: sequence, count, percentage of total polonies """ - run_unassigned_sequences: Dict[int, Dict[str, Any]] = {} + run_unassigned_sequences: dict[int, dict[str, Any]] = {} if data_source == "": return run_unassigned_sequences - for f in self.find_log_files(data_source): + files_to_process = log_files if log_files is not None else list(self.find_log_files(data_source)) + for f in files_to_process: data = json.loads(f["f"]) # Get RunName and AnalysisID @@ -902,7 +921,9 @@ def _parse_run_unassigned_sequences(self, data_source: str) -> Dict[int, Dict[st return run_unassigned_sequences - def _parse_index_assignment(self, manifest_data_source: str) -> Dict[str, Any]: + def _parse_index_assignment( + self, manifest_data_source: str, log_files: list[LoadedFileDict[Any]] | None = None + ) -> dict[str, Any]: """ Parse index assignment statistics for run-level analysis. @@ -914,12 +935,13 @@ def _parse_index_assignment(self, manifest_data_source: str) -> Dict[str, Any]: + RunManifest.json -> Samples -> index sequences (Index1, Index2) -> Combined index assignment table """ - sample_to_index_assignment: Dict[str, Dict[str, Dict[str, Any]]] = {} + sample_to_index_assignment: dict[str, dict[str, dict[str, Any]]] = {} if manifest_data_source == "": return sample_to_index_assignment - for f in self.find_log_files(manifest_data_source): + files_to_process = log_files if log_files is not None else list(self.find_log_files(manifest_data_source)) + for f in files_to_process: directory = f.get("root") if not directory: continue @@ -944,7 +966,7 @@ def _parse_index_assignment(self, manifest_data_source: str) -> Dict[str, Any]: f"Error, missing SampleStats in RunStats.json. Skipping index assignment metrics.\n" f"Available keys: {list(run_stats.keys())}\n" f"Please visit Elembio online documentation for more information - " - f"https://docs.elembio.io/docs/bases2fastq/introduction/" + f"{ELEMBIO_DOCS_URL}" ) continue @@ -964,7 +986,9 @@ def _parse_index_assignment(self, manifest_data_source: str) -> Dict[str, Any]: return sample_to_index_assignment - def _parse_index_assignment_in_project(self, data_source: str) -> Dict[str, Any]: + def _parse_index_assignment_in_project( + self, data_source: str, log_files: list[LoadedFileDict[Any]] | None = None + ) -> dict[str, Any]: """ Parse index assignment statistics for project-level analysis. @@ -976,12 +1000,13 @@ def _parse_index_assignment_in_project(self, data_source: str) -> Dict[str, Any] + ../../RunManifest.json -> Samples -> index sequences -> Combined index assignment table """ - sample_to_index_assignment: Dict[str, Dict[str, Dict[str, Any]]] = {} + sample_to_index_assignment: dict[str, dict[str, dict[str, Any]]] = {} if data_source == "": return sample_to_index_assignment - for f in self.find_log_files(data_source): + files_to_process = log_files if log_files is not None else list(self.find_log_files(data_source)) + for f in files_to_process: directory = f.get("root") if not directory: continue @@ -1009,7 +1034,7 @@ def _parse_index_assignment_in_project(self, data_source: str) -> Dict[str, Any] f"Error, missing SampleStats in RunStats.json. Skipping index assignment metrics.\n" f"Available keys: {list(project_stats.keys())}\n" f"Please visit Elembio online documentation for more information - " - f"https://docs.elembio.io/docs/bases2fastq/introduction/" + f"{ELEMBIO_DOCS_URL}" ) continue @@ -1032,16 +1057,21 @@ def _parse_index_assignment_in_project(self, data_source: str) -> Dict[str, Any] return sample_to_index_assignment - def add_run_plots(self, data: Dict[Any, Any], plot_functions: List[Callable]) -> None: + def add_run_plots(self, data: dict[Any, Any], plot_functions: list[Callable]) -> None: + if not data: + return for func in plot_functions: plot_html, plot_name, anchor, description, helptext, plot_data = func(data, self.run_color) - self.add_section(name=plot_name, plot=plot_html, anchor=anchor, description=description, helptext=helptext) - self.write_data_file(plot_data, f"base2fastq:{plot_name}") + if plot_html is not None: + self.add_section(name=plot_name, plot=plot_html, anchor=anchor, description=description, helptext=helptext) + self.write_data_file(plot_data, f"base2fastq:{plot_name}") def add_sample_plots( - self, data: Dict[str, Any], group_lookup: Dict[str, str], project_lookup: Dict[str, str] + self, data: dict[str, Any], group_lookup: dict[str, str], project_lookup: dict[str, str] ) -> None: - plot_functions: List[Callable] = [ + if not data: + return + plot_functions: list[Callable] = [ tabulate_sample_stats, sequence_content_plot, plot_per_cycle_N_content, @@ -1052,5 +1082,6 @@ def add_sample_plots( plot_html, plot_name, anchor, description, helptext, plot_data = func( data, group_lookup, project_lookup, self.sample_color ) - self.add_section(name=plot_name, plot=plot_html, anchor=anchor, description=description, helptext=helptext) - self.write_data_file(plot_data, f"base2fastq:{plot_name}") + if plot_html is not None: + self.add_section(name=plot_name, plot=plot_html, anchor=anchor, description=description, helptext=helptext) + self.write_data_file(plot_data, f"base2fastq:{plot_name}") diff --git a/multiqc/modules/bases2fastq/plot_runs.py b/multiqc/modules/bases2fastq/plot_runs.py index 39499dc723..c8ccbbde20 100644 --- a/multiqc/modules/bases2fastq/plot_runs.py +++ b/multiqc/modules/bases2fastq/plot_runs.py @@ -1,5 +1,5 @@ import math -from typing import Any, Dict, cast +from typing import Any, cast from multiqc.plots import bargraph, linegraph, table from multiqc.plots.table_object import ColumnDict, SectionT @@ -212,7 +212,7 @@ def tabulate_project_stats(run_data, color_dict): first_key = run_keys[0] project_header = f"{run_data[first_key]['Project']} | " plot_name = f"{project_header}Sequencing QC Metrics Table" - plot_html = table.plot(plot_content, cast(Dict[Any, ColumnDict], headers), pconfig=pconfig) + plot_html = table.plot(plot_content, cast(dict[Any, ColumnDict], headers), pconfig=pconfig) anchor = "project_run_qc_metrics_table" description = "QC metrics per run, per project" helptext = """ @@ -339,7 +339,7 @@ def tabulate_run_stats(run_data, color_dict): } plot_name = "Sequencing Run QC Metrics Table" - plot_html = table.plot(plot_content, cast(Dict[Any, ColumnDict], headers), pconfig=pconfig) + plot_html = table.plot(plot_content, cast(dict[Any, ColumnDict], headers), pconfig=pconfig) anchor = "run_qc_metrics_table" description = "QC metrics per run" helptext = """ @@ -398,7 +398,7 @@ def tabulate_manifest_stats(run_data, color_dict): } plot_name = "Run Manifest Table" - plot_html = table.plot(plot_content, cast(Dict[Any, ColumnDict], headers), pconfig=pconfig) + plot_html = table.plot(plot_content, cast(dict[Any, ColumnDict], headers), pconfig=pconfig) anchor = "run_manifest_metrics_table" description = "Run parameters used." helptext = """ @@ -429,7 +429,7 @@ def tabulate_index_assignment_stats(run_data, color_dict): if "Project" in sample_data: sample_index_stats.update({"project": sample_data["Project"]}) project_present = True - sample_index_stats.update({"sample_name": sample_data["SampleID"].split("__")[1]}) + sample_index_stats.update({"sample_name": sample_data["SampleID"].split("__", maxsplit=1)[1]}) sample_index_stats.update({"index_1": sample_data["Index1"]}) sample_index_stats.update({"index_2": sample_data["Index2"]}) sample_index_stats.update({"assigned_polonies": sample_data["SamplePolonyCounts"]}) @@ -437,7 +437,7 @@ def tabulate_index_assignment_stats(run_data, color_dict): plot_content.update({index: sample_index_stats}) index += 1 - headers: Dict[str, Any] = {} + headers: dict[str, Any] = {} headers["run_name"] = { "title": "Run Name", "description": "Run Name.", @@ -480,7 +480,7 @@ def tabulate_index_assignment_stats(run_data, color_dict): } plot_name = "Index Assignment Metrics" - plot_html = table.plot(cast(SectionT, plot_content), cast(Dict[Any, ColumnDict], headers), pconfig=pconfig) + plot_html = table.plot(cast(SectionT, plot_content), cast(dict[Any, ColumnDict], headers), pconfig=pconfig) anchor = "index_assignment_metrics" description = "Index assignment metrics." helptext = """ @@ -506,7 +506,7 @@ def tabulate_unassigned_index_stats(run_data, color_dict): - Polonies - % Polonies """ - headers: Dict[str, Any] = {} + headers: dict[str, Any] = {} headers["Run Name"] = { "title": "Run Name", "description": "Run Name (Run ID + Analysis ID).", @@ -544,7 +544,7 @@ def tabulate_unassigned_index_stats(run_data, color_dict): } plot_name = "Unassigned Indices Metrics" - plot_html = table.plot(cast(SectionT, run_data), cast(Dict[Any, ColumnDict], headers), pconfig=pconfig) + plot_html = table.plot(cast(SectionT, run_data), cast(dict[Any, ColumnDict], headers), pconfig=pconfig) anchor = "index_unassignment_metrics" description = "Index unassignment metrics." helptext = """ @@ -572,7 +572,7 @@ def _run_has_reads(run_entry: dict) -> bool: def plot_base_quality_hist(run_data, color_dict): # Prepare plot data for per base BQ histogram (skip runs without Reads) - bq_hist_dict: Dict[str, Dict[int, float]] = {} + bq_hist_dict: dict[str, dict[int, float]] = {} for s_name in natsorted(run_data.keys()): if not _run_has_reads(run_data[s_name]): continue @@ -588,7 +588,7 @@ def plot_base_quality_hist(run_data, color_dict): bq_hist_dict[s_name].update({quality: R1R2_base_quality_counts[quality] / total_bases * 100}) # Prepare plot data for per read average BQ histogram - per_read_quality_hist_dict: Dict[str, Dict[int, float]] = {} + per_read_quality_hist_dict: dict[str, dict[int, float]] = {} for s_name in natsorted(run_data.keys()): if not _run_has_reads(run_data[s_name]): continue @@ -676,10 +676,10 @@ def plot_base_quality_by_cycle(run_data, color_dict): R1CycleNum = len(read0["Cycles"]) r1r2_split = max(r1r2_split, R1CycleNum) - median_dict: Dict[str, Dict[int, float]] = {} + median_dict: dict[str, dict[int, float]] = {} for s_name in natsorted(runs_with_reads): paired_end = True if len(run_data[s_name]["Reads"]) > 1 else False - cycle_dict: Dict[int, float] = {} + cycle_dict: dict[int, float] = {} R1CycleNum = len(run_data[s_name]["Reads"][0]["Cycles"]) for cycle in run_data[s_name]["Reads"][0]["Cycles"]: cycle_no = int(cycle["Cycle"]) @@ -691,7 +691,7 @@ def plot_base_quality_by_cycle(run_data, color_dict): median_dict.update({s_name: cycle_dict}) # Prepare plot data for mean BQ of each cycle - mean_dict: Dict[str, Dict[int, float]] = {} + mean_dict: dict[str, dict[int, float]] = {} for s_name in natsorted(runs_with_reads): paired_end = True if len(run_data[s_name]["Reads"]) > 1 else False # Update each sample cycle info diff --git a/multiqc/modules/bases2fastq/plot_samples.py b/multiqc/modules/bases2fastq/plot_samples.py index e7987135bb..4b715c6819 100644 --- a/multiqc/modules/bases2fastq/plot_samples.py +++ b/multiqc/modules/bases2fastq/plot_samples.py @@ -1,4 +1,4 @@ -from typing import Any, Dict, cast +from typing import Any, cast from natsort import natsorted @@ -153,7 +153,7 @@ def tabulate_sample_stats(sample_data, group_lookup_dict, project_lookup_dict, s } plot_name = "Sample QC Metrics Table" - plot_html = table.plot(plot_content, cast(Dict[Any, ColumnDict], headers), pconfig=pconfig) + plot_html = table.plot(plot_content, cast(dict[Any, ColumnDict], headers), pconfig=pconfig) anchor = "sample_qc_metrics_table" description = "QC metrics per unique sample" helptext = """ @@ -177,7 +177,7 @@ def sequence_content_plot(sample_data, group_lookup_dict, project_lookup_dict, c """Create the epic HTML for the FastQC sequence content heatmap""" samples_with_reads = [s for s in sample_data if _sample_has_reads(sample_data[s])] if not samples_with_reads: - empty_data: Dict[str, Dict[int, Any]] = {} + empty_data: dict[str, dict[int, Any]] = {} plot_html = linegraph.plot( empty_data, pconfig={ @@ -190,7 +190,7 @@ def sequence_content_plot(sample_data, group_lookup_dict, project_lookup_dict, c return plot_html, "Per Cycle Base Content", "base_content", "", "", empty_data # Prep the data - data: Dict[str, Dict[int, Any]] = {} + data: dict[str, dict[int, Any]] = {} r1r2_split = 0 for s_name in natsorted(samples_with_reads): @@ -259,7 +259,7 @@ def sequence_content_plot(sample_data, group_lookup_dict, project_lookup_dict, c def plot_per_cycle_N_content(sample_data, group_lookup_dict, project_lookup_dict, color_dict): samples_with_reads = [s for s in sample_data if _sample_has_reads(sample_data[s])] if not samples_with_reads: - empty_data: Dict[str, Dict[int, float]] = {} + empty_data: dict[str, dict[int, float]] = {} plot_html = linegraph.plot( empty_data, pconfig={ @@ -271,7 +271,7 @@ def plot_per_cycle_N_content(sample_data, group_lookup_dict, project_lookup_dict ) return plot_html, "Per Cycle N Content", "n_content", "", "", empty_data - data: Dict[str, Dict[int, float]] = {} + data: dict[str, dict[int, float]] = {} r1r2_split = 0 for s_name in natsorted(samples_with_reads): data[s_name] = {} @@ -344,7 +344,7 @@ def plot_per_read_gc_hist(sample_data, group_lookup_dict, project_lookup_dict, s """ samples_with_reads = [s for s in sample_data if _sample_has_reads(sample_data[s])] if not samples_with_reads: - empty_gc_hist: Dict[str, Dict[float, float]] = {} + empty_gc_hist: dict[str, dict[float, float]] = {} plot_html = linegraph.plot( empty_gc_hist, pconfig={ @@ -356,7 +356,7 @@ def plot_per_read_gc_hist(sample_data, group_lookup_dict, project_lookup_dict, s ) return plot_html, "Per Sample GC Histogram", "gc_histogram", "", "", empty_gc_hist - gc_hist_dict: Dict[str, Dict[float, float]] = {} + gc_hist_dict: dict[str, dict[float, float]] = {} for s_name in natsorted(samples_with_reads): r0 = sample_data[s_name]["Reads"][0] if "PerReadGCCountHistogram" not in r0: @@ -421,7 +421,7 @@ def plot_adapter_content(sample_data, group_lookup_dict, project_lookup_dict, sa """ samples_with_reads = [s for s in sample_data if _sample_has_reads(sample_data[s])] if not samples_with_reads: - empty_content: Dict[str, Dict[int, float]] = {} + empty_content: dict[str, dict[int, float]] = {} plot_html = linegraph.plot( empty_content, pconfig={ @@ -433,7 +433,7 @@ def plot_adapter_content(sample_data, group_lookup_dict, project_lookup_dict, sa ) return plot_html, "Per Sample Adapter Content", "adapter_content", "", "", empty_content - plot_content: Dict[str, Dict[int, float]] = {} + plot_content: dict[str, dict[int, float]] = {} r1r2_split = 0 for s_name in natsorted(samples_with_reads): From eee4169c370fbca2e145e25fc05b2626aad7af32 Mon Sep 17 00:00:00 2001 From: Carlos Ruiz Date: Sat, 28 Mar 2026 08:13:31 -0700 Subject: [PATCH 2/3] Lint --- multiqc/modules/bases2fastq/bases2fastq.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/multiqc/modules/bases2fastq/bases2fastq.py b/multiqc/modules/bases2fastq/bases2fastq.py index f2c9504634..7fe21a1084 100644 --- a/multiqc/modules/bases2fastq/bases2fastq.py +++ b/multiqc/modules/bases2fastq/bases2fastq.py @@ -1063,7 +1063,9 @@ def add_run_plots(self, data: dict[Any, Any], plot_functions: list[Callable]) -> for func in plot_functions: plot_html, plot_name, anchor, description, helptext, plot_data = func(data, self.run_color) if plot_html is not None: - self.add_section(name=plot_name, plot=plot_html, anchor=anchor, description=description, helptext=helptext) + self.add_section( + name=plot_name, plot=plot_html, anchor=anchor, description=description, helptext=helptext + ) self.write_data_file(plot_data, f"base2fastq:{plot_name}") def add_sample_plots( @@ -1083,5 +1085,7 @@ def add_sample_plots( data, group_lookup, project_lookup, self.sample_color ) if plot_html is not None: - self.add_section(name=plot_name, plot=plot_html, anchor=anchor, description=description, helptext=helptext) + self.add_section( + name=plot_name, plot=plot_html, anchor=anchor, description=description, helptext=helptext + ) self.write_data_file(plot_data, f"base2fastq:{plot_name}") From 3e7bf3ad566172ce0be86e3082cbe289d5f89b0d Mon Sep 17 00:00:00 2001 From: Carlos Ruiz Date: Sat, 28 Mar 2026 08:21:32 -0700 Subject: [PATCH 3/3] Reverted new typing syntax --- multiqc/modules/bases2fastq/bases2fastq.py | 140 ++++++++++---------- multiqc/modules/bases2fastq/plot_runs.py | 28 ++-- multiqc/modules/bases2fastq/plot_samples.py | 20 +-- 3 files changed, 94 insertions(+), 94 deletions(-) diff --git a/multiqc/modules/bases2fastq/bases2fastq.py b/multiqc/modules/bases2fastq/bases2fastq.py index 7fe21a1084..a42b075bd7 100644 --- a/multiqc/modules/bases2fastq/bases2fastq.py +++ b/multiqc/modules/bases2fastq/bases2fastq.py @@ -7,7 +7,7 @@ from collections import defaultdict from itertools import chain from pathlib import Path -from typing import Any, Callable +from typing import Any, Callable, Dict, List, Optional, Tuple from natsort import natsorted @@ -123,11 +123,11 @@ class MultiqcModule(BaseMultiqcModule): Data Structures --------------- - - `run_level_data`: dict[run_name, run_stats] - Run-level QC metrics - - `run_level_samples`: dict[sample_id, sample_stats] - Sample metrics from run-level - - `project_level_data`: dict[project_name, project_stats] - Project-level QC metrics - - `project_level_samples`: dict[sample_id, sample_stats] - Sample metrics from project-level - - `*_samples_to_project`: dict[sample_id, project_name] - Maps samples to their projects + - `run_level_data`: Dict[run_name, run_stats] - Run-level QC metrics + - `run_level_samples`: Dict[sample_id, sample_stats] - Sample metrics from run-level + - `project_level_data`: Dict[project_name, project_stats] - Project-level QC metrics + - `project_level_samples`: Dict[sample_id, sample_stats] - Sample metrics from project-level + - `*_samples_to_project`: Dict[sample_id, project_name] - Maps samples to their projects Sample Naming Convention ------------------------ @@ -200,24 +200,24 @@ def _init_data_structures(self) -> None: """ # File cache to avoid reading the same JSON files multiple times # Key: resolved file path, Value: parsed JSON data - self._file_cache: dict[str, Any] = {} + self._file_cache: Dict[str, Any] = {} # === Run-level data structures === # Populated from /RunStats.json - self.run_level_data: dict[str, Any] = {} # run_name -> full run stats - self.run_level_samples: dict[str, Any] = {} # sample_id -> sample stats - self.run_level_samples_to_project: dict[str, str] = {} # sample_id -> project name + self.run_level_data: Dict[str, Any] = {} # run_name -> full run stats + self.run_level_samples: Dict[str, Any] = {} # sample_id -> sample stats + self.run_level_samples_to_project: Dict[str, str] = {} # sample_id -> project name # === Project-level data structures === # Populated from /Samples//RunStats.json - self.project_level_data: dict[str, Any] = {} # project_name -> project stats - self.project_level_samples: dict[str, Any] = {} # sample_id -> sample stats - self.project_level_samples_to_project: dict[str, str] = {} # sample_id -> project name + self.project_level_data: Dict[str, Any] = {} # project_name -> project stats + self.project_level_samples: Dict[str, Any] = {} # sample_id -> sample stats + self.project_level_samples_to_project: Dict[str, str] = {} # sample_id -> project name # === Grouping structures for color assignment === - self.group_dict: dict[str, Any] = {} # group_name -> list of members - self.group_lookup_dict: dict[str, Any] = {} # item -> group it belongs to - self.project_lookup_dict: dict[str, Any] = {} # sample -> project mapping + self.group_dict: Dict[str, Any] = {} # group_name -> list of members + self.group_lookup_dict: Dict[str, Any] = {} # item -> group it belongs to + self.project_lookup_dict: Dict[str, Any] = {} # sample -> project mapping def _validate_path(self, file_path: Path, base_directory: Path) -> bool: """ @@ -243,7 +243,7 @@ def _validate_path(self, file_path: Path, base_directory: Path) -> bool: ) return False - def _read_json_file(self, file_path: Path, base_directory: Path | None = None) -> dict[str, Any] | None: + def _read_json_file(self, file_path: Path, base_directory: Optional[Path] = None) -> Optional[Dict[str, Any]]: """ Read and parse a JSON file with caching. @@ -371,8 +371,8 @@ def _determine_summary_path(self) -> str: def _select_data_by_summary_path( self, summary_path: str - ) -> tuple[ - dict[str, Any], dict[str, Any], dict[str, str], dict[str, Any], dict[str, Any], dict[int, dict[str, Any]] + ) -> Tuple[ + Dict[str, Any], Dict[str, Any], Dict[str, str], Dict[str, Any], Dict[str, Any], Dict[int, Dict[str, Any]] ]: """ Select the appropriate data sources based on the summary path. @@ -418,13 +418,13 @@ def _select_data_by_summary_path( raise ModuleNoSamplesFound(error_msg) def _setup_colors( - self, sample_data: dict[str, Any], samples_to_projects: dict[str, str], summary_path: str + self, sample_data: Dict[str, Any], samples_to_projects: Dict[str, str], summary_path: str ) -> None: """Set up color schemes for groups and samples.""" # Create run and project groups - run_groups: dict[str, list] = defaultdict(list) - project_groups: dict[str, list] = defaultdict(list) - ind_sample_groups: dict[str, list] = defaultdict(list) + run_groups: Dict[str, List] = defaultdict(list) + project_groups: Dict[str, List] = defaultdict(list) + ind_sample_groups: Dict[str, List] = defaultdict(list) for sample in natsorted(sample_data.keys()): run_name, _ = sample.split("__", maxsplit=1) @@ -457,7 +457,7 @@ def _setup_colors( } # Assign colors to samples - self.sample_color: dict[str, str] = {} + self.sample_color: Dict[str, str] = {} for sample_name in natsorted(samples_to_projects.keys()): if summary_path == "project_level" or len(project_groups) == 1: sample_color = self.group_color[sample_name] @@ -472,12 +472,12 @@ def _setup_colors( def _generate_plots( self, summary_path: str, - run_data: dict[str, Any], - sample_data: dict[str, Any], - samples_to_projects: dict[str, str], - manifest_data: dict[str, Any], - index_assignment_data: dict[str, Any], - unassigned_sequences: dict[int, dict[str, Any]], + run_data: Dict[str, Any], + sample_data: Dict[str, Any], + samples_to_projects: Dict[str, str], + manifest_data: Dict[str, Any], + index_assignment_data: Dict[str, Any], + unassigned_sequences: Dict[int, Dict[str, Any]], ) -> None: """Generate all plots and add sections to the report.""" # QC metrics table @@ -514,9 +514,9 @@ def get_uuid(self) -> str: def _extract_run_analysis_name( self, - data: dict[str, Any], + data: Dict[str, Any], source_info: str = "RunStats.json", - ) -> str | None: + ) -> Optional[str]: """ Extract and validate run_analysis_name from data dict. @@ -542,8 +542,8 @@ def _extract_run_analysis_name( return f"{run_name}-{analysis_id[0:4]}" def _parse_run_project_data( - self, data_source: str, log_files: list[LoadedFileDict[Any]] | None = None - ) -> tuple[dict[str, Any], dict[str, Any], dict[str, str]]: + self, data_source: str, log_files: Optional[List[LoadedFileDict[Any]]] = None + ) -> Tuple[Dict[str, Any], Dict[str, Any], Dict[str, str]]: """ Parse RunStats.json files to extract run/project and sample-level data. @@ -557,16 +557,16 @@ def _parse_run_project_data( Returns: Tuple of: - - runs_global_data: dict[run_name, run_stats] - Run/project level metrics - - runs_sample_data: dict[sample_id, sample_stats] - Per-sample metrics - - sample_to_project: dict[sample_id, project_name] - Sample-to-project mapping + - runs_global_data: Dict[run_name, run_stats] - Run/project level metrics + - runs_sample_data: Dict[sample_id, sample_stats] - Per-sample metrics + - sample_to_project: Dict[sample_id, project_name] - Sample-to-project mapping Data Flow: RunStats.json -> parse -> filter samples by min_polonies -> populate dicts """ - runs_global_data: dict[str, Any] = {} - runs_sample_data: dict[str, Any] = {} - sample_to_project: dict[str, str] = {} + runs_global_data: Dict[str, Any] = {} + runs_sample_data: Dict[str, Any] = {} + sample_to_project: Dict[str, str] = {} if data_source == "": return (runs_global_data, runs_sample_data, sample_to_project) @@ -627,8 +627,8 @@ def _parse_run_project_data( return (runs_global_data, runs_sample_data, sample_to_project) def _extract_manifest_lane_settings( - self, run_manifest_data: dict[str, Any], run_analysis_name: str - ) -> dict[str, dict[str, Any]]: + self, run_manifest_data: Dict[str, Any], run_analysis_name: str + ) -> Dict[str, Dict[str, Any]]: """ Extract per-lane settings from a parsed RunManifest.json Settings section. @@ -637,10 +637,10 @@ def _extract_manifest_lane_settings( run_analysis_name: Run identifier for building run_lane keys Returns: - dict[run_lane, settings] where run_lane = "{run_analysis_name} | L{lane_id}" + Dict[run_lane, settings] where run_lane = "{run_analysis_name} | L{lane_id}" and settings contain Indexing, AdapterTrimType, R1/R2AdapterMinimumTrimmedLength """ - result: dict[str, dict[str, Any]] = {} + result: Dict[str, Dict[str, Any]] = {} if "Settings" not in run_manifest_data: return result for lane_data in run_manifest_data["Settings"]: @@ -669,8 +669,8 @@ def _extract_manifest_lane_settings( return result def _parse_run_manifest( - self, data_source: str, log_files: list[LoadedFileDict[Any]] | None = None - ) -> dict[str, Any]: + self, data_source: str, log_files: Optional[List[LoadedFileDict[Any]]] = None + ) -> Dict[str, Any]: """ Parse RunManifest.json for run-level analysis to extract lane and adapter settings. @@ -684,9 +684,9 @@ def _parse_run_manifest( log_files: Optional pre-collected list of file dicts from find_log_files. Returns: - dict[run_lane, settings] where run_lane = "{run_name} | L{lane_id}" + Dict[run_lane, settings] where run_lane = "{run_name} | L{lane_id}" """ - runs_manifest_data: dict[str, dict[str, Any]] = {} + runs_manifest_data: Dict[str, Dict[str, Any]] = {} if data_source == "": return runs_manifest_data @@ -720,8 +720,8 @@ def _parse_run_manifest( return runs_manifest_data def _parse_run_manifest_in_project( - self, data_source: str, log_files: list[LoadedFileDict[Any]] | None = None - ) -> dict[str, Any]: + self, data_source: str, log_files: Optional[List[LoadedFileDict[Any]]] = None + ) -> Dict[str, Any]: """ Parse RunManifest.json for project-level analysis. @@ -733,7 +733,7 @@ def _parse_run_manifest_in_project( + ../../RunManifest.json (run-level manifest) -> Extract per-lane settings """ - project_manifest_data: dict[str, dict[str, Any]] = {} + project_manifest_data: Dict[str, Dict[str, Any]] = {} if data_source == "": return project_manifest_data @@ -784,10 +784,10 @@ def _parse_run_manifest_in_project( def _build_index_assignment_from_stats( self, - stats_dict: dict[str, Any], + stats_dict: Dict[str, Any], run_analysis_name: str, - project: str | None = None, - ) -> tuple[dict[str, dict[str, Any]], int]: + project: Optional[str] = None, + ) -> Tuple[Dict[str, Dict[str, Any]], int]: """ Build per-run index assignment dict from RunStats SampleStats/Occurrences. @@ -795,7 +795,7 @@ def _build_index_assignment_from_stats( Tuple of (run_inner_dict, total_polonies). run_inner_dict is { merged_expected_sequence -> { SampleID, SamplePolonyCounts, PercentOfPolonies, Index1, Index2, ... } } """ - run_inner: dict[str, dict[str, Any]] = {} + run_inner: Dict[str, Dict[str, Any]] = {} total_polonies = stats_dict.get("NumPoloniesBeforeTrimming", 0) if "SampleStats" not in stats_dict: return (run_inner, total_polonies) @@ -812,7 +812,7 @@ def _build_index_assignment_from_stats( log.error(f"Missing data needed to extract index assignment for sample {sample_id}. Skipping.") continue if sample_expected_seq not in run_inner: - entry: dict[str, Any] = { + entry: Dict[str, Any] = { "SampleID": sample_id, "SamplePolonyCounts": 0, "PercentOfPolonies": float("nan"), @@ -830,8 +830,8 @@ def _build_index_assignment_from_stats( def _merge_manifest_index_sequences( self, - sample_to_index_assignment: dict[str, Any], - run_manifest_data: dict[str, Any], + sample_to_index_assignment: Dict[str, Any], + run_manifest_data: Dict[str, Any], run_analysis_name: str, ) -> None: """Merge Index1/Index2 from RunManifest Samples into sample_to_index_assignment (mutates).""" @@ -860,8 +860,8 @@ def _merge_manifest_index_sequences( run_data[merged_indices]["Index2"] = index_2 def _parse_run_unassigned_sequences( - self, data_source: str, log_files: list[LoadedFileDict[Any]] | None = None - ) -> dict[int, dict[str, Any]]: + self, data_source: str, log_files: Optional[List[LoadedFileDict[Any]]] = None + ) -> Dict[int, Dict[str, Any]]: """ Parse unassigned/unknown barcode sequences from run-level data. @@ -872,7 +872,7 @@ def _parse_run_unassigned_sequences( RunStats.json -> Lanes -> UnassignedSequences -> Extract: sequence, count, percentage of total polonies """ - run_unassigned_sequences: dict[int, dict[str, Any]] = {} + run_unassigned_sequences: Dict[int, Dict[str, Any]] = {} if data_source == "": return run_unassigned_sequences @@ -922,8 +922,8 @@ def _parse_run_unassigned_sequences( return run_unassigned_sequences def _parse_index_assignment( - self, manifest_data_source: str, log_files: list[LoadedFileDict[Any]] | None = None - ) -> dict[str, Any]: + self, manifest_data_source: str, log_files: Optional[List[LoadedFileDict[Any]]] = None + ) -> Dict[str, Any]: """ Parse index assignment statistics for run-level analysis. @@ -935,7 +935,7 @@ def _parse_index_assignment( + RunManifest.json -> Samples -> index sequences (Index1, Index2) -> Combined index assignment table """ - sample_to_index_assignment: dict[str, dict[str, dict[str, Any]]] = {} + sample_to_index_assignment: Dict[str, Dict[str, Dict[str, Any]]] = {} if manifest_data_source == "": return sample_to_index_assignment @@ -987,8 +987,8 @@ def _parse_index_assignment( return sample_to_index_assignment def _parse_index_assignment_in_project( - self, data_source: str, log_files: list[LoadedFileDict[Any]] | None = None - ) -> dict[str, Any]: + self, data_source: str, log_files: Optional[List[LoadedFileDict[Any]]] = None + ) -> Dict[str, Any]: """ Parse index assignment statistics for project-level analysis. @@ -1000,7 +1000,7 @@ def _parse_index_assignment_in_project( + ../../RunManifest.json -> Samples -> index sequences -> Combined index assignment table """ - sample_to_index_assignment: dict[str, dict[str, dict[str, Any]]] = {} + sample_to_index_assignment: Dict[str, Dict[str, Dict[str, Any]]] = {} if data_source == "": return sample_to_index_assignment @@ -1057,7 +1057,7 @@ def _parse_index_assignment_in_project( return sample_to_index_assignment - def add_run_plots(self, data: dict[Any, Any], plot_functions: list[Callable]) -> None: + def add_run_plots(self, data: Dict[Any, Any], plot_functions: List[Callable]) -> None: if not data: return for func in plot_functions: @@ -1069,11 +1069,11 @@ def add_run_plots(self, data: dict[Any, Any], plot_functions: list[Callable]) -> self.write_data_file(plot_data, f"base2fastq:{plot_name}") def add_sample_plots( - self, data: dict[str, Any], group_lookup: dict[str, str], project_lookup: dict[str, str] + self, data: Dict[str, Any], group_lookup: Dict[str, str], project_lookup: Dict[str, str] ) -> None: if not data: return - plot_functions: list[Callable] = [ + plot_functions: List[Callable] = [ tabulate_sample_stats, sequence_content_plot, plot_per_cycle_N_content, diff --git a/multiqc/modules/bases2fastq/plot_runs.py b/multiqc/modules/bases2fastq/plot_runs.py index c8ccbbde20..5aff1739a7 100644 --- a/multiqc/modules/bases2fastq/plot_runs.py +++ b/multiqc/modules/bases2fastq/plot_runs.py @@ -1,5 +1,5 @@ import math -from typing import Any, cast +from typing import Any, Dict, List, cast from multiqc.plots import bargraph, linegraph, table from multiqc.plots.table_object import ColumnDict, SectionT @@ -212,7 +212,7 @@ def tabulate_project_stats(run_data, color_dict): first_key = run_keys[0] project_header = f"{run_data[first_key]['Project']} | " plot_name = f"{project_header}Sequencing QC Metrics Table" - plot_html = table.plot(plot_content, cast(dict[Any, ColumnDict], headers), pconfig=pconfig) + plot_html = table.plot(plot_content, cast(Dict[Any, ColumnDict], headers), pconfig=pconfig) anchor = "project_run_qc_metrics_table" description = "QC metrics per run, per project" helptext = """ @@ -339,7 +339,7 @@ def tabulate_run_stats(run_data, color_dict): } plot_name = "Sequencing Run QC Metrics Table" - plot_html = table.plot(plot_content, cast(dict[Any, ColumnDict], headers), pconfig=pconfig) + plot_html = table.plot(plot_content, cast(Dict[Any, ColumnDict], headers), pconfig=pconfig) anchor = "run_qc_metrics_table" description = "QC metrics per run" helptext = """ @@ -398,7 +398,7 @@ def tabulate_manifest_stats(run_data, color_dict): } plot_name = "Run Manifest Table" - plot_html = table.plot(plot_content, cast(dict[Any, ColumnDict], headers), pconfig=pconfig) + plot_html = table.plot(plot_content, cast(Dict[Any, ColumnDict], headers), pconfig=pconfig) anchor = "run_manifest_metrics_table" description = "Run parameters used." helptext = """ @@ -437,7 +437,7 @@ def tabulate_index_assignment_stats(run_data, color_dict): plot_content.update({index: sample_index_stats}) index += 1 - headers: dict[str, Any] = {} + headers: Dict[str, Any] = {} headers["run_name"] = { "title": "Run Name", "description": "Run Name.", @@ -480,7 +480,7 @@ def tabulate_index_assignment_stats(run_data, color_dict): } plot_name = "Index Assignment Metrics" - plot_html = table.plot(cast(SectionT, plot_content), cast(dict[Any, ColumnDict], headers), pconfig=pconfig) + plot_html = table.plot(cast(SectionT, plot_content), cast(Dict[Any, ColumnDict], headers), pconfig=pconfig) anchor = "index_assignment_metrics" description = "Index assignment metrics." helptext = """ @@ -506,7 +506,7 @@ def tabulate_unassigned_index_stats(run_data, color_dict): - Polonies - % Polonies """ - headers: dict[str, Any] = {} + headers: Dict[str, Any] = {} headers["Run Name"] = { "title": "Run Name", "description": "Run Name (Run ID + Analysis ID).", @@ -544,7 +544,7 @@ def tabulate_unassigned_index_stats(run_data, color_dict): } plot_name = "Unassigned Indices Metrics" - plot_html = table.plot(cast(SectionT, run_data), cast(dict[Any, ColumnDict], headers), pconfig=pconfig) + plot_html = table.plot(cast(SectionT, run_data), cast(Dict[Any, ColumnDict], headers), pconfig=pconfig) anchor = "index_unassignment_metrics" description = "Index unassignment metrics." helptext = """ @@ -572,7 +572,7 @@ def _run_has_reads(run_entry: dict) -> bool: def plot_base_quality_hist(run_data, color_dict): # Prepare plot data for per base BQ histogram (skip runs without Reads) - bq_hist_dict: dict[str, dict[int, float]] = {} + bq_hist_dict: Dict[str, Dict[int, float]] = {} for s_name in natsorted(run_data.keys()): if not _run_has_reads(run_data[s_name]): continue @@ -588,7 +588,7 @@ def plot_base_quality_hist(run_data, color_dict): bq_hist_dict[s_name].update({quality: R1R2_base_quality_counts[quality] / total_bases * 100}) # Prepare plot data for per read average BQ histogram - per_read_quality_hist_dict: dict[str, dict[int, float]] = {} + per_read_quality_hist_dict: Dict[str, Dict[int, float]] = {} for s_name in natsorted(run_data.keys()): if not _run_has_reads(run_data[s_name]): continue @@ -655,7 +655,7 @@ def plot_base_quality_by_cycle(run_data, color_dict): # Prepare plot data for median BQ of each cycle (skip runs without Reads/Cycles) runs_with_reads = [s for s in run_data if _run_has_reads(run_data[s]) and run_data[s]["Reads"][0].get("Cycles")] if not runs_with_reads: - plot_content: list[Any] = [] + plot_content: List[Any] = [] plot_html = linegraph.plot( plot_content, pconfig={"id": "bases2fastq_run_bq_by_cycle", "title": "bases2fastq: Run Base Quality by Cycle"}, @@ -676,10 +676,10 @@ def plot_base_quality_by_cycle(run_data, color_dict): R1CycleNum = len(read0["Cycles"]) r1r2_split = max(r1r2_split, R1CycleNum) - median_dict: dict[str, dict[int, float]] = {} + median_dict: Dict[str, Dict[int, float]] = {} for s_name in natsorted(runs_with_reads): paired_end = True if len(run_data[s_name]["Reads"]) > 1 else False - cycle_dict: dict[int, float] = {} + cycle_dict: Dict[int, float] = {} R1CycleNum = len(run_data[s_name]["Reads"][0]["Cycles"]) for cycle in run_data[s_name]["Reads"][0]["Cycles"]: cycle_no = int(cycle["Cycle"]) @@ -691,7 +691,7 @@ def plot_base_quality_by_cycle(run_data, color_dict): median_dict.update({s_name: cycle_dict}) # Prepare plot data for mean BQ of each cycle - mean_dict: dict[str, dict[int, float]] = {} + mean_dict: Dict[str, Dict[int, float]] = {} for s_name in natsorted(runs_with_reads): paired_end = True if len(run_data[s_name]["Reads"]) > 1 else False # Update each sample cycle info diff --git a/multiqc/modules/bases2fastq/plot_samples.py b/multiqc/modules/bases2fastq/plot_samples.py index 4b715c6819..e7987135bb 100644 --- a/multiqc/modules/bases2fastq/plot_samples.py +++ b/multiqc/modules/bases2fastq/plot_samples.py @@ -1,4 +1,4 @@ -from typing import Any, cast +from typing import Any, Dict, cast from natsort import natsorted @@ -153,7 +153,7 @@ def tabulate_sample_stats(sample_data, group_lookup_dict, project_lookup_dict, s } plot_name = "Sample QC Metrics Table" - plot_html = table.plot(plot_content, cast(dict[Any, ColumnDict], headers), pconfig=pconfig) + plot_html = table.plot(plot_content, cast(Dict[Any, ColumnDict], headers), pconfig=pconfig) anchor = "sample_qc_metrics_table" description = "QC metrics per unique sample" helptext = """ @@ -177,7 +177,7 @@ def sequence_content_plot(sample_data, group_lookup_dict, project_lookup_dict, c """Create the epic HTML for the FastQC sequence content heatmap""" samples_with_reads = [s for s in sample_data if _sample_has_reads(sample_data[s])] if not samples_with_reads: - empty_data: dict[str, dict[int, Any]] = {} + empty_data: Dict[str, Dict[int, Any]] = {} plot_html = linegraph.plot( empty_data, pconfig={ @@ -190,7 +190,7 @@ def sequence_content_plot(sample_data, group_lookup_dict, project_lookup_dict, c return plot_html, "Per Cycle Base Content", "base_content", "", "", empty_data # Prep the data - data: dict[str, dict[int, Any]] = {} + data: Dict[str, Dict[int, Any]] = {} r1r2_split = 0 for s_name in natsorted(samples_with_reads): @@ -259,7 +259,7 @@ def sequence_content_plot(sample_data, group_lookup_dict, project_lookup_dict, c def plot_per_cycle_N_content(sample_data, group_lookup_dict, project_lookup_dict, color_dict): samples_with_reads = [s for s in sample_data if _sample_has_reads(sample_data[s])] if not samples_with_reads: - empty_data: dict[str, dict[int, float]] = {} + empty_data: Dict[str, Dict[int, float]] = {} plot_html = linegraph.plot( empty_data, pconfig={ @@ -271,7 +271,7 @@ def plot_per_cycle_N_content(sample_data, group_lookup_dict, project_lookup_dict ) return plot_html, "Per Cycle N Content", "n_content", "", "", empty_data - data: dict[str, dict[int, float]] = {} + data: Dict[str, Dict[int, float]] = {} r1r2_split = 0 for s_name in natsorted(samples_with_reads): data[s_name] = {} @@ -344,7 +344,7 @@ def plot_per_read_gc_hist(sample_data, group_lookup_dict, project_lookup_dict, s """ samples_with_reads = [s for s in sample_data if _sample_has_reads(sample_data[s])] if not samples_with_reads: - empty_gc_hist: dict[str, dict[float, float]] = {} + empty_gc_hist: Dict[str, Dict[float, float]] = {} plot_html = linegraph.plot( empty_gc_hist, pconfig={ @@ -356,7 +356,7 @@ def plot_per_read_gc_hist(sample_data, group_lookup_dict, project_lookup_dict, s ) return plot_html, "Per Sample GC Histogram", "gc_histogram", "", "", empty_gc_hist - gc_hist_dict: dict[str, dict[float, float]] = {} + gc_hist_dict: Dict[str, Dict[float, float]] = {} for s_name in natsorted(samples_with_reads): r0 = sample_data[s_name]["Reads"][0] if "PerReadGCCountHistogram" not in r0: @@ -421,7 +421,7 @@ def plot_adapter_content(sample_data, group_lookup_dict, project_lookup_dict, sa """ samples_with_reads = [s for s in sample_data if _sample_has_reads(sample_data[s])] if not samples_with_reads: - empty_content: dict[str, dict[int, float]] = {} + empty_content: Dict[str, Dict[int, float]] = {} plot_html = linegraph.plot( empty_content, pconfig={ @@ -433,7 +433,7 @@ def plot_adapter_content(sample_data, group_lookup_dict, project_lookup_dict, sa ) return plot_html, "Per Sample Adapter Content", "adapter_content", "", "", empty_content - plot_content: dict[str, dict[int, float]] = {} + plot_content: Dict[str, Dict[int, float]] = {} r1r2_split = 0 for s_name in natsorted(samples_with_reads):