From 72b377bee6c3c1a7a1b65446fe79b2572a2b00c2 Mon Sep 17 00:00:00 2001 From: Carlos Ruiz Date: Sat, 21 Feb 2026 10:18:12 -0800 Subject: [PATCH 1/6] Addressed review comments --- multiqc/modules/bases2fastq/bases2fastq.py | 401 ++++++++---------- multiqc/modules/bases2fastq/plot_runs.py | 102 ++--- multiqc/modules/bases2fastq/plot_samples.py | 51 ++- .../cells2stats/cells2stats_bar_plots.py | 1 + 4 files changed, 250 insertions(+), 305 deletions(-) diff --git a/multiqc/modules/bases2fastq/bases2fastq.py b/multiqc/modules/bases2fastq/bases2fastq.py index 3c8aae9d18..3f1f212b20 100644 --- a/multiqc/modules/bases2fastq/bases2fastq.py +++ b/multiqc/modules/bases2fastq/bases2fastq.py @@ -5,10 +5,10 @@ import json import logging import random -from typing import Any, Callable, Dict, List, Optional, Tuple +from typing import Any, Callable, Dict, List, Optional, Tuple, Union import uuid from pathlib import Path - +from natsort import natsorted from multiqc import config from multiqc.base_module import BaseMultiqcModule, ModuleNoSamplesFound from multiqc.types import LoadedFileDict @@ -217,14 +217,6 @@ def _init_data_structures(self) -> None: self.group_lookup_dict: Dict[str, Any] = {} # item -> group it belongs to self.project_lookup_dict: Dict[str, Any] = {} # sample -> project mapping - # === Legacy/auxiliary data structures === - self.b2f_sample_data: Dict[str, Any] = {} - self.b2f_run_data: Dict[str, Any] = {} - self.b2f_run_project_data: Dict[str, Any] = {} - self.b2f_run_project_sample_data: Dict[str, Any] = {} - self.missing_runs: set = set() # Runs referenced but not found - self.sample_id_to_run: Dict[str, str] = {} # sample_id -> run_analysis_name - def _validate_path(self, file_path: Path, base_directory: Path) -> bool: """ Validate that a file path doesn't escape outside the expected directory hierarchy. @@ -293,23 +285,23 @@ def _parse_and_validate_data(self) -> str: Returns: summary_path: The determined summary path ('run_level', 'project_level', or 'combined_level') """ - # Check for available log files - run_level_log_files = len(list(self.find_log_files("bases2fastq/run"))) - project_level_log_files = len(list(self.find_log_files("bases2fastq/project"))) + # Collect log files once per pattern (find_log_files returns a generator) + run_level_log_files = list(self.find_log_files("bases2fastq/run")) + project_level_log_files = list(self.find_log_files("bases2fastq/project")) - if run_level_log_files == 0 and project_level_log_files == 0: + if len(run_level_log_files) == 0 and len(project_level_log_files) == 0: error_msg = "No run- or project-level log files found within the Bases2Fastq results." log.error(error_msg) raise ModuleNoSamplesFound(error_msg) # Parse data from available sources - if run_level_log_files > 0: + if len(run_level_log_files) > 0: (self.run_level_data, self.run_level_samples, self.run_level_samples_to_project) = ( - self._parse_run_project_data("bases2fastq/run") + self._parse_run_project_data("bases2fastq/run", log_files=run_level_log_files) ) - if project_level_log_files > 0: + if len(project_level_log_files) > 0: (self.project_level_data, self.project_level_samples, self.project_level_samples_to_project) = ( - self._parse_run_project_data("bases2fastq/project") + self._parse_run_project_data("bases2fastq/project", log_files=project_level_log_files) ) # Count samples @@ -332,6 +324,9 @@ def _parse_and_validate_data(self) -> str: # Determine summary path summary_path = self._determine_summary_path() + # Required call to confirm module is used (after confirming data was found) + self.add_software_version(None) + # Log what was found log.info(f"Found {len(self.run_level_data)} run(s) within the Bases2Fastq results.") log.info(f"Found {len(self.project_level_data)} project(s) within the Bases2Fastq results.") @@ -340,9 +335,6 @@ def _parse_and_validate_data(self) -> str: else: log.info(f"Found {num_project_level_samples} sample(s) within the Bases2Fastq results.") - # Required call to confirm module is used - self.add_software_version(None) - # Warn if no data found if len(self.run_level_data) == 0 and len(self.project_level_data) == 0: log.warning("No run/project stats found!") @@ -374,7 +366,9 @@ def _determine_summary_path(self) -> str: def _select_data_by_summary_path( self, summary_path: str - ) -> Tuple[Dict[str, Any], Dict[str, Any], Dict[str, str], Dict[str, Any], Dict[str, Any], Dict[str, Any]]: + ) -> Tuple[ + Dict[str, Any], Dict[str, Any], Dict[str, str], Dict[str, Any], Dict[str, Any], Dict[int, Dict[str, Any]] + ]: """ Select the appropriate data sources based on the summary path. @@ -421,13 +415,14 @@ def _setup_colors( # Create run and project groups run_groups: Dict[str, List] = defaultdict(list) project_groups: Dict[str, List] = defaultdict(list) + # Only populated when summary_path == "project_level"; empty for run_level/combined_level in_project_sample_groups: Dict[str, List] = defaultdict(list) ind_sample_groups: Dict[str, List] = defaultdict(list) - for sample in sample_data.keys(): + for sample in natsorted(sample_data.keys()): run_name, _ = sample.split("__") run_groups[run_name].append(sample) - sample_project = samples_to_projects[sample] + sample_project = samples_to_projects.get(sample, "DefaultProject") project_groups[sample_project].append(sample) ind_sample_groups[sample] = [sample] if summary_path == "project_level": @@ -458,7 +453,7 @@ def _setup_colors( # Assign colors to samples self.sample_color: Dict[str, str] = {} - for sample_name in samples_to_projects.keys(): + for sample_name in natsorted(samples_to_projects.keys()): if summary_path == "project_level" or len(project_groups) == 1: sample_color = self.group_color[sample_name] else: @@ -477,7 +472,7 @@ def _generate_plots( samples_to_projects: Dict[str, str], manifest_data: Dict[str, Any], index_assignment_data: Dict[str, Any], - unassigned_sequences: Dict[str, Any], + unassigned_sequences: Dict[int, Dict[str, Any]], ) -> None: """Generate all plots and add sections to the report.""" # QC metrics table @@ -541,7 +536,9 @@ def _extract_run_analysis_name( return f"{run_name}-{analysis_id[0:4]}" - def _parse_run_project_data(self, data_source: str) -> List[Dict[str, Any]]: + def _parse_run_project_data( + self, data_source: str, log_files: Optional[List[LoadedFileDict[Any]]] = None + ) -> Tuple[Dict[str, Any], Dict[str, Any], Dict[str, str]]: """ Parse RunStats.json files to extract run/project and sample-level data. @@ -550,9 +547,11 @@ def _parse_run_project_data(self, data_source: str) -> List[Dict[str, Any]]: Args: data_source: Search pattern key ("bases2fastq/run" or "bases2fastq/project") + log_files: Optional pre-collected list of file dicts from find_log_files. + When provided, used instead of calling find_log_files again. Returns: - List containing: + Tuple of: - runs_global_data: Dict[run_name, run_stats] - Run/project level metrics - runs_sample_data: Dict[sample_id, sample_stats] - Per-sample metrics - sample_to_project: Dict[sample_id, project_name] - Sample-to-project mapping @@ -560,13 +559,14 @@ def _parse_run_project_data(self, data_source: str) -> List[Dict[str, Any]]: Data Flow: RunStats.json -> parse -> filter samples by min_polonies -> populate dicts """ - runs_global_data = {} - runs_sample_data = {} - sample_to_project = {} + runs_global_data: Dict[str, Any] = {} + runs_sample_data: Dict[str, Any] = {} + sample_to_project: Dict[str, str] = {} if data_source == "": - return [runs_global_data, runs_sample_data, sample_to_project] + return (runs_global_data, runs_sample_data, sample_to_project) - for f in self.find_log_files(data_source): + files_to_process = log_files if log_files is not None else list(self.find_log_files(data_source)) + for f in files_to_process: data = json.loads(f["f"]) # Copy incomind data and reset samples to include only desired @@ -619,7 +619,53 @@ def _parse_run_project_data(self, data_source: str) -> List[Dict[str, Any]]: self.add_data_source(f=f, s_name=run_analysis_name, module="bases2fastq") - return [runs_global_data, runs_sample_data, sample_to_project] + return (runs_global_data, runs_sample_data, sample_to_project) + + def _extract_manifest_lane_settings( + self, run_manifest_data: Dict[str, Any], run_analysis_name: str + ) -> Dict[str, Dict[str, Any]]: + """ + Extract per-lane settings from a parsed RunManifest.json Settings section. + + Args: + run_manifest_data: Parsed RunManifest.json (must contain "Settings" list) + run_analysis_name: Run identifier for building run_lane keys + + Returns: + Dict[run_lane, settings] where run_lane = "{run_analysis_name} | L{lane_id}" + and settings contain Indexing, AdapterTrimType, R1/R2AdapterMinimumTrimmedLength + """ + result: Dict[str, Dict[str, Any]] = {} + if "Settings" not in run_manifest_data: + return result + for lane_data in run_manifest_data["Settings"]: + lane_id = lane_data.get("Lane") + if not lane_id: + log.error(" not found in Settings section of RunManifest. Skipping lanes.") + continue + lane_name = f"L{lane_id}" + run_lane = f"{run_analysis_name} | {lane_name}" + result[run_lane] = {} + + indices = [] + indices_cycles = [] + mask_pattern = re.compile(r"^I\d+Mask$") + matching_keys = [key for key in lane_data.keys() if mask_pattern.match(key)] + for key in matching_keys: + for mask_info in lane_data[key]: + if mask_info["Read"] not in indices: + indices.append(mask_info["Read"]) + indices_cycles.append(str(len(mask_info["Cycles"]))) + indexing = f"{' + '.join(indices_cycles)}
{' + '.join(indices)}" + result[run_lane]["Indexing"] = indexing + result[run_lane]["AdapterTrimType"] = lane_data.get("AdapterTrimType", "N/A") + result[run_lane]["R1AdapterMinimumTrimmedLength"] = lane_data.get( + "R1AdapterMinimumTrimmedLength", "N/A" + ) + result[run_lane]["R2AdapterMinimumTrimmedLength"] = lane_data.get( + "R2AdapterMinimumTrimmedLength", "N/A" + ) + return result def _parse_run_manifest(self, data_source: str) -> Dict[str, Any]: """ @@ -636,7 +682,7 @@ def _parse_run_manifest(self, data_source: str) -> Dict[str, Any]: Returns: Dict[run_lane, settings] where run_lane = "{run_name} | L{lane_id}" """ - runs_manifest_data = {} + runs_manifest_data: Dict[str, Dict[str, Any]] = {} if data_source == "": return runs_manifest_data @@ -662,34 +708,9 @@ def _parse_run_manifest(self, data_source: str) -> Dict[str, Any]: f" section not found in {directory}/RunManifest.json.\nSkipping RunManifest metrics." ) else: - for lane_data in run_manifest["Settings"]: - lane_id = lane_data.get("Lane") - if not lane_id: - log.error(" not found in Settings section of RunManifest. Skipping lanes.") - continue - lane_name = f"L{lane_id}" - run_lane = f"{run_analysis_name} | {lane_name}" - runs_manifest_data[run_lane] = {} - - indices = [] - indices_cycles = [] - mask_pattern = re.compile(r"^I\d+Mask$") - matching_keys = [key for key in lane_data.keys() if mask_pattern.match(key)] - for key in matching_keys: - for mask_info in lane_data[key]: - if mask_info["Read"] not in indices: - indices.append(mask_info["Read"]) - indices_cycles.append(str(len(mask_info["Cycles"]))) - indexing = f"{' + '.join(indices_cycles)}
{' + '.join(indices)}" - runs_manifest_data[run_lane]["Indexing"] = indexing - - runs_manifest_data[run_lane]["AdapterTrimType"] = lane_data.get("AdapterTrimType", "N/A") - runs_manifest_data[run_lane]["R1AdapterMinimumTrimmedLength"] = lane_data.get( - "R1AdapterMinimumTrimmedLength", "N/A" - ) - runs_manifest_data[run_lane]["R2AdapterMinimumTrimmedLength"] = lane_data.get( - "R2AdapterMinimumTrimmedLength", "N/A" - ) + runs_manifest_data.update( + self._extract_manifest_lane_settings(run_manifest, run_analysis_name) + ) self.add_data_source(f=f, s_name=run_analysis_name, module="bases2fastq") @@ -707,7 +728,7 @@ def _parse_run_manifest_in_project(self, data_source: str) -> Dict[str, Any]: + ../../RunManifest.json (run-level manifest) -> Extract per-lane settings """ - project_manifest_data = {} + project_manifest_data: Dict[str, Dict[str, Any]] = {} if data_source == "": return project_manifest_data @@ -739,34 +760,9 @@ def _parse_run_manifest_in_project(self, data_source: str) -> Dict[str, Any]: if "Settings" not in run_manifest_data: log.warning(f" section not found in {run_manifest}.\nSkipping RunManifest metrics.") else: - for lane_data in run_manifest_data["Settings"]: - lane_id = lane_data.get("Lane") - if not lane_id: - log.error(" not found in Settings section of RunManifest. Skipping lanes.") - continue - lane_name = f"L{lane_id}" - run_lane = f"{run_analysis_name} | {lane_name}" - project_manifest_data[run_lane] = {} - - indices = [] - indices_cycles = [] - mask_pattern = re.compile(r"^I\d+Mask$") - matching_keys = [key for key in lane_data.keys() if mask_pattern.match(key)] - for key in matching_keys: - for mask_info in lane_data[key]: - if mask_info["Read"] not in indices: - indices.append(mask_info["Read"]) - indices_cycles.append(str(len(mask_info["Cycles"]))) - indexing = f"{' + '.join(indices_cycles)}
{' + '.join(indices)}" - project_manifest_data[run_lane]["Indexing"] = indexing - - project_manifest_data[run_lane]["AdapterTrimType"] = lane_data.get("AdapterTrimType", "N/A") - project_manifest_data[run_lane]["R1AdapterMinimumTrimmedLength"] = lane_data.get( - "R1AdapterMinimumTrimmedLength", "N/A" - ) - project_manifest_data[run_lane]["R2AdapterMinimumTrimmedLength"] = lane_data.get( - "R2AdapterMinimumTrimmedLength", "N/A" - ) + project_manifest_data.update( + self._extract_manifest_lane_settings(run_manifest_data, run_analysis_name) + ) data_source_info: LoadedFileDict[Any] = { "fn": str(run_manifest.name), "root": str(run_manifest.parent), @@ -778,7 +774,84 @@ def _parse_run_manifest_in_project(self, data_source: str) -> Dict[str, Any]: return project_manifest_data - def _parse_run_unassigned_sequences(self, data_source: str) -> Dict[str, Any]: + def _build_index_assignment_from_stats( + self, + stats_dict: Dict[str, Any], + run_analysis_name: str, + project: Optional[str] = None, + ) -> Tuple[Dict[str, Dict[str, Any]], int]: + """ + Build per-run index assignment dict from RunStats SampleStats/Occurrences. + + Returns: + Tuple of (run_inner_dict, total_polonies). run_inner_dict is + { merged_expected_sequence -> { SampleID, SamplePolonyCounts, PercentOfPolonies, Index1, Index2, ... } } + """ + run_inner: Dict[str, Dict[str, Any]] = {} + total_polonies = stats_dict.get("NumPoloniesBeforeTrimming", 0) + if "SampleStats" not in stats_dict: + return (run_inner, total_polonies) + for sample_data in stats_dict["SampleStats"]: + sample_name = sample_data.get("SampleName") + sample_id = "__".join([run_analysis_name, sample_name]) if (run_analysis_name and sample_name) else None + if "Occurrences" not in sample_data: + log.error(f"Missing data needed to extract index assignment for sample {sample_id}. Skipping.") + continue + for occurrence in sample_data["Occurrences"]: + sample_expected_seq = occurrence.get("ExpectedSequence") + sample_counts = occurrence.get("NumPoloniesBeforeTrimming") + if any(x is None for x in [sample_expected_seq, sample_counts, sample_id]): + log.error(f"Missing data needed to extract index assignment for sample {sample_id}. Skipping.") + continue + if sample_expected_seq not in run_inner: + entry: Dict[str, Any] = { + "SampleID": sample_id, + "SamplePolonyCounts": 0, + "PercentOfPolonies": float("nan"), + "Index1": "", + "Index2": "", + } + if project is not None: + entry["Project"] = project + run_inner[sample_expected_seq] = entry + run_inner[sample_expected_seq]["SamplePolonyCounts"] += sample_counts + for entry in run_inner.values(): + if total_polonies > 0: + entry["PercentOfPolonies"] = round(entry["SamplePolonyCounts"] / total_polonies * 100, 2) + return (run_inner, total_polonies) + + def _merge_manifest_index_sequences( + self, + sample_to_index_assignment: Dict[str, Any], + run_manifest_data: Dict[str, Any], + run_analysis_name: str, + ) -> None: + """Merge Index1/Index2 from RunManifest Samples into sample_to_index_assignment (mutates).""" + if "Samples" not in run_manifest_data or run_analysis_name not in sample_to_index_assignment: + return + run_data = sample_to_index_assignment[run_analysis_name] + for sample_data in run_manifest_data["Samples"]: + sample_name = sample_data.get("SampleName") + if run_analysis_name is None or sample_name is None or "Indexes" not in sample_data: + continue + sample_id = "__".join([run_analysis_name, sample_name]) + for index_data in sample_data["Indexes"]: + index_1 = index_data.get("Index1", "") + index_2 = index_data.get("Index2", "") + merged_indices = f"{index_1}{index_2}" + if merged_indices not in run_data: + log.error(f"Index assignment information not found for sample {sample_id}. Skipping.") + continue + if sample_id != run_data[merged_indices]["SampleID"]: + log.error( + f"RunManifest SampleID <{sample_id}> does not match " + f"RunStats SampleID {run_data[merged_indices]['SampleID']}. Skipping." + ) + continue + run_data[merged_indices]["Index1"] = index_1 + run_data[merged_indices]["Index2"] = index_2 + + def _parse_run_unassigned_sequences(self, data_source: str) -> Dict[int, Dict[str, Any]]: """ Parse unassigned/unknown barcode sequences from run-level data. @@ -789,7 +862,7 @@ def _parse_run_unassigned_sequences(self, data_source: str) -> Dict[str, Any]: RunStats.json -> Lanes -> UnassignedSequences -> Extract: sequence, count, percentage of total polonies """ - run_unassigned_sequences = {} + run_unassigned_sequences: Dict[int, Dict[str, Any]] = {} if data_source == "": return run_unassigned_sequences @@ -849,7 +922,7 @@ def _parse_index_assignment(self, manifest_data_source: str) -> Dict[str, Any]: + RunManifest.json -> Samples -> index sequences (Index1, Index2) -> Combined index assignment table """ - sample_to_index_assignment = {} + sample_to_index_assignment: Dict[str, Dict[str, Dict[str, Any]]] = {} if manifest_data_source == "": return sample_to_index_assignment @@ -865,9 +938,6 @@ def _parse_index_assignment(self, manifest_data_source: str) -> Dict[str, Any]: if run_stats is None: continue - total_polonies = 0 - - # Get run name information run_analysis_name = self._extract_run_analysis_name(run_stats, source_info=str(run_stats_path)) if run_analysis_name is None: continue @@ -877,7 +947,6 @@ def _parse_index_assignment(self, manifest_data_source: str) -> Dict[str, Any]: log.info(f"Skipping <{run_analysis_name}> because it is present in ignore list.") continue - # Ensure sample stats are present if "SampleStats" not in run_stats: log.error( f"Error, missing SampleStats in RunStats.json. Skipping index assignment metrics.\n" @@ -887,43 +956,8 @@ def _parse_index_assignment(self, manifest_data_source: str) -> Dict[str, Any]: ) continue - # Extract per sample polony counts and overall total counts - total_polonies = run_stats.get("NumPoloniesBeforeTrimming", 0) - for sample_data in run_stats["SampleStats"]: - sample_name = sample_data.get("SampleName") - sample_id = None - if run_analysis_name and sample_name: - sample_id = "__".join([run_analysis_name, sample_name]) - - if "Occurrences" not in sample_data: - log.error(f"Missing data needed to extract index assignment for sample {sample_id}. Skipping.") - continue - - for occurrence in sample_data["Occurrences"]: - sample_expected_seq = occurrence.get("ExpectedSequence") - sample_counts = occurrence.get("NumPoloniesBeforeTrimming") - if any([element is None for element in [sample_expected_seq, sample_counts, sample_id]]): - log.error(f"Missing data needed to extract index assignment for sample {sample_id}. Skipping.") - continue - if run_analysis_name not in sample_to_index_assignment: - sample_to_index_assignment[run_analysis_name] = {} - if sample_expected_seq not in sample_to_index_assignment[run_analysis_name]: - sample_to_index_assignment[run_analysis_name][sample_expected_seq] = { - "SampleID": sample_id, - "SamplePolonyCounts": 0, - "PercentOfPolonies": float("nan"), - "Index1": "", - "Index2": "", - } - sample_to_index_assignment[run_analysis_name][sample_expected_seq]["SamplePolonyCounts"] += ( - sample_counts - ) - - for sample_data in sample_to_index_assignment[run_analysis_name].values(): - if total_polonies > 0: - sample_data["PercentOfPolonies"] = round( - sample_data["SamplePolonyCounts"] / total_polonies * 100, 2 - ) + run_inner, _ = self._build_index_assignment_from_stats(run_stats, run_analysis_name) + sample_to_index_assignment[run_analysis_name] = run_inner run_manifest = json.loads(f["f"]) if "Samples" not in run_manifest: @@ -934,28 +968,7 @@ def _parse_index_assignment(self, manifest_data_source: str) -> Dict[str, Any]: elif len(sample_to_index_assignment) == 0: log.warning("Index assignment data missing. Skipping creation of index assignment metrics.") else: - for sample_data in run_manifest["Samples"]: - sample_name = sample_data.get("SampleName") - sample_id = None - if run_analysis_name is None or sample_name is None or "Indexes" not in sample_data: - continue - sample_id = "__".join([run_analysis_name, sample_name]) - for index_data in sample_data["Indexes"]: - index_1 = index_data.get("Index1", "") - index_2 = index_data.get("Index2", "") - merged_indices = f"{index_1}{index_2}" - if merged_indices not in sample_to_index_assignment[run_analysis_name]: - log.error(f"Index assignment information not found for sample {sample_id}. Skipping.") - continue - if sample_id != sample_to_index_assignment[run_analysis_name][merged_indices]["SampleID"]: - log.error( - f"RunManifest SampleID <{sample_id}> does not match " - f"RunStats SampleID {sample_to_index_assignment[merged_indices]['SampleID']}." - "Skipping." - ) - continue - sample_to_index_assignment[run_analysis_name][merged_indices]["Index1"] = index_1 - sample_to_index_assignment[run_analysis_name][merged_indices]["Index2"] = index_2 + self._merge_manifest_index_sequences(sample_to_index_assignment, run_manifest, run_analysis_name) return sample_to_index_assignment @@ -971,7 +984,7 @@ def _parse_index_assignment_in_project(self, data_source: str) -> Dict[str, Any] + ../../RunManifest.json -> Samples -> index sequences -> Combined index assignment table """ - sample_to_index_assignment = {} + sample_to_index_assignment: Dict[str, Dict[str, Dict[str, Any]]] = {} if data_source == "": return sample_to_index_assignment @@ -999,7 +1012,6 @@ def _parse_index_assignment_in_project(self, data_source: str) -> Dict[str, Any] log.info(f"Skipping <{run_analysis_name}> because it is present in ignore list.") continue - # Ensure sample stats are present if "SampleStats" not in project_stats: log.error( f"Error, missing SampleStats in RunStats.json. Skipping index assignment metrics.\n" @@ -1009,45 +1021,10 @@ def _parse_index_assignment_in_project(self, data_source: str) -> Dict[str, Any] ) continue - # Extract per sample polony counts and overall total counts - total_polonies = project_stats.get("NumPoloniesBeforeTrimming", 0) - for sample_data in project_stats["SampleStats"]: - sample_name = sample_data.get("SampleName") - sample_id = None - - if run_analysis_name and sample_name: - sample_id = "__".join([run_analysis_name, sample_name]) - - if "Occurrences" not in sample_data: - log.error(f"Missing data needed to extract index assignment for sample {sample_id}. Skipping.") - continue - - for occurrence in sample_data["Occurrences"]: - sample_expected_seq = occurrence.get("ExpectedSequence") - sample_counts = occurrence.get("NumPoloniesBeforeTrimming") - if any([element is None for element in [sample_expected_seq, sample_counts, sample_id]]): - log.error(f"Missing data needed to extract index assignment for sample {sample_id}. Skipping.") - continue - if run_analysis_name not in sample_to_index_assignment: - sample_to_index_assignment[run_analysis_name] = {} - if sample_expected_seq not in sample_to_index_assignment[run_analysis_name]: - sample_to_index_assignment[run_analysis_name][sample_expected_seq] = { - "SampleID": sample_id, - "Project": project, - "SamplePolonyCounts": 0, - "PercentOfPolonies": float("nan"), - "Index1": "", - "Index2": "", - } - sample_to_index_assignment[run_analysis_name][sample_expected_seq]["SamplePolonyCounts"] += ( - sample_counts - ) - - for sample_data in sample_to_index_assignment[run_analysis_name].values(): - if total_polonies > 0: - sample_data["PercentOfPolonies"] = round( - sample_data["SamplePolonyCounts"] / total_polonies * 100, 2 - ) + run_inner, _ = self._build_index_assignment_from_stats( + project_stats, run_analysis_name, project=project + ) + sample_to_index_assignment[run_analysis_name] = run_inner run_manifest_data = self._read_json_file(run_manifest, base_directory=base_directory) if run_manifest_data is None: @@ -1061,31 +1038,13 @@ def _parse_index_assignment_in_project(self, data_source: str) -> Dict[str, Any] elif len(sample_to_index_assignment) == 0: log.warning("Index assignment data missing. Skipping creation of index assignment metrics.") else: - for sample_data in run_manifest_data["Samples"]: - sample_name = sample_data.get("SampleName") - sample_id = None - if run_analysis_name is None or sample_name is None or "Indexes" not in sample_data: - continue - sample_id = "__".join([run_analysis_name, sample_name]) - for index_data in sample_data["Indexes"]: - index_1 = index_data.get("Index1", "") - index_2 = index_data.get("Index2", "") - merged_indices = f"{index_1}{index_2}" - if merged_indices not in sample_to_index_assignment[run_analysis_name]: - continue - if sample_id != sample_to_index_assignment[run_analysis_name][merged_indices]["SampleID"]: - log.error( - f"RunManifest SampleID <{sample_id}> does not match " - f"RunStats SampleID {sample_to_index_assignment[merged_indices]['SampleID']}." - "Skipping." - ) - continue - sample_to_index_assignment[run_analysis_name][merged_indices]["Index1"] = index_1 - sample_to_index_assignment[run_analysis_name][merged_indices]["Index2"] = index_2 + self._merge_manifest_index_sequences( + sample_to_index_assignment, run_manifest_data, run_analysis_name + ) return sample_to_index_assignment - def add_run_plots(self, data: Dict[str, Any], plot_functions: List[Callable]) -> None: + def add_run_plots(self, data: Dict[Any, Any], plot_functions: List[Callable]) -> None: for func in plot_functions: plot_html, plot_name, anchor, description, helptext, plot_data = func(data, self.run_color) self.add_section(name=plot_name, plot=plot_html, anchor=anchor, description=description, helptext=helptext) diff --git a/multiqc/modules/bases2fastq/plot_runs.py b/multiqc/modules/bases2fastq/plot_runs.py index 87151b3baa..22900ee625 100644 --- a/multiqc/modules/bases2fastq/plot_runs.py +++ b/multiqc/modules/bases2fastq/plot_runs.py @@ -1,26 +1,16 @@ import math +from typing import Any, Dict, cast from multiqc.plots import bargraph, linegraph, table -from multiqc import config +from multiqc.plots.table_object import ColumnDict, SectionT from natsort import natsorted -import random -import string - -""" -Functions for plotting per run information of bases2fastq -""" - - -def generate_random_string(length: int): - return "".join(random.choices(string.ascii_letters + string.digits, k=length)) def plot_run_stats(run_data, color_dict): """ Plot a bar graph for polony numbers, Q30/Q40, index assignment rate and yields for each run """ - run_names = list(run_data.keys()) - run_names.sort() + run_names = natsorted(run_data.keys()) num_polonies = dict() yields = dict() for run in run_names: @@ -64,7 +54,7 @@ def plot_run_stats(run_data, color_dict): ], "cpswitch": True, "stacking": "normal", - "id": f"run_metrics_bar_{generate_random_string(10)}", + "id": "bases2fastq_run_metrics_bar", "title": "bases2fastq: General Sequencing Run QC metrics plot", "ylab": "QC", } @@ -119,7 +109,7 @@ def tabulate_project_stats(run_data, color_dict): plot_content = dict() is_percent_q50_present = False reads_present = [] - for s_name in run_data.keys(): + for s_name in natsorted(run_data.keys()): project = run_data[s_name]["Project"] run_project_name = f"{s_name} | {project}" run_stats = dict() @@ -210,7 +200,7 @@ def tabulate_project_stats(run_data, color_dict): pconfig = { "title": "bases2fastq: General Sequencing (Project) QC metrics", "col1_header": "Run Name", - "id": f"project_run_metrics_table_{generate_random_string(5)}", + "id": "bases2fastq_project_run_metrics_table", "ylab": "QC", } @@ -222,7 +212,7 @@ def tabulate_project_stats(run_data, color_dict): first_key = run_keys[0] project_header = f"{run_data[first_key]['Project']} | " plot_name = f"{project_header}Sequencing QC Metrics Table" - plot_html = table.plot(plot_content, headers, pconfig=pconfig) + plot_html = table.plot(plot_content, cast(Dict[Any, ColumnDict], headers), pconfig=pconfig) anchor = "project_run_qc_metrics_table" description = "QC metrics per run, per project" helptext = """ @@ -246,7 +236,7 @@ def tabulate_run_stats(run_data, color_dict): plot_content = dict() is_percent_q50_present = False reads_present = [] - for s_name in run_data.keys(): + for s_name in natsorted(run_data.keys()): run_stats = dict() run_stats.update({"num_polonies_run": int(run_data[s_name]["NumPolonies"])}) run_stats.update({"percent_assigned_run": run_data[s_name].get("PercentAssignedReads", 100.0)}) @@ -274,7 +264,7 @@ def tabulate_run_stats(run_data, color_dict): headers = {} headers["num_polonies_run"] = { "title": "# Polonies", - "description": "The total number of polonies that are calculated for the run.)", + "description": "The total number of polonies that are calculated for the run.", "min": 0, "scale": "RdYlGn", } @@ -344,12 +334,12 @@ def tabulate_run_stats(run_data, color_dict): pconfig = { "title": "Bases2Fastq: General Sequencing Run QC metrics", "col1_header": "Run Name", - "id": f"run_metrics_table_{generate_random_string(5)}", + "id": "bases2fastq_run_metrics_table", "ylab": "QC", } plot_name = "Sequencing Run QC Metrics Table" - plot_html = table.plot(plot_content, headers, pconfig=pconfig) + plot_html = table.plot(plot_content, cast(Dict[Any, ColumnDict], headers), pconfig=pconfig) anchor = "run_qc_metrics_table" description = "QC metrics per run" helptext = """ @@ -372,7 +362,7 @@ def tabulate_manifest_stats(run_data, color_dict): Tabulate general information and statistics of each run """ plot_content = dict() - for s_name in run_data.keys(): + for s_name in natsorted(run_data.keys()): run_stats = dict() run_stats.update({"indexing": run_data[s_name]["Indexing"]}) run_stats.update({"adapter_trim_type": run_data[s_name]["AdapterTrimType"]}) @@ -397,18 +387,18 @@ def tabulate_manifest_stats(run_data, color_dict): } headers["min_read_length_r2"] = { "title": "Minimum Read Length R2", - "description": "Minimum read length for read R1 (if applicable).", + "description": "Minimum read length for read R2 (if applicable).", "scale": "RdYlGn", } pconfig = { "title": "Bases2Fastq: Run Manifest Metrics", "col1_header": "Run Name | Lane", - "id": f"run_manifest_metrics_table_{generate_random_string(5)}", + "id": "bases2fastq_run_manifest_metrics_table", } plot_name = "Run Manifest Table" - plot_html = table.plot(plot_content, headers, pconfig=pconfig) + plot_html = table.plot(plot_content, cast(Dict[Any, ColumnDict], headers), pconfig=pconfig) anchor = "run_manifest_metrics_table" description = "Run parameters used." helptext = """ @@ -426,7 +416,7 @@ def tabulate_index_assignment_stats(run_data, color_dict): Tabulate general information and statistics of each run """ plot_content = dict() - run_names = sorted(run_data.keys()) + run_names = natsorted(run_data.keys()) index = 1 project_present = False for run in run_names: @@ -447,7 +437,7 @@ def tabulate_index_assignment_stats(run_data, color_dict): plot_content.update({index: sample_index_stats}) index += 1 - headers = {} + headers: Dict[str, Any] = {} headers["run_name"] = { "title": "Run Name", "description": "Run Name.", @@ -477,8 +467,8 @@ def tabulate_index_assignment_stats(run_data, color_dict): headers["polony_percentage"] = { "title": "Polony %", "description": "Percentage of total polonies assigned to this index combination.", - "max": 100, - "min": 0, + "max": 100.0, + "min": 0.0, "scale": "RdYlGn", "suffix": "%", } @@ -486,11 +476,11 @@ def tabulate_index_assignment_stats(run_data, color_dict): pconfig = { "title": "Bases2Fastq: Index Assignment Metrics", "col1_header": "Sample #", - "id": f"index_assignment_metrics_{generate_random_string(5)}", + "id": "bases2fastq_index_assignment_metrics", } plot_name = "Index Assignment Metrics" - plot_html = table.plot(plot_content, headers, pconfig=pconfig) + plot_html = table.plot(cast(SectionT, plot_content), cast(Dict[Any, ColumnDict], headers), pconfig=pconfig) anchor = "index_assignment_metrics" description = "Index assignment metrics." helptext = """ @@ -516,7 +506,7 @@ def tabulate_unassigned_index_stats(run_data, color_dict): - Polonies - % Polonies """ - headers = {} + headers: Dict[str, Any] = {} headers["Run Name"] = { "title": "Run Name", "description": "Run Name (Run ID + Analysis ID).", @@ -541,8 +531,8 @@ def tabulate_unassigned_index_stats(run_data, color_dict): headers["% Polonies"] = { "title": "% Polonies", "description": "Percentage of total polonies assigned to this index combination.", - "max": 100, - "min": 0, + "max": 100.0, + "min": 0.0, "scale": "RdYlGn-rev", "suffix": "%", } @@ -550,11 +540,11 @@ def tabulate_unassigned_index_stats(run_data, color_dict): pconfig = { "title": "Bases2Fastq: Unassigned Indices Metrics", "col1_header": "Index #", - "id": f"index_unassignment_metrics_{generate_random_string(5)}", + "id": "bases2fastq_index_unassignment_metrics", } plot_name = "Unassigned Indices Metrics" - plot_html = table.plot(run_data, headers, pconfig=pconfig) + plot_html = table.plot(cast(SectionT, run_data), cast(Dict[Any, ColumnDict], headers), pconfig=pconfig) anchor = "index_unassignment_metrics" description = "Index unassignment metrics." helptext = """ @@ -571,8 +561,8 @@ def tabulate_unassigned_index_stats(run_data, color_dict): def plot_base_quality_hist(run_data, color_dict): # Prepare plot data for per base BQ histogram - bq_hist_dict = dict() - for s_name in run_data.keys(): + bq_hist_dict: Dict[str, Dict[int, float]] = {} + for s_name in natsorted(run_data.keys()): paired_end = True if len(run_data[s_name]["Reads"]) > 1 else False R1_base_quality_counts = run_data[s_name]["Reads"][0]["QualityScoreHistogram"] R2_base_quality_counts = [0] * len(R1_base_quality_counts) @@ -585,8 +575,8 @@ def plot_base_quality_hist(run_data, color_dict): bq_hist_dict[s_name].update({quality: R1R2_base_quality_counts[quality] / total_bases * 100}) # Prepare plot data for per read average BQ histogram - per_read_quality_hist_dict = dict() - for s_name in run_data.keys(): + per_read_quality_hist_dict: Dict[str, Dict[int, float]] = {} + for s_name in natsorted(run_data.keys()): paired_end = True if len(run_data[s_name]["Reads"]) > 1 else False R1_quality_counts = run_data[s_name]["Reads"][0]["PerReadMeanQualityScoreHistogram"] R2_quality_counts = [0] * len(R1_quality_counts) @@ -622,7 +612,7 @@ def plot_base_quality_hist(run_data, color_dict): "colors": color_dict, }, ], - "id": f"per_run_bq_hist_{generate_random_string(5)}", + "id": "bases2fastq_per_run_bq_hist", "title": "bases2fastq: Quality Histograms", "ylab": "Percentage", } @@ -633,8 +623,8 @@ def plot_base_quality_hist(run_data, color_dict): helptext = """ Run base qualities histogram, summarised by bases and reads. Use tabs to switch between the views:\n - - Quality Per Base: distribution of base qualities.\n - - Quality Per Read: distribution of read qualities.\n + - Quality Per Base: distribution of base qualities.\n + - Quality Per Read: distribution of read qualities.\n \n _The y-axis on the graph shows the quality scores. The higher the score, the better the base call. The background of the graph divides the y-axis into very good quality @@ -650,16 +640,14 @@ def plot_base_quality_by_cycle(run_data, color_dict): # Prepare plot data for median BQ of each cycle r1r2_split = 0 - for s_name in run_data.keys(): - paired_end = True if len(run_data[s_name]["Reads"]) > 1 else False - cycle_dict = dict() + for s_name in natsorted(run_data.keys()): R1CycleNum = len(run_data[s_name]["Reads"][0]["Cycles"]) r1r2_split = max(r1r2_split, R1CycleNum) - median_dict = {} - for s_name in run_data.keys(): + median_dict: Dict[str, Dict[int, float]] = {} + for s_name in natsorted(run_data.keys()): paired_end = True if len(run_data[s_name]["Reads"]) > 1 else False - cycle_dict = dict() + cycle_dict: Dict[int, float] = {} R1CycleNum = len(run_data[s_name]["Reads"][0]["Cycles"]) for cycle in run_data[s_name]["Reads"][0]["Cycles"]: cycle_no = int(cycle["Cycle"]) @@ -671,11 +659,11 @@ def plot_base_quality_by_cycle(run_data, color_dict): median_dict.update({s_name: cycle_dict}) # Prepare plot data for mean BQ of each cycle - mean_dict = {} - for s_name in run_data.keys(): + mean_dict: Dict[str, Dict[int, float]] = {} + for s_name in natsorted(run_data.keys()): paired_end = True if len(run_data[s_name]["Reads"]) > 1 else False # Update each sample cycle info - cycle_dict = dict() + cycle_dict = {} for cycle in run_data[s_name]["Reads"][0]["Cycles"]: cycle_no = int(cycle["Cycle"]) cycle_dict.update({cycle_no: cycle["QualityScoreMean"]}) @@ -687,7 +675,7 @@ def plot_base_quality_by_cycle(run_data, color_dict): # Prepare plot data for %Q30 of each cycle Q30_dict = {} - for s_name in run_data.keys(): + for s_name in natsorted(run_data.keys()): paired_end = True if len(run_data[s_name]["Reads"]) > 1 else False # Update each sample cycle info cycle_dict = dict() @@ -702,7 +690,7 @@ def plot_base_quality_by_cycle(run_data, color_dict): # Prepare plot data for %Q40 of each cycle Q40_dict = {} - for s_name in run_data.keys(): + for s_name in natsorted(run_data.keys()): paired_end = True if len(run_data[s_name]["Reads"]) > 1 else False cycle_dict = dict() for cycle in run_data[s_name]["Reads"][0]["Cycles"]: @@ -717,7 +705,7 @@ def plot_base_quality_by_cycle(run_data, color_dict): # Prepare plot data for %Q50 of each cycle Q50_dict = {} percent_q50_values = set() - for s_name in run_data.keys(): + for s_name in natsorted(run_data.keys()): paired_end = True if len(run_data[s_name]["Reads"]) > 1 else False cycle_dict = dict() for cycle in run_data[s_name]["Reads"][0]["Cycles"]: @@ -741,7 +729,7 @@ def plot_base_quality_by_cycle(run_data, color_dict): # Prepare plot data for % base calls below PF threshold below_pf_dict = {} - for s_name in run_data.keys(): + for s_name in natsorted(run_data.keys()): paired_end = True if len(run_data[s_name]["Reads"]) > 1 else False cycle_dict = dict() R1CycleNum = len(run_data[s_name]["Reads"][0]["Cycles"]) @@ -769,7 +757,7 @@ def plot_base_quality_by_cycle(run_data, color_dict): "x_lines": [{"color": "#FF0000", "width": 2, "value": r1r2_split, "dashStyle": "dash"}], "colors": color_dict, "ymin": 0, - "id": f"per_run_quality_by_cycle_{generate_random_string(5)}", + "id": "bases2fastq_per_run_quality_by_cycle", "title": "bases2fastq: Quality by cycles", "ylab": "QC", } diff --git a/multiqc/modules/bases2fastq/plot_samples.py b/multiqc/modules/bases2fastq/plot_samples.py index ebaab9b166..c7e859edce 100644 --- a/multiqc/modules/bases2fastq/plot_samples.py +++ b/multiqc/modules/bases2fastq/plot_samples.py @@ -1,13 +1,10 @@ -from typing import Any, Dict -from multiqc.plots import bargraph, linegraph, table -from multiqc import config -from .plot_runs import generate_random_string +from typing import Any, Dict, cast -import numpy as np +from natsort import natsorted -""" -Functions for plotting per sample information of bases2fastq -""" +from multiqc.plots import bargraph, linegraph, table +from multiqc.plots.table_object import ColumnDict +from multiqc import config def _calculate_sample_reads_eliminated(run_data) -> int: @@ -39,7 +36,7 @@ def tabulate_sample_stats(sample_data, group_lookup_dict, project_lookup_dict, s plot_content = dict() reads_present = set() is_percent_q50_present = False - for s_name in sample_data.keys(): + for s_name in natsorted(sample_data.keys()): general_stats = dict() general_stats.update({"group": group_lookup_dict[s_name]}) general_stats.update({"project": project_lookup_dict.get(s_name, "")}) @@ -141,13 +138,13 @@ def tabulate_sample_stats(sample_data, group_lookup_dict, project_lookup_dict, s } pconfig = { - "id": f"sample_qc_metric_table_{generate_random_string(5)}", + "id": "bases2fastq_sample_qc_metric_table", "title": "Sample QC Metrics Table", "no_violin": False, } plot_name = "Sample QC Metrics Table" - plot_html = table.plot(plot_content, headers, pconfig=pconfig) + plot_html = table.plot(plot_content, cast(Dict[Any, ColumnDict], headers), pconfig=pconfig) anchor = "sample_qc_metrics_table" description = "QC metrics per unique sample" helptext = """ @@ -171,10 +168,10 @@ def sequence_content_plot(sample_data, group_lookup_dict, project_lookup_dict, c """Create the epic HTML for the FastQC sequence content heatmap""" # Prep the data - data = dict() + data: Dict[str, Dict[int, Any]] = {} r1r2_split = 0 - for s_name in sorted(sample_data.keys()): + for s_name in natsorted(sample_data.keys()): paired_end = True if len(sample_data[s_name]["Reads"]) > 1 else False for base in "ACTG": base_s_name = "__".join([s_name, base]) @@ -182,7 +179,7 @@ def sequence_content_plot(sample_data, group_lookup_dict, project_lookup_dict, c R1 = sample_data[s_name]["Reads"][0]["Cycles"] r1r2_split = max(r1r2_split, len(R1)) - for s_name in sorted(sample_data.keys()): + for s_name in natsorted(sample_data.keys()): paired_end = True if len(sample_data[s_name]["Reads"]) > 1 else False R1 = sample_data[s_name]["Reads"][0]["Cycles"] for cycle in range(len(R1)): @@ -216,7 +213,7 @@ def sequence_content_plot(sample_data, group_lookup_dict, project_lookup_dict, c "x_lines": [{"color": "#FF0000", "width": 2, "value": r1r2_split, "dashStyle": "dash"}], "colors": color_dict, "ymin": 0, - "id": f"per_cycle_base_content_{generate_random_string(5)}", + "id": "bases2fastq_per_cycle_base_content", "title": "bases2fastq: Per Cycle Base Content Percentage", } plot_html = linegraph.plot(plot_content, pconfig=pconfig) @@ -239,15 +236,15 @@ def sequence_content_plot(sample_data, group_lookup_dict, project_lookup_dict, c def plot_per_cycle_N_content(sample_data, group_lookup_dict, project_lookup_dict, color_dict): - data = dict() + data: Dict[str, Dict[int, float]] = {} r1r2_split = 0 - for s_name in sorted(sample_data.keys()): + for s_name in natsorted(sample_data.keys()): data[s_name] = {} R1 = sample_data[s_name]["Reads"][0]["Cycles"] R1_cycle_num = len(R1) r1r2_split = max(r1r2_split, R1_cycle_num) - for s_name in sorted(sample_data.keys()): + for s_name in natsorted(sample_data.keys()): paired_end = True if len(sample_data[s_name]["Reads"]) > 1 else False R1 = sample_data[s_name]["Reads"][0]["Cycles"] R1_cycle_num = len(R1) @@ -283,11 +280,11 @@ def plot_per_cycle_N_content(sample_data, group_lookup_dict, project_lookup_dict "colors": color_dict, "ymin": 0, "ymax": 100, - "id": f"per_cycle_n_content_{generate_random_string(5)}", + "id": "bases2fastq_per_cycle_n_content", "title": "bases2fastq: Per Cycle N Content Percentage", } plot_html = linegraph.plot(plot_content, pconfig=pconfig) - plot_name = "Per Cycle N Content." + plot_name = "Per Cycle N Content" anchor = "n_content" description = """ Percentage of unidentified bases ("N" bases) by each sequencing cycle. @@ -310,8 +307,8 @@ def plot_per_read_gc_hist(sample_data, group_lookup_dict, project_lookup_dict, s """ Plot GC Histogram per Sample """ - gc_hist_dict = dict() - for s_name in sample_data.keys(): + gc_hist_dict: Dict[str, Dict[float, float]] = {} + for s_name in natsorted(sample_data.keys()): R1_gc_counts = sample_data[s_name]["Reads"][0]["PerReadGCCountHistogram"] R2_gc_counts = [0] * len(R1_gc_counts) if len(sample_data[s_name]["Reads"]) > 1: @@ -340,7 +337,7 @@ def plot_per_read_gc_hist(sample_data, group_lookup_dict, project_lookup_dict, s "xlab": "GC Content (%)", "ylab": "Percentage of reads that have GC (%)", "colors": sample_color, - "id": f"gc_hist_{generate_random_string(5)}", + "id": "bases2fastq_gc_hist", "title": "bases2fastq: Per Sample GC Content Histogram", } plot_name = "Per Sample GC Histogram" @@ -370,17 +367,17 @@ def plot_adapter_content(sample_data, group_lookup_dict, project_lookup_dict, sa """ Plot Adapter Content per Sample """ - plot_content = dict() + plot_content: Dict[str, Dict[int, float]] = {} r1r2_split = 0 - for s_name in sample_data.keys(): + for s_name in natsorted(sample_data.keys()): plot_content.update({s_name: {}}) # Read 1 cycles = sample_data[s_name]["Reads"][0]["Cycles"] R1_cycle_num = len(cycles) r1r2_split = max(r1r2_split, R1_cycle_num) - for s_name in sample_data.keys(): + for s_name in natsorted(sample_data.keys()): paired_end = True if len(sample_data[s_name]["Reads"]) > 1 else False plot_content.update({s_name: {}}) # Read 1 @@ -397,7 +394,7 @@ def plot_adapter_content(sample_data, group_lookup_dict, project_lookup_dict, sa adapter_percent = cycle["PercentReadsTrimmed"] plot_content[s_name].update({cycle_no: adapter_percent}) pconfig = { - "id": f"per_cycle_adapter_content_{generate_random_string(5)}", + "id": "bases2fastq_per_cycle_adapter_content", "title": "bases2fastq: Per Cycle Adapter Content", "xlab": "Cycle", "ylab": "% of Sequences", diff --git a/multiqc/modules/cells2stats/cells2stats_bar_plots.py b/multiqc/modules/cells2stats/cells2stats_bar_plots.py index 75eeda73e0..4e5c22aed2 100644 --- a/multiqc/modules/cells2stats/cells2stats_bar_plots.py +++ b/multiqc/modules/cells2stats/cells2stats_bar_plots.py @@ -99,6 +99,7 @@ def plot_cell_assignment(c2s_run_data): cats = [{"total_density": {"name": "Total Density"}}, cat, {"total_count": {"name": "Total Counts"}}, cat, cat] plot_name = "Barcoding Cell Assignment Metrics" + # Check if any dictionary is empty (len(dict) == 0) inside the plot_content list plot_html = ( bargraph.plot(plot_content, cats, pconfig=pconfig) if min([len(el) for el in plot_content]) > 0 else None ) From 0e160285b046c5dc53d25ad30c4a4b49c4fc7598 Mon Sep 17 00:00:00 2001 From: Carlos Ruiz Date: Sat, 21 Feb 2026 12:09:23 -0800 Subject: [PATCH 2/6] Added additional fixture-based unit tests. --- multiqc/modules/bases2fastq/bases2fastq.py | 10 +- multiqc/modules/bases2fastq/plot_runs.py | 61 ++- multiqc/modules/bases2fastq/plot_samples.py | 95 +++- multiqc/modules/bases2fastq/tests/conftest.py | 25 + .../PairedEndDefaultProject/RunManifest.json | 22 + .../PairedEndDefaultProject/RunStats.json | 18 + .../DefaultProject_RunStats.json | 26 + .../fixtures/PairedEndNoProject/RunStats.json | 18 + .../RunStats.json | 23 + .../PairedEndNoProjectWithLanes/RunStats.json | 31 ++ .../tests/fixtures/project_runstats.json | 24 + .../tests/fixtures/run_manifest.json | 10 + .../fixtures/run_manifest_with_samples.json | 21 + .../tests/fixtures/run_runstats.json | 18 + .../fixtures/run_runstats_low_polonies.json | 21 + .../run_runstats_with_occurrences.json | 23 + .../run_runstats_with_unassigned.json | 29 + .../bases2fastq/tests/test_bases2fastq.py | 507 ++++++++++++++++++ 18 files changed, 952 insertions(+), 30 deletions(-) create mode 100644 multiqc/modules/bases2fastq/tests/conftest.py create mode 100644 multiqc/modules/bases2fastq/tests/fixtures/PairedEndDefaultProject/RunManifest.json create mode 100644 multiqc/modules/bases2fastq/tests/fixtures/PairedEndDefaultProject/RunStats.json create mode 100644 multiqc/modules/bases2fastq/tests/fixtures/PairedEndDefaultProject/Samples/DefaultProject/DefaultProject_RunStats.json create mode 100644 multiqc/modules/bases2fastq/tests/fixtures/PairedEndNoProject/RunStats.json create mode 100644 multiqc/modules/bases2fastq/tests/fixtures/PairedEndNoProjectLowPolonies/RunStats.json create mode 100644 multiqc/modules/bases2fastq/tests/fixtures/PairedEndNoProjectWithLanes/RunStats.json create mode 100644 multiqc/modules/bases2fastq/tests/fixtures/project_runstats.json create mode 100644 multiqc/modules/bases2fastq/tests/fixtures/run_manifest.json create mode 100644 multiqc/modules/bases2fastq/tests/fixtures/run_manifest_with_samples.json create mode 100644 multiqc/modules/bases2fastq/tests/fixtures/run_runstats.json create mode 100644 multiqc/modules/bases2fastq/tests/fixtures/run_runstats_low_polonies.json create mode 100644 multiqc/modules/bases2fastq/tests/fixtures/run_runstats_with_occurrences.json create mode 100644 multiqc/modules/bases2fastq/tests/fixtures/run_runstats_with_unassigned.json create mode 100644 multiqc/modules/bases2fastq/tests/test_bases2fastq.py diff --git a/multiqc/modules/bases2fastq/bases2fastq.py b/multiqc/modules/bases2fastq/bases2fastq.py index 3f1f212b20..68135b4fb5 100644 --- a/multiqc/modules/bases2fastq/bases2fastq.py +++ b/multiqc/modules/bases2fastq/bases2fastq.py @@ -36,7 +36,7 @@ # Default minimum polony threshold - samples below this are skipped -DEFAULT_MIN_POLONIES = 10000 +DEFAULT_MIN_POLONIES = 1000 def _get_min_polonies() -> int: @@ -738,8 +738,12 @@ def _parse_run_manifest_in_project(self, data_source: str) -> Dict[str, Any]: if not directory: continue - # Get RunManifest.json from run output root (two levels up from project directory) - base_directory = Path(directory).parent.parent + # Get RunManifest.json from run output root (check if it exists in the same directory or try two levels up) + base_directory = Path(directory).resolve() + if (base_directory / "RunManifest.json").exists(): + base_directory = base_directory + else: + base_directory = base_directory.parent.parent run_manifest = base_directory / "RunManifest.json" project_stats = json.loads(f["f"]) run_analysis_name = self._extract_run_analysis_name( diff --git a/multiqc/modules/bases2fastq/plot_runs.py b/multiqc/modules/bases2fastq/plot_runs.py index 22900ee625..d1c2e3e524 100644 --- a/multiqc/modules/bases2fastq/plot_runs.py +++ b/multiqc/modules/bases2fastq/plot_runs.py @@ -559,10 +559,23 @@ def tabulate_unassigned_index_stats(run_data, color_dict): return plot_html, plot_name, anchor, description, helptext, run_data +def _run_has_reads(run_entry: dict) -> bool: + """True if run has valid Reads list with at least one read and required keys for run plots.""" + reads = run_entry.get("Reads") + if not reads or not isinstance(reads, list): + return False + if len(reads) < 1: + return False + r0 = reads[0] + return isinstance(r0, dict) and "QualityScoreHistogram" in r0 and "PerReadMeanQualityScoreHistogram" in r0 + + def plot_base_quality_hist(run_data, color_dict): - # Prepare plot data for per base BQ histogram + # Prepare plot data for per base BQ histogram (skip runs without Reads) bq_hist_dict: Dict[str, Dict[int, float]] = {} for s_name in natsorted(run_data.keys()): + if not _run_has_reads(run_data[s_name]): + continue paired_end = True if len(run_data[s_name]["Reads"]) > 1 else False R1_base_quality_counts = run_data[s_name]["Reads"][0]["QualityScoreHistogram"] R2_base_quality_counts = [0] * len(R1_base_quality_counts) @@ -577,6 +590,8 @@ def plot_base_quality_hist(run_data, color_dict): # Prepare plot data for per read average BQ histogram per_read_quality_hist_dict: Dict[str, Dict[int, float]] = {} for s_name in natsorted(run_data.keys()): + if not _run_has_reads(run_data[s_name]): + continue paired_end = True if len(run_data[s_name]["Reads"]) > 1 else False R1_quality_counts = run_data[s_name]["Reads"][0]["PerReadMeanQualityScoreHistogram"] R2_quality_counts = [0] * len(R1_quality_counts) @@ -637,15 +652,37 @@ def plot_base_quality_hist(run_data, color_dict): def plot_base_quality_by_cycle(run_data, color_dict): - # Prepare plot data for median BQ of each cycle + # Prepare plot data for median BQ of each cycle (skip runs without Reads/Cycles) + runs_with_reads = [ + s + for s in run_data + if _run_has_reads(run_data[s]) + and run_data[s]["Reads"][0].get("Cycles") + ] + if not runs_with_reads: + plot_content: list[Any] = [] + plot_html = linegraph.plot( + plot_content, + pconfig={"id": "bases2fastq_run_bq_by_cycle", "title": "bases2fastq: Run Base Quality by Cycle"}, + ) + return ( + plot_html, + "Run Base Quality by Cycle", + "bq_by_cycle", + "Base quality by cycle", + "No run data with Reads available.", + plot_content, + ) r1r2_split = 0 - for s_name in natsorted(run_data.keys()): - R1CycleNum = len(run_data[s_name]["Reads"][0]["Cycles"]) - r1r2_split = max(r1r2_split, R1CycleNum) + for s_name in natsorted(runs_with_reads): + read0 = run_data[s_name]["Reads"][0] + if read0.get("Cycles"): + R1CycleNum = len(read0["Cycles"]) + r1r2_split = max(r1r2_split, R1CycleNum) median_dict: Dict[str, Dict[int, float]] = {} - for s_name in natsorted(run_data.keys()): + for s_name in natsorted(runs_with_reads): paired_end = True if len(run_data[s_name]["Reads"]) > 1 else False cycle_dict: Dict[int, float] = {} R1CycleNum = len(run_data[s_name]["Reads"][0]["Cycles"]) @@ -660,7 +697,7 @@ def plot_base_quality_by_cycle(run_data, color_dict): # Prepare plot data for mean BQ of each cycle mean_dict: Dict[str, Dict[int, float]] = {} - for s_name in natsorted(run_data.keys()): + for s_name in natsorted(runs_with_reads): paired_end = True if len(run_data[s_name]["Reads"]) > 1 else False # Update each sample cycle info cycle_dict = {} @@ -675,7 +712,7 @@ def plot_base_quality_by_cycle(run_data, color_dict): # Prepare plot data for %Q30 of each cycle Q30_dict = {} - for s_name in natsorted(run_data.keys()): + for s_name in natsorted(runs_with_reads): paired_end = True if len(run_data[s_name]["Reads"]) > 1 else False # Update each sample cycle info cycle_dict = dict() @@ -690,7 +727,7 @@ def plot_base_quality_by_cycle(run_data, color_dict): # Prepare plot data for %Q40 of each cycle Q40_dict = {} - for s_name in natsorted(run_data.keys()): + for s_name in natsorted(runs_with_reads): paired_end = True if len(run_data[s_name]["Reads"]) > 1 else False cycle_dict = dict() for cycle in run_data[s_name]["Reads"][0]["Cycles"]: @@ -705,7 +742,7 @@ def plot_base_quality_by_cycle(run_data, color_dict): # Prepare plot data for %Q50 of each cycle Q50_dict = {} percent_q50_values = set() - for s_name in natsorted(run_data.keys()): + for s_name in natsorted(runs_with_reads): paired_end = True if len(run_data[s_name]["Reads"]) > 1 else False cycle_dict = dict() for cycle in run_data[s_name]["Reads"][0]["Cycles"]: @@ -729,11 +766,11 @@ def plot_base_quality_by_cycle(run_data, color_dict): # Prepare plot data for % base calls below PF threshold below_pf_dict = {} - for s_name in natsorted(run_data.keys()): + for s_name in natsorted(runs_with_reads): paired_end = True if len(run_data[s_name]["Reads"]) > 1 else False cycle_dict = dict() R1CycleNum = len(run_data[s_name]["Reads"][0]["Cycles"]) - if "PercentBelowFilterThreshold" not in run_data[s_name]["Reads"][0]["Cycles"][0]: + if not run_data[s_name]["Reads"][0]["Cycles"] or "PercentBelowFilterThreshold" not in run_data[s_name]["Reads"][0]["Cycles"][0]: continue for cycle in run_data[s_name]["Reads"][0]["Cycles"]: cycle_no = int(cycle["Cycle"]) diff --git a/multiqc/modules/bases2fastq/plot_samples.py b/multiqc/modules/bases2fastq/plot_samples.py index c7e859edce..e7987135bb 100644 --- a/multiqc/modules/bases2fastq/plot_samples.py +++ b/multiqc/modules/bases2fastq/plot_samples.py @@ -7,6 +7,15 @@ from multiqc import config +def _sample_has_reads(sample_entry: dict) -> bool: + """True if sample has valid Reads list with at least one read and Cycles.""" + reads = sample_entry.get("Reads") + if not reads or not isinstance(reads, list) or len(reads) < 1: + return False + r0 = reads[0] + return bool(isinstance(r0, dict) and r0.get("Cycles")) + + def _calculate_sample_reads_eliminated(run_data) -> int: """ Calculate the total number of reads eliminated during trimming. @@ -41,16 +50,16 @@ def tabulate_sample_stats(sample_data, group_lookup_dict, project_lookup_dict, s general_stats.update({"group": group_lookup_dict[s_name]}) general_stats.update({"project": project_lookup_dict.get(s_name, "")}) general_stats.update({"num_polonies_sample": sample_data[s_name]["NumPolonies"]}) - general_stats.update({"yield_sample": sample_data[s_name]["Yield"]}) - general_stats.update({"mean_base_quality_sample": sample_data[s_name]["QualityScoreMean"]}) - general_stats.update({"percent_q30_sample": sample_data[s_name]["PercentQ30"]}) - general_stats.update({"percent_q40_sample": sample_data[s_name]["PercentQ40"]}) + general_stats.update({"yield_sample": sample_data[s_name].get("Yield", 0.0)}) + general_stats.update({"mean_base_quality_sample": sample_data[s_name].get("QualityScoreMean", 0)}) + general_stats.update({"percent_q30_sample": sample_data[s_name].get("PercentQ30", 0)}) + general_stats.update({"percent_q40_sample": sample_data[s_name].get("PercentQ40", 0)}) percent_q50 = sample_data[s_name].get("PercentQ50") if percent_q50 is not None: is_percent_q50_present = True general_stats.update({"percent_q50_run": percent_q50}) general_stats.update({"reads_eliminated": _calculate_sample_reads_eliminated(sample_data[s_name])}) - general_stats.update({"percent_mismatch": sample_data[s_name]["PercentMismatch"]}) + general_stats.update({"percent_mismatch": sample_data[s_name].get("PercentMismatch", 0)}) if "Reads" in sample_data[s_name]: for read in sample_data[s_name]["Reads"]: read_name = read["Read"] @@ -166,20 +175,32 @@ def tabulate_sample_stats(sample_data, group_lookup_dict, project_lookup_dict, s def sequence_content_plot(sample_data, group_lookup_dict, project_lookup_dict, color_dict): """Create the epic HTML for the FastQC sequence content heatmap""" + samples_with_reads = [s for s in sample_data if _sample_has_reads(sample_data[s])] + if not samples_with_reads: + empty_data: Dict[str, Dict[int, Any]] = {} + plot_html = linegraph.plot( + empty_data, + pconfig={ + "id": "bases2fastq_per_cycle_base_content", + "title": "bases2fastq: Per Cycle Base Content Percentage", + "xlab": "Cycle", + "ylab": "Percentage of Total Reads", + }, + ) + return plot_html, "Per Cycle Base Content", "base_content", "", "", empty_data # Prep the data data: Dict[str, Dict[int, Any]] = {} r1r2_split = 0 - for s_name in natsorted(sample_data.keys()): - paired_end = True if len(sample_data[s_name]["Reads"]) > 1 else False + for s_name in natsorted(samples_with_reads): for base in "ACTG": base_s_name = "__".join([s_name, base]) data[base_s_name] = {} R1 = sample_data[s_name]["Reads"][0]["Cycles"] r1r2_split = max(r1r2_split, len(R1)) - for s_name in natsorted(sample_data.keys()): + for s_name in natsorted(samples_with_reads): paired_end = True if len(sample_data[s_name]["Reads"]) > 1 else False R1 = sample_data[s_name]["Reads"][0]["Cycles"] for cycle in range(len(R1)): @@ -236,15 +257,29 @@ def sequence_content_plot(sample_data, group_lookup_dict, project_lookup_dict, c def plot_per_cycle_N_content(sample_data, group_lookup_dict, project_lookup_dict, color_dict): + samples_with_reads = [s for s in sample_data if _sample_has_reads(sample_data[s])] + if not samples_with_reads: + empty_data: Dict[str, Dict[int, float]] = {} + plot_html = linegraph.plot( + empty_data, + pconfig={ + "id": "bases2fastq_per_cycle_n_content", + "title": "bases2fastq: Per Cycle N Content", + "xlab": "Cycle", + "ylab": "Percentage of N bases", + }, + ) + return plot_html, "Per Cycle N Content", "n_content", "", "", empty_data + data: Dict[str, Dict[int, float]] = {} r1r2_split = 0 - for s_name in natsorted(sample_data.keys()): + for s_name in natsorted(samples_with_reads): data[s_name] = {} R1 = sample_data[s_name]["Reads"][0]["Cycles"] R1_cycle_num = len(R1) r1r2_split = max(r1r2_split, R1_cycle_num) - for s_name in natsorted(sample_data.keys()): + for s_name in natsorted(samples_with_reads): paired_end = True if len(sample_data[s_name]["Reads"]) > 1 else False R1 = sample_data[s_name]["Reads"][0]["Cycles"] R1_cycle_num = len(R1) @@ -307,9 +342,26 @@ def plot_per_read_gc_hist(sample_data, group_lookup_dict, project_lookup_dict, s """ Plot GC Histogram per Sample """ + samples_with_reads = [s for s in sample_data if _sample_has_reads(sample_data[s])] + if not samples_with_reads: + empty_gc_hist: Dict[str, Dict[float, float]] = {} + plot_html = linegraph.plot( + empty_gc_hist, + pconfig={ + "id": "bases2fastq_gc_hist", + "title": "bases2fastq: Per Sample GC Content Histogram", + "xlab": "GC Content (%)", + "ylab": "Percentage of reads that have GC (%)", + }, + ) + return plot_html, "Per Sample GC Histogram", "gc_histogram", "", "", empty_gc_hist + gc_hist_dict: Dict[str, Dict[float, float]] = {} - for s_name in natsorted(sample_data.keys()): - R1_gc_counts = sample_data[s_name]["Reads"][0]["PerReadGCCountHistogram"] + for s_name in natsorted(samples_with_reads): + r0 = sample_data[s_name]["Reads"][0] + if "PerReadGCCountHistogram" not in r0: + continue + R1_gc_counts = r0["PerReadGCCountHistogram"] R2_gc_counts = [0] * len(R1_gc_counts) if len(sample_data[s_name]["Reads"]) > 1: R2_gc_counts_raw = sample_data[s_name]["Reads"][1]["PerReadGCCountHistogram"] @@ -367,17 +419,30 @@ def plot_adapter_content(sample_data, group_lookup_dict, project_lookup_dict, sa """ Plot Adapter Content per Sample """ + samples_with_reads = [s for s in sample_data if _sample_has_reads(sample_data[s])] + if not samples_with_reads: + empty_content: Dict[str, Dict[int, float]] = {} + plot_html = linegraph.plot( + empty_content, + pconfig={ + "id": "bases2fastq_per_cycle_adapter_content", + "title": "bases2fastq: Per Cycle Adapter Content", + "xlab": "Cycle", + "ylab": "% of Sequences", + }, + ) + return plot_html, "Per Sample Adapter Content", "adapter_content", "", "", empty_content + plot_content: Dict[str, Dict[int, float]] = {} r1r2_split = 0 - for s_name in natsorted(sample_data.keys()): + for s_name in natsorted(samples_with_reads): plot_content.update({s_name: {}}) - # Read 1 cycles = sample_data[s_name]["Reads"][0]["Cycles"] R1_cycle_num = len(cycles) r1r2_split = max(r1r2_split, R1_cycle_num) - for s_name in natsorted(sample_data.keys()): + for s_name in natsorted(samples_with_reads): paired_end = True if len(sample_data[s_name]["Reads"]) > 1 else False plot_content.update({s_name: {}}) # Read 1 diff --git a/multiqc/modules/bases2fastq/tests/conftest.py b/multiqc/modules/bases2fastq/tests/conftest.py new file mode 100644 index 0000000000..181417f978 --- /dev/null +++ b/multiqc/modules/bases2fastq/tests/conftest.py @@ -0,0 +1,25 @@ +"""Pytest configuration and fixtures for bases2fastq module tests.""" + +from pathlib import Path + +import pytest + +from multiqc.utils import testing + + +@pytest.fixture +def data_dir(): + """Return path to MultiQC test-data repo data directory (test-data/data).""" + return testing.data_dir() + + +@pytest.fixture +def fixtures_dir(): + """Return path to in-repo JSON fixtures (no test-data clone required). + + - PairedEndNoProject/RunStats.json (run-level only) + - PairedEndDefaultProject/RunStats.json, RunManifest.json, Samples/DefaultProject/DefaultProject_RunStats.json + - PairedEndNoProjectWithLanes/RunStats.json (run-level with Lanes/UnassignedSequences) + - PairedEndNoProjectLowPolonies/RunStats.json (two samples, one below min_polonies) + """ + return Path(__file__).parent / "fixtures" diff --git a/multiqc/modules/bases2fastq/tests/fixtures/PairedEndDefaultProject/RunManifest.json b/multiqc/modules/bases2fastq/tests/fixtures/PairedEndDefaultProject/RunManifest.json new file mode 100644 index 0000000000..4365e3e6cd --- /dev/null +++ b/multiqc/modules/bases2fastq/tests/fixtures/PairedEndDefaultProject/RunManifest.json @@ -0,0 +1,22 @@ +{ + "Settings": [ + { + "Lane": 1, + "AdapterTrimType": "Paired-End", + "R1AdapterMinimumTrimmedLength": 16, + "R2AdapterMinimumTrimmedLength": 16 + } + ], + "Samples": [ + { + "SampleName": "Sample1", + "Indexes": [ + { + "Lane": 1, + "Index1": "AAA", + "Index2": "TTT" + } + ] + } + ] +} \ No newline at end of file diff --git a/multiqc/modules/bases2fastq/tests/fixtures/PairedEndDefaultProject/RunStats.json b/multiqc/modules/bases2fastq/tests/fixtures/PairedEndDefaultProject/RunStats.json new file mode 100644 index 0000000000..e0f2afee0f --- /dev/null +++ b/multiqc/modules/bases2fastq/tests/fixtures/PairedEndDefaultProject/RunStats.json @@ -0,0 +1,18 @@ +{ + "RunName": "RUN01", + "AnalysisID": "a1b2c3d4e5f6", + "NumPolonies": 50000, + "AssignedYield": 1.5, + "QualityScoreMean": 35, + "PercentQ30": 95, + "PercentQ40": 90, + "PercentAssignedReads": 100.0, + "PercentMismatch": 0, + "SampleStats": [ + { + "SampleID": "s1", + "SampleName": "Sample1", + "NumPolonies": 50000 + } + ] +} \ No newline at end of file diff --git a/multiqc/modules/bases2fastq/tests/fixtures/PairedEndDefaultProject/Samples/DefaultProject/DefaultProject_RunStats.json b/multiqc/modules/bases2fastq/tests/fixtures/PairedEndDefaultProject/Samples/DefaultProject/DefaultProject_RunStats.json new file mode 100644 index 0000000000..29f960f0ec --- /dev/null +++ b/multiqc/modules/bases2fastq/tests/fixtures/PairedEndDefaultProject/Samples/DefaultProject/DefaultProject_RunStats.json @@ -0,0 +1,26 @@ +{ + "RunName": "RUN01", + "AnalysisID": "a1b2c3d4e5f6", + "Project": "DefaultProject", + "NumPolonies": 50000, + "NumPoloniesBeforeTrimming": 100000, + "AssignedYield": 1.5, + "QualityScoreMean": 35, + "PercentQ30": 95, + "PercentQ40": 90, + "PercentAssignedReads": 100.0, + "PercentMismatch": 0, + "SampleStats": [ + { + "SampleID": "s1", + "SampleName": "Sample1", + "NumPolonies": 50000, + "Occurrences": [ + { + "ExpectedSequence": "AAATTT", + "NumPoloniesBeforeTrimming": 5000 + } + ] + } + ] +} \ No newline at end of file diff --git a/multiqc/modules/bases2fastq/tests/fixtures/PairedEndNoProject/RunStats.json b/multiqc/modules/bases2fastq/tests/fixtures/PairedEndNoProject/RunStats.json new file mode 100644 index 0000000000..e0f2afee0f --- /dev/null +++ b/multiqc/modules/bases2fastq/tests/fixtures/PairedEndNoProject/RunStats.json @@ -0,0 +1,18 @@ +{ + "RunName": "RUN01", + "AnalysisID": "a1b2c3d4e5f6", + "NumPolonies": 50000, + "AssignedYield": 1.5, + "QualityScoreMean": 35, + "PercentQ30": 95, + "PercentQ40": 90, + "PercentAssignedReads": 100.0, + "PercentMismatch": 0, + "SampleStats": [ + { + "SampleID": "s1", + "SampleName": "Sample1", + "NumPolonies": 50000 + } + ] +} \ No newline at end of file diff --git a/multiqc/modules/bases2fastq/tests/fixtures/PairedEndNoProjectLowPolonies/RunStats.json b/multiqc/modules/bases2fastq/tests/fixtures/PairedEndNoProjectLowPolonies/RunStats.json new file mode 100644 index 0000000000..feef607045 --- /dev/null +++ b/multiqc/modules/bases2fastq/tests/fixtures/PairedEndNoProjectLowPolonies/RunStats.json @@ -0,0 +1,23 @@ +{ + "RunName": "RUN01", + "AnalysisID": "a1b2c3d4e5f6", + "NumPolonies": 50050, + "AssignedYield": 1.5, + "QualityScoreMean": 35, + "PercentQ30": 95, + "PercentQ40": 90, + "PercentAssignedReads": 100.0, + "PercentMismatch": 0, + "SampleStats": [ + { + "SampleID": "s1", + "SampleName": "Sample1", + "NumPolonies": 50 + }, + { + "SampleID": "s2", + "SampleName": "Sample2", + "NumPolonies": 50000 + } + ] +} \ No newline at end of file diff --git a/multiqc/modules/bases2fastq/tests/fixtures/PairedEndNoProjectWithLanes/RunStats.json b/multiqc/modules/bases2fastq/tests/fixtures/PairedEndNoProjectWithLanes/RunStats.json new file mode 100644 index 0000000000..1ff24f6fdc --- /dev/null +++ b/multiqc/modules/bases2fastq/tests/fixtures/PairedEndNoProjectWithLanes/RunStats.json @@ -0,0 +1,31 @@ +{ + "RunName": "RUN01", + "AnalysisID": "a1b2c3d4e5f6", + "NumPolonies": 50000, + "NumPoloniesBeforeTrimming": 100000, + "AssignedYield": 1.5, + "QualityScoreMean": 35, + "PercentQ30": 95, + "PercentQ40": 90, + "PercentAssignedReads": 100.0, + "PercentMismatch": 0, + "SampleStats": [ + { + "SampleID": "s1", + "SampleName": "Sample1", + "NumPolonies": 50000 + } + ], + "Lanes": [ + { + "Lane": 1, + "UnassignedSequences": [ + { + "I1": "AAA", + "I2": "TTT", + "Count": 100 + } + ] + } + ] +} \ No newline at end of file diff --git a/multiqc/modules/bases2fastq/tests/fixtures/project_runstats.json b/multiqc/modules/bases2fastq/tests/fixtures/project_runstats.json new file mode 100644 index 0000000000..dc73144d25 --- /dev/null +++ b/multiqc/modules/bases2fastq/tests/fixtures/project_runstats.json @@ -0,0 +1,24 @@ +{ + "RunName": "RUN01", + "AnalysisID": "a1b2c3d4e5f6", + "Project": "MyProject", + "NumPolonies": 50000, + "NumPoloniesBeforeTrimming": 60000, + "AssignedYield": 1.5, + "QualityScoreMean": 35.0, + "PercentQ30": 95.0, + "PercentQ40": 90.0, + "SampleStats": [ + { + "SampleID": "s1", + "SampleName": "Sample1", + "NumPolonies": 50000, + "Occurrences": [ + { + "ExpectedSequence": "AAATTT", + "NumPoloniesBeforeTrimming": 3000 + } + ] + } + ] +} diff --git a/multiqc/modules/bases2fastq/tests/fixtures/run_manifest.json b/multiqc/modules/bases2fastq/tests/fixtures/run_manifest.json new file mode 100644 index 0000000000..202fbaf157 --- /dev/null +++ b/multiqc/modules/bases2fastq/tests/fixtures/run_manifest.json @@ -0,0 +1,10 @@ +{ + "Settings": [ + { + "Lane": 1, + "AdapterTrimType": "N/A", + "R1AdapterMinimumTrimmedLength": "N/A", + "R2AdapterMinimumTrimmedLength": "N/A" + } + ] +} diff --git a/multiqc/modules/bases2fastq/tests/fixtures/run_manifest_with_samples.json b/multiqc/modules/bases2fastq/tests/fixtures/run_manifest_with_samples.json new file mode 100644 index 0000000000..a6f8e33548 --- /dev/null +++ b/multiqc/modules/bases2fastq/tests/fixtures/run_manifest_with_samples.json @@ -0,0 +1,21 @@ +{ + "Settings": [ + { + "Lane": 1, + "AdapterTrimType": "N/A", + "R1AdapterMinimumTrimmedLength": "N/A", + "R2AdapterMinimumTrimmedLength": "N/A" + } + ], + "Samples": [ + { + "SampleName": "Sample1", + "Indexes": [ + { + "Index1": "AAA", + "Index2": "TTT" + } + ] + } + ] +} diff --git a/multiqc/modules/bases2fastq/tests/fixtures/run_runstats.json b/multiqc/modules/bases2fastq/tests/fixtures/run_runstats.json new file mode 100644 index 0000000000..7e1812b321 --- /dev/null +++ b/multiqc/modules/bases2fastq/tests/fixtures/run_runstats.json @@ -0,0 +1,18 @@ +{ + "RunName": "RUN01", + "AnalysisID": "a1b2c3d4e5f6", + "NumPolonies": 50000, + "AssignedYield": 1.5, + "QualityScoreMean": 35, + "PercentQ30": 95, + "PercentQ40": 90, + "PercentAssignedReads": 100.0, + "PercentMismatch": 0, + "SampleStats": [ + { + "SampleID": "s1", + "SampleName": "Sample1", + "NumPolonies": 50000 + } + ] +} \ No newline at end of file diff --git a/multiqc/modules/bases2fastq/tests/fixtures/run_runstats_low_polonies.json b/multiqc/modules/bases2fastq/tests/fixtures/run_runstats_low_polonies.json new file mode 100644 index 0000000000..66032d6a6d --- /dev/null +++ b/multiqc/modules/bases2fastq/tests/fixtures/run_runstats_low_polonies.json @@ -0,0 +1,21 @@ +{ + "RunName": "RUN01", + "AnalysisID": "a1b2c3d4e5f6", + "NumPolonies": 50050, + "AssignedYield": 1.5, + "QualityScoreMean": 35.0, + "PercentQ30": 95.0, + "PercentQ40": 90.0, + "SampleStats": [ + { + "SampleID": "s1", + "SampleName": "Sample1", + "NumPolonies": 50 + }, + { + "SampleID": "s2", + "SampleName": "Sample2", + "NumPolonies": 50000 + } + ] +} diff --git a/multiqc/modules/bases2fastq/tests/fixtures/run_runstats_with_occurrences.json b/multiqc/modules/bases2fastq/tests/fixtures/run_runstats_with_occurrences.json new file mode 100644 index 0000000000..a93faccf73 --- /dev/null +++ b/multiqc/modules/bases2fastq/tests/fixtures/run_runstats_with_occurrences.json @@ -0,0 +1,23 @@ +{ + "RunName": "RUN01", + "AnalysisID": "a1b2c3d4e5f6", + "NumPolonies": 50000, + "NumPoloniesBeforeTrimming": 100000, + "AssignedYield": 1.5, + "QualityScoreMean": 35.0, + "PercentQ30": 95.0, + "PercentQ40": 90.0, + "SampleStats": [ + { + "SampleID": "s1", + "SampleName": "Sample1", + "NumPolonies": 50000, + "Occurrences": [ + { + "ExpectedSequence": "AAATTT", + "NumPoloniesBeforeTrimming": 5000 + } + ] + } + ] +} diff --git a/multiqc/modules/bases2fastq/tests/fixtures/run_runstats_with_unassigned.json b/multiqc/modules/bases2fastq/tests/fixtures/run_runstats_with_unassigned.json new file mode 100644 index 0000000000..9f8479b1fb --- /dev/null +++ b/multiqc/modules/bases2fastq/tests/fixtures/run_runstats_with_unassigned.json @@ -0,0 +1,29 @@ +{ + "RunName": "RUN01", + "AnalysisID": "a1b2c3d4e5f6", + "NumPolonies": 50000, + "NumPoloniesBeforeTrimming": 100000, + "AssignedYield": 1.5, + "QualityScoreMean": 35.0, + "PercentQ30": 95.0, + "PercentQ40": 90.0, + "SampleStats": [ + { + "SampleID": "s1", + "SampleName": "Sample1", + "NumPolonies": 50000 + } + ], + "Lanes": [ + { + "Lane": 1, + "UnassignedSequences": [ + { + "I1": "AAA", + "I2": "TTT", + "Count": 100 + } + ] + } + ] +} diff --git a/multiqc/modules/bases2fastq/tests/test_bases2fastq.py b/multiqc/modules/bases2fastq/tests/test_bases2fastq.py new file mode 100644 index 0000000000..72ab3b4d3f --- /dev/null +++ b/multiqc/modules/bases2fastq/tests/test_bases2fastq.py @@ -0,0 +1,507 @@ +"""Tests for bases2fastq module: parsers and integration.""" + +import json +from pathlib import Path +from typing import Any, List +from unittest.mock import patch + +import pytest + +from multiqc import report, config +from multiqc.base_module import ModuleNoSamplesFound +from multiqc.types import LoadedFileDict + +from multiqc.modules.bases2fastq.bases2fastq import MultiqcModule, _get_min_polonies + + +def _load_fixture(fixtures_dir: Path, *parts: str) -> dict: + """Load JSON fixture; path is fixtures_dir / path0 / path1 / ... / filename.""" + path = fixtures_dir.joinpath(*parts) + with path.open() as f: + return json.load(f) + + +class TestExtractManifestLaneSettings: + """Tests for _extract_manifest_lane_settings helper.""" + + def test_extract_manifest_lane_settings_minimal(self, fixtures_dir): + """Manifest with one lane yields run_lane -> settings.""" + report.reset() + run_dir = fixtures_dir / "PairedEndDefaultProject" + report.analysis_files = [str(run_dir)] + report.search_files(["bases2fastq"]) + m = MultiqcModule() + run_manifest = _load_fixture(fixtures_dir, "PairedEndDefaultProject", "RunManifest.json") + result = m._extract_manifest_lane_settings(run_manifest, "RUN01-a1b2") + assert len(result) == 1 + run_lane = "RUN01-a1b2 | L1" + assert run_lane in result + assert result[run_lane]["AdapterTrimType"] == "Paired-End" + assert result[run_lane]["R1AdapterMinimumTrimmedLength"] == 16 + assert result[run_lane]["R2AdapterMinimumTrimmedLength"] == 16 + assert "Indexing" in result[run_lane] + + def test_extract_manifest_lane_settings_empty_settings(self, fixtures_dir, tmp_path): + """Manifest without Settings returns empty dict.""" + report.reset() + run_stats = _load_fixture(fixtures_dir, "PairedEndNoProject", "RunStats.json") + (tmp_path / "RunStats.json").write_text(json.dumps(run_stats)) + (tmp_path / "RunManifest.json").write_text(json.dumps({})) + report.analysis_files = [str(tmp_path)] + report.search_files(["bases2fastq"]) + m = MultiqcModule() + result = m._extract_manifest_lane_settings({}, "RUN01-a1b2") + assert result == {} + + +class TestBuildIndexAssignmentFromStats: + """Tests for _build_index_assignment_from_stats helper.""" + + def test_build_index_assignment_from_stats_with_occurrences(self, fixtures_dir): + """Project RunStats with Occurrences produces run_inner and percentages.""" + report.reset() + run_dir = fixtures_dir / "PairedEndDefaultProject" + report.analysis_files = [str(run_dir)] + report.search_files(["bases2fastq"]) + m = MultiqcModule() + project_stats = _load_fixture( + fixtures_dir, "PairedEndDefaultProject", "Samples", "DefaultProject", "DefaultProject_RunStats.json" + ) + run_inner, total = m._build_index_assignment_from_stats( + project_stats, "RUN01-a1b2", project="DefaultProject" + ) + assert total == 100000 + assert "AAATTT" in run_inner + assert run_inner["AAATTT"]["SamplePolonyCounts"] == 5000 + assert run_inner["AAATTT"]["PercentOfPolonies"] == 5.0 + + def test_build_index_assignment_from_stats_project(self, fixtures_dir): + """Project-level stats (Samples/DefaultProject) add Project key to entries.""" + report.reset() + run_dir = fixtures_dir / "PairedEndDefaultProject" + report.analysis_files = [str(run_dir)] + report.search_files(["bases2fastq"]) + m = MultiqcModule() + project_stats = _load_fixture( + fixtures_dir, "PairedEndDefaultProject", "Samples", "DefaultProject", "DefaultProject_RunStats.json" + ) + run_inner, _ = m._build_index_assignment_from_stats( + project_stats, "RUN01-a1b2", project="DefaultProject" + ) + assert run_inner + for entry in run_inner.values(): + assert entry.get("Project") == "DefaultProject" + + +class TestMergeManifestIndexSequences: + """Tests for _merge_manifest_index_sequences helper.""" + + def test_merge_manifest_index_sequences(self, fixtures_dir): + """Index1/Index2 from RunManifest Samples merged into assignment dict.""" + report.reset() + run_dir = fixtures_dir / "PairedEndDefaultProject" + report.analysis_files = [str(run_dir)] + report.search_files(["bases2fastq"]) + m = MultiqcModule() + project_stats = _load_fixture( + fixtures_dir, "PairedEndDefaultProject", "Samples", "DefaultProject", "DefaultProject_RunStats.json" + ) + run_manifest = _load_fixture(fixtures_dir, "PairedEndDefaultProject", "RunManifest.json") + run_inner, _ = m._build_index_assignment_from_stats( + project_stats, "RUN01-a1b2", project="DefaultProject" + ) + sample_to_index = {"RUN01-a1b2": run_inner} + m._merge_manifest_index_sequences(sample_to_index, run_manifest, "RUN01-a1b2") + assert run_inner["AAATTT"]["Index1"] == "AAA" + assert run_inner["AAATTT"]["Index2"] == "TTT" + + +class TestParseRunProjectData: + """Tests for run-level and project-level parsing.""" + + def test_parse_run_project_data_run_level(self, fixtures_dir): + """Run-level only (PairedEndNoProject) populates run and sample dicts.""" + report.reset() + run_dir = fixtures_dir / "PairedEndNoProject" + report.analysis_files = [str(run_dir)] + report.search_files(["bases2fastq"]) + m = MultiqcModule() + assert len(m.run_level_data) >= 1 + assert len(m.run_level_samples) >= 1 + sample_id = next(iter(m.run_level_samples)) + assert "__" in sample_id + assert sample_id in m.run_level_samples_to_project + + def test_parse_run_project_data_min_polonies_filter(self, fixtures_dir): + """Samples below min_polonies excluded (PairedEndNoProjectLowPolonies, config lowered).""" + report.reset() + run_dir = fixtures_dir / "PairedEndNoProjectLowPolonies" + report.analysis_files = [str(run_dir)] + report.search_files(["bases2fastq"]) + import multiqc.modules.bases2fastq.bases2fastq as b2f_mod + with patch.object(b2f_mod, "_get_min_polonies", return_value=100): + m = MultiqcModule() + assert len(m.run_level_samples) == 1 + sample_id = next(iter(m.run_level_samples)) + assert sample_id.endswith("__Sample2") + assert not any(s.endswith("__Sample1") for s in m.run_level_samples) + + +class TestParseRunUnassignedSequences: + """Tests for unassigned sequences parser.""" + + def test_parse_run_unassigned_sequences(self, fixtures_dir): + """RunStats with Lanes/UnassignedSequences (PairedEndNoProjectWithLanes) produces int-keyed dict.""" + report.reset() + run_dir = fixtures_dir / "PairedEndNoProjectWithLanes" + report.analysis_files = [str(run_dir)] + report.search_files(["bases2fastq"]) + m = MultiqcModule() + unassigned = m._parse_run_unassigned_sequences("bases2fastq/run") + assert isinstance(unassigned, dict) + for k, v in unassigned.items(): + assert isinstance(k, int) + assert "Run Name" in v + assert "I1" in v + assert "I2" in v + assert "Number of Polonies" in v + + +class TestGetMinPolonies: + """Tests for _get_min_polonies config helper.""" + + def test_get_min_polonies_default_when_config_not_dict(self): + with patch.object(config, "bases2fastq_config", None, create=True): + assert _get_min_polonies() == 1000 + with patch.object(config, "bases2fastq_config", "string", create=True): + assert _get_min_polonies() == 1000 + + def test_get_min_polonies_invalid_int_uses_default(self): + with patch.object(config, "bases2fastq_config", {"min_polonies": "bad"}, create=True): + assert _get_min_polonies() == 1000 + with patch.object(config, "bases2fastq_config", {"min_polonies": None}, create=True): + assert _get_min_polonies() == 1000 + + def test_get_min_polonies_custom_value(self): + with patch.object(config, "bases2fastq_config", {"min_polonies": 5000}, create=True): + assert _get_min_polonies() == 5000 + + +class TestValidatePath: + """Tests for _validate_path security check.""" + + def test_validate_path_escaped_returns_false(self, fixtures_dir, tmp_path): + report.reset() + report.analysis_files = [str(fixtures_dir / "PairedEndNoProject")] + report.search_files(["bases2fastq"]) + m = MultiqcModule() + base = tmp_path / "sub" + base.mkdir() + outside = base.parent.parent.resolve() + assert m._validate_path(outside / "any", base.resolve()) is False + + def test_validate_path_inside_returns_true(self, fixtures_dir): + report.reset() + run_dir = fixtures_dir / "PairedEndNoProject" + report.analysis_files = [str(run_dir)] + report.search_files(["bases2fastq"]) + m = MultiqcModule() + assert m._validate_path(run_dir / "RunStats.json", run_dir) is True + + +class TestReadJsonFile: + """Tests for _read_json_file with validation and errors.""" + + def test_read_json_file_path_outside_base_returns_none(self, fixtures_dir, tmp_path): + report.reset() + run_dir = fixtures_dir / "PairedEndNoProject" + report.analysis_files = [str(run_dir)] + report.search_files(["bases2fastq"]) + m = MultiqcModule() + outside = (tmp_path / "..").resolve() + assert m._read_json_file(outside / "any.json", base_directory=tmp_path) is None + + def test_read_json_file_missing_file_returns_none(self, fixtures_dir): + report.reset() + run_dir = fixtures_dir / "PairedEndNoProject" + report.analysis_files = [str(run_dir)] + report.search_files(["bases2fastq"]) + m = MultiqcModule() + assert m._read_json_file(run_dir / "DoesNotExist.json") is None + + def test_read_json_file_invalid_json_returns_none(self, fixtures_dir, tmp_path): + report.reset() + run_dir = fixtures_dir / "PairedEndNoProject" + report.analysis_files = [str(run_dir)] + report.search_files(["bases2fastq"]) + m = MultiqcModule() + bad = tmp_path / "bad.json" + bad.write_text("not json {") + assert m._read_json_file(bad) is None + + +class TestExtractRunAnalysisName: + """Tests for _extract_run_analysis_name.""" + + def test_extract_run_analysis_name_missing_runname_returns_none(self, fixtures_dir): + report.reset() + report.analysis_files = [str(fixtures_dir / "PairedEndNoProject")] + report.search_files(["bases2fastq"]) + m = MultiqcModule() + assert m._extract_run_analysis_name({"AnalysisID": "a1b2"}, "test") is None + + def test_extract_run_analysis_name_missing_analysisid_returns_none(self, fixtures_dir): + report.reset() + report.analysis_files = [str(fixtures_dir / "PairedEndNoProject")] + report.search_files(["bases2fastq"]) + m = MultiqcModule() + assert m._extract_run_analysis_name({"RunName": "RUN01"}, "test") is None + + def test_extract_run_analysis_name_ok(self, fixtures_dir): + report.reset() + report.analysis_files = [str(fixtures_dir / "PairedEndNoProject")] + report.search_files(["bases2fastq"]) + m = MultiqcModule() + assert m._extract_run_analysis_name({"RunName": "RUN01", "AnalysisID": "a1b2c3d4"}) == "RUN01-a1b2" + + +class TestParseRunProjectDataEdgeCases: + """Edge cases for _parse_run_project_data.""" + + def test_parse_run_project_data_empty_data_source_returns_empty(self, fixtures_dir): + report.reset() + report.analysis_files = [str(fixtures_dir / "PairedEndNoProject")] + report.search_files(["bases2fastq"]) + m = MultiqcModule() + run_data, sample_data, sample_to_project = m._parse_run_project_data("", log_files=[]) + assert run_data == {} + assert sample_data == {} + assert sample_to_project == {} + + def test_parse_run_project_data_ignore_sample_skips_run(self, fixtures_dir): + report.reset() + run_dir = fixtures_dir / "PairedEndNoProject" + report.analysis_files = [str(run_dir)] + report.search_files(["bases2fastq"]) + run_stats = _load_fixture(fixtures_dir, "PairedEndNoProject", "RunStats.json") + log_files: List[LoadedFileDict[Any]] = [ + { + "f": json.dumps(run_stats), + "root": str(run_dir), + "fn": "RunStats.json", + "sp_key": "bases2fastq/run", + "s_name": "RUN01-a1b2", + } + ] + m = MultiqcModule() + with patch.object(m, "is_ignore_sample", return_value=True): + run_data, sample_data, _ = m._parse_run_project_data("bases2fastq/run", log_files=log_files) + assert run_data == {} + assert sample_data == {} + + +class TestBuildIndexAssignmentEdgeCases: + """Edge cases for _build_index_assignment_from_stats.""" + + def test_build_index_assignment_no_samplestats_returns_empty(self, fixtures_dir): + report.reset() + report.analysis_files = [str(fixtures_dir / "PairedEndNoProject")] + report.search_files(["bases2fastq"]) + m = MultiqcModule() + run_inner, total = m._build_index_assignment_from_stats( + {"NumPoloniesBeforeTrimming": 1000}, "RUN01-a1b2" + ) + assert run_inner == {} + assert total == 1000 + + def test_build_index_assignment_sample_without_occurrences_skipped(self, fixtures_dir): + report.reset() + run_dir = fixtures_dir / "PairedEndDefaultProject" + report.analysis_files = [str(run_dir)] + report.search_files(["bases2fastq"]) + m = MultiqcModule() + # Stats with one sample that has no Occurrences + stats = { + "RunName": "RUN01", + "AnalysisID": "a1b2c3d4", + "NumPoloniesBeforeTrimming": 50000, + "SampleStats": [ + {"SampleID": "s1", "SampleName": "S1", "NumPolonies": 100}, + ], + } + run_inner, total = m._build_index_assignment_from_stats(stats, "RUN01-a1b2") + assert run_inner == {} + assert total == 50000 + + +class TestMergeManifestIndexSequencesEdgeCases: + """Edge cases for _merge_manifest_index_sequences.""" + + def test_merge_manifest_no_samples_returns_early(self, fixtures_dir): + report.reset() + report.analysis_files = [str(fixtures_dir / "PairedEndNoProject")] + report.search_files(["bases2fastq"]) + m = MultiqcModule() + sample_to_index = {"RUN01-a1b2": {"AAATTT": {}}} + m._merge_manifest_index_sequences(sample_to_index, {}, "RUN01-a1b2") + assert sample_to_index["RUN01-a1b2"]["AAATTT"].get("Index1", "") == "" + + def test_merge_manifest_run_not_in_assignment_returns_early(self, fixtures_dir): + report.reset() + report.analysis_files = [str(fixtures_dir / "PairedEndNoProject")] + report.search_files(["bases2fastq"]) + m = MultiqcModule() + sample_to_index = {} + m._merge_manifest_index_sequences( + sample_to_index, {"Samples": [{"SampleName": "S1", "Indexes": [{"Index1": "A", "Index2": "T"}]}]}, "RUN01-a1b2" + ) + assert sample_to_index == {} + + def test_merge_manifest_merged_indices_not_in_run_data_skipped(self, fixtures_dir): + report.reset() + report.analysis_files = [str(fixtures_dir / "PairedEndNoProject")] + report.search_files(["bases2fastq"]) + m = MultiqcModule() + run_inner = {"AAATTT": {"SampleID": "RUN01-a1b2__S1"}} + sample_to_index = {"RUN01-a1b2": run_inner} + m._merge_manifest_index_sequences( + sample_to_index, + {"Samples": [{"SampleName": "S1", "Indexes": [{"Index1": "XXX", "Index2": "YYY"}]}]}, + "RUN01-a1b2", + ) + assert run_inner["AAATTT"].get("Index1", "") == "" + + +class TestParseRunUnassignedEdgeCases: + """Edge cases for _parse_run_unassigned_sequences.""" + + def test_parse_run_unassigned_empty_data_source_returns_empty(self, fixtures_dir): + report.reset() + report.analysis_files = [str(fixtures_dir / "PairedEndNoProject")] + report.search_files(["bases2fastq"]) + m = MultiqcModule() + assert m._parse_run_unassigned_sequences("") == {} + + def test_parse_run_unassigned_no_lanes_skipped(self, fixtures_dir): + report.reset() + run_dir = fixtures_dir / "PairedEndNoProject" + report.analysis_files = [str(run_dir)] + report.search_files(["bases2fastq"]) + m = MultiqcModule() + unassigned = m._parse_run_unassigned_sequences("bases2fastq/run") + assert unassigned == {} + + +class TestModuleNoSamplesFound: + """Tests that ModuleNoSamplesFound is raised when no data.""" + + def test_no_log_files_raises(self, tmp_path): + report.reset() + empty_dir = tmp_path / "empty" + empty_dir.mkdir() + report.analysis_files = [str(empty_dir)] + report.search_files(["bases2fastq"]) + with pytest.raises(ModuleNoSamplesFound): + MultiqcModule() + + +class TestProjectLevelPath: + """Tests for project_level summary path (tabulate_project_stats, manifest in project).""" + + def test_project_level_only_produces_sections(self, fixtures_dir, tmp_path): + """Directory with only project-level RunStats (no run-level) uses project_level path.""" + report.reset() + project_stats = _load_fixture( + fixtures_dir, "PairedEndDefaultProject", "Samples", "DefaultProject", "DefaultProject_RunStats.json" + ) + manifest = _load_fixture(fixtures_dir, "PairedEndDefaultProject", "RunManifest.json") + (tmp_path / "Samples" / "DefaultProject").mkdir(parents=True) + (tmp_path / "Samples" / "DefaultProject" / "DefaultProject_RunStats.json").write_text( + json.dumps(project_stats) + ) + (tmp_path / "RunManifest.json").write_text(json.dumps(manifest)) + report.analysis_files = [str(tmp_path)] + report.search_files(["bases2fastq"]) + config.strict = True + m = MultiqcModule() + assert len(m.project_level_data) >= 1 + assert len(m.run_level_data) == 0 + assert len(m.sections) > 0 + + +class TestSelectDataBySummaryPath: + """Tests for _select_data_by_summary_path branches.""" + + def test_select_data_project_level(self, fixtures_dir): + report.reset() + run_dir = fixtures_dir / "PairedEndDefaultProject" + report.analysis_files = [str(run_dir)] + report.search_files(["bases2fastq"]) + m = MultiqcModule() + run_data, sample_data, samples_to_projects, manifest_data, index_data, unassigned = m._select_data_by_summary_path( + "project_level" + ) + assert run_data is m.project_level_data + assert sample_data is m.project_level_samples + assert unassigned == {} + + def test_select_data_combined_level(self, fixtures_dir): + report.reset() + run_dir = fixtures_dir / "PairedEndDefaultProject" + report.analysis_files = [str(run_dir)] + report.search_files(["bases2fastq"]) + m = MultiqcModule() + run_data, sample_data, samples_to_projects, manifest_data, index_data, unassigned = m._select_data_by_summary_path( + "combined_level" + ) + assert run_data is m.run_level_data + assert sample_data is m.project_level_samples + assert isinstance(unassigned, dict) + + +class TestParseIndexAssignmentEdgeCases: + """Edge cases for _parse_index_assignment.""" + + def test_parse_index_assignment_runstats_missing_samplestats(self, fixtures_dir, tmp_path): + report.reset() + run_stats = {"RunName": "RUN01", "AnalysisID": "a1b2c3d4", "NumPolonies": 100} + (tmp_path / "RunStats.json").write_text(json.dumps(run_stats)) + (tmp_path / "RunManifest.json").write_text(json.dumps({"Settings": [{"Lane": 1}]})) + report.analysis_files = [str(fixtures_dir / "PairedEndNoProject"), str(tmp_path)] + report.search_files(["bases2fastq"]) + m = MultiqcModule() + result = m._parse_index_assignment("bases2fastq/manifest") + assert isinstance(result, dict) + + +def _test_data_bases2fastq_dir(): + """Path to test-data/data/modules/bases2fastq (used for skipif, no fixture).""" + repo_root = Path(__file__).resolve().parents[4] + return repo_root / "test-data" / "data" / "modules" / "bases2fastq" + + +class TestIntegration: + """Integration test using test-data repo (skipped when absent).""" + + @pytest.mark.skipif( + not _test_data_bases2fastq_dir().exists(), + reason="test-data/data/modules/bases2fastq not found (clone test-data repo)", + ) + def test_module_run_with_test_data(self, data_dir): + """Full module run against test-data repo produces sections and general stats.""" + report.reset() + mod_dir = data_dir / "modules" / "bases2fastq" + report.analysis_files = [str(mod_dir)] + report.search_files(["bases2fastq"]) + config.strict = True + m = MultiqcModule() + # Test-data has multiple run roots (WGS, WES, PairedEndNoProject, PairedEndDefaultProject, etc.) + assert len(m.run_level_data) >= 2, "expected at least 2 runs from test-data" + # At least one project-level layout (PairedEndDefaultProject* or PairedEndProjects) + assert len(m.project_level_data) >= 1, "expected at least 1 project from test-data" + total_samples = len(m.run_level_samples) + len(m.project_level_samples) + assert total_samples >= 10, "expected at least 10 samples from test-data" + # Module must produce output (general stats and/or sections) + assert len(report.general_stats_data) > 0 or len(m.sections) > 0, ( + "expected general stats or report sections to be populated" + ) From fb097385a505caf009d1bb091fab0573a7149ad6 Mon Sep 17 00:00:00 2001 From: Carlos Ruiz Date: Sat, 21 Feb 2026 13:55:48 -0800 Subject: [PATCH 3/6] Linting --- multiqc/modules/bases2fastq/bases2fastq.py | 30 ++++------------- multiqc/modules/bases2fastq/plot_runs.py | 12 +++---- .../bases2fastq/tests/test_bases2fastq.py | 33 ++++++++----------- 3 files changed, 25 insertions(+), 50 deletions(-) diff --git a/multiqc/modules/bases2fastq/bases2fastq.py b/multiqc/modules/bases2fastq/bases2fastq.py index 68135b4fb5..d170241de1 100644 --- a/multiqc/modules/bases2fastq/bases2fastq.py +++ b/multiqc/modules/bases2fastq/bases2fastq.py @@ -415,8 +415,6 @@ def _setup_colors( # Create run and project groups run_groups: Dict[str, List] = defaultdict(list) project_groups: Dict[str, List] = defaultdict(list) - # Only populated when summary_path == "project_level"; empty for run_level/combined_level - in_project_sample_groups: Dict[str, List] = defaultdict(list) ind_sample_groups: Dict[str, List] = defaultdict(list) for sample in natsorted(sample_data.keys()): @@ -425,10 +423,8 @@ def _setup_colors( sample_project = samples_to_projects.get(sample, "DefaultProject") project_groups[sample_project].append(sample) ind_sample_groups[sample] = [sample] - if summary_path == "project_level": - in_project_sample_groups[sample].append(sample) - merged_groups = {**run_groups, **project_groups, **in_project_sample_groups, **ind_sample_groups} + merged_groups = {**run_groups, **project_groups, **ind_sample_groups} # Build color palette self.color_getter = mqc_colour.mqc_colour_scale() @@ -659,12 +655,8 @@ def _extract_manifest_lane_settings( indexing = f"{' + '.join(indices_cycles)}
{' + '.join(indices)}" result[run_lane]["Indexing"] = indexing result[run_lane]["AdapterTrimType"] = lane_data.get("AdapterTrimType", "N/A") - result[run_lane]["R1AdapterMinimumTrimmedLength"] = lane_data.get( - "R1AdapterMinimumTrimmedLength", "N/A" - ) - result[run_lane]["R2AdapterMinimumTrimmedLength"] = lane_data.get( - "R2AdapterMinimumTrimmedLength", "N/A" - ) + result[run_lane]["R1AdapterMinimumTrimmedLength"] = lane_data.get("R1AdapterMinimumTrimmedLength", "N/A") + result[run_lane]["R2AdapterMinimumTrimmedLength"] = lane_data.get("R2AdapterMinimumTrimmedLength", "N/A") return result def _parse_run_manifest(self, data_source: str) -> Dict[str, Any]: @@ -708,9 +700,7 @@ def _parse_run_manifest(self, data_source: str) -> Dict[str, Any]: f" section not found in {directory}/RunManifest.json.\nSkipping RunManifest metrics." ) else: - runs_manifest_data.update( - self._extract_manifest_lane_settings(run_manifest, run_analysis_name) - ) + runs_manifest_data.update(self._extract_manifest_lane_settings(run_manifest, run_analysis_name)) self.add_data_source(f=f, s_name=run_analysis_name, module="bases2fastq") @@ -764,9 +754,7 @@ def _parse_run_manifest_in_project(self, data_source: str) -> Dict[str, Any]: if "Settings" not in run_manifest_data: log.warning(f" section not found in {run_manifest}.\nSkipping RunManifest metrics.") else: - project_manifest_data.update( - self._extract_manifest_lane_settings(run_manifest_data, run_analysis_name) - ) + project_manifest_data.update(self._extract_manifest_lane_settings(run_manifest_data, run_analysis_name)) data_source_info: LoadedFileDict[Any] = { "fn": str(run_manifest.name), "root": str(run_manifest.parent), @@ -1025,9 +1013,7 @@ def _parse_index_assignment_in_project(self, data_source: str) -> Dict[str, Any] ) continue - run_inner, _ = self._build_index_assignment_from_stats( - project_stats, run_analysis_name, project=project - ) + run_inner, _ = self._build_index_assignment_from_stats(project_stats, run_analysis_name, project=project) sample_to_index_assignment[run_analysis_name] = run_inner run_manifest_data = self._read_json_file(run_manifest, base_directory=base_directory) @@ -1042,9 +1028,7 @@ def _parse_index_assignment_in_project(self, data_source: str) -> Dict[str, Any] elif len(sample_to_index_assignment) == 0: log.warning("Index assignment data missing. Skipping creation of index assignment metrics.") else: - self._merge_manifest_index_sequences( - sample_to_index_assignment, run_manifest_data, run_analysis_name - ) + self._merge_manifest_index_sequences(sample_to_index_assignment, run_manifest_data, run_analysis_name) return sample_to_index_assignment diff --git a/multiqc/modules/bases2fastq/plot_runs.py b/multiqc/modules/bases2fastq/plot_runs.py index d1c2e3e524..39499dc723 100644 --- a/multiqc/modules/bases2fastq/plot_runs.py +++ b/multiqc/modules/bases2fastq/plot_runs.py @@ -653,12 +653,7 @@ def plot_base_quality_hist(run_data, color_dict): def plot_base_quality_by_cycle(run_data, color_dict): # Prepare plot data for median BQ of each cycle (skip runs without Reads/Cycles) - runs_with_reads = [ - s - for s in run_data - if _run_has_reads(run_data[s]) - and run_data[s]["Reads"][0].get("Cycles") - ] + runs_with_reads = [s for s in run_data if _run_has_reads(run_data[s]) and run_data[s]["Reads"][0].get("Cycles")] if not runs_with_reads: plot_content: list[Any] = [] plot_html = linegraph.plot( @@ -770,7 +765,10 @@ def plot_base_quality_by_cycle(run_data, color_dict): paired_end = True if len(run_data[s_name]["Reads"]) > 1 else False cycle_dict = dict() R1CycleNum = len(run_data[s_name]["Reads"][0]["Cycles"]) - if not run_data[s_name]["Reads"][0]["Cycles"] or "PercentBelowFilterThreshold" not in run_data[s_name]["Reads"][0]["Cycles"][0]: + if ( + not run_data[s_name]["Reads"][0]["Cycles"] + or "PercentBelowFilterThreshold" not in run_data[s_name]["Reads"][0]["Cycles"][0] + ): continue for cycle in run_data[s_name]["Reads"][0]["Cycles"]: cycle_no = int(cycle["Cycle"]) diff --git a/multiqc/modules/bases2fastq/tests/test_bases2fastq.py b/multiqc/modules/bases2fastq/tests/test_bases2fastq.py index 72ab3b4d3f..cc498aa8ae 100644 --- a/multiqc/modules/bases2fastq/tests/test_bases2fastq.py +++ b/multiqc/modules/bases2fastq/tests/test_bases2fastq.py @@ -67,9 +67,7 @@ def test_build_index_assignment_from_stats_with_occurrences(self, fixtures_dir): project_stats = _load_fixture( fixtures_dir, "PairedEndDefaultProject", "Samples", "DefaultProject", "DefaultProject_RunStats.json" ) - run_inner, total = m._build_index_assignment_from_stats( - project_stats, "RUN01-a1b2", project="DefaultProject" - ) + run_inner, total = m._build_index_assignment_from_stats(project_stats, "RUN01-a1b2", project="DefaultProject") assert total == 100000 assert "AAATTT" in run_inner assert run_inner["AAATTT"]["SamplePolonyCounts"] == 5000 @@ -85,9 +83,7 @@ def test_build_index_assignment_from_stats_project(self, fixtures_dir): project_stats = _load_fixture( fixtures_dir, "PairedEndDefaultProject", "Samples", "DefaultProject", "DefaultProject_RunStats.json" ) - run_inner, _ = m._build_index_assignment_from_stats( - project_stats, "RUN01-a1b2", project="DefaultProject" - ) + run_inner, _ = m._build_index_assignment_from_stats(project_stats, "RUN01-a1b2", project="DefaultProject") assert run_inner for entry in run_inner.values(): assert entry.get("Project") == "DefaultProject" @@ -107,9 +103,7 @@ def test_merge_manifest_index_sequences(self, fixtures_dir): fixtures_dir, "PairedEndDefaultProject", "Samples", "DefaultProject", "DefaultProject_RunStats.json" ) run_manifest = _load_fixture(fixtures_dir, "PairedEndDefaultProject", "RunManifest.json") - run_inner, _ = m._build_index_assignment_from_stats( - project_stats, "RUN01-a1b2", project="DefaultProject" - ) + run_inner, _ = m._build_index_assignment_from_stats(project_stats, "RUN01-a1b2", project="DefaultProject") sample_to_index = {"RUN01-a1b2": run_inner} m._merge_manifest_index_sequences(sample_to_index, run_manifest, "RUN01-a1b2") assert run_inner["AAATTT"]["Index1"] == "AAA" @@ -139,6 +133,7 @@ def test_parse_run_project_data_min_polonies_filter(self, fixtures_dir): report.analysis_files = [str(run_dir)] report.search_files(["bases2fastq"]) import multiqc.modules.bases2fastq.bases2fastq as b2f_mod + with patch.object(b2f_mod, "_get_min_polonies", return_value=100): m = MultiqcModule() assert len(m.run_level_samples) == 1 @@ -308,9 +303,7 @@ def test_build_index_assignment_no_samplestats_returns_empty(self, fixtures_dir) report.analysis_files = [str(fixtures_dir / "PairedEndNoProject")] report.search_files(["bases2fastq"]) m = MultiqcModule() - run_inner, total = m._build_index_assignment_from_stats( - {"NumPoloniesBeforeTrimming": 1000}, "RUN01-a1b2" - ) + run_inner, total = m._build_index_assignment_from_stats({"NumPoloniesBeforeTrimming": 1000}, "RUN01-a1b2") assert run_inner == {} assert total == 1000 @@ -353,7 +346,9 @@ def test_merge_manifest_run_not_in_assignment_returns_early(self, fixtures_dir): m = MultiqcModule() sample_to_index = {} m._merge_manifest_index_sequences( - sample_to_index, {"Samples": [{"SampleName": "S1", "Indexes": [{"Index1": "A", "Index2": "T"}]}]}, "RUN01-a1b2" + sample_to_index, + {"Samples": [{"SampleName": "S1", "Indexes": [{"Index1": "A", "Index2": "T"}]}]}, + "RUN01-a1b2", ) assert sample_to_index == {} @@ -416,9 +411,7 @@ def test_project_level_only_produces_sections(self, fixtures_dir, tmp_path): ) manifest = _load_fixture(fixtures_dir, "PairedEndDefaultProject", "RunManifest.json") (tmp_path / "Samples" / "DefaultProject").mkdir(parents=True) - (tmp_path / "Samples" / "DefaultProject" / "DefaultProject_RunStats.json").write_text( - json.dumps(project_stats) - ) + (tmp_path / "Samples" / "DefaultProject" / "DefaultProject_RunStats.json").write_text(json.dumps(project_stats)) (tmp_path / "RunManifest.json").write_text(json.dumps(manifest)) report.analysis_files = [str(tmp_path)] report.search_files(["bases2fastq"]) @@ -438,8 +431,8 @@ def test_select_data_project_level(self, fixtures_dir): report.analysis_files = [str(run_dir)] report.search_files(["bases2fastq"]) m = MultiqcModule() - run_data, sample_data, samples_to_projects, manifest_data, index_data, unassigned = m._select_data_by_summary_path( - "project_level" + run_data, sample_data, samples_to_projects, manifest_data, index_data, unassigned = ( + m._select_data_by_summary_path("project_level") ) assert run_data is m.project_level_data assert sample_data is m.project_level_samples @@ -451,8 +444,8 @@ def test_select_data_combined_level(self, fixtures_dir): report.analysis_files = [str(run_dir)] report.search_files(["bases2fastq"]) m = MultiqcModule() - run_data, sample_data, samples_to_projects, manifest_data, index_data, unassigned = m._select_data_by_summary_path( - "combined_level" + run_data, sample_data, samples_to_projects, manifest_data, index_data, unassigned = ( + m._select_data_by_summary_path("combined_level") ) assert run_data is m.run_level_data assert sample_data is m.project_level_samples From bd4f5de0c4a097c0d585a3c3fa129cce540ea1c0 Mon Sep 17 00:00:00 2001 From: Carlos Ruiz Date: Sat, 21 Feb 2026 14:00:13 -0800 Subject: [PATCH 4/6] Prettier linting --- .../PairedEndDefaultProject/RunManifest.json | 38 ++++++------- .../PairedEndDefaultProject/RunStats.json | 34 +++++------ .../DefaultProject_RunStats.json | 46 +++++++-------- .../fixtures/PairedEndNoProject/RunStats.json | 34 +++++------ .../RunStats.json | 44 +++++++-------- .../PairedEndNoProjectWithLanes/RunStats.json | 56 +++++++++---------- .../tests/fixtures/run_runstats.json | 2 +- multiqc/templates/default/package-lock.json | 5 ++ 8 files changed, 132 insertions(+), 127 deletions(-) diff --git a/multiqc/modules/bases2fastq/tests/fixtures/PairedEndDefaultProject/RunManifest.json b/multiqc/modules/bases2fastq/tests/fixtures/PairedEndDefaultProject/RunManifest.json index 4365e3e6cd..0a334a49bf 100644 --- a/multiqc/modules/bases2fastq/tests/fixtures/PairedEndDefaultProject/RunManifest.json +++ b/multiqc/modules/bases2fastq/tests/fixtures/PairedEndDefaultProject/RunManifest.json @@ -1,22 +1,22 @@ { - "Settings": [ + "Settings": [ + { + "Lane": 1, + "AdapterTrimType": "Paired-End", + "R1AdapterMinimumTrimmedLength": 16, + "R2AdapterMinimumTrimmedLength": 16 + } + ], + "Samples": [ + { + "SampleName": "Sample1", + "Indexes": [ { - "Lane": 1, - "AdapterTrimType": "Paired-End", - "R1AdapterMinimumTrimmedLength": 16, - "R2AdapterMinimumTrimmedLength": 16 + "Lane": 1, + "Index1": "AAA", + "Index2": "TTT" } - ], - "Samples": [ - { - "SampleName": "Sample1", - "Indexes": [ - { - "Lane": 1, - "Index1": "AAA", - "Index2": "TTT" - } - ] - } - ] -} \ No newline at end of file + ] + } + ] +} diff --git a/multiqc/modules/bases2fastq/tests/fixtures/PairedEndDefaultProject/RunStats.json b/multiqc/modules/bases2fastq/tests/fixtures/PairedEndDefaultProject/RunStats.json index e0f2afee0f..50c4a71b92 100644 --- a/multiqc/modules/bases2fastq/tests/fixtures/PairedEndDefaultProject/RunStats.json +++ b/multiqc/modules/bases2fastq/tests/fixtures/PairedEndDefaultProject/RunStats.json @@ -1,18 +1,18 @@ { - "RunName": "RUN01", - "AnalysisID": "a1b2c3d4e5f6", - "NumPolonies": 50000, - "AssignedYield": 1.5, - "QualityScoreMean": 35, - "PercentQ30": 95, - "PercentQ40": 90, - "PercentAssignedReads": 100.0, - "PercentMismatch": 0, - "SampleStats": [ - { - "SampleID": "s1", - "SampleName": "Sample1", - "NumPolonies": 50000 - } - ] -} \ No newline at end of file + "RunName": "RUN01", + "AnalysisID": "a1b2c3d4e5f6", + "NumPolonies": 50000, + "AssignedYield": 1.5, + "QualityScoreMean": 35, + "PercentQ30": 95, + "PercentQ40": 90, + "PercentAssignedReads": 100.0, + "PercentMismatch": 0, + "SampleStats": [ + { + "SampleID": "s1", + "SampleName": "Sample1", + "NumPolonies": 50000 + } + ] +} diff --git a/multiqc/modules/bases2fastq/tests/fixtures/PairedEndDefaultProject/Samples/DefaultProject/DefaultProject_RunStats.json b/multiqc/modules/bases2fastq/tests/fixtures/PairedEndDefaultProject/Samples/DefaultProject/DefaultProject_RunStats.json index 29f960f0ec..71a39be1fd 100644 --- a/multiqc/modules/bases2fastq/tests/fixtures/PairedEndDefaultProject/Samples/DefaultProject/DefaultProject_RunStats.json +++ b/multiqc/modules/bases2fastq/tests/fixtures/PairedEndDefaultProject/Samples/DefaultProject/DefaultProject_RunStats.json @@ -1,26 +1,26 @@ { - "RunName": "RUN01", - "AnalysisID": "a1b2c3d4e5f6", - "Project": "DefaultProject", - "NumPolonies": 50000, - "NumPoloniesBeforeTrimming": 100000, - "AssignedYield": 1.5, - "QualityScoreMean": 35, - "PercentQ30": 95, - "PercentQ40": 90, - "PercentAssignedReads": 100.0, - "PercentMismatch": 0, - "SampleStats": [ + "RunName": "RUN01", + "AnalysisID": "a1b2c3d4e5f6", + "Project": "DefaultProject", + "NumPolonies": 50000, + "NumPoloniesBeforeTrimming": 100000, + "AssignedYield": 1.5, + "QualityScoreMean": 35, + "PercentQ30": 95, + "PercentQ40": 90, + "PercentAssignedReads": 100.0, + "PercentMismatch": 0, + "SampleStats": [ + { + "SampleID": "s1", + "SampleName": "Sample1", + "NumPolonies": 50000, + "Occurrences": [ { - "SampleID": "s1", - "SampleName": "Sample1", - "NumPolonies": 50000, - "Occurrences": [ - { - "ExpectedSequence": "AAATTT", - "NumPoloniesBeforeTrimming": 5000 - } - ] + "ExpectedSequence": "AAATTT", + "NumPoloniesBeforeTrimming": 5000 } - ] -} \ No newline at end of file + ] + } + ] +} diff --git a/multiqc/modules/bases2fastq/tests/fixtures/PairedEndNoProject/RunStats.json b/multiqc/modules/bases2fastq/tests/fixtures/PairedEndNoProject/RunStats.json index e0f2afee0f..50c4a71b92 100644 --- a/multiqc/modules/bases2fastq/tests/fixtures/PairedEndNoProject/RunStats.json +++ b/multiqc/modules/bases2fastq/tests/fixtures/PairedEndNoProject/RunStats.json @@ -1,18 +1,18 @@ { - "RunName": "RUN01", - "AnalysisID": "a1b2c3d4e5f6", - "NumPolonies": 50000, - "AssignedYield": 1.5, - "QualityScoreMean": 35, - "PercentQ30": 95, - "PercentQ40": 90, - "PercentAssignedReads": 100.0, - "PercentMismatch": 0, - "SampleStats": [ - { - "SampleID": "s1", - "SampleName": "Sample1", - "NumPolonies": 50000 - } - ] -} \ No newline at end of file + "RunName": "RUN01", + "AnalysisID": "a1b2c3d4e5f6", + "NumPolonies": 50000, + "AssignedYield": 1.5, + "QualityScoreMean": 35, + "PercentQ30": 95, + "PercentQ40": 90, + "PercentAssignedReads": 100.0, + "PercentMismatch": 0, + "SampleStats": [ + { + "SampleID": "s1", + "SampleName": "Sample1", + "NumPolonies": 50000 + } + ] +} diff --git a/multiqc/modules/bases2fastq/tests/fixtures/PairedEndNoProjectLowPolonies/RunStats.json b/multiqc/modules/bases2fastq/tests/fixtures/PairedEndNoProjectLowPolonies/RunStats.json index feef607045..c56f7c5ef6 100644 --- a/multiqc/modules/bases2fastq/tests/fixtures/PairedEndNoProjectLowPolonies/RunStats.json +++ b/multiqc/modules/bases2fastq/tests/fixtures/PairedEndNoProjectLowPolonies/RunStats.json @@ -1,23 +1,23 @@ { - "RunName": "RUN01", - "AnalysisID": "a1b2c3d4e5f6", - "NumPolonies": 50050, - "AssignedYield": 1.5, - "QualityScoreMean": 35, - "PercentQ30": 95, - "PercentQ40": 90, - "PercentAssignedReads": 100.0, - "PercentMismatch": 0, - "SampleStats": [ - { - "SampleID": "s1", - "SampleName": "Sample1", - "NumPolonies": 50 - }, - { - "SampleID": "s2", - "SampleName": "Sample2", - "NumPolonies": 50000 - } - ] -} \ No newline at end of file + "RunName": "RUN01", + "AnalysisID": "a1b2c3d4e5f6", + "NumPolonies": 50050, + "AssignedYield": 1.5, + "QualityScoreMean": 35, + "PercentQ30": 95, + "PercentQ40": 90, + "PercentAssignedReads": 100.0, + "PercentMismatch": 0, + "SampleStats": [ + { + "SampleID": "s1", + "SampleName": "Sample1", + "NumPolonies": 50 + }, + { + "SampleID": "s2", + "SampleName": "Sample2", + "NumPolonies": 50000 + } + ] +} diff --git a/multiqc/modules/bases2fastq/tests/fixtures/PairedEndNoProjectWithLanes/RunStats.json b/multiqc/modules/bases2fastq/tests/fixtures/PairedEndNoProjectWithLanes/RunStats.json index 1ff24f6fdc..69269bfe48 100644 --- a/multiqc/modules/bases2fastq/tests/fixtures/PairedEndNoProjectWithLanes/RunStats.json +++ b/multiqc/modules/bases2fastq/tests/fixtures/PairedEndNoProjectWithLanes/RunStats.json @@ -1,31 +1,31 @@ { - "RunName": "RUN01", - "AnalysisID": "a1b2c3d4e5f6", - "NumPolonies": 50000, - "NumPoloniesBeforeTrimming": 100000, - "AssignedYield": 1.5, - "QualityScoreMean": 35, - "PercentQ30": 95, - "PercentQ40": 90, - "PercentAssignedReads": 100.0, - "PercentMismatch": 0, - "SampleStats": [ + "RunName": "RUN01", + "AnalysisID": "a1b2c3d4e5f6", + "NumPolonies": 50000, + "NumPoloniesBeforeTrimming": 100000, + "AssignedYield": 1.5, + "QualityScoreMean": 35, + "PercentQ30": 95, + "PercentQ40": 90, + "PercentAssignedReads": 100.0, + "PercentMismatch": 0, + "SampleStats": [ + { + "SampleID": "s1", + "SampleName": "Sample1", + "NumPolonies": 50000 + } + ], + "Lanes": [ + { + "Lane": 1, + "UnassignedSequences": [ { - "SampleID": "s1", - "SampleName": "Sample1", - "NumPolonies": 50000 + "I1": "AAA", + "I2": "TTT", + "Count": 100 } - ], - "Lanes": [ - { - "Lane": 1, - "UnassignedSequences": [ - { - "I1": "AAA", - "I2": "TTT", - "Count": 100 - } - ] - } - ] -} \ No newline at end of file + ] + } + ] +} diff --git a/multiqc/modules/bases2fastq/tests/fixtures/run_runstats.json b/multiqc/modules/bases2fastq/tests/fixtures/run_runstats.json index 7e1812b321..50c4a71b92 100644 --- a/multiqc/modules/bases2fastq/tests/fixtures/run_runstats.json +++ b/multiqc/modules/bases2fastq/tests/fixtures/run_runstats.json @@ -15,4 +15,4 @@ "NumPolonies": 50000 } ] -} \ No newline at end of file +} diff --git a/multiqc/templates/default/package-lock.json b/multiqc/templates/default/package-lock.json index 4041ea6cf3..38246e4ddf 100644 --- a/multiqc/templates/default/package-lock.json +++ b/multiqc/templates/default/package-lock.json @@ -806,6 +806,7 @@ "integrity": "sha512-P1st0aksCrn9sGZhp8GMYwBnQsbvAWsZAX44oXNNvLHGqAOcoVxmjZiohstwQ7SqKnbR47akdNi+uleWD8+g6A==", "dev": true, "license": "MIT", + "peer": true, "funding": { "type": "opencollective", "url": "https://opencollective.com/popperjs" @@ -1454,6 +1455,7 @@ "integrity": "sha512-xCmtksBKd/jdJ9Bt9p7nPKiuqrlBMBuuGkQlkhZjjQk3Ty48lv93k5Dq6OPkKt4XwxDJ7tvlfrTa1MPA9bf+QA==", "dev": true, "license": "MIT", + "peer": true, "dependencies": { "chokidar": "^4.0.0", "immutable": "^5.0.2", @@ -1506,6 +1508,7 @@ "integrity": "sha512-+6erLbBm0+LROX2sPXlUYx/ux5PyE9K/a92Wrt6oA+WDAoFTdpHE5tCYCI5PNzq2y8df4rA+QgHLJuR4jNymsg==", "dev": true, "license": "BSD-2-Clause", + "peer": true, "dependencies": { "@jridgewell/source-map": "^0.3.3", "acorn": "^8.14.0", @@ -1557,6 +1560,7 @@ "integrity": "sha512-M7BAV6Rlcy5u+m6oPhAPFgJTzAioX/6B0DxyvDlo9l8+T3nLKbrczg2WLUyzd45L8RqfUMyGPzekbMvX2Ldkwg==", "dev": true, "license": "MIT", + "peer": true, "engines": { "node": ">=12" }, @@ -1674,6 +1678,7 @@ "integrity": "sha512-5gTmgEY/sqK6gFXLIsQNH19lWb4ebPDLA4SdLP7dsWkIXHWlG66oPuVvXSGFPppYZz8ZDZq0dYYrbHfBCVUb1Q==", "dev": true, "license": "MIT", + "peer": true, "engines": { "node": ">=12" }, From 9477572b7bd26908b2ba57a2928e66f6fa66990a Mon Sep 17 00:00:00 2001 From: Carlos Ruiz Date: Sat, 21 Feb 2026 14:15:47 -0800 Subject: [PATCH 5/6] Reverted change --- multiqc/modules/bases2fastq/tests/test_bases2fastq.py | 6 +++--- multiqc/templates/default/package-lock.json | 5 ----- 2 files changed, 3 insertions(+), 8 deletions(-) diff --git a/multiqc/modules/bases2fastq/tests/test_bases2fastq.py b/multiqc/modules/bases2fastq/tests/test_bases2fastq.py index cc498aa8ae..08ad6fe42e 100644 --- a/multiqc/modules/bases2fastq/tests/test_bases2fastq.py +++ b/multiqc/modules/bases2fastq/tests/test_bases2fastq.py @@ -489,10 +489,10 @@ def test_module_run_with_test_data(self, data_dir): config.strict = True m = MultiqcModule() # Test-data has multiple run roots (WGS, WES, PairedEndNoProject, PairedEndDefaultProject, etc.) - assert len(m.run_level_data) >= 2, "expected at least 2 runs from test-data" - # At least one project-level layout (PairedEndDefaultProject* or PairedEndProjects) - assert len(m.project_level_data) >= 1, "expected at least 1 project from test-data" total_samples = len(m.run_level_samples) + len(m.project_level_samples) + assert len(m.run_level_data) >= 2 or len(m.project_level_data) >= 1, ( + "expected at least 2 runs or at least 1 project from test-data" + ) assert total_samples >= 10, "expected at least 10 samples from test-data" # Module must produce output (general stats and/or sections) assert len(report.general_stats_data) > 0 or len(m.sections) > 0, ( diff --git a/multiqc/templates/default/package-lock.json b/multiqc/templates/default/package-lock.json index 38246e4ddf..4041ea6cf3 100644 --- a/multiqc/templates/default/package-lock.json +++ b/multiqc/templates/default/package-lock.json @@ -806,7 +806,6 @@ "integrity": "sha512-P1st0aksCrn9sGZhp8GMYwBnQsbvAWsZAX44oXNNvLHGqAOcoVxmjZiohstwQ7SqKnbR47akdNi+uleWD8+g6A==", "dev": true, "license": "MIT", - "peer": true, "funding": { "type": "opencollective", "url": "https://opencollective.com/popperjs" @@ -1455,7 +1454,6 @@ "integrity": "sha512-xCmtksBKd/jdJ9Bt9p7nPKiuqrlBMBuuGkQlkhZjjQk3Ty48lv93k5Dq6OPkKt4XwxDJ7tvlfrTa1MPA9bf+QA==", "dev": true, "license": "MIT", - "peer": true, "dependencies": { "chokidar": "^4.0.0", "immutable": "^5.0.2", @@ -1508,7 +1506,6 @@ "integrity": "sha512-+6erLbBm0+LROX2sPXlUYx/ux5PyE9K/a92Wrt6oA+WDAoFTdpHE5tCYCI5PNzq2y8df4rA+QgHLJuR4jNymsg==", "dev": true, "license": "BSD-2-Clause", - "peer": true, "dependencies": { "@jridgewell/source-map": "^0.3.3", "acorn": "^8.14.0", @@ -1560,7 +1557,6 @@ "integrity": "sha512-M7BAV6Rlcy5u+m6oPhAPFgJTzAioX/6B0DxyvDlo9l8+T3nLKbrczg2WLUyzd45L8RqfUMyGPzekbMvX2Ldkwg==", "dev": true, "license": "MIT", - "peer": true, "engines": { "node": ">=12" }, @@ -1678,7 +1674,6 @@ "integrity": "sha512-5gTmgEY/sqK6gFXLIsQNH19lWb4ebPDLA4SdLP7dsWkIXHWlG66oPuVvXSGFPppYZz8ZDZq0dYYrbHfBCVUb1Q==", "dev": true, "license": "MIT", - "peer": true, "engines": { "node": ">=12" }, From 549c84360ba56d8f1c3d9b96e7b476d465030692 Mon Sep 17 00:00:00 2001 From: Carlos Ruiz Date: Sat, 21 Feb 2026 15:04:18 -0800 Subject: [PATCH 6/6] Exclude node_modules from shutil copy --- multiqc/core/write_results.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/multiqc/core/write_results.py b/multiqc/core/write_results.py index 6605966a06..d086bbe02c 100644 --- a/multiqc/core/write_results.py +++ b/multiqc/core/write_results.py @@ -516,13 +516,16 @@ def _write_html_report(to_stdout: bool, report_path: Optional[Path], return_html parent_template.template_dir, tmp_dir.get_tmp_dir(), dirs_exist_ok=True, - ignore=shutil.ignore_patterns("*.pyc"), + ignore=shutil.ignore_patterns("*.pyc", "node_modules"), ) # Copy the template files to the tmp directory (`dirs_exist_ok` makes sure # parent template files are overwritten) shutil.copytree( - template_mod.template_dir, tmp_dir.get_tmp_dir(), dirs_exist_ok=True, ignore=shutil.ignore_patterns("*.pyc") + template_mod.template_dir, + tmp_dir.get_tmp_dir(), + dirs_exist_ok=True, + ignore=shutil.ignore_patterns("*.pyc", "node_modules"), ) # Function to include file contents in Jinja template