From 299123952b6b97301ace22059779f6ec2b03de40 Mon Sep 17 00:00:00 2001 From: Carlos Ruiz Date: Sat, 20 Sep 2025 23:50:39 -0700 Subject: [PATCH 01/29] updates to b2f report --- docs/markdown/modules/bases2fastq.md | 4 + multiqc/modules/bases2fastq/bases2fastq.py | 403 +++++++++--------- .../modules/bases2fastq/plot_project_runs.py | 88 ---- multiqc/modules/bases2fastq/plot_runs.py | 208 ++++++++- multiqc/modules/bases2fastq/plot_samples.py | 116 ++++- multiqc/search_patterns.yaml | 4 + 6 files changed, 494 insertions(+), 329 deletions(-) delete mode 100644 multiqc/modules/bases2fastq/plot_project_runs.py diff --git a/docs/markdown/modules/bases2fastq.md b/docs/markdown/modules/bases2fastq.md index 2ce4857320..e32db7b5a8 100644 --- a/docs/markdown/modules/bases2fastq.md +++ b/docs/markdown/modules/bases2fastq.md @@ -31,4 +31,8 @@ bases2fastq/run: contents: SampleStats fn: RunStats.json num_lines: 100 +bases2fastq/manifest: + contents: Settings + fn: RunManifest.json + num_lines: 100 ``` diff --git a/multiqc/modules/bases2fastq/bases2fastq.py b/multiqc/modules/bases2fastq/bases2fastq.py index 132387a226..261a972565 100644 --- a/multiqc/modules/bases2fastq/bases2fastq.py +++ b/multiqc/modules/bases2fastq/bases2fastq.py @@ -1,8 +1,10 @@ +from collections import defaultdict import copy import csv import json import logging import random +from typing import Any, Dict, List import uuid from multiqc.base_module import BaseMultiqcModule, ModuleNoSamplesFound @@ -11,16 +13,18 @@ from multiqc.modules.bases2fastq.plot_runs import ( plot_run_stats, tabulate_run_stats, + tabulate_project_stats, plot_base_quality_hist, plot_base_quality_by_cycle, + plot_lane_cycle_stats, ) -from multiqc.modules.bases2fastq.plot_project_runs import tabulate_project_run_stats from multiqc.modules.bases2fastq.plot_samples import ( tabulate_sample_stats, sequence_content_plot, plot_per_cycle_N_content, plot_adapter_content, plot_per_read_gc_hist, + plot_sample_read_length, ) log = 
logging.getLogger(__name__) @@ -39,252 +43,245 @@ def __init__(self): doi="10.1038/s41587-023-01750-7", ) + # Initialize run, project and sample level structures + self.run_level_data = {} + self.run_level_samples = {} + self.run_level_samples_to_project = {} + self.project_level_data = {} + self.project_level_samples = {} + self.project_level_samples_to_project = {} + num_run_level_samples = 0 + num_project_level_samples = 0 + + # Initialize run and project groups + self.group_dict = dict() + self.group_lookup_dict = dict() + self.project_lookup_dict = dict() + + self.b2f_sample_data = dict() self.b2f_run_data = dict() self.b2f_run_project_data = dict() + self.b2f_run_project_sample_data = dict() self.missing_runs = set() self.sample_id_to_run = dict() - # Group by run name - self.group_dict = dict() - self.group_lookup_dict = dict() - self.project_lookup_dict = dict() + # Define if call is project- or run-level + run_level_log_files = len(list(self.find_log_files("bases2fastq/run"))) + project_level_log_files = len(list(self.find_log_files("bases2fastq/project"))) + + if run_level_log_files == 0 and project_level_log_files == 0: + error_msg = "No run- or project-level log files found within the Bases2Fastq results." 
+ log.error(error_msg) + raise ModuleNoSamplesFound(error_msg) + + # Parse data + if run_level_log_files > 0: + ( + self.run_level_data, self.run_level_samples, self.run_level_samples_to_project + ) = self._parse_run_project_data("bases2fastq/run") + if project_level_log_files > 0: + ( + self.project_level_data, self.project_level_samples, self.project_level_samples_to_project + ) = self._parse_run_project_data("bases2fastq/project") + + # Get run- and project-level samples + for data in self.run_level_samples.values(): + num_run_level_samples += len(data.keys()) + for data in self.project_level_samples.values(): + num_project_level_samples += len(data.keys()) + + # Ensure run/sample data found + if all([ + len(self.run_level_data) == 0, + num_run_level_samples == 0, + len(self.project_level_data) == 0, + num_project_level_samples == 0, + ]): + error_msg = "No run-, project- or sample-level data found" + log.error(error_msg) + raise ModuleNoSamplesFound(error_msg) + + # Log runs, projects and samples found + log.info(f"Found {len(self.run_level_data)} run(s) within the Bases2Fastq results.") + log.info(f"Found {len(self.project_level_data)} project(s) within the Bases2Fastq results.") + log.info(f"Found {num_project_level_samples} sample(s) within the Bases2Fastq results.") + + # Superfluous function call to confirm that it is used in this module + self.add_software_version(None) + + # Warn user if run-level/project-level or sample-level metrics were not found + if len(self.run_level_data) == 0 and len(self.project_level_data) == 0: + log.warning("No run/project stats found!") + if num_project_level_samples == 0: + log.warning("No sample stats found!") + + # Choose path to take, if project use only project-level data, otherwise use run- and project-level + summary_path = "" + if len(self.run_level_data) > 0 and len(self.project_level_data) == 0: + summary_path = "run_level" + elif len(self.run_level_data) == 0 and len(self.project_level_data) > 0: + summary_path = 
"project_level" + elif len(self.run_level_data) > 0 and len(self.project_level_data) > 0: + summary_path = "combined_level" + + # Define data to use + run_data = {} + sample_data = {} + samples_to_projects = {} + if summary_path == "run_level": + run_data = self.run_level_data + sample_data = self.project_level_samples + samples_to_projects = self.run_level_samples_to_project + elif summary_path == "project_level": + run_data = self.project_level_data + sample_data = self.project_level_samples + samples_to_projects = self.project_level_samples_to_project + elif summary_path == "combined_level": + run_data = self.run_level_data + sample_data = self.project_level_samples + samples_to_projects = self.project_level_samples_to_project + else: + error_msg = "No run- or project-level data was retained. No report will be generated." + log.error(error_msg) + return + + # Create run and project groups + run_groups = defaultdict(list) + project_groups = defaultdict(list) + sample_to_run_group = {} + for sample in sample_data.keys(): + (_run_name, _) = sample.split("__") + run_groups[_run_name].append(sample) + sample_to_run_group[sample] = _run_name + sample_project = samples_to_projects[sample] + project_groups[sample_project].append(sample) + merged_groups = dict(run_groups) | dict(project_groups) + + # Assign color for each group + self.color_getter = mqc_colour.mqc_colour_scale() + self.palette = sum( + [ + self.color_getter.get_colours(hue) + for hue in ["Set2", "Pastel1", "Accent", "Set1", "Set3", "Dark2", "Paired", "Pastel2"] + ], + [], + ) + if len(merged_groups) > len(self.palette): + hex_range = 2**24 + extra_colors = [hex(random.randrange(0, hex_range)) for _ in range(len(merged_groups), len(self.palette))] + self.palette = self.palette + extra_colors + self.group_color = {g: c for g, c in zip(merged_groups.keys(), self.palette[: len(merged_groups)])} + self.sample_color = dict() + for s_name in samples_to_projects.keys(): + self.sample_color.update({s_name: 
self.group_color[samples_to_projects[s_name]]}) + self.run_color = copy.deepcopy(self.group_color) # Make sure that run colors and group colors match + self.palette = self.palette[len(merged_groups) :] - # bases2fastq/run - num_runs = 0 - num_samples = 0 - for f in self.find_log_files("bases2fastq/run"): + + # Plot metrics + qc_metrics_function = ( + tabulate_run_stats if summary_path in ["run_level", "combined_level"] else tabulate_project_stats + ) + self.add_run_plots( + data=run_data, + plot_functions=[ + qc_metrics_function, + plot_lane_cycle_stats, + plot_run_stats, + plot_base_quality_hist, + plot_base_quality_by_cycle + ] + ) + self.add_sample_plots( + data=sample_data, group_lookup=samples_to_projects, project_lookup=samples_to_projects + ) + + def get_uuid(self): + return str(uuid.uuid4()).replace("-", "").lower() + + def _parse_run_project_data(self, data_source: str) -> List[Dict[str, Any]]: + runs_global_data = {} + runs_sample_data = {} + sample_to_project = {} + if data_source == "": + return [runs_global_data, runs_sample_data, sample_to_project] + + for f in self.find_log_files(data_source): data = json.loads(f["f"]) + # Copy incoming data and reset samples to include only desired + data_to_return = copy.deepcopy(data) + data_to_return["SampleStats"] = [] + # get run + analysis run_name = data.get("RunName", None) analysis_id = data.get("AnalysisID", None)[0:4] if not run_name or not analysis_id: - log.error("Error with RunStats.json. Either RunName or AnalysisID is absent.") log.error( - "Please visit Elembio online documentation for more information - https://docs.elembio.io/docs/bases2fastq/introduction/" + "Error with RunStats.json. 
Either RunName or AnalysisID is absent.\n" + "Please visit Elembio online documentation for more information - " + "https://docs.elembio.io/docs/bases2fastq/introduction/" ) continue - + run_analysis_name = "-".join([run_name, analysis_id]) run_analysis_name = self.clean_s_name(run_analysis_name, f) + # skip run if in user provider ignore list + if self.is_ignore_sample(run_analysis_name): + log.info( + f"Skipping <{run_analysis_name}> because it is present in ignore list." + ) + continue + + # Check run is present in the final dictionaries + if run_analysis_name not in runs_global_data: + runs_global_data[run_analysis_name] = data_to_return + + project = self.clean_s_name(data.get("Project", "DefaultProject"), f) + # map sample UUIDs to run_analysis_name for sample_data in data["SampleStats"]: sample_id = sample_data["SampleID"] sample_name = sample_data["SampleName"] sample_data["RunName"] = run_name - run_analysis_sample_name = "__".join([run_analysis_name, sample_name]) num_polonies = sample_data["NumPolonies"] - if num_polonies < MIN_POLONIES: + if num_polonies < MIN_POLONIES: log.warning( - f"Skipping {run_analysis_sample_name} because it has <{MIN_POLONIES} assigned reads [n={num_polonies}]." + f"Skipping {run_analysis_sample_name} because it has" + f" <{MIN_POLONIES} assigned reads [n={num_polonies}]." ) continue # skip run if in user provider ignore list - if self.is_ignore_sample(sample_id): - continue - if self.is_ignore_sample(run_analysis_sample_name): + if self.is_ignore_sample(sample_id) or self.is_ignore_sample(run_analysis_sample_name): + log.info( + f"Skipping <{sample_id}> ({run_analysis_sample_name}) because it is present in ignore list." 
+ ) continue - self.sample_id_to_run[sample_id] = run_analysis_name - self.b2f_sample_data[run_analysis_sample_name] = sample_data - num_samples += 1 - - # skip run if in user provider ignore list - if self.is_ignore_sample(run_analysis_name): - continue + # If sample passes all checks add it back + runs_sample_data[run_analysis_sample_name] = sample_data + sample_to_project[run_analysis_sample_name] = project - num_runs += 1 - self.b2f_run_data[run_analysis_name] = data self.add_data_source(f=f, s_name=run_analysis_name, module="bases2fastq") - # Checking if run lengths configurations are the same for all samples. - self.run_r1r2_lens = [] - for s in self.b2f_run_data.keys(): - read_lens = str(len(self.b2f_run_data[s]["Reads"][0]["Cycles"])) - if len(self.b2f_run_data[s]["Reads"]) > 1: - read_lens += "+" + str(len(self.b2f_run_data[s]["Reads"][1]["Cycles"])) - self.run_r1r2_lens.append(read_lens) - - run_r1r2_lens_dict = {} - for nn, rl in enumerate(self.run_r1r2_lens): - if not run_r1r2_lens_dict.get(rl): - run_r1r2_lens_dict[rl] = [] - run_r1r2_lens_dict[rl].append(list(self.b2f_run_data.keys())[nn]) - - # - # bases2fastq/project - # - num_projects = 0 - for f in self.find_log_files("bases2fastq/project"): - data = json.loads(f["f"]) - samples = data["Samples"] - - # get run + analysis - run_name = data.get("RunName", None) - analysis_id = data.get("AnalysisID", None)[0:4] - - run_analysis_name = "-".join([run_name, analysis_id]) - run_analysis_name = self.clean_s_name(run_analysis_name, f) - - if not run_name or not analysis_id: - log.error(f"Error with {f['root']}. 
Either RunName or AnalysisID is absent.") - log.error("Please visit Elembio online documentation for more information -") - continue - - project = self.clean_s_name(data.get("Project", "DefaultProject"), f) - - run_analysis_project_name = "__".join([run_name, project, analysis_id]) - run_analysis_project_name = self.clean_s_name(run_analysis_project_name, f) - - # skip project if in user provider ignore list - if self.is_ignore_sample(run_analysis_project_name): - continue - - for sample_name in samples: - run_analysis_sample_name = self.clean_s_name("__".join([run_analysis_name, sample_name]), f) - self.project_lookup_dict[run_analysis_sample_name] = project - num_projects += 1 - - # remove samples - del data["Samples"] - - self.b2f_run_project_data[run_analysis_project_name] = data - self.add_data_source(f=f, s_name=project, module="bases2fastq") - - # if all RunStats.json too large, none will be found. Guide customer and Exit at this point. - if len(self.sample_id_to_run) != 0: - log.info(f"Found {num_runs} total RunStats.json") - - # ensure run/sample data found - if num_projects == 0 and num_samples == 0: - raise ModuleNoSamplesFound - log.info(f"Found {num_samples} samples and {num_projects} projects within the bases2fastq results") - - # Superfluous function call to confirm that it is used in this module - self.add_software_version(None) - - # process groups / projects - for s_name in self.b2f_sample_data.keys(): - s_group = self.b2f_sample_data[s_name]["RunName"] - - if not self.group_dict.get(s_group): - self.group_dict.update({s_group: []}) + return [runs_global_data, runs_sample_data, sample_to_project] - self.group_dict[s_group].append(s_name) - self.group_lookup_dict.update({s_name: s_group}) - - # Assign project - for s_name in self.b2f_sample_data.keys(): - if self.project_lookup_dict.get(s_name): - s_group = self.project_lookup_dict[s_name] - if not self.group_dict.get(s_group): - self.group_dict.update({s_group: []}) - 
self.group_dict[s_group].append(s_name) - self.group_lookup_dict.update({s_name: s_group}) - - # Assign color for each group - self.color_getter = mqc_colour.mqc_colour_scale() - self.palette = sum( - [ - self.color_getter.get_colours(hue) - for hue in ["Set2", "Pastel1", "Accent", "Set1", "Set3", "Dark2", "Paired", "Pastel2"] - ], - [], - ) - if len(self.group_dict) > len(self.palette): - hex_range = 2**24 - extra_colors = [hex(random.randrange(0, hex_range)) for _ in range(len(self.group_dict), len(self.palette))] - self.palette = self.palette + extra_colors - self.group_color = {g: c for g, c in zip(self.group_dict.keys(), self.palette[: len(self.group_dict)])} - self.sample_color = dict() - for s_name in self.b2f_sample_data.keys(): - self.sample_color.update({s_name: self.group_color[self.group_lookup_dict[s_name]]}) - self.run_color = copy.deepcopy(self.group_color) # Make sure that run colors and group colors match - self.palette = self.palette[len(self.group_dict) :] - - # Read custom group info - self.group_info_exist = False - for f in self.find_log_files("bases2fastq/group"): - if self.group_info_exist: - log.warning( - "More than one group assignment files are found. Please only keep " - "one assignment file in the analysis folder. 
Bases2Fastq stats will " - "not be plotted" - ) - for row in csv.DictReader(f["f"]): - s_group = row["Group"] - s_name = row["Sample Name"] - if self.group_dict.get(s_group) is None: - self.group_dict[s_group] = [] - self.group_dict[s_group].append(s_name) - self.group_lookup_dict[s_name] = s_group - for group in self.group_dict.keys(): - if group not in self.run_color: - if len(self.palette) > 0: - self.group_color[group] = self.palette.pop(0) - else: - hex_range = 2**24 - extra_color = hex(random.randrange(0, hex_range)) - self.group_color[group] = extra_color - self.sample_color = dict() - for s_name in self.b2f_sample_data.keys(): - self.sample_color.update({s_name: self.group_color[self.group_lookup_dict[s_name]]}) - - # sort run - data_keys = list(self.b2f_run_data.keys()) - data_keys.sort() - sorted_data = {s_name: self.b2f_run_data[s_name] for s_name in data_keys} - self.b2f_run_data = sorted_data - # sort projects - data_keys = list(self.b2f_run_project_data.keys()) - data_keys.sort() - sorted_data = {s_name: self.b2f_run_project_data[s_name] for s_name in data_keys} - self.b2f_run_project_data = sorted_data - # sort samples - data_keys = list(self.b2f_sample_data.keys()) - sorted_keys = sorted(data_keys, key=lambda x: (self.group_lookup_dict[x], x)) - sorted_data = {s_name: self.b2f_sample_data[s_name] for s_name in sorted_keys} - self.b2f_sample_data = sorted_data - - if len(self.b2f_run_data) == 0: - log.warning("No run stats file found!") - if len(self.b2f_sample_data) == 0: - log.warning("No sample stats file found!") - - # Add sections - self.add_run_plots() - if num_projects > 0: - self.add_project_run_plots() - self.add_sample_plots() - - def get_uuid(self): - return str(uuid.uuid4()).replace("-", "").lower() - - def add_run_plots(self): - plot_functions = [tabulate_run_stats, plot_run_stats, plot_base_quality_hist, plot_base_quality_by_cycle] + def add_run_plots(self, data, plot_functions): for func in plot_functions: - plot_html, plot_name, 
anchor, description, helptext, plot_data = func(self.b2f_run_data, self.run_color) + plot_html, plot_name, anchor, description, helptext, plot_data = func(data, self.run_color) self.add_section(name=plot_name, plot=plot_html, anchor=anchor, description=description, helptext=helptext) self.write_data_file(plot_data, f"base2fastq:{plot_name}") - def add_project_run_plots(self): - plot_functions = [tabulate_project_run_stats] - for func in plot_functions: - plot_html, plot_name, anchor, description, helptext, plot_data = func( - self.b2f_run_project_data, self.run_color - ) - self.add_section(name=plot_name, plot=plot_html, anchor=anchor, description=description, helptext=helptext) - self.write_data_file(plot_data, f"base2fastq_projects:{plot_name}") - - def add_sample_plots(self): + def add_sample_plots(self, data, group_lookup, project_lookup): plot_functions = [ tabulate_sample_stats, + plot_sample_read_length, sequence_content_plot, plot_per_cycle_N_content, plot_adapter_content, @@ -292,7 +289,7 @@ def add_sample_plots(self): ] for func in plot_functions: plot_html, plot_name, anchor, description, helptext, plot_data = func( - self.b2f_sample_data, self.group_lookup_dict, self.project_lookup_dict, self.sample_color + data, group_lookup, project_lookup, self.sample_color ) self.add_section(name=plot_name, plot=plot_html, anchor=anchor, description=description, helptext=helptext) self.write_data_file(plot_data, f"base2fastq:{plot_name}") diff --git a/multiqc/modules/bases2fastq/plot_project_runs.py b/multiqc/modules/bases2fastq/plot_project_runs.py deleted file mode 100644 index 6a3663535d..0000000000 --- a/multiqc/modules/bases2fastq/plot_project_runs.py +++ /dev/null @@ -1,88 +0,0 @@ -from multiqc.plots import table -from multiqc import config - -""" -Functions for plotting per run information of bases2fastq -""" - - -def tabulate_project_run_stats(run_data, color_dict): - """ - Tabulate general information and statistics of each run - """ - plot_content = dict() 
- for s_name in run_data.keys(): - run_stats = dict() - run_stats.update({"num_polonies_run": int(run_data[s_name]["NumPolonies"])}) - run_stats.update({"yield_run": run_data[s_name]["AssignedYield"]}) - run_stats.update({"mean_base_quality_run": run_data[s_name]["QualityScoreMean"]}) - run_stats.update({"percent_q30_run": run_data[s_name]["PercentQ30"]}) - run_stats.update({"percent_q40_run": run_data[s_name]["PercentQ40"]}) - plot_content.update({s_name: run_stats}) - - headers = {} - headers["num_polonies_run"] = { - "title": f"# Polonies ({config.base_count_prefix})", - "description": f"The total number of polonies that are calculated for the run ({config.base_count_desc})", - "min": 0, - "scale": "RdYlGn", - "shared_key": "base_count", - } - headers["percent_assigned_run"] = { - "title": "% Assigned Reads", - "description": "The percentage of reads assigned to sample(s)", - "max": 100, - "min": 0, - "scale": "BuPu", - "suffix": "%", - } - headers["yield_run"] = { - "title": "Assigned Yield (Gb)", - "description": "The run yield based on assigned reads in gigabases", - "scale": "Blues", - } - headers["mean_base_quality_run"] = { - "title": "Quality Score Mean", - "description": "Average base quality across Read 1 and Read 2", - "min": 0, - "scale": "Spectral", - } - headers["percent_q30_run"] = { - "title": "Percent Q30", - "description": "The percentage of ≥ Q30 Q scores for the project. This includes assigned and unassigned reads and excludes filtered reads and no calls.", - "max": 100, - "min": 0, - "scale": "RdYlGn", - "suffix": "%", - } - headers["percent_q40_run"] = { - "title": "Percent Q40", - "description": "The percentage of ≥ Q40 Q scores for the project. 
This includes assigned and unassigned reads and excludes filtered reads and no calls.", - "max": 100, - "min": 0, - "scale": "RdYlGn", - "suffix": "%", - } - - pconfig = { - "title": "bases2fastq: General Sequencing (Project) QC metrics", - "col1_header": "Run Name", - "id": "project_run_metrics_table", - "ylab": "QC", - } - - plot_name = "(Project) Sequencing QC metrics table" - plot_html = table.plot(plot_content, headers, pconfig=pconfig) - anchor = "project_run_qc_metrics_table" - description = "QC metrics per run, per project" - helptext = """ - This section displays metrics that indicate the quality of each sequencing run: \n - - Run Name: Unique identifier composed of (RunName)__(UUID), where (RunName) maps to the AVITI run name and (UUID) maps to the unique Bases2Fastq analysis result.\n - - Number of Polonies: The total number of polonies that are calculated for the run.\n - - Percentage Assigned Reads: The percentage of reads that are assigned to a sample.\n - - Assigned Yield (Gb): The run yield that is based on assigned reads in gigabases.\n - - Quality Score Mean: The mean Q score of base calls for the samples. This excludes filtered reads and no calls.\n - - Percent Q30: The percentage of ≥ Q30 Q scores for the run. This includes assigned and unassigned reads and excludes filtered reads and no calls.\n - - Percent Q40: The percentage of ≥ Q40 Q scores for the run. 
This includes assigned and unassigned reads and excludes filtered reads and no calls.\n - """ - return plot_html, plot_name, anchor, description, helptext, plot_content diff --git a/multiqc/modules/bases2fastq/plot_runs.py b/multiqc/modules/bases2fastq/plot_runs.py index accd532678..a7c8ad8d4a 100644 --- a/multiqc/modules/bases2fastq/plot_runs.py +++ b/multiqc/modules/bases2fastq/plot_runs.py @@ -19,8 +19,8 @@ def plot_run_stats(run_data, color_dict): yields = dict() for run in run_names: # Index Assignment Polonies and Yields ### - # percent_assigned = run_data[run].get("PercentAssignedReads",100.0) - percent_assigned = run_data[run]["PercentAssignedReads"] + percent_assigned = run_data[run].get("PercentAssignedReads", 100.0) + # percent_assigned = run_data[run]["PercentAssignedReads"] percent_perfect_assigned = ( 100.00 - run_data[run]["PercentMismatch"] @@ -42,7 +42,7 @@ def plot_run_stats(run_data, color_dict): num_polonies[run] = num_polonies_run total_yield_run = {} - total_yield = run_data[run].get("TotalYield", 300.0) + total_yield = run_data[run].get("TotalYield", run_data[run].get("AssignedYield", 300.0)) total_yield_run["Perfect Index"] = total_yield * percent_perfect_total * 0.01 total_yield_run["Mismatched Index"] = total_yield * percent_imperfect_total * 0.01 total_yield_run["Unassigned"] = ( @@ -54,7 +54,7 @@ def plot_run_stats(run_data, color_dict): pconfig = { "data_labels": [ {"name": "Number of Polonies", "ylab": "Number of Polonies", "format": "{d}"}, - {"name": "Yield (Gb)", "ylab": "Gb"}, + {"name": "Yield (Gb)", "ylab": "Yield"}, ], "cpswitch": True, "stacking": "normal", @@ -69,15 +69,135 @@ def plot_run_stats(run_data, color_dict): "Unassigned": {"name": "Unassigned Index", "color": "#434348"}, } ] * 2 - plot_name = "Sequencing Run Yield" plot_html = bargraph.plot(plot_content, cats, pconfig=pconfig) anchor = "run_yield_plot" description = "Bar plots of sequencing run yields. 
Please see individual run reports for details" helptext = """ This section shows and compare the yield and index assignment rate of each sequencing run.\n\n - - Number of Polonies: The total number of polonies that are calculated for the run.\n - - Yield: The total yield of all assigned reads in gigabases. + - Number of Polonies: The total number of polonies that are calculated for the run.\n + - Yield: The total yield of all assigned reads in gigabases. + """ + return plot_html, plot_name, anchor, description, helptext, plot_content + + +def _calculate_reads_eliminated(run_data) -> int: + """ + Calculate the total number of reads eliminated during trimming. + + This function iterates over the lanes in the given run data and sums the + difference between the number of polonies before trimming and after trimming. + If required fields are missing, they are skipped. + + Args: + run_data (dict): Dictionary containing sequencing run data with lane information. + + Returns: + int: The total number of reads eliminated across all lanes. 
+ """ + reads_eliminated = 0 + if "Lanes" not in run_data: + return reads_eliminated + for lane in run_data["Lanes"]: + if "NumPolonies" not in lane or "NumPoloniesBeforeTrimming" not in lane: + continue + reads_eliminated += lane["NumPoloniesBeforeTrimming"] - lane["NumPolonies"] + + return reads_eliminated + + +def tabulate_project_stats(run_data, color_dict): + """ + Tabulate general information and statistics of each run + """ + plot_content = dict() + for s_name in run_data.keys(): + project = run_data[s_name]["Project"] + run_project_name = f"{s_name} | {project}" + run_stats = dict() + run_stats.update({"num_polonies_run": int(run_data[s_name]["NumPolonies"])}) + run_stats.update({"yield_run": run_data[s_name]["AssignedYield"]}) + run_stats.update({"mean_base_quality_run": run_data[s_name]["QualityScoreMean"]}) + run_stats.update({"percent_q30_run": run_data[s_name]["PercentQ30"]}) + run_stats.update({"percent_q40_run": run_data[s_name]["PercentQ40"]}) + run_stats.update({"reads_eliminated": _calculate_reads_eliminated(run_data[s_name])}) + plot_content.update({run_project_name: run_stats}) + + headers = {} + headers["num_polonies_run"] = { + "title": "# Polonies", + "description": "The total number of polonies that are calculated for the run.", + "min": 0, + "scale": "RdYlGn", + } + headers["percent_assigned_run"] = { + "title": "% Assigned Reads", + "description": "The percentage of reads assigned to sample(s)", + "max": 100, + "min": 0, + "scale": "BuPu", + "suffix": "%", + } + headers["yield_run"] = { + "title": "Assigned Yield (Gb)", + "description": "The run yield based on assigned reads in gigabases", + "scale": "Blues", + } + headers["mean_base_quality_run"] = { + "title": "Quality Score Mean", + "description": "Average base quality across Read 1 and Read 2", + "min": 0, + "scale": "Spectral", + } + headers["percent_q30_run"] = { + "title": "Percent Q30", + "description": "The percentage of ≥ Q30 Q scores for the project. 
This includes assigned and unassigned reads and excludes filtered reads and no calls.", + "max": 100, + "min": 0, + "scale": "RdYlGn", + "suffix": "%", + } + headers["percent_q40_run"] = { + "title": "Percent Q40", + "description": "The percentage of ≥ Q40 Q scores for the project. This includes assigned and unassigned reads and excludes filtered reads and no calls.", + "max": 100, + "min": 0, + "scale": "RdYlGn", + "suffix": "%", + } + headers["reads_eliminated"] = { + "title": "Reads Eliminated", + "description": "Number of reads eliminated.", + } + + pconfig = { + "title": "bases2fastq: General Sequencing (Project) QC metrics", + "col1_header": "Run Name", + "id": "project_run_metrics_table", + "ylab": "QC", + } + + project_header = "" + run_keys = list(run_data.keys()) + if len(run_keys) > 1: + project_header = "(Project) " + elif len(run_keys) == 1: + first_key = run_keys[0] + project_header = f'{run_data[first_key]["Project"]} | ' + plot_name = f"{project_header}Sequencing QC Metrics Table" + plot_html = table.plot(plot_content, headers, pconfig=pconfig) + anchor = "project_run_qc_metrics_table" + description = "QC metrics per run, per project" + helptext = """ + This section displays metrics that indicate the quality of each sequencing run: \n + - Run Name: Unique identifier composed of (RunName)__(UUID), where (RunName) maps to the AVITI run name and (UUID) maps to the unique Bases2Fastq analysis result.\n + - Number of Polonies: The total number of polonies that are calculated for the run.\n + - Percentage Assigned Reads: The percentage of reads that are assigned to a sample.\n + - Assigned Yield (Gb): The run yield that is based on assigned reads in gigabases.\n + - Quality Score Mean: The mean Q score of base calls for the samples. This excludes filtered reads and no calls.\n + - Percent Q30: The percentage of ≥ Q30 Q scores for the run. 
This includes assigned and unassigned reads and excludes filtered reads and no calls.\n + - Percent Q40: The percentage of ≥ Q40 Q scores for the run. This includes assigned and unassigned reads and excludes filtered reads and no calls.\n + - Reads Eliminated: Number of reads eliminated across lanes.\n """ return plot_html, plot_name, anchor, description, helptext, plot_content @@ -91,19 +211,20 @@ def tabulate_run_stats(run_data, color_dict): run_stats = dict() run_stats.update({"num_polonies_run": int(run_data[s_name]["NumPolonies"])}) run_stats.update({"percent_assigned_run": run_data[s_name].get("PercentAssignedReads", 100.0)}) + run_stats.update({"percent_unexpected_index_pairs": run_data[s_name].get("PercentUnexpectedIndexPairs", 0.0)}) run_stats.update({"yield_run": run_data[s_name]["AssignedYield"]}) run_stats.update({"mean_base_quality_run": run_data[s_name]["QualityScoreMean"]}) run_stats.update({"percent_q30_run": run_data[s_name]["PercentQ30"]}) run_stats.update({"percent_q40_run": run_data[s_name]["PercentQ40"]}) + run_stats.update({"reads_eliminated": _calculate_reads_eliminated(run_data[s_name])}) plot_content.update({s_name: run_stats}) headers = {} headers["num_polonies_run"] = { - "title": f"# Polonies ({config.base_count_prefix})", - "description": f"The total number of polonies that are calculated for the run. 
({config.base_count_desc})", + "title": "# Polonies", + "description": "The total number of polonies that are calculated for the run.)", "min": 0, "scale": "RdYlGn", - "shared_key": "base_count", } headers["percent_assigned_run"] = { "title": "% Assigned Reads", @@ -113,6 +234,14 @@ def tabulate_run_stats(run_data, color_dict): "scale": "BuPu", "suffix": "%", } + headers["percent_unexpected_index_pairs"] = { + "title": "% Unexpected Index Pairs", + "description": "The percentage of unexpected index pairs", + "max": 100, + "min": 0, + "scale": "BuPu", + "suffix": "%", + } headers["yield_run"] = { "title": "Yield (Gb)", "description": "The run yield based on assigned reads in gigabases", @@ -140,9 +269,13 @@ def tabulate_run_stats(run_data, color_dict): "scale": "RdYlGn", "suffix": "%", } + headers["reads_eliminated"] = { + "title": "Reads Eliminated", + "description": "Number of reads eliminated.", + } pconfig = { - "title": "bases2fastq: General Sequencing Run QC metrics", + "title": "Bases2Fastq: General Sequencing Run QC metrics", "col1_header": "Run Name", "id": "run_metrics_table", "ylab": "QC", @@ -161,6 +294,49 @@ def tabulate_run_stats(run_data, color_dict): - Quality Score Mean: The mean Q score of base calls for the samples. This excludes filtered reads and no calls.\n - Percent Q30: The percentage of ≥ Q30 Q scores for the run. This includes assigned and unassigned reads and excludes filtered reads and no calls.\n - Percent Q40: The percentage of ≥ Q40 Q scores for the run. 
This includes assigned and unassigned reads and excludes filtered reads and no calls.\n + - Reads Eliminated: Number of reads eliminated across lanes.\n + """ + return plot_html, plot_name, anchor, description, helptext, plot_content + + +def plot_lane_cycle_stats(run_data, color_dict): + """ + Plot number of cycles per read and lane + """ + plot_content = dict() + for s_name in run_data.keys(): + if "Lanes" not in run_data[s_name]: + continue + for lane in run_data[s_name]["Lanes"]: + if "Lane" not in lane or "Reads" not in lane: + continue + lane_stats = dict() + lane_name = f'L{lane["Lane"]}' + run_name = f"{s_name} | {lane_name}" + lane_stats[run_name] = {} + for read in lane["Reads"]: + if "Cycles" not in read or "Read" not in read: + continue + read_name = read["Read"] + num_cycles = len(read["Cycles"]) + lane_stats[run_name][read_name] = num_cycles + plot_content.update(lane_stats) + + pconfig = { + "title": "Bases2Fastq: Cycles Per Read Per Lane", + "id": "project_cycles_per_read_per_lane", + "ylab": "Read Cycles", + "cpswitch": False, + "subtitle": None, + } + + plot_name = "Cycles Per Read Per Lane" + plot_html = bargraph.plot(plot_content, pconfig=pconfig) + anchor = "cycles_per_read_per_lane" + description = "Number of sequencing cycles per read in each lane." + helptext = """ + Shows the number of cycles used for each read in every flowcell lane. + Useful for confirming that read lengths match the expected sequencing setup across all lanes. 
""" return plot_html, plot_name, anchor, description, helptext, plot_content @@ -206,15 +382,15 @@ def plot_base_quality_hist(run_data, color_dict): "description": "Histogram of bases quality", "ymin": 0, "ylabel": "Percentage of base quality", - "xlabel": "base quality", + "xlab": "Q Score", "colors": color_dict, }, { - "name": "Qualiter Per Read", + "name": "Quality Per Read", "description": "Histogram of average read base quality", "ymin": 0, "ylabel": "Percentage of read quality", - "xlabel": "base quality", + "xlab": "Q Score", "colors": color_dict, }, ], @@ -332,7 +508,7 @@ def plot_base_quality_by_cycle(run_data, color_dict): pconfig = { "data_labels": [ {"name": "Median Quality", "xlab": "cycle", "ylab": "Quality"}, - {"name": "Mean Quality", "ylab": "Quality"}, + {"name": "Mean Quality", "xlab": "cycle", "ylab": "Quality"}, {"name": "%Q30", "xlab": "cycle", "ylab": "Percentage", "ymax": 100}, {"name": "%Q40", "xlab": "cycle", "ylab": "Percentage", "ymax": 100}, {"name": "%Base Calls Below PF", "xlab": "cycle", "ylab": "Percentage", "ymax": 100}, @@ -347,7 +523,7 @@ def plot_base_quality_by_cycle(run_data, color_dict): plot_html = linegraph.plot(plot_content, pconfig=pconfig) plot_name = "Quality Metrics By Cycle" anchor = "per_cycle_quality" - description = "Per run base qualities by cycle" + description = "Per run base qualities by cycle. Read 1 and Read 2 are separated by a red dashed line." 
helptext = """ This section plots the base qualities by each instrument cycle.\n Choose between Median Quality, Mean Quality, Percent Q30 or Percentage Q40 per cycle.\n diff --git a/multiqc/modules/bases2fastq/plot_samples.py b/multiqc/modules/bases2fastq/plot_samples.py index 4cbfd71cf8..71c1ce8624 100644 --- a/multiqc/modules/bases2fastq/plot_samples.py +++ b/multiqc/modules/bases2fastq/plot_samples.py @@ -1,4 +1,4 @@ -from multiqc.plots import linegraph, table +from multiqc.plots import bargraph, linegraph, table from multiqc import config """ @@ -6,6 +6,28 @@ """ +def _calculate_sample_reads_eliminated(run_data) -> int: + """ + Calculate the total number of reads eliminated during trimming. + + This function iterates over the lanes in the given run data and sums the + difference between the number of polonies before trimming and after trimming. + If required fields are missing, they are skipped. + + Args: + run_data (dict): Dictionary containing sequencing run data with lane information. + + Returns: + int: The total number of reads eliminated across all lanes. 
+ """ + reads_eliminated = 0 + if "NumPolonies" not in run_data or "NumPoloniesBeforeTrimming" not in run_data: + return reads_eliminated + reads_eliminated += run_data["NumPoloniesBeforeTrimming"] - run_data["NumPolonies"] + + return reads_eliminated + + def tabulate_sample_stats(sample_data, group_lookup_dict, project_lookup_dict, sample_color): """ Tabulate general information and statistics per sample @@ -20,6 +42,8 @@ def tabulate_sample_stats(sample_data, group_lookup_dict, project_lookup_dict, s general_stats.update({"mean_base_quality_sample": sample_data[s_name]["QualityScoreMean"]}) general_stats.update({"percent_q30_sample": sample_data[s_name]["PercentQ30"]}) general_stats.update({"percent_q40_sample": sample_data[s_name]["PercentQ40"]}) + general_stats.update({"reads_eliminated": _calculate_sample_reads_eliminated(sample_data[s_name])}) + general_stats.update({"percent_mismatch": sample_data[s_name]["PercentMismatch"]}) plot_content.update({s_name: general_stats}) headers = {} @@ -37,11 +61,10 @@ def tabulate_sample_stats(sample_data, group_lookup_dict, project_lookup_dict, s "scale": False, } headers["num_polonies_sample"] = { - "title": f"# Polonies ({config.base_count_prefix})", - "description": f"The total number of polonies that are calculated for the run. 
({config.base_count_desc})", + "title": "# Polonies", + "description": "The total number of polonies that are calculated for the run", "min": 0, "scale": "Blues", - "shared_key": "base_count", } headers["yield_sample"] = { "title": "Yield (Gb)", @@ -70,24 +93,73 @@ def tabulate_sample_stats(sample_data, group_lookup_dict, project_lookup_dict, s "scale": "RdYlGn", "suffix": "%", } + headers["reads_eliminated"] = { + "title": "Reads Eliminated", + "description": "Number of reads eliminated.", + } + headers["percent_mismatch"] = { + "title": "Percent Mismatch", + "description": "Percent mismatch", + "max": 100, + "min": 0, + "scale": "RdYlGn", + "suffix": "%", + } - pconfig = {"id": "sample_qc_metric_table", "title": "Sample QC Metrics Table", "no_violin": True} + pconfig = {"id": "sample_qc_metric_table", "title": "Sample QC Metrics Table", "no_violin": False} plot_name = "Sample QC Metrics Table" plot_html = table.plot(plot_content, headers, pconfig=pconfig) anchor = "sample_qc_metrics_table" description = "QC metrics per unique sample" helptext = """ - This section displays metrics that indicate the quality of each sample: \n - - Sample Name: Unique identifier composed of (RunName)__(UUID)__(SampleName), where (RunName) maps to the AVITI run name, (UUID) maps to the unique Bases2Fastq analysis result, and (SampleName) maps to the sample name as specified in the RunManifest.csv. - - Group: Run/Sample group label that assigns colors in the plot. To customize group tags:\n - - 1) Set the project name when running Bases2Fastq. 
In this case the group tags will be project name.\n - - 2) Generate a csv file with the suffix "_b2fgroup.csv", containing the columns "Sample Name" and "Group".\n - - Number of Polonies: The total number of polonies that are assigned to the sample.\n - - Assigned Yield (Gb): The sample yield that is based on assigned reads in gigabases.\n - - Quality Score Mean: The average Q score of base calls for the sample.\n - - Percent Q30: The percentage of ≥ Q30 Q scores for the sample. This includes assigned reads and excludes filtered reads and no calls.\n - - Percent Q40: The percentage of ≥ Q40 Q scores for the sample. This includes assigned reads and excludes filtered reads and no calls\n + This section displays metrics that indicate the quality of each sample: \n + - Sample Name: Unique identifier composed of (RunName)__(UUID)__(SampleName), where (RunName) maps to the AVITI run name, (UUID) maps to the unique Bases2Fastq analysis result, and (SampleName) maps to the sample name as specified in the RunManifest.csv. + - Group: Run/Sample group label that assigns colors in the plot. To customize group tags:\n + - 1) Set the project name when running Bases2Fastq. In this case the group tags will be project name.\n + - 2) Generate a csv file with the suffix "_b2fgroup.csv", containing the columns "Sample Name" and "Group".\n + - Number of Polonies: The total number of polonies that are assigned to the sample.\n + - Assigned Yield (Gb): The sample yield that is based on assigned reads in gigabases.\n + - Quality Score Mean: The average Q score of base calls for the sample.\n + - Percent Q30: The percentage of ≥ Q30 Q scores for the sample. This includes assigned reads and excludes filtered reads and no calls.\n + - Percent Q40: The percentage of ≥ Q40 Q scores for the sample. 
This includes assigned reads and excludes filtered reads and no calls.\n + - Reads Eliminated: Number of reads eliminated across lanes.\n + - Percent Mismatch: Percent Mismatch.\n + """ + return plot_html, plot_name, anchor, description, helptext, plot_content + + +def plot_sample_read_length(sample_data, group_lookup_dict, project_lookup_dict, color_dict): + """ + Plot number of cycles per read and lane + """ + plot_content = dict() + for s_name, data in sample_data.items(): + read_lengths = {s_name: {}} + if "Reads" not in data: + continue + for read in data["Reads"]: + read_name = read["Read"] + mean_length = read["MeanReadLength"] + read_lengths[s_name][read_name] = mean_length + plot_content.update(read_lengths) + + pconfig = { + "title": "Bases2Fastq: Mean Read Length per Sample", + "id": "mean_read_length_per_sample", + "ylab": "Bases", + "cpswitch": False, + "subtitle": None, + "stacking": "group", + } + + plot_name = "Mean Read Length per Sample" + plot_html = bargraph.plot(plot_content, pconfig=pconfig) + anchor = "mean_read_length_per_sample" + description = "Average read length per read for all samples." + helptext = """ + Shows the number of cycles used for each read in every flowcell lane. + Useful for confirming that read lengths match the expected sequencing setup across all lanes. 
""" return plot_html, plot_name, anchor, description, helptext, plot_content @@ -108,6 +180,7 @@ def sequence_content_plot(sample_data, group_lookup_dict, project_lookup_dict, c r1r2_split = max(r1r2_split, len(R1)) for s_name in sorted(sample_data.keys()): + paired_end = True if len(sample_data[s_name]["Reads"]) > 1 else False R1 = sample_data[s_name]["Reads"][0]["Cycles"] for cycle in range(len(R1)): base_no = cycle + 1 @@ -135,8 +208,8 @@ def sequence_content_plot(sample_data, group_lookup_dict, project_lookup_dict, c plot_content = data pconfig = { - "xlab": "cycle", - "ylab": "Percentage", + "xlab": "Cycle", + "ylab": "Percentage of Total Reads", "x_lines": [{"color": "#FF0000", "width": 2, "value": r1r2_split, "dashStyle": "dash"}], "colors": color_dict, "ymin": 0, @@ -147,8 +220,7 @@ def sequence_content_plot(sample_data, group_lookup_dict, project_lookup_dict, c plot_name = "Per Cycle Base Content" anchor = "base_content" description = """ - Percentage of unidentified bases ("N" bases) by each sequencing cycle. - Read 1 and Read 2 are separated by a red dashed line + Base composition per sample per cycle. Read 1 and Read 2 are separated by a red dashed line. """ helptext = """ If a sequencer is unable to make a base call with sufficient confidence then it will @@ -212,7 +284,7 @@ def plot_per_cycle_N_content(sample_data, group_lookup_dict, project_lookup_dict "title": "bases2fastq: Per Cycle N Content Percentage", } plot_html = linegraph.plot(plot_content, pconfig=pconfig) - plot_name = "Per Cycle N Content" + plot_name = "Per Cycle N Content." anchor = "n_content" description = """ Percentage of unidentified bases ("N" bases) by each sequencing cycle. 
@@ -253,7 +325,7 @@ def plot_per_read_gc_hist(sample_data, group_lookup_dict, project_lookup_dict, s pconfig = { "xlab": "% GC", - "ylab": "Percentage", + "ylab": "Percentage of reads that are GC", "colors": sample_color, "id": "gc_hist", "title": "bases2fastq: Per Sample GC Content Histogram", @@ -323,7 +395,7 @@ def plot_adapter_content(sample_data, group_lookup_dict, project_lookup_dict, sa pconfig.update({"colors": sample_color}) plot_html = linegraph.plot(plot_content, pconfig=pconfig) anchor = "adapter_content" - description = "Adapter content per cycle" + description = "Adapter content per cycle. Read 1 and Read 2 are separated by a red dashed line." helptext = """ The plot shows a cumulative percentage count of the proportion of your library which has seen each of the adapter sequences at each cycle. diff --git a/multiqc/search_patterns.yaml b/multiqc/search_patterns.yaml index a6513a31ea..22e1625ef7 100644 --- a/multiqc/search_patterns.yaml +++ b/multiqc/search_patterns.yaml @@ -43,6 +43,10 @@ bases2fastq/project: fn: "*_RunStats.json" contents: "SampleStats" num_lines: 100 +bases2fastq/manifest: + fn: "RunManifest.json" + contents: "Settings" + num_lines: 100 bbduk: contents: "Executing jgi.BBDuk" num_lines: 2 From 520772ca80d1f1109f65c4375b2fa7f036dd298f Mon Sep 17 00:00:00 2001 From: Carlos Ruiz Date: Sun, 28 Sep 2025 21:01:21 -0700 Subject: [PATCH 02/29] Added additional assignment metrics --- multiqc/modules/bases2fastq/bases2fastq.py | 319 +++++++++++++++++++- multiqc/modules/bases2fastq/plot_runs.py | 186 +++++++++++- multiqc/modules/bases2fastq/plot_samples.py | 49 ++- 3 files changed, 546 insertions(+), 8 deletions(-) diff --git a/multiqc/modules/bases2fastq/bases2fastq.py b/multiqc/modules/bases2fastq/bases2fastq.py index 261a972565..3b3deba312 100644 --- a/multiqc/modules/bases2fastq/bases2fastq.py +++ b/multiqc/modules/bases2fastq/bases2fastq.py @@ -1,17 +1,21 @@ from collections import defaultdict import copy -import csv +import re import json 
import logging import random from typing import Any, Dict, List import uuid +from pathlib import Path from multiqc.base_module import BaseMultiqcModule, ModuleNoSamplesFound from multiqc.utils import mqc_colour from multiqc.modules.bases2fastq.plot_runs import ( plot_run_stats, + tabulate_manifest_stats, + tabulate_index_assignment_stats, + tabulate_unassigned_index_stats, tabulate_run_stats, tabulate_project_stats, plot_base_quality_hist, @@ -20,6 +24,7 @@ ) from multiqc.modules.bases2fastq.plot_samples import ( tabulate_sample_stats, + plot_sample_assignment_histogram, sequence_content_plot, plot_per_cycle_N_content, plot_adapter_content, @@ -124,15 +129,21 @@ def __init__(self): summary_path = "project_level" elif len(self.run_level_data) > 0 and len(self.project_level_data) > 0: summary_path = "combined_level" - + # Define data to use run_data = {} sample_data = {} samples_to_projects = {} + manifest_data = {} + index_assigment_data = {} + unassigned_sequences = {} if summary_path == "run_level": run_data = self.run_level_data sample_data = self.project_level_samples samples_to_projects = self.run_level_samples_to_project + manifest_data = self._parse_run_manifest("bases2fastq/manifest") + index_assigment_data = self._parse_index_assignment("bases2fastq/manifest") + unassigned_sequences = self._parse_run_unassigned_sequences("bases2fastq/run") elif summary_path == "project_level": run_data = self.project_level_data sample_data = self.project_level_samples @@ -141,6 +152,9 @@ def __init__(self): run_data = self.run_level_data sample_data = self.project_level_samples samples_to_projects = self.project_level_samples_to_project + manifest_data = self._parse_run_manifest("bases2fastq/manifest") + index_assigment_data = self._parse_index_assignment("bases2fastq/manifest") + unassigned_sequences = self._parse_run_unassigned_sequences("bases2fastq/run") else: error_msg = "No run- or project-level data was retained. No report will be generated." 
log.error(error_msg) @@ -149,6 +163,7 @@ def __init__(self): # Create run and project groups run_groups = defaultdict(list) project_groups = defaultdict(list) + in_project_sample_groups = defaultdict(list) sample_to_run_group = {} for sample in sample_data.keys(): (_run_name, _) = sample.split("__") @@ -156,7 +171,9 @@ def __init__(self): sample_to_run_group[sample] = _run_name sample_project = samples_to_projects[sample] project_groups[sample_project].append(sample) - merged_groups = dict(run_groups) | dict(project_groups) + if summary_path == "project_level": + in_project_sample_groups[sample].append(sample) + merged_groups = dict(run_groups) | dict(project_groups) | dict(in_project_sample_groups) # Assign color for each group self.color_getter = mqc_colour.mqc_colour_scale() @@ -174,7 +191,11 @@ def __init__(self): self.group_color = {g: c for g, c in zip(merged_groups.keys(), self.palette[: len(merged_groups)])} self.sample_color = dict() for s_name in samples_to_projects.keys(): - self.sample_color.update({s_name: self.group_color[samples_to_projects[s_name]]}) + s_color = ( + self.group_color[s_name] if summary_path == "project_level" else + self.group_color[samples_to_projects[s_name]] + ) + self.sample_color.update({s_name: s_color}) self.run_color = copy.deepcopy(self.group_color) # Make sure that run colors and group colors match self.palette = self.palette[len(merged_groups) :] @@ -183,16 +204,38 @@ def __init__(self): qc_metrics_function = ( tabulate_run_stats if summary_path in ["run_level", "combined_level"] else tabulate_project_stats ) + self.add_run_plots(data=run_data, plot_functions=[qc_metrics_function]) + + if summary_path in ["run_level", "combined_level"]: + self.add_run_plots( + data=manifest_data, + plot_functions=[ + tabulate_manifest_stats, + ] + ) + self.add_run_plots( + data=index_assigment_data, + plot_functions=[ + tabulate_index_assignment_stats, + ] + ) + self.add_run_plots( + data=unassigned_sequences, + plot_functions=[ + 
tabulate_unassigned_index_stats, + ] + ) + self.add_run_plots( data=run_data, plot_functions=[ - qc_metrics_function, plot_lane_cycle_stats, plot_run_stats, plot_base_quality_hist, plot_base_quality_by_cycle ] ) + self.add_sample_plots( data=sample_data, group_lookup=samples_to_projects, project_lookup=samples_to_projects ) @@ -271,6 +314,271 @@ def _parse_run_project_data(self, data_source: str) -> List[Dict[str, Any]]: self.add_data_source(f=f, s_name=run_analysis_name, module="bases2fastq") return [runs_global_data, runs_sample_data, sample_to_project] + + + def _parse_run_manifest(self, data_source: str) -> Dict[str, Any]: + runs_manifest_data = {} + + if data_source == "": + return runs_manifest_data + + for f in self.find_log_files(data_source): + directory = f.get("root") + if not directory: + continue + + # Get RunName and RunID from RunStats.json + run_stats_path = Path(directory) / "RunStats.json" + if not run_stats_path.exists(): + log.error( + f"RunStats.json does not exist in the Bases2Fastq output directory {directory}.\n" + "Please visit Elembio online documentation for more information - " + "https://docs.elembio.io/docs/bases2fastq/introduction/" + ) + continue + + run_analysis_name = None + with open(run_stats_path) as _infile: + run_stats = json.load(_infile) + run_name = run_stats.get("RunName", None) + analysis_id = run_stats.get("AnalysisID", None) + if run_name and analysis_id: + run_analysis_name = "-".join([run_name, analysis_id[0:4]]) + else: + log.error( + "Error with RunStats.json. Either RunName or AnalysisID is absent.\n" + "Please visit Elembio online documentation for more information - " + "https://docs.elembio.io/docs/bases2fastq/introduction/" + ) + continue + + run_manifest = json.loads(f["f"]) + if "Settings" not in run_manifest: + log.warning( + f" section not found in {directory}/RunManifest.json.\n" + f"Skipping RunManifest metrics." 
+ ) + else: + for lane_data in run_manifest["Settings"]: + lane_id = lane_data.get("Lane") + if not lane_id: + log.error(" not found in Settings section of RunManifest. Skipping lanes.") + continue + lane_name = f"L{lane_id}" + run_lane = f"{run_analysis_name} | {lane_name}" + runs_manifest_data[run_lane] = {} + + indices = [] + indices_cycles = [] + mask_pattern = re.compile(r"^I\d+Mask$") + matching_keys = [key for key in lane_data.keys() if mask_pattern.match(key)] + for key in matching_keys: + for mask_info in lane_data[key]: + if mask_info["Read"] not in indices: + indices.append(mask_info["Read"]) + indices_cycles.append(str(len(mask_info["Cycles"]))) + indexing = f'{" + ".join(indices_cycles)}
{" + ".join(indices)}' + runs_manifest_data[run_lane]["Indexing"] = indexing + + runs_manifest_data[run_lane]["AdapterTrimType"] = lane_data.get("AdapterTrimType", "N/A") + runs_manifest_data[run_lane]["R1AdapterMinimumTrimmedLength"] = lane_data.get( + "R1AdapterMinimumTrimmedLength", "N/A" + ) + runs_manifest_data[run_lane]["R2AdapterMinimumTrimmedLength"] = lane_data.get( + "R2AdapterMinimumTrimmedLength", "N/A" + ) + + self.add_data_source(f=f, s_name=run_analysis_name, module="bases2fastq") + + return runs_manifest_data + + def _parse_run_unassigned_sequences(self, data_source: str) -> Dict[str, Any]: + run_unassigned_sequences = {} + if data_source == "": + return run_unassigned_sequences + + for f in self.find_log_files(data_source): + data = json.loads(f["f"]) + + # Get RunName and AnalysisID + run_name = data.get("RunName", None) + analysis_id = data.get("AnalysisID", None)[0:4] + if not run_name or not analysis_id: + log.error( + "Error with RunStats.json. Either RunName or AnalysisID is absent.\n" + "Please visit Elembio online documentation for more information - " + "https://docs.elembio.io/docs/bases2fastq/introduction/" + ) + continue + run_analysis_name = "-".join([run_name, analysis_id]) + run_analysis_name = self.clean_s_name(run_analysis_name, f) + + # skip run if in user provider ignore list + if self.is_ignore_sample(run_analysis_name): + log.info( + f"Skipping <{run_analysis_name}> because it is present in ignore list." + ) + continue + + # Get total polonies and build unassigned indices dictionary + total_polonies = data.get("NumPoloniesBeforeTrimming", 0) + if "Lanes" not in data: + log.error( + f"Missing lane information in RunStats.json for run {run_analysis_name}." + f"Skipping building unassigned indices table." 
+ ) + continue + index_number = 1 + for lane in data["Lanes"]: + lane_id = lane.get("Lane") + if lane_id: + lane_id = f"L{lane_id}" + for sequence in lane.get("UnassignedSequences", []): + run_unassigned_sequences[index_number] = { + "Run Name": run_analysis_name, + "Lane": lane_id, + "I1": sequence["I1"], + "I2": sequence["I2"], + "Polonies": sequence["Count"], + "% Polonies": float("nan"), + } + if total_polonies > 0: + run_unassigned_sequences[index_number]["% Polonies"] = round( + sequence["Count"] / total_polonies, 2 + ) + index_number += 1 + + return run_unassigned_sequences + + def _parse_index_assignment(self, manifest_data_source: str) -> Dict[str, Any]: + sample_to_index_assignment = {} + + if manifest_data_source == "": + return sample_to_index_assignment + + for f in self.find_log_files(manifest_data_source): + directory = f.get("root") + if not directory: + continue + + # Get RunName and RunID from RunParameters.json + run_stats_path = Path(directory) / "RunStats.json" + if not run_stats_path.exists(): + log.error( + f"RunStats.json does not exist in the Bases2Fastq output directory {directory}.\n" + "Please visit Elembio online documentation for more information - " + "https://docs.elembio.io/docs/bases2fastq/introduction/" + ) + continue + + run_analysis_name = None + total_polonies = 0 + with open(run_stats_path) as _infile: + run_stats = json.load(_infile) + + # Get run name information + run_name = run_stats.get("RunName", None) + analysis_id = run_stats.get("AnalysisID", None) + if run_name and analysis_id: + run_analysis_name = "-".join([run_name, analysis_id[0:4]]) + else: + log.error( + "Error with RunStats.json. 
Either RunName or AnalysisID is absent.\n" + "Please visit Elembio online documentation for more information - " + "https://docs.elembio.io/docs/bases2fastq/introduction/" + ) + log.debug(f"Error in RunStats.json: {run_stats_path}") + log.debug(f"Missing: RunName: {run_name} or AnalysisID: {analysis_id}") + continue + + # skip run if in user provider ignore list + if self.is_ignore_sample(run_analysis_name): + log.info( + f"Skipping <{run_analysis_name}> because it is present in ignore list." + ) + continue + + # Ensure sample stats are present + if "SampleStats" not in run_stats: + log.error( + "Error, missing SampleStats in RunStats.json. Skipping index assignment metrics.\n" + "Please visit Elembio online documentation for more information - " + "https://docs.elembio.io/docs/bases2fastq/introduction/" + ) + log.debug(f"Missing SampleStats in RunStats.json. Available keys: {list(run_stats.keys())}.") + continue + + # Extract per sample polony counts and overall total counts + total_polonies = run_stats.get("NumPoloniesBeforeTrimming", 0) + for sample_data in run_stats["SampleStats"]: + sample_name = sample_data.get("SampleName") + sample_id = None + if run_analysis_name and sample_name: + sample_id = "__".join([run_analysis_name, sample_name]) + + if "Occurrences" not in sample_data: + log.error(f"Missing data needed to extract index assignment for sample {sample_id}. Skipping.") + continue + + for occurrence in sample_data["Occurrences"]: + sample_expected_seq = occurrence.get("ExpectedSequence") + sample_counts = occurrence.get("NumPoloniesBeforeTrimming") + if any([element is None for element in [sample_expected_seq, sample_counts, sample_id]]): + log.error( + f"Missing data needed to extract index assignment for sample {sample_id}. Skipping." 
+ ) + continue + if sample_expected_seq not in sample_to_index_assignment: + sample_to_index_assignment[sample_expected_seq] = { + "SampleID": sample_id, + "SamplePolonyCounts": 0, + "PercentOfPolonies": float("nan"), + "Index1": "", + "Index2": "", + } + sample_to_index_assignment[sample_expected_seq]["SamplePolonyCounts"] += sample_counts + + for index_assigment in sample_to_index_assignment.values(): + if total_polonies > 0: + index_assigment["PercentOfPolonies"] = round( + index_assigment["SamplePolonyCounts"] / total_polonies * 100, 2 + ) + + run_manifest = json.loads(f["f"]) + if "Samples" not in run_manifest: + log.warning( + f" section not found in {directory}/RunManifest.json.\n" + f"Skipping RunManifest sample index assignment metrics." + ) + elif len(sample_to_index_assignment) == 0: + log.warning( + "Index assignment data missing. Skipping creation of index assignment metrics." + ) + else: + for sample_data in run_manifest["Samples"]: + sample_name = sample_data.get("SampleName") + sample_id = None + if run_analysis_name is None or sample_name is None or "Indexes" not in sample_data: + continue + sample_id = "__".join([run_analysis_name, sample_name]) + for index_data in sample_data["Indexes"]: + index_1 = index_data.get("Index1", "") + index_2 = index_data.get("Index2", "") + merged_indices = f"{index_1}{index_2}" + if merged_indices not in sample_to_index_assignment: + log.error(f"Index assignment information not found for sample {sample_id}. Skipping.") + continue + if sample_id != sample_to_index_assignment[merged_indices]["SampleID"]: + log.error( + f"RunManifest SampleID <{sample_id}> does not match " + f"RunStats SampleID {sample_to_index_assignment[merged_indices]["SampleID"]}." + "Skipping." 
+ ) + continue + sample_to_index_assignment[merged_indices]["Index1"] = index_1 + sample_to_index_assignment[merged_indices]["Index2"] = index_2 + + return sample_to_index_assignment def add_run_plots(self, data, plot_functions): for func in plot_functions: @@ -281,6 +589,7 @@ def add_run_plots(self, data, plot_functions): def add_sample_plots(self, data, group_lookup, project_lookup): plot_functions = [ tabulate_sample_stats, + plot_sample_assignment_histogram, plot_sample_read_length, sequence_content_plot, plot_per_cycle_N_content, diff --git a/multiqc/modules/bases2fastq/plot_runs.py b/multiqc/modules/bases2fastq/plot_runs.py index a7c8ad8d4a..18d58dee30 100644 --- a/multiqc/modules/bases2fastq/plot_runs.py +++ b/multiqc/modules/bases2fastq/plot_runs.py @@ -2,7 +2,7 @@ from multiqc.plots import bargraph, linegraph, table from multiqc import config - +from natsort import natsorted """ Functions for plotting per run information of bases2fastq @@ -299,6 +299,190 @@ def tabulate_run_stats(run_data, color_dict): return plot_html, plot_name, anchor, description, helptext, plot_content +def tabulate_manifest_stats(run_data, color_dict): + """ + Tabulate general information and statistics of each run + """ + plot_content = dict() + for s_name in run_data.keys(): + run_stats = dict() + run_stats.update({"indexing": run_data[s_name]["Indexing"]}) + run_stats.update({"adapter_trim_type": run_data[s_name]["AdapterTrimType"]}) + run_stats.update({"min_read_length_r1": run_data[s_name]["R1AdapterMinimumTrimmedLength"]}) + run_stats.update({"min_read_length_r2": run_data[s_name]["R2AdapterMinimumTrimmedLength"]}) + plot_content.update({s_name: run_stats}) + + headers = {} + headers["indexing"] = { + "title": "Indexing", + "description": "Indexing scheme.", + "scale": "RdYlGn", + } + headers["adapter_trim_type"] = { + "title": "Adapter Trim Type", + "description": "Adapter trimming method.", + } + headers["min_read_length_r1"] = { + "title": "Minimum Read Length R1", + 
"description": "Minimum read length for read R1.", + "scale": "RdYlGn", + } + headers["min_read_length_r2"] = { + "title": "Minimum Read Length R2", + "description": "Minimum read length for read R1 (if applicable).", + "scale": "RdYlGn", + } + + pconfig = { + "title": "Bases2Fastq: Run Manifest Metrics", + "col1_header": "Run Name | Lane", + "id": "run_manifest_metrics", + } + + plot_name = "Run Manifest Table" + plot_html = table.plot(plot_content, headers, pconfig=pconfig) + anchor = "run_manifest_metrics_table" + description = "Run parameters used." + helptext = """ + This section displays metrics that indicate the parameters used in the run: \n + - Run Name | Lane: Unique identifier composed of (RunName)__(UUID) | (Lane), where (RunName) maps to the AVITI run name and (UUID) maps to the unique Bases2Fastq analysis result.\n + - Indexing: Describes the indexing scheme.\n + - Adapter Trim Type: Adapter trimming method.\n + - Minimum Read Length R1/R2: Minumum read length after adapter trimming.\n + """ + return plot_html, plot_name, anchor, description, helptext, plot_content + + +def tabulate_index_assignment_stats(run_data, color_dict): + """ + Tabulate general information and statistics of each run + """ + plot_content = dict() + sorted_run_data = natsorted(run_data.items(), key=lambda x: x[1]["SampleID"]) + for index, sample_data in enumerate(sorted_run_data, start=1): + sample_data = sample_data[1] + sample_index_stats = dict() + sample_index_stats.update({"sample_name": sample_data["SampleID"]}) + sample_index_stats.update({"index_1": sample_data["Index1"]}) + sample_index_stats.update({"index_2": sample_data["Index2"]}) + sample_index_stats.update({"polonies": sample_data["SamplePolonyCounts"]}) + sample_index_stats.update({"polony_percentage": sample_data["PercentOfPolonies"]}) + plot_content.update({index: sample_index_stats}) + + headers = {} + headers["sample_name"] = { + "title": "Sample Name", + "description": "Sample Name (RunID + Sample ID).", + } 
+ headers["index_1"] = { + "title": "Index 1", + "description": "Sample Index 1 (I1).", + } + headers["index_2"] = { + "title": "Index 2", + "description": "Sample Index 2 (I2).", + } + headers["polonies"] = { + "title": "Polonies", + "description": "Number of polonies assigned to sample.", + "scale": "RdYlGn", + } + headers["polony_percentage"] = { + "title": "Polony %", + "description": "Percentage of total polonies assigned to this index combination.", + "max": 100, + "min": 0, + "scale": "RdYlGn", + "suffix": "%", + } + + pconfig = { + "title": "Bases2Fastq: Index Assignment Metrics", + "col1_header": "Sample #", + "id": "index_assignment_metrics", + } + + plot_name = "Index Assignment Metrics" + plot_html = table.plot(plot_content, headers, pconfig=pconfig) + anchor = "index_assignment_metrics" + description = "Index assignment metrics." + helptext = """ + This section displays index assignment metrics including: \n + - Sample Name: Sample identifier combining RunID and SampleID.\n + - Index 1: Sample I1.\n + - Index 2: Sample I2.\n + - Polonies: Number of polonies assigned each sample.\n + - Polony %: Percentage of total run's polonies assigned to each sample.\n + """ + return plot_html, plot_name, anchor, description, helptext, plot_content + + +def tabulate_unassigned_index_stats(run_data, color_dict): + """ + Tabulate unassigned index metrics. 
+ + run_data: Dictionary with unassigned index data including: + - RunName + - Lane + - I1 + - I2 + - Polonies + - % Polonies + """ + + headers = {} + headers["Run Name"] = { + "title": "Run Name", + "description": "Run Name (Run ID + Analysis ID).", + } + headers["Lane"] = { + "title": "Lane", + "description": "Index Lane.", + } + headers["I1"] = { + "title": "I1", + "description": "Index 1.", + } + headers["I2"] = { + "title": "I2", + "description": "Index 2.", + } + headers["Polonies"] = { + "title": "Polonies", + "description": "Number of polonies assigned to indices.", + "scale": "GnYlRd", + } + headers["% Polonies"] = { + "title": "% Polonies", + "description": "Percentage of total polonies assigned to this index combination.", + "max": 100, + "min": 0, + "scale": "GnYlRd", + "suffix": "%", + } + + pconfig = { + "title": "Bases2Fastq: Unassiged Indices Metrics", + "col1_header": "Index #", + "id": "index_unassignment_metrics", + } + + plot_name = "Unassiged Indices Metrics" + plot_html = table.plot(run_data, headers, pconfig=pconfig) + anchor = "index_unassignment_metrics" + description = "Index unassignment metrics." + helptext = """ + This section displays index assignment metrics including: \n + - Run Name: Run identifier. 
Built from Run ID and Analysis ID.\n
+    - Lane: Lane number.\n
+    - Index 1: Sample I1.\n
+    - Index 2: Sample I2.\n
+    - Polonies: Number of polonies assigned to each index combination.\n
+    - Polony %: Percentage of total run's polonies assigned to each index combination.\n
+    """
+    return plot_html, plot_name, anchor, description, helptext, run_data
+
+
+def plot_lane_cycle_stats(run_data, color_dict):
+    """
+    Plot number of cycles per read and lane
diff --git a/multiqc/modules/bases2fastq/plot_samples.py b/multiqc/modules/bases2fastq/plot_samples.py
index 71c1ce8624..b9f02a538e 100644
--- a/multiqc/modules/bases2fastq/plot_samples.py
+++ b/multiqc/modules/bases2fastq/plot_samples.py
@@ -1,6 +1,8 @@
 from multiqc.plots import bargraph, linegraph, table
 from multiqc import config
 
+import numpy as np
+
 """
 Functions for plotting per sample information of bases2fastq
 """
@@ -129,6 +131,49 @@ def tabulate_sample_stats(sample_data, group_lookup_dict, project_lookup_dict, s
     return plot_html, plot_name, anchor, description, helptext, plot_content
 
 
+def plot_sample_assignment_histogram(sample_data, group_lookup_dict, project_lookup_dict, color_dict):
+    """
+    Plot number of cycles per read and lane
+    """
+    plot_content = dict()
+    polony_assignments = []
+    for s_name in sample_data.keys():
+        polonies = sample_data[s_name].get("NumPolonies")
+        if polonies:
+            polony_assignments.append(polonies)
+
+    bins = 100
+    for bins in [100, 50, 20, 10]:
+        if len(polony_assignments) > bins:
+            break
+
+    hist, bin_edges = np.histogram(polony_assignments, bins=bins)
+    bin_ranges = [f"({bin_edges[i]}, {bin_edges[i+1]})" for i in range(len(bin_edges)-1)]
+
+    for range_data, frequency in zip(bin_ranges, hist):
+        plot_content[range_data] = {}
+        plot_content[range_data]["Assigned Polonies"] = float(frequency)
+
+    pconfig = {
+        "title": "Bases2Fastq: Sample Polony Assignment Histogram",
+        "id": "sample_assignment_hist",
+        "ylab": "Number of Samples",
+        "xlab": "Range Assigned Polonies",
+        "cpswitch": False,
+
"subtitle": None, + } + + plot_name = "Sample Polony Assignment Histogram" + plot_html = bargraph.plot(plot_content, pconfig=pconfig) + anchor = "sample_assignment_hist" + description = "Average read length per read for all samples." + helptext = """ + Shows the number of cycles used for each read in every flowcell lane. + Useful for confirming that read lengths match the expected sequencing setup across all lanes. + """ + return plot_html, plot_name, anchor, description, helptext, plot_content + + def plot_sample_read_length(sample_data, group_lookup_dict, project_lookup_dict, color_dict): """ Plot number of cycles per read and lane @@ -324,8 +369,8 @@ def plot_per_read_gc_hist(sample_data, group_lookup_dict, project_lookup_dict, s plot_content = gc_hist_dict pconfig = { - "xlab": "% GC", - "ylab": "Percentage of reads that are GC", + "xlab": "GC Content (%)", + "ylab": "Percentage of reads that have GC (%)", "colors": sample_color, "id": "gc_hist", "title": "bases2fastq: Per Sample GC Content Histogram", From 8ea72b39b19c38660efbaa594d3d9051c7a3eb83 Mon Sep 17 00:00:00 2001 From: Carlos Ruiz Date: Mon, 29 Sep 2025 09:14:18 -0700 Subject: [PATCH 03/29] Fixed typo --- multiqc/modules/bases2fastq/plot_runs.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/multiqc/modules/bases2fastq/plot_runs.py b/multiqc/modules/bases2fastq/plot_runs.py index 18d58dee30..e6db7f69a3 100644 --- a/multiqc/modules/bases2fastq/plot_runs.py +++ b/multiqc/modules/bases2fastq/plot_runs.py @@ -462,12 +462,12 @@ def tabulate_unassigned_index_stats(run_data, color_dict): } pconfig = { - "title": "Bases2Fastq: Unassiged Indices Metrics", + "title": "Bases2Fastq: Unassigned Indices Metrics", "col1_header": "Index #", "id": "index_unassignment_metrics", } - plot_name = "Unassiged Indices Metrics" + plot_name = "Unassigned Indices Metrics" plot_html = table.plot(run_data, headers, pconfig=pconfig) anchor = "index_unassignment_metrics" description = "Index unassignment 
metrics." From 2a3e3db4e23bce831f0f8a439c9fcb91b8936539 Mon Sep 17 00:00:00 2001 From: Carlos Ruiz Date: Wed, 1 Oct 2025 00:13:05 -0700 Subject: [PATCH 04/29] Fixed --no-project B2F failure. --- multiqc/modules/bases2fastq/bases2fastq.py | 35 ++-- multiqc/modules/bases2fastq/plot_samples.py | 184 ++++++++++++++------ 2 files changed, 145 insertions(+), 74 deletions(-) diff --git a/multiqc/modules/bases2fastq/bases2fastq.py b/multiqc/modules/bases2fastq/bases2fastq.py index 3b3deba312..0ba862f60c 100644 --- a/multiqc/modules/bases2fastq/bases2fastq.py +++ b/multiqc/modules/bases2fastq/bases2fastq.py @@ -91,10 +91,8 @@ def __init__(self): ) = self._parse_run_project_data("bases2fastq/project") # Get run- and project-level samples - for data in self.run_level_samples.values(): - num_run_level_samples += len(data.keys()) - for data in self.project_level_samples.values(): - num_project_level_samples += len(data.keys()) + num_run_level_samples = len(self.run_level_samples) + num_project_level_samples = len(self.project_level_samples) # Ensure run/sample data found if all([ @@ -107,10 +105,22 @@ def __init__(self): log.error(error_msg) raise ModuleNoSamplesFound(error_msg) + # Choose path to take, if project use only project-level data, otherwise use run- and project-level + summary_path = "" + if len(self.run_level_data) > 0 and len(self.project_level_data) == 0: + summary_path = "run_level" + if len(self.run_level_data) == 0 and len(self.project_level_data) > 0: + summary_path = "project_level" + elif len(self.run_level_data) > 0 and len(self.project_level_data) > 0: + summary_path = "combined_level" + # Log runs, projects and samples found log.info(f"Found {len(self.run_level_data)} run(s) within the Bases2Fastq results.") log.info(f"Found {len(self.project_level_data)} project(s) within the Bases2Fastq results.") - log.info(f"Found {num_project_level_samples} sample(s) within the Bases2Fastq results.") + if summary_path == "run_level": + log.info(f"Found 
{num_run_level_samples} sample(s) within the Bases2Fastq results.") + else: + log.info(f"Found {num_project_level_samples} sample(s) within the Bases2Fastq results.") # Superfluous function call to confirm that it is used in this module self.add_software_version(None) @@ -118,17 +128,8 @@ def __init__(self): # Warn user if run-level/project-level or sample-level metrics were not found if len(self.run_level_data) == 0 and len(self.project_level_data) == 0: log.warning("No run/project stats found!") - if num_project_level_samples == 0: + if num_run_level_samples == 0 and num_project_level_samples == 0: log.warning("No sample stats found!") - - # Choose path to take, if project use only project-level data, otherwise use run- and project-level - summary_path = "" - if len(self.run_level_data) > 0 and len(self.project_level_data) == 0: - summary_path = "run_level" - if len(self.run_level_data) == 0 and len(self.project_level_data) > 0: - summary_path = "project_level" - elif len(self.run_level_data) > 0 and len(self.project_level_data) > 0: - summary_path = "combined_level" # Define data to use run_data = {} @@ -139,7 +140,7 @@ def __init__(self): unassigned_sequences = {} if summary_path == "run_level": run_data = self.run_level_data - sample_data = self.project_level_samples + sample_data = self.run_level_samples samples_to_projects = self.run_level_samples_to_project manifest_data = self._parse_run_manifest("bases2fastq/manifest") index_assigment_data = self._parse_index_assignment("bases2fastq/manifest") @@ -293,7 +294,7 @@ def _parse_run_project_data(self, data_source: str) -> List[Dict[str, Any]]: run_analysis_sample_name = "__".join([run_analysis_name, sample_name]) num_polonies = sample_data["NumPolonies"] - if num_polonies < 1000: + if num_polonies < MIN_POLONIES: log.warning( f"Skipping {run_analysis_sample_name} because it has" f" <{MIN_POLONIES} assigned reads [n={num_polonies}]." 
diff --git a/multiqc/modules/bases2fastq/plot_samples.py b/multiqc/modules/bases2fastq/plot_samples.py index b9f02a538e..a555a88630 100644 --- a/multiqc/modules/bases2fastq/plot_samples.py +++ b/multiqc/modules/bases2fastq/plot_samples.py @@ -133,79 +133,149 @@ def tabulate_sample_stats(sample_data, group_lookup_dict, project_lookup_dict, s def plot_sample_assignment_histogram(sample_data, group_lookup_dict, project_lookup_dict, color_dict): """ - Plot number of cycles per read and lane + Plots a histogram of number of assigned polonies in all samples for each run. """ - plot_content = dict() - polony_assignments = [] - for s_name in sample_data.keys(): - polonies = sample_data[s_name].get("NumPolonies") - if polonies: - polony_assignments.append(polonies) + plot_content = [] + polony_assignments = {} + for s_name, data in sample_data.items(): + if "NumPolonies" not in data: + continue + run_name, _ = s_name.split("__") + if run_name not in polony_assignments: + polony_assignments[run_name] = [] + polonies = data["NumPolonies"] + polony_assignments[run_name].append(polonies) + + pconfig = {"data_labels": []} + for run_name, assignment_data in polony_assignments.items(): + run_data = {} + bins = 50 + for bins in [50, 20, 10]: + if len(assignment_data) > bins: + break + hist, bin_edges = np.histogram(assignment_data, bins=bins) + bin_ranges = [f"({round(bin_edges[i], 2)}-{round(bin_edges[i+1], 2)})" for i in range(len(bin_edges)-1)] + points = [float(point) for point in hist] + run_data["Polonies Assigned"] = {bin_range: point for bin_range, point in zip(bin_ranges, points)} + plot_content.append(run_data) + + pconfig["data_labels"].append({ + "name": run_name, + "xlab": "Assigned Polonies (Range)", + "ylab": "Number of Samples with N Polonies Assigned", + }) - bins = 100 - for bins in [100, 50, 20, 10]: - if len(polony_assignments) > bins: - break - - hist, bin_edges = np.histogram(polony_assignments, bins=bins) - bin_ranges = [f"({bin_edges[i]}, {bin_edges[i+1]})" 
for i in range(len(bin_edges)-1)] - - for range_data, frequency in zip(bin_ranges, hist): - plot_content[range_data] = {} - plot_content[range_data]["Assigned Polonies"] = float(frequency) - - pconfig = { - "title": "Bases2Fastq: Sample Polony Assignment Histogram", - "id": "sample_assignment_hist", - "ylab": "Number of Samples", - "xlab": "Range Assigned Polonies", - "cpswitch": False, - "subtitle": None, - } + pconfig = pconfig | { + "id": "sample_assignment_hist", + "title": "bases2fastq: Sample Polony Assignment Histogram", + "style": 'lines+markers', + "xlab": "Assigned Polonies (Range)", + "ylab": "Number of Samples with N Polonies Assigned", + "categories": True, + } plot_name = "Sample Polony Assignment Histogram" - plot_html = bargraph.plot(plot_content, pconfig=pconfig) + plot_html = linegraph.plot(plot_content, pconfig=pconfig) anchor = "sample_assignment_hist" - description = "Average read length per read for all samples." + description = "Histogram showing the distribution of samples according to the number of polonies assigned to them." helptext = """ - Shows the number of cycles used for each read in every flowcell lane. - Useful for confirming that read lengths match the expected sequencing setup across all lanes. + Shows bins of assigned polony counts on the X-axis and the number of samples whose number of polonies fall + within each bin on the Y-axis. """ + return plot_html, plot_name, anchor, description, helptext, plot_content def plot_sample_read_length(sample_data, group_lookup_dict, project_lookup_dict, color_dict): """ - Plot number of cycles per read and lane + Plots the average read length for each sample if less than 50 samples in total, or the distribution per run + as a lineplot based on histogram bins. 
""" + total_samples = len(sample_data.keys()) plot_content = dict() - for s_name, data in sample_data.items(): - read_lengths = {s_name: {}} - if "Reads" not in data: - continue - for read in data["Reads"]: - read_name = read["Read"] - mean_length = read["MeanReadLength"] - read_lengths[s_name][read_name] = mean_length - plot_content.update(read_lengths) - - pconfig = { - "title": "Bases2Fastq: Mean Read Length per Sample", - "id": "mean_read_length_per_sample", - "ylab": "Bases", - "cpswitch": False, - "subtitle": None, - "stacking": "group", - } - + pconfig = {} + plot_html = None plot_name = "Mean Read Length per Sample" - plot_html = bargraph.plot(plot_content, pconfig=pconfig) anchor = "mean_read_length_per_sample" - description = "Average read length per read for all samples." - helptext = """ - Shows the number of cycles used for each read in every flowcell lane. - Useful for confirming that read lengths match the expected sequencing setup across all lanes. - """ + description = "" + helptext = "" + + if total_samples <= 50: + for s_name, data in sample_data.items(): + read_lengths = {s_name: {}} + if "Reads" not in data: + continue + for read in data["Reads"]: + read_name = read["Read"] + mean_length = read["MeanReadLength"] + read_lengths[s_name][read_name] = mean_length + plot_content.update(read_lengths) + + pconfig = { + "title": "Bases2Fastq: Mean Read Length per Sample", + "id": "mean_read_length_per_sample", + "ylab": "Bases", + "cpswitch": False, + "subtitle": None, + "stacking": "group", + } + plot_html = bargraph.plot(plot_content, pconfig=pconfig) + description = "Average read length per read for all samples." + helptext = """ + Shows the average read length for each read in each sample. 
+ """ + + elif total_samples > 50: + plot_content = [] + read_lengths = {} + for s_name, data in sample_data.items(): + if "Reads" not in data: + continue + run_name, _ = s_name.split("__") + if run_name not in read_lengths: + read_lengths[run_name] = {} + for read in data["Reads"]: + read_id = read["Read"] + if read_id not in read_lengths[run_name]: + read_lengths[run_name][read_id] = [] + read_lengths[run_name][read_id].append(read["MeanReadLength"]) + + pconfig = {"data_labels": []} + for run_name, read_data in read_lengths.items(): + run_data = {} + for read_name, read_lengths in read_data.items(): + bins = 50 + for bins in [50, 20, 10]: + if len(read_lengths) > bins: + break + hist, bin_edges = np.histogram(read_lengths, bins=bins) + bin_ranges = [f"({round(bin_edges[i], 2)}-{round(bin_edges[i+1], 2)})" for i in range(len(bin_edges)-1)] + points = [float(point) for point in hist] + run_data[read_name] = {bin_range: point for bin_range, point in zip(bin_ranges, points)} + plot_content.append(run_data) + + pconfig["data_labels"].append({ + "name": run_name, + "xlab": "Average Read Length (Range)", + "ylab": "Samples with Average Read Length", + }) + + pconfig = pconfig | { + "id": "mean_read_length_per_sample", + "title": "bases2fastq: Mean Read Length Per Sample", + "style": 'lines+markers', + "xlab": "Average Read Length (Range)", + "ylab": "Samples with Average Read Length", + "categories": True, + } + + plot_html = linegraph.plot(plot_content, pconfig=pconfig) + description = "Distribution of average read lengths for all samples." + helptext = """ + Shows the distribution of samples whose average read lengths fall in a given range. + Reads are shown as different lines. 
+ """ + return plot_html, plot_name, anchor, description, helptext, plot_content From 783cbd2dece66cfc6ff502c2ff7895625cbd5f06 Mon Sep 17 00:00:00 2001 From: Carlos Ruiz Date: Wed, 1 Oct 2025 00:27:41 -0700 Subject: [PATCH 05/29] Fixed f-string bug --- multiqc/modules/bases2fastq/bases2fastq.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/multiqc/modules/bases2fastq/bases2fastq.py b/multiqc/modules/bases2fastq/bases2fastq.py index 0ba862f60c..e823589a92 100644 --- a/multiqc/modules/bases2fastq/bases2fastq.py +++ b/multiqc/modules/bases2fastq/bases2fastq.py @@ -572,7 +572,7 @@ def _parse_index_assignment(self, manifest_data_source: str) -> Dict[str, Any]: if sample_id != sample_to_index_assignment[merged_indices]["SampleID"]: log.error( f"RunManifest SampleID <{sample_id}> does not match " - f"RunStats SampleID {sample_to_index_assignment[merged_indices]["SampleID"]}." + f"RunStats SampleID {sample_to_index_assignment[merged_indices]['SampleID']}." "Skipping." ) continue From 9204e3157241da3bb4b6d26ee0aa89ca735217e2 Mon Sep 17 00:00:00 2001 From: Carlos Ruiz Date: Wed, 1 Oct 2025 00:44:41 -0700 Subject: [PATCH 06/29] Fixed formatting --- multiqc/modules/bases2fastq/bases2fastq.py | 92 +++++++++------------ multiqc/modules/bases2fastq/plot_runs.py | 6 +- multiqc/modules/bases2fastq/plot_samples.py | 48 ++++++----- 3 files changed, 68 insertions(+), 78 deletions(-) diff --git a/multiqc/modules/bases2fastq/bases2fastq.py b/multiqc/modules/bases2fastq/bases2fastq.py index e823589a92..6538f2633e 100644 --- a/multiqc/modules/bases2fastq/bases2fastq.py +++ b/multiqc/modules/bases2fastq/bases2fastq.py @@ -63,7 +63,6 @@ def __init__(self): self.group_lookup_dict = dict() self.project_lookup_dict = dict() - self.b2f_sample_data = dict() self.b2f_run_data = dict() self.b2f_run_project_data = dict() @@ -74,37 +73,39 @@ def __init__(self): # Define if call is project- or run-level run_level_log_files = len(list(self.find_log_files("bases2fastq/run"))) 
project_level_log_files = len(list(self.find_log_files("bases2fastq/project"))) - + if run_level_log_files == 0 and project_level_log_files == 0: error_msg = "No run- or project-level log files found within the Bases2Fastq results." log.error(error_msg) raise ModuleNoSamplesFound(error_msg) - + # Parse data if run_level_log_files > 0: - ( - self.run_level_data, self.run_level_samples, self.run_level_samples_to_project - ) = self._parse_run_project_data("bases2fastq/run") + (self.run_level_data, self.run_level_samples, self.run_level_samples_to_project) = ( + self._parse_run_project_data("bases2fastq/run") + ) if project_level_log_files > 0: - ( - self.project_level_data, self.project_level_samples, self.project_level_samples_to_project - ) = self._parse_run_project_data("bases2fastq/project") + (self.project_level_data, self.project_level_samples, self.project_level_samples_to_project) = ( + self._parse_run_project_data("bases2fastq/project") + ) # Get run- and project-level samples num_run_level_samples = len(self.run_level_samples) num_project_level_samples = len(self.project_level_samples) # Ensure run/sample data found - if all([ - len(self.run_level_data) == 0, - num_run_level_samples == 0, - len(self.project_level_data), - num_project_level_samples == 0, - ]): + if all( + [ + len(self.run_level_data) == 0, + num_run_level_samples == 0, + len(self.project_level_data), + num_project_level_samples == 0, + ] + ): error_msg = "No run-, project- or sample-level data found" log.error(error_msg) raise ModuleNoSamplesFound(error_msg) - + # Choose path to take, if project use only project-level data, otherwise use run- and project-level summary_path = "" if len(self.run_level_data) > 0 and len(self.project_level_data) == 0: @@ -113,7 +114,7 @@ def __init__(self): summary_path = "project_level" elif len(self.run_level_data) > 0 and len(self.project_level_data) > 0: summary_path = "combined_level" - + # Log runs, projects and samples found log.info(f"Found 
{len(self.run_level_data)} run(s) within the Bases2Fastq results.") log.info(f"Found {len(self.project_level_data)} project(s) within the Bases2Fastq results.") @@ -193,14 +194,14 @@ def __init__(self): self.sample_color = dict() for s_name in samples_to_projects.keys(): s_color = ( - self.group_color[s_name] if summary_path == "project_level" else - self.group_color[samples_to_projects[s_name]] + self.group_color[s_name] + if summary_path == "project_level" + else self.group_color[samples_to_projects[s_name]] ) self.sample_color.update({s_name: s_color}) self.run_color = copy.deepcopy(self.group_color) # Make sure that run colors and group colors match self.palette = self.palette[len(merged_groups) :] - # Plot metrics qc_metrics_function = ( tabulate_run_stats if summary_path in ["run_level", "combined_level"] else tabulate_project_stats @@ -212,34 +213,27 @@ def __init__(self): data=manifest_data, plot_functions=[ tabulate_manifest_stats, - ] + ], ) self.add_run_plots( data=index_assigment_data, plot_functions=[ tabulate_index_assignment_stats, - ] + ], ) self.add_run_plots( data=unassigned_sequences, plot_functions=[ tabulate_unassigned_index_stats, - ] + ], ) - + self.add_run_plots( data=run_data, - plot_functions=[ - plot_lane_cycle_stats, - plot_run_stats, - plot_base_quality_hist, - plot_base_quality_by_cycle - ] + plot_functions=[plot_lane_cycle_stats, plot_run_stats, plot_base_quality_hist, plot_base_quality_by_cycle], ) - self.add_sample_plots( - data=sample_data, group_lookup=samples_to_projects, project_lookup=samples_to_projects - ) + self.add_sample_plots(data=sample_data, group_lookup=samples_to_projects, project_lookup=samples_to_projects) def get_uuid(self): return str(uuid.uuid4()).replace("-", "").lower() @@ -269,15 +263,13 @@ def _parse_run_project_data(self, data_source: str) -> List[Dict[str, Any]]: "https://docs.elembio.io/docs/bases2fastq/introduction/" ) continue - + run_analysis_name = "-".join([run_name, analysis_id]) run_analysis_name = 
self.clean_s_name(run_analysis_name, f) # skip run if in user provider ignore list if self.is_ignore_sample(run_analysis_name): - log.info( - f"Skipping <{run_analysis_name}> because it is present in ignore list." - ) + log.info(f"Skipping <{run_analysis_name}> because it is present in ignore list.") continue # Check run is present in the final dictionaries @@ -315,7 +307,6 @@ def _parse_run_project_data(self, data_source: str) -> List[Dict[str, Any]]: self.add_data_source(f=f, s_name=run_analysis_name, module="bases2fastq") return [runs_global_data, runs_sample_data, sample_to_project] - def _parse_run_manifest(self, data_source: str) -> Dict[str, Any]: runs_manifest_data = {} @@ -356,8 +347,7 @@ def _parse_run_manifest(self, data_source: str) -> Dict[str, Any]: run_manifest = json.loads(f["f"]) if "Settings" not in run_manifest: log.warning( - f" section not found in {directory}/RunManifest.json.\n" - f"Skipping RunManifest metrics." + f" section not found in {directory}/RunManifest.json.\nSkipping RunManifest metrics." ) else: for lane_data in run_manifest["Settings"]: @@ -378,7 +368,7 @@ def _parse_run_manifest(self, data_source: str) -> Dict[str, Any]: if mask_info["Read"] not in indices: indices.append(mask_info["Read"]) indices_cycles.append(str(len(mask_info["Cycles"]))) - indexing = f'{" + ".join(indices_cycles)}
{" + ".join(indices)}' + indexing = f"{' + '.join(indices_cycles)}
{' + '.join(indices)}" runs_manifest_data[run_lane]["Indexing"] = indexing runs_manifest_data[run_lane]["AdapterTrimType"] = lane_data.get("AdapterTrimType", "N/A") @@ -388,7 +378,7 @@ def _parse_run_manifest(self, data_source: str) -> Dict[str, Any]: runs_manifest_data[run_lane]["R2AdapterMinimumTrimmedLength"] = lane_data.get( "R2AdapterMinimumTrimmedLength", "N/A" ) - + self.add_data_source(f=f, s_name=run_analysis_name, module="bases2fastq") return runs_manifest_data @@ -416,11 +406,9 @@ def _parse_run_unassigned_sequences(self, data_source: str) -> Dict[str, Any]: # skip run if in user provider ignore list if self.is_ignore_sample(run_analysis_name): - log.info( - f"Skipping <{run_analysis_name}> because it is present in ignore list." - ) + log.info(f"Skipping <{run_analysis_name}> because it is present in ignore list.") continue - + # Get total polonies and build unassigned indices dictionary total_polonies = data.get("NumPoloniesBeforeTrimming", 0) if "Lanes" not in data: @@ -491,12 +479,10 @@ def _parse_index_assignment(self, manifest_data_source: str) -> Dict[str, Any]: log.debug(f"Error in RunStats.json: {run_stats_path}") log.debug(f"Missing: RunName: {run_name} or AnalysisID: {analysis_id}") continue - + # skip run if in user provider ignore list if self.is_ignore_sample(run_analysis_name): - log.info( - f"Skipping <{run_analysis_name}> because it is present in ignore list." - ) + log.info(f"Skipping <{run_analysis_name}> because it is present in ignore list.") continue # Ensure sample stats are present @@ -508,7 +494,7 @@ def _parse_index_assignment(self, manifest_data_source: str) -> Dict[str, Any]: ) log.debug(f"Missing SampleStats in RunStats.json. 
Available keys: {list(run_stats.keys())}.") continue - + # Extract per sample polony counts and overall total counts total_polonies = run_stats.get("NumPoloniesBeforeTrimming", 0) for sample_data in run_stats["SampleStats"]: @@ -552,9 +538,7 @@ def _parse_index_assignment(self, manifest_data_source: str) -> Dict[str, Any]: f"Skipping RunManifest sample index assignment metrics." ) elif len(sample_to_index_assignment) == 0: - log.warning( - "Index assignment data missing. Skipping creation of index assignment metrics." - ) + log.warning("Index assignment data missing. Skipping creation of index assignment metrics.") else: for sample_data in run_manifest["Samples"]: sample_name = sample_data.get("SampleName") diff --git a/multiqc/modules/bases2fastq/plot_runs.py b/multiqc/modules/bases2fastq/plot_runs.py index e6db7f69a3..706b265353 100644 --- a/multiqc/modules/bases2fastq/plot_runs.py +++ b/multiqc/modules/bases2fastq/plot_runs.py @@ -102,7 +102,7 @@ def _calculate_reads_eliminated(run_data) -> int: if "NumPolonies" not in lane or "NumPoloniesBeforeTrimming" not in lane: continue reads_eliminated += lane["NumPoloniesBeforeTrimming"] - lane["NumPolonies"] - + return reads_eliminated @@ -183,7 +183,7 @@ def tabulate_project_stats(run_data, color_dict): project_header = "(Project) " elif len(run_keys) == 1: first_key = run_keys[0] - project_header = f'{run_data[first_key]["Project"]} | ' + project_header = f"{run_data[first_key]['Project']} | " plot_name = f"{project_header}Sequencing QC Metrics Table" plot_html = table.plot(plot_content, headers, pconfig=pconfig) anchor = "project_run_qc_metrics_table" @@ -495,7 +495,7 @@ def plot_lane_cycle_stats(run_data, color_dict): if "Lane" not in lane or "Reads" not in lane: continue lane_stats = dict() - lane_name = f'L{lane["Lane"]}' + lane_name = f"L{lane['Lane']}" run_name = f"{s_name} | {lane_name}" lane_stats[run_name] = {} for read in lane["Reads"]: diff --git a/multiqc/modules/bases2fastq/plot_samples.py 
b/multiqc/modules/bases2fastq/plot_samples.py index a5c91449e2..9351b8d099 100644 --- a/multiqc/modules/bases2fastq/plot_samples.py +++ b/multiqc/modules/bases2fastq/plot_samples.py @@ -154,25 +154,27 @@ def plot_sample_assignment_histogram(sample_data, group_lookup_dict, project_loo if len(assignment_data) > bins: break hist, bin_edges = np.histogram(assignment_data, bins=bins) - bin_ranges = [f"({round(bin_edges[i], 2)}-{round(bin_edges[i+1], 2)})" for i in range(len(bin_edges)-1)] + bin_ranges = [f"({round(bin_edges[i], 2)}-{round(bin_edges[i + 1], 2)})" for i in range(len(bin_edges) - 1)] points = [float(point) for point in hist] run_data["Polonies Assigned"] = {bin_range: point for bin_range, point in zip(bin_ranges, points)} plot_content.append(run_data) - pconfig["data_labels"].append({ - "name": run_name, - "xlab": "Assigned Polonies (Range)", - "ylab": "Number of Samples with N Polonies Assigned", - }) - + pconfig["data_labels"].append( + { + "name": run_name, + "xlab": "Assigned Polonies (Range)", + "ylab": "Number of Samples with N Polonies Assigned", + } + ) + pconfig = pconfig | { - "id": "sample_assignment_hist", - "title": "bases2fastq: Sample Polony Assignment Histogram", - "style": 'lines+markers', - "xlab": "Assigned Polonies (Range)", - "ylab": "Number of Samples with N Polonies Assigned", - "categories": True, - } + "id": "sample_assignment_hist", + "title": "bases2fastq: Sample Polony Assignment Histogram", + "style": "lines+markers", + "xlab": "Assigned Polonies (Range)", + "ylab": "Number of Samples with N Polonies Assigned", + "categories": True, + } plot_name = "Sample Polony Assignment Histogram" plot_html = linegraph.plot(plot_content, pconfig=pconfig) @@ -249,21 +251,25 @@ def plot_sample_read_length(sample_data, group_lookup_dict, project_lookup_dict, if len(read_lengths) > bins: break hist, bin_edges = np.histogram(read_lengths, bins=bins) - bin_ranges = [f"({round(bin_edges[i], 2)}-{round(bin_edges[i+1], 2)})" for i in 
range(len(bin_edges)-1)] + bin_ranges = [ + f"({round(bin_edges[i], 2)}-{round(bin_edges[i + 1], 2)})" for i in range(len(bin_edges) - 1) + ] points = [float(point) for point in hist] run_data[read_name] = {bin_range: point for bin_range, point in zip(bin_ranges, points)} plot_content.append(run_data) - pconfig["data_labels"].append({ - "name": run_name, - "xlab": "Average Read Length (Range)", - "ylab": "Samples with Average Read Length", - }) + pconfig["data_labels"].append( + { + "name": run_name, + "xlab": "Average Read Length (Range)", + "ylab": "Samples with Average Read Length", + } + ) pconfig = pconfig | { "id": "mean_read_length_per_sample", "title": "bases2fastq: Mean Read Length Per Sample", - "style": 'lines+markers', + "style": "lines+markers", "xlab": "Average Read Length (Range)", "ylab": "Samples with Average Read Length", "categories": True, From e39c5bc99be9bde8350d35135f0d2a36c686509c Mon Sep 17 00:00:00 2001 From: Carlos Ruiz Date: Wed, 1 Oct 2025 00:47:48 -0700 Subject: [PATCH 07/29] Fixed dictionary merging for older python version --- multiqc/modules/bases2fastq/bases2fastq.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/multiqc/modules/bases2fastq/bases2fastq.py b/multiqc/modules/bases2fastq/bases2fastq.py index 6538f2633e..3a356f477d 100644 --- a/multiqc/modules/bases2fastq/bases2fastq.py +++ b/multiqc/modules/bases2fastq/bases2fastq.py @@ -175,7 +175,7 @@ def __init__(self): project_groups[sample_project].append(sample) if summary_path == "project_level": in_project_sample_groups[sample].append(sample) - merged_groups = dict(run_groups) | dict(project_groups) | dict(in_project_sample_groups) + merged_groups = {**run_groups, **project_groups, **in_project_sample_groups} # Assign color for each group self.color_getter = mqc_colour.mqc_colour_scale() From 5ff5bd784c907de35084983ae494c726c0cd2cc9 Mon Sep 17 00:00:00 2001 From: Carlos Ruiz Date: Wed, 1 Oct 2025 08:06:28 -0700 Subject: [PATCH 08/29] Fixed dict constructor 
--- multiqc/modules/bases2fastq/plot_samples.py | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/multiqc/modules/bases2fastq/plot_samples.py b/multiqc/modules/bases2fastq/plot_samples.py index 9351b8d099..b6088ba186 100644 --- a/multiqc/modules/bases2fastq/plot_samples.py +++ b/multiqc/modules/bases2fastq/plot_samples.py @@ -1,3 +1,4 @@ +from typing import Any, Dict from multiqc.plots import bargraph, linegraph, table from multiqc import config @@ -146,7 +147,7 @@ def plot_sample_assignment_histogram(sample_data, group_lookup_dict, project_loo polonies = data["NumPolonies"] polony_assignments[run_name].append(polonies) - pconfig = {"data_labels": []} + pconfig: Dict[str, Any] = {"data_labels": []} for run_name, assignment_data in polony_assignments.items(): run_data = {} bins = 50 @@ -167,14 +168,14 @@ def plot_sample_assignment_histogram(sample_data, group_lookup_dict, project_loo } ) - pconfig = pconfig | { + pconfig.update({ "id": "sample_assignment_hist", "title": "bases2fastq: Sample Polony Assignment Histogram", "style": "lines+markers", "xlab": "Assigned Polonies (Range)", "ylab": "Number of Samples with N Polonies Assigned", "categories": True, - } + }) plot_name = "Sample Polony Assignment Histogram" plot_html = linegraph.plot(plot_content, pconfig=pconfig) @@ -242,7 +243,7 @@ def plot_sample_read_length(sample_data, group_lookup_dict, project_lookup_dict, read_lengths[run_name][read_id] = [] read_lengths[run_name][read_id].append(read["MeanReadLength"]) - pconfig = {"data_labels": []} + pconfig: Dict[str, Any] = {"data_labels": []} for run_name, read_data in read_lengths.items(): run_data = {} for read_name, read_lengths in read_data.items(): @@ -266,14 +267,14 @@ def plot_sample_read_length(sample_data, group_lookup_dict, project_lookup_dict, } ) - pconfig = pconfig | { + pconfig.update({ "id": "mean_read_length_per_sample", "title": "bases2fastq: Mean Read Length Per Sample", "style": "lines+markers", "xlab": "Average Read 
Length (Range)", "ylab": "Samples with Average Read Length", "categories": True, - } + }) plot_html = linegraph.plot(plot_content, pconfig=pconfig) description = "Distribution of average read lengths for all samples." From fa734f8f85a6d6745acecc8304edca99be3a1ae5 Mon Sep 17 00:00:00 2001 From: Carlos Ruiz Date: Wed, 1 Oct 2025 08:11:01 -0700 Subject: [PATCH 09/29] Linting --- multiqc/modules/bases2fastq/plot_samples.py | 36 ++++++++++++--------- 1 file changed, 20 insertions(+), 16 deletions(-) diff --git a/multiqc/modules/bases2fastq/plot_samples.py b/multiqc/modules/bases2fastq/plot_samples.py index b6088ba186..af745b97fa 100644 --- a/multiqc/modules/bases2fastq/plot_samples.py +++ b/multiqc/modules/bases2fastq/plot_samples.py @@ -168,14 +168,16 @@ def plot_sample_assignment_histogram(sample_data, group_lookup_dict, project_loo } ) - pconfig.update({ - "id": "sample_assignment_hist", - "title": "bases2fastq: Sample Polony Assignment Histogram", - "style": "lines+markers", - "xlab": "Assigned Polonies (Range)", - "ylab": "Number of Samples with N Polonies Assigned", - "categories": True, - }) + pconfig.update( + { + "id": "sample_assignment_hist", + "title": "bases2fastq: Sample Polony Assignment Histogram", + "style": "lines+markers", + "xlab": "Assigned Polonies (Range)", + "ylab": "Number of Samples with N Polonies Assigned", + "categories": True, + } + ) plot_name = "Sample Polony Assignment Histogram" plot_html = linegraph.plot(plot_content, pconfig=pconfig) @@ -267,14 +269,16 @@ def plot_sample_read_length(sample_data, group_lookup_dict, project_lookup_dict, } ) - pconfig.update({ - "id": "mean_read_length_per_sample", - "title": "bases2fastq: Mean Read Length Per Sample", - "style": "lines+markers", - "xlab": "Average Read Length (Range)", - "ylab": "Samples with Average Read Length", - "categories": True, - }) + pconfig.update( + { + "id": "mean_read_length_per_sample", + "title": "bases2fastq: Mean Read Length Per Sample", + "style": "lines+markers", + 
"xlab": "Average Read Length (Range)", + "ylab": "Samples with Average Read Length", + "categories": True, + } + ) plot_html = linegraph.plot(plot_content, pconfig=pconfig) description = "Distribution of average read lengths for all samples." From 585e241d358612ae7f6f6d3f9db41f6e5290a180 Mon Sep 17 00:00:00 2001 From: Carlos Ruiz Date: Wed, 1 Oct 2025 08:33:50 -0700 Subject: [PATCH 10/29] Fix raise error --- multiqc/modules/bases2fastq/bases2fastq.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/multiqc/modules/bases2fastq/bases2fastq.py b/multiqc/modules/bases2fastq/bases2fastq.py index 3a356f477d..f3809ae50c 100644 --- a/multiqc/modules/bases2fastq/bases2fastq.py +++ b/multiqc/modules/bases2fastq/bases2fastq.py @@ -160,7 +160,7 @@ def __init__(self): else: error_msg = "No run- or project-level data was retained. No report will be generated." log.error(error_msg) - return + raise ModuleNoSamplesFound(error_msg) # Create run and project groups run_groups = defaultdict(list) From b33b7ca939b546592ab4926b528533130a6d0338 Mon Sep 17 00:00:00 2001 From: Carlos Ruiz Date: Wed, 1 Oct 2025 09:02:00 -0700 Subject: [PATCH 11/29] test plot with random id --- multiqc/modules/bases2fastq/plot_runs.py | 32 +++++++++++++++--------- 1 file changed, 20 insertions(+), 12 deletions(-) diff --git a/multiqc/modules/bases2fastq/plot_runs.py b/multiqc/modules/bases2fastq/plot_runs.py index 706b265353..ab4110b8f0 100644 --- a/multiqc/modules/bases2fastq/plot_runs.py +++ b/multiqc/modules/bases2fastq/plot_runs.py @@ -3,16 +3,23 @@ from multiqc.plots import bargraph, linegraph, table from multiqc import config from natsort import natsorted +import random +import string """ Functions for plotting per run information of bases2fastq """ +def generate_random_string(): + return ''.join(random.choices(string.ascii_letters + string.digits, k=4)) + + def plot_run_stats(run_data, color_dict): """ Plot a bar graph for polony numbers, Q30/Q40, index assignment rate and yields for 
each run """ + random_id = generate_random_string() run_names = list(run_data.keys()) run_names.sort() num_polonies = dict() @@ -58,7 +65,7 @@ def plot_run_stats(run_data, color_dict): ], "cpswitch": True, "stacking": "normal", - "id": "run_metrics_bar", + "id": f"run_metrics_bar_{random_id}", "title": "bases2fastq: General Sequencing Run QC metrics plot", "ylab": "QC", } @@ -71,7 +78,7 @@ def plot_run_stats(run_data, color_dict): ] * 2 plot_name = "Sequencing Run Yield" plot_html = bargraph.plot(plot_content, cats, pconfig=pconfig) - anchor = "run_yield_plot" + anchor = f"run_metrics_bar_{random_id}" description = "Bar plots of sequencing run yields. Please see individual run reports for details" helptext = """ This section shows and compare the yield and index assignment rate of each sequencing run.\n\n @@ -287,14 +294,14 @@ def tabulate_run_stats(run_data, color_dict): description = "QC metrics per run" helptext = """ This section displays metrics that indicate the quality of each sequencing run: \n - - Run Name: Unique identifier composed of (RunName)__(UUID), where (RunName) maps to the AVITI run name and (UUID) maps to the unique Bases2Fastq analysis result.\n - - Number of Polonies: The total number of polonies that are calculated for the run.\n - - Percentage Assigned Reads: The percentage of reads that are assigned to a sample.\n - - Assigned Yield (Gb): The run yield that is based on assigned reads in gigabases.\n - - Quality Score Mean: The mean Q score of base calls for the samples. This excludes filtered reads and no calls.\n - - Percent Q30: The percentage of ≥ Q30 Q scores for the run. This includes assigned and unassigned reads and excludes filtered reads and no calls.\n - - Percent Q40: The percentage of ≥ Q40 Q scores for the run. 
This includes assigned and unassigned reads and excludes filtered reads and no calls.\n - - Reads Eliminated: Number of reads eliminated across lanes.\n + - Run Name: Unique identifier composed of (RunName)__(UUID), where (RunName) maps to the AVITI run name and (UUID) maps to the unique Bases2Fastq analysis result.\n + - Number of Polonies: The total number of polonies that are calculated for the run.\n + - Percentage Assigned Reads: The percentage of reads that are assigned to a sample.\n + - Assigned Yield (Gb): The run yield that is based on assigned reads in gigabases.\n + - Quality Score Mean: The mean Q score of base calls for the samples. This excludes filtered reads and no calls.\n + - Percent Q30: The percentage of ≥ Q30 Q scores for the run. This includes assigned and unassigned reads and excludes filtered reads and no calls.\n + - Percent Q40: The percentage of ≥ Q40 Q scores for the run. This includes assigned and unassigned reads and excludes filtered reads and no calls.\n + - Reads Eliminated: Number of reads eliminated across lanes.\n """ return plot_html, plot_name, anchor, description, helptext, plot_content @@ -303,6 +310,7 @@ def tabulate_manifest_stats(run_data, color_dict): """ Tabulate general information and statistics of each run """ + random_id = generate_random_string() plot_content = dict() for s_name in run_data.keys(): run_stats = dict() @@ -336,12 +344,12 @@ def tabulate_manifest_stats(run_data, color_dict): pconfig = { "title": "Bases2Fastq: Run Manifest Metrics", "col1_header": "Run Name | Lane", - "id": "run_manifest_metrics", + "id": f"run_manifest_metrics_table_{random_id}", } plot_name = "Run Manifest Table" plot_html = table.plot(plot_content, headers, pconfig=pconfig) - anchor = "run_manifest_metrics_table" + anchor = f"run_manifest_metrics_table_{random_id}" description = "Run parameters used." 
helptext = """ This section displays metrics that indicate the parameters used in the run: \n From 0580139822db7a1bd918d453bcf9c51e29dc8f81 Mon Sep 17 00:00:00 2001 From: Carlos Ruiz Date: Wed, 1 Oct 2025 09:03:48 -0700 Subject: [PATCH 12/29] pre xcommit --- multiqc/modules/bases2fastq/plot_runs.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/multiqc/modules/bases2fastq/plot_runs.py b/multiqc/modules/bases2fastq/plot_runs.py index ab4110b8f0..a9e428cd6b 100644 --- a/multiqc/modules/bases2fastq/plot_runs.py +++ b/multiqc/modules/bases2fastq/plot_runs.py @@ -12,7 +12,7 @@ def generate_random_string(): - return ''.join(random.choices(string.ascii_letters + string.digits, k=4)) + return "".join(random.choices(string.ascii_letters + string.digits, k=4)) def plot_run_stats(run_data, color_dict): From ed1d894ecee56408d4b86b6f2a77e292fd4d5772 Mon Sep 17 00:00:00 2001 From: Carlos Ruiz Date: Wed, 1 Oct 2025 09:45:32 -0700 Subject: [PATCH 13/29] Test no id --- multiqc/modules/bases2fastq/plot_runs.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/multiqc/modules/bases2fastq/plot_runs.py b/multiqc/modules/bases2fastq/plot_runs.py index a9e428cd6b..defc6d1149 100644 --- a/multiqc/modules/bases2fastq/plot_runs.py +++ b/multiqc/modules/bases2fastq/plot_runs.py @@ -344,12 +344,12 @@ def tabulate_manifest_stats(run_data, color_dict): pconfig = { "title": "Bases2Fastq: Run Manifest Metrics", "col1_header": "Run Name | Lane", - "id": f"run_manifest_metrics_table_{random_id}", + # "id": f"run_manifest_metrics_table_{random_id}", } plot_name = "Run Manifest Table" plot_html = table.plot(plot_content, headers, pconfig=pconfig) - anchor = f"run_manifest_metrics_table_{random_id}" + anchor = f"run_manifest_metrics_table" description = "Run parameters used." 
helptext = """ This section displays metrics that indicate the parameters used in the run: \n From 16f5655e988baa81e27c45d78e76f91415a667f1 Mon Sep 17 00:00:00 2001 From: Carlos Ruiz Date: Wed, 1 Oct 2025 09:51:59 -0700 Subject: [PATCH 14/29] Random IDs --- multiqc/modules/bases2fastq/plot_runs.py | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/multiqc/modules/bases2fastq/plot_runs.py b/multiqc/modules/bases2fastq/plot_runs.py index defc6d1149..e905237ee7 100644 --- a/multiqc/modules/bases2fastq/plot_runs.py +++ b/multiqc/modules/bases2fastq/plot_runs.py @@ -11,15 +11,14 @@ """ -def generate_random_string(): - return "".join(random.choices(string.ascii_letters + string.digits, k=4)) +def generate_random_string(length: int): + return "".join(random.choices(string.ascii_letters + string.digits, k=length)) def plot_run_stats(run_data, color_dict): """ Plot a bar graph for polony numbers, Q30/Q40, index assignment rate and yields for each run """ - random_id = generate_random_string() run_names = list(run_data.keys()) run_names.sort() num_polonies = dict() @@ -65,7 +64,7 @@ def plot_run_stats(run_data, color_dict): ], "cpswitch": True, "stacking": "normal", - "id": f"run_metrics_bar_{random_id}", + "id": generate_random_string(10), "title": "bases2fastq: General Sequencing Run QC metrics plot", "ylab": "QC", } @@ -78,7 +77,7 @@ def plot_run_stats(run_data, color_dict): ] * 2 plot_name = "Sequencing Run Yield" plot_html = bargraph.plot(plot_content, cats, pconfig=pconfig) - anchor = f"run_metrics_bar_{random_id}" + anchor = "run_metrics_bar" description = "Bar plots of sequencing run yields. 
Please see individual run reports for details" helptext = """ This section shows and compare the yield and index assignment rate of each sequencing run.\n\n @@ -310,7 +309,6 @@ def tabulate_manifest_stats(run_data, color_dict): """ Tabulate general information and statistics of each run """ - random_id = generate_random_string() plot_content = dict() for s_name in run_data.keys(): run_stats = dict() @@ -344,12 +342,12 @@ def tabulate_manifest_stats(run_data, color_dict): pconfig = { "title": "Bases2Fastq: Run Manifest Metrics", "col1_header": "Run Name | Lane", - # "id": f"run_manifest_metrics_table_{random_id}", + "id": generate_random_string(10), } plot_name = "Run Manifest Table" plot_html = table.plot(plot_content, headers, pconfig=pconfig) - anchor = f"run_manifest_metrics_table" + anchor = "run_manifest_metrics_table" description = "Run parameters used." helptext = """ This section displays metrics that indicate the parameters used in the run: \n From 1e0df111431c34dbccae2935fad1a377054379a6 Mon Sep 17 00:00:00 2001 From: Carlos Ruiz Date: Wed, 1 Oct 2025 09:57:28 -0700 Subject: [PATCH 15/29] Test mix static plus random --- multiqc/modules/bases2fastq/plot_runs.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/multiqc/modules/bases2fastq/plot_runs.py b/multiqc/modules/bases2fastq/plot_runs.py index e905237ee7..66274762ba 100644 --- a/multiqc/modules/bases2fastq/plot_runs.py +++ b/multiqc/modules/bases2fastq/plot_runs.py @@ -405,7 +405,7 @@ def tabulate_index_assignment_stats(run_data, color_dict): pconfig = { "title": "Bases2Fastq: Index Assignment Metrics", "col1_header": "Sample #", - "id": "index_assignment_metrics", + "id": f"index_assignment_metrics_{generate_random_string(5)}", } plot_name = "Index Assignment Metrics" From 20b59b7c67db83f3de4b1108cb1160494ded039f Mon Sep 17 00:00:00 2001 From: Carlos Ruiz Date: Wed, 1 Oct 2025 10:12:51 -0700 Subject: [PATCH 16/29] standardize plot id --- multiqc/modules/bases2fastq/plot_runs.py | 2 +- 
1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/multiqc/modules/bases2fastq/plot_runs.py b/multiqc/modules/bases2fastq/plot_runs.py index 66274762ba..9b2bde785d 100644 --- a/multiqc/modules/bases2fastq/plot_runs.py +++ b/multiqc/modules/bases2fastq/plot_runs.py @@ -342,7 +342,7 @@ def tabulate_manifest_stats(run_data, color_dict): pconfig = { "title": "Bases2Fastq: Run Manifest Metrics", "col1_header": "Run Name | Lane", - "id": generate_random_string(10), + "id": f"run_manifest_metrics_table_{generate_random_string(5)}", } plot_name = "Run Manifest Table" From 9bd588cdf18ea8f6ab341cbaec7317d0e4be3f77 Mon Sep 17 00:00:00 2001 From: Carlos Ruiz Date: Wed, 1 Oct 2025 11:45:00 -0700 Subject: [PATCH 17/29] Added random ids to plots --- multiqc/modules/bases2fastq/plot_runs.py | 8 ++++---- multiqc/modules/bases2fastq/plot_samples.py | 17 +++++++++-------- 2 files changed, 13 insertions(+), 12 deletions(-) diff --git a/multiqc/modules/bases2fastq/plot_runs.py b/multiqc/modules/bases2fastq/plot_runs.py index 9b2bde785d..4cff385247 100644 --- a/multiqc/modules/bases2fastq/plot_runs.py +++ b/multiqc/modules/bases2fastq/plot_runs.py @@ -470,7 +470,7 @@ def tabulate_unassigned_index_stats(run_data, color_dict): pconfig = { "title": "Bases2Fastq: Unassigned Indices Metrics", "col1_header": "Index #", - "id": "index_unassignment_metrics", + "id": f"index_unassignment_metrics_{generate_random_string(5)}", } plot_name = "Unassigned Indices Metrics" @@ -514,7 +514,7 @@ def plot_lane_cycle_stats(run_data, color_dict): pconfig = { "title": "Bases2Fastq: Cycles Per Read Per Lane", - "id": "project_cycles_per_read_per_lane", + "id": f"project_cycles_per_read_per_lane_{generate_random_string(5)}", "ylab": "Read Cycles", "cpswitch": False, "subtitle": None, @@ -584,7 +584,7 @@ def plot_base_quality_hist(run_data, color_dict): "colors": color_dict, }, ], - "id": "per_run_bq_hist", + "id": f"per_run_bq_hist_{generate_random_string(5)}", "title": "bases2fastq: Quality 
Histograms", "ylab": "Percentage", } @@ -706,7 +706,7 @@ def plot_base_quality_by_cycle(run_data, color_dict): "x_lines": [{"color": "#FF0000", "width": 2, "value": r1r2_split, "dashStyle": "dash"}], "colors": color_dict, "ymin": 0, - "id": "per_run_quality_by_cycle", + "id": f"per_run_quality_by_cycle_{generate_random_string(5)}", "title": "bases2fastq: Quality by cycles", "ylab": "QC", } diff --git a/multiqc/modules/bases2fastq/plot_samples.py b/multiqc/modules/bases2fastq/plot_samples.py index af745b97fa..5ef104ddb4 100644 --- a/multiqc/modules/bases2fastq/plot_samples.py +++ b/multiqc/modules/bases2fastq/plot_samples.py @@ -1,6 +1,7 @@ from typing import Any, Dict from multiqc.plots import bargraph, linegraph, table from multiqc import config +from .plot_runs import generate_random_string import numpy as np @@ -109,7 +110,7 @@ def tabulate_sample_stats(sample_data, group_lookup_dict, project_lookup_dict, s "suffix": "%", } - pconfig = {"id": "sample_qc_metric_table", "title": "Sample QC Metrics Table", "no_violin": False} + pconfig = {"id": f"sample_qc_metric_table_{generate_random_string(5)}", "title": "Sample QC Metrics Table", "no_violin": False} plot_name = "Sample QC Metrics Table" plot_html = table.plot(plot_content, headers, pconfig=pconfig) @@ -170,7 +171,7 @@ def plot_sample_assignment_histogram(sample_data, group_lookup_dict, project_loo pconfig.update( { - "id": "sample_assignment_hist", + "id": f"sample_assignment_hist_{generate_random_string(5)}", "title": "bases2fastq: Sample Polony Assignment Histogram", "style": "lines+markers", "xlab": "Assigned Polonies (Range)", @@ -218,7 +219,7 @@ def plot_sample_read_length(sample_data, group_lookup_dict, project_lookup_dict, pconfig = { "title": "Bases2Fastq: Mean Read Length per Sample", - "id": "mean_read_length_per_sample", + "id": f"mean_read_length_per_sample_{generate_random_string(5)}", "ylab": "Bases", "cpswitch": False, "subtitle": None, @@ -271,7 +272,7 @@ def plot_sample_read_length(sample_data, 
group_lookup_dict, project_lookup_dict, pconfig.update( { - "id": "mean_read_length_per_sample", + "id": f"mean_read_length_per_sample_{generate_random_string(5)}", "title": "bases2fastq: Mean Read Length Per Sample", "style": "lines+markers", "xlab": "Average Read Length (Range)", @@ -339,7 +340,7 @@ def sequence_content_plot(sample_data, group_lookup_dict, project_lookup_dict, c "x_lines": [{"color": "#FF0000", "width": 2, "value": r1r2_split, "dashStyle": "dash"}], "colors": color_dict, "ymin": 0, - "id": "per_cycle_base_content", + "id": f"per_cycle_base_content_{generate_random_string(5)}", "title": "bases2fastq: Per Cycle Base Content Percentage", } plot_html = linegraph.plot(plot_content, pconfig=pconfig) @@ -406,7 +407,7 @@ def plot_per_cycle_N_content(sample_data, group_lookup_dict, project_lookup_dict "colors": color_dict, "ymin": 0, "ymax": 100, - "id": "per_cycle_n_content", + "id": f"per_cycle_n_content_{generate_random_string(5)}", "title": "bases2fastq: Per Cycle N Content Percentage", } plot_html = linegraph.plot(plot_content, pconfig=pconfig) @@ -463,7 +464,7 @@ def plot_per_read_gc_hist(sample_data, group_lookup_dict, project_lookup_dict, s "xlab": "GC Content (%)", "ylab": "Percentage of reads that have GC (%)", "colors": sample_color, - "id": "gc_hist", + "id": f"gc_hist_{generate_random_string(5)}", "title": "bases2fastq: Per Sample GC Content Histogram", } plot_name = "Per Sample GC Histogram" @@ -520,7 +521,7 @@ def plot_adapter_content(sample_data, group_lookup_dict, project_lookup_dict, sa adapter_percent = cycle["PercentReadsTrimmed"] plot_content[s_name].update({cycle_no: adapter_percent}) pconfig = { - "id": "per_cycle_adapter_content", + "id": f"per_cycle_adapter_content_{generate_random_string(5)}", "title": "bases2fastq: Per Cycle Adapter Content", "xlab": "Cycle", "ylab": "% of Sequences", From f299394064f09264d906f748bfb0c71efeb692f9 Mon Sep 17 00:00:00 2001 From: Carlos Ruiz Date: Wed, 1 Oct 2025 12:07:40 -0700 Subject: [PATCH 
18/29] Fix skipping --- multiqc/modules/bases2fastq/bases2fastq.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/multiqc/modules/bases2fastq/bases2fastq.py b/multiqc/modules/bases2fastq/bases2fastq.py index f3809ae50c..ad1655e71d 100644 --- a/multiqc/modules/bases2fastq/bases2fastq.py +++ b/multiqc/modules/bases2fastq/bases2fastq.py @@ -74,6 +74,7 @@ def __init__(self): run_level_log_files = len(list(self.find_log_files("bases2fastq/run"))) project_level_log_files = len(list(self.find_log_files("bases2fastq/project"))) + if run_level_log_files == 0 and project_level_log_files == 0: error_msg = "No run- or project-level log files found within the Bases2Fastq results." log.error(error_msg) @@ -98,7 +99,7 @@ def __init__(self): [ len(self.run_level_data) == 0, num_run_level_samples == 0, - len(self.project_level_data), + len(self.project_level_data) == 0, num_project_level_samples == 0, ] ): From 06eafeda008d09ebbb3dfd3da9ea47bd216b626a Mon Sep 17 00:00:00 2001 From: Carlos Ruiz Date: Wed, 1 Oct 2025 14:06:10 -0700 Subject: [PATCH 19/29] Fixed color palette --- multiqc/modules/bases2fastq/plot_runs.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/multiqc/modules/bases2fastq/plot_runs.py b/multiqc/modules/bases2fastq/plot_runs.py index 4cff385247..efc074f103 100644 --- a/multiqc/modules/bases2fastq/plot_runs.py +++ b/multiqc/modules/bases2fastq/plot_runs.py @@ -64,7 +64,7 @@ def plot_run_stats(run_data, color_dict): ], "cpswitch": True, "stacking": "normal", - "id": generate_random_string(10), + "id": f"run_metrics_bar_{generate_random_string(10)}", "title": "bases2fastq: General Sequencing Run QC metrics plot", "ylab": "QC", } @@ -456,14 +456,14 @@ def tabulate_unassigned_index_stats(run_data, color_dict): headers["Polonies"] = { "title": "Polonies", "description": "Number of polonies assigned to indices.", - "scale": "GnYlRd", + "scale": "RdYlGn_r", } headers["% Polonies"] = { "title": "% Polonies", "description": 
"Percentage of total polonies assigned to this index combination.", "max": 100, "min": 0, - "scale": "GnYlRd", + "scale": "RdYlGn_r", "suffix": "%", } From 8addc829394a2bffc1b3c843f7d0b72e63e8edb3 Mon Sep 17 00:00:00 2001 From: Carlos Ruiz Date: Thu, 2 Oct 2025 17:16:31 -0700 Subject: [PATCH 20/29] DefaultProject bug fix, added mean len to table --- multiqc/modules/bases2fastq/bases2fastq.py | 7 ++++--- multiqc/modules/bases2fastq/plot_samples.py | 16 ++++++++++++++++ 2 files changed, 20 insertions(+), 3 deletions(-) diff --git a/multiqc/modules/bases2fastq/bases2fastq.py b/multiqc/modules/bases2fastq/bases2fastq.py index ad1655e71d..29b1eb7e5a 100644 --- a/multiqc/modules/bases2fastq/bases2fastq.py +++ b/multiqc/modules/bases2fastq/bases2fastq.py @@ -167,6 +167,7 @@ def __init__(self): run_groups = defaultdict(list) project_groups = defaultdict(list) in_project_sample_groups = defaultdict(list) + ind_sample_groups = defaultdict(list) sample_to_run_group = {} for sample in sample_data.keys(): (_run_name, _) = sample.split("__") @@ -174,9 +175,10 @@ def __init__(self): sample_to_run_group[sample] = _run_name sample_project = samples_to_projects[sample] project_groups[sample_project].append(sample) + ind_sample_groups[sample] = [sample] if summary_path == "project_level": in_project_sample_groups[sample].append(sample) - merged_groups = {**run_groups, **project_groups, **in_project_sample_groups} + merged_groups = {**run_groups, **project_groups, **in_project_sample_groups, **ind_sample_groups} # Assign color for each group self.color_getter = mqc_colour.mqc_colour_scale() @@ -188,8 +190,7 @@ def __init__(self): [], ) if len(merged_groups) > len(self.palette): - hex_range = 2**24 - extra_colors = [hex(random.randrange(0, hex_range)) for _ in range(len(merged_groups), len(self.palette))] + extra_colors = ["#{:06x}".format(random.randrange(0, 0xFFFFFF)) for _ in range(len(self.palette), len(merged_groups))] self.palette = self.palette + extra_colors self.group_color = 
{g: c for g, c in zip(merged_groups.keys(), self.palette[: len(merged_groups)])} self.sample_color = dict() diff --git a/multiqc/modules/bases2fastq/plot_samples.py b/multiqc/modules/bases2fastq/plot_samples.py index 5ef104ddb4..72caebb7fb 100644 --- a/multiqc/modules/bases2fastq/plot_samples.py +++ b/multiqc/modules/bases2fastq/plot_samples.py @@ -37,6 +37,7 @@ def tabulate_sample_stats(sample_data, group_lookup_dict, project_lookup_dict, s Tabulate general information and statistics per sample """ plot_content = dict() + reads_present = set() for s_name in sample_data.keys(): general_stats = dict() general_stats.update({"group": group_lookup_dict[s_name]}) @@ -48,6 +49,13 @@ def tabulate_sample_stats(sample_data, group_lookup_dict, project_lookup_dict, s general_stats.update({"percent_q40_sample": sample_data[s_name]["PercentQ40"]}) general_stats.update({"reads_eliminated": _calculate_sample_reads_eliminated(sample_data[s_name])}) general_stats.update({"percent_mismatch": sample_data[s_name]["PercentMismatch"]}) + if "Reads" in sample_data[s_name]: + for read in sample_data[s_name]["Reads"]: + read_name = read["Read"] + reads_present.add(read_name) + mean_length = read["MeanReadLength"] + general_stats.update({f"{read_name}_mean_len": mean_length}) + plot_content.update({s_name: general_stats}) headers = {} @@ -97,6 +105,14 @@ def tabulate_sample_stats(sample_data, group_lookup_dict, project_lookup_dict, s "scale": "RdYlGn", "suffix": "%", } + + for read in sorted(reads_present): + headers[f"{read}_mean_len"] = { + "title": f"{read} Mean Lenght", + "description": f"Average read length for read {read}", + "scale": "RdYlGn", + } + headers["reads_eliminated"] = { "title": "Reads Eliminated", "description": "Number of reads eliminated.", From aa87263a3cf061ca6cafaa7d09b95e07e60de5b7 Mon Sep 17 00:00:00 2001 From: Carlos Ruiz Date: Thu, 2 Oct 2025 17:25:58 -0700 Subject: [PATCH 21/29] Removed histograms --- multiqc/modules/bases2fastq/bases2fastq.py | 4 - 
multiqc/modules/bases2fastq/plot_samples.py | 160 +------------------- 2 files changed, 1 insertion(+), 163 deletions(-) diff --git a/multiqc/modules/bases2fastq/bases2fastq.py b/multiqc/modules/bases2fastq/bases2fastq.py index 29b1eb7e5a..19851b6b4a 100644 --- a/multiqc/modules/bases2fastq/bases2fastq.py +++ b/multiqc/modules/bases2fastq/bases2fastq.py @@ -24,12 +24,10 @@ ) from multiqc.modules.bases2fastq.plot_samples import ( tabulate_sample_stats, - plot_sample_assignment_histogram, sequence_content_plot, plot_per_cycle_N_content, plot_adapter_content, plot_per_read_gc_hist, - plot_sample_read_length, ) log = logging.getLogger(__name__) @@ -576,8 +574,6 @@ def add_run_plots(self, data, plot_functions): def add_sample_plots(self, data, group_lookup, project_lookup): plot_functions = [ tabulate_sample_stats, - plot_sample_assignment_histogram, - plot_sample_read_length, sequence_content_plot, plot_per_cycle_N_content, plot_adapter_content, diff --git a/multiqc/modules/bases2fastq/plot_samples.py b/multiqc/modules/bases2fastq/plot_samples.py index 72caebb7fb..919a996bb3 100644 --- a/multiqc/modules/bases2fastq/plot_samples.py +++ b/multiqc/modules/bases2fastq/plot_samples.py @@ -108,7 +108,7 @@ def tabulate_sample_stats(sample_data, group_lookup_dict, project_lookup_dict, s for read in sorted(reads_present): headers[f"{read}_mean_len"] = { - "title": f"{read} Mean Lenght", + "title": f"{read} Mean Length", "description": f"Average read length for read {read}", "scale": "RdYlGn", } @@ -149,164 +149,6 @@ def tabulate_sample_stats(sample_data, group_lookup_dict, project_lookup_dict, s return plot_html, plot_name, anchor, description, helptext, plot_content -def plot_sample_assignment_histogram(sample_data, group_lookup_dict, project_lookup_dict, color_dict): - """ - Plots a histogram of number of assigned polonies in all samples for each run. 
- """ - plot_content = [] - polony_assignments = {} - for s_name, data in sample_data.items(): - if "NumPolonies" not in data: - continue - run_name, _ = s_name.split("__") - if run_name not in polony_assignments: - polony_assignments[run_name] = [] - polonies = data["NumPolonies"] - polony_assignments[run_name].append(polonies) - - pconfig: Dict[str, Any] = {"data_labels": []} - for run_name, assignment_data in polony_assignments.items(): - run_data = {} - bins = 50 - for bins in [50, 20, 10]: - if len(assignment_data) > bins: - break - hist, bin_edges = np.histogram(assignment_data, bins=bins) - bin_ranges = [f"({round(bin_edges[i], 2)}-{round(bin_edges[i + 1], 2)})" for i in range(len(bin_edges) - 1)] - points = [float(point) for point in hist] - run_data["Polonies Assigned"] = {bin_range: point for bin_range, point in zip(bin_ranges, points)} - plot_content.append(run_data) - - pconfig["data_labels"].append( - { - "name": run_name, - "xlab": "Assigned Polonies (Range)", - "ylab": "Number of Samples with N Polonies Assigned", - } - ) - - pconfig.update( - { - "id": f"sample_assignment_hist_{generate_random_string(5)}", - "title": "bases2fastq: Sample Polony Assignment Histogram", - "style": "lines+markers", - "xlab": "Assigned Polonies (Range)", - "ylab": "Number of Samples with N Polonies Assigned", - "categories": True, - } - ) - - plot_name = "Sample Polony Assignment Histogram" - plot_html = linegraph.plot(plot_content, pconfig=pconfig) - anchor = "sample_assignment_hist" - description = "Histogram showing the distribution of samples according to the number of polonies assigned to them." - helptext = """ - Shows bins of assigned polony counts on the X-axis and the number of samples whose number of polonies fall - within each bin on the Y-axis. 
- """ - - return plot_html, plot_name, anchor, description, helptext, plot_content - - -def plot_sample_read_length(sample_data, group_lookup_dict, project_lookup_dict, color_dict): - """ - Plots the average read length for each sample if less than 50 samples in total, or the distribution per run - as a lineplot based on histogram bins. - """ - total_samples = len(sample_data.keys()) - plot_content = dict() - pconfig = {} - plot_html = None - plot_name = "Mean Read Length per Sample" - anchor = "mean_read_length_per_sample" - description = "" - helptext = "" - - if total_samples <= 50: - for s_name, data in sample_data.items(): - read_lengths = {s_name: {}} - if "Reads" not in data: - continue - for read in data["Reads"]: - read_name = read["Read"] - mean_length = read["MeanReadLength"] - read_lengths[s_name][read_name] = mean_length - plot_content.update(read_lengths) - - pconfig = { - "title": "Bases2Fastq: Mean Read Length per Sample", - "id": f"mean_read_length_per_sample_{generate_random_string(5)}", - "ylab": "Bases", - "cpswitch": False, - "subtitle": None, - "stacking": "group", - } - plot_html = bargraph.plot(plot_content, pconfig=pconfig) - description = "Average read length per read for all samples." - helptext = """ - Shows the average read length for each read in each sample. 
- """ - - elif total_samples > 50: - plot_content = [] - read_lengths = {} - for s_name, data in sample_data.items(): - if "Reads" not in data: - continue - run_name, _ = s_name.split("__") - if run_name not in read_lengths: - read_lengths[run_name] = {} - for read in data["Reads"]: - read_id = read["Read"] - if read_id not in read_lengths[run_name]: - read_lengths[run_name][read_id] = [] - read_lengths[run_name][read_id].append(read["MeanReadLength"]) - - pconfig: Dict[str, Any] = {"data_labels": []} - for run_name, read_data in read_lengths.items(): - run_data = {} - for read_name, read_lengths in read_data.items(): - bins = 50 - for bins in [50, 20, 10]: - if len(read_lengths) > bins: - break - hist, bin_edges = np.histogram(read_lengths, bins=bins) - bin_ranges = [ - f"({round(bin_edges[i], 2)}-{round(bin_edges[i + 1], 2)})" for i in range(len(bin_edges) - 1) - ] - points = [float(point) for point in hist] - run_data[read_name] = {bin_range: point for bin_range, point in zip(bin_ranges, points)} - plot_content.append(run_data) - - pconfig["data_labels"].append( - { - "name": run_name, - "xlab": "Average Read Length (Range)", - "ylab": "Samples with Average Read Length", - } - ) - - pconfig.update( - { - "id": f"mean_read_length_per_sample_{generate_random_string(5)}", - "title": "bases2fastq: Mean Read Length Per Sample", - "style": "lines+markers", - "xlab": "Average Read Length (Range)", - "ylab": "Samples with Average Read Length", - "categories": True, - } - ) - - plot_html = linegraph.plot(plot_content, pconfig=pconfig) - description = "Distribution of average read lengths for all samples." - helptext = """ - Shows the distribution of samples whose average read lengths fall in a given range. - Reads are shown as different lines. 
- """ - - return plot_html, plot_name, anchor, description, helptext, plot_content - - def sequence_content_plot(sample_data, group_lookup_dict, project_lookup_dict, color_dict): """Create the epic HTML for the FastQC sequence content heatmap""" From 626544a0c461ab5065c4e1e978bfec054a079764 Mon Sep 17 00:00:00 2001 From: Carlos Ruiz Date: Fri, 3 Oct 2025 09:52:21 -0700 Subject: [PATCH 22/29] Linting --- multiqc/modules/bases2fastq/plot_runs.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/multiqc/modules/bases2fastq/plot_runs.py b/multiqc/modules/bases2fastq/plot_runs.py index efc074f103..ae06f9af12 100644 --- a/multiqc/modules/bases2fastq/plot_runs.py +++ b/multiqc/modules/bases2fastq/plot_runs.py @@ -371,7 +371,7 @@ def tabulate_index_assignment_stats(run_data, color_dict): sample_index_stats.update({"sample_name": sample_data["SampleID"]}) sample_index_stats.update({"index_1": sample_data["Index1"]}) sample_index_stats.update({"index_2": sample_data["Index2"]}) - sample_index_stats.update({"polonies": sample_data["SamplePolonyCounts"]}) + sample_index_stats.update({"assigned_polonies": sample_data["SamplePolonyCounts"]}) sample_index_stats.update({"polony_percentage": sample_data["PercentOfPolonies"]}) plot_content.update({index: sample_index_stats}) @@ -388,8 +388,8 @@ def tabulate_index_assignment_stats(run_data, color_dict): "title": "Index 2", "description": "Sample Index 2 (I2).", } - headers["polonies"] = { - "title": "Polonies", + headers["assigned_polonies"] = { + "title": "Assigned Polonies", "description": "Number of polonies assigned to sample.", "scale": "RdYlGn", } @@ -456,14 +456,14 @@ def tabulate_unassigned_index_stats(run_data, color_dict): headers["Polonies"] = { "title": "Polonies", "description": "Number of polonies assigned to indices.", - "scale": "RdYlGn_r", + "scale": "RdYlGn-rev", } headers["% Polonies"] = { "title": "% Polonies", "description": "Percentage of total polonies assigned to this index 
combination.", "max": 100, "min": 0, - "scale": "RdYlGn_r", + "scale": "RdYlGn-rev", "suffix": "%", } From e34652bf37a258f151e6705f14f4c95d6877b3e0 Mon Sep 17 00:00:00 2001 From: Carlos Ruiz Date: Fri, 3 Oct 2025 09:55:02 -0700 Subject: [PATCH 23/29] fix pre-commit --- multiqc/modules/bases2fastq/bases2fastq.py | 5 +++-- multiqc/modules/bases2fastq/plot_samples.py | 6 +++++- 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/multiqc/modules/bases2fastq/bases2fastq.py b/multiqc/modules/bases2fastq/bases2fastq.py index 19851b6b4a..598082e295 100644 --- a/multiqc/modules/bases2fastq/bases2fastq.py +++ b/multiqc/modules/bases2fastq/bases2fastq.py @@ -72,7 +72,6 @@ def __init__(self): run_level_log_files = len(list(self.find_log_files("bases2fastq/run"))) project_level_log_files = len(list(self.find_log_files("bases2fastq/project"))) - if run_level_log_files == 0 and project_level_log_files == 0: error_msg = "No run- or project-level log files found within the Bases2Fastq results." 
log.error(error_msg) @@ -188,7 +187,9 @@ def __init__(self): [], ) if len(merged_groups) > len(self.palette): - extra_colors = ["#{:06x}".format(random.randrange(0, 0xFFFFFF)) for _ in range(len(self.palette), len(merged_groups))] + extra_colors = [ + "#{:06x}".format(random.randrange(0, 0xFFFFFF)) for _ in range(len(self.palette), len(merged_groups)) + ] self.palette = self.palette + extra_colors self.group_color = {g: c for g, c in zip(merged_groups.keys(), self.palette[: len(merged_groups)])} self.sample_color = dict() diff --git a/multiqc/modules/bases2fastq/plot_samples.py b/multiqc/modules/bases2fastq/plot_samples.py index 919a996bb3..b629f0fc0e 100644 --- a/multiqc/modules/bases2fastq/plot_samples.py +++ b/multiqc/modules/bases2fastq/plot_samples.py @@ -126,7 +126,11 @@ def tabulate_sample_stats(sample_data, group_lookup_dict, project_lookup_dict, s "suffix": "%", } - pconfig = {"id": f"sample_qc_metric_table_{generate_random_string(5)}", "title": "Sample QC Metrics Table", "no_violin": False} + pconfig = { + "id": f"sample_qc_metric_table_{generate_random_string(5)}", + "title": "Sample QC Metrics Table", + "no_violin": False, + } plot_name = "Sample QC Metrics Table" plot_html = table.plot(plot_content, headers, pconfig=pconfig) From b1dbaf6ef76517c8a416977a5f110f818d0386b7 Mon Sep 17 00:00:00 2001 From: Carlos Ruiz Date: Fri, 3 Oct 2025 11:07:49 -0700 Subject: [PATCH 24/29] Derep plot ids --- multiqc/modules/bases2fastq/bases2fastq.py | 2 +- multiqc/modules/bases2fastq/plot_runs.py | 7 +++---- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/multiqc/modules/bases2fastq/bases2fastq.py b/multiqc/modules/bases2fastq/bases2fastq.py index 598082e295..4f13006f2e 100644 --- a/multiqc/modules/bases2fastq/bases2fastq.py +++ b/multiqc/modules/bases2fastq/bases2fastq.py @@ -429,7 +429,7 @@ def _parse_run_unassigned_sequences(self, data_source: str) -> Dict[str, Any]: "Lane": lane_id, "I1": sequence["I1"], "I2": sequence["I2"], - "Polonies": 
sequence["Count"], + "Number of Polonies": sequence["Count"], "% Polonies": float("nan"), } if total_polonies > 0: diff --git a/multiqc/modules/bases2fastq/plot_runs.py b/multiqc/modules/bases2fastq/plot_runs.py index ae06f9af12..a06061ec5f 100644 --- a/multiqc/modules/bases2fastq/plot_runs.py +++ b/multiqc/modules/bases2fastq/plot_runs.py @@ -179,7 +179,7 @@ def tabulate_project_stats(run_data, color_dict): pconfig = { "title": "bases2fastq: General Sequencing (Project) QC metrics", "col1_header": "Run Name", - "id": "project_run_metrics_table", + "id": f"project_run_metrics_table_{generate_random_string(5)}", "ylab": "QC", } @@ -283,7 +283,7 @@ def tabulate_run_stats(run_data, color_dict): pconfig = { "title": "Bases2Fastq: General Sequencing Run QC metrics", "col1_header": "Run Name", - "id": "run_metrics_table", + "id": f"run_metrics_table_{generate_random_string(5)}", "ylab": "QC", } @@ -435,7 +435,6 @@ def tabulate_unassigned_index_stats(run_data, color_dict): - Polonies - % Polonies """ - headers = {} headers["Run Name"] = { "title": "Run Name", @@ -453,7 +452,7 @@ def tabulate_unassigned_index_stats(run_data, color_dict): "title": "I2", "description": "Index 2.", } - headers["Polonies"] = { + headers["Number of Polonies"] = { "title": "Polonies", "description": "Number of polonies assigned to indices.", "scale": "RdYlGn-rev", From 7d6f18ef2043d083c3c528bd67bd150cc88435a7 Mon Sep 17 00:00:00 2001 From: Carlos Ruiz Date: Sun, 5 Oct 2025 18:19:14 -0700 Subject: [PATCH 25/29] Added Q50 metrics --- multiqc/modules/bases2fastq/bases2fastq.py | 13 +- multiqc/modules/bases2fastq/plot_runs.py | 134 +++++++++++++------- multiqc/modules/bases2fastq/plot_samples.py | 24 +++- 3 files changed, 117 insertions(+), 54 deletions(-) diff --git a/multiqc/modules/bases2fastq/bases2fastq.py b/multiqc/modules/bases2fastq/bases2fastq.py index 4f13006f2e..a642611718 100644 --- a/multiqc/modules/bases2fastq/bases2fastq.py +++ b/multiqc/modules/bases2fastq/bases2fastq.py @@ -20,7 
+20,6 @@ tabulate_project_stats, plot_base_quality_hist, plot_base_quality_by_cycle, - plot_lane_cycle_stats, ) from multiqc.modules.bases2fastq.plot_samples import ( tabulate_sample_stats, @@ -196,7 +195,7 @@ def __init__(self): for s_name in samples_to_projects.keys(): s_color = ( self.group_color[s_name] - if summary_path == "project_level" + if (summary_path == "project_level" or len(project_groups) == 1) else self.group_color[samples_to_projects[s_name]] ) self.sample_color.update({s_name: s_color}) @@ -231,7 +230,7 @@ def __init__(self): self.add_run_plots( data=run_data, - plot_functions=[plot_lane_cycle_stats, plot_run_stats, plot_base_quality_hist, plot_base_quality_by_cycle], + plot_functions=[plot_run_stats, plot_base_quality_hist, plot_base_quality_by_cycle], ) self.add_sample_plots(data=sample_data, group_lookup=samples_to_projects, project_lookup=samples_to_projects) @@ -516,15 +515,17 @@ def _parse_index_assignment(self, manifest_data_source: str) -> Dict[str, Any]: f"Missing data needed to extract index assignment for sample {sample_id}. Skipping." 
) continue - if sample_expected_seq not in sample_to_index_assignment: - sample_to_index_assignment[sample_expected_seq] = { + if run_analysis_name not in sample_to_index_assignment: + sample_to_index_assignment[run_analysis_name] = {} + if sample_expected_seq not in sample_to_index_assignment[run_analysis_name]: + sample_to_index_assignment[run_analysis_name][sample_expected_seq] = { "SampleID": sample_id, "SamplePolonyCounts": 0, "PercentOfPolonies": float("nan"), "Index1": "", "Index2": "", } - sample_to_index_assignment[sample_expected_seq]["SamplePolonyCounts"] += sample_counts + sample_to_index_assignment[run_analysis_name][sample_expected_seq]["SamplePolonyCounts"] += sample_counts for index_assigment in sample_to_index_assignment.values(): if total_polonies > 0: diff --git a/multiqc/modules/bases2fastq/plot_runs.py b/multiqc/modules/bases2fastq/plot_runs.py index a06061ec5f..ba51644b9f 100644 --- a/multiqc/modules/bases2fastq/plot_runs.py +++ b/multiqc/modules/bases2fastq/plot_runs.py @@ -117,6 +117,8 @@ def tabulate_project_stats(run_data, color_dict): Tabulate general information and statistics of each run """ plot_content = dict() + is_percent_q50_present = False + reads_present = [] for s_name in run_data.keys(): project = run_data[s_name]["Project"] run_project_name = f"{s_name} | {project}" @@ -126,7 +128,20 @@ def tabulate_project_stats(run_data, color_dict): run_stats.update({"mean_base_quality_run": run_data[s_name]["QualityScoreMean"]}) run_stats.update({"percent_q30_run": run_data[s_name]["PercentQ30"]}) run_stats.update({"percent_q40_run": run_data[s_name]["PercentQ40"]}) + percent_q50 = run_data[s_name].get("PercentQ50") + if percent_q50 is not None: + is_percent_q50_present = True + run_stats.update({"percent_q50_run": percent_q50}) run_stats.update({"reads_eliminated": _calculate_reads_eliminated(run_data[s_name])}) + if "Reads" in run_data[s_name]: + for read in run_data[s_name]["Reads"]: + if "Cycles" not in read or "Read" not in read: + 
continue + read_name = read["Read"] + num_cycles = len(read["Cycles"]) + reads_present.append(read_name) + run_stats.update({f"{read_name}_cycles": num_cycles}) + plot_content.update({run_project_name: run_stats}) headers = {} @@ -171,6 +186,22 @@ def tabulate_project_stats(run_data, color_dict): "scale": "RdYlGn", "suffix": "%", } + if is_percent_q50_present: + headers["percent_q50_run"] = { + "title": "Percent Q50", + "description": "The percentage of ≥ Q50 Q scores for the run. This includes assigned and unassigned reads and excludes filtered reads and no calls.", + "max": 100, + "min": 0, + "scale": "RdYlGn", + "suffix": "%", + } + for read in reads_present: + headers[f"{read}_cycles"] = { + "title": f"Cycles {read}", + "description": f"Number of cycles for read {read}.", + "scale": "RdPu", + } + headers["reads_eliminated"] = { "title": "Reads Eliminated", "description": "Number of reads eliminated.", @@ -213,6 +244,8 @@ def tabulate_run_stats(run_data, color_dict): Tabulate general information and statistics of each run """ plot_content = dict() + is_percent_q50_present = False + reads_present = [] for s_name in run_data.keys(): run_stats = dict() run_stats.update({"num_polonies_run": int(run_data[s_name]["NumPolonies"])}) @@ -222,7 +255,20 @@ def tabulate_run_stats(run_data, color_dict): run_stats.update({"mean_base_quality_run": run_data[s_name]["QualityScoreMean"]}) run_stats.update({"percent_q30_run": run_data[s_name]["PercentQ30"]}) run_stats.update({"percent_q40_run": run_data[s_name]["PercentQ40"]}) + percent_q50 = run_data[s_name].get("PercentQ50") + if percent_q50 is not None: + is_percent_q50_present = True + run_stats.update({"percent_q50_run": percent_q50}) run_stats.update({"reads_eliminated": _calculate_reads_eliminated(run_data[s_name])}) + if "Reads" in run_data[s_name]: + for read in run_data[s_name]["Reads"]: + if "Cycles" not in read or "Read" not in read: + continue + read_name = read["Read"] + num_cycles = len(read["Cycles"]) + 
reads_present.append(read_name) + run_stats.update({f"{read_name}_cycles": num_cycles}) + plot_content.update({s_name: run_stats}) headers = {} @@ -275,6 +321,21 @@ def tabulate_run_stats(run_data, color_dict): "scale": "RdYlGn", "suffix": "%", } + if is_percent_q50_present: + headers["percent_q50_run"] = { + "title": "Percent Q50", + "description": "The percentage of ≥ Q50 Q scores for the run. This includes assigned and unassigned reads and excludes filtered reads and no calls.", + "max": 100, + "min": 0, + "scale": "RdYlGn", + "suffix": "%", + } + for read in reads_present: + headers[f"{read}_cycles"] = { + "title": f"Cycles {read}", + "description": f"Number of cycles for read {read}.", + "scale": "RdPu", + } headers["reads_eliminated"] = { "title": "Reads Eliminated", "description": "Number of reads eliminated.", @@ -300,6 +361,7 @@ def tabulate_run_stats(run_data, color_dict): - Quality Score Mean: The mean Q score of base calls for the samples. This excludes filtered reads and no calls.\n - Percent Q30: The percentage of ≥ Q30 Q scores for the run. This includes assigned and unassigned reads and excludes filtered reads and no calls.\n - Percent Q40: The percentage of ≥ Q40 Q scores for the run. This includes assigned and unassigned reads and excludes filtered reads and no calls.\n + - Percent Q50: The percentage of ≥ Q50 Q scores for the run (when applicable). 
This includes assigned and unassigned reads and excludes filtered reads and no calls.\n - Reads Eliminated: Number of reads eliminated across lanes.\n """ return plot_html, plot_name, anchor, description, helptext, plot_content @@ -488,48 +550,6 @@ def tabulate_unassigned_index_stats(run_data, color_dict): return plot_html, plot_name, anchor, description, helptext, run_data -def plot_lane_cycle_stats(run_data, color_dict): - """ - Plot number of cycles per read and lane - """ - plot_content = dict() - for s_name in run_data.keys(): - if "Lanes" not in run_data[s_name]: - continue - for lane in run_data[s_name]["Lanes"]: - if "Lane" not in lane or "Reads" not in lane: - continue - lane_stats = dict() - lane_name = f"L{lane['Lane']}" - run_name = f"{s_name} | {lane_name}" - lane_stats[run_name] = {} - for read in lane["Reads"]: - if "Cycles" not in read or "Read" not in read: - continue - read_name = read["Read"] - num_cycles = len(read["Cycles"]) - lane_stats[run_name][read_name] = num_cycles - plot_content.update(lane_stats) - - pconfig = { - "title": "Bases2Fastq: Cycles Per Read Per Lane", - "id": f"project_cycles_per_read_per_lane_{generate_random_string(5)}", - "ylab": "Read Cycles", - "cpswitch": False, - "subtitle": None, - } - - plot_name = "Cycles Per Read Per Lane" - plot_html = bargraph.plot(plot_content, pconfig=pconfig) - anchor = "cycles_per_read_per_lane" - description = "Number of sequencing cycles per read in each lane." - helptext = """ - Shows the number of cycles used for each read in every flowcell lane. - Useful for confirming that read lengths match the expected sequencing setup across all lanes. 
- """ - return plot_html, plot_name, anchor, description, helptext, plot_content - - def plot_base_quality_hist(run_data, color_dict): # Prepare plot data for per base BQ histogram bq_hist_dict = dict() @@ -675,6 +695,31 @@ def plot_base_quality_by_cycle(run_data, color_dict): cycle_dict.update({cycle_no: cycle["PercentQ40"]}) Q40_dict.update({s_name: cycle_dict}) + # Prepare plot data for %Q50 of each cycle + Q50_dict = {} + percent_q50_values = set() + for s_name in run_data.keys(): + paired_end = True if len(run_data[s_name]["Reads"]) > 1 else False + cycle_dict = dict() + for cycle in run_data[s_name]["Reads"][0]["Cycles"]: + cycle_no = int(cycle["Cycle"]) + if "PercentQ50" not in cycle: + continue + cycle_perc_q50 = cycle["PercentQ50"] + cycle_dict.update({cycle_no: cycle_perc_q50}) + if cycle_perc_q50 is not None: + percent_q50_values.add(cycle_perc_q50) + if paired_end: + for cycle in run_data[s_name]["Reads"][1]["Cycles"]: + cycle_no = int(cycle["Cycle"]) + r1r2_split + if "PercentQ50" not in cycle: + continue + cycle_perc_q50 = cycle["PercentQ50"] + cycle_dict.update({cycle_no: cycle_perc_q50}) + if cycle_perc_q50 is not None: + percent_q50_values.add(cycle_perc_q50) + Q50_dict.update({s_name: cycle_dict}) + # Prepare plot data for % base calls below PF threshold below_pf_dict = {} for s_name in run_data.keys(): @@ -709,13 +754,16 @@ def plot_base_quality_by_cycle(run_data, color_dict): "title": "bases2fastq: Quality by cycles", "ylab": "QC", } + if len(percent_q50_values) > 0 and any(v is not None for v in percent_q50_values): + plot_content.insert(4, Q50_dict) + pconfig["data_labels"].insert(4, {"name": "%Q50", "xlab": "cycle", "ylab": "Percentage", "ymax": 100}) plot_html = linegraph.plot(plot_content, pconfig=pconfig) plot_name = "Quality Metrics By Cycle" anchor = "per_cycle_quality" description = "Per run base qualities by cycle. Read 1 and Read 2 are separated by a red dashed line." 
helptext = """ This section plots the base qualities by each instrument cycle.\n - Choose between Median Quality, Mean Quality, Percent Q30 or Percentage Q40 per cycle.\n + Choose between Median Quality, Mean Quality, Percent Q30, Percent Q40 or Percent Q50 (when applicable) per cycle.\n Read 1 and Read 2 are separated by a red dashed line. """ return plot_html, plot_name, anchor, description, helptext, plot_content diff --git a/multiqc/modules/bases2fastq/plot_samples.py b/multiqc/modules/bases2fastq/plot_samples.py index b629f0fc0e..ebaab9b166 100644 --- a/multiqc/modules/bases2fastq/plot_samples.py +++ b/multiqc/modules/bases2fastq/plot_samples.py @@ -38,6 +38,7 @@ def tabulate_sample_stats(sample_data, group_lookup_dict, project_lookup_dict, s """ plot_content = dict() reads_present = set() + is_percent_q50_present = False for s_name in sample_data.keys(): general_stats = dict() general_stats.update({"group": group_lookup_dict[s_name]}) @@ -47,6 +48,10 @@ def tabulate_sample_stats(sample_data, group_lookup_dict, project_lookup_dict, s general_stats.update({"mean_base_quality_sample": sample_data[s_name]["QualityScoreMean"]}) general_stats.update({"percent_q30_sample": sample_data[s_name]["PercentQ30"]}) general_stats.update({"percent_q40_sample": sample_data[s_name]["PercentQ40"]}) + percent_q50 = sample_data[s_name].get("PercentQ50") + if percent_q50 is not None: + is_percent_q50_present = True + general_stats.update({"percent_q50_run": percent_q50}) general_stats.update({"reads_eliminated": _calculate_sample_reads_eliminated(sample_data[s_name])}) general_stats.update({"percent_mismatch": sample_data[s_name]["PercentMismatch"]}) if "Reads" in sample_data[s_name]: @@ -74,24 +79,24 @@ def tabulate_sample_stats(sample_data, group_lookup_dict, project_lookup_dict, s } headers["num_polonies_sample"] = { "title": "# Polonies", - "description": "The total number of polonies that are calculated for the run", + "description": "The total number of polonies that are 
calculated for the run.", "min": 0, "scale": "Blues", } headers["yield_sample"] = { "title": "Yield (Gb)", - "description": "The sample yield based on assigned reads in gigabases", + "description": "The sample yield based on assigned reads in gigabases.", "scale": "Greens", } headers["mean_base_quality_sample"] = { "title": "Mean Base Quality", - "description": "Average base quality across R1/R2", + "description": "Average base quality across R1/R2.", "min": 0, "scale": "Spectral", } headers["percent_q30_sample"] = { "title": "Percent Q30", - "description": "The percentage of ≥ Q30 Q scores for the sample. This includes assigned reads and excludes filtered reads and no calls", + "description": "The percentage of ≥ Q30 Q scores for the sample. This includes assigned reads and excludes filtered reads and no calls.", "max": 100, "min": 0, "scale": "RdYlGn", @@ -99,12 +104,21 @@ def tabulate_sample_stats(sample_data, group_lookup_dict, project_lookup_dict, s } headers["percent_q40_sample"] = { "title": "Percent Q40", - "description": "The percentage of ≥ Q40 Q scores for the sample. This includes assigned reads and excludes filtered reads and no calls", + "description": "The percentage of ≥ Q40 Q scores for the sample. This includes assigned reads and excludes filtered reads and no calls.", "max": 100, "min": 0, "scale": "RdYlGn", "suffix": "%", } + if is_percent_q50_present: + headers["percent_q50_run"] = { + "title": "Percent Q50", + "description": "The percentage of ≥ Q50 Q scores for the sample. 
This includes assigned reads and excludes filtered reads and no calls.", + "max": 100, + "min": 0, + "scale": "RdYlGn", + "suffix": "%", + } for read in sorted(reads_present): headers[f"{read}_mean_len"] = { From 7a1a59febba1a6efb1a82dbea9084b0bc537d202 Mon Sep 17 00:00:00 2001 From: Carlos Ruiz Date: Sun, 5 Oct 2025 18:19:59 -0700 Subject: [PATCH 26/29] Linting --- multiqc/modules/bases2fastq/bases2fastq.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/multiqc/modules/bases2fastq/bases2fastq.py b/multiqc/modules/bases2fastq/bases2fastq.py index a642611718..179de71855 100644 --- a/multiqc/modules/bases2fastq/bases2fastq.py +++ b/multiqc/modules/bases2fastq/bases2fastq.py @@ -525,7 +525,9 @@ def _parse_index_assignment(self, manifest_data_source: str) -> Dict[str, Any]: "Index1": "", "Index2": "", } - sample_to_index_assignment[run_analysis_name][sample_expected_seq]["SamplePolonyCounts"] += sample_counts + sample_to_index_assignment[run_analysis_name][sample_expected_seq]["SamplePolonyCounts"] += ( + sample_counts + ) for index_assigment in sample_to_index_assignment.values(): if total_polonies > 0: From f97aa411e5277ae37fe21b44d0a28f604ccc9fe6 Mon Sep 17 00:00:00 2001 From: Carlos Ruiz Date: Thu, 9 Oct 2025 15:22:52 -0700 Subject: [PATCH 27/29] Added in project index assignment --- multiqc/modules/bases2fastq/bases2fastq.py | 158 +++++++++++++++++++-- multiqc/modules/bases2fastq/plot_runs.py | 41 ++++-- 2 files changed, 179 insertions(+), 20 deletions(-) diff --git a/multiqc/modules/bases2fastq/bases2fastq.py b/multiqc/modules/bases2fastq/bases2fastq.py index 179de71855..9e7d7b900a 100644 --- a/multiqc/modules/bases2fastq/bases2fastq.py +++ b/multiqc/modules/bases2fastq/bases2fastq.py @@ -147,6 +147,7 @@ def __init__(self): run_data = self.project_level_data sample_data = self.project_level_samples samples_to_projects = self.project_level_samples_to_project + index_assigment_data = 
self._parse_index_assignment_in_project("bases2fastq/project") elif summary_path == "combined_level": run_data = self.run_level_data sample_data = self.project_level_samples @@ -227,6 +228,13 @@ def __init__(self): tabulate_unassigned_index_stats, ], ) + else: + self.add_run_plots( + data=index_assigment_data, + plot_functions=[ + tabulate_index_assignment_stats, + ], + ) self.add_run_plots( data=run_data, @@ -529,11 +537,11 @@ def _parse_index_assignment(self, manifest_data_source: str) -> Dict[str, Any]: sample_counts ) - for index_assigment in sample_to_index_assignment.values(): - if total_polonies > 0: - index_assigment["PercentOfPolonies"] = round( - index_assigment["SamplePolonyCounts"] / total_polonies * 100, 2 - ) + for sample_data in sample_to_index_assignment[run_analysis_name].values(): + if total_polonies > 0: + sample_data["PercentOfPolonies"] = round( + sample_data["SamplePolonyCounts"] / total_polonies * 100, 2 + ) run_manifest = json.loads(f["f"]) if "Samples" not in run_manifest: @@ -554,18 +562,150 @@ def _parse_index_assignment(self, manifest_data_source: str) -> Dict[str, Any]: index_1 = index_data.get("Index1", "") index_2 = index_data.get("Index2", "") merged_indices = f"{index_1}{index_2}" - if merged_indices not in sample_to_index_assignment: + if merged_indices not in sample_to_index_assignment[run_analysis_name]: log.error(f"Index assignment information not found for sample {sample_id}. Skipping.") continue - if sample_id != sample_to_index_assignment[merged_indices]["SampleID"]: + if sample_id != sample_to_index_assignment[run_analysis_name][merged_indices]["SampleID"]: + log.error( + f"RunManifest SampleID <{sample_id}> does not match " + f"RunStats SampleID {sample_to_index_assignment[run_analysis_name][merged_indices]['SampleID']}." + " Skipping."
+ ) + continue + sample_to_index_assignment[run_analysis_name][merged_indices]["Index1"] = index_1 + sample_to_index_assignment[run_analysis_name][merged_indices]["Index2"] = index_2 + + return sample_to_index_assignment + + def _parse_index_assignment_in_project(self, data_source: str) -> Dict[str, Any]: + sample_to_index_assignment = {} + + if data_source == "": + return sample_to_index_assignment + + for f in self.find_log_files(data_source): + directory = f.get("root") + if not directory: + continue + + # Get RunName and RunID from RunParameters.json + run_manifest = Path(directory) / "../../RunManifest.json" + if not run_manifest.exists(): + log.error( + f"RunManifest.json could not be found in {run_manifest}. Skipping index assignment.\n" + "Please visit Elembio online documentation for more information - " + "https://docs.elembio.io/docs/bases2fastq/introduction/" + ) + continue + + project_stats = json.loads(f["f"]) + run_analysis_name = None + run_name = project_stats.get("RunName", None) + analysis_id = project_stats.get("AnalysisID", None) + project = self.clean_s_name(project_stats.get("Project", "DefaultProject"), f) + + if run_name and analysis_id: + run_analysis_name = "-".join([run_name, analysis_id[0:4]]) + else: + log.error( + "Error with project's RunStats.json. Either RunName or AnalysisID is absent.\n" + "Please visit Elembio online documentation for more information - " + "https://docs.elembio.io/docs/bases2fastq/introduction/" + ) + log.debug(f"Error in RunStats.json: {f['fn']}") + log.debug(f"Missing: RunName: {run_name} or AnalysisID: {analysis_id}") + continue + + # skip run if in user provider ignore list + if self.is_ignore_sample(run_analysis_name): + log.info(f"Skipping <{run_analysis_name}> because it is present in ignore list.") + continue + + # Ensure sample stats are present + if "SampleStats" not in project_stats: + log.error( + "Error, missing SampleStats in RunStats.json. 
Skipping index assignment metrics.\n" + "Please visit Elembio online documentation for more information - " + "https://docs.elembio.io/docs/bases2fastq/introduction/" + ) + log.debug(f"Missing SampleStats in RunStats.json. Available keys: {list(project_stats.keys())}.") + continue + + # Extract per sample polony counts and overall total counts + total_polonies = project_stats.get("NumPoloniesBeforeTrimming", 0) + for sample_data in project_stats["SampleStats"]: + sample_name = sample_data.get("SampleName") + sample_id = None + + if run_analysis_name and sample_name: + sample_id = "__".join([run_analysis_name, sample_name]) + + if "Occurrences" not in sample_data: + log.error(f"Missing data needed to extract index assignment for sample {sample_id}. Skipping.") + continue + + for occurrence in sample_data["Occurrences"]: + sample_expected_seq = occurrence.get("ExpectedSequence") + sample_counts = occurrence.get("NumPoloniesBeforeTrimming") + if any([element is None for element in [sample_expected_seq, sample_counts, sample_id]]): + log.error( + f"Missing data needed to extract index assignment for sample {sample_id}. Skipping." 
+ ) + continue + if run_analysis_name not in sample_to_index_assignment: + sample_to_index_assignment[run_analysis_name] = {} + if sample_expected_seq not in sample_to_index_assignment[run_analysis_name]: + sample_to_index_assignment[run_analysis_name][sample_expected_seq] = { + "SampleID": sample_id, + "Project": project, + "SamplePolonyCounts": 0, + "PercentOfPolonies": float("nan"), + "Index1": "", + "Index2": "", + } + sample_to_index_assignment[run_analysis_name][sample_expected_seq]["SamplePolonyCounts"] += ( + sample_counts + ) + + for sample_data in sample_to_index_assignment[run_analysis_name].values(): + if total_polonies > 0: + sample_data["PercentOfPolonies"] = round( + sample_data["SamplePolonyCounts"] / total_polonies * 100, 2 + ) + + run_manifest_data = None + with open(run_manifest) as _infile: + run_manifest_data = json.load(_infile) + + if "Samples" not in run_manifest_data: + log.warning( + f" section not found in {directory}/RunManifest.json.\n" + f"Skipping RunManifest sample index assignment metrics." + ) + elif len(sample_to_index_assignment) == 0: + log.warning("Index assignment data missing. Skipping creation of index assignment metrics.") + else: + for sample_data in run_manifest_data["Samples"]: + sample_name = sample_data.get("SampleName") + sample_id = None + if run_analysis_name is None or sample_name is None or "Indexes" not in sample_data: + continue + sample_id = "__".join([run_analysis_name, sample_name]) + for index_data in sample_data["Indexes"]: + index_1 = index_data.get("Index1", "") + index_2 = index_data.get("Index2", "") + merged_indices = f"{index_1}{index_2}" + if merged_indices not in sample_to_index_assignment[run_analysis_name]: + continue + if sample_id != sample_to_index_assignment[run_analysis_name][merged_indices]["SampleID"]: log.error( f"RunManifest SampleID <{sample_id}> does not match " f"RunStats SampleID {sample_to_index_assignment[run_analysis_name][merged_indices]['SampleID']}." " Skipping."
) continue - sample_to_index_assignment[merged_indices]["Index1"] = index_1 - sample_to_index_assignment[merged_indices]["Index2"] = index_2 + sample_to_index_assignment[run_analysis_name][merged_indices]["Index1"] = index_1 + sample_to_index_assignment[run_analysis_name][merged_indices]["Index2"] = index_2 return sample_to_index_assignment diff --git a/multiqc/modules/bases2fastq/plot_runs.py b/multiqc/modules/bases2fastq/plot_runs.py index ba51644b9f..cd63978dba 100644 --- a/multiqc/modules/bases2fastq/plot_runs.py +++ b/multiqc/modules/bases2fastq/plot_runs.py @@ -426,21 +426,40 @@ def tabulate_index_assignment_stats(run_data, color_dict): Tabulate general information and statistics of each run """ plot_content = dict() - sorted_run_data = natsorted(run_data.items(), key=lambda x: x[1]["SampleID"]) - for index, sample_data in enumerate(sorted_run_data, start=1): - sample_data = sample_data[1] - sample_index_stats = dict() - sample_index_stats.update({"sample_name": sample_data["SampleID"]}) - sample_index_stats.update({"index_1": sample_data["Index1"]}) - sample_index_stats.update({"index_2": sample_data["Index2"]}) - sample_index_stats.update({"assigned_polonies": sample_data["SamplePolonyCounts"]}) - sample_index_stats.update({"polony_percentage": sample_data["PercentOfPolonies"]}) - plot_content.update({index: sample_index_stats}) + run_names = sorted(run_data.keys()) + index = 1 + project_present = False + for run in run_names: + run_sample_data = run_data[run] + sorted_run_sample_data = natsorted(run_sample_data.items(), key=lambda x: x[1]["SampleID"]) + for sample_data in sorted_run_sample_data: + sample_data = sample_data[1] + sample_index_stats = dict() + sample_index_stats.update({"run_name": run}) + if "Project" in sample_data: + sample_index_stats.update({"project": sample_data["Project"]}) + project_present = True + sample_index_stats.update({"sample_name": sample_data["SampleID"].split("__")[1]}) + sample_index_stats.update({"index_1": 
sample_data["Index1"]}) + sample_index_stats.update({"index_2": sample_data["Index2"]}) + sample_index_stats.update({"assigned_polonies": sample_data["SamplePolonyCounts"]}) + sample_index_stats.update({"polony_percentage": sample_data["PercentOfPolonies"]}) + plot_content.update({index: sample_index_stats}) + index += 1 headers = {} + headers["run_name"] = { + "title": "Run Name", + "description": "Run Name.", + } + if project_present: + headers["project"] = { + "title": "Project", + "description": "Run Project.", + } headers["sample_name"] = { "title": "Sample Name", - "description": "Sample Name (RunID + Sample ID).", + "description": "Sample Name.", } headers["index_1"] = { "title": "Index 1", From 8709cff9ea4ea9b372e1fd0cdee6254180a79477 Mon Sep 17 00:00:00 2001 From: Carlos Ruiz Date: Thu, 9 Oct 2025 15:40:48 -0700 Subject: [PATCH 28/29] Linting --- multiqc/modules/bases2fastq/bases2fastq.py | 8 +++----- multiqc/modules/bases2fastq/plot_runs.py | 6 +++--- 2 files changed, 6 insertions(+), 8 deletions(-) diff --git a/multiqc/modules/bases2fastq/bases2fastq.py b/multiqc/modules/bases2fastq/bases2fastq.py index 9e7d7b900a..c84c77da2c 100644 --- a/multiqc/modules/bases2fastq/bases2fastq.py +++ b/multiqc/modules/bases2fastq/bases2fastq.py @@ -576,7 +576,7 @@ def _parse_index_assignment(self, manifest_data_source: str) -> Dict[str, Any]: sample_to_index_assignment[run_analysis_name][merged_indices]["Index2"] = index_2 return sample_to_index_assignment - + def _parse_index_assignment_in_project(self, data_source: str) -> Dict[str, Any]: sample_to_index_assignment = {} @@ -648,9 +648,7 @@ def _parse_index_assignment_in_project(self, data_source: str) -> Dict[str, Any] sample_expected_seq = occurrence.get("ExpectedSequence") sample_counts = occurrence.get("NumPoloniesBeforeTrimming") if any([element is None for element in [sample_expected_seq, sample_counts, sample_id]]): - log.error( - f"Missing data needed to extract index assignment for sample {sample_id}. 
Skipping." - ) + log.error(f"Missing data needed to extract index assignment for sample {sample_id}. Skipping.") continue if run_analysis_name not in sample_to_index_assignment: sample_to_index_assignment[run_analysis_name] = {} @@ -672,7 +670,7 @@ def _parse_index_assignment_in_project(self, data_source: str) -> Dict[str, Any] sample_data["PercentOfPolonies"] = round( sample_data["SamplePolonyCounts"] / total_polonies * 100, 2 ) - + run_manifest_data = None with open(run_manifest) as _infile: run_manifest_data = json.load(_infile) diff --git a/multiqc/modules/bases2fastq/plot_runs.py b/multiqc/modules/bases2fastq/plot_runs.py index cd63978dba..87151b3baa 100644 --- a/multiqc/modules/bases2fastq/plot_runs.py +++ b/multiqc/modules/bases2fastq/plot_runs.py @@ -454,9 +454,9 @@ def tabulate_index_assignment_stats(run_data, color_dict): } if project_present: headers["project"] = { - "title": "Project", - "description": "Run Project.", - } + "title": "Project", + "description": "Run Project.", + } headers["sample_name"] = { "title": "Sample Name", "description": "Sample Name.", From 77e72ab8eec3ff4384692a4e5049930a26dd2938 Mon Sep 17 00:00:00 2001 From: Carlos Ruiz Date: Thu, 9 Oct 2025 20:40:30 -0700 Subject: [PATCH 29/29] Added run manifest for project --- multiqc/modules/bases2fastq/bases2fastq.py | 104 +++++++++++++++++++-- 1 file changed, 97 insertions(+), 7 deletions(-) diff --git a/multiqc/modules/bases2fastq/bases2fastq.py b/multiqc/modules/bases2fastq/bases2fastq.py index c84c77da2c..df4e29af77 100644 --- a/multiqc/modules/bases2fastq/bases2fastq.py +++ b/multiqc/modules/bases2fastq/bases2fastq.py @@ -9,6 +9,7 @@ from pathlib import Path from multiqc.base_module import BaseMultiqcModule, ModuleNoSamplesFound +from multiqc.types import LoadedFileDict from multiqc.utils import mqc_colour from multiqc.modules.bases2fastq.plot_runs import ( @@ -147,6 +148,7 @@ def __init__(self): run_data = self.project_level_data sample_data = self.project_level_samples 
samples_to_projects = self.project_level_samples_to_project + manifest_data = self._parse_run_manifest_in_project("bases2fastq/project") index_assigment_data = self._parse_index_assignment_in_project("bases2fastq/project") elif summary_path == "combined_level": run_data = self.run_level_data @@ -208,14 +210,13 @@ def __init__(self): tabulate_run_stats if summary_path in ["run_level", "combined_level"] else tabulate_project_stats ) self.add_run_plots(data=run_data, plot_functions=[qc_metrics_function]) - + self.add_run_plots( + data=manifest_data, + plot_functions=[ + tabulate_manifest_stats, + ], + ) if summary_path in ["run_level", "combined_level"]: - self.add_run_plots( - data=manifest_data, - plot_functions=[ - tabulate_manifest_stats, - ], - ) self.add_run_plots( data=index_assigment_data, plot_functions=[ @@ -391,6 +392,95 @@ def _parse_run_manifest(self, data_source: str) -> Dict[str, Any]: return runs_manifest_data + def _parse_run_manifest_in_project(self, data_source: str) -> Dict[str, Any]: + project_manifest_data = {} + + if data_source == "": + return project_manifest_data + + for f in self.find_log_files(data_source): + directory = f.get("root") + if not directory: + continue + + # Get RunName and RunID from RunParameters.json + run_manifest = Path(directory) / "../../RunManifest.json" + if not run_manifest.exists(): + log.error( + f"RunManifest.json could not be found in {run_manifest}. Skipping index assignment.\n" + "Please visit Elembio online documentation for more information - " + "https://docs.elembio.io/docs/bases2fastq/introduction/" + ) + continue + + project_stats = json.loads(f["f"]) + run_analysis_name = None + run_name = project_stats.get("RunName", None) + analysis_id = project_stats.get("AnalysisID", None) + + if run_name and analysis_id: + run_analysis_name = "-".join([run_name, analysis_id[0:4]]) + else: + log.error( + "Error with project's RunStats.json. 
Either RunName or AnalysisID is absent.\n"
+                    "Please visit Elembio online documentation for more information - "
+                    "https://docs.elembio.io/docs/bases2fastq/introduction/"
+                )
+                log.debug(f"Error in RunStats.json: {f['fn']}")
+                log.debug(f"Missing: RunName: {run_name} or AnalysisID: {analysis_id}")
+                continue
+
+            # skip run if in user-provided ignore list
+            if self.is_ignore_sample(run_analysis_name):
+                log.info(f"Skipping <{run_analysis_name}> because it is present in ignore list.")
+                continue
+
+            run_manifest_data = None
+            with open(run_manifest) as _infile:
+                run_manifest_data = json.load(_infile)
+
+            if "Settings" not in run_manifest_data:
+                log.warning(f"<Settings> section not found in {run_manifest}.\nSkipping RunManifest metrics.")
+            else:
+                for lane_data in run_manifest_data["Settings"]:
+                    lane_id = lane_data.get("Lane")
+                    if not lane_id:
+                        log.error("<Lane> not found in Settings section of RunManifest. Skipping lanes.")
+                        continue
+                    lane_name = f"L{lane_id}"
+                    run_lane = f"{run_analysis_name} | {lane_name}"
+                    project_manifest_data[run_lane] = {}
+
+                    indices = []
+                    indices_cycles = []
+                    mask_pattern = re.compile(r"^I\d+Mask$")
+                    matching_keys = [key for key in lane_data.keys() if mask_pattern.match(key)]
+                    for key in matching_keys:
+                        for mask_info in lane_data[key]:
+                            if mask_info["Read"] not in indices:
+                                indices.append(mask_info["Read"])
+                            indices_cycles.append(str(len(mask_info["Cycles"])))
+                    indexing = f"{' + '.join(indices_cycles)}
{' + '.join(indices)}" + project_manifest_data[run_lane]["Indexing"] = indexing + + project_manifest_data[run_lane]["AdapterTrimType"] = lane_data.get("AdapterTrimType", "N/A") + project_manifest_data[run_lane]["R1AdapterMinimumTrimmedLength"] = lane_data.get( + "R1AdapterMinimumTrimmedLength", "N/A" + ) + project_manifest_data[run_lane]["R2AdapterMinimumTrimmedLength"] = lane_data.get( + "R2AdapterMinimumTrimmedLength", "N/A" + ) + data_source_info: LoadedFileDict[Any] = { + "fn": str(run_manifest.name), + "root": str(run_manifest.parent), + "sp_key": data_source, + "s_name": str(run_manifest.with_suffix("").name), + "f": run_manifest_data, + } + self.add_data_source(f=data_source_info, s_name=run_analysis_name, module="bases2fastq") + + return project_manifest_data + def _parse_run_unassigned_sequences(self, data_source: str) -> Dict[str, Any]: run_unassigned_sequences = {} if data_source == "":