diff --git a/docs/markdown/modules/bases2fastq.md b/docs/markdown/modules/bases2fastq.md index cb84eb1bcd..a929fb504a 100644 --- a/docs/markdown/modules/bases2fastq.md +++ b/docs/markdown/modules/bases2fastq.md @@ -32,4 +32,8 @@ bases2fastq/run: contents: SampleStats fn: RunStats.json num_lines: 100 +bases2fastq/manifest: + contents: Settings + fn: RunManifest.json + num_lines: 100 ``` diff --git a/multiqc/modules/bases2fastq/bases2fastq.py b/multiqc/modules/bases2fastq/bases2fastq.py index 132387a226..df4e29af77 100644 --- a/multiqc/modules/bases2fastq/bases2fastq.py +++ b/multiqc/modules/bases2fastq/bases2fastq.py @@ -1,20 +1,27 @@ +from collections import defaultdict import copy -import csv +import re import json import logging import random +from typing import Any, Dict, List import uuid +from pathlib import Path from multiqc.base_module import BaseMultiqcModule, ModuleNoSamplesFound +from multiqc.types import LoadedFileDict from multiqc.utils import mqc_colour from multiqc.modules.bases2fastq.plot_runs import ( plot_run_stats, + tabulate_manifest_stats, + tabulate_index_assignment_stats, + tabulate_unassigned_index_stats, tabulate_run_stats, + tabulate_project_stats, plot_base_quality_hist, plot_base_quality_by_cycle, ) -from multiqc.modules.bases2fastq.plot_project_runs import tabulate_project_run_stats from multiqc.modules.bases2fastq.plot_samples import ( tabulate_sample_stats, sequence_content_plot, @@ -39,250 +46,764 @@ def __init__(self): doi="10.1038/s41587-023-01750-7", ) + # Initialize run, project and sample level structures + self.run_level_data = {} + self.run_level_samples = {} + self.run_level_samples_to_project = {} + self.project_level_data = {} + self.project_level_samples = {} + self.project_level_samples_to_project = {} + num_run_level_samples = 0 + num_project_level_samples = 0 + + # Initialize run and project groups + self.group_dict = dict() + self.group_lookup_dict = dict() + self.project_lookup_dict = dict() + self.b2f_sample_data = 
dict() self.b2f_run_data = dict() self.b2f_run_project_data = dict() + self.b2f_run_project_sample_data = dict() self.missing_runs = set() self.sample_id_to_run = dict() - # Group by run name - self.group_dict = dict() - self.group_lookup_dict = dict() - self.project_lookup_dict = dict() + # Define if call is project- or run-level + run_level_log_files = len(list(self.find_log_files("bases2fastq/run"))) + project_level_log_files = len(list(self.find_log_files("bases2fastq/project"))) + + if run_level_log_files == 0 and project_level_log_files == 0: + error_msg = "No run- or project-level log files found within the Bases2Fastq results." + log.error(error_msg) + raise ModuleNoSamplesFound(error_msg) + + # Parse data + if run_level_log_files > 0: + (self.run_level_data, self.run_level_samples, self.run_level_samples_to_project) = ( + self._parse_run_project_data("bases2fastq/run") + ) + if project_level_log_files > 0: + (self.project_level_data, self.project_level_samples, self.project_level_samples_to_project) = ( + self._parse_run_project_data("bases2fastq/project") + ) + + # Get run- and project-level samples + num_run_level_samples = len(self.run_level_samples) + num_project_level_samples = len(self.project_level_samples) + + # Ensure run/sample data found + if all( + [ + len(self.run_level_data) == 0, + num_run_level_samples == 0, + len(self.project_level_data) == 0, + num_project_level_samples == 0, + ] + ): + error_msg = "No run-, project- or sample-level data found" + log.error(error_msg) + raise ModuleNoSamplesFound(error_msg) + + # Choose path to take, if project use only project-level data, otherwise use run- and project-level + summary_path = "" + if len(self.run_level_data) > 0 and len(self.project_level_data) == 0: + summary_path = "run_level" + if len(self.run_level_data) == 0 and len(self.project_level_data) > 0: + summary_path = "project_level" + elif len(self.run_level_data) > 0 and len(self.project_level_data) > 0: + summary_path = "combined_level" 
+ + # Log runs, projects and samples found + log.info(f"Found {len(self.run_level_data)} run(s) within the Bases2Fastq results.") + log.info(f"Found {len(self.project_level_data)} project(s) within the Bases2Fastq results.") + if summary_path == "run_level": + log.info(f"Found {num_run_level_samples} sample(s) within the Bases2Fastq results.") + else: + log.info(f"Found {num_project_level_samples} sample(s) within the Bases2Fastq results.") + + # Superfluous function call to confirm that it is used in this module + self.add_software_version(None) + + # Warn user if run-level/project-level or sample-level metrics were not found + if len(self.run_level_data) == 0 and len(self.project_level_data) == 0: + log.warning("No run/project stats found!") + if num_run_level_samples == 0 and num_project_level_samples == 0: + log.warning("No sample stats found!") + + # Define data to use + run_data = {} + sample_data = {} + samples_to_projects = {} + manifest_data = {} + index_assigment_data = {} + unassigned_sequences = {} + if summary_path == "run_level": + run_data = self.run_level_data + sample_data = self.run_level_samples + samples_to_projects = self.run_level_samples_to_project + manifest_data = self._parse_run_manifest("bases2fastq/manifest") + index_assigment_data = self._parse_index_assignment("bases2fastq/manifest") + unassigned_sequences = self._parse_run_unassigned_sequences("bases2fastq/run") + elif summary_path == "project_level": + run_data = self.project_level_data + sample_data = self.project_level_samples + samples_to_projects = self.project_level_samples_to_project + manifest_data = self._parse_run_manifest_in_project("bases2fastq/project") + index_assigment_data = self._parse_index_assignment_in_project("bases2fastq/project") + elif summary_path == "combined_level": + run_data = self.run_level_data + sample_data = self.project_level_samples + samples_to_projects = self.project_level_samples_to_project + manifest_data = 
self._parse_run_manifest("bases2fastq/manifest") + index_assigment_data = self._parse_index_assignment("bases2fastq/manifest") + unassigned_sequences = self._parse_run_unassigned_sequences("bases2fastq/run") + else: + error_msg = "No run- or project-level data was retained. No report will be generated." + log.error(error_msg) + raise ModuleNoSamplesFound(error_msg) + + # Create run and project groups + run_groups = defaultdict(list) + project_groups = defaultdict(list) + in_project_sample_groups = defaultdict(list) + ind_sample_groups = defaultdict(list) + sample_to_run_group = {} + for sample in sample_data.keys(): + (_run_name, _) = sample.split("__") + run_groups[_run_name].append(sample) + sample_to_run_group[sample] = _run_name + sample_project = samples_to_projects[sample] + project_groups[sample_project].append(sample) + ind_sample_groups[sample] = [sample] + if summary_path == "project_level": + in_project_sample_groups[sample].append(sample) + merged_groups = {**run_groups, **project_groups, **in_project_sample_groups, **ind_sample_groups} + + # Assign color for each group + self.color_getter = mqc_colour.mqc_colour_scale() + self.palette = sum( + [ + self.color_getter.get_colours(hue) + for hue in ["Set2", "Pastel1", "Accent", "Set1", "Set3", "Dark2", "Paired", "Pastel2"] + ], + [], + ) + if len(merged_groups) > len(self.palette): + extra_colors = [ + "#{:06x}".format(random.randrange(0, 0xFFFFFF)) for _ in range(len(self.palette), len(merged_groups)) + ] + self.palette = self.palette + extra_colors + self.group_color = {g: c for g, c in zip(merged_groups.keys(), self.palette[: len(merged_groups)])} + self.sample_color = dict() + for s_name in samples_to_projects.keys(): + s_color = ( + self.group_color[s_name] + if (summary_path == "project_level" or len(project_groups) == 1) + else self.group_color[samples_to_projects[s_name]] + ) + self.sample_color.update({s_name: s_color}) + self.run_color = copy.deepcopy(self.group_color) # Make sure that run colors 
and group colors match + self.palette = self.palette[len(merged_groups) :] + + # Plot metrics + qc_metrics_function = ( + tabulate_run_stats if summary_path in ["run_level", "combined_level"] else tabulate_project_stats + ) + self.add_run_plots(data=run_data, plot_functions=[qc_metrics_function]) + self.add_run_plots( + data=manifest_data, + plot_functions=[ + tabulate_manifest_stats, + ], + ) + if summary_path in ["run_level", "combined_level"]: + self.add_run_plots( + data=index_assigment_data, + plot_functions=[ + tabulate_index_assignment_stats, + ], + ) + self.add_run_plots( + data=unassigned_sequences, + plot_functions=[ + tabulate_unassigned_index_stats, + ], + ) + else: + self.add_run_plots( + data=index_assigment_data, + plot_functions=[ + tabulate_index_assignment_stats, + ], + ) + + self.add_run_plots( + data=run_data, + plot_functions=[plot_run_stats, plot_base_quality_hist, plot_base_quality_by_cycle], + ) + + self.add_sample_plots(data=sample_data, group_lookup=samples_to_projects, project_lookup=samples_to_projects) - # bases2fastq/run - num_runs = 0 - num_samples = 0 - for f in self.find_log_files("bases2fastq/run"): + def get_uuid(self): + return str(uuid.uuid4()).replace("-", "").lower() + + def _parse_run_project_data(self, data_source: str) -> List[Dict[str, Any]]: + runs_global_data = {} + runs_sample_data = {} + sample_to_project = {} + if data_source == "": + return [runs_global_data, runs_sample_data, sample_to_project] + + for f in self.find_log_files(data_source): data = json.loads(f["f"]) + # Copy incomind data and reset samples to include only desired + data_to_return = copy.deepcopy(data) + data_to_return["SampleStats"] = [] + # get run + analysis run_name = data.get("RunName", None) analysis_id = data.get("AnalysisID", None)[0:4] if not run_name or not analysis_id: - log.error("Error with RunStats.json. 
Either RunName or AnalysisID is absent.") log.error( - "Please visit Elembio online documentation for more information - https://docs.elembio.io/docs/bases2fastq/introduction/" + "Error with RunStats.json. Either RunName or AnalysisID is absent.\n" + "Please visit Elembio online documentation for more information - " + "https://docs.elembio.io/docs/bases2fastq/introduction/" ) continue run_analysis_name = "-".join([run_name, analysis_id]) run_analysis_name = self.clean_s_name(run_analysis_name, f) + # skip run if in user provider ignore list + if self.is_ignore_sample(run_analysis_name): + log.info(f"Skipping <{run_analysis_name}> because it is present in ignore list.") + continue + + # Check run is present in the final dictionaries + if run_analysis_name not in runs_global_data: + runs_global_data[run_analysis_name] = data_to_return + + project = self.clean_s_name(data.get("Project", "DefaultProject"), f) + # map sample UUIDs to run_analysis_name for sample_data in data["SampleStats"]: sample_id = sample_data["SampleID"] sample_name = sample_data["SampleName"] sample_data["RunName"] = run_name - run_analysis_sample_name = "__".join([run_analysis_name, sample_name]) num_polonies = sample_data["NumPolonies"] if num_polonies < MIN_POLONIES: log.warning( - f"Skipping {run_analysis_sample_name} because it has <{MIN_POLONIES} assigned reads [n={num_polonies}]." + f"Skipping {run_analysis_sample_name} because it has" + f" <{MIN_POLONIES} assigned reads [n={num_polonies}]." ) continue # skip run if in user provider ignore list - if self.is_ignore_sample(sample_id): + if self.is_ignore_sample(sample_id) or self.is_ignore_sample(run_analysis_sample_name): + log.info( + f"Skipping <{sample_id}> ({run_analysis_sample_name}) because it is present in ignore list." 
+ ) continue - if self.is_ignore_sample(run_analysis_sample_name): + + # If sample passes all checks add it back + runs_sample_data[run_analysis_sample_name] = sample_data + sample_to_project[run_analysis_sample_name] = project + + self.add_data_source(f=f, s_name=run_analysis_name, module="bases2fastq") + + return [runs_global_data, runs_sample_data, sample_to_project] + + def _parse_run_manifest(self, data_source: str) -> Dict[str, Any]: + runs_manifest_data = {} + + if data_source == "": + return runs_manifest_data + + for f in self.find_log_files(data_source): + directory = f.get("root") + if not directory: + continue + + # Get RunName and RunID from RunStats.json + run_stats_path = Path(directory) / "RunStats.json" + if not run_stats_path.exists(): + log.error( + f"RunStats.json does not exist in the Bases2Fastq output directory {directory}.\n" + "Please visit Elembio online documentation for more information - " + "https://docs.elembio.io/docs/bases2fastq/introduction/" + ) + continue + + run_analysis_name = None + with open(run_stats_path) as _infile: + run_stats = json.load(_infile) + run_name = run_stats.get("RunName", None) + analysis_id = run_stats.get("AnalysisID", None) + if run_name and analysis_id: + run_analysis_name = "-".join([run_name, analysis_id[0:4]]) + else: + log.error( + "Error with RunStats.json. Either RunName or AnalysisID is absent.\n" + "Please visit Elembio online documentation for more information - " + "https://docs.elembio.io/docs/bases2fastq/introduction/" + ) continue - self.sample_id_to_run[sample_id] = run_analysis_name - self.b2f_sample_data[run_analysis_sample_name] = sample_data - num_samples += 1 + run_manifest = json.loads(f["f"]) + if "Settings" not in run_manifest: + log.warning( + f" section not found in {directory}/RunManifest.json.\nSkipping RunManifest metrics." 
+ ) + else: + for lane_data in run_manifest["Settings"]: + lane_id = lane_data.get("Lane") + if not lane_id: + log.error(" not found in Settings section of RunManifest. Skipping lanes.") + continue + lane_name = f"L{lane_id}" + run_lane = f"{run_analysis_name} | {lane_name}" + runs_manifest_data[run_lane] = {} + + indices = [] + indices_cycles = [] + mask_pattern = re.compile(r"^I\d+Mask$") + matching_keys = [key for key in lane_data.keys() if mask_pattern.match(key)] + for key in matching_keys: + for mask_info in lane_data[key]: + if mask_info["Read"] not in indices: + indices.append(mask_info["Read"]) + indices_cycles.append(str(len(mask_info["Cycles"]))) + indexing = f"{' + '.join(indices_cycles)}
{' + '.join(indices)}" + runs_manifest_data[run_lane]["Indexing"] = indexing + + runs_manifest_data[run_lane]["AdapterTrimType"] = lane_data.get("AdapterTrimType", "N/A") + runs_manifest_data[run_lane]["R1AdapterMinimumTrimmedLength"] = lane_data.get( + "R1AdapterMinimumTrimmedLength", "N/A" + ) + runs_manifest_data[run_lane]["R2AdapterMinimumTrimmedLength"] = lane_data.get( + "R2AdapterMinimumTrimmedLength", "N/A" + ) + + self.add_data_source(f=f, s_name=run_analysis_name, module="bases2fastq") + + return runs_manifest_data + + def _parse_run_manifest_in_project(self, data_source: str) -> Dict[str, Any]: + project_manifest_data = {} + + if data_source == "": + return project_manifest_data + + for f in self.find_log_files(data_source): + directory = f.get("root") + if not directory: + continue + + # Get RunName and RunID from RunParameters.json + run_manifest = Path(directory) / "../../RunManifest.json" + if not run_manifest.exists(): + log.error( + f"RunManifest.json could not be found in {run_manifest}. Skipping index assignment.\n" + "Please visit Elembio online documentation for more information - " + "https://docs.elembio.io/docs/bases2fastq/introduction/" + ) + continue + + project_stats = json.loads(f["f"]) + run_analysis_name = None + run_name = project_stats.get("RunName", None) + analysis_id = project_stats.get("AnalysisID", None) + + if run_name and analysis_id: + run_analysis_name = "-".join([run_name, analysis_id[0:4]]) + else: + log.error( + "Error with project's RunStats.json. 
Either RunName or AnalysisID is absent.\n" + "Please visit Elembio online documentation for more information - " + "https://docs.elembio.io/docs/bases2fastq/introduction/" + ) + log.debug(f"Error in RunStats.json: {f['fn']}") + log.debug(f"Missing: RunName: {run_name} or AnalysisID: {analysis_id}") + continue # skip run if in user provider ignore list if self.is_ignore_sample(run_analysis_name): + log.info(f"Skipping <{run_analysis_name}> because it is present in ignore list.") continue - num_runs += 1 - self.b2f_run_data[run_analysis_name] = data - self.add_data_source(f=f, s_name=run_analysis_name, module="bases2fastq") - - # Checking if run lengths configurations are the same for all samples. - self.run_r1r2_lens = [] - for s in self.b2f_run_data.keys(): - read_lens = str(len(self.b2f_run_data[s]["Reads"][0]["Cycles"])) - if len(self.b2f_run_data[s]["Reads"]) > 1: - read_lens += "+" + str(len(self.b2f_run_data[s]["Reads"][1]["Cycles"])) - self.run_r1r2_lens.append(read_lens) - - run_r1r2_lens_dict = {} - for nn, rl in enumerate(self.run_r1r2_lens): - if not run_r1r2_lens_dict.get(rl): - run_r1r2_lens_dict[rl] = [] - run_r1r2_lens_dict[rl].append(list(self.b2f_run_data.keys())[nn]) - - # - # bases2fastq/project - # - num_projects = 0 - for f in self.find_log_files("bases2fastq/project"): + run_manifest_data = None + with open(run_manifest) as _infile: + run_manifest_data = json.load(_infile) + + if "Settings" not in run_manifest_data: + log.warning(f" section not found in {run_manifest}.\nSkipping RunManifest metrics.") + else: + for lane_data in run_manifest_data["Settings"]: + lane_id = lane_data.get("Lane") + if not lane_id: + log.error(" not found in Settings section of RunManifest. 
Skipping lanes.") + continue + lane_name = f"L{lane_id}" + run_lane = f"{run_analysis_name} | {lane_name}" + project_manifest_data[run_lane] = {} + + indices = [] + indices_cycles = [] + mask_pattern = re.compile(r"^I\d+Mask$") + matching_keys = [key for key in lane_data.keys() if mask_pattern.match(key)] + for key in matching_keys: + for mask_info in lane_data[key]: + if mask_info["Read"] not in indices: + indices.append(mask_info["Read"]) + indices_cycles.append(str(len(mask_info["Cycles"]))) + indexing = f"{' + '.join(indices_cycles)}
{' + '.join(indices)}" + project_manifest_data[run_lane]["Indexing"] = indexing + + project_manifest_data[run_lane]["AdapterTrimType"] = lane_data.get("AdapterTrimType", "N/A") + project_manifest_data[run_lane]["R1AdapterMinimumTrimmedLength"] = lane_data.get( + "R1AdapterMinimumTrimmedLength", "N/A" + ) + project_manifest_data[run_lane]["R2AdapterMinimumTrimmedLength"] = lane_data.get( + "R2AdapterMinimumTrimmedLength", "N/A" + ) + data_source_info: LoadedFileDict[Any] = { + "fn": str(run_manifest.name), + "root": str(run_manifest.parent), + "sp_key": data_source, + "s_name": str(run_manifest.with_suffix("").name), + "f": run_manifest_data, + } + self.add_data_source(f=data_source_info, s_name=run_analysis_name, module="bases2fastq") + + return project_manifest_data + + def _parse_run_unassigned_sequences(self, data_source: str) -> Dict[str, Any]: + run_unassigned_sequences = {} + if data_source == "": + return run_unassigned_sequences + + for f in self.find_log_files(data_source): data = json.loads(f["f"]) - samples = data["Samples"] - # get run + analysis + # Get RunName and AnalysisID run_name = data.get("RunName", None) analysis_id = data.get("AnalysisID", None)[0:4] - + if not run_name or not analysis_id: + log.error( + "Error with RunStats.json. Either RunName or AnalysisID is absent.\n" + "Please visit Elembio online documentation for more information - " + "https://docs.elembio.io/docs/bases2fastq/introduction/" + ) + continue run_analysis_name = "-".join([run_name, analysis_id]) run_analysis_name = self.clean_s_name(run_analysis_name, f) - if not run_name or not analysis_id: - log.error(f"Error with {f['root']}. 
Either RunName or AnalysisID is absent.") - log.error("Please visit Elembio online documentation for more information -") + # skip run if in user provider ignore list + if self.is_ignore_sample(run_analysis_name): + log.info(f"Skipping <{run_analysis_name}> because it is present in ignore list.") continue - project = self.clean_s_name(data.get("Project", "DefaultProject"), f) + # Get total polonies and build unassigned indices dictionary + total_polonies = data.get("NumPoloniesBeforeTrimming", 0) + if "Lanes" not in data: + log.error( + f"Missing lane information in RunStats.json for run {run_analysis_name}." + f"Skipping building unassigned indices table." + ) + continue + index_number = 1 + for lane in data["Lanes"]: + lane_id = lane.get("Lane") + if lane_id: + lane_id = f"L{lane_id}" + for sequence in lane.get("UnassignedSequences", []): + run_unassigned_sequences[index_number] = { + "Run Name": run_analysis_name, + "Lane": lane_id, + "I1": sequence["I1"], + "I2": sequence["I2"], + "Number of Polonies": sequence["Count"], + "% Polonies": float("nan"), + } + if total_polonies > 0: + run_unassigned_sequences[index_number]["% Polonies"] = round( + sequence["Count"] / total_polonies, 2 + ) + index_number += 1 + + return run_unassigned_sequences + + def _parse_index_assignment(self, manifest_data_source: str) -> Dict[str, Any]: + sample_to_index_assignment = {} + + if manifest_data_source == "": + return sample_to_index_assignment + + for f in self.find_log_files(manifest_data_source): + directory = f.get("root") + if not directory: + continue + + # Get RunName and RunID from RunParameters.json + run_stats_path = Path(directory) / "RunStats.json" + if not run_stats_path.exists(): + log.error( + f"RunStats.json does not exist in the Bases2Fastq output directory {directory}.\n" + "Please visit Elembio online documentation for more information - " + "https://docs.elembio.io/docs/bases2fastq/introduction/" + ) + continue + + run_analysis_name = None + total_polonies = 0 
+ with open(run_stats_path) as _infile: + run_stats = json.load(_infile) - run_analysis_project_name = "__".join([run_name, project, analysis_id]) - run_analysis_project_name = self.clean_s_name(run_analysis_project_name, f) + # Get run name information + run_name = run_stats.get("RunName", None) + analysis_id = run_stats.get("AnalysisID", None) + if run_name and analysis_id: + run_analysis_name = "-".join([run_name, analysis_id[0:4]]) + else: + log.error( + "Error with RunStats.json. Either RunName or AnalysisID is absent.\n" + "Please visit Elembio online documentation for more information - " + "https://docs.elembio.io/docs/bases2fastq/introduction/" + ) + log.debug(f"Error in RunStats.json: {run_stats_path}") + log.debug(f"Missing: RunName: {run_name} or AnalysisID: {analysis_id}") + continue + + # skip run if in user provider ignore list + if self.is_ignore_sample(run_analysis_name): + log.info(f"Skipping <{run_analysis_name}> because it is present in ignore list.") + continue - # skip project if in user provider ignore list - if self.is_ignore_sample(run_analysis_project_name): + # Ensure sample stats are present + if "SampleStats" not in run_stats: + log.error( + "Error, missing SampleStats in RunStats.json. Skipping index assignment metrics.\n" + "Please visit Elembio online documentation for more information - " + "https://docs.elembio.io/docs/bases2fastq/introduction/" + ) + log.debug(f"Missing SampleStats in RunStats.json. Available keys: {list(run_stats.keys())}.") + continue + + # Extract per sample polony counts and overall total counts + total_polonies = run_stats.get("NumPoloniesBeforeTrimming", 0) + for sample_data in run_stats["SampleStats"]: + sample_name = sample_data.get("SampleName") + sample_id = None + if run_analysis_name and sample_name: + sample_id = "__".join([run_analysis_name, sample_name]) + + if "Occurrences" not in sample_data: + log.error(f"Missing data needed to extract index assignment for sample {sample_id}. 
Skipping.") + continue + + for occurrence in sample_data["Occurrences"]: + sample_expected_seq = occurrence.get("ExpectedSequence") + sample_counts = occurrence.get("NumPoloniesBeforeTrimming") + if any([element is None for element in [sample_expected_seq, sample_counts, sample_id]]): + log.error( + f"Missing data needed to extract index assignment for sample {sample_id}. Skipping." + ) + continue + if run_analysis_name not in sample_to_index_assignment: + sample_to_index_assignment[run_analysis_name] = {} + if sample_expected_seq not in sample_to_index_assignment[run_analysis_name]: + sample_to_index_assignment[run_analysis_name][sample_expected_seq] = { + "SampleID": sample_id, + "SamplePolonyCounts": 0, + "PercentOfPolonies": float("nan"), + "Index1": "", + "Index2": "", + } + sample_to_index_assignment[run_analysis_name][sample_expected_seq]["SamplePolonyCounts"] += ( + sample_counts + ) + + for sample_data in sample_to_index_assignment[run_analysis_name].values(): + if total_polonies > 0: + sample_data["PercentOfPolonies"] = round( + sample_data["SamplePolonyCounts"] / total_polonies * 100, 2 + ) + + run_manifest = json.loads(f["f"]) + if "Samples" not in run_manifest: + log.warning( + f" section not found in {directory}/RunManifest.json.\n" + f"Skipping RunManifest sample index assignment metrics." + ) + elif len(sample_to_index_assignment) == 0: + log.warning("Index assignment data missing. 
Skipping creation of index assignment metrics.") + else: + for sample_data in run_manifest["Samples"]: + sample_name = sample_data.get("SampleName") + sample_id = None + if run_analysis_name is None or sample_name is None or "Indexes" not in sample_data: + continue + sample_id = "__".join([run_analysis_name, sample_name]) + for index_data in sample_data["Indexes"]: + index_1 = index_data.get("Index1", "") + index_2 = index_data.get("Index2", "") + merged_indices = f"{index_1}{index_2}" + if merged_indices not in sample_to_index_assignment[run_analysis_name]: + log.error(f"Index assignment information not found for sample {sample_id}. Skipping.") + continue + if sample_id != sample_to_index_assignment[run_analysis_name][merged_indices]["SampleID"]: + log.error( + f"RunManifest SampleID <{sample_id}> does not match " + f"RunStats SampleID {sample_to_index_assignment[merged_indices]['SampleID']}." + "Skipping." + ) + continue + sample_to_index_assignment[run_analysis_name][merged_indices]["Index1"] = index_1 + sample_to_index_assignment[run_analysis_name][merged_indices]["Index2"] = index_2 + + return sample_to_index_assignment + + def _parse_index_assignment_in_project(self, data_source: str) -> Dict[str, Any]: + sample_to_index_assignment = {} + + if data_source == "": + return sample_to_index_assignment + + for f in self.find_log_files(data_source): + directory = f.get("root") + if not directory: continue - for sample_name in samples: - run_analysis_sample_name = self.clean_s_name("__".join([run_analysis_name, sample_name]), f) - self.project_lookup_dict[run_analysis_sample_name] = project - num_projects += 1 + # Get RunName and RunID from RunParameters.json + run_manifest = Path(directory) / "../../RunManifest.json" + if not run_manifest.exists(): + log.error( + f"RunManifest.json could not be found in {run_manifest}. 
Skipping index assignment.\n" + "Please visit Elembio online documentation for more information - " + "https://docs.elembio.io/docs/bases2fastq/introduction/" + ) + continue - # remove samples - del data["Samples"] + project_stats = json.loads(f["f"]) + run_analysis_name = None + run_name = project_stats.get("RunName", None) + analysis_id = project_stats.get("AnalysisID", None) + project = self.clean_s_name(project_stats.get("Project", "DefaultProject"), f) - self.b2f_run_project_data[run_analysis_project_name] = data - self.add_data_source(f=f, s_name=project, module="bases2fastq") + if run_name and analysis_id: + run_analysis_name = "-".join([run_name, analysis_id[0:4]]) + else: + log.error( + "Error with project's RunStats.json. Either RunName or AnalysisID is absent.\n" + "Please visit Elembio online documentation for more information - " + "https://docs.elembio.io/docs/bases2fastq/introduction/" + ) + log.debug(f"Error in RunStats.json: {f['fn']}") + log.debug(f"Missing: RunName: {run_name} or AnalysisID: {analysis_id}") + continue - # if all RunStats.json too large, none will be found. Guide customer and Exit at this point. - if len(self.sample_id_to_run) != 0: - log.info(f"Found {num_runs} total RunStats.json") + # skip run if in user provider ignore list + if self.is_ignore_sample(run_analysis_name): + log.info(f"Skipping <{run_analysis_name}> because it is present in ignore list.") + continue - # ensure run/sample data found - if num_projects == 0 and num_samples == 0: - raise ModuleNoSamplesFound - log.info(f"Found {num_samples} samples and {num_projects} projects within the bases2fastq results") + # Ensure sample stats are present + if "SampleStats" not in project_stats: + log.error( + "Error, missing SampleStats in RunStats.json. Skipping index assignment metrics.\n" + "Please visit Elembio online documentation for more information - " + "https://docs.elembio.io/docs/bases2fastq/introduction/" + ) + log.debug(f"Missing SampleStats in RunStats.json. 
Available keys: {list(project_stats.keys())}.") + continue - # Superfluous function call to confirm that it is used in this module - self.add_software_version(None) + # Extract per sample polony counts and overall total counts + total_polonies = project_stats.get("NumPoloniesBeforeTrimming", 0) + for sample_data in project_stats["SampleStats"]: + sample_name = sample_data.get("SampleName") + sample_id = None - # process groups / projects - for s_name in self.b2f_sample_data.keys(): - s_group = self.b2f_sample_data[s_name]["RunName"] + if run_analysis_name and sample_name: + sample_id = "__".join([run_analysis_name, sample_name]) - if not self.group_dict.get(s_group): - self.group_dict.update({s_group: []}) + if "Occurrences" not in sample_data: + log.error(f"Missing data needed to extract index assignment for sample {sample_id}. Skipping.") + continue - self.group_dict[s_group].append(s_name) - self.group_lookup_dict.update({s_name: s_group}) + for occurrence in sample_data["Occurrences"]: + sample_expected_seq = occurrence.get("ExpectedSequence") + sample_counts = occurrence.get("NumPoloniesBeforeTrimming") + if any([element is None for element in [sample_expected_seq, sample_counts, sample_id]]): + log.error(f"Missing data needed to extract index assignment for sample {sample_id}. 
Skipping.") + continue + if run_analysis_name not in sample_to_index_assignment: + sample_to_index_assignment[run_analysis_name] = {} + if sample_expected_seq not in sample_to_index_assignment[run_analysis_name]: + sample_to_index_assignment[run_analysis_name][sample_expected_seq] = { + "SampleID": sample_id, + "Project": project, + "SamplePolonyCounts": 0, + "PercentOfPolonies": float("nan"), + "Index1": "", + "Index2": "", + } + sample_to_index_assignment[run_analysis_name][sample_expected_seq]["SamplePolonyCounts"] += ( + sample_counts + ) - # Assign project - for s_name in self.b2f_sample_data.keys(): - if self.project_lookup_dict.get(s_name): - s_group = self.project_lookup_dict[s_name] - if not self.group_dict.get(s_group): - self.group_dict.update({s_group: []}) - self.group_dict[s_group].append(s_name) - self.group_lookup_dict.update({s_name: s_group}) + for sample_data in sample_to_index_assignment[run_analysis_name].values(): + if total_polonies > 0: + sample_data["PercentOfPolonies"] = round( + sample_data["SamplePolonyCounts"] / total_polonies * 100, 2 + ) - # Assign color for each group - self.color_getter = mqc_colour.mqc_colour_scale() - self.palette = sum( - [ - self.color_getter.get_colours(hue) - for hue in ["Set2", "Pastel1", "Accent", "Set1", "Set3", "Dark2", "Paired", "Pastel2"] - ], - [], - ) - if len(self.group_dict) > len(self.palette): - hex_range = 2**24 - extra_colors = [hex(random.randrange(0, hex_range)) for _ in range(len(self.group_dict), len(self.palette))] - self.palette = self.palette + extra_colors - self.group_color = {g: c for g, c in zip(self.group_dict.keys(), self.palette[: len(self.group_dict)])} - self.sample_color = dict() - for s_name in self.b2f_sample_data.keys(): - self.sample_color.update({s_name: self.group_color[self.group_lookup_dict[s_name]]}) - self.run_color = copy.deepcopy(self.group_color) # Make sure that run colors and group colors match - self.palette = self.palette[len(self.group_dict) :] + 
run_manifest_data = None + with open(run_manifest) as _infile: + run_manifest_data = json.load(_infile) - # Read custom group info - self.group_info_exist = False - for f in self.find_log_files("bases2fastq/group"): - if self.group_info_exist: + if "Samples" not in run_manifest_data: log.warning( - "More than one group assignment files are found. Please only keep " - "one assignment file in the analysis folder. Bases2Fastq stats will " - "not be plotted" + f" section not found in {directory}/RunManifest.json.\n" + f"Skipping RunManifest sample index assignment metrics." ) - for row in csv.DictReader(f["f"]): - s_group = row["Group"] - s_name = row["Sample Name"] - if self.group_dict.get(s_group) is None: - self.group_dict[s_group] = [] - self.group_dict[s_group].append(s_name) - self.group_lookup_dict[s_name] = s_group - for group in self.group_dict.keys(): - if group not in self.run_color: - if len(self.palette) > 0: - self.group_color[group] = self.palette.pop(0) - else: - hex_range = 2**24 - extra_color = hex(random.randrange(0, hex_range)) - self.group_color[group] = extra_color - self.sample_color = dict() - for s_name in self.b2f_sample_data.keys(): - self.sample_color.update({s_name: self.group_color[self.group_lookup_dict[s_name]]}) - - # sort run - data_keys = list(self.b2f_run_data.keys()) - data_keys.sort() - sorted_data = {s_name: self.b2f_run_data[s_name] for s_name in data_keys} - self.b2f_run_data = sorted_data - # sort projects - data_keys = list(self.b2f_run_project_data.keys()) - data_keys.sort() - sorted_data = {s_name: self.b2f_run_project_data[s_name] for s_name in data_keys} - self.b2f_run_project_data = sorted_data - # sort samples - data_keys = list(self.b2f_sample_data.keys()) - sorted_keys = sorted(data_keys, key=lambda x: (self.group_lookup_dict[x], x)) - sorted_data = {s_name: self.b2f_sample_data[s_name] for s_name in sorted_keys} - self.b2f_sample_data = sorted_data - - if len(self.b2f_run_data) == 0: - log.warning("No run stats file 
found!") - if len(self.b2f_sample_data) == 0: - log.warning("No sample stats file found!") - - # Add sections - self.add_run_plots() - if num_projects > 0: - self.add_project_run_plots() - self.add_sample_plots() - - def get_uuid(self): - return str(uuid.uuid4()).replace("-", "").lower() - - def add_run_plots(self): - plot_functions = [tabulate_run_stats, plot_run_stats, plot_base_quality_hist, plot_base_quality_by_cycle] + elif len(sample_to_index_assignment) == 0: + log.warning("Index assignment data missing. Skipping creation of index assignment metrics.") + else: + for sample_data in run_manifest_data["Samples"]: + sample_name = sample_data.get("SampleName") + sample_id = None + if run_analysis_name is None or sample_name is None or "Indexes" not in sample_data: + continue + sample_id = "__".join([run_analysis_name, sample_name]) + for index_data in sample_data["Indexes"]: + index_1 = index_data.get("Index1", "") + index_2 = index_data.get("Index2", "") + merged_indices = f"{index_1}{index_2}" + if merged_indices not in sample_to_index_assignment[run_analysis_name]: + continue + if sample_id != sample_to_index_assignment[run_analysis_name][merged_indices]["SampleID"]: + log.error( + f"RunManifest SampleID <{sample_id}> does not match " + f"RunStats SampleID {sample_to_index_assignment[merged_indices]['SampleID']}." + "Skipping." 
+ ) + continue + sample_to_index_assignment[run_analysis_name][merged_indices]["Index1"] = index_1 + sample_to_index_assignment[run_analysis_name][merged_indices]["Index2"] = index_2 + + return sample_to_index_assignment + + def add_run_plots(self, data, plot_functions): for func in plot_functions: - plot_html, plot_name, anchor, description, helptext, plot_data = func(self.b2f_run_data, self.run_color) + plot_html, plot_name, anchor, description, helptext, plot_data = func(data, self.run_color) self.add_section(name=plot_name, plot=plot_html, anchor=anchor, description=description, helptext=helptext) self.write_data_file(plot_data, f"base2fastq:{plot_name}") - def add_project_run_plots(self): - plot_functions = [tabulate_project_run_stats] - for func in plot_functions: - plot_html, plot_name, anchor, description, helptext, plot_data = func( - self.b2f_run_project_data, self.run_color - ) - self.add_section(name=plot_name, plot=plot_html, anchor=anchor, description=description, helptext=helptext) - self.write_data_file(plot_data, f"base2fastq_projects:{plot_name}") - - def add_sample_plots(self): + def add_sample_plots(self, data, group_lookup, project_lookup): plot_functions = [ tabulate_sample_stats, sequence_content_plot, @@ -292,7 +813,7 @@ def add_sample_plots(self): ] for func in plot_functions: plot_html, plot_name, anchor, description, helptext, plot_data = func( - self.b2f_sample_data, self.group_lookup_dict, self.project_lookup_dict, self.sample_color + data, group_lookup, project_lookup, self.sample_color ) self.add_section(name=plot_name, plot=plot_html, anchor=anchor, description=description, helptext=helptext) self.write_data_file(plot_data, f"base2fastq:{plot_name}") diff --git a/multiqc/modules/bases2fastq/plot_project_runs.py b/multiqc/modules/bases2fastq/plot_project_runs.py deleted file mode 100644 index 6a3663535d..0000000000 --- a/multiqc/modules/bases2fastq/plot_project_runs.py +++ /dev/null @@ -1,88 +0,0 @@ -from multiqc.plots import table 
-from multiqc import config - -""" -Functions for plotting per run information of bases2fastq -""" - - -def tabulate_project_run_stats(run_data, color_dict): - """ - Tabulate general information and statistics of each run - """ - plot_content = dict() - for s_name in run_data.keys(): - run_stats = dict() - run_stats.update({"num_polonies_run": int(run_data[s_name]["NumPolonies"])}) - run_stats.update({"yield_run": run_data[s_name]["AssignedYield"]}) - run_stats.update({"mean_base_quality_run": run_data[s_name]["QualityScoreMean"]}) - run_stats.update({"percent_q30_run": run_data[s_name]["PercentQ30"]}) - run_stats.update({"percent_q40_run": run_data[s_name]["PercentQ40"]}) - plot_content.update({s_name: run_stats}) - - headers = {} - headers["num_polonies_run"] = { - "title": f"# Polonies ({config.base_count_prefix})", - "description": f"The total number of polonies that are calculated for the run ({config.base_count_desc})", - "min": 0, - "scale": "RdYlGn", - "shared_key": "base_count", - } - headers["percent_assigned_run"] = { - "title": "% Assigned Reads", - "description": "The percentage of reads assigned to sample(s)", - "max": 100, - "min": 0, - "scale": "BuPu", - "suffix": "%", - } - headers["yield_run"] = { - "title": "Assigned Yield (Gb)", - "description": "The run yield based on assigned reads in gigabases", - "scale": "Blues", - } - headers["mean_base_quality_run"] = { - "title": "Quality Score Mean", - "description": "Average base quality across Read 1 and Read 2", - "min": 0, - "scale": "Spectral", - } - headers["percent_q30_run"] = { - "title": "Percent Q30", - "description": "The percentage of ≥ Q30 Q scores for the project. This includes assigned and unassigned reads and excludes filtered reads and no calls.", - "max": 100, - "min": 0, - "scale": "RdYlGn", - "suffix": "%", - } - headers["percent_q40_run"] = { - "title": "Percent Q40", - "description": "The percentage of ≥ Q40 Q scores for the project. 
This includes assigned and unassigned reads and excludes filtered reads and no calls.", - "max": 100, - "min": 0, - "scale": "RdYlGn", - "suffix": "%", - } - - pconfig = { - "title": "bases2fastq: General Sequencing (Project) QC metrics", - "col1_header": "Run Name", - "id": "project_run_metrics_table", - "ylab": "QC", - } - - plot_name = "(Project) Sequencing QC metrics table" - plot_html = table.plot(plot_content, headers, pconfig=pconfig) - anchor = "project_run_qc_metrics_table" - description = "QC metrics per run, per project" - helptext = """ - This section displays metrics that indicate the quality of each sequencing run: \n - - Run Name: Unique identifier composed of (RunName)__(UUID), where (RunName) maps to the AVITI run name and (UUID) maps to the unique Bases2Fastq analysis result.\n - - Number of Polonies: The total number of polonies that are calculated for the run.\n - - Percentage Assigned Reads: The percentage of reads that are assigned to a sample.\n - - Assigned Yield (Gb): The run yield that is based on assigned reads in gigabases.\n - - Quality Score Mean: The mean Q score of base calls for the samples. This excludes filtered reads and no calls.\n - - Percent Q30: The percentage of ≥ Q30 Q scores for the run. This includes assigned and unassigned reads and excludes filtered reads and no calls.\n - - Percent Q40: The percentage of ≥ Q40 Q scores for the run. 
This includes assigned and unassigned reads and excludes filtered reads and no calls.\n - """ - return plot_html, plot_name, anchor, description, helptext, plot_content diff --git a/multiqc/modules/bases2fastq/plot_runs.py b/multiqc/modules/bases2fastq/plot_runs.py index accd532678..87151b3baa 100644 --- a/multiqc/modules/bases2fastq/plot_runs.py +++ b/multiqc/modules/bases2fastq/plot_runs.py @@ -2,13 +2,19 @@ from multiqc.plots import bargraph, linegraph, table from multiqc import config - +from natsort import natsorted +import random +import string """ Functions for plotting per run information of bases2fastq """ +def generate_random_string(length: int): + return "".join(random.choices(string.ascii_letters + string.digits, k=length)) + + def plot_run_stats(run_data, color_dict): """ Plot a bar graph for polony numbers, Q30/Q40, index assignment rate and yields for each run @@ -19,8 +25,8 @@ def plot_run_stats(run_data, color_dict): yields = dict() for run in run_names: # Index Assignment Polonies and Yields ### - # percent_assigned = run_data[run].get("PercentAssignedReads",100.0) - percent_assigned = run_data[run]["PercentAssignedReads"] + percent_assigned = run_data[run].get("PercentAssignedReads", 100.0) + # percent_assigned = run_data[run]["PercentAssignedReads"] percent_perfect_assigned = ( 100.00 - run_data[run]["PercentMismatch"] @@ -42,7 +48,7 @@ def plot_run_stats(run_data, color_dict): num_polonies[run] = num_polonies_run total_yield_run = {} - total_yield = run_data[run].get("TotalYield", 300.0) + total_yield = run_data[run].get("TotalYield", run_data[run].get("AssignedYield", 300.0)) total_yield_run["Perfect Index"] = total_yield * percent_perfect_total * 0.01 total_yield_run["Mismatched Index"] = total_yield * percent_imperfect_total * 0.01 total_yield_run["Unassigned"] = ( @@ -54,11 +60,11 @@ def plot_run_stats(run_data, color_dict): pconfig = { "data_labels": [ {"name": "Number of Polonies", "ylab": "Number of Polonies", "format": "{d}"}, - 
{"name": "Yield (Gb)", "ylab": "Gb"}, + {"name": "Yield (Gb)", "ylab": "Yield"}, ], "cpswitch": True, "stacking": "normal", - "id": "run_metrics_bar", + "id": f"run_metrics_bar_{generate_random_string(10)}", "title": "bases2fastq: General Sequencing Run QC metrics plot", "ylab": "QC", } @@ -69,15 +75,166 @@ def plot_run_stats(run_data, color_dict): "Unassigned": {"name": "Unassigned Index", "color": "#434348"}, } ] * 2 - plot_name = "Sequencing Run Yield" plot_html = bargraph.plot(plot_content, cats, pconfig=pconfig) - anchor = "run_yield_plot" + anchor = "run_metrics_bar" description = "Bar plots of sequencing run yields. Please see individual run reports for details" helptext = """ This section shows and compare the yield and index assignment rate of each sequencing run.\n\n - - Number of Polonies: The total number of polonies that are calculated for the run.\n - - Yield: The total yield of all assigned reads in gigabases. + - Number of Polonies: The total number of polonies that are calculated for the run.\n + - Yield: The total yield of all assigned reads in gigabases. + """ + return plot_html, plot_name, anchor, description, helptext, plot_content + + +def _calculate_reads_eliminated(run_data) -> int: + """ + Calculate the total number of reads eliminated during trimming. + + This function iterates over the lanes in the given run data and sums the + difference between the number of polonies before trimming and after trimming. + If required fields are missing, they are skipped. + + Args: + run_data (dict): Dictionary containing sequencing run data with lane information. + + Returns: + int: The total number of reads eliminated across all lanes. 
+ """ + reads_eliminated = 0 + if "Lanes" not in run_data: + return reads_eliminated + for lane in run_data["Lanes"]: + if "NumPolonies" not in lane or "NumPoloniesBeforeTrimming" not in lane: + continue + reads_eliminated += lane["NumPoloniesBeforeTrimming"] - lane["NumPolonies"] + + return reads_eliminated + + +def tabulate_project_stats(run_data, color_dict): + """ + Tabulate general information and statistics of each run + """ + plot_content = dict() + is_percent_q50_present = False + reads_present = [] + for s_name in run_data.keys(): + project = run_data[s_name]["Project"] + run_project_name = f"{s_name} | {project}" + run_stats = dict() + run_stats.update({"num_polonies_run": int(run_data[s_name]["NumPolonies"])}) + run_stats.update({"yield_run": run_data[s_name]["AssignedYield"]}) + run_stats.update({"mean_base_quality_run": run_data[s_name]["QualityScoreMean"]}) + run_stats.update({"percent_q30_run": run_data[s_name]["PercentQ30"]}) + run_stats.update({"percent_q40_run": run_data[s_name]["PercentQ40"]}) + percent_q50 = run_data[s_name].get("PercentQ50") + if percent_q50 is not None: + is_percent_q50_present = True + run_stats.update({"percent_q50_run": percent_q50}) + run_stats.update({"reads_eliminated": _calculate_reads_eliminated(run_data[s_name])}) + if "Reads" in run_data[s_name]: + for read in run_data[s_name]["Reads"]: + if "Cycles" not in read or "Read" not in read: + continue + read_name = read["Read"] + num_cycles = len(read["Cycles"]) + reads_present.append(read_name) + run_stats.update({f"{read_name}_cycles": num_cycles}) + + plot_content.update({run_project_name: run_stats}) + + headers = {} + headers["num_polonies_run"] = { + "title": "# Polonies", + "description": "The total number of polonies that are calculated for the run.", + "min": 0, + "scale": "RdYlGn", + } + headers["percent_assigned_run"] = { + "title": "% Assigned Reads", + "description": "The percentage of reads assigned to sample(s)", + "max": 100, + "min": 0, + "scale": "BuPu", 
+ "suffix": "%", + } + headers["yield_run"] = { + "title": "Assigned Yield (Gb)", + "description": "The run yield based on assigned reads in gigabases", + "scale": "Blues", + } + headers["mean_base_quality_run"] = { + "title": "Quality Score Mean", + "description": "Average base quality across Read 1 and Read 2", + "min": 0, + "scale": "Spectral", + } + headers["percent_q30_run"] = { + "title": "Percent Q30", + "description": "The percentage of ≥ Q30 Q scores for the project. This includes assigned and unassigned reads and excludes filtered reads and no calls.", + "max": 100, + "min": 0, + "scale": "RdYlGn", + "suffix": "%", + } + headers["percent_q40_run"] = { + "title": "Percent Q40", + "description": "The percentage of ≥ Q40 Q scores for the project. This includes assigned and unassigned reads and excludes filtered reads and no calls.", + "max": 100, + "min": 0, + "scale": "RdYlGn", + "suffix": "%", + } + if is_percent_q50_present: + headers["percent_q50_run"] = { + "title": "Percent Q50", + "description": "The percentage of ≥ Q50 Q scores for the run. 
This includes assigned and unassigned reads and excludes filtered reads and no calls.", + "max": 100, + "min": 0, + "scale": "RdYlGn", + "suffix": "%", + } + for read in reads_present: + headers[f"{read}_cycles"] = { + "title": f"Cycles {read}", + "description": f"Number of cycles for read {read}.", + "scale": "RdPu", + } + + headers["reads_eliminated"] = { + "title": "Reads Eliminated", + "description": "Number of reads eliminated.", + } + + pconfig = { + "title": "bases2fastq: General Sequencing (Project) QC metrics", + "col1_header": "Run Name", + "id": f"project_run_metrics_table_{generate_random_string(5)}", + "ylab": "QC", + } + + project_header = "" + run_keys = list(run_data.keys()) + if len(run_keys) > 1: + project_header = "(Project) " + elif len(run_keys) == 1: + first_key = run_keys[0] + project_header = f"{run_data[first_key]['Project']} | " + plot_name = f"{project_header}Sequencing QC Metrics Table" + plot_html = table.plot(plot_content, headers, pconfig=pconfig) + anchor = "project_run_qc_metrics_table" + description = "QC metrics per run, per project" + helptext = """ + This section displays metrics that indicate the quality of each sequencing run: \n + - Run Name: Unique identifier composed of (RunName)__(UUID), where (RunName) maps to the AVITI run name and (UUID) maps to the unique Bases2Fastq analysis result.\n + - Number of Polonies: The total number of polonies that are calculated for the run.\n + - Percentage Assigned Reads: The percentage of reads that are assigned to a sample.\n + - Assigned Yield (Gb): The run yield that is based on assigned reads in gigabases.\n + - Quality Score Mean: The mean Q score of base calls for the samples. This excludes filtered reads and no calls.\n + - Percent Q30: The percentage of ≥ Q30 Q scores for the run. This includes assigned and unassigned reads and excludes filtered reads and no calls.\n + - Percent Q40: The percentage of ≥ Q40 Q scores for the run. 
This includes assigned and unassigned reads and excludes filtered reads and no calls.\n + - Reads Eliminated: Number of reads eliminated across lanes.\n """ return plot_html, plot_name, anchor, description, helptext, plot_content @@ -87,23 +244,39 @@ def tabulate_run_stats(run_data, color_dict): Tabulate general information and statistics of each run """ plot_content = dict() + is_percent_q50_present = False + reads_present = [] for s_name in run_data.keys(): run_stats = dict() run_stats.update({"num_polonies_run": int(run_data[s_name]["NumPolonies"])}) run_stats.update({"percent_assigned_run": run_data[s_name].get("PercentAssignedReads", 100.0)}) + run_stats.update({"percent_unexpected_index_pairs": run_data[s_name].get("PercentUnexpectedIndexPairs", 0.0)}) run_stats.update({"yield_run": run_data[s_name]["AssignedYield"]}) run_stats.update({"mean_base_quality_run": run_data[s_name]["QualityScoreMean"]}) run_stats.update({"percent_q30_run": run_data[s_name]["PercentQ30"]}) run_stats.update({"percent_q40_run": run_data[s_name]["PercentQ40"]}) + percent_q50 = run_data[s_name].get("PercentQ50") + if percent_q50 is not None: + is_percent_q50_present = True + run_stats.update({"percent_q50_run": percent_q50}) + run_stats.update({"reads_eliminated": _calculate_reads_eliminated(run_data[s_name])}) + if "Reads" in run_data[s_name]: + for read in run_data[s_name]["Reads"]: + if "Cycles" not in read or "Read" not in read: + continue + read_name = read["Read"] + num_cycles = len(read["Cycles"]) + reads_present.append(read_name) + run_stats.update({f"{read_name}_cycles": num_cycles}) + plot_content.update({s_name: run_stats}) headers = {} headers["num_polonies_run"] = { - "title": f"# Polonies ({config.base_count_prefix})", - "description": f"The total number of polonies that are calculated for the run. 
({config.base_count_desc})", + "title": "# Polonies", + "description": "The total number of polonies that are calculated for the run.)", "min": 0, "scale": "RdYlGn", - "shared_key": "base_count", } headers["percent_assigned_run"] = { "title": "% Assigned Reads", @@ -113,6 +286,14 @@ def tabulate_run_stats(run_data, color_dict): "scale": "BuPu", "suffix": "%", } + headers["percent_unexpected_index_pairs"] = { + "title": "% Unexpected Index Pairs", + "description": "The percentage of unexpected index pairs", + "max": 100, + "min": 0, + "scale": "BuPu", + "suffix": "%", + } headers["yield_run"] = { "title": "Yield (Gb)", "description": "The run yield based on assigned reads in gigabases", @@ -140,11 +321,30 @@ def tabulate_run_stats(run_data, color_dict): "scale": "RdYlGn", "suffix": "%", } + if is_percent_q50_present: + headers["percent_q50_run"] = { + "title": "Percent Q50", + "description": "The percentage of ≥ Q50 Q scores for the run. This includes assigned and unassigned reads and excludes filtered reads and no calls.", + "max": 100, + "min": 0, + "scale": "RdYlGn", + "suffix": "%", + } + for read in reads_present: + headers[f"{read}_cycles"] = { + "title": f"Cycles {read}", + "description": f"Number of cycles for read {read}.", + "scale": "RdPu", + } + headers["reads_eliminated"] = { + "title": "Reads Eliminated", + "description": "Number of reads eliminated.", + } pconfig = { - "title": "bases2fastq: General Sequencing Run QC metrics", + "title": "Bases2Fastq: General Sequencing Run QC metrics", "col1_header": "Run Name", - "id": "run_metrics_table", + "id": f"run_metrics_table_{generate_random_string(5)}", "ylab": "QC", } @@ -154,17 +354,221 @@ def tabulate_run_stats(run_data, color_dict): description = "QC metrics per run" helptext = """ This section displays metrics that indicate the quality of each sequencing run: \n - - Run Name: Unique identifier composed of (RunName)__(UUID), where (RunName) maps to the AVITI run name and (UUID) maps to the unique 
Bases2Fastq analysis result.\n - - Number of Polonies: The total number of polonies that are calculated for the run.\n - - Percentage Assigned Reads: The percentage of reads that are assigned to a sample.\n - - Assigned Yield (Gb): The run yield that is based on assigned reads in gigabases.\n - - Quality Score Mean: The mean Q score of base calls for the samples. This excludes filtered reads and no calls.\n - - Percent Q30: The percentage of ≥ Q30 Q scores for the run. This includes assigned and unassigned reads and excludes filtered reads and no calls.\n - - Percent Q40: The percentage of ≥ Q40 Q scores for the run. This includes assigned and unassigned reads and excludes filtered reads and no calls.\n + - Run Name: Unique identifier composed of (RunName)__(UUID), where (RunName) maps to the AVITI run name and (UUID) maps to the unique Bases2Fastq analysis result.\n + - Number of Polonies: The total number of polonies that are calculated for the run.\n + - Percentage Assigned Reads: The percentage of reads that are assigned to a sample.\n + - Assigned Yield (Gb): The run yield that is based on assigned reads in gigabases.\n + - Quality Score Mean: The mean Q score of base calls for the samples. This excludes filtered reads and no calls.\n + - Percent Q30: The percentage of ≥ Q30 Q scores for the run. This includes assigned and unassigned reads and excludes filtered reads and no calls.\n + - Percent Q40: The percentage of ≥ Q40 Q scores for the run. This includes assigned and unassigned reads and excludes filtered reads and no calls.\n + - Percent Q50: The percentage of ≥ Q50 Q scores for the run (when applicable). 
This includes assigned and unassigned reads and excludes filtered reads and no calls.\n + - Reads Eliminated: Number of reads eliminated across lanes.\n + """ + return plot_html, plot_name, anchor, description, helptext, plot_content + + +def tabulate_manifest_stats(run_data, color_dict): + """ + Tabulate general information and statistics of each run + """ + plot_content = dict() + for s_name in run_data.keys(): + run_stats = dict() + run_stats.update({"indexing": run_data[s_name]["Indexing"]}) + run_stats.update({"adapter_trim_type": run_data[s_name]["AdapterTrimType"]}) + run_stats.update({"min_read_length_r1": run_data[s_name]["R1AdapterMinimumTrimmedLength"]}) + run_stats.update({"min_read_length_r2": run_data[s_name]["R2AdapterMinimumTrimmedLength"]}) + plot_content.update({s_name: run_stats}) + + headers = {} + headers["indexing"] = { + "title": "Indexing", + "description": "Indexing scheme.", + "scale": "RdYlGn", + } + headers["adapter_trim_type"] = { + "title": "Adapter Trim Type", + "description": "Adapter trimming method.", + } + headers["min_read_length_r1"] = { + "title": "Minimum Read Length R1", + "description": "Minimum read length for read R1.", + "scale": "RdYlGn", + } + headers["min_read_length_r2"] = { + "title": "Minimum Read Length R2", + "description": "Minimum read length for read R1 (if applicable).", + "scale": "RdYlGn", + } + + pconfig = { + "title": "Bases2Fastq: Run Manifest Metrics", + "col1_header": "Run Name | Lane", + "id": f"run_manifest_metrics_table_{generate_random_string(5)}", + } + + plot_name = "Run Manifest Table" + plot_html = table.plot(plot_content, headers, pconfig=pconfig) + anchor = "run_manifest_metrics_table" + description = "Run parameters used." 
+ helptext = """ + This section displays metrics that indicate the parameters used in the run: \n + - Run Name | Lane: Unique identifier composed of (RunName)__(UUID) | (Lane), where (RunName) maps to the AVITI run name and (UUID) maps to the unique Bases2Fastq analysis result.\n + - Indexing: Describes the indexing scheme.\n + - Adapter Trim Type: Adapter trimming method.\n + - Minimum Read Length R1/R2: Minumum read length after adapter trimming.\n + """ + return plot_html, plot_name, anchor, description, helptext, plot_content + + +def tabulate_index_assignment_stats(run_data, color_dict): + """ + Tabulate general information and statistics of each run + """ + plot_content = dict() + run_names = sorted(run_data.keys()) + index = 1 + project_present = False + for run in run_names: + run_sample_data = run_data[run] + sorted_run_sample_data = natsorted(run_sample_data.items(), key=lambda x: x[1]["SampleID"]) + for sample_data in sorted_run_sample_data: + sample_data = sample_data[1] + sample_index_stats = dict() + sample_index_stats.update({"run_name": run}) + if "Project" in sample_data: + sample_index_stats.update({"project": sample_data["Project"]}) + project_present = True + sample_index_stats.update({"sample_name": sample_data["SampleID"].split("__")[1]}) + sample_index_stats.update({"index_1": sample_data["Index1"]}) + sample_index_stats.update({"index_2": sample_data["Index2"]}) + sample_index_stats.update({"assigned_polonies": sample_data["SamplePolonyCounts"]}) + sample_index_stats.update({"polony_percentage": sample_data["PercentOfPolonies"]}) + plot_content.update({index: sample_index_stats}) + index += 1 + + headers = {} + headers["run_name"] = { + "title": "Run Name", + "description": "Run Name.", + } + if project_present: + headers["project"] = { + "title": "Project", + "description": "Run Project.", + } + headers["sample_name"] = { + "title": "Sample Name", + "description": "Sample Name.", + } + headers["index_1"] = { + "title": "Index 1", + 
"description": "Sample Index 1 (I1).", + } + headers["index_2"] = { + "title": "Index 2", + "description": "Sample Index 2 (I2).", + } + headers["assigned_polonies"] = { + "title": "Assigned Polonies", + "description": "Number of polonies assigned to sample.", + "scale": "RdYlGn", + } + headers["polony_percentage"] = { + "title": "Polony %", + "description": "Percentage of total polonies assigned to this index combination.", + "max": 100, + "min": 0, + "scale": "RdYlGn", + "suffix": "%", + } + + pconfig = { + "title": "Bases2Fastq: Index Assignment Metrics", + "col1_header": "Sample #", + "id": f"index_assignment_metrics_{generate_random_string(5)}", + } + + plot_name = "Index Assignment Metrics" + plot_html = table.plot(plot_content, headers, pconfig=pconfig) + anchor = "index_assignment_metrics" + description = "Index assignment metrics." + helptext = """ + This section displays index assignment metrics including: \n + - Sample Name: Sample identifier combining RunID and SampleID.\n + - Index 1: Sample I1.\n + - Index 2: Sample I2.\n + - Polonies: Number of polonies assigned each sample.\n + - Polony %: Percentage of total run's polonies assigned to each sample.\n """ return plot_html, plot_name, anchor, description, helptext, plot_content +def tabulate_unassigned_index_stats(run_data, color_dict): + """ + Tabulate unassigned index metrics. 
+ + run_data: Dictionary with unassigned index data including: + - RunName + - Lane + - I1 + - I2 + - Polonies + - % Polonies + """ + headers = {} + headers["Run Name"] = { + "title": "Run Name", + "description": "Run Name (Run ID + Analysis ID).", + } + headers["Lane"] = { + "title": "Lane", + "description": "Index Lane.", + } + headers["I1"] = { + "title": "I1", + "description": "Index 1.", + } + headers["I2"] = { + "title": "I2", + "description": "Index 2.", + } + headers["Number of Polonies"] = { + "title": "Polonies", + "description": "Number of polonies assigned to indices.", + "scale": "RdYlGn-rev", + } + headers["% Polonies"] = { + "title": "% Polonies", + "description": "Percentage of total polonies assigned to this index combination.", + "max": 100, + "min": 0, + "scale": "RdYlGn-rev", + "suffix": "%", + } + + pconfig = { + "title": "Bases2Fastq: Unassigned Indices Metrics", + "col1_header": "Index #", + "id": f"index_unassignment_metrics_{generate_random_string(5)}", + } + + plot_name = "Unassigned Indices Metrics" + plot_html = table.plot(run_data, headers, pconfig=pconfig) + anchor = "index_unassignment_metrics" + description = "Index unassignment metrics." + helptext = """ + This section displays index assignment metrics including: \n + - Run Name: Run identifier. 
Built from Run ID and Analysis ID.\n + - Lane: Lane number.\n + - Index 1: Sample I1.\n + - Index 2: Sample I2.\n + - Polonies: Number of polonies assigned each index combination.\n + - Polony %: Percentage of total run's polonies assigned to each index combination.\n + """ + return plot_html, plot_name, anchor, description, helptext, run_data + + def plot_base_quality_hist(run_data, color_dict): # Prepare plot data for per base BQ histogram bq_hist_dict = dict() @@ -206,19 +610,19 @@ def plot_base_quality_hist(run_data, color_dict): "description": "Histogram of bases quality", "ymin": 0, "ylabel": "Percentage of base quality", - "xlabel": "base quality", + "xlab": "Q Score", "colors": color_dict, }, { - "name": "Qualiter Per Read", + "name": "Quality Per Read", "description": "Histogram of average read base quality", "ymin": 0, "ylabel": "Percentage of read quality", - "xlabel": "base quality", + "xlab": "Q Score", "colors": color_dict, }, ], - "id": "per_run_bq_hist", + "id": f"per_run_bq_hist_{generate_random_string(5)}", "title": "bases2fastq: Quality Histograms", "ylab": "Percentage", } @@ -310,6 +714,31 @@ def plot_base_quality_by_cycle(run_data, color_dict): cycle_dict.update({cycle_no: cycle["PercentQ40"]}) Q40_dict.update({s_name: cycle_dict}) + # Prepare plot data for %Q50 of each cycle + Q50_dict = {} + percent_q50_values = set() + for s_name in run_data.keys(): + paired_end = True if len(run_data[s_name]["Reads"]) > 1 else False + cycle_dict = dict() + for cycle in run_data[s_name]["Reads"][0]["Cycles"]: + cycle_no = int(cycle["Cycle"]) + if "PercentQ50" not in cycle: + continue + cycle_perc_q50 = cycle["PercentQ50"] + cycle_dict.update({cycle_no: cycle_perc_q50}) + if cycle_perc_q50 is not None: + percent_q50_values.add(cycle_perc_q50) + if paired_end: + for cycle in run_data[s_name]["Reads"][1]["Cycles"]: + cycle_no = int(cycle["Cycle"]) + r1r2_split + if "PercentQ50" not in cycle: + continue + cycle_perc_q50 = cycle["PercentQ50"] + 
cycle_dict.update({cycle_no: cycle_perc_q50}) + if cycle_perc_q50 is not None: + percent_q50_values.add(cycle_perc_q50) + Q50_dict.update({s_name: cycle_dict}) + # Prepare plot data for % base calls below PF threshold below_pf_dict = {} for s_name in run_data.keys(): @@ -332,7 +761,7 @@ def plot_base_quality_by_cycle(run_data, color_dict): pconfig = { "data_labels": [ {"name": "Median Quality", "xlab": "cycle", "ylab": "Quality"}, - {"name": "Mean Quality", "ylab": "Quality"}, + {"name": "Mean Quality", "xlab": "cycle", "ylab": "Quality"}, {"name": "%Q30", "xlab": "cycle", "ylab": "Percentage", "ymax": 100}, {"name": "%Q40", "xlab": "cycle", "ylab": "Percentage", "ymax": 100}, {"name": "%Base Calls Below PF", "xlab": "cycle", "ylab": "Percentage", "ymax": 100}, @@ -340,17 +769,20 @@ def plot_base_quality_by_cycle(run_data, color_dict): "x_lines": [{"color": "#FF0000", "width": 2, "value": r1r2_split, "dashStyle": "dash"}], "colors": color_dict, "ymin": 0, - "id": "per_run_quality_by_cycle", + "id": f"per_run_quality_by_cycle_{generate_random_string(5)}", "title": "bases2fastq: Quality by cycles", "ylab": "QC", } + if len(percent_q50_values) > 0 and any(v is not None for v in percent_q50_values): + plot_content.insert(4, Q50_dict) + pconfig["data_labels"].insert(4, {"name": "%Q50", "xlab": "cycle", "ylab": "Percentage", "ymax": 100}) plot_html = linegraph.plot(plot_content, pconfig=pconfig) plot_name = "Quality Metrics By Cycle" anchor = "per_cycle_quality" - description = "Per run base qualities by cycle" + description = "Per run base qualities by cycle. Read 1 and Read 2 are separated by a red dashed line." helptext = """ This section plots the base qualities by each instrument cycle.\n - Choose between Median Quality, Mean Quality, Percent Q30 or Percentage Q40 per cycle.\n + Choose between Median Quality, Mean Quality, Percent Q30, Percent Q40 or Percent Q50 (when applicable) per cycle.\n Read 1 and Read 2 are separated by a red dashed line. 
""" return plot_html, plot_name, anchor, description, helptext, plot_content diff --git a/multiqc/modules/bases2fastq/plot_samples.py b/multiqc/modules/bases2fastq/plot_samples.py index 34388a847f..ebaab9b166 100644 --- a/multiqc/modules/bases2fastq/plot_samples.py +++ b/multiqc/modules/bases2fastq/plot_samples.py @@ -1,16 +1,44 @@ -from multiqc.plots import linegraph, table +from typing import Any, Dict +from multiqc.plots import bargraph, linegraph, table from multiqc import config +from .plot_runs import generate_random_string + +import numpy as np """ Functions for plotting per sample information of bases2fastq """ +def _calculate_sample_reads_eliminated(run_data) -> int: + """ + Calculate the total number of reads eliminated during trimming. + + This function iterates over the lanes in the given run data and sums the + difference between the number of polonies before trimming and after trimming. + If required fields are missing, they are skipped. + + Args: + run_data (dict): Dictionary containing sequencing run data with lane information. + + Returns: + int: The total number of reads eliminated across all lanes. 
+ """ + reads_eliminated = 0 + if "NumPolonies" not in run_data or "NumPoloniesBeforeTrimming" not in run_data: + return reads_eliminated + reads_eliminated += run_data["NumPoloniesBeforeTrimming"] - run_data["NumPolonies"] + + return reads_eliminated + + def tabulate_sample_stats(sample_data, group_lookup_dict, project_lookup_dict, sample_color): """ Tabulate general information and statistics per sample """ plot_content = dict() + reads_present = set() + is_percent_q50_present = False for s_name in sample_data.keys(): general_stats = dict() general_stats.update({"group": group_lookup_dict[s_name]}) @@ -20,6 +48,19 @@ def tabulate_sample_stats(sample_data, group_lookup_dict, project_lookup_dict, s general_stats.update({"mean_base_quality_sample": sample_data[s_name]["QualityScoreMean"]}) general_stats.update({"percent_q30_sample": sample_data[s_name]["PercentQ30"]}) general_stats.update({"percent_q40_sample": sample_data[s_name]["PercentQ40"]}) + percent_q50 = sample_data[s_name].get("PercentQ50") + if percent_q50 is not None: + is_percent_q50_present = True + general_stats.update({"percent_q50_run": percent_q50}) + general_stats.update({"reads_eliminated": _calculate_sample_reads_eliminated(sample_data[s_name])}) + general_stats.update({"percent_mismatch": sample_data[s_name]["PercentMismatch"]}) + if "Reads" in sample_data[s_name]: + for read in sample_data[s_name]["Reads"]: + read_name = read["Read"] + reads_present.add(read_name) + mean_length = read["MeanReadLength"] + general_stats.update({f"{read_name}_mean_len": mean_length}) + plot_content.update({s_name: general_stats}) headers = {} @@ -37,26 +78,25 @@ def tabulate_sample_stats(sample_data, group_lookup_dict, project_lookup_dict, s "scale": False, } headers["num_polonies_sample"] = { - "title": f"# Polonies ({config.base_count_prefix})", - "description": f"The total number of polonies that are calculated for the run. 
({config.base_count_desc})", + "title": "# Polonies", + "description": "The total number of polonies that are calculated for the run.", "min": 0, "scale": "Blues", - "shared_key": "base_count", } headers["yield_sample"] = { "title": "Yield (Gb)", - "description": "The sample yield based on assigned reads in gigabases", + "description": "The sample yield based on assigned reads in gigabases.", "scale": "Greens", } headers["mean_base_quality_sample"] = { "title": "Mean Base Quality", - "description": "Average base quality across R1/R2", + "description": "Average base quality across R1/R2.", "min": 0, "scale": "Spectral", } headers["percent_q30_sample"] = { "title": "Percent Q30", - "description": "The percentage of ≥ Q30 Q scores for the sample. This includes assigned reads and excludes filtered reads and no calls", + "description": "The percentage of ≥ Q30 Q scores for the sample. This includes assigned reads and excludes filtered reads and no calls.", "max": 100, "min": 0, "scale": "RdYlGn", @@ -64,30 +104,65 @@ def tabulate_sample_stats(sample_data, group_lookup_dict, project_lookup_dict, s } headers["percent_q40_sample"] = { "title": "Percent Q40", - "description": "The percentage of ≥ Q40 Q scores for the sample. This includes assigned reads and excludes filtered reads and no calls", + "description": "The percentage of ≥ Q40 Q scores for the sample. This includes assigned reads and excludes filtered reads and no calls.", + "max": 100, + "min": 0, + "scale": "RdYlGn", + "suffix": "%", + } + if is_percent_q50_present: + headers["percent_q50_run"] = { + "title": "Percent Q50", + "description": "The percentage of ≥ Q50 Q scores for the sample. 
This includes assigned reads and excludes filtered reads and no calls.", + "max": 100, + "min": 0, + "scale": "RdYlGn", + "suffix": "%", + } + + for read in sorted(reads_present): + headers[f"{read}_mean_len"] = { + "title": f"{read} Mean Length", + "description": f"Average read length for read {read}", + "scale": "RdYlGn", + } + + headers["reads_eliminated"] = { + "title": "Reads Eliminated", + "description": "Number of reads eliminated.", + } + headers["percent_mismatch"] = { + "title": "Percent Mismatch", + "description": "Percent mismatch", "max": 100, "min": 0, "scale": "RdYlGn", "suffix": "%", } - pconfig = {"id": "sample_qc_metric_table", "title": "Sample QC Metrics Table", "no_violin": True} + pconfig = { + "id": f"sample_qc_metric_table_{generate_random_string(5)}", + "title": "Sample QC Metrics Table", + "no_violin": False, + } plot_name = "Sample QC Metrics Table" plot_html = table.plot(plot_content, headers, pconfig=pconfig) anchor = "sample_qc_metrics_table" description = "QC metrics per unique sample" helptext = """ - This section displays metrics that indicate the quality of each sample: \n - - Sample Name: Unique identifier composed of (RunName)__(UUID)__(SampleName), where (RunName) maps to the AVITI run name, (UUID) maps to the unique Bases2Fastq analysis result, and (SampleName) maps to the sample name as specified in the RunManifest.csv. - - Group: Run/Sample group label that assigns colors in the plot. To customize group tags:\n - - 1) Set the project name when running Bases2Fastq. 
In this case the group tags will be project name.\n - - 2) Generate a csv file with the suffix "_b2fgroup.csv", containing the columns "Sample Name" and "Group".\n - - Number of Polonies: The total number of polonies that are assigned to the sample.\n - - Assigned Yield (Gb): The sample yield that is based on assigned reads in gigabases.\n - - Quality Score Mean: The average Q score of base calls for the sample.\n - - Percent Q30: The percentage of ≥ Q30 Q scores for the sample. This includes assigned reads and excludes filtered reads and no calls.\n - - Percent Q40: The percentage of ≥ Q40 Q scores for the sample. This includes assigned reads and excludes filtered reads and no calls\n + This section displays metrics that indicate the quality of each sample: \n + - Sample Name: Unique identifier composed of (RunName)__(UUID)__(SampleName), where (RunName) maps to the AVITI run name, (UUID) maps to the unique Bases2Fastq analysis result, and (SampleName) maps to the sample name as specified in the RunManifest.csv. + - Group: Run/Sample group label that assigns colors in the plot. To customize group tags:\n + - 1) Set the project name when running Bases2Fastq. In this case the group tags will be project name.\n + - 2) Generate a csv file with the suffix "_b2fgroup.csv", containing the columns "Sample Name" and "Group".\n + - Number of Polonies: The total number of polonies that are assigned to the sample.\n + - Assigned Yield (Gb): The sample yield that is based on assigned reads in gigabases.\n + - Quality Score Mean: The average Q score of base calls for the sample.\n + - Percent Q30: The percentage of ≥ Q30 Q scores for the sample. This includes assigned reads and excludes filtered reads and no calls.\n + - Percent Q40: The percentage of ≥ Q40 Q scores for the sample. 
This includes assigned reads and excludes filtered reads and no calls.\n + - Reads Eliminated: Number of reads eliminated across lanes.\n + - Percent Mismatch: Percent Mismatch.\n """ return plot_html, plot_name, anchor, description, helptext, plot_content @@ -108,6 +183,7 @@ def sequence_content_plot(sample_data, group_lookup_dict, project_lookup_dict, c r1r2_split = max(r1r2_split, len(R1)) for s_name in sorted(sample_data.keys()): + paired_end = True if len(sample_data[s_name]["Reads"]) > 1 else False R1 = sample_data[s_name]["Reads"][0]["Cycles"] for cycle in range(len(R1)): base_no = cycle + 1 @@ -135,20 +211,19 @@ def sequence_content_plot(sample_data, group_lookup_dict, project_lookup_dict, c plot_content = data pconfig = { - "xlab": "cycle", - "ylab": "Percentage", + "xlab": "Cycle", + "ylab": "Percentage of Total Reads", "x_lines": [{"color": "#FF0000", "width": 2, "value": r1r2_split, "dashStyle": "dash"}], "colors": color_dict, "ymin": 0, - "id": "per_cycle_base_content", + "id": f"per_cycle_base_content_{generate_random_string(5)}", "title": "bases2fastq: Per Cycle Base Content Percentage", } plot_html = linegraph.plot(plot_content, pconfig=pconfig) plot_name = "Per Cycle Base Content" anchor = "base_content" description = """ - Percentage of unidentified bases ("N" bases) by each sequencing cycle. - Read 1 and Read 2 are separated by a red dashed line + Base composition per sample per cycle. Read 1 and Read 2 are separated by a red dashed line. 
""" helptext = """ If a sequencer is unable to make a base call with sufficient confidence then it will @@ -208,11 +283,11 @@ def plot_per_cycle_N_content(sample_data, group_lookup_dict, project_lookup_dict "colors": color_dict, "ymin": 0, "ymax": 100, - "id": "per_cycle_n_content", + "id": f"per_cycle_n_content_{generate_random_string(5)}", "title": "bases2fastq: Per Cycle N Content Percentage", } plot_html = linegraph.plot(plot_content, pconfig=pconfig) - plot_name = "Per Cycle N Content" + plot_name = "Per Cycle N Content" anchor = "n_content" description = """ Percentage of unidentified bases ("N" bases) by each sequencing cycle. @@ -262,10 +337,10 @@ def plot_per_read_gc_hist(sample_data, group_lookup_dict, project_lookup_dict, s plot_content = gc_hist_dict pconfig = { - "xlab": "% GC", - "ylab": "Percentage", + "xlab": "GC Content (%)", + "ylab": "Percentage of reads that have GC (%)", "colors": sample_color, - "id": "gc_hist", + "id": f"gc_hist_{generate_random_string(5)}", "title": "bases2fastq: Per Sample GC Content Histogram", } plot_name = "Per Sample GC Histogram" @@ -322,7 +397,7 @@ def plot_adapter_content(sample_data, group_lookup_dict, project_lookup_dict, sa adapter_percent = cycle["PercentReadsTrimmed"] plot_content[s_name].update({cycle_no: adapter_percent}) pconfig = { - "id": "per_cycle_adapter_content", + "id": f"per_cycle_adapter_content_{generate_random_string(5)}", "title": "bases2fastq: Per Cycle Adapter Content", "xlab": "Cycle", "ylab": "% of Sequences", @@ -333,7 +408,7 @@ def plot_adapter_content(sample_data, group_lookup_dict, project_lookup_dict, sa pconfig.update({"colors": sample_color}) plot_html = linegraph.plot(plot_content, pconfig=pconfig) anchor = "adapter_content" - description = "Adapter content per cycle" + description = "Adapter content per cycle. Read 1 and Read 2 are separated by a red dashed line."
helptext = """ The plot shows a cumulative percentage count of the proportion of your library which has seen each of the adapter sequences at each cycle. diff --git a/multiqc/search_patterns.yaml b/multiqc/search_patterns.yaml index e74ad3ea07..17a734ee54 100644 --- a/multiqc/search_patterns.yaml +++ b/multiqc/search_patterns.yaml @@ -45,6 +45,10 @@ bases2fastq/project: fn: "*_RunStats.json" contents: "SampleStats" num_lines: 100 +bases2fastq/manifest: + fn: "RunManifest.json" + contents: "Settings" + num_lines: 100 bbduk: contents: "Executing jgi.BBDuk" num_lines: 2