From 417a6ac3676988cc0509f839b9af9bd7cd2de1a3 Mon Sep 17 00:00:00 2001 From: Vlad Savelyev Date: Wed, 20 Aug 2025 20:19:17 +0200 Subject: [PATCH 01/35] Make config type checking optional if typeguard is not installed (i.e. for pyodide worker) --- multiqc/validation.py | 41 +++++++++++++++++++++++------------------ 1 file changed, 23 insertions(+), 18 deletions(-) diff --git a/multiqc/validation.py b/multiqc/validation.py index 8cb4125dd1..b5877b1900 100644 --- a/multiqc/validation.py +++ b/multiqc/validation.py @@ -10,7 +10,6 @@ from PIL import ImageColor from pydantic import BaseModel -from typeguard import TypeCheckError, check_type from multiqc import config @@ -221,25 +220,31 @@ def validate_fields(cls, path_in_cfg: Tuple[str, ...], values: Dict[str, Any]) - continue try: - check_type(val, expected_type) - except TypeCheckError as e: - try: # try casting to expected type? - if expected_type is not None: - if expected_type.__name__ in ["Optional", "Union"]: - expected_type = expected_type.__args__[0] - val = expected_type(val) # type: ignore - except Exception: - v_str = repr(val) - if len(v_str) > 20: - v_str = v_str[:20] + "..." - expected_type_str = str(expected_type).replace("typing.", "") - msg = rf"expected type '{expected_type_str}', got '{type(val).__name__}' {v_str}" - add_validation_error(path_in_cfg + (name,), msg) - logger.debug(f"{msg}: {e}") + from typeguard import TypeCheckError, check_type + except ImportError: + logger.debug("typeguard not installed, skipping type checking") + corrected_values[name] = val + else: + try: + check_type(val, expected_type) + except TypeCheckError as e: + try: # try casting to expected type? + if expected_type is not None: + if expected_type.__name__ in ["Optional", "Union"]: + expected_type = expected_type.__args__[0] + val = expected_type(val) # type: ignore + except Exception: + v_str = repr(val) + if len(v_str) > 20: + v_str = v_str[:20] + "..." 
+ expected_type_str = str(expected_type).replace("typing.", "") + msg = rf"expected type '{expected_type_str}', got '{type(val).__name__}' {v_str}" + add_validation_error(path_in_cfg + (name,), msg) + logger.debug(f"{msg}: {e}") + else: + corrected_values[name] = val else: corrected_values[name] = val - else: - corrected_values[name] = val values = corrected_values return values From 001f3c4e7d80667953ccde64300f8a0160408480 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Pontus=20H=C3=B6jer?= Date: Thu, 21 Aug 2025 15:52:47 +0200 Subject: [PATCH 02/35] New module: Seqfu stats (#3271) * add seqfu stats * Minor fix --------- Co-authored-by: Vlad Savelyev --- multiqc/config_defaults.yaml | 1 + multiqc/modules/seqfu/__init__.py | 3 + multiqc/modules/seqfu/seqfu.py | 50 ++++ multiqc/modules/seqfu/stats.py | 300 +++++++++++++++++++++++ multiqc/modules/seqfu/test/test_stats.py | 70 ++++++ multiqc/search_patterns.yaml | 3 + pyproject.toml | 1 + 7 files changed, 428 insertions(+) create mode 100644 multiqc/modules/seqfu/__init__.py create mode 100644 multiqc/modules/seqfu/seqfu.py create mode 100644 multiqc/modules/seqfu/stats.py create mode 100644 multiqc/modules/seqfu/test/test_stats.py diff --git a/multiqc/config_defaults.yaml b/multiqc/config_defaults.yaml index da5c37b9ed..7a1be867bb 100644 --- a/multiqc/config_defaults.yaml +++ b/multiqc/config_defaults.yaml @@ -577,6 +577,7 @@ module_order: - skewer - sortmerna - biobloomtools + - seqfu - fastq_screen - afterqc - fastp diff --git a/multiqc/modules/seqfu/__init__.py b/multiqc/modules/seqfu/__init__.py new file mode 100644 index 0000000000..0d428439b9 --- /dev/null +++ b/multiqc/modules/seqfu/__init__.py @@ -0,0 +1,3 @@ +from .seqfu import MultiqcModule + +__all__ = ["MultiqcModule"] diff --git a/multiqc/modules/seqfu/seqfu.py b/multiqc/modules/seqfu/seqfu.py new file mode 100644 index 0000000000..08035493c4 --- /dev/null +++ b/multiqc/modules/seqfu/seqfu.py @@ -0,0 +1,50 @@ +import logging + +from multiqc.base_module import 
BaseMultiqcModule, ModuleNoSamplesFound + +from .stats import parse_seqfu_stats + +log = logging.getLogger(__name__) + + +class MultiqcModule(BaseMultiqcModule): + """ + Supported commands: + + - `stats`: + + ### seqfu stats + + #### Input files + + `seqfu stats` can generated reports in multiple formats, see https://telatin.github.io/seqfu2/tools/stats.html. Only TSVs with headers (default `seqfu stats` output) are currently detected and parsed by MultiQC. + + :::note + `seqfu stats` has a `--multiqc` option that generates a `_mqc.txt` file can be used with MuliQC as custom content. This is different from this module which enables additional features. + ::: + + #### Configuration + + Sample names are automatically extracted from the "File" columns by default. If you only have one sample per file and prefer to use the filename as the sample name instead, you can set the global `use_filename_as_sample_name` option to `true` or list `seqfu` under it. + """ + + def __init__(self): + super(MultiqcModule, self).__init__( + name="Seqfu", + anchor="seqfu", + target="seqfu", + href="https://telatin.github.io/seqfu2", + info="A general-purpose program to manipulate and parse information from FASTX files", + doi="10.3390/bioengineering8050059", + ) + + n = dict() + + # Call submodule functions + n["stats"] = parse_seqfu_stats(self) + if n["stats"] > 0: + log.info(f"Found {n['stats']} seqfu stats reports") + + # Exit if we didn't find anything + if sum(n.values()) == 0: + raise ModuleNoSamplesFound diff --git a/multiqc/modules/seqfu/stats.py b/multiqc/modules/seqfu/stats.py new file mode 100644 index 0000000000..69e7cf972e --- /dev/null +++ b/multiqc/modules/seqfu/stats.py @@ -0,0 +1,300 @@ +import logging +from typing import Dict, Any + +from multiqc import config +from multiqc.base_module import BaseMultiqcModule, SampleGroupingConfig, ModuleNoSamplesFound +from multiqc.plots import bargraph +from multiqc.types import SampleName, ColumnKey + +log = 
logging.getLogger(__name__) + + +def parse_seqfu_stats(module: BaseMultiqcModule): + """Find Seqfu stats logs and parse their data""" + use_filename = False + if isinstance(config.use_filename_as_sample_name, list): + # Check for module anchor + if module.anchor in config.use_filename_as_sample_name: + use_filename = True + elif config.use_filename_as_sample_name is True: + use_filename = True + + seqfu_stats: Dict[SampleName, Dict[str, Any]] = {} + for f in module.find_log_files("seqfu/stats", filehandles=True): + for sample_name, data in parse_file(f, use_filename): + sample_name = SampleName(module.clean_s_name(sample_name, f=f)) + + if sample_name in seqfu_stats: + log.debug(f"Duplicate sample name found! Overwriting: {sample_name}") + + seqfu_stats[sample_name] = data + + module.add_data_source(f, sample_name) + + # Superfluous function call to confirm that it is used in this module + # Replace None with actual version if it is available + module.add_software_version(None, sample_name) + + # Filter to strip out ignored sample names + seqfu_stats = module.ignore_samples(seqfu_stats) + + if len(seqfu_stats) == 0: + raise ModuleNoSamplesFound + + add_general_stats_cols(module, seqfu_stats) + + plot_sequence_lengths(module, seqfu_stats) + + plot_sequence_counts(module, seqfu_stats) + + # Write parsed report data to a file + module.write_data_file(seqfu_stats, "multiqc_seqfu_stats") + + # Return the number of logs that were found + return len(seqfu_stats) + + +def parse_file(f, use_filename=False): + lines = f["f"].readlines() + + # Check if file is empty + if len(lines) == 1: + log.debug("Empty file detected") + return [] + + if len(lines) > 2 and use_filename: + log.warning(f"File with multiple samples incompatible with option `seqfu_stats_config.use_filename` ({f['f']})") + return [] + + # Extract columns from first line + cols = [c for c in lines[0].strip().split("\t")] + + # Parse sample(s) data + for line in lines[1:]: + values = line.strip().split("\t") + 
row = dict(zip(cols, values)) + + sample_name = row["File"] + + # Using file name if generated from stdin (File='-') or option `use_filename` selected + if sample_name == "-" or use_filename: + sample_name = f["s_name"] + + data = {k: float(v) for k, v in row.items() if k != "File"} + yield sample_name, data + + +def add_general_stats_cols(module: BaseMultiqcModule, seqfu_stats: Dict[SampleName, Dict[str, Any]]): + # Add columns to General Stats Table + general_stats_headers = get_general_stats_headers() + + cols_to_weighted_average = [ + (ColumnKey("Avg"), ColumnKey("#Seq")), + (ColumnKey("N50"), ColumnKey("#Seq")), + (ColumnKey("N75"), ColumnKey("#Seq")), + (ColumnKey("N90"), ColumnKey("#Seq")), + (ColumnKey("auN"), ColumnKey("#Seq")), + (ColumnKey("Min"), ColumnKey("#Seq")), + (ColumnKey("Max"), ColumnKey("#Seq")), + ] + + no_gc = not any("%GC" in row for row in seqfu_stats.values()) + if not no_gc: + cols_to_weighted_average.append((ColumnKey("%GC"), ColumnKey("#Seq"))) + + general_stats_grouping_config = SampleGroupingConfig( + cols_to_sum=[ColumnKey(k) for k in ["#Seq", "Total bp"]], cols_to_weighted_average=cols_to_weighted_average + ) + + module.general_stats_addcols( + data_by_sample={s_name: {ColumnKey(k): v for k, v in data.items()} for s_name, data in seqfu_stats.items()}, + headers=general_stats_headers, + namespace="stats", + group_samples_config=general_stats_grouping_config, + ) + + +def all_same_length(seqfu_stats: Dict[SampleName, Dict[str, Any]]): + """Check if all sequences are the same length""" + lengths: set[float] = set() + for col in ["Min", "Max"]: + lengths.update(data[col] for data in seqfu_stats.values()) + return len(lengths) == 1 + + +def plot_sequence_lengths(module: BaseMultiqcModule, seqfu_stats: Dict[SampleName, Dict[str, Any]]): + """ + Plot sequence length statistics as a bar graph with switches for different stats + """ + if all_same_length(seqfu_stats): + log.debug("All samples have sequences of a single length") + + # Show a 
message if all sequences are the same length + # code inspired by FastQC module + length = seqfu_stats[next(iter(seqfu_stats))]["Min"] + module.add_section( + name="Sequence lengths", + anchor="seqfu-stats-lengths", + content=f'
All samples have sequences of a single length ({length:,.0f} bp)
', + ) + + return + + seqfu_lengths_cols = ["Avg", "N50", "N75", "N90", "auN", "Min", "Max"] + + seqfu_lengths_cols_labels = [ + {"name": "Mean", "ylab": "Mean length"}, + {"name": "N50", "ylab": "N50 length"}, + {"name": "N75", "ylab": "N75 length"}, + {"name": "N90", "ylab": "N90 length"}, + {"name": "auN", "ylab": "auN length"}, + {"name": "Min", "ylab": "Min length"}, + {"name": "Max", "ylab": "Max length"}, + ] + + seqfu_lengths_data = [] + for c in seqfu_lengths_cols: + seqfu_lengths_data.append({str(s): {c: seqfu_stats[s][c]} for s in seqfu_stats.keys()}) + + module.add_section( + name="Sequence lengths", + anchor="seqfu-stats-lengths", + id="seqfu-stats-lengths", + description="Sequence lengths statistics from `seqfu stats`", + helptext=""" +- Mean: average sequence length +- N50: 50% of sequences are longer than this +- N75: 75% of sequences are longer than this +- N90: 90% of sequences are longer than this +- auN: Area under the Nx sequence length curve +- Min: minimum sequence length +- Max: maximum sequence length +""", + plot=bargraph.plot( + seqfu_lengths_data, + pconfig={ + "id": "seqfu-stats-lengths-barplot", + "title": "Seqfu stats: Sequence length statistics", + "ymin": 0, + "cpswitch": False, + "ysuffix": " bp", + "tt_decimals": 0, + "data_labels": seqfu_lengths_cols_labels, + }, + ), + ) + + +def plot_sequence_counts(module: BaseMultiqcModule, seqfu_stats: Dict[SampleName, Dict[str, Any]]): + """ + Plot sequence count statistics as a bar graph with switches for sequences and bases + """ + seqfu_counts_cols = ["#Seq", "Total bp"] + + seqfu_counts_cols_labels = [ + {"name": "Sequences", "ylab": "Sequences"}, + {"name": "Bases", "ylab": "Bases"}, + ] + + seqfu_counts_data = [] + for c in seqfu_counts_cols: + seqfu_counts_data.append({str(s): {c: seqfu_stats[s][c]} for s in seqfu_stats.keys()}) + + module.add_section( + name="Sequence counts", + anchor="seqfu-stats-counts", + id="seqfu-stats-counts", + description="Sequence count statistics from 
`seqfu stats`", + plot=bargraph.plot( + seqfu_counts_data, + pconfig={ + "id": "seqfu-stats-counts-barplot", + "title": "Seqfu stats: Sequence count statistics", + "ymin": 0, + "ysuffix": "", + "cpswitch": False, + "tt_decimals": 0, + "data_labels": seqfu_counts_cols_labels, + }, + ), + ) + + +def get_general_stats_headers(): + return { + "#Seq": { + "title": "Seqs", + "description": "Number of sequences", + "shared_key": "read_count", + "scale": "Oranges", + }, + "Total bp": { + "title": "Bases", + "description": "Number of bases", + "shared_key": "base_count", + "scale": "Purples", + }, + "Avg": { + "title": "Mean len", + "description": "Average sequence length", + "format": "{:,.0f}", + "scale": "Greens", + "suffix": "bp", + }, + "N50": { + "title": "N50 len", + "description": "50% of the sequences are longer than this size", + "format": "{:,.0f}", + "scale": "Blues", + "suffix": "bp", + "hidden": True, + }, + "N75": { + "title": "N75 len", + "description": "75% of the sequences are longer than this size", + "format": "{:,.0f}", + "scale": "Blues", + "suffix": "bp", + "hidden": True, + }, + "N90": { + "title": "N90 len", + "description": "90% of the sequences are longer than this size", + "format": "{:,.0f}", + "scale": "Blues", + "suffix": "bp", + "hidden": True, + }, + "auN": { + "title": "auN len", + "description": "Area under the Nx curve", + "format": "{:,.0f}", + "scale": "Blues", + "suffix": "bp", + "hidden": True, + }, + "Min": { + "title": "Min len", + "description": "Length of the shortest sequence", + "format": "{:,.0f}", + "scale": "RdYlGn", + "suffix": "bp", + "hidden": True, + }, + "Max": { + "title": "Max len", + "description": "Length of the longest sequence", + "format": "{:,.0f}", + "scale": "RdYlGn", + "suffix": "bp", + "hidden": True, + }, + "%GC": { + "title": "%GC", + "description": "GC content in sequences", + "format": "{:.1%}", + "scale": "Oranges", + "min": 0, + "max": 1, + }, + } diff --git a/multiqc/modules/seqfu/test/test_stats.py 
b/multiqc/modules/seqfu/test/test_stats.py new file mode 100644 index 0000000000..bae8fe9f86 --- /dev/null +++ b/multiqc/modules/seqfu/test/test_stats.py @@ -0,0 +1,70 @@ +import pytest + +from multiqc import report, config +from multiqc.base_module import ModuleNoSamplesFound +from multiqc.types import SampleName +from multiqc.modules.seqfu.stats import all_same_length +from multiqc.modules.seqfu.seqfu import MultiqcModule +from multiqc.utils import testing + + +@pytest.fixture +def data_dir(): + return testing.data_dir() + + +NUM_SAMPLES_PER_FILE = { + "paired_end.tsv": 50, + "promethion_fail.tsv": 1, + "promethion_pass_stdin.tsv": 1, + "promethion_pass.tsv": 1, + "single_end_gc.tsv": 7, + "single_end.tsv": 7, +} + + +def test_data_parsed(data_dir): + data_subdir = data_dir / "modules/seqfu/stats" + assert data_subdir.exists() + for path in data_subdir.rglob("*.tsv"): + print(path) + path = data_subdir / path + if path.name not in NUM_SAMPLES_PER_FILE: + continue + + report.reset() + report.analysis_files = [path] + report.search_files(["seqfu"]) + config.preserve_module_raw_data = True + m = MultiqcModule() + assert m.saved_raw_data is not None + assert len(list(m.saved_raw_data.values())[0]) == NUM_SAMPLES_PER_FILE[path.name] + + +def test_empty_file_parsing(data_dir): + path = data_dir / "modules/seqfu/stats/empty.tsv" + assert path.exists() + report.reset() + report.analysis_files = [path] + report.search_files(["seqfu"]) + with pytest.raises(ModuleNoSamplesFound): + _ = MultiqcModule() + + +def test_use_filename_as_sample_name(data_dir): + path = data_dir / "modules/seqfu/stats/promethion_pass.tsv" + assert path.exists() + report.reset() + report.analysis_files = [path] + report.search_files(["seqfu"]) + config.preserve_module_raw_data = True + config.use_filename_as_sample_name = True + m = MultiqcModule() + assert m.saved_raw_data is not None + assert len(list(m.saved_raw_data.values())[0]) == NUM_SAMPLES_PER_FILE[path.name] + assert 
m._clean_s_name(path.name) in list(m.saved_raw_data.values())[0] + + +def test_all_same_length(): + assert all_same_length({SampleName("a"): {"Min": 1, "Max": 1}, SampleName("b"): {"Min": 1, "Max": 1}}) + assert not all_same_length({SampleName("a"): {"Min": 1, "Max": 1}, SampleName("b"): {"Min": 1, "Max": 2}}) diff --git a/multiqc/search_patterns.yaml b/multiqc/search_patterns.yaml index a6513a31ea..ab10a5ae2f 100644 --- a/multiqc/search_patterns.yaml +++ b/multiqc/search_patterns.yaml @@ -919,6 +919,9 @@ samtools/markdup_json: num_lines: 10 sargasso: fn: "overall_filtering_summary.txt" +seqfu/stats: + contents: "File #Seq Total bp Avg N50 N75 N90 auN Min Max" + num_lines: 1 seqwho: contents: ' "Per Base Seq": [' num_lines: 10 diff --git a/pyproject.toml b/pyproject.toml index f3bf76437a..efdf8f097e 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -234,6 +234,7 @@ samblaster = "multiqc.modules.samblaster:MultiqcModule" samtools = "multiqc.modules.samtools:MultiqcModule" sargasso = "multiqc.modules.sargasso:MultiqcModule" seqera_cli = "multiqc.modules.seqera_cli:MultiqcModule" +seqfu = "multiqc.modules.seqfu:MultiqcModule" sequali = "multiqc.modules.sequali:MultiqcModule" seqwho = "multiqc.modules.seqwho:MultiqcModule" seqyclean = "multiqc.modules.seqyclean:MultiqcModule" From 997f52e7ea4ddf926d77acb7967acb13d4b0d234 Mon Sep 17 00:00:00 2001 From: Vlad Savelyev Date: Thu, 21 Aug 2025 16:52:21 +0200 Subject: [PATCH 03/35] WASM workaround: if write_parquet not supported, write csv (#3309) * WASM workaround: if write_parquet not supported, write csv * Fix * Use debug print --- multiqc/core/plot_data_store.py | 22 +++++++++++++++++++--- 1 file changed, 19 insertions(+), 3 deletions(-) diff --git a/multiqc/core/plot_data_store.py b/multiqc/core/plot_data_store.py index 10812f5dd2..36e3fd3e10 100644 --- a/multiqc/core/plot_data_store.py +++ b/multiqc/core/plot_data_store.py @@ -210,6 +210,11 @@ def _write_parquet(df: pl.DataFrame) -> None: # Write to file try: 
df.write_parquet(parquet_file, compression="gzip") + except AttributeError: # 'builtins.PyDataFrame' object has no attribute 'write_parquet' + # Pyodide polars doesn't support write_parquet, fall back to CSV + csv_file = parquet_file.with_suffix(".csv") + logger.debug(f"Parquet writing not supported in Pyodide, falling back to CSV: {csv_file}") + df.write_csv(csv_file) except Exception as e: logger.error(f"Error writing parquet file: {e}") raise @@ -217,14 +222,25 @@ def _write_parquet(df: pl.DataFrame) -> None: def _read_or_create_df() -> pl.DataFrame: parquet_file = tmp_dir.parquet_file() + csv_file = parquet_file.with_suffix(".csv") - # Update existing file or create new one + # Try to read parquet first, then fall back to CSV if parquet_file.exists(): try: return pl.read_parquet(parquet_file) - except Exception as e: - logger.error(f"Error updating parquet file with metadata: {e}") + logger.error(f"Error reading parquet file: {e}") + if config.strict: + raise e + elif csv_file.exists(): + try: + # Read CSV and convert creation_date back to datetime + df = pl.read_csv(csv_file) + if "creation_date" in df.columns: + df = df.with_columns(pl.col("creation_date").str.to_datetime()) + return df + except Exception as e: + logger.error(f"Error reading CSV file: {e}") if config.strict: raise e else: From 805ea23b3b0170a43ac46f358bf62ea9dda3a143 Mon Sep 17 00:00:00 2001 From: Vlad Savelyev Date: Fri, 22 Aug 2025 12:43:11 +0200 Subject: [PATCH 04/35] File search: do not read files if contents exclusion patterns are not provided (#3312) * Do not read file if contents exclusion patterns are not provided * Fix tests * Clean config in tests * More cleaning --- .gitignore | 2 ++ .../umicollapse/tests/test_umicollapse.py | 11 +++++++++ .../modules/umitools/tests/test_umitools.py | 11 +++++++++ multiqc/report.py | 19 +++++++-------- tests/test_custom_content.py | 10 ++++++++ tests/test_modules_run.py | 10 ++++++++ tests/test_plots.py | 23 +++++++++++++++++++ 
tests/test_sample_name_cleaning.py | 10 ++++++++ tests/test_search_files.py | 17 ++++++++++++++ 9 files changed, 104 insertions(+), 9 deletions(-) diff --git a/.gitignore b/.gitignore index c4e0f99c18..6804320e70 100644 --- a/.gitignore +++ b/.gitignore @@ -59,3 +59,5 @@ package.json .env .claude + +Pipfile diff --git a/multiqc/modules/umicollapse/tests/test_umicollapse.py b/multiqc/modules/umicollapse/tests/test_umicollapse.py index 34d6c59cd5..574754bb02 100644 --- a/multiqc/modules/umicollapse/tests/test_umicollapse.py +++ b/multiqc/modules/umicollapse/tests/test_umicollapse.py @@ -1,10 +1,21 @@ +import pytest from multiqc import config, report from multiqc.modules.umicollapse import MultiqcModule from multiqc.plots.table_object import InputRow from multiqc.types import SampleName, SectionKey +@pytest.fixture(autouse=True) +def reset_config(): + """Reset config state after each test.""" + original_preserve = config.preserve_module_raw_data + yield + config.preserve_module_raw_data = original_preserve + + def test_parse(tmp_path): + config.reset() + # File without a file name or content match. 
f1 = tmp_path / "SAMPLE.log" f1.write_text("Irrelevant file") diff --git a/multiqc/modules/umitools/tests/test_umitools.py b/multiqc/modules/umitools/tests/test_umitools.py index 523babb45f..59a8539697 100644 --- a/multiqc/modules/umitools/tests/test_umitools.py +++ b/multiqc/modules/umitools/tests/test_umitools.py @@ -1,9 +1,20 @@ +import pytest from multiqc import report from multiqc import config from multiqc.modules.umitools import MultiqcModule +@pytest.fixture(autouse=True) +def reset_config(): + """Reset config state after each test.""" + original_preserve = config.preserve_module_raw_data + yield + config.preserve_module_raw_data = original_preserve + + def test_parse_name(tmp_path): + config.reset() + f1 = tmp_path / "stdout.log" f1.write_text("""\ # output generated by extract -I INPUT.fastq.gz -S OUTPUT.fastq.gz diff --git a/multiqc/report.py b/multiqc/report.py index a8a674ece1..83bf283b5a 100644 --- a/multiqc/report.py +++ b/multiqc/report.py @@ -812,15 +812,16 @@ def exclude_file(sp, f: SearchFile): return True # Search the contents of the file - for num_lines, line_block in f.line_block_iterator(): - if sp.exclude_contents: - for pat in sp.exclude_contents: - if pat and pat in line_block: - return True - if sp.exclude_contents_re: - for pat in sp.exclude_contents_re: - if pat and re.search(pat, line_block): - return True + if sp.exclude_contents or sp.exclude_contents_re: + for _, line_block in f.line_block_iterator(): + if sp.exclude_contents: + for pat in sp.exclude_contents: + if pat and pat in line_block: + return True + if sp.exclude_contents_re: + for pat in sp.exclude_contents_re: + if pat and re.search(pat, line_block): + return True return False diff --git a/tests/test_custom_content.py b/tests/test_custom_content.py index d9d412fb7a..7dd5b4fb95 100644 --- a/tests/test_custom_content.py +++ b/tests/test_custom_content.py @@ -20,6 +20,16 @@ from multiqc.validation import ModuleConfigValidationError +@pytest.fixture(autouse=True) +def 
reset_config(): + """Reset config state after each test.""" + original_strict = config.strict + original_run_modules = config.run_modules[:] + yield + config.strict = original_strict + config.run_modules[:] = original_run_modules + + def test_linegraph_single_sample_txt(data_dir): path = data_dir / "custom_content" / "embedded_config" / "linegraph_single_sample_txt_mqc.txt" """ diff --git a/tests/test_modules_run.py b/tests/test_modules_run.py index e9110e7bce..f64e2adfa2 100644 --- a/tests/test_modules_run.py +++ b/tests/test_modules_run.py @@ -17,6 +17,16 @@ def multiqc_reset(): reset() +@pytest.fixture(autouse=True) +def reset_config(): + """Reset config state after each test.""" + original_strict = config.strict + original_sample_names_ignore = config.sample_names_ignore[:] + yield + config.strict = original_strict + config.sample_names_ignore[:] = original_sample_names_ignore + + @pytest.mark.parametrize("module_id,entry_point", modules) def test_all_modules(module_id, entry_point, data_dir): """ diff --git a/tests/test_plots.py b/tests/test_plots.py index b03612ca51..c437b5dc67 100644 --- a/tests/test_plots.py +++ b/tests/test_plots.py @@ -15,6 +15,29 @@ from multiqc.validation import ModuleConfigValidationError +@pytest.fixture(autouse=True) +def reset_config(): + """Reset config state during tests that modify global config.""" + original_boxplot_boxpoints = config.boxplot_boxpoints + original_box_min_threshold_no_points = config.box_min_threshold_no_points + original_box_min_threshold_outliers = config.box_min_threshold_outliers + original_development = config.development + original_export_plots = config.export_plots + original_export_plot_formats = getattr(config, "export_plot_formats", None) + original_strict = config.strict + yield + config.boxplot_boxpoints = original_boxplot_boxpoints + config.box_min_threshold_no_points = original_box_min_threshold_no_points + config.box_min_threshold_outliers = original_box_min_threshold_outliers + config.development 
= original_development + config.export_plots = original_export_plots + if original_export_plot_formats is not None: + config.export_plot_formats = original_export_plot_formats + elif hasattr(config, "export_plot_formats"): + delattr(config, "export_plot_formats") + config.strict = original_strict + + def _verify_rendered(plot) -> Plot: assert isinstance(plot, Plot) plot.add_to_report(module_anchor=Anchor("test"), section_anchor=Anchor("test")) diff --git a/tests/test_sample_name_cleaning.py b/tests/test_sample_name_cleaning.py index 8ff8ecd2ae..f9719d1f4e 100644 --- a/tests/test_sample_name_cleaning.py +++ b/tests/test_sample_name_cleaning.py @@ -10,6 +10,16 @@ def base_module(): return BaseMultiqcModule() +@pytest.fixture(autouse=True) +def reset_config(): + """Reset config state after each test.""" + original_fn_clean_exts = config.fn_clean_exts[:] + original_fn_clean_trim = config.fn_clean_trim[:] + yield + config.fn_clean_exts[:] = original_fn_clean_exts + config.fn_clean_trim[:] = original_fn_clean_trim + + def test_no_trim(base_module): config.fn_clean_exts[:] = [] config.fn_clean_trim[:] = [] diff --git a/tests/test_search_files.py b/tests/test_search_files.py index aea67544c4..a574e74948 100644 --- a/tests/test_search_files.py +++ b/tests/test_search_files.py @@ -14,6 +14,23 @@ from multiqc.core.file_search import file_search +@pytest.fixture(autouse=True) +def reset_config(): + """Reset config state after each test.""" + original_sp = getattr(config, "sp", None) + original_run_modules = config.run_modules[:] + original_avail_modules = config.avail_modules.copy() + original_analysis_dir = config.analysis_dir[:] + yield + if original_sp is not None: + config.sp = original_sp + elif hasattr(config, "sp"): + delattr(config, "sp") + config.run_modules[:] = original_run_modules + config.avail_modules = original_avail_modules + config.analysis_dir[:] = original_analysis_dir + + def _test_search_files( search_patterns: Dict, analysis_dir: Path, From 
64dde16c25d2b31d7fd5b13074e95de107ef67b5 Mon Sep 17 00:00:00 2001 From: Vlad Savelyev Date: Fri, 22 Aug 2025 17:02:14 +0200 Subject: [PATCH 05/35] Add pandoc to Docker image MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This adds the pandoc package to the Docker image installation, allowing MultiQC to generate additional output formats when running in Docker containers. 🤖 Generated with [Claude Code](https://claude.ai/code) Co-Authored-By: Claude --- Dockerfile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Dockerfile b/Dockerfile index 8cecd91201..561ac6cfd9 100644 --- a/Dockerfile +++ b/Dockerfile @@ -26,8 +26,8 @@ RUN \ echo "Docker build log: Run apt-get update" 1>&2 && \ apt-get update -y -qq \ && \ - echo "Docker build log: Install procps" 1>&2 && \ - apt-get install -y -qq procps && \ + echo "Docker build log: Install procps and pandoc" 1>&2 && \ + apt-get install -y -qq procps pandoc && \ echo "Docker build log: Clean apt cache" 1>&2 && \ rm -rf /var/lib/apt/lists/* && \ apt-get clean -y && \ From d58e581651e8385503e72dc287b4576ef74d2f57 Mon Sep 17 00:00:00 2001 From: Vlad Savelyev Date: Fri, 22 Aug 2025 17:03:59 +0200 Subject: [PATCH 06/35] Instruct Claude about main branch --- CLAUDE.md | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/CLAUDE.md b/CLAUDE.md index 6aa1db873c..36073274de 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -1,5 +1,9 @@ # CLAUDE.md +# IMPORTANT + +Never push to main branch. When asked to add a change or create a pull request, always check if we are on main first. If we are, create a new branch and push to it. + This file provides guidance to Claude Code (claude.ai/code) when working with code in this repository. 
## Development Commands From 62275162c093c8cab8806605558081c783670045 Mon Sep 17 00:00:00 2001 From: Vlad Savelyev Date: Wed, 27 Aug 2025 16:24:45 +0200 Subject: [PATCH 07/35] Fix box plot AI summaries (#3315) --- multiqc/templates/default/assets/js/plots/box.js | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/multiqc/templates/default/assets/js/plots/box.js b/multiqc/templates/default/assets/js/plots/box.js index 2bc6cc407e..34a7fcdb02 100644 --- a/multiqc/templates/default/assets/js/plots/box.js +++ b/multiqc/templates/default/assets/js/plots/box.js @@ -65,9 +65,7 @@ class BoxPlot extends Plot { return (isInt ? val : parseFloat(val.toFixed(2))) + suffix; }; - prompt += `|${anonymizeSampleName(sample)}|${fmt(min)}|${fmt(q1)}|${fmt(median)}|${fmt(q3)}|${fmt(max)}|${fmt( - mean, - )}|\n`; + prompt += `|${sample}|${fmt(min)}|${fmt(q1)}|${fmt(median)}|${fmt(q3)}|${fmt(max)}|${fmt(mean)}|\n`; }); return prompt; From dcfb9151534ff0a15fa42f38b0b56122983faaa6 Mon Sep 17 00:00:00 2001 From: Vlad Savelyev Date: Thu, 28 Aug 2025 17:05:31 +0200 Subject: [PATCH 08/35] AI summaries: fix when provider undefined --- multiqc/templates/default/assets/js/ai.js | 4 ++-- multiqc/templates/default/assets/js/toolbox.js | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/multiqc/templates/default/assets/js/ai.js b/multiqc/templates/default/assets/js/ai.js index e4e6d7d638..4457e50480 100644 --- a/multiqc/templates/default/assets/js/ai.js +++ b/multiqc/templates/default/assets/js/ai.js @@ -205,7 +205,7 @@ async function summarizeWithAi(button) { let aiApiKey = $("#ai-api-key").val(); let endpoint = $("#ai-endpoint").val(); - if (!modelName && provider.defaultModel) { + if (!modelName && provider && provider.defaultModel) { modelName = provider.defaultModel; $("#ai-model").val(modelName); storeModelName(providerId, modelName); @@ -346,7 +346,7 @@ async function summarizeWithAi(button) { }), ); const endTime = performance.now(); - console.log(`Time to 
generate more: ${endTime - startTime}ms`); + console.log(`Time to run generation: ${endTime - startTime}ms`); if (!isMore && isGlobal) $("#global_ai_summary_more_button_and_disclaimer").show(); disclaimerDiv.show(); }, diff --git a/multiqc/templates/default/assets/js/toolbox.js b/multiqc/templates/default/assets/js/toolbox.js index 9bda73a4a0..e1a8a1077d 100644 --- a/multiqc/templates/default/assets/js/toolbox.js +++ b/multiqc/templates/default/assets/js/toolbox.js @@ -1614,7 +1614,7 @@ function updatePanel(providerId) { } // Doing it here again because model depends on provider const storedModel = getStoredModelName(providerId); - const defaultModel = provider.defaultModel; + const defaultModel = provider && provider.defaultModel ? provider.defaultModel : null; $("#ai-model").val(storedModel || defaultModel); if (providerId === "openai") { @@ -1659,7 +1659,7 @@ $(function () { let model = getStoredModelName(providerId); if (model === null && aiConfigModel !== "None") model = aiConfigModel; - if (model === null && provider.defaultModel) model = provider.defaultModel; + if (model === null && provider && provider.defaultModel) model = provider.defaultModel; $("#ai-model").val(model); let endpoint = getStoredEndpoint(); From 928e8d9c2b9e9347ff08bb799f77228d3cce1ad5 Mon Sep 17 00:00:00 2001 From: Vlad Savelyev Date: Thu, 28 Aug 2025 17:29:46 +0200 Subject: [PATCH 09/35] Fix using default AI provider --- multiqc/config_defaults.yaml | 2 +- multiqc/templates/default/assets/js/toolbox.js | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/multiqc/config_defaults.yaml b/multiqc/config_defaults.yaml index 7a1be867bb..c8edbe5c7f 100644 --- a/multiqc/config_defaults.yaml +++ b/multiqc/config_defaults.yaml @@ -66,7 +66,7 @@ make_pdf: false # AI settings: ai_summary: false ai_summary_full: false -ai_provider: null +ai_provider: seqera ai_model: null ai_custom_endpoint: null ai_auth_type: null diff --git a/multiqc/templates/default/assets/js/toolbox.js 
b/multiqc/templates/default/assets/js/toolbox.js index e1a8a1077d..b417d22637 100644 --- a/multiqc/templates/default/assets/js/toolbox.js +++ b/multiqc/templates/default/assets/js/toolbox.js @@ -1652,7 +1652,8 @@ $(function () { }); // Set initial values from storage or values from Python - const providerId = getStoredProvider() || aiConfigProviderId || "seqera"; + let providerId = getStoredProvider() || aiConfigProviderId; + if (!providerId || providerId === "None") providerId = "seqera"; aiProviderSelect.val(providerId); const provider = AI_PROVIDERS[providerId]; $("#ai-api-key").val(getStoredApiKey(providerId) || ""); From 1ce7127cb9d960c6856afb93d387edb7f457d422 Mon Sep 17 00:00:00 2001 From: Vlad Savelyev Date: Thu, 28 Aug 2025 17:30:05 +0200 Subject: [PATCH 10/35] OpenAI: default to gpt-5 --- multiqc/templates/default/assets/js/toolbox.js | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/multiqc/templates/default/assets/js/toolbox.js b/multiqc/templates/default/assets/js/toolbox.js index b417d22637..058cd65826 100644 --- a/multiqc/templates/default/assets/js/toolbox.js +++ b/multiqc/templates/default/assets/js/toolbox.js @@ -21,8 +21,8 @@ const AI_PROVIDERS = { }, openai: { name: "OpenAI", - defaultModel: "gpt-4o", - suggestedModels: ["gpt-4o", "gpt-4.1"], + defaultModel: "gpt-5", + suggestedModels: ["gpt-5"], apiKeysUrl: "https://platform.openai.com/api-keys", modelsUrl: "https://platform.openai.com/docs/models", }, From e553f29c3da2a6b0ea5abaae8501479b684e894c Mon Sep 17 00:00:00 2001 From: Vlad Savelyev Date: Fri, 29 Aug 2025 14:38:10 +0200 Subject: [PATCH 11/35] Xenium: feedback (#3313) * Do not read file if contents exclusion patterns are not provided * Optimize for lazy dataframe * Optimize multi-sample runs * Remove try-catches * Use heatmap for FoV plot * Fix * Heatmap: if too wide, determine only width from max_width * Optimizations and feedback * Adjust single-sample plots * Add flat_if_very_large into pconfig * Add helptext * 
Fix box plot AI summaries * Clean up * Fix FoV heatmap * Use heatmap for both scenarios * Use barplot instead of heatmap * FoV: use red; percentages by default * AI summaries: fix when provider undefined * Fix using default AI provider * OpenAI: default to gpt-5 * Reflect % in max in table * Vertical line positionin * Log scale for distr --- multiqc/modules/xenium/xenium.py | 1881 ++++++++++++++++++------------ multiqc/plots/bargraph.py | 1 - multiqc/plots/heatmap.py | 13 +- multiqc/plots/plot.py | 9 +- multiqc/plots/table_object.py | 1 + multiqc/plots/violin.py | 1 - 6 files changed, 1157 insertions(+), 749 deletions(-) diff --git a/multiqc/modules/xenium/xenium.py b/multiqc/modules/xenium/xenium.py index 8d6ef9d7cb..479c0fabbe 100644 --- a/multiqc/modules/xenium/xenium.py +++ b/multiqc/modules/xenium/xenium.py @@ -3,10 +3,21 @@ from pathlib import Path from typing import Any, Dict, Optional, Tuple +import numpy as np import polars as pl from multiqc.base_module import BaseMultiqcModule, ModuleNoSamplesFound -from multiqc.plots import bargraph, box, linegraph, scatter, violin +from multiqc.plots import bargraph, box, linegraph, scatter, table +from multiqc.plots.table_object import ColumnDict, TableConfig + +# Try importing scipy, fallback gracefully if not available +try: + import scipy + import scipy.stats + + SCIPY_AVAILABLE = True +except ImportError: + SCIPY_AVAILABLE = False log = logging.getLogger(__name__) @@ -90,7 +101,9 @@ def __init__(self): # Parse transcript quality data transcript_data_by_sample = {} - for transcript_f in self.find_log_files("xenium/transcripts", filecontents=False, filehandles=False): + transcript_files = list(self.find_log_files("xenium/transcripts", filecontents=False, filehandles=False)) + + for transcript_f in transcript_files: parsed_transcript_data = self.parse_transcripts_parquet(transcript_f) if parsed_transcript_data: # Use parent directory name as sample name @@ -128,25 +141,35 @@ def __init__(self): for sample_name, 
cell_data in cells_data_by_sample.items(): if sample_name in data_by_sample: # Add cell area metrics to existing sample data - if "cell_area_mean" in cell_data: - data_by_sample[sample_name]["cell_area_mean"] = cell_data["cell_area_mean"] - if "nucleus_area_mean" in cell_data: - data_by_sample[sample_name]["nucleus_area_mean"] = cell_data["nucleus_area_mean"] - if "nucleus_to_cell_area_ratio_mean" in cell_data: - data_by_sample[sample_name]["nucleus_to_cell_area_ratio_mean"] = cell_data[ - "nucleus_to_cell_area_ratio_mean" - ] + data_by_sample[sample_name]["cell_area_median"] = cell_data["cell_area_median"] + data_by_sample[sample_name]["nucleus_area_median"] = cell_data["nucleus_area_median"] + data_by_sample[sample_name]["nucleus_to_cell_area_ratio_median"] = cell_data[ + "nucleus_to_cell_area_ratio_median" + ] elif cell_data: # Create new sample entry if only cell data exists data_by_sample[sample_name] = {} - if "cell_area_mean" in cell_data: - data_by_sample[sample_name]["cell_area_mean"] = cell_data["cell_area_mean"] - if "nucleus_area_mean" in cell_data: - data_by_sample[sample_name]["nucleus_area_mean"] = cell_data["nucleus_area_mean"] - if "nucleus_to_cell_area_ratio_mean" in cell_data: - data_by_sample[sample_name]["nucleus_to_cell_area_ratio_mean"] = cell_data[ - "nucleus_to_cell_area_ratio_mean" - ] + data_by_sample[sample_name]["cell_area_median"] = cell_data["cell_area_median"] + data_by_sample[sample_name]["nucleus_area_median"] = cell_data["nucleus_area_median"] + data_by_sample[sample_name]["nucleus_to_cell_area_ratio_median"] = cell_data[ + "nucleus_to_cell_area_ratio_median" + ] + + # Use transcript count from parquet file if missing from JSON + for sample_name, transcript_data in transcript_data_by_sample.items(): + if sample_name in data_by_sample: + # Add transcript count if missing from JSON data + if ( + "num_transcripts" not in data_by_sample[sample_name] + or data_by_sample[sample_name]["num_transcripts"] is None + ): + if 
"total_transcripts" in transcript_data: + data_by_sample[sample_name]["num_transcripts"] = transcript_data["total_transcripts"] + elif "total_transcripts" in transcript_data: + # Create new sample entry if only transcript data exists + if sample_name not in data_by_sample: + data_by_sample[sample_name] = {} + data_by_sample[sample_name]["num_transcripts"] = transcript_data["total_transcripts"] # Write parsed data to a file self.write_data_file(data_by_sample, "multiqc_xenium") @@ -157,21 +180,21 @@ def __init__(self): # Create plots - Cell detection metrics are already in general stats table self.add_section( - name="Segmentation Methods", + name="Segmentation Method", anchor="xenium-segmentation", description="Distribution of cell segmentation methods used", helptext=""" This stacked bar chart shows the fraction of cells segmented by each method: - + * **Boundary**: Cells segmented using boundary staining (e.g., ATP1A1/E-cadherin/CD45) - * **Interior**: Cells segmented using interior staining (e.g., 18S RNA) + * **Interior**: Cells segmented using interior staining (e.g., 18S RNA) * **Nuclear Expansion**: Cells segmented by expanding from nucleus boundaries - + **What to look for:** * **Boundary segmentation** typically provides the most accurate cell boundaries * **High nuclear expansion fraction** may indicate poor membrane staining * Consistent ratios across samples of the same tissue type - + **Interpretation:** * >80% boundary segmentation: Excellent membrane staining and segmentation * >50% nuclear expansion: Consider optimizing membrane staining protocols @@ -182,78 +205,71 @@ def __init__(self): # Add transcript quality section if transcript data is available if transcript_data_by_sample: - self.add_section( - name="Transcript Quality Distribution", - anchor="xenium-transcript-quality", - description="Distribution of transcript quality values by codeword category across samples", - helptext=""" - This plot shows transcript quality (QV score) vs. 
transcript count by gene category: - - * **Pre-designed genes**: Standard genes from Xenium panels (blue) - * **Custom genes**: User-added custom targets (orange) - * **Negative controls**: Control probes for background estimation (red/yellow) - * **Genomic controls**: Genomic DNA controls (pink) - - **Quality Score (QV) Interpretation:** - * QV ≥20: High-quality transcripts (≥99% accuracy) - * QV 10-20: Medium quality (90-99% accuracy) - * QV <10: Low-quality transcripts (<90% accuracy) - - **What to look for:** - * Pre-designed genes should cluster at high QV (>20) and reasonable counts - * Negative controls should have low counts and variable quality - * Outlier genes with very low quality or unexpectedly high/low counts - - **Single vs. Multiple samples:** - * Single sample: Scatter plot showing individual genes - * Multiple samples: Line plot showing category trends - """, - plot=self.xenium_transcript_quality_plot(transcript_data_by_sample), - ) - - # Add violin plot for transcript quality by codeword category if available - violin_plot = self.xenium_transcript_category_violin_plot(transcript_data_by_sample) - if violin_plot is not None: + if len(transcript_data_by_sample) == 1: + self.add_section( + name="Transcript Quality", + anchor="xenium-transcript-quality", + description="Transcript quality statistics by gene category", + helptext=""" + This scatter plot shows transcript quality statistics broken down by gene category: + + **Gene Categories:** + * **Pre-designed**: Standard genes from Xenium panels + * **Custom**: User-added custom targets + * **Deprecated**: Genes no longer recommended for use + * **Control**: Control probe sequences (e.g., negative controls) + + **Quality Metrics:** + * **X-axis**: Transcript count per gene category + * **Y-axis**: Quality score distribution for each category + + **Expected patterns:** + * Pre-designed genes typically show the highest counts and quality + * Custom genes may show variable performance depending on 
probe design + * Control probes should show expected low signal + """, + plot=self.xenium_transcript_quality_scatter_plot(transcript_data_by_sample), + ) + else: self.add_section( - name="Transcript Quality by Codeword Category", - anchor="xenium-transcript-category-violin", - description="Distribution of transcript quality values by codeword category (averaged across samples)", + name="Transcript Quality Summary", + anchor="xenium-transcript-quality", + description="Per-sample transcript quality statistics by gene category", helptext=""" - This violin plot shows transcript quality (QV score) distributions for each codeword category: + This table shows transcript quality statistics for each sample, with separate columns for each gene category: - **Codeword Categories:** - * **predesigned_gene**: Standard genes from Xenium panels - * **custom_gene**: User-added custom targets - * **negative_control_probe**: Control probes for background estimation - * **negative_control_codeword**: Control codewords for background estimation - * **genomic_control_probe**: Genomic DNA controls - * **unassigned_codeword**: Unassigned transcripts + **Gene Categories:** + * **Pre-designed**: Standard genes from Xenium panels + * **Custom**: User-added custom targets + * **Negative Control Probe/Codeword**: Control probes for background estimation + * **Genomic Control Probe**: Genomic DNA controls + * **Unassigned/Deprecated Codeword**: Other transcript types **Quality Score (QV) Interpretation:** * QV ≥20: High-quality transcripts (≥99% accuracy) - * QV 10-20: Medium quality (90-99% accuracy) + * QV 10-20: Medium quality (90-99% accuracy) * QV <10: Low-quality transcripts (<90% accuracy) - **What to look for:** - * **predesigned_gene** should have the highest QV distributions (centered around 30-40) - * **custom_gene** typically has slightly lower but still good QV distributions - * **Negative controls** should have variable quality and lower overall counts - * **Large differences** 
between categories may indicate panel design issues + **Table Layout:** + * **Rows**: Individual samples + * **Columns**: Mean QV and Standard Deviation for each category + * Values show quality statistics computed from all transcripts in that category for each sample - **Multiple samples:** - * For multiple samples, data is averaged across all samples to show overall category patterns - * The violin shape shows the density of QV values at each quality level - * Wider sections indicate more transcripts at that quality level + **What to look for:** + * Pre-designed genes should have high mean QV (>20) across all samples + * Consistent quality patterns across samples indicate good data quality + * High standard deviations may indicate heterogeneous quality within a category + * Missing values (empty cells) indicate no transcripts found for that category in that sample """, - plot=violin_plot, + plot=self.xenium_transcript_quality_table(transcript_data_by_sample), ) - # Add molecules per gene distribution if available - molecules_plot = self.xenium_molecules_per_gene_plot(transcript_data_by_sample) - if molecules_plot is not None: + # Add transcripts per gene distribution if available + transcripts_per_gene_plot = self.xenium_transcripts_per_gene_plot(transcript_data_by_sample) + if transcripts_per_gene_plot is not None: self.add_section( - name="Molecules per Gene Distribution", - anchor="xenium-molecules-per-gene", + name="Distribution of Transcripts", + anchor="xenium-transcripts-per-gene", description="Distribution of transcript counts per gene", helptext=""" This histogram shows the distribution of transcript counts per gene across all samples: @@ -280,11 +296,11 @@ def __init__(self): * Clear separation between gene and non-gene distributions * Absence of unusual spikes or gaps in the distribution """, - plot=molecules_plot, + plot=transcripts_per_gene_plot, ) # Add cell area distribution section if cells data is available - if cells_data_by_sample: + if 
cells_data_by_sample and SCIPY_AVAILABLE: area_plot = self.xenium_cell_area_distribution_plot(cells_data_by_sample) if area_plot: self.add_section( @@ -315,13 +331,13 @@ def __init__(self): plot=area_plot, ) - # Add nucleus RNA fraction distribution plot + # Add nucleus RNA fraction distribution plot (scipy required) nucleus_plot = self.xenium_nucleus_rna_fraction_plot(cells_data_by_sample) if nucleus_plot: self.add_section( - name="Distribution of fractions of molecules in nucleus per cell", + name="Fraction of Transcripts in Nucleus", anchor="xenium-nucleus-rna-fraction", - description="Distribution of nucleus RNA molecule fractions across cells", + description="Distribution of the fraction of transcripts found in the nucleus across cells", helptext=""" This plot shows the distribution of the fraction of RNA molecules located in the nucleus versus cytoplasm for each cell: @@ -350,7 +366,7 @@ def __init__(self): ratio_plot = self.xenium_nucleus_cell_area_ratio_plot(cells_data_by_sample) if ratio_plot: self.add_section( - name="Nucleus to cell area distribution", + name="Nucleus to Cell Area", anchor="xenium-nucleus-cell-area-ratio", description="Distribution of nucleus-to-cell area ratios across cells", helptext=""" @@ -386,7 +402,7 @@ def __init__(self): combined_plot = self.xenium_cell_distributions_combined_plot(cells_data_by_sample) if combined_plot: self.add_section( - name="Cell Distribution Analysis", + name="Distribution of Transcripts/Genes per Cell", anchor="xenium-cell-distributions", description="Distribution of transcripts and detected genes per cell", helptext=""" @@ -442,44 +458,297 @@ def __init__(self): self.add_section( name="Field of View Quality", anchor="xenium-fov-quality", - description="Transcript quality distribution by Field of View (FoV)", + description="Field of View quality distribution across QV ranges", helptext=""" - This plot shows transcript quality distributions across different imaging fields (FoVs - Fields of View): + This plot 
shows the distribution of Field of View (FoV) quality across different quality ranges: **What is a Field of View?** * Each FoV represents one microscope imaging area/tile * Large tissue sections are imaged as multiple overlapping FoVs * FoVs are systematically captured in a grid pattern across the tissue - **Plot types:** - * **Single sample**: Box plots showing quality distributions per FoV (median, quartiles, outliers) - * **Multiple samples**: Box plots with aggregated quality distributions per FoV across all samples + **Plot interpretation:** + * **X-axis**: Quality ranges (Low to Excellent QV ranges) + * **Y-axis**: Number of Fields of View in each quality range + * **Colors**: Color-coded by quality level (grey=poor, green=excellent) + * **Bars**: Each sample shown as separate colored bars for comparison - **Box plot interpretation:** - * **Box boundaries**: 25th and 75th percentiles (Q1 and Q3) - * **Center line**: Median quality score - * **Whiskers**: Extend to 1.5 × IQR or the most extreme data point - * **Points**: Individual transcript quality scores (outliers or small datasets) + **Quality ranges:** + * **Low (QV < 20)**: Poor imaging quality - investigate issues (dark grey) + * **Poor (QV 20-25)**: Below optimal quality - may need attention (light grey) + * **Fair (QV 25-30)**: Acceptable quality (lighter grey) + * **Good (QV 30-35)**: Good imaging quality (light green) + * **Excellent (QV ≥ 35)**: Optimal imaging quality (bright green) **What to look for:** - * **Consistent quality** across FoVs (similar median QV values around 30-40) - * **Tight distributions** (narrow boxes indicate consistent quality within FoVs) - * **No systematic patterns**: Random variation is normal, systematic gradients are not - * **Outlier FoVs**: Any FoV with notably poor median quality (<20 QV) - - **Quality thresholds:** - * QV >30: Excellent imaging quality - * QV 20-30: Good quality - * QV <20: Poor quality, investigate issues + * **Good distribution**: Most FoVs 
should be in "Good" or "Excellent" ranges + * **Few poor FoVs**: Minimal counts in "Low" and "Poor" ranges + * **Sample consistency**: Similar distributions across samples **Troubleshooting:** - * Specific low-quality FoVs: Focus/illumination issues, debris, tissue damage + * Many low-quality FoVs: Focus/illumination issues, debris, tissue damage + * Sample inconsistency: Processing or storage differences * Edge effects: FoVs at tissue edges often have lower quality - * Systematic gradients: Temperature, timing, or optical alignment issues """, plot=fov_plot, ) + def _create_non_overlapping_labels( + self, + mean_value, + median_value, + mean_color="red", + median_color="green", + precision=0, + suffix="", + prefix="", + threshold_percent=5, + data_min=None, + data_max=None, + ): + """ + Create vertical line configurations with non-overlapping labels when mean and median are close. + + Args: + mean_value: Mean value for vertical line + median_value: Median value for vertical line + mean_color: Color for mean line (default: "red") + median_color: Color for median line (default: "green") + precision: Decimal places for value display + suffix: Unit suffix to add to labels (e.g., " μm²") + prefix: Prefix for labels (e.g., "Transcripts ", "Genes ") + threshold_percent: If values are within this percentage of plot range, offset labels + data_min: Minimum value of the underlying data range (optional) + data_max: Maximum value of the underlying data range (optional) + + Returns: + List of line configurations with appropriate label positioning + """ + # Calculate plot range for scale-aware overlap detection + if data_min is not None and data_max is not None: + plot_range = data_max - data_min + + # If data range is too small, use mean/median range + if plot_range == 0: + plot_range = max(abs(mean_value - median_value), max(abs(mean_value), abs(median_value), 1)) + else: + # Fall back to using mean/median values to estimate scale + plot_range = max(abs(mean_value), 
abs(median_value), 1) + + # Calculate percentage difference relative to plot scale + value_diff = abs(mean_value - median_value) + range_percent_diff = (value_diff / plot_range) * 100 + + # Format values according to precision + if precision == 0: + mean_str = f"{mean_value:.0f}" + median_str = f"{median_value:.0f}" + else: + mean_str = f"{mean_value:.{precision}f}" + median_str = f"{median_value:.{precision}f}" + + # Create base line configurations + lines = [ + { + "value": float(mean_value), + "color": mean_color, + "dash": "dash", + "width": 2, + "label": f"{prefix}Mean ({mean_str}{suffix})", + }, + { + "value": float(median_value), + "color": median_color, + "dash": "dash", + "width": 2, + "label": f"{prefix}Median ({median_str}{suffix})", + }, + ] + + # If values are too close on the plot scale, create labels with non-breaking spaces to offset them horizontally + if range_percent_diff < threshold_percent: + # Use non-breaking spaces to create horizontal offset + space = " " * 30 + lines[0]["label"] = f"{prefix}Mean ({mean_str}{suffix}){space}" # Add trailing spaces + lines[1]["label"] = f"{space}{prefix}Median ({median_str}{suffix})" # Add leading spaces + + return lines + + def _create_non_overlapping_combined_lines( + self, transcript_values=None, gene_values=None, plot_data=None, threshold_percent=5 + ): + """ + Create all vertical lines for combined plots with intelligent label positioning to avoid any overlaps. 
+ + Args: + transcript_values: Array of transcript values (optional) + gene_values: Array of gene values (optional) + plot_data: Dictionary of plot data to calculate X-axis range (optional) + threshold_percent: Minimum percentage difference relative to plot range + + Returns: + List of all line configurations with non-overlapping labels + """ + import numpy as np + + lines = [] + all_values = [] # Track all line values for overlap detection + + # Collect transcript lines if provided + if transcript_values is not None: + mean_transcripts = np.nanmean(transcript_values) + median_transcripts = np.nanmedian(transcript_values) + + transcript_lines = [ + { + "value": float(mean_transcripts), + "color": "#7cb5ec", + "dash": "dash", + "width": 2, + "label": f"Transcripts Mean ({mean_transcripts:.0f})", + "type": "mean", + "dataset": "transcripts", + }, + { + "value": float(median_transcripts), + "color": "#99c2e8", + "dash": "dash", + "width": 2, + "label": f"Transcripts Median ({median_transcripts:.0f})", + "type": "median", + "dataset": "transcripts", + }, + ] + lines.extend(transcript_lines) + all_values.extend([mean_transcripts, median_transcripts]) + + # Collect gene lines if provided + if gene_values is not None: + mean_genes = np.nanmean(gene_values) + median_genes = np.nanmedian(gene_values) + + gene_lines = [ + { + "value": float(mean_genes), + "color": "#434348", + "dash": "dash", + "width": 2, + "label": f"Genes Mean ({mean_genes:.0f})", + "type": "mean", + "dataset": "genes", + }, + { + "value": float(median_genes), + "color": "#888888", + "dash": "dash", + "width": 2, + "label": f"Genes Median ({median_genes:.0f})", + "type": "median", + "dataset": "genes", + }, + ] + lines.extend(gene_lines) + all_values.extend([mean_genes, median_genes]) + + if not lines: + return [] + + # Sort lines by value for easier overlap detection + lines.sort(key=lambda x: x["value"]) + + # Calculate plot range from actual plot data X values + if plot_data: + all_x_values = [] + for 
dataset in plot_data.values(): + all_x_values.extend(dataset.keys()) + + if all_x_values: + min_value = min(all_x_values) + max_value = max(all_x_values) + plot_range = max_value - min_value + else: + # Fallback to line values if no plot data + all_line_values = [line["value"] for line in lines] + min_value = min(all_line_values) + max_value = max(all_line_values) + plot_range = max_value - min_value + else: + # Fallback to line values if no plot data provided + all_line_values = [line["value"] for line in lines] + min_value = min(all_line_values) + max_value = max(all_line_values) + plot_range = max_value - min_value + + # If plot range is too small, fall back to absolute threshold + if plot_range == 0: + plot_range = max(abs(max_value), 1) # Avoid division by zero + + # Group overlapping lines and apply spacing once per group + processed = set() + + for i in range(len(lines)): + if i in processed: + continue + + line = lines[i] + overlap_group = [i] + + # Find all lines that overlap with this one + for j in range(i + 1, len(lines)): + if j in processed: + continue + + other_line = lines[j] + value_diff = abs(line["value"] - other_line["value"]) + + # Calculate percentage relative to the plot range, not individual values + range_percent_diff = (value_diff / plot_range) * 100 + + if range_percent_diff < threshold_percent: + overlap_group.append(j) + + # Apply spacing to the entire overlap group + if len(overlap_group) > 1: + space = " " * 15 + group_size = len(overlap_group) + + for idx, line_idx in enumerate(overlap_group): + target_line = lines[line_idx] + + if group_size == 2: + # Two lines: one gets trailing space, other gets leading space + if idx == 0: + target_line["label"] = target_line["label"] + space + else: + target_line["label"] = space + target_line["label"] + elif group_size == 3: + # Three lines: spread out with different amounts of spacing + if idx == 0: + target_line["label"] = target_line["label"] + space + space + elif idx == 1: + 
target_line["label"] = space + target_line["label"] + space + else: + target_line["label"] = space + space + target_line["label"] + elif group_size >= 4: + # Four or more lines: maximum spreading + if idx == 0: + target_line["label"] = target_line["label"] + space + space + space + elif idx == 1: + target_line["label"] = target_line["label"] + space + elif idx == group_size - 2: + target_line["label"] = space + target_line["label"] + else: + target_line["label"] = space + space + space + target_line["label"] + + processed.add(line_idx) + + # Clean up temporary fields + for line in lines: + line.pop("type", None) + line.pop("dataset", None) + + return lines + def parse_xenium_metrics(self, f) -> Dict: """Parse Xenium metrics_summary.csv file""" lines = f["f"].splitlines() @@ -602,146 +871,137 @@ def parse_experiment_json(self, f) -> Dict: return {} def parse_transcripts_parquet(self, f) -> Optional[Dict]: - """Parse Xenium transcripts.parquet file to extract quality distribution by codeword""" - # Read the parquet file content - sample more for better scatter plots + """Parse Xenium transcripts.parquet file with optimized lazy dataframe processing + + Only computes aggregated statistics needed for reporting, avoiding per-transcript dictionaries. 
+ + Args: + f: File info dict + """ file_path = Path(f["root"]) / f["fn"] - df = pl.read_parquet(file_path) - # Check if required columns exist + # Use lazy loading to avoid reading entire file into memory + df_lazy = pl.scan_parquet(file_path) + + # Check if required columns exist by scanning schema (avoid performance warning) + schema = df_lazy.collect_schema() required_cols = ["qv", "feature_name"] - if not all(col in df.columns for col in required_cols): + if not all(col in schema for col in required_cols): log.warning(f"Missing required columns in {f['fn']}: {required_cols}") return None - # Group by feature_name and calculate both quality distribution and transcript counts - quality_dist = {} - transcript_counts = {} + # Get total row count efficiently without loading full data + total_transcripts = df_lazy.select(pl.len()).collect().item() - grouped = df.group_by("feature_name").agg( - [ - pl.col("qv").value_counts().alias("qv_counts"), - pl.col("qv").mean().alias("mean_qv"), - pl.len().alias("transcript_count"), - ] + # Compute category statistics directly in lazy dataframe for optimal performance + # This replaces per-transcript dictionaries with aggregated category stats + category_stats = ( + df_lazy.with_columns( + pl.col("feature_name") + .map_elements(lambda x: categorize_feature(str(x))[0], return_dtype=pl.Utf8) + .alias("category") + ) + .group_by("category") + .agg( + [ + pl.col("qv").mean().alias("mean_quality"), + pl.col("qv").std().alias("std_quality"), + pl.col("qv").count().alias("transcript_count"), + pl.col("feature_name").n_unique().alias("feature_count"), + ] + ) + .collect() ) - for row in grouped.iter_rows(named=True): - feature = str(row["feature_name"]) - qv_counts_df = row["qv_counts"] - transcript_counts[feature] = {"count": row["transcript_count"], "mean_quality": row["mean_qv"]} - - # Convert to dictionary format for backward compatibility - qv_dict = {} - for qv_row in qv_counts_df: - qv_dict[qv_row["qv"]] = qv_row["count"] - 
quality_dist[feature] = qv_dict + # Create optimized result structure - only store aggregated category statistics + category_summary = {} + for row in category_stats.iter_rows(named=True): + category = str(row["category"]) + category_summary[category] = { + "mean_quality": row["mean_quality"], + "std_quality": row["std_quality"] or 0.0, # Handle null std for single values + "transcript_count": row["transcript_count"], + "feature_count": row["feature_count"], + } result = { - "quality_distribution": quality_dist, - "transcript_counts": transcript_counts, - "total_transcripts": df.height, - "unique_features": len(quality_dist), + "category_summary": category_summary, + "total_transcripts": total_transcripts, } - # Add codeword category quality analysis if codeword_category column is present - if "codeword_category" in df.columns: - category_quality_distributions = {} - - # Group by codeword_category and collect QV values for violin plots - category_grouped = df.group_by("codeword_category").agg(pl.col("qv").alias("qv_values")) - - for row in category_grouped.iter_rows(named=True): - category = str(row["codeword_category"]) - qv_values = row["qv_values"] - - # Store quality values for violin plots (sample if too many) - if hasattr(qv_values, "to_list"): - # It's a polars Series - if len(qv_values) > 2000: # Sample more for better violin plot - sampled_qv = qv_values.sample(2000, seed=42) - category_quality_distributions[category] = sampled_qv.to_list() - else: - category_quality_distributions[category] = qv_values.to_list() - else: - # It's already a Python list - if len(qv_values) > 2000: - import random - - random.seed(42) - sampled_qv = random.sample(qv_values, 2000) - category_quality_distributions[category] = sampled_qv - else: - category_quality_distributions[category] = qv_values + # Add feature-level transcript counts for scatter plot (single sample case) + # This is needed for the transcript quality scatter plot + feature_stats = ( + 
df_lazy.group_by("feature_name") + .agg( + [ + pl.col("qv").mean().alias("mean_quality"), + pl.col("qv").count().alias("count"), + ] + ) + .collect() + ) - result["category_quality_distributions"] = category_quality_distributions + # Create transcript_counts dictionary for scatter plot + transcript_counts = {} + for row in feature_stats.iter_rows(named=True): + feature_name = str(row["feature_name"]) + transcript_counts[feature_name] = { + "count": row["count"], + "mean_quality": row["mean_quality"], + } - # Add molecules per gene analysis if feature_name and is_gene columns are present - if "feature_name" in df.columns and "is_gene" in df.columns: - # Group by feature_name and calculate molecule count per gene - molecules_per_gene = {} + result["transcript_counts"] = transcript_counts - gene_grouped = df.group_by("feature_name").agg( - [pl.len().alias("molecule_count"), pl.col("is_gene").first().alias("is_gene")] + # Add transcripts per gene analysis if is_gene column is present + if "is_gene" in schema: + transcript_stats = ( + df_lazy.group_by("feature_name") + .agg([pl.len().alias("transcript_count"), pl.col("is_gene").first().alias("is_gene")]) + .collect() ) - for row in gene_grouped.iter_rows(named=True): - feature_name = str(row["feature_name"]) - molecule_count = row["molecule_count"] - is_gene = row["is_gene"] - - molecules_per_gene[feature_name] = {"count": molecule_count, "is_gene": is_gene} - - result["molecules_per_gene"] = molecules_per_gene + if not transcript_stats.is_empty(): + molecules_per_gene = {} + for row in transcript_stats.iter_rows(named=True): + feature_name = str(row["feature_name"]) + molecules_per_gene[feature_name] = { + "count": row["transcript_count"], # This is transcript count per gene + "is_gene": row["is_gene"], + } + result["molecules_per_gene"] = molecules_per_gene # Add FoV quality analysis if fov_name column is present - if "fov_name" in df.columns: - fov_quality_stats = {} - fov_quality_distributions = {} - - # Group by 
FoV and calculate quality stats and distributions - fov_grouped = df.group_by("fov_name").agg( - [ - pl.col("qv").mean().alias("mean_qv"), - pl.col("qv").median().alias("median_qv"), - pl.col("qv").std().alias("std_qv"), - pl.col("qv").alias("qv_values"), # Keep all QV values for distributions - pl.len().alias("transcript_count"), - ] + if "fov_name" in schema: + fov_stats = ( + df_lazy.group_by("fov_name") + .agg( + [ + pl.col("qv").mean().alias("mean_qv"), + pl.col("qv").median().alias("median_qv"), + pl.col("qv").std().alias("std_qv"), + pl.len().alias("transcript_count"), + ] + ) + .collect() ) - for row in fov_grouped.iter_rows(named=True): + fov_quality_stats = {} + fov_medians = [] + for row in fov_stats.iter_rows(named=True): fov_name = str(row["fov_name"]) + median_qv = row["median_qv"] fov_quality_stats[fov_name] = { "mean_quality": row["mean_qv"], - "median_quality": row["median_qv"], - "std_quality": row["std_qv"], + "median_quality": median_qv, + "std_quality": row["std_qv"] or 0.0, "transcript_count": row["transcript_count"], } - - # Store quality values for violin plots (limit to reasonable sample size) - qv_values = row["qv_values"] - - # Check if it's a polars Series or already a list - if hasattr(qv_values, "to_list"): - # It's a polars Series - if len(qv_values) > 1000: - sampled_qv = qv_values.sample(1000, seed=42) - fov_quality_distributions[fov_name] = sampled_qv.to_list() - else: - fov_quality_distributions[fov_name] = qv_values.to_list() - else: - # It's already a Python list - if len(qv_values) > 1000: - import random - - random.seed(42) - sampled_qv = random.sample(qv_values, 1000) - fov_quality_distributions[fov_name] = sampled_qv - else: - fov_quality_distributions[fov_name] = qv_values + if median_qv is not None: + fov_medians.append(median_qv) result["fov_quality_stats"] = fov_quality_stats - result["fov_quality_distributions"] = fov_quality_distributions + result["fov_median_qualities"] = fov_medians # For heatmap generation return 
result @@ -749,135 +1009,196 @@ def parse_cells_parquet(self, f) -> Optional[Dict]: """Parse Xenium cells.parquet file to extract cell-level metrics""" file_path = Path(f["root"]) / f["fn"] - try: - # Read cells parquet file - df = pl.read_parquet(file_path) - - # Check for required columns - required_cols = ["cell_area", "nucleus_area", "total_counts", "transcript_counts"] - missing_cols = [col for col in required_cols if col not in df.columns] - if missing_cols: - log.warning(f"Missing columns in {f['fn']}: {missing_cols}") - return None - - # Calculate summary statistics - cell_stats = {} - - # Cell area distribution stats - cell_area_stats = df["cell_area"].drop_nulls() - if cell_area_stats.len() > 0: - cell_stats.update( - { - "cell_area_mean": cell_area_stats.mean(), - "cell_area_median": cell_area_stats.median(), - "cell_area_std": cell_area_stats.std(), - "cell_area_min": cell_area_stats.min(), - "cell_area_max": cell_area_stats.max(), - } + # Use lazy reading to avoid loading entire file into memory + log.info(f"Processing cells parquet file with memory-efficient lazy read: {file_path}") + # Start with lazy frame to check schema without loading data + lazy_df = pl.scan_parquet(file_path, parallel="none") # parallel execution causing panics + + # Check for required columns using schema + schema = lazy_df.collect_schema() + required_cols = ["cell_area", "nucleus_area", "total_counts", "transcript_counts"] + missing_cols = [col for col in required_cols if col not in schema] + if missing_cols: + log.warning(f"Missing columns in {f['fn']}: {missing_cols}") + return None + + # Get row count efficiently without loading data + total_cells = lazy_df.select(pl.len()).collect().item() + cell_stats = {"total_cells": total_cells} + + # Cell area distribution stats using lazy operations + cell_area_stats = ( + lazy_df.filter(pl.col("cell_area").is_not_null()) + .select( + [ + pl.col("cell_area").mean().alias("mean"), + pl.col("cell_area").median().alias("median"), + 
pl.col("cell_area").std().alias("std"), + pl.col("cell_area").min().alias("min"), + pl.col("cell_area").max().alias("max"), + pl.col("cell_area").count().alias("count"), + ] + ) + .collect() + ) + + if cell_area_stats["count"].item() > 0: + cell_stats.update( + { + "cell_area_mean": cell_area_stats["mean"].item(), + "cell_area_median": cell_area_stats["median"].item(), + "cell_area_std": cell_area_stats["std"].item(), + "cell_area_min": cell_area_stats["min"].item(), + "cell_area_max": cell_area_stats["max"].item(), + } + ) + + # Sample cell area values for distribution plots + count = cell_area_stats["count"].item() + print(f"count: {count}, sample name: {f['s_name']}") + sample_values = ( + lazy_df.filter(pl.col("cell_area").is_not_null()).select("cell_area").collect().to_series().to_list() + ) + cell_stats["cell_area_values"] = sample_values + + # Nucleus area distribution stats using lazy operations + nucleus_area_stats = ( + lazy_df.filter(pl.col("nucleus_area").is_not_null()) + .select( + [ + pl.col("nucleus_area").mean().alias("mean"), + pl.col("nucleus_area").median().alias("median"), + pl.col("nucleus_area").std().alias("std"), + pl.col("nucleus_area").count().alias("count"), + ] + ) + .collect() + ) + + if nucleus_area_stats["count"].item() > 0: + cell_stats.update( + { + "nucleus_area_mean": nucleus_area_stats["mean"].item(), + "nucleus_area_median": nucleus_area_stats["median"].item(), + "nucleus_area_std": nucleus_area_stats["std"].item(), + } + ) + + # Nucleus to cell area ratio (only for non-null values) + ratio_stats = ( + lazy_df.filter( + (pl.col("cell_area").is_not_null()) + & (pl.col("nucleus_area").is_not_null()) + & (pl.col("cell_area") > 0) + ) + .with_columns((pl.col("nucleus_area") / pl.col("cell_area")).alias("ratio")) + .select( + [ + pl.col("ratio").mean().alias("mean"), + pl.col("ratio").median().alias("median"), + pl.col("ratio").count().alias("count"), + ] ) + .collect() + ) - # Nucleus area distribution stats - nucleus_area_stats = 
df["nucleus_area"].drop_nulls() - if nucleus_area_stats.len() > 0: + if ratio_stats["count"].item() > 0: cell_stats.update( { - "nucleus_area_mean": nucleus_area_stats.mean(), - "nucleus_area_median": nucleus_area_stats.median(), - "nucleus_area_std": nucleus_area_stats.std(), + "nucleus_to_cell_area_ratio_mean": ratio_stats["mean"].item(), + "nucleus_to_cell_area_ratio_median": ratio_stats["median"].item(), } ) - # Nucleus to cell area ratio (only for non-null values) - valid_ratio_df = df.filter( - (pl.col("cell_area").is_not_null()) - & (pl.col("nucleus_area").is_not_null()) - & (pl.col("cell_area") > 0) - ) - if valid_ratio_df.height > 0: - ratio = valid_ratio_df["nucleus_area"] / valid_ratio_df["cell_area"] - cell_stats.update( - { - "nucleus_to_cell_area_ratio_mean": ratio.mean(), - "nucleus_to_cell_area_ratio_median": ratio.median(), - } + # Sample ratio values for distribution plots + count = ratio_stats["count"].item() + sample_values = ( + lazy_df.filter( + (pl.col("cell_area").is_not_null()) + & (pl.col("nucleus_area").is_not_null()) + & (pl.col("cell_area") > 0) ) + .with_columns((pl.col("nucleus_area") / pl.col("cell_area")).alias("ratio")) + .select("ratio") + .collect() + .to_series() + .to_list() + ) - # Store nucleus-to-cell area ratio values for distribution plots - # Sample up to 10000 cells for distribution plotting to avoid memory issues - max_cells_for_plot = min(10000, ratio.len()) - if ratio.len() > max_cells_for_plot: - # Random sample - ratio_values = ratio.sample(max_cells_for_plot, seed=42).to_list() - else: - ratio_values = ratio.to_list() - cell_stats["nucleus_to_cell_area_ratio_values"] = ratio_values - - # Total cell count - cell_stats["total_cells"] = df.height - - # Add nucleus RNA fraction if nucleus_count is available - if "nucleus_count" in df.columns: - # Filter out cells with zero total counts to avoid division by zero - valid_cells = df.filter(pl.col("total_counts") > 0) - if valid_cells.height > 0: - nucleus_rna_fraction = 
valid_cells["nucleus_count"] / valid_cells["total_counts"] - cell_stats.update( - { - "nucleus_rna_fraction_mean": nucleus_rna_fraction.mean(), - "nucleus_rna_fraction_median": nucleus_rna_fraction.median(), - } - ) + cell_stats["nucleus_to_cell_area_ratio_values"] = sample_values - # Store nucleus RNA fraction values for distribution plots - # Sample up to 10000 cells for distribution plotting to avoid memory issues - max_cells_for_plot = min(10000, nucleus_rna_fraction.len()) - if nucleus_rna_fraction.len() > max_cells_for_plot: - # Random sample - nucleus_fraction_values = nucleus_rna_fraction.sample(max_cells_for_plot, seed=42).to_list() - else: - nucleus_fraction_values = nucleus_rna_fraction.to_list() - cell_stats["nucleus_rna_fraction_values"] = nucleus_fraction_values - - # Store cell area values for distribution plots - if cell_area_stats.len() > 0: - # Sample up to 10000 cells for distribution plotting to avoid memory issues - max_cells_for_plot = min(10000, cell_area_stats.len()) - if cell_area_stats.len() > max_cells_for_plot: - # Random sample - cell_area_values = cell_area_stats.sample(max_cells_for_plot, seed=42).to_list() - else: - cell_area_values = cell_area_stats.to_list() - cell_stats["cell_area_values"] = cell_area_values - - # Store transcript counts per cell (total_counts) for distribution plots - total_counts_stats = df["total_counts"].drop_nulls() - if total_counts_stats.len() > 0: - # Sample up to 10000 cells for distribution plotting to avoid memory issues - max_cells_for_plot = min(10000, total_counts_stats.len()) - if total_counts_stats.len() > max_cells_for_plot: - # Random sample - transcript_counts_values = total_counts_stats.sample(max_cells_for_plot, seed=42).to_list() - else: - transcript_counts_values = total_counts_stats.to_list() - cell_stats["transcript_counts_values"] = transcript_counts_values - - # Store detected genes per cell (transcript_counts) for distribution plots - detected_genes_stats = 
df["transcript_counts"].drop_nulls() - if detected_genes_stats.len() > 0: - # Sample up to 10000 cells for distribution plotting to avoid memory issues - max_cells_for_plot = min(10000, detected_genes_stats.len()) - if detected_genes_stats.len() > max_cells_for_plot: - # Random sample - detected_genes_values = detected_genes_stats.sample(max_cells_for_plot, seed=42).to_list() - else: - detected_genes_values = detected_genes_stats.to_list() - cell_stats["detected_genes_values"] = detected_genes_values + # Store total transcript counts per cell (total_counts) for distribution plots + total_count_check = ( + lazy_df.filter(pl.col("total_counts").is_not_null()) + .select(pl.col("total_counts").count().alias("count")) + .collect() + ) + + if total_count_check["count"].item() > 0: + count = total_count_check["count"].item() + sample_values = ( + lazy_df.filter(pl.col("total_counts").is_not_null()) + .select("total_counts") + .collect() + .to_series() + .to_list() + ) + cell_stats["total_counts_values"] = sample_values - return cell_stats + # Store detected genes per cell (transcript_counts) for distribution plots + detected_count_check = ( + lazy_df.filter(pl.col("transcript_counts").is_not_null()) + .select(pl.col("transcript_counts").count().alias("count")) + .collect() + ) - except Exception as e: - log.warning(f"Could not parse cells.parquet file {f['fn']}: {e}") - return None + if detected_count_check["count"].item() > 0: + count = detected_count_check["count"].item() + sample_values = ( + lazy_df.filter(pl.col("transcript_counts").is_not_null()) + .select("transcript_counts") + .collect() + .to_series() + .to_list() + ) + cell_stats["detected_genes_values"] = sample_values + + # Add nucleus RNA fraction if nucleus_count is available + if "nucleus_count" in schema: + nucleus_fraction_stats = ( + lazy_df.filter(pl.col("total_counts") > 0) + .with_columns((pl.col("nucleus_count") / pl.col("total_counts")).alias("fraction")) + .select( + [ + 
pl.col("fraction").mean().alias("mean"), + pl.col("fraction").median().alias("median"), + pl.col("fraction").count().alias("count"), + ] + ) + .collect() + ) + + if nucleus_fraction_stats["count"].item() > 0: + cell_stats.update( + { + "nucleus_rna_fraction_mean": nucleus_fraction_stats["mean"].item(), + "nucleus_rna_fraction_median": nucleus_fraction_stats["median"].item(), + } + ) + + # Sample nucleus fraction values for distribution plots + count = nucleus_fraction_stats["count"].item() + sample_values = ( + lazy_df.filter(pl.col("total_counts") > 0) + .with_columns((pl.col("nucleus_count") / pl.col("total_counts")).alias("fraction")) + .select("fraction") + .collect() + .to_series() + .to_list() + ) + cell_stats["nucleus_rna_fraction_values"] = sample_values + + return cell_stats def check_qc_warnings(self, data_by_sample): """Check for quality control issues and log warnings""" @@ -909,11 +1230,12 @@ def xenium_general_stats_table(self, data_by_sample): "format": "{:,.0f}", }, "fraction_transcripts_assigned": { - "title": "% Transcripts Assigned", + "title": "Transcripts Assigned", "description": "Fraction of transcripts assigned to cells", "suffix": "%", "scale": "RdYlGn", "modify": lambda x: x * 100.0, + "max": 100.0, }, "median_genes_per_cell": { "title": "Genes/Cell", @@ -922,29 +1244,32 @@ def xenium_general_stats_table(self, data_by_sample): "format": "{:,.0f}", }, "fraction_transcripts_decoded_q20": { - "title": "% Q20+ Transcripts", + "title": "Q20+ Transcripts", "description": "Fraction of transcripts decoded with Q20+", "suffix": "%", "scale": "Greens", "modify": lambda x: x * 100.0, + "max": 100.0, }, - "cell_area_mean": { - "title": "Cell Area", - "description": "Mean cell area", + "cell_area_median": { + "title": "Median Cell", + "description": "Median cell area", "suffix": " μm²", "scale": "Blues", "format": "{:,.1f}", + "shared_key": "xenium_cell_area", }, - "nucleus_area_mean": { - "title": "Nucleus Area", - "description": "Mean nucleus area", 
+ "nucleus_area_median": { + "title": "Median Nucleus", + "description": "Median nucleus area", "suffix": " μm²", "scale": "Oranges", "format": "{:,.1f}", + "shared_key": "xenium_cell_area", }, - "nucleus_to_cell_area_ratio_mean": { - "title": "Nucleus/Cell Ratio", - "description": "Mean nucleus to cell area ratio", + "nucleus_to_cell_area_ratio_median": { + "title": "Nucleus/Cell", + "description": "Median nucleus to cell area ratio", "scale": "Greens", "format": "{:.3f}", "max": 1.0, @@ -954,47 +1279,24 @@ def xenium_general_stats_table(self, data_by_sample): def xenium_segmentation_plot(self, data_by_sample): """Create stacked bar plot for segmentation methods""" - plot_data = {} - for s_name, data in data_by_sample.items(): - plot_data[s_name] = { - "segmented_cell_boundary_frac": data.get("segmented_cell_boundary_frac", 0), - "segmented_cell_interior_frac": data.get("segmented_cell_interior_frac", 0), - "segmented_cell_nuc_expansion_frac": data.get("segmented_cell_nuc_expansion_frac", 0), - } - keys = { - "segmented_cell_boundary_frac": {"name": "Boundary", "color": "#1f77b4"}, - "segmented_cell_interior_frac": {"name": "Interior", "color": "#ff7f0e"}, - "segmented_cell_nuc_expansion_frac": {"name": "Nuclear Expansion", "color": "#2ca02c"}, + "segmented_cell_boundary_frac": {"name": "Boundary", "color": "#c72eba"}, + "segmented_cell_interior_frac": {"name": "Interior", "color": "#bbbf34"}, + "segmented_cell_nuc_expansion_frac": {"name": "Nuclear Expansion", "color": "#426cf5"}, } config = { "id": "xenium_segmentation", - "title": "Xenium: Cell Segmentation Methods", + "title": "Xenium: Cell Segmentation Method", "ylab": "Fraction", "stacking": "normal", "ymax": 1.0, "cpswitch": False, } - return bargraph.plot(plot_data, keys, config) - - def xenium_transcript_quality_plot(self, transcript_data_by_sample): - """Create adaptive transcript quality plots based on sample count""" - if not transcript_data_by_sample: - return None - - num_samples = 
len(transcript_data_by_sample) + return bargraph.plot(data_by_sample, keys, config) - if num_samples == 1: - # Single sample: scatter plot of transcript count vs mean quality - return self._create_single_sample_scatter(transcript_data_by_sample) - - else: - # Many samples: violin plots with tabs for categories - return self._create_multi_sample(transcript_data_by_sample) - - def _create_single_sample_scatter(self, transcript_data_by_sample): + def xenium_transcript_quality_scatter_plot(self, transcript_data_by_sample): """Create scatter plot - handles both single and multiple samples""" # Prepare scatter data - create individual points for each gene from all samples plot_data: Dict[str, Any] = {} @@ -1056,11 +1358,14 @@ def _create_single_sample_scatter(self, transcript_data_by_sample): "title": title, "xlab": "Total transcripts per gene", "ylab": "Mean calibrated quality of gene transcripts", - "marker_size": 5, + "marker_size": 4, + "marker_line_width": 0, + "opacity": 0.75, "series_label": "transcripts", "xlog": True, "showlegend": True, "groups": category_order, + "flat_if_very_large": False, } return scatter.plot(final_plot_data, config) @@ -1111,62 +1416,101 @@ def _create_multi_sample(self, transcript_data_by_sample): return linegraph.plot(list(datasets.values()), config) - def xenium_cell_area_plot(self, cells_data_by_sample): - """Create bar plot for cell area metrics""" - plot_data = {} - for s_name, data in cells_data_by_sample.items(): - plot_data[s_name] = {} - - # Only add metrics that exist and are not None/NaN - if ( - "cell_area_mean" in data - and data["cell_area_mean"] is not None - and str(data["cell_area_mean"]).lower() != "nan" - ): - try: - plot_data[s_name]["cell_area_mean"] = float(data["cell_area_mean"]) - except (ValueError, TypeError): - pass - - if ( - "nucleus_area_mean" in data - and data["nucleus_area_mean"] is not None - and str(data["nucleus_area_mean"]).lower() != "nan" - ): - try: - plot_data[s_name]["nucleus_area_mean"] = 
float(data["nucleus_area_mean"]) - except (ValueError, TypeError): - pass - - if ( - "nucleus_to_cell_area_ratio_mean" in data - and data["nucleus_to_cell_area_ratio_mean"] is not None - and str(data["nucleus_to_cell_area_ratio_mean"]).lower() != "nan" - ): - try: - plot_data[s_name]["nucleus_to_cell_ratio"] = float(data["nucleus_to_cell_area_ratio_mean"]) - except (ValueError, TypeError): - pass + def xenium_transcript_quality_table(self, transcript_data_by_sample): + """Create per-sample table showing mean quality for each category (samples as rows, categories as columns)""" + if not transcript_data_by_sample: + return None + + # Collect all categories across samples to create consistent columns + all_categories = set() + for sample_data in transcript_data_by_sample.values(): + if "category_summary" in sample_data: + all_categories.update(sample_data["category_summary"].keys()) - # Check if we have any data to plot - has_data = any(bool(sample_data) for sample_data in plot_data.values()) - if not has_data: + if not all_categories: return None - keys = { - "cell_area_mean": {"name": "Mean Cell Area (μm²)", "color": "#1f77b4"}, - "nucleus_area_mean": {"name": "Mean Nucleus Area (μm²)", "color": "#ff7f0e"}, - "nucleus_to_cell_ratio": {"name": "Nucleus/Cell Area Ratio", "color": "#2ca02c"}, - } + # Create table data: samples as rows, categories as columns + table_data = {} + for sample_name, sample_data in transcript_data_by_sample.items(): + if "category_summary" not in sample_data: + continue - config = { - "id": "xenium_cell_area", - "title": "Xenium: Cell Area Metrics", - "ylab": "Area (μm²) / Ratio", - "cpswitch_counts_label": "Values", - } + table_data[sample_name] = {} + + # Add mean quality for each category + for category in all_categories: + if category in sample_data["category_summary"]: + mean_quality = sample_data["category_summary"][category]["mean_quality"] + table_data[sample_name][f"{category} Mean QV"] = mean_quality + else: + 
table_data[sample_name][f"{category} Mean QV"] = None + + # Add standard deviation for each category + for category in all_categories: + if category in sample_data["category_summary"]: + std_quality = sample_data["category_summary"][category]["std_quality"] + table_data[sample_name][f"{category} Std Dev"] = std_quality + else: + table_data[sample_name][f"{category} Std Dev"] = None + + if not table_data: + return None + + # Create table headers for each category (both mean and std dev) + headers: Dict[str, ColumnDict] = {} + + # Sort categories for consistent ordering + sorted_categories = sorted( + all_categories, + key=lambda x: ( + 0 + if x == "Pre-designed" + else 1 + if x == "Custom" + else 2 + if x == "Genomic Control Probe" + else 3 + if x == "Negative Control Probe" + else 4 + if x == "Negative Control Codeword" + else 5 + if x == "Unassigned Codeword" + else 6 + if x == "Deprecated Codeword" + else 7 + ), + ) + + for category in sorted_categories: + # Mean quality column + headers[f"{category} Mean QV"] = { + "title": f"{category} Mean", # Abbreviated for space + "description": f"Mean calibrated quality score (QV) for {category}", + "scale": "Blues", + "format": "{:.2f}", + "suffix": "", + "shared_key": "xenium_transcript_quality", + } - return bargraph.plot(plot_data, keys, config) + # Standard deviation column + headers[f"{category} Std Dev"] = { + "title": f"{category} StdDev", # Abbreviated for space + "description": f"Standard deviation of quality scores for {category}", + "scale": "Oranges", + "format": "{:.2f}", + "suffix": "", + "shared_key": "xenium_transcript_quality", + } + + return table.plot( + table_data, + headers, + pconfig=TableConfig( + id="xenium_transcript_quality_per_sample_table", + title="Xenium: Transcript Quality by Sample and Category", + ), + ) def xenium_cell_area_distribution_plot(self, cells_data_by_sample): """Create cell area distribution plot - line plot for single sample, violin plots for multiple""" @@ -1190,8 +1534,14 @@ 
def xenium_cell_area_distribution_plot(self, cells_data_by_sample): def _create_single_sample_area_density(self, cell_data): """Create density plot for single sample with mean/median lines""" + if not SCIPY_AVAILABLE: + log.warning("scipy not available, skipping density plots. Install scipy for enhanced plotting.") + return None + import numpy as np - from scipy.stats import gaussian_kde + + if SCIPY_AVAILABLE: + from scipy.stats import gaussian_kde cell_areas = cell_data["cell_area_values"] if not cell_areas or len(cell_areas) < 10: @@ -1208,13 +1558,11 @@ def _create_single_sample_area_density(self, cell_data): density_vals = kde(x_vals) # Prepare data for linegraph - datasets = {} density_data = {} for x, y in zip(x_vals, density_vals): density_data[float(x)] = float(y) - datasets["Density"] = density_data - config = { + config: Dict[str, Any] = { "id": "xenium_cell_area_distribution", "title": "Xenium: Cell Area Distribution", "xlab": "Cell area", @@ -1224,28 +1572,20 @@ def _create_single_sample_area_density(self, cell_data): # Add vertical lines for mean and median if "cell_area_mean" in cell_data and "cell_area_median" in cell_data: - config["x_lines"] = [ - { - "value": float(cell_data["cell_area_mean"]), - "color": "red", - "dash": "dash", - "width": 2, - "label": f"Mean ({cell_data['cell_area_mean']:.1f} μm²)", - }, - { - "value": float(cell_data["cell_area_median"]), - "color": "green", - "dash": "dash", - "width": 2, - "label": f"Median ({cell_data['cell_area_median']:.1f} μm²)", - }, - ] + density_keys = [float(k) for k in density_data.keys()] + config["x_lines"] = self._create_non_overlapping_labels( + cell_data["cell_area_mean"], + cell_data["cell_area_median"], + precision=1, + suffix=" μm²", + data_min=min(density_keys), + data_max=max(density_keys), + ) - return linegraph.plot(datasets, config) + return linegraph.plot({"Density": density_data}, config) def _create_multi_sample_area_violins(self, cells_data_by_sample, samples_with_areas): """Create 
box plots for multiple samples - one box per sample""" - from multiqc.plots import box # For box plots, we provide the raw data points grouped by sample data = {} @@ -1265,8 +1605,7 @@ def _create_multi_sample_area_violins(self, cells_data_by_sample, samples_with_a config = { "id": "xenium_cell_area_distribution", "title": "Xenium: Cell Area Distribution", - "ylab": "Cell area (μm²)", - "xlab": "Sample", + "xlab": "Cell area (μm²)", "boxpoints": False, } @@ -1294,10 +1633,11 @@ def xenium_nucleus_rna_fraction_plot(self, cells_data_by_sample): def _create_single_sample_nucleus_density(self, cell_data): """Create density plot for single sample nucleus RNA fractions""" - import numpy as np - from scipy import stats + if not SCIPY_AVAILABLE: + log.warning("scipy not available, skipping nucleus density plots. Install scipy for enhanced plotting.") + return None - from multiqc.plots import linegraph + from scipy import stats nucleus_fractions = cell_data["nucleus_rna_fraction_values"] if not nucleus_fractions: @@ -1316,6 +1656,22 @@ def _create_single_sample_nucleus_density(self, cell_data): x_range = (bin_edges[:-1] + bin_edges[1:]) / 2 density = hist + # Trim long tail: find cutoff where all values above X are below 1% of max + max_density = np.max(density) + threshold = max_density * 0.01 # 1% of max + + # Find the last point where density is above threshold + last_significant_point = len(density) - 1 + for i in range(len(density) - 1, -1, -1): + if density[i] >= threshold: + last_significant_point = i + break + + # Trim the data to only include up to the last significant point + if last_significant_point < len(density) - 1: + x_range = x_range[: last_significant_point + 1] + density = density[: last_significant_point + 1] + # Create the density plot data data = {} data["Nucleus RNA Fraction Density"] = {str(x): y for x, y in zip(x_range, density)} @@ -1324,8 +1680,8 @@ def _create_single_sample_nucleus_density(self, cell_data): config = { "id": 
"xenium_nucleus_rna_fraction_single", - "title": "Distribution of fractions of molecules in nucleus per cell", - "xlab": "Fraction of molecules in nucleus per cell", + "title": "Xenium: Fraction of Transcripts in Nucleus", + "xlab": "Distribution of the fraction of transcripts found in the nucleus across cells", "ylab": "Density", "data_labels": [ {"name": "Density", "ylab": "Density"}, @@ -1333,13 +1689,24 @@ def _create_single_sample_nucleus_density(self, cell_data): } # Add vertical lines for mean and median + mean_fraction = np.nanmean(nucleus_fractions) + median_fraction = np.nanmedian(nucleus_fractions) + + density_keys = [float(k) for k in data["Nucleus RNA Fraction Density"].keys()] + config["x_lines"] = self._create_non_overlapping_labels( + mean_fraction, + median_fraction, + precision=3, + data_min=min(density_keys), + data_max=max(density_keys), + ) + plot = linegraph.plot(data, config) return plot def _create_multi_sample_nucleus_boxes(self, cells_data_by_sample, samples_with_nucleus_data): """Create box plots for multiple samples - one box per sample""" - from multiqc.plots import box # For box plots, we provide the raw data points grouped by sample data = {} @@ -1357,9 +1724,8 @@ def _create_multi_sample_nucleus_boxes(self, cells_data_by_sample, samples_with_ config = { "id": "xenium_nucleus_rna_fraction_multi", - "title": "Distribution of fractions of molecules in nucleus per cell", - "ylab": "Fraction of molecules in nucleus per cell", - "xlab": "Sample", + "title": "Xenium: Fraction of Transcripts in Nucleus", + "xlab": "Distribution of the fraction of transcripts found in the nucleus across cells", "boxpoints": False, } @@ -1387,6 +1753,10 @@ def xenium_nucleus_cell_area_ratio_plot(self, cells_data_by_sample): def _create_single_sample_ratio_density(self, cell_data): """Create density plot for single sample nucleus-to-cell area ratios""" + if not SCIPY_AVAILABLE: + log.warning("scipy not available, skipping plots. 
Install scipy for enhanced plotting.") + return None + import numpy as np from scipy import stats @@ -1415,7 +1785,7 @@ def _create_single_sample_ratio_density(self, cell_data): config = { "id": "xenium_nucleus_cell_area_ratio_single", - "title": "Nucleus to cell area distribution", + "title": "Xenium: Nucleus to Cell Area Distribution", "xlab": "Nucleus-to-cell area ratio", "ylab": "Density", "data_labels": [ @@ -1423,13 +1793,21 @@ def _create_single_sample_ratio_density(self, cell_data): ], } + # Add vertical lines for mean and median + mean_ratio = np.nanmean(ratio_values) + median_ratio = np.nanmedian(ratio_values) + + density_keys = [float(k) for k in data["Nucleus-to-Cell Area Ratio Density"].keys()] + config["x_lines"] = self._create_non_overlapping_labels( + mean_ratio, median_ratio, precision=3, data_min=min(density_keys), data_max=max(density_keys) + ) + plot = linegraph.plot(data, config) return plot def _create_multi_sample_ratio_boxes(self, cells_data_by_sample, samples_with_ratio_data): """Create box plots for multiple samples - one box per sample""" - from multiqc.plots import box # For box plots, we provide the raw data points grouped by sample data = {} @@ -1447,165 +1825,104 @@ def _create_multi_sample_ratio_boxes(self, cells_data_by_sample, samples_with_ra config = { "id": "xenium_nucleus_cell_area_ratio_multi", - "title": "Nucleus to cell area distribution", - "ylab": "Nucleus-to-cell area ratio", - "xlab": "Sample", + "title": "Xenium: Nucleus to Cell Area Distribution", + "xlab": "Nucleus-to-cell area ratio", "boxpoints": False, } return box.plot(data, config) def xenium_fov_quality_plot(self, transcript_data_by_sample): - """Create adaptive FoV quality plot - violin plots for single sample, summary for multiple""" - fov_data_found = False - samples_with_fov = [] + """Create bar plot showing FoV count distribution across QV ranges""" + # Collect median quality per FoV per sample + fov_median_by_sample = {} - # Check which samples have FoV 
data for s_name, data in transcript_data_by_sample.items(): - if "fov_quality_distributions" in data and "fov_quality_stats" in data: - fov_data_found = True - samples_with_fov.append(s_name) - - if not fov_data_found: + data = transcript_data_by_sample[s_name] + if "fov_quality_stats" in data: + fov_median_by_sample[s_name] = {} + fov_stats = data["fov_quality_stats"] + for fov_name, stats in fov_stats.items(): + median_quality = stats["median_quality"] + if median_quality is not None: + fov_median_by_sample[s_name][fov_name] = median_quality + + if not fov_median_by_sample: return None - num_samples = len(samples_with_fov) - - if num_samples == 1: - # Single sample: Create violin plot showing quality distributions per FoV - return self._create_single_sample_fov_box(transcript_data_by_sample[samples_with_fov[0]]) - else: - # Multiple samples: Create bar plot showing mean quality per FoV across samples - return self._create_multi_sample_fov_summary(transcript_data_by_sample, samples_with_fov) - - def _create_single_sample_fov_box(self, sample_data): - """Create box plot showing quality distributions for single sample FoVs""" - if "fov_quality_distributions" not in sample_data: - return None + # Define QV ranges (ordered high to low for display) + qv_ranges = [ + ("Excellent (QV ≥ 35)", 35, float("inf")), + ("Good (QV 30-35)", 30, 35), + ("Fair (QV 25-30)", 25, 30), + ("Poor (QV 20-25)", 20, 25), + ("Low (QV < 20)", 0, 20), + ] - plot_data = {} + # Create bar plot data - count FoVs in each QV range per sample + bar_data = {} + for sample_name, fov_qualities in fov_median_by_sample.items(): + bar_data[sample_name] = {} - # Use the raw quality distributions for proper box plots - for fov_name, qv_values in sample_data["fov_quality_distributions"].items(): - if qv_values and len(qv_values) > 0: - # Box plot expects the raw data points - plot_data[fov_name] = qv_values + # Initialize counts for each range + for range_name, _, _ in qv_ranges: + 
bar_data[sample_name][range_name] = 0 - if not plot_data: - return None + # Count FoVs in each range + for fov_name, quality in fov_qualities.items(): + for range_name, min_qv, max_qv in qv_ranges: + if min_qv <= quality < max_qv: + bar_data[sample_name][range_name] += 1 + break config = { - "id": "xenium_fov_quality_single", - "title": "Xenium: transcript quality distribution by field of view", - "xlab": "Field of view", - "series_label": "fields of view", - "ylab": "Quality value (QV)", - "sort_by_median": True, # Use the new core box plot sorting feature - "sort_switch_sorted_active": True, # Start with sorted view active - "boxpoints": False, # Do not show individual data points + "id": "xenium_fov_quality_ranges", + "title": "Xenium: Field of View Quality Distribution", + "xlab": "Quality Range", + "ylab": "Number of Fields of View", + "cpswitch_c_active": False, + "use_legend": True, } - return box.plot(plot_data, config) - - def _create_multi_sample_fov_summary(self, transcript_data_by_sample, samples_with_fov): - """Create box plot showing quality distributions for each FoV aggregated across all samples""" - fov_quality_data = {} - - # Aggregate quality distributions for each FoV across all samples - for s_name in samples_with_fov: - data = transcript_data_by_sample[s_name] - if "fov_quality_distributions" in data: - fov_distributions = data["fov_quality_distributions"] - for fov_name, quality_values in fov_distributions.items(): - if fov_name not in fov_quality_data: - fov_quality_data[fov_name] = [] - # Add all quality values from this sample's FoV to the aggregated distribution - fov_quality_data[fov_name].extend(quality_values) - - if not fov_quality_data: - return None - - config = { - "id": "xenium_fov_quality_multi", - "title": "Xenium: Transcript quality distribution by field of view (averaged across samples)", - "xlab": "Quality Score (QV)", - "ylab": "Field of View", - "series_label": "fields of view", - "sort_by_median": True, # Use the new core 
box plot sorting feature - "sort_switch_sorted_active": True, # Start with sorted view active - "boxpoints": False, # Do not show individual data points + # Define categories with colors (grey-to-green gradient, ordered high to low) + cats = { + "Excellent (QV ≥ 35)": { + "name": "Excellent (QV ≥ 35)", + "color": "#32CD32", # Bright green for excellent quality + }, + "Good (QV 30-35)": { + "name": "Good (QV 30-35)", + "color": "#90EE90", # Light green for good quality + }, + "Fair (QV 25-30)": { + "name": "Fair (QV 25-30)", + "color": "#FFB6C1", # Light pink for fair quality + }, + "Poor (QV 20-25)": { + "name": "Poor (QV 20-25)", + "color": "#FF8C94", # Medium pink-red for poor quality + }, + "Low (QV < 20)": { + "name": "Low (QV < 20)", + "color": "#DC143C", # Dark red for low quality + }, } - return box.plot(fov_quality_data, config) - - def xenium_transcript_category_violin_plot(self, transcript_data_by_sample): - """Create violin plot showing transcript quality distribution by codeword category""" - # Check if any sample has category quality distributions - samples_with_categories = [] - for s_name, data in transcript_data_by_sample.items(): - if "category_quality_distributions" in data: - samples_with_categories.append(s_name) - - if not samples_with_categories: - return None - - # Aggregate quality distributions across all samples for each category - category_quality_data = {} - - for s_name in samples_with_categories: - data = transcript_data_by_sample[s_name] - if "category_quality_distributions" in data: - category_distributions = data["category_quality_distributions"] - for category, quality_values in category_distributions.items(): - if category not in category_quality_data: - category_quality_data[category] = [] - # Add all quality values from this sample's category to the aggregated distribution - category_quality_data[category].extend(quality_values) - - if not category_quality_data: - return None - - # Create headers for the violin plot - each 
category is a "metric" - headers = {} - for category in category_quality_data.keys(): - headers[category] = { - "title": category, - "description": f"Quality score distribution for {category}", - "suffix": " QV", - "color": GENE_CATS.get(category, {}).get("color", "#888888"), - "min": 0, - "max": 50, # QV scores typically range 0-50 - } - - # Create data dict - for violin plots we need sample -> category -> values - # For multiple samples, we want to show category distributions per sample - # but group all samples under each category tab - data = {} + return bargraph.plot(bar_data, cats, config) - # For each sample, create entries with category-wise data - for s_name in samples_with_categories: - sample_data = transcript_data_by_sample[s_name] - if "category_quality_distributions" in sample_data: - data[s_name] = sample_data["category_quality_distributions"] + def _sort_fov_names(self, fov_names): + """Sort FoV names naturally, handling numeric components if present""" + import re - # If we only have one sample but want to show all categories together, - # we can flatten the structure, otherwise keep sample-wise structure - if len(samples_with_categories) == 1: - # For single sample, use the aggregated data - sample_name = samples_with_categories[0] - data = {sample_name: category_quality_data} + def natural_sort_key(fov_name): + # Split on digits to handle natural sorting (e.g., fov_1, fov_2, fov_10) + parts = re.split(r"(\d+)", str(fov_name)) + return [int(part) if part.isdigit() else part.lower() for part in parts] - config = { - "id": "xenium_transcript_category_violin", - "title": "Xenium: Transcript Quality by Codeword Category (averaged across samples)", - "col1_header": "Category", - "series_label": "transcripts", - } + return sorted(fov_names, key=natural_sort_key) - return violin.plot(data, headers, config) - - def xenium_molecules_per_gene_plot(self, transcript_data_by_sample): - """Create histogram plot showing distribution of molecules per gene with 
separate lines per sample""" + def xenium_transcripts_per_gene_plot(self, transcript_data_by_sample): + """Create histogram plot showing distribution of transcripts per gene with separate lines per sample""" # Check if any sample has molecules per gene data samples_with_molecules = [] for s_name, data in transcript_data_by_sample.items(): @@ -1637,7 +1954,7 @@ def xenium_molecules_per_gene_plot(self, transcript_data_by_sample): data = transcript_data_by_sample[s_name] molecules_data = data["molecules_per_gene"] - for gene_name, gene_info in molecules_data.items(): + for _, gene_info in molecules_data.items(): count = gene_info["count"] if count > 0: if gene_info["is_gene"]: @@ -1655,89 +1972,109 @@ def xenium_molecules_per_gene_plot(self, transcript_data_by_sample): bins = np.logspace(np.log10(min_count), np.log10(max_count), 50) bin_centers = (bins[:-1] + bins[1:]) / 2 - # Check if single sample - handle differently - num_samples = len(samples_with_molecules) - - if num_samples == 1: - # Single sample: Put both Gene and Non-gene lines on the same plot - return self._create_single_sample_molecules_plot( - transcript_data_by_sample[samples_with_molecules[0]], bins, bin_centers, n_mols_threshold - ) - else: - # Multiple samples: Use tabs for Genes vs Non-genes - return self._create_multi_sample_molecules_plot( - transcript_data_by_sample, samples_with_molecules, bins, bin_centers, n_mols_threshold - ) - - def _create_single_sample_molecules_plot(self, sample_data, bins, bin_centers, n_mols_threshold): - """Create single plot with both Gene and Non-gene lines for single sample""" - import numpy as np - - molecules_data = sample_data["molecules_per_gene"] - - # Separate counts by gene type - gene_counts = [] - non_gene_counts = [] - - for gene_name, gene_info in molecules_data.items(): - count = gene_info["count"] - if count > 0: - if gene_info["is_gene"]: - gene_counts.append(count) - else: - non_gene_counts.append(count) - - # Create plot data with both lines - 
plot_data = {} - - if gene_counts: - gene_hist, _ = np.histogram(gene_counts, bins=bins) - gene_line_data = {} - for i, count in enumerate(gene_hist): - gene_line_data[float(bin_centers[i])] = int(count) - plot_data["Genes"] = gene_line_data - - if non_gene_counts: - non_gene_hist, _ = np.histogram(non_gene_counts, bins=bins) - non_gene_line_data = {} - for i, count in enumerate(non_gene_hist): - non_gene_line_data[float(bin_centers[i])] = int(count) - plot_data["Non-genes"] = non_gene_line_data - - if not plot_data: - return None - - config = { - "id": "xenium_molecules_per_gene", - "title": "Xenium: Distribution of Molecules per Gene", - "xlab": "Number of molecules per gene", - "ylab": "Number of features", - } - - # Add vertical line for noise threshold if calculated - if n_mols_threshold is not None and n_mols_threshold > 0: - config["x_lines"] = [ - { - "value": n_mols_threshold, - "color": "grey", - "dash": "dash", - "width": 1, - "label": f"Noise threshold ({n_mols_threshold:.0f})", - } - ] + # Always use multi-sample plot for consistent color-coded representation + return self._create_multi_sample_molecules_plot( + transcript_data_by_sample, samples_with_molecules, bins, bin_centers, n_mols_threshold + ) - return linegraph.plot(plot_data, config) + # def _create_single_sample_molecules_plot(self, sample_data, bins, bin_centers, n_mols_threshold): + # """Create single plot with both Gene and Non-gene lines for single sample""" + # import numpy as np + + # molecules_data = sample_data["molecules_per_gene"] + + # # Separate counts by gene type + # gene_counts = [] + # non_gene_counts = [] + + # for _, gene_info in molecules_data.items(): + # count = gene_info["count"] + # if count > 0: + # if gene_info["is_gene"]: + # gene_counts.append(count) + # else: + # non_gene_counts.append(count) + + # # Create plot data with both lines + # plot_data = {} + # all_histograms = [] + + # if gene_counts: + # gene_hist, _ = np.histogram(gene_counts, bins=bins) + # 
all_histograms.append(gene_hist) + # gene_line_data = {} + # for i, count in enumerate(gene_hist): + # gene_line_data[float(bin_centers[i])] = int(count) + # plot_data["Genes"] = gene_line_data + + # if non_gene_counts: + # non_gene_hist, _ = np.histogram(non_gene_counts, bins=bins) + # all_histograms.append(non_gene_hist) + # non_gene_line_data = {} + # for i, count in enumerate(non_gene_hist): + # non_gene_line_data[float(bin_centers[i])] = int(count) + # plot_data["Non-genes"] = non_gene_line_data + + # if not plot_data: + # return None + + # # Trim long tail: find cutoff where all values above X are below 1% of max + # if all_histograms: + # # Get maximum value across all histograms + # max_value = max(np.max(hist) for hist in all_histograms) + # threshold = max_value * 0.01 # 1% of max + + # # Find the last bin where any histogram has values above threshold + # last_significant_bin = len(bin_centers) - 1 + # for i in range(len(bin_centers) - 1, -1, -1): + # if any(hist[i] >= threshold for hist in all_histograms): + # last_significant_bin = i + # break + + # # Trim the data to only include up to the last significant bin + # if last_significant_bin < len(bin_centers) - 1: + # trimmed_plot_data = {} + # for dataset_name, data in plot_data.items(): + # trimmed_data = {} + # for i, (x_val, y_val) in enumerate(data.items()): + # if i <= last_significant_bin: + # trimmed_data[x_val] = y_val + # trimmed_plot_data[dataset_name] = trimmed_data + # plot_data = trimmed_plot_data + + # config: Dict[str, Any] = { + # "id": "xenium_transcripts_per_gene", + # "title": "Xenium: Distribution of Transcripts", + # "xlab": "Number of transcripts per gene", + # "ylab": "Number of features", + # } + + # # Add vertical line for noise threshold if calculated + # if n_mols_threshold is not None and n_mols_threshold > 0: + # config["x_lines"] = [ + # { + # "value": n_mols_threshold, + # "color": "grey", + # "dash": "dash", + # "width": 1, + # "label": f"Noise threshold 
({n_mols_threshold:.0f})", + # } + # ] + + # return linegraph.plot(plot_data, config) def _create_multi_sample_molecules_plot( self, transcript_data_by_sample, samples_with_molecules, bins, bin_centers, n_mols_threshold ): - """Create tabbed plot with separate lines per sample for multiple samples""" + """Create single plot with all samples shown as separate lines, color-coded by gene type""" import numpy as np - # Create separate datasets for Genes and Non-genes - genes_dataset = {} - non_genes_dataset = {} + from multiqc.plots import linegraph + + plot_data = {} + all_histograms = [] + # Process each sample and separate by gene type for s_name in samples_with_molecules: data = transcript_data_by_sample[s_name] molecules_data = data["molecules_per_gene"] @@ -1746,7 +2083,7 @@ def _create_multi_sample_molecules_plot( sample_gene_counts = [] sample_non_gene_counts = [] - for gene_name, gene_info in molecules_data.items(): + for _, gene_info in molecules_data.items(): count = gene_info["count"] if count > 0: if gene_info["is_gene"]: @@ -1754,44 +2091,71 @@ def _create_multi_sample_molecules_plot( else: sample_non_gene_counts.append(count) - # Create histograms for this sample + # Create histograms for genes (blue lines) if sample_gene_counts: gene_hist, _ = np.histogram(sample_gene_counts, bins=bins) - gene_data = {} + all_histograms.append(gene_hist) + gene_line_data = {} for i, count in enumerate(gene_hist): - gene_data[float(bin_centers[i])] = int(count) - genes_dataset[s_name] = gene_data + gene_line_data[float(bin_centers[i])] = int(count) + plot_data[f"{s_name} (Genes)"] = gene_line_data + # Create histograms for non-genes (black lines) if sample_non_gene_counts: non_gene_hist, _ = np.histogram(sample_non_gene_counts, bins=bins) - non_gene_data = {} + all_histograms.append(non_gene_hist) + non_gene_line_data = {} for i, count in enumerate(non_gene_hist): - non_gene_data[float(bin_centers[i])] = int(count) - non_genes_dataset[s_name] = non_gene_data - - # Create 
datasets list for multiple tabs - datasets = [] - data_labels = [] - - if genes_dataset: - datasets.append(genes_dataset) - data_labels.append({"name": "Genes", "ylab": "Number of genes"}) + non_gene_line_data[float(bin_centers[i])] = int(count) + plot_data[f"{s_name} (Non-genes)"] = non_gene_line_data - if non_genes_dataset: - datasets.append(non_genes_dataset) - data_labels.append({"name": "Non-genes", "ylab": "Number of non-gene features"}) - - if not datasets: + if not plot_data: return None - config = { - "id": "xenium_molecules_per_gene", - "title": "Xenium: Distribution of Molecules per Gene", - "xlab": "Number of molecules per gene", - "ylab": "Number of genes", - "data_labels": data_labels, + # Trim long tail: find cutoff where all values above X are below 1% of max + if all_histograms: + # Get maximum value across all histograms + max_value = max(np.max(hist) for hist in all_histograms) + threshold = max_value * 0.01 # 1% of max + + # Find the last bin where any histogram has values above threshold + last_significant_bin = len(bin_centers) - 1 + for i in range(len(bin_centers) - 1, -1, -1): + if any(hist[i] >= threshold for hist in all_histograms): + last_significant_bin = i + break + + # Trim the data to only include up to the last significant bin + if last_significant_bin < len(bin_centers) - 1: + trimmed_plot_data = {} + for dataset_name, data in plot_data.items(): + trimmed_data = {} + for i, (x_val, y_val) in enumerate(data.items()): + if i <= last_significant_bin: + trimmed_data[x_val] = y_val + trimmed_plot_data[dataset_name] = trimmed_data + plot_data = trimmed_plot_data + + config: Dict[str, Any] = { + "id": "xenium_transcripts_per_gene", + "title": "Xenium: Distribution of Transcripts per Gene", + "xlab": "Number of transcripts per gene", + "ylab": "Number of features", + "series_label": None, + "xlog": True, } + # Add color configuration for genes (blue) and non-genes (black) + colors = {} + for dataset_name in plot_data.keys(): + if "(Genes)" 
in dataset_name: + colors[dataset_name] = "#7cb5ec" # Blue + elif "(Non-genes)" in dataset_name: + colors[dataset_name] = "#434348" # Black + + if colors: + config["colors"] = colors + # Add vertical line for noise threshold if calculated if n_mols_threshold is not None and n_mols_threshold > 0: config["x_lines"] = [ @@ -1804,7 +2168,7 @@ def _create_multi_sample_molecules_plot( } ] - return linegraph.plot(datasets, config) + return linegraph.plot(plot_data, config) def calculate_noise_threshold(self, gene_molecule_counts, quantile=0.99): """ @@ -1842,7 +2206,8 @@ def calculate_noise_threshold(self, gene_molecule_counts, quantile=0.99): std_log = mad * 1.4826 # Calculate upper bound using quantile - from scipy.stats import norm + if SCIPY_AVAILABLE: + from scipy.stats import norm z_score = norm.ppf(quantile) threshold_log = median_log + z_score * std_log @@ -1865,8 +2230,8 @@ def xenium_cell_distributions_combined_plot(self, cells_data_by_sample): samples_with_genes = {} for s_name, data in cells_data_by_sample.items(): - if data and "transcript_counts_values" in data and data["transcript_counts_values"]: - samples_with_transcripts[s_name] = data["transcript_counts_values"] + if data and "total_counts_values" in data and data["total_counts_values"]: + samples_with_transcripts[s_name] = data["total_counts_values"] if data and "detected_genes_values" in data and data["detected_genes_values"]: samples_with_genes[s_name] = data["detected_genes_values"] @@ -1884,16 +2249,26 @@ def xenium_cell_distributions_combined_plot(self, cells_data_by_sample): return self._create_multi_sample_combined_boxes(samples_with_transcripts, samples_with_genes) def _create_single_sample_combined_density(self, samples_with_transcripts, samples_with_genes): - """Create single sample combined density plots for transcripts and genes per cell""" - plot_data = [] - data_labels = [] + """Create single sample combined density plot with transcripts (blue) and genes (grey) on the same plot""" + 
import numpy as np + + from multiqc.plots import linegraph + + plot_data = {} + + # Store raw values for intelligent line positioning + raw_transcript_values = None + raw_gene_values = None # Handle transcripts per cell data if samples_with_transcripts: - s_name, transcript_values = next(iter(samples_with_transcripts.items())) + _, transcript_values = next(iter(samples_with_transcripts.items())) + raw_transcript_values = transcript_values try: import numpy as np - from scipy.stats import gaussian_kde + + if SCIPY_AVAILABLE: + from scipy.stats import gaussian_kde transcript_values = np.array(transcript_values) kde = gaussian_kde(transcript_values) @@ -1901,12 +2276,11 @@ def _create_single_sample_combined_density(self, samples_with_transcripts, sampl x_range = np.linspace(x_min, x_max, 1000) density = kde(x_range) - # Add to plot data with dataset identifier + # Add to plot data transcripts_data = {} for x, y in zip(x_range, density): transcripts_data[float(x)] = float(y) - plot_data.append({s_name: transcripts_data}) - data_labels.append({"name": "Transcripts per cell", "xlab": "Number of transcripts per cell"}) + plot_data["Transcripts per cell"] = transcripts_data except ImportError: # Fallback to histogram if scipy not available @@ -1919,15 +2293,17 @@ def _create_single_sample_combined_density(self, samples_with_transcripts, sampl transcripts_data = {} for x, y in zip(bin_centers, hist): transcripts_data[float(x)] = float(y) - plot_data.append({s_name: transcripts_data}) - data_labels.append({"name": "Transcripts per cell", "xlab": "Number of transcripts per cell"}) + plot_data["Transcripts per cell"] = transcripts_data # Handle detected genes per cell data if samples_with_genes: - s_name, gene_values = next(iter(samples_with_genes.items())) + _, gene_values = next(iter(samples_with_genes.items())) + raw_gene_values = gene_values try: import numpy as np - from scipy.stats import gaussian_kde + + if SCIPY_AVAILABLE: + from scipy.stats import gaussian_kde 
gene_values = np.array(gene_values) kde = gaussian_kde(gene_values) @@ -1939,8 +2315,7 @@ def _create_single_sample_combined_density(self, samples_with_transcripts, sampl genes_data = {} for x, y in zip(x_range, density): genes_data[float(x)] = float(y) - plot_data.append({s_name: genes_data}) - data_labels.append({"name": "Detected genes per cell", "xlab": "Number of detected genes per cell"}) + plot_data["Detected genes per cell"] = genes_data except ImportError: # Fallback to histogram if scipy not available @@ -1953,22 +2328,34 @@ def _create_single_sample_combined_density(self, samples_with_transcripts, sampl genes_data = {} for x, y in zip(bin_centers, hist): genes_data[float(x)] = float(y) - plot_data.append({s_name: genes_data}) - data_labels.append({"name": "Detected genes per cell", "xlab": "Number of detected genes per cell"}) + plot_data["Detected genes per cell"] = genes_data + + if not plot_data: + return None config = { "id": "xenium_cell_distributions_combined", - "title": "Xenium: Cell Distribution Analysis", + "title": "Xenium: Distribution of Transcripts/Genes per Cell", + "xlab": "Number per cell", "ylab": "Density", "smooth_points": 100, - "data_labels": data_labels, } + # Add color configuration + colors = {"Transcripts per cell": "#7cb5ec", "Detected genes per cell": "#434348"} + config["colors"] = colors + + # Add all mean/median lines with intelligent overlap prevention + combined_lines = self._create_non_overlapping_combined_lines( + transcript_values=raw_transcript_values, gene_values=raw_gene_values, plot_data=plot_data + ) + if combined_lines: + config["x_lines"] = combined_lines + return linegraph.plot(plot_data, config) def _create_multi_sample_combined_boxes(self, samples_with_transcripts, samples_with_genes): """Create multi-sample combined box plots for transcripts and genes per cell""" - from multiqc.plots import box plot_data = [] data_labels = [] @@ -1979,7 +2366,7 @@ def _create_multi_sample_combined_boxes(self, 
samples_with_transcripts, samples_ for s_name, transcript_values in samples_with_transcripts.items(): transcripts_data[s_name] = transcript_values plot_data.append(transcripts_data) - data_labels.append({"name": "Transcripts per cell", "ylab": "Number of transcripts per cell"}) + data_labels.append({"name": "Transcripts per Cell", "ylab": "Transcripts per cell"}) # Add detected genes per cell data if samples_with_genes: @@ -1987,12 +2374,13 @@ def _create_multi_sample_combined_boxes(self, samples_with_transcripts, samples_ for s_name, gene_values in samples_with_genes.items(): genes_data[s_name] = gene_values plot_data.append(genes_data) - data_labels.append({"name": "Detected genes per cell", "ylab": "Number of detected genes per cell"}) + data_labels.append({"name": "Detected Genes per Cell", "ylab": "Detected genes per cell"}) config = { "id": "xenium_cell_distributions_combined", - "title": "Xenium: Cell Distribution Analysis", + "title": "Xenium: Distribution of Transcripts/Genes per Cell", "boxpoints": False, + "xlab": "Transcripts per cell", "data_labels": data_labels, } @@ -2003,8 +2391,8 @@ def xenium_transcripts_per_cell_plot(self, cells_data_by_sample): # Filter samples with transcript count data samples_with_transcripts = {} for s_name, data in cells_data_by_sample.items(): - if data and "transcript_counts_values" in data and data["transcript_counts_values"]: - samples_with_transcripts[s_name] = data["transcript_counts_values"] + if data and "total_counts_values" in data and data["total_counts_values"]: + samples_with_transcripts[s_name] = data["total_counts_values"] if not samples_with_transcripts: return None @@ -2025,7 +2413,9 @@ def _create_single_sample_transcripts_density(self, samples_with_transcripts): # Create kernel density estimation try: import numpy as np - from scipy.stats import gaussian_kde + + if SCIPY_AVAILABLE: + from scipy.stats import gaussian_kde transcript_values = np.array(transcript_values) kde = gaussian_kde(transcript_values) 
@@ -2048,6 +2438,14 @@ def _create_single_sample_transcripts_density(self, samples_with_transcripts): "smooth_points": 100, } + # Add vertical lines for mean and median + mean_transcripts = np.mean(transcript_values) + median_transcripts = np.median(transcript_values) + + config["x_lines"] = self._create_non_overlapping_labels( + mean_transcripts, median_transcripts, data_min=x_min, data_max=x_max + ) + return linegraph.plot(plot_data, config) except ImportError: @@ -2069,11 +2467,21 @@ def _create_single_sample_transcripts_density(self, samples_with_transcripts): "ylab": "Number of cells", } + # Add vertical lines for mean and median + mean_transcripts = np.mean(transcript_values) + median_transcripts = np.median(transcript_values) + + config["x_lines"] = self._create_non_overlapping_labels( # type: ignore + mean_transcripts, + median_transcripts, + data_min=np.min(transcript_values), + data_max=np.max(transcript_values), + ) + return linegraph.plot(plot_data, config) def _create_multi_sample_transcripts_boxes(self, samples_with_transcripts): """Create multi-sample transcripts per cell box plots""" - from multiqc.plots import box # Prepare data for box plot plot_data = {} @@ -2116,7 +2524,9 @@ def _create_single_sample_genes_density(self, samples_with_genes): # Create kernel density estimation try: import numpy as np - from scipy.stats import gaussian_kde + + if SCIPY_AVAILABLE: + from scipy.stats import gaussian_kde gene_values = np.array(gene_values) kde = gaussian_kde(gene_values) @@ -2164,7 +2574,6 @@ def _create_single_sample_genes_density(self, samples_with_genes): def _create_multi_sample_genes_boxes(self, samples_with_genes): """Create multi-sample detected genes per cell box plots""" - from multiqc.plots import box # Prepare data for box plot plot_data = {} diff --git a/multiqc/plots/bargraph.py b/multiqc/plots/bargraph.py index 287ce9172b..63a8314908 100644 --- a/multiqc/plots/bargraph.py +++ b/multiqc/plots/bargraph.py @@ -773,7 +773,6 @@ def create( 
axis_controlled_by_switches=["xaxis"], default_tt_label="%{meta}: %{x}", defer_render_if_large=False, # We hide samples on large bar plots, so no need to defer render - flat_if_very_large=True, # However, the data is still embedded into the HTML, and we don't want the report size to inflate ) model.datasets = [ diff --git a/multiqc/plots/heatmap.py b/multiqc/plots/heatmap.py index 50656825e1..a60c1782b9 100644 --- a/multiqc/plots/heatmap.py +++ b/multiqc/plots/heatmap.py @@ -521,7 +521,6 @@ def create( n_series_per_dataset=[max_n_rows], n_samples_per_dataset=[n_samples], defer_render_if_large=False, # We hide samples on large heatmaps, so no need to defer render - flat_if_very_large=True, # However, the data is still embedded into the HTML, and we don't want the report size to inflate ) model.layout.update( @@ -612,16 +611,18 @@ def n_elements_to_size(n: int): width = MAX_WIDTH x_px_per_elem = width / num_cols - if height > MAX_HEIGHT or width > MAX_WIDTH: + if height >= MAX_HEIGHT or width >= MAX_WIDTH: # logger.debug(f"Resizing from {width}x{height} to fit the maximum size {MAX_WIDTH}x{MAX_HEIGHT}") if model.square: px_per_elem = min(MAX_WIDTH / num_cols, MAX_HEIGHT / num_rows) width = height = int(num_rows * px_per_elem) else: - x_px_per_elem = MAX_WIDTH / num_cols - y_px_per_elem = MAX_HEIGHT / num_rows - width = int(num_cols * x_px_per_elem) - height = int(num_rows * y_px_per_elem) + if height >= MAX_HEIGHT: + x_px_per_elem = MAX_WIDTH / num_cols + height = int(num_rows * y_px_per_elem) + if width >= MAX_WIDTH: + y_px_per_elem = MAX_HEIGHT / num_rows + width = int(num_cols * x_px_per_elem) # logger.debug(f"Heatmap size: {width}x{height}, px per element: {x_px_per_elem:.2f}x{y_px_per_elem:.2f}") diff --git a/multiqc/plots/plot.py b/multiqc/plots/plot.py index 6ed8f4d791..9e2ca0e06e 100644 --- a/multiqc/plots/plot.py +++ b/multiqc/plots/plot.py @@ -188,6 +188,7 @@ class PConfig(ValidatedConfig): x_lines: Optional[List[FlatLine]] = None y_lines: 
Optional[List[FlatLine]] = None series_label: str = "samples" + flat_if_very_large: bool = True @classmethod def from_pconfig_dict(cls, pconfig: Union[Mapping[str, Any], "PConfig", None]): @@ -559,7 +560,6 @@ def initialize( axis_controlled_by_switches: Optional[List[str]] = None, default_tt_label: Optional[str] = None, defer_render_if_large: bool = True, - flat_if_very_large: bool = True, series_label: Optional[str] = None, n_samples_per_dataset: Optional[List[int]] = None, ) -> "Plot[DatasetT, PConfigT]": @@ -573,7 +573,6 @@ def initialize( log10 scale and percentage switch buttons, e.g. ["yaxis"] :param default_tt_label: default tooltip label :param defer_render_if_large: whether to defer rendering if the number of data points is large - :param flat_if_very_large: whether to render flat if the number of data points is very large :param series_label: label for the series, e.g. "samples" or "statuses" :param n_samples_per_dataset: number of actual samples for each dataset (assumes series_label are samples) """ @@ -601,7 +600,7 @@ def initialize( if config.plots_force_flat: flat = True if ( - flat_if_very_large + pconfig.flat_if_very_large and not config.plots_force_interactive and n_series_per_dataset[0] > config.plots_flat_numseries ): @@ -726,9 +725,9 @@ def initialize( n_samples = n_samples_per_dataset[idx] else: n_samples = 0 - if n_samples > 1: + if n_samples > 1 and series_label: subtitles += [f"{n_samples} {pconfig.series_label}"] - elif n_series > 1: + elif n_series > 1 and series_label: subtitles += [f"{n_series} {pconfig.series_label}"] if subtitles: dconfig["subtitle"] = ", ".join(subtitles) diff --git a/multiqc/plots/table_object.py b/multiqc/plots/table_object.py index 696313ab4d..b5efa4c789 100644 --- a/multiqc/plots/table_object.py +++ b/multiqc/plots/table_object.py @@ -36,6 +36,7 @@ class TableConfig(PConfig): min: Optional[Union[int, float]] = None parse_numeric: bool = True rows_are_samples: bool = True + flat_if_very_large: bool = False def 
__init__(self, path_in_cfg: Optional[Tuple[str, ...]] = None, **data): super().__init__(path_in_cfg=path_in_cfg or ("table",), **data) diff --git a/multiqc/plots/violin.py b/multiqc/plots/violin.py index e36d56f058..7b947a0784 100644 --- a/multiqc/plots/violin.py +++ b/multiqc/plots/violin.py @@ -915,7 +915,6 @@ def create( default_tt_label=": %{x}", # Violins scale well, so can always keep them interactive and visible: defer_render_if_large=False, - flat_if_very_large=False, ) no_violin: bool = model.pconfig.no_violin From 6bc731782f3144c8987596894716692f48a9a100 Mon Sep 17 00:00:00 2001 From: Vlad Savelyev Date: Fri, 29 Aug 2025 14:52:02 +0200 Subject: [PATCH 12/35] Fix config flag types in schema (#3318) --- multiqc/config.py | 8 ++-- multiqc/utils/config_schema.json | 69 +++++++++++++++++++++++++++++++- multiqc/utils/config_schema.py | 2 +- 3 files changed, 72 insertions(+), 7 deletions(-) diff --git a/multiqc/config.py b/multiqc/config.py index e64bd691f2..da988704f7 100644 --- a/multiqc/config.py +++ b/multiqc/config.py @@ -192,10 +192,10 @@ no_version_check: bool log_filesize_limit: int filesearch_lines_limit: int -report_readerrors: int -skip_generalstats: int -skip_versions_section: int -disable_version_detection: int +report_readerrors: bool +skip_generalstats: bool +skip_versions_section: bool +disable_version_detection: bool versions_table_group_header: str data_format_extensions: Dict[str, str] export_plot_formats: List[str] diff --git a/multiqc/utils/config_schema.json b/multiqc/utils/config_schema.json index ab9cb67c55..0d46acdd99 100644 --- a/multiqc/utils/config_schema.json +++ b/multiqc/utils/config_schema.json @@ -1235,6 +1235,58 @@ "description": "Anonymize samples", "title": "Ai Anonymize Samples" }, + "ai_reasoning_effort": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "default": null, + "description": "Reasoning effort level for OpenAI reasoning models (low, medium, high)", + "title": "Ai Reasoning Effort" + }, + 
"ai_max_completion_tokens": { + "anyOf": [ + { + "type": "integer" + }, + { + "type": "null" + } + ], + "default": null, + "description": "Maximum completion tokens for OpenAI reasoning models", + "title": "Ai Max Completion Tokens" + }, + "ai_extended_thinking": { + "anyOf": [ + { + "type": "boolean" + }, + { + "type": "null" + } + ], + "default": null, + "description": "Enable extended thinking for Anthropic Claude 4 models", + "title": "Ai Extended Thinking" + }, + "ai_thinking_budget_tokens": { + "anyOf": [ + { + "type": "integer" + }, + { + "type": "null" + } + ], + "default": null, + "description": "Budget tokens for Anthropic extended thinking", + "title": "Ai Thinking Budget Tokens" + }, "seqera_api_url": { "anyOf": [ { @@ -1326,6 +1378,19 @@ "description": "Number of series to defer loading - user will need to press button to render plot", "title": "Plots Defer Loading Numseries" }, + "plot_theme": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "default": null, + "description": "Plotly theme template - any registered Plotly theme name (e.g. 'plotly', 'plotly_white', 'plotly_dark', 'ggplot2', 'seaborn', 'simple_white', 'none')", + "title": "Plot Theme" + }, "lineplot_number_of_points_to_hide_markers": { "anyOf": [ { @@ -2104,7 +2169,7 @@ "report_readerrors": { "anyOf": [ { - "type": "integer" + "type": "boolean" }, { "type": "null" @@ -2374,7 +2439,7 @@ } ], "default": null, - "description": "Parquet table format. Long format has columns 'sample_name', 'metric_name' and 'val_raw', \n 'val_raw_type', 'val_str'. To select values for a certain metric, you need to filter based on its name. In contrast, \n the wide format has columns named after metrics, prefixed with table name and optional namespace. It's easier to \n for analytics, however, might hit limits on the maximal number of columns in certain edge cases, as well as\n have potential issues in case of mixed types (i.e. 
if some values are non-numeric, as Parquet requires a column \n to have a single type).\n ", + "description": "Parquet table format. Long format has columns 'sample_name', 'metric_name' and 'val_raw',\n 'val_raw_type', 'val_str'. To select values for a certain metric, you need to filter based on its name. In contrast,\n the wide format has columns named after metrics, prefixed with table name and optional namespace. It's easier to\n for analytics, however, might hit limits on the maximal number of columns in certain edge cases, as well as\n have potential issues in case of mixed types (i.e. if some values are non-numeric, as Parquet requires a column\n to have a single type).\n ", "title": "Parquet Format" } }, diff --git a/multiqc/utils/config_schema.py b/multiqc/utils/config_schema.py index ed4e6abe88..ba806695ce 100644 --- a/multiqc/utils/config_schema.py +++ b/multiqc/utils/config_schema.py @@ -244,7 +244,7 @@ class MultiQCConfig(BaseModel): no_version_check: Optional[bool] = Field(None, description="No version check") log_filesize_limit: Optional[int] = Field(None, description="Log filesize limit") filesearch_lines_limit: Optional[int] = Field(None, description="Filesearch lines limit") - report_readerrors: Optional[int] = Field(None, description="Report read errors") + report_readerrors: Optional[bool] = Field(None, description="Report read errors") skip_generalstats: Optional[bool] = Field(None, description="Skip generalstats") skip_versions_section: Optional[bool] = Field(None, description="Skip versions section") disable_version_detection: Optional[bool] = Field(None, description="Disable version detection") From 01625d6521eee9e6e96b56a4e696b4a79b84c343 Mon Sep 17 00:00:00 2001 From: Vlad Savelyev Date: Fri, 29 Aug 2025 17:00:07 +0200 Subject: [PATCH 13/35] Scatter plot: fix hiding dots by legend click (#3321) --- multiqc/templates/default/assets/js/plots/scatter.js | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git 
a/multiqc/templates/default/assets/js/plots/scatter.js b/multiqc/templates/default/assets/js/plots/scatter.js index 0938e981d8..f8fcfe9b50 100644 --- a/multiqc/templates/default/assets/js/plots/scatter.js +++ b/multiqc/templates/default/assets/js/plots/scatter.js @@ -107,7 +107,7 @@ class ScatterPlot extends Plot { displayName = point.name; } - return { + let trace = { type: "scatter", x: [point.x], y: [point.y], @@ -116,6 +116,13 @@ class ScatterPlot extends Plot { showlegend: showInLegend, ...params, }; + + // Add legendgroup for proper legend click behavior with groups + if (point.group) { + trace.legendgroup = point.group; + } + + return trace; }); } From 8cb5b70dde56663ad622b408acf8c3075cd80a0a Mon Sep 17 00:00:00 2001 From: Vlad Savelyev Date: Wed, 3 Sep 2025 16:07:56 +0200 Subject: [PATCH 14/35] bases2fastq: fix index error (#3321) (#3328) --- multiqc/modules/bases2fastq/plot_samples.py | 20 +++++++++++++++----- multiqc/plots/box.py | 18 +++++++++--------- 2 files changed, 24 insertions(+), 14 deletions(-) diff --git a/multiqc/modules/bases2fastq/plot_samples.py b/multiqc/modules/bases2fastq/plot_samples.py index 4cbfd71cf8..34388a847f 100644 --- a/multiqc/modules/bases2fastq/plot_samples.py +++ b/multiqc/modules/bases2fastq/plot_samples.py @@ -112,7 +112,7 @@ def sequence_content_plot(sample_data, group_lookup_dict, project_lookup_dict, c for cycle in range(len(R1)): base_no = cycle + 1 - tot = sum([R1[cycle]["BaseComposition"][base] for base in ["A", "C", "T", "T"]]) + tot = sum([R1[cycle]["BaseComposition"][base] for base in ["A", "C", "T", "G"]]) for base in "ACTG": base_s_name = "__".join([s_name, base]) @@ -240,13 +240,23 @@ def plot_per_read_gc_hist(sample_data, group_lookup_dict, project_lookup_dict, s R1_gc_counts = sample_data[s_name]["Reads"][0]["PerReadGCCountHistogram"] R2_gc_counts = [0] * len(R1_gc_counts) if len(sample_data[s_name]["Reads"]) > 1: - R2_gc_counts = sample_data[s_name]["Reads"][1]["PerReadGCCountHistogram"] + R2_gc_counts_raw = 
sample_data[s_name]["Reads"][1]["PerReadGCCountHistogram"] + # Handle potential length mismatch between R1 and R2 GC counts + if len(R2_gc_counts_raw) == len(R1_gc_counts): + R2_gc_counts = R2_gc_counts_raw + else: + # Pad shorter list with zeros or truncate longer list + min_len = min(len(R1_gc_counts), len(R2_gc_counts_raw)) + R2_gc_counts = R2_gc_counts_raw[:min_len] + [0] * (len(R1_gc_counts) - min_len) + R1R2_gc_counts = [r1 + r2 for r1, r2 in zip(R1_gc_counts, R2_gc_counts)] totalReads = sum(R1R2_gc_counts) gc_hist_dict.update({s_name: {}}) - RLen = len(R1_gc_counts) - for gc in range(0, RLen): - gc_hist_dict[s_name].update({gc / RLen * 100: R1R2_gc_counts[gc] / totalReads * 100}) + + if totalReads > 0 and len(R1R2_gc_counts) > 0: # Avoid division by zero and empty data + RLen = len(R1_gc_counts) + for gc in range(len(R1R2_gc_counts)): + gc_hist_dict[s_name].update({gc / RLen * 100: R1R2_gc_counts[gc] / totalReads * 100}) # perReadQualityHistogram plot_content = gc_hist_dict diff --git a/multiqc/plots/box.py b/multiqc/plots/box.py index 9360e27cc3..f4d4449c90 100644 --- a/multiqc/plots/box.py +++ b/multiqc/plots/box.py @@ -140,9 +140,9 @@ def create( def create_figure( self, layout: go.Layout, - is_log: bool = False, - is_pct: bool = False, - **kwargs, + is_log: bool = False, # noqa: ARG002 + is_pct: bool = False, # noqa: ARG002 + **kwargs, # noqa: ARG002 ) -> go.Figure: """ Create a Plotly figure for a dataset @@ -166,7 +166,7 @@ def save_data_file(self) -> None: vals_by_sample[sample] = values report.write_data_file(vals_by_sample, self.uid) - def format_dataset_for_ai_prompt(self, pconfig: PConfig, keep_hidden: bool = True) -> str: + def format_dataset_for_ai_prompt(self, pconfig: PConfig, keep_hidden: bool = True) -> str: # noqa: ARG002 """Format dataset as a markdown table with basic statistics""" prompt = "|Sample|Min|Q1|Median|Q3|Max|Mean|\n" prompt += "|---|---|---|---|---|---|---|\n" @@ -419,15 +419,15 @@ def create( yaxis=dict( automargin=True, # to 
make sure there is enough space for ticks labels categoryorder="trace", # keep sample order - hoverformat=model.layout.xaxis.hoverformat, - ticksuffix=model.layout.xaxis.ticksuffix, + hoverformat=getattr(model.layout.xaxis, "hoverformat", None), + ticksuffix=getattr(model.layout.xaxis, "ticksuffix", None), # Prevent JavaScript from automatically parsing categorical values as numbers: type="category", ), xaxis=dict( - title=dict(text=model.layout.yaxis.title.text), - hoverformat=model.layout.yaxis.hoverformat, - ticksuffix=model.layout.yaxis.ticksuffix, + title=dict(text=getattr(getattr(model.layout.yaxis, "title", None), "text", None)), + hoverformat=getattr(model.layout.yaxis, "hoverformat", None), + ticksuffix=getattr(model.layout.yaxis, "ticksuffix", None), ), hovermode="y", hoverlabel=dict( From 2359eda6d7b226b29cbe07045d19354b7f4b54f7 Mon Sep 17 00:00:00 2001 From: Anandashankar Anil Date: Wed, 3 Sep 2025 16:18:20 +0200 Subject: [PATCH 15/35] Ignore pyc files when copying html files (#3320) --- multiqc/core/write_results.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/multiqc/core/write_results.py b/multiqc/core/write_results.py index 8f5d923659..97e91ab967 100644 --- a/multiqc/core/write_results.py +++ b/multiqc/core/write_results.py @@ -510,11 +510,18 @@ def _write_html_report(to_stdout: bool, report_path: Optional[Path], return_html except AttributeError: pass # Not a child theme else: - shutil.copytree(parent_template.template_dir, tmp_dir.get_tmp_dir(), dirs_exist_ok=True) + shutil.copytree( + parent_template.template_dir, + tmp_dir.get_tmp_dir(), + dirs_exist_ok=True, + ignore=shutil.ignore_patterns("*.pyc"), + ) # Copy the template files to the tmp directory (`dirs_exist_ok` makes sure # parent template files are overwritten) - shutil.copytree(template_mod.template_dir, tmp_dir.get_tmp_dir(), dirs_exist_ok=True) + shutil.copytree( + template_mod.template_dir, tmp_dir.get_tmp_dir(), dirs_exist_ok=True, 
ignore=shutil.ignore_patterns("*.pyc") + ) # Function to include file contents in Jinja template def include_file(name, fdir=tmp_dir.get_tmp_dir(), b64=False): From 74e3478d79db223118712e754aeaecc5ad9d4320 Mon Sep 17 00:00:00 2001 From: Vlad Savelyev Date: Thu, 4 Sep 2025 16:37:21 +0200 Subject: [PATCH 16/35] Xenium QC: feedback - round 2 (#3323) * Feedback: rename "detected genes" to "transcript genes", use single threshold line * Unncesessary imports * Box plot: support stats as input instsad of data arrays * Pass stats to box plots * Parse the h5 files for gene counts * Improve transcript quality table * Fix mypy * Fix mypy * Fix mypy * Clean up * Add scanpy dependency * Filter oput low-count cells for nucleus/transcript count ratio and cap plot at 1 * Pass extra legend group and tooltip to linegraph series * Improve Distribution of Transcripts multi-sample plot * Optimized noize threshold calculation --- .../special_case_modules/custom_content.py | 5 +- multiqc/modules/xenium/xenium.py | 829 ++++++++++++------ multiqc/plots/box.py | 150 +++- multiqc/plots/linegraph.py | 47 +- multiqc/plots/plot.py | 2 +- multiqc/search_patterns.yaml | 2 + .../templates/default/assets/js/plots/box.js | 118 ++- .../templates/default/assets/js/plots/line.js | 1 + pyproject.toml | 1 + tests/test_plots.py | 8 +- 10 files changed, 804 insertions(+), 359 deletions(-) diff --git a/multiqc/core/special_case_modules/custom_content.py b/multiqc/core/special_case_modules/custom_content.py index ae970fefe8..ebe3fd2970 100644 --- a/multiqc/core/special_case_modules/custom_content.py +++ b/multiqc/core/special_case_modules/custom_content.py @@ -573,7 +573,10 @@ def add_cc_section(self, section_id: SectionId, section_anchor: Anchor, ccdict: # Box plot elif plot_type == PlotType.BOX: - plot = box.plot(plot_datasets, pconfig=box.BoxPlotConfig(**pconfig)) + from multiqc.plots.box import BoxT + + box_data = cast(Union[Mapping[str, BoxT], List[Mapping[str, BoxT]]], plot_datasets) + plot = 
box.plot(box_data, pconfig=box.BoxPlotConfig(**pconfig)) # Violin plot elif plot_type == PlotType.VIOLIN: diff --git a/multiqc/modules/xenium/xenium.py b/multiqc/modules/xenium/xenium.py index 479c0fabbe..cfcf71a1b3 100644 --- a/multiqc/modules/xenium/xenium.py +++ b/multiqc/modules/xenium/xenium.py @@ -1,5 +1,6 @@ import json import logging +import re from pathlib import Path from typing import Any, Dict, Optional, Tuple @@ -9,6 +10,7 @@ from multiqc.base_module import BaseMultiqcModule, ModuleNoSamplesFound from multiqc.plots import bargraph, box, linegraph, scatter, table from multiqc.plots.table_object import ColumnDict, TableConfig +from multiqc.utils import mqc_colour # Try importing scipy, fallback gracefully if not available try: @@ -19,6 +21,14 @@ except ImportError: SCIPY_AVAILABLE = False +# Try importing scanpy for H5 file reading, fallback gracefully if not available +try: + import scanpy as sc + + SCANPY_AVAILABLE = True +except ImportError: + SCANPY_AVAILABLE = False + log = logging.getLogger(__name__) @@ -121,6 +131,20 @@ def __init__(self): cells_data_by_sample[parent_dir] = parsed_cells_data self.add_data_source(cells_f, parent_dir) + # Parse cell_feature_matrix.h5 files for detected genes per cell calculation + for h5_f in self.find_log_files("xenium/cell_feature_matrix", filecontents=False, filehandles=False): + detected_genes_data = self.parse_cell_feature_matrix_h5(h5_f) + if detected_genes_data: + # Use parent directory name as sample name + parent_dir = Path(h5_f["root"]).name if h5_f["root"] else h5_f["s_name"] + if parent_dir in cells_data_by_sample: + # Merge detected genes data with existing cells data + cells_data_by_sample[parent_dir].update(detected_genes_data) + else: + # Create new entry if cells.parquet wasn't found + cells_data_by_sample[parent_dir] = detected_genes_data + self.add_data_source(h5_f, parent_dir) + data_by_sample = self.ignore_samples(data_by_sample) transcript_data_by_sample = 
self.ignore_samples(transcript_data_by_sample) cells_data_by_sample = self.ignore_samples(cells_data_by_sample) @@ -234,9 +258,9 @@ def __init__(self): self.add_section( name="Transcript Quality Summary", anchor="xenium-transcript-quality", - description="Per-sample transcript quality statistics by gene category", + description="Per-sample mean transcript quality statistics by gene category", helptext=""" - This table shows transcript quality statistics for each sample, with separate columns for each gene category: + This table shows mean transcript quality statistics for each sample, with separate columns for each gene category: **Gene Categories:** * **Pre-designed**: Standard genes from Xenium panels @@ -425,13 +449,12 @@ def __init__(self): * **Typical range**: 50-2000 genes per cell depending on cell type and panel size * **High gene counts**: Metabolically active cells or cells with high expression diversity * **Low gene counts**: Specialized cells, inactive cells, or technical dropouts - * **Quality thresholds**: <20 may indicate poor cells or debris - + **What to look for:** * **Unimodal distributions**: Expected for homogeneous cell populations * **Multimodal distributions**: May indicate different cell types or technical artifacts * **Sample consistency**: Similar distributions expected for replicate samples - * **Positive correlation**: Generally expect transcripts and genes per cell to correlate + * **Positive correlation**: Generally expect transcripts and detected genes per cell to correlate **Panel considerations:** * **Pre-designed panels**: Gene counts limited by panel design (typically 100-1000 genes) @@ -439,8 +462,7 @@ def __init__(self): * **Detection efficiency**: Some genes may be harder to detect than others **Quality assessment:** - * **Transcripts**: Very low (<50) or very high (>10,000) may indicate segmentation issues - * **Genes**: Very low (<20) may indicate poor cells, counts near panel size may indicate artifacts + * **Counts**: Very 
low (<50) or very high (>10,000) may indicate segmentation issues * **Shoulder distributions**: May indicate presence of different cell types **Troubleshooting:** @@ -469,7 +491,7 @@ def __init__(self): **Plot interpretation:** * **X-axis**: Quality ranges (Low to Excellent QV ranges) - * **Y-axis**: Number of Fields of View in each quality range + * **Y-axis**: Fields of View in each quality range * **Colors**: Color-coded by quality level (grey=poor, green=excellent) * **Bars**: Each sample shown as separate colored bars for comparison @@ -971,6 +993,9 @@ def parse_transcripts_parquet(self, f) -> Optional[Dict]: } result["molecules_per_gene"] = molecules_per_gene + # Calculate noise threshold directly from transcript_stats DataFrame + result["noise_threshold"] = self.calculate_noise_threshold_from_df(transcript_stats) + # Add FoV quality analysis if fov_name column is present if "fov_name" in schema: fov_stats = ( @@ -1036,6 +1061,8 @@ def parse_cells_parquet(self, f) -> Optional[Dict]: pl.col("cell_area").std().alias("std"), pl.col("cell_area").min().alias("min"), pl.col("cell_area").max().alias("max"), + pl.col("cell_area").quantile(0.25).alias("q1"), + pl.col("cell_area").quantile(0.75).alias("q3"), pl.col("cell_area").count().alias("count"), ] ) @@ -1050,16 +1077,21 @@ def parse_cells_parquet(self, f) -> Optional[Dict]: "cell_area_std": cell_area_stats["std"].item(), "cell_area_min": cell_area_stats["min"].item(), "cell_area_max": cell_area_stats["max"].item(), + "cell_area_q1": cell_area_stats["q1"].item(), + "cell_area_q3": cell_area_stats["q3"].item(), } ) - # Sample cell area values for distribution plots - count = cell_area_stats["count"].item() - print(f"count: {count}, sample name: {f['s_name']}") - sample_values = ( - lazy_df.filter(pl.col("cell_area").is_not_null()).select("cell_area").collect().to_series().to_list() - ) - cell_stats["cell_area_values"] = sample_values + # Store box plot statistics instead of raw values + 
cell_stats["cell_area_box_stats"] = { + "min": cell_area_stats["min"].item(), + "q1": cell_area_stats["q1"].item(), + "median": cell_area_stats["median"].item(), + "q3": cell_area_stats["q3"].item(), + "max": cell_area_stats["max"].item(), + "mean": cell_area_stats["mean"].item(), + "count": cell_area_stats["count"].item(), + } # Nucleus area distribution stats using lazy operations nucleus_area_stats = ( @@ -1110,22 +1142,38 @@ def parse_cells_parquet(self, f) -> Optional[Dict]: } ) - # Sample ratio values for distribution plots - count = ratio_stats["count"].item() - sample_values = ( + # Calculate ratio distribution statistics for box plots + ratio_dist_stats = ( lazy_df.filter( (pl.col("cell_area").is_not_null()) & (pl.col("nucleus_area").is_not_null()) & (pl.col("cell_area") > 0) ) .with_columns((pl.col("nucleus_area") / pl.col("cell_area")).alias("ratio")) - .select("ratio") + .select( + [ + pl.col("ratio").min().alias("min"), + pl.col("ratio").quantile(0.25).alias("q1"), + pl.col("ratio").median().alias("median"), + pl.col("ratio").quantile(0.75).alias("q3"), + pl.col("ratio").max().alias("max"), + pl.col("ratio").mean().alias("mean"), + pl.col("ratio").count().alias("count"), + ] + ) .collect() - .to_series() - .to_list() ) - cell_stats["nucleus_to_cell_area_ratio_values"] = sample_values + if ratio_dist_stats["count"].item() > 0: + cell_stats["nucleus_to_cell_area_ratio_box_stats"] = { + "min": ratio_dist_stats["min"].item(), + "q1": ratio_dist_stats["q1"].item(), + "median": ratio_dist_stats["median"].item(), + "q3": ratio_dist_stats["q3"].item(), + "max": ratio_dist_stats["max"].item(), + "mean": ratio_dist_stats["mean"].item(), + "count": ratio_dist_stats["count"].item(), + } # Store total transcript counts per cell (total_counts) for distribution plots total_count_check = ( @@ -1135,17 +1183,34 @@ def parse_cells_parquet(self, f) -> Optional[Dict]: ) if total_count_check["count"].item() > 0: - count = total_count_check["count"].item() - sample_values = 
( + # Calculate total counts distribution statistics for box plots + total_counts_stats = ( lazy_df.filter(pl.col("total_counts").is_not_null()) - .select("total_counts") + .select( + [ + pl.col("total_counts").min().alias("min"), + pl.col("total_counts").quantile(0.25).alias("q1"), + pl.col("total_counts").median().alias("median"), + pl.col("total_counts").quantile(0.75).alias("q3"), + pl.col("total_counts").max().alias("max"), + pl.col("total_counts").mean().alias("mean"), + pl.col("total_counts").count().alias("count"), + ] + ) .collect() - .to_series() - .to_list() ) - cell_stats["total_counts_values"] = sample_values + cell_stats["total_counts_box_stats"] = { + "min": total_counts_stats["min"].item(), + "q1": total_counts_stats["q1"].item(), + "median": total_counts_stats["median"].item(), + "q3": total_counts_stats["q3"].item(), + "max": total_counts_stats["max"].item(), + "mean": total_counts_stats["mean"].item(), + "count": total_counts_stats["count"].item(), + } # Store detected genes per cell (transcript_counts) for distribution plots + # NOTE: This will be overridden by H5-based calculation if cell_feature_matrix.h5 is available detected_count_check = ( lazy_df.filter(pl.col("transcript_counts").is_not_null()) .select(pl.col("transcript_counts").count().alias("count")) @@ -1153,20 +1218,36 @@ def parse_cells_parquet(self, f) -> Optional[Dict]: ) if detected_count_check["count"].item() > 0: - count = detected_count_check["count"].item() - sample_values = ( + # Calculate detected genes per cell distribution statistics for box plots + gene_counts_stats = ( lazy_df.filter(pl.col("transcript_counts").is_not_null()) - .select("transcript_counts") + .select( + [ + pl.col("transcript_counts").min().alias("min"), + pl.col("transcript_counts").quantile(0.25).alias("q1"), + pl.col("transcript_counts").median().alias("median"), + pl.col("transcript_counts").quantile(0.75).alias("q3"), + pl.col("transcript_counts").max().alias("max"), + 
pl.col("transcript_counts").mean().alias("mean"), + pl.col("transcript_counts").count().alias("count"), + ] + ) .collect() - .to_series() - .to_list() ) - cell_stats["detected_genes_values"] = sample_values + cell_stats["gene_transcript_counts_box_stats"] = { + "min": gene_counts_stats["min"].item(), + "q1": gene_counts_stats["q1"].item(), + "median": gene_counts_stats["median"].item(), + "q3": gene_counts_stats["q3"].item(), + "max": gene_counts_stats["max"].item(), + "mean": gene_counts_stats["mean"].item(), + "count": gene_counts_stats["count"].item(), + } # Add nucleus RNA fraction if nucleus_count is available if "nucleus_count" in schema: nucleus_fraction_stats = ( - lazy_df.filter(pl.col("total_counts") > 0) + lazy_df.filter(pl.col("total_counts") >= 10) .with_columns((pl.col("nucleus_count") / pl.col("total_counts")).alias("fraction")) .select( [ @@ -1186,17 +1267,32 @@ def parse_cells_parquet(self, f) -> Optional[Dict]: } ) - # Sample nucleus fraction values for distribution plots - count = nucleus_fraction_stats["count"].item() - sample_values = ( + # Calculate nucleus RNA fraction distribution statistics for box plots + nucleus_fraction_dist_stats = ( lazy_df.filter(pl.col("total_counts") > 0) .with_columns((pl.col("nucleus_count") / pl.col("total_counts")).alias("fraction")) - .select("fraction") + .select( + [ + pl.col("fraction").min().alias("min"), + pl.col("fraction").quantile(0.25).alias("q1"), + pl.col("fraction").median().alias("median"), + pl.col("fraction").quantile(0.75).alias("q3"), + pl.col("fraction").max().alias("max"), + pl.col("fraction").mean().alias("mean"), + pl.col("fraction").count().alias("count"), + ] + ) .collect() - .to_series() - .to_list() ) - cell_stats["nucleus_rna_fraction_values"] = sample_values + cell_stats["nucleus_rna_fraction_box_stats"] = { + "min": nucleus_fraction_dist_stats["min"].item(), + "q1": nucleus_fraction_dist_stats["q1"].item(), + "median": nucleus_fraction_dist_stats["median"].item(), + "q3": 
nucleus_fraction_dist_stats["q3"].item(),
+                "max": nucleus_fraction_dist_stats["max"].item(),
+                "mean": nucleus_fraction_dist_stats["mean"].item(),
+                "count": nucleus_fraction_dist_stats["count"].item(),
+            }
 
         return cell_stats
 
@@ -1469,38 +1565,55 @@ def xenium_transcript_quality_table(self, transcript_data_by_sample):
                 else 1
                 if x == "Custom"
                 else 2
-                if x == "Genomic Control Probe"
+                if x == "Genomic Control Probe"
                 else 3
-                if x == "Negative Control Probe"
+                if x == "Negative Control Probe"
                 else 4
-                if x == "Negative Control Codeword"
+                if x == "Negative Control Codeword"
                 else 5
-                if x == "Unassigned Codeword"
+                if x == "Unassigned Codeword"
                 else 6
-                if x == "Deprecated Codeword"
+                if x == "Deprecated Codeword"
                 else 7
             ),
         )
 
+        # Create consistent abbreviations for column titles
+        category_abbreviations = {
+            "Pre-designed": "Pre-designed",
+            "Custom": "Custom",
+            "Genomic Control Probe": "Genomic Ctrl",
+            "Negative Control Probe": "Negative Ctrl",
+            "Negative Control Codeword": "Neg Codeword",
+            "Unassigned Codeword": "Unassigned",
+            "Deprecated Codeword": "Deprecated",
+        }
+
         for category in sorted_categories:
+            # Get abbreviated title for consistent column width
+            abbrev_title = category_abbreviations[category]
+
             # Mean quality column
             headers[f"{category} Mean QV"] = {
-                "title": f"{category} Mean",  # Abbreviated for space
+                "title": f"{abbrev_title}",
                 "description": f"Mean calibrated quality score (QV) for {category}",
                 "scale": "Blues",
                 "format": "{:.2f}",
                 "suffix": "",
                 "shared_key": "xenium_transcript_quality",
+                "min": 0,
+                "max": 40,
             }
 
             # Standard deviation column
             headers[f"{category} Std Dev"] = {
-                "title": f"{category} StdDev",  # Abbreviated for space
+                "title": f"{abbrev_title} StdDev",
                 "description": f"Standard deviation of quality scores for {category}",
                 "scale": "Oranges",
                 "format": "{:.2f}",
                 "suffix": "",
                 "shared_key": "xenium_transcript_quality",
+                "hidden": True,
             }
 
         return table.plot(
@@ -1508,7 +1621,7 @@ def xenium_transcript_quality_table(self, 
transcript_data_by_sample): headers, pconfig=TableConfig( id="xenium_transcript_quality_per_sample_table", - title="Xenium: Transcript Quality by Sample and Category", + title="Xenium: Mean Transcript Quality by Sample and Category", ), ) @@ -1517,7 +1630,8 @@ def xenium_cell_area_distribution_plot(self, cells_data_by_sample): # Check which samples have cell area data samples_with_areas = [] for s_name, data in cells_data_by_sample.items(): - if "cell_area_values" in data: + # Accept either pre-calculated statistics or raw values + if ("cell_area_box_stats" in data) or ("cell_area_values" in data and data["cell_area_values"]): samples_with_areas.append(s_name) if not samples_with_areas: @@ -1543,6 +1657,13 @@ def _create_single_sample_area_density(self, cell_data): if SCIPY_AVAILABLE: from scipy.stats import gaussian_kde + # Skip density plots if only pre-calculated statistics are available + if "cell_area_values" not in cell_data: + log.info( + "Skipping cell area density plot - using pre-calculated statistics. Density plots require raw data." 
+ ) + return None + cell_areas = cell_data["cell_area_values"] if not cell_areas or len(cell_areas) < 10: return None @@ -1585,19 +1706,16 @@ def _create_single_sample_area_density(self, cell_data): return linegraph.plot({"Density": density_data}, config) def _create_multi_sample_area_violins(self, cells_data_by_sample, samples_with_areas): - """Create box plots for multiple samples - one box per sample""" + """Create box plots for multiple samples using pre-calculated statistics""" - # For box plots, we provide the raw data points grouped by sample + # For box plots, we now provide pre-calculated statistics instead of raw data data = {} for s_name in samples_with_areas: cell_data = cells_data_by_sample[s_name] - if "cell_area_values" in cell_data: - # Store all cell area values for this sample - cell_areas = cell_data["cell_area_values"] - if cell_areas: - # Box plots expect raw data points as a list - data[s_name] = [float(area) for area in cell_areas] + if "cell_area_box_stats" in cell_data: + # Use pre-calculated box plot statistics + data[s_name] = cell_data["cell_area_box_stats"] if not data: return None @@ -1616,7 +1734,9 @@ def xenium_nucleus_rna_fraction_plot(self, cells_data_by_sample): # Check which samples have nucleus RNA fraction data samples_with_nucleus_data = [] for s_name, data in cells_data_by_sample.items(): - if "nucleus_rna_fraction_values" in data and data["nucleus_rna_fraction_values"]: + if "nucleus_rna_fraction_box_stats" in data or ( + "nucleus_rna_fraction_values" in data and data["nucleus_rna_fraction_values"] + ): samples_with_nucleus_data.append(s_name) if not samples_with_nucleus_data: @@ -1639,6 +1759,13 @@ def _create_single_sample_nucleus_density(self, cell_data): from scipy import stats + # Skip density plots if only pre-calculated statistics are available + if "nucleus_rna_fraction_values" not in cell_data: + log.info( + "Skipping nucleus RNA fraction density plot - using pre-calculated statistics. 
Density plots require raw data." + ) + return None + nucleus_fractions = cell_data["nucleus_rna_fraction_values"] if not nucleus_fractions: return None @@ -1706,17 +1833,20 @@ def _create_single_sample_nucleus_density(self, cell_data): return plot def _create_multi_sample_nucleus_boxes(self, cells_data_by_sample, samples_with_nucleus_data): - """Create box plots for multiple samples - one box per sample""" + """Create box plots for multiple samples using pre-calculated statistics""" - # For box plots, we provide the raw data points grouped by sample + # For box plots, we now provide pre-calculated statistics instead of raw data data = {} for s_name in samples_with_nucleus_data: cell_data = cells_data_by_sample[s_name] - if "nucleus_rna_fraction_values" in cell_data: + if "nucleus_rna_fraction_box_stats" in cell_data: + # Use pre-calculated box plot statistics + data[s_name] = cell_data["nucleus_rna_fraction_box_stats"] + elif "nucleus_rna_fraction_values" in cell_data: + # Fallback to raw data if statistics not available (backward compatibility) nucleus_fractions = cell_data["nucleus_rna_fraction_values"] if nucleus_fractions: - # Box plots expect raw data points as a list data[s_name] = [float(fraction) for fraction in nucleus_fractions] if not data: @@ -1736,7 +1866,9 @@ def xenium_nucleus_cell_area_ratio_plot(self, cells_data_by_sample): # Check which samples have nucleus-to-cell area ratio data samples_with_ratio_data = [] for s_name, data in cells_data_by_sample.items(): - if "nucleus_to_cell_area_ratio_values" in data and data["nucleus_to_cell_area_ratio_values"]: + if "nucleus_to_cell_area_ratio_box_stats" in data or ( + "nucleus_to_cell_area_ratio_values" in data and data["nucleus_to_cell_area_ratio_values"] + ): samples_with_ratio_data.append(s_name) if not samples_with_ratio_data: @@ -1762,6 +1894,13 @@ def _create_single_sample_ratio_density(self, cell_data): from multiqc.plots import linegraph + # Skip density plots if only pre-calculated statistics are 
available + if "nucleus_to_cell_area_ratio_values" not in cell_data: + log.info( + "Skipping nucleus-to-cell area ratio density plot - using pre-calculated statistics. Density plots require raw data." + ) + return None + ratio_values = cell_data["nucleus_to_cell_area_ratio_values"] if not ratio_values: return None @@ -1807,28 +1946,33 @@ def _create_single_sample_ratio_density(self, cell_data): return plot def _create_multi_sample_ratio_boxes(self, cells_data_by_sample, samples_with_ratio_data): - """Create box plots for multiple samples - one box per sample""" + """Create box plots for multiple samples using pre-calculated statistics""" - # For box plots, we provide the raw data points grouped by sample + # For box plots, we now provide pre-calculated statistics instead of raw data data = {} for s_name in samples_with_ratio_data: cell_data = cells_data_by_sample[s_name] - if "nucleus_to_cell_area_ratio_values" in cell_data: + if "nucleus_to_cell_area_ratio_box_stats" in cell_data: + # Use pre-calculated box plot statistics + data[s_name] = cell_data["nucleus_to_cell_area_ratio_box_stats"] + elif "nucleus_to_cell_area_ratio_values" in cell_data: + # Fallback to raw data if statistics not available (backward compatibility) ratio_values = cell_data["nucleus_to_cell_area_ratio_values"] if ratio_values: - # Box plots expect raw data points as a list data[s_name] = [float(ratio) for ratio in ratio_values] if not data: return None - config = { - "id": "xenium_nucleus_cell_area_ratio_multi", - "title": "Xenium: Nucleus to Cell Area Distribution", - "xlab": "Nucleus-to-cell area ratio", - "boxpoints": False, - } + config = box.BoxPlotConfig( + id="xenium_nucleus_cell_area_ratio_multi", + title="Xenium: Nucleus to Cell Area Distribution", + xlab="Nucleus-to-cell area ratio", + boxpoints=False, + xmin=0, + xmax=1, + ) return box.plot(data, config) @@ -1879,7 +2023,7 @@ def xenium_fov_quality_plot(self, transcript_data_by_sample): "id": "xenium_fov_quality_ranges", "title": 
"Xenium: Field of View Quality Distribution", "xlab": "Quality Range", - "ylab": "Number of Fields of View", + "ylab": "Fields of View", "cpswitch_c_active": False, "use_legend": True, } @@ -1912,7 +2056,6 @@ def xenium_fov_quality_plot(self, transcript_data_by_sample): def _sort_fov_names(self, fov_names): """Sort FoV names naturally, handling numeric components if present""" - import re def natural_sort_key(fov_name): # Split on digits to handle natural sorting (e.g., fov_1, fov_2, fov_10) @@ -1932,19 +2075,27 @@ def xenium_transcripts_per_gene_plot(self, transcript_data_by_sample): if not samples_with_molecules: return None - import numpy as np + # Determine if single or multi-sample plot + num_samples = len(samples_with_molecules) + if num_samples == 1: + # Single sample: calculate noise threshold for this sample only + s_name = samples_with_molecules[0] + sample_data = transcript_data_by_sample[s_name] + molecules_data = sample_data["molecules_per_gene"] - # Calculate noise threshold based on aggregated data (for vertical line) - aggregated_gene_counts = {} - for s_name in samples_with_molecules: - data = transcript_data_by_sample[s_name] - molecules_data = data["molecules_per_gene"] - for gene_name, gene_info in molecules_data.items(): - if gene_name not in aggregated_gene_counts: - aggregated_gene_counts[gene_name] = {"count": 0, "is_gene": gene_info["is_gene"]} - aggregated_gene_counts[gene_name]["count"] += gene_info["count"] + # Use pre-calculated noise threshold if available + n_mols_threshold = sample_data.get("noise_threshold") + else: + # Multi-sample: use pre-calculated noise thresholds + sample_thresholds = {} + for s_name in samples_with_molecules: + sample_data = transcript_data_by_sample[s_name] + + # Use pre-calculated noise threshold if available + threshold = sample_data.get("noise_threshold") + sample_thresholds[s_name] = threshold - n_mols_threshold = self.calculate_noise_threshold(aggregated_gene_counts) + n_mols_threshold = None # Keep for 
single-sample compatibility # Determine global bins based on all samples' data all_gene_counts = [] @@ -1972,105 +2123,141 @@ def xenium_transcripts_per_gene_plot(self, transcript_data_by_sample): bins = np.logspace(np.log10(min_count), np.log10(max_count), 50) bin_centers = (bins[:-1] + bins[1:]) / 2 - # Always use multi-sample plot for consistent color-coded representation - return self._create_multi_sample_molecules_plot( - transcript_data_by_sample, samples_with_molecules, bins, bin_centers, n_mols_threshold - ) + # Choose between single and multi-sample plots + if num_samples == 1: + # Single sample with noise threshold + s_name = samples_with_molecules[0] + sample_data = transcript_data_by_sample[s_name] + # Create single-item threshold dict for consistency + single_sample_thresholds = {s_name: n_mols_threshold} + return self._create_single_sample_molecules_plot( + sample_data, bins, bin_centers, single_sample_thresholds, s_name + ) + else: + # Multi-sample with per-sample thresholds + return self._create_multi_sample_molecules_plot( + transcript_data_by_sample, samples_with_molecules, bins, bin_centers, sample_thresholds + ) + + def _create_single_sample_molecules_plot(self, sample_data, bins, bin_centers, sample_thresholds, sample_name): + """Create single plot with both Gene and Non-gene lines for single sample""" + molecules_data = sample_data["molecules_per_gene"] + + # Separate counts by gene type + gene_counts = [] + non_gene_counts = [] + + for _, gene_info in molecules_data.items(): + count = gene_info["count"] + if count > 0: + if gene_info["is_gene"]: + gene_counts.append(count) + else: + non_gene_counts.append(count) + + # Create plot data with both lines + plot_data = {} + all_histograms = [] + + if gene_counts: + gene_hist, _ = np.histogram(gene_counts, bins=bins) + all_histograms.append(gene_hist) + gene_line_data = {} + for i, count in enumerate(gene_hist): + gene_line_data[float(bin_centers[i])] = int(count) + plot_data["Genes"] = 
gene_line_data + + if non_gene_counts: + non_gene_hist, _ = np.histogram(non_gene_counts, bins=bins) + all_histograms.append(non_gene_hist) + non_gene_line_data = {} + for i, count in enumerate(non_gene_hist): + non_gene_line_data[float(bin_centers[i])] = int(count) + plot_data["Non-genes"] = non_gene_line_data + + if not plot_data: + return None + + # Trim long tail: find cutoff where all values above X are below 1% of max + if all_histograms: + # Get maximum value across all histograms + max_value = max(np.max(hist) for hist in all_histograms) + threshold = max_value * 0.01 # 1% of max + + # Find the last bin where any histogram has values above threshold + last_significant_bin = len(bin_centers) - 1 + for i in range(len(bin_centers) - 1, -1, -1): + if any(hist[i] >= threshold for hist in all_histograms): + last_significant_bin = i + break + + # Trim the data to only include up to the last significant bin + if last_significant_bin < len(bin_centers) - 1: + trimmed_plot_data = {} + for dataset_name, data in plot_data.items(): + trimmed_data = {} + for i, (x_val, y_val) in enumerate(data.items()): + if i <= last_significant_bin: + trimmed_data[x_val] = y_val + trimmed_plot_data[dataset_name] = trimmed_data + plot_data = trimmed_plot_data + + config: Dict[str, Any] = { + "id": "xenium_transcripts_per_gene", + "title": "Xenium: Distribution of Transcripts per Gene", + "xlab": "Number of transcripts per gene", + "ylab": "Number of features", + "xlog": True, + } + + # Use same color for genes and controls from same sample (distinguished by line style) + from multiqc.utils import mqc_colour + + scale = mqc_colour.mqc_colour_scale("plot_defaults") + sample_color = scale.get_colour(0, lighten=1) # Use first color for single sample - # def _create_single_sample_molecules_plot(self, sample_data, bins, bin_centers, n_mols_threshold): - # """Create single plot with both Gene and Non-gene lines for single sample""" - # import numpy as np - - # molecules_data = 
sample_data["molecules_per_gene"] - - # # Separate counts by gene type - # gene_counts = [] - # non_gene_counts = [] - - # for _, gene_info in molecules_data.items(): - # count = gene_info["count"] - # if count > 0: - # if gene_info["is_gene"]: - # gene_counts.append(count) - # else: - # non_gene_counts.append(count) - - # # Create plot data with both lines - # plot_data = {} - # all_histograms = [] - - # if gene_counts: - # gene_hist, _ = np.histogram(gene_counts, bins=bins) - # all_histograms.append(gene_hist) - # gene_line_data = {} - # for i, count in enumerate(gene_hist): - # gene_line_data[float(bin_centers[i])] = int(count) - # plot_data["Genes"] = gene_line_data - - # if non_gene_counts: - # non_gene_hist, _ = np.histogram(non_gene_counts, bins=bins) - # all_histograms.append(non_gene_hist) - # non_gene_line_data = {} - # for i, count in enumerate(non_gene_hist): - # non_gene_line_data[float(bin_centers[i])] = int(count) - # plot_data["Non-genes"] = non_gene_line_data - - # if not plot_data: - # return None - - # # Trim long tail: find cutoff where all values above X are below 1% of max - # if all_histograms: - # # Get maximum value across all histograms - # max_value = max(np.max(hist) for hist in all_histograms) - # threshold = max_value * 0.01 # 1% of max - - # # Find the last bin where any histogram has values above threshold - # last_significant_bin = len(bin_centers) - 1 - # for i in range(len(bin_centers) - 1, -1, -1): - # if any(hist[i] >= threshold for hist in all_histograms): - # last_significant_bin = i - # break - - # # Trim the data to only include up to the last significant bin - # if last_significant_bin < len(bin_centers) - 1: - # trimmed_plot_data = {} - # for dataset_name, data in plot_data.items(): - # trimmed_data = {} - # for i, (x_val, y_val) in enumerate(data.items()): - # if i <= last_significant_bin: - # trimmed_data[x_val] = y_val - # trimmed_plot_data[dataset_name] = trimmed_data - # plot_data = trimmed_plot_data - - # config: 
Dict[str, Any] = { - # "id": "xenium_transcripts_per_gene", - # "title": "Xenium: Distribution of Transcripts", - # "xlab": "Number of transcripts per gene", - # "ylab": "Number of features", - # } - - # # Add vertical line for noise threshold if calculated - # if n_mols_threshold is not None and n_mols_threshold > 0: - # config["x_lines"] = [ - # { - # "value": n_mols_threshold, - # "color": "grey", - # "dash": "dash", - # "width": 1, - # "label": f"Noise threshold ({n_mols_threshold:.0f})", - # } - # ] - - # return linegraph.plot(plot_data, config) + n_mols_threshold = sample_thresholds.get(sample_name) if sample_thresholds else None + threshold_text = f" (noise threshold: {n_mols_threshold:.0f})" if n_mols_threshold is not None else "" + + colors = { + "Genes": sample_color, + } + config["colors"] = colors + + # Use dash_styles and hovertemplates for series styling + if "Non-genes" in plot_data: + colors["Non-genes"] = sample_color # Same color as genes + config["dash_styles"] = { + "Genes": "solid", + "Non-genes": "dash", # Dashed line for controls + } + config["hovertemplates"] = { + "Genes": f"%{{text}}
%{{x}}: %{{y}}{threshold_text}", + "Non-genes": f"%{{text}}
%{{x}}: %{{y}}{threshold_text}", + } + config["legend_groups"] = {"Genes": sample_name, "Non-genes": sample_name} + else: + config["hovertemplates"] = {"Genes": f"%{{text}}
%{{x}}: %{{y}}{threshold_text}"} + config["legend_groups"] = {"Genes": sample_name} + + # Add vertical line for noise threshold if calculated + if n_mols_threshold is not None and n_mols_threshold > 0: + config["x_lines"] = [ + { + "value": n_mols_threshold, + "color": "grey", + "dash": "dash", + "width": 1, + "label": f"Noise threshold ({n_mols_threshold:.0f})", + } + ] + + return linegraph.plot(plot_data, config) def _create_multi_sample_molecules_plot( - self, transcript_data_by_sample, samples_with_molecules, bins, bin_centers, n_mols_threshold + self, transcript_data_by_sample, samples_with_molecules, bins, bin_centers, sample_thresholds ): """Create single plot with all samples shown as separate lines, color-coded by gene type""" - import numpy as np - - from multiqc.plots import linegraph - plot_data = {} all_histograms = [] @@ -2098,7 +2285,7 @@ def _create_multi_sample_molecules_plot( gene_line_data = {} for i, count in enumerate(gene_hist): gene_line_data[float(bin_centers[i])] = int(count) - plot_data[f"{s_name} (Genes)"] = gene_line_data + plot_data[f"{s_name} (genes)"] = gene_line_data # Create histograms for non-genes (black lines) if sample_non_gene_counts: @@ -2107,7 +2294,7 @@ def _create_multi_sample_molecules_plot( non_gene_line_data = {} for i, count in enumerate(non_gene_hist): non_gene_line_data[float(bin_centers[i])] = int(count) - plot_data[f"{s_name} (Non-genes)"] = non_gene_line_data + plot_data[f"{s_name} (non-genes)"] = non_gene_line_data if not plot_data: return None @@ -2143,54 +2330,81 @@ def _create_multi_sample_molecules_plot( "ylab": "Number of features", "series_label": None, "xlog": True, + "x_decimals": 0, } - # Add color configuration for genes (blue) and non-genes (black) - colors = {} + # Use per-sample coloring with mqc_colour plot_defaults scheme + scale = mqc_colour.mqc_colour_scale("plot_defaults") + + # Group paired lines by sample name and assign colors + sample_names = set() for dataset_name in plot_data.keys(): - if 
"(Genes)" in dataset_name: - colors[dataset_name] = "#7cb5ec" # Blue - elif "(Non-genes)" in dataset_name: - colors[dataset_name] = "#434348" # Black + if "(genes)" in dataset_name: + sample_name = dataset_name.replace(" (genes)", "") + sample_names.add(sample_name) + elif "(non-genes)" in dataset_name: + sample_name = dataset_name.replace(" (non-genes)", "") + sample_names.add(sample_name) + + # Create color mapping for each sample + sample_colors = {} + for idx, sample_name in enumerate(sorted(sample_names)): + sample_colors[sample_name] = scale.get_colour(idx, lighten=1) + + # Use the new parameters to style series instead of extra_series + colors = {} + dash_styles = {} + hovertemplates = {} + legend_groups = {} - if colors: - config["colors"] = colors + # Set up styling for all series using the new parameters + for dataset_name in plot_data.keys(): + if "(genes)" in dataset_name: + sample_name = dataset_name.replace(" (genes)", "") + threshold = sample_thresholds.get(sample_name) + threshold_text = f" (noise threshold: {threshold:.0f})" if threshold is not None else "" + + colors[dataset_name] = sample_colors[sample_name] + dash_styles[dataset_name] = "solid" # Solid lines for genes + hovertemplates[dataset_name] = f"%{{text}}
%{{x}}: %{{y}}{threshold_text}" + legend_groups[dataset_name] = sample_name + + elif "(non-genes)" in dataset_name: + sample_name = dataset_name.replace(" (non-genes)", "") + threshold = sample_thresholds.get(sample_name) + threshold_text = f" (noise threshold: {threshold:.0f})" if threshold is not None else "" + + colors[dataset_name] = sample_colors[sample_name] + dash_styles[dataset_name] = "dash" # Dashed lines for controls + hovertemplates[dataset_name] = f"%{{text}}
%{{x}}: %{{y}}{threshold_text}" + legend_groups[dataset_name] = sample_name - # Add vertical line for noise threshold if calculated - if n_mols_threshold is not None and n_mols_threshold > 0: - config["x_lines"] = [ - { - "value": n_mols_threshold, - "color": "grey", - "dash": "dash", - "width": 1, - "label": f"Noise threshold ({n_mols_threshold:.0f})", - } - ] + config["colors"] = colors + config["dash_styles"] = dash_styles + config["hovertemplates"] = hovertemplates + config["legend_groups"] = legend_groups return linegraph.plot(plot_data, config) - def calculate_noise_threshold(self, gene_molecule_counts, quantile=0.99): + def calculate_noise_threshold_from_df(self, transcript_stats_df, quantile=0.99): """ - Calculate noise threshold based on negative control molecules. + Calculate noise threshold directly from transcript_stats DataFrame. + This is the most efficient version as it works on the already-processed DataFrame. Args: - gene_molecule_counts: Dict of {gene_name: {"count": int, "is_gene": bool}} + transcript_stats_df: Polars DataFrame with columns ['feature_name', 'transcript_count', 'is_gene'] quantile: Quantile for threshold calculation (default 0.99) Returns: Float threshold value or None if insufficient data """ - import numpy as np + # Filter for negative control features using polars + neg_controls = transcript_stats_df.filter( + (~pl.col("is_gene")) & pl.col("feature_name").str.starts_with("NegControl") + ) - # Extract counts for negative control features (non-genes starting with "NegControl") - neg_control_counts = [] - neg_control_found = 0 - for gene_name, gene_info in gene_molecule_counts.items(): - if not gene_info["is_gene"] and gene_name.startswith("NegControl"): - neg_control_found += 1 - if gene_info["count"] > 0: - neg_control_counts.append(gene_info["count"]) + # Get counts > 0 for negative controls + neg_control_counts = neg_controls.filter(pl.col("transcript_count") > 0)["transcript_count"].to_list() if len(neg_control_counts) < 3: # 
Need at least 3 data points for meaningful statistics return None @@ -2226,34 +2440,36 @@ def calculate_noise_threshold(self, gene_molecule_counts, quantile=0.99): def xenium_cell_distributions_combined_plot(self, cells_data_by_sample): """Create combined plot for transcripts and detected genes per cell distributions""" # Check if we have data for either transcripts or genes - samples_with_transcripts = {} - samples_with_genes = {} + samples_with_transcript_counts = {} + samples_with_gene_counts = {} for s_name, data in cells_data_by_sample.items(): - if data and "total_counts_values" in data and data["total_counts_values"]: - samples_with_transcripts[s_name] = data["total_counts_values"] - if data and "detected_genes_values" in data and data["detected_genes_values"]: - samples_with_genes[s_name] = data["detected_genes_values"] + # Check for pre-calculated statistics first, fall back to raw values + if data and "total_counts_box_stats" in data: + samples_with_transcript_counts[s_name] = data["total_counts_box_stats"] + elif data and "total_counts_values" in data and data["total_counts_values"]: + samples_with_transcript_counts[s_name] = data["total_counts_values"] + + if data and "detected_genes_stats" in data: + samples_with_gene_counts[s_name] = data["detected_genes_stats"] + elif data and "detected_genes_values" in data and data["detected_genes_values"]: + samples_with_gene_counts[s_name] = data["detected_genes_values"] # If neither dataset is available, return None - if not samples_with_transcripts and not samples_with_genes: + if not samples_with_transcript_counts and not samples_with_gene_counts: return None - num_samples = max(len(samples_with_transcripts), len(samples_with_genes)) + num_samples = max(len(samples_with_transcript_counts), len(samples_with_gene_counts)) if num_samples == 1: # Single sample: Create combined density plots - return self._create_single_sample_combined_density(samples_with_transcripts, samples_with_genes) + return 
self._create_single_sample_combined_density(samples_with_transcript_counts, samples_with_gene_counts) else: # Multiple samples: Create combined box plots - return self._create_multi_sample_combined_boxes(samples_with_transcripts, samples_with_genes) + return self._create_multi_sample_combined_boxes(samples_with_transcript_counts, samples_with_gene_counts) - def _create_single_sample_combined_density(self, samples_with_transcripts, samples_with_genes): + def _create_single_sample_combined_density(self, samples_with_transcript_counts, samples_with_gene_counts): """Create single sample combined density plot with transcripts (blue) and genes (grey) on the same plot""" - import numpy as np - - from multiqc.plots import linegraph - plot_data = {} # Store raw values for intelligent line positioning @@ -2261,12 +2477,16 @@ def _create_single_sample_combined_density(self, samples_with_transcripts, sampl raw_gene_values = None # Handle transcripts per cell data - if samples_with_transcripts: - _, transcript_values = next(iter(samples_with_transcripts.items())) + if samples_with_transcript_counts: + _, transcript_values = next(iter(samples_with_transcript_counts.items())) + # Skip density plots for pre-calculated statistics (use box plots instead) + if isinstance(transcript_values, dict) and "min" in transcript_values: + log.info( + "Skipping density plot for transcripts per cell - using pre-calculated statistics. Density plots require raw data." 
+ ) + return None raw_transcript_values = transcript_values try: - import numpy as np - if SCIPY_AVAILABLE: from scipy.stats import gaussian_kde @@ -2284,8 +2504,6 @@ def _create_single_sample_combined_density(self, samples_with_transcripts, sampl except ImportError: # Fallback to histogram if scipy not available - import numpy as np - bins = min(50, len(transcript_values) // 20) hist, bin_edges = np.histogram(transcript_values, bins=bins) bin_centers = (bin_edges[:-1] + bin_edges[1:]) / 2 @@ -2296,18 +2514,25 @@ def _create_single_sample_combined_density(self, samples_with_transcripts, sampl plot_data["Transcripts per cell"] = transcripts_data # Handle detected genes per cell data - if samples_with_genes: - _, gene_values = next(iter(samples_with_genes.items())) - raw_gene_values = gene_values + if samples_with_gene_counts: + _, gene_counts = next(iter(samples_with_gene_counts.items())) + # Skip density plots for pre-calculated statistics + if isinstance(gene_counts, dict) and "min" in gene_counts: + log.info( + "Skipping density plot for detected genes per cell - using pre-calculated statistics. Density plots require raw data." 
+ ) + # For mixed cases, only show available density plots + if not raw_transcript_values: + return None + else: + raw_gene_values = gene_counts try: - import numpy as np - if SCIPY_AVAILABLE: from scipy.stats import gaussian_kde - gene_values = np.array(gene_values) - kde = gaussian_kde(gene_values) - x_min, x_max = gene_values.min(), gene_values.max() + gene_counts = np.array(gene_counts) + kde = gaussian_kde(gene_counts) + x_min, x_max = gene_counts.min(), gene_counts.max() x_range = np.linspace(x_min, x_max, 1000) density = kde(x_range) @@ -2319,10 +2544,8 @@ def _create_single_sample_combined_density(self, samples_with_transcripts, sampl except ImportError: # Fallback to histogram if scipy not available - import numpy as np - - bins = min(50, len(gene_values) // 20) - hist, bin_edges = np.histogram(gene_values, bins=bins) + bins = min(50, len(gene_counts) // 20) + hist, bin_edges = np.histogram(gene_counts, bins=bins) bin_centers = (bin_edges[:-1] + bin_edges[1:]) / 2 genes_data = {} @@ -2335,7 +2558,7 @@ def _create_single_sample_combined_density(self, samples_with_transcripts, sampl config = { "id": "xenium_cell_distributions_combined", - "title": "Xenium: Distribution of Transcripts/Genes per Cell", + "title": "Xenium: Distribution of Transcripts per Cell", "xlab": "Number per cell", "ylab": "Density", "smooth_points": 100, @@ -2354,31 +2577,31 @@ def _create_single_sample_combined_density(self, samples_with_transcripts, sampl return linegraph.plot(plot_data, config) - def _create_multi_sample_combined_boxes(self, samples_with_transcripts, samples_with_genes): - """Create multi-sample combined box plots for transcripts and genes per cell""" + def _create_multi_sample_combined_boxes(self, samples_with_transcript_counts, samples_with_genes_counts): + """Create multi-sample combined box plots for transcripts and genes per cell using pre-calculated statistics""" plot_data = [] data_labels = [] - # Add transcripts per cell data - if samples_with_transcripts: + # 
Add transcripts per cell data (prefer statistics over raw values) + if samples_with_transcript_counts: transcripts_data = {} - for s_name, transcript_values in samples_with_transcripts.items(): - transcripts_data[s_name] = transcript_values + for s_name, transcript_counts_stats in samples_with_transcript_counts.items(): + transcripts_data[s_name] = transcript_counts_stats plot_data.append(transcripts_data) data_labels.append({"name": "Transcripts per Cell", "ylab": "Transcripts per cell"}) - # Add detected genes per cell data - if samples_with_genes: + # Add detected genes per cell data (prefer statistics over raw values) + if samples_with_genes_counts: genes_data = {} - for s_name, gene_values in samples_with_genes.items(): - genes_data[s_name] = gene_values + for s_name, gene_count_stats in samples_with_genes_counts.items(): + genes_data[s_name] = gene_count_stats plot_data.append(genes_data) data_labels.append({"name": "Detected Genes per Cell", "ylab": "Detected genes per cell"}) config = { "id": "xenium_cell_distributions_combined", - "title": "Xenium: Distribution of Transcripts/Genes per Cell", + "title": "Xenium: Distribution of Transcripts per Cell", "boxpoints": False, "xlab": "Transcripts per cell", "data_labels": data_labels, @@ -2412,8 +2635,6 @@ def _create_single_sample_transcripts_density(self, samples_with_transcripts): # Create kernel density estimation try: - import numpy as np - if SCIPY_AVAILABLE: from scipy.stats import gaussian_kde @@ -2450,8 +2671,6 @@ def _create_single_sample_transcripts_density(self, samples_with_transcripts): except ImportError: # Fallback to histogram if scipy not available - import numpy as np - bins = min(50, len(transcript_values) // 20) hist, bin_edges = np.histogram(transcript_values, bins=bins) bin_centers = (bin_edges[:-1] + bin_edges[1:]) / 2 @@ -2500,31 +2719,29 @@ def _create_multi_sample_transcripts_boxes(self, samples_with_transcripts): def xenium_detected_genes_per_cell_plot(self, cells_data_by_sample): 
"""Create detected genes per cell distribution plot""" # Filter samples with detected genes data - samples_with_genes = {} + samples_with_transcript_counts = {} for s_name, data in cells_data_by_sample.items(): - if data and "detected_genes_values" in data and data["detected_genes_values"]: - samples_with_genes[s_name] = data["detected_genes_values"] + if data and "gene_transcript_counts_values" in data and data["gene_transcript_counts_values"]: + samples_with_transcript_counts[s_name] = data["gene_transcript_counts_values"] - if not samples_with_genes: + if not samples_with_transcript_counts: return None - num_samples = len(samples_with_genes) + num_samples = len(samples_with_transcript_counts) if num_samples == 1: # Single sample: Create density plot - return self._create_single_sample_genes_density(samples_with_genes) + return self._create_single_sample_transcript_counts_density(samples_with_transcript_counts) else: # Multiple samples: Create box plots - return self._create_multi_sample_genes_boxes(samples_with_genes) + return self._create_multi_sample_transcript_counts_boxes(samples_with_transcript_counts) - def _create_single_sample_genes_density(self, samples_with_genes): + def _create_single_sample_transcript_counts_density(self, samples_with_transcript_counts): """Create single sample detected genes per cell density plot""" - s_name, gene_values = next(iter(samples_with_genes.items())) + s_name, gene_values = next(iter(samples_with_transcript_counts.items())) # Create kernel density estimation try: - import numpy as np - if SCIPY_AVAILABLE: from scipy.stats import gaussian_kde @@ -2544,7 +2761,7 @@ def _create_single_sample_genes_density(self, samples_with_genes): config = { "id": "xenium_detected_genes_per_cell_single", "title": "Xenium: Distribution of Detected Genes per Cell", - "xlab": "Number of detected genes per cell", + "xlab": "Detected genes per cell", "ylab": "Density", "smooth_points": 100, } @@ -2553,8 +2770,6 @@ def 
_create_single_sample_genes_density(self, samples_with_genes): except ImportError: # Fallback to histogram if scipy not available - import numpy as np - bins = min(50, len(gene_values) // 20) hist, bin_edges = np.histogram(gene_values, bins=bins) bin_centers = (bin_edges[:-1] + bin_edges[1:]) / 2 @@ -2566,25 +2781,75 @@ def _create_single_sample_genes_density(self, samples_with_genes): config = { "id": "xenium_detected_genes_per_cell_single", "title": "Xenium: Distribution of Detected Genes per Cell", - "xlab": "Number of detected genes per cell", + "xlab": "Detected genes per cell", "ylab": "Number of cells", } return linegraph.plot(plot_data, config) - def _create_multi_sample_genes_boxes(self, samples_with_genes): + def _create_multi_sample_transcript_counts_boxes(self, samples_with_transcript_counts): """Create multi-sample detected genes per cell box plots""" # Prepare data for box plot plot_data = {} - for s_name, gene_values in samples_with_genes.items(): + for s_name, gene_values in samples_with_transcript_counts.items(): plot_data[s_name] = gene_values config = { "id": "xenium_detected_genes_per_cell_multi", "title": "Xenium: Distribution of Detected Genes per Cell", - "ylab": "Number of detected genes per cell", + "ylab": "Detected genes per cell", "boxpoints": False, } return box.plot(plot_data, config) + + def parse_cell_feature_matrix_h5(self, f): + """Parse cell_feature_matrix.h5 file to calculate detected genes per cell""" + if not SCANPY_AVAILABLE: + log.warning( + "scanpy is not available. Cannot process cell_feature_matrix.h5 files. Install scanpy to enable detected genes per cell calculation." 
+ ) + return None + + try: + # Construct full file path + file_path = Path(f["root"]) / f["fn"] + + # Read H5 file using scanpy + adata = sc.read_10x_h5(str(file_path)) + + # Calculate detected genes per cell (number of non-zero genes per cell) + # This matches the notebook's approach: (ad.X != 0).sum(axis=1).A1 + n_genes_per_cell = (adata.X != 0).sum(axis=1).A1 + + result = {} + + # Calculate statistics for detected genes per cell (similar to transcript_counts processing) + if len(n_genes_per_cell) > 0: + detected_genes_stats = { + "min": float(np.min(n_genes_per_cell)), + "q1": float(np.percentile(n_genes_per_cell, 25)), + "median": float(np.median(n_genes_per_cell)), + "q3": float(np.percentile(n_genes_per_cell, 75)), + "max": float(np.max(n_genes_per_cell)), + "mean": float(np.mean(n_genes_per_cell)), + "count": len(n_genes_per_cell), + } + + # Store as gene_transcript_counts_box_stats to replace the current implementation + result["detected_genes_stats"] = detected_genes_stats + + # Also store raw values if needed for single-sample density plots + result["detected_genes_values"] = n_genes_per_cell.tolist() + + log.info(f"Processed {file_path}: {len(n_genes_per_cell)} cells, {adata.n_vars} genes") + log.info( + f"Detected genes per cell - mean: {detected_genes_stats['mean']:.1f}, median: {detected_genes_stats['median']:.1f}" + ) + + return result + + except Exception as e: + log.warning(f"Failed to process {f.get('fn', 'cell_feature_matrix.h5')}: {str(e)}") + return None diff --git a/multiqc/plots/box.py b/multiqc/plots/box.py index f4d4449c90..6d533af72c 100644 --- a/multiqc/plots/box.py +++ b/multiqc/plots/box.py @@ -3,11 +3,11 @@ import copy import json import logging -from typing import Any, Dict, List, Optional, OrderedDict, Tuple, Union, cast +from typing import Any, Dict, List, Mapping, Optional, OrderedDict, Tuple, Union, cast -from natsort import natsorted import plotly.graph_objects as go # type: ignore import polars as pl +from natsort import 
natsorted from multiqc import config, report from multiqc.plots.plot import BaseDataset, NormalizedPlotInputData, PConfig, Plot, PlotType, plot_anchor @@ -32,8 +32,10 @@ def __init__(self, path_in_cfg: Optional[Tuple[str, ...]] = None, **data): super().__init__(path_in_cfg=path_in_cfg or ("boxplot",), **data) -# Type of single box (matching one sample) -BoxT = List[Union[int, float]] +# Type of single box (matching one sample) - can be raw data or statistics +BoxT = Union[List[Union[int, float]], Dict[str, Union[int, float]]] +# Type for statistics dict +BoxStatsT = Dict[str, Union[int, float]] class Dataset(BaseDataset): @@ -41,6 +43,7 @@ class Dataset(BaseDataset): samples: List[str] data_sorted: Optional[List[BoxT]] = None # Sorted version of data samples_sorted: Optional[List[str]] = None # Sorted version of samples + is_stats_data: bool = False # True if data contains pre-calculated statistics def sample_names(self) -> List[SampleName]: return [SampleName(sample) for sample in self.samples] @@ -48,9 +51,15 @@ def sample_names(self) -> List[SampleName]: @staticmethod def create( dataset: BaseDataset, - data_by_sample: Dict[str, BoxT], + data_by_sample: Mapping[str, BoxT], pconfig: Optional[BoxPlotConfig] = None, ) -> "Dataset": + # Detect if we have statistics data or raw data + is_stats_data = False + if data_by_sample: + first_sample_data = next(iter(data_by_sample.values())) + if isinstance(first_sample_data, dict): + is_stats_data = True # Store original order (reversed for box plot display) original_data = list(data_by_sample.values()) original_samples = list(data_by_sample.keys()) @@ -68,12 +77,21 @@ def create( median_values = {} for sample, values in data_by_sample.items(): if values: - sorted_values = sorted(values) - n = len(sorted_values) - median = ( - sorted_values[n // 2] if n % 2 == 1 else (sorted_values[n // 2 - 1] + sorted_values[n // 2]) / 2 - ) - median_values[sample] = median + if is_stats_data: + # Use pre-calculated median from statistics 
+ stats_dict = cast(BoxStatsT, values) + median_values[sample] = stats_dict.get("median", 0) + else: + # Calculate median from raw data + raw_values = cast(List[Union[int, float]], values) + sorted_values = sorted(raw_values) + n = len(sorted_values) + median = ( + sorted_values[n // 2] + if n % 2 == 1 + else (sorted_values[n // 2 - 1] + sorted_values[n // 2]) / 2 + ) + median_values[sample] = median else: median_values[sample] = 0 # Handle empty data @@ -99,12 +117,16 @@ def create( samples=main_samples, data_sorted=data_sorted, samples_sorted=samples_sorted, + is_stats_data=is_stats_data, ) # Determine boxpoints based on PConfig first, then global config, then dynamic logic boxpoints: Union[bool, str] = "outliers" - if pconfig and pconfig.boxpoints is not None: + if is_stats_data: + # For statistics data, we can't show individual points + boxpoints = False + elif pconfig and pconfig.boxpoints is not None: # Use explicit PConfig boxpoints setting boxpoints = pconfig.boxpoints elif config.boxplot_boxpoints is not None: @@ -151,22 +173,46 @@ def create_figure( for sname, values in zip(self.samples, self.data): params = copy.deepcopy(self.trace_params) - fig.add_trace( - go.Box( - x=values, - name=sname, - **params, - ), - ) + + if self.is_stats_data: + # Use statistics to create box plot + stats_dict = cast(BoxStatsT, values) + fig.add_trace( + go.Box( + q1=[stats_dict.get("q1", 0)], + median=[stats_dict.get("median", 0)], + q3=[stats_dict.get("q3", 0)], + lowerfence=[stats_dict.get("min", 0)], + upperfence=[stats_dict.get("max", 0)], + mean=[stats_dict.get("mean", stats_dict.get("median", 0))], + name=sname, + **params, + ), + ) + else: + # Use raw data points + raw_values = cast(List[Union[int, float]], values) + fig.add_trace( + go.Box( + x=raw_values, + name=sname, + **params, + ), + ) return fig def save_data_file(self) -> None: vals_by_sample: Dict[str, BoxT] = {} for sample, values in zip(self.samples, self.data): - vals_by_sample[sample] = values + if 
self.is_stats_data: + # For statistics data, save the statistics dict + vals_by_sample[sample] = values + else: + # For raw data, save the raw values + vals_by_sample[sample] = values report.write_data_file(vals_by_sample, self.uid) - def format_dataset_for_ai_prompt(self, pconfig: PConfig, keep_hidden: bool = True) -> str: # noqa: ARG002 + def format_dataset_for_ai_prompt(self, pconfig: PConfig, keep_hidden: bool = True) -> str: # type: ignore[override] """Format dataset as a markdown table with basic statistics""" prompt = "|Sample|Min|Q1|Median|Q3|Max|Mean|\n" prompt += "|---|---|---|---|---|---|---|\n" @@ -177,22 +223,33 @@ def format_dataset_for_ai_prompt(self, pconfig: PConfig, keep_hidden: bool = Tru for sample, values in zip(self.samples, self.data): # Skip samples with no data - if len(values) == 0: + if (self.is_stats_data and not values) or (not self.is_stats_data and len(values) == 0): continue # Use pseudonym if available, otherwise use original sample name pseudonym = report.anonymize_sample_name(sample) - # Calculate statistics - sorted_vals = sorted(values) - n = len(sorted_vals) - - min_val = sorted_vals[0] - max_val = sorted_vals[-1] - median = sorted_vals[n // 2] if n % 2 == 1 else (sorted_vals[n // 2 - 1] + sorted_vals[n // 2]) / 2 - q1 = sorted_vals[n // 4] if n >= 4 else sorted_vals[0] - q3 = sorted_vals[3 * n // 4] if n >= 4 else sorted_vals[-1] - mean = sum(values) / len(values) + if self.is_stats_data: + # Use pre-calculated statistics + stats_dict = cast(BoxStatsT, values) + min_val = stats_dict.get("min", 0) + max_val = stats_dict.get("max", 0) + median = stats_dict.get("median", 0) + q1 = stats_dict.get("q1", min_val) + q3 = stats_dict.get("q3", max_val) + mean = stats_dict.get("mean", median) + else: + # Calculate statistics from raw data + raw_values = cast(List[Union[int, float]], values) + sorted_vals = sorted(raw_values) + n = len(sorted_vals) + + min_val = sorted_vals[0] + max_val = sorted_vals[-1] + median = sorted_vals[n // 2] if 
n % 2 == 1 else (sorted_vals[n // 2 - 1] + sorted_vals[n // 2]) / 2 + q1 = sorted_vals[n // 4] if n >= 4 else sorted_vals[0] + q3 = sorted_vals[3 * n // 4] if n >= 4 else sorted_vals[-1] + mean = sum(raw_values) / len(raw_values) prompt += ( f"|{pseudonym}|" @@ -207,7 +264,7 @@ def format_dataset_for_ai_prompt(self, pconfig: PConfig, keep_hidden: bool = Tru class BoxPlotInputData(NormalizedPlotInputData): - list_of_data_by_sample: List[Dict[str, BoxT]] + list_of_data_by_sample: List[Mapping[str, BoxT]] pconfig: BoxPlotConfig def is_empty(self) -> bool: @@ -271,7 +328,7 @@ def from_df(cls, df: pl.DataFrame, pconfig: Union[Dict, BoxPlotConfig], anchor: pconf = cast(BoxPlotConfig, BoxPlotConfig.from_df(df)) # Group by dataset_idx to rebuild data structure - list_of_data_by_sample: List[Dict[str, BoxT]] = [] + list_of_data_by_sample: List[Mapping[str, BoxT]] = [] data_labels = [] max_dataset_idx = df.select(pl.col("dataset_idx").max()).item() if not df.is_empty() else 0 @@ -307,7 +364,7 @@ def from_df(cls, df: pl.DataFrame, pconfig: Union[Dict, BoxPlotConfig], anchor: if sample_values: dataset[str(sample_name)] = sample_values - list_of_data_by_sample.append(dataset) + list_of_data_by_sample.append(cast(Mapping[str, BoxT], dataset)) if any(d for d in data_labels if d): pconf.data_labels = data_labels @@ -352,7 +409,7 @@ def merge(cls, old_data: "BoxPlotInputData", new_data: "BoxPlotInputData") -> "B @staticmethod def create( - list_of_data_by_sample: Union[Dict[str, BoxT], List[Dict[str, BoxT]]], + list_of_data_by_sample: Union[Mapping[str, BoxT], List[Mapping[str, BoxT]]], pconfig: Union[Dict[str, Any], BoxPlotConfig, None] = None, ) -> "BoxPlotInputData": pconf: BoxPlotConfig = cast(BoxPlotConfig, BoxPlotConfig.from_pconfig_dict(pconfig)) @@ -391,7 +448,7 @@ def sample_names(self) -> List[SampleName]: @staticmethod def create( - list_of_data_by_sample: List[Dict[str, BoxT]], + list_of_data_by_sample: List[Mapping[str, BoxT]], pconfig: BoxPlotConfig, anchor: Anchor, ) 
-> "BoxPlot": @@ -479,13 +536,24 @@ def from_inputs(inputs: BoxPlotInputData) -> Union["BoxPlot", str, None]: def plot( - list_of_data_by_sample: Union[Dict[str, BoxT], List[Dict[str, BoxT]]], + list_of_data_by_sample: Union[Mapping[str, BoxT], List[Mapping[str, BoxT]]], pconfig: Union[Dict[str, Any], BoxPlotConfig, None] = None, ) -> Union["BoxPlot", str, None]: """ - Plot a box plot. Expects either: - - a dict mapping sample names to data point lists or dicts, - - a dict mapping sample names to a dict of statistics (e.g. {min, max, median, mean, std, q1, q3 etc.}) + Plot a box plot. Supports two input formats: + + 1. Raw data points (traditional): + {'sample1': [1.2, 3.4, 2.1, ...], 'sample2': [2.3, 4.1, ...]} + + 2. Pre-calculated statistics (memory efficient for large datasets): + {'sample1': {'min': 1.0, 'q1': 2.0, 'median': 3.0, 'q3': 4.0, 'max': 5.0, 'mean': 3.2}, + 'sample2': {'min': 1.5, 'q1': 2.2, 'median': 3.1, 'q3': 4.1, 'max': 5.2, 'mean': 3.3}} + + The statistics format dramatically reduces memory usage and file sizes for large datasets + (e.g., cell-level measurements) while producing identical visual output. 
+ + Required statistics keys: min, q1, median, q3, max + Optional statistics keys: mean, count, std """ inputs: BoxPlotInputData = BoxPlotInputData.create(list_of_data_by_sample, pconfig) inputs = BoxPlotInputData.merge_with_previous(inputs) diff --git a/multiqc/plots/linegraph.py b/multiqc/plots/linegraph.py index f3bf8acc0b..b3b72f7984 100644 --- a/multiqc/plots/linegraph.py +++ b/multiqc/plots/linegraph.py @@ -57,6 +57,8 @@ class Series(ValidatedConfig, Generic[KeyT, ValT]): dash: Optional[str] = None showlegend: bool = True marker: Optional[Marker] = None + # Store additional trace parameters that should be passed to Plotly + extra_trace_params: Dict[str, Any] = Field(default_factory=dict) def __init__(self, path_in_cfg: Optional[Tuple[str, ...]] = None, **data): path_in_cfg = path_in_cfg or ("Series",) @@ -75,6 +77,15 @@ def __init__(self, path_in_cfg: Optional[Tuple[str, ...]] = None, **data): tuples.append(p) data["pairs"] = tuples + # Extract extra trace parameters (fields not in the main model) + main_fields = {"name", "pairs", "color", "width", "dash", "showlegend", "marker", "extra_trace_params"} + extra_params = {k: v for k, v in data.items() if k not in main_fields} + if extra_params: + data["extra_trace_params"] = extra_params + # Remove extra params from data to avoid validation errors + for k in extra_params: + data.pop(k) + super().__init__(**data, path_in_cfg=path_in_cfg) if self.dash is not None: @@ -106,6 +117,9 @@ class LinePlotConfig(PConfig): style: Optional[Literal["lines", "lines+markers"]] = None hide_empty: bool = Field(True) colors: Dict[str, str] = {} + dash_styles: Dict[str, str] = {} + hovertemplates: Dict[str, str] = {} + legend_groups: Dict[str, str] = {} @classmethod def parse_extra_series( @@ -223,6 +237,8 @@ def create_figure( }, } params = update_dict(params, self.trace_params, none_only=True) + # Add extra trace parameters from series + params = update_dict(params, series.extra_trace_params, none_only=True) if len(series.pairs) 
== 1: params["mode"] = "lines+markers" # otherwise it's invisible @@ -693,6 +709,9 @@ def _make_series_dict( xmax = pconfig.xmax xmin = pconfig.xmin colors = pconfig.colors + dash_styles = pconfig.dash_styles + hovertemplates = pconfig.hovertemplates + legend_groups = pconfig.legend_groups if data_label: if isinstance(data_label, dict): _x_are_categories = data_label.get("categories", x_are_categories) @@ -713,6 +732,15 @@ def _make_series_dict( _colors = data_label.get("colors") if _colors and isinstance(_colors, dict): colors = {**colors, **cast(Dict[str, str], _colors)} + _dash_styles = data_label.get("dash_styles") + if _dash_styles and isinstance(_dash_styles, dict): + dash_styles = {**dash_styles, **cast(Dict[str, str], _dash_styles)} + _hovertemplates = data_label.get("hovertemplates") + if _hovertemplates and isinstance(_hovertemplates, dict): + hovertemplates = {**hovertemplates, **cast(Dict[str, str], _hovertemplates)} + _legend_groups = data_label.get("legend_groups") + if _legend_groups and isinstance(_legend_groups, dict): + legend_groups = {**legend_groups, **cast(Dict[str, str], _legend_groups)} xs = [x for x in y_by_x.keys()] if not x_are_categories: @@ -761,7 +789,24 @@ def _make_series_dict( if pconfig.smooth_points is not None: pairs = smooth_array(pairs, pconfig.smooth_points) - return Series(name=s, pairs=pairs, color=colors.get(s), path_in_cfg=("lineplot", "pconfig", "pairs")) + # Prepare extra trace parameters for hovertemplate and legendgroup + extra_trace_params = {} + hovertemplate = hovertemplates.get(s) + if hovertemplate: + extra_trace_params["hovertemplate"] = hovertemplate + + legendgroup = legend_groups.get(s) + if legendgroup: + extra_trace_params["legendgroup"] = legendgroup + + return Series( + name=s, + pairs=pairs, + color=colors.get(s), + dash=dash_styles.get(s), + extra_trace_params=extra_trace_params, + path_in_cfg=("lineplot", "pconfig", "pairs"), + ) def smooth_line_data(data_by_sample: DatasetT[KeyT, ValT], numpoints: int) 
-> Dict[SampleName, Dict[KeyT, ValT]]: diff --git a/multiqc/plots/plot.py b/multiqc/plots/plot.py index 9e2ca0e06e..bb41c95e04 100644 --- a/multiqc/plots/plot.py +++ b/multiqc/plots/plot.py @@ -1706,7 +1706,7 @@ def rename_deprecated_highcharts_keys(conf: Dict) -> Dict: conf["yaxis"] = conf.pop("y_ceiling") if "xAxis" in conf: conf["xaxis"] = conf.pop("xAxis") - if "tooltip" in conf: + if "tooltip" in conf and "hovertemplate" not in conf: conf["hovertemplate"] = conf.pop("tooltip") return conf diff --git a/multiqc/search_patterns.yaml b/multiqc/search_patterns.yaml index ab10a5ae2f..b0caaed68c 100644 --- a/multiqc/search_patterns.yaml +++ b/multiqc/search_patterns.yaml @@ -19,6 +19,8 @@ xenium/experiment: num_lines: 50 xenium/cells: fn: "cells.parquet" +xenium/cell_feature_matrix: + fn: "cell_feature_matrix.h5" afterqc: fn: "*.json" contents: "allow_mismatch_in_poly" diff --git a/multiqc/templates/default/assets/js/plots/box.js b/multiqc/templates/default/assets/js/plots/box.js index 34a7fcdb02..b8be08bae7 100644 --- a/multiqc/templates/default/assets/js/plots/box.js +++ b/multiqc/templates/default/assets/js/plots/box.js @@ -3,6 +3,7 @@ class BoxPlot extends Plot { super(dump); this.filteredSettings = []; this.sortSwitchSortedActive = dump["sort_switch_sorted_active"]; + this.isStatsData = dump.datasets && dump.datasets.length > 0 ? dump.datasets[0].is_stats_data : false; } activeDatasetSize() { @@ -44,28 +45,42 @@ class BoxPlot extends Plot { const suffix = this.layout.xaxis.ticksuffix ? " " + this.layout.xaxis.ticksuffix : ""; - // Calculate statistics for each sample + // Format each value and add suffix + let fmt = (val) => { + if (!Number.isFinite(val)) return ""; + const isInt = Number.isInteger(val); + return (isInt ? 
val : parseFloat(val.toFixed(2))) + suffix; + }; + + // Handle statistics or raw data samples.forEach((sample, idx) => { - const values = data[idx].filter((v) => Number.isFinite(v)).sort((a, b) => a - b); - if (values.length === 0) return; - - let n = values.length; - - let min = values[0]; - let max = values[n - 1]; - let median = n % 2 === 1 ? values[Math.floor(n / 2)] : (values[n / 2 - 1] + values[n / 2]) / 2; - let q1 = n >= 4 ? values[Math.floor(n / 4)] : values[0]; - let q3 = n >= 4 ? values[Math.floor((3 * n) / 4)] : values[n - 1]; - let mean = values.reduce((a, b) => a + b, 0) / n; - - // Format each value and add suffix - let fmt = (val) => { - if (!Number.isFinite(val)) return ""; - const isInt = Number.isInteger(val); - return (isInt ? val : parseFloat(val.toFixed(2))) + suffix; - }; - - prompt += `|${sample}|${fmt(min)}|${fmt(q1)}|${fmt(median)}|${fmt(q3)}|${fmt(max)}|${fmt(mean)}|\n`; + if (this.isStatsData) { + // Use pre-calculated statistics + const stats = data[idx]; + const min = stats.min || 0; + const max = stats.max || 0; + const median = stats.median || 0; + const q1 = stats.q1 || min; + const q3 = stats.q3 || max; + const mean = stats.mean || median; + + prompt += `|${sample}|${fmt(min)}|${fmt(q1)}|${fmt(median)}|${fmt(q3)}|${fmt(max)}|${fmt(mean)}|\n`; + } else { + // Calculate statistics for raw data + const values = data[idx].filter((v) => Number.isFinite(v)).sort((a, b) => a - b); + if (values.length === 0) return; + + let n = values.length; + + let min = values[0]; + let max = values[n - 1]; + let median = n % 2 === 1 ? values[Math.floor(n / 2)] : (values[n / 2 - 1] + values[n / 2]) / 2; + let q1 = n >= 4 ? values[Math.floor(n / 4)] : values[0]; + let q3 = n >= 4 ? 
values[Math.floor((3 * n) / 4)] : values[n - 1]; + let mean = values.reduce((a, b) => a + b, 0) / n; + + prompt += `|${sample}|${fmt(min)}|${fmt(q1)}|${fmt(median)}|${fmt(q3)}|${fmt(max)}|${fmt(mean)}|\n`; + } }); return prompt; @@ -103,12 +118,30 @@ class BoxPlot extends Plot { } let values = data[sampleIdx]; - return { - type: "box", - x: values, - name: sample.name, - ...params, - }; + + if (this.isStatsData) { + // Create box plot from statistics + return { + type: "box", + q1: [values.q1 || 0], + median: [values.median || 0], + q3: [values.q3 || 0], + lowerfence: [values.min || 0], + upperfence: [values.max || 0], + mean: [values.mean || values.median || 0], + y: [sample.name], // Add y-coordinate for horizontal box plot positioning + name: sample.name, + ...params, + }; + } else { + // Create box plot from raw data + return { + type: "box", + x: values, + name: sample.name, + ...params, + }; + } }); } @@ -118,9 +151,36 @@ class BoxPlot extends Plot { let delim = format === "tsv" ? 
"\t" : ","; let csv = ""; - for (let i = 0; i < data.length; i++) { - csv += samples[i] + delim + data[i].join(delim) + "\n"; + + if (this.isStatsData) { + // Export statistics as CSV + csv = + "Sample" + delim + "Min" + delim + "Q1" + delim + "Median" + delim + "Q3" + delim + "Max" + delim + "Mean\n"; + for (let i = 0; i < data.length; i++) { + const stats = data[i]; + csv += + samples[i] + + delim + + (stats.min || 0) + + delim + + (stats.q1 || 0) + + delim + + (stats.median || 0) + + delim + + (stats.q3 || 0) + + delim + + (stats.max || 0) + + delim + + (stats.mean || stats.median || 0) + + "\n"; + } + } else { + // Export raw data + for (let i = 0; i < data.length; i++) { + csv += samples[i] + delim + data[i].join(delim) + "\n"; + } } + return csv; } } diff --git a/multiqc/templates/default/assets/js/plots/line.js b/multiqc/templates/default/assets/js/plots/line.js index 070cfecd64..c230fe87fe 100644 --- a/multiqc/templates/default/assets/js/plots/line.js +++ b/multiqc/templates/default/assets/js/plots/line.js @@ -122,6 +122,7 @@ class LinePlot extends Plot { }; } + updateObject(params, line["extra_trace_params"], true); updateObject(params, dataset["trace_params"], true); return { diff --git a/pyproject.toml b/pyproject.toml index efdf8f097e..6b2296e2a2 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -32,6 +32,7 @@ dependencies = [ "jsonschema", "polars-lts-cpu", # for parquet support. 
Using LTS version for compatibility with older architectures "pyarrow", # for parquet support + "scanpy", # to parse h5 files ] requires-python = ">=3.8" authors = [ diff --git a/tests/test_plots.py b/tests/test_plots.py index c437b5dc67..246a8251f3 100644 --- a/tests/test_plots.py +++ b/tests/test_plots.py @@ -1,6 +1,6 @@ import sys import tempfile -from typing import Dict +from typing import Dict, List, Union from unittest.mock import patch import pytest @@ -187,7 +187,7 @@ def test_boxplot_dynamic_boxpoints(): config.box_min_threshold_no_points = 10 config.box_min_threshold_outliers = 5 - data_few = { + data_few: Dict[str, List[Union[int, float]]] = { "Sample1": [1.0, 2.0, 3.0, 4.0, 5.0], "Sample2": [2.0, 3.0, 4.0, 5.0, 6.0], } @@ -206,7 +206,7 @@ def test_boxplot_dynamic_boxpoints(): report.reset() # Test with many samples (should show only outliers) - data_many = {f"Sample{i}": [1.0, 2.0, 3.0, 4.0, 5.0] for i in range(10)} + data_many: Dict[str, List[Union[int, float]]] = {f"Sample{i}": [1.0, 2.0, 3.0, 4.0, 5.0] for i in range(10)} plot_many = _verify_rendered( box.plot( @@ -222,7 +222,7 @@ def test_boxplot_dynamic_boxpoints(): report.reset() # Test with very many samples (should show no points) - data_very_many = {f"Sample{i}": [1.0, 2.0, 3.0, 4.0, 5.0] for i in range(15)} + data_very_many: Dict[str, List[Union[int, float]]] = {f"Sample{i}": [1.0, 2.0, 3.0, 4.0, 5.0] for i in range(15)} plot_very_many = _verify_rendered( box.plot( From 695eca6d37ea9329938b77d72b7849d743192778 Mon Sep 17 00:00:00 2001 From: Chris Hakkaart Date: Fri, 5 Sep 2025 02:38:01 +1200 Subject: [PATCH 17/35] docs: Fix links causing Docusaurus warnings (#3329) --- docs/markdown/getting_started/installation.md | 2 +- multiqc/modules/fastqc/fastqc.py | 2 +- multiqc/modules/featurecounts/featurecounts.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/markdown/getting_started/installation.md b/docs/markdown/getting_started/installation.md index 
de00ae2cff..7371fd9bfb 100644 --- a/docs/markdown/getting_started/installation.md +++ b/docs/markdown/getting_started/installation.md @@ -12,7 +12,7 @@ If you're new to software packaging, this page can be a little overwhelming. If in doubt, a general rule is: - _Running MultiQC in a pipeline?_   Use [Docker](#docker) or [Singularity](#singularity). -- _Running MultiQC locally?_   Use [Pip](#pip) or [Conda](#conda). +- _Running MultiQC locally?_   Use [Pip](#pip--pypi) or [Conda](#conda). :::tip{title="Installation cheat sheet"} diff --git a/multiqc/modules/fastqc/fastqc.py b/multiqc/modules/fastqc/fastqc.py index 414f21cf6c..7fce48ccea 100755 --- a/multiqc/modules/fastqc/fastqc.py +++ b/multiqc/modules/fastqc/fastqc.py @@ -72,7 +72,7 @@ class MultiqcModule(BaseMultiqcModule): ::: You can customise the patterns used for finding these files in your - MultiQC config (see [Module search patterns](#module-search-patterns)). + MultiQC config (see [Module search patterns](https://docs.seqera.io/multiqc/getting_started/config#module-search-patterns)). The below code shows the default file patterns: ```yaml diff --git a/multiqc/modules/featurecounts/featurecounts.py b/multiqc/modules/featurecounts/featurecounts.py index 1c8af3c6c4..581e4f0b85 100755 --- a/multiqc/modules/featurecounts/featurecounts.py +++ b/multiqc/modules/featurecounts/featurecounts.py @@ -36,7 +36,7 @@ class MultiqcModule(BaseMultiqcModule): As of MultiQC v1.10, the module should also work with output from [Rsubread](https://bioconductor.org/packages/release/bioc/html/Rsubread.html). Note that your filenames must end in `.summary` to be discovered. - See [Module search patterns](#module-search-patterns) for how to customise this. + See [Module search patterns](https://docs.seqera.io/multiqc/getting_started/config#module-search-patterns) for how to customise this. Please note that if files are in "Rsubread mode" then lines will be split by any whitespace, instead of tab characters. 
As such, filenames with spaces in will From e438655d0ea4353e825fc8a41c90007a0796a83c Mon Sep 17 00:00:00 2001 From: Vlad Savelyev Date: Thu, 4 Sep 2025 17:31:06 +0200 Subject: [PATCH 18/35] Unique series label for each plot type (#3330) * Fixes * Unique series label for each plot type --- multiqc/modules/fastqc/fastqc.py | 2 +- multiqc/modules/xenium/xenium.py | 5 ++-- multiqc/plots/plot.py | 39 +++++++++++++++++++++++++------- 3 files changed, 34 insertions(+), 12 deletions(-) diff --git a/multiqc/modules/fastqc/fastqc.py b/multiqc/modules/fastqc/fastqc.py index 7fce48ccea..8ddfa4a869 100755 --- a/multiqc/modules/fastqc/fastqc.py +++ b/multiqc/modules/fastqc/fastqc.py @@ -1320,7 +1320,7 @@ def adapter_content_plot(self, status_checks: bool = True): "y_minrange": 5, "ymin": 0, "tt_label": "Base {point.x}: {point.y:.2f}%", - "hide_zero_cats": True, + "hide_empty": True, "series_label": "sample-adapter combinations", } if status_checks: diff --git a/multiqc/modules/xenium/xenium.py b/multiqc/modules/xenium/xenium.py index cfcf71a1b3..35a47d37b5 100644 --- a/multiqc/modules/xenium/xenium.py +++ b/multiqc/modules/xenium/xenium.py @@ -2208,11 +2208,10 @@ def _create_single_sample_molecules_plot(self, sample_data, bins, bin_centers, s "xlab": "Number of transcripts per gene", "ylab": "Number of features", "xlog": True, + "series_label": False, } # Use same color for genes and controls from same sample (distinguished by line style) - from multiqc.utils import mqc_colour - scale = mqc_colour.mqc_colour_scale("plot_defaults") sample_color = scale.get_colour(0, lighten=1) # Use first color for single sample @@ -2328,7 +2327,7 @@ def _create_multi_sample_molecules_plot( "title": "Xenium: Distribution of Transcripts per Gene", "xlab": "Number of transcripts per gene", "ylab": "Number of features", - "series_label": None, + "series_label": False, "xlog": True, "x_decimals": 0, } diff --git a/multiqc/plots/plot.py b/multiqc/plots/plot.py index bb41c95e04..f7e284ec3d 100644 --- 
a/multiqc/plots/plot.py +++ b/multiqc/plots/plot.py @@ -43,6 +43,29 @@ check_plotly_version() + +def _get_series_label(plot_type: PlotType, series_label: Union[str, bool]) -> str: + """ + Get the appropriate series label for a plot type. + If series_label is the default "samples", return a plot type-specific label. + Otherwise, return the custom series_label as-is. + """ + if series_label != "samples": + return str(series_label) + + # Map plot types to their specific series labels + plot_type_labels = { + PlotType.LINE: "lines", + PlotType.BAR: "bars", + PlotType.BOX: "boxes", + PlotType.SCATTER: "points", + PlotType.HEATMAP: "samples", # heatmaps typically show samples + PlotType.VIOLIN: "samples", # violins keep the default "samples" + } + + return plot_type_labels.get(plot_type, "samples") # fallback for unknown plot types + + # Create and register MultiQC default Plotly template multiqc_plotly_template = dict( layout=go.Layout( @@ -187,7 +210,7 @@ class PConfig(ValidatedConfig): y_bands: Optional[List[LineBand]] = None x_lines: Optional[List[FlatLine]] = None y_lines: Optional[List[FlatLine]] = None - series_label: str = "samples" + series_label: Union[str, bool] = "samples" flat_if_very_large: bool = True @classmethod @@ -560,7 +583,6 @@ def initialize( axis_controlled_by_switches: Optional[List[str]] = None, default_tt_label: Optional[str] = None, defer_render_if_large: bool = True, - series_label: Optional[str] = None, n_samples_per_dataset: Optional[List[int]] = None, ) -> "Plot[DatasetT, PConfigT]": """ @@ -573,8 +595,7 @@ def initialize( log10 scale and percentage switch buttons, e.g. ["yaxis"] :param default_tt_label: default tooltip label :param defer_render_if_large: whether to defer rendering if the number of data points is large - :param series_label: label for the series, e.g. 
"samples" or "statuses" - :param n_samples_per_dataset: number of actual samples for each dataset (assumes series_label are samples) + :param n_samples_per_dataset: number of actual samples for each dataset (assumes series_label from pconfig are samples) """ if len(n_series_per_dataset) == 0: raise ValueError("No datasets to plot") @@ -725,10 +746,12 @@ def initialize( n_samples = n_samples_per_dataset[idx] else: n_samples = 0 - if n_samples > 1 and series_label: - subtitles += [f"{n_samples} {pconfig.series_label}"] - elif n_series > 1 and series_label: - subtitles += [f"{n_series} {pconfig.series_label}"] + if n_samples > 1 and pconfig.series_label: + series_label = _get_series_label(plot_type, pconfig.series_label) + subtitles += [f"{n_samples} {series_label}"] + elif n_series > 1 and pconfig.series_label: + series_label = _get_series_label(plot_type, pconfig.series_label) + subtitles += [f"{n_series} {series_label}"] if subtitles: dconfig["subtitle"] = ", ".join(subtitles) From be6cb4c787d739fa153f7819c5a17ccc887a2032 Mon Sep 17 00:00:00 2001 From: Vlad Savelyev Date: Fri, 5 Sep 2025 10:43:28 +0200 Subject: [PATCH 19/35] Docs: cross-link sample renaming docs (fixes https://github.com/MultiQC/MultiQC/issues/3316) --- docs/markdown/getting_started/config.md | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/docs/markdown/getting_started/config.md b/docs/markdown/getting_started/config.md index 2889ab91e3..ee807e9751 100644 --- a/docs/markdown/getting_started/config.md +++ b/docs/markdown/getting_started/config.md @@ -289,6 +289,14 @@ sample_names_rename: Each entry is a pair of [from, to] values that will be applied to sample names. +### Advanced sample name replacement + +For more powerful sample name replacement options, including regex support and different replacement modes, see the [Sample name replacement](../reports/customisation.md#sample-name-replacement) section in the report customisation documentation. 
These advanced features include: + +- `sample_names_replace` - Direct pattern-to-replacement mapping in config files +- `sample_names_replace_regex` - Regular expression support for complex patterns +- `sample_names_replace_exact` and `sample_names_replace_complete` - Fine-tuned matching behavior + ## Module search patterns Many bioinformatics tools have standard output formats, filenames and other From 9cd6c06ce84e9c386e4ad3c91241fa2c2237e4f3 Mon Sep 17 00:00:00 2001 From: Vlad Savelyev Date: Fri, 5 Sep 2025 12:40:11 +0200 Subject: [PATCH 20/35] Rename `BETA-multiqc.parquet` to `multiqc.parquet` (#3332) --- docs/markdown/usage/downstream.md | 16 ++++++++-------- docs/markdown/usage/scripts.md | 4 ++-- multiqc/core/tmp_dir.py | 2 +- tests/test_interactive.py | 2 +- tests/test_rerun.py | 4 ++-- 5 files changed, 14 insertions(+), 14 deletions(-) diff --git a/docs/markdown/usage/downstream.md b/docs/markdown/usage/downstream.md index 731e6141a9..c1926960a6 100644 --- a/docs/markdown/usage/downstream.md +++ b/docs/markdown/usage/downstream.md @@ -7,7 +7,7 @@ description: How to use MultiQC raw data outputs Whilst MultiQC is typically used as a final reporting step in an analysis, it can also be used as an intermediate in your analysis. -MultiQC saves a directory of machine-readable outputs called `multiqc_data/`. In here there are files from each module and table, as well as a verbose `multiqc.log` file and, a `BETA-multiqc.parquet` file that contains all the intermediate data and metadata needed to regenereate a report. +MultiQC saves a directory of machine-readable outputs called `multiqc_data/`. In here there are files from each module and table, as well as a verbose `multiqc.log` file and, a `multiqc.parquet` file that contains all the intermediate data and metadata needed to regenereate a report. 
Most of these files are tab-separated `.tsv` files by default, but you can choose to have them as JSON, YAML if you prefer with the `-k`/`--data-format` flag or the `data_format` option in a config file. @@ -316,7 +316,7 @@ ChronQC is a quality control (QC) tracking system for clinical implementation of ## MultiQC Parquet Output (BETA) -Starting from version 1.29, MultiQC writes out all plot and table data in a standardized Apache Parquet file format (`BETA-multiqc.parquet`) in the `multiqc_data` directory. This feature provides several significant benefits: +Starting from version 1.29, MultiQC writes out all plot and table data in a standardized Apache Parquet file format (`multiqc.parquet`) in the `multiqc_data` directory. This feature provides several significant benefits: - **Persistence**: The parquet file contains all the data necessary to regenerate MultiQC reports without needing access to the original analysis files - **Reusability**: The data is structured in a way that's optimized for cross-run analysis and data warehousing @@ -328,7 +328,7 @@ Note that the format is unstable as of 1.29 may change in 1.30, where it will be ### Parquet File Structure -The `BETA-multiqc.parquet` file contains several different types of rows that can be distinguished by the `type` column: +The `multiqc.parquet` file contains several different types of rows that can be distinguished by the `type` column: 1. 
**`run_metadata`**: Contains metadata about the MultiQC run, including: @@ -385,7 +385,7 @@ To explore the structure programmatically: import polars as pl # Load the parquet file -df = pl.read_parquet("multiqc_data/BETA-multiqc.parquet") +df = pl.read_parquet("multiqc_data/multiqc.parquet") # Get unique row types print(df.select("type").unique()) @@ -419,7 +419,7 @@ Developers can use these relationships to reconstruct the full structure of the One of the key benefits of the parquet output is the ability to regenerate MultiQC reports without needing the original data files: ```bash -multiqc multiqc_data/BETA-multiqc.parquet +multiqc multiqc_data/multiqc.parquet ``` This will load all the data from the parquet file and generate a new report. @@ -435,7 +435,7 @@ The parquet output enables easy aggregation of data from multiple MultiQC runs: multiqc /path/to/analysis1/ -o run1_output # Run MultiQC on both the second set of data and the parquet from the first run -multiqc /path/to/analysis2/ run1_output/multiqc_data/BETA-multiqc.parquet -o combined_output +multiqc /path/to/analysis2/ run1_output/multiqc_data/multiqc.parquet -o combined_output ``` This will generate a report containing data from both runs. You can combine any number of parquet files with new data in a single command. 
@@ -448,7 +448,7 @@ For programmatic access to MultiQC data, you can use the Python API to load parq import multiqc # Load data from a parquet file -multiqc.parse_logs('multiqc_data/BETA-multiqc.parquet') +multiqc.parse_logs('multiqc_data/multiqc.parquet') # List loaded modules and access data modules = multiqc.list_modules() @@ -465,7 +465,7 @@ import polars as pl from pyiceberg.catalog import load_catalog # Load the MultiQC parquet file -multiqc_df = pl.read_parquet("multiqc_data/BETA-multiqc.parquet") +multiqc_df = pl.read_parquet("multiqc_data/multiqc.parquet") # Configure and load Iceberg catalog catalog = load_catalog( diff --git a/docs/markdown/usage/scripts.md b/docs/markdown/usage/scripts.md index a3cb2ee329..1ba7d4907a 100644 --- a/docs/markdown/usage/scripts.md +++ b/docs/markdown/usage/scripts.md @@ -102,12 +102,12 @@ multiqc.parse_logs( ) ``` -MultiQC v1.29 and higher generates `BETA-multiqc.parquet` file in `multiqc_data` output directory. You can pass that file to `parse_logs`, and it will load that previous MultiQC run into memory. +MultiQC v1.29 and higher generates `multiqc.parquet` file in `multiqc_data` output directory. You can pass that file to `parse_logs`, and it will load that previous MultiQC run into memory. 
Example: ```python -multiqc.parse_logs('multiqc_data/BETA-multiqc.parquet') +multiqc.parse_logs('multiqc_data/multiqc.parquet') ``` ## List what's loaded diff --git a/multiqc/core/tmp_dir.py b/multiqc/core/tmp_dir.py index e537ccfd61..8c9e628747 100644 --- a/multiqc/core/tmp_dir.py +++ b/multiqc/core/tmp_dir.py @@ -49,7 +49,7 @@ def parquet_file() -> Path: """ Returns the path to the combined parquet file that contains all plot data """ - return data_tmp_dir() / "BETA-multiqc.parquet" + return data_tmp_dir() / "multiqc.parquet" def new_tmp_dir(): diff --git a/tests/test_interactive.py b/tests/test_interactive.py index 283ed016c1..53437e7660 100644 --- a/tests/test_interactive.py +++ b/tests/test_interactive.py @@ -119,7 +119,7 @@ def test_parse_parquet(tmp_path): assert multiqc.list_samples() == [] # Load data from the JSON file - multiqc.parse_logs(tmp_path / "multiqc_data" / "BETA-multiqc.parquet") + multiqc.parse_logs(tmp_path / "multiqc_data" / "multiqc.parquet") # Verify data was loaded correctly assert "sample1" in multiqc.list_samples() diff --git a/tests/test_rerun.py b/tests/test_rerun.py index 9c58f5093b..e4b3f45aba 100644 --- a/tests/test_rerun.py +++ b/tests/test_rerun.py @@ -30,7 +30,7 @@ def test_rerun_parquet(data_dir, tmp_path): # Run 2: Run on the intermediate data from run1 run_b_dir = tmp_path / "run_b" run_b_dir.mkdir() - multiqc.run(run_a_dir / "multiqc_data" / "BETA-multiqc.parquet", cfg=ClConfig(output_dir=run_b_dir, strict=True)) + multiqc.run(run_a_dir / "multiqc_data" / "multiqc.parquet", cfg=ClConfig(output_dir=run_b_dir, strict=True)) # Compare reports with open(run_b_dir / "multiqc_data" / "multiqc_data.json") as f: @@ -64,7 +64,7 @@ def test_rerun_and_combine(data_dir, tmp_path): run_combined_dir = tmp_path / "run_combined" multiqc.run( data_dir / "modules/fastp/SAMPLE.json", - run_a_dir / "multiqc_data" / "BETA-multiqc.parquet", + run_a_dir / "multiqc_data" / "multiqc.parquet", cfg=ClConfig(output_dir=run_combined_dir, strict=True), ) 
From c5949d9272703634941a7f946c29bfd92dd9b4a7 Mon Sep 17 00:00:00 2001 From: Vlad Savelyev Date: Fri, 5 Sep 2025 12:50:12 +0200 Subject: [PATCH 21/35] Cutadapt: improve sample name extraction for stdin input (#3333) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When cutadapt processes input from stdin (e.g., /dev/fd/*, -), the original sample name extraction would use these unusable paths as sample names. This fix implements a fallback strategy: 1. Use input paths if they are valid file names (original behavior) 2. Extract sample names from output arguments (--output, --paired-output) when input is from stdin 3. Fall back to JSON filename if other methods fail Fixes cases where stdin input would result in sample names like "/dev/fd/63" instead of meaningful names like "SRR8615409". 🤖 Generated with [Claude Code](https://claude.ai/code) Co-authored-by: Claude --- multiqc/modules/cutadapt/cutadapt.py | 49 +++++++++++++++++++++++++++- 1 file changed, 48 insertions(+), 1 deletion(-) diff --git a/multiqc/modules/cutadapt/cutadapt.py b/multiqc/modules/cutadapt/cutadapt.py index 7cca65022a..f1424c3076 100755 --- a/multiqc/modules/cutadapt/cutadapt.py +++ b/multiqc/modules/cutadapt/cutadapt.py @@ -123,12 +123,59 @@ def version_parse(v): if pairs_filtered_unexplained > 0: self.cutadapt_data[s_name]["pairs_filtered_unexplained"] = pairs_filtered_unexplained + def _extract_sample_name_from_json(self, data, f): + """Extract sample name from JSON data, handling stdin input cases""" + # First try to get sample name from input paths (original method) + input_paths = [v for k, v in data["input"].items() if k.startswith("path") and v] + + # Check if all input paths are stdin-like (e.g., /dev/fd/*, -, stdin) + stdin_patterns = ["/dev/fd/", "/dev/stdin", "stdin", "-"] + is_stdin = ( + all(any(pattern in str(path) for pattern in stdin_patterns) or str(path) == "-" for path in input_paths) + if input_paths + else False + ) + + if not is_stdin 
and input_paths: + # Use original method if we have valid file paths + return self.clean_s_name(input_paths, f=f) + + # If input is from stdin, try to extract sample name from output arguments + if "command_line_arguments" in data: + args = data["command_line_arguments"] + + # Look for output parameters that might contain sample names + output_args = ["--output", "-o", "--paired-output", "-p"] + + for i, arg in enumerate(args): + if arg in output_args and i + 1 < len(args): + output_path = args[i + 1] + # Extract sample name from output path + sample_name = self.clean_s_name([output_path], f=f) + if sample_name: + return sample_name + + # Fall back to using the JSON filename + json_filename = f["fn"] + if json_filename.endswith(".json"): + # Remove .json extension and clean the name + base_name = json_filename[:-5] # Remove .json + # Remove common cutadapt suffixes + for suffix in [".cutadapt", "_cutadapt", "-cutadapt"]: + if base_name.endswith(suffix): + base_name = base_name[: -len(suffix)] + break + return self.clean_s_name([base_name], f=f) + + # Final fallback + return f["s_name"] + def parse_json(self, f): path = os.path.join(f["root"], f["fn"]) with open(path, "r") as fh: data = json.load(fh) - s_name = SampleName(self.clean_s_name([v for k, v in data["input"].items() if k.startswith("path") and v], f=f)) + s_name = SampleName(self._extract_sample_name_from_json(data, f)) if s_name in self.cutadapt_data: log.debug(f"Duplicate sample name found! 
Overwriting: {s_name}") From 09bbf1f47f38b127609815fcf50d162cac42dc0c Mon Sep 17 00:00:00 2001 From: Vlad Savelyev Date: Fri, 5 Sep 2025 12:57:26 +0200 Subject: [PATCH 22/35] Bump v1.31 (#3331) * Bump version * Fix finding repos for current milestone * Changelog * New module docs * Cross-link sample renaming docs (fixes https://github.com/MultiQC/MultiQC/issues/3316) * Changelog --- CHANGELOG.md | 51 +++ docs/markdown/modules.mdx | 423 +++++++++++------- docs/markdown/modules/adapterremoval.md | 5 +- docs/markdown/modules/afterqc.md | 5 +- docs/markdown/modules/anglerfish.md | 5 +- docs/markdown/modules/ataqv.md | 5 +- docs/markdown/modules/bakta.md | 5 +- docs/markdown/modules/bamdst.md | 5 +- docs/markdown/modules/bamtools.md | 5 +- docs/markdown/modules/bases2fastq.md | 5 +- docs/markdown/modules/bbduk.md | 5 +- docs/markdown/modules/bbmap.md | 5 +- docs/markdown/modules/bcftools.md | 5 +- docs/markdown/modules/bcl2fastq.md | 5 +- docs/markdown/modules/bclconvert.md | 5 +- docs/markdown/modules/biobambam2.md | 5 +- docs/markdown/modules/biobloomtools.md | 5 +- docs/markdown/modules/biscuit.md | 5 +- docs/markdown/modules/bismark.md | 5 +- docs/markdown/modules/bowtie1.md | 5 +- docs/markdown/modules/bowtie2.md | 5 +- docs/markdown/modules/busco.md | 5 +- docs/markdown/modules/bustools.md | 5 +- docs/markdown/modules/ccs.md | 5 +- docs/markdown/modules/cellranger.md | 5 +- docs/markdown/modules/cellranger_arc.md | 5 +- docs/markdown/modules/cells2stats.md | 7 +- docs/markdown/modules/checkm.md | 5 +- docs/markdown/modules/checkm2.md | 5 +- docs/markdown/modules/checkqc.md | 5 +- docs/markdown/modules/clipandmerge.md | 5 +- docs/markdown/modules/clusterflow.md | 5 +- docs/markdown/modules/conpair.md | 5 +- docs/markdown/modules/cutadapt.md | 5 +- docs/markdown/modules/damageprofiler.md | 5 +- docs/markdown/modules/dedup.md | 5 +- docs/markdown/modules/deeptools.md | 5 +- docs/markdown/modules/diamond.md | 5 +- docs/markdown/modules/disambiguate.md | 5 +- 
docs/markdown/modules/dragen.md | 5 +- docs/markdown/modules/dragen_fastqc.md | 5 +- .../modules/eigenstratdatabasetools.md | 5 +- docs/markdown/modules/fastp.md | 5 +- docs/markdown/modules/fastq_screen.md | 5 +- docs/markdown/modules/fastqc.md | 7 +- docs/markdown/modules/featurecounts.md | 7 +- docs/markdown/modules/fgbio.md | 5 +- docs/markdown/modules/filtlong.md | 5 +- docs/markdown/modules/flash.md | 5 +- docs/markdown/modules/flexbar.md | 5 +- docs/markdown/modules/freyja.md | 5 +- docs/markdown/modules/ganon.md | 5 +- docs/markdown/modules/gatk.md | 5 +- docs/markdown/modules/gffcompare.md | 5 +- docs/markdown/modules/glimpse.md | 5 +- docs/markdown/modules/goleft_indexcov.md | 5 +- docs/markdown/modules/gopeaks.md | 5 +- docs/markdown/modules/gtdbtk.md | 5 +- docs/markdown/modules/haplocheck.md | 5 +- docs/markdown/modules/happy.md | 5 +- docs/markdown/modules/hicexplorer.md | 5 +- docs/markdown/modules/hicpro.md | 5 +- docs/markdown/modules/hicup.md | 5 +- docs/markdown/modules/hifiasm.md | 5 +- docs/markdown/modules/hisat2.md | 5 +- docs/markdown/modules/homer.md | 5 +- docs/markdown/modules/hops.md | 5 +- docs/markdown/modules/hostile.md | 5 +- docs/markdown/modules/htseq.md | 5 +- docs/markdown/modules/humid.md | 5 +- docs/markdown/modules/interop.md | 5 +- docs/markdown/modules/isoseq.md | 5 +- docs/markdown/modules/ivar.md | 5 +- docs/markdown/modules/jcvi.md | 5 +- docs/markdown/modules/jellyfish.md | 5 +- docs/markdown/modules/kaiju.md | 5 +- docs/markdown/modules/kallisto.md | 5 +- docs/markdown/modules/kat.md | 5 +- docs/markdown/modules/kraken.md | 5 +- docs/markdown/modules/leehom.md | 5 +- docs/markdown/modules/librarian.md | 5 +- docs/markdown/modules/lima.md | 5 +- docs/markdown/modules/longranger.md | 5 +- docs/markdown/modules/macs2.md | 5 +- docs/markdown/modules/malt.md | 5 +- docs/markdown/modules/mapdamage.md | 5 +- docs/markdown/modules/megahit.md | 5 +- docs/markdown/modules/metaphlan.md | 5 +- docs/markdown/modules/methylqa.md | 5 
+- docs/markdown/modules/mgikit.md | 5 +- docs/markdown/modules/minionqc.md | 5 +- docs/markdown/modules/mirtop.md | 5 +- docs/markdown/modules/mirtrace.md | 5 +- docs/markdown/modules/mosaicatcher.md | 5 +- docs/markdown/modules/mosdepth.md | 5 +- docs/markdown/modules/motus.md | 5 +- docs/markdown/modules/mtnucratio.md | 5 +- docs/markdown/modules/multivcfanalyzer.md | 5 +- docs/markdown/modules/nanoq.md | 5 +- docs/markdown/modules/nanostat.md | 5 +- docs/markdown/modules/nextclade.md | 5 +- docs/markdown/modules/ngsbits.md | 5 +- docs/markdown/modules/ngsderive.md | 5 +- docs/markdown/modules/nonpareil.md | 5 +- docs/markdown/modules/odgi.md | 5 +- docs/markdown/modules/optitype.md | 5 +- docs/markdown/modules/pairtools.md | 5 +- docs/markdown/modules/pangolin.md | 5 +- docs/markdown/modules/pbmarkdup.md | 5 +- docs/markdown/modules/peddy.md | 5 +- docs/markdown/modules/percolator.md | 5 +- docs/markdown/modules/phantompeakqualtools.md | 5 +- docs/markdown/modules/picard.md | 5 +- docs/markdown/modules/porechop.md | 5 +- docs/markdown/modules/preseq.md | 5 +- docs/markdown/modules/prinseqplusplus.md | 5 +- docs/markdown/modules/prokka.md | 5 +- docs/markdown/modules/purple.md | 5 +- docs/markdown/modules/pychopper.md | 5 +- docs/markdown/modules/pycoqc.md | 5 +- docs/markdown/modules/qc3C.md | 5 +- docs/markdown/modules/qorts.md | 5 +- docs/markdown/modules/qualimap.md | 5 +- docs/markdown/modules/quast.md | 5 +- docs/markdown/modules/rna_seqc.md | 5 +- docs/markdown/modules/rockhopper.md | 5 +- docs/markdown/modules/rsem.md | 5 +- docs/markdown/modules/rseqc.md | 5 +- docs/markdown/modules/salmon.md | 5 +- docs/markdown/modules/sambamba.md | 5 +- docs/markdown/modules/samblaster.md | 5 +- docs/markdown/modules/samtools.md | 5 +- docs/markdown/modules/sargasso.md | 5 +- docs/markdown/modules/seqera_cli.md | 5 +- docs/markdown/modules/seqfu.md | 48 ++ docs/markdown/modules/sequali.md | 5 +- docs/markdown/modules/seqwho.md | 5 +- 
docs/markdown/modules/seqyclean.md | 5 +- docs/markdown/modules/sexdeterrmine.md | 5 +- docs/markdown/modules/sickle.md | 5 +- docs/markdown/modules/skewer.md | 5 +- docs/markdown/modules/slamdunk.md | 5 +- docs/markdown/modules/snippy.md | 5 +- docs/markdown/modules/snpeff.md | 5 +- docs/markdown/modules/snpsplit.md | 5 +- docs/markdown/modules/somalier.md | 5 +- docs/markdown/modules/sortmerna.md | 5 +- docs/markdown/modules/sourmash.md | 5 +- docs/markdown/modules/spaceranger.md | 5 +- docs/markdown/modules/stacks.md | 5 +- docs/markdown/modules/star.md | 5 +- docs/markdown/modules/supernova.md | 5 +- docs/markdown/modules/telseq.md | 5 +- docs/markdown/modules/theta2.md | 5 +- docs/markdown/modules/tophat.md | 5 +- docs/markdown/modules/trimmomatic.md | 5 +- docs/markdown/modules/truvari.md | 7 +- docs/markdown/modules/umicollapse.md | 5 +- docs/markdown/modules/umitools.md | 9 +- docs/markdown/modules/varscan2.md | 11 +- docs/markdown/modules/vcftools.md | 5 +- docs/markdown/modules/vep.md | 5 +- docs/markdown/modules/verifybamid.md | 5 +- docs/markdown/modules/vg.md | 5 +- docs/markdown/modules/whatshap.md | 5 +- docs/markdown/modules/xengsort.md | 5 +- docs/markdown/modules/xenium.md | 49 ++ docs/markdown/modules/xenome.md | 5 +- multiqc/modules/seqfu/seqfu.py | 2 +- pyproject.toml | 2 +- scripts/print_changelog.py | 60 ++- 171 files changed, 950 insertions(+), 523 deletions(-) create mode 100644 docs/markdown/modules/seqfu.md create mode 100644 docs/markdown/modules/xenium.md diff --git a/CHANGELOG.md b/CHANGELOG.md index b0782f743e..d56bd88a53 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,56 @@ # MultiQC Version History +## [MultiQC v1.31](https://github.com/MultiQC/MultiQC/releases/tag/v1.31) - 2025-09-05 + +Adding new module for [Xenium analysis](https://www.10xgenomics.com/products/xenium-analysis), 10x Genomics Xenium spatial transcriptomics quality control report. 
+
+The parquet format is stable since 1.29, renaming the output file from `BETA-multiqc.parquet` to `multiqc.parquet`.
+
+### New modules
+
+- Xenium QC ([#3276](https://github.com/MultiQC/MultiQC/pull/3276), [#3313](https://github.com/MultiQC/MultiQC/pull/3313), [#3323](https://github.com/MultiQC/MultiQC/pull/3323))
+- Seqfu: `stats` command - FASTA/FASTQ files stats ([#3271](https://github.com/MultiQC/MultiQC/pull/3271))
+
+### Feature updates and improvements
+
+- Add `return_html` parameter for programmatic HTML access ([#3304](https://github.com/MultiQC/MultiQC/pull/3304))
+- File search optimization: avoid reading files when contents exclusion patterns are not
+  provided ([#3312](https://github.com/MultiQC/MultiQC/pull/3312))
+- Rename `BETA-multiqc.parquet` to `multiqc.parquet` ([#3332](https://github.com/MultiQC/MultiQC/pull/3332))
+
+### Fixes
+
+- Scatter plot: fix hiding dots by legend click ([#3321](https://github.com/MultiQC/MultiQC/pull/3321))
+- Plots: set unique series label for each plot type ([#3330](https://github.com/MultiQC/MultiQC/pull/3330))
+- Fix bulk sample renaming buttons ([#3300](https://github.com/MultiQC/MultiQC/pull/3300))
+- Fix config flag types in schema ([#3318](https://github.com/MultiQC/MultiQC/pull/3318))
+- Ignore pyc files when copying html files ([#3320](https://github.com/MultiQC/MultiQC/pull/3320))
+
+### Module updates
+
+- Picard tools: enhance QualityByCycleMetrics to support original quality scores ([#3307](https://github.com/MultiQC/MultiQC/pull/3307))
+- STAR: improve module color scheme for better accessibility ([#3305](https://github.com/MultiQC/MultiQC/pull/3305), [#3306](https://github.com/MultiQC/MultiQC/pull/3306))
+- Sequali: support insert size metrics ([#3303](https://github.com/MultiQC/MultiQC/pull/3303))
+- Missing modules to general stats: Busco, CheckM, CheckM2, GTDB-Tk ([#3289](https://github.com/MultiQC/MultiQC/pull/3289))
+- cells2stats: add support for optical pooled screening output
([#3277](https://github.com/MultiQC/MultiQC/pull/3277)) +- fastp: add before-filtering mean r1/r2 length to general stats ([#3280](https://github.com/MultiQC/MultiQC/pull/3280)) +- Cutadapt: improve sample name extraction for stdin input ([#3333](https://github.com/MultiQC/MultiQC/pull/3333)) + +### Module fixes + +- bases2fastq: fix index error ([#3328](https://github.com/MultiQC/MultiQC/pull/3328)) +- Picard: fix VariantCallingMetrics to support sample renaming ([#3298](https://github.com/MultiQC/MultiQC/pull/3298)) +- fastp: fix sample naming for paired-end reads ([#3302](https://github.com/MultiQC/MultiQC/pull/3302)) +- bcftools: fix singleton count calculation to include indels ([#3295](https://github.com/MultiQC/MultiQC/pull/3295)) +- CheckM2: column spelling fix ([#3283](https://github.com/MultiQC/MultiQC/pull/3283)) +- Bulk replace deprecated `hide_empty` with `hide_zero_cats` ([#3296](https://github.com/MultiQC/MultiQC/pull/3296)) + +### Infrastructure and packaging + +- WASM workaround: if `write_parquet` not supported by polars, write a CSV file ([#3309](https://github.com/MultiQC/MultiQC/pull/3309)) +- Add Claude instructions ([#3301](https://github.com/MultiQC/MultiQC/pull/3301)) +- Add Claude review action ([#3299](https://github.com/MultiQC/MultiQC/pull/3299)) + ## [MultiQC v1.30](https://github.com/MultiQC/MultiQC/releases/tag/v1.30) - 2025-07-09 Minor improvements and fixes. diff --git a/docs/markdown/modules.mdx b/docs/markdown/modules.mdx index 451df5b392..f048054a55 100644 --- a/docs/markdown/modules.mdx +++ b/docs/markdown/modules.mdx @@ -10,7 +10,7 @@ This file is autogenerated. Do not edit the markdown, it will be overwritten. ~~~~~~~~~~~~~~~~~~~~~~~ --> -MultiQC currently has modules to support 165 bioinformatics tools, listed below. +MultiQC currently has modules to support 167 bioinformatics tools, listed below. Click the tool name to go to the MultiQC documentation for that tool. 
@@ -27,218 +27,243 @@ import MultiqcModules from "@site/src/components/MultiqcModules"; data: { name: "Adapter Removal", summary: - "Removes adapter sequences, trims low quality bases from 3' ends, or merges overlapping pairs into consensus", + "

Removes adapter sequences, trims low quality bases from 3' ends, or merges overlapping pairs into consensus.

", }, }, { id: "modules/afterqc", data: { name: "AfterQC", - summary: "Automatic filtering, trimming, error removing, and quality control for FastQ data.", + summary: "

Automatic filtering, trimming, error removing, and quality control for FastQ data.

", }, }, { id: "modules/anglerfish", data: { name: "Anglerfish", - summary: "Quality controls Illumina libraries sequenced on Oxford Nanopore flowcells", + summary: "

Quality controls Illumina libraries sequenced on Oxford Nanopore flowcells.

", }, }, { id: "modules/ataqv", - data: { name: "ATAQV", summary: "Toolkit for quality control and visualization of ATAC-seq data" }, + data: { name: "ATAQV", summary: "

Toolkit for quality control and visualization of ATAC-seq data.

" }, }, { id: "modules/bakta", - data: { name: "Bakta", summary: "Rapid & standardized annotation of bacterial genomes, MAGs & plasmids" }, + data: { + name: "Bakta", + summary: "

Rapid & standardized annotation of bacterial genomes, MAGs & plasmids.

", + }, }, { id: "modules/bamdst", - data: { name: "Bamdst", summary: "Lightweight tool to stat the depth coverage of target regions of BAM file(s)" }, + data: { + name: "Bamdst", + summary: "

Lightweight tool to stat the depth coverage of target regions of BAM file(s).

", + }, }, { id: "modules/bamtools", data: { name: "Bamtools", - summary: "Provides both a programmer's API and an end-user's toolkit for handling BAM files.", + summary: "

Provides both a programmer's API and an end-user's toolkit for handling BAM files.

", }, }, { id: "modules/bases2fastq", - data: { name: "Bases2Fastq", summary: "Demultiplexes and converts Element AVITI base calls into FASTQ files" }, + data: { + name: "Bases2Fastq", + summary: "

Demultiplexes and converts Element AVITI base calls into FASTQ files.

", + }, }, { id: "modules/bbduk", data: { name: "BBDuk", - summary: "Common data-quality-related trimming, filtering, and masking operations with a kmer based approach", + summary: + "

Common data-quality-related trimming, filtering, and masking operations with a kmer based approach.

", }, }, { id: "modules/bbmap", data: { name: "BBTools", - summary: "Pre-processing, assembly, alignment, and statistics tools for DNA/RNA sequencing reads", + summary: "

Pre-processing, assembly, alignment, and statistics tools for DNA/RNA sequencing reads.

", }, }, { id: "modules/bcftools", - data: { name: "Bcftools", summary: "Utilities for variant calling and manipulating VCFs and BCFs." }, + data: { name: "Bcftools", summary: "

Utilities for variant calling and manipulating VCFs and BCFs.

" }, }, { id: "modules/bcl2fastq", data: { name: "bcl2fastq", - summary: "Demultiplexes data and converts BCL files to FASTQ file formats for downstream analysis.", + summary: "

Demultiplexes data and converts BCL files to FASTQ file formats for downstream analysis.

", }, }, { id: "modules/bclconvert", data: { name: "BCL Convert", - summary: "Demultiplexes data and converts BCL files to FASTQ file formats for downstream analysis.", + summary: "

Demultiplexes data and converts BCL files to FASTQ file formats for downstream analysis.

", }, }, { id: "modules/biobambam2", - data: { name: "biobambam2", summary: "Tools for early stage alignment file processing" }, + data: { name: "biobambam2", summary: "

Tools for early stage alignment file processing.

" }, }, { id: "modules/biobloomtools", data: { name: "BioBloom Tools", summary: - "Assigns reads to different references using bloom filters. This is faster than alignment and can be used for contamination detection.", + "

Assigns reads to different references using bloom filters. This is faster than alignment and can be used for contamination detection.

", }, }, { id: "modules/biscuit", data: { name: "BISCUIT", - summary: "Maps bisulfite converted DNA sequence reads and determines cytosine methylation states.", + summary: "

Maps bisulfite converted DNA sequence reads and determines cytosine methylation states.

", }, }, { id: "modules/bismark", data: { name: "Bismark", - summary: "Maps bisulfite converted sequence reads and determine cytosine methylation states.", + summary: "

Maps bisulfite converted sequence reads and determine cytosine methylation states.

", }, }, - { id: "modules/bowtie1", data: { name: "Bowtie 1", summary: "Ultrafast, memory-efficient short read aligner." } }, + { + id: "modules/bowtie1", + data: { name: "Bowtie 1", summary: "

Ultrafast, memory-efficient short read aligner.

" }, + }, { id: "modules/bowtie2", data: { name: "Bowtie 2 / HiSAT2", - summary: "Results from both Bowtie 2 and HISAT2, tools for aligning reads against a reference genome.", + summary: "

Results from both Bowtie 2 and HISAT2, tools for aligning reads against a reference genome.

", }, }, - { id: "modules/busco", data: { name: "BUSCO", summary: "Assesses genome assembly and annotation completeness" } }, + { + id: "modules/busco", + data: { name: "BUSCO", summary: "

Assesses genome assembly and annotation completeness.

" }, + }, { id: "modules/bustools", data: { name: "Bustools", summary: - "Tools for BUS files - a file format for single-cell RNA-seq data designed to facilitate the development of modular workflows for data processing.", + "

Tools for BUS files - a file format for single-cell RNA-seq data designed to facilitate the development of modular workflows for data processing.

", }, }, { id: "modules/ccs", data: { name: "CCS", - summary: "PacBio tool that generates highly accurate single-molecule consensus reads (HiFi Reads)", + summary: "

PacBio tool that generates highly accurate single-molecule consensus reads (HiFi Reads).

", }, }, { id: "modules/cellranger", - data: { name: "Cell Ranger", summary: "Analyzes single cell expression or VDJ data produced by 10X Genomics." }, + data: { + name: "Cell Ranger", + summary: "

Analyzes single cell expression or VDJ data produced by 10X Genomics.

", + }, }, { id: "modules/cellranger_arc", data: { name: "Cell Ranger ARC", - summary: "Analyzes single-cell multiome ATAC and gene expression data produced by 10X Genomics.", + summary: "

Analyzes single-cell multiome ATAC and gene expression data produced by 10X Genomics.

", }, }, { id: "modules/cells2stats", data: { name: "cells2stats", - summary: "Generate output files and statistics from Element Biosciences Teton Assay", + summary: "

Generate output files and statistics from Element Biosciences Teton cytoprofiling assays.

", }, }, { id: "modules/checkm", data: { name: "CheckM", - summary: "Estimates genome completeness and contamination based on the presence or absence of marker genes.", + summary: + "

Estimates genome completeness and contamination based on the presence or absence of marker genes.

", }, }, { id: "modules/checkm2", - data: { name: "CheckM2", summary: "Assesses microbial genome quality using machine learning." }, + data: { name: "CheckM2", summary: "

Assesses microbial genome quality using machine learning.

" }, }, { id: "modules/checkqc", - data: { name: "CheckQC", summary: "Checks a set of quality criteria against an Illumina runfolder." }, + data: { name: "CheckQC", summary: "

Checks a set of quality criteria against an Illumina runfolder.

" }, }, { id: "modules/clipandmerge", - data: { name: "ClipAndMerge", summary: "Adapter clipping and read merging for ancient DNA data." }, + data: { name: "ClipAndMerge", summary: "

Adapter clipping and read merging for ancient DNA data.

" }, }, { id: "modules/clusterflow", - data: { name: "Cluster Flow", summary: "Simple and flexible bioinformatics pipeline tool." }, + data: { name: "Cluster Flow", summary: "

Simple and flexible bioinformatics pipeline tool.

" }, }, { id: "modules/conpair", - data: { name: "Conpair", summary: "Estimates concordance and contamination for tumor\u2013normal pairs" }, + data: { name: "Conpair", summary: "

Estimates concordance and contamination for tumor\u2013normal pairs.

" }, }, { id: "modules/cutadapt", data: { name: "Cutadapt", - summary: "Finds and removes adapter sequences, primers, poly-A tails, and other types of unwanted sequences.", + summary: + "

Finds and removes adapter sequences, primers, poly-A tails, and other types of unwanted sequences.

", }, }, { id: "modules/damageprofiler", - data: { name: "DamageProfiler", summary: "DNA damage pattern retrieval for ancient DNA analysis" }, + data: { name: "DamageProfiler", summary: "

DNA damage pattern retrieval for ancient DNA analysis.

" }, }, { id: "modules/dedup", - data: { name: "DeDup", summary: "Improved Duplicate Removal for merged/collapsed reads in ancient DNA analysis" }, + data: { + name: "DeDup", + summary: "

Improved Duplicate Removal for merged/collapsed reads in ancient DNA analysis.

", + }, }, { id: "modules/deeptools", - data: { name: "deepTools", summary: "Tools to process and analyze deep sequencing data." }, + data: { name: "deepTools", summary: "

Tools to process and analyze deep sequencing data.

" }, }, { id: "modules/diamond", data: { name: "DIAMOND", - summary: "Sequence aligner for protein and translated DNA searches, a drop-in replacement for the NCBI BLAST", + summary: + "

Sequence aligner for protein and translated DNA searches, a drop-in replacement for the NCBI BLAST.

", }, }, { id: "modules/disambiguate", data: { name: "Disambiguate", - summary: "Disambiguate reads aligned to two different species (e.g. human and mouse)", + summary: "

Disambiguate reads aligned to two different species (e.g. human and mouse).

", }, }, { id: "modules/dragen", data: { name: "DRAGEN", - summary: "Illumina Bio-IT Platform that uses FPGA for secondary analysis of sequencing data.", + summary: "

Illumina Bio-IT Platform that uses FPGA for secondary analysis of sequencing data.

", }, }, { id: "modules/dragen_fastqc", data: { name: "DRAGEN-FastQC", - summary: "Illumina Bio-IT Platform that uses FPGA for secondary analysis of sequencing data", + summary: "

Illumina Bio-IT Platform that uses FPGA for secondary analysis of sequencing data.

", }, }, { @@ -246,14 +271,14 @@ import MultiqcModules from "@site/src/components/MultiqcModules"; data: { name: "eigenstratdatabasetools", summary: - "Tools to compare and manipulate the contents of EingenStrat databases, and to calculate SNP coverage statistics in such databases.", + "

Tools to compare and manipulate the contents of EigenStrat databases, and to calculate SNP coverage statistics in such databases.

", }, }, { id: "modules/fastp", data: { name: "fastp", - summary: "All-in-one FASTQ preprocessor (QC, adapters, trimming, filtering, splitting...)", + summary: "

All-in-one FASTQ preprocessor (QC, adapters, trimming, filtering, splitting...).

", }, }, { @@ -261,68 +286,75 @@ import MultiqcModules from "@site/src/components/MultiqcModules"; data: { name: "FastQ Screen", summary: - "Screens a library of sequences in FastQ format against a set of sequence databases to see if the composition of the library matches with what you expect.", + "

Screens a library of sequences in FastQ format against a set of sequence databases to see if the composition of the library matches with what you expect.

", }, }, { id: "modules/fastqc", - data: { name: "FastQC", summary: "Quality control tool for high throughput sequencing data" }, + data: { name: "FastQC", summary: "

Quality control tool for high throughput sequencing data.

" }, }, { id: "modules/featurecounts", data: { name: "featureCounts", summary: - "Counts mapped reads for genomic features such as genes, exons, promoter, gene bodies, genomic bins and chromosomal locations.", + "

Counts mapped reads for genomic features such as genes, exons, promoter, gene bodies, genomic bins and chromosomal locations.

", }, }, - { id: "modules/fgbio", data: { name: "fgbio", summary: "Processing and evaluating data containing UMIs" } }, - { id: "modules/filtlong", data: { name: "Filtlong", summary: "Filters long reads by quality." } }, + { id: "modules/fgbio", data: { name: "fgbio", summary: "

Processing and evaluating data containing UMIs.

" } }, + { id: "modules/filtlong", data: { name: "Filtlong", summary: "

Filters long reads by quality.

" } }, { id: "modules/flash", - data: { name: "FLASh", summary: "Merges paired-end reads from next-generation sequencing experiments." }, + data: { name: "FLASh", summary: "

Merges paired-end reads from next-generation sequencing experiments.

" }, }, - { id: "modules/flexbar", data: { name: "Flexbar", summary: "Barcode and adapter removal tool." } }, + { id: "modules/flexbar", data: { name: "Flexbar", summary: "

Barcode and adapter removal tool.

" } }, { id: "modules/freyja", - data: { name: "Freyja", summary: "Recovers relative lineage abundances from mixed SARS-CoV-2 samples." }, + data: { name: "Freyja", summary: "

Recovers relative lineage abundances from mixed SARS-CoV-2 samples.

" }, }, { id: "modules/ganon", data: { name: "Ganon", summary: - "Metagenomics classification: quickly assigns sequence fragments to their closest reference among thousands of references via Interleaved Bloom Filters of k-mer/minimizers.", + "

Metagenomics classification: quickly assigns sequence fragments to their closest reference among thousands of references via Interleaved Bloom Filters of k-mer/minimizers.

", }, }, { id: "modules/gatk", data: { name: "GATK", - summary: "Wide variety of tools with a primary focus on variant discovery and genotyping.", + summary: "

Wide variety of tools with a primary focus on variant discovery and genotyping.

", }, }, { id: "modules/gffcompare", data: { name: "GffCompare", - summary: "Tool to compare, merge and annotate one or more GFF files with a reference annotation in GFF format.", + summary: + "

Tool to compare, merge and annotate one or more GFF files with a reference annotation in GFF format.

", }, }, - { id: "modules/glimpse", data: { name: "GLIMPSE", summary: "Low-coverage whole genome sequencing imputation" } }, + { + id: "modules/glimpse", + data: { name: "GLIMPSE", summary: "

Low-coverage whole genome sequencing imputation.

" }, + }, { id: "modules/goleft_indexcov", data: { name: "goleft indexcov", - summary: "Quickly estimate coverage from a whole-genome bam index, providing 16KB resolution", + summary: "

Quickly estimate coverage from a whole-genome bam index, providing 16KB resolution.

", }, }, - { id: "modules/gopeaks", data: { name: "GoPeaks", summary: "Calls peaks in CUT&TAG/CUT&RUN datasets." } }, + { + id: "modules/gopeaks", + data: { name: "GoPeaks", summary: "

Calls peaks in CUT&TAG/CUT&RUN datasets.

" }, + }, { id: "modules/gtdbtk", data: { name: "GTDB-Tk", - summary: "Assigns objective taxonomic classifications to bacterial and archaeal genomes.", + summary: "

Assigns objective taxonomic classifications to bacterial and archaeal genomes.

", }, }, { @@ -330,240 +362,257 @@ import MultiqcModules from "@site/src/components/MultiqcModules"; data: { name: "Haplocheck", summary: - "Detects in-sample contamination in mtDNA or WGS sequencing studies by analyzing the mitchondrial content.", + "

Detects in-sample contamination in mtDNA or WGS sequencing studies by analyzing the mitochondrial content.

", }, }, { id: "modules/happy", - data: { name: "hap.py", summary: "Benchmarks variant calls against gold standard truth datasets." }, + data: { name: "hap.py", summary: "

Benchmarks variant calls against gold standard truth datasets.

" }, }, { id: "modules/hicexplorer", - data: { name: "HiCExplorer", summary: "Hi-C analysis from processing to visualization." }, + data: { name: "HiCExplorer", summary: "

Hi-C analysis from processing to visualization.

" }, }, - { id: "modules/hicpro", data: { name: "HiC-Pro", summary: "Pipeline for Hi-C data processing" } }, - { id: "modules/hicup", data: { name: "HiCUP", summary: "Mapping and quality control on Hi-C data." } }, + { id: "modules/hicpro", data: { name: "HiC-Pro", summary: "

Pipeline for Hi-C data processing.

" } }, + { id: "modules/hicup", data: { name: "HiCUP", summary: "

Mapping and quality control on Hi-C data.

" } }, { id: "modules/hifiasm", - data: { name: "HiFiasm", summary: "Haplotype-resolved assembler for accurate Hifi reads" }, + data: { name: "HiFiasm", summary: "

Haplotype-resolved assembler for accurate Hifi reads.

" }, }, { id: "modules/hisat2", - data: { name: "HISAT2", summary: "Maps DNA or RNA reads against a genome or a population of genomes" }, + data: { name: "HISAT2", summary: "

Maps DNA or RNA reads against a genome or a population of genomes.

" }, + }, + { + id: "modules/homer", + data: { name: "HOMER", summary: "

Motif discovery and next-gen sequencing analysis.

" }, }, - { id: "modules/homer", data: { name: "HOMER", summary: "Motif discovery and next-gen sequencing analysis." } }, { id: "modules/hops", data: { name: "HOPS", - summary: "Ancient DNA characteristics screening tool of output from the metagenomic aligner MALT.", + summary: "

Ancient DNA characteristics screening tool of output from the metagenomic aligner MALT.

", }, }, { id: "modules/hostile", data: { name: "Hostile", - summary: "Removes host sequences from short and long read (meta)genomes, from paired or unpaired fastq[.gz]", + summary: + "

Removes host sequences from short and long read (meta)genomes, from paired or unpaired fastq[.gz].

", }, }, { id: "modules/htseq", data: { name: "HTSeq Count", - summary: "Part of the HTSeq package: counts reads covering specified genomic features", + summary: "

Part of the HTSeq package: counts reads covering specified genomic features.

", }, }, { id: "modules/humid", data: { name: "HUMID", - summary: "Reference-free tool to quickly remove duplicates from FastQ files, with or without UMIs.", + summary: "

Reference-free tool to quickly remove duplicates from FastQ files, with or without UMIs.

", }, }, { id: "modules/interop", - data: { name: "Illumina InterOp Statistics", summary: "Reading and writing InterOp metric files." }, + data: { name: "Illumina InterOp Statistics", summary: "

Reading and writing InterOp metric files.

" }, }, { id: "modules/isoseq", data: { name: "Iso-Seq", - summary: "Identifies transcripts in PacBio single-molecule sequencing data (HiFi reads).", + summary: "

Identifies transcripts in PacBio single-molecule sequencing data (HiFi reads).

", }, }, - { id: "modules/ivar", data: { name: "iVar", summary: "Functions for viral amplicon-based sequencing." } }, + { id: "modules/ivar", data: { name: "iVar", summary: "

Functions for viral amplicon-based sequencing.

" } }, { id: "modules/jcvi", - data: { name: "JCVI Genome Annotation", summary: "Computes statistics on genome annotation." }, + data: { name: "JCVI Genome Annotation", summary: "

Computes statistics on genome annotation.

" }, }, - { id: "modules/jellyfish", data: { name: "Jellyfish", summary: "Counting k-mers in DNA." } }, - { id: "modules/kaiju", data: { name: "Kaiju", summary: "Taxonomic classification for metagenomics." } }, + { id: "modules/jellyfish", data: { name: "Jellyfish", summary: "

Counting k-mers in DNA.

" } }, + { id: "modules/kaiju", data: { name: "Kaiju", summary: "

Taxonomic classification for metagenomics.

" } }, { id: "modules/kallisto", data: { name: "Kallisto", - summary: "Quantifies abundances of transcripts (or more generally, of target sequences) from RNA-Seq data", + summary: + "

Quantifies abundances of transcripts (or more generally, of target sequences) from RNA-Seq data.

", }, }, { id: "modules/kat", - data: { name: "K-mer Analysis Toolkit", summary: "Analyses sequencing data via its k-mer spectra." }, + data: { name: "K-mer Analysis Toolkit", summary: "

Analyses sequencing data via its k-mer spectra.

" }, }, { id: "modules/kraken", data: { name: "Kraken", summary: - "Taxonomic classification using exact k-mer matches to find the lowest common ancestor (LCA) of a given sequence.", + "

Taxonomic classification using exact k-mer matches to find the lowest common ancestor (LCA) of a given sequence.

", }, }, - { id: "modules/leehom", data: { name: "leeHom", summary: "Bayesian reconstruction of ancient DNA" } }, + { id: "modules/leehom", data: { name: "leeHom", summary: "

Bayesian reconstruction of ancient DNA.

" } }, { id: "modules/librarian", data: { name: "Librarian", - summary: "Predicts the sequencing library type from the base composition of a FastQ file.", + summary: "

Predicts the sequencing library type from the base composition of a FastQ file.

", }, }, - { id: "modules/lima", data: { name: "Lima", summary: "Demultiplex PacBio single-molecule sequencing reads." } }, + { + id: "modules/lima", + data: { name: "Lima", summary: "

Demultiplex PacBio single-molecule sequencing reads.

" }, + }, { id: "modules/longranger", data: { name: "Long Ranger", summary: - "Sample demultiplexing, barcode processing, alignment, quality control, variant calling, phasing, and structural variant calling.", + "

Sample demultiplexing, barcode processing, alignment, quality control, variant calling, phasing, and structural variant calling.

", }, }, { id: "modules/macs2", - data: { name: "MACS2", summary: "Identifies transcription factor binding sites in ChIP-seq data." }, + data: { name: "MACS2", summary: "

Identifies transcription factor binding sites in ChIP-seq data.

" }, }, { id: "modules/malt", data: { name: "MALT", summary: - "Aligns of metagenomic reads to a database of reference sequences (such as NR, GenBank or Silva) and outputs a MEGAN RMA file", + "

Aligns metagenomic reads to a database of reference sequences (such as NR, GenBank or Silva) and outputs a MEGAN RMA file.

", }, }, { id: "modules/mapdamage", - data: { name: "mapDamage", summary: "Tracks and quantifies damage patterns in ancient DNA sequences." }, + data: { name: "mapDamage", summary: "

Tracks and quantifies damage patterns in ancient DNA sequences.

" }, }, - { id: "modules/megahit", data: { name: "MEGAHIT", summary: "NGS read assembler" } }, + { id: "modules/megahit", data: { name: "MEGAHIT", summary: "

NGS read assembler.

" } }, { id: "modules/metaphlan", data: { name: "MetaPhlAn", - summary: "Profiles the composition of microbial communities from metagenomic shotgun sequencing data.", + summary: "

Profiles the composition of microbial communities from metagenomic shotgun sequencing data.

", }, }, { id: "modules/methylqa", - data: { name: "methylQA", summary: "Methylation sequencing data quality assessment tool." }, + data: { name: "methylQA", summary: "

Methylation sequencing data quality assessment tool.

" }, }, { id: "modules/mgikit", - data: { name: "mgikit", summary: "Demultiplexes FASTQ files from an MGI sequencing instrument" }, + data: { name: "mgikit", summary: "

Demultiplexes FASTQ files from an MGI sequencing instrument.

" }, }, { id: "modules/minionqc", - data: { name: "MinIONQC", summary: "Quality control for ONT (Oxford Nanopore) long reads" }, + data: { name: "MinIONQC", summary: "

Quality control for ONT (Oxford Nanopore) long reads.

" }, }, { id: "modules/mirtop", data: { name: "mirtop", - summary: "Annotates miRNAs and isomiRs and compute general statistics in mirGFF3 format.", + summary: "

Annotates miRNAs and isomiRs and computes general statistics in mirGFF3 format.

", }, }, - { id: "modules/mirtrace", data: { name: "miRTrace", summary: "Quality control for small RNA sequencing data." } }, + { + id: "modules/mirtrace", + data: { name: "miRTrace", summary: "

Quality control for small RNA sequencing data.

" }, + }, { id: "modules/mosaicatcher", data: { name: "MosaiCatcher", summary: - "Counts strand-seq reads and classifies strand states of each chromosome in each cell using a Hidden Markov Model.", + "

Counts strand-seq reads and classifies strand states of each chromosome in each cell using a Hidden Markov Model.

", }, }, { id: "modules/mosdepth", - data: { name: "Mosdepth", summary: "Fast BAM/CRAM depth calculation for WGS, exome, or targeted sequencing" }, + data: { + name: "Mosdepth", + summary: "

Fast BAM/CRAM depth calculation for WGS, exome, or targeted sequencing.

", + }, }, { id: "modules/motus", data: { name: "Motus", - summary: "Microbial profiling through marker gene (MG)-based operational taxonomic units (mOTUs).", + summary: "

Microbial profiling through marker gene (MG)-based operational taxonomic units (mOTUs).

", }, }, { id: "modules/mtnucratio", - data: { name: "mtnucratio", summary: "Computes mitochondrial to nuclear genome ratios in NGS datasets." }, + data: { name: "mtnucratio", summary: "

Computes mitochondrial to nuclear genome ratios in NGS datasets.

" }, }, { id: "modules/multivcfanalyzer", data: { name: "MultiVCFAnalyzer", summary: - "Reads multiple VCF files into combined genotype calls, produces summary statistics and downstream formats", + "

Reads multiple VCF files into combined genotype calls, produces summary statistics and downstream formats.

", }, }, { id: "modules/nanoq", - data: { name: "nanoq", summary: "Reports read quality and length from nanopore sequencing data" }, + data: { name: "nanoq", summary: "

Reports read quality and length from nanopore sequencing data.

" }, }, { id: "modules/nanostat", data: { name: "NanoStat", summary: - "Reports various statistics for long read dataset in FASTQ, BAM, or albacore sequencing summary format (supports NanoPack; NanoPlot, NanoComp).", + "

Reports various statistics for long read dataset in FASTQ, BAM, or albacore sequencing summary format (supports NanoPack; NanoPlot, NanoComp).

", }, }, { id: "modules/nextclade", data: { name: "Nextclade", - summary: "Viral genome alignment, clade assignment, mutation calling, and quality checks", + summary: "

Viral genome alignment, clade assignment, mutation calling, and quality checks.

", }, }, - { id: "modules/ngsbits", data: { name: "ngs-bits", summary: "Calculating statistics from FASTQ, BAM, and VCF" } }, + { + id: "modules/ngsbits", + data: { name: "ngs-bits", summary: "

Calculating statistics from FASTQ, BAM, and VCF.

" }, + }, { id: "modules/ngsderive", data: { name: "ngsderive", - summary: "Forensic tool for by backwards computing library information in sequencing data", + summary: "

Forensic tool for backwards-computing library information in sequencing data.

", }, }, { id: "modules/nonpareil", - data: { name: "Nonpareil", summary: "Estimates metagenomic coverage and sequence diversity " }, + data: { name: "Nonpareil", summary: "

Estimates metagenomic coverage and sequence diversity.

" }, }, { id: "modules/odgi", data: { name: "ODGI", - summary: "Analysis and manipulation of pangenome graphs structured in the variation graph model.", + summary: "

Analysis and manipulation of pangenome graphs structured in the variation graph model.

", }, }, { id: "modules/optitype", - data: { name: "OptiType", summary: "Precision HLA typing from next-generation sequencing data." }, + data: { name: "OptiType", summary: "

Precision HLA typing from next-generation sequencing data.

" }, }, { id: "modules/pairtools", data: { name: "pairtools", summary: - "Toolkit for Chromatin Conformation Capture experiments. Handles short-reads paired reference alignments, extracts 3C-specific information, and perform common tasks such as sorting, filtering, and deduplication.", + "

Toolkit for Chromatin Conformation Capture experiments. Handles short-read paired reference alignments, extracts 3C-specific information, and performs common tasks such as sorting, filtering, and deduplication.

", }, }, { id: "modules/pangolin", data: { name: "Pangolin", - summary: "Uses variant calls to assign SARS-CoV-2 genome sequences to global lineages.", + summary: "

Uses variant calls to assign SARS-CoV-2 genome sequences to global lineages.

", }, }, { @@ -571,21 +620,22 @@ import MultiqcModules from "@site/src/components/MultiqcModules"; data: { name: "pbmarkdup", summary: - "Takes one or multiple sequencing chips of an amplified libray as HiFi reads and marks or removes duplicates.", + "

Takes one or multiple sequencing chips of an amplified library as HiFi reads and marks or removes duplicates.

", }, }, { id: "modules/peddy", data: { name: "Peddy", - summary: "Compares familial-relationships and sexes as reported in a PED file with those inferred from a VCF.", + summary: + "

Compares familial-relationships and sexes as reported in a PED file with those inferred from a VCF.

", }, }, { id: "modules/percolator", data: { name: "Percolator", - summary: "Semi-supervised learning for peptide identification from shotgun proteomics datasets.", + summary: "

Semi-supervised learning for peptide identification from shotgun proteomics datasets.

", }, }, { @@ -593,23 +643,23 @@ import MultiqcModules from "@site/src/components/MultiqcModules"; data: { name: "phantompeakqualtools", summary: - "Computes informative enrichment and quality measures for ChIP-seq/DNase-seq/FAIRE-seq/MNase-seq data.", + "

Computes informative enrichment and quality measures for ChIP-seq/DNase-seq/FAIRE-seq/MNase-seq data.

", }, }, { id: "modules/picard", - data: { name: "Picard", summary: "Tools for manipulating high-throughput sequencing data." }, + data: { name: "Picard", summary: "

Tools for manipulating high-throughput sequencing data.

" }, }, { id: "modules/porechop", - data: { name: "Porechop", summary: "Finds and removes adapters from Oxford Nanopore reads." }, + data: { name: "Porechop", summary: "

Finds and removes adapters from Oxford Nanopore reads.

" }, }, { id: "modules/preseq", data: { name: "Preseq", summary: - "Estimates library complexity, showing how many additional unique reads are sequenced for increasing total read count.", + "

Estimates library complexity, showing how many additional unique reads are sequenced for increasing total read count.

", }, }, { @@ -617,223 +667,266 @@ import MultiqcModules from "@site/src/components/MultiqcModules"; data: { name: "PRINSEQ++", summary: - "C++ implementation of the prinseq-lite.pl program. Filters, reformats, and trims genomic and metagenomic reads.", + "

C++ implementation of the prinseq-lite.pl program. Filters, reformats, and trims genomic and metagenomic reads.

", }, }, - { id: "modules/prokka", data: { name: "Prokka", summary: "Rapid annotation of prokaryotic genomes." } }, + { id: "modules/prokka", data: { name: "Prokka", summary: "

Rapid annotation of prokaryotic genomes.

" } }, { id: "modules/purple", - data: { name: "PURPLE", summary: "A purity, ploidy and copy number estimator for whole genome tumor data" }, + data: { + name: "PURPLE", + summary: "

A purity, ploidy and copy number estimator for whole genome tumor data.

", + }, }, { id: "modules/pychopper", data: { name: "Pychopper", - summary: "Identifies, orients, trims and rescues full length Nanopore cDNA reads. Can also rescue fused reads.", + summary: + "

Identifies, orients, trims and rescues full length Nanopore cDNA reads. Can also rescue fused reads.

", }, }, { id: "modules/pycoqc", data: { name: "pycoQC", - summary: "Computes metrics and generates interactive QC plots for Oxford Nanopore technologies sequencing data", + summary: + "

Computes metrics and generates interactive QC plots for Oxford Nanopore technologies sequencing data.

", }, }, { id: "modules/qc3C", - data: { name: "qc3C", summary: "Reference-free and BAM based quality control for Hi-C data" }, + data: { name: "qc3C", summary: "

Reference-free and BAM based quality control for Hi-C data.

" }, }, { id: "modules/qorts", - data: { name: "QoRTs", summary: "Toolkit for analysis, QC, and data management of RNA-Seq datasets." }, + data: { name: "QoRTs", summary: "

Toolkit for analysis, QC, and data management of RNA-Seq datasets.

" }, }, { id: "modules/qualimap", - data: { name: "QualiMap", summary: "Quality control of alignment data and its derivatives like feature counts." }, + data: { + name: "QualiMap", + summary: "

Quality control of alignment data and its derivatives like feature counts.

", + }, }, - { id: "modules/quast", data: { name: "QUAST", summary: "Quality assessment tool for genome assemblies" } }, + { id: "modules/quast", data: { name: "QUAST", summary: "

Quality assessment tool for genome assemblies.

" } }, { id: "modules/rna_seqc", - data: { name: "RNA-SeQC", summary: "RNA-Seq metrics for quality control and process optimization" }, + data: { name: "RNA-SeQC", summary: "

RNA-Seq metrics for quality control and process optimization.

" }, }, { id: "modules/rockhopper", data: { name: "Rockhopper", - summary: "Bacterial RNA-seq analysis: align reads to coding sequences, rRNAs, tRNAs, and miscellaneous RNAs", + summary: + "

Bacterial RNA-seq analysis: aligns reads to coding sequences, rRNAs, tRNAs, and miscellaneous RNAs.

", }, }, { id: "modules/rsem", - data: { name: "RSEM", summary: "Estimates gene and isoform expression levels from RNA-Seq data." }, + data: { name: "RSEM", summary: "

Estimates gene and isoform expression levels from RNA-Seq data.

" }, }, - { id: "modules/rseqc", data: { name: "RSeQC", summary: "Evaluates high throughput RNA-seq data." } }, + { id: "modules/rseqc", data: { name: "RSeQC", summary: "

Evaluates high throughput RNA-seq data.

" } }, { id: "modules/salmon", - data: { name: "Salmon", summary: "Quantifies expression of transcripts using RNA-seq data." }, + data: { name: "Salmon", summary: "

Quantifies expression of transcripts using RNA-seq data.

" }, + }, + { + id: "modules/sambamba", + data: { name: "Sambamba", summary: "

Toolkit for interacting with BAM/CRAM files.

" }, }, - { id: "modules/sambamba", data: { name: "Sambamba", summary: "Toolkit for interacting with BAM/CRAM files." } }, { id: "modules/samblaster", - data: { name: "Samblaster", summary: "Marks duplicates and extracts discordant and split reads from sam files." }, + data: { + name: "Samblaster", + summary: "

Marks duplicates and extracts discordant and split reads from sam files.

", + }, + }, + { + id: "modules/samtools", + data: { name: "Samtools", summary: "

Toolkit for interacting with BAM/CRAM files.

" }, }, - { id: "modules/samtools", data: { name: "Samtools", summary: "Toolkit for interacting with BAM/CRAM files." } }, { id: "modules/sargasso", data: { name: "Sargasso", - summary: "Separates mixed-species RNA-seq reads according to their species of origin.", + summary: "

Separates mixed-species RNA-seq reads according to their species of origin.

", }, }, { id: "modules/seqera_cli", - data: { name: "Seqera Platform CLI", summary: "Reports statistics generated by the Seqera Platform CLI." }, + data: { name: "Seqera Platform CLI", summary: "

Reports statistics generated by the Seqera Platform CLI.

" }, }, + { id: "modules/seqfu", data: { name: "Seqfu", summary: "

Manipulate FASTA/FASTQ files.

" } }, { id: "modules/sequali", - data: { name: "Sequali", summary: "Sequencing quality control for both long-read and short-read data" }, + data: { name: "Sequali", summary: "

Sequencing quality control for both long-read and short-read data.

" }, }, { id: "modules/seqwho", data: { name: "SeqWho", summary: - "Determines FASTQ(A) sequencing file source protocol and the species of origin, to check that the composition of the library is expected.", + "

Determines FASTQ(A) sequencing file source protocol and the species of origin, to check that the composition of the library is expected.

", }, }, { id: "modules/seqyclean", - data: { name: "SeqyClean", summary: "Filters adapters, vectors, and contaminants while quality trimming." }, + data: { + name: "SeqyClean", + summary: "

Filters adapters, vectors, and contaminants while quality trimming.

", + }, }, { id: "modules/sexdeterrmine", data: { name: "SexDetErrmine", summary: - "Calculates relative coverage of X and Y chromosomes and their associated error bars from the depth of coverage at specified SNPs.", + "

Calculates relative coverage of X and Y chromosomes and their associated error bars from the depth of coverage at specified SNPs.

", }, }, { id: "modules/sickle", - data: { name: "Sickle", summary: "A windowed adaptive trimming tool for FASTQ files using quality." }, + data: { name: "Sickle", summary: "

A windowed adaptive trimming tool for FASTQ files using quality.

" }, + }, + { + id: "modules/skewer", + data: { name: "Skewer", summary: "

Adapter trimming tool for NGS paired-end sequences.

" }, }, - { id: "modules/skewer", data: { name: "Skewer", summary: "Adapter trimming tool for NGS paired-end sequences." } }, - { id: "modules/slamdunk", data: { name: "Slamdunk", summary: "Tool to analyze SLAM-Seq data." } }, + { id: "modules/slamdunk", data: { name: "Slamdunk", summary: "

Tool to analyze SLAM-Seq data.

" } }, { id: "modules/snippy", - data: { name: "Snippy", summary: "Rapid haploid variant calling and core genome alignment." }, + data: { name: "Snippy", summary: "

Rapid haploid variant calling and core genome alignment.

" }, }, { id: "modules/snpeff", data: { name: "SnpEff", - summary: "Annotates and predicts the effects of variants on genes (such as amino acid changes). ", + summary: "

Annotates and predicts the effects of variants on genes (such as amino acid changes).

", }, }, { id: "modules/snpsplit", data: { name: "SNPsplit", - summary: "Allele-specific alignment sorter. Determines allelic origin of reads that cover known SNP positions", + summary: + "

Allele-specific alignment sorter. Determines allelic origin of reads that cover known SNP positions.

", }, }, { id: "modules/somalier", data: { name: "Somalier", - summary: "Genotype to pedigree correspondence checks from sketches derived from BAM/CRAM or VCF", + summary: "

Genotype to pedigree correspondence checks from sketches derived from BAM/CRAM or VCF.

", }, }, { id: "modules/sortmerna", data: { name: "SortMeRNA", - summary: "Program for filtering, mapping and OTU-picking NGS reads in metatranscriptomic and metagenomic data.", + summary: + "

Program for filtering, mapping and OTU-picking NGS reads in metatranscriptomic and metagenomic data.

", }, }, { id: "modules/sourmash", data: { name: "Sourmash", - summary: "Quickly searches, compares, and analyzes genomic and metagenomic data sets.", + summary: "

Quickly searches, compares, and analyzes genomic and metagenomic data sets.

", }, }, { id: "modules/spaceranger", - data: { name: "Space Ranger", summary: "Tool to analyze 10x Genomics spatial transcriptomics data." }, + data: { name: "Space Ranger", summary: "

Tool to analyze 10x Genomics spatial transcriptomics data.

" }, }, { id: "modules/stacks", - data: { name: "Stacks", summary: "Analyzes restriction enzyme-based data (e.g. RAD-seq)." }, + data: { name: "Stacks", summary: "

Analyzes restriction enzyme-based data (e.g. RAD-seq).

" }, }, - { id: "modules/star", data: { name: "STAR", summary: "Universal RNA-seq aligner." } }, + { id: "modules/star", data: { name: "STAR", summary: "

Universal RNA-seq aligner.

" } }, { id: "modules/supernova", - data: { name: "Supernova", summary: "De novo genome assembler of 10X Genomics linked-reads." }, + data: { name: "Supernova", summary: "

De novo genome assembler of 10X Genomics linked-reads.

" }, }, { id: "modules/telseq", - data: { name: "telseq", summary: "Estimates telomere length from whole genome sequencing data (BAMs)." }, + data: { name: "telseq", summary: "

Estimates telomere length from whole genome sequencing data (BAMs).

" }, }, { id: "modules/theta2", - data: { name: "THetA2", summary: "Estimates tumour purity and clonal / subclonal copy number." }, + data: { name: "THetA2", summary: "

Estimates tumour purity and clonal / subclonal copy number.

" }, }, { id: "modules/tophat", - data: { name: "Tophat", summary: "Splice junction RNA-Seq reads mapper for mammalian-sized genomes." }, + data: { name: "Tophat", summary: "

Splice junction RNA-Seq reads mapper for mammalian-sized genomes.

" }, + }, + { + id: "modules/trimmomatic", + data: { name: "Trimmomatic", summary: "

Read trimming tool for Illumina NGS data.

" }, }, - { id: "modules/trimmomatic", data: { name: "Trimmomatic", summary: "Read trimming tool for Illumina NGS data." } }, { id: "modules/truvari", - data: { name: "Truvari", summary: "Benchmarking, merging, and annotating structural variants" }, + data: { name: "Truvari", summary: "

Benchmarking, merging, and annotating structural variants.

" }, }, { id: "modules/umicollapse", data: { name: "UMICollapse", - summary: "Algorithms for efficiently collapsing reads with Unique Molecular Identifiers", + summary: "

Algorithms for efficiently collapsing reads with Unique Molecular Identifiers.

", }, }, { id: "modules/umitools", data: { name: "UMI-tools", - summary: "Tools for dealing with Unique Molecular Identifiers (UMIs)/(RMTs) and scRNA-Seq barcodes.", + summary: "

Tools for dealing with Unique Molecular Identifiers (UMIs)/(RMTs) and scRNA-Seq barcodes.

", }, }, { id: "modules/varscan2", - data: { name: "VarScan2", summary: "Variant detection in massively parallel sequencing data" }, + data: { name: "VarScan2", summary: "

Variant detection in massively parallel sequencing data.

" }, + }, + { + id: "modules/vcftools", + data: { name: "VCFTools", summary: "

Program to analyse and reporting on VCF files.

" }, }, - { id: "modules/vcftools", data: { name: "VCFTools", summary: "Program to analyse and reporting on VCF files." } }, { id: "modules/vep", data: { name: "VEP", summary: - "Determines the effect of variants on genes, transcripts and protein sequences, as well as regulatory regions.", + "

Determines the effect of variants on genes, transcripts and protein sequences, as well as regulatory regions.

", }, }, { id: "modules/verifybamid", - data: { name: "VerifyBAMID", summary: "Detects sample contamination and/or sample swaps." }, + data: { name: "VerifyBAMID", summary: "

Detects sample contamination and/or sample swaps.

" }, }, { id: "modules/vg", - data: { name: "VG", summary: "Toolkit to manipulate and analyze graphical genomes, including read alignment" }, + data: { + name: "VG", + summary: "

Toolkit to manipulate and analyze graphical genomes, including read alignment.

", + }, }, { id: "modules/whatshap", data: { name: "WhatsHap", - summary: "Phasing genomic variants using DNA reads (aka read-based phasing, or haplotype assembly)", + summary: "

Phasing genomic variants using DNA reads (aka read-based phasing, or haplotype assembly).

", }, }, { id: "modules/xengsort", - data: { name: "Xengsort", summary: "Fast xenograft read sorter based on space-efficient k-mer hashing" }, + data: { name: "Xengsort", summary: "

Fast xenograft read sorter based on space-efficient k-mer hashing.

" }, + }, + { + id: "modules/xenium", + data: { + name: "Xenium", + summary: "

Spatial transcriptomics platform from 10x Genomics that provides subcellular resolution.

", + }, }, - { id: "modules/xenome", data: { name: "Xenome", summary: "Classifies reads from xenograft sources." } }, + { id: "modules/xenome", data: { name: "Xenome", summary: "

Classifies reads from xenograft sources.

" } }, ]} /> diff --git a/docs/markdown/modules/adapterremoval.md b/docs/markdown/modules/adapterremoval.md index ea9a99cfbe..8ca22ca30d 100644 --- a/docs/markdown/modules/adapterremoval.md +++ b/docs/markdown/modules/adapterremoval.md @@ -2,7 +2,7 @@ title: Adapter Removal displayed_sidebar: multiqcSidebar description: > - Removes adapter sequences, trims low quality bases from 3' ends, or merges overlapping pairs into consensus +

Removes adapter sequences, trims low quality bases from 3' ends, or merges overlapping pairs into consensus.

--- :::note -Removes adapter sequences, trims low quality bases from 3' ends, or merges overlapping pairs into consensus + +

Removes adapter sequences, trims low quality bases from 3' ends, or merges overlapping pairs into consensus.

[https://github.com/mikkelschubert/adapterremoval](https://github.com/mikkelschubert/adapterremoval) ::: diff --git a/docs/markdown/modules/afterqc.md b/docs/markdown/modules/afterqc.md index 80f6aad85c..2b87a3d3a5 100644 --- a/docs/markdown/modules/afterqc.md +++ b/docs/markdown/modules/afterqc.md @@ -2,7 +2,7 @@ title: AfterQC displayed_sidebar: multiqcSidebar description: > - Automatic filtering, trimming, error removing, and quality control for FastQ data. +

Automatic filtering, trimming, error removing, and quality control for FastQ data.

--- :::note -Automatic filtering, trimming, error removing, and quality control for FastQ data. + +

Automatic filtering, trimming, error removing, and quality control for FastQ data.

[https://github.com/OpenGene/AfterQC](https://github.com/OpenGene/AfterQC) ::: diff --git a/docs/markdown/modules/anglerfish.md b/docs/markdown/modules/anglerfish.md index 2d9fecb0a4..aa1b144a16 100644 --- a/docs/markdown/modules/anglerfish.md +++ b/docs/markdown/modules/anglerfish.md @@ -2,7 +2,7 @@ title: Anglerfish displayed_sidebar: multiqcSidebar description: > - Quality controls Illumina libraries sequenced on Oxford Nanopore flowcells +

Quality controls Illumina libraries sequenced on Oxford Nanopore flowcells.

--- :::note -Quality controls Illumina libraries sequenced on Oxford Nanopore flowcells + +

Quality controls Illumina libraries sequenced on Oxford Nanopore flowcells.

[https://github.com/remiolsen/anglerfish](https://github.com/remiolsen/anglerfish) ::: diff --git a/docs/markdown/modules/ataqv.md b/docs/markdown/modules/ataqv.md index f9258a06d3..8f42d994f9 100644 --- a/docs/markdown/modules/ataqv.md +++ b/docs/markdown/modules/ataqv.md @@ -2,7 +2,7 @@ title: ATAQV displayed_sidebar: multiqcSidebar description: > - Toolkit for quality control and visualization of ATAC-seq data +

Toolkit for quality control and visualization of ATAC-seq data.

--- :::note -Toolkit for quality control and visualization of ATAC-seq data + +

Toolkit for quality control and visualization of ATAC-seq data.

[https://github.com/ParkerLab/ataqv/](https://github.com/ParkerLab/ataqv/) ::: diff --git a/docs/markdown/modules/bakta.md b/docs/markdown/modules/bakta.md index 67ef1aa1a2..029c2d9b19 100644 --- a/docs/markdown/modules/bakta.md +++ b/docs/markdown/modules/bakta.md @@ -2,7 +2,7 @@ title: Bakta displayed_sidebar: multiqcSidebar description: > - Rapid & standardized annotation of bacterial genomes, MAGs & plasmids +

Rapid & standardized annotation of bacterial genomes, MAGs & plasmids.

--- :::note -Rapid & standardized annotation of bacterial genomes, MAGs & plasmids + +

Rapid & standardized annotation of bacterial genomes, MAGs & plasmids.

[https://github.com/oschwengers/bakta](https://github.com/oschwengers/bakta) ::: diff --git a/docs/markdown/modules/bamdst.md b/docs/markdown/modules/bamdst.md index 1b13e86f58..ed6bd6d00f 100644 --- a/docs/markdown/modules/bamdst.md +++ b/docs/markdown/modules/bamdst.md @@ -2,7 +2,7 @@ title: Bamdst displayed_sidebar: multiqcSidebar description: > - Lightweight tool to stat the depth coverage of target regions of BAM file(s) +

Lightweight tool to stat the depth coverage of target regions of BAM file(s).

--- :::note -Lightweight tool to stat the depth coverage of target regions of BAM file(s) + +

Lightweight tool to stat the depth coverage of target regions of BAM file(s).

[https://https://github.com/shiquan/bamdst](https://https://github.com/shiquan/bamdst) ::: diff --git a/docs/markdown/modules/bamtools.md b/docs/markdown/modules/bamtools.md index 2b0ce82f49..c3bd383ce2 100644 --- a/docs/markdown/modules/bamtools.md +++ b/docs/markdown/modules/bamtools.md @@ -2,7 +2,7 @@ title: Bamtools displayed_sidebar: multiqcSidebar description: > - Provides both a programmer's API and an end-user's toolkit for handling BAM files. +

Provides both a programmer's API and an end-user's toolkit for handling BAM files.

--- :::note -Provides both a programmer's API and an end-user's toolkit for handling BAM files. + +

Provides both a programmer's API and an end-user's toolkit for handling BAM files.

[https://github.com/pezmaster31/bamtools](https://github.com/pezmaster31/bamtools) ::: diff --git a/docs/markdown/modules/bases2fastq.md b/docs/markdown/modules/bases2fastq.md index 2ce4857320..cb84eb1bcd 100644 --- a/docs/markdown/modules/bases2fastq.md +++ b/docs/markdown/modules/bases2fastq.md @@ -2,7 +2,7 @@ title: Bases2Fastq displayed_sidebar: multiqcSidebar description: > - Demultiplexes and converts Element AVITI base calls into FASTQ files +

Demultiplexes and converts Element AVITI base calls into FASTQ files.

--- :::note -Demultiplexes and converts Element AVITI base calls into FASTQ files + +

Demultiplexes and converts Element AVITI base calls into FASTQ files.

[https://docs.elembio.io/docs/bases2fastq/introduction/](https://docs.elembio.io/docs/bases2fastq/introduction/) ::: diff --git a/docs/markdown/modules/bbduk.md b/docs/markdown/modules/bbduk.md index de3f1b2e25..7f3e6fa1b6 100644 --- a/docs/markdown/modules/bbduk.md +++ b/docs/markdown/modules/bbduk.md @@ -2,7 +2,7 @@ title: BBDuk displayed_sidebar: multiqcSidebar description: > - Common data-quality-related trimming, filtering, and masking operations with a kmer based approach +

Common data-quality-related trimming, filtering, and masking operations with a kmer based approach.

--- :::note -Common data-quality-related trimming, filtering, and masking operations with a kmer based approach + +

Common data-quality-related trimming, filtering, and masking operations with a kmer based approach.

[https://jgi.doe.gov/data-and-tools/software-tools/bbtools/bb-tools-user-guide/bbduk-guide/](https://jgi.doe.gov/data-and-tools/software-tools/bbtools/bb-tools-user-guide/bbduk-guide/) ::: diff --git a/docs/markdown/modules/bbmap.md b/docs/markdown/modules/bbmap.md index 88692fe725..f5a9191ea0 100644 --- a/docs/markdown/modules/bbmap.md +++ b/docs/markdown/modules/bbmap.md @@ -2,7 +2,7 @@ title: BBTools displayed_sidebar: multiqcSidebar description: > - Pre-processing, assembly, alignment, and statistics tools for DNA/RNA sequencing reads +

Pre-processing, assembly, alignment, and statistics tools for DNA/RNA sequencing reads.

--- :::note -Pre-processing, assembly, alignment, and statistics tools for DNA/RNA sequencing reads + +

Pre-processing, assembly, alignment, and statistics tools for DNA/RNA sequencing reads.

[http://jgi.doe.gov/data-and-tools/bbtools/](http://jgi.doe.gov/data-and-tools/bbtools/) ::: diff --git a/docs/markdown/modules/bcftools.md b/docs/markdown/modules/bcftools.md index 02b6def85a..e9cfc3585a 100644 --- a/docs/markdown/modules/bcftools.md +++ b/docs/markdown/modules/bcftools.md @@ -2,7 +2,7 @@ title: Bcftools displayed_sidebar: multiqcSidebar description: > - Utilities for variant calling and manipulating VCFs and BCFs. +

Utilities for variant calling and manipulating VCFs and BCFs.

--- :::note -Utilities for variant calling and manipulating VCFs and BCFs. + +

Utilities for variant calling and manipulating VCFs and BCFs.

[https://samtools.github.io/bcftools/](https://samtools.github.io/bcftools/) ::: diff --git a/docs/markdown/modules/bcl2fastq.md b/docs/markdown/modules/bcl2fastq.md index cb1d13fc48..a4abf44c06 100644 --- a/docs/markdown/modules/bcl2fastq.md +++ b/docs/markdown/modules/bcl2fastq.md @@ -2,7 +2,7 @@ title: bcl2fastq displayed_sidebar: multiqcSidebar description: > - Demultiplexes data and converts BCL files to FASTQ file formats for downstream analysis. +

Demultiplexes data and converts BCL files to FASTQ file formats for downstream analysis.

--- :::note -Demultiplexes data and converts BCL files to FASTQ file formats for downstream analysis. + +

Demultiplexes data and converts BCL files to FASTQ file formats for downstream analysis.

[https://support.illumina.com/sequencing/sequencing_software/bcl2fastq-conversion-software.html](https://support.illumina.com/sequencing/sequencing_software/bcl2fastq-conversion-software.html) ::: diff --git a/docs/markdown/modules/bclconvert.md b/docs/markdown/modules/bclconvert.md index 4a28b68461..6a8ab6f9ad 100644 --- a/docs/markdown/modules/bclconvert.md +++ b/docs/markdown/modules/bclconvert.md @@ -2,7 +2,7 @@ title: BCL Convert displayed_sidebar: multiqcSidebar description: > - Demultiplexes data and converts BCL files to FASTQ file formats for downstream analysis. +

Demultiplexes data and converts BCL files to FASTQ file formats for downstream analysis.

--- :::note -Demultiplexes data and converts BCL files to FASTQ file formats for downstream analysis. + +

Demultiplexes data and converts BCL files to FASTQ file formats for downstream analysis.

[https://support.illumina.com/sequencing/sequencing_software/bcl-convert.html](https://support.illumina.com/sequencing/sequencing_software/bcl-convert.html) ::: diff --git a/docs/markdown/modules/biobambam2.md b/docs/markdown/modules/biobambam2.md index e166cd6504..6566ff1b5f 100644 --- a/docs/markdown/modules/biobambam2.md +++ b/docs/markdown/modules/biobambam2.md @@ -2,7 +2,7 @@ title: biobambam2 displayed_sidebar: multiqcSidebar description: > - Tools for early stage alignment file processing +

Tools for early stage alignment file processing.

--- :::note -Tools for early stage alignment file processing + +

Tools for early stage alignment file processing.

[https://gitlab.com/german.tischler/biobambam2](https://gitlab.com/german.tischler/biobambam2) ::: diff --git a/docs/markdown/modules/biobloomtools.md b/docs/markdown/modules/biobloomtools.md index 693eb5621d..1b0417ffc0 100644 --- a/docs/markdown/modules/biobloomtools.md +++ b/docs/markdown/modules/biobloomtools.md @@ -2,7 +2,7 @@ title: BioBloom Tools displayed_sidebar: multiqcSidebar description: > - Assigns reads to different references using bloom filters. This is faster than alignment and can be used for contamination detection. +

Assigns reads to different references using bloom filters. This is faster than alignment and can be used for contamination detection.

--- :::note -Assigns reads to different references using bloom filters. This is faster than alignment and can be used for contamination detection. + +

Assigns reads to different references using bloom filters. This is faster than alignment and can be used for contamination detection.

[https://github.com/bcgsc/biobloom/](https://github.com/bcgsc/biobloom/) ::: diff --git a/docs/markdown/modules/biscuit.md b/docs/markdown/modules/biscuit.md index 982eb8a48c..9ab48cc1cc 100644 --- a/docs/markdown/modules/biscuit.md +++ b/docs/markdown/modules/biscuit.md @@ -2,7 +2,7 @@ title: BISCUIT displayed_sidebar: multiqcSidebar description: > - Maps bisulfite converted DNA sequence reads and determines cytosine methylation states. +

Maps bisulfite converted DNA sequence reads and determines cytosine methylation states.

--- :::note -Maps bisulfite converted DNA sequence reads and determines cytosine methylation states. + +

Maps bisulfite converted DNA sequence reads and determines cytosine methylation states.

[https://github.com/huishenlab/biscuit](https://github.com/huishenlab/biscuit) ::: diff --git a/docs/markdown/modules/bismark.md b/docs/markdown/modules/bismark.md index 2ff651508a..49c94cab92 100644 --- a/docs/markdown/modules/bismark.md +++ b/docs/markdown/modules/bismark.md @@ -2,7 +2,7 @@ title: Bismark displayed_sidebar: multiqcSidebar description: > - Maps bisulfite converted sequence reads and determine cytosine methylation states. +

Maps bisulfite converted sequence reads and determine cytosine methylation states.

--- :::note -Maps bisulfite converted sequence reads and determine cytosine methylation states. + +

Maps bisulfite converted sequence reads and determine cytosine methylation states.

[http://www.bioinformatics.babraham.ac.uk/projects/bismark/](http://www.bioinformatics.babraham.ac.uk/projects/bismark/) ::: diff --git a/docs/markdown/modules/bowtie1.md b/docs/markdown/modules/bowtie1.md index f647d8c65c..3dd954ecfe 100644 --- a/docs/markdown/modules/bowtie1.md +++ b/docs/markdown/modules/bowtie1.md @@ -2,7 +2,7 @@ title: Bowtie 1 displayed_sidebar: multiqcSidebar description: > - Ultrafast, memory-efficient short read aligner. +

Ultrafast, memory-efficient short read aligner.

--- :::note -Ultrafast, memory-efficient short read aligner. + +

Ultrafast, memory-efficient short read aligner.

[http://bowtie-bio.sourceforge.net/](http://bowtie-bio.sourceforge.net/) ::: diff --git a/docs/markdown/modules/bowtie2.md b/docs/markdown/modules/bowtie2.md index fe59a80cf9..91ec29b9ef 100644 --- a/docs/markdown/modules/bowtie2.md +++ b/docs/markdown/modules/bowtie2.md @@ -2,7 +2,7 @@ title: Bowtie 2 / HiSAT2 displayed_sidebar: multiqcSidebar description: > - Results from both Bowtie 2 and HISAT2, tools for aligning reads against a reference genome. +

Results from both Bowtie 2 and HISAT2, tools for aligning reads against a reference genome.

--- :::note -Results from both Bowtie 2 and HISAT2, tools for aligning reads against a reference genome. + +

Results from both Bowtie 2 and HISAT2, tools for aligning reads against a reference genome.

[http://bowtie-bio.sourceforge.net/bowtie2/](http://bowtie-bio.sourceforge.net/bowtie2/), [https://ccb.jhu.edu/software/hisat2/](https://ccb.jhu.edu/software/hisat2/) ::: diff --git a/docs/markdown/modules/busco.md b/docs/markdown/modules/busco.md index 5d974e1a58..d0c886244b 100644 --- a/docs/markdown/modules/busco.md +++ b/docs/markdown/modules/busco.md @@ -2,7 +2,7 @@ title: BUSCO displayed_sidebar: multiqcSidebar description: > - Assesses genome assembly and annotation completeness +

Assesses genome assembly and annotation completeness.

--- :::note -Assesses genome assembly and annotation completeness + +

Assesses genome assembly and annotation completeness.

[http://busco.ezlab.org/](http://busco.ezlab.org/) ::: diff --git a/docs/markdown/modules/bustools.md b/docs/markdown/modules/bustools.md index 66adfdf245..66e8f84fee 100644 --- a/docs/markdown/modules/bustools.md +++ b/docs/markdown/modules/bustools.md @@ -2,7 +2,7 @@ title: Bustools displayed_sidebar: multiqcSidebar description: > - Tools for BUS files - a file format for single-cell RNA-seq data designed to facilitate the development of modular workflows for data processing. +

Tools for BUS files - a file format for single-cell RNA-seq data designed to facilitate the development of modular workflows for data processing.

--- :::note -Tools for BUS files - a file format for single-cell RNA-seq data designed to facilitate the development of modular workflows for data processing. + +

Tools for BUS files - a file format for single-cell RNA-seq data designed to facilitate the development of modular workflows for data processing.

[https://bustools.github.io/](https://bustools.github.io/) ::: diff --git a/docs/markdown/modules/ccs.md b/docs/markdown/modules/ccs.md index d70d64b41b..738bdd1538 100644 --- a/docs/markdown/modules/ccs.md +++ b/docs/markdown/modules/ccs.md @@ -2,7 +2,7 @@ title: CCS displayed_sidebar: multiqcSidebar description: > - PacBio tool that generates highly accurate single-molecule consensus reads (HiFi Reads) +

PacBio tool that generates highly accurate single-molecule consensus reads (HiFi Reads).

--- :::note -PacBio tool that generates highly accurate single-molecule consensus reads (HiFi Reads) + +

PacBio tool that generates highly accurate single-molecule consensus reads (HiFi Reads).

[https://github.com/PacificBiosciences/ccs](https://github.com/PacificBiosciences/ccs) ::: diff --git a/docs/markdown/modules/cellranger.md b/docs/markdown/modules/cellranger.md index d6ee1704a9..4101fb0a21 100644 --- a/docs/markdown/modules/cellranger.md +++ b/docs/markdown/modules/cellranger.md @@ -2,7 +2,7 @@ title: Cell Ranger displayed_sidebar: multiqcSidebar description: > - Analyzes single cell expression or VDJ data produced by 10X Genomics. +

Analyzes single cell expression or VDJ data produced by 10X Genomics.

--- :::note -Analyzes single cell expression or VDJ data produced by 10X Genomics. + +

Analyzes single cell expression or VDJ data produced by 10X Genomics.

[https://support.10xgenomics.com/single-cell-gene-expression/software/pipelines/latest/what-is-cell-ranger](https://support.10xgenomics.com/single-cell-gene-expression/software/pipelines/latest/what-is-cell-ranger) ::: diff --git a/docs/markdown/modules/cellranger_arc.md b/docs/markdown/modules/cellranger_arc.md index 6ae267c0b4..a092c25957 100644 --- a/docs/markdown/modules/cellranger_arc.md +++ b/docs/markdown/modules/cellranger_arc.md @@ -2,7 +2,7 @@ title: Cell Ranger ARC displayed_sidebar: multiqcSidebar description: > - Analyzes single-cell multiome ATAC and gene expression data produced by 10X Genomics. +

Analyzes single-cell multiome ATAC and gene expression data produced by 10X Genomics.

--- :::note -Analyzes single-cell multiome ATAC and gene expression data produced by 10X Genomics. + +

Analyzes single-cell multiome ATAC and gene expression data produced by 10X Genomics.

[https://www.10xgenomics.com/support/software/cell-ranger-arc/latest](https://www.10xgenomics.com/support/software/cell-ranger-arc/latest) ::: diff --git a/docs/markdown/modules/cells2stats.md b/docs/markdown/modules/cells2stats.md index 626f05c30c..51b6d7584e 100644 --- a/docs/markdown/modules/cells2stats.md +++ b/docs/markdown/modules/cells2stats.md @@ -2,7 +2,7 @@ title: cells2stats displayed_sidebar: multiqcSidebar description: > - Generate output files and statistics from Element Biosciences Teton Assay +

Generate output files and statistics from Element Biosciences Teton cytoprofiling assays.

--- :::note -Generate output files and statistics from Element Biosciences Teton Assay + +

Generate output files and statistics from Element Biosciences Teton cytoprofiling assays.

[https://docs.elembio.io/docs/cells2stats/introduction/](https://docs.elembio.io/docs/cells2stats/introduction/) ::: @@ -24,7 +25,7 @@ Generate output files and statistics from Element Biosciences Teton Assay ```yaml cells2stats/run: - contents: DemuxStats + contents: '"AnalysisID": "c2s.' fn: RunStats.json num_lines: 100 ``` diff --git a/docs/markdown/modules/checkm.md b/docs/markdown/modules/checkm.md index 8a5acc0403..2f66b4c438 100644 --- a/docs/markdown/modules/checkm.md +++ b/docs/markdown/modules/checkm.md @@ -2,7 +2,7 @@ title: CheckM displayed_sidebar: multiqcSidebar description: > - Estimates genome completeness and contamination based on the presence or absence of marker genes. +

Estimates genome completeness and contamination based on the presence or absence of marker genes.

--- :::note -Estimates genome completeness and contamination based on the presence or absence of marker genes. + +

Estimates genome completeness and contamination based on the presence or absence of marker genes.

[https://github.com/Ecogenomics/CheckM](https://github.com/Ecogenomics/CheckM) ::: diff --git a/docs/markdown/modules/checkm2.md b/docs/markdown/modules/checkm2.md index 2f2c440d79..c17abd1e31 100644 --- a/docs/markdown/modules/checkm2.md +++ b/docs/markdown/modules/checkm2.md @@ -2,7 +2,7 @@ title: CheckM2 displayed_sidebar: multiqcSidebar description: > - Assesses microbial genome quality using machine learning. +

Assesses microbial genome quality using machine learning.

--- :::note -Assesses microbial genome quality using machine learning. + +

Assesses microbial genome quality using machine learning.

[https://github.com/chklovski/CheckM2](https://github.com/chklovski/CheckM2) ::: diff --git a/docs/markdown/modules/checkqc.md b/docs/markdown/modules/checkqc.md index e5136c0a5e..d1a6d1d129 100644 --- a/docs/markdown/modules/checkqc.md +++ b/docs/markdown/modules/checkqc.md @@ -2,7 +2,7 @@ title: CheckQC displayed_sidebar: multiqcSidebar description: > - Checks a set of quality criteria against an Illumina runfolder. +

Checks a set of quality criteria against an Illumina runfolder.

--- :::note -Checks a set of quality criteria against an Illumina runfolder. + +

Checks a set of quality criteria against an Illumina runfolder.

[https://github.com/Molmed/checkQC](https://github.com/Molmed/checkQC) ::: diff --git a/docs/markdown/modules/clipandmerge.md b/docs/markdown/modules/clipandmerge.md index 895513d3d9..59e3ca6665 100644 --- a/docs/markdown/modules/clipandmerge.md +++ b/docs/markdown/modules/clipandmerge.md @@ -2,7 +2,7 @@ title: ClipAndMerge displayed_sidebar: multiqcSidebar description: > - Adapter clipping and read merging for ancient DNA data. +

Adapter clipping and read merging for ancient DNA data.

--- :::note -Adapter clipping and read merging for ancient DNA data. + +

Adapter clipping and read merging for ancient DNA data.

[http://www.github.com/apeltzer/ClipAndMerge](http://www.github.com/apeltzer/ClipAndMerge) ::: diff --git a/docs/markdown/modules/clusterflow.md b/docs/markdown/modules/clusterflow.md index c8d8799b16..821399e84f 100644 --- a/docs/markdown/modules/clusterflow.md +++ b/docs/markdown/modules/clusterflow.md @@ -2,7 +2,7 @@ title: Cluster Flow displayed_sidebar: multiqcSidebar description: > - Simple and flexible bioinformatics pipeline tool. +

Simple and flexible bioinformatics pipeline tool.

--- :::note -Simple and flexible bioinformatics pipeline tool. + +

Simple and flexible bioinformatics pipeline tool.

[http://clusterflow.io](http://clusterflow.io) ::: diff --git a/docs/markdown/modules/conpair.md b/docs/markdown/modules/conpair.md index b60d193e5c..3e0b08af5f 100644 --- a/docs/markdown/modules/conpair.md +++ b/docs/markdown/modules/conpair.md @@ -2,7 +2,7 @@ title: Conpair displayed_sidebar: multiqcSidebar description: > - Estimates concordance and contamination for tumor–normal pairs +

Estimates concordance and contamination for tumor–normal pairs.

--- :::note -Estimates concordance and contamination for tumor–normal pairs + +

Estimates concordance and contamination for tumor–normal pairs.

[https://github.com/nygenome/Conpair](https://github.com/nygenome/Conpair) ::: diff --git a/docs/markdown/modules/cutadapt.md b/docs/markdown/modules/cutadapt.md index ac6ade102e..50d72ecb63 100644 --- a/docs/markdown/modules/cutadapt.md +++ b/docs/markdown/modules/cutadapt.md @@ -2,7 +2,7 @@ title: Cutadapt displayed_sidebar: multiqcSidebar description: > - Finds and removes adapter sequences, primers, poly-A tails, and other types of unwanted sequences. +

Finds and removes adapter sequences, primers, poly-A tails, and other types of unwanted sequences.

--- :::note -Finds and removes adapter sequences, primers, poly-A tails, and other types of unwanted sequences. + +

Finds and removes adapter sequences, primers, poly-A tails, and other types of unwanted sequences.

[https://cutadapt.readthedocs.io/](https://cutadapt.readthedocs.io/) ::: diff --git a/docs/markdown/modules/damageprofiler.md b/docs/markdown/modules/damageprofiler.md index a0aa3ff49f..299cf632c8 100644 --- a/docs/markdown/modules/damageprofiler.md +++ b/docs/markdown/modules/damageprofiler.md @@ -2,7 +2,7 @@ title: DamageProfiler displayed_sidebar: multiqcSidebar description: > - DNA damage pattern retrieval for ancient DNA analysis +

DNA damage pattern retrieval for ancient DNA analysis.

--- :::note -DNA damage pattern retrieval for ancient DNA analysis + +

DNA damage pattern retrieval for ancient DNA analysis.

[https://github.com/Integrative-Transcriptomics/DamageProfiler](https://github.com/Integrative-Transcriptomics/DamageProfiler) ::: diff --git a/docs/markdown/modules/dedup.md b/docs/markdown/modules/dedup.md index 6e95565c52..55ce3d750f 100644 --- a/docs/markdown/modules/dedup.md +++ b/docs/markdown/modules/dedup.md @@ -2,7 +2,7 @@ title: DeDup displayed_sidebar: multiqcSidebar description: > - Improved Duplicate Removal for merged/collapsed reads in ancient DNA analysis +

Improved Duplicate Removal for merged/collapsed reads in ancient DNA analysis.

--- :::note -Improved Duplicate Removal for merged/collapsed reads in ancient DNA analysis + +

Improved Duplicate Removal for merged/collapsed reads in ancient DNA analysis.

[http://www.github.com/apeltzer/DeDup](http://www.github.com/apeltzer/DeDup) ::: diff --git a/docs/markdown/modules/deeptools.md b/docs/markdown/modules/deeptools.md index 3dd8d2f83b..4c26a2dbac 100644 --- a/docs/markdown/modules/deeptools.md +++ b/docs/markdown/modules/deeptools.md @@ -2,7 +2,7 @@ title: deepTools displayed_sidebar: multiqcSidebar description: > - Tools to process and analyze deep sequencing data. +

Tools to process and analyze deep sequencing data.

--- :::note -Tools to process and analyze deep sequencing data. + +

Tools to process and analyze deep sequencing data.

[http://deeptools.readthedocs.io](http://deeptools.readthedocs.io) ::: diff --git a/docs/markdown/modules/diamond.md b/docs/markdown/modules/diamond.md index 7d96a606d0..619a17b5ae 100644 --- a/docs/markdown/modules/diamond.md +++ b/docs/markdown/modules/diamond.md @@ -2,7 +2,7 @@ title: DIAMOND displayed_sidebar: multiqcSidebar description: > - Sequence aligner for protein and translated DNA searches, a drop-in replacement for the NCBI BLAST +

Sequence aligner for protein and translated DNA searches, a drop-in replacement for the NCBI BLAST.

--- :::note -Sequence aligner for protein and translated DNA searches, a drop-in replacement for the NCBI BLAST + +

Sequence aligner for protein and translated DNA searches, a drop-in replacement for the NCBI BLAST.

[https://github.com/bbuchfink/diamond](https://github.com/bbuchfink/diamond) ::: diff --git a/docs/markdown/modules/disambiguate.md b/docs/markdown/modules/disambiguate.md index c49ee3c028..97f35da750 100644 --- a/docs/markdown/modules/disambiguate.md +++ b/docs/markdown/modules/disambiguate.md @@ -2,7 +2,7 @@ title: Disambiguate displayed_sidebar: multiqcSidebar description: > - Disambiguate reads aligned to two different species (e.g. human and mouse) +

Disambiguate reads aligned to two different species (e.g. human and mouse).

--- :::note -Disambiguate reads aligned to two different species (e.g. human and mouse) + +

Disambiguate reads aligned to two different species (e.g. human and mouse).

[https://github.com/AstraZeneca-NGS/disambiguate](https://github.com/AstraZeneca-NGS/disambiguate) ::: diff --git a/docs/markdown/modules/dragen.md b/docs/markdown/modules/dragen.md index 94521dba2d..95339b72ca 100644 --- a/docs/markdown/modules/dragen.md +++ b/docs/markdown/modules/dragen.md @@ -2,7 +2,7 @@ title: DRAGEN displayed_sidebar: multiqcSidebar description: > - Illumina Bio-IT Platform that uses FPGA for secondary analysis of sequencing data. +

Illumina Bio-IT Platform that uses FPGA for secondary analysis of sequencing data.

--- :::note -Illumina Bio-IT Platform that uses FPGA for secondary analysis of sequencing data. + +

Illumina Bio-IT Platform that uses FPGA for secondary analysis of sequencing data.

[https://www.illumina.com/products/by-type/informatics-products/dragen-bio-it-platform.html](https://www.illumina.com/products/by-type/informatics-products/dragen-bio-it-platform.html) ::: diff --git a/docs/markdown/modules/dragen_fastqc.md b/docs/markdown/modules/dragen_fastqc.md index 0796d75383..060ac30a6e 100644 --- a/docs/markdown/modules/dragen_fastqc.md +++ b/docs/markdown/modules/dragen_fastqc.md @@ -2,7 +2,7 @@ title: DRAGEN-FastQC displayed_sidebar: multiqcSidebar description: > - Illumina Bio-IT Platform that uses FPGA for secondary analysis of sequencing data +

Illumina Bio-IT Platform that uses FPGA for secondary analysis of sequencing data.

--- :::note -Illumina Bio-IT Platform that uses FPGA for secondary analysis of sequencing data + +

Illumina Bio-IT Platform that uses FPGA for secondary analysis of sequencing data.

[https://www.illumina.com/products/by-type/informatics-products/dragen-bio-it-platform.html](https://www.illumina.com/products/by-type/informatics-products/dragen-bio-it-platform.html) ::: diff --git a/docs/markdown/modules/eigenstratdatabasetools.md b/docs/markdown/modules/eigenstratdatabasetools.md index e7b9d8f8e5..cdab1ce7c5 100644 --- a/docs/markdown/modules/eigenstratdatabasetools.md +++ b/docs/markdown/modules/eigenstratdatabasetools.md @@ -2,7 +2,7 @@ title: eigenstratdatabasetools displayed_sidebar: multiqcSidebar description: > - Tools to compare and manipulate the contents of EingenStrat databases, and to calculate SNP coverage statistics in such databases. +

Tools to compare and manipulate the contents of EingenStrat databases, and to calculate SNP coverage statistics in such databases.

--- :::note -Tools to compare and manipulate the contents of EingenStrat databases, and to calculate SNP coverage statistics in such databases. + +

Tools to compare and manipulate the contents of EingenStrat databases, and to calculate SNP coverage statistics in such databases.

[https://github.com/TCLamnidis/EigenStratDatabaseTools](https://github.com/TCLamnidis/EigenStratDatabaseTools) ::: diff --git a/docs/markdown/modules/fastp.md b/docs/markdown/modules/fastp.md index 488e5323ce..df92c3fe5f 100644 --- a/docs/markdown/modules/fastp.md +++ b/docs/markdown/modules/fastp.md @@ -2,7 +2,7 @@ title: fastp displayed_sidebar: multiqcSidebar description: > - All-in-one FASTQ preprocessor (QC, adapters, trimming, filtering, splitting...) +

All-in-one FASTQ preprocessor (QC, adapters, trimming, filtering, splitting...).

--- :::note -All-in-one FASTQ preprocessor (QC, adapters, trimming, filtering, splitting...) + +

All-in-one FASTQ preprocessor (QC, adapters, trimming, filtering, splitting...).

[https://github.com/OpenGene/fastp](https://github.com/OpenGene/fastp) ::: diff --git a/docs/markdown/modules/fastq_screen.md b/docs/markdown/modules/fastq_screen.md index d32717b733..298e17e912 100644 --- a/docs/markdown/modules/fastq_screen.md +++ b/docs/markdown/modules/fastq_screen.md @@ -2,7 +2,7 @@ title: FastQ Screen displayed_sidebar: multiqcSidebar description: > - Screens a library of sequences in FastQ format against a set of sequence databases to see if the composition of the library matches with what you expect. +

Screens a library of sequences in FastQ format against a set of sequence databases to see if the composition of the library matches with what you expect.

--- :::note -Screens a library of sequences in FastQ format against a set of sequence databases to see if the composition of the library matches with what you expect. + +

Screens a library of sequences in FastQ format against a set of sequence databases to see if the composition of the library matches with what you expect.

[http://www.bioinformatics.babraham.ac.uk/projects/fastq_screen/](http://www.bioinformatics.babraham.ac.uk/projects/fastq_screen/) ::: diff --git a/docs/markdown/modules/fastqc.md b/docs/markdown/modules/fastqc.md index 94f410653a..15e816ee3b 100644 --- a/docs/markdown/modules/fastqc.md +++ b/docs/markdown/modules/fastqc.md @@ -2,7 +2,7 @@ title: FastQC displayed_sidebar: multiqcSidebar description: > - Quality control tool for high throughput sequencing data +

Quality control tool for high throughput sequencing data.

--- :::note -Quality control tool for high throughput sequencing data + +

Quality control tool for high throughput sequencing data.

[http://www.bioinformatics.babraham.ac.uk/projects/fastqc/](http://www.bioinformatics.babraham.ac.uk/projects/fastqc/) ::: @@ -50,7 +51,7 @@ that they will share a sample name with data that has already been parsed. ::: You can customise the patterns used for finding these files in your -MultiQC config (see [Module search patterns](#module-search-patterns)). +MultiQC config (see [Module search patterns](https://docs.seqera.io/multiqc/getting_started/config#module-search-patterns)). The below code shows the default file patterns: ```yaml diff --git a/docs/markdown/modules/featurecounts.md b/docs/markdown/modules/featurecounts.md index 41eda71122..f3a92001af 100644 --- a/docs/markdown/modules/featurecounts.md +++ b/docs/markdown/modules/featurecounts.md @@ -2,7 +2,7 @@ title: featureCounts displayed_sidebar: multiqcSidebar description: > - Counts mapped reads for genomic features such as genes, exons, promoter, gene bodies, genomic bins and chromosomal locations. +

Counts mapped reads for genomic features such as genes, exons, promoter, gene bodies, genomic bins and chromosomal locations.

--- :::note -Counts mapped reads for genomic features such as genes, exons, promoter, gene bodies, genomic bins and chromosomal locations. + +

Counts mapped reads for genomic features such as genes, exons, promoter, gene bodies, genomic bins and chromosomal locations.

[http://subread.sourceforge.net/](http://subread.sourceforge.net/) ::: @@ -23,7 +24,7 @@ Counts mapped reads for genomic features such as genes, exons, promoter, gene bo As of MultiQC v1.10, the module should also work with output from [Rsubread](https://bioconductor.org/packages/release/bioc/html/Rsubread.html). Note that your filenames must end in `.summary` to be discovered. -See [Module search patterns](#module-search-patterns) for how to customise this. +See [Module search patterns](https://docs.seqera.io/multiqc/getting_started/config#module-search-patterns) for how to customise this. Please note that if files are in "Rsubread mode" then lines will be split by any whitespace, instead of tab characters. As such, filenames with spaces in will diff --git a/docs/markdown/modules/fgbio.md b/docs/markdown/modules/fgbio.md index 2aa0fbf86a..424b3066e2 100644 --- a/docs/markdown/modules/fgbio.md +++ b/docs/markdown/modules/fgbio.md @@ -2,7 +2,7 @@ title: fgbio displayed_sidebar: multiqcSidebar description: > - Processing and evaluating data containing UMIs +

Processing and evaluating data containing UMIs.

--- :::note -Processing and evaluating data containing UMIs + +

Processing and evaluating data containing UMIs.

[http://fulcrumgenomics.github.io/fgbio/](http://fulcrumgenomics.github.io/fgbio/) ::: diff --git a/docs/markdown/modules/filtlong.md b/docs/markdown/modules/filtlong.md index e0909cbaa1..a6cb8ad8f1 100644 --- a/docs/markdown/modules/filtlong.md +++ b/docs/markdown/modules/filtlong.md @@ -2,7 +2,7 @@ title: Filtlong displayed_sidebar: multiqcSidebar description: > - Filters long reads by quality. +

Filters long reads by quality.

--- :::note -Filters long reads by quality. + +

Filters long reads by quality.

[https://github.com/rrwick/Filtlong](https://github.com/rrwick/Filtlong) ::: diff --git a/docs/markdown/modules/flash.md b/docs/markdown/modules/flash.md index 57950a465f..e9c2618ea4 100644 --- a/docs/markdown/modules/flash.md +++ b/docs/markdown/modules/flash.md @@ -2,7 +2,7 @@ title: FLASh displayed_sidebar: multiqcSidebar description: > - Merges paired-end reads from next-generation sequencing experiments. +

Merges paired-end reads from next-generation sequencing experiments.

--- :::note -Merges paired-end reads from next-generation sequencing experiments. + +

Merges paired-end reads from next-generation sequencing experiments.

[https://ccb.jhu.edu/software/FLASH/](https://ccb.jhu.edu/software/FLASH/) ::: diff --git a/docs/markdown/modules/flexbar.md b/docs/markdown/modules/flexbar.md index d1d6cbf51c..81e5e3b1dc 100644 --- a/docs/markdown/modules/flexbar.md +++ b/docs/markdown/modules/flexbar.md @@ -2,7 +2,7 @@ title: Flexbar displayed_sidebar: multiqcSidebar description: > - Barcode and adapter removal tool. +

Barcode and adapter removal tool.

--- :::note -Barcode and adapter removal tool. + +

Barcode and adapter removal tool.

[https://github.com/seqan/flexbar](https://github.com/seqan/flexbar) ::: diff --git a/docs/markdown/modules/freyja.md b/docs/markdown/modules/freyja.md index f0013d6bfd..0870eb7445 100644 --- a/docs/markdown/modules/freyja.md +++ b/docs/markdown/modules/freyja.md @@ -2,7 +2,7 @@ title: Freyja displayed_sidebar: multiqcSidebar description: > - Recovers relative lineage abundances from mixed SARS-CoV-2 samples. +

Recovers relative lineage abundances from mixed SARS-CoV-2 samples.

--- :::note -Recovers relative lineage abundances from mixed SARS-CoV-2 samples. + +

Recovers relative lineage abundances from mixed SARS-CoV-2 samples.

[https://github.com/andersen-lab/Freyja](https://github.com/andersen-lab/Freyja) ::: diff --git a/docs/markdown/modules/ganon.md b/docs/markdown/modules/ganon.md index 53b1df86eb..7710f25f5f 100644 --- a/docs/markdown/modules/ganon.md +++ b/docs/markdown/modules/ganon.md @@ -2,7 +2,7 @@ title: Ganon displayed_sidebar: multiqcSidebar description: > - Metagenomics classification: quickly assigns sequence fragments to their closest reference among thousands of references via Interleaved Bloom Filters of k-mer/minimizers. +

Metagenomics classification: quickly assigns sequence fragments to their closest reference among thousands of references via Interleaved Bloom Filters of k-mer/minimizers.

--- :::note -Metagenomics classification: quickly assigns sequence fragments to their closest reference among thousands of references via Interleaved Bloom Filters of k-mer/minimizers. + +

Metagenomics classification: quickly assigns sequence fragments to their closest reference among thousands of references via Interleaved Bloom Filters of k-mer/minimizers.

[https://pirovc.github.io/ganon/](https://pirovc.github.io/ganon/) ::: diff --git a/docs/markdown/modules/gatk.md b/docs/markdown/modules/gatk.md index c152c248ef..82c6de1883 100644 --- a/docs/markdown/modules/gatk.md +++ b/docs/markdown/modules/gatk.md @@ -2,7 +2,7 @@ title: GATK displayed_sidebar: multiqcSidebar description: > - Wide variety of tools with a primary focus on variant discovery and genotyping. +

Wide variety of tools with a primary focus on variant discovery and genotyping.

--- :::note -Wide variety of tools with a primary focus on variant discovery and genotyping. + +

Wide variety of tools with a primary focus on variant discovery and genotyping.

[https://www.broadinstitute.org/gatk/](https://www.broadinstitute.org/gatk/) ::: diff --git a/docs/markdown/modules/gffcompare.md b/docs/markdown/modules/gffcompare.md index cb0016f0c4..1a3c50222d 100644 --- a/docs/markdown/modules/gffcompare.md +++ b/docs/markdown/modules/gffcompare.md @@ -2,7 +2,7 @@ title: GffCompare displayed_sidebar: multiqcSidebar description: > - Tool to compare, merge and annotate one or more GFF files with a reference annotation in GFF format. +

Tool to compare, merge and annotate one or more GFF files with a reference annotation in GFF format.

--- :::note -Tool to compare, merge and annotate one or more GFF files with a reference annotation in GFF format. + +

Tool to compare, merge and annotate one or more GFF files with a reference annotation in GFF format.

[https://ccb.jhu.edu/software/stringtie/gffcompare.shtml](https://ccb.jhu.edu/software/stringtie/gffcompare.shtml) ::: diff --git a/docs/markdown/modules/glimpse.md b/docs/markdown/modules/glimpse.md index fd27d8ad16..85da87a00b 100644 --- a/docs/markdown/modules/glimpse.md +++ b/docs/markdown/modules/glimpse.md @@ -2,7 +2,7 @@ title: GLIMPSE displayed_sidebar: multiqcSidebar description: > - Low-coverage whole genome sequencing imputation +

Low-coverage whole genome sequencing imputation.

--- :::note -Low-coverage whole genome sequencing imputation + +

Low-coverage whole genome sequencing imputation.

[https://odelaneau.github.io/GLIMPSE/](https://odelaneau.github.io/GLIMPSE/) ::: diff --git a/docs/markdown/modules/goleft_indexcov.md b/docs/markdown/modules/goleft_indexcov.md index 8c286ce453..c907ce7354 100644 --- a/docs/markdown/modules/goleft_indexcov.md +++ b/docs/markdown/modules/goleft_indexcov.md @@ -2,7 +2,7 @@ title: goleft indexcov displayed_sidebar: multiqcSidebar description: > - Quickly estimate coverage from a whole-genome bam index, providing 16KB resolution +

Quickly estimate coverage from a whole-genome bam index, providing 16KB resolution.

--- :::note -Quickly estimate coverage from a whole-genome bam index, providing 16KB resolution + +

Quickly estimate coverage from a whole-genome bam index, providing 16KB resolution.

[https://github.com/brentp/goleft/tree/master/indexcov](https://github.com/brentp/goleft/tree/master/indexcov) ::: diff --git a/docs/markdown/modules/gopeaks.md b/docs/markdown/modules/gopeaks.md index c39f901ab5..abb6b0710c 100644 --- a/docs/markdown/modules/gopeaks.md +++ b/docs/markdown/modules/gopeaks.md @@ -2,7 +2,7 @@ title: GoPeaks displayed_sidebar: multiqcSidebar description: > - Calls peaks in CUT&TAG/CUT&RUN datasets. +

Calls peaks in CUT&TAG/CUT&RUN datasets.

--- :::note -Calls peaks in CUT&TAG/CUT&RUN datasets. + +

Calls peaks in CUT&TAG/CUT&RUN datasets.

[https://github.com/maxsonBraunLab/gopeaks](https://github.com/maxsonBraunLab/gopeaks) ::: diff --git a/docs/markdown/modules/gtdbtk.md b/docs/markdown/modules/gtdbtk.md index e859a474b7..f8018cafc2 100644 --- a/docs/markdown/modules/gtdbtk.md +++ b/docs/markdown/modules/gtdbtk.md @@ -2,7 +2,7 @@ title: GTDB-Tk displayed_sidebar: multiqcSidebar description: > - Assigns objective taxonomic classifications to bacterial and archaeal genomes. +

Assigns objective taxonomic classifications to bacterial and archaeal genomes.

--- :::note -Assigns objective taxonomic classifications to bacterial and archaeal genomes. + +

Assigns objective taxonomic classifications to bacterial and archaeal genomes.

[https://ecogenomics.github.io/GTDBTk/index.html](https://ecogenomics.github.io/GTDBTk/index.html) ::: diff --git a/docs/markdown/modules/haplocheck.md b/docs/markdown/modules/haplocheck.md index b6c6c33868..cc0b4b99c1 100644 --- a/docs/markdown/modules/haplocheck.md +++ b/docs/markdown/modules/haplocheck.md @@ -2,7 +2,7 @@ title: Haplocheck displayed_sidebar: multiqcSidebar description: > - Detects in-sample contamination in mtDNA or WGS sequencing studies by analyzing the mitchondrial content. +

Detects in-sample contamination in mtDNA or WGS sequencing studies by analyzing the mitochondrial content.

--- :::note -Detects in-sample contamination in mtDNA or WGS sequencing studies by analyzing the mitchondrial content. + +

Detects in-sample contamination in mtDNA or WGS sequencing studies by analyzing the mitochondrial content.

[https://github.com/genepi/haplocheck/](https://github.com/genepi/haplocheck/) ::: diff --git a/docs/markdown/modules/happy.md b/docs/markdown/modules/happy.md index a8bc1bdfd9..9499e27ce2 100644 --- a/docs/markdown/modules/happy.md +++ b/docs/markdown/modules/happy.md @@ -2,7 +2,7 @@ title: hap.py displayed_sidebar: multiqcSidebar description: > - Benchmarks variant calls against gold standard truth datasets. +

Benchmarks variant calls against gold standard truth datasets.

--- :::note -Benchmarks variant calls against gold standard truth datasets. + +

Benchmarks variant calls against gold standard truth datasets.

[https://github.com/Illumina/hap.py](https://github.com/Illumina/hap.py) ::: diff --git a/docs/markdown/modules/hicexplorer.md b/docs/markdown/modules/hicexplorer.md index 3fbba02b16..d131970d81 100644 --- a/docs/markdown/modules/hicexplorer.md +++ b/docs/markdown/modules/hicexplorer.md @@ -2,7 +2,7 @@ title: HiCExplorer displayed_sidebar: multiqcSidebar description: > - Hi-C analysis from processing to visualization. +

Hi-C analysis from processing to visualization.

--- :::note -Hi-C analysis from processing to visualization. + +

Hi-C analysis from processing to visualization.

[https://hicexplorer.readthedocs.io](https://hicexplorer.readthedocs.io) ::: diff --git a/docs/markdown/modules/hicpro.md b/docs/markdown/modules/hicpro.md index 4b5d95ed37..72e575b27e 100644 --- a/docs/markdown/modules/hicpro.md +++ b/docs/markdown/modules/hicpro.md @@ -2,7 +2,7 @@ title: HiC-Pro displayed_sidebar: multiqcSidebar description: > - Pipeline for Hi-C data processing +

Pipeline for Hi-C data processing.

--- :::note -Pipeline for Hi-C data processing + +

Pipeline for Hi-C data processing.

[https://github.com/nservant/HiC-Pro](https://github.com/nservant/HiC-Pro) ::: diff --git a/docs/markdown/modules/hicup.md b/docs/markdown/modules/hicup.md index 09fcc6d51f..a8fac015b5 100644 --- a/docs/markdown/modules/hicup.md +++ b/docs/markdown/modules/hicup.md @@ -2,7 +2,7 @@ title: HiCUP displayed_sidebar: multiqcSidebar description: > - Mapping and quality control on Hi-C data. +

Mapping and quality control on Hi-C data.

--- :::note -Mapping and quality control on Hi-C data. + +

Mapping and quality control on Hi-C data.

[http://www.bioinformatics.babraham.ac.uk/projects/hicup/](http://www.bioinformatics.babraham.ac.uk/projects/hicup/) ::: diff --git a/docs/markdown/modules/hifiasm.md b/docs/markdown/modules/hifiasm.md index d9bb2caa3a..6ce06ab465 100644 --- a/docs/markdown/modules/hifiasm.md +++ b/docs/markdown/modules/hifiasm.md @@ -2,7 +2,7 @@ title: HiFiasm displayed_sidebar: multiqcSidebar description: > - Haplotype-resolved assembler for accurate Hifi reads +

Haplotype-resolved assembler for accurate HiFi reads.

--- :::note -Haplotype-resolved assembler for accurate Hifi reads + +

Haplotype-resolved assembler for accurate HiFi reads.

[https://github.com/chhylp123/hifiasm](https://github.com/chhylp123/hifiasm) ::: diff --git a/docs/markdown/modules/hisat2.md b/docs/markdown/modules/hisat2.md index 0180730e46..bce0b5ce21 100644 --- a/docs/markdown/modules/hisat2.md +++ b/docs/markdown/modules/hisat2.md @@ -2,7 +2,7 @@ title: HISAT2 displayed_sidebar: multiqcSidebar description: > - Maps DNA or RNA reads against a genome or a population of genomes +

Maps DNA or RNA reads against a genome or a population of genomes.

--- :::note -Maps DNA or RNA reads against a genome or a population of genomes + +

Maps DNA or RNA reads against a genome or a population of genomes.

[https://ccb.jhu.edu/software/hisat2/](https://ccb.jhu.edu/software/hisat2/) ::: diff --git a/docs/markdown/modules/homer.md b/docs/markdown/modules/homer.md index 283b9c5629..700c58e473 100644 --- a/docs/markdown/modules/homer.md +++ b/docs/markdown/modules/homer.md @@ -2,7 +2,7 @@ title: HOMER displayed_sidebar: multiqcSidebar description: > - Motif discovery and next-gen sequencing analysis. +

Motif discovery and next-gen sequencing analysis.

--- :::note -Motif discovery and next-gen sequencing analysis. + +

Motif discovery and next-gen sequencing analysis.

[http://homer.ucsd.edu/homer/](http://homer.ucsd.edu/homer/) ::: diff --git a/docs/markdown/modules/hops.md b/docs/markdown/modules/hops.md index 13a184a05a..777951cd74 100644 --- a/docs/markdown/modules/hops.md +++ b/docs/markdown/modules/hops.md @@ -2,7 +2,7 @@ title: HOPS displayed_sidebar: multiqcSidebar description: > - Ancient DNA characteristics screening tool of output from the metagenomic aligner MALT. +

Ancient DNA characteristics screening tool of output from the metagenomic aligner MALT.

--- :::note -Ancient DNA characteristics screening tool of output from the metagenomic aligner MALT. + +

Ancient DNA characteristics screening tool of output from the metagenomic aligner MALT.

[https://github.com/rhuebler/HOPS/](https://github.com/rhuebler/HOPS/) ::: diff --git a/docs/markdown/modules/hostile.md b/docs/markdown/modules/hostile.md index 38cd75da81..d981d28a66 100644 --- a/docs/markdown/modules/hostile.md +++ b/docs/markdown/modules/hostile.md @@ -2,7 +2,7 @@ title: Hostile displayed_sidebar: multiqcSidebar description: > - Removes host sequences from short and long read (meta)genomes, from paired or unpaired fastq[.gz] +

Removes host sequences from short and long read (meta)genomes, from paired or unpaired fastq[.gz].

--- :::note -Removes host sequences from short and long read (meta)genomes, from paired or unpaired fastq[.gz] + +

Removes host sequences from short and long read (meta)genomes, from paired or unpaired fastq[.gz].

[https://github.com/bede/hostile](https://github.com/bede/hostile) ::: diff --git a/docs/markdown/modules/htseq.md b/docs/markdown/modules/htseq.md index a6bb684382..e559c8dc96 100644 --- a/docs/markdown/modules/htseq.md +++ b/docs/markdown/modules/htseq.md @@ -2,7 +2,7 @@ title: HTSeq Count displayed_sidebar: multiqcSidebar description: > - Part of the HTSeq package: counts reads covering specified genomic features +

Part of the HTSeq package: counts reads covering specified genomic features.

--- :::note -Part of the HTSeq package: counts reads covering specified genomic features + +

Part of the HTSeq package: counts reads covering specified genomic features.

[https://htseq.readthedocs.io/en/master/htseqcount.html](https://htseq.readthedocs.io/en/master/htseqcount.html) ::: diff --git a/docs/markdown/modules/humid.md b/docs/markdown/modules/humid.md index a5cfdcb9ed..601e7aa249 100644 --- a/docs/markdown/modules/humid.md +++ b/docs/markdown/modules/humid.md @@ -2,7 +2,7 @@ title: HUMID displayed_sidebar: multiqcSidebar description: > - Reference-free tool to quickly remove duplicates from FastQ files, with or without UMIs. +

Reference-free tool to quickly remove duplicates from FastQ files, with or without UMIs.

--- :::note -Reference-free tool to quickly remove duplicates from FastQ files, with or without UMIs. + +

Reference-free tool to quickly remove duplicates from FastQ files, with or without UMIs.

[https://github.com/jfjlaros/HUMID](https://github.com/jfjlaros/HUMID) ::: diff --git a/docs/markdown/modules/interop.md b/docs/markdown/modules/interop.md index 0d491b128e..b25b21cf70 100644 --- a/docs/markdown/modules/interop.md +++ b/docs/markdown/modules/interop.md @@ -2,7 +2,7 @@ title: Illumina InterOp Statistics displayed_sidebar: multiqcSidebar description: > - Reading and writing InterOp metric files. +

Reading and writing InterOp metric files.

--- :::note -Reading and writing InterOp metric files. + +

Reading and writing InterOp metric files.

[http://illumina.github.io/interop/index.html](http://illumina.github.io/interop/index.html) ::: diff --git a/docs/markdown/modules/isoseq.md b/docs/markdown/modules/isoseq.md index b8aae3a348..37049ed636 100644 --- a/docs/markdown/modules/isoseq.md +++ b/docs/markdown/modules/isoseq.md @@ -2,7 +2,7 @@ title: Iso-Seq displayed_sidebar: multiqcSidebar description: > - Identifies transcripts in PacBio single-molecule sequencing data (HiFi reads). +

Identifies transcripts in PacBio single-molecule sequencing data (HiFi reads).

--- :::note -Identifies transcripts in PacBio single-molecule sequencing data (HiFi reads). + +

Identifies transcripts in PacBio single-molecule sequencing data (HiFi reads).

[https://github.com/PacificBiosciences/IsoSeq](https://github.com/PacificBiosciences/IsoSeq) ::: diff --git a/docs/markdown/modules/ivar.md b/docs/markdown/modules/ivar.md index acdd083f51..705977aa53 100644 --- a/docs/markdown/modules/ivar.md +++ b/docs/markdown/modules/ivar.md @@ -2,7 +2,7 @@ title: iVar displayed_sidebar: multiqcSidebar description: > - Functions for viral amplicon-based sequencing. +

Functions for viral amplicon-based sequencing.

--- :::note -Functions for viral amplicon-based sequencing. + +

Functions for viral amplicon-based sequencing.

[https://github.com/andersen-lab/ivar](https://github.com/andersen-lab/ivar) ::: diff --git a/docs/markdown/modules/jcvi.md b/docs/markdown/modules/jcvi.md index 0bb501bfe1..71c1c0e435 100644 --- a/docs/markdown/modules/jcvi.md +++ b/docs/markdown/modules/jcvi.md @@ -2,7 +2,7 @@ title: JCVI Genome Annotation displayed_sidebar: multiqcSidebar description: > - Computes statistics on genome annotation. +

Computes statistics on genome annotation.

--- :::note -Computes statistics on genome annotation. + +

Computes statistics on genome annotation.

[https://pypi.org/project/jcvi/](https://pypi.org/project/jcvi/) ::: diff --git a/docs/markdown/modules/jellyfish.md b/docs/markdown/modules/jellyfish.md index f4b15e6be5..1eaac7bf23 100644 --- a/docs/markdown/modules/jellyfish.md +++ b/docs/markdown/modules/jellyfish.md @@ -2,7 +2,7 @@ title: Jellyfish displayed_sidebar: multiqcSidebar description: > - Counting k-mers in DNA. +

Counting k-mers in DNA.

--- :::note -Counting k-mers in DNA. + +

Counting k-mers in DNA.

[https://github.com/gmarcais/Jellyfish](https://github.com/gmarcais/Jellyfish) ::: diff --git a/docs/markdown/modules/kaiju.md b/docs/markdown/modules/kaiju.md index 55db39a02c..75cd9a7efd 100644 --- a/docs/markdown/modules/kaiju.md +++ b/docs/markdown/modules/kaiju.md @@ -2,7 +2,7 @@ title: Kaiju displayed_sidebar: multiqcSidebar description: > - Taxonomic classification for metagenomics. +

Taxonomic classification for metagenomics.

--- :::note -Taxonomic classification for metagenomics. + +

Taxonomic classification for metagenomics.

[http://kaiju.binf.ku.dk/](http://kaiju.binf.ku.dk/) ::: diff --git a/docs/markdown/modules/kallisto.md b/docs/markdown/modules/kallisto.md index 27dd0a9724..18113d94bc 100644 --- a/docs/markdown/modules/kallisto.md +++ b/docs/markdown/modules/kallisto.md @@ -2,7 +2,7 @@ title: Kallisto displayed_sidebar: multiqcSidebar description: > - Quantifies abundances of transcripts (or more generally, of target sequences) from RNA-Seq data +

Quantifies abundances of transcripts (or more generally, of target sequences) from RNA-Seq data.

--- :::note -Quantifies abundances of transcripts (or more generally, of target sequences) from RNA-Seq data + +

Quantifies abundances of transcripts (or more generally, of target sequences) from RNA-Seq data.

[http://pachterlab.github.io/kallisto/](http://pachterlab.github.io/kallisto/) ::: diff --git a/docs/markdown/modules/kat.md b/docs/markdown/modules/kat.md index fcf2bd5a63..fabcf9e25b 100644 --- a/docs/markdown/modules/kat.md +++ b/docs/markdown/modules/kat.md @@ -2,7 +2,7 @@ title: K-mer Analysis Toolkit displayed_sidebar: multiqcSidebar description: > - Analyses sequencing data via its k-mer spectra. +

Analyses sequencing data via its k-mer spectra.

--- :::note -Analyses sequencing data via its k-mer spectra. + +

Analyses sequencing data via its k-mer spectra.

[https://github.com/TGAC/KAT](https://github.com/TGAC/KAT) ::: diff --git a/docs/markdown/modules/kraken.md b/docs/markdown/modules/kraken.md index 8383e67cf8..3c1e1da27f 100644 --- a/docs/markdown/modules/kraken.md +++ b/docs/markdown/modules/kraken.md @@ -2,7 +2,7 @@ title: Kraken displayed_sidebar: multiqcSidebar description: > - Taxonomic classification using exact k-mer matches to find the lowest common ancestor (LCA) of a given sequence. +

Taxonomic classification using exact k-mer matches to find the lowest common ancestor (LCA) of a given sequence.

--- :::note -Taxonomic classification using exact k-mer matches to find the lowest common ancestor (LCA) of a given sequence. + +

Taxonomic classification using exact k-mer matches to find the lowest common ancestor (LCA) of a given sequence.

[https://ccb.jhu.edu/software/kraken/](https://ccb.jhu.edu/software/kraken/) ::: diff --git a/docs/markdown/modules/leehom.md b/docs/markdown/modules/leehom.md index ad9ead31ed..efd6e3a835 100644 --- a/docs/markdown/modules/leehom.md +++ b/docs/markdown/modules/leehom.md @@ -2,7 +2,7 @@ title: leeHom displayed_sidebar: multiqcSidebar description: > - Bayesian reconstruction of ancient DNA +

Bayesian reconstruction of ancient DNA.

--- :::note -Bayesian reconstruction of ancient DNA + +

Bayesian reconstruction of ancient DNA.

[https://github.com/grenaud/leeHom](https://github.com/grenaud/leeHom) ::: diff --git a/docs/markdown/modules/librarian.md b/docs/markdown/modules/librarian.md index 36a42c924e..e7367354f1 100644 --- a/docs/markdown/modules/librarian.md +++ b/docs/markdown/modules/librarian.md @@ -2,7 +2,7 @@ title: Librarian displayed_sidebar: multiqcSidebar description: > - Predicts the sequencing library type from the base composition of a FastQ file. +

Predicts the sequencing library type from the base composition of a FastQ file.

--- :::note -Predicts the sequencing library type from the base composition of a FastQ file. + +

Predicts the sequencing library type from the base composition of a FastQ file.

[https://github.com/DesmondWillowbrook/Librarian](https://github.com/DesmondWillowbrook/Librarian) ::: diff --git a/docs/markdown/modules/lima.md b/docs/markdown/modules/lima.md index 76415f7c51..b7dd696194 100644 --- a/docs/markdown/modules/lima.md +++ b/docs/markdown/modules/lima.md @@ -2,7 +2,7 @@ title: Lima displayed_sidebar: multiqcSidebar description: > - Demultiplex PacBio single-molecule sequencing reads. +

Demultiplex PacBio single-molecule sequencing reads.

--- :::note -Demultiplex PacBio single-molecule sequencing reads. + +

Demultiplex PacBio single-molecule sequencing reads.

[https://github.com/PacificBiosciences/barcoding](https://github.com/PacificBiosciences/barcoding) ::: diff --git a/docs/markdown/modules/longranger.md b/docs/markdown/modules/longranger.md index cbe9663d7f..a8224d3b69 100644 --- a/docs/markdown/modules/longranger.md +++ b/docs/markdown/modules/longranger.md @@ -2,7 +2,7 @@ title: Long Ranger displayed_sidebar: multiqcSidebar description: > - Sample demultiplexing, barcode processing, alignment, quality control, variant calling, phasing, and structural variant calling. +

Sample demultiplexing, barcode processing, alignment, quality control, variant calling, phasing, and structural variant calling.

--- :::note -Sample demultiplexing, barcode processing, alignment, quality control, variant calling, phasing, and structural variant calling. + +

Sample demultiplexing, barcode processing, alignment, quality control, variant calling, phasing, and structural variant calling.

[https://support.10xgenomics.com/genome-exome/software/pipelines/latest/what-is-long-ranger](https://support.10xgenomics.com/genome-exome/software/pipelines/latest/what-is-long-ranger) ::: diff --git a/docs/markdown/modules/macs2.md b/docs/markdown/modules/macs2.md index 6fc6bf9852..9f74d58fb9 100644 --- a/docs/markdown/modules/macs2.md +++ b/docs/markdown/modules/macs2.md @@ -2,7 +2,7 @@ title: MACS2 displayed_sidebar: multiqcSidebar description: > - Identifies transcription factor binding sites in ChIP-seq data. +

Identifies transcription factor binding sites in ChIP-seq data.

--- :::note -Identifies transcription factor binding sites in ChIP-seq data. + +

Identifies transcription factor binding sites in ChIP-seq data.

[https://macs3-project.github.io/MACS/](https://macs3-project.github.io/MACS/) ::: diff --git a/docs/markdown/modules/malt.md b/docs/markdown/modules/malt.md index ed8daf3623..cb5913141c 100644 --- a/docs/markdown/modules/malt.md +++ b/docs/markdown/modules/malt.md @@ -2,7 +2,7 @@ title: MALT displayed_sidebar: multiqcSidebar description: > - Aligns of metagenomic reads to a database of reference sequences (such as NR, GenBank or Silva) and outputs a MEGAN RMA file +

Aligns metagenomic reads to a database of reference sequences (such as NR, GenBank or Silva) and outputs a MEGAN RMA file.

--- :::note -Aligns of metagenomic reads to a database of reference sequences (such as NR, GenBank or Silva) and outputs a MEGAN RMA file + +

Aligns metagenomic reads to a database of reference sequences (such as NR, GenBank or Silva) and outputs a MEGAN RMA file.

[http://ab.inf.uni-tuebingen.de/software/malt/](http://ab.inf.uni-tuebingen.de/software/malt/) ::: diff --git a/docs/markdown/modules/mapdamage.md b/docs/markdown/modules/mapdamage.md index a54e5db5bd..a303cf5945 100644 --- a/docs/markdown/modules/mapdamage.md +++ b/docs/markdown/modules/mapdamage.md @@ -2,7 +2,7 @@ title: mapDamage displayed_sidebar: multiqcSidebar description: > - Tracks and quantifies damage patterns in ancient DNA sequences. +

Tracks and quantifies damage patterns in ancient DNA sequences.

--- :::note -Tracks and quantifies damage patterns in ancient DNA sequences. + +

Tracks and quantifies damage patterns in ancient DNA sequences.

[https://github.com/ginolhac/mapDamage](https://github.com/ginolhac/mapDamage) ::: diff --git a/docs/markdown/modules/megahit.md b/docs/markdown/modules/megahit.md index 90aed20fde..887039dd5b 100644 --- a/docs/markdown/modules/megahit.md +++ b/docs/markdown/modules/megahit.md @@ -2,7 +2,7 @@ title: MEGAHIT displayed_sidebar: multiqcSidebar description: > - NGS read assembler +

NGS read assembler.

--- :::note -NGS read assembler + +

NGS read assembler.

[https://github.com/voutcn/megahit](https://github.com/voutcn/megahit) ::: diff --git a/docs/markdown/modules/metaphlan.md b/docs/markdown/modules/metaphlan.md index b09d070ac2..ec6ae2fd25 100644 --- a/docs/markdown/modules/metaphlan.md +++ b/docs/markdown/modules/metaphlan.md @@ -2,7 +2,7 @@ title: MetaPhlAn displayed_sidebar: multiqcSidebar description: > - Profiles the composition of microbial communities from metagenomic shotgun sequencing data. +

Profiles the composition of microbial communities from metagenomic shotgun sequencing data.

--- :::note -Profiles the composition of microbial communities from metagenomic shotgun sequencing data. + +

Profiles the composition of microbial communities from metagenomic shotgun sequencing data.

[https://github.com/biobakery/MetaPhlAn](https://github.com/biobakery/MetaPhlAn) ::: diff --git a/docs/markdown/modules/methylqa.md b/docs/markdown/modules/methylqa.md index de09d26e49..67e28d3bf1 100644 --- a/docs/markdown/modules/methylqa.md +++ b/docs/markdown/modules/methylqa.md @@ -2,7 +2,7 @@ title: methylQA displayed_sidebar: multiqcSidebar description: > - Methylation sequencing data quality assessment tool. +

Methylation sequencing data quality assessment tool.

--- :::note -Methylation sequencing data quality assessment tool. + +

Methylation sequencing data quality assessment tool.

[http://methylqa.sourceforge.net/](http://methylqa.sourceforge.net/) ::: diff --git a/docs/markdown/modules/mgikit.md b/docs/markdown/modules/mgikit.md index 08b147f0cc..043c713177 100644 --- a/docs/markdown/modules/mgikit.md +++ b/docs/markdown/modules/mgikit.md @@ -2,7 +2,7 @@ title: mgikit displayed_sidebar: multiqcSidebar description: > - Demultiplexes FASTQ files from an MGI sequencing instrument +

Demultiplexes FASTQ files from an MGI sequencing instrument.

--- :::note -Demultiplexes FASTQ files from an MGI sequencing instrument + +

Demultiplexes FASTQ files from an MGI sequencing instrument.

[https://github.com/sagc-bioinformatics/mgikit](https://github.com/sagc-bioinformatics/mgikit) ::: diff --git a/docs/markdown/modules/minionqc.md b/docs/markdown/modules/minionqc.md index 4cc8313ea0..65de4abb31 100644 --- a/docs/markdown/modules/minionqc.md +++ b/docs/markdown/modules/minionqc.md @@ -2,7 +2,7 @@ title: MinIONQC displayed_sidebar: multiqcSidebar description: > - Quality control for ONT (Oxford Nanopore) long reads +

Quality control for ONT (Oxford Nanopore) long reads.

--- :::note -Quality control for ONT (Oxford Nanopore) long reads + +

Quality control for ONT (Oxford Nanopore) long reads.

[https://github.com/roblanf/minion_qc](https://github.com/roblanf/minion_qc) ::: diff --git a/docs/markdown/modules/mirtop.md b/docs/markdown/modules/mirtop.md index 06a070fa41..59b7c8ba10 100644 --- a/docs/markdown/modules/mirtop.md +++ b/docs/markdown/modules/mirtop.md @@ -2,7 +2,7 @@ title: mirtop displayed_sidebar: multiqcSidebar description: > - Annotates miRNAs and isomiRs and compute general statistics in mirGFF3 format. +

Annotates miRNAs and isomiRs and compute general statistics in mirGFF3 format.

--- :::note -Annotates miRNAs and isomiRs and compute general statistics in mirGFF3 format. + +

Annotates miRNAs and isomiRs and compute general statistics in mirGFF3 format.

[https://github.com/miRTop/mirtop/](https://github.com/miRTop/mirtop/) ::: diff --git a/docs/markdown/modules/mirtrace.md b/docs/markdown/modules/mirtrace.md index 7ed4990a02..09e10a0fd6 100644 --- a/docs/markdown/modules/mirtrace.md +++ b/docs/markdown/modules/mirtrace.md @@ -2,7 +2,7 @@ title: miRTrace displayed_sidebar: multiqcSidebar description: > - Quality control for small RNA sequencing data. +

Quality control for small RNA sequencing data.

--- :::note -Quality control for small RNA sequencing data. + +

Quality control for small RNA sequencing data.

[https://github.com/friedlanderlab/mirtrace](https://github.com/friedlanderlab/mirtrace) ::: diff --git a/docs/markdown/modules/mosaicatcher.md b/docs/markdown/modules/mosaicatcher.md index e8097bbc36..724160bfca 100644 --- a/docs/markdown/modules/mosaicatcher.md +++ b/docs/markdown/modules/mosaicatcher.md @@ -2,7 +2,7 @@ title: MosaiCatcher displayed_sidebar: multiqcSidebar description: > - Counts strand-seq reads and classifies strand states of each chromosome in each cell using a Hidden Markov Model. +

Counts strand-seq reads and classifies strand states of each chromosome in each cell using a Hidden Markov Model.

--- :::note -Counts strand-seq reads and classifies strand states of each chromosome in each cell using a Hidden Markov Model. + +

Counts strand-seq reads and classifies strand states of each chromosome in each cell using a Hidden Markov Model.

[https://github.com/friendsofstrandseq/mosaicatcher](https://github.com/friendsofstrandseq/mosaicatcher) ::: diff --git a/docs/markdown/modules/mosdepth.md b/docs/markdown/modules/mosdepth.md index 9a5e6d602d..b4d8a7fab4 100644 --- a/docs/markdown/modules/mosdepth.md +++ b/docs/markdown/modules/mosdepth.md @@ -2,7 +2,7 @@ title: Mosdepth displayed_sidebar: multiqcSidebar description: > - Fast BAM/CRAM depth calculation for WGS, exome, or targeted sequencing +

Fast BAM/CRAM depth calculation for WGS, exome, or targeted sequencing.

--- :::note -Fast BAM/CRAM depth calculation for WGS, exome, or targeted sequencing + +

Fast BAM/CRAM depth calculation for WGS, exome, or targeted sequencing.

[https://github.com/brentp/mosdepth](https://github.com/brentp/mosdepth) ::: diff --git a/docs/markdown/modules/motus.md b/docs/markdown/modules/motus.md index 0ce95c5fb1..168ace304b 100644 --- a/docs/markdown/modules/motus.md +++ b/docs/markdown/modules/motus.md @@ -2,7 +2,7 @@ title: Motus displayed_sidebar: multiqcSidebar description: > - Microbial profiling through marker gene (MG)-based operational taxonomic units (mOTUs). +

Microbial profiling through marker gene (MG)-based operational taxonomic units (mOTUs).

--- :::note -Microbial profiling through marker gene (MG)-based operational taxonomic units (mOTUs). + +

Microbial profiling through marker gene (MG)-based operational taxonomic units (mOTUs).

[https://motu-tool.org/](https://motu-tool.org/) ::: diff --git a/docs/markdown/modules/mtnucratio.md b/docs/markdown/modules/mtnucratio.md index 846cd3ad6f..d7a7529074 100644 --- a/docs/markdown/modules/mtnucratio.md +++ b/docs/markdown/modules/mtnucratio.md @@ -2,7 +2,7 @@ title: mtnucratio displayed_sidebar: multiqcSidebar description: > - Computes mitochondrial to nuclear genome ratios in NGS datasets. +

Computes mitochondrial to nuclear genome ratios in NGS datasets.

--- :::note -Computes mitochondrial to nuclear genome ratios in NGS datasets. + +

Computes mitochondrial to nuclear genome ratios in NGS datasets.

[http://www.github.com/apeltzer/MTNucRatioCalculator](http://www.github.com/apeltzer/MTNucRatioCalculator) ::: diff --git a/docs/markdown/modules/multivcfanalyzer.md b/docs/markdown/modules/multivcfanalyzer.md index 3ca396a555..d70e8129a2 100644 --- a/docs/markdown/modules/multivcfanalyzer.md +++ b/docs/markdown/modules/multivcfanalyzer.md @@ -2,7 +2,7 @@ title: MultiVCFAnalyzer displayed_sidebar: multiqcSidebar description: > - Reads multiple VCF files into combined genotype calls, produces summary statistics and downstream formats +

Reads multiple VCF files into combined genotype calls, produces summary statistics and downstream formats.

--- :::note -Reads multiple VCF files into combined genotype calls, produces summary statistics and downstream formats + +

Reads multiple VCF files into combined genotype calls, produces summary statistics and downstream formats.

[https://github.com/alexherbig/MultiVCFAnalyzer](https://github.com/alexherbig/MultiVCFAnalyzer) ::: diff --git a/docs/markdown/modules/nanoq.md b/docs/markdown/modules/nanoq.md index 7cdf33b6ac..98add59812 100644 --- a/docs/markdown/modules/nanoq.md +++ b/docs/markdown/modules/nanoq.md @@ -2,7 +2,7 @@ title: nanoq displayed_sidebar: multiqcSidebar description: > - Reports read quality and length from nanopore sequencing data +

Reports read quality and length from nanopore sequencing data.

--- :::note -Reports read quality and length from nanopore sequencing data + +

Reports read quality and length from nanopore sequencing data.

[https://github.com/nerdna/nanoq/](https://github.com/nerdna/nanoq/) ::: diff --git a/docs/markdown/modules/nanostat.md b/docs/markdown/modules/nanostat.md index 321cad0fb1..e3044415ef 100644 --- a/docs/markdown/modules/nanostat.md +++ b/docs/markdown/modules/nanostat.md @@ -2,7 +2,7 @@ title: NanoStat displayed_sidebar: multiqcSidebar description: > - Reports various statistics for long read dataset in FASTQ, BAM, or albacore sequencing summary format (supports NanoPack; NanoPlot, NanoComp). +

Reports various statistics for long read dataset in FASTQ, BAM, or albacore sequencing summary format (supports NanoPack; NanoPlot, NanoComp).

--- :::note -Reports various statistics for long read dataset in FASTQ, BAM, or albacore sequencing summary format (supports NanoPack; NanoPlot, NanoComp). + +

Reports various statistics for long read dataset in FASTQ, BAM, or albacore sequencing summary format (supports NanoPack; NanoPlot, NanoComp).

[https://github.com/wdecoster/nanostat/](https://github.com/wdecoster/nanostat/), [https://github.com/wdecoster/nanoplot/](https://github.com/wdecoster/nanoplot/) ::: diff --git a/docs/markdown/modules/nextclade.md b/docs/markdown/modules/nextclade.md index b4fa3c2949..2af051a12c 100644 --- a/docs/markdown/modules/nextclade.md +++ b/docs/markdown/modules/nextclade.md @@ -2,7 +2,7 @@ title: Nextclade displayed_sidebar: multiqcSidebar description: > - Viral genome alignment, clade assignment, mutation calling, and quality checks +

Viral genome alignment, clade assignment, mutation calling, and quality checks.

--- :::note -Viral genome alignment, clade assignment, mutation calling, and quality checks + +

Viral genome alignment, clade assignment, mutation calling, and quality checks.

[https://github.com/nextstrain/nextclade](https://github.com/nextstrain/nextclade) ::: diff --git a/docs/markdown/modules/ngsbits.md b/docs/markdown/modules/ngsbits.md index bafd841014..311f50bb63 100644 --- a/docs/markdown/modules/ngsbits.md +++ b/docs/markdown/modules/ngsbits.md @@ -2,7 +2,7 @@ title: ngs-bits displayed_sidebar: multiqcSidebar description: > - Calculating statistics from FASTQ, BAM, and VCF +

Calculating statistics from FASTQ, BAM, and VCF.

--- :::note -Calculating statistics from FASTQ, BAM, and VCF + +

Calculating statistics from FASTQ, BAM, and VCF.

[https://github.com/imgag/ngs-bits](https://github.com/imgag/ngs-bits) ::: diff --git a/docs/markdown/modules/ngsderive.md b/docs/markdown/modules/ngsderive.md index 6c147c5d71..e93a86b399 100644 --- a/docs/markdown/modules/ngsderive.md +++ b/docs/markdown/modules/ngsderive.md @@ -2,7 +2,7 @@ title: ngsderive displayed_sidebar: multiqcSidebar description: > - Forensic tool for by backwards computing library information in sequencing data +

Forensic tool for backwards computing library information in sequencing data.

--- :::note -Forensic tool for by backwards computing library information in sequencing data + +

Forensic tool for backwards computing library information in sequencing data.

[https://github.com/stjudecloud/ngsderive](https://github.com/stjudecloud/ngsderive) ::: diff --git a/docs/markdown/modules/nonpareil.md b/docs/markdown/modules/nonpareil.md index 8bb660f379..8bdd46e5bb 100644 --- a/docs/markdown/modules/nonpareil.md +++ b/docs/markdown/modules/nonpareil.md @@ -2,7 +2,7 @@ title: Nonpareil displayed_sidebar: multiqcSidebar description: > - Estimates metagenomic coverage and sequence diversity +

Estimates metagenomic coverage and sequence diversity.

--- :::note -Estimates metagenomic coverage and sequence diversity + +

Estimates metagenomic coverage and sequence diversity.

[https://github.com/lmrodriguezr/nonpareil](https://github.com/lmrodriguezr/nonpareil) ::: diff --git a/docs/markdown/modules/odgi.md b/docs/markdown/modules/odgi.md index 1d0eed2b1d..33fa716d2f 100644 --- a/docs/markdown/modules/odgi.md +++ b/docs/markdown/modules/odgi.md @@ -2,7 +2,7 @@ title: ODGI displayed_sidebar: multiqcSidebar description: > - Analysis and manipulation of pangenome graphs structured in the variation graph model. +

Analysis and manipulation of pangenome graphs structured in the variation graph model.

--- :::note -Analysis and manipulation of pangenome graphs structured in the variation graph model. + +

Analysis and manipulation of pangenome graphs structured in the variation graph model.

[https://github.com/pangenome/odgi](https://github.com/pangenome/odgi) ::: diff --git a/docs/markdown/modules/optitype.md b/docs/markdown/modules/optitype.md index 64223d83ec..11d5650d20 100644 --- a/docs/markdown/modules/optitype.md +++ b/docs/markdown/modules/optitype.md @@ -2,7 +2,7 @@ title: OptiType displayed_sidebar: multiqcSidebar description: > - Precision HLA typing from next-generation sequencing data. +

Precision HLA typing from next-generation sequencing data.

--- :::note -Precision HLA typing from next-generation sequencing data. + +

Precision HLA typing from next-generation sequencing data.

[https://github.com/FRED-2/OptiType](https://github.com/FRED-2/OptiType) ::: diff --git a/docs/markdown/modules/pairtools.md b/docs/markdown/modules/pairtools.md index ab046d4597..75691f5e46 100644 --- a/docs/markdown/modules/pairtools.md +++ b/docs/markdown/modules/pairtools.md @@ -2,7 +2,7 @@ title: pairtools displayed_sidebar: multiqcSidebar description: > - Toolkit for Chromatin Conformation Capture experiments. Handles short-reads paired reference alignments, extracts 3C-specific information, and perform common tasks such as sorting, filtering, and deduplication. +

Toolkit for Chromatin Conformation Capture experiments. Handles short-reads paired reference alignments, extracts 3C-specific information, and performs common tasks such as sorting, filtering, and deduplication.

--- :::note -Toolkit for Chromatin Conformation Capture experiments. Handles short-reads paired reference alignments, extracts 3C-specific information, and perform common tasks such as sorting, filtering, and deduplication. + +

Toolkit for Chromatin Conformation Capture experiments. Handles short-reads paired reference alignments, extracts 3C-specific information, and performs common tasks such as sorting, filtering, and deduplication.

[https://github.com/mirnylab/pairtools](https://github.com/mirnylab/pairtools) ::: diff --git a/docs/markdown/modules/pangolin.md b/docs/markdown/modules/pangolin.md index 16dd108171..e01b987777 100644 --- a/docs/markdown/modules/pangolin.md +++ b/docs/markdown/modules/pangolin.md @@ -2,7 +2,7 @@ title: Pangolin displayed_sidebar: multiqcSidebar description: > - Uses variant calls to assign SARS-CoV-2 genome sequences to global lineages. +

Uses variant calls to assign SARS-CoV-2 genome sequences to global lineages.

--- :::note -Uses variant calls to assign SARS-CoV-2 genome sequences to global lineages. + +

Uses variant calls to assign SARS-CoV-2 genome sequences to global lineages.

[https://github.com/cov-lineages/pangolin](https://github.com/cov-lineages/pangolin) ::: diff --git a/docs/markdown/modules/pbmarkdup.md b/docs/markdown/modules/pbmarkdup.md index 21e3526ce0..7617173a10 100644 --- a/docs/markdown/modules/pbmarkdup.md +++ b/docs/markdown/modules/pbmarkdup.md @@ -2,7 +2,7 @@ title: pbmarkdup displayed_sidebar: multiqcSidebar description: > - Takes one or multiple sequencing chips of an amplified libray as HiFi reads and marks or removes duplicates. +

Takes one or multiple sequencing chips of an amplified library as HiFi reads and marks or removes duplicates.

--- :::note -Takes one or multiple sequencing chips of an amplified libray as HiFi reads and marks or removes duplicates. + +

Takes one or multiple sequencing chips of an amplified library as HiFi reads and marks or removes duplicates.

[https://github.com/PacificBiosciences/pbmarkdup](https://github.com/PacificBiosciences/pbmarkdup) ::: diff --git a/docs/markdown/modules/peddy.md b/docs/markdown/modules/peddy.md index 756816779e..f2b7296397 100644 --- a/docs/markdown/modules/peddy.md +++ b/docs/markdown/modules/peddy.md @@ -2,7 +2,7 @@ title: Peddy displayed_sidebar: multiqcSidebar description: > - Compares familial-relationships and sexes as reported in a PED file with those inferred from a VCF. +

Compares familial-relationships and sexes as reported in a PED file with those inferred from a VCF.

--- :::note -Compares familial-relationships and sexes as reported in a PED file with those inferred from a VCF. + +

Compares familial-relationships and sexes as reported in a PED file with those inferred from a VCF.

[https://github.com/brentp/peddy](https://github.com/brentp/peddy) ::: diff --git a/docs/markdown/modules/percolator.md b/docs/markdown/modules/percolator.md index 648a50138d..9d6691895a 100644 --- a/docs/markdown/modules/percolator.md +++ b/docs/markdown/modules/percolator.md @@ -2,7 +2,7 @@ title: Percolator displayed_sidebar: multiqcSidebar description: > - Semi-supervised learning for peptide identification from shotgun proteomics datasets. +

Semi-supervised learning for peptide identification from shotgun proteomics datasets.

--- :::note -Semi-supervised learning for peptide identification from shotgun proteomics datasets. + +

Semi-supervised learning for peptide identification from shotgun proteomics datasets.

[https://github.com/percolator/percolator](https://github.com/percolator/percolator) ::: diff --git a/docs/markdown/modules/phantompeakqualtools.md b/docs/markdown/modules/phantompeakqualtools.md index e2b516d8eb..dc22bf6262 100644 --- a/docs/markdown/modules/phantompeakqualtools.md +++ b/docs/markdown/modules/phantompeakqualtools.md @@ -2,7 +2,7 @@ title: phantompeakqualtools displayed_sidebar: multiqcSidebar description: > - Computes informative enrichment and quality measures for ChIP-seq/DNase-seq/FAIRE-seq/MNase-seq data. +

Computes informative enrichment and quality measures for ChIP-seq/DNase-seq/FAIRE-seq/MNase-seq data.

--- :::note -Computes informative enrichment and quality measures for ChIP-seq/DNase-seq/FAIRE-seq/MNase-seq data. + +

Computes informative enrichment and quality measures for ChIP-seq/DNase-seq/FAIRE-seq/MNase-seq data.

[https://www.encodeproject.org/software/phantompeakqualtools](https://www.encodeproject.org/software/phantompeakqualtools) ::: diff --git a/docs/markdown/modules/picard.md b/docs/markdown/modules/picard.md index dd931d8ead..ac4377c4ba 100644 --- a/docs/markdown/modules/picard.md +++ b/docs/markdown/modules/picard.md @@ -2,7 +2,7 @@ title: Picard displayed_sidebar: multiqcSidebar description: > - Tools for manipulating high-throughput sequencing data. +

Tools for manipulating high-throughput sequencing data.

--- :::note -Tools for manipulating high-throughput sequencing data. + +

Tools for manipulating high-throughput sequencing data.

[http://broadinstitute.github.io/picard/](http://broadinstitute.github.io/picard/) ::: diff --git a/docs/markdown/modules/porechop.md b/docs/markdown/modules/porechop.md index 13d3dc86d2..442fe419e6 100644 --- a/docs/markdown/modules/porechop.md +++ b/docs/markdown/modules/porechop.md @@ -2,7 +2,7 @@ title: Porechop displayed_sidebar: multiqcSidebar description: > - Finds and removes adapters from Oxford Nanopore reads. +

Finds and removes adapters from Oxford Nanopore reads.

--- :::note -Finds and removes adapters from Oxford Nanopore reads. + +

Finds and removes adapters from Oxford Nanopore reads.

[https://github.com/rrwick/Porechop](https://github.com/rrwick/Porechop) ::: diff --git a/docs/markdown/modules/preseq.md b/docs/markdown/modules/preseq.md index 9607cf761d..7d85d1aec9 100644 --- a/docs/markdown/modules/preseq.md +++ b/docs/markdown/modules/preseq.md @@ -2,7 +2,7 @@ title: Preseq displayed_sidebar: multiqcSidebar description: > - Estimates library complexity, showing how many additional unique reads are sequenced for increasing total read count. +

Estimates library complexity, showing how many additional unique reads are sequenced for increasing total read count.

--- :::note -Estimates library complexity, showing how many additional unique reads are sequenced for increasing total read count. + +

Estimates library complexity, showing how many additional unique reads are sequenced for increasing total read count.

[http://smithlabresearch.org/software/preseq/](http://smithlabresearch.org/software/preseq/) ::: diff --git a/docs/markdown/modules/prinseqplusplus.md b/docs/markdown/modules/prinseqplusplus.md index 6f889167d0..7d8aa83bfd 100644 --- a/docs/markdown/modules/prinseqplusplus.md +++ b/docs/markdown/modules/prinseqplusplus.md @@ -2,7 +2,7 @@ title: PRINSEQ++ displayed_sidebar: multiqcSidebar description: > - C++ implementation of the prinseq-lite.pl program. Filters, reformats, and trims genomic and metagenomic reads. +

C++ implementation of the prinseq-lite.pl program. Filters, reformats, and trims genomic and metagenomic reads.

--- :::note -C++ implementation of the prinseq-lite.pl program. Filters, reformats, and trims genomic and metagenomic reads. + +

C++ implementation of the prinseq-lite.pl program. Filters, reformats, and trims genomic and metagenomic reads.

[https://github.com/Adrian-Cantu/PRINSEQ-plus-plus](https://github.com/Adrian-Cantu/PRINSEQ-plus-plus) ::: diff --git a/docs/markdown/modules/prokka.md b/docs/markdown/modules/prokka.md index 5271eafb0d..b10b947015 100644 --- a/docs/markdown/modules/prokka.md +++ b/docs/markdown/modules/prokka.md @@ -2,7 +2,7 @@ title: Prokka displayed_sidebar: multiqcSidebar description: > - Rapid annotation of prokaryotic genomes. +

Rapid annotation of prokaryotic genomes.

--- :::note -Rapid annotation of prokaryotic genomes. + +

Rapid annotation of prokaryotic genomes.

[http://www.vicbioinformatics.com/software.prokka.shtml](http://www.vicbioinformatics.com/software.prokka.shtml) ::: diff --git a/docs/markdown/modules/purple.md b/docs/markdown/modules/purple.md index 58b0cae221..4b4956f94c 100644 --- a/docs/markdown/modules/purple.md +++ b/docs/markdown/modules/purple.md @@ -2,7 +2,7 @@ title: PURPLE displayed_sidebar: multiqcSidebar description: > - A purity, ploidy and copy number estimator for whole genome tumor data +

A purity, ploidy and copy number estimator for whole genome tumor data.

--- :::note -A purity, ploidy and copy number estimator for whole genome tumor data + +

A purity, ploidy and copy number estimator for whole genome tumor data.

[https://github.com/hartwigmedical/hmftools/](https://github.com/hartwigmedical/hmftools/) ::: diff --git a/docs/markdown/modules/pychopper.md b/docs/markdown/modules/pychopper.md index 00df666f14..7f0a83f760 100644 --- a/docs/markdown/modules/pychopper.md +++ b/docs/markdown/modules/pychopper.md @@ -2,7 +2,7 @@ title: Pychopper displayed_sidebar: multiqcSidebar description: > - Identifies, orients, trims and rescues full length Nanopore cDNA reads. Can also rescue fused reads. +

Identifies, orients, trims and rescues full length Nanopore cDNA reads. Can also rescue fused reads.

--- :::note -Identifies, orients, trims and rescues full length Nanopore cDNA reads. Can also rescue fused reads. + +

Identifies, orients, trims and rescues full length Nanopore cDNA reads. Can also rescue fused reads.

[https://github.com/nanoporetech/pychopper](https://github.com/nanoporetech/pychopper) ::: diff --git a/docs/markdown/modules/pycoqc.md b/docs/markdown/modules/pycoqc.md index 6cb86689c8..88e3d3e8f4 100644 --- a/docs/markdown/modules/pycoqc.md +++ b/docs/markdown/modules/pycoqc.md @@ -2,7 +2,7 @@ title: pycoQC displayed_sidebar: multiqcSidebar description: > - Computes metrics and generates interactive QC plots for Oxford Nanopore technologies sequencing data +

Computes metrics and generates interactive QC plots for Oxford Nanopore technologies sequencing data.

--- :::note -Computes metrics and generates interactive QC plots for Oxford Nanopore technologies sequencing data + +

Computes metrics and generates interactive QC plots for Oxford Nanopore technologies sequencing data.

[https://github.com/tleonardi/pycoQC](https://github.com/tleonardi/pycoQC) ::: diff --git a/docs/markdown/modules/qc3C.md b/docs/markdown/modules/qc3C.md index 158a17ff61..53b0344333 100644 --- a/docs/markdown/modules/qc3C.md +++ b/docs/markdown/modules/qc3C.md @@ -2,7 +2,7 @@ title: qc3C displayed_sidebar: multiqcSidebar description: > - Reference-free and BAM based quality control for Hi-C data +

Reference-free and BAM based quality control for Hi-C data.

--- :::note -Reference-free and BAM based quality control for Hi-C data + +

Reference-free and BAM based quality control for Hi-C data.

[http://github.com/cerebis/qc3C](http://github.com/cerebis/qc3C) ::: diff --git a/docs/markdown/modules/qorts.md b/docs/markdown/modules/qorts.md index dc98387d36..7eaa9e1a2d 100644 --- a/docs/markdown/modules/qorts.md +++ b/docs/markdown/modules/qorts.md @@ -2,7 +2,7 @@ title: QoRTs displayed_sidebar: multiqcSidebar description: > - Toolkit for analysis, QC, and data management of RNA-Seq datasets. +

Toolkit for analysis, QC, and data management of RNA-Seq datasets.

--- :::note -Toolkit for analysis, QC, and data management of RNA-Seq datasets. + +

Toolkit for analysis, QC, and data management of RNA-Seq datasets.

[http://hartleys.github.io/QoRTs/](http://hartleys.github.io/QoRTs/) ::: diff --git a/docs/markdown/modules/qualimap.md b/docs/markdown/modules/qualimap.md index 382d58b554..ace24290c5 100644 --- a/docs/markdown/modules/qualimap.md +++ b/docs/markdown/modules/qualimap.md @@ -2,7 +2,7 @@ title: QualiMap displayed_sidebar: multiqcSidebar description: > - Quality control of alignment data and its derivatives like feature counts. +

Quality control of alignment data and its derivatives like feature counts.

--- :::note -Quality control of alignment data and its derivatives like feature counts. + +

Quality control of alignment data and its derivatives like feature counts.

[http://qualimap.bioinfo.cipf.es/](http://qualimap.bioinfo.cipf.es/) ::: diff --git a/docs/markdown/modules/quast.md b/docs/markdown/modules/quast.md index 6f0e26e2e9..39d4e477c8 100644 --- a/docs/markdown/modules/quast.md +++ b/docs/markdown/modules/quast.md @@ -2,7 +2,7 @@ title: QUAST displayed_sidebar: multiqcSidebar description: > - Quality assessment tool for genome assemblies +

Quality assessment tool for genome assemblies.

--- :::note -Quality assessment tool for genome assemblies + +

Quality assessment tool for genome assemblies.

[http://quast.bioinf.spbau.ru/](http://quast.bioinf.spbau.ru/) ::: diff --git a/docs/markdown/modules/rna_seqc.md b/docs/markdown/modules/rna_seqc.md index db93910ad6..0e43a34532 100644 --- a/docs/markdown/modules/rna_seqc.md +++ b/docs/markdown/modules/rna_seqc.md @@ -2,7 +2,7 @@ title: RNA-SeQC displayed_sidebar: multiqcSidebar description: > - RNA-Seq metrics for quality control and process optimization +

RNA-Seq metrics for quality control and process optimization.

--- :::note -RNA-Seq metrics for quality control and process optimization + +

RNA-Seq metrics for quality control and process optimization.

[https://github.com/getzlab/rnaseqc](https://github.com/getzlab/rnaseqc) ::: diff --git a/docs/markdown/modules/rockhopper.md b/docs/markdown/modules/rockhopper.md index 53e86f693d..57b9b68976 100644 --- a/docs/markdown/modules/rockhopper.md +++ b/docs/markdown/modules/rockhopper.md @@ -2,7 +2,7 @@ title: Rockhopper displayed_sidebar: multiqcSidebar description: > - Bacterial RNA-seq analysis: align reads to coding sequences, rRNAs, tRNAs, and miscellaneous RNAs +

Bacterial RNA-seq analysis: align reads to coding sequences, rRNAs, tRNAs, and miscellaneous RNAs.

--- :::note -Bacterial RNA-seq analysis: align reads to coding sequences, rRNAs, tRNAs, and miscellaneous RNAs + +

Bacterial RNA-seq analysis: align reads to coding sequences, rRNAs, tRNAs, and miscellaneous RNAs.

[https://cs.wellesley.edu/~btjaden/Rockhopper/](https://cs.wellesley.edu/~btjaden/Rockhopper/) ::: diff --git a/docs/markdown/modules/rsem.md b/docs/markdown/modules/rsem.md index d2d32621bc..14052862ab 100644 --- a/docs/markdown/modules/rsem.md +++ b/docs/markdown/modules/rsem.md @@ -2,7 +2,7 @@ title: RSEM displayed_sidebar: multiqcSidebar description: > - Estimates gene and isoform expression levels from RNA-Seq data. +

Estimates gene and isoform expression levels from RNA-Seq data.

--- :::note -Estimates gene and isoform expression levels from RNA-Seq data. + +

Estimates gene and isoform expression levels from RNA-Seq data.

[https://deweylab.github.io/RSEM/](https://deweylab.github.io/RSEM/) ::: diff --git a/docs/markdown/modules/rseqc.md b/docs/markdown/modules/rseqc.md index 60bce81128..3bacf459c3 100644 --- a/docs/markdown/modules/rseqc.md +++ b/docs/markdown/modules/rseqc.md @@ -2,7 +2,7 @@ title: RSeQC displayed_sidebar: multiqcSidebar description: > - Evaluates high throughput RNA-seq data. +

Evaluates high throughput RNA-seq data.

--- :::note -Evaluates high throughput RNA-seq data. + +

Evaluates high throughput RNA-seq data.

[http://rseqc.sourceforge.net/](http://rseqc.sourceforge.net/) ::: diff --git a/docs/markdown/modules/salmon.md b/docs/markdown/modules/salmon.md index 929c092714..824a8dca06 100644 --- a/docs/markdown/modules/salmon.md +++ b/docs/markdown/modules/salmon.md @@ -2,7 +2,7 @@ title: Salmon displayed_sidebar: multiqcSidebar description: > - Quantifies expression of transcripts using RNA-seq data. +

Quantifies expression of transcripts using RNA-seq data.

--- :::note -Quantifies expression of transcripts using RNA-seq data. + +

Quantifies expression of transcripts using RNA-seq data.

[https://combine-lab.github.io/salmon/](https://combine-lab.github.io/salmon/) ::: diff --git a/docs/markdown/modules/sambamba.md b/docs/markdown/modules/sambamba.md index d1a8568adf..d55a677b08 100644 --- a/docs/markdown/modules/sambamba.md +++ b/docs/markdown/modules/sambamba.md @@ -2,7 +2,7 @@ title: Sambamba displayed_sidebar: multiqcSidebar description: > - Toolkit for interacting with BAM/CRAM files. +

Toolkit for interacting with BAM/CRAM files.

--- :::note -Toolkit for interacting with BAM/CRAM files. + +

Toolkit for interacting with BAM/CRAM files.

[https://lomereiter.github.io/sambamba/](https://lomereiter.github.io/sambamba/) ::: diff --git a/docs/markdown/modules/samblaster.md b/docs/markdown/modules/samblaster.md index 5d6fad59f6..ba7625b246 100644 --- a/docs/markdown/modules/samblaster.md +++ b/docs/markdown/modules/samblaster.md @@ -2,7 +2,7 @@ title: Samblaster displayed_sidebar: multiqcSidebar description: > - Marks duplicates and extracts discordant and split reads from sam files. +

Marks duplicates and extracts discordant and split reads from sam files.

--- :::note -Marks duplicates and extracts discordant and split reads from sam files. + +

Marks duplicates and extracts discordant and split reads from sam files.

[https://github.com/GregoryFaust/samblaster](https://github.com/GregoryFaust/samblaster) ::: diff --git a/docs/markdown/modules/samtools.md b/docs/markdown/modules/samtools.md index 1d98131f63..6742944938 100644 --- a/docs/markdown/modules/samtools.md +++ b/docs/markdown/modules/samtools.md @@ -2,7 +2,7 @@ title: Samtools displayed_sidebar: multiqcSidebar description: > - Toolkit for interacting with BAM/CRAM files. +

Toolkit for interacting with BAM/CRAM files.

--- :::note -Toolkit for interacting with BAM/CRAM files. + +

Toolkit for interacting with BAM/CRAM files.

[http://www.htslib.org](http://www.htslib.org) ::: diff --git a/docs/markdown/modules/sargasso.md b/docs/markdown/modules/sargasso.md index 6553cdb1c6..b08b8b8c6a 100644 --- a/docs/markdown/modules/sargasso.md +++ b/docs/markdown/modules/sargasso.md @@ -2,7 +2,7 @@ title: Sargasso displayed_sidebar: multiqcSidebar description: > - Separates mixed-species RNA-seq reads according to their species of origin. +

Separates mixed-species RNA-seq reads according to their species of origin.

--- :::note -Separates mixed-species RNA-seq reads according to their species of origin. + +

Separates mixed-species RNA-seq reads according to their species of origin.

[http://biomedicalinformaticsgroup.github.io/Sargasso/](http://biomedicalinformaticsgroup.github.io/Sargasso/) ::: diff --git a/docs/markdown/modules/seqera_cli.md b/docs/markdown/modules/seqera_cli.md index 1693426223..b8a68215ec 100644 --- a/docs/markdown/modules/seqera_cli.md +++ b/docs/markdown/modules/seqera_cli.md @@ -2,7 +2,7 @@ title: Seqera Platform CLI displayed_sidebar: multiqcSidebar description: > - Reports statistics generated by the Seqera Platform CLI. +

Reports statistics generated by the Seqera Platform CLI.

--- :::note -Reports statistics generated by the Seqera Platform CLI. + +

Reports statistics generated by the Seqera Platform CLI.

[https://github.com/seqeralabs/tower-cli](https://github.com/seqeralabs/tower-cli) ::: diff --git a/docs/markdown/modules/seqfu.md b/docs/markdown/modules/seqfu.md new file mode 100644 index 0000000000..c426367820 --- /dev/null +++ b/docs/markdown/modules/seqfu.md @@ -0,0 +1,48 @@ +--- +title: Seqfu +displayed_sidebar: multiqcSidebar +description: > +

Manipulate FASTA/FASTQ files.

+--- + + + +:::note + +

Manipulate FASTA/FASTQ files.

+ +[https://telatin.github.io/seqfu2](https://telatin.github.io/seqfu2) +::: + +Supported commands: + +- `stats`: + +### seqfu stats + +#### Input files + +`seqfu stats` can generate reports in multiple formats, see https://telatin.github.io/seqfu2/tools/stats.html. Only TSVs with headers (default `seqfu stats` output) are currently detected and parsed by MultiQC. + +:::note +`seqfu stats` has a `--multiqc` option that generates a `_mqc.txt` file that can be used with MultiQC as custom content. This is different from this module which enables additional features. +::: + +#### Configuration + +Sample names are automatically extracted from the "File" columns by default. If you only have one sample per file and prefer to use the filename as the sample name instead, you can set the global `use_filename_as_sample_name` option to `true` or list `seqfu` under it. + +### File search patterns + +```yaml +seqfu/stats: + contents: "File\t#Seq\tTotal bp\tAvg\tN50\tN75\tN90\tauN\tMin\tMax" + num_lines: 1 +``` diff --git a/docs/markdown/modules/sequali.md b/docs/markdown/modules/sequali.md index 06de6f2c65..cee23865a9 100644 --- a/docs/markdown/modules/sequali.md +++ b/docs/markdown/modules/sequali.md @@ -2,7 +2,7 @@ title: Sequali displayed_sidebar: multiqcSidebar description: > - Sequencing quality control for both long-read and short-read data +

Sequencing quality control for both long-read and short-read data.

--- :::note -Sequencing quality control for both long-read and short-read data + +

Sequencing quality control for both long-read and short-read data.

[https://github.com/rhpvorderman/sequali](https://github.com/rhpvorderman/sequali) ::: diff --git a/docs/markdown/modules/seqwho.md b/docs/markdown/modules/seqwho.md index 891bc40b2a..84a366b831 100644 --- a/docs/markdown/modules/seqwho.md +++ b/docs/markdown/modules/seqwho.md @@ -2,7 +2,7 @@ title: SeqWho displayed_sidebar: multiqcSidebar description: > - Determines FASTQ(A) sequencing file source protocol and the species of origin, to check that the composition of the library is expected. +

Determines FASTQ(A) sequencing file source protocol and the species of origin, to check that the composition of the library is expected.

--- :::note -Determines FASTQ(A) sequencing file source protocol and the species of origin, to check that the composition of the library is expected. + +

Determines FASTQ(A) sequencing file source protocol and the species of origin, to check that the composition of the library is expected.

[https://daehwankimlab.github.io/seqwho/](https://daehwankimlab.github.io/seqwho/) ::: diff --git a/docs/markdown/modules/seqyclean.md b/docs/markdown/modules/seqyclean.md index 8cfd9342af..c746d43b10 100644 --- a/docs/markdown/modules/seqyclean.md +++ b/docs/markdown/modules/seqyclean.md @@ -2,7 +2,7 @@ title: SeqyClean displayed_sidebar: multiqcSidebar description: > - Filters adapters, vectors, and contaminants while quality trimming. +

Filters adapters, vectors, and contaminants while quality trimming.

--- :::note -Filters adapters, vectors, and contaminants while quality trimming. + +

Filters adapters, vectors, and contaminants while quality trimming.

[https://github.com/ibest/seqyclean](https://github.com/ibest/seqyclean) ::: diff --git a/docs/markdown/modules/sexdeterrmine.md b/docs/markdown/modules/sexdeterrmine.md index 1b0d144ebb..33b243ab26 100644 --- a/docs/markdown/modules/sexdeterrmine.md +++ b/docs/markdown/modules/sexdeterrmine.md @@ -2,7 +2,7 @@ title: SexDetErrmine displayed_sidebar: multiqcSidebar description: > - Calculates relative coverage of X and Y chromosomes and their associated error bars from the depth of coverage at specified SNPs. +

Calculates relative coverage of X and Y chromosomes and their associated error bars from the depth of coverage at specified SNPs.

--- :::note -Calculates relative coverage of X and Y chromosomes and their associated error bars from the depth of coverage at specified SNPs. + +

Calculates relative coverage of X and Y chromosomes and their associated error bars from the depth of coverage at specified SNPs.

[https://github.com/TCLamnidis/Sex.DetERRmine](https://github.com/TCLamnidis/Sex.DetERRmine) ::: diff --git a/docs/markdown/modules/sickle.md b/docs/markdown/modules/sickle.md index 6a58f16432..6b25536804 100644 --- a/docs/markdown/modules/sickle.md +++ b/docs/markdown/modules/sickle.md @@ -2,7 +2,7 @@ title: Sickle displayed_sidebar: multiqcSidebar description: > - A windowed adaptive trimming tool for FASTQ files using quality. +

A windowed adaptive trimming tool for FASTQ files using quality.

--- :::note -A windowed adaptive trimming tool for FASTQ files using quality. + +

A windowed adaptive trimming tool for FASTQ files using quality.

[https://github.com/najoshi/sickle](https://github.com/najoshi/sickle) ::: diff --git a/docs/markdown/modules/skewer.md b/docs/markdown/modules/skewer.md index 3d31cf39f5..c36d9d5a61 100644 --- a/docs/markdown/modules/skewer.md +++ b/docs/markdown/modules/skewer.md @@ -2,7 +2,7 @@ title: Skewer displayed_sidebar: multiqcSidebar description: > - Adapter trimming tool for NGS paired-end sequences. +

Adapter trimming tool for NGS paired-end sequences.

--- :::note -Adapter trimming tool for NGS paired-end sequences. + +

Adapter trimming tool for NGS paired-end sequences.

[https://github.com/relipmoc/skewer](https://github.com/relipmoc/skewer) ::: diff --git a/docs/markdown/modules/slamdunk.md b/docs/markdown/modules/slamdunk.md index 822663338d..a38ce8b8a7 100644 --- a/docs/markdown/modules/slamdunk.md +++ b/docs/markdown/modules/slamdunk.md @@ -2,7 +2,7 @@ title: Slamdunk displayed_sidebar: multiqcSidebar description: > - Tool to analyze SLAM-Seq data. +

Tool to analyze SLAM-Seq data.

--- :::note -Tool to analyze SLAM-Seq data. + +

Tool to analyze SLAM-Seq data.

[http://t-neumann.github.io/slamdunk/](http://t-neumann.github.io/slamdunk/) ::: diff --git a/docs/markdown/modules/snippy.md b/docs/markdown/modules/snippy.md index 2f33d88864..2227d029b6 100644 --- a/docs/markdown/modules/snippy.md +++ b/docs/markdown/modules/snippy.md @@ -2,7 +2,7 @@ title: Snippy displayed_sidebar: multiqcSidebar description: > - Rapid haploid variant calling and core genome alignment. +

Rapid haploid variant calling and core genome alignment.

--- :::note -Rapid haploid variant calling and core genome alignment. + +

Rapid haploid variant calling and core genome alignment.

[https://github.com/tseemann/snippy](https://github.com/tseemann/snippy) ::: diff --git a/docs/markdown/modules/snpeff.md b/docs/markdown/modules/snpeff.md index 6b09bdce65..220b764cc1 100644 --- a/docs/markdown/modules/snpeff.md +++ b/docs/markdown/modules/snpeff.md @@ -2,7 +2,7 @@ title: SnpEff displayed_sidebar: multiqcSidebar description: > - Annotates and predicts the effects of variants on genes (such as amino acid changes). +

Annotates and predicts the effects of variants on genes (such as amino acid changes).

--- :::note -Annotates and predicts the effects of variants on genes (such as amino acid changes). + +

Annotates and predicts the effects of variants on genes (such as amino acid changes).

[http://snpeff.sourceforge.net/](http://snpeff.sourceforge.net/) ::: diff --git a/docs/markdown/modules/snpsplit.md b/docs/markdown/modules/snpsplit.md index c94a6b6f68..6990dc60bc 100644 --- a/docs/markdown/modules/snpsplit.md +++ b/docs/markdown/modules/snpsplit.md @@ -2,7 +2,7 @@ title: SNPsplit displayed_sidebar: multiqcSidebar description: > - Allele-specific alignment sorter. Determines allelic origin of reads that cover known SNP positions +

Allele-specific alignment sorter. Determines allelic origin of reads that cover known SNP positions.

--- :::note -Allele-specific alignment sorter. Determines allelic origin of reads that cover known SNP positions + +

Allele-specific alignment sorter. Determines allelic origin of reads that cover known SNP positions.

[https://www.bioinformatics.babraham.ac.uk/projects/SNPsplit/](https://www.bioinformatics.babraham.ac.uk/projects/SNPsplit/) ::: diff --git a/docs/markdown/modules/somalier.md b/docs/markdown/modules/somalier.md index 366484fcf7..faeb56ce2a 100644 --- a/docs/markdown/modules/somalier.md +++ b/docs/markdown/modules/somalier.md @@ -2,7 +2,7 @@ title: Somalier displayed_sidebar: multiqcSidebar description: > - Genotype to pedigree correspondence checks from sketches derived from BAM/CRAM or VCF +

Genotype to pedigree correspondence checks from sketches derived from BAM/CRAM or VCF.

--- :::note -Genotype to pedigree correspondence checks from sketches derived from BAM/CRAM or VCF + +

Genotype to pedigree correspondence checks from sketches derived from BAM/CRAM or VCF.

[https://github.com/brentp/somalier](https://github.com/brentp/somalier) ::: diff --git a/docs/markdown/modules/sortmerna.md b/docs/markdown/modules/sortmerna.md index 2144141a79..ef50aa2d40 100644 --- a/docs/markdown/modules/sortmerna.md +++ b/docs/markdown/modules/sortmerna.md @@ -2,7 +2,7 @@ title: SortMeRNA displayed_sidebar: multiqcSidebar description: > - Program for filtering, mapping and OTU-picking NGS reads in metatranscriptomic and metagenomic data. +

Program for filtering, mapping and OTU-picking NGS reads in metatranscriptomic and metagenomic data.

--- :::note -Program for filtering, mapping and OTU-picking NGS reads in metatranscriptomic and metagenomic data. + +

Program for filtering, mapping and OTU-picking NGS reads in metatranscriptomic and metagenomic data.

[http://bioinfo.lifl.fr/RNA/sortmerna/](http://bioinfo.lifl.fr/RNA/sortmerna/) ::: diff --git a/docs/markdown/modules/sourmash.md b/docs/markdown/modules/sourmash.md index 3a2629916b..2c80e7f7ca 100644 --- a/docs/markdown/modules/sourmash.md +++ b/docs/markdown/modules/sourmash.md @@ -2,7 +2,7 @@ title: Sourmash displayed_sidebar: multiqcSidebar description: > - Quickly searches, compares, and analyzes genomic and metagenomic data sets. +

Quickly searches, compares, and analyzes genomic and metagenomic data sets.

--- :::note -Quickly searches, compares, and analyzes genomic and metagenomic data sets. + +

Quickly searches, compares, and analyzes genomic and metagenomic data sets.

[https://github.com/sourmash-bio/sourmash](https://github.com/sourmash-bio/sourmash) ::: diff --git a/docs/markdown/modules/spaceranger.md b/docs/markdown/modules/spaceranger.md index 6bff1c600d..73e6235535 100644 --- a/docs/markdown/modules/spaceranger.md +++ b/docs/markdown/modules/spaceranger.md @@ -2,7 +2,7 @@ title: Space Ranger displayed_sidebar: multiqcSidebar description: > - Tool to analyze 10x Genomics spatial transcriptomics data. +

Tool to analyze 10x Genomics spatial transcriptomics data.

--- :::note -Tool to analyze 10x Genomics spatial transcriptomics data. + +

Tool to analyze 10x Genomics spatial transcriptomics data.

[https://support.10xgenomics.com/spatial-gene-expression/software/pipelines/latest/what-is-space-ranger](https://support.10xgenomics.com/spatial-gene-expression/software/pipelines/latest/what-is-space-ranger) ::: diff --git a/docs/markdown/modules/stacks.md b/docs/markdown/modules/stacks.md index cb28837b0c..52b03de9da 100644 --- a/docs/markdown/modules/stacks.md +++ b/docs/markdown/modules/stacks.md @@ -2,7 +2,7 @@ title: Stacks displayed_sidebar: multiqcSidebar description: > - Analyzes restriction enzyme-based data (e.g. RAD-seq). +

Analyzes restriction enzyme-based data (e.g. RAD-seq).

--- :::note -Analyzes restriction enzyme-based data (e.g. RAD-seq). + +

Analyzes restriction enzyme-based data (e.g. RAD-seq).

[http://catchenlab.life.illinois.edu/stacks/](http://catchenlab.life.illinois.edu/stacks/) ::: diff --git a/docs/markdown/modules/star.md b/docs/markdown/modules/star.md index 54f03b280f..fedb39134d 100644 --- a/docs/markdown/modules/star.md +++ b/docs/markdown/modules/star.md @@ -2,7 +2,7 @@ title: STAR displayed_sidebar: multiqcSidebar description: > - Universal RNA-seq aligner. +

Universal RNA-seq aligner.

--- :::note -Universal RNA-seq aligner. + +

Universal RNA-seq aligner.

[https://github.com/alexdobin/STAR](https://github.com/alexdobin/STAR) ::: diff --git a/docs/markdown/modules/supernova.md b/docs/markdown/modules/supernova.md index bf1a74661b..0415694cf9 100644 --- a/docs/markdown/modules/supernova.md +++ b/docs/markdown/modules/supernova.md @@ -2,7 +2,7 @@ title: Supernova displayed_sidebar: multiqcSidebar description: > - De novo genome assembler of 10X Genomics linked-reads. +

De novo genome assembler of 10X Genomics linked-reads.

--- :::note -De novo genome assembler of 10X Genomics linked-reads. + +

De novo genome assembler of 10X Genomics linked-reads.

[https://www.10xgenomics.com/](https://www.10xgenomics.com/) ::: diff --git a/docs/markdown/modules/telseq.md b/docs/markdown/modules/telseq.md index 11c1f4820c..ce20204126 100644 --- a/docs/markdown/modules/telseq.md +++ b/docs/markdown/modules/telseq.md @@ -2,7 +2,7 @@ title: telseq displayed_sidebar: multiqcSidebar description: > - Estimates telomere length from whole genome sequencing data (BAMs). +

Estimates telomere length from whole genome sequencing data (BAMs).

--- :::note -Estimates telomere length from whole genome sequencing data (BAMs). + +

Estimates telomere length from whole genome sequencing data (BAMs).

[https://github.com/zd1/telseq](https://github.com/zd1/telseq) ::: diff --git a/docs/markdown/modules/theta2.md b/docs/markdown/modules/theta2.md index 77e5adf156..60b8fce3e8 100644 --- a/docs/markdown/modules/theta2.md +++ b/docs/markdown/modules/theta2.md @@ -2,7 +2,7 @@ title: THetA2 displayed_sidebar: multiqcSidebar description: > - Estimates tumour purity and clonal / subclonal copy number. +

Estimates tumour purity and clonal / subclonal copy number.

--- :::note -Estimates tumour purity and clonal / subclonal copy number. + +

Estimates tumour purity and clonal / subclonal copy number.

[http://compbio.cs.brown.edu/projects/theta/](http://compbio.cs.brown.edu/projects/theta/) ::: diff --git a/docs/markdown/modules/tophat.md b/docs/markdown/modules/tophat.md index 1ea45bf565..852be063dd 100644 --- a/docs/markdown/modules/tophat.md +++ b/docs/markdown/modules/tophat.md @@ -2,7 +2,7 @@ title: Tophat displayed_sidebar: multiqcSidebar description: > - Splice junction RNA-Seq reads mapper for mammalian-sized genomes. +

Splice junction RNA-Seq reads mapper for mammalian-sized genomes.

--- :::note -Splice junction RNA-Seq reads mapper for mammalian-sized genomes. + +

Splice junction RNA-Seq reads mapper for mammalian-sized genomes.

[https://ccb.jhu.edu/software/tophat/](https://ccb.jhu.edu/software/tophat/) ::: diff --git a/docs/markdown/modules/trimmomatic.md b/docs/markdown/modules/trimmomatic.md index 48c4374f48..b6ca36ba36 100644 --- a/docs/markdown/modules/trimmomatic.md +++ b/docs/markdown/modules/trimmomatic.md @@ -2,7 +2,7 @@ title: Trimmomatic displayed_sidebar: multiqcSidebar description: > - Read trimming tool for Illumina NGS data. +

Read trimming tool for Illumina NGS data.

--- :::note -Read trimming tool for Illumina NGS data. + +

Read trimming tool for Illumina NGS data.

[http://www.usadellab.org/cms/?page=trimmomatic](http://www.usadellab.org/cms/?page=trimmomatic) ::: diff --git a/docs/markdown/modules/truvari.md b/docs/markdown/modules/truvari.md index 089a426ca2..55614ff819 100644 --- a/docs/markdown/modules/truvari.md +++ b/docs/markdown/modules/truvari.md @@ -2,7 +2,7 @@ title: Truvari displayed_sidebar: multiqcSidebar description: > - Benchmarking, merging, and annotating structural variants +

Benchmarking, merging, and annotating structural variants.

--- :::note -Benchmarking, merging, and annotating structural variants + +

Benchmarking, merging, and annotating structural variants.

[https://github.com/ACEnglish/truvari](https://github.com/ACEnglish/truvari) ::: @@ -30,5 +31,5 @@ Supported commands: truvari/bench: contents_re: .*truvari.* bench.* fn: log.txt - num_lines: 2 + num_lines: 10 ``` diff --git a/docs/markdown/modules/umicollapse.md b/docs/markdown/modules/umicollapse.md index 35a55c0b5b..e6ccdce0eb 100644 --- a/docs/markdown/modules/umicollapse.md +++ b/docs/markdown/modules/umicollapse.md @@ -2,7 +2,7 @@ title: UMICollapse displayed_sidebar: multiqcSidebar description: > - Algorithms for efficiently collapsing reads with Unique Molecular Identifiers +

Algorithms for efficiently collapsing reads with Unique Molecular Identifiers.

--- :::note -Algorithms for efficiently collapsing reads with Unique Molecular Identifiers + +

Algorithms for efficiently collapsing reads with Unique Molecular Identifiers.

[https://github.com/Daniel-Liu-c0deb0t/UMICollapse](https://github.com/Daniel-Liu-c0deb0t/UMICollapse) ::: diff --git a/docs/markdown/modules/umitools.md b/docs/markdown/modules/umitools.md index 9da81ab35d..b5467a70b0 100644 --- a/docs/markdown/modules/umitools.md +++ b/docs/markdown/modules/umitools.md @@ -2,7 +2,7 @@ title: UMI-tools displayed_sidebar: multiqcSidebar description: > - Tools for dealing with Unique Molecular Identifiers (UMIs)/(RMTs) and scRNA-Seq barcodes. +

Tools for dealing with Unique Molecular Identifiers (UMIs)/(RMTs) and scRNA-Seq barcodes.

--- :::note -Tools for dealing with Unique Molecular Identifiers (UMIs)/(RMTs) and scRNA-Seq barcodes. + +

Tools for dealing with Unique Molecular Identifiers (UMIs)/(RMTs) and scRNA-Seq barcodes.

[https://github.com/CGATOxford/UMI-tools](https://github.com/CGATOxford/UMI-tools) ::: @@ -46,8 +47,8 @@ assumption fails, we extract the sample name from the log file name. ```yaml umitools/dedup: contents: "# output generated by dedup" - num_lines: 3 + num_lines: 100 umitools/extract: contents: "# output generated by extract" - num_lines: 3 + num_lines: 100 ``` diff --git a/docs/markdown/modules/varscan2.md b/docs/markdown/modules/varscan2.md index 3be7b2a9c4..8653ce86a1 100644 --- a/docs/markdown/modules/varscan2.md +++ b/docs/markdown/modules/varscan2.md @@ -2,7 +2,7 @@ title: VarScan2 displayed_sidebar: multiqcSidebar description: > - Variant detection in massively parallel sequencing data +

Variant detection in massively parallel sequencing data.

--- :::note -Variant detection in massively parallel sequencing data + +

Variant detection in massively parallel sequencing data.

[http://dkoboldt.github.io/varscan/](http://dkoboldt.github.io/varscan/) ::: @@ -36,11 +37,11 @@ The MultiQC module can read output from `mpileup2cns`, `mpileup2snp` and `mpileu ```yaml varscan2/mpileup2cns: contents: Only variants will be reported - num_lines: 3 + num_lines: 10 varscan2/mpileup2indel: contents: Only indels will be reported - num_lines: 3 + num_lines: 10 varscan2/mpileup2snp: contents: Only SNPs will be reported - num_lines: 3 + num_lines: 10 ``` diff --git a/docs/markdown/modules/vcftools.md b/docs/markdown/modules/vcftools.md index 191dfac7cb..0327e634c4 100644 --- a/docs/markdown/modules/vcftools.md +++ b/docs/markdown/modules/vcftools.md @@ -2,7 +2,7 @@ title: VCFTools displayed_sidebar: multiqcSidebar description: > - Program to analyse and reporting on VCF files. +

Program to analyse and reporting on VCF files.

--- :::note -Program to analyse and reporting on VCF files. + +

Program to analyse and reporting on VCF files.

[https://vcftools.github.io](https://vcftools.github.io) ::: diff --git a/docs/markdown/modules/vep.md b/docs/markdown/modules/vep.md index cfc78275f4..5023c28358 100644 --- a/docs/markdown/modules/vep.md +++ b/docs/markdown/modules/vep.md @@ -2,7 +2,7 @@ title: VEP displayed_sidebar: multiqcSidebar description: > - Determines the effect of variants on genes, transcripts and protein sequences, as well as regulatory regions. +

Determines the effect of variants on genes, transcripts and protein sequences, as well as regulatory regions.

--- :::note -Determines the effect of variants on genes, transcripts and protein sequences, as well as regulatory regions. + +

Determines the effect of variants on genes, transcripts and protein sequences, as well as regulatory regions.

[https://www.ensembl.org/info/docs/tools/vep/index.html](https://www.ensembl.org/info/docs/tools/vep/index.html) ::: diff --git a/docs/markdown/modules/verifybamid.md b/docs/markdown/modules/verifybamid.md index 1173a98513..dc74b8dc53 100644 --- a/docs/markdown/modules/verifybamid.md +++ b/docs/markdown/modules/verifybamid.md @@ -2,7 +2,7 @@ title: VerifyBAMID displayed_sidebar: multiqcSidebar description: > - Detects sample contamination and/or sample swaps. +

Detects sample contamination and/or sample swaps.

--- :::note -Detects sample contamination and/or sample swaps. + +

Detects sample contamination and/or sample swaps.

[https://genome.sph.umich.edu/wiki/VerifyBamID](https://genome.sph.umich.edu/wiki/VerifyBamID) ::: diff --git a/docs/markdown/modules/vg.md b/docs/markdown/modules/vg.md index 89e9286113..5e2d76331d 100644 --- a/docs/markdown/modules/vg.md +++ b/docs/markdown/modules/vg.md @@ -2,7 +2,7 @@ title: VG displayed_sidebar: multiqcSidebar description: > - Toolkit to manipulate and analyze graphical genomes, including read alignment +

Toolkit to manipulate and analyze graphical genomes, including read alignment.

--- :::note -Toolkit to manipulate and analyze graphical genomes, including read alignment + +

Toolkit to manipulate and analyze graphical genomes, including read alignment.

[https://github.com/vgteam/vg](https://github.com/vgteam/vg) ::: diff --git a/docs/markdown/modules/whatshap.md b/docs/markdown/modules/whatshap.md index d7e87ab78e..778da929ef 100644 --- a/docs/markdown/modules/whatshap.md +++ b/docs/markdown/modules/whatshap.md @@ -2,7 +2,7 @@ title: WhatsHap displayed_sidebar: multiqcSidebar description: > - Phasing genomic variants using DNA reads (aka read-based phasing, or haplotype assembly) +

Phasing genomic variants using DNA reads (aka read-based phasing, or haplotype assembly).

--- :::note -Phasing genomic variants using DNA reads (aka read-based phasing, or haplotype assembly) + +

Phasing genomic variants using DNA reads (aka read-based phasing, or haplotype assembly).

[https://whatshap.readthedocs.io/](https://whatshap.readthedocs.io/) ::: diff --git a/docs/markdown/modules/xengsort.md b/docs/markdown/modules/xengsort.md index 6e6f2753dd..a50ae563ed 100644 --- a/docs/markdown/modules/xengsort.md +++ b/docs/markdown/modules/xengsort.md @@ -2,7 +2,7 @@ title: Xengsort displayed_sidebar: multiqcSidebar description: > - Fast xenograft read sorter based on space-efficient k-mer hashing +

Fast xenograft read sorter based on space-efficient k-mer hashing.

--- :::note -Fast xenograft read sorter based on space-efficient k-mer hashing + +

Fast xenograft read sorter based on space-efficient k-mer hashing.

[https://gitlab.com/genomeinformatics/xengsort](https://gitlab.com/genomeinformatics/xengsort) ::: diff --git a/docs/markdown/modules/xenium.md b/docs/markdown/modules/xenium.md new file mode 100644 index 0000000000..37a87065a2 --- /dev/null +++ b/docs/markdown/modules/xenium.md @@ -0,0 +1,49 @@ +--- +title: Xenium +displayed_sidebar: multiqcSidebar +description: > +

Spatial transcriptomics platform from 10x Genomics that provides subcellular resolution.

+--- + + + +:::note + +

Spatial transcriptomics platform from 10x Genomics that provides subcellular resolution.

+ +[https://www.10xgenomics.com/platforms/xenium](https://www.10xgenomics.com/platforms/xenium) +::: + +Xenium is a spatial transcriptomics platform from 10x Genomics that provides subcellular resolution. + +NOTE: parsing huge files is not an intended MultiQC usage. By default, MultiQC will ignore the `*.parquet` files +as they are gigabyte-sized. To enable parsing those, make sure to have this line in your config: + +``` +log_filesize_limit: 5000000000 # 5GB +``` + +### File search patterns + +```yaml +xenium/cell_feature_matrix: + fn: cell_feature_matrix.h5 +xenium/cells: + fn: cells.parquet +xenium/experiment: + fn: experiment.xenium + num_lines: 50 +xenium/metrics: + contents: num_cells_detected + fn: metrics_summary.csv + num_lines: 5 +xenium/transcripts: + fn: transcripts.parquet +``` diff --git a/docs/markdown/modules/xenome.md b/docs/markdown/modules/xenome.md index bb6d3cf1ad..7335d52abe 100644 --- a/docs/markdown/modules/xenome.md +++ b/docs/markdown/modules/xenome.md @@ -2,7 +2,7 @@ title: Xenome displayed_sidebar: multiqcSidebar description: > - Classifies reads from xenograft sources. +

Classifies reads from xenograft sources.

--- :::note -Classifies reads from xenograft sources. + +

Classifies reads from xenograft sources.

[https://github.com/data61/gossamer/blob/master/docs/xenome.md](https://github.com/data61/gossamer/blob/master/docs/xenome.md) ::: diff --git a/multiqc/modules/seqfu/seqfu.py b/multiqc/modules/seqfu/seqfu.py index 08035493c4..ead93822a1 100644 --- a/multiqc/modules/seqfu/seqfu.py +++ b/multiqc/modules/seqfu/seqfu.py @@ -34,7 +34,7 @@ def __init__(self): anchor="seqfu", target="seqfu", href="https://telatin.github.io/seqfu2", - info="A general-purpose program to manipulate and parse information from FASTX files", + info="Manipulate FASTA/FASTQ files.", doi="10.3390/bioengineering8050059", ) diff --git a/pyproject.toml b/pyproject.toml index 6b2296e2a2..cca9270bf0 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta" [project] name = "multiqc" -version = "1.30dev" +version = "1.31" dependencies = [ "boto3", # for aws bedrock ai support "click", diff --git a/scripts/print_changelog.py b/scripts/print_changelog.py index 316f11ddab..55cca5b4b8 100755 --- a/scripts/print_changelog.py +++ b/scripts/print_changelog.py @@ -21,6 +21,9 @@ WORKSPACE_PATH = Path(os.environ.get("GITHUB_WORKSPACE", ".")) MODULES_SUBDIR = Path("multiqc/modules") +if not GITHUB_TOKEN: + raise ValueError("Please set the GITHUB_TOKEN environment variable") + def run_cmd(cmd): print(cmd) @@ -30,25 +33,44 @@ def run_cmd(cmd): return result -def get_milestone_prs(repo, current_tag: str, previous_tag: str, limit=100) -> List[PullRequest]: - all_pulls: List[PullRequest] = [] - - page = repo.get_pulls(state="closed", sort="updated", direction="desc") - for p in page: - if not p.milestone: - print(f"PR does not have a milestone: {p.number} {p.title}") - elif p.milestone.title == previous_tag: - return all_pulls - elif p.milestone.title == current_tag: - all_pulls.append(p) - if len(all_pulls) >= limit: - print(f"Reached limit of {limit} PRs") - return all_pulls - else: - print( - f"The PR is not in the previous miletone {previous_tag} nor the current 
milestone {current_tag}: '{p.milestone.title}': {p.number} {p.title}" - ) +def get_milestone_prs(repo, current_tag: str, limit=100) -> List[PullRequest]: + """ + Get PRs for the current milestone using direct milestone filtering. + + This version directly queries PRs by milestone, ensuring completeness + and accuracy while minimizing API calls by using the Issue API. + """ + # Find the milestone object for the current tag + milestones = {m.title: m for m in repo.get_milestones(state="all")} + + current_milestone = milestones.get(current_tag) + if not current_milestone: + raise ValueError(f"Current milestone '{current_tag}' not found") + + print( + f"Found milestone '{current_tag}' with {current_milestone.open_issues + current_milestone.closed_issues} total issues/PRs" + ) + # Get issues/PRs for this specific milestone (GitHub API treats PRs as issues) + # This is more reliable than iterating through all PRs + issues = repo.get_issues(state="closed", milestone=current_milestone, sort="updated", direction="desc") + + # Filter to merged PRs only + all_pulls = [] + for issue in issues: + if issue.pull_request is not None: # This is a PR + try: + pr = repo.get_pull(issue.number) + if pr.merged: # Only include merged PRs + all_pulls.append(pr) + if len(all_pulls) >= limit: + print(f"Reached limit of {limit} PRs") + break + except Exception as e: + print(f"Error fetching PR #{issue.number}: {e}") + continue + + print(f"Found {len(all_pulls)} merged PRs for milestone '{current_tag}'") return all_pulls @@ -80,7 +102,7 @@ def main(): milestones = repo.get_milestones(state="all") assert_milestone_exists(milestones, current_tag) assert_milestone_exists(milestones, previous_minor_tag) - prs: List[PullRequest] = get_milestone_prs(repo, current_tag, previous_minor_tag) + prs: List[PullRequest] = get_milestone_prs(repo, current_tag) label_to_section: Dict[str, str] = { "module: new": "New modules", From 178dce8ddd92bfd5bb1cd719a38c3dc2070a46e5 Mon Sep 17 00:00:00 2001 From: Vlad 
Savelyev Date: Fri, 5 Sep 2025 13:07:04 +0200 Subject: [PATCH 23/35] Typo in changelog --- CHANGELOG.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index d56bd88a53..11205b7909 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -21,7 +21,7 @@ The parquet format is stable since 1.29, renaming the output file from `BETA-mul ### Fixes - Scatter plot: fix hiding dots by legend click ([#3321](https://github.com/MultiQC/MultiQC/pull/3321)) -- Plots: seuset unique series label for each plot type ([#3330](https://github.com/MultiQC/MultiQC/pull/3330)) +- Plots: set unique series label for each plot type ([#3330](https://github.com/MultiQC/MultiQC/pull/3330)) - Fix bulk sample renaming buttons ([#3300](https://github.com/MultiQC/MultiQC/pull/3300)) - Fix config flag types in schema ([#3318](https://github.com/MultiQC/MultiQC/pull/3318)) - Ignore pyc files when copying html files ([#3320](https://github.com/MultiQC/MultiQC/pull/3320)) From 38652ebef2ded73195fee6a7e2fa1fffb24d4931 Mon Sep 17 00:00:00 2001 From: Vlad Savelyev Date: Fri, 5 Sep 2025 13:29:58 +0200 Subject: [PATCH 24/35] Xenium: fix `Transcript Quality Summary ` order. 
Add warnings about missing `scipy` (#3334) * Xenium: warnings about missing scipy * Fix sorting categories * Add scipy dep --- multiqc/modules/xenium/xenium.py | 134 +++++++++++++++---------------- pyproject.toml | 3 +- 2 files changed, 65 insertions(+), 72 deletions(-) diff --git a/multiqc/modules/xenium/xenium.py b/multiqc/modules/xenium/xenium.py index 35a47d37b5..5ebfd27b3a 100644 --- a/multiqc/modules/xenium/xenium.py +++ b/multiqc/modules/xenium/xenium.py @@ -1526,6 +1526,28 @@ def xenium_transcript_quality_table(self, transcript_data_by_sample): if not all_categories: return None + # Sort categories for consistent ordering + sorted_categories = sorted( + all_categories, + key=lambda x: ( + 0 + if x == "Pre-designed" + else 1 + if x == "Custom" + else 2 + if x == "Genomic Control Probe" + else 3 + if x == "Negative Control Probe" + else 4 + if x == "Negative Control Codeword" + else 5 + if x == "Unassigned Codeword" + else 6 + if x == "Deprecated Codeword" + else 7 + ), + ) + # Create table data: samples as rows, categories as columns table_data = {} for sample_name, sample_data in transcript_data_by_sample.items(): @@ -1535,7 +1557,7 @@ def xenium_transcript_quality_table(self, transcript_data_by_sample): table_data[sample_name] = {} # Add mean quality for each category - for category in all_categories: + for category in sorted_categories: if category in sample_data["category_summary"]: mean_quality = sample_data["category_summary"][category]["mean_quality"] table_data[sample_name][f"{category} Mean QV"] = mean_quality @@ -1543,7 +1565,7 @@ def xenium_transcript_quality_table(self, transcript_data_by_sample): table_data[sample_name][f"{category} Mean QV"] = None # Add standard deviation for each category - for category in all_categories: + for category in sorted_categories: if category in sample_data["category_summary"]: std_quality = sample_data["category_summary"][category]["std_quality"] table_data[sample_name][f"{category} Std Dev"] = std_quality @@ 
-1556,28 +1578,6 @@ def xenium_transcript_quality_table(self, transcript_data_by_sample): # Create table headers for each category (both mean and std dev) headers: Dict[str, ColumnDict] = {} - # Sort categories for consistent ordering - sorted_categories = sorted( - all_categories, - key=lambda x: ( - 0 - if x == "Pre-designed" - else 1 - if x == "Custom" - else 2 - if x == "Genomic Сontrol probe" - else 3 - if x == "Negative Сontrol probe" - else 4 - if x == "Negative Сontrol codeword" - else 5 - if x == "Unassigned Сodeword" - else 6 - if x == "Deprecated Сodeword" - else 7 - ), - ) - # Create consistent abbreviations for column titles category_abbreviations = { "Pre-designed": "Pre-designed", @@ -1652,10 +1652,7 @@ def _create_single_sample_area_density(self, cell_data): log.warning("scipy not available, skipping density plots. Install scipy for enhanced plotting.") return None - import numpy as np - - if SCIPY_AVAILABLE: - from scipy.stats import gaussian_kde + from scipy.stats import gaussian_kde # Skip density plots if only pre-calculated statistics are available if "cell_area_values" not in cell_data: @@ -1889,11 +1886,8 @@ def _create_single_sample_ratio_density(self, cell_data): log.warning("scipy not available, skipping plots. 
Install scipy for enhanced plotting.") return None - import numpy as np from scipy import stats - from multiqc.plots import linegraph - # Skip density plots if only pre-calculated statistics are available if "nucleus_to_cell_area_ratio_values" not in cell_data: log.info( @@ -2408,33 +2402,28 @@ def calculate_noise_threshold_from_df(self, transcript_stats_df, quantile=0.99): if len(neg_control_counts) < 3: # Need at least 3 data points for meaningful statistics return None - try: - # Calculate threshold using log-space statistics (similar to notebook) - log_counts = np.log10(neg_control_counts) - - # Use median absolute deviation as robust estimate of standard deviation - median_log = np.median(log_counts) - mad = np.median(np.abs(log_counts - median_log)) - # Convert MAD to standard deviation equivalent (normal distribution scaling factor) - std_log = mad * 1.4826 + if not SCIPY_AVAILABLE: + # Fallback to simple percentile if scipy not available + log.warning("scipy not available, falling back to simple percentile for noise threshold") + return np.percentile(neg_control_counts, quantile * 100) - # Calculate upper bound using quantile - if SCIPY_AVAILABLE: - from scipy.stats import norm + # Calculate upper bound using quantile + from scipy.stats import norm - z_score = norm.ppf(quantile) - threshold_log = median_log + z_score * std_log + # Calculate threshold using log-space statistics (similar to notebook) + log_counts = np.log10(neg_control_counts) - threshold = 10**threshold_log - return threshold + # Use median absolute deviation as robust estimate of standard deviation + median_log = np.median(log_counts) + mad = np.median(np.abs(log_counts - median_log)) + # Convert MAD to standard deviation equivalent (normal distribution scaling factor) + std_log = mad * 1.4826 - except (ImportError, ValueError): - # Fallback to simple percentile if scipy not available - return np.percentile(neg_control_counts, quantile * 100) + z_score = norm.ppf(quantile) + threshold_log = 
median_log + z_score * std_log - except Exception: - # Return None if calculation fails - return None + threshold = 10**threshold_log + return threshold def xenium_cell_distributions_combined_plot(self, cells_data_by_sample): """Create combined plot for transcripts and detected genes per cell distributions""" @@ -2485,11 +2474,11 @@ def _create_single_sample_combined_density(self, samples_with_transcript_counts, ) return None raw_transcript_values = transcript_values - try: - if SCIPY_AVAILABLE: - from scipy.stats import gaussian_kde + transcript_values = np.array(transcript_values) + + if SCIPY_AVAILABLE: + from scipy.stats import gaussian_kde - transcript_values = np.array(transcript_values) kde = gaussian_kde(transcript_values) x_min, x_max = transcript_values.min(), transcript_values.max() x_range = np.linspace(x_min, x_max, 1000) @@ -2501,7 +2490,8 @@ def _create_single_sample_combined_density(self, samples_with_transcript_counts, transcripts_data[float(x)] = float(y) plot_data["Transcripts per cell"] = transcripts_data - except ImportError: + else: + log.warning("scipy not available, falling back to histogram") # Fallback to histogram if scipy not available bins = min(50, len(transcript_values) // 20) hist, bin_edges = np.histogram(transcript_values, bins=bins) @@ -2525,11 +2515,12 @@ def _create_single_sample_combined_density(self, samples_with_transcript_counts, return None else: raw_gene_values = gene_counts - try: - if SCIPY_AVAILABLE: - from scipy.stats import gaussian_kde - gene_counts = np.array(gene_counts) + gene_counts = np.array(gene_counts) + + if SCIPY_AVAILABLE: + from scipy.stats import gaussian_kde + kde = gaussian_kde(gene_counts) x_min, x_max = gene_counts.min(), gene_counts.max() x_range = np.linspace(x_min, x_max, 1000) @@ -2541,7 +2532,8 @@ def _create_single_sample_combined_density(self, samples_with_transcript_counts, genes_data[float(x)] = float(y) plot_data["Detected genes per cell"] = genes_data - except ImportError: + else: + 
log.warning("scipy not available, falling back to histogram") # Fallback to histogram if scipy not available bins = min(50, len(gene_counts) // 20) hist, bin_edges = np.histogram(gene_counts, bins=bins) @@ -2633,9 +2625,8 @@ def _create_single_sample_transcripts_density(self, samples_with_transcripts): s_name, transcript_values = next(iter(samples_with_transcripts.items())) # Create kernel density estimation - try: - if SCIPY_AVAILABLE: - from scipy.stats import gaussian_kde + if SCIPY_AVAILABLE: + from scipy.stats import gaussian_kde transcript_values = np.array(transcript_values) kde = gaussian_kde(transcript_values) @@ -2668,7 +2659,8 @@ def _create_single_sample_transcripts_density(self, samples_with_transcripts): return linegraph.plot(plot_data, config) - except ImportError: + else: + log.warning("scipy not available, falling back to histogram") # Fallback to histogram if scipy not available bins = min(50, len(transcript_values) // 20) hist, bin_edges = np.histogram(transcript_values, bins=bins) @@ -2740,9 +2732,8 @@ def _create_single_sample_transcript_counts_density(self, samples_with_transcrip s_name, gene_values = next(iter(samples_with_transcript_counts.items())) # Create kernel density estimation - try: - if SCIPY_AVAILABLE: - from scipy.stats import gaussian_kde + if SCIPY_AVAILABLE: + from scipy.stats import gaussian_kde gene_values = np.array(gene_values) kde = gaussian_kde(gene_values) @@ -2767,7 +2758,8 @@ def _create_single_sample_transcript_counts_density(self, samples_with_transcrip return linegraph.plot(plot_data, config) - except ImportError: + else: + log.warning("scipy not available, falling back to histogram") # Fallback to histogram if scipy not available bins = min(50, len(gene_values) // 20) hist, bin_edges = np.histogram(gene_values, bins=bins) diff --git a/pyproject.toml b/pyproject.toml index cca9270bf0..fd901dca5a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -32,7 +32,8 @@ dependencies = [ "jsonschema", "polars-lts-cpu", # for 
parquet support. Using LTS version for compatibility with older architectures "pyarrow", # for parquet support - "scanpy", # to parse h5 files + "scanpy", # to parse h5 files for Xenium module + "scipy", # for Xenium module. Though Scanpy depends on scipy anyway. ] requires-python = ">=3.8" authors = [ From 309600bf8b6f8d41e14cf4d5ae6ee7de0301edca Mon Sep 17 00:00:00 2001 From: Vlad Savelyev Date: Fri, 5 Sep 2025 13:31:54 +0200 Subject: [PATCH 25/35] Rich codex (#3335) * Generate new screengrabs with rich-codex * Fix path --------- Co-authored-by: github-actions[bot] --- docs/images/screenshots/fastqc-run.svg | 54 +++++++++++++------------- 1 file changed, 27 insertions(+), 27 deletions(-) diff --git a/docs/images/screenshots/fastqc-run.svg b/docs/images/screenshots/fastqc-run.svg index dc166b3d1d..a3cc7a85ee 100644 --- a/docs/images/screenshots/fastqc-run.svg +++ b/docs/images/screenshots/fastqc-run.svg @@ -19,52 +19,52 @@ font-weight: 700; } - .terminal-3560008324-matrix { + .terminal-3616762501-matrix { font-family: Fira Code, monospace; font-size: 20px; line-height: 24.4px; font-variant-east-asian: full-width; } - .terminal-3560008324-title { + .terminal-3616762501-title { font-size: 18px; font-weight: bold; font-family: arial; } - .terminal-3560008324-r1 { fill: #c5c8c6 } -.terminal-3560008324-r2 { fill: #ff2627 } -.terminal-3560008324-r3 { fill: #c5c8c6;font-weight: bold } -.terminal-3560008324-r4 { fill: #868887 } -.terminal-3560008324-r5 { fill: #608ab1 } + .terminal-3616762501-r1 { fill: #c5c8c6 } +.terminal-3616762501-r2 { fill: #ff2627 } +.terminal-3616762501-r3 { fill: #c5c8c6;font-weight: bold } +.terminal-3616762501-r4 { fill: #868887 } +.terminal-3616762501-r5 { fill: #608ab1 } - + - + - + - + - + - + - + - + - + @@ -76,18 +76,18 @@ - + - - $ multiqc . 
- -///MultiQC 🔍 v1.30 - -       file_search | Search path: /home/runner/work/MultiQC/MultiQC/test-data/data/modules/fastqc/v0.10.1 -            fastqc | Found 2 reports -     write_results | Data        : multiqc_data -     write_results | Report      : multiqc_report.html -           multiqc | MultiQC complete + + $ multiqc . + +///MultiQC 🔍 v1.31 + +       file_search | Search path: /home/runner/work/MultiQC/MultiQC/test-data/data/modules/fastqc/v0.10.1 +            fastqc | Found 2 reports +     write_results | Data        : multiqc_data +     write_results | Report      : multiqc_report.html +           multiqc | MultiQC complete From d5908ecf94717e372fd84c3dd917b5b6664440a5 Mon Sep 17 00:00:00 2001 From: Rintze Zelle <78232505+rzelle-lallemand@users.noreply.github.com> Date: Tue, 16 Sep 2025 07:37:17 -0400 Subject: [PATCH 26/35] Fix flag typo in running_multiqc.md (#3347) --- docs/markdown/getting_started/running_multiqc.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/markdown/getting_started/running_multiqc.md b/docs/markdown/getting_started/running_multiqc.md index 792f0eafeb..1d42a62c05 100644 --- a/docs/markdown/getting_started/running_multiqc.md +++ b/docs/markdown/getting_started/running_multiqc.md @@ -102,7 +102,7 @@ You can do this by using `-m`/`--modules` to explicitly define which modules you If an explicitly requested module couldn't find any expected input files, MultiQC will just continue with other modules. You can change this behaviour and make MultiQC -strict about missing input by setting the `--require-log` flag. +strict about missing input by setting the `--require-logs` flag. If set, MultiQC will exit with an error and exit code `1` if any of the modules specified with `-m` did not produce a section in the report. 
## Directory prefixes in sample names From 9bc89542b068d242600fb83f7a65f3780815fb73 Mon Sep 17 00:00:00 2001 From: Phil Ewels Date: Tue, 16 Sep 2025 14:04:22 +0200 Subject: [PATCH 27/35] Xenium docs: Mention supported version ranges. Closes #3344 --- multiqc/modules/xenium/xenium.py | 85 +++++++++++++++++--------------- 1 file changed, 45 insertions(+), 40 deletions(-) diff --git a/multiqc/modules/xenium/xenium.py b/multiqc/modules/xenium/xenium.py index 5ebfd27b3a..c31504ffd6 100644 --- a/multiqc/modules/xenium/xenium.py +++ b/multiqc/modules/xenium/xenium.py @@ -69,12 +69,17 @@ class MultiqcModule(BaseMultiqcModule): """ Xenium is a spatial transcriptomics platform from 10x Genomics that provides subcellular resolution. - NOTE: parsing huge files is not an intended MultiQC usage. By default, MultiQC will ignore the `*.parquet` files + :::note + Parsing huge files is not an intended MultiQC usage. By default, MultiQC will ignore the `*.parquet` files as they are gigabyte-sized. To enable parsing those, make sure to have this line in your config: ``` log_filesize_limit: 5000000000 # 5GB ``` + ::: + + The MultiQC module is tested with outputs from xenium-3.x, older versions of xenium output are + not supported and may even cause MultiQC to crash (see https://github.com/MultiQC/MultiQC/issues/3344). 
""" def __init__(self): @@ -236,17 +241,17 @@ def __init__(self): description="Transcript quality statistics by gene category", helptext=""" This scatter plot shows transcript quality statistics broken down by gene category: - + **Gene Categories:** - * **Pre-designed**: Standard genes from Xenium panels + * **Pre-designed**: Standard genes from Xenium panels * **Custom**: User-added custom targets * **Deprecated**: Genes no longer recommended for use * **Control**: Control probe sequences (e.g., negative controls) - + **Quality Metrics:** * **X-axis**: Transcript count per gene category * **Y-axis**: Quality score distribution for each category - + **Expected patterns:** * Pre-designed genes typically show the highest counts and quality * Custom genes may show variable performance depending on probe design @@ -261,24 +266,24 @@ def __init__(self): description="Per-sample mean transcript quality statistics by gene category", helptext=""" This table shows mean transcript quality statistics for each sample, with separate columns for each gene category: - + **Gene Categories:** * **Pre-designed**: Standard genes from Xenium panels - * **Custom**: User-added custom targets + * **Custom**: User-added custom targets * **Negative Control Probe/Codeword**: Control probes for background estimation * **Genomic Control Probe**: Genomic DNA controls * **Unassigned/Deprecated Codeword**: Other transcript types - + **Quality Score (QV) Interpretation:** * QV ≥20: High-quality transcripts (≥99% accuracy) * QV 10-20: Medium quality (90-99% accuracy) * QV <10: Low-quality transcripts (<90% accuracy) - + **Table Layout:** * **Rows**: Individual samples * **Columns**: Mean QV and Standard Deviation for each category * Values show quality statistics computed from all transcripts in that category for each sample - + **What to look for:** * Pre-designed genes should have high mean QV (>20) across all samples * Consistent quality patterns across samples indicate good data quality @@ 
-297,24 +302,24 @@ def __init__(self): description="Distribution of transcript counts per gene", helptext=""" This histogram shows the distribution of transcript counts per gene across all samples: - + **What it shows:** * **X-axis**: Number of transcripts per gene (log scale) * **Y-axis**: Number of genes with that transcript count * **Two categories**: Genes vs. non-genes (controls, unassigned, etc.) - + **Interpretation:** * **Most genes** should have moderate transcript counts (hundreds to thousands) * **Controls and non-genes** typically have lower counts * **Very high counts** may indicate highly expressed genes or technical artifacts * **Very low counts** may indicate poorly detected genes - + **What to look for:** * **Smooth distribution** for genes with a peak in the hundreds-thousands range - * **Lower counts** for non-gene features (controls) + * **Lower counts** for non-gene features (controls) * **No extreme outliers** unless biologically expected * **Consistent patterns** across similar tissue types - + **Quality indicators:** * Peak gene expression around 100-10,000 transcripts per gene is typical * Clear separation between gene and non-gene distributions @@ -333,20 +338,20 @@ def __init__(self): description="Distribution of cell areas across samples", helptext=""" This plot shows the distribution of cell areas in the sample(s): - + **Single sample**: Density plot with vertical lines showing mean and median cell area **Multiple samples**: Violin plots showing the distribution for each sample - + **Typical cell area ranges (tissue-dependent):** * **Most tissues**: 50-200 μm² * **Large cells** (e.g., neurons): 200-500 μm² * **Small cells** (e.g., lymphocytes): 20-80 μm² - + **What to look for:** * **Consistent distributions** across samples of the same tissue type * **Biologically reasonable values** for your tissue * **Outliers**: Very large or small cells may indicate segmentation issues - + **Troubleshooting:** * Bimodal distributions: May indicate 
mixed cell types or segmentation artifacts * Very large cells: Over-segmentation, cell doublets, or debris @@ -364,20 +369,20 @@ def __init__(self): description="Distribution of the fraction of transcripts found in the nucleus across cells", helptext=""" This plot shows the distribution of the fraction of RNA molecules located in the nucleus versus cytoplasm for each cell: - + **Single sample**: Density plot showing the distribution of nucleus RNA fractions **Multiple samples**: Box plots comparing distributions across samples - + **Biological interpretation:** * **Low values (0.0-0.2)**: Most RNA is cytoplasmic (expected for mature mRNAs) * **High values (>0.5)**: High nuclear retention (may indicate processing issues) * **Peak around 0.0-0.1**: Normal for most cell types with efficient RNA export - + **What to look for:** * **Consistent distributions** across samples of the same tissue type * **Biologically reasonable values** for your cell types * **Sample differences**: May reflect cell type composition or processing efficiency - + **Troubleshooting:** * Very high nuclear fractions: Check for nuclear segmentation issues * Bimodal distributions: May indicate different cell types or states @@ -395,26 +400,26 @@ def __init__(self): description="Distribution of nucleus-to-cell area ratios across cells", helptext=""" This plot shows the distribution of the ratio between nucleus area and total cell area for each cell: - + **Single sample**: Density plot showing the distribution of nucleus-to-cell area ratios **Multiple samples**: Box plots comparing distributions across samples - + **Biological interpretation:** * **Typical range**: 0.2-0.6 for most cell types * **Low values (<0.2)**: Small nucleus relative to cell (may indicate active/mature cells) * **High values (>0.6)**: Large nucleus relative to cell (may indicate dividing or stressed cells) * **Peak around 0.3-0.5**: Normal for most healthy cell types - + **What to look for:** * **Consistent distributions** 
across samples of the same tissue type * **Biologically reasonable values** for your cell types * **Sample differences**: May reflect different cell states or tissue composition - + **Quality assessment:** * Very low ratios: May indicate over-segmented cells or debris * Very high ratios: May indicate under-segmented cells or nuclear fragments * Bimodal distributions: May indicate different cell types or segmentation artifacts - + **Troubleshooting:** * Unusual distributions may suggest issues with nuclear or cell segmentation parameters * Consider tissue-specific expected ranges when evaluating results @@ -431,20 +436,20 @@ def __init__(self): description="Distribution of transcripts and detected genes per cell", helptext=""" This plot shows two key cell-level distributions with separate tabs/datasets: - + **Tab 1: Transcripts per cell** - Shows the distribution of total transcript counts per cell **Tab 2: Detected genes per cell** - Shows the distribution of unique genes detected per cell - + **Plot types:** * **Single sample**: Density plots showing the distribution shapes * **Multiple samples**: Box plots comparing distributions across samples - + **Transcripts per cell interpretation:** * **Typical range**: 100-5000 transcripts per cell for most tissues * **High transcript counts**: Metabolically active cells or large cell types * **Low transcript counts**: Less active cells, technical dropouts, or small cell fragments * **Quality thresholds**: <50 may indicate poor segmentation, >10,000 may indicate doublets - + **Detected genes per cell interpretation:** * **Typical range**: 50-2000 genes per cell depending on cell type and panel size * **High gene counts**: Metabolically active cells or cells with high expression diversity @@ -455,16 +460,16 @@ def __init__(self): * **Multimodal distributions**: May indicate different cell types or technical artifacts * **Sample consistency**: Similar distributions expected for replicate samples * **Positive correlation**: 
Generally expect transcripts and detected genes per cell to correlate - + **Panel considerations:** * **Pre-designed panels**: Gene counts limited by panel design (typically 100-1000 genes) * **Custom panels**: Consider gene selection bias when interpreting results * **Detection efficiency**: Some genes may be harder to detect than others - + **Quality assessment:** * **Counts**: Very low (<50) or very high (>10,000) may indicate segmentation issues * **Shoulder distributions**: May indicate presence of different cell types - + **Troubleshooting:** * Unusual distributions may suggest issues with transcript detection or cell segmentation * Consider cell type and tissue context when evaluating expected ranges @@ -483,30 +488,30 @@ def __init__(self): description="Field of View quality distribution across QV ranges", helptext=""" This plot shows the distribution of Field of View (FoV) quality across different quality ranges: - + **What is a Field of View?** * Each FoV represents one microscope imaging area/tile * Large tissue sections are imaged as multiple overlapping FoVs * FoVs are systematically captured in a grid pattern across the tissue - + **Plot interpretation:** * **X-axis**: Quality ranges (Low to Excellent QV ranges) * **Y-axis**: Fields of View in each quality range * **Colors**: Color-coded by quality level (grey=poor, green=excellent) * **Bars**: Each sample shown as separate colored bars for comparison - + **Quality ranges:** * **Low (QV < 20)**: Poor imaging quality - investigate issues (dark grey) * **Poor (QV 20-25)**: Below optimal quality - may need attention (light grey) * **Fair (QV 25-30)**: Acceptable quality (lighter grey) * **Good (QV 30-35)**: Good imaging quality (light green) * **Excellent (QV ≥ 35)**: Optimal imaging quality (bright green) - + **What to look for:** * **Good distribution**: Most FoVs should be in "Good" or "Excellent" ranges * **Few poor FoVs**: Minimal counts in "Low" and "Poor" ranges * **Sample consistency**: Similar 
distributions across samples - + **Troubleshooting:** * Many low-quality FoVs: Focus/illumination issues, debris, tissue damage * Sample inconsistency: Processing or storage differences From 2fc5fe864b573c6219cad748d3c530d38e121a2d Mon Sep 17 00:00:00 2001 From: Phil Ewels Date: Tue, 16 Sep 2025 18:00:37 +0200 Subject: [PATCH 28/35] Dockerfile: Add variant with all LaTeX requirements, switch to LuaTeX from PdfLatex (#3349) * Update Dockerfile to optionally include all LaTeX requirements. Build two docker images, with and without PDF deps. * Remove duplicate latex margin argument * Shorter easier to read GHA variable names * Docker: simplify CI logic, bump to Python 3.13 * Fix cache cleanup in Dockerfile after Python version bump. Kudos @cursor for spotting that * Update the docs --- .github/workflows/docker.yml | 29 ++++++++++++------- Dockerfile | 19 ++++++++---- .../getting_started/running_multiqc.md | 6 ++-- multiqc/core/write_results.py | 12 +++++++- multiqc/multiqc.py | 8 ++--- 5 files changed, 50 insertions(+), 24 deletions(-) diff --git a/.github/workflows/docker.yml b/.github/workflows/docker.yml index 8f356cd499..427f84795b 100644 --- a/.github/workflows/docker.yml +++ b/.github/workflows/docker.yml @@ -21,6 +21,13 @@ jobs: build: if: github.repository == 'MultiQC/MultiQC' runs-on: ubuntu-latest + strategy: + matrix: + v: + - lab: "" + build_args: "" + - lab: "pdf-" + build_args: "INSTALL_PANDOC=true" steps: - name: "Check out the repo" uses: actions/checkout@v4 @@ -51,29 +58,31 @@ jobs: - name: "Build dev" uses: docker/build-push-action@v3 - if: github.event_name != 'pull_request' && github.event_name != 'release' + if: github.event_name != 'release' with: # All available with python:3.X-slim are: # platforms: linux/386,linux/amd64,linux/arm/v5,linux/arm/v7,linux/arm64/v8,linux/ppc64le,linux/s390x # But 32-bit binaries likely require compilation from source so stick with linux/amd64 and linux/arm64 for now platforms: linux/amd64,linux/arm64 - push: true - 
# If it's a PR, use the branch name as a tag, otherwise use "dev" + build-args: ${{ matrix.v.build_args }} + # Only push if it's commit to main, or workflow_dispatch + push: ${{github.event_name != 'pull_request'}} tags: | - ${{ github.ref == 'refs/heads/main' && 'multiqc/multiqc:dev' || format('multiqc/multiqc:{0}', github.head_ref) }} - ${{ github.ref == 'refs/heads/main' && 'ghcr.io/multiqc/multiqc:dev' || format('ghcr.io/multiqc/multiqc:{0}', github.head_ref) }} + multiqc/multiqc:${{matrix.v.lab}}dev + ghcr.io/multiqc/multiqc:${{matrix.v.lab}}dev - name: "Build release" uses: docker/build-push-action@v3 - if: github.event_name != 'pull_request' && github.event_name == 'release' + if: github.event_name == 'release' with: # All available with python:3.X-slim are: # platforms: linux/386,linux/amd64,linux/arm/v5,linux/arm/v7,linux/arm64/v8,linux/ppc64le,linux/s390x # But 32-bit binaries likely require compilation from source so stick with linux/amd64 and linux/arm64 for now platforms: linux/amd64,linux/arm64 + build-args: ${{ matrix.v.build_args }} push: true tags: | - multiqc/multiqc:${{ github.event.release.tag_name }} - multiqc/multiqc:latest - ghcr.io/multiqc/multiqc:${{ github.event.release.tag_name }} - ghcr.io/multiqc/multiqc:latest + multiqc/multiqc:${{ matrix.v.lab }}${{ github.event.release.tag_name }} + multiqc/multiqc:${{ matrix.v.lab }}latest + ghcr.io/multiqc/multiqc:${{ matrix.v.lab }}${{ github.event.release.tag_name }} + ghcr.io/multiqc/multiqc:${{ matrix.v.lab }}latest diff --git a/Dockerfile b/Dockerfile index 561ac6cfd9..1388fa5288 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,9 +1,12 @@ -FROM python:3.12-slim +FROM python:3.13-slim LABEL author="Phil Ewels & Vlad Savelyev" \ description="MultiQC" \ maintainer="phil.ewels@seqera.io" +# Optional pandoc installation for PDF support +ARG INSTALL_PANDOC=false + RUN mkdir /usr/src/multiqc # Add the MultiQC source files to the container @@ -26,8 +29,12 @@ RUN \ echo "Docker build log: Run apt-get 
update" 1>&2 && \ apt-get update -y -qq \ && \ - echo "Docker build log: Install procps and pandoc" 1>&2 && \ - apt-get install -y -qq procps pandoc && \ + echo "Docker build log: Install procps" 1>&2 && \ + apt-get install -y -qq procps && \ + if [ "$INSTALL_PANDOC" = "true" ]; then \ + echo "Docker build log: Install pandoc and LaTeX for PDF generation" 1>&2 && \ + apt-get install -y -qq pandoc texlive-latex-base texlive-fonts-recommended texlive-latex-extra texlive-luatex; \ + fi && \ echo "Docker build log: Clean apt cache" 1>&2 && \ rm -rf /var/lib/apt/lists/* && \ apt-get clean -y && \ @@ -37,7 +44,7 @@ RUN \ # Install MultiQC pip install --verbose --no-cache-dir /usr/src/multiqc && \ echo "Docker build log: Delete python cache directories" 1>&2 && \ - find /usr/local/lib/python3.12 \( -iname '*.c' -o -iname '*.pxd' -o -iname '*.pyd' -o -iname '__pycache__' \) -printf "\"%p\" " | \ + find /usr/local/lib/python3.13 \( -iname '*.c' -o -iname '*.pxd' -o -iname '*.pyd' -o -iname '__pycache__' \) -printf "\"%p\" " | \ xargs rm -rf {} && \ echo "Docker build log: Delete /usr/src/multiqc" 1>&2 && \ rm -rf "/usr/src/multiqc/" && \ @@ -53,7 +60,7 @@ WORKDIR /home/multiqc # Check everything is working smoothly RUN echo "Docker build log: Testing multiqc" 1>&2 && \ - multiqc --help + multiqc --help # Display the command line help if the container is run without any parameters -CMD multiqc --help \ No newline at end of file +CMD multiqc --help diff --git a/docs/markdown/getting_started/running_multiqc.md b/docs/markdown/getting_started/running_multiqc.md index 1d42a62c05..bdc8c74b81 100644 --- a/docs/markdown/getting_started/running_multiqc.md +++ b/docs/markdown/getting_started/running_multiqc.md @@ -225,16 +225,16 @@ Error creating PDF - pandoc not found. Is it installed? http://pandoc.org/ ``` Please note that Pandoc is a complex tool and has a number of its own dependencies -for PDF generation. Notably, it uses LaTeX / XeLaTeX which you must also have installed. 
+for PDF generation. Notably, it uses LaTeX / LuaLaTeX which you must also have installed. Please make sure that you have the latest version of Pandoc and that it can successfully convert basic HTML files to PDF before reporting and errors. Error messages from Pandoc are piped through to the MultiQC log, -for example if the xelatex dependency is not installed you will see the following: +for example if the lualatex dependency is not installed you will see the following: ``` -xelatex not found. Please select a different --pdf-engine or install xelatex +lualatex not found. Please select a different --pdf-engine or install lualatex ``` Note that not all plots have flat image equivalents, so diff --git a/multiqc/core/write_results.py b/multiqc/core/write_results.py index 97e91ab967..9ab22239d3 100644 --- a/multiqc/core/write_results.py +++ b/multiqc/core/write_results.py @@ -608,13 +608,23 @@ def _write_pdf(report_path: Path) -> Optional[Path]: str(report_path), "--output", str(pdf_path), - "--pdf-engine=pdflatex", + "--pdf-engine=lualatex", "-V", "documentclass=article", "-V", "geometry=margin=1in", "-V", + "mainfont=DejaVu Sans", + "-V", + "sansfont=DejaVu Sans", + "-V", + "monofont=DejaVu Sans Mono", + "-V", + "fontsize=10pt", + "-V", "title=", + "-V", + "tables=true", ] if config.pandoc_template is not None: pandoc_call.append(f"--template={config.pandoc_template}") diff --git a/multiqc/multiqc.py b/multiqc/multiqc.py index 260e6ba3ab..81beee0e58 100644 --- a/multiqc/multiqc.py +++ b/multiqc/multiqc.py @@ -691,16 +691,16 @@ def run( def _check_pdf_export_possible(): if subprocess.call(["which", "pandoc"]) != 0: logger.error( - "`pandoc` and `pdflatex` tools are required to create a PDF report. Please install those and try " + "`pandoc` and `lualatex` tools are required to create a PDF report. Please install those and try " "again. See http://pandoc.org/installing.html for the `pandoc` installation instructions " - "(e.g. 
`brew install pandoc` on macOS), and install LaTeX for `pdflatex` (e.g. `brew install basictex`" + "(e.g. `brew install pandoc` on macOS), and install LaTeX for `lualatex` (e.g. `brew install basictex`" "on macOS). Alternatively, omit the `--pdf` option or unset `make_pdf: true` in the MultiQC config." ) return RunResult(message="Pandoc is required to create PDF reports", sys_exit_code=1) - if subprocess.call(["which", "pdflatex"]) != 0: + if subprocess.call(["which", "lualatex"]) != 0: logger.error( - "The `pdflatex` tool is required to create a PDF report. Please install LaTeX and try again, " + "The `lualatex` tool is required to create a PDF report. Please install LaTeX and try again, " "e.g. `brew install basictex` on macOS. Alternatively, omit the `--pdf` option" "or unset `make_pdf: true` in the MultiQC config." ) From a1da4427198f1010576614c8f94f613570c4df65 Mon Sep 17 00:00:00 2001 From: Phil Ewels Date: Wed, 17 Sep 2025 21:00:09 +0200 Subject: [PATCH 29/35] Docs: strip

tags from info line for modules table --- docs/markdown/modules.mdx | 407 +++++++++++++------------------- docs/markdown/modules/xenium.md | 8 +- scripts/make_module_docs.py | 2 +- 3 files changed, 174 insertions(+), 243 deletions(-) diff --git a/docs/markdown/modules.mdx b/docs/markdown/modules.mdx index f048054a55..32a5721205 100644 --- a/docs/markdown/modules.mdx +++ b/docs/markdown/modules.mdx @@ -27,243 +27,227 @@ import MultiqcModules from "@site/src/components/MultiqcModules"; data: { name: "Adapter Removal", summary: - "

Removes adapter sequences, trims low quality bases from 3' ends, or merges overlapping pairs into consensus.

", + "Removes adapter sequences, trims low quality bases from 3' ends, or merges overlapping pairs into consensus.", }, }, { id: "modules/afterqc", data: { name: "AfterQC", - summary: "

Automatic filtering, trimming, error removing, and quality control for FastQ data.

", + summary: "Automatic filtering, trimming, error removing, and quality control for FastQ data.", }, }, { id: "modules/anglerfish", data: { name: "Anglerfish", - summary: "

Quality controls Illumina libraries sequenced on Oxford Nanopore flowcells.

", + summary: "Quality controls Illumina libraries sequenced on Oxford Nanopore flowcells.", }, }, { id: "modules/ataqv", - data: { name: "ATAQV", summary: "

Toolkit for quality control and visualization of ATAC-seq data.

" }, + data: { name: "ATAQV", summary: "Toolkit for quality control and visualization of ATAC-seq data." }, }, { id: "modules/bakta", data: { name: "Bakta", - summary: "

Rapid & standardized annotation of bacterial genomes, MAGs & plasmids.

", + summary: "Rapid & standardized annotation of bacterial genomes, MAGs & plasmids.", }, }, { id: "modules/bamdst", data: { name: "Bamdst", - summary: "

Lightweight tool to stat the depth coverage of target regions of BAM file(s).

", + summary: "Lightweight tool to stat the depth coverage of target regions of BAM file(s).", }, }, { id: "modules/bamtools", data: { name: "Bamtools", - summary: "

Provides both a programmer's API and an end-user's toolkit for handling BAM files.

", + summary: "Provides both a programmer's API and an end-user's toolkit for handling BAM files.", }, }, { id: "modules/bases2fastq", - data: { - name: "Bases2Fastq", - summary: "

Demultiplexes and converts Element AVITI base calls into FASTQ files.

", - }, + data: { name: "Bases2Fastq", summary: "Demultiplexes and converts Element AVITI base calls into FASTQ files." }, }, { id: "modules/bbduk", data: { name: "BBDuk", - summary: - "

Common data-quality-related trimming, filtering, and masking operations with a kmer based approach.

", + summary: "Common data-quality-related trimming, filtering, and masking operations with a kmer based approach.", }, }, { id: "modules/bbmap", data: { name: "BBTools", - summary: "

Pre-processing, assembly, alignment, and statistics tools for DNA/RNA sequencing reads.

", + summary: "Pre-processing, assembly, alignment, and statistics tools for DNA/RNA sequencing reads.", }, }, { id: "modules/bcftools", - data: { name: "Bcftools", summary: "

Utilities for variant calling and manipulating VCFs and BCFs.

" }, + data: { name: "Bcftools", summary: "Utilities for variant calling and manipulating VCFs and BCFs." }, }, { id: "modules/bcl2fastq", data: { name: "bcl2fastq", - summary: "

Demultiplexes data and converts BCL files to FASTQ file formats for downstream analysis.

", + summary: "Demultiplexes data and converts BCL files to FASTQ file formats for downstream analysis.", }, }, { id: "modules/bclconvert", data: { name: "BCL Convert", - summary: "

Demultiplexes data and converts BCL files to FASTQ file formats for downstream analysis.

", + summary: "Demultiplexes data and converts BCL files to FASTQ file formats for downstream analysis.", }, }, { id: "modules/biobambam2", - data: { name: "biobambam2", summary: "

Tools for early stage alignment file processing.

" }, + data: { name: "biobambam2", summary: "Tools for early stage alignment file processing." }, }, { id: "modules/biobloomtools", data: { name: "BioBloom Tools", summary: - "

Assigns reads to different references using bloom filters. This is faster than alignment and can be used for contamination detection.

", + "Assigns reads to different references using bloom filters. This is faster than alignment and can be used for contamination detection.", }, }, { id: "modules/biscuit", data: { name: "BISCUIT", - summary: "

Maps bisulfite converted DNA sequence reads and determines cytosine methylation states.

", + summary: "Maps bisulfite converted DNA sequence reads and determines cytosine methylation states.", }, }, { id: "modules/bismark", data: { name: "Bismark", - summary: "

Maps bisulfite converted sequence reads and determine cytosine methylation states.

", + summary: "Maps bisulfite converted sequence reads and determine cytosine methylation states.", }, }, - { - id: "modules/bowtie1", - data: { name: "Bowtie 1", summary: "

Ultrafast, memory-efficient short read aligner.

" }, - }, + { id: "modules/bowtie1", data: { name: "Bowtie 1", summary: "Ultrafast, memory-efficient short read aligner." } }, { id: "modules/bowtie2", data: { name: "Bowtie 2 / HiSAT2", - summary: "

Results from both Bowtie 2 and HISAT2, tools for aligning reads against a reference genome.

", + summary: "Results from both Bowtie 2 and HISAT2, tools for aligning reads against a reference genome.", }, }, - { - id: "modules/busco", - data: { name: "BUSCO", summary: "

Assesses genome assembly and annotation completeness.

" }, - }, + { id: "modules/busco", data: { name: "BUSCO", summary: "Assesses genome assembly and annotation completeness." } }, { id: "modules/bustools", data: { name: "Bustools", summary: - "

Tools for BUS files - a file format for single-cell RNA-seq data designed to facilitate the development of modular workflows for data processing.

", + "Tools for BUS files - a file format for single-cell RNA-seq data designed to facilitate the development of modular workflows for data processing.", }, }, { id: "modules/ccs", data: { name: "CCS", - summary: "

PacBio tool that generates highly accurate single-molecule consensus reads (HiFi Reads).

", + summary: "PacBio tool that generates highly accurate single-molecule consensus reads (HiFi Reads).", }, }, { id: "modules/cellranger", - data: { - name: "Cell Ranger", - summary: "

Analyzes single cell expression or VDJ data produced by 10X Genomics.

", - }, + data: { name: "Cell Ranger", summary: "Analyzes single cell expression or VDJ data produced by 10X Genomics." }, }, { id: "modules/cellranger_arc", data: { name: "Cell Ranger ARC", - summary: "

Analyzes single-cell multiome ATAC and gene expression data produced by 10X Genomics.

", + summary: "Analyzes single-cell multiome ATAC and gene expression data produced by 10X Genomics.", }, }, { id: "modules/cells2stats", data: { name: "cells2stats", - summary: "

Generate output files and statistics from Element Biosciences Teton cytoprofiling assays.

", + summary: "Generate output files and statistics from Element Biosciences Teton cytoprofiling assays.", }, }, { id: "modules/checkm", data: { name: "CheckM", - summary: - "

Estimates genome completeness and contamination based on the presence or absence of marker genes.

", + summary: "Estimates genome completeness and contamination based on the presence or absence of marker genes.", }, }, { id: "modules/checkm2", - data: { name: "CheckM2", summary: "

Assesses microbial genome quality using machine learning.

" }, + data: { name: "CheckM2", summary: "Assesses microbial genome quality using machine learning." }, }, { id: "modules/checkqc", - data: { name: "CheckQC", summary: "

Checks a set of quality criteria against an Illumina runfolder.

" }, + data: { name: "CheckQC", summary: "Checks a set of quality criteria against an Illumina runfolder." }, }, { id: "modules/clipandmerge", - data: { name: "ClipAndMerge", summary: "

Adapter clipping and read merging for ancient DNA data.

" }, + data: { name: "ClipAndMerge", summary: "Adapter clipping and read merging for ancient DNA data." }, }, { id: "modules/clusterflow", - data: { name: "Cluster Flow", summary: "

Simple and flexible bioinformatics pipeline tool.

" }, + data: { name: "Cluster Flow", summary: "Simple and flexible bioinformatics pipeline tool." }, }, { id: "modules/conpair", - data: { name: "Conpair", summary: "

Estimates concordance and contamination for tumor\u2013normal pairs.

" }, + data: { name: "Conpair", summary: "Estimates concordance and contamination for tumor\u2013normal pairs." }, }, { id: "modules/cutadapt", data: { name: "Cutadapt", - summary: - "

Finds and removes adapter sequences, primers, poly-A tails, and other types of unwanted sequences.

", + summary: "Finds and removes adapter sequences, primers, poly-A tails, and other types of unwanted sequences.", }, }, { id: "modules/damageprofiler", - data: { name: "DamageProfiler", summary: "

DNA damage pattern retrieval for ancient DNA analysis.

" }, + data: { name: "DamageProfiler", summary: "DNA damage pattern retrieval for ancient DNA analysis." }, }, { id: "modules/dedup", data: { name: "DeDup", - summary: "

Improved Duplicate Removal for merged/collapsed reads in ancient DNA analysis.

", + summary: "Improved Duplicate Removal for merged/collapsed reads in ancient DNA analysis.", }, }, { id: "modules/deeptools", - data: { name: "deepTools", summary: "

Tools to process and analyze deep sequencing data.

" }, + data: { name: "deepTools", summary: "Tools to process and analyze deep sequencing data." }, }, { id: "modules/diamond", data: { name: "DIAMOND", - summary: - "

Sequence aligner for protein and translated DNA searches, a drop-in replacement for the NCBI BLAST.

", + summary: "Sequence aligner for protein and translated DNA searches, a drop-in replacement for the NCBI BLAST.", }, }, { id: "modules/disambiguate", data: { name: "Disambiguate", - summary: "

Disambiguate reads aligned to two different species (e.g. human and mouse).

", + summary: "Disambiguate reads aligned to two different species (e.g. human and mouse).", }, }, { id: "modules/dragen", data: { name: "DRAGEN", - summary: "

Illumina Bio-IT Platform that uses FPGA for secondary analysis of sequencing data.

", + summary: "Illumina Bio-IT Platform that uses FPGA for secondary analysis of sequencing data.", }, }, { id: "modules/dragen_fastqc", data: { name: "DRAGEN-FastQC", - summary: "

Illumina Bio-IT Platform that uses FPGA for secondary analysis of sequencing data.

", + summary: "Illumina Bio-IT Platform that uses FPGA for secondary analysis of sequencing data.", }, }, { @@ -271,14 +255,14 @@ import MultiqcModules from "@site/src/components/MultiqcModules"; data: { name: "eigenstratdatabasetools", summary: - "

Tools to compare and manipulate the contents of EingenStrat databases, and to calculate SNP coverage statistics in such databases.

", + "Tools to compare and manipulate the contents of EingenStrat databases, and to calculate SNP coverage statistics in such databases.", }, }, { id: "modules/fastp", data: { name: "fastp", - summary: "

All-in-one FASTQ preprocessor (QC, adapters, trimming, filtering, splitting...).

", + summary: "All-in-one FASTQ preprocessor (QC, adapters, trimming, filtering, splitting...).", }, }, { @@ -286,75 +270,68 @@ import MultiqcModules from "@site/src/components/MultiqcModules"; data: { name: "FastQ Screen", summary: - "

Screens a library of sequences in FastQ format against a set of sequence databases to see if the composition of the library matches with what you expect.

", + "Screens a library of sequences in FastQ format against a set of sequence databases to see if the composition of the library matches with what you expect.", }, }, { id: "modules/fastqc", - data: { name: "FastQC", summary: "

Quality control tool for high throughput sequencing data.

" }, + data: { name: "FastQC", summary: "Quality control tool for high throughput sequencing data." }, }, { id: "modules/featurecounts", data: { name: "featureCounts", summary: - "

Counts mapped reads for genomic features such as genes, exons, promoter, gene bodies, genomic bins and chromosomal locations.

", + "Counts mapped reads for genomic features such as genes, exons, promoter, gene bodies, genomic bins and chromosomal locations.", }, }, - { id: "modules/fgbio", data: { name: "fgbio", summary: "

Processing and evaluating data containing UMIs.

" } }, - { id: "modules/filtlong", data: { name: "Filtlong", summary: "

Filters long reads by quality.

" } }, + { id: "modules/fgbio", data: { name: "fgbio", summary: "Processing and evaluating data containing UMIs." } }, + { id: "modules/filtlong", data: { name: "Filtlong", summary: "Filters long reads by quality." } }, { id: "modules/flash", - data: { name: "FLASh", summary: "

Merges paired-end reads from next-generation sequencing experiments.

" }, + data: { name: "FLASh", summary: "Merges paired-end reads from next-generation sequencing experiments." }, }, - { id: "modules/flexbar", data: { name: "Flexbar", summary: "

Barcode and adapter removal tool.

" } }, + { id: "modules/flexbar", data: { name: "Flexbar", summary: "Barcode and adapter removal tool." } }, { id: "modules/freyja", - data: { name: "Freyja", summary: "

Recovers relative lineage abundances from mixed SARS-CoV-2 samples.

" }, + data: { name: "Freyja", summary: "Recovers relative lineage abundances from mixed SARS-CoV-2 samples." }, }, { id: "modules/ganon", data: { name: "Ganon", summary: - "

Metagenomics classification: quickly assigns sequence fragments to their closest reference among thousands of references via Interleaved Bloom Filters of k-mer/minimizers.

", + "Metagenomics classification: quickly assigns sequence fragments to their closest reference among thousands of references via Interleaved Bloom Filters of k-mer/minimizers.", }, }, { id: "modules/gatk", data: { name: "GATK", - summary: "

Wide variety of tools with a primary focus on variant discovery and genotyping.

", + summary: "Wide variety of tools with a primary focus on variant discovery and genotyping.", }, }, { id: "modules/gffcompare", data: { name: "GffCompare", - summary: - "

Tool to compare, merge and annotate one or more GFF files with a reference annotation in GFF format.

", + summary: "Tool to compare, merge and annotate one or more GFF files with a reference annotation in GFF format.", }, }, - { - id: "modules/glimpse", - data: { name: "GLIMPSE", summary: "

Low-coverage whole genome sequencing imputation.

" }, - }, + { id: "modules/glimpse", data: { name: "GLIMPSE", summary: "Low-coverage whole genome sequencing imputation." } }, { id: "modules/goleft_indexcov", data: { name: "goleft indexcov", - summary: "

Quickly estimate coverage from a whole-genome bam index, providing 16KB resolution.

", + summary: "Quickly estimate coverage from a whole-genome bam index, providing 16KB resolution.", }, }, - { - id: "modules/gopeaks", - data: { name: "GoPeaks", summary: "

Calls peaks in CUT&TAG/CUT&RUN datasets.

" }, - }, + { id: "modules/gopeaks", data: { name: "GoPeaks", summary: "Calls peaks in CUT&TAG/CUT&RUN datasets." } }, { id: "modules/gtdbtk", data: { name: "GTDB-Tk", - summary: "

Assigns objective taxonomic classifications to bacterial and archaeal genomes.

", + summary: "Assigns objective taxonomic classifications to bacterial and archaeal genomes.", }, }, { @@ -362,257 +339,240 @@ import MultiqcModules from "@site/src/components/MultiqcModules"; data: { name: "Haplocheck", summary: - "

Detects in-sample contamination in mtDNA or WGS sequencing studies by analyzing the mitchondrial content.

", + "Detects in-sample contamination in mtDNA or WGS sequencing studies by analyzing the mitchondrial content.", }, }, { id: "modules/happy", - data: { name: "hap.py", summary: "

Benchmarks variant calls against gold standard truth datasets.

" }, + data: { name: "hap.py", summary: "Benchmarks variant calls against gold standard truth datasets." }, }, { id: "modules/hicexplorer", - data: { name: "HiCExplorer", summary: "

Hi-C analysis from processing to visualization.

" }, + data: { name: "HiCExplorer", summary: "Hi-C analysis from processing to visualization." }, }, - { id: "modules/hicpro", data: { name: "HiC-Pro", summary: "

Pipeline for Hi-C data processing.

" } }, - { id: "modules/hicup", data: { name: "HiCUP", summary: "

Mapping and quality control on Hi-C data.

" } }, + { id: "modules/hicpro", data: { name: "HiC-Pro", summary: "Pipeline for Hi-C data processing." } }, + { id: "modules/hicup", data: { name: "HiCUP", summary: "Mapping and quality control on Hi-C data." } }, { id: "modules/hifiasm", - data: { name: "HiFiasm", summary: "

Haplotype-resolved assembler for accurate Hifi reads.

" }, + data: { name: "HiFiasm", summary: "Haplotype-resolved assembler for accurate Hifi reads." }, }, { id: "modules/hisat2", - data: { name: "HISAT2", summary: "

Maps DNA or RNA reads against a genome or a population of genomes.

" }, - }, - { - id: "modules/homer", - data: { name: "HOMER", summary: "

Motif discovery and next-gen sequencing analysis.

" }, + data: { name: "HISAT2", summary: "Maps DNA or RNA reads against a genome or a population of genomes." }, }, + { id: "modules/homer", data: { name: "HOMER", summary: "Motif discovery and next-gen sequencing analysis." } }, { id: "modules/hops", data: { name: "HOPS", - summary: "

Ancient DNA characteristics screening tool of output from the metagenomic aligner MALT.

", + summary: "Ancient DNA characteristics screening tool of output from the metagenomic aligner MALT.", }, }, { id: "modules/hostile", data: { name: "Hostile", - summary: - "

Removes host sequences from short and long read (meta)genomes, from paired or unpaired fastq[.gz].

", + summary: "Removes host sequences from short and long read (meta)genomes, from paired or unpaired fastq[.gz].", }, }, { id: "modules/htseq", data: { name: "HTSeq Count", - summary: "

Part of the HTSeq package: counts reads covering specified genomic features.

", + summary: "Part of the HTSeq package: counts reads covering specified genomic features.", }, }, { id: "modules/humid", data: { name: "HUMID", - summary: "

Reference-free tool to quickly remove duplicates from FastQ files, with or without UMIs.

", + summary: "Reference-free tool to quickly remove duplicates from FastQ files, with or without UMIs.", }, }, { id: "modules/interop", - data: { name: "Illumina InterOp Statistics", summary: "

Reading and writing InterOp metric files.

" }, + data: { name: "Illumina InterOp Statistics", summary: "Reading and writing InterOp metric files." }, }, { id: "modules/isoseq", data: { name: "Iso-Seq", - summary: "

Identifies transcripts in PacBio single-molecule sequencing data (HiFi reads).

", + summary: "Identifies transcripts in PacBio single-molecule sequencing data (HiFi reads).", }, }, - { id: "modules/ivar", data: { name: "iVar", summary: "

Functions for viral amplicon-based sequencing.

" } }, + { id: "modules/ivar", data: { name: "iVar", summary: "Functions for viral amplicon-based sequencing." } }, { id: "modules/jcvi", - data: { name: "JCVI Genome Annotation", summary: "

Computes statistics on genome annotation.

" }, + data: { name: "JCVI Genome Annotation", summary: "Computes statistics on genome annotation." }, }, - { id: "modules/jellyfish", data: { name: "Jellyfish", summary: "

Counting k-mers in DNA.

" } }, - { id: "modules/kaiju", data: { name: "Kaiju", summary: "

Taxonomic classification for metagenomics.

" } }, + { id: "modules/jellyfish", data: { name: "Jellyfish", summary: "Counting k-mers in DNA." } }, + { id: "modules/kaiju", data: { name: "Kaiju", summary: "Taxonomic classification for metagenomics." } }, { id: "modules/kallisto", data: { name: "Kallisto", - summary: - "

Quantifies abundances of transcripts (or more generally, of target sequences) from RNA-Seq data.

", + summary: "Quantifies abundances of transcripts (or more generally, of target sequences) from RNA-Seq data.", }, }, { id: "modules/kat", - data: { name: "K-mer Analysis Toolkit", summary: "

Analyses sequencing data via its k-mer spectra.

" }, + data: { name: "K-mer Analysis Toolkit", summary: "Analyses sequencing data via its k-mer spectra." }, }, { id: "modules/kraken", data: { name: "Kraken", summary: - "

Taxonomic classification using exact k-mer matches to find the lowest common ancestor (LCA) of a given sequence.

", + "Taxonomic classification using exact k-mer matches to find the lowest common ancestor (LCA) of a given sequence.", }, }, - { id: "modules/leehom", data: { name: "leeHom", summary: "

Bayesian reconstruction of ancient DNA.

" } }, + { id: "modules/leehom", data: { name: "leeHom", summary: "Bayesian reconstruction of ancient DNA." } }, { id: "modules/librarian", data: { name: "Librarian", - summary: "

Predicts the sequencing library type from the base composition of a FastQ file.

", + summary: "Predicts the sequencing library type from the base composition of a FastQ file.", }, }, - { - id: "modules/lima", - data: { name: "Lima", summary: "

Demultiplex PacBio single-molecule sequencing reads.

" }, - }, + { id: "modules/lima", data: { name: "Lima", summary: "Demultiplex PacBio single-molecule sequencing reads." } }, { id: "modules/longranger", data: { name: "Long Ranger", summary: - "

Sample demultiplexing, barcode processing, alignment, quality control, variant calling, phasing, and structural variant calling.

", + "Sample demultiplexing, barcode processing, alignment, quality control, variant calling, phasing, and structural variant calling.", }, }, { id: "modules/macs2", - data: { name: "MACS2", summary: "

Identifies transcription factor binding sites in ChIP-seq data.

" }, + data: { name: "MACS2", summary: "Identifies transcription factor binding sites in ChIP-seq data." }, }, { id: "modules/malt", data: { name: "MALT", summary: - "

Aligns of metagenomic reads to a database of reference sequences (such as NR, GenBank or Silva) and outputs a MEGAN RMA file.

", + "Aligns of metagenomic reads to a database of reference sequences (such as NR, GenBank or Silva) and outputs a MEGAN RMA file.", }, }, { id: "modules/mapdamage", - data: { name: "mapDamage", summary: "

Tracks and quantifies damage patterns in ancient DNA sequences.

" }, + data: { name: "mapDamage", summary: "Tracks and quantifies damage patterns in ancient DNA sequences." }, }, - { id: "modules/megahit", data: { name: "MEGAHIT", summary: "

NGS read assembler.

" } }, + { id: "modules/megahit", data: { name: "MEGAHIT", summary: "NGS read assembler." } }, { id: "modules/metaphlan", data: { name: "MetaPhlAn", - summary: "

Profiles the composition of microbial communities from metagenomic shotgun sequencing data.

", + summary: "Profiles the composition of microbial communities from metagenomic shotgun sequencing data.", }, }, { id: "modules/methylqa", - data: { name: "methylQA", summary: "

Methylation sequencing data quality assessment tool.

" }, + data: { name: "methylQA", summary: "Methylation sequencing data quality assessment tool." }, }, { id: "modules/mgikit", - data: { name: "mgikit", summary: "

Demultiplexes FASTQ files from an MGI sequencing instrument.

" }, + data: { name: "mgikit", summary: "Demultiplexes FASTQ files from an MGI sequencing instrument." }, }, { id: "modules/minionqc", - data: { name: "MinIONQC", summary: "

Quality control for ONT (Oxford Nanopore) long reads.

" }, + data: { name: "MinIONQC", summary: "Quality control for ONT (Oxford Nanopore) long reads." }, }, { id: "modules/mirtop", data: { name: "mirtop", - summary: "

Annotates miRNAs and isomiRs and compute general statistics in mirGFF3 format.

", + summary: "Annotates miRNAs and isomiRs and compute general statistics in mirGFF3 format.", }, }, - { - id: "modules/mirtrace", - data: { name: "miRTrace", summary: "

Quality control for small RNA sequencing data.

" }, - }, + { id: "modules/mirtrace", data: { name: "miRTrace", summary: "Quality control for small RNA sequencing data." } }, { id: "modules/mosaicatcher", data: { name: "MosaiCatcher", summary: - "

Counts strand-seq reads and classifies strand states of each chromosome in each cell using a Hidden Markov Model.

", + "Counts strand-seq reads and classifies strand states of each chromosome in each cell using a Hidden Markov Model.", }, }, { id: "modules/mosdepth", - data: { - name: "Mosdepth", - summary: "

Fast BAM/CRAM depth calculation for WGS, exome, or targeted sequencing.

", - }, + data: { name: "Mosdepth", summary: "Fast BAM/CRAM depth calculation for WGS, exome, or targeted sequencing." }, }, { id: "modules/motus", data: { name: "Motus", - summary: "

Microbial profiling through marker gene (MG)-based operational taxonomic units (mOTUs).

", + summary: "Microbial profiling through marker gene (MG)-based operational taxonomic units (mOTUs).", }, }, { id: "modules/mtnucratio", - data: { name: "mtnucratio", summary: "

Computes mitochondrial to nuclear genome ratios in NGS datasets.

" }, + data: { name: "mtnucratio", summary: "Computes mitochondrial to nuclear genome ratios in NGS datasets." }, }, { id: "modules/multivcfanalyzer", data: { name: "MultiVCFAnalyzer", summary: - "

Reads multiple VCF files into combined genotype calls, produces summary statistics and downstream formats.

", + "Reads multiple VCF files into combined genotype calls, produces summary statistics and downstream formats.", }, }, { id: "modules/nanoq", - data: { name: "nanoq", summary: "

Reports read quality and length from nanopore sequencing data.

" }, + data: { name: "nanoq", summary: "Reports read quality and length from nanopore sequencing data." }, }, { id: "modules/nanostat", data: { name: "NanoStat", summary: - "

Reports various statistics for long read dataset in FASTQ, BAM, or albacore sequencing summary format (supports NanoPack; NanoPlot, NanoComp).

", + "Reports various statistics for long read dataset in FASTQ, BAM, or albacore sequencing summary format (supports NanoPack; NanoPlot, NanoComp).", }, }, { id: "modules/nextclade", data: { name: "Nextclade", - summary: "

Viral genome alignment, clade assignment, mutation calling, and quality checks.

", + summary: "Viral genome alignment, clade assignment, mutation calling, and quality checks.", }, }, - { - id: "modules/ngsbits", - data: { name: "ngs-bits", summary: "

Calculating statistics from FASTQ, BAM, and VCF.

" }, - }, + { id: "modules/ngsbits", data: { name: "ngs-bits", summary: "Calculating statistics from FASTQ, BAM, and VCF." } }, { id: "modules/ngsderive", data: { name: "ngsderive", - summary: "

Forensic tool for by backwards computing library information in sequencing data.

", + summary: "Forensic tool for by backwards computing library information in sequencing data.", }, }, { id: "modules/nonpareil", - data: { name: "Nonpareil", summary: "

Estimates metagenomic coverage and sequence diversity.

" }, + data: { name: "Nonpareil", summary: "Estimates metagenomic coverage and sequence diversity." }, }, { id: "modules/odgi", data: { name: "ODGI", - summary: "

Analysis and manipulation of pangenome graphs structured in the variation graph model.

", + summary: "Analysis and manipulation of pangenome graphs structured in the variation graph model.", }, }, { id: "modules/optitype", - data: { name: "OptiType", summary: "

Precision HLA typing from next-generation sequencing data.

" }, + data: { name: "OptiType", summary: "Precision HLA typing from next-generation sequencing data." }, }, { id: "modules/pairtools", data: { name: "pairtools", summary: - "

Toolkit for Chromatin Conformation Capture experiments. Handles short-reads paired reference alignments, extracts 3C-specific information, and perform common tasks such as sorting, filtering, and deduplication.

", + "Toolkit for Chromatin Conformation Capture experiments. Handles short-reads paired reference alignments, extracts 3C-specific information, and perform common tasks such as sorting, filtering, and deduplication.", }, }, { id: "modules/pangolin", data: { name: "Pangolin", - summary: "

Uses variant calls to assign SARS-CoV-2 genome sequences to global lineages.

", + summary: "Uses variant calls to assign SARS-CoV-2 genome sequences to global lineages.", }, }, { @@ -620,22 +580,21 @@ import MultiqcModules from "@site/src/components/MultiqcModules"; data: { name: "pbmarkdup", summary: - "

Takes one or multiple sequencing chips of an amplified libray as HiFi reads and marks or removes duplicates.

", + "Takes one or multiple sequencing chips of an amplified libray as HiFi reads and marks or removes duplicates.", }, }, { id: "modules/peddy", data: { name: "Peddy", - summary: - "

Compares familial-relationships and sexes as reported in a PED file with those inferred from a VCF.

", + summary: "Compares familial-relationships and sexes as reported in a PED file with those inferred from a VCF.", }, }, { id: "modules/percolator", data: { name: "Percolator", - summary: "

Semi-supervised learning for peptide identification from shotgun proteomics datasets.

", + summary: "Semi-supervised learning for peptide identification from shotgun proteomics datasets.", }, }, { @@ -643,23 +602,23 @@ import MultiqcModules from "@site/src/components/MultiqcModules"; data: { name: "phantompeakqualtools", summary: - "

Computes informative enrichment and quality measures for ChIP-seq/DNase-seq/FAIRE-seq/MNase-seq data.

", + "Computes informative enrichment and quality measures for ChIP-seq/DNase-seq/FAIRE-seq/MNase-seq data.", }, }, { id: "modules/picard", - data: { name: "Picard", summary: "

Tools for manipulating high-throughput sequencing data.

" }, + data: { name: "Picard", summary: "Tools for manipulating high-throughput sequencing data." }, }, { id: "modules/porechop", - data: { name: "Porechop", summary: "

Finds and removes adapters from Oxford Nanopore reads.

" }, + data: { name: "Porechop", summary: "Finds and removes adapters from Oxford Nanopore reads." }, }, { id: "modules/preseq", data: { name: "Preseq", summary: - "

Estimates library complexity, showing how many additional unique reads are sequenced for increasing total read count.

", + "Estimates library complexity, showing how many additional unique reads are sequenced for increasing total read count.", }, }, { @@ -667,23 +626,19 @@ import MultiqcModules from "@site/src/components/MultiqcModules"; data: { name: "PRINSEQ++", summary: - "

C++ implementation of the prinseq-lite.pl program. Filters, reformats, and trims genomic and metagenomic reads.

", + "C++ implementation of the prinseq-lite.pl program. Filters, reformats, and trims genomic and metagenomic reads.", }, }, - { id: "modules/prokka", data: { name: "Prokka", summary: "

Rapid annotation of prokaryotic genomes.

" } }, + { id: "modules/prokka", data: { name: "Prokka", summary: "Rapid annotation of prokaryotic genomes." } }, { id: "modules/purple", - data: { - name: "PURPLE", - summary: "

A purity, ploidy and copy number estimator for whole genome tumor data.

", - }, + data: { name: "PURPLE", summary: "A purity, ploidy and copy number estimator for whole genome tumor data." }, }, { id: "modules/pychopper", data: { name: "Pychopper", - summary: - "

Identifies, orients, trims and rescues full length Nanopore cDNA reads. Can also rescue fused reads.

", + summary: "Identifies, orients, trims and rescues full length Nanopore cDNA reads. Can also rescue fused reads.", }, }, { @@ -691,242 +646,212 @@ import MultiqcModules from "@site/src/components/MultiqcModules"; data: { name: "pycoQC", summary: - "

Computes metrics and generates interactive QC plots for Oxford Nanopore technologies sequencing data.

", + "Computes metrics and generates interactive QC plots for Oxford Nanopore technologies sequencing data.", }, }, { id: "modules/qc3C", - data: { name: "qc3C", summary: "

Reference-free and BAM based quality control for Hi-C data.

" }, + data: { name: "qc3C", summary: "Reference-free and BAM based quality control for Hi-C data." }, }, { id: "modules/qorts", - data: { name: "QoRTs", summary: "

Toolkit for analysis, QC, and data management of RNA-Seq datasets.

" }, + data: { name: "QoRTs", summary: "Toolkit for analysis, QC, and data management of RNA-Seq datasets." }, }, { id: "modules/qualimap", - data: { - name: "QualiMap", - summary: "

Quality control of alignment data and its derivatives like feature counts.

", - }, + data: { name: "QualiMap", summary: "Quality control of alignment data and its derivatives like feature counts." }, }, - { id: "modules/quast", data: { name: "QUAST", summary: "

Quality assessment tool for genome assemblies.

" } }, + { id: "modules/quast", data: { name: "QUAST", summary: "Quality assessment tool for genome assemblies." } }, { id: "modules/rna_seqc", - data: { name: "RNA-SeQC", summary: "

RNA-Seq metrics for quality control and process optimization.

" }, + data: { name: "RNA-SeQC", summary: "RNA-Seq metrics for quality control and process optimization." }, }, { id: "modules/rockhopper", data: { name: "Rockhopper", - summary: - "

Bacterial RNA-seq analysis: align reads to coding sequences, rRNAs, tRNAs, and miscellaneous RNAs.

", + summary: "Bacterial RNA-seq analysis: align reads to coding sequences, rRNAs, tRNAs, and miscellaneous RNAs.", }, }, { id: "modules/rsem", - data: { name: "RSEM", summary: "

Estimates gene and isoform expression levels from RNA-Seq data.

" }, + data: { name: "RSEM", summary: "Estimates gene and isoform expression levels from RNA-Seq data." }, }, - { id: "modules/rseqc", data: { name: "RSeQC", summary: "

Evaluates high throughput RNA-seq data.

" } }, + { id: "modules/rseqc", data: { name: "RSeQC", summary: "Evaluates high throughput RNA-seq data." } }, { id: "modules/salmon", - data: { name: "Salmon", summary: "

Quantifies expression of transcripts using RNA-seq data.

" }, - }, - { - id: "modules/sambamba", - data: { name: "Sambamba", summary: "

Toolkit for interacting with BAM/CRAM files.

" }, + data: { name: "Salmon", summary: "Quantifies expression of transcripts using RNA-seq data." }, }, + { id: "modules/sambamba", data: { name: "Sambamba", summary: "Toolkit for interacting with BAM/CRAM files." } }, { id: "modules/samblaster", - data: { - name: "Samblaster", - summary: "

Marks duplicates and extracts discordant and split reads from sam files.

", - }, - }, - { - id: "modules/samtools", - data: { name: "Samtools", summary: "

Toolkit for interacting with BAM/CRAM files.

" }, + data: { name: "Samblaster", summary: "Marks duplicates and extracts discordant and split reads from sam files." }, }, + { id: "modules/samtools", data: { name: "Samtools", summary: "Toolkit for interacting with BAM/CRAM files." } }, { id: "modules/sargasso", data: { name: "Sargasso", - summary: "

Separates mixed-species RNA-seq reads according to their species of origin.

", + summary: "Separates mixed-species RNA-seq reads according to their species of origin.", }, }, { id: "modules/seqera_cli", - data: { name: "Seqera Platform CLI", summary: "

Reports statistics generated by the Seqera Platform CLI.

" }, + data: { name: "Seqera Platform CLI", summary: "Reports statistics generated by the Seqera Platform CLI." }, }, - { id: "modules/seqfu", data: { name: "Seqfu", summary: "

Manipulate FASTA/FASTQ files.

" } }, + { id: "modules/seqfu", data: { name: "Seqfu", summary: "Manipulate FASTA/FASTQ files." } }, { id: "modules/sequali", - data: { name: "Sequali", summary: "

Sequencing quality control for both long-read and short-read data.

" }, + data: { name: "Sequali", summary: "Sequencing quality control for both long-read and short-read data." }, }, { id: "modules/seqwho", data: { name: "SeqWho", summary: - "

Determines FASTQ(A) sequencing file source protocol and the species of origin, to check that the composition of the library is expected.

", + "Determines FASTQ(A) sequencing file source protocol and the species of origin, to check that the composition of the library is expected.", }, }, { id: "modules/seqyclean", - data: { - name: "SeqyClean", - summary: "

Filters adapters, vectors, and contaminants while quality trimming.

", - }, + data: { name: "SeqyClean", summary: "Filters adapters, vectors, and contaminants while quality trimming." }, }, { id: "modules/sexdeterrmine", data: { name: "SexDetErrmine", summary: - "

Calculates relative coverage of X and Y chromosomes and their associated error bars from the depth of coverage at specified SNPs.

", + "Calculates relative coverage of X and Y chromosomes and their associated error bars from the depth of coverage at specified SNPs.", }, }, { id: "modules/sickle", - data: { name: "Sickle", summary: "

A windowed adaptive trimming tool for FASTQ files using quality.

" }, + data: { name: "Sickle", summary: "A windowed adaptive trimming tool for FASTQ files using quality." }, }, - { - id: "modules/skewer", - data: { name: "Skewer", summary: "

Adapter trimming tool for NGS paired-end sequences.

" }, - }, - { id: "modules/slamdunk", data: { name: "Slamdunk", summary: "

Tool to analyze SLAM-Seq data.

" } }, + { id: "modules/skewer", data: { name: "Skewer", summary: "Adapter trimming tool for NGS paired-end sequences." } }, + { id: "modules/slamdunk", data: { name: "Slamdunk", summary: "Tool to analyze SLAM-Seq data." } }, { id: "modules/snippy", - data: { name: "Snippy", summary: "

Rapid haploid variant calling and core genome alignment.

" }, + data: { name: "Snippy", summary: "Rapid haploid variant calling and core genome alignment." }, }, { id: "modules/snpeff", data: { name: "SnpEff", - summary: "

Annotates and predicts the effects of variants on genes (such as amino acid changes).

", + summary: "Annotates and predicts the effects of variants on genes (such as amino acid changes).", }, }, { id: "modules/snpsplit", data: { name: "SNPsplit", - summary: - "

Allele-specific alignment sorter. Determines allelic origin of reads that cover known SNP positions.

", + summary: "Allele-specific alignment sorter. Determines allelic origin of reads that cover known SNP positions.", }, }, { id: "modules/somalier", data: { name: "Somalier", - summary: "

Genotype to pedigree correspondence checks from sketches derived from BAM/CRAM or VCF.

", + summary: "Genotype to pedigree correspondence checks from sketches derived from BAM/CRAM or VCF.", }, }, { id: "modules/sortmerna", data: { name: "SortMeRNA", - summary: - "

Program for filtering, mapping and OTU-picking NGS reads in metatranscriptomic and metagenomic data.

", + summary: "Program for filtering, mapping and OTU-picking NGS reads in metatranscriptomic and metagenomic data.", }, }, { id: "modules/sourmash", data: { name: "Sourmash", - summary: "

Quickly searches, compares, and analyzes genomic and metagenomic data sets.

", + summary: "Quickly searches, compares, and analyzes genomic and metagenomic data sets.", }, }, { id: "modules/spaceranger", - data: { name: "Space Ranger", summary: "

Tool to analyze 10x Genomics spatial transcriptomics data.

" }, + data: { name: "Space Ranger", summary: "Tool to analyze 10x Genomics spatial transcriptomics data." }, }, { id: "modules/stacks", - data: { name: "Stacks", summary: "

Analyzes restriction enzyme-based data (e.g. RAD-seq).

" }, + data: { name: "Stacks", summary: "Analyzes restriction enzyme-based data (e.g. RAD-seq)." }, }, - { id: "modules/star", data: { name: "STAR", summary: "

Universal RNA-seq aligner.

" } }, + { id: "modules/star", data: { name: "STAR", summary: "Universal RNA-seq aligner." } }, { id: "modules/supernova", - data: { name: "Supernova", summary: "

De novo genome assembler of 10X Genomics linked-reads.

" }, + data: { name: "Supernova", summary: "De novo genome assembler of 10X Genomics linked-reads." }, }, { id: "modules/telseq", - data: { name: "telseq", summary: "

Estimates telomere length from whole genome sequencing data (BAMs).

" }, + data: { name: "telseq", summary: "Estimates telomere length from whole genome sequencing data (BAMs)." }, }, { id: "modules/theta2", - data: { name: "THetA2", summary: "

Estimates tumour purity and clonal / subclonal copy number.

" }, + data: { name: "THetA2", summary: "Estimates tumour purity and clonal / subclonal copy number." }, }, { id: "modules/tophat", - data: { name: "Tophat", summary: "

Splice junction RNA-Seq reads mapper for mammalian-sized genomes.

" }, - }, - { - id: "modules/trimmomatic", - data: { name: "Trimmomatic", summary: "

Read trimming tool for Illumina NGS data.

" }, + data: { name: "Tophat", summary: "Splice junction RNA-Seq reads mapper for mammalian-sized genomes." }, }, + { id: "modules/trimmomatic", data: { name: "Trimmomatic", summary: "Read trimming tool for Illumina NGS data." } }, { id: "modules/truvari", - data: { name: "Truvari", summary: "

Benchmarking, merging, and annotating structural variants.

" }, + data: { name: "Truvari", summary: "Benchmarking, merging, and annotating structural variants." }, }, { id: "modules/umicollapse", data: { name: "UMICollapse", - summary: "

Algorithms for efficiently collapsing reads with Unique Molecular Identifiers.

", + summary: "Algorithms for efficiently collapsing reads with Unique Molecular Identifiers.", }, }, { id: "modules/umitools", data: { name: "UMI-tools", - summary: "

Tools for dealing with Unique Molecular Identifiers (UMIs)/(RMTs) and scRNA-Seq barcodes.

", + summary: "Tools for dealing with Unique Molecular Identifiers (UMIs)/(RMTs) and scRNA-Seq barcodes.", }, }, { id: "modules/varscan2", - data: { name: "VarScan2", summary: "

Variant detection in massively parallel sequencing data.

" }, - }, - { - id: "modules/vcftools", - data: { name: "VCFTools", summary: "

Program to analyse and reporting on VCF files.

" }, + data: { name: "VarScan2", summary: "Variant detection in massively parallel sequencing data." }, }, + { id: "modules/vcftools", data: { name: "VCFTools", summary: "Program to analyse and reporting on VCF files." } }, { id: "modules/vep", data: { name: "VEP", summary: - "

Determines the effect of variants on genes, transcripts and protein sequences, as well as regulatory regions.

", + "Determines the effect of variants on genes, transcripts and protein sequences, as well as regulatory regions.", }, }, { id: "modules/verifybamid", - data: { name: "VerifyBAMID", summary: "

Detects sample contamination and/or sample swaps.

" }, + data: { name: "VerifyBAMID", summary: "Detects sample contamination and/or sample swaps." }, }, { id: "modules/vg", - data: { - name: "VG", - summary: "

Toolkit to manipulate and analyze graphical genomes, including read alignment.

", - }, + data: { name: "VG", summary: "Toolkit to manipulate and analyze graphical genomes, including read alignment." }, }, { id: "modules/whatshap", data: { name: "WhatsHap", - summary: "

Phasing genomic variants using DNA reads (aka read-based phasing, or haplotype assembly).

", + summary: "Phasing genomic variants using DNA reads (aka read-based phasing, or haplotype assembly).", }, }, { id: "modules/xengsort", - data: { name: "Xengsort", summary: "

Fast xenograft read sorter based on space-efficient k-mer hashing.

" }, + data: { name: "Xengsort", summary: "Fast xenograft read sorter based on space-efficient k-mer hashing." }, }, { id: "modules/xenium", data: { name: "Xenium", - summary: "

Spatial transcriptomics platform from 10x Genomics that provides subcellular resolution.

", + summary: "Spatial transcriptomics platform from 10x Genomics that provides subcellular resolution.", }, }, - { id: "modules/xenome", data: { name: "Xenome", summary: "

Classifies reads from xenograft sources.

" } }, + { id: "modules/xenome", data: { name: "Xenome", summary: "Classifies reads from xenograft sources." } }, ]} /> diff --git a/docs/markdown/modules/xenium.md b/docs/markdown/modules/xenium.md index 37a87065a2..7a3684fcde 100644 --- a/docs/markdown/modules/xenium.md +++ b/docs/markdown/modules/xenium.md @@ -23,13 +23,19 @@ File path for the source of this content: multiqc/modules/xenium/xenium.py Xenium is a spatial transcriptomics platform from 10x Genomics that provides subcellular resolution. -NOTE: parsing huge files is not an intended MultiQC usage. By default, MultiQC will ignore the `*.parquet` files +:::note +Parsing huge files is not an intended MultiQC usage. By default, MultiQC will ignore the `*.parquet` files as they are gigabyte-sized. To enable parsing those, make sure to have this line in your config: ``` log_filesize_limit: 5000000000 # 5GB ``` +::: + +The MultiQC module is tested with outputs from xenium-3.x, older versions of xenium output are +not supported and may even cause MultiQC to crash (see https://github.com/MultiQC/MultiQC/issues/3344). + ### File search patterns ```yaml diff --git a/scripts/make_module_docs.py b/scripts/make_module_docs.py index d4fa23bdb5..db24febc47 100644 --- a/scripts/make_module_docs.py +++ b/scripts/make_module_docs.py @@ -53,7 +53,7 @@ def main(): "id": f"modules/{mod_id}", "data": { "name": f"{module.name}", - "summary": f"{module.info}", + "summary": f"{module.info.replace('

', '').replace('

', '')}", }, } ) From 2ce3b6d07748be1a3b9de3472e9d39c407a20321 Mon Sep 17 00:00:00 2001 From: Phil Ewels Date: Wed, 17 Sep 2025 21:06:11 +0200 Subject: [PATCH 30/35] Module docs: Convert HTML back to markdown, instead of just stripping

tags --- docs/markdown/modules.mdx | 7 ++----- scripts/make_module_docs.py | 6 ++---- 2 files changed, 4 insertions(+), 9 deletions(-) diff --git a/docs/markdown/modules.mdx b/docs/markdown/modules.mdx index 32a5721205..d04382e7d0 100644 --- a/docs/markdown/modules.mdx +++ b/docs/markdown/modules.mdx @@ -50,10 +50,7 @@ import MultiqcModules from "@site/src/components/MultiqcModules"; }, { id: "modules/bakta", - data: { - name: "Bakta", - summary: "Rapid & standardized annotation of bacterial genomes, MAGs & plasmids.", - }, + data: { name: "Bakta", summary: "Rapid & standardized annotation of bacterial genomes, MAGs & plasmids." }, }, { id: "modules/bamdst", @@ -326,7 +323,7 @@ import MultiqcModules from "@site/src/components/MultiqcModules"; summary: "Quickly estimate coverage from a whole-genome bam index, providing 16KB resolution.", }, }, - { id: "modules/gopeaks", data: { name: "GoPeaks", summary: "Calls peaks in CUT&TAG/CUT&RUN datasets." } }, + { id: "modules/gopeaks", data: { name: "GoPeaks", summary: "Calls peaks in CUT&TAG/CUT&RUN datasets." } }, { id: "modules/gtdbtk", data: { diff --git a/scripts/make_module_docs.py b/scripts/make_module_docs.py index db24febc47..46be89a91d 100644 --- a/scripts/make_module_docs.py +++ b/scripts/make_module_docs.py @@ -5,12 +5,10 @@ python scripts/make_docs.py """ -from datetime import datetime import json -import os from typing import Dict import yaml -import argparse +from markdownify import markdownify from pathlib import Path from textwrap import dedent, indent import subprocess @@ -53,7 +51,7 @@ def main(): "id": f"modules/{mod_id}", "data": { "name": f"{module.name}", - "summary": f"{module.info.replace('

', '').replace('

', '')}", + "summary": f"{markdownify(module.info)}", }, } ) From 31135b27093f559b0e82e02696a4e49c0ea66040 Mon Sep 17 00:00:00 2001 From: Phil Ewels Date: Wed, 17 Sep 2025 21:23:28 +0200 Subject: [PATCH 31/35] Update Claude Code GitHub Workflow (#3353) * "Update Claude PR Assistant workflow" * "Claude Code Review workflow" --- .github/workflows/claude-code-review.yml | 54 ++++++++++++++++++++++++ .github/workflows/claude.yml | 40 ++++++++++++------ 2 files changed, 80 insertions(+), 14 deletions(-) create mode 100644 .github/workflows/claude-code-review.yml diff --git a/.github/workflows/claude-code-review.yml b/.github/workflows/claude-code-review.yml new file mode 100644 index 0000000000..5e90d4b9ff --- /dev/null +++ b/.github/workflows/claude-code-review.yml @@ -0,0 +1,54 @@ +name: Claude Code Review + +on: + pull_request: + types: [opened, synchronize] + # Optional: Only run on specific file changes + # paths: + # - "src/**/*.ts" + # - "src/**/*.tsx" + # - "src/**/*.js" + # - "src/**/*.jsx" + +jobs: + claude-review: + # Optional: Filter by PR author + # if: | + # github.event.pull_request.user.login == 'external-contributor' || + # github.event.pull_request.user.login == 'new-developer' || + # github.event.pull_request.author_association == 'FIRST_TIME_CONTRIBUTOR' + + runs-on: ubuntu-latest + permissions: + contents: read + pull-requests: read + issues: read + id-token: write + + steps: + - name: Checkout repository + uses: actions/checkout@v4 + with: + fetch-depth: 1 + + - name: Run Claude Code Review + id: claude-review + uses: anthropics/claude-code-action@v1 + with: + anthropic_api_key: ${{ secrets.ANTHROPIC_API_KEY }} + prompt: | + Please review this pull request and provide feedback on: + - Code quality and best practices + - Potential bugs or issues + - Performance considerations + - Security concerns + - Test coverage + + Use the repository's CLAUDE.md for guidance on style and conventions. Be constructive and helpful in your feedback. 
+ + Use `gh pr comment` with your Bash tool to leave your review as a comment on the PR. + + # See https://github.com/anthropics/claude-code-action/blob/main/docs/usage.md + # or https://docs.anthropic.com/en/docs/claude-code/sdk#command-line for available options + claude_args: '--allowed-tools "Bash(gh issue view:*),Bash(gh search:*),Bash(gh issue list:*),Bash(gh pr comment:*),Bash(gh pr diff:*),Bash(gh pr view:*),Bash(gh pr list:*)"' + diff --git a/.github/workflows/claude.yml b/.github/workflows/claude.yml index 3722b0e2ab..4b2e6d2f5c 100644 --- a/.github/workflows/claude.yml +++ b/.github/workflows/claude.yml @@ -1,4 +1,4 @@ -name: Claude Code Review +name: Claude Code on: issue_comment: @@ -11,28 +11,40 @@ on: types: [submitted] jobs: - claude-code-review: + claude: + if: | + (github.event_name == 'issue_comment' && contains(github.event.comment.body, '@claude')) || + (github.event_name == 'pull_request_review_comment' && contains(github.event.comment.body, '@claude')) || + (github.event_name == 'pull_request_review' && contains(github.event.review.body, '@claude')) || + (github.event_name == 'issues' && (contains(github.event.issue.body, '@claude') || contains(github.event.issue.title, '@claude'))) runs-on: ubuntu-latest permissions: contents: read + pull-requests: read issues: read - pull-requests: write id-token: write - - if: | - (github.event_name == 'issue_comment' && contains(github.event.comment.body, '@claude')) || - (github.event_name == 'pull_request_review_comment' && contains(github.event.comment.body, '@claude')) || - (github.event_name == 'pull_request_review' && contains(github.event.review.body, '@claude')) || - (github.event_name == 'issues' && contains(github.event.issue.body, '@claude')) - + actions: read # Required for Claude to read CI results on PRs steps: - - name: Checkout code + - name: Checkout repository uses: actions/checkout@v4 with: fetch-depth: 1 - - name: Run Claude PR Action - uses: anthropics/claude-code-action@beta + - 
name: Run Claude Code + id: claude + uses: anthropics/claude-code-action@v1 with: anthropic_api_key: ${{ secrets.ANTHROPIC_API_KEY }} - timeout_minutes: "60" + + # This is an optional setting that allows Claude to read CI results on PRs + additional_permissions: | + actions: read + + # Optional: Give a custom prompt to Claude. If this is not specified, Claude will perform the instructions specified in the comment that tagged it. + # prompt: 'Update the pull request description to include a summary of changes.' + + # Optional: Add claude_args to customize behavior and configuration + # See https://github.com/anthropics/claude-code-action/blob/main/docs/usage.md + # or https://docs.anthropic.com/en/docs/claude-code/sdk#command-line for available options + # claude_args: '--model claude-opus-4-1-20250805 --allowed-tools Bash(gh pr:*)' + From 3b84187a2a0e4170e40b3202ae8e78e260622bc5 Mon Sep 17 00:00:00 2001 From: Josh Chorlton Date: Wed, 17 Sep 2025 19:26:08 +0000 Subject: [PATCH 32/35] Remove bedrock availability check when creating client (#3352) Co-authored-by: Josh Chorlton --- multiqc/core/ai.py | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/multiqc/core/ai.py b/multiqc/core/ai.py index 7b7255a516..428054355e 100644 --- a/multiqc/core/ai.py +++ b/multiqc/core/ai.py @@ -489,16 +489,6 @@ class AWSBedrockClient(Client): def __init__(self): super().__init__() - # Check Bedrock availability with detailed error reporting - is_available, error_msg = _check_bedrock_availability() - if not is_available: - if "boto3 not installed" in str(error_msg): - raise ImportError( - 'AI summary through AWS bedrock requires "boto3" to be installed. 
Install it with `pip install boto3`' - ) - else: - raise RuntimeError(f"AWS Bedrock is not available: {error_msg}") - self.model = config.ai_model self.name = "aws_bedrock" self.title = "AWS Bedrock" From 8f08115de728135e778f44bf37a3796c5b4e9d42 Mon Sep 17 00:00:00 2001 From: Phil Ewels Date: Wed, 17 Sep 2025 21:37:22 +0200 Subject: [PATCH 33/35] Docs: make module docs use relative links Automation update in script, see #3314 for upstream fixes. --- docs/markdown/modules/bcftools.md | 2 +- docs/markdown/modules/bclconvert.md | 2 +- docs/markdown/modules/biscuit.md | 2 +- docs/markdown/modules/cutadapt.md | 2 +- docs/markdown/modules/deeptools.md | 2 +- docs/markdown/modules/fastqc.md | 6 +++--- docs/markdown/modules/featurecounts.md | 2 +- docs/markdown/modules/lima.md | 2 +- docs/markdown/modules/preseq.md | 4 ++-- docs/markdown/modules/qualimap.md | 2 +- docs/markdown/modules/quast.md | 2 +- docs/markdown/modules/supernova.md | 2 +- docs/markdown/modules/vcftools.md | 2 +- scripts/make_module_docs.py | 4 ++++ 14 files changed, 20 insertions(+), 16 deletions(-) diff --git a/docs/markdown/modules/bcftools.md b/docs/markdown/modules/bcftools.md index e9cfc3585a..e414d3c75d 100644 --- a/docs/markdown/modules/bcftools.md +++ b/docs/markdown/modules/bcftools.md @@ -28,7 +28,7 @@ Supported commands: `stats` In non-strand-specific data, reporting the total numbers of occurences for both changes in a comlementary pair - like `A>C` and `T>G` - might not bring any additional information. 
To collapse such statistics in the substitutions plot, you can add the following section into -[your configuration](https://docs.seqera.io/multiqc/getting_started/config): +[your configuration](../getting_started/config): ```yaml bcftools: diff --git a/docs/markdown/modules/bclconvert.md b/docs/markdown/modules/bclconvert.md index 6a8ab6f9ad..63c06a49c8 100644 --- a/docs/markdown/modules/bclconvert.md +++ b/docs/markdown/modules/bclconvert.md @@ -32,7 +32,7 @@ You can specify a genome size in config It's often useful to talk about sequencing yield in terms of estimated depth of coverage. In order to make MultiQC show the estimated depth for each sample, specify the reference genome/target size in -your [MultiQC configuration](https://docs.seqera.io/multiqc/getting_started/config): +your [MultiQC configuration](../getting_started/config): ```yaml bclconvert: diff --git a/docs/markdown/modules/biscuit.md b/docs/markdown/modules/biscuit.md index 9ab48cc1cc..b9c05b934c 100644 --- a/docs/markdown/modules/biscuit.md +++ b/docs/markdown/modules/biscuit.md @@ -32,7 +32,7 @@ If you have BISCUIT data from before this, please use MultiQC v1.8. The second tab of this plot uses the config option `read_count_multiplier`, so if millions of reads is not useful for your data you can customise this. -See [Number base (multiplier)](https://docs.seqera.io/multiqc/#number-base-multiplier) +See [Number base (multiplier)](../#number-base-multiplier) in the documentation. 
### File search patterns diff --git a/docs/markdown/modules/cutadapt.md b/docs/markdown/modules/cutadapt.md index 50d72ecb63..9958172e41 100644 --- a/docs/markdown/modules/cutadapt.md +++ b/docs/markdown/modules/cutadapt.md @@ -34,7 +34,7 @@ sp: contents: "cutadapt version" ``` -See the [module search patterns](https://docs.seqera.io/multiqc/getting_started/config#module-search-patterns) +See the [module search patterns](../getting_started/config#module-search-patterns) section of the MultiQC documentation for more information. The module also understands logs saved by Trim Galore, which contain cutadapt logs. diff --git a/docs/markdown/modules/deeptools.md b/docs/markdown/modules/deeptools.md index 4c26a2dbac..12ee231b55 100644 --- a/docs/markdown/modules/deeptools.md +++ b/docs/markdown/modules/deeptools.md @@ -35,7 +35,7 @@ The module for deepTools parses a number of the text files that deepTools can pr - `plotCorrelation --outFileCorMatrix` - `plotProfile --outFileNameData` -Please be aware that some tools (namely, `plotFingerprint --outRawCounts` and `plotCoverage --outRawCounts`) are only supported as of deepTools version 2.6. For earlier output from `plotCoverage --outRawCounts`, you can use `#'chr' 'start' 'end'` in `search_patterns.yaml` (see [here](https://docs.seqera.io/multiqc/getting_started/config#module-search-patterns) for more details). Also for these types of files, you may need to increase the maximum file size supported by MultiQC (`log_filesize_limit` in the MultiQC configuration file). You can find details regarding the configuration file location [here](https://docs.seqera.io/multiqc/getting_started/config). +Please be aware that some tools (namely, `plotFingerprint --outRawCounts` and `plotCoverage --outRawCounts`) are only supported as of deepTools version 2.6. 
For earlier output from `plotCoverage --outRawCounts`, you can use `#'chr' 'start' 'end'` in `search_patterns.yaml` (see [here](../getting_started/config#module-search-patterns) for more details). Also for these types of files, you may need to increase the maximum file size supported by MultiQC (`log_filesize_limit` in the MultiQC configuration file). You can find details regarding the configuration file location [here](../getting_started/config). Note that sample names are parsed from the text files themselves, they are not derived from file names. diff --git a/docs/markdown/modules/fastqc.md b/docs/markdown/modules/fastqc.md index 15e816ee3b..1221f0c951 100644 --- a/docs/markdown/modules/fastqc.md +++ b/docs/markdown/modules/fastqc.md @@ -51,7 +51,7 @@ that they will share a sample name with data that has already been parsed. ::: You can customise the patterns used for finding these files in your -MultiQC config (see [Module search patterns](https://docs.seqera.io/multiqc/getting_started/config#module-search-patterns)). +MultiQC config (see [Module search patterns](../getting_started/config#module-search-patterns)). The below code shows the default file patterns: ```yaml @@ -72,7 +72,7 @@ Sample names are discovered by parsing the line beginning It is possible to plot a dashed line showing the theoretical GC content for a reference genome. MultiQC comes with genome and transcriptome guides for Human and Mouse. You can use these in your reports by adding the following MultiQC -config keys (see [Configuring MultiQC](https://docs.seqera.io/multiqc/getting_started/config)): +config keys (see [Configuring MultiQC](../getting_started/config)): ```yaml fastqc_config: @@ -148,7 +148,7 @@ fastqc_config: Remember that it is possible to customise the order in which the different module sections appear in the report if you wish. -See [the docs](https://docs.seqera.io/multiqc/#order-of-module-and-module-subsection-output) for more information. 
+See [the docs](../#order-of-module-and-module-subsection-output) for more information. For example, to show the _Status Checks_ section at the top, use the following config: diff --git a/docs/markdown/modules/featurecounts.md b/docs/markdown/modules/featurecounts.md index f3a92001af..97faab6409 100644 --- a/docs/markdown/modules/featurecounts.md +++ b/docs/markdown/modules/featurecounts.md @@ -24,7 +24,7 @@ File path for the source of this content: multiqc/modules/featurecounts/featurec As of MultiQC v1.10, the module should also work with output from [Rsubread](https://bioconductor.org/packages/release/bioc/html/Rsubread.html). Note that your filenames must end in `.summary` to be discovered. -See [Module search patterns](https://docs.seqera.io/multiqc/getting_started/config#module-search-patterns) for how to customise this. +See [Module search patterns](../getting_started/config#module-search-patterns) for how to customise this. Please note that if files are in "Rsubread mode" then lines will be split by any whitespace, instead of tab characters. As such, filenames with spaces in will diff --git a/docs/markdown/modules/lima.md b/docs/markdown/modules/lima.md index b7dd696194..622a3cb93d 100644 --- a/docs/markdown/modules/lima.md +++ b/docs/markdown/modules/lima.md @@ -33,7 +33,7 @@ results are added to their own section. If you want to include the Lima results in the General Statistics table, you can rename the `barcode1--barcode2` filenames to their apropriate samples using -the [--replace-names](https://docs.seqera.io/multiqc/#sample-name-replacement) +the [--replace-names](../#sample-name-replacement) option. Each sample that is specified in this way will be moved from the Lima section to the General Statistics table. diff --git a/docs/markdown/modules/preseq.md b/docs/markdown/modules/preseq.md index 7d85d1aec9..24227c7280 100644 --- a/docs/markdown/modules/preseq.md +++ b/docs/markdown/modules/preseq.md @@ -30,7 +30,7 @@ file sizes. 
To avoid this, MultiQC trims back the x-axis until each dataset shows 80% of its maximum y-value (unique molecules). To disable this feature and show all the data, add the following to your -[MultiQC configuration](https://docs.seqera.io/multiqc/getting_started/config): +[MultiQC configuration](../getting_started/config): ```yaml preseq: @@ -42,7 +42,7 @@ preseq: Preseq reports its numbers as "Molecule counts". This isn't always very intuitive, and it's often easier to talk about sequencing depth in terms of coverage. You can plot the estimated coverage instead by specifying the reference genome or target size, -and the read length in your [MultiQC configuration](https://docs.seqera.io/multiqc/getting_started/config): +and the read length in your [MultiQC configuration](../getting_started/config): ```yaml preseq: diff --git a/docs/markdown/modules/qualimap.md b/docs/markdown/modules/qualimap.md index ace24290c5..a692711458 100644 --- a/docs/markdown/modules/qualimap.md +++ b/docs/markdown/modules/qualimap.md @@ -38,7 +38,7 @@ table_columns_visible: median_insert_size: False ``` -See the [relevant section of the documentation](https://docs.seqera.io/multiqc/reports/customisation#hiding-columns) for more detail. +See the [relevant section of the documentation](../reports/customisation#hiding-columns) for more detail. In addition to this, it's possible to customise which coverage thresholds calculated by the Qualimap BamQC module _(default: 1, 5, 10, 30, 50)_ and which of these are hidden in the diff --git a/docs/markdown/modules/quast.md b/docs/markdown/modules/quast.md index 39d4e477c8..17bceb3721 100644 --- a/docs/markdown/modules/quast.md +++ b/docs/markdown/modules/quast.md @@ -43,7 +43,7 @@ quast_config: ``` The default module values are shown above. See the -[main MultiQC documentation](https://docs.seqera.io/multiqc/getting_started/config) +[main MultiQC documentation](../getting_started/config) for more information about how to configure MultiQC. 
#### MetaQUAST diff --git a/docs/markdown/modules/supernova.md b/docs/markdown/modules/supernova.md index 0415694cf9..f0ef611cd0 100644 --- a/docs/markdown/modules/supernova.md +++ b/docs/markdown/modules/supernova.md @@ -38,7 +38,7 @@ This module has been tested using Supernova versions `1.1.4` and `1.2.0` Due to the size of the `histogram_kmer_count.json` files, MultiQC is likely to skip these files. To be able to display these you will need to change the MultiQC configuration to allow for larger logfiles, see the MultiQC -[documentation](https://docs.seqera.io/multiqc/usage/troubleshooting#big-log-files). For instance, if you run MultiQC as part of an +[documentation](../usage/troubleshooting#big-log-files). For instance, if you run MultiQC as part of an analysis pipeline, you can create a `multiqc_config.yaml` file in the working directory, containing the following line: diff --git a/docs/markdown/modules/vcftools.md b/docs/markdown/modules/vcftools.md index 0327e634c4..db36f97d90 100644 --- a/docs/markdown/modules/vcftools.md +++ b/docs/markdown/modules/vcftools.md @@ -26,7 +26,7 @@ File path for the source of this content: multiqc/modules/vcftools/vcftools.py - Depending on the size and density of the variant data (vcf), some of the stat files generated by vcftools can be very large. If you find that some of your input files are missing, increase - the [config.log_filesize_limit](https://docs.seqera.io/multiqc/usage/troubleshooting#big-log-files) + the [config.log_filesize_limit](../usage/troubleshooting#big-log-files) so that the large file(s) will not be skipped by MultiQC. Note, however, that this might make MultiQC very slow! diff --git a/scripts/make_module_docs.py b/scripts/make_module_docs.py index 46be89a91d..ab3e9bf0e5 100644 --- a/scripts/make_module_docs.py +++ b/scripts/make_module_docs.py @@ -1,3 +1,4 @@ +#!/usr/bin/env python """ Generate documentation for MultiQC modules and changelog. 
@@ -58,6 +59,9 @@ def main(): docstring = module_cls.__doc__ or "" + # Replace absolute URLs with relative, so that Docs CI can find broken links + docstring = docstring.replace("https://docs.seqera.io/multiqc/", "../") + if module.extra: extra = "\n".join(line.strip() for line in module.extra.split("\n") if line.strip()) extra += "\n\n" From 48127527c2833a953c2dd0b0192f39f5b8e0adf3 Mon Sep 17 00:00:00 2001 From: Gavin Date: Wed, 17 Sep 2025 21:06:38 +0100 Subject: [PATCH 34/35] first pass link correction (#3314) * Fix merge conflicts * Clean up prettier format check from upstream change * Only try to run claude PR reviews when coming from a branch Fails on PRs coming from a fork, as secrets with the API key are not available --------- Co-authored-by: Phil Ewels --- .github/workflows/claude-code-review.yml | 17 +++++++---------- .github/workflows/claude.yml | 3 +-- docs/markdown/modules/biscuit.md | 2 +- docs/markdown/modules/fastqc.md | 2 +- docs/markdown/modules/lima.md | 2 +- multiqc/modules/biscuit/biscuit.py | 2 +- multiqc/modules/fastqc/fastqc.py | 3 ++- multiqc/modules/lima/lima.py | 2 +- multiqc/multiqc.py | 2 +- multiqc/plots/plot.py | 2 +- multiqc/templates/default/toolbox.html | 6 +++--- 11 files changed, 20 insertions(+), 23 deletions(-) diff --git a/.github/workflows/claude-code-review.yml b/.github/workflows/claude-code-review.yml index 5e90d4b9ff..23ce89c599 100644 --- a/.github/workflows/claude-code-review.yml +++ b/.github/workflows/claude-code-review.yml @@ -12,19 +12,17 @@ on: jobs: claude-review: - # Optional: Filter by PR author - # if: | - # github.event.pull_request.user.login == 'external-contributor' || - # github.event.pull_request.user.login == 'new-developer' || - # github.event.pull_request.author_association == 'FIRST_TIME_CONTRIBUTOR' - + # Only run on PRs from branches in the same repository (not forks) + # This prevents the workflow from failing when secrets aren't available + if: github.event.pull_request.head.repo.full_name == 
github.repository + runs-on: ubuntu-latest permissions: contents: read pull-requests: read issues: read id-token: write - + steps: - name: Checkout repository uses: actions/checkout@v4 @@ -43,12 +41,11 @@ jobs: - Performance considerations - Security concerns - Test coverage - + Use the repository's CLAUDE.md for guidance on style and conventions. Be constructive and helpful in your feedback. Use `gh pr comment` with your Bash tool to leave your review as a comment on the PR. - + # See https://github.com/anthropics/claude-code-action/blob/main/docs/usage.md # or https://docs.anthropic.com/en/docs/claude-code/sdk#command-line for available options claude_args: '--allowed-tools "Bash(gh issue view:*),Bash(gh search:*),Bash(gh issue list:*),Bash(gh pr comment:*),Bash(gh pr diff:*),Bash(gh pr view:*),Bash(gh pr list:*)"' - diff --git a/.github/workflows/claude.yml b/.github/workflows/claude.yml index 4b2e6d2f5c..71378c68e7 100644 --- a/.github/workflows/claude.yml +++ b/.github/workflows/claude.yml @@ -35,7 +35,7 @@ jobs: uses: anthropics/claude-code-action@v1 with: anthropic_api_key: ${{ secrets.ANTHROPIC_API_KEY }} - + # This is an optional setting that allows Claude to read CI results on PRs additional_permissions: | actions: read @@ -47,4 +47,3 @@ jobs: # See https://github.com/anthropics/claude-code-action/blob/main/docs/usage.md # or https://docs.anthropic.com/en/docs/claude-code/sdk#command-line for available options # claude_args: '--model claude-opus-4-1-20250805 --allowed-tools Bash(gh pr:*)' - diff --git a/docs/markdown/modules/biscuit.md b/docs/markdown/modules/biscuit.md index b9c05b934c..1bbdc65ddc 100644 --- a/docs/markdown/modules/biscuit.md +++ b/docs/markdown/modules/biscuit.md @@ -32,7 +32,7 @@ If you have BISCUIT data from before this, please use MultiQC v1.8. The second tab of this plot uses the config option `read_count_multiplier`, so if millions of reads is not useful for your data you can customise this. 
-See [Number base (multiplier)](../#number-base-multiplier) +See [Number base (multiplier)](../reports/customisation#number-base-multiplier) in the documentation. ### File search patterns diff --git a/docs/markdown/modules/fastqc.md b/docs/markdown/modules/fastqc.md index 1221f0c951..bde958611b 100644 --- a/docs/markdown/modules/fastqc.md +++ b/docs/markdown/modules/fastqc.md @@ -148,7 +148,7 @@ fastqc_config: Remember that it is possible to customise the order in which the different module sections appear in the report if you wish. -See [the docs](../#order-of-module-and-module-subsection-output) for more information. +See [the docs](../reports/customisation#order-of-module-and-module-subsection-output) for more information. For example, to show the _Status Checks_ section at the top, use the following config: diff --git a/docs/markdown/modules/lima.md b/docs/markdown/modules/lima.md index 622a3cb93d..1302f59617 100644 --- a/docs/markdown/modules/lima.md +++ b/docs/markdown/modules/lima.md @@ -33,7 +33,7 @@ results are added to their own section. If you want to include the Lima results in the General Statistics table, you can rename the `barcode1--barcode2` filenames to their apropriate samples using -the [--replace-names](../#sample-name-replacement) +the [--replace-names](../reports/customisation#sample-name-replacement) option. Each sample that is specified in this way will be moved from the Lima section to the General Statistics table. diff --git a/multiqc/modules/biscuit/biscuit.py b/multiqc/modules/biscuit/biscuit.py index 19ba01ca76..03955f1927 100644 --- a/multiqc/modules/biscuit/biscuit.py +++ b/multiqc/modules/biscuit/biscuit.py @@ -20,7 +20,7 @@ class MultiqcModule(BaseMultiqcModule): The second tab of this plot uses the config option `read_count_multiplier`, so if millions of reads is not useful for your data you can customise this. 
- See [Number base (multiplier)](https://docs.seqera.io/multiqc/#number-base-multiplier) + See [Number base (multiplier)](https://docs.seqera.io/multiqc/reports/customisation#number-base-multiplier) in the documentation. """ diff --git a/multiqc/modules/fastqc/fastqc.py b/multiqc/modules/fastqc/fastqc.py index 8ddfa4a869..0acb9d28a9 100755 --- a/multiqc/modules/fastqc/fastqc.py +++ b/multiqc/modules/fastqc/fastqc.py @@ -169,7 +169,8 @@ class MultiqcModule(BaseMultiqcModule): Remember that it is possible to customise the order in which the different module sections appear in the report if you wish. - See [the docs](https://docs.seqera.io/multiqc/#order-of-module-and-module-subsection-output) for more information. + See [the docs](https://docs.seqera.io/multiqc/reports/customisation#order-of-module-and-module-subsection-output) + for more information. For example, to show the _Status Checks_ section at the top, use the following config: diff --git a/multiqc/modules/lima/lima.py b/multiqc/modules/lima/lima.py index 6fe99259db..0a409311ca 100644 --- a/multiqc/modules/lima/lima.py +++ b/multiqc/modules/lima/lima.py @@ -20,7 +20,7 @@ class MultiqcModule(BaseMultiqcModule): If you want to include the Lima results in the General Statistics table, you can rename the `barcode1--barcode2` filenames to their apropriate samples using - the [--replace-names](https://docs.seqera.io/multiqc/#sample-name-replacement) + the [--replace-names](https://docs.seqera.io/multiqc/reports/customisation#sample-name-replacement) option. Each sample that is specified in this way will be moved from the Lima section to the General Statistics table. """ diff --git a/multiqc/multiqc.py b/multiqc/multiqc.py index 81beee0e58..ca36498e88 100644 --- a/multiqc/multiqc.py +++ b/multiqc/multiqc.py @@ -669,7 +669,7 @@ def run( log_and_rich.rich_console_print( "[blue]| multiqc[/] | " "Flat-image plots used. Disable with '--interactive'. 
" - "See [link=https://docs.seqera.io/multiqc/#flat--interactive-plots]docs[/link]." + "See [link=https://docs.seqera.io/multiqc/getting_started/config#flat--interactive-plots]docs[/link]." ) sys_exit_code = 0 diff --git a/multiqc/plots/plot.py b/multiqc/plots/plot.py index f7e284ec3d..c91fab1010 100644 --- a/multiqc/plots/plot.py +++ b/multiqc/plots/plot.py @@ -1157,7 +1157,7 @@ def flat_plot( '

', ' ', "Flat image plot. Toolbox functions such as highlighting / hiding samples will not work ", - '(see the docs).', + '(see the docs).', "", "

", ] diff --git a/multiqc/templates/default/toolbox.html b/multiqc/templates/default/toolbox.html index 7b1f09f325..de26bd2e2d 100644 --- a/multiqc/templates/default/toolbox.html +++ b/multiqc/templates/default/toolbox.html @@ -47,7 +47,7 @@

This report has flat image plots that won't be highlighted.
- See the documentation + See the documentation for help.

{% endif %} @@ -74,7 +74,7 @@

This report has flat image plots that won't be renamed.
- See the documentation + See the documentation for help.

{% endif %} @@ -110,7 +110,7 @@

This report has flat image plots that won't be hidden.
- See the documentation + See the documentation for help.

{% endif %} From 1d338098e60e4598023bf047f7f51616f603b872 Mon Sep 17 00:00:00 2001 From: Jethro Rainford <45037268+jethror1@users.noreply.github.com> Date: Wed, 17 Sep 2025 21:41:29 +0100 Subject: [PATCH 35/35] New module: sompy (#3186) * add new module som.py * sompy - simplify generate_table_headers() * update happy module description for new sompy module * fix style checks for failing tests * Minor review tweaks * Don't trigger docker build on pyproject.yml This is quite common, as every new module needs it --------- Co-authored-by: Phil Ewels --- .github/workflows/docker.yml | 4 +- multiqc/modules/happy/happy.py | 2 +- multiqc/modules/sompy/__init__.py | 3 + multiqc/modules/sompy/sompy.py | 204 ++++++++++++++++++++++++++++++ multiqc/search_patterns.yaml | 4 + pyproject.toml | 1 + 6 files changed, 215 insertions(+), 3 deletions(-) create mode 100644 multiqc/modules/sompy/__init__.py create mode 100644 multiqc/modules/sompy/sompy.py diff --git a/.github/workflows/docker.yml b/.github/workflows/docker.yml index 427f84795b..9a9197224d 100644 --- a/.github/workflows/docker.yml +++ b/.github/workflows/docker.yml @@ -3,8 +3,8 @@ name: "Docker image" on: pull_request: paths: - # Step takes 5 min, too long for PR commits, so leaving it only to significant changes - - "pyproject.toml" + # Step takes 30 min, too long for PR commits, so leaving it only to significant changes + # - "pyproject.toml" - "Dockerfile" push: # Build multiqc:dev on every push to main diff --git a/multiqc/modules/happy/happy.py b/multiqc/modules/happy/happy.py index 7fcd73bcd9..d8174eb2f4 100644 --- a/multiqc/modules/happy/happy.py +++ b/multiqc/modules/happy/happy.py @@ -9,7 +9,7 @@ class MultiqcModule(BaseMultiqcModule): """ - Som.py output not currently supported. + Som.py output supported in separate sompy module. 
""" def __init__(self): diff --git a/multiqc/modules/sompy/__init__.py b/multiqc/modules/sompy/__init__.py new file mode 100644 index 0000000000..2b4896a76e --- /dev/null +++ b/multiqc/modules/sompy/__init__.py @@ -0,0 +1,3 @@ +from .sompy import MultiqcModule + +__all__ = ["MultiqcModule"] diff --git a/multiqc/modules/sompy/sompy.py b/multiqc/modules/sompy/sompy.py new file mode 100644 index 0000000000..59b1b2036e --- /dev/null +++ b/multiqc/modules/sompy/sompy.py @@ -0,0 +1,204 @@ +import csv +import logging + +from multiqc.base_module import BaseMultiqcModule, ModuleNoSamplesFound +from multiqc.plots import table + +log = logging.getLogger(__name__) + + +class MultiqcModule(BaseMultiqcModule): + def __init__(self): + """MultiQC module for processing som.py output""" + super(MultiqcModule, self).__init__( + name="som.py", + anchor="sompy", + href="https://github.com/Illumina/hap.py/blob/master/doc/sompy.md", + info=("Benchmarks somatic variant calls against gold standard truth datasets."), + # No publication / DOI // doi= + ) + + self.add_software_version(None) + + self.sompy_raw_sample_names = set() + self.sompy_combined_data = dict() + self.sompy_indel_data = dict() + self.sompy_snv_data = dict() + + for f in self.find_log_files("sompy"): + self.parse_file(f) + self.add_data_source(f) + + if len(self.sompy_raw_sample_names) == 0: + raise ModuleNoSamplesFound + + log.info("Found %s sompy reports", len(self.sompy_raw_sample_names)) + + helptext = ( + "No plots are generated, as som.py is generally run on single" + " control samples (HD757, etc.). Ideally, precision, recall and" + " F1 Score should all be as close to 1 as possible." 
+ ) + + self.add_section( + name="Combined", + anchor="sompy-combined-plot", + helptext=helptext, + plot=table.plot( + self.sompy_combined_data, + self.generate_table_headers("_combined"), + pconfig={ + "id": "sompy_combined_plot", + "title": "som.py: combined", + }, + ), + ) + self.add_section( + name="Indel", + anchor="sompy-indel-plot", + helptext=helptext, + plot=table.plot( + self.sompy_indel_data, + self.generate_table_headers("_indel"), + pconfig={ + "id": "sompy_indel_plot", + "title": "som.py: indel", + }, + ), + ) + self.add_section( + name="SNV", + anchor="sompy-snv-plot", + helptext=helptext, + plot=table.plot( + self.sompy_snv_data, + self.generate_table_headers("_snv"), + pconfig={ + "id": "sompy_snv_plot", + "title": "som.py: SNV", + }, + ), + ) + + self.write_data_file(self.sompy_combined_data, "multiqc_sompy_combined_data") + self.write_data_file(self.sompy_indel_data, "multiqc_sompy_indel_data") + self.write_data_file(self.sompy_snv_data, "multiqc_sompy_snv_data") + + def generate_table_headers(self, suffix: str = "") -> dict: + """ + Generates the dict of table header metadata + """ + header = { + "unk": { + "title": "Not assessed calls", + "description": "Number of non-assessed query calls", + "min": 0, + "max": 1, + "hidden": True, + "format": "{:.4f}", + }, + "total.truth": { + "title": "Truth: Total", + "description": "Total number of truth variants", + "format": None, + "hidden": True, + }, + "total.query": { + "title": "Query: Total", + "description": "Total number of query calls", + "format": None, + "hidden": True, + }, + "tp": { + "title": "True Positive Variants", + "description": "Number of true-positive calls", + "scale": "Greens", + "format": None, + }, + "fn": { + "title": "False Negative Variants", + "description": "Calls in truth without matching query call", + "scale": "Reds", + "format": None, + }, + "fp": { + "title": "False Positive Variants", + "description": "Number of false-positive calls", + "format": None, + "scale": 
"Reds", + "hidden": True, + }, + "recall": { + "title": "Recall", + "description": ("Recall for truth variant representation = TRUTH.TP / (TRUTH.TP + TRUTH.FN)"), + "min": 0, + "max": 1, + "cond_formatting_rules": { + "verygreen": [ + {"gte": 0.99}, + ], + "green": [{"lt": 0.99}, {"gt": 0.98}], + "amber": [{"lt": 0.98}, {"gt": 0.90}], + "red": [{"lt": 0.90}], + }, + "cond_formatting_colours": [ + {"red": "#D2222D"}, + {"amber": "#FFBF00"}, + {"green": "#238823"}, + {"verygreen": "#007000"}, + ], + "format": "{:.4f}", + }, + "precision": { + "title": "Precision", + "description": ("Precision of query variants = QUERY.TP / (QUERY.TP + QUERY.FP)"), + "min": 0, + "max": 1, + "format": "{:.4f}", + "hidden": True, + }, + } + + return {f"{k}{suffix}": v for k, v in header.items()} + + def parse_file(self, f): + """ + Reads the combined, indel and SNV data from sompy output file + """ + if self.is_ignore_sample(f["s_name"]): + return + + if f["s_name"] in self.sompy_raw_sample_names: + log.warning( + "Duplicate sample name found in %s! 
Overwriting: %s", + f["root"], + f["s_name"], + ) + + self.sompy_raw_sample_names.add(f["s_name"]) + + reader = csv.DictReader(f["f"].split("\n")) + + for row in reader: + row_id = f"{f['s_name']}_{row['type']}" + + if row["type"] == "records": + if row_id not in self.sompy_combined_data: + self.sompy_combined_data[row_id] = {"sample_id": f["s_name"]} + + for fn in reader.fieldnames: + self.sompy_combined_data[row_id][fn + "_combined"] = row[fn] + + if row["type"] == "indels": + if row_id not in self.sompy_indel_data: + self.sompy_indel_data[row_id] = {"sample_id": f["s_name"]} + + for fn in reader.fieldnames: + self.sompy_indel_data[row_id][fn + "_indel"] = row[fn] + + if row["type"] == "SNVs": + if row_id not in self.sompy_snv_data: + self.sompy_snv_data[row_id] = {"sample_id": f["s_name"]} + + for fn in reader.fieldnames: + self.sompy_snv_data[row_id][fn + "_snv"] = row[fn] diff --git a/multiqc/search_patterns.yaml b/multiqc/search_patterns.yaml index b0caaed68c..e74ad3ea07 100644 --- a/multiqc/search_patterns.yaml +++ b/multiqc/search_patterns.yaml @@ -970,6 +970,10 @@ snpsplit/new: fn: "*SNPsplit_report.yaml" software_versions: fn_re: ".+_mqc_versions\\.(yaml|yml)" +sompy: + fn: "*.stats.csv" + contents: ",sompyversion,sompycmd" + num_lines: 2 sortmerna: contents: "Minimal SW score based on E-value" spaceranger/count_html: diff --git a/pyproject.toml b/pyproject.toml index fd901dca5a..ee3a0cfe98 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -248,6 +248,7 @@ snippy = "multiqc.modules.snippy:MultiqcModule" snpeff = "multiqc.modules.snpeff:MultiqcModule" snpsplit = "multiqc.modules.snpsplit:MultiqcModule" somalier = "multiqc.modules.somalier:MultiqcModule" +sompy = "multiqc.modules.sompy:MultiqcModule" sortmerna = "multiqc.modules.sortmerna:MultiqcModule" sourmash = "multiqc.modules.sourmash:MultiqcModule" spaceranger = "multiqc.modules.spaceranger:MultiqcModule"