From 6e218123f98d1fbc81aaae496b0c4b2401561702 Mon Sep 17 00:00:00 2001
From: Jorisvansteenbrugge
 <7196110+Jorisvansteenbrugge@users.noreply.github.com>
Date: Wed, 8 Apr 2026 17:01:10 +0200
Subject: [PATCH 1/3] demo prs modules

---
 UMCUGenetics/pgscatalog/combine/main.nf       | 27 +++++++++
 .../pgscatalog/combine/tests/main.nf.test     | 27 +++++++++
 UMCUGenetics/pgscatalog/match/main.nf         | 42 ++++++++++++++
 .../pgscatalog/match/tests/main.nf.test       | 58 +++++++++++++++++++
 4 files changed, 154 insertions(+)
 create mode 100644 UMCUGenetics/pgscatalog/combine/main.nf
 create mode 100644 UMCUGenetics/pgscatalog/combine/tests/main.nf.test
 create mode 100644 UMCUGenetics/pgscatalog/match/main.nf
 create mode 100644 UMCUGenetics/pgscatalog/match/tests/main.nf.test

diff --git a/UMCUGenetics/pgscatalog/combine/main.nf b/UMCUGenetics/pgscatalog/combine/main.nf
new file mode 100644
index 0000000..a651c19
--- /dev/null
+++ b/UMCUGenetics/pgscatalog/combine/main.nf
@@ -0,0 +1,27 @@
+process PGSCATALOG_COMBINE {
+    tag "${meta.id}"
+
+    container "${workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container
+        ? 'https://depot.galaxyproject.org/singularity/pgscatalog-utils:1.4.4--pyhdfd78af_0'
+        : 'biocontainers/pgscatalog-utils:1.4.4--pyhdfd78af_0'}"
+
+    input:
+    tuple val(meta), path(scoring_file)
+    val assembly_version
+
+    output:
+    tuple val(meta), path("*_normalised.txt.gz"), emit: normalised_model
+    path "versions.yml", emit: versions
+
+    script:
+    def prefix = task.ext.prefix ?: meta.id
+    """
+    pgscatalog-combine \\
+        -s ${scoring_file} \\
+        -t ${assembly_version} \\
+        -o ${prefix}_normalised.txt.gz
+
+
+    echo "pgscatalog-combine: 1.4.4" > versions.yml
+    """
+}
diff --git a/UMCUGenetics/pgscatalog/combine/tests/main.nf.test b/UMCUGenetics/pgscatalog/combine/tests/main.nf.test
new file mode 100644
index 0000000..748609f
--- /dev/null
+++ b/UMCUGenetics/pgscatalog/combine/tests/main.nf.test
@@ -0,0 +1,27 @@
+nextflow_process {
+    name "Test Process PGScatalog-combine"
+    script "../main.nf"
+    process "PGSCATALOG_COMBINE"
+
+    tag "modules/local"
+
+
+    test("Test Correct model formatting "){
+        when{
+            process{
+                """
+                input[0] = [[id: 'model'],
+                    file("${projectDir}/assets/models/BCAC_313_PRS_GRCh38.txt",
+                    checkifExists: true)]
+                input[1] = channel.value("GRCh38")
+                """
+            }
+
+            then {
+                assertAll(
+                    {assert process.success},
+                )
+            }
+        }
+    }
+}
diff --git a/UMCUGenetics/pgscatalog/match/main.nf b/UMCUGenetics/pgscatalog/match/main.nf
new file mode 100644
index 0000000..8d2db1b
--- /dev/null
+++ b/UMCUGenetics/pgscatalog/match/main.nf
@@ -0,0 +1,42 @@
+process PGSCATALOG_MATCH {
+    tag "${meta.id}"
+
+    container "${workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container
+        ? 'https://depot.galaxyproject.org/singularity/pgscatalog-utils:1.4.4--pyhdfd78af_0'
+        : 'biocontainers/pgscatalog-utils:1.4.4--pyhdfd78af_0'}"
+
+    input:
+    tuple val(meta), path(pvar)
+    tuple val(meta2), path(scoring_file)
+
+
+    output:
+    tuple val(meta), path("*_summary.csv"), emit: summary
+    tuple val(meta), path("*.scorefile.gz"), emit: scorefile
+    tuple val(meta), path("*_log.csv.gz"), emit: log
+    path "versions.yml", emit: versions
+
+    script:
+    def prefix = task.ext.prefix ?: meta.id
+    def args = task.ext.args ?: ""
+    """
+    pgscatalog-match \\
+        ${args} \\
+        --dataset ${prefix} \\
+        --scorefiles ${scoring_file} \\
+        --target ${pvar} \\
+        --outdir ./
+
+    echo "pgscatalog-match: 1.4.4" > versions.yml
+    """
+
+    stub:
+    def prefix = task.ext.prefix ?: "Cohort"
+    """
+	touch ${prefix}_summary.csv
+	touch ${prefix}.scorefile.gz
+    touch ${prefix}_log.csv.gz
+
+	echo "pgscatalog-match: 1.4.4" > versions.yml
+	"""
+}
diff --git a/UMCUGenetics/pgscatalog/match/tests/main.nf.test b/UMCUGenetics/pgscatalog/match/tests/main.nf.test
new file mode 100644
index 0000000..176b6ad
--- /dev/null
+++ b/UMCUGenetics/pgscatalog/match/tests/main.nf.test
@@ -0,0 +1,58 @@
+nextflow_process {
+    name "Test Process PGScatalog-match"
+    script "../main.nf"
+    process "PGSCATALOG_MATCH"
+
+
+
+    tag "modules/local"
+
+    test("Test SNV ambiguity detection"){
+        setup{
+            run("PLINK2_VCF") {
+                script "../../../../nf-core/plink2/vcf/main.nf"
+
+                process{
+                    """
+                        input[0] = [[id: "vcf"], file("${projectDir}/assets/test-data/test.vcf",checkIfExists: true)]
+                    """
+                }
+            }
+        }
+
+        when {
+            process {
+                """
+                input[0] = [[id: "model"],
+                        file(
+                            "${projectDir}/assets/test-data/norm_test_model_subworkflow.txt.gz",
+                            checkIfExists: true)]
+                    input[1] = PLINK2_VCF.out.pvar_zst
+                    
+                """
+            }  
+        }
+        then {
+            // Analysing the csv is necessary as the snapshots do not always match
+            // (row order differs)
+            def csv = path(process.out.summary[0][1]).csv().table
+            def matched = csv.stringColumn("match_status").isEqualTo("matched")
+            
+            def n_ambiguous_SNV = csv.where(
+                csv.booleanColumn("ambiguous").isTrue().and(matched)
+            ).row(0).getInt("count")
+
+            def n_unambiguous_SNV = csv.where(
+                csv.booleanColumn("ambiguous").isFalse().and(matched)
+            ).row(0).getInt("count")
+            
+            assertAll (
+                {assert process.success},
+                {assert n_ambiguous_SNV == 1},
+                {assert n_unambiguous_SNV == 3}
+            )
+        }
+            
+    
+    }
+}

From 10012bf8608c6b5b71f3d44cba908265887493e7 Mon Sep 17 00:00:00 2001
From: Jorisvansteenbrugge
 <7196110+Jorisvansteenbrugge@users.noreply.github.com>
Date: Wed, 8 Apr 2026 17:16:26 +0200
Subject: [PATCH 2/3] add knn modules

---
 UMCUGenetics/knn/calc/main.nf                 |  33 ++
 .../knn/calc/resources/usr/bin/knn.py         | 424 ++++++++++++++++++
 .../knn/calc/resources/usr/bin/test_knn.py    | 152 +++++++
 UMCUGenetics/knn/merge/main.nf                |  19 +
 4 files changed, 628 insertions(+)
 create mode 100644 UMCUGenetics/knn/calc/main.nf
 create mode 100755 UMCUGenetics/knn/calc/resources/usr/bin/knn.py
 create mode 100644 UMCUGenetics/knn/calc/resources/usr/bin/test_knn.py
 create mode 100644 UMCUGenetics/knn/merge/main.nf

diff --git a/UMCUGenetics/knn/calc/main.nf b/UMCUGenetics/knn/calc/main.nf
new file mode 100644
index 0000000..8533225
--- /dev/null
+++ b/UMCUGenetics/knn/calc/main.nf
@@ -0,0 +1,33 @@
+process ANCESTRY_KNN {
+    tag "${meta.id}"
+    label "process_medium"
+
+    container "ghcr.io/astral-sh/uv:python3.13-bookworm"
+
+    input:
+    tuple val(meta), path(eigenvec)
+    tuple val(meta2), path(ref_metadata)
+
+    output:
+    tuple val(meta), path("*_knn.tsv"), emit: knn_tsv
+    tuple val(meta), path("*_knn_pca.png"), emit: knn_pca_plot, optional: true
+
+    script:
+    def prefix = task.ext.prefix ?: meta.id
+    def args = task.ext.args ?: ""
+    """
+    knn.py \
+        --eig ${eigenvec} \\
+        --labels ${ref_metadata} \\
+        ${args} \\
+        --plot-output ${prefix}_knn_pca.png \\
+        --output ${prefix}_knn.tsv
+    """
+
+    stub:
+    def prefix = task.ext.prefix ?: meta.id
+    """
+    touch ${prefix}_knn.tsv
+    touch ${prefix}_knn_pca.png
+    """
+}
diff --git a/UMCUGenetics/knn/calc/resources/usr/bin/knn.py b/UMCUGenetics/knn/calc/resources/usr/bin/knn.py
new file mode 100755
index 0000000..0ce8866
--- /dev/null
+++ b/UMCUGenetics/knn/calc/resources/usr/bin/knn.py
@@ -0,0 +1,424 @@
+#!/usr/bin/env -S uv run --script --no-cache
+# /// script
+# requires-python = ">=3.12"
+# dependencies = [
+#     "matplotlib",
+#     "numpy",
+#     "pandas",
+#     "typer",
+# ]
+# ///
+"""Run K-nearest-neighbor ancestry prediction from PCA coordinates."""
+
+from __future__ import annotations
+
+from pathlib import Path
+from typing import Annotated
+
+import matplotlib
+import numpy as np
+import pandas as pd
+import typer
+
+matplotlib.use("Agg")
+import matplotlib.pyplot as plt
+
+app = typer.Typer(add_completion=False, help="KNN ancestry prediction from PCA coordinates.")
+
+
+def read_input_tables(
+    eig_path: Path,
+    labels_path: Path,
+    sep: str,
+    id_col: str,
+    label_col: str,
+) -> pd.DataFrame:
+    """Read PCA and label tables, validate required columns, and merge by ID.
+
+    Args:
+        eig_path: Path to the PCA eigenvector file.
+        labels_path: Path to the sample label file.
+        sep: Delimiter used in both input files.
+        id_col: Column name for sample identifiers.
+        label_col: Column name for group labels.
+
+    Returns:
+        Merged DataFrame with PCA features and labels joined on ``id_col``.
+
+    Raises:
+        typer.BadParameter: If required columns are missing from either file.
+    """
+    eig = pd.read_csv(eig_path, sep=sep)
+    labels = pd.read_csv(labels_path, sep=sep)
+
+    required_eig_cols = {id_col}
+    required_label_cols = {id_col, label_col}
+    missing_eig = required_eig_cols - set(eig.columns)
+    missing_lab = required_label_cols - set(labels.columns)
+
+    if missing_eig:
+        raise typer.BadParameter(
+            f"Missing required column(s) in eig file: {', '.join(sorted(missing_eig))}"
+        )
+    if missing_lab:
+        raise typer.BadParameter(
+            f"Missing required column(s) in labels file: {', '.join(sorted(missing_lab))}"
+        )
+
+    return eig.merge(labels[[id_col, label_col]], on=id_col, how="left")
+
+
+def get_pc_columns(df: pd.DataFrame) -> list[str]:
+    """Return PCA feature columns named like PC1, PC2, etc.
+
+    Args:
+        df: DataFrame expected to contain one or more PC columns.
+
+    Returns:
+        List of column names whose names start with ``PC`` (case-insensitive).
+
+    Raises:
+        typer.BadParameter: If no PC columns are found in ``df``.
+    """
+    pc_cols = [column for column in df.columns if column.upper().startswith("PC")]
+    if not pc_cols:
+        raise typer.BadParameter(
+            "No PC columns found. Expected columns like PC1, PC2, ..."
+        )
+    return pc_cols
+
+
+def sort_pc_columns(pc_cols: list[str]) -> list[str]:
+    """Sort PC columns by their numeric suffix when present (PC1, PC2, PC10, ...).
+
+    Columns with a non-numeric suffix are sorted after numeric ones.
+
+    Args:
+        pc_cols: List of PC column names to sort.
+
+    Returns:
+        Sorted list of PC column names.
+    """
+    def _pc_sort_key(column: str) -> tuple[int, str]:
+        suffix = column[2:]
+        return (int(suffix), column) if suffix.isdigit() else (10**9, column)
+
+    return sorted(pc_cols, key=_pc_sort_key)
+
+
+def split_train_predict(
+    merged_df: pd.DataFrame,
+    label_col: str,
+) -> tuple[pd.DataFrame, pd.DataFrame]:
+    """Split merged data into labeled training rows and unlabeled prediction rows.
+
+    Args:
+        merged_df: DataFrame containing both labeled reference and unlabeled samples.
+        label_col: Column name that distinguishes labeled (non-NaN) from unlabeled rows.
+
+    Returns:
+        A ``(train, predict)`` tuple where ``train`` contains rows with a known label
+        and ``predict`` contains rows whose label is NaN.
+
+    Raises:
+        typer.BadParameter: If there are no labeled or no unlabeled samples after splitting.
+    """
+    train = merged_df[merged_df[label_col].notna()].copy()
+    predict = merged_df[merged_df[label_col].isna()].copy()
+
+    if train.empty:
+        raise typer.BadParameter(
+            "No labeled samples found after merge. Check if IDs match between files."
+        )
+    if predict.empty:
+        raise typer.BadParameter(
+            "No unlabeled samples found. Nothing to predict."
+        )
+
+    return train, predict
+
+
+def standardize_train_query(
+    train_features: np.ndarray,
+    query_features: np.ndarray,
+) -> tuple[np.ndarray, np.ndarray]:
+    """Standardize features using training-set mean and standard deviation.
+
+    Columns with zero variance in the training set are left unchanged (std set to 1).
+
+    Args:
+        train_features: Feature matrix for labeled training samples ``(n_train, n_pcs)``.
+        query_features: Feature matrix for samples to predict ``(n_query, n_pcs)``.
+
+    Returns:
+        A ``(train_std, query_std)`` tuple of standardized feature matrices.
+    """
+    mean = train_features.mean(axis=0)
+    std = train_features.std(axis=0)
+    std[std == 0] = 1.0
+    return (train_features - mean) / std, (query_features - mean) / std
+
+
+def predict_knn(
+    train_features: np.ndarray,
+    train_labels: np.ndarray,
+    query_features: np.ndarray,
+    k: int,
+) -> tuple[np.ndarray, np.ndarray]:
+    """Predict class labels and confidence scores using a simple KNN vote.
+
+    Confidence is the fraction of the *k* nearest neighbors that agree on the
+    winning label.  When ``k`` exceeds the number of training samples, the
+    effective neighbourhood is capped at ``len(train_labels)``.
+
+    Args:
+        train_features: Feature matrix for labeled training samples ``(n_train, n_pcs)``.
+        train_labels: Label array aligned with ``train_features`` ``(n_train,)``.
+        query_features: Feature matrix for samples to predict ``(n_query, n_pcs)``.
+        k: Number of nearest neighbors to consider.
+
+    Returns:
+        A ``(predicted_labels, confidences)`` tuple, both of shape ``(n_query,)``.
+
+    Raises:
+        typer.BadParameter: If ``k`` is less than 1.
+    """
+    if k < 1:
+        raise typer.BadParameter("k must be >= 1.")
+
+    effective_k = min(k, len(train_labels))
+    predicted_labels: list[str] = []
+    confidences: list[float] = []
+
+    for query_row in query_features:
+        distances = np.sum((train_features - query_row) ** 2, axis=1)
+        neighbor_idx = np.argpartition(distances, effective_k - 1)[:effective_k]
+        neighbor_labels = train_labels[neighbor_idx]
+
+        labels, counts = np.unique(neighbor_labels, return_counts=True)
+        winner_index = int(np.argmax(counts))
+        predicted_labels.append(str(labels[winner_index]))
+        confidences.append(float(counts[winner_index] / effective_k))
+
+    return np.array(predicted_labels), np.array(confidences)
+
+
+def write_predictions(
+    sample_ids: np.ndarray,
+    predicted_labels: np.ndarray,
+    confidences: np.ndarray,
+    output_path: Path,
+) -> None:
+    """Write KNN predictions to a tab-separated output file.
+
+    Args:
+        sample_ids: Array of sample identifiers ``(n_query,)``.
+        predicted_labels: Predicted group label for each sample ``(n_query,)``.
+        confidences: KNN confidence score for each prediction ``(n_query,)``.
+        output_path: Destination path for the output TSV.
+    """
+    out_df = pd.DataFrame(
+        {
+            "#IID": sample_ids,
+            "pred_group": predicted_labels,
+            "knn_conf": confidences,
+        }
+    )
+    out_df.to_csv(output_path, sep="\t", index=False)
+
+
+def write_pca_plot(
+    train_df: pd.DataFrame,
+    predict_df: pd.DataFrame,
+    predicted_labels: np.ndarray,
+    pc_x: str,
+    pc_y: str,
+    label_col: str,
+    output_path: Path,
+) -> None:
+    """Generate a PCA scatter plot colored by cluster/group labels.
+
+    Reference samples are drawn as small filled circles; predicted samples are
+    drawn as larger ``X`` markers with a black edge, using the same color as
+    their assigned cluster.
+
+    Args:
+        train_df: DataFrame of labeled reference samples including ``pc_x``, ``pc_y``,
+            and ``label_col`` columns.
+        predict_df: DataFrame of samples to predict including ``pc_x`` and ``pc_y`` columns.
+        predicted_labels: Predicted group label for each row in ``predict_df`` ``(n_query,)``.
+        pc_x: Column name to use as the x-axis.
+        pc_y: Column name to use as the y-axis.
+        label_col: Column name holding the reference group label in ``train_df``.
+        output_path: Destination path for the output PNG.
+    """
+    plot_train = train_df[[pc_x, pc_y, label_col]].copy()
+    plot_train["cluster"] = plot_train[label_col].astype(str)
+    plot_train["source"] = "reference"
+
+    plot_predict = predict_df[[pc_x, pc_y]].copy()
+    plot_predict["cluster"] = predicted_labels
+    plot_predict["source"] = "predicted"
+
+    plot_df = pd.concat(
+        [
+            plot_train[[pc_x, pc_y, "cluster", "source"]],
+            plot_predict[[pc_x, pc_y, "cluster", "source"]],
+        ],
+        ignore_index=True,
+    )
+
+    clusters = sorted(plot_df["cluster"].unique())
+    cmap = plt.cm.get_cmap("tab20", max(len(clusters), 1))
+    color_map = {cluster: cmap(i) for i, cluster in enumerate(clusters)}
+
+    fig, ax = plt.subplots(figsize=(8, 6))
+
+    for cluster in clusters:
+        cluster_train = plot_df[(plot_df["cluster"] == cluster) & (plot_df["source"] == "reference")]
+        cluster_pred = plot_df[(plot_df["cluster"] == cluster) & (plot_df["source"] == "predicted")]
+        color = color_map[cluster]
+
+        if not cluster_train.empty:
+            ax.scatter(
+                cluster_train[pc_x],
+                cluster_train[pc_y],
+                s=28,
+                alpha=0.75,
+                color=color,
+                edgecolors="none",
+                label=str(cluster),
+            )
+
+        if not cluster_pred.empty:
+            ax.scatter(
+                cluster_pred[pc_x],
+                cluster_pred[pc_y],
+                s=80,
+                alpha=1.0,
+                marker="X",
+                color=color,
+                edgecolors="black",
+                linewidths=0.6,
+            )
+
+    ax.set_xlabel(pc_x)
+    ax.set_ylabel(pc_y)
+    ax.set_title("PCA Clusters with KNN Predictions")
+
+    handles, labels = ax.get_legend_handles_labels()
+    if handles:
+        by_label = dict(zip(labels, handles))
+        ax.legend(by_label.values(), by_label.keys(), title="Cluster", bbox_to_anchor=(1.02, 1), loc="upper left")
+
+    fig.tight_layout()
+    fig.savefig(output_path, dpi=200)
+    plt.close(fig)
+
+
+@app.command()
+def main(
+    eig: Annotated[
+        Path,
+        typer.Option(
+            "--eig",
+            "-e",
+            exists=True,
+            dir_okay=False,
+            readable=True,
+            help="PCA table with ID and PC columns.",
+        ),
+    ],
+    labels: Annotated[
+        Path,
+        typer.Option(
+            "--labels",
+            "-l",
+            exists=True,
+            dir_okay=False,
+            readable=True,
+            help="Label table with ID and group label columns.",
+        ),
+    ],
+    output: Annotated[
+        Path,
+        typer.Option("--output", "-o", help="Output TSV path."),
+    ] = Path("knn_pred.tsv"),
+    k: Annotated[int, typer.Option("--k", "-k", min=1, help="Number of neighbors.")] = 5,
+    id_col: Annotated[str, typer.Option("--id-col", help="Sample ID column name.")] = "#IID",
+    label_col: Annotated[str, typer.Option("--label-col", help="Label column name.")] = "SuperPop",
+    sep: Annotated[str, typer.Option("--sep", help="Input delimiter used by both files.")] = "\t",
+    normalize: Annotated[
+        bool, typer.Option("--normalize/--no-normalize", help="Standardize PC columns.")
+    ] = True,
+    conf_threshold: Annotated[float, typer.Option('--conf_threshold', help="Confidence threshold for plot outputting")] = 0.6,
+    req_label: Annotated[str, typer.Option("--required_superpop", help="Expected super population for ancestry flagging")] = "EUR",
+    plot_output: Annotated[
+        Path, typer.Option("--plot-output", help="Output PNG path for PCA scatter plot.")
+    ] = Path("knn_pca.png"),
+) -> None:
+    """Run the KNN workflow and write prediction output.
+
+    Reads PCA and label files, trains a KNN classifier on labeled reference
+    samples, predicts ancestry for unlabeled samples, and writes a TSV of
+    predictions.  A PCA scatter plot is written when the first sample's
+    confidence falls at or below ``conf_threshold`` or its predicted label
+    differs from ``req_label``.
+    """
+    merged_df = read_input_tables(
+        eig_path=eig,
+        labels_path=labels,
+        sep=sep,
+        id_col=id_col,
+        label_col=label_col,
+    )
+    pc_cols = sort_pc_columns(get_pc_columns(merged_df))
+    if len(pc_cols) < 2:
+        raise typer.BadParameter(
+            "At least two PC columns are required."
+        )
+    train_df, predict_df = split_train_predict(merged_df, label_col=label_col)
+
+    train_features = train_df[pc_cols].to_numpy(dtype=float)
+    query_features = predict_df[pc_cols].to_numpy(dtype=float)
+    train_labels = train_df[label_col].to_numpy(dtype=str)
+
+    if normalize:
+        train_features, query_features = standardize_train_query(
+            train_features=train_features,
+            query_features=query_features,
+        )
+
+    predicted_labels, confidences = predict_knn(
+        train_features=train_features,
+        train_labels=train_labels,
+        query_features=query_features,
+        k=k,
+    )
+
+    write_predictions(
+        sample_ids=predict_df[id_col].to_numpy(),
+        predicted_labels=predicted_labels,
+        confidences=confidences,
+        output_path=output,
+    )
+
+    first_confidence = confidences[0]
+    first_predicted_label = predicted_labels[0]
+
+    if first_confidence <= conf_threshold or first_predicted_label != req_label:
+        write_pca_plot(
+            train_df=train_df,
+            predict_df=predict_df,
+            predicted_labels=predicted_labels,
+            pc_x=pc_cols[0],
+            pc_y=pc_cols[1],
+            label_col=label_col,
+            output_path=plot_output,
+        )
+        typer.echo(f"Wrote {plot_output}")
+    typer.echo(f"Wrote {output}")
+
+
+if __name__ == "__main__":
+    app()
diff --git a/UMCUGenetics/knn/calc/resources/usr/bin/test_knn.py b/UMCUGenetics/knn/calc/resources/usr/bin/test_knn.py
new file mode 100644
index 0000000..a1448cd
--- /dev/null
+++ b/UMCUGenetics/knn/calc/resources/usr/bin/test_knn.py
@@ -0,0 +1,152 @@
+"""Unit tests for knn.py helper functions."""
+
+import numpy as np
+import pandas as pd
+import pytest
+import typer
+
+from knn import (
+    get_pc_columns,
+    predict_knn,
+    sort_pc_columns,
+    split_train_predict,
+    standardize_train_query,
+)
+
+
+# ---------------------------------------------------------------------------
+# sort_pc_columns
+# ---------------------------------------------------------------------------
+
+def test_sort_pc_columns_numeric_order():
+    assert sort_pc_columns(["PC10", "PC2", "PC1"]) == ["PC1", "PC2", "PC10"]
+
+
+def test_sort_pc_columns_non_numeric_suffix_last():
+    result = sort_pc_columns(["PCx", "PC2", "PC1"])
+    assert result[:2] == ["PC1", "PC2"]
+    assert result[-1] == "PCx"
+
+
+def test_sort_pc_columns_single_element():
+    assert sort_pc_columns(["PC1"]) == ["PC1"]
+
+
+# ---------------------------------------------------------------------------
+# get_pc_columns
+# ---------------------------------------------------------------------------
+
+def test_get_pc_columns_returns_matching_columns():
+    df = pd.DataFrame(columns=["#IID", "PC1", "PC2", "SuperPop"])
+    assert set(get_pc_columns(df)) == {"PC1", "PC2"}
+
+
+def test_get_pc_columns_case_insensitive():
+    df = pd.DataFrame(columns=["#IID", "pc1", "PC2"])
+    assert set(get_pc_columns(df)) == {"pc1", "PC2"}
+
+
+def test_get_pc_columns_raises_when_none_found():
+    df = pd.DataFrame(columns=["#IID", "SuperPop"])
+    with pytest.raises(typer.BadParameter):
+        get_pc_columns(df)
+
+
+# ---------------------------------------------------------------------------
+# split_train_predict
+# ---------------------------------------------------------------------------
+
+def _make_merged_df(labeled_n: int, unlabeled_n: int) -> pd.DataFrame:
+    rows = []
+    for i in range(labeled_n):
+        rows.append({"#IID": f"ref_{i}", "PC1": float(i), "SuperPop": "EUR"})
+    for i in range(unlabeled_n):
+        rows.append({"#IID": f"sample_{i}", "PC1": float(i + 100), "SuperPop": np.nan})
+    return pd.DataFrame(rows)
+
+
+def test_split_train_predict_sizes():
+    df = _make_merged_df(labeled_n=10, unlabeled_n=3)
+    train, predict = split_train_predict(df, label_col="SuperPop")
+    assert len(train) == 10
+    assert len(predict) == 3
+
+
+def test_split_train_predict_raises_no_train():
+    df = _make_merged_df(labeled_n=0, unlabeled_n=3)
+    with pytest.raises(typer.BadParameter, match="No labeled"):
+        split_train_predict(df, label_col="SuperPop")
+
+
+def test_split_train_predict_raises_no_predict():
+    df = _make_merged_df(labeled_n=5, unlabeled_n=0)
+    with pytest.raises(typer.BadParameter, match="Nothing to predict"):
+        split_train_predict(df, label_col="SuperPop")
+
+
+# ---------------------------------------------------------------------------
+# standardize_train_query
+# ---------------------------------------------------------------------------
+
+def test_standardize_zero_mean_unit_std():
+    train = np.array([[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]])
+    query = np.array([[3.0, 4.0]])
+    train_std, query_std = standardize_train_query(train, query)
+    np.testing.assert_allclose(train_std.mean(axis=0), 0.0, atol=1e-10)
+    np.testing.assert_allclose(train_std.std(axis=0), 1.0, atol=1e-10)
+    # query center-of-training → all zeros after standardization
+    np.testing.assert_allclose(query_std, [[0.0, 0.0]], atol=1e-10)
+
+
+def test_standardize_zero_variance_column_unchanged():
+    """Columns with zero variance must not cause division-by-zero."""
+    train = np.array([[1.0, 5.0], [1.0, 7.0]])  # column 0 has std=0
+    query = np.array([[1.0, 6.0]])
+    train_std, _ = standardize_train_query(train, query)
+    # zero-variance column is left unchanged (divided by 1 after clamp)
+    np.testing.assert_allclose(train_std[:, 0], 0.0, atol=1e-10)
+
+
+# ---------------------------------------------------------------------------
+# predict_knn
+# ---------------------------------------------------------------------------
+
+def _simple_train():
+    """Two clusters far apart: A around 0, B around 10."""
+    train_features = np.array(
+        [[0.0], [0.1], [0.2], [10.0], [10.1], [10.2]]
+    )
+    train_labels = np.array(["A", "A", "A", "B", "B", "B"])
+    return train_features, train_labels
+
+
+def test_predict_knn_clear_majority():
+    train_features, train_labels = _simple_train()
+    query = np.array([[0.05], [10.05]])
+    labels, confs = predict_knn(train_features, train_labels, query, k=3)
+    assert list(labels) == ["A", "B"]
+    np.testing.assert_allclose(confs, [1.0, 1.0])
+
+
+def test_predict_knn_confidence_range():
+    train_features, train_labels = _simple_train()
+    query = np.array([[5.0]])  # equidistant — confidence will be < 1
+    labels, confs = predict_knn(train_features, train_labels, query, k=6)
+    assert 0.0 <= confs[0] <= 1.0
+
+
+def test_predict_knn_k_exceeds_training_size():
+    """k larger than training set should not raise, just use all neighbors."""
+    train_features = np.array([[0.0], [1.0]])
+    train_labels = np.array(["A", "A"])
+    query = np.array([[0.5]])
+    labels, confs = predict_knn(train_features, train_labels, query, k=100)
+    assert labels[0] == "A"
+    assert confs[0] == 1.0
+
+
+def test_predict_knn_k_less_than_1_raises():
+    train_features = np.array([[0.0]])
+    train_labels = np.array(["A"])
+    with pytest.raises(typer.BadParameter, match="k must be"):
+        predict_knn(train_features, train_labels, np.array([[0.0]]), k=0)
diff --git a/UMCUGenetics/knn/merge/main.nf b/UMCUGenetics/knn/merge/main.nf
new file mode 100644
index 0000000..23c25c5
--- /dev/null
+++ b/UMCUGenetics/knn/merge/main.nf
@@ -0,0 +1,19 @@
+process ANCESTRY_KNN_MERGE {
+    tag "ANCESTRY_KNN_MERGE"
+    label "process_low"
+
+    input:
+    path(knn_tsvs)
+
+    output:
+    path("ancestry_knn_mqc.tsv"), emit: knn_mqc_tsv
+
+    script:
+    """
+    echo "Sample ID\tPrediction Group\tConfidence" > ancestry_knn_mqc.tsv
+    
+    for f in ${knn_tsvs}; do
+        tail -n 1 \$f >> ancestry_knn_mqc.tsv
+    done
+    """
+}

From 044cabb8383afd272be8219c00a8655242ee17cf Mon Sep 17 00:00:00 2001
From: Jorisvansteenbrugge
 <7196110+Jorisvansteenbrugge@users.noreply.github.com>
Date: Wed, 8 Apr 2026 17:19:20 +0200
Subject: [PATCH 3/3] move modules to correct folder

---
 {UMCUGenetics => modules/UMCUGenetics}/knn/calc/main.nf           | 0
 .../UMCUGenetics}/knn/calc/resources/usr/bin/knn.py               | 0
 .../UMCUGenetics}/knn/calc/resources/usr/bin/test_knn.py          | 0
 {UMCUGenetics => modules/UMCUGenetics}/knn/merge/main.nf          | 0
 {UMCUGenetics => modules/UMCUGenetics}/pgscatalog/combine/main.nf | 0
 .../UMCUGenetics}/pgscatalog/combine/tests/main.nf.test           | 0
 {UMCUGenetics => modules/UMCUGenetics}/pgscatalog/match/main.nf   | 0
 .../UMCUGenetics}/pgscatalog/match/tests/main.nf.test             | 0
 8 files changed, 0 insertions(+), 0 deletions(-)
 rename {UMCUGenetics => modules/UMCUGenetics}/knn/calc/main.nf (100%)
 rename {UMCUGenetics => modules/UMCUGenetics}/knn/calc/resources/usr/bin/knn.py (100%)
 rename {UMCUGenetics => modules/UMCUGenetics}/knn/calc/resources/usr/bin/test_knn.py (100%)
 rename {UMCUGenetics => modules/UMCUGenetics}/knn/merge/main.nf (100%)
 rename {UMCUGenetics => modules/UMCUGenetics}/pgscatalog/combine/main.nf (100%)
 rename {UMCUGenetics => modules/UMCUGenetics}/pgscatalog/combine/tests/main.nf.test (100%)
 rename {UMCUGenetics => modules/UMCUGenetics}/pgscatalog/match/main.nf (100%)
 rename {UMCUGenetics => modules/UMCUGenetics}/pgscatalog/match/tests/main.nf.test (100%)

diff --git a/UMCUGenetics/knn/calc/main.nf b/modules/UMCUGenetics/knn/calc/main.nf
similarity index 100%
rename from UMCUGenetics/knn/calc/main.nf
rename to modules/UMCUGenetics/knn/calc/main.nf
diff --git a/UMCUGenetics/knn/calc/resources/usr/bin/knn.py b/modules/UMCUGenetics/knn/calc/resources/usr/bin/knn.py
similarity index 100%
rename from UMCUGenetics/knn/calc/resources/usr/bin/knn.py
rename to modules/UMCUGenetics/knn/calc/resources/usr/bin/knn.py
diff --git a/UMCUGenetics/knn/calc/resources/usr/bin/test_knn.py b/modules/UMCUGenetics/knn/calc/resources/usr/bin/test_knn.py
similarity index 100%
rename from UMCUGenetics/knn/calc/resources/usr/bin/test_knn.py
rename to modules/UMCUGenetics/knn/calc/resources/usr/bin/test_knn.py
diff --git a/UMCUGenetics/knn/merge/main.nf b/modules/UMCUGenetics/knn/merge/main.nf
similarity index 100%
rename from UMCUGenetics/knn/merge/main.nf
rename to modules/UMCUGenetics/knn/merge/main.nf
diff --git a/UMCUGenetics/pgscatalog/combine/main.nf b/modules/UMCUGenetics/pgscatalog/combine/main.nf
similarity index 100%
rename from UMCUGenetics/pgscatalog/combine/main.nf
rename to modules/UMCUGenetics/pgscatalog/combine/main.nf
diff --git a/UMCUGenetics/pgscatalog/combine/tests/main.nf.test b/modules/UMCUGenetics/pgscatalog/combine/tests/main.nf.test
similarity index 100%
rename from UMCUGenetics/pgscatalog/combine/tests/main.nf.test
rename to modules/UMCUGenetics/pgscatalog/combine/tests/main.nf.test
diff --git a/UMCUGenetics/pgscatalog/match/main.nf b/modules/UMCUGenetics/pgscatalog/match/main.nf
similarity index 100%
rename from UMCUGenetics/pgscatalog/match/main.nf
rename to modules/UMCUGenetics/pgscatalog/match/main.nf
diff --git a/UMCUGenetics/pgscatalog/match/tests/main.nf.test b/modules/UMCUGenetics/pgscatalog/match/tests/main.nf.test
similarity index 100%
rename from UMCUGenetics/pgscatalog/match/tests/main.nf.test
rename to modules/UMCUGenetics/pgscatalog/match/tests/main.nf.test