154 changes: 70 additions & 84 deletions EVALUATION.md
@@ -3,9 +3,11 @@
## Overview

The evaluation pipeline scores segmentation submissions against ground truth data.
Labels are classified as either **instance** (e.g. mito, nuc, ves) or **semantic**
segmentation and scored with different metrics accordingly. Results are aggregated
across all crops and labels into a single overall score.
All labels — both **instance** ("thing") classes (e.g. `mito`, `nuc`, `ves`) and
**semantic** ("stuff") classes — are evaluated using **Panoptic Quality (PQ)**
accumulators (TP / FP / FN / sum\_IoU). Per-category PQ / SQ / RQ are
micro-averaged across crops; the final overall score is the unweighted mean PQ
across all categories.
**Copilot AI** commented on lines 5 to +10 (Apr 9, 2026):

> This PR is titled/positioned as a docs update, but it also changes evaluation behavior and public outputs (matching, scoring, aggregation, config) and modifies tests. Consider updating the PR title/description to reflect the included code changes (or splitting docs vs. behavior changes) to avoid surprising reviewers and release notes.
## Flowchart

@@ -34,13 +36,9 @@ flowchart TD
ScoreMissing --> Aggregate
ArgTuples --> Parallel

subgraph Parallel[Parallel Scoring]
subgraph Parallel["Parallel Scoring (single ProcessPoolExecutor)"]
direction TB
Route{Label type?}
Route -- Instance class --> InstPool["ProcessPoolExecutor<br/>max_instance_threads workers"]
Route -- Semantic class --> SemPool["ProcessPoolExecutor<br/>max_semantic_threads workers"]
InstPool --> ScoreLabel
SemPool --> ScoreLabel
ScoreLabel
end

subgraph ScoreLabel[score_label]
@@ -64,75 +62,50 @@ flowchart TD
BranchType -- Semantic --> SemanticScoring
end

subgraph InstanceScoring[score_instance]
subgraph InstanceScoring[score_instance — PQ matching]
direction TB
CC["Relabel prediction via<br/>cc3d.connected_components"]
CC --> BinaryMetrics["Compute binary metrics<br/>(IoU, Dice, binary accuracy)"]
CC --> VoI["Compute rand_voi<br/>split & merge errors"]
BinaryMetrics --> Matching
VoI --> Matching

subgraph Matching[match_instances]
direction TB
CountInstances["Count instances<br/>nG = max GT, nP = max pred"]
CountInstances --> SpecialCase{nG=0 or nP=0?}
SpecialCase -- Yes --> EmptyMatch[Return empty mapping]
SpecialCase -- No --> RatioCheck

RatioCheck["Check pred/GT ratio<br/>vs dynamic cutoff"]
RatioCheck --> RatioOK{Ratio OK?}
RatioOK -- No --> TooMany([TooManyInstancesError])
RatioOK -- Yes --> ComputeOverlaps

ComputeOverlaps["Compute IoU overlaps<br/>between all instance pairs"]
ComputeOverlaps --> EdgeCheck{"Edges within<br/>limit?"}
EdgeCheck -- No --> TooManyEdges([TooManyOverlapEdgesError])
EdgeCheck -- Yes --> MCF

MCF["Solve min-cost flow<br/>OR-Tools SimpleMinCostFlow<br/>1:1 optimal matching"]
MCF --> MCFStatus{Solve status?}
MCFStatus -- Optimal --> ExtractMap["Extract pred_id to gt_id<br/>mapping from flow arcs"]
MCFStatus -- Failed --> MatchFail([MatchingFailedError])
end

Matching --> MatchResult{"Matching<br/>succeeded?"}
MatchResult -- No --> Pathological["Return pathological scores<br/>accuracy=0, combined=0<br/>keep binary & VoI metrics"]
MatchResult -- Yes --> Remap["Remap prediction IDs<br/>to match ground truth IDs"]
Remap --> Hausdorff

subgraph Hausdorff[Hausdorff Distance Computation]
direction TB
GetIDs[Get unique GT instance IDs]
GetIDs --> PerInstance["For each instance (threaded):<br/>Extract ROI bounding box<br/>Compute distance transforms<br/>Calculate Hausdorff distance"]
PerInstance --> Unmatched["Add max_distance for<br/>unmatched predictions"]
end

Hausdorff --> FinalInstance["Compute final instance scores:<br/>accuracy = mean(truth == pred)<br/>hausdorff = mean(distances)<br/>norm_hausdorff = mean(normalized)<br/>combined = sqrt(accuracy * norm_hausdorff)"]
CC --> RatioCheck["Check pred/GT instance ratio<br/>vs dynamic cutoff"]
RatioCheck --> RatioOK{Ratio OK?}
RatioOK -- No --> SkipPQ["Return worst-case accumulators<br/>tp=0, fp=nP, fn=nG, sum_iou=0<br/>status: skipped_too_many_instances"]
RatioOK -- Yes --> ComputeOverlaps

ComputeOverlaps["Compute sparse IoU overlaps<br/>between all instance pairs"]
ComputeOverlaps --> EdgeCheck{"Edges within<br/>limit?"}
EdgeCheck -- No --> SkipPQ
EdgeCheck -- Yes --> GreedyMatch

GreedyMatch["Greedy matching:<br/>keep pairs with IoU > 0.5<br/>match in descending-IoU order<br/>(provably optimal at threshold 0.5)"]
GreedyMatch --> PQAccum["Accumulate TP, FP, FN, sum_IoU"]
end

subgraph SemanticScoring[score_semantic]
subgraph SemanticScoring[score_semantic — PQ binary]
direction TB
Binarize["Binarize both arrays<br/>(threshold > 0)"]
Binarize --> EmptyCheck{Both empty?}
EmptyCheck -- Yes --> Perfect[All scores = 1.0]
EmptyCheck -- No --> SemMetrics["Compute metrics:<br/>IoU = jaccard_score<br/>Dice = 1 - dice distance<br/>binary_accuracy = mean match"]
BothEmpty{Both GT and<br/>pred empty?}
BothEmpty -- Yes --> ZeroAccum["tp=0, fp=0, fn=0, sum_iou=0"]
BothEmpty -- No --> ComputeIoU["Compute binary IoU of<br/>the single collapsed segment"]
ComputeIoU --> IoUCheck{"IoU > 0.5?"}
IoUCheck -- Yes --> SemanticTP["tp=1, fp=0, fn=0, sum_iou=IoU"]
IoUCheck -- No --> SemanticFPFN["tp=0, fp=1, fn=1, sum_iou=0"]
end

InstanceScoring --> AddMeta
SemanticScoring --> AddMeta
InstanceScoring --> DerivePQ
SemanticScoring --> DerivePQ
EmptyScore --> AddMeta
DerivePQ["Derive per-crop PQ/SQ/RQ<br/>from accumulators"]
DerivePQ --> AddMeta
AddMeta["Add metadata:<br/>num_voxels, voxel_size,<br/>is_missing flag"]

Parallel --> Aggregate

subgraph Aggregate[Aggregate & Save Results]
direction TB
Collect["Collect all<br/>crop/label results"]
Collect --> CombineLabels["Combine per-label scores<br/>across crops, weighted<br/>by voxel count"]
CombineLabels --> OverallInstance["Overall Instance Score =<br/>voxel-weighted mean of<br/>combined_score across<br/>instance labels"]
CombineLabels --> OverallSemantic["Overall Semantic Score =<br/>voxel-weighted mean of<br/>IoU across semantic labels"]
OverallInstance --> OverallScore["Overall Score =<br/>sqrt(instance * semantic)<br/>(geometric mean)"]
OverallSemantic --> OverallScore
Collect --> MicroAvg["Micro-average per category:<br/>sum TP/FP/FN/sum_IoU across crops<br/>compute PQ_c / SQ_c / RQ_c"]
MicroAvg --> OverallThing["overall_thing_pq =<br/>unweighted mean PQ<br/>over instance classes"]
MicroAvg --> OverallStuff["overall_stuff_pq =<br/>unweighted mean PQ<br/>over semantic classes"]
OverallThing --> OverallScore["overall_score =<br/>unweighted mean PQ<br/>over all classes"]
OverallStuff --> OverallScore
OverallScore --> Sanitize["Sanitize scores:<br/>NaN/Inf -> None<br/>numpy types -> Python types"]
Sanitize --> Save["Save results JSON:<br/>- all_scores (with missing)<br/>- submitted_only scores"]
end
Expand All @@ -142,31 +115,44 @@ flowchart TD

## Metrics

### Instance Segmentation
All labels (both instance and semantic) are scored with the same
**Panoptic Quality** framework.

| Metric | Description |
|--------|-------------|
| `accuracy` | Voxel-wise match rate after instance ID alignment |
| `hausdorff_distance` | Mean Hausdorff distance across all matched instances |
| `normalized_hausdorff_distance` | Hausdorff normalized to [0, 1] via exponential decay |
| `combined_score` | `sqrt(accuracy * normalized_hausdorff_distance)` |
| `iou` | Binary foreground IoU (Jaccard index) |
| `dice_score` | Binary foreground Dice coefficient |
| `voi_split` | Variation of Information split error |
| `voi_merge` | Variation of Information merge error |
### Raw accumulators (per crop, per label)

### Semantic Segmentation
| Field | Description |
|-------|-------------|
| `tp` | True positives — matched instance pairs (IoU > 0.5) |
| `fp` | False positives — predicted instances with no GT match |
| `fn` | False negatives — GT instances with no predicted match |
| `sum_iou` | Sum of IoU values for all TP matches |
| `pq` | Per-crop Panoptic Quality = `sum_iou / (TP + 0.5·FP + 0.5·FN)` |
| `sq` | Per-crop Segmentation Quality = `sum_iou / TP` (mean IoU of matched pairs) |
| `rq` | Per-crop Recognition Quality (F1) = `2·TP / (2·TP + FP + FN)` |
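The three derived quantities in the table follow directly from the raw accumulators. A minimal sketch of the arithmetic (the function name is illustrative, not the package's API):

```python
def derive_pq(tp: int, fp: int, fn: int, sum_iou: float) -> dict:
    """Derive PQ/SQ/RQ from raw panoptic accumulators for one crop."""
    denom = tp + 0.5 * fp + 0.5 * fn
    pq = sum_iou / denom if denom > 0 else 0.0
    sq = sum_iou / tp if tp > 0 else 0.0  # mean IoU of matched pairs
    rq = tp / denom if denom > 0 else 0.0  # == 2*tp / (2*tp + fp + fn)
    return {"pq": pq, "sq": sq, "rq": rq}
```

Note that `pq == sq * rq` whenever `tp > 0`, which is the classic Panoptic Quality decomposition into segmentation and recognition quality.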

**Copilot AI** commented on lines +123 to 132 (Apr 9, 2026):

> Docs list per-crop fields tp/fp/fn/sum_iou/pq/sq/rq, but the implementation also emits f1 as a compatibility alias (and uses rq elsewhere). Consider documenting f1 as an alias for rq (or omitting it from outputs) so users know why both keys may appear.
| Metric | Description |
|--------|-------------|
| `iou` | Binary Jaccard index |
| `dice_score` | Binary Dice coefficient |
| `binary_accuracy` | Voxel-wise binary match rate |
For **semantic** labels, each crop contributes at most one GT segment and one
predicted segment, so TP ∈ {0, 1}.

### Per-category scores (`label_scores`)

Accumulators are micro-averaged across all crops for each category:

| Field | Description |
|-------|-------------|
| `pq` | `global_sum_IoU / (global_TP + 0.5·global_FP + 0.5·global_FN)` |
| `sq` | `global_sum_IoU / global_TP` — 0 when `global_TP = 0` |
| `rq` | `global_TP / (global_TP + 0.5·global_FP + 0.5·global_FN)` |
| `tp`, `fp`, `fn`, `sum_iou` | Globally accumulated raw values |
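Micro-averaging amounts to summing the accumulators element-wise across crops before deriving the category-level scores — a sketch under that reading (names are illustrative):

```python
def micro_average(crop_accums: list) -> dict:
    """Sum per-crop accumulators for one category, then derive PQ/SQ/RQ."""
    tp = sum(a["tp"] for a in crop_accums)
    fp = sum(a["fp"] for a in crop_accums)
    fn = sum(a["fn"] for a in crop_accums)
    sum_iou = sum(a["sum_iou"] for a in crop_accums)
    denom = tp + 0.5 * fp + 0.5 * fn
    return {
        "tp": tp, "fp": fp, "fn": fn, "sum_iou": sum_iou,
        "pq": sum_iou / denom if denom else 0.0,
        "sq": sum_iou / tp if tp else 0.0,
        "rq": tp / denom if denom else 0.0,
    }
```

Because the sums happen before the division, crops with many instances naturally carry more weight within a category; the equal weighting only happens later, across categories.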

### Overall scores

### Overall
The per-category PQ scores are combined as an **arithmetic mean across categories**
(not weighted by instance count or voxel volume), so each category contributes equally.

| Metric | Description |
|--------|-------------|
| `overall_instance_score` | Voxel-weighted mean of `combined_score` across instance labels |
| `overall_semantic_score` | Voxel-weighted mean of `iou` across semantic labels |
| `overall_score` | `sqrt(overall_instance_score * overall_semantic_score)` |
| `overall_thing_pq` | Arithmetic mean of `pq` across instance ("thing") categories |
| `overall_stuff_pq` | Arithmetic mean of `pq` across semantic ("stuff") categories |
| `overall_score` | Arithmetic mean of `pq` across **all** categories |
| `overall_instance_score` | Alias for `overall_thing_pq` |
| `overall_semantic_score` | Alias for `overall_stuff_pq` |
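The aggregation step is a plain unweighted mean over category PQ values. A sketch of how the overall fields relate (function and variable names are illustrative; `label_scores` maps category name to its micro-averaged scores):

```python
def overall_scores(label_scores: dict, instance_classes: set) -> dict:
    """Unweighted mean PQ over thing, stuff, and all categories."""
    def mean_pq(names):
        names = list(names)
        return sum(label_scores[n]["pq"] for n in names) / len(names) if names else 0.0

    things = [n for n in label_scores if n in instance_classes]
    stuffs = [n for n in label_scores if n not in instance_classes]
    thing_pq = mean_pq(things)
    stuff_pq = mean_pq(stuffs)
    return {
        "overall_thing_pq": thing_pq,
        "overall_stuff_pq": stuff_pq,
        # Mean over ALL categories, not the mean of thing_pq and stuff_pq —
        # the two differ when thing and stuff counts are unequal.
        "overall_score": mean_pq(label_scores),
        "overall_instance_score": thing_pq,   # alias
        "overall_semantic_score": stuff_pq,   # alias
    }
```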
119 changes: 99 additions & 20 deletions docs/source/evaluation.rst
@@ -7,6 +7,23 @@ Resampling
----------
Before scoring, the predicted volumes are resampled to ensure they are compared to the ground truth at the same resolution and region of interest (ROI). For more details on the resampling process, refer to `evaluation_resampling.rst`.

Evaluation Approach
-------------------

All labels — both instance ("thing") and semantic ("stuff") classes — are evaluated
using **Panoptic Quality (PQ)**, a unified metric that rewards both correct detection
and accurate segmentation.

For each crop and each label, the scorer produces four raw accumulators:

- **TP** (True Positives): matched instance/segment pairs with IoU > 0.5
- **FP** (False Positives): predicted instances/segments with no GT match
- **FN** (False Negatives): GT instances/segments with no predicted match
- **sum\_IoU**: sum of IoU values for all TP pairs

These are micro-averaged across crops to produce per-category PQ, SQ, and RQ scores,
and the final overall score is the unweighted mean PQ across all categories.
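What makes micro-averaging across crops work is that the four accumulators compose by simple addition. A minimal sketch of that property (the class name is illustrative, not the package's API):

.. code-block:: python

    from dataclasses import dataclass

    @dataclass
    class PQAccumulator:
        tp: int = 0
        fp: int = 0
        fn: int = 0
        sum_iou: float = 0.0

        def __add__(self, other: "PQAccumulator") -> "PQAccumulator":
            # Crop-level accumulators merge by element-wise summation.
            return PQAccumulator(
                self.tp + other.tp,
                self.fp + other.fp,
                self.fn + other.fn,
                self.sum_iou + other.sum_iou,
            )

        @property
        def pq(self) -> float:
            denom = self.tp + 0.5 * self.fp + 0.5 * self.fn
            return self.sum_iou / denom if denom else 0.0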

Instance Segmentations
----------------------

@@ -24,31 +41,93 @@ Instance Segmentations
- Vesicle (`ves`)
- Vimentin (`vim`)

- **Scoring Components**:

- **Hausdorff Distance**: The Hausdorff distance is calculated in nanometers between the predicted and ground truth instance segmentations. This metric measures the maximum distance between any point on the predicted instance and its nearest point on the ground truth instance, and vice versa.

- **Accuracy**: The accuracy is calculated as the proportion of correctly predicted instance labels to the total number of instance labels.
- **Scoring Method**:

- **Score Normalization and Combination**:
Predicted labels are first relabeled via connected components. GT and predicted
instances are then matched greedily in descending-IoU order, keeping only pairs
with IoU > 0.5. Because IoU > 0.5 guarantees at most one valid counterpart for
each instance, this greedy matching is provably optimal. The result is a set of
TP/FP/FN/sum\_IoU accumulators for the crop.

- The Hausdorff distance is normalized to a range of [0, 1] using the maximum distance represented by a voxel. Specifically, the normalized Hausdorff distance is :math:`1.01^{-\frac{\text{hausdorff distance}}{\|\text{voxel\_size}\|}}`.

- The combined score is calculated as the geometric mean of the accuracy and the normalized Hausdorff distance.

- The final instance score across volumes is produced by taking the average across the combined scores for each volume, normalized by the total spatial volume of each image.
If the predicted-to-GT instance ratio or the number of overlap edges exceeds
configured limits, the crop is skipped and worst-case accumulators are returned
(``tp=0``, ``fp=nP``, ``fn=nG``, ``sum_iou=0``).
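The matching step described above can be sketched in a few lines. This assumes the candidate IoU pairs are already computed and is a simplification of whatever sparse representation the scorer actually uses (names are illustrative):

.. code-block:: python

    def greedy_match(iou_pairs, n_gt, n_pred):
        """iou_pairs: iterable of (gt_id, pred_id, iou) tuples."""
        matched_gt, matched_pred = set(), set()
        tp, sum_iou = 0, 0.0
        # IoU > 0.5 implies each instance has at most one valid partner,
        # so greedy selection in descending-IoU order is optimal.
        for gt_id, pred_id, iou in sorted(iou_pairs, key=lambda p: -p[2]):
            if iou <= 0.5:
                break  # remaining pairs are below threshold
            if gt_id in matched_gt or pred_id in matched_pred:
                continue
            matched_gt.add(gt_id)
            matched_pred.add(pred_id)
            tp += 1
            sum_iou += iou
        return {"tp": tp, "fp": n_pred - tp, "fn": n_gt - tp, "sum_iou": sum_iou}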

Semantic Segmentations
----------------------

- **All non-instance classes are included as semantic labels**

- **Scoring Components**:

- **Intersection over Union (IoU)**: The IoU is calculated as the intersection of the predicted and ground truth segmentations divided by their union. This metric measures the overlap between the predicted and ground truth segmentations.

- **Dice Score**: The Dice score is calculated as twice the intersection of the predicted and ground truth segmentations divided by the sum of their volumes. This metric measures the similarity between the predicted and ground truth segmentations.

- **Score Normalization and Combination**:

- The IoU scores are combined across all volumes to obtain the final scores, normalized by the total volume occupied by the volumes to which each IoU corresponds.
- **Scoring Method**:

Each semantic label is treated as a single binary segment per crop (one GT
segment, one predicted segment). Their binary IoU is computed; if IoU > 0.5
the crop counts as a TP match, otherwise as both an FP and an FN. This keeps
semantic labels consistent with the same PQ framework used for instance labels.
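The semantic rule above reduces to a small amount of arithmetic on binary masks. A sketch under that reading, using plain 0/1 sequences to stay dependency-free (the function name is illustrative):

.. code-block:: python

    def score_semantic(gt_mask, pred_mask):
        """Binary PQ accumulators for one semantic label in one crop.

        gt_mask / pred_mask: flattened 0/1 voxel masks of equal length.
        """
        if not any(gt_mask) and not any(pred_mask):
            # Both empty: nothing to detect, nothing falsely detected.
            return {"tp": 0, "fp": 0, "fn": 0, "sum_iou": 0.0}
        inter = sum(1 for g, p in zip(gt_mask, pred_mask) if g and p)
        union = sum(1 for g, p in zip(gt_mask, pred_mask) if g or p)
        iou = inter / union if union else 0.0
        if iou > 0.5:
            return {"tp": 1, "fp": 0, "fn": 0, "sum_iou": iou}
        # Below threshold: the collapsed segment counts as both FP and FN.
        return {"tp": 0, "fp": 1, "fn": 1, "sum_iou": 0.0}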

Metrics
-------

Per-Crop Accumulators
~~~~~~~~~~~~~~~~~~~~~

.. list-table::
:header-rows: 1

* - Field
- Description
* - ``tp``
- True positives — matched pairs with IoU > 0.5
* - ``fp``
- False positives — unmatched predicted instances/segments
* - ``fn``
- False negatives — unmatched GT instances/segments
* - ``sum_iou``
- Sum of IoU values for all TP matches
* - ``pq``
- Per-crop Panoptic Quality = ``sum_iou / (TP + 0.5·FP + 0.5·FN)``
* - ``sq``
- Per-crop Segmentation Quality = ``sum_iou / TP`` (mean IoU of matched pairs; 0 when TP=0)
* - ``rq``
- Per-crop Recognition Quality (F1) = ``2·TP / (2·TP + FP + FN)``

Per-Category Scores (``label_scores``)
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

Accumulators are micro-averaged across all crops for each category:

.. list-table::
:header-rows: 1

* - Field
- Description
* - ``pq``
- :math:`\frac{\text{global\_sum\_IoU}}{\text{global\_TP} + 0.5 \cdot \text{global\_FP} + 0.5 \cdot \text{global\_FN}}`
* - ``sq``
- :math:`\frac{\text{global\_sum\_IoU}}{\text{global\_TP}}` — 0 when global TP = 0
* - ``rq``
- :math:`\frac{\text{global\_TP}}{\text{global\_TP} + 0.5 \cdot \text{global\_FP} + 0.5 \cdot \text{global\_FN}}`
* - ``tp``, ``fp``, ``fn``, ``sum_iou``
- Globally accumulated raw values

Overall Scores
~~~~~~~~~~~~~~

The per-category PQ scores are combined as an **arithmetic mean across categories**
(not weighted by instance count or voxel volume), so each category contributes equally.

.. list-table::
:header-rows: 1

* - Metric
- Description
* - ``overall_thing_pq``
- Arithmetic mean of ``pq`` across instance ("thing") categories
* - ``overall_stuff_pq``
- Arithmetic mean of ``pq`` across semantic ("stuff") categories
* - ``overall_score``
- Arithmetic mean of ``pq`` across **all** categories
* - ``overall_instance_score``
- Alias for ``overall_thing_pq``
* - ``overall_semantic_score``
- Alias for ``overall_stuff_pq``
8 changes: 0 additions & 8 deletions src/cellmap_segmentation_challenge/evaluate.py
@@ -12,9 +12,6 @@

# Re-export all public APIs for backward compatibility
from .utils.eval_utils import (
# Types
InstanceScoreDict,
SemanticScoreDict,
# Exceptions
EvaluationError,
TooManyInstancesError,
@@ -26,7 +23,6 @@
CAST_TO_NONE,
MAX_INSTANCE_THREADS,
MAX_SEMANTIC_THREADS,
PER_INSTANCE_THREADS,
MAX_DISTANCE_CAP_EPS,
**Copilot AI** commented on lines 13 to 26 (Apr 9, 2026):

> evaluate.py used to re-export InstanceScoreDict / SemanticScoreDict as part of the public API (__all__). Those exports are removed here, which is a breaking change for any downstream code importing the types from cellmap_segmentation_challenge.evaluate. Consider either keeping backward-compatible aliases (e.g., to the new PQ dict type) or documenting the breaking change in the PR description/release notes.
FINAL_INSTANCE_RATIO_CUTOFF,
INITIAL_INSTANCE_RATIO_CUTOFF,
@@ -68,9 +64,6 @@
)

__all__ = [
# Types
"InstanceScoreDict",
"SemanticScoreDict",
# Exceptions
"EvaluationError",
"TooManyInstancesError",
@@ -82,7 +75,6 @@
"CAST_TO_NONE",
"MAX_INSTANCE_THREADS",
"MAX_SEMANTIC_THREADS",
"PER_INSTANCE_THREADS",
"MAX_DISTANCE_CAP_EPS",
"FINAL_INSTANCE_RATIO_CUTOFF",
"INITIAL_INSTANCE_RATIO_CUTOFF",