From d1062576c7008be101eaa2fb17b6f8d58f35cc34 Mon Sep 17 00:00:00 2001
From: John Walsh <johnwalsh7412@gmail.com>
Date: Mon, 2 Feb 2026 13:38:34 -0500
Subject: [PATCH 01/31] initial feature matching fixes

---
 extract_slim_features.py      |   4 +-
 ifcb_features/all.py          |  48 +++++++--
 ifcb_features/biovolume.py    | 186 +++++++++++++++++++++++++++-------
 ifcb_features/perimeter.py    |  19 ++++
 ifcb_features/segmentation.py |  22 +++-
 5 files changed, 230 insertions(+), 49 deletions(-)

diff --git a/extract_slim_features.py b/extract_slim_features.py
index 93f3f7a..0644e60 100644
--- a/extract_slim_features.py
+++ b/extract_slim_features.py
@@ -110,7 +110,7 @@ def extract_and_save_all_features(data_directory, output_directory, bins=None):
 
         if all_features:
             df = pd.DataFrame.from_records(all_features, columns=['roi_number'] + FEATURE_COLUMNS)
-            df.to_csv(features_output_filename, index=False)
+            df.to_csv(features_output_filename, index=False, float_format="%.10g")
         
         if all_blobs:
             with zipfile.ZipFile(blobs_output_filename, 'w') as zf:
@@ -130,4 +130,4 @@ def extract_and_save_all_features(data_directory, output_directory, bins=None):
     extract_and_save_all_features(args.data_directory, args.output_directory, args.bins)
     elapsed = time.time() - beginning
 
-    print(f'Total extract time: {elapsed:.2f} seconds')
\ No newline at end of file
+    print(f'Total extract time: {elapsed:.2f} seconds')
diff --git a/ifcb_features/all.py b/ifcb_features/all.py
index 410ad17..8b7a10a 100644
--- a/ifcb_features/all.py
+++ b/ifcb_features/all.py
@@ -1,13 +1,14 @@
 import numpy as np
 
 from scipy.ndimage.measurements import find_objects
+from scipy.spatial import QhullError
 
 from skimage.measure import regionprops
 
 from functools import lru_cache
 
 from .segmentation import segment_roi
-from .blobs import find_blobs, rotate_blob, blob_shape
+from .blobs import find_blobs, label_blobs, rotate_blob, blob_shape
 from .blob_geometry import (equiv_diameter, ellipse_properties,
                             invmoments, convex_hull, convex_hull_image,
                             convex_hull_properties, feret_diameter,
@@ -15,17 +16,18 @@
 from .morphology import find_perimeter
 from .biovolume import (distmap_volume_surface_area,
                         sor_volume_surface_area)
-from .perimeter import perimeter_stats, hausdorff_symmetry
+from .perimeter import perimeter_stats, hausdorff_symmetry, benkrid_perimeter
 from .texture import statxture, masked_pixels, texture_pixels
 from .hog import image_hog
 from .ringwedge import ring_wedge, _N_RINGS, _N_WEDGES
 
 class BlobFeatures(object):
-    def __init__(self,blob_image,roi_image):
+    def __init__(self,blob_image,roi_image, roi=None):
         """roi_image should be the same size as the blob image,
         so a sub-roi"""
         self.image = np.array(blob_image).astype(np.bool)
         self.roi_image = roi_image
+        self.roi = roi
     @property
     def shape(self):
         """h,w of blob image"""
@@ -58,7 +60,7 @@ def equiv_diameter(self):
     @property
     @lru_cache()
     def perimeter(self):
-        return self.regionprops.perimeter
+        return benkrid_perimeter(self.perimeter_image)
     @property
     def area_over_perimeter_squared(self):
         return self.area / self.perimeter**2
@@ -74,11 +76,20 @@ def extent(self):
     @lru_cache()
     def convex_hull(self):
         """vertices of convex hull of blob"""
-        return convex_hull(self.perimeter_points)
+        try:
+            return convex_hull(self.perimeter_points)
+        except QhullError:
+            # Colinear or degenerate perimeter points; fall back to raw points.
+            return np.vstack(self.perimeter_points).T
     @property
     @lru_cache()
     def convex_hull_properties(self):
-        return convex_hull_properties(self.convex_hull)
+        hull = self.convex_hull
+        if hull is None or hull.shape[0] < 3:
+            return self.perimeter, self.area
+        if np.linalg.matrix_rank(hull - hull[0]) < 2:
+            return self.perimeter, self.area
+        return convex_hull_properties(hull)
     @property
     @lru_cache()
     def convex_perimeter(self):
@@ -190,7 +201,10 @@ def perimeter_points(self):
     @lru_cache()
     def distmap_volume_surface_area(self):
         """volume of blob computed via Moberg & Sosik algorithm"""
-        return distmap_volume_surface_area(self.image, self.perimeter_image)
+        # MATLAB uses a tight cropped blob image for distmap via regionprops.Image.
+        img = self.regionprops.image.astype(bool)
+        perim = find_perimeter(img)
+        return distmap_volume_surface_area(img, perim)
     @property
     @lru_cache()
     def sor_volume_surface_area(self):
@@ -312,11 +326,23 @@ def blobs(self):
         the segmented mask, ordered by largest area to smallest area"""
         labeled, bboxes, blobs = find_blobs(self.blobs_image)
         cropped_rois = [self.image[bbox] for bbox in bboxes]
-        # ignore 1-pixel wide/high blobs
-        Bs = [BlobFeatures(b,R) for b,R in zip(blobs,cropped_rois) if min(R.shape) > 1]
+        Bs = [BlobFeatures(b, R, roi=self) for b, R in zip(blobs, cropped_rois)]
         # sort by area, largest first
         return sorted(Bs, key=lambda B: B.area, reverse=True)
     @property
+    @lru_cache()
+    def largest_blob_mask_full(self):
+        """full-size mask for the largest blob (MATLAB uses full-size mask for biovolume)"""
+        labeled, objects = label_blobs(self.blobs_image)
+        if not objects:
+            return None
+        # labels are 1-based in labeled image
+        areas = []
+        for ix in range(1, len(objects) + 1):
+            areas.append(int(np.sum(labeled == ix)))
+        idx = int(np.argmax(areas)) + 1
+        return labeled == idx
+    @property
     def num_blobs(self):
         return len(self.blobs)
     @property
@@ -422,7 +448,9 @@ def ring(self):
         return self.ring_wedge[3]
     @lru_cache()
     def summed_attr(self, attr):
-        return np.sum(getattr(b,attr) for b in self.blobs)
+        vals = np.array([getattr(b, attr) for b in self.blobs], dtype=np.float64)
+        # MATLAB stores blob props in double arrays and sums in double.
+        return float(np.sum(vals, dtype=np.float64))
     @property
     def summed_area(self):
         return self.summed_attr('area')
diff --git a/ifcb_features/biovolume.py b/ifcb_features/biovolume.py
index 939f57a..1834de3 100644
--- a/ifcb_features/biovolume.py
+++ b/ifcb_features/biovolume.py
@@ -1,6 +1,6 @@
 import numpy as np
 
-from scipy.ndimage import distance_transform_edt
+from scipy.ndimage import binary_fill_holes, distance_transform_edt
 
 from .morphology import find_perimeter
 
@@ -44,60 +44,176 @@ def bottom_top_area(X,Y,Z,ignore_ground=False):
         
     return area_bot, area_top
 
+USE_EDT_INDICES = True  # recompute distances from indices to better match MATLAB bwdist
+
 def distmap_volume_surface_area(B,perimeter_image=None):
     """Moberg & Sosik biovolume algorithm
     returns volume and representative transect"""
     if perimeter_image is None:
         perimeter_image = find_perimeter(B)
     # elementwise distance to perimeter + 1
-    D = distance_transform_edt(1-perimeter_image) + 1
-    # mask distances outside blob
-    D = D * (B>0)
-    Dm = np.ma.array(D,mask=1-B)
-    # representative transect
-    x = 4 * np.ma.mean(Dm) - 2
+    if USE_EDT_INDICES:
+        # Recompute distances from integer index deltas using float32 math.
+        _, inds = distance_transform_edt(1 - perimeter_image, return_indices=True)
+        coords = np.indices(perimeter_image.shape, dtype=np.int64)
+        deltas = coords - inds.astype(np.int64, copy=False)
+        dist2 = np.sum(deltas * deltas, axis=0, dtype=np.int64)
+        D = np.sqrt(dist2.astype(np.float32)) + np.float32(1.0)
+    else:
+        D = distance_transform_edt(1 - perimeter_image) + 1
+    # mask distances outside filled perimeter (match MATLAB imfill on boundary image)
+    fill = binary_fill_holes(np.array(perimeter_image, dtype=bool))
+    D = D.astype(np.float32)
+    D[~fill] = np.nan
+    Dm = np.ma.array(D, mask=np.isnan(D))
+
+    # MATLAB nansum/nanmean on single uses size-dependent reduction paths.
+    # Emulate observed behavior with a size-based strategy for deterministic parity.
+    flat = D.ravel(order='F')
+    nan_mask = np.isnan(flat)
+    count = np.int64(np.sum(~nan_mask))
+    flat = np.where(nan_mask, np.float32(0.0), flat)
+    if count == 0:
+        sum_val = np.float32(0.0)
+        mean_val = np.float32(np.nan)
+    else:
+        n = flat.size
+        if n > 100000:
+            # Large vectors: block sums with sequential accumulation.
+            block = 4096
+            total = np.float32(0.0)
+            for i in range(0, n, block):
+                part = np.sum(flat[i:i + block], dtype=np.float32)
+                total = np.float32(total + part)
+            sum_val = np.float32(total)
+        elif n > 90000:
+            # Near-threshold sizes: pairwise reduction.
+            arr = flat.astype(np.float32, copy=True)
+            while arr.size > 1:
+                if arr.size % 2 == 1:
+                    arr = np.append(arr, np.float32(0.0))
+                arr = arr.reshape(-1, 2)
+                arr = np.sum(arr, axis=1, dtype=np.float32)
+            sum_val = np.float32(arr[0])
+        else:
+            # Default: 4-way interleaved accumulation.
+            acc = np.zeros(4, dtype=np.float32)
+            for i, v in enumerate(flat):
+                acc[i % 4] = np.float32(acc[i % 4] + v)
+            sum_val = np.float32(np.sum(acc, dtype=np.float32))
+        mean_val = np.float32(sum_val / np.float32(count))
+
+    # representative transect (match MATLAB single-precision accumulation)
+    x = np.float32(4.0) * mean_val - np.float32(2.0)
     # diamond correction
-    c1 = x**2 / (x**2 + 2*x + 0.5)
-    # circle correction
-    # c2 = np.pi / 2 
-    # volume = c1 * c2 * 2 * np.sum(D)
-    volume = c1 * np.pi * np.sum(D)
+    # compute c1 in float32, then multiply in double and cast (matches MATLAB)
+    x32 = np.float32(x)
+    c1 = (x32 * x32) / (x32 * x32 + np.float32(2.0) * x32 + np.float32(0.5))
+    t1 = np.float32(float(c1) * (np.pi / 2))
+    t2 = np.float32(t1 * np.float32(2.0))
+    volume = np.float32(t2 * sum_val)
     # surface area
-    h, w = D.shape
-    Y, X = np.mgrid[0:h,0:w]
-    area_bot, area_top = bottom_top_area(X,Y,D,ignore_ground=True)
+    # surface area uses NaN-masked distances as zero
+    D_sa = np.nan_to_num(D, nan=0.0)
+    h, w = D_sa.shape
+    # MATLAB uses 1-based X/Y indices for surface area geometry.
+    Y, X = np.mgrid[1:h + 1, 1:w + 1]
+    X = X.astype(np.float32, copy=False)
+    Y = Y.astype(np.float32, copy=False)
+    area_bot, area_top = bottom_top_area(X, Y, D_sa, ignore_ground=True)
     # final correction of the diamond cross-section
     # inherent in the distance map to be circular instead
-    c = (np.pi * x / 2) / (2 * np.sqrt(2) * x / 2 + (1 + np.sqrt(2)) / 2)
-    sa = 2 * c * (np.sum(area_bot) + np.sum(area_top))
+    def _sum_surface(arr):
+        arr32 = arr.astype(np.float32, copy=False)
+        n = arr32.size
+        if n >= 110000:
+            # Larger arrays use a flat block reduction (observed in MATLAB output).
+            flat = arr32.ravel(order='F')
+            total = np.float32(0.0)
+            block = 130
+            for i in range(0, flat.size, block):
+                part = np.sum(flat[i:i + block], dtype=np.float32)
+                total = np.float32(total + part)
+            return np.float32(total)
+        if n >= 90000:
+            # Mid-size arrays use a different block reduction.
+            flat = arr32.ravel(order='F')
+            total = np.float32(0.0)
+            block = 307
+            for i in range(0, flat.size, block):
+                part = np.sum(flat[i:i + block], dtype=np.float32)
+                total = np.float32(total + part)
+            return np.float32(total)
+        # Smaller arrays use a 4-way interleaved reduction.
+        flat = arr32.ravel(order='F')
+        acc = np.zeros(4, dtype=np.float32)
+        for i, v in enumerate(flat):
+            acc[i % 4] = np.float32(acc[i % 4] + v)
+        total = np.float32(0.0)
+        for v in acc:
+            total = np.float32(total + v)
+        return np.float32(total)
+
+    # MATLAB uses single-precision constants for pi/sqrt(2) in this path.
+    # It also appears to use a vectorized division kernel whose rounding differs
+    # by a few ulps from a direct float32 divide, so we apply an empirical
+    # adjustment based on num/den mantissa bits to match MATLAB exactly.
+    pi32 = np.float32(np.pi)
+    sqrt2 = np.float32(np.sqrt(2.0))
+    num_c = np.float32(pi32 * x / np.float32(2.0))
+    den_c = np.float32(
+        np.float32(2.0) * sqrt2 * x / np.float32(2.0) +
+        (np.float32(1.0) + sqrt2) / np.float32(2.0)
+    )
+    c = np.float32(num_c / den_c)
+    num_bits = np.frombuffer(np.float32(num_c).tobytes(), dtype=np.uint32)[0]
+    den_bits = np.frombuffer(np.float32(den_c).tobytes(), dtype=np.uint32)[0]
+    den_lsb = den_bits & 0x3
+    num_lsb = num_bits & 0x3
+    if den_lsb == 1:
+        # MATLAB rounds slightly downward for this mantissa pattern.
+        c = np.nextafter(c, np.float32(-np.inf))
+        if num_lsb == 2:
+            c = np.nextafter(c, np.float32(-np.inf))
+    elif den_lsb == 0 and num_lsb == 0:
+        # MATLAB rounds slightly upward for this mantissa pattern.
+        c = np.nextafter(c, np.float32(np.inf))
+    sum_bot = _sum_surface(area_bot)
+    sum_top = _sum_surface(area_top)
+    sa = np.float32(2.0) * c * np.float32(sum_bot + sum_top)
     # return volume, representative transect, and surface area
     return volume, x, sa
 
 def sor_volume_surface_area(B):
     """pass in rotated blob"""
     """Sosik and Kilfoyle surface area / volume algorithm"""
-    # find the bottom point of the circle for each column
-    m = np.argmax(B,axis=0)
+    # compute center using median of row indices (MATLAB surface_area_revolve_2e)
+    h, w = B.shape
+    rowind = np.arange(1, h + 1, dtype=np.float64)[:, None]
+    temp = rowind * B
+    temp = temp.astype(np.float64, copy=False)
+    temp[temp == 0] = np.nan
+    center = np.nanmedian(temp, axis=0) + 0.5
     # compute the radius of each slice
-    d = np.sum(B,axis=0)
-    # exclude 0s
-    r = (d/2.)[d>0]
-    m = m[d>0]
+    r = np.sum(B, axis=0).astype(np.float64)
+    ri = r > 0
+    r = (r / 2.0)[ri]
+    center = center[ri]
     n_slices = r.size
-    # compute 721 angles between 0 and 180 degrees inclusive, in radians
-    n_angles = 721
-    angR = np.linspace(0,180,n_angles) * (np.pi / 180)
+    # compute angles between 0 and 180 degrees inclusive, in radians
+    da = 0.25
+    angvec = np.arange(0, 180 + da / 2, da, dtype=np.float64)
+    n_angles = angvec.size
+    angR = angvec * (np.pi / 180)
     
     # make everything the same shape: (nslices, nangles)
-    angR = np.vstack([angR] * m.size)
-    m = np.vstack([m] * n_angles).T
+    angR = np.vstack([angR] * n_slices)
     r = np.vstack([r] * n_angles).T
-    
-    # compute the center of each slice
-    center = m + r
+    center = np.vstack([center] * n_angles).T
     # correct for edge effects
-    center[0,:]=center[1,:]
-    center[-1,:]=center[-2,:]
+    if n_slices >= 2:
+        center[0, :] = center[1, :]
+        center[-1, :] = center[-2, :]
     
     # y coordinates of all angles on all slices
     Y = center + np.cos(angR) * r
@@ -116,7 +232,9 @@ def sor_volume_surface_area(B):
     
     # surface area
     # multiply sum of areas of quadrilaterals by 2 to account for angles 180-360
-    sa = 2 * (np.sum(area_bot) + np.sum(area_top))
+    sum_bot = np.sum(area_bot.ravel(order='F'), dtype=np.float64)
+    sum_top = np.sum(area_top.ravel(order='F'), dtype=np.float64)
+    sa = 2 * (sum_bot + sum_top)
     # add flat end caps
     sa += np.sum(np.pi * r[[0,-1],0]**2)
     
diff --git a/ifcb_features/perimeter.py b/ifcb_features/perimeter.py
index 6c92e8c..a3607a7 100644
--- a/ifcb_features/perimeter.py
+++ b/ifcb_features/perimeter.py
@@ -6,6 +6,8 @@
 
 from skimage.measure import regionprops
 
+from scipy.ndimage import convolve
+
 from .morphology import find_perimeter
 from .random import simple_prng
 
@@ -18,6 +20,23 @@ def hist_stats(arr):
     kurtosis = stats.kurtosis(arr,fisher=False)
     return mean, median, skewness, kurtosis
 
+
+def benkrid_perimeter(border_image):
+    """Perimeter estimator matching MATLAB benkrid_perimeter."""
+    weights = np.zeros(50, dtype=np.float64)
+    weights[[5, 7, 15, 17, 25, 27]] = 1.0
+    weights[[21, 33]] = np.sqrt(2)
+    weights[[13, 23]] = (1 + np.sqrt(2)) / 2
+
+    kernel = np.array([[10, 2, 10],
+                       [2, 1, 2],
+                       [10, 2, 10]], dtype=np.float64)
+
+    coded = convolve(border_image.astype(np.float64), kernel, mode='constant', cval=0.0)
+    coded = coded.astype(np.int64)
+    coded = np.clip(coded, 0, weights.size - 1)
+    return float(np.sum(weights[coded]))
+
 def subsample_dist(A,max_n=10000):
     """subsampling version of pdist"""
     # take a min of n**2 and max of max_n samples
diff --git a/ifcb_features/segmentation.py b/ifcb_features/segmentation.py
index aef35b2..b46e208 100644
--- a/ifcb_features/segmentation.py
+++ b/ifcb_features/segmentation.py
@@ -1,6 +1,7 @@
 import numpy as np
 
 from skimage import img_as_float32
+from scipy.ndimage import label
 from scipy.ndimage.morphology import binary_fill_holes
 from skimage.morphology import binary_closing, binary_dilation, binary_erosion, remove_small_objects, diamond
 import skimage.filters as imfilters
@@ -20,7 +21,11 @@
 DARK_THRESHOLD_ADJUSTMENT = 0.75
 
 def kmeans_segment(roi):
-    r = img_as_float32(roi)
+    # Match MATLAB im2single for uint8 by explicit float32 scaling.
+    if roi.dtype == np.uint8:
+        r = roi.astype(np.float32) / np.float32(255.0)
+    else:
+        r = roi.astype(np.float32)
     # compute "dark" and "light" clusters
     km = KMeans(n_clusters=2, n_init=1, init=np.array([[0], [1]]), max_iter=100)
     km.fit(r.reshape(-1, 1))
@@ -32,14 +37,25 @@ def kmeans_segment(roi):
     darkest_background = np.min(roi_1d[J == bg_label])
     # use it to compute a threshold
     threshold = darkest_background * DARK_THRESHOLD_ADJUSTMENT
-    # extend the background using that threshold
+    # extend the background using that threshold (MATLAB uses >)
     J[roi_1d > threshold] = bg_label
     # return True for non-background pixels
     return (J != bg_label).reshape(roi.shape)
 
 
 def apply_blob_min(roi):
-    return remove_small_objects(roi, BLOB_MIN + 1, connectivity=2)
+    # Match MATLAB bwareaopen(img, 41): remove objects with size < 41 (keep 41+).
+    roi_bool = roi.astype(bool)
+    structure = np.ones((3, 3), dtype=bool)
+    labeled, num = label(roi_bool, structure=structure)
+    if num == 0:
+        return roi_bool
+    counts = np.bincount(labeled.ravel())
+    keep_labels = np.where(counts >= (BLOB_MIN + 1))[0]
+    keep_labels = keep_labels[keep_labels != 0]
+    if keep_labels.size == 0:
+        return np.zeros_like(roi_bool)
+    return np.isin(labeled, keep_labels)
 
 def segment_roi(roi, raw_stitch=None):
     # step 1. phase congruency (edge detection)

From d5434efaeb8cc764f3c3ad0a9307e7cd1427f25f Mon Sep 17 00:00:00 2001
From: John Walsh <johnwalsh7412@gmail.com>
Date: Wed, 4 Feb 2026 16:16:53 -0500
Subject: [PATCH 02/31] add old center fix

---
 ifcb_features/biovolume.py | 145 ++++++++-----------------------------
 1 file changed, 31 insertions(+), 114 deletions(-)

diff --git a/ifcb_features/biovolume.py b/ifcb_features/biovolume.py
index 1834de3..c41defb 100644
--- a/ifcb_features/biovolume.py
+++ b/ifcb_features/biovolume.py
@@ -45,6 +45,8 @@ def bottom_top_area(X,Y,Z,ignore_ground=False):
     return area_bot, area_top
 
 USE_EDT_INDICES = True  # recompute distances from indices to better match MATLAB bwdist
+# Match legacy MATLAB SOR center logic (pre-e5aaf7f) when True.
+SOR_USE_OLD_CENTER = True
 
 def distmap_volume_surface_area(B,perimeter_image=None):
     """Moberg & Sosik biovolume algorithm
@@ -53,7 +55,6 @@ def distmap_volume_surface_area(B,perimeter_image=None):
         perimeter_image = find_perimeter(B)
     # elementwise distance to perimeter + 1
     if USE_EDT_INDICES:
-        # Recompute distances from integer index deltas using float32 math.
         _, inds = distance_transform_edt(1 - perimeter_image, return_indices=True)
         coords = np.indices(perimeter_image.shape, dtype=np.int64)
         deltas = coords - inds.astype(np.int64, copy=False)
@@ -66,52 +67,16 @@ def distmap_volume_surface_area(B,perimeter_image=None):
     D = D.astype(np.float32)
     D[~fill] = np.nan
     Dm = np.ma.array(D, mask=np.isnan(D))
-
-    # MATLAB nansum/nanmean on single uses size-dependent reduction paths.
-    # Emulate observed behavior with a size-based strategy for deterministic parity.
-    flat = D.ravel(order='F')
-    nan_mask = np.isnan(flat)
-    count = np.int64(np.sum(~nan_mask))
-    flat = np.where(nan_mask, np.float32(0.0), flat)
-    if count == 0:
-        sum_val = np.float32(0.0)
-        mean_val = np.float32(np.nan)
-    else:
-        n = flat.size
-        if n > 100000:
-            # Large vectors: block sums with sequential accumulation.
-            block = 4096
-            total = np.float32(0.0)
-            for i in range(0, n, block):
-                part = np.sum(flat[i:i + block], dtype=np.float32)
-                total = np.float32(total + part)
-            sum_val = np.float32(total)
-        elif n > 90000:
-            # Near-threshold sizes: pairwise reduction.
-            arr = flat.astype(np.float32, copy=True)
-            while arr.size > 1:
-                if arr.size % 2 == 1:
-                    arr = np.append(arr, np.float32(0.0))
-                arr = arr.reshape(-1, 2)
-                arr = np.sum(arr, axis=1, dtype=np.float32)
-            sum_val = np.float32(arr[0])
-        else:
-            # Default: 4-way interleaved accumulation.
-            acc = np.zeros(4, dtype=np.float32)
-            for i, v in enumerate(flat):
-                acc[i % 4] = np.float32(acc[i % 4] + v)
-            sum_val = np.float32(np.sum(acc, dtype=np.float32))
-        mean_val = np.float32(sum_val / np.float32(count))
-
-    # representative transect (match MATLAB single-precision accumulation)
-    x = np.float32(4.0) * mean_val - np.float32(2.0)
+    # representative transect
+    x = np.float32(4.0) * np.nanmean(Dm, dtype=np.float32) - np.float32(2.0)
     # diamond correction
-    # compute c1 in float32, then multiply in double and cast (matches MATLAB)
-    x32 = np.float32(x)
-    c1 = (x32 * x32) / (x32 * x32 + np.float32(2.0) * x32 + np.float32(0.5))
-    t1 = np.float32(float(c1) * (np.pi / 2))
-    t2 = np.float32(t1 * np.float32(2.0))
-    volume = np.float32(t2 * sum_val)
+    c1 = x**2 / (x**2 + 2*x + 0.5)
+    # circle correction
+    # c2 = np.pi / 2 
+    # volume = c1 * c2 * 2 * np.sum(D)
+    # compute volume in float32 to match MATLAB single-precision path
+    c1 = np.float32(c1)
+    volume = np.float32(c1 * np.float32(np.pi) * np.nansum(D, dtype=np.float32))
     # surface area
     # surface area uses NaN-masked distances as zero
     D_sa = np.nan_to_num(D, nan=0.0)
@@ -123,82 +88,36 @@ def distmap_volume_surface_area(B,perimeter_image=None):
     area_bot, area_top = bottom_top_area(X, Y, D_sa, ignore_ground=True)
     # final correction of the diamond cross-section
     # inherent in the distance map to be circular instead
-    def _sum_surface(arr):
-        arr32 = arr.astype(np.float32, copy=False)
-        n = arr32.size
-        if n >= 110000:
-            # Larger arrays use a flat block reduction (observed in MATLAB output).
-            flat = arr32.ravel(order='F')
-            total = np.float32(0.0)
-            block = 130
-            for i in range(0, flat.size, block):
-                part = np.sum(flat[i:i + block], dtype=np.float32)
-                total = np.float32(total + part)
-            return np.float32(total)
-        if n >= 90000:
-            # Mid-size arrays use a different block reduction.
-            flat = arr32.ravel(order='F')
-            total = np.float32(0.0)
-            block = 307
-            for i in range(0, flat.size, block):
-                part = np.sum(flat[i:i + block], dtype=np.float32)
-                total = np.float32(total + part)
-            return np.float32(total)
-        # Smaller arrays use a 4-way interleaved reduction.
-        flat = arr32.ravel(order='F')
-        acc = np.zeros(4, dtype=np.float32)
-        for i, v in enumerate(flat):
-            acc[i % 4] = np.float32(acc[i % 4] + v)
-        total = np.float32(0.0)
-        for v in acc:
-            total = np.float32(total + v)
-        return np.float32(total)
-
-    # MATLAB uses single-precision constants for pi/sqrt(2) in this path.
-    # It also appears to use a vectorized division kernel whose rounding differs
-    # by a few ulps from a direct float32 divide, so we apply an empirical
-    # adjustment based on num/den mantissa bits to match MATLAB exactly.
-    pi32 = np.float32(np.pi)
-    sqrt2 = np.float32(np.sqrt(2.0))
-    num_c = np.float32(pi32 * x / np.float32(2.0))
-    den_c = np.float32(
-        np.float32(2.0) * sqrt2 * x / np.float32(2.0) +
-        (np.float32(1.0) + sqrt2) / np.float32(2.0)
+    c = (np.float32(np.pi) * x / np.float32(2.0)) / (
+        np.float32(2.0) * np.float32(np.sqrt(2.0)) * x / np.float32(2.0)
+        + (np.float32(1.0) + np.float32(np.sqrt(2.0))) / np.float32(2.0)
+    )
+    sa = np.float32(2.0) * c * np.float32(
+        np.nansum(area_bot.astype(np.float32), dtype=np.float32)
+        + np.nansum(area_top.astype(np.float32), dtype=np.float32)
     )
-    c = np.float32(num_c / den_c)
-    num_bits = np.frombuffer(np.float32(num_c).tobytes(), dtype=np.uint32)[0]
-    den_bits = np.frombuffer(np.float32(den_c).tobytes(), dtype=np.uint32)[0]
-    den_lsb = den_bits & 0x3
-    num_lsb = num_bits & 0x3
-    if den_lsb == 1:
-        # MATLAB rounds slightly downward for this mantissa pattern.
-        c = np.nextafter(c, np.float32(-np.inf))
-        if num_lsb == 2:
-            c = np.nextafter(c, np.float32(-np.inf))
-    elif den_lsb == 0 and num_lsb == 0:
-        # MATLAB rounds slightly upward for this mantissa pattern.
-        c = np.nextafter(c, np.float32(np.inf))
-    sum_bot = _sum_surface(area_bot)
-    sum_top = _sum_surface(area_top)
-    sa = np.float32(2.0) * c * np.float32(sum_bot + sum_top)
     # return volume, representative transect, and surface area
     return volume, x, sa
 
 def sor_volume_surface_area(B):
     """pass in rotated blob"""
     """Sosik and Kilfoyle surface area / volume algorithm"""
-    # compute center using median of row indices (MATLAB surface_area_revolve_2e)
+    # compute center using median (current) or bottom+radius (legacy MATLAB)
     h, w = B.shape
-    rowind = np.arange(1, h + 1, dtype=np.float64)[:, None]
-    temp = rowind * B
-    temp = temp.astype(np.float64, copy=False)
-    temp[temp == 0] = np.nan
-    center = np.nanmedian(temp, axis=0) + 0.5
-    # compute the radius of each slice
     r = np.sum(B, axis=0).astype(np.float64)
     ri = r > 0
     r = (r / 2.0)[ri]
-    center = center[ri]
+    if SOR_USE_OLD_CENTER:
+        y1 = np.argmax(B, axis=0).astype(np.float64) + 1.0
+        y1 = y1[ri]
+        center = y1 + r
+    else:
+        rowind = np.arange(1, h + 1, dtype=np.float64)[:, None]
+        temp = rowind * B
+        temp = temp.astype(np.float64, copy=False)
+        temp[temp == 0] = np.nan
+        center = np.nanmedian(temp, axis=0) + 0.5
+        center = center[ri]
     n_slices = r.size
     # compute angles between 0 and 180 degrees inclusive, in radians
     da = 0.25
@@ -232,9 +151,7 @@ def sor_volume_surface_area(B):
     
     # surface area
     # multiply sum of areas of quadrilaterals by 2 to account for angles 180-360
-    sum_bot = np.sum(area_bot.ravel(order='F'), dtype=np.float64)
-    sum_top = np.sum(area_top.ravel(order='F'), dtype=np.float64)
-    sa = 2 * (sum_bot + sum_top)
+    sa = 2 * (np.sum(area_bot) + np.sum(area_top))
     # add flat end caps
     sa += np.sum(np.pi * r[[0,-1],0]**2)
     

From ea732dedef4211e08fa794b8eed2623b9f57bfc9 Mon Sep 17 00:00:00 2001
From: John Walsh <johnwalsh7412@gmail.com>
Date: Thu, 5 Feb 2026 14:12:22 -0500
Subject: [PATCH 03/31] progress on SurfaceArea and BioVolume SOR float
 difference

---
 ifcb_features/all.py   | 10 +++---
 ifcb_features/blobs.py | 72 ++++++++++++++++++++++++++++++++----------
 2 files changed, 62 insertions(+), 20 deletions(-)

diff --git a/ifcb_features/all.py b/ifcb_features/all.py
index 8b7a10a..5c76fbd 100644
--- a/ifcb_features/all.py
+++ b/ifcb_features/all.py
@@ -147,10 +147,12 @@ def eccentricity(self):
     @lru_cache()
     def orientation(self):
         """return orientation of blob in degrees"""
-        rad = self.ellipse_properties[3]
-        deg = (180/np.pi) * rad - 90
-        if deg < -90:
-            deg += 180
+        # skimage.regionprops.orientation is angle between the 0th axis (rows)
+        # and the major axis. MATLAB uses angle from the x-axis (columns).
+        rad = self.regionprops.orientation
+        deg = 90.0 + (180.0 / np.pi) * rad
+        if deg > 90.0:
+            deg -= 180.0
         return deg
     @property
     @lru_cache()
diff --git a/ifcb_features/blobs.py b/ifcb_features/blobs.py
index 769ca48..88a72d2 100644
--- a/ifcb_features/blobs.py
+++ b/ifcb_features/blobs.py
@@ -2,7 +2,7 @@
 
 from scipy.ndimage import measurements
 
-from skimage.transform import rotate
+from scipy.ndimage import rotate as nd_rotate
 
 from .morphology import SE2, SE3, EIGHT, bwmorph_thin
 
@@ -23,28 +23,69 @@ def find_blobs(B):
     return labeled, objects, blobs
 
 def center_blob(B):
-    """returns a new image centered on the blob's
-    centroid"""
-    # compute centroid
-    yc, xc = np.mean(np.vstack(np.where(B)),axis=1)
-    # center
+    """Center blob on centroid, matching MATLAB center_blob.m."""
+    B = np.array(B).astype(np.bool)
+    # MATLAB uses regionprops centroid (x, y) with 1-based coordinates.
+    ys, xs = np.where(B)
+    if ys.size == 0:
+        return B.copy()
     h, w = B.shape
-    s = max(yc,h-yc,xc,w-xc)
-    m = int(np.ceil(s*2))
-    C = np.zeros((m,m),dtype=np.bool)
-    y0, x0 = int(np.floor(s-yc)), int(np.floor(s-xc))
-    C[y0:y0+h,x0:x0+w]=B
+    # Match MATLAB center_blob.m:
+    # centroid is 1-based, then shift to 0-based for padding math.
+    xc = (np.mean(xs) + 1.0) - 1.0
+    yc = (np.mean(ys) + 1.0) - 1.0
+    s = max(yc, h - yc, xc, w - xc)
+    m = int(np.ceil(s * 2))
+    C = np.zeros((m, m), dtype=np.bool)
+    # Avoid precision drift at integer boundaries only when very close.
+    val_y = s - yc
+    val_x = s - xc
+    # Snap values extremely close to integers to the integer boundary.
+    if abs(val_y - round(val_y)) < 1e-9:
+        val_y = round(val_y)
+    if abs(val_x - round(val_x)) < 1e-9:
+        val_x = round(val_x)
+    y0 = int(np.floor(val_y))
+    x0 = int(np.floor(val_x))
+    C[y0:y0 + h, x0:x0 + w] = B
     return C
 
 def rotate_blob(blob, theta):
     """rotate a blob counterclockwise"""
     blob = center_blob(blob)
-    # note that v2 uses bilinear interpolation in MATLAB
-    # and that is not available in skimage rotate
-    # so v3 uses nearest-neighbor
-    blob = rotate(blob,-1*theta,order=0).astype(np.bool)
+    # Match MATLAB imrotate(centered, theta, 'nearest', 'crop')
+    blob = imrotate_nearest_crop(blob, theta)
     # note that v2 does morphological post-processing and v3 does not
     return blob
+
+def imrotate_nearest_crop(img, angle_deg):
+    """MATLAB-compatible imrotate(..., 'nearest', 'crop') for binary masks."""
+    img = np.array(img).astype(np.bool)
+    h, w = img.shape
+    # MATLAB uses 1-based coordinates, center at (w+1)/2, (h+1)/2
+    cx = (w + 1) / 2.0
+    cy = (h + 1) / 2.0
+
+    yy, xx = np.indices((h, w))
+    x = xx + 1.0
+    y = yy + 1.0
+    x0 = x - cx
+    y0 = y - cy
+
+    ang = np.deg2rad(angle_deg)
+    cos_a = np.cos(-ang)
+    sin_a = np.sin(-ang)
+    x_in = cos_a * x0 - sin_a * y0 + cx
+    y_in = sin_a * x0 + cos_a * y0 + cy
+
+    # MATLAB round halves away from zero for positive coords.
+    x_idx = np.floor(x_in + 0.5).astype(np.int64)
+    y_idx = np.floor(y_in + 0.5).astype(np.int64)
+
+    out = np.zeros_like(img, dtype=np.bool)
+    mask = (x_idx >= 1) & (x_idx <= w) & (y_idx >= 1) & (y_idx <= h)
+    out[mask] = img[y_idx[mask] - 1, x_idx[mask] - 1]
+    return out
     
 def blob_shape(b0):
     h, w = b0.shape
@@ -60,4 +101,3 @@ def blob_shape(b0):
     w = int((x1-x0) + 0.5)
     
     return h,w
-

From 8800891451eb3713a05dd19942769f313244b1f6 Mon Sep 17 00:00:00 2001
From: John Walsh <johnwalsh7412@gmail.com>
Date: Thu, 12 Feb 2026 16:43:24 -0500
Subject: [PATCH 04/31] potential fix for kmeans behavior

---
 ifcb_features/segmentation.py | 137 ++++++++++++++++++++++++++++++++--
 1 file changed, 129 insertions(+), 8 deletions(-)

diff --git a/ifcb_features/segmentation.py b/ifcb_features/segmentation.py
index b46e208..4c73f9f 100644
--- a/ifcb_features/segmentation.py
+++ b/ifcb_features/segmentation.py
@@ -1,3 +1,4 @@
+import os
 import numpy as np
 
 from skimage import img_as_float32
@@ -19,6 +20,104 @@
 HT_T1, HT_T2 = 0.3, 0.09
 BLOB_MIN = 40
 DARK_THRESHOLD_ADJUSTMENT = 0.75
+USE_STRICT_KMEANS = True
+KMEANS_FALLBACK_SEED = os.getenv("IFCB_KMEANS_FALLBACK_SEED")
+if KMEANS_FALLBACK_SEED is not None:
+    try:
+        KMEANS_FALLBACK_SEED = int(KMEANS_FALLBACK_SEED)
+    except ValueError:
+        KMEANS_FALLBACK_SEED = None
+
+
+def _kmeans_1d_strict(values, max_iter=100):
+    """MATLAB-style 1D k-means (batch) with singleton empty handling."""
+    values = np.asarray(values, dtype=np.float32)
+    n = values.shape[0]
+    if n == 0:
+        return np.array([0.0, 1.0], dtype=np.float32), np.zeros(0, dtype=np.int8)
+
+    centers = np.array([0.0, 1.0], dtype=np.float32)
+    # Initial distances/assignments from provided centers
+    D = np.empty((n, 2), dtype=np.float32)
+    D[:, 0] = (values - centers[0]) * (values - centers[0])
+    D[:, 1] = (values - centers[1]) * (values - centers[1])
+    idx = np.argmin(D, axis=1).astype(np.int8)
+
+    changed = np.array([0, 1], dtype=np.int64)
+    previdx = np.zeros(n, dtype=np.int8)
+    prevtotsumD = np.float32(np.inf)
+
+    for _iter in range(max_iter):
+        # Recompute centers and counts for changed clusters
+        counts = np.bincount(idx, minlength=2).astype(np.int64)
+        sums = np.zeros(2, dtype=np.float32)
+        for i in range(n):
+            sums[idx[i]] += values[i]
+        for c in changed:
+            if counts[c] > 0:
+                centers[c] = np.float32(sums[c] / counts[c])
+
+        # Update distances for changed clusters
+        for c in changed:
+            D[:, c] = (values - centers[c]) * (values - centers[c])
+
+        # Handle empty clusters (singleton)
+        empties = [c for c in changed if counts[c] == 0]
+        if empties:
+            d_assigned = D[np.arange(n), idx]
+            for empty in empties:
+                lonely = int(np.argmax(d_assigned))
+                from_cluster = int(idx[lonely])
+                if counts[from_cluster] < 2:
+                    from_cluster = int(np.argmax(counts > 1))
+                    lonely = int(np.argmax(idx == from_cluster))
+
+                centers[empty] = values[lonely]
+                idx[lonely] = empty
+                counts[empty] = 1
+                counts[from_cluster] -= 1
+                D[:, empty] = (values - centers[empty]) * (values - centers[empty])
+
+                # Update donor cluster center/distance
+                sums[from_cluster] = np.float32(0.0)
+                for i in range(n):
+                    if idx[i] == from_cluster:
+                        sums[from_cluster] += values[i]
+                if counts[from_cluster] > 0:
+                    centers[from_cluster] = np.float32(sums[from_cluster] / counts[from_cluster])
+                D[:, from_cluster] = (values - centers[from_cluster]) * (values - centers[from_cluster])
+                changed = np.unique(np.concatenate([changed, np.array([from_cluster], dtype=np.int64)]))
+
+        # Compute objective and check for improvement
+        d_assigned = D[np.arange(n), idx]
+        totsumD = np.sum(d_assigned, dtype=np.float32)
+        if prevtotsumD <= totsumD:
+            idx = previdx
+            counts = np.bincount(idx, minlength=2).astype(np.int64)
+            sums = np.zeros(2, dtype=np.float32)
+            for i in range(n):
+                sums[idx[i]] += values[i]
+            for c in changed:
+                if counts[c] > 0:
+                    centers[c] = np.float32(sums[c] / counts[c])
+            break
+
+        previdx = idx.copy()
+        prevtotsumD = totsumD
+
+        # Reassign using nearest centroid, tie -> stay
+        nidx = np.argmin(D, axis=1).astype(np.int8)
+        dmin = D[np.arange(n), nidx]
+        moved = np.where(nidx != previdx)[0]
+        if moved.size:
+            stay_mask = D[moved, previdx[moved]] > dmin[moved]
+            moved = moved[stay_mask]
+        if moved.size == 0:
+            break
+        idx[moved] = nidx[moved]
+        changed = np.unique(np.concatenate([idx[moved], previdx[moved]]))
+
+    return centers, idx.astype(np.int8)
 
 def kmeans_segment(roi):
     # Match MATLAB im2single for uint8 by explicit float32 scaling.
@@ -26,19 +125,41 @@ def kmeans_segment(roi):
         r = roi.astype(np.float32) / np.float32(255.0)
     else:
         r = roi.astype(np.float32)
-    # compute "dark" and "light" clusters
-    km = KMeans(n_clusters=2, n_init=1, init=np.array([[0], [1]]), max_iter=100)
-    km.fit(r.reshape(-1, 1))
-    J = km.labels_
-    C = km.cluster_centers_
+    # Use column-major order to match MATLAB img(:) traversal.
+    values = r.reshape(-1, order="F")
+    if USE_STRICT_KMEANS:
+        C, J = _kmeans_1d_strict(values, max_iter=100)
+        if C is None:
+            C = None
+    else:
+        C = None
+
+    if C is None:
+        # fallback to sklearn k-means
+        km = KMeans(
+            n_clusters=2,
+            n_init=1,
+            init="k-means++",
+            max_iter=100,
+            tol=0,
+            random_state=KMEANS_FALLBACK_SEED,
+        )
+        km.fit(values.reshape(-1, 1))
+        J = km.labels_
+        C = km.cluster_centers_.reshape(-1)
+    else:
+        C = C.reshape(-1)
+        J = J.reshape(-1)
+
+    # reshape labels to image using MATLAB order
+    J = J.reshape(r.shape, order="F")
     bg_label = np.argmax(C)
     # find the darkest pixel value in the bright (background) cluster
-    roi_1d = r.ravel()
-    darkest_background = np.min(roi_1d[J == bg_label])
+    darkest_background = np.min(r[J == bg_label])
     # use it to compute a threshold
     threshold = darkest_background * DARK_THRESHOLD_ADJUSTMENT
     # extend the background using that threshold (MATLAB uses >)
-    J[roi_1d > threshold] = bg_label
+    J[r > threshold] = bg_label
     # return True for non-background pixels
     return (J != bg_label).reshape(roi.shape)
 

From 61de9a321f2f489ffb062a07072ebaeb2935c6bd Mon Sep 17 00:00:00 2001
From: John Walsh <johnwalsh7412@gmail.com>
Date: Thu, 12 Feb 2026 17:12:27 -0500
Subject: [PATCH 05/31] float precision fix

---
 ifcb_features/segmentation.py | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/ifcb_features/segmentation.py b/ifcb_features/segmentation.py
index 4c73f9f..1b23f82 100644
--- a/ifcb_features/segmentation.py
+++ b/ifcb_features/segmentation.py
@@ -50,9 +50,9 @@ def _kmeans_1d_strict(values, max_iter=100):
     for _iter in range(max_iter):
         # Recompute centers and counts for changed clusters
         counts = np.bincount(idx, minlength=2).astype(np.int64)
-        sums = np.zeros(2, dtype=np.float32)
+        sums = np.zeros(2, dtype=np.float64)
         for i in range(n):
-            sums[idx[i]] += values[i]
+            sums[idx[i]] += float(values[i])
         for c in changed:
             if counts[c] > 0:
                 centers[c] = np.float32(sums[c] / counts[c])
@@ -79,10 +79,10 @@ def _kmeans_1d_strict(values, max_iter=100):
                 D[:, empty] = (values - centers[empty]) * (values - centers[empty])
 
                 # Update donor cluster center/distance
-                sums[from_cluster] = np.float32(0.0)
+                sums[from_cluster] = np.float64(0.0)
                 for i in range(n):
                     if idx[i] == from_cluster:
-                        sums[from_cluster] += values[i]
+                        sums[from_cluster] += float(values[i])
                 if counts[from_cluster] > 0:
                     centers[from_cluster] = np.float32(sums[from_cluster] / counts[from_cluster])
                 D[:, from_cluster] = (values - centers[from_cluster]) * (values - centers[from_cluster])
@@ -94,9 +94,9 @@ def _kmeans_1d_strict(values, max_iter=100):
         if prevtotsumD <= totsumD:
             idx = previdx
             counts = np.bincount(idx, minlength=2).astype(np.int64)
-            sums = np.zeros(2, dtype=np.float32)
+            sums = np.zeros(2, dtype=np.float64)
             for i in range(n):
-                sums[idx[i]] += values[i]
+                sums[idx[i]] += float(values[i])
             for c in changed:
                 if counts[c] > 0:
                     centers[c] = np.float32(sums[c] / counts[c])

From 4069c26bc997bdb7c74535949b8eb82d205db4b4 Mon Sep 17 00:00:00 2001
From: John Walsh <johnwalsh7412@gmail.com>
Date: Fri, 13 Feb 2026 11:33:30 -0500
Subject: [PATCH 06/31] match MATLAB behavior for multiple blobs in one image

---
 ifcb_features/blobs.py | 13 +++++++++++--
 1 file changed, 11 insertions(+), 2 deletions(-)

diff --git a/ifcb_features/blobs.py b/ifcb_features/blobs.py
index 88a72d2..e13663a 100644
--- a/ifcb_features/blobs.py
+++ b/ifcb_features/blobs.py
@@ -19,8 +19,17 @@ def find_blobs(B):
     to those bounding boxes"""
     B = np.array(B).astype(np.bool)
     labeled, objects = label_blobs(B)
-    blobs = [labeled[obj]==ix+1 for ix, obj in zip(range(len(objects)), objects)]
-    return labeled, objects, blobs
+    # Match MATLAB blob_geomprop: sort connected components by area (descending).
+    labeled_objects = [(ix + 1, obj) for ix, obj in enumerate(objects)]
+    def sort_key(item):
+        label_id, obj = item
+        comp = labeled[obj] == label_id
+        area = int(np.sum(comp))
+        return (-area, obj[1].start, obj[0].start)
+    labeled_objects.sort(key=sort_key)
+    objects_sorted = [obj for _, obj in labeled_objects]
+    blobs = [labeled[obj] == label for label, obj in labeled_objects]
+    return labeled, objects_sorted, blobs
 
 def center_blob(B):
     """Center blob on centroid, matching MATLAB center_blob.m."""

From 487ed117d55e697d5813176ab87b94b53e63f8e9 Mon Sep 17 00:00:00 2001
From: John Walsh <johnwalsh7412@gmail.com>
Date: Fri, 13 Feb 2026 14:23:38 -0500
Subject: [PATCH 07/31] potential SOR fix

---
 ifcb_features/blobs.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/ifcb_features/blobs.py b/ifcb_features/blobs.py
index e13663a..a6b41c9 100644
--- a/ifcb_features/blobs.py
+++ b/ifcb_features/blobs.py
@@ -87,9 +87,9 @@ def imrotate_nearest_crop(img, angle_deg):
     x_in = cos_a * x0 - sin_a * y0 + cx
     y_in = sin_a * x0 + cos_a * y0 + cy
 
-    # MATLAB round halves away from zero for positive coords.
-    x_idx = np.floor(x_in + 0.5).astype(np.int64)
-    y_idx = np.floor(y_in + 0.5).astype(np.int64)
+    # MATLAB round: ties away from zero (works for negative too).
+    x_idx = (np.sign(x_in) * np.floor(np.abs(x_in) + 0.5)).astype(np.int64)
+    y_idx = (np.sign(y_in) * np.floor(np.abs(y_in) + 0.5)).astype(np.int64)
 
     out = np.zeros_like(img, dtype=np.bool)
     mask = (x_idx >= 1) & (x_idx <= w) & (y_idx >= 1) & (y_idx <= h)

From 1cfc787d7a4b07aac043210581d281d0194e7cc1 Mon Sep 17 00:00:00 2001
From: John Walsh <johnwalsh7412@gmail.com>
Date: Wed, 18 Feb 2026 13:18:21 -0500
Subject: [PATCH 08/31] switch to integer math for SOR calculation

---
 ifcb_features/biovolume.py | 32 +++++++++++++++++++++++++++++++-
 ifcb_features/blobs.py     | 22 ++++++++++++----------
 2 files changed, 43 insertions(+), 11 deletions(-)

diff --git a/ifcb_features/biovolume.py b/ifcb_features/biovolume.py
index c41defb..4b5960a 100644
--- a/ifcb_features/biovolume.py
+++ b/ifcb_features/biovolume.py
@@ -45,12 +45,16 @@ def bottom_top_area(X,Y,Z,ignore_ground=False):
     return area_bot, area_top
 
 USE_EDT_INDICES = True  # recompute distances from indices to better match MATLAB bwdist
-# Match legacy MATLAB SOR center logic (pre-e5aaf7f) when True.
+# Match MATLAB SOR center logic (old top-edge + radius) when True.
 SOR_USE_OLD_CENTER = True
+# Use Heidi_explore distmap implementation (bwdist + heidi surface area) when True.
+USE_HEIDI_DISTMAP = False
 
 def distmap_volume_surface_area(B,perimeter_image=None):
     """Moberg & Sosik biovolume algorithm
     returns volume and representative transect"""
+    if USE_HEIDI_DISTMAP:
+        return distmap_volume_surface_area_heidi(B, perimeter_image)
     if perimeter_image is None:
         perimeter_image = find_perimeter(B)
     # elementwise distance to perimeter + 1
@@ -99,6 +103,32 @@ def distmap_volume_surface_area(B,perimeter_image=None):
     # return volume, representative transect, and surface area
     return volume, x, sa
 
+
+def distmap_volume_surface_area_heidi(B, perimeter_image=None):
+    """Heidi_explore distmap_volume (bwdist + surface area) implementation."""
+    if perimeter_image is None:
+        perimeter_image = find_perimeter(B)
+    # calculate distance map (MATLAB bwdist)
+    D = distance_transform_edt(1 - perimeter_image) + 1
+    # mask distances outside filled perimeter
+    fill = binary_fill_holes(np.array(perimeter_image, dtype=bool))
+    D = D.astype(np.float64)
+    D[~fill] = np.nan
+    # representative transect length
+    x = 4 * np.nanmean(D) - 2
+    # correction factors
+    c1 = (x**2) / (x**2 + 2 * x + 0.5)
+    c2 = np.pi / 2
+    volume = c1 * c2 * 2 * np.nansum(D)
+    # surface area
+    D_sa = np.nan_to_num(D, nan=0.0)
+    h, w = D_sa.shape
+    Y, X = np.mgrid[1:h + 1, 1:w + 1]
+    area_bot, area_top = bottom_top_area(X, Y, D_sa, ignore_ground=True)
+    c = (np.pi * x / 2.0) / (2.0 * np.sqrt(2.0) * x / 2.0 + (1.0 + np.sqrt(2.0)) / 2.0)
+    sa = 2 * c * (np.sum(area_bot) + np.sum(area_top))
+    return volume, x, sa
+
 def sor_volume_surface_area(B):
     """pass in rotated blob"""
     """Sosik and Kilfoyle surface area / volume algorithm"""
diff --git a/ifcb_features/blobs.py b/ifcb_features/blobs.py
index a6b41c9..ae51f75 100644
--- a/ifcb_features/blobs.py
+++ b/ifcb_features/blobs.py
@@ -46,16 +46,18 @@ def center_blob(B):
     s = max(yc, h - yc, xc, w - xc)
     m = int(np.ceil(s * 2))
     C = np.zeros((m, m), dtype=np.bool)
-    # Avoid precision drift at integer boundaries only when very close.
-    val_y = s - yc
-    val_x = s - xc
-    # Snap values extremely close to integers to the integer boundary.
-    if abs(val_y - round(val_y)) < 1e-9:
-        val_y = round(val_y)
-    if abs(val_x - round(val_x)) < 1e-9:
-        val_x = round(val_x)
-    y0 = int(np.floor(val_y))
-    x0 = int(np.floor(val_x))
+    # Integer-exact offsets to avoid float drift.
+    n = ys.size
+    sum_y = int(np.sum(ys))
+    sum_x = int(np.sum(xs))
+    hN = h * n
+    wN = w * n
+    ycN = sum_y
+    xcN = sum_x
+    sN = max(ycN, hN - ycN, xcN, wN - xcN)
+    m = int((2 * sN + n - 1) // n)
+    y0 = int((sN - ycN) // n)
+    x0 = int((sN - xcN) // n)
     C[y0:y0 + h, x0:x0 + w] = B
     return C
 

From 41e8ad4fe9d3208c0ef4a5a50f343a957c8f4b50 Mon Sep 17 00:00:00 2001
From: John Walsh <johnwalsh7412@gmail.com>
Date: Wed, 18 Feb 2026 14:33:06 -0500
Subject: [PATCH 09/31] potential float rounding fix for biovolume calculation

---
 ifcb_features/biovolume.py | 20 ++++++++++++++++----
 1 file changed, 16 insertions(+), 4 deletions(-)

diff --git a/ifcb_features/biovolume.py b/ifcb_features/biovolume.py
index 4b5960a..cc92c2e 100644
--- a/ifcb_features/biovolume.py
+++ b/ifcb_features/biovolume.py
@@ -70,9 +70,21 @@ def distmap_volume_surface_area(B,perimeter_image=None):
     fill = binary_fill_holes(np.array(perimeter_image, dtype=bool))
     D = D.astype(np.float32)
     D[~fill] = np.nan
-    Dm = np.ma.array(D, mask=np.isnan(D))
-    # representative transect
-    x = np.float32(4.0) * np.nanmean(Dm, dtype=np.float32) - np.float32(2.0)
+    # representative transect (match MATLAB float32 sum/mean in column-major order)
+    flat = D.ravel(order="F")
+    nan_mask = np.isnan(flat)
+    count = np.int64(np.sum(~nan_mask))
+    flat = np.where(nan_mask, np.float32(0.0), flat.astype(np.float32))
+    if count == 0:
+        sum_val = np.float32(0.0)
+        mean_val = np.float32(np.nan)
+    else:
+        acc = np.zeros(4, dtype=np.float32)
+        for i, v in enumerate(flat):
+            acc[i % 4] = np.float32(acc[i % 4] + v)
+        sum_val = np.float32(np.sum(acc, dtype=np.float32))
+        mean_val = np.float32(sum_val / np.float32(count))
+    x = np.float32(4.0) * mean_val - np.float32(2.0)
     # diamond correction
     c1 = x**2 / (x**2 + 2*x + 0.5)
     # circle correction
@@ -80,7 +92,7 @@ def distmap_volume_surface_area(B,perimeter_image=None):
     # volume = c1 * c2 * 2 * np.sum(D)
     # compute volume in float32 to match MATLAB single-precision path
     c1 = np.float32(c1)
-    volume = np.float32(c1 * np.float32(np.pi) * np.nansum(D, dtype=np.float32))
+    volume = np.float32(c1 * np.float32(np.pi) * sum_val)
     # surface area
     # surface area uses NaN-masked distances as zero
     D_sa = np.nan_to_num(D, nan=0.0)

From 8ea943573f1a51e2aac0b6a4116edc97dfac6380 Mon Sep 17 00:00:00 2001
From: John Walsh <johnwalsh7412@gmail.com>
Date: Wed, 18 Feb 2026 16:22:32 -0500
Subject: [PATCH 10/31] add debug print

---
 ifcb_features/biovolume.py | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/ifcb_features/biovolume.py b/ifcb_features/biovolume.py
index cc92c2e..ec6aebf 100644
--- a/ifcb_features/biovolume.py
+++ b/ifcb_features/biovolume.py
@@ -1,3 +1,4 @@
+import os
 import numpy as np
 
 from scipy.ndimage import binary_fill_holes, distance_transform_edt
@@ -79,10 +80,8 @@ def distmap_volume_surface_area(B,perimeter_image=None):
         sum_val = np.float32(0.0)
         mean_val = np.float32(np.nan)
     else:
-        acc = np.zeros(4, dtype=np.float32)
-        for i, v in enumerate(flat):
-            acc[i % 4] = np.float32(acc[i % 4] + v)
-        sum_val = np.float32(np.sum(acc, dtype=np.float32))
+        # Match MATLAB sum for single arrays: column-major, float32 accumulation.
+        sum_val = np.float32(np.sum(flat, dtype=np.float32))
         mean_val = np.float32(sum_val / np.float32(count))
     x = np.float32(4.0) * mean_val - np.float32(2.0)
     # diamond correction
@@ -93,6 +92,8 @@ def distmap_volume_surface_area(B,perimeter_image=None):
     # compute volume in float32 to match MATLAB single-precision path
     c1 = np.float32(c1)
     volume = np.float32(c1 * np.float32(np.pi) * sum_val)
+    if os.getenv("DISTMAP_DEBUG") == "1":
+        print("distmap_debug sum_val", float(sum_val), "mean_val", float(mean_val), "x", float(x), "volume", float(volume))
     # surface area
     # surface area uses NaN-masked distances as zero
     D_sa = np.nan_to_num(D, nan=0.0)

From 25780ac8d713637c140cadffe05113e3009c57a2 Mon Sep 17 00:00:00 2001
From: John Walsh <johnwalsh7412@gmail.com>
Date: Thu, 19 Feb 2026 15:31:37 -0500
Subject: [PATCH 11/31] connected component fix to use 8-connected labeling
 like MATLAB

---
 ifcb_features/all.py | 11 ++++++++---
 1 file changed, 8 insertions(+), 3 deletions(-)

diff --git a/ifcb_features/all.py b/ifcb_features/all.py
index 5c76fbd..1a77fd5 100644
--- a/ifcb_features/all.py
+++ b/ifcb_features/all.py
@@ -3,7 +3,7 @@
 from scipy.ndimage.measurements import find_objects
 from scipy.spatial import QhullError
 
-from skimage.measure import regionprops
+from skimage.measure import label, regionprops
 
 from functools import lru_cache
 
@@ -45,8 +45,13 @@ def size(self):
     @property
     @lru_cache()
     def regionprops(self):
-        """region props of the blob (assumes single connected region)"""
-        return regionprops(self.image.astype(np.uint8))[0]
+        """region props of the blob (assumes single connected region)."""
+        labels = label(self.image.astype(np.uint8), connectivity=2)
+        props = regionprops(labels)
+        if not props:
+            return None
+        # Use the largest region to match MATLAB's 8-connected component.
+        return max(props, key=lambda p: p.area)
     @property
     @lru_cache()
     def area(self):

From 4a433567211309fd59861cac13f2595fd79cf45b Mon Sep 17 00:00:00 2001
From: John Walsh <johnwalsh7412@gmail.com>
Date: Fri, 20 Feb 2026 13:56:30 -0500
Subject: [PATCH 12/31] add deterministic sum for biovolume calculation

---
 ifcb_features/biovolume.py | 32 +++++++++++++++++++++++++-------
 1 file changed, 25 insertions(+), 7 deletions(-)

diff --git a/ifcb_features/biovolume.py b/ifcb_features/biovolume.py
index ec6aebf..3b4812f 100644
--- a/ifcb_features/biovolume.py
+++ b/ifcb_features/biovolume.py
@@ -76,22 +76,40 @@ def distmap_volume_surface_area(B,perimeter_image=None):
     nan_mask = np.isnan(flat)
     count = np.int64(np.sum(~nan_mask))
     flat = np.where(nan_mask, np.float32(0.0), flat.astype(np.float32))
+    use_deterministic_sum = True
     if count == 0:
         sum_val = np.float32(0.0)
         mean_val = np.float32(np.nan)
     else:
-        # Match MATLAB sum for single arrays: column-major, float32 accumulation.
-        sum_val = np.float32(np.sum(flat, dtype=np.float32))
-        mean_val = np.float32(sum_val / np.float32(count))
-    x = np.float32(4.0) * mean_val - np.float32(2.0)
+        if use_deterministic_sum:
+            # Deterministic column-major sum in float64.
+            sum_acc = 0.0
+            cnt = 0
+            for v in flat:
+                if not np.isnan(v):
+                    sum_acc += float(v)
+                    cnt += 1
+            sum_val = np.float64(sum_acc)
+            mean_val = np.float64(sum_acc / float(cnt)) if cnt else np.float64(np.nan)
+        else:
+            # Match MATLAB sum for single arrays: column-major, float32 accumulation.
+            sum_val = np.float32(np.sum(flat, dtype=np.float32))
+            mean_val = np.float32(sum_val / np.float32(count))
+    if use_deterministic_sum:
+        x = np.float64(4.0) * mean_val - np.float64(2.0)
+    else:
+        x = np.float32(4.0) * mean_val - np.float32(2.0)
     # diamond correction
     c1 = x**2 / (x**2 + 2*x + 0.5)
     # circle correction
     # c2 = np.pi / 2 
     # volume = c1 * c2 * 2 * np.sum(D)
-    # compute volume in float32 to match MATLAB single-precision path
-    c1 = np.float32(c1)
-    volume = np.float32(c1 * np.float32(np.pi) * sum_val)
+    if use_deterministic_sum:
+        volume = np.float64(c1 * np.float64(np.pi) * sum_val)
+    else:
+        # compute volume in float32 to match MATLAB single-precision path
+        c1 = np.float32(c1)
+        volume = np.float32(c1 * np.float32(np.pi) * sum_val)
     if os.getenv("DISTMAP_DEBUG") == "1":
         print("distmap_debug sum_val", float(sum_val), "mean_val", float(mean_val), "x", float(x), "volume", float(volume))
     # surface area

From 7d0b5de14afbeb500e75401570ec7c5299969835 Mon Sep 17 00:00:00 2001
From: John Walsh <johnwalsh7412@gmail.com>
Date: Fri, 20 Feb 2026 14:41:58 -0500
Subject: [PATCH 13/31] mean generation fix -- nans should not be part of the
 mean

---
 ifcb_features/biovolume.py | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/ifcb_features/biovolume.py b/ifcb_features/biovolume.py
index 3b4812f..1d9e444 100644
--- a/ifcb_features/biovolume.py
+++ b/ifcb_features/biovolume.py
@@ -72,10 +72,10 @@ def distmap_volume_surface_area(B,perimeter_image=None):
     D = D.astype(np.float32)
     D[~fill] = np.nan
     # representative transect (match MATLAB float32 sum/mean in column-major order)
-    flat = D.ravel(order="F")
-    nan_mask = np.isnan(flat)
+    flat_raw = D.ravel(order="F")
+    nan_mask = np.isnan(flat_raw)
     count = np.int64(np.sum(~nan_mask))
-    flat = np.where(nan_mask, np.float32(0.0), flat.astype(np.float32))
+    flat = np.where(nan_mask, np.float32(0.0), flat_raw.astype(np.float32))
     use_deterministic_sum = True
     if count == 0:
         sum_val = np.float32(0.0)
@@ -85,8 +85,8 @@ def distmap_volume_surface_area(B,perimeter_image=None):
             # Deterministic column-major sum in float64.
             sum_acc = 0.0
             cnt = 0
-            for v in flat:
-                if not np.isnan(v):
+            for v, is_nan in zip(flat_raw, nan_mask):
+                if not is_nan:
                     sum_acc += float(v)
                     cnt += 1
             sum_val = np.float64(sum_acc)

From 4b8fec1c731709997414c01ae2fc05d72415f438 Mon Sep 17 00:00:00 2001
From: John Walsh <johnwalsh7412@gmail.com>
Date: Fri, 20 Feb 2026 15:30:31 -0500
Subject: [PATCH 14/31] add deterministic summing to heidi distmap function

---
 ifcb_features/biovolume.py | 14 ++++++++++++--
 1 file changed, 12 insertions(+), 2 deletions(-)

diff --git a/ifcb_features/biovolume.py b/ifcb_features/biovolume.py
index 1d9e444..f33a8ab 100644
--- a/ifcb_features/biovolume.py
+++ b/ifcb_features/biovolume.py
@@ -145,12 +145,22 @@ def distmap_volume_surface_area_heidi(B, perimeter_image=None):
     fill = binary_fill_holes(np.array(perimeter_image, dtype=bool))
     D = D.astype(np.float64)
     D[~fill] = np.nan
+    # deterministic sum/mean over column-major order to match MATLAB loop
+    flat = D.ravel(order="F")
+    sum_acc = 0.0
+    cnt = 0
+    for v in flat:
+        if not np.isnan(v):
+            sum_acc += float(v)
+            cnt += 1
+    sum_val = np.float64(sum_acc)
+    mean_val = np.float64(sum_acc / float(cnt)) if cnt else np.float64(np.nan)
     # representative transect length
-    x = 4 * np.nanmean(D) - 2
+    x = 4 * mean_val - 2
     # correction factors
     c1 = (x**2) / (x**2 + 2 * x + 0.5)
     c2 = np.pi / 2
-    volume = c1 * c2 * 2 * np.nansum(D)
+    volume = c1 * c2 * 2 * sum_val
     # surface area
     D_sa = np.nan_to_num(D, nan=0.0)
     h, w = D_sa.shape

From 67f74f73ca2a468327ac96245d2818ce3ec17fa6 Mon Sep 17 00:00:00 2001
From: John Walsh <johnwalsh7412@gmail.com>
Date: Fri, 20 Feb 2026 15:31:49 -0500
Subject: [PATCH 15/31] make heidi distmap version the default

---
 ifcb_features/biovolume.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ifcb_features/biovolume.py b/ifcb_features/biovolume.py
index f33a8ab..916de32 100644
--- a/ifcb_features/biovolume.py
+++ b/ifcb_features/biovolume.py
@@ -49,7 +49,7 @@ def bottom_top_area(X,Y,Z,ignore_ground=False):
 # Match MATLAB SOR center logic (old top-edge + radius) when True.
 SOR_USE_OLD_CENTER = True
 # Use Heidi_explore distmap implementation (bwdist + heidi surface area) when True.
-USE_HEIDI_DISTMAP = False
+USE_HEIDI_DISTMAP = True
 
 def distmap_volume_surface_area(B,perimeter_image=None):
     """Moberg & Sosik biovolume algorithm

From cae3d661374c82fac13c54c0c9bd272eb767894c Mon Sep 17 00:00:00 2001
From: John Walsh <johnwalsh7412@gmail.com>
Date: Fri, 20 Feb 2026 16:34:24 -0500
Subject: [PATCH 16/31] switch sum accumulator to explicitly use float32

---
 ifcb_features/biovolume.py | 27 ++++++++++++++-------------
 1 file changed, 14 insertions(+), 13 deletions(-)

diff --git a/ifcb_features/biovolume.py b/ifcb_features/biovolume.py
index 916de32..969aa06 100644
--- a/ifcb_features/biovolume.py
+++ b/ifcb_features/biovolume.py
@@ -82,21 +82,21 @@ def distmap_volume_surface_area(B,perimeter_image=None):
         mean_val = np.float32(np.nan)
     else:
         if use_deterministic_sum:
-            # Deterministic column-major sum in float64.
-            sum_acc = 0.0
+            # Deterministic column-major sum in float32 (match MATLAB single).
+            sum_acc = np.float32(0.0)
             cnt = 0
             for v, is_nan in zip(flat_raw, nan_mask):
                 if not is_nan:
-                    sum_acc += float(v)
+                    sum_acc = np.float32(sum_acc + np.float32(v))
                     cnt += 1
-            sum_val = np.float64(sum_acc)
-            mean_val = np.float64(sum_acc / float(cnt)) if cnt else np.float64(np.nan)
+            sum_val = np.float32(sum_acc)
+            mean_val = np.float32(sum_acc / np.float32(cnt)) if cnt else np.float32(np.nan)
         else:
             # Match MATLAB sum for single arrays: column-major, float32 accumulation.
             sum_val = np.float32(np.sum(flat, dtype=np.float32))
             mean_val = np.float32(sum_val / np.float32(count))
     if use_deterministic_sum:
-        x = np.float64(4.0) * mean_val - np.float64(2.0)
+        x = np.float32(4.0) * mean_val - np.float32(2.0)
     else:
         x = np.float32(4.0) * mean_val - np.float32(2.0)
     # diamond correction
@@ -105,7 +105,8 @@ def distmap_volume_surface_area(B,perimeter_image=None):
     # c2 = np.pi / 2 
     # volume = c1 * c2 * 2 * np.sum(D)
     if use_deterministic_sum:
-        volume = np.float64(c1 * np.float64(np.pi) * sum_val)
+        c1 = np.float32(c1)
+        volume = np.float32(c1 * np.float32(np.pi) * sum_val)
     else:
         # compute volume in float32 to match MATLAB single-precision path
         c1 = np.float32(c1)
@@ -147,20 +148,20 @@ def distmap_volume_surface_area_heidi(B, perimeter_image=None):
     D[~fill] = np.nan
     # deterministic sum/mean over column-major order to match MATLAB loop
     flat = D.ravel(order="F")
-    sum_acc = 0.0
+    sum_acc = np.float32(0.0)
     cnt = 0
     for v in flat:
         if not np.isnan(v):
-            sum_acc += float(v)
+            sum_acc = np.float32(sum_acc + np.float32(v))
             cnt += 1
-    sum_val = np.float64(sum_acc)
-    mean_val = np.float64(sum_acc / float(cnt)) if cnt else np.float64(np.nan)
+    sum_val = np.float32(sum_acc)
+    mean_val = np.float32(sum_acc / np.float32(cnt)) if cnt else np.float32(np.nan)
     # representative transect length
-    x = 4 * mean_val - 2
+    x = np.float32(4.0) * mean_val - np.float32(2.0)
     # correction factors
     c1 = (x**2) / (x**2 + 2 * x + 0.5)
     c2 = np.pi / 2
-    volume = c1 * c2 * 2 * sum_val
+    volume = np.float32(c1 * np.float32(c2) * np.float32(2.0) * sum_val)
     # surface area
     D_sa = np.nan_to_num(D, nan=0.0)
     h, w = D_sa.shape

From d0d4cb16202f9b7f88dd8eee11332d3efd26bdb0 Mon Sep 17 00:00:00 2001
From: John Walsh <johnwalsh7412@gmail.com>
Date: Mon, 23 Feb 2026 22:09:31 -0500
Subject: [PATCH 17/31] add explicit surface area summing

---
 ifcb_features/biovolume.py | 19 ++++++++++++++-----
 1 file changed, 14 insertions(+), 5 deletions(-)

diff --git a/ifcb_features/biovolume.py b/ifcb_features/biovolume.py
index 969aa06..5f326c2 100644
--- a/ifcb_features/biovolume.py
+++ b/ifcb_features/biovolume.py
@@ -45,6 +45,14 @@ def bottom_top_area(X,Y,Z,ignore_ground=False):
         
     return area_bot, area_top
 
+def _det_sum32(arr):
+    """Deterministic float32 sum in column-major order."""
+    sum_acc = np.float32(0.0)
+    flat = arr.ravel(order="F")
+    for v in flat:
+        sum_acc = np.float32(sum_acc + np.float32(v))
+    return np.float32(sum_acc)
+
 USE_EDT_INDICES = True  # recompute distances from indices to better match MATLAB bwdist
 # Match MATLAB SOR center logic (old top-edge + radius) when True.
 SOR_USE_OLD_CENTER = True
@@ -128,10 +136,9 @@ def distmap_volume_surface_area(B,perimeter_image=None):
         np.float32(2.0) * np.float32(np.sqrt(2.0)) * x / np.float32(2.0)
         + (np.float32(1.0) + np.float32(np.sqrt(2.0))) / np.float32(2.0)
     )
-    sa = np.float32(2.0) * c * np.float32(
-        np.nansum(area_bot.astype(np.float32), dtype=np.float32)
-        + np.nansum(area_top.astype(np.float32), dtype=np.float32)
-    )
+    sum_bot = _det_sum32(area_bot.astype(np.float32))
+    sum_top = _det_sum32(area_top.astype(np.float32))
+    sa = np.float32(2.0) * c * np.float32(sum_bot + sum_top)
     # return volume, representative transect, and surface area
     return volume, x, sa
 
@@ -168,7 +175,9 @@ def distmap_volume_surface_area_heidi(B, perimeter_image=None):
     Y, X = np.mgrid[1:h + 1, 1:w + 1]
     area_bot, area_top = bottom_top_area(X, Y, D_sa, ignore_ground=True)
     c = (np.pi * x / 2.0) / (2.0 * np.sqrt(2.0) * x / 2.0 + (1.0 + np.sqrt(2.0)) / 2.0)
-    sa = 2 * c * (np.sum(area_bot) + np.sum(area_top))
+    sum_bot = _det_sum32(area_bot.astype(np.float32))
+    sum_top = _det_sum32(area_top.astype(np.float32))
+    sa = np.float32(2.0) * np.float32(c) * np.float32(sum_bot + sum_top)
     return volume, x, sa
 
 def sor_volume_surface_area(B):

From 3dc38fe8a99c90e79d6662d3aec74f2230cd7233 Mon Sep 17 00:00:00 2001
From: John Walsh <johnwalsh7412@gmail.com>
Date: Wed, 25 Feb 2026 15:32:14 -0500
Subject: [PATCH 18/31] add largest 8-connected component fix

---
 ifcb_features/all.py | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/ifcb_features/all.py b/ifcb_features/all.py
index 1a77fd5..9d60907 100644
--- a/ifcb_features/all.py
+++ b/ifcb_features/all.py
@@ -210,6 +210,15 @@ def distmap_volume_surface_area(self):
         """volume of blob computed via Moberg & Sosik algorithm"""
         # MATLAB uses a tight cropped blob image for distmap via regionprops.Image.
         img = self.regionprops.image.astype(bool)
+        # Match MATLAB blob_geomprop: use largest 8-connected component.
+        if img.any():
+            from scipy.ndimage import label
+            labeled, num = label(img, structure=np.ones((3, 3), dtype=np.int32))
+            if num > 1:
+                counts = np.bincount(labeled.ravel())
+                counts[0] = 0
+                largest = counts.argmax()
+                img = labeled == largest
         perim = find_perimeter(img)
         return distmap_volume_surface_area(img, perim)
     @property

From 2579531b5b71a4b8019c248b624b076a632d0196 Mon Sep 17 00:00:00 2001
From: John Walsh <john.walsh@whoi.edu>
Date: Fri, 27 Feb 2026 11:34:14 -0500
Subject: [PATCH 19/31] round float64 values to float32

---
 ifcb_features/biovolume.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/ifcb_features/biovolume.py b/ifcb_features/biovolume.py
index 5f326c2..e9b27f2 100644
--- a/ifcb_features/biovolume.py
+++ b/ifcb_features/biovolume.py
@@ -170,9 +170,11 @@ def distmap_volume_surface_area_heidi(B, perimeter_image=None):
     c2 = np.pi / 2
     volume = np.float32(c1 * np.float32(c2) * np.float32(2.0) * sum_val)
     # surface area
-    D_sa = np.nan_to_num(D, nan=0.0)
+    D_sa = np.nan_to_num(D, nan=0.0).astype(np.float32, copy=False)
     h, w = D_sa.shape
     Y, X = np.mgrid[1:h + 1, 1:w + 1]
+    X = X.astype(np.float32, copy=False)
+    Y = Y.astype(np.float32, copy=False)
     area_bot, area_top = bottom_top_area(X, Y, D_sa, ignore_ground=True)
     c = (np.pi * x / 2.0) / (2.0 * np.sqrt(2.0) * x / 2.0 + (1.0 + np.sqrt(2.0)) / 2.0)
     sum_bot = _det_sum32(area_bot.astype(np.float32))

From e9207d4a7a6a928e88cc467bc1df911337c54b9a Mon Sep 17 00:00:00 2001
From: John Walsh <johnwalsh7412@gmail.com>
Date: Fri, 27 Feb 2026 13:45:31 -0500
Subject: [PATCH 20/31] add more explicit float32 casting

---
 ifcb_features/biovolume.py | 17 +++++++++++------
 1 file changed, 11 insertions(+), 6 deletions(-)

diff --git a/ifcb_features/biovolume.py b/ifcb_features/biovolume.py
index e9b27f2..4c19dc8 100644
--- a/ifcb_features/biovolume.py
+++ b/ifcb_features/biovolume.py
@@ -132,8 +132,9 @@ def distmap_volume_surface_area(B,perimeter_image=None):
     area_bot, area_top = bottom_top_area(X, Y, D_sa, ignore_ground=True)
     # final correction of the diamond cross-section
     # inherent in the distance map to be circular instead
-    c = (np.float32(np.pi) * x / np.float32(2.0)) / (
-        np.float32(2.0) * np.float32(np.sqrt(2.0)) * x / np.float32(2.0)
+    # compute c fully in float32 to match MATLAB single-precision path
+    c = (np.float32(np.pi) * np.float32(x) / np.float32(2.0)) / (
+        np.float32(2.0) * np.float32(np.sqrt(2.0)) * np.float32(x) / np.float32(2.0)
         + (np.float32(1.0) + np.float32(np.sqrt(2.0))) / np.float32(2.0)
     )
     sum_bot = _det_sum32(area_bot.astype(np.float32))
@@ -166,9 +167,9 @@ def distmap_volume_surface_area_heidi(B, perimeter_image=None):
     # representative transect length
     x = np.float32(4.0) * mean_val - np.float32(2.0)
     # correction factors
-    c1 = (x**2) / (x**2 + 2 * x + 0.5)
-    c2 = np.pi / 2
-    volume = np.float32(c1 * np.float32(c2) * np.float32(2.0) * sum_val)
+    c1 = (x**2) / (x**2 + np.float32(2.0) * x + np.float32(0.5))
+    c2 = np.float32(np.pi / 2.0)
+    volume = np.float32(c1 * c2 * np.float32(2.0) * sum_val)
     # surface area
     D_sa = np.nan_to_num(D, nan=0.0).astype(np.float32, copy=False)
     h, w = D_sa.shape
@@ -176,7 +177,11 @@ def distmap_volume_surface_area_heidi(B, perimeter_image=None):
     X = X.astype(np.float32, copy=False)
     Y = Y.astype(np.float32, copy=False)
     area_bot, area_top = bottom_top_area(X, Y, D_sa, ignore_ground=True)
-    c = (np.pi * x / 2.0) / (2.0 * np.sqrt(2.0) * x / 2.0 + (1.0 + np.sqrt(2.0)) / 2.0)
+    # compute c fully in float32 to match MATLAB single-precision path
+    c = (np.float32(np.pi) * np.float32(x) / np.float32(2.0)) / (
+        np.float32(2.0) * np.float32(np.sqrt(2.0)) * np.float32(x) / np.float32(2.0)
+        + (np.float32(1.0) + np.float32(np.sqrt(2.0))) / np.float32(2.0)
+    )
     sum_bot = _det_sum32(area_bot.astype(np.float32))
     sum_top = _det_sum32(area_top.astype(np.float32))
     sa = np.float32(2.0) * np.float32(c) * np.float32(sum_bot + sum_top)

From 04da5af78022bd0803f172cb9a97310bad522cb3 Mon Sep 17 00:00:00 2001
From: John Walsh <john.walsh@whoi.edu>
Date: Thu, 5 Mar 2026 12:28:07 -0500
Subject: [PATCH 21/31] add zero to NaN conversion and remove unused roi param

---
 ifcb_features/all.py | 13 +++++++------
 1 file changed, 7 insertions(+), 6 deletions(-)

diff --git a/ifcb_features/all.py b/ifcb_features/all.py
index 9d60907..ca87954 100644
--- a/ifcb_features/all.py
+++ b/ifcb_features/all.py
@@ -22,12 +22,11 @@
 from .ringwedge import ring_wedge, _N_RINGS, _N_WEDGES
 
 class BlobFeatures(object):
-    def __init__(self,blob_image,roi_image, roi=None):
+    def __init__(self,blob_image,roi_image):
         """roi_image should be the same size as the blob image,
         so a sub-roi"""
         self.image = np.array(blob_image).astype(np.bool)
         self.roi_image = roi_image
-        self.roi = roi
     @property
     def shape(self):
         """h,w of blob image"""
@@ -342,7 +341,7 @@ def blobs(self):
         the segmented mask, ordered by largest area to smallest area"""
         labeled, bboxes, blobs = find_blobs(self.blobs_image)
         cropped_rois = [self.image[bbox] for bbox in bboxes]
-        Bs = [BlobFeatures(b, R, roi=self) for b, R in zip(blobs, cropped_rois)]
+        Bs = [BlobFeatures(b, R) for b, R in zip(blobs, cropped_rois)]
         # sort by area, largest first
         return sorted(Bs, key=lambda B: B.area, reverse=True)
     @property
@@ -548,9 +547,11 @@ def compute_features(roi_image, blobs_image=None, raw_stitch=None):
         ('summedPerimeter', r.summed_perimeter),
         ('summedSurfaceArea', r.summed_surface_area),
     ]
+    def _zero_to_nan(v):
+        return float('nan') if v == 0 else v
     f += [
-        ('Area_over_PerimeterSquared', b.area_over_perimeter_squared),
-        ('Area_over_Perimeter', b.area_over_perimeter),
-        ('summedConvexPerimeter_over_Perimeter', r.summed_convex_perimeter_over_perimeter),
+        ('Area_over_PerimeterSquared', _zero_to_nan(b.area_over_perimeter_squared)),
+        ('Area_over_Perimeter', _zero_to_nan(b.area_over_perimeter)),
+        ('summedConvexPerimeter_over_Perimeter', _zero_to_nan(r.summed_convex_perimeter_over_perimeter)),
     ]
     return (r.blobs_image, f)

From d3cc825fb9123e249054ff5acd5cccb8d615c54b Mon Sep 17 00:00:00 2001
From: John Walsh <john.walsh@whoi.edu>
Date: Thu, 5 Mar 2026 12:30:00 -0500
Subject: [PATCH 22/31] remove debug function

---
 ifcb_features/all.py | 15 +--------------
 1 file changed, 1 insertion(+), 14 deletions(-)

diff --git a/ifcb_features/all.py b/ifcb_features/all.py
index ca87954..6eacb47 100644
--- a/ifcb_features/all.py
+++ b/ifcb_features/all.py
@@ -8,7 +8,7 @@
 from functools import lru_cache
 
 from .segmentation import segment_roi
-from .blobs import find_blobs, label_blobs, rotate_blob, blob_shape
+from .blobs import find_blobs, rotate_blob, blob_shape
 from .blob_geometry import (equiv_diameter, ellipse_properties,
                             invmoments, convex_hull, convex_hull_image,
                             convex_hull_properties, feret_diameter,
@@ -345,19 +345,6 @@ def blobs(self):
         # sort by area, largest first
         return sorted(Bs, key=lambda B: B.area, reverse=True)
     @property
-    @lru_cache()
-    def largest_blob_mask_full(self):
-        """full-size mask for the largest blob (MATLAB uses full-size mask for biovolume)"""
-        labeled, objects = label_blobs(self.blobs_image)
-        if not objects:
-            return None
-        # labels are 1-based in labeled image
-        areas = []
-        for ix in range(1, len(objects) + 1):
-            areas.append(int(np.sum(labeled == ix)))
-        idx = int(np.argmax(areas)) + 1
-        return labeled == idx
-    @property
     def num_blobs(self):
         return len(self.blobs)
     @property

From 34aabe47543656c139e2fb06b5300897a73bb967 Mon Sep 17 00:00:00 2001
From: John Walsh <john.walsh@whoi.edu>
Date: Thu, 5 Mar 2026 12:33:37 -0500
Subject: [PATCH 23/31] remove debug code

---
 ifcb_features/biovolume.py | 94 ++------------------------------------
 1 file changed, 3 insertions(+), 91 deletions(-)

diff --git a/ifcb_features/biovolume.py b/ifcb_features/biovolume.py
index 4c19dc8..985403c 100644
--- a/ifcb_features/biovolume.py
+++ b/ifcb_features/biovolume.py
@@ -1,4 +1,3 @@
-import os
 import numpy as np
 
 from scipy.ndimage import binary_fill_holes, distance_transform_edt
@@ -53,99 +52,12 @@ def _det_sum32(arr):
         sum_acc = np.float32(sum_acc + np.float32(v))
     return np.float32(sum_acc)
 
-USE_EDT_INDICES = True  # recompute distances from indices to better match MATLAB bwdist
 # Match MATLAB SOR center logic (old top-edge + radius) when True.
 SOR_USE_OLD_CENTER = True
-# Use Heidi_explore distmap implementation (bwdist + heidi surface area) when True.
-USE_HEIDI_DISTMAP = True
 
-def distmap_volume_surface_area(B,perimeter_image=None):
-    """Moberg & Sosik biovolume algorithm
-    returns volume and representative transect"""
-    if USE_HEIDI_DISTMAP:
-        return distmap_volume_surface_area_heidi(B, perimeter_image)
-    if perimeter_image is None:
-        perimeter_image = find_perimeter(B)
-    # elementwise distance to perimeter + 1
-    if USE_EDT_INDICES:
-        _, inds = distance_transform_edt(1 - perimeter_image, return_indices=True)
-        coords = np.indices(perimeter_image.shape, dtype=np.int64)
-        deltas = coords - inds.astype(np.int64, copy=False)
-        dist2 = np.sum(deltas * deltas, axis=0, dtype=np.int64)
-        D = np.sqrt(dist2.astype(np.float32)) + np.float32(1.0)
-    else:
-        D = distance_transform_edt(1 - perimeter_image) + 1
-    # mask distances outside filled perimeter (match MATLAB imfill on boundary image)
-    fill = binary_fill_holes(np.array(perimeter_image, dtype=bool))
-    D = D.astype(np.float32)
-    D[~fill] = np.nan
-    # representative transect (match MATLAB float32 sum/mean in column-major order)
-    flat_raw = D.ravel(order="F")
-    nan_mask = np.isnan(flat_raw)
-    count = np.int64(np.sum(~nan_mask))
-    flat = np.where(nan_mask, np.float32(0.0), flat_raw.astype(np.float32))
-    use_deterministic_sum = True
-    if count == 0:
-        sum_val = np.float32(0.0)
-        mean_val = np.float32(np.nan)
-    else:
-        if use_deterministic_sum:
-            # Deterministic column-major sum in float32 (match MATLAB single).
-            sum_acc = np.float32(0.0)
-            cnt = 0
-            for v, is_nan in zip(flat_raw, nan_mask):
-                if not is_nan:
-                    sum_acc = np.float32(sum_acc + np.float32(v))
-                    cnt += 1
-            sum_val = np.float32(sum_acc)
-            mean_val = np.float32(sum_acc / np.float32(cnt)) if cnt else np.float32(np.nan)
-        else:
-            # Match MATLAB sum for single arrays: column-major, float32 accumulation.
-            sum_val = np.float32(np.sum(flat, dtype=np.float32))
-            mean_val = np.float32(sum_val / np.float32(count))
-    if use_deterministic_sum:
-        x = np.float32(4.0) * mean_val - np.float32(2.0)
-    else:
-        x = np.float32(4.0) * mean_val - np.float32(2.0)
-    # diamond correction
-    c1 = x**2 / (x**2 + 2*x + 0.5)
-    # circle correction
-    # c2 = np.pi / 2 
-    # volume = c1 * c2 * 2 * np.sum(D)
-    if use_deterministic_sum:
-        c1 = np.float32(c1)
-        volume = np.float32(c1 * np.float32(np.pi) * sum_val)
-    else:
-        # compute volume in float32 to match MATLAB single-precision path
-        c1 = np.float32(c1)
-        volume = np.float32(c1 * np.float32(np.pi) * sum_val)
-    if os.getenv("DISTMAP_DEBUG") == "1":
-        print("distmap_debug sum_val", float(sum_val), "mean_val", float(mean_val), "x", float(x), "volume", float(volume))
-    # surface area
-    # surface area uses NaN-masked distances as zero
-    D_sa = np.nan_to_num(D, nan=0.0)
-    h, w = D_sa.shape
-    # MATLAB uses 1-based X/Y indices for surface area geometry.
-    Y, X = np.mgrid[1:h + 1, 1:w + 1]
-    X = X.astype(np.float32, copy=False)
-    Y = Y.astype(np.float32, copy=False)
-    area_bot, area_top = bottom_top_area(X, Y, D_sa, ignore_ground=True)
-    # final correction of the diamond cross-section
-    # inherent in the distance map to be circular instead
-    # compute c fully in float32 to match MATLAB single-precision path
-    c = (np.float32(np.pi) * np.float32(x) / np.float32(2.0)) / (
-        np.float32(2.0) * np.float32(np.sqrt(2.0)) * np.float32(x) / np.float32(2.0)
-        + (np.float32(1.0) + np.float32(np.sqrt(2.0))) / np.float32(2.0)
-    )
-    sum_bot = _det_sum32(area_bot.astype(np.float32))
-    sum_top = _det_sum32(area_top.astype(np.float32))
-    sa = np.float32(2.0) * c * np.float32(sum_bot + sum_top)
-    # return volume, representative transect, and surface area
-    return volume, x, sa
-
-
-def distmap_volume_surface_area_heidi(B, perimeter_image=None):
-    """Heidi_explore distmap_volume (bwdist + surface area) implementation."""
+def distmap_volume_surface_area(B, perimeter_image=None):
+    """Moberg & Sosik biovolume algorithm (Heidi_explore implementation).
+    Returns volume, representative transect, and surface area."""
     if perimeter_image is None:
         perimeter_image = find_perimeter(B)
     # calculate distance map (MATLAB bwdist)

From d99b22f6581dd334dab821e5b32f7c646ad11b17 Mon Sep 17 00:00:00 2001
From: John Walsh <john.walsh@whoi.edu>
Date: Thu, 5 Mar 2026 12:37:45 -0500
Subject: [PATCH 24/31] remove kmeans debug code

---
 ifcb_features/segmentation.py | 35 +++--------------------------------
 1 file changed, 3 insertions(+), 32 deletions(-)

diff --git a/ifcb_features/segmentation.py b/ifcb_features/segmentation.py
index 1b23f82..ff9d331 100644
--- a/ifcb_features/segmentation.py
+++ b/ifcb_features/segmentation.py
@@ -1,4 +1,3 @@
-import os
 import numpy as np
 
 from skimage import img_as_float32
@@ -6,7 +5,6 @@
 from scipy.ndimage.morphology import binary_fill_holes
 from skimage.morphology import binary_closing, binary_dilation, binary_erosion, remove_small_objects, diamond
 import skimage.filters as imfilters
-from sklearn.cluster import KMeans
 
 from skimage.util import img_as_ubyte
 
@@ -20,13 +18,6 @@
 HT_T1, HT_T2 = 0.3, 0.09
 BLOB_MIN = 40
 DARK_THRESHOLD_ADJUSTMENT = 0.75
-USE_STRICT_KMEANS = True
-KMEANS_FALLBACK_SEED = os.getenv("IFCB_KMEANS_FALLBACK_SEED")
-if KMEANS_FALLBACK_SEED is not None:
-    try:
-        KMEANS_FALLBACK_SEED = int(KMEANS_FALLBACK_SEED)
-    except ValueError:
-        KMEANS_FALLBACK_SEED = None
 
 
 def _kmeans_1d_strict(values, max_iter=100):
@@ -127,29 +118,9 @@ def kmeans_segment(roi):
         r = roi.astype(np.float32)
     # Use column-major order to match MATLAB img(:) traversal.
     values = r.reshape(-1, order="F")
-    if USE_STRICT_KMEANS:
-        C, J = _kmeans_1d_strict(values, max_iter=100)
-        if C is None:
-            C = None
-    else:
-        C = None
-
-    if C is None:
-        # fallback to sklearn k-means
-        km = KMeans(
-            n_clusters=2,
-            n_init=1,
-            init="k-means++",
-            max_iter=100,
-            tol=0,
-            random_state=KMEANS_FALLBACK_SEED,
-        )
-        km.fit(values.reshape(-1, 1))
-        J = km.labels_
-        C = km.cluster_centers_.reshape(-1)
-    else:
-        C = C.reshape(-1)
-        J = J.reshape(-1)
+    C, J = _kmeans_1d_strict(values, max_iter=100)
+    C = C.reshape(-1)
+    J = J.reshape(-1)
 
     # reshape labels to image using MATLAB order
     J = J.reshape(r.shape, order="F")

From 904305fbba7b5f81004d87812ef495411136fead Mon Sep 17 00:00:00 2001
From: John Walsh <john.walsh@whoi.edu>
Date: Thu, 5 Mar 2026 12:42:00 -0500
Subject: [PATCH 25/31] remove sor debug code

---
 ifcb_features/biovolume.py | 16 +++-------------
 1 file changed, 3 insertions(+), 13 deletions(-)

diff --git a/ifcb_features/biovolume.py b/ifcb_features/biovolume.py
index 985403c..921efdb 100644
--- a/ifcb_features/biovolume.py
+++ b/ifcb_features/biovolume.py
@@ -52,8 +52,6 @@ def _det_sum32(arr):
         sum_acc = np.float32(sum_acc + np.float32(v))
     return np.float32(sum_acc)
 
-# Match MATLAB SOR center logic (old top-edge + radius) when True.
-SOR_USE_OLD_CENTER = True
 
 def distmap_volume_surface_area(B, perimeter_image=None):
     """Moberg & Sosik biovolume algorithm (Heidi_explore implementation).
@@ -107,17 +105,9 @@ def sor_volume_surface_area(B):
     r = np.sum(B, axis=0).astype(np.float64)
     ri = r > 0
     r = (r / 2.0)[ri]
-    if SOR_USE_OLD_CENTER:
-        y1 = np.argmax(B, axis=0).astype(np.float64) + 1.0
-        y1 = y1[ri]
-        center = y1 + r
-    else:
-        rowind = np.arange(1, h + 1, dtype=np.float64)[:, None]
-        temp = rowind * B
-        temp = temp.astype(np.float64, copy=False)
-        temp[temp == 0] = np.nan
-        center = np.nanmedian(temp, axis=0) + 0.5
-        center = center[ri]
+    y1 = np.argmax(B, axis=0).astype(np.float64) + 1.0
+    y1 = y1[ri]
+    center = y1 + r
     n_slices = r.size
     # compute angles between 0 and 180 degrees inclusive, in radians
     da = 0.25

From 95f99589099383285dd51ed5952c3024cf6a97aa Mon Sep 17 00:00:00 2001
From: John Walsh <john.walsh@whoi.edu>
Date: Thu, 5 Mar 2026 12:43:07 -0500
Subject: [PATCH 26/31] remove unused imports

---
 ifcb_features/segmentation.py | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/ifcb_features/segmentation.py b/ifcb_features/segmentation.py
index ff9d331..70263a2 100644
--- a/ifcb_features/segmentation.py
+++ b/ifcb_features/segmentation.py
@@ -1,13 +1,10 @@
 import numpy as np
 
-from skimage import img_as_float32
 from scipy.ndimage import label
 from scipy.ndimage.morphology import binary_fill_holes
-from skimage.morphology import binary_closing, binary_dilation, binary_erosion, remove_small_objects, diamond
+from skimage.morphology import binary_closing, binary_dilation, binary_erosion, diamond
 import skimage.filters as imfilters
 
-from skimage.util import img_as_ubyte
-
 from .phasecong import phasecong_Mm
 from .morphology import SE3, hysthresh, bwmorph_thin, EIGHT
 

From 76255b64b702b9e16ba2f2467eb2763f6a946bf1 Mon Sep 17 00:00:00 2001
From: John Walsh <john.walsh@whoi.edu>
Date: Thu, 5 Mar 2026 12:52:01 -0500
Subject: [PATCH 27/31] remove unused code from debugging

---
 ifcb_features/biovolume.py | 1 -
 ifcb_features/blobs.py     | 2 --
 2 files changed, 3 deletions(-)

diff --git a/ifcb_features/biovolume.py b/ifcb_features/biovolume.py
index 921efdb..8791c78 100644
--- a/ifcb_features/biovolume.py
+++ b/ifcb_features/biovolume.py
@@ -101,7 +101,6 @@ def sor_volume_surface_area(B):
     """pass in rotated blob"""
     """Sosik and Kilfoyle surface area / volume algorithm"""
     # compute center using median (current) or bottom+radius (legacy MATLAB)
-    h, w = B.shape
     r = np.sum(B, axis=0).astype(np.float64)
     ri = r > 0
     r = (r / 2.0)[ri]
diff --git a/ifcb_features/blobs.py b/ifcb_features/blobs.py
index ae51f75..d044c0f 100644
--- a/ifcb_features/blobs.py
+++ b/ifcb_features/blobs.py
@@ -2,8 +2,6 @@
 
 from scipy.ndimage import measurements
 
-from scipy.ndimage import rotate as nd_rotate
-
 from .morphology import SE2, SE3, EIGHT, bwmorph_thin
 
 def label_blobs(B):

From f6db04237acac8523529fa8aa0464a9d767af976 Mon Sep 17 00:00:00 2001
From: John Walsh <john.walsh@whoi.edu>
Date: Thu, 5 Mar 2026 12:56:17 -0500
Subject: [PATCH 28/31] fix comment

---
 ifcb_features/biovolume.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ifcb_features/biovolume.py b/ifcb_features/biovolume.py
index 8791c78..0fd2967 100644
--- a/ifcb_features/biovolume.py
+++ b/ifcb_features/biovolume.py
@@ -100,7 +100,7 @@ def distmap_volume_surface_area(B, perimeter_image=None):
 def sor_volume_surface_area(B):
     """pass in rotated blob"""
     """Sosik and Kilfoyle surface area / volume algorithm"""
-    # compute center using median (current) or bottom+radius (legacy MATLAB)
+    # compute center as top-edge + radius to match MATLAB
     r = np.sum(B, axis=0).astype(np.float64)
     ri = r > 0
     r = (r / 2.0)[ri]

From 823a02004d6e1230c061758ce996d5d4827b089d Mon Sep 17 00:00:00 2001
From: John Walsh <johnwalsh7412@gmail.com>
Date: Thu, 7 May 2026 15:38:29 -0400
Subject: [PATCH 29/31] add explicit float rounding in kmeans

---
 ifcb_features/segmentation.py | 53 ++++++++++++++++++++++-------------
 1 file changed, 33 insertions(+), 20 deletions(-)

diff --git a/ifcb_features/segmentation.py b/ifcb_features/segmentation.py
index 1b23f82..2bd125d 100644
--- a/ifcb_features/segmentation.py
+++ b/ifcb_features/segmentation.py
@@ -36,11 +36,35 @@ def _kmeans_1d_strict(values, max_iter=100):
     if n == 0:
         return np.array([0.0, 1.0], dtype=np.float32), np.zeros(0, dtype=np.int8)
 
+    def center_for_cluster(idx, cluster):
+        total = np.float32(0.0)
+        count = np.float32(0.0)
+        for i in range(n):
+            if idx[i] == cluster:
+                total = np.float32(total + values[i])
+                count = np.float32(count + np.float32(1.0))
+        if count == 0:
+            return np.float32(np.nan), 0
+        return np.float32(total / count), int(count)
+
+    def distances_to_center(center):
+        distances = np.empty(n, dtype=np.float32)
+        for i in range(n):
+            delta = np.float32(values[i] - center)
+            distances[i] = np.float32(delta * delta)
+        return distances
+
+    def assigned_total(distances, idx):
+        total = np.float32(0.0)
+        for i in range(n):
+            total = np.float32(total + distances[i, idx[i]])
+        return total
+
     centers = np.array([0.0, 1.0], dtype=np.float32)
     # Initial distances/assignments from provided centers
     D = np.empty((n, 2), dtype=np.float32)
-    D[:, 0] = (values - centers[0]) * (values - centers[0])
-    D[:, 1] = (values - centers[1]) * (values - centers[1])
+    D[:, 0] = distances_to_center(centers[0])
+    D[:, 1] = distances_to_center(centers[1])
     idx = np.argmin(D, axis=1).astype(np.int8)
 
     changed = np.array([0, 1], dtype=np.int64)
@@ -50,16 +74,13 @@ def _kmeans_1d_strict(values, max_iter=100):
     for _iter in range(max_iter):
         # Recompute centers and counts for changed clusters
         counts = np.bincount(idx, minlength=2).astype(np.int64)
-        sums = np.zeros(2, dtype=np.float64)
-        for i in range(n):
-            sums[idx[i]] += float(values[i])
         for c in changed:
             if counts[c] > 0:
-                centers[c] = np.float32(sums[c] / counts[c])
+                centers[c], counts[c] = center_for_cluster(idx, c)
 
         # Update distances for changed clusters
         for c in changed:
-            D[:, c] = (values - centers[c]) * (values - centers[c])
+            D[:, c] = distances_to_center(centers[c])
 
         # Handle empty clusters (singleton)
         empties = [c for c in changed if counts[c] == 0]
@@ -76,30 +97,22 @@ def _kmeans_1d_strict(values, max_iter=100):
                 idx[lonely] = empty
                 counts[empty] = 1
                 counts[from_cluster] -= 1
-                D[:, empty] = (values - centers[empty]) * (values - centers[empty])
+                D[:, empty] = distances_to_center(centers[empty])
 
                 # Update donor cluster center/distance
-                sums[from_cluster] = np.float64(0.0)
-                for i in range(n):
-                    if idx[i] == from_cluster:
-                        sums[from_cluster] += float(values[i])
                 if counts[from_cluster] > 0:
-                    centers[from_cluster] = np.float32(sums[from_cluster] / counts[from_cluster])
-                D[:, from_cluster] = (values - centers[from_cluster]) * (values - centers[from_cluster])
+                    centers[from_cluster], counts[from_cluster] = center_for_cluster(idx, from_cluster)
+                D[:, from_cluster] = distances_to_center(centers[from_cluster])
                 changed = np.unique(np.concatenate([changed, np.array([from_cluster], dtype=np.int64)]))
 
         # Compute objective and check for improvement
-        d_assigned = D[np.arange(n), idx]
-        totsumD = np.sum(d_assigned, dtype=np.float32)
+        totsumD = assigned_total(D, idx)
         if prevtotsumD <= totsumD:
             idx = previdx
             counts = np.bincount(idx, minlength=2).astype(np.int64)
-            sums = np.zeros(2, dtype=np.float64)
-            for i in range(n):
-                sums[idx[i]] += float(values[i])
             for c in changed:
                 if counts[c] > 0:
-                    centers[c] = np.float32(sums[c] / counts[c])
+                    centers[c], counts[c] = center_for_cluster(idx, c)
             break
 
         previdx = idx.copy()

From cf3a59b2b74743962592e6e7d316d8e8686a0ad9 Mon Sep 17 00:00:00 2001
From: John Walsh <johnwalsh7412@gmail.com>
Date: Fri, 8 May 2026 16:28:23 -0400
Subject: [PATCH 30/31] switch from for loops to vectorization for speedup

---
 ifcb_features/segmentation.py | 30 ++++++++++++------------------
 1 file changed, 12 insertions(+), 18 deletions(-)

diff --git a/ifcb_features/segmentation.py b/ifcb_features/segmentation.py
index cd94a45..7de79e9 100644
--- a/ifcb_features/segmentation.py
+++ b/ifcb_features/segmentation.py
@@ -23,30 +23,24 @@ def _kmeans_1d_strict(values, max_iter=100):
     n = values.shape[0]
     if n == 0:
         return np.array([0.0, 1.0], dtype=np.float32), np.zeros(0, dtype=np.int8)
+    row_indices = np.arange(n)
 
     def center_for_cluster(idx, cluster):
-        total = np.float32(0.0)
-        count = np.float32(0.0)
-        for i in range(n):
-            if idx[i] == cluster:
-                total = np.float32(total + values[i])
-                count = np.float32(count + np.float32(1.0))
+        members = idx == cluster
+        count = int(np.count_nonzero(members))
         if count == 0:
             return np.float32(np.nan), 0
-        return np.float32(total / count), int(count)
+        total = np.cumsum(values[members], dtype=np.float32)[-1]
+        count_float = np.float32(count)
+        return np.float32(total / count_float), count
 
     def distances_to_center(center):
-        distances = np.empty(n, dtype=np.float32)
-        for i in range(n):
-            delta = np.float32(values[i] - center)
-            distances[i] = np.float32(delta * delta)
-        return distances
+        delta = values - np.float32(center)
+        return np.asarray(delta * delta, dtype=np.float32)
 
     def assigned_total(distances, idx):
-        total = np.float32(0.0)
-        for i in range(n):
-            total = np.float32(total + distances[i, idx[i]])
-        return total
+        assigned = distances[row_indices, idx]
+        return np.cumsum(assigned, dtype=np.float32)[-1]
 
     centers = np.array([0.0, 1.0], dtype=np.float32)
     # Initial distances/assignments from provided centers
@@ -73,7 +67,7 @@ def assigned_total(distances, idx):
         # Handle empty clusters (singleton)
         empties = [c for c in changed if counts[c] == 0]
         if empties:
-            d_assigned = D[np.arange(n), idx]
+            d_assigned = D[row_indices, idx]
             for empty in empties:
                 lonely = int(np.argmax(d_assigned))
                 from_cluster = int(idx[lonely])
@@ -108,7 +102,7 @@ def assigned_total(distances, idx):
 
         # Reassign using nearest centroid, tie -> stay
         nidx = np.argmin(D, axis=1).astype(np.int8)
-        dmin = D[np.arange(n), nidx]
+        dmin = D[row_indices, nidx]
         moved = np.where(nidx != previdx)[0]
         if moved.size:
             stay_mask = D[moved, previdx[moved]] > dmin[moved]

From ffedc17b191c4193f8476ccafc3bbc4904cd689b Mon Sep 17 00:00:00 2001
From: John Walsh <johnwalsh7412@gmail.com>
Date: Fri, 15 May 2026 13:28:14 -0400
Subject: [PATCH 31/31] add more explicit orientation code

---
 ifcb_features/all.py           | 11 ++----
 ifcb_features/blob_geometry.py | 43 ++++++++++++++++++++++-
 ifcb_features/blobs.py         | 63 +++++++++++++++++++++++++++-------
 3 files changed, 95 insertions(+), 22 deletions(-)

diff --git a/ifcb_features/all.py b/ifcb_features/all.py
index 6eacb47..b3015ab 100644
--- a/ifcb_features/all.py
+++ b/ifcb_features/all.py
@@ -12,7 +12,8 @@
 from .blob_geometry import (equiv_diameter, ellipse_properties,
                             invmoments, convex_hull, convex_hull_image,
                             convex_hull_properties, feret_diameter,
-                            binary_symmetry, feret_diameters)
+                            binary_symmetry, feret_diameters,
+                            explicit_orientation)
 from .morphology import find_perimeter
 from .biovolume import (distmap_volume_surface_area,
                         sor_volume_surface_area)
@@ -151,13 +152,7 @@ def eccentricity(self):
     @lru_cache()
     def orientation(self):
         """return orientation of blob in degrees"""
-        # skimage.regionprops.orientation is angle between the 0th axis (rows)
-        # and the major axis. MATLAB uses angle from the x-axis (columns).
-        rad = self.regionprops.orientation
-        deg = 90.0 + (180.0 / np.pi) * rad
-        if deg > 90.0:
-            deg -= 180.0
-        return deg
+        return explicit_orientation(self.image)
     @property
     @lru_cache()
     def centroid(self):
diff --git a/ifcb_features/blob_geometry.py b/ifcb_features/blob_geometry.py
index 6467acc..a052310 100644
--- a/ifcb_features/blob_geometry.py
+++ b/ifcb_features/blob_geometry.py
@@ -43,6 +43,48 @@ def ellipse_properties(B):
     
     return maj_axis, min_axis, ecc, orientation
 
+
+def explicit_orientation(B):
+    """Deterministic blob orientation in degrees for MATLAB/Python parity."""
+    B = np.array(B).astype(np.bool)
+    rows, cols = np.indices(B.shape, dtype=np.float64)
+    x = (cols + 1.0).ravel(order='C')
+    y = (rows + 1.0).ravel(order='C')
+    f = B.astype(np.float64).ravel(order='C')
+
+    m00 = np.float64(0.0)
+    m10 = np.float64(0.0)
+    m01 = np.float64(0.0)
+    for xi, yi, fi in zip(x, y, f):
+        m00 = np.float64(m00 + np.float64(fi))
+        m10 = np.float64(m10 + np.float64(xi * fi))
+        m01 = np.float64(m01 + np.float64(yi * fi))
+
+    if m00 == 0:
+        return 0.0
+
+    xbar = np.float64(m10 / m00)
+    ybar = np.float64(m01 / m00)
+
+    mu20 = np.float64(0.0)
+    mu02 = np.float64(0.0)
+    mu11 = np.float64(0.0)
+    for xi, yi, fi in zip(x, y, f):
+        dx = np.float64(xi - xbar)
+        dy = np.float64(yi - ybar)
+        mu20 = np.float64(mu20 + np.float64(dx * dx * fi))
+        mu02 = np.float64(mu02 + np.float64(dy * dy * fi))
+        mu11 = np.float64(mu11 + np.float64(dx * dy * fi))
+
+    theta = np.float64(-0.5) * np.float64(
+        np.degrees(np.arctan2(np.float64(2.0) * mu11, mu20 - mu02))
+    )
+    while theta > 90.0:
+        theta = np.float64(theta - 180.0)
+    while theta <= -90.0:
+        theta = np.float64(theta + 180.0)
+    return float(theta)
+
 def invmoments(B):
     """compute invariant moments. see
     Digital Image Processing Using MATLAB, ch. 11"""
@@ -181,4 +223,3 @@ def ss(D):
     # flipped across horizontal (major) axis
     bflip = ss(np.flipud(B))
     return b180, b90, bflip
-
diff --git a/ifcb_features/blobs.py b/ifcb_features/blobs.py
index d044c0f..024d5e8 100644
--- a/ifcb_features/blobs.py
+++ b/ifcb_features/blobs.py
@@ -71,21 +71,58 @@ def imrotate_nearest_crop(img, angle_deg):
     """MATLAB-compatible imrotate(..., 'nearest', 'crop') for binary masks."""
     img = np.array(img).astype(np.bool)
     h, w = img.shape
-    # MATLAB uses 1-based coordinates, center at (w+1)/2, (h+1)/2
-    cx = (w + 1) / 2.0
-    cy = (h + 1) / 2.0
 
-    yy, xx = np.indices((h, w))
-    x = xx + 1.0
-    y = yy + 1.0
-    x0 = x - cx
-    y0 = y - cy
+    # MATLAB blob_rotate calls imrotate(centered, -Orientation, ...).  The
+    # Python caller passes the already sign-adjusted orientation, so use the
+    # opposite angle for MATLAB's affine2d/imwarp coordinate convention.
+    ang = np.deg2rad(-angle_deg)
+    cos_a = np.cos(ang)
+    sin_a = np.sin(ang)
 
-    ang = np.deg2rad(angle_deg)
-    cos_a = np.cos(-ang)
-    sin_a = np.sin(-ang)
-    x_in = cos_a * x0 - sin_a * y0 + cx
-    y_in = sin_a * x0 + cos_a * y0 + cy
+    # imrotate's general-rotation crop path is:
+    #   RA = imref2d(size(A));
+    #   Rout = applyGeometricTransformToSpatialRef(RA, tform);
+    #   Rout.ImageSize = RA.ImageSize;
+    #   Rout limits are shifted to preserve the center;
+    #   imwarp(A, tform, 'nearest', 'OutputView', Rout)
+    # Default imref2d world limits are [0.5, size + 0.5].
+    x_limits = (0.5, w + 0.5)
+    y_limits = (0.5, h + 0.5)
+    corners = np.array(
+        [
+            [x_limits[0], y_limits[0]],
+            [x_limits[0], y_limits[1]],
+            [x_limits[1], y_limits[0]],
+            [x_limits[1], y_limits[1]],
+        ],
+        dtype=np.float64,
+    )
+    x_out = corners[:, 0] * cos_a + corners[:, 1] * sin_a
+    y_out = -corners[:, 0] * sin_a + corners[:, 1] * cos_a
+    x_trans = (float(np.min(x_out)) + float(np.max(x_out))) / 2.0 - (
+        x_limits[0] + x_limits[1]
+    ) / 2.0
+    y_trans = (float(np.min(y_out)) + float(np.max(y_out))) / 2.0 - (
+        y_limits[0] + y_limits[1]
+    ) / 2.0
+    x_world_min = x_limits[0] + x_trans
+    y_world_min = y_limits[0] + y_trans
+
+    # MATLAB's imwarp path lands infinitesimally below exact half-pixel
+    # boundaries for the observed tie cases. Nudge the crop reference down
+    # by two ULPs so nearest-neighbor rounding follows the same side.
+    x_world_min = np.nextafter(np.nextafter(x_world_min, -np.inf), -np.inf)
+    y_world_min = np.nextafter(np.nextafter(y_world_min, -np.inf), -np.inf)
+
+    rows, cols = np.indices((h, w), dtype=np.float64)
+    x_world = x_world_min + (cols + 1.0 - 0.5)
+    y_world = y_world_min + (rows + 1.0 - 0.5)
+
+    # transformPointsInverse for affine2d rotation matrix
+    # [cos -sin 0; sin cos 0; 0 0 1], then worldToIntrinsic(RA).
+    # With default RA limits and unit pixel extents, world == intrinsic.
+    x_in = x_world * cos_a - y_world * sin_a
+    y_in = x_world * sin_a + y_world * cos_a
 
     # MATLAB round: ties away from zero (works for negative too).
     x_idx = (np.sign(x_in) * np.floor(np.abs(x_in) + 0.5)).astype(np.int64)