From 2459e53169d73ce3300f510909882f6d112e138f Mon Sep 17 00:00:00 2001
From: maxibor <maxime.borry@gmail.com>
Date: Tue, 24 Sep 2024 16:49:13 +0200
Subject: [PATCH 1/4] add pca projection

---
 allel/stats/decomposition.py | 189 +++++++++++------------------------
 1 file changed, 56 insertions(+), 133 deletions(-)

diff --git a/allel/stats/decomposition.py b/allel/stats/decomposition.py
index 43fd30ff..323fbd38 100644
--- a/allel/stats/decomposition.py
+++ b/allel/stats/decomposition.py
@@ -59,8 +59,7 @@ def pca(gn, n_components=10, copy=True, scaler='patterson', ploidy=2):
 
 class GenotypePCA(object):
 
-    def __init__(self, n_components=10, copy=True, scaler='patterson',
-                 ploidy=2):
+    def __init__(self, n_components=10, copy=True, scaler='patterson', ploidy=2):
         self.n_components = n_components
         self.copy = copy
         self.scaler = scaler
@@ -71,10 +70,10 @@ def fit(self, gn):
         return self
 
     def fit_transform(self, gn):
-        u, s, v = self._fit(gn)
-        u = u[:, :self.n_components]
-        u *= s[:self.n_components]
-        return u
+        self.u, s, v = self._fit(gn)
+        self.u = self.u[:, :self.n_components]
+        self.u *= s[:self.n_components]
+        return self.u
 
     def _fit(self, gn):
         import scipy.linalg
@@ -83,7 +82,6 @@ def _fit(self, gn):
         gn = self.scaler_.fit(gn).transform(gn)
 
         # transpose for svd
-        # TODO eliminate need for transposition
         x = gn.T
         n_samples, n_features = x.shape
 
@@ -96,10 +94,9 @@ def _fit(self, gn):
 
         # store variables
         n_components = self.n_components
-        self.components_ = v[:n_components]
+        self.components_ = v[:n_components]  # Store eigenvectors (principal components)
         self.explained_variance_ = explained_variance_[:n_components]
-        self.explained_variance_ratio_ = \
-            explained_variance_ratio_[:n_components]
+        self.explained_variance_ratio_ = explained_variance_ratio_[:n_components]
 
         return u, s, v
 
@@ -111,142 +108,68 @@ def transform(self, gn, copy=None):
         gn = self.scaler_.transform(gn, copy=copy)
 
         # transpose for transformation
-        # TODO eliminate need for transposition
         x = gn.T
 
         # apply transformation
-        x_transformed = np.dot(x, self.components_.T)
+        self.x_transformed = np.dot(x, self.components_.T)
 
-        return x_transformed
+        return self.x_transformed
 
+    def project(self, gnp, missing=-1):
+        """
+        Project samples with missing variants using the precomputed eigenvectors.
 
-def randomized_pca(gn, n_components=10, copy=True, iterated_power=3,
-                   random_state=None, scaler='patterson', ploidy=2):
-    """Perform principal components analysis of genotype data, via an
-    approximate truncated singular value decomposition using randomization
-    to speed up the computation.
+        Parameters
+        ----------
+        gnp : array_like, float, shape (n_variants, n_samples)
+            Genotypes at biallelic variants for samples to project, 
+            coded as the number of alternate alleles per call.
+            (i.e., 0 = hom ref, 1 = het, 2 = hom alt, -1 = missing).
+        missing : int, optional
+            Value used to represent missing genotypes in the input data.
 
-    Parameters
-    ----------
-
-    gn : array_like, float, shape (n_variants, n_samples)
-        Genotypes at biallelic variants, coded as the number of alternate
-        alleles per call (i.e., 0 = hom ref, 1 = het, 2 = hom alt).
-    n_components : int, optional
-        Number of components to keep.
-    copy : bool, optional
-        If False, data passed to fit are overwritten.
-    iterated_power : int, optional
-        Number of iterations for the power method.
-    random_state : int or RandomState instance or None (default)
-        Pseudo Random Number generator seed control. If None, use the
-        numpy.random singleton.
-    scaler : {'patterson', 'standard', None}
-        Scaling method; 'patterson' applies the method of Patterson et al
-        2006; 'standard' scales to unit variance; None centers the data only.
-    ploidy : int, optional
-        Sample ploidy, only relevant if 'patterson' scaler is used.
-
-    Returns
-    -------
-
-    coords : ndarray, float, shape (n_samples, n_components)
-        Transformed coordinates for the samples.
-    model : GenotypeRandomizedPCA
-        Model instance containing the variance ratio explained and the stored
-        components (a.k.a., loadings). Can be used to project further data
-        into the same principal components space via the transform() method.
-
-    Notes
-    -----
-
-    Genotype data should be filtered prior to using this function to remove
-    variants in linkage disequilibrium.
+        Returns
+        -------
+        array-like, shape (n_samples, n_components):
+            PCA projection of both base and projected missing samples.
 
-    Based on the :class:`sklearn.decomposition.RandomizedPCA` implementation.
+        Notes
+        -----
 
-    See Also
-    --------
-
-    pca, allel.stats.ld.locate_unlinked
-
-    """
+        This is the implementation of the least squares projection method
+        first implemented in smartPCA by Patterson in 2013, described here:
+        https://github.com/chrchang/eigensoft/blob/master/POPGEN/lsqproject.pdf
 
-    # set up the model
-    model = GenotypeRandomizedPCA(n_components, copy=copy,
-                                  iterated_power=iterated_power,
-                                  random_state=random_state, scaler=scaler,
-                                  ploidy=ploidy)
+        It is used to project samples with missing variants in the PC space defined
+        by samples with no missing variants.
+        """
 
-    # fit the model and project the input data onto the new dimensions
-    coords = model.fit_transform(gn)
-
-    return coords, model
-
-
-class GenotypeRandomizedPCA(object):
-
-    def __init__(self, n_components=10, copy=True, iterated_power=3,
-                 random_state=None, scaler='patterson', ploidy=2):
-        self.n_components = n_components
-        self.copy = copy
-        self.iterated_power = iterated_power
-        self.random_state = random_state
-        self.scaler = scaler
-        self.scaler_ = get_scaler(scaler, copy, ploidy)
-
-    def fit(self, gn):
-        self._fit(gn)
-        return self
-
-    def fit_transform(self, gn):
-        u, s, v = self._fit(gn)
-        u *= s
-        return u
-
-    def _fit(self, gn):
-        from sklearn.utils.validation import check_random_state
-        from sklearn.utils.extmath import randomized_svd
-
-        # apply scaling
-        gn = self.scaler_.fit(gn).transform(gn)
-
-        # transpose for svd
-        # TODO eliminate need for transposition
-        x = gn.T
-
-        # intermediates
-        random_state = check_random_state(self.random_state)
-        n_components = self.n_components
-        n_samples, n_features = x.shape
-
-        # singular value decomposition
-        u, s, v = randomized_svd(x, n_components,
-                                 n_iter=self.iterated_power,
-                                 random_state=random_state)
-
-        # calculate explained variance
-        self.explained_variance_ = exp_var = (s ** 2) / n_samples
-        full_var = np.var(x, axis=0).sum()
-        self.explained_variance_ratio_ = exp_var / full_var
-
-        # store components
-        self.components_ = v
-
-        return u, s, v
-
-    def transform(self, gn, copy=None):
         if not hasattr(self, 'components_'):
-            raise ValueError('model has not been not fitted')
+            raise ValueError('model has not been fitted')
+        if not hasattr(self, 'u'):
+            raise ValueError('genotype data has not been transformed')
+        
+        gnp = self.scaler_.transform(gnp, copy=None).astype(np.float32, copy=False) # cast to float32 for np.linalg
+
+        projected_missing_samples = np.zeros((gnp.shape[1], self.n_components))
+
+        for i, sample in enumerate(gnp.T):
+            # Identify non-missing entries in the current sample
+            non_missing_variants = sample != missing
+            
+            # Subset the eigenvectors to only the non-missing variants
+            eigenvectors_no_missing = self.components_[:, non_missing_variants].T
+            X_non_missing = sample[non_missing_variants]
+            
+            # Ensure dimensional compatibility and project using least squares
+            if eigenvectors_no_missing.shape[1] > 0 and eigenvectors_no_missing.shape[0] == len(X_non_missing):
+                projected_missing_samples[i] = np.linalg.lstsq(eigenvectors_no_missing, X_non_missing, rcond=None)[0]
+            else:
+                projected_missing_samples[i] = np.zeros(self.n_components)  # Handle missing projection
+    
+        # Stack the base samples with the projected missing samples
+        return np.vstack([self.u, projected_missing_samples])
 
-        # scaling
-        gn = self.scaler_.transform(gn, copy=copy)
 
-        # transpose for transformation
-        # TODO eliminate need for transposition
-        x = gn.T
 
-        # apply transformation
-        x_transformed = np.dot(x, self.components_.T)
 
-        return x_transformed

From a5f77bae7fa30cd3c63775d8abdb06bfa84235de Mon Sep 17 00:00:00 2001
From: maxibor <maxime.borry@gmail.com>
Date: Tue, 24 Sep 2024 17:06:25 +0200
Subject: [PATCH 2/4] fix: re-add randomized PCA

---
 allel/stats/decomposition.py | 184 +++++++++++++++++++++++++++++++++++
 1 file changed, 184 insertions(+)

diff --git a/allel/stats/decomposition.py b/allel/stats/decomposition.py
index 323fbd38..5c2aaaca 100644
--- a/allel/stats/decomposition.py
+++ b/allel/stats/decomposition.py
@@ -173,3 +173,187 @@ def project(self, gnp, missing=-1):
 
 
 
+def randomized_pca(gn, n_components=10, copy=True, iterated_power=3,
+                   random_state=None, scaler='patterson', ploidy=2):
+    """Perform principal components analysis of genotype data, via an
+    approximate truncated singular value decomposition using randomization
+    to speed up the computation.
+
+    Parameters
+    ----------
+
+    gn : array_like, float, shape (n_variants, n_samples)
+        Genotypes at biallelic variants, coded as the number of alternate
+        alleles per call (i.e., 0 = hom ref, 1 = het, 2 = hom alt).
+    n_components : int, optional
+        Number of components to keep.
+    copy : bool, optional
+        If False, data passed to fit are overwritten.
+    iterated_power : int, optional
+        Number of iterations for the power method.
+    random_state : int or RandomState instance or None (default)
+        Pseudo Random Number generator seed control. If None, use the
+        numpy.random singleton.
+    scaler : {'patterson', 'standard', None}
+        Scaling method; 'patterson' applies the method of Patterson et al
+        2006; 'standard' scales to unit variance; None centers the data only.
+    ploidy : int, optional
+        Sample ploidy, only relevant if 'patterson' scaler is used.
+
+    Returns
+    -------
+
+    coords : ndarray, float, shape (n_samples, n_components)
+        Transformed coordinates for the samples.
+    model : GenotypeRandomizedPCA
+        Model instance containing the variance ratio explained and the stored
+        components (a.k.a., loadings). Can be used to project further data
+        into the same principal components space via the transform() method.
+
+    Notes
+    -----
+
+    Genotype data should be filtered prior to using this function to remove
+    variants in linkage disequilibrium.
+
+    Based on the :class:`sklearn.decomposition.RandomizedPCA` implementation.
+
+    See Also
+    --------
+
+    pca, allel.stats.ld.locate_unlinked
+
+    """
+
+    # set up the model
+    model = GenotypeRandomizedPCA(n_components, copy=copy,
+                                  iterated_power=iterated_power,
+                                  random_state=random_state, scaler=scaler,
+                                  ploidy=ploidy)
+
+    # fit the model and project the input data onto the new dimensions
+    coords = model.fit_transform(gn)
+
+    return coords, model
+
+class GenotypeRandomizedPCA(object):
+
+    def __init__(self, n_components=10, copy=True, iterated_power=3,
+                 random_state=None, scaler='patterson', ploidy=2):
+        self.n_components = n_components
+        self.copy = copy
+        self.iterated_power = iterated_power
+        self.random_state = random_state
+        self.scaler = scaler
+        self.scaler_ = get_scaler(scaler, copy, ploidy)
+
+    def fit(self, gn):
+        self._fit(gn)
+        return self
+
+    def fit_transform(self, gn):
+        self.u, s, v = self._fit(gn)
+        self.u *= s
+        return self.u
+
+    def _fit(self, gn):
+        from sklearn.utils.validation import check_random_state
+        from sklearn.utils.extmath import randomized_svd
+
+        # apply scaling
+        gn = self.scaler_.fit(gn).transform(gn)
+
+        # transpose for svd
+        # TODO eliminate need for transposition
+        x = gn.T
+
+        # intermediates
+        random_state = check_random_state(self.random_state)
+        n_components = self.n_components
+        n_samples, n_features = x.shape
+
+        # singular value decomposition
+        u, s, v = randomized_svd(x, n_components,
+                                 n_iter=self.iterated_power,
+                                 random_state=random_state)
+
+        # calculate explained variance
+        self.explained_variance_ = exp_var = (s ** 2) / n_samples
+        full_var = np.var(x, axis=0).sum()
+        self.explained_variance_ratio_ = exp_var / full_var
+
+        # store components
+        self.components_ = v
+
+        return u, s, v
+
+    def transform(self, gn, copy=None):
+        if not hasattr(self, 'components_'):
+            raise ValueError('model has not been not fitted')
+
+        # scaling
+        gn = self.scaler_.transform(gn, copy=copy)
+
+        # transpose for transformation
+        # TODO eliminate need for transposition
+        x = gn.T
+
+        # apply transformation
+        x_transformed = np.dot(x, self.components_.T)
+
+        return x_transformed
+    
+    def project(self, gnp, missing=-1):
+        """
+        Project samples with missing variants using the precomputed eigenvectors.
+
+        Parameters
+        ----------
+        gnp : array_like, float, shape (n_variants, n_samples)
+            Genotypes at biallelic variants for samples to project, 
+            coded as the number of alternate alleles per call.
+            (i.e., 0 = hom ref, 1 = het, 2 = hom alt, -1 = missing).
+        missing : int, optional
+            Value used to represent missing genotypes in the input data.
+
+        Returns
+        -------
+        array-like, shape (n_samples, n_components):
+            PCA projection of both base and projected missing samples.
+
+        Notes
+        -----
+
+        This is the implementation of the least squares projection method
+        first implemented in smartPCA by Patterson in 2013, described here:
+        https://github.com/chrchang/eigensoft/blob/master/POPGEN/lsqproject.pdf
+
+        It is used to project samples with missing variants in the PC space defined
+        by samples with no missing variants.
+        """
+
+        if not hasattr(self, 'components_'):
+            raise ValueError('model has not been fitted')
+        if not hasattr(self, 'u'):
+            raise ValueError('genotype data has not been transformed')
+        
+        gnp = self.scaler_.transform(gnp, copy=None).astype(np.float32, copy=False) # cast to float32 for np.linalg
+
+        projected_missing_samples = np.zeros((gnp.shape[1], self.n_components))
+
+        for i, sample in enumerate(gnp.T):
+            # Identify non-missing entries in the current sample
+            non_missing_variants = sample != missing
+            
+            # Subset the eigenvectors to only the non-missing variants
+            eigenvectors_no_missing = self.components_[:, non_missing_variants].T
+            X_non_missing = sample[non_missing_variants]
+            
+            # Ensure dimensional compatibility and project using least squares
+            if eigenvectors_no_missing.shape[1] > 0 and eigenvectors_no_missing.shape[0] == len(X_non_missing):
+                projected_missing_samples[i] = np.linalg.lstsq(eigenvectors_no_missing, X_non_missing, rcond=None)[0]
+            else:
+                projected_missing_samples[i] = np.zeros(self.n_components)  # Handle missing projection
+    
+        # Stack the base samples with the projected missing samples
+        return np.vstack([self.u, projected_missing_samples])
\ No newline at end of file

From aa0e7b46d9e6b45d1861af704b6ac5b15c290df3 Mon Sep 17 00:00:00 2001
From: maxibor <maxime.borry@gmail.com>
Date: Tue, 24 Sep 2024 17:13:53 +0200
Subject: [PATCH 3/4] cleanup

---
 allel/stats/decomposition.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/allel/stats/decomposition.py b/allel/stats/decomposition.py
index 5c2aaaca..1c4dd950 100644
--- a/allel/stats/decomposition.py
+++ b/allel/stats/decomposition.py
@@ -94,7 +94,7 @@ def _fit(self, gn):
 
         # store variables
         n_components = self.n_components
-        self.components_ = v[:n_components]  # Store eigenvectors (principal components)
+        self.components_ = v[:n_components]
         self.explained_variance_ = explained_variance_[:n_components]
         self.explained_variance_ratio_ = explained_variance_ratio_[:n_components]
 
@@ -111,9 +111,9 @@ def transform(self, gn, copy=None):
         x = gn.T
 
         # apply transformation
-        self.x_transformed = np.dot(x, self.components_.T)
+        x_transformed = np.dot(x, self.components_.T)
 
-        return self.x_transformed
+        return x_transformed
 
     def project(self, gnp, missing=-1):
         """

From 5795f5d5512e866b089e3d26907e740dba427d53 Mon Sep 17 00:00:00 2001
From: maxibor <maxime.borry@gmail.com>
Date: Tue, 24 Sep 2024 17:15:27 +0200
Subject: [PATCH 4/4] black formatting

---
 allel/stats/decomposition.py | 139 +++++++++++++++++++++--------------
 1 file changed, 85 insertions(+), 54 deletions(-)

diff --git a/allel/stats/decomposition.py b/allel/stats/decomposition.py
index 1c4dd950..9f42c296 100644
--- a/allel/stats/decomposition.py
+++ b/allel/stats/decomposition.py
@@ -5,7 +5,7 @@
 from allel.stats.preprocessing import get_scaler
 
 
-def pca(gn, n_components=10, copy=True, scaler='patterson', ploidy=2):
+def pca(gn, n_components=10, copy=True, scaler="patterson", ploidy=2):
     """Perform principal components analysis of genotype data, via singular
     value decomposition.
 
@@ -58,8 +58,7 @@ def pca(gn, n_components=10, copy=True, scaler='patterson', ploidy=2):
 
 
 class GenotypePCA(object):
-
-    def __init__(self, n_components=10, copy=True, scaler='patterson', ploidy=2):
+    def __init__(self, n_components=10, copy=True, scaler="patterson", ploidy=2):
         self.n_components = n_components
         self.copy = copy
         self.scaler = scaler
@@ -71,8 +70,8 @@ def fit(self, gn):
 
     def fit_transform(self, gn):
         self.u, s, v = self._fit(gn)
-        self.u = self.u[:, :self.n_components]
-        self.u *= s[:self.n_components]
+        self.u = self.u[:, : self.n_components]
+        self.u *= s[: self.n_components]
         return self.u
 
     def _fit(self, gn):
@@ -89,8 +88,8 @@ def _fit(self, gn):
         u, s, v = scipy.linalg.svd(x, full_matrices=False)
 
         # calculate explained variance
-        explained_variance_ = (s ** 2) / n_samples
-        explained_variance_ratio_ = (explained_variance_ / np.sum(explained_variance_))
+        explained_variance_ = (s**2) / n_samples
+        explained_variance_ratio_ = explained_variance_ / np.sum(explained_variance_)
 
         # store variables
         n_components = self.n_components
@@ -101,8 +100,8 @@ def _fit(self, gn):
         return u, s, v
 
     def transform(self, gn, copy=None):
-        if not hasattr(self, 'components_'):
-            raise ValueError('model has not been not fitted')
+        if not hasattr(self, "components_"):
+            raise ValueError("model has not been not fitted")
 
         # scaling
         gn = self.scaler_.transform(gn, copy=copy)
@@ -122,7 +121,7 @@ def project(self, gnp, missing=-1):
         Parameters
         ----------
         gnp : array_like, float, shape (n_variants, n_samples)
-            Genotypes at biallelic variants for samples to project, 
+            Genotypes at biallelic variants for samples to project,
             coded as the number of alternate alleles per call.
             (i.e., 0 = hom ref, 1 = het, 2 = hom alt, -1 = missing).
         missing : int, optional
@@ -144,37 +143,50 @@ def project(self, gnp, missing=-1):
         by samples with no missing variants.
         """
 
-        if not hasattr(self, 'components_'):
-            raise ValueError('model has not been fitted')
-        if not hasattr(self, 'u'):
-            raise ValueError('genotype data has not been transformed')
-        
-        gnp = self.scaler_.transform(gnp, copy=None).astype(np.float32, copy=False) # cast to float32 for np.linalg
+        if not hasattr(self, "components_"):
+            raise ValueError("model has not been fitted")
+        if not hasattr(self, "u"):
+            raise ValueError("genotype data has not been transformed")
+
+        gnp = self.scaler_.transform(gnp, copy=None).astype(
+            np.float32, copy=False
+        )  # cast to float32 for np.linalg
 
         projected_missing_samples = np.zeros((gnp.shape[1], self.n_components))
 
         for i, sample in enumerate(gnp.T):
             # Identify non-missing entries in the current sample
             non_missing_variants = sample != missing
-            
+
             # Subset the eigenvectors to only the non-missing variants
             eigenvectors_no_missing = self.components_[:, non_missing_variants].T
             X_non_missing = sample[non_missing_variants]
-            
+
             # Ensure dimensional compatibility and project using least squares
-            if eigenvectors_no_missing.shape[1] > 0 and eigenvectors_no_missing.shape[0] == len(X_non_missing):
-                projected_missing_samples[i] = np.linalg.lstsq(eigenvectors_no_missing, X_non_missing, rcond=None)[0]
+            if eigenvectors_no_missing.shape[1] > 0 and eigenvectors_no_missing.shape[
+                0
+            ] == len(X_non_missing):
+                projected_missing_samples[i] = np.linalg.lstsq(
+                    eigenvectors_no_missing, X_non_missing, rcond=None
+                )[0]
             else:
-                projected_missing_samples[i] = np.zeros(self.n_components)  # Handle missing projection
-    
+                projected_missing_samples[i] = np.zeros(
+                    self.n_components
+                )  # Handle missing projection
+
         # Stack the base samples with the projected missing samples
         return np.vstack([self.u, projected_missing_samples])
 
 
-
-
-def randomized_pca(gn, n_components=10, copy=True, iterated_power=3,
-                   random_state=None, scaler='patterson', ploidy=2):
+def randomized_pca(
+    gn,
+    n_components=10,
+    copy=True,
+    iterated_power=3,
+    random_state=None,
+    scaler="patterson",
+    ploidy=2,
+):
     """Perform principal components analysis of genotype data, via an
     approximate truncated singular value decomposition using randomization
     to speed up the computation.
@@ -226,20 +238,31 @@ def randomized_pca(gn, n_components=10, copy=True, iterated_power=3,
     """
 
     # set up the model
-    model = GenotypeRandomizedPCA(n_components, copy=copy,
-                                  iterated_power=iterated_power,
-                                  random_state=random_state, scaler=scaler,
-                                  ploidy=ploidy)
+    model = GenotypeRandomizedPCA(
+        n_components,
+        copy=copy,
+        iterated_power=iterated_power,
+        random_state=random_state,
+        scaler=scaler,
+        ploidy=ploidy,
+    )
 
     # fit the model and project the input data onto the new dimensions
     coords = model.fit_transform(gn)
 
     return coords, model
 
-class GenotypeRandomizedPCA(object):
 
-    def __init__(self, n_components=10, copy=True, iterated_power=3,
-                 random_state=None, scaler='patterson', ploidy=2):
+class GenotypeRandomizedPCA(object):
+    def __init__(
+        self,
+        n_components=10,
+        copy=True,
+        iterated_power=3,
+        random_state=None,
+        scaler="patterson",
+        ploidy=2,
+    ):
         self.n_components = n_components
         self.copy = copy
         self.iterated_power = iterated_power
@@ -273,12 +296,12 @@ def _fit(self, gn):
         n_samples, n_features = x.shape
 
         # singular value decomposition
-        u, s, v = randomized_svd(x, n_components,
-                                 n_iter=self.iterated_power,
-                                 random_state=random_state)
+        u, s, v = randomized_svd(
+            x, n_components, n_iter=self.iterated_power, random_state=random_state
+        )
 
         # calculate explained variance
-        self.explained_variance_ = exp_var = (s ** 2) / n_samples
+        self.explained_variance_ = exp_var = (s**2) / n_samples
         full_var = np.var(x, axis=0).sum()
         self.explained_variance_ratio_ = exp_var / full_var
 
@@ -288,8 +311,8 @@ def _fit(self, gn):
         return u, s, v
 
     def transform(self, gn, copy=None):
-        if not hasattr(self, 'components_'):
-            raise ValueError('model has not been not fitted')
+        if not hasattr(self, "components_"):
+            raise ValueError("model has not been not fitted")
 
         # scaling
         gn = self.scaler_.transform(gn, copy=copy)
@@ -302,7 +325,7 @@ def transform(self, gn, copy=None):
         x_transformed = np.dot(x, self.components_.T)
 
         return x_transformed
-    
+
     def project(self, gnp, missing=-1):
         """
         Project samples with missing variants using the precomputed eigenvectors.
@@ -310,7 +333,7 @@ def project(self, gnp, missing=-1):
         Parameters
         ----------
         gnp : array_like, float, shape (n_variants, n_samples)
-            Genotypes at biallelic variants for samples to project, 
+            Genotypes at biallelic variants for samples to project,
             coded as the number of alternate alleles per call.
             (i.e., 0 = hom ref, 1 = het, 2 = hom alt, -1 = missing).
         missing : int, optional
@@ -332,28 +355,36 @@ def project(self, gnp, missing=-1):
         by samples with no missing variants.
         """
 
-        if not hasattr(self, 'components_'):
-            raise ValueError('model has not been fitted')
-        if not hasattr(self, 'u'):
-            raise ValueError('genotype data has not been transformed')
-        
-        gnp = self.scaler_.transform(gnp, copy=None).astype(np.float32, copy=False) # cast to float32 for np.linalg
+        if not hasattr(self, "components_"):
+            raise ValueError("model has not been fitted")
+        if not hasattr(self, "u"):
+            raise ValueError("genotype data has not been transformed")
+
+        gnp = self.scaler_.transform(gnp, copy=None).astype(
+            np.float32, copy=False
+        )  # cast to float32 for np.linalg
 
         projected_missing_samples = np.zeros((gnp.shape[1], self.n_components))
 
         for i, sample in enumerate(gnp.T):
             # Identify non-missing entries in the current sample
             non_missing_variants = sample != missing
-            
+
             # Subset the eigenvectors to only the non-missing variants
             eigenvectors_no_missing = self.components_[:, non_missing_variants].T
             X_non_missing = sample[non_missing_variants]
-            
+
             # Ensure dimensional compatibility and project using least squares
-            if eigenvectors_no_missing.shape[1] > 0 and eigenvectors_no_missing.shape[0] == len(X_non_missing):
-                projected_missing_samples[i] = np.linalg.lstsq(eigenvectors_no_missing, X_non_missing, rcond=None)[0]
+            if eigenvectors_no_missing.shape[1] > 0 and eigenvectors_no_missing.shape[
+                0
+            ] == len(X_non_missing):
+                projected_missing_samples[i] = np.linalg.lstsq(
+                    eigenvectors_no_missing, X_non_missing, rcond=None
+                )[0]
             else:
-                projected_missing_samples[i] = np.zeros(self.n_components)  # Handle missing projection
-    
+                projected_missing_samples[i] = np.zeros(
+                    self.n_components
+                )  # Handle missing projection
+
         # Stack the base samples with the projected missing samples
-        return np.vstack([self.u, projected_missing_samples])
\ No newline at end of file
+        return np.vstack([self.u, projected_missing_samples])