From 2459e53169d73ce3300f510909882f6d112e138f Mon Sep 17 00:00:00 2001 From: maxibor Date: Tue, 24 Sep 2024 16:49:13 +0200 Subject: [PATCH 1/4] add pca projection --- allel/stats/decomposition.py | 189 +++++++++++------------------------ 1 file changed, 56 insertions(+), 133 deletions(-) diff --git a/allel/stats/decomposition.py b/allel/stats/decomposition.py index 43fd30ff..323fbd38 100644 --- a/allel/stats/decomposition.py +++ b/allel/stats/decomposition.py @@ -59,8 +59,7 @@ def pca(gn, n_components=10, copy=True, scaler='patterson', ploidy=2): class GenotypePCA(object): - def __init__(self, n_components=10, copy=True, scaler='patterson', - ploidy=2): + def __init__(self, n_components=10, copy=True, scaler='patterson', ploidy=2): self.n_components = n_components self.copy = copy self.scaler = scaler @@ -71,10 +70,10 @@ def fit(self, gn): return self def fit_transform(self, gn): - u, s, v = self._fit(gn) - u = u[:, :self.n_components] - u *= s[:self.n_components] - return u + self.u, s, v = self._fit(gn) + self.u = self.u[:, :self.n_components] + self.u *= s[:self.n_components] + return self.u def _fit(self, gn): import scipy.linalg @@ -83,7 +82,6 @@ def _fit(self, gn): gn = self.scaler_.fit(gn).transform(gn) # transpose for svd - # TODO eliminate need for transposition x = gn.T n_samples, n_features = x.shape @@ -96,10 +94,9 @@ def _fit(self, gn): # store variables n_components = self.n_components - self.components_ = v[:n_components] + self.components_ = v[:n_components] # Store eigenvectors (principal components) self.explained_variance_ = explained_variance_[:n_components] - self.explained_variance_ratio_ = \ - explained_variance_ratio_[:n_components] + self.explained_variance_ratio_ = explained_variance_ratio_[:n_components] return u, s, v @@ -111,142 +108,68 @@ def transform(self, gn, copy=None): gn = self.scaler_.transform(gn, copy=copy) # transpose for transformation - # TODO eliminate need for transposition x = gn.T # apply transformation - x_transformed = np.dot(x, self.components_.T) + self.x_transformed = np.dot(x, self.components_.T) - return x_transformed + return self.x_transformed + def project(self, gnp, missing=-1): + """ + Project samples with missing variants using the precomputed eigenvectors. -def randomized_pca(gn, n_components=10, copy=True, iterated_power=3, - random_state=None, scaler='patterson', ploidy=2): - """Perform principal components analysis of genotype data, via an - approximate truncated singular value decomposition using randomization - to speed up the computation. + Parameters + ---------- + gnp : array_like, float, shape (n_variants, n_samples) + Genotypes at biallelic variants for samples to project, + coded as the number of alternate alleles per call. + (i.e., 0 = hom ref, 1 = het, 2 = hom alt, -1 = missing). + missing : int, optional + Value used to represent missing genotypes in the input data. - Parameters - ---------- - - gn : array_like, float, shape (n_variants, n_samples) - Genotypes at biallelic variants, coded as the number of alternate - alleles per call (i.e., 0 = hom ref, 1 = het, 2 = hom alt). - n_components : int, optional - Number of components to keep. - copy : bool, optional - If False, data passed to fit are overwritten. - iterated_power : int, optional - Number of iterations for the power method. - random_state : int or RandomState instance or None (default) - Pseudo Random Number generator seed control. If None, use the - numpy.random singleton. - scaler : {'patterson', 'standard', None} - Scaling method; 'patterson' applies the method of Patterson et al - 2006; 'standard' scales to unit variance; None centers the data only. - ploidy : int, optional - Sample ploidy, only relevant if 'patterson' scaler is used. - - Returns - ------- - - coords : ndarray, float, shape (n_samples, n_components) - Transformed coordinates for the samples. - model : GenotypeRandomizedPCA - Model instance containing the variance ratio explained and the stored - components (a.k.a., loadings). Can be used to project further data - into the same principal components space via the transform() method. - - Notes - ----- - - Genotype data should be filtered prior to using this function to remove - variants in linkage disequilibrium. + Returns + ------- + array-like, shape (n_samples, n_components): + PCA projection of both base and projected missing samples. - Based on the :class:`sklearn.decomposition.RandomizedPCA` implementation. + Notes + ----- - See Also - -------- - - pca, allel.stats.ld.locate_unlinked - - """ + This is the implementation of the least squares projection method + first implemented in smartPCA by Patterson in 2013, described here: + https://github.com/chrchang/eigensoft/blob/master/POPGEN/lsqproject.pdf - # set up the model - model = GenotypeRandomizedPCA(n_components, copy=copy, - iterated_power=iterated_power, - random_state=random_state, scaler=scaler, - ploidy=ploidy) + It is used to project samples with missing variants in the PC space defined + by samples with no missing variants. + """ - # fit the model and project the input data onto the new dimensions - coords = model.fit_transform(gn) - - return coords, model - - -class GenotypeRandomizedPCA(object): - - def __init__(self, n_components=10, copy=True, iterated_power=3, - random_state=None, scaler='patterson', ploidy=2): - self.n_components = n_components - self.copy = copy - self.iterated_power = iterated_power - self.random_state = random_state - self.scaler = scaler - self.scaler_ = get_scaler(scaler, copy, ploidy) - - def fit(self, gn): - self._fit(gn) - return self - - def fit_transform(self, gn): - u, s, v = self._fit(gn) - u *= s - return u - - def _fit(self, gn): - from sklearn.utils.validation import check_random_state - from sklearn.utils.extmath import randomized_svd - - # apply scaling - gn = self.scaler_.fit(gn).transform(gn) - - # transpose for svd - # TODO eliminate need for transposition - x = gn.T - - # intermediates - random_state = check_random_state(self.random_state) - n_components = self.n_components - n_samples, n_features = x.shape - - # singular value decomposition - u, s, v = randomized_svd(x, n_components, - n_iter=self.iterated_power, - random_state=random_state) - - # calculate explained variance - self.explained_variance_ = exp_var = (s ** 2) / n_samples - full_var = np.var(x, axis=0).sum() - self.explained_variance_ratio_ = exp_var / full_var - - # store components - self.components_ = v - - return u, s, v - - def transform(self, gn, copy=None): if not hasattr(self, 'components_'): - raise ValueError('model has not been not fitted') + raise ValueError('model has not been fitted') + if not hasattr(self, 'u'): + raise ValueError('genotype data has not been transformed') + + gnp = self.scaler_.transform(gnp, copy=None).astype(np.float32, copy=False) # cast to float32 for np.linalg + + projected_missing_samples = np.zeros((gnp.shape[1], self.n_components)) + + for i, sample in enumerate(gnp.T): + # Identify non-missing entries in the current sample + non_missing_variants = sample != missing + + # Subset the eigenvectors to only the non-missing variants + eigenvectors_no_missing = self.components_[:, non_missing_variants].T + X_non_missing = sample[non_missing_variants] + + # Ensure dimensional compatibility and project using least squares + if eigenvectors_no_missing.shape[1] > 0 and eigenvectors_no_missing.shape[0] == len(X_non_missing): + projected_missing_samples[i] = np.linalg.lstsq(eigenvectors_no_missing, X_non_missing, rcond=None)[0] + else: + projected_missing_samples[i] = np.zeros(self.n_components) # Handle missing projection + + # Stack the base samples with the projected missing samples + return np.vstack([self.u, projected_missing_samples]) - # scaling - gn = self.scaler_.transform(gn, copy=copy) - # transpose for transformation - # TODO eliminate need for transposition - x = gn.T - # apply transformation - x_transformed = np.dot(x, self.components_.T) - return x_transformed From a5f77bae7fa30cd3c63775d8abdb06bfa84235de Mon Sep 17 00:00:00 2001 From: maxibor Date: Tue, 24 Sep 2024 17:06:25 +0200 Subject: [PATCH 2/4] fix: re-add randomized PCA --- allel/stats/decomposition.py | 184 +++++++++++++++++++++++++++++++++++ 1 file changed, 184 insertions(+) diff --git a/allel/stats/decomposition.py b/allel/stats/decomposition.py index 323fbd38..5c2aaaca 100644 --- a/allel/stats/decomposition.py +++ b/allel/stats/decomposition.py @@ -173,3 +173,187 @@ def project(self, gnp, missing=-1): +def randomized_pca(gn, n_components=10, copy=True, iterated_power=3, + random_state=None, scaler='patterson', ploidy=2): + """Perform principal components analysis of genotype data, via an + approximate truncated singular value decomposition using randomization + to speed up the computation. + + Parameters + ---------- + + gn : array_like, float, shape (n_variants, n_samples) + Genotypes at biallelic variants, coded as the number of alternate + alleles per call (i.e., 0 = hom ref, 1 = het, 2 = hom alt). + n_components : int, optional + Number of components to keep. + copy : bool, optional + If False, data passed to fit are overwritten. + iterated_power : int, optional + Number of iterations for the power method. + random_state : int or RandomState instance or None (default) + Pseudo Random Number generator seed control. If None, use the + numpy.random singleton. + scaler : {'patterson', 'standard', None} + Scaling method; 'patterson' applies the method of Patterson et al + 2006; 'standard' scales to unit variance; None centers the data only. + ploidy : int, optional + Sample ploidy, only relevant if 'patterson' scaler is used. + + Returns + ------- + + coords : ndarray, float, shape (n_samples, n_components) + Transformed coordinates for the samples. + model : GenotypeRandomizedPCA + Model instance containing the variance ratio explained and the stored + components (a.k.a., loadings). Can be used to project further data + into the same principal components space via the transform() method. + + Notes + ----- + + Genotype data should be filtered prior to using this function to remove + variants in linkage disequilibrium. + + Based on the :class:`sklearn.decomposition.RandomizedPCA` implementation. + + See Also + -------- + + pca, allel.stats.ld.locate_unlinked + + """ + + # set up the model + model = GenotypeRandomizedPCA(n_components, copy=copy, + iterated_power=iterated_power, + random_state=random_state, scaler=scaler, + ploidy=ploidy) + + # fit the model and project the input data onto the new dimensions + coords = model.fit_transform(gn) + + return coords, model + +class GenotypeRandomizedPCA(object): + + def __init__(self, n_components=10, copy=True, iterated_power=3, + random_state=None, scaler='patterson', ploidy=2): + self.n_components = n_components + self.copy = copy + self.iterated_power = iterated_power + self.random_state = random_state + self.scaler = scaler + self.scaler_ = get_scaler(scaler, copy, ploidy) + + def fit(self, gn): + self._fit(gn) + return self + + def fit_transform(self, gn): + self.u, s, v = self._fit(gn) + self.u *= s + return self.u + + def _fit(self, gn): + from sklearn.utils.validation import check_random_state + from sklearn.utils.extmath import randomized_svd + + # apply scaling + gn = self.scaler_.fit(gn).transform(gn) + + # transpose for svd + # TODO eliminate need for transposition + x = gn.T + + # intermediates + random_state = check_random_state(self.random_state) + n_components = self.n_components + n_samples, n_features = x.shape + + # singular value decomposition + u, s, v = randomized_svd(x, n_components, + n_iter=self.iterated_power, + random_state=random_state) + + # calculate explained variance + self.explained_variance_ = exp_var = (s ** 2) / n_samples + full_var = np.var(x, axis=0).sum() + self.explained_variance_ratio_ = exp_var / full_var + + # store components + self.components_ = v + + return u, s, v + + def transform(self, gn, copy=None): + if not hasattr(self, 'components_'): + raise ValueError('model has not been not fitted') + + # scaling + gn = self.scaler_.transform(gn, copy=copy) + + # transpose for transformation + # TODO eliminate need for transposition + x = gn.T + + # apply transformation + x_transformed = np.dot(x, self.components_.T) + + return x_transformed + + def project(self, gnp, missing=-1): + """ + Project samples with missing variants using the precomputed eigenvectors. + + Parameters + ---------- + gnp : array_like, float, shape (n_variants, n_samples) + Genotypes at biallelic variants for samples to project, + coded as the number of alternate alleles per call. + (i.e., 0 = hom ref, 1 = het, 2 = hom alt, -1 = missing). + missing : int, optional + Value used to represent missing genotypes in the input data. + + Returns + ------- + array-like, shape (n_samples, n_components): + PCA projection of both base and projected missing samples. + + Notes + ----- + + This is the implementation of the least squares projection method + first implemented in smartPCA by Patterson in 2013, described here: + https://github.com/chrchang/eigensoft/blob/master/POPGEN/lsqproject.pdf + + It is used to project samples with missing variants in the PC space defined + by samples with no missing variants. + """ + + if not hasattr(self, 'components_'): + raise ValueError('model has not been fitted') + if not hasattr(self, 'u'): + raise ValueError('genotype data has not been transformed') + + gnp = self.scaler_.transform(gnp, copy=None).astype(np.float32, copy=False) # cast to float32 for np.linalg + + projected_missing_samples = np.zeros((gnp.shape[1], self.n_components)) + + for i, sample in enumerate(gnp.T): + # Identify non-missing entries in the current sample + non_missing_variants = sample != missing + + # Subset the eigenvectors to only the non-missing variants + eigenvectors_no_missing = self.components_[:, non_missing_variants].T + X_non_missing = sample[non_missing_variants] + + # Ensure dimensional compatibility and project using least squares + if eigenvectors_no_missing.shape[1] > 0 and eigenvectors_no_missing.shape[0] == len(X_non_missing): + projected_missing_samples[i] = np.linalg.lstsq(eigenvectors_no_missing, X_non_missing, rcond=None)[0] + else: + projected_missing_samples[i] = np.zeros(self.n_components) # Handle missing projection + + # Stack the base samples with the projected missing samples + return np.vstack([self.u, projected_missing_samples]) \ No newline at end of file From aa0e7b46d9e6b45d1861af704b6ac5b15c290df3 Mon Sep 17 00:00:00 2001 From: maxibor Date: Tue, 24 Sep 2024 17:13:53 +0200 Subject: [PATCH 3/4] cleanup --- allel/stats/decomposition.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/allel/stats/decomposition.py b/allel/stats/decomposition.py index 5c2aaaca..1c4dd950 100644 --- a/allel/stats/decomposition.py +++ b/allel/stats/decomposition.py @@ -94,7 +94,7 @@ def _fit(self, gn): # store variables n_components = self.n_components - self.components_ = v[:n_components] # Store eigenvectors (principal components) + self.components_ = v[:n_components] self.explained_variance_ = explained_variance_[:n_components] self.explained_variance_ratio_ = explained_variance_ratio_[:n_components] @@ -111,9 +111,9 @@ def transform(self, gn, copy=None): x = gn.T # apply transformation - self.x_transformed = np.dot(x, self.components_.T) + x_transformed = np.dot(x, self.components_.T) - return self.x_transformed + return x_transformed def project(self, gnp, missing=-1): """ From 5795f5d5512e866b089e3d26907e740dba427d53 Mon Sep 17 00:00:00 2001 From: maxibor Date: Tue, 24 Sep 2024 17:15:27 +0200 Subject: [PATCH 4/4] black formatting --- allel/stats/decomposition.py | 139 +++++++++++++++++++++-------------- 1 file changed, 85 insertions(+), 54 deletions(-) diff --git a/allel/stats/decomposition.py b/allel/stats/decomposition.py index 1c4dd950..9f42c296 100644 --- a/allel/stats/decomposition.py +++ b/allel/stats/decomposition.py @@ -5,7 +5,7 @@ from allel.stats.preprocessing import get_scaler -def pca(gn, n_components=10, copy=True, scaler='patterson', ploidy=2): +def pca(gn, n_components=10, copy=True, scaler="patterson", ploidy=2): """Perform principal components analysis of genotype data, via singular value decomposition. @@ -58,8 +58,7 @@ def pca(gn, n_components=10, copy=True, scaler='patterson', ploidy=2): class GenotypePCA(object): - - def __init__(self, n_components=10, copy=True, scaler='patterson', ploidy=2): + def __init__(self, n_components=10, copy=True, scaler="patterson", ploidy=2): self.n_components = n_components self.copy = copy self.scaler = scaler @@ -71,8 +70,8 @@ def fit(self, gn): def fit_transform(self, gn): self.u, s, v = self._fit(gn) - self.u = self.u[:, :self.n_components] - self.u *= s[:self.n_components] + self.u = self.u[:, : self.n_components] + self.u *= s[: self.n_components] return self.u def _fit(self, gn): @@ -89,8 +88,8 @@ def _fit(self, gn): u, s, v = scipy.linalg.svd(x, full_matrices=False) # calculate explained variance - explained_variance_ = (s ** 2) / n_samples - explained_variance_ratio_ = (explained_variance_ / np.sum(explained_variance_)) + explained_variance_ = (s**2) / n_samples + explained_variance_ratio_ = explained_variance_ / np.sum(explained_variance_) # store variables n_components = self.n_components @@ -101,8 +100,8 @@ def _fit(self, gn): return u, s, v def transform(self, gn, copy=None): - if not hasattr(self, 'components_'): - raise ValueError('model has not been not fitted') + if not hasattr(self, "components_"): + raise ValueError("model has not been not fitted") # scaling gn = self.scaler_.transform(gn, copy=copy) @@ -122,7 +121,7 @@ def project(self, gnp, missing=-1): Parameters ---------- gnp : array_like, float, shape (n_variants, n_samples) - Genotypes at biallelic variants for samples to project, + Genotypes at biallelic variants for samples to project, coded as the number of alternate alleles per call. (i.e., 0 = hom ref, 1 = het, 2 = hom alt, -1 = missing). missing : int, optional @@ -144,37 +143,50 @@ def project(self, gnp, missing=-1): by samples with no missing variants. """ - if not hasattr(self, 'components_'): - raise ValueError('model has not been fitted') - if not hasattr(self, 'u'): - raise ValueError('genotype data has not been transformed') - - gnp = self.scaler_.transform(gnp, copy=None).astype(np.float32, copy=False) # cast to float32 for np.linalg + if not hasattr(self, "components_"): + raise ValueError("model has not been fitted") + if not hasattr(self, "u"): + raise ValueError("genotype data has not been transformed") + + gnp = self.scaler_.transform(gnp, copy=None).astype( + np.float32, copy=False + ) # cast to float32 for np.linalg projected_missing_samples = np.zeros((gnp.shape[1], self.n_components)) for i, sample in enumerate(gnp.T): # Identify non-missing entries in the current sample non_missing_variants = sample != missing - + # Subset the eigenvectors to only the non-missing variants eigenvectors_no_missing = self.components_[:, non_missing_variants].T X_non_missing = sample[non_missing_variants] - + # Ensure dimensional compatibility and project using least squares - if eigenvectors_no_missing.shape[1] > 0 and eigenvectors_no_missing.shape[0] == len(X_non_missing): - projected_missing_samples[i] = np.linalg.lstsq(eigenvectors_no_missing, X_non_missing, rcond=None)[0] + if eigenvectors_no_missing.shape[1] > 0 and eigenvectors_no_missing.shape[ + 0 + ] == len(X_non_missing): + projected_missing_samples[i] = np.linalg.lstsq( + eigenvectors_no_missing, X_non_missing, rcond=None + )[0] else: - projected_missing_samples[i] = np.zeros(self.n_components) # Handle missing projection - + projected_missing_samples[i] = np.zeros( + self.n_components + ) # Handle missing projection + # Stack the base samples with the projected missing samples return np.vstack([self.u, projected_missing_samples]) - - -def randomized_pca(gn, n_components=10, copy=True, iterated_power=3, - random_state=None, scaler='patterson', ploidy=2): +def randomized_pca( + gn, + n_components=10, + copy=True, + iterated_power=3, + random_state=None, + scaler="patterson", + ploidy=2, +): """Perform principal components analysis of genotype data, via an approximate truncated singular value decomposition using randomization to speed up the computation. @@ -226,20 +238,31 @@ def randomized_pca(gn, n_components=10, copy=True, iterated_power=3, """ # set up the model - model = GenotypeRandomizedPCA(n_components, copy=copy, - iterated_power=iterated_power, - random_state=random_state, scaler=scaler, - ploidy=ploidy) + model = GenotypeRandomizedPCA( + n_components, + copy=copy, + iterated_power=iterated_power, + random_state=random_state, + scaler=scaler, + ploidy=ploidy, + ) # fit the model and project the input data onto the new dimensions coords = model.fit_transform(gn) return coords, model -class GenotypeRandomizedPCA(object): - def __init__(self, n_components=10, copy=True, iterated_power=3, - random_state=None, scaler='patterson', ploidy=2): +class GenotypeRandomizedPCA(object): + def __init__( + self, + n_components=10, + copy=True, + iterated_power=3, + random_state=None, + scaler="patterson", + ploidy=2, + ): self.n_components = n_components self.copy = copy self.iterated_power = iterated_power @@ -273,12 +296,12 @@ def _fit(self, gn): n_samples, n_features = x.shape # singular value decomposition - u, s, v = randomized_svd(x, n_components, - n_iter=self.iterated_power, - random_state=random_state) + u, s, v = randomized_svd( + x, n_components, n_iter=self.iterated_power, random_state=random_state + ) # calculate explained variance - self.explained_variance_ = exp_var = (s ** 2) / n_samples + self.explained_variance_ = exp_var = (s**2) / n_samples full_var = np.var(x, axis=0).sum() self.explained_variance_ratio_ = exp_var / full_var @@ -288,8 +311,8 @@ def _fit(self, gn): return u, s, v def transform(self, gn, copy=None): - if not hasattr(self, 'components_'): - raise ValueError('model has not been not fitted') + if not hasattr(self, "components_"): + raise ValueError("model has not been not fitted") # scaling gn = self.scaler_.transform(gn, copy=copy) @@ -302,7 +325,7 @@ def transform(self, gn, copy=None): x_transformed = np.dot(x, self.components_.T) return x_transformed - + def project(self, gnp, missing=-1): """ Project samples with missing variants using the precomputed eigenvectors. @@ -310,7 +333,7 @@ def project(self, gnp, missing=-1): Parameters ---------- gnp : array_like, float, shape (n_variants, n_samples) - Genotypes at biallelic variants for samples to project, + Genotypes at biallelic variants for samples to project, coded as the number of alternate alleles per call. (i.e., 0 = hom ref, 1 = het, 2 = hom alt, -1 = missing). missing : int, optional @@ -332,28 +355,36 @@ def project(self, gnp, missing=-1): by samples with no missing variants. """ - if not hasattr(self, 'components_'): - raise ValueError('model has not been fitted') - if not hasattr(self, 'u'): - raise ValueError('genotype data has not been transformed') - - gnp = self.scaler_.transform(gnp, copy=None).astype(np.float32, copy=False) # cast to float32 for np.linalg + if not hasattr(self, "components_"): + raise ValueError("model has not been fitted") + if not hasattr(self, "u"): + raise ValueError("genotype data has not been transformed") + + gnp = self.scaler_.transform(gnp, copy=None).astype( + np.float32, copy=False + ) # cast to float32 for np.linalg projected_missing_samples = np.zeros((gnp.shape[1], self.n_components)) for i, sample in enumerate(gnp.T): # Identify non-missing entries in the current sample non_missing_variants = sample != missing - + # Subset the eigenvectors to only the non-missing variants eigenvectors_no_missing = self.components_[:, non_missing_variants].T X_non_missing = sample[non_missing_variants] - + # Ensure dimensional compatibility and project using least squares - if eigenvectors_no_missing.shape[1] > 0 and eigenvectors_no_missing.shape[0] == len(X_non_missing): - projected_missing_samples[i] = np.linalg.lstsq(eigenvectors_no_missing, X_non_missing, rcond=None)[0] + if eigenvectors_no_missing.shape[1] > 0 and eigenvectors_no_missing.shape[ + 0 + ] == len(X_non_missing): + projected_missing_samples[i] = np.linalg.lstsq( + eigenvectors_no_missing, X_non_missing, rcond=None + )[0] else: - projected_missing_samples[i] = np.zeros(self.n_components) # Handle missing projection - + projected_missing_samples[i] = np.zeros( + self.n_components + ) # Handle missing projection + # Stack the base samples with the projected missing samples - return np.vstack([self.u, projected_missing_samples]) \ No newline at end of file + return np.vstack([self.u, projected_missing_samples])