diff --git a/allel/stats/decomposition.py b/allel/stats/decomposition.py
index 43fd30ff..9f42c296 100644
--- a/allel/stats/decomposition.py
+++ b/allel/stats/decomposition.py
@@ -5,7 +5,7 @@
 from allel.stats.preprocessing import get_scaler
 
 
-def pca(gn, n_components=10, copy=True, scaler='patterson', ploidy=2):
+def pca(gn, n_components=10, copy=True, scaler="patterson", ploidy=2):
     """Perform principal components analysis of genotype data, via singular
     value decomposition.
 
@@ -58,9 +58,7 @@ def pca(gn, n_components=10, copy=True, scaler="patterson", ploidy=2):
 
 
 class GenotypePCA(object):
-
-    def __init__(self, n_components=10, copy=True, scaler='patterson',
-                 ploidy=2):
+    def __init__(self, n_components=10, copy=True, scaler="patterson", ploidy=2):
         self.n_components = n_components
         self.copy = copy
         self.scaler = scaler
@@ -71,10 +69,10 @@ def fit(self, gn):
         return self
 
     def fit_transform(self, gn):
-        u, s, v = self._fit(gn)
-        u = u[:, :self.n_components]
-        u *= s[:self.n_components]
-        return u
+        self.u, s, v = self._fit(gn)
+        self.u = self.u[:, : self.n_components]
+        self.u *= s[: self.n_components]
+        return self.u
 
     def _fit(self, gn):
         import scipy.linalg
@@ -83,7 +81,6 @@ def _fit(self, gn):
         gn = self.scaler_.fit(gn).transform(gn)
 
         # transpose for svd
-        # TODO eliminate need for transposition
         x = gn.T
         n_samples, n_features = x.shape
 
@@ -91,27 +88,25 @@ def _fit(self, gn):
         u, s, v = scipy.linalg.svd(x, full_matrices=False)
 
         # calculate explained variance
-        explained_variance_ = (s ** 2) / n_samples
-        explained_variance_ratio_ = (explained_variance_ / np.sum(explained_variance_))
+        explained_variance_ = (s**2) / n_samples
+        explained_variance_ratio_ = explained_variance_ / np.sum(explained_variance_)
 
         # store variables
         n_components = self.n_components
         self.components_ = v[:n_components]
         self.explained_variance_ = explained_variance_[:n_components]
-        self.explained_variance_ratio_ = \
-            explained_variance_ratio_[:n_components]
+        self.explained_variance_ratio_ = explained_variance_ratio_[:n_components]
 
         return u, s, v
 
     def transform(self, gn, copy=None):
-        if not hasattr(self, 'components_'):
-            raise ValueError('model has not been not fitted')
+        if not hasattr(self, "components_"):
+            raise ValueError("model has not been fitted")
 
         # scaling
         gn = self.scaler_.transform(gn, copy=copy)
 
         # transpose for transformation
-        # TODO eliminate need for transposition
         x = gn.T
 
         # apply transformation
@@ -119,9 +114,79 @@ def transform(self, gn, copy=None):
         x_transformed = np.dot(x, self.components_.T)
 
         return x_transformed
 
-
-def randomized_pca(gn, n_components=10, copy=True, iterated_power=3,
-                   random_state=None, scaler='patterson', ploidy=2):
+    def project(self, gnp, missing=-1):
+        """
+        Project samples with missing variants using the precomputed eigenvectors.
+
+        Parameters
+        ----------
+        gnp : array_like, float, shape (n_variants, n_samples)
+            Genotypes at biallelic variants for samples to project,
+            coded as the number of alternate alleles per call.
+            (i.e., 0 = hom ref, 1 = het, 2 = hom alt, -1 = missing).
+        missing : int, optional
+            Value used to represent missing genotypes in the input data.
+
+        Returns
+        -------
+        array-like, shape (n_samples, n_components):
+            PCA projection of both base and projected missing samples.
+
+        Notes
+        -----
+
+        This is the implementation of the least squares projection method
+        first implemented in smartPCA by Patterson in 2013, described here:
+        https://github.com/chrchang/eigensoft/blob/master/POPGEN/lsqproject.pdf
+
+        It is used to project samples with missing variants in the PC space defined
+        by samples with no missing variants.
+        """
+
+        if not hasattr(self, "components_"):
+            raise ValueError("model has not been fitted")
+        if not hasattr(self, "u"):
+            raise ValueError("genotype data has not been transformed")
+
+        # locate missing calls BEFORE scaling, which replaces the sentinel value
+        missing_mask = np.asarray(gnp) == missing
+        gnp = self.scaler_.transform(gnp, copy=None).astype(np.float32, copy=False)
+
+        projected_missing_samples = np.zeros((gnp.shape[1], self.n_components))
+
+        for i, sample in enumerate(gnp.T):
+            # select the calls that are present for this sample
+            non_missing_variants = ~missing_mask[:, i]
+
+            # Subset the eigenvectors to only the non-missing variants
+            eigenvectors_no_missing = self.components_[:, non_missing_variants].T
+            X_non_missing = sample[non_missing_variants]
+
+            # Project via least squares when the sample has usable calls
+            if len(X_non_missing) > 0 and eigenvectors_no_missing.shape[0] == len(
+                X_non_missing
+            ):
+                projected_missing_samples[i] = np.linalg.lstsq(
+                    eigenvectors_no_missing, X_non_missing, rcond=None
+                )[0]
+            else:
+                projected_missing_samples[i] = np.zeros(
+                    self.n_components
+                )  # no usable calls: leave at the origin
+
+        # Stack the base samples with the projected missing samples
+        return np.vstack([self.u, projected_missing_samples])
+
+
+def randomized_pca(
+    gn,
+    n_components=10,
+    copy=True,
+    iterated_power=3,
+    random_state=None,
+    scaler="patterson",
+    ploidy=2,
+):
     """Perform principal components analysis of genotype data, via an
     approximate truncated singular value decomposition using
@@ -173,10 +238,14 @@ def randomized_pca(gn, n_components=10, copy=True, iterated_power=3,
     """
 
     # set up the model
-    model = GenotypeRandomizedPCA(n_components, copy=copy,
-                                  iterated_power=iterated_power,
-                                  random_state=random_state, scaler=scaler,
-                                  ploidy=ploidy)
+    model = GenotypeRandomizedPCA(
+        n_components,
+        copy=copy,
+        iterated_power=iterated_power,
+        random_state=random_state,
+        scaler=scaler,
+        ploidy=ploidy,
+    )
 
     # fit the model and project the input data onto the new dimensions
     coords = model.fit_transform(gn)
@@ -185,9 +254,15 @@ def randomized_pca(gn, n_components=10, copy=True, iterated_power=3,
 
 
 class GenotypeRandomizedPCA(object):
-
-    def __init__(self, n_components=10, copy=True, iterated_power=3,
-                 random_state=None, scaler='patterson', ploidy=2):
+    def __init__(
+        self,
+        n_components=10,
+        copy=True,
+        iterated_power=3,
+        random_state=None,
+        scaler="patterson",
+        ploidy=2,
+    ):
         self.n_components = n_components
         self.copy = copy
         self.iterated_power = iterated_power
@@ -200,9 +275,9 @@ def fit(self, gn):
         return self
 
     def fit_transform(self, gn):
-        u, s, v = self._fit(gn)
-        u *= s
-        return u
+        self.u, s, v = self._fit(gn)
+        self.u *= s
+        return self.u
 
     def _fit(self, gn):
         from sklearn.utils.validation import check_random_state
@@ -221,12 +296,12 @@ def _fit(self, gn):
         n_samples, n_features = x.shape
 
         # singular value decomposition
-        u, s, v = randomized_svd(x, n_components,
-                                 n_iter=self.iterated_power,
-                                 random_state=random_state)
+        u, s, v = randomized_svd(
+            x, n_components, n_iter=self.iterated_power, random_state=random_state
+        )
 
         # calculate explained variance
-        self.explained_variance_ = exp_var = (s ** 2) / n_samples
+        self.explained_variance_ = exp_var = (s**2) / n_samples
         full_var = np.var(x, axis=0).sum()
         self.explained_variance_ratio_ = exp_var / full_var
 
@@ -236,8 +311,8 @@ def _fit(self, gn):
         return u, s, v
 
     def transform(self, gn, copy=None):
-        if not hasattr(self, 'components_'):
-            raise ValueError('model has not been not fitted')
+        if not hasattr(self, "components_"):
+            raise ValueError("model has not been fitted")
 
         # scaling
         gn = self.scaler_.transform(gn, copy=copy)
@@ -250,3 +325,66 @@ def transform(self, gn, copy=None):
         x_transformed = np.dot(x, self.components_.T)
 
         return x_transformed
+
+    def project(self, gnp, missing=-1):
+        """
+        Project samples with missing variants using the precomputed eigenvectors.
+
+        Parameters
+        ----------
+        gnp : array_like, float, shape (n_variants, n_samples)
+            Genotypes at biallelic variants for samples to project,
+            coded as the number of alternate alleles per call.
+            (i.e., 0 = hom ref, 1 = het, 2 = hom alt, -1 = missing).
+        missing : int, optional
+            Value used to represent missing genotypes in the input data.
+
+        Returns
+        -------
+        array-like, shape (n_samples, n_components):
+            PCA projection of both base and projected missing samples.
+
+        Notes
+        -----
+
+        This is the implementation of the least squares projection method
+        first implemented in smartPCA by Patterson in 2013, described here:
+        https://github.com/chrchang/eigensoft/blob/master/POPGEN/lsqproject.pdf
+
+        It is used to project samples with missing variants in the PC space defined
+        by samples with no missing variants.
+        """
+
+        if not hasattr(self, "components_"):
+            raise ValueError("model has not been fitted")
+        if not hasattr(self, "u"):
+            raise ValueError("genotype data has not been transformed")
+
+        # locate missing calls BEFORE scaling, which replaces the sentinel value
+        missing_mask = np.asarray(gnp) == missing
+        gnp = self.scaler_.transform(gnp, copy=None).astype(np.float32, copy=False)
+
+        projected_missing_samples = np.zeros((gnp.shape[1], self.n_components))
+
+        for i, sample in enumerate(gnp.T):
+            # select the calls that are present for this sample
+            non_missing_variants = ~missing_mask[:, i]
+
+            # Subset the eigenvectors to only the non-missing variants
+            eigenvectors_no_missing = self.components_[:, non_missing_variants].T
+            X_non_missing = sample[non_missing_variants]
+
+            # Project via least squares when the sample has usable calls
+            if len(X_non_missing) > 0 and eigenvectors_no_missing.shape[0] == len(
+                X_non_missing
+            ):
+                projected_missing_samples[i] = np.linalg.lstsq(
+                    eigenvectors_no_missing, X_non_missing, rcond=None
+                )[0]
+            else:
+                projected_missing_samples[i] = np.zeros(
+                    self.n_components
+                )  # no usable calls: leave at the origin
+
+        # Stack the base samples with the projected missing samples
+        return np.vstack([self.u, projected_missing_samples])