Add projection of samples with missing variants in PCA space #428

Open · wants to merge 4 commits into base: master
208 changes: 173 additions & 35 deletions allel/stats/decomposition.py
@@ -5,7 +5,7 @@
from allel.stats.preprocessing import get_scaler


def pca(gn, n_components=10, copy=True, scaler='patterson', ploidy=2):
def pca(gn, n_components=10, copy=True, scaler="patterson", ploidy=2):
"""Perform principal components analysis of genotype data, via singular
value decomposition.

@@ -58,9 +58,7 @@ def pca(gn, n_components=10, copy=True, scaler='patterson', ploidy=2):


class GenotypePCA(object):

def __init__(self, n_components=10, copy=True, scaler='patterson',
ploidy=2):
def __init__(self, n_components=10, copy=True, scaler="patterson", ploidy=2):
self.n_components = n_components
self.copy = copy
self.scaler = scaler
@@ -71,10 +69,10 @@ def fit(self, gn):
return self

def fit_transform(self, gn):
u, s, v = self._fit(gn)
u = u[:, :self.n_components]
u *= s[:self.n_components]
return u
self.u, s, v = self._fit(gn)
self.u = self.u[:, : self.n_components]
self.u *= s[: self.n_components]
return self.u

def _fit(self, gn):
import scipy.linalg
@@ -83,45 +81,112 @@ def _fit(self, gn):
gn = self.scaler_.fit(gn).transform(gn)

# transpose for svd
# TODO eliminate need for transposition
x = gn.T
n_samples, n_features = x.shape

# singular value decomposition
u, s, v = scipy.linalg.svd(x, full_matrices=False)

# calculate explained variance
explained_variance_ = (s ** 2) / n_samples
explained_variance_ratio_ = (explained_variance_ / np.sum(explained_variance_))
explained_variance_ = (s**2) / n_samples
explained_variance_ratio_ = explained_variance_ / np.sum(explained_variance_)

# store variables
n_components = self.n_components
self.components_ = v[:n_components]
self.explained_variance_ = explained_variance_[:n_components]
self.explained_variance_ratio_ = \
explained_variance_ratio_[:n_components]
self.explained_variance_ratio_ = explained_variance_ratio_[:n_components]

return u, s, v

def transform(self, gn, copy=None):
if not hasattr(self, 'components_'):
raise ValueError('model has not been not fitted')
if not hasattr(self, "components_"):
raise ValueError("model has not been not fitted")

# scaling
gn = self.scaler_.transform(gn, copy=copy)

# transpose for transformation
# TODO eliminate need for transposition
x = gn.T

# apply transformation
x_transformed = np.dot(x, self.components_.T)

return x_transformed


def randomized_pca(gn, n_components=10, copy=True, iterated_power=3,
random_state=None, scaler='patterson', ploidy=2):
def project(self, gnp, missing=-1):
"""
Project samples with missing variants using the precomputed eigenvectors.

Parameters
----------
gnp : array_like, float, shape (n_variants, n_samples)
Genotypes at biallelic variants for the samples to project,
coded as the number of alternate alleles per call
(i.e., 0 = hom ref, 1 = het, 2 = hom alt, -1 = missing).
missing : int, optional
Value used to represent missing genotypes in the input data.

Returns
-------
ndarray, float, shape (n_fitted_samples + n_samples, n_components)
PCA coordinates of the samples used to fit the model, stacked above
the coordinates of the projected samples.

Notes
-----

This follows the least squares projection method first implemented in
smartPCA by Patterson in 2013, described here:
https://github.com/chrchang/eigensoft/blob/master/POPGEN/lsqproject.pdf

It is used to project samples with missing variants into the PC space
defined by samples with no missing variants.
"""

if not hasattr(self, "components_"):
raise ValueError("model has not been fitted")
if not hasattr(self, "u"):
raise ValueError("genotype data has not been transformed")

gnp = self.scaler_.transform(gnp, copy=None).astype(
np.float32, copy=False
) # cast to float32 for np.linalg

projected_missing_samples = np.zeros((gnp.shape[1], self.n_components))

for i, sample in enumerate(gnp.T):
# Identify non-missing entries in the current sample
non_missing_variants = sample != missing

# Subset the eigenvectors to only the non-missing variants
eigenvectors_no_missing = self.components_[:, non_missing_variants].T
X_non_missing = sample[non_missing_variants]

# Ensure dimensional compatibility and project using least squares
if eigenvectors_no_missing.shape[1] > 0 and eigenvectors_no_missing.shape[0] == len(X_non_missing):
projected_missing_samples[i] = np.linalg.lstsq(
eigenvectors_no_missing, X_non_missing, rcond=None
)[0]
else:
projected_missing_samples[i] = np.zeros(
self.n_components
) # no non-missing variants to project from; fall back to zeros

# Stack the base samples with the projected missing samples
return np.vstack([self.u, projected_missing_samples])
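
A minimal usage sketch of the project() method added above (not part of this diff; the array sizes and contents are made up for illustration):

import numpy as np
from allel.stats.decomposition import GenotypePCA

# complete genotypes used to fit the model: (n_variants, n_samples),
# coded as 0/1/2 alternate allele counts
gn = np.random.randint(0, 3, size=(1000, 20)).astype(np.int8)

# genotypes to project, with roughly 10% of calls set to missing (-1)
gnp = np.random.randint(0, 3, size=(1000, 5)).astype(np.int8)
gnp[np.random.rand(*gnp.shape) < 0.1] = -1

model = GenotypePCA(n_components=10, scaler="patterson")
coords = model.fit_transform(gn)   # shape (20, 10): fitted samples
all_coords = model.project(gnp)    # shape (25, 10): fitted samples stacked
                                   # above the projected samples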


def randomized_pca(
gn,
n_components=10,
copy=True,
iterated_power=3,
random_state=None,
scaler="patterson",
ploidy=2,
):
"""Perform principal components analysis of genotype data, via an
approximate truncated singular value decomposition using randomization
to speed up the computation.
@@ -173,10 +238,14 @@ def randomized_pca(gn, n_components=10, copy=True, iterated_power=3,
"""

# set up the model
model = GenotypeRandomizedPCA(n_components, copy=copy,
iterated_power=iterated_power,
random_state=random_state, scaler=scaler,
ploidy=ploidy)
model = GenotypeRandomizedPCA(
n_components,
copy=copy,
iterated_power=iterated_power,
random_state=random_state,
scaler=scaler,
ploidy=ploidy,
)

# fit the model and project the input data onto the new dimensions
coords = model.fit_transform(gn)
@@ -185,9 +254,15 @@ def randomized_pca(gn, n_components=10, copy=True, iterated_power=3,


class GenotypeRandomizedPCA(object):

def __init__(self, n_components=10, copy=True, iterated_power=3,
random_state=None, scaler='patterson', ploidy=2):
def __init__(
self,
n_components=10,
copy=True,
iterated_power=3,
random_state=None,
scaler="patterson",
ploidy=2,
):
self.n_components = n_components
self.copy = copy
self.iterated_power = iterated_power
@@ -200,9 +275,9 @@ def fit(self, gn):
return self

def fit_transform(self, gn):
u, s, v = self._fit(gn)
u *= s
return u
self.u, s, v = self._fit(gn)
self.u *= s
return self.u

def _fit(self, gn):
from sklearn.utils.validation import check_random_state
@@ -221,12 +296,12 @@ def _fit(self, gn):
n_samples, n_features = x.shape

# singular value decomposition
u, s, v = randomized_svd(x, n_components,
n_iter=self.iterated_power,
random_state=random_state)
u, s, v = randomized_svd(
x, n_components, n_iter=self.iterated_power, random_state=random_state
)

# calculate explained variance
self.explained_variance_ = exp_var = (s ** 2) / n_samples
self.explained_variance_ = exp_var = (s**2) / n_samples
full_var = np.var(x, axis=0).sum()
self.explained_variance_ratio_ = exp_var / full_var

@@ -236,8 +311,8 @@ def _fit(self, gn):
return u, s, v

def transform(self, gn, copy=None):
if not hasattr(self, 'components_'):
raise ValueError('model has not been not fitted')
if not hasattr(self, "components_"):
raise ValueError("model has not been not fitted")

# scaling
gn = self.scaler_.transform(gn, copy=copy)
@@ -250,3 +325,66 @@ def transform(self, gn, copy=None):
x_transformed = np.dot(x, self.components_.T)

return x_transformed

def project(self, gnp, missing=-1):
"""
Project samples with missing variants using the precomputed eigenvectors.

Parameters
----------
gnp : array_like, float, shape (n_variants, n_samples)
Genotypes at biallelic variants for the samples to project,
coded as the number of alternate alleles per call
(i.e., 0 = hom ref, 1 = het, 2 = hom alt, -1 = missing).
missing : int, optional
Value used to represent missing genotypes in the input data.

Returns
-------
ndarray, float, shape (n_fitted_samples + n_samples, n_components)
PCA coordinates of the samples used to fit the model, stacked above
the coordinates of the projected samples.

Notes
-----

This follows the least squares projection method first implemented in
smartPCA by Patterson in 2013, described here:
https://github.com/chrchang/eigensoft/blob/master/POPGEN/lsqproject.pdf

It is used to project samples with missing variants into the PC space
defined by samples with no missing variants.
"""

if not hasattr(self, "components_"):
raise ValueError("model has not been fitted")
if not hasattr(self, "u"):
raise ValueError("genotype data has not been transformed")

gnp = self.scaler_.transform(gnp, copy=None).astype(
np.float32, copy=False
) # cast to float32 for np.linalg

projected_missing_samples = np.zeros((gnp.shape[1], self.n_components))

for i, sample in enumerate(gnp.T):
# Identify non-missing entries in the current sample
non_missing_variants = sample != missing

# Subset the eigenvectors to only the non-missing variants
eigenvectors_no_missing = self.components_[:, non_missing_variants].T
X_non_missing = sample[non_missing_variants]

# Ensure dimensional compatibility and project using least squares
if eigenvectors_no_missing.shape[1] > 0 and eigenvectors_no_missing.shape[0] == len(X_non_missing):
projected_missing_samples[i] = np.linalg.lstsq(
eigenvectors_no_missing, X_non_missing, rcond=None
)[0]
else:
projected_missing_samples[i] = np.zeros(
self.n_components
) # no non-missing variants to project from; fall back to zeros

# Stack the base samples with the projected missing samples
return np.vstack([self.u, projected_missing_samples])
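
For reference, the per-sample least squares step used in both project() methods reduces to a single np.linalg.lstsq call on the rows of components_ at the observed variants. A standalone NumPy sketch (illustrative placeholder arrays, not part of this diff):

import numpy as np

rng = np.random.default_rng(0)
V = rng.normal(size=(10, 1000))   # stands in for components_: (n_components, n_variants)
g = rng.normal(size=1000)         # one scaled sample, length n_variants
g[rng.random(1000) < 0.1] = -1    # pretend some calls are flagged as missing

obs = g != -1                     # observed (non-missing) variants
# solve argmin_c || V[:, obs].T @ c - g[obs] ||^2 for the sample's coordinates c
coords, *_ = np.linalg.lstsq(V[:, obs].T, g[obs], rcond=None)

# coords has length n_components; when nothing is missing and the rows of V are
# orthonormal (as for the exact SVD), this equals the ordinary projection V @ g
# computed by transform().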