-
Notifications
You must be signed in to change notification settings - Fork 1
/
batchCorrectionMethods.py
executable file
·124 lines (96 loc) · 3.75 KB
/
batchCorrectionMethods.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
from sklearn.decomposition import PCA
import numpy as np
import pandas as pd
import anndata as an
import scanpy as sc
# Batch correction methods
def throw_out_marker_genes(data_in, metadata, n_throw_out=2):
    """Drop batch-marker genes found by differential expression across GSE groups.

    Samples (GSMs) whose GSE series has fewer than 2 members are excluded
    before ranking (a t-test needs at least two samples per group). The top
    ``n_throw_out`` ranked gene names of every GSE group are then removed as
    columns. Returns a new DataFrame; ``data_in`` is not modified.
    """
    data = data_in.copy()
    # Indices of samples belonging to singleton GSE series — too small to test.
    singleton_gsms = [
        idx
        for gse in metadata.gse.unique()
        if len(metadata[metadata.gse == gse]) < 2
        for idx in metadata[metadata.gse == gse].index
    ]
    adata = an.AnnData(X=data.drop(singleton_gsms), obs=metadata.drop(singleton_gsms))
    sc.pp.neighbors(adata, use_rep='X')
    sc.tl.rank_genes_groups(adata, 'gse', method='t-test')
    # Union of the top-ranked gene names across all groups, rank by rank.
    genes_to_drop = set()
    for rank in range(n_throw_out):
        genes_to_drop |= set(adata.uns['rank_genes_groups']['names'][rank])
    data.drop(columns=genes_to_drop, inplace=True)
    print(data.shape)
    return data
def reducePCAMetaData(metasub, metatype, adata):
    """Regress the dominant PCA components of a metadata-defined subset out of ``adata.X``.

    Parameters
    ----------
    metasub : str or list
        When ``metatype == 'gse'``: a list of GSE ids selecting the subset.
        Otherwise: a single value of ``adata.obs[metatype]``, or ``'all'`` to
        fit the PCA on every sample.
    metatype : str
        Name of the obs column used for subsetting ('gse' is special-cased).
    adata : anndata.AnnData
        Expression matrix with per-sample metadata in ``.obs``.

    Returns
    -------
    anndata.AnnData
        A copy of ``adata`` with the selected principal components projected
        out of ``X``. The input ``adata`` is returned unchanged when the
        subset has fewer than 5 samples.
    """
    if metatype == 'gse':
        # metasub is a list of GSE identifiers here.
        adata_sub = adata[adata.obs['gse'].isin(metasub)]
    elif metasub == 'all':
        adata_sub = adata.copy()
    else:
        adata_sub = adata[adata.obs[metatype] == metasub]
    if adata_sub.shape[0] < 5:
        # BUG FIX: metasub may be a list; the original 'str' + metasub
        # concatenation raised TypeError in the 'gse' branch.
        print(f'too few samples for {metasub} in {metatype}')
        return adata
    # One component per ~5 subset samples, floored at 3.
    n_component = max(int(adata_sub.shape[0] / 5), 3)
    pca = PCA(n_components=n_component)
    pca_fit = pca.fit(adata_sub.X)
    print(f'Variance ratios:{pca_fit.explained_variance_ratio_}')
    # Only remove components that each explain >10% of the subset variance.
    n_throw_out = (pca_fit.explained_variance_ratio_ > 0.1).sum()
    components = pca_fit.components_[:n_throw_out]
    # Project the FULL data matrix onto the subset-fitted components.
    factors = np.matmul(adata.X, components.T)
    adata_new = adata.copy()
    # Subtract each component's contribution from the data matrix.
    for i in range(n_throw_out):
        adata_new.X = adata_new.X - np.outer(factors[:, i], components[i])
    return adata_new
def getPCAMetaData(metasub, metatype, adata):
    """Fit PCA on a metadata-defined subset and return its dominant components and factors.

    Parameters
    ----------
    metasub : str or list
        When ``metatype == 'gse'``: a list of GSE ids selecting the subset.
        Otherwise: a single value of ``adata.obs[metatype]``, or ``'all'``.
    metatype : str
        Name of the obs column used for subsetting ('gse' is special-cased).
    adata : anndata.AnnData
        Expression matrix with per-sample metadata in ``.obs``.

    Returns
    -------
    tuple of (numpy.ndarray, numpy.ndarray)
        ``(components, factors)``: principal axes that each explain >30% of
        the subset variance, and the projection of the full ``adata.X`` onto
        them.
        NOTE(review): when the subset has fewer than 5 samples the function
        returns the unmodified ``adata`` instead of a tuple — callers must
        handle this inconsistent return; confirm whether raising or returning
        an empty tuple was intended.
    """
    if metatype == 'gse':
        # metasub is a list of GSE identifiers here.
        adata_sub = adata[adata.obs['gse'].isin(metasub)]
    elif metasub == 'all':
        adata_sub = adata.copy()
    else:
        adata_sub = adata[adata.obs[metatype] == metasub]
    if adata_sub.shape[0] < 5:
        # BUG FIX: metasub may be a list; the original 'str' + metasub
        # concatenation raised TypeError in the 'gse' branch.
        print(f'too few samples for {metasub} in {metatype}')
        return adata
    # One component per ~3 subset samples, floored at 3.
    n_component = max(int(adata_sub.shape[0] / 3), 3)
    pca = PCA(n_components=n_component)
    pca_fit = pca.fit(adata_sub.X)
    print(f'Variance ratios:{pca_fit.explained_variance_ratio_}')
    # Keep only components that each explain >30% of the subset variance.
    n_keep = (pca_fit.explained_variance_ratio_ > 0.3).sum()
    components = pca_fit.components_[:n_keep]
    factors = np.matmul(adata.X, components.T)
    return components, factors
def throw_out_pca_zerohops(data_in, metadata, df_meta, obs='ZeroHop'):
    """Sequentially regress out per-group PCA components for every ZeroHop group.

    Groups found in ``metadata[obs]`` are processed in ascending order of
    their sample counts (read from ``df_meta['n_samples']``); the string
    'nan' group is skipped. Each group is passed through
    ``reducePCAMetaData``. Returns the corrected matrix as a new DataFrame.
    """
    adata = an.AnnData(X=data_in.copy(), obs=metadata)
    sc.pp.neighbors(adata, use_rep='X')
    constr_name = 'pca_'  # name bookkeeping kept for parity with the original
    # Ascending by sample count; sorted() is stable, matching the original
    # index-argsort ordering for ties.
    groups = sorted(metadata[obs].unique(), key=lambda g: df_meta.loc[g, 'n_samples'])
    for group in groups:
        if group == 'nan':
            # NOTE(review): only the literal string 'nan' is skipped — verify
            # missing values are stored as that string, not as float NaN.
            continue
        constr_name += group + '_'
        adata = reducePCAMetaData(group, obs, adata)
    return pd.DataFrame(adata.X, columns=adata.var_names.values, index=adata.obs.index)