-
Notifications
You must be signed in to change notification settings - Fork 1
/
apply_GSEA.py
109 lines (87 loc) · 4.56 KB
/
apply_GSEA.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
import pandas as pd
import scanpy as sc
import numpy as np
import stlearn as st
import scipy
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt
import os
import sys
from h5py import Dataset, Group
import qnorm
#from sklearn.preprocessing import quantile_transform
import pickle
from scipy import sparse
import pickle
import scipy.linalg
from sklearn.metrics.pairwise import euclidean_distances
import gseapy as gp
from gseapy import gseaplot
import csv
import stlearn as st
from collections import defaultdict
def processFile(f):
df = pd.read_csv(f)
df = df.rename(columns={"symbol": "feature"})
# Replace infinite updated data with nan
df.replace([np.inf, -np.inf], np.nan, inplace=True)
# Drop rows with NaN
df.dropna(subset=["qVal", "pVal", "log2FC"], inplace=True)
# df = df[df["qVal"] < 0.05]
# df = df[(df["log2FC"]).abs() > 1]
df["sample"] = f
# Get maximum fold change values, one per feature.
resDf = df.sort_values("log2FC", ascending=False).drop_duplicates(["feature"])
return resDf
def main(args):
print("hello world! main")
node_list = [86, 59, 48, 10, 13]
name_list = ['10_13_48_59_86', '10_13_48_86_59', '10_13_59_86_48', '13_48_59_86_10', '10_48_59_86_13']
#n_index = 2
for n_index in range (0, len(name_list)):
######## read the gene rank file generated by too-many-cells differential analysis ##########################################
#toomany_label_file = '/cluster/home/t116508uhn/64630/differential_TAGConv_test_r4_14_15_org_whitelist.csv'
toomany_label_file = '/cluster/home/t116508uhn/64630/differential_TAGConv_test_r4_'+name_list[n_index]+'_prerank.csv'
gene_dict=defaultdict(list)
with open(toomany_label_file) as file:
csv_file = csv.reader(file, delimiter=",")
i = 0
for line in csv_file:
if len(line)<1 or i == 0 or np.float64(line[1]) == np.inf or np.float64(line[1]) == -np.inf:
i = i+1
continue
gene_dict[line[0]].append(np.float64(line[1])) # key: gene id. value: rank
i = i+1
for gene in gene_dict:
gene_dict[gene]=np.mean(gene_dict[gene]) # if same gene appears multiple time in the rank file, then take the average rank
data_rnk=pd.DataFrame.from_dict(gene_dict, orient='index')
####################### prepare gene set reference #######################
signature_file='/cluster/home/t116508uhn/64630/Geneset_22Sep21_Subtypesonly_edited.csv' # cancer subtype marker file from Kena
signature_info=defaultdict(list)
with open(signature_file) as file:
tsv_file = csv.reader(file, delimiter=",")
for line in tsv_file:
if (line[0].find('Basal') > -1) or (line[0].find('Classical') > -1) : # record only basal or classical subtype markers
signature_info[line[0]].append(line[1])
signature_info=dict(signature_info)
######################## run prerank to see which genes from the reference are how enriched ###################################
pre_res = gp.prerank(rnk = data_rnk,
gene_sets = signature_info,
threads=4,
min_size=0,
max_size=5000,
permutation_num=1000, # reduce number to speed up testing
outdir=None, # don't write to disk
seed=6,
verbose=True, # see what's going on behind the scenes
)
print(pre_res.res2d)
#pre_res.res2d.to_csv('/cluster/home/t116508uhn/64630/'+'14_vs_15'+'_prerank_tagconv_test_r4.csv')
pre_res.res2d.to_csv('/cluster/home/t116508uhn/64630/'+str(node_list[n_index])+'_prerank_tagconv_test_r4.csv')
terms = pre_res.res2d.Term
for i in range (0, pre_res.res2d.shape[0]):
# save figure
# gseaplot(rank_metric=pre_res.ranking, term=terms[i], ofname=save_path+name_str+'_'+str(i)+'_prerank_tagconv_test_r4.svg', **pre_res.results[terms[i]])
gseaplot(rank_metric=pre_res.ranking, term=terms[i], ofname='/cluster/home/t116508uhn/64630/'+str(node_list[n_index])+'_'+str(i)+'_prerank_tagconv_test_r4.svg', **pre_res.results[terms[i]])
#gseaplot(rank_metric=pre_res.ranking, term=terms[i], ofname='/cluster/home/t116508uhn/64630/'+"14_vs_15"+'_'+str(i)+'_prerank_tagconv_test_r4.svg', **pre_res.results[terms[i]])