-
Notifications
You must be signed in to change notification settings - Fork 1
/
NEST_output_by_component.py
174 lines (147 loc) · 8.39 KB
/
NEST_output_by_component.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
import numpy as np
import csv
import pickle
from scipy import sparse
import scanpy as sc
import matplotlib
matplotlib.use('Agg')
#matplotlib.use('TkAgg')
import matplotlib.pyplot as plt
import stlearn as st
import numpy as np
from matplotlib.colors import LinearSegmentedColormap, to_hex, rgb2hex
#from typing import List
import qnorm
from scipy.sparse import csr_matrix
#from scipy.sparse.csgraph import connected_components
from collections import defaultdict
import pandas as pd
import gzip
#import copy
import argparse
import os
#sys.path.append("/home/gw/code/utility/altairThemes/")
#if True: # In order to bypass isort when saving
# import altairThemes
import altairThemes
import altair as alt
alt.themes.register("publishTheme", altairThemes.publishTheme)
# enable the newly registered theme
alt.themes.enable("publishTheme")
##########################################################
# readCsv, preprocessDf, plot: these three functions are taken from GW's repository /mnt/data0/gw/research/notta_pancreatic_cancer_visium/plots/fatema_signaling/hist.py
import scipy.stats
def preprocessDf(df):
"""Transform ligand and receptor columns."""
df["ligand-receptor"] = df["ligand"] + '-' + df["receptor"]
df["component"] = df["component"] #.astype(str).str.zfill(2)
return df
def plot(df):
set1 = altairThemes.get_colour_scheme("Set1", len(df["component"].unique()))
set1[0] = '#000000'
base = alt.Chart(df).mark_bar().encode(
x=alt.X("ligand-receptor:N", axis=alt.Axis(labelAngle=45), sort='-y'),
y=alt.Y("count()"),
color=alt.Color("component:N", scale = alt.Scale(range=set1)),
order=alt.Order("component:N", sort="ascending"),
tooltip=["component"]
)
p = base
return p
##alt.Color("component:N", scale = alt.Scale(range=set1))
####################### Set the name of the sample you want to visualize ###################################
data_name = 'PDAC_64630' #LUAD_GSM5702473_TD1
component_of_interest = [14, 15, 47, 60, 29, 38, 83, 17, 18]
current_directory = '/cluster/home/t116508uhn/64630/'
################################################################################
parser = argparse.ArgumentParser()
parser.add_argument( '--data_path', type=str, default='/cluster/projects/schwartzgroup/fatema/pancreatic_cancer_visium/210827_A00827_0396_BHJLJTDRXY_Notta_Karen/V10M25-61_D1_PDA_64630_Pa_P_Spatial10x_new/outs/' , help='The path to dataset')
parser.add_argument( '--embedding_data_path', type=str, default='new_alignment/Embedding_data_ccc_rgcn/' , help='The path to attention') #'/cluster/projects/schwartzgroup/fatema/pancreatic_cancer_visium/210827_A00827_0396_BHJLJTDRXY_Notta_Karen/V10M25-61_D1_PDA_64630_Pa_P_Spatial10x_new/outs/'
parser.add_argument( '--data_name', type=str, default='PDAC_64630', help='The name of dataset')
args = parser.parse_args()
'''
elif data_name == 'PDAC_140694':
parser = argparse.ArgumentParser()
parser.add_argument( '--data_path', type=str, default='/cluster/projects/schwartzgroup/fatema/pancreatic_cancer_visium/V10M25-60_C1_PDA_140694_Pa_P_Spatial10x/outs/' , help='The path to dataset')
parser.add_argument( '--embedding_data_path', type=str, default='new_alignment/Embedding_data_ccc_rgcn/' , help='The path to attention') #'/cluster/projects/schwartzgroup/fatema/pancreatic_cancer_visium/210827_A00827_0396_BHJLJTDRXY_Notta_Karen/V10M25-61_D1_PDA_64630_Pa_P_Spatial10x_new/outs/'
parser.add_argument( '--data_name', type=str, default='PDAC_140694', help='The name of dataset')
args = parser.parse_args()
'''
####### get the gene id, cell barcode, cell coordinates ######
adata_h5 = st.Read10X(path=args.data_path, count_file='filtered_feature_bc_matrix.h5') #count_file=args.data_name+'_filtered_feature_bc_matrix.h5' )
print(adata_h5)
sc.pp.filter_genes(adata_h5, min_cells=1)
print(adata_h5)
gene_ids = list(adata_h5.var_names)
coordinates = adata_h5.obsm['spatial']
cell_barcode = np.array(adata_h5.obs.index)
temp = adata_h5.X
######################### get the cell vs gene matrix - we don not need that. Just in case someone is interested ##################
'''
temp = qnorm.quantile_normalize(np.transpose(sparse.csr_matrix.toarray(temp)))
cell_vs_gene = np.transpose(temp)
'''
######################### get the cell vs cell distance matrix - we don not need that. Just in case someone is interested ##################
'''
from sklearn.metrics.pairwise import euclidean_distances
distance_matrix = euclidean_distances(coordinates, coordinates)
'''
##################### make cell metadata: barcode_info ###################################
i=0
barcode_info=[]
for cell_code in cell_barcode:
barcode_info.append([cell_code, coordinates[i,0],coordinates[i,1], 0]) # last entry will hold the component number later
i=i+1
####### load annotations ##############################################
pathologist_label_file='/cluster/home/t116508uhn/IX_annotation_artifacts.csv' #IX_annotation_artifacts.csv' #
pathologist_label=[]
with open(pathologist_label_file) as file:
csv_file = csv.reader(file, delimiter=",")
for line in csv_file:
pathologist_label.append(line)
barcode_type=dict() # record the type (annotation) of each spot (barcode)
for i in range (1, len(pathologist_label)):
barcode_type[pathologist_label[i][0]] = pathologist_label[i][1]
######################### read the NEST output in csv format ####################################################
# # columns are: from_cell, to_cell, ligand_gene, receptor_gene, attention_score, component, from_id, to_id
filename_str = 'NEST_combined_output_'+args.data_name+'.csv'
inFile = current_directory +filename_str
df = pd.read_csv(inFile, sep=",")
csv_record_final = df.values.tolist()
df_column_names = list(df.columns)
csv_record_final = [df_column_names] + csv_record_final
######################## count how many unique components ######################################
component_list = dict()
for record_idx in range (1, len(csv_record_final)-1): #last entry is a dummy for histograms, so ignore it.
record = csv_record_final[record_idx]
component_label = record[5]
component_list[component_label] = ''
component_list[0] = ''
unique_component_count = len(component_list.keys())
######################## filter the records ########################################################
## change the csv_record_final here if you want histogram for specific components/regions only. e.g., if you want to plot only stroma region, or tumor-stroma regions etc. ##
temp_csv_record_final = []
temp_csv_record_final.append(csv_record_final[0]) #last entry is a dummy for histograms
for record_idx in range (1, len(csv_record_final)-1): #last entry is a dummy for histograms, so ignore it.
# if at least one of ligand and receptors are tumors:
if (barcode_type[csv_record_final[record_idx][0]] == 'tumor' or barcode_type[csv_record_final[record_idx][1]] == 'tumor'): # or (barcode_type[csv_record_final[record_idx][0]] != 'tumor' and barcode_type[csv_record_final[record_idx][1]] != 'tumor')):
if csv_record_final[record_idx][5] in component_of_interest:
temp_csv_record_final.append(csv_record_final[record_idx]) # it is considered during making histogram.
temp_csv_record_final.append(csv_record_final[len(csv_record_final)-1]) #last entry is a dummy for histograms
csv_record_final = temp_csv_record_final
################################### Histogram plotting #################################################################################
'''
set1 = altairThemes.get_colour_scheme("Set1", unique_component_count)
set1[0] = '#000000'
csv_record_final[0].append('comp_color')
for record_idx in range (1, len(csv_record_final)):
csv_record_final[record_idx].append(set1[int(csv_record_final[record_idx][5])])
'''
df = pd.DataFrame(csv_record_final)
df.to_csv(current_directory+'temp_csv.csv', index=False, header=False)
df = pd.read_csv(current_directory+'temp_csv.csv', sep=",")
os.remove(current_directory+'temp_csv.csv') # delete the intermediate file
df = preprocessDf(df)
p = plot(df) #, unique_component_count
outPath = current_directory+'histogram_test.html'
p.save(outPath)