-
Notifications
You must be signed in to change notification settings - Fork 1
/
load_merfish_data_messi.py
188 lines (148 loc) · 7.31 KB
/
load_merfish_data_messi.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
# data downloaded by readydata.py from MESSI
import copy
import os
import pickle
from collections import defaultdict
import itertools
import gzip
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pandas as pd
import scipy as sp
from sklearn.cluster import AgglomerativeClustering
import messi
from messi.data_processing import *
from messi.hme import hme
from messi.gridSearch import gridSearch
# ---------------------------------------------------------------------------
# Run configuration for loading MERFISH data via MESSI.
# ---------------------------------------------------------------------------
input_path = 'input/'
output_path = 'output/'
data_type = 'merfish'

# Sample selection: which animals / cell types are loaded.
sex = 'Male'  # 'Female'
behavior = 'Parenting'  # 'Virgin_Parenting'
behavior_no_space = behavior.replace(" ", "_")
current_cell_type = 'Excitatory'  # 'Microglia' # 'general'
current_cell_type_no_space = current_cell_type.replace(" ", "_")
data_options = sex + '_' + behavior + '_' + current_cell_type

# Model / training hyper-parameters (MESSI hme settings).
grid_search = True  # False
n_sets = 5  # for example usage only; we recommend 5
n_classes_0 = 1
n_classes_1 = 5
n_epochs = 20  # for example usage only; we recommend using the default 20 n_epochs
preprocess = 'neighbor_cat'
top_k_response = None  # 20 # for example usage only; we recommend use all responses (i.e. None)
top_k_regulator = None
response_type = 'original'  # use raw values to fit the model

# Tag used in output names.
# BUG FIX: the original assigned `condition` unconditionally and then
# immediately reassigned it in this if/else — the first assignment was dead
# code and has been removed.
if grid_search:
    condition = f"response_{top_k_response}_l1_{n_classes_0}_l2_grid_search"
else:
    condition = f"response_{top_k_response}_l1_{n_classes_0}_l2_{n_classes_1}"
# Map each supported data type to its reader triple from messi.data_processing:
# (read_meta, read_data, get_idx_per_dataset).
read_in_functions = {
    'merfish': [read_meta_merfish, read_merfish_data,
                get_idx_per_dataset_merfish],
    'merfish_cell_line': [read_meta_merfish_cell_line, read_merfish_cell_line_data,
                          get_idx_per_dataset_merfish_cell_line],
    'starmap': [read_meta_starmap_combinatorial, read_starmap_combinatorial,
                get_idx_per_dataset_starmap_combinatorial],
}

# Select the reader functions for the configured data type.
if data_type not in read_in_functions:
    raise NotImplementedError(f"Now only support processing 'merfish', 'merfish_cell_line' or 'starmap'")
read_meta, read_data, get_idx_per_dataset = read_in_functions[data_type]
# read in ligand and receptor lists
# l_u, r_u = get_lr_pairs(input_path='../messi/input/')  # may need to change to the default value

# Build ligand -> [receptors] from the OmniPath intercell annotation export.
ligand_dict_dataset = defaultdict(list)
OMNIPATH_file = '/cluster/home/t116508uhn/64630/omnipath_records_2023Feb.csv'
df = pd.read_csv(OMNIPATH_file)

# Ligands whose source category is 'cell_surface_ligand' mediate cell-cell
# contact; the dict is used as a set (the values are irrelevant).
cell_cell_contact = dict()

# Iterate the four relevant columns in lockstep instead of positional
# df[col][i] lookups inside a range loop (clearer and avoids repeated
# Series indexing per row).
for ligand, src_cat, receptor, tgt_cat in zip(df['genesymbol_intercell_source'],
                                              df['category_intercell_source'],
                                              df['genesymbol_intercell_target'],
                                              df['category_intercell_target']):
    # Keep only rows where the source is a ligand and the target a receptor.
    if 'ligand' not in src_cat:
        continue
    if 'receptor' not in tgt_cat:
        continue
    ligand_dict_dataset[ligand].append(receptor)
    if src_cat == 'cell_surface_ligand':
        cell_cell_contact[ligand] = ''
######################################
# Deduplicate receptors per ligand and flatten into (ligand, receptor) pairs.
# BUG FIX: the original deduplicated with list(set(...)), whose order is
# hash-randomized, making the pickled pair order non-deterministic across
# runs; dict.fromkeys dedupes while preserving first-seen order.
lr_pairs = []
for gene in list(ligand_dict_dataset.keys()):
    ligand_dict_dataset[gene] = list(dict.fromkeys(ligand_dict_dataset[gene]))
    for receptor_gene in ligand_dict_dataset[gene]:
        lr_pairs.append([gene, receptor_gene])
count = len(lr_pairs)
##################################################################
print(count)
lr_pairs = pd.DataFrame(lr_pairs)
lr_pairs.columns = ['ligand', 'receptor']
# NOTE: a commented-out alternative that parsed 'ligand_receptor_pairs2.txt'
# used to sit here as a dead triple-quoted string literal (a no-op expression
# statement); it has been removed.

# Unique ligand / receptor gene symbols from the OmniPath-derived pairs.
l_u_p = set(lr_pairs['ligand'])
r_u_p = set(lr_pairs['receptor'])
# Optional manual additions of genes of interest,
# e.g. ['CBLN1', 'CXCL14', 'CBLN2', 'VGF', 'SCG2', 'CARTPT', 'TAC2'] and
#      ['CRHBP', 'GABRA1', 'GPR165', 'GLRA3', 'GABRG1', 'ADORA2A'].
l_u_search = []
r_u_search = []
l_u = l_u_p.union(l_u_search)
r_u = r_u_p.union(r_u_search)
# ---------------------------------------------------------------------------
# Read per-cell meta information (meta_all = cell x metadata) and split the
# animals/samples into training and test sets.
# ---------------------------------------------------------------------------
meta_all, meta_all_columns, cell_types_dict, genes_list, genes_list_u, \
    response_list_prior, regulator_list_prior = read_meta('input/', behavior_no_space, sex, l_u, r_u)  # TO BE MODIFIED: number of responses
# genes_list_u = genes_list_us_messi

# Unique animal/sample IDs present in the metadata (e.g. 16, 17, 18, 19).
all_animals = list(set(meta_all[:, meta_all_columns['Animal_ID']]))

test_animal = 16
test_animals = [test_animal]
samples_test = np.array(test_animals)
# NOTE(review): the test animal is NOT excluded from the training set here —
# the subtraction is deliberately commented out, so train and test overlap.
# Confirm this is intentional.
samples_train = np.array(list(set(all_animals)))  # np.array(list(set(all_animals)-set(test_animals)))
print(f"Test set is {samples_test}")
print(f"Training set is {samples_train}")

bregma = None  # no bregma filtering at this stage
idx_train, idx_test, idx_train_in_general, \
    idx_test_in_general, idx_train_in_dataset, \
    idx_test_in_dataset, meta_per_dataset_train, \
    meta_per_dataset_test = find_idx_for_train_test(samples_train, samples_test,
                                                    meta_all, meta_all_columns, data_type,
                                                    current_cell_type, get_idx_per_dataset,
                                                    return_in_general=False,
                                                    bregma=bregma)
##################################################################
# Load each training dataset (one per (animal, bregma) pair), convert the
# MESSI dataframes to numpy, and repackage the per-cell fields for GAT input.
data_sets = []
data_sets_gatconv = []
for animal_id, bregma in meta_per_dataset_train:
    hp, hp_cor, hp_genes = read_data('input/', bregma, animal_id, genes_list, genes_list_u)

    if hp is not None:
        hp_columns = dict(zip(hp.columns, range(len(hp.columns))))
        hp_np = hp.to_numpy()
    else:
        hp_columns = None
        hp_np = None
    hp_cor_columns = dict(zip(hp_cor.columns, range(len(hp_cor.columns))))
    hp_genes_columns = dict(zip(hp_genes.columns, range(len(hp_genes.columns))))
    genes_np = hp_genes.to_numpy()  # computed once; reused below instead of re-converting
    data_sets.append([hp_np, hp_columns, hp_cor.to_numpy(), hp_cor_columns,
                      genes_np, hp_genes_columns])

    # BUG FIX: the original sliced data_sets[i][0] (== hp_np) unconditionally,
    # which raises TypeError whenever read_data returned hp=None; skip the GAT
    # repackaging for such datasets instead of crashing.
    if hp_np is None:
        continue

    # Column layout of hp_np — presumed from the slicing below, TODO confirm:
    #   0: cell barcode, 1-4: animal id / sex / behavior / bregma,
    #   5-6: x/y coordinates, 7+: class / neuron cluster ids.
    cell_barcodes = hp_np[:, 0]
    coordinates = hp_np[:, 5:7]
    cell_vs_animal_id_sex_behavior_bregma = hp_np[:, 1:5]
    cell_vs_class_neuron_cluster_id = hp_np[:, 7:]
    data_sets_gatconv.append([cell_barcodes, coordinates, genes_np, hp_genes_columns,
                              cell_vs_animal_id_sex_behavior_bregma,
                              cell_vs_class_neuron_cluster_id])

del hp, hp_cor, hp_genes

# Persist the repackaged datasets together with the ligand-receptor pairs and
# the cell-cell-contact ligand set as one gzipped pickle.
with gzip.open("/cluster/projects/schwartzgroup/fatema/find_ccc/merfish_mouse_cortex/" + 'messi_merfish_data_' + data_options, 'wb') as fp:
    # with gzip.open("/cluster/projects/schwartzgroup/fatema/find_ccc/" + 'adjacency_records_GAT_synthetic_region1_onlyccc_70', 'wb') as fp:
    pickle.dump([data_sets_gatconv, lr_pairs, cell_cell_contact], fp)