-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathfund_parser.py
executable file
·144 lines (127 loc) · 7.67 KB
/
fund_parser.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
#!/usr/bin/env -python
import base64
import logging
import unicodedata
from collections import defaultdict
from openpyxl import load_workbook
from adrian_code.publications_api import Publications_api
js_file_temp = '''
var fundStat = {};
'''
label_map = {'Advanced Light Microscopy (ALM)': 'Advanced Light Microscopy',
'Ancient DNA': 'Ancient DNA',
'Autoimmunity Profiling': 'Autoimmunity Profiling',
'BioImage Informatics': 'BioImage Informatics',
'Cell Profiling': 'Cell Profiling',
'Chemical Biology Consortium Sweden (CBCS)': 'Chemical Biology Consortium Sweden',
'Chemical Proteomics & Proteogenomics': 'Chemical Proteomics and Proteogenomics',
'Clinical Genomics Gothenburg': 'Clinical Genomics Gothenburg',
'Clinical Genomics Lund': 'Clinical Genomics Lund',
'Clinical Genomics Stockholm': 'Clinical Genomics Stockholm',
'Clinical Genomics Uppsala': 'Clinical Genomics Uppsala',
'Bioinformatics Compute and Storage': 'Compute and Storage',
'Cryo-EM': 'Cryo-EM',
'Drug Discovery and Development (DDD)': 'Drug Discovery and Development',
'Eukaryotic Single Cell Genomics (ESCG)': 'Eukaryotic Single Cell Genomics',
'Genome Engineering Zebrafish': 'Genome Engineering Zebrafish',
'High-throughput Genome Engineering (HTGE)': 'High Throughput Genome Engineering',
'Bioinformatics Long-term Support WABI': 'Bioinformatics Long-term Support',
'Mass Cytometry': 'Mass Cytometry',
'Microbial Single Cell Genomics': 'Microbial Single Cell Genomics',
'NGI Stockholm (Genomics Production)': 'NGI Stockholm',
'NGI Stockholm (Genomics Applications)': 'NGI Stockholm',
'NGI Uppsala (SNP&SEQ Technology Platform)': 'NGI Uppsala SNP&SEQ',
'NGI Uppsala (Uppsala Genome Center)': 'NGI Uppsala UGC',
'PLA and Single Cell Proteomics': 'PLA and Single Cell Proteomics',
'Plasma Profiling': 'Plasma Profiling',
'Protein Science Facility (PSF)': 'Protein Science Facility',
'Bioinformatics Support and Infrastructure': 'Bioinformatics Support and Infrastructure',
'Swedish Metabolomics Centre (SMC)': 'Swedish Metabolomics Centre',
'Swedish NMR Centre (SNC)': 'Swedish NMR Centre',
'Systems Biology': 'Bioinformatics Systems Biology'}
platform_map = {"Advanced Light Microscopy" : "Cellular and Molecular Imaging",
"Ancient DNA" : "Genomics",
"Autoimmunity Profiling" : "Proteomics and Metabolomics",
"BioImage Informatics" : "Cellular and Molecular Imaging",
"Cell Profiling" : "Cellular and Molecular Imaging",
"Chemical Biology Consortium Sweden" : "Chemical Biology and Genome Engineering",
"Chemical Proteomics and Proteogenomics" : "Proteomics and Metabolomics",
"Clinical Genomics Gothenburg" : "Diagnostics Development",
"Clinical Genomics Lund" : "Diagnostics Development",
"Clinical Genomics Stockholm" : "Diagnostics Development",
"Clinical Genomics Uppsala" : "Diagnostics Development",
"Compute and Storage" : "Bioinformatics",
"Cryo-EM" : "Cellular and Molecular Imaging",
"Drug Discovery and Development" : "Drug Discovery and Development",
"Eukaryotic Single Cell Genomics" : "Genomics",
"Genome Engineering Zebrafish" : "Chemical Biology and Genome Engineering",
"High Throughput Genome Engineering" : "Chemical Biology and Genome Engineering",
"In Situ Sequencing" : "Genomics",
"Bioinformatics Long-term Support" : "Bioinformatics",
"Mass Cytometry" : "Proteomics and Metabolomics",
"Microbial Single Cell Genomics" : "Genomics",
"NGI Stockholm" : "Genomics",
"NGI Uppsala SNP&SEQ" : "Genomics",
"NGI Uppsala UGC" : "Genomics",
"PLA and Single Cell Proteomics" : "Proteomics and Metabolomics",
"Plasma Profiling" : "Proteomics and Metabolomics",
"Protein Science Facility" : "Cellular and Molecular Imaging",
"Bioinformatics Support and Infrastructure" : "Bioinformatics",
"Swedish Metabolomics Centre" : "Proteomics and Metabolomics",
"Swedish NMR Centre" : "Cellular and Molecular Imaging",
"Bioinformatics Systems Biology" : "Bioinformatics"}
ignore_label = ['Advanced FISH Technologies', 'Advanced Mass Spectrometry Proteomics', 'AIDA Data Hub', 'Array and Analysis Facility', 'Biochemical Imaging Centre Umeå', 'Bioinformatics and Expression Analysis (BEA)', 'Bioinformatics Support, Infrastructure and Training', 'BioMaterial Interactions (BioMat)', 'Centre for Cellular Imaging', 'Chemical Proteomics', 'Clinical Biomarkers', 'Clinical Genomics Linköping', 'Clinical Genomics Örebro', 'Clinical Genomics Umeå', 'Clinical Proteomics Mass spectrometry', 'Exposomics', 'Fluorescence Correlation Spectroscopy', 'Fluorescence Tissue Profiling', 'Glycoproteomics', 'Gothenburg Imaging Mass Spectrometry', 'In Situ Sequencing (ISS)', 'Intravital Microscopy Facility', 'Karolinska High Throughput Center (KHTC)', 'Mass Spectrometry-based Proteomics, Uppsala', 'Mutation Analysis Facility (MAF)', 'National Genomics Infrastructure', 'National Resource for Mass Spectrometry Imaging', 'Proteogenomics', 'Targeted and Structural Proteomics', 'Tissue Profiling']
def fix_spl_char(value):
if value == None:
value = ''
return str(unicodedata.normalize('NFKD', value).encode('ascii', 'ignore'), 'utf-8')
survey_file = "Input file provided by OO"
sdata = []
ulist_wb = load_workbook(filename=survey_file, read_only=True)
ulist_all_ws = ulist_wb['All Other Funding']
fund_stat = {}
for rnum, row in enumerate(ulist_all_ws.rows):
if rnum == 0:
continue
fname = fix_spl_char(row[0].value)
if not fname:
print("WARN: On row {}, fname '{}' is not valid".format(rnum, fname))
continue
if fname not in fund_stat:
fund_stat[fname] = {'fname':fname, 'plat': platform_map[fname], 'sfund':0, 'ufund':0, 'ofund':0, 'tfund': 0, 'pub':0}
fund_type, fund_amnt = (row[2].value, row[3].value)
if fund_type == 'SciLifeLab':
fund_stat[fname]['sfund'] += fund_amnt
elif fund_type == 'University':
fund_stat[fname]['ufund'] += fund_amnt
elif fund_type == 'Other':
fund_stat[fname]['ofund'] += fund_amnt
else:
print("WARN: On row {}, check fund type '{}'".format(rnum, rfund_type))
fund_stat[fname]['tfund'] += fund_amnt
sorted_funds = sorted(fund_stat.values(), key=lambda v: v['tfund'], reverse=True)
for fund in sorted_funds:
fund['sfund'] = fund['sfund']*1000
fund['ufund'] = fund['ufund']*1000
fund['ofund'] = fund['ofund']*1000
fund['tfund'] = fund['tfund']*1000
pub_getter = Publications_api(years=["2019"])
pub_2019 = pub_getter.get_publications()
pub_read = []
for pub in pub_2019:
if pub["doi"] in pub_read:
continue
ngi_sto = False
for lab in pub['labels']:
if lab in ignore_label:
continue
elif lab.startswith('NGI Stockholm'):
if not ngi_sto:
fund_stat['NGI Stockholm']['pub'] += 1
ngi_sto = True
else:
fund_stat[label_map[lab]]['pub'] += 1
pub_read.append(pub["doi"])
# Out file fed to "distribution_plots.html"
with open("fund_content.js", "w") as ujs:
ujs.write(js_file_temp.format(sorted_funds))