-
Notifications
You must be signed in to change notification settings - Fork 3
/
maintain.py
146 lines (132 loc) · 7.45 KB
/
maintain.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
'''
A standalone program for maintenance tasks on the ZoDo server
'''
import argparse
import os
from os import listdir
from os.path import isfile, join
import sys
import logging
# The file handler opened by basicConfig does not create missing parent
# directories, so make sure "logs/" exists before the log file is opened.
os.makedirs('logs', exist_ok=True)
# filemode='w' truncates the file on start-up, so the log only ever holds
# the output of the most recent run.
logging.basicConfig(format='%(asctime)s: %(levelname)s:%(message)s',
                    filemode='w', filename='logs/maintain.log',
                    level=logging.INFO)
from zodo.ner.train import train
from zodo.ner.gen_iob2_files import process_annotated_ner_data
from zodo.ner.gen_embeddings import create_embeddings
from zodo.db.zoophy import insert_into_db, clear_table
from zodo.utils import load_static_objects
from zodo.ner.models import MODEL_NAMES
def _parse_bool(value):
    '''Convert a command line token such as "True" or "no" into a bool.

    ``type=bool`` must not be used with argparse: ``bool()`` of any
    non-empty string, including "False", evaluates to True.

    Args:
        value: raw string received from the command line.

    Returns:
        True or False, as spelled out by ``value`` (case-insensitive).

    Raises:
        argparse.ArgumentTypeError: if ``value`` is not a recognised
            boolean spelling.
    '''
    token = value.strip().lower()
    if token in ('true', 't', 'yes', 'y', '1'):
        return True
    # The empty string stays falsy, matching what bool("") returned before.
    if token in ('false', 'f', 'no', 'n', '0', ''):
        return False
    raise argparse.ArgumentTypeError(
        "expected a boolean value (true/false), got %r" % value)


def _add_resource_args(subparser):
    '''Add the --work_dir, --save and --gpu options shared by sub-commands.'''
    subparser.add_argument('--work_dir', type=str, default="resources/",
                           help="working directory containing resource files")
    subparser.add_argument('--save', type=str, default="model/",
                           help="path to the root directory of the saved models")
    subparser.add_argument('--gpu', type=int, default=None,
                           help='GPU number to use (if any) as per CUDA')


def _build_parser():
    '''Create the argument parser with the clear/insert/train_ner sub-commands.

    Returns:
        argparse.ArgumentParser whose parsed namespace stores the chosen
        sub-command name in ``op``.
    '''
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    # The selected operation is mandatory and is stored in ``args.op``.
    subparsers = parser.add_subparsers(
        dest='op', required=True,
        help="Choose the maintenance operation to be run. "
             "See operation specific help for to know more options "
             "e.g. python maintain.py insert --help")

    # Sub-command CLEAR (takes no options of its own)
    subparsers.add_parser(
        'clear',
        help="Clears the 'Possible_Location' table in the ZooPhy database for fresh insertion of records")

    # Sub-command INSERT
    sp_idb = subparsers.add_parser(
        'insert',
        help='Determines LOIH for a given list of accessions (or for unprocessed accessions in the database) and inserts the records into the Possible_Location table in ZooPhy database.',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    sp_idb.add_argument('--filepath', type=str, default=None,
                        help="Path to filename containing accession ids to be processed"
                             " and inserted into the database")
    sp_idb.add_argument('--suff', type=str, default="ADM1",
                        help='Sufficiency level based on Geonames [ADM1, ADM2, ADM3]')
    sp_idb.add_argument('--maxlocs', type=int, default=10,
                        help='Maximum locations to be extracted per accession')
    sp_idb.add_argument('--batch_size', type=int, default=1000,
                        help='Batch size for processing accessions')
    # NER model to use
    sp_idb.add_argument('--model', type=str, default='BIGRU',
                        choices=MODEL_NAMES, help="Model to be used")
    # resource directory paths
    _add_resource_args(sp_idb)
    # Word Embeddings
    sp_idb.add_argument('--emb_loc', type=str,
                        default='resources/wikipedia-pubmed-and-PMC-w2v.bin',
                        # default='resources/word-embeddings.pkl',
                        help='word2vec embedding location')
    sp_idb.add_argument('--embvocab', type=int, default=-1,
                        help='load top n words in word emb. -1 for all.')

    # Sub-command TRAIN_NER
    sp_trn = subparsers.add_parser(
        'train_ner',
        help='Trains the named entity recognizer (NER) used for detecting locations in scientific articles.',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    sp_trn.add_argument('--skip_generate', action='store_true',
                        help="skips generation of training files if they've already been generated")
    sp_trn.add_argument('--train_corpus', type=str, default='data/train/',
                        help='path to dir where training corpus files are stored')
    sp_trn.add_argument('--dev_perc', type=float, default=0.05,
                        help='proportion of training data for validation/development')
    sp_trn.add_argument('--seed', type=int, default=666,
                        help='seed to be used for splitting training and development set randomly')
    # NER model to use ("ALL" is accepted in addition to the single models)
    sp_trn.add_argument('--model', type=str, default='BIGRU',
                        choices=MODEL_NAMES+["ALL"], help="Model to be used")
    # resource directory paths
    _add_resource_args(sp_trn)
    # Word Embeddings
    sp_trn.add_argument('--emb_loc', type=str,
                        # default='resources/wikipedia-pubmed-and-PMC-w2v.bin',
                        default='resources/corpus-embeddings.pkl',
                        help='word2vec embedding location')
    sp_trn.add_argument('--embvocab', type=int, default=-1,
                        help='load top n words for w2v to save memory. -1 to load all.')
    # Hyperparameters
    sp_trn.add_argument('--num_layers', type=int, default=1,
                        help='number of hidden layers')
    sp_trn.add_argument('--hid_dims', type=str, default="150",
                        help='dimensions of hidden layers (comma separated per layer)')
    sp_trn.add_argument('--lrn_rate', type=float, default=0.001, help='learning rate')
    sp_trn.add_argument('--dropout', type=float, default=0.5, help='dropout probability')
    # Settings
    sp_trn.add_argument('--train_epochs', type=int, default=100, help='number of train epochs')
    sp_trn.add_argument('--eval_interval', type=int, default=1, help='evaluate once in _ epochs')
    sp_trn.add_argument('--batch_size', type=int, default=10, help='batch size of training')
    sp_trn.add_argument('--max_len', type=int, default=150,
                        help='Max sentence length when RNN/CRF is used')
    # _parse_bool instead of bool: with type=bool, "--use_crf False" was
    # silently parsed as True.
    sp_trn.add_argument('--use_crf', type=_parse_bool, default=False,
                        help='use CRF on the outputs from the Neural Nets')
    return parser


def main():
    '''Main method : parse input arguments and run appropriate operation

    Dispatches on the chosen sub-command:
      * clear     - calls clear_table().
      * train_ner - unless --skip_generate is given, calls
                    process_annotated_ner_data(args) and
                    create_embeddings(args) first, then train(args).
      * insert    - calls load_static_objects(args), then
                    insert_into_db(args).
    For train_ner and insert, a given --gpu value is exported through the
    CUDA_DEVICE_ORDER / CUDA_VISIBLE_DEVICES environment variables before
    the operation starts.
    '''
    args = _build_parser().parse_args()
    logging.info("Input Arguments : %s", args)

    if args.op == "clear":
        # clear the possible location table in zoophy database
        clear_table()
        return

    # Choose GPU to run
    if args.gpu is not None:
        logging.info("Using GPU resource %s", args.gpu)
        os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
        os.environ["CUDA_VISIBLE_DEVICES"] = str(args.gpu)

    # train or insert
    if args.op == "train_ner":
        if not args.skip_generate:
            logging.info("Generating NER's IOB2 training files")
            process_annotated_ner_data(args)
            logging.info("Generating NER's embedding pickle files. This may take a while.")
            create_embeddings(args)
        # train the NER now
        logging.info("Training NER")
        train(args)
    elif args.op == "insert":
        # Load the graph and embedding objects into memory
        load_static_objects(args)
        logging.info("Inserting rows into database")
        # now process and insert records into zoophy database
        insert_into_db(args)
# Run the command line interface only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == "__main__":
    main()