# coding=utf-8
#
# Copyright 2020, Gabriel Spadon, all rights reserved.
# This code is under GNU General Public License v3.0.
# www.spadon.com.br & [email protected]
#
# Joint research between:
# [USP] University of Sao Paulo - Sao Carlos, SP - Brazil;
# [GaTech] Georgia Institute of Technology - Atlanta, GA - USA;
# [UIUC] University of Illinois Urbana-Champaign - Champaign, IL - USA;
# [UGA] Université Grenoble Alpes - Grenoble-Alpes, France; and,
# [DAL] Dalhousie University - Halifax, NS - Canada.
#
# Contributions:
# * Gabriel Spadon: idea, design, and development;
# * Shenda Hong & Bruno Brandoli: discussion and validation; and,
# * Stan Matwin, Jose F. Rodrigues-Jr & Jimeng Sun: discussion, validation, and idea refinement.

import argparse
import copy
import gc
import os
import platform
import pprint
import random
import sys
import time
import uuid
import warnings
from argparse import RawTextHelpFormatter

import numpy as np
import torch
from torch.backends import cudnn
from torch.optim import *

from losses import *
from networks import ReGENN
from neural_trainer import NeuralTrainer


def __main__():
description = """
A toolset for multi-multivariate time-series forecasting.
._
'.
--._ \ .-.
'. .--; / |
,;/ ^ |`\ /.-"".
;' \ _/ |' .'
/ `.;I> /_.--.` )
| .'` .'( _.Y/`;
\_.'---'` `\ `-/`-.
/_-.`-.._ _/`\ ;-./
| -./ ;.__.'`\\
`--' (._ _.'`|
/ ./
; `--';'
`;-,-'
,
Copyright 2020, Gabriel Spadon, all rights reserved.
This code is under GNU General Public License v3.0.
www.spadon.com.br & [email protected]
Joint research between:
[USP] University of Sao Paulo - Sao Carlos, SP - Brazil;
[GaTech] Georgia Institute of Technology - Atlanta, GA - USA;
[UIUC] University of Illinois Urbana-Champaign - Champaign, IL - USA;
[UGA] Université Grenoble Alpes - Grenoble-Alpes, France; and,
[DAL] Dalhousie University - Halifax, NS - Canada.
Contributions:
* Gabriel Spadon: idea, design, and development;
* Shenda Hong & Bruno Brandoli: discussion and validation; and,
* Stan Matwin, Jose F. Rodrigues-Jr & Jimeng Sun: discussion, validation, and idea refinement.
"""
    # Building the argument parser with the description above as the help header
parser = argparse.ArgumentParser(prog="GSNeural", description=description, formatter_class=RawTextHelpFormatter)
# Minimization criteria and evaluation metrics
metrics = [
"MAELoss", "MALELoss", "MSELoss", "MSLELoss", "RMSELoss", "RMSLELoss"
]
# In-layer and output activation functions
activation = [
"CELU", "ELU", "GELU", "Hardshrink", "Hardtanh", "Identity", "LeakyReLU", "LogSigmoid", "PReLU", "ReLU",
"ReLU6", "RReLU", "SELU", "Sigmoid", "Softplus", "Softshrink", "Softsign", "Swish", "Tanh", "Tanhshrink"
]
# [+] Positional Arguments
parser.add_argument("--input", type=str, nargs="?", help="A PyTorch tri-dimensional tensor of shape (samples, time, variables).")
parser.add_argument("--stride", type=int, help="The number of time-steps to predict.")
parser.add_argument("--validation-samples", type=float, help="Percentage of samples reserved for assessing sample-generalization.")
parser.add_argument("--validation-stride", type=int, help="The number of time-steps reserved for assessing time-generalization.")
parser.add_argument("--watch-axis", type=int, nargs="?", choices=[0, 1, 2, 3], help="The data to which the schedulers will be watching - 0: training data; 1: reserved samples, 2: reserved time-fold of the training data, and 3: reserved time-fold of the reserved samples.")
parser.add_argument("--window", type=int, help="The number of previous time-steps to look during training and testing.")
# [-] Optional Arguments
parser.add_argument("-aM", "--ablation-mode", action="store_true", help="Deactivates all non-vanilla layers (default: %(default)s).")
parser.add_argument("-AR", "--autoregression", action="store_true", help="Activates an autoregressive component in parallel with the network (default: %(default)s).")
parser.add_argument("-batch", "--batch-size", type=int, default=32, help="Number of samples to use in each training batch (default: %(default)s).")
parser.add_argument("-b", "--bias", action="store_true", help="Learns additive bias vectors whenever it is supported (default: %(default)s).")
parser.add_argument("-bG", "--bidirectional-gate", action="store_true", help="Sets the time gate as bidirectional (default: %(default)s).")
parser.add_argument("-bS", "--bidirectional-sequencer", action="store_true", help="Sets the variable sequencer as bidirectional (default: %(default)s).")
parser.add_argument("-norm", "--clip-norm", type=float, default=10, help="The gradient norm used as the clipping threshold (default: %(default)s).")
parser.add_argument("--cost-list", type=str, nargs="+", default=["MAELoss", "RMSELoss", "MSLELoss"], choices=metrics, help="A list of three cost-functions used to evaluate the results (default: %(default)s).")
parser.add_argument("--criterion", type=str, default="MAELoss", nargs="?", choices=metrics, help="The optimizer's minimization criterion (default: %(default)s).")
parser.add_argument("-dM", "--debugging-mode", action="store_true", help="Turns on the debugging mode, i.e., redirects all the output to the STDOUT (default: %(default)s).")
parser.add_argument("-drop", "--dropout", type=float, default=.1, help="Dropout probability for zeroing the output of a neuron (default: %(default)s).")
parser.add_argument("-stop", "--early-stop", type=int, default=250, help="The number of non-improving epochs to wait before stop training (default: %(default)s).")
parser.add_argument("--epochs", type=int, default=2500, help="The number of iterations over the whole dataset (default: %(default)s).")
parser.add_argument("-eF", "--evolution-function", type=str, default=None, nargs="?", choices=activation, help="Activation function to be used in the evolution layer (default: %(default)s).")
parser.add_argument("-prefix", "--file-prefix", type=str, default=None, nargs="?", help="The prefix appended to the output if saving any file (default: %(default)s).")
parser.add_argument("-g", "--gate", type=str, default="LSTM", nargs="?", choices=["RNN", "GRU", "LSTM"], help="Defines the architecture used by the time gate (default: %(default)s).")
parser.add_argument("--gpu", type=int, default=-1, help="Which GPU to use - automatically picks one when entering a negative number (default: %(default)s).")
parser.add_argument("--iterator", type=str, default="time", nargs="?", choices=["time", "batch"], help="The data iterator to be used during training (default: %(default)s).")
parser.add_argument("-lr", "--learning-rate", type=float, default=.001, help="Learning rate of the optimizer (default: %(default)s).")
parser.add_argument("--load-weights", type=str, nargs="?", default=None, help="Path to a pre-trained ReGENN model, from which we will use pre-learned weights to initialize the current model weights (default: %(default)s).")
parser.add_argument("--network-type", type=str, nargs="?", default="ReGENN", choices=["ReGENN"], help="The network architecture to be used (default: %(default)s).")
parser.add_argument("--network-uuid", type=str, nargs="?", default=None, help="Unique identifier to label the neural network (default: %(default)s).")
parser.add_argument("-nE", "--no-encoder", action="store_true", help="Deactivates encoding layers (default: %(default)s).")
parser.add_argument("-nS", "--no-sequencer", action="store_true", help="Deactivates the variable sequencer (default: %(default)s).")
parser.add_argument("--normalization-axis", type=int, default=2, nargs="?", choices=[0, 1, 2], help="When normalizing by the maximum value, it indicates the axis to which the dataset will be normalized - 0: sample-axis; 1: time-axis; and, 2: variable-axis (default: %(default)s).")
parser.add_argument("--normalization-type", type=str, default="maximum", nargs="?", choices=["linear", "maximum", "logarithm"], help="Which normalization to apply onto the dataset (default: %(default)s).")
parser.add_argument("--optimizer", type=str, nargs="?", default="Adam", help="A valid PyTorch Optimizer (default: %(default)s).")
parser.add_argument("-oF", "--output-function", type=str, nargs="?", default="ReLU", choices=activation, help="Activation function to be used on the output (default: %(default)s).")
parser.add_argument("--random-seed", type=int, default=0, help="Which random seed to use - sets no seed when entering a negative number (default: %(default)s).")
parser.add_argument("--requested-memory", type=int, default=500, help="The minimum amount of memory (in MB) required from the GPU (default: %(default)s).")
parser.add_argument("--save-graph", action="store_true", help="Saves a snapshot of all graphs - source and target graph evolution - at each epoch (default: %(default)s).")
parser.add_argument("--save-model", action="store_true", help="Saves a snapshot of the network after training (default: %(default)s).")
parser.add_argument("--save-output", type=str, nargs="?", default=None, help="Full path used for saving everything logged into the STDOUT (default: %(default)s).")
parser.add_argument("-factor", "--scheduler-factor", type=float, default=.95, help="Multiplicative factor by which the learning rate will be reduced (default: %(default)s).")
parser.add_argument("--scheduler-min-lr", type=float, default=.0, help="A lower bound for the learning rate (default: %(default)s).")
parser.add_argument("-patience", "--scheduler-patience", type=int, default=25, help="The number of non-improving epochs to wait before reducing the learning rate (default: %(default)s).")
parser.add_argument("-threshold", "--scheduler-threshold", type=float, default=.1, help="The threshold used to define a significant change in the minimization criterion (default: %(default)s).")
parser.add_argument("-s", "--sequencer", type=str, default="LSTM", nargs="?", choices=["RNN", "GRU", "LSTM"], help="Defines the architecture used by the variable sequencer (default: %(default)s).")
parser.add_argument("--test-every", type=int, default=5, help="The frequency in which the network will be tested on the reserved data fold (default: %(default)s).")
parser.add_argument("--variable-graph", type=str, nargs="?", default=None, help="Path to a pre-learned variable graph that will be used instead of creating a graph from the training data (default: %(default)s).")
parser.add_argument("--working-directory", type=str, nargs="?", default="./", help="Full path to which all generated data will be saved to (default: %(default)s).")
# Parsing arguments
args = parser.parse_args()
args_copy = copy.deepcopy(args)
# Disables the sequencer on the target network
args.sequencer = None if args.no_sequencer else args.sequencer
# Disables the encoder layer on the target network
args.encoder = False if args.no_encoder else True
# Creates a new network unique identifier in case none is provided
args.network_uuid = str(uuid.uuid4()) if args.network_uuid is None else args.network_uuid
# Creating required directories to save the output
os.makedirs(args.working_directory, exist_ok=True)
os.makedirs("%s/graph" % args.working_directory, exist_ok=True)
os.makedirs("%s/output" % args.working_directory, exist_ok=True)
os.makedirs("%s/sandbox" % args.working_directory, exist_ok=True)
os.makedirs("%s/snapshots" % args.working_directory, exist_ok=True)
# If no filename is provided, we will build one that is unique
output_file = (str(args.file_prefix) + " | ") if args.file_prefix is not None else ""
output_file += "Ablation | " if args.ablation_mode else ""
output_file += args.network_type + " |"
output_file += " [AE] " if args.encoder else " "
output_file += "b" if args.bidirectional_gate else "u"
output_file += args.gate
if not args.no_sequencer:
output_file += "-b" if args.bidirectional_sequencer else "-u"
output_file += args.sequencer
output_file += "-AR" if args.autoregression else ""
output_file += " +b" if args.bias else " -b"
output_file += " | " + args.network_uuid
# String package for printing dictionaries and regular strings
pp = pprint.PrettyPrinter(compact=True)
if not args.debugging_mode: # Debug output will always be on the STDOUT
filepath = ("%s/output/%s.txt" % (args.working_directory, output_file)) if args.save_output is None else args.save_output
sys.stdout = open(filepath, "w+", buffering=1) # All output will be redirected to this file
    # Enforcing uniqueness for solo runs but not for cross-validation, as training can be stopped and resumed
assert not os.path.exists("%s/snapshots/%s.pth" % (args.working_directory, args.network_uuid)), "Network UUID is not unique!"
# We can clip predictions to evaluate a smaller period, but the model is bounded to predict stride-sized periods
    assert args.stride >= args.validation_stride, "The validation stride cannot be larger than the prediction stride."
    selected_gpu = False
    while not selected_gpu:
        temporary_file = "%s/sandbox/.%s.gpu" % (args.working_directory, args.network_uuid)  # List of available GPUs
        os.system("nvidia-smi -q -d Memory | grep -A4 GPU | grep Free | sed 's/[^0-9]*//g' > %s" % temporary_file)
        with open(temporary_file, "r") as gpu_file:
            gpu_memory = np.array([int(line) for line in gpu_file.readlines()])
        os.system("rm %s" % temporary_file)  # Removes the temporary file
        if -1 < args.gpu < len(gpu_memory):
            # Checking whether the user-selected GPU has enough memory available
            if gpu_memory[args.gpu] >= args.requested_memory:
                gpu_id = args.gpu  # User-selected GPU
                selected_gpu = True  # Leave the GPU selector
        if not selected_gpu:
            # Automatically picks a GPU with enough free memory, preferring the highest-index one on ties
            if np.max(gpu_memory) >= args.requested_memory:
                gpu_id = len(gpu_memory) - 1 - np.argmax(gpu_memory[::-1])
                selected_gpu = True  # Leave the GPU selector
            else:
                # Sleeping before retrying; the PID is unique, which staggers concurrent processes
                time.sleep(int(os.getpid()))
# Setting reproducible seeds
if args.random_seed >= 0:
# Might impact performance
cudnn.benchmark = False
cudnn.deterministic = True
random.seed(args.random_seed)
np.random.seed(args.random_seed)
torch.manual_seed(args.random_seed)
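        # torch.manual_seed also seeds the CUDA generators, so no separate cuda.manual_seed call is needed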
# Identifiers for the current Python process
print("[Running on %s - PID #%d and GPU #%d]" % (platform.node().title(), os.getpid(), gpu_id), end="\n\n")
print("Your training UUID is \"%s\"." % args.network_uuid)
print("All results will be saved under that UUID.", end="\n\n")
# Sending everything to the selected GPU
with torch.cuda.device("cuda:%d" % gpu_id):
# The shape must be (samples, time, variables)
dataset = torch.load("%s/%s" % (args.working_directory, args.input)).pin_memory()
assert (3 >= dataset.ndim >= 2), "Incompatible number of dimensions."
dataset = torch.unsqueeze(dataset, dim=0) if dataset.ndim == 2 else dataset
        # Printing the dataset information
print("Dataset(samples=%d, time-steps=%d, variables=%d)" % dataset.shape, end="\n\n")
# Limited to three cost-functions for enhancing output visualization
assert len(args.cost_list) == 3, "Make sure to select three cost-functions from the list."
args.cost_list = [eval(cost_function)().cuda() for cost_function in args.cost_list]
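        # eval maps each metric name onto the corresponding loss class brought in by "from losses import *"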
# Validating training-, development-, and testing-related variables
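        # e.g., window=28, validation_stride=7, and stride=7 require at least 28 + 7 + 2 * 7 = 49 time-steps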
assert args.validation_stride >= 0 and 1. >= args.validation_samples >= 0. and args.window > 0 and args.stride > 0 and \
dataset.shape[1] >= (args.window + args.validation_stride + (2 * args.stride)), \
"Unfeasible parametrization for the current dataset."
# Checking the fit-watch parameter used by the schedulers
        assert not ((args.watch_axis == 1 or args.watch_axis == 3) and args.validation_samples == 0) and \
               not ((args.watch_axis == 2 or args.watch_axis == 3) and args.validation_stride == 0), \
            "Watch-axis 1 or 3 requires reserved samples, and watch-axis 2 or 3 requires a reserved time-fold."
# Running single-run training mode
train_once(dataset, args)
# Printing user-input parameters for further reproducibility and also sanity-check
print("\n[HYPERPARAMETERS]", end="\n\n")
print(pp.pformat(vars(args_copy)))


def __train(dataset, samples, fold_id, args):
"""
Vanilla training.
:param dataset: array-like of shape (samples, time, variables)
The complete dataset for training and testing the network.
:param samples: array-like of shape ([set-samples, dev-samples], 1)
List of training and test sample indices.
:param fold_id: integer
The number of the current dataset fold.
:param args: parsed dictionary
A set of parsed command-line arguments.
:return: array-like of shape (loss, [trainer, network, args])
Test loss and all variables required to reproduce it.
"""
    # Instantiating the minimization criterion
args.criterion = eval(args.criterion)().cuda()
# Creating a new instance of the neural network
network = eval(args.network_type)(variables=dataset.shape[2], **vars(args)).cuda()
    # Overwriting the network initialization with a pre-trained set of weights
if args.load_weights and not args.ablation_mode:
state_dicts = torch.load(args.load_weights, map_location=lambda storage, loc: storage)
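        # With p=.2, dropout zeroes about 20% of each pre-trained tensor and rescales the surviving
        # entries by 1 / (1 - p), lightly perturbing the loaded weights before they initialize the model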
_drop = torch.nn.Dropout(p=.2)
for k in state_dicts.keys():
if isinstance(state_dicts[k], dict):
for v in state_dicts[k].keys():
if isinstance(state_dicts[k][v], torch.Tensor):
state_dicts[k][v] = _drop(state_dicts[k][v])
network.load_state_dict(state_dicts["model_dict"])
    # Loading a pre-learned graph into the network's source-evolution layer
if args.variable_graph and not args.ablation_mode:
graph = torch.load(args.variable_graph, map_location=lambda storage, loc: storage)
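        # The snapshot is expected to hold the graph as a NumPy array, hence the conversion below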
graph = torch.from_numpy(graph).cuda()
network.SourceEvolution.preset_adjacency(graph)
    # Printing the network configuration
if fold_id == 0:
# No need to verbose the network architecture at every fold
print("[NETWORK-SUMMARY]\n\n" + str(network), end="\n\n")
    # Instantiating the user-selected optimizer
args.optimizer = eval(args.optimizer)(network.parameters(), lr=args.learning_rate)
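    # eval resolves the optimizer name (e.g., "Adam") against the names imported from torch.optim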
# Sending data to the NeuralTrainer
trainer = NeuralTrainer(network=network, dataset=dataset, samples=samples, **vars(args))
# Training the network and testing as soon as a better model is found
loss = trainer.train()
# Retrieves the network with the best parameters
network = trainer.network
# Returns the test loss and the best network
return loss, (trainer, network, args)


def train_once(dataset, args):
"""
Trains the network following the user's training-test split approach.
:param dataset: array-like of shape (samples, time, variables)
The complete dataset for training and testing the network.
:param args: parsed dictionary
A set of parsed command-line arguments.
"""
# Shuffling and slicing the dataset samples
items = int(dataset.shape[0] * args.validation_samples)
samples = np.random.permutation(dataset.shape[0])
x_set, x_dev = samples[items:], samples[:items]
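    # e.g., with 100 samples and validation_samples=.2, the last 80 shuffled indices train the
    # network (x_set) while the first 20 are held out for sample-generalization (x_dev)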
# Training the network using x-set and assessing generalization with x-dev
test_loss, (trainer, network, args) = __train(dataset, (x_set, x_dev), fold_id=0, args=args)
# Saving all network-related variables when requested
if args.save_model:
# Preparing all the state dictionaries - enables reproducibility and start-stop training
args_dict, model_dict, trainer_dict, scheduler_dict, optimizer_dict = __state_dict(trainer, args)
# Saving all state dictionaries
torch.save({
"test_loss": test_loss,
"args_dict": args_dict,
"model_dict": model_dict,
"trainer_dict": trainer_dict,
"scheduler_dict": scheduler_dict,
"optimizer_dict": optimizer_dict,
}, "%s/snapshots/%s.pth" % (args.working_directory, args.network_uuid))
    # Creating a metric summary to speed up results assessment
print("\n[SUMMARY]", end="\n\n")
# String formatter for summary building
tokenizer = lambda instance, idx: str(tuple(zip(instance.metric_storage[idx], instance.metric_deviation[idx])))
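    # The tokenizer pairs each stored metric with its deviation, yielding one (value, deviation)
    # tuple per cost-function for the data fold at the given index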
    # By default, results are reported for all six sets of data; the first special case below handles
    # regular regression (no reserved time-fold) and the second regular time-series (no reserved samples)
    metric_list = [0, 1, 2, 3, 4, 5]
    metric_list = [0, 1, 4, 5] if args.validation_samples > 0 and args.validation_stride == 0 else metric_list
    metric_list = [0, 2, 4] if args.validation_samples == 0 and args.validation_stride > 0 else metric_list
# The summary layout depends on the reserved data for validation
tk = str(tuple(tokenizer(trainer, idx) for idx in metric_list))
    # After a few tweaks, the summary string is ready to be pasted into a spreadsheet
print("%s\t%s" % (tk.replace("(", "").replace(")", "").replace(", ", "\t").replace("'", ""), args.network_uuid))


def __state_dict(trainer, args):
"""
Builds a state dictionary from a neural trainer instance. In the case of NeuralTrainer changes,
this method might fail. The method is outside the proper class because it will destroy all
major attributes within it. This means that this method should be the last thing you call.
neural_trainer: object
A NeuralTrainer instance.
:param args: parsed dictionary
A set of parsed command-line arguments.
return: array-like of shape (model-sd, trainer-sd, scheduler-sd, optimizer-sd)
An array with the state dictionaries extracted from the model, trainer, scheduler, and optimizer.
"""
# Tensors to NumPy & GPU to CPU
trainer.criterion = trainer.criterion.name
trainer.scale = trainer.scale.cpu().numpy()
trainer.network_uuid = str(trainer.network_uuid)
trainer.data_iterator = trainer.data_iterator.__name__
trainer.cost_list = [cost_function.name for cost_function in trainer.cost_list]
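    # These conversions swap live objects for plain names and arrays so the trainer dict serializes cleanly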
    # These are handled as separate entries, outside the trainer dictionary
model_dict = trainer.network.cpu().state_dict()
optimizer_dict = trainer.optimizer.state_dict()
scheduler_dict = trainer.scheduler.state_dict()
    # Rolling back parsed arguments to their raw forms
args.cost_list = trainer.cost_list
args.criterion = trainer.criterion
args.network_uuid = trainer.network_uuid
args.optimizer = args.optimizer.__class__.__name__
# Deleting all the objects that should not be saved
del trainer.dataset # The dataset is private and won't be saved
del trainer.network, trainer.scheduler, trainer.optimizer # Will save the state dictionary
return vars(args), model_dict, trainer.__dict__, scheduler_dict, optimizer_dict


if __name__ == "__main__":
    # Calls the main function when the file is run as a script
__main__()