forked from gianlucadetommaso/volatile
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathvolatile.py
executable file
·398 lines (329 loc) · 15.1 KB
/
volatile.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
#!/usr/bin/env python
import numpy as np
from argparse import ArgumentParser, ArgumentDefaultsHelpFormatter, SUPPRESS
import csv
import os.path
import pickle
from download import download_tickers_info
from utils import convert_currency, extract_hierarchical_info
import models
import plotting
import multitasking
import logging
logging.basicConfig(level=logging.INFO)
def softplus(x: np.array) -> np.array:
    """
    Map real numbers to positive numbers through the softplus function ``log(1 + exp(x))``.

    The value is evaluated as ``logaddexp(0, x)``, which is the same quantity but neither overflows for large
    positive inputs nor collapses to exactly zero for moderately negative ones.

    Parameters
    ----------
    x: np.array
        Real value(s).

    Returns
    -------
    np.array
        Element-wise softplus of `x`.
    """
    return np.logaddexp(0, x)
def estimate_logprice_statistics(mu: np.array, sigma: np.array, tt: np.array) -> tuple:
    """
    Estimate mean and standard deviation of log-prices.

    Parameters
    ----------
    mu: np.array
        Parameters of regression polynomial.
    sigma: np.array
        Parameters of standard deviation; they are passed through `softplus`.
    tt: np.array
        Sequence of times to evaluate statistics at.

    Returns
    -------
    tuple
        ``(mean, std)``: the product ``np.dot(mu, tt)`` and ``softplus(sigma)``.
    """
    logp_mean = np.dot(mu, tt)
    logp_std = softplus(sigma)
    return logp_mean, logp_std
def estimate_price_statistics(mu: np.array, sigma: np.array):
    """
    Estimate mean and standard deviation of prices from log-price statistics.

    The formulas are the mean and standard deviation of a log-Normal variable whose logarithm has mean `mu` and
    standard deviation `sigma`.

    Parameters
    ----------
    mu: np.array
        Mean estimates of log-prices.
    sigma: np.array
        Standard deviation estimates of log-prices.

    Returns
    -------
    tuple
        ``(mean, std)`` price estimators.
    """
    variance = sigma**2
    price_mean = np.exp(mu + variance / 2)
    price_std = np.sqrt(np.exp(2 * mu + variance) * (np.exp(variance) - 1))
    return price_mean, price_std
def rate(scores: np.array, lower_bounds: dict = None) -> list:
    """
    Assign a rate to every score. Possible rates are `HIGHLY BELOW TREND`, `BELOW TREND`, `ALONG TREND`,
    `ABOVE TREND` and `HIGHLY ABOVE TREND`.

    Parameters
    ----------
    scores: np.array
        An array of scores for each stock.
    lower_bounds: dict
        Maps each of the first four rates to its lower bound. A score receives the first rate, in the order listed
        above, whose lower bound it strictly exceeds; a score that exceeds none of them is rated
        `HIGHLY ABOVE TREND`. Defaults to bounds 3, 2, -2 and -3.

    Returns
    -------
    rates: list
        List of rates for each stock.
    """
    if lower_bounds is None:
        lower_bounds = {"HIGHLY BELOW TREND": 3, "BELOW TREND": 2, "ALONG TREND": -2, "ABOVE TREND": -3}

    # rates are tested from the highest threshold downwards
    ordered_rates = ("HIGHLY BELOW TREND", "BELOW TREND", "ALONG TREND", "ABOVE TREND")

    def _rate_one(score):
        for label in ordered_rates:
            if score > lower_bounds[label]:
                return label
        return "HIGHLY ABOVE TREND"

    return [_rate_one(score) for score in scores]
def estimate_matches(tickers: list, mu: np.array, tt: np.array, max_vectorized: int = 2000) -> dict:
    """
    It estimates matches of correlated stocks.

    For every stock the match is the *other* stock whose estimated trend derivative is closest in squared Euclidean
    distance. The stock itself is excluded explicitly, so an exact duplicate can never cause a stock to be matched
    with itself.

    Parameters
    ----------
    tickers: list
        List of tickers
    mu: np.array
        Parameters of regression polynomial, one row per stock.
    tt: np.array
        Array of times corresponding to days of trading. Assumes row ``k`` holds ``time ** k``, so that
        ``k * tt[k] / tt[1]`` is the derivative of the k-th monomial.
    max_vectorized: int
        Largest number of stocks for which all pairwise distances are computed in one vectorised step. Above this
        size, or if that step runs out of memory, distances are computed one stock at a time on a thread pool.

    Returns
    -------
    matches: dict
        For each symbol, this dictionary contains a corresponding `match` symbol, the `index` of the match symbol in the
        list of symbols and the computed `distance` between the two. It is empty when fewer than two tickers are given,
        because no match exists.
    """
    dtt = np.arange(1, tt.shape[0])[:, None] * tt[1:] / tt[1, None]
    dlogp_est = np.dot(mu[:, 1:], dtt)
    num_stocks = len(tickers)
    if num_stocks < 2:
        return {}

    if num_stocks <= max_vectorized:
        try:
            match_dist = np.sum((dlogp_est[:, None] - dlogp_est[None]) ** 2, 2)
        except MemoryError:
            # the intermediate (stocks x stocks x times) array did not fit: use the row-wise path below
            match_dist = None
        if match_dist is not None:
            # a stock must not be matched with itself
            np.fill_diagonal(match_dist, np.inf)
            match_minidx = np.argmin(match_dist, 1)
            return {
                tickers[i]: {
                    "match": tickers[match_minidx[i]],
                    "index": match_minidx[i],
                    "distance": match_dist[i, match_minidx[i]],
                }
                for i in range(num_stocks)
            }

    # local imports keep this block self-contained
    import os
    from concurrent.futures import ThreadPoolExecutor

    def _estimate_one(i):
        row_dist = np.sum((dlogp_est[i] - dlogp_est) ** 2, 1)
        row_dist[i] = np.inf
        closest = np.argmin(row_dist)
        return {"match": tickers[closest], "index": closest, "distance": row_dist[closest]}

    num_threads = min(num_stocks, (os.cpu_count() or 1) * 2)
    # leaving the `with` block joins every worker, so the result is complete before it is returned
    with ThreadPoolExecutor(max_workers=num_threads) as pool:
        results = list(pool.map(_estimate_one, range(num_stocks)))
    return dict(zip(tickers, results))
def estimate_clusters(tickers: list, mu: np.array, tt: np.array) -> list:
    """
    It groups stocks into clusters by chaining nearest neighbours.

    Every stock is paired with the other stock whose estimated trend derivative is closest in squared Euclidean
    distance; pairs that share a stock are merged, so clusters are the connected groups of nearest-neighbour pairs.
    Each stock always belongs to its own pair, hence every stock ends up in exactly one cluster even when several
    stocks are at identical distance.

    Parameters
    ----------
    tickers: list
        List of tickers; only its length is used.
    mu: np.array
        Parameters of regression polynomial, one row per stock.
    tt: np.array
        Array of times corresponding to days of trading. Assumes row ``k`` holds ``time ** k``, so that
        ``k * tt[k] / tt[1]`` is the derivative of the k-th monomial.

    Returns
    -------
    list
        For each stock, in the order of `tickers`, the index of the cluster it belongs to. A merged cluster takes
        the last position, while untouched clusters keep their relative order.
    """
    dtt = np.arange(1, tt.shape[0])[:, None] * tt[1:] / tt[1, None]
    dlogp_est = np.dot(mu[:, 1:], dtt)
    num_stocks = len(tickers)

    clusters = []
    for i in range(num_stocks):
        dist = np.sum((dlogp_est[i] - dlogp_est) ** 2, 1)
        # exclude the stock itself from the nearest-neighbour search
        dist[i] = np.inf
        group = {i}
        if num_stocks > 1:
            group.add(int(np.argmin(dist)))

        # existing clusters are pairwise disjoint, so each one either joins the new group or is kept untouched
        untouched = []
        for cluster in clusters:
            if cluster & group:
                group |= cluster
            else:
                untouched.append(cluster)
        untouched.append(group)
        clusters = untouched

    labels = np.empty(num_stocks, dtype=np.intp)
    for k, cluster in enumerate(clusters):
        labels[list(cluster)] = k
    return list(labels)
if __name__ == "__main__":
    # Script entry point: parse the command line, download (or load cached) data, train the models, derive
    # log-price and price statistics, then print, plot and optionally save the prediction table.
    cli = ArgumentParser("Volatile: your day-to-day trading companion.", formatter_class=ArgumentDefaultsHelpFormatter)
    cli.add_argument("-s", "--symbols", type=str, nargs="+", help=SUPPRESS)
    cli.add_argument(
        "--rank",
        type=str,
        choices=["rate", "growth", "volatility"],
        default="rate",
        help="If `rate`, stocks are ranked in the prediction table and in the stock estimation plot from "
        "the highest below to the highest above trend; if `growth`, ranking is done from the largest"
        " to the smallest trend growth at current date; if `volatility`, from the largest to the "
        "smallest current volatility estimate.",
    )
    cli.add_argument("--save-table", action="store_true", help="Save prediction table in csv format.")
    # the flag disables plotting, so the help text has to say so
    cli.add_argument(
        "--no-plots", action="store_true", help="Do not plot estimates with their uncertainty over time."
    )
    cli.add_argument("--plot-losses", action="store_true", help="Plot loss function decay over training iterations.")
    cli.add_argument("--cache", action="store_true", help="Use cached data and parameters if available.")
    args = cli.parse_args()

    current_working_directory = os.getcwd()
    cached_data_filename = "data.pickle"
    if args.cache and os.path.exists(cached_data_filename):
        print("\nLoading last year of data...")
        # NOTE(review): unpickling can execute arbitrary code; only load a cache file written by this program.
        with open(cached_data_filename, "rb") as file:
            data = pickle.load(file)
        print(f"Data has been loaded from {current_working_directory}/{cached_data_filename}")
    else:
        if args.symbols is None:
            with open("symbols_list.txt", "r") as file:
                # only the first line is read; split() without arguments also drops the trailing newline and
                # empty entries caused by repeated blanks
                args.symbols = file.readline().split()
        print("\nDownloading last year of data...")
        data = download_tickers_info(args.symbols)
        with open(cached_data_filename, "wb") as file:
            pickle.dump(data, file)
        print(f"Data has been saved to {current_working_directory}/{cached_data_filename}")

    tickers = data["tickers"]
    log_price = np.log(data["price"])

    # convert currencies to most frequent one
    for i, currency in enumerate(data["currencies"]):
        if currency != data["default_currency"]:
            log_price[i] = convert_currency(log_price[i], np.array(data["exchange_rates"][currency]), type="forward")

    num_stocks, t = log_price.shape

    info = extract_hierarchical_info(data["sectors"], data["industries"])

    # matches need at least two stocks
    if num_stocks > 1:
        print("\nTraining a model that discovers correlations...")
        # order of the polynomial
        order = 52

        # times corresponding to trading dates in the data
        info["tt"] = (np.linspace(1 / t, 1, t) ** np.arange(order + 1).reshape(-1, 1)).astype("float32")
        # reweighing factors for parameters corresponding to different orders of the polynomial
        info["order_scale"] = np.ones((1, order + 1), dtype="float32")

        # train the model
        phi_m, psi_m, phi_s, psi_s, phi_i, psi_i, phi, psi = models.train_msis_mcs(log_price, info, num_steps=50000)

        print("Training completed.")

        print("\nEstimate top matches...")
        matches = estimate_matches(tickers, phi.numpy(), info["tt"])

        print("Top matches estimation completed.")

    print("\nTraining a model that estimates and predicts trends...")
    # how many days to look ahead when comparing the current price against a prediction
    horizon = 5
    # order of the polynomial
    order = 2

    # times corresponding to trading dates in the data
    info["tt"] = (np.linspace(1 / t, 1, t) ** np.arange(order + 1).reshape(-1, 1)).astype("float32")
    # reweighing factors for parameters corresponding to different orders of the polynomial
    info["order_scale"] = np.linspace(1 / (order + 1), 1, order + 1)[::-1].astype("float32")[None, :]

    # train the model
    phi_m, psi_m, phi_s, psi_s, phi_i, psi_i, phi, psi = models.train_msis_mcs(
        log_price, info, plot_losses=args.plot_losses
    )

    print("Training completed.")

    ## log-price statistics (Normal distribution)
    # calculate stock-level estimators of log-prices
    logp_est, std_logp_est = estimate_logprice_statistics(phi.numpy(), psi.numpy(), info["tt"])
    # calculate stock-level predictions of log-prices
    tt_pred = ((1 + (np.arange(1 + horizon) / t)) ** np.arange(order + 1).reshape(-1, 1)).astype("float32")
    logp_pred, std_logp_pred = estimate_logprice_statistics(phi.numpy(), psi.numpy(), tt_pred)
    # calculate industry-level estimators of log-prices
    logp_ind_est, std_logp_ind_est = estimate_logprice_statistics(phi_i.numpy(), psi_i.numpy(), info["tt"])
    # calculate sector-level estimators of log-prices
    logp_sec_est, std_logp_sec_est = estimate_logprice_statistics(phi_s.numpy(), psi_s.numpy(), info["tt"])
    # calculate market-level estimators of log-prices
    logp_mkt_est, std_logp_mkt_est = estimate_logprice_statistics(phi_m.numpy(), psi_m.numpy(), info["tt"])

    # compute score: predicted minus last observed log-price, in units of predicted standard deviation
    scores = (logp_pred[:, horizon] - log_price[:, -1]) / std_logp_pred.squeeze()
    # compute growth as percentage price variation
    growth = np.dot(phi.numpy()[:, 1:], np.arange(1, order + 1)) / t

    # convert log-price currencies back (standard deviations of log-prices stay the same)
    for i, currency in enumerate(data["currencies"]):
        if currency != data["default_currency"]:
            log_price[i] = convert_currency(log_price[i], np.array(data["exchange_rates"][currency]), type="backward")
            logp_est[i] = convert_currency(logp_est[i], np.array(data["exchange_rates"][currency]), type="backward")

    ## price statistics (log-Normal distribution)
    # calculate stock-level estimators of prices
    p_est, std_p_est = estimate_price_statistics(logp_est, std_logp_est)
    # calculate stock-level prediction of prices
    p_pred, std_p_pred = estimate_price_statistics(logp_pred, std_logp_pred)
    # calculate industry-level estimators of prices
    p_ind_est, std_p_ind_est = estimate_price_statistics(logp_ind_est, std_logp_ind_est)
    # calculate sector-level estimators of prices
    p_sec_est, std_p_sec_est = estimate_price_statistics(logp_sec_est, std_logp_sec_est)
    # calculate market-level estimators of prices
    p_mkt_est, std_p_mkt_est = estimate_price_statistics(logp_mkt_est, std_logp_mkt_est)

    # volatility: last estimated price standard deviation relative to the last price
    volatility = std_p_est[:, -1] / data["price"][:, -1]

    # rank according to the requested criterion, largest first
    if args.rank == "rate":
        rank = np.argsort(scores)[::-1]
    elif args.rank == "growth":
        rank = np.argsort(growth)[::-1]
    elif args.rank == "volatility":
        rank = np.argsort(volatility)[::-1]

    ranked_tickers = np.array(tickers)[rank]
    ranked_scores = scores[rank]
    ranked_p = data["price"][rank]
    ranked_currencies = np.array(data["currencies"])[rank]
    ranked_growth = growth[rank]
    ranked_volatility = volatility[rank]
    if num_stocks > 1:
        ranked_matches = np.array([matches[ticker]["match"] for ticker in ranked_tickers])

    # rate stocks
    ranked_rates = rate(ranked_scores)

    if not args.no_plots:
        plotting.plot_market_estimates(data, p_mkt_est, std_p_mkt_est)
        plotting.plot_sector_estimates(data, info, p_sec_est, std_p_sec_est)
        plotting.plot_industry_estimates(data, info, p_ind_est, std_p_ind_est)
        plotting.plot_stock_estimates(data, p_est, std_p_est, args.rank, rank, ranked_rates)
        if num_stocks > 1:
            plotting.plot_matches(data, matches)

    print("\nPREDICTION TABLE")
    # names starting with "NA" are shown as "Not Available"
    ranked_sectors = [
        name if name[:2] != "NA" else "Not Available" for name in np.array(list(data["sectors"].values()))[rank]
    ]
    ranked_industries = [
        name if name[:2] != "NA" else "Not Available" for name in np.array(list(data["industries"].values()))[rank]
    ]

    strf = "{:<15} {:<26} {:<42} {:<16} {:<22} {:<11} {:<15} {:<4}"
    num_dashes = 159
    separator = num_dashes * "-"
    print(separator)
    print(strf.format("SYMBOL", "SECTOR", "INDUSTRY", "PRICE", "RATE", "GROWTH", "VOLATILITY", "MATCH"))
    print(separator)
    for i in range(num_stocks):
        print(
            strf.format(
                ranked_tickers[i],
                ranked_sectors[i],
                ranked_industries[i],
                "{} {}".format(np.round(ranked_p[i, -1], 2), ranked_currencies[i]),
                ranked_rates[i],
                "{}{}{}".format("+" if ranked_growth[i] >= 0 else "", np.round(100 * ranked_growth[i], 2), "%"),
                np.round(ranked_volatility[i], 2),
                ranked_matches[i] if num_stocks > 1 else "None",
            )
        )
        print(separator)
        # a second separator marks the boundary between two different rates
        if i < num_stocks - 1 and ranked_rates[i] != ranked_rates[i + 1]:
            print(separator)

    if args.save_table:
        tab_name = "prediction_table.csv"
        # each argument is one column, header first; zip turns the columns into rows
        table = zip(
            ["SYMBOL"] + ranked_tickers.tolist(),
            ["SECTOR"] + ranked_sectors,
            ["INDUSTRY"] + ranked_industries,
            ["PRICE"]
            + ["{} {}".format(np.round(ranked_p[i, -1], 2), ranked_currencies[i]) for i in range(num_stocks)],
            ["RATE"] + ranked_rates,
            ["GROWTH"] + ranked_growth.tolist(),
            ["VOLATILITY"] + ranked_volatility.tolist(),
            ["MATCH"] + (ranked_matches.tolist() if num_stocks > 1 else ["None"]),
        )

        # newline="" lets the csv writer control line endings (avoids blank rows on Windows)
        with open(tab_name, "w", newline="") as file:
            wr = csv.writer(file)
            wr.writerows(table)
        print("\nThe prediction table printed above has been saved to {}/{}.".format(os.getcwd(), tab_name))