SGD_AdaGrad.cpp
#include <Rcpp.h>
#include <RcppEigen.h>
// [[Rcpp::depends(RcppEigen)]]
#include <cmath>
using Eigen::Map;
using Eigen::MatrixXd;
using Eigen::VectorXd;
using Eigen::MappedSparseMatrix;
using Eigen::SparseMatrix;
using Eigen::RowMajor;
/*
* Fast inverse square root approximation.
* Copy/Pasted from Wikipedia:
* https://en.wikipedia.org/wiki/Fast_inverse_square_root
*/
union cast_double{ uint64_t asLong; double asDouble; };
static inline double invSqrt( const double& x )
{ //Stolen from physicsforums
cast_double caster;
caster.asDouble = x;
double xhalf = ( double )0.5 * caster.asDouble;
caster.asLong = 0x5fe6ec85e7de30daLL - ( caster.asLong >> 1 );
double y = caster.asDouble;
y = y * ( ( double )1.5 - xhalf * y * y );
y = y * ( ( double )1.5 - xhalf * y * y ); //For better accuracy
return y;
}
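/*
 * invSqrt(x) approximates 1.0 / std::sqrt(x): the magic-constant line produces a
 * rough initial guess by reinterpreting the bits of x, and the two multiplications
 * are Newton refinement steps. In the AdaGrad update below it only scales step
 * sizes, so if the approximation error were ever a concern, a minimal exact
 * (but slower) drop-in sketch would be:
 *
 *   static inline double invSqrtExact( const double& x ) { return 1.0 / std::sqrt(x); }
 */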
/**
* Computes the negative log likelihood of beta given X, y, and m (minus some
* value constant in beta).
* Implements the expression: (m - y)' X beta + m' log(1 + exp(-X beta))
*
* Assumes yi ~ Binomial(mi, wi), where wi = 1/(1 + exp(-xi' * beta)).
*
* Args:
* beta: the weight vector
* y: the vector of labels
* X: the feature matrix (each row is one data point)
* m: the vector of number of trials for each data point
*/
// [[Rcpp::export]]
double computeNll_cpp(
const VectorXd& beta,
const VectorXd& y,
const SparseMatrix<double, RowMajor>& X,
const VectorXd& m) {
VectorXd Xbeta = X * beta;
VectorXd exp_term = (1.0 + (-1.0 * Xbeta.array()).exp()).log();
double nll = (m - y).dot(Xbeta) + m.dot(exp_term);
return nll;
}
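/*
 * Derivation sketch for the expression above: with w_i = 1/(1 + exp(-x_i' beta)),
 * the binomial negative log-likelihood, after dropping the log-binomial-coefficient
 * terms that are constant in beta, is
 *
 *   -sum_i [ y_i * log(w_i) + (m_i - y_i) * log(1 - w_i) ]
 *     = sum_i [ (m_i - y_i) * x_i' beta + m_i * log(1 + exp(-x_i' beta)) ]
 *     = (m - y)' X beta + m' log(1 + exp(-X beta)),
 *
 * which is exactly what computeNll_cpp evaluates.
 */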
/**
* Computes the negative log likelihood of beta for a single data point (minus
* some value constant in beta).
*
* Assumes yi ~ Binomial(mi, wi), where wi = 1/(1 + exp(-xi' * beta)).
*
* Args:
* beta: the weight vector
* y: the vector of labels
* X: the feature matrix (each row is one data point)
* m: the vector of number of trials for each data point
* index: the row index into y, X, and m.
*/
double computeNll_single_cpp(
const VectorXd& beta,
const VectorXd& y,
const SparseMatrix<double, RowMajor>& X,
const VectorXd& m,
const int index) {
double single_y = y(index);
double single_m = m(index);
double xBeta = 0;
for (SparseMatrix<double, RowMajor>::InnerIterator it(X, index); it; ++it) {
xBeta += it.value() * beta(it.index());
}
double right = single_m * std::log(1 + std::exp(-xBeta));
double left = (single_m - single_y) * xBeta;
double nll = left + right;
return nll;
}
/**
* Computes the gradient of the negative log-likelihood for a single data point.
* Implements the expression: x_i (m_i w_i - y_i), where w_i = 1/(1 + exp(-x_i' beta)).
*
* Args:
* beta: the weight vector
* single_y: the label for this data point
* X: the feature matrix (each row is one data point)
* single_m: the number of trials for this data point
* index: the row index into X
*
* Returns:
* the gradient as a sparse column vector, nonzero only where row `index` of X is nonzero
*/
SparseMatrix<double> computeGradient_cpp(
const VectorXd& beta,
const double single_y,
const SparseMatrix<double, RowMajor>& X,
const double single_m,
const int index) {
double exponent = 0;
int nnz = 0;
for (SparseMatrix<double, RowMajor>::InnerIterator it(X, index); it; ++it) {
exponent += it.value() * beta(it.index());
nnz++;
}
double w = 1 / (1 + std::exp(-exponent));
double err = single_m * w - single_y;
SparseMatrix<double> gradient(X.cols(), 1);
gradient.reserve(nnz);
for (SparseMatrix<double, RowMajor>::InnerIterator it(X, index); it; ++it) {
gradient.insert(it.index(), 0) = it.value() * err;
}
return gradient;
}
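/*
 * Gradient sketch: differentiating the single-point negative log-likelihood
 *
 *   (m_i - y_i) * x_i' beta + m_i * log(1 + exp(-x_i' beta))
 *
 * with respect to beta gives
 *
 *   (m_i - y_i) * x_i + m_i * (w_i - 1) * x_i  =  (m_i * w_i - y_i) * x_i,
 *
 * which matches err = single_m * w - single_y above. Because x_i is sparse, the
 * gradient is nonzero only on the coordinates where x_i is nonzero.
 */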
/**
* AdaGrad update for SGD, applied to the single data point given by `index`.
* Performs updates (beta and hessian_approx) in place, so there is no return value.
*/
void AdaGradUpdate_cpp(
VectorXd& beta,
const VectorXd& y,
const SparseMatrix<double, RowMajor>& X,
const VectorXd& m,
VectorXd& hessian_approx,
const double learning_rate,
const int index) {
// compute gradient vector
SparseMatrix<double> gradient = computeGradient_cpp(beta, y(index), X, m(index), index);
// update hessian in place
// update beta in place
for (SparseMatrix<double>::InnerIterator it(gradient, 0); it; ++it) {
hessian_approx(it.index()) += std::pow(it.value(), 2); // h <- h + g^2
beta(it.index()) -= learning_rate * it.value() * invSqrt(hessian_approx(it.index()) + 1e-8);
}
}
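/*
 * In equation form, the per-coordinate update performed above is (with
 * epsilon = 1e-8 for numerical stability):
 *
 *   h_j    <- h_j + g_j^2
 *   beta_j <- beta_j - learning_rate * g_j / sqrt(h_j + epsilon)
 *
 * Only coordinates where the current data point has nonzero features are touched,
 * so each update costs O(nnz(x_i)) rather than O(p).
 */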
/**
* Stochastic gradient descent with AdaGrad.
*
* Args:
* y: the vector of labels
* X: the feature matrix (each row is one data point)
* m: the vector of number of trials for each data point
* num_epochs: number of times the whole dataset is iterated through
* learning_rate: learning rate for AdaGrad algorithm
*
* Returns:
* a list with the following elements
* beta: regression coefficients
* sample_likelihoods: negative log-likelihood of beta after each update, evaluated on the data point used for that update
* avg_likelihoods: exponentially weighted running average of sample_likelihoods
* full_likelihoods: negative log-likelihood of beta after each update, evaluated on the entire dataset
*/
// [[Rcpp::export]]
Rcpp::List SGD_AdaGrad_cpp(
const Map<VectorXd> y,
const MappedSparseMatrix<double> X_bycol,
const Map<VectorXd> m,
int num_epochs,
double learning_rate) {
// convert to RowMajor
const SparseMatrix<double, RowMajor> X(X_bycol);
// initialize beta to be all 0
VectorXd beta = VectorXd::Zero(X.cols());
// initialize estimate of diagonal approx to Hessian
VectorXd hessian_approx = VectorXd::Constant(X.cols(), 0.001);
// create vectors to keep track of negative log-likelihood values
VectorXd sample_likelihoods = VectorXd::Zero(num_epochs * X.rows() + 1);
VectorXd avg_likelihoods = VectorXd::Zero(num_epochs * X.rows() + 1);
VectorXd full_likelihoods = VectorXd::Zero(num_epochs * X.rows() + 1);
int i = 0;
while (num_epochs-- > 0) {
for (int index = 0; index < X.rows(); index++) {
// update beta and hessian_approx in place
AdaGradUpdate_cpp(beta, y, X, m, hessian_approx, learning_rate, index);
// get new negative log-likelihood
sample_likelihoods(i) = computeNll_single_cpp(beta, y, X, m, index);
full_likelihoods(i) = computeNll_cpp(beta, y, X, m);
// update exponentially weighted moving average
if (i == 0) {
avg_likelihoods(i) = sample_likelihoods(i);
} else {
avg_likelihoods(i) = 0.99 * avg_likelihoods(i-1) + 0.01 * sample_likelihoods(i);
}
i++;
}
}
return Rcpp::List::create(
Rcpp::Named("beta") = beta,
Rcpp::Named("sample_likelihoods") = sample_likelihoods,
Rcpp::Named("full_likelihoods") = full_likelihoods,
Rcpp::Named("avg_likelihoods") = avg_likelihoods
);
}
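/*
 * A usage sketch from R. Assumptions: the file is compiled with
 * Rcpp::sourceCpp("SGD_AdaGrad.cpp"), which also runs the embedded R chunk below,
 * and the Matrix package is available. The simulated data is purely illustrative.
 */
/*** R
library(Matrix)
set.seed(1)
n <- 500; p <- 20
X <- rsparsematrix(n, p, density = 0.1)              # dgCMatrix (column-major)
beta_true <- rnorm(p)
m <- rep(10, n)                                      # trials per observation
w <- 1 / (1 + exp(-as.vector(X %*% beta_true)))      # success probabilities
y <- as.numeric(rbinom(n, size = m, prob = w))       # binomial counts (as doubles)
fit <- SGD_AdaGrad_cpp(y, X, m, num_epochs = 5, learning_rate = 0.1)
nll <- as.numeric(fit$full_likelihoods)
plot(head(nll, -1), type = "l",
     xlab = "update", ylab = "negative log-likelihood")
*/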