-
Notifications
You must be signed in to change notification settings - Fork 1
/
oJobWatchDog.yaml
327 lines (291 loc) · 13.5 KB
/
oJobWatchDog.yaml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
# oJob WatchDog functionality
# Copyright 2023 Nuno Aguiar
jobs:
- name: oJob WatchDog
help: |
Verifies that a process (OpenAF/oJob/etc...) is running. Expects:
- cmdToStop (String) If defined will execute the command on the stop event.
- execToStop (String) If defined will execute the code on the stop event.
- jobToStop (String) If defined will execute the job on the stop event.
- waitAfterStop (Number) Number of ms to wait after stopping.
- workDirStop (String) The working directory to use for the stop command.
- timeoutStop (Number) Timeout waiting for cmdToStop to exit.
- exitCodeStop (Number) If defined the cmdToStop exitcode must be this value.
- cmdToStart (String) Command to startup (in background (use start (in windows) or & (in unix))) on the start event.
- execToStart (String) If defined will execute the code on the start event.
- jobToStart (String) If defined will execute the job on the start event.
- waitAfterStart (Number) Number of ms to wait after starting.
- workDirStart (String) The working directory to use for startup command.
- timeoutStart (Number) Timeout waiting for cmdToStart to exit.
- exitCodeStart (Number) If defined the cmdToStart exitcode must be this value.
- checks (Map) Map of checks to perform to determine if it's running (if any check fails it will assume a restart is needed)
- checks.pid.file (String) The pid file location (when ojob uses unique pidFile ojob options).
- checks.log.folder (String) Folder where the log files to check are located.
- checks.log.fileRE (String) Checks for files matching fileRE choosing the latest by modified date.
- checks.log.stringRE (Array) Array of regular expressions strings to look for. If found assumes a restart is needed.
- checks.log.histFile (String) The file where to store the history of findings on the file to avoid duplicate findings.
- checks.log.olderMin (Number) Checks if the latest log file (fileRE) by modified date is older than x minutes.
- checks.custom.exec (String) Executes the corresponding code in a function and passes if returns true or fails assuming a restart is needed if returns false.
- quiet (Boolean) If true will only produce logging if something is not right (default is true)
- jobToNotify (String) If defined will execute when finished checking, stopped or started (receives args.__watchdog and should check phase for start, stop and checks)
exec: |
  // oJob WatchDog main routine.
  //
  // Flow: run the configured checks (pid file, custom code, log file), queue
  // jobToNotify, and, when any check requested it or args.STOP is set, run the
  // stop actions followed (unless args.STOP is set) by the start actions.
  // Everything found along the way is accumulated in args.__watchdog (phase,
  // shouldRestart, checks and the result of each stop/start action), which is
  // the map jobToNotify receives through args.
  args.checks         = _$(args.checks).isMap().default({});
  args.__watchdog     = { shouldRestart: false };
  args.timeoutStart   = _$(args.timeoutStart).isNumber().default(void 0);
  args.timeoutStop    = _$(args.timeoutStop).isNumber().default(void 0);
  args.workDirStart   = _$(args.workDirStart).isString().default(void 0);
  args.workDirStop    = _$(args.workDirStop).isString().default(void 0);
  args.cmdToStart     = _$(args.cmdToStart).isString().default(void 0);
  args.exitCodeStart  = _$(args.exitCodeStart).isNumber().default(void 0);
  args.waitAfterStart = _$(args.waitAfterStart).isNumber().default(void 0);
  args.exitCodeStop   = _$(args.exitCodeStop).isNumber().default(void 0);
  args.waitAfterStop  = _$(args.waitAfterStop).isNumber().default(void 0);
  args.quiet          = _$(args.quiet).isBoolean().default(true);
  args.jobToNotify    = _$(args.jobToNotify).isString().default(void 0);

  var shouldRestart = false;

  // Marks that a restart is needed, keeping the local flag (which gates the
  // restart section) and the reported flag (args.__watchdog) in sync.
  var flagRestart = () => {
    shouldRestart = true;
    args.__watchdog.shouldRestart = true;
  };

  // Queues jobToNotify, when defined, with the current args (including
  // args.__watchdog). Errors are logged and rethrown.
  var notify = () => {
    if (isUnDef(args.jobToNotify)) return;
    try {
      //oJobRunJob(args.jobToNotify, args);
      ow.oJob.add2Todo({ name: args.jobToNotify, args: args });
    } catch(e) {
      logErr("Found problem while trying to notify executing job '" + args.jobToNotify + "' (" + String(e) + ")");
      throw e;
    }
  };

  // Checks
  // ------
  args.__watchdog.checks = {};
  args.__watchdog.phase  = "checks";

  // Check pid file
  // --------------
  if (isDef(args.checks.pid)) {
    var running = false;
    try {
      if (!args.quiet) log("Checking pid '" + args.checks.pid.file + "'");
      if (io.fileExists(args.checks.pid.file)) {
        // Keep only the leading digits of each line of the pid file before checking
        var pidAlive = pidCheck(io.readFileString(args.checks.pid.file).replace(/^(\d+)[.\n\r]*/mg, "$1"));
        args.__watchdog.checks.pid = pidAlive;
        if (pidAlive) {
          running = true;
          if (!args.quiet) log("'" + args.checks.pid.file + "' is running.");
        }
      }
      if (!running) {
        logWarn("'" + args.checks.pid.file + "' not running.");
        // NOTE(review): a restart is only requested when cmdToStart is defined;
        // execToStart/jobToStart alone do not trigger it. Confirm this is intended.
        if (isDef(args.cmdToStart)) flagRestart();
      }
    } catch(e) {
      logErr("Found problem while checking for '" + args.checks.pid.file + "' (" + String(e) + ")");
      throw e;
    }

    // With no explicit stop action, default to killing the pid in the pid file.
    // The path is embedded with JSON.stringify so quotes and backslashes
    // (e.g. Windows paths) cannot break the generated code.
    if (isUnDef(args.execToStop) && isUnDef(args.cmdToStop)) {
      if (io.fileExists(args.checks.pid.file)) {
        args.execToStop = "pidKill(io.readFileString(" + JSON.stringify(String(args.checks.pid.file)) + "), true);";
      }
    }
  }

  // Check custom
  // ------------
  if (isDef(args.checks.custom) && isDef(args.checks.custom.exec)) {
    if (!args.quiet) log("Checking custom...");
    try {
      // The custom code runs as the body of function(args); a falsy return asks for a restart
      var customFn  = new Function("args", args.checks.custom.exec);
      var customRes = customFn(args);
      args.__watchdog.checks.custom = customRes;
      if (!customRes) flagRestart();
    } catch(e) {
      logErr("Found problem while custom checking (" + String(e) + ")");
      throw e;
    }
  }

  // Check log file
  // --------------
  // folder, fileRE and histFile are required; stringRE and olderMin are optional.
  if (isDef(args.checks.log) &&
      isDef(args.checks.log.fileRE) &&
      isDef(args.checks.log.histFile) &&
      isDef(args.checks.log.folder)) {
    var logStream;
    try {
      if (isString(args.checks.log.stringRE)) args.checks.log.stringRE = [ args.checks.log.stringRE ];
      if (isUnDef(args.checks.log.stringRE))  args.checks.log.stringRE = [];
      if (!args.quiet) log("Checking log file at " + args.checks.log.folder + " for patterns like '" + args.checks.log.stringRE.join("', '") + "'...");

      // Latest (by modified date) file in folder whose name matches fileRE
      var latest = $from(io.listFiles(args.checks.log.folder).files).match("filename", args.checks.log.fileRE).max("lastModified");
      if (isUnDef(latest) || latest == null) {
        throw "No file matching '" + args.checks.log.fileRE + "' found in '" + args.checks.log.folder + "'";
      }

      // Stale log: the latest log file was last modified more than olderMin minutes ago
      if (isDef(args.checks.log.olderMin) && ow.loadFormat().dateDiff.inMinutes(new Date(latest.lastModified), new Date()) > args.checks.log.olderMin) {
        flagRestart();
        args.__watchdog.checks.logOlderMin = new Date(latest.lastModified);
        logWarn("Log is older than " + args.checks.log.olderMin + " minutes (" + new Date(latest.lastModified) + ")");
      }

      // The history file remembers how far the log was already scanned. Start
      // from line 0 when there is no history, the latest file changed or the
      // file is now smaller than what was recorded.
      var freshHistory = () => ({ file: latest.filename, size: latest.size, line: 0 });
      var history;
      if (io.fileExists(args.checks.log.histFile)) {
        history = io.readFile(args.checks.log.histFile);
        if (history.file != latest.filename || history.size > latest.size) history = freshHistory();
      } else {
        history = freshHistory();
      }

      // lines is created once so every finding of this run is kept
      args.__watchdog.checks.log = { history: history, lines: [] };

      // Compile the patterns once instead of once per line read
      var patterns  = args.checks.log.stringRE.map(re => new RegExp(re, "i"));
      var lineCount = 0;
      logStream = io.readFileStream(latest.canonicalPath);
      ioStreamReadLines(logStream, line => {
        lineCount++;
        // Only lines after the last recorded position are inspected
        if (lineCount > history.line) {
          patterns.forEach(pattern => {
            if (line.match(pattern)) {
              flagRestart();
              args.__watchdog.checks.log.lines.push({ lineNum: lineCount, line: line, file: latest.canonicalPath });
              logWarn("Found line '" + line + "' on line " + lineCount + " of " + latest.canonicalPath + ".");
            }
          });
        }
      });

      // Record the new position so the same findings are not reported again
      history.line = lineCount;
      history.size = io.fileInfo(latest.canonicalPath).size;
      io.writeFile(args.checks.log.histFile, history);
    } catch(e) {
      logErr("Found problem while checking for log entries in '" + args.checks.log.folder + "' (" + String(e) + ")");
      throw e;
    } finally {
      // Best-effort close so the stream is released even when a check fails
      if (isDef(logStream)) {
        try { logStream.close(); } catch(ce) { /* nothing else to do on a failed close */ }
      }
    }
  }

  // Notify the result of the checks phase, whichever checks were configured
  notify();

  // Restart
  // -------
  if (shouldRestart || (isDef(args.STOP) && args.STOP)) {
    args.__watchdog.phase = "stop";

    // STOP
    // ----
    if (isDef(args.execToStop)) {
      try {
        args.__watchdog.execToStop = af.eval(args.execToStop);
      } catch(e) {
        logErr("Found problem while trying to restart executing '" + args.execToStop + "' (" + String(e) + ")");
        throw e;
      }
    }

    if (isDef(args.jobToStop)) {
      try {
        //oJobRunJob(args.jobToStop, args);
        ow.oJob.add2Todo({ name: args.jobToStop, args: args });
      } catch(e) {
        logErr("Found problem while trying to stop executing job '" + args.jobToStop + "' (" + String(e) + ")");
        throw e;
      }
    }

    if (isDef(args.cmdToStop)) {
      log("Trying to stop '" + args.cmdToStop + "'");
      try {
        var stopRes = sh(args.cmdToStop, "", args.timeoutStop, true, args.workDirStop, true);
        args.__watchdog.cmdToStop = stopRes;
        if (isDef(args.exitCodeStop)) {
          // An explicit expected exit code makes any other value an error
          if (stopRes.exitcode != args.exitCodeStop) throw "'" + args.cmdToStop + "' exit code returned was " + stopRes.exitcode;
        } else {
          if (stopRes.exitcode != 0) {
            logWarn("While stopping '" + args.cmdToStop + "' received exit code = " + stopRes.exitcode);
          }
        }
      } catch(e) {
        logErr("Found problem while trying to stop '" + args.cmdToStop + "' (" + String(e) + ")");
        throw e;
      }
    }

    // Notify the result of the stop phase
    notify();

    if (isDef(args.waitAfterStop)) {
      if (!args.quiet) { log("Waiting after stop for " + args.waitAfterStop + "ms"); }
      sleep(args.waitAfterStop);
    }

    // START
    // -----
    // Skipped when args.STOP is set (stop only)
    if (isUnDef(args.STOP) || !args.STOP) {
      args.__watchdog.phase = "start";

      if (isDef(args.execToStart)) {
        try {
          args.__watchdog.execToStart = af.eval(args.execToStart);
        } catch(e) {
          logErr("Found problem while trying to start executing '" + args.execToStart + "' (" + String(e) + ")");
          throw e;
        }
      }

      if (isDef(args.jobToStart)) {
        try {
          //oJobRunJob(args.jobToStart, args);
          ow.oJob.add2Todo({ name: args.jobToStart, args: args });
        } catch(e) {
          logErr("Found problem while trying to start executing job '" + args.jobToStart + "' (" + String(e) + ")");
          throw e;
        }
      }

      if (isDef(args.cmdToStart)) {
        log("Trying to restart '" + args.cmdToStart + "'");
        try {
          var startRes = sh(args.cmdToStart, "", args.timeoutStart, true, args.workDirStart, true);
          args.__watchdog.cmdToStart = startRes;
          if (isDef(args.exitCodeStart)) {
            // An explicit expected exit code makes any other value an error
            if (startRes.exitcode != args.exitCodeStart) throw "'" + args.cmdToStart + "'exit code returned was " + startRes.exitcode;
          } else {
            if (startRes.exitcode != 0) {
              logWarn("While restarting '" + args.cmdToStart + "' received exit code = " + startRes.exitcode);
            }
          }
        } catch(e) {
          logErr("Found problem while trying to start '" + args.cmdToStart + "' (" + String(e) + ")");
          throw e;
        }
      }

      if (isDef(args.waitAfterStart)) {
        if (!args.quiet) { log("Waiting after start for " + args.waitAfterStart + "ms"); }
        sleep(args.waitAfterStart);
      }

      // Notify the result of the start phase
      notify();
    }
  }
#todo:
# - name: oJob WatchDog
# args:
# checks :
# pid:
# file: /some/path/a.pid
# log :
# folder : /some/path/logya
# fileRE : log-\d+-\d+-\d+.log
# histFile: /some/path/logya/logya.json
# stringRE: OutOfMemory
# custom:
# exec: |
# print(123);
# return false;
#
# cmdToStart : start ojob /some/path/a.yaml
# workDirStart : /some/path/
# waitAfterStart: 5000
#
# execToStop : |
# pidKill(io.readFileString("/some/path/a.pid"), true);
#
# quiet : false