-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathclean.js
54 lines (46 loc) · 1.91 KB
/
clean.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
/*jshint esversion: 6 */
const readline = require('readline');
const fs = require('fs');
const stop_words = require('stopwords').english;
// Driver script: stream training_data.txt line by line, strip stop words from
// each line, and write the result to training_data_generated.txt (also echoed
// to stdout).
const rl = readline.createInterface({
  input: fs.createReadStream('training_data.txt')
});
// Explicitly declared so the descriptor is not leaked as an implicit global.
const fd = fs.openSync('training_data_generated.txt', 'w');
rl.on('line', (line) => {
  // Clean the line once and reuse the result for both outputs.
  const cleaned = line.removeStopWords();
  // Synchronous write: needs no callback and keeps output lines in input order.
  fs.writeSync(fd, cleaned + '\n');
  console.log(cleaned);
});
// readline emits 'close' once the input stream has ended; release the
// output descriptor at that point.
rl.on('close', () => {
  fs.closeSync(fd);
});
/**
 * Returns a copy of this string with stop words removed and leading/trailing
 * whitespace trimmed.
 *
 * A token is removed when it is delimited by whitespace (or the string
 * boundaries) and equals, case-insensitively, an entry of the module-level
 * `stop_words` list. Each removed token, together with its surrounding
 * whitespace, is replaced by a single space; whitespace between surviving
 * tokens is otherwise left untouched. Tokens with attached punctuation
 * (e.g. "the,") are not removed because the removal pattern requires
 * whitespace or a string boundary on both sides.
 *
 * @returns {string} The cleansed string; "" for empty or whitespace-only input.
 */
String.prototype.removeStopWords = function() {
  let cleansedString = this.valueOf();

  // Split out the individual whitespace-delimited tokens. match() returns
  // null when there are none (empty or whitespace-only input).
  const tokens = cleansedString.match(/\S+/g);
  if (tokens === null) {
    return cleansedString.replace(/^\s+|\s+$/g, "");
  }

  // Set lookup instead of scanning the whole stop list for every token.
  const stopWordSet = new Set(stop_words);

  for (const token of tokens) {
    // Normalise the token: drop non-alphabetic characters and lower-case it.
    const word = token.replace(/[^a-z]+/ig, "").toLowerCase();
    if (!stopWordSet.has(word)) {
      continue;
    }

    // `word` contains only a-z here, so it is safe to embed in a pattern
    // without escaping.
    const pattern = [
      "^\\s*" + word + "\\s*$", // Only word
      "^\\s*" + word + "\\s+",  // First word
      "\\s+" + word + "\\s*$",  // Last word
      "\\s+" + word + "\\s+"    // Word somewhere in the middle
    ].join("|");

    // One pass per occurrence of the token: a single global pass can skip
    // every second one of several adjacent occurrences, because the
    // whitespace between them is consumed by the preceding match.
    cleansedString = cleansedString.replace(new RegExp(pattern, "ig"), " ");
  }

  return cleansedString.replace(/^\s+|\s+$/g, "");
};