getEmbeddings2.py
import numpy as np
import re
import string
import pandas as pd
from gensim.models import Doc2Vec
from gensim.models.doc2vec import TaggedDocument  # replacement for LabeledSentence, removed in gensim 4.0
from gensim import utils
from nltk.corpus import stopwords
# The NLTK stopword list must be available locally; run nltk.download('stopwords') once if it is missing.

def textClean(text):
    # Keep a restricted character set, lower-case the text, and drop English stopwords.
    # The '-' is escaped so it is treated literally rather than as an accidental range.
    text = re.sub(r"[^A-Za-z0-9^,!.\/'+\-=]", " ", text)
    text = text.lower().split()
    stops = set(stopwords.words("english"))
    text = [w for w in text if w not in stops]
    text = " ".join(text)
    return text


def cleanup(text):
    # Clean the text, then strip any punctuation that survived textClean.
    text = textClean(text)
    text = text.translate(str.maketrans("", "", string.punctuation))
    return text


def constructLabeledSentences(data):
    # Wrap each document in a TaggedDocument (the modern replacement for
    # LabeledSentence) so the corpus can be fed to gensim's Doc2Vec.
    sentences = []
    for index, row in data.items():  # Series.iteritems() was removed in pandas 2.0
        sentences.append(
            TaggedDocument(utils.to_unicode(row).split(), ['Text_%s' % str(index)]))
    return sentences
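

# Illustrative sketch, not part of the original script: the Doc2Vec import above is only
# useful once the tagged sentences are actually trained on. A minimal helper (hypothetical
# name, assuming gensim >= 4.0) could look like this:
def trainDoc2Vec(sentences, vector_dimension=300):
    # Build the vocabulary from the tagged sentences and train a Doc2Vec model.
    model = Doc2Vec(min_count=1, window=5, vector_size=vector_dimension,
                    sample=1e-4, negative=5, workers=4, epochs=10)
    model.build_vocab(sentences)
    model.train(sentences, total_examples=model.corpus_count, epochs=model.epochs)
    return model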


def clean_test_data(path):
    # Load the test CSV, drop rows whose 'text' field is missing, clean every
    # document, and save the resulting array to disk.
    data = pd.read_csv(path)

    missing_rows = []
    for i in range(len(data)):
        if pd.isna(data.loc[i, 'text']):  # the original used the x != x NaN trick
            missing_rows.append(i)
    data = data.drop(missing_rows).reset_index().drop(['index', 'id'], axis=1)

    for i in range(len(data)):
        data.loc[i, 'text'] = cleanup(data.loc[i, 'text'])

    x = data.loc[:, 'text'].values
    np.save('test_data.npy', x)


def clean_data(path='datasets/train.csv'):
    # Load the training CSV, drop rows with missing text, clean every document,
    # shuffle, and save an 80/20 train/test split to disk.
    vector_dimension = 300  # embedding size; not used by the cleaning step itself
    data = pd.read_csv(path)

    missing_rows = []
    for i in range(len(data)):
        if pd.isna(data.loc[i, 'text']):
            missing_rows.append(i)
    data = data.drop(missing_rows).reset_index().drop(['index', 'id'], axis=1)

    for i in range(len(data)):
        data.loc[i, 'text'] = cleanup(data.loc[i, 'text'])

    data = data.sample(frac=1).reset_index(drop=True)

    x = data.loc[:, 'text'].values
    y = data.loc[:, 'label'].values

    data_split = int(0.8 * len(y))
    xtr = x[:data_split]
    xte = x[data_split:]
    ytr = y[:data_split]
    yte = y[data_split:]

    np.save('xtr_shuffled.npy', xtr)
    np.save('xte_shuffled.npy', xte)
    np.save('ytr_shuffled.npy', ytr)
    np.save('yte_shuffled.npy', yte)
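

# Minimal usage sketch, not in the original file: run both cleaning steps directly.
# 'datasets/test.csv' is an assumed path; point it at the actual test CSV.
if __name__ == "__main__":
    clean_data('datasets/train.csv')      # writes xtr/xte/ytr/yte_shuffled.npy
    clean_test_data('datasets/test.csv')  # writes test_data.npy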