getEmbeddings2.py
import numpy as np
import re
import string
import pandas as pd
from gensim.models import Doc2Vec
from gensim.models.doc2vec import TaggedDocument  # replacement for LabeledSentence, removed in gensim 4.0
from gensim import utils
from nltk.corpus import stopwords
# The NLTK stopword list must be available locally; run nltk.download('stopwords') once if it is missing.

def textClean(text):
    # Keep a restricted character set, lower-case the text, and drop English stopwords.
    # The '-' is escaped so it is treated literally rather than as an accidental range.
    text = re.sub(r"[^A-Za-z0-9^,!.\/'+\-=]", " ", text)
    text = text.lower().split()
    stops = set(stopwords.words("english"))
    text = [w for w in text if w not in stops]
    text = " ".join(text)
    return text


def cleanup(text):
    # Clean the text, then strip any punctuation that survived textClean.
    text = textClean(text)
    text = text.translate(str.maketrans("", "", string.punctuation))
    return text


def constructLabeledSentences(data):
    # Wrap each document in a TaggedDocument (the modern replacement for
    # LabeledSentence) so the corpus can be fed to gensim's Doc2Vec.
    sentences = []
    for index, row in data.items():  # Series.iteritems() was removed in pandas 2.0
        sentences.append(
            TaggedDocument(utils.to_unicode(row).split(), ['Text_%s' % str(index)]))
    return sentences
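

# Illustrative sketch, not part of the original script: the Doc2Vec import above is only
# useful once the tagged sentences are actually trained on. A minimal helper (hypothetical
# name, assuming gensim >= 4.0) could look like this:
def trainDoc2Vec(sentences, vector_dimension=300):
    # Build the vocabulary from the tagged sentences and train a Doc2Vec model.
    model = Doc2Vec(min_count=1, window=5, vector_size=vector_dimension,
                    sample=1e-4, negative=5, workers=4, epochs=10)
    model.build_vocab(sentences)
    model.train(sentences, total_examples=model.corpus_count, epochs=model.epochs)
    return model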


def clean_test_data(path):
    # Load the test CSV, drop rows whose 'text' field is missing, clean every
    # document, and save the resulting array to disk.
    data = pd.read_csv(path)

    missing_rows = []
    for i in range(len(data)):
        if pd.isna(data.loc[i, 'text']):  # the original used the x != x NaN trick
            missing_rows.append(i)
    data = data.drop(missing_rows).reset_index().drop(['index', 'id'], axis=1)

    for i in range(len(data)):
        data.loc[i, 'text'] = cleanup(data.loc[i, 'text'])

    x = data.loc[:, 'text'].values
    np.save('test_data.npy', x)


def clean_data(path='datasets/train.csv'):
    # Load the training CSV, drop rows with missing text, clean every document,
    # shuffle, and save an 80/20 train/test split to disk.
    vector_dimension = 300  # embedding size; not used by the cleaning step itself
    data = pd.read_csv(path)

    missing_rows = []
    for i in range(len(data)):
        if pd.isna(data.loc[i, 'text']):
            missing_rows.append(i)
    data = data.drop(missing_rows).reset_index().drop(['index', 'id'], axis=1)

    for i in range(len(data)):
        data.loc[i, 'text'] = cleanup(data.loc[i, 'text'])

    data = data.sample(frac=1).reset_index(drop=True)

    x = data.loc[:, 'text'].values
    y = data.loc[:, 'label'].values

    data_split = int(0.8 * len(y))
    xtr = x[:data_split]
    xte = x[data_split:]
    ytr = y[:data_split]
    yte = y[data_split:]

    np.save('xtr_shuffled.npy', xtr)
    np.save('xte_shuffled.npy', xte)
    np.save('ytr_shuffled.npy', ytr)
    np.save('yte_shuffled.npy', yte)
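

# Minimal usage sketch, not in the original file: run both cleaning steps directly.
# 'datasets/test.csv' is an assumed path; point it at the actual test CSV.
if __name__ == "__main__":
    clean_data('datasets/train.csv')      # writes xtr/xte/ytr/yte_shuffled.npy
    clean_test_data('datasets/test.csv')  # writes test_data.npy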