import re
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import string
import nltk
from nltk import FreqDist
import warnings
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.corpus import brown
from nltk.collocations import *
from sklearn.feature_extraction.text import TfidfVectorizer
#stop_words = set(stopwords.words('english'))
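# load the previously scraped tweets; output.csv is assumed to contain 'text' and 'id' columns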
train = pd.read_csv('./output.csv')
#print (train['text'])
tweets = train['text']
# keep only letters and '#' so hashtags survive (regex=True is required in recent pandas versions)
train['text'] = train['text'].str.replace("[^a-zA-Z#]", " ", regex=True)
# removing short words (three characters or fewer)
train['text'] = train['text'].apply(lambda x: ' '.join([w for w in x.split() if len(w) > 3]))
# tokenization
tokenized_tweet = train['text'].apply(lambda x: x.split())
#print (tokenized_tweet)
# stemming would be done in this part (the stemming step itself is currently commented out)
stemmer = PorterStemmer()
#tokenized_tweet = tokenized_tweet.apply(lambda x: [stemmer.stem(i) for i in x])
#print(tokenized_tweet)
new_tweet = tokenized_tweet
#print(new_tweet)
# join all tokenized (and optionally stemmed) tokens back together
for i in range(len(tokenized_tweet)):
    tokenized_tweet[i] = ' '.join(tokenized_tweet[i])
train['text'] = tokenized_tweet
#print(train['text'].head())
# removing https:// and URLs
#creating a .csv file storing values
#print(tweets)
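# NOTE: the comments above mention removing https:// and URLs, but no such step exists in this
# script. A minimal sketch (it would have to run before the "[^a-zA-Z#]" replacement above,
# while the URLs are still intact):
#     train['text'] = train['text'].str.replace(r"https?://\S+", " ", regex=True)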
def hashtag_extract(x):
    # collect the hashtags (words prefixed with '#') found in each tweet
    hashtags = []
    for i in x:
        ht = re.findall(r"#(\w+)", i)
        hashtags.append(ht)
    return hashtags
# extracting hashtags from the tweets
HT_regular = hashtag_extract(train['text'])
cleaned_tweet = HT_regular
#print(cleaned_tweet)
# flatten the list of lists into a single list of hashtags
HT_regular = sum(HT_regular, [])
# plotting the frequency distribution of the non-racist hashtags
a = nltk.FreqDist(HT_regular)
a.plot(30)
d = pd.DataFrame({'Hashtag': list(a.keys()),
                  'Count': list(a.values())})
# selecting the top 10 most frequent hashtags
d = d.nlargest(columns="Count", n=10)
plt.figure(figsize=(15, 5))
ax = sns.barplot(data=d, x="Hashtag", y="Count")
ax.set(ylabel='Count')
plt.savefig('/home/rrahul/seabornDesktop/figure_1.png',dpi=100)
plt.show()
# Location (note: this actually reads the 'id' column, not a location field)
location = train['id']
print(location)
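# A possible export of the cleaned text, as hinted by the "creating a .csv file storing values"
# comment earlier (the cleaned_tweets.csv filename is an assumption):
#     train[['id', 'text']].to_csv('cleaned_tweets.csv', index=False)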