-
Notifications
You must be signed in to change notification settings - Fork 0
/
LyricsTextAnalysis.py
195 lines (156 loc) · 6.91 KB
/
LyricsTextAnalysis.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
import pandas as pd # for importing & transforming data
import re # for regular expressions
import matplotlib.pyplot as plt # for plots
import seaborn as sns # for charts
from wordcloud import WordCloud # for the wordcloud
from spacy.lang.en import English # for tokenising text
nlp = English() # for tokenising text
from collections import Counter # for getting freq of words
from nltk.corpus import stopwords
from nltk.sentiment.vader import SentimentIntensityAnalyzer
# import the csv file into a Pandas dataframe
lyrics_df = pd.read_csv("lyricsrealcoleworld.csv")
artists = ["J. Cole"]
# you can also add multiple artists by separating with a comma:
# artists = ["J. Cole", "Drake", "Kendrick Lamar", "Nicki Minaj"]
# but make sure that these artists are also present in the csv file
# view the shape of the data (the number of rows and columns)
print(f"The shape of the data is: {lyrics_df.shape}")
df1 = lyrics_df['lyrics']
print(df1.head(10))
# Create a single string containing all the lyrics,
# This will be needed to be able to create a wordcloud
lyrics_string = " ".join(lyrics for lyrics in lyrics_df["lyrics"])
print(lyrics_string)
# Create the wordcloud
lyrics_wordcloud = WordCloud(background_color="black", max_words=100).generate(lyrics_string)
# View the wordcloud
plt.imshow(lyrics_wordcloud, interpolation='bilinear')
plt.axis('off')
plt.show()
print("\n")
print("\n")
print("\n")
# Data Cleaning
# Convert the text to lower case
lyrics_string_lower = lyrics_string.lower()
# View the first 200 elements of the string to check this worked as expected
print(lyrics_string_lower[0:200])
print("\n")
print("\n")
print("\n")
# Remove extra white spaces so there is only one space between words
lyrics_string_space = re.sub(r'\s+',' ', lyrics_string_lower)
# View the first 200 elements of the string to check this worked as expected
print(lyrics_string_space[0:200])
# Create the wordcloud
lyrics_cleaned_wordcloud = WordCloud(background_color="black", max_words=100).generate(lyrics_string_space)
# View the wordcloud
plt.imshow(lyrics_cleaned_wordcloud, interpolation='bilinear')
plt.axis('off')
plt.show()
# Create a spacy document by pointing spacy to the lyrics string
lyrics_doc = nlp(lyrics_string_space)
# Get all tokens that aren't punctuation
lyrics_words = [token.text for token in lyrics_doc if token.is_punct != True]
# Get the frequency of each word (token) in the lyrics string
lyrics_word_freq = Counter(lyrics_words)
# Get the 10 most frequent words
ten_most_common_words = lyrics_word_freq.most_common(10)
# View the 10 most common words
print(ten_most_common_words)
# Create a Pandas dataframe containing the tokens (words) and their frequencies
freq_df = pd.DataFrame.from_dict(lyrics_word_freq, orient='index').reset_index()
# Rename the columns to "word" and "freq"
freq_df.columns=["word", "freq"]
# Sort the dataframe so that the most frequent word is at the top and view the first 3 rows
print(freq_df.sort_values(by="freq", ascending=False).head(3))
# Display a bar chart showing the top 25 words and their frequencies
fig, ax = plt.subplots(figsize=(8,6))
sns.barplot(data=freq_df.sort_values(by="freq", ascending=False).head(25), y="word", x="freq", color='#7bbcd5')
plt.ylabel("Word")
plt.xlabel("Frequency")
plt.title("Top 25 Most Frequent Words")
sns.despine()
plt.show()
# Get all tokens that aren't punctuation and aren't stopwords
# Stopwords are the words that are not really important for analysis
# Examples of stopwords are: "a," "and," "but," "how," "or,"
lyrics_words = [token.text for token in lyrics_doc if token.is_punct != True and token.is_stop != True]
# Get the frequency of each word (token) in the lyrics string
lyrics_word_freq = Counter(lyrics_words)
# Re-create the Pandas dataframe containing the tokens (words) and their frequencies
freq_df = pd.DataFrame.from_dict(lyrics_word_freq, orient='index').reset_index()
# Rename the columns to "word" and "freq"
freq_df.columns=["word", "freq"]
# Display a bar chart showing the top 25 words and their frequencies (which will exclude the stopwords this time)
fig, ax = plt.subplots(figsize=(12,6))
sns.barplot(data=freq_df.sort_values(by="freq", ascending=False).head(25), y="word", x="freq", color='#7bbcd5')
plt.ylabel("Word")
plt.xlabel("Frequency")
plt.title("Top 25 Most Frequent Words (Excluding Stopwords)")
sns.despine()
plt.show()
# Number of words used by J Cole in his top 10 songs.
plt.rcParams['figure.figsize'] = (3, 2)
wordsdf = pd.DataFrame(columns=('artist', 'words'))
i=0
for artist in artists:
num_words = 0
all_text = ''
for sentence in lyrics_string_space:
num_words_this = len(sentence.split(" "))
num_words += num_words_this
wordsdf.loc[i] = (artist, num_words)
i+=1
print(wordsdf)
# If you want you can also show this using a plot, just uncomment the next two code lines (141 and 142)
# I would recommend showing it in plot form when there are multiple artists
# wordsdf.plot.bar(x='artist', y='words', title='Number of Words in J. Cole songs');
# plt.show()
lexicaldf = pd.DataFrame(columns=('artist', 'lexicalrichness'))
for artist in artists:
all_words = ''
num_words = 0
raw_text = ""
for sentence in lyrics_string_space:
raw_text += sentence
words = raw_text.split(" ")
filtered_words = [word for word in words if word not in stopwords.words('english') and len(word) > 1 and word not in ['na', 'la']] # remove the stopwords
a = len(set(filtered_words))
b = len(words)
lexicaldf.loc[i] = (artist, (a / float(b)) * 100)
i += 1
print(lexicaldf)
# If you want you can also show this using a plot, just uncomment the next two code lines (163 and 164)
# I would recommend showing it in plot form when there are multiple artists
# lexicaldf.plot.bar(x='artist', y='lexicalrichness', title='Lexical richness of each Artist');
# plt.show()
# Let's perform Sentiment Analysis
sentimentanalysisdf = pd.DataFrame(columns=('artist', 'positive', 'neutral', 'negative'))
sid = SentimentIntensityAnalyzer()
i = 0
for artist in artists:
num_positive = 0
num_negative = 0
num_neutral = 0
for sentence in lyrics_string_space:
comp = sid.polarity_scores(sentence)
comp = comp['compound']
if comp >= 0.5:
num_positive += 1
elif comp > -0.5 and comp < 0.5:
num_neutral += 1
else:
num_negative += 1
num_total = num_negative + num_neutral + num_positive
percent_negative = (num_negative / float(num_total)) * 100
percent_neutral = (num_neutral / float(num_total)) * 100
percent_positive = (num_positive / float(num_total)) * 100
sentimentanalysisdf.loc[i] = (artist, percent_positive, percent_neutral, percent_negative)
i += 1
print(sentimentanalysisdf)
# If you want you can also show this using a plot, just uncomment the next two code lines (194 and 195)
# I would recommend showing it in plot form when there are multiple artists
# sentimentanalysisdf.plot.bar(x='artist', stacked=True)
# plt.show()