# LDA
rm(list = ls())
wd <- 'C:/Users/KLN/some_r'
setwd(wd)
source('util_fun.R')
library(slam)
input.dir <- 'C:/Users/KLN/some_r/data/nt_hist'
files.v <- dir(path = input.dir, pattern='\\.txt$') # match only files ending in .txt
# tokenize text in directory
maketext <- function(files,directory){
text.word.l <- list()
for(i in seq_along(files)){
text.v <- scan(paste(directory, files[i], sep="/"), what="character", sep="\n") # read a file
text.v <- paste(text.v, collapse=" ") # collapse lines
text.lower.v <- tolower(text.v) # casefolding
text.words.v <- strsplit(text.lower.v, "\\W") # tokenize
text.words.v <- unlist(text.words.v) # transform list to vector
text.words.v <- text.words.v[which(text.words.v!="")] # remove blanks
text.word.l[[files[i]]] <- text.words.v # update list
}
names(text.word.l) <- gsub("\\..*","",files)
return(text.word.l)
}
# slice text in n bins
slice_text <- function(text,bin){
split(text, cut(seq_along(text), bin)) # equal-width index bins
}
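# quick illustration on toy data (a sketch): cut() makes equal-width index
# bins, so bin = 2 splits a text into a first and a second half; finer bins
# give the topic model more, but shorter, documents to learn from
slice_text(letters[1:6], 2)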
text.word.l <- maketext(files.v,input.dir) # names are already set inside maketext
text.l <- unlist(lapply(text.word.l,slice_text,2), recursive=FALSE)# why could slice size matter?
filenames.v <- gsub("\\..*","",names(text.l))
# create corpus from slices
library(tm)
text.cor <- Corpus(VectorSource(lapply(text.l, paste, collapse = " ")))
# clean and filter
text.cor <- tm_map(text.cor, removeNumbers)
text.cor <- tm_map(text.cor, removeWords, stopwords("english"))
text.cor <- tm_map(text.cor, stripWhitespace)
## create document term matrix
text.dtm <- DocumentTermMatrix(text.cor)
print(text.dtm)
text.dtm <- docsparse(2,text.dtm) # docsparse() is defined in util_fun.R
print(text.dtm)
summary(col_sums(text.dtm))
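# optional check with tm's findFreqTerms(): list terms at or above a raw
# frequency cutoff (50 is an arbitrary choice, adjust to your corpus)
head(findFreqTerms(text.dtm, lowfreq = 50))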
# prune dtm
prune <- function(dtm,mx){
mx <- ceiling(dim(dtm)[1]*mx) # maximum allowed document frequency
dtm <- dtm[, slam::col_sums(dtm > 0) < mx] # stay sparse; no as.matrix() needed
return(dtm)
}
text.dtm <- prune(text.dtm,.75)# try other levels of pruning
summary(col_sums(text.dtm))
# extract Thomas
thom.idx <- filenames.v == 'Thomas'
thom.dtm <- text.dtm[thom.idx,]
text.dtm <- text.dtm[!thom.idx,]
text.dtm$dimnames$Docs <- filenames.v[!thom.idx]
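# sanity check on the held-out split: Thomas slices vs. remaining training slices
dim(thom.dtm); dim(text.dtm)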
# train topic model based latent dirichlet allocation
library(topicmodels) # Based on Blei's code
ls('package:topicmodels')# show functions in library
k = 20 # number of topics
seed <- 1234
mdl1 <- LDA(text.dtm, k = k, method = 'VEM', control = list(seed = seed))
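# for comparison (a sketch, not part of the pipeline above): topicmodels also
# implements collapsed Gibbs sampling, which can yield noticeably different
# topics than VEM on a corpus this small
mdl1g <- LDA(text.dtm, k = k, method = 'Gibbs', control = list(seed = seed))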
## unpacking the model
# quick & dirty
terms(mdl1,5)
topics(mdl1,2)
# alpha: Dirichlet prior on the documents' topic proportions
alpha <- mdl1@alpha
print(alpha)
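# VEM starts from alpha = 50/k and then estimates it by default; a small
# fitted alpha means documents concentrate their weight on few topics
50/k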
# lexicon
lexicon <- mdl1@terms
head(lexicon)
# topics' word distributions (stored on the log scale, so the raw values are only semi-meaningful)
topicword.mat <- mdl1@beta
dim(topicword.mat)
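# beta is logged, so each exponentiated row should sum to (roughly) 1
rowSums(exp(topicword.mat))[1]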
terms(mdl1,10)# print the 10 most likely terms within each topic
# documents' topic distribution
doctopic.mat <- mdl1@gamma
dim(doctopic.mat)
doctopic.mat[1,]# topic saturation of document 1
row_sums(doctopic.mat)[1] # each document's topic weights sum to 1
# plot document distribution
barplot(doctopic.mat[1,])
library(ggplot2)
dev.new()
i = 6 # document number to plot
dt.df <- data.frame(x = factor(1:k), y = doctopic.mat[i,]) # factor x for a discrete axis
ggplot(data = dt.df, aes(x = x, y = y)) +
geom_bar(stat = "identity", colour = "#FF9999") +
theme_minimal() +
xlab('Topic') +
ylab('Document weight') +
labs(title = text.dtm$dimnames$Docs[i])
# calculate posteriors for words within each topic
mdl1post.l <- posterior(mdl1, text.dtm)
str(mdl1post.l)
# word posteriors for topic 4: p(w|k) = ϕ_kw
tmp <- sort(mdl1post.l$terms[4,], decreasing = TRUE)
# and now the obligatory topic word cloud
library(wordcloud); library(RColorBrewer)
pal2 <- brewer.pal(8,"Dark2")
dev.new()
wordcloud(names(tmp[1:30]), tmp[1:30], scale=c(8,.2), random.order=FALSE, rot.per=.15, colors=pal2)
# model evaluation: perplexity (lower is better)
perplexity(mdl1)
perplexity(mdl1,thom.dtm)
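# perplexity is a monotone transform of the negative log-likelihood (roughly
# exp(-logLik / token count)), so the held-out value above is the more honest
# measure of fit; the raw likelihood is available too
logLik(mdl1)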
# infer topics of unseen documents
mdl1new.l <- posterior(mdl1, thom.dtm)
str(mdl1new.l)
barplot(mdl1new.l$topics[1,])# topic distribution for document 1 of unseen data
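# the few most saturated topics for the held-out slice (a quick check)
sort(mdl1new.l$topics[1,], decreasing = TRUE)[1:3]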
### scaling
# necessary
library(tm)
library(topicmodels)
# make life easier
library(plyr)
library(slam)
# build corpus
dd <- "C:/Users/KLN/some_r/data/kjv_books";
books.cor <- Corpus(DirSource(dd, encoding = "UTF-8"), readerControl = list(language = "en"))
names(books.cor) <- gsub("\\..*","",names(books.cor))# remove ending
filenames <- names(books.cor)
books.cor <- tm_map(books.cor, content_transformer(tolower))
books.cor <- tm_map(books.cor, removePunctuation)
books.cor <- tm_map(books.cor, removeNumbers)
books.cor <- tm_map(books.cor, removeWords, stopwords("english"))
books.cor <- tm_map(books.cor, stemDocument)
books.cor <- tm_map(books.cor, stripWhitespace)
# document term matrix
books.dtm <- DocumentTermMatrix(books.cor, control = list(wordLengths = c(2, Inf)))
dim(books.dtm)
books.dtm <- docsparse(2,books.dtm)
dim(books.dtm)
summary(col_sums(books.dtm))
books.dtm <- prune(books.dtm,.75)
summary(col_sums(books.dtm))
### estimate number of topics (optimal k estimation)
k = 100
#progress.bar <- create_progress_bar("text")
#progress.bar$init(k)
best.mdl <- list()
for(i in 2:k){ # fits k - 1 models, so expect a long run time
best.mdl[[i-1]] <- LDA(books.dtm, i)
print(paste('k =',i, sep = ' '))
#progress.bar$step()
}
#save(best.mdl,file = 'estimate_k.RData')
load('estimate_k.RData') # reload the models saved above to skip refitting
perplex.mat <- as.matrix(unlist(lapply(best.mdl, perplexity)))
plot(perplex.mat, main= 'Parameter estimation', xlab = 'k', ylab = 'Perplexity')
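# a quick heuristic for choosing k: best.mdl[[i]] holds the k = i + 1 model;
# note that in-sample perplexity tends to keep falling as k grows, so look
# for an elbow in the plot rather than trusting the raw minimum
best.k <- which.min(perplex.mat) + 1
print(best.k)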
# unpack model
# ten most likely terms in each topic
terms(best.mdl[[18]],10) # best.mdl[[i]] holds k = i + 1 topics, so this is the k = 19 model
# two dominant topics for all documents
top2 <- topics(best.mdl[[18]],2)
colnames(top2) <- filenames
print(top2)
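# illustrative lookup: map a dominant topic number back to its most likely
# terms, here for the top topic of the first document
terms(best.mdl[[18]], 5)[, top2[1, 1]]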