# LDA
rm(list = ls())
wd <- 'C:/Users/KLN/some_r'
setwd(wd)
source('util_fun.R')
library(slam)
input.dir <- 'C:/Users/KLN/some_r/data/nt_hist'
files.v <- dir(path = input.dir, pattern='\\.txt$') # match only files ending in .txt
# tokenize text in directory
maketext <- function(files,directory){
text.word.l <- list()
for(i in seq_along(files)){
text.v <- scan(paste(directory, files[i], sep="/"), what="character", sep="\n") # read a file
text.v <- paste(text.v, collapse=" ") # collapse lines
text.lower.v <- tolower(text.v) # casefolding
text.words.v <- strsplit(text.lower.v, "\\W") # tokenize
text.words.v <- unlist(text.words.v) # transform list to vector
text.words.v <- text.words.v[which(text.words.v!="")] # remove blanks
text.word.l[[files[i]]] <- text.words.v # update list
}
names(text.word.l) <- gsub("\\..*","",files)
return(text.word.l)
}
# slice text in n bins
slice_text <- function(text,bin){
split(text, cut(seq_along(text), bin)) # equal-width index bins
}
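# quick illustration on toy data (a sketch): cut() makes equal-width index
# bins, so bin = 2 splits a text into a first and a second half; finer bins
# give the topic model more, but shorter, documents to learn from
slice_text(letters[1:6], 2)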
text.word.l <- maketext(files.v,input.dir) # names are already set inside maketext
text.l <- unlist(lapply(text.word.l,slice_text,2), recursive=FALSE)# why could slice size matter?
filenames.v <- gsub("\\..*","",names(text.l))
# create corpus from slices
library(tm)
text.cor <- Corpus(VectorSource(lapply(text.l, paste, collapse = " ")))
# clean and filter
text.cor <- tm_map(text.cor, removeNumbers)
text.cor <- tm_map(text.cor, removeWords, stopwords("english"))
text.cor <- tm_map(text.cor, stripWhitespace)
## create document term matrix
text.dtm <- DocumentTermMatrix(text.cor)
print(text.dtm)
text.dtm <- docsparse(2,text.dtm) # docsparse() is defined in util_fun.R
print(text.dtm)
summary(col_sums(text.dtm))
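# optional check with tm's findFreqTerms(): list terms at or above a raw
# frequency cutoff (50 is an arbitrary choice, adjust to your corpus)
head(findFreqTerms(text.dtm, lowfreq = 50))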
# prune dtm
prune <- function(dtm,mx){
mx <- ceiling(dim(dtm)[1]*mx) # maximum allowed document frequency
dtm <- dtm[, slam::col_sums(dtm > 0) < mx] # stay sparse; no as.matrix() needed
return(dtm)
}
text.dtm <- prune(text.dtm,.75)# try other levels of pruning
summary(col_sums(text.dtm))
# extract Thomas
thom.idx <- filenames.v == 'Thomas'
thom.dtm <- text.dtm[thom.idx,]
text.dtm <- text.dtm[!thom.idx,]
text.dtm$dimnames$Docs <- filenames.v[!thom.idx]
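# sanity check on the held-out split: Thomas slices vs. remaining training slices
dim(thom.dtm); dim(text.dtm)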
# train topic model based latent dirichlet allocation
library(topicmodels) # Based on Blei's code
ls('package:topicmodels')# show functions in library
k = 20 # number of topics
seed <- 1234
mdl1 <- LDA(text.dtm, k = k, method = 'VEM', control = list(seed = seed))
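# for comparison (a sketch, not part of the pipeline above): topicmodels also
# implements collapsed Gibbs sampling, which can yield noticeably different
# topics than VEM on a corpus this small
mdl1g <- LDA(text.dtm, k = k, method = 'Gibbs', control = list(seed = seed))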
## unpacking the model
# quick & dirty
terms(mdl1,5)
topics(mdl1,2)
# alpha: Dirichlet prior on the documents' topic proportions
alpha <- mdl1@alpha
print(alpha)
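# VEM starts from alpha = 50/k and then estimates it by default; a small
# fitted alpha means documents concentrate their weight on few topics
50/k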
# lexicon
lexicon <- mdl1@terms
head(lexicon)
# topics' word distributions (stored on the log scale, so the raw values are only semi-meaningful)
topicword.mat <- mdl1@beta
dim(topicword.mat)
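# beta is logged, so each exponentiated row should sum to (roughly) 1
rowSums(exp(topicword.mat))[1]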
terms(mdl1,10)# print the 10 most likely terms within each topic
# documents' topic distribution
doctopic.mat <- mdl1@gamma
dim(doctopic.mat)
doctopic.mat[1,]# topic saturation of document 1
row_sums(doctopic.mat)[1] # each document's topic weights sum to 1
# plot document distribution
barplot(doctopic.mat[1,])
library(ggplot2)
dev.new()
i = 6 # document number to plot
dt.df <- data.frame(x = factor(1:k), y = doctopic.mat[i,]) # factor x for a discrete axis
ggplot(data = dt.df, aes(x = x, y = y)) +
geom_bar(stat = "identity", colour = "#FF9999") +
theme_minimal() +
xlab('Topic') +
ylab('Document weight') +
labs(title = text.dtm$dimnames$Docs[i])
# calculate posteriors for words within each topic
mdl1post.l <- posterior(mdl1, text.dtm)
str(mdl1post.l)
# word posteriors for topic 4: p(w|k) = ϕ_kw
tmp <- sort(mdl1post.l$terms[4,], decreasing = TRUE)
# and now the obligatory topic word cloud
library(wordcloud); library(RColorBrewer)
pal2 <- brewer.pal(8,"Dark2")
dev.new()
wordcloud(names(tmp[1:30]), tmp[1:30], scale=c(8,.2), random.order=FALSE, rot.per=.15, colors=pal2)
# model evaluation: perplexity (lower is better)
perplexity(mdl1)
perplexity(mdl1,thom.dtm)
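# perplexity is a monotone transform of the negative log-likelihood (roughly
# exp(-logLik / token count)), so the held-out value above is the more honest
# measure of fit; the raw likelihood is available too
logLik(mdl1)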
# infer topics of unseen documents
mdl1new.l <- posterior(mdl1, thom.dtm)
str(mdl1new.l)
barplot(mdl1new.l$topics[1,])# topic distribution for document 1 of unseen data
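# the few most saturated topics for the held-out slice (a quick check)
sort(mdl1new.l$topics[1,], decreasing = TRUE)[1:3]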
### scaling
# necessary
library(tm)
library(topicmodels)
# make life easier
library(plyr)
library(slam)
# build corpus
dd <- "C:/Users/KLN/some_r/data/kjv_books";
books.cor <- Corpus(DirSource(dd, encoding = "UTF-8"), readerControl = list(language = "en"))
names(books.cor) <- gsub("\\..*","",names(books.cor))# remove ending
filenames <- names(books.cor)
books.cor <- tm_map(books.cor, content_transformer(tolower))
books.cor <- tm_map(books.cor, removePunctuation)
books.cor <- tm_map(books.cor, removeNumbers)
books.cor <- tm_map(books.cor, removeWords, stopwords("english"))
books.cor <- tm_map(books.cor, stemDocument)
books.cor <- tm_map(books.cor, stripWhitespace)
# document term matrix
books.dtm <- DocumentTermMatrix(books.cor, control = list(wordLengths = c(2, Inf)))
dim(books.dtm)
books.dtm <- docsparse(2,books.dtm)
dim(books.dtm)
summary(col_sums(books.dtm))
books.dtm <- prune(books.dtm,.75)
summary(col_sums(books.dtm))
### estimate number of topics (optimal k estimation)
k = 100
#progress.bar <- create_progress_bar("text")
#progress.bar$init(k)
best.mdl <- list()
for(i in 2:k){ # fits k - 1 models, so expect a long run time
best.mdl[[i-1]] <- LDA(books.dtm, i)
print(paste('k =',i, sep = ' '))
#progress.bar$step()
}
#save(best.mdl,file = 'estimate_k.RData')
load('estimate_k.RData') # reload the models saved above to skip refitting
perplex.mat <- as.matrix(unlist(lapply(best.mdl, perplexity)))
plot(perplex.mat, main= 'Parameter estimation', xlab = 'k', ylab = 'Perplexity')
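# a quick heuristic for choosing k: best.mdl[[i]] holds the k = i + 1 model;
# note that in-sample perplexity tends to keep falling as k grows, so look
# for an elbow in the plot rather than trusting the raw minimum
best.k <- which.min(perplex.mat) + 1
print(best.k)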
# unpack model
# ten most likely terms in each topic
terms(best.mdl[[18]],10) # best.mdl[[i]] holds k = i + 1 topics, so this is the k = 19 model
# two dominant topics for all documents
top2 <- topics(best.mdl[[18]],2)
colnames(top2) <- filenames
print(top2)
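# illustrative lookup: map a dominant topic number back to its most likely
# terms, here for the top topic of the first document
terms(best.mdl[[18]], 5)[, top2[1, 1]]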