-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathclassification.R
166 lines (140 loc) · 5.55 KB
/
classification.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
# classification
# clear the workspace so the script starts from a clean state
rm(list = ls())
wd <- 'C:/Users/KLN/some_r'
setwd(wd)
# project helpers (provides docsparse() used further down)
source('util_fun.R')
input.dir <- 'C:/Users/KLN/some_r/data/nt_hist'
# FIX: the old pattern '.*txt' was unanchored and its dot unescaped, so it
# matched any file name merely CONTAINING "txt"; anchor the extension so
# only *.txt files are picked up
files.v <- dir(path = input.dir, pattern='\\.txt$')
# tokenize text in directory
# Read every file listed in `files` from `directory`, lowercase the text,
# split on non-word characters, and return a named list of token vectors.
# List names are the file names with their extensions stripped.
maketext <- function(files, directory) {
  text.word.l <- vector("list", length(files))  # preallocate (no growth in loop)
  for (i in seq_along(files)) {                 # seq_along is safe for empty input
    # read the file line by line, then collapse the lines into one string
    text.v <- scan(file.path(directory, files[i]), what = "character",
                   sep = "\n", quiet = TRUE)
    text.v <- paste(text.v, collapse = " ")
    text.lower.v <- tolower(text.v)                         # casefolding
    text.words.v <- unlist(strsplit(text.lower.v, "\\W"))   # tokenize on non-word chars
    text.word.l[[i]] <- text.words.v[text.words.v != ""]    # drop empty tokens
  }
  # name each element after its source file, extension removed
  names(text.word.l) <- gsub("\\..*", "", files)
  return(text.word.l)
}
# tokenize every file in the input directory; result is a named list of
# word vectors, one element per file (names = file names minus extension)
text.word.l <- maketext(files.v,input.dir)
# interactive inspection of the list names
names(text.word.l)
# file titles with extensions stripped (same transform maketext applies
# to the list names)
texttitle.l <- gsub("\\..*","",files.v)
# slice tokenized text into n chunks
# Split a token vector into n roughly equal-sized chunks.
# Returns a named list ("1".."n") of consecutive slices of `tokens`.
nchunk <- function(tokens, n) {
  maxlen <- length(tokens) / n      # target chunk length (may be fractional)
  idx <- seq_along(tokens)
  # positions 1..maxlen map to chunk 1, the next maxlen to chunk 2, etc.;
  # returned directly instead of via a dead local assignment
  split(tokens, ceiling(idx / maxlen))
}
# cut every tokenized text into 100 chunks
nslice.l <- lapply(text.word.l, function(tokens) nchunk(tokens, 100))
# flatten one list level; element names become "<title>.<chunk>"
text.l <- unlist(nslice.l, recursive=FALSE)
# class label for each chunk = its name with everything from the first dot on removed
class.v <- gsub('\\..*','',names(text.l))
# create corpus from chunks
library(tm)
# each chunk is a character vector of tokens; collapse each back into a
# single string so tm treats every chunk as one document
text.l <- lapply(text.l, paste, collapse = " ")
text.vs <- VectorSource(text.l)
text.cor <- Corpus(text.vs)
# clean and filter
text.cor <- tm_map(text.cor, removeNumbers)
text.cor <- tm_map(text.cor, removeWords, stopwords("english"))
text.cor <- tm_map(text.cor, stemDocument)
text.cor <- tm_map(text.cor, stripWhitespace)
## create document term matrix
text.dtm <- DocumentTermMatrix(text.cor)
# docsparse() comes from util_fun.R (sourced at the top) -- presumably a
# sparse-term filter; TODO confirm the meaning of the first argument (25)
text.dtm <- docsparse(25,text.dtm)
print(text.dtm)
# transform to matrix object
text.mat <- as.matrix(text.dtm)
# restore chunk names ("<title>.<chunk>") lost in the corpus conversion
rownames(text.mat) <- names(text.l)
# interactive peek at the top-left corner of the matrix
text.mat[1:10,1:10]
# full model predict class of Thomas and build data frame
# NOTE(review): this idx is overwritten two lines below and never used --
# dead code, probably a leftover from an earlier version of the script
idx <- which(class.v != 'Thomas')
# classical validation procedure
# random ~80/20 train/test split across all chunks (1 = training row)
train <- ifelse(runif(length(class.v)) < 0.80,1,0)
idx <- which(train == 1)
# build feature set for training
# first column = class label, remaining columns = term counts per chunk
feat1.df <- data.frame(book = class.v[idx],text.mat[idx,])
head(feat1.df[1:10,1:10])
# naive bayes classifier (categorical data, but assumes independence)
library(e1071)
model.nb <- naiveBayes(book ~ ., data = feat1.df)
# predicted labels on the TRAINING data itself (training accuracy,
# not a generalization estimate -- held-out evaluation comes later)
pred.v <- predict(model.nb, feat1.df)
# conditional posterior probabilities
predraw.v <- predict(model.nb, feat1.df,type = 'raw')
head(predraw.v)
# confusion matrix: rows = predicted class, columns = true class
confusion.mat <- as.matrix(table(pred = pred.v,true = feat1.df$book))
# performance metric
# accuracy = correct predictions (diagonal) / all predictions
accuracy <- sum(diag(confusion.mat))/sum(confusion.mat)
print(accuracy)
# plot confusion matrix as a heat map
library(ggplot2)
dev.new()
# FIX: the object was previously assigned to a variable named 'plot',
# which masked base::plot() and broke the later call
# plot(table(predthom.v)) with "attempt to apply non-function"
confusion.gg <- ggplot(as.data.frame(confusion.mat))
confusion.gg + geom_tile(aes(x=pred, y=true, fill=Freq)) +
  scale_x_discrete(name="Predicted Class") +
  scale_y_discrete(name="True Class") +
  scale_fill_gradient(breaks=seq(from=-.5, to=4, by=.2)) +
  labs(fill="Frequency")
# chance level
table(feat1.df$book)
#chance <- 100/sum(table(feat1.df$book))
# testing procedure
# evaluate the model on the held-out ~20% of chunks
idx <- which(train != 1)
feat2.df <- data.frame(book = class.v[idx],text.mat[idx,])
test.v <- predict(model.nb, feat2.df)
confusion.mat <- as.matrix(table(test.v,feat2.df$book))
# NOTE(review): test accuracy silently overwrites the training accuracy
# computed above and is never printed
accuracy <- sum(diag(confusion.mat))/sum(confusion.mat)
# predict class of Thomas (early or late)
idx <- which(class.v == 'Thomas')
feat2.df <- data.frame(book = class.v[idx],text.mat[idx,])
predthom.v <- predict(model.nb, feat2.df)
dev.new()
# NOTE(review): this intends base plot(); it fails if a variable named
# 'plot' (e.g. a ggplot object) masks the function earlier in the script
plot(table(predthom.v))
# NOTE(review): predAct is built but never used afterwards
predAct <- data.frame(pred.v,feat1.df$book)
# random performance (compare to full model)
# shuffle the class labels to establish a chance-level baseline
classrand.v <- sample(class.v)
# NOTE(review): idx still holds the Thomas-only indices from the block
# above, so this baseline is fit on Thomas chunks only -- confirm this is
# intended rather than using all rows (or the training split)
featrand.df <- data.frame(book = classrand.v[idx],text.mat[idx,])
modelrand.nb <- naiveBayes(book ~ ., data = featrand.df)
predrand.v <- predict(modelrand.nb, featrand.df)
# conditional posterior probabilities
confusion.mat <- as.matrix(table(predrand.v,featrand.df$book))
# performance metric
accuracy <- sum(diag(confusion.mat))/sum(confusion.mat)
print(accuracy)
##############################
#scaling with RTextTools and tm
library(RWeka)
# rebuild the DTM with n-gram features via RWeka's NGramTokenizer
text.dtm <- DocumentTermMatrix(text.cor, control=list(tokenize = NGramTokenizer))
# docsparse() from util_fun.R; different threshold (5) than earlier (25)
text.dtm <- docsparse(5,text.dtm)
library(RTextTools)
## separate training and testing set and create a container
# random sample for testing data from data set
trainidx.v <- 1:nrow(text.dtm)
# hold out ~10% of the documents for testing; the rest is for training
testidx.v <- sort(sample(trainidx.v, nrow(text.dtm)*.1, replace = FALSE, prob = NULL))
trainidx.v <- sort(trainidx.v[! trainidx.v%in%testidx.v])
# change object type, create_analytics() only handles numeric
classnum.v <- as.numeric(as.factor(class.v))
# to transform back to original
# NOTE(review): this result is printed to the console but not stored
factor(classnum.v, labels = unique(class.v))
# create container
container <- create_container(text.dtm, classnum.v, trainSize=trainidx.v,
testSize=testidx.v, virgin=FALSE)
# training models
# NOTE(review): mdl1.l (SVM only) is trained but never used below
mdl1.l <- train_models(container, algorithms='SVM')
mdl2.l <- train_models(container, algorithms = c('SVM','NNET','TREE') )
# Classifying data
res.df <- classify_models(container, mdl2.l)
head(res.df)
# confusion matrix for the SVM model on the held-out documents
confusion.mat <- as.matrix(table(res.df$SVM_LABEL, container@testing_codes))
# relabel the numeric class codes with the original class names
rownames(confusion.mat) <- colnames(confusion.mat) <- unique(class.v)
print(confusion.mat)
accuracy <- sum(diag(confusion.mat))/sum(confusion.mat)
# performance metrics
analytics <- create_analytics(container, res.df)
class(analytics)
summary(analytics)