Skip to content

Commit

Permalink
Merge pull request #14 from imbi-heidelberg/basic_development
Browse files (browse the repository at this point in the history)
lemmatize corpus objects
  • Loading branch information
max-pilz authored Jul 3, 2024
2 parents bd4f110 + 5ccc2ba commit e442990
Show file tree
Hide file tree
Showing 3 changed files with 5 additions and 5 deletions.
6 changes: 3 additions & 3 deletions R/MetaNLP.R
Original file line number Diff line number Diff line change
Expand Up @@ -113,15 +113,15 @@ MetaNLP <- function(file,
(`[[`)(c("x")) |>
# lower case
tolower() |>
# lemmatization of the words
textstem::lemmatize_strings(dictionary = lexicon) |>
tm::VectorSource() |>
# create corpus object
tm::Corpus() |>
# remove special characters
tm::tm_map(tm::content_transformer(replaceSpecialChars), language = language) |>
# strip white space
tm::tm_map(tm::stripWhitespace) |>
# lemmatization of the words
tm::tm_map(textstem::lemmatize_strings, dictionary = lexicon) |>
# only use word stems
tm::tm_map(tm::stemDocument, language = language) |>
# create matrix
Expand All @@ -133,7 +133,7 @@ MetaNLP <- function(file,

# only choose word stems that appear at least a pre-specified number of times
temp <- temp[, colSums(temp) >= bounds[1] & colSums(temp) <= bounds[2]]

#
# order by column name
index_vec <- order(names(temp))
temp |>
Expand Down
1 change: 1 addition & 0 deletions tests/testthat/test_constructor.R
Original file line number Diff line number Diff line change
Expand Up @@ -110,6 +110,7 @@ test_that("constructor works", {
MetaNLP(source_path_ru, bounds = c(1, Inf), language = "russian",
encoding = "UTF-8")
)

})

test_that("print methods work", {
Expand Down
3 changes: 1 addition & 2 deletions tests/testthat/test_deletion.R
Original file line number Diff line number Diff line change
Expand Up @@ -45,8 +45,7 @@ test_that("Special characters can be replaces", {

# load french data set
source_path_fr <- test_path("data", "french_data.csv")
obj_fr <- MetaNLP(source_path_fr, bounds = c(1, Inf), language = "french",
stringsAsFactors=FALSE, fileEncoding = "latin1")
obj_fr <- MetaNLP(source_path_fr, bounds = c(1, Inf), language = "french")

# add a column name that contains all possible special characters
obj_fr@data_frame <- data.frame(obj_fr@data_frame,
Expand Down

0 comments on commit e442990

Please sign in to comment.