Skip to content

Commit

Permalink
Merge pull request #1 from imbi-heidelberg/basic_development
Browse files Browse the repository at this point in the history
Implementation of the class MetaNLP
  • Loading branch information
max-pilz authored Apr 11, 2024
2 parents b381897 + 79b76ce commit 3ba72e2
Show file tree
Hide file tree
Showing 6 changed files with 140 additions and 17 deletions.
2 changes: 2 additions & 0 deletions DESCRIPTION
Original file line number Diff line number Diff line change
Expand Up @@ -39,11 +39,13 @@ Suggests:
Imports:
glmnet,
tm,
textstem,
methods,
utils
VignetteBuilder: knitr
Collate:
MetaNLP.R
util.R
Encoding: UTF-8
LazyData: true
RoxygenNote: 7.3.1
Expand Down
81 changes: 71 additions & 10 deletions R/MetaNLP.R
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
#' Natrual Language Processing for Meta Analysis
#' Natural Language Processing for Meta Analysis
#'
#' The \pkg{MetaNLP} package provides methods to quickly transform a
#' CSV-file with titles and abstracts to an R data frame that can be
Expand All @@ -12,27 +12,88 @@

#' Create a data frame with word counts
#'
#' A \code{MetaNLP} object is the base class of the \pkg{MetaNLP}.
#' A \code{MetaNLP} object is the base class of the package \pkg{MetaNLP}.
#' It is initialized by passing the path to a CSV file and constructs
#' a data frame whose column names are the words that occur in the titles
#' and abstracts and which cells contain the word counts for each
#' paper.
#'
#' @rdname MetaNLP
setClass("MetaNLP", representation(data_frame = "data.frame",
label = "character"))
setClass(
  "MetaNLP",
  # single slot that stores the word count data frame
  slots = c(data_frame = "data.frame")
)

#' @param path path to the CSV file
#' @return an object of class \code{MetaNLP}
#' @param path Path to the CSV file
#' @param bounds A numeric vector of length 2. The first value specifies
#' the minimum number of appearances of a word to become a column of the word
#' count matrix, the second value specifies the maximum number.
#' Defaults to \code{c(2, Inf)}.
#' @param word_length A numeric vector of length 2. The first value specifies
#' the minimum number of characters of a word to become a column of the word
#' count matrix, the second value specifies the maximum number.
#' Defaults to \code{c(3, Inf)}.
#' @return An object of class \code{MetaNLP}
#'
#' @details
#' An object of class \code{MetaNLP} contains a slot data_frame where
#' the word count data frame is stored.
#' The CSV file must have a column \code{ID} to identify each paper, a column
#' \code{Title} with the corresponding titles of the papers and a column
#' \code{Abstract} which contains the abstracts. Furthermore, to store the
#' decision for each paper, a column \code{Decision} should exist, where the
#' values are either "yes" and "no" or "include" and "exclude". The value "maybe"
#' is handled as a "yes"/"include".
#'
#' @rdname MetaNLP
#' @export
MetaNLP <- function(path) {
file = utils::read.csv(path, header = TRUE, sep = ";")
# TODO: modify here
return(new("MetaNLP", data_frame = file))
MetaNLP <- function(path, bounds = c(2, Inf), word_length = c(3, Inf)) {

  # both range arguments are read as (lower, upper) pairs further down
  stopifnot(
    is.numeric(bounds), length(bounds) == 2,
    is.numeric(word_length), length(word_length) == 2
  )

  # load file (semicolon-separated, first line holds the column names)
  file <- utils::read.csv(path, header = TRUE, sep = ";")

  # make column names lower case so that their capitalisation does not matter
  names(file) <- tolower(names(file))

  # fail early with a readable message instead of "undefined columns selected"
  missing_cols <- setdiff(c("title", "abstract"), names(file))
  if (length(missing_cols) > 0) {
    stop(
      "CSV file is missing required column(s): ",
      paste(missing_cols, collapse = ", "),
      call. = FALSE
    )
  }

  # only keep rows where neither the abstract nor the title is NA
  file <- file[!(is.na(file$abstract) | is.na(file$title)), , drop = FALSE]

  # one lower-case document per paper: title and abstract pasted together
  documents <- tolower(paste(file$title, file$abstract))

  # warnings raised by the text-processing steps are silenced, as before
  temp <- suppressWarnings(
    documents |>
      # lemmatization of the words
      textstem::lemmatize_strings() |>
      tm::VectorSource() |>
      # create corpus object
      tm::Corpus() |>
      # replace special characters by spaces
      tm::tm_map(tm::content_transformer(replaceSpecialChars)) |>
      # strip white space
      tm::tm_map(tm::stripWhitespace) |>
      # only use word stems
      tm::tm_map(tm::stemDocument) |>
      # create term-document matrix restricted to the requested word lengths
      tm::TermDocumentMatrix(control = list(wordLengths = word_length)) |>
      as.matrix() |>
      # transpose so that rows are papers and columns are word stems
      t() |>
      as.data.frame()
  )

  # only keep word stems whose total count lies within the given bounds;
  # drop = FALSE keeps a data frame even if exactly one stem survives
  term_totals <- colSums(temp)
  keep <- term_totals >= bounds[1] & term_totals <= bounds[2]
  temp <- temp[, keep, drop = FALSE]

  # order by column name
  temp <- temp[, order(names(temp)), drop = FALSE]

  # "include", "maybe" and "yes" count as "yes", every other value as "no"
  decision <- ifelse(
    file$decision %in% c("include", "maybe", "yes"),
    "yes",
    "no"
  )

  # add columns containing the ids of the papers and the belonging decisions
  res <- cbind(id = file$id, decision, temp)

  methods::new("MetaNLP", data_frame = res)
}
3 changes: 3 additions & 0 deletions R/util.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
# Replaces every character that is not a lower-case letter a-z by a space.
# `d` is the character vector to clean; the result has the same length.
replaceSpecialChars <- function(d) {
  gsub(pattern = "[^a-z]", replacement = " ", x = d)
}
26 changes: 21 additions & 5 deletions man/MetaNLP.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion tests/testthat/data/test_data.csv
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
ID;decision;author;title;abstract
ID;DECISION;author;title;Abstract
1;exclude;John Doe 2019;A beautiful paper;Lorem ipsum dolor sit amet, consectetuer adipiscing elit. Maecenas porttitor congue massa. Fusce posuere, magna sed pulvinar ultricies, purus lectus malesuada libero, sit amet commodo magna eros quis urna.
2;exclude;John Doe and Jane Roe 2019;A very beautiful paper;Nunc viverra imperdiet enim. Fusce est. Vivamus a tellus. Pellentesque habitant morbi tristique senectus et netus et malesuada fames ac turpis egestas. Proin pharetra nonummy pede. Mauris et orci.
3;include;John Doe and Jane Roe 2020;An enormously interesting paper;Aenean nec lorem. In porttitor. Donec laoreet nonummy augue. Suspendisse dui purus, scelerisque at, vulputate vitae, pretium mattis, nunc. Mauris eget neque at sem venenatis eleifend. Ut nonummy.
Expand Down
43 changes: 42 additions & 1 deletion tests/testthat/test_constructor.R
Original file line number Diff line number Diff line change
@@ -1,7 +1,48 @@
# Checks the MetaNLP constructor on the bundled test CSV: row filtering,
# id/decision columns, the effect of bounds and word_length, and one
# exemplary word count column.
test_that("constructor works", {
# obj uses the defaults, obj2 only lowers the lower bound to 1,
# obj3 restricts both the column totals and the word lengths
obj <- MetaNLP("data/test_data.csv")
obj2 <- MetaNLP("data/test_data.csv", bounds = c(1, Inf))
obj3 <- MetaNLP("data/test_data.csv", bounds = c(3,6), word_length = c(4,8))

# rows containing na values are dropped
# NOTE(review): both 5 and 4 appear below; they look like the old and new
# expected row counts of a rendered diff -- confirm that only one of them
# is present in the actual test file
expect_equal(
nrow(obj@data_frame),
5
4
)

# the id column must not depend on the bounds argument
expect_equal(
obj2@data_frame$id,
obj@data_frame$id
)

# the decision column must not depend on the bounds argument either
expect_equal(
obj2@data_frame$decision,
obj@data_frame$decision
)

# correct conversion from "include/exclude" to "yes/no"
expect_equal(
obj@data_frame$decision,
c("no", "no", "yes", "yes")
)

# when we allow for words that appear at least once, the number of columns
# should be higher
expect_true(
ncol(obj2@data_frame) > ncol(obj@data_frame)
)

# column totals lie within bounds and column name lengths lie within
# word_length; the first two columns are skipped in these checks
expect_true(
min(colSums(obj3@data_frame[-(1:2)])) >= 3 &
max(colSums(obj3@data_frame[-(1:2)])) <= 6 &
min(nchar(names(obj3@data_frame[-(1:2)]))) >= 4 &
max(nchar(names(obj3@data_frame[-(1:2)]))) <= 8
)

# exemplary column to test correct results: the values of the column
# "paper" for the four retained rows
expect_equal(
obj@data_frame$paper,
c(1, 1, 1, 3)
)
})

0 comments on commit 3ba72e2

Please sign in to comment.