Merge pull request #1 from imbi-heidelberg/basic_development

Implementation of the class MetaNLP
imbi-heidelberg · Apr 11, 2024 · 3ba72e2 · 3ba72e2
2 parents b381897 + 79b76ce
commit 3ba72e2
Show file tree

Hide file tree

Showing 6 changed files with 140 additions and 17 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -39,11 +39,13 @@ Suggests:
 Imports:
     glmnet,
     tm,
+    textstem,
     methods,
     utils
 VignetteBuilder: knitr
 Collate: 
     MetaNLP.R
+    util.R
 Encoding: UTF-8
 LazyData: true
 RoxygenNote: 7.3.1

diff --git a/R/MetaNLP.R b/R/MetaNLP.R
@@ -1,4 +1,4 @@
-#' Natrual Language Processing for Meta Analysis
+#' Natural Language Processing for Meta Analysis
 #'
 #' The \pkg{MetaNLP} package provides methods to quickly transform a
 #' CSV-file with titles and abstracts to an R data frame that can be
@@ -12,27 +12,88 @@
 
 #' Create a data frame with word counts
 #'
-#' A \code{MetaNLP} object is the base class of the \pkg{MetaNLP}.
+#' A \code{MetaNLP} object is the base class of the package \pkg{MetaNLP}.
 #' It is initialized by passing the path to a CSV file and constructs
 #' a data frame whose column names are the words that occur in the titles
 #' and abstracts and which cells contain the word counts for each
 #' paper.
 #'
 #' @rdname MetaNLP
-setClass("MetaNLP", representation(data_frame = "data.frame",
-                                   label = "character"))
+setClass(
+  "MetaNLP",
+  slots = c(data_frame = "data.frame")  # holds the word count data frame
+)
 
-#' @param path path to the CSV file
-#' @return an object of class \code{MetaNLP}
+#' @param path Path to the CSV file
+#' @param bounds A numeric vector of length 2. The first value specifies
+#' the minimum number of appearances of a word to become a column of the word
+#' count matrix, the second value specifies the maximum number.
+#' Defaults to \code{c(2, Inf)}.
+#' @param word_length A numeric vector of length 2. The first value specifies
+#' the minimum number of characters of a word to become a column of the word
+#' count matrix, the second value specifies the maximum number.
+#' Defaults to \code{c(3, Inf)}.
+#' @return An object of class \code{MetaNLP}
 #'
 #' @details
 #' An object of class \code{MetaNLP} contains a slot data_frame where
 #' the word count data frame is stored.
+#' The CSV file must have a column \code{ID} to identify each paper, a column
+#' \code{Title} with the corresponding titles of the papers and a column
+#' \code{Abstract} which contains the abstracts. Furthermore, to store the
+#' decision for each paper, a column \code{Decision} should exist, where the
+#' values are either "yes" and "no" or "include" and "exclude". The value "maybe"
+#' is handled as a "yes"/"include".
 #'
 #' @rdname MetaNLP
 #' @export
-MetaNLP <- function(path) {
-  file = utils::read.csv(path, header = TRUE, sep = ";")
-  # TODO: modify here
-  return(new("MetaNLP", data_frame = file))
+MetaNLP <- function(path, bounds = c(2, Inf), word_length = c(3, Inf)) {
+
+  # both range arguments are read as [1] = lower limit and [2] = upper limit
+  stopifnot(
+    is.numeric(bounds), length(bounds) == 2L,
+    is.numeric(word_length), length(word_length) == 2L
+  )
+
+  # load file (semicolon-separated, first line holds the column names)
+  file <- utils::read.csv(path, header = TRUE, sep = ";")
+
+  # make column names lower case, so "Title" and "title" are treated alike
+  names(file) <- tolower(names(file))
+
+  # fail early with a clear message instead of an obscure subsetting error
+  missing_cols <- setdiff(c("title", "abstract"), names(file))
+  if (length(missing_cols) > 0) {
+    stop("CSV file lacks required column(s): ",
+         paste(missing_cols, collapse = ", "), call. = FALSE)
+  }
+
+  # only keep rows where neither the title nor the abstract is NA
+  keep <- !(is.na(file$abstract) | is.na(file$title))
+  file <- file[keep, , drop = FALSE]
+
+  # build the document x word-stem count table
+  # NOTE(review): the blanket suppressWarnings() presumably silences warnings
+  # raised by the tm transformations - confirm and narrow it if possible
+  temp <- suppressWarnings(
+    # one lower-case string per paper: title and abstract pasted together
+    tolower(paste(file$title, file$abstract)) |>
+      # lemmatization of the words
+      textstem::lemmatize_strings() |>
+      tm::VectorSource() |>
+      # create corpus object
+      tm::Corpus() |>
+      # remove special characters
+      tm::tm_map(tm::content_transformer(replaceSpecialChars)) |>
+      # strip white space
+      tm::tm_map(tm::stripWhitespace) |>
+      # only use word stems
+      tm::tm_map(tm::stemDocument) |>
+      # create matrix, restricted to terms within the word length limits
+      tm::TermDocumentMatrix(control = list(wordLengths = word_length)) |>
+      as.matrix() |>
+      # transpose: papers become rows, word stems become columns
+      t() |>
+      as.data.frame()
+  )
+
+  # total number of occurrences of every word stem over all papers
+  totals <- colSums(temp)
+
+  # only choose word stems whose total count lies within the bounds;
+  # drop = FALSE keeps a data frame even if just one column qualifies
+  temp <- temp[, totals >= bounds[1] & totals <= bounds[2], drop = FALSE]
+
+  # order by column name
+  temp <- temp[, order(names(temp)), drop = FALSE]
+
+  # allow for "maybe" as decision; every other value is mapped to "no"
+  decision <- ifelse(
+    file$decision %in% c("include", "maybe", "yes"), "yes", "no"
+  )
+
+  # add columns containing the ids of the papers and the corresponding
+  # decisions in front of the word counts
+  res <- cbind(id = file$id, decision, temp)
+
+  new("MetaNLP", data_frame = res)
+}
diff --git a/R/util.R b/R/util.R
@@ -0,0 +1,3 @@
+# Substitute a single blank for every character that is not matched by the
+# character class a-z; the input may be a character vector of any length.
+replaceSpecialChars <- function(d) {
+  gsub(pattern = "[^a-z]", replacement = " ", x = d)
+}
diff --git a/man/MetaNLP.Rd b/man/MetaNLP.Rd
diff --git a/tests/testthat/data/test_data.csv b/tests/testthat/data/test_data.csv
@@ -1,4 +1,4 @@
-ID;decision;author;title;abstract
+ID;DECISION;author;title;Abstract
 1;exclude;John Doe 2019;A beautiful paper;Lorem ipsum dolor sit amet, consectetuer adipiscing elit. Maecenas porttitor congue massa. Fusce posuere, magna sed pulvinar ultricies, purus lectus malesuada libero, sit amet commodo magna eros quis urna.
 2;exclude;John Doe and Jane Roe 2019;A very beautiful paper;Nunc viverra imperdiet enim. Fusce est. Vivamus a tellus. Pellentesque habitant morbi tristique senectus et netus et malesuada fames ac turpis egestas. Proin pharetra nonummy pede. Mauris et orci.
 3;include;John Doe and Jane Roe 2020;An enormously interesting paper;Aenean nec lorem. In porttitor. Donec laoreet nonummy augue. Suspendisse dui purus, scelerisque at, vulputate vitae, pretium mattis, nunc. Mauris eget neque at sem venenatis eleifend. Ut nonummy.

diff --git a/tests/testthat/test_constructor.R b/tests/testthat/test_constructor.R
@@ -1,7 +1,48 @@
 test_that("constructor works", {
   obj <- MetaNLP("data/test_data.csv")
+  obj2 <- MetaNLP("data/test_data.csv", bounds = c(1, Inf))
+  obj3 <- MetaNLP("data/test_data.csv", bounds = c(3,6), word_length = c(4,8))
+
+  # rows with an NA title or abstract are dropped
   expect_equal(
     nrow(obj@data_frame),
-    5
+    4
+  )
+
+  # the id and decision columns must not depend on the bounds argument
+  expect_equal(
+    obj2@data_frame$id,
+    obj@data_frame$id
+  )
+
+  expect_equal(
+    obj2@data_frame$decision,
+    obj@data_frame$decision
+  )
+
+  # correct conversion from "include/exclude" to "yes/no"
+  expect_equal(
+    obj@data_frame$decision,
+    c("no", "no", "yes", "yes")
+  )
+
+  # when we allow for words that appear at least once, the number of columns
+  # should be higher
+  expect_true(
+    ncol(obj2@data_frame) > ncol(obj@data_frame)
+  )
+
+  # column sums respect bounds and column name lengths respect word_length
+  # (the first two columns, id and decision, are excluded)
+  expect_true(
+    min(colSums(obj3@data_frame[-(1:2)]))      >= 3 &
+    max(colSums(obj3@data_frame[-(1:2)]))      <= 6 &
+    min(nchar(names(obj3@data_frame[-(1:2)]))) >= 4 &
+    max(nchar(names(obj3@data_frame[-(1:2)]))) <= 8
+  )
+
+  # exemplary column to test correct results: counts in column "paper"
+  expect_equal(
+    obj@data_frame$paper,
+    c(1, 1, 1, 3)
   )
 })