Merge pull request #2 from imbi-heidelberg/basic_development

Functions to delete columns
imbi-heidelberg · Apr 15, 2024 · 9e20efc · 9e20efc
2 parents 3ba72e2 + efc903e
commit 9e20efc
Show file tree

Hide file tree

Showing 8 changed files with 226 additions and 7 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -46,6 +46,7 @@ VignetteBuilder: knitr
 Collate: 
     MetaNLP.R
     util.R
+    delete_functions.R
 Encoding: UTF-8
 LazyData: true
 RoxygenNote: 7.3.1

diff --git a/NAMESPACE b/NAMESPACE
@@ -1,4 +1,8 @@
 # Generated by roxygen2: do not edit by hand
 
 export(MetaNLP)
+export(delete_stop_words)
+export(delete_words)
+exportMethods(delete_stop_words)
+exportMethods(delete_words)
 import(methods)
diff --git a/R/MetaNLP.R b/R/MetaNLP.R
@@ -36,15 +36,17 @@ setClass("MetaNLP", representation(data_frame = "data.frame"))
 #' An object of class \code{MetaNLP} contains a slot data_frame where
 #' the word count data frame is stored.
 #' The CSV file must have a column \code{ID} to identify each paper, a column
-#' \code{Title} with the belonging titles of the papers and a column
-#' \code{Abstract} which contains the abstracts. Furthermore, to store the
-#' decision for each paper, a column \code{Decision} should exist, where the
+#' \code{title} with the corresponding titles of the papers and a column
+#' \code{abstract} which contains the abstracts. Furthermore, to store the
+#' decision for each paper, a column \code{decision} should exist, where the
 #' values are either "yes" and "no" or "include" and "exclude". The value "maybe"
 #' is handled as a "yes"/"include".
 #'
 #' @rdname MetaNLP
 #' @export
 MetaNLP <- function(path, bounds = c(2, Inf), word_length = c(3, Inf)) {
+  title <- NULL
+  abstract <- NULL
 
   # load file
   file <- utils::read.csv(path, header = TRUE, sep = ";")
@@ -62,7 +64,7 @@ MetaNLP <- function(path, bounds = c(2, Inf), word_length = c(3, Inf)) {
     within(x <- paste(title, abstract)) |>
     (`[[`)(c("x")) |>
     # lower case
-    tolower()|>
+    tolower() |>
     # lemmatization of the words
     textstem::lemmatize_strings() |>
     tm::VectorSource() |>

diff --git a/R/delete_functions.R b/R/delete_functions.R
@@ -0,0 +1,93 @@
+#' Delete a list of words
+#'
+#' Some words do not offer additional information for the classification
+#' whether a paper should be included in or excluded from a meta-analysis.
+#' Thus, such words should not be part of the word count matrix. This function
+#' allows the user to remove these columns of the word count matrix by
+#' specifying a vector of words to delete.
+#'
+#' @param object A MetaNLP object whose data frame is to be modified.
+#' @param delete_list A character vector containing the words to be deleted.
+#' @return An object of class \code{MetaNLP}.
+#'
+#' @details
+#' The words in \code{delete_list} can be given as they appear in the
+#' text. They are lemmatized and stemmed by \code{delete_words} to match the
+#' columns of the word count matrix.
+#'
+#' @export
+# The definition is a bare call to standardGeneric() (no braces) so that a
+# standard generic function is created.
+setGeneric(
+  "delete_words",
+  function(object, delete_list) standardGeneric("delete_words")
+)
+
+
+
+#' @examples
+#' \dontrun{
+#' obj <- MetaNLP("test_data.csv")
+#' del_words <- c("beautiful", "considering", "found")
+#' obj <- delete_words(obj, del_words)
+#' }
+#'
+#' @rdname delete_words
+#' @export
+setMethod("delete_words", signature("MetaNLP", "character"),
+          function(object, delete_list) {
+            # bring the requested words into the lemmatized and stemmed form
+            # that is compared against the column names below
+            stems <- tm::stemDocument(
+              textstem::lemmatize_strings(delete_list)
+            )
+
+            # names of the columns that do not match any processed word
+            all_columns <- names(object@data_frame)
+            kept_columns <- all_columns[is.na(match(all_columns, stems))]
+
+            # keep only those columns and return the modified object
+            object@data_frame <- object@data_frame[kept_columns]
+            object
+          })
+
+
+
+
+#' Delete stop words
+#'
+#' Usually, stop words do not offer useful information in the classification
+#' whether a paper should be included or excluded
+#' from a meta-analysis. Thus, such words should not be part of the word count
+#' matrix. This function allows the user to automatically delete stop words.
+#'
+#' @param object A MetaNLP object, whose data frame is to be modified.
+#' @param ... Language of the stop words. Defaults to "english".
+#' @return An object of class \code{MetaNLP}.
+#'
+#' @details
+#' This function can delete stop words from different languages. Supported
+#' languages are \code{english}, \code{french}, \code{german}, \code{italian},
+#' \code{portuguese}, \code{romanian}, \code{russian}, \code{spanish} and
+#' \code{swedish}. Language names are case sensitive.
+#'
+#'
+#' @export
+# The definition is a bare call to standardGeneric() (no braces) so that a
+# standard generic function is created.
+setGeneric(
+  "delete_stop_words",
+  function(object, ...) standardGeneric("delete_stop_words")
+)
+
+#' @examples
+#' \dontrun{
+#' obj <- MetaNLP("test_data.csv")
+#' obj <- delete_stop_words(obj, "english")
+#' }
+#'
+#' @rdname delete_stop_words
+#' @export
+setMethod("delete_stop_words", signature("MetaNLP"),
+          function(object, ...) {
+            dots <- list(...)
+            if (length(dots) > 1L) {
+              stop("Only one language of stop words can be given.",
+                   call. = FALSE)
+            }
+
+            # use the documented default when no language is supplied;
+            # otherwise the argument passed via ... is the language
+            language <- if (length(dots) == 0L) "english" else dots[[1L]]
+            if (!is.character(language) || length(language) != 1L) {
+              stop("The language of the stop words must be a single ",
+                   "character string, e.g. \"english\".", call. = FALSE)
+            }
+
+            # define stop words by language
+            delete_list <- tm::stopwords(kind = language)
+
+            # delete these words from the word count matrix; delete_words
+            # returns the modified MetaNLP object
+            delete_words(object, delete_list)
+          })
diff --git a/man/MetaNLP.Rd b/man/MetaNLP.Rd
diff --git a/man/delete_stop_words.Rd b/man/delete_stop_words.Rd
diff --git a/man/delete_words.Rd b/man/delete_words.Rd
diff --git a/tests/testthat/test_deletion.R b/tests/testthat/test_deletion.R
@@ -0,0 +1,42 @@
+test_that("Deletion functions work", {
+
+  # The first four words of the vector exist in (possibly modified) form in the
+  # word count matrix of the test data
+  source_path <- test_path("data", "test_data.csv")
+  obj <- MetaNLP(source_path)
+  deletion_list <- c("beautiful", "considering", "facts", "find", "algebra")
+  obj_delete <- delete_words(obj, deletion_list)
+  obj_stop_words <- delete_stop_words(obj)
+
+  # delete_words and delete_stop_words return MetaNLP objects
+  expect_s4_class(obj_delete, "MetaNLP")
+  expect_s4_class(obj_stop_words, "MetaNLP")
+
+  # the data frame in obj_delete should have 4 columns less
+  expect_equal(ncol(obj_delete@data_frame), ncol(obj@data_frame) - 4)
+
+  # check that the correct columns have been deleted; the names must be read
+  # from the data frame slot, not from the S4 object itself
+  deleted_columns <- c("beauti", "consid", "fact", "find")
+  expect_false(any(deleted_columns %in% names(obj_delete@data_frame)))
+
+  # the unmodified object still contains stop words ...
+  expect_true(any(tm::stopwords() %in% names(obj@data_frame)))
+
+  # ... while they have been deleted from obj_stop_words
+  expect_false(any(tm::stopwords() %in% names(obj_stop_words@data_frame)))
+})