Skip to content

Commit

Permalink
Merge pull request #2 from imbi-heidelberg/basic_development
Browse files Browse the repository at this point in the history
Functions to delete columns
  • Loading branch information
max-pilz authored Apr 15, 2024
2 parents 3ba72e2 + efc903e commit 9e20efc
Show file tree
Hide file tree
Showing 8 changed files with 226 additions and 7 deletions.
1 change: 1 addition & 0 deletions DESCRIPTION
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,7 @@ VignetteBuilder: knitr
Collate:
MetaNLP.R
util.R
delete_functions.R
Encoding: UTF-8
LazyData: true
RoxygenNote: 7.3.1
Expand Down
4 changes: 4 additions & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
@@ -1,4 +1,8 @@
# Generated by roxygen2: do not edit by hand

export(MetaNLP)
export(delete_stop_words)
export(delete_words)
exportMethods(delete_stop_words)
exportMethods(delete_words)
import(methods)
10 changes: 6 additions & 4 deletions R/MetaNLP.R
Original file line number Diff line number Diff line change
Expand Up @@ -36,15 +36,17 @@ setClass("MetaNLP", representation(data_frame = "data.frame"))
#' An object of class \code{MetaNLP} contains a slot data_frame where
#' the word count data frame is stored.
#' The CSV file must have a column \code{ID} to identify each paper, a column
#' \code{Title} with the belonging titles of the papers and a column
#' \code{Abstract} which contains the abstracts. Furthermore, to store the
#' decision for each paper, a column \code{Decision} should exist, where the
#' \code{title} with the corresponding titles of the papers and a column
#' \code{abstract} which contains the abstracts. Furthermore, to store the
#' decision for each paper, a column \code{decision} should exist, where the
#' values are either "yes" and "no" or "include" and "exclude". The value "maybe"
#' is handled as a "yes"/"include".
#'
#' @rdname MetaNLP
#' @export
MetaNLP <- function(path, bounds = c(2, Inf), word_length = c(3, Inf)) {
title <- NULL
abstract <- NULL

# load file
file <- utils::read.csv(path, header = TRUE, sep = ";")
Expand All @@ -62,7 +64,7 @@ MetaNLP <- function(path, bounds = c(2, Inf), word_length = c(3, Inf)) {
within(x <- paste(title, abstract)) |>
(`[[`)(c("x")) |>
# lower case
tolower()|>
tolower() |>
# lemmatization of the words
textstem::lemmatize_strings() |>
tm::VectorSource() |>
Expand Down
93 changes: 93 additions & 0 deletions R/delete_functions.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,93 @@
#' Remove selected words from the word count matrix
#'
#' Some words carry no additional information about whether a paper should
#' be included in or excluded from a meta-analysis, so their columns are
#' better dropped from the word count matrix. This function removes the
#' columns that belong to a user-supplied vector of words.
#'
#' @param object A MetaNLP object whose data frame is to be modified.
#' @param delete_list A character vector containing the words to delete.
#' @return An object of class \code{MetaNLP}.
#'
#' @details
#' The entries of \code{delete_list} may be written the way they appear in
#' the text. \code{delete_words} lemmatizes and stems them before matching
#' them against the columns of the word count matrix.
#'
#' @export
setGeneric(
  "delete_words",
  function(object, delete_list) standardGeneric("delete_words")
)



#' @examples
#' \dontrun{
#' obj <- MetaNLP("test_data.csv")
#' del_words <- c("beautiful", "considering", "found")
#' obj <- delete_words(obj, del_words)
#' }
#'
#' @rdname delete_words
#' @export
setMethod("delete_words", signature("MetaNLP", "character"),
          function(object, delete_list) {
            # lemmatize first, then stem, so that the words can be compared
            # with the column names of the word count matrix
            stems <- tm::stemDocument(textstem::lemmatize_strings(delete_list))

            # names of all columns that do not match any of the stems
            all_columns <- names(object@data_frame)
            kept_columns <- all_columns[is.na(match(all_columns, stems))]

            # keep only those columns and hand back the modified object
            object@data_frame <- object@data_frame[kept_columns]
            object
          })




#' Delete stop words
#'
#' Usually, stop words do not offer useful information in the classification
#' whether a paper should be included in or excluded from a meta-analysis.
#' Thus, such words should not be part of the word count matrix. This
#' function allows the user to delete stop words automatically.
#'
#' @param object A MetaNLP object, whose data frame is to be modified.
#' @param ... Language of the stop words, given as a single character
#'   string. Defaults to "english".
#' @return An object of class \code{MetaNLP}.
#'
#' @details
#' This function allows deleting stop words of different languages.
#' Supported languages are \code{english}, \code{french}, \code{german},
#' \code{italian}, \code{portuguese}, \code{romanian}, \code{russian},
#' \code{spanish} and \code{swedish}. Language names are case sensitive.
#'
#' @export
setGeneric(
  "delete_stop_words",
  function(object, ...) standardGeneric("delete_stop_words")
)

#' @examples
#' \dontrun{
#' obj <- MetaNLP("test_data.csv")
#' obj <- delete_stop_words(obj, "english")
#' }
#'
#' @rdname delete_stop_words
#' @export
setMethod("delete_stop_words", signature("MetaNLP"),
          function(object, ...) {
            # the language arrives through `...`; flatten it to a plain vector
            language <- unlist(list(...), use.names = FALSE)

            # make the documented default explicit instead of depending on
            # how tm::stopwords() treats an empty `kind` argument
            if (length(language) == 0L) {
              language <- "english"
            }

            # exactly one language name is supported per call
            is_single_string <- is.character(language) &&
              length(language) == 1L &&
              !is.na(language)
            if (!is_single_string) {
              stop("the stop word language must be a single character string",
                   call. = FALSE)
            }

            # define stop words by language
            delete_list <- tm::stopwords(kind = language)

            # delete these words from the word count matrix; delete_words
            # takes care of lemmatizing and stemming them
            object@data_frame <- delete_words(object, delete_list)@data_frame
            object
          })
6 changes: 3 additions & 3 deletions man/MetaNLP.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

38 changes: 38 additions & 0 deletions man/delete_stop_words.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

39 changes: 39 additions & 0 deletions man/delete_words.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

42 changes: 42 additions & 0 deletions tests/testthat/test_deletion.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
test_that("Deletion functions work", {

  # The first four words of the vector exist in (possibly modified) form in
  # the word count matrix of the test data. The fifth word is expected not
  # to match any column, hence the difference of 4 columns checked below.
  source_path <- test_path("data", "test_data.csv")
  obj <- MetaNLP(source_path)
  deletion_list <- c("beautiful", "considering", "facts", "find", "algebra")
  obj_delete <- delete_words(obj, deletion_list)
  obj_stop_words <- delete_stop_words(obj)

  # delete_words and delete_stop_words return MetaNLP objects
  expect_s4_class(obj_delete, "MetaNLP")
  expect_s4_class(obj_stop_words, "MetaNLP")

  # the data frame in obj_delete should have 4 columns less
  expect_equal(ncol(obj_delete@data_frame), ncol(obj@data_frame) - 4)

  # check that the correct columns have been deleted; the column names live
  # in the data_frame slot, not on the S4 object itself
  deleted_stems <- c("beauti", "consid", "fact", "find")
  expect_false(any(deleted_stems %in% names(obj_delete@data_frame)))

  # the untouched object still contains stop words ...
  expect_true(any(tm::stopwords() %in% names(obj@data_frame)))

  # ... whereas none are left after delete_stop_words
  expect_false(any(tm::stopwords() %in% names(obj_stop_words@data_frame)))
})

0 comments on commit 9e20efc

Please sign in to comment.