-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #2 from imbi-heidelberg/basic_development
Functions to delete columns
- Loading branch information
Showing
8 changed files
with
226 additions
and
7 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,4 +1,8 @@ | ||
# Generated by roxygen2: do not edit by hand | ||
|
||
export(MetaNLP) | ||
export(delete_stop_words) | ||
export(delete_words) | ||
exportMethods(delete_stop_words) | ||
exportMethods(delete_words) | ||
import(methods) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,93 @@ | ||
#' Delete list of words | ||
#' | ||
#' There can be words that do not offer additional information | ||
#' in the classification whether a paper should be included or excluded | ||
#' from a meta-analysis. Thus, such words should not be part of the word count | ||
#' matrix. This function allows the user to remove these columns of the word | ||
#' count matrix by specifying a vector of words to delete. | ||
#' | ||
#' @param object A MetaNLP object, whose data frame is to be modified | ||
#' @param delete_list A character vector containing the words to be deleted | ||
#' @return An object of class \code{MetaNLP} | ||
#' | ||
#' @details | ||
#' The words in \code{delete_list} can be given like they appear in the | ||
#' text. They are lemmatized and stemmed by \code{delete_words} to match the | ||
#' columns of the word count matrix. | ||
#' | ||
#' @export | ||
setGeneric(
  name = "delete_words",
  def = function(object, delete_list) {
    # Hand over to the method registered for the classes of the arguments.
    standardGeneric("delete_words")
  }
)
|
||
|
||
|
||
#' @examples | ||
#' \dontrun{ | ||
#' obj <- MetaNLP("test_data.csv") | ||
#' del_words <- c("beautiful", "considering", "found") | ||
#' obj <- delete_words(obj, del_words) | ||
#' } | ||
#' | ||
#' @rdname delete_words | ||
#' @export | ||
setMethod(
  f = "delete_words",
  signature = signature("MetaNLP", "character"),
  definition = function(object, delete_list) {
    # Lemmatize, then stem, the supplied words so that they can be compared
    # with the column names of the word count matrix.
    lemmas <- textstem::lemmatize_strings(delete_list)
    stems <- tm::stemDocument(lemmas)

    # Keep, by name, every column whose name is not among the stems.
    all_columns <- names(object@data_frame)
    is_deleted <- all_columns %in% stems
    kept_columns <- all_columns[!is_deleted]

    object@data_frame <- object@data_frame[kept_columns]
    object
  }
)
|
||
|
||
|
||
|
||
#' Delete stop words | ||
#' | ||
#' Usually, stop words do not offer useful information in the classification | ||
#' whether a paper should be included or excluded | ||
#' from a meta-analysis. Thus, such words should not be part of the word count | ||
#' matrix. This function allows the user to automatically delete stop words. | ||
#' | ||
#' @param object A MetaNLP object, whose data frame is to be modified. | ||
#' @param ... Language of the stop words. Defaults to "english". | ||
#' @return An object of class \code{MetaNLP}. | ||
#' | ||
#' @details | ||
#' This function can delete stop words from different languages. Supported | ||
#' languages are \code{english}, \code{french}, \code{german}, \code{italian}, | ||
#' \code{portuguese}, \code{romanian}, \code{russian}, \code{spanish} and | ||
#' \code{swedish}. Language names are case sensitive. | ||
#' | ||
#' | ||
#' @export | ||
setGeneric(
  name = "delete_stop_words",
  def = function(object, ...) {
    # Hand over to the method registered for the class of `object`.
    standardGeneric("delete_stop_words")
  }
)
|
||
#' @examples | ||
#' \dontrun{ | ||
#' obj <- MetaNLP("test_data.csv") | ||
#' obj <- delete_stop_words(obj, "english") | ||
#' } | ||
#' | ||
#' @rdname delete_stop_words | ||
#' @export | ||
setMethod("delete_stop_words", signature("MetaNLP"),
          function(object, ...) {
            # The language of the stop words is read from `...`. When nothing
            # is supplied, use the documented default explicitly instead of
            # handing an empty list to tm::stopwords().
            dots <- list(...)
            language <- if (length(dots) == 0L) {
              "english"
            } else {
              as.character(unlist(dots))
            }
            if (length(language) != 1L || is.na(language)) {
              stop("The language of the stop words must be a single ",
                   "character string, e.g. \"english\".", call. = FALSE)
            }

            # define stop words by language
            delete_list <- tm::stopwords(kind = language)

            # delete these words from the word count matrix; only the data
            # frame of `object` is replaced
            object@data_frame <- delete_words(object, delete_list)@data_frame
            object
          })
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
Oops, something went wrong.
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
Oops, something went wrong.
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,42 @@ | ||
test_that("Deletion functions work", {
  # The first four words of the vector exist in (possibly modified) form in the
  # word count matrix of the test data
  source_path <- test_path("data", "test_data.csv")
  obj <- MetaNLP(source_path)
  deletion_list <- c("beautiful", "considering", "facts", "find", "algebra")
  obj_delete <- delete_words(obj, deletion_list)
  obj_stop_words <- delete_stop_words(obj)

  # delete_words and delete_stop_words return MetaNLP objects.
  # (isClass() only asks whether a class name is defined, so it cannot be used
  # to check the class of an object.)
  expect_s4_class(obj_delete, "MetaNLP")
  expect_s4_class(obj_stop_words, "MetaNLP")

  # the data frame in obj_delete should have 4 columns less
  expect_equal(ncol(obj_delete@data_frame), ncol(obj@data_frame) - 4)

  # check that the correct columns have been deleted; the column names live in
  # the data frame slot, not on the MetaNLP object itself
  remaining_columns <- names(obj_delete@data_frame)
  deleted_stems <- c("beauti", "consid", "fact", "find")
  expect_false(any(deleted_stems %in% remaining_columns))

  # check that stop words have been deleted: they are present in the original
  # word count matrix ...
  expect_true(any(tm::stopwords() %in% names(obj@data_frame)))

  # ... and gone after delete_stop_words
  expect_false(any(tm::stopwords() %in% names(obj_stop_words@data_frame)))
})