From fd98f6d917ffd29f84e15d512f44fa8c422f02e2 Mon Sep 17 00:00:00 2001 From: LuisDVA Date: Wed, 21 Aug 2024 16:50:45 -0600 Subject: [PATCH 1/2] replace magrittr pipes with base pipes --- R/encoding.R | 6 +-- R/form.R | 4 +- R/html.R | 14 +++---- R/live.R | 14 +++---- R/rvest-package.R | 12 +++--- R/selectors.R | 18 ++++----- R/session.R | 16 ++++---- R/table.R | 12 +++--- R/text.R | 8 ++-- README.Rmd | 16 ++++---- README.md | 22 +++++++---- demo/tripadvisor.R | 34 ++++++++--------- demo/united.R | 16 ++++---- demo/zillow.R | 26 ++++++------- man/LiveHTML.Rd | 6 +-- man/html_attr.Rd | 8 ++-- man/html_element.Rd | 18 ++++----- man/html_encoding_guess.Rd | 6 +-- man/html_form.Rd | 4 +- man/html_name.Rd | 6 +-- man/html_table.Rd | 12 +++--- man/html_text.Rd | 8 ++-- man/read_html.Rd | 12 +++--- man/read_html_live.Rd | 8 ++-- man/session.Rd | 16 ++++---- vignettes/rvest.Rmd | 70 +++++++++++++++++----------------- vignettes/starwars-dynamic.Rmd | 8 ++-- 27 files changed, 203 insertions(+), 197 deletions(-) diff --git a/R/encoding.R b/R/encoding.R index 8e265a0e..df622f3d 100644 --- a/R/encoding.R +++ b/R/encoding.R @@ -11,12 +11,12 @@ #' # A file with bad encoding included in the package #' path <- system.file("html-ex", "bad-encoding.html", package = "rvest") #' x <- read_html(path) -#' x %>% html_elements("p") %>% html_text() +#' x |> html_elements("p") |> html_text() #' #' html_encoding_guess(x) #' # Two valid encodings, only one of which is correct -#' read_html(path, encoding = "ISO-8859-1") %>% html_elements("p") %>% html_text() -#' read_html(path, encoding = "ISO-8859-2") %>% html_elements("p") %>% html_text() +#' read_html(path, encoding = "ISO-8859-1") |> html_elements("p") |> html_text() +#' read_html(path, encoding = "ISO-8859-2") |> html_elements("p") |> html_text() html_encoding_guess <- function(x) { check_installed("stringi") diff --git a/R/form.R b/R/form.R index d39f81f3..525b993d 100644 --- a/R/form.R +++ b/R/form.R @@ -22,11 +22,11 @@ #' html <- 
read_html("http://www.google.com") #' search <- html_form(html)[[1]] #' -#' search <- search %>% html_form_set(q = "My little pony", hl = "fr") +#' search <- search |> html_form_set(q = "My little pony", hl = "fr") #' #' # Or if you have a list of values, use !!! #' vals <- list(q = "web scraping", hl = "en") -#' search <- search %>% html_form_set(!!!vals) +#' search <- search |> html_form_set(!!!vals) #' #' # To submit and get result: #' \dontrun{ diff --git a/R/html.R b/R/html.R index 8f2aef77..f8098817 100644 --- a/R/html.R +++ b/R/html.R @@ -8,9 +8,9 @@ #' url <- "https://rvest.tidyverse.org/articles/starwars.html" #' html <- read_html(url) #' -#' html %>% -#' html_element("div") %>% -#' html_children() %>% +#' html |> +#' html_element("div") |> +#' html_children() |> #' html_name() #' @export #' @importFrom xml2 xml_name @@ -35,11 +35,11 @@ html_name <- function(x) { #'
  • b
  • #' ') #' -#' html %>% html_elements("a") %>% html_attrs() +#' html |> html_elements("a") |> html_attrs() #' -#' html %>% html_elements("a") %>% html_attr("href") -#' html %>% html_elements("li") %>% html_attr("class") -#' html %>% html_elements("li") %>% html_attr("class", default = "inactive") +#' html |> html_elements("a") |> html_attr("href") +#' html |> html_elements("li") |> html_attr("class") +#' html |> html_elements("li") |> html_attr("class", default = "inactive") #' @export #' @importFrom xml2 xml_attr html_attr <- function(x, name, default = NA_character_) { diff --git a/R/live.R b/R/live.R index 91f4a1b7..f35b59b7 100644 --- a/R/live.R +++ b/R/live.R @@ -27,16 +27,16 @@ #' # When we retrieve the raw HTML for this site, it doesn't contain the #' # data we're interested in: #' static <- read_html("https://www.forbes.com/top-colleges/") -#' static %>% html_elements(".TopColleges2023_tableRow__BYOSU") +#' static |> html_elements(".TopColleges2023_tableRow__BYOSU") #' #' # Instead, we need to run the site in a real web browser, causing it to #' # download a JSON file and then dynamically generate the html: #' #' sess <- read_html_live("https://www.forbes.com/top-colleges/") #' sess$view() -#' rows <- sess %>% html_elements(".TopColleges2023_tableRow__BYOSU") -#' rows %>% html_element(".TopColleges2023_organizationName__J1lEV") %>% html_text() -#' rows %>% html_element(".grant-aid") %>% html_text() +#' rows <- sess |> html_elements(".TopColleges2023_tableRow__BYOSU") +#' rows |> html_element(".TopColleges2023_organizationName__J1lEV") |> html_text() +#' rows |> html_element(".grant-aid") |> html_text() #' } read_html_live <- function(url) { check_installed(c("chromote", "R6")) @@ -67,11 +67,11 @@ read_html_live <- function(url) { #' sess <- read_html_live("https://www.bodybuilding.com/exercises/finder") #' sess$view() #' -#' sess %>% html_elements(".ExResult-row") %>% length() +#' sess |> html_elements(".ExResult-row") |> length() #' 
sess$click(".ExLoadMore-btn") -#' sess %>% html_elements(".ExResult-row") %>% length() +#' sess |> html_elements(".ExResult-row") |> length() #' sess$click(".ExLoadMore-btn") -#' sess %>% html_elements(".ExResult-row") %>% length() +#' sess |> html_elements(".ExResult-row") |> length() #' } LiveHTML <- R6::R6Class( "LiveHTML", diff --git a/R/rvest-package.R b/R/rvest-package.R index 64fc853d..d31daae6 100644 --- a/R/rvest-package.R +++ b/R/rvest-package.R @@ -30,21 +30,21 @@ #' # Then find elements that match a css selector or XPath expression #' # using html_elements(). In this example, each
    corresponds #' # to a different film -#' films <- starwars %>% html_elements("section") +#' films <- starwars |> html_elements("section") #' films #' #' # Then use html_element() to extract one element per film. Here #' # we the title is given by the text inside

    -#' title <- films %>% -#' html_element("h2") %>% +#' title <- films |> +#' html_element("h2") |> #' html_text2() #' title #' #' # Or use html_attr() to get data out of attributes. html_attr() always #' # returns a string so we convert it to an integer using a readr function -#' episode <- films %>% -#' html_element("h2") %>% -#' html_attr("data-id") %>% +#' episode <- films |> +#' html_element("h2") |> +#' html_attr("data-id") |> #' readr::parse_integer() #' episode xml2::read_html diff --git a/R/selectors.R b/R/selectors.R index b48e1a1a..348ad90e 100644 --- a/R/selectors.R +++ b/R/selectors.R @@ -40,10 +40,10 @@ #'

    This is an important paragraph

    #' ") #' -#' html %>% html_element("h1") -#' html %>% html_elements("p") -#' html %>% html_elements(".important") -#' html %>% html_elements("#first") +#' html |> html_element("h1") +#' html |> html_elements("p") +#' html |> html_elements(".important") +#' html |> html_elements("#first") #' #' # html_element() vs html_elements() -------------------------------------- #' html <- minimal_html(" @@ -54,18 +54,18 @@ #'
  • R4-P17 is a droid
  • #' #' ") -#' li <- html %>% html_elements("li") +#' li <- html |> html_elements("li") #' #' # When applied to a node set, html_elements() returns all matching elements #' # beneath any of the inputs, flattening results into a new node set. -#' li %>% html_elements("i") +#' li |> html_elements("i") #' #' # When applied to a node set, html_element() always returns a vector the #' # same length as the input, using a "missing" element where needed. -#' li %>% html_element("i") +#' li |> html_element("i") #' # and html_text() and html_attr() will return NA -#' li %>% html_element("i") %>% html_text2() -#' li %>% html_element("span") %>% html_attr("class") +#' li |> html_element("i") |> html_text2() +#' li |> html_element("span") |> html_attr("class") html_element <- function(x, css, xpath) { UseMethod("html_element") } diff --git a/R/session.R b/R/session.R index 0572c1f3..861072d3 100644 --- a/R/session.R +++ b/R/session.R @@ -22,19 +22,19 @@ #' @export #' @examples #' s <- session("http://hadley.nz") -#' s %>% -#' session_jump_to("hadley-wickham.jpg") %>% -#' session_jump_to("/") %>% +#' s |> +#' session_jump_to("hadley-wickham.jpg") |> +#' session_jump_to("/") |> #' session_history() #' -#' s %>% -#' session_jump_to("hadley-wickham.jpg") %>% -#' session_back() %>% +#' s |> +#' session_jump_to("hadley-wickham.jpg") |> +#' session_back() |> #' session_history() #' #' \donttest{ -#' s %>% -#' session_follow_link(css = "p a") %>% +#' s |> +#' session_follow_link(css = "p a") |> #' html_elements("p") #' } session <- function(url, ...) 
{ diff --git a/R/table.R b/R/table.R index dca24952..2091abef 100644 --- a/R/table.R +++ b/R/table.R @@ -30,8 +30,8 @@ #' 4y #' 10z #' ") -#' sample1 %>% -#' html_element("table") %>% +#' sample1 |> +#' html_element("table") |> #' html_table() #' #' # Values in merged cells will be duplicated @@ -41,8 +41,8 @@ #' 45 #' 67 #' ") -#' sample2 %>% -#' html_element("table") %>% +#' sample2 |> +#' html_element("table") |> #' html_table() #' #' # If a row is missing cells, they'll be filled with NAs @@ -52,8 +52,8 @@ #' 3 #' 4 #' ") -#' sample3 %>% -#' html_element("table") %>% +#' sample3 |> +#' html_element("table") |> #' html_table() html_table <- function(x, header = NA, diff --git a/R/text.R b/R/text.R index b64d1ec8..ca4585c8 100644 --- a/R/text.R +++ b/R/text.R @@ -27,17 +27,17 @@ #' #' # html_text() returns the raw underlying text, which includes whitespace #' # that would be ignored by a browser, and ignores the
    -#' html %>% html_element("p") %>% html_text() %>% writeLines() +#' html |> html_element("p") |> html_text() |> writeLines() #' #' # html_text2() simulates what a browser would display. Non-significant #' # whitespace is collapsed, and
    is turned into a line break -#' html %>% html_element("p") %>% html_text2() %>% writeLines() +#' html |> html_element("p") |> html_text2() |> writeLines() #' #' # By default, html_text2() also converts non-breaking spaces to regular #' # spaces: #' html <- minimal_html("

    x y

    ") -#' x1 <- html %>% html_element("p") %>% html_text() -#' x2 <- html %>% html_element("p") %>% html_text2() +#' x1 <- html |> html_element("p") |> html_text() +#' x2 <- html |> html_element("p") |> html_text2() #' #' # When printed, non-breaking spaces look exactly like regular spaces #' x1 diff --git a/README.Rmd b/README.Rmd index c9cd5788..3811fb8e 100644 --- a/README.Rmd +++ b/README.Rmd @@ -50,21 +50,21 @@ starwars <- read_html("https://rvest.tidyverse.org/articles/starwars.html") # Then find elements that match a css selector or XPath expression # using html_elements(). In this example, each
    corresponds # to a different film -films <- starwars %>% html_elements("section") +films <- starwars |> html_elements("section") films # Then use html_element() to extract one element per film. Here # we the title is given by the text inside

    -title <- films %>% - html_element("h2") %>% +title <- films |> + html_element("h2") |> html_text2() title # Or use html_attr() to get data out of attributes. html_attr() always # returns a string so we convert it to an integer using a readr function -episode <- films %>% - html_element("h2") %>% - html_attr("data-id") %>% +episode <- films |> + html_element("h2") |> + html_attr("data-id") |> readr::parse_integer() episode ``` @@ -74,7 +74,7 @@ If the page contains tabular data you can convert it directly to a data frame wi ```{r} html <- read_html("https://en.wikipedia.org/w/index.php?title=The_Lego_Movie&oldid=998422565") -html %>% - html_element(".tracklist") %>% +html |> + html_element(".tracklist") |> html_table() ``` diff --git a/README.md b/README.md index 356bb1d4..37f938f8 100644 --- a/README.md +++ b/README.md @@ -47,7 +47,7 @@ starwars <- read_html("https://rvest.tidyverse.org/articles/starwars.html") # Then find elements that match a css selector or XPath expression # using html_elements(). In this example, each
    corresponds # to a different film -films <- starwars %>% html_elements("section") +films <- starwars |> html_elements("section") films #> {xml_nodeset (7)} #> [1]

    \nThe Phantom Menace\n

    \n

    \nReleased: 1999 ... @@ -57,23 +57,29 @@ films #> [5]

    \nThe Empire Strikes Back\n

    \n

    \nReleased: ... #> [6]

    \nReturn of the Jedi\n

    \n

    \nReleased: 1983 ... #> [7]

    \nThe Force Awakens\n

    \n

    \nReleased: 2015- ... +``` + +``` r # Then use html_element() to extract one element per film. Here # we the title is given by the text inside

    -title <- films %>% - html_element("h2") %>% +title <- films |> + html_element("h2") |> html_text2() title #> [1] "The Phantom Menace" "Attack of the Clones" #> [3] "Revenge of the Sith" "A New Hope" #> [5] "The Empire Strikes Back" "Return of the Jedi" #> [7] "The Force Awakens" +``` + +``` r # Or use html_attr() to get data out of attributes. html_attr() always # returns a string so we convert it to an integer using a readr function -episode <- films %>% - html_element("h2") %>% - html_attr("data-id") %>% +episode <- films |> + html_element("h2") |> + html_attr("data-id") |> readr::parse_integer() episode #> [1] 1 2 3 4 5 6 7 @@ -85,8 +91,8 @@ frame with `html_table()`: ``` r html <- read_html("https://en.wikipedia.org/w/index.php?title=The_Lego_Movie&oldid=998422565") -html %>% - html_element(".tracklist") %>% +html |> + html_element(".tracklist") |> html_table() #> # A tibble: 29 × 4 #> No. Title `Performer(s)` Length diff --git a/demo/tripadvisor.R b/demo/tripadvisor.R index 7ece89c8..4246a923 100644 --- a/demo/tripadvisor.R +++ b/demo/tripadvisor.R @@ -5,32 +5,32 @@ library(rvest) url <- "http://www.tripadvisor.com/Hotel_Review-g37209-d1762915-Reviews-JW_Marriott_Indianapolis-Indianapolis_Indiana.html" -reviews <- url %>% - read_html() %>% +reviews <- url |> + read_html() |> html_elements("#REVIEWS .innerBubble") -id <- reviews %>% - html_element(".quote a") %>% +id <- reviews |> + html_element(".quote a") |> html_attr("id") -quote <- reviews %>% - html_element(".quote span") %>% +quote <- reviews |> + html_element(".quote span") |> html_text() -rating <- reviews %>% - html_element(".rating .rating_s_fill") %>% - html_attr("alt") %>% - gsub(" of 5 stars", "", .) %>% +rating <- reviews |> + html_element(".rating .rating_s_fill") |> + html_attr("alt") |> + gsub(" of 5 stars", "", x = _) 
|> as.integer() -date <- reviews %>% - html_element(".rating .ratingDate") %>% - html_attr("title") %>% - strptime("%b %d, %Y") %>% +date <- reviews |> + html_element(".rating .ratingDate") |> + html_attr("title") |> + strptime("%b %d, %Y") |> as.POSIXct() -review <- reviews %>% - html_element(".entry .partial_entry") %>% +review <- reviews |> + html_element(".entry .partial_entry") |> html_text() -data.frame(id, quote, rating, date, review, stringsAsFactors = FALSE) %>% View() +data.frame(id, quote, rating, date, review, stringsAsFactors = FALSE) |> View() diff --git a/demo/united.R b/demo/united.R index b4ba358e..24feccfb 100644 --- a/demo/united.R +++ b/demo/united.R @@ -3,18 +3,18 @@ library(rvest) united <- session("http://www.united.com/") -login <- united %>% - html_element("form[name=LoginForm]") %>% - html_form() %>% +login <- united |> + html_element("form[name=LoginForm]") |> + html_form() |> html_form_set( MpNumber = "GY797363", Password = password ) -logged_in <- united %>% session_submit(login) +logged_in <- united |> session_submit(login) -logged_in %>% - follow_link("View account") %>% - html_element("#ctl00_ContentInfo_AccountSummary_spanEliteMilesNew") %>% - html_text() %>% +logged_in |> + follow_link("View account") |> + html_element("#ctl00_ContentInfo_AccountSummary_spanEliteMilesNew") |> + html_text() |> readr::parse_number() diff --git a/demo/zillow.R b/demo/zillow.R index e98d1f14..94401225 100644 --- a/demo/zillow.R +++ b/demo/zillow.R @@ -4,25 +4,25 @@ library(tidyr) page <- read_html("http://www.zillow.com/homes/for_sale/Greenwood-IN/fsba,fsbo,fore,cmsn_lt/house_type/52333_rid/39.638414,-86.011362,39.550714,-86.179419_rect/12_zm/0_mmm/") -houses <- page %>% +houses <- page |> html_elements(".photo-cards li article") -z_id <- houses %>% html_attr("id") +z_id <- houses |> html_attr("id") -address <- houses %>% - html_element(".zsg-photo-card-address") %>% +address <- houses |> + html_element(".zsg-photo-card-address") |> html_text() -price 
<- houses %>% - html_element(".zsg-photo-card-price") %>% - html_text() %>% +price <- houses |> + html_element(".zsg-photo-card-price") |> + html_text() |> readr::parse_number() -params <- houses %>% - html_element(".zsg-photo-card-info") %>% - html_text() %>% +params <- houses |> + html_element(".zsg-photo-card-info") |> + html_text() |> strsplit("\u00b7") -beds <- params %>% purrr::map_chr(1) %>% readr::parse_number() -baths <- params %>% purrr::map_chr(2) %>% readr::parse_number() -house_area <- params %>% purrr::map_chr(3) %>% readr::parse_number() +beds <- params |> purrr::map_chr(1) |> readr::parse_number() +baths <- params |> purrr::map_chr(2) |> readr::parse_number() +house_area <- params |> purrr::map_chr(3) |> readr::parse_number() diff --git a/man/LiveHTML.Rd b/man/LiveHTML.Rd index a568e669..5b3aaa98 100644 --- a/man/LiveHTML.Rd +++ b/man/LiveHTML.Rd @@ -24,11 +24,11 @@ that exposes a more powerful user interface, like sess <- read_html_live("https://www.bodybuilding.com/exercises/finder") sess$view() -sess \%>\% html_elements(".ExResult-row") \%>\% length() +sess |> html_elements(".ExResult-row") |> length() sess$click(".ExLoadMore-btn") -sess \%>\% html_elements(".ExResult-row") \%>\% length() +sess |> html_elements(".ExResult-row") |> length() sess$click(".ExLoadMore-btn") -sess \%>\% html_elements(".ExResult-row") \%>\% length() +sess |> html_elements(".ExResult-row") |> length() } } \section{Public fields}{ diff --git a/man/html_attr.Rd b/man/html_attr.Rd index 75ac0ccf..3a9860a4 100644 --- a/man/html_attr.Rd +++ b/man/html_attr.Rd @@ -32,9 +32,9 @@ html <- minimal_html('') -html \%>\% html_elements("a") \%>\% html_attrs() +html |> html_elements("a") |> html_attrs() -html \%>\% html_elements("a") \%>\% html_attr("href") -html \%>\% html_elements("li") \%>\% html_attr("class") -html \%>\% html_elements("li") \%>\% html_attr("class", default = "inactive") +html |> html_elements("a") |> html_attr("href") +html |> html_elements("li") |> 
html_attr("class") +html |> html_elements("li") |> html_attr("class", default = "inactive") } diff --git a/man/html_element.Rd b/man/html_element.Rd index cc2e24ef..53a12a58 100644 --- a/man/html_element.Rd +++ b/man/html_element.Rd @@ -57,10 +57,10 @@ html <- minimal_html("

    This is an important paragraph

    ") -html \%>\% html_element("h1") -html \%>\% html_elements("p") -html \%>\% html_elements(".important") -html \%>\% html_elements("#first") +html |> html_element("h1") +html |> html_elements("p") +html |> html_elements(".important") +html |> html_elements("#first") # html_element() vs html_elements() -------------------------------------- html <- minimal_html(" @@ -71,16 +71,16 @@ html <- minimal_html("
  • R4-P17 is a droid
  • ") -li <- html \%>\% html_elements("li") +li <- html |> html_elements("li") # When applied to a node set, html_elements() returns all matching elements # beneath any of the inputs, flattening results into a new node set. -li \%>\% html_elements("i") +li |> html_elements("i") # When applied to a node set, html_element() always returns a vector the # same length as the input, using a "missing" element where needed. -li \%>\% html_element("i") +li |> html_element("i") # and html_text() and html_attr() will return NA -li \%>\% html_element("i") \%>\% html_text2() -li \%>\% html_element("span") \%>\% html_attr("class") +li |> html_element("i") |> html_text2() +li |> html_element("span") |> html_attr("class") } diff --git a/man/html_encoding_guess.Rd b/man/html_encoding_guess.Rd index d64b9cc4..db040af7 100644 --- a/man/html_encoding_guess.Rd +++ b/man/html_encoding_guess.Rd @@ -20,10 +20,10 @@ encodings, then try each out by using \code{encoding} argument of \code{read_htm # A file with bad encoding included in the package path <- system.file("html-ex", "bad-encoding.html", package = "rvest") x <- read_html(path) -x \%>\% html_elements("p") \%>\% html_text() +x |> html_elements("p") |> html_text() html_encoding_guess(x) # Two valid encodings, only one of which is correct -read_html(path, encoding = "ISO-8859-1") \%>\% html_elements("p") \%>\% html_text() -read_html(path, encoding = "ISO-8859-2") \%>\% html_elements("p") \%>\% html_text() +read_html(path, encoding = "ISO-8859-1") |> html_elements("p") |> html_text() +read_html(path, encoding = "ISO-8859-2") |> html_elements("p") |> html_text() } diff --git a/man/html_form.Rd b/man/html_form.Rd index ee5fc627..e159258c 100644 --- a/man/html_form.Rd +++ b/man/html_form.Rd @@ -52,11 +52,11 @@ and submit it with \code{html_form_submit()}. 
html <- read_html("http://www.google.com") search <- html_form(html)[[1]] -search <- search \%>\% html_form_set(q = "My little pony", hl = "fr") +search <- search |> html_form_set(q = "My little pony", hl = "fr") # Or if you have a list of values, use !!! vals <- list(q = "web scraping", hl = "en") -search <- search \%>\% html_form_set(!!!vals) +search <- search |> html_form_set(!!!vals) # To submit and get result: \dontrun{ diff --git a/man/html_name.Rd b/man/html_name.Rd index 3e66d3a7..cbe46558 100644 --- a/man/html_name.Rd +++ b/man/html_name.Rd @@ -20,8 +20,8 @@ Get element name url <- "https://rvest.tidyverse.org/articles/starwars.html" html <- read_html(url) -html \%>\% - html_element("div") \%>\% - html_children() \%>\% +html |> + html_element("div") |> + html_children() |> html_name() } diff --git a/man/html_table.Rd b/man/html_table.Rd index 3a05da80..e1067cb6 100644 --- a/man/html_table.Rd +++ b/man/html_table.Rd @@ -54,8 +54,8 @@ sample1 <- minimal_html("
    4y
    10z
    ") -sample1 \%>\% - html_element("table") \%>\% +sample1 |> + html_element("table") |> html_table() # Values in merged cells will be duplicated @@ -65,8 +65,8 @@ sample2 <- minimal_html("
    45
    67
    ") -sample2 \%>\% - html_element("table") \%>\% +sample2 |> + html_element("table") |> html_table() # If a row is missing cells, they'll be filled with NAs @@ -76,7 +76,7 @@ sample3 <- minimal_html("
    3
    4
    ") -sample3 \%>\% - html_element("table") \%>\% +sample3 |> + html_element("table") |> html_table() } diff --git a/man/html_text.Rd b/man/html_text.Rd index c65afdb4..a25875b2 100644 --- a/man/html_text.Rd +++ b/man/html_text.Rd @@ -47,17 +47,17 @@ html <- minimal_html( # html_text() returns the raw underlying text, which includes whitespace # that would be ignored by a browser, and ignores the
    -html \%>\% html_element("p") \%>\% html_text() \%>\% writeLines() +html |> html_element("p") |> html_text() |> writeLines() # html_text2() simulates what a browser would display. Non-significant # whitespace is collapsed, and
    is turned into a line break -html \%>\% html_element("p") \%>\% html_text2() \%>\% writeLines() +html |> html_element("p") |> html_text2() |> writeLines() # By default, html_text2() also converts non-breaking spaces to regular # spaces: html <- minimal_html("

    x y

    ") -x1 <- html \%>\% html_element("p") \%>\% html_text() -x2 <- html \%>\% html_element("p") \%>\% html_text2() +x1 <- html |> html_element("p") |> html_text() +x2 <- html |> html_element("p") |> html_text2() # When printed, non-breaking spaces look exactly like regular spaces x1 diff --git a/man/read_html.Rd b/man/read_html.Rd index 7978875e..8ffc329e 100644 --- a/man/read_html.Rd +++ b/man/read_html.Rd @@ -62,21 +62,21 @@ starwars <- read_html("https://rvest.tidyverse.org/articles/starwars.html") # Then find elements that match a css selector or XPath expression # using html_elements(). In this example, each
    corresponds # to a different film -films <- starwars \%>\% html_elements("section") +films <- starwars |> html_elements("section") films # Then use html_element() to extract one element per film. Here # we the title is given by the text inside

    -title <- films \%>\% - html_element("h2") \%>\% +title <- films |> + html_element("h2") |> html_text2() title # Or use html_attr() to get data out of attributes. html_attr() always # returns a string so we convert it to an integer using a readr function -episode <- films \%>\% - html_element("h2") \%>\% - html_attr("data-id") \%>\% +episode <- films |> + html_element("h2") |> + html_attr("data-id") |> readr::parse_integer() episode } diff --git a/man/read_html_live.Rd b/man/read_html_live.Rd index 81489d0d..474ef478 100644 --- a/man/read_html_live.Rd +++ b/man/read_html_live.Rd @@ -36,15 +36,15 @@ on your machine. # When we retrieve the raw HTML for this site, it doesn't contain the # data we're interested in: static <- read_html("https://www.forbes.com/top-colleges/") -static \%>\% html_elements(".TopColleges2023_tableRow__BYOSU") +static |> html_elements(".TopColleges2023_tableRow__BYOSU") # Instead, we need to run the site in a real web browser, causing it to # download a JSON file and then dynamically generate the html: sess <- read_html_live("https://www.forbes.com/top-colleges/") sess$view() -rows <- sess \%>\% html_elements(".TopColleges2023_tableRow__BYOSU") -rows \%>\% html_element(".TopColleges2023_organizationName__J1lEV") \%>\% html_text() -rows \%>\% html_element(".grant-aid") \%>\% html_text() +rows <- sess |> html_elements(".TopColleges2023_tableRow__BYOSU") +rows |> html_element(".TopColleges2023_organizationName__J1lEV") |> html_text() +rows |> html_element(".grant-aid") |> html_text() } } diff --git a/man/session.Rd b/man/session.Rd index 4e0a296a..9295b126 100644 --- a/man/session.Rd +++ b/man/session.Rd @@ -68,19 +68,19 @@ and \code{\link[httr:status_code]{httr::status_code()}}. 
} \examples{ s <- session("http://hadley.nz") -s \%>\% - session_jump_to("hadley-wickham.jpg") \%>\% - session_jump_to("/") \%>\% +s |> + session_jump_to("hadley-wickham.jpg") |> + session_jump_to("/") |> session_history() -s \%>\% - session_jump_to("hadley-wickham.jpg") \%>\% - session_back() \%>\% +s |> + session_jump_to("hadley-wickham.jpg") |> + session_back() |> session_history() \donttest{ -s \%>\% - session_follow_link(css = "p a") \%>\% +s |> + session_follow_link(css = "p a") |> html_elements("p") } } diff --git a/vignettes/rvest.Rmd b/vignettes/rvest.Rmd index 55295061..4ecbef9e 100644 --- a/vignettes/rvest.Rmd +++ b/vignettes/rvest.Rmd @@ -145,10 +145,10 @@ Both functions take a document[^3] and a css selector: [^3]: Or another element, more on that shortly. ```{r} -html %>% html_element("h1") -html %>% html_elements("p") -html %>% html_elements(".important") -html %>% html_elements("#first") +html |> html_element("h1") +html |> html_elements("p") +html |> html_elements(".important") +html |> html_elements("#first") ``` Selectors can also be combined in various ways using **combinators**. @@ -174,8 +174,8 @@ html <- minimal_html("
  • pineapple
  • ") -html %>% - html_elements("li") %>% +html |> + html_elements("li") |> html_text2() ``` @@ -184,8 +184,8 @@ Note that the escaped ampersand is automatically converted to `&`; you'll only e You might wonder why I used `html_text2()`, since it seems to give the same result as `html_text()`: ```{r} -html %>% - html_elements("li") %>% +html |> + html_elements("li") |> html_text() ``` @@ -209,18 +209,18 @@ html <- minimal_html(" `html_text2()` gives you what you expect: two paragraphs of text separated by a blank line. ```{r} -html %>% - html_element("body") %>% - html_text2() %>% +html |> + html_element("body") |> + html_text2() |> cat() ``` Whereas `html_text()` returns the garbled raw underlying text: ```{r} -html %>% - html_element("body") %>% - html_text() %>% +html |> + html_element("body") |> + html_text() |> cat() ``` @@ -239,25 +239,25 @@ html <- minimal_html(" The value of an attribute can be retrieved with `html_attr()`: ```{r} -html %>% - html_elements("a") %>% +html |> + html_elements("a") |> html_attr("href") -html %>% - html_elements("img") %>% +html |> + html_elements("img") |> html_attr("src") ``` Note that `html_attr()` always returns a string, so you may need to post-process with `as.integer()`/`readr::parse_integer()` or similar. 
```{r} -html %>% - html_elements("img") %>% +html |> + html_elements("img") |> html_attr("width") -html %>% - html_elements("img") %>% - html_attr("width") %>% +html |> + html_elements("img") |> + html_attr("width") |> as.integer() ``` @@ -292,8 +292,8 @@ html <- minimal_html(" Because tables are a common way to store data, rvest includes the handy `html_table()` which converts a table into a data frame: ```{r} -html %>% - html_node("table") %>% +html |> + html_node("table") |> html_table() ``` @@ -319,27 +319,27 @@ html <- minimal_html(" If you try to extract name, species, and weight directly, you end up with one vector of length four and two vectors of length three, and no way to align them: ```{r} -html %>% html_elements("b") %>% html_text2() -html %>% html_elements("i") %>% html_text2() -html %>% html_elements(".weight") %>% html_text2() +html |> html_elements("b") |> html_text2() +html |> html_elements("i") |> html_text2() +html |> html_elements(".weight") |> html_text2() ``` Instead, use `html_elements()` to find a element that corresponds to each character, then use `html_element()` to extract each variable for all observations: ```{r} -characters <- html %>% html_elements("li") +characters <- html |> html_elements("li") -characters %>% html_element("b") %>% html_text2() -characters %>% html_element("i") %>% html_text2() -characters %>% html_element(".weight") %>% html_text2() +characters |> html_element("b") |> html_text2() +characters |> html_element("i") |> html_text2() +characters |> html_element(".weight") |> html_text2() ``` `html_element()` automatically fills in `NA` when no elements match, keeping all of the variables aligned and making it easy to create a data frame: ```{r} data.frame( - name = characters %>% html_element("b") %>% html_text2(), - species = characters %>% html_element("i") %>% html_text2(), - weight = characters %>% html_element(".weight") %>% html_text2() + name = characters |> html_element("b") |> html_text2(), + species = 
characters |> html_element("i") |> html_text2(), + weight = characters |> html_element(".weight") |> html_text2() ) ``` diff --git a/vignettes/starwars-dynamic.Rmd b/vignettes/starwars-dynamic.Rmd index 9d5e3c3e..6134cb29 100644 --- a/vignettes/starwars-dynamic.Rmd +++ b/vignettes/starwars-dynamic.Rmd @@ -10,10 +10,10 @@ vignette: > ```{r, eval = FALSE, echo = FALSE} library(magrittr) crawl_html <- function(x) { - x %>% - gsub("\r", "", .) %>% - gsub("\n\n", "

    ", .) %>% - gsub("\n", " ", .) %>% + x |> + gsub("\r", "", .) |> + gsub("\n\n", "

    ", .) |> + gsub("\n", " ", .) |> paste0("

    ", ., "

    ") } From 76da79d8b4e0522cde675ef19d00c7c101c2563c Mon Sep 17 00:00:00 2001 From: LuisDVA Date: Wed, 21 Aug 2024 17:41:43 -0600 Subject: [PATCH 2/2] update snapshot --- tests/testthat/_snaps/session.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/testthat/_snaps/session.md b/tests/testthat/_snaps/session.md index ae4f055b..cfab7cad 100644 --- a/tests/testthat/_snaps/session.md +++ b/tests/testthat/_snaps/session.md @@ -7,12 +7,12 @@ https://hadley.nz/ Status: 200 Type: text/html; charset=utf-8 - Size: 821273 + Size: 821905 Code expect_true(is.session(s)) s <- session_follow_link(s, css = "p a") Message - Navigating to . + Navigating to . Code session_history(s) Output