library(tidyverse)
library(rvest) #html webscraping
library(packageRank) #to retrieve CRAN download counts
3 CRAN Packages Webpage Scrape
The following code is not run here because it is time-consuming. The generated data is timestamped with its collection time and is produced by local R scripts that are pushed to GitHub.
The section below walks through a script that scrapes the names of packages in the ggplot environment from the CRAN page (https://cran.r-project.org/web/packages/available_packages_by_name.html) and then retrieves the cumulative historical CRAN download count for each.
Finally, the scraped data here is merged with the tidyverse gallery data generated from the previous chapter.
First import the necessary packages:
Read in the downloaded CRAN packages (alphabetical order) page html file.
# Parse the locally saved copy of the CRAN "Available Packages By Name" page.
df <- read_html("raw_data/CRAN_ Available Packages By Name.html")
Scrape all packages names from the page.
# Extract the text of every `span.CRAN` element, i.e. the package names.
# NOTE(review): `names` shadows base::names(); function calls still resolve to
# the base function, but a more specific variable name would be clearer.
names <- df |>
  html_elements("span.CRAN") |>
  html_text()
Filter for the names that start with gg by finding the index boundaries and store in gg_package_names.
# Beginning index for "gg" names: the first name that starts with "gg".
gg_start <- which(startsWith(names, "gg"))[1]
# End index for "gg" names: the element just before the first "gh" name.
gg_end <- which(startsWith(names, "gh"))[1] - 1

# Fail with a clear message if either boundary was not found, instead of the
# opaque "NA/NaN argument" error that `gg_start:gg_end` would raise.
stopifnot(
  "no package name starts with 'gg'" = !is.na(gg_start),
  "no package name starts with 'gh'" = !is.na(gg_end)
)

# Every name between the two boundaries (inclusive).
gg_package_names <- names[gg_start:gg_end]
Remove packages that start with gg but are not relevant to ggplot2. These names were determined manually and saved in the ‘non_ggplot_packages.csv’ file.
# First column of the manually curated list of "gg" packages to exclude.
remove_packages <- read.csv("raw_data/non_ggplot_packages.csv")[[1]]

# Keep only the "gg" names that are not on the exclusion list.
ggplot_package_names <- gg_package_names[!gg_package_names %in% remove_packages]
Scrape all package descriptions from the page. The resulting vector elements alternate between a package name and its description (e.g. ‘ggplot2’, ‘Create Elegant Data Visualisations Using the Grammar of Graphics’, ‘ggplot2.utils’, etc.), and is offset by a “” element between every letter change (e.g. between the end of all ‘a’ names and start of ‘b’ names).
# Returns a vector that alternates between package names and their
# descriptions: the text of every table cell inside `div.container`.
# NOTE(review): this variable shares its name with dplyr::desc(); calls such as
# `desc(x)` still resolve to the function, but the overlap is easy to misread.
desc <- df |>
  html_elements("div.container") |>
  html_elements("table") |>
  html_elements("tbody") |>
  html_elements("tr") |>
  html_elements("td") |>
  html_text()
Filter for only the packages with names that start with ‘gg’ by identifying index boundaries, and remove package names, keeping only descriptions, by retaining every other element to store in new vector.
# Beginning index for "gg" names in the name/description vector.
first_gg <- which(startsWith(desc, "gg"))[1]
# End index: the element just before the first cell starting with "gh".
end_gg <- which(startsWith(desc, "gh"))[1] - 1

# Fail with a clear message if either boundary was not found.
stopifnot(
  "no scraped cell starts with 'gg'" = !is.na(first_gg),
  "no scraped cell starts with 'gh'" = !is.na(end_gg)
)

desc_gg <- desc[first_gg:end_gg]

# Select only descriptions (drop package names) by keeping every other element,
# starting with the second.
gg_descriptions <- desc_gg[c(FALSE, TRUE)]
Remove descriptions of packages starting with gg but not relevant to ggplot2. Found manually and saved in ‘non_ggplot_desc.csv’ file.
# Replace line breaks with spaces so descriptions match the csv file formatting.
gg_descriptions <- gsub("\n", " ", gg_descriptions, fixed = TRUE)

# First column of the manually curated list of descriptions to exclude.
remove_desc <- read.csv("raw_data/non_ggplot_desc.csv")[[1]]

# Keep only the descriptions that are not on the exclusion list.
ggplot_descriptions <- gg_descriptions[!gg_descriptions %in% remove_desc]
The following block of code finds additional ggplot packages that do not start with gg. The names and descriptions are saved in csv files that can be read in for the future.
# All scraped cells (names and descriptions) outside the "gg" range.
non_gg_desc <- desc[c(1:(first_gg - 1), (end_gg + 1):length(desc))]

# Indices at which either the description or the name contains "ggplot";
# the first hit is dropped since it is a duplicate.
gg_indices <- grep("ggplot", non_gg_desc, fixed = TRUE)[-1]

# Remove a known duplicate index where name and description both contain
# "ggplot". setdiff() is a no-op when the index is absent, whereas
# `x[-match(...)]` would collapse the whole vector to NA in that case.
gg_indices <- setdiff(gg_indices, 30865)

# Manually found indices at which the *name* contains "ggplot"; shift each one
# forward by one so that it points at the matching description instead.
replace <- c(13588, 20251, 29832)
pos <- match(replace, gg_indices)
pos <- pos[!is.na(pos)] # ignore indices that are not present
gg_indices[pos] <- gg_indices[pos] + 1

# Names sit one element before each description.
add_ggplot_names <- non_gg_desc[gg_indices - 1]
add_ggplot_desc <- non_gg_desc[gg_indices]

# Save the additional names and descriptions for future reference.
write_csv(
  data.frame(names = add_ggplot_names, indices = gg_indices - 1),
  "raw_data/add_ggplot_names.csv"
)
write_csv(
  data.frame(desc = add_ggplot_desc, indices = gg_indices),
  "raw_data/add_ggplot_desc.csv"
)
Add the additional ggplot package names and descriptions found and saved in the code block above.
# Additional (non-"gg") ggplot package names and descriptions saved earlier;
# only the first column of each file is used.
add_gg_names <- read.csv("raw_data/add_ggplot_names.csv")[[1]]
add_gg_desc <- read.csv("raw_data/add_ggplot_desc.csv")[[1]]

# Append them to the "gg" names and descriptions.
all_ggplot_packages <- c(ggplot_package_names, add_gg_names)
all_ggplot_desc <- c(ggplot_descriptions, add_gg_desc)

# Names and descriptions are filtered independently, so verify that they still
# line up before they are paired with each other downstream.
stopifnot(
  "package names and descriptions differ in length" =
    length(all_ggplot_packages) == length(all_ggplot_desc)
)
To find the most current total historical download count, set a target_date of two days before today. Depending on the time of day, cranDownloads is updated to either 1 or 2 days previous to the current day.
# Date for which the cumulative download count is read: two days before today.
target_date <- Sys.Date() - 2
The function get_total_downloads below takes a package name and retrieves the cumulative count of that package’s CRAN downloads up to the set target_date, using the cranDownloads function of packageRank. It handles the error that arises when a package is not found on CRAN, and returns the result as a dataframe.
#' Cumulative CRAN download count of one package on a given date
#'
#' @param pkg Package name passed to `cranDownloads()`.
#' @param date Date whose cumulative count is returned. Defaults to the
#'   script-level `target_date`.
#' @return A one-row data frame with columns `package` and `downloads`;
#'   `downloads` is `NA` when `cranDownloads()` raises an error (e.g. the
#'   package is not found on CRAN) or when no row matches `date`.
get_total_downloads <- function(pkg, date = target_date) {
  # Requesting everything up to the year of `date` pulls the download history
  # through that year. Deriving the year from `date` (rather than hard-coding
  # 2025) keeps `date` inside the returned range in later years.
  to_year <- as.numeric(format(date, "%Y"))

  cd <- tryCatch(
    cranDownloads(packages = pkg, to = to_year),
    # If the request fails (e.g. package not found on CRAN), signal with NULL.
    error = function(e) NULL
  )
  if (is.null(cd)) {
    return(data.frame(package = pkg, downloads = NA_real_))
  }

  # The `cumulative` value on a particular date is the total download count
  # up to that date.
  logs <- cd$cranlogs.data
  hits <- logs$cumulative[which(logs$date == date)]
  count <- if (length(hits) == 0) NA_real_ else hits[[1]]

  data.frame(package = pkg, downloads = count)
}
Retrieve the historic CRAN download count for each package by mapping get_total_downloads across the scraped package names (all_ggplot_packages) and combining the returned dataframes into one dataframe. This will take a few minutes to complete.
# One row per package: call get_total_downloads() for every name and row-bind
# the resulting one-row data frames.
cran_packages <- map_dfr(all_ggplot_packages, get_total_downloads)
Store data in new dataframe with a column indicating CRAN page as the source and export as csv file with a timestamp indicating when data was generated.
# Attach the scraped descriptions and flag every row as coming from CRAN.
cran_packages$description <- all_ggplot_desc
cran_packages$CRAN <- TRUE

head(cran_packages)

# Comment line recording when the data was generated; written as the first
# line of the file, ahead of the csv content.
timestamp <- paste("# CRAN data generated on:", Sys.time())

data_lines <- format_csv(cran_packages)

# Combine and write
write_lines(c(timestamp, data_lines), "generated_data/cran_packages.csv")
Read in and merge the data on ggplot packages from the tidyverse gallery and from the cran webpage to store in a new dataframe, packages.
# Read both generated files, skipping the timestamp line at the top of each.
gallery_packages <- read_csv("generated_data/gallery_packages.csv", skip = 1)
cran_packages <- read_csv("generated_data/cran_packages.csv", skip = 1)

# Keep every package from either source.
# NOTE(review): `downloads` is part of the join key, so a package whose count
# differs between the two files would end up in two rows — confirm that both
# files are always generated against the same target date.
packages <- full_join(
  gallery_packages,
  cran_packages,
  by = c("package", "downloads")
)
Sort by download count and save as all_packages with timestamp.
# Sort by download count, highest first. dplyr::desc() is namespaced because
# the script also defines a character vector named `desc`.
sorted_packages <- arrange(packages, dplyr::desc(downloads))

# A package missing from one source has NA in that source's flag after the
# full join; turn those into FALSE.
sorted_packages$CRAN <- ifelse(
  is.na(sorted_packages$CRAN), FALSE, sorted_packages$CRAN
)
sorted_packages$gallery <- ifelse(
  is.na(sorted_packages$gallery), FALSE, sorted_packages$gallery
)

head(sorted_packages)

# Carry over the timestamp line (first line) of each input file.
gallery_ts <- readLines("generated_data/gallery_packages.csv", n = 1)
cran_ts <- readLines("generated_data/cran_packages.csv", n = 1)

data_lines <- format_csv(sorted_packages)

# Two timestamp lines followed by the csv content.
write_lines(
  c(gallery_ts, cran_ts, data_lines),
  "generated_data/all_packages.csv"
)
In practice, the above code is run on a local R script to prevent repetitive time consumption. The data on all ggplot2 environment packages can be read in as below.
library(tidyverse)
# Read in the data, skipping the two timestamp lines at the top of the file.
all_packages <- read.csv("generated_data/all_packages.csv", skip = 2)

head(all_packages)
package downloads stars gallery
1 ggplot2 159099092 NA FALSE
2 ggrepel 23345337 1228 TRUE
3 cowplot 17969538 712 TRUE
4 ggpubr 16228504 1154 TRUE
5 ggsci 12072118 676 TRUE
6 ggsignif 12007419 597 TRUE
description CRAN
1 Create Elegant Data Visualisations Using the Grammar of Graphics TRUE
2 Automatically Position Non-Overlapping Text Labels with 'ggplot2' TRUE
3 Streamlined Plot Theme and Plot Annotations for 'ggplot2' TRUE
4 'ggplot2' Based Publication Ready Plots TRUE
5 Scientific Journal and Sci-Fi Themed Color Palettes for 'ggplot2' TRUE
6 Significance Brackets for 'ggplot2' TRUE