3  CRAN Packages Webpage Scrape

The following code is not executed here because it takes too long to run. Instead, it is run from local R scripts, and the generated data, timestamped with the time of collection, is pushed to GitHub.

The section below walks through a script that scrapes the names of packages in the ggplot2 environment from the CRAN page (https://cran.r-project.org/web/packages/available_packages_by_name.html) and then retrieves the cumulative historical CRAN download count for each.

Finally, the scraped data here is merged with the tidyverse gallery data generated from the previous chapter.

First import the necessary packages:

library(tidyverse)
library(rvest) #html webscraping
library(packageRank) #to retrieve CRAN download counts

Read in the downloaded CRAN packages (alphabetical order) page html file.

# Parse the locally saved copy of the CRAN packages-by-name page
df <- "raw_data/CRAN_ Available Packages By Name.html" |>
  read_html()

Scrape all packages names from the page.

# Text of every element matching "span.CRAN" (one per package name)
names <- html_text(html_elements(df, "span.CRAN"))

Filter for the names that start with gg by finding the index boundaries and store in gg_package_names.

# Position of the first name that begins with "gg"
gg_start <- match(TRUE, startsWith(names, "gg"))
# Position just before the first name that begins with "gh"
gg_end <- match(TRUE, startsWith(names, "gh")) - 1

# Everything between the two boundaries (relies on the page's alphabetical order)
gg_package_names <- names[gg_start:gg_end]

Remove packages that start with gg but are not relevant to ggplot2. These names were determined manually and saved in the ‘non_ggplot_packages.csv’ file.

# Manually curated exclusion list, stored in the first column of the CSV
remove_packages <- read.csv("raw_data/non_ggplot_packages.csv")[, 1]

# Keep only the names that have no match in the exclusion list
ggplot_package_names <- gg_package_names[
  is.na(match(gg_package_names, remove_packages))
]

Scrape all package descriptions from the page. The elements of the resulting vector alternate between a package name and its description (e.g. ‘ggplot2’, ‘Create Elegant Data Visualisations Using the Grammar of Graphics’, ‘ggplot2.utils’, etc.), with an empty “” element inserted at every letter change (e.g. between the last of the ‘a’ names and the first of the ‘b’ names).

# Descend through the page one selector at a time, starting from the parsed
# document; the text of the resulting cells alternates between package names
# and their descriptions
desc <- Reduce(
  html_elements,
  c("div.container", "table", "tbody", "tr", "td"),
  df
) |>
  html_text()

Filter for only the packages with names that start with ‘gg’ by identifying index boundaries, and remove package names, keeping only descriptions, by retaining every other element to store in new vector.

# First cell whose text begins with "gg"
first_gg <- match(TRUE, startsWith(desc, "gg"))

# Last cell before the first one whose text begins with "gh"
end_gg <- match(TRUE, startsWith(desc, "gh")) - 1

desc_gg <- desc[first_gg:end_gg]

# Names sit at odd positions and descriptions at even ones; keep the even ones
gg_descriptions <- desc_gg[seq_along(desc_gg) %% 2 == 0]

Remove descriptions of packages starting with gg but not relevant to ggplot2. Found manually and saved in ‘non_ggplot_desc.csv’ file.

# Turn line breaks into spaces so the text matches the formatting of the CSV
gg_descriptions <- gsub("\n", " ", gg_descriptions, fixed = TRUE)

# Manually curated descriptions to exclude, stored in the first CSV column
remove_desc <- read.csv("raw_data/non_ggplot_desc.csv")[, 1]

# Keep only the descriptions that have no match in the exclusion list
ggplot_descriptions <- gg_descriptions[
  is.na(match(gg_descriptions, remove_desc))
]

The following block of code finds additional ggplot packages that do not start with gg. The names and descriptions are saved in csv files that can be read in for the future.

# Every cell outside the contiguous 'gg' range (names and descriptions alike).
# seq_len() keeps the leading part empty if the 'gg' range starts at cell 1.
non_gg_desc <- desc[c(seq_len(first_gg - 1), (end_gg + 1):length(desc))]

# Cells whose name or description contains 'ggplot'; the first hit is dropped
# because it is a duplicate
gg_indices <- grep("ggplot", non_gg_desc)[-1]

# The positions below were found by hand for one specific snapshot of the CRAN
# page. Stop with a clear message if they are no longer among the hits: a
# failed match() would otherwise produce NA subscripts and silently corrupt
# gg_indices.
duplicate_index <- 30865
name_indices <- c(13588, 20251, 29832)
stopifnot(
  "hard-coded 'ggplot' indices not found; re-derive them for this snapshot" =
    all(c(duplicate_index, name_indices) %in% gg_indices)
)

# Drop the known duplicate where name and description both contain 'ggplot'
gg_indices <- gg_indices[gg_indices != duplicate_index]

# Where the hit is on a package name, point at the description cell after it
name_hits <- match(name_indices, gg_indices)
gg_indices[name_hits] <- gg_indices[name_hits] + 1

# Each name sits in the cell immediately before its description
add_ggplot_names <- non_gg_desc[gg_indices - 1]
add_ggplot_desc <- non_gg_desc[gg_indices]

# Save the additional names and descriptions for future reference
write_csv(
  data.frame(names = add_ggplot_names, indices = gg_indices - 1),
  "raw_data/add_ggplot_names.csv"
)
write_csv(
  data.frame(desc = add_ggplot_desc, indices = gg_indices),
  "raw_data/add_ggplot_desc.csv"
)

Add the additional ggplot package names and descriptions found and saved in the code block above.

# Load the extra ggplot2-related names and descriptions (first column of each
# CSV file)
add_gg_names <- read.csv("raw_data/add_ggplot_names.csv")[, 1]
add_gg_desc <- read.csv("raw_data/add_ggplot_desc.csv")[, 1]

# Append them after the 'gg'-prefixed entries, names and descriptions in step
all_ggplot_packages <- append(ggplot_package_names, add_gg_names)
all_ggplot_desc <- append(ggplot_descriptions, add_gg_desc)

To find the most current total historical download count, set target_date to two days before today. Depending on the time of day, the most recent data available from cranDownloads is for either one or two days before the current day, so a two-day lag is always safe.

# Date for which the cumulative download counts are read: two days before today
target_date <- Sys.Date() - as.difftime(2, units = "days")

The function get_total_downloads below takes a package name and retrieves the cumulative count of that package’s CRAN downloads up to the set target_date, using the cranDownloads function of packageRank. It handles the error that arises when a package is not found on CRAN, and returns the result as a data frame.

# Total CRAN downloads of one package up to a given date.
#
# Arguments:
#   pkg    Package name (single string).
#   as_of  Date whose cumulative count is returned. Defaults to the global
#          `target_date`, so existing one-argument calls behave as before.
#
# Returns a one-row data frame with columns `package` and `downloads`.
# `downloads` is NA when cranDownloads() signals an error (e.g. the package is
# not on CRAN) or when the download log has no row for `as_of`.
get_total_downloads <- function(pkg, as_of = target_date) {
  as_of <- as.Date(as_of)

  # Passing only `to` pulls the whole download history up to that year. The
  # year is taken from `as_of` rather than hard-coded, so the requested range
  # always contains the date that is looked up below.
  end_year <- as.numeric(format(as_of, "%Y"))

  cd <- tryCatch(
    cranDownloads(packages = pkg, to = end_year),

    # Any failure is reported as "no data" instead of aborting the whole run
    error = function(e) NULL
  )

  count <- NA
  if (!is.null(cd)) {
    logs <- cd$cranlogs.data

    # The 'cumulative' value on a given date is the total count up to that date
    hit <- which(logs$date == as_of)
    if (length(hit) > 0) {
      count <- logs$cumulative[hit[1]]
    }
  }

  data.frame(package = pkg, downloads = count)
}

Retrieve the historical CRAN download count for each package by mapping get_total_downloads across the scraped package names (all_ggplot_packages) and combining the returned data frames into one. This will take a few minutes to complete.

# One single-row data frame per package, stacked into a single table
cran_packages <- all_ggplot_packages |>
  map(get_total_downloads) |>
  bind_rows()

Store data in new dataframe with a column indicating CRAN page as the source and export as csv file with a timestamp indicating when data was generated.

# Attach the descriptions and flag every row as coming from the CRAN page
cran_packages[["description"]] <- all_ggplot_desc
cran_packages[["CRAN"]] <- TRUE

head(cran_packages)

# Comment line recording when the data was generated; written as line 1
timestamp <- paste("# CRAN data generated on:", Sys.time())

# The table rendered as CSV text
data_lines <- format_csv(cran_packages)

# Timestamp first, then the CSV content
write_lines(c(timestamp, data_lines), "generated_data/cran_packages.csv")

Read in and merge the data on ggplot packages from the tidyverse gallery and from the cran webpage to store in a new dataframe, packages.

# Load both tables, skipping the first line of each file
gallery_packages <- read_csv("generated_data/gallery_packages.csv", skip = 1)
cran_packages <- read_csv("generated_data/cran_packages.csv", skip = 1)

# Keep every package that appears in either source.
# NOTE(review): `downloads` is part of the join key, so a package whose count
# differs between the two files would end up as two rows; confirm both files
# are generated for the same target date.
packages <- gallery_packages |>
  full_join(cran_packages, by = c("package", "downloads"))

Sort by download count and save as all_packages with timestamp.

# Most-downloaded packages first
sorted_packages <- packages |>
  arrange(desc(downloads))

# A missing source flag is recorded as FALSE rather than NA
sorted_packages$CRAN[is.na(sorted_packages$CRAN)] <- FALSE
sorted_packages$gallery[is.na(sorted_packages$gallery)] <- FALSE

head(sorted_packages)

# Carry over the first line of each input file so the merged file records both
gallery_ts <- readLines("generated_data/gallery_packages.csv", n = 1)
cran_ts <- readLines("generated_data/cran_packages.csv", n = 1)
data_lines <- format_csv(sorted_packages)

# Two header lines, then the CSV content
write_lines(
  c(gallery_ts, cran_ts, data_lines),
  "generated_data/all_packages.csv"
)

In practice, the above code is run on a local R script to prevent repetitive time consumption. The data on all ggplot2 environment packages can be read in as below.

library(tidyverse)

# Load the merged table, skipping the two timestamp lines at the top of the file
all_packages <- read.csv(file = "generated_data/all_packages.csv", skip = 2)

head(all_packages, n = 6)
   package downloads stars gallery
1  ggplot2 159099092    NA   FALSE
2  ggrepel  23345337  1228    TRUE
3  cowplot  17969538   712    TRUE
4   ggpubr  16228504  1154    TRUE
5    ggsci  12072118   676    TRUE
6 ggsignif  12007419   597    TRUE
                                                        description CRAN
1  Create Elegant Data Visualisations Using the Grammar of Graphics TRUE
2 Automatically Position Non-Overlapping Text Labels with 'ggplot2' TRUE
3         Streamlined Plot Theme and Plot Annotations for 'ggplot2' TRUE
4                           'ggplot2' Based Publication Ready Plots TRUE
5 Scientific Journal and Sci-Fi Themed Color Palettes for 'ggplot2' TRUE
6                               Significance Brackets for 'ggplot2' TRUE