Purpose

Scrape website for NBA stats and convert to acceptable form for JavaScript application

Load packages

library(tidyverse)
library(rvest)
library(magrittr)

Set base url and sub domains

## Index page from which the per-letter player pages are linked
base_url <- "https://www.basketball-reference.com/players/"
webpage <- read_html(base_url)

# Take the text of each per-letter link, coerce it to character, keep only
# the first 25 entries (there is no 'X'), and lower-case the result so each
# value can serve as the trailing path segment of a letter page's URL.
alphabet_links <- webpage %>%
  html_nodes("#content li > a") %>%
  html_text() %>%
  as.character() %>%
  .[1:25] %>%
  tolower()

Loop through each link and scrape data of interest from pages

# Fetch the player table from every letter page and stack them into a single
# data frame. The tables are gathered in a list and bound once at the end
# instead of growing `playerStats` with rbind() on each iteration, and the
# letters are iterated directly, which avoids the 1:length() trap when the
# vector is empty.
# NOTE(review): requests are issued back-to-back; if the site throttles rapid
# requests a pause between fetches may be needed -- confirm.
letter_tables <- lapply(alphabet_links, function(letter) {
  letter_url <- paste0(base_url, letter, "/")
  letter_page <- read_html(letter_url)
  # it is easiest to take the entire table and get rid of unneeded data later
  html_table(html_node(letter_page, "table"))
})
playerStats <- bind_rows(letter_tables)

# drop the position and colleges columns
playerStats %<>% select(-Pos, -Colleges)
playerStats %>% head()
##                 Player From   To   Ht  Wt        Birth Date
## 1       Alaa Abdelnaby 1991 1995 6-10 240     June 24, 1968
## 2      Zaid Abdul-Aziz 1969 1978  6-9 235     April 7, 1946
## 3 Kareem Abdul-Jabbar* 1970 1989  7-2 225    April 16, 1947
## 4   Mahmoud Abdul-Rauf 1991 2001  6-1 162     March 9, 1969
## 5    Tariq Abdul-Wahad 1998 2003  6-6 223  November 3, 1974
## 6  Shareef Abdur-Rahim 1997 2008  6-9 225 December 11, 1976

Height is expressed as feet and inches separated by a hyphen (e.g. 6-10). This needs to be converted to a single numeric value in inches.

# Convert height from a "feet-inches" string (e.g. "6-10") to total inches.
# vapply() guarantees a numeric vector even when there are no rows, where
# sapply() would return a list; `Ht` is referenced as a column inside
# mutate() rather than reaching back into the outer data frame.
playerStats %<>%
  mutate(Ht = vapply(
    strsplit(as.character(Ht), "-", fixed = TRUE),
    function(parts) 12 * as.numeric(parts[1]) + as.numeric(parts[2]),
    numeric(1)
  ))
## Warning: package 'bindrcpp' was built under R version 3.4.4
# Derive the number of seasons played, then discard the two year columns it
# was computed from. One is added to account for how seasons are recorded on
# the site.
playerStats <- playerStats %>%
  mutate(careerLength = (To - From) + 1) %>%
  select(-To, -From)

Convert Birth Date to just year

# Keep only the birth year: the date string is split on spaces and the third
# piece is taken as the year (e.g. "June 24, 1968" -> 1968). Rows without a
# third piece become NA. vapply() pins the result to a numeric vector even
# for zero rows, and the source column is dropped in the same pipeline.
playerStats %<>%
  mutate(birthDate = vapply(
    strsplit(`Birth Date`, split = " "),
    function(parts) as.numeric(parts[3]),
    numeric(1)
  )) %>%
  select(-`Birth Date`)

Write out into a usable form for Chart.js

Base R hack for creating a js file that can be read as an array in JavaScript. This will only write the dataframe contents, no headers, so you will need to keep track of what the variables represent.

# Build one JavaScript array literal per player, every field quoted as a
# string and each line ending in a comma:
#   ["Player","Ht","Wt","careerLength","birthDate"],
stats <- with(
  playerStats,
  paste0(
    '["', Player,
    '","', Ht,
    '","', Wt,
    '","', careerLength,
    '","', birthDate,
    '"],'
  )
)
print(stats[1:6])
## [1] "[\"Alaa Abdelnaby\",\"82\",\"240\",\"5\",\"1968\"],"       
## [2] "[\"Zaid Abdul-Aziz\",\"81\",\"235\",\"10\",\"1946\"],"     
## [3] "[\"Kareem Abdul-Jabbar*\",\"86\",\"225\",\"20\",\"1947\"],"
## [4] "[\"Mahmoud Abdul-Rauf\",\"73\",\"162\",\"11\",\"1969\"],"  
## [5] "[\"Tariq Abdul-Wahad\",\"78\",\"223\",\"6\",\"1974\"],"    
## [6] "[\"Shareef Abdur-Rahim\",\"81\",\"225\",\"12\",\"1976\"],"

This method requires some post-processing in a text editor to add the variable name: open the file in Sublime Text and add ‘var stats =’, then add the file to the HTML script and refer to it as ‘stats’. I won’t use this method, even though it is slightly more space efficient, as I want to have a JSON object.

# Write the array-literal lines, one per player, to stats.js
writeLines(text = stats, con = "stats.js")

This is another method that uses a specialized library specifically for creating and manipulating JSON files. First couple lines shown for brevity.

# Load jsonlite with library() so a missing package stops the script here;
# require() would only return FALSE and let the failure surface later.
library(jsonlite)

# Serialise the player data frame to pretty-printed JSON. TRUE is spelled out
# because `T` is an ordinary variable that can be reassigned.
json_stats <- toJSON(playerStats, pretty = TRUE)

Write out for use in app

# Prefix the JSON with a JavaScript variable declaration and write it out so
# the file can be loaded by the app and referenced as `stats`.
paste("var stats = ", json_stats) %>%
  write_lines("nba_stats.js")