Scrape website for NBA stats and convert to acceptable form for JavaScript application
Load packages
library(tidyverse)
library(rvest)
library(magrittr)
Set base url and sub domains
## Root page that indexes players by letter; every per-letter page lives
## underneath this URL.
base_url <- "https://www.basketball-reference.com/players/"
webpage <- read_html(base_url)

# Text of the anchors that point at the per-letter player pages.
link_text <- html_text(html_nodes(webpage, "#content li > a"))

# Keep only the first 25 entries (there is no 'X' page) and lower-case them:
# the lower-case letter is the trailing path segment of each per-letter URL.
alphabet_links <- tolower(as.character(link_text)[1:25])
Loop through each link and scrape data of interest from pages
# Download every per-letter index page and stack the player tables into one
# data frame, `playerStats`.
#
# The tables are collected in a list and combined once at the end, instead of
# growing `playerStats` with rbind() on every iteration (which copies the
# accumulated rows each time). Iterating with lapply() is also safe when
# `alphabet_links` is empty, where `1:length(alphabet_links)` would have
# iterated over c(1, 0).
#
# NOTE(review): bind_rows() matches columns by name and fills absent columns
# with NA, whereas rbind() errors on mismatched columns -- confirm that every
# per-letter page exposes the same table layout.
letter_tables <- lapply(alphabet_links, function(letter) {
  page_url <- paste0(
    "https://www.basketball-reference.com/players/", letter, "/"
  )
  page <- read_html(page_url)
  # It is easiest to take the entire first table on the page and get rid of
  # unneeded columns later on.
  html_table(html_node(page, "table"))
})

# Combine all pages in a single step; an empty list yields an empty tibble.
playerStats <- bind_rows(letter_tables)
# Remove the position and college columns, which are not needed downstream.
playerStats <- select(playerStats, -Pos, -Colleges)
# Preview the first rows of what is left.
head(playerStats)
## Player From To Ht Wt Birth Date
## 1 Alaa Abdelnaby 1991 1995 6-10 240 June 24, 1968
## 2 Zaid Abdul-Aziz 1969 1978 6-9 235 April 7, 1946
## 3 Kareem Abdul-Jabbar* 1970 1989 7-2 225 April 16, 1947
## 4 Mahmoud Abdul-Rauf 1991 2001 6-1 162 March 9, 1969
## 5 Tariq Abdul-Wahad 1998 2003 6-6 223 November 3, 1974
## 6 Shareef Abdur-Rahim 1997 2008 6-9 225 December 11, 1976
Height is expressed as feet and inches separated by a hyphen (e.g. 6-10). This needs to be converted to a single numeric value in inches.
# Convert height from a "feet-inches" string (e.g. "6-10") to total inches.
#
# vapply() pins the result to one double per row, so a zero-row table yields
# numeric(0) instead of the empty list sapply() would return. The column is
# referenced as `Ht` so mutate() works on the piped data rather than reaching
# back to the global `playerStats`.
# Rows whose height is missing or lacks a second component become NA.
playerStats %<>%
  mutate(Ht = vapply(
    strsplit(as.character(Ht), "-", fixed = TRUE),
    function(parts) 12 * as.numeric(parts[1]) + as.numeric(parts[2]),
    numeric(1)
  ))
## Warning: package 'bindrcpp' was built under R version 3.4.4
# Derive the number of seasons played from the first and last season columns.
# One is added to account for how seasons are recorded on the site.
playerStats <- mutate(playerStats, careerLength = To - From + 1)
# The raw season bounds are no longer needed once the length is computed.
playerStats <- select(playerStats, -To, -From)
Convert Birth Date to just year
# Reduce `Birth Date` (e.g. "June 24, 1968") to the numeric birth year, stored
# in `birthDate`, then drop the original column.
#
# The year is taken as the third space-separated token; rows with fewer than
# three tokens become NA. vapply() guarantees a double vector even for a
# zero-row table, and the column is referenced directly so mutate() operates
# on the piped data instead of the global `playerStats`.
playerStats %<>%
  mutate(birthDate = vapply(
    strsplit(`Birth Date`, split = " "),
    function(tokens) as.numeric(tokens[3]),
    numeric(1)
  )) %>%
  select(-`Birth Date`)
Base R hack for creating a js file that can be read as an array in JavaScript. This will only write the dataframe contents, no headers, so you will need to keep track of what the variables represent.
# Render each player as one line of JavaScript array syntax, with every field
# emitted as a double-quoted string and a trailing comma after the row:
#   ["Player","Ht","Wt","careerLength","birthDate"],
# Values are inserted as-is (no escaping is applied).
stats <- paste0(
  '["', playerStats$Player,
  '","', playerStats$Ht,
  '","', playerStats$Wt,
  '","', playerStats$careerLength,
  '","', playerStats$birthDate,
  '"],'
)
# Show the first six rendered rows.
print(stats[1:6])
## [1] "[\"Alaa Abdelnaby\",\"82\",\"240\",\"5\",\"1968\"],"
## [2] "[\"Zaid Abdul-Aziz\",\"81\",\"235\",\"10\",\"1946\"],"
## [3] "[\"Kareem Abdul-Jabbar*\",\"86\",\"225\",\"20\",\"1947\"],"
## [4] "[\"Mahmoud Abdul-Rauf\",\"73\",\"162\",\"11\",\"1969\"],"
## [5] "[\"Tariq Abdul-Wahad\",\"78\",\"223\",\"6\",\"1974\"],"
## [6] "[\"Shareef Abdur-Rahim\",\"81\",\"225\",\"12\",\"1976\"],"
This method requires some post-processing in a text editor to add the variable name: open the file in Sublime Text and add ‘var stats =’ (wrapping the rows in an enclosing ‘[ ]’ so they form a single array), then add the file to the HTML script and refer to the data as ‘stats’. I won't use this method, even though it is slightly more space efficient, as I want to have a JSON object.
# Write the rendered rows, one per line, to stats.js in the working directory.
writeLines(text = stats, con = "stats.js")
This is another method that uses a specialized library specifically for creating and manipulating JSON files. First couple lines shown for brevity.
# Attach jsonlite with library() so a missing package fails immediately with a
# clear error; require() would only return FALSE with a warning and let the
# script fail later at toJSON().
library(jsonlite)

# Serialise the data frame to pretty-printed JSON text. `TRUE` is spelled out
# because `T` is an ordinary variable that can be reassigned.
json_stats <- toJSON(playerStats, pretty = TRUE)
Write out for use in app
# Prefix the JSON text with a JavaScript variable declaration so the file can
# be loaded by a script tag and the data referenced as `stats`.
js_source <- paste("var stats = ", json_stats)
write_lines(js_source, "nba_stats.js")