Load relevant packages
require(rvest) # for web scraping
require(sf)
require(geojsonio)
require(jsonlite)
require(magrittr)
require(dplyr)
Establish the initial website to pull data from. Wikipedia is great for this because of how they structure their data and webpages.
# Address of the Wikipedia page that is scraped below
wiki_url <- "https://en.wikipedia.org/wiki/List_of_states_and_territories_of_the_United_States_by_population"
# Download and parse the page into an HTML document
webpage <- read_html(wiki_url)
Below I read in the entire table as it appears on the webpage at once and clean it in the R environment. This is easier than trying to take individual columns of interest.
# Collect every <table> node on the page
table <- html_nodes(webpage, "table")
# Parse each node into a table, treating the first row as the header
# (TRUE is spelled out because T is an ordinary, reassignable variable)
table <- html_table(table, header = TRUE)
# Only take first table as others are not needed
table <- table[[1]]
# Remove first two columns which are unneeded
table[, 1:2] <- NULL
# Remove commas and percent signs, then convert everything except the name
# column to numeric. Values that still contain other special characters
# (the column with the number of reps) cannot be parsed and become NA.
table[, -1] <- as.numeric(gsub(",|%", "", as.matrix(table[, -1])))
## Warning: NAs introduced by coercion
# only concerned with states, DC and Puerto Rico
table <- table[1:52, ]
Explore the data
# Print the first rows of the cleaned table
head(table)
## Name Population estimate, July 1, 2018[4]
## 1 California 39557045
## 2 Texas 28701845
## 3 Florida 21299325
## 4 New York 19542209
## 5 Pennsylvania 12807060
## 6 Illinois 12741080
## Census population, April 1, 2010 Percent increase from 2010-2018[note 1]
## 1 37252895 6.19
## 2 25146105 14.14
## 3 18804623 13.27
## 4 19378087 0.85
## 5 12702887 0.82
## 6 12831549 -0.71
## Total seats in the U.S. House of Representatives, 2013–2023
## 1 NA
## 2 NA
## 3 NA
## 4 NA
## 5 NA
## 6 NA
## Estimated population per electoral vote, 2018[note 2]
## 1 719219
## 2 755312
## 3 734459
## 4 673869
## 5 640353
## 6 637054
## Estimated population per House seat, 2018
## 1 746359
## 2 797273
## 3 788864
## 4 723786
## 5 711503
## 6 707838
## Census population per House seat, 2010
## 1 702885
## 2 698503
## 3 696468
## 4 717707
## 5 705715
## 6 712864
## Percent of the total U.S. population, 2018[note 3]
## 1 11.96
## 2 8.68
## 3 6.44
## 4 5.91
## 5 3.87
## 6 3.85
Clean data some more before transfer to JSON
# The fifth column (House seats) is entirely NA after the numeric conversion,
# so remove it
table[[5]] <- NULL
# Replace the long scraped headers with short names
names(table) <- c(
  "name",
  "pop2018",
  "pop2010",
  "percentChange",
  "popPerElectoralVote",
  "popPerHouseSeat",
  "censusPerHouseSeat",
  "perTotalPopulation"
)
Read in a CSV with electoral vote totals for each state. This could also be done with rvest and a different wiki page, but I knew I had this data from a past project.
# Read in the document that has electoral votes to add to the dataset,
# keeping only the state name and its electoral vote count.
# FALSE is spelled out because F is an ordinary, reassignable variable.
votingPower <- read.csv(
  "../data/voting power/Electoral Info.csv",
  stringsAsFactors = FALSE
) %>%
  select(state, Electoral.Votes)
# Give the vote column a name consistent with the other columns
votingPower %<>%
  rename(electoralVotes = Electoral.Votes)
# Join with data of electoral votes; rows of `table` without a match get NA
table <- left_join(table, votingPower, by = c("name" = "state"))
Need to change some of the data for Puerto Rico since it is an outlier in a lot of regards. Some values are added to Puerto Rico just so it can be seen on a graph later on.
# Replace every remaining NA in the table with 0
# NOTE(review): presumably these are Puerto Rico's missing values - confirm
table[is.na(table)] <- 0
# arrange in order
table %<>% arrange(name)
# Adding value to the Puerto Rico Electoral Vote so it can be seen on the graph.
# The row is looked up by name rather than by a fixed row index so the
# assignment still hits Puerto Rico if the set of rows ever changes.
table$popPerElectoralVote[table$name == "Puerto Rico"] <- 30000
Calculate Electoral Votes based solely on Population including PR
# Sum the electoral votes over every row of the table
eVotes <- sum(table$electoralVotes)
# Total 2018 population over the same rows (printed for reference)
sum(table$pop2018)
## [1] 330489940
# Allocate the electoral vote total to each row in proportion to its share
# of the 2018 population, rounded to two decimals
table <- mutate(
  table,
  proportionalElectoral = round(eVotes * (pop2018 / sum(pop2018)), 2)
)
# Because each row is rounded, the allocations no longer sum exactly to eVotes
sum(table$proportionalElectoral)
## [1] 537.99
Write out for use as a JavaScript variable. During development I was able to write the file to the folder that my HTML was reading. This means I could make a change to the data in R, save the data here, and have that change show up on the voting project HTML. Once this was uploaded to a server the process is not as seamless.
# Write out for js usage: serialise the table as pretty-printed JSON and
# prefix it with a variable declaration so the file can be loaded as a script.
# TRUE is spelled out because T is an ordinary, reassignable variable.
writeLines(paste("var voting =", toJSON(table, pretty = TRUE), sep = " "), "../data/js/voting_update.js")
Read in state data. Data comes from Natural Earth.
# Load the admin-1 states/provinces layer from the local shapefile directory
states <- st_read(
  dsn = "../data/states",
  layer = "ne_10m_admin_1_states_provinces_lakes"
)
## Reading layer `ne_10m_admin_1_states_provinces_lakes' from data source `C:\Users\Max Blasdel\Desktop\web projects\projects\bubble_chart\data\states' using driver `ESRI Shapefile'
## Simple feature collection with 4593 features and 83 fields
## geometry type: MULTIPOLYGON
## dimension: XY
## bbox: xmin: -180 ymin: -90 xmax: 180 ymax: 83.6341
## epsg (SRID): 4326
## proj4string: +proj=longlat +datum=WGS84 +no_defs
# Keep the features whose country code is USA
us <- filter(states, adm0_a3 == "USA")
# Pull out the Puerto Rico feature by name
pr <- filter(states, name == "Puerto Rico")
# Stack Puerto Rico on top of the US features
us <- rbind(pr, us)
# Keep only the name attribute; all other attributes are not needed
us <- select(us, name)
Join some attributes that may be useful
# Attach the population table to the map features by state name
us <- left_join(us, table, by = "name")
## Warning: Column `name` joining factor and character vector, coercing into
## character vector
# I really just want some of the population data from the map.
# Columns are picked by name rather than by position so the selection does
# not depend on the order in which the join happens to place them.
us <- us %>% select(name, pop2018, pop2010, percentChange)
us %>% head()
## Simple feature collection with 6 features and 4 fields
## geometry type: MULTIPOLYGON
## dimension: XY
## bbox: xmin: -124.7346 ymin: 17.92292 xmax: -65.24462 ymax: 49.36949
## epsg (SRID): 4326
## proj4string: +proj=longlat +datum=WGS84 +no_defs
## name pop2018 pop2010 percentChange
## 1 Puerto Rico 3195153 3726157 -14.25
## 2 Minnesota 5611179 5303925 5.79
## 3 Washington 7535591 6724543 12.06
## 4 Idaho 1754208 1567652 11.90
## 5 Montana 1062305 989417 7.37
## 6 North Dakota 760077 672591 13.01
## geometry
## 1 MULTIPOLYGON (((-67.85587 1...
## 2 MULTIPOLYGON (((-95.16057 4...
## 3 MULTIPOLYGON (((-122.6533 4...
## 4 MULTIPOLYGON (((-117.0382 4...
## 5 MULTIPOLYGON (((-116.0482 4...
## 6 MULTIPOLYGON (((-104.0476 4...
Create separate objects for Alaska, Hawaii, Puerto Rico
# One single-feature object per region, each pulled from `us` by name
pr <- filter(us, name == "Puerto Rico")
ak <- filter(us, name == "Alaska")
hi <- filter(us, name == "Hawaii")
Convert to spatial object and then geojson object. Simple function to do both operations.
# Convert an object to GeoJSON in two steps.
#
# object: the object to convert; it is passed to sf::as_Spatial(), so it is
#         expected to be something that function accepts.
# Returns the result of geojsonio::geojson_json() applied to the Spatial
# version of `object`.
toGeoJSON <- function(object) {
  geojsonio::geojson_json(sf::as_Spatial(object))
}
Convert each object to a GeoJSON.
# Each object is replaced by its GeoJSON form. The four conversions are
# independent of one another, so the order does not matter.
hi <- toGeoJSON(hi)
ak <- toGeoJSON(ak)
pr <- toGeoJSON(pr)
us <- toGeoJSON(us)
Write out for js usage
# Each GeoJSON object is written to its own file, prefixed with a variable
# declaration so the file can be loaded as a script.
writeLines(text = paste("var states = ", us), con = "../data/js/states.js")
writeLines(text = paste("var ak = ", ak), con = "../data/js/alaska.js")
writeLines(text = paste("var hi = ", hi), con = "../data/js/hawaii.js")
writeLines(text = paste("var pr = ", pr), con = "../data/js/puertorico.js")