Script for collecting and preparing data for HTML voting project

Load relevant packages

# Attach packages with library() rather than require(): library() stops with an
# error when a package is missing, whereas require() only warns and returns FALSE,
# letting the script fail later at the first call that needs the package.
library(rvest) # for web scraping
library(sf) # for reading and manipulating the spatial data
library(geojsonio) # for converting spatial objects to GeoJSON
library(jsonlite) # for toJSON()
library(magrittr) # for the %>% and %<>% pipes
library(dplyr) # for select(), filter(), arrange() and the joins

Establish the initial website to pull data from. Wikipedia is great for this because of how they structure their data and webpages.

# Download and parse the Wikipedia page that lists states and territories by population
webpage <- "https://en.wikipedia.org/wiki/List_of_states_and_territories_of_the_United_States_by_population" %>%
  read_html()

Below I read in the entire table as it appears on the webpage at once and clean it in the R environment. This is easier than trying to take individual columns of interest.

# Collect every <table> node found on the page
table <- html_nodes(webpage, "table")

# Parse the nodes into a list of tables, using the first row as the header.
# TRUE is spelled out because T is an ordinary variable that can be reassigned.
table <- html_table(table, header = TRUE)

# Only take the first table as the others are not needed
table <- table[[1]]

# Remove the first two columns, which are unneeded
table[, 1:2] <- NULL

# Remove commas and percent signs, then coerce every column except the name to
# numeric. The column with the number of representatives contains other special
# characters, so it turns into NA here (hence the warning below).
table[, -1] <- as.numeric(gsub(",|%", "", as.matrix(table[, -1])))
## Warning: NAs introduced by coercion
# Only concerned with the states, DC and Puerto Rico (the first 52 rows)
table <- table[1:52, ]

Explore the data

# Print the first rows of the cleaned table
head(table)
##           Name Population estimate, July 1, 2018[4]
## 1   California                             39557045
## 2        Texas                             28701845
## 3      Florida                             21299325
## 4     New York                             19542209
## 5 Pennsylvania                             12807060
## 6     Illinois                             12741080
##   Census population, April 1, 2010 Percent increase from 2010-2018[note 1]
## 1                         37252895                                    6.19
## 2                         25146105                                   14.14
## 3                         18804623                                   13.27
## 4                         19378087                                    0.85
## 5                         12702887                                    0.82
## 6                         12831549                                   -0.71
##   Total seats in the U.S. House of Representatives, 2013–2023
## 1                                                          NA
## 2                                                          NA
## 3                                                          NA
## 4                                                          NA
## 5                                                          NA
## 6                                                          NA
##   Estimated population per electoral vote, 2018[note 2]
## 1                                                719219
## 2                                                755312
## 3                                                734459
## 4                                                673869
## 5                                                640353
## 6                                                637054
##   Estimated population per House seat, 2018
## 1                                    746359
## 2                                    797273
## 3                                    788864
## 4                                    723786
## 5                                    711503
## 6                                    707838
##   Census population per House seat, 2010
## 1                                 702885
## 2                                 698503
## 3                                 696468
## 4                                 717707
## 5                                 705715
## 6                                 712864
##   Percent of the total U.S. population, 2018[note 3]
## 1                                              11.96
## 2                                               8.68
## 3                                               6.44
## 4                                               5.91
## 5                                               3.87
## 6                                               3.85

Clean data some more before transfer to JSON

# The fifth column (representatives) holds only NA values, so remove it
table[[5]] <- NULL

# Replace the long headers from the webpage with short, simple names
names(table) <- c(
  "name", "pop2018", "pop2010", "percentChange",
  "popPerElectoralVote", "popPerHouseSeat", "censusPerHouseSeat", "perTotalPopulation"
)

Read in a CSV with electoral vote totals for each state. This could also be done with rvest and a different wiki page, but I knew I had this data from a past project.

# Read in the document that has the electoral votes to add to the dataset.
# Only the state name and vote count are kept; the vote column is renamed to
# electoralVotes directly inside select(). FALSE is spelled out because F is an
# ordinary variable that can be reassigned.
votingPower <- read.csv("../data/voting power/Electoral Info.csv", stringsAsFactors = FALSE) %>%
  select(state, electoralVotes = Electoral.Votes)

# Join the electoral votes onto the population data, matching table$name to
# votingPower$state; rows of table without a match get NA for electoralVotes
table <- left_join(table, votingPower, by = c("name" = "state"))

Need to change some of the data for Puerto Rico since it is an outlier in a lot of regards. Some values are added to Puerto Rico just so it can be seen on a graph later on.

# Change the Puerto Rico NA values to 0 (this replaces every NA in the table)
table[is.na(table)] <- 0

# Arrange rows in alphabetical order by name
table %<>% arrange(name)

# Give Puerto Rico a placeholder population-per-electoral-vote value so it can be
# seen on the graph. The row is matched by name rather than by a fixed row
# number, so the right row is updated even if the set of rows changes.
table$popPerElectoralVote[table$name == "Puerto Rico"] <- 30000

Calculate electoral votes based solely on population, including Puerto Rico

# Total number of electoral votes across all rows
eVotes <- sum(table$electoralVotes)
sum(table$pop2018)
## [1] 330489940
# Give each row a share of the electoral votes equal to its share of the 2018
# population, rounded to two decimals
table$proportionalElectoral <- round(eVotes * (table$pop2018 / sum(table$pop2018)), 2)

# Because each share is rounded, the proportional votes do not sum exactly to the total
sum(table$proportionalElectoral)
## [1] 537.99

Write out for use as a JavaScript variable. During development I was able to write the file to the folder that my HTML was reading. This meant I could make a change to the data in R, save the data here, and have that change show up in the voting project HTML. Once this was uploaded to a server, the process was not as seamless.

# Write out for JS usage: the table is serialised to pretty-printed JSON and
# prefixed with a variable assignment so the page can load the file as a script.
# paste() already separates its arguments with a single space by default.
writeLines(paste("var voting =", toJSON(table, pretty = TRUE)), "../data/js/voting_update.js")

Mapping Portion for the leaflet map

Read in state data. Data comes from Natural Earth.

# Load the admin-1 states/provinces layer from the local states data folder
states <- st_read(dsn = "../data/states", layer = "ne_10m_admin_1_states_provinces_lakes")
## Reading layer `ne_10m_admin_1_states_provinces_lakes' from data source `C:\Users\Max Blasdel\Desktop\web projects\projects\bubble_chart\data\states' using driver `ESRI Shapefile'
## Simple feature collection with 4593 features and 83 fields
## geometry type:  MULTIPOLYGON
## dimension:      XY
## bbox:           xmin: -180 ymin: -90 xmax: 180 ymax: 83.6341
## epsg (SRID):    4326
## proj4string:    +proj=longlat +datum=WGS84 +no_defs
# Keep only the features whose country code is USA
us <- filter(states, adm0_a3 == "USA")

# Pull out the Puerto Rico feature by name
pr <- filter(states, name == "Puerto Rico")

# Stack Puerto Rico on top of the US features
us <- rbind(pr, us)

# Drop every attribute except the name
us <- select(us, name)

Join some attributes that may be useful

# Attach the population attributes to each feature, matching on name
us <- left_join(us, table, by = "name")
## Warning: Column `name` joining factor and character vector, coercing into
## character vector
# I really just want some of the population data for the map. The columns are
# selected by name rather than by position so the result does not depend on the
# column order produced by the join.
us <- select(us, name, pop2018, pop2010, percentChange)
head(us)
## Simple feature collection with 6 features and 4 fields
## geometry type:  MULTIPOLYGON
## dimension:      XY
## bbox:           xmin: -124.7346 ymin: 17.92292 xmax: -65.24462 ymax: 49.36949
## epsg (SRID):    4326
## proj4string:    +proj=longlat +datum=WGS84 +no_defs
##           name pop2018 pop2010 percentChange
## 1  Puerto Rico 3195153 3726157        -14.25
## 2    Minnesota 5611179 5303925          5.79
## 3   Washington 7535591 6724543         12.06
## 4        Idaho 1754208 1567652         11.90
## 5      Montana 1062305  989417          7.37
## 6 North Dakota  760077  672591         13.01
##                         geometry
## 1 MULTIPOLYGON (((-67.85587 1...
## 2 MULTIPOLYGON (((-95.16057 4...
## 3 MULTIPOLYGON (((-122.6533 4...
## 4 MULTIPOLYGON (((-117.0382 4...
## 5 MULTIPOLYGON (((-116.0482 4...
## 6 MULTIPOLYGON (((-104.0476 4...

Create separate objects for Alaska, Hawaii, and Puerto Rico

# Extract one object per region by filtering the combined data on name
pr <- filter(us, name == 'Puerto Rico')
ak <- filter(us, name == 'Alaska')
hi <- filter(us, name == 'Hawaii')

Convert to spatial object and then geojson object. Simple function to do both operations.

#' Convert an object to GeoJSON
#'
#' Passes `object` through `sf::as_Spatial()` and hands the result to
#' `geojsonio::geojson_json()`.
#'
#' @param object The object to convert; it is given to `sf::as_Spatial()`.
#' @return The value returned by `geojsonio::geojson_json()`.
toGeoJSON <- function(object) {
  geojsonio::geojson_json(sf::as_Spatial(object))
}

Convert each object to a GeoJSON.

# Replace each object with its GeoJSON representation
hi <- toGeoJSON(object = hi)
ak <- toGeoJSON(object = ak)
pr <- toGeoJSON(object = pr)
us <- toGeoJSON(object = us)

Write out for js usage

# Each GeoJSON object is written to its own file as a JavaScript variable assignment
writeLines(text = paste("var states = ", us), con = "../data/js/states.js")
writeLines(text = paste("var ak = ", ak), con = "../data/js/alaska.js")
writeLines(text = paste("var hi = ", hi), con = "../data/js/hawaii.js")
writeLines(text = paste("var pr = ", pr), con = "../data/js/puertorico.js")