Baseball Graphics
Mallory Valente
February 14, 2018
library(tidyverse)
## -- Attaching packages ---------------------------- tidyverse 1.2.1 --
## v ggplot2 2.2.1 v purrr 0.2.4
## v tibble 1.4.1 v dplyr 0.7.4
## v tidyr 0.7.2 v stringr 1.2.0
## v readr 1.1.1 v forcats 0.2.0
## -- Conflicts ------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
# Load the four input tables. Paths are relative to the working directory.
# NOTE(review): file names look like the Lahman baseball database exports
# plus a hand-built inflation index -- confirm the data source.
player_data <- read.csv(file = "Master.csv", header = TRUE)
pitching_data <- read.csv(file = "Pitching.csv", header = TRUE)
salary_data <- read.csv(file = "Salaries.csv", header = TRUE)
inflation_index <- read.csv(file = "inflation.csv", header = TRUE)
Graph 1: Box Plot
# Graph 1: distribution of ERA by season.

# Treat the season as a categorical axis so geom_boxplot() draws one box per
# year. Later sections convert the summarised copies back to numeric.
pitching_data$yearID <- as.factor(pitching_data$yearID)

# Per-season summary of ERA (quartiles, median, range), ignoring missing ERAs.
# Every argument after the data is a name-value pair that becomes an output
# column, so `na.rm` must only be passed to the individual statistics -- a
# bare `na.rm = TRUE` here would add a constant column named `na.rm`.
summary_ERA <- summarize(
  group_by(pitching_data, yearID),
  Q1 = quantile(ERA, 0.25, na.rm = TRUE),
  median = median(ERA, na.rm = TRUE),
  Q3 = quantile(ERA, 0.75, na.rm = TRUE),
  min = min(ERA, na.rm = TRUE),
  max = max(ERA, na.rm = TRUE)
)

# na.rm = TRUE silences the "removed rows" warning for missing ERA values.
ggplot(pitching_data) +
  geom_boxplot(aes(x = yearID, y = ERA), na.rm = TRUE)
Graph 2: Line Plot
# Graph 2: median ERA per season as a line.
# Go through character first so a factor yields its year labels rather than
# its integer level codes; a numeric x is needed for a continuous line.
summary_ERA$yearID <- summary_ERA$yearID %>%
  as.character() %>%
  as.numeric()

ggplot(summary_ERA, aes(x = yearID, y = median)) +
  geom_line()
Graph 3: Ribbon Plot
# Graph 3: interquartile band (Q1 to Q3) of ERA per season with the median
# drawn on top. Uses the numeric yearID already stored in summary_ERA.
ggplot(summary_ERA, aes(x = yearID)) +
  geom_ribbon(aes(ymin = Q1, ymax = Q3), fill = "lightgreen") +
  geom_line(aes(y = median), color = "darkblue")
Graph 4: Line Plot
# Graph 4: share of pitchers per season with very high (>= 6) or very low
# (<= 3) ERA, restricted to pitcher rows with at least 10 games.

# Coerce games to numeric BEFORE filtering so the `>= 10` comparison is
# numeric regardless of how the CSV column was parsed. Going through
# character keeps the printed values if the column arrived as a factor.
pitching_data_filtered <- pitching_data
pitching_data_filtered$G <-
  as.numeric(as.character(pitching_data_filtered$G))
pitching_data_filtered <- filter(pitching_data_filtered, G >= 10)

# mean() of a logical vector is the proportion of TRUE values; rows with a
# missing ERA are excluded from both numerator and denominator.
summary_pitching_data <- summarize(
  group_by(pitching_data_filtered, yearID),
  ERA_six_proportion = mean(ERA >= 6, na.rm = TRUE),
  ERA_three_proportion = mean(ERA <= 3, na.rm = TRUE)
)

# Back to a numeric year (via character, in case yearID is a factor) so the
# lines are drawn on a continuous axis.
summary_pitching_data$yearID <-
  as.numeric(as.character(summary_pitching_data$yearID))

# Constant strings inside aes() create the legend entries that
# scale_color_manual() then maps to fixed colours.
ggplot(summary_pitching_data, aes(x = yearID)) +
  geom_line(aes(y = ERA_six_proportion, color = "6 or higher")) +
  geom_line(aes(y = ERA_three_proportion, color = "3 or under")) +
  scale_color_manual(
    values = c("6 or higher" = "red", "3 or under" = "darkblue"),
    name = "ERA"
  ) +
  theme_classic() +
  labs(
    x = "Year",
    y = "Proportion",
    title = "Proportion of Pitchers (pitching at least 10 games)\n With Low and High ERAs by Year"
  )
Graph 5: Ribbon and Line Plot
# Label every player by birthplace: exactly "USA" vs. anything else.
# A missing birthCountry yields NA rather than either label.
player_data_2 <- player_data %>%
  mutate(
    usa = ifelse(birthCountry == "USA", "Born in US", "Born outside USA")
  )

# Rename the first column of the index to match the join key used by the
# other tables. NOTE(review): this is positional -- it assumes the year is
# the first column of inflation.csv.
colnames(inflation_index)[1] <- "yearID"

# Show the first rows of the index.
inflation_index %>% head()
## yearID inflation2015
## 1 1980 2.88
## 2 1981 2.61
## 3 1982 2.46
## 4 1983 2.38
## 5 1984 2.28
## 6 1985 2.20
# Show the last rows of the index to see which season it ends on.
inflation_index %>% tail()
## yearID inflation2015
## 30 2009 1.10
## 31 2010 1.09
## 32 2011 1.05
## 33 2012 1.03
## 34 2013 1.02
## 35 2014 1.00
# Graph 5: inflation-adjusted salary quartiles per season, split by whether
# the player was born in the USA.

# Coerce both join keys to character so the two tables share one key type.
player_data_2$playerID <- as.character(player_data_2$playerID)
salary_data$playerID <- as.character(salary_data$playerID)

# One row per salary record, carrying the player's `usa` birthplace label.
salary_with_birthplace <- inner_join(
  salary_data, player_data_2,
  by = "playerID"
)

# Quartiles and range of salary for every season x birthplace group.
summary_salary <- summarize(
  group_by(salary_with_birthplace, yearID, usa),
  Q1 = quantile(salary, 0.25, na.rm = TRUE),
  median = median(salary, na.rm = TRUE),
  Q3 = quantile(salary, 0.75, na.rm = TRUE),
  min = min(salary, na.rm = TRUE),
  max = max(salary, na.rm = TRUE)
)

# All four join flavours are kept for comparison:
#   inner - only seasons present in both tables
#   left  - every salary season (index is NA where missing)
#   right - every index year (salary stats are NA where missing)
#   full  - union of both
summary_salary_inner <- inner_join(summary_salary, inflation_index, by = "yearID")
summary_salary_left <- left_join(summary_salary, inflation_index, by = "yearID")
summary_salary_right <- right_join(summary_salary, inflation_index, by = "yearID")
summary_salary_full <- full_join(summary_salary, inflation_index, by = "yearID")

# The printed index stops at 2014 (multiplier 1.00), so a 2015 season gets no
# multiplier from the join. NOTE(review): the index appears to be expressed
# in 2015 dollars, which makes the 2015 multiplier 1 -- confirm.
summary_salary_left[summary_salary_left$yearID == 2015, "inflation2015"] <- 1

# Scale every salary statistic by the season's inflation multiplier.
add_inflation_adjusted <- function(salary_summary) {
  mutate(
    salary_summary,
    median_inflation_adjusted = median * inflation2015,
    Q1_inflation_adjusted = Q1 * inflation2015,
    Q3_inflation_adjusted = Q3 * inflation2015,
    min_inflation_adjusted = min * inflation2015,
    max_inflation_adjusted = max * inflation2015
  )
}
summary_salary_inner <- add_inflation_adjusted(summary_salary_inner)
summary_salary_left <- add_inflation_adjusted(summary_salary_left)

# Plot from the LEFT join: the inner join drops every season that has no
# index row, which would discard the 2015 multiplier patched in above.
ggplot(summary_salary_left) +
  geom_ribbon(
    aes(
      x = yearID,
      ymin = Q1_inflation_adjusted,
      ymax = Q3_inflation_adjusted,
      fill = usa
    ),
    alpha = 0.4
  ) +
  geom_line(aes(x = yearID, y = median_inflation_adjusted, color = usa)) +
  scale_color_manual(
    values = c("Born in US" = "red", "Born outside USA" = "darkblue"),
    name = "Median Salary"
  ) +
  scale_y_continuous(labels = scales::dollar) +
  labs(
    y = "Annual Salary \n (Adjusted for Inflation)",
    x = "Year",
    title = "Salaries of Middle 50% of Earners in Major League Baseball"
  ) +
  scale_fill_manual(
    name = "Middle 50% of Earners",
    values = c("Born in US" = "red", "Born outside USA" = "turquoise1")
  ) +
  theme_classic()