This lab seeks to replicate and improve the New York Times’ Migration Graphs, focusing in particular on the migration in and out of the District of Columbia between the 1800s and early 2000s.
library(tidyverse)
library(forcats)
full_data <- read.csv("DCMigration_full.csv")
northeast_states<- c("Connecticut", "Maine", "Massachusetts", "New Hampshire", "Rhode Island", "Vermont", "New Jersey", "New York", "Pennsylvania")
midwest_states <- c("Illinois", "Indiana", "Michigan", "Ohio", "Wisconsin", "Iowa", "Kansas", "Minnesota", "Missouri", "Nebraska", "North Dakota", "South Dakota")
south_states <- c("Delaware", "Florida", "Georgia", "Maryland", "North Carolina", "South Carolina", "Virginia", "District of Columbia", "West Virginia", "Alabama", "Kentucky", "Mississippi", "Tennessee", "Arkansas", "Louisiana", "Oklahoma", "Texas")
west_states <- c("Arizona", "Colorado", "Idaho", "Montana", "Nevada", "New Mexico", "Utah", "Wyoming", "Alaska", "California", "Hawaii", "Oregon", "Washington")
dc_data <- mutate(full_data, region=ifelse(residence %in% northeast_states, "Northeast",
ifelse(residence %in% midwest_states, "Midwest",
ifelse(residence %in% west_states, "West",
ifelse(residence %in% south_states & !(residence %in% c("District of Columbia", "Maryland", "Virginia")), "Other Southern States",
ifelse(residence == "Maryland","Maryland",
ifelse(residence == "Virginia", "Virginia",
ifelse(residence == "District of Columbia","Stayed in DC","Other"))))))))
dc_data_sum_prop <- summarize(group_by(dc_data,YEAR),
prop_stayed=mean(region=="Stayed in DC"),
prop_maryland=mean(region=="Maryland"),
prop_virginia=mean(region=="Virginia"),
prop_south=mean(region=="Other Southern States"),
prop_west=mean(region=="West"),
prop_midwest=mean(region=="Midwest"),
prop_northeast=mean(region=="Northeast"),
prop_other=mean(region=="Other"))
dc_data_sum_num <- summarize(group_by(dc_data,YEAR),
num_stayed=sum(region=="Stayed in DC"),
num_maryland=sum(region=="Maryland"),
num_virginia=sum(region=="Virginia"),
num_south=sum(region=="Other Southern States"),
num_west=sum(region=="West"),
num_midwest=sum(region=="Midwest"),
num_northeast=sum(region=="Northeast"),
num_other=sum(region=="Other"))
dc_data_sum <- left_join(dc_data_sum_num, dc_data_sum_prop, by="YEAR")
write.csv(dc_data_sum, "DCMigration_v2.csv", row.names=F)
#write.csv(dc_data, "DCMigration_v3.csv", row.names=F)
DCMigration <- read.csv("DCMigration_v2.csv")
DCMigration <- select(DCMigration, -starts_with("num"))
DCMigration <- select(DCMigration, YEAR, stayed=prop_stayed, maryland=prop_maryland, virginia=prop_virginia, west=prop_west, south=prop_south, midwest=prop_midwest, northeast=prop_northeast)
DCMigration.gather <- gather(DCMigration,-YEAR, key=region, value=propMoved)
DCMigration.gather$region = factor(DCMigration.gather$region, levels=unique(DCMigration.gather$region))
ggplot(DCMigration.gather, aes(x=YEAR, y=propMoved, fill=region)) + geom_bar(stat="identity")+labs(title="Stacked barplot of population movement by region", x="Year", y="Proportion of people who Moved")
The following graph shows several line graphs of the proportion of people who have moved by region.
ggplot(DCMigration.gather) + geom_line(aes_string(x=DCMigration.gather$YEAR, y=DCMigration.gather$propMoved, color="region"))+labs(title="Line graphs of population movement by region", x="Year", y="Proportion of people who Moved")
The following is the recreation of a stream diagram from the NYT article, including the code to manipulate the data.
eps <- .Machine$double.eps
DCMigration.gather$lowHeight <- NA
DCMigration.gather$highHeight <- NA
DCMigration.gather[DCMigration.gather$region=="south","lowHeight"] <- 0
DCMigration.gather[DCMigration.gather$region=="south","highHeight"] <- DCMigration$south - eps
DCMigration.gather[DCMigration.gather$region=="northeast","lowHeight"] <- DCMigration$south + eps
DCMigration.gather[DCMigration.gather$region=="northeast","highHeight"] <- DCMigration$south + DCMigration$northeast - eps
DCMigration.gather[DCMigration.gather$region=="midwest","lowHeight"] <- DCMigration$south + DCMigration$northeast + eps
DCMigration.gather[DCMigration.gather$region=="midwest","highHeight"] <- DCMigration$south + DCMigration$northeast + DCMigration$midwest - eps
DCMigration.gather[DCMigration.gather$region=="west","lowHeight"] <- DCMigration$south + DCMigration$northeast + DCMigration$midwest + eps
DCMigration.gather[DCMigration.gather$region=="west","highHeight"] <- DCMigration$south + DCMigration$northeast + DCMigration$midwest + DCMigration$west - eps
DCMigration.gather[DCMigration.gather$region=="virginia","lowHeight"] <- DCMigration$south + DCMigration$northeast + DCMigration$midwest + DCMigration$west + eps
DCMigration.gather[DCMigration.gather$region=="virginia","highHeight"] <- DCMigration$south + DCMigration$northeast + DCMigration$midwest + DCMigration$west + DCMigration$virginia - eps
DCMigration.gather[DCMigration.gather$region=="maryland","lowHeight"] <- DCMigration$south + DCMigration$northeast + DCMigration$midwest + DCMigration$west + DCMigration$virginia + eps
DCMigration.gather[DCMigration.gather$region=="maryland","highHeight"] <- DCMigration$south + DCMigration$northeast + DCMigration$midwest + DCMigration$west + DCMigration$virginia + DCMigration$maryland - eps
DCMigration.gather[DCMigration.gather$region=="stayed","lowHeight"] <- DCMigration$south + DCMigration$northeast + DCMigration$midwest + DCMigration$west + DCMigration$virginia + DCMigration$maryland + eps
DCMigration.gather[DCMigration.gather$region=="stayed","highHeight"] <- DCMigration$south + DCMigration$northeast + DCMigration$midwest + DCMigration$west + DCMigration$virginia + DCMigration$maryland + DCMigration$stayed - eps
DCMigration <- DCMigration.gather
decade <- subset(DCMigration, YEAR %in% seq(1900,2000,by=10))
decade$midHeight <- (decade$lowHeight + decade$highHeight)/2
decade <- gather(decade, key=heightType, value=height, -c(YEAR,region,propMoved))
decade$labelx <- decade$YEAR
decade[decade$YEAR <= 1920,"labelx"] <- 1915
decade[decade$YEAR >= 2000,"labelx"] <- 2000
decade$labelEnd <- NA
decade[decade$region == "south","labelEnd"] <- "% of people \n born in DC lived in other states in the South"
decade[decade$region == "northeast","labelEnd"] <- "% of people \n born in DC lived in the Northeast"
decade[decade$region == "midwest","labelEnd"] <- "% of people \n born in DC lived in the Midwest"
decade[decade$region == "west","labelEnd"] <- "% of people born in DC \n lived in the West"
decade[decade$region == "maryland","labelEnd"] <- "% of people born in DC \n lived in Maryland"
decade[decade$region == "virginia","labelEnd"] <- "% of people born in DC \n lived in Virginia"
decade[decade$region == "stayed","labelEnd"] <- "% of people \n born in DC lived in DC"
g <- ggplot(DCMigration,aes(YEAR)) +
geom_point(data=decade,aes(x=YEAR,y=height),color="white")+
geom_ribbon(aes(ymin=lowHeight,ymax=highHeight,group=region,fill=region),color="white")+
theme_classic()+theme(axis.line.y=element_blank())+
geom_segment(data=decade, aes(x=YEAR,xend=YEAR),y=0,yend=1,color="grey90",alpha=0.1)+
scale_x_continuous(limits=c(1890,2025),breaks=seq(1900,2000,by=10),name="")+
scale_y_continuous(limits=c(0,1.01),name="")+theme(axis.ticks=element_blank(),axis.text.y=element_blank(),axis.line.x=element_blank())
cols <- c("stayed"="lightgrey","west"="gold2","midwest"="dodgerblue","south"="greenyellow","northeast"="orchid","maryland"="greenyellow","virginia"="greenyellow")
g <- g + scale_fill_manual(values=cols,guide=F)+labs(title="Where people born in DC have moved to:")
ribbonLabels <- c("Other States in the South","Other States in the Northeast","Other States in the Midwest","Other States in the West","Virginia", "Maryland", "Stayed in DC")
ribbonHeights <- sort(decade[decade$heightType=="midHeight"&decade$YEAR==1950,]$height)
g <- g + annotate(geom="text",x=rep(1950,7),y=ribbonHeights,label=ribbonLabels)
DC1900 <- DCMigration[DCMigration$YEAR==1900 & DCMigration$region == "stayed",]
DC1900height <- (DC1900$highHeight+DC1900$lowHeight)/2
DC2000 <- DCMigration[DCMigration$YEAR==2000 & DCMigration$region == "stayed",]
DC2000height <- (DC2000$highHeight+DC2000$lowHeight)/2
Maryland2000 <- DCMigration[DCMigration$YEAR==2000 & DCMigration$region == "maryland",]
Maryland2000height <- (Maryland2000$highHeight+Maryland2000$lowHeight)/2
Virginia2000 <- DCMigration[DCMigration$YEAR==2000 & DCMigration$region == "virginia",]
Virginia2000height <- (Virginia2000$highHeight+Virginia2000$lowHeight)/2
g <- g + annotate(geom="text",color="grey40",x=1900,y=DC1900height,hjust=0,label=paste(round(DC1900$propMoved,2)*100,"%",sep=""))+
annotate(geom="text",color="grey30",x=2000,y=DC2000height,hjust=1,label=paste(round(DC2000$propMoved,2)*100,"%",sep=""))+
annotate(geom="text",color="grey30",x=2000,y=Maryland2000height,hjust=1,label=paste(round(Maryland2000$propMoved,2)*100,"%",sep=""))+
annotate(geom="text",color="grey30",x=2000,y=Virginia2000height,hjust=1,label=paste(round(Virginia2000$propMoved,2)*100,"%",sep=""))
g