Data and R code to reproduce the graphics in this July 30, 2016 BuzzFeed News article on trends in athletics and swimming performances. Supporting files are in this GitHub repository.
iaaf_toplists.csv
Data from the all-time top performance lists for senior outdoor track and field meets, maintained by the International Association of Athletic Federations (IAAF); top 100 performances for each event (top 50 for decathlon and heptathlon).
Contains the following fields:
Result
Time, distance, height, or points, depending on Event
.Competitor
Name of athlete, or team in the case of relays.Nat
Three-letter code for nationality of Competitor
.Venue
Date
Location and date of performance.Gender
Men or women.Event
Self-explanatory.StateDope
“GDR” for performances by East German athletes from Jan 1, 1974 onwards, during its state-sponsored doping program; “RUS” for performances by Russian athletes from Jan 1, 2012 onwards, during the doping program documented in this report commissioned by the World Anti-Doping Agency; else “NULL.”iaaf_wr.csv
As above, reflecting world record performances only. Contains the fields Result
,Date
,Event
, and Gender
.
world_records.csv
World records in swimming (long course, ratified records only) and track and field on July 29, 2016, sourced from FINA, the international swimming federation, and the IAAF.
Contains the following fields:
Gender
Event
As above.Year
Year in which record was set.Sport
Swimming or track and field.# load required packages
library(readr)
library(dplyr)
library(tidyr)
# load data
all_iaaf <- read_csv("data/iaaf_toplists.csv")
# convert StateDope to an ordered factor
all_iaaf$StateDope <- factor(all_iaaf$StateDope, levels = c("NULL","RUS","GDR"), ordered = T)
# short track events (time in seconds)
track_short_iaaf <- all_iaaf %>%
filter(grepl("100m|200m|400m|400h|110m|4x100m", Event) & nchar(Result) < 6) %>%
mutate(Seconds = as.numeric(Result),
Time_S = Seconds) %>%
select(-Seconds)
# medium/long track events (time in minutes)
track_medium_iaaf <- all_iaaf %>%
filter(grepl("800m|1500m|5000m|10000m|3000h|4x400m", Event)) %>%
mutate(Time = Result) %>%
separate(Time, c("Minutes","Seconds"), ":") %>%
mutate(Minutes = as.numeric(Minutes),
Seconds = as.numeric(Seconds),
Time_M = Minutes+Seconds/60) %>%
select(-Seconds, -Minutes)
# marathon (time in hours)
track_long_iaaf <- all_iaaf %>%
filter(Event == "marathon") %>%
mutate(Time = Result) %>%
separate(Time, c("Hours","Minutes","Seconds"), ":") %>%
mutate(Hours = as.numeric(Hours),
Minutes = as.numeric(Minutes),
Seconds = as.numeric(Seconds),
Time_H = Hours + Minutes/60 + Seconds/3600) %>%
select(-Seconds, -Minutes, -Hours)
# field events (distance/height in meters)
field_iaaf <- all_iaaf %>%
filter(grepl("throw|put|jump|vault", Event)) %>%
mutate(Meters = as.numeric(Result))
# combined events (points)
combined_iaaf <- all_iaaf %>%
filter(grepl("decathlon|heptathlon", Event)) %>%
mutate(Points = as.numeric(Result))
# load data
iaaf_wr <- read_csv("data/iaaf_wr.csv")
# short track events (time in seconds)
short_track_records <- iaaf_wr %>%
filter(grepl("100m|200m|400m|400h|110m", Event) & nchar(Result) < 6) %>%
mutate(Time_S = as.numeric(Result))
# medium/long track events (time in minutes)
medium_track_records <- iaaf_wr %>%
filter(grepl("800m|1500m|5000m|10000m|3000h|4x400m", Event)) %>%
mutate(Time = Result) %>%
separate(Time, c("Minutes","Seconds"), ":") %>%
mutate(Minutes = as.numeric(Minutes),
Seconds = as.numeric(Seconds),
Time_M = Minutes + Seconds/60) %>%
select(-Seconds, -Minutes)
# marathon (time in hours)
long_track_records <- iaaf_wr %>%
filter(Event == "marathon") %>%
mutate(Time = Result) %>%
separate(Time, c("Hours","Minutes","Seconds"), ":") %>%
mutate(Hours = as.numeric(Hours),
Minutes = as.numeric(Minutes),
Seconds=as.numeric(Seconds),
Time_H = Hours + Seconds/3600 + Minutes/60) %>%
select(-Seconds,-Minutes,-Hours)
# field events (distance/height in meters)
field_records <- iaaf_wr %>%
filter(grepl("throw|put|jump|vault", Event)) %>%
mutate(Meters = as.numeric(Result))
# combined events (points)
combined_records <- iaaf_wr %>%
filter(grepl("decathlon|heptathlon", Event)) %>%
mutate(Points = as.numeric(Result))
(Note that the calculated fields Time_H
and Time_M
, giving race times in hours and minutes, respectively, are decimals.)
The code that follows makes charts of the following form, here showing performances in the women’s 400 meters. Performances by East German athletes from Jan 1, 1974 onwards, during its state-sponsored doping program, shown in blue; performances by Russian athletes from Jan 1, 2012 onwards, during the doping program documented in this report, shown in red):
# load required packages
library(ggplot2)
library(scales)
genders <- c("men","women")
# short track events (time in seconds)
for (gender in genders) {
for (event in unique(track_short_iaaf$Event)) {
tmp <- track_short_iaaf %>%
filter(Event==event & Gender==gender)
tmp_wr <- short_track_records %>%
filter(Event==event & Gender==gender)
# the following three lines ensure that the stepped line for record progression will extend to the right of the chart
x <- data.frame(tmp_wr$Time_S[1],as.Date("2016-07-29"),event,gender)
names(x) <- c("Time_S","Date","Event","Gender")
tmp_wr <- bind_rows(tmp_wr,x)
tmp_chart <- ggplot() +
theme_minimal() +
theme(text=element_text(size=22)) +
theme(axis.title = element_text(size=16)) +
geom_point(aes(x=Date, y=Time_S, fill=StateDope),
shape = 21,
colour="black",
size=5,
alpha=0.5,
data=tmp) +
xlab("") +
ylab("Seconds")
if (!"RUS" %in% tmp$StateDope)
{
tmp_chart <- tmp_chart +
scale_fill_manual(values=c("gray", "blue"), guide=F)
}
else if (!"GDR" %in% tmp$StateDope)
{
tmp_chart <- tmp_chart +
scale_fill_manual(values=c("gray", "red"), guide=F)
}
else
{
tmp_chart <- tmp_chart +
scale_fill_manual(values=c("gray", "red", "blue"), guide=F)
}
if (!is.na(tmp_wr$Result[1]))
{
tmp_chart <- tmp_chart +
scale_y_reverse(limits = c(max(tmp$Time_S),min(tmp$Time_S))) +
scale_x_date(limits = c(min(tmp$Date),as.Date("2016-07-29")))
tmp_chart <- tmp_chart +
geom_step(aes(x=Date, y=Time_S), color="grey40", data = tmp_wr)
}
ggsave(file=paste0("charts/",gender," ",event,".jpg"), plot=tmp_chart, width = 8, height = 4, units = "in", dpi=300)
print(paste0(gender," ",event))
}
}
# medium/long track events (time in minutes)
for (gender in genders) {
for (event in unique(track_medium_iaaf$Event)) {
tmp <- track_medium_iaaf %>%
filter(Event==event & Gender==gender)
tmp_wr <- medium_track_records %>%
filter(Event==event & Gender==gender)
x <- data.frame(tmp_wr$Time_M[1],as.Date("2016-07-29"),event,gender)
names(x) <- c("Time_M","Date","Event","Gender")
tmp_wr <- bind_rows(tmp_wr,x)
tmp_chart <- ggplot() +
theme_minimal() +
theme(text=element_text(size=22)) +
theme(axis.title = element_text(size=16)) +
geom_point(aes(x=Date, y=Time_M, fill=StateDope),
shape = 21,
colour="black",
size=5,
alpha=0.5,
data=tmp) +
xlab("") +
ylab("Minutes")
if (!"RUS" %in% tmp$StateDope)
{
tmp_chart <- tmp_chart +
scale_fill_manual(values=c("gray", "blue"), guide=F)
}
else if (!"GDR" %in% tmp$StateDope)
{
tmp_chart <- tmp_chart +
scale_fill_manual(values=c("gray", "red"), guide=F)
}
else
{
tmp_chart <- tmp_chart +
scale_fill_manual(values=c("gray", "red", "blue"), guide=F)
}
if (!is.na(tmp_wr$Result[1]))
{
tmp_chart <- tmp_chart +
scale_y_reverse(limits = c(max(tmp$Time_M),min(tmp$Time_M))) +
scale_x_date(limits = c(min(tmp$Date),as.Date("2016-07-29")))
tmp_chart <- tmp_chart +
geom_step(aes(x=Date, y=Time_M), color="grey40", data = tmp_wr)
}
ggsave(file=paste0("charts/",gender," ",event,".jpg"), plot=tmp_chart, width = 8, height = 4, units = "in", dpi=300)
print(paste0(gender," ",event))
}
}
# marathon (time in hours)
for (gender in genders) {
for (event in unique(track_long_iaaf$Event)) {
tmp <- track_long_iaaf %>%
filter(Event==event & Gender==gender)
tmp_wr <- long_track_records %>%
filter(Event==event & Gender==gender)
x <- data.frame(tmp_wr$Time_H[1],as.Date("2016-07-29"),event,gender)
names(x) <- c("Time_H","Date","Event","Gender")
tmp_wr <- bind_rows(tmp_wr,x)
tmp_chart <- ggplot() +
theme_minimal() +
theme(text=element_text(size=22)) +
theme(axis.title = element_text(size=16)) +
geom_point(aes(x=Date, y=Time_H, fill=StateDope),
shape = 21,
colour="black",
size=5,
alpha=0.5,
data=tmp) +
xlab("") +
ylab("Hours")
if (!"RUS" %in% tmp$StateDope)
{
tmp_chart <- tmp_chart +
scale_fill_manual(values=c("gray", "blue"), guide=F)
}
else if (!"GDR" %in% tmp$StateDope)
{
tmp_chart <- tmp_chart +
scale_fill_manual(values=c("gray", "red"), guide=F)
}
else
{
tmp_chart <- tmp_chart +
scale_fill_manual(values=c("gray", "red", "blue"), guide=F)
}
if (!is.na(tmp_wr$Result[1]))
{
tmp_chart <- tmp_chart +
scale_y_reverse(limits = c(max(tmp$Time_H),min(tmp$Time_H))) +
scale_x_date(limits = c(min(tmp$Date),as.Date("2016-07-29")))
tmp_chart <- tmp_chart +
geom_step(aes(x=Date, y=Time_H), color="grey40", data = tmp_wr)
}
ggsave(file=paste0("charts/",gender," ",event,".jpg"), plot=tmp_chart, width = 8, height = 4, units = "in", dpi=300)
print(paste0(gender," ",event))
}
}
# field events (distance/height in meters)
for (gender in genders) {
for (event in unique(field_iaaf$Event)) {
tmp <- field_iaaf %>%
filter(Event==event & Gender==gender)
tmp_wr <- field_records %>%
filter(Event==event & Gender==gender)
x <- data.frame(tmp_wr$Meters[1],as.Date("2016-07-29"),event,gender)
names(x) <- c("Meters","Date","Event","Gender")
tmp_wr <- bind_rows(tmp_wr,x)
tmp_chart <- ggplot() +
theme_minimal() +
theme(text=element_text(size=22)) +
theme(axis.title = element_text(size=16)) +
geom_point(aes(x=Date, y=Meters, fill=StateDope), shape = 21, colour="black", size=5, alpha=0.5, data=tmp) +
xlab("") + ylab("Meters")
if (!"RUS" %in% tmp$StateDope)
{
tmp_chart <- tmp_chart +
scale_fill_manual(values=c("gray", "blue"), guide=F)
}
else if (!"GDR" %in% tmp$StateDope)
{
tmp_chart <- tmp_chart +
scale_fill_manual(values=c("gray", "red"), guide=F)
}
else
{
tmp_chart <- tmp_chart +
scale_fill_manual(values=c("gray", "red", "blue"), guide=F)
}
if (!is.na(tmp_wr$Meters[1]))
{
tmp_chart <- tmp_chart +
geom_step(aes(x=Date, y=Meters), color="grey40", data = tmp_wr)
tmp_chart <- tmp_chart +
scale_x_date(limits = c(min(tmp$Date),as.Date("2016-07-29"))) +
scale_y_continuous(limits = c(min(tmp$Meters),max(tmp$Meters)), labels = comma)
}
ggsave(file=paste0("charts/",gender," ",event,".jpg"), plot=tmp_chart, width = 8, height = 4, units = "in", dpi=300)
print(paste0(gender," ",event))
}
}
# combined events (points)
for (gender in genders) {
for (event in unique(combined_iaaf$Event)) {
tmp <- combined_iaaf %>%
filter(Event==event & Gender==gender)
tmp_wr <- combined_records %>%
filter(Event==event & Gender==gender)
x <- data.frame(tmp_wr$Points[1],as.Date("2016-07-29"),event,gender)
names(x) <- c("Points","Date","Event","Gender")
tmp_wr <- bind_rows(tmp_wr,x)
tmp_chart <- ggplot() +
theme_minimal() +
theme(text=element_text(size=22)) +
theme(axis.title = element_text(size=16)) +
geom_point(aes(x=Date, y=Points, fill=StateDope), shape = 21, colour="black", size=5, alpha=0.5, data=tmp) +
xlab("") + ylab("Points")
if (!"RUS" %in% tmp$StateDope)
{
tmp_chart <- tmp_chart +
scale_fill_manual(values=c("gray", "blue"), guide=F)
}
else if (!"GDR" %in% tmp$StateDope)
{
tmp_chart <- tmp_chart +
scale_fill_manual(values=c("gray", "red"), guide=F)
}
else
{
tmp_chart <- tmp_chart +
scale_fill_manual(values=c("gray", "red", "blue"), guide=F)
}
if (!is.na(tmp_wr$Points[1]))
{
tmp_chart <- tmp_chart +
geom_step(aes(x=Date, y=Points), color="grey40", data = tmp_wr)
tmp_chart <- tmp_chart +
scale_x_date(limits = c(min(tmp$Date),as.Date("2016-07-29"))) +
scale_y_continuous(limits = c(min(tmp$Points),max(tmp_wr$Points)),labels=comma)
}
ggsave(file=paste0("charts/",gender," ",event,".jpg"), plot=tmp_chart, width = 8, height = 4, units = "in", dpi=300)
print(paste0(gender," ",event))
}
}
wr <- read_csv("data/world_records.csv")
record_dates <- ggplot(wr, aes(x=Year, fill=Gender, color=Gender)) +
scale_color_manual(values=c("mediumblue","violetred")) +
scale_fill_manual(values=c("mediumblue","violetred")) +
geom_dotplot(stackgroups = T,
method = "histodot",
binwidth=1,
stackdir = "center") +
xlab("") +
ylab("") +
theme_minimal() +
facet_wrap(~Sport) +
coord_flip() +
scale_x_continuous(breaks= c(1985,1990,1995,2000,2005,2010,2015),
labels=c("1985","1990","1995","2000","2005","2010","2015")) +
theme(axis.title.x=element_blank(),
axis.text.x=element_blank(),
axis.ticks.x=element_blank(),
text=element_text(size=22),
legend.position = "bottom",
legend.title = element_blank(),
panel.grid.major.x = element_blank(),
panel.grid.minor.x = element_blank(),
panel.grid.minor.y = element_blank())
ggsave("charts/record_dates.svg", plot=record_dates, width = 8, height = 4)
plot(record_dates)