Data and R code for the analysis supporting this April 6, 2016 BuzzFeed News post on federal surveillance aircraft. Supporting files are in this GitHub repository.

Data preparation

BuzzFeed News obtained more than four months of aircraft transponder detections from the plane tracking website Flightradar24, covering August 17 to December 31, 2015 UTC, containing all data displayed on the site within a bounding box encompassing the continental United States, Alaska, Hawaii and Puerto Rico. Fightradar24 receives data from its network of ground-based receivers, supplemented by a feed from ground radars provided by the Federal Aviation Administration (FAA) with a five-minute delay.

After parsing from the raw files supplied by Flightradar24, the data included the following fields, for each transponder detection:

We saved the data for each individual plane in a separate comma-separated values (CSV) file, named for the aircraft’s adshex Mode-S code.

From the FAA registration database, we identified aircraft registered to the Department of Homeland Security and companies previously identified as front operations for the FBI. Using the Mode-S hexadecimal codes for these planes, we imported the corresponding files from the Flightradar24 data into R, and joined this to the FAA data to give the following additional fields:

This data is spread across three files feds1.csv, feds2.csv, and feds3.csv, to keep file sizes small, which we then processed to create further calculated fields.

# load required packages
library(readr)
library(dplyr)

# Set default timezone for session to UTC
Sys.setenv(TZ = "UTC")

# create data frame
feds <- data_frame()

# list files to load
files <- list.files("data/feds")

# load data
for (file in files) {
    tmp <- read_csv(paste0("data/feds/",file), col_types = list(
      adshex = col_character(),
      flight_id = col_character(),
      latitude = col_double(),
      longitude = col_double(),
      altitude = col_integer(),
      speed = col_integer(),
      squawk = col_character(),
      type = col_character(),
      timestamp = col_datetime(),
      name = col_character(),
      other_names1 = col_character(),
      other_names2 = col_character(),
      n_number = col_character(),
      serial_number = col_character(),
      mfr_mdl_code = col_character(),
      mfr = col_character(),
      model = col_character(),
      year_mfr = col_integer(),
      type_aircraft = col_integer(),
      agency = col_character()
    ))
    feds <- bind_rows(feds,tmp)
}
rm(tmp,file)

First, we created a new field loc_timestamp, converting the UTC timestamps to local standard times.

# load required packages
library(rgdal)
library(rgeos)

# load Natural Earth timezones shapefile
timezones <- readOGR("data/ne_10m_time_zones", "ne_10m_time_zones")
# what is its Coordinate Reference System (CRS)?
timezones@proj4string
## CRS arguments:
##  +proj=longlat +datum=WGS84 +no_defs +ellps=WGS84 +towgs84=0,0,0
# convert the feds data to a spatial points data frame, with the same CRS
feds_map <- as.data.frame(feds)
xy <- feds_map[,c(4,3)]
feds_map <- SpatialPointsDataFrame(coords = xy, data = feds_map, proj4string = CRS("+proj=longlat +datum=WGS84 +no_defs +ellps=WGS84 +towgs84=0,0,0"))

# spatial join of timezone data to feds_map (this will take some time to run)
feds_map <- raster::intersect(feds_map, timezones)

# function to use when adding/subtracting hours from datetime objects
hrs <- function(x) {
  y <- x * 3600
  return(y)
}

# create loc_timestamp field, and drop the fields added in the spatial join
# zone is the field that gives the offset from UTC in hours
feds <- as.data.frame(feds_map) %>%
  mutate(loc_timestamp = timestamp + hrs(zone)) %>%
  select(1:21,loc_timestamp)

We then created further fields from loc_timestamp, giving date, day of the week, hour on the 24-hour clock, and designating each day as a regular working day, or not (for weekends and federal holidays).

# create date, day_week, and hour fields
feds <- feds %>%
  mutate(date = as.Date(loc_timestamp),
         day_week = weekdays(loc_timestamp),
         hour = as.numeric(format(loc_timestamp, "%H")))

# create work_day field
feds_weekends <- feds %>%
  filter(day_week == "Saturday" | day_week == "Sunday") %>%
  mutate(work_day = "N")

hols <- as.data.frame(as.Date(c("2015-09-07","2015-10-12","2015-11-11","2015-11-26","2015-12-25")))
names(hols) <- "date"

feds_hols <- inner_join(feds, hols) %>%
  mutate(work_day = "N")

feds_working <- feds %>%
  filter(day_week != "Saturday" & day_week != "Sunday") %>%
  anti_join(hols) %>%
  mutate(work_day = "Y")

feds <- bind_rows(feds_weekends, feds_hols, feds_working)

The feds data frame now contained the following additional fields:

Registration documents obtained from the FAA revealed that the ownership of two helicopters was transferred from the DHS to an FBI front company in mid-December, so we corrected earlier records to reflect this switch.

# correcting ownership records for aircraft that switched registration from DHS to FBI
transfers <- feds %>%
  filter(date < "2015-12-14" & (n_number == "6971A" | n_number == "6982A"))
feds <- anti_join(feds, transfers)
transfers <- transfers %>%
  mutate(name = "DEPARTMENT OF HOMELAND SECURITY", agency = "dhs")
feds <- bind_rows(feds, transfers)

We performed a spatial join to the boundaries of U.S. states and territories, for analysis of the data by state, and to restrict calculations to planes observed over U.S. states and territories.

# load states/provinces shapefile
states <- readOGR("data/ne_10m_admin_1_states_provinces", "ne_10m_admin_1_states_provinces")

# filter for US states/territories only
states <- states [ which(states@data$sov_a3 == "US1"), ]
# what is its CRS?
states@proj4string
## CRS arguments:
##  +proj=longlat +datum=WGS84 +no_defs +ellps=WGS84 +towgs84=0,0,0
# convert the feds data to a spatial points data frame, with the same CRS
feds_map <- as.data.frame(feds)
xy <- feds_map[,c(4,3)]
feds_map <- SpatialPointsDataFrame(coords = xy, data = feds_map, proj4string = CRS("+proj=longlat +datum=WGS84 +no_defs +ellps=WGS84 +towgs84=0,0,0"))

# spatial join of states to feds_map (this will take some time to run)
states_feds_map <- raster::intersect(feds_map, states) 

# convert to data frame, and drop fields from the spatial join apart from state/territory name and abbreviation
states_feds <- as.data.frame(states_feds_map) %>%
  select(1:26, name.1, postal) %>%
  rename(state = name.1)

And we performed a spatial join to the boundaries of urban areas defined by the Census Bureau. This allowed subsquent analysis for transponder detections over any urban area.

# load urban areas shapefile
urban <-  readOGR("data/cb_2014_us_ua10_500k", "cb_2014_us_ua10_500k") 
# what is its CRS?
urban@proj4string
## CRS arguments:
##  +proj=longlat +datum=NAD83 +no_defs +ellps=GRS80 +towgs84=0,0,0
# convert the feds data to a spatial points data frame, with the same CRS
feds_map <- as.data.frame(feds)
xy <- feds_map[,c(4,3)]
feds_map <- SpatialPointsDataFrame(coords = xy, data = feds_map, proj4string = CRS("+proj=longlat +datum=NAD83 +no_defs +ellps=GRS80 +towgs84=0,0,0"))

# spatial join of urban to feds_map (this will take some time to run)
# note use of spatialEco package function here, rather than raster package function used above, which crashed on this spatial join
urban_feds_map <- spatialEco::point.in.poly(feds_map, urban) 

# convert to data frame, and drop some fields from the spatial join
urban_feds <- as.data.frame(urban_feds_map) %>%
  select(1:26,29,30)

The urban_feds data frame now contained the following additional fields:

We then filtered to remove data for 2015-08-16 and 2015-12-31, which will be incomplete after conversion to local times from UTC.

# filter to remove dates with incomplete data
feds_series <- feds %>%
  filter(date != as.Date("2015-08-16") & date != as.Date("2015-12-31"))
urban_feds_series <- urban_feds %>%
  filter(date != as.Date("2015-08-16") & date != as.Date("2015-12-31"))
states_feds_series <- states_feds %>%
  filter(date != as.Date("2015-08-16") & date != as.Date("2015-12-31"))

Analysis

# load required packages
library(DT)
library(ggplot2)
library(scales)
library(tidyr)

Aircraft detected, by agency and major type

# Count aircraft for each agency, fixed-wing and helicopters
feds_types <- states_feds_series %>%
  mutate(type_aircraft = paste0("type", type_aircraft)) %>%
  group_by(agency, type_aircraft) %>%
  summarise(count = n_distinct(adshex)) %>%
  spread(type_aircraft, count)
feds_types[is.na(feds_types)] <- 0
feds_types <- feds_types %>%
  mutate("Fixed-wing" = type4 + type5,
         Helicopters = type2 + type6,
         Agency = toupper(agency)) %>%
  select(8,6,7)
# DHS count includes 5 helicopters incorrectly classified in the FAA registration database as balloons (type_aircraft==2), fixed in the code above

Number of aircraft, by agency

(Includes two helicopters registered to the DHS until mid-December 2015, then transferred to an FBI front company, counted here under both agencies.)


Total number of flights and flight segments

# how many flight segments? How many days active for each plane?
feds_flights <- states_feds_series %>%
  group_by(agency, adshex) %>%
  summarise(flights = n_distinct(flight_id), days = n_distinct(date))

total_days <- feds_flights %>%
  ungroup() %>%
  group_by(agency) %>%
  summarise(flights = sum(flights),days = sum(days)) %>%
  mutate(agency = toupper(agency),
         flights = format(flights, big.mark = ","),
         days=format(days, big.mark = ","))
  
names(total_days) <- c("Agency","Flight segments","Minimum flights")

Number of flights, by agency

In the Flightradar24 data, there may be more than one flight segment within a flight, if a plane disappears from coverage for a period. In this table, “Minimum flights” is a count of distinct aircraft/date combinations, and is a conservative estimate of the total number of flights.


All detected aircraft

# count transponder detections by plane
all_planes <- states_feds_series %>%
  group_by(agency, n_number, mfr, model, type_aircraft) %>%
  summarise(count = n())
all_helicopters <- all_planes %>%
  filter(type_aircraft == 2 | type_aircraft == 6) %>%
  mutate(type = "Helicopter")
all_fixed <- all_planes %>%
  filter(type_aircraft == 4 | type_aircraft == 5) %>%
  mutate(type = "Fixed-wing")
all_planes <- bind_rows(all_helicopters, all_fixed) %>%
  arrange(desc(count)) %>%
  ungroup() %>%
  mutate(agency = toupper(agency),
         count = format(count, big.mark = ",")) %>%
  select(1:4,7,6)
names(all_planes) <- c("Agency","Registration","Manufacturer","Model","Type","Transponder detections")

Transponder detections, by aircraft


Total planes and transponder detections, by state

# for FBI aircraft
states_fbi <- states_feds_series %>%
  filter(agency == "fbi") %>%
  group_by(state, postal) %>%
  summarise(planes = n_distinct(adshex), detections = n()) %>%
  ungroup() %>%
  arrange(desc(detections)) %>%
  mutate(detections = format(detections, big.mark = ","))
names(states_fbi) <- c("State/territory","Abbrev.","Planes","Transponder detections")

FBI surveillance, by state


# for dhs aircraft
states_dhs <- states_feds_series %>%
  filter(agency == "dhs") %>%
  group_by(state, postal) %>%
  summarise(planes = n_distinct(adshex), detections = n()) %>%
  ungroup() %>%
  arrange(desc(detections)) %>%
  mutate(detections = format(detections, big.mark=","))
names(states_dhs) <- c("State/territory","Abbrev.","Planes","Transponder detections")

DHS surveillance, by state


# combined fbi and dhs aircraft
states_combined <- states_feds_series %>%
  group_by(state,postal) %>%
  summarise(planes = n_distinct(adshex), detections = n()) %>%
  ungroup() %>%
  arrange(desc(detections)) %>%
  mutate(detections=format(detections, big.mark = ","))
names(states_combined) <- c("State/territory","Abbrev.","Planes","Transponder detections")

Combined FBI and DHS surveillance, by state


FBI or DHS aircraft were detected at least fleetingly over every U.S. state, plus Washington DC and Puerto Rico — apart from Montana.

FBI and DHS surveillance by hour of the day

# count transponder detections per hour
feds_hours <- states_feds_series %>%
  group_by(agency, hour) %>%
  summarise(observations = n()) %>%
  filter(agency == "fbi" | agency == "dhs")

# plot
hours <- ggplot(feds_hours, aes(x=hour, y=observations, color = agency)) %>%
  + geom_line(size = 0.7) %>%
  + geom_point(size = 1.5) %>%
  + xlab("Hour (24-hour clock)") %>%
  + ylab("Total transponder detections") %>%
  + theme_minimal() %>%
  + scale_color_manual(values = c("#0F3B82", "#FF2900"),
                       labels = c("DHS", "FBI")) %>%
  + scale_y_continuous(labels = comma) %>%
  + theme(legend.position = "top",
          legend.title = element_blank())

FBI and DHS transponder detections, by hour of the day