Your Dumb Tweets Are Getting Flagged To People Trying To Stop School Shootings

Data and R code to recreate the text analysis and graphics in this Oct. 31, 2019 BuzzFeed News post on the social media alerting service offered to school districts by Social Sentinel, which monitors for potential threats to school safety. Supporting files are in this GitHub repository.

Timeline of deaths in school shootings and spending on Social Sentinel

We searched the GovSpend database for purchases of Social Sentinel’s services, restricting to clients identified as school districts. Data on recent school shootings is from the K-12 School Shooting Database. Circles denoting school shootings are scaled by the number of deaths in each incident.

# load required packages
library(tidyverse)
library(scales)
library(sf)

# load data
# load() restores objects under the names they were saved with, so nothing is
# assigned here. The code below expects `school_purchases` and
# `recent_school_shootings` to exist afterwards -- presumably one per file;
# confirm against the .RData files. Paths are relative to the working directory.
load("data/school_purchases.RData")
load("data/recent_school_shootings.RData")

# total spending per month
# Sum purchase prices within each year/month (missing prices are ignored), then
# build a first-of-the-month Date so the totals can sit on a date axis.
school_purchases_month <- school_purchases %>%
  group_by(year, month) %>%
  summarize(total = sum(price, na.rm = TRUE)) %>%
  # summarize() only peels off the last grouping level, so the result would
  # otherwise stay grouped by year; drop it so later verbs see all rows at once
  ungroup() %>%
  mutate(date = as.Date(paste0(year, "-", month, "-01")))

# chart
# Monthly spending as black columns. School shootings with at least one death
# are drawn as red circles along a reference line at y = 600000, with circle
# area scaled to the number killed.
ggplot(school_purchases_month, aes(x = date, y = total)) +
  geom_col(fill = "black", color = "white") +
  geom_hline(yintercept = 600000, size = 0.3, color = "red") +
  geom_point(data = subset(recent_school_shootings, killed > 0),
             aes(x = date, y = 600000, size = killed),
             color = "red",
             alpha = 0.4) +
  theme_minimal(base_size = 14, base_family = "Basier Square SemiBold") +
  scale_x_date(date_breaks = "1 year",
               date_labels = "%b %Y",
               limits = c(as.Date("2013-10-01"), as.Date("2019-12-01")),
               expand = c(0, 0)) +
  # the upper limit leaves headroom above the 600000 line for the circles
  scale_y_continuous(labels = dollar,
                     breaks = c(200000, 400000),
                     limits = c(0, 650000),
                     expand = c(0, 0)) +
  # guide = "none" is the supported spelling; guide = FALSE is deprecated
  scale_size_area(max_size = 12, guide = "none") +
  # the theme's own grid is removed below, so draw the baseline and the two
  # gridlines by hand
  geom_hline(yintercept = 0, size = 0.3) +
  geom_hline(yintercept = c(200000, 400000), size = 0.05) +
  # dotted markers for the two dates labelled "Parkland" and "Santa Fe" by the
  # annotations further down
  geom_vline(xintercept = as.Date("2018-02-14"), size = 0.5, color = "red", linetype = "dotted") +
  geom_vline(xintercept = as.Date("2018-05-18"), size = 0.5, color = "red", linetype = "dotted") +
  xlab("") +
  ylab("Spending per month") +
  theme(panel.grid = element_blank(),
        axis.ticks.x = element_line()) +
  # rotated labels sit just to the left of their dotted lines
  annotate("text", x = as.Date("2018-01-01"),
           y = 480000,
           angle = 90,
           label = "Parkland",
           family = "Basier Square SemiBold",
           size = 4,
           color = "red") +
  annotate("text", x = as.Date("2018-04-10"),
           y = 480000,
           angle = 90,
           label = "Santa Fe",
           family = "Basier Square SemiBold",
           size = 4,
           color = "red") +
  # caption for the row of circles
  annotate("text", x = as.Date("2015-01-01"),
           y = 540000,
           label = "Killed in school shootings",
           family = "Basier Square SemiBold",
           size = 4,
           color = "red")

Words used in flagged versus unflagged posts from the same Twitter users

Using the Twitter API, we collected data on 190 tweets that were flagged by Social Sentinel between Jul. 4 and Sept. 29, 2019, in alerts sent to school districts obtained by BuzzFeed News in public records requests. We recorded the dates (in UTC) of the first and last tweets flagged from each user, and then similarly collected data on all tweets from each user over a period from one day before the first flagged tweet to one day after the last. This gave a comparison group of 11,265 tweets. In the text analysis, we calculated the frequency of usage of each word in the flagged and unflagged data, after removing Twitter handles, hashtags, numbers, and very common words like “the,” “I’m,” and “at.” (We have removed screen names from the data to protect the users’ privacy.)

# load required package
library(tidytext)

# load data
# load() restores objects under their saved names. The code below uses
# `flag_language` and `custom_stopwords`; the latter is not defined anywhere in
# this script, so it presumably comes from this file too -- confirm.
load("data/flag_language.RData")

# regex for parsing tweets
# Matches what gets stripped from tweet text before tokenizing: URLs, the HTML
# entities &amp; &lt; &gt;, and the standalone retweet marker "RT".
# Both word boundaries need a doubled backslash: inside an R string literal a
# single "\b" is a backspace character, not the regex word-boundary escape, so
# the marker would never match.
replace_reg <- "https?://[^\\s]+|&amp;|&lt;|&gt;|\\bRT\\b"

# tokenize into words
# Produces one row per word/flag combination with its count `n`, sorted from
# most to least frequent. The top-20 selection further down relies on that order.
words <- flag_language %>%
  # remove urls, non ASCII characters etc; sub = "" drops whatever iconv cannot
  # represent in ASCII instead of returning NA for the whole string
  mutate(text = str_replace_all(text, replace_reg, ""),
         text = iconv(text, "latin1", "ASCII", sub = "")) %>%
  # discard tweets left empty by the cleaning
  filter(text != "") %>%
  unnest_tokens(word, text, token = "tweets") %>%
  # name the key explicitly rather than relying on the natural-join guess
  anti_join(stop_words, by = "word") %>%
  # custom_stopwords is not defined in this script; its columns are not visible
  # here, so the join key is left implicit -- presumably `word`, confirm
  anti_join(custom_stopwords) %>%
  # count(word, flag) returns an ungrouped result, unlike group_by() + count(),
  # which would leave every row in its own word/flag group
  count(word, flag, sort = TRUE) %>%
  # remove hashtags, handles, and anything containing a number
  filter(!grepl("@|#|[0-9]", word))

# calculate frequency of word use within category and select top 20 words in each
# `percent` is each word's share of all counted words with the same flag; `r` is
# the word's position within its flag, which follows the row order of `words`.
words_pc <- words %>%
  group_by(flag) %>%
  mutate(
    percent = n / sum(n) * 100,
    r = row_number()
  ) %>%
  # keep the first 20 rows of each flag
  slice(seq_len(20)) %>%
  ungroup() %>%
  arrange(desc(percent))

# chart
# Horizontal bar chart of the top words, one facet per `flag` value. Each word
# is written beside its bar (at y = -1) rather than on the axis, so the discrete
# axis breaks are switched off. reorder(r, -r) puts rank 1 last on the discrete
# axis, which coord_flip() then draws at the top.
ggplot(words_pc, aes(x = reorder(r, -r), y = percent)) +
  geom_col(aes(fill = flag), color = "white") +
  # x is inherited from the plot-level aes(), so it is not repeated here
  geom_text(aes(label = word,
                y = -1,
                color = flag),
            size = 4.5,
            family = "Basier Square SemiBold") +
  facet_wrap(~flag) +
  xlab("") +
  # guide = "none" is the supported spelling; guide = FALSE is deprecated
  scale_fill_manual(values = c("red", "black"), guide = "none") +
  scale_color_manual(values = c("red", "black"), guide = "none") +
  scale_x_discrete(breaks = NULL) +
  # the lower limit of -2 makes room for the word labels placed at y = -1
  scale_y_continuous(limits = c(-2, 3.5), breaks = c(1, 2, 3)) +
  coord_flip() +
  theme_minimal(base_size = 16, base_family = "Basier Square SemiBold") +
  theme(panel.grid.minor = element_blank())

Map of school shootings since 2014

We had earlier geocoded the school shootings data for mapping.

# load required package
library(tidycensus)

# labels for Parkland and Santa Fe
# Rows whose city matches either name carry the city as their label; every
# other row gets a missing label, so only those incidents are named on the map.
recent_school_shootings <- mutate(
  recent_school_shootings,
  label = if_else(grepl("Parkland|Santa Fe", city), city, NA_character_)
)

# get states basemap from Census Bureau
# The ACS variable requested (B19013_001, median household income) is not used
# by the map below, which fills every state with one colour; the call is made
# for the state geometries it returns.
states <- get_acs(
  geography = "state",
  variables = "B19013_001",
  geometry = TRUE,
  shift_geo = TRUE
)

# map
# Grey state basemap with one red circle per school shooting that had at least
# one death, area scaled to the number killed, plus text labels for the
# incidents given a non-missing `label`.
ggplot(states) +
  geom_sf(color = "white", size = 0.3, fill = "#cccccc") +
  geom_sf(data = subset(recent_school_shootings, killed > 0),
          aes(size = killed),
          color = "red",
          alpha = 0.4) +
  # most rows have an NA label on purpose; na.rm = TRUE drops them quietly
  # instead of warning about removed rows. The nudges offset the text from its
  # circle, in the map's coordinate units (presumably metres -- confirm).
  geom_sf_text(data = subset(recent_school_shootings, killed > 0),
               aes(label = label),
               color = "red",
               nudge_x = 300000,
               nudge_y = -200000,
               family = "Basier Square SemiBold",
               na.rm = TRUE) +
  # guide = "none" is the supported spelling; guide = FALSE is deprecated
  scale_size_area(max_size = 12, guide = "none") +
  theme_void(base_family = "Basier Square SemiBold", base_size = 14)