Data and R code to recreate the text analysis and graphics in this Oct. 31, 2019 BuzzFeed News post on the social media alerting service offered to school districts by Social Sentinel, which monitors for potential threats to school safety. Supporting files are in this GitHub repository.
Using the Twitter API, we collected data on 190 tweets that were flagged by Social Sentinel between Jul. 4 and Sept. 29, 2019, in alerts sent to school districts obtained by BuzzFeed News in public records requests. We recorded the dates (in UTC) of the first and last tweets flagged from each user, and then similarly collected data on all tweets from each user over the period from one day before their first flagged tweet to one day after their last. This gave a comparison group of 11,265 tweets. In the text analysis, we calculated the frequency of usage of each word in the flagged and unflagged data, after removing Twitter handles, hashtags, numbers, and very common words like “the,” “I’m,” and “at.” (We have removed screen names from the data to protect the users’ privacy.)
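The collection scripts themselves are not part of this repository. As a rough, hypothetical sketch of the comparison-window step described above — assuming the rtweet package, a Twitter API token already set up, and a flagged_tweets data frame with screen_name and created_at columns (all our assumptions, not the code we actually ran) — it might look like this:
library(rtweet)
library(tidyverse)
library(lubridate)
# window of interest per flagged user: one day before their first flagged
# tweet to one day after their last (all times in UTC)
windows <- flagged_tweets %>%
  group_by(screen_name) %>%
  summarize(start = min(created_at) - days(1),
            end = max(created_at) + days(1))
# pull each user's recent timeline and keep only the tweets in their window
comparison_tweets <- pmap_dfr(windows, function(screen_name, start, end) {
  get_timeline(screen_name, n = 3200) %>%
    filter(created_at >= start, created_at <= end)
})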
# load required packages
library(tidyverse)
library(tidytext)
# load data
load("data/flag_language.RData")
# regex for parsing tweets: strips URLs, stray entity characters, and "RT" markers
replace_reg <- "https?://[^\\s]+|&|<|>|\\bRT\\b"
# tokenize into words
words <- flag_language %>%
  # remove URLs, non-ASCII characters, etc.
  mutate(text = str_replace_all(text, replace_reg, ""),
         text = iconv(text, "latin1", "ASCII", sub = "")) %>%
  filter(text != "") %>%
  unnest_tokens(word, text, token = "tweets") %>%
  anti_join(stop_words) %>%
  anti_join(custom_stopwords) %>%
  group_by(word, flag) %>%
  count(sort = TRUE) %>%
  # remove hashtags, handles, and anything containing a number
  filter(!grepl("@|#|[0-9]", word))
# calculate frequency of word use within category and select top 20 words in each
words_pc <- words %>%
  group_by(flag) %>%
  mutate(percent = n / sum(n) * 100,
         r = row_number()) %>%
  slice(1:20) %>%
  ungroup() %>%
  arrange(-percent)
# chart
ggplot(words_pc, aes(x = reorder(r, -r), y = percent)) +
  geom_col(aes(fill = flag), color = "white") +
  geom_text(aes(label = word,
                y = -1,
                x = reorder(r, -r),
                color = flag),
            size = 4.5,
            family = "Basier Square SemiBold") +
  facet_wrap(~flag) +
  xlab("") +
  scale_fill_manual(values = c("red", "black"), guide = FALSE) +
  scale_color_manual(values = c("red", "black"), guide = FALSE) +
  scale_x_discrete(breaks = NULL) +
  scale_y_continuous(limits = c(-2, 3.5), breaks = c(1, 2, 3)) +
  coord_flip() +
  theme_minimal(base_size = 16, base_family = "Basier Square SemiBold") +
  theme(panel.grid.minor = element_blank())
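To write the finished chart to disk, something like ggsave() can be used; the filename and dimensions below are arbitrary choices on our part, not taken from the original analysis.
# save the most recent plot (filename and size are arbitrary)
ggsave("flagged_vs_unflagged_words.png", width = 10, height = 6, dpi = 300)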
We had earlier geocoded the recent school shootings data (the recent_school_shootings data frame used below) for mapping.
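That geocoding step is not reproduced here. As a hypothetical sketch of what it could look like — assuming the data frame has city and state columns, and using the tidygeocoder package with the OpenStreetMap geocoder, which is our choice of tool rather than necessarily the one used — it might be:
library(tidygeocoder)
library(sf)
recent_school_shootings <- recent_school_shootings %>%
  # build a simple "city, state" address string to geocode
  mutate(address = paste(city, state, sep = ", ")) %>%
  # look up coordinates via the OpenStreetMap (Nominatim) geocoder
  geocode(address = address, method = "osm") %>%
  # convert to an sf object so it can be layered onto the basemap below
  st_as_sf(coords = c("long", "lat"), crs = 4326)
Note that with the shift_geo basemap used below, any Alaska or Hawaii points would need the same shift applied to line up.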
# load required package
library(tidycensus)
# labels for Parkland and Santa Fe
recent_school_shootings <- recent_school_shootings %>%
  mutate(label = case_when(grepl("Parkland|Santa Fe", city) ~ city,
                           TRUE ~ NA_character_))
# get states basemap from the Census Bureau (requires a Census API key, set
# with census_api_key()); shift_geo moves Alaska and Hawaii for display
states <- get_acs(geography = "state", variables = "B19013_001",
                  shift_geo = TRUE, geometry = TRUE)
# map
ggplot(states) +
  geom_sf(color = "white", size = 0.3, fill = "#cccccc") +
  geom_sf(data = subset(recent_school_shootings, killed > 0),
          aes(size = killed),
          color = "red",
          alpha = 0.4) +
  geom_sf_text(data = subset(recent_school_shootings, killed > 0),
               aes(label = label),
               color = "red",
               nudge_x = 300000,
               nudge_y = -200000,
               family = "Basier Square SemiBold") +
  scale_size_area(max_size = 12, guide = FALSE) +
  theme_void(base_family = "Basier Square SemiBold", base_size = 14)