Introduction

From August 25, 2016 to September 21, 2016, we collected tweets related to the Zika outbreak. This data was used for multiple papers on the usage of multimedia in tweets, government Twitter interactions during times of crisis, and identifying influential actors in propagating health information. Because the data was used for so many different purposes, this notebook is more of a quick retrospective of some of the things I did than a comprehensive log of everything that was done.

This is also my first R notebook, so some formatting quirks may arise while I knit to HTML.

Libraries

library(plyr)
library(tidyverse)
library(ggthemes)
library(viridis)
library(Cairo)
library(igraph)
library(rgexf)
library(maptools)
library(rgeos)
library(rgdal)
library(broom)
library(grid)
library(gridExtra)
library(twitteR)
library(rjson)
library(httr)

Importing and Cleanup

Our raw tweets are split, by day, into 28 .json files. We apply our parsing function to each file and then combine the results into a single data frame.

#initial R script to parse 
#streaming twitter API
#Thomas Keller
#[email protected]

#code and parser based off streamR parseTweets function
#https://github.com/pablobarbera/streamR/blob/master/streamR/R/parseTweets.R

#helper function not meant for the light of day
#breaks out list columns of tweets



unlistWithNA <- function(lst, field){
  if (length(field)==1){
    notnulls <- unlist(lapply(lst, function(x) !is.null(x[[field]])))
    vect <- rep(NA, length(lst))
    vect[notnulls] <- unlist(lapply(lst[notnulls], '[[', field))
  }
  if (length(field)==2){
    notnulls <- unlist(lapply(lst, function(x) !is.null(x[[field[1]]][[field[2]]])))
    vect <- rep(NA, length(lst))
    vect[notnulls] <- unlist(lapply(lst[notnulls], function(x) x[[field[1]]][[field[2]]]))
  }
  if (length(field)==3 & field[1]!="geo"){
    notnulls <- unlist(lapply(lst, function(x) !is.null(x[[field[1]]][[field[2]]][[field[3]]])))
    vect <- rep(NA, length(lst))
    vect[notnulls] <- unlist(lapply(lst[notnulls], function(x) x[[field[1]]][[field[2]]][[field[3]]]))
  }
  if (field[1]=="geo"){
    notnulls <- unlist(lapply(lst, function(x) !is.null(x[[field[1]]][[field[2]]])))
    vect <- rep(NA, length(lst))
    vect[notnulls] <- unlist(lapply(lst[notnulls], function(x) x[[field[1]]][[field[2]]][[as.numeric(field[3])]]))
  }
  
  if (length(field)==4 && field[2]!="urls"){
    notnulls <- unlist(lapply(lst, function(x) length(x[[field[1]]][[field[2]]][[field[3]]][[field[4]]])>0))
    vect <- rep(NA, length(lst))
    vect[notnulls] <- unlist(lapply(lst[notnulls], function(x) x[[field[1]]][[field[2]]][[field[3]]][[field[4]]]))
  }
  if (length(field)==4 && field[2]=="urls"){
    notnulls <- unlist(lapply(lst, function(x) length(x[[field[1]]][[field[2]]])>0))
    vect <- rep(NA, length(lst))
    vect[notnulls] <- unlist(lapply(lst[notnulls], function(x) x[[field[1]]][[field[2]]][[as.numeric(field[3])]][[field[4]]]))
  }
  if (length(field)==6 && field[2]=="bounding_box"){
    notnulls <- unlist(lapply(lst, function(x) length(x[[field[1]]][[field[2]]])>0))
    vect <- rep(NA, length(lst))
    vect[notnulls] <- unlist(lapply(lst[notnulls], function(x) 
      x[[field[1]]][[field[2]]][[field[3]]][[as.numeric(field[4])]][[as.numeric(field[5])]][[as.numeric(field[6])]]))
  }
  return(vect)
}
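Before moving on, here is a quick toy check of what unlistWithNA() does with nested fields that may be missing. The two tweets below are made up for illustration and are not from the collection:

#toy check of unlistWithNA() on made-up tweets (not data from the collection)
toy_tweets <- list(
  list(id_str = "1", user = list(screen_name = "alice"),
       retweeted_status = list(retweet_count = 5)),
  list(id_str = "2", user = list(screen_name = "bob"))   #not a retweet
)
unlistWithNA(toy_tweets, "id_str")                               # "1" "2"
unlistWithNA(toy_tweets, c("user", "screen_name"))               # "alice" "bob"
unlistWithNA(toy_tweets, c("retweeted_status", "retweet_count")) # 5 NA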

#small helper functions to parse the more complicated entity fields that are themselves lists (user mentions, hashtags, media)
#right now merging into a single string with ";" as a separator
parse_user=function(user_mentions){
  num_mention=length(user_mentions)
  if(num_mention==0){return(NA)}
  else if(num_mention==1){
    return(user_mentions[[1]]$screen_name) 
  }
  else{return(paste(sapply(1:length(user_mentions),function(x) user_mentions[[x]]$screen_name),collapse=';'))}
}

parse_id=function(user_mentions){
  num_mention=length(user_mentions)
  if(num_mention==0){return(NA)}
  else if(num_mention==1){
    return(user_mentions[[1]]$id_str) 
  }
  else{return(paste(sapply(1:length(user_mentions),function(x)
    user_mentions[[x]]$id_str),collapse=';'))}
}

parse_hash=function(hashtags){
  num_hash=length(hashtags)
  if(num_hash==0){return(NA)}
  else if(num_hash==1){
    return(hashtags[[1]]$text) 
  }
  else{return(paste(sapply(1:length(hashtags),function(x)
    hashtags[[x]]$text),collapse=';'))}
}

parse_media_type=function(media){
  num_media=length(media)
  if(num_media==0){return(NA)}
  else if(num_media==1){
    return(media[[1]]$type) 
  }
  else{return(paste(sapply(1:length(media),function(x)
    media[[x]]$type),collapse=';'))}
}

parse_media_url=function(media){
  num_media=length(media)
  if(num_media==0){return(NA)}
  else if(num_media==1){
    return(media[[1]]$expanded_url) 
  }
  else{return(paste(sapply(1:length(media),function(x)
    media[[x]]$expanded_url),collapse=';'))}
}
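Similarly, here is a toy check of the entity parsers, which collapse multi-element lists into a single semicolon-delimited string. The values are made up for illustration:

#toy entity lists mimicking the structure of the streaming API's "entities" field
toy_mentions <- list(list(screen_name = "alice", id_str = "111"),
                     list(screen_name = "bob",   id_str = "222"))
toy_hashtags <- list(list(text = "Zika"), list(text = "publichealth"))

parse_user(toy_mentions)   # "alice;bob"
parse_id(toy_mentions)     # "111;222"
parse_hash(toy_hashtags)   # "Zika;publichealth"
parse_hash(list())         # NA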

######### 
#main parser
#code
########


library(streamR)

parseTweets_mod=function(jsonfile){
  tweet_list=readTweets(jsonfile)
  tot_tweet=length(tweet_list)
  tweet_list=lapply(tweet_list, function(x) if(x$lang=='en' & x$user$followers_count>=25 & x$user$friends_count>=100 ) return(x))
  tweet_list=tweet_list[!sapply(tweet_list,is.null)] # culls out the nulled elements
  #reweet=lapply(tweet_list,function(x) if(is.null(x$retweeted_status)==TRUE) return(NULL) else(return(x)))
  #retweet=retweet[!sapply(retweet,is.null)]
  #print(paste('After simple spam filtering',length(tweet_list),'tweets remain' , 100*length(tweet_list)/tot_tweet,'%'))
  place_lat_1 = unlistWithNA(tweet_list, c('place', 'bounding_box', 'coordinates', 1, 1, 2))
  place_lat_2 = unlistWithNA(tweet_list, c('place', 'bounding_box', 'coordinates', 1, 2, 2)) 
  place_lat = sapply(1:length(tweet_list), function(x) 
    mean(c(place_lat_1[x], place_lat_2[x]), na.rm=TRUE))
  place_lon_1 = unlistWithNA(tweet_list, c('place', 'bounding_box', 'coordinates', 1, 1, 1))
  place_lon_2 = unlistWithNA(tweet_list, c('place', 'bounding_box', 'coordinates', 1, 3, 1))
  place_lon = sapply(1:length(tweet_list), function(x) 
    mean(c(place_lon_1[x], place_lon_2[x]), na.rm=TRUE))
  
  mentions=lapply(1:length(tweet_list),function(x) tweet_list[[x]]$entities$user_mentions)
  hashes=lapply(1:length(tweet_list),function(x) tweet_list[[x]]$entities$hashtags)
  media=lapply(1:length(tweet_list),function(x) tweet_list[[x]]$entities$media)
  parsed_user=sapply(1:length(mentions), function(x) parse_user(mentions[[x]]))
  parsed_id=sapply(1:length(mentions), function(x) parse_id(mentions[[x]]))
  parsed_hash=sapply(1:length(hashes),function(x) parse_hash(hashes[[x]]))
  parsed_media_type=sapply(1:length(media),function(x) parse_media_type(media[[x]]))
  parsed_media_url=sapply(1:length(media),function(x) parse_media_url(media[[x]]))
  text=sapply(tweet_list, function(x) ifelse(is.null(x$retweeted_status), x$text, x$retweeted_status$text))
  timestamp_ms = unlistWithNA(tweet_list, 'timestamp_ms')
  datetime =as.POSIXct(as.numeric(as.character(timestamp_ms))/1000, origin='1970-01-01',tz='EST')
  df=data.frame(
    timestamp_ms = timestamp_ms,
    datetime=datetime,
    tweet_id = unlistWithNA(tweet_list, "id_str"),
    text=iconv(text,to='UTF-8', sub = "byte"), 
    retweet_count = unlistWithNA(tweet_list, c('retweeted_status','retweet_count')),
    favorite_count = unlistWithNA(tweet_list, c('retweeted_status','favorite_count')),
    expanded_url = unlistWithNA(tweet_list, c('entities', 'urls', 1, 'expanded_url')),
    friends_count = unlistWithNA(tweet_list, c('user', 'friends_count')),
    screen_name = unlistWithNA(tweet_list, c('user', 'screen_name')),
    user_id_str = unlistWithNA(tweet_list, c('user', 'id_str')),
    in_reply_to_screen_name = unlistWithNA(tweet_list, ('in_reply_to_screen_name')),
    in_reply_to_user_id = unlistWithNA(tweet_list, ('in_reply_to_user_id_str')),
    rt_screen_name = unlistWithNA(tweet_list,c('retweeted_status','user','screen_name')),
    rt_screen_id = unlistWithNA(tweet_list,c('retweeted_status','user','id_str')),
    country = unlistWithNA(tweet_list, c('place', 'country')),
    full_name = unlistWithNA(tweet_list, c('place', 'full_name')),
    followers_count = unlistWithNA(tweet_list, c('user', 'followers_count')),
    #place_lat_1 = unlistWithNA(tweet_list, c('place', 'bounding_box', 'coordinates', 1, 1, 2)),
    #place_lat_2 = unlistWithNA(tweet_list, c('place', 'bounding_box', 'coordinates', 1, 2, 2)),
    place_lat = sapply(1:length(tweet_list), function(x) 
      mean(c(place_lat_1[x], place_lat_2[x]), na.rm=TRUE)),
    #place_lon_1 = unlistWithNA(tweet_list, c('place', 'bounding_box', 'coordinates', 1, 1, 1)),
    #place_lon_2 = unlistWithNA(tweet_list, c('place', 'bounding_box', 'coordinates', 1, 3, 1)),
    place_lon = sapply(1:length(tweet_list), function(x) 
      mean(c(place_lon_1[x], place_lon_2[x]), na.rm=TRUE)),
    lat = unlistWithNA(tweet_list, c('geo', 'coordinates', 1)),
    lon = unlistWithNA(tweet_list, c('geo', 'coordinates', 2)),
    mentioned_users=parsed_user,
    mentioned_id=parsed_id,
    hashes=parsed_hash,
    parsed_media_type=parsed_media_type,
    parsed_media_url=parsed_media_url,
    stringsAsFactors=F
  )
  return(df)
}
#import a list of directories for the .jsons
dirs <- list.dirs('~/zika/USF-Zika-Research/raw month', recursive=FALSE)
#append the file to the end of each directory
dirs <- paste(dirs, "/data.txt", sep="")
#apply the parse function to the list of files
parsed30d <- suppressMessages(lapply(dirs, parseTweets_mod))
## 14976 tweets have been parsed. 
## 61173 tweets have been parsed. 
## 42643 tweets have been parsed. 
## 44454 tweets have been parsed. 
## 56285 tweets have been parsed. 
## 58710 tweets have been parsed. 
## 60179 tweets have been parsed. 
## 141317 tweets have been parsed. 
## 102067 tweets have been parsed. 
## 61375 tweets have been parsed. 
## 43629 tweets have been parsed. 
## 43652 tweets have been parsed. 
## 94657 tweets have been parsed. 
## 97476 tweets have been parsed. 
## 84537 tweets have been parsed. 
## 50115 tweets have been parsed. 
## 28432 tweets have been parsed. 
## 27403 tweets have been parsed. 
## 38355 tweets have been parsed. 
## 36657 tweets have been parsed. 
## 32600 tweets have been parsed. 
## 29084 tweets have been parsed. 
## 25304 tweets have been parsed. 
## 20421 tweets have been parsed. 
## 13555 tweets have been parsed. 
## 26237 tweets have been parsed. 
## 21674 tweets have been parsed. 
## 6349 tweets have been parsed.
#turns the list of data frames into one data frame
parsed30d <- bind_rows(parsed30d)

The data has a few quirks and issues that need to be cleaned up. Counts for tweets that haven’t been retweeted or favorited have no value (NA), and need to be set to 0. Additionally, there are a number of tweets with duplicate IDs that we will toss. We also add a cleaner version of the date/time format to make it easier to group tweets by their date.

#turning NAs to 0s for favorite count and retweet count
parsed30d$retweet_count[is.na(parsed30d$retweet_count)] <- 0
parsed30d$favorite_count[is.na(parsed30d$favorite_count)] <- 0

#get rid of duplicate tweet_ids (266 tweet_ids were duplicated)
#note: dplyr::count (loaded via tidyverse) masks plyr::count, so the result has columns tweet_id and n
dup30 <- dplyr::count(parsed30d, tweet_id)
dup30 <- dup30[dup30$n > 1, ]
parsed30d <- parsed30d[!(parsed30d$tweet_id %in% dup30$tweet_id), ]

#cleaning up the date field
parsed30d$cleandate <- strptime(parsed30d$datetime, "%Y-%m-%d %H:%M:%S")
parsed30d$cleandate <- format(parsed30d$cleandate, "%Y-%m-%d")
parsed30d$cleandate <- as.Date(parsed30d$cleandate)
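
For reference, the same cleanup can also be written as a single dplyr pipeline. This is just an equivalent sketch of the steps above; the base-R version is what was actually run:

#equivalent tidyverse sketch of the cleanup above; not part of the original pipeline
parsed30d_alt <- parsed30d %>%
  mutate(retweet_count  = ifelse(is.na(retweet_count), 0, retweet_count),
         favorite_count = ifelse(is.na(favorite_count), 0, favorite_count),
         cleandate      = as.Date(datetime, tz = "EST")) %>%
  group_by(tweet_id) %>%
  filter(n() == 1) %>%   #drop every copy of a duplicated tweet_id
  ungroup()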

A Quick Visual Exploration

Let’s try to take a quick look at the data to see if there is much to take note of.

flddates <- count(parsed30d, cleandate)
ggplot(flddates, aes(x = cleandate, y = n, group = 1)) +
  geom_line() +
  labs(x = "Date", y = "Number of Tweets") +
  ggtitle("Number of Tweets Per Day") +
  scale_x_date(date_labels = "%b %d", date_breaks = "1 day", expand = c(0, 0)) +
  theme_few() +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1, size = 12),
        axis.text.y = element_text(size = 12),
        panel.grid.major = element_line(color = "grey", size = .25))

Here is the graph of the number of tweets per day. As you can see, there are two very large spikes in tweet frequency during the middle of our collection. These coincide with the first case of Zika being found in a mosquito in Miami on September 1st and the Senate failing to pass funding legislation to help fight Zika on September 6th. Other major developments on those days were Malaysia confirming its first case of Zika (Sept 1st) and the WHO expanding its Zika sex guidelines (Sept 6th).

ggplot(flddates, aes(x = as.character(cleandate), y = 100 * cumsum(n) / sum(n), group = 1)) +
  geom_line() +
  labs(x = "Date", y = "Percentage of Total (%)") +
  ggtitle("Cumulative Sum of Tweets (%) w/ Average Rate of Tweets") +
  scale_x_discrete(labels = format.Date(flddates$cleandate, "%b %d"), expand = c(0, 0)) +
  theme_few() +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1, size = 12),
        axis.text.y = element_text(size = 12),
        panel.grid.major = element_line(color = "grey", size = .25)) +
  geom_abline(intercept = 0, slope = 100 / length(flddates$cleandate), color = "blue")

In this graph, we can see the progression of the cumulative sums of tweets as time progresses. The humps around September 1st and 6th are again present. You can also see that, after September 7th, the discussion slows down greatly.

Let’s see how much of the discussion about Zika includes officials from some of Florida’s most populous counties: Miami-Dade, Broward, and Orange County.

Creating Functions for Scraping User Data

In order to scrape data and calculate some metrics easily, we can make a few functions: one to parse a user's custom list of other users, one to parse the accounts that a user follows, one to parse the accounts that follow a user, and one to count the number of tweets, retweets, and mentions for a user.

#this is a function to parse the names and twitter handles of people on a twitter list.
#listowner = handle of the user that made the list; listname = the name of the list.
twitlist <- function(listowner, listname){
  
  #the twitter api limits the rate at which you can make requests.  refer to https://dev.twitter.com/rest/public/rate-limits
  api.url <- paste0("https://api.twitter.com/1.1/lists/members.json?slug=",
                    listname, "&owner_screen_name=", listowner, "&count=5000")
  response <- GET(api.url, config(token=twitteR:::get_oauth_sig()))
  
  response.list <- fromJSON(content(response, as = "text", encoding = "UTF-8"))
  
  users.names <- sapply(response.list$users, function(i) i$name)
  users.screennames <- sapply(response.list$users, function(i) i$screen_name)
  
  return(data.frame(users.screennames, users.names))
}

#a <- NULL
#for (i in 1:length(twitlists$X1)) {
#  b <- twitlist(twitlists[i,1], twitlists[i,2])
#  a <- rbind(b,a)}
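
The commented-out loop above shows how twitlist() was applied over a small data frame of (list owner, list name) pairs. A single call looks like the sketch below; the owner and slug are placeholders rather than accounts we actually queried, and an OAuth token must already be registered via setup_twitter_oauth():

#hypothetical example; "SomeAgency" and "health-departments" are placeholder values
hd_members <- twitlist("SomeAgency", "health-departments")
head(hd_members)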

#gets a list of the users that someone follows.  doesn't work for private accounts
getfriendslist <- function(handl) {
  tempname <- deparse(substitute(handl))
  user <- getUser(tempname)
  friends <- lookupUsers(user$getFriendIDs())
  friends.names <- sapply(friends, name)
  friends.screennames <- sapply(friends, screenName)
  friends <- data.frame(friends.screennames, friends.names)
  #tempname <-  paste(tempname, "_friends", sep = "")
  #assign(tempname, friends, envir = globalenv())
  return(friends)
}

#gets a list of the users that follow someone.  doesn't work for private accounts
getfollowerslist <- function(handl) {
  tempname <- deparse(substitute(handl))
  user <- getUser(tempname)
  followers <- lookupUsers(user$getFollowerIDs())
  followers.names <- sapply(followers, name)
  followers.screennames <- sapply(followers, screenName)
  followers <- data.frame(followers.screennames, followers.names)
  #tempname <-  paste(tempname, "_followers", sep = "")
  #assign(tempname, followers, envir = globalenv())
  return(followers)
}
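
Because of the deparse(substitute()) trick, both functions take a bare (unquoted) handle. For example:

#example calls; note the bare handles (deparse/substitute turns them into strings)
#these hit the REST API, so a registered OAuth token is needed and the usual rate limits apply
cdc_friends   <- getfriendslist(CDCgov)
cdc_followers <- getfollowerslist(CDCgov)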

#counts the number of tweets, retweets, and mentions of each handle(handl) in target data.frame (df)
#retweeting a user counts as mentioning them, so their handle appears in both fields. as such, we subtract
#the number of retweets from the number of mentions.
#you can pass in either a single handle (e.g. "WhiteHouse"), or a list of handles (e.g. c("WhiteHouse", "POTUS")).
countstats <- function(handl, df) {
  
  if(is.character(handl) == TRUE && length(handl) == 1){
    z <- deparse(substitute(handl))
    a <- nrow(df[grep(paste0("\\b",noquote(z),"\\b"), df$screen_name, ignore.case = T), ])
    b <- nrow(df[grep(paste0("\\b",noquote(z),"\\b"), df$rt_screen_name, ignore.case = T), ])
    c <- nrow(df[grep(paste0("\\b",noquote(z),"\\b"), df$mentioned_users, ignore.case = T), ])
    c <- c - b
    cat(paste(z, " - tweets:", a, ", retweets: ", b, ", mentions: ", c))
    returndf <- data.frame(noquote(handl), a, b, c)
    return(returndf)
  }
  
  else {
 
    handl <- as.data.frame(handl)
    handl$tweets <- 0
    handl$retweets <- 0
    handl$mentions <- 0
    
    for(i in 1:nrow(handl)){
      handl[i,2] <- nrow(df[grep(paste0("\\b",noquote(handl[[i,1]]),"\\b"), df$screen_name, ignore.case = T), ])
      handl[i,3] <- nrow(df[grep(paste0("\\b",noquote(handl[[i,1]]),"\\b"), df$rt_screen_name, ignore.case = T), ])
      c <- nrow(df[grep(paste0("\\b",noquote(handl[[i,1]]),"\\b"), df$mentioned_users, ignore.case = T), ])
      handl[i,4] <- c - handl[i,3]

    }
    colnames(handl)[1] <- "screen_name"
    return(handl)
  }
}

The original thought was that some city/county account, government account, or press account might have a comprehensive and reliable list of accounts for health departments/mayors/emergency management groups/etc., but I didn't find any, and it ended up being quicker for me to just manually look up the accounts of interest to us. countstats() is still useful, though. You can either pass in a single handle to it:

countstats("WhiteHouse", parsed30d)
## "WhiteHouse"  - tweets: 0 , retweets:  0 , mentions:  0

Or you can pass in a list of handles:

countstats(c("WhiteHouse", "POTUS", "BarackObama"), parsed30d)

From here, we can make a more comprehensive list to target. I had originally made the list in Excel and exported the .csv, but here’s the R script to make it easier to follow along:

#in regards to the naming convention from here on out: We started off looking at Marco Rubio, so the list was named rublist. The project evolved away from solely looking at Rubio, but the legacy naming convention stuck around.
rublist <- c("WhiteHouse", "POTUS", "BarackObama", "HHSGov", "SecSebelius", "DrFriedenCDC", "CDCtravel", "CDC_NCBDDD", "CDCGlobal", "CDCgov", "CDCemergency", "CDC_NCEZID", "CDCChronic", "CDCMMWR", "marcorubio", "SenRubioPress", "SenBillNelson", "FLGovScott", "HealthyFla","MayorGimenez", "Tomas_Regalado", "Mayor_Jacobs", "orlandomayor", "JackSeiler", "MartyKiar", "jmuoio", "MiamiDadeCounty", "pbcgov", "BrowardCounty", "OrangeCoFL", "CityofMiami", "citybeautiful", "ftlcitynews", "westpalmbch", "repstephmurphy", "congbillposey", "RepDarrenSoto", "RepValDemings", "RepBrianMast", "RepHastingsFL", "RepLoisFrankel", "RepTedDeutch", "DWStweets", "RepDWStweets", "RepWilson", "MarioDB", "RepCurbelo", "RosLehtinen", "FLHealthBroward", "orloem", "MiamiDEM", "readybroward", "pbcdem", "MiamiDadeEM")
rublist <- countstats(rublist, parsed30d)
#checking the output
head(rublist)

From here, let's make a subset containing only the tweets made by the users in rublist$screen_name and use it to pull some more specific metrics. One way a tweet can be classified is by how it delivers content: it can post an image directly to Twitter, a url plus an image, a video directly to Twitter, a url plus a video, a non-image-or-video url, or just text.

#getting the subset of tweets
rubsett <- parsed30d[tolower(parsed30d$screen_name) %in% tolower(rublist$screen_name), ]
#safety copy
zq <- rubsett
#Start by pulling out all of the embedded-image tweets.  It would seem like searching the parsed_media_type field would be
#the easiest way to distinguish between photos and videos, but it turns out that that field is not reliable.  You have to
#search for the media type in the parsed_media_url instead.
zqimage <- zq[grep("photo/1", zq$parsed_media_url),]
#To get the tweets that have urls and an image, we keep rows in which the expanded_url field are populated.
zqimageurl <- zqimage[!is.na(zqimage[,7]),]
#zqimage currently includes the rows with urls in it, so we will exclude those from zqimage by way of their tweet_id
zqimage <- zqimage[!(zqimage$tweet_id %in% zqimageurl$tweet_id),]
#we can create an object of the remainder tweets by removing both zqimage and zqimageurl from zq
zqremainder <- zq[!(zq$tweet_id %in% zqimage$tweet_id),]
zqremainder <- zqremainder[!(zqremainder$tweet_id %in% zqimageurl$tweet_id),]

#getting videos and videos + urls is the same process as the image
zqvideo <- zqremainder[grep("video/1", zqremainder$parsed_media_url),]
zqvideourl <- zqvideo[!is.na(zqvideo[,7]),]
zqvideo <- zqvideo[!(zqvideo$tweet_id %in% zqvideourl$tweet_id),]
zqremainder <- zqremainder[!(zqremainder$tweet_id %in% zqvideo$tweet_id),]
zqremainder <- zqremainder[!(zqremainder$tweet_id %in% zqvideourl$tweet_id),]

#finding solely urls is easy once we've eliminated the image + url and video + url combinations
zqurl <- zqremainder[!is.na(zqremainder[,7]),]
zqremainder <- zqremainder[!(zqremainder$tweet_id %in% zqurl$tweet_id),]
#It's possible for tweets to have urls in them but not be properly labeled by the parser; g catches these and is essentially an error bucket.
g <- zqremainder[grep("http", zqremainder$text),]
zqremainder <- zqremainder[!(zqremainder$tweet_id %in% g$tweet_id),]
#once we've eliminated all the other subsets, zqremainder is just what's left over: text-only tweets
zqtext <- zqremainder

#if the sizes of all of the subsets == the size of rubsett, then all tweets have been classified.
nrow(zqvideo) + nrow(zqvideourl) + nrow(zqimage) + nrow(zqimageurl) + 
  nrow(zqurl) + nrow(g) + nrow(zqtext) - nrow(rubsett)
## [1] 0

Having categorized each tweet, we now need to count up who posted what and pull it all together.

#lets combine all of these into a table that tells us how many each of our targeted users tweeted.
rubsett1 <- as.data.frame(unique(rubsett$screen_name))
colnames(rubsett1)[1] <- "screen_name"
zqimage    <- count(zqimage, screen_name)
zqimageurl <- count(zqimageurl, screen_name)
zqvideo    <- count(zqvideo, screen_name)
zqvideourl <- count(zqvideourl, screen_name)
zqurl      <- count(zqurl, screen_name)
zqtext     <- count(zqtext, screen_name)
g          <- count(g, screen_name)
fin <- Reduce(function(...) merge(..., by='screen_name', all.x=TRUE), list(rubsett1,zqimage,zqimageurl,
                                                                    zqvideo,zqvideourl,zqurl,zqtext, g))
colnames(fin) <- c("screen_name", "images", "image urls", "videos", "video urls", "urls", "text", "other")
totstat <- left_join(rublist, fin)
totstat[is.na(totstat)] <- 0

#we need to add in information about what level of government each user belongs to. I originally did this in Excel, but
#since rublist was generated in grouped order, we can handcode it without too much issue... mostly.
totstat$Grouping <- NA
totstat[1:14,]$Grouping <- "Federal"
totstat[15:19,]$Grouping <- "State"
totstat[20:26,]$Grouping <- "Local, Mayor"
totstat[27:34,]$Grouping <- "Local, City/County"
totstat[35:48,]$Grouping <- "Local, Congress"
totstat[49,]$Grouping <- "Local, Health"
totstat[50:54,]$Grouping <- "Local, Emergency Management"
totstat$GeneralizedGrouping <- NA
totstat[1:14,]$GeneralizedGrouping <- "Federal"
totstat[15:19,]$GeneralizedGrouping <- "State"
totstat[20:54,]$GeneralizedGrouping <- "Local"

#adding in follower count for each account.  Note: Users who didn't tweet won't have information for us to pull
#followers from
totstat <- rubsett %>%
  subset(select = c(screen_name, followers_count, timestamp_ms)) %>%
  group_by(screen_name) %>%
  filter(timestamp_ms == max(timestamp_ms)) %>%
  left_join(totstat, ., by = "screen_name") %>%
  select(-timestamp_ms)

#we could use DT::datatable() to make a nicer html table (see below); for now, just print the data frame
totstat
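
As the comment notes, an interactive table can be rendered with the DT package, which isn't loaded in the library block above; the call would look something like this:

#optional: interactive HTML version of the same table (requires the DT package)
library(DT)
datatable(totstat, rownames = FALSE, options = list(pageLength = 10))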

The last thing we need to do is create a master list that includes every tweet in which one of our actors is either the author, the account being retweeted, or one of the accounts mentioned. This will be used for some content analysis.

rubsetr <- parsed30d[tolower(parsed30d$rt_screen_name) %in% tolower(rublist$screen_name), ]
rubsetm <- filter(parsed30d, grepl(paste(rublist$screen_name, collapse="|"), mentioned_users))
rubmaster<- rbind(rbind(rubsetr,rubsetm), rubsett)
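
One caveat worth flagging: a tweet that, say, retweets one of our actors while mentioning another will land in more than one of the three subsets, so the combined frame can contain duplicates. A quick check, with an optional fix left commented out so the original pipeline is unchanged:

#how many tweets landed in more than one subset?
sum(duplicated(rubmaster$tweet_id))
#rubmaster <- rubmaster[!duplicated(rubmaster$tweet_id), ]   #uncomment to de-duplicate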

Hashtags are a great way to add emphasis and context to a tweet; they act as micro-theses of sorts. Twitter also allows users to search for and follow hashtags, making them a good conversation-specific networking tool. Let's see how our actors are utilizing hashtags.

#The hashtag field currently contains multiple hashtags, delimited by semicolons.
#We will start by separating those out into 20 columns.
rubhash <- separate(rubmaster, col = hashes,
                    into = paste0("hash", 1:20),
                    sep = ";", extra = "merge", fill = "right")

#Capitalization isn't uniform for hashtags (e.g. "Zika" and "zika"), so we'll convert them to lowercase while we tally their frequencies
rubhashes <- apply(rubhash[,24:43], 2, FUN=function(x) plyr::count(stringi::stri_trans_tolower(x)))
rubhashes <- do.call(rbind.data.frame, rubhashes)
rubhashes <- ddply(rubhashes, "x", numcolwise(sum))
rubhashes <- rubhashes[!is.na(rubhashes[,1]), ]
rubhashes <- arrange(rubhashes, desc(freq))
#hashtags at or above the 95th percentile occur 38 times or more; 50 hashtags meet this criterion.
rubhashes <- filter(rubhashes, freq > 37)
head(rubhashes)

Now that we know what some of the most popular hashtags are, it might be helpful for us to see how the ebb and flow of hashtags change over time.

#shaping data for a stacked area plot of hashtag information
#we're going to grab all of the hashtag columns plus overshoot to the date column, then remove the two extraneous columns
rubhashsap <- rubhash[ ,24:46]
rubhashsap <- rubhashsap[ ,-c(21:22)]
#We're only looking at tweets that have hashtags. This includes 29,071 tweets, or about 74% of the tweets that involve our actors.
rubhashsap <- rubhashsap[!(is.na(rubhashsap$hash1)), ]
rubhashsap <- as.data.frame(sapply(rubhashsap,tolower))

#making a loop to make the data more manageable
#start with hash1, then stack the remaining hash columns underneath it
rubhashsap1 <- data.frame(rubhashsap$cleandate, rubhashsap$hash1)
colnames(rubhashsap1) <- c("date", "hashtag")

for(i in 2:20) {
  tempdf <- data.frame(rubhashsap$cleandate, rubhashsap[[i]])
  colnames(tempdf) <- c("date", "hashtag")
  rubhashsap1 <- rbind(rubhashsap1, tempdf)
}
rubhashsap1 <- rubhashsap1[!(is.na(rubhashsap1$hashtag)), ]
rubhashsap1 <- rubhashsap1 %>% group_by(date, hashtag) %>% tally(wt = NULL)
rubhashsapc <- spread(rubhashsap1, key = hashtag, value = n)
rubhashsap1 <- rubhashsap1[rubhashsap1$hashtag %in% rubhashes[1:10,1], ]
#even the most popular hashtags don't appear every day, which causes holes in the graphs, so we need to fill in the 0's
rubhashsap1 <- merge(rubhashsap1,
                     expand.grid(date = unique(rubhashsap1$date),
                                 hashtag = unique(rubhashsap1$hashtag),
                                 stringsAsFactors = F),
                     all.y = T)
rubhashsap1$n[is.na(rubhashsap1$n)] <- 0
rubhashsap1$date <- strptime(rubhashsap1$date, "%Y-%m-%d")
rubhashsap1$date <- format(rubhashsap1$date, "%m-%d")
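
The same wide-to-long reshape could also be written with tidyr::gather() instead of the loop. This is an equivalent sketch; the loop above is what was actually run:

#tidyr version of the reshape above (sketch, not part of the original pipeline)
rubhash_long <- rubhashsap %>%
  gather(key = "slot", value = "hashtag", starts_with("hash")) %>%
  filter(!is.na(hashtag)) %>%
  group_by(date = cleandate, hashtag) %>%
  tally()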

We saw earlier that the #zika hashtag is roughly a full order of magnitude more common than the next most popular hashtag, but stacked area charts can help us see where other topics are able to take focus. A 100% stacked area chart shows how each hashtag's share of the conversation varies over time, while the regular stacked area chart shows, primarily, the overall volume of hashtag usage over time and, secondarily, the composition of each day's hashtags.

ggplot(rubhashsap1, aes(x = date, y = n, group = hashtag)) +
  geom_area(aes(color = hashtag, fill = hashtag, group = hashtag),
            colour = "black", size = .4, position = "fill") +
  theme_few() +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1, size = 12),
        axis.text.y = element_text(size = 12)) +
  scale_fill_viridis(option = "plasma", discrete = TRUE) +
  labs(x = "Date", y = "Percentage of Total") +
  ggtitle("100% Stacked Area Chart of the 10 Most Common Hashtags") +
  scale_y_continuous(labels = scales::percent, expand = c(0, 0)) +
  scale_x_discrete(expand = c(0, 0))

ggplot(rubhashsap1, aes(x = date, y = n, group = hashtag)) +
  geom_area(aes(color = hashtag, fill = hashtag, group = hashtag),
            colour = "black", size = .4, position = "stack") +
  theme_few() +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1, size = 12),
        axis.text.y = element_text(size = 12),
        panel.grid.major = element_line(color = "grey", size = .25)) +
  scale_fill_viridis(option = "plasma", discrete = TRUE) +
  labs(x = "Date", y = "Number of Hashtags") +
  ggtitle("Stacked Area Chart of the 10 Most Common Hashtags") +
  scale_y_continuous(breaks = seq(0, 1500, by = 200), expand = c(0, 0), limits = c(0, 1500)) +
  scale_x_discrete(expand = c(0, 0))

We can see that #zika remains dominant through most of our data capture period, but we also see surges in #atozika, #zapzika, #specialreport, and #flsen. It’s interesting to note that, with the exception of #flsen, those hashtags are mostly used for education and awareness.

Geospatial Visualization

The hashtag #sofla (South Florida) was popular since the region was ground zero for Zika in Florida. We can visualize tweet density by congressional district to see which members of Congress talked about the issue the most.

#we need to append district information onto totstat so that we can merge it with the shapefiles
congdist <- data.frame(handles = c("repstephmurphy", "congbillposey", "RepDarrenSoto", "RepValDemings", 
                                             "RepBrianMast", "RepHastingsFL", "RepLoisFrankel", "RepTedDeutch",
                                             "repDWStweets", "RepWilson", 
                                             "MarioDB", "RepCurbelo", "RosLehtinen"), 
                       DISTRICT = c(7, 8, 9, 10, 18, 20, 21, 22, 23, 24, 25, 26, 27))
totstatd <- left_join(totstat, congdist, by=c("screen_name" = "handles"))

#shape files for the relevant congressional districts were found at http://cdmaps.polisci.ucla.edu/
#you'll need to point readOGR to wherever you saved the shape files.
fldist <- rgdal::readOGR("C:/Users/Ryan/Documents/zika/USF-Zika-Research/mapping/districts114.shp")
## OGR data source with driver: ESRI Shapefile 
## Source: "C:/Users/Ryan/Documents/zika/USF-Zika-Research/mapping/districts114.shp", layer: "districts114"
## with 436 features
## It has 15 fields
fldist <- fldist[fldist@data$STATENAME == "Florida", ]
fldist@data <- data.frame(fldist@data, totstatd[match(fldist@data$DISTRICT, totstatd$DISTRICT), ])
fldist <- gBuffer(fldist, byid = T, width = 0)
fldistm <- broom::tidy(fldist[, 1:15], region = "DISTRICT")
fldistm$id <- as.numeric(fldistm$id)
fldistm <- merge(fldistm, fldist@data, by.x = "id", by.y = "DISTRICT")

G1 <- ggplot() + geom_polygon(data = fldistm, aes(long,lat, group = group), fill = "light gray") + 
  geom_polygon(data = fldistm, aes(long,lat, group = group, fill = tweets), color = "white") + 
  scale_fill_viridis(option = "plasma") + 
  coord_map() + theme_map() + 
  theme(legend.position = "right", legend.text = element_text(size=20)) +
  guides(fill = guide_colorbar(title.theme = element_text(size = 20, angle = 0)))

G2 <- ggplot() + geom_polygon(data = fldistm, aes(long,lat, group = group), fill = "light gray") + 
  geom_polygon(data = fldistm, aes(long,lat, group = group, fill = tweets), color = "white") + 
  scale_fill_viridis(option = "plasma") +
  coord_map(ylim = c(25,27), xlim = c(-82, -80)) +
  theme_map() + theme(legend.position="none") + 
  geom_rect(aes(xmin = -82, xmax = -80, ymin = 25, ymax = 27), size = 1, color = "black", alpha = 0)

Miami-Dade County has a few tiny congressional districts, so we’ll add in a zoomed-in viewport to help see them.

#grid.newpage()
print(G1)
print(G2, vp = viewport(width = 0.6, height = 0.6, x = .3, y = .3))

#retweets
G1 <- ggplot() + geom_polygon(data = fldistm, aes(long,lat, group = group), fill = "light gray") + 
  geom_polygon(data = fldistm, aes(long,lat, group = group, fill = retweets), color = "white") + 
  scale_fill_viridis(option = "plasma") + 
  coord_map() + theme_map() + 
  theme(legend.position = "right", legend.text = element_text(size=20)) +
  guides(fill = guide_colorbar(title.theme = element_text(size = 20, angle = 0)))

G2 <- ggplot() + geom_polygon(data = fldistm, aes(long,lat, group = group), fill = "light gray") + 
  geom_polygon(data = fldistm, aes(long,lat, group = group, fill = retweets), color = "white") + 
  scale_fill_viridis(option = "plasma") +
  coord_map(ylim = c(25,27), xlim = c(-82, -80)) +
  theme_map() + theme(legend.position="none") + 
  geom_rect(aes(xmin = -82, xmax = -80, ymin = 25, ymax = 27), size = 1, color = "black", alpha = 0)
print(G1)
print(G2, vp = viewport(width = 0.6, height = 0.6, x = .3, y = .3))

#mentions
G1 <- ggplot() + geom_polygon(data = fldistm, aes(long,lat, group = group), fill = "light gray") + 
  geom_polygon(data = fldistm, aes(long,lat, group = group, fill = mentions), color = "white") + 
  scale_fill_viridis(option = "plasma") + 
  coord_map() + theme_map() + 
  theme(legend.position = "right", legend.text = element_text(size=20)) +
  guides(fill = guide_colorbar(title.theme = element_text(size = 20, angle = 0)))

G2 <- ggplot() + geom_polygon(data = fldistm, aes(long,lat, group = group), fill = "light gray") + 
  geom_polygon(data = fldistm, aes(long,lat, group = group, fill = mentions), color = "white") + 
  scale_fill_viridis(option = "plasma") +
  coord_map(ylim = c(25,27), xlim = c(-82, -80)) +
  theme_map() + theme(legend.position="none") + 
  geom_rect(aes(xmin = -82, xmax = -80, ymin = 25, ymax = 27), size = 1, color = "black", alpha = 0)
print(G1)
print(G2, vp = viewport(width = 0.6, height = 0.6, x = .3, y = .3))
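
Since the same pair of maps is built three times (tweets, retweets, mentions), the repetition could be factored into a small helper. The blocks above are what actually produced the figures; this is just a sketch, and it assumes a ggplot2 version that supports the .data pronoun (3.0 or newer; with older versions, aes_string() would do the same job):

#sketch of a helper that rebuilds the state map + South Florida inset for any count column
district_maps <- function(fill_var, df = fldistm) {
  base <- ggplot(df, aes(long, lat, group = group)) +
    geom_polygon(fill = "light gray") +
    geom_polygon(aes(fill = .data[[fill_var]]), color = "white") +
    scale_fill_viridis(option = "plasma") +
    theme_map()
  g1 <- base + coord_map() +
    theme(legend.position = "right", legend.text = element_text(size = 20)) +
    guides(fill = guide_colorbar(title.theme = element_text(size = 20, angle = 0)))
  g2 <- base + coord_map(ylim = c(25, 27), xlim = c(-82, -80)) +
    theme(legend.position = "none") +
    annotate("rect", xmin = -82, xmax = -80, ymin = 25, ymax = 27,
             size = 1, color = "black", alpha = 0)
  print(g1)
  print(g2, vp = viewport(width = 0.6, height = 0.6, x = .3, y = .3))
}
#district_maps("tweets"); district_maps("retweets"); district_maps("mentions")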

Across all of our measures of Twitter activity, the action is concentrated in the South Florida/Everglades region. It's also interesting that, even though we included Palm Beach, Orange, and Seminole Counties, the representatives from those districts weren't very active at all, despite Zika being at their doorstep.

We used Gephi to analyze the social network of our actors. You can make the network with igraph and then write it as a Gephi .gexf file with rgexf.

df_net <- rubmaster[!is.na(rubmaster$rt_screen_name), ]

edges <- data.frame(from=df_net$screen_name, to = df_net$rt_screen_name, stringsAsFactors = F) %>%
  group_by(from,to) %>% dplyr::summarize(value = n())

nodes <- data.frame(id = unique(c(edges$from, edges$to)),
                    label = unique(c(edges$from, edges$to)),
                    stringsAsFactors = F) %>% tbl_df

rt_graph <- make_empty_graph() + vertices(nodes$id) + edges(as.vector(rbind(edges$from, edges$to)), weight = edges$value)

rg.gexf <- igraph.to.gexf(rt_graph)
f <- file("rubmasterretweets.gexf")
writeLines(rg.gexf$graph, con = f)
close(f)
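
Before handing the network off to Gephi, igraph itself can give a rough sense of which accounts were influential in the retweet network. A quick sketch using the rt_graph object built above, where weighted in-degree counts how often an account was retweeted:

#quick influence check in R before moving to Gephi
indeg <- strength(rt_graph, mode = "in", weights = E(rt_graph)$weight)
pr    <- page_rank(rt_graph, weights = E(rt_graph)$weight)$vector
head(sort(indeg, decreasing = TRUE), 10)   #most-retweeted accounts
head(sort(pr, decreasing = TRUE), 10)      #highest PageRank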