Introduction

From August 25, 2016 to September 21, 2016, we collected tweets related to the Zika outbreak. The data fed several papers on the use of multimedia in tweets, government Twitter interactions during times of crisis, and identifying influential actors in propagating health information. Because the data served so many purposes, this notebook is a quick retrospective of some of what I did rather than a comprehensive log of everything.

This is also my first R notebook, so some formatting quirks may arise when I knit to HTML.

Libraries

library(plyr)
library(tidyverse)
library(ggthemes)
library(viridis)
library(Cairo)
library(igraph)
library(rgexf)
library(maptools)
library(rgeos)
library(rgdal)
library(broom)
library(grid)
library(gridExtra)
library(twitteR)
library(rjson)
library(httr)

Importing and Cleanup

Our collected tweets are stored, one day per file, in 28 .json files. We run our parsing function on each file and then bind the results into a single data frame.

#initial R script to parse 
#streaming twitter API
#Thomas Keller
#thomas.e.keller@gmail.com

#code and parser based on streamR's parseTweets function
#https://github.com/pablobarbera/streamR/blob/master/streamR/R/parseTweets.R

#helper function not meant for the light of day
#breaks out list columns of tweets

unlistWithNA <- function(lst, field){
  # depth-1 field (e.g. 'id_str')
  if (length(field)==1){
    notnulls <- unlist(lapply(lst, function(x) !is.null(x[[field]])))
    vect <- rep(NA, length(lst))
    vect[notnulls] <- unlist(lapply(lst[notnulls], '[[', field))
  }
  # depth-2 field (e.g. c('user', 'screen_name'))
  if (length(field)==2){
    notnulls <- unlist(lapply(lst, function(x) !is.null(x[[field[1]]][[field[2]]])))
    vect <- rep(NA, length(lst))
    vect[notnulls] <- unlist(lapply(lst[notnulls], function(x) x[[field[1]]][[field[2]]]))
  }
  # depth-3 field, except 'geo', whose last element is a numeric index
  if (length(field)==3 && field[1]!="geo"){
    notnulls <- unlist(lapply(lst, function(x) !is.null(x[[field[1]]][[field[2]]][[field[3]]])))
    vect <- rep(NA, length(lst))
    vect[notnulls] <- unlist(lapply(lst[notnulls], function(x) x[[field[1]]][[field[2]]][[field[3]]]))
  }
  # geo coordinates: third element is a numeric position, not a name
  if (field[1]=="geo"){
    notnulls <- unlist(lapply(lst, function(x) !is.null(x[[field[1]]][[field[2]]])))
    vect <- rep(NA, length(lst))
    vect[notnulls] <- unlist(lapply(lst[notnulls], function(x) x[[field[1]]][[field[2]]][[as.numeric(field[3])]]))
  }
  
  # depth-4 field, fully named
  if (length(field)==4 && field[2]!="urls"){
    notnulls <- unlist(lapply(lst, function(x) length(x[[field[1]]][[field[2]]][[field[3]]][[field[4]]])>0))
    vect <- rep(NA, length(lst))
    vect[notnulls] <- unlist(lapply(lst[notnulls], function(x) x[[field[1]]][[field[2]]][[field[3]]][[field[4]]]))
  }
  # urls entity: third element is a numeric position into the url list
  if (length(field)==4 && field[2]=="urls"){
    notnulls <- unlist(lapply(lst, function(x) length(x[[field[1]]][[field[2]]])>0))
    vect <- rep(NA, length(lst))
    vect[notnulls] <- unlist(lapply(lst[notnulls], function(x) x[[field[1]]][[field[2]]][[as.numeric(field[3])]][[field[4]]]))
  }
  # place bounding box: last three elements index into nested coordinate arrays
  if (length(field)==6 && field[2]=="bounding_box"){
    notnulls <- unlist(lapply(lst, function(x) length(x[[field[1]]][[field[2]]])>0))
    vect <- rep(NA, length(lst))
    vect[notnulls] <- unlist(lapply(lst[notnulls], function(x) 
      x[[field[1]]][[field[2]]][[field[3]]][[as.numeric(field[4])]][[as.numeric(field[5])]][[as.numeric(field[6])]]))
  }
  return(vect)
}
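
To make the behavior concrete, here's a toy example (hypothetical data, not from the collection): a field present in one element and missing in another comes back as a vector with NA filling the gap.

toy <- list(
  list(user = list(screen_name = "alice", followers_count = 50)),
  list(user = list(followers_count = 10)) #no screen_name
)
unlistWithNA(toy, c('user', 'screen_name'))
## [1] "alice" NA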

#helper functions to parse the entity fields that are themselves lists
#(user mentions, hashtags, media); each list is collapsed into a single
#string with ";" as a separator
parse_user=function(user_mentions){
  num_mention=length(user_mentions)
  if(num_mention==0){return(NA)}
  else if(num_mention==1){
    return(user_mentions[[1]]$screen_name) 
  }
  else{return(paste(sapply(1:length(user_mentions),function(x) user_mentions[[x]]$screen_name),collapse=';'))}
}

parse_id=function(user_mentions){
  num_mention=length(user_mentions)
  if(num_mention==0){return(NA)}
  else if(num_mention==1){
    return(user_mentions[[1]]$id_str) 
  }
  else{return(paste(sapply(1:length(user_mentions),function(x)
    user_mentions[[x]]$id_str),collapse=';'))}
}

parse_hash=function(hashtags){
  num_hash=length(hashtags)
  if(num_hash==0){return(NA)}
  else if(num_hash==1){
    return(hashtags[[1]]$text) 
  }
  else{return(paste(sapply(1:length(hashtags),function(x)
    hashtags[[x]]$text),collapse=';'))}
}

parse_media_type=function(media){
  num_media=length(media)
  if(num_media==0){return(NA)}
  else if(num_media==1){
    return(media[[1]]$type) 
  }
  else{return(paste(sapply(1:length(media),function(x)
    media[[x]]$type),collapse=';'))}
}

parse_media_url=function(media){
  num_media=length(media)
  if(num_media==0){return(NA)}
  else if(num_media==1){
    return(media[[1]]$expanded_url) 
  }
  else{return(paste(sapply(1:length(media),function(x)
    media[[x]]$expanded_url),collapse=';'))}
}
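
A quick sanity check of the collapsing behavior on hypothetical entity fragments:

parse_hash(list()) #no hashtags
## [1] NA
parse_hash(list(list(text = "Zika"), list(text = "health")))
## [1] "Zika;health"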

#########
# main parser code
#########


library(streamR)

parseTweets_mod=function(jsonfile){
  tweet_list=readTweets(jsonfile)
  tot_tweet=length(tweet_list)
  # light spam filter: keep English tweets from accounts with at least
  # 25 followers and at least 100 friends
  tweet_list=lapply(tweet_list, function(x) if(x$lang=='en' & x$user$followers_count>=25 & x$user$friends_count>=100) return(x))
  tweet_list=tweet_list[!sapply(tweet_list,is.null)] # culls out the nulled elements
  #retweet=lapply(tweet_list,function(x) if(is.null(x$retweeted_status)==TRUE) return(NULL) else(return(x)))
  #retweet=retweet[!sapply(retweet,is.null)]
  #print(paste('After simple spam filtering',length(tweet_list),'tweets remain', 100*length(tweet_list)/tot_tweet,'%'))
  # approximate a point location by averaging opposite corners of the
  # place bounding box
  place_lat_1 = unlistWithNA(tweet_list, c('place', 'bounding_box', 'coordinates', 1, 1, 2))
  place_lat_2 = unlistWithNA(tweet_list, c('place', 'bounding_box', 'coordinates', 1, 2, 2)) 
  place_lat = sapply(1:length(tweet_list), function(x) 
    mean(c(place_lat_1[x], place_lat_2[x]), na.rm=TRUE))
  place_lon_1 = unlistWithNA(tweet_list, c('place', 'bounding_box', 'coordinates', 1, 1, 1))
  place_lon_2 = unlistWithNA(tweet_list, c('place', 'bounding_box', 'coordinates', 1, 3, 1))
  place_lon = sapply(1:length(tweet_list), function(x) 
    mean(c(place_lon_1[x], place_lon_2[x]), na.rm=TRUE))
  
  mentions=lapply(1:length(tweet_list),function(x) tweet_list[[x]]$entities$user_mentions)
  hashes=lapply(1:length(tweet_list),function(x) tweet_list[[x]]$entities$hashtags)
  media=lapply(1:length(tweet_list),function(x) tweet_list[[x]]$entities$media)
  parsed_user=sapply(1:length(mentions), function(x) parse_user(mentions[[x]]))
  parsed_id=sapply(1:length(mentions), function(x) parse_id(mentions[[x]]))
  parsed_hash=sapply(1:length(hashes),function(x) parse_hash(hashes[[x]]))
  parsed_media_type=sapply(1:length(media),function(x) parse_media_type(media[[x]]))
  parsed_media_url=sapply(1:length(media),function(x) parse_media_url(media[[x]]))
  text=sapply(tweet_list, function(x) ifelse(is.null(x$retweeted_status), x$text, x$retweeted_status$text))
  timestamp_ms = unlistWithNA(tweet_list, 'timestamp_ms')
  datetime =as.POSIXct(as.numeric(as.character(timestamp_ms))/1000, origin='1970-01-01',tz='EST')
  df=data.frame(
    timestamp_ms = timestamp_ms,
    datetime=datetime,
    tweet_id = unlistWithNA(tweet_list, "id_str"),
    text=iconv(text,to='UTF-8', sub = "byte"), 
    retweet_count = unlistWithNA(tweet_list, c('retweeted_status','retweet_count')),
    favorite_count = unlistWithNA(tweet_list, c('retweeted_status','favorite_count')),
    expanded_url = unlistWithNA(tweet_list, c('entities', 'urls', 1, 'expanded_url')),
    friends_count = unlistWithNA(tweet_list, c('user', 'friends_count')),
    screen_name = unlistWithNA(tweet_list, c('user', 'screen_name')),
    user_id_str = unlistWithNA(tweet_list, c('user', 'id_str')),
    in_reply_to_screen_name = unlistWithNA(tweet_list, 'in_reply_to_screen_name'),
    in_reply_to_user_id = unlistWithNA(tweet_list, 'in_reply_to_user_id_str'),
    rt_screen_name = unlistWithNA(tweet_list,c('retweeted_status','user','screen_name')),
    rt_screen_id = unlistWithNA(tweet_list,c('retweeted_status','user','id_str')),
    country = unlistWithNA(tweet_list, c('place', 'country')),
    full_name = unlistWithNA(tweet_list, c('place', 'full_name')),
    followers_count = unlistWithNA(tweet_list, c('user', 'followers_count')),
    #place_lat_1 = unlistWithNA(tweet_list, c('place', 'bounding_box', 'coordinates', 1, 1, 2)),
    #place_lat_2 = unlistWithNA(tweet_list, c('place', 'bounding_box', 'coordinates', 1, 2, 2)),
    place_lat = sapply(1:length(tweet_list), function(x) 
      mean(c(place_lat_1[x], place_lat_2[x]), na.rm=TRUE)),
    #place_lon_1 = unlistWithNA(tweet_list, c('place', 'bounding_box', 'coordinates', 1, 1, 1)),
    #place_lon_2 = unlistWithNA(tweet_list, c('place', 'bounding_box', 'coordinates', 1, 3, 1)),
    place_lon = sapply(1:length(tweet_list), function(x) 
      mean(c(place_lon_1[x], place_lon_2[x]), na.rm=TRUE)),
    lat = unlistWithNA(tweet_list, c('geo', 'coordinates', 1)),
    lon = unlistWithNA(tweet_list, c('geo', 'coordinates', 2)),
    mentioned_users=parsed_user,
    mentioned_id=parsed_id,
    hashes=parsed_hash,
    parsed_media_type=parsed_media_type,
    parsed_media_url=parsed_media_url,
    stringsAsFactors=F
  )
  return(df)
}
#import a list of directories for the .jsons
dirs <- list.dirs('~/zika/USF-Zika-Research/raw month', recursive=FALSE)
#append the file to the end of each directory
dirs <- paste(dirs, "/data.txt", sep="")
#apply the parse function to the list of files
parsed30d <- suppressMessages(lapply(dirs, parseTweets_mod))
## 14976 tweets have been parsed. 
## 61173 tweets have been parsed. 
## 42643 tweets have been parsed. 
## 44454 tweets have been parsed. 
## 56285 tweets have been parsed. 
## 58710 tweets have been parsed. 
## 60179 tweets have been parsed. 
## 141317 tweets have been parsed. 
## 102067 tweets have been parsed. 
## 61375 tweets have been parsed. 
## 43629 tweets have been parsed. 
## 43652 tweets have been parsed. 
## 94657 tweets have been parsed. 
## 97476 tweets have been parsed. 
## 84537 tweets have been parsed. 
## 50115 tweets have been parsed. 
## 28432 tweets have been parsed. 
## 27403 tweets have been parsed. 
## 38355 tweets have been parsed. 
## 36657 tweets have been parsed. 
## 32600 tweets have been parsed. 
## 29084 tweets have been parsed. 
## 25304 tweets have been parsed. 
## 20421 tweets have been parsed. 
## 13555 tweets have been parsed. 
## 26237 tweets have been parsed. 
## 21674 tweets have been parsed. 
## 6349 tweets have been parsed.
#turns the list of data frames into one data frame
parsed30d <- bind_rows(parsed30d)

The data has a few quirks that need cleaning up. Retweet and favorite counts for tweets that have never been retweeted or favorited come through as NA and need to be set to 0. There are also a number of tweets with duplicate IDs, which we toss. Finally, we add a cleaner date field to make it easy to group tweets by day.

#turning NAs to 0s for favorite count and retweet count
parsed30d$retweet_count[is.na(parsed30d$retweet_count)] <- 0
parsed30d$favorite_count[is.na(parsed30d$favorite_count)] <- 0

#get rid of duplicate tweet_ids (266 of them)
#note: count() here is dplyr::count (tidyverse masks plyr), which puts
#the tally in a column named n
dup30 <- count(parsed30d, tweet_id)
dup30 <- dup30[dup30$n > 1, ]
parsed30d <- parsed30d[!(parsed30d$tweet_id %in% dup30$tweet_id), ]
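
If you would rather keep one copy of each duplicated ID instead of dropping every copy, dplyr's distinct() is a one-line alternative (a sketch of a different design choice, not what was run here):

#alternative (not run): keep the first row seen for each tweet_id
#parsed30d <- distinct(parsed30d, tweet_id, .keep_all = TRUE)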

#cleaning up the date field
parsed30d$cleandate <- strptime(parsed30d$datetime, "%Y-%m-%d %H:%M:%S")
parsed30d$cleandate <- format(parsed30d$cleandate, "%Y-%m-%d")
parsed30d$cleandate <- as.Date(parsed30d$cleandate)
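
Since datetime is already POSIXct, the same result should be obtainable in one step; note that as.Date() defaults to UTC, so the timezone has to be passed explicitly (a sketch, not what was run):

#one-step alternative (not run):
#parsed30d$cleandate <- as.Date(parsed30d$datetime, tz = "EST")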

A Quick Visual Exploration

Let’s take a quick look at the data and see if anything stands out.

flddates <- count(parsed30d, cleandate)
ggplot(flddates, aes(x = cleandate, y = n, group = 1)) +
  geom_line() +
  labs(x = "Date", y = "Number of Tweets") +
  ggtitle("Number of Tweets Per Day") +
  scale_x_date(date_labels = "%b %d", date_breaks = "1 day", expand = c(0, 0)) +
  theme_few() +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1, size = 12),
        axis.text.y = element_text(size = 12),
        panel.grid.major = element_line(color = "grey", size = .25))
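
Since Cairo is loaded, this is roughly how I'd export the figure with anti-aliased lines; the filename and dimensions below are placeholders, and type = "cairo" assumes a png() device built with cairo support:

#hypothetical export (not run) via the cairo PNG device
#ggsave("tweets_per_day.png", width = 10, height = 6, dpi = 300, type = "cairo")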