Real-time Twitter Sentiment Analysis

StreamingTwitterSentimentAnalysis

Real-time Sentiment Analysis of Twitter Data

In a previous post, I demonstrated how to build a text classification model for sentiment analysis. In this post, I apply that model to extract real-time sentiment from Twitter data based on a search term.

Motivation

Social media has become a valuable tool for anyone hoping to understand public sentiment on any current topic. Social media platforms such as Twitter, Facebook, and Instagram can provide a unique and unfiltered glimpse into what a target demographic has to say about a topic or product. Whether one is a campaign manager, social activist, or company brand manager, being able to monitor public sentiment in real time means that one can quickly react to events. In this post, I will be applying a text classification model to Twitter data, using the streaming API, to obtain real-time sentiment on a topic.

In [1]:
import csv
import json

import numpy as np
import pandas as pd
import re #for regex
import pickle
import dill

import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.stem import PorterStemmer
from nltk.util import ngrams

import tweepy #https://github.com/tweepy/tweepy
from tweepy import Stream
from tweepy import OAuthHandler
from tweepy.streaming import StreamListener

import MySQLdb
import time
import datetime

import processtweet
import credentials as credentials
In [2]:
class TweetStreamListener(StreamListener):
    """
    Handle the tweets received from the stream.

    Each non-retweet status is classified with the ``processtweet`` model,
    tokenized and written to MySQL through the connection passed in:

    * ``TwitterData``    -- rolling window of at most ``num_tweets`` tweets
    * ``TwitterCurrent`` -- positive/negative counts per ~1 minute interval
    * ``TwitterTrend``   -- positive/negative totals over ``TwitterData``,
                            sampled roughly once per second
    """
    def __init__(self, num_tweets, conn, api=None):
        """
        Prepare the database tables and reset all counters.

        num_tweets -- maximum number of rows kept in TwitterData
        conn       -- open DB-API connection (created with MySQLdb in main)
        api        -- optional tweepy API object, forwarded to StreamListener
        """
        self.num_tweets = num_tweets
        self.textProcessing = processtweet.text_processing()
        self.conn = conn
        self.cursor = self.conn.cursor()

        # Drop any tables left over from a previous run and recreate them,
        # so every run starts from empty tables.
        self.cursor.execute("DROP TABLE IF EXISTS TwitterData CASCADE")
        self.cursor.execute("DROP TABLE IF EXISTS TwitterTrend CASCADE")
        self.cursor.execute("DROP TABLE IF EXISTS TwitterCurrent CASCADE")
        self.cursor.execute("CREATE TABLE TwitterData (id SERIAL PRIMARY KEY, Sentiment INT(1),"
                             "DateTime VARCHAR(20), Tokens TEXT, Tweet VARCHAR(200))")
        self.cursor.execute("CREATE TABLE TwitterTrend (id SERIAL PRIMARY KEY,"
                            "Pos_Sentiment VARCHAR(400), Neg_Sentiment VARCHAR(400), DateTime VARCHAR(40))")
        self.cursor.execute("CREATE TABLE TwitterCurrent (id SERIAL PRIMARY KEY, Pos VARCHAR(40), Neg VARCHAR(40),"
                             "DateTime VARCHAR(20))")

        # start_time paces the ~1 s TwitterTrend samples,
        # start_time_long paces the ~60 s TwitterCurrent rows.
        self.start_time = datetime.datetime.utcnow()
        self.start_time_long = self.start_time
        self.pos_count = 0
        self.neg_count = 0
        self.count = 0
        # Forward api instead of silently discarding it.
        super(TweetStreamListener, self).__init__(api)

    def on_status(self, status):
        """
        Classify one incoming status and record it in the database.

        Retweets are skipped. Always returns True so that a single bad
        tweet or failed query does not stop the stream.
        NOTE(review): assumes tweepy only disconnects when a handler
        returns False -- confirm for the installed tweepy version.
        """
        try:
            text = status.text

            # Exclude retweets since they will skew results. The attribute
            # check must be made on the status object, not on its text.
            if hasattr(status, 'retweeted_status') or 'RT @' in text:
                return True

            tweet = text.encode("utf-8")

            # Note: Tweepy returns a time object set to UTC time
            # see: http://timbueno.com/
            time_stamp = status.created_at

            # The classifier is called with a list holding the one tweet.
            sentiment = self.textProcessing.tweetClassifier([tweet])
            token = self.textProcessing.getTokens(tweet)

            current_time = datetime.datetime.utcnow()

            # A label of 4 is counted as positive, anything else as negative.
            if sentiment == 4:
                self.pos_count += 1
            else:
                self.neg_count += 1

            self._flush_interval_counts(current_time)
            self._record_trend(current_time)
            self._store_tweet(sentiment, time_stamp, token, tweet)
            return True

        except Exception as exc:
            # Report the failure and keep streaming; the previous handler
            # referenced names that are not defined in this file and so
            # raised NameError instead of reporting anything.
            print("Error: %s: %s" % (type(exc).__name__, exc))
            return True

    def _flush_interval_counts(self, current_time):
        """Write the running pos/neg counts to TwitterCurrent once more than 60 s have passed, then reset them."""
        if current_time - self.start_time_long > datetime.timedelta(seconds=60):
            self.start_time_long = current_time
            self.cursor.execute("INSERT INTO TwitterCurrent (Pos, Neg, DateTime) "
                                "VALUES (%s, %s, %s)", (self.pos_count, self.neg_count, current_time,))
            self.pos_count = 0
            self.neg_count = 0

    def _record_trend(self, current_time):
        """Sample the pos/neg totals of TwitterData into TwitterTrend, at most about once per second."""
        # Only sample once more than 4 tweets have been stored.
        if current_time - self.start_time > datetime.timedelta(seconds=1) and self.count > 4:
            self.start_time = current_time

            self.cursor.execute("SELECT COUNT(Sentiment) FROM TwitterData "
                                "WHERE Sentiment = 4")
            count_pos = int(self.cursor.fetchone()[0])
            self.cursor.execute("SELECT COUNT(Sentiment) FROM TwitterData "
                                "WHERE Sentiment = 0")
            # The negative total is stored with a negative sign.
            count_neg = -int(self.cursor.fetchone()[0])

            self.cursor.execute("INSERT INTO TwitterTrend (Pos_Sentiment, Neg_Sentiment, DateTime) "
                                "VALUES (%s, %s, %s)", (count_pos, count_neg, current_time,))
            print("%s %s %s" % (self.start_time, count_pos, count_neg))

    def _store_tweet(self, sentiment, time_stamp, token, tweet):
        """Insert the tweet into TwitterData, evicting the oldest row once num_tweets rows have been written."""
        if self.count < self.num_tweets:
            self.count += 1
        else:
            # Table is full: delete the row with the smallest id first.
            self.cursor.execute("DELETE FROM TwitterData "
                                "WHERE id IN "
                                "(SELECT id FROM "
                                "(SELECT id, Tweet FROM TwitterData ORDER BY id ASC LIMIT 1) AS t)")
        self.cursor.execute("INSERT INTO TwitterData (Sentiment, "
                            "DateTime, Tokens, Tweet) VALUES (%s, %s, %s, %s)",
                            (sentiment, time_stamp, str(token), tweet,))
        # One commit covers the optional delete, the insert, and any
        # TwitterCurrent / TwitterTrend rows written for this status.
        self.conn.commit()

    def on_error(self, status):
        """
        Print the error status code; return False when it is 420.

        NOTE(review): 420 is presumably Twitter's rate-limit response and
        returning False presumably makes tweepy disconnect -- confirm
        against the tweepy streaming documentation.
        """
        print(status)
        if status == 420:
            return False
        return True
In [ ]:
class connect_API():
    """
    Twitter API access: holds the OAuth handler and starts filtered streams.
    """

    def __init__(self, num_tweets):
        """
        Build the OAuth handler from the keys stored in credentials.login.

        num_tweets is kept and later handed to the stream listener.
        """
        login = credentials.login
        consumer_pair = (login['consumer_key'], login['consumer_secret'])
        access_pair = (login['access_key'], login['access_secret'])

        handler = OAuthHandler(*consumer_pair)
        handler.set_access_token(*access_pair)

        self.auth = handler
        self.num_tweets = num_tweets

    def stream_data(self, search_item, conn):
        """
        Open a stream filtered on search_item.

        Incoming statuses are handled by a TweetStreamListener that is
        given the database connection conn.
        """
        listener = TweetStreamListener(num_tweets=self.num_tweets, conn=conn)
        stream = tweepy.Stream(self.auth, listener=listener)
        stream.filter(track=[search_item])
In [ ]:
def main(num_tweets=2000, search_item="obama"):
    """
    Connect to MySQL and stream tweets matching search_item into it.

    num_tweets  -- number of tweets to be written to the db
    search_item -- the search term to send to the streaming API

    Connection settings are read from credentials.database. Errors other
    than KeyboardInterrupt / SystemExit are printed rather than raised,
    and the connection is closed on the way out if it was opened.
    """
    # db connection credentials
    settings = credentials.database
    host = settings['host']
    user = settings['user']
    passwd = settings['passwd']
    unix_socket = settings['unix_socket']
    db = settings['db']

    # Stays None if connect() fails, so the finally clause below does not
    # raise NameError and hide the real connection error.
    conn = None
    try:
        conn = MySQLdb.connect(host=host, user=user, passwd=passwd,
                               unix_socket=unix_socket, db=db)
        run_streaming = connect_API(num_tweets)
        run_streaming.stream_data(search_item, conn)

    except (KeyboardInterrupt, SystemExit):
        raise

    except Exception as e:
        # Print the actual error, not the exception class's docstring.
        print("%s: %s" % (type(e).__name__, e))

    finally:
        if conn is not None:
            conn.close()


if __name__ == "__main__":
    main()
2016-10-13 19:07:05.217021 4 -1
2016-10-13 19:07:06.276141 18 -3
2016-10-13 19:07:07.326413 31 -8
2016-10-13 19:07:08.407842 38 -12
2016-10-13 19:07:09.417260 40 -15
2016-10-13 19:07:10.718208 42 -16
2016-10-13 19:07:11.841328 49 -17
2016-10-13 19:07:13.591763 52 -19
2016-10-13 19:07:14.716295 56 -20
2016-10-13 19:07:16.759291 57 -20
2016-10-13 19:07:17.819179 62 -21
2016-10-13 19:07:18.825260 67 -22
2016-10-13 19:07:19.991666 73 -22
2016-10-13 19:07:20.991949 79 -24
2016-10-13 19:07:22.352758 84 -27
2016-10-13 19:07:23.710001 92 -30
2016-10-13 19:07:24.769069 97 -31
2016-10-13 19:07:26.201974 104 -32
2016-10-13 19:07:27.218048 108 -34
2016-10-13 19:07:28.271075 116 -37
2016-10-13 19:07:29.451550 125 -37
2016-10-13 19:07:30.536801 132 -40
2016-10-13 19:07:31.579873 140 -42
2016-10-13 19:07:32.694604 145 -45
2016-10-13 19:07:33.742261 149 -45
2016-10-13 19:07:34.907263 156 -47
2016-10-13 19:07:36.057784 164 -49
2016-10-13 19:07:37.154289 170 -50
2016-10-13 19:07:38.436498 180 -50
2016-10-13 19:07:39.595011 183 -50
2016-10-13 19:07:40.616528 194 -51
2016-10-13 19:07:42.505184 197 -53
2016-10-13 19:07:43.569222 203 -54
2016-10-13 19:07:44.656036 205 -57
2016-10-13 19:07:45.686363 210 -61
2016-10-13 19:07:46.968124 214 -62
2016-10-13 19:07:48.489277 221 -62
2016-10-13 19:07:49.674734 226 -63
2016-10-13 19:07:50.768975 230 -63
2016-10-13 19:07:51.810154 235 -63
2016-10-13 19:07:52.998976 238 -66
2016-10-13 19:07:54.115473 242 -67
2016-10-13 19:07:55.187661 250 -72
2016-10-13 19:07:56.261770 257 -74
2016-10-13 19:07:57.639957 261 -74
2016-10-13 19:07:58.684416 265 -76
2016-10-13 19:08:00.096672 271 -78
2016-10-13 19:08:01.349201 278 -79
2016-10-13 19:08:02.374861 283 -79
2016-10-13 19:08:03.507031 285 -80
2016-10-13 19:08:04.581102 290 -82
2016-10-13 19:08:05.741261 297 -86
2016-10-13 19:08:06.761310 303 -89
2016-10-13 19:08:07.808877 306 -91
2016-10-13 19:08:08.814531 312 -93
2016-10-13 19:08:09.972340 319 -95
2016-10-13 19:08:11.172185 322 -99
2016-10-13 19:08:12.741405 328 -100
2016-10-13 19:08:13.788279 331 -102
2016-10-13 19:08:14.861790 334 -102
2016-10-13 19:08:15.959216 339 -103
2016-10-13 19:08:16.971495 345 -104
2016-10-13 19:08:18.015534 352 -108
2016-10-13 19:08:19.844162 357 -109
2016-10-13 19:08:20.854757 359 -110
2016-10-13 19:08:22.010336 363 -111
2016-10-13 19:08:23.670249 368 -114
2016-10-13 19:08:24.791014 372 -114
2016-10-13 19:08:25.791167 376 -116
2016-10-13 19:08:27.319157 379 -118
2016-10-13 19:08:28.451135 384 -120
2016-10-13 19:08:29.493726 386 -121
2016-10-13 19:08:30.759477 389 -123
2016-10-13 19:08:32.018179 393 -124
2016-10-13 19:08:33.154799 398 -124
2016-10-13 19:08:34.156642 403 -125
2016-10-13 19:08:35.170247 406 -125
2016-10-13 19:08:36.244397 410 -127
2016-10-13 19:08:37.529974 413 -127
2016-10-13 19:08:38.576283 419 -129
2016-10-13 19:08:39.701382 425 -130
2016-10-13 19:08:40.806761 428 -131
2016-10-13 19:08:41.808694 433 -132
2016-10-13 19:08:43.153894


We can then use the matplotlib animation module to view real-time plots of the twitter data. This is discussed in the next post.

-->