The goal of this project is to see the overall sentiment of tweets containing the word "lucy" and to make a wordcloud for the positive tweets and the negative tweets.
Import packages
from tweepy import Stream
from tweepy import OAuthHandler
from tweepy.streaming import StreamListener
import pandas as pd
import json
from textblob import TextBlob
Stream in 6000 tweets containing the word 'lucy' using tweepy and put the tweets into a dataframe.
#https://github.com/shreyans29/thesemicolon/blob/master/livesenti.py
# Running tally of tweets received from the stream (incremented by the listener).
count = 0

# Empty frame the listener fills one row at a time; 'senti' stays blank until
# the sentiment scores are computed after streaming.
df_tweets = pd.DataFrame(
    columns=['count', 'tweet', 'senti'],
)
class listener(StreamListener):
    """Stream listener that collects tweet texts into the global ``df_tweets``.

    Every tweet received increments the global ``count`` and adds one row
    (``count``, stripped tweet text) to ``df_tweets``. Once ``MAX_TWEETS``
    tweets have been collected, the frame is written to ``OUTPUT_CSV`` and the
    stream is told to stop.
    """

    MAX_TWEETS = 6000            # stop streaming after this many tweets
    OUTPUT_CSV = 'df_tweet.csv'  # where the collected tweets are saved
    PROGRESS_EVERY = 50          # print the running count every N tweets

    def on_data(self, data):
        """Handle one raw message delivered by the stream.

        Args:
            data: JSON-encoded message string.

        Returns:
            False once ``MAX_TWEETS`` tweets have been saved (which stops the
            stream), True otherwise.
        """
        global count
        global df_tweets

        all_data = json.loads(data)

        # Messages without a "text" field cannot be stored as a tweet; skip
        # them (without counting them) instead of crashing with a KeyError.
        tweet = all_data.get("text")
        if tweet is None:
            return True
        tweet = tweet.strip()

        count += 1
        if count % self.PROGRESS_EVERY == 0:
            print(count)

        # DataFrame.append was removed in pandas 2.0; pd.concat is the
        # supported way to add the new row.
        new_row = pd.DataFrame({'count': [count], 'tweet': [tweet]})
        df_tweets = pd.concat([df_tweets, new_row], ignore_index=True)

        if count >= self.MAX_TWEETS:
            df_tweets.to_csv(self.OUTPUT_CSV)
            return False
        return True

    def on_error(self, status):
        """Report an error status from the stream.

        Args:
            status: status code handed over by tweepy.

        Returns:
            False for status 420 so the stream disconnects rather than
            reconnecting while rate limited; None for every other status.
        """
        print(status)
        # Per the tweepy streaming docs, returning False disconnects the
        # stream; repeated reconnects after a 420 lengthen the rate-limit wait.
        if status == 420:
            return False
        return None
import os

# SECURITY: the Twitter API credentials used to be hard-coded here. Secrets
# must not live in source files; they are now read from environment variables.
# Any keys that were previously committed should be revoked and regenerated.
atoken = os.environ["TWITTER_ACCESS_TOKEN"]
asecret = os.environ["TWITTER_ACCESS_TOKEN_SECRET"]
ckey = os.environ["TWITTER_CONSUMER_KEY"]
csecret = os.environ["TWITTER_CONSUMER_SECRET"]

auth = OAuthHandler(ckey, csecret)
auth.set_access_token(atoken, asecret)

# The listener takes no configuration; the tweet counter is a module-level
# global, so it is not passed to the constructor.
twitterStream = Stream(auth, listener())

# Track the word this project is about ('lucy'), English tweets only.
twitterStream.filter(track=["lucy"], languages=['en'])
# Reload the tweets saved by the stream listener. The first CSV column is the
# index written out by DataFrame.to_csv, so read it back as the index instead
# of letting it appear as a stray "Unnamed: 0" data column.
df_tweets = pd.read_csv('df_tweet.csv', index_col=0)
Remove the characters that cannot be represented in ASCII (for example, emoji) from the tweets.
# Drop every non-ASCII character (emoji, accented letters, ...) from the tweets.
# Decode straight back to str: on Python 3, .encode() alone leaves bytes, and a
# later str() on bytes yields "b'...'" literals instead of the tweet text.
# fillna('') guards against empty CSV fields, which read_csv loads as NaN.
df_tweets.tweet = df_tweets.tweet.fillna('').map(
    lambda x: x.encode('ascii', errors='ignore').decode('ascii')
)
Turn all the tweets into lower case.
# Normalise case so that the same word is not counted under several spellings.
df_tweets['tweet'] = df_tweets['tweet'].apply(lambda text: text.lower())
Look at the first five tweets to see that they have been correctly modified.
# Preview the first five cleaned tweets.
df_tweets['tweet'].head(5)
Find the sentiments of each tweet and turn that into a column of our dataframe.
def _polarity(text):
    """Return TextBlob's polarity score for ``text`` (coerced to ``str`` first)."""
    return TextBlob(str(text)).sentiment.polarity


# Score every tweet and store the result in the 'senti' column.
df_tweets['senti'] = df_tweets['tweet'].apply(_polarity)
df_tweets.head(5)
Find the overall sentiment score.
# Overall score: the sum of the per-tweet polarity values.
df_tweets['senti'].sum()
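As we can see, the overall sentiment score is about 170. Since each tweet's polarity lies between -1 and 1, a sum of 170 over 6000 tweets is an average of roughly 0.03 per tweet, so the sentiment towards the name or word 'lucy' is positive on balance, though only mildly so.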
Now, we will make two pandas Series of tweet text, one for the positive tweets and one for the negative tweets.
from wordcloud import WordCloud
import matplotlib.pyplot as plt
# Boolean masks: polarity above zero is positive, below zero is negative.
# Tweets with a polarity of exactly zero fall into neither group.
is_positive = df_tweets['senti'] > 0
is_negative = df_tweets['senti'] < 0

pos_tweets = df_tweets.loc[is_positive, 'tweet']
neg_tweets = df_tweets.loc[is_negative, 'tweet']
Print the average positive sentiment score and the average negative sentiment score.
# Mean polarity within each group: the group's total divided by its size.
positive_scores = df_tweets.loc[df_tweets['senti'] > 0, 'senti']
negative_scores = df_tweets.loc[df_tweets['senti'] < 0, 'senti']

print(positive_scores.sum() / len(positive_scores))
print(negative_scores.sum() / len(negative_scores))
Display the first five positive tweets.
Note: I decided to keep the retweets since they indicate the popularity of a tweet.
# Preview the first five positive tweets.
pos_tweets.head(5)
Display the first five negative tweets.
# Preview the first five negative tweets.
neg_tweets.head(5)
Create a wordcloud for the positive tweets.
# Join all positive tweets into one text and render it as a word cloud.
positive_text = ' '.join(pos_tweets.astype(str))
positive_cloud_builder = WordCloud(background_color='white')
wordcloud = positive_cloud_builder.generate(positive_text)

plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")  # hide the axes; only the image matters
plt.show()
Create a wordcloud for the negative tweets.
# Join all negative tweets into one text and render it as a word cloud.
negative_text = ' '.join(neg_tweets.astype(str))
negative_cloud_builder = WordCloud(background_color='white')
wordcloud2 = negative_cloud_builder.generate(negative_text)

plt.imshow(wordcloud2, interpolation='bilinear')
plt.axis("off")  # hide the axes; only the image matters
plt.show()