
sentiment analysis with twitter 03: building models to predict sentiment for twitter data from nltk

0. Introduction

This post introduces the classification models from sklearn that we will use to predict the sentiment of tweets.

import json
import sys
import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer

from sklearn.svm import SVC, LinearSVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier

Load the training data from nltk's twitter_samples corpus.

mypath = r'C:\Users\hsong01\nltk_data\corpora\twitter_samples\\'

# each line of the file is one tweet serialized as JSON
pos = []
for line in open(mypath + r'positive_tweets.json', 'r'):
    pos.append(json.loads(line))

neg = []
for line in open(mypath + r'negative_tweets.json', 'r'):
    neg.append(json.loads(line))
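
Alternatively, the same files can be read through nltk's corpus reader, which avoids hard-coding the path. A minimal sketch, assuming the corpus has been downloaded with nltk.download('twitter_samples'):

from nltk.corpus import twitter_samples

# nltk.download('twitter_samples')  # uncomment on first run
# docs() returns the full tweet dictionaries, one per tweet
pos = twitter_samples.docs('positive_tweets.json')
neg = twitter_samples.docs('negative_tweets.json')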

What the fields in a tweet mean:

text: the text of the tweet itself
created_at: the date of creation
favorite_count, retweet_count: the number of favourites and retweets
favorited, retweeted: booleans stating whether the authenticated user (you) has favourited or retweeted this tweet
lang: acronym for the language (e.g. en for English)
id: the tweet identifier
place, coordinates, geo: geo-location information if available
user: the author's full profile
entities: list of entities like URLs, @-mentions, hashtags and symbols
in_reply_to_user_id: user identifier if the tweet is a reply to a specific user
in_reply_to_status_id: status identifier if the tweet is a reply to a specific status
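
For example, a quick peek at the first positive tweet (exact values depend on your copy of the corpus):

print(pos[0]['text'])        # the raw tweet text
print(pos[0]['created_at'])  # when it was posted
print(pos[0]['lang'])        # language code, e.g. 'en'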

Organize the data into a pandas DataFrame for analysis:

colname = ["created_at", "favorite_count", "retweet_count", "id", "place", "coordinates", "geo", "text"]

# pull the selected fields out of each tweet dict
df0 = [[dic[c] for c in colname] for dic in pos]
df1 = [[dic[c] for c in colname] for dic in neg]

df0 = pd.DataFrame(df0, columns=colname)
df0["pn"] = 1    # positive tweets are labeled 1
df1 = pd.DataFrame(df1, columns=colname)
df1["pn"] = 0    # negative tweets are labeled 0
df = pd.concat([df0, df1], axis=0, ignore_index=True)
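
A quick sanity check on the combined frame; the expected counts assume the standard twitter_samples corpus of 5,000 tweets per class:

print(df.shape)               # expect (10000, 9): 8 tweet fields plus the pn label
print(df.pn.value_counts())   # expect 5000 ones and 5000 zeros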

1. frequency analysis

# count the frequency; min_df=2 drops words that appear in fewer than 2 tweets
count_vect = CountVectorizer(min_df=2)
df_freq_count = count_vect.fit_transform(df.text).toarray()

# the features (words in the tweets) and their column index in the freq matrix
for key in list(count_vect.vocabulary_)[:20]:
    print("the word is %s, ---- and the order is %s" % (key, count_vect.vocabulary_[key]))
print('\n' * 2)

# all the words in the tweets
print("%s %s " % (list(count_vect.get_feature_names_out()[:20]), '\n'))
print('\n' * 2)

# the frequency matrix (dense here after .toarray(); mostly zeros)
print(df_freq_count)
print('\n' * 2)
print("the shape of output matrix is " + str(df_freq_count.shape))
the word is raining, ---- and the order is 3526
the word is four, ---- and the order is 1691
the word is hanging, ---- and the order is 1926
the word is bringing, ---- and the order is 680
the word is wednesday, ---- and the order is 4722
the word is cooking, ---- and the order is 1001
the word is sobs, ---- and the order is 4005
the word is lgbt, ---- and the order is 2575
the word is kids, ---- and the order is 2416
the word is dnt, ---- and the order is 1256
the word is music, ---- and the order is 2973
the word is yahoo, ---- and the order is 4878
the word is want, ---- and the order is 4684
the word is absolute, ---- and the order is 128
the word is travel, ---- and the order is 4458
the word is feature, ---- and the order is 1576
the word is sehun, ---- and the order is 3809
the word is typed, ---- and the order is 4520
the word is wrong, ---- and the order is 4849
the word is italy, ---- and the order is 2255



['00', '000', '01', '01282', '0345', '07', '10', '100', '1000', '100k', '101', '10am', '11', '110', '11am', '12', '120', '12pm', '12th', '13']




[[0 0 0 ..., 0 0 0]
 [0 0 0 ..., 0 0 0]
 [0 0 0 ..., 0 0 0]
 ...,
 [0 0 0 ..., 0 0 0]
 [0 0 0 ..., 0 0 0]
 [0 0 0 ..., 0 0 0]]



the shape of output matrix is (10000, 4962)
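
Note that .toarray() materializes a dense 10000 x 4962 matrix; for larger corpora it is better to keep the sparse matrix returned by fit_transform. As a quick sketch of how to use the count matrix (the top-10 listing below is an illustration I've added, not output from the original run):

# column sums give each word's total count across all tweets
word_counts = np.asarray(df_freq_count.sum(axis=0)).ravel()
vocab = count_vect.get_feature_names_out()

# the ten most frequent words in the corpus
for i in np.argsort(word_counts)[::-1][:10]:
    print(vocab[i], word_counts[i])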

2. tf-idf

# min_df=3 drops words that appear in fewer than 3 tweets;
# use_idf=True down-weights words that are common across tweets
vectorizer = TfidfVectorizer(min_df=3, use_idf=True)
train_x = vectorizer.fit_transform(df.text)
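
With sklearn's default smooth idf, each term t is weighted by idf(t) = ln((1 + n) / (1 + df(t))) + 1, where n is the number of tweets and df(t) is the number of tweets containing t, so common words are down-weighted. The fitted weights can be inspected via the idf_ attribute; a small sketch for illustration:

idf = vectorizer.idf_
vocab = vectorizer.get_feature_names_out()
order = np.argsort(idf)
print("most common (lowest idf) words:", list(vocab[order[:10]]))
print("rarest (highest idf) words:", list(vocab[order[-10:]]))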

3. model

Build the models and test them on these two hand-written tweets (the first positive, the second negative):

test_text = ["today is a sunny day and there are lots of people playing in the park", "wow, it is really bad to miss your books"]

test_text = ["today is a sunny day and there are lots of people playing in the park", "wow, it is really bad to miss  your books"]
test_df = vectorizer.transform(test_text)

## naive bayes
clf_nb = MultinomialNB().fit(train_x, df.pn)
print(clf_nb.predict(test_df))

## SVC
clf_svc = SVC().fit(train_x, df.pn)
print(clf_svc.predict(test_df))

## linear SVC
clf_linsvc = LinearSVC().fit(train_x, df.pn)
print(clf_linsvc.predict(test_df))

## logistic regression
clf_logit = LogisticRegression().fit(train_x, df.pn)
print(clf_logit.predict(test_df))

## SGDClassifier
clf_sgd = SGDClassifier().fit(train_x, df.pn)
print(clf_sgd.predict(test_df))
[1 0]
[0 0]
[1 0]
[1 0]
[1 0]

As shown, Naive Bayes, LinearSVC, logistic regression, and SGDClassifier all predict both tweets correctly ([1 0]), while SVC with its default RBF kernel fails to identify the positive tweet.
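
Two hand-written tweets are a very small test set, so the comparison above is only suggestive. A more reliable comparison is cross-validated accuracy on the labeled data; a minimal sketch, where the 5-fold setup and the model list are my own choices for illustration:

from sklearn.model_selection import cross_val_score

models = {"MultinomialNB": MultinomialNB(),
          "LinearSVC": LinearSVC(),
          "LogisticRegression": LogisticRegression(),
          "SGDClassifier": SGDClassifier()}

# 5-fold cross-validated accuracy on the tf-idf features
for name, model in models.items():
    scores = cross_val_score(model, train_x, df.pn, cv=5)
    print("%s: mean accuracy %.3f" % (name, scores.mean()))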