yan song

Verizon, IBM @ Watson Research Center, MEMORIAL SLOAN KETTERING CANCER CENTER

Physics Ph.D. with extensive experience in academic research and in industry, trained as a data scientist to perform analysis, modeling, simulation, and programming using Python, R, C++, Perl, Matlab, and SQL on Linux, Unix, and Windows platforms.

[No canvas support]

Project: Text mining and clustering of tweets based on context

Faculty Feedback

Gagan Preet Singh

Senior Data Scientist at HARMAN International

His work has been excellent for the time he has spent on this. The approach and the quality of execution deserve to be commended, and his performance certified.

Project Demo

Problem Statement:

The project requires the student to build an IPython notebook that can:
  1. Use the Twitter API to collect tweets based on a search string (e.g., google)
  2. Clean the tweets and create a term-document matrix for the cleaned tweets
  3. Identify tweets that are talking about a similar 'topic' and cluster them together.
  4. Identify the optimal number of topics, tweets corresponding to each topic and list the most used words in each topic.
The end result is a list of topics and the most important words from each topic, so that an observer can identify what is being talked about with respect to the searched entity.

Dataset:

The program should automatically collect the data using the Twitter API.
# Standard library
import re
from collections import Counter

# Third-party
import pandas as pd
import tweepy
from tweepy import OAuthHandler
from tweepy import Stream
from tweepy.streaming import StreamListener
from nltk.corpus import stopwords
from nltk import word_tokenize
ckey = 'hfLQl3GrNKn9roojLYgAUXZL5'
csecret = 'H6SmcD6YQlN0rHsgjyK3bCUMa4JC4gZJ1azCdbAkOExIEJwYDR'
atoken = '3178541083-HpQ1Ht9siIYBBBZaJlC0T5Rtdo2LIPi2yBfvjKP'
asecret = 'iynA1ScN3j4Nm3XQwE8VDMQX4fOzlg3v95dqlhv2Kdu4c'

auth = OAuthHandler(ckey, csecret)
auth.set_access_token(atoken, asecret)
api = tweepy.API(auth)
tweet_number = 500
i=0
tweetDF = pd.DataFrame(columns=['screen name','timeStamp','tweet'])
for tweet in tweepy.Cursor(api.search, q="google", lang="en").items(tweet_number):   
    tweetDF.loc[i] = [tweet.user.screen_name, tweet.created_at,tweet.text]
    i+=1
tweetList = tweetDF.tweet.tolist()
tweetList
def remove(RE,text):
    ag = re.compile(RE, re.VERBOSE | re.I)
    return ag.sub(r'',text)
words = set()
with open('OnlineDictionary.txt') as f:
    for line in f:
        words.add(line.strip())

solutions = {}
def find_words(instring):
    if instring in words:
        return [instring]
    if instring in solutions:
        return solutions[instring]
    best_solution = None
    for i in range(1, len(instring) - 1):
        part1 = find_words(instring[:i])
        part2 = find_words(instring[i:])
        if part1 is None or part2 is None:
            continue
        solution = part1 + part2
        if best_solution is None or len(solution) < len(best_solution):
            best_solution = solution
    solutions[instring] = best_solution
    return best_solution
def check_unsplited_word(text):
    splited_sentence=[]
    for w in text.split():
        if w not in words:
            result = find_words(w)
            if result is not None:
                splited = ' '.join(result)
                splited_sentence.append(splited)
            else:
                splited_sentence.append(w)
        else:
            splited_sentence.append(w)
    return ' '.join(splited_sentence)
rem={}
rem[1]='(http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+)'
rem[2]='(\<(/?[^>]+)>)'
rem[3]='(\*)'
rem[4]='(\[)'
rem[5]='(\')'
rem[6]='(\.+)'
rem[7]='(\/)'
rem[8]='(\')'
rem[9]='(\,)'
rem[10]='(\#)'
rem[11]='(\:)'
rem[12]='(\?)'
rem[13]='(\@)'
rem[14]='(\-+)'
rem[15]='(\')'
rem[16]='(\?)'
rem[17]='(\%)'
rem[18]='(\$)'
rem[19]='(\!)'
rem[20]='(\~)'
rem[21]='(\))'
rem[22]='(\()'
rem[23]='(\")'
rem[24]='(\&)'
rem[25]='(\|)'
rem[26]='(\+)'
rem[27]='(\{)'
rem[28]='(\})'
rem[29]='(\;)'
rem[30]='(\^)'
rem[31]='(\=+)'
rem[32]='(\_+)'
rem[33]='(\+?)'
rem[34]='(\<)'
rem[35]='(\>)'
rem[36]='(\")'
rem[37]='(\])'
CleanTweets = []

for i in range(len(tweetList)):
    t = tweetList[i]
    for v in rem.values():
        t = remove(v,t)
    t1 = re.sub(r'[^\x00-\x7f]',r' ',t)
    t2 = t1.lower()
    t3 = [i for i in t2.split() if i not in stopwords.words('english')]     # remove stop words
    t4 = ' '.join(t3)
    t5 = ''.join([i for i in t4 if not i.isdigit()])              # remove dingital number
    t6 = re.sub(r'\\+','',t5)
    t7 = re.sub(r'rt','',t6)                                      # remove tag RT
    t77 = t7.encode('ascii','ignore')                             # convert unicode to ascii
    t8 = check_unsplited_word(t77)                            # separate words without space
    t9 = re.sub(r'\s{2,}',' ',t8)                             # remove more than two white spaces
    t10 = t9.strip()                                              # remove head/tail white space
    t11 = "\n".join([ll.rstrip() for ll in t10.splitlines() if ll.strip()])  # remove blank line
    CleanTweets.append(t11)
CleanTweets
wordList = set([])

for tweet in CleanTweets:
    for word in tweet.split(' '):
        wordList.add(word)

newData = pd.DataFrame(columns=wordList)

for tweet in CleanTweets:
    newData.loc[tweet] = [0]*len(newData.columns)
    for word in tweet.split(' '):
        newData.ix[tweet, word] += 1
newData
newData.drop(newData.columns[0],axis=1)  # If clean not perfect, use this step to drop extra white space
newDF = newData.div(newData.sum(axis = 1),axis = 0)
(sample_row, sample_col) = newDF.shape   # record the sample siz
print 'sample row is ', sample_row
print 'sample column is ', sample_col
data = newDF.as_matrix()
from sklearn.metrics import euclidean_distances
from sklearn.cluster import KMeans
from scipy.spatial import distance
import numpy as np
from matplotlib import pyplot as plt
def compute_bic(kmeans,X):
    centers = [kmeans.cluster_centers_]
    labels  = kmeans.labels_
    m = kmeans.n_clusters
    n = np.bincount(labels)
    N, d = X.shape
    cl_var=(1.0/(N-m)/d)*sum([sum(distance.cdist(X[np.where(labels==i)],[centers[0][i]],'euclidean')**2) for i in range(m)])
    const_term = 0.5 * m * np.log(N) * (d+1)
    BIC = np.sum([n[i] * np.log(n[i]) -
              n[i] * np.log(N) -
             ((n[i] * d) / 2) * np.log(2*np.pi*cl_var) -
             ((n[i] - 1) * d/ 2) for i in range(m)]) - const_term

    return(BIC)
def find_k(n_start, n_end):
    bics = []
    kmc =[]
    for n_clusters in range(n_start,n_end):
        kmeans = KMeans(n_clusters=n_clusters)
        kmc.append(kmeans)
        kmeans.fit(data)
        labels = kmeans.labels_
        centroids = kmeans.cluster_centers_
               
        clusters = {}
        for i,d in enumerate(kmeans.labels_):
            if d not in clusters:
                clusters[d] = []
            clusters[d].append(data[i])
        bic = compute_bic(kmeans, data)
        bics.append(bic)
    return bics, kmc[np.argmin(bics)], kmc
n_start = 2  
n_end = sample_row
bics, kmin, kme = find_k(n_start, n_end)  
print 'Exact K for minimum BIC at ',kmin.n_clusters
%matplotlib inline 
plt.plot(range(n_start,n_end),bics)
plt.ylabel("BIC")
plt.xlabel("k number")
plt.title("BIC for K-means clusters' behaviour")
plt.show()
for i in range(kmin.n_clusters):
    tw = newData.iloc[np.where(kmin.labels_==i)].sum().sort_values(inplace=False,ascending=False)
    print '---------------------------------'
    print 'FOR CLUSTER ',i
    print tw[:5]
from scipy.misc import imread
from wordcloud import WordCloud 
from wordcloud import WordCloud, STOPWORDS
def wordcloud_plot(tex):
    wordcloud = WordCloud( 
                      stopwords=STOPWORDS,
                      background_color='white',
                      width=1500,
                      height=1100,
                      mask=twitter_mask
            ).generate(tex)
    plt.figure()
    plt.imshow(wordcloud)
    plt.axis("off")  

twitter_mask = imread('twitter_mask.png', flatten=True)
for i in range(kmin.n_clusters):
    twe = newData.iloc[np.where(kmin.labels_==i)].index.tolist()[0]
    wordcloud_plot(twe)
[SEO link spam removed from the scraped page footer.]