pwangbot

by hugo

pwangbot/Topic Modeling.ipynb

Introducing pwangbot

I was chatting with @dff of @tidelift at the NumFocus summit last year, and he suggested classifying @pwang's tweets. For those who don't know, @pwang tweets a lot, and the content is quite good - but it spans a variety of subjects. If you want to hear @pwang tweet about tech, but not about politics, that can be a problem. So we're going to collect @pwang's tweets, do some topic modeling, and then make some bots!
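
The collection step itself isn't shown in this notebook. Roughly, it looks something like the sketch below; the placeholder credentials, the 3,200-tweet cap, and the one-JSON-file-per-tweet layout under the S3 prefix are my assumptions, based on how the data is read back in below.

# Hypothetical collection sketch (not the notebook's actual code): pull
# @pwang's timeline with tweepy and stash one JSON file per tweet on S3.
import json
import s3fs
import tweepy

auth = tweepy.OAuthHandler("CONSUMER_KEY", "CONSUMER_SECRET")  # placeholder credentials
auth.set_access_token("ACCESS_TOKEN", "ACCESS_TOKEN_SECRET")
api = tweepy.API(auth, wait_on_rate_limit=True)

fs = s3fs.S3FileSystem()
root = 'saturn-cloud-data/hugo/pwang-bot-data'  # same prefix the analysis reads from

# user_timeline returns at most ~3,200 of a user's most recent tweets
for status in tweepy.Cursor(api.user_timeline, screen_name="pwang").items(3200):
    with fs.open(f"{root}/{status.id_str}.json", "w") as f:
        json.dump(status._json, f)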

Topic Modeling of Tweets

%matplotlib inline
import s3fs
import json
import logging
import tweepy
logging.basicConfig(level=logging.INFO)
from string import punctuation
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.tokenize import casual_tokenize
import nltk
nltk.download('wordnet')
nltk.download('punkt')
nltk.download('stopwords')
en_stop = set(nltk.corpus.stopwords.words('english'))
lemmatizer = WordNetLemmatizer()
stoplist = en_stop.union(set(punctuation))
stemmer = SnowballStemmer('english')
fs = s3fs.S3FileSystem()
root = 'saturn-cloud-data/hugo/pwang-bot-data'
paths = fs.ls(root)
data = []
# each object under the prefix holds one tweet's JSON
for p in paths:
    with fs.open(p) as f:
        data.append(json.load(f))

Tokenization

We do the following: convert everything to lowercase, throw out "stop" words, optionally throw out usernames and URLs, and then stem and lemmatize.

# add Twitter-specific noise and a few overly common words to the stop list
en_stop.add('http')
en_stop.add('https')
en_stop.add('co')
en_stop.add('rt')
en_stop.add('like')
en_stop.add('first')
en_stop.add('time')
en_stop.add('next')

def tokenize(x, with_usernames=True):
    for token in casual_tokenize(x):
        # keep only the part before an apostrophe ("don't" -> "don")
        if "'" in token:
            token = token.split("'")[0]
        token = token.lower()
        token = stemmer.stem(token)
        if token in en_stop:
            continue
        # optionally drop @mentions
        if not with_usernames:
            if "@" in token:
                continue
        # tokens containing ':' (URLs, emoticons) are dropped
        if ":" in token:
            continue
        # drop short tokens and lemmatize the rest
        if len(token) > 3:
            yield lemmatizer.lemmatize(token)

texts = [" ".join(list(tokenize(x['text']))) for x in data]
texts_no_username = [" ".join(list(tokenize(x['text'], with_usernames=False))) for x in data]
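
To get a feel for what the tokenizer keeps and throws away, here is a made-up tweet run through it (the sample string is hypothetical, not drawn from the collected data):

# hypothetical sample, just to exercise the tokenizer
sample = "RT @someone: packaging Python for data science is hard https://example.com"
print(list(tokenize(sample)))
print(list(tokenize(sample, with_usernames=False)))
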
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

documents = texts_no_username
no_features = 1000

# LDA needs raw term counts (rather than tf-idf) because it is a probabilistic graphical model
tf_vectorizer = CountVectorizer(max_df=0.95, min_df=2, max_features=no_features, stop_words=en_stop)
tf = tf_vectorizer.fit_transform(documents)
tf_feature_names = tf_vectorizer.get_feature_names()
# Run LDA
from sklearn.decomposition import LatentDirichletAllocation
no_topics = 2
lda = LatentDirichletAllocation(n_components=no_topics, max_iter=50, learning_method='online', learning_offset=50.,random_state=0).fit(tf)
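
With only two topics it is worth eyeballing what the model actually learned. The snippet below is a standard way to list the highest-weight words per topic from a fitted scikit-learn LDA model; it isn't part of the original notebook, and n_top_words = 10 is an arbitrary choice.

# print the most heavily weighted words in each topic
n_top_words = 10
for topic_idx, topic in enumerate(lda.components_):
    top = [tf_feature_names[i] for i in topic.argsort()[:-n_top_words - 1:-1]]
    print("Topic {}: {}".format(topic_idx, " ".join(top)))
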
import numpy as np
# alternative: NMF on tf-idf features (not used here)
# results = np.argmax(nmf.transform(tfidf_vectorizer.transform(texts)), axis=1)
results = np.argmax(lda.transform(tf_vectorizer.transform(texts)), axis=1)
categorized = {'raw': {}, 'token_no_username': {}, 'token': {}}
for idx, result in enumerate(results):
    categorized['token_no_username'].setdefault(result, []).append(texts_no_username[idx])
    categorized['token'].setdefault(result, []).append(texts[idx])
    categorized['raw'].setdefault(result, []).append(data[idx]['text'])
{len(categorized['raw'][x]) for x in range(no_topics)}
{1563, 1842}
from wordcloud import WordCloud
import matplotlib.pyplot as plt

# Generate a word cloud image for each topic
for c in range(no_topics):
    print(len(categorized['token_no_username'][c]))
    text = " ".join(categorized['token_no_username'][c])
    # cap the font size so the biggest words don't dominate
    wordcloud = WordCloud(background_color="white", max_font_size=40).generate(text)
    plt.figure(figsize=(15, 15))
    plt.imshow(wordcloud, interpolation="bilinear")
    plt.axis("off")
    plt.show()

# The PIL way (if you don't have matplotlib)
# image = wordcloud.to_image()
# image.show()
1842
1563