import numpy as np
import pandas as pd
import plotly
import plotly.plotly as py
import plotly.graph_objs as go
import plotly.figure_factory as ff
import matplotlib.pyplot as plt
import matplotlib as mpl
from collections import Counter
from glob import glob
from os.path import basename, splitext
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
from nltk.tokenize import TweetTokenizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.cluster import KMeans
from sklearn.externals import joblib
from sklearn.manifold import MDS
from scipy.cluster.hierarchy import dendrogram, linkage
from IPython.core.display import display, HTML
def _read_account_csv(csv_path):
    """Read one account's CSV and tag every row with its "@handle".

    The handle is the CSV file name without directory or extension,
    prefixed with "@".
    """
    handle = "@" + splitext(basename(csv_path))[0]
    return pd.read_csv(csv_path).assign(account=handle)


# One CSV per account, split by chamber.
house_accounts_filenames = glob("house/*.csv")
senate_accounts_filenames = glob("senate/*.csv")

# One DataFrame per account, in the same order as the filename lists.
house_accounts_dataframes = list(map(_read_account_csv, house_accounts_filenames))
senate_accounts_dataframes = list(map(_read_account_csv, senate_accounts_filenames))
# Number of top-ranked tweets kept per account.
top_n_tweets = 1


def _top_rows_per_account(account_frames, column):
    """Return, for each frame, its `top_n_tweets` rows with the largest `column`.

    Rows are sorted ascending and the tail is taken, so the kept rows stay in
    ascending order of `column`.
    (An idxmax-based lookup was an earlier alternative for the top-1 case.)
    """
    selected = []
    for account_frame in account_frames:
        ranked = account_frame.sort_values(column)
        selected.append(ranked.tail(top_n_tweets))
    return selected


most_retweets_house_accounts_dataframes = _top_rows_per_account(house_accounts_dataframes, 'Retweets')
most_favorites_house_accounts_dataframes = _top_rows_per_account(house_accounts_dataframes, 'Favorites')
most_retweets_senate_accounts_dataframes = _top_rows_per_account(senate_accounts_dataframes, 'Retweets')
most_favorites_senate_accounts_dataframes = _top_rows_per_account(senate_accounts_dataframes, 'Favorites')

# House rows first, then Senate rows, renumbered 0..n-1.
most_retweets_congress_dataframe = pd.concat(
    most_retweets_house_accounts_dataframes + most_retweets_senate_accounts_dataframes
).reset_index(drop=True)
most_favorites_congress_dataframe = pd.concat(
    most_favorites_house_accounts_dataframes + most_favorites_senate_accounts_dataframes
).reset_index(drop=True)
# Notebook previews: the highest-retweet rows and the total row count.
most_retweets_congress_dataframe.sort_values('Retweets').tail()
len(most_retweets_congress_dataframe)

# Accounts whose number of selected rows differs from top_n_tweets
# (for example, an account whose CSV holds fewer tweets than requested).
too_few_retweeted_tweets = [
    (account, total)
    for account, total in Counter(most_retweets_congress_dataframe["account"]).most_common()
    if total != top_n_tweets
]
for account, total in too_few_retweeted_tweets:
    # `account` already starts with "@": strip it for the URL only and show it
    # unchanged as the link text (the previous code rendered "@@handle").
    display(HTML("<p><a href='https://twitter.com/" + account[1:] + "' target='_blank'>" + account + "</a>: " + str(total) + "</p>"))

# Same previews and sanity check for the most-favorited selection.
most_favorites_congress_dataframe.sort_values('Favorites').tail()
len(most_favorites_congress_dataframe)
too_few_favorited_tweets = [
    (account, total)
    for account, total in Counter(most_favorites_congress_dataframe["account"]).most_common()
    if total != top_n_tweets
]
for account, total in too_few_favorited_tweets:
    display(HTML("<p><a href='https://twitter.com/" + account[1:] + "' target='_blank'>" + account + "</a>: " + str(total) + "</p>"))
# Stack the per-account frames into one frame per chamber.
house_dataframe = pd.concat(house_accounts_dataframes)
senate_dataframe = pd.concat(senate_accounts_dataframes)

# All of Congress with a fresh 0..n-1 index; rows containing any missing
# value are then dropped in place.
congress_dataframe = pd.concat([house_dataframe, senate_dataframe], ignore_index=True)
congress_dataframe.dropna(inplace=True)

# Summary counts: tweets are rows, accounts are distinct "account" values.
print("Total number of Tweets for all accounts: {}".format(len(congress_dataframe)))
for summary_label, summary_frame in (
    ("Total number of accounts: ", congress_dataframe),
    ("Total number of house members: ", house_dataframe),
    ("Total number of senators: ", senate_dataframe),
):
    print(summary_label + str(len(summary_frame["account"].unique())))
# English stop-word list from the NLTK corpus; read by clean_word() below.
# NOTE(review): this rebinds the imported `stopwords` corpus name to the
# returned word list, so executing this line a second time in the same
# session will fail — presumably acceptable for a run-once notebook; confirm.
stopwords = stopwords.words('english')
# Snowball stemmer for English, used by tokenize_and_stem().
stemmer = SnowballStemmer("english")
# Tweet-aware tokenizer with handle stripping and length reduction enabled
# (per the NLTK docs these drop @handles and shorten long character runs).
tokenizer = TweetTokenizer(strip_handles=True, reduce_len=True)
def clean_word(word, stop_words=None):
    """Return True if `word` should be kept as a content token.

    A token is kept when it is purely alphabetic and is not a stop word.
    The alphabetic requirement already rejects the empty string and any
    token containing "#", "@", "." or digits, so no separate checks are
    needed for those characters.

    The stop-word comparison is done on the lower-cased token: the stop-word
    list is lower-case, so a case-sensitive test would let capitalised stop
    words such as "The" through.

    Parameters
    ----------
    word : str
        A single token.
    stop_words : collection of str, optional
        Lower-case words to reject. Defaults to the module-level `stopwords`
        list.

    Returns
    -------
    bool
    """
    if stop_words is None:
        stop_words = stopwords
    return word.isalpha() and word.lower() not in stop_words
def tokenize_and_stem(text):
    """Tokenize `text` and return the stems of the tokens clean_word() keeps.

    Tokens are produced by the module-level `tokenizer`, filtered with
    clean_word(), and each survivor is passed through the module-level
    `stemmer`.
    """
    return [
        stemmer.stem(token)
        for token in tokenizer.tokenize(text)
        if clean_word(token)
    ]
def tokenize_only(text):
    """Tokenize `text` and return the lower-cased tokens clean_word() keeps.

    Uses the same tokenizer and filter as tokenize_and_stem(), so the two
    results line up element for element.
    """
    kept = []
    for token in tokenizer.tokenize(text):
        if clean_word(token):
            kept.append(token.lower())
    return kept
def get_stemmed_and_tokenized_dict(tweets):
    """Collect stems and plain tokens for every tweet in `tweets`.

    Returns a dict with two flat lists, accumulated across all tweets in
    iteration order:
      "Stemmed"   - the output of tokenize_and_stem() for each tweet
      "Tokenized" - the output of tokenize_only() for each tweet
    """
    collected = {"Stemmed": [], "Tokenized": []}
    for tweet_text in tweets:
        collected["Stemmed"] += tokenize_and_stem(tweet_text)
        collected["Tokenized"] += tokenize_only(tweet_text)
    return collected
%time stemmed_and_tokenized_dict = get_stemmed_and_tokenized_dict(most_favorites_congress_dataframe["Text"])
vocab_frame = pd.DataFrame({'words': stemmed_and_tokenized_dict["Tokenized"]},
index = stemmed_and_tokenized_dict["Stemmed"])
print("There are " + str(vocab_frame.shape[0]) + " items in vocab_frame")
vocab_frame.tail()
tfidf_vectorizer = TfidfVectorizer(max_df=0.5, sublinear_tf=True,
min_df=0.001, stop_words='english',
use_idf=True, tokenizer=tokenize_and_stem)
%time tfidf_matrix = tfidf_vectorizer.fit_transform(most_favorites_congress_dataframe["Text"])
print(tfidf_matrix.shape)
terms = tfidf_vectorizer.get_feature_names()
dist = 1 - cosine_similarity(tfidf_matrix)
num_clusters = 5
clusterer = KMeans(n_clusters=num_clusters)
%time clusterer.fit(tfidf_matrix)
clusters = clusterer.labels_.tolist()
joblib.dump(clusterer, 'doc_cluster.pkl')
#clusterer = joblib.load('doc_cluster.pkl')
#clusters = clusterer.labels_.tolist()
tweets = {'Account': most_favorites_congress_dataframe["account"],
'Text': most_favorites_congress_dataframe["Text"],
'cluster': clusters}
frame = pd.DataFrame(tweets, columns = ['Account', 'Text', 'cluster'])
frame.head()
frame['cluster'].value_counts()
# How many terms / accounts to report for each cluster.
top_n_words = 10
top_n_accounts = 5
# cluster index -> short label made of its first four top words.
top_words_dict = {}

print("Top terms and accounts per cluster:")
print()

# For every cluster, feature indices sorted by descending centroid weight.
order_centroids = clusterer.cluster_centers_.argsort()[:, ::-1]

for i in range(num_clusters):
    # Map each top stem back to the first word recorded for it in vocab_frame.
    top_words = []
    for ind in order_centroids[i, :top_n_words]:
        stem_rows = vocab_frame.loc[terms[ind].split(' ')]
        top_words.append(stem_rows.iloc[0, 0])
    top_words_dict[i] = ", ".join(top_words[:4])

    print("Cluster {} words: {}".format(i + 1, ", ".join(top_words)))
    print("Cluster {} top {} accounts:".format(i + 1, top_n_accounts))

    # Accounts with the most tweets assigned to this cluster.
    accounts_in_cluster = frame.loc[frame["cluster"] == i, "Account"]
    top_accounts = [
        account
        for account, _ in Counter(accounts_in_cluster).most_common(top_n_accounts)
    ]
    for account in top_accounts:
        display(HTML("<a href='https://twitter.com/" + account[1:] + "' target='_blank'>" + account + "</a>"))
    print()
mds = MDS(n_components=2, dissimilarity="precomputed", random_state=42)
%time pos = mds.fit_transform(dist)
xs, ys = pos[:, 0], pos[:, 1]
cluster_colors = {0: '#e6194b', 1: '#3cb44b', 2: '#ffe119', 3: '#0082c8', 4: '#f58231'}
cluster_names = top_words_dict
%matplotlib inline
df = pd.DataFrame(dict(x=xs, y=ys, label=clusters, title=most_favorites_congress_dataframe["account"]))
groups = df.groupby('label')
fig, ax = plt.subplots(figsize=(20, 12))
ax.margins(0.05)
for name, group in groups:
ax.plot(group.x, group.y, marker='o', linestyle='', ms=12,
label=cluster_names[name], color=cluster_colors[name],
mec='none')
ax.set_aspect('auto')
ax.tick_params(axis= 'x', which='both', bottom='off', top='off', labelbottom='off')
ax.tick_params(axis= 'y', which='both', left='off', top='off', labelleft='off')
ax.legend(numpoints=1)
for i in range(len(df)):
ax.text(df.loc[i]['x'], df.loc[i]['y'], df.loc[i]['title'], size=8)
plt.savefig('clusters.png', dpi=200)
plt.show()
mds = MDS(n_components=3, dissimilarity="precomputed", random_state=42)
%time pos = mds.fit_transform(dist)
xs, ys, zs = pos[:, 0], pos[:, 1], pos[:, 2]
plotly.tools.set_credentials_file(username='segalgouldn', api_key='3npOhUHFK0ZcvmfVfIzx')
pd_threed = pd.DataFrame(dict(x=xs, y=ys, z=zs, label=clusters, txt=most_favorites_congress_dataframe["account"]))
pd_threed.head()
cluster1 = go.Scatter3d(
x=pd_threed[pd_threed["label"] == 0]["x"]*1000,
y=pd_threed[pd_threed["label"] == 0]["y"]*1000,
z=pd_threed[pd_threed["label"] == 0]["z"]*1000,
mode='markers',
name=cluster_names[0],
marker=dict(
size=10,
line=dict(
color='rgba(217, 217, 217, 0.14)',
width=0.5
),
opacity=0.8,
color=[cluster_colors[cluster] for cluster in pd_threed[pd_threed["label"] == 0]["label"]]
),
text=pd_threed[pd_threed["label"] == 0]["txt"]
)
cluster2 = go.Scatter3d(
x=pd_threed[pd_threed["label"] == 1]["x"]*1000,
y=pd_threed[pd_threed["label"] == 1]["y"]*1000,
z=pd_threed[pd_threed["label"] == 1]["z"]*1000,
mode='markers',
name=cluster_names[1],
marker=dict(
size=10,
line=dict(
color='rgba(217, 217, 217, 0.14)',
width=0.5
),
opacity=0.8,
color=[cluster_colors[cluster] for cluster in pd_threed[pd_threed["label"] == 1]["label"]]
),
text=pd_threed[pd_threed["label"] == 1]["txt"]
)
cluster3 = go.Scatter3d(
x=pd_threed[pd_threed["label"] == 2]["x"]*1000,
y=pd_threed[pd_threed["label"] == 2]["y"]*1000,
z=pd_threed[pd_threed["label"] == 2]["z"]*1000,
mode='markers',
name=cluster_names[2],
marker=dict(
size=10,
line=dict(
color='rgba(217, 217, 217, 0.14)',
width=0.5
),
opacity=0.8,
color=[cluster_colors[cluster] for cluster in pd_threed[pd_threed["label"] == 2]["label"]]
),
text=pd_threed[pd_threed["label"] == 2]["txt"]
)
cluster4 = go.Scatter3d(
x=pd_threed[pd_threed["label"] == 3]["x"]*1000,
y=pd_threed[pd_threed["label"] == 3]["y"]*1000,
z=pd_threed[pd_threed["label"] == 3]["z"]*1000,
mode='markers',
name=cluster_names[3],
marker=dict(
size=10,
line=dict(
color='rgba(217, 217, 217, 0.14)',
width=0.5
),
opacity=0.8,
color=[cluster_colors[cluster] for cluster in pd_threed[pd_threed["label"] == 3]["label"]]
),
text=pd_threed[pd_threed["label"] == 3]["txt"]
)
cluster5 = go.Scatter3d(
x=pd_threed[pd_threed["label"] == 4]["x"]*1000,
y=pd_threed[pd_threed["label"] == 4]["y"]*1000,
z=pd_threed[pd_threed["label"] == 4]["z"]*1000,
mode='markers',
name=cluster_names[4],
marker=dict(
size=10,
line=dict(
color='rgba(217, 217, 217, 0.14)',
width=0.5
),
opacity=0.8,
color=[cluster_colors[cluster] for cluster in pd_threed[pd_threed["label"] == 4]["label"]]
),
text=pd_threed[pd_threed["label"] == 4]["txt"]
)
layout = go.Layout(
margin=dict(
l=0,
r=0,
b=0,
t=0
)
)
data = [cluster1, cluster2, cluster3, cluster4, cluster5]
fig = go.Figure(data=data, layout=layout)
py.iplot(fig, filename='3d-scatter-congress')
# Hierarchical clustering of the tweets, drawn as a dendrogram.
from scipy.spatial.distance import squareform

# `dist` is already a square matrix of pairwise cosine distances. Passing a
# 2-D array to linkage() makes SciPy treat each row as an observation vector
# and compute distances again (here: cosine distance between rows of the
# distance matrix). Convert to the condensed form linkage() expects instead.
# Tiny negative values from floating-point round-off are clipped to zero, and
# checks=False skips the exact symmetry / zero-diagonal validation that the
# same round-off could trip.
condensed_dist = squareform(np.clip(dist, 0, None), checks=False)

# Stored under its own name so the imported linkage() function is not
# overwritten and this code can be executed more than once.
linkage_matrix = linkage(condensed_dist, method="complete")

fig, ax = plt.subplots(figsize=(15, 70))
# dendrogram() draws on the current axes and returns a dict describing the
# plot; keep it separate from `ax`.
dendrogram_data = dendrogram(linkage_matrix,
                             orientation="right",
                             leaf_font_size=10,
                             show_contracted=True,
                             labels=most_favorites_congress_dataframe["account"].values.tolist())
# tick_params expects booleans; the string 'off' is truthy on current
# matplotlib and would leave the ticks visible.
plt.tick_params(axis='x', which='both', bottom=False, top=False, labelbottom=False)
plt.tight_layout()
plt.savefig('dendrogram.png', dpi=200)
plt.show()