LDA-Based Analysis of Twitter Data


WeChat official account: 尤而小屋
Author: Peter
Editor: Peter

This article walks through an LDA topic-modeling project on a Twitter dataset.

Import libraries

import os
import pandas as pd
import numpy as np
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
import plotly as py
import plotly.graph_objs as go

import gensim
from gensim import corpora, models, similarities
import logging
import tempfile

import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

from string import punctuation
from collections import OrderedDict

import seaborn as sns
import pyLDAvis.gensim  # pip install pyLDAvis; newer pyLDAvis releases rename this module to pyLDAvis.gensim_models
import matplotlib.pyplot as plt
%matplotlib inline

init_notebook_mode(connected=True) # do not miss this line

import warnings
warnings.filterwarnings("ignore") 

Load the data

df = pd.read_csv("data.csv", encoding="gb18030")
df.head()
df.shape           # (rows, columns)
df.isnull().sum()  # missing values per column

Data preprocessing

df.dtypes  # dtypes before conversion
df["Time"] = pd.to_datetime(df["Time"])  # parse the Time column into datetime
df.dtypes  # dtypes after conversion
df.head()
df.drop("row ID", axis=1, inplace=True)  # drop the redundant row-ID column
print("Number of tweets: ", len(df))

Tweet volume by year

tweetsdata = df["Time"]
tweetsdata
trace = go.Histogram(     # histogram trace of tweet timestamps
    x = tweetsdata,       # x-axis data: the Time column
    marker = dict(
        color="blue"),
    opacity = 0.75
)

layout = go.Layout(
    title = "Tweet Activity Over Years",
    height=450,
    width=1200,
    xaxis=dict(title='Month and year'),
    yaxis=dict(title='Tweet Quantity'),
    bargap=0.2)

data = [trace]

fig = go.Figure(data=data, layout=layout)

fig.show()

Corpus processing

Prepare the list of texts

Build the raw corpus, one tweet per element:

corpus = df["Tweet"].tolist()
corpus[:10]
TEMP_FOLDER = os.getcwd()  # os was already imported above
TEMP_FOLDER

print('Folder "{}" will be used to save temporary dictionary and corpus.'.format(TEMP_FOLDER))

Stopword handling

list1 = ['RT', 'rt']  # retweet markers, treated as stopwords

stoplist = stopwords.words('english') + list(punctuation) + list1

print(stoplist[:20])

Tokenize into words

The code below works as follows:

  • str(document).lower().split(): cast each document to a string, lowercase it, and split on whitespace into tokens; any token that appears in stoplist is then dropped. Note that list(punctuation) only matches tokens that are exactly a single punctuation character, so punctuation attached to a word survives the split (see the toy run after the code).
texts = [[word for word in str(document).lower().split() if word not in stoplist] for document in corpus]
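To see exactly what this comprehension keeps and drops, here is a toy run on a hypothetical tweet (made up for illustration, not from the dataset):

sample = "RT @elonmusk: Tesla is accelerating!"
print([w for w in sample.lower().split() if w not in stoplist])
# -> ['@elonmusk:', 'tesla', 'accelerating!']
# 'rt' and 'is' are filtered out, but '!' stays glued to 'accelerating'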

Map each unique token to an integer id with a gensim Dictionary, and save it to the chosen folder:

dictionary = corpora.Dictionary(texts)
dictionary.save(os.path.join(TEMP_FOLDER, 'elon.dict')) 
dictionary
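An optional step the author skips, but commonly applied to gensim dictionaries, is pruning very rare and very frequent tokens before building the bag-of-words corpus; the thresholds below are illustrative assumptions, not values from the original:

# keep tokens that appear in at least 5 tweets and in at most 50% of all tweets
dictionary.filter_extremes(no_below=5, no_above=0.5)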

Inspect the token-to-id mapping:

dictionary.token2id  # mapping from each token to its integer id

Bag-of-words representation

Convert each tokenized tweet into its bag-of-words form (note that this rebinds corpus from raw strings to lists of (token_id, count) pairs):

corpus = [dictionary.doc2bow(text) for text in texts] 
corpus[:2]
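To make the (token_id, count) pairs concrete, here is a self-contained toy example (hypothetical text, independent of the Twitter data):

toy_dict = corpora.Dictionary([["to", "be", "or", "not", "to", "be"]])
print(toy_dict.token2id)                     # {'be': 0, 'not': 1, 'or': 2, 'to': 3}
print(toy_dict.doc2bow(["to", "be", "to"]))  # [(0, 1), (3, 2)] -> 'be' once, 'to' twice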

Serialize the bag-of-words corpus to a file in Matrix Market format, specifying a path:

corpora.MmCorpus.serialize(os.path.join(TEMP_FOLDER, 'corpus.mm'), corpus) 
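Both artifacts can be reloaded later without redoing the tokenization, using the standard gensim loaders:

loaded_dict = corpora.Dictionary.load(os.path.join(TEMP_FOLDER, 'elon.dict'))
loaded_corpus = corpora.MmCorpus(os.path.join(TEMP_FOLDER, 'corpus.mm'))
print(loaded_corpus)  # reports the number of documents and features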

Modeling

TF-IDF model

tfidf = models.TfidfModel(corpus)  # 1 - initialize the model on the bag-of-words corpus
corpus_tfidf = tfidf[corpus]       # 2 - transform the corpus into TF-IDF vectors
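TF-IDF down-weights tokens that occur in many tweets and up-weights distinctive ones. To inspect the effect, the model can be applied to a single bag-of-words vector (a quick sketch):

print(corpus[0])         # raw counts: [(token_id, count), ...]
print(tfidf[corpus[0]])  # the same document as (token_id, tfidf_weight) pairs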

LDA model

total_topics = 5  # number of topics

lda = models.LdaModel(corpus,             # bag-of-words corpus
                      id2word=dictionary, # mapping from token ids to words
                      num_topics=total_topics)

corpus_lda = lda[corpus_tfidf]    # apply the trained model to the TF-IDF corpus
lda.show_topics(total_topics, 5)  # top-5 words for each topic

# top 25 (word, probability) pairs for each topic
data_lda = {i: OrderedDict(lda.show_topic(i, 25)) for i in range(total_topics)}
data_lda
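By default show_topics returns formatted strings; passing formatted=False yields (word, probability) pairs instead, which is an alternative way to build a structure like data_lda (a usage sketch):

for topic_id, words in lda.show_topics(total_topics, 5, formatted=False):
    print(topic_id, [w for w, p in words])  # top-5 words per topic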

Build a topic-word DataFrame

df_lda = pd.DataFrame(data_lda)
df_lda
df_lda = df_lda.fillna(0).T  # rows: topics, columns: words; words absent from a topic filled with 0
df_lda

LDA visualization

Clustermap

Show how words correlate with one another across the five topic distributions:

g = sns.clustermap(df_lda.corr(),
                center = 0,
                standard_scale = 1,
                cmap = "RdBu",
                metric = "cosine",
                linewidths = 0.75,
                figsize = (12,12)
               )

plt.setp(g.ax_heatmap.yaxis.get_majorticklabels(), rotation=0)
plt.show()

pyLDAvis interactive panel

pyLDAvis.enable_notebook()

# pyLDAvis expects the bag-of-words corpus here, not the LDA-transformed one
panel = pyLDAvis.gensim.prepare(lda, corpus, dictionary, mds='tsne')
panel
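The interactive panel can also be exported as a standalone HTML file for sharing outside the notebook:

pyLDAvis.save_html(panel, 'lda_panel.html')  # open the file in any browser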