首先安装所需库:
pip install pandas gensim jieba scikit-learn
完整代码:
import pandas as pd
from gensim.models import Word2Vec
from jieba import cut
from sklearn.cluster import KMeans
# Number of clusters. Defined once so the KMeans fit and the per-cluster
# report below cannot drift apart (the original hard-coded 5 in two places).
N_CLUSTERS = 5

# Load the corpus; expects a 'text' column containing the raw documents.
df = pd.read_csv('data.csv')

# Tokenize each document with jieba. Whitespace-only tokens are dropped,
# matching the original join-on-space / split round-trip behavior.
tokenized_sentences = [
    [token for token in cut(text) if token.strip()]
    for text in df['text']
]

# Train Word2Vec on the tokenized corpus.
# min_count=1 keeps every token so no document loses words at lookup time.
model = Word2Vec(
    sentences=tokenized_sentences,
    vector_size=100,
    window=5,
    min_count=1,
    workers=4,
)
word_vectors = model.wv

# Represent each document as the mean of its word vectors.
# Empty documents (e.g. blank rows) get a zero vector instead of raising
# ZeroDivisionError as the original did.
text_vectors = [
    sum(word_vectors[word] for word in tokens) / len(tokens)
    if tokens
    else [0.0] * model.vector_size
    for tokens in tokenized_sentences
]

# Cluster the document vectors and attach the label to each row.
kmeans = KMeans(n_clusters=N_CLUSTERS, random_state=42)
df['cluster'] = kmeans.fit_predict(text_vectors)

# Print the documents belonging to each cluster.
for i in range(N_CLUSTERS):
    print(f"Cluster {i}:")
    for sample in df.loc[df['cluster'] == i, 'text']:
        print(sample)
    print('\n')