Python3 读取 CSV 文件并使用 Word2Vec 对中文文本向量化处理,最后使用 KMeans 聚类

273 阅读 · 1 分钟

首先安装所需库:

pip install pandas gensim jieba scikit-learn

完整代码:

import pandas as pd
from gensim.models import Word2Vec
from jieba import cut
from sklearn.cluster import KMeans

# Number of clusters, defined once so the clustering step and the
# reporting loop below cannot drift apart.
N_CLUSTERS = 5

# Load the data; expects a 'text' column containing Chinese text.
df = pd.read_csv('data.csv')

# Tokenize each document with jieba directly into a list of tokens.
# str(...) guards against non-string cells such as NaN.
# (Joining tokens with spaces and re-splitting, as before, was a needless
# round-trip and would break on any token containing whitespace.)
tokenized_sentences = [list(cut(str(text))) for text in df['text']]

# Train Word2Vec on the tokenized corpus. min_count=1 keeps every token,
# so every word looked up below is guaranteed to have a vector.
model = Word2Vec(
    sentences=tokenized_sentences,
    vector_size=100,
    window=5,
    min_count=1,
    workers=4,
)
word_vectors = model.wv

# Represent each document as the mean of its word vectors.
# Empty documents would cause a ZeroDivisionError; map them to a zero
# vector instead.
text_vectors = [
    sum(word_vectors[word] for word in tokens) / len(tokens)
    if tokens
    else [0.0] * model.vector_size
    for tokens in tokenized_sentences
]

# Cluster the document vectors. n_init is pinned explicitly because its
# default changed across scikit-learn versions (10 -> "auto").
kmeans = KMeans(n_clusters=N_CLUSTERS, n_init=10, random_state=42)
kmeans.fit(text_vectors)

# Attach each row's cluster label to the original DataFrame.
df['cluster'] = kmeans.labels_

# Print the documents belonging to each cluster.
for i in range(N_CLUSTERS):
    cluster_samples = df[df['cluster'] == i]['text'].tolist()
    print(f"Cluster {i}:")
    for sample in cluster_samples:
        print(sample)
    print('\n')