import jieba
# Stop-word list: one stop word per line in stop_words.txt.
def get_stop_words(path='./stop_words.txt'):
    """Load the stop-word list from a text file.

    The file is decoded as ``utf-8-sig`` so that a leading byte-order mark,
    if present, does not end up glued to the first word.

    :param path: location of the stop-word file; defaults to
        ``./stop_words.txt`` relative to the current working directory.
    :return: list with one entry per line of the file, each stripped of
        surrounding whitespace (blank lines yield empty strings).
    :raises OSError: if the file cannot be opened.
    """
    with open(path, 'r', encoding='utf-8-sig') as f:
        return [line.strip() for line in f]
# Word segmentation followed by stop-word removal.
def segment_and_stop_word(word):
    """Split a sentence into tokens with jieba and drop the stop words.

    :param word: the text to segment.
    :return: list of tokens, in the order jieba produced them, excluding
        stop words, single-space tokens and empty tokens.
    """
    # A set gives constant-time membership tests inside the filter below.
    stop_words = set(get_stop_words())
    # cut_all=True asks jieba for "full mode" segmentation.
    tokens = jieba.cut(word, cut_all=True)
    return [
        token for token in tokens
        if token not in stop_words and token != ' ' and token != ''
    ]
# Training sentences (user feedback); dataSet[k] is labelled by listClasses[k].
dataSet = [
    "自动闪退",
    "太卡,闪退",
    "上传视频总是闪退",
    "添加字幕时会闪退",
    "不可以下载高像素的视频。",
    "我高清导不出怎么办",
    "不能保存好像素的视屏",
]

# Class labels, aligned with dataSet: 1 = crash (闪退), 2 = HD export (高清导出).
# The first four sentences are class 1, the remaining three are class 2.
listClasses = [1] * 4 + [2] * 3
if __name__ == "__main__":
    # 1. Load the data set: segment every training sentence and drop its
    #    stop words, giving one token list per sentence.
    newDataSet = [segment_and_stop_word(sentence) for sentence in dataSet]
def getTestWordClassId(wordJson, words, text):
    """Pick the class whose known words give the test text the highest score.

    For every class the ``'pab'`` values of the tokens that the class knows
    are summed; the class with the strictly largest sum wins. On a tie the
    class encountered first while iterating ``wordJson`` is kept.

    :param wordJson: mapping ``class id -> {token: {'pab': value, ...}}``
        holding the per-class value of each word.
    :param words: tokens of the text being classified.
    :param text: the raw text; not used by the computation, kept so existing
        callers do not break.
    :return: the winning class id as an ``int``; ``1`` when no class scores
        above zero (for example when ``words`` or ``wordJson`` is empty).
    """
    bestScore = 0
    classId = 1  # fallback class when nothing scores above zero
    for candidate, wordStats in wordJson.items():
        # Tokens unknown to this class contribute nothing to its score.
        score = sum(wordStats[token]['pab'] for token in words if token in wordStats)
        if float(score) > float(bestScore):
            bestScore = score
            classId = candidate
    return int(classId)
if __name__ == "__main__":
    # 1. Segment every training sentence and drop its stop words.
    newDataSet = [segment_and_stop_word(sentence) for sentence in dataSet]

    # 2. Build the vocabulary from the token lists.
    #    NOTE(review): createVocabList is not defined in this part of the
    #    file; confirm what it returns.
    vocabList = createVocabList(newDataSet)

    # 3. Probability of each word appearing in each class.
    #    NOTE(review): setOfWords2Vec is not defined in this part of the
    #    file; getTestWordClassId expects {token: {'pab': ...}} per class.
    wordJson = {}
    classListSet = set(listClasses)
    for classId in classListSet:
        wordJson[classId] = setOfWords2Vec(classId, vocabList, newDataSet, listClasses)

    testDataSet = newDataSet  # test set: the training samples are reused

    # Pair every original sentence with its position so results can be
    # reported against the raw text.
    trainTextAndIds = [
        {"id": index, "text": sentence}
        for index, sentence in enumerate(dataSet)
    ]

    # Classify every sample and record the expected label ("train_id")
    # next to the predicted one ("test_id").
    classResult = {}
    for index, tokens in enumerate(testDataSet):
        sample = trainTextAndIds[index]
        classResult[sample['id']] = {
            "train_id": listClasses[index],
            "test_id": getTestWordClassId(wordJson, tokens, sample['text']),
            "id": sample['id'],
            "text": sample['text'],
        }
# Computing recall and accuracy
# Compute per-class accuracy and recall.
def getRate(dataSet, classVec):
    """Build per-class confusion counts and derive recall and accuracy.

    :param dataSet: mapping ``sample id -> record`` where every record has a
        ``'train_id'`` (expected class) and a ``'test_id'`` (predicted class).
    :param classVec: iterable of the class ids to report on.
    :return: list with one dict per class containing ``"tag_id"``,
        ``"recall"`` (TP / (TP + FN)), ``"accuracy"``
        ((TP + TN) / number of samples) and ``"row"`` (the raw
        TP/FN/FP/TN counts). Both ratios are rounded to 4 decimals and are
        ``0.0`` when their denominator is zero.
    :raises KeyError: if a record refers to a class id missing from
        ``classVec``.
    """
    rates = {
        classId: {'TP': 0, 'FN': 0, 'FP': 0, 'TN': 0}
        for classId in classVec
    }

    for record in dataSet.values():
        expected = record['train_id']
        predicted = record['test_id']
        if expected == predicted:
            rates[expected]['TP'] += 1   # TP: positive predicted as positive
        else:
            rates[expected]['FN'] += 1   # FN: positive predicted as negative
            rates[predicted]['FP'] += 1  # FP: negative predicted as positive

    total = len(dataSet)
    accuracy_recall_list = []
    for classId, row in rates.items():
        # TN: negative predicted as negative, i.e. every sample not yet
        # counted for this class.
        row['TN'] = total - row['TP'] - row['FP'] - row['FN']
        positives = row['TP'] + row['FN']
        accuracy_recall_list.append({
            "tag_id": classId,
            "recall": round(row['TP'] / positives, 4) if positives else 0.0,
            # TP + FP + TN + FN always equals `total`; guard the empty case
            # so an empty result set does not raise ZeroDivisionError.
            "accuracy": round((row['TP'] + row['TN']) / total, 4) if total else 0.0,
            "row": row,
        })
    return accuracy_recall_list
if __name__ == "__main__":
    # 1. Segment every training sentence and drop its stop words.
    newDataSet = [segment_and_stop_word(sentence) for sentence in dataSet]

    # 2. Build the vocabulary from the token lists.
    #    NOTE(review): createVocabList is not defined in this part of the
    #    file; confirm what it returns.
    vocabList = createVocabList(newDataSet)

    # 3. Probability of each word appearing in each class.
    #    NOTE(review): setOfWords2Vec is not defined in this part of the
    #    file; getTestWordClassId expects {token: {'pab': ...}} per class.
    wordJson = {}
    classListSet = set(listClasses)
    for classId in classListSet:
        wordJson[classId] = setOfWords2Vec(classId, vocabList, newDataSet, listClasses)

    testDataSet = newDataSet  # test set: the training samples are reused

    # Pair every original sentence with its position so results can be
    # reported against the raw text.
    trainTextAndIds = [
        {"id": index, "text": sentence}
        for index, sentence in enumerate(dataSet)
    ]

    # Classify every sample and record the expected label ("train_id")
    # next to the predicted one ("test_id").
    classResult = {}
    for index, tokens in enumerate(testDataSet):
        sample = trainTextAndIds[index]
        classResult[sample['id']] = {
            "train_id": listClasses[index],
            "test_id": getTestWordClassId(wordJson, tokens, sample['text']),
            "id": sample['id'],
            "text": sample['text'],
        }

    # 4. Compute recall and accuracy for every class.
    accuracy_recall_list = getRate(classResult, set(listClasses))