参与拿奖:本文已参与「新人创作礼」活动,一起开启掘金创作之路
1.朴素贝叶斯
朴素:在整个形式化过程中只做最原始、最简单的假设
核心思想:选择具有最高概率的决策
贝叶斯概率引入先验知识和逻辑推理来处理不确定命题
频数概率只从数据本身获得结论,并不考虑逻辑推理和先验知识
适用数据类型:标称型数据
优点:在数据较少的情况下仍然有效,可以处理多类别问题
缺点:对于输入数据的准备方式较为敏感
使用条件概率来分类:
p(c|x,y)表示给定数据点(x,y)时,该点属于类别c的概率
p(c|x,y)=p(x,y|c)p(c)/p(x,y)
然后将所有p(c|x,y)求出选择其取最大值时的类别
2.一般步骤
数据搜集:获取数据集
准备数据:需要数值型或者布尔型数据
分析数据:有大量特征时,绘制特征作用不大,此时使用直方图效果较好
训练算法:计算不同的独立特征的条件概率
测试算法:计算错误率
使用算法:文档分类
注意:所需的样本数会随着特征数目的增大而迅速增大
3.案例
3.1 使用朴素贝叶斯进行文本分类(判断言论是否具有侮辱性)
贝叶斯分类器有两种实现方式:
一种基于伯努利模型实现,不考虑词在文档中出现的次数,只考虑是否出现;
另一种基于多项式模型实现,考虑词在文档中出现的次数。
这里以第一种为例。
from numpy import *
def loadDataSet():
    """Return the toy corpus used by the examples.

    Returns:
        A ``(postingList, classVec)`` pair: ``postingList`` is a list of six
        tokenised posts (lists of words) and ``classVec`` the matching labels,
        where 1 marks an abusive post and 0 a normal one.
    """
    # Each entry pairs a tokenised post with its label (1 = abusive, 0 = not).
    labelled_posts = [
        (['my', 'dog', 'has', 'flea', 'problems', 'help', 'please'], 0),
        (['maybe', 'not', 'take', 'him', 'to', 'dog', 'park', 'stupid'], 1),
        (['my', 'dalmation', 'is', 'so', 'cute', 'I', 'love', 'him'], 0),
        (['stop', 'posting', 'stupid', 'worthless', 'garbage'], 1),
        (['mr', 'licks', 'ate', 'my', 'steak', 'how', 'to', 'stop', 'him'], 0),
        (['quit', 'buying', 'worthless', 'dog', 'food', 'stupid'], 1),
    ]
    postingList = [tokens for tokens, _ in labelled_posts]
    classVec = [label for _, label in labelled_posts]
    return postingList, classVec
def createVocabList(dataSet):
    """Collect every distinct word that appears in any document.

    Args:
        dataSet: iterable of documents, each an iterable of words.

    Returns:
        A list holding each distinct word once. The order comes from set
        iteration and is therefore arbitrary.
    """
    # Union of all documents' words in one call; duplicates collapse.
    unique_words = set().union(*dataSet)
    return list(unique_words)
def setOfWords2Vec(vocabList, inputSet):
    """Encode a document as a set-of-words (presence/absence) vector.

    Args:
        vocabList: list of vocabulary words; position i of the result
            corresponds to ``vocabList[i]``.
        inputSet: iterable of words making up the document to encode.

    Returns:
        A list of 0/1 ints the same length as ``vocabList``; an entry is 1
        when the corresponding vocabulary word occurs in ``inputSet``.
        Words absent from the vocabulary are reported on stdout and skipped.
    """
    # Map each word to its first position (what list.index would return),
    # so each lookup is O(1) instead of a linear scan.
    first_index = {}
    for idx, term in enumerate(vocabList):
        first_index.setdefault(term, idx)

    returnVec = [0] * len(vocabList)
    for word in inputSet:
        idx = first_index.get(word)
        if idx is None:
            # The original formatted the undefined name `world` here and
            # raised NameError for every out-of-vocabulary word.
            print('the word: %s is not in my Vocabulary!' % (word,))
        else:
            returnVec[idx] = 1
    return returnVec
# Smoke test: load the toy corpus and build its vocabulary list.
listOPosts,listClasses=loadDataSet()
myVocabList=createVocabList(listOPosts)
def trainNB0(trainMatrix, trainCategory):
    """Estimate the naive Bayes parameters for a two-class problem.

    Args:
        trainMatrix: sequence of document vectors (one row per document),
            all of the same length.
        trainCategory: sequence of labels, one per row; rows labelled 1 are
            counted for class 1 and every other row for class 0.

    Returns:
        ``(p0Vect, p1Vect, pAbusive)``: the per-word log frequencies for
        class 0 and class 1, and the sum of the labels divided by the number
        of documents (the class-1 prior when labels are 0/1).
    """
    doc_count = len(trainMatrix)
    vocab_size = len(trainMatrix[0])
    pAbusive = sum(trainCategory) / float(doc_count)

    # Slot 0 accumulates class 0 and slot 1 accumulates class 1. Counts start
    # at one and denominators at 2.0 so that no word gets zero probability.
    word_totals = [ones(vocab_size), ones(vocab_size)]
    denominators = [2.0, 2.0]
    for i, row in enumerate(trainMatrix):
        bucket = 1 if trainCategory[i] == 1 else 0
        word_totals[bucket] += row
        denominators[bucket] += sum(row)

    # Work in log space so later products of many small probabilities do not
    # underflow.
    p1Vect = log(word_totals[1] / denominators[1])
    p0Vect = log(word_totals[0] / denominators[0])
    return p0Vect, p1Vect, pAbusive
# Demo: vectorise every toy post, train the classifier on the result and
# print the returned prior.
listOPosts, listClasses = loadDataSet()
myVocabList = createVocabList(listOPosts)
trainMat = [setOfWords2Vec(myVocabList, postinDoc) for postinDoc in listOPosts]
p0V, p1V, pAb = trainNB0(trainMat, listClasses)
print("词汇为侮辱性概率:%f" % pAb)
# 分类器进行训练与测试
# 测试算法:根据实际情况修改分类器
# 在计算多个概率的乘积时,只要其中一个概率为0,乘积就为0;为避免这种情况,可将每个词的出现次数初始化为1,并将分母初始化为2
# p0Num = ones(numWords);
# p1Num = ones(numWords) #change to ones()
# p0Denom = 2.0; p1Denom = 2.0
# 由于太多太小的数相乘会导致下溢出
# 可以通过取对数来避免下溢出或者浮点数舍入导致的错误
# p1Vect = log(p1Num/p1Denom) #change to log()
# p0Vect = log(p0Num/p0Denom) #change to log()
def classifyNB(vec2Classify, p0Vec, p1Vec, pClass1):
    """Classify a document vector with the trained naive Bayes parameters.

    Args:
        vec2Classify: array holding the document vector to classify.
        p0Vec: per-word log-probability vector for class 0 (from trainNB0).
        p1Vec: per-word log-probability vector for class 1 (from trainNB0).
        pClass1: prior probability of class 1.

    Returns:
        1 when the class-1 log score is strictly greater than the class-0
        log score, otherwise 0.
    """
    # The element-wise product keeps only the log-probabilities of the words
    # present in the document; adding the log prior gives the class score.
    likelihood_1 = sum(vec2Classify * p1Vec)
    score_1 = likelihood_1 + log(pClass1)
    likelihood_0 = sum(vec2Classify * p0Vec)
    score_0 = likelihood_0 + log(1.0 - pClass1)
    return 1 if score_1 > score_0 else 0
def testingNB():
    """Train on the toy corpus and print the predicted class of two samples.

    Builds the vocabulary and set-of-words training matrix from
    ``loadDataSet()``, trains with ``trainNB0`` and prints the result of
    ``classifyNB`` for two hard-coded test entries.
    """
    listOPosts, listClasses = loadDataSet()
    myVocabList = createVocabList(listOPosts)
    trainMat = [setOfWords2Vec(myVocabList, post) for post in listOPosts]
    p0V, p1V, pAb = trainNB0(array(trainMat), array(listClasses))

    samples = (['love', 'my', 'dalmation'], ['stupid', 'garbage'])
    for testEntry in samples:
        thisDoc = array(setOfWords2Vec(myVocabList, testEntry))
        print(testEntry, 'classified as: ', classifyNB(thisDoc, p0V, p1V, pAb))
def bagOfWords2VecMN(vocabList, inputSet):
    """Encode a document as a bag-of-words (occurrence count) vector.

    Unlike the set-of-words model, which records only whether a word occurs,
    this model counts every occurrence of a word.

    Args:
        vocabList: list of vocabulary words; position i of the result
            corresponds to ``vocabList[i]``.
        inputSet: iterable of words making up the document.

    Returns:
        A list of ints the same length as ``vocabList`` holding how many
        times each vocabulary word occurs in ``inputSet``. Words not in the
        vocabulary are ignored.
    """
    counts = [0] * len(vocabList)
    for token in inputSet:
        try:
            slot = vocabList.index(token)
        except ValueError:
            # Token is not in the vocabulary: skip it.
            continue
        counts[slot] += 1
    return counts
# Run the toy-corpus classification demo.
testingNB()
3.2 使用朴素贝叶斯过滤垃圾邮件
import re

# Demo: split a sentence into word tokens on runs of non-word characters.
s = 'I love i, not you!'
# Use \W+ (one or more). \W* can match the empty string, and on Python 3.7+
# re.split also splits on empty matches, which would cut the text into
# single characters.
regEx = re.compile(r'\W+')
listOfTokens = regEx.split(s)
# Drop the empty string produced by the trailing '!'.
res = [token for token in listOfTokens if len(token) > 0]
# Bare expression: shows the value when run in a notebook cell.
res
def textParse(bigString):
    """Split a large string into lower-cased word tokens.

    Args:
        bigString: the text to tokenise.

    Returns:
        A list of the tokens obtained by splitting on runs of non-word
        characters, lower-cased, keeping only tokens longer than two
        characters.
    """
    import re

    # Raw string: '\W' in a plain literal is an invalid escape sequence and
    # triggers a warning on modern Python.
    listOfTokens = re.split(r'\W+', bigString)
    return [tok.lower() for tok in listOfTokens if len(tok) > 2]
def spamTest():
    """Run one hold-out validation round of the spam classifier.

    Reads ``email/spam/1.txt`` .. ``25.txt`` and ``email/ham/1.txt`` ..
    ``25.txt``, builds bag-of-words vectors, holds out 10 randomly chosen
    documents as a test set, trains on the rest and prints each misclassified
    document followed by the error rate. Returns nothing.

    The result varies between runs because the split is random; run it
    several times and average for a more stable error estimate.
    """
    docList = []
    classList = []
    # (path template, label) pairs; label 1 = spam, 0 = ham.
    sources = ((r'email/spam/%d.txt', 1), (r'email/ham/%d.txt', 0))
    for i in range(1, 26):
        for template, label in sources:
            # Context manager closes the file; the original leaked handles.
            with open(template % i, encoding='ISO-8859-15') as handle:
                wordList = textParse(handle.read())
            docList.append(wordList)
            classList.append(label)

    vocabList = createVocabList(docList)

    # Hold-out split: move 10 random document indices from the training set
    # into the test set.
    trainingSet = list(range(len(docList)))
    testSet = []
    for _ in range(10):
        # NOTE(review): `random` is presumably numpy.random, brought in by the
        # file's `from numpy import *` -- confirm.
        randIndex = int(random.uniform(0, len(trainingSet)))
        testSet.append(trainingSet[randIndex])
        del trainingSet[randIndex]

    # Train on the documents that remain in the training set.
    trainMat = []
    trainClasses = []
    for docIndex in trainingSet:
        trainMat.append(bagOfWords2VecMN(vocabList, docList[docIndex]))
        trainClasses.append(classList[docIndex])
    p0V, p1V, pSpam = trainNB0(array(trainMat), array(trainClasses))

    # Classify the held-out documents and count the mistakes.
    errorCount = 0
    for docIndex in testSet:
        wordVector = bagOfWords2VecMN(vocabList, docList[docIndex])
        if classifyNB(array(wordVector), p0V, p1V, pSpam) != classList[docIndex]:
            errorCount += 1
            print("classification error", docList[docIndex])
    print('the error rate is: ', float(errorCount) / len(testSet))
# Run one hold-out validation round of the spam filter.
spamTest()
4.运行结果
5.参考资料
[1] 机器学习实战
[2] 书籍源码
[3] jupyter版本