基于复旦大学语料库进行文本分类
资源:
首先是训练用的语料库
www.kesci.com/home/dataset/5d3a9c86cf76a600360edd04
然后是停用词也是借的:
blog.csdn.net/dataastron/…
开始
keras可能更通用一点吧,但本次使用的是sklearn的包 调包大师
import os
import jieba
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.metrics import classification_report
from sklearn.model_selection import cross_val_score
from sklearn.naive_bayes import MultinomialNB
先定义readfile方法,简化之后的文件读取操作
def readfile(path):
    """Read a file and return its entire content as one string.

    The corpus files are GB-encoded Chinese text; gb18030 is a superset
    of gb2312/gbk, and undecodable bytes are skipped (errors='ignore')
    rather than raising UnicodeDecodeError.
    """
    # 'with' guarantees the handle is closed even if read() raises,
    # unlike the manual open/close pair.
    with open(path, encoding='gb18030', errors='ignore') as fp:
        return fp.read()
定义存放训练集分词后各文档内容和文档类别标签的列表
# Each training document's segmented text, and its category label
# (the label is simply the name of the sub-directory it came from).
train_corpus = []
train_label = []
# Root of the corpus; every sub-directory under it is one category.
corpus_path = 'E:\\迅雷下载\\answer'
cate_list = os.listdir(corpus_path)
# Walk every file of every category directory.
for mydir in cate_list:
    # os.path.join is safer than manual '\\' concatenation and keeps
    # the script portable beyond Windows.
    class_path = os.path.join(corpus_path, mydir)
    file_list = os.listdir(class_path)
    for file_path in file_list:
        fullname = os.path.join(class_path, file_path)
        content = readfile(fullname).strip()
        # Collapse newlines so jieba segments one continuous text.
        content = content.replace('\n', ' ').strip()
        content_seg = jieba.cut(content)
        # Join tokens with spaces — the format CountVectorizer expects.
        train_corpus.append(' '.join(content_seg))
        train_label.append(mydir)
print("分词结束")
读取停用词
# Path to the stop-word file (one stop word per line).
stopword_path = 'E:\\迅雷下载\\stop.txt'
# splitlines() turns the file content into a list of words, which is
# the form CountVectorizer's stop_words parameter accepts.
stpwrdlst = readfile(stopword_path).splitlines()
另外还遇到一个这样的警告,当时不知如何处理:
UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
_warn_prf(average, modifier, msg_start, len(result))
该类会将文本中的词语转换为词频矩阵
# Turn documents into a term-frequency matrix, dropping stop words and
# any word that appears in more than half of the documents (max_df=0.5).
vectorizer = CountVectorizer(stop_words=stpwrdlst, max_df=0.5)
count_matrix = vectorizer.fit_transform(train_corpus)
# Re-weight the raw counts into TF-IDF scores.
transformer = TfidfTransformer()
tfidf = transformer.fit_transform(count_matrix)
# Dense view of the full document/term TF-IDF matrix.
weight = tfidf.toarray()
print(weight, "生成矩阵结束")
训练集按7:3进行划分为训练集和验证集
# Hold out 30% of the data for evaluation.
X_train, X_test, y_train, y_test = train_test_split(tfidf, train_label, test_size=0.3)
# Multinomial Naive Bayes suits TF-IDF/word-count features; small alpha
# means light Laplace smoothing, fit_prior learns class priors from data.
bayes_clf = MultinomialNB(alpha=0.01, fit_prior=True)
bayes_clf.fit(X_train, y_train)
y_pred = bayes_clf.predict(X_test)
print("训练结束,预测结果")
print("交叉验证结果")
# NOTE(review): cross-validating on the held-out test split is unusual;
# normally cross_val_score runs on the training data — confirm intent.
print(cross_val_score(bayes_clf, X_test, y_test, cv=5))
# zero_division=0 silences the UndefinedMetricWarning for labels that
# receive no predicted samples (their precision/F-score report as 0.0).
print(classification_report(y_test, y_pred, target_names=None, zero_division=0))
print(metrics.confusion_matrix(y_test, y_pred))
这里还涉及到一个内存不够的问题
解决方案
blog.csdn.net/youhuakongz…
最后是模型的存储和读取 使用pickle模块或者sklearn内部的joblib
一、使用pickle模块
from sklearn import svm
from sklearn import datasets
import pickle

clf = svm.SVC()
iris = datasets.load_iris()
X, y = iris.data, iris.target
clf.fit(X, y)

# pickle serializes to BYTES, so the file must be opened in binary mode
# ('wb'/'rb'); the original text-mode open('svm.txt', 'w') raises
# TypeError when writing bytes. pickle.dump/load write straight to the
# file object, avoiding the intermediate dumps()/loads() string.
with open('svm.txt', 'wb') as f:
    pickle.dump(clf, f)
with open('svm.txt', 'rb') as f2:
    clf2 = pickle.load(f2)
clf2.score(X, y)
二、使用joblib
joblib更适合大数据量的模型,且只能往硬盘存储,不能往字符串存储
# sklearn.externals.joblib was deprecated in scikit-learn 0.21 and
# removed in 0.23 — import the standalone joblib package instead.
import joblib

# joblib is better suited to models carrying large numpy arrays; it
# serializes to disk only (no in-memory string form like pickle.dumps).
joblib.dump(clf, 'filename.pkl')
clf = joblib.load('filename.pkl')