先想想,我们用不同的分类技术去做题的流程是不是一样的?
首先加载数据,数据预处理,然后建立模型训练,再预测,最后可视化结果。中间训练的方法可能不一样,但是在整体流程上都是差不多的。
那么我们开始做题
直接看题
敲代码
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from matplotlib import pyplot as plt
from sklearn import tree
# Load the training table from disk.
data = pd.read_csv('task1_data.csv')
data.head()  # preview only; the result is discarded when run as a script

# Separate the label column 'y' from the remaining feature columns.
y = data['y']
x = data.drop(columns=['y'])
x.head()
y.head()

# Print both shapes so the row counts can be compared.
for part in (x, y):
    print(part.shape)
# Train a decision tree using the entropy criterion; every leaf must hold at
# least 5 training samples.
model = DecisionTreeClassifier(criterion='entropy', min_samples_leaf=5)
model.fit(x, y)

# Predict on the training data itself and show predictions next to the labels.
y_predict = model.predict(x)
print(y_predict)
print(y)
print(y == y_predict)

# Evaluate: note this accuracy is measured on the same rows used for training.
accuracy = accuracy_score(y, y_predict)
print(accuracy)

# Predict a single hand-written test sample.
x_test = np.array([[1, 0, 1, 1]])
# The model was fitted on a DataFrame, so label the sample with the same
# column names; a bare array makes scikit-learn warn about missing feature
# names.
y_test_predict = model.predict(pd.DataFrame(x_test, columns=x.columns))
print(y_test_predict)
# Compare the single predicted label as a scalar instead of relying on the
# truthiness of a one-element array.
print('适合' if y_test_predict[0] == 1 else '不适合')
# Draw the fitted tree on a new figure. fig.show() raised an error, so the
# picture is meant to be saved to a file instead (the save call below is
# currently disabled).
# NOTE(review): 200x200 inches is a very large canvas - confirm it is intended.
fig1 = plt.figure(figsize=(200, 200))
feature_labels = ['Skill', 'Experience', 'Degree', 'income']
class_labels = ['Un-qualified', 'Qualified']
tree.plot_tree(
    model,
    filled=True,
    feature_names=feature_labels,
    class_names=class_labels,
)
# fig1.savefig('dc_result2.png')
# The plotted first tree looked somewhat over-fitted, so adjust
# min_samples_leaf and repeat the whole run with a second model.
model2 = DecisionTreeClassifier(min_samples_leaf=50, criterion='entropy').fit(x, y)

# Accuracy of the second model on the training data.
y_predict2 = model2.predict(x)
accuracy2 = accuracy_score(y_true=y, y_pred=y_predict2)
print(accuracy2)

# Plot the second tree on its own figure (saving is currently disabled).
fig2 = plt.figure(figsize=(200, 200))
feature_labels = ['Skill', 'Experience', 'Degree', 'income']
class_labels = ['Un-qualified', 'Qualified']
tree.plot_tree(
    model2,
    filled=True,
    feature_names=feature_labels,
    class_names=class_labels,
)
# fig2.savefig('dc_result3.png')
# Chinese font support for the plot labels.
# The string below is a note-to-self: it records a method found via Baidu
# (the linked blog post) and says it has not been used yet.
"""
百度找到的方法:https://www.cnblogs.com/wangjunyan/p/8572079.html
还没使用
"""
# Disabled variant of the model2 plot that uses Chinese feature and class
# names; left commented out.
# fig3 = plt.figure(figsize=(200, 200))
# tree.plot_tree(model2, filled=True, feature_names=['技能', '经验', '学历', '预期收入'], class_names=['不合适', '合适'])
# fig3.savefig('dc_result4.png')