人工智能-分类任务与逻辑回归-代码实战(二)

270 阅读2分钟

今天做这道题

image.png

直接开做

我们就跟上一题的流程一样,直接开干

上代码


import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from matplotlib import pyplot as plt
from sklearn.metrics import accuracy_score

# 导入数据及可视化

# Load the dataset; column 'y' flags abnormal samples (1) vs normal (0).
data = pd.read_csv('task2_data.csv')
data.head()

# Boolean mask that is True on the abnormal (y == 1) rows.
mask = data.loc[:, 'y'] == 1
print(mask)

pay1_col = data.loc[:, 'pay1']
pay2_col = data.loc[:, 'pay2']

# Visualize both classes in the pay1/pay2 plane.
fig1 = plt.figure()
plt.ylabel('pay2')
plt.xlabel('pay1')
plt.title('pay1_pay2')
abnormal = plt.scatter(pay1_col[mask], pay2_col[mask])
normal = plt.scatter(pay1_col[~mask], pay2_col[~mask])
plt.legend((normal, abnormal), ('normal', 'abnormal'))
plt.show()

# Features are every column except the label; the target is 'y'.
x = data.drop(['y'], axis=1)
y = data.loc[:, 'y']

x.head()
y.head()


# Build and fit a first-order logistic regression on the raw features.
LR1 = LogisticRegression()
LR1.fit(x, y)

# Predict back on the training set.
y_predict = LR1.predict(x)
print(y_predict, y)

# Training-set accuracy of the linear model.
accuracy = accuracy_score(y, y_predict)
print(accuracy)


# Visualize the linear decision boundary:
#     theta0 + theta1 * pay1 + theta2 * pay2 = 0
theta0 = LR1.intercept_[0]
theta1, theta2 = LR1.coef_[0][0], LR1.coef_[0][1]

print(theta0, theta1, theta2)

x1 = data.loc[:, 'pay1']

# Rearrange the boundary equation to express pay2 as a function of pay1.
x2_new = -(theta0 + theta1 * x1) / theta2

print(x2_new)

# Scatter the two classes again and overlay the straight-line boundary.
fig2 = plt.figure()
plt.ylabel('pay2')
plt.xlabel('pay1')
plt.title('pay1_pay2')
abnormal = plt.scatter(data.loc[:, 'pay1'][mask], data.loc[:, 'pay2'][mask])
normal = plt.scatter(data.loc[:, 'pay1'][~mask], data.loc[:, 'pay2'][~mask])
plt.legend((normal, abnormal), ('normal', 'abnormal'))
plt.plot(x1, x2_new)
plt.show()

做起来的时候总感觉一切顺利,直到算出准确率只有80%、画出决策边界的时候

它怎么是个这样的呢,总感觉有点不对劲

image.png

我们看看,现在这样画好像分类分得也不是没有道理,就是有点牵强。是不是考虑用个二次的函数更合适。

增加二次项,重新敲一次代码


import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from matplotlib import pyplot as plt
from sklearn.metrics import accuracy_score

# 导入数据及可视化

# Read the labelled payment data; 'y' marks abnormal records with 1.
data = pd.read_csv('task2_data.csv')
data.head()

# True where the record is abnormal.
mask = data.loc[:, 'y'] == 1
print(mask)

# Plot the raw data, one scatter series per class.
fig1 = plt.figure()
plt.ylabel('pay2')
plt.xlabel('pay1')
plt.title('pay1_pay2')
abnormal = plt.scatter(data.loc[:, 'pay1'][mask], data.loc[:, 'pay2'][mask])
normal = plt.scatter(data.loc[:, 'pay1'][~mask], data.loc[:, 'pay2'][~mask])
plt.legend((normal, abnormal), ('normal', 'abnormal'))
plt.show()

# Separate the feature columns from the label.
x = data.drop(['y'], axis=1)
y = data.loc[:, 'y']

x.head()
y.head()

# Train the first-order (linear-feature) logistic regression.
LR1 = LogisticRegression()
LR1.fit(x, y)

# Predict on the training data itself and show both label vectors.
y_predict = LR1.predict(x)
print(y_predict, y)

# Report the training accuracy of the linear model.
accuracy = accuracy_score(y, y_predict)
print(accuracy)

# Extract the fitted parameters and draw the linear decision boundary.
theta0 = LR1.intercept_[0]
theta1, theta2 = LR1.coef_[0]

print(theta0, theta1, theta2)

x1 = data.loc[:, 'pay1']

# Boundary: theta0 + theta1*x1 + theta2*x2 = 0  =>  x2 = -(theta0 + theta1*x1) / theta2
x2_new = -(theta0 + theta1 * x1) / theta2

print(x2_new)

# Overlay the boundary line on the class scatter plot.
fig2 = plt.figure()
plt.ylabel('pay2')
plt.xlabel('pay1')
plt.title('pay1_pay2')
abnormal = plt.scatter(data.loc[:, 'pay1'][mask], data.loc[:, 'pay2'][mask])
normal = plt.scatter(data.loc[:, 'pay1'][~mask], data.loc[:, 'pay2'][~mask])
plt.legend((normal, abnormal), ('normal', 'abnormal'))
plt.plot(x1, x2_new)
plt.show()

# Build second-order features from the two raw payment columns.
x2 = data.loc[:, 'pay2']

x1_2 = x1 * x1
x2_2 = x2 * x2
x1_x2 = x1 * x2

print(x1_2.shape, x2_2.shape, x1_x2.shape)

# Assemble the expanded feature matrix for the quadratic boundary.
x_new = pd.DataFrame({
    'x1': x1,
    'x2': x2,
    'x1_2': x1_2,
    'x2_2': x2_2,
    'x1_x2': x1_x2,
})

print(x_new)

# Fit a second logistic regression on the expanded (quadratic) features.
LR2 = LogisticRegression()
LR2.fit(x_new, y)

# Predict on the training data with the quadratic model.
y_predict2 = LR2.predict(x_new)
print(y)
print(y_predict2)

# Training accuracy of the quadratic model.
accuracy2 = accuracy_score(y, y_predict2)
print(accuracy2)


# Predict a single new sample (pay1=80, pay2=20) with both models.
x_test = np.array([[80, 20]])
y_predict1_test = LR1.predict(x_test)
print(y_predict1_test)  # linear model's prediction
# Index [0] for the scalar label: `array == 1` yields a 1-element boolean
# array, and relying on its implicit truthiness is fragile.
print('abnormal' if y_predict1_test[0] == 1 else 'normal')
# The same sample expanded to LR2's feature order:
# [x1, x2, x1^2, x2^2, x1*x2] = [80, 20, 6400, 400, 1600]
x_test_2 = np.array([[80, 20, 6400, 400, 1600]])
y_predict2_test = LR2.predict(x_test_2)
print(y_predict2_test)  # quadratic model's prediction
print('abnormal' if y_predict2_test[0] == 1 else 'normal')



# Derive the quadratic decision boundary of LR2.
#
# The fitted boundary is
#     theta0 + theta1*x1 + theta2*x2 + theta3*x1^2 + theta4*x2^2 + theta5*x1*x2 = 0.
# Read as a quadratic in x2 (a*x2^2 + b*x2 + c = 0), the coefficients are
#     a = theta4
#     b = theta5*x1 + theta2
#     c = theta0 + theta1*x1 + theta3*x1^2
# which is solved with the quadratic formula below.

x1_new_2 = x1.sort_values()  # sort x1 so the boundary curve plots cleanly

theta0 = LR2.intercept_[0]
theta1, theta2, theta3, theta4, theta5 = LR2.coef_[0]
print(theta0, theta1, theta2, theta3, theta4, theta5)

a = theta4

b = theta5 * x1_new_2 + theta2

c = theta0 + theta1 * x1_new_2 + theta3 * x1_new_2 * x1_new_2

# Take the "+" root of the quadratic formula as the boundary curve.
x2_new_2 = (-b + np.sqrt(b * b - 4 * a * c)) / (2 * a)

print(x2_new_2)

# Plot the data with the quadratic decision boundary overlaid.
fig3 = plt.figure()
plt.ylabel('pay2')
plt.xlabel('pay1')
plt.title('pay1_pay2')
pay1_vals = data.loc[:, 'pay1']
pay2_vals = data.loc[:, 'pay2']
abnormal = plt.scatter(pay1_vals[mask], pay2_vals[mask])
normal = plt.scatter(pay1_vals[~mask], pay2_vals[~mask])
plt.legend((normal, abnormal), ('normal', 'abnormal'))
plt.plot(x1_new_2, x2_new_2)
plt.show()

看看最后的图

image.png

再看看这图,合理得不行