代码
# -*- coding: UTF-8 -*-
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from pylab import *
from scipy import stats
# Load the passenger table from the Excel sheet (first row is the header).
df = pd.read_excel("D:/学习/大数据/每周任务/第6周数据/data.xlsx", header=0)
# Fares of exactly 0 or above 500 are treated as abnormal records; build the
# outlier mask first, then keep only the rows that are NOT flagged by it.
fare_is_outlier = df['Fare'].gt(500) | df['Fare'].eq(0)
df = df.loc[~fare_is_outlier]
# Group the cleaned rows by port of embarkation for the per-port statistics.
grouped_df = df.groupby('Embarked')
# Per-group summary statistics: mean, variance, standard deviation,
# coefficient of variation, minimum, maximum and sample count.
def Static_Info(type_str, grouped=None):
    """Summarise one column for every group of a pandas GroupBy.

    Args:
        type_str: Name of the column to summarise (e.g. 'Age' or 'Fare').
        grouped: GroupBy object to summarise. Defaults to the module-level
            ``grouped_df`` so existing one-argument calls keep working.

    Returns:
        A DataFrame indexed by group key with the columns ``mean``, ``var``,
        ``std``, ``cov``, ``min``, ``max`` and ``count`` (in that order).
        ``cov`` is the coefficient of variation (std / mean), not a
        covariance. ``count`` is the number of non-missing values of
        ``type_str`` in each group.
    """
    if grouped is None:
        grouped = grouped_df
    # Select the column before aggregating: only the requested column is
    # reduced, so unrelated (possibly non-numeric) columns are never touched.
    column = grouped[type_str]
    info_df = column.agg(['mean', 'var', 'std', 'min', 'max', 'count'])
    # Place the coefficient of variation right after 'std' to keep the
    # original column order.
    info_df.insert(3, 'cov', info_df['std'] / info_df['mean'])
    return info_df
# Print the per-port summary table for age, then for fare, each under a
# dashed banner line.
for _heading, _column in (('年龄统计信息:', 'Age'), ('票价统计信息:', 'Fare')):
    _rule = '-' * 20
    print(_rule + _heading + _rule)
    print(Static_Info(_column))
# Font setup so that Chinese labels and the minus sign render correctly.
plt.rcParams['font.sans-serif'] = ['SimHei']  # font with CJK glyphs for the labels
plt.rcParams['axes.unicode_minus'] = False  # draw '-' as a plain ASCII minus
# Plot the density-normalised fare histogram (50 bins).
plt.hist(df.Fare, 50, density=True)
# Use the Series' own min()/max() rather than the bare names min/max: the
# file does `from pylab import *`, so which min/max is in scope may depend on
# the installed numpy/matplotlib versions. The Series methods are explicit
# and skip missing values.
plt.axis([df.Fare.min(), df.Fare.max(), 0, 0.06])
plt.xlabel(u'价 格')
plt.ylabel(u'概 率')
plt.show()
# Shapiro-Wilk normality checks on ticket fares: all passengers first, then
# the passengers who embarked at port C and at port S.

# p-value threshold above which fares are reported as normally distributed.
# NOTE(review): 0.005 is kept exactly as in the original script; 0.05 is the
# more common significance level - confirm that 0.005 is intended.
SHAPIRO_ALPHA = 0.005


def _report_shapiro(label, fares):
    """Run a Shapiro-Wilk test on ``fares`` and print the verdict.

    Args:
        label: Text naming the sample in the printed messages
            (e.g. '总体', 'C港口').
        fares: Fare values to test.

    Returns:
        The ``(w, p)`` pair returned by ``stats.shapiro``.
    """
    print("-" * 50)
    print("使用shapiro方法检验%s船票价格是否符合正态分布:" % label)
    statistic, p_value = stats.shapiro(fares)
    print('w值为:%f,p值为:%f' % (statistic, p_value))
    if p_value > SHAPIRO_ALPHA:
        print('%s票价符合正态分布' % label)
    else:
        # The original message read '符不合' (characters transposed).
        print('%s票价不符合正态分布' % label)
    return statistic, p_value


# The result names of the original script (w/p, w_C/p_C, w_S/p_S) are kept
# so any later code that reads them still works.
w, p = _report_shapiro('总体', df['Fare'])
w_C, p_C = _report_shapiro('C港口', df[df.Embarked == 'C'].Fare)
w_S, p_S = _report_shapiro('S港口', df[df.Embarked == 'S'].Fare)
print("-" * 50)
运行结果

剔除粗差后票价概率分布图

卡方检验
待补充
C港S港价格之差服从何种分布
待补充