数据分析必学案例--波士顿犯罪分析

14 阅读9分钟
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from pylab import rcParams
from pandas.api.types import CategoricalDtype
import warnings
warnings.filterwarnings('ignore')
import os
import folium
from folium.plugins import HeatMap
rcParams['figure.figsize'] = 22,11
# Load the Boston crime records into a DataFrame.
# NOTE(review): the path below is absolute and specific to one machine; point it at
# your own copy of crime.csv before running.
# The file is read as latin-1 — presumably it is not valid UTF-8; TODO confirm.
df = pd.read_csv('/Users/fangcheng/sklearn/项目五:EDA-波士顿犯罪分析.ipynb/crime.csv',encoding='latin-1')
# Preview the first five rows.
df.head()
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
INCIDENT_NUMBER OFFENSE_CODE OFFENSE_CODE_GROUP OFFENSE_DESCRIPTION DISTRICT REPORTING_AREA SHOOTING OCCURRED_ON_DATE YEAR MONTH DAY_OF_WEEK HOUR UCR_PART STREET Lat Long Location
0 I182070945 619 Larceny LARCENY ALL OTHERS D14 808 NaN 2018-09-02 13:00:00 2018 9 Sunday 13 Part One LINCOLN ST 42.357791 -71.139371 (42.35779134, -71.13937053)
1 I182070943 1402 Vandalism VANDALISM C11 347 NaN 2018-08-21 00:00:00 2018 8 Tuesday 0 Part Two HECLA ST 42.306821 -71.060300 (42.30682138, -71.06030035)
2 I182070941 3410 Towed TOWED MOTOR VEHICLE D4 151 NaN 2018-09-03 19:27:00 2018 9 Monday 19 Part Three CAZENOVE ST 42.346589 -71.072429 (42.34658879, -71.07242943)
3 I182070940 3114 Investigate Property INVESTIGATE PROPERTY D4 272 NaN 2018-09-03 21:16:00 2018 9 Monday 21 Part Three NEWCOMB ST 42.334182 -71.078664 (42.33418175, -71.07866441)
4 I182070938 3114 Investigate Property INVESTIGATE PROPERTY B3 421 NaN 2018-09-03 21:05:00 2018 9 Monday 21 Part Three DELHI ST 42.275365 -71.090361 (42.27536542, -71.09036101)
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 319073 entries, 0 to 319072
Data columns (total 17 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   INCIDENT_NUMBER      319073 non-null  object 
 1   OFFENSE_CODE         319073 non-null  int64  
 2   OFFENSE_CODE_GROUP   319073 non-null  object 
 3   OFFENSE_DESCRIPTION  319073 non-null  object 
 4   DISTRICT             317308 non-null  object 
 5   REPORTING_AREA       319073 non-null  object 
 6   SHOOTING             1019 non-null    object 
 7   OCCURRED_ON_DATE     319073 non-null  object 
 8   YEAR                 319073 non-null  int64  
 9   MONTH                319073 non-null  int64  
 10  DAY_OF_WEEK          319073 non-null  object 
 11  HOUR                 319073 non-null  int64  
 12  UCR_PART             318983 non-null  object 
 13  STREET               308202 non-null  object 
 14  Lat                  299074 non-null  float64
 15  Long                 299074 non-null  float64
 16  Location             319073 non-null  object 
dtypes: float64(2), int64(4), object(11)
memory usage: 41.4+ MB
df.describe().T
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
count mean std min 25% 50% 75% max
OFFENSE_CODE 319073.0 2317.546956 1185.285543 111.000000 1001.000000 2907.000000 3201.000000 3831.000000
YEAR 319073.0 2016.560586 0.996344 2015.000000 2016.000000 2017.000000 2017.000000 2018.000000
MONTH 319073.0 6.609719 3.273691 1.000000 4.000000 7.000000 9.000000 12.000000
HOUR 319073.0 13.118205 6.294205 0.000000 9.000000 14.000000 18.000000 23.000000
Lat 299074.0 42.214381 2.159766 -1.000000 42.297442 42.325538 42.348624 42.395042
Long 299074.0 -70.908272 3.493618 -71.178674 -71.097135 -71.077524 -71.062467 -1.000000

数据初步分析及缺失值的处理

def missing_zero_values_table(df):
    """Summarise zero and missing values for every column of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect. It is not modified.

    Returns
    -------
    pandas.DataFrame
        One row per column of ``df`` that has at least one missing value,
        sorted by ``'total Miss Zero Values'`` (descending) and rounded to one
        decimal. Columns:

        * ``'Zero Values'`` - number of cells equal to 0
        * ``'Miss Values'`` - number of null cells
        * ``'% of the Miss Values'`` - null cells as a percentage (0-100) of rows
        * ``'total Miss Zero Values'`` - zeros plus nulls
        * ``'Data Type'`` - dtype of the column

    A one-line summary is also printed to stdout.
    """
    n_rows = len(df)
    zero_counts = (df == 0.00).astype(int).sum(axis=0)
    miss_counts = df.isnull().sum()
    miss_percent = 100 * miss_counts / n_rows

    mz_table = pd.concat([zero_counts, miss_counts, miss_percent], axis=1)
    mz_table = mz_table.rename(
        columns={0: 'Zero Values', 1: 'Miss Values', 2: '% of the Miss Values'}
    )
    mz_table['total Miss Zero Values'] = mz_table['Miss Values'] + mz_table['Zero Values']
    # The percentage column must stay a true percentage of *missing* values; it is
    # deliberately not recomputed from the zero+missing total (which would also be
    # an unscaled fraction rather than a percent).
    mz_table['Data Type'] = df.dtypes

    # Keep only the columns that actually contain missing values.
    mz_table = (
        mz_table[mz_table['Miss Values'] != 0]
        .sort_values('total Miss Zero Values', ascending=False)
        .round(1)
    )
    print('ypur selected datafeame has'+str(df.shape[0])+'rows,and has'+str(df.shape[1])+'columns.Theree are '+str(mz_table.shape[0])+'columns that have missing values')
    return mz_table
missing_zero_values_table(df)
ypur selected datafeame has319073rows,and has17columns.Theree are 6columns that have missing values
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
Zero Values Miss Values % of the Miss Values total Miss Zero Values Data Type
SHOOTING 0 318054 1.0 318054 object
Lat 0 19999 0.1 19999 float64
Long 0 19999 0.1 19999 float64
STREET 0 10871 0.0 10871 object
DISTRICT 0 1765 0.0 1765 object
UCR_PART 0 90 0.0 90 object
# Missing-value map: each cell of the heatmap marks whether the corresponding
# DataFrame cell is null. Row tick labels are suppressed.
figure = plt.figure(figsize=(10, 6))
sns.heatmap(data=df.isnull(), yticklabels='')
<Axes: >




png

# SHOOTING is null for all but 1019 of the 319073 rows (see df.info()), so the
# column is removed from the frame in place.
df.drop(columns='SHOOTING', inplace=True)

数据处理及增加数据新特征

df.dtypes
INCIDENT_NUMBER         object
OFFENSE_CODE             int64
OFFENSE_CODE_GROUP      object
OFFENSE_DESCRIPTION     object
DISTRICT                object
REPORTING_AREA          object
OCCURRED_ON_DATE        object
YEAR                     int64
MONTH                    int64
DAY_OF_WEEK             object
HOUR                     int64
UCR_PART                object
STREET                  object
Lat                    float64
Long                   float64
Location                object
dtype: object
# Parse the timestamp strings into datetime64 so the .dt accessor can be used later.
df['OCCURRED_ON_DATE'] = pd.to_datetime(df['OCCURRED_ON_DATE'])

# Store the label-like text columns as pandas categoricals.
for column in ('OFFENSE_CODE_GROUP', 'OFFENSE_DESCRIPTION', 'DISTRICT', 'DAY_OF_WEEK', 'UCR_PART'):
    df[column] = df[column].astype('category')
df.dtypes
INCIDENT_NUMBER                object
OFFENSE_CODE                    int64
OFFENSE_CODE_GROUP           category
OFFENSE_DESCRIPTION          category
DISTRICT                     category
REPORTING_AREA                 object
OCCURRED_ON_DATE       datetime64[ns]
YEAR                            int64
MONTH                           int64
DAY_OF_WEEK                  category
HOUR                            int64
UCR_PART                     category
STREET                         object
Lat                           float64
Long                          float64
Location                       object
dtype: object
# Shorter column names used by the rest of the analysis.
# NOTE: 'Decription' (sic) is the name the frame exposes from here on, so the
# spelling is kept unchanged.
rename = {'OFFENSE_CODE_GROUP':'group','OFFENSE_DESCRIPTION':'Decription','DISTRICT':'District','OCCURRED_ON_DATE':'Date',
         'YEAR':'Year','MONTH':'Month','DAY_OF_WEEK':'Day','HOUR':'Hour','STREET':'Street'}
# Rename the columns only. The row index is deliberately left alone: passing
# index=str here would map every integer row label through str() and silently
# turn the index into strings.
df.rename(columns = rename,inplace = True)
df.head()
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
INCIDENT_NUMBER OFFENSE_CODE group Decription District REPORTING_AREA Date Year Month Day Hour UCR_PART Street Lat Long Location
0 I182070945 619 Larceny LARCENY ALL OTHERS D14 808 2018-09-02 13:00:00 2018 9 Sunday 13 Part One LINCOLN ST 42.357791 -71.139371 (42.35779134, -71.13937053)
1 I182070943 1402 Vandalism VANDALISM C11 347 2018-08-21 00:00:00 2018 8 Tuesday 0 Part Two HECLA ST 42.306821 -71.060300 (42.30682138, -71.06030035)
2 I182070941 3410 Towed TOWED MOTOR VEHICLE D4 151 2018-09-03 19:27:00 2018 9 Monday 19 Part Three CAZENOVE ST 42.346589 -71.072429 (42.34658879, -71.07242943)
3 I182070940 3114 Investigate Property INVESTIGATE PROPERTY D4 272 2018-09-03 21:16:00 2018 9 Monday 21 Part Three NEWCOMB ST 42.334182 -71.078664 (42.33418175, -71.07866441)
4 I182070938 3114 Investigate Property INVESTIGATE PROPERTY B3 421 2018-09-03 21:05:00 2018 9 Monday 21 Part Three DELHI ST 42.275365 -71.090361 (42.27536542, -71.09036101)

增加新特征用于后面的时间影响分析

def create_features(df):
    """Derive calendar features from ``df['Date']`` and add them to ``df``.

    The columns ``dayofweek``, ``quarter``, ``dayofyear``, ``dayofmonth`` and
    ``weekofyear`` are written onto ``df`` in place (callers rely on this side
    effect).

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a datetime64 column named ``'Date'``.

    Returns
    -------
    pandas.DataFrame
        A new frame holding only the five derived columns.
    """
    dates = df['Date'].dt

    df['dayofweek'] = dates.dayofweek
    df['quarter'] = dates.quarter
    df['dayofyear'] = dates.dayofyear
    df['dayofmonth'] = dates.day

    # Series.dt.weekofyear was removed in pandas 2.0; isocalendar().week is the
    # documented replacement and yields the same ISO week number. It returns a
    # nullable UInt32 column, so cast back to a plain NumPy dtype: int64 when
    # every date is present, float64 (NaN for NaT) otherwise.
    iso_week = dates.isocalendar().week
    df['weekofyear'] = iso_week.astype('float64' if iso_week.isna().any() else 'int64')

    X = df[['dayofweek', 'quarter', 'dayofyear', 'dayofmonth', 'weekofyear']]
    return X
create_features(df).head()
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
dayofweek quarter dayofyear dayofmonth weekofyear
0 6 3 245 2 35
1 1 3 233 21 34
2 0 3 246 3 36
3 0 3 246 3 36
4 0 3 246 3 36
X = create_features(df)
X.dtypes
dayofweek     int64
quarter       int64
dayofyear     int64
dayofmonth    int64
weekofyear    int64
dtype: object
X.shape
(319073, 5)
df.shape
(319073, 21)
df.columns
Index(['INCIDENT_NUMBER', 'OFFENSE_CODE', 'group', 'Decription', 'District',       'REPORTING_AREA', 'Date', 'Year', 'Month', 'Day', 'Hour', 'UCR_PART',       'Street', 'Lat', 'Long', 'Location', 'dayofweek', 'quarter',       'dayofyear', 'dayofmonth', 'weekofyear'],
      dtype='object')
df.dtypes
INCIDENT_NUMBER            object
OFFENSE_CODE                int64
group                    category
Decription               category
District                 category
REPORTING_AREA             object
Date               datetime64[ns]
Year                        int64
Month                       int64
Day                      category
Hour                        int64
UCR_PART                 category
Street                     object
Lat                       float64
Long                      float64
Location                   object
dayofweek                   int64
quarter                     int64
dayofyear                   int64
dayofmonth                  int64
weekofyear                  int64
dtype: object
# The derived calendar features are treated as categories rather than numbers.
for derived in ('dayofweek', 'quarter', 'dayofyear', 'dayofmonth', 'weekofyear'):
    df[derived] = df[derived].astype('category')
df.dtypes
INCIDENT_NUMBER            object
OFFENSE_CODE                int64
group                    category
Decription               category
District                 category
REPORTING_AREA             object
Date               datetime64[ns]
Year                        int64
Month                       int64
Day                      category
Hour                        int64
UCR_PART                 category
Street                     object
Lat                       float64
Long                      float64
Location                   object
dayofweek                category
quarter                  category
dayofyear                category
dayofmonth               category
weekofyear               category
dtype: object

EDA 探索犯罪与时间的关系

# Number of recorded incidents per calendar month.
month_fig = plt.figure(figsize=(8, 5))
sns.countplot(x='Month', data=df, ax=month_fig.gca())
<Axes: xlabel='Month', ylabel='count'>




png

从图中可以看出,6、7、8月的案件数量最多,夏季是犯罪的高发期。需要注意:这里统计的是案件数量而不是犯罪率;并且各年份覆盖的月份可能并不完整(例如前几行记录的日期停在2018年9月初),因此不同月份之间的比较会受到数据覆盖范围的影响。

# Number of recorded incidents per year.
year_fig = plt.figure(figsize=(8, 5))
sns.countplot(x='Year', data=df, ax=year_fig.gca())
<Axes: xlabel='Year', ylabel='count'>




png

# Number of recorded incidents per hour of the day.
hour_fig = plt.figure(figsize=(8, 5))
sns.countplot(x='Hour', data=df, ax=hour_fig.gca())
<Axes: xlabel='Hour', ylabel='count'>




png

案件数量最多的时间段为下午16~18点。

观测犯罪地区和犯罪类别(group 列,即 OFFENSE_CODE_GROUP)的关系和影响(双特征)

# Compare districts within the five most frequent offense groups.
rcParams['figure.figsize'] = (20, 9)
order = df['group'].value_counts().index[:5]
sns.countplot(x='group', hue='District', order=order, data=df)
<Axes: xlabel='group', ylabel='count'>




png

观测犯罪地区和犯罪月份的关系和影响

df.Year.unique()
array([2018, 2017, 2016, 2015])
# Restrict to 2016-2018, then count the rows for every (Month, District) pair.
nask = df['Year'].isin([2016, 2017, 2018])
grouped = df.loc[nask].groupby(['Month', 'District']).count()
# After count(), the 'group' column holds the number of non-null records in each cell.
sns.lineplot(x='Month', y='group', hue='District', data=grouped.reset_index())
<Axes: xlabel='Month', ylabel='group'>




png

# Distribution, across districts, of the per-month record counts (all years).
grouped = df.groupby(['Month', 'District']).count()
monthly_counts = grouped.reset_index()
sns.boxplot(x='Month', y='group', data=monthly_counts, palette='ch:.25')
<Axes: xlabel='Month', ylabel='group'>




png

观测各犯罪类别(group)的发生频次

# Horizontal count plot of every offense group, most frequent first.
group_order = df['group'].value_counts().index
sns.catplot(data=df, y='group', kind='count', order=group_order, height=11, aspect=2)
<seaborn.axisgrid.FacetGrid at 0x29cff6ad0>




png

# Pie chart of the share of records in each offense group.
labels = list(df['group'].astype('category').cat.categories)
counts = df['group'].value_counts()

# Look the counts up by label so slice sizes line up with the label order.
sizes = [counts[name] for name in labels]

fig1, ax1 = plt.subplots(figsize=(22, 12))
ax1.pie(sizes, labels=labels, autopct='%1.1f%%', shadow=True, startangle=140)
ax1.axis('equal')  # equal aspect ratio so the pie is drawn as a circle
plt.show()

png

观测犯罪地区和其他特征的影响以及其他特征的密度分析

def eda_numeric(df, feature, district='D4'):
    """Plot one numeric feature against ``District``.

    Draws a figure with two panels: a box plot of ``feature`` per district, and
    a kernel density estimate of ``log(feature)`` restricted to one district.
    The figure is shown with ``plt.show()``; nothing is returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``feature`` and ``'District'``.
    feature : str
        Name of the numeric column to plot.
    district : str, optional
        District whose rows feed the density panel. Defaults to ``'D4'``.
    """
    # axis must be given by keyword: the positional form was removed in pandas 2.0.
    data = pd.concat([df[feature], df['District']], axis=1)
    plt.figure(figsize=(20, 5))

    # Left panel: spread of the feature within every district.
    plt.subplot(1, 2, 1)
    sns.boxplot(data=data, x='District', y=feature)
    plt.title(feature + '-Boxplot')

    # Right panel: density of the log-transformed feature for the chosen district.
    # NOTE(review): np.log maps 0 to -inf (e.g. Hour == 0); confirm how the
    # installed seaborn version treats those values.
    plt.subplot(1, 2, 2)
    plt.title(feature + '-Density')
    in_district = data[data['District'] == district][feature]
    sns.kdeplot(in_district.apply(np.log), color='b', legend=False)

    plt.legend(loc='upper right', labels=['0'])
    plt.tight_layout()
    plt.show()
# Columns excluded from the numeric EDA. The names must match the DataFrame's
# actual column names, which are capitalised ('Lat', 'Long').
rm_list = ['Lat', 'Long']
type_list = ['int32', 'int64']

# Keep integer columns that are not excluded and take more than two distinct values.
feature_list = [
    feature
    for feature in df.columns
    if feature not in rm_list
    and str(df[feature].dtype) in type_list
    and len(df[feature].unique()) > 2
]

# Plot on a copy without any rows that contain missing values.
df_drop = df.dropna().copy()
for feature in feature_list:
    print('Feature: ',feature)
    eda_numeric(df_drop,feature)
Feature:  OFFENSE_CODE



png

Feature:  Year



png

Feature:  Month



png

Feature:  Hour



png

# -1 is a placeholder for an unknown coordinate (see the min of Lat / max of Long
# in df.describe()). Turn it into NaN and assign the result back to the frame.
# Using None as the replacement value is avoided on purpose: depending on the
# pandas version it either forward-fills from the previous row or inserts None
# objects, and an in-place call on an attribute-accessed column may not update df.
df['Lat'] = df['Lat'].replace(-1, np.nan)
df['Long'] = df['Long'].replace(-1, np.nan)

# Scatter of incident coordinates, coloured by district.
# NOTE(review): latitude is on the x axis and longitude on the y axis, so the
# picture is transposed relative to a conventional map.
rcParams['figure.figsize'] = (21, 11)
plt.subplots(figsize=(11, 6))
sns.scatterplot(data=df, x='Lat', y='Long', hue='District', alpha=0.2)
plt.legend(loc='upper left')
<matplotlib.legend.Legend at 0x2aa54af50>




png

# Coordinates of the incidents recorded in district B2. Rows without a position
# are dropped: HeatMap cannot take NaN, and filling them with 0 would add
# spurious heat at latitude 0 / longitude 0.
B2_district = df.loc[df['District'] == 'B2', ['Lat', 'Long']].dropna()

# The folium classes are CamelCase (Map, CircleMarker); `folium.map` is a module.
map_1 = folium.Map(location = [42.356145,-71.064083],
                   tiles = 'OpenStreetMap',
                   zoom_start = 11)
folium.CircleMarker(location = [42.356145,-71.064083],
                    radius = 70,
                    fill = True,
                    fill_color = '#b22222',
                    popup = 'Homicide',
                    color = 'red',
                    ).add_to(map_1)
# Pass the points as a plain list of [lat, lng] pairs.
HeatMap(data = B2_district.values.tolist(),radius = 16).add_to(map_1)
map_1
---------------------------------------------------------------------------

TypeError                                 Traceback (most recent call last)

Cell In[49], line 6
      3 B2_district.Lat.fillna(0,inplace = True)
      4 B2_district.Long.fillna(0,inplace = True)
----> 6 map_1 = folium.map(location = [42.356145,-71.064083],
      7                   tiles = 'OpenStreetMap',
      8                   zoom_start = 11)
      9 folium.circlemarker([42.356145,-71.064083],
     10                    radius = 70,
     11                    fillcolor = '#b22222',
     12                    popup = 'Homicide',
     13                    color = 'red',
     14                    ).add_to(map_1)
     15 HeatMap(data = B2_district,radius = 16).add_to(map_1)


TypeError: 'module' object is not callable

Boston犯罪分析项目总结

EDA 的目的是在数据清洗和建模工作之前,对数据集的分布有一个总体的认识,并探索各个特征之间的关系。EDA 部分的主要工作在于数据可视化,前提是对数据做了初步的预处理工作;在本例子中,将许多 object 类型的数据转化为了 category 类型。需要熟悉 sns.countplot()、sns.boxplot()、sns.kdeplot()、sns.lineplot()、sns.heatmap()、sns.scatterplot():其中 sns.countplot(data, x, hue, order) 适合单特征分析,其余常用形式为 sns.lineplot(data, x, y, hue)、sns.boxplot(data, x, y)、sns.scatterplot(data, x, y, hue, alpha)。带有时间特性的数据,可以由时间特性派生出很多新特征。地图数据的绘制,仍然是需要继续探索的一个地方。