数据分析 - Matplotlib库简单使用

153 阅读6分钟
# Import the libraries

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
# Load the dataset
uber = pd.read_csv('D:/KevinZhang/Study/data_analytics/uber-raw-data-apr14.csv')
# Display the head of the dataset
uber.head()
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
Date/Time Lat Lon Base
0 4/1/2014 0:11 40.7690 -73.9549 B02512
1 4/1/2014 0:17 40.7267 -74.0345 B02512
2 4/1/2014 0:21 40.7316 -73.9873 B02512
3 4/1/2014 0:28 40.7588 -73.9776 B02512
4 4/1/2014 0:33 40.7594 -73.9722 B02512
uber.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 564516 entries, 0 to 564515
Data columns (total 4 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   Date/Time  564516 non-null  object 
 1   Lat        564516 non-null  float64
 2   Lon        564516 non-null  float64
 3   Base       564516 non-null  object 
dtypes: float64(2), object(2)
memory usage: 17.2+ MB
# Check if there are any missing values
def num_missing(x):
    return sum(x.isnull())
print('Number of missing/null values per column')
print(uber.apply(num_missing, axis=0))
Number of missing/null values per column
Date/Time    0
Lat          0
Lon          0
Base         0
dtype: int64
uber.isna().sum()
Date/Time    0
Lat          0
Lon          0
Base         0
dtype: int64
# Extact additional information
uber['Date/Time'] = pd.to_datetime(uber['Date/Time'], format='%m/%d/%Y %H:%M:%S')
uber['DayofWeekNum'] = uber['Date/Time'].dt.dayofweek
uber['DayofWeek'] = uber['Date/Time'].dt.day_name()
uber['DayNum'] = uber['Date/Time'].dt.day
uber['HourOfDay'] = uber['Date/Time'].dt.hour
# Display the head of dataset
uber.head()
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
Date/Time Lat Lon Base DayofWeekNum DayofWeek DayNum HourOfDay
0 2014-04-01 00:11:00 40.7690 -73.9549 B02512 1 Tuesday 1 0
1 2014-04-01 00:17:00 40.7267 -74.0345 B02512 1 Tuesday 1 0
2 2014-04-01 00:21:00 40.7316 -73.9873 B02512 1 Tuesday 1 0
3 2014-04-01 00:28:00 40.7588 -73.9776 B02512 1 Tuesday 1 0
4 2014-04-01 00:33:00 40.7594 -73.9722 B02512 1 Tuesday 1 0
# Display the shape
uber.shape
(564516, 8)
# Unique base codes
uber['Base'].unique()
array(['B02512', 'B02598', 'B02617', 'B02682', 'B02764'], dtype=object)
# Total rides based on the base code
sns.catplot(x='Base', data=uber, kind='count')
<seaborn.axisgrid.FacetGrid at 0x1d1de4e2490>




output_10_1.png

uber_week_data = uber.pivot_table(index=['DayofWeekNum','DayofWeek'], values='Base', aggfunc='count')
uber_week_data
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
Base
DayofWeekNum DayofWeek
0 Monday 60861
1 Tuesday 91185
2 Wednesday 108631
3 Thursday 85067
4 Friday 90303
5 Saturday 77218
6 Sunday 51251
# Visualize the pibot table
uber_week_data.plot(kind='bar', figsize=(8,6))
<AxesSubplot:xlabel='DayofWeekNum,DayofWeek'>




output_12_1.png

uber_hourly_data = uber.pivot_table(index=['HourOfDay'], values='Base', aggfunc='count')
uber_hourly_data.plot(kind='line', figsize=(10,6), title='Hourly Journeys')
<AxesSubplot:title={'center':'Hourly Journeys'}, xlabel='HourOfDay'>




output_13_1.png

uber_day_data = uber.pivot_table(index=['DayNum'], values='Base', aggfunc='count')
uber_day_data.plot(kind='bar', figsize=(10,5), title='Journeys by DayNum')
<AxesSubplot:title={'center':'Journeys by DayNum'}, xlabel='DayNum'>




output_14_1.png

def count_rows(rows):
    return len(rows)

by_date = uber.groupby('DayNum').apply(count_rows)
by_date
DayNum
1     14546
2     17474
3     20701
4     26714
5     19521
6     13445
7     19550
8     16188
9     16843
10    20041
11    20420
12    18170
13    12112
14    12674
15    20641
16    17717
17    20973
18    18074
19    14602
20    11017
21    13162
22    16975
23    20346
24    23352
25    25095
26    24925
27    14677
28    15475
29    22835
30    36251
dtype: int64
# Sort Day of the month by values rather than date
by_date_sorted = by_date.sort_values()
by_date_sorted
DayNum
20    11017
13    12112
14    12674
21    13162
6     13445
1     14546
19    14602
27    14677
28    15475
8     16188
9     16843
22    16975
2     17474
16    17717
18    18074
12    18170
5     19521
7     19550
10    20041
23    20346
11    20420
15    20641
3     20701
17    20973
29    22835
24    23352
26    24925
25    25095
4     26714
30    36251
dtype: int64
# Analyze the hours
plt.hist(uber.HourOfDay, bins=24, range=(.5, 24))
(array([ 7769.,  4935.,  5040.,  6095.,  9476., 18498., 24924., 22843.,
        17939., 17865., 18774., 19425., 22603., 27190., 35324., 42003.,
        45475., 43003., 38923., 36244., 36964., 30645., 20649.,     0.]),
 array([ 0.5       ,  1.47916667,  2.45833333,  3.4375    ,  4.41666667,
         5.39583333,  6.375     ,  7.35416667,  8.33333333,  9.3125    ,
        10.29166667, 11.27083333, 12.25      , 13.22916667, 14.20833333,
        15.1875    , 16.16666667, 17.14583333, 18.125     , 19.10416667,
        20.08333333, 21.0625    , 22.04166667, 23.02083333, 24.        ]),
 <BarContainer object of 24 artists>)




output_17_1.png

count_rows(uber)
by_hour_weekday = uber.groupby('HourOfDay DayofWeekNum'.split()).apply(count_rows).unstack()
by_hour_weekday
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
DayofWeekNum 0 1 2 3 4 5 6
HourOfDay
0 518 765 899 792 1367 3027 4542
1 261 367 507 459 760 2479 2936
2 238 304 371 342 513 1577 1590
3 571 516 585 567 736 1013 1052
4 1021 887 1003 861 932 706 685
5 1619 1734 1990 1454 1382 704 593
6 2974 3766 4230 3179 2836 844 669
7 3888 5304 5647 4159 3943 1110 873
8 3138 4594 5242 3616 3648 1372 1233
9 2211 2962 3846 2654 2732 1764 1770
10 1953 2900 3844 2370 2599 2086 2113
11 1929 2949 3889 2516 2816 2315 2360
12 1945 2819 3988 2657 2978 2560 2478
13 2294 3556 4469 3301 3535 2685 2763
14 3117 4489 5438 4083 4087 3042 2934
15 3818 6042 7071 5182 5354 4457 3400
16 4962 7521 8213 6149 6259 5410 3489
17 5574 8297 9151 6951 6790 5558 3154
18 4725 7089 8334 6637 7258 6165 2795
19 4386 6459 7794 5929 6247 5529 2579
20 3573 6310 7783 6345 5165 4792 2276
21 3079 5993 6921 6585 6265 5811 2310
22 1976 3614 4845 5370 6708 6493 1639
23 1091 1948 2571 2909 5393 5719 1018
# heat map the brightest spot shows the day/hour with the highest frequency
plt.figure(figsize=(15, 10))
sns.heatmap(by_hour_weekday)
<AxesSubplot:xlabel='DayofWeekNum', ylabel='HourOfDay'>




output_19_1.png