数据分析 - Pandas pylot的简单使用

168 阅读4分钟
# Import the necessary libraries

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
sns.set(color_codes=True)
# Load the car.csv dataset
car = pd.read_csv('D:/KevinZhang/Study/data_analytics/USA_cars_datasets.csv')
car.head()
car = car.drop(columns=car.columns[0], axis='columns')
car.head()
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
price brand model year title_status mileage color vin lot state country condition
0 6300 toyota cruiser 2008 clean vehicle 274117 black jtezu11f88k007763 159348797 new jersey usa 10 days left
1 2899 ford se 2011 clean vehicle 190552 silver 2fmdk3gc4bbb02217 166951262 tennessee usa 6 days left
2 5350 dodge mpv 2018 clean vehicle 39590 silver 3c4pdcgg5jt346413 167655728 georgia usa 2 days left
3 25000 ford door 2014 clean vehicle 64146 blue 1ftfw1et4efc23745 167753855 virginia usa 22 hours left
4 27700 chevrolet 1500 2018 clean vehicle 6654 red 3gcpcrec2jg473991 167763266 florida usa 22 hours left
car.dtypes
price             int64
brand            object
model            object
year              int64
title_status     object
mileage         float64
color            object
vin              object
lot               int64
state            object
country          object
condition        object
dtype: object
car.columns
Index(['price', 'brand', 'model', 'year', 'title_status', 'mileage', 'color',       'vin', 'lot', 'state', 'country', 'condition'],
      dtype='object')
# Analytical summary of the dataset
car.describe(include='all')
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
price brand model year title_status mileage color vin lot state country condition
count 2499.000000 2499 2499 2499.000000 2499 2.499000e+03 2499 2499 2.499000e+03 2499 2499 2499
unique NaN 28 127 NaN 2 NaN 49 2495 NaN 44 2 47
top NaN ford door NaN clean vehicle NaN white 1gnevhkw8jj148388 NaN pennsylvania usa 2 days left
freq NaN 1235 651 NaN 2336 NaN 707 2 NaN 299 2492 832
mean 18767.671469 NaN NaN 2016.714286 NaN 5.229869e+04 NaN NaN 1.676914e+08 NaN NaN NaN
std 12116.094936 NaN NaN 3.442656 NaN 5.970552e+04 NaN NaN 2.038772e+05 NaN NaN NaN
min 0.000000 NaN NaN 1973.000000 NaN 0.000000e+00 NaN NaN 1.593488e+08 NaN NaN NaN
25% 10200.000000 NaN NaN 2016.000000 NaN 2.146650e+04 NaN NaN 1.676253e+08 NaN NaN NaN
50% 16900.000000 NaN NaN 2018.000000 NaN 3.536500e+04 NaN NaN 1.677451e+08 NaN NaN NaN
75% 25555.500000 NaN NaN 2019.000000 NaN 6.347250e+04 NaN NaN 1.677798e+08 NaN NaN NaN
max 84900.000000 NaN NaN 2020.000000 NaN 1.017936e+06 NaN NaN 1.678055e+08 NaN NaN NaN
car.hist(figsize=(20,30))
array([[<AxesSubplot:title={'center':'price'}>,
        <AxesSubplot:title={'center':'year'}>],
       [<AxesSubplot:title={'center':'mileage'}>,
        <AxesSubplot:title={'center':'lot'}>]], dtype=object)




output_5_1.png

# Relationship between categorical and continuous variable
sns.boxplot(x='country', y='price', data=car)
<AxesSubplot:xlabel='country', ylabel='price'>




output_6_1.png

sns.pairplot(car)
<seaborn.axisgrid.PairGrid at 0x1a27976d520>




output_7_1.png

# Drop irrelevant columns
car = car.drop(['lot', 'condition'], axis=1)
car.head(5)
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
price brand model year title_status mileage color vin state country
0 6300 toyota cruiser 2008 clean vehicle 274117.0 black jtezu11f88k007763 new jersey usa
1 2899 ford se 2011 clean vehicle 190552.0 silver 2fmdk3gc4bbb02217 tennessee usa
2 5350 dodge mpv 2018 clean vehicle 39590.0 silver 3c4pdcgg5jt346413 georgia usa
3 25000 ford door 2014 clean vehicle 64146.0 blue 1ftfw1et4efc23745 virginia usa
4 27700 chevrolet 1500 2018 clean vehicle 6654.0 red 3gcpcrec2jg473991 florida usa
# Renaming the column names
car = car.rename(columns={'mileage': 'meter', 'title_status': 'type'})
car.head()
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
price brand model year type meter color vin state country
0 6300 toyota cruiser 2008 clean vehicle 274117.0 black jtezu11f88k007763 new jersey usa
1 2899 ford se 2011 clean vehicle 190552.0 silver 2fmdk3gc4bbb02217 tennessee usa
2 5350 dodge mpv 2018 clean vehicle 39590.0 silver 3c4pdcgg5jt346413 georgia usa
3 25000 ford door 2014 clean vehicle 64146.0 blue 1ftfw1et4efc23745 virginia usa
4 27700 chevrolet 1500 2018 clean vehicle 6654.0 red 3gcpcrec2jg473991 florida usa
# Total number of rows and columns
car.shape
(2499, 10)
# Rows containing duplicate data
duplicate_rows_car = car[car.duplicated()]
print('Number of duplicate rows', duplicate_rows_car.shape)
duplicate_rows_car
Number of duplicate rows (4, 10)
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
price brand model year type meter color vin state country
518 50 chevrolet door 2008 salvage insurance 172102.0 blue 1g1al58f787159241 texas usa
530 0 chevrolet door 2003 salvage insurance 194673.0 gray 1gndt13s632267445 texas usa
541 25 chevrolet pickup 2010 salvage insurance 149189.0 white 3gcrkse37ag234620 colorado usa
661 26900 chevrolet traverse 2018 clean vehicle 42941.0 black 1gnevhkw8jj148388 missouri usa
# Count the rows before removing the data
car.count()
price      2499
brand      2499
model      2499
year       2499
type       2499
meter      2499
color      2499
vin        2499
state      2499
country    2499
dtype: int64
# Drop the duplicates
car = car.drop_duplicates()
car.count()
price      2495
brand      2495
model      2495
year       2495
type       2495
meter      2495
color      2495
vin        2495
state      2495
country    2495
dtype: int64
# Find the null values
print(car.isnull().sum())
price           0
brand           0
model           0
year            0
title_status    6
mileage         0
color           0
vin             0
lot             0
state           0
country         0
condition       4
dtype: int64
# Drop the missing values
car = car.dropna()
car.count()
price           2489
brand           2489
model           2489
year            2489
title_status    2489
mileage         2489
color           2489
vin             2489
lot             2489
state           2489
country         2489
condition       2489
dtype: int64
print(car.isnull().sum())
price           0
brand           0
model           0
year            0
title_status    0
mileage         0
color           0
vin             0
lot             0
state           0
country         0
condition       0
dtype: int64
# Finding the outliers 找出异常值
sns.boxplot(x=car['price'])
<AxesSubplot:xlabel='price'>




output_17_1.png

sns.boxplot(x=car['year'])
<AxesSubplot:xlabel='year'>




output_18_1.png

# Plotting a Histogram for number of cars per brand
car.brand.value_counts().nlargest(10).plot(kind='bar', figsize=(10,5))
plt.title('Number of cars by brand')
plt.xlabel('Brand')
plt.ylabel('Number of cars')
Text(0, 0.5, 'Number of cars')




output_19_1.png

# Find the relations between the variables
plt.figure(figsize=(20,10))
c = car.corr()
sns.heatmap(c, cmap='BrBG', annot=True)
c
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
price year mileage lot
price 1.000000 0.417339 -0.399225 0.158911
year 0.417339 1.000000 -0.594010 0.164073
mileage -0.399225 -0.594010 1.000000 -0.129988
lot 0.158911 0.164073 -0.129988 1.000000

output_20_1.png