import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
sns.set(color_codes=True)
car = pd.read_csv('D:/KevinZhang/Study/data_analytics/USA_cars_datasets.csv')
car.head()
car = car.drop(columns=car.columns[0], axis='columns')
car.head()
.dataframe tbody tr th:only-of-type {
vertical-align: middle;
}
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
|
price |
brand |
model |
year |
title_status |
mileage |
color |
vin |
lot |
state |
country |
condition |
| 0 |
6300 |
toyota |
cruiser |
2008 |
clean vehicle |
274117 |
black |
jtezu11f88k007763 |
159348797 |
new jersey |
usa |
10 days left |
| 1 |
2899 |
ford |
se |
2011 |
clean vehicle |
190552 |
silver |
2fmdk3gc4bbb02217 |
166951262 |
tennessee |
usa |
6 days left |
| 2 |
5350 |
dodge |
mpv |
2018 |
clean vehicle |
39590 |
silver |
3c4pdcgg5jt346413 |
167655728 |
georgia |
usa |
2 days left |
| 3 |
25000 |
ford |
door |
2014 |
clean vehicle |
64146 |
blue |
1ftfw1et4efc23745 |
167753855 |
virginia |
usa |
22 hours left |
| 4 |
27700 |
chevrolet |
1500 |
2018 |
clean vehicle |
6654 |
red |
3gcpcrec2jg473991 |
167763266 |
florida |
usa |
22 hours left |
car.dtypes
price int64
brand object
model object
year int64
title_status object
mileage float64
color object
vin object
lot int64
state object
country object
condition object
dtype: object
car.columns
Index(['price', 'brand', 'model', 'year', 'title_status', 'mileage', 'color', 'vin', 'lot', 'state', 'country', 'condition'],
dtype='object')
car.describe(include='all')
.dataframe tbody tr th:only-of-type {
vertical-align: middle;
}
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
|
price |
brand |
model |
year |
title_status |
mileage |
color |
vin |
lot |
state |
country |
condition |
| count |
2499.000000 |
2499 |
2499 |
2499.000000 |
2499 |
2.499000e+03 |
2499 |
2499 |
2.499000e+03 |
2499 |
2499 |
2499 |
| unique |
NaN |
28 |
127 |
NaN |
2 |
NaN |
49 |
2495 |
NaN |
44 |
2 |
47 |
| top |
NaN |
ford |
door |
NaN |
clean vehicle |
NaN |
white |
1gnevhkw8jj148388 |
NaN |
pennsylvania |
usa |
2 days left |
| freq |
NaN |
1235 |
651 |
NaN |
2336 |
NaN |
707 |
2 |
NaN |
299 |
2492 |
832 |
| mean |
18767.671469 |
NaN |
NaN |
2016.714286 |
NaN |
5.229869e+04 |
NaN |
NaN |
1.676914e+08 |
NaN |
NaN |
NaN |
| std |
12116.094936 |
NaN |
NaN |
3.442656 |
NaN |
5.970552e+04 |
NaN |
NaN |
2.038772e+05 |
NaN |
NaN |
NaN |
| min |
0.000000 |
NaN |
NaN |
1973.000000 |
NaN |
0.000000e+00 |
NaN |
NaN |
1.593488e+08 |
NaN |
NaN |
NaN |
| 25% |
10200.000000 |
NaN |
NaN |
2016.000000 |
NaN |
2.146650e+04 |
NaN |
NaN |
1.676253e+08 |
NaN |
NaN |
NaN |
| 50% |
16900.000000 |
NaN |
NaN |
2018.000000 |
NaN |
3.536500e+04 |
NaN |
NaN |
1.677451e+08 |
NaN |
NaN |
NaN |
| 75% |
25555.500000 |
NaN |
NaN |
2019.000000 |
NaN |
6.347250e+04 |
NaN |
NaN |
1.677798e+08 |
NaN |
NaN |
NaN |
| max |
84900.000000 |
NaN |
NaN |
2020.000000 |
NaN |
1.017936e+06 |
NaN |
NaN |
1.678055e+08 |
NaN |
NaN |
NaN |
car.hist(figsize=(20,30))
array([[<AxesSubplot:title={'center':'price'}>,
<AxesSubplot:title={'center':'year'}>],
[<AxesSubplot:title={'center':'mileage'}>,
<AxesSubplot:title={'center':'lot'}>]], dtype=object)

sns.boxplot(x='country', y='price', data=car)
<AxesSubplot:xlabel='country', ylabel='price'>

sns.pairplot(car)
<seaborn.axisgrid.PairGrid at 0x1a27976d520>

car = car.drop(['lot', 'condition'], axis=1)
car.head(5)
.dataframe tbody tr th:only-of-type {
vertical-align: middle;
}
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
|
price |
brand |
model |
year |
title_status |
mileage |
color |
vin |
state |
country |
| 0 |
6300 |
toyota |
cruiser |
2008 |
clean vehicle |
274117.0 |
black |
jtezu11f88k007763 |
new jersey |
usa |
| 1 |
2899 |
ford |
se |
2011 |
clean vehicle |
190552.0 |
silver |
2fmdk3gc4bbb02217 |
tennessee |
usa |
| 2 |
5350 |
dodge |
mpv |
2018 |
clean vehicle |
39590.0 |
silver |
3c4pdcgg5jt346413 |
georgia |
usa |
| 3 |
25000 |
ford |
door |
2014 |
clean vehicle |
64146.0 |
blue |
1ftfw1et4efc23745 |
virginia |
usa |
| 4 |
27700 |
chevrolet |
1500 |
2018 |
clean vehicle |
6654.0 |
red |
3gcpcrec2jg473991 |
florida |
usa |
car = car.rename(columns={'mileage': 'meter', 'title_status': 'type'})
car.head()
.dataframe tbody tr th:only-of-type {
vertical-align: middle;
}
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
|
price |
brand |
model |
year |
type |
meter |
color |
vin |
state |
country |
| 0 |
6300 |
toyota |
cruiser |
2008 |
clean vehicle |
274117.0 |
black |
jtezu11f88k007763 |
new jersey |
usa |
| 1 |
2899 |
ford |
se |
2011 |
clean vehicle |
190552.0 |
silver |
2fmdk3gc4bbb02217 |
tennessee |
usa |
| 2 |
5350 |
dodge |
mpv |
2018 |
clean vehicle |
39590.0 |
silver |
3c4pdcgg5jt346413 |
georgia |
usa |
| 3 |
25000 |
ford |
door |
2014 |
clean vehicle |
64146.0 |
blue |
1ftfw1et4efc23745 |
virginia |
usa |
| 4 |
27700 |
chevrolet |
1500 |
2018 |
clean vehicle |
6654.0 |
red |
3gcpcrec2jg473991 |
florida |
usa |
car.shape
(2499, 10)
duplicate_rows_car = car[car.duplicated()]
print('Number of duplicate rows', duplicate_rows_car.shape)
duplicate_rows_car
Number of duplicate rows (4, 10)
.dataframe tbody tr th:only-of-type {
vertical-align: middle;
}
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
|
price |
brand |
model |
year |
type |
meter |
color |
vin |
state |
country |
| 518 |
50 |
chevrolet |
door |
2008 |
salvage insurance |
172102.0 |
blue |
1g1al58f787159241 |
texas |
usa |
| 530 |
0 |
chevrolet |
door |
2003 |
salvage insurance |
194673.0 |
gray |
1gndt13s632267445 |
texas |
usa |
| 541 |
25 |
chevrolet |
pickup |
2010 |
salvage insurance |
149189.0 |
white |
3gcrkse37ag234620 |
colorado |
usa |
| 661 |
26900 |
chevrolet |
traverse |
2018 |
clean vehicle |
42941.0 |
black |
1gnevhkw8jj148388 |
missouri |
usa |
car.count()
price 2499
brand 2499
model 2499
year 2499
type 2499
meter 2499
color 2499
vin 2499
state 2499
country 2499
dtype: int64
car = car.drop_duplicates()
car.count()
price 2495
brand 2495
model 2495
year 2495
type 2495
meter 2495
color 2495
vin 2495
state 2495
country 2495
dtype: int64
print(car.isnull().sum())
price 0
brand 0
model 0
year 0
title_status 6
mileage 0
color 0
vin 0
lot 0
state 0
country 0
condition 4
dtype: int64
car = car.dropna()
car.count()
price 2489
brand 2489
model 2489
year 2489
title_status 2489
mileage 2489
color 2489
vin 2489
lot 2489
state 2489
country 2489
condition 2489
dtype: int64
print(car.isnull().sum())
price 0
brand 0
model 0
year 0
title_status 0
mileage 0
color 0
vin 0
lot 0
state 0
country 0
condition 0
dtype: int64
sns.boxplot(x=car['price'])
<AxesSubplot:xlabel='price'>

sns.boxplot(x=car['year'])
<AxesSubplot:xlabel='year'>

car.brand.value_counts().nlargest(10).plot(kind='bar', figsize=(10,5))
plt.title('Number of cars by brand')
plt.xlabel('Brand')
plt.ylabel('Number of cars')
Text(0, 0.5, 'Number of cars')

plt.figure(figsize=(20,10))
c = car.corr()
sns.heatmap(c, cmap='BrBG', annot=True)
c
.dataframe tbody tr th:only-of-type {
vertical-align: middle;
}
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
|
price |
year |
mileage |
lot |
| price |
1.000000 |
0.417339 |
-0.399225 |
0.158911 |
| year |
0.417339 |
1.000000 |
-0.594010 |
0.164073 |
| mileage |
-0.399225 |
-0.594010 |
1.000000 |
-0.129988 |
| lot |
0.158911 |
0.164073 |
-0.129988 |
1.000000 |
