数据分析 - Pandas pylot的简单使用```python # Import the necessary lib

# Import the necessary libraries

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
sns.set(color_codes=True)

# Load the car.csv dataset
car = pd.read_csv('D:/KevinZhang/Study/data_analytics/USA_cars_datasets.csv')
car.head()
car = car.drop(columns=car.columns[0], axis='columns')
car.head()

.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }

	price	brand	model	year	title_status	mileage	color	vin	lot	state	country	condition
0	6300	toyota	cruiser	2008	clean vehicle	274117	black	jtezu11f88k007763	159348797	new jersey	usa	10 days left
1	2899	ford	se	2011	clean vehicle	190552	silver	2fmdk3gc4bbb02217	166951262	tennessee	usa	6 days left
2	5350	dodge	mpv	2018	clean vehicle	39590	silver	3c4pdcgg5jt346413	167655728	georgia	usa	2 days left
3	25000	ford	door	2014	clean vehicle	64146	blue	1ftfw1et4efc23745	167753855	virginia	usa	22 hours left
4	27700	chevrolet	1500	2018	clean vehicle	6654	red	3gcpcrec2jg473991	167763266	florida	usa	22 hours left

car.dtypes

price             int64
brand            object
model            object
year              int64
title_status     object
mileage         float64
color            object
vin              object
lot               int64
state            object
country          object
condition        object
dtype: object

car.columns

Index(['price', 'brand', 'model', 'year', 'title_status', 'mileage', 'color',       'vin', 'lot', 'state', 'country', 'condition'],
      dtype='object')

# Analytical summary of the dataset
car.describe(include='all')

.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }

	price	brand	model	year	title_status	mileage	color	vin	lot	state	country	condition
count	2499.000000	2499	2499	2499.000000	2499	2.499000e+03	2499	2499	2.499000e+03	2499	2499	2499
unique	NaN	28	127	NaN	2	NaN	49	2495	NaN	44	2	47
top	NaN	ford	door	NaN	clean vehicle	NaN	white	1gnevhkw8jj148388	NaN	pennsylvania	usa	2 days left
freq	NaN	1235	651	NaN	2336	NaN	707	2	NaN	299	2492	832
mean	18767.671469	NaN	NaN	2016.714286	NaN	5.229869e+04	NaN	NaN	1.676914e+08	NaN	NaN	NaN
std	12116.094936	NaN	NaN	3.442656	NaN	5.970552e+04	NaN	NaN	2.038772e+05	NaN	NaN	NaN
min	0.000000	NaN	NaN	1973.000000	NaN	0.000000e+00	NaN	NaN	1.593488e+08	NaN	NaN	NaN
25%	10200.000000	NaN	NaN	2016.000000	NaN	2.146650e+04	NaN	NaN	1.676253e+08	NaN	NaN	NaN
50%	16900.000000	NaN	NaN	2018.000000	NaN	3.536500e+04	NaN	NaN	1.677451e+08	NaN	NaN	NaN
75%	25555.500000	NaN	NaN	2019.000000	NaN	6.347250e+04	NaN	NaN	1.677798e+08	NaN	NaN	NaN
max	84900.000000	NaN	NaN	2020.000000	NaN	1.017936e+06	NaN	NaN	1.678055e+08	NaN	NaN	NaN

car.hist(figsize=(20,30))

array([[<AxesSubplot:title={'center':'price'}>,
        <AxesSubplot:title={'center':'year'}>],
       [<AxesSubplot:title={'center':'mileage'}>,
        <AxesSubplot:title={'center':'lot'}>]], dtype=object)

# Relationship between categorical and continuous variable
sns.boxplot(x='country', y='price', data=car)

<AxesSubplot:xlabel='country', ylabel='price'>

sns.pairplot(car)

<seaborn.axisgrid.PairGrid at 0x1a27976d520>

# Drop irrelevant columns
car = car.drop(['lot', 'condition'], axis=1)
car.head(5)

.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }

	price	brand	model	year	title_status	mileage	color	vin	state	country
0	6300	toyota	cruiser	2008	clean vehicle	274117.0	black	jtezu11f88k007763	new jersey	usa
1	2899	ford	se	2011	clean vehicle	190552.0	silver	2fmdk3gc4bbb02217	tennessee	usa
2	5350	dodge	mpv	2018	clean vehicle	39590.0	silver	3c4pdcgg5jt346413	georgia	usa
3	25000	ford	door	2014	clean vehicle	64146.0	blue	1ftfw1et4efc23745	virginia	usa
4	27700	chevrolet	1500	2018	clean vehicle	6654.0	red	3gcpcrec2jg473991	florida	usa

# Renaming the column names
car = car.rename(columns={'mileage': 'meter', 'title_status': 'type'})
car.head()

.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }

	price	brand	model	year	type	meter	color	vin	state	country
0	6300	toyota	cruiser	2008	clean vehicle	274117.0	black	jtezu11f88k007763	new jersey	usa
1	2899	ford	se	2011	clean vehicle	190552.0	silver	2fmdk3gc4bbb02217	tennessee	usa
2	5350	dodge	mpv	2018	clean vehicle	39590.0	silver	3c4pdcgg5jt346413	georgia	usa
3	25000	ford	door	2014	clean vehicle	64146.0	blue	1ftfw1et4efc23745	virginia	usa
4	27700	chevrolet	1500	2018	clean vehicle	6654.0	red	3gcpcrec2jg473991	florida	usa

# Total number of rows and columns
car.shape

(2499, 10)

# Rows containing duplicate data
duplicate_rows_car = car[car.duplicated()]
print('Number of duplicate rows', duplicate_rows_car.shape)
duplicate_rows_car

Number of duplicate rows (4, 10)

.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }

	price	brand	model	year	type	meter	color	vin	state	country
518	50	chevrolet	door	2008	salvage insurance	172102.0	blue	1g1al58f787159241	texas	usa
530	0	chevrolet	door	2003	salvage insurance	194673.0	gray	1gndt13s632267445	texas	usa
541	25	chevrolet	pickup	2010	salvage insurance	149189.0	white	3gcrkse37ag234620	colorado	usa
661	26900	chevrolet	traverse	2018	clean vehicle	42941.0	black	1gnevhkw8jj148388	missouri	usa

# Count the rows before removing the data
car.count()

price      2499
brand      2499
model      2499
year       2499
type       2499
meter      2499
color      2499
vin        2499
state      2499
country    2499
dtype: int64

# Drop the duplicates
car = car.drop_duplicates()
car.count()

price      2495
brand      2495
model      2495
year       2495
type       2495
meter      2495
color      2495
vin        2495
state      2495
country    2495
dtype: int64

# Find the null values
print(car.isnull().sum())

price           0
brand           0
model           0
year            0
title_status    6
mileage         0
color           0
vin             0
lot             0
state           0
country         0
condition       4
dtype: int64

# Drop the missing values
car = car.dropna()
car.count()

price           2489
brand           2489
model           2489
year            2489
title_status    2489
mileage         2489
color           2489
vin             2489
lot             2489
state           2489
country         2489
condition       2489
dtype: int64

print(car.isnull().sum())

price           0
brand           0
model           0
year            0
title_status    0
mileage         0
color           0
vin             0
lot             0
state           0
country         0
condition       0
dtype: int64

# Finding the outliers 找出异常值
sns.boxplot(x=car['price'])

<AxesSubplot:xlabel='price'>

sns.boxplot(x=car['year'])

<AxesSubplot:xlabel='year'>

# Plotting a Histogram for number of cars per brand
car.brand.value_counts().nlargest(10).plot(kind='bar', figsize=(10,5))
plt.title('Number of cars by brand')
plt.xlabel('Brand')
plt.ylabel('Number of cars')

Text(0, 0.5, 'Number of cars')

# Find the relations between the variables
plt.figure(figsize=(20,10))
c = car.corr()
sns.heatmap(c, cmap='BrBG', annot=True)
c

.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }

	price	year	mileage	lot
price	1.000000	0.417339	-0.399225	0.158911
year	0.417339	1.000000	-0.594010	0.164073
mileage	-0.399225	-0.594010	1.000000	-0.129988
lot	0.158911	0.164073	-0.129988	1.000000