Pandas 是 Python 中用于数据分析的核心库之一,其中 DataFrame 是最常用的数据结构,类似于 Excel 表格或 SQL 表。下面从创建、属性、索引、筛选、统计、排序等多个维度,全面介绍 DataFrame 的基本用法。
import pandas as pd
import numpy as np
# Build two integer Series; each one becomes a column of the DataFrame below.
s1 = pd.Series([1, 2, 3, 4, 5])
s2 = pd.Series([6, 7, 8, 9, 10])

# The dict keys are used as the column labels.
df = pd.DataFrame(data={"第一列": s1, "第二列": s2})

# Selecting a single column by its label yields a Series, not a DataFrame.
type(df["第一列"])
# Sample student table used by every example that follows.
# `index` supplies the row labels (1..5 instead of the default 0..4) and
# `columns` fixes the column order, putting "name" ahead of "id".
df = pd.DataFrame(
    data={
        "id": [1, 2, 3, 4, 5],
        "name": ["Tom", "jack", "alice", "bob", "allen"],
        "age": [15, 17, 20, 26, 30],
        "score": [60.5, 80, 30.6, 70, 83.5],
    },
    index=[1, 2, 3, 4, 5],
    columns=["name", "id", "age", "score"],
)
df
|   | name | id | age | score |
|---|------|----|-----|-------|
| 1 | Tom | 1 | 15 | 60.5 |
| 2 | jack | 2 | 17 | 80.0 |
| 3 | alice | 3 | 20 | 30.6 |
| 4 | bob | 4 | 26 | 70.0 |
| 5 | allen | 5 | 30 | 83.5 |
# Print the three building blocks of the DataFrame, each under its own heading:
# the row index, the column labels, and the underlying values array.
for heading, component in (
    ('行索引:', df.index),
    ('列标签:', df.columns),
    ('值:', df.values),
):
    print(heading)
    print(component)
行索引:
Index([1, 2, 3, 4, 5], dtype='int64')
列标签:
Index(['name', 'id', 'age', 'score'], dtype='object')
值:
[['Tom' 1 15 60.5]
['jack' 2 17 80.0]
['alice' 3 20 30.6]
['bob' 4 26 70.0]
['allen' 5 30 83.5]]
# Basic structural attributes of the DataFrame.
n_dims = df.ndim          # number of axes
column_types = df.dtypes  # per-column dtype, returned as a Series
dimensions = df.shape     # (rows, columns)
n_elements = df.size      # rows * columns

print('维度:', n_dims)
print('数据类型:')
print(column_types)
print('形状:', dimensions)
print('元素个数:', n_elements)
维度: 2
数据类型:
name object
id int64
age int64
score float64
dtype: object
形状: (5, 4)
元素个数: 20
# Transpose: rows become columns and columns become rows (`.T` is shorthand for this).
transposed = df.transpose()
print(transposed)
1 2 3 4 5
name Tom jack alice bob allen
id 1 2 3 4 5
age 15 17 20 26 30
score 60.5 80.0 30.6 70.0 83.5
# Select the same row two ways: `loc` looks up by row label, `iloc` by 0-based position.
# The index here is 1..5, so label 4 sits at position 3.
row_by_label = df.loc[4]
row_by_position = df.iloc[3]
print(row_by_label)
print(row_by_position)
name bob
id 4
age 26
score 70.0
Name: 4, dtype: object
name bob
id 4
age 26
score 70.0
Name: 4, dtype: object
# Select the same column two ways: by label with `loc`, by 0-based position with `iloc`.
# The `:` keeps every row.
column_by_label = df.loc[:, 'name']
column_by_position = df.iloc[:, 0]
print(column_by_label)
print(column_by_position)
1 Tom
2 jack
3 alice
4 bob
5 allen
Name: name, dtype: object
1 Tom
2 jack
3 alice
4 bob
5 allen
Name: name, dtype: object
# Read single cells. `at`/`loc` address by (row label, column label);
# `iat`/`iloc` address by (row position, column position).
score_via_at = df.at[3, 'score']
id_via_iat = df.iat[2, 1]
score_via_loc = df.loc[3, 'score']
id_via_iloc = df.iloc[2, 1]

print(score_via_at)
print(id_via_iat)
print(score_via_loc)
print(id_via_iloc)
30.6
3
30.6
3
# A single column can be fetched with bracket syntax or as an attribute;
# both return the same Series.
name_column = df['name']
print(name_column)
print(type(name_column))
print(df.name)
1 Tom
2 jack
3 alice
4 bob
5 allen
Name: name, dtype: object
<class 'pandas.core.series.Series'>
1 Tom
2 jack
3 alice
4 bob
5 allen
Name: name, dtype: object
# Passing a list of labels selects several columns and returns a DataFrame.
wanted_columns = ['name', 'score']
print(df[wanted_columns])
name score
1 Tom 60.5
2 jack 80.0
3 alice 30.6
4 bob 70.0
5 allen 83.5
# Peek at both ends of the table: the first two rows, then the last three.
first_rows = df.head(2)
last_rows = df.tail(3)
print(first_rows)
print(last_rows)
name id age score
1 Tom 1 15 60.5
2 jack 2 17 80.0
name id age score
3 alice 3 20 30.6
4 bob 4 26 70.0
5 allen 5 30 83.5
# Boolean-mask filtering. Each comparison yields a boolean Series; masks are
# combined with `&`, and each comparison must be parenthesised because `&`
# binds tighter than `>` and `<`.
# NOTE(review): only the last bare expression appears to be rendered in the
# captured output below; the first result is computed and discarded.
high_score = df.score > 70
df[high_score]
df[high_score & (df.age < 20)]
|   | name | id | age | score |
|---|------|----|-----|-------|
| 2 | jack | 2 | 17 | 80.0 |
# Draw three rows at random; no seed is set, so the rows differ between runs.
df.sample(n=3)
|   | name | id | age | score |
|---|------|----|-----|-------|
| 4 | bob | 4 | 26 | 70.0 |
| 1 | Tom | 1 | 15 | 60.5 |
| 2 | jack | 2 | 17 | 80.0 |
# Element-wise membership test: True wherever a cell equals one of the listed values.
lookup_values = ['jack', 20]
print(df.isin(lookup_values))
name id age score
1 False False False False
2 True False False False
3 False False True False
4 False False False False
5 False False False False
# Element-wise missing-value mask: True marks a missing cell.
missing_mask = df.isna()
print(missing_mask)
name id age score
1 False False False False
2 False False False False
3 False False False False
4 False False False False
5 False False False False
# Descriptive statistics on individual columns, then a summary of all numeric columns.
scores = df['score']
ages = df['age']

print(scores.sum())           # total
print(scores.max())           # largest value
print(ages.min())             # smallest value
print(scores.mean())          # arithmetic mean
print(scores.median())        # middle value
print(ages.mode())            # most frequent value(s), returned as a Series
print(scores.std())           # standard deviation
print(scores.quantile(0.25))  # first quartile
print(df.describe())          # count/mean/std/min/quartiles/max per numeric column
324.6
83.5
15
64.92
70.0
0 15
1 17
2 20
3 26
4 30
Name: age, dtype: int64
21.188605428390044
60.5
id age score
count 5.000000 5.000000 5.000000
mean 3.000000 21.600000 64.920000
std 1.581139 6.268971 21.188605
min 1.000000 15.000000 30.600000
25% 2.000000 17.000000 60.500000
50% 3.000000 20.000000 70.000000
75% 4.000000 26.000000 80.000000
max 5.000000 30.000000 83.500000
# Number of non-missing entries in each column.
non_null_counts = df.count()
print(non_null_counts)
name 5
id 5
age 5
score 5
dtype: int64
# Count how often each distinct full row (combination of all columns) occurs.
row_frequencies = df.value_counts()
print(row_frequencies)
name id age score
Tom 1 15 60.5 1
alice 3 20 30.6 1
allen 5 30 83.5 1
bob 4 26 70.0 1
jack 2 17 80.0 1
Name: count, dtype: int64
# Return a copy with fully duplicated rows removed; `df` itself is not modified.
deduplicated = df.drop_duplicates()
print(deduplicated)
name id age score
1 Tom 1 15 60.5
2 jack 2 17 80.0
3 alice 3 20 30.6
4 bob 4 26 70.0
5 allen 5 30 83.5
# Flag rows whose 'age' repeats an earlier row's 'age'; only the listed
# columns are compared.
compare_columns = ['age']
print(df.duplicated(subset=compare_columns))
1 False
2 False
3 False
4 False
5 False
dtype: bool
# Draw two rows at random; no seed is set, so the rows differ between runs.
df.sample(n=2)
|   | name | id | age | score |
|---|------|----|-----|-------|
| 1 | Tom | 1 | 15 | 60.5 |
| 2 | jack | 2 | 17 | 80.0 |
# Replace every cell equal to 15 with 30, in any column; returns a new
# DataFrame and leaves `df` unchanged.
replaced = df.replace(15, 30)
print(replaced)
name id age score
1 Tom 1 30 60.5
2 jack 2 17 80.0
3 alice 3 20 30.6
4 bob 4 26 70.0
5 allen 5 30 83.5
# Cumulative operations down each column (axis 0): running sum, running
# maximum and running minimum.
# NOTE(review): only the last bare expression appears to be rendered in the
# captured output below; the first two results are computed and discarded.
df.cumsum()
df.cummax()
df.cummin(axis='index')
|   | name | id | age | score |
|---|------|----|-----|-------|
| 1 | Tom | 1 | 15 | 60.5 |
| 2 | Tom | 1 | 15 | 60.5 |
| 3 | Tom | 1 | 15 | 30.6 |
| 4 | Tom | 1 | 15 | 30.6 |
| 5 | Tom | 1 | 15 | 30.6 |
# Sort rows by index label, largest label first.
by_index_desc = df.sort_index(ascending=False)
print(by_index_desc)
name id age score
5 allen 5 30 83.5
4 bob 4 26 70.0
3 alice 3 20 30.6
2 jack 2 17 80.0
1 Tom 1 15 60.5
# Sort rows ascending by 'score'; 'age' breaks ties between equal scores.
sort_keys = ['score', 'age']
print(df.sort_values(by=sort_keys))
name id age score
3 alice 3 20 30.6
1 Tom 1 15 60.5
4 bob 4 26 70.0
2 jack 2 17 80.0
5 allen 5 30 83.5
# Top-2 and bottom-2 rows ordered by 'score', with 'age' as the tie-breaker.
# NOTE(review): only the last bare expression appears to be rendered in the
# captured output below; the nlargest result is computed and discarded.
rank_keys = ['score', 'age']
df.nlargest(2, columns=rank_keys)
df.nsmallest(2, columns=rank_keys)
|   | name | id | age | score |
|---|------|----|-----|-------|
| 3 | alice | 3 | 20 | 30.6 |
| 1 | Tom | 1 | 15 | 60.5 |