数据统计基础之 Pandas DataFrame 学习

20 阅读4分钟

Pandas 是 Python 中用于数据分析的核心库之一,其中 DataFrame 是最常用的数据结构,类似于 Excel 表格或 SQL 表。下面我们将从 创建、属性、索引、筛选、统计、排序 等多个维度全面介绍 DataFrame 的基本用法。

# dataframe的创建方式
import pandas as pd
import numpy as np
# 通过series来创建
s1= pd.Series([1,2,3,4,5])
s2 = pd.Series([6,7,8,9,10])
df=pd.DataFrame({"第一列":s1,"第二列":s2})
type(df["第一列"])
# 通过字典来创建
df = pd.DataFrame(
    {
        "id":[1,2,3,4,5],
        "name":["Tom","jack","alice","bob","allen"],
        "age":[15,17,20,26,30],
        "score":[60.5,80,30.6,70,83.5]
    },index=[1,2,3,4,5],columns=["name","id","age","score"]
)
df
    name  id  age  score
1    Tom   1   15   60.5
2   jack   2   17   80.0
3  alice   3   20   30.6
4    bob   4   26   70.0
5  allen   5   30   83.5
# dataframe的属性
print('行索引:')
print(df.index)
print('列标签:')
print(df.columns)
print('值:')
print(df.values)
行索引:
Index([1, 2, 3, 4, 5], dtype='int64')
列标签:
Index(['name', 'id', 'age', 'score'], dtype='object')
值:
[['Tom' 1 15 60.5]
 ['jack' 2 17 80.0]
 ['alice' 3 20 30.6]
 ['bob' 4 26 70.0]
 ['allen' 5 30 83.5]]
print('维度:',df.ndim)
print('数据类型:')
print(df.dtypes)
print('形状:',df.shape)
print('元素个数:',df.size)
维度: 2
数据类型:
name      object
id         int64
age        int64
score    float64
dtype: object
形状: (5, 4)
元素个数: 20
# 行列转置
print(df.T)
          1     2      3     4      5
name    Tom  jack  alice   bob  allen
id        1     2      3     4      5
age      15    17     20    26     30
score  60.5  80.0   30.6  70.0   83.5
# 获取元素 loc iloc at iat
# 某行
print(df.loc[4])
print(df.iloc[3])
name      bob
id          4
age        26
score    70.0
Name: 4, dtype: object
name      bob
id          4
age        26
score    70.0
Name: 4, dtype: object
# 某列
print(df.loc[:,'name'])
print(df.iloc[:,0])
1      Tom
2     jack
3    alice
4      bob
5    allen
Name: name, dtype: object
1      Tom
2     jack
3    alice
4      bob
5    allen
Name: name, dtype: object
# 单个元素
print(df.at[3,'score'])
print(df.iat[2,1])
print(df.loc[3,'score'])
print(df.iloc[2,1])
30.6
3
30.6
3
# 获取单列数据
print(df['name'])
print(type(df['name']))
print(df.name)
1      Tom
2     jack
3    alice
4      bob
5    allen
Name: name, dtype: object
<class 'pandas.core.series.Series'>
1      Tom
2     jack
3    alice
4      bob
5    allen
Name: name, dtype: object
print(df[['name','score']]) #多列数据的获取
    name  score
1    Tom   60.5
2   jack   80.0
3  alice   30.6
4    bob   70.0
5  allen   83.5
# 查看部分数据
print(df.head(2))
print(df.tail(3))
   name  id  age  score
1   Tom   1   15   60.5
2  jack   2   17   80.0
    name  id  age  score
3  alice   3   20   30.6
4    bob   4   26   70.0
5  allen   5   30   83.5
# 使用布尔索引筛选数据
df[df.score>70]
df[(df.score>70)& (df.age<20)]
   name  id  age  score
2  jack   2   17   80.0
# 随机抽样
df.sample(3)
   name  id  age  score
4   bob   4   26   70.0
1   Tom   1   15   60.5
2  jack   2   17   80.0
print(df.isin(['jack',20])) # 查看元素是否包含在参数集合中
    name     id    age  score
1  False  False  False  False
2   True  False  False  False
3  False  False   True  False
4  False  False  False  False
5  False  False  False  False
print(df.isna()) # 查看元素是否是缺失值
    name     id    age  score
1  False  False  False  False
2  False  False  False  False
3  False  False  False  False
4  False  False  False  False
5  False  False  False  False
print(df['score'].sum()) # 某一列的总和
print(df.score.max()) # 最大值
print(df.age.min()) # 最小值
print(df.score.mean()) # 平均值
print(df.score.median())  # 中位数
print(df.age.mode()) # 众数(本例中每个年龄只出现一次,因此所有取值都会作为众数返回)
print(df.score.std()) # 标准差
print(df.score.quantile(0.25))# 分位数
print(df.describe())
324.6
83.5
15
64.92
70.0
0    15
1    17
2    20
3    26
4    30
Name: age, dtype: int64
21.188605428390044
60.5
             id        age      score
count  5.000000   5.000000   5.000000
mean   3.000000  21.600000  64.920000
std    1.581139   6.268971  21.188605
min    1.000000  15.000000  30.600000
25%    2.000000  17.000000  60.500000
50%    3.000000  20.000000  70.000000
75%    4.000000  26.000000  80.000000
max    5.000000  30.000000  83.500000
print(df.count()) # 返回每一列非缺失值的个数
name     5
id       5
age      5
score    5
dtype: int64
print(df.value_counts()) # 统计每种唯一行(各列取值的组合)出现的次数
name   id  age  score
Tom    1   15   60.5     1
alice  3   20   30.6     1
allen  5   30   83.5     1
bob    4   26   70.0     1
jack   2   17   80.0     1
Name: count, dtype: int64
print(df.drop_duplicates())
    name  id  age  score
1    Tom   1   15   60.5
2   jack   2   17   80.0
3  alice   3   20   30.6
4    bob   4   26   70.0
5  allen   5   30   83.5
print(df.duplicated(subset=['age'])) # 查看是否重复
1    False
2    False
3    False
4    False
5    False
dtype: bool
df.sample(2) # 随机抽样
   name  id  age  score
1   Tom   1   15   60.5
2  jack   2   17   80.0
print(df.replace(15,30))
    name  id  age  score
1    Tom   1   30   60.5
2   jack   2   17   80.0
3  alice   3   20   30.6
4    bob   4   26   70.0
5  allen   5   30   83.5
df.cumsum()
df.cummax()
df.cummin(axis=0)
   name  id  age  score
1   Tom   1   15   60.5
2   Tom   1   15   60.5
3   Tom   1   15   30.6
4   Tom   1   15   30.6
5   Tom   1   15   30.6
print(df.sort_index(ascending=False))
    name  id  age  score
5  allen   5   30   83.5
4    bob   4   26   70.0
3  alice   3   20   30.6
2   jack   2   17   80.0
1    Tom   1   15   60.5
print(df.sort_values(by=['score','age']))
    name  id  age  score
3  alice   3   20   30.6
1    Tom   1   15   60.5
4    bob   4   26   70.0
2   jack   2   17   80.0
5  allen   5   30   83.5
df.nlargest(2,columns=['score','age'])
df.nsmallest(2,columns=['score','age'])
    name  id  age  score
3  alice   3   20   30.6
1    Tom   1   15   60.5