数据统计基础之 Pandas.DataFrame 学习：本文介绍了使用 Python pandas 库创建和操作 DataFrame 的方法。

Pandas 是 Python 中用于数据分析的核心库之一，其中 DataFrame 是最常用的数据结构，类似于 Excel 表格或 SQL 表。下面我们将从创建、属性、索引、筛选、统计、排序等多个维度全面介绍 DataFrame 的基本用法。

# dataframe的创建方式
import pandas as pd
import numpy as np
# Create a DataFrame from Series objects: each dict key becomes a column label.
s1 = pd.Series([1, 2, 3, 4, 5])
s2 = pd.Series([6, 7, 8, 9, 10])
df = pd.DataFrame({"第一列": s1, "第二列": s2})
# Selecting a single column gives back a Series. Print the type so the result
# is actually shown instead of being evaluated and silently discarded.
print(type(df["第一列"]))
# Create a DataFrame from a dict that maps column names to equal-length lists.
# `index` supplies explicit row labels; `columns` fixes the column order, which
# is why "name" appears before "id" in the resulting frame.
_column_data = {
    "id": [1, 2, 3, 4, 5],
    "name": ["Tom", "jack", "alice", "bob", "allen"],
    "age": [15, 17, 20, 26, 30],
    "score": [60.5, 80, 30.6, 70, 83.5],
}
df = pd.DataFrame(
    data=_column_data,
    index=[1, 2, 3, 4, 5],
    columns=["name", "id", "age", "score"],
)
df

.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }

	name	id	age	score
1	Tom	1	15	60.5
2	jack	2	17	80.0
3	alice	3	20	30.6
4	bob	4	26	70.0
5	allen	5	30	83.5

# DataFrame attributes: row index, column labels, and the underlying values
print('行索引：')
print(df.index)  # row labels
print('列标签：')
print(df.columns)  # column labels
print('值：')
print(df.values)  # the cell values as a 2-D array, without labels

行索引：
Index([1, 2, 3, 4, 5], dtype='int64')
列标签：
Index(['name', 'id', 'age', 'score'], dtype='object')
值：
[['Tom' 1 15 60.5]
 ['jack' 2 17 80.0]
 ['alice' 3 20 30.6]
 ['bob' 4 26 70.0]
 ['allen' 5 30 83.5]]

# Dimensionality, per-column dtypes, (rows, columns) shape, and total cell count
print('维度：',df.ndim)
print('数据类型：')
print(df.dtypes)
print('形状：',df.shape)
print('元素个数：',df.size)

维度： 2
数据类型：
name      object
id         int64
age        int64
score    float64
dtype: object
形状： (5, 4)
元素个数： 20

# Transpose: swap rows and columns
print(df.T)

          1     2      3     4      5
name    Tom  jack  alice   bob  allen
id        1     2      3     4      5
age      15    17     20    26     30
score  60.5  80.0   30.6  70.0   83.5

# Accessing elements with loc / iloc / at / iat
# A single row: loc selects by index label, iloc by integer position.
# The row labels start at 1, so label 4 and position 3 are the same row.
print(df.loc[4])
print(df.iloc[3])

name      bob
id          4
age        26
score    70.0
Name: 4, dtype: object
name      bob
id          4
age        26
score    70.0
Name: 4, dtype: object

# A single column: by label with loc, by integer position with iloc
# ('name' is the first column, i.e. position 0)
print(df.loc[:,'name'])
print(df.iloc[:,0])

1      Tom
2     jack
3    alice
4      bob
5    allen
Name: name, dtype: object
1      Tom
2     jack
3    alice
4      bob
5    allen
Name: name, dtype: object

# A single element, read four equivalent ways.
# at / loc address the cell by labels (row label 3, column 'score');
# iat / iloc address the same cell by integer positions: row label 3 sits at
# position 2, and 'score' is the fourth column, i.e. position 3.
# (Position 1 would be the 'id' column, not 'score'.)
print(df.at[3,'score'])
print(df.iat[2,3])
print(df.loc[3,'score'])
print(df.iloc[2,3])

# Get a single column
print(df['name'])
print(type(df['name']))  # a single column comes back as a Series
print(df.name)  # attribute-style access to the same column

1      Tom
2     jack
3    alice
4      bob
5    allen
Name: name, dtype: object
<class 'pandas.core.series.Series'>
1      Tom
2     jack
3    alice
4      bob
5    allen
Name: name, dtype: object

print(df[['name','score']]) # select several columns at once by passing a list of labels

    name  score
1    Tom   60.5
2   jack   80.0
3  alice   30.6
4    bob   70.0
5  allen   83.5

# Peek at part of the data: the first 2 rows, then the last 3 rows
print(df.head(2))
print(df.tail(3))

   name  id  age  score
1   Tom   1   15   60.5
2  jack   2   17   80.0
    name  id  age  score
3  alice   3   20   30.6
4    bob   4   26   70.0
5  allen   5   30   83.5

# Filter rows with boolean indexing
df[df.score>70]  # rows whose score is above 70
# Combine conditions with &; each comparison is wrapped in its own parentheses.
# The table shown below this cell is the result of this last expression only.
df[(df.score>70)& (df.age<20)]

.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }

	name	id	age	score
2	jack	2	17	80.0

# Random sampling: draw 3 rows at random
df.sample(3)

.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }

	name	id	age	score
4	bob	4	26	70.0
1	Tom	1	15	60.5
2	jack	2	17	80.0

print(df.isin(['jack',20])) # element-wise check: is each value contained in the given collection?

    name     id    age  score
1  False  False  False  False
2   True  False  False  False
3  False  False   True  False
4  False  False  False  False
5  False  False  False  False

print(df.isna()) # element-wise check: is each value missing?

    name     id    age  score
1  False  False  False  False
2  False  False  False  False
3  False  False  False  False
4  False  False  False  False
5  False  False  False  False

print(df['score'].sum()) # sum of one column
print(df.score.max()) # maximum
print(df.age.min()) # minimum
print(df.score.mean()) # mean
print(df.score.median())  # median
print(df.age.mode()) # mode (every age occurs once here, so all values are returned)
print(df.score.std()) # standard deviation
print(df.score.quantile(0.25))# quantile (here the 25% quantile)
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns
print(df.describe())

324.6
83.5
15
64.92
70.0
0    15
1    17
2    20
3    26
4    30
Name: age, dtype: int64
21.188605428390044
60.5
             id        age      score
count  5.000000   5.000000   5.000000
mean   3.000000  21.600000  64.920000
std    1.581139   6.268971  21.188605
min    1.000000  15.000000  30.600000
25%    2.000000  17.000000  60.500000
50%    3.000000  20.000000  70.000000
75%    4.000000  26.000000  80.000000
max    5.000000  30.000000  83.500000

print(df.count()) # number of non-missing values in each column

name     5
id       5
age      5
score    5
dtype: int64

print(df.value_counts()) # how many times each distinct row occurs

name   id  age  score
Tom    1   15   60.5     1
alice  3   20   30.6     1
allen  5   30   83.5     1
bob    4   26   70.0     1
jack   2   17   80.0     1
Name: count, dtype: int64

# Drop duplicated rows; this frame has none, so every row is kept
print(df.drop_duplicates())

    name  id  age  score
1    Tom   1   15   60.5
2   jack   2   17   80.0
3  alice   3   20   30.6
4    bob   4   26   70.0
5  allen   5   30   83.5

print(df.duplicated(subset=['age'])) # flag duplicated rows, comparing only the 'age' column

1    False
2    False
3    False
4    False
5    False
dtype: bool

df.sample(2) # random sampling: draw 2 rows at random

.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }

	name	id	age	score
1	Tom	1	15	60.5
2	jack	2	17	80.0

# Replace every value equal to 15 with 30; the result is only printed and
# df itself is not reassigned here
print(df.replace(15,30))

    name  id  age  score
1    Tom   1   30   60.5
2   jack   2   17   80.0
3  alice   3   20   30.6
4    bob   4   26   70.0
5  allen   5   30   83.5

# Cumulative operations computed down the rows of each column.
# The table shown below this cell is the result of the last expression (cummin) only.
df.cumsum()  # running sum
df.cummax()  # running maximum
df.cummin(axis=0)  # running minimum

.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }

	name	id	age	score
1	Tom	1	15	60.5
2	Tom	1	15	60.5
3	Tom	1	15	30.6
4	Tom	1	15	30.6
5	Tom	1	15	30.6

# Sort rows by index label in descending order
print(df.sort_index(ascending=False))

    name  id  age  score
5  allen   5   30   83.5
4    bob   4   26   70.0
3  alice   3   20   30.6
2   jack   2   17   80.0
1    Tom   1   15   60.5

# Sort rows by 'score' in ascending order, using 'age' to break ties
print(df.sort_values(by=['score','age']))

    name  id  age  score
3  alice   3   20   30.6
1    Tom   1   15   60.5
4    bob   4   26   70.0
2   jack   2   17   80.0
5  allen   5   30   83.5

# The 2 rows with the largest / smallest values, ordered by 'score' then 'age'.
# The table shown below this cell is the result of the last expression (nsmallest) only.
df.nlargest(2,columns=['score','age'])
df.nsmallest(2,columns=['score','age'])

.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }

	name	id	age	score
3	alice	3	20	30.6
1	Tom	1	15	60.5