Pandas入门教程(一)

数据分析处理库

import pandas as pd
df=pd.read_csv("./pandas/data/titanic.csv")

df.head(N) 读取数据的前N行

df.head(6)

df.info() 获取DataFrame的简要摘要

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB

df.index 查看索引

df.index

RangeIndex(start=0, stop=891, step=1)

df.columns 查看所有列名

df.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')

df.dtypes 查看每一列的字段类型

df.dtypes

PassengerId      int64
Survived         int64
Pclass           int64
Name            object
Sex             object
Age            float64
SibSp            int64
Parch            int64
Ticket          object
Fare           float64
Cabin           object
Embarked        object
dtype: object

df.values 查看所有数据

df.values

array([[1, 0, 3, ..., 7.25, nan, 'S'],
       [2, 1, 1, ..., 71.2833, 'C85', 'C'],
       [3, 1, 3, ..., 7.925, nan, 'S'],
       ...,
       [889, 0, 3, ..., 23.45, nan, 'S'],
       [890, 1, 1, ..., 30.0, 'C148', 'C'],
       [891, 0, 3, ..., 7.75, nan, 'Q']], dtype=object)

df['Name']

0                                Braund, Mr. Owen Harris
1      Cumings, Mrs. John Bradley (Florence Briggs Th...
2                                 Heikkinen, Miss. Laina
3           Futrelle, Mrs. Jacques Heath (Lily May Peel)
4                               Allen, Mr. William Henry
                             ...                        
886                                Montvila, Rev. Juozas
887                         Graham, Miss. Margaret Edith
888             Johnston, Miss. Catherine Helen "Carrie"
889                                Behr, Mr. Karl Howell
890                                  Dooley, Mr. Patrick
Name: Name, Length: 891, dtype: object

df=df.set_index('Name')
df

查询Age列的前8行数据

df['Age'][:8]

Name
Braund, Mr. Owen Harris                                22.0
Cumings, Mrs. John Bradley (Florence Briggs Thayer)    38.0
Heikkinen, Miss. Laina                                 26.0
Futrelle, Mrs. Jacques Heath (Lily May Peel)           35.0
Allen, Mr. William Henry                               35.0
Moran, Mr. James                                        NaN
McCarthy, Mr. Timothy J                                54.0
Palsson, Master. Gosta Leonard                          2.0
Name: Age, dtype: float64

对单列数据的操作

age=df['Age']
age

Name
Braund, Mr. Owen Harris                                22.0
Cumings, Mrs. John Bradley (Florence Briggs Thayer)    38.0
Heikkinen, Miss. Laina                                 26.0
Futrelle, Mrs. Jacques Heath (Lily May Peel)           35.0
Allen, Mr. William Henry                               35.0
                                                       ... 
Montvila, Rev. Juozas                                  27.0
Graham, Miss. Margaret Edith                           19.0
Johnston, Miss. Catherine Helen "Carrie"                NaN
Behr, Mr. Karl Howell                                  26.0
Dooley, Mr. Patrick                                    32.0
Name: Age, Length: 891, dtype: float64

# 每一个Age统一加10
age=age+10
age

Name
Braund, Mr. Owen Harris                                32.0
Cumings, Mrs. John Bradley (Florence Briggs Thayer)    48.0
Heikkinen, Miss. Laina                                 36.0
Futrelle, Mrs. Jacques Heath (Lily May Peel)           45.0
Allen, Mr. William Henry                               45.0
                                                       ... 
Montvila, Rev. Juozas                                  37.0
Graham, Miss. Margaret Edith                           29.0
Johnston, Miss. Catherine Helen "Carrie"                NaN
Behr, Mr. Karl Howell                                  36.0
Dooley, Mr. Patrick                                    42.0
Name: Age, Length: 891, dtype: float64

# Age的最大值
age.max()

90.0

# Age的最小值
age.min()

10.42

# Age的平均值
age.mean()

39.69911764705882

describe得到数据的基本统计特征

df.describe()

只查询某几列

df[['Age','Fare']][:5]

通过索引或者标签查询数据

# Select a single row by integer position (here the first row).
df.iloc[0]
# Select the first 5 rows (positions 0-4; the end of an iloc slice is exclusive).
df.iloc[0:5]
# Select the first 5 rows and the 2 columns at positions 1 and 2.
df.iloc[0:5,1:3]

# Select a single row by its index label (the index is the Name column).
df.loc['Futrelle, Mrs. Jacques Heath (Lily May Peel)']
# Select a single cell by row label and column label.
df.loc['Futrelle, Mrs. Jacques Heath (Lily May Peel)','Age']
# Select a range of rows and a range of columns by label
# (label slices with .loc include both endpoints).
df.loc['Braund, Mr. Owen Harris':'Graham, Miss. Margaret Edith','Sex':'Age']
# Overwrite a single cell in place.
df.loc['Heikkinen, Miss. Laina','Age']=2000

bool运算

# First 5 rows whose Age is greater than 50.
df[df['Age']>50][:5]
# All rows whose Sex is 'female'.
df[df['Sex']=='female']
# Mean Age of the rows whose Sex is 'male'.
df.loc[df['Sex']=='male','Age'].mean()
# Number of rows whose Age is greater than 50: summing the boolean mask
# counts its True values (this is a count, not the sum of the ages).
(df['Age']>50).sum()

DataFrame groupby数据分组

dff=pd.DataFrame({'key':['A','B','C','A','B','C','A','B','C'],'value':[0,5,10,5,10,15,10,15,20]})
dff

按照key分组求和

dff.groupby('key').sum()

import numpy as np
# Group by 'key' and average each group. The string alias 'mean' selects
# pandas' built-in group mean; passing the np.mean callable is deprecated
# in recent pandas releases.
dff.groupby('key').aggregate('mean')

# 按照Sex分组,计算Age的平均值
df.groupby('Sex')['Age'].mean()

Sex
female    35.478927
male      30.726645
Name: Age, dtype: float64

数值运算

df1=pd.DataFrame([[1,2,3,4],[3,4,5,6]],index=['a','b'],columns=['A','B','C','D'])
df1

# Sum of each column (axis=0 is the default, so both calls are equivalent).
df1.sum()
df1.sum(axis=0)

A     4
B     6
C     8
D    10
dtype: int64

# 每一行求和
df1.sum(axis=1)

a    10
b    18
dtype: int64

# 每一列求平均值
df1.mean(axis=0)

A    2.0
B    3.0
C    4.0
D    5.0
dtype: float64

# 每一行求平均值
df1.mean(axis=1)

a    2.5
b    4.5
dtype: float64

df

# Covariance between the numeric columns. numeric_only=True skips the string
# columns (Sex, Ticket, Cabin, Embarked); without it pandas 2.x tries to
# convert them to float and raises.
df.cov(numeric_only=True)

# Pairwise correlation between the numeric columns (same reason for
# numeric_only=True).
df.corr(numeric_only=True)

# Count how many times each distinct value occurs in the Age column
# (most frequent first).
df['Age'].value_counts()

24.00    30
22.00    27
18.00    26
28.00    25
19.00    25
         ..
53.00     1
55.50     1
70.50     1
23.50     1
0.42      1
Name: Age, Length: 89, dtype: int64

# Count how many times each distinct value occurs in the Age column,
# ordered from least frequent to most frequent.
df['Age'].value_counts(ascending=True)

0.42      1
23.50     1
70.50     1
55.50     1
53.00     1
         ..
19.00    25
28.00    25
18.00    26
22.00    27
24.00    30
Name: Age, Length: 89, dtype: int64

对象操作(Series一行或者一列)

# Build a Series whose index is a list of string labels.
data=[1,2,3,4]
index=['a','b','c','d']
s=pd.Series(index=index,data=data)

# Read the first element by position. Use .iloc: plain s[0] on a
# label-indexed Series relies on a deprecated positional fallback.
s.iloc[0]

# Read the elements at positions 1 and 2 (the slice end is exclusive).
s.iloc[1:3]

# Boolean mask: keep only the 'a' and 'c' entries.
mask=[True,False,True,False]
s[mask]

# Overwrite one value by label.
s['a']=200

# Replace every value equal to 3 with 300.
s.replace(to_replace=3,value=300,inplace=True)

# Rename an index label ('a' -> 'A').
s.rename(index={'a':'A'},inplace=True)


# Append more data. Series.append was removed in pandas 2.0;
# pd.concat is the supported way to join Series end to end.
s1=pd.Series(index=['e','f'],data=[5,6])

s3=pd.concat([s,s1])


# Delete the entry labelled 'A'.
del s3['A']

# Delete several entries at once.

s3.drop(['c','d'],inplace=True)

s3

b    2
e    5
f    6
dtype: int64

DataFrame的增删改查操作

# 构造一个DataFrame
data=[[1,2,3,4],[5,6,7,8]]
index=['a','b']
columns=['A','B','C','D']
dff=pd.DataFrame(data=data,index=index,columns=columns)

	A	B	C	D
a	1	2	3	4
b	5	6	7	8

# 通过loc(‘索引值’)和iloc(索引数值)查询
dff1=dff.iloc[1]
dff1=dff.loc['a']
dff1

A    1
B    2
C    3
D    4
Name: a, dtype: int64

# Modify a single cell. Address the row and column in one .loc call:
# the chained form dff.loc['a']['A'] = ... assigns to an intermediate Series
# and is not guaranteed to write back to dff.
dff.loc['a','A']=1000
dff

	A	B	C	D
a	1000	2	3	4
b	5	6	7	8

# 修改索引
dff.index=['m','n']
dff

	A	B	C	D
m	1000	2	3	4
n	5	6	7	8

# 添加一行数据
dff.loc['o']=[10,11,12,13]
dff

	A	B	C	D
m	1000	2	3	4
n	5	6	7	8
o	10	11	12	13

#  添加一列数据
dff['E']=[5,9,14]
dff

	A	B	C	D	E
m	1000	2	3	4	5
n	5	6	7	8	9
o	10	11	12	13	14

# 批量添加多列数据
df4=pd.DataFrame([[6,10,15],[7,11,16],[8,12,17]],index=['m','n','o'],columns=['F','M','N'])
df5=pd.concat([dff,df4],axis=1)
df5

	A	B	C	D	E	F	M	N
m	1000	2	3	4	5	6	10	15
n	5	6	7	8	9	7	11	16
o	10	11	12	13	14	8	12	17

# 删除一行数据
df5.drop(['o'],axis=0,inplace=True)
df5

	A	B	C	D	E	F	M	N
m	1000	2	3	4	5	6	10	15
n	5	6	7	8	9	7	11	16

# 删除列
df5.drop(['E','F'],axis=1,inplace=True)
df5

	A	B	C	D	M	N
m	1000	2	3	4	10	15
n	5	6	7	8	11	16