数据分析处理库
import pandas as pd
df=pd.read_csv("./pandas/data/titanic.csv")
df.head(N) 读取数据的前N行
df.head(6)
df.info() 获取DataFrame的简要摘要
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 PassengerId 891 non-null int64
1 Survived 891 non-null int64
2 Pclass 891 non-null int64
3 Name 891 non-null object
4 Sex 891 non-null object
5 Age 714 non-null float64
6 SibSp 891 non-null int64
7 Parch 891 non-null int64
8 Ticket 891 non-null object
9 Fare 891 non-null float64
10 Cabin 204 non-null object
11 Embarked 889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB
df.index 查看索引
df.index
RangeIndex(start=0, stop=891, step=1)
df.columns 查看所有列名
df.columns
Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
dtype='object')
df.dtypes 查看每一列的字段类型
df.dtypes
PassengerId int64
Survived int64
Pclass int64
Name object
Sex object
Age float64
SibSp int64
Parch int64
Ticket object
Fare float64
Cabin object
Embarked object
dtype: object
df.values 查看所有数据
df.values
array([[1, 0, 3, ..., 7.25, nan, 'S'],
[2, 1, 1, ..., 71.2833, 'C85', 'C'],
[3, 1, 3, ..., 7.925, nan, 'S'],
...,
[889, 0, 3, ..., 23.45, nan, 'S'],
[890, 1, 1, ..., 30.0, 'C148', 'C'],
[891, 0, 3, ..., 7.75, nan, 'Q']], dtype=object)
df['Name']
0 Braund, Mr. Owen Harris
1 Cumings, Mrs. John Bradley (Florence Briggs Th...
2 Heikkinen, Miss. Laina
3 Futrelle, Mrs. Jacques Heath (Lily May Peel)
4 Allen, Mr. William Henry
...
886 Montvila, Rev. Juozas
887 Graham, Miss. Margaret Edith
888 Johnston, Miss. Catherine Helen "Carrie"
889 Behr, Mr. Karl Howell
890 Dooley, Mr. Patrick
Name: Name, Length: 891, dtype: object
df=df.set_index('Name')
df
查询Age列的前8行数据
df['Age'][:8]
Name
Braund, Mr. Owen Harris 22.0
Cumings, Mrs. John Bradley (Florence Briggs Thayer) 38.0
Heikkinen, Miss. Laina 26.0
Futrelle, Mrs. Jacques Heath (Lily May Peel) 35.0
Allen, Mr. William Henry 35.0
Moran, Mr. James NaN
McCarthy, Mr. Timothy J 54.0
Palsson, Master. Gosta Leonard 2.0
Name: Age, dtype: float64
对单列数据的操作
age=df['Age']
age
Name
Braund, Mr. Owen Harris 22.0
Cumings, Mrs. John Bradley (Florence Briggs Thayer) 38.0
Heikkinen, Miss. Laina 26.0
Futrelle, Mrs. Jacques Heath (Lily May Peel) 35.0
Allen, Mr. William Henry 35.0
...
Montvila, Rev. Juozas 27.0
Graham, Miss. Margaret Edith 19.0
Johnston, Miss. Catherine Helen "Carrie" NaN
Behr, Mr. Karl Howell 26.0
Dooley, Mr. Patrick 32.0
Name: Age, Length: 891, dtype: float64
age=age+10
age
Name
Braund, Mr. Owen Harris 32.0
Cumings, Mrs. John Bradley (Florence Briggs Thayer) 48.0
Heikkinen, Miss. Laina 36.0
Futrelle, Mrs. Jacques Heath (Lily May Peel) 45.0
Allen, Mr. William Henry 45.0
...
Montvila, Rev. Juozas 37.0
Graham, Miss. Margaret Edith 29.0
Johnston, Miss. Catherine Helen "Carrie" NaN
Behr, Mr. Karl Howell 36.0
Dooley, Mr. Patrick 42.0
Name: Age, Length: 891, dtype: float64
age.max()
90.0
age.min()
10.42
age.mean()
39.69911764705882
describe得到数据的基本统计特征
df.describe()
只查询某几列
df[['Age','Fare']][:5]
通过索引或者标签查询数据
# iloc selects by integer position: the first row, returned as a Series.
df.iloc[0]
# Rows at positions 0 through 4 (the stop position is excluded).
df.iloc[0:5]
# The same five rows, restricted to the columns at positions 1 and 2.
df.iloc[0:5,1:3]
# loc selects by label; the index was set to the 'Name' column earlier,
# so a passenger name addresses a row.
df.loc['Futrelle, Mrs. Jacques Heath (Lily May Peel)']
# Row label plus column label addresses a single cell.
df.loc['Futrelle, Mrs. Jacques Heath (Lily May Peel)','Age']
# Label slices are inclusive of both endpoints, for rows and for columns.
df.loc['Braund, Mr. Owen Harris':'Graham, Miss. Margaret Edith','Sex':'Age']
# Assigning through loc overwrites the cell in df itself; later statistics
# on 'Age' will include this value.
df.loc['Heikkinen, Miss. Laina','Age']=2000
bool运算
# Rows whose Age exceeds 50, limited to the first five matches.
df.loc[df['Age'] > 50].head(5)
# Every row whose Sex is 'female'.
df.loc[df['Sex'] == 'female']
# Mean Age over the rows whose Sex is 'male'.
df.loc[df['Sex'] == 'male', 'Age'].mean()
# Summing a boolean mask counts its True entries.
(df['Age'] > 50).sum()
65
DataFrame groupby数据分组
# Small demo frame for groupby: the keys A, B, C repeat three times,
# each paired with an integer value.
dff = pd.DataFrame(
    {
        'key': ['A', 'B', 'C'] * 3,
        'value': [0, 5, 10, 5, 10, 15, 10, 15, 20],
    }
)
dff
按照key分组求和
# Per-key totals of the remaining columns.
dff.groupby('key').sum()
import numpy as np
# Name the reduction as a string: passing the NumPy callable np.mean to
# aggregate() emits a FutureWarning since pandas 2.1, while 'mean' gives the
# same per-group averages on every pandas version.
dff.groupby('key').aggregate('mean')
df.groupby('Sex')['Age'].mean()
Sex
female 35.478927
male 30.726645
Name: Age, dtype: float64
数值运算
# Two-row integer frame used to demonstrate axis-wise reductions.
df1 = pd.DataFrame(
    data=[[1, 2, 3, 4], [3, 4, 5, 6]],
    columns=list('ABCD'),
    index=list('ab'),
)
df1
# sum() defaults to axis=0, i.e. one total per column; the explicit call
# below produces the same result.
df1.sum()
df1.sum(axis=0)
A 4
B 6
C 8
D 10
dtype: int64
df1.sum(axis=1)
a 10
b 18
dtype: int64
df1.mean(axis=0)
A 2.0
B 3.0
C 4.0
D 5.0
dtype: float64
df1.mean(axis=1)
a 2.5
b 4.5
dtype: float64
df
# Covariance and correlation are only defined for numeric data. df still holds
# object columns (Sex, Ticket, Cabin, Embarked), and pandas >= 2.0 raises on
# them instead of silently dropping them, so select the numeric columns
# explicitly; select_dtypes works on old and new pandas alike.
df.select_dtypes(include='number').cov()
df.select_dtypes(include='number').corr()
# Frequency of each distinct Age value, most common first.
df['Age'].value_counts()
24.00 30
22.00 27
18.00 26
28.00 25
19.00 25
..
53.00 1
55.50 1
70.50 1
23.50 1
0.42 1
Name: Age, Length: 89, dtype: int64
df['Age'].value_counts(ascending=True)
0.42 1
23.50 1
70.50 1
55.50 1
53.00 1
..
19.00 25
28.00 25
18.00 26
22.00 27
24.00 30
Name: Age, Length: 89, dtype: int64
对象操作(Series一行或者一列)
# Build a Series with string labels a..d.
data = [1, 2, 3, 4]
index = ['a', 'b', 'c', 'd']
s = pd.Series(index=index, data=data)
# Positional access goes through .iloc: using s[0] on a string-labelled
# Series relies on a positional fallback that pandas has deprecated.
s.iloc[0]
s.iloc[1:3]
# A boolean list keeps the entries where the mask is True.
mask = [True, False, True, False]
s[mask]
# Update by label, replace by value, then rename a label, all in place.
s['a'] = 200
s.replace(to_replace=3, value=300, inplace=True)
s.rename(index={'a': 'A'}, inplace=True)
s1 = pd.Series(index=['e', 'f'], data=[5, 6])
# Series.append was removed in pandas 2.0; pd.concat is the supported way
# to stack two Series and returns a new object just as append did.
s3 = pd.concat([s, s1])
# Remove entries by label: del for a single label, drop for several.
del s3['A']
s3.drop(['c', 'd'], inplace=True)
s3
b 2
e 5
f 6
dtype: int64
DataFrame的增删改查操作
# Ingredients of a 2x4 frame: row labels a/b, column labels A..D.
columns = list('ABCD')
index = ['a', 'b']
data = [[1, 2, 3, 4], [5, 6, 7, 8]]
dff = pd.DataFrame(data, columns=columns, index=index)
# Read one row two ways: by position first, then by label. The second
# assignment replaces the first, so dff1 ends up holding row 'a'.
dff1 = dff.iloc[1]
dff1 = dff.loc['a']
dff1
A 1
B 2
C 3
D 4
Name: a, dtype: int64
# Write the cell with a single .loc[row, column] call. The chained form
# dff.loc['a']['A'] = ... assigns into an intermediate row object, which is
# not guaranteed to propagate to dff and never does under Copy-on-Write.
dff.loc['a', 'A'] = 1000
dff
# Replace the row labels wholesale.
dff.index = ['m', 'n']
dff
# Assigning to a label that does not exist yet appends a new row.
dff.loc['o'] = [10, 11, 12, 13]
dff
|   | A | B | C | D |
|---|---|---|---|---|
| m | 1000 | 2 | 3 | 4 |
| n | 5 | 6 | 7 | 8 |
| o | 10 | 11 | 12 | 13 |
dff['E']=[5,9,14]
dff
|   | A | B | C | D | E |
|---|---|---|---|---|---|
| m | 1000 | 2 | 3 | 4 | 5 |
| n | 5 | 6 | 7 | 8 | 9 |
| o | 10 | 11 | 12 | 13 | 14 |
df4=pd.DataFrame([[6,10,15],[7,11,16],[8,12,17]],index=['m','n','o'],columns=['F','M','N'])
df5=pd.concat([dff,df4],axis=1)
df5
|   | A | B | C | D | E | F | M | N |
|---|---|---|---|---|---|---|---|---|
| m | 1000 | 2 | 3 | 4 | 5 | 6 | 10 | 15 |
| n | 5 | 6 | 7 | 8 | 9 | 7 | 11 | 16 |
| o | 10 | 11 | 12 | 13 | 14 | 8 | 12 | 17 |
df5.drop(['o'],axis=0,inplace=True)
df5
|   | A | B | C | D | E | F | M | N |
|---|---|---|---|---|---|---|---|---|
| m | 1000 | 2 | 3 | 4 | 5 | 6 | 10 | 15 |
| n | 5 | 6 | 7 | 8 | 9 | 7 | 11 | 16 |
df5.drop(['E','F'],axis=1,inplace=True)
df5
|   | A | B | C | D | M | N |
|---|---|---|---|---|---|---|
| m | 1000 | 2 | 3 | 4 | 10 | 15 |
| n | 5 | 6 | 7 | 8 | 11 | 16 |
