Pandas 入门教程 · 目录：什么是 Pandas、安装 Pandas、核心数据结构、数据读取与写入……

Pandas 入门教程

什么是 Pandas

Pandas 是一个强大的 Python 数据分析库，建立在 NumPy 之上，提供了高性能、易用的数据结构和数据分析工具。

核心特点

DataFrame：类似 Excel 表格的二维数据结构
Series：一维标记数组
数据读写：支持 CSV、Excel、SQL、JSON 等多种格式
数据清洗：处理缺失值、重复值、异常值
数据转换：重塑、透视、合并、分组
时间序列：强大的日期和时间处理功能
集成生态：与 NumPy、Matplotlib、Scikit-learn 无缝集成

Pandas vs Excel

特性	Excel	Pandas
数据量	有限（约100万行）	几乎无限（受内存限制）
自动化	手动操作	可编程自动化
可重复性	低	高
版本控制	困难	容易
复杂分析	困难	简单
学习曲线	低	中等

安装 Pandas

方法一：通过 Anaconda 安装

Pandas 已预装在 Anaconda 中。

# 验证安装
python -c "import pandas; print(pandas.__version__)"

方法二：通过 pip 安装

pip install pandas

导入 Pandas

import pandas as pd
import numpy as np

# 查看版本
print(pd.__version__)

约定俗成：始终使用 import pandas as pd 作为标准导入方式。

核心数据结构

Pandas 有两个核心数据结构：Series 和 DataFrame。

Series（一维数组）

Series 是带标签的一维数组，可以存储任何数据类型。

import pandas as pd
import numpy as np

# 从列表创建
s = pd.Series([1, 3, 5, np.nan, 6, 8])
print(s)
# 0    1.0
# 1    3.0
# 2    5.0
# 3    NaN
# 4    6.0
# 5    8.0
# dtype: float64

# 指定索引
s = pd.Series([10, 20, 30, 40], index=['a', 'b', 'c', 'd'])
print(s)
# a    10
# b    20
# c    30
# d    40
# dtype: int64

# 从字典创建
data = {'apple': 3, 'banana': 5, 'orange': 2}
s = pd.Series(data)
print(s)
# apple     3
# banana    5
# orange    2
# dtype: int64

# Series 属性
print(s.index)    # Index(['apple', 'banana', 'orange'], dtype='object')
print(s.values)   # array([3, 5, 2])
print(s.dtype)    # int64
print(s.name)     # None

DataFrame（二维表格）

DataFrame 是带标签的二维数据结构，类似电子表格或 SQL 表。

# 从字典创建
data = {
    'Name': ['Alice', 'Bob', 'Charlie', 'David'],
    'Age': [25, 30, 35, 40],
    'City': ['Beijing', 'Shanghai', 'Guangzhou', 'Shenzhen'],
    'Salary': [8000, 12000, 15000, 20000]
}
df = pd.DataFrame(data)
print(df)
#       Name  Age       City  Salary
# 0    Alice   25    Beijing    8000
# 1      Bob   30   Shanghai   12000
# 2  Charlie   35  Guangzhou   15000
# 3    David   40   Shenzhen   20000

# 从列表的列表创建
data = [
    ['Alice', 25, 'Beijing'],
    ['Bob', 30, 'Shanghai'],
    ['Charlie', 35, 'Guangzhou']
]
df = pd.DataFrame(data, columns=['Name', 'Age', 'City'])

# 从 NumPy 数组创建
arr = np.random.randn(5, 3)
df = pd.DataFrame(arr, columns=['A', 'B', 'C'])

# 从 Series 字典创建
data = {
    'one': pd.Series([1, 2, 3], index=['a', 'b', 'c']),
    'two': pd.Series([1, 2, 3, 4], index=['a', 'b', 'c', 'd'])
}
df = pd.DataFrame(data)
print(df)
#    one  two
# a  1.0    1
# b  2.0    2
# c  3.0    3
# d  NaN    4

# DataFrame 属性
print(df.shape)     # (行数, 列数)
print(df.columns)   # 列名
print(df.index)     # 索引
print(df.dtypes)    # 每列的数据类型
print(df.values)    # NumPy 数组

数据读取与写入

读取数据

import pandas as pd

# 读取 CSV 文件
df = pd.read_csv('data.csv')
df = pd.read_csv('data.csv', encoding='utf-8')  # 指定编码
df = pd.read_csv('data.csv', sep='\t')          # 制表符分隔
df = pd.read_csv('data.csv', header=0)           # 指定表头行
df = pd.read_csv('data.csv', index_col=0)        # 指定索引列
df = pd.read_csv('data.csv', usecols=['A', 'B']) # 只读取指定列
df = pd.read_csv('data.csv', nrows=100)          # 只读取前100行

# 读取 Excel 文件
df = pd.read_excel('data.xlsx')
df = pd.read_excel('data.xlsx', sheet_name='Sheet1')
df = pd.read_excel('data.xlsx', sheet_name=0)

# 需要安装 openpyxl
# pip install openpyxl

# 读取 JSON 文件
df = pd.read_json('data.json')

# 读取 SQL 数据库
import sqlite3
conn = sqlite3.connect('database.db')
df = pd.read_sql_query('SELECT * FROM table_name', conn)
conn.close()

# 读取 HTML 表格
df = pd.read_html('https://example.com/table.html')[0]

# 读取剪贴板
df = pd.read_clipboard()

写入数据

# 写入 CSV
df.to_csv('output.csv', index=False)              # 不保存索引
df.to_csv('output.csv', encoding='utf-8-sig')     # Excel 兼容编码

# 写入 Excel
df.to_excel('output.xlsx', index=False, sheet_name='Sheet1')

# 写入 JSON
df.to_json('output.json', orient='records')

# 写入 SQL
df.to_sql('table_name', conn, if_exists='replace', index=False)

# 写入剪贴板
df.to_clipboard(index=False)

读取参数详解

# read_csv 常用参数
df = pd.read_csv(
    'data.csv',
    sep=',',              # 分隔符
    header=0,             # 表头行号
    index_col=None,       # 索引列
    usecols=None,         # 读取的列
    dtype=None,           # 数据类型
    na_values=['NA', ''], # 缺失值标记
    parse_dates=False,    # 解析日期
    encoding='utf-8',     # 编码
    nrows=None,           # 读取行数
    skiprows=None,        # 跳过的行
    chunksize=None        # 分块大小
)

数据查看与探索

import pandas as pd
import numpy as np

# 创建示例数据
df = pd.DataFrame({
    'Name': ['Alice', 'Bob', 'Charlie', 'David', 'Eve'],
    'Age': [25, 30, 35, 40, 45],
    'City': ['Beijing', 'Shanghai', 'Guangzhou', 'Shenzhen', 'Hangzhou'],
    'Salary': [8000, 12000, 15000, 20000, 18000],
    'Department': ['IT', 'HR', 'IT', 'Finance', 'HR']
})

基本信息查看

# 查看前几行
print(df.head())        # 前5行
print(df.head(3))       # 前3行

# 查看后几行
print(df.tail())        # 后5行
print(df.tail(2))       # 后2行

# 查看形状
print(df.shape)         # (5, 5)

# 查看列名
print(df.columns)       # Index(['Name', 'Age', 'City', 'Salary', 'Department'])

# 查看索引
print(df.index)         # RangeIndex(start=0, stop=5, step=1)

# 查看数据类型
print(df.dtypes)
# Name          object
# Age            int64
# City          object
# Salary         int64
# Department    object
# dtype: object

# Detailed summary. info() prints the report itself and returns None,
# so it is called directly instead of being wrapped in print().
df.info()
# <class 'pandas.core.frame.DataFrame'>
# RangeIndex: 5 entries, 0 to 4
# Data columns (total 5 columns):
#  #   Column      Non-Null Count  Dtype
# ---  ------      --------------  -----
#  0   Name        5 non-null      object
#  1   Age         5 non-null      int64
#  2   City        5 non-null      object
#  3   Salary      5 non-null      int64
#  4   Department  5 non-null      object
# dtypes: int64(2), object(3)
# memory usage: 328.0+ bytes

# Summary statistics for the numeric columns
print(df.describe())
#              Age        Salary
# count   5.000000      5.000000
# mean   35.000000  14600.000000
# std     7.905694   4774.934555
# min    25.000000   8000.000000
# 25%    30.000000  12000.000000
# 50%    35.000000  15000.000000
# 75%    40.000000  18000.000000
# max    45.000000  20000.000000

# 包括所有列的统计
print(df.describe(include='all'))

# 唯一值
print(df['City'].unique())
# ['Beijing' 'Shanghai' 'Guangzhou' 'Shenzhen' 'Hangzhou']

# 唯一值数量
print(df['City'].nunique())  # 5

# 值计数
print(df['Department'].value_counts())
# IT       2
# HR       2
# Finance  1
# Name: Department, dtype: int64

数据选择与索引

列选择

# 选择单列（返回 Series）
name_series = df['Name']
age_series = df['Age']

# 选择多列（返回 DataFrame）
subset = df[['Name', 'Age']]
subset = df[['Name', 'Age', 'Salary']]

# 使用 loc 选择列
df.loc[:, 'Name']
df.loc[:, ['Name', 'Age']]

行选择

# 按位置选择（iloc）
print(df.iloc[0])      # 第一行
print(df.iloc[0:3])    # 前3行
print(df.iloc[-1])     # 最后一行

# 按标签选择（loc）
df_indexed = df.set_index('Name')
print(df_indexed.loc['Alice'])

# 布尔索引
young = df[df['Age'] < 35]
high_salary = df[df['Salary'] > 15000]

loc 和 iloc 详解

# iloc - 基于位置的索引
df.iloc[0]            # 第一行
df.iloc[0:3]          # 第0-2行
df.iloc[:, 0]         # 第一列
df.iloc[0:3, 0:2]     # 前3行，前2列
df.iloc[[0, 2, 4]]    # 指定行

# loc - 基于标签的索引
df.loc[0]             # 索引为0的行
df.loc[0:2]           # 索引0-2的行（包含2）
df.loc[:, 'Name']     # Name列
df.loc[0:2, 'Name':'Age']  # 指定行列范围

# 条件选择
df.loc[df['Age'] > 30]
df.loc[df['Department'] == 'IT', ['Name', 'Salary']]

条件选择

# 单条件
df[df['Age'] > 30]
df[df['City'] == 'Beijing']

# 多条件（& 且，| 或，~ 非）
df[(df['Age'] > 30) & (df['Salary'] > 15000)]
df[(df['Department'] == 'IT') | (df['Department'] == 'HR')]
df[~(df['Age'] < 30)]  # 年龄不小于30

# isin
cities = ['Beijing', 'Shanghai']
df[df['City'].isin(cities)]

# between
df[df['Age'].between(25, 35)]

# str 方法
df[df['Name'].str.startswith('A')]
df[df['City'].str.contains('ang')]

query 方法

# 使用 query 进行条件查询
df.query('Age > 30')
df.query('Age > 30 and Salary > 15000')
df.query('Department in ["IT", "HR"]')
df.query('City == "Beijing" or City == "Shanghai"')

# 使用变量
min_age = 30
df.query('Age > @min_age')

数据清洗

处理缺失值

import pandas as pd
import numpy as np

# Build a sample frame that contains missing values
df = pd.DataFrame({
    'A': [1, 2, np.nan, 4, 5],
    'B': [np.nan, 2, 3, 4, 5],
    'C': [1, 2, 3, np.nan, 5],
    'D': [1, 2, 3, 4, 5]
})

# Detect missing values
print(df.isnull())       # True where a value is missing
print(df.notnull())      # True where a value is present
print(df.isnull().sum()) # number of missing values per column

# Drop missing values
df_drop = df.dropna()                    # drop rows containing any missing value
df_drop = df.dropna(axis=1)              # drop columns containing any missing value
df_drop = df.dropna(how='all')           # drop rows where every value is missing
df_drop = df.dropna(subset=['A', 'B'])   # drop rows missing a value in A or B
df_drop = df.dropna(thresh=3)            # keep rows with at least 3 non-missing values

# Fill missing values
df_filled = df.fillna(0)                 # fill with 0
# fillna(method=...) is deprecated in recent pandas; ffill()/bfill()
# are the direct replacements and work on old and new versions alike.
df_filled = df.ffill()                   # forward fill
df_filled = df.bfill()                   # backward fill
df_filled = df.fillna(df.mean())         # fill each column with its mean
df_filled = df['A'].fillna(df['A'].median())  # fill column A with its median (a Series)

# Interpolate
df_interpolated = df.interpolate()

# Check whether any value is missing at all
print(df.isnull().any().any())  # True/False

处理重复值

# 创建含重复值的数据
df = pd.DataFrame({
    'A': [1, 2, 2, 3, 4, 4],
    'B': ['a', 'b', 'b', 'c', 'd', 'd']
})

# 检测重复值
print(df.duplicated())
# 0    False
# 1    False
# 2     True
# 3    False
# 4    False
# 5     True

# 删除重复值
df_unique = df.drop_duplicates()
df_unique = df.drop_duplicates(subset=['A'])  # 基于指定列
df_unique = df.drop_duplicates(keep='last')   # 保留最后一个
df_unique = df.drop_duplicates(keep=False)    # 删除所有重复

# 统计重复数量
print(df.duplicated().sum())  # 2

数据类型转换

# 查看数据类型
print(df.dtypes)

# 转换数据类型
df['Age'] = df['Age'].astype(int)
df['Salary'] = df['Salary'].astype(float)
df['Date'] = pd.to_datetime(df['Date'])
df['Category'] = df['Category'].astype('category')

# 转换多列
df = df.astype({'Age': int, 'Salary': float})

# 数值转字符串
df['Age_str'] = df['Age'].astype(str)

# 字符串转数值
df['Value'] = pd.to_numeric(df['Value'], errors='coerce')

# 分类类型（节省内存）
df['Department'] = df['Department'].astype('category')
print(df['Department'].cat.categories)

字符串处理

df = pd.DataFrame({
    'Name': ['alice', 'BOB', 'Charlie', 'david'],
    'Email': ['alice@example.com', 'bob@test.com',
              'charlie@example.com', 'david@test.com']
})

# Lower / upper / title case
df['Name_lower'] = df['Name'].str.lower()
df['Name_upper'] = df['Name'].str.upper()
df['Name_title'] = df['Name'].str.title()

# Strip surrounding whitespace
df['Name_strip'] = df['Name'].str.strip()

# Replace
df['Name_replaced'] = df['Name'].str.replace('a', 'A')

# Split
# The sample names contain no space, so split() yields a single column;
# reindex guarantees two columns (the second is NaN when there is no
# surname) so the assignment cannot fail on a column-count mismatch.
name_parts = df['Name'].str.split(' ', n=1, expand=True).reindex(columns=[0, 1])
df['First'] = name_parts[0]
df['Last'] = name_parts[1]

# Extract
df['Domain'] = df['Email'].str.extract(r'@(.+)')

# Containment test
mask = df['Email'].str.contains('example')
df_example = df[mask]

# Length
df['Name_length'] = df['Name'].str.len()

# Prefix / suffix tests
df['StartsWith_a'] = df['Name'].str.startswith('a')
df['EndsWith_e'] = df['Name'].str.endswith('e')

重命名

# 重命名列
df = df.rename(columns={'old_name': 'new_name'})
df = df.rename(columns={'A': 'Column_A', 'B': 'Column_B'})

# 重命名索引
df = df.rename(index={0: 'first', 1: 'second'})

# 重命名所有列
df.columns = ['Col1', 'Col2', 'Col3']

# 使用函数重命名
df = df.rename(columns=str.lower)
df = df.rename(columns=lambda x: x.strip())

数据转换

添加新列

df = pd.DataFrame({
    'Name': ['Alice', 'Bob', 'Charlie'],
    'Age': [25, 30, 35],
    'Salary': [8000, 12000, 15000]
})

# 直接赋值
df['Bonus'] = df['Salary'] * 0.1

# 基于条件
df['Level'] = np.where(df['Salary'] > 10000, 'High', 'Low')

# 使用 apply
def age_group(age):
    """Classify an age into a coarse bracket label.

    Returns 'Young' for ages below 30, 'Middle' for ages from 30 up to
    (but not including) 40, and 'Senior' for everything else.
    """
    if age < 30:
        return 'Young'
    if age < 40:
        return 'Middle'
    return 'Senior'

df['Age_Group'] = df['Age'].apply(age_group)

# 使用 map
level_map = {8000: 'Junior', 12000: 'Mid', 15000: 'Senior'}
df['Level'] = df['Salary'].map(level_map)

# 使用 assign（返回新 DataFrame）
df_new = df.assign(
    Bonus=df['Salary'] * 0.1,
    Tax=df['Salary'] * 0.2
)

apply 函数

# Series 的 apply
df['Age_squared'] = df['Age'].apply(lambda x: x ** 2)

# DataFrame 的 apply（按列）
df[['Age', 'Salary']].apply(np.mean)

# DataFrame 的 apply（按行）
def custom_func(row):
    """Return the salary-to-age ratio of one row.

    ``row`` is looked up by the keys 'Salary' and 'Age'.
    """
    salary = row['Salary']
    age = row['Age']
    return salary / age

df['Salary_per_Age'] = df.apply(custom_func, axis=1)

# 带额外参数
def multiply(x, factor):
    """Scale ``x`` by ``factor`` and return the product."""
    scaled = x * factor
    return scaled

df['Salary_doubled'] = df['Salary'].apply(multiply, factor=2)

map 和 replace

# map - Series 专用
df = pd.DataFrame({'City': ['Beijing', 'Shanghai', 'Guangzhou']})
city_code = {'Beijing': 'BJ', 'Shanghai': 'SH', 'Guangzhou': 'GZ'}
df['City_Code'] = df['City'].map(city_code)

# replace - 替换值
df = pd.DataFrame({'Status': [1, 0, 1, 0, 1]})
df['Status'] = df['Status'].replace({1: 'Active', 0: 'Inactive'})

# 替换多个值
df = df.replace({'A': 1, 'B': 2, 'C': 3})

数据离散化

# cut - 等宽分箱
ages = [20, 25, 30, 35, 40, 45, 50]
bins = [0, 30, 40, 50, 60]
labels = ['Young', 'Middle', 'Old', 'Senior']
age_groups = pd.cut(ages, bins=bins, labels=labels)
print(age_groups)

# qcut - 等频分箱
scores = np.random.randint(0, 100, 100)
quantiles = pd.qcut(scores, q=4, labels=['Q1', 'Q2', 'Q3', 'Q4'])
print(quantiles.value_counts())

数据筛选与查询

基本筛选

df = pd.DataFrame({
    'Name': ['Alice', 'Bob', 'Charlie', 'David', 'Eve'],
    'Age': [25, 30, 35, 40, 45],
    'Salary': [8000, 12000, 15000, 20000, 18000],
    'Department': ['IT', 'HR', 'IT', 'Finance', 'HR']
})

# 单条件筛选
it_dept = df[df['Department'] == 'IT']
high_salary = df[df['Salary'] > 15000]

# 多条件筛选
result = df[(df['Age'] > 30) & (df['Salary'] > 15000)]
result = df[(df['Department'] == 'IT') | (df['Department'] == 'HR')]

# 取反
result = df[~(df['Age'] < 30)]

nlargest 和 nsmallest

# 最大的 N 个值
top3_salary = df.nlargest(3, 'Salary')
top2_age = df.nlargest(2, 'Age')

# 最小的 N 个值
bottom3_salary = df.nsmallest(3, 'Salary')

sample 随机采样

# 随机抽取
sample = df.sample(n=3)           # 抽取3行
sample = df.sample(frac=0.5)      # 抽取50%
sample = df.sample(n=3, random_state=42)  # 固定随机种子

# 加权采样
weights = [0.1, 0.1, 0.1, 0.5, 0.2]
sample = df.sample(n=2, weights=weights)

数据排序

# The frame includes a Department column because the multi-column sort
# below orders by it; without the column sort_values raises KeyError.
df = pd.DataFrame({
    'Name': ['Alice', 'Bob', 'Charlie', 'David'],
    'Age': [25, 30, 35, 40],
    'Salary': [8000, 12000, 15000, 20000],
    'Department': ['IT', 'HR', 'IT', 'HR']
})

# Sort by a single column
df_sorted = df.sort_values('Age')
df_sorted = df.sort_values('Age', ascending=False)  # descending

# Sort by several columns (Department ascending, Salary descending)
df_sorted = df.sort_values(['Department', 'Salary'],
                           ascending=[True, False])

# Sort by index
df_sorted = df.sort_index()
df_sorted = df.sort_index(ascending=False)

# Sort in place
df.sort_values('Age', inplace=True)

# Get the positions that would sort the column, then reorder with them
sorted_indices = df['Salary'].argsort()
df_sorted = df.iloc[sorted_indices]

数据统计分析

基本统计

df = pd.DataFrame({
    'A': [1, 2, 3, 4, 5],
    'B': [10, 20, 30, 40, 50],
    'C': [100, 200, 300, 400, 500]
})

# 求和
print(df.sum())
print(df.sum(axis=1))  # 按行求和

# 平均值
print(df.mean())

# 中位数
print(df.median())

# 标准差
print(df.std())

# 方差
print(df.var())

# 最小值/最大值
print(df.min())
print(df.max())

# 极差
print(df.max() - df.min())

# 众数
print(df.mode())

# 分位数
print(df.quantile(0.25))  # 25% 分位数
print(df.quantile([0.25, 0.5, 0.75]))

# 描述性统计
print(df.describe())

# 相关性
print(df.corr())

# 协方差
print(df.cov())

# 计数
print(df.count())
print(df['A'].value_counts())

分组统计

df = pd.DataFrame({
    'Department': ['IT', 'HR', 'IT', 'Finance', 'HR', 'IT'],
    'Name': ['Alice', 'Bob', 'Charlie', 'David', 'Eve', 'Frank'],
    'Salary': [8000, 12000, 15000, 20000, 18000, 22000],
    'Age': [25, 30, 35, 40, 45, 28]
})

# Group by department
grouped = df.groupby('Department')

# Group statistics.
# The frame has a string column (Name); numeric_only=True restricts the
# reductions to Salary and Age. Without it mean()/std() raise TypeError
# on pandas >= 2.0, and sum() would concatenate the names.
print(grouped.mean(numeric_only=True))
print(grouped.sum(numeric_only=True))
print(grouped.count())
print(grouped.std(numeric_only=True))

# Statistics for specific columns
print(grouped['Salary'].mean())
print(grouped[['Salary', 'Age']].mean())

# Several statistics at once
print(grouped['Salary'].agg(['mean', 'sum', 'count']))

# Custom statistic: salary spread within each department
print(grouped['Salary'].agg(lambda x: x.max() - x.min()))

分组聚合

groupby 基础

df = pd.DataFrame({
    'Department': ['IT', 'HR', 'IT', 'Finance', 'HR', 'IT'],
    'Gender': ['F', 'M', 'M', 'M', 'F', 'F'],
    'Salary': [8000, 12000, 15000, 20000, 18000, 22000],
    'Age': [25, 30, 35, 40, 45, 28]
})

# 单列分组
grouped = df.groupby('Department')

# 多列分组
grouped = df.groupby(['Department', 'Gender'])

# 遍历分组
for name, group in grouped:
    print(f"Group: {name}")
    print(group)
    print()

聚合函数

# 单个聚合函数
result = df.groupby('Department')['Salary'].mean()
result = df.groupby('Department')['Salary'].sum()
result = df.groupby('Department')['Salary'].count()

# 多个聚合函数
result = df.groupby('Department')['Salary'].agg(['mean', 'sum', 'count'])

# 不同列不同聚合
result = df.groupby('Department').agg({
    'Salary': ['mean', 'sum'],
    'Age': ['min', 'max']
})

# 自定义聚合函数
def salary_range(x):
    """Return the spread (maximum minus minimum) of the values in ``x``."""
    highest = x.max()
    lowest = x.min()
    return highest - lowest

result = df.groupby('Department')['Salary'].agg(salary_range)

# 命名聚合
result = df.groupby('Department').agg(
    avg_salary=('Salary', 'mean'),
    total_salary=('Salary', 'sum'),
    emp_count=('Salary', 'count')
)

transform 和 filter

# transform - 保持原 DataFrame 形状
df = pd.DataFrame({
    'Department': ['IT', 'HR', 'IT', 'Finance'],
    'Salary': [8000, 12000, 15000, 20000]
})

# 计算每个部门的平均薪水
df['Dept_Avg_Salary'] = df.groupby('Department')['Salary'].transform('mean')
print(df)
#   Department  Salary  Dept_Avg_Salary
# 0         IT    8000          11500.0
# 1         HR   12000          12000.0
# 2         IT   15000          11500.0
# 3    Finance   20000          20000.0

# 标准化
df['Salary_Standardized'] = (
    df['Salary'] - df.groupby('Department')['Salary'].transform('mean')
) / df.groupby('Department')['Salary'].transform('std')

# filter - 过滤分组
# 只保留员工数大于1的部门
filtered = df.groupby('Department').filter(lambda x: len(x) > 1)

pivot_table 透视表

df = pd.DataFrame({
    'Department': ['IT', 'HR', 'IT', 'Finance', 'HR', 'IT'],
    'Gender': ['F', 'M', 'M', 'M', 'F', 'F'],
    'Salary': [8000, 12000, 15000, 20000, 18000, 22000]
})

# 创建透视表
pivot = pd.pivot_table(
    df,
    values='Salary',
    index='Department',
    columns='Gender',
    aggfunc='mean',
    fill_value=0
)
print(pivot)
# Gender         F       M
# Department
# Finance      0.0  20000.0
# HR       18000.0  12000.0
# IT       15000.0  15000.0

# 多个值和聚合函数
pivot = pd.pivot_table(
    df,
    values=['Salary'],
    index='Department',
    columns='Gender',
    aggfunc=['mean', 'count']
)

crosstab 交叉表

# 交叉表
cross = pd.crosstab(df['Department'], df['Gender'])
print(cross)

# 带汇总
cross = pd.crosstab(df['Department'], df['Gender'], margins=True)

# 归一化
cross = pd.crosstab(df['Department'], df['Gender'], normalize='index')

数据合并与连接

concat 拼接

df1 = pd.DataFrame({
    'A': ['A0', 'A1', 'A2'],
    'B': ['B0', 'B1', 'B2']
})

df2 = pd.DataFrame({
    'A': ['A3', 'A4', 'A5'],
    'B': ['B3', 'B4', 'B5']
})

# 垂直拼接
result = pd.concat([df1, df2])
result = pd.concat([df1, df2], ignore_index=True)  # 重置索引

# 水平拼接
df3 = pd.DataFrame({
    'C': ['C0', 'C1', 'C2'],
    'D': ['D0', 'D1', 'D2']
})
result = pd.concat([df1, df3], axis=1)

# 多个 DataFrame
result = pd.concat([df1, df2, df3], ignore_index=True)

merge 合并

df1 = pd.DataFrame({
    'key': ['K0', 'K1', 'K2', 'K3'],
    'A': ['A0', 'A1', 'A2', 'A3'],
    'B': ['B0', 'B1', 'B2', 'B3']
})

df2 = pd.DataFrame({
    'key': ['K0', 'K1', 'K2', 'K4'],
    'C': ['C0', 'C1', 'C2', 'C3'],
    'D': ['D0', 'D1', 'D2', 'D3']
})

# Inner join (default)
result = pd.merge(df1, df2, on='key')

# Left join
result = pd.merge(df1, df2, on='key', how='left')

# Right join
result = pd.merge(df1, df2, on='key', how='right')

# Outer join
result = pd.merge(df1, df2, on='key', how='outer')

# Multi-key join: every key column must exist in BOTH frames, so a
# second key column is added to copies of the sample frames first.
left_multi = df1.assign(key2=['X', 'X', 'Y', 'Y'])
right_multi = df2.assign(key2=['X', 'X', 'Y', 'Z'])
result = pd.merge(left_multi, right_multi, on=['key', 'key2'])

# Join on differently named key columns
right_renamed = df2.rename(columns={'key': 'rkey'})
result = pd.merge(df1, right_renamed, left_on='key', right_on='rkey')

# Indicator: adds a _merge column telling where each row came from
result = pd.merge(df1, df2, on='key', how='outer', indicator=True)

join 连接

df1 = pd.DataFrame({
    'A': ['A0', 'A1', 'A2'],
    'B': ['B0', 'B1', 'B2']
}, index=['K0', 'K1', 'K2'])

df2 = pd.DataFrame({
    'C': ['C0', 'C1', 'C2'],
    'D': ['D0', 'D1', 'D2']
}, index=['K0', 'K1', 'K2'])

# 基于索引连接
result = df1.join(df2)
result = df1.join(df2, how='left')
result = df1.join(df2, how='right')
result = df1.join(df2, how='outer')

合并方式对比

方式	说明	SQL 等价
inner	只保留匹配的	INNER JOIN
left	保留左表全部	LEFT JOIN
right	保留右表全部	RIGHT JOIN
outer	保留所有	FULL OUTER JOIN

时间序列处理

创建时间序列

import pandas as pd
import numpy as np

# 创建日期范围
dates = pd.date_range('2024-01-01', periods=10, freq='D')
print(dates)

# 不同频率
dates_daily = pd.date_range('2024-01-01', '2024-01-31', freq='D')
dates_monthly = pd.date_range('2024-01-01', periods=12, freq='M')
dates_hourly = pd.date_range('2024-01-01', periods=24, freq='H')

# 创建时间序列 DataFrame
df = pd.DataFrame({
    'date': pd.date_range('2024-01-01', periods=100, freq='D'),
    'value': np.random.randn(100)
})
df = df.set_index('date')

日期时间转换

# 字符串转日期
df = pd.DataFrame({'date': ['2024-01-01', '2024-02-01', '2024-03-01']})
df['date'] = pd.to_datetime(df['date'])

# 指定格式
df['date'] = pd.to_datetime(df['date'], format='%Y-%m-%d')

# 提取日期组件
df['year'] = df['date'].dt.year
df['month'] = df['date'].dt.month
df['day'] = df['date'].dt.day
df['weekday'] = df['date'].dt.weekday
df['dayofweek'] = df['date'].dt.day_name()
df['quarter'] = df['date'].dt.quarter

# 日期运算
df['next_day'] = df['date'] + pd.Timedelta(days=1)
df['last_month'] = df['date'] - pd.DateOffset(months=1)

时间序列操作

# Use the date column as the index
df = df.set_index('date')

# Time-based slicing.
# Selecting rows by a partial date string must go through .loc:
# df['2024-01'] is treated as a column lookup and raises KeyError
# on pandas >= 2.0.
df.loc['2024-01']              # January 2024
df.loc['2024-01-01':'2024-01-31']
df.loc['2024']                 # all of 2024

# Resampling
daily_data = pd.DataFrame({
    'date': pd.date_range('2024-01-01', periods=365, freq='D'),
    'value': np.random.randn(365)
}).set_index('date')

# NOTE: pandas >= 2.2 spells the month-end alias 'ME'; 'M' is kept here
# so the example still runs on older versions.
monthly_avg = daily_data.resample('M').mean()
weekly_sum = daily_data.resample('W').sum()

# Rolling windows
rolling_mean = daily_data.rolling(window=7).mean()
rolling_std = daily_data.rolling(window=30).std()

# Shifting
shifted = daily_data.shift(1)  # shift values down by one row
diff = daily_data.diff()       # first difference

常用频率别名

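别名	说明
D	天
W	周
M	月末（pandas ≥ 2.2 推荐写作 ME）
MS	月初
Q	季末（pandas ≥ 2.2 推荐写作 QE）
Y	年末（pandas ≥ 2.2 推荐写作 YE）
H	小时（pandas ≥ 2.2 推荐写作 h）
T/min	分钟（pandas ≥ 2.2 仅推荐 min）
S	秒（pandas ≥ 2.2 推荐写作 s）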

数据透视表

pivot_table 详解

import pandas as pd
import numpy as np

# 创建示例数据
np.random.seed(42)
df = pd.DataFrame({
    'Date': pd.date_range('2024-01-01', periods=100, freq='D'),
    'Product': np.random.choice(['A', 'B', 'C'], 100),
    'Region': np.random.choice(['North', 'South', 'East', 'West'], 100),
    'Sales': np.random.randint(100, 1000, 100),
    'Quantity': np.random.randint(1, 50, 100)
})

# 基本透视表
pivot = pd.pivot_table(
    df,
    values='Sales',
    index='Product',
    columns='Region',
    aggfunc='sum',
    fill_value=0
)
print(pivot)

# 多个值
pivot = pd.pivot_table(
    df,
    values=['Sales', 'Quantity'],
    index='Product',
    columns='Region',
    aggfunc='sum'
)

# 多个聚合函数
pivot = pd.pivot_table(
    df,
    values='Sales',
    index='Product',
    columns='Region',
    aggfunc=['sum', 'mean', 'count']
)

# 添加总计
pivot = pd.pivot_table(
    df,
    values='Sales',
    index='Product',
    columns='Region',
    aggfunc='sum',
    margins=True,
    margins_name='Total'
)

melt 逆透视

# 宽表转长表
df_wide = pd.DataFrame({
    'Product': ['A', 'B', 'C'],
    '2023': [100, 200, 300],
    '2024': [150, 250, 350]
})

df_long = pd.melt(
    df_wide,
    id_vars=['Product'],
    value_vars=['2023', '2024'],
    var_name='Year',
    value_name='Sales'
)
print(df_long)
#   Product  Year  Sales
# 0       A  2023    100
# 1       B  2023    200
# 2       C  2023    300
# 3       A  2024    150
# 4       B  2024    250
# 5       C  2024    350

可视化

Pandas 集成了 Matplotlib，可以直接绘图。

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# 设置中文显示
plt.rcParams['font.sans-serif'] = ['SimHei']  # Windows
# plt.rcParams['font.sans-serif'] = ['Arial Unicode MS']  # macOS
plt.rcParams['axes.unicode_minus'] = False

# 创建示例数据
df = pd.DataFrame({
    'Month': pd.date_range('2024-01-01', periods=12, freq='M'),
    'Sales': np.random.randint(1000, 5000, 12),
    'Profit': np.random.randint(200, 1000, 12)
})
df = df.set_index('Month')

基本图表

# 折线图
df['Sales'].plot(kind='line', title='Monthly Sales')
plt.show()

# 柱状图
df.plot(kind='bar', y=['Sales', 'Profit'])
plt.show()

# 水平柱状图
df.plot(kind='barh', y='Sales')
plt.show()

# 面积图
df.plot(kind='area', stacked=False)
plt.show()

# 散点图
df.plot(kind='scatter', x='Sales', y='Profit')
plt.show()

# 饼图
df['Sales'].plot(kind='pie', autopct='%1.1f%%')
plt.show()

# 直方图
df['Sales'].plot(kind='hist', bins=10)
plt.show()

# 箱线图
df.plot(kind='box')
plt.show()

高级可视化

# 双轴图
fig, ax1 = plt.subplots()
ax1.plot(df.index, df['Sales'], 'b-', label='Sales')
ax1.set_ylabel('Sales', color='b')
ax2 = ax1.twinx()
ax2.plot(df.index, df['Profit'], 'r-', label='Profit')
ax2.set_ylabel('Profit', color='r')
plt.title('Sales and Profit')
plt.show()

# 子图
df.plot(subplots=True, layout=(2, 1), figsize=(10, 8))
plt.show()

# 分组柱状图
df_grouped = df.groupby(df.index.month)['Sales'].sum()
df_grouped.plot(kind='bar')
plt.xlabel('Month')
plt.ylabel('Sales')
plt.title('Monthly Sales Summary')
plt.show()

性能优化

数据类型优化

# 查看内存使用
print(df.memory_usage(deep=True))

# 优化整数类型
df['small_int'] = df['small_int'].astype('int8')    # -128 to 127
df['medium_int'] = df['medium_int'].astype('int16') # -32768 to 32767

# 优化浮点数
df['float_col'] = df['float_col'].astype('float32')

# 使用分类类型
df['category_col'] = df['category_col'].astype('category')

# 优化前后对比
print("优化前:")
print(df.memory_usage(deep=True).sum() / 1024 ** 2, "MB")

# 优化后
print("优化后:")
print(df.memory_usage(deep=True).sum() / 1024 ** 2, "MB")

向量化操作

# ❌ 避免：使用循环
def slow_way(df):
    """Add columns 'A' and 'B' one row at a time and return a list.

    Kept as the deliberately loop-based counter-example: it walks the
    frame with iterrows() instead of operating on whole columns.
    """
    return [record['A'] + record['B'] for _, record in df.iterrows()]

# ✅ 推荐：向量化
def fast_way(df):
    """Add columns 'A' and 'B' with a single vectorized expression."""
    col_a = df['A']
    col_b = df['B']
    return col_a + col_b

# Middle ground: row-wise apply. It calls a Python function once per
# row, so prefer the vectorized form whenever one exists.
def medium_way(df):
    """Add columns 'A' and 'B' by applying a function to every row."""
    def _row_total(record):
        return record['A'] + record['B']

    return df.apply(_row_total, axis=1)

分块处理大数据

# 分块读取大型 CSV
chunk_size = 10000
chunks = pd.read_csv('large_file.csv', chunksize=chunk_size)

results = []
for chunk in chunks:
    # 处理每个块
    processed = chunk[chunk['value'] > 100]
    results.append(processed)

# 合并结果
result = pd.concat(results, ignore_index=True)

使用 eval 和 query

# ❌ 普通方式
result = df[(df['A'] > 10) & (df['B'] < 20)]

# ✅ 使用 query（更清晰，对大数据更快）
result = df.query('A > 10 and B < 20')

# ✅ 使用 eval 进行复杂计算
df.eval('C = A + B', inplace=True)
df.eval('D = A * B / C', inplace=True)

实战案例

案例 1：销售数据分析

import pandas as pd
import numpy as np

# 创建模拟销售数据
np.random.seed(42)
n_records = 1000

sales_data = pd.DataFrame({
    'Date': pd.date_range('2024-01-01', periods=n_records, freq='D'),
    'Product': np.random.choice(['Product_A', 'Product_B', 'Product_C'], n_records),
    'Region': np.random.choice(['North', 'South', 'East', 'West'], n_records),
    'Salesperson': np.random.choice(['Alice', 'Bob', 'Charlie', 'David'], n_records),
    'Quantity': np.random.randint(1, 100, n_records),
    'Unit_Price': np.random.uniform(10, 100, n_records).round(2)
})

# 计算销售额
sales_data['Revenue'] = sales_data['Quantity'] * sales_data['Unit_Price']

# 基本统计
print("总销售额:", sales_data['Revenue'].sum())
print("平均订单金额:", sales_data['Revenue'].mean())
print("总订单数:", len(sales_data))

# 按产品分析
product_stats = sales_data.groupby('Product').agg({
    'Revenue': ['sum', 'mean', 'count'],
    'Quantity': 'sum'
}).round(2)
print("\n产品销售统计:")
print(product_stats)

# 按地区分析
region_stats = sales_data.groupby('Region')['Revenue'].sum().sort_values(ascending=False)
print("\n地区销售排名:")
print(region_stats)

# 按销售人员分析
salesperson_stats = sales_data.groupby('Salesperson').agg({
    'Revenue': 'sum',
    'Quantity': 'sum'
}).sort_values('Revenue', ascending=False)
print("\n销售人员业绩:")
print(salesperson_stats)

# 月度趋势
sales_data['Month'] = sales_data['Date'].dt.to_period('M')
monthly_sales = sales_data.groupby('Month')['Revenue'].sum()
print("\n月度销售趋势:")
print(monthly_sales)

# Top 10 订单
top_orders = sales_data.nlargest(10, 'Revenue')[['Date', 'Product', 'Salesperson', 'Revenue']]
print("\nTop 10 订单:")
print(top_orders)

案例 2：学生成绩分析

import pandas as pd
import numpy as np

# 创建学生成绩数据
np.random.seed(42)
students = pd.DataFrame({
    'Student_ID': range(1, 101),
    'Name': [f'Student_{i}' for i in range(1, 101)],
    'Class': np.random.choice(['Class_A', 'Class_B', 'Class_C'], 100),
    'Math': np.random.randint(50, 100, 100),
    'English': np.random.randint(50, 100, 100),
    'Science': np.random.randint(50, 100, 100)
})

# 计算总分和平均分
students['Total'] = students[['Math', 'English', 'Science']].sum(axis=1)
students['Average'] = students[['Math', 'English', 'Science']].mean(axis=1)

# 等级划分
def get_grade(score):
    """Map a numeric score to a letter grade.

    Cutoffs are checked from highest to lowest: 90 and above is 'A',
    80 is 'B', 70 is 'C', 60 is 'D'. A score that meets no cutoff
    is graded 'F'.
    """
    cutoffs = ((90, 'A'), (80, 'B'), (70, 'C'), (60, 'D'))
    for minimum, letter in cutoffs:
        if score >= minimum:
            return letter
    return 'F'

students['Grade'] = students['Average'].apply(get_grade)

# 班级统计
class_stats = students.groupby('Class').agg({
    'Math': ['mean', 'max', 'min'],
    'English': ['mean', 'max', 'min'],
    'Science': ['mean', 'max', 'min'],
    'Average': 'mean'
}).round(2)
print("班级统计:")
print(class_stats)

# 科目统计
subject_stats = students[['Math', 'English', 'Science']].describe()
print("\n科目统计:")
print(subject_stats)

# 等级分布
grade_dist = students['Grade'].value_counts()
print("\n等级分布:")
print(grade_dist)

# 优秀学生（平均分 >= 85）
excellent_students = students[students['Average'] >= 85].sort_values('Average', ascending=False)
print(f"\n优秀学生人数: {len(excellent_students)}")
print(excellent_students.head(10))

# 不及格学生
failed_students = students[students['Average'] < 60]
print(f"\n不及格学生人数: {len(failed_students)}")
if len(failed_students) > 0:
    print(failed_students)

# 相关性分析
correlation = students[['Math', 'English', 'Science']].corr()
print("\n科目相关性:")
print(correlation)

案例 3：股票数据分析

import pandas as pd
import numpy as np

# 创建模拟股票数据
np.random.seed(42)
dates = pd.date_range('2024-01-01', periods=252, freq='B')  # 交易日

stock_data = pd.DataFrame({
    'Date': dates,
    'Open': 100 + np.cumsum(np.random.randn(252) * 2),
    'High': 0,
    'Low': 0,
    'Close': 0,
    'Volume': np.random.randint(1000000, 5000000, 252)
})

# 计算 High, Low, Close
stock_data['Close'] = stock_data['Open'] + np.random.randn(252) * 2
stock_data['High'] = stock_data[['Open', 'Close']].max(axis=1) + abs(np.random.randn(252))
stock_data['Low'] = stock_data[['Open', 'Close']].min(axis=1) - abs(np.random.randn(252))

# 设置日期为索引
stock_data = stock_data.set_index('Date')

# 计算日收益率
stock_data['Return'] = stock_data['Close'].pct_change()

# 计算移动平均线
stock_data['MA_5'] = stock_data['Close'].rolling(window=5).mean()
stock_data['MA_20'] = stock_data['Close'].rolling(window=20).mean()
stock_data['MA_60'] = stock_data['Close'].rolling(window=60).mean()

# 基本统计
print("股票基本统计:")
print(f"起始价格: {stock_data['Close'].iloc[0]:.2f}")
print(f"结束价格: {stock_data['Close'].iloc[-1]:.2f}")
print(f"最高价格: {stock_data['High'].max():.2f}")
print(f"最低价格: {stock_data['Low'].min():.2f}")
print(f"平均成交量: {stock_data['Volume'].mean():.0f}")

# 收益率统计
print("\n收益率统计:")
print(f"平均日收益率: {stock_data['Return'].mean():.4f}")
print(f"收益率标准差: {stock_data['Return'].std():.4f}")
print(f"年化收益率: {stock_data['Return'].mean() * 252:.4f}")
print(f"年化波动率: {stock_data['Return'].std() * np.sqrt(252):.4f}")

# 月度表现
stock_data['Month'] = stock_data.index.to_period('M')
monthly_returns = stock_data.groupby('Month')['Return'].sum()
print("\n月度收益率:")
print(monthly_returns)

# 最佳和最差交易日
best_day = stock_data['Return'].nlargest(1)
worst_day = stock_data['Return'].nsmallest(1)
print(f"\n最佳交易日: {best_day.index[0].date()} ({best_day.values[0]:.2%})")
print(f"最差交易日: {worst_day.index[0].date()} ({worst_day.values[0]:.2%})")

案例 4：数据清洗实战

import pandas as pd
import numpy as np

# 创建脏数据
dirty_data = pd.DataFrame({
    'Name': ['Alice', 'bob', 'CHARLIE', 'david', ' Eve ', None, 'Grace'],
    'Age': [25, '30', 35, None, 40, 45, 'invalid'],
    'Email': ['alice@example.com', 'BOB@TEST.COM', 'charlie@example.com',
              None, 'eve@test.com', 'frank@example.com', 'grace@test.com'],
    'Salary': ['8000', '12000', None, '20000', '18000', '22000', '15000'],
    'Join_Date': ['2020-01-15', '2019-06-20', 'invalid', '2021-03-10',
                  '2018-11-25', '2022-07-01', '2020-09-15']
})

print("原始数据:")
print(dirty_data)
print("\n数据类型:")
print(dirty_data.dtypes)

# 1. 处理姓名
dirty_data['Name'] = dirty_data['Name'].str.strip().str.title()
print("\n处理姓名后:")
print(dirty_data['Name'])

# 2. 处理年龄
dirty_data['Age'] = pd.to_numeric(dirty_data['Age'], errors='coerce')
print("\n处理年龄后:")
print(dirty_data['Age'])

# 3. 处理邮箱
dirty_data['Email'] = dirty_data['Email'].str.lower()
print("\n处理邮箱后:")
print(dirty_data['Email'])

# 4. 处理薪水
dirty_data['Salary'] = pd.to_numeric(dirty_data['Salary'], errors='coerce')
print("\n处理薪水后:")
print(dirty_data['Salary'])

# 5. 处理日期
dirty_data['Join_Date'] = pd.to_datetime(dirty_data['Join_Date'], errors='coerce')
print("\n处理日期后:")
print(dirty_data['Join_Date'])

# 6. 处理缺失值
print("\n缺失值统计:")
print(dirty_data.isnull().sum())

# 填充数值列的中位数
dirty_data['Age'] = dirty_data['Age'].fillna(dirty_data['Age'].median())
dirty_data['Salary'] = dirty_data['Salary'].fillna(dirty_data['Salary'].median())

# 删除邮箱为空的行
dirty_data = dirty_data.dropna(subset=['Email'])

# 删除姓名为空的行
dirty_data = dirty_data.dropna(subset=['Name'])

print("\n清洗后的数据:")
print(dirty_data)
print("\n最终数据类型:")
print(dirty_data.dtypes)
print("\n缺失值检查:")
print(dirty_data.isnull().sum())

总结

核心知识点回顾

✅ 数据结构：Series 和 DataFrame
✅ 数据读写：CSV、Excel、JSON、SQL 等格式
✅ 数据查看：head、tail、info、describe
✅ 数据选择：loc、iloc、布尔索引、query
✅ 数据清洗：缺失值、重复值、类型转换
✅ 数据转换：apply、map、replace、离散化
✅ 数据筛选：条件筛选、排序、采样
✅ 统计分析：基本统计、分组聚合
✅ 数据合并：concat、merge、join
✅ 时间序列：日期处理、重采样、滚动窗口
✅ 透视表：pivot_table、crosstab、melt
✅ 可视化：内置绘图功能
✅ 性能优化：数据类型、向量化、分块处理

学习建议

多实践：通过真实数据集练习
理解索引：loc 和 iloc 的区别至关重要
掌握 groupby：分组聚合是数据分析的核心
善用文档：Pandas 官方文档非常详细
结合 NumPy：两者配合使用效果更佳
关注性能：大数据集时注意优化

下一步

掌握 Pandas 后，建议继续学习：

Matplotlib/Seaborn：高级数据可视化
Scikit-learn：机器学习
SQL：数据库查询
Spark：大规模数据处理

参考资料

Pandas 官方文档：pandas.pydata.org/docs/
Pandas 用户指南：pandas.pydata.org/docs/user_guide/index.html
10 Minutes to Pandas：pandas.pydata.org/docs/user_guide/10min.html
Pandas Cookbook：github.com/jvns/pandas-cookbook