1. 向Excel写入内容
# 创建一个表,第一列ID 内容:1、2、3 第二列 Name Tom Danny Boob
df = pd.DataFrame({"ID":[1,2,3],"Name":["Tom","Danny","Boob"]})
# execl存放路径
execl_path = "D:/test/test1.xlsx"
# execl创建
df.to_excel(execl_path)
- DataFrame:
由多种类型的列构成的二维标签数据结构
往往包含index(行标签)和columns(列标签), 彼此独立, 互不影响
直观理解: DataFrame 是带标签的二维数组,一个数据表
- to_excel方法
功能:
用于将pd.DataFrame形式的数据写入Excel中
参数:
to_excel(excel_writer, sheet_name='Sheet1', na_rep='',
float_format=None, columns=None, header=True, index=True,
index_label=None, startrow=0, startcol=0, engine=None,
merge_cells=True, encoding=None, inf_rep='inf', verbose=True,
freeze_panes=None)
常用参数解析 :
excel_writer : 字符串或ExcelWriter 对象,文件路径或现有的ExcelWriter
sheet_name :字符串,默认“Sheet1”,将包含DataFrame的表的名称。
na_rep : 字符串,默认‘ ’,缺失数据表示方式
float_format : 字符串,默认None,格式化浮点数的字符串
columns : 序列,可选,要编写的列
header : 布尔或字符串列表,默认为Ture。写出列名。如果给定字符串列表,则假定它是列名称的别名。
index :布尔,默认的Ture,写行名(索引)
index_label : 字符串或序列,默认为None。如果需要,可以使用索引列的列标签。如果没有给出,标题和索引为true,则使用索引名称。如果数据文件使用多索引,则需使用序列。
startrow :左上角的单元格行来转储数据框
startcol :左上角的单元格列转储数据帧
engine : 字符串,默认没有使用写引擎 - 您也可以通过选项io.excel.xlsx.writer,io.excel.xls.writer和io.excel.xlsm.writer进行设置。
merge_cells : 布尔,默认为Ture编码生成的excel文件。 只有xlwt需要,其他编写者本地支持unicode。
inf_rep : 字符串,默认“正”无穷大的表示(在Excel中不存在无穷大的本地表示)
freeze_panes : 整数的元组(长度2),默认为None。指定要冻结的基于1的最底部行和最右边的列
- Excel展现
A列:DataFrame索引,自动创建
1.1 指定列为索引set_index
案例:将ID设置为索引
# 创建一个表,第一列ID 内容:1、2、3 第二列 Name Tom Danny Boob
df = pd.DataFrame({"ID":[1,2,3],"Name":["Tom","Danny","Boob"]})
# 设置ID列为索引
df = df.set_index("ID")
# execl存放路径
execl_path = "D:/test/test1.xlsx"
# execl创建
df.to_excel(execl_path)
- set_index方法参数介绍
set_index(
keys,
drop=True,
append=False,
inplace=False,
verify_integrity=False
)
参数:
keys:label or array-like or list of labels/arrays,这个是需要设置为索引的列名,可以是单个列名,或者是多个列名
drop:bool, default True,删除要用作新索引的列
append:bool, default False,添加新索引
inplace:bool, default False,是否要覆盖数据集
verify_integrity:bool, default False,检查新索引是否重复。否则,将检查推迟到必要时进行。设置为False将改善此方法的性能
2. 读取excel read_excel方法
import pandas as pd
# read_excel()作用:将Excel文件读取到pandas DataFrame中
# excel 路径
excel_path = "D:/test/test1.xlsx"
read_data = pd.read_excel(excel_path)
# 获取属性 行、列
print(read_data.shape) # (3, 2)
# 获取列的标题
print(read_data.columns) # Index(['ID', 'Name'], dtype='object')
# 打印内容head()默认打印前五行,可在head(2)内指定打印几行
print(read_data.head())
# ID Name
# 0 1 Tom
# 1 2 Danny
# 2 3 Boob
# 3 4 baby
# 4 5 阿斯顿
# 获取excel末尾的行数信息, 默认打印后5行,可指定打印行数 在tail(2)内指定打印几行
print(read_data.tail())
# ID Name
# 18 19 test14
# 19 20 test15
# 20 21 test16
# 21 22 test17
# 22 23 test18
2.1 读取excel内容过滤无用的行
- 存在脏数据的行,或空白
import pandas as pd
excel_path = "D:/test/test1.xlsx"
# 未指定起始的行,默认从第0行读取
# read_data = pd.read_excel(excel_path)
# 获取列的标题
# print(read_data.columns) # Index(['无用的行', '无用的行2'], dtype='object')
# 指定起始的行,指定header从1开始
read_data = pd.read_excel(excel_path, header=1)
# 获取列的标题
print(read_data.columns) # Index(['ID', 'Name'], dtype='object')
- 没有列标题
import pandas as pd
excel_path = "D:/test/test1.xlsx"
# 未指定起始的行,默认从第0行读取
read_data = pd.read_excel(excel_path)
# 获取列的标题
print(read_data.columns) # Index(['无用的行', '无用的行2'], dtype='object')
- 写入到新的excel内
import pandas as pd
excel_path = "D:/test/test1.xlsx"
# header=None 读取excel不需要设置header
read_data = pd.read_excel(excel_path, header=None)
# 获取列的标题
print(read_data.columns) # Index(['ID', 'NAME'], dtype='object')
# 设置标题
read_data.columns = ["ID", "NAME"]
print(read_data.columns) # Int64Index([0, 1], dtype='int64')
# 生成到新的excel内
read_data.to_excel("D:/test/test2.xlsx")
问题一: 索引出现了,如何取消?
import pandas as pd
excel_path = "D:/test/test1.xlsx"
# header=None 读取excel不需要设置header
read_data = pd.read_excel(excel_path, header=None)
# 设置标题
read_data.columns = ["ID", "NAME"]
# 覆盖数据集
read_data.set_index("ID", inplace=True)
# 生成到新的excel内
read_data.to_excel("D:/test/test2.xlsx")
问题二:再次读取excel,索引出现了
import pandas as pd
excel_path = "D:/test/test2.xlsx"
# index_col 指定索引列
read_data = pd.read_excel(excel_path,index_col="ID")
print(read_data)
# 生成到新的excel内
read_data.to_excel("D:/test/test3.xlsx")
3. 向excel内写入内容
- 写入内容
# series是一个一维数组,序列
# pd.Series([list],index=[list]) 参数为list;index为可选参数,若不填写则默认index从0开始;若填写则index长度应该与value长度相等。
# data:写入excel的内容
# index:行号
# name:列名
s1 = pd.Series(data=[1, 2, 3], index=[1, 2, 3], name="A")
s2 = pd.Series(data=[10, 20, 30], index=[1, 2, 3], name="B")
s3 = pd.Series(data=[100, 200, 300], index=[1, 2, 3], name="C")
# 创建一个Datarame表格的数据结构,传入字典格式以key为列,value为值
df = pd.DataFrame(data={s1.name: s1, s2.name: s2, s3.name: s3})
print(df)
# A B C
# 1 1 10 100
# 2 2 20 200
# 3 3 30 300
# 创建一个Datarame表格的数据结构,传入列表格式, 每个序列看作一行,序列的name看作行号
df = pd.DataFrame(data=[s1, s2, s3])
print((df))
# 1 2 3
# A 1 2 3
# B 10 20 30
# C 100 200 300
写入内容进行错位
import pandas as pd
# series是一个一维数组,序列
# pd.Series([list],index=[list]) 参数为list;index为可选参数,若不填写则默认index从0开始;若填写则index长度应该与value长度相等。
# data:写入excel的内容
# index:行号
# name:列名
s1 = pd.Series(data=[1, 2, 3], index=[1, 2, 3], name="A")
s2 = pd.Series(data=[10, 20, 30], index=[1, 2, 3], name="B")
# s3 index 为第2行 3行 4行
s3 = pd.Series(data=[100, 200, 300], index=[2, 3, 4], name="C")
# 创建一个Datarame表格的数据结构,传入字典格式以key为列,value为值
df = pd.DataFrame(data={s1.name: s1, s2.name: s2, s3.name: s3})
print(df)
# NaN:表示没有值
# A B C
# 1 1.0 10.0 NaN
# 2 2.0 20.0 100.0
# 3 3.0 30.0 200.0
# 4 NaN NaN 300.0
4. 自动填充
4.1 读取指定区域的数据
import pandas as pd
excel_path = "D:/test/test4.xlsx"
read_data = pd.read_excel(excel_path)
print(read_data)
# Unnamed: 0 Unnamed: 1 Unnamed: 2 Unnamed: 3 Unnamed: 4 Unnamed: 5
# 0 NaN NaN NaN NaN NaN NaN
# 1 NaN NaN NaN NaN NaN NaN
# 2 NaN NaN ID NAME INSTORE DATE
# 3 NaN NaN NaN BOOK_001 NaN NaN
# 4 NaN NaN NaN BOOK_002 NaN NaN
# 5 NaN NaN NaN BOOK_003 NaN NaN
# 6 NaN NaN NaN BOOK_004 NaN NaN
# skiprows: 跳过前3个空行,从第4行开始读取
read_data = pd.read_excel(excel_path, skiprows=3)
print(read_data)
# Unnamed: 0 Unnamed: 1 Unnamed: 2 Unnamed: 3 Unnamed: 4 Unnamed: 5
# 0 NaN NaN ID NAME INSTORE DATE
# 1 NaN NaN NaN BOOK_001 NaN NaN
# 2 NaN NaN NaN BOOK_002 NaN NaN
# 3 NaN NaN NaN BOOK_003 NaN NaN
# 4 NaN NaN NaN BOOK_004 NaN NaN
# 5 NaN NaN NaN BOOK_005 NaN NaN
# 6 NaN NaN NaN BOOK_006 NaN NaN
# 7 NaN NaN NaN BOOK_007 NaN NaN
# 8 NaN NaN NaN BOOK_008 NaN NaN
# 9 NaN NaN NaN BOOK_009 NaN NaN
# 10 NaN NaN NaN BOOK_010 NaN NaN
# 11 NaN NaN NaN BOOK_011 NaN NaN
# 12 NaN NaN NaN BOOK_012 NaN NaN
# 13 NaN NaN NaN BOOK_013 NaN NaN
# 14 NaN NaN NaN BOOK_014 NaN NaN
# 15 NaN NaN NaN BOOK_015 NaN NaN
# 16 NaN NaN NaN BOOK_016 NaN NaN
# 17 NaN NaN NaN BOOK_017 NaN NaN
# 18 NaN NaN NaN BOOK_018 NaN NaN
# 19 NaN NaN NaN BOOK_019 NaN NaN
# 20 NaN NaN NaN BOOK_020 NaN NaN
# skiprows: 跳过前3个空行,从第4行开始读取
# usecols: 指定看的列 C到F
read_data = pd.read_excel(excel_path, skiprows=3, usecols="C:F")
print(read_data)
# Unnamed: 2 Unnamed: 3 Unnamed: 4 Unnamed: 5
# 0 ID NAME INSTORE DATE
# 1 NaN BOOK_001 NaN NaN
# 2 NaN BOOK_002 NaN NaN
# 3 NaN BOOK_003 NaN NaN
# 4 NaN BOOK_004 NaN NaN
# 5 NaN BOOK_005 NaN NaN
# 6 NaN BOOK_006 NaN NaN
# 7 NaN BOOK_007 NaN NaN
# 8 NaN BOOK_008 NaN NaN
# 9 NaN BOOK_009 NaN NaN
# 10 NaN BOOK_010 NaN NaN
# 11 NaN BOOK_011 NaN NaN
# 12 NaN BOOK_012 NaN NaN
# 13 NaN BOOK_013 NaN NaN
# 14 NaN BOOK_014 NaN NaN
# 15 NaN BOOK_015 NaN NaN
# 16 NaN BOOK_016 NaN NaN
# 17 NaN BOOK_017 NaN NaN
# 18 NaN BOOK_018 NaN NaN
# 19 NaN BOOK_019 NaN NaN
# 20 NaN BOOK_020 NaN NaN
4.2 填充数据
import pandas as pd
from datetime import date, timedelta
excel_path = "D:/test/test4.xlsx"
# skiprows: 跳过前3个空行,从第4行开始读取
# usecols: 指定看的列 C到F
# dtype:表示将ID这一列设置为int类型,默认不设置是浮点类型,这里注意设置nan不支持转int 需要先设置为str
read_data = pd.read_excel(excel_path, skiprows=3, usecols="C:F",dtype={"ID":str, "INSTORE":str, "DATE":str})
print(type(read_data["ID"])) # <class 'pandas.core.series.Series'> 序列类
data = date(2022,8,2)
for i in read_data.index:
# Pandas Series.at属性使我们能够访问行/列标签对的单个值。该属性类似于loc,因为两者都提供了基于标签的查找
read_data["ID"].at[i] = i + 1
read_data["INSTORE"].at[i] = "yes"
read_data["DATE"].at[i] = data + timedelta(days=i)
print(read_data)
# ID NAME INSTORE DATE
# 0 1 BOOK_001 yes 2022-08-02
# 1 2 BOOK_002 yes 2022-08-03
# 2 3 BOOK_003 yes 2022-08-04
# 3 4 BOOK_004 yes 2022-08-05
# 4 5 BOOK_005 yes 2022-08-06
# 5 6 BOOK_006 yes 2022-08-07
# 6 7 BOOK_007 yes 2022-08-08
# 7 8 BOOK_008 yes 2022-08-09
# 8 9 BOOK_009 yes 2022-08-10
# 9 10 BOOK_010 yes 2022-08-11
# 10 11 BOOK_011 yes 2022-08-12
# 11 12 BOOK_012 yes 2022-08-13
# 12 13 BOOK_013 yes 2022-08-14
# 13 14 BOOK_014 yes 2022-08-15
# 14 15 BOOK_015 yes 2022-08-16
# 15 16 BOOK_016 yes 2022-08-17
# 16 17 BOOK_017 yes 2022-08-18
# 17 18 BOOK_018 yes 2022-08-19
# 18 19 BOOK_019 yes 2022-08-20
# 19 20 BOOK_020 yes 2022-08-21
5. 个人项目实战经验
案例:
class ReadCsv(object):
"""读取csv静态表类"""
@property
def read_objects_csv(self) -> list:
"""读取物体静态表"""
objects_df = pd.read_csv(os.path.dirname(os.path.abspath(__file__)) + "/static_data/objects.csv",
encoding='utf-8',
# usecols=[0, 1, 2, 3], # ID Name Type Desc, 可指定读取哪些列
header=0 # 去除表头
)
csv_data_list = []
for line in objects_df.values:
dic = {}
for item, data in zip(objects_df, line.tolist()):
dic[item] = data
csv_data_list.append(dic)
return csv_data_list # [{一行数据}, {列名:列值}]
# 定义查询方法
@staticmethod
def get_line_dict(csv_data_list: list, tid) -> dict:
"""
获取一行的数据
Args:
csv_dict: csv静态表字典数据
Returns:
"""
csv_line_data_dict = [one_dict for one_dict in csv_data_list if one_dict.get('ID') == tid][0]
return csv_line_data_dict # 返回一行数据 {}
if __name__ == '__main__':
rc = ReadCsv()
ob_df = rc.read_objects_csv
print(f"ob_df:{ob_df}" )
data = ReadCsv.get_line_dict(ob_df, 1)
print(f'{data}')