SQLAlchemy 教程
我们在数据分析的时候,会经常使用到关系型数据库,这就不可避免的需要用到sql语言。但是很多人并不是很熟悉这门语言,而且使用它其实会使我们的项目变得难以维护,所以我们需要用到SQLAlchemy。 他提供了一个Python的方式操作数据库的方法。
安装
pip install sqlalchemy
连接
基本语法是
import sqlalchemy as db
engine = db.create_engine('dialect+driver://user:pass@host:port/db')
你可以在这里看到更多连接的方式docs.sqlalchemy.org/en/14/core/…
这里我就使用过了最简单的数据库sqllite
这里是数据库的两个测试表格和测试数据
CREATE TABLE student (
id INTEGER NOT NULL UNIQUE PRIMARY KEY autoincrement ,
name VARCHAR(20) NOT NULL ,
sex VARCHAR(4) ,
birth YEAR,
department VARCHAR(20) ,
address VARCHAR(50)
);
CREATE TABLE score (
id INTEGER NOT NULL UNIQUE PRIMARY KEY autoincrement ,
stu_id INT(10) NOT NULL ,
c_name VARCHAR(20) ,
grade INT(10)
);
INSERT INTO student VALUES( 901,'张老大', '男',1985,'计算机系', '北京市海淀区');
INSERT INTO student VALUES( 902,'张老二', '男',1986,'中文系', '北京市昌平区');
INSERT INTO student VALUES( 903,'张三', '女',1990,'中文系', '湖南省永州市');
INSERT INTO student VALUES( 904,'李四', '男',1990,'英语系', '辽宁省阜新市');
INSERT INTO student VALUES( 905,'王五', '女',1991,'英语系', '福建省厦门市');
INSERT INTO student VALUES( 906,'王六', '男',1988,'计算机系', '湖南省衡阳市');
INSERT INTO score (stu_id, c_name, grade) VALUES(901, '计算机',98);
INSERT INTO score (stu_id, c_name, grade) VALUES(901, '英语', 80);
INSERT INTO score (stu_id, c_name, grade) VALUES(902, '计算机',65);
INSERT INTO score (stu_id, c_name, grade) VALUES(902, '中文',88);
INSERT INTO score (stu_id, c_name, grade) VALUES(903, '中文',95);
INSERT INTO score (stu_id, c_name, grade) VALUES(904, '计算机',70);
INSERT INTO score (stu_id, c_name, grade) VALUES(904, '英语',92);
INSERT INTO score (stu_id, c_name, grade) VALUES(905, '英语',94);
INSERT INTO score (stu_id, c_name, grade) VALUES(906, '计算机',90);
INSERT INTO score (stu_id, c_name, grade) VALUES(906, '英语',85);
# 具体连接方式
import sqlalchemy as db
engine = db.create_engine("sqlite:///data/school")
connection = engine.connect()
metadata = db.MetaData()
student = db.Table("student", metadata, autoload=True, autoload_with=engine)
print(student.columns.keys())
['id', 'name', 'sex', 'birth', 'department', 'address']
metadata.tables['student']
Table('student', MetaData(), Column('id', INTEGER(), table=<student>, primary_key=True, nullable=False), Column('name', VARCHAR(length=20), table=<student>, nullable=False), Column('sex', VARCHAR(length=4), table=<student>), Column('birth', NUMERIC(), table=<student>), Column('department', VARCHAR(length=20), table=<student>), Column('address', VARCHAR(length=50), table=<student>), schema=None)
查询
select * from student
query = db.select([student])
result_proxy = connection.execute(query)
result_set = result_proxy.fetchall()
result_set[:3]
/tmp/ipykernel_125029/3419018909.py:2: SAWarning: Dialect sqlite+pysqlite does *not* support Decimal objects natively, and SQLAlchemy must convert from floating point - rounding errors and other issues may occur. Please consider storing Decimal numbers as strings or integers on this platform for lossless storage.
result_proxy = connection.execute(query)
[(901, '张老大', '男', Decimal('1985.0000000000'), '计算机系', '北京市海淀区'),
(902, '张老二', '男', Decimal('1986.0000000000'), '中文系', '北京市昌平区'),
(903, '张三', '女', Decimal('1990.0000000000'), '中文系', '湖南省永州市')]
还可以使用.fetchmany()
获取任意条数据,这样对处理大量的数据是有好的
flag = True
while flag:
partial_results = result_proxy.fetchmany(2)
if (partial_results == []):
flag = False
print("finished")
else:
print(partial_results)
result_proxy.close()
finished
## 将数据转化为dataframe
import pandas as pd
df = pd.DataFrame(result_set)
df.columns = result_set[0].keys()
df.head()
.dataframe tbody tr th:only-of-type {
vertical-align: middle;
}
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
id | name | sex | birth | department | address | |
---|---|---|---|---|---|---|
0 | 901 | 张老大 | 男 | 1985.0000000000 | 计算机系 | 北京市海淀区 |
1 | 902 | 张老二 | 男 | 1986.0000000000 | 中文系 | 北京市昌平区 |
2 | 903 | 张三 | 女 | 1990.0000000000 | 中文系 | 湖南省永州市 |
3 | 904 | 李四 | 男 | 1990.0000000000 | 英语系 | 辽宁省阜新市 |
4 | 905 | 王五 | 女 | 1991.0000000000 | 英语系 | 福建省厦门市 |
过滤数据
where
select * from student where id > 903
query = db.select([student.columns.id, student.columns.name]).where(student.columns.id > 903)
connection.execute(query).fetchall()
[(904, '李四'), (905, '王五'), (906, '王六')]
in
select s.id, s.name from student s where s.department in ("计算机系", "中文系")
query = db.select([student.columns.id, student.columns.name])\
.where(student.columns.department.in_(["计算机系", "中文系"]))
connection.execute(query).fetchall()
[(901, '张老大'), (902, '张老二'), (903, '张三'), (906, '王六')]
and, or , not
select s.id, s.name
where s.department in ("计算机系", "中文系") and s.sex = '男'
query = db.select([student.columns.id, student.columns.name])\
.where(db.and_(student.columns.department.in_(["计算机系", "中文系"])), student.columns.sex == '男')
connection.execute(query).fetchall()
[(901, '张老大'), (902, '张老二'), (906, '王六')]
order by
select * from student s
order by s.id desc, s.birth
query = db.select([student]).order_by(db.desc(student.columns.id), student.columns.birth)
connection.execute(query).fetchall()
[(906, '王六', '男', Decimal('1988.0000000000'), '计算机系', '湖南省衡阳市'),
(905, '王五', '女', Decimal('1991.0000000000'), '英语系', '福建省厦门市'),
(904, '李四', '男', Decimal('1990.0000000000'), '英语系', '辽宁省阜新市'),
(903, '张三', '女', Decimal('1990.0000000000'), '中文系', '湖南省永州市'),
(902, '张老二', '男', Decimal('1986.0000000000'), '中文系', '北京市昌平区'),
(901, '张老大', '男', Decimal('1985.0000000000'), '计算机系', '北京市海淀区')]
sum
select sum(s.id) from stuent s
query = db.select([db.func.sum(student.columns.id)])
connection.execute(query).fetchall()
[(5421,)]
group by
select sum(s.id) id_count, s.sex from student s
query = db.select([db.func.sum(student.columns.id).label("s_id"), student.columns.sex]).group_by(student.columns.sex)
connection.execute(query).fetchall()
[(1808, '女'), (3613, '男')]
distinct
select distinct sex from student
query = db.select([student.columns.sex.distinct()])
connection.execute(query).fetchall()
[('男',), ('女',)]
case
使用case方法计算女生的数量
select sum(num)
from (select case
when sex = '女' then 1
else 0 end as num
from student
);
female_pop = db.func.sum(db.case([(student.columns.sex == '女', 1)], else_=0))
total_pop = db.cast(db.func.count(student.columns.id), db.Float) # 需要转换成浮点数,要不0.3的int型就是0
query = db.select([female_pop / total_pop * 100])
connection.execute(query).scalar()
33.33333333333333
join
select student.*, score.* where score.stu_id = student.id
import sqlalchemy as db
import pandas as pd
engine = db.create_engine("sqlite:///data/school")
connection = engine.connect()
metadata = db.MetaData()
student = db.Table("student", metadata, autoload=True, autoload_with=engine)
score = db.Table("score", metadata, autoload=True, autoload_with=engine)
/tmp/ipykernel_125029/343074732.py:9: SAWarning: Could not instantiate type <class 'sqlalchemy.sql.sqltypes.INTEGER'> with reflected arguments ['10']; using no arguments.
score = db.Table("score", metadata, autoload=True, autoload_with=engine)
# 方法一
query = db.select([student, score]).filter(score.columns.stu_id==student.columns.id)
connection.execute(query).fetchall()
/tmp/ipykernel_125029/2986192374.py:3: SAWarning: Dialect sqlite+pysqlite does *not* support Decimal objects natively, and SQLAlchemy must convert from floating point - rounding errors and other issues may occur. Please consider storing Decimal numbers as strings or integers on this platform for lossless storage.
connection.execute(query).fetchall()
[(901, '张老大', '男', Decimal('1985.0000000000'), '计算机系', '北京市海淀区', 1, 901, '计算机', 98),
(901, '张老大', '男', Decimal('1985.0000000000'), '计算机系', '北京市海淀区', 2, 901, '英语', 80),
(902, '张老二', '男', Decimal('1986.0000000000'), '中文系', '北京市昌平区', 3, 902, '计算机', 65),
(902, '张老二', '男', Decimal('1986.0000000000'), '中文系', '北京市昌平区', 4, 902, '中文', 88),
(903, '张三', '女', Decimal('1990.0000000000'), '中文系', '湖南省永州市', 5, 903, '中文', 95),
(904, '李四', '男', Decimal('1990.0000000000'), '英语系', '辽宁省阜新市', 6, 904, '计算机', 70),
(904, '李四', '男', Decimal('1990.0000000000'), '英语系', '辽宁省阜新市', 7, 904, '英语', 92),
(905, '王五', '女', Decimal('1991.0000000000'), '英语系', '福建省厦门市', 8, 905, '英语', 94),
(906, '王六', '男', Decimal('1988.0000000000'), '计算机系', '湖南省衡阳市', 9, 906, '计算机', 90),
(906, '王六', '男', Decimal('1988.0000000000'), '计算机系', '湖南省衡阳市', 10, 906, '英语', 85)]
# 方法二
query = db.select([student, score]).join(score, score.columns.stu_id == student.columns.id)
connection.execute(query).fetchall()
[(901, '张老大', '男', Decimal('1985.0000000000'), '计算机系', '北京市海淀区', 1, 901, '计算机', 98),
(901, '张老大', '男', Decimal('1985.0000000000'), '计算机系', '北京市海淀区', 2, 901, '英语', 80),
(902, '张老二', '男', Decimal('1986.0000000000'), '中文系', '北京市昌平区', 3, 902, '计算机', 65),
(902, '张老二', '男', Decimal('1986.0000000000'), '中文系', '北京市昌平区', 4, 902, '中文', 88),
(903, '张三', '女', Decimal('1990.0000000000'), '中文系', '湖南省永州市', 5, 903, '中文', 95),
(904, '李四', '男', Decimal('1990.0000000000'), '英语系', '辽宁省阜新市', 6, 904, '计算机', 70),
(904, '李四', '男', Decimal('1990.0000000000'), '英语系', '辽宁省阜新市', 7, 904, '英语', 92),
(905, '王五', '女', Decimal('1991.0000000000'), '英语系', '福建省厦门市', 8, 905, '英语', 94),
(906, '王六', '男', Decimal('1988.0000000000'), '计算机系', '湖南省衡阳市', 9, 906, '计算机', 90),
(906, '王六', '男', Decimal('1988.0000000000'), '计算机系', '湖南省衡阳市', 10, 906, '英语', 85)]
创建表并插入数据
create table emp
(
Id INTEGER,
name VARCHAR(255) not null,
salary FLOAT,
active BOOLEAN
);
insert into emp (Id, name, salary, active)
VALUES (1, 'naveen', 60000, true),
(2, 'ram', 80000, false);
emp = db.Table('emp', metadata,
db.Column('Id', db.Integer()),
db.Column('name', db.String(255), nullable=False),
db.Column('salary', db.Float(), default=100.0),
db.Column('active', db.Boolean(), default=True)
)
metadata.create_all(engine)
query = db.insert(emp).values(Id=1, name='naveen', salary=60000.00, active=True)
connection.execute(query)
<sqlalchemy.engine.cursor.LegacyCursorResult at 0x7f7a6433ec70>
#Inserting many records at ones
query = db.insert(emp)
values_list = [{'Id':'2', 'name':'ram', 'salary':80000, 'active':False},
{'Id':'3', 'name':'ramesh', 'salary':70000, 'active':True}]
ResultProxy = connection.execute(query,values_list)
results = connection.execute(db.select([emp])).fetchall()
df = pd.DataFrame(results)
df.columns = results[0].keys()
df.head(4)
.dataframe tbody tr th:only-of-type {
vertical-align: middle;
}
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
Id | name | salary | active | |
---|---|---|---|---|
0 | 1 | naveen | 60000.0 | True |
1 | 2 | ram | 80000.0 | False |
2 | 3 | ramesh | 70000.0 | True |
update
update emp set salary = 103 where emp.id = 1
query = db.update(emp).values(salary = 103.0).where(emp.columns.Id == 1)
connection.execute(query)
<sqlalchemy.engine.cursor.LegacyCursorResult at 0x7f7a25fb8ac0>
--
delete
delete from emp where salary < 200;
query = db.delete(emp).where(emp.columns.salary < 200)
connection.execute(query)
<sqlalchemy.engine.cursor.LegacyCursorResult at 0x7f7a25f7d8b0>
drop 删除表
drop table emp
emp.drop(engine)
metadata.drop_all(engine) # 删除所有的数据
结语
祝大家技术更进一层楼。