SQLite + Python:轻量级数据库的正确打开方式

3 阅读5分钟

摘要:SQLite是Python内置的数据库,零配置、零依赖,适合中小型项目和数据分析。本文从基础CRUD到性能优化、并发处理、与Pandas集成,全面讲解SQLite在Python中的最佳实践。

为什么选SQLite?

  • 零配置:不需要安装数据库服务器
  • Python内置:import sqlite3 直接用
  • 单文件存储:一个.db文件就是整个数据库,方便备份和迁移
  • 性能够用:百万级数据毫无压力

基础操作

import sqlite3
from contextlib import contextmanager

# 推荐:用上下文管理器自动处理连接
@contextmanager
def get_db(db_path='app.db'):
    conn = sqlite3.connect(db_path)
    conn.row_factory = sqlite3.Row  # 返回字典风格的行
    conn.execute('PRAGMA journal_mode=WAL')  # WAL模式,提升并发性能
    conn.execute('PRAGMA foreign_keys=ON')   # 启用外键约束
    try:
        yield conn
        conn.commit()
    except Exception:
        conn.rollback()
        raise
    finally:
        conn.close()

建表

with get_db() as db:
    db.execute('''
        CREATE TABLE IF NOT EXISTS users (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            name TEXT NOT NULL,
            email TEXT UNIQUE NOT NULL,
            age INTEGER CHECK(age >= 0 AND age <= 150),
            created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
        )
    ''')
    
    db.execute('''
        CREATE TABLE IF NOT EXISTS posts (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            user_id INTEGER NOT NULL,
            title TEXT NOT NULL,
            content TEXT,
            created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
            FOREIGN KEY (user_id) REFERENCES users(id) ON DELETE CASCADE
        )
    ''')
    
    # 创建索引
    db.execute('CREATE INDEX IF NOT EXISTS idx_posts_user ON posts(user_id)')
    db.execute('CREATE INDEX IF NOT EXISTS idx_users_email ON users(email)')

CRUD操作

with get_db() as db:
    # 插入
    db.execute(
        'INSERT INTO users (name, email, age) VALUES (?, ?, ?)',
        ('张三', 'zhang@test.com', 25)
    )
    
    # 批量插入(比循环insert快10倍+)
    users = [
        ('李四', 'li@test.com', 30),
        ('王五', 'wang@test.com', 28),
        ('赵六', 'zhao@test.com', 35),
    ]
    db.executemany(
        'INSERT INTO users (name, email, age) VALUES (?, ?, ?)',
        users
    )
    
    # 查询
    cursor = db.execute('SELECT * FROM users WHERE age > ?', (25,))
    for row in cursor:
        print(dict(row))  # {'id': 2, 'name': '李四', ...}
    
    # 更新
    db.execute('UPDATE users SET age = ? WHERE name = ?', (26, '张三'))
    
    # 删除
    db.execute('DELETE FROM users WHERE id = ?', (1,))

参数化查询(防SQL注入)

# ❌ 危险:字符串拼接
name = "'; DROP TABLE users; --"
db.execute(f"SELECT * FROM users WHERE name = '{name}'")

# ✅ 安全:参数化查询
db.execute("SELECT * FROM users WHERE name = ?", (name,))

# ✅ 命名参数
db.execute(
    "SELECT * FROM users WHERE name = :name AND age > :age",
    {'name': '张三', 'age': 20}
)

高级查询

分页

def get_users_page(db, page=1, per_page=20):
    offset = (page - 1) * per_page
    
    # 总数
    total = db.execute('SELECT COUNT(*) FROM users').fetchone()[0]
    
    # 当前页数据
    rows = db.execute(
        'SELECT * FROM users ORDER BY id DESC LIMIT ? OFFSET ?',
        (per_page, offset)
    ).fetchall()
    
    return {
        'data': [dict(r) for r in rows],
        'total': total,
        'page': page,
        'pages': (total + per_page - 1) // per_page,
    }

聚合和分组

with get_db() as db:
    # 每个用户的文章数
    rows = db.execute('''
        SELECT u.name, COUNT(p.id) as post_count
        FROM users u
        LEFT JOIN posts p ON u.id = p.user_id
        GROUP BY u.id
        ORDER BY post_count DESC
    ''').fetchall()
    
    # 年龄分布统计
    rows = db.execute('''
        SELECT 
            CASE 
                WHEN age < 20 THEN '20以下'
                WHEN age < 30 THEN '20-29'
                WHEN age < 40 THEN '30-39'
                ELSE '40以上'
            END as age_group,
            COUNT(*) as count
        FROM users
        GROUP BY age_group
    ''').fetchall()

全文搜索(FTS5)

with get_db() as db:
    # 创建全文搜索虚拟表
    db.execute('''
        CREATE VIRTUAL TABLE IF NOT EXISTS posts_fts 
        USING fts5(title, content, content=posts, content_rowid=id)
    ''')
    
    # 同步数据
    db.execute('''
        INSERT INTO posts_fts(rowid, title, content)
        SELECT id, title, content FROM posts
    ''')
    
    # 全文搜索
    results = db.execute('''
        SELECT rowid, title, snippet(posts_fts, 1, '<b>', '</b>', '...', 20) as excerpt
        FROM posts_fts
        WHERE posts_fts MATCH ?
        ORDER BY rank
        LIMIT 10
    ''', ('Python 爬虫',)).fetchall()

与Pandas集成

import pandas as pd

# 读取到DataFrame
with get_db() as db:
    df = pd.read_sql_query('SELECT * FROM users', db)
    
    # 带参数的查询
    df = pd.read_sql_query(
        'SELECT * FROM users WHERE age > ?',
        db,
        params=(25,)
    )

# DataFrame写入SQLite
df = pd.DataFrame({
    'name': ['A', 'B', 'C'],
    'value': [1, 2, 3],
})

with get_db() as db:
    df.to_sql('metrics', db, if_exists='replace', index=False)
    
    # 追加模式
    df.to_sql('metrics', db, if_exists='append', index=False)

性能优化

批量插入优化

import time

def benchmark_insert(db, n=100000):
    data = [(f'user_{i}', f'user_{i}@test.com', i % 60 + 18) for i in range(n)]
    
    # 方法1:逐条插入(最慢)
    start = time.time()
    for name, email, age in data:
        db.execute('INSERT INTO users (name, email, age) VALUES (?, ?, ?)', (name, email, age))
    db.commit()
    print(f'逐条插入: {time.time() - start:.2f}s')
    
    # 方法2:executemany(快5-10倍)
    start = time.time()
    db.executemany('INSERT INTO users (name, email, age) VALUES (?, ?, ?)', data)
    db.commit()
    print(f'executemany: {time.time() - start:.2f}s')
    
    # 方法3:事务 + executemany + PRAGMA优化(最快)
    start = time.time()
    db.execute('PRAGMA synchronous=OFF')
    db.execute('PRAGMA temp_store=MEMORY')
    db.executemany('INSERT INTO users (name, email, age) VALUES (?, ?, ?)', data)
    db.commit()
    db.execute('PRAGMA synchronous=FULL')
    print(f'优化后: {time.time() - start:.2f}s')

# 10万条数据大约:
# 逐条插入: 25s
# executemany: 3s
# 优化后: 0.5s

查询优化

with get_db() as db:
    # 查看查询计划
    plan = db.execute('EXPLAIN QUERY PLAN SELECT * FROM users WHERE email = ?', ('test@test.com',)).fetchall()
    for row in plan:
        print(dict(row))
    # 如果显示 SCAN TABLE 说明没用到索引
    # 如果显示 SEARCH TABLE ... USING INDEX 说明用到了索引
    
    # 分析数据库统计信息
    db.execute('ANALYZE')

数据库维护

with get_db() as db:
    # 压缩数据库(删除数据后回收空间)
    db.execute('VACUUM')
    
    # 完整性检查
    result = db.execute('PRAGMA integrity_check').fetchone()
    print(result[0])  # 'ok' 表示正常
    
    # 查看数据库大小
    page_count = db.execute('PRAGMA page_count').fetchone()[0]
    page_size = db.execute('PRAGMA page_size').fetchone()[0]
    print(f'数据库大小: {page_count * page_size / 1024 / 1024:.1f} MB')

实战:简易KV存储

class KVStore:
    """基于SQLite的键值存储,支持过期时间"""
    
    def __init__(self, db_path='kv.db'):
        self.conn = sqlite3.connect(db_path)
        self.conn.execute('PRAGMA journal_mode=WAL')
        self.conn.execute('''
            CREATE TABLE IF NOT EXISTS kv (
                key TEXT PRIMARY KEY,
                value TEXT,
                expires_at REAL
            )
        ''')
        self.conn.commit()
    
    def set(self, key, value, ttl=None):
        import json
        expires = time.time() + ttl if ttl else None
        self.conn.execute(
            'INSERT OR REPLACE INTO kv (key, value, expires_at) VALUES (?, ?, ?)',
            (key, json.dumps(value), expires)
        )
        self.conn.commit()
    
    def get(self, key, default=None):
        import json
        row = self.conn.execute(
            'SELECT value, expires_at FROM kv WHERE key = ?', (key,)
        ).fetchone()
        
        if row is None:
            return default
        
        value, expires = row
        if expires and time.time() > expires:
            self.delete(key)
            return default
        
        return json.loads(value)
    
    def delete(self, key):
        self.conn.execute('DELETE FROM kv WHERE key = ?', (key,))
        self.conn.commit()
    
    def cleanup(self):
        self.conn.execute('DELETE FROM kv WHERE expires_at IS NOT NULL AND expires_at < ?', (time.time(),))
        self.conn.commit()

# 使用
store = KVStore()
store.set('user:1', {'name': '张三', 'age': 25}, ttl=3600)
print(store.get('user:1'))

总结

SQLite的最佳使用场景:

  • 单机应用、桌面软件、移动端
  • 数据分析和原型开发
  • 缓存和本地存储
  • 测试环境替代MySQL/PostgreSQL

不适合的场景:

  • 高并发写入(>100次/秒)
  • 多机器共享数据库
  • 数据量超过100GB

用好SQLite,很多场景根本不需要部署MySQL或PostgreSQL,开发效率直接翻倍。