Python 正则表达式

0 阅读13分钟

Python 正则表达式

目录


正则表达式基础

什么是正则表达式?

正则表达式(Regular Expression)是一种强大的文本模式匹配工具,用于:

  • 搜索和替换文本
  • 验证输入格式
  • 提取特定信息
  • 数据清洗和处理
import re

# Simple example: check whether a string contains digits.
text = "我的电话号码是 138-1234-5678"
pattern = r"\d+"  # one or more consecutive digits

match = re.search(pattern, text)
if match:
    print(f"找到数字: {match.group()}")  # first run of digits: 138

为什么使用正则表达式?

# ❌ 不使用正则:复杂且易错
def is_email_manual(email):
    """Validate an email address by hand, without regex.

    Same checks as before: exactly one '@', non-empty local and
    domain parts, and at least one '.' in the domain.
    """
    local_part, sep, domain_part = email.partition("@")
    if not sep:
        # No '@' at all.
        return False
    if "@" in domain_part:
        # More than one '@' overall.
        return False
    if not local_part or not domain_part:
        return False
    return "." in domain_part

# ✅ 使用正则:简洁清晰
def is_email_regex(email):
    """Validate an email address with one regular expression."""
    return re.match(r'^[\w.-]+@[\w.-]+\.\w+$', email) is not None

re 模块简介

主要函数

import re

# 1. re.match() — match only at the beginning of the string
result = re.match(r'\d+', '123abc')
print(result.group())  # 123

# 2. re.search() — scan the whole string for the first match
result = re.search(r'\d+', 'abc123def')
print(result.group())  # 123

# 3. re.findall() — list of all non-overlapping matches
results = re.findall(r'\d+', 'a1b2c3')
print(results)  # ['1', '2', '3']

# 4. re.finditer() — lazy iterator of Match objects
for match in re.finditer(r'\d+', 'a1b2c3'):
    print(f"位置 {match.start()}-{match.end()}: {match.group()}")

# 5. re.sub() — replace every match
result = re.sub(r'\d+', 'X', 'a1b2c3')
print(result)  # aXbXcX

# 6. re.split() — split the string on the pattern
result = re.split(r'[,\s]+', 'apple,banana orange grape')
print(result)  # ['apple', 'banana', 'orange', 'grape']

# 7. re.compile() — precompile a pattern for reuse
pattern = re.compile(r'\d+')
result = pattern.search('abc123')
print(result.group())  # 123

Match 对象

import re

text = "Email: test@example.com, Phone: 138-1234-5678"
pattern = r'(\w+)@(\w+\.\w+)'  # group 1: local part, group 2: domain

match = re.search(pattern, text)

if match:
    print(f"完整匹配: {match.group(0)}")     # test@example.com
    print(f"第一组: {match.group(1)}")        # test
    print(f"第二组: {match.group(2)}")        # example.com
    print(f"所有组: {match.groups()}")        # ('test', 'example.com')
    print(f"起始位置: {match.start()}")       # 7
    print(f"结束位置: {match.end()}")         # 23
    print(f"跨度: {match.span()}")            # (7, 23)
    print(f"原始字符串: {match.string}")      # the full input string

基本语法

普通字符

import re

# 字母和数字直接匹配
print(re.search(r'hello', 'say hello world').group())  # hello
print(re.search(r'123', 'abc123def').group())          # 123

# 特殊字符需要转义
print(re.search(r'\.', 'file.txt').group())   # .
print(re.search(r'\?', 'what?').group())      # ?
print(re.search(r'\*', '5*3').group())        # *

元字符

import re

# . 匹配任意字符(除换行符)
print(re.findall(r'a.c', 'abc axc a1c'))  # ['abc', 'axc', 'a1c']

# ^ 匹配字符串开头
print(re.findall(r'^Hello', 'Hello World'))  # ['Hello']

# $ 匹配字符串结尾
print(re.findall(r'World$', 'Hello World'))  # ['World']

# | 或运算
print(re.findall(r'cat|dog', 'I have a cat and a dog'))  # ['cat', 'dog']

# \ 转义字符
print(re.findall(r'\d+', 'Price: $99.99'))  # ['99', '99']

字符类

预定义字符类

import re

text = "ABC abc 123 !@#"

# \d - 数字 [0-9]
print(re.findall(r'\d', text))    # ['1', '2', '3']

# \D - 非数字
print(re.findall(r'\D', text)[:5])  # ['A', 'B', 'C', ' ', 'a']

# \w - 单词字符 [a-zA-Z0-9_]
print(re.findall(r'\w', text)[:5])  # ['A', 'B', 'C', 'a', 'b']

# \W - 非单词字符
print(re.findall(r'\W', text))    # [' ', ' ', ' ', '!', '@', '#']

# \s - 空白字符 [ \t\n\r\f\v]
print(re.findall(r'\s', "a b\tc"))  # [' ', '\t']

# \S - 非空白字符
print(re.findall(r'\S', "a b"))   # ['a', 'b']

自定义字符类

import re

# [] 定义字符集合
print(re.findall(r'[aeiou]', 'hello world'))  # ['e', 'o', 'o']

# 范围
print(re.findall(r'[a-z]', 'ABC abc 123'))    # ['a', 'b', 'c']
print(re.findall(r'[0-9]', 'abc123'))          # ['1', '2', '3']

# 取反
print(re.findall(r'[^0-9]', 'abc123'))         # ['a', 'b', 'c']

# 组合
print(re.findall(r'[a-zA-Z0-9]', 'abc ABC 123'))  # ['a','b','c','A','B','C','1','2','3']

# 特殊字符在字符类中不需要转义(大部分)
print(re.findall(r'[.*+?]', 'a.b*c+d?'))        # ['.', '*', '+', '?']

常用字符类示例

import re

# 匹配十六进制颜色
colors = "#FF5733 #abc #123456 #GGG"
hex_colors = re.findall(r'#[0-9a-fA-F]{6}\b', colors)
print(hex_colors)  # ['#FF5733', '#123456']

# 匹配中文
text = "Hello 你好 World 世界"
chinese = re.findall(r'[\u4e00-\u9fff]+', text)
print(chinese)  # ['你好', '世界']

# 匹配身份证号
id_card = "身份证:110101199001011234"
ids = re.findall(r'\d{17}[\dXx]', id_card)
print(ids)  # ['110101199001011234']

量词

基本量词

import re

# * 零次或多次
print(re.findall(r'ab*c', 'ac abc abbc abbbc'))  # ['ac', 'abc', 'abbc', 'abbbc']

# + 一次或多次
print(re.findall(r'ab+c', 'ac abc abbc'))         # ['abc', 'abbc']

# ? 零次或一次
print(re.findall(r'colou?r', 'color colour'))     # ['color', 'colour']

# {n} 恰好 n 次
print(re.findall(r'\d{4}', '2024-01-15'))         # ['2024', '01', '15']

# {n,} 至少 n 次
print(re.findall(r'\d{2,}', '1 22 333 4444'))     # ['22', '333', '4444']

# {n,m} n 到 m 次
print(re.findall(r'\d{2,4}', '1 22 333 4444 55555'))  # ['22', '333', '4444', '5555']

贪婪 vs 非贪婪

import re

text = "<div>content1</div><div>content2</div>"

# Greedy matching (the default): .* consumes as much as possible
greedy = re.findall(r'<div>.*</div>', text)
print(greedy)  # ['<div>content1</div><div>content2</div>']

# Non-greedy matching (append ?): .*? consumes as little as possible
non_greedy = re.findall(r'<div>.*?</div>', text)
print(non_greedy)  # ['<div>content1</div>', '<div>content2</div>']

# Other non-greedy quantifiers (note the zero-width empty matches below)
print(re.findall(r'\d+?', '12345'))   # ['1', '2', '3', '4', '5']
print(re.findall(r'\d*?', '12345'))   # ['', '', '', '', '', '']
print(re.findall(r'\d??', '12345'))   # ['', '', '', '', '', '']

量词对比表

| 量词 | 含义 | 示例 | 匹配 |
| --- | --- | --- | --- |
| `*` | 0 或更多 | `ab*c` | ac, abc, abbc |
| `+` | 1 或更多 | `ab+c` | abc, abbc |
| `?` | 0 或 1 | `colou?r` | color, colour |
| `{n}` | 恰好 n 次 | `\d{4}` | 1234 |
| `{n,}` | 至少 n 次 | `\d{2,}` | 12, 123 |
| `{n,m}` | n 到 m 次 | `\d{2,4}` | 12, 123, 1234 |

边界匹配

单词边界

import re

text = "The cat scattered the catalog"

# \b 单词边界
print(re.findall(r'\bcat\b', text))      # ['cat']
print(re.findall(r'\bcat', text))        # ['cat', 'cat']

# \B 非单词边界
print(re.findall(r'\Bcat\B', text))      # []
print(re.findall(r'\Bcat', text))        # ['cat'] (scattered)

字符串边界

import re

text = "Hello\nWorld\nPython"

# ^ 和 $ 默认只匹配首尾
print(re.findall(r'^\w+', text))         # ['Hello']
print(re.findall(r'\w+$', text))         # ['Python']

# MULTILINE 标志:每行的首尾
print(re.findall(r'^\w+', text, re.MULTILINE))    # ['Hello', 'World', 'Python']
print(re.findall(r'\w+$', text, re.MULTILINE))    # ['Hello', 'World', 'Python']

# \A 和 \Z 始终匹配整个字符串的首尾
print(re.findall(r'\A\w+', text))        # ['Hello']
print(re.findall(r'\w+\Z', text))        # ['Python']

分组和捕获

基本分组

import re

# () 创建分组
text = "John Doe, age 25"
pattern = r'(\w+) (\w+), age (\d+)'

match = re.search(pattern, text)
if match:
    print(f"全名: {match.group(0)}")     # John Doe, age 25
    print(f"名: {match.group(1)}")       # John
    print(f"姓: {match.group(2)}")       # Doe
    print(f"年龄: {match.group(3)}")     # 25
    print(f"所有组: {match.groups()}")   # ('John', 'Doe', '25')

命名分组

import re

text = "2024-01-15"
pattern = r'(?P<year>\d{4})-(?P<month>\d{2})-(?P<day>\d{2})'

match = re.search(pattern, text)
if match:
    print(f"年: {match.group('year')}")     # 2024
    print(f"月: {match.group('month')}")    # 01
    print(f"日: {match.group('day')}")      # 15

    # 通过名称访问
    print(match.groupdict())  # {'year': '2024', 'month': '01', 'day': '15'}

非捕获分组

import re

# (?:...) 非捕获分组(不保存匹配结果)
text = "www.example.com"

# 捕获分组
result1 = re.findall(r'(www|ftp)\.(\w+)\.(\w+)', text)
print(result1)  # [('www', 'example', 'com')]

# 非捕获分组
result2 = re.findall(r'(?:www|ftp)\.(\w+)\.(\w+)', text)
print(result2)  # [('example', 'com')]

反向引用

import re

# \1, \2 refer back to earlier capture groups
text = "hello hello world world"

# Find words repeated back-to-back
pattern = r'\b(\w+)\s+\1\b'
matches = re.findall(pattern, text)
print(matches)  # ['hello', 'world']

# Collapse each repeated word into one occurrence
result = re.sub(r'\b(\w+)\s+\1\b', r'\1', text)
print(result)  # hello world

# Match paired tags: the closing tag must equal the opening one
html = "<div>content</div><span>text</span>"
pattern = r'<(\w+)>(.*?)</\1>'
matches = re.findall(pattern, html)
print(matches)  # [('div', 'content'), ('span', 'text')]

前瞻和后顾

前瞻(Lookahead)

import re

# (?=...) 正向前瞻:后面必须是...
text = "Windows 10, Windows 11, Linux"
pattern = r'Windows (?=\d+)'
matches = re.findall(pattern, text)
print(matches)  # ['Windows', 'Windows']

# (?!...) 负向前瞻:后面不能是...
pattern = r'Windows (?!\d+)'
matches = re.findall(pattern, text)
print(matches)  # []

后顾(Lookbehind)

import re

# (?<=...) 正向后顾:前面必须是...
text = "$100 €200 ¥300"
pattern = r'(?<=\$)\d+'
matches = re.findall(pattern, text)
print(matches)  # ['100']

# (?<!...) 负向后顾:前面不能是...
pattern = r'(?<!\$)\d+'
matches = re.findall(pattern, text)
print(matches)  # ['200', '300']

实际应用

import re

# Password strength check (>= 8 chars containing lower, upper and digit)
# implemented with three lookahead assertions
password = "Password123"
pattern = r'^(?=.*[a-z])(?=.*[A-Z])(?=.*\d).{8,}$'
print(bool(re.match(pattern, password)))  # True

# Extract the host part of a URL: everything after '://' up to the next '/'
url = "https://www.example.com/path/to/page"
pattern = r'(?<=://)[^/]+'
domain = re.search(pattern, url)
print(domain.group())  # www.example.com

# Insert thousands separators: match the zero-width position with a digit
# before it and a multiple of three digits up to the end after it
amount = "1234567890"
formatted = re.sub(r'(?<=\d)(?=(\d{3})+$)', ',', amount)
print(formatted)  # 1,234,567,890

编译正则表达式

compile() 的优势

import re
import time

# 未编译:每次都要解析正则
start = time.time()
for _ in range(10000):
    re.search(r'\d+', 'abc123def')
print(f"未编译: {time.time() - start:.4f}秒")

# 已编译:只需解析一次
pattern = re.compile(r'\d+')
start = time.time()
for _ in range(10000):
    pattern.search('abc123def')
print(f"已编译: {time.time() - start:.4f}秒")

编译选项

import re

# IGNORECASE — case-insensitive matching
pattern = re.compile(r'hello', re.IGNORECASE)
print(pattern.findall('Hello HELLO hello'))  # ['Hello', 'HELLO', 'hello']

# MULTILINE — ^ and $ also match at every line boundary
pattern = re.compile(r'^\w+', re.MULTILINE)
print(pattern.findall('line1\nline2\nline3'))  # ['line1', 'line2', 'line3']

# DOTALL — . also matches newline characters
pattern = re.compile(r'.+', re.DOTALL)
print(pattern.findall('line1\nline2'))  # ['line1\nline2']

# VERBOSE — whitespace and inline comments are allowed in the pattern
pattern = re.compile(r'''
    \d{4}     # 年
    -         # 分隔符
    \d{2}     # 月
    -         # 分隔符
    \d{2}     # 日
''', re.VERBOSE)
print(pattern.search('2024-01-15').group())  # 2024-01-15

# Combine several flags with bitwise OR
pattern = re.compile(r'hello', re.IGNORECASE | re.MULTILINE)

常用函数详解

match() vs search()

import re

text = "abc123def"

# match() 只从开头匹配
print(re.match(r'\d+', text))      # None
print(re.match(r'[a-z]+', text))   # <Match object>

# search() 搜索整个字符串
print(re.search(r'\d+', text))     # <Match object>
print(re.search(r'[a-z]+', text))  # <Match object>

findall() vs finditer()

import re

text = "Phone: 138-1234-5678, Alt: 139-8765-4321"

# findall() 返回列表
phones = re.findall(r'\d{3}-\d{4}-\d{4}', text)
print(phones)  # ['138-1234-5678', '139-8765-4321']

# finditer() 返回迭代器(更节省内存)
for match in re.finditer(r'(\d{3})-(\d{4})-(\d{4})', text):
    print(f"完整: {match.group(0)}, 区号: {match.group(1)}")

sub() 高级用法

import re

# Basic replacement
text = "Color: red, green, blue"
result = re.sub(r'red|green|blue', 'COLOR', text)
print(result)  # Color: COLOR, COLOR, COLOR

# Replacement via a callback: it receives the Match and returns new text
def uppercase_match(match):
    return match.group().upper()

result = re.sub(r'\b\w+\b', uppercase_match, "hello world")
print(result)  # HELLO WORLD

# Limit the number of replacements
text = "aaa bbb ccc"
result = re.sub(r'\w+', 'X', text, count=2)
print(result)  # X X ccc

# Backreferences in the replacement string
text = "2024-01-15"
result = re.sub(r'(\d{4})-(\d{2})-(\d{2})', r'\2/\3/\1', text)
print(result)  # 01/15/2024

split() 高级用法

import re

# 基本分割
text = "apple,banana;orange grape"
result = re.split(r'[,\s;]+', text)
print(result)  # ['apple', 'banana', 'orange', 'grape']

# 保留分隔符
text = "apple,banana;orange"
result = re.split(r'([,;])', text)
print(result)  # ['apple', ',', 'banana', ';', 'orange']

# 限制分割次数
text = "one,two,three,four"
result = re.split(r',', text, maxsplit=2)
print(result)  # ['one', 'two', 'three,four']

实战示例

邮箱验证

import re

def validate_email(email):
    """Return True if *email* matches a basic address pattern."""
    email_re = re.compile(r'^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}$')
    return email_re.match(email) is not None

# 测试
emails = [
    "user@example.com",
    "test.user@domain.co.uk",
    "invalid@",
    "@example.com",
    "user@.com"
]

for email in emails:
    status = "✓" if validate_email(email) else "✗"
    print(f"{status} {email}")

# 输出:
# ✓ user@example.com
# ✓ test.user@domain.co.uk
# ✗ invalid@
# ✗ @example.com
# ✗ user@.com

手机号提取

import re

def extract_phones(text):
    """Extract mainland-China mobile numbers from *text*.

    Fix: the original pattern ``1[3-9]\\d{9}`` could not match the
    separator-written numbers the surrounding example claims to find
    (e.g. "159-8765-4321", "186 1234 5678").  Numbers may now use a
    single '-' or space between the usual 3-4-4 digit groups; every
    result is returned in normalized 11-digit form.
    """
    # 11 digits starting 13x-19x, with optional '-'/space group separators.
    pattern = r'1[3-9]\d[-\s]?\d{4}[-\s]?\d{4}'
    matches = re.findall(pattern, text)
    # Normalize: strip separators so results are plain 11-digit strings.
    return [re.sub(r'[-\s]', '', m) for m in matches]

text = """
联系人:张三 13812345678
李四电话:159-8765-4321
王五手机:186 1234 5678
无效号码:12345678901
"""

phones = extract_phones(text)
print("找到的手机号:")
for phone in phones:
    print(f"  {phone}")

# 输出:
# 找到的手机号:
#   13812345678
#   15987654321
#   18612345678

HTML 标签提取

import re

def extract_links(html):
    """Collect {'url', 'text'} dicts for every <a> tag in *html*.

    Nested tags inside the anchor text are stripped.
    """
    anchor_re = re.compile(
        r'<a\s+href=["\']([^"\']+)["\'][^>]*>(.*?)</a>',
        re.IGNORECASE | re.DOTALL,
    )
    tag_re = re.compile(r'<[^>]+>')
    return [
        {'url': url, 'text': tag_re.sub('', inner).strip()}
        for url, inner in anchor_re.findall(html)
    ]

html = """
<a href="https://example.com">Example</a>
<a href="/page" class="link">Page <strong>Title</strong></a>
<a href='https://test.org'>Test</a>
"""

links = extract_links(html)
for link in links:
    print(f"URL: {link['url']}, 文本: {link['text']}")

# 输出:
# URL: https://example.com, 文本: Example
# URL: /page, 文本: Page Title
# URL: https://test.org, 文本: Test

日志解析

import re
from datetime import datetime

def parse_log_line(line):
    """Parse one log line into a dict, or return None on mismatch.

    Expected format: "YYYY-MM-DD HH:MM:SS [LEVEL] message".
    The timestamp field is converted to a datetime object.
    """
    log_re = re.compile(
        r'(?P<timestamp>\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}) '
        r'\[(?P<level>\w+)\] (?P<message>.+)'
    )
    match = log_re.match(line)
    if match is None:
        return None
    fields = match.groupdict()
    fields['timestamp'] = datetime.strptime(fields['timestamp'], '%Y-%m-%d %H:%M:%S')
    return fields

log_lines = [
    "2024-01-15 10:30:00 [INFO] Application started",
    "2024-01-15 10:30:05 [ERROR] Database connection failed",
    "2024-01-15 10:30:10 [WARNING] High memory usage"
]

for line in log_lines:
    parsed = parse_log_line(line)
    if parsed:
        print(f"[{parsed['level']}] {parsed['timestamp']}: {parsed['message']}")

IP 地址验证

import re

def validate_ip(ip):
    """Return True if *ip* is a dotted-quad IPv4 address (octets 0-255)."""
    match = re.match(r'^(\d{1,3})\.(\d{1,3})\.(\d{1,3})\.(\d{1,3})$', ip)
    if match is None:
        return False
    # \d already guarantees non-negative, so only the upper bound matters.
    return all(int(octet) <= 255 for octet in match.groups())

# 测试
ips = [
    "192.168.1.1",
    "255.255.255.255",
    "0.0.0.0",
    "256.1.1.1",
    "192.168.1",
    "abc.def.ghi.jkl"
]

for ip in ips:
    status = "✓" if validate_ip(ip) else "✗"
    print(f"{status} {ip}")

性能优化

编译正则表达式

import re
import time

# ❌ 不推荐:重复编译
def slow_search(text, patterns):
    """Collect every match of each raw pattern string in *text*.

    Each call re-parses the pattern strings; compare fast_search,
    which takes precompiled patterns.
    """
    return [m for pat in patterns for m in re.findall(pat, text)]

# ✅ 推荐:预先编译
def fast_search(text, compiled_patterns):
    """Collect every match of each precompiled pattern in *text*."""
    return [m for pat in compiled_patterns for m in pat.findall(text)]

# 使用
patterns = [r'\d+', r'[a-z]+', r'\w+']
compiled = [re.compile(p) for p in patterns]

text = "abc123 def456 ghi789" * 1000
result = fast_search(text, compiled)

避免灾难性回溯

import re

# ❌ Dangerous: nested quantifiers like (a+)+ can cause catastrophic
# backtracking when the overall match fails.
# NOTE(review): CPython's re engine backtracks iteratively here, so this
# typically burns CPU time rather than raising RecursionError — confirm
# before relying on the except clause below.
text = "aaaaaaaaaaaaaaaaaaaa!"
try:
    result = re.match(r'(a+)+b', text)
except RecursionError:
    print("递归错误!")

# ✅ Safe: an equivalent pattern without nested quantifiers.
# NOTE(review): this particular text contains no 'b', so result is None.
result = re.match(r'a+b', text)  # simple and effective

使用适当的方法

import re

text = "large text with many words" * 10000

# ❌ findall builds the whole result list just to test existence
if re.findall(r'pattern', text):
    pass

# ✅ search stops at the first match — more efficient
if re.search(r'pattern', text):
    pass

# ❌ findall scans everything when only the first match is needed
# (and indexing [0] raises IndexError when there is no match at all)
first = re.findall(r'pattern', text)[0]

# ✅ use search and check the Match object instead
match = re.search(r'pattern', text)
if match:
    first = match.group()

综合实战

实战1: 数据清洗工具

"""
数据清洗工具
展示正则表达式在数据清洗中的应用
"""

import re
from typing import List, Dict

class DataCleaner:
    """Namespace of regex-based text-cleaning utilities (all static)."""

    @staticmethod
    def remove_html_tags(text: str) -> str:
        """Strip anything that looks like an HTML/XML tag."""
        return re.sub(r'<[^>]+>', '', text)

    @staticmethod
    def normalize_whitespace(text: str) -> str:
        """Collapse runs of whitespace to single spaces and trim the ends."""
        text = re.sub(r'\s+', ' ', text)
        return text.strip()

    @staticmethod
    def extract_emails(text: str) -> List[str]:
        """Return the unique email addresses found in *text* (unordered)."""
        pattern = r'[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}'
        return list(set(re.findall(pattern, text)))

    @staticmethod
    def extract_urls(text: str) -> List[str]:
        """Return the unique http/https URLs found in *text* (unordered)."""
        pattern = r'https?://[^\s<>"{}|\\^`\[\]]+'
        return list(set(re.findall(pattern, text)))

    @staticmethod
    def extract_phone_numbers(text: str) -> List[str]:
        """Return the unique mainland-China mobile numbers (unordered)."""
        # 11 digits starting 13x-19x.
        pattern = r'1[3-9]\d{9}'
        return list(set(re.findall(pattern, text)))

    @staticmethod
    def mask_sensitive_info(text: str) -> str:
        """Mask emails, mobile numbers and 18-digit ID numbers in *text*."""
        # Email: keep the first two local-part characters and the domain.
        text = re.sub(
            r'([a-zA-Z0-9._%+-]{2})[a-zA-Z0-9._%+-]*(@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,})',
            r'\1***\2',
            text
        )

        # Mobile number: keep the first three and last four digits.
        text = re.sub(
            r'(1[3-9]\d)\d{4}(\d{4})',
            r'\1****\2',
            text
        )

        # 18-digit national ID: keep the first six and last four digits.
        text = re.sub(
            r'(\d{6})\d{8}(\d{4})',
            r'\1********\2',
            text
        )

        return text

    @staticmethod
    def normalize_date(text: str) -> str:
        """Rewrite common date formats to ISO YYYY-MM-DD."""
        # YYYY/MM/DD -> YYYY-MM-DD
        text = re.sub(r'(\d{4})/(\d{2})/(\d{2})', r'\1-\2-\3', text)

        # DD-MM-YYYY -> YYYY-MM-DD
        text = re.sub(r'(\d{2})-(\d{2})-(\d{4})', r'\3-\2-\1', text)

        return text

    @staticmethod
    def extract_hashtags(text: str) -> List[str]:
        """Return hashtag names (without '#'); supports CJK characters."""
        pattern = r'#([\w\u4e00-\u9fff]+)'
        return re.findall(pattern, text)

    @staticmethod
    def clean_text(text: str, options: Optional[Dict[str, bool]] = None) -> str:
        """Run a configurable cleaning pipeline over *text*.

        Recognized option keys: remove_html (default True),
        normalize_whitespace (default True), normalize_date (default False).
        """
        if options is None:
            options = {}

        # BUG FIX: this is a @staticmethod, so there is no ``self``; the
        # original ``self.remove_html_tags(...)`` calls raised NameError.
        # Call through the class instead.
        if options.get('remove_html', True):
            text = DataCleaner.remove_html_tags(text)

        if options.get('normalize_whitespace', True):
            text = DataCleaner.normalize_whitespace(text)

        if options.get('normalize_date', False):
            text = DataCleaner.normalize_date(text)

        return text

# 使用示例
def main():
    """Demo: exercise DataCleaner extraction, masking and cleaning."""
    cleaner = DataCleaner()

    # Sample HTML text containing emails, phones, URLs, dates, hashtags.
    text = """
    <p>联系邮箱: john.doe@example.com, jane@test.org</p>
    <p>电话: 13812345678, 15987654321</p>
    <p>访问我们的网站: https://www.example.com</p>
    <p>发布日期: 2024/01/15 或 15-01-2024</p>
    <p>#Python #编程 #教程</p>
    """

    print("="*60)
    print("原始文本:")
    print(text)
    print("="*60)

    # Extract structured information.
    print("\n提取的邮箱:")
    for email in cleaner.extract_emails(text):
        print(f"  {email}")

    print("\n提取的 URL:")
    for url in cleaner.extract_urls(text):
        print(f"  {url}")

    print("\n提取的手机号:")
    for phone in cleaner.extract_phone_numbers(text):
        print(f"  {phone}")

    print("\n提取的话题标签:")
    for tag in cleaner.extract_hashtags(text):
        print(f"  #{tag}")

    # Mask sensitive data.
    print("\n脱敏后的文本:")
    masked = cleaner.mask_sensitive_info(text)
    print(masked)

    # Run the full cleaning pipeline.
    print("\n清洗后的文本:")
    cleaned = cleaner.clean_text(text)
    print(cleaned)

if __name__ == "__main__":
    main()

实战2: 配置文件解析器

"""
配置文件解析器
展示正则表达式在配置解析中的应用
"""

import re
from typing import Dict, Any, Optional

class ConfigParser:
    """Minimal INI-style configuration parser built on regular expressions.

    Supports ``[section]`` headers, ``key = value`` pairs, '#'/';'
    comments, optional surrounding quotes on values, and typed accessors.
    """

    def __init__(self):
        # section name -> {key: value}; populated by parse().
        self.config: Dict[str, Dict[str, str]] = {}

    def _parse_section_body(self, section: str, body: str) -> None:
        """Parse the ``key = value`` lines of one section into self.config."""
        for line in body.split('\n'):
            line = line.strip()

            # Skip blank lines and '#'/';' comment lines.
            if not line or line.startswith('#') or line.startswith(';'):
                continue

            match = re.match(r'^(\w+)\s*=\s*(.+)$', line)
            if match:
                key = match.group(1).strip()
                value = match.group(2).strip()

                # Drop one pair of surrounding quotes, if present.
                value = re.sub(r'^["\'](.*)["\']$', r'\1', value)

                self.config[section][key] = value

    def parse(self, content: str) -> Dict[str, Dict[str, str]]:
        """Parse INI-style *content*; returns {section: {key: value}}.

        Keys appearing before the first ``[section]`` header land in the
        'DEFAULT' section.
        """
        sections = re.split(r'^\[([^\]]+)\]', content, flags=re.MULTILINE)

        # BUG FIX: the original created a 'DEFAULT' section but discarded
        # everything before the first header (sections[0] was never read).
        self.config['DEFAULT'] = {}
        self._parse_section_body('DEFAULT', sections[0])

        # re.split yields [pre, name1, body1, name2, body2, ...].
        for i in range(1, len(sections), 2):
            section_name = sections[i].strip()
            section_content = sections[i + 1] if i + 1 < len(sections) else ""

            self.config[section_name] = {}
            self._parse_section_body(section_name, section_content)

        return self.config

    def get(self, section: str, key: str, default: Optional[str] = None) -> Optional[str]:
        """Return the raw string value, or *default* if absent."""
        return self.config.get(section, {}).get(key, default)

    def get_int(self, section: str, key: str, default: int = 0) -> int:
        """Return the value as int; *default* on absence or parse failure."""
        value = self.get(section, key)
        if value is None:
            return default
        try:
            return int(value)
        except ValueError:
            return default

    def get_bool(self, section: str, key: str, default: bool = False) -> bool:
        """Return the value as bool ('true'/'yes'/'1'/'on' are truthy)."""
        value = self.get(section, key)
        if value is None:
            return default
        return value.lower() in ('true', 'yes', '1', 'on')

    def get_list(self, section: str, key: str, separator: str = ',') -> list:
        """Return the value split on *separator* and stripped; [] if absent."""
        value = self.get(section, key)
        if value is None:
            return []
        return [item.strip() for item in value.split(separator)]

# 使用示例
def main():
    """Demo: parse a sample config and show the typed accessors."""
    config_content = """
# 数据库配置
[database]
host = localhost
port = 5432
name = mydb
user = "admin"
password = 'secret123'

# 服务器配置
[server]
host = 0.0.0.0
port = 8080
debug = true
workers = 4

# 日志配置
[logging]
level = INFO
file = /var/log/app.log
max_size = 10MB
backup_count = 5

# 功能开关
[features]
enable_cache = yes
enable_auth = true
allowed_origins = http://localhost,https://example.com
"""

    parser = ConfigParser()
    config = parser.parse(config_content)

    print("="*60)
    print("配置解析结果:")
    print("="*60)

    # Dump every non-empty section.
    for section, values in config.items():
        if values:
            print(f"\n[{section}]")
            for key, value in values.items():
                print(f"  {key} = {value}")

    print("\n" + "="*60)
    print("类型转换示例:")
    print("="*60)

    print(f"数据库端口 (int): {parser.get_int('database', 'port')}")
    print(f"调试模式 (bool): {parser.get_bool('server', 'debug')}")
    print(f"工作进程数 (int): {parser.get_int('server', 'workers')}")
    print(f"允许的源 (list): {parser.get_list('features', 'allowed_origins')}")

if __name__ == "__main__":
    main()

实战3: 代码统计工具

"""
代码统计工具
展示正则表达式在代码分析中的应用
"""

import re
import os
from pathlib import Path
from typing import Dict, List
from collections import defaultdict

class CodeAnalyzer:
    """Collect per-file source statistics: line kinds, defs, classes, imports.

    Results accumulate in ``self.stats``, keyed by file path string.
    """

    def __init__(self, file_extensions: List[str] = None):
        # Which extensions to analyze; defaults to Python/JS/Java.
        self.file_extensions = file_extensions or ['.py', '.js', '.java']
        # filepath -> stats dict; defaultdict gives zeroed entries on demand.
        self.stats = defaultdict(lambda: {
            'lines': 0,
            'code_lines': 0,
            'comment_lines': 0,
            'blank_lines': 0,
            'functions': 0,
            'classes': 0,
            'imports': 0
        })

    def analyze_file(self, filepath: Path) -> Dict:
        """Analyze a single file and record its stats.

        Returns the stats dict, or None when the extension is not tracked
        (note: None despite the ``Dict`` annotation).
        """
        ext = filepath.suffix
        if ext not in self.file_extensions:
            return None

        # errors='ignore' silently drops undecodable bytes.
        with open(filepath, 'r', encoding='utf-8', errors='ignore') as f:
            content = f.read()
            lines = content.split('\n')

        stats = {
            'lines': len(lines),
            'code_lines': 0,
            'comment_lines': 0,
            'blank_lines': 0,
            'functions': 0,
            'classes': 0,
            'imports': 0
        }

        # Classify every line as blank, comment, or code.
        for line in lines:
            stripped = line.strip()

            if not stripped:
                stats['blank_lines'] += 1
            elif self._is_comment(stripped, ext):
                stats['comment_lines'] += 1
            else:
                stats['code_lines'] += 1

        # Count defs/classes/imports with language-specific regexes.
        if ext == '.py':
            stats['functions'] = len(re.findall(r'^\s*def\s+\w+', content, re.MULTILINE))
            stats['classes'] = len(re.findall(r'^\s*class\s+\w+', content, re.MULTILINE))
            stats['imports'] = len(re.findall(r'^(?:import|from)\s+', content, re.MULTILINE))
        elif ext == '.js':
            stats['functions'] = len(re.findall(r'\bfunction\s+\w+', content))
            stats['classes'] = len(re.findall(r'\bclass\s+\w+', content))
            stats['imports'] = len(re.findall(r'^(?:import|require)\s+', content, re.MULTILINE))

        self.stats[str(filepath)] = stats
        return stats

    def _is_comment(self, line: str, ext: str) -> bool:
        """Return True if the (already stripped) line is a full-line comment.

        Only lines *starting* with a comment marker are detected; trailing
        comments on code lines count as code.
        """
        if ext == '.py':
            return line.startswith('#')
        elif ext in ['.js', '.java']:
            return line.startswith('//') or line.startswith('/*') or line.startswith('*')
        return False

    def analyze_directory(self, directory: Path) -> Dict:
        """Recursively analyze every file under *directory*; return all stats."""
        # analyze_file itself skips files with untracked extensions.
        for filepath in directory.rglob('*'):
            if filepath.is_file():
                self.analyze_file(filepath)

        return dict(self.stats)

    def print_report(self):
        """Print a per-file table plus aggregate totals to stdout."""
        total_stats = {
            'files': len(self.stats),
            'lines': 0,
            'code_lines': 0,
            'comment_lines': 0,
            'blank_lines': 0,
            'functions': 0,
            'classes': 0,
            'imports': 0
        }

        print("="*80)
        print(f"{'文件':<40} {'行数':>6} {'代码':>6} {'注释':>6} {'空白':>6}")
        print("="*80)

        # Per-file rows, sorted by path, while accumulating totals.
        for filepath, stats in sorted(self.stats.items()):
            filename = Path(filepath).name
            print(f"{filename:<40} {stats['lines']:>6} {stats['code_lines']:>6} "
                  f"{stats['comment_lines']:>6} {stats['blank_lines']:>6}")

            total_stats['lines'] += stats['lines']
            total_stats['code_lines'] += stats['code_lines']
            total_stats['comment_lines'] += stats['comment_lines']
            total_stats['blank_lines'] += stats['blank_lines']
            total_stats['functions'] += stats['functions']
            total_stats['classes'] += stats['classes']
            total_stats['imports'] += stats['imports']

        print("="*80)
        print(f"{'总计':<40} {total_stats['lines']:>6} {total_stats['code_lines']:>6} "
              f"{total_stats['comment_lines']:>6} {total_stats['blank_lines']:>6}")
        print("="*80)

        print(f"\n代码行数: {total_stats['code_lines']}")
        print(f"注释行数: {total_stats['comment_lines']}")
        print(f"空白行数: {total_stats['blank_lines']}")
        print(f"函数数量: {total_stats['functions']}")
        print(f"类数量: {total_stats['classes']}")
        print(f"导入语句: {total_stats['imports']}")

        # Guard against division by zero when nothing was analyzed.
        if total_stats['lines'] > 0:
            comment_ratio = total_stats['comment_lines'] / total_stats['lines'] * 100
            print(f"注释率: {comment_ratio:.1f}%")

# 使用示例
def main():
    """Demo: analyze all .py files under the current directory."""
    analyzer = CodeAnalyzer(['.py'])

    # Analyze the current directory tree.
    current_dir = Path('.')
    analyzer.analyze_directory(current_dir)

    # Print the aggregate report.
    analyzer.print_report()

if __name__ == "__main__":
    main()

小结

| 概念 | 说明 | 示例 |
| --- | --- | --- |
| 字符类 | 匹配特定字符集合 | `\d`, `\w`, `[abc]` |
| 量词 | 指定匹配次数 | `*`, `+`, `?`, `{n,m}` |
| 边界 | 匹配位置 | `^`, `$`, `\b` |
| 分组 | 捕获和引用 | `()`, `(?:)`, `(?P<name>)` |
| 前瞻后顾 | 零宽断言 | `(?=)`, `(?!)`, `(?<=)`, `(?<!)` |
| 标志 | 修改匹配行为 | `re.I`, `re.M`, `re.S` |

核心要点

  • 正则表达式是强大的文本处理工具
  • 理解基本语法和元字符
  • 掌握分组和反向引用
  • 注意贪婪和非贪婪匹配
  • 编译正则提高性能
  • 避免灾难性回溯
  • 合理使用在线工具测试
  • 复杂逻辑考虑其他方案

常用资源

  • Regex101 - 在线测试工具
  • RegExr - 学习和测试
  • Python 官方文档 - re 模块

掌握正则表达式将极大提升你的文本处理能力!