Python 正则表达式
目录
正则表达式基础
什么是正则表达式?
正则表达式(Regular Expression)是一种强大的文本模式匹配工具,用于:
- 搜索和替换文本
- 验证输入格式
- 提取特定信息
- 数据清洗和处理
import re
# 简单示例:检查字符串是否包含数字
text = "我的电话号码是 138-1234-5678"
pattern = r"\d+" # 匹配一个或多个数字
match = re.search(pattern, text)
if match:
print(f"找到数字: {match.group()}") # 找到数字: 138
为什么使用正则表达式?
# ❌ 不使用正则:复杂且易错
def is_email_manual(email):
    """Naive email check without regex.

    Requires exactly one '@', a non-empty local part, and a non-empty
    domain that contains at least one '.'.
    """
    local, sep, domain = email.partition("@")
    # partition found no '@', or a second '@' remains in the domain
    if not sep or "@" in domain:
        return False
    if not local or not domain:
        return False
    return "." in domain
# ✅ 使用正则:简洁清晰
def is_email_regex(email):
    """Validate an email address with a single regex pattern."""
    email_re = re.compile(r'^[\w.-]+@[\w.-]+\.\w+$')
    return email_re.match(email) is not None
re 模块简介
主要函数
import re
# 1. re.match() - 从字符串开头匹配
result = re.match(r'\d+', '123abc')
print(result.group()) # 123
# 2. re.search() - 搜索整个字符串
result = re.search(r'\d+', 'abc123def')
print(result.group()) # 123
# 3. re.findall() - 查找所有匹配
results = re.findall(r'\d+', 'a1b2c3')
print(results) # ['1', '2', '3']
# 4. re.finditer() - 返回迭代器
for match in re.finditer(r'\d+', 'a1b2c3'):
print(f"位置 {match.start()}-{match.end()}: {match.group()}")
# 5. re.sub() - 替换
result = re.sub(r'\d+', 'X', 'a1b2c3')
print(result) # aXbXcX
# 6. re.split() - 分割
result = re.split(r'[,\s]+', 'apple,banana orange grape')
print(result) # ['apple', 'banana', 'orange', 'grape']
# 7. re.compile() - 编译正则
pattern = re.compile(r'\d+')
result = pattern.search('abc123')
print(result.group()) # 123
Match 对象
import re
text = "Email: test@example.com, Phone: 138-1234-5678"
pattern = r'(\w+)@(\w+\.\w+)'
match = re.search(pattern, text)
if match:
print(f"完整匹配: {match.group(0)}") # test@example.com
print(f"第一组: {match.group(1)}") # test
print(f"第二组: {match.group(2)}") # example.com
print(f"所有组: {match.groups()}") # ('test', 'example.com')
print(f"起始位置: {match.start()}") # 7
print(f"结束位置: {match.end()}") # 23
print(f"跨度: {match.span()}") # (7, 23)
print(f"原始字符串: {match.string}") # 完整文本
基本语法
普通字符
import re
# 字母和数字直接匹配
print(re.search(r'hello', 'say hello world').group()) # hello
print(re.search(r'123', 'abc123def').group()) # 123
# 特殊字符需要转义
print(re.search(r'\.', 'file.txt').group()) # .
print(re.search(r'\?', 'what?').group()) # ?
print(re.search(r'\*', '5*3').group()) # *
元字符
import re
# . 匹配任意字符(除换行符)
print(re.findall(r'a.c', 'abc axc a1c')) # ['abc', 'axc', 'a1c']
# ^ 匹配字符串开头
print(re.findall(r'^Hello', 'Hello World')) # ['Hello']
# $ 匹配字符串结尾
print(re.findall(r'World$', 'Hello World')) # ['World']
# | 或运算
print(re.findall(r'cat|dog', 'I have a cat and a dog')) # ['cat', 'dog']
# \ 转义字符
print(re.findall(r'\d+', 'Price: $99.99')) # ['99', '99']
字符类
预定义字符类
import re
text = "ABC abc 123 !@#"
# \d - 数字 [0-9]
print(re.findall(r'\d', text)) # ['1', '2', '3']
# \D - 非数字
print(re.findall(r'\D', text)[:5]) # ['A', 'B', 'C', ' ', 'a']
# \w - 单词字符 [a-zA-Z0-9_]
print(re.findall(r'\w', text)[:5]) # ['A', 'B', 'C', 'a', 'b']
# \W - 非单词字符
print(re.findall(r'\W', text)) # [' ', ' ', ' ', '!', '@', '#']
# \s - 空白字符 [ \t\n\r\f\v]
print(re.findall(r'\s', "a b\tc")) # [' ', '\t']
# \S - 非空白字符
print(re.findall(r'\S', "a b")) # ['a', 'b']
自定义字符类
import re
# [] 定义字符集合
print(re.findall(r'[aeiou]', 'hello world')) # ['e', 'o', 'o']
# 范围
print(re.findall(r'[a-z]', 'ABC abc 123')) # ['a', 'b', 'c']
print(re.findall(r'[0-9]', 'abc123')) # ['1', '2', '3']
# 取反
print(re.findall(r'[^0-9]', 'abc123')) # ['a', 'b', 'c']
# 组合
print(re.findall(r'[a-zA-Z0-9]', 'abc ABC 123')) # ['a','b','c','A','B','C','1','2','3']
# 特殊字符在字符类中不需要转义(大部分)
print(re.findall(r'[.*+?]', 'a.b*c+d?')) # ['.', '*', '+', '?']
常用字符类示例
import re
# 匹配十六进制颜色
colors = "#FF5733 #abc #123456 #GGG"
hex_colors = re.findall(r'#[0-9a-fA-F]{6}\b', colors)
print(hex_colors) # ['#FF5733', '#123456']
# 匹配中文
text = "Hello 你好 World 世界"
chinese = re.findall(r'[\u4e00-\u9fff]+', text)
print(chinese) # ['你好', '世界']
# 匹配身份证号
id_card = "身份证:110101199001011234"
ids = re.findall(r'\d{17}[\dXx]', id_card)
print(ids) # ['110101199001011234']
量词
基本量词
import re
# * 零次或多次
print(re.findall(r'ab*c', 'ac abc abbc abbbc')) # ['ac', 'abc', 'abbc', 'abbbc']
# + 一次或多次
print(re.findall(r'ab+c', 'ac abc abbc')) # ['abc', 'abbc']
# ? 零次或一次
print(re.findall(r'colou?r', 'color colour')) # ['color', 'colour']
# {n} 恰好 n 次
print(re.findall(r'\d{4}', '2024-01-15')) # ['2024', '01', '15']
# {n,} 至少 n 次
print(re.findall(r'\d{2,}', '1 22 333 4444')) # ['22', '333', '4444']
# {n,m} n 到 m 次
print(re.findall(r'\d{2,4}', '1 22 333 4444 55555')) # ['22', '333', '4444', '5555']
贪婪 vs 非贪婪
import re
text = "<div>content1</div><div>content2</div>"
# 贪婪匹配(默认)
greedy = re.findall(r'<div>.*</div>', text)
print(greedy) # ['<div>content1</div><div>content2</div>']
# 非贪婪匹配(加 ?)
non_greedy = re.findall(r'<div>.*?</div>', text)
print(non_greedy) # ['<div>content1</div>', '<div>content2</div>']
# 其他非贪婪量词
print(re.findall(r'\d+?', '12345')) # ['1', '2', '3', '4', '5']
print(re.findall(r'\d*?', '12345')) # ['', '', '', '', '', '']
print(re.findall(r'\d??', '12345')) # ['', '', '', '', '', '']
量词对比表
| 量词 | 含义 | 示例 | 匹配 |
|---|---|---|---|
* | 0 或更多 | ab*c | ac, abc, abbc |
+ | 1 或更多 | ab+c | abc, abbc |
? | 0 或 1 | colou?r | color, colour |
{n} | 恰好 n 次 | \d{4} | 1234 |
{n,} | 至少 n 次 | \d{2,} | 12, 123 |
{n,m} | n 到 m 次 | \d{2,4} | 12, 123, 1234 |
边界匹配
单词边界
import re
text = "The cat scattered the catalog"
# \b -- word boundary: 'cat' only as a whole word
print(re.findall(r'\bcat\b', text)) # ['cat']
print(re.findall(r'\bcat', text)) # ['cat', 'cat']  (cat, catalog)
# \B -- non-word boundary: 'cat' strictly inside a word.
# NOTE(fix): 'cat' in 'scattered' has word characters on BOTH sides,
# so \Bcat\B does match -- the result is not empty.
print(re.findall(r'\Bcat\B', text)) # ['cat'] (inside 'scattered')
print(re.findall(r'\Bcat', text)) # ['cat'] (scattered)
字符串边界
import re
text = "Hello\nWorld\nPython"
# ^ 和 $ 默认只匹配首尾
print(re.findall(r'^\w+', text)) # ['Hello']
print(re.findall(r'\w+$', text)) # ['Python']
# MULTILINE 标志:每行的首尾
print(re.findall(r'^\w+', text, re.MULTILINE)) # ['Hello', 'World', 'Python']
print(re.findall(r'\w+$', text, re.MULTILINE)) # ['Hello', 'World', 'Python']
# \A 和 \Z 始终匹配整个字符串的首尾
print(re.findall(r'\A\w+', text)) # ['Hello']
print(re.findall(r'\w+\Z', text)) # ['Python']
分组和捕获
基本分组
import re
# () 创建分组
text = "John Doe, age 25"
pattern = r'(\w+) (\w+), age (\d+)'
match = re.search(pattern, text)
if match:
print(f"全名: {match.group(0)}") # John Doe, age 25
print(f"名: {match.group(1)}") # John
print(f"姓: {match.group(2)}") # Doe
print(f"年龄: {match.group(3)}") # 25
print(f"所有组: {match.groups()}") # ('John', 'Doe', '25')
命名分组
import re
text = "2024-01-15"
pattern = r'(?P<year>\d{4})-(?P<month>\d{2})-(?P<day>\d{2})'
match = re.search(pattern, text)
if match:
print(f"年: {match.group('year')}") # 2024
print(f"月: {match.group('month')}") # 01
print(f"日: {match.group('day')}") # 15
# 通过名称访问
print(match.groupdict()) # {'year': '2024', 'month': '01', 'day': '15'}
非捕获分组
import re
# (?:...) 非捕获分组(不保存匹配结果)
text = "www.example.com"
# 捕获分组
result1 = re.findall(r'(www|ftp)\.(\w+)\.(\w+)', text)
print(result1) # [('www', 'example', 'com')]
# 非捕获分组
result2 = re.findall(r'(?:www|ftp)\.(\w+)\.(\w+)', text)
print(result2) # [('example', 'com')]
反向引用
import re
# \1, \2 引用前面的分组
text = "hello hello world world"
# 查找重复的单词
pattern = r'\b(\w+)\s+\1\b'
matches = re.findall(pattern, text)
print(matches) # ['hello', 'world']
# 替换重复单词
result = re.sub(r'\b(\w+)\s+\1\b', r'\1', text)
print(result) # hello world
# 匹配成对的标签
html = "<div>content</div><span>text</span>"
pattern = r'<(\w+)>(.*?)</\1>'
matches = re.findall(pattern, html)
print(matches) # [('div', 'content'), ('span', 'text')]
前瞻和后顾
前瞻(Lookahead)
import re
# (?=...) positive lookahead: match only if followed by ...
text = "Windows 10, Windows 11, Linux"
pattern = r'Windows (?=\d+)'
matches = re.findall(pattern, text)
# NOTE(fix): the trailing space is part of the consumed pattern, so each
# match includes it: ['Windows ', 'Windows ']
print(matches)
# (?!...) negative lookahead: match only if NOT followed by ...
pattern = r'Windows (?!\d+)'
matches = re.findall(pattern, text)
print(matches) # [] -- every 'Windows ' here is followed by digits
后顾(Lookbehind)
import re
# (?<=...) positive lookbehind: match only if preceded by ...
text = "$100 €200 ¥300"
pattern = r'(?<=\$)\d+'
matches = re.findall(pattern, text)
print(matches) # ['100']
# (?<!...) negative lookbehind: match only if NOT preceded by '$'.
pattern = r'(?<!\$)\d+'
matches = re.findall(pattern, text)
# NOTE(fix): only the FIRST digit of '$100' is blocked; the scan resumes
# at '0' (preceded by '1', not '$'), so '00' still matches:
print(matches) # ['00', '200', '300']
实际应用
import re
# 密码强度验证(至少8位,包含大小写和数字)
password = "Password123"
pattern = r'^(?=.*[a-z])(?=.*[A-Z])(?=.*\d).{8,}$'
print(bool(re.match(pattern, password))) # True
# 提取 URL 中的域名
url = "https://www.example.com/path/to/page"
pattern = r'(?<=://)[^/]+'
domain = re.search(pattern, url)
print(domain.group()) # www.example.com
# 金额格式化(千分位)
amount = "1234567890"
formatted = re.sub(r'(?<=\d)(?=(\d{3})+$)', ',', amount)
print(formatted) # 1,234,567,890
编译正则表达式
compile() 的优势
import re
import time
# 未编译:每次都要解析正则
start = time.time()
for _ in range(10000):
re.search(r'\d+', 'abc123def')
print(f"未编译: {time.time() - start:.4f}秒")
# 已编译:只需解析一次
pattern = re.compile(r'\d+')
start = time.time()
for _ in range(10000):
pattern.search('abc123def')
print(f"已编译: {time.time() - start:.4f}秒")
编译选项
import re
# IGNORECASE - 忽略大小写
pattern = re.compile(r'hello', re.IGNORECASE)
print(pattern.findall('Hello HELLO hello')) # ['Hello', 'HELLO', 'hello']
# MULTILINE - 多行模式
pattern = re.compile(r'^\w+', re.MULTILINE)
print(pattern.findall('line1\nline2\nline3')) # ['line1', 'line2', 'line3']
# DOTALL - . 匹配包括换行符在内的所有字符
pattern = re.compile(r'.+', re.DOTALL)
print(pattern.findall('line1\nline2')) # ['line1\nline2']
# VERBOSE - 详细模式(允许注释和空格)
pattern = re.compile(r'''
\d{4} # 年
- # 分隔符
\d{2} # 月
- # 分隔符
\d{2} # 日
''', re.VERBOSE)
print(pattern.search('2024-01-15').group()) # 2024-01-15
# 组合多个标志
pattern = re.compile(r'hello', re.IGNORECASE | re.MULTILINE)
常用函数详解
match() vs search()
import re
text = "abc123def"
# match() 只从开头匹配
print(re.match(r'\d+', text)) # None
print(re.match(r'[a-z]+', text)) # <Match object>
# search() 搜索整个字符串
print(re.search(r'\d+', text)) # <Match object>
print(re.search(r'[a-z]+', text)) # <Match object>
findall() vs finditer()
import re
text = "Phone: 138-1234-5678, Alt: 139-8765-4321"
# findall() 返回列表
phones = re.findall(r'\d{3}-\d{4}-\d{4}', text)
print(phones) # ['138-1234-5678', '139-8765-4321']
# finditer() 返回迭代器(更节省内存)
for match in re.finditer(r'(\d{3})-(\d{4})-(\d{4})', text):
print(f"完整: {match.group(0)}, 区号: {match.group(1)}")
sub() 高级用法
import re
# 基本替换
text = "Color: red, green, blue"
result = re.sub(r'red|green|blue', 'COLOR', text)
print(result) # Color: COLOR, COLOR, COLOR
# 使用函数进行替换
def uppercase_match(match):
    """re.sub callback: return the matched text upper-cased."""
    return match.group(0).upper()
result = re.sub(r'\b\w+\b', uppercase_match, "hello world")
print(result) # HELLO WORLD
# 限制替换次数
text = "aaa bbb ccc"
result = re.sub(r'\w+', 'X', text, count=2)
print(result) # X X ccc
# 反向引用替换
text = "2024-01-15"
result = re.sub(r'(\d{4})-(\d{2})-(\d{2})', r'\2/\3/\1', text)
print(result) # 01/15/2024
split() 高级用法
import re
# 基本分割
text = "apple,banana;orange grape"
result = re.split(r'[,\s;]+', text)
print(result) # ['apple', 'banana', 'orange', 'grape']
# 保留分隔符
text = "apple,banana;orange"
result = re.split(r'([,;])', text)
print(result) # ['apple', ',', 'banana', ';', 'orange']
# 限制分割次数
text = "one,two,three,four"
result = re.split(r',', text, maxsplit=2)
print(result) # ['one', 'two', 'three,four']
实战示例
邮箱验证
import re
def validate_email(email):
    """Return True when *email* looks like local@domain.tld."""
    email_re = re.compile(r'^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}$')
    return email_re.match(email) is not None
# 测试
emails = [
"user@example.com",
"test.user@domain.co.uk",
"invalid@",
"@example.com",
"user@.com"
]
for email in emails:
status = "✓" if validate_email(email) else "✗"
print(f"{status} {email}")
# 输出:
# ✓ user@example.com
# ✓ test.user@domain.co.uk
# ✗ invalid@
# ✗ @example.com
# ✗ user@.com
手机号提取
import re
def extract_phones(text):
    """Extract Chinese mobile numbers (11 digits starting 1[3-9]) from *text*.

    BUG FIX: the original pattern r'1[3-9]\d{9}' could not match numbers
    written with separators, e.g. 159-8765-4321 or 186 1234 5678, even
    though the documented output claims they are found. This version
    accepts an optional single '-' or space after the 3rd and 7th digit
    and returns every number normalized to pure digits, in order of
    appearance.
    """
    # (?<!\d)/(?!\d) prevent matching inside a longer digit run, so an
    # invalid number such as 12345678901 is never partially matched.
    pattern = r'(?<!\d)1[3-9]\d[-\s]?\d{4}[-\s]?\d{4}(?!\d)'
    return [re.sub(r'[-\s]', '', m) for m in re.findall(pattern, text)]
text = """
联系人:张三 13812345678
李四电话:159-8765-4321
王五手机:186 1234 5678
无效号码:12345678901
"""
phones = extract_phones(text)
print("找到的手机号:")
for phone in phones:
print(f" {phone}")
# 输出:
# 找到的手机号:
# 13812345678
# 15987654321
# 18612345678
HTML 标签提取
import re
def extract_links(html):
    """Collect the links from <a href="..."> tags in *html*.

    Returns a list of {'url': ..., 'text': ...} dicts; tags nested
    inside the anchor text are stripped from the text.
    """
    anchor_re = re.compile(
        r'<a\s+href=["\']([^"\']+)["\'][^>]*>(.*?)</a>',
        re.IGNORECASE | re.DOTALL,
    )
    tag_re = re.compile(r'<[^>]+>')
    return [
        {'url': url, 'text': tag_re.sub('', label).strip()}
        for url, label in anchor_re.findall(html)
    ]
html = """
<a href="https://example.com">Example</a>
<a href="/page" class="link">Page <strong>Title</strong></a>
<a href='https://test.org'>Test</a>
"""
links = extract_links(html)
for link in links:
print(f"URL: {link['url']}, 文本: {link['text']}")
# 输出:
# URL: https://example.com, 文本: Example
# URL: /page, 文本: Page Title
# URL: https://test.org, 文本: Test
日志解析
import re
from datetime import datetime
def parse_log_line(line):
    """Parse one 'YYYY-MM-DD HH:MM:SS [LEVEL] message' log line.

    Returns a dict with 'timestamp' (datetime), 'level' and 'message',
    or None when the line does not match the expected format.
    """
    log_re = re.compile(
        r'(?P<timestamp>\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2})'
        r' \[(?P<level>\w+)\] (?P<message>.+)'
    )
    m = log_re.match(line)
    if m is None:
        return None
    fields = m.groupdict()
    fields['timestamp'] = datetime.strptime(fields['timestamp'], '%Y-%m-%d %H:%M:%S')
    return fields
log_lines = [
"2024-01-15 10:30:00 [INFO] Application started",
"2024-01-15 10:30:05 [ERROR] Database connection failed",
"2024-01-15 10:30:10 [WARNING] High memory usage"
]
for line in log_lines:
parsed = parse_log_line(line)
if parsed:
print(f"[{parsed['level']}] {parsed['timestamp']}: {parsed['message']}")
IP 地址验证
import re
def validate_ip(ip):
    """Validate an IPv4 dotted-quad address.

    Accepts four groups of 1-3 digits separated by dots, each in 0-255.
    NOTE: leading zeros (e.g. '01.2.3.4') are accepted, matching the
    original behavior.
    """
    match = re.match(r'^(\d{1,3})\.(\d{1,3})\.(\d{1,3})\.(\d{1,3})$', ip)
    if not match:
        return False
    # \d guarantees non-negative values, so the original `octet < 0`
    # check was dead code; only the upper bound matters.
    return all(int(octet) <= 255 for octet in match.groups())
# 测试
ips = [
"192.168.1.1",
"255.255.255.255",
"0.0.0.0",
"256.1.1.1",
"192.168.1",
"abc.def.ghi.jkl"
]
for ip in ips:
status = "✓" if validate_ip(ip) else "✗"
print(f"{status} {ip}")
性能优化
编译正则表达式
import re
import time
# ❌ 不推荐:重复编译
def slow_search(text, patterns):
    """Find all matches of every raw pattern in *text*.

    Each pattern string is handed to re.findall directly, so the regex
    cache is consulted (and possibly re-parsed) on every call.
    """
    found = []
    for pat in patterns:
        found += re.findall(pat, text)
    return found
# ✅ 推荐:预先编译
def fast_search(text, compiled_patterns):
    """Find all matches of every pre-compiled pattern in *text*.

    Patterns are compiled once by the caller, so the per-call cost is
    just the matching itself.
    """
    return [m for pat in compiled_patterns for m in pat.findall(text)]
# 使用
patterns = [r'\d+', r'[a-z]+', r'\w+']
compiled = [re.compile(p) for p in patterns]
text = "abc123 def456 ghi789" * 1000
result = fast_search(text, compiled)
避免灾难性回溯
import re

# ❌ Dangerous: nested quantifiers like (a+)+ cause catastrophic
# backtracking -- on a non-matching input the engine tries an
# exponential number of ways to split the run of 'a's before failing.
# BUG FIX: Python's re does NOT raise RecursionError here (the old
# except clause could never fire); it simply appears to hang as the
# input grows -- each extra 'a' roughly doubles the work.
text = "aaaaaaaaaaaaaaaaaaaa!"
result = re.match(r'(a+)+b', text)  # already ~1M backtracking steps

# ✅ Safe: an equivalent pattern without nested quantifiers fails fast.
result = re.match(r'a+b', text)  # 简单有效
使用适当的方法
import re
text = "large text with many words" * 10000
# ❌ 只需要判断是否存在,却用了 findall
if re.findall(r'pattern', text):
pass
# ✅ 使用 search 更高效
if re.search(r'pattern', text):
pass
# ❌ 只需要第一个匹配,却用了 findall
first = re.findall(r'pattern', text)[0]
# ✅ 使用 search
match = re.search(r'pattern', text)
if match:
first = match.group()
综合实战
实战1: 数据清洗工具
"""
数据清洗工具
展示正则表达式在数据清洗中的应用
"""
import re
from typing import List, Dict
class DataCleaner:
    """Collection of regex-based text-cleaning helpers.

    All methods are stateless static methods, so they can be called on
    the class itself or on an instance.
    """

    @staticmethod
    def remove_html_tags(text: str) -> str:
        """Strip anything that looks like an HTML/XML tag."""
        return re.sub(r'<[^>]+>', '', text)

    @staticmethod
    def normalize_whitespace(text: str) -> str:
        """Collapse whitespace runs to single spaces and trim the ends."""
        text = re.sub(r'\s+', ' ', text)
        return text.strip()

    @staticmethod
    def extract_emails(text: str) -> List[str]:
        """Return the unique email addresses found in *text* (unordered)."""
        pattern = r'[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}'
        return list(set(re.findall(pattern, text)))

    @staticmethod
    def extract_urls(text: str) -> List[str]:
        """Return the unique http(s) URLs found in *text* (unordered)."""
        pattern = r'https?://[^\s<>"{}|\\^`\[\]]+'
        return list(set(re.findall(pattern, text)))

    @staticmethod
    def extract_phone_numbers(text: str) -> List[str]:
        """Return the unique Chinese mobile numbers found in *text*."""
        # Chinese mobile numbers: 11 digits, '1' followed by 3-9.
        pattern = r'1[3-9]\d{9}'
        return list(set(re.findall(pattern, text)))

    @staticmethod
    def mask_sensitive_info(text: str) -> str:
        """Mask emails, mobile numbers and 18-digit ID numbers in *text*."""
        # Mask the local part of emails (keep the first two characters).
        text = re.sub(
            r'([a-zA-Z0-9._%+-]{2})[a-zA-Z0-9._%+-]*(@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,})',
            r'\1***\2',
            text
        )
        # Mask the middle four digits of mobile numbers.
        text = re.sub(
            r'(1[3-9]\d)\d{4}(\d{4})',
            r'\1****\2',
            text
        )
        # Mask the birth-date section of 18-digit ID-card numbers.
        text = re.sub(
            r'(\d{6})\d{8}(\d{4})',
            r'\1********\2',
            text
        )
        return text

    @staticmethod
    def normalize_date(text: str) -> str:
        """Rewrite common date formats to ISO YYYY-MM-DD."""
        # YYYY/MM/DD -> YYYY-MM-DD
        text = re.sub(r'(\d{4})/(\d{2})/(\d{2})', r'\1-\2-\3', text)
        # DD-MM-YYYY -> YYYY-MM-DD
        text = re.sub(r'(\d{2})-(\d{2})-(\d{4})', r'\3-\2-\1', text)
        return text

    @staticmethod
    def extract_hashtags(text: str) -> List[str]:
        """Return hashtag bodies (without '#'); CJK characters supported."""
        pattern = r'#([\w\u4e00-\u9fff]+)'
        return re.findall(pattern, text)

    @staticmethod
    def clean_text(text: str, options: Dict[str, bool] = None) -> str:
        """Run the configured cleaning steps over *text*.

        *options* keys (all optional): 'remove_html' (default True),
        'normalize_whitespace' (default True), 'normalize_date'
        (default False).
        """
        if options is None:
            options = {}
        # BUG FIX: this is a @staticmethod, so there is no `self`; the
        # original body raised NameError on every call. Call the sibling
        # static methods through the class instead.
        if options.get('remove_html', True):
            text = DataCleaner.remove_html_tags(text)
        if options.get('normalize_whitespace', True):
            text = DataCleaner.normalize_whitespace(text)
        if options.get('normalize_date', False):
            text = DataCleaner.normalize_date(text)
        return text
# Usage example
def main():
    """Demo: run every DataCleaner helper against one sample document."""
    cleaner = DataCleaner()
    # Sample text containing emails, phones, URLs, dates and hashtags.
    text = """
<p>联系邮箱: john.doe@example.com, jane@test.org</p>
<p>电话: 13812345678, 15987654321</p>
<p>访问我们的网站: https://www.example.com</p>
<p>发布日期: 2024/01/15 或 15-01-2024</p>
<p>#Python #编程 #教程</p>
"""
    print("="*60)
    print("原始文本:")
    print(text)
    print("="*60)
    # Extraction helpers (each returns a list).
    print("\n提取的邮箱:")
    for email in cleaner.extract_emails(text):
        print(f" {email}")
    print("\n提取的 URL:")
    for url in cleaner.extract_urls(text):
        print(f" {url}")
    print("\n提取的手机号:")
    for phone in cleaner.extract_phone_numbers(text):
        print(f" {phone}")
    print("\n提取的话题标签:")
    for tag in cleaner.extract_hashtags(text):
        print(f" #{tag}")
    # Masking of sensitive information.
    print("\n脱敏后的文本:")
    masked = cleaner.mask_sensitive_info(text)
    print(masked)
    # Combined cleaning with the default options.
    print("\n清洗后的文本:")
    cleaned = cleaner.clean_text(text)
    print(cleaned)


if __name__ == "__main__":
    main()
实战2: 配置文件解析器
"""
配置文件解析器
展示正则表达式在配置解析中的应用
"""
import re
from typing import Dict, Any, Optional
class ConfigParser:
    """Minimal INI-style configuration parser built on regular expressions.

    Supports `[section]` headers, `key = value` pairs, '#'/';' comment
    lines, and single- or double-quoted values. Keys appearing before the
    first section header are stored under the 'DEFAULT' section.
    """

    def __init__(self):
        # section name -> {key: raw string value}
        self.config: Dict[str, Dict[str, str]] = {}

    def parse(self, content: str) -> Dict[str, Dict[str, str]]:
        """Parse *content* and return the section -> key/value mapping."""
        # Splitting on headers with a capture group yields:
        # [preamble, name1, body1, name2, body2, ...]
        sections = re.split(r'^\[([^\]]+)\]', content, flags=re.MULTILINE)
        # BUG FIX: keys appearing before the first [section] were silently
        # dropped even though DEFAULT was initialized; parse them now.
        self.config['DEFAULT'] = {}
        self._parse_pairs('DEFAULT', sections[0])
        for i in range(1, len(sections), 2):
            section_name = sections[i].strip()
            section_content = sections[i + 1] if i + 1 < len(sections) else ""
            self.config[section_name] = {}
            self._parse_pairs(section_name, section_content)
        return self.config

    def _parse_pairs(self, section: str, body: str) -> None:
        """Parse the `key = value` lines of one section body."""
        for line in body.split('\n'):
            line = line.strip()
            # Skip blank lines and '#'/';' comments.
            if not line or line.startswith('#') or line.startswith(';'):
                continue
            match = re.match(r'^(\w+)\s*=\s*(.+)$', line)
            if not match:
                continue
            key = match.group(1).strip()
            value = match.group(2).strip()
            # BUG FIX: strip only *matching* surrounding quotes; the old
            # pattern ^["'](.*)["']$ also stripped mismatched pairs such
            # as "abc'.
            value = re.sub(r'^(["\'])(.*)\1$', r'\2', value)
            self.config[section][key] = value

    def get(self, section: str, key: str, default: Optional[str] = None) -> Optional[str]:
        """Return the raw string value, or *default* when absent."""
        return self.config.get(section, {}).get(key, default)

    def get_int(self, section: str, key: str, default: int = 0) -> int:
        """Return the value as an int, or *default* if absent/unparsable."""
        value = self.get(section, key)
        if value is None:
            return default
        try:
            return int(value)
        except ValueError:
            return default

    def get_bool(self, section: str, key: str, default: bool = False) -> bool:
        """Return True for 'true'/'yes'/'1'/'on' (case-insensitive)."""
        value = self.get(section, key)
        if value is None:
            return default
        return value.lower() in ('true', 'yes', '1', 'on')

    def get_list(self, section: str, key: str, separator: str = ',') -> list:
        """Split the value on *separator* and strip each item."""
        value = self.get(section, key)
        if value is None:
            return []
        return [item.strip() for item in value.split(separator)]
# Usage example
def main():
    """Demo: parse a sample INI-style config and show typed accessors."""
    # Sample configuration covering comments, quoting and lists.
    config_content = """
# 数据库配置
[database]
host = localhost
port = 5432
name = mydb
user = "admin"
password = 'secret123'
# 服务器配置
[server]
host = 0.0.0.0
port = 8080
debug = true
workers = 4
# 日志配置
[logging]
level = INFO
file = /var/log/app.log
max_size = 10MB
backup_count = 5
# 功能开关
[features]
enable_cache = yes
enable_auth = true
allowed_origins = http://localhost,https://example.com
"""
    parser = ConfigParser()
    config = parser.parse(config_content)
    print("="*60)
    print("配置解析结果:")
    print("="*60)
    for section, values in config.items():
        if values:
            print(f"\n[{section}]")
            for key, value in values.items():
                print(f" {key} = {value}")
    print("\n" + "="*60)
    print("类型转换示例:")
    print("="*60)
    # Typed accessors convert the raw strings on demand.
    print(f"数据库端口 (int): {parser.get_int('database', 'port')}")
    print(f"调试模式 (bool): {parser.get_bool('server', 'debug')}")
    print(f"工作进程数 (int): {parser.get_int('server', 'workers')}")
    print(f"允许的源 (list): {parser.get_list('features', 'allowed_origins')}")


if __name__ == "__main__":
    main()
实战3: 代码统计工具
"""
代码统计工具
展示正则表达式在代码分析中的应用
"""
import re
import os
from pathlib import Path
from typing import Dict, List
from collections import defaultdict
class CodeAnalyzer:
    """Regex-based source-code metrics collector.

    Counts total/code/comment/blank lines plus function, class and
    import definitions per file. Counting is heuristic (line-based
    regexes): e.g. Python docstrings are counted as code lines, and
    strings that merely contain 'def ' may be over-counted.
    """
    def __init__(self, file_extensions: List[str] = None):
        # Extensions to analyze; defaults to Python/JavaScript/Java.
        self.file_extensions = file_extensions or ['.py', '.js', '.java']
        # Per-file stats keyed by path string; defaultdict yields a
        # zeroed entry for files not yet analyzed.
        self.stats = defaultdict(lambda: {
            'lines': 0,
            'code_lines': 0,
            'comment_lines': 0,
            'blank_lines': 0,
            'functions': 0,
            'classes': 0,
            'imports': 0
        })
    def analyze_file(self, filepath: Path) -> 'Dict | None':
        """Analyze one file; record and return its stats dict.

        Returns None (and records nothing) when the file's extension is
        not tracked by this analyzer.
        """
        ext = filepath.suffix
        if ext not in self.file_extensions:
            return None
        # errors='ignore' so undecodable bytes don't abort the scan.
        with open(filepath, 'r', encoding='utf-8', errors='ignore') as f:
            content = f.read()
        lines = content.split('\n')
        stats = {
            'lines': len(lines),
            'code_lines': 0,
            'comment_lines': 0,
            'blank_lines': 0,
            'functions': 0,
            'classes': 0,
            'imports': 0
        }
        # Classify each line as blank, comment or code.
        for line in lines:
            stripped = line.strip()
            if not stripped:
                stats['blank_lines'] += 1
            elif self._is_comment(stripped, ext):
                stats['comment_lines'] += 1
            else:
                stats['code_lines'] += 1
        # Count definitions and imports with language-specific patterns.
        if ext == '.py':
            stats['functions'] = len(re.findall(r'^\s*def\s+\w+', content, re.MULTILINE))
            stats['classes'] = len(re.findall(r'^\s*class\s+\w+', content, re.MULTILINE))
            stats['imports'] = len(re.findall(r'^(?:import|from)\s+', content, re.MULTILINE))
        elif ext == '.js':
            stats['functions'] = len(re.findall(r'\bfunction\s+\w+', content))
            stats['classes'] = len(re.findall(r'\bclass\s+\w+', content))
            # NOTE(review): matches `import`/`require` only at line start,
            # so `const x = require(...)` is not counted -- confirm intent.
            stats['imports'] = len(re.findall(r'^(?:import|require)\s+', content, re.MULTILINE))
        self.stats[str(filepath)] = stats
        return stats
    def _is_comment(self, line: str, ext: str) -> bool:
        """Heuristically decide whether a stripped line is a comment."""
        if ext == '.py':
            return line.startswith('#')
        elif ext in ['.js', '.java']:
            # '*' covers continuation lines of /* ... */ block comments.
            return line.startswith('//') or line.startswith('/*') or line.startswith('*')
        return False
    def analyze_directory(self, directory: Path) -> Dict:
        """Recursively analyze every matching file under *directory*."""
        for filepath in directory.rglob('*'):
            if filepath.is_file():
                self.analyze_file(filepath)
        return dict(self.stats)
    def print_report(self):
        """Print a per-file table plus aggregate totals to stdout."""
        total_stats = {
            'files': len(self.stats),
            'lines': 0,
            'code_lines': 0,
            'comment_lines': 0,
            'blank_lines': 0,
            'functions': 0,
            'classes': 0,
            'imports': 0
        }
        print("="*80)
        print(f"{'文件':<40} {'行数':>6} {'代码':>6} {'注释':>6} {'空白':>6}")
        print("="*80)
        for filepath, stats in sorted(self.stats.items()):
            filename = Path(filepath).name
            print(f"{filename:<40} {stats['lines']:>6} {stats['code_lines']:>6} "
                  f"{stats['comment_lines']:>6} {stats['blank_lines']:>6}")
            # Accumulate totals while printing the per-file rows.
            total_stats['lines'] += stats['lines']
            total_stats['code_lines'] += stats['code_lines']
            total_stats['comment_lines'] += stats['comment_lines']
            total_stats['blank_lines'] += stats['blank_lines']
            total_stats['functions'] += stats['functions']
            total_stats['classes'] += stats['classes']
            total_stats['imports'] += stats['imports']
        print("="*80)
        print(f"{'总计':<40} {total_stats['lines']:>6} {total_stats['code_lines']:>6} "
              f"{total_stats['comment_lines']:>6} {total_stats['blank_lines']:>6}")
        print("="*80)
        print(f"\n代码行数: {total_stats['code_lines']}")
        print(f"注释行数: {total_stats['comment_lines']}")
        print(f"空白行数: {total_stats['blank_lines']}")
        print(f"函数数量: {total_stats['functions']}")
        print(f"类数量: {total_stats['classes']}")
        print(f"导入语句: {total_stats['imports']}")
        if total_stats['lines'] > 0:
            comment_ratio = total_stats['comment_lines'] / total_stats['lines'] * 100
            print(f"注释率: {comment_ratio:.1f}%")
# Usage example
def main():
    """Demo: collect code statistics for all .py files under the cwd."""
    analyzer = CodeAnalyzer(['.py'])
    # Recursively analyze the current working directory.
    current_dir = Path('.')
    analyzer.analyze_directory(current_dir)
    # Print the per-file table and aggregate totals.
    analyzer.print_report()


if __name__ == "__main__":
    main()
小结
| 概念 | 说明 | 示例 |
|---|---|---|
| 字符类 | 匹配特定字符集合 | \d, \w, [abc] |
| 量词 | 指定匹配次数 | *, +, ?, {n,m} |
| 边界 | 匹配位置 | ^, $, \b |
| 分组 | 捕获和引用 | (), (?:), (?P<name>) |
| 前瞻后顾 | 零宽断言 | (?=), (?!), (?<=), (?<!) |
| 标志 | 修改匹配行为 | re.I, re.M, re.S |
核心要点:
- 正则表达式是强大的文本处理工具
- 理解基本语法和元字符
- 掌握分组和反向引用
- 注意贪婪和非贪婪匹配
- 编译正则提高性能
- 避免灾难性回溯
- 合理使用在线工具测试
- 复杂逻辑考虑其他方案
常用资源:
掌握正则表达式将极大提升你的文本处理能力!