How it started
November 2025: the international scene was churning. Every day's world news offered plenty to chuckle at, but the fun (read: news) piled up faster than I could keep my eyes on it.
A friend happened to have an idle GLM Coding Plan Lite, which got me thinking: could an LLM read the news for me?
So I got to work.
Plugin-based loading
To leave room for future expansion: multi-language distribution, a Telegram bot, a Feishu bot...
So I lifted a PluginManager straight from nonebot2 and made every feature a plugin. Adding a new source? Write a crawler, drop it in the plugins/ directory, restart, done.
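The core idea is just importing every module under a directory so each plugin registers itself on import. A minimal sketch of the loader (the real PluginManager borrowed from nonebot2 does considerably more):

import importlib
from pathlib import Path


def load_plugins(plugin_dir: str) -> None:
    """Import every module under plugin_dir so plugins self-register."""
    for file in Path(plugin_dir).glob("*.py"):
        if file.name.startswith("_"):
            continue
        # "src/plugins/zaobao.py" -> "src.plugins.zaobao"
        module_name = ".".join(file.with_suffix("").parts)
        importlib.import_module(module_name)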
With that in place, main.py slims down to something very smooth:
main.py
import typer
import uvicorn
from dotenv import load_dotenv
from loguru import logger

from src.plugin_manager import load_plugins  # project-local loader; module path illustrative

# Typer is assumed here from the `app()` entry point and the typed CLI arguments
app = typer.Typer()


@app.command()
def main(port: int = 8000, debug: bool = False):
    load_plugins("src/plugins")
    load_dotenv(".env")
    logger.info("Starting FastAPI server...")
    try:
        uvicorn.run(
            "src.app:app",
            host="0.0.0.0",
            port=port,
            log_level="info" if not debug else "debug",
        )
    except KeyboardInterrupt:
        logger.info("Shutting down...")
    finally:
        logger.info("Scheduler shut down.")


if __name__ == "__main__":
    app()
The news crawler library newspaper3k
With plugin loading done, the next step was fetching the news. I wrote a crawler for Lianhe Zaobao in one sitting.
lianhe_zaobao.py
from dataclasses import dataclass, field
from datetime import UTC, datetime
from urllib.parse import urljoin
from zoneinfo import ZoneInfo

import httpx
from bs4 import BeautifulSoup
from loguru import logger

# Zaobao publishes in Singapore time; normalized to UTC before storage
TIME_ZONE = ZoneInfo("Asia/Singapore")

# A browser-like UA string (illustrative) is enough; the site has no anti-scraping measures
HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36",
}

LIANHE_ZAOBAO_WORLD_URL = "https://www.zaobao.com/news/world"
LIANHE_ZAOBAO_CHINA_URL = "https://www.zaobao.com/news/china"
LIANHE_ZAOBAO_SEA_URL = "https://www.zaobao.com/news/sea"
# LIANHE_ZAOBAO_CHINA_REALTIME_URL = "https://www.zaobao.com/realtime/china"
# LIANHE_ZAOBAO_WORLD_REALTIME_URL = "https://www.zaobao.com/realtime/world"

LIANHE_ZAOBAO_NEWS = [
    LIANHE_ZAOBAO_WORLD_URL,
    LIANHE_ZAOBAO_CHINA_URL,
    LIANHE_ZAOBAO_SEA_URL,
]
@dataclass
class News:
datetime: datetime
title: str
link: str
html: str
contents: list[str]
keywords: list[str] = field(default_factory=list)
persons: list[str] = field(default_factory=list)
countries: list[str] = field(default_factory=list)
cities: list[str] = field(default_factory=list)
abstract: str = ""
@property
def body(self) -> str:
return "\n\n".join(self.contents)
async def get_html(url: str) -> str | None:
try:
async with httpx.AsyncClient(follow_redirects=True) as client:
response = await client.get(url, headers=HEADERS, timeout=30.0)
response.raise_for_status()
return response.text
    except httpx.RequestError as e:
        logger.opt(exception=True).error(f"Request error: {e}")
    except httpx.HTTPStatusError as e:
        logger.opt(exception=True).error(f"HTTP status error: {e}")
    return None
async def fetch_lianhe_zaobao_news() -> list[tuple[str, str]]:
result = []
for url in LIANHE_ZAOBAO_NEWS:
news_list = await _fetch_lianhe_zaobao_news(url)
result.extend(news_list)
return result
async def _fetch_lianhe_zaobao_news(url: str) -> list[tuple[str, str]]:
"""
从联合早报新闻列表页抓取新闻标题和链接
Return: list of tuple(title, link)
"""
news_list = []
html = await get_html(url)
if not html:
        logger.error(f"Failed to fetch Lianhe Zaobao listing page HTML, url: {url}")
return news_list
soup = BeautifulSoup(html, "html.parser")
articles = soup.find_all("article")
for article in articles:
info_item = article.find("a")
if info_item:
href = info_item.get("href")
title = info_item.get("title")
news_list.append((str(title), urljoin(url, str(href))))
return news_list
async def fetch_lianhe_zaobao_news_detail(url: str) -> News | None:
    html = await get_html(url)
    if not html:
        logger.error(f"Failed to fetch Lianhe Zaobao article page HTML, url: {url}")
        return None
    soup = BeautifulSoup(html, "html.parser")
    h1 = soup.find("h1")
    if not h1:
        logger.warning(f"Lianhe Zaobao article title not found, url: {url}")
        return None
    title = h1.get_text(strip=True)
    body: list[str] = []
    keywords: list[str] = []
    body_tag = soup.find("article")
    if body_tag:
        paragraphs = body_tag.find_all("p")
        for p in paragraphs:
            body.append(p.get_text(strip=True))
    if not body:
        logger.warning(f"Lianhe Zaobao article body not found, url={url}")
        return None
    # The publish time lives at XPath //*[@id="publish_time"]/div
    publish_time_container = soup.find(id="publish_time")
    if not publish_time_container:
        logger.warning(f"Lianhe Zaobao publish-time container not found, url={url}")
        return None
    time_div = publish_time_container.find("div")
    # Keywords live at XPath /html/body/div[6]/main/div[1]/div[1]/div[2]/div[3]
    keywords_container = soup.select_one(
        "body > div:nth-child(6) > main > div:nth-child(1) > div:nth-child(1) > div:nth-child(2) > div:nth-child(3)"
    )
    if keywords_container:
        keywords.extend(
            [kw.get_text(strip=True) for kw in keywords_container.find_all("a")]
        )
    if not time_div:
        logger.warning(f"Lianhe Zaobao publish-time tag not found, url: {url}")
        return None
    publish_time_text = time_div.get_text(strip=True)
    if not publish_time_text:
        logger.warning(f"Lianhe Zaobao publish-time text not found, url: {url}")
        return None
    try:
        # The raw text looks like "发布 / 2026年1月10日 22:23"
        time_str = publish_time_text.split("/")[-1].strip()
        # Parse as Singapore time (Asia/Singapore)
        publish_datetime = datetime.strptime(time_str, "%Y年%m月%d日 %H:%M").replace(
            tzinfo=TIME_ZONE
        )
        # Normalize to UTC for storage
        publish_datetime_utc = publish_datetime.astimezone(UTC)
        return News(
            datetime=publish_datetime_utc,
            title=title,
            link=url,
            contents=body,
            keywords=keywords,
            html=html,
        )
    except (ValueError, IndexError) as e:
        logger.error(f"Failed to parse Lianhe Zaobao publish time: {publish_time_text}, error: {e}")
        return None
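Driving it end to end looks like this (a quick demo, not part of the module):

import asyncio


async def demo() -> None:
    # List pages first, then fetch each article's detail page
    for title, link in await fetch_lianhe_zaobao_news():
        news = await fetch_lianhe_zaobao_news_detail(link)
        if news:
            print(news.datetime, news.title)


asyncio.run(demo())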
It isn't much code, and the site has no anti-scraping measures, but each crawler still takes real time to write. Going source by source like this would take forever.
So I started wondering whether there was a way to cut corners.
The key premise: the raw news doesn't have to be neatly formatted, only accurate in content. It can carry noise, because the LLM reading it will filter that noise out.
After looking around, I found the legendary newspaper3k library.
What a find: with it, I only need specialized parsing of publish_date for each source, and everything else can be handed over to the library.
So I only need to give the Lianhe Zaobao treatment to the handful of sources I care about and leave the rest to newspaper3k.
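For the generic path, a minimal sketch of what a newspaper3k-backed fallback might look like (the function name and the News mapping are my illustration; Article.download/parse are the library's actual API):

from newspaper import Article


def fetch_generic_news(url: str) -> News | None:
    """Fallback crawler: let newspaper3k extract title/body/date."""
    article = Article(url, language="zh")
    article.download()
    article.parse()
    if not article.text:
        return None
    return News(
        # publish_date is the field newspaper3k most often gets wrong,
        # hence the per-source specialized parsing mentioned above
        datetime=article.publish_date or datetime.now(tz=UTC),
        title=article.title,
        link=url,
        html=article.html,
        contents=article.text.split("\n\n"),
    )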
Once newspaper3k has scraped an article, the news gets sent to the LLM.
And as everyone knows, LLM APIs come in all shapes and sizes, so I went ahead and rolled my own abstraction.
Building wheels feels great; I want to keep building wheels.
The abstraction layer I ended up with is fairly clean.
adapter.py
import abc
from collections.abc import AsyncIterator
from functools import cache
from typing import TYPE_CHECKING, Any
from src.llm.message import Message, MessageStreamEvent, Tool
if TYPE_CHECKING:
from .utils import Messages
class ChatAdapter(abc.ABC):
def __init__(
self,
api_key: str | None,
model: str,
base_url: str,
stream: bool = True,
**kwargs: Any,
) -> None:
super().__init__()
self.api_key = api_key
self.model = model
self.base_url = base_url
self.stream_enabled = stream
@property
@abc.abstractmethod
def type(self) -> str:
raise NotImplementedError
@property
@abc.abstractmethod
def model_name(self) -> str:
raise NotImplementedError
async def _call_api(self, api: str, data: Any) -> Any:
raise NotImplementedError
@abc.abstractmethod
async def chat(
self,
system_prompt: str,
messages: "Messages",
tools: list[Tool] = [],
**kwargs: Any,
) -> Message:
"""chat with the model"""
raise NotImplementedError
def stream(
self,
system_prompt: str,
messages: "Messages",
tools: list[Tool] = [],
**kwargs: Any,
) -> AsyncIterator[MessageStreamEvent]:
"""stream responses from the model"""
raise NotImplementedError
@cache
def list_models(self) -> list[str]:
"""list available models"""
raise NotImplementedError
async def balance(self) -> float:
"""account balance"""
        raise NotImplementedError
For a case like ours, where the news API has no strong real-time requirement, you can drop stream and list_models and keep just the chat method.
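To show how a concrete adapter slots in, here is a sketch against an OpenAI-compatible endpoint. The class name is made up, and it assumes Messages iterates over Message objects; the repo's real adapters also handle tools, images, and reasoning segments:

from typing import Any

from openai import AsyncOpenAI

from src.llm.message import Message, TextSegment, Tool


class OpenAICompatAdapter(ChatAdapter):
    @property
    def type(self) -> str:
        return "openai-compat"

    @property
    def model_name(self) -> str:
        return self.model

    async def chat(
        self,
        system_prompt: str,
        messages: "Messages",
        tools: list[Tool] = [],
        **kwargs: Any,
    ) -> Message:
        client = AsyncOpenAI(api_key=self.api_key, base_url=self.base_url)
        payload: list[dict[str, Any]] = [{"role": "system", "content": system_prompt}]
        # Assumes each Message exposes .role and the .text property defined below
        payload.extend({"role": m.role, "content": m.text} for m in messages)
        response = await client.chat.completions.create(
            model=self.model, messages=payload
        )
        text = response.choices[0].message.content or ""
        return Message(
            role="assistant",
            name=self.model,
            content=[TextSegment(text=text)],
        )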
Next comes the data abstraction. Deep thinking, text output, multimodal output, and streaming fragments are modeled like this:
message.py
from collections.abc import Sequence
from datetime import UTC, datetime
from typing import Any, Literal
from pydantic import BaseModel, ConfigDict, Field
type Role = Literal[
"system",
"developer",
"user",
"assistant",
"tool",
]
class TextSegment(BaseModel):
type: Literal["text"] = "text"
text: str
class ImageSegment(BaseModel):
type: Literal["image"] = "image"
url: str
class ThinkSegment(BaseModel):
type: Literal["think"] = "think"
thought_process: str
class ToolSegment(BaseModel):
type: Literal["tool"] = "tool"
tool_call_id: str
tool_name: str
arguments: dict[str, Any] | None = None
class ToolCallFunctionBody(BaseModel):
name: str
arguments: dict[str, Any]
class ToolCall(BaseModel):
type: Literal["function"] = "function"
id: str
function: ToolCallFunctionBody
class TextDetailSegment(BaseModel):
type: Literal["text_detail"] = "text_detail"
text: str
class ThinkDetailSegment(BaseModel):
type: Literal["think_detail"] = "think_detail"
thought_process: str
class ToolCallDetailSegment(BaseModel):
type: Literal["tool_call_detail"] = "tool_call_detail"
tool_call_id: str
tool_name: str
partial_arguments: str = ""
arguments: dict[str, Any] | None = None
class UsageSegment(BaseModel):
type: Literal["usage"] = "usage"
provider: str
input_tokens: int | None = None
output_tokens: int | None = None
total_tokens: int | None = None
cached_tokens: int | None = None
reasoning_tokens: int | None = None
cache_creation_input_tokens: int | None = None
cache_read_input_tokens: int | None = None
stop_reason: str | None = None
stop_sequence: str | None = None
server_tool_use: dict[str, Any] | None = None
MessageSegment = TextSegment | ImageSegment | ThinkSegment | ToolSegment
MessageDetailSegment = (
TextDetailSegment | ThinkDetailSegment | ToolCallDetailSegment | UsageSegment
)
MessageCompleteSegment = MessageSegment | UsageSegment
MessageStreamSegment = MessageDetailSegment | MessageCompleteSegment
class Tool(BaseModel):
name: str
description: str
parameters: dict[str, Any] | None = None
class Message(BaseModel):
role: Role
    # The name of the user or system sending the message
name: str
content: Sequence[MessageSegment]
tool_calls: list[ToolCall] = Field(default_factory=list)
time: datetime = Field(default_factory=lambda: datetime.now(tz=UTC))
# total tokens used in this message
input_tokens: int = 0
output_tokens: int = 0
# Allow additional arbitrary fields
model_config = ConfigDict(extra="allow")
@property
def text(self) -> str:
return "".join(
segment.text for segment in self.content if isinstance(segment, TextSegment)
)
class MessageStreamDeltaEvent(BaseModel):
event: Literal["delta"] = "delta"
segment: MessageDetailSegment
index: int | None = None
class MessageStreamCompleteEvent(BaseModel):
event: Literal["complete"] = "complete"
segment: MessageCompleteSegment
index: int | None = None
type MessageStreamEvent = MessageStreamDeltaEvent | MessageStreamCompleteEvent
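A quick sanity check of the model: only TextSegments contribute to .text, while thinking stays in its own segment (the model name below is just an example):

msg = Message(
    role="assistant",
    name="glm-4.6",  # illustrative model name
    content=[
        ThinkSegment(thought_process="The user wants a one-line summary."),
        TextSegment(text="Markets steadied after the announcement."),
    ],
)
assert msg.text == "Markets steadied after the announcement."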
FastAPI
With the crawlers in place, the data needs an outlet: FastAPI for the endpoints, Pydantic for validation, async all the way through.
This part feels very formulaic, so I won't paste the code. It's just a thin layer over the database, nothing complicated, since nobody uses the site except me :(
By then it was already around January 2026, right when the crayfish OpenClaw was at peak popularity, so I wrote Markdown endpoints meant specifically for LLMs.
Maybe every future website will detect whether the visitor is a human or an AI, and serve different content accordingly?
LLMs get two endpoints: top_stories, i.e. today's top headlines, and search, a full-text search cobbled together from Postgres search vectors (tsvector).
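A sketch of the idea, with every name illustrative rather than the repo's actual code: render search results as Markdown, which is far friendlier than JSON for an LLM to read, with the query backed by tsvector:

from fastapi import FastAPI
from fastapi.responses import PlainTextResponse
from sqlalchemy import func, select

app = FastAPI()


@app.get("/llm/search.md", response_class=PlainTextResponse)
async def search_markdown(q: str, count: int = 10) -> str:
    # `Article` (with a tsvector search_vector column) and `get_session`
    # stand in for the project's real models and helpers
    tsquery = func.plainto_tsquery("simple", q)
    stmt = (
        select(Article)
        .where(Article.search_vector.op("@@")(tsquery))
        .order_by(func.ts_rank(Article.search_vector, tsquery).desc())
        .limit(count)
    )
    async with get_session() as session:
        articles = (await session.execute(stmt)).scalars().all()
    return "\n".join(f"- [{a.title}]({a.link})" for a in articles)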
The system has plenty of other APIs too, such as deep analysis, recent coverage, and real-time coverage, but in the end I figured the tools handed to an LLM should be as lean as possible.
Then I wrote the MCP side of the service with FastMCP, though since I rarely use AI tooling myself, I haven't tested the MCP Server functionality yet.
mcp.py
from fastmcp import FastMCP
from fastmcp.exceptions import ToolError

# Project-local helpers (get_session, Publisher, LLMArticle, Language,
# NewsSearchParams, build_top_stories_result, search_news_articles,
# _parse_language) are imported from elsewhere in the repo.

mcp = FastMCP("newspaper")  # server name is illustrative


@mcp.tool()
async def newspaper_publishers() -> list[str]:
"""Get a list of news publishers"""
async with get_session() as session:
return [
publisher.name for publisher in await Publisher.publishers(session=session)
]
@mcp.tool()
async def newspaper_top_stories(
language: str = "zh",
date_str: str | None = None,
publishers: list[str] | None = None,
) -> dict[str, list[LLMArticle]]:
"""Get top news stories, optionally filtered by language, date, and publishers"""
language_enum = _parse_language(language)
if language_enum not in (Language.ZH, Language.EN):
raise ToolError("Unsupported language. Supported languages are: zh, en")
async with get_session() as session:
stories = await build_top_stories_result(
session,
language=language_enum,
date_str=date_str,
publishers=publishers,
)
return stories
@mcp.tool()
async def newspaper_search(
query: str,
language: str = "zh",
count: int = 10,
page: int = 1,
importance: int = 1,
) -> list[LLMArticle]:
"""Search news articles by free-text query."""
language_enum = _parse_language(language)
if language_enum not in (Language.ZH, Language.EN):
raise ToolError("Unsupported language. Supported languages are: zh, en")
search_query = NewsSearchParams(
query=query,
language=language_enum,
count=count,
page=page,
importance=importance,
)
async with get_session() as session:
return await search_news_articles(search_query, session)
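For completeness, FastMCP serves these tools over stdio by default, so the minimal entry point is just:

if __name__ == "__main__":
    mcp.run()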
Others
The rest of the code can be found on GitHub. Stars, PRs, and Issues are all welcome; the project is iterating fast~