本篇文章主要讲解,本地化部署模型的脚本使用方法,通过此脚本可以快速进行任何模型的部署及使用。 日期:2026年4月11日 作者:任聪聪
先看效果
安装依赖
requirements.txt
# ================== Web 服务 ==================
fastapi>=0.110.0
uvicorn>=0.27.0
requests>=2.31.0
# ================== LLM 推理 ==================
transformers>=4.40.0
torch>=2.0.0
# GGUF / llama.cpp 推理
llama-cpp-python>=0.2.70
# ================== 工具依赖 ==================
pydantic>=2.0.0
typing-extensions>=4.8.0
运行安装
pip install -r requirements.txt
代码实例
import os
import json
import argparse
import socket
import requests
from fastapi import FastAPI
from pydantic import BaseModel
import uvicorn
try:
from llama_cpp import Llama
LLAMA_CPP_AVAILABLE = True
except ImportError:
LLAMA_CPP_AVAILABLE = False
try:
import ollama
OLLAMA_AVAILABLE = True
except ImportError:
OLLAMA_AVAILABLE = False
CONFIG_FILE = "models.json"
SERVER_URL = "http://127.0.0.1:1234"
SERVER_PROT = 1234
# ================== 工具 ==================
def load_config():
if not os.path.exists(CONFIG_FILE):
return {}
with open(CONFIG_FILE, "r", encoding="utf-8") as f:
return json.load(f)
def save_config(cfg):
with open(CONFIG_FILE, "w", encoding="utf-8") as f:
json.dump(cfg, f, indent=2, ensure_ascii=False)
def get_local_ip():
s = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
try:
s.connect(("8.8.8.8", 80))
ip = s.getsockname()[0]
finally:
s.close()
return ip
def detect_backend(path):
if path.endswith(".gguf"):
return "llama_cpp"
elif os.path.isdir(path):
return "transformers"
else:
return "ollama"
# ================== 模型运行 ==================
class ModelProcess:
def __init__(self, name, cfg):
self.name = name
self.cfg = cfg
self.model = None
self.running = False
def load(self):
backend = self.cfg["backend"]
path = self.cfg["path"]
if backend == "llama_cpp":
if not LLAMA_CPP_AVAILABLE:
raise Exception("llama-cpp-python 未安装")
self.model = Llama(model_path=path, n_ctx=4096)
elif backend == "transformers":
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
self.tokenizer = AutoTokenizer.from_pretrained(path)
self.model = AutoModelForCausalLM.from_pretrained(
path,
torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
device_map="auto"
)
elif backend == "ollama":
if not OLLAMA_AVAILABLE:
raise Exception("ollama 未安装")
self.model_name = path
def start(self):
if self.running:
return
print(f"[{self.name}] 加载中...")
self.load()
self.running = True
print(f"[{self.name}] 已启动")
def stop(self):
self.model = None
self.running = False
def infer(self, prompt):
if not self.running:
return "模型未启动"
if self.cfg["backend"] == "llama_cpp":
output = self.model(prompt, max_tokens=200)
return output["choices"][0]["text"]
elif self.cfg["backend"] == "transformers":
inputs = self.tokenizer(prompt, return_tensors="pt").to(self.model.device)
outputs = self.model.generate(**inputs, max_new_tokens=200)
return self.tokenizer.decode(outputs[0], skip_special_tokens=True)
elif self.cfg["backend"] == "ollama":
response = ollama.chat(model=self.model_name, messages=[
{'role': 'user', 'content': prompt}
])
return response['message']['content']
# ================== 管理器 ==================
class Manager:
def __init__(self):
self.cfg = load_config()
self.models = {}
def add(self, name, path, backend):
self.cfg[name] = {"path": path, "backend": backend}
save_config(self.cfg)
def start(self, name):
if name not in self.cfg:
return "模型不存在"
if name not in self.models:
proc = ModelProcess(name, self.cfg[name])
proc.start()
self.models[name] = proc
return "已启动"
def stop(self, name):
if name in self.models:
self.models[name].stop()
del self.models[name]
return "已停止"
def status(self):
return {name: proc.running for name, proc in self.models.items()}
def run(self, name, prompt):
if name not in self.models:
return "模型未启动"
return self.models[name].infer(prompt)
manager = Manager()
app = FastAPI()
# ================== API ==================
class AddModel(BaseModel):
name: str
path: str
backend: str
class RunModel(BaseModel):
name: str
prompt: str
@app.post("/add")
def add_model(data: AddModel):
manager.add(data.name, data.path, data.backend)
return {"msg": "ok"}
@app.post("/start")
def start_model(data: AddModel):
return {"msg": manager.start(data.name)}
@app.post("/stop")
def stop_model(data: AddModel):
return {"msg": manager.stop(data.name)}
@app.get("/status")
def status():
return manager.status()
@app.post("/run")
def run_model(data: RunModel):
return {"result": manager.run(data.name, data.prompt)}
# ================== CLI ==================
def serve():
ip = get_local_ip()
print(f"\n服务启动:")
print(f"本机: {SERVER_URL}")
print(f"局域网: http://{ip}:{SERVER_PROT}\n")
uvicorn.run(app, host="0.0.0.0", port=SERVER_PROT)
def cli():
parser = argparse.ArgumentParser()
sub = parser.add_subparsers(dest="cmd")
add = sub.add_parser("add")
add.add_argument("--name")
add.add_argument("--path")
start = sub.add_parser("start")
start.add_argument("--name")
stop = sub.add_parser("stop")
stop.add_argument("--name")
sub.add_parser("status")
run = sub.add_parser("run")
run.add_argument("--name")
run.add_argument("--prompt")
sub.add_parser("serve")
args = parser.parse_args()
if args.cmd == "serve":
serve()
elif args.cmd == "add":
backend = detect_backend(args.path)
requests.post(SERVER_URL + "/add", json={
"name": args.name,
"path": args.path,
"backend": backend
})
print("已添加")
elif args.cmd == "start":
print(requests.post(SERVER_URL + "/start", json={"name": args.name}).json())
elif args.cmd == "stop":
print(requests.post(SERVER_URL + "/stop", json={"name": args.name}).json())
elif args.cmd == "status":
print(requests.get(SERVER_URL + "/status").json())
elif args.cmd == "run":
res = requests.post(SERVER_URL + "/run", json={
"name": args.name,
"prompt": args.prompt
})
print(res.json())
else:
parser.print_help()
if __name__ == "__main__":
cli()
使用说明
步骤一、运行本地服务如下图:
执行命令:
python aiRun.py serve # 启动服务(常驻)
步骤二、启动服务后进行调用即可
执行命令:
python aiRun.py add --name llama3 --path ./models/llama3.gguf # 添加模型
python aiRun.py start --name llama3 # 启动模型
python aiRun.py stop --name llama3 # 停止模型
python aiRun.py status # 查看状态
python aiRun.py run --name llama3 --prompt "" # 推理