-- Introduction: using macOS as an example, this walks through everything from installing the base environment to publishing a model of your own.
1. Environment Setup
1.1 Install Homebrew
/bin/bash -c "$(curl -fsSL https://gitee.com/ineo6/homebrew-install/raw/master/install.sh)"
After installation, add the environment variables to your shell as prompted:
echo 'eval "$(/opt/homebrew/bin/brew shellenv)"' >> ~/.zshrc
eval "$(/opt/homebrew/bin/brew shellenv)"
1.2 Install Miniconda (recommended) or Anaconda
# Apple Silicon build
curl -L -o Miniconda3-macOS-arm64.sh https://repo.anaconda.com/miniconda/Miniconda3-latest-MacOSX-arm64.sh
bash Miniconda3-macOS-arm64.sh -b -p $HOME/miniconda3
$HOME/miniconda3/bin/conda init zsh
Restart the terminal for the changes to take effect, then verify: conda --version
1.3 Create a Virtual Environment and Install Python
conda create -n llm_finetune python=3.10 -y
conda activate llm_finetune
1.4 Install PyTorch
# Upgrading pip first avoids some installation issues
pip install --upgrade pip
# Install pinned PyTorch versions (to stay compatible with transformers)
pip install torch==2.5.1 torchvision==0.20.1 torchaudio==2.5.1
Verify MPS support (optional):
python -c "import torch; print(torch.backends.mps.is_available())" # should print True
1.5 Install the Core Fine-Tuning Libraries
pip install transformers==4.46.0 datasets accelerate peft
pip install sentencepiece # used by some tokenizers and required later by llama.cpp's conversion script
pip install tensorboard wandb scikit-learn pandas
# Note: bitsandbytes is CUDA-only and does not work on macOS, so there is no need to install it
For DPO alignment, additionally install:
pip install trl
2. Download the Model and Test Loading
2.1 Download the Base Model
# Download via Hugging Face (network access required)
import os
os.environ['HF_ENDPOINT'] = 'https://hf-mirror.com'
# Download the model to a local directory; the model fetched this way is used for preprocessing, fine-tuning, alignment, etc., and is packaged once everything is done
from huggingface_hub import snapshot_download
snapshot_download(repo_id="Qwen/Qwen2-0.5B-Instruct", local_dir="./Qwen2-0.5B-Instruct-Local")
2.2 Test Local Model Loading and Chat
Create test_qwen.py:
from transformers import AutoModelForCausalLM, AutoTokenizer
# 1. Point to the local path you downloaded the model to
model_path = "./Qwen2-0.5B-Instruct-Local"
# 2. Load the model (trust_remote_code=True is kept here; Qwen2 is also natively supported by recent transformers)
print("Loading model...")
model = AutoModelForCausalLM.from_pretrained(
model_path,
trust_remote_code=True,
device_map="cpu", # force CPU
torch_dtype="auto", # pick the precision automatically
low_cpu_mem_usage=True # reduce peak memory while loading
)
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
print("Model loaded!")
# 3. Build a chat-formatted input (Qwen2-Instruct expects a specific chat format)
prompt = "你好,请介绍一下你自己。"
messages = [
{"role": "user", "content": prompt}
]
# Apply the chat template
text = tokenizer.apply_chat_template(
messages,
tokenize=False,
add_generation_prompt=True
)
# 4. Encode and generate a reply
inputs = tokenizer(text, return_tensors="pt")
outputs = model.generate(
**inputs,
max_new_tokens=256,
temperature=0.7,
do_sample=True,
top_p=0.8
)
# 5. Decode the output
response = tokenizer.decode(outputs[0][len(inputs.input_ids[0]):], skip_special_tokens=True)
print("🤖 Model reply:")
print(response)
Running this script should produce a normal reply, which confirms the base model has been downloaded locally.
3. Prepare the Training Data
3.1 Data Format
Create train.jsonl with one JSON object per line. Each object must contain instruction and output (the field names can be customized, but the training script below has to match them); in general, the more data the better (a quick validation sketch follows the examples below):
{"instruction": "请用‘明亮’造句。", "output": "教室的窗户很大,所以教室里非常明亮。"}
{"instruction": "解释‘狐假虎威’的意思。", "output": "狐狸假借老虎的威势,比喻依仗别人的势力来欺压人。"}
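Before training, it can help to sanity-check the file. Below is a minimal, illustrative sketch (the required field names match the training script in section 4):
# validate_jsonl.py -- quick sanity check for train.jsonl (illustrative sketch)
import json

required_keys = {"instruction", "output"}  # fields the fine-tuning script expects

with open("./train.jsonl", "r", encoding="utf-8") as f:
    for i, line in enumerate(f, start=1):
        line = line.strip()
        if not line:
            continue
        record = json.loads(line)  # raises if a line is not valid JSON
        missing = required_keys - record.keys()
        if missing:
            print(f"line {i}: missing fields {missing}")
print("train.jsonl check finished")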
3.2 Test Data Loading
from datasets import load_dataset
dataset = load_dataset("json", data_files="./train.jsonl", split="train")
print(dataset[0]) # inspect the first record
4. LoRA Fine-Tuning
4.1 Full Fine-Tuning Script
Create finetune_lora.py:
# Basic LoRA instruction fine-tuning; run with: python finetune_lora.py
import torch
from datasets import load_dataset
from transformers import (
AutoTokenizer,
AutoModelForCausalLM,
TrainingArguments,
Trainer,
DataCollatorForSeq2Seq
)
from peft import LoraConfig, get_peft_model, TaskType
# Configuration
model_path = "./Qwen2-0.5B-Instruct-Local"
output_dir = "./qwen-lora-finetuned"
max_length = 256
batch_size = 1
grad_accum = 4
# Load the tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token
# Load the model onto MPS or CPU
device = torch.device("mps") if torch.backends.mps.is_available() else torch.device("cpu")
model = AutoModelForCausalLM.from_pretrained(
model_path,
trust_remote_code=True,
torch_dtype=torch.float16 if device.type == "mps" else torch.float32,
device_map=None,
)
model.to(device)
# Configure LoRA
lora_config = LoraConfig(
task_type=TaskType.CAUSAL_LM,
r=8,
lora_alpha=16,
target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
lora_dropout=0.1,
)
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()
# Load the data
dataset = load_dataset("json", data_files="./train.jsonl", split="train")
def format_chat(example):
grade = example.get("grade", "小学")
system_prompt = f"你是一名{grade}语文老师,请用适合{grade}学生的语言回答。"
messages = [
{"role":"system","content":system_prompt},
{"role": "user", "content": example["instruction"]},
{"role": "assistant", "content": example["output"]}
]
return {"text": tokenizer.apply_chat_template(messages, tokenize=False)}
dataset = dataset.map(format_chat)
def tokenize_function(examples):
return tokenizer(
examples["text"],
truncation=True,
max_length=max_length,
padding=False,
)
tokenized_dataset = dataset.map(tokenize_function, remove_columns=dataset.column_names)
tokenized_dataset = tokenized_dataset.map(lambda x: {"labels": x["input_ids"]})
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model, padding=True)
training_args = TrainingArguments(
output_dir=output_dir,
per_device_train_batch_size=batch_size,
gradient_accumulation_steps=grad_accum,
learning_rate=2e-4,
num_train_epochs=3,
logging_steps=10,
save_strategy="epoch",
fp16=False, # the Trainer's fp16 mixed precision requires CUDA and raises an error on MPS
report_to="none",
)
trainer = Trainer(
model=model,
args=training_args,
train_dataset=tokenized_dataset,
data_collator=data_collator,
)
trainer.train()
model.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)
4.2 Run the Training
python finetune_lora.py
When training finishes, the ./qwen-lora-finetuned directory will contain adapter_config.json, adapter_model.safetensors, and related files. A quick way to check the saved adapter is sketched below.
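This is a minimal, illustrative check that only reads adapter_config.json (no model loading involved):
# check_adapter.py -- inspect the adapter that was just saved (illustrative sketch)
import json

with open("./qwen-lora-finetuned/adapter_config.json", "r", encoding="utf-8") as f:
    cfg = json.load(f)
print("base model:", cfg.get("base_model_name_or_path"))
print("rank r:", cfg.get("r"), "| target modules:", cfg.get("target_modules"))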
5. Preference Alignment (DPO, optional)
5.1 Prepare the preference data dpo_data.jsonl
{"prompt": "如何学习编程?", "chosen": "建议从 Python 开始,它语法简洁。", "rejected": "直接学 C++,虽然难但基础扎实。"}
5.2 DPO training script dpo_train.py
import torch
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments
from trl import DPOTrainer
from peft import PeftModel
base_model_path = "./Qwen2-0.5B-Instruct-Local"
lora_path = "./qwen-lora-finetuned"
output_dir = "./qwen-dpo-aligned"
tokenizer = AutoTokenizer.from_pretrained(lora_path, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token
device = torch.device("mps") if torch.backends.mps.is_available() else torch.device("cpu")
# Load the base model first, then attach the LoRA adapter from the fine-tuning stage
model = AutoModelForCausalLM.from_pretrained(
base_model_path, trust_remote_code=True,
torch_dtype=torch.float16 if device.type == "mps" else torch.float32,
)
model = PeftModel.from_pretrained(model, lora_path, is_trainable=True)
model.to(device)
dataset = load_dataset("json", data_files="dpo_data.jsonl", split="train")
def format_dpo(example):
return {
"prompt": tokenizer.apply_chat_template([{"role": "user", "content": example["prompt"]}], tokenize=False, add_generation_prompt=True),
"chosen": example["chosen"],
"rejected": example["rejected"],
}
dataset = dataset.map(format_dpo)
training_args = TrainingArguments(
output_dir=output_dir,
per_device_train_batch_size=1,
gradient_accumulation_steps=4,
learning_rate=5e-5,
num_train_epochs=1,
logging_steps=10,
fp16=False, # fp16 mixed precision requires CUDA; not supported on MPS
report_to="none",
remove_unused_columns=False,
)
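# Note (assumption about library versions): the call below follows the older trl DPOTrainer API
# (TrainingArguments plus tokenizer=/max_length= keyword arguments); newer trl releases expect
# a DPOConfig and a processing_class argument instead, so pin trl accordingly if this errors out.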
dpo_trainer = DPOTrainer(
model=model,
args=training_args,
train_dataset=dataset,
tokenizer=tokenizer,
max_length=256,
max_prompt_length=128,
)
dpo_trainer.train()
model.save_pretrained(output_dir)
print("✅ DPO alignment finished")
Run:
python dpo_train.py
6. Inference Testing
6.1 Create the inference script inference.py
There are two ways to run the inference test and print the results:
1. Streaming output
# Load the LoRA adapter for an interactive chat session
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, TextStreamer
from peft import PeftModel
# -------------------- Configuration --------------------
base_model_path = "./Qwen2-0.5B-Instruct-Local"
lora_path = "./qwen-lora-finetuned" # or "./qwen-dpo-aligned"
# -------------------- Load model and tokenizer --------------------
device = torch.device("mps") if torch.backends.mps.is_available() else torch.device("cpu")
print(f"🚀 Using device: {device}")
tokenizer = AutoTokenizer.from_pretrained(base_model_path, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
base_model_path,
trust_remote_code=True,
torch_dtype=torch.float16 if device.type == "mps" else torch.float32,
device_map=None,
)
model = PeftModel.from_pretrained(model, lora_path)
model.to(device)
model.eval() # 🔒 switch to inference mode (disables dropout, etc.)
print("✅ Model loaded, starting chat (type exit to quit)\n")
# -------------------- Streaming generation --------------------
def generate_stream(prompt):
"""Apply the chat template and stream the reply"""
messages = [{"role": "user", "content": prompt}]
# Apply the chat template (handles newer transformers versions that may return a list)
text = tokenizer.apply_chat_template(
messages,
tokenize=False,
add_generation_prompt=True
)
if isinstance(text, list):
text = text[0]
# Encode the input
inputs = tokenizer(text, return_tensors="pt").to(device)
# Create a streamer that prints tokens in real time
streamer = TextStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
# Generation parameters (adjust as needed)
generation_kwargs = {
**inputs,
"max_new_tokens": 256,
"temperature": 0.7, # randomness: lower = more deterministic, higher = more diverse
"top_p": 0.9, # nucleus sampling: only keep tokens within cumulative probability p
"do_sample": True,
"repetition_penalty": 1.05, # penalize repeated tokens to avoid loops
"pad_token_id": tokenizer.eos_token_id,
"streamer": streamer, # 🔥 the key parameter for streaming output
}
# Start generating (the streamer prints tokens as they arrive)
print("🤖 Model: ", end="", flush=True)
with torch.no_grad():
model.generate(**generation_kwargs)
print() # trailing newline for readability
# -------------------- Interactive loop --------------------
while True:
user_input = input("📂 User: ")
if user_input.lower() in ["exit", "quit", "q"]:
break
generate_stream(user_input)
2. One-shot output
# inference.py -- print the full reply at once; loads the LoRA adapter for an interactive chat session
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from peft import PeftModel
base_model_path = "./Qwen2-0.5B-Instruct-Local"
lora_path = "./qwen-lora-finetuned" # or "./qwen-dpo-aligned"
tokenizer = AutoTokenizer.from_pretrained(base_model_path, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
base_model_path,
trust_remote_code=True,
torch_dtype=torch.float32, # use float32 here: several float16 ops are not implemented on CPU
device_map="cpu", # CPU inference keeps MPS resources free
)
model = PeftModel.from_pretrained(model, lora_path)
model.eval()
while True:
user_input = input("👤 User: ")
if user_input.lower() in ["exit", "quit"]:
break
messages = [{"role": "user", "content": user_input}]
text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
# handle newer transformers versions that may return a list
if isinstance(text, list):
text = text[0]
inputs = tokenizer(text, return_tensors="pt")
outputs = model.generate(
**inputs,
max_new_tokens=256,
temperature=0.7,
top_p=0.9,
do_sample=True,
)
response = tokenizer.decode(outputs[0][len(inputs.input_ids[0]):], skip_special_tokens=True)
print(f"🤖 Model: {response}")
Run:
python inference.py
7. Export to Ollama Format
7.1 Preparation: install the required tools
# Install llama.cpp (used for conversion and quantization)
git clone https://github.com/ggerganov/llama.cpp
cd llama.cpp
make -j # add LLAMA_METAL=1 for Metal acceleration (optional); newer llama.cpp versions build with CMake instead (cmake -B build && cmake --build build)
pip install -r requirements.txt # Python dependencies for the conversion scripts
# Make sure ollama is installed
brew install ollama
ollama serve # start the Ollama service (keep it running in the background)
7.2 Merge the LoRA Weights into the Base Model
Create merge_lora.py:
# Merge the LoRA adapter into the base model to produce full weights. Reason: PeftModel relies on adapter_config.json, which Ollama does not understand, so the weights must be merged and saved as plain Transformers weights.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel
# Path configuration (adjust to your setup)
base_model_path = "./Qwen2-0.5B-Instruct-Local"
lora_weights_path = "./qwen-lora-finetuned" # or the DPO output path
merged_output_path = "./merged-qwen2-0.5b-chinese"
print("Loading base model...")
model = AutoModelForCausalLM.from_pretrained(
base_model_path,
torch_dtype=torch.float16,
trust_remote_code=True,
device_map="cpu", # no GPU needed for merging
)
print("Merging LoRA weights...")
model = PeftModel.from_pretrained(model, lora_weights_path)
model = model.merge_and_unload() # the key step: merge the adapter and strip the LoRA layers
print("Saving the full model...")
model.save_pretrained(merged_output_path, safe_serialization=True)
tokenizer = AutoTokenizer.from_pretrained(base_model_path, trust_remote_code=True)
tokenizer.save_pretrained(merged_output_path)
print(f"✅ Merge complete! Full model saved to: {merged_output_path}")
Run:
python merge_lora.py
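Optionally, confirm that the merged directory loads as a plain Transformers model with no peft dependency. A minimal sanity-check sketch (the path matches merged_output_path above):
# check_merged.py -- load the merged weights without peft (illustrative sketch)
from transformers import AutoModelForCausalLM, AutoTokenizer

merged_path = "./merged-qwen2-0.5b-chinese"
model = AutoModelForCausalLM.from_pretrained(merged_path, torch_dtype="auto")
tokenizer = AutoTokenizer.from_pretrained(merged_path)
print(f"parameter count: {sum(p.numel() for p in model.parameters()):,}")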
7.3 Convert to GGUF Format
Make sure llama.cpp has been cloned and built, and that sentencepiece is installed:
pip install sentencepiece
cd llama.cpp # enter the llama.cpp directory
python convert_hf_to_gguf.py ../merged-qwen2-0.5b-chinese \
--outfile ../qwen2-0.5b-chinese-f16.gguf \
--outtype f16
Note: --outtype f16 keeps 16-bit floats, giving a model of roughly 1 GB. To shrink it, --outtype q8_0 (8-bit quantization) is also supported by the conversion script; 4-bit variants such as Q4_K_M are produced with the separate quantize step below. The script recognizes the Qwen2 architecture automatically. (In newer llama.cpp builds the quantize binary is named llama-quantize and lives under build/bin.)
To quantize straight down to 4-bit in one step (recommended for everyday use as a memory/speed balance):
./quantize ../qwen2-0.5b-chinese-f16.gguf ../qwen2-0.5b-chinese-Q4_K_M.gguf Q4_K_M
7.4 Register the Model with Ollama
Create a Modelfile:
FROM ./qwen2-0.5b-chinese-f16.gguf
# Set the chat template (it must match the one used during training!)
TEMPLATE """{{ if .System }}<|im_start|>system
{{ .System }}<|im_end|>
{{ end }}{{ if .Prompt }}<|im_start|>user
{{ .Prompt }}<|im_end|>
{{ end }}<|im_start|>assistant
"""
# Set the stop tokens
PARAMETER stop "<|im_start|>"
PARAMETER stop "<|im_end|>"
# Custom parameters (adjust as needed)
PARAMETER temperature 0.7
PARAMETER top_p 0.9
Create the model:
ollama create primary-chinese -f Modelfile
Verify:
ollama run primary-chinese "请用‘高兴’造个句。"
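Besides the CLI, the running Ollama service also exposes an HTTP API on port 11434. A minimal Python sketch for calling the newly registered model (the model name primary-chinese comes from the create step above; only the standard library is used):
# call_ollama.py -- query the local Ollama API (illustrative sketch)
import json
import urllib.request

payload = {"model": "primary-chinese", "prompt": "请用‘高兴’造个句。", "stream": False}
req = urllib.request.Request(
    "http://localhost:11434/api/generate",
    data=json.dumps(payload).encode("utf-8"),
    headers={"Content-Type": "application/json"},
)
with urllib.request.urlopen(req) as resp:
    print(json.loads(resp.read())["response"])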
At this point the base model has gone through LoRA fine-tuning, preference alignment, and inference testing to become a model of your own, and exporting it to the Ollama format lets you call it directly from Ollama. Thanks for reading!
Common Troubleshooting
| Problem | Cause | Fix |
| --- | --- | --- |
| transformers complains it needs PyTorch >= 2.4 | PyTorch version too old | pip install torch==2.5.1 |
| apply_chat_template returns a list and the tokenizer errors out | newer transformers can return list[str] | add a check: if isinstance(text, list): text = text[0] |
| bitsandbytes fails to install | not supported on macOS | ignore it and skip QLoRA |
| GGUF conversion reports No module named 'sentencepiece' | missing dependency | pip install sentencepiece |
| ollama create reports invalid model name | wrong Modelfile path or the FROM file does not exist | check the GGUF file path and simplify the file name |