MinerU能够将包含图片、公式、表格等元素的多模态PDF、PPT、DOCX等文档转化为易于分析的Markdown格式。
1 克隆MinerU的仓库
git clone https://github.com/opendatalab/MinerU.git
2 cd到projects/web-api
# `git clone` creates the MinerU directory, so the path has to start there.
cd MinerU/projects/web-api
3 在可以访问huggingface的情况下可以直接运行,后续部分步骤可能不适用于这种情况
docker build -t mineru-api .
4 如果无法通过huggingface下载,可以使用modelscope下载模型
修改requirements.txt
# magic-pdf pinned to 1.3.9, installed with its "full" extra.
magic-pdf[full]==1.3.9
# Web framework and ASGI server (the Dockerfile runs FastAPI using Uvicorn).
fastapi
uvicorn
# NOTE(review): presumably required by FastAPI for multipart file uploads — confirm against app.py.
python-multipart
# Imported by download_models.py to fetch the configuration template.
requests>=2.32.3
# Imported by download_models.py (snapshot_download) to download the models.
modelscope>=1.25.0
修改download_models.py,删掉所有旧代码,改成这个
#!/usr/bin/env python
import json
import shutil
import os
import requests
from modelscope import snapshot_download
def download_json(url, timeout=30):
    """Download the JSON document at *url* and return it decoded.

    Args:
        url: Address of the JSON document.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` waits indefinitely, so a stalled connection
            would hang the whole image build.

    Returns:
        The parsed JSON content of the response.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.Timeout: If the server does not answer within *timeout*.
    """
    response = requests.get(url, timeout=timeout)
    # Fail loudly on an error status instead of trying to parse an error page.
    response.raise_for_status()
    return response.json()
def _version_tuple(version):
    """Convert a dotted version such as '1.10.0' into a tuple of ints."""
    numbers = []
    for piece in str(version).split('.'):
        digits = ''
        for ch in piece:
            if ch not in '0123456789':
                break
            digits += ch
        # A component without leading digits (e.g. 'rc1') counts as 0.
        numbers.append(int(digits) if digits else 0)
    # Pad to three components so that '1.2' and '1.2.0' compare as equal.
    while len(numbers) < 3:
        numbers.append(0)
    return tuple(numbers)


def download_and_modify_json(url, local_filename, modifications):
    """Create or refresh a local JSON config and apply *modifications* to it.

    The existing file is reused when its ``config_version`` is at least
    1.2.0; otherwise (or when the file does not exist) the content is
    downloaded from *url*. The entries of *modifications* are then written
    on top of the data and the result is saved to *local_filename*.

    Args:
        url: Location of the JSON template to download when needed.
        local_filename: Path of the configuration file to read and write.
        modifications: Mapping of top-level keys to the values to set.

    Raises:
        OSError: If the local file cannot be read or written.
        json.JSONDecodeError: If the existing local file is not valid JSON.
    """
    if os.path.exists(local_filename):
        # Read with the same encoding the file is written with below, and
        # close the handle deterministically.
        with open(local_filename, encoding='utf-8') as f:
            data = json.load(f)
        config_version = data.get('config_version', '0.0.0')
        # Compare numerically: as plain strings '1.10.0' sorts before '1.2.0'.
        if _version_tuple(config_version) < _version_tuple('1.2.0'):
            data = download_json(url)
    else:
        data = download_json(url)

    # Apply the requested overrides.
    data.update(modifications)

    # Persist the merged configuration.
    with open(local_filename, 'w', encoding='utf-8') as f:
        json.dump(data, f, ensure_ascii=False, indent=4)
def main():
    """Download the model files and write ``magic-pdf.json`` into the home directory."""
    # Only the listed sub-folders of the model repository are fetched; the
    # commented-out entries are disabled.
    wanted_patterns = [
        # "models/Layout/LayoutLMv3/*",
        "models/Layout/YOLO/*",
        "models/MFD/YOLO/*",
        "models/MFR/unimernet_hf_small_2503/*",
        "models/OCR/paddleocr_torch/*",
        # "models/TabRec/TableMaster/*",
        # "models/TabRec/StructEqTable/*",
    ]
    kit_root = snapshot_download(
        'opendatalab/PDF-Extract-Kit-1.0',
        allow_patterns=wanted_patterns,
    )
    layoutreader_model_dir = snapshot_download('ppaanngggg/layoutreader')

    # The model files live in the "models" sub-folder of the snapshot.
    model_dir = kit_root + '/models'
    print(f'model_dir is: {model_dir}')
    print(f'layoutreader_model_dir is: {layoutreader_model_dir}')

    # Disabled: copying the PaddleOCR weights into ~/.paddleocr.
    # paddleocr_model_dir = model_dir + '/OCR/paddleocr'
    # user_paddleocr_dir = os.path.expanduser('~/.paddleocr')
    # if os.path.exists(user_paddleocr_dir):
    #     shutil.rmtree(user_paddleocr_dir)
    # shutil.copytree(paddleocr_model_dir, user_paddleocr_dir)

    # Write the configuration into the home directory, pointing it at the
    # directories that were just downloaded.
    config_file = os.path.join(os.path.expanduser('~'), 'magic-pdf.json')
    download_and_modify_json(
        'https://gcore.jsdelivr.net/gh/opendatalab/MinerU@master/magic-pdf.template.json',
        config_file,
        {
            'models-dir': model_dir,
            'layoutreader-model-dir': layoutreader_model_dir,
        },
    )
    print(f'The configuration file has been configured successfully, the path is: {config_file}')


if __name__ == '__main__':
    main()
5 修改Dockerfile
# ---- base stage: settings shared by the build and prod stages ----
FROM python:3.10-slim-bookworm AS base

WORKDIR /app

# Non-interactive apt, UTF-8 locale, no .pyc files, unbuffered Python output,
# and a pip that neither checks for upgrades nor keeps a download cache.
ENV DEBIAN_FRONTEND=noninteractive \
    LANG=C.UTF-8 \
    PYTHONDONTWRITEBYTECODE=1 \
    PYTHONUNBUFFERED=1 \
    PIP_DISABLE_PIP_VERSION_CHECK=1 \
    PIP_NO_CACHE_DIR=1
# ---- build stage: install Python dependencies and download the models ----
FROM base AS build

# Switch apt to the Aliyun mirror. Debian bookworm images keep their default
# sources in /etc/apt/sources.list.d/debian.sources, so writing
# /etc/apt/sources.list alone would leave deb.debian.org enabled as well;
# remove that file so the mirror is the only source.
RUN echo "deb http://mirrors.aliyun.com/debian/ bookworm main non-free non-free-firmware" > /etc/apt/sources.list && \
    echo "deb http://mirrors.aliyun.com/debian-security/ bookworm-security main" >> /etc/apt/sources.list && \
    rm -f /etc/apt/sources.list.d/debian.sources

# Update the package list and install necessary packages
RUN apt-get update && \
    apt-get install -y --no-install-recommends \
        build-essential && \
    apt-get clean && \
    rm -rf /var/lib/apt/lists/*

# Build Python dependencies inside a virtual environment, using the Tsinghua PyPI mirror
COPY requirements.txt .
RUN python -m venv /app/venv && \
    . /app/venv/bin/activate && \
    pip install -r requirements.txt -i https://pypi.tuna.tsinghua.edu.cn/simple
# pip uninstall -y paddlepaddle && \
# pip install -i https://www.paddlepaddle.org.cn/packages/stable/cu118/ \
# paddlepaddle-gpu==3.0.0rc1

# Download models. The script is run through the interpreter so the build does
# not depend on download_models.py having the executable bit set on the host.
COPY download_models.py .
RUN . /app/venv/bin/activate && \
    python download_models.py
# ---- prod stage: runtime image ----
FROM base AS prod

# Copy Python dependencies and models from the build stage
COPY --from=build /app/venv /app/venv
COPY --from=build /root/.cache/modelscope/hub/models/opendatalab/PDF-Extract-Kit-1___0/models /root/.cache/modelscope/hub/models/opendatalab/PDF-Extract-Kit-1___0/models
COPY --from=build /root/.cache/modelscope/hub/models/ppaanngggg/layoutreader /root/.cache/modelscope/hub/models/ppaanngggg/layoutreader

# Switch apt to the Aliyun mirror. Debian bookworm images keep their default
# sources in /etc/apt/sources.list.d/debian.sources, so writing
# /etc/apt/sources.list alone would leave deb.debian.org enabled as well;
# remove that file so the mirror is the only source.
RUN echo "deb http://mirrors.aliyun.com/debian/ bookworm main non-free non-free-firmware" > /etc/apt/sources.list && \
    echo "deb http://mirrors.aliyun.com/debian-security/ bookworm-security main" >> /etc/apt/sources.list && \
    rm -f /etc/apt/sources.list.d/debian.sources

# Update the package list and install the runtime shared libraries
RUN apt-get update && \
    apt-get install -y --no-install-recommends \
        libgl1 \
        libglib2.0-0 \
        libgomp1 && \
    apt-get clean && \
    rm -rf /var/lib/apt/lists/*

# Create volume for paddleocr models
# RUN mkdir -p /root/.paddleocr
# VOLUME [ "/root/.paddleocr" ]

# Copy the app and its configuration file (magic-pdf.json comes from the build stage)
COPY entrypoint.sh /app/entrypoint.sh
COPY --from=build /root/magic-pdf.json /root/magic-pdf.json
COPY app.py /app/app.py

# Expose the port that FastAPI will run on
EXPOSE 8000

# Command to run FastAPI using Uvicorn, pointing to app.py and binding to 0.0.0.0:8000
ENTRYPOINT [ "/app/entrypoint.sh" ]
CMD ["--host", "0.0.0.0", "--port", "8000"]
再运行docker build -t mineru-api .
5 编写docker-compose.yml
# Nesting matters in YAML: the service must be indented under `services`,
# and its settings under the service name.
services:
  mineru-api:
    image: mineru-api
    ports:
      - "8000:8000"
    stdin_open: true  # same as the -i flag of `docker run`
    tty: true  # same as the -t flag of `docker run`
    runtime: nvidia  # use the NVIDIA container runtime
    environment:
      - NVIDIA_VISIBLE_DEVICES=all  # give the container access to every GPU
6 启动
docker compose up -d
如果因为没有nvidia的runtime而无法启动,先备份自己的daemon.json文件(位于/etc/docker/目录下),再通过yum install -y nvidia-docker2安装。这个命令会生成一个新的daemon.json文件,覆盖掉原来的旧文件;安装之后比较新旧两个daemon.json文件,把两者的配置合并在一起。
systemctl daemon-reload
systemctl restart docker
docker compose up -d
7 修改容器内的magic-pdf.json文件,把"device-mode"的值由默认的"cpu"改为"cuda",以使用cuda模式
# Open a shell in the running service container. Run this from the directory
# that contains docker-compose.yml; addressing the container by its compose
# service name avoids a machine-specific container ID.
docker compose exec mineru-api /bin/sh
apt-get update
apt-get install -y vim
vim /root/magic-pdf.json
重启一下
docker compose restart
8 在浏览器中访问http://localhost:8000/docs,查看接口文档
9 调用api