Step 2: Build the complete Pipeline
We build a complete Pipeline consisting of three parts:
- Download the data
- Train the model
- Upload the model
To make it easy to store and download datasets, and to save and iterate on trained models, we also provisioned a MinIO object storage service on Tencent Cloud.
MinIO
MinIO is an open-source object storage server. It is compatible with Amazon's S3 protocol, integrates well with Kubernetes, and is designed for cloud-native workloads such as AI.
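For orientation, here is a minimal sketch of talking to MinIO from Python with the minio client package; the endpoint, credentials, and bucket name are placeholders, not values from this project:

from minio import Minio

# Placeholder endpoint and credentials -- substitute your own deployment's values.
client = Minio('minio.example.com:9000',
               access_key='YOUR_ACCESS_KEY',
               secret_key='YOUR_SECRET_KEY')

# Create a bucket if it does not exist yet, then list its contents.
if not client.bucket_exists('mnist'):
    client.make_bucket('mnist')
for obj in client.list_objects('mnist'):
    print(obj.object_name)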
Download the data
First, we upload the data that will later be used for training to MinIO;
as shown in the figure, we upload the handwritten-digit dataset mnist.npz.
Our task is for the load-data component to issue a request to this MinIO server and download the dataset for the training step to use.
Directory structure
load_data.py
from minio import Minio
import argparse
import os
parser = argparse.ArgumentParser(description='')
parser.add_argument('--endpoint',type=str,default=...)
parser.add_argument('--access_key',type=str,default=...)
parser.add_argument('--secret_key',type=str,default=...)
parser.add_argument('--bucket_name',type=str,default=...)
parser.add_argument('--object_name',type=str)
parser.add_argument('--outputDir',type=str,default='./')
args = parser.parse_args()
print('endpoint: ', args.endpoint)
...
# Download the dataset from MinIO.
def downloadData(endpoint, access_key, secret_key, bucket_name, object_name, outputDir):
    client = Minio(endpoint, access_key, secret_key)
    os.makedirs(outputDir, exist_ok=True)
    file_path = os.path.join(outputDir, object_name)
    # bucket_name is the bucket, object_name the object key, file_path the local save path.
    client.fget_object(bucket_name, object_name, file_path)
MINIO_CONF = {
    'endpoint': args.endpoint,
    ...
}
downloadData(**MINIO_CONF)
print("Done!")
Build the image
The docker build, login, and push steps are the same as in part (1) and are not repeated here.
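For reference, a minimal Dockerfile sketch that matches the /workspace paths used by the pipeline code below; the base image is an assumption:

# Assumed base image; any image with Python 3 and pip works.
FROM python:3.8-slim
RUN pip install minio
WORKDIR /workspace
# The component later invokes /workspace/load_data.py, so the script lives there.
COPY load_data.py /workspace/load_data.py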
Train the model
Directory structure
train_model.py
#!/usr/bin/env python
# coding: utf-8
import argparse
import os
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
torch.set_default_tensor_type(torch.DoubleTensor)
parser = argparse.ArgumentParser(description='')
parser.add_argument('--dataDir',type=str,default="./")
parser.add_argument('--object_name',type=str)
parser.add_argument('--model_name',type=str,default="model")
parser.add_argument('--modelDir',type=str,default="./")
parser.add_argument('--n_epochs',type=int,default=1)
parser.add_argument('--learning_rate',type=float,default=1e-2)
parser.add_argument('--train_batch_size',type=int,default=64)
parser.add_argument('--test_batch_size',type=int,default=1024)
args = parser.parse_args()
print('dataDir: ', args.dataDir)
...
class Net(nn.Module):
    def __init__(self, model_name=None):
        super(Net, self).__init__()
        ...
    def forward(self, x):
        ...
class Mnistset(Dataset):
    def __init__(self, x, y):
        self.x = x
        # nll_loss expects integer (long) class labels, not uint8.
        self.y = y.astype('int64')
        # Scale pixel values from [0, 255] to [0, 1].
        self.x = self.x / 255.0
    def __getitem__(self, index):
        return self.x[index], self.y[index]
    def __len__(self):
        return len(self.x)
def load_data(file_path):
    with np.load(file_path) as f:
        x_train, y_train = f['x_train'], f['y_train']
        x_test, y_test = f['x_test'], f['y_test']
    return (x_train, y_train), (x_test, y_test)
def run(dataDir, object_name, model_name, modelDir, n_epochs, learning_rate, train_batch_size, test_batch_size):
    file_path = os.path.join(dataDir, object_name)
    (x_train, y_train), (x_test, y_test) = load_data(file_path)
    train_set = Mnistset(x_train, y_train)
    train_loader = DataLoader(train_set, batch_size=train_batch_size, shuffle=True)
    test_set = Mnistset(x_test, y_test)
    test_loader = DataLoader(test_set, batch_size=test_batch_size, shuffle=False)
    model = Net()
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)
    def _train():
        model.train()
        for batch_ix, (data, target) in enumerate(train_loader):
            optimizer.zero_grad()
            output = model(data)
            loss = F.nll_loss(output, target)
            loss.backward()
            optimizer.step()
    def _test():
        model.eval()
        # The evaluation loop itself is elided in this listing.
    def _save_model():
        os.makedirs(modelDir, exist_ok=True)
        dummy_input = torch.randn(1, 28, 28)
        modelSave_path = os.path.join(modelDir, model_name + ".onnx")
        # Export the trained model in ONNX format.
        torch.onnx.export(model, dummy_input, modelSave_path)
    for epoch in range(1, n_epochs + 1):
        _train()
        _test()
    _save_model()
TRAIN_CONF = {
    'dataDir': args.dataDir,
    ...
}
run(**TRAIN_CONF)
print("Done!")
The remaining steps are not repeated here.
Upload the model
upload_model.py
#!/usr/bin/env python
# coding: utf-8
from minio import Minio
import argparse
import os
parser = argparse.ArgumentParser(description='')
parser.add_argument('--endpoint',type=str,default=...)
parser.add_argument('--access_key',type=str,default=...)
parser.add_argument('--secret_key',type=str,default=...)
parser.add_argument('--bucket_name',type=str,default=...)
parser.add_argument('--modelDir',type=str,default="./")
parser.add_argument('--model_name',type=str,default="model")
args = parser.parse_args()
print('endpoint: ', args.endpoint)
...
# Upload the trained model to MinIO.
def uploadModel(endpoint, access_key, secret_key, bucket_name, modelDir, model_name):
    client = Minio(endpoint, access_key, secret_key)
    model_name += ".onnx"
    modelSave_path = os.path.join(modelDir, model_name)
    # Upload the model file to MinIO under its file name.
    client.fput_object(bucket_name=bucket_name, object_name=model_name, file_path=modelSave_path)
MINIO_CONF = {
    'endpoint': args.endpoint,
    ...
}
uploadModel(**MINIO_CONF)
print("Done!")
The remaining steps are not repeated here.
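To confirm that the upload succeeded, MinIO can be queried for the object's metadata; a minimal sketch, reusing the script's parsed arguments:

# stat_object raises an S3Error if the object is missing.
client = Minio(args.endpoint, args.access_key, args.secret_key)
stat = client.stat_object(args.bucket_name, args.model_name + ".onnx")
print(stat.object_name, stat.size)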
Build the Pipeline
Wrap each component
from functools import partial
import kfp
from kfp import dsl, components
import os
container_op_loadData = partial(
    components.func_to_container_op,  # the approach recommended in KFP v1
    base_image='lifu963/load_data:v1.0',  # the component image we built above
)

@container_op_loadData
def loadData(object_name: str, outputDir: str):
    import subprocess
    import os
    print("Download dataset")
    subprocess.check_call(
        [
            'python',
            '/workspace/load_data.py',
            '--object_name', object_name,
            '--outputDir', outputDir,
        ],
    )
    print("list outputDir: ", os.listdir(outputDir))
    return

container_op_trainModel = partial(
    components.func_to_container_op,
    base_image='lifu963/train_model:v1.1',
)

@container_op_trainModel
def trainModel(object_name: str, dataDir: str, modelName: str, modelDir: str):
    import subprocess
    import os
    print("Train model")
    subprocess.check_call(
        [
            'python',
            '/workspace/train_model.py',
            '--object_name', object_name,
            '--dataDir', dataDir,
            '--model_name', modelName,
            '--modelDir', modelDir,
        ],
    )
    print("list modelDir: ", os.listdir(modelDir))
    return

container_op_uploader = partial(
    components.func_to_container_op,
    base_image='lifu963/upload_model:v1.0',
)

@container_op_uploader
def uploadModel(modelDir: str, model_name: str):
    import subprocess
    print("Upload model")
    subprocess.check_call(
        [
            'python',
            '/workspace/upload_model.py',
            '--modelDir', modelDir,
            '--model_name', model_name,
        ],
    )
    return
Stitch the components into a Pipeline
@dsl.pipeline(
    name='go go go',
    description='',
)
def pipeline(object_name: str,
             model_name: str = 'model',
             dataVolumeSize: str = '100M'):
    createPvc = dsl.VolumeOp(
        name="create-pvc",
        resource_name="my-pvc",
        modes=dsl.VOLUME_MODE_RWO,
        size=dataVolumeSize,
    )
    pvcMountDir = "/tmp/outputs"
    datasetDir = os.path.join(pvcMountDir, "dataset")
    modelDir = os.path.join(pvcMountDir, "model")

    load_data = loadData(object_name, datasetDir)
    load_data.add_pvolumes({pvcMountDir: createPvc.volume})
    load_data.after(createPvc)

    train_model = trainModel(object_name, datasetDir, model_name, modelDir)
    train_model.add_pvolumes({pvcMountDir: createPvc.volume})
    train_model.after(load_data)

    upload_model = uploadModel(modelDir, model_name)
    upload_model.add_pvolumes({pvcMountDir: createPvc.volume})
    upload_model.after(train_model)

kfp.compiler.Compiler().compile(pipeline, 'pipeline.yaml')
This Pipeline creates a PVC volume and mounts it into each component;
the file dependencies are as follows:
- The root path is pvcMountDir: "/tmp/outputs".
- The dataset path is datasetDir: "/tmp/outputs/dataset". The load_data step downloads mnist.npz from MinIO into this directory, so the final path is "/tmp/outputs/dataset/mnist.npz" (datasetDir + object_name).
- The model path is modelDir: "/tmp/outputs/model". The train_model step reads "/tmp/outputs/dataset/mnist.npz", trains the model, and writes the trained model to this directory, so the final path is "/tmp/outputs/model/model.onnx" (modelDir + model_name + '.onnx').
- Finally, the upload_model step takes "/tmp/outputs/model/model.onnx" and uploads it to MinIO, which completes the Pipeline.
Final result
Upload pipeline.yaml, then create an Experiment and a Run.
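Alternatively, the compiled pipeline can be submitted programmatically; a minimal sketch, where the host is a placeholder for your cluster's KFP endpoint:

import kfp

# Placeholder host -- substitute your cluster's Kubeflow Pipelines endpoint.
client = kfp.Client(host='http://<kfp-host>')
# Submit the compiled pipeline.yaml directly as a run.
client.create_run_from_pipeline_package(
    'pipeline.yaml',
    arguments={'object_name': 'mnist.npz'},
)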
All done!
Addendum
A later phase of the project migrated MinIO from Tencent Cloud to a local server (i.e., MinIO deployed on-premises);
the client-connection code changes as follows:
client = Minio(endpoint, access_key, secret_key, secure=False)  # secure defaults to True (HTTPS)
Next steps
- Think about whether there are further ways to improve reusability;
- Evaluate and compare Elyra (a visual pipeline-building approach) with this native approach: similarities, differences, strengths, and drawbacks.