How to Run Your First Pipeline on KubeFlow (Part 2)

Step 2: Build the Complete Pipeline

A complete Pipeline here consists of three parts:

  • Download the data
  • Train the model
  • Upload the model

To make it easier to store and download datasets, and to save and iterate on trained models, we also provisioned a MinIO object-storage service on Tencent Cloud.

MinIO

MinIO is an open-source object-storage server compatible with Amazon's S3 API. It works well with Kubernetes and is designed for cloud-native workloads such as AI.

Download the data

First, we upload the data that will later be used for model training to MinIO.

Here we upload the MNIST handwritten-digit dataset, mnist.npz.
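
If you want to perform this upload from code rather than by hand, a minimal sketch using the minio Python client looks like the following; the endpoint, credentials, and bucket name are placeholders, not the values used in this project.

from minio import Minio

# Placeholder endpoint and credentials -- substitute your own MinIO values
# (add secure=False if your endpoint is plain HTTP).
client = Minio('your-minio-endpoint:9000', access_key='YOUR_ACCESS_KEY', secret_key='YOUR_SECRET_KEY')

bucket_name = 'datasets'  # placeholder bucket name
if not client.bucket_exists(bucket_name):
    client.make_bucket(bucket_name)

# Upload the local file ./mnist.npz as object "mnist.npz"
client.fput_object(bucket_name, 'mnist.npz', './mnist.npz')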

The job of the load-data component is to connect to this MinIO server and download the dataset so it can be used for model training.

Directory structure

load_data.py

from minio import Minio
import argparse
import os


parser = argparse.ArgumentParser(description='')


parser.add_argument('--endpoint',type=str,default=...)
parser.add_argument('--access_key',type=str,default=...)
parser.add_argument('--secret_key',type=str,default=...)
parser.add_argument('--bucket_name',type=str,default=...)
parser.add_argument('--object_name',type=str)
parser.add_argument('--outputDir',type=str,default='./')


args = parser.parse_args()
print('endpoint: ', args.endpoint)
...

# Download the dataset from MinIO
def downloadData(endpoint,access_key,secret_key,bucket_name,object_name,outputDir):
    client = Minio(endpoint, access_key, secret_key)
    os.makedirs(outputDir, exist_ok=True)
    file_path = os.path.join(outputDir,object_name)
    client.fget_object(bucket_name,object_name,file_path) # bucket_name: bucket, object_name: object key, file_path: local path for the downloaded file


MINIO_CONF = {
    'endpoint': args.endpoint,
    ...
}


downloadData(**MINIO_CONF)
print("Done!")

Build the image

The docker build, docker login, and docker push steps are covered in Part 1 and are not repeated here.
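
For reference, a minimal Dockerfile sketch for the load-data image might look like the lines below. The base image and dependency list are assumptions; the only hard requirement is that the script ends up at /workspace/load_data.py, since that is the path the component wrapper invokes later in the Pipeline definition.

# Sketch only -- base image and packages are assumptions, not the Dockerfile from Part 1
FROM python:3.8-slim
WORKDIR /workspace
RUN pip install minio
COPY load_data.py /workspace/load_data.py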

Train the model

Directory structure

train_model.py

#!/usr/bin/env python
# coding: utf-8


import argparse
import os
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader


torch.set_default_tensor_type(torch.DoubleTensor)


parser = argparse.ArgumentParser(description='')
parser.add_argument('--dataDir',type=str,default="./")
parser.add_argument('--object_name',type=str)
parser.add_argument('--model_name',type=str,default="model")
parser.add_argument('--modelDir',type=str,default="./")
parser.add_argument('--n_epochs',type=int,default=1)
parser.add_argument('--learning_rate',type=float,default=1e-2)
parser.add_argument('--train_batch_size',type=int,default=64)
parser.add_argument('--test_batch_size',type=int,default=1024)


args = parser.parse_args()
print('dataDir: ', args.dataDir)
...


class Net(nn.Module):
    def __init__(self,model_name=None):
        super(Net, self).__init__()
        ...
        
    def forward(self, x):
        ...

    
class Mnistset(Dataset):
    def __init__(self,x,y):
        self.x = x
        self.y = y
        
        self.x = self.x/255.0  # normalize pixel values to [0, 1]
    
    def __getitem__(self, index):
        return self.x[index], self.y[index]
    
    def __len__(self):
        return len(self.x)


# Read the train/test splits from the mnist.npz archive
def load_data(file_path):
    with np.load(file_path) as f:
        x_train, y_train = f['x_train'], f['y_train']
        x_test, y_test = f['x_test'], f['y_test']
    return (x_train, y_train), (x_test, y_test)


def run(dataDir,object_name,model_name,modelDir,n_epochs,learning_rate,train_batch_size,test_batch_size):
    file_path = os.path.join(dataDir,object_name)
    (x_train, y_train), (x_test, y_test) = load_data(file_path)
    
    train_set = Mnistset(x_train, y_train)
    train_loader = DataLoader(train_set,batch_size=train_batch_size,shuffle=True)
    
    test_set = Mnistset(x_test, y_test)
    test_loader = DataLoader(test_set,batch_size=test_batch_size,shuffle=False)
    
    model = Net()
    optimizer = optim.Adam(model.parameters(),lr=learning_rate)
    
    def _train():
        model.train()
        for batch_ix, (data, target) in enumerate(train_loader):
            optimizer.zero_grad()
            output = model(data)
            loss = F.nll_loss(output, target)
            loss.backward()
            optimizer.step()
        
    def _test():
        model.eval()
        
    def _save_model():
        os.makedirs(modelDir, exist_ok=True)
        dummy_input = torch.randn(1, 28, 28)  # dummy input shaped like one 28x28 sample, used to trace the export
        modelSave_path = os.path.join(modelDir,model_name+".onnx")
        torch.onnx.export(model,dummy_input,modelSave_path) # export the trained model in ONNX format
    
    for epoch in range(1, n_epochs+1):
        _train()
        _test()
    
    _save_model()


TRAIN_CONF ={
    'dataDir':args.dataDir,
    ...
}


run(**TRAIN_CONF)
print("Done!")
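
The bodies of Net.__init__ and forward are elided above. Purely as an illustration (not the network actually used here), a minimal model that would fit this training loop could look like the sketch below: since the loop calls F.nll_loss, forward must return log-probabilities, and the integer labels loaded from mnist.npz may need casting to long depending on how the elided parts handle them.

# Illustrative sketch only -- not the author's actual network
class Net(nn.Module):
    def __init__(self, model_name=None):
        super(Net, self).__init__()
        self.fc1 = nn.Linear(28 * 28, 128)
        self.fc2 = nn.Linear(128, 10)

    def forward(self, x):
        x = x.view(x.size(0), -1)                 # flatten (N, 28, 28) -> (N, 784)
        x = F.relu(self.fc1(x))
        return F.log_softmax(self.fc2(x), dim=1)  # log-probabilities for nll_loss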

The remaining steps are the same as before and are not repeated here.

Upload the model

upload_model.py

#!/usr/bin/env python
# coding: utf-8


from minio import Minio
import argparse
import os


parser = argparse.ArgumentParser(description='')

parser.add_argument('--endpoint',type=str,default=...)
parser.add_argument('--access_key',type=str,default=...)
parser.add_argument('--secret_key',type=str,default=...)
parser.add_argument('--bucket_name',type=str,default=...)
parser.add_argument('--modelDir',type=str,default="./")
parser.add_argument('--model_name',type=str,default="model")


args = parser.parse_args()
print('endpoint: ', args.endpoint)
...


def uploadModel(endpoint,access_key,secret_key,bucket_name,modelDir,model_name):
    client = Minio(endpoint,access_key,secret_key)
    model_name+=".onnx"
    modelSave_path = os.path.join(modelDir,model_name)
    client.fput_object(bucket_name=bucket_name,object_name=model_name,file_path=modelSave_path) # upload the model file to MinIO
    

MINIO_CONF = {
    'endpoint': args.endpoint,
    ...
}


uploadModel(**MINIO_CONF)
print("Done!")

The remaining steps are the same as before and are not repeated here.

Build the Pipeline

Wrap each component

from functools import partial
import kfp
from kfp import dsl, components
import os

container_op_loadData = partial(
    components.func_to_container_op, # the method recommended by the KFP v1 SDK
    base_image='lifu963/load_data:v1.0', # the load-data image we built above
)

@container_op_loadData
def loadData(object_name:str,outputDir:str):
    import subprocess
    import os
    print("Download dataset")
    subprocess.check_call(
        [
            'python',
            '/workspace/load_data.py',
            '--object_name', object_name,
            '--outputDir', outputDir,
        ],
    )
    print("list outputDir: ", os.listdir(outputDir))
    return
    
container_op_trainModel = partial(
    components.func_to_container_op,
    base_image='lifu963/train_model:v1.1',
)
    
@container_op_trainModel
def trainModel(object_name:str,dataDir:str,modelName:str,modelDir:str):
    import subprocess
    import os
    print("Train model")
    subprocess.check_call(
        [
            'python',
            '/workspace/train_model.py',
            '--object_name', object_name,
            '--dataDir', dataDir,
            '--model_name', modelName,
            '--modelDir', modelDir,
        ],
    )
    print("list modelDir: ",os.listdir(modelDir))
    return    
    
container_op_uploader = partial(
    components.func_to_container_op,
    base_image='lifu963/upload_model:v1.0',
)

@container_op_uploader
def uploadModel(modelDir:str,model_name:str):
    import subprocess
    print("Upload model")
    subprocess.check_call(
        [
            'python',
            '/workspace/upload_model.py',
            '--modelDir',modelDir,
            '--model_name',model_name,
        ],
    )
    return

Assemble the components into a Pipeline

@dsl.pipeline(
    name='go go go',
    description='',
)
def pipeline(object_name:str,
             model_name:str = 'model',
             dataVolumeSize: str = '100M',):
             
    createPvc = dsl.VolumeOp(
        name="create-pvc",
        resource_name="my-pvc",
        modes=dsl.VOLUME_MODE_RWO,
        size=dataVolumeSize
    )
    pvcMountDir = "/tmp/outputs"
    datasetDir = os.path.join(pvcMountDir, "dataset")
    modelDir = os.path.join(pvcMountDir,"model")
    
    load_data = loadData(object_name,datasetDir)
    load_data.add_pvolumes({pvcMountDir:createPvc.volume})
    load_data.after(createPvc)
    
    train_model = trainModel(object_name,datasetDir,model_name,modelDir)
    train_model.add_pvolumes({pvcMountDir:createPvc.volume})
    train_model.after(load_data)
    
    upload_model = uploadModel(modelDir,model_name)
    upload_model.add_pvolumes({pvcMountDir:createPvc.volume})
    upload_model.after(train_model)
    
kfp.compiler.Compiler().compile(pipeline, 'pipeline.yaml')

This Pipeline creates a PersistentVolumeClaim (PVC) and mounts the resulting volume into every component.

The files depend on each other as follows:

  1. The root mount path is pvcMountDir: "/tmp/outputs".
  2. The dataset path is datasetDir: "/tmp/outputs/dataset". The load_data step downloads mnist.npz from MinIO into this directory, so the final path is "/tmp/outputs/dataset/mnist.npz" (datasetDir + object_name).
  3. The model path is modelDir: "/tmp/outputs/model". The train_model step reads "/tmp/outputs/dataset/mnist.npz", trains the model, and writes the result to this directory, so the final path is "/tmp/outputs/model/model.onnx" (modelDir + model_name + '.onnx').
  4. Finally, the upload_model step takes "/tmp/outputs/model/model.onnx" and uploads it to MinIO, which completes the Pipeline.

Final result

Upload pipeline.yaml, then create an Experiment and a Run.
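
Alternatively, the compiled pipeline can be submitted directly from Python with the KFP client; a minimal sketch is shown below, where the host URL and experiment name are placeholders.

import kfp

# Placeholder host -- point this at your KubeFlow Pipelines endpoint
client = kfp.Client(host='http://<your-kubeflow-pipelines-host>')

client.create_run_from_pipeline_package(
    'pipeline.yaml',
    arguments={'object_name': 'mnist.npz'},  # pipeline parameters
    experiment_name='mnist-demo',            # placeholder experiment name
)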

And we're done!

Later on, the project migrated MinIO from Tencent Cloud to a local machine (i.e., MinIO deployed on our own server).

The client connection code changes accordingly: client = Minio(endpoint, access_key, secret_key, secure=False)  # secure defaults to True (HTTPS)

Next steps

  • Think about whether the components can be made even more reusable;
  • Evaluate Elyra (a visual way to build Pipelines) against this native approach and compare their similarities, differences, strengths, and weaknesses.