k8s初使用 -2- 部署Python项目/Python脚本

289 阅读4分钟

项目部署

Demo

想实现把Python项目/脚本打包进Docker镜像中,然后通过K8s快速部署项目

1: 准备好项目

需要先拉取所有的代码到mul_fac_eq中

2: 编写Dockerfile

# Base image: official Dask distribution
FROM daskdev/dask:latest

# MAINTAINER has been deprecated since Docker 1.13 -- use a LABEL instead.
LABEL maintainer="syt"

# Create all required directories in a single layer to keep the image small.
RUN mkdir -p /mul_fac_eq \
             /factorsFiles \
             /logs/historyLogs \
             /logs/inspectLogs \
             /ths_data

WORKDIR /mul_fac_eq

# Put the project root on PYTHONPATH so the mul_fac_eq code is importable.
ENV PYTHONPATH=/mul_fac_eq

# COPY is preferred over ADD for plain files/directories (no tar/URL magic).
COPY ./ /mul_fac_eq

# Install dependencies (Tsinghua PyPI mirror, no pip cache).
RUN pip install --no-cache-dir -i https://pypi.tuna.tsinghua.edu.cn/simple -r requirements.txt

3:打包

进入到 mul_fac_eq 目录下执行Dockerfile

# docker build -t 镜像名:版本 路径(.代表当前路径)
docker build -t mydask:v1 .

4:准备K8s的yml文件

dask-scheduler-deployment

apiVersion: apps/v1
kind: Deployment
metadata:
  name: dask-scheduler
spec:
  replicas: 1
  selector:
    matchLabels:
      app: dask-scheduler
  template:
    metadata:
      labels:
        app: dask-scheduler
    spec:
      containers:
      - name: dask-scheduler
        image: mydask:v1
        imagePullPolicy: IfNotPresent
        command: ["dask-scheduler"]
        ports:
        - containerPort: 8786  # scheduler RPC port
        - containerPort: 8787  # dashboard / web UI
      hostAliases:
      # NOTE(review): 192.168.10.211 was listed twice; hostAliases entries
      # must have unique IPs, so its hostnames are consolidated here.
      - ip: "192.168.10.211"
        hostnames:
        - "hadoop102"
        - "m1"
        - "hadoop100"
      - ip: "192.168.10.212"
        hostnames:
        - "hadoop103"
        - "n1"
      - ip: "192.168.10.213"
        hostnames:
        - "hadoop104"
        - "n2"

dask-scheduler-service

apiVersion: v1
kind: Service
metadata:
  name: dask-scheduler
spec:
  type: NodePort
  selector:
    app: dask-scheduler
  ports:
  - name: scheduler-port
    port: 8786
    targetPort: 8786
    nodePort: 30086  # fixed NodePort (only meaningful because type is NodePort)
  - name: web-ui
    port: 8787
    targetPort: 8787
    nodePort: 30087  # fixed NodePort for the Dask dashboard

dask-worker-deployment

apiVersion: apps/v1
kind: Deployment
metadata:
  name: dask-worker
spec:
  replicas: 3  # adjust worker count as needed
  selector:
    matchLabels:
      app: dask-worker
  template:
    metadata:
      labels:
        app: dask-worker
    spec:
      containers:
      - name: dask-worker
        image: mydask:v1
        imagePullPolicy: IfNotPresent
        # Connect to the scheduler through its in-cluster Service DNS name
        # instead of a hard-coded node IP + NodePort; this keeps working when
        # the node IP changes or the scheduler pod is rescheduled.
        command: ["dask-worker", "dask-scheduler:8786"]
      hostAliases:
      # NOTE(review): 192.168.10.211 was listed twice; hostAliases entries
      # must have unique IPs, so its hostnames are consolidated here.
      - ip: "192.168.10.211"
        hostnames:
        - "hadoop102"
        - "m1"
        - "hadoop100"
      - ip: "192.168.10.212"
        hostnames:
        - "hadoop103"
        - "n1"
      - ip: "192.168.10.213"
        hostnames:
        - "hadoop104"
        - "n2"

5:启动集群

按顺序分别启动

# 启动
kubectl apply -f dask-scheduler-deployment.yaml
kubectl apply -f dask-scheduler-service.yaml
kubectl apply -f dask-worker-deployment.yaml

# 查询pod状态
kubectl get pod,svc

# 查询日志
kubectl logs pod名

#增加worker副本数(Deployment的metadata.name是dask-worker,不是文件名)
kubectl scale deployment dask-worker --replicas=6

# 删除集群(三个资源都要删除)
kubectl delete -f dask-worker-deployment.yaml
kubectl delete -f dask-scheduler-service.yaml
kubectl delete -f dask-scheduler-deployment.yaml

6:测试

import time

from dask.distributed import Client

from jy_common.jy_data import StockData
from jy_common.jy_utils import MyDateUtils
from jy_common.jy_utils.MyHDFSUtils import MyHDFSUtils


def square(x, delay=5):
    """Return ``x ** 2`` after sleeping *delay* seconds.

    The sleep simulates a slow task for the dask benchmarks. The original
    hard-coded 5 s pause is now the default of *delay*, so callers (and
    tests) can opt out of the wait without changing existing behavior.
    """
    time.sleep(delay)
    print(f'square--{x}')
    return x ** 2


def neg(x, delay=5):
    """Return ``-x`` after sleeping *delay* seconds.

    The sleep simulates a slow task for the dask benchmarks. The original
    hard-coded 5 s pause is now the default of *delay*, so callers (and
    tests) can opt out of the wait without changing existing behavior.
    """
    time.sleep(delay)
    print(f'neg--{x}')
    return -x


def no_dask():
    """Single-threaded baseline (no dask).

    Squares 0..99999, negates each result, then sums and prints the total
    and the elapsed time. With each task sleeping 5 s this is intentionally
    very slow -- it exists only as a comparison point for the dask versions
    (original note estimated ~1,000,000 s for the full run).
    """
    ts = time.time()
    # Append-loops replaced with comprehensions; the old "# 输出: 15"
    # comment was a leftover from a tiny demo and did not match this input.
    squared = [square(e) for e in range(100000)]
    negated = [neg(v) for v in squared]

    total = sum(negated)
    print(total)

    print('cost time :%s' % (time.time() - ts))


def dask_cluster(scheduler_address='192.168.10.212:30086'):
    """Run the square/neg pipeline on a dask cluster and print the total.

    :param scheduler_address: ``host:port`` of the dask scheduler; the
        default preserves the originally hard-coded NodePort address.

    Original benchmark note: 5 workers, two 5-second tasks per item,
    cost time ~25066 s for the larger run.
    """
    client = Client(scheduler_address)
    try:
        ts = time.time()
        # Stage 1: square every input on the cluster.
        squared = client.map(square, range(10000))
        # Stage 2: negate each squared value (futures chain automatically).
        negated = client.map(neg, squared)
        # Reduce: sum the results on a worker.
        total = client.submit(sum, negated)
        print(total.result())
        print('cost time :%s' % (time.time() - ts))
    finally:
        # Original leaked the connection when a task raised; always close.
        client.close()


def select_sql(query_date):
    """Fetch 1-minute K-line trade data for *query_date* via StockData.

    Logs the date being queried, then delegates to the project's
    ``StockData`` helper with ``cps=["3"]``.
    """
    print(query_date)
    result = StockData.get_stock_trade_only_k_1m(query_date, cps=["3"])
    return result


def select_hdfs(query_date):
    """Load the pickled 1-minute K-line file for *query_date* from HDFS.

    NOTE(review): the path hard-codes the ``2018`` year directory -- looks
    like it assumes *query_date* falls in 2018; confirm against callers.
    """
    hdfs = MyHDFSUtils()
    print(query_date)
    pickle_path = f'/quantify/stock/k_1m/no_rr/pickle/2018/{query_date}.pickle'
    return hdfs.read_files_pickle(pickle_path)


# return query_date

def get_all(data):
    """Print *data* and hand it back unchanged (pass-through reducer)."""
    print(data)
    return data


def dask_cluster_sql(start_date="2024-01-01", end_date="2024-01-10",
                     scheduler_address='192.168.10.212:30086'):
    """Fetch per-trade-date SQL data in parallel on a dask cluster.

    :param start_date: first date of the trade-date range (inclusive).
    :param end_date: last date of the trade-date range (inclusive).
    :param scheduler_address: ``host:port`` of the dask scheduler; the
        default preserves the originally hard-coded NodePort address.

    Defaults reproduce the original hard-coded values. The old docstring
    was copy-pasted from the square/neg benchmark and did not describe
    this function; dead commented-out code was removed.
    """
    client = Client(scheduler_address)
    try:
        ts = time.time()
        trade_date_list = MyDateUtils.get_trade_date_mysql(start_date, end_date)
        # One SQL fetch per trade date, distributed across workers.
        per_date = client.map(select_sql, trade_date_list)
        # get_all just echoes the collected results back.
        total = client.submit(get_all, per_date)
        print(total.result())
        print('cost time :%s' % (time.time() - ts))
    finally:
        # Original leaked the connection when a task raised; always close.
        client.close()


def dask_cluster_hdfs(start_date="2018-01-01", end_date="2018-01-10",
                      scheduler_address='192.168.10.212:30086'):
    """Fetch per-trade-date HDFS pickles in parallel on a dask cluster.

    :param start_date: first date of the trade-date range (inclusive).
    :param end_date: last date of the trade-date range (inclusive).
    :param scheduler_address: ``host:port`` of the dask scheduler; the
        default preserves the originally hard-coded NodePort address.

    Defaults reproduce the original hard-coded values. The old docstring
    was copy-pasted from the square/neg benchmark and did not describe
    this function; dead commented-out code was removed.
    """
    client = Client(scheduler_address)
    try:
        ts = time.time()
        trade_date_list = MyDateUtils.get_trade_date_mysql(start_date, end_date)
        # One HDFS pickle read per trade date, distributed across workers.
        per_date = client.map(select_hdfs, trade_date_list)
        # get_all just echoes the collected results back.
        total = client.submit(get_all, per_date)
        print(total.result())
        print('cost time :%s' % (time.time() - ts))
    finally:
        # Original leaked the connection when a task raised; always close.
        client.close()


def future_(start_date='2018-01-01', end_date='2018-01-10', max_workers=10):
    """Thread-pool baseline: fetch HDFS pickles concurrently (I/O-bound).

    :param start_date: first date of the trade-date range (inclusive).
    :param end_date: last date of the trade-date range (inclusive).
    :param max_workers: thread-pool size (default keeps the original 10).

    Defaults reproduce the original hard-coded values.
    """
    from concurrent.futures import ThreadPoolExecutor

    start_time = time.time()

    # Hoisted out of the pool block: the date lookup doesn't need threads.
    trade_date_list = MyDateUtils.get_trade_date_mysql(start_date, end_date)

    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        # Fan out one fetch per trade date.
        futures = [executor.submit(select_hdfs, d) for d in trade_date_list]
        # Iterate in submission order so output order is deterministic.
        for future in futures:
            print(future.result())

    print(f"程序运行时间:{time.time() - start_time} 秒")


def no_dask_sql(start_date="2022-01-01", end_date="2024-08-10"):
    """Serial baseline: fetch "other data" for each trade date one by one.

    :param start_date: first date of the trade-date range (inclusive).
    :param end_date: last date of the trade-date range (inclusive).
    :returns: list of per-date results (the original built this list and
        silently dropped it; returning it is backward compatible).

    The old "# 执行 square" comment was a copy-paste leftover.
    """
    ts = time.time()

    trade_date_list = MyDateUtils.get_trade_date_mysql(start_date, end_date)
    data_list = [StockData.get_other_data(d, d) for d in trade_date_list]
    print('cost time :%s' % (time.time() - ts))
    return data_list


def add(x):
    """Print *x* and return it unchanged (pass-through reducer for dask)."""
    print(x)
    return x


def dask_single():
    """Run the square/neg pipeline on a local (single-machine) dask client.

    ``Client()`` with no address spins up a local cluster; the original
    never closed it (resource leak), so the close now runs in ``finally``.
    """
    client = Client()
    try:
        ts = time.time()
        squared = client.map(square, range(100000))
        negated = client.map(neg, squared)
        # Reducer: `add` simply echoes the gathered results back.
        total = client.submit(add, negated)
        print(total.result())
        print('cost time :%s' % (time.time() - ts))
    finally:
        client.close()


# Script entry point: runs the HDFS-backed dask-cluster demo by default.
if __name__ == '__main__':
    dask_cluster_hdfs()