项目部署
Demo
想实现把Python项目/脚本打包进Docker镜像中,然后通过K8s快速部署并运行项目
1: 准备好项目
需要先拉取所有的代码到mul_fac_eq中
2: 编写Dockerfile
# Base image: official dask image with the distributed scheduler/worker preinstalled
FROM daskdev/dask:latest
# MAINTAINER is deprecated since Docker 1.13; use a LABEL instead
LABEL maintainer="syt"
# Create all required directories in a single layer (fewer layers -> smaller image)
RUN mkdir -p /mul_fac_eq \
             /factorsFiles \
             /logs/historyLogs \
             /logs/inspectLogs \
             /ths_data
WORKDIR /mul_fac_eq
# Put the project root on PYTHONPATH so the mul_fac_eq code is importable
ENV PYTHONPATH=/mul_fac_eq
# COPY is preferred over ADD for plain files/directories (ADD adds tar/URL magic)
COPY ./ /mul_fac_eq
# Install dependencies from the Tsinghua PyPI mirror without keeping a pip cache
RUN pip install --no-cache-dir -i https://pypi.tuna.tsinghua.edu.cn/simple -r requirements.txt
3:打包
进入到 mul_fac_eq 目录下,基于该目录中的Dockerfile构建镜像
# docker build -t <image-name>:<tag> <build-context>  ('.' = current directory)
docker build -t mydask:v1 .
4:准备K8s的yml文件
dask-scheduler-deployment
# Deployment running a single dask scheduler pod.
# (Indentation reconstructed — the pasted manifest had lost its YAML structure.)
apiVersion: apps/v1
kind: Deployment
metadata:
  name: dask-scheduler
spec:
  replicas: 1
  selector:
    matchLabels:
      app: dask-scheduler
  template:
    metadata:
      labels:
        app: dask-scheduler
    spec:
      containers:
        - name: dask-scheduler
          image: mydask:v1
          imagePullPolicy: IfNotPresent
          command: ["dask-scheduler"]
          ports:
            - containerPort: 8786   # scheduler RPC port
            - containerPort: 8787   # dashboard / web UI
      # Static /etc/hosts entries for the Hadoop nodes the job code resolves by name
      # NOTE(review): 192.168.10.211 appears twice; consider merging "hadoop100"
      # into the first entry's hostnames list.
      hostAliases:
        - ip: "192.168.10.211"
          hostnames:
            - "hadoop102"
            - "m1"
        - ip: "192.168.10.212"
          hostnames:
            - "hadoop103"
            - "n1"
        - ip: "192.168.10.213"
          hostnames:
            - "hadoop104"
            - "n2"
        - ip: "192.168.10.211"
          hostnames:
            - "hadoop100"
dask-scheduler-service
# NodePort Service exposing the scheduler RPC port and web UI outside the cluster.
# (Indentation reconstructed — the pasted manifest had lost its YAML structure.)
apiVersion: v1
kind: Service
metadata:
  name: dask-scheduler
spec:
  type: NodePort
  selector:
    app: dask-scheduler
  ports:
    - name: scheduler-port
      port: 8786
      targetPort: 8786
      nodePort: 30086   # fixed NodePort; required only because type is NodePort
    - name: web-ui
      port: 8787
      targetPort: 8787
      nodePort: 30087   # fixed NodePort; required only because type is NodePort
dask-worker-deployment
# Deployment running the dask workers; scale replicas to add capacity.
# (Indentation reconstructed — the pasted manifest had lost its YAML structure.)
apiVersion: apps/v1
kind: Deployment
metadata:
  name: dask-worker
spec:
  replicas: 3   # adjust worker count as needed
  selector:
    matchLabels:
      app: dask-worker
  template:
    metadata:
      labels:
        app: dask-worker
    spec:
      containers:
        - name: dask-worker
          image: mydask:v1
          imagePullPolicy: IfNotPresent
          # NOTE(review): this connects via a node IP + NodePort; from inside the
          # cluster "tcp://dask-scheduler:8786" (the Service DNS name) would avoid
          # the external hop — confirm before changing.
          command: ["dask-worker", "192.168.10.212:30086"]
      # Static /etc/hosts entries for the Hadoop nodes the job code resolves by name
      hostAliases:
        - ip: "192.168.10.211"
          hostnames:
            - "hadoop102"
            - "m1"
        - ip: "192.168.10.212"
          hostnames:
            - "hadoop103"
            - "n1"
        - ip: "192.168.10.213"
          hostnames:
            - "hadoop104"
            - "n2"
        - ip: "192.168.10.211"
          hostnames:
            - "hadoop100"
5:启动集群
按顺序分别启动
# 启动
kubectl apply -f dask-scheduler-deployment.yaml
kubectl apply -f dask-scheduler-service.yaml
kubectl apply -f dask-worker-deployment.yaml
# 查询pod状态
kubectl get pod,svc
# 查询日志
kubectl logs pod名
#增加work
kubectl scale deployment dask-worker-deployment --replicas=6
# 删除集群
kubectl delete -f dask-worker-deployment.yaml
6:测试
import time
from dask.distributed import Client
from jy_common.jy_data import StockData
from jy_common.jy_utils import MyDateUtils
from jy_common.jy_utils.MyHDFSUtils import MyHDFSUtils
def square(x):
    """Return x squared after a 5-second pause (simulates a slow task)."""
    time.sleep(5)
    print(f'square--{x}')
    result = x ** 2
    return result
def neg(x):
    """Return the negation of x after a 5-second pause (simulates a slow task)."""
    time.sleep(5)
    print(f'neg--{x}')
    result = -x
    return result
def no_dask():
    """Single-threaded baseline: square then negate 100,000 values without dask.

    Each of the two tasks sleeps 5 s per item, so a full run takes about
    2 * 100000 * 5 seconds (the original notes recorded ~1,000,000 s).
    NOTE(review): the original inline comment claimed the sum prints 15,
    which does not match this input size.
    """
    start = time.time()
    squares = [square(i) for i in range(100000)]
    negatives = [neg(v) for v in squares]
    total = sum(negatives)
    print(total)
    print('cost time :%s' % (time.time() - start))
def dask_cluster():
    """Run the square -> neg -> sum pipeline on the dask cluster.

    Connects through the scheduler's NodePort. The original notes recorded
    ~25066 s for 5 workers with 5-second tasks.
    """
    client = Client('192.168.10.212:30086')
    start = time.time()
    # Fan out the two stages across the workers, then reduce with sum()
    squared = client.map(square, range(10000))
    negated = client.map(neg, squared)
    reduced = client.submit(sum, negated)
    print(reduced.result())
    print('cost time :%s' % (time.time() - start))
    client.close()
def select_sql(query_date):
    """Fetch the 1-minute K-line trade data for query_date (cps=["3"]) via SQL."""
    print(query_date)
    data = StockData.get_stock_trade_only_k_1m(query_date, cps=["3"])
    return data
def select_hdfs(query_date):
    """Load the pickled 1-minute K-line file for query_date from HDFS."""
    hdfs = MyHDFSUtils()
    print(query_date)
    return hdfs.read_files_pickle(f'/quantify/stock/k_1m/no_rr/pickle/2018/{query_date}.pickle')
def get_all(data):
    """Print the gathered results and return them unchanged (pass-through aggregator)."""
    print(data)
    return data
def dask_cluster_sql():
    """Fan SQL K-line queries for 2024-01-01..2024-01-10 out across the dask cluster."""
    client = Client('192.168.10.212:30086')
    start = time.time()
    # One task per trade date, then a single gather step
    dates = MyDateUtils.get_trade_date_mysql("2024-01-01", "2024-01-10")
    per_date = client.map(select_sql, dates)
    gathered = client.submit(get_all, per_date)
    print(gathered.result())
    print('cost time :%s' % (time.time() - start))
    client.close()
def dask_cluster_hdfs():
    """Fan HDFS pickle reads for 2018-01-01..2018-01-10 out across the dask cluster."""
    client = Client('192.168.10.212:30086')
    start = time.time()
    # One task per trade date, then a single gather step
    dates = MyDateUtils.get_trade_date_mysql("2018-01-01", "2018-01-10")
    per_date = client.map(select_hdfs, dates)
    gathered = client.submit(get_all, per_date)
    print(gathered.result())
    print('cost time :%s' % (time.time() - start))
    client.close()
def future_():
    """Read the 2018-01-01..2018-01-10 HDFS pickles concurrently with 10 threads."""
    from concurrent.futures import ThreadPoolExecutor
    start_time = time.time()
    with ThreadPoolExecutor(max_workers=10) as pool:
        dates = MyDateUtils.get_trade_date_mysql('2018-01-01', '2018-01-10')
        # Submit one task per date; collect results in submission order
        tasks = [pool.submit(select_hdfs, d) for d in dates]
        for task in tasks:
            print(task.result())
    print(f"程序运行时间:{time.time() - start_time} 秒")
def no_dask_sql():
    """Serially fetch 'other data' for every trade date in 2022-01-01..2024-08-10 (timing baseline)."""
    ts = time.time()
    dates = MyDateUtils.get_trade_date_mysql("2022-01-01", "2024-08-10")
    # Results are collected but unused; only the elapsed time matters here
    collected = [StockData.get_other_data(d, d) for d in dates]
    print('cost time :%s' % (time.time() - ts))
def add(x):
    """Print x and return it unchanged (pass-through result handler for dask)."""
    print(x)
    return x
def dask_single():
    """Run the square -> neg pipeline on a local single-machine dask client."""
    client = Client()
    ts = time.time()
    squared = client.map(square, range(100000))
    negated = client.map(neg, squared)
    # Aggregate: any custom callable can post-process the gathered results
    combined = client.submit(add, negated)
    print(combined.result())
    print('cost time :%s' % (time.time() - ts))
if __name__ == '__main__':
    # Demo entry point: read the 2018-01 HDFS pickles through the dask cluster
    dask_cluster_hdfs()