Covers time series, Kafka, concurrency, and related topics.
1. Sending and receiving data with confluent-kafka
import confluent_kafka
from confluent_kafka import Consumer, KafkaError, Producer
import uuid
import json
from multiprocessing import Process
import time

def read_kafka(server_add, topic, offset, group=None, size=8000000000):
    # Generator that yields raw message values from a Kafka topic.
    # When no group is given, a random group.id is used so each run starts fresh.
    c = confluent_kafka.Consumer({
        'bootstrap.servers': server_add,
        'group.id': group if group is not None else str(uuid.uuid1()),
        'auto.offset.reset': offset
    })
    c.subscribe([topic])
    while size > 0:
        msg = c.poll(1.0)
        if msg is None:
            continue
        size -= 1
        if msg.error():
            print("Consumer error:", msg.error())
            continue
        yield msg.value()
    c.close()
def send_kafka(keys, val):
    # Produce a single key/value message to a fixed topic.
    conf = {
        'bootstrap.servers': '21.33.12.84:9092',
        # 'client.id':
    }
    producer = Producer(conf)
    topic = '12345678'
    key = keys
    value = val
    # value = json.dumps(value).encode("utf8")  # optionally encode the value as bytes
    producer.produce(topic=topic, key=key, value=value)
    producer.flush()
def read_to_txt(server_add, topic, offset, save_name):
    # Consume messages and append them to a text file, flushing about once per second.
    fout = open(save_name, 'w')
    flush_time = time.time() + 1
    for msg in read_kafka(server_add, topic, offset):
        fout.write(msg.decode("utf8"))
        fout.write("\n")
        if time.time() >= flush_time:
            fout.flush()
            flush_time = time.time() + 1
    fout.flush()
    fout.close()
if __name__ == "__main__":
    process_list = []
    offset = 'latest'
    topic_save = [('21.33.200.97:9092', 'topic', 'save_data/df.txt')]
    for addr, topic, f in topic_save:
        p = Process(target=read_to_txt, args=(addr, topic, offset, f))
        process_list.append(p)
    for p in process_list:
        p.start()
    for p in process_list:
        p.join()
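A minimal sketch of the producing side with a delivery callback; the broker address and topic reuse the placeholders above, and the callback pattern is standard confluent_kafka usage for confirming that each message was acknowledged.
from confluent_kafka import Producer
import json
import time

def delivery_report(err, msg):
    # Called once per message after the broker acknowledges (or rejects) it.
    if err is not None:
        print("Delivery failed:", err)
    else:
        print("Delivered to", msg.topic(), "partition", msg.partition())

producer = Producer({'bootstrap.servers': '21.33.12.84:9092'})  # placeholder broker address from above
for i in range(10):
    payload = json.dumps({'i': i, 'time': int(time.time() * 1000)}).encode('utf8')
    producer.produce(topic='12345678', key=str(i), value=payload, callback=delivery_report)
    producer.poll(0)  # serve pending delivery callbacks
producer.flush()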
Time conversion
# Convert a millisecond timestamp to a formatted datetime string
import time
start_time = 1678173258175  # ms
to_time = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(start_time/1000))
print(to_time)
# 2023-03-07 15:14:18
# Convert a formatted datetime string back to a millisecond timestamp
end_time = "2023-03-07 15:14:18"
to_time = time.strptime(end_time, "%Y-%m-%d %H:%M:%S")
end = int(time.mktime(to_time) * 1000)
print(end)
# 1678173258000
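The same conversion can also be done with pandas, which is convenient when converting whole columns; a small sketch (not from the original note) using pd.to_datetime and Timestamp.value. Note that unlike time.localtime above, the result is timezone-naive UTC.
import pandas as pd
ts = pd.to_datetime(1678173258175, unit='ms')  # Timestamp('2023-03-07 07:14:18.175000'), UTC-naive
back_ms = ts.value // 10**6                    # back to epoch milliseconds
# For a DataFrame column: df['time'] = pd.to_datetime(df['time'], unit='ms')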
# Generating dates
import numpy as np
import pandas as pd
# 1. Generate dates over an explicit range
date = pd.date_range('4/1/2023', '5/1/2023', freq='D')
# 2. Specify only a start or an end date plus a number of periods, at daily frequency
date = pd.date_range(start='4/1/2023', periods=5, freq='D')
date = pd.date_range(end='4/1/2023', periods=5, freq='D')
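The generated ranges combine naturally with the timestamp conversion above; a sketch (not in the original note) that builds a minute-frequency index and turns it into epoch milliseconds, the unit used by the Kafka payloads here.
idx = pd.date_range(start='4/1/2023', periods=5, freq='min')
epoch_ms = (idx - pd.Timestamp("1970-01-01")) // pd.Timedelta(milliseconds=1)
print(list(epoch_ms))  # e.g. [1680307200000, 1680307260000, ...]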
Keeping the most recent 5 minutes of Kafka time-series data in memory
import datetime
import pandas as pd

def save_history_data(T):
    # Keep a rolling window of the last T minutes of messages in the global dict
    # history_data_trace, keyed by message timestamp (seconds).
    group = None
    conf = {
        'bootstrap.servers': '21.33.200.97:9092',
        'group.id': group if group is not None else str(uuid.uuid1()),
        'auto.offset.reset': 'latest'
    }
    consumer = Consumer(conf)
    topic = "xxx"
    consumer.subscribe([topic])
    history_interval = datetime.timedelta(minutes=T)
    last_timestamp = int(time.time() * 1000)  # ms, to match the 'time' field in the payload
    global history_data_trace
    while True:
        msg = consumer.poll(1.0)
        if msg is None:
            continue
        if msg.error():
            if msg.error().code() == KafkaError._PARTITION_EOF:
                print('Reached end of partition')
            else:
                print('running...')
        else:
            key = msg.key()
            value = msg.value()
            timestamp = msg.timestamp()[1] / 1000.0  # broker timestamp in seconds
            history_data_trace[timestamp] = value
            # Evict entries that have fallen out of the rolling window.
            expire_timestamp = time.time() - history_interval.total_seconds()
            for ts in list(history_data_trace.keys()):
                if ts < expire_timestamp:
                    del history_data_trace[ts]
            try:
                history_trace = [json.loads(log) for log in history_data_trace.values()]
                if len(history_trace) <= 0:
                    continue
                df_timestamp = int(history_trace[-1]['time'])  # ms
                if (df_timestamp - last_timestamp) > 120000 and last_timestamp != 0:
                    global trace
                    trace = pd.json_normalize(history_trace)
                    ...
            except Exception:
                pass
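A minimal sketch (not in the original note) of launching the rolling-window consumer, assuming history_data_trace is a multiprocessing.Manager dict created before the process starts, as the next section does; with the default fork start method on Linux the child process sees the module-level global.
from multiprocessing import Manager, Process

if __name__ == "__main__":
    history_data_trace = Manager().dict()  # shared rolling window
    p = Process(target=save_history_data, args=(5,))  # keep the last 5 minutes
    p.start()
    p.join()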
Sharing global variables across processes
from multiprocessing import Manager
from multiprocessing import Process

def consumer_trace(history_data_pod, history_sys):
    pass

def consumer_pod(history_data_pod):
    pass

def consumer_sys(history_sys):
    pass

# Manager dicts are proxy objects, so every process sees the same underlying data.
history_data_pod = Manager().dict()
history_sys = Manager().dict()
t1 = Process(target=consumer_trace, args=(history_data_pod, history_sys,))
t2 = Process(target=consumer_pod, args=(history_data_pod,))
t3 = Process(target=consumer_sys, args=(history_sys,))
T = [t1, t2, t3]
for t in T:
    t.start()
for t in T:
    t.join()
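A small self-contained example (names are illustrative, not from the original) showing that writes made by child processes through a Manager dict are visible in the parent after join.
from multiprocessing import Manager, Process

def fill(shared, key, n):
    # Each worker writes its own key into the shared dict.
    shared[key] = list(range(n))

if __name__ == "__main__":
    shared = Manager().dict()
    workers = [Process(target=fill, args=(shared, f"w{i}", 3)) for i in range(2)]
    for w in workers:
        w.start()
    for w in workers:
        w.join()
    print(dict(shared))  # {'w0': [0, 1, 2], 'w1': [0, 1, 2]}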
Visualizing anomalies in a time series
import numpy as np
import pandas as pd
import gc
import os
import time
import json
import matplotlib.pyplot as plt

def three_sigma(df_col):
    # Flag points more than 3 standard deviations from the mean (3-sigma rule).
    rule = (df_col.mean() - 3 * df_col.std() >= df_col) | (df_col.mean() + 3 * df_col.std() <= df_col)
    index = np.arange(df_col.shape[0])[rule]
    outrange = df_col.iloc[index]
    return outrange
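A quick check of three_sigma on synthetic data (illustrative only): a noisy series with two injected spikes should come back with exactly those two points flagged.
s = pd.Series(np.random.normal(0, 1, 500))
s.iloc[100] = 15.0   # injected outliers
s.iloc[400] = -12.0
print(three_sigma(s))  # expected to contain positions 100 and 400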
def search_metric(metric, metric_name):
    # metric: DataFrame with 'time' (epoch ms), 'data' (values) and 'tags' (dict) columns.
    metric = metric.sort_values(by=['time'], ascending=True).reset_index(drop=True)
    # Flatten the 'tags' dicts into individual columns.
    tags_list = metric['tags'].apply(lambda x: pd.Series(x).to_json()).tolist()
    tags = pd.json_normalize([json.loads(i) for i in tags_list])
    for tag in tags.columns:
        metric[tag] = tags[tag]
    metric.drop('tags', axis=1, inplace=True)
    del tags, tags_list
    gc.collect()
    metric['data'] = metric['data'].astype(float).astype(int)
    metric['time'] = metric['time'].astype(int)
    metric['time'] = metric['time'].apply(
        lambda x: time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(x / 1000))
    )
    time_min = metric['time'].min().split(' ')[1]
    time_max = metric['time'].max().split(' ')[1]
    time_min_max = time_min + '~' + time_max
    metric.index = pd.to_datetime(metric['time'])
    plot_data = metric['data']
    # Mark 3-sigma outliers on top of the raw series and save the figure.
    index_ = three_sigma(plot_data)
    plt.figure(figsize=(12, 2))
    plt.plot(plot_data, label=metric_name)
    plt.scatter(index_.index, index_.values, color='red')
    plt.legend()
    if not os.path.exists('graph_save/' + time_min_max):
        os.makedirs('graph_save/' + time_min_max)
    plt.savefig('graph_save/' + time_min_max + '/' + metric_name + '.png')
    plt.close()
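An illustrative call (the schema is assumed from the function body: 'time' in epoch ms, numeric 'data', a 'tags' dict per row), building a synthetic metric with one spike and writing the plot under graph_save/.
base = 1678173258000
metric = pd.DataFrame({
    'time': [base + i * 60000 for i in range(60)],  # one point per minute
    'data': [100.0] * 60,
    'tags': [{'pod': 'demo-pod'}] * 60,
})
metric.loc[30, 'data'] = 500.0  # injected anomaly, expected to be marked in red
search_metric(metric, 'demo_metric')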