Time-Series Code Roundup


Covers time series, Kafka, multiprocessing, and related topics.

1. Sending and receiving data with confluent-kafka

import confluent_kafka
from confluent_kafka import Consumer, KafkaError, Producer
import uuid
import json
from multiprocessing import Process
import time

def read_kafka(server_add, topic, offset, group=None, size=8000000000):
    # Consume up to `size` messages from `topic` and yield the raw message values.
    c = confluent_kafka.Consumer({
        'bootstrap.servers': server_add,
        'group.id': group if group is not None else str(uuid.uuid1()),
        'auto.offset.reset': offset
    })
    c.subscribe([topic])
    while size > 0:
        msg = c.poll(1.0)
        if msg is None:
            continue
        size -= 1
        if msg.error():
            print("Consumer error:", msg.error())
            continue
        yield msg.value()
    c.close()

def send_kafka(keys, val):
    conf = {
        'bootstrap.servers': '21.33.12.84:9092',
        # 'client.id':
    }
    producer = Producer(conf)
    topic = '12345678'
    key = keys
    value = val
    # value = json.dumps(value).encode("utf8")    # optionally serialize to JSON bytes first
    producer.produce(topic=topic, key=key, value=value)
    producer.flush()

def read_to_txt(server_add, topic, offset, save_name):
    # Dump every consumed message to a text file, flushing roughly once per second.
    fout = open(save_name, 'w')
    flush_time = time.time() + 1
    for msg in read_kafka(server_add, topic, offset):
        fout.write(msg.decode("utf8"))
        fout.write("\n")
        if time.time() >= flush_time:
            fout.flush()
            flush_time = time.time() + 1
    fout.flush()
    fout.close()

if __name__ == "__main__":
    
    process_list = []
    offset = 'latest'
    topic_save = [('21.33.200.97:9092', 'topic', 'save_data/df.txt')]
    for addr, topic, f in topic_save:
        p = Process(target=read_to_txt, args=(addr, topic, offset, f))
        process_list.append(p)
    for p in process_list:
        p.start()
    for p in process_list:
        p.join()
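
As a quick usage sketch (the broker address, topic, and the delivery_report callback are placeholders I added, not from the original post), produce() also accepts a delivery callback that confirms whether each message actually reached the broker:

import json
from confluent_kafka import Producer

def delivery_report(err, msg):
    # Invoked once per message from poll()/flush() with the delivery result.
    if err is not None:
        print("Delivery failed:", err)
    else:
        print("Delivered to %s [%d] @ %d" % (msg.topic(), msg.partition(), msg.offset()))

producer = Producer({'bootstrap.servers': '21.33.12.84:9092'})    # placeholder broker
payload = json.dumps({"time": 1678173258175, "data": 1.0}).encode("utf8")
producer.produce(topic='12345678', key='demo', value=payload, callback=delivery_report)
producer.flush()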

2. Time conversion

# Convert a millisecond timestamp to a formatted datetime string
import time
start_time = 1678173258175    # ms
to_time = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(start_time/1000))
print(to_time)
# 2023-03-07 15:14:18

# Convert a formatted datetime string back to a millisecond timestamp
end_time = "2023-03-07 15:14:18"
to_time = time.strptime(end_time, "%Y-%m-%d %H:%M:%S")
end = int(time.mktime(to_time) * 1000)
print(end)
# 1678173258000
# Generate date ranges
import numpy as np
import pandas as pd

# 1. Generate all dates in a given range
date = pd.date_range('4/1/2023', '5/1/2023', freq='D')
# 2. Specify only a start or an end date; frequency is daily
date = pd.date_range(start='4/1/2023', periods=5, freq='D')
date = pd.date_range(end='4/1/2023', periods=5, freq='D')
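
For whole columns of timestamps, the same conversions can be done vectorized with pandas. A small sketch (the sample values are illustrative; note that pd.to_datetime treats naive strings as UTC, unlike the local-time time.mktime above):

import pandas as pd

ms = pd.Series([1678173258175, 1678173318175])
dt = pd.to_datetime(ms, unit='ms')                    # ms timestamps -> datetime64
# datetime strings -> ms timestamps (nanoseconds since epoch // 10**6)
back = pd.to_datetime(["2023-03-07 15:14:18"]).astype('int64') // 10**6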

3. Keeping the last 5 minutes of Kafka time-series data in memory

import datetime
import json
import time
import uuid

import pandas as pd
from confluent_kafka import Consumer, KafkaError

# Rolling window of raw messages, keyed by message timestamp (seconds).
history_data_trace = {}

def save_history_data(T):
    group = None
    conf = {
        'bootstrap.servers': '21.33.200.97:9092',
        'group.id': group if group is not None else str(uuid.uuid1()),
        'auto.offset.reset': 'latest'
    }
    consumer = Consumer(conf)
    topic = "xxx"
    consumer.subscribe([topic])
    history_interval = datetime.timedelta(minutes=T)
    last_timestamp = int(time.time() * 1000)    # ms, same unit as the message 'time' field
    global history_data_trace
    while True:
        msg = consumer.poll(1.0)
        if msg is None:
            continue
        if msg.error():
            if msg.error().code() == KafkaError._PARTITION_EOF:
                print('Reached end of partition')
            else:
                print('running...')
        else:
            key = msg.key()
            value = msg.value()
            timestamp = msg.timestamp()[1] / 1000.0    # message timestamp in seconds
            history_data_trace[timestamp] = value
            # Drop everything older than the T-minute window.
            expire_timestamp = time.time() - history_interval.total_seconds()
            for ts in list(history_data_trace.keys()):
                if ts < expire_timestamp:
                    del history_data_trace[ts]
            try:
                history_trace = [json.loads(log) for log in history_data_trace.values()]
                if len(history_trace) <= 0:
                    continue
                df_timestamp = int(history_trace[-1]['time'])    # ms
                # Only start processing once 2 minutes (120000 ms) of data have accumulated.
                if (df_timestamp - last_timestamp) > 120000 and last_timestamp != 0:
                    global trace
                    trace = pd.json_normalize(history_trace)
                    ...
            except Exception:
                pass
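
The expiry logic can be exercised on its own, without a broker. A minimal sketch (prune_window and the sample entries are hypothetical, not part of the original code):

import time

def prune_window(window, minutes):
    # Drop entries whose key (a UNIX timestamp in seconds) is older than `minutes`.
    cutoff = time.time() - minutes * 60
    for ts in list(window.keys()):
        if ts < cutoff:
            del window[ts]
    return window

window = {time.time() - 400: b'old', time.time() - 10: b'recent'}
prune_window(window, 5)        # only the entry from 10 seconds ago survives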

4. Sharing global variables across multiple processes

from multiprocessing import Manager, Process

def consumer_trace(history_data_pod, history_sys):
    pass

def consumer_pod(history_data_pod):
    pass

def consumer_sys(history_sys):
    pass

if __name__ == "__main__":
    # Manager-backed dicts can be read and written from every child process.
    history_data_pod = Manager().dict()
    history_sys = Manager().dict()

    t1 = Process(target=consumer_trace, args=(history_data_pod, history_sys))
    t2 = Process(target=consumer_pod, args=(history_data_pod,))
    t3 = Process(target=consumer_sys, args=(history_sys,))

    T = [t1, t2, t3]
    for t in T:
        t.start()
    for t in T:
        t.join()
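
A self-contained demo of the same pattern, with a worker that actually writes to the shared dict (the writer function and key are illustrative, not from the original post):

from multiprocessing import Manager, Process

def writer(shared):
    shared['latest'] = 42          # visible in the parent after join()

if __name__ == "__main__":
    shared = Manager().dict()
    p = Process(target=writer, args=(shared,))
    p.start()
    p.join()
    print(dict(shared))            # {'latest': 42}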

5. Visualizing time-series anomaly detection

import gc
import json
import os
import time

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

def three_sigma(df_col):
    # Flag points more than 3 standard deviations away from the mean.
    rule = (df_col.mean()-3*df_col.std()>=df_col)|(df_col.mean()+3*df_col.std()<=df_col)
    index = np.arange(df_col.shape[0])[rule]
    outrange = df_col.iloc[index]
    return outrange

def search_metric(metric, metric_name):
    metric = metric.sort_values(by=['time'], ascending=True).reset_index(drop=True)
    # Expand the nested 'tags' dicts into flat columns.
    tags_list = metric['tags'].apply(lambda x: pd.Series(x).to_json()).tolist()
    tags = pd.json_normalize([json.loads(i) for i in tags_list])
    for tag in tags.columns:
        metric[tag] = tags[tag]
    metric.drop('tags', axis=1, inplace=True)
    del tags, tags_list
    gc.collect()
    metric['data'] = metric['data'].astype(float).astype(int)
    # Millisecond timestamps -> "YYYY-mm-dd HH:MM:SS" strings.
    metric['time'] = metric['time'].astype(int)
    metric['time'] = metric['time'].apply(
        lambda x: time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(x/1000))
    )
    time_min = metric['time'].min().split(' ')[1]
    time_max = metric['time'].max().split(' ')[1]
    time_min_max = time_min + '~' + time_max
    metric.index = pd.to_datetime(metric['time'])
    plot_data = metric['data']
    index_ = three_sigma(plot_data)
    # Plot the series and mark the 3-sigma outliers in red.
    plt.figure(figsize=(12, 2))
    plt.plot(plot_data, label=metric_name)
    plt.scatter(index_.index, index_.values, color='red')
    plt.legend()
    out_dir = 'graph_save/' + time_min_max
    os.makedirs(out_dir, exist_ok=True)
    plt.savefig(out_dir + '/' + metric_name + '.png')
    plt.close()
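
A hypothetical call with synthetic data, just to show the columns search_metric expects ('time' in ms, numeric 'data', and a dict-valued 'tags'); the values and metric name are made up:

import numpy as np
import pandas as pd

n = 300
df = pd.DataFrame({
    'time': 1678173258000 + np.arange(n) * 1000,    # 1-second spacing, in ms
    'data': np.random.normal(100, 5, n),
    'tags': [{'pod': 'pod-0', 'node': 'node-1'}] * n,
})
df.loc[150, 'data'] = 200                            # inject an obvious outlier
search_metric(df, 'cpu_usage')                       # writes graph_save/<range>/cpu_usage.png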