# ADA

# 38 views · 1-minute read
import copy
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import math
import time
import warnings

warnings.filterwarnings("ignore")


class MetricDetect(object):
    """Detect abrupt changes in metric time series and score how abnormal they are.

    The methods operate on pandas DataFrames that carry a ``timestamp`` and a
    ``value`` column; ``multi_run`` additionally groups by ``cmdb_id`` and
    ``kpi_name``.
    """

    def __init__(self, change_scale=2, change_value=0.07, miss_rate=0.3, change_weight=0.1,
                 abn_threshold=0.9, score_threshold=0.9):
        """Store the detection parameters.

        Args:
            change_scale: Number of standard deviations around the median of the
                first differences that a difference must reach to count as a
                change point (used by ``run`` when the frame has no
                ``change_scale`` column).
            change_value: Minimum absolute change; smaller changes get an
                abnormality of 0.0 in ``changeDegree``.
            miss_rate: Stored as ``miss_rate_threshold``; not read by any method
                of this class.
            change_weight: Weight of the error-likelihood term in the final
                abnormality; the correlation term gets ``1 - change_weight``.
            abn_threshold: Minimum abnormality for ``multi_run`` to report a group.
            score_threshold: Stored; not read by any method of this class.
        """
        self.change_scale = change_scale
        self.change_value = change_value
        self.change_weight = change_weight
        self.abn_threshold = abn_threshold
        self.score_threshold = score_threshold
        self.miss_rate_threshold = miss_rate

    def calCorr(self, array1, array2):
        """Return the absolute Pearson correlation of two equal-length sequences.

        Returns 0.0 when either sequence has zero standard deviation, because the
        coefficient is undefined in that case.
        """
        data_matrix = pd.DataFrame({'array1': array1, 'array2': array2})
        if data_matrix['array1'].std() == 0 or data_matrix['array2'].std() == 0:
            return 0.0
        # Look the coefficient up by label: positional ``series[1]`` on a
        # label-indexed Series is deprecated in pandas 2.x.
        return abs(data_matrix.corr(method='pearson').loc['array1', 'array2'])

    def missCount(self, v):
        """Return the fraction of missing samples in ``v``.

        The expected sample count is ``last_timestamp - first_timestamp + 1``,
        i.e. one row per unit timestamp step. Frames with fewer than two rows
        yield 0.
        """
        if len(v) < 2:
            return 0
        v = v.sort_values(by=['timestamp'])
        last_timestamp = v['timestamp'].iloc[-1]
        first_timestamp = v['timestamp'].iloc[0]
        data_len = last_timestamp - first_timestamp + 1
        return (data_len - len(v)) / data_len

    def changeTime(self, df, change_scale):
        """Return the timestamp of the first outlying jump in ``value``, or 0.

        A row is an outlier when its first difference lies at or beyond
        ``median ± change_scale * std`` of all first differences. The result is 0
        when no such row exists, when the first such row has a zero difference,
        or when that row is more than 5 timestamp units before the last row.

        Args:
            df: Frame with ``timestamp`` and ``value`` columns. Rows containing
                any NaN are ignored. The caller's frame is not modified.
            change_scale: Width of the outlier band in standard deviations.
        """
        df = df.dropna(axis=0, how='any')
        df = df.sort_values(by=['timestamp'], ascending=True)
        if df.empty:
            return 0
        last_timestamp = df['timestamp'].iloc[-1]
        # The first row has no predecessor; treat its difference as 0.
        df['diff'] = df['value'].diff().fillna(0)
        diff_median = df['diff'].median()
        diff_sigma = df['diff'].std()
        diff_upper = diff_median + change_scale * diff_sigma
        diff_lower = diff_median - change_scale * diff_sigma
        diff_df = df[(df['diff'] >= diff_upper) | (df['diff'] <= diff_lower)]
        if diff_df.empty:
            return 0
        first_outlier = diff_df.iloc[0]
        # With a degenerate band (e.g. a constant series, std == 0) every row
        # qualifies; a zero difference is not a change.
        if first_outlier['diff'] == 0:
            return 0
        Tc = first_outlier['timestamp']
        # Only changes close to the end of the window are reported.
        if abs(int(last_timestamp) - int(Tc)) > 5:
            return 0
        return Tc

    def changeDegree(self, df, tc):
        """Score the change at ``tc`` and return ``(abnormality, change)``.

        Rows with ``timestamp < tc`` form the normal segment, the remaining rows
        the anomalous segment. ``change`` is the mean of the anomalous values
        minus the median of the normal values. ``abnormality`` is 0.0 when
        ``|change| < self.change_value``; otherwise it is a weighted sum of
        an error-likelihood term and the best absolute correlation of the values
        with a step or a single-spike indicator at ``tc``.

        Both results are 0.0 when ``df`` is empty or either segment is empty.
        """
        if df.empty:
            return 0.0, 0.0

        # The indicator arrays below are laid out in timestamp order, so the
        # values must be too. A stable sort leaves already-sorted input untouched.
        df = df.sort_values(by=['timestamp'], kind='mergesort')
        nor_df = df[df['timestamp'] < tc]
        ano_df = df[df['timestamp'] >= tc]
        if nor_df.empty or ano_df.empty:
            return 0.0, 0.0

        Xi = np.array(nor_df['value'])
        Xj = np.array(ano_df['value'])
        median = np.median(Xi)
        tc_value = Xj.mean()
        change = tc_value - median
        if abs(change) < self.change_value:
            # Tolerance: changes this small are not considered abnormal.
            return 0.0, change

        values = np.array(df['value'])
        # Step indicator: 0 before tc, 1 from tc onwards.
        step = np.zeros(len(values))
        step[len(Xi):] = 1.0
        # Spike indicator: 1 only at the first anomalous sample.
        spike = np.zeros(len(values))
        spike[len(Xi)] = 1.0
        cor = max(self.calCorr(values, step), self.calCorr(values, spike))

        mad_median = np.median(abs(Xi - median))
        # NOTE(review): a robust z-score would subtract ``median`` here rather
        # than ``mad_median``; kept as in the original — confirm the intent.
        if mad_median != 0:
            abn = (tc_value - mad_median) / mad_median
        else:
            # Avoid dividing by zero when the normal segment has no spread.
            abn = (tc_value - mad_median) / (mad_median + 0.1)
        # 1 - erfc(x / sqrt(2)) / 2 is the standard normal CDF evaluated at x.
        error_likelihood = 1 - 0.5 * math.erfc(abs(abn) / 1.4142135623730951)
        abnormality = self.change_weight * error_likelihood + (1 - self.change_weight) * cor
        return abnormality, change

    def run(self, df):
        """Detect and score a change in a single series.

        If ``df`` has a ``change_scale`` column, its first value overrides
        ``self.change_scale``.

        Returns:
            ``(tc, change, abnormality)``; all zero when ``df`` is empty or no
            change time is found.
        """
        if df.empty:
            return 0, 0.0, 0.0
        if 'change_scale' in df.columns:
            scale = df['change_scale'].iloc[0]
        else:
            scale = self.change_scale
        tc = self.changeTime(df, scale)
        if tc == 0:
            return tc, 0.0, 0.0
        abnormality, change = self.changeDegree(df, tc)
        return tc, change, abnormality

    def multi_run(self, df):
        """Run detection for every ``(cmdb_id, kpi_name)`` group in ``df``.

        Returns:
            A DataFrame with columns ``cmdb_id``, ``kpi_name``, ``change``,
            ``tc`` (local time formatted as ``%Y-%m-%d %H:%M:%S``, or 0 when no
            change time was found) and ``abnormality``, holding one row per group
            whose abnormality reaches ``self.abn_threshold``. An empty input
            yields an empty DataFrame without columns.
        """
        if df.empty:
            return pd.DataFrame()

        columns = ['cmdb_id', 'kpi_name', 'change', 'tc', 'abnormality']
        rows = []
        for k, v in df.groupby(['cmdb_id', 'kpi_name']):
            tc, change, abnormality = self.run(v)
            if abnormality < self.abn_threshold:
                continue
            if tc != 0:
                tc_text = str(time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(int(tc))))
            else:
                tc_text = 0
            rows.append({
                'cmdb_id': str(k[0]),
                'kpi_name': str(k[1]),
                'change': change,
                'tc': tc_text,
                'abnormality': abnormality,
            })
        return pd.DataFrame(rows, columns=columns)


if __name__ == '__main__':
    # Demo driver: slide a 600-unit history window over a metric CSV, collect the
    # windows whose abnormality reaches the detector threshold, and plot the
    # series with the detected change times highlighted.
    import sys

    # The CSV path may be given as the first command-line argument; without one
    # the original hard-coded location is used.
    default_path = "/Users/zyl/Desktop/Python/PythonFile/Web_coding/plotly/metric.csv"
    path = sys.argv[1] if len(sys.argv) > 1 else default_path

    df = pd.read_csv(path)
    df = df.sort_values(by=['timestamp'], ascending=True)
    df = df.dropna(how='any')
    alg = MetricDetect()
    detect_time = df['timestamp'].unique()
    # Guard against an empty file: there is no first timestamp to read then.
    last_detect_timestamp = int(detect_time[0]) if len(detect_time) else 0
    output = []
    for dt in detect_time:
        dtimestamp = int(dt)
        # Skip timestamps that do not advance past the last evaluated one
        # (this includes the very first timestamp, which has no history).
        if dtimestamp - last_detect_timestamp <= 0:
            continue
        history_df = df[(df['timestamp'] >= dtimestamp - 600) & (df['timestamp'] < dtimestamp)]

        tc, change, abnormality = alg.run(history_df)
        # Use the detector's own threshold instead of repeating the literal.
        if abnormality >= alg.abn_threshold:
            TC = str(time.strftime(
                "%Y-%m-%d %H:%M:%S", time.localtime(int(tc))
            )) if tc != 0 else 0
            output.append([TC, change, abnormality])

        last_detect_timestamp = dtimestamp

    print("异常信息: ---------", output)
    # Plot the series and mark the detected anomalies.
    if len(output) > 0:
        df['time'] = df['timestamp'].apply(lambda x: time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(x)))
        df.index = pd.to_datetime(df['time'])
        plot_df = df['value']
        # Convert the formatted change times back to datetimes so the markers
        # share the datetime x-axis of the line plot, and select the points by
        # index membership so repeated timestamps (in the data or in the
        # results) cannot produce mismatched x/y lengths.
        anomaly_times = pd.to_datetime([c[0] for c in output])
        anomaly_points = plot_df[plot_df.index.isin(anomaly_times)]
        plt.figure(figsize=(12, 3))
        plt.plot(plot_df, label='ooooo')
        plt.scatter(anomaly_points.index, anomaly_points.values, color='r', s=14)
        plt.legend()
        plt.show()
    else:
        print("未检测出异常 程序结束!")