使用pyspark计算一次计算模型多个指标ks

66 阅读 · 约2分钟

使用pyspark计算一次计算模型多个指标ks

  1. 前面以pandas和hive sql方式进行了一次性计算模型多个标签的ks指标的演示,本文将以pyspark的方式计算
  2. 有了前面的铺垫,本文会简单很多,直接上代码

计算原理

  1. 以pandas_udf调用sklearn计算

导入数据

  1. 前文使用sql计算一次计算模型多个指标ks已将数据导入到了t1.demo_data,本文使用此数据计算

计算单个ks的pandas udf

from sklearn import metrics
from pyspark.sql import SparkSession, functions as F
import pandas as pd

# Hive-enabled Spark session; the job appears as 'calc_ks' in the Spark UI.
spark = SparkSession.builder.appName('calc_ks').enableHiveSupport().getOrCreate()


@F.pandas_udf('double', F.PandasUDFType.GROUPED_AGG)
def calc_ks(y_true, y_pred):
    """Grouped-aggregate pandas UDF computing the KS statistic for one group.

    Aligns label and score by position, drops every row where either value
    is missing, and returns max|TPR - FPR| over the ROC curve.

    Args:
        y_true: pd.Series of binary labels; may contain nulls.
        y_pred: pd.Series of model scores; may contain nulls.

    Returns:
        The KS statistic as a float, or None when no complete
        (label, score) pair remains after dropping nulls.
    """
    # Positional alignment: pair each label with its score, then drop any
    # pair with a missing value on either side in a single pass.
    pairs = pd.DataFrame({
        'y_true': pd.Series(y_true).reset_index(drop=True),
        'y_pred': pd.Series(y_pred).reset_index(drop=True),
    }).dropna()
    if pairs.empty:
        return None
    # roc_curve returns (fpr, tpr, thresholds) in that order; the original
    # unpacked them as `tpr, fpr` — harmless for the symmetric KS value,
    # but misleading to readers.
    fpr, tpr, _ = metrics.roc_curve(pairs['y_true'], pairs['y_pred'])
    return float(max(abs(tpr - fpr)))

注册udf和进行计算

# Expose the pandas UDF to Spark SQL so it can be called inside F.expr strings.
spark.udf.register("calc_ks", calc_ks)


def get_data():
    """Load the demo scoring data from Hive and cache it.

    Returns:
        A persisted Spark DataFrame with all columns of t1.demo_data.
    """
    # Plain string: the original used an f-string with no placeholders.
    sql = """
            select *
            from t1.demo_data

    """
    # persist(): the frame is scanned once per label column in run().
    df = spark.sql(sql).persist()
    return df


def run():
    """Compute per-month KS for each label column (y1..y3) against `pred`."""
    df = get_data()
    df_ks = (
        df.groupby('month')
        .agg(
            F.expr('calc_ks(y1,pred) as y1_ks'),
            F.expr('calc_ks(y2,pred) as y2_ks'),
            F.expr('calc_ks(y3,pred) as y3_ks'),
        )
    )
    # show() prints the table itself and returns None, so the original
    # `print(df_ks.show())` emitted a stray "None" line after the table.
    df_ks.show()



完整代码

from sklearn import metrics
from pyspark.sql import SparkSession, functions as F
import pandas as pd

# Hive-enabled Spark session; the job appears as 'calc_ks' in the Spark UI.
spark = SparkSession.builder.appName('calc_ks').enableHiveSupport().getOrCreate()


@F.pandas_udf('double', F.PandasUDFType.GROUPED_AGG)
def calc_ks(y_true, y_pred):
    """Grouped-aggregate pandas UDF computing the KS statistic for one group.

    Aligns label and score by position, drops every row where either value
    is missing, and returns max|TPR - FPR| over the ROC curve.

    Args:
        y_true: pd.Series of binary labels; may contain nulls.
        y_pred: pd.Series of model scores; may contain nulls.

    Returns:
        The KS statistic as a float, or None when no complete
        (label, score) pair remains after dropping nulls.
    """
    # Positional alignment: pair each label with its score, then drop any
    # pair with a missing value on either side in a single pass.
    pairs = pd.DataFrame({
        'y_true': pd.Series(y_true).reset_index(drop=True),
        'y_pred': pd.Series(y_pred).reset_index(drop=True),
    }).dropna()
    if pairs.empty:
        return None
    # roc_curve returns (fpr, tpr, thresholds) in that order; the original
    # unpacked them as `tpr, fpr` — harmless for the symmetric KS value,
    # but misleading to readers.
    fpr, tpr, _ = metrics.roc_curve(pairs['y_true'], pairs['y_pred'])
    return float(max(abs(tpr - fpr)))


# Expose the pandas UDF to Spark SQL so it can be called inside F.expr strings.
spark.udf.register("calc_ks", calc_ks)


def get_data():
    """Load the demo scoring data from Hive and cache it.

    Returns:
        A persisted Spark DataFrame with all columns of t1.demo_data.
    """
    # Plain string: the original used an f-string with no placeholders.
    sql = """
            select *
            from t1.demo_data

    """
    # persist(): the frame is scanned once per label column in run().
    df = spark.sql(sql).persist()
    return df


def run():
    """Compute per-month KS for each label column (y1..y3) against `pred`."""
    df = get_data()
    df_ks = (
        df.groupby('month')
        .agg(
            F.expr('calc_ks(y1,pred) as y1_ks'),
            F.expr('calc_ks(y2,pred) as y2_ks'),
            F.expr('calc_ks(y3,pred) as y3_ks'),
        )
    )
    # show() prints the table itself and returns None, so the original
    # `print(df_ks.show())` emitted a stray "None" line after the table.
    df_ks.show()


# Script entry point: run the per-month KS computation when executed directly.
if __name__ == '__main__':
    run()
# 输出如下:
# +-----+-------------------+-------------------+-------------------+
# |month|              y1_ks|              y2_ks|              y3_ks|
# +-----+-------------------+-------------------+-------------------+
# |    1|  0.206674338319908|  0.270108695652174|0.10144927536231885|
# |    2|0.13867488443759624|0.08841807909604515|0.13571676501214852|
# |    3| 0.1394064872325742|0.12063492063492065|0.12549019607843137|
# |    4|0.10476190476190472|0.19269269269269262|0.14832309872461025|
# |    5|0.12083333333333333|0.09269128283212791| 0.0734295415959253|
# +-----+-------------------+-------------------+-------------------+

总结

  1. 以上即是使用pyspark计算多个标签的ks方式,此模型指标计算系列到此完结