Computing KS for multiple model labels in one pass with PySpark
- Earlier posts demonstrated computing the KS metric for multiple model labels in a single pass with pandas and with Hive SQL; this post does the same with PySpark.
- With that groundwork in place, this post is much simpler, so we go straight to the code.
How the computation works
- Use a pandas_udf that calls sklearn to compute the KS inside each group (a minimal local sketch of the statistic itself follows).
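Before wiring this into Spark, it may help to see the KS statistic in isolation. The snippet below is a minimal local sketch with made-up toy labels and scores (not from the article's data): KS is the maximum gap between the true-positive rate and the false-positive rate along the ROC curve.

```python
# Minimal local sketch of KS with toy data (values are illustrative only)
from sklearn import metrics

y_true = [0, 0, 1, 1, 0, 1]
y_pred = [0.1, 0.4, 0.35, 0.8, 0.2, 0.7]

fpr, tpr, _ = metrics.roc_curve(y_true, y_pred)
ks = max(abs(tpr - fpr))  # KS = max |TPR - FPR|
print(ks)
```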
Importing the data
- The earlier SQL post on computing KS for multiple labels in one pass already loaded the data into t1.demo_data; this post reuses that table (see the sanity-check sketch below).
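As a quick sanity check, the table is assumed (based on the earlier posts) to contain a month column, the labels y1, y2, y3 (binary, possibly null), and the model score pred. Once the SparkSession shown below is created, a peek might look like this:

```python
# Assumed schema: month, y1, y2, y3 (binary labels, may be null), pred (model score)
spark.sql("select * from t1.demo_data limit 5").show()
```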
pandas UDF that computes a single KS
```python
from sklearn import metrics
from pyspark.sql import SparkSession, functions as F
import pandas as pd

spark = SparkSession.builder.appName('calc_ks').enableHiveSupport().getOrCreate()

@F.pandas_udf('double', F.PandasUDFType.GROUPED_AGG)
def calc_ks(y_true, y_pred):
    # align the two series and keep only rows where both label and score are non-null
    y_true = y_true.reset_index(drop=True)
    idx = y_true[y_true.notna()].index
    y_pred = pd.Series(y_pred).reset_index(drop=True)
    idx2 = y_pred[y_pred.notna()].index
    idx_uni = list(set(idx) & set(idx2))
    y_true = y_true[idx_uni]
    y_pred = y_pred[idx_uni]
    if len(y_true) == 0:
        return None
    # roc_curve returns (fpr, tpr, thresholds); KS is the maximum gap between TPR and FPR
    fpr, tpr, _ = metrics.roc_curve(y_true, y_pred)
    ks = max(abs(tpr - fpr))
    return ks
```
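One caveat: `F.PandasUDFType.GROUPED_AGG` is the Spark 2.x style and is deprecated on Spark 3.x, where the same grouped-aggregate behaviour is inferred from Python type hints. A sketch of the equivalent declaration (same body as above) would be:

```python
# Spark 3.x style: Series-to-scalar type hints imply a grouped-aggregate pandas UDF
@F.pandas_udf('double')
def calc_ks(y_true: pd.Series, y_pred: pd.Series) -> float:
    ...  # same body as above
```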
Registering the UDF and running the computation
```python
spark.udf.register("calc_ks", calc_ks)

def get_data():
    sql = """
    select *
    from t1.demo_data
    """
    df = spark.sql(sql).persist()
    return df

def run():
    df = get_data()
    # one pass over the data: each label/score pair gets its own KS per month
    df_ks = (df
        .groupby('month')
        .agg(
            F.expr('calc_ks(y1, pred) as y1_ks'),
            F.expr('calc_ks(y2, pred) as y2_ks'),
            F.expr('calc_ks(y3, pred) as y3_ks'),
        )
    )
    df_ks.show()
```
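The SQL registration is only needed when the UDF is called from SQL strings via `F.expr`; the same aggregation can also be written directly against the DataFrame API. A sketch of that variant:

```python
# Same aggregation without SQL registration: call the pandas UDF directly
df_ks = (df
    .groupby('month')
    .agg(
        calc_ks('y1', 'pred').alias('y1_ks'),
        calc_ks('y2', 'pred').alias('y2_ks'),
        calc_ks('y3', 'pred').alias('y3_ks'),
    )
)
df_ks.show()
```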
Full code
```python
from sklearn import metrics
from pyspark.sql import SparkSession, functions as F
import pandas as pd

spark = SparkSession.builder.appName('calc_ks').enableHiveSupport().getOrCreate()

@F.pandas_udf('double', F.PandasUDFType.GROUPED_AGG)
def calc_ks(y_true, y_pred):
    # align the two series and keep only rows where both label and score are non-null
    y_true = y_true.reset_index(drop=True)
    idx = y_true[y_true.notna()].index
    y_pred = pd.Series(y_pred).reset_index(drop=True)
    idx2 = y_pred[y_pred.notna()].index
    idx_uni = list(set(idx) & set(idx2))
    y_true = y_true[idx_uni]
    y_pred = y_pred[idx_uni]
    if len(y_true) == 0:
        return None
    # roc_curve returns (fpr, tpr, thresholds); KS is the maximum gap between TPR and FPR
    fpr, tpr, _ = metrics.roc_curve(y_true, y_pred)
    ks = max(abs(tpr - fpr))
    return ks

spark.udf.register("calc_ks", calc_ks)

def get_data():
    sql = """
    select *
    from t1.demo_data
    """
    df = spark.sql(sql).persist()
    return df

def run():
    df = get_data()
    # one pass over the data: each label/score pair gets its own KS per month
    df_ks = (df
        .groupby('month')
        .agg(
            F.expr('calc_ks(y1, pred) as y1_ks'),
            F.expr('calc_ks(y2, pred) as y2_ks'),
            F.expr('calc_ks(y3, pred) as y3_ks'),
        )
    )
    df_ks.show()

if __name__ == '__main__':
    run()
```
Summary
- That is how to compute the KS for multiple labels with PySpark; this wraps up the series on computing model metrics.