[机器学习+pyspark]xgboost(实战)

330 阅读2分钟

以下是一个使用PySpark进行分布式XGBoost模型训练的完整Python代码示例:


from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler, StringIndexer
from pyspark.ml import Pipeline
from xgboost.spark import SparkXGBClassifier, SparkXGBRegressor
from pyspark.ml.evaluation import BinaryClassificationEvaluator

# 1. Create the SparkSession.
# BUG FIX: the original placed "# ..." comments after a trailing "\", which is
# a SyntaxError in Python (nothing may follow a line-continuation backslash).
# A parenthesized builder chain sidesteps the problem and allows comments.
spark = (
    SparkSession.builder
    .appName("XGBoostSparkExample")
    .master("yarn")  # use "local[*]" for local testing
    .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
    # JVM-side XGBoost dependency shipped to the executors.
    .config("spark.jars.packages", "ml.dmlc:xgboost4j-spark_2.12:1.7.6")
    .getOrCreate()
)

# 2. Load the data (CSV example).
data = (
    spark.read
    .format("csv")
    .option("header", "true")
    .option("inferSchema", "true")
    .load("hdfs:///path/to/your/data.csv")
)

# 3. Pre-processing.
# Assumes the last column is the label and all others are features —
# TODO confirm against the real schema.
feature_cols = data.columns[:-1]
label_col = data.columns[-1]

# Assemble the feature columns into a single vector column.
assembler = VectorAssembler(
    inputCols=feature_cols,
    outputCol="features",
)

assembled_data = assembler.transform(data).select("features", label_col)

# Train/test split with a fixed seed for reproducibility.
train_data, test_data = assembled_data.randomSplit([0.8, 0.2], seed=42)

# 4. XGBoost parameters.
xgb_params = {
    "learning_rate": 0.1,
    "max_depth": 6,
    "objective": "binary:logistic",
    "num_workers": 4,  # should match the number of Spark executors
    "eval_metric": "logloss",
    # BUG FIX: the estimator API spells this "n_estimators"; "num_round" is
    # the native-booster name and is not accepted by SparkXGBClassifier.
    "n_estimators": 100,
}

# 5. Build the distributed classifier.
xgb_classifier = SparkXGBClassifier(
    features_col="features",
    label_col=label_col,
    **xgb_params,
)

# 6. Train the model.
model = xgb_classifier.fit(train_data)

# 7. Score the held-out data.
predictions = model.transform(test_data)

# 8. Evaluate.
# BUG FIX: SparkXGBClassifier follows the Spark ML convention and emits a
# "rawPrediction" column; there is no column named "probabilities".
evaluator = BinaryClassificationEvaluator(
    labelCol=label_col,
    rawPredictionCol="rawPrediction",
    metricName="areaUnderROC",
)

auc = evaluator.evaluate(predictions)
print(f"Test AUC = {auc:.4f}")

# 9. Persist the fitted model.
model.write().overwrite().save("hdfs:///path/to/xgboost_model")

# 10. Shut down the session.
spark.stop()

执行说明

  1. 依赖准备

# Python packages to install
pip install xgboost pyspark==3.3.1  # keep the PySpark version matched to the cluster's Spark

# The submitted job must also ship the XGBoost Spark JAR to the JVM side
export PYSPARK_SUBMIT_ARGS="--jars xgboost4j-spark_2.12-1.7.6.jar pyspark-shell"
  2. Spark提交命令
# Submit to YARN in cluster mode. Executor/driver sizing below is a starting
# point — tune num-executors and memory to the cluster; Arrow speeds up
# Python<->JVM data transfer.
spark-submit \
  --master yarn \
  --deploy-mode cluster \
  --num-executors 4 \
  --executor-memory 8G \
  --driver-memory 4G \
  --conf spark.executor.cores=2 \
  --conf spark.task.cpus=1 \
  --conf spark.sql.execution.arrow.pyspark.enabled=true \
  your_script.py

关键配置说明

  1. 参数优化建议

# Suggested tuning: smaller step size, deeper trees, subsampling and
# regularisation to control overfitting on imbalanced data.
_regularisation = {
    "lambda": 1.0,  # L2 penalty
    "alpha": 0.0,   # L1 penalty
}
xgb_params = {
    "learning_rate": 0.05,    # smaller step size -> needs more boosting rounds
    "max_depth": 8,           # deeper trees -> more model capacity
    "subsample": 0.8,         # row subsampling guards against overfitting
    "colsample_bytree": 0.8,  # per-tree feature subsampling
    **_regularisation,
    "scale_pos_weight": 10,   # rebalances positive vs. negative classes
}
  2. 分布式训练优化
# Repartition right after loading so training parallelism matches the cluster.
# Recommended partition count = number of executors * cores per executor * 2.
data = data.repartition(64)

# Enable off-heap memory to avoid OOM. NOTE: these .config(...) lines are a
# fragment — they belong on the SparkSession builder chain shown earlier.
.config("spark.memory.offHeap.enabled", "true") \
.config("spark.memory.offHeap.size", "2g") \

完整功能扩展版(含特征工程)

# Categorical feature handling: index the string columns, then train the
# preprocessing and the model together as one Pipeline.
categorical_cols = ["category_feature1", "category_feature2"]
indexers = []
for cat_name in categorical_cols:
    indexers.append(
        StringIndexer(
            inputCol=cat_name,
            outputCol=f"{cat_name}_indexed",
            handleInvalid="keep",
        )
    )

# Rebuild the feature list: drop the raw categorical columns and the label,
# then append the indexed versions.
excluded = set(categorical_cols) | {label_col}
feature_cols = [name for name in data.columns if name not in excluded]
feature_cols.extend(f"{cat_name}_indexed" for cat_name in categorical_cols)

# Full pipeline: string indexers -> vector assembler -> XGBoost classifier.
stages = list(indexers)
stages.append(assembler)
stages.append(xgb_classifier)
pipeline = Pipeline(stages=stages)

# Fit preprocessing and model in one pass.
pipeline_model = pipeline.fit(train_data)

# Persist the whole pipeline (preprocessing included).
pipeline_model.write().overwrite().save("hdfs:///path/to/full_pipeline")

常见问题处理

  1. 类型转换错误
# Force every feature column to float so VectorAssembler does not choke on
# string-typed numeric columns.
from pyspark.sql.functions import col

for feature_name in feature_cols:
    data = data.withColumn(feature_name, col(feature_name).cast("float"))
  2. 处理缺失值
# Replace missing values before training: zeros for the numeric column,
# a sentinel category for the categorical one.
fill_values = {
    "numeric_col": 0.0,
    "category_col": "missing",
}
data = data.fillna(fill_values)
  3. 提升训练速度

# Enable GPU-accelerated tree construction.
xgb_params.update({
    "tree_method": "gpu_hist",
    "device": "cuda"
})

# Request GPU resources from Spark. NOTE: these .config(...) lines are a
# fragment — they belong on the SparkSession builder chain shown earlier.
.config("spark.executor.resource.gpu.amount", "1") \
.config("spark.task.resource.gpu.amount", "0.1") \

如果需要回归任务,只需替换分类器为:

# For regression tasks the constructor interface is identical — only the
# estimator class changes.
from xgboost.spark import SparkXGBRegressor

xgb_regressor = SparkXGBRegressor(
    label_col=label_col,
    features_col="features",
    **xgb_params,
)