The following is a complete Python example of distributed XGBoost model training with PySpark:
from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler, StringIndexer
from pyspark.ml import Pipeline
from xgboost.spark import SparkXGBClassifier, SparkXGBRegressor
from pyspark.ml.evaluation import BinaryClassificationEvaluator
# 1. Create the SparkSession
# Use "local[*]" instead of "yarn" to run in local mode.
# Note: the Python xgboost.spark module (XGBoost >= 1.7) is pure Python, so
# no xgboost4j-spark JAR is required; the xgboost package just has to be
# installed on every executor node.
spark = SparkSession.builder \
    .appName("XGBoostSparkExample") \
    .master("yarn") \
    .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer") \
    .getOrCreate()
# 2. Load the data (CSV example)
data = spark.read \
.format("csv") \
.option("header", "true") \
.option("inferSchema", "true") \
.load("hdfs:///path/to/your/data.csv")
# 3. Data preprocessing
# Assume the last column is the label and the rest are features
feature_cols = data.columns[:-1]
label_col = data.columns[-1]
# Assemble the feature columns into a single vector
assembler = VectorAssembler(
inputCols=feature_cols,
outputCol="features"
)
# Apply the transformation
assembled_data = assembler.transform(data).select("features", label_col)
# Split into training and test sets
train_data, test_data = assembled_data.randomSplit([0.8, 0.2], seed=42)
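# Optional: cache the splits so the CSV scan and vector assembly are not
# recomputed on every pass over the data (helps when it fits in cluster memory)
train_data.cache()
test_data.cache()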
# 4. Configure the XGBoost parameters
# Note: SparkXGBClassifier derives the objective (binary:logistic or
# multi:softprob) from the label itself and rejects an explicit "objective".
xgb_params = {
    "learning_rate": 0.1,
    "max_depth": 6,
    "num_workers": 4,        # keep in line with the number of Spark executors
    "eval_metric": "logloss",
    "n_estimators": 100      # sklearn-style name; "num_round" is not a valid key here
}
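# Optional sketch: derive num_workers from the Spark conf instead of
# hard-coding it (assumes spark.executor.instances is set on the cluster)
xgb_params["num_workers"] = int(
    spark.sparkContext.getConf().get("spark.executor.instances", "4")
)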
# 5. Create the XGBoost classifier
xgb_classifier = SparkXGBClassifier(
features_col="features",
label_col=label_col,
**xgb_params
)
# 6. Train the model
model = xgb_classifier.fit(train_data)
# 7. Predict on the test set
predictions = model.transform(test_data)
# 8. Evaluate the model
evaluator = BinaryClassificationEvaluator(
    labelCol=label_col,
    rawPredictionCol="rawPrediction",  # default raw output column of SparkXGBClassifier
    metricName="areaUnderROC"
)
auc = evaluator.evaluate(predictions)
print(f"Test AUC = {auc:.4f}")
# 9. Save the model
model.write().overwrite().save("hdfs:///path/to/xgboost_model")
# 10. Stop the Spark session
spark.stop()
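To score new data in a later job, the saved model can be loaded back with the matching model class (a minimal sketch reusing the placeholder path from above):
from xgboost.spark import SparkXGBClassifierModel

# Load the persisted model and apply it to any DataFrame that has a
# "features" vector column
loaded_model = SparkXGBClassifierModel.load("hdfs:///path/to/xgboost_model")
new_predictions = loaded_model.transform(test_data)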
Execution notes
- Dependencies:
# Python packages (xgboost >= 1.7 ships the xgboost.spark module)
pip install "xgboost>=1.7" pyspark==3.3.1  # keep PySpark in line with the cluster version
# The Python API needs no extra JARs; just make sure the xgboost package is
# importable on every executor (installed per node, or shipped with the job).
- Spark submit command:
spark-submit \
--master yarn \
--deploy-mode cluster \
--num-executors 4 \
--executor-memory 8G \
--driver-memory 4G \
--conf spark.executor.cores=2 \
--conf spark.task.cpus=1 \
--conf spark.sql.execution.arrow.pyspark.enabled=true \
your_script.py
Key configuration notes
- Suggested parameter tuning (at a lower learning rate with more boosting rounds, early stopping also helps; see the sketch after this list):
xgb_params = {
    "learning_rate": 0.05,    # a smaller learning rate needs more boosting rounds
    "max_depth": 8,           # deeper trees raise model capacity
    "subsample": 0.8,         # row subsampling to curb overfitting
    "colsample_bytree": 0.8,  # per-tree feature subsampling
    "reg_lambda": 1.0,        # L2 regularization (sklearn-style name for "lambda")
    "reg_alpha": 0.0,         # L1 regularization (sklearn-style name for "alpha")
    "scale_pos_weight": 10    # rebalance positive/negative classes
}
- Distributed training tuning:
# Repartition after loading; a common heuristic is
# partitions = number of executors * cores per executor * 2
data = data.repartition(64)
# Enable off-heap memory to reduce OOM risk (add these lines to the
# SparkSession builder from step 1):
.config("spark.memory.offHeap.enabled", "true") \
.config("spark.memory.offHeap.size", "2g") \
Full extended version (with feature engineering)
# Handle categorical features with StringIndexer
categorical_cols = ["category_feature1", "category_feature2"]
indexers = [
    StringIndexer(inputCol=col, outputCol=f"{col}_indexed", handleInvalid="keep")
    for col in categorical_cols
]
# Update the feature column list
feature_cols = [col for col in data.columns if col not in categorical_cols + [label_col]]
feature_cols += [f"{col}_indexed" for col in categorical_cols]
# Rebuild the assembler against the updated feature list
assembler = VectorAssembler(inputCols=feature_cols, outputCol="features")
# Build the full Pipeline
pipeline = Pipeline(stages=indexers + [assembler, xgb_classifier])
# Fit the pipeline on the *raw* data: it performs indexing and assembly
# itself, so it must not be fed the already-assembled train_data from above
raw_train, raw_test = data.randomSplit([0.8, 0.2], seed=42)
pipeline_model = pipeline.fit(raw_train)
# Save the full pipeline (preprocessing included)
pipeline_model.write().overwrite().save("hdfs:///path/to/full_pipeline")
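The saved pipeline can later be restored with Spark ML's generic PipelineModel loader, so raw (unprocessed) rows can be scored directly:
from pyspark.ml import PipelineModel

loaded_pipeline = PipelineModel.load("hdfs:///path/to/full_pipeline")
scored = loaded_pipeline.transform(raw_test)  # indexing + assembly + prediction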
Common issues
- Type conversion errors:
# Force-cast the feature columns to a numeric type
from pyspark.sql.functions import col
for c in feature_cols:
    data = data.withColumn(c, col(c).cast("float"))
- Handling missing values (an Imputer-based alternative is sketched after this list):
# Fill missing values with constants
data = data.fillna({
    "numeric_col": 0.0,
    "category_col": "missing"
})
- Speeding up training:
# Enable GPU acceleration (XGBoost >= 2.0: device="cuda" with the default
# "hist" tree method; the older "gpu_hist" value is deprecated, and on 1.7
# use use_gpu=True instead)
xgb_params.update({
    "tree_method": "hist",
    "device": "cuda"
})
# Request GPU resources from Spark (add to the SparkSession builder):
.config("spark.executor.resource.gpu.amount", "1") \
.config("spark.task.resource.gpu.amount", "0.1") \
For a regression task, simply swap in the regressor:
from xgboost.spark import SparkXGBRegressor
# Remember to drop classification-only keys from xgb_params first
# (e.g. scale_pos_weight, eval_metric="logloss"; use "rmse" instead)
xgb_regressor = SparkXGBRegressor(
    features_col="features",
    label_col=label_col,
    **xgb_params
)
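Evaluation for regression then uses RegressionEvaluator on the default "prediction" output column:
from pyspark.ml.evaluation import RegressionEvaluator

reg_model = xgb_regressor.fit(train_data)
reg_predictions = reg_model.transform(test_data)
rmse = RegressionEvaluator(
    labelCol=label_col,
    predictionCol="prediction",
    metricName="rmse"
).evaluate(reg_predictions)
print(f"Test RMSE = {rmse:.4f}")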