[Big Data] Python + Your Company's Spark Cluster (Hands-On)

This post provides detailed steps and Python code examples for connecting from a local machine to your company's Spark cluster and running machine learning training.

1. Prerequisites

1.1 Make sure the following are installed in your local environment:

pip install pyspark pandas scikit-learn
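
It also helps to confirm that your local PySpark version is compatible with the cluster's Spark version (see the notes at the end of this post). A quick check:

import pyspark

# The local PySpark version should match the cluster's Spark version
# (at minimum the same major.minor release).
print(pyspark.__version__)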

1.2 Gather the cluster configuration (ask your cluster administrator):

  • Spark master URL (e.g. spark://master-node:7077, or yarn for YARN clusters) (a quick reachability check is sketched after this list)
  • HDFS / storage system addresses
  • Relevant configuration parameters (executor memory, cores, etc.)
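
Before debugging Spark itself, it can help to confirm that the master is reachable from your machine. A minimal sketch using only the standard library (spark-master:7077 is a placeholder; use the host and port your administrator provides):

import socket

MASTER_HOST, MASTER_PORT = "spark-master", 7077  # placeholders; replace with your cluster's values

try:
    with socket.create_connection((MASTER_HOST, MASTER_PORT), timeout=5):
        print(f"{MASTER_HOST}:{MASTER_PORT} is reachable")
except OSError as e:
    print(f"Cannot reach {MASTER_HOST}:{MASTER_PORT}: {e}")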

2. Python Code Examples

2.1 Basic connection example

from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler, StandardScaler
from pyspark.ml.regression import LinearRegression
from pyspark.ml import Pipeline
from pyspark.ml.evaluation import RegressionEvaluator

def create_spark_session(app_name="ML-Training", master_url="spark://your-master:7077"):
    """
    Create a SparkSession connected to the cluster.
    """
    spark = SparkSession.builder \
        .appName(app_name) \
        .master(master_url) \
        .config("spark.driver.memory", "4g") \
        .config("spark.executor.memory", "8g") \
        .config("spark.executor.cores", "4") \
        .config("spark.executor.instances", "10") \
        .config("spark.driver.host", "your-local-ip") \
        .config("spark.driver.port", "7777") \
        .config("spark.network.timeout", "600s") \
        .config("spark.sql.shuffle.partitions", "200") \
        .config("spark.yarn.access.hadoopFileSystems", "hdfs://your-namenode:9000") \
        .getOrCreate()
    
    return spark

# Usage example
spark = create_spark_session(
    app_name="Customer-Churn-Prediction",
    master_url="spark://spark-master:7077"  # replace with your cluster's master address
)

print(f"Spark version: {spark.version}")
print(f"Application name: {spark.sparkContext.appName}")
print(f"Master URL: {spark.sparkContext.master}")

2.2 Reading data from the cluster and training a model

from pyspark.sql.functions import col, when
import pandas as pd

def train_model_on_cluster():
    """
    Read data from the cluster and train a model.
    """
    # 1. Read data from the cluster
    # From HDFS
    df = spark.read.parquet("hdfs://namenode:9000/data/mydata.parquet")

    # Or from S3
    # df = spark.read.csv("s3://your-bucket/data.csv", header=True)

    # Or from a Hive table on the cluster
    # df = spark.sql("SELECT * FROM my_database.my_table")

    print(f"Number of rows: {df.count()}")
    print(f"Number of columns: {len(df.columns)}")

    # 2. Data preprocessing
    # Select feature columns
    feature_cols = ['feature1', 'feature2', 'feature3', 'feature4']
    target_col = 'label'
    
    # Drop rows with missing values
    df_clean = df.select(feature_cols + [target_col]).na.drop()
    
    # 3. Assemble the feature columns into a vector
    assembler = VectorAssembler(
        inputCols=feature_cols,
        outputCol="features"
    )
    
    # 4. Standardize the features
    scaler = StandardScaler(
        inputCol="features",
        outputCol="scaledFeatures",
        withStd=True,
        withMean=True
    )
    
    # 5. Split into training and test sets
    train_data, test_data = df_clean.randomSplit([0.8, 0.2], seed=42)
    
    # 6. Build the ML model (logistic regression as an example)
    from pyspark.ml.classification import LogisticRegression
    
    lr = LogisticRegression(
        featuresCol="scaledFeatures",
        labelCol=target_col,
        maxIter=100,
        regParam=0.01,
        elasticNetParam=0.5
    )
    
    # 7. Build the Pipeline
    pipeline = Pipeline(stages=[assembler, scaler, lr])
    
    # 8. Train the model
    print("Training the model...")
    model = pipeline.fit(train_data)
    
    # 9. Predict and evaluate
    predictions = model.transform(test_data)
    
    from pyspark.ml.evaluation import BinaryClassificationEvaluator
    
    evaluator = BinaryClassificationEvaluator(
        labelCol=target_col,
        rawPredictionCol="rawPrediction",
        metricName="areaUnderROC"
    )
    
    auc = evaluator.evaluate(predictions)
    print(f"模型AUC: {auc:.4f}")
    
    # 10. Save the model to the cluster
    model_path = "hdfs://namenode:9000/models/my_model_v1"
    model.save(model_path)
    print(f"模型已保存到: {model_path}")
    
    return model, auc

if __name__ == "__main__":
    try:
        spark = create_spark_session()
        model, auc = train_model_on_cluster()
        
        # Optional: validate the model on a small local dataset
        # Create local test data
        local_test_data = pd.DataFrame({
            'feature1': [1.2, 2.3, 3.4],
            'feature2': [4.5, 5.6, 6.7],
            'feature3': [7.8, 8.9, 9.0],
            'feature4': [10.1, 11.2, 12.3],
            'label': [0, 1, 0]
        })
        
        spark_test_data = spark.createDataFrame(local_test_data)
        predictions = model.transform(spark_test_data)
        predictions.select("features", "prediction", "probability").show(truncate=False)
        
    except Exception as e:
        print(f"执行出错: {str(e)}")
        import traceback
        traceback.print_exc()
    finally:
        if 'spark' in locals():
            spark.stop()
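
Once saved, the PipelineModel can be loaded back later, for example in a separate scoring job. A minimal sketch, reusing the model path from train_model_on_cluster() and a hypothetical DataFrame new_df that contains the feature1..feature4 columns:

from pyspark.ml import PipelineModel

# Load the fitted pipeline (assembler + scaler + logistic regression) from HDFS.
loaded_model = PipelineModel.load("hdfs://namenode:9000/models/my_model_v1")

# new_df is a hypothetical DataFrame with the same feature columns used for training.
predictions = loaded_model.transform(new_df)
predictions.select("prediction", "probability").show(5, truncate=False)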

2.3 Advanced configuration example (with authentication and tuning)

def create_spark_session_with_auth():
    """
    Create a SparkSession with authentication and advanced configuration.
    The builder chain is wrapped in parentheses (instead of backslash
    continuations) so that comments can be placed between the .config() calls.
    """
    spark = (
        SparkSession.builder
        .appName("Secure-ML-Training")
        .master("yarn")
        .config("spark.submit.deployMode", "client")
        .config("spark.driver.memory", "4g")
        .config("spark.executor.memory", "8g")
        .config("spark.executor.cores", "4")
        .config("spark.executor.instances", "20")
        .config("spark.dynamicAllocation.enabled", "true")
        .config("spark.dynamicAllocation.maxExecutors", "50")
        .config("spark.dynamicAllocation.minExecutors", "5")
        .config("spark.sql.adaptive.enabled", "true")
        .config("spark.sql.adaptive.coalescePartitions.enabled", "true")
        .config("spark.hadoop.fs.defaultFS", "hdfs://namenode:9000")
        # Kerberos authentication (if required)
        .config("spark.yarn.keytab", "/path/to/keytab")
        .config("spark.yarn.principal", "user@REALM")
        # S3 access configuration (if using S3)
        .config("spark.hadoop.fs.s3a.access.key", "your-access-key")
        .config("spark.hadoop.fs.s3a.secret.key", "your-secret-key")
        .config("spark.hadoop.fs.s3a.endpoint", "s3.amazonaws.com")
        .getOrCreate()
    )

    return spark
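
Hardcoding S3 keys in source files is easy to leak. A variant sketch that reads the credentials from environment variables instead, assuming AWS_ACCESS_KEY_ID and AWS_SECRET_ACCESS_KEY are exported in your shell:

import os
from pyspark.sql import SparkSession

def create_spark_session_with_env_auth():
    """Like create_spark_session_with_auth(), but reads S3 credentials from the environment."""
    return (
        SparkSession.builder
        .appName("Secure-ML-Training")
        .master("yarn")
        # Credentials come from environment variables, not source control.
        .config("spark.hadoop.fs.s3a.access.key", os.environ["AWS_ACCESS_KEY_ID"])
        .config("spark.hadoop.fs.s3a.secret.key", os.environ["AWS_SECRET_ACCESS_KEY"])
        .config("spark.hadoop.fs.s3a.endpoint", "s3.amazonaws.com")
        .getOrCreate()
    )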

2.4 Utility functions

def check_cluster_status(spark):
    """
    Check the cluster status.
    """
    # Spark UI address
    ui_url = spark.sparkContext.uiWebUrl
    print(f"Spark UI: {ui_url}")
    
    # Number of executors (uses an internal JVM API; may change between Spark versions)
    executors = spark.sparkContext._jsc.sc().getExecutorMemoryStatus()
    print(f"Number of executors (including the driver): {executors.size()}")
    
    # Current configuration
    conf = spark.sparkContext.getConf()
    print("Current configuration:")
    for item in conf.getAll():
        print(f"  {item[0]}: {item[1]}")

def read_data_from_multiple_sources(spark):
    """
    Read data from multiple sources.
    """
    # Read CSV files from HDFS
    hdfs_data = spark.read \
        .option("header", "true") \
        .option("inferSchema", "true") \
        .csv("hdfs://namenode:9000/data/*.csv")
    
    # Read from a Hive table
    hive_data = spark.sql("""
        SELECT * FROM production.customer_data 
        WHERE date >= '2024-01-01'
    """)
    
    # Union the datasets (allowMissingColumns requires Spark 3.1+)
    combined_data = hdfs_data.unionByName(hive_data, allowMissingColumns=True)
    
    return combined_data

3. Configuration and Deployment Scripts

Create a configuration file spark_config.yaml:

spark:
  master: "spark://your-spark-master:7077"
  app_name: "ML-Training-Pipeline"
  driver_memory: "4g"
  executor_memory: "8g"
  executor_cores: 4
  executor_instances: 10
  
hdfs:
  namenode: "namenode:9000"
  data_path: "/data/training/"
  model_path: "/models/"
  
model:
  train_test_split: 0.8
  validation_split: 0.2
  random_seed: 42
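
A small loader can tie this file back to the session builder from section 2.1. A sketch, assuming PyYAML is installed (pip install pyyaml):

import yaml  # pip install pyyaml
from pyspark.sql import SparkSession

def create_spark_session_from_yaml(path="spark_config.yaml"):
    """Build a SparkSession from the spark: section of the YAML file above."""
    with open(path) as f:
        cfg = yaml.safe_load(f)["spark"]
    return (
        SparkSession.builder
        .appName(cfg["app_name"])
        .master(cfg["master"])
        .config("spark.driver.memory", cfg["driver_memory"])
        .config("spark.executor.memory", cfg["executor_memory"])
        .config("spark.executor.cores", str(cfg["executor_cores"]))
        .config("spark.executor.instances", str(cfg["executor_instances"]))
        .getOrCreate()
    )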

4. Local Development and Testing Tips

  1. Test at small scale first
# Test in local mode before submitting to the cluster
test_spark = SparkSession.builder \
    .appName("Local-Test") \
    .master("local[4]") \
    .getOrCreate()
  2. Remote debugging with PyCharm/VSCode

    • Set up a remote Python interpreter
    • Configure an SSH tunnel to reach the cluster
  3. Monitoring and logging

# Set the log level
spark.sparkContext.setLogLevel("INFO")

# Add basic performance timing to your code (a reusable helper is sketched below)
import time
start_time = time.time()
# ... your code ...
print(f"Elapsed time: {time.time() - start_time:.2f}s")

5. Troubleshooting Common Issues

  1. Connection failures

    • Check network connectivity
    • Confirm firewall settings
    • Verify authentication credentials
  2. Out-of-memory errors

    • Increase spark.executor.memory
    • Tune memory allocation via spark.memory.fraction
  3. Data skew

    • Repartition with repartition()
    • Add a salt key before grouping or joining (see the sketch after this list)
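
For the salt-key approach, here is a minimal sketch of salting a skewed join. df_left, df_right, skewed_key and N_SALTS are hypothetical placeholders; the idea is to spread hot keys of the large side across more partitions and replicate the small side once per salt value:

from pyspark.sql import functions as F

N_SALTS = 10

# Add a random salt to the large, skewed side so hot keys spread over more partitions.
salted_left = df_left.withColumn("salt", (F.rand(seed=42) * N_SALTS).cast("int"))

# Replicate the smaller side once per salt value so every salted key still matches.
salts = spark.range(N_SALTS).select(F.col("id").cast("int").alias("salt"))
salted_right = df_right.crossJoin(salts)

joined = salted_left.join(salted_right, on=["skewed_key", "salt"])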

Notes:

  1. Make sure your local Spark version is compatible with the cluster's version
  2. Pay attention to partitioning strategy when processing large datasets
  3. Clean up temporary files regularly
  4. Use checkpointing to avoid long lineage chains (see the sketch after this list)
  5. Configure the number of shuffle partitions appropriately
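
For point 4, a minimal checkpointing sketch (the checkpoint directory and the iterative transforms are hypothetical; the directory must live on fault-tolerant storage such as HDFS):

from pyspark.sql import functions as F

spark.sparkContext.setCheckpointDir("hdfs://namenode:9000/tmp/checkpoints")

df = spark.read.parquet("hdfs://namenode:9000/data/mydata.parquet")
for i in range(20):
    df = df.withColumn(f"derived_{i}", F.col("feature1") * i)  # hypothetical iterative transforms

df = df.checkpoint()  # materializes the data and truncates the lineage built up above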

This setup should be enough to connect to your Spark cluster from a local machine and develop machine learning jobs. Depending on your cluster's specific configuration, some adjustments may be needed.