基于pyspark的数据科学与商业实践视频课程

阅读时长:约 6 分钟

Python+Spark数据科学与商业实践全指南

本文将全面介绍如何使用Python和Spark进行数据科学分析和商业应用开发,涵盖数据准备、特征工程、机器学习建模、商业智能分析等核心环节,并提供可直接运行的代码示例。

一、环境搭建与数据准备

1.1 PySpark环境配置

# 使用pip安装PySpark
!pip install pyspark==3.3.1
!pip install findspark

# 初始化Spark会话
import findspark
findspark.init()

from pyspark.sql import SparkSession

spark = SparkSession.builder \
    .appName("Business Analytics") \
    .config("spark.executor.memory", "4g") \
    .config("spark.driver.memory", "2g") \
    .getOrCreate()

1.2 数据加载与探索

# Load the sales dataset; infer column types and treat "NA" as null
df = spark.read.csv(
    "sales_data.csv",
    header=True,
    inferSchema=True,
    nullValue="NA",
)

# Quick overview: row/column counts and the inferred schema
print(f"数据行数: {df.count()}")
print(f"数据列数: {len(df.columns)}")
df.printSchema()

# Summary statistics (count/mean/stddev/min/max) for every column
df.describe().show()

# Per-column null counts in a single pass
# NOTE: this `sum` is pyspark.sql.functions.sum and shadows the builtin.
from pyspark.sql.functions import col, sum

null_count_exprs = [sum(col(name).isNull().cast("int")).alias(name) for name in df.columns]
df.select(null_count_exprs).show()

二、数据清洗与特征工程

2.1 数据清洗

from pyspark.sql.functions import when, mean

# Impute missing numeric values with each column's mean.
# (A when/otherwise per column is used instead of fillna(dict) because
# fillna silently skips int columns when given float values.)
numeric_cols = [name for name, dtype in df.dtypes if dtype in ['int', 'double']]
col_means = df.select([mean(col(c)).alias(c) for c in numeric_cols]).collect()[0]

df_clean = df
for name in numeric_cols:
    fill_value = col_means[name]
    df_clean = df_clean.withColumn(
        name,
        when(col(name).isNull(), fill_value).otherwise(col(name)),
    )

# Fill missing categorical values with sentinel labels
df_clean = df_clean.fillna({
    "category": "Unknown",
    "region": "Other",
})

# Outlier removal via Tukey's IQR fences
from pyspark.sql.functions import percentile_approx

def remove_outliers(df, col_name, factor=1.5, accuracy=1000000):
    """Filter *df* down to rows whose *col_name* lies within the IQR fences.

    Computes approximate Q1/Q3 with percentile_approx, then keeps rows in
    [Q1 - factor*IQR, Q3 + factor*IQR].

    Args:
        df: input Spark DataFrame.
        col_name: numeric column to test for outliers.
        factor: fence multiplier (default 1.5 = Tukey's classic rule;
            parameterized so callers can widen or narrow the fences).
        accuracy: percentile_approx accuracy parameter (higher = more exact,
            at more memory cost).

    Returns:
        DataFrame with out-of-fence rows filtered out. Rows where col_name
        is null fail both comparisons and are dropped as well.
    """
    q1, q3 = df.select(
        percentile_approx(col_name, [0.25, 0.75], accuracy).alias("quantiles")
    ).collect()[0]["quantiles"]

    iqr = q3 - q1
    lower_bound = q1 - factor * iqr
    upper_bound = q3 + factor * iqr

    return df.filter((col(col_name) >= lower_bound) & (col(col_name) <= upper_bound))

df_clean = remove_outliers(df_clean, "sales_amount")

2.2 特征工程

from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler
from pyspark.sql.functions import month, year, dayofweek

# Calendar features derived from the sale date
df_features = (
    df_clean
    .withColumn("sale_month", month("sale_date"))
    .withColumn("sale_year", year("sale_date"))
    .withColumn("day_of_week", dayofweek("sale_date"))
)

# One stage pair per categorical column: string index, then one-hot encode
cat_cols = ["category", "region", "product_type"]
stages = []
for name in cat_cols:
    stages.append(StringIndexer(inputCol=name, outputCol=name + "_index"))
    stages.append(OneHotEncoder(inputCol=name + "_index", outputCol=name + "_encoded"))

# Standardize the numeric columns as a single assembled vector
from pyspark.ml.feature import StandardScaler

numeric_cols = ["sales_amount", "unit_price", "quantity"]
# NOTE(review): sales_amount is included as a feature here, but it is also
# used as the regression label downstream -- likely target leakage; confirm.
assembler_numeric = VectorAssembler(inputCols=numeric_cols, outputCol="numeric_features")
scaler = StandardScaler(inputCol="numeric_features", outputCol="scaled_numeric_features")
stages.append(assembler_numeric)
stages.append(scaler)

# Final feature vector = encoded categoricals + scaled numerics
feature_cols = [name + "_encoded" for name in cat_cols] + ["scaled_numeric_features"]
assembler = VectorAssembler(inputCols=feature_cols, outputCol="features")
stages.append(assembler)

# Fit and apply the whole feature-engineering pipeline in one shot
from pyspark.ml import Pipeline

pipeline = Pipeline(stages=stages)
model = pipeline.fit(df_features)
df_transformed = model.transform(df_features)

# Inspect a few assembled feature vectors
df_transformed.select("features").show(5, truncate=False)

三、机器学习建模

3.1 销售预测模型

from pyspark.ml.regression import RandomForestRegressor
from pyspark.ml.evaluation import RegressionEvaluator

# 80/20 train/test split, seeded for reproducibility
train_data, test_data = df_transformed.randomSplit([0.8, 0.2], seed=42)

# Random-forest regressor predicting sales_amount from the assembled features
rf = RandomForestRegressor(featuresCol="features",
                          labelCol="sales_amount",
                          numTrees=100,
                          maxDepth=10,
                          seed=42)

# Train on the training split
rf_model = rf.fit(train_data)

# Score the held-out split
predictions = rf_model.transform(test_data)

# Evaluate with RMSE (same units as sales_amount)
evaluator = RegressionEvaluator(labelCol="sales_amount",
                               predictionCol="prediction",
                               metricName="rmse")

rmse = evaluator.evaluate(predictions)
print(f"Root Mean Squared Error (RMSE): {rmse}")

# Feature importances, aligned to names via each attribute's "idx" slot.
# FIX: the original zipped the concatenated numeric+binary attribute lists
# against the importance vector; that concatenation does NOT preserve the
# vector's slot order, so names and importances could be mismatched.
feature_importance = rf_model.featureImportances
ml_attrs = df_transformed.schema["features"].metadata["ml_attr"]["attrs"]
all_attrs = ml_attrs.get("numeric", []) + ml_attrs.get("binary", [])
for attr in sorted(all_attrs, key=lambda a: a["idx"]):
    print(f"{attr['name']}: {feature_importance[attr['idx']]}")

3.2 客户分群模型

from pyspark.ml.clustering import KMeans
from pyspark.ml.evaluation import ClusteringEvaluator
# FIX: count and countDistinct are used below but were never imported
# anywhere in the script, raising NameError at runtime. (This `sum` is
# Spark's aggregate and shadows the builtin, as elsewhere in the script.)
from pyspark.sql.functions import count, countDistinct, mean, sum

# Per-customer behavioral aggregates (RFM-style features)
customer_features = df_transformed.groupBy("customer_id") \
    .agg(
        mean("sales_amount").alias("avg_purchase"),
        sum("sales_amount").alias("total_spend"),
        countDistinct("product_id").alias("product_variety"),
        count("*").alias("purchase_frequency")
    )

# Assemble + standardize so no single feature's scale dominates k-means
assembler = VectorAssembler(
    inputCols=["avg_purchase", "total_spend", "product_variety", "purchase_frequency"],
    outputCol="features"
)

scaler = StandardScaler(inputCol="features", outputCol="scaled_features")
pipeline = Pipeline(stages=[assembler, scaler])
model = pipeline.fit(customer_features)
customer_data = model.transform(customer_features)

# Scan k = 2..10, scoring each clustering by silhouette
silhouette_scores = []
evaluator = ClusteringEvaluator(featuresCol='scaled_features',
                               metricName='silhouette',
                               distanceMeasure='squaredEuclidean')

for k in range(2, 11):
    kmeans = KMeans(featuresCol='scaled_features', k=k, seed=42)
    model = kmeans.fit(customer_data)
    predictions = model.transform(customer_data)
    score = evaluator.evaluate(predictions)
    silhouette_scores.append(score)
    print(f"Silhouette Score for k={k}: {score}")

# Refit with the best k (+2 because the scan started at k=2)
best_k = silhouette_scores.index(max(silhouette_scores)) + 2
kmeans = KMeans(featuresCol='scaled_features', k=best_k, seed=42)
final_model = kmeans.fit(customer_data)
clustered_data = final_model.transform(customer_data)

# Profile each cluster by its average behavior and size
clustered_data.groupBy("prediction") \
    .agg(
        mean("avg_purchase").alias("avg_purchase"),
        mean("total_spend").alias("total_spend"),
        mean("product_variety").alias("product_variety"),
        mean("purchase_frequency").alias("purchase_frequency"),
        count("*").alias("count")
    ) \
    .orderBy("prediction") \
    .show()

四、商业智能分析

4.1 销售趋势分析

import matplotlib.pyplot as plt
import pandas as pd

# Aggregate total sales per (year, month); the result is small, so pull
# it into pandas for plotting
monthly_sales = (
    df_transformed.groupBy("sale_year", "sale_month")
    .agg(sum("sales_amount").alias("total_sales"))
    .orderBy("sale_year", "sale_month")
    .toPandas()
)

# Plot the trend with "month-year" tick labels
month_year_labels = (
    monthly_sales["sale_month"].astype(str) + "-" + monthly_sales["sale_year"].astype(str)
)
plt.figure(figsize=(12, 6))
plt.plot(month_year_labels, monthly_sales["total_sales"])
plt.title("Monthly Sales Trend")
plt.xlabel("Month-Year")
plt.ylabel("Total Sales Amount")
plt.xticks(rotation=45)
plt.grid()
plt.show()

4.2 产品关联分析

from pyspark.ml.fpm import FPGrowth
# FIX: collect_set is used below but was never imported anywhere in the
# script, raising NameError at runtime.
from pyspark.sql.functions import collect_set

# One basket per transaction: the distinct set of products bought together
transactions = df_transformed.groupBy("transaction_id") \
    .agg(collect_set("product_id").alias("items"))

# Mine frequent itemsets and association rules
fp_growth = FPGrowth(itemsCol="items", minSupport=0.01, minConfidence=0.5)
model = fp_growth.fit(transactions)

# Frequent itemsets (top 10 rows)
model.freqItemsets.show(10)

# Association rules (antecedent => consequent, confidence >= 0.5)
model.associationRules.show(10)

# Apply the rules to each basket to suggest additional items
model.transform(transactions).show(5)

五、性能优化技巧

5.1 数据分区优化

# Repartition by sale_month (12 partitions) so month-filtered queries
# scan fewer partitions
df_optimized = df_transformed.repartition(12, "sale_month")

# Cache the hot dataset; count() forces the cache to materialize
df_optimized.cache()
df_optimized.count()

# Report the resulting partition count
print("分区数:", df_optimized.rdd.getNumPartitions())

5.2 Spark SQL优化

# Register the DataFrame as a temp view so it can be queried with SQL
df_transformed.createOrReplaceTempView("sales")

# Top-10 customers by lifetime spend via Spark SQL
top_customers_query = """
    SELECT customer_id, SUM(sales_amount) as total_spend
    FROM sales
    GROUP BY customer_id
    ORDER BY total_spend DESC
    LIMIT 10
"""
top_customers = spark.sql(top_customers_query)
top_customers.show()

# Broadcast the small product table so the join avoids a shuffle
from pyspark.sql.functions import broadcast

product_info = spark.read.csv("product_info.csv", header=True)
df_joined = df_transformed.join(broadcast(product_info), "product_id", "left")

六、商业应用案例

6.1 动态定价策略

from pyspark.sql.window import Window
from pyspark.sql.functions import lag, avg

# Estimate per-product price elasticity of demand from consecutive sales:
# elasticity = (%Δ quantity) / (%Δ price). Rows missing a previous
# observation are dropped; a zero price change produces a null ratio,
# which avg() ignores.
window_spec = Window.partitionBy("product_id").orderBy("sale_date")
price_elasticity = df_transformed.withColumn("prev_price", lag("unit_price", 1).over(window_spec)) \
                                .withColumn("prev_quantity", lag("quantity", 1).over(window_spec)) \
                                .na.drop() \
                                .withColumn("price_change", (col("unit_price") - col("prev_price")) / col("prev_price")) \
                                .withColumn("demand_change", (col("quantity") - col("prev_quantity")) / col("prev_quantity")) \
                                .withColumn("elasticity", col("demand_change") / col("price_change")) \
                                .groupBy("product_id") \
                                .agg(avg("elasticity").alias("avg_elasticity"))

# FIX: the original labels were inverted. Elasticity below -1 means demand
# is highly price-sensitive (ELASTIC -> keep margins low); elasticity close
# to 0 means demand barely reacts to price (INELASTIC -> margins can be high).
pricing_strategy = price_elasticity.withColumn("pricing_strategy",
    when(col("avg_elasticity") < -1, "Low Margin (Elastic)")
    .when(col("avg_elasticity").between(-1, -0.5), "Moderate Margin")
    .otherwise("High Margin (Inelastic)")
)

pricing_strategy.show()

6.2 库存优化模型

# Demand statistics and inventory recommendations per product/month.
# FIX: the original had a SyntaxError (a comment after a line-continuation
# backslash) and used stddev/sqrt without importing them. The chain is now
# parenthesized and sqrt(30) is computed with math.sqrt (a plain constant).
import math

from pyspark.sql.functions import stddev

inventory_analysis = (
    df_transformed.groupBy("product_id", "sale_month")
    .agg(
        sum("quantity").alias("monthly_demand"),
        avg("quantity").alias("avg_daily_demand"),
        stddev("quantity").alias("demand_stddev"),
    )
    # Safety stock at ~95% service level (z = 1.65) over a 30-day lead time.
    # NOTE(review): demand_stddev is computed over monthly rows, not daily
    # ones -- confirm the sqrt(30) scaling matches the data granularity.
    .withColumn("safety_stock", col("demand_stddev") * 1.65 * math.sqrt(30))
    .withColumn("reorder_point", col("avg_daily_demand") * 30 + col("safety_stock"))
    .orderBy("product_id", "sale_month")
)

inventory_analysis.show()

七、模型部署与API服务

7.1 模型保存与加载

# Persist the trained sales-forecast model, overwriting any previous version
rf_model.write().overwrite().save("models/sales_prediction_model")

# Reload it elsewhere (e.g. in a separate application)
from pyspark.ml.regression import RandomForestRegressionModel

loaded_model = RandomForestRegressionModel.load("models/sales_prediction_model")

7.2 创建预测API服务

from flask import Flask, request, jsonify
import pandas as pd
from pyspark.sql import SparkSession
from pyspark.ml import PipelineModel
from pyspark.ml.regression import RandomForestRegressionModel

app = Flask(__name__)

# Initialize Spark and load both models once at startup, not per request
spark = SparkSession.builder.appName("PredictionAPI").getOrCreate()
# FIX: the original referenced an undefined `pipeline_model` inside
# predict(), raising NameError on the first request. The feature-engineering
# pipeline must be saved during training (pipeline_model.write().save(...))
# and loaded here alongside the regressor — adjust the path to match.
pipeline_model = PipelineModel.load("models/feature_pipeline")
model = RandomForestRegressionModel.load("models/sales_prediction_model")

@app.route('/predict', methods=['POST'])
def predict():
    """Score one JSON record and return the predicted sales amount.

    Expects a flat JSON object with the same raw columns the training
    pipeline consumed; returns {"predicted_sales": <float>}.
    """
    input_data = request.json

    # Single-row Spark DataFrame built via pandas
    input_df = spark.createDataFrame(pd.DataFrame([input_data]))

    # Apply the same feature engineering used at training time
    processed_df = pipeline_model.transform(input_df)

    # Predict and extract the scalar result
    predictions = model.transform(processed_df)
    result = predictions.select("prediction").collect()[0][0]
    return jsonify({"predicted_sales": result})

if __name__ == '__main__':
    app.run(host='0.0.0.0', port=5000)

八、最佳实践总结

  1. 数据质量优先:始终从数据探索和清洗开始,确保数据质量
  2. 增量建模:从简单模型开始,逐步增加复杂性
  3. 业务理解:模型指标应与业务KPI对齐
  4. 可解释性:优先选择可解释的模型,便于业务理解
  5. 监控迭代:建立模型性能监控机制,定期更新模型

通过本文的实践指南,您已经掌握了使用Python和Spark进行端到端数据科学和商业分析的全套技能。实际应用中,建议根据具体业务需求调整分析流程和模型选择,持续优化解决方案。