A Complete Guide to Data Science and Business Practice with Python and Spark
This article walks through how to use Python and Spark for data science analysis and business application development, covering the core stages of data preparation, feature engineering, machine learning modeling, and business intelligence analysis, with directly runnable code examples.
1. Environment Setup and Data Preparation
1.1 PySpark Environment Configuration
# Install PySpark with pip
!pip install pyspark==3.3.1
!pip install findspark
# Initialize the Spark session
import findspark
findspark.init()
from pyspark.sql import SparkSession
spark = SparkSession.builder \
    .appName("Business Analytics") \
    .config("spark.executor.memory", "4g") \
    .config("spark.driver.memory", "2g") \
    .getOrCreate()
1.2 Data Loading and Exploration
# Load sales data from CSV
df = spark.read.csv("sales_data.csv",
                    header=True,
                    inferSchema=True,
                    nullValue="NA")
# Data overview
print(f"Number of rows: {df.count()}")
print(f"Number of columns: {len(df.columns)}")
df.printSchema()
# Descriptive statistics
df.describe().show()
# Missing-value check (note: this import shadows Python's built-in sum)
from pyspark.sql.functions import col, sum
df.select([sum(col(c).isNull().cast("int")).alias(c) for c in df.columns]).show()
2. Data Cleaning and Feature Engineering
2.1 Data Cleaning
from pyspark.sql.functions import when, mean
# Handle missing values: fill numeric columns with the column mean
numeric_cols = [col_name for col_name, dtype in df.dtypes
                if dtype in ['int', 'bigint', 'float', 'double']]
mean_values = df.select([mean(col(c)).alias(c) for c in numeric_cols]).collect()[0]
df_clean = df
for col_name in numeric_cols:
    df_clean = df_clean.withColumn(col_name,
                                   when(col(col_name).isNull(), mean_values[col_name])
                                   .otherwise(col(col_name)))
# Fill missing values in categorical columns
df_clean = df_clean.fillna({
    "category": "Unknown",
    "region": "Other"
})
# Outlier handling via the IQR method
from pyspark.sql.functions import percentile_approx
def remove_outliers(df, col_name):
    quantiles = df.select(
        percentile_approx(col_name, [0.25, 0.75], 1000000).alias("quantiles")
    ).collect()[0]["quantiles"]
    q1, q3 = quantiles
    iqr = q3 - q1
    lower_bound = q1 - 1.5 * iqr
    upper_bound = q3 + 1.5 * iqr
    return df.filter((col(col_name) >= lower_bound) & (col(col_name) <= upper_bound))
df_clean = remove_outliers(df_clean, "sales_amount")
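Before moving on, it is worth a quick sanity check on how aggressive the filter was; a minimal sketch comparing row counts before and after cleaning:
# Compare row counts before and after outlier filtering
rows_before = df.count()
rows_after = df_clean.count()
removed = rows_before - rows_after
print(f"Rows removed by the IQR filter: {removed} ({removed / rows_before:.1%})")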
2.2 Feature Engineering
from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler
from pyspark.sql.functions import month, year, dayofweek, to_date
# Extract date features (cast sale_date first in case it was inferred as a string)
df_features = df_clean.withColumn("sale_date", to_date("sale_date")) \
    .withColumn("sale_month", month("sale_date")) \
    .withColumn("sale_year", year("sale_date")) \
    .withColumn("day_of_week", dayofweek("sale_date"))
# Encode categorical variables
cat_cols = ["category", "region", "product_type"]
stages = []
for col_name in cat_cols:
    string_indexer = StringIndexer(inputCol=col_name, outputCol=col_name+"_index")
    encoder = OneHotEncoder(inputCol=col_name+"_index", outputCol=col_name+"_encoded")
    stages += [string_indexer, encoder]
# Standardize numeric features (sales_amount is deliberately excluded: it is the
# label of the regression model in section 3.1, and including it would leak the target)
from pyspark.ml.feature import StandardScaler
numeric_cols = ["unit_price", "quantity"]
assembler_numeric = VectorAssembler(inputCols=numeric_cols, outputCol="numeric_features")
scaler = StandardScaler(inputCol="numeric_features", outputCol="scaled_numeric_features")
stages += [assembler_numeric, scaler]
# Combine all features into a single vector
feature_cols = [c+"_encoded" for c in cat_cols] + ["scaled_numeric_features"]
assembler = VectorAssembler(inputCols=feature_cols, outputCol="features")
stages.append(assembler)
# Build the feature engineering Pipeline
from pyspark.ml import Pipeline
pipeline = Pipeline(stages=stages)
model = pipeline.fit(df_features)
df_transformed = model.transform(df_features)
# Show the processed features
df_transformed.select("features").show(5, truncate=False)
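One practical addition: the prediction service in section 7.2 must re-apply exactly these transformations at serving time, so it pays to persist the fitted pipeline now (the path below is an arbitrary choice):
# Persist the fitted feature pipeline for reuse at serving time (section 7.2)
model.write().overwrite().save("models/feature_pipeline")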
3. Machine Learning Modeling
3.1 Sales Forecasting Model
from pyspark.ml.regression import RandomForestRegressor
from pyspark.ml.evaluation import RegressionEvaluator
# Split into training and test sets
train_data, test_data = df_transformed.randomSplit([0.8, 0.2], seed=42)
# Build a random forest model
rf = RandomForestRegressor(featuresCol="features",
                           labelCol="sales_amount",
                           numTrees=100,
                           maxDepth=10,
                           seed=42)
# Train the model
rf_model = rf.fit(train_data)
# Predict
predictions = rf_model.transform(test_data)
# Evaluate the model
evaluator = RegressionEvaluator(labelCol="sales_amount",
                                predictionCol="prediction",
                                metricName="rmse")
rmse = evaluator.evaluate(predictions)
print(f"Root Mean Squared Error (RMSE): {rmse}")
# Feature importances (collect attribute metadata from the assembled vector,
# sort by vector index, then map each index back to its column name)
attrs = df_transformed.schema["features"].metadata["ml_attr"]["attrs"]
all_attrs = sorted((a for group in attrs.values() for a in group),
                   key=lambda a: a["idx"])
for attr in all_attrs:
    print(f"{attr['name']}: {rf_model.featureImportances[attr['idx']]}")
3.2 Customer Segmentation Model
from pyspark.ml.clustering import KMeans
from pyspark.ml.evaluation import ClusteringEvaluator
from pyspark.sql.functions import count, countDistinct
# Prepare customer behavior features
customer_features = df_transformed.groupBy("customer_id") \
    .agg(
        mean("sales_amount").alias("avg_purchase"),
        sum("sales_amount").alias("total_spend"),
        countDistinct("product_id").alias("product_variety"),
        count("*").alias("purchase_frequency")
    )
# Feature engineering
assembler = VectorAssembler(
    inputCols=["avg_purchase", "total_spend", "product_variety", "purchase_frequency"],
    outputCol="features"
)
scaler = StandardScaler(inputCol="features", outputCol="scaled_features")
pipeline = Pipeline(stages=[assembler, scaler])
model = pipeline.fit(customer_features)
customer_data = model.transform(customer_features)
# Search for the best K
silhouette_scores = []
evaluator = ClusteringEvaluator(featuresCol='scaled_features',
                                metricName='silhouette',
                                distanceMeasure='squaredEuclidean')
for k in range(2, 11):
    kmeans = KMeans(featuresCol='scaled_features', k=k, seed=42)
    model = kmeans.fit(customer_data)
    predictions = model.transform(customer_data)
    score = evaluator.evaluate(predictions)
    silhouette_scores.append(score)
    print(f"Silhouette Score for k={k}: {score}")
# Build the final model with the best K
best_k = silhouette_scores.index(max(silhouette_scores)) + 2
kmeans = KMeans(featuresCol='scaled_features', k=best_k, seed=42)
final_model = kmeans.fit(customer_data)
clustered_data = final_model.transform(customer_data)
# Analyze the clustering results
clustered_data.groupBy("prediction") \
    .agg(
        mean("avg_purchase").alias("avg_purchase"),
        mean("total_spend").alias("total_spend"),
        mean("product_variety").alias("product_variety"),
        mean("purchase_frequency").alias("purchase_frequency"),
        count("*").alias("count")
    ) \
    .orderBy("prediction") \
    .show()
4. Business Intelligence Analysis
4.1 Sales Trend Analysis
import matplotlib.pyplot as plt
import pandas as pd
# Analyze the sales trend by month
monthly_sales = df_transformed.groupBy("sale_year", "sale_month") \
    .agg(sum("sales_amount").alias("total_sales")) \
    .orderBy("sale_year", "sale_month") \
    .toPandas()
# Visualization (year-month labels so the x-axis reads in chronological order)
plt.figure(figsize=(12, 6))
plt.plot(monthly_sales["sale_year"].astype(str) + "-" + monthly_sales["sale_month"].astype(str),
         monthly_sales["total_sales"])
plt.title("Monthly Sales Trend")
plt.xlabel("Year-Month")
plt.ylabel("Total Sales Amount")
plt.xticks(rotation=45)
plt.grid()
plt.show()
4.2 Product Association Analysis
from pyspark.ml.fpm import FPGrowth
from pyspark.sql.functions import collect_set
# Prepare transaction data (the set of products in each transaction)
transactions = df_transformed.groupBy("transaction_id") \
    .agg(collect_set("product_id").alias("items"))
# Frequent pattern mining
fp_growth = FPGrowth(itemsCol="items", minSupport=0.01, minConfidence=0.5)
model = fp_growth.fit(transactions)
# Show frequent itemsets
model.freqItemsets.show(10)
# Show association rules
model.associationRules.show(10)
# Apply the rules to generate recommendations
model.transform(transactions).show(5)
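In practice the raw rule list is usually filtered before being shown to the business side: rules with lift above 1 indicate the antecedent genuinely increases the chance of buying the consequent. A minimal sketch (the 1.2 threshold is an arbitrary assumption):
# Keep only rules whose lift clears a minimum bar, strongest first
strong_rules = model.associationRules.filter(col("lift") > 1.2)
strong_rules.orderBy(col("lift").desc()).show(10, truncate=False)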
5. Performance Optimization Tips
5.1 Data Partitioning
# Repartition by month to speed up queries that filter on it
df_optimized = df_transformed.repartition(12, "sale_month")
# Cache the frequently used dataset
df_optimized.cache()
df_optimized.count()  # trigger the cache
# Inspect the partitioning
print("Number of partitions:", df_optimized.rdd.getNumPartitions())
5.2 Spark SQL Optimization
# Create a temporary view
df_transformed.createOrReplaceTempView("sales")
# Use Spark SQL for complex analysis
top_customers = spark.sql("""
    SELECT customer_id, SUM(sales_amount) AS total_spend
    FROM sales
    GROUP BY customer_id
    ORDER BY total_spend DESC
    LIMIT 10
""")
top_customers.show()
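The same query can be written with the DataFrame API; both forms compile to the same optimized plan through Catalyst, so the choice is largely one of style:
# Equivalent DataFrame API version of the SQL query above
top_customers_df = df_transformed.groupBy("customer_id") \
    .agg(sum("sales_amount").alias("total_spend")) \
    .orderBy(col("total_spend").desc()) \
    .limit(10)
top_customers_df.show()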
# Use a broadcast join to optimize joins against small tables
from pyspark.sql.functions import broadcast
product_info = spark.read.csv("product_info.csv", header=True, inferSchema=True)
df_joined = df_transformed.join(broadcast(product_info),
                                "product_id",
                                "left")
6. Business Application Cases
6.1 Dynamic Pricing Strategy
from pyspark.sql.window import Window
from pyspark.sql.functions import lag, avg
# Estimate price elasticity per product: elasticity = % change in demand / % change in price
window_spec = Window.partitionBy("product_id").orderBy("sale_date")
price_elasticity = df_transformed.withColumn("prev_price", lag("unit_price", 1).over(window_spec)) \
    .withColumn("prev_quantity", lag("quantity", 1).over(window_spec)) \
    .na.drop() \
    .withColumn("price_change", (col("unit_price") - col("prev_price")) / col("prev_price")) \
    .withColumn("demand_change", (col("quantity") - col("prev_quantity")) / col("prev_quantity")) \
    .filter(col("price_change") != 0) \
    .withColumn("elasticity", col("demand_change") / col("price_change")) \
    .groupBy("product_id") \
    .agg(avg("elasticity").alias("avg_elasticity"))
# Derive a pricing strategy from the elasticity:
# |elasticity| > 1 means demand is elastic (price-sensitive); near 0 means inelastic
pricing_strategy = price_elasticity.withColumn("pricing_strategy",
    when(col("avg_elasticity") < -1, "Low Margin (Elastic)")
    .when(col("avg_elasticity").between(-1, -0.5), "Moderate Margin")
    .otherwise("High Margin (Inelastic)")
)
pricing_strategy.show()
6.2 Inventory Optimization Model
import math
from pyspark.sql.functions import stddev
# Demand forecasting and inventory recommendations per product
# Safety stock = z * sigma * sqrt(lead time): 1.65 corresponds to a
# 95% service level, assuming a 30-day lead time
inventory_analysis = df_transformed.groupBy("product_id", "sale_month") \
    .agg(
        sum("quantity").alias("monthly_demand"),
        avg("quantity").alias("avg_daily_demand"),
        stddev("quantity").alias("demand_stddev")
    ) \
    .withColumn("safety_stock",
                col("demand_stddev") * 1.65 * math.sqrt(30)) \
    .withColumn("reorder_point",
                col("avg_daily_demand") * 30 + col("safety_stock")) \
    .orderBy("product_id", "sale_month")
inventory_analysis.show()
7. Model Deployment and API Service
7.1 Saving and Loading Models
# Save the sales forecasting model
rf_model.write().overwrite().save("models/sales_prediction_model")
# Load the model in another application
from pyspark.ml.regression import RandomForestRegressionModel
loaded_model = RandomForestRegressionModel.load("models/sales_prediction_model")
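A quick way to verify the round trip is to score a few rows with the reloaded model (reusing test_data from section 3.1):
# Sanity check: the reloaded model should produce the same predictions
loaded_model.transform(test_data).select("prediction").show(3)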
7.2 Building a Prediction API Service
from flask import Flask, request, jsonify
import pandas as pd
from pyspark.sql import SparkSession
from pyspark.ml import PipelineModel
from pyspark.ml.regression import RandomForestRegressionModel
app = Flask(__name__)
# Initialize Spark
spark = SparkSession.builder.appName("PredictionAPI").getOrCreate()
# Load both the feature pipeline (saved in section 2.2) and the trained model
pipeline_model = PipelineModel.load("models/feature_pipeline")
model = RandomForestRegressionModel.load("models/sales_prediction_model")
@app.route('/predict', methods=['POST'])
def predict():
    # Read the input data
    input_data = request.json
    # Convert to a Spark DataFrame
    input_df = spark.createDataFrame(pd.DataFrame([input_data]))
    # Apply the same feature engineering pipeline
    processed_df = pipeline_model.transform(input_df)
    # Predict
    predictions = model.transform(processed_df)
    # Return the result
    result = predictions.select("prediction").collect()[0][0]
    return jsonify({"predicted_sales": result})
if __name__ == '__main__':
    app.run(host='0.0.0.0', port=5000)
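Once the service is running, it can be exercised with any HTTP client. A sketch using the requests library; the field names must match the raw input columns your feature pipeline expects, and the values below are made-up examples:
import requests
# Example request; keys must match the pipeline's raw input columns
sample = {
    "category": "Electronics",   # hypothetical illustration values
    "region": "North",
    "product_type": "Gadget",
    "unit_price": 19.99,
    "quantity": 3
}
resp = requests.post("http://localhost:5000/predict", json=sample)
print(resp.json())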
8. Best Practices Summary
- Data quality first: always start with data exploration and cleaning to ensure data quality
- Incremental modeling: start with simple models and add complexity gradually
- Business understanding: model metrics should align with business KPIs
- Interpretability: prefer interpretable models so the business side can understand them
- Monitor and iterate: set up model performance monitoring and retrain regularly (see the sketch after this list)
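As a concrete example of the monitoring point above, a scheduled job can re-score recent labeled data with the saved artifacts and alert when the error drifts; a minimal sketch in which the data path and the 20% threshold are assumptions:
# Minimal drift check: re-evaluate the saved model on fresh labeled data
from pyspark.ml.evaluation import RegressionEvaluator
fresh_data = spark.read.parquet("data/recent_labeled_sales")  # assumed path
fresh_scored = loaded_model.transform(pipeline_model.transform(fresh_data))
monitor_evaluator = RegressionEvaluator(labelCol="sales_amount",
                                        predictionCol="prediction",
                                        metricName="rmse")
current_rmse = monitor_evaluator.evaluate(fresh_scored)
if current_rmse > 1.2 * rmse:  # more than 20% worse than the section 3.1 baseline
    print(f"ALERT: RMSE drifted from {rmse:.2f} to {current_rmse:.2f}; consider retraining")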
With the hands-on guide in this article, you now have an end-to-end toolkit for data science and business analytics with Python and Spark. In real projects, adapt the analysis workflow and model choices to your specific business needs and keep refining the solution.