Kafka 4.0.1 + Spark Integration (KRaft Mode)
Part 1: Environment Setup
1. Install Java (required)
# Install Java 17 (Kafka 4.0 brokers require Java 17; Spark 3.5 supports it as well)
brew install openjdk@17
# Set environment variables
echo 'export PATH="/opt/homebrew/opt/openjdk@17/bin:$PATH"' >> ~/.zshrc
echo 'export JAVA_HOME="/opt/homebrew/opt/openjdk@17"' >> ~/.zshrc
source ~/.zshrc
# Verify the installation
java -version
2. Install Kafka 4.0.1 and configure KRaft
# Install Kafka
brew install kafka
# Check the version (should be 4.0.1 or newer)
kafka-topics --version
# Create a directory for the configuration files
mkdir -p ~/kafka-kraft-config
cd ~/kafka-kraft-config
# Generate a cluster ID (important)
KAFKA_CLUSTER_ID=$(kafka-storage random-uuid)
echo "Cluster ID: $KAFKA_CLUSTER_ID"
# Create the KRaft server configuration file
cat > server.properties << EOF
# Kafka server configuration
process.roles=broker,controller
node.id=1
controller.quorum.voters=1@localhost:9093
# Listener configuration
listeners=PLAINTEXT://localhost:9092,CONTROLLER://localhost:9093
advertised.listeners=PLAINTEXT://localhost:9092
listener.security.protocol.map=CONTROLLER:PLAINTEXT,PLAINTEXT:PLAINTEXT
inter.broker.listener.name=PLAINTEXT
# Storage configuration
log.dirs=/tmp/kraft-combined-logs
num.partitions=3
default.replication.factor=1
min.insync.replicas=1
# Controller configuration
controller.listener.names=CONTROLLER
EOF
# Format the storage directory
kafka-storage format -t $KAFKA_CLUSTER_ID -c server.properties --ignore-formatted
# Start Kafka (KRaft mode)
kafka-server-start server.properties
3. Install Spark and PySpark
# Install Spark
brew install apache-spark
# Install PySpark
pip install pyspark==3.5.0
# Install the other required Python packages (faker is used by the data generator below)
pip install kafka-python findspark faker
4. Verify the installation
# Verify that Kafka is running
kafka-topics --bootstrap-server localhost:9092 --list
# Verify Spark
spark-shell --version
python -c "import pyspark; print(pyspark.__version__)"
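Beyond the CLI checks above, a small Python script can confirm that the kafka-python client and the broker can talk to each other before moving on. A minimal sketch, assuming the broker started above is listening on localhost:9092:

#!/usr/bin/env python3
# Minimal connectivity check: list the broker's topics with kafka-python.
# Raises kafka.errors.NoBrokersAvailable if the broker is not reachable.
from kafka import KafkaConsumer

def check_kafka(bootstrap_servers="localhost:9092"):
    try:
        consumer = KafkaConsumer(bootstrap_servers=bootstrap_servers)
        topics = consumer.topics()  # forces a metadata request
        consumer.close()
        print(f"Broker reachable, topics: {sorted(topics)}")
        return True
    except Exception as exc:
        print(f"Cannot reach Kafka at {bootstrap_servers}: {exc}")
        return False

if __name__ == "__main__":
    check_kafka()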
Part 2: Complete, Runnable Python Code
Directory layout
kafka-spark-kraft/
├── config/
│ └── kafka_config.py
├── producer/
│ └── kafka_producer.py
├── spark/
│ ├── streaming_consumer.py
│ └── batch_consumer.py
├── utils/
│ └── data_generator.py
└── run_integration.py
1. Configuration (config/kafka_config.py)
#!/usr/bin/env python3
# Kafka 配置 (KRaft模式)
KAFKA_CONFIG = {
'bootstrap_servers': 'localhost:9092',
'topics': {
'transactions': 'ecommerce-transactions',
'user_activity': 'user-activity-logs',
'metrics': 'system-metrics'
},
'consumer_group': 'spark-consumer-group',
    'schema_registry': None  # 本示例直接使用JSON消息,不需要Schema Registry
}
# Spark 配置
SPARK_CONFIG = {
'app_name': 'KafkaSparkIntegration',
'master': 'local[*]',
'spark.sql.streaming.checkpointLocation': '/tmp/spark-checkpoints',
'spark.jars.packages': 'org.apache.spark:spark-sql-kafka-0-10_2.12:3.5.0,org.apache.spark:spark-avro_2.12:3.5.0',
'spark.executor.memory': '2g',
'spark.driver.memory': '2g'
}
# 数据Schema
SCHEMAS = {
'transaction': {
'fields': ['transaction_id', 'user_id', 'product_id', 'product_name',
'category', 'price', 'quantity', 'timestamp', 'payment_method'],
'types': ['string', 'string', 'string', 'string',
'string', 'float', 'int', 'timestamp', 'string']
},
'user_activity': {
'fields': ['user_id', 'session_id', 'event_type', 'page_url',
'timestamp', 'device_type', 'location'],
'types': ['string', 'string', 'string', 'string',
'timestamp', 'string', 'string']
}
}
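The SCHEMAS dictionary above is purely declarative; the Spark consumers below define their StructTypes by hand. If you prefer deriving them from the config, a hedged helper could look like the following (the string-to-Spark-type mapping and the build_struct_type name are assumptions of this sketch, not part of the original config):

# Hypothetical helper: build a Spark StructType from a SCHEMAS entry.
# The type-name mapping below is an assumption of this sketch.
from pyspark.sql.types import (StructType, StructField, StringType,
                               FloatType, IntegerType, TimestampType)

_TYPE_MAP = {
    'string': StringType(),
    'float': FloatType(),
    'int': IntegerType(),
    'timestamp': TimestampType(),
}

def build_struct_type(schema_def):
    """Convert {'fields': [...], 'types': [...]} into a StructType."""
    return StructType([
        StructField(name, _TYPE_MAP[type_name], nullable=True)
        for name, type_name in zip(schema_def['fields'], schema_def['types'])
    ])

# Example:
# from config.kafka_config import SCHEMAS
# transaction_schema = build_struct_type(SCHEMAS['transaction'])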
2. Data generator (utils/data_generator.py)
import json
import time
import random
import uuid
from datetime import datetime
from faker import Faker
class DataGenerator:
def __init__(self):
self.fake = Faker()
self.products = [
{'id': 'P001', 'name': 'iPhone 15', 'category': 'Electronics', 'price': 999.99},
{'id': 'P002', 'name': 'MacBook Pro', 'category': 'Computers', 'price': 1999.99},
{'id': 'P003', 'name': 'AirPods Pro', 'category': 'Audio', 'price': 249.99},
{'id': 'P004', 'name': 'iPad Air', 'category': 'Tablets', 'price': 599.99},
{'id': 'P005', 'name': 'Apple Watch', 'category': 'Wearables', 'price': 399.99}
]
self.payment_methods = ['Credit Card', 'PayPal', 'Apple Pay', 'Google Pay']
self.device_types = ['Mobile', 'Desktop', 'Tablet']
self.event_types = ['page_view', 'add_to_cart', 'purchase', 'search', 'login']
def generate_transaction(self):
"""生成电商交易数据"""
product = random.choice(self.products)
quantity = random.randint(1, 3)
price_variation = random.uniform(0.9, 1.1)
return {
'transaction_id': str(uuid.uuid4()),
'user_id': f'USER_{random.randint(1000, 9999)}',
'product_id': product['id'],
'product_name': product['name'],
'category': product['category'],
'price': round(product['price'] * price_variation, 2),
'quantity': quantity,
'total_amount': round(product['price'] * price_variation * quantity, 2),
'timestamp': datetime.now().isoformat(),
'payment_method': random.choice(self.payment_methods),
'shipping_address': self.fake.address()
}
def generate_user_activity(self):
"""生成用户活动数据"""
return {
'user_id': f'USER_{random.randint(1000, 9999)}',
'session_id': str(uuid.uuid4()),
'event_type': random.choice(self.event_types),
'page_url': self.fake.url(),
'timestamp': datetime.now().isoformat(),
'device_type': random.choice(self.device_types),
'location': self.fake.city(),
'user_agent': self.fake.user_agent(),
'session_duration': random.randint(10, 300)
}
def generate_system_metrics(self):
"""生成系统指标数据"""
return {
'server_id': f'SERVER_{random.randint(1, 5)}',
'timestamp': datetime.now().isoformat(),
'cpu_usage': round(random.uniform(10, 90), 2),
'memory_usage': round(random.uniform(30, 80), 2),
'disk_usage': round(random.uniform(20, 70), 2),
'network_in': random.randint(100, 1000),
'network_out': random.randint(100, 1000),
'active_connections': random.randint(50, 500)
}
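A quick way to sanity-check the generator before wiring it to Kafka is to print one sample record of each type. A minimal sketch, assuming the faker package from the install step and the package layout above:

# Quick standalone check of the data generator: print one record of each type.
import json
from utils.data_generator import DataGenerator

if __name__ == "__main__":
    gen = DataGenerator()
    print(json.dumps(gen.generate_transaction(), indent=2))
    print(json.dumps(gen.generate_user_activity(), indent=2))
    print(json.dumps(gen.generate_system_metrics(), indent=2))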
3. Kafka producer (producer/kafka_producer.py)
#!/usr/bin/env python3
import json
import time
import random
from kafka import KafkaProducer
from kafka.errors import KafkaError
from concurrent.futures import ThreadPoolExecutor
from config.kafka_config import KAFKA_CONFIG
from utils.data_generator import DataGenerator
class KafkaKRaftProducer:
def __init__(self):
self.config = KAFKA_CONFIG
self.producer = KafkaProducer(
bootstrap_servers=self.config['bootstrap_servers'],
value_serializer=lambda v: json.dumps(v).encode('utf-8'),
acks='all',
retries=3,
max_in_flight_requests_per_connection=1,
compression_type='gzip'
)
self.generator = DataGenerator()
def create_topics(self):
"""创建Kafka主题 (KRaft模式)"""
import subprocess
topics = [
f"{self.config['topics']['transactions']} --partitions 3 --replication-factor 1",
f"{self.config['topics']['user_activity']} --partitions 2 --replication-factor 1",
f"{self.config['topics']['metrics']} --partitions 1 --replication-factor 1"
]
for topic_cmd in topics:
try:
cmd = f"kafka-topics --bootstrap-server {self.config['bootstrap_servers']} --create --topic {topic_cmd}"
print(f"创建主题: {cmd}")
subprocess.run(cmd, shell=True, check=True)
except subprocess.CalledProcessError as e:
print(f"主题可能已存在: {e}")
def send_transaction_data(self, num_messages=100, delay=0.5):
"""发送交易数据"""
topic = self.config['topics']['transactions']
print(f"开始向主题 '{topic}' 发送 {num_messages} 条交易数据...")
for i in range(num_messages):
try:
data = self.generator.generate_transaction()
future = self.producer.send(topic, value=data)
# 异步确认发送成功
future.add_callback(self._on_send_success, data)
future.add_errback(self._on_send_error, data)
if (i + 1) % 10 == 0:
print(f"已发送 {i + 1} 条交易数据")
time.sleep(delay)
except KafkaError as e:
print(f"发送失败: {e}")
time.sleep(1) # 短暂延迟后重试
self.producer.flush()
print(f"交易数据发送完成,共 {num_messages} 条")
def send_user_activity_data(self, num_messages=50, delay=0.3):
"""发送用户活动数据"""
topic = self.config['topics']['user_activity']
print(f"开始向主题 '{topic}' 发送 {num_messages} 条用户活动数据...")
for i in range(num_messages):
data = self.generator.generate_user_activity()
self.producer.send(topic, value=data)
if (i + 1) % 10 == 0:
print(f"已发送 {i + 1} 条用户活动数据")
time.sleep(delay)
self.producer.flush()
def send_system_metrics(self, num_messages=30, delay=2):
"""发送系统指标数据"""
topic = self.config['topics']['metrics']
print(f"开始向主题 '{topic}' 发送 {num_messages} 条系统指标数据...")
for i in range(num_messages):
data = self.generator.generate_system_metrics()
self.producer.send(topic, value=data)
if (i + 1) % 5 == 0:
print(f"已发送 {i + 1} 条系统指标数据")
time.sleep(delay)
self.producer.flush()
def send_all_data_concurrently(self):
"""并发发送所有类型的数据"""
with ThreadPoolExecutor(max_workers=3) as executor:
executor.submit(self.send_transaction_data, 100, 0.3)
executor.submit(self.send_user_activity_data, 60, 0.5)
executor.submit(self.send_system_metrics, 40, 1.0)
    def _on_send_success(self, data, record_metadata):
"""发送成功回调"""
print(f"✓ 数据发送成功: topic={record_metadata.topic}, "
f"partition={record_metadata.partition}, "
f"offset={record_metadata.offset}")
    def _on_send_error(self, data, exception):
"""发送失败回调"""
print(f"✗ 数据发送失败: {exception}")
def close(self):
"""关闭生产者"""
self.producer.close()
print("Kafka生产者已关闭")
if __name__ == "__main__":
producer = KafkaKRaftProducer()
try:
# 创建主题
producer.create_topics()
print("\n1. 单线程发送数据")
producer.send_transaction_data(num_messages=20, delay=0.2)
print("\n2. 并发发送所有类型数据")
producer.send_all_data_concurrently()
except KeyboardInterrupt:
print("\n生产者被用户中断")
except Exception as e:
print(f"发生错误: {e}")
finally:
producer.close()
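Before involving Spark, you can verify that the producer's messages actually landed by reading a few of them back with a plain kafka-python consumer. A minimal sketch (the topic name comes from the config above; peek_topic is a name introduced here for illustration):

# Minimal verification consumer: print a handful of messages from the
# transactions topic, then stop after 5 seconds without new data.
import json
from kafka import KafkaConsumer
from config.kafka_config import KAFKA_CONFIG

def peek_topic(topic, max_messages=5):
    consumer = KafkaConsumer(
        topic,
        bootstrap_servers=KAFKA_CONFIG['bootstrap_servers'],
        auto_offset_reset='earliest',
        value_deserializer=lambda v: json.loads(v.decode('utf-8')),
        consumer_timeout_ms=5000,  # give up after 5s of silence
    )
    for i, message in enumerate(consumer, start=1):
        print(f"partition={message.partition} offset={message.offset} "
              f"value={message.value}")
        if i >= max_messages:
            break
    consumer.close()

if __name__ == "__main__":
    peek_topic(KAFKA_CONFIG['topics']['transactions'])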
4. Spark streaming consumer (spark/streaming_consumer.py)
#!/usr/bin/env python3
import findspark
findspark.init()  # 必须在导入pyspark之前调用

from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *
from config.kafka_config import SPARK_CONFIG, KAFKA_CONFIG, SCHEMAS
class SparkStreamingConsumer:
def __init__(self):
self.spark_config = SPARK_CONFIG
self.kafka_config = KAFKA_CONFIG
# 创建SparkSession
self.spark = SparkSession.builder \
.appName(self.spark_config['app_name']) \
.master(self.spark_config['master']) \
.config("spark.sql.streaming.checkpointLocation",
self.spark_config['spark.sql.streaming.checkpointLocation']) \
.config("spark.jars.packages", self.spark_config['spark.jars.packages']) \
.config("spark.executor.memory", self.spark_config['spark.executor.memory']) \
.config("spark.driver.memory", self.spark_config['spark.driver.memory']) \
.config("spark.sql.shuffle.partitions", "2") \
.config("spark.streaming.backpressure.enabled", "true") \
.config("spark.streaming.kafka.maxRatePerPartition", "100") \
.getOrCreate()
self.spark.sparkContext.setLogLevel("WARN")
# 定义Schema
self.transaction_schema = StructType([
StructField("transaction_id", StringType()),
StructField("user_id", StringType()),
StructField("product_id", StringType()),
StructField("product_name", StringType()),
StructField("category", StringType()),
StructField("price", DoubleType()),
StructField("quantity", IntegerType()),
StructField("total_amount", DoubleType()),
StructField("timestamp", TimestampType()),
StructField("payment_method", StringType()),
StructField("shipping_address", StringType())
])
self.user_activity_schema = StructType([
StructField("user_id", StringType()),
StructField("session_id", StringType()),
StructField("event_type", StringType()),
StructField("page_url", StringType()),
StructField("timestamp", TimestampType()),
StructField("device_type", StringType()),
StructField("location", StringType()),
StructField("user_agent", StringType()),
StructField("session_duration", IntegerType())
])
def read_from_kafka(self, topic):
"""从Kafka读取数据流"""
return self.spark \
.readStream \
.format("kafka") \
.option("kafka.bootstrap.servers", self.kafka_config['bootstrap_servers']) \
.option("subscribe", topic) \
.option("startingOffsets", "latest") \
.option("maxOffsetsPerTrigger", "100") \
.option("failOnDataLoss", "false") \
.load()
def process_transactions(self):
"""处理交易数据流"""
print("开始处理交易数据流...")
# 读取Kafka数据
kafka_df = self.read_from_kafka(self.kafka_config['topics']['transactions'])
# 解析JSON数据
parsed_df = kafka_df.select(
from_json(col("value").cast("string"), self.transaction_schema).alias("data")
).select("data.*")
# 添加水印
df_with_watermark = parsed_df \
.withWatermark("timestamp", "5 minutes")
# 实时分析1: 按类别统计销售额
category_stats = df_with_watermark \
.groupBy("category") \
.agg(
sum("total_amount").alias("total_sales"),
avg("price").alias("avg_price"),
count("*").alias("transaction_count"),
approx_count_distinct("user_id").alias("unique_users")
) \
.orderBy(desc("total_sales"))
# 实时分析2: 滑动窗口分析(每10分钟窗口,滑动间隔5分钟)
window_stats = df_with_watermark \
.groupBy(
window(col("timestamp"), "10 minutes", "5 minutes"),
"category"
) \
.agg(
sum("total_amount").alias("window_sales"),
count("*").alias("window_transactions"),
approx_count_distinct("user_id").alias("window_unique_users")
) \
.orderBy(desc("window_sales"))
# 实时分析3: 热门产品排名
product_stats = df_with_watermark \
.groupBy("product_name", "category") \
.agg(
sum("quantity").alias("total_quantity"),
sum("total_amount").alias("product_sales"),
count("*").alias("purchase_count")
) \
.orderBy(desc("product_sales"))
return {
'category_stats': category_stats,
'window_stats': window_stats,
'product_stats': product_stats,
'raw_data': parsed_df
}
def process_user_activity(self):
"""处理用户活动数据流"""
print("开始处理用户活动数据流...")
kafka_df = self.read_from_kafka(self.kafka_config['topics']['user_activity'])
parsed_df = kafka_df.select(
from_json(col("value").cast("string"), self.user_activity_schema).alias("data")
).select("data.*")
df_with_watermark = parsed_df \
.withWatermark("timestamp", "3 minutes")
# 用户活动分析
activity_stats = df_with_watermark \
.groupBy("event_type", "device_type") \
.agg(
count("*").alias("event_count"),
approx_count_distinct("user_id").alias("unique_users"),
avg("session_duration").alias("avg_session_duration")
) \
.orderBy(desc("event_count"))
# 实时用户会话分析
session_analysis = df_with_watermark \
.groupBy("user_id", "session_id") \
.agg(
count("*").alias("events_per_session"),
min("timestamp").alias("session_start"),
max("timestamp").alias("session_end"),
collect_list("event_type").alias("event_sequence")
)
return {
'activity_stats': activity_stats,
'session_analysis': session_analysis,
'raw_activity': parsed_df
}
def start_streaming(self, processing_time="10 seconds"):
"""启动所有流处理"""
print("启动Spark流处理引擎...")
# 处理交易数据
transaction_results = self.process_transactions()
# 输出交易统计到控制台
query1 = transaction_results['category_stats'] \
.writeStream \
.outputMode("complete") \
.format("console") \
.option("truncate", "false") \
.trigger(processingTime=processing_time) \
.start()
query2 = transaction_results['product_stats'] \
.writeStream \
.outputMode("complete") \
.format("console") \
.option("truncate", "false") \
.trigger(processingTime="30 seconds") \
.start()
# 处理用户活动数据
activity_results = self.process_user_activity()
query3 = activity_results['activity_stats'] \
.writeStream \
.outputMode("complete") \
.format("console") \
.option("truncate", "false") \
.trigger(processingTime="15 seconds") \
.start()
# 将原始数据写入内存表,供后续查询
memory_query1 = transaction_results['raw_data'] \
.writeStream \
.queryName("transactions_stream") \
.outputMode("append") \
.format("memory") \
.start()
memory_query2 = activity_results['raw_activity'] \
.writeStream \
.queryName("activity_stream") \
.outputMode("append") \
.format("memory") \
.start()
print("所有流处理查询已启动")
print("按 Ctrl+C 停止处理")
# 等待查询终止
try:
query1.awaitTermination()
query2.awaitTermination()
query3.awaitTermination()
memory_query1.awaitTermination()
memory_query2.awaitTermination()
except KeyboardInterrupt:
print("正在停止流处理...")
query1.stop()
query2.stop()
query3.stop()
memory_query1.stop()
memory_query2.stop()
def run_interactive_queries(self):
"""运行交互式查询"""
print("\n=== 交互式查询 ===")
# 等待一些数据积累
import time
time.sleep(10)
# 查询内存表中的数据
try:
# 查询最新交易
recent_transactions = self.spark.sql("""
SELECT
user_id,
product_name,
total_amount,
payment_method,
timestamp
FROM transactions_stream
ORDER BY timestamp DESC
LIMIT 10
""")
print("最近的10笔交易:")
recent_transactions.show(truncate=False)
# 查询用户活动统计
user_activity_summary = self.spark.sql("""
SELECT
event_type,
COUNT(*) as count,
COUNT(DISTINCT user_id) as unique_users
FROM activity_stream
GROUP BY event_type
ORDER BY count DESC
""")
print("\n用户活动统计:")
user_activity_summary.show(truncate=False)
except Exception as e:
print(f"查询失败: {e}")
def stop(self):
"""停止SparkSession"""
self.spark.stop()
print("SparkSession已停止")
if __name__ == "__main__":
consumer = SparkStreamingConsumer()
try:
# 在新线程中启动流处理
import threading
def start_stream():
consumer.start_streaming()
stream_thread = threading.Thread(target=start_stream, daemon=True)
stream_thread.start()
# 运行交互式查询
consumer.run_interactive_queries()
# 保持主线程运行
stream_thread.join()
except KeyboardInterrupt:
print("\n程序被用户中断")
except Exception as e:
print(f"发生错误: {e}")
finally:
consumer.stop()
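The console and memory sinks above are convenient for a demo; for durable output you would typically swap in a file sink or foreachBatch. A hedged sketch of such an alternative (the function name and output path are placeholders introduced here, not part of the original project), assuming a parsed streaming DataFrame such as transaction_results['raw_data']:

# Hypothetical alternative sink: persist each micro-batch of the parsed
# transaction stream to Parquet via foreachBatch. Paths are placeholders.
from pyspark.sql.functions import lit

def write_to_parquet(batch_df, batch_id):
    # batch_df is an ordinary (non-streaming) DataFrame for this micro-batch
    (batch_df
        .withColumn("batch_id", lit(batch_id))
        .write
        .mode("append")
        .parquet("/tmp/kafka-transactions-parquet"))

# Inside SparkStreamingConsumer.start_streaming() this could replace the
# memory sink, for example:
# parquet_query = transaction_results['raw_data'] \
#     .writeStream \
#     .foreachBatch(write_to_parquet) \
#     .option("checkpointLocation", "/tmp/spark-checkpoints/parquet-sink") \
#     .start()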
5. Spark batch consumer (spark/batch_consumer.py)
#!/usr/bin/env python3
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *
from config.kafka_config import SPARK_CONFIG, KAFKA_CONFIG
class SparkBatchConsumer:
def __init__(self):
self.spark_config = SPARK_CONFIG
self.kafka_config = KAFKA_CONFIG
self.spark = SparkSession.builder \
.appName("KafkaBatchProcessor") \
.master(self.spark_config['master']) \
.config("spark.jars.packages", self.spark_config['spark.jars.packages']) \
.config("spark.executor.memory", self.spark_config['spark.executor.memory']) \
.config("spark.driver.memory", self.spark_config['spark.driver.memory']) \
.getOrCreate()
self.spark.sparkContext.setLogLevel("WARN")
def read_batch_from_kafka(self, topic, starting_offset="earliest", ending_offset="latest"):
"""从Kafka批量读取数据"""
print(f"从Kafka主题 '{topic}' 读取批量数据...")
return self.spark \
.read \
.format("kafka") \
.option("kafka.bootstrap.servers", self.kafka_config['bootstrap_servers']) \
.option("subscribe", topic) \
.option("startingOffsets", starting_offset) \
.option("endingOffsets", ending_offset) \
.load()
def process_transaction_batch(self):
"""批量处理交易数据"""
print("=== 批量处理交易数据 ===")
# 读取数据
kafka_df = self.read_batch_from_kafka(
self.kafka_config['topics']['transactions']
)
# 解析数据
parsed_df = kafka_df.select(
from_json(col("value").cast("string"),
StructType([
StructField("transaction_id", StringType()),
StructField("user_id", StringType()),
StructField("product_name", StringType()),
StructField("category", StringType()),
StructField("price", DoubleType()),
StructField("quantity", IntegerType()),
StructField("total_amount", DoubleType()),
StructField("timestamp", StringType()),
StructField("payment_method", StringType())
])).alias("data")
).select("data.*")
# 创建临时视图
parsed_df.createOrReplaceTempView("transactions")
# 执行复杂分析
results = self.spark.sql("""
WITH category_summary AS (
SELECT
category,
COUNT(*) as transaction_count,
SUM(total_amount) as total_sales,
AVG(price) as avg_price,
COUNT(DISTINCT user_id) as unique_customers,
PERCENTILE_APPROX(total_amount, 0.5) as median_transaction
FROM transactions
GROUP BY category
),
hourly_trends AS (
SELECT
        DATE_FORMAT(CAST(timestamp AS TIMESTAMP), 'yyyy-MM-dd HH:00') as hour,
category,
COUNT(*) as transactions_per_hour,
SUM(total_amount) as sales_per_hour
FROM transactions
        GROUP BY DATE_FORMAT(CAST(timestamp AS TIMESTAMP), 'yyyy-MM-dd HH:00'), category
),
customer_segments AS (
SELECT
user_id,
COUNT(*) as total_transactions,
SUM(total_amount) as lifetime_value,
AVG(total_amount) as avg_transaction_value,
MAX(timestamp) as last_purchase
FROM transactions
GROUP BY user_id
)
SELECT
'CATEGORY_SUMMARY' as report_type,
category,
transaction_count,
ROUND(total_sales, 2) as total_sales,
ROUND(avg_price, 2) as avg_price,
unique_customers
FROM category_summary
UNION ALL
SELECT
'HOURLY_TRENDS' as report_type,
CONCAT(hour, ' - ', category) as category,
transactions_per_hour as transaction_count,
ROUND(sales_per_hour, 2) as total_sales,
0 as avg_price,
0 as unique_customers
FROM hourly_trends
ORDER BY report_type, total_sales DESC
""")
print("批量分析结果:")
results.show(50, truncate=False)
# 保存结果到文件
results.write \
.mode("overwrite") \
.option("header", "true") \
.csv("/tmp/kafka_batch_results")
print("结果已保存到 /tmp/kafka_batch_results")
return results
def process_user_activity_batch(self):
"""批量处理用户活动数据"""
print("\n=== 批量处理用户活动数据 ===")
kafka_df = self.read_batch_from_kafka(
self.kafka_config['topics']['user_activity']
)
parsed_df = kafka_df.select(
from_json(col("value").cast("string"),
StructType([
StructField("user_id", StringType()),
StructField("event_type", StringType()),
StructField("device_type", StringType()),
StructField("timestamp", StringType()),
StructField("session_duration", IntegerType())
])).alias("data")
).select("data.*")
parsed_df.createOrReplaceTempView("user_activity")
analysis = self.spark.sql("""
SELECT
event_type,
device_type,
COUNT(*) as event_count,
COUNT(DISTINCT user_id) as unique_users,
AVG(session_duration) as avg_session_duration,
MIN(timestamp) as first_event,
MAX(timestamp) as last_event
FROM user_activity
GROUP BY event_type, device_type
ORDER BY event_count DESC
""")
print("用户活动分析:")
analysis.show(truncate=False)
return analysis
def run_advanced_analytics(self):
"""运行高级分析"""
print("\n=== 运行高级分析 ===")
# 读取所有数据
transactions_df = self.read_batch_from_kafka(
self.kafka_config['topics']['transactions']
)
activity_df = self.read_batch_from_kafka(
self.kafka_config['topics']['user_activity']
)
# 解析数据
transactions_parsed = transactions_df.select(
from_json(col("value").cast("string"),
StructType([
StructField("user_id", StringType()),
StructField("product_name", StringType()),
StructField("total_amount", DoubleType()),
StructField("timestamp", StringType())
])).alias("data")
).select("data.*")
activity_parsed = activity_df.select(
from_json(col("value").cast("string"),
StructType([
StructField("user_id", StringType()),
StructField("event_type", StringType()),
StructField("timestamp", StringType())
])).alias("data")
).select("data.*")
# 创建视图
transactions_parsed.createOrReplaceTempView("transactions")
activity_parsed.createOrReplaceTempView("activity")
# 用户行为与购买关联分析
correlation_analysis = self.spark.sql("""
WITH user_events AS (
SELECT
user_id,
SUM(CASE WHEN event_type = 'add_to_cart' THEN 1 ELSE 0 END) as cart_adds,
SUM(CASE WHEN event_type = 'purchase' THEN 1 ELSE 0 END) as purchases,
SUM(CASE WHEN event_type = 'page_view' THEN 1 ELSE 0 END) as page_views
FROM activity
GROUP BY user_id
),
user_transactions AS (
SELECT
user_id,
COUNT(*) as transaction_count,
SUM(total_amount) as total_spent,
AVG(total_amount) as avg_transaction_value
FROM transactions
GROUP BY user_id
)
SELECT
COALESCE(ue.user_id, ut.user_id) as user_id,
ue.cart_adds,
ue.purchases,
ue.page_views,
ut.transaction_count,
ROUND(ut.total_spent, 2) as total_spent,
ROUND(ut.avg_transaction_value, 2) as avg_transaction_value,
ROUND(ut.total_spent / NULLIF(ue.page_views, 0), 2) as revenue_per_view
FROM user_events ue
FULL OUTER JOIN user_transactions ut ON ue.user_id = ut.user_id
ORDER BY total_spent DESC
""")
print("用户行为与购买关联分析:")
correlation_analysis.show(20, truncate=False)
# 保存结果
correlation_analysis.write \
.mode("overwrite") \
.option("header", "true") \
.parquet("/tmp/user_behavior_analysis")
print("高级分析结果已保存到 /tmp/user_behavior_analysis")
def stop(self):
"""停止Spark"""
self.spark.stop()
if __name__ == "__main__":
processor = SparkBatchConsumer()
try:
# 批量处理交易数据
processor.process_transaction_batch()
# 批量处理用户活动数据
processor.process_user_activity_batch()
# 运行高级分析
processor.run_advanced_analytics()
except Exception as e:
print(f"批处理错误: {e}")
finally:
processor.stop()
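By default the batch reader above scans each topic from earliest to latest. The Kafka source also accepts per-partition offsets as JSON when you only want a slice; a hedged sketch (the offset numbers are illustrative only, and -2/-1 denote earliest/latest):

# Hypothetical: batch-read only a specific offset range per partition.
# Offset values below are made up for illustration.
import json
from pyspark.sql import SparkSession

spark = (SparkSession.builder
         .appName("OffsetRangeRead")
         .master("local[*]")
         .config("spark.jars.packages",
                 "org.apache.spark:spark-sql-kafka-0-10_2.12:3.5.0")
         .getOrCreate())

topic = "ecommerce-transactions"
starting = json.dumps({topic: {"0": 0, "1": 0, "2": 0}})
ending = json.dumps({topic: {"0": 50, "1": 50, "2": 50}})

df = (spark.read
      .format("kafka")
      .option("kafka.bootstrap.servers", "localhost:9092")
      .option("subscribe", topic)
      .option("startingOffsets", starting)
      .option("endingOffsets", ending)
      .load())

print(df.count())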
6. Integration runner (run_integration.py)
#!/usr/bin/env python3
import subprocess
import sys
import time
import threading
import os
from pathlib import Path
class KafkaSparkIntegration:
def __init__(self):
self.base_dir = Path(__file__).parent
self.kafka_process = None
    def check_java(self):
        """检查Java安装"""
        try:
            result = subprocess.run(['java', '-version'],
                                    capture_output=True, text=True)
            if result.returncode == 0:
                print("✓ Java已安装")
                return True
        except FileNotFoundError:
            pass
        print("✗ Java未安装,请先安装Java 17")
        return False
def start_kafka_kraft(self):
"""启动Kafka KRaft模式"""
print("启动Kafka (KRaft模式)...")
# 生成集群ID
cluster_id_cmd = "kafka-storage random-uuid"
result = subprocess.run(cluster_id_cmd, shell=True,
capture_output=True, text=True)
cluster_id = result.stdout.strip()
if not cluster_id:
print("无法生成集群ID")
return False
print(f"集群ID: {cluster_id}")
# 创建配置文件
config_content = f"""
process.roles=broker,controller
node.id=1
controller.quorum.voters=1@localhost:9093
listeners=PLAINTEXT://localhost:9092,CONTROLLER://localhost:9093
advertised.listeners=PLAINTEXT://localhost:9092
listener.security.protocol.map=CONTROLLER:PLAINTEXT,PLAINTEXT:PLAINTEXT
inter.broker.listener.name=PLAINTEXT
log.dirs=/tmp/kraft-combined-logs
num.partitions=3
default.replication.factor=1
min.insync.replicas=1
controller.listener.names=CONTROLLER
"""
config_file = self.base_dir / "kraft-server.properties"
with open(config_file, 'w') as f:
f.write(config_content)
# 格式化存储
        format_cmd = f"kafka-storage format -t {cluster_id} -c {config_file} --ignore-formatted"
subprocess.run(format_cmd, shell=True, check=True)
# 启动Kafka
self.kafka_process = subprocess.Popen(
["kafka-server-start", str(config_file)],
stdout=subprocess.PIPE,
stderr=subprocess.PIPE,
text=True
)
# 等待Kafka启动
time.sleep(10)
# 检查Kafka是否运行
try:
subprocess.run([
"kafka-topics",
"--bootstrap-server", "localhost:9092",
"--list"
], timeout=10, capture_output=True)
print("✓ Kafka启动成功")
return True
except:
print("✗ Kafka启动失败")
return False
def create_topics(self):
"""创建Kafka主题"""
topics = [
("ecommerce-transactions", 3),
("user-activity-logs", 2),
("system-metrics", 1)
]
for topic, partitions in topics:
try:
cmd = [
"kafka-topics",
"--bootstrap-server", "localhost:9092",
"--create",
"--topic", topic,
"--partitions", str(partitions),
"--replication-factor", "1",
"--if-not-exists"
]
subprocess.run(cmd, check=True)
print(f"✓ 主题 '{topic}' 创建成功")
except subprocess.CalledProcessError:
print(f"主题 '{topic}' 可能已存在")
except Exception as e:
print(f"创建主题错误: {e}")
def run_producer(self):
"""运行生产者"""
print("\n运行Kafka生产者...")
producer_script = self.base_dir / "producer" / "kafka_producer.py"
subprocess.run([sys.executable, str(producer_script)])
def run_streaming_consumer(self):
"""运行流式消费者"""
print("\n运行Spark流式消费者...")
consumer_script = self.base_dir / "spark" / "streaming_consumer.py"
subprocess.run([sys.executable, str(consumer_script)])
def run_batch_consumer(self):
"""运行批处理消费者"""
print("\n运行Spark批处理消费者...")
consumer_script = self.base_dir / "spark" / "batch_consumer.py"
subprocess.run([sys.executable, str(consumer_script)])
def monitor_kafka(self):
"""监控Kafka状态"""
print("\n=== Kafka状态监控 ===")
try:
# 列出所有主题
print("\n1. 所有主题:")
subprocess.run([
"kafka-topics",
"--bootstrap-server", "localhost:9092",
"--list"
])
# 查看主题详情
print("\n2. 主题详情:")
topics = ["ecommerce-transactions", "user-activity-logs"]
for topic in topics:
subprocess.run([
"kafka-topics",
"--bootstrap-server", "localhost:9092",
"--describe",
"--topic", topic
])
# 查看消息
print("\n3. 查看最新消息:")
subprocess.run([
"kafka-console-consumer",
"--bootstrap-server", "localhost:9092",
"--topic", "ecommerce-transactions",
"--from-beginning",
"--max-messages", "5"
])
except Exception as e:
print(f"监控错误: {e}")
def cleanup(self):
"""清理资源"""
print("\n清理资源...")
if self.kafka_process:
self.kafka_process.terminate()
self.kafka_process.wait()
print("Kafka已停止")
# 清理临时文件
import shutil
temp_dirs = ["/tmp/kraft-combined-logs", "/tmp/spark-checkpoints"]
for dir_path in temp_dirs:
if os.path.exists(dir_path):
try:
shutil.rmtree(dir_path)
print(f"清理目录: {dir_path}")
except:
pass
def run_demo(self):
"""运行完整演示"""
print("=== Kafka 4.0.1 + Spark 集成演示 (KRaft模式) ===")
try:
# 检查Java
if not self.check_java():
return
# 启动Kafka
if not self.start_kafka_kraft():
return
# 创建主题
self.create_topics()
# 监控状态
self.monitor_kafka()
# 用户选择运行模式
print("\n请选择运行模式:")
print("1. 仅运行生产者")
print("2. 仅运行Spark流式消费者")
print("3. 仅运行Spark批处理消费者")
print("4. 完整演示 (先生产后消费)")
print("5. 并发演示 (同时运行生产和消费)")
choice = input("\n请输入选择 (1-5): ").strip()
if choice == '1':
self.run_producer()
elif choice == '2':
self.run_streaming_consumer()
elif choice == '3':
self.run_batch_consumer()
elif choice == '4':
print("\n=== 步骤1: 运行生产者 ===")
self.run_producer()
time.sleep(2)
print("\n=== 步骤2: 运行批处理消费者 ===")
self.run_batch_consumer()
elif choice == '5':
print("\n并发运行生产和消费...")
print("请打开两个终端分别运行:")
print("终端1: python producer/kafka_producer.py")
print("终端2: python spark/streaming_consumer.py")
else:
print("无效选择")
except KeyboardInterrupt:
print("\n演示被用户中断")
except Exception as e:
print(f"演示错误: {e}")
finally:
self.cleanup()
def quick_test(self):
"""快速测试"""
print("=== 快速测试 ===")
# 启动Kafka
self.start_kafka_kraft()
time.sleep(5)
# 创建主题
self.create_topics()
# 简单生产消费测试
print("\n简单生产消费测试...")
# 生产一些测试消息
test_producer = subprocess.Popen([
"kafka-console-producer",
"--bootstrap-server", "localhost:9092",
"--topic", "test-topic"
], stdin=subprocess.PIPE, text=True)
messages = ["message1", "message2", "message3"]
for msg in messages:
test_producer.stdin.write(msg + "\n")
test_producer.stdin.flush()
        test_producer.stdin.close()
        test_producer.wait(timeout=10)
# 消费消息
print("\n消费消息:")
subprocess.run([
"kafka-console-consumer",
"--bootstrap-server", "localhost:9092",
"--topic", "test-topic",
"--from-beginning",
"--timeout-ms", "3000"
])
self.cleanup()
if __name__ == "__main__":
integration = KafkaSparkIntegration()
if len(sys.argv) > 1 and sys.argv[1] == "--test":
integration.quick_test()
else:
integration.run_demo()
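If you prefer not to shell out to the Kafka CLI, topic creation can also be done programmatically with kafka-python's admin client. A hedged alternative to the subprocess-based create_topics() above (this helper name is introduced here for illustration):

# Hypothetical programmatic alternative to the subprocess-based create_topics().
from kafka.admin import KafkaAdminClient, NewTopic

def create_topics_with_admin_client(bootstrap_servers="localhost:9092"):
    admin = KafkaAdminClient(bootstrap_servers=bootstrap_servers)
    wanted = [
        NewTopic(name="ecommerce-transactions", num_partitions=3, replication_factor=1),
        NewTopic(name="user-activity-logs", num_partitions=2, replication_factor=1),
        NewTopic(name="system-metrics", num_partitions=1, replication_factor=1),
    ]
    existing = set(admin.list_topics())
    for topic in wanted:
        if topic.name in existing:
            print(f"Topic {topic.name} already exists")
        else:
            admin.create_topics([topic])
            print(f"Created topic {topic.name}")
    admin.close()

if __name__ == "__main__":
    create_topics_with_admin_client()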
Part 3: Running the Project
1. Full run steps
# Run all commands from the kafka-spark-kraft/ project root so that the
# config, utils, producer and spark packages are importable
# 1. Install dependencies
pip install kafka-python pyspark==3.5.0 findspark faker
# 2. Make sure Kafka is installed
brew services stop kafka # stop any previously running instance
brew install kafka
# 3. Run the integration demo
python run_integration.py
# 4. Or run the components individually
# Start Kafka (KRaft mode) and run a quick smoke test
python run_integration.py --test
# Run the producer
python producer/kafka_producer.py
# Run the Spark streaming consumer
python spark/streaming_consumer.py
# Run the Spark batch consumer
python spark/batch_consumer.py
2. Verify KRaft mode
# Check that Kafka is running in KRaft mode
ps aux | grep kafka | grep -v grep
# Inspect the KRaft data directory (the broker's own log output goes to the
# terminal where kafka-server-start was launched)
ls -l /tmp/kraft-combined-logs
# Verify the Kafka service
kafka-topics --bootstrap-server localhost:9092 --list
3. Troubleshooting
Issue 1: Kafka fails to start
# Clean up old data
rm -rf /tmp/kraft-combined-logs
rm -rf /tmp/kafka-logs
# Restart
python run_integration.py --test
Issue 2: Spark cannot connect to Kafka
# Check whether Kafka is listening
nc -z localhost 9092
# Check Spark version compatibility
pip install pyspark==3.5.0
Issue 3: Out of memory
# Increase Spark memory
export SPARK_DRIVER_MEMORY=2g
export SPARK_EXECUTOR_MEMORY=2g
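These environment variables should be picked up when the jobs are launched through spark-submit. When running the scripts as plain python processes (as in this guide), another option is to pass driver memory through PYSPARK_SUBMIT_ARGS before PySpark launches its JVM; a hedged sketch (values are illustrative):

# Hypothetical alternative: set driver memory before the SparkSession is created.
import os

# Must be set before pyspark starts its JVM, and must end with "pyspark-shell".
os.environ["PYSPARK_SUBMIT_ARGS"] = "--driver-memory 2g pyspark-shell"

from pyspark.sql import SparkSession

spark = (SparkSession.builder
         .appName("MemoryTunedApp")
         .master("local[*]")
         .config("spark.sql.shuffle.partitions", "2")  # fewer shuffle partitions on a laptop
         .getOrCreate())
print(spark.version)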
This solution covers:
- Kafka 4.0.1 running in KRaft mode (no ZooKeeper required)
- Multi-topic data production (transactions, user activity, system metrics)
- Real-time processing with Spark Structured Streaming
- Spark batch analytics
- Advanced analytics examples
- Error handling and resource cleanup throughout
The code is organized into modules and can be run directly on a Mac.