[Big Data Frameworks] Kafka + Spark: Environment Setup and Hands-on Integration

Kafka 4.0.1 + Spark integration (using KRaft mode, no ZooKeeper required)

Part 1: Detailed Environment Setup

1. Install Java (required)

# Install Java 17 (Kafka 4.0 brokers require Java 17; Spark 3.5 runs on it as well)
brew install openjdk@17

# Set environment variables
echo 'export PATH="/opt/homebrew/opt/openjdk@17/bin:$PATH"' >> ~/.zshrc
echo 'export JAVA_HOME="/opt/homebrew/opt/openjdk@17"' >> ~/.zshrc
source ~/.zshrc

# Verify the installation
java -version

2. Install Kafka 4.0.1 and configure KRaft

# Install Kafka
brew install kafka

# Check the version (should be 4.0.1 or newer)
kafka-topics --version

# Create a directory for the configuration
mkdir -p ~/kafka-kraft-config
cd ~/kafka-kraft-config

# Generate a cluster ID (important)
KAFKA_CLUSTER_ID=$(kafka-storage random-uuid)
echo "Cluster ID: $KAFKA_CLUSTER_ID"

# Create the KRaft configuration file
# (the cluster ID is not a server.properties key; it is applied by kafka-storage format below)
cat > server.properties << EOF
# Kafka server configuration
process.roles=broker,controller
node.id=1
controller.quorum.voters=1@localhost:9093

# Listener configuration
listeners=PLAINTEXT://localhost:9092,CONTROLLER://localhost:9093
advertised.listeners=PLAINTEXT://localhost:9092
listener.security.protocol.map=CONTROLLER:PLAINTEXT,PLAINTEXT:PLAINTEXT
inter.broker.listener.name=PLAINTEXT
controller.listener.names=CONTROLLER

# Storage configuration
log.dirs=/tmp/kraft-combined-logs
num.partitions=3
default.replication.factor=1
min.insync.replicas=1
EOF

# Format the storage directory with the generated cluster ID
kafka-storage format -t $KAFKA_CLUSTER_ID -c server.properties --ignore-formatted

# Start Kafka (KRaft mode, runs in the foreground)
kafka-server-start server.properties

3. Install Spark and PySpark

# Install Spark
brew install apache-spark

# Install PySpark (pinned to match the Kafka connector version used below)
pip install pyspark==3.5.0

# Install the other Python packages used by this project
pip install kafka-python findspark faker

4. Verify the installation

# Verify that Kafka is running
kafka-topics --bootstrap-server localhost:9092 --list

# Verify Spark
spark-shell --version
python -c "import pyspark; print(pyspark.__version__)"
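If you prefer to verify the broker from Python rather than the CLI, a minimal kafka-python check looks like the sketch below (it assumes the broker from step 2 is listening on localhost:9092):

# Minimal connectivity check with kafka-python (assumes a local broker on port 9092).
from kafka import KafkaConsumer

consumer = KafkaConsumer(bootstrap_servers="localhost:9092")
print("Connected. Topics visible to this client:", consumer.topics())
consumer.close()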

Part 2: Complete, Working Python Code

Directory structure

kafka-spark-kraft/
├── config/
│   └── kafka_config.py
├── producer/
│   └── kafka_producer.py
├── spark/
│   ├── streaming_consumer.py
│   └── batch_consumer.py
├── utils/
│   └── data_generator.py
└── run_integration.py
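One practical note on this layout: the scripts import from sibling packages (for example, from config.kafka_config import KAFKA_CONFIG), so they resolve most reliably when the folders are Python packages and the scripts are launched as modules from the project root (python -m producer.kafka_producer), or when PYTHONPATH includes the project root. The one-off helper below is my own addition, not part of the original layout; it just creates the package markers:

# One-off helper (my addition, not part of the original project): create empty
# __init__.py files so the folders are importable as packages from the project root.
from pathlib import Path

for pkg in ("config", "producer", "spark", "utils"):
    Path(pkg).mkdir(exist_ok=True)
    (Path(pkg) / "__init__.py").touch()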

1. Configuration (config/kafka_config.py)

#!/usr/bin/env python3
# Kafka configuration (KRaft mode)
KAFKA_CONFIG = {
    'bootstrap_servers': 'localhost:9092',
    'topics': {
        'transactions': 'ecommerce-transactions',
        'user_activity': 'user-activity-logs',
        'metrics': 'system-metrics'
    },
    'consumer_group': 'spark-consumer-group',
    'schema_registry': None  # no schema registry is used in this demo
}

# Spark configuration
SPARK_CONFIG = {
    'app_name': 'KafkaSparkIntegration',
    'master': 'local[*]',
    'spark.sql.streaming.checkpointLocation': '/tmp/spark-checkpoints',
    'spark.jars.packages': 'org.apache.spark:spark-sql-kafka-0-10_2.12:3.5.0,org.apache.spark:spark-avro_2.12:3.5.0',
    'spark.executor.memory': '2g',
    'spark.driver.memory': '2g'
}

# Data schemas
SCHEMAS = {
    'transaction': {
        'fields': ['transaction_id', 'user_id', 'product_id', 'product_name', 
                  'category', 'price', 'quantity', 'timestamp', 'payment_method'],
        'types': ['string', 'string', 'string', 'string', 
                 'string', 'float', 'int', 'timestamp', 'string']
    },
    'user_activity': {
        'fields': ['user_id', 'session_id', 'event_type', 'page_url', 
                  'timestamp', 'device_type', 'location'],
        'types': ['string', 'string', 'string', 'string', 
                 'timestamp', 'string', 'string']
    }
}
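The SCHEMAS dictionary above is purely declarative; the Spark consumers later in this article hard-code equivalent StructTypes. If you would rather derive the Spark schemas from this config, a small helper along the following lines could do it (build_struct_type is my own sketch, not part of the original code):

# Sketch of a helper (not in the original project) that turns a SCHEMAS entry
# into a Spark StructType, e.g. build_struct_type(SCHEMAS['transaction']).
from pyspark.sql.types import (StructType, StructField, StringType,
                               FloatType, IntegerType, TimestampType)

_TYPE_MAP = {
    'string': StringType(),
    'float': FloatType(),
    'int': IntegerType(),
    'timestamp': TimestampType(),
}

def build_struct_type(schema_def):
    """Build a StructType from a {'fields': [...], 'types': [...]} entry."""
    return StructType([
        StructField(name, _TYPE_MAP[type_name], True)
        for name, type_name in zip(schema_def['fields'], schema_def['types'])
    ])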

2. Data generator (utils/data_generator.py)

import random
import uuid
from datetime import datetime
from faker import Faker

class DataGenerator:
    def __init__(self):
        self.fake = Faker()
        self.products = [
            {'id': 'P001', 'name': 'iPhone 15', 'category': 'Electronics', 'price': 999.99},
            {'id': 'P002', 'name': 'MacBook Pro', 'category': 'Computers', 'price': 1999.99},
            {'id': 'P003', 'name': 'AirPods Pro', 'category': 'Audio', 'price': 249.99},
            {'id': 'P004', 'name': 'iPad Air', 'category': 'Tablets', 'price': 599.99},
            {'id': 'P005', 'name': 'Apple Watch', 'category': 'Wearables', 'price': 399.99}
        ]
        
        self.payment_methods = ['Credit Card', 'PayPal', 'Apple Pay', 'Google Pay']
        self.device_types = ['Mobile', 'Desktop', 'Tablet']
        self.event_types = ['page_view', 'add_to_cart', 'purchase', 'search', 'login']
    
    def generate_transaction(self):
        """生成电商交易数据"""
        product = random.choice(self.products)
        quantity = random.randint(1, 3)
        price_variation = random.uniform(0.9, 1.1)
        
        return {
            'transaction_id': str(uuid.uuid4()),
            'user_id': f'USER_{random.randint(1000, 9999)}',
            'product_id': product['id'],
            'product_name': product['name'],
            'category': product['category'],
            'price': round(product['price'] * price_variation, 2),
            'quantity': quantity,
            'total_amount': round(product['price'] * price_variation * quantity, 2),
            'timestamp': datetime.now().isoformat(),
            'payment_method': random.choice(self.payment_methods),
            'shipping_address': self.fake.address()
        }
    
    def generate_user_activity(self):
        """生成用户活动数据"""
        return {
            'user_id': f'USER_{random.randint(1000, 9999)}',
            'session_id': str(uuid.uuid4()),
            'event_type': random.choice(self.event_types),
            'page_url': self.fake.url(),
            'timestamp': datetime.now().isoformat(),
            'device_type': random.choice(self.device_types),
            'location': self.fake.city(),
            'user_agent': self.fake.user_agent(),
            'session_duration': random.randint(10, 300)
        }
    
    def generate_system_metrics(self):
        """生成系统指标数据"""
        return {
            'server_id': f'SERVER_{random.randint(1, 5)}',
            'timestamp': datetime.now().isoformat(),
            'cpu_usage': round(random.uniform(10, 90), 2),
            'memory_usage': round(random.uniform(30, 80), 2),
            'disk_usage': round(random.uniform(20, 70), 2),
            'network_in': random.randint(100, 1000),
            'network_out': random.randint(100, 1000),
            'active_connections': random.randint(50, 500)
        }
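A quick way to eyeball the generated records is to run the generator on its own. The snippet below is my own addition (not part of utils/data_generator.py) and simply prints one sample of each record type:

# Smoke-test the DataGenerator defined above.
import json
from utils.data_generator import DataGenerator

gen = DataGenerator()
for sample in (gen.generate_transaction(),
               gen.generate_user_activity(),
               gen.generate_system_metrics()):
    print(json.dumps(sample, indent=2))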

3. Kafka producer (producer/kafka_producer.py)

#!/usr/bin/env python3
import json
import time
import random
from kafka import KafkaProducer
from kafka.errors import KafkaError
from concurrent.futures import ThreadPoolExecutor
from config.kafka_config import KAFKA_CONFIG
from utils.data_generator import DataGenerator

class KafkaKRaftProducer:
    def __init__(self):
        self.config = KAFKA_CONFIG
        self.producer = KafkaProducer(
            bootstrap_servers=self.config['bootstrap_servers'],
            value_serializer=lambda v: json.dumps(v).encode('utf-8'),
            acks='all',
            retries=3,
            max_in_flight_requests_per_connection=1,
            compression_type='gzip'
        )
        self.generator = DataGenerator()
    
    def create_topics(self):
        """创建Kafka主题 (KRaft模式)"""
        import subprocess
        
        topics = [
            f"{self.config['topics']['transactions']} --partitions 3 --replication-factor 1",
            f"{self.config['topics']['user_activity']} --partitions 2 --replication-factor 1",
            f"{self.config['topics']['metrics']} --partitions 1 --replication-factor 1"
        ]
        
        for topic_cmd in topics:
            try:
                cmd = f"kafka-topics --bootstrap-server {self.config['bootstrap_servers']} --create --topic {topic_cmd}"
                print(f"创建主题: {cmd}")
                subprocess.run(cmd, shell=True, check=True)
            except subprocess.CalledProcessError as e:
                print(f"主题可能已存在: {e}")
    
    def send_transaction_data(self, num_messages=100, delay=0.5):
        """发送交易数据"""
        topic = self.config['topics']['transactions']
        print(f"开始向主题 '{topic}' 发送 {num_messages} 条交易数据...")
        
        for i in range(num_messages):
            try:
                data = self.generator.generate_transaction()
                future = self.producer.send(topic, value=data)
                
                # 异步确认发送成功
                future.add_callback(self._on_send_success, data)
                future.add_errback(self._on_send_error, data)
                
                if (i + 1) % 10 == 0:
                    print(f"已发送 {i + 1} 条交易数据")
                
                time.sleep(delay)
                
            except KafkaError as e:
                print(f"发送失败: {e}")
                time.sleep(1)  # brief pause; resending is left to the producer's retries setting
        
        self.producer.flush()
        print(f"交易数据发送完成,共 {num_messages} 条")
    
    def send_user_activity_data(self, num_messages=50, delay=0.3):
        """发送用户活动数据"""
        topic = self.config['topics']['user_activity']
        print(f"开始向主题 '{topic}' 发送 {num_messages} 条用户活动数据...")
        
        for i in range(num_messages):
            data = self.generator.generate_user_activity()
            self.producer.send(topic, value=data)
            
            if (i + 1) % 10 == 0:
                print(f"已发送 {i + 1} 条用户活动数据")
            
            time.sleep(delay)
        
        self.producer.flush()
    
    def send_system_metrics(self, num_messages=30, delay=2):
        """发送系统指标数据"""
        topic = self.config['topics']['metrics']
        print(f"开始向主题 '{topic}' 发送 {num_messages} 条系统指标数据...")
        
        for i in range(num_messages):
            data = self.generator.generate_system_metrics()
            self.producer.send(topic, value=data)
            
            if (i + 1) % 5 == 0:
                print(f"已发送 {i + 1} 条系统指标数据")
            
            time.sleep(delay)
        
        self.producer.flush()
    
    def send_all_data_concurrently(self):
        """并发发送所有类型的数据"""
        with ThreadPoolExecutor(max_workers=3) as executor:
            executor.submit(self.send_transaction_data, 100, 0.3)
            executor.submit(self.send_user_activity_data, 60, 0.5)
            executor.submit(self.send_system_metrics, 40, 1.0)
    
    def _on_send_success(self, record_metadata, data):
        """发送成功回调"""
        print(f"✓ 数据发送成功: topic={record_metadata.topic}, "
              f"partition={record_metadata.partition}, "
              f"offset={record_metadata.offset}")
    
    def _on_send_error(self, exception, data):
        """发送失败回调"""
        print(f"✗ 数据发送失败: {exception}")
    
    def close(self):
        """关闭生产者"""
        self.producer.close()
        print("Kafka生产者已关闭")

if __name__ == "__main__":
    producer = KafkaKRaftProducer()
    
    try:
        # 创建主题
        producer.create_topics()
        
        print("\n1. 单线程发送数据")
        producer.send_transaction_data(num_messages=20, delay=0.2)
        
        print("\n2. 并发发送所有类型数据")
        producer.send_all_data_concurrently()
        
    except KeyboardInterrupt:
        print("\n生产者被用户中断")
    except Exception as e:
        print(f"发生错误: {e}")
    finally:
        producer.close()
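To confirm that the producer actually wrote something before involving Spark, a plain kafka-python consumer can drain a few records from the transactions topic. This is a sketch of my own, not one of the project files:

# Spot-check the 'ecommerce-transactions' topic with kafka-python
# (assumes the producer above has already been run).
import json
from kafka import KafkaConsumer

consumer = KafkaConsumer(
    "ecommerce-transactions",
    bootstrap_servers="localhost:9092",
    auto_offset_reset="earliest",       # read from the beginning of the topic
    consumer_timeout_ms=5000,           # stop iterating after 5s of silence
    value_deserializer=lambda v: json.loads(v.decode("utf-8")),
)

for i, message in enumerate(consumer):
    print(message.value["transaction_id"], message.value["total_amount"])
    if i >= 9:                          # show at most 10 records
        break

consumer.close()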

4. Spark streaming consumer (spark/streaming_consumer.py)

#!/usr/bin/env python3
import findspark
findspark.init()  # must run before pyspark is imported

from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *

from config.kafka_config import SPARK_CONFIG, KAFKA_CONFIG, SCHEMAS

class SparkStreamingConsumer:
    def __init__(self):
        self.spark_config = SPARK_CONFIG
        self.kafka_config = KAFKA_CONFIG
        
        # 创建SparkSession
        self.spark = SparkSession.builder \
            .appName(self.spark_config['app_name']) \
            .master(self.spark_config['master']) \
            .config("spark.sql.streaming.checkpointLocation", 
                   self.spark_config['spark.sql.streaming.checkpointLocation']) \
            .config("spark.jars.packages", self.spark_config['spark.jars.packages']) \
            .config("spark.executor.memory", self.spark_config['spark.executor.memory']) \
            .config("spark.driver.memory", self.spark_config['spark.driver.memory']) \
            .config("spark.sql.shuffle.partitions", "2") \
            .config("spark.streaming.backpressure.enabled", "true") \
            .config("spark.streaming.kafka.maxRatePerPartition", "100") \
            .getOrCreate()
        
        self.spark.sparkContext.setLogLevel("WARN")
        
        # 定义Schema
        self.transaction_schema = StructType([
            StructField("transaction_id", StringType()),
            StructField("user_id", StringType()),
            StructField("product_id", StringType()),
            StructField("product_name", StringType()),
            StructField("category", StringType()),
            StructField("price", DoubleType()),
            StructField("quantity", IntegerType()),
            StructField("total_amount", DoubleType()),
            StructField("timestamp", TimestampType()),
            StructField("payment_method", StringType()),
            StructField("shipping_address", StringType())
        ])
        
        self.user_activity_schema = StructType([
            StructField("user_id", StringType()),
            StructField("session_id", StringType()),
            StructField("event_type", StringType()),
            StructField("page_url", StringType()),
            StructField("timestamp", TimestampType()),
            StructField("device_type", StringType()),
            StructField("location", StringType()),
            StructField("user_agent", StringType()),
            StructField("session_duration", IntegerType())
        ])
    
    def read_from_kafka(self, topic):
        """从Kafka读取数据流"""
        return self.spark \
            .readStream \
            .format("kafka") \
            .option("kafka.bootstrap.servers", self.kafka_config['bootstrap_servers']) \
            .option("subscribe", topic) \
            .option("startingOffsets", "latest") \
            .option("maxOffsetsPerTrigger", "100") \
            .option("failOnDataLoss", "false") \
            .load()
    
    def process_transactions(self):
        """处理交易数据流"""
        print("开始处理交易数据流...")
        
        # 读取Kafka数据
        kafka_df = self.read_from_kafka(self.kafka_config['topics']['transactions'])
        
        # 解析JSON数据
        parsed_df = kafka_df.select(
            from_json(col("value").cast("string"), self.transaction_schema).alias("data")
        ).select("data.*")
        
        # 添加水印
        df_with_watermark = parsed_df \
            .withWatermark("timestamp", "5 minutes")
        
        # 实时分析1: 按类别统计销售额
        category_stats = df_with_watermark \
            .groupBy("category") \
            .agg(
                sum("total_amount").alias("total_sales"),
                avg("price").alias("avg_price"),
                count("*").alias("transaction_count"),
                approx_count_distinct("user_id").alias("unique_users")
            ) \
            .orderBy(desc("total_sales"))
        
        # 实时分析2: 滑动窗口分析(每10分钟窗口,滑动间隔5分钟)
        window_stats = df_with_watermark \
            .groupBy(
                window(col("timestamp"), "10 minutes", "5 minutes"),
                "category"
            ) \
            .agg(
                sum("total_amount").alias("window_sales"),
                count("*").alias("window_transactions"),
                approx_count_distinct("user_id").alias("window_unique_users")
            ) \
            .orderBy(desc("window_sales"))
        
        # 实时分析3: 热门产品排名
        product_stats = df_with_watermark \
            .groupBy("product_name", "category") \
            .agg(
                sum("quantity").alias("total_quantity"),
                sum("total_amount").alias("product_sales"),
                count("*").alias("purchase_count")
            ) \
            .orderBy(desc("product_sales"))
        
        return {
            'category_stats': category_stats,
            'window_stats': window_stats,
            'product_stats': product_stats,
            'raw_data': parsed_df
        }
    
    def process_user_activity(self):
        """处理用户活动数据流"""
        print("开始处理用户活动数据流...")
        
        kafka_df = self.read_from_kafka(self.kafka_config['topics']['user_activity'])
        
        parsed_df = kafka_df.select(
            from_json(col("value").cast("string"), self.user_activity_schema).alias("data")
        ).select("data.*")
        
        df_with_watermark = parsed_df \
            .withWatermark("timestamp", "3 minutes")
        
        # 用户活动分析
        activity_stats = df_with_watermark \
            .groupBy("event_type", "device_type") \
            .agg(
                count("*").alias("event_count"),
                approx_count_distinct("user_id").alias("unique_users"),
                avg("session_duration").alias("avg_session_duration")
            ) \
            .orderBy(desc("event_count"))
        
        # 实时用户会话分析
        session_analysis = df_with_watermark \
            .groupBy("user_id", "session_id") \
            .agg(
                count("*").alias("events_per_session"),
                min("timestamp").alias("session_start"),
                max("timestamp").alias("session_end"),
                collect_list("event_type").alias("event_sequence")
            )
        
        return {
            'activity_stats': activity_stats,
            'session_analysis': session_analysis,
            'raw_activity': parsed_df
        }
    
    def start_streaming(self, processing_time="10 seconds"):
        """启动所有流处理"""
        print("启动Spark流处理引擎...")
        
        # 处理交易数据
        transaction_results = self.process_transactions()
        
        # 输出交易统计到控制台
        query1 = transaction_results['category_stats'] \
            .writeStream \
            .outputMode("complete") \
            .format("console") \
            .option("truncate", "false") \
            .trigger(processingTime=processing_time) \
            .start()
        
        query2 = transaction_results['product_stats'] \
            .writeStream \
            .outputMode("complete") \
            .format("console") \
            .option("truncate", "false") \
            .trigger(processingTime="30 seconds") \
            .start()
        
        # 处理用户活动数据
        activity_results = self.process_user_activity()
        
        query3 = activity_results['activity_stats'] \
            .writeStream \
            .outputMode("complete") \
            .format("console") \
            .option("truncate", "false") \
            .trigger(processingTime="15 seconds") \
            .start()
        
        # 将原始数据写入内存表,供后续查询
        memory_query1 = transaction_results['raw_data'] \
            .writeStream \
            .queryName("transactions_stream") \
            .outputMode("append") \
            .format("memory") \
            .start()
        
        memory_query2 = activity_results['raw_activity'] \
            .writeStream \
            .queryName("activity_stream") \
            .outputMode("append") \
            .format("memory") \
            .start()
        
        print("所有流处理查询已启动")
        print("按 Ctrl+C 停止处理")
        
        # 等待查询终止
        try:
            query1.awaitTermination()
            query2.awaitTermination()
            query3.awaitTermination()
            memory_query1.awaitTermination()
            memory_query2.awaitTermination()
        except KeyboardInterrupt:
            print("正在停止流处理...")
            query1.stop()
            query2.stop()
            query3.stop()
            memory_query1.stop()
            memory_query2.stop()
    
    def run_interactive_queries(self):
        """运行交互式查询"""
        print("\n=== 交互式查询 ===")
        
        # 等待一些数据积累
        import time
        time.sleep(10)
        
        # 查询内存表中的数据
        try:
            # 查询最新交易
            recent_transactions = self.spark.sql("""
                SELECT 
                    user_id, 
                    product_name, 
                    total_amount,
                    payment_method,
                    timestamp
                FROM transactions_stream
                ORDER BY timestamp DESC
                LIMIT 10
            """)
            
            print("最近的10笔交易:")
            recent_transactions.show(truncate=False)
            
            # 查询用户活动统计
            user_activity_summary = self.spark.sql("""
                SELECT 
                    event_type,
                    COUNT(*) as count,
                    COUNT(DISTINCT user_id) as unique_users
                FROM activity_stream
                GROUP BY event_type
                ORDER BY count DESC
            """)
            
            print("\n用户活动统计:")
            user_activity_summary.show(truncate=False)
            
        except Exception as e:
            print(f"查询失败: {e}")
    
    def stop(self):
        """停止SparkSession"""
        self.spark.stop()
        print("SparkSession已停止")

if __name__ == "__main__":
    consumer = SparkStreamingConsumer()
    
    try:
        # 在新线程中启动流处理
        import threading
        
        def start_stream():
            consumer.start_streaming()
        
        stream_thread = threading.Thread(target=start_stream, daemon=True)
        stream_thread.start()
        
        # 运行交互式查询
        consumer.run_interactive_queries()
        
        # 保持主线程运行
        stream_thread.join()
        
    except KeyboardInterrupt:
        print("\n程序被用户中断")
    except Exception as e:
        print(f"发生错误: {e}")
    finally:
        consumer.stop()
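The console and memory sinks above are fine for a demo; if you want to keep the parsed stream, Structured Streaming can also write it to files. The variant below is my own sketch (the output paths are illustrative, and consumer is a SparkStreamingConsumer instance as in the __main__ block):

# Sketch: persist the parsed transaction stream as Parquet instead of
# (or alongside) the console/memory sinks. Paths are illustrative.
transaction_results = consumer.process_transactions()

file_query = (transaction_results['raw_data']
              .writeStream
              .outputMode("append")
              .format("parquet")
              .option("path", "/tmp/streaming-transactions")
              .option("checkpointLocation", "/tmp/spark-checkpoints/transactions-parquet")
              .trigger(processingTime="30 seconds")
              .start())

file_query.awaitTermination()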

5. Spark batch consumer (spark/batch_consumer.py)

#!/usr/bin/env python3
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *
from config.kafka_config import SPARK_CONFIG, KAFKA_CONFIG

class SparkBatchConsumer:
    def __init__(self):
        self.spark_config = SPARK_CONFIG
        self.kafka_config = KAFKA_CONFIG
        
        self.spark = SparkSession.builder \
            .appName("KafkaBatchProcessor") \
            .master(self.spark_config['master']) \
            .config("spark.jars.packages", self.spark_config['spark.jars.packages']) \
            .config("spark.executor.memory", self.spark_config['spark.executor.memory']) \
            .config("spark.driver.memory", self.spark_config['spark.driver.memory']) \
            .getOrCreate()
        
        self.spark.sparkContext.setLogLevel("WARN")
    
    def read_batch_from_kafka(self, topic, starting_offset="earliest", ending_offset="latest"):
        """从Kafka批量读取数据"""
        print(f"从Kafka主题 '{topic}' 读取批量数据...")
        
        return self.spark \
            .read \
            .format("kafka") \
            .option("kafka.bootstrap.servers", self.kafka_config['bootstrap_servers']) \
            .option("subscribe", topic) \
            .option("startingOffsets", starting_offset) \
            .option("endingOffsets", ending_offset) \
            .load()
    
    def process_transaction_batch(self):
        """批量处理交易数据"""
        print("=== 批量处理交易数据 ===")
        
        # 读取数据
        kafka_df = self.read_batch_from_kafka(
            self.kafka_config['topics']['transactions']
        )
        
        # 解析数据
        parsed_df = kafka_df.select(
            from_json(col("value").cast("string"), 
                     StructType([
                         StructField("transaction_id", StringType()),
                         StructField("user_id", StringType()),
                         StructField("product_name", StringType()),
                         StructField("category", StringType()),
                         StructField("price", DoubleType()),
                         StructField("quantity", IntegerType()),
                         StructField("total_amount", DoubleType()),
                         StructField("timestamp", StringType()),
                         StructField("payment_method", StringType())
                     ])).alias("data")
        ).select("data.*")
        
        # 创建临时视图
        parsed_df.createOrReplaceTempView("transactions")
        
        # 执行复杂分析
        results = self.spark.sql("""
            WITH category_summary AS (
                SELECT 
                    category,
                    COUNT(*) as transaction_count,
                    SUM(total_amount) as total_sales,
                    AVG(price) as avg_price,
                    COUNT(DISTINCT user_id) as unique_customers,
                    PERCENTILE_APPROX(total_amount, 0.5) as median_transaction
                FROM transactions
                GROUP BY category
            ),
            hourly_trends AS (
                SELECT 
                    DATE_FORMAT(CAST(timestamp AS TIMESTAMP), 'yyyy-MM-dd HH:00') as hour,
                    category,
                    COUNT(*) as transactions_per_hour,
                    SUM(total_amount) as sales_per_hour
                FROM transactions
                GROUP BY DATE_FORMAT(CAST(timestamp AS TIMESTAMP), 'yyyy-MM-dd HH:00'), category
            ),
            customer_segments AS (
                SELECT 
                    user_id,
                    COUNT(*) as total_transactions,
                    SUM(total_amount) as lifetime_value,
                    AVG(total_amount) as avg_transaction_value,
                    MAX(timestamp) as last_purchase
                FROM transactions
                GROUP BY user_id
            )
            
            SELECT 
                'CATEGORY_SUMMARY' as report_type,
                category,
                transaction_count,
                ROUND(total_sales, 2) as total_sales,
                ROUND(avg_price, 2) as avg_price,
                unique_customers
            FROM category_summary
            UNION ALL
            SELECT 
                'HOURLY_TRENDS' as report_type,
                CONCAT(hour, ' - ', category) as category,
                transactions_per_hour as transaction_count,
                ROUND(sales_per_hour, 2) as total_sales,
                0 as avg_price,
                0 as unique_customers
            FROM hourly_trends
            ORDER BY report_type, total_sales DESC
        """)
        
        print("批量分析结果:")
        results.show(50, truncate=False)
        
        # 保存结果到文件
        results.write \
            .mode("overwrite") \
            .option("header", "true") \
            .csv("/tmp/kafka_batch_results")
        
        print("结果已保存到 /tmp/kafka_batch_results")
        
        return results
    
    def process_user_activity_batch(self):
        """批量处理用户活动数据"""
        print("\n=== 批量处理用户活动数据 ===")
        
        kafka_df = self.read_batch_from_kafka(
            self.kafka_config['topics']['user_activity']
        )
        
        parsed_df = kafka_df.select(
            from_json(col("value").cast("string"),
                     StructType([
                         StructField("user_id", StringType()),
                         StructField("event_type", StringType()),
                         StructField("device_type", StringType()),
                         StructField("timestamp", StringType()),
                         StructField("session_duration", IntegerType())
                     ])).alias("data")
        ).select("data.*")
        
        parsed_df.createOrReplaceTempView("user_activity")
        
        analysis = self.spark.sql("""
            SELECT 
                event_type,
                device_type,
                COUNT(*) as event_count,
                COUNT(DISTINCT user_id) as unique_users,
                AVG(session_duration) as avg_session_duration,
                MIN(timestamp) as first_event,
                MAX(timestamp) as last_event
            FROM user_activity
            GROUP BY event_type, device_type
            ORDER BY event_count DESC
        """)
        
        print("用户活动分析:")
        analysis.show(truncate=False)
        
        return analysis
    
    def run_advanced_analytics(self):
        """运行高级分析"""
        print("\n=== 运行高级分析 ===")
        
        # 读取所有数据
        transactions_df = self.read_batch_from_kafka(
            self.kafka_config['topics']['transactions']
        )
        
        activity_df = self.read_batch_from_kafka(
            self.kafka_config['topics']['user_activity']
        )
        
        # 解析数据
        transactions_parsed = transactions_df.select(
            from_json(col("value").cast("string"),
                     StructType([
                         StructField("user_id", StringType()),
                         StructField("product_name", StringType()),
                         StructField("total_amount", DoubleType()),
                         StructField("timestamp", StringType())
                     ])).alias("data")
        ).select("data.*")
        
        activity_parsed = activity_df.select(
            from_json(col("value").cast("string"),
                     StructType([
                         StructField("user_id", StringType()),
                         StructField("event_type", StringType()),
                         StructField("timestamp", StringType())
                     ])).alias("data")
        ).select("data.*")
        
        # 创建视图
        transactions_parsed.createOrReplaceTempView("transactions")
        activity_parsed.createOrReplaceTempView("activity")
        
        # 用户行为与购买关联分析
        correlation_analysis = self.spark.sql("""
            WITH user_events AS (
                SELECT 
                    user_id,
                    SUM(CASE WHEN event_type = 'add_to_cart' THEN 1 ELSE 0 END) as cart_adds,
                    SUM(CASE WHEN event_type = 'purchase' THEN 1 ELSE 0 END) as purchases,
                    SUM(CASE WHEN event_type = 'page_view' THEN 1 ELSE 0 END) as page_views
                FROM activity
                GROUP BY user_id
            ),
            user_transactions AS (
                SELECT 
                    user_id,
                    COUNT(*) as transaction_count,
                    SUM(total_amount) as total_spent,
                    AVG(total_amount) as avg_transaction_value
                FROM transactions
                GROUP BY user_id
            )
            
            SELECT 
                COALESCE(ue.user_id, ut.user_id) as user_id,
                ue.cart_adds,
                ue.purchases,
                ue.page_views,
                ut.transaction_count,
                ROUND(ut.total_spent, 2) as total_spent,
                ROUND(ut.avg_transaction_value, 2) as avg_transaction_value,
                ROUND(ut.total_spent / NULLIF(ue.page_views, 0), 2) as revenue_per_view
            FROM user_events ue
            FULL OUTER JOIN user_transactions ut ON ue.user_id = ut.user_id
            ORDER BY total_spent DESC
        """)
        
        print("用户行为与购买关联分析:")
        correlation_analysis.show(20, truncate=False)
        
        # 保存结果
        correlation_analysis.write \
            .mode("overwrite") \
            .option("header", "true") \
            .parquet("/tmp/user_behavior_analysis")
        
        print("高级分析结果已保存到 /tmp/user_behavior_analysis")
    
    def stop(self):
        """停止Spark"""
        self.spark.stop()

if __name__ == "__main__":
    processor = SparkBatchConsumer()
    
    try:
        # 批量处理交易数据
        processor.process_transaction_batch()
        
        # 批量处理用户活动数据
        processor.process_user_activity_batch()
        
        # 运行高级分析
        processor.run_advanced_analytics()
        
    except Exception as e:
        print(f"批处理错误: {e}")
    finally:
        processor.stop()
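Both batch jobs write their results under /tmp. Once they have run, you can reload those outputs in a separate Spark session for inspection, roughly like this sketch of mine (the paths are the ones used above):

# Sketch: reload the outputs written by batch_consumer.py for inspection.
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("InspectBatchResults").getOrCreate()

category_report = spark.read.option("header", "true").csv("/tmp/kafka_batch_results")
behavior = spark.read.parquet("/tmp/user_behavior_analysis")

category_report.show(20, truncate=False)
behavior.orderBy("total_spent", ascending=False).show(10, truncate=False)

spark.stop()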

6. Integration runner script (run_integration.py)

#!/usr/bin/env python3
import subprocess
import sys
import time
import threading
import os
from pathlib import Path

class KafkaSparkIntegration:
    def __init__(self):
        self.base_dir = Path(__file__).parent
        self.kafka_process = None
        
    def check_java(self):
        """检查Java安装"""
        try:
            result = subprocess.run(['java', '-version'], 
                                  capture_output=True, text=True)
            if result.returncode == 0:
                print("✓ Java已安装")
                return True
        except:
            print("✗ Java未安装,请先安装Java 11")
            return False
    
    def start_kafka_kraft(self):
        """启动Kafka KRaft模式"""
        print("启动Kafka (KRaft模式)...")
        
        # 生成集群ID
        cluster_id_cmd = "kafka-storage random-uuid"
        result = subprocess.run(cluster_id_cmd, shell=True, 
                              capture_output=True, text=True)
        cluster_id = result.stdout.strip()
        
        if not cluster_id:
            print("无法生成集群ID")
            return False
        
        print(f"集群ID: {cluster_id}")
        
        # Create the configuration file (the cluster ID is applied by kafka-storage format below)
        config_content = """
process.roles=broker,controller
node.id=1
controller.quorum.voters=1@localhost:9093
listeners=PLAINTEXT://localhost:9092,CONTROLLER://localhost:9093
advertised.listeners=PLAINTEXT://localhost:9092
listener.security.protocol.map=CONTROLLER:PLAINTEXT,PLAINTEXT:PLAINTEXT
inter.broker.listener.name=PLAINTEXT
log.dirs=/tmp/kraft-combined-logs
num.partitions=3
default.replication.factor=1
min.insync.replicas=1
controller.listener.names=CONTROLLER
"""
        
        config_file = self.base_dir / "kraft-server.properties"
        with open(config_file, 'w') as f:
            f.write(config_content)
        
        # Format the storage directory (--ignore-formatted makes reruns idempotent)
        format_cmd = f"kafka-storage format -t {cluster_id} -c {config_file} --ignore-formatted"
        subprocess.run(format_cmd, shell=True, check=True)
        
        # 启动Kafka
        self.kafka_process = subprocess.Popen(
            ["kafka-server-start", str(config_file)],
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            text=True
        )
        
        # 等待Kafka启动
        time.sleep(10)
        
        # 检查Kafka是否运行
        try:
            subprocess.run([
                "kafka-topics",
                "--bootstrap-server", "localhost:9092",
                "--list"
            ], timeout=10, capture_output=True)
            print("✓ Kafka启动成功")
            return True
        except:
            print("✗ Kafka启动失败")
            return False
    
    def create_topics(self):
        """创建Kafka主题"""
        topics = [
            ("ecommerce-transactions", 3),
            ("user-activity-logs", 2),
            ("system-metrics", 1)
        ]
        
        for topic, partitions in topics:
            try:
                cmd = [
                    "kafka-topics",
                    "--bootstrap-server", "localhost:9092",
                    "--create",
                    "--topic", topic,
                    "--partitions", str(partitions),
                    "--replication-factor", "1",
                    "--if-not-exists"
                ]
                subprocess.run(cmd, check=True)
                print(f"✓ 主题 '{topic}' 创建成功")
            except subprocess.CalledProcessError:
                print(f"主题 '{topic}' 可能已存在")
            except Exception as e:
                print(f"创建主题错误: {e}")
    
    def run_producer(self):
        """运行生产者"""
        print("\n运行Kafka生产者...")
        producer_script = self.base_dir / "producer" / "kafka_producer.py"
        subprocess.run([sys.executable, str(producer_script)])
    
    def run_streaming_consumer(self):
        """运行流式消费者"""
        print("\n运行Spark流式消费者...")
        consumer_script = self.base_dir / "spark" / "streaming_consumer.py"
        subprocess.run([sys.executable, str(consumer_script)])
    
    def run_batch_consumer(self):
        """运行批处理消费者"""
        print("\n运行Spark批处理消费者...")
        consumer_script = self.base_dir / "spark" / "batch_consumer.py"
        subprocess.run([sys.executable, str(consumer_script)])
    
    def monitor_kafka(self):
        """监控Kafka状态"""
        print("\n=== Kafka状态监控 ===")
        
        try:
            # 列出所有主题
            print("\n1. 所有主题:")
            subprocess.run([
                "kafka-topics",
                "--bootstrap-server", "localhost:9092",
                "--list"
            ])
            
            # 查看主题详情
            print("\n2. 主题详情:")
            topics = ["ecommerce-transactions", "user-activity-logs"]
            for topic in topics:
                subprocess.run([
                    "kafka-topics",
                    "--bootstrap-server", "localhost:9092",
                    "--describe",
                    "--topic", topic
                ])
            
            # 查看消息
            print("\n3. 查看最新消息:")
            subprocess.run([
                "kafka-console-consumer",
                "--bootstrap-server", "localhost:9092",
                "--topic", "ecommerce-transactions",
                "--from-beginning",
                "--max-messages", "5"
            ])
            
        except Exception as e:
            print(f"监控错误: {e}")
    
    def cleanup(self):
        """清理资源"""
        print("\n清理资源...")
        
        if self.kafka_process:
            self.kafka_process.terminate()
            self.kafka_process.wait()
            print("Kafka已停止")
        
        # 清理临时文件
        import shutil
        temp_dirs = ["/tmp/kraft-combined-logs", "/tmp/spark-checkpoints"]
        for dir_path in temp_dirs:
            if os.path.exists(dir_path):
                try:
                    shutil.rmtree(dir_path)
                    print(f"清理目录: {dir_path}")
                except:
                    pass
    
    def run_demo(self):
        """运行完整演示"""
        print("=== Kafka 4.0.1 + Spark 集成演示 (KRaft模式) ===")
        
        try:
            # 检查Java
            if not self.check_java():
                return
            
            # 启动Kafka
            if not self.start_kafka_kraft():
                return
            
            # 创建主题
            self.create_topics()
            
            # 监控状态
            self.monitor_kafka()
            
            # 用户选择运行模式
            print("\n请选择运行模式:")
            print("1. 仅运行生产者")
            print("2. 仅运行Spark流式消费者")
            print("3. 仅运行Spark批处理消费者")
            print("4. 完整演示 (先生产后消费)")
            print("5. 并发演示 (同时运行生产和消费)")
            
            choice = input("\n请输入选择 (1-5): ").strip()
            
            if choice == '1':
                self.run_producer()
            elif choice == '2':
                self.run_streaming_consumer()
            elif choice == '3':
                self.run_batch_consumer()
            elif choice == '4':
                print("\n=== 步骤1: 运行生产者 ===")
                self.run_producer()
                time.sleep(2)
                print("\n=== 步骤2: 运行批处理消费者 ===")
                self.run_batch_consumer()
            elif choice == '5':
                print("\n并发运行生产和消费...")
                print("请打开两个终端分别运行:")
                print("终端1: python producer/kafka_producer.py")
                print("终端2: python spark/streaming_consumer.py")
            else:
                print("无效选择")
        
        except KeyboardInterrupt:
            print("\n演示被用户中断")
        except Exception as e:
            print(f"演示错误: {e}")
        finally:
            self.cleanup()
    
    def quick_test(self):
        """快速测试"""
        print("=== 快速测试 ===")
        
        # 启动Kafka
        self.start_kafka_kraft()
        time.sleep(5)
        
        # 创建主题
        self.create_topics()
        
        # 简单生产消费测试
        print("\n简单生产消费测试...")
        
        # 生产一些测试消息
        test_producer = subprocess.Popen([
            "kafka-console-producer",
            "--bootstrap-server", "localhost:9092",
            "--topic", "test-topic"
        ], stdin=subprocess.PIPE, text=True)
        
        messages = ["message1", "message2", "message3"]
        for msg in messages:
            test_producer.stdin.write(msg + "\n")
            test_producer.stdin.flush()
        
        test_producer.stdin.close()  # closing stdin lets the console producer flush and exit cleanly
        test_producer.wait()
        
        # 消费消息
        print("\n消费消息:")
        subprocess.run([
            "kafka-console-consumer",
            "--bootstrap-server", "localhost:9092",
            "--topic", "test-topic",
            "--from-beginning",
            "--timeout-ms", "3000"
        ])
        
        self.cleanup()

if __name__ == "__main__":
    integration = KafkaSparkIntegration()
    
    if len(sys.argv) > 1 and sys.argv[1] == "--test":
        integration.quick_test()
    else:
        integration.run_demo()
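Before running the full demo you may want a much smaller end-to-end check. The sketch below is my own addition: it pushes a single JSON record with kafka-python and then reads the topic back as a Spark batch, assuming the broker is up and the ecommerce-transactions topic exists.

# End-to-end smoke test sketch (not part of the original project).
import json
from kafka import KafkaProducer
from pyspark.sql import SparkSession

producer = KafkaProducer(
    bootstrap_servers="localhost:9092",
    value_serializer=lambda v: json.dumps(v).encode("utf-8"),
)
producer.send("ecommerce-transactions", {"transaction_id": "smoke-test", "total_amount": 1.0})
producer.flush()
producer.close()

spark = (SparkSession.builder
         .appName("SmokeTest")
         .config("spark.jars.packages",
                 "org.apache.spark:spark-sql-kafka-0-10_2.12:3.5.0")
         .getOrCreate())

df = (spark.read.format("kafka")
      .option("kafka.bootstrap.servers", "localhost:9092")
      .option("subscribe", "ecommerce-transactions")
      .option("startingOffsets", "earliest")
      .load())

print("records in topic:", df.count())
spark.stop()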

Part 3: Running the Project

1. Complete run, step by step

# 1. Install the Python dependencies
pip install kafka-python pyspark==3.5.0 findspark faker

# 2. Make sure Kafka is installed
brew services stop kafka   # stop any previously running instance
brew install kafka

# 3. Run the integration demo
python run_integration.py

# 4. Or run the components individually
# Start Kafka (KRaft mode) and run a quick produce/consume test
python run_integration.py --test

# Run the producer
python producer/kafka_producer.py

# Run the Spark streaming consumer
python spark/streaming_consumer.py

# Run the Spark batch consumer
python spark/batch_consumer.py

2. Verify KRaft mode

# Check that the Kafka process is running (it should have been started with the KRaft server.properties)
ps aux | grep kafka | grep -v grep

# Tail the Kafka logs
tail -f /tmp/kraft-combined-logs/*.log

# Verify the broker is serving requests
kafka-topics --bootstrap-server localhost:9092 --list
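A more direct confirmation that the broker really is running in KRaft mode is to query the metadata quorum. The snippet below shells out to the kafka-metadata-quorum tool that ships with Kafka (the unsuffixed command name assumes the Homebrew install used earlier):

# Ask the broker for its KRaft quorum status (assumes kafka-metadata-quorum is on PATH).
import subprocess

subprocess.run(
    ["kafka-metadata-quorum", "--bootstrap-server", "localhost:9092",
     "describe", "--status"],
    check=False,
)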

3. Troubleshooting

Problem 1: Kafka fails to start

# Clean out old data
rm -rf /tmp/kraft-combined-logs
rm -rf /tmp/kafka-logs

# Restart
python run_integration.py --test

Problem 2: Spark cannot connect to Kafka

# Check whether Kafka is listening
nc -z localhost 9092

# Check Spark version compatibility (the Kafka connector above targets Spark 3.5.0)
pip install pyspark==3.5.0

Problem 3: Not enough memory

# Give Spark more memory
export SPARK_DRIVER_MEMORY=2g
export SPARK_EXECUTOR_MEMORY=2g

This complete walkthrough covers:

  1. Kafka 4.0.1 in KRaft mode (no ZooKeeper required)
  2. Producing to multiple topics (transactions, user activity, system metrics)
  3. Real-time processing with Spark Structured Streaming
  4. Batch analysis with Spark
  5. Examples of more advanced analytics
  6. Error handling and resource cleanup throughout

The code is organized into modules and can be run directly on your Mac.