[hadoop] Hadoop pseudo-distributed cluster setup + Spark integration

This guide walks through building a pseudo-distributed Hadoop cluster on a single Mac and integrating it with Spark. The detailed steps follow.

1. Environment preparation

Check the current environment

# Confirm what is already installed
java -version
which scala
which spark-submit

# If they are not installed yet, install the prerequisites first
brew update
brew install openjdk scala
# (if you use Homebrew's openjdk, follow its post-install note so /usr/libexec/java_home can find it)

2. Install Hadoop

Download Hadoop

# Download Hadoop (pick a version compatible with your Spark build; 3.3.x is recommended)
# macOS does not ship wget: install it with brew install wget, or use curl -O instead
wget https://downloads.apache.org/hadoop/common/hadoop-3.3.6/hadoop-3.3.6.tar.gz
tar -xzf hadoop-3.3.6.tar.gz
sudo mv hadoop-3.3.6 /usr/local/hadoop

Configure environment variables

# Edit ~/.zshrc (or ~/.bash_profile) and add:
export HADOOP_HOME=/usr/local/hadoop
export PATH=$PATH:$HADOOP_HOME/bin:$HADOOP_HOME/sbin
export HADOOP_CONF_DIR=$HADOOP_HOME/etc/hadoop
export HADOOP_MAPRED_HOME=$HADOOP_HOME
export HADOOP_COMMON_HOME=$HADOOP_HOME
export HADOOP_HDFS_HOME=$HADOOP_HOME
export YARN_HOME=$HADOOP_HOME

# Reload the shell configuration
source ~/.zshrc

3. Configure the Hadoop pseudo-distributed cluster

Edit the Hadoop configuration files

1. hadoop-env.sh

cd $HADOOP_HOME/etc/hadoop
echo "export JAVA_HOME=$(/usr/libexec/java_home)" >> hadoop-env.sh

2. core-site.xml

<configuration>
    <property>
        <name>fs.defaultFS</name>
        <value>hdfs://localhost:9000</value>
    </property>
    <!-- Hadoop does not expand shell substitutions such as $(whoami) in XML;
         use your literal username (the output of whoami) -->
    <property>
        <name>hadoop.tmp.dir</name>
        <value>/Users/YOUR_USERNAME/hadoop_data/tmp</value>
    </property>
</configuration>

3. hdfs-site.xml

<configuration>
    <property>
        <name>dfs.replication</name>
        <value>1</value>
    </property>
    <!-- again, replace YOUR_USERNAME with your literal username -->
    <property>
        <name>dfs.namenode.name.dir</name>
        <value>/Users/YOUR_USERNAME/hadoop_data/namenode</value>
    </property>
    <property>
        <name>dfs.datanode.data.dir</name>
        <value>/Users/YOUR_USERNAME/hadoop_data/datanode</value>
    </property>
</configuration>

4. mapred-site.xml

<configuration>
    <property>
        <name>mapreduce.framework.name</name>
        <value>yarn</value>
    </property>
</configuration>

5. yarn-site.xml

<configuration>
    <property>
        <name>yarn.nodemanager.aux-services</name>
        <value>mapreduce_shuffle</value>
    </property>
    <property>
        <name>yarn.nodemanager.env-whitelist</name>
        <value>JAVA_HOME,HADOOP_COMMON_HOME,HADOOP_HDFS_HOME,HADOOP_CONF_DIR,CLASSPATH_PREPEND_DISTCACHE,HADOOP_YARN_HOME,HADOOP_MAPRED_HOME</value>
    </property>
</configuration>

4. Start the Hadoop cluster

Format HDFS

hdfs namenode -format

Start HDFS and YARN

Note: start-dfs.sh and start-yarn.sh launch the Hadoop daemons over SSH, so make sure ssh localhost works without a password first (on macOS, enable Remote Login in System Settings and set up key-based login).

# Start HDFS
start-dfs.sh

# Start YARN
start-yarn.sh

# Check the running processes
jps
# You should see: NameNode, DataNode, ResourceManager, NodeManager, SecondaryNameNode

Verify HDFS

# Create an HDFS home directory for your user
hdfs dfs -mkdir -p /user/$(whoami)
hdfs dfs -ls /

# Web UIs
# NameNode: http://localhost:9870
# ResourceManager: http://localhost:8088
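
If you prefer checking from code rather than a browser, here is a minimal liveness sketch that hits the NameNode web server's built-in /jmx endpoint (assuming the default port 9870 shown above):

# Quick liveness check of the NameNode web UI (default port 9870).
# The /jmx endpoint is served by the Hadoop daemon's embedded web server.
import urllib.request

with urllib.request.urlopen("http://localhost:9870/jmx", timeout=5) as resp:
    print("NameNode web UI reachable, HTTP status:", resp.status)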

5. Configure Spark to connect to HDFS

Update the Spark configuration

# Edit the Spark configuration to point it at the Hadoop config
cd $SPARK_HOME/conf
cp spark-env.sh.template spark-env.sh

Add the following to spark-env.sh:

export HADOOP_CONF_DIR=$HADOOP_HOME/etc/hadoop
export LD_LIBRARY_PATH=$HADOOP_HOME/lib/native:$LD_LIBRARY_PATH
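
To confirm that Spark really picks up this configuration, a minimal check from PySpark (note: _jsc is an internal accessor, used here only for inspection):

from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("HDFS config check").getOrCreate()

# With HADOOP_CONF_DIR exported, this should print hdfs://localhost:9000
hadoop_conf = spark.sparkContext._jsc.hadoopConfiguration()
print("fs.defaultFS =", hadoop_conf.get("fs.defaultFS"))

spark.stop()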

Test the Spark and HDFS integration

Create a test program, test_hdfs.py:

from pyspark.sql import SparkSession
import getpass
import os

# Initialize the Spark session
spark = SparkSession.builder \
    .appName("HDFS Integration Test") \
    .getOrCreate()

# 1. Write a local file into HDFS
# (build the path in Python; a shell substitution like $(whoami) is not expanded inside a Python string)
local_file = "test_data.txt"
hdfs_path = f"/user/{getpass.getuser()}/test_data.txt"

# Create some test data
data = ["Hello Hadoop", "Hello Spark", "HDFS and Spark Integration"]
with open(local_file, "w") as f:
    f.write("\n".join(data))

# Upload the file to HDFS (-f overwrites an existing copy on re-runs)
os.system(f"hdfs dfs -put -f {local_file} {hdfs_path}")

# 2. Read the HDFS file with Spark
print("Reading from HDFS...")
hdfs_data = spark.read.text(hdfs_path)
hdfs_data.show()

# 3. Run a word count
print("\nWord Count:")
words = hdfs_data.rdd.flatMap(lambda x: x.value.split(" "))
word_count = words.map(lambda word: (word, 1)).reduceByKey(lambda a, b: a + b)
for word, count in word_count.collect():
    print(f"{word}: {count}")

# 4. Write the results back to HDFS
output_path = f"/user/{getpass.getuser()}/wordcount_output"
word_count.saveAsTextFile(output_path)
print(f"\nOutput saved to HDFS: {output_path}")

# 5. Read the results back from HDFS and display them
print("\nReading results from HDFS:")
results = spark.read.text(f"{output_path}/part-*")
results.show()

spark.stop()
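
One caveat when re-running the script: saveAsTextFile refuses to write to an existing directory. A small helper sketch (hypothetical, reaching the Hadoop FileSystem API through Spark's internal _jvm/_jsc gateways) that removes the old output first:

def delete_hdfs_path(spark, path):
    """Recursively delete an HDFS path if it exists (e.g. the old wordcount output)."""
    jvm = spark.sparkContext._jvm
    jsc = spark.sparkContext._jsc
    fs = jvm.org.apache.hadoop.fs.FileSystem.get(jsc.hadoopConfiguration())
    fs.delete(jvm.org.apache.hadoop.fs.Path(path), True)  # True = recursive delete

# Example: call it before word_count.saveAsTextFile(output_path)
# delete_hdfs_path(spark, output_path)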

6. Complete workflow example

Create a data pipeline script, data_pipeline.py:

from pyspark.sql import SparkSession
from pyspark.sql.functions import count, sum, countDistinct, collect_list
import getpass
import os

class HDFSSparkPipeline:
    def __init__(self):
        self.spark = SparkSession.builder \
            .appName("HDFS-Spark Pipeline") \
            .config("spark.sql.warehouse.dir", "hdfs://localhost:9000/user/hive/warehouse") \
            .getOrCreate()
        
    def create_sample_data(self):
        """创建示例数据集"""
        print("Creating sample data...")
        data = [
            ("2024-01-01", "user1", "login", 1),
            ("2024-01-01", "user2", "purchase", 150),
            ("2024-01-02", "user1", "purchase", 200),
            ("2024-01-02", "user3", "login", 1),
            ("2024-01-02", "user2", "view", 1),
        ]
        
        df = self.spark.createDataFrame(data, ["date", "user_id", "event_type", "value"])
        return df
    
    def write_to_hdfs(self, df, path):
        """Write a DataFrame to HDFS as Parquet."""
        print(f"Writing data to HDFS: {path}")
        df.write.mode("overwrite").parquet(f"hdfs://localhost:9000{path}")
        return path

    def read_from_hdfs(self, path):
        """Read Parquet data back from HDFS."""
        print(f"Reading data from HDFS: {path}")
        return self.spark.read.parquet(f"hdfs://localhost:9000{path}")
    
    def process_data(self, df):
        """Process and aggregate the data."""
        print("Processing data...")

        # Per-day statistics
        daily_stats = df.groupBy("date").agg(
            count("*").alias("total_events"),
            sum("value").alias("total_value"),
            countDistinct("user_id").alias("unique_users")
        )

        # Per-user statistics
        user_stats = df.groupBy("user_id").agg(
            count("*").alias("event_count"),
            sum("value").alias("total_spent"),
            collect_list("event_type").alias("events")
        )

        return daily_stats, user_stats
    
    def run_pipeline(self):
        """Run the full data pipeline."""
        print("="*50)
        print("Starting HDFS-Spark Pipeline")
        print("="*50)

        # Step 1: create sample data
        df = self.create_sample_data()
        print("\nStep 1: Sample Data")
        df.show()

        # Step 2: write the raw data to HDFS
        # (resolve the username in Python; $(whoami) is not expanded in a Python string)
        user = getpass.getuser()
        raw_data_path = f"/user/{user}/raw_data"
        self.write_to_hdfs(df, raw_data_path)

        # Step 3: read it back from HDFS
        df_from_hdfs = self.read_from_hdfs(raw_data_path)
        print("\nStep 3: Data read from HDFS")
        df_from_hdfs.show()

        # Step 4: process the data
        daily_stats, user_stats = self.process_data(df_from_hdfs)

        # Step 5: save the processed results to HDFS
        print("\nStep 5: Saving processed results to HDFS")
        daily_path = f"/user/{user}/daily_stats"
        user_path = f"/user/{user}/user_stats"

        self.write_to_hdfs(daily_stats, daily_path)
        self.write_to_hdfs(user_stats, user_path)

        # Steps 6-7: verify the results
        print("\nStep 6: Verification - Daily Statistics")
        self.read_from_hdfs(daily_path).show()

        print("\nStep 7: Verification - User Statistics")
        self.read_from_hdfs(user_path).show()

        # Step 8: verify the files on HDFS
        print("\nStep 8: HDFS Operations Verification")
        os.system(f"hdfs dfs -ls /user/{user}/")

        print("\n" + "="*50)
        print("Pipeline completed successfully!")
        print("="*50)

        self.spark.stop()

if __name__ == "__main__":
    pipeline = HDFSSparkPipeline()
    pipeline.run_pipeline()
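
Once the pipeline has run, the Parquet outputs can also be queried from any new Spark session; a brief sketch (the path and view name below simply reuse the ones written by the pipeline):

from pyspark.sql import SparkSession
import getpass

spark = SparkSession.builder.appName("Query pipeline output").getOrCreate()

# Read the daily statistics written by the pipeline and expose them to Spark SQL
daily = spark.read.parquet(f"hdfs://localhost:9000/user/{getpass.getuser()}/daily_stats")
daily.createOrReplaceTempView("daily_stats")

spark.sql(
    "SELECT date, total_events, unique_users FROM daily_stats ORDER BY date"
).show()

spark.stop()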

7. Run and monitor

Start all the services

# Start HDFS
start-dfs.sh

# Start YARN
start-yarn.sh

# Submit the Spark application to YARN
spark-submit --master yarn \
    --deploy-mode client \
    test_hdfs.py
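
The same thing can be done from inside a script instead of on the spark-submit command line; a minimal sketch, assuming HADOOP_CONF_DIR is exported so Spark can locate YARN and HDFS:

from pyspark.sql import SparkSession

# Equivalent of --master yarn --deploy-mode client, configured in code
spark = (
    SparkSession.builder
    .appName("yarn-client-check")
    .master("yarn")
    .getOrCreate()
)
print("master:", spark.sparkContext.master)  # expected: yarn

spark.stop()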

Monitoring UIs

  1. NameNode (HDFS): http://localhost:9870
  2. ResourceManager (YARN applications): http://localhost:8088
  3. Spark application UI (while an application is running): http://localhost:4040

8. Troubleshooting common issues

Port conflicts

# If a port is already taken, change it
# e.g. in $HADOOP_HOME/etc/hadoop/hdfs-site.xml
<property>
    <name>dfs.namenode.http-address</name>
    <value>localhost:9870</value>
</property>
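
A quick way to see which of the default ports are already taken before starting the daemons (plain Python, no Hadoop needed; the port list is just the defaults used in this guide):

import socket

def port_in_use(port, host="localhost"):
    """Return True if something is already listening on host:port."""
    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
        s.settimeout(1)
        return s.connect_ex((host, port)) == 0

for port in (9000, 9870, 8088):
    print(port, "in use" if port_in_use(port) else "free")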

Permission issues

# Fix HDFS directory permissions
hdfs dfs -chmod -R 755 /user
hdfs dfs -chown -R $(whoami) /user/$(whoami)

Memory configuration

# If the machine is short on memory, shrink the daemon heap sizes
# in $HADOOP_HOME/etc/hadoop/hadoop-env.sh
export HADOOP_HEAPSIZE_MAX=512m
export HADOOP_HEAPSIZE_MIN=256m

With this setup you can run the full Hadoop HDFS + Spark workflow on a single Mac. You can:

  1. Store data in HDFS
  2. Process the data in HDFS with Spark
  3. Write the results back to HDFS
  4. Monitor the whole workflow through the Web UIs

If you want to extend the setup further (for example with Hive or HBase), the same cluster can serve as the foundation for those integrations.