I can help you set up a Hadoop cluster on a single Mac and integrate it with Spark. Here are the detailed steps:
1. Environment Preparation
Check the current environment
# Confirm what is already installed
java -version
which scala
which spark-submit
# If anything is missing, install it first (Homebrew's Java formula is "openjdk")
brew update
brew install openjdk scala
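If you prefer to script this check, a small sketch using Python's standard library can report which of the required tools are on your PATH:

import shutil

# Report whether each required command-line tool is reachable on PATH.
for tool in ["java", "scala", "spark-submit", "brew"]:
    path = shutil.which(tool)
    print(f"{tool}: {path if path else 'NOT FOUND'}")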
2. Install Hadoop
Download Hadoop
# Download Hadoop (pick a version compatible with your Spark build; 3.3.x is recommended)
# (macOS has no wget by default; use curl -LO or brew install wget)
wget https://downloads.apache.org/hadoop/common/hadoop-3.3.6/hadoop-3.3.6.tar.gz
tar -xzf hadoop-3.3.6.tar.gz
sudo mv hadoop-3.3.6 /usr/local/hadoop
Configure environment variables
# Edit ~/.zshrc (or ~/.bash_profile if you use bash)
export HADOOP_HOME=/usr/local/hadoop
export PATH=$PATH:$HADOOP_HOME/bin:$HADOOP_HOME/sbin
export HADOOP_CONF_DIR=$HADOOP_HOME/etc/hadoop
export HADOOP_MAPRED_HOME=$HADOOP_HOME
export HADOOP_COMMON_HOME=$HADOOP_HOME
export HADOOP_HDFS_HOME=$HADOOP_HOME
export YARN_HOME=$HADOOP_HOME
# Reload the shell configuration
source ~/.zshrc
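To confirm the exports are visible to newly started processes, here is a minimal Python check (run it from a freshly opened shell):

import os

# Print the Hadoop-related variables exported above; None means the export did not take effect.
for var in ["HADOOP_HOME", "HADOOP_CONF_DIR", "HADOOP_MAPRED_HOME",
            "HADOOP_COMMON_HOME", "HADOOP_HDFS_HOME", "YARN_HOME"]:
    print(f"{var} = {os.environ.get(var)}")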
3. Configure a Pseudo-Distributed Hadoop Cluster
Edit the Hadoop configuration files
1. hadoop-env.sh
cd $HADOOP_HOME/etc/hadoop
echo "export JAVA_HOME=$(/usr/libexec/java_home)" >> hadoop-env.sh
2. core-site.xml
<configuration>
  <property>
    <name>fs.defaultFS</name>
    <value>hdfs://localhost:9000</value>
  </property>
  <property>
    <name>hadoop.tmp.dir</name>
    <!-- Hadoop does not expand shell syntax here: replace YOUR_USERNAME with your macOS username -->
    <value>/Users/YOUR_USERNAME/hadoop_data/tmp</value>
  </property>
</configuration>
3. hdfs-site.xml
<configuration>
  <property>
    <name>dfs.replication</name>
    <value>1</value>
  </property>
  <property>
    <name>dfs.namenode.name.dir</name>
    <value>/Users/YOUR_USERNAME/hadoop_data/namenode</value>
  </property>
  <property>
    <name>dfs.datanode.data.dir</name>
    <value>/Users/YOUR_USERNAME/hadoop_data/datanode</value>
  </property>
</configuration>
4. mapred-site.xml
<configuration>
  <property>
    <name>mapreduce.framework.name</name>
    <value>yarn</value>
  </property>
</configuration>
5. yarn-site.xml
<configuration>
  <property>
    <name>yarn.nodemanager.aux-services</name>
    <value>mapreduce_shuffle</value>
  </property>
  <property>
    <name>yarn.nodemanager.env-whitelist</name>
    <value>JAVA_HOME,HADOOP_COMMON_HOME,HADOOP_HDFS_HOME,HADOOP_CONF_DIR,CLASSPATH_PREPEND_DISTCACHE,HADOOP_YARN_HOME,HADOOP_MAPRED_HOME</value>
  </property>
</configuration>
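Before moving on, you can sanity-check the XML with a small script; the sketch below (assuming HADOOP_CONF_DIR points at the directory edited above) parses core-site.xml and prints each configured property:

import os
import xml.etree.ElementTree as ET

conf_dir = os.environ.get("HADOOP_CONF_DIR", "/usr/local/hadoop/etc/hadoop")
tree = ET.parse(os.path.join(conf_dir, "core-site.xml"))

# Each <property> element holds a <name>/<value> pair.
for prop in tree.getroot().findall("property"):
    print(f"{prop.findtext('name')} = {prop.findtext('value')}")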
4. Start the Hadoop Cluster
Format HDFS
hdfs namenode -format
Start HDFS and YARN
# Note: start-dfs.sh and start-yarn.sh use SSH to localhost. On macOS, enable
# Remote Login (System Settings > Sharing) and set up passwordless SSH first.
# Start HDFS
start-dfs.sh
# Start YARN
start-yarn.sh
# Check the running Java processes
jps
# You should see: NameNode, DataNode, ResourceManager, NodeManager, SecondaryNameNode
Verify HDFS
# Create your HDFS home directory
hdfs dfs -mkdir -p /user/$(whoami)
hdfs dfs -ls /
# Web UIs
# NameNode: http://localhost:9870
# ResourceManager: http://localhost:8088
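If you want to script this verification, here is a minimal sketch that checks whether the two web UIs respond (assuming the default ports above):

import urllib.request

# A 200 response means the daemon's web UI is up.
for name, url in [("NameNode", "http://localhost:9870"),
                  ("ResourceManager", "http://localhost:8088")]:
    try:
        with urllib.request.urlopen(url, timeout=5) as resp:
            print(f"{name}: HTTP {resp.status}")
    except OSError as e:
        print(f"{name}: not reachable ({e})")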
5. Configure Spark to Use HDFS
Update the Spark configuration
# Edit the Spark configuration to add the Hadoop settings
cd $SPARK_HOME/conf
cp spark-env.sh.template spark-env.sh
Add the following to spark-env.sh:
export HADOOP_CONF_DIR=$HADOOP_HOME/etc/hadoop
export LD_LIBRARY_PATH=$HADOOP_HOME/lib/native:$LD_LIBRARY_PATH
# (On macOS the native Hadoop libraries are usually unavailable; the resulting
# "unable to load native-hadoop library" warning is harmless.)
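To confirm that Spark actually picks up the Hadoop configuration, you can run a short PySpark check (a sketch; _jsc is an internal handle, used here only for inspection):

from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("HDFS config check").getOrCreate()

# If HADOOP_CONF_DIR is wired up correctly, this prints hdfs://localhost:9000
# rather than the default file:/// scheme.
print(spark.sparkContext._jsc.hadoopConfiguration().get("fs.defaultFS"))

spark.stop()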
Test the Spark-HDFS integration
Create a test script, test_hdfs.py:
from pyspark.sql import SparkSession
import getpass
import os

# Initialize the Spark session
spark = SparkSession.builder \
    .appName("HDFS Integration Test") \
    .getOrCreate()

# 1. Write a local file into HDFS
username = getpass.getuser()
local_file = "test_data.txt"
hdfs_path = f"/user/{username}/test_data.txt"

# Create the test data
data = ["Hello Hadoop", "Hello Spark", "HDFS and Spark Integration"]
with open(local_file, "w") as f:
    f.write("\n".join(data))

# Upload the file to HDFS
os.system(f"hdfs dfs -put {local_file} {hdfs_path}")

# 2. Read the HDFS file with Spark
print("Reading from HDFS...")
hdfs_data = spark.read.text(hdfs_path)
hdfs_data.show()

# 3. Run a WordCount over it
print("\nWord Count:")
words = hdfs_data.rdd.flatMap(lambda x: x.value.split(" "))
word_count = words.map(lambda word: (word, 1)).reduceByKey(lambda a, b: a + b)
for word, count in word_count.collect():
    print(f"{word}: {count}")

# 4. Write the result back to HDFS
output_path = f"/user/{username}/wordcount_output"
word_count.saveAsTextFile(output_path)
print(f"\nOutput saved to HDFS: {output_path}")

# 5. Read the result back and display it
print("\nReading results from HDFS:")
results = spark.read.text(f"{output_path}/part-*")
results.show()

spark.stop()
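For comparison, the same word count can be expressed with the DataFrame API instead of the RDD API; the sketch below reuses the spark session and hdfs_path from the script above:

from pyspark.sql.functions import col, explode, split

# Split each line into words, explode into one row per word, then count per word.
word_counts_df = (
    spark.read.text(hdfs_path)
    .select(explode(split(col("value"), " ")).alias("word"))
    .groupBy("word")
    .count()
)
word_counts_df.show()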
6. Full Workflow Example
Create a data-pipeline script, data_pipeline.py:
from pyspark.sql import SparkSession
from pyspark.sql.functions import collect_list, count, countDistinct, sum as sum_
import getpass
import os

class HDFSSparkPipeline:
    def __init__(self):
        self.spark = SparkSession.builder \
            .appName("HDFS-Spark Pipeline") \
            .config("spark.sql.warehouse.dir", "hdfs://localhost:9000/user/hive/warehouse") \
            .getOrCreate()
        self.username = getpass.getuser()

    def create_sample_data(self):
        """Create a small sample dataset."""
        print("Creating sample data...")
        data = [
            ("2024-01-01", "user1", "login", 1),
            ("2024-01-01", "user2", "purchase", 150),
            ("2024-01-02", "user1", "purchase", 200),
            ("2024-01-02", "user3", "login", 1),
            ("2024-01-02", "user2", "view", 1),
        ]
        df = self.spark.createDataFrame(data, ["date", "user_id", "event_type", "value"])
        return df

    def write_to_hdfs(self, df, path):
        """Write a DataFrame to HDFS as Parquet."""
        print(f"Writing data to HDFS: {path}")
        df.write.mode("overwrite").parquet(f"hdfs://localhost:9000{path}")
        return path

    def read_from_hdfs(self, path):
        """Read a Parquet dataset back from HDFS."""
        print(f"Reading data from HDFS: {path}")
        return self.spark.read.parquet(f"hdfs://localhost:9000{path}")

    def process_data(self, df):
        """Run the aggregation and analysis step."""
        print("Processing data...")
        # Per-day statistics
        daily_stats = df.groupBy("date").agg(
            count("*").alias("total_events"),
            sum_("value").alias("total_value"),
            countDistinct("user_id").alias("unique_users")
        )
        # Per-user statistics
        user_stats = df.groupBy("user_id").agg(
            count("*").alias("event_count"),
            sum_("value").alias("total_spent"),
            collect_list("event_type").alias("events")
        )
        return daily_stats, user_stats

    def run_pipeline(self):
        """Run the full data pipeline."""
        print("=" * 50)
        print("Starting HDFS-Spark Pipeline")
        print("=" * 50)

        # Step 1: create the data
        df = self.create_sample_data()
        print("\nStep 1: Sample Data")
        df.show()

        # Step 2: write it to HDFS
        raw_data_path = f"/user/{self.username}/raw_data"
        self.write_to_hdfs(df, raw_data_path)

        # Step 3: read it back from HDFS
        df_from_hdfs = self.read_from_hdfs(raw_data_path)
        print("\nStep 3: Data read from HDFS")
        df_from_hdfs.show()

        # Step 4: process the data
        daily_stats, user_stats = self.process_data(df_from_hdfs)

        # Step 5: save the processed results to HDFS
        print("\nStep 5: Saving processed results to HDFS")
        daily_path = f"/user/{self.username}/daily_stats"
        user_path = f"/user/{self.username}/user_stats"
        self.write_to_hdfs(daily_stats, daily_path)
        self.write_to_hdfs(user_stats, user_path)

        # Steps 6-7: verify the results
        print("\nStep 6: Verification - Daily Statistics")
        self.read_from_hdfs(daily_path).show()
        print("\nStep 7: Verification - User Statistics")
        self.read_from_hdfs(user_path).show()

        # Step 8: verify with a raw HDFS listing
        print("\nStep 8: HDFS Operations Verification")
        os.system(f"hdfs dfs -ls /user/{self.username}/")

        print("\n" + "=" * 50)
        print("Pipeline completed successfully!")
        print("=" * 50)
        self.spark.stop()

if __name__ == "__main__":
    pipeline = HDFSSparkPipeline()
    pipeline.run_pipeline()
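If the raw dataset grows, a common refinement of the write_to_hdfs step is to partition the Parquet output by date so later reads can prune partitions. A sketch, reusing df and spark from the pipeline above and writing to a hypothetical raw_data_partitioned path:

import getpass

username = getpass.getuser()
partitioned_path = f"hdfs://localhost:9000/user/{username}/raw_data_partitioned"

# Partition the raw events by date; HDFS gets one subdirectory per distinct date value.
df.write.mode("overwrite").partitionBy("date").parquet(partitioned_path)

# Readers that filter on date only scan the matching partition directories.
spark.read.parquet(partitioned_path).where("date = '2024-01-02'").show()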
7. Running and Monitoring
Start all services
# Start HDFS
start-dfs.sh
# Start YARN
start-yarn.sh
# Run the Spark application on YARN
spark-submit --master yarn \
  --deploy-mode client \
  test_hdfs.py
Monitoring UIs
- HDFS NameNode: http://localhost:9870
- YARN ResourceManager: http://localhost:8088
- Spark History Server: http://localhost:18080 (if you have started it)
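The ResourceManager also exposes a REST API, which is convenient for scripted monitoring; a minimal sketch against the default port 8088:

import json
import urllib.request

# List the applications known to the ResourceManager via its REST API.
with urllib.request.urlopen("http://localhost:8088/ws/v1/cluster/apps", timeout=5) as resp:
    payload = json.load(resp)

# "apps" is null when no application has been submitted yet.
apps = (payload.get("apps") or {}).get("app", []) or []
for app in apps:
    print(app["id"], app["name"], app["state"])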
8. Troubleshooting Common Issues
Port conflicts
# If a port is already taken, change it
# in $HADOOP_HOME/etc/hadoop/hdfs-site.xml
<property>
  <name>dfs.namenode.http-address</name>
  <value>localhost:9870</value>
</property>
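Before changing ports in the configuration, you can check which ones are already taken with a small socket probe (a sketch; adjust the port list to your setup):

import socket

# connect_ex returns 0 when something is already listening on the port.
for port in [9000, 9870, 8088]:
    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
        s.settimeout(1)
        in_use = s.connect_ex(("localhost", port)) == 0
        print(f"port {port}: {'in use' if in_use else 'free'}")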
Permission problems
# Fix HDFS directory ownership and permissions
hdfs dfs -chmod -R 755 /user
hdfs dfs -chown -R $(whoami) /user/$(whoami)
Memory settings
# If memory is tight, lower the Hadoop heap sizes
# in $HADOOP_HOME/etc/hadoop/hadoop-env.sh
export HADOOP_HEAPSIZE_MAX=512m
export HADOOP_HEAPSIZE_MIN=256m
This setup lets you run the full Hadoop HDFS + Spark workflow on a single Mac. You can:
- store data in HDFS
- process the data in HDFS with Spark
- write the results back to HDFS
- monitor the whole flow through the web UIs
If you want to extend this further (for example with Hive or HBase), I can provide additional integration options.