大数据集群搭建
集群配置
Ubuntu18.04
Hadoop3.3.0
Zookeeper3.6.2
Hive 3.1.2
Spark 3.4.0
Kafka 3.2.3
MySQL
环境搭建
| | master | slave01 | slave02 |
|---|---|---|---|
| IP | 192.168.179.101 | 192.168.179.102 | 192.168.179.103 |
| HDFS | NameNode DataNode | SecondaryNameNode DataNode | DataNode |
| YARN | NodeManager ResourceManager | NodeManager | NodeManager |
| Zookeeper | QuorumPeerMain | QuorumPeerMain | QuorumPeerMain |
| Kafka | master | worker | worker |
| Hive | 主节点 | | |
| Spark Master | 主节点 | | |
1 Ubuntu18.04
① 修改网络配置
cd /etc/netplan
sudo gedit 01-network-manager-all.yaml
# Let NetworkManager manage all devices on this system
network:
version: 2
renderer: NetworkManager
ethernets:
ens160: #配置的网卡名称
addresses: [192.168.179.101/21] #设置IP掩码
gateway4: 192.168.179.2 #设置网关
nameservers:
addresses: [8.8.8.8,114.114.114.114] #设置dns
sudo netplan apply
②更换镜像文件+安装必要软件+关闭防火墙
sudo gedit /etc/apt/sources.list
deb http://mirrors.aliyun.com/ubuntu/ bionic main restricted universe multiverse
deb http://mirrors.aliyun.com/ubuntu/ bionic-security main restricted universe multiverse
deb http://mirrors.aliyun.com/ubuntu/ bionic-updates main restricted universe multiverse
deb http://mirrors.aliyun.com/ubuntu/ bionic-proposed main restricted universe multiverse
deb http://mirrors.aliyun.com/ubuntu/ bionic-backports main restricted universe multiverse
deb-src http://mirrors.aliyun.com/ubuntu/ bionic main restricted universe multiverse
deb-src http://mirrors.aliyun.com/ubuntu/ bionic-security main restricted universe multiverse
deb-src http://mirrors.aliyun.com/ubuntu/ bionic-updates main restricted universe multiverse
deb-src http://mirrors.aliyun.com/ubuntu/ bionic-proposed main restricted universe multiverse
deb-src http://mirrors.aliyun.com/ubuntu/ bionic-backports main restricted universe multiverse
sudo apt-get update
sudo apt-get install open-vm-tools #(vm17工具只能调整大小)
sudo apt-get install open-vm-tools-desktop
sudo apt-get install vim
sudo apt-get install openssh-server -y
sudo apt install net-tools
sudo ufw disable
③主机映射 + 修改主机名
sudo gedit /etc/hosts
192.168.179.101 master
192.168.179.102 slave01
192.168.179.103 slave02
sudo gedit /etc/hostname
master(slave01 slave02)什么机器配什么名字
④配置免密
ssh-keygen -t rsa -P ""
cd ~/.ssh
cat id_rsa.pub >> authorized_keys
cd ~/.ssh
ssh-copy-id -i ~/.ssh/id_rsa.pub 1@master
ssh-copy-id -i ~/.ssh/id_rsa.pub 1@slave01
ssh-copy-id -i ~/.ssh/id_rsa.pub 1@slave02
2 Hadoop
(一)JDK安装 + 解压Hadoop
#解压jdk
sudo tar -zxvf jdk-8u162-linux-x64.tar.gz -C /usr/local/jvm
cd /usr/local/jvm
sudo mv ./jdk1.8.0_162/ ./java
gedit ~/.bashrc
#解压hadoop
sudo tar -zxvf hadoop-3.3.0.tar.gz -C /usr/local
cd /usr/local/
sudo mv ./hadoop-3.3.0/ ./hadoop
sudo chown -R zzp ./hadoop
#环境变量
export JAVA_HOME=/usr/local/jvm/java
export JRE_HOME=${JAVA_HOME}/jre
export CLASSPATH=.:${JAVA_HOME}/lib:${JRE_HOME}/lib
export PATH=${JAVA_HOME}/bin:$PATH
export HADOOP_HOME=/usr/local/hadoop
export PATH=${HADOOP_HOME}/bin:${HADOOP_HOME}/sbin:$PATH
(二)修改配置文件
路径:cd /usr/local/hadoop/etc/hadoop
1.hadoop-env.sh yarn-env.sh mapred-env.sh
export JAVA_HOME=/usr/local/jvm/java
2.workers
master
slave01
slave02
3.mapred-site.xml
<property>
<name>mapreduce.framework.name</name>
<value>yarn</value>
</property>
<!-- 指定jobhistory地址 -->
<property>
<name>mapreduce.jobhistory.address</name>
<value>master:10020</value>
</property>
<property>
<name>mapreduce.jobhistory.webapp.address</name>
<value>master:19888</value>
</property>
<property>
<name>mapreduce.map.memory.mb</name>
<value>3072</value>
</property>
<property>
<name>mapreduce.reduce.memory.mb</name>
<value>3072</value>
</property>
<!-- 指定hadoop环境变量 -->
<property>
<name>yarn.app.mapreduce.am.env</name>
<value>HADOOP_MAPRED_HOME=${HADOOP_HOME}</value>
</property>
<property>
<name>mapreduce.map.env</name>
<value>HADOOP_MAPRED_HOME=${HADOOP_HOME}</value>
</property>
<property>
<name>mapreduce.reduce.env</name>
<value>HADOOP_MAPRED_HOME=${HADOOP_HOME}</value>
</property>
<property>
<name>mapreduce.admin.map.child.java.opts</name>
<value>-Xmx3072m</value>
</property>
<property>
<name>mapreduce.admin.reduce.child.java.opts</name>
<value>-Xmx3072m</value>
</property>
<property>
<name>mapreduce.job.jvm.numtasks</name>
<value>-1</value>
<description>How many tasks to run per jvm. If set to -1, there is
no limit.
</description>
</property>
4.yarn-site.xml
<!-- 指定ResourceManager的地址 -->
<property>
<name>yarn.resourcemanager.hostname</name>
<value>master</value>
</property>
<!-- 指定MR走shuffle -->
<property>
<name>yarn.nodemanager.aux-services</name>
<value>mapreduce_shuffle</value>
</property>
<!-- 是否对容器实施物理内存限制 -->
<property>
<name>yarn.nodemanager.pmem-check-enabled</name>
<value>false</value>
</property>
<!-- 是否对容器实施虚拟内存限制 -->
<property>
<name>yarn.nodemanager.vmem-check-enabled</name>
<value>false</value>
</property>
<!-- 设置 yarn 历史服务器地址 -->
<property>
<name>yarn.log.server.url</name>
<value>http://master:19888/jobhistory/logs</value>
</property>
<!-- 开启日志聚集-->
<property>
<name>yarn.log-aggregation-enable</name>
<value>true</value>
</property>
<property>
<name>yarn.resourcemanager.webapp.address</name>
<value>192.168.179.101:8088</value>
</property>
<!-- 聚集日志保留的时间7天 -->
<property>
<name>yarn.log-aggregation.retain-seconds</name>
<value>604800</value>
</property>
<!-- 容器最小内存分配,默认为1024 -->
<property>
<name>yarn.scheduler.minimum-allocation-mb</name>
<value>2048</value>
<description>default value is 1024</description>
</property>
<property>
<name>yarn.scheduler.maximum-allocation-mb</name>
<value>10240</value>
<description>default value is 8192</description>
</property>
<property>
<name>yarn.nodemanager.resource.memory-mb</name>
<value>4096</value>
</property>
5.hdfs-site.xml
<property>
<name>dfs.replication</name>
<value>2</value>
</property>
<!-- nn web端访问地址-->
<property>
<name>dfs.namenode.http-address</name>
<value>master:9870</value>
</property>
<!-- snn web端访问地址-->
<property>
<name>dfs.namenode.secondary.http-address</name>
<value>slave01:9868</value>
</property>
<!-- 设置hdfs的文件权限 -->
<property>
<name>dfs.permissions</name>
<value>false</value>
</property>
6.core-site.xml
<!-- 指定 NameNode 的地址 -->
<property>
<name>fs.defaultFS</name>
<value>hdfs://master:8020</value>
</property>
<!-- 指定 hadoop 数据的存储目录 -->
<property>
<name>hadoop.tmp.dir</name>
<value>/usr/local/hadoop/tmp</value>
</property>
<property>
<name>hadoop.http.staticuser.user</name>
<value>1</value>
</property>
<!-- 配置该hadoop用户允许通过代理访问的主机节点-->
<property>
<name>hadoop.proxyuser.hadoop.hosts</name>
<value>*</value>
</property>
<!-- 配置该hadoop用户允许代理的用户所属组-->
<property>
<name>hadoop.proxyuser.hadoop.groups</name>
<value>*</value>
</property>
<!-- 配置该hadoop用户允许代理的用户, *代表所有-->
<property>
<name>hadoop.proxyuser.hadoop.users</name>
<value>*</value>
</property>
<property>
<name>hadoop.proxyuser.zzp.hosts</name>
<value>*</value>
</property>
<property>
<name>hadoop.proxyuser.zzp.groups</name>
<value>*</value>
</property>
(三)分发集群
cd /usr/local
tar -zcf ~/hadoop.master.tar.gz ./hadoop
cd ~
scp ./hadoop.master.tar.gz slave01:/home/1
scp ./hadoop.master.tar.gz slave02:/home/1
sudo rm -r /usr/local/hadoop
sudo tar -zxf ~/hadoop.master.tar.gz -C /usr/local
sudo chown -R 1 /usr/local/hadoop
(四)节点初始化
hdfs namenode -format
3 Hive
(一)解压
sudo tar -zxvf ./apache-hive-3.1.2-bin.tar.gz -C /usr/local
cd /usr/local/
sudo mv apache-hive-3.1.2-bin hive
sudo chown -R 1 hive
(二)环境变量
vim ~/.bashrc
export HIVE_HOME=/usr/local/hive
export PATH=$PATH:$HIVE_HOME/bin
source ~/.bashrc
(三)修改配置文件
cd ./hive/conf/
mv hive-env.sh.template hive-env.sh
mv hive-default.xml.template hive-site.xml
hive-env.sh
export JAVA_HOME=/usr/local/jvm/java
export HIVE_HOME=/usr/local/hive
export HADOOP_HOME=/usr/local/hadoop
hive-site.xml
<!-- jdbc 连接的 URL -->
<property>
<name>javax.jdo.option.ConnectionURL</name>
<value>jdbc:mysql://master:3306/metastore?useSSL=false</value>
</property>
<!-- jdbc 连接的 Driver-->
<property>
<name>javax.jdo.option.ConnectionDriverName</name>
<value>com.mysql.jdbc.Driver</value>
</property>
<!-- jdbc 连接的 username-->
<property>
<name>javax.jdo.option.ConnectionUserName</name>
<value>root</value>
</property>
<!-- jdbc 连接的 password -->
<property>
<name>javax.jdo.option.ConnectionPassword</name>
<value>123456</value>
</property>
<!-- Hive 元数据存储版本的验证 -->
<property>
<name>hive.metastore.schema.verification</name>
<value>false</value>
</property>
<!--元数据存储授权-->
<property>
<name>hive.metastore.event.db.notification.api.auth</name>
<value>false</value>
</property>
<!-- Hive 默认在 HDFS 的工作目录 -->
<property>
<name>hive.metastore.warehouse.dir</name>
<value>/user/hive/warehouse</value>
</property>
hive_conf.xml
#添加
<property>
<name>hive.auto.convert.join</name>
<value>false</value> <!-- true修改为false -->
<description>Enables the optimization about converting common join into mapjoin</description>
</property>
(四)添加JDBC
cp ./mysql-connector-java-5.1.37.jar $HIVE_HOME/lib
(五)与mysql连接
#新建 Hive 元数据库
mysql> create database metastore;
mysql> quit;
#初始化 Hive 元数据库
[zzp@master]# schematool -initSchema -dbType mysql -verbose
4 Kafka
(一)解压
sudo tar -zxf kafka_2.13-3.2.3.tgz -C /usr/local
cd /usr/local
sudo mv kafka_2.13-3.2.3/ ./kafka
sudo chown -R 1 ./kafka
(二)环境变量
vim ~/.bashrc
export KAFKA_HOME=/usr/local/kafka
export PATH=$PATH:$KAFKA_HOME/bin
source ~/.bashrc
(三)修改配置文件
cd ./kafka
mkdir ./logs
cd config/
vim server.properties
server.properties
#broker 的全局唯一编号,不能重复
broker.id=0
#删除 topic 功能使能
delete.topic.enable=true
#处理网络请求的线程数量
num.network.threads=3
#用来处理磁盘 IO 的现成数量
num.io.threads=8
#发送套接字的缓冲区大小
socket.send.buffer.bytes=102400
#接收套接字的缓冲区大小
socket.receive.buffer.bytes=102400
#请求套接字的缓冲区大小
socket.request.max.bytes=104857600
#kafka 运行日志存放的路径
log.dirs=/usr/local/kafka/logs
#topic 在当前 broker 上的分区个数
num.partitions=1
#用来恢复和清理 data 下数据的线程数量
num.recovery.threads.per.data.dir=1
#segment 文件保留的最长时间,超时将被删除
log.retention.hours=168
#配置连接 Zookeeper 集群地址
zookeeper.connect=master:2181,slave01:2181,slave02:2181
kafka-server-stop.sh
cd /usr/local/kafka/bin
vim kafka-server-stop.sh
将 PIDS=$(ps ax | grep -i 'kafka.Kafka' | grep java | grep -v grep | awk '{print $1}')
修改为:PIDS=$(jps -lm | grep -i 'kafka.Kafka' | awk '{print $1}')
(四)分发集群
cd /usr/local
tar -zcf ~/kafka.master.tar.gz ./kafka
cd ~
scp ./kafka.master.tar.gz slave01:/home/zzp
scp ./kafka.master.tar.gz slave02:/home/zzp
sudo rm -r /usr/local/kafka
sudo tar -zxf ~/kafka.master.tar.gz -C /usr/local
sudo chown -R 1 /usr/local/kafka
修改文件
分别在 slave01 和 slave02 上修改配置文件/usr/local/kafka/config/server.properties 中的
broker.id=1、broker.id=2
注:broker.id 不得重复
5 Zookeeper
(一)解压
sudo tar -zxvf apache-zookeeper-3.6.2-bin.tar.gz -C /usr/local
cd /usr/local/
sudo mv ./apache-zookeeper-3.6.2-bin ./zookeeper
sudo chown -R zzp ./zookeeper
(二)修改环境变量
vim ~/.bashrc
export ZOOKEEPER_HOME=/usr/local/zookeeper
export PATH=$PATH:$ZOOKEEPER_HOME/bin
source ~/.bashrc
(三)修改配置文件
路径:/usr/local/zookeeper
mkdir zkData
cd zkData/
echo 1 > myid
cd ..
cd ./conf/
cp zoo_sample.cfg zoo.cfg
vim zoo.cfg
# 修改以下内容:
dataDir=/usr/local/zookeeper/zkData
# 添加以下内容(不能有空格!)
server.1=master:2888:3888
server.2=slave01:2888:3888
server.3=slave02:2888:3888
(四)分发集群
cd /usr/local
tar -zcf ~/zookeeper.master.tar.gz ./zookeeper
cd ~
scp ./zookeeper.master.tar.gz slave01:/home/1
scp ./zookeeper.master.tar.gz slave02:/home/1
sudo rm -r /usr/local/zookeeper
sudo tar -zxf ~/zookeeper.master.tar.gz -C /usr/local
sudo chown -R 1 /usr/local/zookeeper
#slave01改成2 slave02改成3
vim /usr/local/zookeeper/zkData/myid
cd /usr/local/zookeeper/bin/
zkServer.sh start(修改后zk.sh start)
zkServer.sh status
zkServer.sh stop
6 Spark
(一)解压
sudo tar -zxvf spark-3.4.0-bin-hadoop3.tgz -C /usr/local
cd /usr/local/
sudo mv ./spark-3.4.0-bin-hadoop3 ./spark
sudo chown -R 1 ./spark
(二)环境变量
vim ~/.bashrc
export SPARK_HOME=/usr/local/spark
export PATH=$PATH:$SPARK_HOME/bin:$SPARK_HOME/sbin
source ~/.bashrc
(三)修改配置文件
workers
cd /usr/local/spark
cd ./conf
cp ./workers.template ./workers
vim workers
master
slave01
slave02
spark-env.sh
cp ./spark-env.sh.template ./spark-env.sh
vim spark-env.sh
export JAVA_HOME=/usr/local/jvm/java
export HADOOP_HOME=/usr/local/hadoop
export HADOOP_CONF_DIR=/usr/local/hadoop/etc/hadoop
export SPARK_DIST_CLASSPATH=$(/usr/local/hadoop/bin/hadoop classpath)
export SPARK_MASTER_HOST=master
export SPARK_MASTER_PORT=7077
spark-defaults.conf
cp spark-defaults.conf.template spark-defaults.conf
vim spark-defaults.conf
spark.master spark://master:7077
spark.serializer org.apache.spark.serializer.KryoSerializer
spark.driver.memory 1g
spark.executor.memory 1g
spark-config.sh
cd /usr/local/spark/sbin
sudo vim spark-config.sh
export JAVA_HOME=/usr/local/jvm/java
(四)分发集群
cd /usr/local
tar -zcf ~/spark.master.tar.gz ./spark
cd ~
scp ./spark.master.tar.gz slave01:/home/zzp
scp ./spark.master.tar.gz slave02:/home/zzp
sudo rm -r /usr/local/spark
sudo tar -zxf ~/spark.master.tar.gz -C /usr/local
sudo chown -R zzp /usr/local/spark
启动命令
cd /usr/local/spark/
sbin/start-all.sh #与Hadoop的start-all.sh冲突,因此需要在spark路径下启动或者改名
7 MySQL
sudo apt install mysql-server
service mysql start
(一)修改密码
sudo gedit /etc/mysql/mysql.conf.d/mysqld.cnf
#添加并保存
skip-grant-tables
#(修改密码时使用不用则#注释)
#重启mysql
service mysql restart
mysql -u root -p
#数据库界面
use mysql;
select user, plugin from user;
update user set authentication_string=password("密码"),plugin='mysql_native_password' where user='root';
flush privileges;
#如果上步启动出错则继续添加#修改
update user set authentication_string=password("你的密码") where user="root";
use mysql;
(二)IP登录
#mysql 库下的 user 表中的 root 用户允许任意 ip 连接
update mysql.user set host='%' where user='root';
flush privileges;
8 Sqoop
(一)解压
sudo tar -zxvf sqoop-1.4.7.bin__hadoop-2.6.0.tar.gz -C /usr/local/
cd /usr/local/
sudo mv ./sqoop-1.4.7.bin__hadoop-2.6.0 ./sqoop
sudo chown -R 1 ./sqoop
(二)环境变量
vim ~/.bashrc
#sqoop
export SQOOP_HOME=/usr/local/sqoop
export PATH=$PATH:$SQOOP_HOME/bin
source ~/.bashrc
(三)配置文件
cd /usr/local/sqoop/conf
mv sqoop-env-template.sh sqoop-env.sh
gedit sqoop-env.sh
export HADOOP_COMMON_HOME=/usr/local/hadoop
export HADOOP_MAPRED_HOME=/usr/local/hadoop
export HIVE_HOME=/usr/local/hive
export HCAT_HOME=$HIVE_HOME/hcatalog
(四)JDBC
cp software/mysql-connector-java-5.1.37.jar $SQOOP_HOME/lib
(五)查看guava
hadoop
guava-27.0-jre.jar
spark
guava-14.0.1.jar
hive
guava-19.0.jar
选择最高版本替换最低版本
这就是大数据集群搭建的过程,中途可能有些解释不清楚的地方,如果有中途安装有问题,勿喷谢谢。