Big Data Cluster Setup (Detailed)



Cluster Components

Ubuntu 18.04

Hadoop 3.3.0

Zookeeper 3.6.2

Hive 3.1.2

Spark 3.4.0

Kafka 3.2.3

MySQL

Sqoop 1.4.7

Environment Setup

|           | master                       | slave01                     | slave02         |
| --------- | ---------------------------- | --------------------------- | --------------- |
| IP        | 192.168.179.101              | 192.168.179.102             | 192.168.179.103 |
| HDFS      | NameNode, DataNode           | SecondaryNameNode, DataNode | DataNode        |
| YARN      | NodeManager, ResourceManager | NodeManager                 | NodeManager     |
| Zookeeper | QuorumPeerMain               | QuorumPeerMain              | QuorumPeerMain  |
| Kafka     | master                       | worker                      | worker          |
| Hive      | master node                  |                             |                 |
| Master    | master node                  |                             |                 |

1 Ubuntu 18.04

① Configure the network

cd /etc/netplan
sudo gedit 01-network-manager-all.yaml

# Let NetworkManager manage all devices on this system
network:
  version: 2
  renderer: NetworkManager
  ethernets:
     ens160:         # name of the NIC to configure
       addresses: [192.168.179.101/21]   # IP address with netmask
       gateway4: 192.168.179.2      # gateway
       nameservers:
         addresses: [8.8.8.8,114.114.114.114]  # DNS servers

sudo netplan apply
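
A quick way to confirm the new settings took effect (a suggested check, assuming the NIC is ens160 as configured above):

ip addr show ens160              # should list 192.168.179.101/21
ping -c 3 192.168.179.2          # the gateway should answer
ping -c 3 mirrors.aliyun.com     # confirms DNS resolution and outbound routing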

② Switch apt sources + install required software + disable the firewall

sudo gedit /etc/apt/sources.list

deb http://mirrors.aliyun.com/ubuntu/ bionic main restricted universe multiverse
deb http://mirrors.aliyun.com/ubuntu/ bionic-security main restricted universe multiverse
deb http://mirrors.aliyun.com/ubuntu/ bionic-updates main restricted universe multiverse
deb http://mirrors.aliyun.com/ubuntu/ bionic-proposed main restricted universe multiverse
deb http://mirrors.aliyun.com/ubuntu/ bionic-backports main restricted universe multiverse
deb-src http://mirrors.aliyun.com/ubuntu/ bionic main restricted universe multiverse
deb-src http://mirrors.aliyun.com/ubuntu/ bionic-security main restricted universe multiverse
deb-src http://mirrors.aliyun.com/ubuntu/ bionic-updates main restricted universe multiverse
deb-src http://mirrors.aliyun.com/ubuntu/ bionic-proposed main restricted universe multiverse
deb-src http://mirrors.aliyun.com/ubuntu/ bionic-backports main restricted universe multiverse    


sudo apt-get update
sudo apt-get install open-vm-tools    # (VMware 17's bundled tools only handle display resizing)
sudo apt-get install open-vm-tools-desktop
sudo apt-get install vim
sudo apt-get install openssh-server -y
sudo apt install net-tools

sudo ufw disable

③ Host mapping + hostname

sudo gedit /etc/hosts
192.168.179.101 master
192.168.179.102 slave01
192.168.179.103 slave02

sudo gedit /etc/hostname
master    # each machine gets its own name (slave01, slave02)

④ Passwordless SSH

ssh-keygen -t rsa -P ""  
cd ~/.ssh    
cat id_rsa.pub >> authorized_keys

cd ~/.ssh 
ssh-copy-id -i ~/.ssh/id_rsa.pub 1@master
ssh-copy-id -i ~/.ssh/id_rsa.pub 1@slave01
ssh-copy-id -i ~/.ssh/id_rsa.pub 1@slave02
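
To confirm passwordless login works before moving on, a minimal check (user 1 as above):

for h in master slave01 slave02; do ssh 1@$h hostname; done
# should print master, slave01, slave02 with no password prompts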

2 Hadoop

(一) Install the JDK + unpack Hadoop

#unpack the JDK (the target directory must exist first)
sudo mkdir -p /usr/local/jvm
sudo tar -zxvf jdk-8u162-linux-x64.tar.gz -C /usr/local/jvm
cd /usr/local/jvm
sudo mv ./jdk1.8.0_162/ ./java
gedit ~/.bashrc

#unpack Hadoop
sudo tar -zxvf hadoop-3.3.0.tar.gz -C /usr/local
cd /usr/local/
sudo mv ./hadoop-3.3.0/ ./hadoop           
sudo chown -R zzp ./hadoop 

#environment variables (append to ~/.bashrc)
export JAVA_HOME=/usr/local/jvm/java
export JRE_HOME=${JAVA_HOME}/jre
export CLASSPATH=.:${JAVA_HOME}/lib:${JRE_HOME}/lib
export PATH=${JAVA_HOME}/bin:$PATH
export HADOOP_HOME=/usr/local/hadoop
export PATH=${HADOOP_HOME}/bin:${HADOOP_HOME}/sbin:$PATH
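
After appending these lines, reload the shell configuration and verify both installs (a quick sanity check; the versions should match the downloads above):

source ~/.bashrc
java -version       # should report 1.8.0_162
hadoop version      # should report Hadoop 3.3.0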

(二) Edit the configuration files

Path: cd /usr/local/hadoop/etc/hadoop

1.hadoop-env.sh yarn-env.sh mapred-env.sh

export JAVA_HOME=/usr/local/jvm/java

2.workers

master
slave01
slave02

3.mapred-site.xml

<property>
	<name>mapreduce.framework.name</name>
	<value>yarn</value>
</property>
<!-- JobHistory server address -->
 <property>
    	<name>mapreduce.jobhistory.address</name>
    	<value>master:10020</value>
 </property>
 <property>
    	<name>mapreduce.jobhistory.webapp.address</name>
    	<value>master:19888</value>
</property>
<property>
   <name>mapreduce.map.memory.mb</name>
   <value>3072</value>
</property>
<property>
   <name>mapreduce.reduce.memory.mb</name>
   <value>3072</value>
</property>

<!-- Hadoop environment variables for MR tasks -->
<property>
  	<name>yarn.app.mapreduce.am.env</name>
  	<value>HADOOP_MAPRED_HOME=${HADOOP_HOME}</value>
</property>
<property>
 	 <name>mapreduce.map.env</name>
  	<value>HADOOP_MAPRED_HOME=${HADOOP_HOME}</value>
</property>
<property>
  	<name>mapreduce.reduce.env</name>
  	<value>HADOOP_MAPRED_HOME=${HADOOP_HOME}</value>
</property>
<property>
   <name>mapreduce.admin.map.child.java.opts</name>
   <value>-Xmx3072m</value>
</property>
<property>
    <name>mapreduce.admin.reduce.child.java.opts</name>
    <value>-Xmx3072m</value>
</property>

<property>
  <name>mapreduce.job.jvm.numtasks</name>
  <value>-1</value>
  <description>How many tasks to run per jvm. If set to -1, there is
  no limit. 
  </description>
</property>

4.yarn-site.xml

<!-- ResourceManager host -->
<property>
	   <name>yarn.resourcemanager.hostname</name>
	   <value>master</value>
</property>	
<!-- have MR use the shuffle service -->
<property>
	   <name>yarn.nodemanager.aux-services</name>
	   <value>mapreduce_shuffle</value>
</property>
<!-- whether to enforce physical memory limits on containers -->
<property>
	   <name>yarn.nodemanager.pmem-check-enabled</name>
	   <value>false</value>
</property>
<!-- whether to enforce virtual memory limits on containers -->
<property>
	   <name>yarn.nodemanager.vmem-check-enabled</name>
	   <value>false</value>
</property>

<!-- YARN log server address -->
<property>
	   <name>yarn.log.server.url</name>
	   <value>http://master:19888/jobhistory/logs</value>
</property>
<!-- enable log aggregation -->
<property>
	   <name>yarn.log-aggregation-enable</name>
	   <value>true</value>
</property>
<property>
	   <name>yarn.resourcemanager.webapp.address</name>
	   <value>192.168.179.101:8088</value>
</property>
<!-- keep aggregated logs for 7 days -->
<property>
	   <name>yarn.log-aggregation.retain-seconds</name>
	   <value>604800</value>
</property>
<!-- container memory allocation limits for the scheduler -->
<property>
   <name>yarn.scheduler.minimum-allocation-mb</name>
   <value>2048</value>
   <description>default value is 1024</description>
</property>
<property>
   <name>yarn.scheduler.maximum-allocation-mb</name>
   <value>10240</value>
   <description>default value is 8192</description>
</property>
<property>
  <name>yarn.nodemanager.resource.memory-mb</name>
  <value>4096</value>
</property>

5.hdfs-site.xml

<property>
        <name>dfs.replication</name>
        <value>2</value>
 </property>

 <!-- NameNode web UI address -->
 <property>
        <name>dfs.namenode.http-address</name>
        <value>master:9870</value>
 </property>
 <!-- SecondaryNameNode web UI address -->
 <property>
        <name>dfs.namenode.secondary.http-address</name>
        <value>slave01:9868</value>
</property>
<!-- disable HDFS permission checking -->
<property>
	<name>dfs.permissions.enabled</name>
	<value>false</value>
</property>

6.core-site.xml

<!-- NameNode address -->
 <property>
  	<name>fs.defaultFS</name>
  	<value>hdfs://master:8020</value>
 </property>
 <!-- Hadoop data storage directory -->
 <property>
  	<name>hadoop.tmp.dir</name>
  	<value>/usr/local/hadoop/tmp</value>
 </property>
 <property>
	<name>hadoop.http.staticuser.user</name>
	<value>1</value>
</property>

<!-- hosts from which the hadoop proxy user may connect -->
<property>
	<name>hadoop.proxyuser.hadoop.hosts</name>
	<value>*</value>
</property>
<!-- groups the hadoop proxy user may impersonate -->
<property>
	<name>hadoop.proxyuser.hadoop.groups</name>
	 <value>*</value>
</property>
<!-- users the hadoop proxy user may impersonate; * means all -->
<property>
	<name>hadoop.proxyuser.hadoop.users</name>
	 <value>*</value>
</property>
<property>     
	<name>hadoop.proxyuser.zzp.hosts</name>     
	<value>*</value>
 </property> 
<property>     
	<name>hadoop.proxyuser.zzp.groups</name>    
    <value>*</value> 
</property>

(三) Distribute to the cluster

cd /usr/local
tar -zcf ~/hadoop.master.tar.gz ./hadoop 
cd ~

scp ./hadoop.master.tar.gz slave01:/home/1
scp ./hadoop.master.tar.gz slave02:/home/1

# then on slave01 and slave02:
sudo rm -r /usr/local/hadoop
sudo tar -zxf ~/hadoop.master.tar.gz -C /usr/local
sudo chown -R 1 /usr/local/hadoop

(四) Initialize the NameNode

hdfs namenode -format   # run once, on master only
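
Once the format succeeds, the cluster can be brought up from master and checked against the role table at the top (a suggested smoke test, not part of the original steps):

start-dfs.sh
start-yarn.sh
mapred --daemon start historyserver    # the JobHistory server from mapred-site.xml

jps   # master should show NameNode, DataNode, ResourceManager, NodeManager;
      # slave01 adds SecondaryNameNode; slave02 shows DataNode, NodeManager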

3 Hive

(一) Unpack

sudo tar -zxvf ./apache-hive-3.1.2-bin.tar.gz -C /usr/local 
cd /usr/local/
sudo mv apache-hive-3.1.2-bin hive     
sudo chown -R 1 hive  

(二) Environment variables

vim ~/.bashrc

export HIVE_HOME=/usr/local/hive
export PATH=$PATH:$HIVE_HOME/bin

source ~/.bashrc

(三) Edit the configuration files

cd /usr/local/hive/conf/
mv hive-env.sh.template  hive-env.sh
mv hive-default.xml.template  hive-site.xml

hive-env.sh

export JAVA_HOME=/usr/local/jvm/java
export HIVE_HOME=/usr/local/hive
export HADOOP_HOME=/usr/local/hadoop

hive-site.xml

<!-- JDBC connection URL -->
<property>
	<name>javax.jdo.option.ConnectionURL</name>
	<value>jdbc:mysql://master:3306/metastore?useSSL=false</value>
</property>
<!-- JDBC driver class -->
<property>
	<name>javax.jdo.option.ConnectionDriverName</name>
	<value>com.mysql.jdbc.Driver</value>
</property>
<!-- JDBC username -->
<property>
	<name>javax.jdo.option.ConnectionUserName</name>
	<value>root</value>
</property>
<!-- JDBC password -->
<property>
	<name>javax.jdo.option.ConnectionPassword</name>
	<value>123456</value>
</property>
<!-- skip metastore schema version verification -->
<property>
	<name>hive.metastore.schema.verification</name>
	<value>false</value>
</property>
<!-- metastore event notification authorization -->
<property>
	<name>hive.metastore.event.db.notification.api.auth</name>
	<value>false</value>
</property>
<!-- Hive's default warehouse directory on HDFS -->
<property>
	<name>hive.metastore.warehouse.dir</name>
	<value>/user/hive/warehouse</value>
</property>

hive-site.xml (append)

#append
<property>
	<name>hive.auto.convert.join</name>
	<!-- change the default true to false -->
	<value>false</value>
	<description>Enables the optimization about converting common join into mapjoin</description>
</property>

(四) Add the JDBC driver

cp /mysql-connector-java-5.1.37.jar $HIVE_HOME/lib

(五) Connect to MySQL


#create the Hive metastore database
mysql> create database metastore;
mysql> quit;

#initialize the Hive metastore
[zzp@master]# schematool -initSchema -dbType mysql -verbose
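
If initialization succeeds, a quick end-to-end check that Hive can reach the MySQL metastore (test_db is just an illustrative name):

hive
hive> show databases;
hive> create database test_db;
hive> quit;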

4 Kafka

(一) Unpack

sudo tar -zxf kafka_2.13-3.2.3.tgz -C /usr/local
cd /usr/local
sudo mv kafka_2.13-3.2.3/ ./kafka
sudo chown -R 1 ./kafka

(二) Environment variables

vim ~/.bashrc

export KAFKA_HOME=/usr/local/kafka
export PATH=$PATH:$KAFKA_HOME/bin

source ~/.bashrc

(三) Edit the configuration files

cd /usr/local/kafka
mkdir ./logs
cd config/
vim server.properties

server.properties

#globally unique broker id; must not be duplicated
broker.id=0
#enable topic deletion
delete.topic.enable=true
#number of threads handling network requests
num.network.threads=3
#number of threads handling disk I/O
num.io.threads=8
#send buffer size of the socket
socket.send.buffer.bytes=102400
#receive buffer size of the socket
socket.receive.buffer.bytes=102400
#maximum size of a socket request
socket.request.max.bytes=104857600
#directory where Kafka log data is stored
log.dirs=/usr/local/kafka/logs
#default number of partitions per topic on this broker
num.partitions=1
#threads used to recover and clean up data per data dir
num.recovery.threads.per.data.dir=1
#maximum time a segment file is retained before deletion
log.retention.hours=168
#Zookeeper cluster connection string
zookeeper.connect=master:2181,slave01:2181,slave02:2181

kafka-server-stop.sh

cd /usr/local/kafka/bin
vim kafka-server-stop.sh

Change:
PIDS=$(ps ax | grep -i 'kafka.Kafka' | grep java | grep -v grep | awk '{print $1}')
to:
PIDS=$(jps -lm | grep -i 'kafka.Kafka' | awk '{print $1}')

(四) Distribute to the cluster

cd /usr/local
tar -zcf ~/kafka.master.tar.gz ./kafka
cd ~
scp ./kafka.master.tar.gz slave01:/home/zzp
scp ./kafka.master.tar.gz slave02:/home/zzp

# then on slave01 and slave02:
sudo rm -r /usr/local/kafka
sudo tar -zxf ~/kafka.master.tar.gz -C /usr/local
sudo chown -R 1 /usr/local/kafka

Adjust the broker ids

On slave01 and slave02, edit /usr/local/kafka/config/server.properties and set

broker.id=1 and broker.id=2 respectively.

Note: broker.id values must never repeat.
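
With unique broker ids in place, start Zookeeper on all three nodes first, then start Kafka on each node; a smoke test with an illustrative topic named test:

# on every node, after Zookeeper is running:
cd /usr/local/kafka
bin/kafka-server-start.sh -daemon config/server.properties

# from any one node: create and list a test topic
bin/kafka-topics.sh --create --bootstrap-server master:9092 --replication-factor 3 --partitions 3 --topic test
bin/kafka-topics.sh --list --bootstrap-server master:9092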

5 Zookeeper

(一) Unpack

sudo tar -zxvf apache-zookeeper-3.6.2-bin.tar.gz -C /usr/local
cd /usr/local/
sudo mv ./apache-zookeeper-3.6.2-bin  ./zookeeper       
sudo chown -R zzp ./zookeeper 

(二) Environment variables

vim ~/.bashrc

export ZOOKEEPER_HOME=/usr/local/zookeeper
export PATH=$PATH:$ZOOKEEPER_HOME/bin

source ~/.bashrc

(三) Edit the configuration files

Path: /usr/local/zookeeper

mkdir zkData
cd zkData/
echo 1 > myid

cd ..
cd ./conf/
cp zoo_sample.cfg zoo.cfg
vim zoo.cfg

# change the following line:
dataDir=/usr/local/zookeeper/zkData
# add the following (no trailing spaces allowed!)
server.1=master:2888:3888
server.2=slave01:2888:3888
server.3=slave02:2888:3888

(四) Distribute to the cluster

cd /usr/local
tar -zcf ~/zookeeper.master.tar.gz ./zookeeper
cd ~
scp ./zookeeper.master.tar.gz slave01:/home/1
scp ./zookeeper.master.tar.gz slave02:/home/1

# then on slave01 and slave02:
sudo rm -r /usr/local/zookeeper
sudo tar -zxf ~/zookeeper.master.tar.gz -C /usr/local
sudo chown -R 1 /usr/local/zookeeper
# set myid to 2 on slave01 and 3 on slave02
vim /usr/local/zookeeper/zkData/myid

cd /usr/local/zookeeper/bin/
zkServer.sh start    # (or zk.sh start, using the wrapper sketched below)
zkServer.sh status
zkServer.sh stop
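
The zk.sh mentioned above is a cluster-wide wrapper script; a minimal sketch, assuming passwordless SSH between the three nodes as configured earlier:

#!/bin/bash
# zk.sh -- run zkServer.sh start|stop|status on every node
case $1 in
start|stop|status)
    for host in master slave01 slave02; do
        echo "---------- zookeeper $host $1 ----------"
        ssh $host "/usr/local/zookeeper/bin/zkServer.sh $1"
    done
    ;;
*)
    echo "Usage: zk.sh {start|stop|status}"
    ;;
esac

Save it somewhere on the PATH and chmod +x zk.sh. If java is not found over ssh, export JAVA_HOME in /usr/local/zookeeper/bin/zkEnv.sh.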

6 Spark

(一) Unpack

sudo tar -zxvf spark-3.4.0-bin-hadoop3.tgz -C /usr/local
cd /usr/local/
sudo mv ./spark-3.4.0-bin-hadoop3 ./spark          
sudo chown -R 1 ./spark 

(二) Environment variables

vim ~/.bashrc

export SPARK_HOME=/usr/local/spark
export PATH=$PATH:$SPARK_HOME/bin

source ~/.bashrc

(三) Edit the configuration files

workers

cd /usr/local/spark
cd ./conf
cp ./workers.template ./workers

vim workers

master
slave01
slave02

spark-env.sh

cp ./spark-env.sh.template ./spark-env.sh

vim spark-env.sh
export JAVA_HOME=/usr/local/jvm/java
export HADOOP_HOME=/usr/local/hadoop
export HADOOP_CONF_DIR=/usr/local/hadoop/etc/hadoop
export SPARK_DIST_CLASSPATH=$(/usr/local/hadoop/bin/hadoop classpath)
export SPARK_MASTER_HOST=master
export SPARK_MASTER_PORT=7077

spark-defaults.conf

cp spark-defaults.conf.template spark-defaults.conf

vim spark-defaults.conf
spark.master                     spark://master:7077
spark.serializer                 org.apache.spark.serializer.KryoSerializer
spark.driver.memory              1g
spark.executor.memory            1g

spark-config.sh

cd /usr/local/spark/sbin
sudo vim spark-config.sh

export JAVA_HOME=/usr/local/jvm/java

(四) Distribute to the cluster

cd /usr/local
tar -zcf ~/spark.master.tar.gz ./spark
cd ~
scp ./spark.master.tar.gz slave01:/home/zzp
scp ./spark.master.tar.gz slave02:/home/zzp

# then on slave01 and slave02:
sudo rm -r /usr/local/spark
sudo tar -zxf ~/spark.master.tar.gz -C /usr/local
sudo chown -R zzp /usr/local/spark

Startup command

cd /usr/local/spark/
sbin/start-all.sh  # the name clashes with Hadoop's start-all.sh, so run it from this directory or rename it
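
To verify the standalone cluster, the bundled SparkPi example makes a quick smoke test (the jar name assumes the default Scala 2.12 build of Spark 3.4.0):

cd /usr/local/spark
bin/spark-submit --master spark://master:7077 \
  --class org.apache.spark.examples.SparkPi \
  examples/jars/spark-examples_2.12-3.4.0.jar 10
# the driver output should include a line like "Pi is roughly 3.14..."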

7 MySQL

sudo apt install mysql-server

service mysql start

(一) Change the root password

sudo gedit /etc/mysql/mysql.conf.d/mysqld.cnf

#add and save
skip-grant-tables
#(only needed while changing the password; comment it out with # afterwards)

#restart mysql
service mysql restart

mysql -u root -p

#inside the mysql shell
use mysql;
select user, plugin from user;
update user set authentication_string=password("your-password"),plugin='mysql_native_password' where user='root';
flush privileges;

#if the previous step fails, run this variant instead
use mysql;
update user set authentication_string=password("your-password") where user="root";

(二) Allow login from any IP

#allow the root user in mysql.user to connect from any IP
update mysql.user set host='%' where user='root'; 
flush privileges; 
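
A quick check from another node (assuming the firewall is disabled as in step ②; on Ubuntu 18.04, MySQL 5.7 also binds to 127.0.0.1 by default, so remote access may additionally need bind-address = 0.0.0.0 in mysqld.cnf):

mysql -h master -u root -p    # run from slave01 or slave02; should reach the mysql> prompt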

8 Sqoop

(一) Unpack

sudo tar -zxvf sqoop-1.4.7.bin__hadoop-2.6.0.tar.gz -C /usr/local/
cd /usr/local/
sudo mv ./sqoop-1.4.7.bin__hadoop-2.6.0 ./sqoop         
sudo chown -R 1 ./sqoop

(二) Environment variables

vim ~/.bashrc

#sqoop
export SQOOP_HOME=/usr/local/sqoop
export PATH=$PATH:$SQOOP_HOME/bin

source ~/.bashrc

(三) Configuration file

cd /usr/local/sqoop/conf
mv sqoop-env-template.sh sqoop-env.sh

gedit sqoop-env.sh

export HADOOP_COMMON_HOME=/usr/local/hadoop
export HADOOP_MAPRED_HOME=/usr/local/hadoop
export HIVE_HOME=/usr/local/hive
export HCAT_HOME=$HIVE_HOME/hcatalog

(四) Add the JDBC driver

cp software/mysql-connector-java-5.1.37.jar $SQOOP_HOME/lib
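
With the driver in place, Sqoop can be smoke-tested against the MySQL set up earlier (password 123456 as configured in hive-site.xml; substitute your own):

sqoop list-databases \
  --connect jdbc:mysql://master:3306/ \
  --username root \
  --password 123456
# should list metastore among the databases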

(五) Check guava versions

hadoop: guava-27.0-jre.jar
spark:  guava-14.0.1.jar
hive:   guava-19.0.jar

Keep the highest version and replace the lower ones with it.
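
Concretely, that means copying the newest guava into the components carrying older ones and removing the old jars; a sketch for the classic Hive case (paths assume the layout used throughout this guide):

# locate the copies
ls /usr/local/hadoop/share/hadoop/common/lib/guava-*.jar
ls /usr/local/hive/lib/guava-*.jar

# replace Hive's guava-19 with Hadoop's guava-27
rm /usr/local/hive/lib/guava-19.0.jar
cp /usr/local/hadoop/share/hadoop/common/lib/guava-27.0-jre.jar /usr/local/hive/lib/

The same pattern applies to Spark's jars directory if you hit a guava-related NoSuchMethodError there.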

That's the whole big data cluster setup. Some steps may not be explained as clearly as they could be, and you may hit snags during installation along the way; please bear with me, and thanks for reading.