Distributed System Deployment and Configuration

|                      | NODE1                         | NODE2      | NODE3      | NODE4      |
| -------------------- | ----------------------------- | ---------- | ---------- | ---------- |
| Zookeeper            |                               | ZK         | ZK         | ZK         |
| ZKFC                 | ZKFC-1                        | ZKFC-2     |            |            |
| HDFS-NameNode        | NameNode                      | NameNode   |            |            |
| HDFS-DataNode        |                               | DataNode-1 | DataNode-2 | DataNode-3 |
| HDFS-JournalNode     | JNN-1                         | JNN-2      | JNN-3      |            |
| YARN-ResourceManager |                               |            | RM-1       | RM-2       |
| Spark                | Master, Worker, HistoryServer | Worker     | Worker     |            |
| MySQL                | MySQL                         |            |            |            |
| Hive                 |                               | Hive       |            |            |
Installation packages

Remote development

Reference: "Pycharm远程连接Spark(超详细图文教程)" — CSDN blog post on connecting PyCharm to a remote Spark cluster.

Basic setup

1. SSH service

sudo yum install openssh-server
sudo systemctl start sshd
sudo systemctl enable sshd

Note: the package may not be downloadable from the default repo; switch mirrors first, see [[换源#CentOS]].

2. Clock synchronization

crontab -e
0 1 * * * /usr/sbin/ntpdate ntp.aliyun.com   # sync against Aliyun's NTP server daily at 01:00

3. Set the hostname

sudo hostnamectl set-hostname <new-hostname>

Verify with hostname.

4. Disable the firewall

sudo systemctl stop firewalld      # stop it now
sudo systemctl disable firewalld   # keep it off after reboot

5. Configure /etc/hosts

# /etc/hosts  (add an entry for every node)
<ip-address> <hostname>   # e.g. one line each for node1–node4

6. Install the JDK

sudo rpm -ivh jdk-8u221-linux-x64.rpm
# environment variables, appended to ~/.bash_profile
export JAVA_HOME=/usr/java/jdk1.8.0_221-amd64
export PATH=$PATH:$JAVA_HOME/bin
# reload the environment
source ~/.bash_profile

7. Passwordless SSH

ssh-keygen -t rsa  # generate a key pair under ~/.ssh
cat .ssh/id_rsa.pub >> .ssh/authorized_keys  # add your own public key
scp .ssh/authorized_keys bigdata@node2:/home/bigdata/.ssh/  # exchange keys: every node's authorized_keys must hold the public keys of all hosts that log in to it
chmod 600 .ssh/authorized_keys  # restrict permissions
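
To confirm the key exchange worked, a quick loop like the one below (assuming the node2–node4 hostnames from the hosts file above) should print each remote hostname without asking for a password:

# run on node1; repeat from the other nodes if they also need passwordless access
for node in node2 node3 node4
do
    ssh $node hostname
done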

Zookeeper

1. Unpack and rename

tar -zxvf apache-zookeeper-3.5.7-bin.tar.gz -C ~/bigData
# run in the extraction directory
mv apache-zookeeper-3.5.7-bin zookeeper-3.5.7

2. Configure ZooKeeper environment variables

# appended to ~/.bash_profile
export ZOOKEEPER_HOME=~/bigData/zookeeper-3.5.7
export PATH=$PATH:$JAVA_HOME/bin:$ZOOKEEPER_HOME/bin
# reload the environment
source ~/.bash_profile

3. Configure zoo.cfg

# zookeeper-3.5.7/conf/zoo.cfg
tickTime=2000
dataDir=/home/bigdata/bigData/zookeeper-3.5.7/data
clientPort=2181
initLimit=5
syncLimit=2
server.2=node2:2881:3881
server.3=node3:2881:3881 
server.4=node4:2881:3881

4. Create myid

mkdir ~/bigData/zookeeper-3.5.7/data
cd ~/bigData/zookeeper-3.5.7/data
echo 2 > myid

5. Start

zkServer.sh start   # start
zkServer.sh status  # check status
zkServer.sh stop    # stop

==Note: zoo.cfg and myid must be configured on every ZooKeeper node, and each node's myid has to match its server.N id in zoo.cfg.==
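
Copying zoo.cfg to the other nodes and writing each node's myid can be scripted; a minimal sketch, assuming the same ~/bigData/zookeeper-3.5.7 layout on node2–node4 and the server ids from zoo.cfg above:

# run from the node where zoo.cfg was edited
for id in 2 3 4
do
    scp ~/bigData/zookeeper-3.5.7/conf/zoo.cfg node$id:~/bigData/zookeeper-3.5.7/conf/
    ssh node$id "mkdir -p ~/bigData/zookeeper-3.5.7/data && echo $id > ~/bigData/zookeeper-3.5.7/data/myid"
done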

Hadoop

HDFS

1. Unpack Hadoop
tar -zxvf hadoop-3.1.3.tar.gz -C ~/bigData/
2. Configure environment variables
# ~/.bash_profile
export HADOOP_HOME=/home/bigdata/bigData/hadoop-3.1.3
export PATH=$PATH:$JAVA_HOME/bin:$HADOOP_HOME/bin:$HADOOP_HOME/sbin

# reload the environment
source ~/.bash_profile
3. Configure hadoop-env.sh
export JAVA_HOME=/usr/java/jdk1.8.0_221-amd64
4. Configure workers
node2
node3
node4
5. Configure core-site.xml
<configuration>
    <!-- NameNode address: the default filesystem for HDFS -->
    <property>
        <name>fs.defaultFS</name>
        <value>hdfs://node1:9820</value>
    </property>
  
    <!-- Hadoop data storage directory -->
    <property>
        <name>hadoop.tmp.dir</name>
        <value>/home/bigdata/bigData/data/hadoop/full</value>
    </property>
</configuration>
6. Configure hdfs-site.xml
<configuration>
    <!-- NameNode web UI address -->
    <property>
        <name>dfs.namenode.http-address</name>
        <value>node1:9870</value>
    </property>

    <!-- Secondary NameNode web UI address -->
    <property>
        <name>dfs.namenode.secondary.http-address</name>
        <value>node2:9868</value>
    </property>

    <!-- Number of replicas per block (default 3) -->
    <property>
        <name>dfs.replication</name>
        <value>2</value>
    </property>
</configuration>
7. Start
hdfs namenode -format   # format only on the very first start
start-dfs.sh
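
A few quick checks (run on node1) can confirm the cluster came up; the ports follow the core-site.xml/hdfs-site.xml values above:

jps                          # NameNode on node1, DataNodes on node2–node4
hdfs dfsadmin -report        # should list three live DataNodes
hdfs dfs -mkdir -p /smoketest && hdfs dfs -ls /   # trivial write/read test
# web UI: http://node1:9870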

HDFS HA

Note: stop SSH from prompting about host-key fingerprints:

# /etc/ssh/ssh_config
StrictHostKeyChecking no
1. core-site.xml
<configuration>
    <!-- Default filesystem for HDFS: the HA nameservice -->
    <property>
        <name>fs.defaultFS</name>
        <value>hdfs://mycluster</value>
    </property>

    <!-- Data storage directory -->
    <property>
        <name>hadoop.tmp.dir</name>
        <value>/home/bigdata/bigData/data/hadoop/ha</value>
    </property>

    <!-- Location and client port of each ZooKeeper server -->
    <property>
        <name>ha.zookeeper.quorum</name>
        <value>node2:2181,node3:2181,node4:2181</value>
    </property>

    <!-- Avoid permission errors when creating/deleting files from the HDFS web UI -->
    <property>
        <name>hadoop.http.staticuser.user</name>
        <value>root</value>
    </property>
</configuration>
2. hdfs-site.xml
<configuration>
    <!-- JournalNode data directory -->
    <property>
        <name>dfs.journalnode.edits.dir</name>
        <value>${hadoop.tmp.dir}/dfs/journalnode/</value>
    </property>

    <!-- Nameservice (cluster) name -->
    <property>
        <name>dfs.nameservices</name>
        <value>mycluster</value>
    </property>
  
    <!-- NameNodes that belong to the nameservice -->
    <property>
        <name>dfs.ha.namenodes.mycluster</name>
        <value>nn1,nn2</value>
    </property>

    <!-- NameNode RPC addresses -->
    <property>
        <name>dfs.namenode.rpc-address.mycluster.nn1</name>
        <value>node1:9820</value>
    </property>
    
    <property>
        <name>dfs.namenode.rpc-address.mycluster.nn2</name>
        <value>node2:9820</value>
    </property>

    <!-- NameNode HTTP addresses -->
    <property>
        <name>dfs.namenode.http-address.mycluster.nn1</name>
        <value>node1:9870</value>
    </property>

    <property>
        <name>dfs.namenode.http-address.mycluster.nn2</name>
        <value>node2:9870</value>
    </property>

    <!-- Where NameNode edit logs are shared on the JournalNodes -->
    <property>
        <name>dfs.namenode.shared.edits.dir</name>
    <value>qjournal://node1:8485;node2:8485;node3:8485/mycluster</value>
    </property>

    <!-- Failover proxy provider: how clients determine which NameNode is active -->
    <property>
        <name>dfs.client.failover.proxy.provider.mycluster</name>
        <value>org.apache.hadoop.hdfs.server.namenode.ha.ConfiguredFailoverProxyProvider</value>
    </property>
  
    <!-- Fencing: ensures only one NameNode serves clients at a time -->
    <property>
        <name>dfs.ha.fencing.methods</name>
        <value>sshfence</value>
    </property>

    <!-- SSH private key used by the sshfence method (the RSA key generated earlier) -->
    <property>
        <name>dfs.ha.fencing.ssh.private-key-files</name>
        <value>/home/bigdata/.ssh/id_rsa</value>
    </property>
  
    <!-- Enable automatic NameNode failover -->
    <property>
        <name>dfs.ha.automatic-failover.enabled</name>
        <value>true</value>
    </property>
</configuration>
3. Start
# node2 node3 node4
zkServer.sh start

# node1 node2 node3
hdfs --daemon start journalnode

# node1
hdfs namenode -format
hdfs --daemon start namenode

# node2
hdfs namenode -bootstrapStandby
hdfs --daemon start namenode

# node1
hdfs zkfc -formatZK
start-dfs.sh

==Note: if the active NameNode does not fail over automatically, the sshfence mechanism is missing the fuser command; install psmisc on every node:==

yum install -y psmisc
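
With psmisc in place, failover can be checked and exercised; a sketch using the nn1/nn2 ids defined in hdfs-site.xml (assumes nn1 is currently active on node1):

hdfs haadmin -getServiceState nn1        # e.g. active
hdfs haadmin -getServiceState nn2        # e.g. standby

# simulate a crash of the active NameNode on node1
kill -9 $(jps | grep -w NameNode | awk '{print $1}')
sleep 10
hdfs haadmin -getServiceState nn2        # should now report active
hdfs --daemon start namenode             # bring the killed NameNode back as standby
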
Startup scripts
  1. allJps.sh
#!/bin/bash
# ~/bin/allJps.sh
# show the Java processes on every node in the cluster
echo "-----------node1 jps--------------"
jps
for node in node2 node3 node4
do
    echo "-----------$node jps--------------"
    ssh $node "source ~/.bash_profile; jps"
done
  2. starthdfs.sh
#!/bin/bash
# start the ZooKeeper cluster
for node in node2 node3 node4
do
    ssh $node "source ~/.bash_profile; zkServer.sh start"
done
# give ZooKeeper a moment
sleep 1
# start the HDFS cluster
start-dfs.sh
allJps.sh
  3. stophdfs.sh
#!/bin/bash
source ~/.bash_profile  # make sure the Hadoop/ZooKeeper commands are on PATH
echo "Stopping the HDFS cluster..."
stop-dfs.sh
sleep 1  # give HDFS a moment to shut down
echo "Stopping the ZooKeeper cluster..."
for node in node2 node3 node4
do
    echo "Stopping ZooKeeper on $node..."
    ssh $node "source ~/.bash_profile; zkServer.sh stop"
done
echo "Checking processes on each node..."
~/bin/allJps.sh

YARN HA

1. Configure mapred-site.xml
<property>
    <name>mapreduce.framework.name</name>
    <value>yarn</value>
</property>
2. Configure yarn-site.xml
<configuration>
    <!-- Basic configuration -->
    <property>
        <name>yarn.nodemanager.aux-services</name>
        <value>mapreduce_shuffle</value>
    </property>
    <property>
        <name>yarn.nodemanager.aux-services.mapreduce_shuffle.class</name>
        <value>org.apache.hadoop.mapred.ShuffleHandler</value>
    </property>

    <!-- Core HA configuration -->
    <property>
        <name>yarn.resourcemanager.ha.enabled</name>
        <value>true</value>
    </property>
    <property>
        <name>yarn.resourcemanager.ha.automatic-failover.enabled</name>
        <value>true</value>
    </property>
    <property>
        <name>yarn.resourcemanager.ha.automatic-failover.embedded</name>
        <value>true</value>
    </property>

    <property>
        <name>yarn.resourcemanager.cluster-id</name>
        <value>cluster-yarn1</value>
    </property>
    <property>
        <name>yarn.resourcemanager.ha.rm-ids</name>
        <value>rm1,rm2</value>
    </property>
    <property>
        <name>yarn.resourcemanager.zk-address</name>
        <value>node2:2181,node3:2181,node4:2181</value>
    </property>

    <!-- RM1 -->
    <property>
        <name>yarn.resourcemanager.hostname.rm1</name>
        <value>node3</value>
    </property>
    <property>
        <name>yarn.resourcemanager.address.rm1</name>
        <value>node3:8032</value>
    </property>
    <property>
        <name>yarn.resourcemanager.scheduler.address.rm1</name>
        <value>node3:8030</value>
    </property>
    <property>
        <name>yarn.resourcemanager.resource-tracker.address.rm1</name>
        <value>node3:8031</value>
    </property>
    <property>
        <name>yarn.resourcemanager.admin.address.rm1</name>
        <value>node3:8033</value>
    </property>
    <property>
        <name>yarn.resourcemanager.webapp.address.rm1</name>
        <value>node3:8088</value>
    </property>
    <property>
        <name>yarn.resourcemanager.ha.id.rm1</name>
        <value>rm1</value>
    </property>

    <!-- RM2 -->
    <property>
        <name>yarn.resourcemanager.hostname.rm2</name>
        <value>node4</value>
    </property>
    <property>
        <name>yarn.resourcemanager.address.rm2</name>
        <value>node4:8032</value>
    </property>
    <property>
        <name>yarn.resourcemanager.scheduler.address.rm2</name>
        <value>node4:8030</value>
    </property>
    <property>
        <name>yarn.resourcemanager.resource-tracker.address.rm2</name>
        <value>node4:8031</value>
    </property>
    <property>
        <name>yarn.resourcemanager.admin.address.rm2</name>
        <value>node4:8033</value>
    </property>
    <property>
        <name>yarn.resourcemanager.webapp.address.rm2</name>
        <value>node4:8088</value>
    </property>
    <property>
        <name>yarn.resourcemanager.ha.id.rm2</name>
        <value>rm2</value>
    </property>

    <!-- State store -->
    <property>
        <name>yarn.resourcemanager.recovery.enabled</name>
        <value>true</value>
    </property>
    <property>
        <name>yarn.resourcemanager.store.class</name>
        <value>org.apache.hadoop.yarn.server.resourcemanager.recovery.ZKRMStateStore</value>
    </property>

    <!-- Resource limits -->
    <property>
        <name>yarn.scheduler.minimum-allocation-mb</name>
        <value>512</value>
    </property>
    <property>
        <name>yarn.scheduler.maximum-allocation-mb</name>
        <value>1024</value>
    </property>
    <property>
        <name>yarn.nodemanager.resource.memory-mb</name>
        <value>1024</value>
    </property>
    <property>
        <name>yarn.nodemanager.pmem-check-enabled</name>
        <value>false</value>
    </property>
    <property>
        <name>yarn.nodemanager.vmem-check-enabled</name>
        <value>false</value>
    </property>
</configuration>
Startup scripts
  1. startha.sh
#!/bin/bash
# start the ZooKeeper cluster
for node in node2 node3 node4
do
    ssh $node "source ~/.bash_profile; zkServer.sh start"
done
# wait for ZooKeeper to come up
sleep 3
# start the HDFS cluster
start-dfs.sh
# start YARN (the ResourceManagers run on node3/node4)
ssh node3 "source ~/.bash_profile; start-yarn.sh"
# show the Java processes on all four nodes
~/bin/allJps.sh
  2. stopha.sh
#!/bin/bash
# stop YARN
ssh node3 "source ~/.bash_profile; stop-yarn.sh"
# stop HDFS
stop-dfs.sh
# stop the ZooKeeper cluster
for node in node2 node3 node4
do
    ssh $node "source ~/.bash_profile; zkServer.sh stop"
done
# show the remaining Java processes
allJps.sh
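
After startha.sh, the two ResourceManagers can be checked with the rm1/rm2 ids from yarn-site.xml:

yarn rmadmin -getServiceState rm1   # node3
yarn rmadmin -getServiceState rm2   # node4
# web UI: http://node3:8088 (the standby RM redirects requests to the active one)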

Spark

1. Install Anaconda

sh ./Anaconda3-Linux-x86_64.sh  # run the installer

Welcome to Anaconda3 2021.05
In order to continue the installation process, please review the license
agreement.
Please, press ENTER to continue
>>>    # press Enter
===================================
End User License Agreement - Anaconda Individual Edition
===================================  
Copyright 2015-2021, Anaconda, Inc.
  
All rights reserved under the 3-clause BSD License:
This End User License Agreement (the "Agreement") is a legal agreement between you and Anaconda, Inc. ("Anaconda") and governs your use of Anaconda
 Individual Edition (which was formerly known as Anaconda Distribution).
--------------more------------------   # press Space to page through the license

Do you accept the license terms? [yes|no]
[no] >>> yes   # accept

Anaconda3 will now be installed into this location:
/home/bigdata/anaconda3  
  - Press ENTER to confirm the location
  - Press CTRL-C to abort the installation
  - Or specify a different location below 

[/home/bigdata/anaconda3] >>> /home/bigdata/bigData/anaconda3   # custom install path for anaconda3

PREFIX=/home/bigdata/bigData/anaconda3
Unpacking payload ...

Do you wish the installer to initialize Anaconda3
by running conda init? [yes|no]
[no] >>> yes   # let the installer run conda init

2. Switch to a mirror

conda config --add channels https://mirrors.tuna.tsinghua.edu.cn/anaconda/pkgs/main/ 
conda config --add channels https://mirrors.tuna.tsinghua.edu.cn/anaconda/pkgs/free/ 
conda config --add channels https://mirrors.tuna.tsinghua.edu.cn/anaconda/cloud/conda-forge/ 
conda config --set show_channel_urls yes
# verify
conda config --show channels

3. Set up the PySpark environment

conda create -n pyspark_env python=3.10
conda activate pyspark_env

conda install pyspark
# Hive client libraries (used later)
pip install pyhive jieba -i https://pypi.tuna.tsinghua.edu.cn/simple

4. Unpack Spark

# unpack
tar -zxvf spark-3.2.1-bin-hadoop3.2.tgz -C ~/bigData/
# rename
mv spark-3.2.1-bin-hadoop3.2/ spark-3.2.1

5. Environment variables

- JAVA_HOME: tells Spark where the JDK is (already set)
- HADOOP_HOME: tells Spark where Hadoop is (already set)
- HADOOP_CONF_DIR: tells Spark where the Hadoop configuration files are
- SPARK_HOME: the Spark installation path
- PYSPARK_PYTHON: the Python interpreter Spark uses to run Python programs

# appended to ~/.bash_profile
export HADOOP_CONF_DIR=$HADOOP_HOME/etc/hadoop
export SPARK_HOME=/home/bigdata/bigData/spark-3.2.1 
export PYSPARK_PYTHON=/home/bigdata/bigData/anaconda3/envs/pyspark_env/bin/python 
export PATH=$PATH:$SPARK_HOME/bin:$SPARK_HOME/sbin

source ~/.bash_profile

6. Local mode

# launch the PySpark shell in local mode
(pyspark_env) [bigdata@node1 ~]$ pyspark
Python 3.10.16 (main, Dec 11 2024, 16:24:50) [GCC 11.2.0] on linux
Type "help", "copyright", "credits" or "license" for more information.
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
2025-04-03 20:23:08,774 WARN util.NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Welcome to
      ____              __
     / __/__  ___ _____/ /__
    _\ \/ _ \/ _ `/ __/  '_/
   /__ / .__/\_,_/_/ /_/\_\   version 3.2.1
      /_/

Using Python version 3.10.16 (main, Dec 11 2024 16:24:50)
Spark context Web UI available at http://node1:4040
Spark context available as 'sc' (master = local[*], app id = local-1743682990974).
SparkSession available as 'spark'.
>>>

7. Standalone mode

1. Configuration

cd $SPARK_HOME/conf

# 1. list the worker nodes
vim workers
node1
node2
node3

# 2. configure the Spark environment
mv spark-env.sh.template spark-env.sh
vim spark-env.sh
# JDK installation directory
export JAVA_HOME=/usr/java/jdk1.8.0_221-amd64
# Hadoop configuration directories, needed to read files on HDFS and to run on the YARN cluster
export HADOOP_CONF_DIR=/home/bigdata/bigData/hadoop-3.1.3/etc/hadoop
export YARN_CONF_DIR=/home/bigdata/bigData/hadoop-3.1.3/etc/hadoop
# host the Master runs on
export SPARK_MASTER_HOST=node1
# Master communication port
export SPARK_MASTER_PORT=7077
# Master web UI port
export SPARK_MASTER_WEBUI_PORT=8080
# CPU cores available to each Worker
export SPARK_WORKER_CORES=1
# memory available to each Worker
export SPARK_WORKER_MEMORY=1g
# Worker communication port
export SPARK_WORKER_PORT=7078
# Worker web UI port
export SPARK_WORKER_WEBUI_PORT=8081
# store Spark application history logs under /sparklogs on HDFS
# (use the bare HA nameservice; a logical nameservice URI must not carry a port)
export SPARK_HISTORY_OPTS="
-Dspark.history.fs.logDirectory=hdfs://mycluster/sparklogs/
-Dspark.history.fs.cleaner.enabled=true"

hdfs dfs -mkdir /sparklogs
hdfs dfs -chmod 777 /sparklogs

mv spark-defaults.conf.template spark-defaults.conf
vim spark-defaults.conf
# append:
# enable Spark event logging
spark.eventLog.enabled true
# where the event logs are written (must match the history server's log directory)
spark.eventLog.dir hdfs://mycluster/sparklogs/
# compress the event logs
spark.eventLog.compress true

mv log4j.properties.template log4j.properties
vim log4j.properties
# change the timestamp format (optional)
log4j.appender.console.layout.ConversionPattern=%d{MM/dd HH:mm:ss} %p %c{1}: %m%n

2. Start

start-history-server.sh

cd ~/bigData/spark-3.2.1/sbin
./start-all.sh

3. Run a job

pyspark --master spark://node1:7077   # --master points at the standalone cluster; without it the shell runs in local mode
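
A non-interactive way to verify the standalone cluster is to submit the bundled pi example (the script ships with Spark at the path below):

spark-submit --master spark://node1:7077 \
    ~/bigData/spark-3.2.1/examples/src/main/python/pi.py 10
# once it finishes, the application should also appear in the history server UI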

8. Standalone HA mode

  1. spark-env.sh
# append:
# Spark high-availability settings
export SPARK_DAEMON_JAVA_OPTS="
-Dspark.deploy.recoveryMode=ZOOKEEPER \
-Dspark.deploy.zookeeper.url=node2:2181,node3:2181,node4:2181 \
-Dspark.deploy.zookeeper.dir=/spark-ha"
  2. Start
cd ~/bigData/spark-3.2.1/sbin
./start-all.sh
start-history-server.sh

# test: on node2 or node3, start a standby Master
cd ~/bigData/spark-3.2.1/sbin
start-master.sh
jps               # an additional Master process should appear
stop-master.sh
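
With a standby Master running, clients can list both Masters in the URL so a submitted application survives a Master failover; a sketch, assuming the standby Master on node2 uses the same 7077 port:

pyspark --master spark://node1:7077,node2:7077
# stop the Master on node1 (stop-master.sh) and, after ZooKeeper elects node2,
# the running shell should re-register with the new active Master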

9. Running on YARN

1. Modify yarn-site.xml

# remove the following properties
 <property>
    <name>yarn.scheduler.minimum-allocation-mb</name>
    <value>512</value>
  </property>
  <property>
    <name>yarn.scheduler.maximum-allocation-mb</name>
    <value>1024</value>
  </property>
  <!-- physical memory a NodeManager may manage for containers -->
  <property>
    <name>yarn.nodemanager.resource.memory-mb</name>
    <value>1024</value>
  </property>
# make sure the following are present
<property>
    <name>yarn.nodemanager.pmem-check-enabled</name>
    <value>false</value>
</property>
<!-- whether to run a thread that checks each task's physical/virtual memory usage and kills tasks that exceed their allocation; defaults to true -->
<property>
    <name>yarn.nodemanager.vmem-check-enabled</name>
    <value>false</value>
</property>

2. Restart the cluster

stopha.sh
startha.sh

3. Test

spark-submit --master yarn ~/bigData/spark-3.2.1/examples/src/main/python/pi.py 20

Note: the job prints a large amount of log output after it starts; this is normal, not an error.
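
To confirm the job really ran on YARN rather than locally, list the recently finished applications (any node with the Hadoop client configuration works):

yarn application -list -appStates FINISHED | tail -n 5
# the pi job should show up with application type SPARK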

Hive

1. MySQL

Install libaio
yum install -y net-tools libaio
Unpack MySQL
tar -xvf mysql-5.7.33-1.el7.x86_64.rpm-bundle.tar -C ./mysql
Install MySQL
sudo rpm -ivh mysql-community-*

# or install just the needed packages
sudo rpm -ivh mysql-community-common-5.7.33-1.el7.x86_64.rpm \
              mysql-community-client-5.7.33-1.el7.x86_64.rpm \
              mysql-community-server-5.7.33-1.el7.x86_64.rpm \
              mysql-community-libs-5.7.33-1.el7.x86_64.rpm \
              mysql-community-devel-5.7.33-1.el7.x86_64.rpm
Initialize
sudo mysqld --initialize --user=mysql
Configure the character set
sudo vim /etc/my.cnf
[client]
default-character-set=utf8
[mysql]
default-character-set=utf8
[mysqld]
character_set_server=utf8
Start the server
sudo systemctl start mysqld

# start on boot
sudo systemctl enable mysqld
Set the root password
# find the temporary password
sudo grep 'temporary password' /var/log/mysqld.log

mysql -u root -p
>> enter the temporary password

ALTER USER 'root'@'localhost' IDENTIFIED BY 'root';   # replace 'root' with your own password

2. Hive with a direct metastore database connection

Unpack and rename
tar -zxvf apache-hive-3.1.2-bin.tar.gz -C ~/bigData/
mv ~/bigData/apache-hive-3.1.2-bin ~/bigData/hive-3.1.2   # so the path matches HIVE_HOME below
Add environment variables
vim ~/.bash_profile
# append:
export HIVE_HOME=/home/bigdata/bigData/hive-3.1.2
export PATH=$PATH:$HIVE_HOME/bin

source ~/.bash_profile
Resolve the jar conflict
mv $HIVE_HOME/lib/log4j-slf4j-impl-2.10.0.jar $HIVE_HOME/lib/log4j-slf4j-impl-2.10.0.bak_up2021
Copy the MySQL JDBC driver into Hive's lib directory
cp mysql-connector-java-5.1.48.jar $HIVE_HOME/lib
Configure the metastore to use MySQL
vim $HIVE_HOME/conf/hive-site.xml

<?xml version="1.0"?>
<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
<configuration>
 <!-- JDBC connection URL -->
 <property>
  <name>javax.jdo.option.ConnectionURL</name>
  <value>jdbc:mysql://node1:3306/hive?useSSL=false</value>
 </property>
 <!-- JDBC driver class -->
 <property>
  <name>javax.jdo.option.ConnectionDriverName</name>
  <value>com.mysql.jdbc.Driver</value>
 </property>
 <!-- JDBC username -->
 <property>
  <name>javax.jdo.option.ConnectionUserName</name>
  <value>root</value>
 </property>
 <!-- JDBC password -->
 <property>
  <name>javax.jdo.option.ConnectionPassword</name>
  <value>root</value>
 </property>
 <!-- Hive's default warehouse directory on HDFS -->
 <property>
  <name>hive.metastore.warehouse.dir</name>
  <value>/home/bigdata/bigData/data/hive/warehouse</value>
 </property>
 <!-- Disable metastore schema version verification -->
 <property>
  <name>hive.metastore.schema.verification</name>
  <value>false</value>
 </property>
 <!-- Metastore event DB notification API auth -->
 <property>
  <name>hive.metastore.event.db.notification.api.auth</name>
  <value>false</value>
 </property>
</configuration>

On node1, create the hive database in MySQL
sudo mysql -uroot -p
Enter password:
Welcome to the MySQL monitor.  Commands end with ; or \g.
Your MySQL connection id is 6
Server version: 5.7.33 MySQL Community Server (GPL)

Copyright (c) 2000, 2021, Oracle and/or its affiliates.

Oracle is a registered trademark of Oracle Corporation and/or its
affiliates. Other names may be trademarks of their respective
owners.

Type 'help;' or '\h' for help. Type '\c' to clear the current input statement.

mysql> create database hive;
Query OK, 1 row affected (0.01 sec)

mysql> use hive;
Database changed

mysql> show tables;
Empty set (0.01 sec)

mysql>quit;
Bye

On node2, initialize the Hive metastore schema into the hive database in MySQL
 schematool -initSchema -dbType mysql -verbose

Note: check the hive database in node1's MySQL afterwards; about 74 new metastore tables should have appeared.
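
A quick way to check from node1 (assuming the root password set earlier):

mysql -uroot -proot -e "USE hive; SHOW TABLES;" | wc -l
# roughly 74 metastore tables (plus one header line) should be counted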

Possible errors: 1. Guava version conflict

Exception in thread "main" java.lang.NoSuchMethodError: com.google.common.base.Preconditions.checkArgument(ZLjava/lang/String;Ljava/lang/Object;)V
        at org.apache.hadoop.conf.Configuration.set(Configuration.java:1357)
        at org.apache.hadoop.conf.Configuration.set(Configuration.java:1338)
        at org.apache.hadoop.mapred.JobConf.setJar(JobConf.java:518)
        at org.apache.hadoop.mapred.JobConf.setJarByClass(JobConf.java:536)
        at org.apache.hadoop.mapred.JobConf.<init>(JobConf.java:430)
        at org.apache.hadoop.hive.conf.HiveConf.initialize(HiveConf.java:5141)
        at org.apache.hadoop.hive.conf.HiveConf.<init>(HiveConf.java:5104)
        at org.apache.hive.beeline.HiveSchemaTool.<init>(HiveSchemaTool.java:96)
        at org.apache.hive.beeline.HiveSchemaTool.main(HiveSchemaTool.java:1473)
        at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
        at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
        at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
        at java.lang.reflect.Method.invoke(Method.java:498)
        at org.apache.hadoop.util.RunJar.run(RunJar.java:318)
        at org.apache.hadoop.util.RunJar.main(RunJar.java:232)

Fix:

# download Guava 21.0 or newer:
    wget https://repo1.maven.org/maven2/com/google/guava/guava/21.0/guava-21.0.jar

# move it into $HIVE_HOME/lib, replacing the old Guava:
    mv guava-21.0.jar $HIVE_HOME/lib/

# back up and remove the old Guava:
    cp $HIVE_HOME/lib/guava-19.0.jar $HIVE_HOME/lib/guava-19.0.jar.bak
    rm $HIVE_HOME/lib/guava-19.0.jar

2. MySQL permission problem:

Metastore connection URL:        jdbc:mysql://node1:3306/hive?useSSL=false
Metastore Connection Driver :    com.mysql.jdbc.Driver
Metastore connection User:       root
org.apache.hadoop.hive.metastore.HiveMetaException: Failed to get schema version.
Underlying cause: java.sql.SQLException : null,  message from server: "Host 'node2' is not allowed to connect to this MySQL server"
SQL Error code: 1130
org.apache.hadoop.hive.metastore.HiveMetaException: Failed to get schema version.
        at org.apache.hadoop.hive.metastore.tools.HiveSchemaHelper.getConnectionToMetastore(HiveSchemaHelper.java:94)
        at org.apache.hive.beeline.HiveSchemaTool.getConnectionToMetastore(HiveSchemaTool.java:169)
        at org.apache.hive.beeline.HiveSchemaTool.testConnectionToMetastore(HiveSchemaTool.java:475)
        at org.apache.hive.beeline.HiveSchemaTool.doInit(HiveSchemaTool.java:581)
        at org.apache.hive.beeline.HiveSchemaTool.doInit(HiveSchemaTool.java:567)
        at org.apache.hive.beeline.HiveSchemaTool.main(HiveSchemaTool.java:1517)
        at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
        at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
        at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
        at java.lang.reflect.Method.invoke(Method.java:498)
        at org.apache.hadoop.util.RunJar.run(RunJar.java:318)
        at org.apache.hadoop.util.RunJar.main(RunJar.java:232)
Caused by: java.sql.SQLException: null,  message from server: "Host 'node2' is not allowed to connect to this MySQL server"
        at com.mysql.jdbc.SQLError.createSQLException(SQLError.java:959)
        at com.mysql.jdbc.SQLError.createSQLException(SQLError.java:898)
        at com.mysql.jdbc.SQLError.createSQLException(SQLError.java:887)
        at com.mysql.jdbc.MysqlIO.doHandshake(MysqlIO.java:1038)
        at com.mysql.jdbc.ConnectionImpl.coreConnect(ConnectionImpl.java:2254)
        at com.mysql.jdbc.ConnectionImpl.connectOneTryOnly(ConnectionImpl.java:2285)
        at com.mysql.jdbc.ConnectionImpl.createNewIO(ConnectionImpl.java:2084)
        at com.mysql.jdbc.ConnectionImpl.<init>(ConnectionImpl.java:795)
        at com.mysql.jdbc.JDBC4Connection.<init>(JDBC4Connection.java:44)
        at sun.reflect.NativeConstructorAccessorImpl.newInstance0(Native Method)
        at sun.reflect.NativeConstructorAccessorImpl.newInstance(NativeConstructorAccessorImpl.java:62)
        at sun.reflect.DelegatingConstructorAccessorImpl.newInstance(DelegatingConstructorAccessorImpl.java:45)
        at java.lang.reflect.Constructor.newInstance(Constructor.java:423)
        at com.mysql.jdbc.Util.handleNewInstance(Util.java:404)
        at com.mysql.jdbc.ConnectionImpl.getInstance(ConnectionImpl.java:400)
        at com.mysql.jdbc.NonRegisteringDriver.connect(NonRegisteringDriver.java:327)
        at java.sql.DriverManager.getConnection(DriverManager.java:664)
        at java.sql.DriverManager.getConnection(DriverManager.java:247)
        at org.apache.hadoop.hive.metastore.tools.HiveSchemaHelper.getConnectionToMetastore(HiveSchemaHelper.java:88)
        ... 11 more
*** schemaTool failed ***

Fix (run in MySQL on node1):

GRANT ALL PRIVILEGES ON *.* TO 'root'@'%' IDENTIFIED BY 'root' WITH GRANT OPTION;
# on MySQL newer than 5.7, set the password and grant separately:
ALTER USER 'root'@'%' IDENTIFIED BY 'root';
GRANT ALL PRIVILEGES ON *.* TO 'root'@'%' WITH GRANT OPTION;

# reload the privilege tables
FLUSH PRIVILEGES;

# to settle this once and for all, grant remote access with your own password:
GRANT ALL PRIVILEGES ON *.* TO 'root'@'%' IDENTIFIED BY 'your_password' WITH GRANT OPTION;
Start Hive
hive
# sample Hive operations
hive> show databases;
OK
default
Time taken: 0.923 seconds, Fetched: 1 row(s)

hive> show tables;
OK
Time taken: 0.081 seconds

hive> create table tb_test(id int);
OK

Time taken: 1.001 seconds

hive> show tables;
OK
tb_test

Time taken: 0.087 seconds, Fetched: 1 row(s)

hive> create table test(id int,age int);
OK
Time taken: 0.123 seconds

hive> show tables;
OK
tb_test
test

3. Hive with a remote metastore connection