| | NODE1 | NODE2 | NODE3 | NODE4 |
|---|---|---|---|---|
| Zookeeper | | ✅ | ✅ | ✅ |
| ZKFC | ZKFC-1 | ZKFC-2 | | |
| HDFS-NameNode | NameNode | NameNode | | |
| HDFS-DataNode | | DataNode-1 | DataNode-2 | DataNode-3 |
| HDFS-JournalNode | JNN-1 | JNN-2 | JNN-3 | |
| YARN-ResourceManager | | | RM-1 | RM-2 |
| Spark | Master, Worker, HistoryServer | Worker | Worker | |
| MySQL | ✅ | | | |
| Hive | | ✅ | | |
Installation packages
Remote development
Pycharm远程连接Spark(超详细图文教程)_pycharm远程连接spark集群-CSDN博客
Prerequisites
1. SSH
Install and enable the SSH service
sudo yum install openssh-server
sudo systemctl start sshd
sudo systemctl enable sshd
Note: if the default repo cannot download the package, switch to another mirror; see [[换源#CentOS]]
2. Clock synchronization
crontab -e
0 1 * * * /usr/sbin/ntpdate ntp.aliyun.com
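A one-off manual sync is a quick way to confirm the NTP server is reachable before relying on the cron job (assumes ntpdate is installed; ntp.aliyun.com is the common Aliyun NTP endpoint):
# run once by hand and check that an offset is reported
sudo /usr/sbin/ntpdate ntp.aliyun.com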
3. Set the hostname
sudo hostnamectl set-hostname <new-hostname>
Verify with hostname
4. Disable the firewall
sudo systemctl stop firewalld
sudo systemctl disable firewalld
5. Configure hosts
# /etc/hosts
<ip-address> <hostname>
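For illustration, a mapping for the four nodes might look like this (the IP addresses below are placeholders; substitute your own):
# /etc/hosts — example entries for this cluster (hypothetical addresses)
192.168.1.101 node1
192.168.1.102 node2
192.168.1.103 node3
192.168.1.104 node4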
6. Install the JDK
sudo rpm -ivh jdk-8u221-linux-x64.rpm
# configure environment variables in ~/.bash_profile
export JAVA_HOME=/usr/java/jdk1.8.0_221-amd64
export PATH=$PATH:$JAVA_HOME/bin
# load the environment
source ~/.bash_profile
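A quick sanity check that the JDK and JAVA_HOME are picked up correctly:
java -version        # should report 1.8.0_221
echo $JAVA_HOME      # should print /usr/java/jdk1.8.0_221-amd64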
7. Passwordless SSH login
ssh-keygen -t rsa                                    # generate a key pair under ~/.ssh
cat ~/.ssh/id_rsa.pub >> ~/.ssh/authorized_keys      # register the local public key
scp ~/.ssh/authorized_keys bigdata@node2:~/.ssh/     # exchange keys so each node holds the public keys of all peers
chmod 600 ~/.ssh/authorized_keys                     # fix permissions
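A minimal sketch for distributing the key to every node in one pass, assuming the same bigdata user and the hostnames from /etc/hosts; ssh-copy-id appends the key and fixes permissions for you:
#!/bin/bash
# push the local public key to all nodes (each prompt asks for that node's password once)
for node in node1 node2 node3 node4
do
    ssh-copy-id -i ~/.ssh/id_rsa.pub bigdata@$node
done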
Zookeeper
1. Extract the package and rename it
tar -zxvf apache-zookeeper-3.5.7-bin.tar.gz -C ~/bigData
# in the target directory
mv apache-zookeeper-3.5.7-bin zookeeper-3.5.7
2. Configure ZooKeeper environment variables
export ZOOKEEPER_HOME=~/bigData/zookeeper-3.5.7
export PATH=$PATH:$JAVA_HOME/bin:$ZOOKEEPER_HOME/bin
# reload the environment
source ~/.bash_profile
3. Configure zoo.cfg
# zookeeper-3.5.7/conf/zoo.cfg
tickTime=2000
dataDir=/home/bigdata/bigData/zookeeper-3.5.7/data
clientPort=2181
initLimit=5
syncLimit=2
server.2=node2:2881:3881
server.3=node3:2881:3881
server.4=node4:2881:3881
4. Create myid
mkdir ~/bigData/zookeeper-3.5.7/data
cd ~/bigData/zookeeper-3.5.7/data
echo 2 > myid
5. Start
zkServer.sh start   # start
zkServer.sh status  # check status
zkServer.sh stop    # stop
==Note: zoo.cfg and myid must be set up on every ZooKeeper node, and each node's myid must match its server.N entry in zoo.cfg==
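A small sketch for setting the per-node myid over SSH, assuming ZooKeeper is already unpacked at the same path on node2-node4 and the IDs follow the server.N entries above:
#!/bin/bash
# write each node's myid to match server.2/3/4 in zoo.cfg
declare -A ids=( [node2]=2 [node3]=3 [node4]=4 )
for node in node2 node3 node4
do
    ssh $node "mkdir -p ~/bigData/zookeeper-3.5.7/data && echo ${ids[$node]} > ~/bigData/zookeeper-3.5.7/data/myid"
done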
Hadoop
HDFS
1. Extract the Hadoop archive
tar -zxvf hadoop-3.1.3.tar.gz -C ~/bigData/
2. Configure environment variables
# ~/.bash_profile
export HADOOP_HOME=/home/bigdata/bigData/hadoop-3.1.3
export PATH=$PATH:$JAVA_HOME/bin:$HADOOP_HOME/bin:$HADOOP_HOME/sbin
# load the environment
source ~/.bash_profile
3. Configure hadoop-env.sh
export JAVA_HOME=/usr/java/jdk1.8.0_221-amd64
4. Configure workers
node2
node3
node4
5. Configure core-site.xml
<configuration>
<!-- address of the NameNode, i.e. the default filesystem for HDFS -->
<property>
<name>fs.defaultFS</name>
<value>hdfs://node1:9820</value>
</property>
<!-- Hadoop data storage directory -->
<property>
<name>hadoop.tmp.dir</name>
<value>/home/bigdata/bigData/data/hadoop/full</value>
</property>
</configuration>
6. Configure hdfs-site.xml
<configuration>
<!-- NameNode web UI address -->
<property>
<name>dfs.namenode.http-address</name>
<value>node1:9870</value>
</property>
<!-- Secondary NameNode web UI address -->
<property>
<name>dfs.namenode.secondary.http-address</name>
<value>node2:9868</value>
</property>
<!-- number of replicas per block (default 3) -->
<property>
<name>dfs.replication</name>
<value>2</value>
</property>
</configuration>
7. Start
hdfs namenode -format
start-dfs.sh
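A quick smoke test after start-dfs.sh, using only standard HDFS commands, confirms that the DataNodes registered and files can be written:
hdfs dfsadmin -report                # DataNode count and capacity
hdfs dfs -mkdir -p /tmp/smoke
hdfs dfs -put /etc/hosts /tmp/smoke/
hdfs dfs -ls /tmp/smoke              # the uploaded file should be listed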
HDFS HA
Note: stop SSH from prompting for host fingerprint confirmation
# /etc/ssh/ssh_config
StrictHostKeyChecking no
1. core-site.xml
<configuration>
<!-- default filesystem for HDFS (the HA nameservice) -->
<property>
<name>fs.defaultFS</name>
<value>hdfs://mycluster</value>
</property>
<!-- data storage directory -->
<property>
<name>hadoop.tmp.dir</name>
<value>/home/bigdata/bigData/data/hadoop/ha</value>
</property>
<!-- location and client port of each ZooKeeper server -->
<property>
<name>ha.zookeeper.quorum</name>
<value>node2:2181,node3:2181,node4:2181</value>
</property>
<!-- avoid permission errors when creating/deleting files from the HDFS web UI -->
<property>
<name>hadoop.http.staticuser.user</name>
<value>root</value>
</property>
</configuration>
2. hdfs-site.xml
<configuration>
<!-- JournalNode data directory -->
<property>
<name>dfs.journalnode.edits.dir</name>
<value>${hadoop.tmp.dir}/dfs/journalnode/</value>
</property>
<!-- nameservice (cluster) name -->
<property>
<name>dfs.nameservices</name>
<value>mycluster</value>
</property>
<!-- NameNodes in the cluster -->
<property>
<name>dfs.ha.namenodes.mycluster</name>
<value>nn1,nn2</value>
</property>
<!-- NameNode RPC addresses -->
<property>
<name>dfs.namenode.rpc-address.mycluster.nn1</name>
<value>node1:9820</value>
</property>
<property>
<name>dfs.namenode.rpc-address.mycluster.nn2</name>
<value>node2:9820</value>
</property>
<!-- NameNode HTTP addresses -->
<property>
<name>dfs.namenode.http-address.mycluster.nn1</name>
<value>node1:9870</value>
</property>
<property>
<name>dfs.namenode.http-address.mycluster.nn2</name>
<value>node2:9870</value>
</property>
<!-- location of the shared NameNode edits on the JournalNodes -->
<property>
<name>dfs.namenode.shared.edits.dir</name>
<value>qjournal://node1:8485;node2:8485;node3:8485/mycluster</value>
</property>
<!-- proxy provider the client uses to determine which NameNode is active -->
<property>
<name>dfs.client.failover.proxy.provider.mycluster</name>
<value>org.apache.hadoop.hdfs.server.namenode.ha.ConfiguredFailoverProxyProvider</value>
</property>
<!-- fencing method: only one NameNode may serve requests at a time -->
<property>
<name>dfs.ha.fencing.methods</name>
<value>sshfence</value>
</property>
<!-- SSH private key used by the sshfence fencing method -->
<property>
<name>dfs.ha.fencing.ssh.private-key-files</name>
<value>/home/bigdata/.ssh/id_rsa</value>
</property>
<!-- enable automatic NameNode failover -->
<property>
<name>dfs.ha.automatic-failover.enabled</name>
<value>true</value>
</property>
</configuration>
3. Start
# node2 node3 node4
zkServer.sh start
# node1 node2 node3
hdfs --daemon start journalnode
# node1
hdfs namenode -format
hdfs --daemon start namenode
# node2
hdfs namenode -bootstrapStandby
hdfs --daemon start namenode
# node1
hdfs zkfc -formatZK
start-dfs.sh
==Note: if the active NameNode does not fail over automatically, install psmisc (sshfence needs the fuser command)==
yum install -y psmisc
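To verify the HA setup, the standard hdfs haadmin commands report which NameNode is active; after killing the active NameNode, the standby should take over within a few seconds:
hdfs haadmin -getServiceState nn1   # e.g. "active"
hdfs haadmin -getServiceState nn2   # e.g. "standby"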
Startup scripts
allJps.sh
#!/bin/bash
# ~/bin/allJps.sh
# list the Java processes on every node in the cluster
echo "-----------node1 jps--------------"
jps
for node in node2 node3 node4
do
echo "-----------$node jps--------------"
ssh $node "source ~/.bash_profile; jps"
done
starthdfs.sh
#!/bin/bash
# start the ZooKeeper ensemble
for node in node2 node3 node4
do
ssh $node "source ~/.bash_profile; zkServer.sh start"
done
# sleep 1 s
sleep 1
# start the HDFS cluster
start-dfs.sh
allJps.sh
stophdfs.sh
#!/bin/bash
source ~/.bash_profile # make the Hadoop/ZooKeeper commands available
echo "Stopping the HDFS cluster..."
stop-dfs.sh
sleep 1 # wait for HDFS to shut down completely
echo "Stopping the ZooKeeper ensemble..."
for node in node2 node3 node4
do
echo "Stopping ZooKeeper on $node..."
ssh $node "source ~/.bash_profile; zkServer.sh stop"
done
echo "Checking processes on each node..."
~/bin/allJps.sh
Yarn HA
1. Configure mapred-site.xml
<property>
<name>mapreduce.framework.name</name>
<value>yarn</value>
</property>
2. Configure yarn-site.xml
<configuration>
<!-- basic configuration -->
<property>
<name>yarn.nodemanager.aux-services</name>
<value>mapreduce_shuffle</value>
</property>
<property>
<name>yarn.nodemanager.aux-services.mapreduce_shuffle.class</name>
<value>org.apache.hadoop.mapred.ShuffleHandler</value>
</property>
<!-- core HA configuration -->
<property>
<name>yarn.resourcemanager.ha.enabled</name>
<value>true</value>
</property>
<property>
<name>yarn.resourcemanager.ha.automatic-failover.enabled</name>
<value>true</value>
</property>
<property>
<name>yarn.resourcemanager.ha.automatic-failover.embedded</name>
<value>true</value>
</property>
<property>
<name>yarn.resourcemanager.cluster-id</name>
<value>cluster-yarn1</value>
</property>
<property>
<name>yarn.resourcemanager.ha.rm-ids</name>
<value>rm1,rm2</value>
</property>
<property>
<name>yarn.resourcemanager.zk-address</name>
<value>node2:2181,node3:2181,node4:2181</value>
</property>
<!-- RM1 (node3) -->
<property>
<name>yarn.resourcemanager.hostname.rm1</name>
<value>node3</value>
</property>
<property>
<name>yarn.resourcemanager.address.rm1</name>
<value>node3:8032</value>
</property>
<property>
<name>yarn.resourcemanager.scheduler.address.rm1</name>
<value>node3:8030</value>
</property>
<property>
<name>yarn.resourcemanager.resource-tracker.address.rm1</name>
<value>node3:8031</value>
</property>
<property>
<name>yarn.resourcemanager.admin.address.rm1</name>
<value>node3:8033</value>
</property>
<property>
<name>yarn.resourcemanager.webapp.address.rm1</name>
<value>node3:8088</value>
</property>
<property>
<name>yarn.resourcemanager.ha.id.rm1</name>
<value>rm1</value>
</property>
<!-- RM2 (node4) -->
<property>
<name>yarn.resourcemanager.hostname.rm2</name>
<value>node4</value>
</property>
<property>
<name>yarn.resourcemanager.address.rm2</name>
<value>node4:8032</value>
</property>
<property>
<name>yarn.resourcemanager.scheduler.address.rm2</name>
<value>node4:8030</value>
</property>
<property>
<name>yarn.resourcemanager.resource-tracker.address.rm2</name>
<value>node4:8031</value>
</property>
<property>
<name>yarn.resourcemanager.admin.address.rm2</name>
<value>node4:8033</value>
</property>
<property>
<name>yarn.resourcemanager.webapp.address.rm2</name>
<value>node4:8088</value>
</property>
<property>
<name>yarn.resourcemanager.ha.id.rm2</name>
<value>rm2</value>
</property>
<!-- state store -->
<property>
<name>yarn.resourcemanager.recovery.enabled</name>
<value>true</value>
</property>
<property>
<name>yarn.resourcemanager.store.class</name>
<value>org.apache.hadoop.yarn.server.resourcemanager.recovery.ZKRMStateStore</value>
</property>
<!-- resource limits -->
<property>
<name>yarn.scheduler.minimum-allocation-mb</name>
<value>512</value>
</property>
<property>
<name>yarn.scheduler.maximum-allocation-mb</name>
<value>1024</value>
</property>
<property>
<name>yarn.nodemanager.resource.memory-mb</name>
<value>1024</value>
</property>
<property>
<name>yarn.nodemanager.pmem-check-enabled</name>
<value>false</value>
</property>
<property>
<name>yarn.nodemanager.vmem-check-enabled</name>
<value>false</value>
</property>
</configuration>
Startup scripts
startha.sh
#!/bin/bash
# start the ZooKeeper ensemble
for node in node2 node3 node4
do
ssh $node "source ~/.bash_profile; zkServer.sh start"
done
# sleep 3 s
sleep 3
# start the HDFS cluster
start-dfs.sh
# start YARN (ResourceManagers on node3/node4)
ssh node3 "source ~/.bash_profile; start-yarn.sh"
# list the Java processes on all four nodes
~/bin/allJps.sh
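After startha.sh finishes, the standard yarn rmadmin command shows which ResourceManager is active (rm1/rm2 as configured in yarn-site.xml above):
yarn rmadmin -getServiceState rm1   # ResourceManager on node3
yarn rmadmin -getServiceState rm2   # ResourceManager on node4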
stopha.sh
#!/bin/bash
# stop YARN
ssh node3 "source ~/.bash_profile; stop-yarn.sh"
# stop HDFS
stop-dfs.sh
# stop the ZooKeeper ensemble
for node in node2 node3 node4
do
ssh $node "source ~/.bash_profile; zkServer.sh stop"
done
# list the Java processes
allJps.sh
Spark
1. Install Conda
sh ./Anaconda3-Linux-x86_64.sh # run the installer
Welcome to Anaconda3 2021.05
In order to continue the installation process, please review the license
agreement.
Please, press ENTER to continue
>>> # press Enter
===================================
End User License Agreement - Anaconda Individual Edition
===================================
Copyright 2015-2021, Anaconda, Inc.
All rights reserved under the 3-clause BSD License:
This End User License Agreement (the "Agreement") is a legal agreement between you and Anaconda, Inc. ("Anaconda") and governs your use of Anaconda
Individual Edition (which was formerly known as Anaconda Distribution).
--------------more------------------ # press Space repeatedly (about 8 times) to page through the license
Do you accept the license terms? [yes|no]
[no] >>> yes # accept
Anaconda3 will now be installed into this location:
/home/bigdata/anaconda3
- Press ENTER to confirm the location
- Press CTRL-C to abort the installation
- Or specify a different location below
[/home/bigdata/anaconda3] >>> /home/bigdata/bigData/anaconda3 # installation path for anaconda3
PREFIX=/home/bigdata/bigData/anaconda3
Unpacking payload ...
Do you wish the installer to initialize Anaconda3
by running conda init? [yes|no]
[no] >>> yes # run conda init
2. Switch to a mirror
conda config --add channels https://mirrors.tuna.tsinghua.edu.cn/anaconda/pkgs/main/
conda config --add channels https://mirrors.tuna.tsinghua.edu.cn/anaconda/pkgs/free/
conda config --add channels https://mirrors.tuna.tsinghua.edu.cn/anaconda/cloud/conda-forge/
conda config --set show_channel_urls yes
# verify
conda config --show channels
3. Set up the PySpark environment
conda create -n pyspark_env python=3.10
conda activate pyspark_env
conda install pyspark
# hive
pip install pyhive jieba -i https://pypi.tuna.tsinghua.edu.cn/simple
4. Extract Spark
# extract the archive
tar -zxvf spark-3.2.1-bin-hadoop3.2.tgz -C ~/bigData/
# rename
mv spark-3.2.1-bin-hadoop3.2/ spark-3.2.1
5. Environment configuration
- JAVA_HOME: tells Spark where the JDK is (already configured)
- HADOOP_HOME: tells Spark where Hadoop is (already configured)
- HADOOP_CONF_DIR: tells Spark where the Hadoop configuration files are
- SPARK_HOME: the Spark installation path
- PYSPARK_PYTHON: the Python interpreter Spark uses to run Python programs
export HADOOP_CONF_DIR=$HADOOP_HOME/etc/hadoop
export SPARK_HOME=/home/bigdata/bigData/spark-3.2.1
export PYSPARK_PYTHON=/home/bigdata/bigData/anaconda3/envs/pyspark_env/bin/python
export PATH=$PATH:$SPARK_HOME/bin:$SPARK_HOME/sbin
source ~/.bash_profile
6. Local mode
# start pyspark in local mode
(pyspark_env) [bigdata@node1 ~]$ pyspark
Python 3.10.16 (main, Dec 11 2024, 16:24:50) [GCC 11.2.0] on linux
Type "help", "copyright", "credits" or "license" for more information.
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
2025-04-03 20:23:08,774 WARN util.NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Welcome to
____ __
/ __/__ ___ _____/ /__
_\ \/ _ \/ _ `/ __/ '_/
/__ / .__/\_,_/_/ /_/\_\ version 3.2.1
/_/
Using Python version 3.10.16 (main, Dec 11 2024 16:24:50)
Spark context Web UI available at http://node1:4040
Spark context available as 'sc' (master = local[*], app id = local-1743682990974).
SparkSession available as 'spark'.
>>>
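Batch jobs can be run the same way without the interactive shell; a quick local test using the Pi example bundled with Spark (the same script used in the YARN test further below):
spark-submit --master "local[*]" ~/bigData/spark-3.2.1/examples/src/main/python/pi.py 10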
7. Standalone mode
1. Configure the environment
cd $SPARK_HOME/conf
# 1. add worker nodes
vim workers
node1
node2
node3
# 2. configure the Spark environment
mv spark-env.sh.template spark-env.sh
vim spark-env.sh
# JDK installation directory
export JAVA_HOME=/usr/java/jdk1.8.0_221-amd64
# Hadoop configuration directory, needed to read files on HDFS and to run on the YARN cluster
export HADOOP_CONF_DIR=/home/bigdata/bigData/hadoop-3.1.3/etc/hadoop
export YARN_CONF_DIR=/home/bigdata/bigData/hadoop-3.1.3/etc/hadoop
# host the master runs on
export SPARK_MASTER_HOST=node1
# master communication port
export SPARK_MASTER_PORT=7077
# master web UI port
export SPARK_MASTER_WEBUI_PORT=8080
# CPU cores available to each worker
export SPARK_WORKER_CORES=1
# memory available to each worker
export SPARK_WORKER_MEMORY=1g
# worker communication port
export SPARK_WORKER_PORT=7078
# worker web UI port
export SPARK_WORKER_WEBUI_PORT=8081
# store Spark application history logs under /sparklogs on HDFS
export SPARK_HISTORY_OPTS="
-Dspark.history.fs.logDirectory=hdfs://mycluster/sparklogs/
-Dspark.history.fs.cleaner.enabled=true"
hdfs dfs -mkdir /sparklogs
hdfs dfs -chmod 777 /sparklogs
mv spark-defaults.conf.template spark-defaults.conf
vim spark-defaults.conf
# append the following
# enable Spark event logging
spark.eventLog.enabled true
# event log directory
spark.eventLog.dir hdfs://mycluster/sparklogs/
# compress event logs
spark.eventLog.compress true
mv log4j.properties.template log4j.properties
vim log4j.properties
# change the timestamp format (optional)
log4j.appender.console.layout.ConversionPattern=%d{MM/dd HH:mm:ss} %p %c{1}: %m%n
2. Start
start-history-server.sh
cd ~/bigData/spark-3.2.1/sbin
./start-all.sh
3. Run a job
pyspark --master spark://node1:7077 # connect to the standalone cluster; without --master it falls back to local mode
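Submitting a batch job to the standalone master works the same way; a sketch using the bundled Pi example (the same script used in the YARN test below):
spark-submit --master spark://node1:7077 ~/bigData/spark-3.2.1/examples/src/main/python/pi.py 20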
8. Standalone HA mode
spark-env.sh
# append the following
# Spark high-availability configuration
export SPARK_DAEMON_JAVA_OPTS="
-Dspark.deploy.recoveryMode=ZOOKEEPER \
-Dspark.deploy.zookeeper.url=node2:2181,node3:2181,node4:2181 \
-Dspark.deploy.zookeeper.dir=/spark-ha"
- Start
cd ~/bigData/spark-3.2.1/sbin
./start-all.sh
start-history-server.sh
# test on node2/node3
cd ~/bigData/spark-3.2.1/sbin
start-master.sh
jps # an extra Master process appears
stop-master.sh
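To confirm that the masters registered in ZooKeeper, the recovery directory configured above can be inspected with the ZooKeeper CLI (assumes the /spark-ha path set in spark-env.sh):
zkCli.sh -server node2:2181 ls /spark-ha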
9. Spark on YARN
1. Modify yarn-site.xml
# remove the following properties
<property>
<name>yarn.scheduler.minimum-allocation-mb</name>
<value>512</value>
</property>
<property>
<name>yarn.scheduler.maximum-allocation-mb</name>
<value>1024</value>
</property>
<!-- physical memory the NodeManager may allocate to containers -->
<property>
<name>yarn.nodemanager.resource.memory-mb</name>
<value>1024</value>
</property>
# make sure the following are present
<property>
<name>yarn.nodemanager.pmem-check-enabled</name>
<value>false</value>
</property>
<!-- whether to run a thread that checks the virtual memory used by each task and kills tasks that exceed their allocation; default is true -->
<property>
<name>yarn.nodemanager.vmem-check-enabled</name>
<value>false</value>
</property>
2. Restart the cluster
stopha.sh
startha.sh
3. Test
spark-submit --master yarn ~/bigData/spark-3.2.1/examples/src/main/python/pi.py 20
Note: a large amount of log output appears after the job starts; this is normal, not an error.
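To confirm the job actually ran on YARN rather than locally, the standard YARN CLI lists finished applications (the Pi job should appear with state FINISHED):
yarn application -list -appStates FINISHED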
Hive
1. MySQL
Install dependencies (net-tools, libaio)
yum install -y net-tools libaio
Extract the MySQL bundle
mkdir -p mysql && tar -xvf mysql-5.7.33-1.el7.x86_64.rpm-bundle.tar -C ./mysql
Install MySQL
sudo rpm -ivh mysql-community-*
# or install only the required packages
sudo rpm -ivh mysql-community-common-5.7.33-1.el7.x86_64.rpm \
mysql-community-client-5.7.33-1.el7.x86_64.rpm \
mysql-community-server-5.7.33-1.el7.x86_64.rpm \
mysql-community-libs-5.7.33-1.el7.x86_64.rpm \
mysql-community-devel-5.7.33-1.el7.x86_64.rpm
Initialize
sudo mysqld --initialize --user=mysql
Configure the database character set
sudo vim /etc/my.cnf
[client]
default-character-set=utf8
[mysql]
default-character-set=utf8
[mysqld]
character_set_server=utf8
Start the server
sudo systemctl start mysqld
# enable at boot
sudo systemctl enable mysqld
Set the password
# find the temporary password
sudo grep 'temporary password' /var/log/mysqld.log
mysql -u root -p
>> enter the temporary password
ALTER USER 'root'@'localhost' IDENTIFIED BY 'root'; # replace 'root' with a password of your choice
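A quick check that the new password works and the server is up:
mysql -uroot -proot -e "SELECT VERSION();"   # adjust the password if you chose a different one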
2. Hive connecting directly to the database
Extract the archive and rename it
tar -zxvf apache-hive-3.1.2-bin.tar.gz -C ~/bigData/
mv ~/bigData/apache-hive-3.1.2-bin ~/bigData/hive-3.1.2 # rename to match HIVE_HOME below
Add environment variables
vim ~/.bash_profile
# append
export HIVE_HOME=/home/bigdata/bigData/hive-3.1.2
export PATH=$PATH:$HIVE_HOME/bin
source ~/.bash_profile
Resolve the JAR conflict (Hive's log4j-slf4j-impl shadows Hadoop's SLF4J binding)
mv $HIVE_HOME/lib/log4j-slf4j-impl-2.10.0.jar $HIVE_HOME/lib/log4j-slf4j-impl-2.10.0.bak_up2021
Copy the MySQL JDBC driver into Hive's lib directory
cp mysql-connector-java-5.1.48.jar $HIVE_HOME/lib
Configure the metastore to use MySQL
vim $HIVE_HOME/conf/hive-site.xml
<?xml version="1.0"?>
<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
<configuration>
<!-- JDBC connection URL -->
<property>
<name>javax.jdo.option.ConnectionURL</name>
<value>jdbc:mysql://node1:3306/hive?useSSL=false</value>
</property>
<!-- JDBC driver class -->
<property>
<name>javax.jdo.option.ConnectionDriverName</name>
<value>com.mysql.jdbc.Driver</value>
</property>
<!-- JDBC username -->
<property>
<name>javax.jdo.option.ConnectionUserName</name>
<value>root</value>
</property>
<!-- JDBC password -->
<property>
<name>javax.jdo.option.ConnectionPassword</name>
<value>root</value>
</property>
<!-- Hive's default warehouse directory on HDFS -->
<property>
<name>hive.metastore.warehouse.dir</name>
<value>/home/bigdata/bigData/data/hive/warehouse</value>
</property>
<!-- metastore schema verification -->
<property>
<name>hive.metastore.schema.verification</name>
<value>false</value>
</property>
<!-- metastore event notification API authorization -->
<property>
<name>hive.metastore.event.db.notification.api.auth</name>
<value>false</value>
</property>
</configuration>
Create the hive database in MySQL on node1
sudo mysql -uroot -p
Enter password:
Welcome to the MySQL monitor. Commands end with ; or \g.
Your MySQL connection id is 6
Server version: 5.7.33 MySQL Community Server (GPL)
Copyright (c) 2000, 2021, Oracle and/or its affiliates.
Oracle is a registered trademark of Oracle Corporation and/or its
affiliates. Other names may be trademarks of their respective
owners.
Type 'help;' or '\h' for help. Type '\c' to clear the current input statement.
mysql> create database hive;
Query OK, 1 row affected (0.01 sec)
mysql> use hive;
Database changed
mysql> show tables;
Empty set (0.01 sec)
mysql>quit;
Bye
On node2, initialize the Hive metastore schema into the hive database in MySQL
schematool -initSchema -dbType mysql -verbose
Note: after initialization, the hive database on node1 contains 74 new metastore tables.
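A quick way to count those tables from node2, assuming the root/any-host grant from the troubleshooting section below is already in place:
mysql -h node1 -uroot -p -e "USE hive; SHOW TABLES;" | wc -l   # roughly 74 (+1 header line)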
Possible errors: 1. Guava version conflict
Exception in thread "main" java.lang.NoSuchMethodError: com.google.common.base.Preconditions.checkArgument(ZLjava/lang/String;Ljava/lang/Object;)V
    at org.apache.hadoop.conf.Configuration.set(Configuration.java:1357)
    at org.apache.hadoop.conf.Configuration.set(Configuration.java:1338)
    at org.apache.hadoop.mapred.JobConf.setJar(JobConf.java:518)
    at org.apache.hadoop.mapred.JobConf.setJarByClass(JobConf.java:536)
    at org.apache.hadoop.mapred.JobConf.<init>(JobConf.java:430)
    at org.apache.hadoop.hive.conf.HiveConf.initialize(HiveConf.java:5141)
    at org.apache.hadoop.hive.conf.HiveConf.<init>(HiveConf.java:5104)
    at org.apache.hive.beeline.HiveSchemaTool.<init>(HiveSchemaTool.java:96)
    at org.apache.hive.beeline.HiveSchemaTool.main(HiveSchemaTool.java:1473)
    at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
    at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
    at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
    at java.lang.reflect.Method.invoke(Method.java:498)
    at org.apache.hadoop.util.RunJar.run(RunJar.java:318)
    at org.apache.hadoop.util.RunJar.main(RunJar.java:232)
Solution:
# download Guava 21.0 or later:
wget https://repo1.maven.org/maven2/com/google/guava/guava/21.0/guava-21.0.jar
# move it into `$HIVE_HOME/lib`:
mv guava-21.0.jar $HIVE_HOME/lib/
# back up and remove the old Guava
cp $HIVE_HOME/lib/guava-19.0.jar $HIVE_HOME/lib/guava-19.0.jar.bak
rm $HIVE_HOME/lib/guava-19.0.jar
2. MySQL permission problem:
Metastore connection URL: jdbc:mysql://node1:3306/hive?useSSL=false
Metastore Connection Driver : com.mysql.jdbc.Driver
Metastore connection User: root
org.apache.hadoop.hive.metastore.HiveMetaException: Failed to get schema version.
Underlying cause: java.sql.SQLException : null, message from server: "Host 'node2' is not allowed to connect to this MySQL server"
SQL Error code: 1130
org.apache.hadoop.hive.metastore.HiveMetaException: Failed to get schema version.
at org.apache.hadoop.hive.metastore.tools.HiveSchemaHelper.getConnectionToMetastore(HiveSchemaHelper.java:94)
at org.apache.hive.beeline.HiveSchemaTool.getConnectionToMetastore(HiveSchemaTool.java:169)
at org.apache.hive.beeline.HiveSchemaTool.testConnectionToMetastore(HiveSchemaTool.java:475)
at org.apache.hive.beeline.HiveSchemaTool.doInit(HiveSchemaTool.java:581)
at org.apache.hive.beeline.HiveSchemaTool.doInit(HiveSchemaTool.java:567)
at org.apache.hive.beeline.HiveSchemaTool.main(HiveSchemaTool.java:1517)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:498)
at org.apache.hadoop.util.RunJar.run(RunJar.java:318)
at org.apache.hadoop.util.RunJar.main(RunJar.java:232)
Caused by: java.sql.SQLException: null, message from server: "Host 'node2' is not allowed to connect to this MySQL server"
at com.mysql.jdbc.SQLError.createSQLException(SQLError.java:959)
at com.mysql.jdbc.SQLError.createSQLException(SQLError.java:898)
at com.mysql.jdbc.SQLError.createSQLException(SQLError.java:887)
at com.mysql.jdbc.MysqlIO.doHandshake(MysqlIO.java:1038)
at com.mysql.jdbc.ConnectionImpl.coreConnect(ConnectionImpl.java:2254)
at com.mysql.jdbc.ConnectionImpl.connectOneTryOnly(ConnectionImpl.java:2285)
at com.mysql.jdbc.ConnectionImpl.createNewIO(ConnectionImpl.java:2084)
at com.mysql.jdbc.ConnectionImpl.<init>(ConnectionImpl.java:795)
at com.mysql.jdbc.JDBC4Connection.<init>(JDBC4Connection.java:44)
at sun.reflect.NativeConstructorAccessorImpl.newInstance0(Native Method)
at sun.reflect.NativeConstructorAccessorImpl.newInstance(NativeConstructorAccessorImpl.java:62)
at sun.reflect.DelegatingConstructorAccessorImpl.newInstance(DelegatingConstructorAccessorImpl.java:45)
at java.lang.reflect.Constructor.newInstance(Constructor.java:423)
at com.mysql.jdbc.Util.handleNewInstance(Util.java:404)
at com.mysql.jdbc.ConnectionImpl.getInstance(ConnectionImpl.java:400)
at com.mysql.jdbc.NonRegisteringDriver.connect(NonRegisteringDriver.java:327)
at java.sql.DriverManager.getConnection(DriverManager.java:664)
at java.sql.DriverManager.getConnection(DriverManager.java:247)
at org.apache.hadoop.hive.metastore.tools.HiveSchemaHelper.getConnectionToMetastore(HiveSchemaHelper.java:88)
... 11 more
*** schemaTool failed ***
Solution:
GRANT ALL PRIVILEGES ON *.* TO 'root'@'%' IDENTIFIED BY 'root' WITH GRANT OPTION;
# for MySQL newer than 5.7:
ALTER USER 'root'@'%' IDENTIFIED BY 'root';
GRANT ALL PRIVILEGES ON *.* TO 'root'@'%' WITH GRANT OPTION;
# reload the grant tables
FLUSH PRIVILEGES;
# to settle it once and for all, grant root access from any host:
GRANT ALL PRIVILEGES ON *.* TO 'root'@'%' IDENTIFIED BY 'your_password' WITH GRANT OPTION;
Start Hive
hive
# example Hive operations
hive> show databases;
OK
default
Time taken: 0.923 seconds, Fetched: 1 row(s)
hive> show tables;
OK
Time taken: 0.081 seconds
hive> create table tb_test(id int);
OK
Time taken: 1.001 seconds
hive> show tables;
OK
tb_test
Time taken: 0.087 seconds, Fetched: 1 row(s)
hive> create table test(id int,age int);
OK
Time taken: 0.123 seconds
hive> show tables;
OK
tb_test
test