hadoop
- Set hostnames
# xx.xx.xx.11
hostnamectl set-hostname master
# xx.xx.xx.12
hostnamectl set-hostname slave1
# xx.xx.xx.43
hostnamectl set-hostname slave2
- Map IPs to hostnames
# xx.xx.xx.11 xx.xx.xx.12 xx.xx.xx.43
vim /etc/hosts
Add the following mappings:
xx.xx.xx.11 master
xx.xx.xx.12 slave1
xx.xx.xx.43 slave2
- Time settings: all cluster nodes must have the same time
# xx.xx.xx.11 xx.xx.xx.12 xx.xx.xx.43
# check the current time
date
# set the time
date -s 09:38:00
- Passwordless SSH login (Hadoop cluster)
# xx.xx.xx.11 xx.xx.xx.12 xx.xx.xx.43
ssh-keygen -t rsa -P '' -f ~/.ssh/id_rsa
cat ~/.ssh/id_rsa.pub >> ~/.ssh/authorized_keys
chmod 0600 ~/.ssh/authorized_keys
# xx.xx.xx.11 xx.xx.xx.12 xx.xx.xx.43
ssh-copy-id -i ~/.ssh/id_rsa.pub master
ssh-copy-id -i ~/.ssh/id_rsa.pub slave1
ssh-copy-id -i ~/.ssh/id_rsa.pub slave2
- Java environment: already installed, skipped
- Hadoop environment setup
Download, then upload to the master machine
# xx.xx.xx.11 extract
tar -xvf hadoop-3.2.3.tar.gz
# environment variables
vim /etc/profile
#hadoop
export HADOOP_HOME=/opt/hadoop-3.2.3
export PATH=$PATH:$HADOOP_HOME/bin:$HADOOP_HOME/sbin
source /etc/profile
- Create data storage directories
sudo mkdir -p /usr/local/data/hadoop/name
sudo mkdir -p /usr/local/data/hadoop/secondary
sudo mkdir -p /usr/local/data/hadoop/data
sudo mkdir -p /usr/local/data/hadoop/tmp
- Configure hadoop-env.sh
export JAVA_HOME=/root/jdk1.8.0_251
export HDFS_NAMENODE_USER="root"
export HDFS_DATANODE_USER="root"
export HDFS_SECONDARYNAMENODE_USER="root"
export YARN_RESOURCEMANAGER_USER="root"
export YARN_NODEMANAGER_USER="root"
- Configure core-site.xml
<configuration>
<property>
<name>fs.defaultFS</name>
<value>hdfs://master:9820</value>
<description>HDFS internal communication address</description>
</property>
<property>
<name>hadoop.tmp.dir</name>
<value>/usr/local/data/hadoop/tmp</value>
<description>Base directory used by other temporary directories</description>
</property>
<!-- Required for Hive connections; omitting these proxyuser settings causes errors -->
<property>
<name>hadoop.proxyuser.root.hosts</name>
<value>*</value>
</property>
<property>
<name>hadoop.proxyuser.root.groups</name>
<value>*</value>
</property>
</configuration>
- Configure hdfs-site.xml
<configuration>
<property>
<name>dfs.replication</name>
<value>2</value>
<description>Replication factor: number of copies of each block stored in HDFS</description>
</property>
<property>
<name>dfs.namenode.name.dir</name>
<value>/usr/local/data/hadoop/name</value>
<description>Directory where the NameNode stores its metadata</description>
</property>
<property>
<name>dfs.datanode.data.dir</name>
<value>/usr/local/data/hadoop/data</value>
<description>Directory where the DataNode stores its data blocks</description>
</property>
<property>
<name>dfs.permissions.enabled</name>
<value>false</value>
</property>
<!-- <property>
<name>dfs.namenode.http-address</name>
<value>master:9870</value>
<description>HDFS web UI address</description>
</property> -->
</configuration>
- Configure yarn-site.xml:
<configuration>
<property>
<name>yarn.nodemanager.aux-services</name>
<value>mapreduce_shuffle</value>
<description>The NodeManager fetches map output via the mapreduce_shuffle auxiliary service</description>
</property>
<property>
<name>yarn.resourcemanager.hostname</name>
<value>master</value>
<description>Hostname of the YARN ResourceManager</description>
</property>
<!-- <property>
<name>yarn.resourcemanager.webapp.address</name>
<value>192.168.192.164:8088</value>
<description>Make the YARN web UI externally accessible (public IP:port)</description>
</property> -->
<!-- -->
<!-- <property>
<name>yarn.application.classpath</name>
<value>$hadoop classpath</value>
<description>Value obtained by running "hadoop classpath" on the command line</description>
</property> -->
<!-- <property>
<name>yarn.nodemanager.env-whitelist</name>
<value> JAVA_HOME,HADOOP_COMMON_HOME,HADOOP_HDFS_HOME,HADOOP_CONF_DIR,CLASSPATH_PREPEND_DISTCACHE,HADOOP_YARN_HOME,HADOOP_HOME,PATH,LANG,TZ</value>
<description>Environment variables that containers inherit instead of the NodeManager defaults</description>
</property> -->
<!-- <property>
<name>yarn.nodemanager.vmem-check-enabled</name>
<value>false</value>
<description>Disable virtual-memory checking; needed on VMs, otherwise jobs fail</description>
</property> -->
</configuration>
- Configure mapred-site.xml
<configuration>
<property>
<name>mapreduce.framework.name</name>
<value>yarn</value>
<description>Tell Hadoop to run MapReduce (MR) jobs on YARN</description>
</property>
<property>
<name>yarn.app.mapreduce.am.env</name>
<value>HADOOP_MAPRED_HOME=${HADOOP_HOME}</value>
</property>
<property>
<name>mapreduce.map.env</name>
<value>HADOOP_MAPRED_HOME=${HADOOP_HOME}</value>
</property>
<property>
<name>mapreduce.reduce.env</name>
<value>HADOOP_MAPRED_HOME=${HADOOP_HOME}</value>
</property>
</configuration>
- Configure worker hostnames
vim workers # since Hadoop 3.0, "slaves" has been renamed to "workers"
master
slave1
slave2
- Distribute the installation
cd /opt
scp -r hadoop-3.2.3 root@slave1:/opt/hadoop-3.2.3
scp -r hadoop-3.2.3 root@slave2:/opt/hadoop-3.2.3
- Update the configuration on the worker nodes
Omitted
- Start
Run on master only
# format the NameNode (first start only)
hdfs namenode -format
# start everything at once
start-all.sh
# or start components individually
start-dfs.sh
start-yarn.sh
- Verification
<!-- https://mvnrepository.com/artifact/org.apache.hadoop/hadoop-common -->
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-common</artifactId>
<version>3.2.3</version>
</dependency>
<!-- https://mvnrepository.com/artifact/org.apache.hadoop/hadoop-client -->
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-client</artifactId>
<version>3.2.3</version>
</dependency>
<!-- https://mvnrepository.com/artifact/org.apache.hadoop/hadoop-hdfs -->
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-hdfs</artifactId>
<version>3.2.3</version>
</dependency>
package net.riking;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IOUtils;
import java.io.IOException;
import java.net.URI;
import java.net.URISyntaxException;
public class HadoopTest {
public static Configuration conf = new Configuration();
public static String url = "hdfs://master:9820";
public static String user = "root";
public static void main(String[] args) {
try {
mkdir();
create();
cat();
} catch (Exception e) {
e.printStackTrace();
}
}
public static FileSystem getFS() {
// Build the client configuration
Configuration conf = new Configuration();
conf.set("dfs.replication", "1");
conf.set("dfs.client.use.datanode.hostname", "true");
System.setProperty("HADOOP_USER_NAME", "root");
FileSystem fs = null;
try {
URI uri = new URI(url);
fs = FileSystem.get(uri, conf);
} catch (Exception e) {
e.printStackTrace();
}
return fs;
}
public static void mkdir() throws URISyntaxException, IOException, InterruptedException {
// 1. Create the configuration
//// make sure it is org.apache.hadoop.conf.Configuration
// Configuration conf = new Configuration();
//// 2. Get the file system
// FileSystem fs = FileSystem.get(new URI(url), conf, "root");
FileSystem fs = getFS();
// 3. Call the API
fs.mkdirs(new Path("/user/java"));
// 4. Close the file system
fs.close();
}
// Create a file and write data to it
public static void create() throws URISyntaxException, IOException, InterruptedException {
FileSystem fs = getFS();
// 3. Call the API
// create the file and write data to it
FSDataOutputStream out = fs.create(new Path("/user/java/c.txt"));
out.write("Hello,HDFS \n".getBytes());
out.flush();
// 4. Close the stream
fs.close();
}
// Read the file and print it to the console
public static void cat() throws URISyntaxException, IOException, InterruptedException {
// 3. Call the API
// open the file and copy it to stdout
FileSystem fs = getFS();
// FileStatus[] fileStatuses = fs.listStatus(new Path("/"));
// Arrays.stream(fileStatuses).forEach(e->{
// System.err.println(e.getPath().getName());
// });
FSDataInputStream in = fs.open(new Path("/user/java/c.txt"));
IOUtils.copyBytes(in, System.out, 1024);
// 4. Close the stream
fs.close();
}
}
hive
- Download and extract
tar -xvf apache-hive-3.1.2-bin.tar.gz
- Configure environment variables
export HIVE_HOME=/opt/apache-hive-3.1.2-bin
export HIVE_CONF_DIR=${HIVE_HOME}/conf
export PATH=$PATH:$HIVE_HOME/bin
- Configuration files
cp hive-env.sh.template hive-env.sh
- Configure hive-site.xml
cp hive-default.xml.template hive-site.xml
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
<configuration>
<property>
<name>hive.metastore.warehouse.dir</name>
<value>/root/hive/warehouse</value>
<description>location of default database for the warehouse</description>
</property>
<!-- MySQL connection URL; hive3 is the metastore database in MySQL (created automatically because createDatabaseIfNotExist=true) -->
<property>
<name>javax.jdo.option.ConnectionURL</name>
<value>jdbc:mysql://xx.xx.xx.50:3306/hive3?createDatabaseIfNotExist=true</value>
</property>
<!-- MySQL driver -->
<property>
<name>javax.jdo.option.ConnectionDriverName</name>
<value>com.mysql.cj.jdbc.Driver</value>
<!-- <value>com.mysql.jdbc.Driver</value> -->
</property>
<!-- MySQL username -->
<property>
<name>javax.jdo.option.ConnectionUserName</name>
<value>root</value>
</property>
<!-- MySQL password -->
<property>
<name>javax.jdo.option.ConnectionPassword</name>
<value>123456</value>
</property>
<property>
<name>hive.server2.thrift.client.user</name>
<value>root</value>
</property>
<property>
<name>hive.server2.thrift.client.password</name>
<value>123456</value>
</property>
</configuration>
hadoop fs -mkdir -p /root/hive/warehouse
- Copy the MySQL JDBC driver jar
cd /opt
cp mysql-connector-java-8.0.18.jar apache-hive-3.1.2-bin/lib/mysql-connector-java-8.0.18.jar
- Start (initialize the metastore schema first)
schematool -initSchema -dbType mysql
- If initialization fails
Swap in the guava-27.0-jre.jar that ships with Hadoop
cd /opt/hadoop-3.2.3/share/hadoop/common/lib
cp guava-27.0-jre.jar /opt/apache-hive-3.1.2-bin/lib/guava-27.0-jre.jar
cd /opt/apache-hive-3.1.2-bin/lib
rm -rf guava-19.0.jar
# re-run the initialization
schematool -initSchema -dbType mysql
# enter the Hive CLI
hive
# start the services in the background
nohup hive --service metastore >/usr/hive/metastore.log 2>&1 &
nohup hive --service hiveserver2 >/usr/hive/hiveserver2.log 2>&1 &
<!-- https://mvnrepository.com/artifact/org.apache.hive/hive-jdbc -->
<dependency>
<groupId>org.apache.hive</groupId>
<artifactId>hive-jdbc</artifactId>
<version>3.1.2</version>
</dependency>
import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.ResultSet;
import java.sql.Statement;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
public class HiveConfig {
private final String driverName = "org.apache.hive.jdbc.HiveDriver";
private final String url = "jdbc:hive2://xx.xx.xx.11:10000/db_hivetest";
private final String user = "root";
private final String password = "123456";
public List<Map<String, Object>> queryData() throws Exception {
// Register driver and create driver instance
Class.forName(driverName);
// get connection
Connection con = DriverManager.getConnection(url, user, password);
// create statement
Statement stmt = con.createStatement();
// execute statement
ResultSet resultSet = stmt.executeQuery("SELECT * FROM student where id>1 limit 100");
List<Map<String, Object>> list = new ArrayList<>();
while (resultSet.next()) {
Map<String, Object> entity = new HashMap<>();
entity.put("id", resultSet.getLong("id"));
entity.put("name", resultSet.getString("name"));
// entity.put("age", resultSet.getInt("age"));
// entity.put("ptd", resultSet.getString("pt_d"));
list.add(entity);
System.out.println(resultSet.getLong(1) + "\t" + resultSet.getString(2) + "\t" );
}
System.out.println("Table student queried.");
resultSet.close();
stmt.close();
con.close();
return list;
}
public static void main(String[] args) throws Exception {
HiveConfig hiveConfig = new HiveConfig();
hiveConfig.queryData();
}
}
spark
- Download and extract
tar -xvzf spark-3.1.3-bin-hadoop3.2.tgz
- Environment variables
export HADOOP_CONF_DIR=$HADOOP_HOME/etc/hadoop
export YARN_CONF_DIR=$HADOOP_HOME/etc/hadoop
export SPARK_HOME=/opt/spark-3.1.3-bin-hadoop3.2
export PATH=$PATH:$SPARK_HOME/bin
- Edit the configuration
cd /opt/spark-3.1.3-bin-hadoop3.2/conf
cp spark-env.sh.template spark-env.sh
vim spark-env.sh
export SPARK_MASTER_HOST=master
cp workers.template workers
vim workers
master
slave1
slave2
cp spark-defaults.conf.template spark-defaults.conf
vim spark-defaults.conf
spark.eventLog.enabled true
spark.eventLog.dir hdfs://master:9820/spark-logs
spark.history.provider org.apache.spark.deploy.history.FsHistoryProvider
spark.history.fs.logDirectory hdfs://master:9820/spark-logs
spark.history.fs.update.interval 10s
spark.history.ui.port 18080
Create the Spark event-log directory in HDFS
hadoop fs -mkdir /spark-logs
hadoop fs -chmod 777 /spark-logs
vim spark-env.sh
# add the following
export SPARK_HISTORY_OPTS="-Dspark.history.ui.port=18080 -Dspark.history.retainedApplications=2 -Dspark.history.fs.logDirectory=hdfs://master:9820/spark-logs"
- Distribute the installation
cd /opt
scp -r spark-3.1.3-bin-hadoop3.2 root@slave1:/opt/spark-3.1.3-bin-hadoop3.2
scp -r spark-3.1.3-bin-hadoop3.2 root@slave2:/opt/spark-3.1.3-bin-hadoop3.2
- Worker-node environment variables
Same as above
- Test
run-example SparkPi 10
Expected output:
Pi is roughly 3.1413871413871415
- Start
cd /opt/spark-3.1.3-bin-hadoop3.2/sbin
# standalone mode: the SparkSession in code sets the master to spark://ip:7077 (see the sketch below)
start-all.sh
# history server: http://xx.xx.xx.11:18080
start-history-server.sh
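A minimal Java sketch of a standalone-mode job, assuming the Spark 3.1.3 jars are on the classpath; the class name and the trivial count job are illustrative, not part of the original notes:
import org.apache.spark.sql.SparkSession;

public class StandaloneMasterExample {
    public static void main(String[] args) {
        // Standalone mode: point the SparkSession at the master started by start-all.sh
        SparkSession spark = SparkSession.builder()
                .appName("StandaloneMasterExample")
                .master("spark://master:7077")
                .getOrCreate();
        // Trivial job just to confirm the cluster is reachable
        long count = spark.range(0, 1000).count();
        System.out.println("count = " + count);
        spark.stop();
    }
}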
- Integrate with Hive
cd /opt/apache-hive-3.1.2-bin/conf
vim hive-site.xml
<!-- Empty means embedded/local metastore mode; set it to use a remote metastore -->
<property>
<name>hive.metastore.uris</name>
<value>thrift://master:9083</value>
</property>
# start the services in the background
nohup hive --service metastore >/usr/hive/metastore.log 2>&1 &
nohup hive --service hiveserver2 >/usr/hive/hiveserver2.log 2>&1 &
cd /opt/apache-hive-3.1.2-bin/conf
cp hive-site.xml /opt/spark-3.1.3-bin-hadoop3.2/conf/
cp /opt/apache-hive-3.1.2-bin/lib/mysql-connector-java-8.0.18.jar /opt/spark-3.1.3-bin-hadoop3.2/jars/
Test:
spark-sql
show databases;
The Hive databases should now be listed. The same tables can also be read from application code; see the sketch below.
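A minimal sketch of reading Hive from a Spark application (not just spark-sql), assuming the spark-hive module is available and hive-site.xml is on Spark's classpath as configured above; the class name is illustrative, and the table name is taken from the spark-submit examples below:
import org.apache.spark.sql.SparkSession;

public class SparkHiveExample {
    public static void main(String[] args) {
        // enableHiveSupport() makes Spark use the Hive metastore from hive-site.xml
        SparkSession spark = SparkSession.builder()
                .appName("SparkHiveExample")
                .enableHiveSupport()
                .getOrCreate();
        // Should list the same databases as "show databases;" in spark-sql
        spark.sql("show databases").show();
        spark.sql("SELECT * FROM db_hivetest.student LIMIT 10").show();
        spark.stop();
    }
}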
- Spark on YARN vs. standalone
In YARN mode there is no need to start the Spark cluster (no start-all.sh), and the SparkSession in the code must not hard-code a master.
Deploy modes:
yarn-client: the driver runs in the submitting client process
yarn-cluster: the driver runs inside the ApplicationMaster on the cluster
standalone: uses Spark's own master/worker daemons (spark://master:7077), started with start-all.sh
- spark-submit (a sketch of the submitted test class follows after these commands)
# standalone
spark-submit --master spark://master:7077 --class net.xxx.SparkSubmitTest /opt/hive3-1.0.jar aaa db_hivetest.student
# yarn client
spark-submit --master yarn --deploy-mode client --class net.xxx.SparkSubmitTest /opt/hive3-1.0.jar aaa db_hivetest.student
# yarn cluster
spark-submit --master yarn --deploy-mode cluster --class net.xxx.SparkSubmitTest /opt/hive3-1.0.jar aaa db_hivetest.student
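The net.xxx.SparkSubmitTest class referenced above is not included in these notes; the following is a hypothetical sketch of what it might look like, reading the two command-line arguments (a label such as aaa and a Hive table such as db_hivetest.student) and leaving the master to the --master flag so the same jar works in standalone and YARN modes:
import org.apache.spark.sql.SparkSession;

public class SparkSubmitTest {
    public static void main(String[] args) {
        // args[0]: an arbitrary label, args[1]: the Hive table to query (both hypothetical)
        String label = args[0];
        String table = args[1];
        // No .master() here: spark-submit supplies it via --master / --deploy-mode
        SparkSession spark = SparkSession.builder()
                .appName("SparkSubmitTest-" + label)
                .enableHiveSupport()
                .getOrCreate();
        spark.sql("SELECT * FROM " + table + " LIMIT 10").show();
        spark.stop();
    }
}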