Basic environment setup
hostnamectl set-hostname master
vi /etc/hosts
tzselect
vi /etc/profile
echo "TZ='Asia/Shanghai'; export TZ" >> /etc/profile && source /etc/profile
vi /etc/ntp.conf
server 127.127.1.0
fudge 127.127.1.0 stratum 10
service ntpd start
systemctl status ntpd
crontab -e
*/30 10-17 * * * /usr/sbin/ntpdate master
ssh-keygen -t rsa
ssh-copy-id master
ssh localhost
ssh-copy-id slave1
ssh-copy-id slave2
mkdir /usr/java
tar -zxvf jdk1.8.0_221.tar.gz -C /usr/java
vi /etc/profile
export JAVA_HOME=/usr/java/jdk1.8.0_171   # must match the directory actually extracted under /usr/java (the tarball above is jdk1.8.0_221)
export PATH=$PATH:$JAVA_HOME/bin
source /etc/profile
java -version
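vi /etc/hosts needs the hostname-to-IP mapping for every node; a minimal sketch, reusing the example addresses that appear in the node-addition section later in these notes (replace with the cluster's real IPs):

# /etc/hosts (example addresses only)
172.18.39.85 master
172.18.39.89 slave1
172.18.39.86 slave2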
mkdir /usr/zookeeper
tar -xvf zookeeper-3.4.10.tar.gz -C /usr/zookeeper
vi /etc/profile
export ZOOKEEPER_HOME=/usr/zookeeper/zookeeper-3.4.10
export PATH=$PATH:$ZOOKEEPER_HOME/bin
cd /usr/zookeeper/zookeeper-3.4.10/conf
mv zoo_sample.cfg zoo.cfg
vi zoo.cfg
dataDir=/usr/zookeeper/zookeeper-3.4.10/zkdata
dataLogDir=/usr/zookeeper/zookeeper-3.4.10/zkdatalog
server.1=master:2888:3888
server.2=slave1:2888:3888
server.3=slave2:2888:3888
cd /usr/zookeeper/zookeeper-3.4.10 && mkdir zkdata zkdatalog
cd /usr/zookeeper/zookeeper-3.4.10/zkdata && touch myid
echo 1 >> myid
cd /usr/zookeeper/zookeeper-3.4.10
bin/zkServer.sh start
bin/zkServer.sh status
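Each ZooKeeper server's myid must match its server.N line in zoo.cfg, so slave1 and slave2 need their own myid files as well; a minimal sketch, assuming the same /usr/zookeeper layout exists on those nodes:

# on slave1 (server.2 in zoo.cfg)
echo 2 > /usr/zookeeper/zookeeper-3.4.10/zkdata/myid
# on slave2 (server.3 in zoo.cfg)
echo 3 > /usr/zookeeper/zookeeper-3.4.10/zkdata/myid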
mkdir -p /usr/hadoop
tar -zxvf hadoop-2.7.3.tar.gz -C /usr/hadoop
vi /etc/profile
export HADOOP_HOME=/usr/hadoop/hadoop-2.7.3
export PATH=$PATH:$HADOOP_HOME/bin
export PATH=$PATH:$HADOOP_HOME/sbin
source /etc/profile
cd /usr/hadoop/hadoop-2.7.3/etc/hadoop && vi hadoop-env.sh
export JAVA_HOME=/usr/java/jdk1.8.0_171
---------vi core-site.xml----------------
<property>
  <name>fs.default.name</name>
  <value>hdfs://master:9000</value>
</property>
<property>
  <name>hadoop.tmp.dir</name>
  <value>/usr/hadoop/hadoop-2.7.3/hdfs/tmp</value>
</property>
<property>
  <name>io.file.buffer.size</name>
  <value>131072</value>
</property>
<property>
  <name>fs.checkpoint.period</name>
  <value>60</value>
</property>
<property>
  <name>fs.checkpoint.size</name>
  <value>67108864</value>
</property>
---------hdfs-site.xml----------------
<property>
  <name>dfs.replication</name>
  <value>2</value>
</property>
<property>
  <name>dfs.namenode.name.dir</name>
  <value>file:/usr/hadoop/hadoop-2.7.3/hdfs/name</value>
</property>
<property>
  <name>dfs.datanode.data.dir</name>
  <value>file:/usr/hadoop/hadoop-2.7.3/hdfs/data</value>
</property>
vi yarn-env.sh
export JAVA_HOME=/usr/java/jdk1.8.0_171
-----------vi yarn-site.xml--------------
<property>
  <name>yarn.resourcemanager.address</name>
  <value>master:18040</value>
</property>
<property>
  <name>yarn.resourcemanager.scheduler.address</name>
  <value>master:18030</value>
</property>
<property>
  <name>yarn.resourcemanager.webapp.address</name>
  <value>master:18088</value>
</property>
<property>
  <name>yarn.resourcemanager.resource-tracker.address</name>
  <value>master:18025</value>
</property>
<property>
  <name>yarn.resourcemanager.admin.address</name>
  <value>master:18141</value>
</property>
<property>
  <name>yarn.nodemanager.aux-services</name>
  <value>mapreduce_shuffle</value>
</property>
<property>
  <name>yarn.nodemanager.aux-services.mapreduce_shuffle.class</name>
  <value>org.apache.hadoop.mapred.ShuffleHandler</value>
</property>
-----------vi mapred-site.xml------------
<property>
  <name>mapreduce.framework.name</name>
  <value>yarn</value>
</property>
cd /usr/hadoop/hadoop-2.7.3/etc/hadoop/
echo master > master && echo slave1 > slaves && echo slave2 >> slaves
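Two steps are implicit here: Hadoop 2.7.3 ships only mapred-site.xml.template, so the file must be created before editing it, and the slave nodes need the same installation and configuration as master. A sketch, assuming /usr/hadoop is not already present on the slaves:

cd /usr/hadoop/hadoop-2.7.3/etc/hadoop
cp mapred-site.xml.template mapred-site.xml
scp -r /usr/hadoop root@slave1:/usr/
scp -r /usr/hadoop root@slave2:/usr/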
hadoop namenode -format
start-all.sh
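To verify the start-up, jps on each node should show the HDFS and YARN daemons; roughly, with the roles configured above:

jps
# master : NameNode, SecondaryNameNode, ResourceManager (plus QuorumPeerMain if ZooKeeper is running)
# slave1 : DataNode, NodeManager (plus QuorumPeerMain)
# slave2 : DataNode, NodeManager (plus QuorumPeerMain)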
systemctl start mysqld
systemctl status mysqld
grep "temporary password" /var/log/mysqld.log
mysql> set global validate_password_policy=0;
mysql> set global validate_password_length=4;
mysql> alter user 'root'@'localhost' identified by '123456';
mysql> grant all privileges on *.* to 'root'@'%' identified by '123456' with grant option;
mysql> flush privileges;
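These statements are entered after logging in with the temporary password found by the grep above; for example:

mysql -uroot -p
# paste the temporary password from /var/log/mysqld.log at the prompt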
mkdir -p /usr/hive
cd /usr/package
tar -zxvf apache-hive-2.1.1-bin.tar.gz -C /usr/hive
vi /etc/profile
#hive
export HIVE_HOME=/usr/hive/apache-hive-2.1.1-bin
export PATH=$PATH:$HIVE_HOME/bin
cd /usr/hive/apache-hive-2.1.1-bin/conf && mv hive-env.sh.template hive-env.sh
cd $HIVE_HOME/conf && vim hive-env.sh
# Hadoop installation path
export HADOOP_HOME=/usr/hadoop/hadoop-2.7.3
# Hive configuration directory
export HIVE_CONF_DIR=/usr/hive/apache-hive-2.1.1-bin/conf
# Hive auxiliary jars (lib) path
export HIVE_AUX_JARS_PATH=/usr/hive/apache-hive-2.1.1-bin/lib
cp mysql-connector-java-5.1.47-bin.jar /usr/hive/apache-hive-2.1.1-bin/lib
cd /usr/hive/apache-hive-2.1.1-bin/lib
cp jline-2.12.jar /usr/hadoop/hadoop-2.7.3/share/hadoop/yarn/lib
----------hive-site.xml (on slave1, the metastore server)------------
cd /usr/hive/apache-hive-2.1.1-bin/conf && vim hive-site.xml
<property>
  <name>hive.metastore.warehouse.dir</name>
  <value>/user/hive_remote/warehouse</value>
</property>
<property>
  <name>javax.jdo.option.ConnectionURL</name>
  <value>jdbc:mysql://slave2:3306/hive?createDatabaseIfNotExist=true&amp;characterEncoding=UTF-8&amp;useSSL=false</value>
  <description>JDBC connect string for a JDBC metastore</description>
</property>
<property>
  <name>javax.jdo.option.ConnectionDriverName</name>
  <value>com.mysql.jdbc.Driver</value>
</property>
<property>
  <name>javax.jdo.option.ConnectionUserName</name>
  <value>root</value>
</property>
<property>
  <name>javax.jdo.option.ConnectionPassword</name>
  <value>123456</value>
</property>
<property>
  <name>hive.metastore.schema.verification</name>
  <value>false</value>
</property>
<property>
  <name>datanucleus.schema.autoCreateAll</name>
  <value>true</value>
</property>
--------hive-site.xml (run on master)------------
cd /usr/hive/apache-hive-2.1.1-bin/conf && vim hive-site.xml
<property>
  <name>hive.metastore.warehouse.dir</name>
  <value>/user/hive_remote/warehouse</value>
</property>
<property>
  <name>hive.metastore.local</name>
  <value>false</value>
</property>
<property>
  <name>hive.metastore.uris</name>
  <value>thrift://slave1:9083</value>
</property>
cd /usr/hadoop/hadoop-2.7.3
start-dfs.sh
start-yarn.sh
On slave1, initialize the metastore database:
schematool -dbType mysql -initSchema
cd /usr/hive/apache-hive-2.1.1-bin
bin/hive --service metastore
On master:
cd /usr/hive/apache-hive-2.1.1-bin
bin/hive
create database student;
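As a sanity check from the master CLI (assuming the metastore service on slave1 is still running), the new database should be visible through the Thrift metastore:

hive> show databases;
-- expected to include: default, student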
=============Dynamically adding/removing nodes==========
hostnamectl set-hostname slave3
systemctl stop firewalld
systemctl disable firewalld
vi /etc/hosts
172.18.39.85 master
172.18.39.89 slave1
172.18.39.86 slave2
172.18.39.87 slave3
tzselect   # menu choices: 5, 9, 1, 1
source /etc/profile
echo "TZ='Asia/Shanghai'; export TZ" >> /etc/profile && source /etc/profile
crontab -e
*/10 * * * * /usr/sbin/ntpdate master
[root@master ~]# ssh-copy-id slave3
mkdir -p /usr/java
cd /usr/package
tar -zxvf jdk1.8.0_221.tar.gz -C /usr/java
vi /etc/profile
#JAVA_HOME
export JAVA_HOME=/usr/java/jdk1.8.0_171
export PATH=$PATH:$JAVA_HOME/bin
mkdir -p /usr/hadoop
cd /usr/package
tar -zxvf hadoop-2.7.3.tar.gz -C /usr/hadoop
vi /etc/profile
#HADOOP_HOME
export HADOOP_HOME=/usr/hadoop/hadoop-2.7.3
export PATH=$PATH:$HADOOP_HOME/bin
export PATH=$PATH:$HADOOP_HOME/sbin
cd /usr/hadoop/hadoop-2.7.3/etc/hadoop && vi hadoop-env.sh
export JAVA_HOME=/usr/java/jdk1.8.0_171
ssh-keygen -t rsa
ssh-copy-id slave3
ssh-copy-id master
cd /usr/hadoop/hadoop-2.7.3/etc/hadoop/
echo master >> master && echo slave1 >> slaves && echo slave2 >> slaves && echo slave3 >> slaves
On the other nodes (master, slave1, slave2):
cd /usr/hadoop/hadoop-2.7.3/etc/hadoop/
echo slave3 >> slaves
hadoop-env.sh: set JAVA_HOME (same value as on master)
core-site.xml: same as on master
------hdfs-site.xml (on master)---------
cd /usr/hadoop/hadoop-2.7.3/etc/hadoop
vi hdfs-site.xml
<property>
  <name>dfs.hosts</name>
  <value>/usr/hadoop/hadoop-2.7.3/etc/hadoop/datanode-allow.list</value>
</property>
<property>
  <name>dfs.hosts.exclude</name>
  <value>/usr/hadoop/hadoop-2.7.3/etc/hadoop/excludes</value>
</property>
Edit the datanode-allow.list file and add the following:
master
slave1
slave2
slave3
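Equivalently, the allow list can be written in one step instead of editing it by hand; a sketch using the same path configured in dfs.hosts above:

cat > /usr/hadoop/hadoop-2.7.3/etc/hadoop/datanode-allow.list <<EOF
master
slave1
slave2
slave3
EOF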
On slave3:
hadoop-daemon.sh start datanode
yarn-daemon.sh start nodemanager
On master:
hdfs dfsadmin -refreshNodes
hdfs dfsadmin -report
Removing a node
1. Temporary removal (just stop the daemons on the node):
hadoop-daemon.sh stop datanode
yarn-daemon.sh stop nodemanager
2. Permanent removal (decommission):
cd /usr/hadoop/hadoop-2.7.3/etc/hadoop
vi excludes
slave3
On master:
hdfs dfsadmin -refreshNodes
hdfs dfsadmin -report
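Before shutting the node down, it is worth waiting until HDFS actually reports it as decommissioned; a rough check on master (wording as printed by the Hadoop 2.x dfsadmin report):

hdfs dfsadmin -report | grep -A 3 slave3
# the slave3 entry should show "Decommission Status : Decommissioned"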
On slave3, stop the DataNode and NodeManager:
hadoop-daemon.sh stop datanode
yarn-daemon.sh stop nodemanager
Remove the decommissioned node slave3 from the slaves file, dn-include.conf, and /etc/hosts (do this on master, slave1 and slave2):
cd /usr/hadoop/hadoop-2.7.3/etc/hadoop
vi slaves
Finally, update the cluster configuration and rebalance the data blocks: sbin/start-balancer.sh
Remove the retired hostname from the NameNode's excludes file, the slaves file, and the hosts file.
====scala====
mkdir -p /usr/scala/
tar -zxvf scala-2.11.12.tgz -C /usr/scala/
vi /etc/profile and add:
#set scala
export SCALA_HOME=/usr/scala/scala-2.11.12
export PATH=$PATH:$SCALA_HOME/bin
source /etc/profile
scala -version
scp -r /usr/scala root@slave1:/usr/
scp -r /usr/scala root@slave2:/usr/
mkdir -p /usr/spark/
tar -zxvf spark-2.4.0-bin-hadoop2.7.tgz -C /usr/spark/
cd /usr/spark/spark-2.4.0-bin-hadoop2.7/conf
cp spark-env.sh.template spark-env.sh
vi spark-env.sh and add:
export SPARK_MASTER_IP=master
export SCALA_HOME=/usr/scala/scala-2.11.12
export SPARK_WORKER_MEMORY=8g
export JAVA_HOME=/usr/java/jdk1.8.0_171
export HADOOP_HOME=/usr/hadoop/hadoop-2.7.3
export HADOOP_CONF_DIR=/usr/hadoop/hadoop-2.7.3/etc/hadoop
cp slaves.template slaves
vi slaves and change the last line to:
slave1
slave2
vi /etc/profile and add:
export SPARK_HOME=/usr/spark/spark-2.4.0-bin-hadoop2.7
export PATH=$PATH:$SPARK_HOME/bin
Then source /etc/profile.
scp -r /usr/spark root@slave1:/usr/
scp -r /usr/spark root@slave2:/usr/
Start Hadoop:
/usr/hadoop/hadoop-2.7.3/sbin/start-all.sh
Start the Spark cluster:
/usr/spark/spark-2.4.0-bin-hadoop2.7/sbin/start-all.sh
Run jps on each of the three nodes; if the expected processes are present, the setup succeeded.
In a browser, open the master node's IP on port 8080; the Spark master web UI should appear.
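The original notes reference a screenshot for the jps check; roughly, with the roles configured above, the processes to look for on each node are:

jps
# master : Master (Spark) plus the Hadoop daemons (NameNode, SecondaryNameNode, ResourceManager)
# slave1 : Worker (Spark) plus DataNode, NodeManager
# slave2 : Worker (Spark) plus DataNode, NodeManager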
spark-shell
Test it with a command such as:
println("Hello world")
Run pyspark to test the Python Spark interactive mode:
pyspark
Type quit() to exit.
Data analysis
yum install lrzsz -y
Upload a file: rz
Download a file: sz
hadoop fs -mkdir -p /college
hadoop fs -put /root/college/loan.csv /college/
hadoop fs -ls /college/
Create the database:
create database hive;
use hive;
// Column meanings: LoanStatus: loan status; BorrowerRate: borrower interest rate; ProsperScore: credit score; Occupation: occupation; EmploymentStatus: employment status; IsBorrowerHomeowner: whether the borrower owns a home; CreditScoreRangeLower: credit score lower bound; CreditScoreRangeUpper: credit score upper bound; IncomeRange: income range
Create the table:
create table loan (
  LoanStatus string,
  BorrowerRate decimal(10,5),
  ProsperScore int,
  Occupation string,
  EmploymentStatus string,
  IsBorrowerHomeowner string,
  CreditScoreRangeLower int,
  CreditScoreRangeUpper int,
  IncomeRange string
) row format delimited fields terminated by ',';
Load the local data:
load data local inpath '/root/college/loan.csv' into table loan;
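A quick check that the load worked (note that if loan.csv has a header row, the first record returned will be the column names):

select * from loan limit 5;
select count(*) from loan;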
Count the rows in the table and write the result to local /root/college000/.
insert overwrite local directory '/root/college000/' row format delimited fields terminated by '\t' select count(*) from loan;

Using the credit score ProsperScore as the variable, count the loans (in descending order of ProsperScore) and write the result to local /root/college001/.
insert overwrite local directory '/root/college001/' row format delimited fields terminated by '\t' select count(*) from (select ProsperScore from loan where ProsperScore > 1 order by ProsperScore desc) a;

Give the top 5 occupations with the most loans and write the result to local /root/college002/.
select Occupation, count(1) as num from loan group by Occupation order by num desc limit 5;

For loans whose status is Defaulted, analyze the borrowers' employment information: count by employment status and write the top 3 to /root/college003/.
select EmploymentStatus, count(EmploymentStatus) as num from loan where LoanStatus='Defaulted' group by EmploymentStatus order by num desc limit 3;

Group and count the income ranges (descending) to examine the borrowers' income distribution, and write the result to /root/college004/.
select IncomeRange, count(IncomeRange) as num from loan group by IncomeRange order by num desc;

Use the midpoint of the credit score upper and lower bounds as the credit score and compute each occupation's highest credit score. Order by credit score descending and occupation ascending, and write the top 5 to /root/college005/.
select Occupation, max(num) as score from (select Occupation, (CreditScoreRangeLower+CreditScoreRangeUpper)/2 as num from loan group by Occupation, num order by Occupation, num desc) a group by Occupation order by score desc, Occupation limit 5;

Based on the Apriori association-rule principle, find the occupation (antecedent) most strongly associated with the Defaulted loan status (consequent), and write the support to local /root/college006/ (5 decimal places).
select Occupation, round(sum(1)/113937, 5) as support from loan where LoanStatus='Defaulted' group by Occupation;
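The later tasks also ask for their results to be written to local directories, but the queries above are plain selects; following the same pattern as the first two queries, the top-5 occupations query could, for example, be wrapped like this:

insert overwrite local directory '/root/college002/' row format delimited fields terminated by '\t'
select Occupation, count(1) as num from loan group by Occupation order by num desc limit 5;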