大数据开发初识HDFS（第五篇）一、HDFS 1.1、HDFS分布式文件系统的设计思想用户请求查看数据时候请求主节点，

一、HDFS

1.1、HDFS分布式文件系统的设计思想

用户请求查看数据时，先请求主节点。主节点维护着所有数据的存储信息，会把对应数据所在的节点信息返回给用户，然后用户再根据这些节点信息直接到对应节点读取数据，这样压力就不会全部集中在主节点上。这就是HDFS分布式文件系统的设计思想。

1.2、HDFS （Hadoop Distributed File System）

它是一种允许文件通过网络在多台主机上分享的文件系统，可以让多台机器上的多个用户分享文件和存储空间。

其实分布式文件管理系统有很多，HDFS只是其中一种实现而已。还有 GFS(谷歌的)、TFS(淘宝的)、S3(亚马逊的)。
不同的分布式文件系统的特点是不一样的。HDFS是一种适合大文件存储的分布式文件系统，不适合小文件存储。什么叫小文件？例如几KB、几MB的文件都可以认为是小文件。

1.3、HDFS的Shell介绍

操作格式：bin/hdfs dfs -xxx scheme://authority/path

使用hadoop bin目录的hdfs命令，后面指定dfs，表示是操作分布式文件系统的，这些属于固定格式。如果在PATH中配置了hadoop的bin目录，那么这里可以直接使用hdfs就可以了。这里的xxx是一个占位符，具体我们想对hdfs做什么操作，就可以在这里指定对应的命令了。

HDFS的scheme是hdfs，authority是集群中namenode所在节点的ip和对应的端口号（把ip换成主机名也是一样的），path是我们要操作的文件路径信息。

二、HDFS的常见的shell命令

其实hdfs后面支持很多的参数，但是有很多是很少用的。直接在命令行中输入hdfs dfs，可以查看dfs后面可以跟的所有参数。

注意：这里面的[]表示是可选项，<>表示是必填项

-ls：查询指定路径信息

hdfs dfs -ls hdfs://bigdata01:9000/

其实后面hdfs的url这个一串内容默认可以省略的，因为hdfs在执行的时候会根据HADOOP_HOME自动识别配置文件中的fs.defaultFS属性

hdfs dfs -ls /
-put：从本地上传文件

hdfs dfs -put README.txt /
-cat：查看HDFS文件内容

hdfs dfs -cat /README.txt
-get：下载文件到本地

hdfs dfs -get /README.txt

注意这边下载的文件名不能与现有文件名重复。

hdfs dfs -get /README.txt README.txt.bak

下载的时候重命名
-mkdir [-p]：创建文件夹

hdfs dfs -mkdir /test

如果要递归创建多级目录，还需要再指定-p参数

hdfs dfs -mkdir -p /abc/xyz

想要递归显示所有目录的信息，可以在ls后面添加-R参数

hdfs dfs -ls -R /
-rm [-r]：删除文件/文件夹

如果想要删除hdfs中的目录或者文件，可以使用rm

hdfs dfs -rm /README.txt

删除目录，注意，删除目录需要指定-r参数

hdfs dfs -rm -r /test

递归删除

hdfs dfs -rm -r /abc

三、HDFS案例实操

需求：统计HDFS中文件的个数和每个文件的大小

统计文件个数：

hdfs dfs -ls / |grep /| wc -l

查看每个文件的大小（-ls 输出的第5列是文件大小，第8列是文件路径）：

hdfs dfs -ls / |grep /| awk '{print $8, $5}'

四、Java代码操作HDFS

4.1、添加POM依赖

<properties>
    <maven.compiler.source>8</maven.compiler.source>
    <maven.compiler.target>8</maven.compiler.target>
    <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
    <slf4j.version>1.7.25</slf4j.version>
    <log4j.version>1.2.16</log4j.version>
</properties>

<dependencies>
    <dependency>
        <groupId>org.slf4j</groupId>
        <artifactId>slf4j-api</artifactId>
        <version>${slf4j.version}</version>
    </dependency>
    <dependency>
        <groupId>org.slf4j</groupId>
        <artifactId>slf4j-log4j12</artifactId>
        <version>${slf4j.version}</version>
        <exclusions>
            <exclusion>
                <artifactId>log4j</artifactId>
                <groupId>log4j</groupId>
            </exclusion>
        </exclusions>
    </dependency>
    <dependency>
        <groupId>log4j</groupId>
        <artifactId>log4j</artifactId>
        <version>${log4j.version}</version>
    </dependency>
</dependencies>

<dependency>
  <groupId>org.apache.hadoop</groupId>
  <artifactId>hadoop-client</artifactId>
  <version>3.3.4</version>
  <exclusions>
    <exclusion>
      <groupId>org.slf4j</groupId>
      <artifactId>slf4j-api</artifactId>
    </exclusion>
    <exclusion>
      <groupId>org.slf4j</groupId>
      <artifactId>slf4j-log4j12</artifactId>
    </exclusion>
    <exclusion>
      <groupId>log4j</groupId>
      <artifactId>log4j</artifactId>
    </exclusion>
    <exclusion>
      <artifactId>slf4j-reload4j</artifactId>
      <groupId>org.slf4j</groupId>
    </exclusion>
  </exclusions>
</dependency>

新建log4j.properties

log4j.rootLogger=info,stdout
log4j.appender.stdout=org.apache.log4j.ConsoleAppender
log4j.appender.stdout.Target=System.out
log4j.appender.stdout.layout=org.apache.log4j.PatternLayout
log4j.appender.stdout.layout.ConversionPattern=%d{yyyy-MM-dd HH:mm:ss.SSS} [%-5p] [%t] %m %n

4.2、Java代码

package com.strivelearn.hadoop.hdfs;

import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IOUtils;

/**
 * The type Hdfs main.
 *
 * @author strivelearn
 * @version HdfsMain.java, 2022年09月03日
 */
public class HdfsMain {

    /**
     * The constant fileSystem.
     */
    private static FileSystem fileSystem;

    static {
        //创建一个配置对象
        Configuration conf = new Configuration();
        //指定HDFS的地址
        conf.set("fs.defaultFS", "hdfs://192.168.234.100:9000");
        //获取操作HDFS的对象
        try {
            fileSystem = FileSystem.get(conf);
        } catch (IOException e) {
            throw new RuntimeException(e);
        }
    }

    /**
     * The entry point of application.
     *
     * @param args the input arguments
     * @throws IOException the io exception
     */
    public static void main(String[] args) throws IOException {
        //createFile("/Users/strivelearn/Desktop/分布式集群.drawio.png", "/分布式集群.png");
        //getFile("/分布式集群.png", "/Users/strivelearn/Desktop/1.png");
        deleteFile("/分布式集群.png");
    }

    /**
     * Create file.
     *
     * @param fromPath   the from path
     * @param targetPath the target path
     * @throws IOException the io exception
     */
    private static void createFile(String fromPath, String targetPath) throws IOException {
        //获取HDFS文件系统的输出流
        FSDataOutputStream fos = fileSystem.create(new Path(targetPath));
        //获取本地文件的输入流
        FileInputStream fis = new FileInputStream(fromPath);
        //上传文件：通过工具类把输入流拷贝到输出流里面，实现本地文件上传到HDFS
        IOUtils.copyBytes(fis, fos, 1024, true);
    }

    /**
     * Gets file.
     *
     * @param fromPath the from path
     * @param downPath the down path
     * @throws IOException the io exception
     */
    private static void getFile(String fromPath, String downPath) throws IOException {
        //获取HDFS文件系统的输入流
        FSDataInputStream fis = fileSystem.open(new Path(fromPath));
        //获取本地文件的输出流
        FileOutputStream fos = new FileOutputStream(downPath);
        //下载文件+
        IOUtils.copyBytes(fis, fos, 1024, true);
    }

    /**
     * Delete file.
     *
     * @param deletePath the delete path
     */
    private static void deleteFile(String deletePath) throws IOException {
        //删除文件，目录也可以删除
        //如果要递归删除目录，则第二个参数需要设置为true
        //如果是删除文件或者空目录，第二个参数会被忽略
        boolean flag = fileSystem.delete(new Path(deletePath), true);
        if (flag) {
            System.out.println("删除成功！");
        } else {
            System.out.println("删除失败！");
        }
    }
}

执行报错如下

Caused by: org.apache.hadoop.ipc.RemoteException(org.apache.hadoop.security.AccessControlException): Permission denied: user=strivelearn, access=WRITE, inode="/":root:supergroup:drwxr-xr-x
  at org.apache.hadoop.hdfs.server.namenode.FSPermissionChecker.check(FSPermissionChecker.java:506)
  at org.apache.hadoop.hdfs.server.namenode.FSPermissionChecker.checkPermission(FSPermissionChecker.java:346)
  at org.apache.hadoop.hdfs.server.namenode.FSPermissionChecker.checkPermission(FSPermissionChecker.java:242)
  at org.apache.hadoop.hdfs.server.namenode.FSDirectory.checkPermission(FSDirectory.java:1943)
  at org.apache.hadoop.hdfs.server.namenode.FSDirectory.checkPermission(FSDirectory.java:1927)
  at org.apache.hadoop.hdfs.server.namenode.FSDirectory.checkAncestorAccess(FSDirectory.java:1886)
  at org.apache.hadoop.hdfs.server.namenode.FSDirWriteFileOp.resolvePathForStartFile(FSDirWriteFileOp.java:323)
  at org.apache.hadoop.hdfs.server.namenode.FSNamesystem.startFileInt(FSNamesystem.java:2656)
  at org.apache.hadoop.hdfs.server.namenode.FSNamesystem.startFile(FSNamesystem.java:2596)
  at org.apache.hadoop.hdfs.server.namenode.NameNodeRpcServer.create(NameNodeRpcServer.java:799)

解决办法：

去掉hdfs的用户权限校验机制，通过在hdfs-site.xml中配置dfs.permissions.enabled=false来实现（关闭权限校验仅建议在学习/测试环境中使用）。

/root/software/hadoop-3.3.4/etc/hadoop

<configuration>
    <property>
        <name>dfs.replication</name>
        <value>1</value>
    </property>
    <property>
        <name>dfs.permissions.enabled</name>
        <value>false</value>
    </property>
</configuration>

停止服务器

sbin/stop-all.sh
重启服务器

sbin/start-all.sh
查看上传文件

hdfs dfs -ls /