09 HDFS 操作

HDFS 操作

Shell操作

基本语法

基于shell来操作HDFS时，可以使用 $HADOOP_HOME/bin/hadoop fs 具体命令，或者使用 $HADOOP_HOME/bin/hdfs dfs 具体命令，其中fs也可以使用dfs命令代替，dfs是fs的实现类

hdfs dfs
Usage: hadoop fs [generic options]
        [-appendToFile <localsrc> ... <dst>]
        [-cat [-ignoreCrc] <src> ...]
        [-checksum <src> ...]
        [-chgrp [-R] GROUP PATH...]
        [-chmod [-R] <MODE[,MODE]... | OCTALMODE> PATH...]
        [-chown [-R] [OWNER][:[GROUP]] PATH...]
        [-copyFromLocal [-f] [-p] [-l] [-d] [-t <thread count>] [-q <thread pool queue size>] <localsrc> ... <dst>]
...

常用命令实操

# node01
su - bigdata
# 1.启动Hadoop集群
start-dfs.sh
# 单独手动启动namenode
hdfs --daemon start namenode
# 2.-help查看命令
hdfs dfs -help
# 3.-ls主要显示目录信息，查看HDFS中某个目录下的文件信息
hdfs dfs -ls /
# 4.-mkdir在HDFS中创建目录，还可跟上-p来创建多级目录
hdfs dfs -mkdir /hello
hdfs dfs -mkdir -p /hello/a/b/c
# 5.-moveFromLocal将文件从本地剪切到HDFS目录中
echo "hello hdfs" > data.txt
hdfs dfs -moveFromLocal ./data.txt /hello/

hdfs dfs -ls /hello
Found 2 items
drwxr-xr-x   - bigdata supergroup          0 2024-01-18 09:15 /hello/a
-rw-r--r--   3 bigdata supergroup         11 2024-01-18 09:15 /hello/data.txt
# 6.-cat显示HDFS文件内容
hdfs dfs -cat /hello/data.txt
hello hdfs
# 7.-appendToFile追加一个文件到已经存在的文件末尾
echo "你好 hadoop" > data2.txt
hdfs dfs -appendToFile ./data2.txt /hello/data.txt

hdfs dfs -cat /hello/data.txt
hello hdfs
你好 hadoop
# 8.-chmod修改文件权限，与文件系统中的用法一样
hdfs dfs -ls /hello/data.txt
-rw-r--r--   3 bigdata supergroup         25 2024-01-18 09:16 /hello/data.txt

hdfs dfs -chmod 777 /hello/data.txt

hdfs dfs -ls /hello/data.txt
-rwxrwxrwx   3 bigdata supergroup         25 2024-01-18 09:16 /hello/data.txt
# 9.-copyFromLocal从本地文件系统中拷贝文件到HDFS路径中
hdfs dfs -copyFromLocal data2.txt /hello/

hdfs dfs -cat /hello/data2.txt
你好 hadoop
# 10.-copyToLocal从HDFS拷贝文件或者目录到本地
hdfs dfs -copyToLocal /hello/ ./

ls ./hello
a  data2.txt  data.txt
# 11.-cp从HDFS的一个路径拷贝到HDFS的另一个路径
hdfs dfs -mkdir -p /hello2
hdfs dfs -cp /hello/data.txt /hello2/

hdfs dfs -cat /hello2/data.txt
hello hdfs
你好 hadoop
# 12.-mv在HDFS目录中移动文件，将文件移动到某个HDFS目录中
hdfs dfs -mkdir -p /hello3
hdfs dfs -mv /hello2/data.txt /hello3/
hdfs dfs -ls /hello2/

hdfs dfs -cat /hello3/data.txt
hello hdfs
你好 hadoop
# 13.-get等同于-copyToLocal，从HDFS中下载文件或目录到本地
hdfs dfs -get /hello3 ./

ls ./hello3
data.txt
# 14.-getmerge合并下载多个文件，比如HDFS的目录 /hello 下有多个文件:data.txt,data2.txt...，可以通过此命令，将数据合并下载到本地的一个文件中
hdfs dfs -ls /hello
Found 3 items
drwxr-xr-x   - bigdata supergroup          0 2024-01-18 09:15 /hello/a
-rwxrwxrwx   3 bigdata supergroup         25 2024-01-18 09:16 /hello/data.txt
-rw-r--r--   3 bigdata supergroup         14 2024-01-18 09:19 /hello/data2.txt

hdfs dfs -getmerge /hello/* ./merge.txt

cat merge.txt
hello hdfs
你好 hadoop
你好 hadoop
# 15.-put等同于-copyFromLocal，将本地文件复制上传到HDFS中
hdfs dfs -put ./merge.txt /hello/
# 16.-tail显示一个文件最后1kb数据到控制台
hdfs dfs -tail /hello/merge.txt
hello hdfs
你好 hadoop
你好 hadoop
# 17.-rm删除文件或文件夹。可以加上 -r来递归删除目录下的所有数据
hdfs dfs -rm /hello/merge.txt
hdfs dfs -rm -r /hello/
# 18.-rmdir删除空目录，目录必须是空目录才可以
hdfs dfs -rmdir /hello2/
# 19.-du统计文件或文件夹的大小。第一列表示文件大小；第二列表示该文件在集群上的总存储大小，与副本数相关：副本数默认是3，所以第二列是第一列的三倍（第二列 = 文件大小 × 副本数）；第三列显示文件路径
hdfs dfs -mkdir -p /hello3/aa/bb
hdfs dfs -cp /hello3/data.txt /hello3/aa/bb/

hdfs dfs -du /hello3
25  75  /hello3/aa
25  75  /hello3/data.txt
# 20.-setrep设置HDFS中文件的副本数量
hdfs dfs -setrep 10 /hello3/data.txt

# 上传文件，并设置块大小为1M，这个在前面就测试过，该文件被分割成两个块，下面Java API操作时需要
for i in `seq 100000`;do echo "hello hadoop $i" >> mydata.txt;done
hdfs dfs -D dfs.blocksize=1048576 -put  mydata.txt /hello3/

默认3个副本

调整副本数为10

注意：这里设置的副本数只是记录在NameNode的元数据中，是否真的会有这么多副本，还得看DataNode的数量。因为目前只有3台DataNode，最多也就3个副本，只有节点数增加到10台时，副本数才能达到10

Java API 操作

创建空的maven项目，并将hadoop相关配置拷贝到maven项目的resources目录。这里只测试hdfs，只需要拷贝core-site.xml和hdfs-site.xml这两个文件即可。还需要在执行java代码的主机上配置hosts，配置hdfs访问域名到ip的映射

pom.xml

<?xml version="1.0" encoding="UTF-8"?>
<!-- Minimal Maven project used to run the HDFS Java API examples with JUnit 4. -->
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>
    <groupId>org.example</groupId>
    <artifactId>hadoop-test</artifactId>
    <version>1.0-SNAPSHOT</version>
    <properties>
        <maven.compiler.source>8</maven.compiler.source>
        <maven.compiler.target>8</maven.compiler.target>
        <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
    </properties>
    <dependencies>
        <!-- HDFS client API (FileSystem, Path, Configuration, ...) -->
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-client</artifactId>
            <version>3.2.4</version>
        </dependency>
        <!-- JUnit 4 for the @Before/@Test annotations and Assert -->
        <dependency>
            <groupId>junit</groupId>
            <artifactId>junit</artifactId>
            <version>4.13</version>
        </dependency>
    </dependencies>
    <build>
        <finalName>${project.name}</finalName>
        <resources>
            <!-- core-site.xml and hdfs-site.xml are placed here so they end up on the classpath -->
            <resource>
                <directory>src/main/resources</directory>
                <filtering>true</filtering>
            </resource>
        </resources>
        <plugins>
            <!-- Compiler plugin pinned to the configured source/target level -->
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-compiler-plugin</artifactId>
                <version>3.8.1</version>
                <configuration>
                    <target>${maven.compiler.target}</target>
                    <source>${maven.compiler.source}</source>
                    <encoding>UTF-8</encoding>
                    <!-- No <skip>true</skip> here: on this plugin it disables compilation of the
                         test sources, so the JUnit examples would never be built by Maven. -->
                </configuration>
            </plugin>
        </plugins>
    </build>
</project>

/**
 * JUnit 4 examples that exercise the HDFS Java client API: creating a directory,
 * writing a file, reading it back, and inspecting the block locations of a file.
 *
 * <p>Each test gets a fresh {@code FileSystem} handle from {@link #conn()}.
 * {@link #get()} expects the file written by {@link #upload()} to exist, and
 * {@link #blocks()} expects {@code /hello3/mydata.txt} to have been uploaded beforehand.
 */
public class TestHDFS {
    /** Hadoop configuration, created in {@link #conn()}. */
    public Configuration conf = null;
    /** HDFS client handle, created in {@link #conn()}. */
    public FileSystem fs = null;

    /**
     * Builds the configuration and connects to the cluster before each test.
     *
     * @throws Exception if the file system cannot be obtained
     */
    @Before
    public void conn() throws Exception {
        // Automatically loads the Hadoop configuration files found in the resources directory.
        conf = new Configuration(true);
        // Alternative that requires the environment variable HADOOP_USER_NAME=bigdata:
        // fs = FileSystem.get(conf);
        // Specify the user directly instead.
        fs = FileSystem.get(URI.create("hdfs://mycluster/"), conf, "bigdata");
    }

    /**
     * Recreates the {@code /mytest01} directory, deleting it recursively first if it exists.
     *
     * @throws Exception on any HDFS error
     */
    @Test
    public void mkdir() throws Exception {
        Path dir = new Path("/mytest01");
        if (fs.exists(dir)) {
            fs.delete(dir, true);
        }
        fs.mkdirs(dir);
    }

    /**
     * Writes a small UTF-8 text to {@code /mytest01/out.txt}.
     *
     * @throws Exception on any HDFS error
     */
    @Test
    public void upload() throws Exception {
        String content = "hello\njava\nbigdata";
        BufferedInputStream input =
                new BufferedInputStream(new ByteArrayInputStream(content.getBytes(StandardCharsets.UTF_8)));
        Path outFile = new Path("/mytest01/out.txt");
        FSDataOutputStream output = fs.create(outFile);
        // The trailing 'true' asks copyBytes to close both streams when it is done.
        IOUtils.copyBytes(input, output, conf, true);
    }

    /**
     * Reads {@code /mytest01/out.txt} completely and checks it against the text
     * written by {@link #upload()}.
     *
     * @throws Exception on any HDFS error
     */
    @Test
    public void get() throws Exception {
        Path file = new Path("/mytest01/out.txt");
        FileStatus fileStatus = fs.getFileStatus(file);
        int len = (int) fileStatus.getLen();
        byte[] outByte = new byte[len];
        // try-with-resources so the stream is closed even when the read fails.
        try (FSDataInputStream input = fs.open(file)) {
            input.readFully(0, outByte);
        }
        // JUnit convention: expected value first, actual value second.
        Assert.assertArrayEquals("hello\njava\nbigdata".getBytes(StandardCharsets.UTF_8), outByte);
    }

    /**
     * Prints the block locations of {@code /hello3/mydata.txt} and reads the first
     * five bytes of its second block by seeking to that block's offset.
     *
     * @throws Exception on any HDFS error
     */
    @Test
    public void blocks() throws Exception {
        Path file = new Path("/hello3/mydata.txt");
        FileStatus fss = fs.getFileStatus(file);
        BlockLocation[] blks = fs.getFileBlockLocations(fss, 0, fss.getLen());
        for (BlockLocation b : blks) {
            System.out.println(b);
        }
        // Output recorded in the tutorial run:
        // 0,1048576,node03,node02,node04
        // 1048576,840319,node04,node03,node02

        // Fail with a clear message instead of an ArrayIndexOutOfBoundsException
        // when the file was not uploaded with a block size that splits it.
        Assert.assertTrue("expected /hello3/mydata.txt to span at least two blocks", blks.length >= 2);

        // Read the first 5 bytes of the second block.
        BlockLocation blk = blks[1];
        long offset = blk.getOffset();
        byte[] outByte = new byte[5];
        // Users and programs read at the file level and are unaware of blocks,
        // but setting an offset lets a reader jump to the start of a given block.
        try (FSDataInputStream in = fs.open(file)) {
            in.seek(offset);
            // readFully keeps reading until the buffer is full; a plain read()
            // may return after fewer than 5 bytes.
            in.readFully(outByte);
        }
        // Output recorded in the tutorial run: 5773\n
        System.out.println(new String(outByte, StandardCharsets.UTF_8));

        // Knowing which nodes hold a block makes it possible to move computation to the data.
        // Moving computation to the data aims at divide-and-conquer with less network traffic.
        // When reading a block, the HDFS client has a notion of distance: it sorts the nodes
        // holding replicas by distance and prefers the closest DataNode, reading locally
        // when the local DataNode has the data.
    }
}