Flink-Examples-WordCount学习 | 青训营笔记

989 阅读2分钟

这是我参与「第四届青训营 」笔记创作活动的第1天

WordCount

官网下载源码,然后找到 .\flink-1.15.1\flink-examples\flink-examples-streaming\src\main\java\org\apache\flink\streaming\examples\wordcount

  • util
    • WordCountData.java
    • CLI.java
  • WordCount.java

WordCountData.java 里面是哈姆雷特的一段著名台词,作为 Default Data 使用。 CLI.java 是判断命令行输入条件执行相应操作。 所以重点学习 WordCount.java

导入需要的包

  • 声明包,包名和文件路径一样
  • 导入的包有
    • 事件时间,水印策略
    • FlatMap算子
    • 序列化,简单字符串编码器
    • 元组
    • 数据流
    • 流执行环境
    • 滚动策略
    • 其他
package org.apache.flink.streaming.examples.wordcount;

import org.apache.flink.api.common.eventtime.WatermarkStrategy;
import org.apache.flink.api.common.functions.FlatMapFunction;
import org.apache.flink.api.common.serialization.SimpleStringEncoder;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.configuration.MemorySize;
import org.apache.flink.connector.file.sink.FileSink;
import org.apache.flink.connector.file.src.FileSource;
import org.apache.flink.connector.file.src.reader.TextLineInputFormat;
import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.functions.sink.filesystem.rollingpolicies.DefaultRollingPolicy;
import org.apache.flink.streaming.examples.wordcount.util.CLI;
import org.apache.flink.streaming.examples.wordcount.util.WordCountData;
import org.apache.flink.util.Collector;

import java.time.Duration;

程序结构

  • class WordCount
    • main 函数
    • class Tokenizer
      • flatMap 函数
// Skeleton of the streaming WordCount example: a main() entry point plus a
// nested Tokenizer that turns each text line into (word, 1) tuples.
public class WordCount {

    // 'throws Exception' on main simply propagates any failure from job
    // construction/submission instead of catching it locally.
    public static void main(String[] args) throws Exception {
        // ...
    }
    
    // 'final' forbids subclassing; 'implements' declares that this class
    // fulfils the FlatMapFunction contract: String in, Tuple2<word,count> out.
    public static final class Tokenizer
            implements FlatMapFunction<String, Tuple2<String, Integer>> {
            
        // @Override makes the compiler verify this really implements a
        // supertype method, catching signature typos at compile time.
        @Override
        public void flatMap(String value, Collector<Tuple2<String, Integer>> out) {
            //...
        }
        
    }
}
  • throws Exception 怎么放在main后面
  • final 关键字的含义
  • implements 什么意思
  • @Override 作用?

main函数

处理命令行输入的参数

// Parse command-line options (input/output paths, execution mode, ...) into
// an immutable CLI holder.
final CLI params = CLI.fromArgs(args);

创建执行环境,这是构建 Flink 应用程序的主要入口点。

// Obtain the execution environment: a local one when run from the IDE, the
// cluster-provided one when submitted to a Flink cluster.
final StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();

设置执行模式

// Run in STREAMING or BATCH mode, as requested on the command line.
env.setRuntimeMode(params.getExecutionMode());
// Make the parsed parameters visible in the web UI and to user functions.
env.getConfig().setGlobalJobParameters(params);

选择默认数据

DataStream<String> text;

// If --input was given, read the file(s) line by line; otherwise fall back to
// the built-in Hamlet excerpt from WordCountData.
if (params.getInputs().isPresent()) {

    FileSource.FileSourceBuilder<String> builder =
            FileSource.forRecordStreamFormat(
                    new TextLineInputFormat(), params.getInputs().get());
    // With --discovery-interval the source keeps watching the input path for
    // new files, turning this into an unbounded (streaming) source.
    params.getDiscoveryInterval().ifPresent(builder::monitorContinuously);
    text = env.fromSource(builder.build(), WatermarkStrategy.noWatermarks(), "file-input");
    
} else {
    text = env.fromElements(WordCountData.WORDS).name("in-memory-input");
}

分词计数

// Split each line into (word, 1) tuples, group by the word (tuple field f0),
// and keep a running sum of the counts (tuple field index 1).
DataStream<Tuple2<String, Integer>> counts =
        text.flatMap(new Tokenizer())
                .name("tokenizer")
                .keyBy(value -> value.f0)
                .sum(1)
                .name("counter");

输出结果

// With --output, write rolling text files (a new part file every 1 MiB or
// every 10 seconds); otherwise just print the counts to stdout.
if (params.getOutput().isPresent()) {
    counts.sinkTo(
                    FileSink.<Tuple2<String, Integer>>forRowFormat(
                                    params.getOutput().get(), new SimpleStringEncoder<>())
                            .withRollingPolicy(
                                    DefaultRollingPolicy.builder()
                                            .withMaxPartSize(MemorySize.ofMebiBytes(1))
                                            .withRolloverInterval(Duration.ofSeconds(10))
                                            .build())
                            .build())
            .name("file-sink");
} else {
    counts.print().name("print-sink");
}

作业提交并处理

// Flink programs are composed lazily; execute() actually submits the job.
env.execute("WordCount");

Tokenizer类

/**
 * Splits one line of text into lower-cased words and emits a (word, 1)
 * tuple for every word found; runs of non-word characters act as
 * delimiters.
 *
 * @param value a single line of input text
 * @param out collector receiving one Tuple2&lt;word, 1&gt; per word
 */
@Override
public void flatMap(String value, Collector<Tuple2<String, Integer>> out) {
    // Lower-case first so counting is case-insensitive.
    final String[] words = value.toLowerCase().split("\\W+");
    for (final String word : words) {
        // split() can produce an empty leading token; skip empties.
        if (!word.isEmpty()) {
            out.collect(new Tuple2<>(word, 1));
        }
    }
}

配置文件信息

<!-- WordCount -->
<!-- maven-jar-plugin execution: builds a separate example jar (classifier
     "WordCount") during the package phase, containing only this example. -->
<execution>
    <id>WordCount</id>
    <phase>package</phase>
    <goals>
        <goal>jar</goal>
    </goals>
    <configuration>
        <classifier>WordCount</classifier>

        <archive>
            <manifestEntries>
                <!-- Entry point used by `flink run` when no -c class is given. -->
                <program-class>org.apache.flink.streaming.examples.wordcount.WordCount</program-class>
            </manifestEntries>
        </archive>

        <!-- Only this example's classes (nested classes matched via $*) plus
             the license files end up in the jar. -->
        <includes>
                <include>org/apache/flink/streaming/examples/wordcount/WordCount.class</include>
                <include>org/apache/flink/streaming/examples/wordcount/WordCount$*.class</include>
                <include>org/apache/flink/streaming/examples/wordcount/util/WordCountData.class</include>
                <include>org/apache/flink/streaming/examples/wordcount/util/CLI.class</include>
                <include>META-INF/LICENSE</include>
                <include>META-INF/NOTICE</include>
        </includes>
    </configuration>
</execution>

WindowWordCount

导入需要的包

package org.apache.flink.streaming.examples.windowing;

import org.apache.flink.api.common.eventtime.WatermarkStrategy;
import org.apache.flink.api.common.serialization.SimpleStringEncoder;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.configuration.MemorySize;
import org.apache.flink.connector.file.sink.FileSink;
import org.apache.flink.connector.file.src.FileSource;
import org.apache.flink.connector.file.src.reader.TextLineInputFormat;
import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.functions.sink.filesystem.rollingpolicies.DefaultRollingPolicy;
import org.apache.flink.streaming.examples.wordcount.WordCount;
import org.apache.flink.streaming.examples.wordcount.util.CLI;
import org.apache.flink.streaming.examples.wordcount.util.WordCountData;

import java.time.Duration;

官方注释

Implements a windowed version of the streaming "WordCount" program.

The input is a plain text file with lines separated by newline characters.

Usage:

WindowWordCount --input <path> --output <path> --window <n> --slide <n>

If no parameters are provided, the program is run with default data from {@link WordCountData}.

This example shows how to:

  • write a simple Flink Streaming program,
  • use tuple data types,
  • use basic windowing abstractions.

程序结构

// Skeleton of the windowed WordCount variant; unlike WordCount there is no
// nested function class — all logic lives in main() and the Tokenizer is
// reused from WordCount.
public class WindowWordCount {

    public static void main(String[] args) throws Exception {
        // ...
    }

}

处理命令行参数、创建并设置执行环境

// Parse --input/--output/--window/--slide plus the execution mode.
final CLI params = CLI.fromArgs(args);

// Create the execution environment. This is the main entrypoint
// to building a Flink application.
final StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
env.setRuntimeMode(params.getExecutionMode());
// Expose the parameters to the web UI and to user functions.
env.getConfig().setGlobalJobParameters(params);

选择默认文本

DataStream<String> text;
// Same input selection as WordCount: files when --input is set, otherwise
// the built-in default data.
if (params.getInputs().isPresent()) {
    FileSource.FileSourceBuilder<String> builder =
            FileSource.forRecordStreamFormat(
                    new TextLineInputFormat(), params.getInputs().get());
    // --discovery-interval keeps the source watching for new files.
    params.getDiscoveryInterval().ifPresent(builder::monitorContinuously);
    text = env.fromSource(builder.build(), WatermarkStrategy.noWatermarks(), "file-input");
} else {
    text = env.fromElements(WordCountData.WORDS).name("in-memory-input");
}

设置窗口大小

// Sliding count window: each result covers the last 'window' elements of a
// key and is emitted every 'slide' elements (defaults: 250 / 150).
int windowSize = params.getInt("window").orElse(250);
int slideSize = params.getInt("slide").orElse(150);

统计词频

// Reuses WordCount.Tokenizer, but sums the counts per sliding count window
// (countWindow) instead of over the entire stream.
DataStream<Tuple2<String, Integer>> counts =
    text.flatMap(new WordCount.Tokenizer())
            .name("tokenizer")
            .keyBy(value -> value.f0)
            .countWindow(windowSize, slideSize)
            .sum(1)
            .name("counter");

sink

// Same sink choice as WordCount: rolling files with --output (rolled every
// 1 MiB or 10 seconds), stdout otherwise.
if (params.getOutput().isPresent()) {
    counts.sinkTo(
                    FileSink.<Tuple2<String, Integer>>forRowFormat(
                                    params.getOutput().get(), new SimpleStringEncoder<>())
                            .withRollingPolicy(
                                    DefaultRollingPolicy.builder()
                                            .withMaxPartSize(MemorySize.ofMebiBytes(1))
                                            .withRolloverInterval(Duration.ofSeconds(10))
                                            .build())
                            .build())
            .name("file-sink");
} else {
    counts.print().name("print-sink");
}

执行环境

// Apache Flink applications are composed lazily. Calling execute
// submits the Job and begins processing; the string is the job name
// shown in the web UI.
env.execute("WindowWordCount");

SocketWindowWordCount

导入需要的包

package org.apache.flink.streaming.examples.socket;

import org.apache.flink.api.common.functions.FlatMapFunction;
import org.apache.flink.api.common.typeinfo.Types;
import org.apache.flink.api.java.utils.ParameterTool;
import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.windowing.assigners.TumblingProcessingTimeWindows;
import org.apache.flink.streaming.api.windowing.time.Time;

使用说明

Implements a streaming windowed version of the "WordCount" program.

This program connects to a server socket and reads strings from the socket. The easiest way to try this out is to open a text server (at port 12345) using the netcat tool via

nc -l 12345 on Linux or nc -l -p 12345 on Windows

and run this example with the hostname and the port as arguments.

程序结构

  • class SocketWindowWordCount
    • main
    • class WordWithCount

main函数

主机名和端口号

// --hostname defaults to "localhost"; --port is required. The locals are
// 'final' yet assignable because each is set exactly once on every path
// (either inside try, or the method returns early from catch).
final String hostname;
final int port;
try {
    final ParameterTool params = ParameterTool.fromArgs(args);
    hostname = params.has("hostname") ? params.get("hostname") : "localhost";
    port = params.getInt("port");
} catch (Exception e) {
    // A missing or unparsable --port lands here: print usage and bail out.
    System.err.println(
            "No port specified. Please run 'SocketWindowWordCount "
                    + "--hostname <hostname> --port <port>', where hostname (localhost by default) "
                    + "and port is the address of the text server");
    System.err.println(
            "To start a simple text server, run 'netcat -l <port>' and "
                    + "type the input text into the command line");
    return;
}

创建执行环境、获得数据

final StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();

// Connect to the text server; records are delimited by newline characters.
DataStream<String> text = env.socketTextStream(hostname, port, "\n");

统计数据

DataStream<WordWithCount> windowCounts =
        text.flatMap(
                        (FlatMapFunction<String, WordWithCount>)
                                (value, out) -> {
                                    for (String word : value.split("\\s")) {
                                        out.collect(new WordWithCount(word, 1L));
                                    }
                                },
                        Types.POJO(WordWithCount.class))
                .keyBy(value -> value.word)
                .window(TumblingProcessingTimeWindows.of(Time.seconds(5)))
                .reduce((a, b) -> new WordWithCount(a.word, a.count + b.count))
                .returns(WordWithCount.class);

windowCounts.print().setParallelism(1);

输出结果、关闭环境

// Print with parallelism 1 so results arrive as a single, readable stream
// rather than interleaved from multiple subtasks.
windowCounts.print().setParallelism(1);

// Composed lazily; execute() submits the job under the given name.
env.execute("Socket Window WordCount");

WordWithCount类

/**
 * Plain data holder pairing a word with its running count. The public
 * mutable fields and the no-argument constructor are what make this class a
 * valid Flink POJO for serialization.
 */
public static class WordWithCount {

    public String word;
    public long count;

    /** Required by reflective POJO instantiation; never called directly. */
    @SuppressWarnings("unused")
    public WordWithCount() {}

    /** Creates a pair holding {@code word} with an initial {@code count}. */
    public WordWithCount(String word, long count) {
        this.count = count;
        this.word = word;
    }

    /** Renders as {@code word : count}, the format seen in the print sink. */
    @Override
    public String toString() {
        return word + " : " + count;
    }
}

运行SocketWindowWordCount

  1. 打开一个终端
    $ nc -l 9099
    to be or not to be
    that is a question
    
  2. 再打开一个终端
    $ cd /usr/local/flink
    $ ./bin/flink run examples/streaming/SocketWindowWordCount.jar --hostname hadoop --port 9099
    
  3. 可以在第一个终端继续输入单词和句子,按 Ctrl + C 结束,SocketWindowWordCount 程序也终止运行。
  4. 在log文件夹查看结果
    $ cd /usr/local/flink/log
    $ cat *.out
    

配置文件信息

<!-- SocketWindowWordCount -->
<!-- maven-jar-plugin execution: builds a separate example jar (classifier
     "SocketWindowWordCount") during the package phase. -->
<execution>
    <id>SocketWindowWordCount</id>
    <phase>package</phase>
    <goals>
        <goal>jar</goal>
    </goals>
    <configuration>
        <classifier>SocketWindowWordCount</classifier>

        <archive>
            <manifestEntries>
                <!-- Entry point used by `flink run` when no -c class is given. -->
                <program-class>org.apache.flink.streaming.examples.socket.SocketWindowWordCount</program-class>
            </manifestEntries>
        </archive>

        <!-- The $* pattern also pulls in nested classes such as WordWithCount. -->
        <includes>
            <include>org/apache/flink/streaming/examples/socket/SocketWindowWordCount.class</include>
            <include>org/apache/flink/streaming/examples/socket/SocketWindowWordCount$*.class</include>
            <include>META-INF/LICENSE</include>
            <include>META-INF/NOTICE</include>
        </includes>
    </configuration>
</execution>