这是我参与「第四届青训营 」笔记创作活动的第1天
WordCount
官网下载源码,然后找到
.\flink-1.15.1\flink-examples\flink-examples-streaming\src\main\java\org\apache\flink\streaming\examples\wordcount
- util
  - WordCountData.java
  - CLI.java
- WordCount.java
WordCountData.java 里面是哈姆雷特的一段著名台词,作为 Default Data 使用。
CLI.java 是判断命令行输入条件执行相应操作。
所以重点学习 WordCount.java。
导入需要的包
- 声明包,包名和文件路径一样
- 导入的包有
- 事件时间,水印策略
- FlatMap算子
- 序列化,简单字符串编码器
- 元组
- 数据流
- 流执行环境
- 滚动策略
- 其他
package org.apache.flink.streaming.examples.wordcount;
import org.apache.flink.api.common.eventtime.WatermarkStrategy;
import org.apache.flink.api.common.functions.FlatMapFunction;
import org.apache.flink.api.common.serialization.SimpleStringEncoder;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.configuration.MemorySize;
import org.apache.flink.connector.file.sink.FileSink;
import org.apache.flink.connector.file.src.FileSource;
import org.apache.flink.connector.file.src.reader.TextLineInputFormat;
import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.functions.sink.filesystem.rollingpolicies.DefaultRollingPolicy;
import org.apache.flink.streaming.examples.wordcount.util.CLI;
import org.apache.flink.streaming.examples.wordcount.util.WordCountData;
import org.apache.flink.util.Collector;
import java.time.Duration;
程序结构
- class WordCount
  - main 函数
  - class Tokenizer
    - flatMap 函数
// Skeleton of the WordCount example: a main() entry point plus a nested
// Tokenizer function that splits each line into (word, 1) pairs.
public class WordCount {
// 'throws Exception' on main lets env.execute() failures propagate instead
// of forcing a try/catch in example code.
public static void main(String[] args) throws Exception {
// ...
}
// 'final' forbids subclassing; 'implements' declares that this class
// provides Flink's FlatMapFunction interface (String in, Tuple2 out).
public static final class Tokenizer
implements FlatMapFunction<String, Tuple2<String, Integer>> {
// @Override makes the compiler verify this really overrides the
// interface method (catches signature typos).
@Override
public void flatMap(String value, Collector<Tuple2<String, Integer>> out) {
//...
}
}
}
- throws Exception 怎么放在main后面
- final 关键字的含义
- implements 什么意思
- @Override 作用?
main函数
处理命令行输入的参数
final CLI params = CLI.fromArgs(args);
创建执行环境,这是构建 Flink 应用程序的主要入口点。
final StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
设置执行模式
env.setRuntimeMode(params.getExecutionMode());
env.getConfig().setGlobalJobParameters(params);
选择默认数据
// Choose the input: read file(s) when --input was given, otherwise fall
// back to the bundled in-memory sample text (WordCountData.WORDS).
DataStream<String> text;
if (params.getInputs().isPresent()) {
// Build a file source that reads the input path(s) line by line.
FileSource.FileSourceBuilder<String> builder =
FileSource.forRecordStreamFormat(
new TextLineInputFormat(), params.getInputs().get());
// If --discovery-interval was set, keep watching the path for new files
// (turns the bounded source into a continuously monitoring one).
params.getDiscoveryInterval().ifPresent(builder::monitorContinuously);
// No event-time logic in this example, so no watermarks are needed.
text = env.fromSource(builder.build(), WatermarkStrategy.noWatermarks(), "file-input");
} else {
text = env.fromElements(WordCountData.WORDS).name("in-memory-input");
}
分词计数
// Split lines into (word, 1) pairs, group by the word (tuple field f0),
// and keep a running sum of the counts (tuple position 1).
DataStream<Tuple2<String, Integer>> counts =
text.flatMap(new Tokenizer())
.name("tokenizer")
.keyBy(value -> value.f0)
.sum(1)
.name("counter");
输出结果
// Write results to a rolling file sink when --output was given,
// otherwise print each record to stdout.
if (params.getOutput().isPresent()) {
counts.sinkTo(
FileSink.<Tuple2<String, Integer>>forRowFormat(
params.getOutput().get(), new SimpleStringEncoder<>())
.withRollingPolicy(
// Roll to a new part file at 1 MiB or every 10 seconds,
// whichever comes first.
DefaultRollingPolicy.builder()
.withMaxPartSize(MemorySize.ofMebiBytes(1))
.withRolloverInterval(Duration.ofSeconds(10))
.build())
.build())
.name("file-sink");
} else {
counts.print().name("print-sink");
}
作业提交并处理
env.execute("WordCount");
Tokenizer类
/**
 * Lower-cases the incoming line, splits it on runs of non-word characters,
 * and emits a ({@code word}, 1) pair for every non-empty token.
 */
@Override
public void flatMap(String value, Collector<Tuple2<String, Integer>> out) {
    // "\\W+" splits on non-word runs; a leading separator can produce an
    // empty first token, so empty strings are filtered out below.
    final String[] words = value.toLowerCase().split("\\W+");
    for (String word : words) {
        if (!word.isEmpty()) {
            out.collect(new Tuple2<>(word, 1));
        }
    }
}
配置文件信息
<!-- WordCount -->
<execution>
<id>WordCount</id>
<phase>package</phase>
<goals>
<goal>jar</goal>
</goals>
<configuration>
<classifier>WordCount</classifier>
<archive>
<manifestEntries>
<program-class>org.apache.flink.streaming.examples.wordcount.WordCount</program-class>
</manifestEntries>
</archive>
<includes>
<include>org/apache/flink/streaming/examples/wordcount/WordCount.class</include>
<include>org/apache/flink/streaming/examples/wordcount/WordCount$*.class</include>
<include>org/apache/flink/streaming/examples/wordcount/util/WordCountData.class</include>
<include>org/apache/flink/streaming/examples/wordcount/util/CLI.class</include>
<include>META-INF/LICENSE</include>
<include>META-INF/NOTICE</include>
</includes>
</configuration>
</execution>
WindowWordCount
导入需要的包
package org.apache.flink.streaming.examples.windowing;
import org.apache.flink.api.common.eventtime.WatermarkStrategy;
import org.apache.flink.api.common.serialization.SimpleStringEncoder;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.configuration.MemorySize;
import org.apache.flink.connector.file.sink.FileSink;
import org.apache.flink.connector.file.src.FileSource;
import org.apache.flink.connector.file.src.reader.TextLineInputFormat;
import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.functions.sink.filesystem.rollingpolicies.DefaultRollingPolicy;
import org.apache.flink.streaming.examples.wordcount.WordCount;
import org.apache.flink.streaming.examples.wordcount.util.CLI;
import org.apache.flink.streaming.examples.wordcount.util.WordCountData;
import java.time.Duration;
官方注释
Implements a windowed version of the streaming "WordCount" program.
The input is a plain text file with lines separated by newline characters.
Usage:
WindowWordCount --input <path> --output <path> --window <n> --slide <n>
If no parameters are provided, the program is run with default data from {@link WordCountData}.
This example shows how to:
- write a simple Flink Streaming program,
- use tuple data types,
- use basic windowing abstractions.
程序结构
// Skeleton: WindowWordCount reuses WordCount.Tokenizer and differs from
// WordCount only by inserting a count-based sliding window between
// keyBy and sum.
public class WindowWordCount {
public static void main(String[] args) throws Exception {
// ...
}
}
处理命令行参数、创建并设置执行环境
final CLI params = CLI.fromArgs(args);
// Create the execution environment. This is the main entrypoint
// to building a Flink application.
final StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
env.setRuntimeMode(params.getExecutionMode());
env.getConfig().setGlobalJobParameters(params);
选择默认文本
// Choose the input: read file(s) when --input was given, otherwise fall
// back to the bundled in-memory sample text (WordCountData.WORDS).
DataStream<String> text;
if (params.getInputs().isPresent()) {
// Build a file source that reads the input path(s) line by line.
FileSource.FileSourceBuilder<String> builder =
FileSource.forRecordStreamFormat(
new TextLineInputFormat(), params.getInputs().get());
// If --discovery-interval was set, keep watching the path for new files.
params.getDiscoveryInterval().ifPresent(builder::monitorContinuously);
// No event-time logic in this example, so no watermarks are needed.
text = env.fromSource(builder.build(), WatermarkStrategy.noWatermarks(), "file-input");
} else {
text = env.fromElements(WordCountData.WORDS).name("in-memory-input");
}
设置窗口大小
int windowSize = params.getInt("window").orElse(250);
int slideSize = params.getInt("slide").orElse(150);
统计词频
// Same pipeline as WordCount, but aggregated per sliding count window:
// for each key, emit a result every 'slideSize' elements, summing over
// the most recent 'windowSize' elements.
DataStream<Tuple2<String, Integer>> counts =
text.flatMap(new WordCount.Tokenizer())
.name("tokenizer")
.keyBy(value -> value.f0)
.countWindow(windowSize, slideSize)
.sum(1)
.name("counter");
sink
// Write results to a rolling file sink when --output was given,
// otherwise print each record to stdout.
if (params.getOutput().isPresent()) {
counts.sinkTo(
FileSink.<Tuple2<String, Integer>>forRowFormat(
params.getOutput().get(), new SimpleStringEncoder<>())
.withRollingPolicy(
// Roll to a new part file at 1 MiB or every 10 seconds,
// whichever comes first.
DefaultRollingPolicy.builder()
.withMaxPartSize(MemorySize.ofMebiBytes(1))
.withRolloverInterval(Duration.ofSeconds(10))
.build())
.build())
.name("file-sink");
} else {
counts.print().name("print-sink");
}
执行环境
// Apache Flink applications are composed lazily. Calling execute
// submits the Job and begins processing.
env.execute("WindowWordCount");
SocketWindowWordCount
导入需要的包
package org.apache.flink.streaming.examples.socket;
import org.apache.flink.api.common.functions.FlatMapFunction;
import org.apache.flink.api.common.typeinfo.Types;
import org.apache.flink.api.java.utils.ParameterTool;
import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.windowing.assigners.TumblingProcessingTimeWindows;
import org.apache.flink.streaming.api.windowing.time.Time;
使用说明
Implements a streaming windowed version of the "WordCount" program.
This program connects to a server socket and reads strings from the socket. The easiest way to try this out is to open a text server (at port 12345) using the netcat tool via
nc -l 12345 on Linux or nc -l -p 12345 on Windows
and run this example with the hostname and the port as arguments.
程序结构
- class SocketWindowWordCount
- main
- class WordWithCount
main函数
主机名和端口号
// Parse --hostname (default "localhost") and --port from the command line.
final String hostname;
final int port;
try {
final ParameterTool params = ParameterTool.fromArgs(args);
hostname = params.has("hostname") ? params.get("hostname") : "localhost";
// getInt throws when --port is missing or malformed, which triggers
// the usage message in the catch block below.
port = params.getInt("port");
} catch (Exception e) {
System.err.println(
"No port specified. Please run 'SocketWindowWordCount "
+ "--hostname <hostname> --port <port>', where hostname (localhost by default) "
+ "and port is the address of the text server");
System.err.println(
"To start a simple text server, run 'netcat -l <port>' and "
+ "type the input text into the command line");
// Deliberately exit quietly after printing usage instead of rethrowing.
return;
}
创建执行环境、获得数据
final StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
DataStream<String> text = env.socketTextStream(hostname, port, "\n");
统计数据
DataStream<WordWithCount> windowCounts =
text.flatMap(
(FlatMapFunction<String, WordWithCount>)
(value, out) -> {
for (String word : value.split("\\s")) {
out.collect(new WordWithCount(word, 1L));
}
},
Types.POJO(WordWithCount.class))
.keyBy(value -> value.word)
.window(TumblingProcessingTimeWindows.of(Time.seconds(5)))
.reduce((a, b) -> new WordWithCount(a.word, a.count + b.count))
.returns(WordWithCount.class);
windowCounts.print().setParallelism(1);
输出结果、关闭环境
windowCounts.print().setParallelism(1);
env.execute("Socket Window WordCount");
WordWithCount类
/**
 * Simple POJO pairing a word with its running count. Flink's POJO rules
 * require public fields (or getters/setters) and a public no-argument
 * constructor, which is why both appear here.
 */
public static class WordWithCount {
public String word;
public long count;

/** No-arg constructor required by Flink's POJO serializer; unused by hand-written code. */
@SuppressWarnings("unused")
public WordWithCount() {}

/** Creates a pair of {@code word} and its {@code count}. */
public WordWithCount(String word, long count) {
    this.word = word;
    this.count = count;
}

/** Renders as {@code word : count}, e.g. {@code flink : 3}. */
@Override
public String toString() {
    return String.format("%s : %d", word, count);
}
}
运行SocketWindowWordCount
- 打开一个终端
$ nc -l 9099
to be or not to be that is a question
- 再打开一个终端
$ cd /usr/local/flink
$ ./bin/flink run examples/streaming/SocketWindowWordCount.jar --hostname hadoop --port 9099
- 可以在第一个终端继续输入单词和句子,按 Ctrl + C 结束,SocketWindowWordCount 程序也终止运行。
- 在 log 文件夹查看结果
$ cd /usr/local/flink/log
$ cat *.out
配置文件信息
<!-- SocketWindowWordCount -->
<execution>
<id>SocketWindowWordCount</id>
<phase>package</phase>
<goals>
<goal>jar</goal>
</goals>
<configuration>
<classifier>SocketWindowWordCount</classifier>
<archive>
<manifestEntries>
<program-class>org.apache.flink.streaming.examples.socket.SocketWindowWordCount</program-class>
</manifestEntries>
</archive>
<includes>
<include>org/apache/flink/streaming/examples/socket/SocketWindowWordCount.class</include>
<include>org/apache/flink/streaming/examples/socket/SocketWindowWordCount$*.class</include>
<include>META-INF/LICENSE</include>
<include>META-INF/NOTICE</include>
</includes>
</configuration>
</execution>