flink 01--DataStream API 入门

426 阅读2分钟

基础示例

步骤&示例

创建 Flink 程序核心步骤如下:

  1. 创建StreamExecutionEnvironment环境
  2. 定义各种变换操作,这些操作都是惰性(lazy)的,即只是定义,在调用 execute() 之前不会真正地执行
  3. 调用 StreamExecutionEnvironment 的 execute() 方法,启动整个流程

代码示例

Example.java 代码:

package com.demo;

import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;

/**
 * Minimal DataStream job: builds a small in-memory stream of {@link Person}
 * records, keeps the ones older than 18 and prints their ids.
 */
public class Example {
    /**
     * Defines the pipeline and submits it for execution.
     *
     * @param args command-line arguments (unused)
     * @throws Exception if the job fails to execute
     */
    public static void main(String[] args) throws Exception {
        // Entry point of every DataStream program.
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();

        // Fixed set of elements; handy for tests and demos.
        DataStream<Person> people = env.fromElements(
                new Person(1001, "foo1", 19),
                new Person(1002, "foo2", 17),
                new Person(1003, "foo3", 24)
        );

        // Transformations are only declared here (lambda or method reference);
        // nothing runs until execute() is called below.
        DataStream<Person> adults = people.filter(p -> p.getAge() > 18);
        DataStream<Long> adultIds = adults.map(Person::getId);
        adultIds.print();

        // Trigger the actual execution of the declared pipeline.
        env.execute();
    }
}

Person.java 代码:

package com.demo;

/**
 * Mutable data holder describing a person (id, name, age).
 *
 * <p>The class follows the JavaBean/POJO shape that Flink expects for POJO
 * types: it is public, exposes a public no-argument constructor, and every
 * field has a getter and a setter.
 */
public class Person {
    private long id;
    private String name;
    private int age;

    /**
     * Creates an empty person; fields keep their Java defaults.
     *
     * <p>Must be public: Flink only recognizes a class as a POJO when it has a
     * public no-argument constructor, otherwise it is handled as a generic type.
     */
    public Person() {}

    /**
     * Creates a fully populated person.
     *
     * @param id   identifier of the person
     * @param name display name
     * @param age  age in years
     */
    public Person(long id, String name, int age) {
        this.id = id;
        this.name = name;
        this.age = age;
    }

    /**
     * Renders the person as {@code Person(id: <id>, name: <name>, age: <age>)}.
     *
     * @return human-readable description of this person
     */
    @Override
    public String toString() {
        return String.format("Person(id: %d, name: %s, age: %d)",
                this.id, this.name, this.age);
    }

    /** @return identifier of the person */
    public long getId() {
        return id;
    }

    /** @param id new identifier */
    public void setId(long id) {
        this.id = id;
    }

    /** @return display name */
    public String getName() {
        return name;
    }

    /** @param name new display name */
    public void setName(String name) {
        this.name = name;
    }

    /** @return age in years */
    public int getAge() {
        return age;
    }

    /** @param age new age in years */
    public void setAge(int age) {
        this.age = age;
    }
}

maven 文件

<?xml version="1.0" encoding="UTF-8"?>
<!--
  Maven build for the Flink DataStream example.
  Produces a jar whose manifest points at com.demo.Example as the main class.
-->
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>

    <groupId>org.example</groupId>
    <artifactId>myflink</artifactId>
    <packaging>jar</packaging>
    <version>1.0-SNAPSHOT</version>

    <!-- Compile for Java 11 (both source and target level). -->
    <properties>
        <maven.compiler.source>11</maven.compiler.source>
        <maven.compiler.target>11</maven.compiler.target>
    </properties>

    <!--
      Both Flink dependencies use scope "provided": they are on the compile
      classpath but are not bundled into the jar.
      NOTE(review): this presumably assumes the job is submitted to a Flink
      cluster that supplies these jars; running straight from an IDE may need
      provided-scope dependencies on the runtime classpath - confirm.
    -->
    <dependencies>
        <!-- https://mvnrepository.com/artifact/org.apache.flink/flink-java -->
        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-java</artifactId>
            <version>1.14.3</version>
            <scope>provided</scope>
        </dependency>

        <!-- https://mvnrepository.com/artifact/org.apache.flink/flink-streaming-java -->
        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-streaming-java_2.12</artifactId>
            <version>1.14.3</version>
            <scope>provided</scope>
        </dependency>

    </dependencies>

    <build>
        <plugins>
            <!-- Configures the jar manifest (classpath entry and main class). -->
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-jar-plugin</artifactId>
                <version>3.0.2</version>
                <configuration>
                    <archive>
                        <manifest>
                            <addClasspath>true</addClasspath>
                            <mainClass>com.demo.Example</mainClass> <!-- main entry point of the job -->
                        </manifest>
                    </archive>
                </configuration>
            </plugin>
        </plugins>
    </build>

</project>

这样,我们就能打印出年龄超过 18 岁的 Person 的 id 信息了(按上面的示例数据,即 1001 和 1003)

进阶练习

自定义数据源,在原来的代码上,新增PersonGenerator.java,代码内容如下:

package com.demo;

import org.apache.flink.streaming.api.functions.source.SourceFunction;

import java.util.Random;

/**
 * Custom source implementing {@link SourceFunction}: emits one {@link Person}
 * with a random age roughly every second until {@link #cancel()} is called.
 */
public class PersonGenerator implements SourceFunction<Person> {
    /** Id given to the next emitted person; the first one gets 1000. */
    private int id = 1000;
    /** Loop flag for {@link #run}; cleared by {@link #cancel()}. */
    private volatile boolean isRunning = true;

    PersonGenerator() {
    }

    /**
     * Produces the stream: builds a person named {@code foo<n>} (n counts up
     * from 0) with an age in the range 0..100 and hands it to the context.
     *
     * @param sourceContext context used to emit records
     * @throws Exception if the emitting thread is interrupted while sleeping
     */
    @Override
    public void run(SourceContext<Person> sourceContext) throws Exception {
        final Random ageSource = new Random();
        while (isRunning) {
            final int sequence = id - 1000;
            final Person next = new Person(id, "foo" + sequence, ageSource.nextInt(101));
            // Emit the record downstream.
            sourceContext.collect(next);
            id++;
            // Throttle to about one record per second so the output stays readable.
            Thread.sleep(1000L);
        }
    }

    /** Required by the interface: stops the emit loop in {@link #run}. */
    @Override
    public void cancel() {
        isRunning = false;
    }
}

原来的Example.java 更改为如下代码:

package com.demo;

import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.functions.sink.PrintSinkFunction;

/**
 * DataStream job that reads {@link Person} records from {@link PersonGenerator},
 * keeps those older than 50 and writes their string form to a print sink.
 */
public class Example {
    /**
     * Defines the pipeline and submits it under the name "Person Age Filter".
     *
     * @param args command-line arguments (unused)
     * @throws Exception if the job fails to execute
     */
    public static void main(String[] args) throws Exception {
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();

        // Attach the custom source.
        DataStream<Person> people = env.addSource(new PersonGenerator());

        DataStream<Person> overFifty = people.filter(p -> p.getAge() > 50);
        DataStream<String> descriptions = overFifty.map(Person::toString);

        // Explicit sink instead of print(): records go to the task's standard output/log.
        descriptions.addSink(new PrintSinkFunction<>());

        env.execute("Person Age Filter");
    }
}

这样,日志就会输出年龄大于50的Person了:

Person(id: 1001, name: foo1, age: 61)
Person(id: 1004, name: foo4, age: 75)
Person(id: 1005, name: foo5, age: 96)
Person(id: 1006, name: foo6, age: 60)
Person(id: 1007, name: foo7, age: 80)
Person(id: 1008, name: foo8, age: 85)
Person(id: 1015, name: foo15, age: 97)
Person(id: 1017, name: foo17, age: 95)
Person(id: 1018, name: foo18, age: 61)

我们可以在http://localhost:8081/#/overview 中观测任务运行情况:

image.png

也可以直接取消:

image.png