SparkStreaming[java]案例-用户广告点击

99 阅读10分钟

概述

本案例是基于用户广告点击数据的统计,一共有3个小的需求。

  • 实现实时的动态黑名单机制:将每天对某个广告点击超过k次的用户拉黑,k为按实际需求设定的点击次数阈值
  • 实时统计每天各地区各城市各广告的点击总流量,并将其存入MySQL
  • 统计最近一个小时的广告点击量

数据

数据主要包括:时间戳、地区、城市、用户ID、广告ID 数据示例如下:

2024-03-15 14:55:28.408073,东北,沈阳,3,3
2024-03-15 14:55:28.408175,华中,长沙,5,3
2024-03-15 14:55:28.408212,华中,长沙,4,4
2024-03-15 14:55:28.408273,华东,济南,4,6
2024-03-15 14:55:28.408334,华北,北京,6,5
2024-03-15 14:55:28.408393,华南,深圳,6,3
2024-03-15 14:55:28.408450,西北,银川,4,5
2024-03-15 14:55:28.408510,华东,青岛,1,3
2024-03-15 14:55:28.408573,华南,广州,2,6
2024-03-15 14:55:28.408674,东北,沈阳,1,5

数据生成模块

数据生成采用了之前flink的测试数据的python脚本,在原来基础上新增了一些功能,数据生成核心还是数据类以及数据生成器类:

@dataclass
class UserClickInfo(object):
    # One synthetic ad-click event emitted by the generator.
    timestamp: str   # event time, str(datetime.now()) format
    area: str        # region name, e.g. "华北"
    city_name: str   # city name, e.g. "北京"
    user_id: int     # clicking user id
    ad_id: int       # clicked ad id

    def to_str(self, data_type='text', patter="|") -> str:
        """Serialize this event: delimiter-joined text, JSON, or None for unknown types."""
        fmt = data_type.lower()
        if fmt == "json":
            return dumps(self.__dict__, ensure_ascii=False)
        if fmt == "text":
            return patter.join(str(field) for field in self.__dict__.values())
        return None


class UserClickGenerator(DataGenerator):
    """Produces random UserClickInfo records serialized via make_data."""

    # FIX: previously only "data_type" was whitelisted, so make_data's
    # param_check silently dropped the id/ad_id/patter kwargs that
    # local_params is designed to accept.
    PARAMS = ["data_type", "id", "ad_id", "patter"]

    def __init__(self) -> None:
        # (city_id, city_name, area) — replaces the tab-separated string table,
        # which was fragile to whitespace changes.
        self.city_data = [
            ("1", "北京", "华北"), ("2", "上海", "华东"), ("3", "深圳", "华南"),
            ("4", "广州", "华南"), ("5", "武汉", "华中"), ("6", "南京", "华东"),
            ("7", "天津", "华北"), ("8", "成都", "西南"), ("9", "哈尔滨", "东北"),
            ("10", "大连", "东北"), ("11", "沈阳", "东北"), ("12", "西安", "西北"),
            ("13", "长沙", "华中"), ("14", "重庆", "西南"), ("15", "济南", "华东"),
            ("16", "石家庄", "华北"), ("17", "银川", "西北"), ("18", "杭州", "华东"),
            ("19", "保定", "华北"), ("20", "福州", "华南"), ("21", "贵阳", "西南"),
            ("22", "青岛", "华东"), ("23", "苏州", "华东"), ("24", "郑州", "华北"),
            ("25", "无锡", "华东"), ("26", "厦门", "华南"),
        ]
        self.city_nums = len(self.city_data)

    @staticmethod
    def get_current_time() -> str:
        # str(datetime.now()) -> "YYYY-MM-DD HH:MM:SS.ffffff" (6 fractional digits)
        return str(datetime.now())

    def necessary_params(self) -> List[str]:
        """Return the parameter names make_data accepts as kwargs."""
        return self.PARAMS

    def param_check(self, param_name: str) -> Union[bool, None]:
        """True if param_name is a supported make_data kwarg."""
        # `in` is the idiomatic form of __contains__
        return param_name in self.PARAMS

    @staticmethod
    def get_randint(num1: int, num2: int) -> int:
        return randint(num1, num2)

    @staticmethod
    def get_id() -> int:
        # user/ad ids are drawn uniformly from 1..6
        return randint(1, 6)

    def get_city(self, index: int) -> str:
        """City name for the given index into city_data."""
        return self.city_data[index][1]

    def get_area(self, index: int) -> str:
        """Area (region) name for the given index into city_data."""
        return self.city_data[index][2]

    def make_data(self, data_type: str = "text", *args, **kwargs) -> Any:
        """Build one random click record and serialize it.

        Supported kwargs (see PARAMS): data_type ("text"/"json"), id, ad_id,
        patter (text delimiter, default ","). Unset id/ad_id are randomized.
        """
        local_params = {"data_type": data_type, "id": None, "ad_id": None, "patter": ","}
        for key, value in kwargs.items():
            if self.param_check(key):
                local_params[key] = value
        if data_type is None:
            local_params["data_type"] = "text"
        if local_params.get("id") is None:
            local_params["id"] = self.get_id()
        if local_params.get("ad_id") is None:
            local_params["ad_id"] = self.get_id()
        city_index = randint(0, self.city_nums - 1)

        user_click = UserClickInfo(
            timestamp=self.get_current_time(),
            area=self.get_area(city_index),
            city_name=self.get_city(city_index),
            user_id=local_params["id"],
            ad_id=local_params["ad_id"],
        )
        return user_click.to_str(data_type=local_params["data_type"],
                                 patter=local_params["patter"])

    def get_data_class_name(self):
        return "UserClickInfo"

需求实现

工具类准备

时间戳解析工具类

package src.main.blackUserList;

import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.time.LocalDateTime;
import java.time.format.DateTimeFormatter;
import java.util.Date;

public class TimestampUtils {
    // The producer emits str(datetime.now()) => "yyyy-MM-dd HH:mm:ss" plus SIX
    // fractional digits (microseconds).
    // FIX: the previous SimpleDateFormat("...SSS") parsed the 6-digit microsecond
    // field as milliseconds — ".408073" became +408 seconds, shifting the minute
    // and even the date near midnight. DateTimeFormatter is also thread-safe,
    // so the formatters can be cached as constants.
    private static final DateTimeFormatter INPUT_FORMAT =
            DateTimeFormatter.ofPattern("yyyy-MM-dd HH:mm:ss.SSSSSS");
    private static final DateTimeFormatter DATE_FORMAT =
            DateTimeFormatter.ofPattern("yyyy-MM-dd");
    private static final DateTimeFormatter HOUR_MINUTE_FORMAT =
            DateTimeFormatter.ofPattern("HH:mm");

    /**
     * Extracts the calendar date ("yyyy-MM-dd") from a log timestamp
     * such as "2024-03-15 14:55:28.408073".
     *
     * @throws ParseException retained for signature compatibility; no longer thrown
     */
    public static String parseTimestamp(String timestamp) throws ParseException {
        return LocalDateTime.parse(timestamp, INPUT_FORMAT).format(DATE_FORMAT);
    }

    /**
     * Extracts the hour-and-minute ("HH:mm") from a log timestamp.
     *
     * @throws ParseException retained for signature compatibility; no longer thrown
     */
    public static String parseTimestampHour(String timestamp) throws ParseException {
        return LocalDateTime.parse(timestamp, INPUT_FORMAT).format(HOUR_MINUTE_FORMAT);
    }
}

kafka工具类

package src.main.blackUserList;

import org.apache.kafka.clients.consumer.ConsumerConfig;
import org.apache.kafka.clients.consumer.ConsumerRecord;
import org.apache.spark.streaming.api.java.JavaInputDStream;
import org.apache.spark.streaming.api.java.JavaStreamingContext;
import org.apache.spark.streaming.kafka010.ConsumerStrategies;
import org.apache.spark.streaming.kafka010.KafkaUtils;
import org.apache.spark.streaming.kafka010.LocationStrategies;

import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

/**
 * Helper that builds direct Kafka DStreams with a fixed consumer configuration.
 */
public class kafkaUtils {
    // Consumer properties shared by every stream this helper creates.
    private Map<String, Object> kafkaConfig = new HashMap<String, Object>();

    public kafkaUtils() {
        this.setKafkaConfig();
    }

    private void setKafkaConfig(){
        kafkaConfig.put(ConsumerConfig.BOOTSTRAP_SERVERS_CONFIG, "192.168.141.177:9092");
        kafkaConfig.put(ConsumerConfig.GROUP_ID_CONFIG, "SPARKTEXT01");
        kafkaConfig.put(ConsumerConfig.KEY_DESERIALIZER_CLASS_CONFIG, "org.apache.kafka.common.serialization.StringDeserializer");
        kafkaConfig.put(ConsumerConfig.VALUE_DESERIALIZER_CLASS_CONFIG, "org.apache.kafka.common.serialization.StringDeserializer");
        // FIX: AUTO_OFFSET_RESET_DOC is the javadoc text for the setting, not the
        // config key — the "latest" reset policy was silently ignored before.
        kafkaConfig.put(ConsumerConfig.AUTO_OFFSET_RESET_CONFIG, "latest");
        // NOTE(review): auto-commit with Spark Streaming can commit offsets before
        // a batch is fully processed (at-most-once semantics) — confirm this is
        // acceptable, otherwise commit offsets manually after each batch.
        kafkaConfig.put(ConsumerConfig.ENABLE_AUTO_COMMIT_CONFIG, true);
    }

    /**
     * Creates a direct Kafka input stream subscribed to a single topic.
     *
     * @param topic            the topic to subscribe to
     * @param streamingContext the active JavaStreamingContext
     * @return the raw record stream (String key/value)
     */
    public JavaInputDStream<ConsumerRecord<String, String>> createKafkaDS(String topic, JavaStreamingContext streamingContext){
        List<String> topicList = Arrays.asList(topic);  // ConsumerStrategies needs a Collection

        JavaInputDStream<ConsumerRecord<String, String>> kafkaStream = KafkaUtils.createDirectStream(
                streamingContext,
                LocationStrategies.PreferConsistent(),
                ConsumerStrategies.<String, String>Subscribe(
                        topicList,
                        this.kafkaConfig
                )
        );
        return kafkaStream;
    }

}

MySQL连接及查询更新工具类

package src.main.blackUserList;

import javax.sql.DataSource;
import com.alibaba.druid.pool.DruidDataSourceFactory;


import java.sql.*;
import java.util.*;

/**
 * @projectName: sparkDemo
 * @package: src.main.blackUserList
 * @className: MysqlUtil
 * @author: NelsonWu
 * @description: TODO
 * @date: 2024/3/13 15:26
 * @version: 1.0
 */
/**
 * Druid-pooled MySQL helper: connection hand-out plus small query/update wrappers.
 */
public class MysqlUtil {
    // Public so callers can reach the raw pool if needed.
    public DataSource dataSource = init();

    public MysqlUtil() throws Exception {
    }

    /**
     * Builds the Druid connection pool.
     * NOTE(review): credentials and the JDBC URL are hard-coded; move them to
     * external configuration before any non-test use.
     */
    public DataSource init() throws Exception {
        Properties properties = new Properties();
        properties.setProperty("driverClassName", "com.mysql.cj.jdbc.Driver");
        properties.setProperty("url", "jdbc:mysql://192.168.141.177:3306/sparktest");
        properties.setProperty("username", "root");
        properties.setProperty("password", "mysql");
        properties.setProperty("maxActive", "50");
        return DruidDataSourceFactory.createDataSource(properties);
    }

    public Connection getConnection() throws SQLException {
        return this.dataSource.getConnection();
    }

    /**
     * Executes a single parameterized DML statement and commits it.
     *
     * FIX: the statement now executes even when {@code params} is null/empty —
     * previously a parameterless statement silently returned 0 without running.
     * The PreparedStatement is closed via try-with-resources (it used to leak
     * whenever an SQLException was thrown), and the transaction is rolled back
     * on failure instead of leaving the connection half-committed.
     *
     * @return number of affected rows
     */
    public Integer executUpdate(Connection connection, String sql, ArrayList<Object> params){
        Integer result = 0;
        try (PreparedStatement preparedStatement = connection.prepareStatement(sql)) {
            connection.setAutoCommit(false);
            if (params != null) {
                for (int i = 0; i < params.size(); i++) {
                    preparedStatement.setObject(i + 1, params.get(i));
                }
            }
            result = preparedStatement.executeUpdate();
            connection.commit();
        } catch (SQLException e) {
            try {
                connection.rollback();
            } catch (SQLException ignored) {
                // best-effort rollback; the original failure is rethrown below
            }
            throw new RuntimeException("executUpdate failed: " + sql, e);
        }
        return result;
    }

    /**
     * Returns true when the parameterized query yields at least one row.
     * Both the statement and the result set are closed deterministically.
     */
    public boolean isExist(Connection connection, String sql, ArrayList<Object> params){
        try (PreparedStatement preparedStatement = connection.prepareStatement(sql)) {
            for (int i = 0; i < params.size(); i++) {
                preparedStatement.setObject(i + 1, params.get(i));
            }
            try (ResultSet resultSet = preparedStatement.executeQuery()) {
                return resultSet.next();
            }
        } catch (SQLException e) {
            throw new RuntimeException(e);
        }
    }

    /**
     * Runs a parameterized query and materializes each row as a column-name ->
     * value map, restricted to the columns listed in {@code outputParams}.
     *
     * @param params       bind values for the statement's ? placeholders
     * @param outputParams column names to copy out of each row
     */
    public ArrayList<HashMap<String, Object>> queryData(Connection connection, String sql, List<Object> params, List<Object> outputParams){
        ArrayList<HashMap<String, Object>> resultData = new ArrayList<HashMap<String, Object>>();
        try (PreparedStatement preparedStatement = connection.prepareStatement(sql)) {
            for (int i = 0; i < params.size(); i++) {
                preparedStatement.setObject(i + 1, params.get(i));
            }
            try (ResultSet resultSet = preparedStatement.executeQuery()) {
                while (resultSet.next()) {
                    HashMap<String, Object> currentData = new HashMap<>();
                    for (Object key : outputParams) {
                        currentData.put(key.toString(), resultSet.getObject(key.toString()));
                    }
                    resultData.add(currentData);
                }
            }
        } catch (SQLException e) {
            throw new RuntimeException(e);
        }
        return resultData;
    }

    /** Ad-hoc smoke test: print the five highest user_ad_count rows. */
    public static void main(String[] args) throws Exception {
        MysqlUtil mysqlUtil = new MysqlUtil();
        Connection connection = mysqlUtil.getConnection();
        String sql = "select dt, userid, adid, count from user_ad_count order by count desc limit 5";
        ArrayList<Object> arrayList = new ArrayList<>();
        arrayList.add("dt");
        arrayList.add("userid");
        arrayList.add("adid");
        arrayList.add("count");

        ArrayList<HashMap<String, Object>> outputData = mysqlUtil.queryData(connection, sql, new ArrayList<Object>(), arrayList);
        for (HashMap<String, Object> outputDatum : outputData) {
            for (Map.Entry<String, Object> stringObjectEntry : outputDatum.entrySet()) {
                System.out.println(stringObjectEntry.getKey() + "->" + stringObjectEntry.getValue());
            }
        }

        connection.close();
    }

}

需求一:实时的动态黑名单机制

消费Kafka中用户的点击日志数据,按照日期、用户及广告进行聚合统计,判断该用户点击某广告的次数是否达到阈值(测试中设定为30次),达到阈值则将该用户拉入黑名单(MySQL的black_list表),并将该用户当天对该广告的点击次数汇总到MySQL的user_ad_count表中。 MySQL数据表如下:

CREATE TABLE `black_list` (
  `userid` varchar(10) NOT NULL,
  PRIMARY KEY (`userid`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_0900_ai_ci;

CREATE TABLE `user_ad_count` (
  `dt` varchar(50) NOT NULL,
  `userid` varchar(10) NOT NULL,
  `adid` varchar(10) NOT NULL,
  `count` bigint DEFAULT NULL,
  PRIMARY KEY (`dt`,`userid`,`adid`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_0900_ai_ci;

Java代码:

package src.main.blackUserList;


import org.apache.kafka.clients.consumer.ConsumerRecord;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.api.java.function.PairFunction;
import org.apache.spark.api.java.function.VoidFunction;
import org.apache.spark.streaming.Durations;
import org.apache.spark.streaming.api.java.JavaDStream;
import org.apache.spark.streaming.api.java.JavaInputDStream;
import org.apache.spark.streaming.api.java.JavaStreamingContext;
import scala.Tuple2;
import scala.Tuple3;

import java.sql.Connection;
import java.sql.PreparedStatement;
import java.sql.ResultSet;
import java.util.ArrayList;
import java.util.Arrays;

/**
 * @description: 从Kafka数据源中读取数据并进行处理,判断当天用户点击某广告大于等于30次则加入黑名单中
 */
/**
 * Requirement 1: dynamic blacklist.
 * Consumes ad-click logs from Kafka, aggregates clicks per (date, user, ad),
 * accumulates totals in MySQL (user_ad_count), and blacklists any user whose
 * daily per-ad total reaches the threshold (black_list table).
 */
public class sparkBlackListHandler2 {
    /** Daily per-ad click count at which a user is blacklisted. */
    private static final int BLACKLIST_THRESHOLD = 30;

    /** All blacklisted user ids. */
    public static String sql = "select userid from black_list";
    /** Idempotent blacklist insert. */
    public static String sql2 = "insert into black_list (userid) values(?) on duplicate key update userid=?";
    /** Existing (dt, userid, adid) aggregation row. */
    public static String sql4 = "select dt, userid, adid, count from user_ad_count where dt=? and userid=? and adid=?";
    /** Overwrite the accumulated count of an existing row. */
    public static String sql5 = "update user_ad_count set count=? where dt=? and userid=? and adid=?";
    /** Insert a fresh aggregation row. */
    public static String sql6 = "insert into user_ad_count (dt, userid, adid, count) values(?,?,?,?)";

    /** Collects varargs into the ArrayList shape expected by MysqlUtil. */
    public static ArrayList<Object> makeList(Object... args){
        ArrayList<Object> params = new ArrayList<>();
        params.addAll(Arrays.asList(args));
        return params;
    }

    public static void main(String[] args) throws InterruptedException {
        SparkConf sparkAppConf = new SparkConf().setMaster("local[2]").setAppName("sparkStreamingDemo");
        JavaStreamingContext javaStreamingContext = new JavaStreamingContext(sparkAppConf, Durations.seconds(30));

        kafkaUtils kafkaUtils = new kafkaUtils();
        JavaInputDStream<ConsumerRecord<String, String>> kafkaDS = kafkaUtils.createKafkaDS("test", javaStreamingContext);

        // Parse each CSV record "timestamp,area,city,userid,adid" into an AdsLog bean.
        JavaDStream<AdsLog> adsLogJavaDStream = kafkaDS.map(new Function<ConsumerRecord<String, String>, AdsLog>() {
            @Override
            public AdsLog call(ConsumerRecord<String, String> record) throws Exception {
                String[] values = record.value().split(",");
                return new AdsLog(values[0], values[1], values[2], values[3], values[4]);
            }
        });

        // Per batch: reload the blacklist, drop blacklisted users, and aggregate
        // click counts by (date, userid, adid).
        JavaDStream<Tuple2<Tuple3<String, String, String>, Integer>> dStream = adsLogJavaDStream.transform(new Function<JavaRDD<AdsLog>, JavaRDD<Tuple2<Tuple3<String, String, String>, Integer>>>() {
            @Override
            public JavaRDD<Tuple2<Tuple3<String, String, String>, Integer>> call(JavaRDD<AdsLog> adsLogJavaRDD) throws Exception {

                // Refresh the blacklist from MySQL once per batch (runs on the driver).
                ArrayList<String> blackList = new ArrayList<>();
                MysqlUtil mysqlUtil = new MysqlUtil();
                Connection connection = mysqlUtil.getConnection();
                PreparedStatement blackper = connection.prepareStatement(sparkBlackListHandler2.sql);
                ResultSet resultSet = blackper.executeQuery();
                while (resultSet.next()) {
                    blackList.add(resultSet.getString(1));
                }
                resultSet.close();
                blackper.close();
                connection.close();

                // Drop events produced by blacklisted users.
                JavaRDD<AdsLog> filterRDD = adsLogJavaRDD.filter(new Function<AdsLog, Boolean>() {
                    @Override
                    public Boolean call(AdsLog adsLog) throws Exception {
                        return !blackList.contains(adsLog.userid);
                    }
                });

                // ((date, userid, adid), 1) summed per key within this batch.
                JavaPairRDD<Tuple3<String, String, String>, Integer> reduceRDD = filterRDD.mapToPair(new PairFunction<AdsLog, Tuple3<String, String, String>, Integer>() {
                    @Override
                    public Tuple2<Tuple3<String, String, String>, Integer> call(AdsLog adsLog) throws Exception {
                        String dt = TimestampUtils.parseTimestamp(adsLog.getTimestamp());
                        return new Tuple2<>(new Tuple3<>(dt, adsLog.getUserid(), adsLog.getAdis()), 1);
                    }
                }).reduceByKey((x, y) -> x + y);

                // Identity map only converts the JavaPairRDD into the JavaRDD
                // element type expected by the enclosing DStream.
                return reduceRDD.map(value -> value);
            }
        });

        // Persist the per-batch aggregates and maintain the blacklist.
        dStream.foreachRDD(
                rdd -> {
                    // NOTE(review): one MySQL connection is opened per record;
                    // foreachPartition with a shared connection would be cheaper — confirm.
                    rdd.foreach(new VoidFunction<Tuple2<Tuple3<String, String, String>, Integer>>() {
                        @Override
                        public void call(Tuple2<Tuple3<String, String, String>, Integer> value) throws Exception {
                            Tuple3<String, String, String> clickInfo = value._1();
                            Integer count = value._2();
                            String dt = clickInfo._1();
                            String userid = clickInfo._2();
                            String adid = clickInfo._3();
                            MysqlUtil mysqlUtil = new MysqlUtil();
                            Connection connection = mysqlUtil.getConnection();

                            Integer totalCnt = count;  // running total: batch count + stored count

                            if (count >= BLACKLIST_THRESHOLD){
                                // 1. This batch alone crosses the threshold: blacklist the user.
                                mysqlUtil.executUpdate(connection, sparkBlackListHandler2.sql2, makeList(userid, userid));

                                // 1.2 Fold this batch's clicks into user_ad_count.
                                PreparedStatement preparedStatement = connection.prepareStatement(sparkBlackListHandler2.sql4);
                                preparedStatement.setObject(1, dt);
                                preparedStatement.setObject(2, userid);
                                preparedStatement.setObject(3, adid);
                                ResultSet resultSet = preparedStatement.executeQuery();
                                if (resultSet.next()){
                                    // 1.2.1 Row exists: accumulate and update.
                                    totalCnt += resultSet.getInt("count");
                                    mysqlUtil.executUpdate(connection, sparkBlackListHandler2.sql5, makeList(totalCnt, dt, userid, adid));
                                }
                                else{
                                    // 1.2.2 No row yet: insert.
                                    mysqlUtil.executUpdate(connection, sparkBlackListHandler2.sql6, makeList(dt, userid, adid, totalCnt));
                                }
                                // FIX: these leaked in this branch before — only the
                                // else-branch below closed its statement/result set.
                                resultSet.close();
                                preparedStatement.close();
                            }
                            else {
                                // 2. Below threshold: accumulate into user_ad_count first.
                                PreparedStatement preparedStatement = connection.prepareStatement(sparkBlackListHandler2.sql4);
                                preparedStatement.setObject(1, dt);
                                preparedStatement.setObject(2, userid);
                                preparedStatement.setObject(3, adid);

                                ResultSet resultSet = preparedStatement.executeQuery();
                                if (resultSet.next()){
                                    // 2.2 Row exists: accumulate and update.
                                    totalCnt += resultSet.getInt("count");
                                    mysqlUtil.executUpdate(connection, sparkBlackListHandler2.sql5, makeList(totalCnt, dt, userid, adid));
                                }
                                else {
                                    // 2.3 No row yet: insert.
                                    mysqlUtil.executUpdate(connection, sparkBlackListHandler2.sql6, makeList(dt, userid, adid, totalCnt));
                                }
                                resultSet.close();
                                preparedStatement.close();

                                // 2.4 Blacklist if the accumulated total crosses the threshold.
                                if (totalCnt >= BLACKLIST_THRESHOLD){
                                    mysqlUtil.executUpdate(connection, sparkBlackListHandler2.sql2, makeList(userid, userid));
                                }
                            }

                            connection.close();
                        }
                    });
                }
        );

        javaStreamingContext.start();

        javaStreamingContext.awaitTermination();

    }
}

image.png

需求二:统计每天各地区各城市各广告的点击总流量

需要将统计后的结果写入MySQL中:

CREATE TABLE `area_city_count` (
  `dt` varchar(50) NOT NULL,
  `area` varchar(50) NOT NULL,
  `city` varchar(50) NOT NULL,
  `adid` varchar(50) NOT NULL,
  `count` bigint DEFAULT NULL,
  PRIMARY KEY (`dt`,`area`,`city`,`adid`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_0900_ai_ci;

package src.main.blackUserList;


import org.apache.kafka.clients.consumer.ConsumerRecord;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.api.java.function.PairFunction;
import org.apache.spark.api.java.function.VoidFunction;
import org.apache.spark.streaming.Durations;
import org.apache.spark.streaming.api.java.JavaDStream;
import org.apache.spark.streaming.api.java.JavaInputDStream;
import org.apache.spark.streaming.api.java.JavaStreamingContext;
import scala.Tuple2;
import scala.Tuple4;

import java.sql.Connection;
import java.sql.PreparedStatement;
import java.sql.ResultSet;
import java.util.ArrayList;
import java.util.HashMap;

/**
 * @description: 从Kafka数据源中读取数据并进行处理,统计当天各地区各城市各广告的点击量
 */
/**
 * Requirement 2: daily click totals per (area, city, ad), persisted to the
 * MySQL area_city_count table. Blacklisted users' clicks are excluded.
 */
public class sparkBlackListHandler3 {
    public static String sql = "select userid from black_list";

    public static void main(String[] args) throws InterruptedException {
        SparkConf sparkAppConf = new SparkConf().setMaster("local[2]").setAppName("sparkStreamingDemo");
        JavaStreamingContext javaStreamingContext = new JavaStreamingContext(sparkAppConf, Durations.seconds(30));

        kafkaUtils kafkaUtils = new kafkaUtils();
        JavaInputDStream<ConsumerRecord<String, String>> kafkaDS = kafkaUtils.createKafkaDS("test", javaStreamingContext);

        // Parse each CSV record "timestamp,area,city,userid,adid" into an AdsLog bean.
        JavaDStream<AdsLog> adsLogJavaDStream = kafkaDS.map(new Function<ConsumerRecord<String, String>, AdsLog>() {
            @Override
            public AdsLog call(ConsumerRecord<String, String> record) throws Exception {
                String[] values = record.value().split(",");
                return new AdsLog(values[0], values[1], values[2], values[3], values[4]);
            }
        });

        // Per batch: reload the blacklist, drop blacklisted users, and aggregate
        // click counts by (date, area, city, adid).
        JavaDStream<Tuple2<Tuple4<String, String, String, String>, Integer>> dStream = adsLogJavaDStream.transform(new Function<JavaRDD<AdsLog>, JavaRDD<Tuple2<Tuple4<String, String, String, String>, Integer>>>() {
            @Override
            public JavaRDD<Tuple2<Tuple4<String, String, String, String>, Integer>> call(JavaRDD<AdsLog> adsLogJavaRDD) throws Exception {

                // Refresh the blacklist from MySQL once per batch (runs on the driver).
                ArrayList<String> blackList = new ArrayList<>();
                MysqlUtil mysqlUtil = new MysqlUtil();
                Connection connection = mysqlUtil.getConnection();
                PreparedStatement blackper = connection.prepareStatement(sparkBlackListHandler2.sql);
                ResultSet resultSet = blackper.executeQuery();
                while (resultSet.next()) {
                    blackList.add(resultSet.getString(1));
                }
                resultSet.close();
                blackper.close();
                connection.close();

                // Drop events produced by blacklisted users.
                JavaRDD<AdsLog> filterRDD = adsLogJavaRDD.filter(new Function<AdsLog, Boolean>() {
                    @Override
                    public Boolean call(AdsLog adsLog) throws Exception {
                        return !blackList.contains(adsLog.userid);
                    }
                });

                // ((date, area, city, adid), 1) summed per key within this batch.
                JavaPairRDD<Tuple4<String, String, String, String>, Integer> reduceRDD = filterRDD.mapToPair(new PairFunction<AdsLog, Tuple4<String, String, String, String>, Integer>() {
                    @Override
                    public Tuple2<Tuple4<String, String, String, String>, Integer> call(AdsLog adsLog) throws Exception {
                        String dt = TimestampUtils.parseTimestamp(adsLog.getTimestamp());
                        return new Tuple2<>(new Tuple4<>(dt, adsLog.getArea(), adsLog.getCity(), adsLog.getAdis()), 1);
                    }
                }).reduceByKey((x, y) -> x + y);

                // Identity map only converts the JavaPairRDD into the JavaRDD
                // element type expected by the enclosing DStream.
                return reduceRDD.map(value -> value);
            }
        });

        // Accumulate each per-batch aggregate into area_city_count.
        dStream.foreachRDD(
                rdd -> {
                    rdd.foreach(new VoidFunction<Tuple2<Tuple4<String, String, String, String>, Integer>>() {
                        @Override
                        public void call(Tuple2<Tuple4<String, String, String, String>, Integer> tuple4IntegerTuple2) throws Exception {
                            Tuple4<String, String, String, String> adlog = tuple4IntegerTuple2._1();
                            String dt = adlog._1();
                            String area = adlog._2();
                            String city = adlog._3();
                            String adid = adlog._4();

                            Integer totalCnt = tuple4IntegerTuple2._2();
                            MysqlUtil mysqlUtil = new MysqlUtil();
                            Connection connection = mysqlUtil.getConnection();

                            // 1. Does a row for this (dt, area, city, adid) already exist?
                            String sql = "select dt, area, city, adid, count from area_city_count where dt=? and area=? and city=? and adid=?";
                            // FIX: bind the actual row values — previously the literal
                            // strings "dt"/"area"/"city"/"adid" were bound as parameters,
                            // so the lookup never matched and counts never accumulated.
                            // Also read back "count", which the update branch needs
                            // (it was missing from the output columns, yielding null).
                            ArrayList<Object> bindParams = Utils.makeList(dt, area, city, adid);
                            ArrayList<Object> outputCols = Utils.makeList("dt", "area", "city", "adid", "count");
                            ArrayList<HashMap<String, Object>> hashMaps = mysqlUtil.queryData(connection, sql, bindParams, outputCols);

                            if (hashMaps.isEmpty()){
                                // 1.1 No row yet: insert this batch's count.
                                String insert_sql = "insert into area_city_count(dt, area, city, adid, count) values(?, ?, ?, ?, ?) on duplicate key update dt=?, area=?, city=?, adid=?";
                                ArrayList<Object> arrayList = Utils.makeList(dt, area, city, adid, totalCnt, dt, area, city, adid);
                                mysqlUtil.executUpdate(connection, insert_sql, arrayList);
                            }
                            else{
                                // 1.2 Row exists: accumulate and update. The table's
                                // primary key guarantees at most one matching row.
                                HashMap<String, Object> value = hashMaps.get(0);
                                totalCnt += Integer.valueOf(value.get("count").toString());

                                String update_sql = "update area_city_count set count=? where dt=? and area=? and city=? and adid=?";
                                ArrayList<Object> update_params = Utils.makeList(totalCnt, dt, area, city, adid);
                                mysqlUtil.executUpdate(connection, update_sql, update_params);
                            }

                            connection.close();
                        }
                    });
                }
        );

        javaStreamingContext.start();

        javaStreamingContext.awaitTermination();

    }
}

image.png

需求三:统计最近一个小时的广告点击量

package src.main.blackUserList;


import org.apache.kafka.clients.consumer.ConsumerRecord;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.api.java.function.PairFunction;
import org.apache.spark.streaming.Durations;
import org.apache.spark.streaming.api.java.JavaDStream;
import org.apache.spark.streaming.api.java.JavaInputDStream;
import org.apache.spark.streaming.api.java.JavaPairDStream;
import org.apache.spark.streaming.api.java.JavaStreamingContext;
import scala.Tuple2;


/**
 * @description: 从Kafka数据源中读取数据并进行处理,统计最近一个小时(测试2分钟)广告的点击量
 */
/**
 * Requirement 3: sliding-window click counts per (ad, minute-of-day).
 * Window of 2 minutes sliding every 1 minute (test stand-in for the
 * one-hour requirement); results are printed to stdout.
 */
public class sparkBlackListHandler4 {
    public static String sql = "select userid from black_list";

    public static void main(String[] args) throws InterruptedException {
        SparkConf conf = new SparkConf().setMaster("local[2]").setAppName("sparkStreamingDemo");
        JavaStreamingContext streamingContext = new JavaStreamingContext(conf, Durations.seconds(30));

        kafkaUtils kafkaHelper = new kafkaUtils();
        JavaInputDStream<ConsumerRecord<String, String>> kafkaDS = kafkaHelper.createKafkaDS("test", streamingContext);

        // Parse each CSV record "timestamp,area,city,userid,adid" into an AdsLog bean.
        JavaDStream<AdsLog> adsLogStream = kafkaDS.map(record -> {
            String[] fields = record.value().split(",");
            return new AdsLog(fields[0], fields[1], fields[2], fields[3], fields[4]);
        });

        // ((adid, "HH:mm"), 1) pairs summed over the 2-minute window.
        JavaPairDStream<Tuple2<String, String>, Integer> resultDS = adsLogStream
                .mapToPair(adsLog -> {
                    String minuteOfDay = TimestampUtils.parseTimestampHour(adsLog.getTimestamp());
                    return new Tuple2<>(new Tuple2<>(adsLog.getAdis(), minuteOfDay), 1);
                })
                .reduceByKeyAndWindow(Integer::sum, Durations.minutes(2), Durations.minutes(1));

        resultDS.print();

        streamingContext.start();

        streamingContext.awaitTermination();

    }
}

执行输出:

-------------------------------------------
Time: 1710485130000 ms
-------------------------------------------
((3,02:01),2)
((5,02:12),1)
((6,02:07),1)
((6,02:13),1)
((6,14:46),2)
((1,14:46),1)
((6,02:08),2)
((2,02:06),1)
((3,02:13),1)
((3,02:12),1)
...

-------------------------------------------
Time: 1710485190000 ms
-------------------------------------------
((3,02:01),2)
((4,14:47),2)
((5,02:12),1)
((6,02:07),1)
((6,02:13),1)
((6,14:46),3)
((1,14:46),1)
((6,02:08),2)
((5,14:48),1)
((2,02:06),1)
...