MapReduce案例之多Map阶段求共同好友

162 阅读4分钟

持续创作,加速成长!这是我参与「掘金日新计划 · 6 月更文挑战」的第15天,点击查看活动详情

 该案例中我们要实现的是从数据文件中的好友关系中得到两两之间的共同好友,具体实现大致需要通过下面的两个阶段。

第一阶段:以"好友"作为K2、拥有该好友的用户作为V2,聚合出每个好友同时属于哪些用户。例如C同时是A和F的好友,则第一阶段输出的K3为"A+F+",V3为"C"。

 第二阶段:将第一阶段输出的用户列表排序后两两组合作为K2(如"A+B"),对应的好友作为V2,聚合后即可得到每两个用户之间的全部共同好友(如A+B的共同好友为D&E)。

  • 数据准备

将数据上传至HDFS:

[root@hadoop01 test_data]# cat test_mapreduce_friend.txt
A#C,D,E
B#D,E,F
C#A,F
D#A,B
E#A,B
F#B,C

[root@hadoop01 test_data]# hdfs dfs -mkdir /test_multiple_map_input
[root@hadoop01 test_data]# hdfs dfs -put test_mapreduce_friend.txt /test_multiple_map_input

 新建project:

  • 引入pom依赖
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>

    <groupId>wyh.test</groupId>
    <artifactId>mapreduce_multiple_map</artifactId>
    <version>1.0-SNAPSHOT</version>

    <properties>
        <maven.compiler.source>8</maven.compiler.source>
        <maven.compiler.target>8</maven.compiler.target>
    </properties>

    <packaging>jar</packaging>
    <dependencies>
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-common</artifactId>
            <version>2.7.5</version>
        </dependency>
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-client</artifactId>
            <version>2.7.5</version>
        </dependency>
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-hdfs</artifactId>
            <version>2.7.5</version>
        </dependency>
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-mapreduce-client-core</artifactId>
            <version>2.7.5</version>
        </dependency>
        <!-- Pinned version instead of the deprecated RELEASE meta-version so builds are
             reproducible; test scope keeps JUnit out of the shaded job jar. -->
        <dependency>
            <groupId>junit</groupId>
            <artifactId>junit</artifactId>
            <version>4.13.2</version>
            <scope>test</scope>
        </dependency>
    </dependencies>
    <build>
        <plugins>
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-compiler-plugin</artifactId>
                <version>3.1</version>
                <configuration>
                    <source>1.8</source>
                    <target>1.8</target>
                    <encoding>UTF-8</encoding>
                </configuration>
            </plugin>
            <!-- Builds the runnable job jar during the package phase. -->
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-shade-plugin</artifactId>
                <version>2.4.3</version>
                <executions>
                    <execution>
                        <phase>package</phase>
                        <goals>
                            <goal>shade</goal>
                        </goals>
                        <configuration>
                            <minimizeJar>true</minimizeJar>
                        </configuration>
                    </execution>
                </executions>
            </plugin>
        </plugins>
    </build>

</project>
  • 自定义第一阶段的Mapper
package stage1;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

import java.io.IOException;

/**
 * Stage-1 mapper: inverts each {@code user#friend1,friend2,...} record so that
 * every friend becomes an output key and the owning user becomes the value.
 *
 * <pre>
 * Input  (K1, V1): 0, "A#C,D,E"
 * Output (K2, V2): (C, A), (D, A), (E, A)
 * </pre>
 *
 * Lines that do not contain a user part and a friend list separated by
 * {@code #} (for example blank lines) are skipped instead of failing the task.
 */
public class Stage1Mapper extends Mapper<LongWritable, Text, Text, Text> {

    // Reused output holders, following the usual Hadoop pattern of avoiding
    // one allocation per emitted record.
    private final Text outKey = new Text();
    private final Text outValue = new Text();

    /**
     * Emits one (friend, user) pair for every friend listed on the input line.
     *
     * @param key     the K1 of the record (unused)
     * @param value   one input line in the form {@code user#friend1,friend2,...}
     * @param context the task context that receives the (K2, V2) pairs
     * @throws IOException          if writing to the context fails
     * @throws InterruptedException if the task is interrupted while writing
     */
    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        // Left of '#' is the user (V2); right of '#' is the comma-separated friend list.
        String[] parts = value.toString().split("#");
        if (parts.length < 2) {
            // Blank or malformed line: nothing to emit.
            return;
        }

        String user = parts[0].trim();
        if (user.isEmpty()) {
            return;
        }
        outValue.set(user);

        for (String rawFriend : parts[1].split(",")) {
            String friend = rawFriend.trim();
            if (friend.isEmpty()) {
                // Tolerate stray commas such as "A#C,,D".
                continue;
            }
            outKey.set(friend);
            context.write(outKey, outValue);
        }
    }
}
  • 自定义第一阶段的Reducer
package stage1;

import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

import java.io.IOException;

/**
 * Stage-1 reducer: for one friend, concatenates every user that lists this
 * friend and emits the concatenation as the key with the friend as the value.
 *
 * <pre>
 * After shuffle (K2', V2'): C, &lt;A, F&gt;
 * Output        (K3,  V3):  "A+F+", C
 * </pre>
 *
 * Every user is followed by a {@code +}, so the emitted key ends with a
 * trailing {@code +}; the stage-2 mapper's documented input format
 * ({@code C+E+D+}) relies on exactly this layout.
 */
public class Stage1Reducer extends Reducer<Text, Text, Text, Text> {

    /**
     * Joins all users of one friend and writes (joinedUsers, friend).
     *
     * @param key     the friend (K2')
     * @param values  the users that have this friend (V2')
     * @param context the task context that receives the (K3, V3) pair
     * @throws IOException          if writing to the context fails
     * @throws InterruptedException if the task is interrupted while writing
     */
    @Override
    protected void reduce(Text key, Iterable<Text> values, Context context) throws IOException, InterruptedException {
        // StringBuilder: the buffer is confined to this call, so no synchronization is needed.
        StringBuilder users = new StringBuilder();
        for (Text user : values) {
            users.append(user.toString()).append('+');
        }
        context.write(new Text(users.toString()), key);
    }
}
  • 自定义第一阶段的主类
package stage1;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

/**
 * Driver for the stage-1 job: reads the raw friend lists and writes, for each
 * friend, the users that have that friend.
 */
public class Stage1JobMain extends Configured implements Tool {

    /**
     * Configures and runs the stage-1 job.
     *
     * @param strings command-line arguments left over after generic option parsing (unused)
     * @return 0 if the job completed successfully, 1 otherwise
     * @throws Exception if the job cannot be configured or submitted
     */
    @Override
    public int run(String[] strings) throws Exception {
        // Build the job from the Configuration handed over by ToolRunner; the no-arg
        // Job.getInstance() would create a fresh Configuration and ignore it.
        Job job = Job.getInstance(super.getConf(), "test_multiple_map_job1");
        job.setJarByClass(Stage1JobMain.class);

        job.setInputFormatClass(TextInputFormat.class);
        TextInputFormat.addInputPath(job, new Path("hdfs://192.168.126.132:8020/test_multiple_map_input"));

        job.setMapperClass(Stage1Mapper.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(Text.class);

        job.setReducerClass(Stage1Reducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);

        job.setOutputFormatClass(TextOutputFormat.class);
        TextOutputFormat.setOutputPath(job, new Path("hdfs://192.168.126.132:8020/test_multiple_map_output"));

        boolean succeeded = job.waitForCompletion(true);
        return succeeded ? 0 : 1;
    }

    /**
     * Entry point: runs the job through ToolRunner and exits with its status code.
     *
     * @param args command-line arguments forwarded to ToolRunner
     * @throws Exception if the job fails to run
     */
    public static void main(String[] args) throws Exception {
        Configuration configuration = new Configuration();
        int exitCode = ToolRunner.run(configuration, new Stage1JobMain(), args);
        System.exit(exitCode);
    }
}
  • 打包第一阶段的jar并上传至服务器

  • 运行jar 
[root@hadoop01 test_jar]# hadoop jar mapreduce_multiple_map-1.0-SNAPSHOT.jar stage1.Stage1JobMain
  • 查看第一阶段的输出结果

[root@hadoop01 test_jar]# hdfs dfs -cat /test_multiple_map_output/part-r-00000

  •  自定义第二阶段的Mapper
package stage2;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

import java.io.IOException;
import java.util.Arrays;

/**
 * K1   V1
 * 0    C+E+D+  A    (V1中的前后两部分是\t连接的)
 * K2   V2
 * C+E  A
 * C+D  A
 * D+E  A
 */
public class Stage2Mapper extends Mapper<LongWritable, Text, Text, Text> {
    @Override
    protected void map(LongWritable key, Text value, Mapper<LongWritable, Text, Text, Text>.Context context) throws IOException, InterruptedException {
        //对V1按照\t进行拆分
        String[] split = value.toString().split("\t");
        //拆分后的右半部分就是V2
        String v2Value = split[1];
        //对V1拆分后的左半部分再按照"+"来拆分
        String[] split1 = split[0].split("\+");//加号拆分时需要转义
        //对拆分后的好友数组进行排序,避免出现C+E与E+C在后面的处理中会被当作是两种情况,其实他们都是表示C与E的共同好友
        Arrays.sort(split1);//C+E+D+拆分排序后变为C D E存在于数组中
        /**
         * 对数组进行嵌套循环使之可以两两拼接
         * C D     (i)
         *   D E   (j=i+1)
         *   即C与D,E依次遍历拼接,D与E拼接
         */
        for(int i = 0; i < split1.length - 1; i++){
            for(int j = i + 1; j < split1.length; j++){
                //拼接
                String k2 = split1[i] + "+" + split1[j];
                context.write(new Text(k2), new Text(v2Value));
            }
        }
    }
}
  • 自定义第二阶段的Reducer
package stage2;

import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

import java.io.IOException;

/**
 * Stage-2 reducer: joins all friends collected for one pair of users.
 *
 * <pre>
 * After shuffle (K2', V2'): A+B, &lt;D, E&gt;
 * Output        (K3,  V3):  A+B, "D&amp;E"
 * </pre>
 *
 * K3 is K2' unchanged; V3 is the elements of V2' joined with {@code &}.
 */
public class Stage2Reducer extends Reducer<Text, Text, Text, Text> {

    /**
     * Writes (userPair, friend1&amp;friend2&amp;...).
     *
     * @param key     the user pair (K2')
     * @param values  the friends shared by that pair (V2')
     * @param context the task context that receives the (K3, V3) pair
     * @throws IOException          if writing to the context fails
     * @throws InterruptedException if the task is interrupted while writing
     */
    @Override
    protected void reduce(Text key, Iterable<Text> values, Context context) throws IOException, InterruptedException {
        StringBuilder friends = new StringBuilder();
        // Prefix every element except the first with the separator, so no
        // trailing '&' has to be trimmed afterwards.
        String separator = "";
        for (Text friend : values) {
            friends.append(separator).append(friend.toString());
            separator = "&";
        }
        context.write(key, new Text(friends.toString()));
    }
}
  • 自定义第二阶段的主类
package stage2;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;


/**
 * Driver for the stage-2 job: consumes the stage-1 output directory and writes
 * the final result to a separate output directory.
 */
public class Stage2JobMain extends Configured implements Tool {

    /**
     * Configures and runs the stage-2 job.
     *
     * @param strings command-line arguments left over after generic option parsing (unused)
     * @return 0 if the job completed successfully, 1 otherwise
     * @throws Exception if the job cannot be configured or submitted
     */
    @Override
    public int run(String[] strings) throws Exception {
        Job stage2Job = Job.getInstance(super.getConf(), "test_multiple_map_job2");
        stage2Job.setJarByClass(Stage2JobMain.class);

        // Input: the directory written by stage 1, since this job reads its files.
        Path stage1Output = new Path("hdfs://192.168.126.132:8020/test_multiple_map_output");
        stage2Job.setInputFormatClass(TextInputFormat.class);
        TextInputFormat.addInputPath(stage2Job, stage1Output);

        // Map side.
        stage2Job.setMapperClass(Stage2Mapper.class);
        stage2Job.setMapOutputKeyClass(Text.class);
        stage2Job.setMapOutputValueClass(Text.class);

        // Reduce side.
        stage2Job.setReducerClass(Stage2Reducer.class);
        stage2Job.setOutputKeyClass(Text.class);
        stage2Job.setOutputValueClass(Text.class);

        // Output.
        Path finalOutput = new Path("hdfs://192.168.126.132:8020/test_multiple_map_step2_output");
        stage2Job.setOutputFormatClass(TextOutputFormat.class);
        TextOutputFormat.setOutputPath(stage2Job, finalOutput);

        if (stage2Job.waitForCompletion(true)) {
            return 0;
        }
        return 1;
    }

    /**
     * Entry point: runs the job through ToolRunner and exits with its status code.
     *
     * @param args command-line arguments forwarded to ToolRunner
     * @throws Exception if the job fails to run
     */
    public static void main(String[] args) throws Exception {
        int exitCode = ToolRunner.run(new Configuration(), new Stage2JobMain(), args);
        System.exit(exitCode);
    }
}
  • 重新打包并上传至服务器
[root@hadoop01 test_jar]# rm -f mapreduce_multiple_map-1.0-SNAPSHOT.jar
//上传新打的包...
//注意这一次运行jar时指定的主类是我们创建的第二阶段的主类全路径
[root@hadoop01 test_jar]# hadoop jar mapreduce_multiple_map-1.0-SNAPSHOT.jar stage2.Stage2JobMain
  • 查看运行结果

[root@hadoop01 test_jar]# hdfs dfs -cat /test_multiple_map_step2_output/part-r-00000

 可以看到这个结果与我们在最开始分析的流程图中的结果是一致的。

这样就通过两个串联的MapReduce作业(第二阶段的Map以第一阶段的输出作为输入),简单地实现了多Map阶段求共同好友的操作。