This time the goal is to sort on the commute expense (the 4th column) among the three expense columns (digital, commute, catering), in descending order.
- Data preparation
Upload the data to HDFS:
[root@hadoop01 test_data]# hdfs dfs -mkdir /test_shopping_sort_input
[root@hadoop01 test_data]# hdfs dfs -put test_shopping.txt /test_shopping_sort_input
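The Mapper later in this post splits each record on commas and reads the phone number from field 0 and the digital, commute, and catering expenses from fields 2, 3 and 4, so test_shopping.txt is assumed to look roughly like the lines below (the values echo the job output shown at the end of this post; the second field is only a hypothetical placeholder and is not used by the job):
18828838888,u01,852,603,1549
15525535555,u02,2890,437,1495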
Create a new project:
- Add the pom dependencies
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>

    <groupId>wyh.test</groupId>
    <artifactId>mapreduce_shopping</artifactId>
    <version>1.0-SNAPSHOT</version>

    <properties>
        <maven.compiler.source>8</maven.compiler.source>
        <maven.compiler.target>8</maven.compiler.target>
    </properties>

    <packaging>jar</packaging>

    <dependencies>
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-common</artifactId>
            <version>2.7.5</version>
        </dependency>
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-client</artifactId>
            <version>2.7.5</version>
        </dependency>
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-hdfs</artifactId>
            <version>2.7.5</version>
        </dependency>
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-mapreduce-client-core</artifactId>
            <version>2.7.5</version>
        </dependency>
        <dependency>
            <groupId>junit</groupId>
            <artifactId>junit</artifactId>
            <version>RELEASE</version>
        </dependency>
    </dependencies>

    <build>
        <plugins>
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-compiler-plugin</artifactId>
                <version>3.1</version>
                <configuration>
                    <source>1.8</source>
                    <target>1.8</target>
                    <encoding>UTF-8</encoding>
                </configuration>
            </plugin>
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-shade-plugin</artifactId>
                <version>2.4.3</version>
                <executions>
                    <execution>
                        <phase>package</phase>
                        <goals>
                            <goal>shade</goal>
                        </goals>
                        <configuration>
                            <minimizeJar>true</minimizeJar>
                        </configuration>
                    </execution>
                </executions>
            </plugin>
        </plugins>
    </build>
</project>
- Create a custom bean that wraps the three expense fields and implements serialization and sorting
package wyh.test.shoppingsort;

import org.apache.hadoop.io.WritableComparable;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

//Because the bean needs both serialization and comparison, it implements the WritableComparable interface rather than just Writable
public class ShoppingSortBean implements WritableComparable<ShoppingSortBean> {
    private Integer digital;//digital expense
    private Integer commute;//commute expense
    private Integer catering;//catering expense

    public Integer getDigital() {
        return digital;
    }

    public void setDigital(Integer digital) {
        this.digital = digital;
    }

    public Integer getCommute() {
        return commute;
    }

    public void setCommute(Integer commute) {
        this.commute = commute;
    }

    public Integer getCatering() {
        return catering;
    }

    public void setCatering(Integer catering) {
        this.catering = catering;
    }

    @Override
    public String toString() {
        return digital +
                "\t" + commute +
                "\t" + catering;
    }

    //serialization method from the Writable interface
    @Override
    public void write(DataOutput dataOutput) throws IOException {
        dataOutput.writeInt(digital);
        dataOutput.writeInt(commute);
        dataOutput.writeInt(catering);
    }

    //deserialization method from the Writable interface
    @Override
    public void readFields(DataInput dataInput) throws IOException {
        this.digital = dataInput.readInt();
        this.commute = dataInput.readInt();
        this.catering = dataInput.readInt();
    }

    //define the sort order
    @Override
    public int compareTo(ShoppingSortBean shoppingSortBean) {
        //Integer.compareTo() gives ascending order, so multiply by (-1) to turn it into descending order
        int compareResult = (this.getCommute().compareTo(shoppingSortBean.getCommute())) * (-1);
        return compareResult;
    }
}
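Before wiring the bean into the job, it can be sanity-checked with a small JUnit test (a minimal sketch using the junit dependency already declared in the pom; the test class name and sample values are just for illustration). It checks that compareTo() really orders by commute in descending fashion and that write()/readFields() round-trip the three fields:
package wyh.test.shoppingsort;

import org.junit.Test;

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.DataInputStream;
import java.io.DataOutputStream;

import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertTrue;

public class ShoppingSortBeanTest {

    private ShoppingSortBean newBean(int digital, int commute, int catering) {
        ShoppingSortBean bean = new ShoppingSortBean();
        bean.setDigital(digital);
        bean.setCommute(commute);
        bean.setCatering(catering);
        return bean;
    }

    //compareTo() should order beans by commute in descending order
    @Test
    public void compareToSortsByCommuteDescending() {
        ShoppingSortBean high = newBean(852, 603, 1549);
        ShoppingSortBean low = newBean(1487, 499, 1039);
        assertTrue(high.compareTo(low) < 0);   //the higher commute value sorts first
        assertTrue(low.compareTo(high) > 0);
        assertEquals(0, high.compareTo(newBean(1, 603, 2)));  //only commute is compared
    }

    //write() and readFields() should round-trip the three fields
    @Test
    public void serializationRoundTrip() throws Exception {
        ShoppingSortBean original = newBean(852, 603, 1549);
        ByteArrayOutputStream bytes = new ByteArrayOutputStream();
        original.write(new DataOutputStream(bytes));

        ShoppingSortBean copy = new ShoppingSortBean();
        copy.readFields(new DataInputStream(new ByteArrayInputStream(bytes.toByteArray())));
        assertEquals("852\t603\t1549", copy.toString());
    }
}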
- Custom Mapper
package wyh.test.shoppingsort;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

import java.io.IOException;

/**
 * K1: line offset
 * V1: line text
 * K2: ShoppingSortBean (since we sort on the commute expense, the commute value must be part of K2; we could put just the
 *     commute expense into K2, or, as here, the ShoppingSortBean that wraps it. The shuffle phase sorts map output by K2
 *     using the bean's compareTo(), which is what produces the descending order.)
 * V2: user identifier (phone number)
 */
public class ShoppingSortMapper extends Mapper<LongWritable, Text, ShoppingSortBean, Text> {
    @Override
    protected void map(LongWritable key, Text value, Mapper<LongWritable, Text, ShoppingSortBean, Text>.Context context) throws IOException, InterruptedException {
        //split the line, pull out the three expense fields and wrap them in a ShoppingSortBean: this is K2
        ShoppingSortBean shoppingSortBean = new ShoppingSortBean();
        String[] splitString = value.toString().split(",");
        shoppingSortBean.setDigital(Integer.parseInt(splitString[2]));
        shoppingSortBean.setCommute(Integer.parseInt(splitString[3]));
        shoppingSortBean.setCatering(Integer.parseInt(splitString[4]));
        //pull the phone number out of the line: this is V2
        String userPhone = splitString[0];
        //write K2 and V2 to the context
        context.write(shoppingSortBean, new Text(userPhone));
    }
}
- Custom Reducer
package wyh.test.shoppingsort;

import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

import java.io.IOException;

/**
 * K2: ShoppingSortBean carrying the commute expense
 * V2: user identifier (phone number)
 * K3: user identifier (phone number) (the phone number should still be the first column of the output, so in the Reduce phase K2/V2 are swapped into V3/K3)
 * V3: the sorted ShoppingSortBean
 */
public class ShoppingSortReducer extends Reducer<ShoppingSortBean, Text, Text, ShoppingSortBean> {
    @Override
    protected void reduce(ShoppingSortBean key, Iterable<Text> values, Reducer<ShoppingSortBean, Text, Text, ShoppingSortBean>.Context context) throws IOException, InterruptedException {
        //iterate over the V2 collection to get the phone numbers, which become K3. Because compareTo() only compares the commute value,
        //the collection normally holds a single phone number; if two records happened to share the same commute value they would be
        //grouped into one call, and this loop writes each of them out.
        for (Text userPhone : values) {
            //write K3 and V3 to the context
            context.write(userPhone, key);
        }
    }
}
- Custom main class
package wyh.test.shoppingsort;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

public class ShoppingSortJobMain extends Configured implements Tool {
    @Override
    public int run(String[] strings) throws Exception {
        Job job = Job.getInstance(super.getConf(), "test_shopping_sort_job");
        job.setJarByClass(ShoppingSortJobMain.class);
        job.setInputFormatClass(TextInputFormat.class);
        TextInputFormat.addInputPath(job, new Path("hdfs://192.168.126.132:8020/test_shopping_sort_input"));
        job.setMapperClass(ShoppingSortMapper.class);
        job.setMapOutputKeyClass(ShoppingSortBean.class);
        job.setMapOutputValueClass(Text.class);
        job.setReducerClass(ShoppingSortReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(ShoppingSortBean.class);
        job.setOutputFormatClass(TextOutputFormat.class);
        TextOutputFormat.setOutputPath(job, new Path("hdfs://192.168.126.132:8020/test_shopping_sort_output"));
        boolean status = job.waitForCompletion(true);
        return status ? 0 : 1;
    }

    public static void main(String[] args) throws Exception {
        Configuration configuration = new Configuration();
        //launch the job
        int runStatus = ToolRunner.run(configuration, new ShoppingSortJobMain(), args);
        System.exit(runStatus);
    }
}
- Package the project
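Since the maven-shade-plugin is bound to the package phase, a plain Maven build is enough (assuming Maven is available on the development machine); the shaded jar is written to target/mapreduce_shopping-1.0-SNAPSHOT.jar, which is the file used in the next step:
mvn clean package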
- Upload the jar to the server and run it
[root@hadoop01 test_jar]# hadoop jar mapreduce_shopping-1.0-SNAPSHOT.jar wyh.test.shoppingsort.ShoppingSortJobMain
- View the output
[root@hadoop01 test_jar]# hdfs dfs -cat /test_shopping_sort_output/part-r-00000
18828838888 852 603 1549
18828838888 527 543 1704
18828838888 1487 499 1039
15525535555 2890 437 1495
16626636666 754 417 1586
15525535555 1088 372 1726
15525535555 340 319 1653
16626636666 1264 308 1677
16626636666 530 259 2174