Solving a traffic-statistics case with Hadoop's MapReduce framework.
The text file to be processed has the following format:
1363157985066 13726238888 00-FD-07-A4-72-B8:CMCC 120.196.100.82 i02.c.aliimg.com 24 27 2481 24681 200
1363157993055 13560436666 C4-17-FE-BA-DE-D9:CMCC 120.196.100.99 18 15 1116 954 200
The first column is a timestamp and the second is a phone number; the second-to-last column is the downlink traffic and the third-to-last is the uplink traffic. The task is to use the phone number as the key and compute, for each phone, the total uplink traffic, total downlink traffic, and overall total.
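For the two sample lines above, the expected output (key = phone number, value = uplink,downlink,total) would be:

13560436666	1116,954,2070
13726238888	2481,24681,27162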
The project structure:
-flowCount
-FlowBean.java
-FlowCountMapper.java
-FlowCountReducer.java
-JobSubmitter.java
FlowBean.java:
defines a bean with uplink-traffic, downlink-traffic, and total-traffic fields.
FlowCountMapper.java:
processes each line of the text file, emitting the phone number as the key and a FlowBean as the value.
FlowCountReducer.java:
aggregates the total traffic usage for each phone number.
JobSubmitter.java:
configures and submits the job.
The full code follows. FlowBean.java:
package flowCount;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

import org.apache.hadoop.io.Writable;

/**
 * A custom Writable carrying the uplink, downlink, and total traffic
 * for one phone number.
 */
public class FlowBean implements Writable {

    private int upFlow;      // uplink traffic
    private int dFlow;       // downlink traffic
    private int amountFlow;  // total traffic = upFlow + dFlow
    private String phone;

    // Hadoop needs a no-arg constructor to instantiate the bean during deserialization.
    public FlowBean() {}

    public FlowBean(String phone, int upFlow, int dFlow) {
        this.phone = phone;
        this.upFlow = upFlow;
        this.dFlow = dFlow;
        this.amountFlow = upFlow + dFlow;
    }

    public int getUpFlow() {
        return upFlow;
    }

    public void setUpFlow(int upFlow) {
        this.upFlow = upFlow;
    }

    public int getdFlow() {
        return dFlow;
    }

    public void setdFlow(int dFlow) {
        this.dFlow = dFlow;
    }

    public int getAmountFlow() {
        return amountFlow;
    }

    public void setAmountFlow(int amountFlow) {
        this.amountFlow = amountFlow;
    }

    public String getPhone() {
        return phone;
    }

    public void setPhone(String phone) {
        this.phone = phone;
    }

    // Serialization: write() and readFields() must use the same field order.
    @Override
    public void write(DataOutput out) throws IOException {
        out.writeInt(upFlow);
        out.writeUTF(phone);
        out.writeInt(dFlow);
        out.writeInt(amountFlow);
    }

    @Override
    public void readFields(DataInput in) throws IOException {
        this.upFlow = in.readInt();
        this.phone = in.readUTF();
        this.dFlow = in.readInt();
        this.amountFlow = in.readInt();
    }

    @Override
    public String toString() {
        return this.upFlow + "," + this.dFlow + "," + this.amountFlow;
    }
}
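As a quick sanity check (not part of the job), a minimal sketch like the following serializes a FlowBean to an in-memory buffer and reads it back, confirming that write() and readFields() agree on field order. The class name FlowBeanRoundTrip is made up for illustration:

package flowCount;

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.DataInputStream;
import java.io.DataOutputStream;
import java.io.IOException;

public class FlowBeanRoundTrip {
    public static void main(String[] args) throws IOException {
        FlowBean original = new FlowBean("13726238888", 2481, 24681);

        // Serialize to an in-memory buffer.
        ByteArrayOutputStream buf = new ByteArrayOutputStream();
        original.write(new DataOutputStream(buf));

        // Deserialize into a fresh bean.
        FlowBean copy = new FlowBean();
        copy.readFields(new DataInputStream(new ByteArrayInputStream(buf.toByteArray())));

        // Expected: 13726238888 -> 2481,24681,27162
        System.out.println(copy.getPhone() + " -> " + copy);
    }
}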
FlowCountMapper.java:

package flowCount;

import java.io.IOException;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

public class FlowCountMapper extends Mapper<LongWritable, Text, Text, FlowBean> {

    @Override
    protected void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {
        String line = value.toString();
        String[] fields = line.split("\t");
        String phone = fields[1];
        // Index from the end of the line: some records lack the domain column,
        // so counting from the tail keeps the traffic fields aligned.
        int upFlow = Integer.parseInt(fields[fields.length - 3]);
        int dFlow = Integer.parseInt(fields[fields.length - 2]);
        context.write(new Text(phone), new FlowBean(phone, upFlow, dFlow));
    }
}
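To see why the mapper counts fields from the end of the array rather than using absolute indices, compare the two sample lines: the second one lacks the domain column, so the traffic fields sit at different absolute positions. A minimal standalone sketch (the class name FieldIndexDemo is made up; input assumed tab-separated):

package flowCount;

public class FieldIndexDemo {
    public static void main(String[] args) {
        // The second sample line: note the missing domain column.
        String line = "1363157993055\t13560436666\tC4-17-FE-BA-DE-D9:CMCC\t120.196.100.99"
                + "\t18\t15\t1116\t954\t200";
        String[] fields = line.split("\t");
        System.out.println(fields[1]);                 // 13560436666 (phone)
        System.out.println(fields[fields.length - 3]); // 1116 (uplink)
        System.out.println(fields[fields.length - 2]); // 954  (downlink)
    }
}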
FlowCountReducer.java:

package flowCount;

import java.io.IOException;

import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

public class FlowCountReducer extends Reducer<Text, FlowBean, Text, FlowBean> {

    @Override
    protected void reduce(Text key, Iterable<FlowBean> values, Context context)
            throws IOException, InterruptedException {
        int upSum = 0;
        int dSum = 0;
        // Hadoop reuses the same FlowBean instance across iterations,
        // so read the primitive fields out inside the loop.
        for (FlowBean value : values) {
            upSum += value.getUpFlow();
            dSum += value.getdFlow();
        }
        context.write(key, new FlowBean(key.toString(), upSum, dSum));
    }
}
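As a hedged illustration of what the reducer sees after the shuffle (the second record below is made up; the real sample data has only one line per phone):

// Grouped input for key "13726238888":
//   values = [FlowBean(up=2481, d=24681), FlowBean(up=100, d=200)]
// reduce() sums the fields and emits:
//   13726238888 -> 2581,24881,27462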
JobSubmitter.java:

package flowCount;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class JobSubmitter {

    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf);

        job.setJarByClass(JobSubmitter.class);
        job.setMapperClass(FlowCountMapper.class);
        job.setReducerClass(FlowCountReducer.class);
        // Three reducers -> three output files (part-r-00000 .. part-r-00002).
        job.setNumReduceTasks(3);

        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(FlowBean.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(FlowBean.class);

        FileInputFormat.setInputPaths(job, new Path("/mrdata/flow/input"));
        // The output directory must not exist before the job runs.
        FileOutputFormat.setOutputPath(job, new Path("/mrdata/flow/output"));

        boolean completed = job.waitForCompletion(true);
        System.exit(completed ? 0 : 1);
    }
}
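One optional tweak, not part of the original job: because the reducer's input and output types are both (Text, FlowBean) and the summation is associative, the same class can also be registered as a combiner to pre-aggregate on the map side and shrink shuffle traffic:

// Optional: reuse the reducer as a combiner (valid here because the
// reduce logic is an associative sum and the key/value types match).
job.setCombinerClass(FlowCountReducer.class);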
Run steps:
1. Upload the text file to be processed to HDFS:
hadoop fs -put <local file path> <HDFS absolute path>
2. Package the project as flowCount.jar.
3. Run the job:
hadoop jar flowCount.jar flowCount.JobSubmitter
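A concrete end-to-end run might look like this (flow.log is an assumed local file name; the HDFS paths are the ones hard-coded in JobSubmitter):

hadoop fs -mkdir -p /mrdata/flow/input
hadoop fs -put flow.log /mrdata/flow/input
hadoop jar flowCount.jar flowCount.JobSubmitter
hadoop fs -cat /mrdata/flow/output/part-r-*

Because the job sets setNumReduceTasks(3), the results are split across part-r-00000 through part-r-00002; the wildcard prints all of them.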