A word-count example solved with Hadoop's MapReduce framework:
The project structure is as follows:
-wordcount
  -WordcountMapper.java
  -WordcountReducer.java
  -JobSubmitter.java
WordcountMapper.java:
Processes each line of the input text file and splits it into individual words.
WordcountReducer.java:
Counts the occurrences of each word (it receives, per word, the group of values that share that word as their key).
JobSubmitter.java:
Configures and submits the job.
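For example, given the single input line "hello world hello", the mapper emits (hello, 1), (world, 1), (hello, 1); the framework's shuffle phase then groups these pairs by key, so the reducer receives hello -> [1, 1] and world -> [1], and writes out (hello, 2) and (world, 1).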
The full code is as follows:
WordcountMapper.java
package wordcount;

import java.io.IOException;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

public class WordcountMapper extends Mapper<LongWritable, Text, Text, IntWritable> {

    // With the default TextInputFormat, map() is called once per line:
    // the key is the line's byte offset in the file, the value is the line itself.
    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        String line = value.toString();
        // Split the line on single spaces (assumes space-separated words).
        String[] words = line.split(" ");
        // Emit a (word, 1) pair for every word on the line.
        for (String word : words) {
            context.write(new Text(word), new IntWritable(1));
        }
    }
}
WordcountReducer.java
package wordcount;

import java.io.IOException;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

public class WordcountReducer extends Reducer<Text, IntWritable, Text, IntWritable> {

    // reduce() is called once per distinct word; values holds all the 1s
    // the mappers emitted for that word.
    @Override
    protected void reduce(Text key, Iterable<IntWritable> values, Context context)
            throws IOException, InterruptedException {
        int count = 0;
        for (IntWritable value : values) {
            count += value.get();
        }
        context.write(key, new IntWritable(count));
    }
}
JobSubmitter.java
package wordcount;

import java.net.URI;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class JobSubmitter {

    public static void main(String[] args) throws Exception {
        // Run HDFS operations as the "student" user.
        System.setProperty("HADOOP_USER_NAME", "student");

        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf);

        // Locate the jar that contains the job classes.
        job.setJarByClass(JobSubmitter.class);
        job.setMapperClass(WordcountMapper.class);
        job.setReducerClass(WordcountReducer.class);

        // Key/value types of the map output and of the final (reduce) output.
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(IntWritable.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);

        // The job fails if the output directory already exists, so delete it first.
        Path output = new Path("/user/joe/wordcount/output2");
        FileSystem fs = FileSystem.get(new URI("hdfs://hdp-01:9000"), conf, "student");
        if (fs.exists(output)) {
            fs.delete(output, true);
        }

        FileInputFormat.setInputPaths(job, new Path("/user/joe/wordcount/input"));
        FileOutputFormat.setOutputPath(job, output);

        // Use two reducers; output is hash-partitioned by word across them.
        job.setNumReduceTasks(2);

        // Submit the job and wait for completion, printing progress to the console.
        boolean res = job.waitForCompletion(true);
        System.exit(res ? 0 : -1);
    }
}
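A common optimization for this job, not enabled in the code above, is to also run the reducer as a combiner, so that each mapper pre-aggregates its own (word, 1) pairs before the shuffle. WordcountReducer qualifies because summing is associative and its input and output types match; a minimal sketch is one extra line in JobSubmitter before submission:
// Optional: pre-aggregate counts on the map side to cut shuffle traffic.
job.setCombinerClass(WordcountReducer.class);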
Run steps
1. Upload the text file(s) to be counted to HDFS:
hadoop fs -put <local file path> <HDFS absolute path>
2. Package the project as wordcount.jar.
3. Run the job:
hadoop jar wordcount.jar wordcount.JobSubmitter
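For example, step 1 might look like this (the local file name words.txt is just an illustration):
hadoop fs -put words.txt /user/joe/wordcount/input
Since the job was configured with two reduce tasks, the results are written as two part files under the output directory; they can be listed and inspected with:
hadoop fs -ls /user/joe/wordcount/output2
hadoop fs -cat /user/joe/wordcount/output2/part-r-00000
Each line of output is a word and its count, separated by a tab.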