Hadoop Classic Example: WordCount | Weekend Study


This article is part of the Weekend Study Plan; see details here: juejin.cn/post/696572…

This classic example uses Hadoop's MapReduce framework to count how many times each word occurs in a text file.
The project structure is as follows:

-wordcount
 -WordcountMapper.java
 -WordcountReducer.java
 -JobSubmitter.java

WordcountMapper.java:
Processes the input file one line at a time, splitting each line into individual words.
WordcountReducer.java:
Counts the occurrences of each word (it receives a group of values keyed by the word).
JobSubmitter.java:
Configures and submits the job.
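
For a quick sense of the data flow, suppose the input file contains the two lines below. Each mapper call emits a (word, 1) pair per word, the framework groups the pairs by key during the shuffle, and each reducer call sums one group:

input lines:    hello world
                hello hadoop

map output:     (hello, 1) (world, 1) (hello, 1) (hadoop, 1)

after shuffle:  (hadoop, [1])  (hello, [1, 1])  (world, [1])

reduce output:  hadoop  1
                hello   2
                world   1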

The full code is listed below.
WordcountMapper.java

package wordcount;

import java.io.IOException;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

public class WordcountMapper extends Mapper<LongWritable, Text, Text, IntWritable> {
	// Input: (byte offset of the line, line text); output: (word, 1) for each word.
	@Override
	protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
		String line = value.toString();
		// Split on runs of whitespace so consecutive spaces don't produce empty "words".
		String[] words = line.split("\\s+");
		for (String word : words) {
			if (!word.isEmpty()) {
				context.write(new Text(word), new IntWritable(1));
			}
		}
	}
}
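
As a side note, a common Hadoop idiom is to reuse the output key and value objects across map() calls instead of allocating a new Text and IntWritable per word; context.write() serializes the current contents immediately, so this is safe and reduces garbage-collection pressure on large inputs. A minimal sketch of the same mapper with reused objects:

public class WordcountMapper extends Mapper<LongWritable, Text, Text, IntWritable> {
	private final Text outKey = new Text();                    // reused for every word
	private static final IntWritable ONE = new IntWritable(1); // the constant count

	@Override
	protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
		for (String word : value.toString().split("\\s+")) {
			if (!word.isEmpty()) {
				outKey.set(word);
				context.write(outKey, ONE);
			}
		}
	}
}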

WordcountReducer.java

package wordcount;

import java.io.IOException;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

public class WordcountReducer extends Reducer<Text, IntWritable, Text, IntWritable> {
	// Input: (word, [1, 1, ...]); output: (word, total count).
	@Override
	protected void reduce(Text key, Iterable<IntWritable> values, Context context)
			throws IOException, InterruptedException {
		int count = 0;
		// Sum the 1s emitted by the mappers for this word.
		for (IntWritable value : values) {
			count += value.get();
		}
		// The key is already a Text, so it can be written back directly.
		context.write(key, new IntWritable(count));
	}
}
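
Because the per-word sums are associative and commutative, this same class can double as a combiner, which pre-aggregates the (word, 1) pairs on each map node before the shuffle and cuts down network traffic. If you want to enable that, one extra line in JobSubmitter (shown next) is enough:

job.setCombinerClass(WordcountReducer.class);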

JobSubmitter.java

package wordcount;

import java.net.URI;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class JobSubmitter {

	public static void main(String[] args) throws Exception {
		// Identify as user "student" when talking to HDFS.
		System.setProperty("HADOOP_USER_NAME", "student");

		Configuration conf = new Configuration();
		Job job = Job.getInstance(conf);

		// The jar containing this class is shipped to the cluster.
		job.setJarByClass(JobSubmitter.class);
		job.setMapperClass(WordcountMapper.class);
		job.setReducerClass(WordcountReducer.class);

		// Key/value types emitted by the mapper and by the reducer.
		job.setMapOutputKeyClass(Text.class);
		job.setMapOutputValueClass(IntWritable.class);
		job.setOutputKeyClass(Text.class);
		job.setOutputValueClass(IntWritable.class);

		// The output directory must not exist, so delete any leftover from a previous run.
		Path output = new Path("/user/joe/wordcount/output2");
		FileSystem fs = FileSystem.get(new URI("hdfs://hdp-01:9000"), conf, "student");
		if (fs.exists(output)) {
			fs.delete(output, true);
		}

		FileInputFormat.setInputPaths(job, new Path("/user/joe/wordcount/input"));
		FileOutputFormat.setOutputPath(job, output);

		// Run two reduce tasks, producing two output files.
		job.setNumReduceTasks(2);

		// Submit and block until the job finishes; true prints progress to the console.
		boolean res = job.waitForCompletion(true);

		System.exit(res ? 0 : -1);
	}
}
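
With job.setNumReduceTasks(2), the output directory will contain two result files, typically part-r-00000 and part-r-00001, one per reducer. Hadoop's default HashPartitioner decides which reducer receives each word with logic equivalent to the sketch below, so all pairs for the same word always reach the same reducer:

// Equivalent to the default HashPartitioner's getPartition():
int getPartition(Text key, int numReduceTasks) {
	return (key.hashCode() & Integer.MAX_VALUE) % numReduceTasks;
}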

Running the job

1. Upload the text file to be counted to HDFS (a full sample session is shown after this list):
hadoop fs -put <local file path> <HDFS absolute path>
2. Package the compiled classes as wordcount.jar.
3. Run the job on the cluster:

hadoop jar wordcount.jar wordcount.JobSubmitter
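
Putting the steps together, a full sample session might look like this. The local file name words.txt is only an illustration; the HDFS paths match the ones hard-coded in JobSubmitter:

hadoop fs -mkdir -p /user/joe/wordcount/input
hadoop fs -put words.txt /user/joe/wordcount/input
hadoop jar wordcount.jar wordcount.JobSubmitter
hadoop fs -cat /user/joe/wordcount/output2/part-r-*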