Hadoop Classic Example: WordCount | Weekend Study


This article is part of the Weekend Study Plan; see details here: juejin.cn/post/696572…

This classic example uses Hadoop's MapReduce framework to count how many times each word occurs in a text file.
The project structure is as follows:

-wordcount
 -WordcountMapper.java
 -WordcountReducer.java
 -JobSubmitter.java

WordcountMapper.java:
Processes the input file one line at a time, splitting each line into individual words.
WordcountReducer.java:
Counts the occurrences of each word (it receives a group of values keyed by the word).
JobSubmitter.java:
Configures and submits the job.
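
For a quick sense of the data flow, suppose the input file contains the two lines below. Each mapper call emits a (word, 1) pair per word, the framework groups the pairs by key during the shuffle, and each reducer call sums one group:

input lines:    hello world
                hello hadoop

map output:     (hello, 1) (world, 1) (hello, 1) (hadoop, 1)

after shuffle:  (hadoop, [1])  (hello, [1, 1])  (world, [1])

reduce output:  hadoop  1
                hello   2
                world   1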

The full code is listed below.
WordcountMapper.java

package wordcount;

import java.io.IOException;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

public class WordcountMapper extends Mapper<LongWritable, Text, Text, IntWritable> {
	// Input: (byte offset of the line, line text); output: (word, 1) for each word.
	@Override
	protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
		String line = value.toString();
		// Split on runs of whitespace so consecutive spaces don't produce empty "words".
		String[] words = line.split("\\s+");
		for (String word : words) {
			if (!word.isEmpty()) {
				context.write(new Text(word), new IntWritable(1));
			}
		}
	}
}
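
As a side note, a common Hadoop idiom is to reuse the output key and value objects across map() calls instead of allocating a new Text and IntWritable per word; context.write() serializes the current contents immediately, so this is safe and reduces garbage-collection pressure on large inputs. A minimal sketch of the same mapper with reused objects:

public class WordcountMapper extends Mapper<LongWritable, Text, Text, IntWritable> {
	private final Text outKey = new Text();                    // reused for every word
	private static final IntWritable ONE = new IntWritable(1); // the constant count

	@Override
	protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
		for (String word : value.toString().split("\\s+")) {
			if (!word.isEmpty()) {
				outKey.set(word);
				context.write(outKey, ONE);
			}
		}
	}
}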

WordcountReducer.java

package wordcount;

import java.io.IOException;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

public class WordcountReducer extends Reducer<Text, IntWritable, Text, IntWritable> {
	// Input: (word, [1, 1, ...]); output: (word, total count).
	@Override
	protected void reduce(Text key, Iterable<IntWritable> values, Context context)
			throws IOException, InterruptedException {
		int count = 0;
		// Sum the 1s emitted by the mappers for this word.
		for (IntWritable value : values) {
			count += value.get();
		}
		// The key is already a Text, so it can be written back directly.
		context.write(key, new IntWritable(count));
	}
}
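
Because the per-word sums are associative and commutative, this same class can double as a combiner, which pre-aggregates the (word, 1) pairs on each map node before the shuffle and cuts down network traffic. If you want to enable that, one extra line in JobSubmitter (shown next) is enough:

job.setCombinerClass(WordcountReducer.class);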

JobSubmitter.java

package wordcount;

import java.net.URI;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class JobSubmitter {

	public static void main(String[] args) throws Exception {
		// Identify as user "student" when talking to HDFS.
		System.setProperty("HADOOP_USER_NAME", "student");

		Configuration conf = new Configuration();
		Job job = Job.getInstance(conf);

		// The jar containing this class is shipped to the cluster.
		job.setJarByClass(JobSubmitter.class);
		job.setMapperClass(WordcountMapper.class);
		job.setReducerClass(WordcountReducer.class);

		// Key/value types emitted by the mapper and by the reducer.
		job.setMapOutputKeyClass(Text.class);
		job.setMapOutputValueClass(IntWritable.class);
		job.setOutputKeyClass(Text.class);
		job.setOutputValueClass(IntWritable.class);

		// The output directory must not exist, so delete any leftover from a previous run.
		Path output = new Path("/user/joe/wordcount/output2");
		FileSystem fs = FileSystem.get(new URI("hdfs://hdp-01:9000"), conf, "student");
		if (fs.exists(output)) {
			fs.delete(output, true);
		}

		FileInputFormat.setInputPaths(job, new Path("/user/joe/wordcount/input"));
		FileOutputFormat.setOutputPath(job, output);

		// Run two reduce tasks, producing two output files.
		job.setNumReduceTasks(2);

		// Submit and block until the job finishes; true prints progress to the console.
		boolean res = job.waitForCompletion(true);

		System.exit(res ? 0 : -1);
	}
}
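
With job.setNumReduceTasks(2), the output directory will contain two result files, typically part-r-00000 and part-r-00001, one per reducer. Hadoop's default HashPartitioner decides which reducer receives each word with logic equivalent to the sketch below, so all pairs for the same word always reach the same reducer:

// Equivalent to the default HashPartitioner's getPartition():
int getPartition(Text key, int numReduceTasks) {
	return (key.hashCode() & Integer.MAX_VALUE) % numReduceTasks;
}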

Running the job

1. Upload the text file to be counted to HDFS (a full sample session is shown after this list):
hadoop fs -put <local file path> <HDFS absolute path>
2. Package the compiled classes as wordcount.jar.
3. Run the job on the cluster:

hadoop jar wordcount.jar wordcount.JobSubmitter
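
Putting the steps together, a full sample session might look like this. The local file name words.txt is only an illustration; the HDFS paths match the ones hard-coded in JobSubmitter:

hadoop fs -mkdir -p /user/joe/wordcount/input
hadoop fs -put words.txt /user/joe/wordcount/input
hadoop jar wordcount.jar wordcount.JobSubmitter
hadoop fs -cat /user/joe/wordcount/output2/part-r-*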