
1.job = map+reduce
2.Map的输出是reduce的输入
3.所有的输入和输出都是<Key,Value>键值对,一共4对。
4.K2=K3 V3是一个集合,该集合的每个元素就是V2。
5.所有的数据类型都必须是Hadoop自己的数据类型。
int--->IntWritable
long--->LongWritable
string--->Text
null--->NullWritable
Submitter运行类:
package com.etc;
import org.apache.commons.io.FileUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import java.io.File;
import java.io.IOException;
public class JobSubmitter {
    /**
     * Configures and submits the word-count MapReduce job, then prints
     * whether it completed successfully.
     */
    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
        Job job = Job.getInstance(new Configuration());
        job.setJarByClass(JobSubmitter.class);
        job.setMapperClass(WordCountMapper.class);
        job.setReducerClass(WordCountReduce.class);
        // Map output key/value types (K2/V2).
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(IntWritable.class);
        // Reduce output key/value types (K4/V4).
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);
        // The output directory must not already exist, so delete any
        // leftover from a previous local run before submitting.
        File outputDir = new File("F:\\wordcountfengze");
        if (outputDir.exists()) {
            FileUtils.deleteDirectory(outputDir);
        }
        FileInputFormat.setInputPaths(job, new Path("F:\\wordcountwangcc"));
        FileOutputFormat.setOutputPath(job, new Path("F:\\wordcountfengze"));
        job.setNumReduceTasks(1);
        // Block until the job finishes; 'true' enables progress logging.
        boolean succeeded = job.waitForCompletion(true);
        System.out.println(succeeded);
    }
}
Mapper类1:
package com.etc;
import org.apache.commons.collections.ArrayStack;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Iterator;
import java.util.List;
/**
 * Word-count mapper (tab-delimited variant).
 *
 * KEYIN:    byte offset of the line within the input split (LongWritable).
 * VALUEIN:  the text of one input line (Text).
 * KEYOUT:   a single word (Text).
 * VALUEOUT: the count 1 for each occurrence (IntWritable).
 *
 * Map output is serialized and shipped to the reducers, so keys and values
 * must use Hadoop's own serialization types (LongWritable, Text,
 * IntWritable, FloatWritable, ...) rather than JDK-serialized
 * Long/String/Integer/Float, whose wire format is far more verbose.
 */
public class WordcountMapper extends Mapper<LongWritable, Text, Text, IntWritable> {
    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        // Tokenize the line on tab characters (note: tab, not space).
        String line = value.toString();
        String[] words = line.split("\t");
        List<String> listStri = new ArrayList<String>(Arrays.asList(words));
        // Drop single-space tokens. BUG FIX: the original indexed loop
        // called remove(i) and then advanced i, skipping the element that
        // slid into slot i — consecutive " " tokens survived. Removing
        // through the iterator visits every element exactly once.
        Iterator<String> it = listStri.iterator();
        while (it.hasNext()) {
            if (it.next().equals(" ")) {
                it.remove();
            }
        }
        // Emit <word, 1> for every remaining token.
        for (String word : listStri) {
            context.write(new Text(word), new IntWritable(1));
        }
    }
}
Mapper类2:
package com.etc;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import java.io.IOException;
/**
 * Word-count mapper used by the job (comma-delimited variant).
 * K1 = line byte offset, V1 = line text; emits K2 = word, V2 = 1.
 */
public class WordCountMapper extends Mapper<LongWritable,Text,Text,IntWritable> {
    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        // Each line holds comma-separated words; emit <word, 1> per token.
        // Note: each map-output value is always the single count 1.
        for (String token : value.toString().split(",")) {
            context.write(new Text(token), new IntWritable(1));
        }
    }
}
Reduce类:
package com.etc;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
import java.io.IOException;
/**
 * Word-count reducer.
 * K3 = word (Text), V3 = iterable of per-occurrence 1s (IntWritable);
 * emits K4 = word (Text), V4 = total count (IntWritable).
 */
public class WordCountReduce extends Reducer<Text,IntWritable,Text,IntWritable> {
    @Override
    protected void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
        // Sum every count emitted by the mappers for this word.
        int count = 0;
        for (IntWritable value : values) {
            count += value.get();
        }
        // Write the incoming key directly: the original wrapped it in
        // new Text(key), allocating a redundant copy for every key.
        context.write(key, new IntWritable(count));
    }
}
动态计数器:
Context对象的getCounter方法有两个String类型的输入参数,分别代表组名称和计数器名称。
public Counter getCounter(String groupName, String counterName)