mapreduce的编程模型,计数器

245 阅读2分钟

1.job = map+reduce

2.Map的输出是reduce的输入

3.所有的输入和输出都是<Key,Value>键值对,一共4对(K1V1、K2V2、K3V3、K4V4)。

4.K2=K3 V3是一个集合,该集合的每个元素就是V2。

5.所有的数据类型都必须是Hadoop自己的数据类型。

int--->IntWritable
long--->LongWritable
string--->Text
null--->NullWritable

Submitter运行类:

package com.etc;

import org.apache.commons.io.FileUtils;
import org.apache.hadoop.conf.Configuration;

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.io.File;
import java.io.IOException;

/**
 * Driver class for the word-count MapReduce job.
 *
 * Wires up the mapper/reducer classes, declares the intermediate (map output)
 * and final (reduce output) key/value types, sets the input/output paths, and
 * submits the job, waiting for completion.
 */
public class JobSubmitter {

    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
        Job job = Job.getInstance(new Configuration());

        // Tell the framework which jar contains the job classes.
        job.setJarByClass(JobSubmitter.class);

        job.setMapperClass(WordCountMapper.class);
        job.setReducerClass(WordCountReduce.class);

        // Map output types (K2, V2).
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(IntWritable.class);

        // Reduce output types (K4, V4).
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);

        // MapReduce refuses to start if the output directory already exists,
        // so delete any stale one first. (Local-filesystem mode only — on HDFS
        // this would need FileSystem.delete instead of java.io.File.)
        File outputDir = new File("F:\\wordcountfengze");
        if (outputDir.exists()) {
            FileUtils.deleteDirectory(outputDir);
        }

        FileInputFormat.setInputPaths(job, new Path("F:\\wordcountwangcc"));
        FileOutputFormat.setOutputPath(job, new Path("F:\\wordcountfengze"));

        job.setNumReduceTasks(1);

        boolean success = job.waitForCompletion(true);
        System.out.println(success);
        // Propagate the job result as the process exit code so shells and
        // schedulers can detect failure (standard Hadoop driver pattern).
        System.exit(success ? 0 : 1);
    }
}

Mapper类1:

package com.etc;

import org.apache.commons.collections.ArrayStack;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Iterator;
import java.util.List;

/**
 * KEYIN :是map task读取到的数据的key的类型,是一行的起始偏移量Long
 * VALUEIN:是map task读取到的数据的value的类型,是一行的内容String
 * KEYOUT:是用户的自定义map方法要返回的结果kv数据的key的类型,在wordcount逻辑中,我们需要返回的是单词String
 * VALUEOUT:是用户的自定义map方法要返回的结果kv数据的value的类型,在wordcount逻辑中,我们需要返回的是整数Integer
 * 但是,在mapreduce中,map产生的数据需要传输给reduce,需要进行序列化和反序列化,而jdk中的原生序列化机制产生的数据量比较冗余,就会导致数据在mapreduce运行过程中传输效率低下
 * 所以,hadoop专门设计了自己的序列化机制,那么,mapreduce中传输的数据类型就必须实现hadoop自己的序列化接口
 * hadoop为jdk中的常用基本类型Long String Integer Float等数据类型封住了自己的实现了hadoop序列化接口的类型:LongWritable,Text,IntWritable,FloatWritable
 */
/**
 * Word-count mapper (tab-separated input variant).
 *
 * KEYIN:    byte offset of the line within the input split (LongWritable).
 * VALUEIN:  the line's text content (Text).
 * KEYOUT:   a single word (Text).
 * VALUEOUT: the count 1 for each occurrence (IntWritable).
 *
 * Map output must travel to the reducers, so it is serialized. JDK-native
 * serialization is too verbose for this, so Hadoop defines its own compact
 * serialization interface; all key/value types must use Hadoop's wrappers:
 * Long -> LongWritable, String -> Text, Integer -> IntWritable,
 * Float -> FloatWritable, etc.
 */
public class WordcountMapper extends Mapper<LongWritable, Text, Text, IntWritable> {

    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        // Split the line on tab characters.
        String[] words = value.toString().split("\t");
        for (String word : words) {
            // Emit only non-blank tokens. The original remove-while-indexing
            // loop incremented i after each remove(i), skipping the element
            // that shifted into slot i (so consecutive blanks survived), and
            // its equals(" ") test missed empty-string tokens entirely.
            if (!word.trim().isEmpty()) {
                context.write(new Text(word), new IntWritable(1));
            }
        }
    }
}

Mapper类2:

package com.etc;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import java.io.IOException;
/**
 * Word-count mapper: splits each comma-separated input line into tokens
 * and emits (token, 1) for every token.
 *
 * Input key is the line's byte offset (unused); input value is the line text.
 */
public class WordCountMapper extends Mapper<LongWritable,Text,Text,IntWritable> {
    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        // Tokenize on commas and emit each token with a count of 1.
        // Each emitted value is always 1; the reducer sums them per key.
        for (String token : value.toString().split(",")) {
            context.write(new Text(token), new IntWritable(1));
        }
    }
}

Reduce类:

package com.etc;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
import java.io.IOException;
//                                            k3   v3           k4  v4
/**
 * Word-count reducer: sums the per-word counts emitted by the mapper.
 *
 * Input:  (word, [1, 1, ...])  — K3/V3
 * Output: (word, total)        — K4/V4
 */
public class WordCountReduce  extends Reducer<Text,IntWritable,Text,IntWritable> {
    @Override
    protected void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
        // Accumulate all values grouped under this key.
        int sum = 0;
        for (IntWritable v : values) {
            sum += v.get();
        }
        context.write(new Text(key), new IntWritable(sum));
    }
}

动态计数器:

Context对象的getCounter方法有两个String类型的输入参数,分别代表组名称和计数器名称。

public Counter getCounter(String groupName, String counterName)