python实现MapReduce

613 阅读1分钟

python实现mapreduce:

mapper.py

#!/usr/bin/env python
"""Word-count mapper for Hadoop Streaming.

Reads text lines from stdin and writes one "<word> 1" record per word to
stdout.
"""
import sys


def map_words(lines):
    """Yield a "<word> 1" record for every word in *lines*.

    Args:
        lines: Iterable of text lines (for example ``sys.stdin``).

    Yields:
        One string per whitespace-separated token, in input order.
    """
    for line in lines:
        # split() with no argument collapses runs of whitespace and returns
        # nothing for a blank line, so no empty "word" is ever emitted.
        for word in line.split():
            yield ' '.join([word, '1'])


if __name__ == '__main__':
    for record in map_words(sys.stdin):
        # Single-argument print(...) behaves the same on Python 2 and 3.
        print(record)

reducer.py

#coding:utf8
"""Word-count reducer for Hadoop Streaming.

Reads "<word> <count>" records from stdin and writes "<word> <total>" to
stdout for each run of identical words.
"""
import sys
from itertools import groupby
from operator import itemgetter


def get_mapper_output(lines=None):
    """Yield ``[word, count]`` records parsed from the mapper's output.

    Calling a generator function returns a generator; nothing is read
    until the result is iterated.

    Args:
        lines: Iterable of text lines shaped like "<word> <count>".
            Defaults to ``sys.stdin``.

    Yields:
        Two-element lists of strings: the word and its (unparsed) count.

    Blank lines are skipped. Any other line without exactly two
    whitespace-separated fields is reported on stderr and skipped, so a
    single malformed record does not abort the whole run.
    """
    if lines is None:
        lines = sys.stdin
    for line in lines:
        fields = line.split()
        if not fields:
            continue
        if len(fields) != 2:
            sys.stderr.write("skipping malformed record: %r\n" % (line,))
            continue
        yield fields


def reduce_counts(records):
    """Yield ``(word, total)`` for each run of consecutive equal words.

    ``groupby`` only merges adjacent items, so *records* must already be
    sorted by word for each word to appear exactly once in the output.

    Raises:
        ValueError: If a count field is not an integer literal.
    """
    for word, group in groupby(records, itemgetter(0)):
        # sum() accepts any iterable; a generator expression passed as the
        # sole argument needs no extra parentheses.
        yield word, sum(int(count) for _, count in group)


if __name__ == '__main__':
    for word, total in reduce_counts(get_mapper_output()):
        print("%s %s" % (word, total))

#!/bin/bash
# Submit the word-count job through Hadoop Streaming.
# The streaming jar name carries the Hadoop version; 2.7.4 is taken from the
# install path used here -- adjust it if your installation differs.
# -files is a generic option, so it must come before the streaming options.
# It ships both scripts into each task's working directory, which is why the
# mapper and reducer commands refer to them by bare file name.
/usr/local/src/hadoop-2.7.4/bin/hadoop jar \
    /usr/local/src/hadoop-2.7.4/share/hadoop/tools/lib/hadoop-streaming-2.7.4.jar \
    -files /home/hadoop/mapper.py,/home/hadoop/reducer.py \
    -mapper "python mapper.py" \
    -reducer "python reducer.py" \
    -input /user/root/input/ \
    -output /user/root/output

测试:

#!/bin/bash
cat wordcount.txt | python mapper.py | sort -k1,1 | python reducer.py