Implementing MapReduce in Python:
mapper.py
#!/usr/bin/env python
import sys

# Read lines from stdin and emit one "word 1" pair per word.
for line in sys.stdin:
    words = line.strip().split()  # split on any whitespace
    for word in words:
        print ' '.join([word, '1'])
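As a quick local sanity check, the same map logic can be run on a single made-up line (the sample text below is illustrative, not from the job's input):

# Feed one hypothetical line through the mapper's split-and-emit logic.
line = "hello world hello"
for word in line.strip().split():
    print ' '.join([word, '1'])
# Prints:
# hello 1
# world 1
# hello 1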
reducer.py
# coding: utf-8
import sys
from itertools import groupby
from operator import itemgetter

def get_mapper_output():
    """When a generator function is called, it returns a generator."""
    for line in sys.stdin:
        yield line.rstrip().split()

data = get_mapper_output()  # data is a generator
# groupby only groups consecutive records with the same key; each group is an
# itertools._grouper object.
for word, group in groupby(data, itemgetter(0)):
    # sum() over a generator expression; the extra parentheses around it can be omitted.
    counts = sum(int(count) for current_word, count in group)
    print "%s %s" % (word, str(counts))
#!/bin/bash
# Submit the streaming job; -files ships mapper.py and reducer.py to the cluster nodes.
# Hadoop Streaming sorts the mapper output by key before the reduce phase, which is what
# lets the reducer group records with groupby.
/usr/local/src/hadoop-2.7.4/bin/hadoop jar /usr/local/src/hadoop-2.7.4/share/hadoop/tools/lib/hadoop-streaming-2.7.4.jar \
    -files /home/hadoop/mapper.py,/home/hadoop/reducer.py \
    -mapper "python /home/hadoop/mapper.py" \
    -reducer "python /home/hadoop/reducer.py" \
    -input /user/root/input/ \
    -output /user/root/output
Test locally:
#!/bin/bash
cat wordcount.txt | python mapper.py | sort -k1,1 | python reducer.py
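For intuition, the whole map -> sort -> reduce flow can also be simulated in a single Python process. This is only a sketch on made-up input lines (not the real wordcount.txt), mirroring what the streaming job does across the cluster:

from itertools import groupby
from operator import itemgetter

# Hypothetical input lines standing in for wordcount.txt.
lines = ["hello world", "hello hadoop"]

# Map phase: emit a (word, 1) pair for every word.
mapped = [(word, 1) for line in lines for word in line.strip().split()]

# Shuffle/sort phase: Hadoop sorts mapper output by key; locally this is sort -k1,1.
mapped.sort(key=itemgetter(0))

# Reduce phase: sum the counts for each word.
for word, group in groupby(mapped, itemgetter(0)):
    print word, sum(count for _, count in group)
# Prints:
# hadoop 1
# hello 2
# world 1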