使用 Python(mrjob)运行 MapReduce

65 阅读1分钟
  • 运行命令(在 Hadoop 上执行,结果写入 ./task1_result/):python3 task1.py -r hadoop ./user_record.txt --output-dir=./task1_result/
from mrjob.job import MRJob  
from mrjob.step import MRStep  
import re  
  
  
class Task1MRJob(MRJob):
    """Two-step job that finds the most frequent behavior per age group.

    Step 1 counts how often each behavior occurs inside every age group.
    Step 2 keeps, for every age group, the behavior with the highest count.

    Each input line is split on commas; the age is read from the
    second-to-last field and the behaviors from a bracketed, single-quoted
    list (``['a', 'b']``) found anywhere in the line.
    """

    # Captures the text between "['" and "']", i.e. the behavior list.
    BEHAVIOR_LIST_RE = re.compile(r"\['(.*?)'\]")

    # Inclusive (low, high, label) age ranges, in ascending order.
    AGE_GROUPS = (
        (10, 20, "10-20"),
        (21, 30, "21-30"),
        (31, 40, "31-40"),
        (41, 50, "41-50"),
        (51, 60, "51-60"),
        (61, 70, "61-70"),
    )
    # Labels for ages outside the table. Ages below 10 used to fall through
    # to the "70+" bucket, which skewed that group's counts.
    UNDER_RANGE_LABEL = "<10"
    OVER_RANGE_LABEL = "70+"

    @classmethod
    def _age_group(cls, age):
        """Return the age-group label that *age* (an int) belongs to."""
        if age < cls.AGE_GROUPS[0][0]:
            return cls.UNDER_RANGE_LABEL
        for low, high, label in cls.AGE_GROUPS:
            if low <= age <= high:
                return label
        return cls.OVER_RANGE_LABEL

    def mapper_stage1(self, _, line):
        """Emit ``age_group, (behavior, 1)`` for every behavior on a line.

        Lines whose second-to-last field is missing or not an integer
        (blank lines, header rows, truncated records) are skipped and
        tallied in the ``malformed_lines`` counter instead of aborting the
        job. Lines without a bracketed behavior list emit nothing.
        """
        fields = line.strip().split(",")
        try:
            age = int(fields[-2])
        except (IndexError, ValueError):
            self.increment_counter("Task1MRJob", "malformed_lines")
            return

        match = self.BEHAVIOR_LIST_RE.search(line)
        if match is None:
            return

        age_group = self._age_group(age)
        # The capture still holds the quotes between items ("a', 'b"), so
        # split on ", " and strip the leftover quote characters.
        for word in match.group(1).split(", "):
            yield age_group, (word.replace("'", ""), 1)

    def reducer_stage1(self, key, values):
        """Sum the counts per behavior for one age group.

        Yields ``key, (behavior, total)`` once per distinct behavior.
        """
        counts = {}
        for behavior, count in values:
            counts[behavior] = counts.get(behavior, 0) + count

        for behavior, count in counts.items():
            yield key, (behavior, count)

    def mapper_stage2(self, key, value):
        """Pass ``age_group, (behavior, count)`` through unchanged."""
        yield key, value

    def reducer_stage2(self, key, values):
        """Yield the behavior with the largest count for one age group.

        On a tie the behavior seen first wins, because only a strictly
        larger count replaces the current best.
        """
        max_behavior = None
        max_count = 0
        for behavior, count in values:
            if count > max_count:
                max_behavior = behavior
                max_count = count

        yield key, (max_behavior, max_count)

    def steps(self):
        """Return the two steps: count behaviors, then pick the top one."""
        return [
            MRStep(mapper=self.mapper_stage1, reducer=self.reducer_stage1),
            MRStep(mapper=self.mapper_stage2, reducer=self.reducer_stage2),
        ]
  
  
# Script entry point: hand control to the job's runner when executed directly.
if __name__ == "__main__":
    Task1MRJob.run()