# Usage: python3 task1.py -r hadoop ./user_record.txt --output-dir=./task1_result/
from mrjob.job import MRJob
from mrjob.step import MRStep
import re
class Task1MRJob(MRJob):
    """Find the most frequent behavior word for each age group.

    Each input line is split on commas and the second-to-last field is read
    as the record's age.  The behavior words are taken from the first
    ``['word1', 'word2', ...]`` list found on the line.

    Step 1 counts how often every word occurs within each age group.
    Step 2 keeps, for each age group, the word with the highest count.
    """

    # Captures the text between "['" and the first following "']".
    _WORDS_PATTERN = re.compile(r"\['(.*?)'\]")

    # (inclusive upper age bound, group label), in ascending order.
    _AGE_BUCKETS = (
        (9, "0-9"),
        (20, "10-20"),
        (30, "21-30"),
        (40, "31-40"),
        (50, "41-50"),
        (60, "51-60"),
        (70, "61-70"),
    )
    # Label for every age above the last bound in _AGE_BUCKETS (71 and up).
    _OLDEST_BUCKET = "70+"

    @classmethod
    def _age_group(cls, age):
        """Return the age-group label for ``age``, or None if it is negative.

        Ages 0-9 get their own "0-9" label so that they are no longer
        reported under "70+".
        """
        if age < 0:
            return None
        for upper_bound, label in cls._AGE_BUCKETS:
            if age <= upper_bound:
                return label
        return cls._OLDEST_BUCKET

    def mapper_stage1(self, _, line):
        """Emit ``age_group, (word, 1)`` for every behavior word on a line.

        Lines whose age field is missing or not an integer, and lines with
        a negative age, are skipped and tallied in a job counter instead of
        failing the whole task.  Lines without a word list emit nothing.
        """
        fields = line.strip().split(",")
        try:
            age = int(fields[-2])
        except (IndexError, ValueError):
            # Blank lines, header rows, or rows with a non-numeric age.
            self.increment_counter("Task1MRJob", "malformed_lines", 1)
            return

        match = self._WORDS_PATTERN.search(line)
        if match is None:
            return

        age_group = self._age_group(age)
        if age_group is None:
            self.increment_counter("Task1MRJob", "invalid_age", 1)
            return

        for word in match.group(1).split(", "):
            # Drop the quote characters left over from the list syntax.
            yield age_group, (word.replace("'", ""), 1)

    def reducer_stage1(self, key, values):
        """Sum the counts per word and emit ``age_group, (word, total)``."""
        counts = {}
        for behavior, count in values:
            counts[behavior] = counts.get(behavior, 0) + count
        for behavior, total in counts.items():
            yield key, (behavior, total)

    def mapper_stage2(self, key, value):
        """Pass each ``age_group, (word, total)`` pair through unchanged."""
        yield key, value

    def reducer_stage2(self, key, values):
        """Emit ``age_group, (word, total)`` for the most frequent word.

        When several words share the highest total, the smallest word in
        string comparison order wins, so the result does not depend on the
        order in which the values arrive.  If no value has a positive
        total, ``(None, 0)`` is emitted.
        """
        max_behavior = None
        max_count = 0
        for behavior, count in values:
            is_higher = count > max_count
            is_tie_with_smaller_word = (
                count == max_count
                and max_behavior is not None
                and behavior < max_behavior
            )
            if is_higher or is_tie_with_smaller_word:
                max_behavior = behavior
                max_count = count
        yield key, (max_behavior, max_count)

    def steps(self):
        """Return the two steps of the job: count words, then pick the top."""
        return [
            MRStep(mapper=self.mapper_stage1, reducer=self.reducer_stage1),
            MRStep(mapper=self.mapper_stage2, reducer=self.reducer_stage2),
        ]
if __name__ == '__main__':
    # Run the job only when this file is executed as a script, not when it
    # is imported as a module.
    Task1MRJob.run()