抓取猫眼电影简介

146 阅读1分钟
import requests


from flask import json


from requests.exceptions import RequestException


import re


from multiprocessing import Pool



'''


Request+正则表达式抓取猫眼电影


'''



def getOneContent(url, headers, timeout=10):
    """Fetch one page and return its body as text.

    :param url: address of the page to download
    :param headers: HTTP request headers handed to ``requests.get``
    :param timeout: seconds to wait for the server before giving up
    :return: the response text when the status code is 200; ``None`` for any
        other status code or when the request raises ``RequestException``
    """
    try:
        # Without a timeout a stalled connection would block the caller
        # (and its worker process) indefinitely.
        response = requests.get(url, headers=headers, timeout=timeout)
    except RequestException:
        return None

    if response.status_code == 200:
        return response.text
    return None



# Compiled once at import time instead of on every call.  Raw strings keep
# ``\d`` from being treated as an (invalid) string escape.  Adjacent string
# literals are joined by the parser, so no ``+`` is needed between them.
# Groups, in order, are the text captured after these anchors: board-index,
# data-src, data-val, star, releasetime, integer, fraction.
_MOVIE_PATTERN = re.compile(
    r'<dd.*?board-index.*?>(\d+)</i>.*?data-src="(.*?)".*?</a>'
    r'.*?<a.*?data-val.*?>(.*?)</a>.*?star.*?>(.*?)</p>'
    r'.*?releasetime.*?>(.*?)</p>'
    r'.*?integer.*?>(.*?)</i>.*?fraction.*?>(\d+)</i>.*?</dd>',
    re.S,
)


def parserContent(content):
    """Extract the movie entries from a page with a regular expression.

    :param content: HTML text of the page; may be ``None`` or empty
    :return: list of 7-tuples of strings, one per matched ``<dd>`` entry;
        an empty list when ``content`` is empty or nothing matches
    """
    if not content:
        # Return an empty list rather than None so callers can iterate the
        # result unconditionally.
        return []
    return _MOVIE_PATTERN.findall(content)



def processData(results):
    """Convert raw match tuples into dictionaries, one per movie.

    :param results: iterable of sequences with at least seven string items
        (index, image URL, name, star text, release-time text, integer part
        of the score, fractional part of the score); ``None`` is treated as
        an empty iterable
    :return: generator of dicts with the keys ``index``, ``imgurl``,
        ``name``, ``star``, ``releasetime`` and ``score``
    """
    # Callers may hand over None when no page could be parsed; yield nothing
    # in that case instead of raising TypeError.
    for result in results or ():
        yield {
            'index': result[0],
            'imgurl': result[1],
            'name': result[2],
            # Drops the first 3 characters of the stripped text, presumably
            # a fixed label in front of the cast list (confirm against the
            # page markup).
            'star': result[3].strip()[3:],
            # Drops the first 5 characters, presumably a fixed label in
            # front of the release date (confirm against the page markup).
            'releasetime': result[4].strip()[5:],
            # Integer and fractional parts are strings; joining them gives
            # the full score text.
            'score': result[5] + result[6],
        }



def storeData(data, path="mmovie.txt"):
    """Append one record to a text file as a single JSON line.

    ``ensure_ascii=False`` keeps non-ASCII text readable in the file instead
    of writing it as Unicode escape sequences.

    :param data: the JSON-serialisable data to write
    :param path: file to append to; defaults to ``mmovie.txt``
    :return: None
    """
    # NOTE(review): ``json`` is whatever the module imports under that name;
    # confirm it is the intended JSON implementation.
    # Serialise first so a serialisation error does not touch the file.
    line = json.dumps(data, ensure_ascii=False) + '\n'
    # The ``with`` statement closes the file; no explicit close() is needed.
    with open(path, 'a', encoding='utf-8') as f:
        f.write(line)



def main(offset):
    """Download one board page and store every movie found on it.

    :param offset: value appended to the URL as the ``offset`` query
        parameter
    :return: None
    """
    url = 'http://maoyan.com/board/4?offset=' + str(offset)
    headers = {
        'User-Agent': (
            'Mozilla/5.0 (Windows NT 6.3; Win64; x64) '
            'AppleWebKit/537.36 (KHTML, like Gecko) '
            'Chrome/64.0.3282.167 Safari/537.36'
        ),
    }

    html = getOneContent(url, headers=headers)
    results = parserContent(html)
    if not results:
        # The download failed or nothing matched: there is nothing to store,
        # and passing None on would raise TypeError during iteration.
        return

    for item in processData(results):
        storeData(item)



if __name__ == '__main__':
    # One task per page: offsets 0, 10, ..., 90.
    offsets = [i * 10 for i in range(10)]
    # map() blocks until every page is processed; the context manager then
    # shuts the worker pool down instead of leaving it open.
    with Pool() as pool:
        pool.map(main, offsets)

更多免费技术资料可关注:annalin1203