# 利用多线程方式爬取北京新发地价格数据(仅少量页面、2 个线程,避免给服务器增加压力)
import csv
from concurrent.futures import ThreadPoolExecutor  # thread-pool support for concurrent downloads

import requests


def downloadurl(url, num=None):
    """Fetch one page of Xinfadi price data and append it to today_price.csv.

    Args:
        url: API endpoint that accepts POSTed paging parameters.
        num: 1-based page number to request (20 rows per page). ``None``
            falls back to a module-level ``num`` for backward compatibility
            with the old call style that relied on a global.
    """
    if num is None:
        # Backward-compat shim: the original implementation read a module
        # global `num` set by the caller's loop (a thread race); accept that
        # call pattern but prefer an explicit argument.
        num = globals().get('num', 1)
    # NOTE: this is the POST form payload, not HTTP headers (was named `head`).
    payload = {
        'limit': '20',
        'current': str(num),
        'pubDateStartTime': '',
        'pubDateEndTime': '',
        'prodPcatid': '',
        'prodCatid': '',
        'prodName': '',
    }
    rsp = requests.post(url, data=payload)
    content = rsp.json()
    rows = content.get('list', [])
    # Open once per page (not once per row) and emit real CSV rows instead of
    # Python-list reprs; newline='' is required by the csv module.
    with open('today_price.csv', mode='a', encoding='utf-8', newline='') as f:
        writer = csv.writer(f)
        for s in rows:
            # Keep only the date part of "YYYY-MM-DD HH:MM:SS". The old
            # .strip(' 00:00:00') stripped the *character set* {' ','0',':'},
            # which also chopped trailing zeros off the date itself.
            info = (s['prodName'], s['avgPrice'], s['place'],
                    s['pubDate'].split(' ')[0])
            # Replace '\' and '/' so values stay single-cell safe.
            writer.writerow(str(field).replace('\\', '_').replace('/', '_')
                            for field in info)
    print(url, '提取完毕')


if __name__ == '__main__':
    URL = 'http://www.xinfadi.com.cn/getPriceData.html'
    # Two workers keeps the load on the server light.
    with ThreadPoolExecutor(2) as pool:
        for page in range(1, 10):
            # Pass the page number explicitly: reading a shared module-level
            # `num` from worker threads duplicated/skipped pages.
            pool.submit(downloadurl, URL, page)