Python3爬虫-04-模拟登录爬取企信宝200页数据

651 阅读1分钟
'''
批量抓取企信宝页面(前200页吧)
'''
#导入需要的库
import urllib.request,socket,re,sys,os
import ssl
import fileinput
import time
import random
# Replace the factory that urllib uses to build its default HTTPS context with
# the unverified variant, so HTTPS requests made by this script skip
# certificate validation. This is a process-wide change.
ssl._create_default_https_context = ssl._create_unverified_context
# File save path. NOTE(review): not referenced by any code visible in this
# file (saveFile uses its own path) -- confirm whether it is still needed.
targetPath = "//Users//wangleilei//Documents//03__douban_Images"



# Helper that appends downloaded bytes to the output file.
def saveFile(data, path="//Users//wangleilei//Documents//007_企信宝.html"):
    """Append ``data`` to the output file.

    Args:
        data: Bytes-like payload to append; the file is opened in binary
            append mode, so successive calls accumulate in one file.
        path: Destination file. Defaults to the location the script has
            always used; replace it with a path valid on your machine.

    Raises:
        OSError: If the file cannot be opened or written.
    """
    # ``with`` closes the handle even when write() raises, which the manual
    # open()/close() pair did not guarantee.
    with open(path, 'ab') as f:
        f.write(data)
# 网址

def getData(index1):
    """Download one search-result page and append it to the output file.

    Builds the search URL for page ``index1``, requests it with a browser
    User-Agent and a logged-in session cookie, hands the raw response body to
    ``saveFile`` and echoes both the URL and the body to stdout.

    Args:
        index1: Page number to request. May be a ``str`` or an ``int``; it is
            converted with ``str()`` before being placed in the URL.

    Raises:
        urllib.error.URLError: If the request fails (``HTTPError`` for
            non-success HTTP status codes).
    """
    # str() lets callers pass an int (e.g. getData(2)) as well as a string.
    url = "http://www.qixin.com/search?key=%E6%97%85%E6%B8%B8&page=" + str(index1) + "&status[]=1"

    print(url)

    # NOTE(review): the Cookie below is a captured login session embedded in
    # source. It will expire and should not be committed; consider loading it
    # from configuration instead.
    headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.12; rv:57.0) Gecko/20100101 Firefox/57.0',
               'Cookie': 'channel=baidu; _zg=%7B%22uuid%22%3A%20%221604427941638e-0e3eb0a6fa80588-49566e-13c680-160442794174ec%22%2C%22sid%22%3A%201512971932.697%2C%22updated%22%3A%201512972319.613%2C%22info%22%3A%201512971932703%2C%22cuid%22%3A%20%228449f8dd-5c6a-4768-b489-f34053c20d77%22%7D; showsale=1; cookieShowLoginTip=1; responseTimeline=85; Hm_lvt_52d64b8d3f6d42a2e416d59635df3f71=1512971936; Hm_lpvt_52d64b8d3f6d42a2e416d59635df3f71=1512972313; sid=s%3ATF94-C8QhbiJVOwC2ZRAKYUXPJBXVJFn.dpf832kO3Fdn66716KAquegeH6LtIHYMCab5u9bINwU'}
    request = urllib.request.Request(url=url, headers=headers)

    # Close the HTTP response deterministically; the crawl issues many
    # requests and should not leave one open connection per page.
    with urllib.request.urlopen(request) as response:
        data = response.read()

    saveFile(data)
    print(data)

# getData(2)
# Crawl pages 195 through 299 in order, pausing 0 or 1 second (chosen at
# random) after each page before moving on to the next.
for page in range(195, 300):
    print(page)
    getData(str(page))
    delay = random.randrange(0, 2)
    time.sleep(delay)
    # Echo the delay that was just applied.
    print("随机数")
    print(delay)

我的Python3爬虫系列