Crawler practice: using urllib (3)


When a crawler sends a request, it has to handle each request method differently; a POST request, for example, requires encoding the form data with encode(). For batch downloads, pages can be fetched by varying the data parameters, which means inspecting the page's network responses to see how data is built, since data is one of the arguments used when constructing the request. As for proxies, a crawler should imitate a human visitor as closely as possible, so it must not hit a site frequently from the same IP; an IP proxy pool is therefore well worth having, and adding sleep time between requests helps further.
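To make the GET/POST distinction concrete, here is a minimal sketch (the URL and parameter names are placeholders, not a real endpoint): a GET request appends the urlencoded string to the URL, while a POST request must additionally call encode() so the request body is bytes.

import urllib.parse
import urllib.request

params = {'page': 1, 'limit': 20}  # placeholder parameters

# GET: the encoded query string is appended to the URL
get_url = 'http://example.com/api?' + urllib.parse.urlencode(params)
get_request = urllib.request.Request(url=get_url)

# POST: the encoded string must also be converted to bytes with encode(),
# because the data argument of Request expects bytes
post_data = urllib.parse.urlencode(params).encode('utf-8')
post_request = urllib.request.Request(url='http://example.com/api', data=post_data)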


# AJAX GET request: paginated download of the Douban movie chart
import urllib.request
import urllib.parse

def create_request(page):
    base_url = 'http://movie.douban.com/j/chart/top_list?type=5&interval_id=100%3A90&action=&'

    # each page holds 20 items; start is the offset of the first one
    data = {
        'start': (page - 1) * 20,
        'limit': 20
    }

    # for a GET request the encoded parameters are appended to the URL
    data = urllib.parse.urlencode(data)

    url = base_url + data

    print(url)

    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/104.0.0.0 Safari/537.36'
    }

    request = urllib.request.Request(url=url, headers=headers)

    return request

def get_content(request):
    response = urllib.request.urlopen(request)

    content = response.read().decode('utf-8')

    return content

def download(page, content):
    with open('douban_' + str(page) + '.json', 'w', encoding='utf-8') as fp:
        fp.write(content)

if __name__ == '__main__':
    start_page = int(input('Start page: '))
    end_page = int(input('End page: '))

    # range is inclusive of the end page, hence end_page + 1
    for page in range(start_page, end_page + 1):
        print(page)
        request = create_request(page)
        content = get_content(request)
        download(page, content)
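The endpoint returns JSON, so the saved files can be parsed directly with the json module. A minimal sketch, assuming the response is a JSON array and that each item has a 'title' field (inspect the real response to confirm the field names):

import json

with open('douban_1.json', 'r', encoding='utf-8') as fp:
    movies = json.load(fp)

# 'title' is an assumed field name
for movie in movies:
    print(movie.get('title'))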
# AJAX POST request

import urllib.request
import urllib.parse

def create_request(page):
    base_url = 'http://www.kfc.com.cn/kfccda/ashx/GetStoreList.ashx?op=cname'

    data = {
        'cname': '北京',
        'pid': '',
        'pageIndex': page,
        'pageSize': 10,
    }

    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/104.0.0.0 Safari/537.36'
    }

    # for a POST request the encoded parameters must also be turned into bytes
    data = urllib.parse.urlencode(data).encode('utf-8')

    # supplying data makes urllib send the request as a POST
    request = urllib.request.Request(url=base_url, headers=headers, data=data)
    return request

def get_content(request):
    response = urllib.request.urlopen(request)
    content = response.read().decode('utf-8')

    return content




if __name__ == '__main__':

    start_page = int(input('Start page: '))
    end_page = int(input('End page: '))

    # include the end page in the range
    for page in range(start_page, end_page + 1):
        print(page)
        request = create_request(page)

        content = get_content(request)

        with open('kfc_' + str(page) + '.json', 'w', encoding='utf-8') as fp:
            fp.write(content)
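Note that urllib decides the HTTP method from the data argument: supplying data makes the request a POST, omitting it gives a GET, so no method needs to be set explicitly. A quick check (placeholder URL):

import urllib.parse
import urllib.request

data = urllib.parse.urlencode({'pageIndex': 1}).encode('utf-8')

post_request = urllib.request.Request(url='http://example.com/api', data=data)
get_request = urllib.request.Request(url='http://example.com/api')

print(post_request.get_method())  # POST
print(get_request.get_method())   # GET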
# Exception handling
import urllib.request
import urllib.parse
import urllib.error

url = 'https://juejin.cn/post/71319027135954289001'
headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/104.0.0.0 Safari/537.36'
}

try:
    request = urllib.request.Request(url=url, headers=headers)

    response = urllib.request.urlopen(request)

    content = response.read().decode('utf-8')

    print(content)
# HTTPError is a subclass of URLError, so it must be caught first
except urllib.error.HTTPError:
    print('HTTP error')
except urllib.error.URLError:
    print('URL error')
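Both exception classes carry diagnostics that are usually more helpful than a fixed message: HTTPError has a status code and a reason, while URLError only has a reason. A minimal sketch, using a placeholder URL that is assumed to fail:

import urllib.error
import urllib.request

try:
    # placeholder URL, assumed to return an HTTP error status
    urllib.request.urlopen('http://example.com/no-such-page')
except urllib.error.HTTPError as e:
    # an HTTP response arrived, but with an error status such as 404
    print(e.code, e.reason)
except urllib.error.URLError as e:
    # no HTTP response at all: DNS failure, refused connection, etc.
    print(e.reason)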
# Proxy
import urllib.request

url = 'http://www.baidu.com/s?wd=ip'

# placeholder proxy address; substitute a working one
proxies = {
    'http': '118.24.219.151:16817'
}

headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/104.0.0.0 Safari/537.36'
}

request = urllib.request.Request(url=url, headers=headers)

# the proxies dict must actually be passed in, otherwise ProxyHandler
# falls back to the system environment settings
handler = urllib.request.ProxyHandler(proxies=proxies)

opener = urllib.request.build_opener(handler)

# urlopen() would bypass the handler; the request must go through the opener
response = opener.open(request)

content = response.read().decode('utf-8')

with open('da.html', 'w', encoding='utf-8') as fp:
    fp.write(content)
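If the proxy should apply to every request in the script, the opener can also be installed globally with install_opener(), after which plain urlopen() calls go through it. A short sketch (the proxy address is a placeholder):

import urllib.request

handler = urllib.request.ProxyHandler(proxies={'http': '118.24.219.151:16817'})
opener = urllib.request.build_opener(handler)

# from here on, urlopen() routes through the proxy by default
urllib.request.install_opener(opener)

response = urllib.request.urlopen('http://www.baidu.com/s?wd=ip')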
# Proxy pool

import random
import urllib.request

# in practice the pool would hold several distinct proxies;
# the duplicated placeholder entries just illustrate the structure
proxies_pool = [
    {'http': '118.24.219.151:16817'},
    {'http': '118.24.219.151:16817'},
]

# pick a random proxy for this request
proxies = random.choice(proxies_pool)

url = 'http://www.baidu.com/s?wd=ip'

headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/104.0.0.0 Safari/537.36'
}

request = urllib.request.Request(url=url, headers=headers)

# urlopen() would bypass the proxy; route the request through the opener
handler = urllib.request.ProxyHandler(proxies=proxies)

opener = urllib.request.build_opener(handler)

response = opener.open(request)

content = response.read().decode('utf-8')

with open('dal.html', 'w', encoding='utf-8') as fp:
    fp.write(content)
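Finally, the sleep intervals mentioned in the introduction: pausing a random amount of time between requests, combined with rotating proxies, makes the traffic look less mechanical. A minimal sketch using the same placeholder proxies:

import random
import time
import urllib.request

proxies_pool = [
    {'http': '118.24.219.151:16817'},  # placeholder proxies
    {'http': '118.24.219.151:16817'},
]

headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/104.0.0.0 Safari/537.36'
}

for page in range(1, 4):
    # rotate the proxy and wait a random interval between requests
    handler = urllib.request.ProxyHandler(proxies=random.choice(proxies_pool))
    opener = urllib.request.build_opener(handler)

    request = urllib.request.Request(url='http://www.baidu.com/s?wd=ip', headers=headers)
    response = opener.open(request)
    print(page, response.getcode())

    time.sleep(random.uniform(1, 3))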