"""爬虫在发送请求时,需要根据不同的请求方式做不同的处理,post请求就需要使用到encode()方法。而对于批量下载,可以通过定制data数据的方式实现,这需要观察网页响应中data数据的格式,它也是请求定制中的一个参数。对于代理的问题,要尽可能地模仿人去访问网页,所以不能用同一个ip频繁地访问;因此,ip代理池就很有必要,并且可以通过增加睡眠时间来进一步处理。"""
import urllib.request
import urllib.parse
def create_request(page: int) -> urllib.request.Request:
    """Build the GET request for one page of the douban movie top list.

    page: 1-based page number; each page holds 20 entries.
    Returns a urllib Request carrying a browser User-Agent header.
    """
    # FIX: the API parameter is 'interval_id' — the original 'inteval_id'
    # typo meant the rating-range filter was silently ignored by the server.
    base_url = 'http://movie.douban.com/j/chart/top_list?type=5&interval_id=100%3A90&action=&'
    params = {
        'start': (page - 1) * 20,  # offset of the first entry on this page
        'limit': 20,               # entries per page
    }
    url = base_url + urllib.parse.urlencode(params)
    print(url)  # debug aid kept from the original
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/104.0.0.0 Safari/537.36'
    }
    return urllib.request.Request(url=url, headers=headers)
def get_content(request):
    """Fetch *request* and return the response body decoded as UTF-8.

    FIX: the original never closed the HTTP response; the context manager
    releases the connection even if read()/decode() raises.
    """
    with urllib.request.urlopen(request) as response:
        return response.read().decode('utf-8')
def download(page, content):
    """Persist one page of fetched JSON to douban_<page>.json (UTF-8)."""
    target = 'douban_' + str(page) + '.json'
    with open(target, 'w', encoding='utf-8') as out:
        out.write(content)
if __name__ == '__main__':
    # Ask the user for an inclusive page range, then fetch and save each page.
    first = int(input('起始的页码'))
    last = int(input('结束的页码'))
    for page_no in range(first, last + 1):
        print(page_no)
        download(page_no, get_content(create_request(page_no)))
# AJAX POST request example
import urllib.request
import urllib.parse
# NOTE(review): this module-level URL is shadowed by the identical assignment
# inside create_request() below and is otherwise unused here.
base_url='http://www.kfc.com.cn/kfccda/ashx/GetStoreList.ashx?op=cname'
def create_request(page):
    """Build the POST request for one page of the KFC store-list API.

    The form body (urlencoded + UTF-8 encoded) is what makes urllib issue
    a POST rather than a GET.
    """
    base_url = 'http://www.kfc.com.cn/kfccda/ashx/GetStoreList.ashx?op=cname'
    form = {
        'cname': '北京',
        'pid': '',
        'pageIndex': page,
        'pageSize': 10,
    }
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/104.0.0.0 Safari/537.36'
    }
    body = urllib.parse.urlencode(form).encode('utf-8')
    return urllib.request.Request(url=base_url, headers=headers, data=body)
def get_content(request):
    """Fetch *request* and return the response body decoded as UTF-8.

    FIX: the original never closed the HTTP response; the context manager
    releases the connection even if read()/decode() raises.
    """
    with urllib.request.urlopen(request) as response:
        return response.read().decode('utf-8')
if __name__ == '__main__':
    start_page = int(input('起始页码'))
    end_page = int(input('结束页码'))
    # FIX: range() excludes its stop value, so the page the user entered as
    # '结束页码' was never fetched; +1 makes the range inclusive, consistent
    # with the douban loop above.
    for page in range(start_page, end_page + 1):
        print(page)
        request = create_request(page)
        content = get_content(request)
        with open('kfc_' + str(page) + '.json', 'w', encoding='utf-8') as fp:
            fp.write(content)
import urllib.request
import urllib.parse
# FIX: HTTPError/URLError are caught below, but urllib.error was never
# imported explicitly — it only worked because urllib.request happens to
# import it internally, which is fragile and implementation-dependent.
import urllib.error

url = 'https://juejin.cn/post/71319027135954289001'
headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/104.0.0.0 Safari/537.36'
}
try:
    request = urllib.request.Request(url=url, headers=headers)
    response = urllib.request.urlopen(request)
    content = response.read().decode('utf-8')
    print(content)
except urllib.error.HTTPError:
    # HTTPError must come first: it is a subclass of URLError.
    print('error...')
except urllib.error.URLError:
    print('url error')
import urllib.request

url = 'http://www.baidu.com/s?wd=ip'
proxies = {
    'http': '118.24.219.151:16817'
}
headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/104.0.0.0 Safari/537.36'
}
request = urllib.request.Request(url=url, headers=headers)
# FIX: the original called ProxyHandler() with no argument, so the 'proxies'
# mapping above was never used and every request went out directly.
handler = urllib.request.ProxyHandler(proxies=proxies)
opener = urllib.request.build_opener(handler)
# FIX: the original also issued an extra un-proxied urllib.request.urlopen()
# whose response was immediately discarded — one proxied fetch is enough.
response = opener.open(request)
content = response.read().decode('utf-8')
with open('da.html', 'w', encoding='utf-8') as fp:
    fp.write(content)
# Proxy pool
import random
import urllib.request
import urllib.parse

# Candidate proxies; one is picked at random per run so repeated requests
# are not all attributed to a single IP.
proxies_pool = [
    {'http': '118.24.219.151:16817'},
    {'http': '118.24.219.151:16817'},
]
proxies = random.choice(proxies_pool)

url = 'http://www.baidu.com/s?wd=ip'
headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/104.0.0.0 Safari/537.36'
}
request = urllib.request.Request(url=url, headers=headers)

# Route the request through the chosen proxy via a custom opener.
opener = urllib.request.build_opener(urllib.request.ProxyHandler(proxies=proxies))
response = opener.open(request)
content = response.read().decode('utf-8')

with open('dal.html', 'w', encoding='utf-8') as fp:
    fp.write(content)