Python爬虫个人整理
一、代码基础规范
#!/usr/bin/env python
# -*- coding:utf-8 -*-
# Template: standard crawler-script boilerplate — shebang, UTF-8 source
# declaration, the requests import, and a __main__ guard.
# The "......." / "xxxxxxx" lines below are placeholders marking where the
# script body goes; this snippet is not runnable as-is.
import requests
if __name__ == "__main__" :
.......
xxxxxxx
.......
二、带参数获取html
#!/usr/bin/env python
# -*- coding:utf-8 -*-
# UA spoofing: make the crawler's request identity look like a real browser.
import requests

if __name__ == "__main__":
    # UA spoofing: wrap the User-Agent header in a dict passed to requests.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.121 Safari/537.36'
    }
    url = 'https://bj.58.com/ershoufang'
    # Query parameters carried by the URL, packed into a dict;
    # requests encodes them onto the URL for us.
    kw = input('输入要搜索的小区:')
    param = {
        'q': kw
    }
    response = requests.get(url=url, params=param, headers=headers)
    page_text = response.text
    fileName = kw + '.html'
    # FIX: the original opened the file and never closed it; a context
    # manager guarantees the handle is flushed and closed.
    with open(fileName, 'w', encoding='utf-8') as fp:
        fp.write(page_text)
    print(fileName, '保存成功!!!')
三、使用xpath定位html元素(待修改)
#!/usr/bin/env python
# -*- coding:utf-8 -*-
import requests
from lxml import etree

# Goal: scrape listing titles from the 58.com second-hand housing page.
if __name__ == "__main__":
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36'
    }
    # Fetch the raw page source.
    url = 'https://bj.58.com/ershoufang/'
    page_text = requests.get(url=url, headers=headers).text
    # Parse the HTML into an element tree.
    tree = etree.HTML(page_text)
    # Each <li> under the listing <ul> is one housing entry.
    li_list = tree.xpath('//ul[@class="house-list-wrap"]/li')
    # FIX: use a context manager so the output file is always closed.
    with open('58.txt', 'w', encoding='utf-8') as fp:
        for li in li_list:
            # Local (relative) xpath within the current li element.
            titles = li.xpath('./div[2]/h2/a/text()')
            # FIX: guard against entries with no title (ads / layout
            # changes) instead of crashing on a bare [0] index.
            if not titles:
                continue
            title = titles[0]
            print(title)
            fp.write(title + '\n')
四、使用正则表达式定位html元素
#!/usr/bin/env python
# -*- coding:utf-8 -*-
import requests
import re
from lxml import etree  # NOTE(review): unused in this snippet — kept to avoid breaking any later additions; confirm and remove.

# Goal: scrape listing titles from 58.com second-hand housing via regex.
if __name__ == "__main__":
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36'
    }
    # Fetch the raw page source.
    url = 'https://bj.58.com/ershoufang/'
    page_text = requests.get(url=url, headers=headers).text
    # Capture the title="" attribute of each listing's <h3>; re.S lets
    # .*? span newlines.
    ex = 'class="property-content-title".*?<h3 title="(.*?)" class="property-'
    title_list = re.findall(ex, page_text, re.S)
    # BUG FIX: the original called fp.write(title_list) with a *list*,
    # which raises TypeError (write expects str). Join the titles into
    # one string; also open with an explicit encoding and a context
    # manager so the file is closed.
    with open('58title.txt', 'w', encoding='utf-8') as fp:
        fp.write('\n'.join(title_list))
    print('下载成功!!!')
五、使用正则表达式定位html元素
#!/usr/bin/env python
# -*- coding:utf-8 -*-
import requests
import re

# Goal: scrape listing title info from 58.com second-hand housing,
# filtered by a user-supplied search keyword.
if __name__ == "__main__":
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36'
    }
    # Fetch the raw page source, with the keyword as a query parameter.
    url = 'https://bj.58.com/ershoufang/'
    kw = input("请输入房源地址:")
    param = {
        "q": kw
    }
    page_text = requests.get(url=url, params=param, headers=headers).text
    print(page_text)
    # Capture the title="" attribute of each listing's <h3>.
    ex = 'class="property-content-title".*?<h3 title="(.*?)" class="property-'
    title_list = re.findall(ex, page_text, re.S)
    # FIX: the original terminated lines with a bare '\r' (carriage
    # return), producing broken lines on most platforms, and never
    # closed the file; use '\n' and a context manager.
    with open('58title.txt', 'w', encoding='utf-8') as fp:
        for title in title_list:
            fp.write(title + '\n')
    print('下载成功!!!')
六、分页获取json数据+解析字段
#!/usr/bin/env python
# -*- coding:utf-8 -*-
import requests
import json

# Goal: page through a POST API to collect company IDs, then fetch each
# company's detail record and dump everything to one JSON file.
if __name__ == "__main__":
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.121 Safari/537.36'
    }
    id_list = []        # company IDs harvested from the list endpoint
    all_data_list = []  # full detail records for every company
    # Batch-fetch company IDs across pages 1..5.
    url = 'http://125.35.6.84:81/xk/itownet/portalAction.do?method=getXkzsList'
    for page in range(1, 6):
        data = {
            'on': 'true',
            'page': str(page),
            'pageSize': '15',
            'productName': '',
            'conditionType': '1',
            'applyname': '',
            'applysn': '',
        }
        json_ids = requests.post(url=url, headers=headers, data=data).json()
        for dic in json_ids['list']:
            id_list.append(dic['ID'])
    # Fetch the detail record for each company.
    post_url = 'http://125.35.6.84:81/xk/itownet/portalAction.do?method=getXkzsById'
    # FIX: renamed loop variable from `id`, which shadowed the builtin.
    for company_id in id_list:
        data = {
            'id': company_id
        }
        detail_json = requests.post(url=post_url, headers=headers, data=data).json()
        all_data_list.append(detail_json)
    # Persist all collected records; the context manager guarantees the
    # file is flushed and closed (the original never closed it).
    with open('./allData.json', 'w', encoding='utf-8') as fp:
        json.dump(all_data_list, fp=fp, ensure_ascii=False)
    print('over!!!')
七、保存数据