Python爬虫个人整理
一、代码基础规范
#!/usr/bin/env python
# -*- coding:utf-8 -*-
# Template: standard crawler-script boilerplate — shebang, UTF-8 source
# declaration, the requests import, and a __main__ guard.
# The "......." / "xxxxxxx" lines below are placeholders marking where the
# script body goes; this snippet is not runnable as-is.
import requests
if __name__ == "__main__" :
.......
xxxxxxx
.......
二、带参数获取html
#!/usr/bin/env python
# -*- coding:utf-8 -*-
# UA spoofing: make the crawler's request identity look like a real browser.
import requests

if __name__ == "__main__":
    # UA spoofing: wrap the User-Agent header in a dict passed to requests.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.121 Safari/537.36'
    }
    url = 'https://bj.58.com/ershoufang'
    # Query parameters carried by the URL, packed into a dict;
    # requests encodes them onto the URL for us.
    kw = input('输入要搜索的小区:')
    param = {
        'q': kw
    }
    response = requests.get(url=url, params=param, headers=headers)
    page_text = response.text
    fileName = kw + '.html'
    # FIX: the original opened the file and never closed it; a context
    # manager guarantees the handle is flushed and closed.
    with open(fileName, 'w', encoding='utf-8') as fp:
        fp.write(page_text)
    print(fileName, '保存成功!!!')
三、使用xpath定位html元素(待修改)
#!/usr/bin/env python
# -*- coding:utf-8 -*-
import requests
from lxml import etree

# Goal: scrape listing titles from the 58.com second-hand housing page.
if __name__ == "__main__":
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36'
    }
    # Fetch the raw page source.
    url = 'https://bj.58.com/ershoufang/'
    page_text = requests.get(url=url, headers=headers).text
    # Parse the HTML into an element tree.
    tree = etree.HTML(page_text)
    # Each <li> under the listing <ul> is one housing entry.
    li_list = tree.xpath('//ul[@class="house-list-wrap"]/li')
    # FIX: use a context manager so the output file is always closed.
    with open('58.txt', 'w', encoding='utf-8') as fp:
        for li in li_list:
            # Local (relative) xpath within the current li element.
            titles = li.xpath('./div[2]/h2/a/text()')
            # FIX: guard against entries with no title (ads / layout
            # changes) instead of crashing on a bare [0] index.
            if not titles:
                continue
            title = titles[0]
            print(title)
            fp.write(title + '\n')
四、使用正则表达式定位html元素
#!/usr/bin/env python
# -*- coding:utf-8 -*-
import requests
import re
from lxml import etree  # NOTE(review): unused in this snippet — kept to avoid breaking any later additions; confirm and remove.

# Goal: scrape listing titles from 58.com second-hand housing via regex.
if __name__ == "__main__":
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36'
    }
    # Fetch the raw page source.
    url = 'https://bj.58.com/ershoufang/'
    page_text = requests.get(url=url, headers=headers).text
    # Capture the title="" attribute of each listing's <h3>; re.S lets
    # .*? span newlines.
    ex = 'class="property-content-title".*?<h3 title="(.*?)" class="property-'
    title_list = re.findall(ex, page_text, re.S)
    # BUG FIX: the original called fp.write(title_list) with a *list*,
    # which raises TypeError (write expects str). Join the titles into
    # one string; also open with an explicit encoding and a context
    # manager so the file is closed.
    with open('58title.txt', 'w', encoding='utf-8') as fp:
        fp.write('\n'.join(title_list))
    print('下载成功!!!')
五、使用正则表达式定位html元素
#!/usr/bin/env python
# -*- coding:utf-8 -*-
import requests
import re

# Goal: scrape listing title info from 58.com second-hand housing,
# filtered by a user-supplied search keyword.
if __name__ == "__main__":
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36'
    }
    # Fetch the raw page source, with the keyword as a query parameter.
    url = 'https://bj.58.com/ershoufang/'
    kw = input("请输入房源地址:")
    param = {
        "q": kw
    }
    page_text = requests.get(url=url, params=param, headers=headers).text
    print(page_text)
    # Capture the title="" attribute of each listing's <h3>.
    ex = 'class="property-content-title".*?<h3 title="(.*?)" class="property-'
    title_list = re.findall(ex, page_text, re.S)
    # FIX: the original terminated lines with a bare '\r' (carriage
    # return), producing broken lines on most platforms, and never
    # closed the file; use '\n' and a context manager.
    with open('58title.txt', 'w', encoding='utf-8') as fp:
        for title in title_list:
            fp.write(title + '\n')
    print('下载成功!!!')
六、分页获取json数据+解析字段
#!/usr/bin/env python
# -*- coding:utf-8 -*-
import requests
import json

# Goal: page through a POST API to collect company IDs, then fetch each
# company's detail record and dump everything to one JSON file.
if __name__ == "__main__":
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.121 Safari/537.36'
    }
    id_list = []        # company IDs harvested from the list endpoint
    all_data_list = []  # full detail records for every company
    # Batch-fetch company IDs across pages 1..5.
    url = 'http://125.35.6.84:81/xk/itownet/portalAction.do?method=getXkzsList'
    for page in range(1, 6):
        data = {
            'on': 'true',
            'page': str(page),
            'pageSize': '15',
            'productName': '',
            'conditionType': '1',
            'applyname': '',
            'applysn': '',
        }
        json_ids = requests.post(url=url, headers=headers, data=data).json()
        for dic in json_ids['list']:
            id_list.append(dic['ID'])
    # Fetch the detail record for each company.
    post_url = 'http://125.35.6.84:81/xk/itownet/portalAction.do?method=getXkzsById'
    # FIX: renamed loop variable from `id`, which shadowed the builtin.
    for company_id in id_list:
        data = {
            'id': company_id
        }
        detail_json = requests.post(url=post_url, headers=headers, data=data).json()
        all_data_list.append(detail_json)
    # Persist all collected records; the context manager guarantees the
    # file is flushed and closed (the original never closed it).
    with open('./allData.json', 'w', encoding='utf-8') as fp:
        json.dump(all_data_list, fp=fp, ensure_ascii=False)
    print('over!!!')
七、保存数据