阅读本文章需要提前掌握 XPath、Python 等基础知识。本文仅供 Python 初学者学习了解网络爬虫之用,禁止用于商业牟利。话不多说,直接上代码。
# 获取url对应的网页源码
def getsource(url):
    """Fetch the page at *url* and return its HTML text.

    A browser-like User-Agent is sent so the site does not reject the
    request as an obvious script.
    """
    headers = {
        # BUG FIX: the original value accidentally repeated the header name
        # ("User-Agent Mozilla/5.0 ..."); the value must begin with the
        # product token itself.
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0)'}
    # A timeout keeps the crawler from hanging forever on a dead server.
    sourceHtml = requests.get(url, headers=headers, timeout=10)
    return sourceHtml.text
此处根据得到的网页源代码,通过 XPath 解析出每一项数据,并封装成我们需要的数组。
# 开始抓取并分析数据
def spiderData(url):
    """Scrape the second-hand-housing listing page at *url*.

    Returns a JSON string (non-ASCII kept as-is) of dicts with the
    title, district, estate name, total price, unit price and cover
    image URL of each listing.
    """
    dom = etree.HTML(getsource(url))
    div_list = dom.xpath('//div[contains(@class, "js-tips-list")]/div[contains(@class, "f-list-item")]')

    def _first(node, path, default=''):
        # xpath() returns a list; the original code indexed [0] blindly,
        # which raises IndexError for any listing missing a field.
        found = node.xpath(path)
        return found[0] if found else default

    data = []
    for item in div_list:
        res = {
            'title': _first(item, './/dd[contains(@class, "title")]/a/text()'),
            'address': _first(item, './/dd[contains(@class, "address")]//a[@class="address-eara"]/text()'),
            'address-eara': _first(item, './/dd[contains(@class, "address")]//span[@class="address-eara"]/text()'),
            'price': _first(item, './/dd[contains(@class, "info")]//span[@class="num"]/text()') + '万',
            'singlePrice': _first(item, './/dd[contains(@class, "info")]//div[@class="time"]/text()'),
            'images': _first(item, './/div[@class="img-wrap"]//img/@src'),
        }
        data.append(res)
    # json.dumps(..., encoding=...) exists only on Python 2 (TypeError on
    # Python 3); dropping it keeps identical behaviour on both versions.
    return json.dumps(data, ensure_ascii=False)
执行,查看部分json结果(因为太长,所以截取)
[{
"title": "9号线 外地人可买无需社保 70年产权 可落户上",
"address-eara": "阳光理想城",
"price": "53万",
"singlePrice": "9636元/㎡",
"address": "松江",
"images": "http://pic7.58cdn.com.cn/anjuke_58/8205d4e70763f8a7bd7217d0e3cd574c?w=480&h=360&crop=1"
}, {
"title": "崇明品质小区 ,外地人可买 ,精装修带地暖 ,首",
"address-eara": "明南佳苑",
"price": "101万",
"singlePrice": "17127元/㎡",
"address": "崇明",
"images": "http://pic6.58cdn.com.cn/anjuke_58/940dfa7b71d5e2a651e4fccb584c6170?w=480&h=360&crop=1"
}, {
"title": "经典小户型,外地可买,不受限购,送6万家具家电!",
"address-eara": "临港17区",
"price": "20万",
"singlePrice": "15384元/㎡",
"address": "浦东",
"images": "http://pic2.58cdn.com.cn/anjuke_58/f7a9cd75d2d0439bf867d984ce06fcee?w=480&h=360&crop=1"
}, {
"title": "上海临港自贸区,不限贷,中国的第二个香港,精装修",
"address-eara": "临港17区",
"price": "105万",
"singlePrice": "16153元/㎡",
"address": "浦东",
"images": "http://pic1.58cdn.com.cn/anjuke_58/6553ebe8ccb5e062e7e727b2dd48e4e4?w=480&h=360&crop=1"
}
···
]
这样我们就得到了所需要的爬虫数据。
欢迎喜欢网络爬虫的同学们批评指正,一起共同学习交流。
以下提供全部源码
# -*- coding: UTF-8 -*-
import urllib
from lxml import etree
import json
import requests
# 获取url对应的网页源码
def getsource(url):
    """Fetch the page at *url* and return its HTML text.

    A browser-like User-Agent is sent so the site does not reject the
    request as an obvious script.
    """
    headers = {
        # BUG FIX: the original value accidentally repeated the header name
        # ("User-Agent Mozilla/5.0 ..."); the value must begin with the
        # product token itself.
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0)'}
    # A timeout keeps the crawler from hanging forever on a dead server.
    sourceHtml = requests.get(url, headers=headers, timeout=10)
    return sourceHtml.text
# 开始抓取并分析数据
def spiderData(url):
    """Scrape the second-hand-housing listing page at *url*.

    Returns a JSON string (non-ASCII kept as-is) of dicts with the
    title, district, estate name, total price, unit price and cover
    image URL of each listing.
    """
    dom = etree.HTML(getsource(url))
    div_list = dom.xpath('//div[contains(@class, "js-tips-list")]/div[contains(@class, "f-list-item")]')

    def _first(node, path, default=''):
        # xpath() returns a list; the original code indexed [0] blindly,
        # which raises IndexError for any listing missing a field.
        found = node.xpath(path)
        return found[0] if found else default

    data = []
    for item in div_list:
        res = {
            'title': _first(item, './/dd[contains(@class, "title")]/a/text()'),
            'address': _first(item, './/dd[contains(@class, "address")]//a[@class="address-eara"]/text()'),
            'address-eara': _first(item, './/dd[contains(@class, "address")]//span[@class="address-eara"]/text()'),
            'price': _first(item, './/dd[contains(@class, "info")]//span[@class="num"]/text()') + '万',
            'singlePrice': _first(item, './/dd[contains(@class, "info")]//div[@class="time"]/text()'),
            'images': _first(item, './/div[@class="img-wrap"]//img/@src'),
        }
        data.append(res)
    # json.dumps(..., encoding=...) exists only on Python 2 (TypeError on
    # Python 3); dropping it keeps identical behaviour on both versions.
    return json.dumps(data, ensure_ascii=False)
# Entry point: only crawl when run as a script, not when imported.
if __name__ == '__main__':
    result = spiderData('http://sh.ganji.com/ershoufang/')
    # Single-argument print with parentheses is valid on Python 2 and 3.
    print(result)