scrapy框架下爬虫实现详情页抓取https://www.cnblogs.com/zhiliang9408/p/100

www.cnblogs.com/zhiliang940…

以爬取阳光热线问政平台网站为例，进行详情页的爬取。

 1 # -*- coding: utf-8 -*-
 2 import scrapy
 3 from yanguang.items import YanguangItem
 4 
 5 class SunSpider(scrapy.Spider):
 6     name = 'sun'
 7     allowed_domains = ['sun0769.com']
 8     start_urls = ['http://wz.sun0769.com/index.php/question/questionType?type=4']
 9 
10     def parse(self, response):
11         tr_list=response.xpath("//div[@class='greyframe']/table[2]/tr/td/table/tr")
12         for tr in tr_list:
13             item=YanguangItem()
14             item['title']=tr.xpath("./td[2]/a[@class='news14']/@title").extract_first()
15             item["href"]=tr.xpath("./td[2]/a[@class='news14']/@href").extract_first()
16             item["publish_date"]=tr.xpath("./td[last()]/text()").extract_first()
17 
18             yield scrapy.Request(
19                 item["href"],
20                 callback=self.parse_detail,
21                 meta={"item":item},
22             )
23         #翻页
24         next_url=response.xpath(".//a[text()='>']/@href").extract_first()
25         if next_url is not None:
26             yield scrapy.Request(
27                 next_url,
28                 callback=self.parse()
29             )
30 
31 
32     def parse_detail(self,response): #处理详情页
33         item=response.meta["item"]
34         item["content"]=response.xpath("//div[@class='c1 text14_2']//text()").extract()
35         item["content_img"] = response.xpath("//div[@class='c1 text14_2']//img/@src").extract()
36         item["content_img"] =["http://wz.sun0769.com"+i for i in item["content_img"]]
37         yield item

下面为pipelines.py文件中对爬取的数据处理操作。

 1 import re
 2 class YanguangPipeline(object):
 3     def process_item(self, item, spider):
 4         item["content"]=self.process_content(item["content"])
 5         print(item)
 6         return item
 7 
 8     def process_content(self,content):#文本内容的处理
 9         content=[re.sub(r"\xa0|\s","",i)for i in content]
10         content=[i for i in content if len(i)>0]#去除列表中的空字符串
11         return content

在settings.py文件中修改USER_AGENT的内容，使对方服务器无法一眼看出我们的请求是爬虫。

默认settings.py文件中的USER_AGENT为：

# Crawl responsibly by identifying yourself (and your website) on the user-agent
USER_AGENT = 'tencent (+http://www.yourdomain.com)'

将settings.py文件中的USER_AGENT修改为：

# Crawl responsibly by identifying yourself (and your website) on the user-agent
# Set to a desktop Chrome user-agent string instead of the project default.
USER_AGENT = 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36'