以爬取阳光热线问政平台网站为例,进行详情页的爬取。
1 # -*- coding: utf-8 -*-
2 import scrapy
3 from yanguang.items import YanguangItem
4
class SunSpider(scrapy.Spider):
    """Spider that walks a paginated list page and follows every row to its detail page.

    For each table row on the list page it collects the title, link and
    publish date into a ``YanguangItem``, then requests the linked detail
    page, where the text content and image URLs are added before the item
    is yielded.
    """

    name = 'sun'
    allowed_domains = ['sun0769.com']
    start_urls = ['http://wz.sun0769.com/index.php/question/questionType?type=4']

    def parse(self, response):
        """Parse one list page.

        Yields one detail-page request per row that has a link, followed by
        a request for the next list page when a ``>`` link is present.
        """
        tr_list = response.xpath("//div[@class='greyframe']/table[2]/tr/td/table/tr")
        for tr in tr_list:
            item = YanguangItem()
            item['title'] = tr.xpath("./td[2]/a[@class='news14']/@title").extract_first()
            item["href"] = tr.xpath("./td[2]/a[@class='news14']/@href").extract_first()
            item["publish_date"] = tr.xpath("./td[last()]/text()").extract_first()

            # Rows without a link cannot be followed; Request(None) would
            # raise and abort the rest of this page.
            if item["href"] is None:
                continue

            yield scrapy.Request(
                # urljoin leaves absolute URLs untouched and resolves relative ones.
                response.urljoin(item["href"]),
                callback=self.parse_detail,
                # Hand the partially filled item over to the detail callback.
                meta={"item": item},
            )

        # Pagination: follow the ">" link if the page has one.
        next_url = response.xpath(".//a[text()='>']/@href").extract_first()
        if next_url is not None:
            yield scrapy.Request(
                response.urljoin(next_url),
                # Pass the bound method itself; calling it here (self.parse())
                # would raise TypeError because no response is supplied.
                callback=self.parse,
            )

    def parse_detail(self, response):
        """Parse a detail page and complete the item started in ``parse``.

        Adds ``content`` (list of text fragments) and ``content_img`` (list
        of absolute image URLs) to the item carried in ``response.meta``,
        then yields it.
        """
        item = response.meta["item"]
        item["content"] = response.xpath("//div[@class='c1 text14_2']//text()").extract()
        image_sources = response.xpath("//div[@class='c1 text14_2']//img/@src").extract()
        # Resolve each src against the page URL rather than blindly prefixing
        # the host, so already-absolute sources are not corrupted.
        item["content_img"] = [response.urljoin(src) for src in image_sources]
        yield item
下面为pipelines.py文件中对爬取的数据处理操作。
1 import re
class YanguangPipeline(object):
    """Item pipeline that cleans the ``content`` field of each scraped item."""

    # Compiled once for the class instead of being looked up per fragment.
    # Matches a non-breaking space or any whitespace character.
    _WHITESPACE = re.compile(r"\xa0|\s")

    def process_item(self, item, spider):
        """Replace ``item["content"]`` with its cleaned form, print and return the item.

        Args:
            item: mapping-like scraped item that carries a ``content`` entry.
            spider: the spider that produced the item (unused here).

        Returns:
            The same item object, with ``content`` cleaned in place.
        """
        item["content"] = self.process_content(item["content"])
        print(item)
        return item

    def process_content(self, content):
        """Strip all whitespace from each text fragment and drop empty results.

        Args:
            content: iterable of strings, or ``None``.

        Returns:
            A new list of the non-empty fragments with every whitespace
            character removed; an empty list when ``content`` is empty or
            ``None``.
        """
        if not content:
            return []
        stripped = (self._WHITESPACE.sub("", fragment) for fragment in content)
        # Fragments consisting only of whitespace collapse to "" and are dropped.
        return [text for text in stripped if text]
在settings.py文件中修改USER_AGENT的内容,使对方服务器无法一眼看出我们的请求是爬虫。
默认settings.py文件中的USER_AGENT为:
1 # Crawl responsibly by identifying yourself (and your website) on the user-agent
2 USER_AGENT = 'tencent (+http://www.yourdomain.com)'
将settings.py文件中的USER_AGENT修改为:
1 # Crawl responsibly by identifying yourself (and your website) on the user-agent
2 USER_AGENT = 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36'
\