从 Softpedia.com 获取软件下载安装程序

240 阅读1分钟

从 Softpedia.com 获取软件下载安装程序。目前,我可以在 Softpedia.com 上获取无限的已抓取链接(包括所需的安装程序链接,例如 hotdownloads.com/trialware/download/Download_a1keylogger.zip?item=33649-3&affiliate=22260)。

spider.py 如下:

from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor

class MySpider(CrawlSpider):
    """Generic crawler: follow every link found within the allowed domain."""

    # Spider identifier used by `scrapy crawl`.
    name = "softpedia"

    # Keep the crawl confined to this host.
    allowed_domains = ["www.softpedia.com"]

    # Where the crawl begins.
    start_urls = ["http://win.softpedia.com/"]

    # Be polite: pause two seconds between requests.
    download_delay = 2

    # Follow every extracted link; no callback — just keep crawling.
    rules = [Rule(SgmlLinkExtractor(), follow=True)]

items.py, pipelines.py and settings.py come as default, except for one line added to settings.py:
FILES_STORE = '/home/test/softpedia/downloads'

使用 urllib2,我能够判断一个链接是否是安装程序,在这种情况下,我在内容类型中获得“应用程序”:

import urllib2
url = 'http://hotdownloads.com/trialware/download/Download_a1keylogger.zip?item=33649-3&affiliate=22260'
response = urllib2.urlopen(url)
content_type = response.info().get('Content-Type')
print content_type

我的问题是,如何收集所需的安装程序链接并将其下载到我的目标文件夹?提前谢谢!

2、解决方案

我结合提到的两种方法来获得实际/镜像安装程序下载,然后使用文件下载管道(files pipeline)进行实际下载。不过需要注意:如果文件下载 URL 是动态/复杂的(例如 www.softpedia.com/dyn-postdownload.php 这类动态生成的链接),该方法可能无法直接取到文件;而对于静态直链(例如 www.ietf.org/rfc/rfc2616.txt)则可以正常下载。

from scrapy.selector import HtmlXPathSelector
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.http import Request
from scrapy.contrib.loader import XPathItemLoader
from scrapy import log
from datetime import datetime 
from scrapy.conf import settings
from myscraper.items import SoftpediaItem

class SoftpediaSpider(CrawlSpider):
    name = "sosoftpedia"
    allowed_domains = ["www.softpedia.com"]
    start_urls = ['http://www.softpedia.com/get/Antivirus/']
    rules = Rule(SgmlLinkExtractor(allow=('/get/', ),allow_domains=("www.softpedia.com"), restrict_xpaths=("//td[@class='padding_tlr15px']",)), callback='parse_links', follow=True,),

    def parse_start_url(self, response):
    return self.parse_links(response)

    def parse_links(self, response):
    print "PRODUCT DOWNLOAD PAGE: "+response.url
    hxs = HtmlXPathSelector(response)
    urls = hxs.select("//a[contains(@itemprop, 'downloadURL')]/@href").extract()
    for url in urls:
        item = SoftpediaItem()
        request =  Request(url=url, callback=self.parse_downloaddetail) 
        request.meta['item'] = item
        yield request

    def parse_downloaddetail(self, response):
    item = response.meta['item']
    hxs = HtmlXPathSelector(response)
    item["file_urls"] = hxs.select('//p[@class="fontsize16"]/b/a/@href').extract() #["http://www.ietf.org/rfc/rfc2616.txt"]
    print "ACTUAL DOWNLOAD LINKS "+ hxs.select('//p[@class="fontsize16"]/b/a/@href').extract()[0]
    yield item