Web Scraping: Using XPath


General scraping steps


import requests
from lxml import etree

rsp = requests.get(url, headers=headers, params=params)  # fetch the page
html = etree.HTML(rsp.text)  # parse the response so it can be queried with XPath
result = html.xpath(xxxx)  # xxxx stands for the XPath expression; writing it is what this post covers
with open(filename, 'w') as f:  # write the results to a file
    f.write(str(result))  # 'with' closes the file automatically, no f.close() needed

Using XPath

Example HTML
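The sample document the snippets below query did not survive in this post. The following is a reconstruction (an assumption) pieced together from the expressions used below, which expect li elements with item-* classes wrapping a elements that link to link1.html through link5.html:

from lxml import etree

# reconstructed sample document, consistent with the XPath queries below
text = '''
<div>
    <ul>
        <li class="item-0"><a href="link1.html">first item</a></li>
        <li class="item-1"><a href="link2.html">second item</a></li>
        <li class="item-inactive"><a href="link3.html">third item</a></li>
        <li class="item-1"><a href="link4.html">fourth item</a></li>
        <li class="item-0"><a href="link5.html">fifth item</a></li>
    </ul>
</div>
'''
html = etree.HTML(text)  # the html object used by the queries below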

  • Select all nodes
result = html.xpath('//*')
  • Select child / descendant nodes

/ selects direct children, // selects descendants at any depth
result = html.xpath('//li/a')  # a elements that are direct children of li
result = html.xpath('//ul//a') # a elements anywhere beneath ul
  • Select the parent node

Use /../ to step up one level:
result = html.xpath('//a[@href="link4.html"]/../@class')  # class of the parent li
  • Attribute matching
result = html.xpath('//li[@class="item-0"]')
  • Matching a multi-valued attribute

When an attribute holds several space-separated values, an exact test such as @class="li" fails, so use contains(). The markup below is reconstructed from this post's garbled snippet:

text = '''
<li class="li li-first"><a href="link.html">first item</a></li>
'''
html = etree.HTML(text)
result = html.xpath('//li[contains(@class, "li-first")]/a/text()')
# or: html.xpath('//li[contains(@class, "li")]/a/text()')
print(result)
  • Matching on multiple attributes

Combine several predicates with the and operator (markup again reconstructed from the garbled snippet):

text = '''
<li class="li li-first" name="item"><a href="link.html">first item</a></li>
'''
html = etree.HTML(text)
result = html.xpath('//li[contains(@class, "li-first") and @name="item"]/a/text()')
print(result)
  • Attribute extraction
result = html.xpath('//li/a/@href')
# note: unlike attribute matching ([@href="..."]), a trailing @href extracts the attribute's value
  • Text extraction
result = html.xpath('//li[@class="item-0"]/a/text()')
or
result = html.xpath('//li[@class="item-0"]//text()')  # // also collects text from all descendants
  • Selecting by position
    • last()
    • functions such as position()
    (see the first sketch after this list)
  • Node-axis selection (see the second sketch after this list)
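A minimal sketch of selecting by position, run against the html object parsed from the Example HTML at the top of this section (rebuild it with etree.HTML(text) if a later snippet reassigned it); the [n] predicates and the last()/position() functions are standard XPath:

result = html.xpath('//li[1]/a/text()')               # first li
result = html.xpath('//li[last()]/a/text()')          # last li
result = html.xpath('//li[position() < 3]/a/text()')  # first two li
result = html.xpath('//li[last() - 1]/a/text()')      # second-to-last li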
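And a sketch of a few node axes against the same document; ancestor::, attribute::, child:: and following-sibling:: are all standard XPath axes:

result = html.xpath('//li[1]/ancestor::*')                   # every ancestor node
result = html.xpath('//li[1]/ancestor::div')                 # only div ancestors
result = html.xpath('//li[1]/attribute::*')                  # all attribute values of the first li
result = html.xpath('//li[1]/child::a[@href="link1.html"]')  # direct children that match
result = html.xpath('//li[1]/following-sibling::*')          # all siblings after the first li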

Example: scraping the Douban Top 250

    
import json

import requests
from lxml import etree

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36'
}

total = []

# fetch one listing page and pull out each film's details with XPath
def getdetails(url):
    try:
        res = requests.get(url, headers=headers)
        html = etree.HTML(res.text)
        film = html.xpath('//*[@id="content"]/div/div[1]/ol/li')
        for div in film:
            title = div.xpath('div/div[2]/div[1]/a/span[1]/text()')[0]
            score = div.xpath('div/div[2]/div[2]/div/span[2]/text()')[0]
            detail = div.xpath('div/div[2]/div[2]/p[1]/text()')[0].strip()
            introduction = div.xpath('div/div[2]/div[2]/p[2]/span/text()')[0]
            total.append({
                'title': title,
                'score': score,
                'detail': detail,
                'introduction': introduction
            })
    except Exception:
        print('Crawl failure')
    return total

if __name__ == '__main__':
    for i in range(10):
        num = i * 25  # each page lists 25 films, so start = 0, 25, 50, ...
        url = 'https://movie.douban.com/top250?start={}&filter='.format(num)
        print(url)
        getdetails(url)
        print('page {} has been crawled'.format(i))

    filename = 'movieInformation.json'
    with open(filename, 'w', encoding='utf-8') as f:
        # dumps() serializes the list of dicts to a JSON string;
        # ensure_ascii=False keeps the Chinese titles readable
        f.write(json.dumps(total, ensure_ascii=False))