爬煎蛋凉飕飕系列

178 阅读 · 1 分钟

自己封装了一套工具,用来解析xpath,url查重,封装了MySQL的游标,下面主要展示爬虫部分

# ! /usr/bin/python3 
# -*- coding:utf-8 -*-  
# @Author: vicnic  
# @Date: 2018-09-10 10:02:32   

import Tools as tl
import Settings as st
from enum import Enum, unique
import json
import requests

@unique  # @unique guarantees that no two enum members share the same value.
class WebSite(Enum):
    """Placeholder enumeration of crawl-target web sites (currently empty)."""

def startSpider(url_list):
    """Fetch every URL in *url_list* as a parsed HTML tree and analyze it."""
    for seed_url in url_list:
        # tl.getHtmlTrees resolves the URL into an xpath-capable tree.
        parsed_tree = tl.getHtmlTrees(seed_url)
        analyzeTree(parsed_tree)

def analyzeTree(tree):
    """Scan one listing page: download every matching article, then follow
    the "next page" link by recursing through startSpider.

    tree: an lxml-style element tree of a jandan.net tag-listing page
        (produced by tl.getHtmlTrees) — assumed, TODO confirm against Tools.

    Fixes the crash the original comment warned about: when the pagination
    block is missing or empty (last page reached) this now returns instead
    of raising IndexError.
    """
    nav_list = tree.xpath('//div[@class="wp-pagenavi"]/a')
    next_page_url = u''
    if len(nav_list) > 1:
        # Pages 2+: the second anchor points to the next page.
        next_page_url = nav_list[1].xpath('@href')[0]
    elif len(nav_list) == 1:
        # Page 1: the only anchor is the "next page" link.
        next_page_url = nav_list[0].xpath('@href')[0]
    # (len == 0: no pagination at all — next_page_url stays empty.)

    # Collect the article links from the index listing.
    a_list = tree.xpath('//div[@id="content"]/div/div[@class="indexs"]/h2/a')
    for a in a_list:
        title = ''
        url = ''
        title_list = a.xpath('text()')
        url_list = a.xpath('@href')
        if title_list and url_list:
            title = title_list[0]
            url = url_list[0]
        # Only download stories belonging to the two target series.
        if (u'凉飕飕' in title) or (u'半夜' in title):
            getContent(url)

    if not next_page_url:
        # No further pages: stop cleanly (original code crashed here).
        return
    startSpider([u'http://jandan.net' + next_page_url])

def getContent(url, out_path=u'E:\\凉飕飕系列.txt'):
    """Download one article page and append its paragraph text to a file.

    url: article URL to fetch via tl.getHtmlTrees.
    out_path: destination file, appended to in binary mode; defaults to
        the original hard-coded path for backward compatibility.
    """
    tree = tl.getHtmlTrees(url)
    p_list = tree.xpath('//*[@id="content"]/div[2]/p/text()')
    # 'with' guarantees the handle is closed even if a write raises
    # (the original open()/close() pair leaked the handle on error).
    with open(out_path, 'ab+') as w:
        for paragraph in p_list:
            w.write(paragraph.encode('utf8'))
            w.write('\r\n'.encode('utf8'))

    

    
if __name__ == '__main__':
    # Seed the crawl with jandan.net's "故事" (story) tag listing page.
    seed = 'http://jandan.net/tag/%E6%95%85%E4%BA%8B'
    startSpider([seed])