自己封装了一套工具,用来解析xpath,url查重,封装了MySQL的游标,下面主要展示爬虫部分
# ! /usr/bin/python3
# -*- coding:utf-8 -*-
# @Author: vicnic
# @Date: 2018-09-10 10:02:32
import Tools as tl
import Settings as st
from enum import Enum, unique
import json
import requests
@unique  # @unique guarantees no two enum members share the same value.
class WebSite(Enum):
    """Enumeration of target web sites.

    Currently empty — kept as a placeholder; @unique will enforce
    distinct values once members are added.
    """
    pass
def startSpider(url_list):
    """Crawl every listing URL in *url_list*.

    Each URL is fetched and parsed into a tree by the project helper
    ``tl.getHtmlTrees``, then handed to ``analyzeTree`` for extraction.
    """
    for listing_url in url_list:
        parsed_tree = tl.getHtmlTrees(listing_url)  # project helper: URL -> parsed HTML tree
        analyzeTree(parsed_tree)
def analyzeTree(tree):
    """Process one listing page: download matching articles, then follow pagination.

    Parameters
    ----------
    tree : parsed HTML tree (as returned by ``tl.getHtmlTrees``)
        The listing page to scan.

    Articles whose title contains '凉飕飕' or '半夜' are fetched via
    ``getContent``. If a "next page" link exists, crawling continues
    recursively through ``startSpider``; otherwise the function returns
    normally. (The original version raised IndexError and died on the
    last page, as its own comment admitted.)
    """
    # --- extract article links on this page -------------------------------
    a_list = tree.xpath('//div[@id="content"]/div/div[@class="indexs"]/h2/a')
    for a in a_list:
        title_list = a.xpath('text()')
        url_list = a.xpath('@href')
        if title_list and url_list:
            title = title_list[0]
            url = url_list[0]
            # Only articles from the two serialized story threads are wanted.
            if (u'凉飕飕' in title) or (u'半夜' in title):
                getContent(url)

    # --- pagination -------------------------------------------------------
    nav_list = tree.xpath('//div[@class="wp-pagenavi"]/a')
    if not nav_list:
        return  # no navigation bar at all: nothing more to crawl

    # With 2+ nav links the second one is "next page"; with exactly one
    # (first page) that single link is "next page".
    anchor = nav_list[1] if len(nav_list) > 1 else nav_list[0]
    href_list = anchor.xpath('@href')
    if not href_list or not href_list[0]:
        return  # last page: no next-page href — stop recursing instead of crashing

    # hrefs on the site are relative; prepend the site root.
    startSpider([u'http://jandan.net' + href_list[0]])
def getContent(url):
    """Fetch one article page and append its paragraph text to the output file.

    Parameters
    ----------
    url : str
        Absolute URL of the article to download.

    Side effects: appends each paragraph (UTF-8 encoded, CRLF-terminated)
    to the hard-coded file E:\\凉飕飕系列.txt.
    """
    tree = tl.getHtmlTrees(url)
    p_list = tree.xpath('//*[@id="content"]/div[2]/p/text()')
    # `with` guarantees the handle is closed even if a write raises;
    # the original leaked the file descriptor on any exception.
    with open(u'E:\\凉飕飕系列.txt', 'ab+') as w:
        for paragraph in p_list:
            w.write(paragraph.encode('utf8'))
            w.write('\r\n'.encode('utf8'))
if __name__ == '__main__':
    # Seed URL: jandan.net tag listing for "故事" (percent-encoded).
    seed_url = 'http://jandan.net/tag/%E6%95%85%E4%BA%8B'
    startSpider([seed_url])