利用lxml爬取韩寒博客全部文档并保存

481 阅读1分钟

闲来无事,打算练练 lxml,就选择了韩寒的博客。虽然只有 7 页,但是爬下来发现文档也有 1.2M,看来也是高产的呢 2333。话不多说,先上代码——代码里都有必要的注释,想来应该不会有什么问题。

# -*- coding: UTF-8 -*-
#参考资料:https://segmentfault.com/q/1010000004879947
import requests
from lxml import html
import time

def requestPageText(pageUrl):
    n_page = requests.get(pageUrl)
    time.sleep(2)
    n_page.encoding = 'utf-8'#让页面的编码为utf8,如果没有这个标题就乱码
    q = n_page.text.encode('utf-8')#只写了这个对标题是没有用的,原因还不明白
    n_tree = html.fromstring(q)
    #获取URL数组
    urlArr = n_tree.xpath('//div[@class="articleCell SG_j_linedot1"]/p[@class="atc_main SG_dot"]/span[@class="atc_title"]/a/@href')
    #获取标题数组
    titleArr = n_tree.xpath('//div[@class="articleCell SG_j_linedot1"]/p[@class="atc_main SG_dot"]/span[@class="atc_title"]/a/text()')
    print titleArr[0]
    for i in range(len(urlArr)):
        textPage = requests.get(urlArr[i])
        textPage.encoding = 'utf-8'
        textTree = html.fromstring(textPage.text)
        textArray = textTree.xpath('//div[@id="sina_keyword_ad_area2"]')
        e = textArray[0]
        t = e.xpath('string(.)')
        write = open('F:\\aaa.txt','a')
        write.write("\r\n")  
        write.write((titleArr[i]).encode('utf8'))
        write.write(t.encode('utf8'))
        write.close()
if __name__ == '__main__':  
    # url = 'http://blog.sina.com.cn/s/articlelist_1191258123_0_1.html'
    # mypage = requests.get(url)
    # mypage.encoding = 'utf-8'
    # k = mypage.text.encode('utf-8')
    # tree = html.fromstring(k)
    # #获取页码数
    # pageCountArr = tree.xpath('//div[@class="SG_page"]/ul[@class="SG_pages"]/li')
    # pageTotal = len(pageCountArr) - 1


    for i in range(1,8):
        print '------->http://blog.sina.com.cn/s/articlelist_1191258123_0_%s.html' %i
        requestPageText('http://blog.sina.com.cn/s/articlelist_1191258123_0_%s.html' %i)