闲来无事,打算练练 lxml,就选择了韩寒的博客。虽然只有 7 页,但是爬下来发现文档也有 1.2M,看来也是高产的呢(2333)。话不多说,先上代码;代码里都有必要的注释,想来应该不会有什么问题。
# -*- coding: UTF-8 -*-
# Reference: https://segmentfault.com/q/1010000004879947
import io
import time

import requests
from lxml import html
def _toText(value):
    """Return ``value`` as unicode text, decoding UTF-8 byte strings."""
    # On Python 2, lxml hands back plain byte strings for ASCII-only results;
    # the text-mode output file only accepts unicode.
    if isinstance(value, bytes):
        return value.decode('utf-8')
    return value


def requestPageText(pageUrl, outputPath='F:\\aaa.txt', delay=2, timeout=30):
    """Append every article linked from one article-list page to a file.

    Fetches the list page at ``pageUrl``, finds the article links on it,
    prints the first title, then requests each article and appends a blank
    line, the title and the article text to ``outputPath`` as UTF-8.

    Args:
        pageUrl: URL of an article-list page.
        outputPath: File the articles are appended to. Defaults to the path
            this script originally hard-coded.
        delay: Seconds to pause after fetching the list page.
        timeout: Seconds to wait on each HTTP request before ``requests``
            raises a timeout error instead of hanging indefinitely.

    Returns:
        None. List pages without article links, links without an ``href``
        and articles without the expected body container are skipped.
    """
    listPage = requests.get(pageUrl, timeout=timeout)
    time.sleep(delay)
    # Force UTF-8 decoding of the response; without it the titles are garbled.
    listPage.encoding = 'utf-8'
    # The list page is handed to lxml as UTF-8 bytes rather than unicode text.
    listTree = html.fromstring(listPage.text.encode('utf-8'))

    # Select the anchors themselves rather than separate href/text lists, so
    # each URL stays paired with its own title even if an anchor has no text.
    links = listTree.xpath(
        '//div[@class="articleCell SG_j_linedot1"]'
        '/p[@class="atc_main SG_dot"]'
        '/span[@class="atc_title"]/a'
    )
    if not links:
        print('no article links found on %s' % pageUrl)
        return

    print(_toText(links[0].text_content()))

    for link in links:
        articleUrl = link.get('href')
        if not articleUrl:
            continue
        title = _toText(link.text_content())

        articlePage = requests.get(articleUrl, timeout=timeout)
        articlePage.encoding = 'utf-8'
        articleTree = html.fromstring(articlePage.text)
        bodyNodes = articleTree.xpath('//div[@id="sina_keyword_ad_area2"]')
        if not bodyNodes:
            # The article page lacks the expected container; skip it instead
            # of failing the whole run with an IndexError.
            print('no article body found at %s' % articleUrl)
            continue
        # string(.) concatenates all text beneath the container element.
        bodyText = _toText(bodyNodes[0].xpath('string(.)'))

        # Append per article so finished articles are on disk even if a later
        # request fails; the context manager closes the file on any error.
        with io.open(outputPath, 'a', encoding='utf-8') as out:
            out.write(u"\r\n" + title + bodyText)
if __name__ == '__main__':
    # Crawl the article-list pages numbered 1 through 7. The page count is
    # hard-coded; an earlier, disabled draft fetched page 1 and derived the
    # count from the pager markup
    # ('//div[@class="SG_page"]/ul[@class="SG_pages"]/li', item count minus 1).
    urlTemplate = 'http://blog.sina.com.cn/s/articlelist_1191258123_0_%s.html'
    for pageNo in range(1, 8):
        listUrl = urlTemplate % pageNo
        print('------->' + listUrl)
        requestPageText(listUrl)