简单静态单页面爬虫
引入requests库后,使用get方法请求网页,然后打印响应的状态码,以确认网页是否成功响应(即获取成功)
# Fetch the chapter page; without a timeout, requests would wait forever on a
# server that never answers.
response = requests.get('https://www.37zww.net/0/234/208873.html', timeout=10)
# 200 means the page was fetched successfully.
print(response.status_code)
如果输出的状态码为200,则说明获取成功
完整代码如下:
import requests
#解法一
from pyquery import PyQuery as pq
# Scrape a single chapter page and save its title and text to a file.

# A timeout keeps the script from hanging forever if the server never answers.
response = requests.get('https://www.37zww.net/0/234/208873.html', timeout=10)
print(response.status_code)
# Stop on a 4xx/5xx answer instead of parsing an error page and overwriting
# the output file with it.
response.raise_for_status()

# Handy attributes to inspect while debugging:
#   type(response), response.headers, response.apparent_encoding, response.text

# Override the encoding so the text is not garbled. GB18030 is a superset of
# GB2312/GBK, so pages that decode as GB2312 decode identically, while rarer
# characters outside GB2312 are no longer replaced with U+FFFD.
response.encoding = "gb18030"

doc = pq(response.text)  # parse the decoded HTML text
title = doc('.bookname > h1').text()
print(title)
content = doc('#content').text()
print(content)

# Write the chapter to a text file; plain 'w' is enough because the file is
# only written, never read back.
with open('剑来.txt', mode='w', encoding='utf-8') as f:
    f.write(title)
    f.write('\n')
    f.write(content)
    f.write('\n')
# Simple multi-chapter novel scraper -- complete code below.
# Reads the chapter index page, follows every chapter link, and appends each
# chapter's title and text to one output file.
from urllib.parse import urljoin  # stdlib; resolves relative and absolute hrefs

index_url = 'https://www.37zww.net/0/234/'

# One Session lets requests reuse the underlying connection for all chapters.
with requests.Session() as session:
    # A timeout keeps the script from hanging if the server never answers.
    response = session.get(index_url, timeout=10)
    # Without a usable index page there is nothing to crawl.
    response.raise_for_status()
    response.encoding = response.apparent_encoding
    doc = pq(response.text)
    links = doc('#list>dl>dd>a')

    # Open the output once in append mode instead of reopening it per chapter.
    with open('剑来.txt', mode='a', encoding='utf-8') as f:
        for link in links.items():
            href = link.attr.href
            if not href:
                # Anchor without an href: nothing to fetch.
                continue
            chapter_url = urljoin(index_url, href)
            print(chapter_url)

            response = session.get(chapter_url, timeout=10)
            if not response.ok:
                # Report and skip rather than writing an error page as a chapter.
                print(f"skipped {chapter_url}: HTTP {response.status_code}")
                continue
            response.encoding = response.apparent_encoding

            doc = pq(response.text)
            title = doc('.bookname>h1').text()
            content = doc('#content').text()

            f.write(title)
            f.write('\n')
            f.write(content)
            f.write('\n')