爬起点小说中的一章
# -*- coding: utf-8 -*-
import requests
import re
# Download a single chapter page from read.qidian.com.
url = 'https://read.qidian.com/chapter/pHCOMN5YAqETFqQ-idajwA2/NhXKjTTceCNOBDFlr9quQA2'
# Send the HTTP request. The timeout keeps the script from blocking forever
# when the server never answers.
response = requests.get(url, timeout=10)
# Stop on an HTTP error status instead of searching an error page.
response.raise_for_status()
# If the text comes out garbled, set response.encoding explicitly
# (for example "utf-8" or "gbk") before reading response.text.
# HTML source of the chapter page.
html = response.text
# The chapter body is whatever sits inside the "read-content" div; re.S lets
# "." match across line breaks so a multi-line body is captured.
match = re.search(r'<div class="read-content j_readContent">(.*?)</div>', html, re.S)
if match is None:
    # Previously this case surfaced as a bare IndexError from findall(...)[0].
    raise RuntimeError('chapter content <div> was not found in the downloaded page')
main = match.group(1)
# Drop the opening paragraph tags from the extracted markup.
main = main.replace("<p>", "")
print(main)
关于如何爬小说可以参考 python爬虫爬网络小说
接下来的内容学自《自从学会了Python,我从来不为看什么电影发愁,好看的,付费的,百度不到资源的全部一网打尽!》
关于反爬虫
最简单反爬虫
请求头:User-Agent
用来确保是浏览器来访问的,而不是使用代码访问网站
我们可以使用代码伪装成浏览器来访问网站(反反爬)
爬猫眼电影榜单信息
import requests
import re
# Download the Maoyan board page.
url = 'https://maoyan.com/board/7'
# Request headers: the User-Agent tells the server the request comes from a browser.
header = {'User-Agent':'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'}
# Send the HTTP request with the browser-like headers. The timeout keeps the
# script from blocking forever when the server never answers.
response = requests.get(url, headers=header, timeout=10)
print(response.text)
可以将发送请求的代码定义为函数 respon
import requests
import re
def respon(n, timeout=10):
    """Download one Maoyan board page and print its HTML source.

    Args:
        n: Value substituted into the ``offset`` query parameter of the URL.
        timeout: Seconds to wait for the server before ``requests`` gives up.
            Without a timeout the call could block indefinitely.
    """
    # Build the page URL for the requested offset.
    url = f'https://maoyan.com/board/6?offset={n}'
    # Request headers: the User-Agent tells the server the request comes from a browser.
    header = {'User-Agent':'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'}
    # Send the HTTP request with the browser-like headers.
    response = requests.get(url, headers=header, timeout=timeout)
    print(response.text)
# Request the board page with offset 5.
respon(5)
import requests
import re
import lxml
from lxml import etree
def respon(n, timeout=10):
    """Download one Maoyan board page and return its HTML source.

    Args:
        n: Value substituted into the ``offset`` query parameter of the URL.
        timeout: Seconds to wait for the server before ``requests`` gives up.
            Without a timeout the call could block indefinitely.

    Returns:
        The response body decoded as text (``response.text``).
    """
    # Build the URL with an f-string;
    # 'https://maoyan.com/board/6?offset={}'.format(n) would be equivalent.
    url = f'https://maoyan.com/board/6?offset={n}'
    # Request headers: the User-Agent tells the server the request comes from a browser.
    header = {'User-Agent':'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'}
    # Send the HTTP request with the browser-like headers.
    response = requests.get(url, headers=header, timeout=timeout)
    return response.text
def parse(text):
    """Print the list of movie titles found in a board page.

    Args:
        text: HTML source of the page.
    """
    # Turn the raw HTML into an element tree that can be queried with XPath.
    tree = etree.HTML(text)
    # The title is the ``title`` attribute of the link inside each
    # "movie-item-info" block's "name" paragraph.
    print(tree.xpath('//div[@class="movie-item-info"]/p[@class="name"]/a/@title'))
# Fetch the page source for offset 5, then hand it to parse().
text = respon(5)
parse(text)
import requests
import re
import lxml
from lxml import etree
def respon(n, timeout=10):
    """Download one Maoyan board page and return its HTML source.

    Args:
        n: Value substituted into the ``offset`` query parameter of the URL.
        timeout: Seconds to wait for the server before ``requests`` gives up.
            Without a timeout the call could block indefinitely.

    Returns:
        The response body decoded as text (``response.text``).
    """
    # Build the URL with an f-string;
    # 'https://maoyan.com/board/6?offset={}'.format(n) would be equivalent.
    url = f'https://maoyan.com/board/6?offset={n}'
    # Request headers: the User-Agent tells the server the request comes from a browser.
    header = {'User-Agent':'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'}
    # Send the HTTP request with the browser-like headers.
    response = requests.get(url, headers=header, timeout=timeout)
    return response.text
def parse(text):
    """Print each movie title next to its release-time text.

    Args:
        text: HTML source of the page.
    """
    # Turn the raw HTML into an element tree that can be queried with XPath.
    tree = etree.HTML(text)
    # Two parallel XPath queries: titles from the link's ``title`` attribute,
    # release info from the text of the "releasetime" paragraph.
    titles = tree.xpath('//div[@class="movie-item-info"]/p[@class="name"]/a/@title')
    releases = tree.xpath('//div[@class="movie-item-info"]/p[@class="releasetime"]/text()')
    # zip pairs the two lists position by position and stops at the shorter one.
    for pair in zip(titles, releases):
        print(*pair)
# Fetch the page source for offset 10, then hand it to parse().
text = respon(10)
parse(text)