python爬虫练习爬取信息

131 阅读2分钟

爬起点小说中的一章 

# -*- coding: utf-8 -*-
import requests
import re

#下载一个网页
url = 'https://read.qidian.com/chapter/pHCOMN5YAqETFqQ-idajwA2/NhXKjTTceCNOBDFlr9quQA2'
#模拟浏览器发送HTTP请求
response = requests.get(url)
#修改编码方式
#response.encoding = "utf-8"
#response.encoding = "gbk"
#目标小说主页网页源码
html = response.text
main = re.findall(r'<div class="read-content j_readContent">(.*?)</div>',html,re.S)[0]
main = main.replace("<p>","")
print(main)

关于如何爬小说可以参考 python爬虫爬网络小说

接下来的内容学自《自从学会了Python,我从来不为看什么电影发愁,好看的,付费的,百度不到资源的全部一网打尽!》

关于反爬虫

最简单反爬虫

请求头:User-Agent

用来确保是浏览器来访问的,而不是使用代码访问网站

我们可以使用代码伪装成浏览器来访问网站(反反爬)

 

爬猫眼电影榜单信息

 

import requests
import re

# Page to download: one board of the Maoyan movie charts.
TARGET_URL = 'https://maoyan.com/board/7'

# Header that identifies this request as coming from a regular desktop
# browser, so the server does not reject the scripted access
# (the simplest anti-anti-scraping measure).
HEADERS = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'}

# Issue the HTTP GET while impersonating a browser, then dump the raw HTML.
page = requests.get(TARGET_URL, headers=HEADERS)
print(page.text)

可以将发送请求的部分定义为 respon() 函数

import requests
import re


def respon(n):
    """Fetch page offset *n* of the Maoyan board and print the raw HTML."""
    # Browser-like User-Agent so the server treats this as a real browser
    # rather than a script.
    browser_headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'
    }
    # The pagination offset is interpolated straight into the board URL.
    page_url = f'https://maoyan.com/board/6?offset={n}'
    resp = requests.get(page_url, headers=browser_headers)
    print(resp.text)


respon(5)

安装lxml

 

import requests
import re
import lxml
from lxml import etree


def respon(n):
    """Download one page of the Maoyan board (offset *n*); return its HTML."""
    # str.format is the equivalent of the f-string used elsewhere in
    # this article; both produce the identical URL.
    board_url = 'https://maoyan.com/board/6?offset={}'.format(n)
    # Pretend to be a desktop browser so the request is not blocked.
    browser_ua = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'}
    reply = requests.get(board_url, headers=browser_ua)
    return reply.text


def parse(text):
    """Extract the movie titles from the board HTML and print them."""
    # Normalize the raw markup into an lxml element tree.
    doc = etree.HTML(text)
    # XPath: title attribute of each movie's name link.
    titles = doc.xpath('//div[@class="movie-item-info"]/p[@class="name"]/a/@title')
    print(titles)


text = respon(5)
parse(text)

import requests
import re
import lxml
from lxml import etree


def respon(n):
    """Download one page of the Maoyan board.

    Args:
        n: pagination offset inserted into the board URL.

    Returns:
        The page source as text.
    """
    # String formatting: f-string (equivalently: '...offset={}'.format(n)).
    url = f'https://maoyan.com/board/6?offset={n}'
    # Browser-like request header so the server accepts the scripted access.
    header = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'}
    response = requests.get(url, headers=header)
    return response.text


def parse(text):
    """Extract and print (title, release time) pairs from the board HTML.

    Improvement over the original: the scraped pairs are also returned
    (the original printed and implicitly returned None, which callers
    ignored — so returning the data is backward compatible).

    Args:
        text: raw HTML of one Maoyan board page.

    Returns:
        List of (title, release_time) tuples, in page order.
    """
    # Normalize the raw markup into an lxml element tree.
    html = etree.HTML(text)
    names = html.xpath('//div[@class="movie-item-info"]/p[@class="name"]/a/@title')
    times = html.xpath('//div[@class="movie-item-info"]/p[@class="releasetime"]/text()')
    # zip pairs each title with its release time, stopping at the
    # shorter of the two lists.
    movies = list(zip(names, times))
    for name, time in movies:
        print(name, time)
    return movies


if __name__ == "__main__":
    # Guarded so importing this module does not trigger a network request;
    # running it as a script behaves exactly as before.
    text = respon(10)
    parse(text)