【python】爬取豆瓣读书

114 阅读4分钟

使用python爬取豆瓣读书相关数据

4.15 成功获取了基础信息,以及评分信息。

4.16 添加了应对反爬的机制,并开始尝试获取读书简介以及作者简介

4.21 能够成功爬取基础信息以及作者信息,并对作者信息搜索不到的情况进行了处理,能够爬取大部分的数据;但仍有部分数据在导入数据库时会报错,此时我选择跳过该tag下的剩余数据——直接修改parse_html_tag函数中的tag数组即可实现跳过。

4.22 能够爬取大部分书籍数据,偶尔有部分书籍数据获取不到导致列表问题报错,可以修改for循环当中的if语句进行获取后面tag的书籍数据

import requests         #进行请求
from lxml import etree  #解析数据
import re               #数据处理
import time             #注意网站的反爬机制
import pymysql
import sys
import random
from selenium import webdriver
from selenium.webdriver.support.wait import WebDriverWait   # 等待对象
from selenium.webdriver.support import expected_conditions as EC   # 条件
from selenium.webdriver.common.by import By # 定义定位器的常量


# Pool of desktop-browser User-Agent strings. get_html() picks one at random
# for every request so consecutive requests do not present an identical
# client fingerprint (part of the anti-anti-scraping measures).
agent_list = [
   "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/532.5 (KHTML, like Gecko) Chrome/4.0.249.0 Safari/532.5",
   "Mozilla/5.0 (Windows; U; Windows NT 5.2; en-US) AppleWebKit/532.9 (KHTML, like Gecko) Chrome/5.0.310.0 Safari/532.9",
   "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/534.7 (KHTML, like Gecko) Chrome/7.0.514.0 Safari/534.7",
   "Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/534.14 (KHTML, like Gecko) Chrome/9.0.601.0 Safari/534.14",
   "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/534.14 (KHTML, like Gecko) Chrome/10.0.601.0 Safari/534.14",
   "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/534.20 (KHTML, like Gecko) Chrome/11.0.672.2 Safari/534.20",
   "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/534.27 (KHTML, like Gecko) Chrome/12.0.712.0 Safari/534.27",
   "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/13.0.782.24 Safari/535.1",
   "Mozilla/5.0 (Windows NT 6.0) AppleWebKit/535.2 (KHTML, like Gecko) Chrome/15.0.874.120 Safari/535.2",
   "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.7 (KHTML, like Gecko) Chrome/16.0.912.36 Safari/535.7",
   "Mozilla/5.0 (Windows; U; Windows NT 6.0 x64; en-US; rv:1.9pre) Gecko/2008072421 Minefield/3.0.2pre",
   "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.9.0.10) Gecko/2009042316 Firefox/3.0.10"
   ]
# ---- request helpers: send HTTP requests and fetch page HTML ----

def get_content_by_selenium(url):
    """Fetch a page with a real browser (Selenium) and return its HTML source.

    Used for Douban's search-result pages, whose content is rendered by
    JavaScript and therefore cannot be read from a plain requests response.

    Bug fix: the WebDriver was never closed, leaking one Chrome process per
    call; the driver is now always shut down via try/finally.
    """
    driver = webdriver.Chrome()
    try:
        wait = WebDriverWait(driver, 20)
        driver.get(url=url)
        # Small random pause to look less like a bot before polling the DOM.
        time.sleep(random.uniform(1, 2))
        # Block until the search-result container has been rendered.
        wait.until(EC.presence_of_all_elements_located(
            (By.XPATH, '//*[@id="root"]/div/div[2]/div[1]/div[1]/div')))
        return driver.page_source
    finally:
        driver.quit()

def get_html(url, timeout=15):
    """GET *url* pretending to be a normal browser and return the HTML text.

    A random User-Agent from ``agent_list`` plus a Douban referer header is
    sent to reduce the chance of triggering anti-scraping blocks.

    Bug fix: ``requests.get`` without a timeout can hang forever on a stalled
    connection; a default 15-second timeout is now applied. Callers may
    override it via the new optional *timeout* parameter (backward
    compatible — existing call sites are unchanged).
    """
    headers = {
        'User-Agent': random.choice(agent_list),
        'referer': 'https://book.douban.com/',
    }
    response = requests.get(url=url, headers=headers, timeout=timeout)
    return response.text

#获取图书标签
def parse_html_tag(db_html, limit=20):
    """Extract book-tag names from the tag-cloud page.

    Parameters
    ----------
    db_html : str
        Raw HTML of https://book.douban.com/tag/?view=cloud.
    limit : int
        Maximum number of tags to return. Generalized from the previous
        hard-coded 20; the default preserves the old behavior.

    Returns
    -------
    list[str]
        At most *limit* tag names in page order.
    """
    tree = etree.HTML(db_html)
    tags = tree.xpath('//div[@class="indent tag_cloud"]//td/a/text()')
    return list(tags)[:limit]


# 定义函数:解析数据(提取需要的数据)
def parse_html_book(db_html):
    """Parse one tag-listing page (up to 20 books) into parallel field lists.

    Returns, in order: basic_info (author/translator/publisher/date/price
    fragments per book), rating_stars, book_id, rating_nums, book_img,
    book_name, rating_stars_class — index i of every list refers to the
    same book.
    """

    def _strip_list_chars(items):
        # The original code stringified each (possibly list-valued) item and
        # then peeled off quote/bracket characters; kept as one helper so the
        # same cleanup is no longer copy-pasted four times.
        return [str(info).replace("'", "").replace("[", "").replace("]", "")
                for info in items]

    tree = etree.HTML(db_html)

    # "author / translator / publisher / date / price" line under each title.
    basic_info = tree.xpath('//ul[@class="subject-list"]/li/div[@class="info"]/div[@class="pub"]/text()')
    basic_info = [info.replace("\n", "").strip().split("/") for info in basic_info]

    # Star-rating CSS class, e.g. "allstar45". Unrated books carry the class
    # "pl"; those are normalized to the sentinel '00' before digit extraction.
    rating_stars_class = tree.xpath('//ul[@class="subject-list"]/li/div[@class="info"]/div[@class="star clearfix"]/span[1]/@class')
    rating_stars_class = ['00' if cls == 'pl' else cls for cls in rating_stars_class]
    rating_stars_class = _strip_list_chars(
        re.findall(r"\d+", star) for star in rating_stars_class)

    # Numeric score text, e.g. "8.9" (absent for unrated books — the caller
    # re-aligns this list using rating_stars_class).
    rating_stars = tree.xpath('//ul[@class="subject-list"]/li/div[@class="info"]/div[@class="star clearfix"]/span[@class="rating_nums"]/text()')

    # Rating-count text, e.g. "(12345人评价)".
    rating_nums = tree.xpath('//ul[@class="subject-list"]/li/div[@class="info"]/div[@class="star clearfix"]/span[@class="pl"]/text()')
    rating_nums = _strip_list_chars(
        info.replace("\n", "").strip().split("/") for info in rating_nums)

    # Book titles.
    book_name = tree.xpath('//ul[@class="subject-list"]//a/@title')

    # Cover image URL. Only quote characters were stripped here originally,
    # so the full bracket cleanup is deliberately NOT applied.
    book_img = tree.xpath('//*[@id="subject_list"]/ul//div[1]/a/img/@src')
    book_img = [str(info).replace("'", "") for info in book_img]

    # Numeric book id extracted from each subject URL.
    id_string = tree.xpath('//*[@id="subject_list"]/ul//div[1]/a/@href')
    book_id = _strip_list_chars(re.findall(r"\d+", id) for id in id_string)

    return basic_info, rating_stars, book_id, rating_nums, book_img, book_name, rating_stars_class


#豆瓣未录入作者,随机生成n位数id
def parse_html_no_author(id_len):
    """Generate a random numeric id of *id_len* digits, as a string.

    Used when Douban has no page for a book's author: a surrogate author id
    is fabricated so the author row can still be inserted. Real author ids
    are 7 digits and book ids 8, so callers pass 9 to avoid collisions.
    (Not security-sensitive, so plain ``random`` is fine.)

    Idiom fix: builds the string with ``random.choices`` + ``join`` instead
    of repeated string concatenation in a loop.
    """
    return ''.join(random.choices('0123456789', k=id_len))

#作者id获取不到,a标签链接指向一个搜索页面的情况下,且能够搜索到作者时调用此函数
def parse_html_search_author(db_html):
    """Pull an author id out of a Douban search-result page.

    Called when a book page links its author to a search page rather than an
    author page, and the search did find the author. If the digits pulled
    from the first result's href are 8+ characters long, the match is not a
    real author id, so a 9-digit random surrogate is returned instead
    (7-digit ids are authors, 8-digit ids are books; 9 digits avoids both).
    """
    tree = etree.HTML(db_html)
    href = tree.xpath('//*[@id="root"]/div/div[2]/div[1]/div[1]/div[1]/div/div/div[1]/a/@href')
    raw = [re.findall(r"\d+", str(href))]
    author_id = str(raw)
    for junk in ("'", "[", "]"):
        author_id = author_id.replace(junk, "")
    if len(author_id) >= 8:
        author_id = parse_html_no_author(9)
    return author_id

def parse_html_author(db_html):
    """Parse a book's subject page for its author id and both descriptions.

    Returns ``(author_id, author_description, book_description)`` as strings.
    NOTE(review): the two branches below key off the *length* of the
    stringified regex result, not its content — intricate but deliberate:
    a long match (> 20 chars) means the author link points at a search page;
    a very short one (< 5 chars) means the id sits on a bare <a> under #info.
    """
    db_html = etree.HTML(db_html)
    # Extract the author id digits from the first author link under #info.
    id_string = db_html.xpath('//*[@id="info"]/span[1]/a/@href')
    author_id = [re.findall(r"\d+", str(id_string))]
    i = len(str(author_id))
    if i > 20:
        # Link leads to a search page: look the author up via Selenium
        # (search results are JS-rendered) and parse the result page.
        search_name = db_html.xpath('//*[@id="info"]/span[1]/a/text()')
        search_name = [info.replace("'", "") for info in search_name]
        author_url_1 = 'https://book.douban.com/subject_search?search_text={}'.format(search_name)

        author_url_1 = get_content_by_selenium(author_url_1)
        author_id = parse_html_search_author(author_url_1)
    if i < 5:
        # No digits found in the span layout: retry on the direct <a> child.
        id_string = db_html.xpath('//*[@id="info"]/a/@href')
        author_id = [re.findall(r"\d+", str(id_string[0]))]
    # Flatten the (possibly nested-list) result into a plain digit string.
    author_id = str(author_id).replace("[", "")
    author_id = str(author_id).replace("]", "")
    author_id = str(author_id).replace("'", "")
    # Collect the book description paragraphs and join them into one string.
    book_dec = db_html.xpath('//*[@id="link-report"]//div/div/p/text()')
    str_desc = ''
    for desc in book_dec:
        str_desc = str_desc + desc
    book_str_desc = str(str_desc).replace("'","")

    # When the intro is short the "all hidden" span wrapper is absent, so the
    # author bio is queried directly under the div instead of via the span.
    author_dec = db_html.xpath('//*[@id="content"]/div/div[1]/div[3]/div[@class="indent "]//div/p/text()')
    author_str_dec = str(author_dec).replace("'","")
    return author_id,author_str_dec,book_str_desc

#作者图片
def parse_html_aimg(db_html):
    """Extract the author portrait URL(s) from an author page.

    Returns a (possibly empty) list of cleaned image URLs; the caller falls
    back to a default portrait when the list is too short.
    """
    tree = etree.HTML(db_html)
    sources = tree.xpath('//*[@id="headline"]/div[1]/div/img/@src')
    cleaned = []
    for src in sources:
        text = str(src)
        for junk in ("'", "[", "]"):
            text = text.replace(junk, "")
        cleaned.append(text)
    return cleaned


# ---- one-off setup: fetch the tag cloud and seed the `sort` table ----
# Douban exposes no tag ids on the web page, so the tags are numbered 1..N
# locally below.
tag_url = "https://book.douban.com/tag/?view=cloud"
tag_html = get_html(tag_url)
book_tag = parse_html_tag(tag_html)

time.sleep(random.uniform(0.5, 1.5))

# Connect to the local MySQL database.
conn = pymysql.connect(host='127.0.0.1', user='root', password='你的数据库密码', port=3306, database='bookrec', charset='utf8')
cursor = conn.cursor()

# Insert the crawled tags into `sort`.
# Fix: the insert is now parameterized — the old %-formatting built the SQL
# string by hand, which breaks on tags containing quotes and is
# injection-prone.
sid = [1 + n for n in range(len(book_tag))]
for n, tag_name in enumerate(book_tag):
    cursor.execute("insert ignore into sort(sid,sname) values(%s,%s)", (sid[n], tag_name))
    conn.commit()
#获取多页数据

# ---- main crawl loop: for each selected tag, scrape 3 listing pages ----
for tag in book_tag:
    print(tag)
    sid_n = book_tag.index(tag) + 1
    if sid_n > 15:  # resume point: only crawl tags after the 15th
        for page in range(0, 60, 20):  # 3 listing pages of 20 books each
            print("正在获取", page, "页")
            basic_url = "https://book.douban.com/tag/{tag}?start={page}&type=T".format(tag=tag, page=page)
            basic_html = get_html(basic_url)
            basic_info, rating_stars, book_id, rating_nums, book_img, book_name, rating_stars_class = parse_html_book(basic_html)
            # Books without a rating carry the '00' sentinel; splice a '0.0'
            # into rating_stars at the same index so the lists stay aligned.
            # Fix: iterate the actual list length instead of a hard-coded
            # range(20), which raised IndexError on short pages.
            for i in range(len(rating_stars_class)):
                if rating_stars_class[i] == '00':
                    rating_stars[i:i] = ['0.0']

            time.sleep(random.uniform(1, 2.5))  # anti-scraping pause
            print("正在抓取图书信息")
            # Fix: enumerate instead of book_id.index(id) — index() returns
            # the FIRST occurrence and mis-aligned rows on duplicate ids.
            for n, id in enumerate(book_id):
                id = id.replace("'", "")
                book_url = "https://book.douban.com/subject/{}/".format(id)
                book_html = get_html(book_url)
                authorID, author_dec, book_dec = parse_html_author(book_html)
                if authorID == id:
                    # The "author" link pointed back at the book itself:
                    # fabricate a 9-digit surrogate id.
                    authorID = parse_html_no_author(9)
                author_dec = str(author_dec)
                book_dec = str(book_dec)
                if len(author_dec) < 10:
                    author_dec = '该作家暂无介绍'
                if len(book_dec) < 10:
                    # Fix: this placeholder was previously assigned to
                    # author_dec, leaving the empty book description in place.
                    book_dec = '该书籍暂无介绍'
                time.sleep(random.uniform(0.5, 1.5))
                if len(authorID) <= 7:
                    # Real author ids are at most 7 digits: fetch the portrait.
                    author_url = "https://book.douban.com/author/{}/".format(authorID)
                    author_html = get_html(author_url)
                    author_img = parse_html_aimg(author_html)
                    if len(author_img) < 5:
                        author_img = 'https://img3.doubanio.com/f/book/0e27fcad0e64da9769f748b2070a295b56405077/pics/book/author-default-large.png'
                else:
                    # Fix: surrogate/overlong ids get the default portrait.
                    # Previously a length of exactly 8 matched neither branch,
                    # leaving author_img unset (NameError or stale value).
                    author_img = 'https://img3.doubanio.com/f/book/0e27fcad0e64da9769f748b2070a295b56405077/pics/book/author-default-large.png'
                print("正在抓取作者信息")
                # The pub line is "author / [translator /] publisher / date /
                # price", so publisher/date/price are addressed from the end.
                basic_info_len = len(basic_info[n])
                price_n = basic_info_len - 1
                publisher_n = basic_info_len - 3
                time_n = basic_info_len - 2
                for i in range(basic_info_len):
                    basic_info[n][i] = str(basic_info[n][i]).replace("'", "").strip()
                    basic_info[n][i] = str(basic_info[n][i]).replace("[", "")
                    basic_info[n][i] = str(basic_info[n][i]).replace("]", "")

                cursor.execute("insert ignore into author(aid,aname,adesc,aimg) values(%s,%s,%s,%s)",
                               (authorID, basic_info[n][0], author_dec, author_img))
                print("正在插入作者信息", n)
                cursor.execute("insert ignore into book(bid,bname,bimg,aid,bprice,bdesc,bpublisher,btime,sid,bstar,bnum) values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)",
                               (id, book_name[n], book_img[n], authorID, basic_info[n][price_n], book_dec, basic_info[n][publisher_n], basic_info[n][time_n], sid_n, rating_stars[n], rating_nums[n]))
                print("正在插入书籍信息", n)

            time.sleep(random.uniform(1, 1.5))
            conn.commit()

    time.sleep(random.uniform(1, 1.5))
time.sleep(random.uniform(1, 1.5))
cursor.close()
conn.close()