爬取豆瓣读书TOP250实践

620 阅读2分钟

最近在练习爬取豆瓣读书,其中出了很多小插曲,最后终于写出一个,可是还存留两个问题:1. 网址能爬取出来,可是不能写入 Excel 中;2. 第 3 页有一本书没有评语,导致爬取出现错误 list index out of range。想用 if/else 排除或替换没有评语的 item,可是发现《中国少年儿童百科全书》的页面里根本就没有 <p class="quote"......

68A43168-4E10-4B32-9040-C90C110DE59F_4_5005_c.jpeg

FD551829-847B-4AD8-8377-EB49E148CE6D_4_5005_c.jpeg

也就意味着,按照图片中的做法,根本无法排除掉没有 inq 的内容

1B58762E-1498-4F35-BB17-549ED815672F_4_5005_c.jpeg

暂时记录一下,要是有看到的人,希望能给一下指点。 附上代码和爬取内容

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Sat Jun 26 23:36:51 2021

Scrape the Douban Books Top 250 list into an Excel workbook.
(The original paste left this docstring unterminated, which made the
whole rest of the file part of the string literal — closed here.)
"""

import random
import re
import time

import requests
from bs4 import BeautifulSoup
from openpyxl import workbook, load_workbook

# NOTE(review): this handle is opened but never written to or closed anywhere
# in this file; its only effect is creating/truncating the .txt file. Kept for
# behavior compatibility — consider removing it or using `with open(...)`.
f = open('豆瓣读书250.txt', 'w+')

def getBook(page):
    """Download one page (25 entries) of the Douban Books Top 250 list.

    page: 0-based page index; page 0 hits the bare URL, later pages pass
          ?start=25*page.
    Returns a BeautifulSoup tree parsed from the page HTML.
    Raises: whatever requests raised on failure (connection error, bad
            HTTP status). The original version printed a message, swallowed
            the exception, and then crashed on the unbound `r` below with a
            NameError that masked the real cause — now it re-raises.
    """
    if page == 0:
        url = 'https://book.douban.com/top250'
    else:
        url = 'https://book.douban.com/top250' + '?start=' + str(page * 25)
    kv = {'User-Agent':'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.1.3 Safari/605.1.15',
      'Cookie': '__utma=81379588.1410774223.1624331967.1624430146.1624499388.5; __utmb=81379588.2.10.1624499388; __utmc=81379588; __utmz=81379588.1624499388.5.5.utmcsr=baidu.com|utmccn=(referral)|utmcmd=referral|utmcct=/; __utma=30149280.1458731315.1624331967.1624430146.1624499388.5; __utmb=30149280.2.10.1624499388; __utmc=30149280; __utmz=30149280.1624499388.5.5.utmcsr=baidu.com|utmccn=(referral)|utmcmd=referral|utmcct=/; gr_cs1_84825faf-1548-4089-8031-acd6fdaa3ce1=user_id%3A0; gr_user_id=5f5fb227-eb54-47cd-80bd-fa7cbbaeb2b3; _pk_id.100001.3ac3=449fa3ee36cea64b.1624331967.5.1624499726.1624430146.; _pk_ses.100001.3ac3=*; ap_v=0,6.0; __utmt=1; _vwo_uuid_v2=DD4F4DF42FA305FDB3940B128E6DE508D|87adadc0f8fbc5da7ed45a64ca113bad; __utmt_douban=1; gr_session_id_22c937bbd8ebd703f2d8e9445f7dfd03_84825faf-1548-4089-8031-acd6fdaa3ce1=true; _pk_ref.100001.3ac3=%5B%22%22%2C%22%22%2C1624499389%2C%22https%3A%2F%2Fwww.baidu.com%22%5D; gr_session_id_22c937bbd8ebd703f2d8e9445f7dfd03=84825faf-1548-4089-8031-acd6fdaa3ce1; ct=y; viewed="1052990_1007914_4913064_35378776_4465858_1683129"; __gads=ID=668d257fc5284aeb-22ede7a3a7c9008c:T=1624331967:RT=1624331967:S=ALNI_MaPwpYsc5fdhZ0jN4lIkO-CgZWF0w; ll="108288"; bid=A3beH6OH7gQ',
      }
    try:
        # NOTE(review): verify=False disables TLS certificate checking —
        # confirm this is intentional (it was in the original).
        r = requests.get(url, headers=kv, verify=False)
        r.raise_for_status()
        r.encoding = r.apparent_encoding
    except Exception:
        print('爬取错误')
        raise
    # Randomized pause between page fetches to stay polite to the server.
    time.sleep(random.randint(3, 5))
    return BeautifulSoup(r.text, "html.parser")
def getNames(bs):
    """Return the book titles on one Top-250 page.

    Reads the `title` attribute of the first <a> inside each
    <div class="pl2"> block.
    """
    return [entry.find('a').get('title')
            for entry in bs.find_all('div', class_="pl2")]
def getUrls(bs):
    """Return the book detail-page URLs on one Top-250 page.

    Bug fix: the original read `get('herf')` — a typo for 'href' — so
    Tag.get returned None for every book, which is why the URL column
    ended up empty in the spreadsheet.
    """
    urls = []
    for item in bs.find_all('div', class_="pl2"):
        urls.append(item.find('a').get('href'))
    return urls
def getBasicMessage(bs):
    """Return the author/publisher info line for each book.

    Collects the `.string` of every <p class="pl"> tag on the page.
    """
    return [info.string for info in bs.find_all('p', class_="pl")]
def getStars(bs):
    """Return the rating of each book as a float.

    Parses the text of every <span class="rating_nums"> tag.
    """
    return [float(tag.string) for tag in bs.find_all('span', class_="rating_nums")]
def getPeopleNumbers(bs):
    """Return the number of ratings for each book as an int.

    Pulls the first numeric run out of each <span class="pl"> tag's text
    (e.g. "( 286382人评价 )" -> 286382).
    """
    # Compile once instead of re-scanning the pattern cache per item.
    pattern = re.compile(r"\d+\.?\d*")
    counts = []
    for tag in bs.find_all('span', class_="pl"):
        counts.append(int(pattern.findall(tag.text)[0]))
    return counts
def getInq(bs):
    """Return the one-line quote for each book, '无评词' when absent.

    Bug fix: the original collected `find_all('span', class_="inq")`
    directly, so a book with no quote (e.g. 《中国少年儿童百科全书》 on
    page 3, which has no <p class="quote"> at all) simply produced a
    shorter list — the `else: '无评词'` branch was unreachable and the
    misalignment caused the reported "list index out of range". Walking
    per-book rows keeps one slot per book.

    NOTE(review): assumes each book sits in a <tr class="item"> row, as on
    the current Top-250 page layout — verify against the live HTML.
    """
    Inqs = []
    for row in bs.find_all('tr', class_="item"):
        quote = row.find('span', class_="inq")
        Inqs.append(quote.text if quote is not None else '无评词')
    return Inqs

if __name__ == '__main__':
    wb = workbook.Workbook()
    ws = wb.active
    ws.append(['书名', '网址', '基本信息', '评分', '评论数'])
    for n in range(0, 10):
        print("爬取第%d页的数据" % (n + 1))
        bs = getBook(n)
        # Extract each column ONCE per page. The original called every
        # getter inside `for i in range(25)`, re-scanning the soup 125
        # times per page, and its hard-coded index raised IndexError
        # whenever any column came up short.
        rows = zip(getNames(bs), getUrls(bs), getBasicMessage(bs),
                   getStars(bs), getPeopleNumbers(bs))
        for row in rows:
            ws.append(list(row))
    wb.save("bookTop250.xlsx")
    print("爬取完毕")
    

6425F21E-FBEE-4CCD-88E0-BFE495211014.jpeg