Comparing openpyxl and xlwt for writing scraped web content to Excel


Writing the scraped content to Excel with xlwt

import re
import requests
import time
import random
import xlwt

from bs4 import BeautifulSoup



def getBook(page):
    if page==0:  
        url = 'https://book.douban.com/top250'
    else:
        url='https://book.douban.com/top250'+'?start='+str(page*25)
    try:
        kv = {'User-Agent':'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.1.3 Safari/605.1.15',
      'Cookie': '__utma=81379588.1410774223.1624331967.1624430146.1624499388.5; __utmb=81379588.2.10.1624499388; __utmc=81379588; __utmz=81379588.1624499388.5.5.utmcsr=baidu.com|utmccn=(referral)|utmcmd=referral|utmcct=/; __utma=30149280.1458731315.1624331967.1624430146.1624499388.5; __utmb=30149280.2.10.1624499388; __utmc=30149280; __utmz=30149280.1624499388.5.5.utmcsr=baidu.com|utmccn=(referral)|utmcmd=referral|utmcct=/; gr_cs1_84825faf-1548-4089-8031-acd6fdaa3ce1=user_id%3A0; gr_user_id=5f5fb227-eb54-47cd-80bd-fa7cbbaeb2b3; _pk_id.100001.3ac3=449fa3ee36cea64b.1624331967.5.1624499726.1624430146.; _pk_ses.100001.3ac3=*; ap_v=0,6.0; __utmt=1; _vwo_uuid_v2=DD4F4DF42FA305FDB3940B128E6DE508D|87adadc0f8fbc5da7ed45a64ca113bad; __utmt_douban=1; gr_session_id_22c937bbd8ebd703f2d8e9445f7dfd03_84825faf-1548-4089-8031-acd6fdaa3ce1=true; _pk_ref.100001.3ac3=%5B%22%22%2C%22%22%2C1624499389%2C%22https%3A%2F%2Fwww.baidu.com%22%5D; gr_session_id_22c937bbd8ebd703f2d8e9445f7dfd03=84825faf-1548-4089-8031-acd6fdaa3ce1; ct=y; viewed="1052990_1007914_4913064_35378776_4465858_1683129"; __gads=ID=668d257fc5284aeb-22ede7a3a7c9008c:T=1624331967:RT=1624331967:S=ALNI_MaPwpYsc5fdhZ0jN4lIkO-CgZWF0w; ll="108288"; bid=A3beH6OH7gQ',
      }
        r = requests.get(url, headers=kv, verify=False)  # verify=False skips TLS certificate checks
        time.sleep(random.randint(3, 5))                 # polite random pause between requests
        r.raise_for_status()
        r.encoding = r.apparent_encoding
    except Exception as e:
        print('Scraping failed:', e)
        raise  # r is undefined past this point if the request failed
    html=r.text
    bs=BeautifulSoup(html,"html.parser")
    return bs
def getNames(bs):
    titles=[]
    lists=bs.find_all('div',class_="pl2")
    for item in lists:
        name=item.find('a').get('title')
        titles.append(name)
    return titles
def getUrls(bs):
    urls=[]
    lists=bs.find_all('div',class_="pl2")
    for item in lists:
        url=item.find('a').get('href')
        urls.append(url)
    return urls
def getBasicMessage(bs):
    lists=bs.find_all('p',class_="pl")
    messages=[]
    for item in lists:
        message=item.string
        messages.append(message)
    return messages
def getStars(bs):
    lists=bs.find_all('span',class_="rating_nums")
    stars=[]
    for item in lists:
        star=float(item.string)
        stars.append(star)
    return stars
def getPeopleNumbers(bs):
    peoples=[]
    lists=bs.find_all('span',class_="pl")
    for item in lists:
        people=item.text
        peoples.append(people)
    num=[]
    for i in peoples:
        # e.g. "(123456人评价)" -> take the first number in the string
        r = re.findall(r"\d+\.?\d*",i)
        num.append(int(r[0]))
    return num
def getInq(bs):
    Inqs=[]
    lists=bs.find_all('span',class_="inq")
    for item in lists:
        inq=item.text
        Inqs.append(inq)
    return Inqs

if __name__=='__main__':
    book = xlwt.Workbook(encoding='utf-8')
    sheet = book.add_sheet('Douban Books Top 250', cell_overwrite_ok=True)
    sheet.write(0, 0, 'Title')
    sheet.write(0, 1, 'URL')
    sheet.write(0, 2, 'Basic info')
    sheet.write(0, 3, 'Rating')
    sheet.write(0, 4, 'Number of ratings')
    #sheet.write(0, 5, 'Quote')
    for n in range(0, 10):
        print("Scraping page %d" % (n + 1))
        bs = getBook(n)
        # Parse each page once instead of re-running every helper per cell
        names, urls = getNames(bs), getUrls(bs)
        messages, stars, numbers = getBasicMessage(bs), getStars(bs), getPeopleNumbers(bs)
        for i in range(0, 25):
            row = n * 25 + i + 1
            sheet.write(row, 0, names[i])
            sheet.write(row, 1, urls[i])
            sheet.write(row, 2, messages[i])
            sheet.write(row, 3, stars[i])
            sheet.write(row, 4, numbers[i])
            #sheet.write(row, 5, getInq(bs)[i])
    book.save('doubanBookTop250.xls')  # xlwt writes the legacy .xls format
    print("Done scraping")
        

Writing to Excel with openpyxl

import re
import requests
import time
import random
from openpyxl import Workbook
from bs4 import BeautifulSoup




def getBook(page):
    if page==0:  
        url = 'https://book.douban.com/top250'
    else:
        url='https://book.douban.com/top250'+'?start='+str(page*25)
    try:
        kv = {'User-Agent':'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.1.3 Safari/605.1.15',
      'Cookie': '__utma=81379588.1410774223.1624331967.1624430146.1624499388.5; __utmb=81379588.2.10.1624499388; __utmc=81379588; __utmz=81379588.1624499388.5.5.utmcsr=baidu.com|utmccn=(referral)|utmcmd=referral|utmcct=/; __utma=30149280.1458731315.1624331967.1624430146.1624499388.5; __utmb=30149280.2.10.1624499388; __utmc=30149280; __utmz=30149280.1624499388.5.5.utmcsr=baidu.com|utmccn=(referral)|utmcmd=referral|utmcct=/; gr_cs1_84825faf-1548-4089-8031-acd6fdaa3ce1=user_id%3A0; gr_user_id=5f5fb227-eb54-47cd-80bd-fa7cbbaeb2b3; _pk_id.100001.3ac3=449fa3ee36cea64b.1624331967.5.1624499726.1624430146.; _pk_ses.100001.3ac3=*; ap_v=0,6.0; __utmt=1; _vwo_uuid_v2=DD4F4DF42FA305FDB3940B128E6DE508D|87adadc0f8fbc5da7ed45a64ca113bad; __utmt_douban=1; gr_session_id_22c937bbd8ebd703f2d8e9445f7dfd03_84825faf-1548-4089-8031-acd6fdaa3ce1=true; _pk_ref.100001.3ac3=%5B%22%22%2C%22%22%2C1624499389%2C%22https%3A%2F%2Fwww.baidu.com%22%5D; gr_session_id_22c937bbd8ebd703f2d8e9445f7dfd03=84825faf-1548-4089-8031-acd6fdaa3ce1; ct=y; viewed="1052990_1007914_4913064_35378776_4465858_1683129"; __gads=ID=668d257fc5284aeb-22ede7a3a7c9008c:T=1624331967:RT=1624331967:S=ALNI_MaPwpYsc5fdhZ0jN4lIkO-CgZWF0w; ll="108288"; bid=A3beH6OH7gQ',
      }
        r = requests.get(url, headers=kv, verify=False)  # verify=False skips TLS certificate checks
        time.sleep(random.randint(3, 5))                 # polite random pause between requests
        r.raise_for_status()
        r.encoding = r.apparent_encoding
    except Exception as e:
        print('Scraping failed:', e)
        raise  # r is undefined past this point if the request failed
    html=r.text
    bs=BeautifulSoup(html,"html.parser")
    return bs
def getNames(bs):
    titles=[]
    lists=bs.find_all('div',class_="pl2")
    for item in lists:
        name=item.find('a').get('title')
        titles.append(name)
    return titles
def getUrls(bs):
    urls=[]
    lists=bs.find_all('div',class_="pl2")
    for item in lists:
        url=item.find('a').get('href')
        urls.append(url)
    return urls
def getBasicMessage(bs):
    lists=bs.find_all('p',class_="pl")
    messages=[]
    for item in lists:
        message=item.string
        messages.append(message)
    return messages
def getStars(bs):
    lists=bs.find_all('span',class_="rating_nums")
    stars=[]
    for item in lists:
        star=float(item.string)
        stars.append(star)
    return stars
def getPeopleNumbers(bs):
    peoples=[]
    lists=bs.find_all('span',class_="pl")
    for item in lists:
        people=item.text
        peoples.append(people)
    num=[]
    for i in peoples:
        # e.g. "(123456人评价)" -> take the first number in the string
        r = re.findall(r"\d+\.?\d*",i)
        num.append(int(r[0]))
    return num
def getInq(bs):
    Inqs=[]
    lists=bs.find_all('span',class_="inq")
    for item in lists:
        if len(item.text)!=0:
            Inqs.append(item.text)
        else:
            Inqs.append('No quote')
    # Books without a quote have no 'inq' span at all, so this list can
    # come back shorter than 25 entries per page.
    return Inqs

if __name__=='__main__':
    wb = Workbook()
    ws = wb.active
    ws.append(['Title', 'URL', 'Basic info', 'Rating', 'Number of ratings'])
    for n in range(0, 10):
        print("Scraping page %d" % (n + 1))
        bs = getBook(n)
        # Parse each page once instead of re-running every helper per row
        names, urls = getNames(bs), getUrls(bs)
        messages, stars, numbers = getBasicMessage(bs), getStars(bs), getPeopleNumbers(bs)
        for i in range(25):
            ws.append([names[i], urls[i], messages[i], stars[i], numbers[i]])
    wb.save("bookTop250.xlsx")
    print("Done scraping")

For a more detailed analysis, see these related pages:

Fixing the "Attempt to overwrite cell: sheetname='Sheet1' rowx=1 colx=0" error raised when calling write to put data into Excel from Python

Getting started with openpyxl (openpyxl.readthedocs.io/en/stable/t…)
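
In short, the two libraries differ mainly in output format and write style: xlwt produces the legacy binary .xls format (Excel 97-2003, capped at 65,536 rows and 256 columns) and addresses every cell by explicit (row, column) indices, while openpyxl produces modern .xlsx files and can append a whole row at a time, which keeps the main loop noticeably shorter. Since xlwt is no longer actively developed, openpyxl is generally the safer choice for new code.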