Writing the scraped content to Excel with xlwt
The script below crawls the ten pages of Douban's Book Top 250 and writes each book's title, URL, basic info, rating, and rating count into an .xls sheet.
import re
import requests
import time
import random
import xlwt
from bs4 import BeautifulSoup

def getBook(page):
    # page 0 is the list's first page; later pages are addressed via ?start=25, 50, ...
    if page == 0:
        url = 'https://book.douban.com/top250'
    else:
        url = 'https://book.douban.com/top250' + '?start=' + str(page * 25)
    try:
        # the Cookie is a captured session value tied to one browser login; substitute your own
        kv = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.1.3 Safari/605.1.15',
              'Cookie': '__utma=81379588.1410774223.1624331967.1624430146.1624499388.5; __utmb=81379588.2.10.1624499388; __utmc=81379588; __utmz=81379588.1624499388.5.5.utmcsr=baidu.com|utmccn=(referral)|utmcmd=referral|utmcct=/; __utma=30149280.1458731315.1624331967.1624430146.1624499388.5; __utmb=30149280.2.10.1624499388; __utmc=30149280; __utmz=30149280.1624499388.5.5.utmcsr=baidu.com|utmccn=(referral)|utmcmd=referral|utmcct=/; gr_cs1_84825faf-1548-4089-8031-acd6fdaa3ce1=user_id%3A0; gr_user_id=5f5fb227-eb54-47cd-80bd-fa7cbbaeb2b3; _pk_id.100001.3ac3=449fa3ee36cea64b.1624331967.5.1624499726.1624430146.; _pk_ses.100001.3ac3=*; ap_v=0,6.0; __utmt=1; _vwo_uuid_v2=DD4F4DF42FA305FDB3940B128E6DE508D|87adadc0f8fbc5da7ed45a64ca113bad; __utmt_douban=1; gr_session_id_22c937bbd8ebd703f2d8e9445f7dfd03_84825faf-1548-4089-8031-acd6fdaa3ce1=true; _pk_ref.100001.3ac3=%5B%22%22%2C%22%22%2C1624499389%2C%22https%3A%2F%2Fwww.baidu.com%22%5D; gr_session_id_22c937bbd8ebd703f2d8e9445f7dfd03=84825faf-1548-4089-8031-acd6fdaa3ce1; ct=y; viewed="1052990_1007914_4913064_35378776_4465858_1683129"; __gads=ID=668d257fc5284aeb-22ede7a3a7c9008c:T=1624331967:RT=1624331967:S=ALNI_MaPwpYsc5fdhZ0jN4lIkO-CgZWF0w; ll="108288"; bid=A3beH6OH7gQ',
              }
        r = requests.get(url, headers=kv, verify=False)  # verify=False skips TLS verification
        time.sleep(random.randint(3, 5))  # random pause between requests to stay polite
        r.raise_for_status()
        r.encoding = r.apparent_encoding
    except Exception:
        print('爬取错误')
        return None  # bail out so the code below never touches an undefined r
    html = r.text
    bs = BeautifulSoup(html, "html.parser")
    return bs
def getNames(bs):
    titles = []
    lists = bs.find_all('div', class_="pl2")
    for item in lists:
        name = item.find('a').get('title')  # the full title lives in the link's title attribute
        titles.append(name)
    return titles

def getUrls(bs):
    urls = []
    lists = bs.find_all('div', class_="pl2")
    for item in lists:
        url = item.find('a').get('href')  # detail-page URL of the book
        urls.append(url)
    return urls
def getBasicMessage(bs):
    # the author / publisher / date / price string under each title
    lists = bs.find_all('p', class_="pl")
    messages = []
    for item in lists:
        message = item.string
        messages.append(message)
    return messages

def getStars(bs):
    lists = bs.find_all('span', class_="rating_nums")
    stars = []
    for item in lists:
        star = float(item.string)
        stars.append(star)
    return stars

def getPeopleNumbers(bs):
    peoples = []
    lists = bs.find_all('span', class_="pl")
    for item in lists:
        people = item.text
        peoples.append(people)
    num = []
    for i in peoples:
        r = re.findall(r"\d+\.?\d*", i)  # pull the number out of text like "(363925人评价)"
        num.append(int(r[0]))
    return num

def getInq(bs):
    Inqs = []
    lists = bs.find_all('span', class_="inq")
    for item in lists:
        inq = item.text
        Inqs.append(inq)
    return Inqs
if __name__ == '__main__':
    book = xlwt.Workbook(encoding='utf-8')
    # cell_overwrite_ok=True avoids the "Attempt to overwrite cell" error discussed at the end
    sheet = book.add_sheet('豆瓣图书TOP250', cell_overwrite_ok=True)
    sheet.write(0, 0, '书名')
    sheet.write(0, 1, '网址')
    sheet.write(0, 2, '基本信息')
    sheet.write(0, 3, '评分')
    sheet.write(0, 4, '评价人数')
    # sheet.write(0, 5, '推荐语')
    for n in range(0, 10):
        print("爬取第%d页的数据" % (n + 1))
        bs = getBook(n)
        for i in range(0, 25):
            sheet.write(n * 25 + i + 1, 0, getNames(bs)[i])
            sheet.write(n * 25 + i + 1, 1, getUrls(bs)[i])
            sheet.write(n * 25 + i + 1, 2, getBasicMessage(bs)[i])
            sheet.write(n * 25 + i + 1, 3, getStars(bs)[i])
            sheet.write(n * 25 + i + 1, 4, getPeopleNumbers(bs)[i])
            # sheet.write(i + 1, 5, getInq(bs)[i])
    book.save(u'豆瓣最受欢迎250本书.xls')  # xlwt writes the legacy .xls format
    print("爬取完毕")
Writing to Excel with openpyxl
The openpyxl variant below does the same crawl but saves to a modern .xlsx file, bookTop250.xlsx.
import re
import requests
import time
import random
from openpyxl import Workbook, load_workbook
from bs4 import BeautifulSoup

def getBook(page):
    # page 0 is the list's first page; later pages are addressed via ?start=25, 50, ...
    if page == 0:
        url = 'https://book.douban.com/top250'
    else:
        url = 'https://book.douban.com/top250' + '?start=' + str(page * 25)
    try:
        # the Cookie is a captured session value tied to one browser login; substitute your own
        kv = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.1.3 Safari/605.1.15',
              'Cookie': '__utma=81379588.1410774223.1624331967.1624430146.1624499388.5; __utmb=81379588.2.10.1624499388; __utmc=81379588; __utmz=81379588.1624499388.5.5.utmcsr=baidu.com|utmccn=(referral)|utmcmd=referral|utmcct=/; __utma=30149280.1458731315.1624331967.1624430146.1624499388.5; __utmb=30149280.2.10.1624499388; __utmc=30149280; __utmz=30149280.1624499388.5.5.utmcsr=baidu.com|utmccn=(referral)|utmcmd=referral|utmcct=/; gr_cs1_84825faf-1548-4089-8031-acd6fdaa3ce1=user_id%3A0; gr_user_id=5f5fb227-eb54-47cd-80bd-fa7cbbaeb2b3; _pk_id.100001.3ac3=449fa3ee36cea64b.1624331967.5.1624499726.1624430146.; _pk_ses.100001.3ac3=*; ap_v=0,6.0; __utmt=1; _vwo_uuid_v2=DD4F4DF42FA305FDB3940B128E6DE508D|87adadc0f8fbc5da7ed45a64ca113bad; __utmt_douban=1; gr_session_id_22c937bbd8ebd703f2d8e9445f7dfd03_84825faf-1548-4089-8031-acd6fdaa3ce1=true; _pk_ref.100001.3ac3=%5B%22%22%2C%22%22%2C1624499389%2C%22https%3A%2F%2Fwww.baidu.com%22%5D; gr_session_id_22c937bbd8ebd703f2d8e9445f7dfd03=84825faf-1548-4089-8031-acd6fdaa3ce1; ct=y; viewed="1052990_1007914_4913064_35378776_4465858_1683129"; __gads=ID=668d257fc5284aeb-22ede7a3a7c9008c:T=1624331967:RT=1624331967:S=ALNI_MaPwpYsc5fdhZ0jN4lIkO-CgZWF0w; ll="108288"; bid=A3beH6OH7gQ',
              }
        r = requests.get(url, headers=kv, verify=False)  # verify=False skips TLS verification
        time.sleep(random.randint(3, 5))  # random pause between requests to stay polite
        r.raise_for_status()
        r.encoding = r.apparent_encoding
    except Exception:
        print('爬取错误')
        return None  # bail out so the code below never touches an undefined r
    html = r.text
    bs = BeautifulSoup(html, "html.parser")
    return bs
def getNames(bs):
    titles = []
    lists = bs.find_all('div', class_="pl2")
    for item in lists:
        name = item.find('a').get('title')  # the full title lives in the link's title attribute
        titles.append(name)
    return titles

def getUrls(bs):
    urls = []
    lists = bs.find_all('div', class_="pl2")
    for item in lists:
        url = item.find('a').get('href')  # detail-page URL of the book
        urls.append(url)
    return urls
def getBasicMessage(bs):
    lists = bs.find_all('p', class_="pl")
    messages = []
    for item in lists:
        message = item.string
        messages.append(message)
    return messages

def getStars(bs):
    lists = bs.find_all('span', class_="rating_nums")
    stars = []
    for item in lists:
        star = float(item.string)
        stars.append(star)
    return stars

def getPeopleNumbers(bs):
    peoples = []
    lists = bs.find_all('span', class_="pl")
    for item in lists:
        people = item.text
        peoples.append(people)
    num = []
    for i in peoples:
        r = re.findall(r"\d+\.?\d*", i)
        num.append(int(r[0]))
    return num
def getInq(bs):
    Inqs = []
    # books without a recommendation quote render no span.inq at all,
    # so this list can come back shorter than 25 entries per page
    lists = bs.find_all('span', class_="inq")
    for item in lists:
        if len(item.text) != 0:
            Inqs.append(item.text)
        else:
            Inqs.append('无评词')
    return Inqs
if __name__ == '__main__':
    wb = Workbook()
    ws = wb.active
    ws.append(['书名', '网址', '基本信息', '评分', '评论数'])  # header row
    for n in range(0, 10):
        print("爬取第%d页的数据" % (n + 1))
        bs = getBook(n)
        for i in range(25):
            # append adds one row per call, so no row arithmetic is needed here
            ws.append([getNames(bs)[i], getUrls(bs)[i], getBasicMessage(bs)[i],
                       getStars(bs)[i], getPeopleNumbers(bs)[i]])
    wb.save("bookTop250.xlsx")
    print("爬取完毕")
For a more detailed breakdown, refer to the related write-ups, in particular on the error xlwt raises when write is called twice on the same cell: Attempt to overwrite cell: sheetname='Sheet1' rowx=1 colx=0.
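A quick way to see that error and its fix: add_sheet creates a sheet with cell_overwrite_ok=False by default, so a second write to an already-written cell raises the exception above; passing cell_overwrite_ok=True, as the xlwt script does, allows the overwrite. A minimal sketch (the file name overwrite_demo.xls is just an illustration):

    import xlwt

    book = xlwt.Workbook(encoding='utf-8')
    # with the default cell_overwrite_ok=False, the second write below would raise
    # "Attempt to overwrite cell: sheetname='Sheet1' rowx=1 colx=0"
    sheet = book.add_sheet('Sheet1', cell_overwrite_ok=True)
    sheet.write(1, 0, 'first')
    sheet.write(1, 0, 'second')  # allowed because cell_overwrite_ok=True
    book.save('overwrite_demo.xls')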