# Beginner Python practice script (python 学习初次上手练习实例). Reference: https://juejin.cn/post/742714…
import json
import re
import time
import requests
from bs4 import BeautifulSoup
import pandas as pd
# Base URL of the target site; category hrefs are concatenated onto this.
api_url = 'https://kgbook.com/'
# Extra HTTP headers sent with every request (none needed so far).
headers = {}
# Excel column headers (Chinese): category, title, author, format, link.
columnNames = ['分类', '名字', '作者', '格式', '链接']
# 2-D result rows accumulated during the crawl; one
# [category, name, author, format, url] list per book.
dataArray = []
def isValidUrl(url):
    """Return True when *url* is an absolute http(s) URL, False otherwise."""
    pattern = r'^https?://(?:[-\w.]|(?:%[\da-fA-F]{2}))+'
    return re.match(pattern, url) is not None
def decodeResponseContent(content):
    """Parse raw HTTP response bytes into a BeautifulSoup tree.

    content: the ``response.content`` bytes of a fetched page.
    Returns a BeautifulSoup object, or '' (falsy) when *content* is
    empty — callers rely on that falsy sentinel.
    """
    if not content:
        return ''
    # errors='replace' keeps one malformed / non-UTF-8 page from aborting
    # the whole crawl with UnicodeDecodeError (original used strict decode).
    htmlstr = content.decode('utf-8', errors='replace')
    return BeautifulSoup(htmlstr, 'html.parser')
# 保存成excel
# 保存成excel
def saveBookToExcel():
    """Write every accumulated row in dataArray to ./books.xlsx."""
    df = pd.DataFrame(dataArray, columns=columnNames)
    print('result', df)
    # index=False: the auto-generated row-number column is noise in the sheet
    df.to_excel('./books.xlsx', index=False)
# 获取目录
# 获取目录
def getcategory():
    """Fetch the site home page, crawl every category link, then save results.

    Entry point of the crawl: reads the #category <ul> from the home page and
    dispatches one getcategroyList() call per category <li>.
    """
    req_result = requests.request("Get", api_url, headers=headers, data={}, verify=False)
    if req_result.status_code == 200:
        soup = decodeResponseContent(req_result.content)
        categorys = soup.find_all(attrs={'id': 'category'})[0].ul
        for li in categorys.find_all(name='li'):
            print('开始抓取' + li.a.attrs['href'] + "--" + li.string)
            getcategroyList(api_url + li.a.attrs['href'], li.string)
            time.sleep(1)  # be polite to the server between category pages
    # BUG FIX: save unconditionally after crawling. The original called
    # saveBookToExcel() only in the else branch, i.e. only when the very
    # first request FAILED — a successful crawl was never written to disk.
    saveBookToExcel()
# 获取列表
# 获取列表
def getcategroyList(url, cate_name):
    """Fetch one category listing page and scrape each book's detail page.

    url: absolute URL of the category listing page.
    cate_name: category label recorded with every book row.
    """
    req_result = requests.request("Get", url, headers=headers, data={}, verify=False)
    if req_result.status_code != 200:
        return
    soup = decodeResponseContent(req_result.content)
    for book_div in soup.select('.channel-item'):
        links = book_div.select('.list-title > a')
        if not links:
            continue  # malformed item without a title link (original raised IndexError)
        bookurl = links[0].attrs['href']
        # Relative hrefs are skipped: isValidUrl only accepts absolute http(s) URLs.
        if isValidUrl(bookurl):
            getBoolDetail(bookurl, cate_name)
def getBoolDetail(url, cate_name):
    """Scrape one book detail page and append a row to dataArray.

    Row shape matches columnNames: [category, name, author, format, link].
    Author/format stay None when the page omits them.
    """
    req_result = requests.request("Get", url, headers=headers, data={}, verify=False)
    soup = decodeResponseContent(req_result.content)
    # Defaults keep the row well-formed when a selector finds nothing; the
    # original left bookname as a list and bookauthor/booktype unbound,
    # crashing below with TypeError / NameError on such pages.
    bookname = ''
    bookauthor = None
    booktype = None
    title_nodes = soup.select('.news_title')
    if title_nodes:
        bookname = title_nodes[0].text.strip()
    info_nodes = soup.select('#news_details')
    if info_nodes:
        infolihtml = info_nodes[0].ul
        bookauthor = infolihtml.li.find(string=re.compile('作者:(.*?)'))
        if bookauthor is not None:
            bookauthor = bookauthor.strip().replace('作者:', '')
        booktype = infolihtml.find(string=re.compile('格式:(.*?)'))
        if booktype is not None:
            booktype = booktype.strip().replace('格式:', '')
    # str() guards the concatenation: author may legitimately be absent (None).
    print('名+作者', bookname + '-' + str(bookauthor))
    dataArray.append([cate_name, bookname, bookauthor, booktype, url])
if __name__ == '__main__':
    # Guard the entry call so importing this module does not start the crawl.
    getcategory()