Python learning: a first hands-on exercise

A first hands-on practice example for learning Python. Reference: juejin.cn/post/742714…
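
The script below crawls the category menu on kgbook.com, walks each category's book list, opens every book's detail page, and finally writes the collected category / title / author / format / link rows to an Excel file. Besides the standard library it needs a few third-party packages (openpyxl is the engine pandas uses to write .xlsx files):

pip install requests beautifulsoup4 pandas openpyxl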

import re
import time

import requests
import urllib3
from bs4 import BeautifulSoup
import pandas as pd

# Every request below uses verify=False, which makes urllib3 emit an
# InsecureRequestWarning; silence it so the progress output stays readable.
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

api_url = 'https://kgbook.com/'
headers = {}
# Column headers for the Excel sheet
columnNames = ['Category', 'Title', 'Author', 'Format', 'Link']
# Scraped rows, one list per book
dataArray = []

def isValidUrl(url):
    # Accept only absolute http(s) URLs; anything else is skipped.
    regex = re.compile(r'^https?://(?:[-\w.]|(?:%[\da-fA-F]{2}))+')
    return regex.match(url) is not None
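
# Quick sanity check (hypothetical inputs):
#   isValidUrl('https://kgbook.com/e/1.html')  -> True   (absolute URL)
#   isValidUrl('/e/1.html')                    -> False  (relative path)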

def decodeResponseContent(content):
    # Decode the raw response bytes and parse them into a soup object.
    if content:
        htmlstr = content.decode('utf-8')
        return BeautifulSoup(htmlstr, 'html.parser')
    return None
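
# 'html.parser' is Python's built-in HTML parser, so the parsing step
# needs no extra dependency such as lxml.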

# Save the collected rows to an Excel file
def saveBookToExcel():
    df = pd.DataFrame(dataArray, columns=columnNames)
    print('result', df)
    # index=False keeps pandas' auto-generated row index out of the sheet
    df.to_excel('./books.xlsx', index=False)
# Fetch the category menu from the home page, then crawl each category
def getcategory():
    req_result = requests.get(api_url, headers=headers, verify=False)
    if req_result.status_code == 200:
        soup = decodeResponseContent(req_result.content)
        categories = soup.find_all(attrs={'id': 'category'})[0].ul
        for li in categories.find_all(name='li'):
            print('Crawling ' + li.a.attrs['href'] + ' -- ' + li.string)
            getCategoryList(api_url + li.a.attrs['href'], li.string)
            time.sleep(1)  # pause between categories to go easy on the server
        # The original used for/else here; since the loop has no break,
        # the else branch always ran, so a plain call is equivalent.
        saveBookToExcel()
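
# Note: on a non-200 response nothing is scraped and no Excel file is
# written; there is no retry or error reporting in this exercise.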

# Fetch the book list for one category page
def getCategoryList(url, cate_name):
    req_result = requests.get(url, headers=headers, verify=False)
    if req_result.status_code == 200:
        soup = decodeResponseContent(req_result.content)
        bookdivList = soup.select('.channel-item')
        for book_div in bookdivList:
            bookurl = book_div.select('.list-title > a')[0].attrs['href']
            print('book url:', bookurl, isValidUrl(bookurl))
            if isValidUrl(bookurl):
                getBookDetail(bookurl, cate_name)
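
# Note: only the first page of each category is fetched; pagination
# links, if the site has them, are not followed here.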


# Fetch one book's detail page and record its metadata
def getBookDetail(url, cate_name):
    req_result = requests.get(url, headers=headers, verify=False)
    soup = decodeResponseContent(req_result.content)
    # Book title; fall back to '' so later string concatenation cannot crash
    bookname = soup.select('.news_title')
    bookname = bookname[0].text.strip() if bookname else ''

    infolihtml = soup.select('#news_details')
    if infolihtml:
        infolihtml = infolihtml[0].ul
        # The detail list contains lines like '作者:xxx' and '格式:pdf';
        # match on the label, then strip it off to keep just the value.
        bookauthor = infolihtml.li.find(string=re.compile('作者:(.*?)'))
        bookauthor = bookauthor.strip().replace('作者:', '') if bookauthor else ''

        booktype = infolihtml.find(string=re.compile('格式:(.*?)'))
        booktype = booktype.strip().replace('格式:', '') if booktype else ''

        print('title+author', bookname + '-' + bookauthor)
        dataArray.append([cate_name, bookname, bookauthor, booktype, url])


# Kick off the crawl
if __name__ == '__main__':
    getcategory()
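
When run, the script prints its progress for each category and book and then writes books.xlsx to the working directory. Keep in mind that kgbook.com's markup can change at any time, so the selectors above ('.channel-item', '.list-title > a', '.news_title', '#news_details') may need adjusting.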