Downloading 无忧书城 novels with coroutines (use with caution; it is very fast)

import re
import os
import time
import asyncio
import aiohttp
import aiofiles
import requests
from lxml import etree
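# Third-party dependencies, assumed installed beforehand:
#   pip install requests aiohttp aiofiles lxml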



def search():
    '''Search the site for a novel.'''
    userinfo = input("Enter the name of the novel to crawl: ").strip()
    url = 'https://www.51shucheng.net/search'
    params = {
        'q': userinfo       # query parameter for the search endpoint
    }
    proxies = {
        'https': '113.238.142.208:3128'  # optional proxy, in case frequent requests get the IP blocked
    }
    # Pass proxies=proxies below to actually route the request through the proxy.
    resp = requests.get(url, params=params)  # send the request
    resp.encoding = 'utf-8'  # decode the response as UTF-8
    search_result_obj = re.compile(r'.*?<div class="search_result"><div class="(?P<type>.*?)"><h1>(?P<search_result_content>.*?)'
                                   r'<div class="footer">.*?<div class="footer-nav">', re.S)  # the "type" group tells us whether any results exist
    search_result_value = search_result_obj.findall(resp.text)  # pull the search-result block out of the response
    if not search_result_value:  # an empty list means the search failed; return None so callers can stop
        print("Search failed, or the book does not exist")
        return None
    fiction_all = {}  # every hit, stored as {title: [title, url], ...}
    for search_result_text in search_result_value:
        search_result_text = str(search_result_text)  # findall returns a tuple of groups; flatten it to a string for the inner regex
        search_result_value_obj = re.compile(r'.*?<li><a href="(?P<href>.*?)" title="(?P<title>.*?)"><h3>', re.S)
        search_result_resp = search_result_value_obj.finditer(search_result_text)
        for search_result_end in search_result_resp:
            href = search_result_end.group('href').strip()
            title = search_result_end.group('title').replace('在线阅读', '').strip()  # drop the site's "在线阅读" suffix
            fiction_all[title] = [title, href]
    return fiction_all  # all search hits for the query
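# Illustrative shape of the dict search() returns (placeholder values):
#   {'书名': ['书名', 'https://www.51shucheng.net/...'], ...}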


def choose_fiction(fiction_all):
    '''Pick which novel to download.'''
    if not fiction_all:  # search() found nothing
        exit()
    download_list = []
    for index, (title, value) in enumerate(fiction_all.items()):
        print(index, title)
        download_list.append(value)
    user_choice = int(input('Enter the number of the novel to download: ').strip())
    if 0 <= user_choice < len(download_list):  # valid indices run from 0 to len - 1
        print('Selected novel:', f'{download_list[user_choice][0]}')
        page_lists = get_download_url(download_list[user_choice][1])
        page_list_all_url = page_lists[0]
        fiction_name = page_lists[1][0]
        return page_list_all_url, fiction_name
    print('No such number')
    exit()
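# Note: int() raises ValueError if the input is not a number. A minimal sketch
# of re-prompting instead of crashing (an assumption about the desired behaviour):
#   while True:
#       try:
#           user_choice = int(input('Enter the number of the novel to download: ').strip())
#           break
#       except ValueError:
#           print('Please enter a number')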


def get_download_url(url):
    '''Collect the URLs of every chapter to download.'''
    page_source = requests.get(url)
    page_source.encoding = 'utf-8'
    page_source_obj = re.compile(r'.*?<li><a href="(?P<href>.*?)" title="(?P<title>.*?)">.*?</a></li>', re.S)
    page_source_result = page_source_obj.finditer(page_source.text)
    page_source_lists = []
    fiction_name = []
    for page_source_li in page_source_result:
        href = page_source_li.group('href').strip()
        title = page_source_li.group('title').strip()
        filename = title.split(' ', 2)[0]  # the novel's name is the first space-separated token of the link title
        fiction_name.append(filename)
        page_source_lists.append(href)
    return page_source_lists, fiction_name
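# Shape of get_download_url's return value: a pair of lists,
#   ([chapter_url, chapter_url, ...], [novel_name, novel_name, ...])
# where the name list repeats the novel's name once per chapter; callers read
# only its first element.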


async def download_one(url, name):
    '''Download a single chapter as one coroutine task.'''
    async with aiohttp.ClientSession() as session:
        async with session.get(url) as resp:
            one_page_source = await resp.text(encoding='utf-8')
            tree = etree.HTML(one_page_source)
            page_name = tree.xpath("//div[@class='content book-content']/h1/text()")[0].strip()
            page_text = '\n'.join(tree.xpath("//div[@class='neirong']/p/text()"))
            async with aiofiles.open(f'{name}/{page_name}.txt', mode='w', encoding='utf-8') as f:
                await f.write(page_text)
            print(f'{page_name}', 'download finished')

# The variant below downloads the same page with regex matching instead of
# xpath; it is slightly slower, which is why the lxml version above is used.
# A single re.sub strips the <br />, <p>, </div> tags and the embedded ad
# <script>/<ins> markup that the raw HTML contains.
# one_page_name_obj = re.compile(r'.*?<div class="content book-content">.*?<h1>(?P<page_name>.*?)</h1>', re.S)
# page_name = one_page_name_obj.findall(one_page_source)[0]
# one_page_obj = re.compile(r'.*?<div class="neirong" id="neirong">(?P<page_content>.*?)<div class="ad-bottom">', re.S)
# one_page_text = one_page_obj.findall(one_page_source)[0]
# one_page_texts = re.sub(r'<br />|</?p>|</div>|<script.*?</script>|<ins.*?</ins>', '', one_page_text, flags=re.S).strip()
# async with aiofiles.open(f'{name}/{page_name}.txt', mode='w', encoding='utf-8') as f:
#     await f.write(one_page_texts)
# print(f'{page_name}', 'download finished')

async def download_all(urls, name):
    '''Download every chapter concurrently.'''
    if not os.path.exists(name):  # one directory per novel, named after it
        os.mkdir(name)
    tasks = [asyncio.create_task(download_one(url, name)) for url in urls]
    await asyncio.gather(*tasks)  # gather, unlike asyncio.wait(), propagates exceptions from the tasks
    print('Total:', len(tasks), 'tasks finished')
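# download_all starts one task per chapter all at once, which is why the title
# warns the script is very fast. A minimal sketch of capping concurrency with
# asyncio.Semaphore (the limit of 10 is an assumption; tune it to taste):
async def download_one_limited(url, name, sem):
    '''Run download_one under a semaphore so only a few chapters download at once.'''
    async with sem:
        await download_one(url, name)

# Inside download_all you would then build the task list like this:
#   sem = asyncio.Semaphore(10)
#   tasks = [asyncio.create_task(download_one_limited(u, name, sem)) for u in urls]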


def main():
    '''Main entry point.'''
    search_result = search()  # run the search and get the results
    fiction_all = choose_fiction(search_result)  # pick which book to download
    time_start = time.time()
    asyncio.run(download_all(fiction_all[0], fiction_all[1]))  # asyncio.run() replaces the deprecated get_event_loop()/run_until_complete() pattern
    time_end = time.time()
    print('Total time:', time_end - time_start)



if __name__ == '__main__':
    main()