import re
import os
import time
import asyncio
import aiohttp
import aiofiles
import requests
from lxml import etree
def search():
    """Prompt for a novel title and search 51shucheng.net for it.

    Returns:
        dict mapping each matching novel title to a ``[title, detail_url]``
        list, or the integer 0 when the search fails or finds nothing
        (the caller tests for 0 explicitly).
    """
    userinfo = input("请输入想要爬取的小说名:").strip()
    # Let requests build the query string once.  The original embedded the
    # query in the URL *and* passed it via params, so ?q= appeared twice.
    url = 'https://www.51shucheng.net/search'
    params = {'q': userinfo}
    resp = requests.get(url, params=params)
    resp.encoding = 'utf-8'
    search_result_obj = re.compile(
        r'.*?<div class="search_result"><div class="(?P<type>.*?)"><h1>(?P<search_result_content>.*?)'
        r'<div class="footer">.*?<div class="footer-nav">', re.S)
    search_result_value = search_result_obj.findall(resp.text)
    if not search_result_value:
        print("查询失败或暂无此书")
        return 0
    fiction_all = {}
    # Compile the per-link pattern once, outside the loop.
    link_obj = re.compile(r'.*?<li><a href="(?P<hearf>.*?)" title="(?P<title>.*?)"><h3>', re.S)
    for search_result_text in search_result_value:
        # findall with multiple groups yields tuples; stringify so the
        # link regex can scan the captured fragment.
        search_result_text = str(search_result_text)
        for match in link_obj.finditer(search_result_text):
            href = match.group('hearf').strip()
            title = match.group('title').replace('在线阅读', '').strip()
            # Same [title, href] value shape the original produced.
            fiction_all[title] = [title, href]
    return fiction_all
def choois_fiction(dict):
    """Print the numbered search results and let the user pick one.

    Args:
        dict: mapping title -> [title, url] from search(), or 0 on failure.
            (The name shadows the builtin; kept for interface compatibility.)

    Returns:
        A ``(chapter_url_list, fiction_name)`` tuple for a valid choice,
        or None when the entered index is out of range.
    """
    if dict == 0:
        # search() signalled failure; nothing to choose from.
        exit()
    download_list = []
    # enumerate replaces the hand-maintained counter of the original.
    for index, (key, value) in enumerate(dict.items()):
        print(index, key)
        download_list.append(value)
    userchooise = int(input('请选择你要下载哪一部小说序号:').strip())
    # Original used `<= len(...)`, so entering exactly len() raised
    # IndexError; valid indices are 0..len-1, and negatives are rejected too.
    if 0 <= userchooise < len(download_list):
        print('选择要下载的小说是:', f'{download_list[userchooise][0]}')
        page_lists = get_download_url(download_list[userchooise][1])
        page_list_all_url = page_lists[0]
        fiction_name = page_lists[1][0]
        return page_list_all_url, fiction_name
    print('输入的编号不存在')
def get_download_url(url):
    """Fetch a novel's index page and extract all chapter links.

    Returns:
        A ``(chapter_urls, chapter_titles)`` tuple; each title is the first
        space-separated token of the link's ``title`` attribute.
    """
    index_page = requests.get(url)
    index_page.encoding = 'utf-8'
    chapter_pattern = re.compile(
        r'.*?<li><a href="(?P<href>.*?)" title="(?P<title>.*?)">.*?</a></li>', re.S)
    chapter_urls = []
    chapter_titles = []
    for match in chapter_pattern.finditer(index_page.text):
        chapter_titles.append(match.group('title').strip().split(' ', 2)[0])
        chapter_urls.append(match.group('href').strip())
    return chapter_urls, chapter_titles
async def download_one(url, name):
    """Fetch one chapter page and save its text to ``<name>/<chapter>.txt``."""
    async with aiohttp.ClientSession() as session:
        async with session.get(url) as resp:
            page_html = await resp.text(encoding='utf-8')
    # Parse after the response context closes: the body is already in memory.
    tree = etree.HTML(page_html)
    page_name = tree.xpath("//div[@class='content book-content']/h1/text()")[0].strip()
    page_text = '\n'.join(tree.xpath("//div[@class='neirong']/p/text()"))
    async with aiofiles.open(f'{name}/{page_name}.txt', mode='w', encoding='utf-8') as f:
        await f.write(page_text)
    print(f'{page_name}', '下载完成')
async def download_all(list, name):
    """Download every chapter URL in *list* concurrently into directory *name*.

    Args:
        list: iterable of chapter URLs (the name shadows the builtin; kept
            for interface compatibility with existing callers).
        name: novel title, used as the output directory.
    """
    # makedirs(exist_ok=True) avoids the check-then-create race of the
    # original exists()/mkdir() pair.
    os.makedirs(name, exist_ok=True)
    tasks = [asyncio.create_task(download_one(lis, name)) for lis in list]
    # asyncio.wait raises ValueError on an empty task set, so guard it.
    if tasks:
        await asyncio.wait(tasks)
    print('总共:', len(tasks), '个任务下载完成')
def main():
    """Entry point: search, let the user choose a novel, download all chapters."""
    search_result = search()
    fiction_all = choois_fiction(search_result)
    # choois_fiction returns None on an out-of-range choice; bail out
    # cleanly instead of crashing on the subscripts below.
    if fiction_all is None:
        return
    time_strt = time.time()
    # asyncio.run replaces the get_event_loop()/run_until_complete pattern,
    # which is deprecated since Python 3.10.
    asyncio.run(download_all(fiction_all[0], fiction_all[1]))
    time_end = time.time()
    print('总耗时:', time_end - time_strt)
# Standard script guard: run the crawler only when executed directly.
if __name__ == '__main__':
    main()