import requests
import re
import os
import time
def search(timeout=10):
    '''Prompt for a novel name and search 51shucheng.net for matching books.

    Args:
        timeout: Seconds to wait for the search request before ``requests``
            gives up and raises a timeout error.

    Returns:
        A dict mapping each book title (with the "在线阅读" suffix removed)
        to a ``[title, href]`` list, or the integer ``0`` after printing a
        notice when no book could be extracted from the response.
    '''
    userinfo = input("请输入想要爬取的小说名:").strip()

    # Send the query through ``params`` only, so requests URL-encodes it and
    # the ``q`` parameter is not duplicated in the final URL.
    resp = requests.get(
        'https://www.51shucheng.net/search',
        params={'q': userinfo},
        timeout=timeout,
    )
    resp.encoding = 'utf-8'

    search_result_obj = re.compile(
        r'.*?<div class="search_result"><div class="(?P<type>.*?)"><h1>(?P<search_result_content>.*?)'
        r'<div class="footer">.*?<div class="footer-nav">', re.S)
    search_result_value_obj = re.compile(
        r'.*?<li><a href="(?P<hearf>.*?)" title="(?P<title>.*?)"><h3>', re.S)

    fiction_all = {}
    for result_block in search_result_obj.finditer(resp.text):
        # Parse the captured HTML itself rather than the repr() of the match
        # tuple, whose escaping (\n, \', \\) could leak into hrefs and titles.
        content = result_block.group('search_result_content')
        for link in search_result_value_obj.finditer(content):
            href = link.group('hearf').strip()
            title = link.group('title').replace('在线阅读', '').strip()
            fiction_all[title] = [title, href]

    # Covers both "result container not found" and "container had no links".
    if not fiction_all:
        print("查询失败或暂无此书")
        return 0
    return fiction_all
def choois_fiction(dict):
    '''List the found novels, ask the user to pick one and resolve its chapters.

    Args:
        dict: The value returned by ``search()``: a mapping of title to
            ``[title, href]``, or ``0`` when the search failed.

    Returns:
        A ``(chapter_urls, folder_name)`` tuple for the chosen novel, or
        ``None`` (after printing a notice) when the entered number is not a
        valid index.

    Raises:
        SystemExit: When there is nothing to choose from.
    '''
    if dict == 0:
        # search() has already told the user why there is nothing to show.
        raise SystemExit
    if not dict:
        print("查询失败或暂无此书")
        raise SystemExit

    download_list = []
    for index, (key, value) in enumerate(dict.items()):
        print(index, key)
        download_list.append(value)

    raw_choice = input('请选择你要下载哪一部小说序号:').strip()
    try:
        userchooise = int(raw_choice)
    except ValueError:
        userchooise = -1  # treated like any other out-of-range number

    # Valid indices are 0 .. len-1; reject len itself and negative numbers,
    # which would otherwise raise IndexError or silently count from the end.
    if not 0 <= userchooise < len(download_list):
        print('输入的编号不存在')
        return None

    chosen_title, chosen_href = download_list[userchooise][0], download_list[userchooise][1]
    print('选择要下载的小说是:', f'{chosen_title}')

    page_list_all_url, chapter_names = get_download_url(chosen_href)
    # The folder name comes from the first chapter link; fall back to the
    # chosen title when the page yielded no chapter links at all.
    fiction_name = chapter_names[0] if chapter_names else chosen_title
    return page_list_all_url, fiction_name
def get_download_url(url, timeout=10):
    '''Fetch a page and collect every ``<li><a href=... title=...>`` link on it.

    Args:
        url: Address of the page to scan (presumably a novel's chapter
            index; confirm against the caller).
        timeout: Seconds to wait for the response. Without it a stalled
            connection would block the program forever.

    Returns:
        A ``(hrefs, names)`` tuple of two parallel lists: the stripped link
        targets, and the text before the first space of each link's title.
    '''
    page_source = requests.get(url, timeout=timeout)
    page_source.encoding = 'utf-8'
    page_source_obj = re.compile(
        r'.*?<li><a href="(?P<href>.*?)" title="(?P<title>.*?)">.*?</a></li>', re.S)

    page_soutce_lists = []
    fiction_name = []
    for link in page_source_obj.finditer(page_source.text):
        title = link.group('title').strip()
        page_soutce_lists.append(link.group('href').strip())
        # Only the part before the first space is kept.
        fiction_name.append(title.split(' ', 1)[0])
    return page_soutce_lists, fiction_name
# Exact advertisement snippet that is stripped from chapter text.
_AD_SNIPPET = (
    '<script async src="//pagead2.googlesyndication.com/pagead/js/adsbygoogle.js"></script>\n'
    '<ins class="adsbygoogle"\n'
    'style="display:block; text-align:center;"\n'
    'data-ad-layout="in-article"\n'
    'data-ad-format="fluid"\n'
    'data-ad-client="ca-pub-9405492286923119"\n'
    'data-ad-slot="4945244819"></ins>\n'
    '<script>\n'
    '(adsbygoogle = window.adsbygoogle || []).push({});\n'
    '</script>'
)
# Fallback for ad markup whose whitespace differs from the exact snippet.
_AD_ELEMENT_RE = re.compile(r'<(script|ins)\b.*?</\1>', re.S)
_CHAPTER_TITLE_RE = re.compile(
    r'.*?<div class="content book-content">.*?<h1>(?P<page_name>.*?)</h1>', re.S)
_CHAPTER_BODY_RE = re.compile(
    r'.*?<div class="neirong" id="neirong">(?P<page_content>.*?)<div class="ad-bottom">', re.S)


def _clean_chapter_text(fragment):
    '''Strip the known markup tokens and ad elements from one content fragment.'''
    text = fragment.strip()
    for token in ('<br />', '<p>', '</p>', '</div>', _AD_SNIPPET):
        text = text.replace(token, '')
    return _AD_ELEMENT_RE.sub('', text)


def download_one(url, name, timeout=10):
    '''Download one chapter page and save its text as ``<name>/<title>.txt``.

    Args:
        url: Address of the chapter page.
        name: Existing directory the text file is written into.
        timeout: Seconds to wait for the response before ``requests`` raises
            a timeout error.

    Returns:
        None. When the title or the content cannot be found in the page, a
        notice is printed and no file is written.
    '''
    print('这里是执行单个下载任务的函数')
    time_one_star = time.perf_counter()

    resp = requests.get(url, timeout=timeout)
    resp.encoding = 'utf-8'
    page_names = _CHAPTER_TITLE_RE.findall(resp.text)
    one_page_results = _CHAPTER_BODY_RE.findall(resp.text)

    # Bail out instead of raising IndexError or reporting a bogus success.
    if not page_names or not one_page_results:
        print(f'{url}', '解析失败,已跳过')
        return

    page_name = page_names[0]
    # Path separators in a title would point into a non-existent directory.
    safe_name = re.sub(r'[\\/]', '_', page_name)
    # Write once: reopening the file with mode 'w' for every fragment would
    # keep only the last one.
    one_page_texts = '\n'.join(_clean_chapter_text(part) for part in one_page_results)
    with open(os.path.join(name, f'{safe_name}.txt'), mode='w', encoding='utf-8') as f:
        f.write(one_page_texts)

    time_one_end = time.perf_counter()
    print('执行完单个任务总共花费:', time_one_end - time_one_star)
    print(f'{page_name}', '下载完成')
def download_all(list, name):
    '''Download every chapter in ``list`` into the directory ``name``.

    Args:
        list: Iterable of chapter URLs; each is passed to ``download_one``.
        name: Target directory. It is created, including any missing parent
            directories, when it does not exist yet.

    Returns:
        None. Progress and the total elapsed time are printed.
    '''
    print('\033[1;32m{}\033[0m'.format('这里是执行download_all的时间'))
    time_all_star = time.perf_counter()

    # makedirs(exist_ok=True) has no check-then-create race and also works
    # for nested target paths, where os.mkdir raises FileNotFoundError.
    os.makedirs(name, exist_ok=True)

    for chapter_url in list:
        download_one(chapter_url, name)

    time_all_end = time.perf_counter()
    print('\033[1;32m{}\033[0m'.format('执行完毕all函数一共花费:'), time_all_end - time_all_star)
def main():
    '''Run the interactive flow: search, choose a novel, download its chapters.

    Prints the elapsed download time in seconds when a download was run.
    '''
    search_result = search()
    fiction_all = choois_fiction(search_result)
    # choois_fiction() returns None when the entered number is not valid;
    # subscripting that would raise TypeError, so stop here instead.
    if fiction_all is None:
        return

    chapter_urls, folder_name = fiction_all[0], fiction_all[1]
    time_strt = time.time()
    download_all(chapter_urls, folder_name)
    time_end = time.time()
    print(time_end - time_strt)
# Script entry point: start the interactive downloader only when this file is
# executed directly, not when it is imported as a module.
if __name__ == '__main__':
    main()