Preface
Graduation season is almost here again: ninth graders face the high school entrance exam, high school seniors face the college entrance exam, and college seniors face their thesis! Every June is when your learning gets put to the test. Well, we all went through it too, haha! I can't help the exam takers today, but I can lend at least a little help to the seniors working toward their diploma!
So here is a paper downloader I wrote in half an hour, for everyone to use as a reference while writing their thesis. Without further ado, let's get into it!
Main Content
Development Tools
Python version: 3.7.8
Related modules:
requests;
paperdl;
reportlab;
pillow;
img2pdf;
PyPDF2;
plus a few modules that ship with Python.
Environment Setup
Install Python and add it to your PATH, then pip install the required modules.
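All of the third-party modules listed above can be installed in one go:
pip install requests paperdl reportlab pillow img2pdf PyPDF2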
Content
1. Support for direct invocation from the terminal
As with musicdl and videodl, I added support for calling paperdl directly from the terminal, which makes it more convenient to use. The core code is as follows:
import json
import click
from paperdl import Paperdl  # the Paperdl client class; the exact import path may differ depending on the package layout

'''run paperdl directly from the terminal'''
@click.command()
@click.version_option()
@click.option('-m', '--mode', default='download', help='the used mode, support "search" and "download"')
@click.option('-i', '--inp', default=None, help='the paper to download, the supported format is the same as sci-hub')
@click.option('-s', '--source', default=None, help='the used source, support "arxiv", "scihub" and "googlescholar", you can use "," to split multi sources')
@click.option('-d', '--savedir', default='papers', help='the directory for saving papers')
@click.option('-l', '--logfilepath', default='paperdl.log', help='the logging filepath')
@click.option('-z', '--size', default=5, help='search size per source')
@click.option('-p', '--proxies', default='{}', help='the proxies to be adopted')
@click.option('-a', '--area', default='CN', help='your area, support "CN" and "EN"')
@click.option('-c', '--cookie', default=None, help='the cookie copied from the target website, only used in "baiduwenku"')
def paperdlcmd(mode, inp, source, savedir, logfilepath, size, proxies, area, cookie):
    # prepare the config
    assert mode in ['search', 'download']
    area = area.upper()
    if mode == 'download': assert inp is not None, 'input url should be specified in download mode'
    config = {
        'logfilepath': logfilepath,
        'savedir': savedir,
        'search_size_per_source': size,
        'proxies': json.loads(proxies),
        'area': area,
    }
    if source is None:
        target_srcs = ['arxiv', 'googlescholar']
    else:
        target_srcs = [s.strip() for s in source.split(',')]
    client = Paperdl(config=config)
    # search mode
    if mode == 'search':
        client.run(target_srcs=target_srcs)
    # download mode: pick the source from the url if none was specified
    else:
        print(client)
        if source is None:
            if 'wenku.baidu.com' in inp:
                source = 'baiduwenku'
            else:
                source = 'scihub'
        paperinfo = {
            'savename': inp.strip('/').split('/')[-1],
            'ext': 'pdf',
            'savedir': savedir,
            'input': inp,
            'source': source,
        }
        if source in ['baiduwenku']: paperinfo['cookie'] = cookie
        client.download([paperinfo])
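Not shown above is how paperdlcmd becomes a paperdl command on your PATH. A minimal sketch of the usual setuptools approach follows; the module path 'paperdl.paperdl' is my assumption, not confirmed from the package source:

# setup.py -- a minimal sketch, not paperdl's actual setup script
from setuptools import setup, find_packages

setup(
    name='paperdl',
    packages=find_packages(),
    # maps the `paperdl` terminal command to the click entry point above
    entry_points={'console_scripts': ['paperdl = paperdl.paperdl:paperdlcmd']},
)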
2. Fixed a small bug
Fixed an issue on Windows where downloads could not be saved because the filename contained illegal characters. Specifically, the core code that solves the problem is as follows:
import re

'''clear bad characters in filename'''
def filterBadCharacter(string):
    # characters that are illegal in Windows filenames, plus leftover html tags
    need_removed_strs = ['<em>', '</em>', '<', '>', '\\', '/', '?', ':', '"', '：', '|', '？', '*']
    for item in need_removed_strs:
        string = string.replace(item, '')
    # strip emojis and other characters outside the basic multilingual plane
    try:
        rule = re.compile(u'[\U00010000-\U0010ffff]')
    except re.error:
        # fallback for narrow python builds that store them as surrogate pairs
        rule = re.compile(u'[\uD800-\uDBFF][\uDC00-\uDFFF]')
    string = rule.sub('', string)
    return string.strip().encode('utf-8', 'ignore').decode('utf-8')
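A quick example of what the filter does to a scraped title (the sample string is my own, for illustration):

print(filterBadCharacter('<em>Attention</em> Is All You Need?: A Survey'))
# -> 'Attention Is All You Need A Survey'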
3. Added download support for Baidu Wenku
Added support for downloading documents from Baidu Wenku. The core code is as follows:
import re
import json
import time

'''Search and download papers from Baiduwenku'''
class Baiduwenku(Base):  # Base supplies self.session and the shared download logic
    def __init__(self, config=None, logger_handle=None, **kwargs):
        super(Baiduwenku, self).__init__(config, logger_handle, **kwargs)
        self.source = 'baiduwenku'
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.102 Safari/537.36',
        }
    '''parse paper infos before download paper'''
    def parseinfosbeforedownload(self, paperinfos):
        for paperinfo in paperinfos:
            self.parseinfobeforedownload(paperinfo)
            paperinfo['source'] = self.source
        return paperinfos
    '''parse paper info before download paper'''
    def parseinfobeforedownload(self, paperinfo):
        # prepare
        input_content = paperinfo['input']
        url = input_content.split('?')[0] + '?edtMode=2'
        headers = self.headers.copy()
        if 'cookie' in paperinfo: headers['Cookie'] = paperinfo['cookie']
        headers['Referer'] = url
        self.session.headers.update(headers)
        # obtain the basic infos
        response = self.session.get(url)
        page_data = re.search(r'var pageData = (.*);', response.text)
        page_data = json.loads(page_data.group(1))
        title = re.search(r'<title>(.*) - 百度文库</title>', response.text).group(1)
        filetype = page_data['viewBiz']['docInfo']['fileType']
        docid = url.split('?')[0].split('/')[-1][:-5]
        paperinfo['savename'] = title
        paperinfo['filetype'] = filetype
        paperinfo['docid'] = docid
        # ppt
        if page_data['readerInfo']['tplKey'] == 'new_view' and filetype in ['ppt']:
            download_url = page_data['readerInfo']['htmlUrls']
            paperinfo['download_url'] = download_url
        # word, pdf, excel
        elif page_data['readerInfo']['tplKey'] == 'html_view' and filetype in ['word', 'pdf', 'excel']:
            jsons = {x['pageIndex']: x['pageLoadUrl'] for x in page_data['readerInfo']['htmlUrls']['json']}
            pngs = {x['pageIndex']: x['pageLoadUrl'] for x in page_data['readerInfo']['htmlUrls']['png']}
            fonts_csss = {x['pageIndex']: 'https://wkretype.bdimg.com/retype/pipe/' + docid + '?pn=' + str(x['pageIndex']) + '&t=ttf&rn=1&v=6' + x['param'] for x in page_data['readerInfo']['htmlUrls']['ttf']}
            # only the first 100 pages come with the page itself, fetch the rest in chunks of 50
            if page_data['readerInfo']['page'] > 100:
                for pn in list(range(101, page_data['readerInfo']['page'] + 1, 50)):
                    url = f"https://wenku.baidu.com/ndocview/readerinfo?doc_id={docid}&docId={docid}&type=html&clientType=1&pn={pn}&t={str(int(time.time()))}&isFromBdSearch=0&rn=50"
                    response = self.session.get(url)
                    page_data_others = json.loads(response.text)['data']['htmlUrls']
                    jsons.update({x['pageIndex']: x['pageLoadUrl'] for x in page_data_others['json']})
                    pngs.update({x['pageIndex']: x['pageLoadUrl'] for x in page_data_others['png']})
                    fonts_csss.update({x['pageIndex']: 'https://wkretype.bdimg.com/retype/pipe/' + docid + '?pn=' + str(x['pageIndex']) + '&t=ttf&rn=1&v=6' + x['param'] for x in page_data_others['ttf']})
            download_url = {'fonts_csss': fonts_csss, 'jsons': jsons, 'pngs': pngs}
            paperinfo['download_url'] = download_url
        # text
        elif page_data['readerInfo']['tplKey'] == 'txt_view' and filetype in ['txt']:
            lines = re.findall(r'<p class="p-txt">(.*)</p>', response.text)
            lines = [line for line in lines if line]
            lines[-1] = lines[-1][:-1]
            download_url = 'https://wkretype.bdimg.com/retype/text/' + docid + page_data['readerInfo']['md5sum'] + '&pn=2&rn=' + str(int(page_data['viewBiz']['docInfo']['page']) - 1) + '&type=txt&rsign=' + page_data['readerInfo']['rsign'] + '&callback=cb&_=' + str(int(time.time()))
            response = self.session.get(download_url)
            lines_others_json = json.loads(response.text[3: -1])
            lines_others = [x['parags'][0]['c'][:-2] for x in lines_others_json]
            lines = lines + lines_others
            paperinfo['download_url'] = download_url
            paperinfo['lines'] = lines
            paperinfo['ext'] = 'txt'
Note: if a document downloads incompletely, it may be because the cookie you supplied comes from a non-VIP Baidu Wenku account, which cannot access the full text.
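As an aside, the pngs dict collected above maps page indices to per-page image urls. Here is a minimal sketch (my own illustration, not paperdl's actual download routine) of stitching those pages into one PDF with the img2pdf module from the dependency list:

import img2pdf
import requests

def pngs2pdf(pngs, savepath='output.pdf'):
    # pngs: {pageIndex: pageLoadUrl}, as built in parseinfobeforedownload above
    pages = [requests.get(pngs[idx]).content for idx in sorted(pngs)]
    # img2pdf accepts raw image bytes and writes one page per image
    with open(savepath, 'wb') as fp:
        fp.write(img2pdf.convert(pages))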
4. Colorized part of the terminal output
This is mainly for a better user experience; all black-and-white output can tire your eyes after a while. The core code is as follows:
# ANSI escape codes for the supported colors (paperdl's actual mapping may differ)
COLORS = {'red': '\033[31m', 'green': '\033[32m', 'yellow': '\033[33m', 'blue': '\033[34m'}

'''colorize words in terminal'''
def colorize(string, color):
    string = str(string)
    if color not in COLORS: return string
    # wrap the string in the color code and reset at the end
    return COLORS[color] + string + '\033[0m'
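For example, using the COLORS mapping sketched above:

print(colorize('[INFO]: download finished', 'green'))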
Demo
First, pip install the paperdl module:
pip install paperdl
Then the usage is as follows:
Usage: paperdl [OPTIONS]
Options:
--version Show the version and exit.
-m, --mode TEXT the used mode, support "search" and "download"
-i, --inp TEXT the paper to download, the supported format is the
same as sci-hub
-s, --source TEXT the used source, support "arxiv", "scihub" and
"googlescholar", you can use "," to split multi
sources
-d, --savedir TEXT the directory for saving papers
-l, --logfilepath TEXT the logging filepath
-z, --size INTEGER search size per source
-p, --proxies TEXT the proxies to be adopted
-a, --area TEXT your area, support "CN" and "EN"
-c, --cookie TEXT the cookie copied from the target website, only used
in "baiduwenku"
--help Show this message and exit.
For example (for Baidu Wenku, you must pass the cookie obtained from your browser via -c):
# an SCI paper
paperdl -i https://ieeexplore.ieee.org/document/9193963/
# a Baidu Wenku document
paperdl -c cookie -i https://wenku.baidu.com/view/291a3ff982d049649b6648d7c1c708a1284a0ad9.html
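Two more hedged examples inferred from the CLI code above: search mode (the keyword appears to be entered interactively, since -i is only asserted for downloads), and passing proxies as the JSON string that json.loads expects:

# search arxiv and google scholar
paperdl -m search -s arxiv,googlescholar
# download through a local proxy (requests-style proxies dict as JSON)
paperdl -p '{"http": "http://127.0.0.1:1080"}' -i https://ieeexplore.ieee.org/document/9193963/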
A demo of the tool in use (screen recording omitted here).
Wrapping Up
Did you learn something? Pretty simple, right? For the complete code and answers to any questions, follow the WeChat public account: Python源码