代码运行:
pdf展示
需要先下载并安装 wkhtmltopdf,用于将 HTML 转换为 PDF
wkhtmltopdf下载网址:https://wkhtmltopdf.org/downloads.html
代码:csdn_article.py
import os
import requests
from lxml import etree
import pdfkit
# pdfkit configuration pointing at the wkhtmltopdf executable on this machine.
config = pdfkit.configuration(
    wkhtmltopdf=r"D:\wkhtmltopdf\bin\wkhtmltopdf.exe",
)

# Chrome-on-Windows User-Agent string attached to every request sent via `sess`.
agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36'

# Single HTTP session shared by the functions below.
sess = requests.Session()
sess.headers.update({'User-Agent': agent})
def crawler_blog_by(title, article_url=None):
    """Download one CSDN article and save it as HTML and PDF in ``c_articles``.

    The element ``<div id="content_views">`` is extracted from the fetched
    page, written to ``c_articles/<title>.html`` and then handed to
    ``html_to_pdf`` to produce a PDF with the same base name.

    Args:
        title: Base name for the output files. ``/`` and ``\\`` become ``-``;
            other characters that Windows rejects in file names are removed.
        article_url: Address of the article to fetch. When omitted, the
            article that used to be hard-coded in this function is fetched.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        IndexError: If the page contains no ``content_views`` element.
    """
    if article_url is None:
        # Paste the link of the article to download here.
        article_url = 'https://blog.csdn.net/weixin_44319595/article/details/133311597'

    # A timeout keeps the script from hanging forever on a stalled connection.
    response = sess.get(article_url, timeout=30)
    # Fail early on an error page instead of trying to parse it.
    response.raise_for_status()

    selector = etree.HTML(response.text)
    content_nodes = selector.xpath(r"//div[@id='content_views']")
    if not content_nodes:
        raise IndexError(
            f"no <div id='content_views'> element found in {article_url}"
        )
    body_str = etree.tostring(
        content_nodes[0], encoding='utf8', method='html'
    ).decode()
    # Raw string: '\q' is not a valid escape sequence in a normal literal.
    body_str = body_str.replace(r'\quad', '')

    os.makedirs('c_articles', exist_ok=True)

    # Strip or replace characters that cannot appear in a Windows file name.
    unsafe_chars = str.maketrans({
        '/': '-',
        '\\': '-',
        ':': '',
        '*': '',
        '?': '',
        '"': '',
        '<': '',
        '>': '',
        '|': '',
    })
    title = title.translate(unsafe_chars)

    save_file_name = os.path.join('c_articles', f'{title}.html')
    with open(save_file_name, 'w', encoding='utf8') as writer:
        # Minimal <head> declaring the charset, followed by the article body.
        writer.write(f'<head><meta charset="utf8"></head>\n{body_str}')

    html_to_pdf(save_file_name)
def html_to_pdf(file_html_name):
    """Render an HTML file to a PDF placed next to it.

    The PDF path is the HTML path with its extension replaced by ``.pdf``.
    Conversion is delegated to ``pdfkit.from_file`` using the module-level
    ``config``.

    Args:
        file_html_name: Path of the HTML file to convert.
    """
    stem, _extension = os.path.splitext(file_html_name)
    pdf_file_name = stem + '.pdf'
    pdfkit.from_file(file_html_name, pdf_file_name, configuration=config)
# Script entry: ask for the output file name, then download and convert.
filename = input('请输入文件名:')
crawler_blog_by(title=filename)