【python爬虫】爬取CSDN文章输出为html、pdf格式-CSDN博客

67 阅读1分钟

代码运行:
在这里插入图片描述
pdf展示
在这里插入图片描述

需要下载 wkhtmltopdf,用于将 html 转换为 pdf
wkhtmltopdf下载网址:wkhtmltopdf.org/downloads.html

代码:csdn_article.py

import os
import requests
from lxml import etree
import pdfkit
# Location of the wkhtmltopdf binary; pdfkit shells out to this executable.
config = pdfkit.configuration(wkhtmltopdf=r"D:\wkhtmltopdf\bin\wkhtmltopdf.exe")


# One shared session so every request carries the same browser User-Agent.
agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36'
sess = requests.session()
sess.headers.update({'User-Agent': agent})


def crawler_blog_by(title, url=None):
    """Fetch a CSDN article, save its body as HTML, then convert it to PDF.

    Args:
        title: Base name for the saved ``c_articles/<title>.html`` /
            ``.pdf`` files; path-illegal characters are stripped.
        url: Article URL to fetch. Defaults to the original hard-coded
            article link so existing callers behave the same.
    """
    # Paste an article link here, or pass it via ``url``.
    article_request_url = url or 'https://blog.csdn.net/weixin_44319595/article/details/133311597'
    response = sess.get(article_request_url)

    selector = etree.HTML(response.text)
    # Keep only the article body: CSDN renders it under div#content_views.
    body_msg = selector.xpath(r"//div[@id='content_views']")[0]
    # Strip stray '\quad' fragments (leftover LaTeX spacing in some posts).
    body_str = etree.tostring(body_msg, encoding='utf8', method='html').decode().replace('\quad','')

    # Race-free "create if missing" instead of exists()+mkdir().
    os.makedirs('c_articles', exist_ok=True)

    # Remove characters that are illegal in (Windows) file names.
    title = title.replace("/","-").replace(":","")
    save_file_name = os.path.join('c_articles',f'{title}.html')
    with open(save_file_name,'w',encoding='utf8') as writer:
        writer.write(f"""<head><meta charset="utf8"></head>
                    {body_str}""")
    # Convert AFTER the ``with`` block so the HTML file is flushed and closed
    # before wkhtmltopdf reads it (the original converted inside the block,
    # risking a truncated/empty PDF from an unflushed file).
    html_to_pdf(save_file_name)

def html_to_pdf(file_html_name):
    """Convert the given HTML file to a PDF with the same base name."""
    base_name, _ = os.path.splitext(file_html_name)
    pdf_output_path = base_name + '.pdf'
    pdfkit.from_file(file_html_name, pdf_output_path, configuration=config)

if __name__ == '__main__':
    # Guard the interactive entry point so importing this module does not
    # block on input() or fire a network request as a side effect.
    filename = input('请输入文件名:')
    crawler_blog_by(filename)