【python爬虫】爬取CSDN个人主页文章输出为html、pdf格式-CSDN博客

76 阅读1分钟

代码运行:
在这里插入图片描述

pdf展示
在这里插入图片描述

需要先下载 wkhtmltopdf,用于将 html 转换为 pdf
wkhtmltopdf下载网址:wkhtmltopdf.org/downloads.h…

代码:csdn.py

"""
step1:爬取博主的所有博文的article_ids
step2:根据article_ids,爬取这篇文章的html,拿到想要的部分
step3:保存为html格式,再保存pdf格式
"""
import os
import random
import time
import requests
from lxml import etree
import pdfkit
# Point pdfkit at the locally installed wkhtmltopdf binary (path is machine-specific).
config = pdfkit.configuration(wkhtmltopdf=r"D:\wkhtmltopdf\bin\wkhtmltopdf.exe")

# --- run-time configuration, gathered once at start-up ---
author_name = input("请输入博主ID:")  # CSDN user id whose articles to fetch
MAX_PAGE_NUM = 200                    # upper bound on listing pages to try
i = 1                                 # running counter of saved articles

# Shared HTTP session presenting a desktop-browser User-Agent.
agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36'
sess = requests.session()
sess.headers['User-Agent'] = agent

def crawler_blog_by(author_name, article_id, title):
    """Fetch one article, save it under c_articles/ as HTML, then convert it to PDF.

    Parameters:
        author_name: CSDN user id (part of the article URL).
        article_id:  article id from the listing API.
        title:       article title; sanitized for use in the output file name.

    Side effects: writes an .html and a .pdf file, prints a progress line,
    and increments the module-level counter ``i``.
    """
    article_request_url = f'https://blog.csdn.net/{author_name}/article/details/{article_id}'
    response = sess.get(article_request_url)

    selector = etree.HTML(response.text)
    # Only the article body is kept; the page <head> is replaced by a
    # minimal charset declaration below.
    body_msg = selector.xpath(r"//div[@id='content_views']")[0]
    body_str = etree.tostring(body_msg, encoding='utf8', method='html').decode()

    # exist_ok avoids the check-then-create race of os.path.exists + mkdir.
    os.makedirs('c_articles', exist_ok=True)

    # Sanitize the title for Windows file names: keep the original
    # '/' -> '-' and ':' -> '' replacements, then drop the remaining
    # characters Windows forbids (\ * ? " < > |).
    title = title.replace("/", "-").replace(":", "")
    title = title.translate(str.maketrans('', '', '\\*?"<>|'))
    save_file_name = os.path.join('c_articles', f'{author_name}-{title}-{article_id}.html')
    with open(save_file_name, 'w', encoding='utf8') as writer:
        writer.write(f"""<head><meta charset="utf8"></head>
                    {body_str}""")
    # Convert only after the `with` block has closed (and flushed) the file;
    # converting inside it risked wkhtmltopdf reading a partially written file.
    html_to_pdf(save_file_name)
    global i
    print(f'【INFO】:{author_name}{i}篇博文{title}-{article_id}.html保存文件成功')
    i += 1

def html_to_pdf(file_html_name):
    """Convert a saved HTML file into a PDF with the same base name."""
    base, _ext = os.path.splitext(file_html_name)
    pdfkit.from_file(file_html_name, f'{base}.pdf', configuration=config)

# Crawl the paged article listing and save every article found.
for page_no in range(MAX_PAGE_NUM):
    try:
        params = {
            "page": page_no,
            "size": 20,
            "businessType": "blog",
            "orderby": "",
            "noMore": False,
            "year": "",
            "month": "",
            "username": author_name,
        }
        pages_dict = sess.get('https://blog.csdn.net/community/home-api/v1/get-business-list',
                              params=params).json()
        articles = pages_dict['data']['list']
        if not articles:
            # All posts have been listed — stop instead of polling the
            # remaining (empty) pages up to MAX_PAGE_NUM.
            break
        for article in articles:
            crawler_blog_by(author_name, article['articleId'], article['title'])

        # Small random delay between pages to avoid hammering the server.
        time.sleep(random.uniform(0.4, 1.0))
    except Exception as e:
        # Best-effort crawl: report the failure and try the next page.
        print(e)