Python 爬虫：抓取文章并生成 Markdown 文件

613 阅读1分钟
#-*- coding: utf-8 -*-

from requests_html import HTMLSession

import os


# One HTTP session shared by every request this script makes.
session = HTMLSession()


# Index page whose article links are crawled.
WEBSITE = 'https://www.ruanyifeng.com/blog/weekly/'
# Markdown file the generated link list is written to.
OUTPUT_FILE = '/root/markdown/ruanyifengweekly.md'


def clean(path=None):
    """Delete the output file left over from a previous run.

    Args:
        path: File to delete. Defaults to ``OUTPUT_FILE`` when omitted, so
            existing ``clean()`` calls keep their behavior.

    A missing file is not an error: a "No such file" message is printed
    instead.
    """
    target = OUTPUT_FILE if path is None else path

    # Attempt the removal directly rather than checking os.path.exists()
    # first: the file could disappear between the check and the removal.
    try:
        os.remove(target)
    except FileNotFoundError:
        print('No such file: %s' % target)



def get_article_list():
    """Fetch the index page and process every article it links to.

    Downloads ``WEBSITE``, looks up the first ``.module-content`` element and
    calls ``get_article_detail`` with the ``href`` of the first link found in
    each ``<li>`` inside it.

    List items without a link, or whose link has no ``href``, are skipped.
    If the page has no ``.module-content`` element a message is printed and
    nothing is processed.
    """
    r = session.get(WEBSITE)

    # find(..., first=True) yields None when nothing matches, so guard
    # before dereferencing the result.
    content = r.html.find('.module-content', first=True)
    if content is None:
        print('No article list found at: %s' % WEBSITE)
        return

    for item in content.find('li'):
        a = item.find('a', first=True)
        if a is None:
            continue

        href = a.attrs.get('href')
        if not href:
            continue

        get_article_detail(href)



def get_article_detail(url):
    """Fetch one article and append a Markdown link for it to ``OUTPUT_FILE``.

    The link text is the text of the page's first ``<h1>``. Each entry is
    written on its own line in the form `` - [title](url)``. A page without
    an ``<h1>`` is reported and skipped.

    Args:
        url: Address of the article page to download.
    """
    r = session.get(url)

    title = r.html.find('h1', first=True)
    if title is None:
        # Nothing usable as link text; skip rather than fail on None.
        print('No title found: %s' % url)
        return

    print(title.text, url)

    # Explicit UTF-8 keeps non-ASCII titles writable whatever the locale's
    # default encoding is; the with-block closes the file even if the write
    # raises.
    with open(OUTPUT_FILE, 'a', encoding='utf-8') as f:
        f.write(' - [{}]({})\n'.format(title.text, url))



def start():
    """Run the whole job: drop the old output file, then rebuild it."""
    # Order matters: the output file is appended to, so it has to be removed
    # before the crawl starts writing.
    for step in (clean, get_article_list):
        step()



# Run the crawl only when this file is executed as a script, not on import.
if __name__ == "__main__":

    start()

使用 md-http 启动一个支持 Markdown 的 Web 服务，页面预览如下：

image.png