#-*- coding: utf-8 -*-
from requests_html import HTMLSession
import os
# Page that is fetched to discover article links, and the Markdown file the
# collected links are appended to.
WEBSITE = 'https://www.ruanyifeng.com/blog/weekly/'
OUTPUT_FILE = '/root/markdown/ruanyifengweekly.md'

# Single HTTP session reused for every request made by this module.
session = HTMLSession()
def clean():
    """Delete the output file left over from a previous run.

    Prints a notice instead of raising when ``OUTPUT_FILE`` does not exist.
    """
    # Attempt the removal directly instead of testing os.path.exists() first,
    # so a file that disappears between the check and the removal cannot
    # make the script crash.
    try:
        os.remove(OUTPUT_FILE)
    except FileNotFoundError:
        print('No such file: %s' % OUTPUT_FILE)
def get_article_list():
    """Fetch the index page and record every article linked from it.

    Looks up the first ``.module-content`` element on ``WEBSITE`` and calls
    ``get_article_detail`` with the ``href`` of the first link found inside
    each ``<li>``. List items that carry no usable link are skipped, and a
    notice is printed if the container element is not present at all.
    """
    r = session.get(WEBSITE)
    content = r.html.find('.module-content', first=True)
    if content is None:
        # find(..., first=True) gives None when the selector matches nothing.
        print('No article list found at: %s' % WEBSITE)
        return
    for item in content.find('li'):
        a = item.find('a', first=True)
        href = a.attrs.get('href') if a is not None else None
        if not href:
            # An <li> without an anchor or without an href has nothing to fetch.
            continue
        get_article_detail(href)
def get_article_detail(url):
    """Fetch one article and append a Markdown link to it to the output file.

    Args:
        url: Address of the article page to fetch.

    The line appended to ``OUTPUT_FILE`` has the form `` - [title](url)``,
    where the title is the text of the page's first ``<h1>``. A page without
    an ``<h1>`` is reported on stdout and skipped.
    """
    r = session.get(url)
    title = r.html.find('h1', first=True)
    if title is None:
        # find(..., first=True) gives None when the page has no <h1>.
        print('No title found: %s' % url)
        return
    print(title.text, url)
    # Explicit UTF-8 keeps non-ASCII titles intact regardless of the locale's
    # default encoding; the context manager closes the file even if the
    # write fails.
    with open(OUTPUT_FILE, 'a', encoding='utf-8') as f:
        f.write(' - [{}]({})\n'.format(title.text, url))
def start():
    """Run the whole job: remove the old output file, then rebuild it."""
    clean()
    get_article_list()


# Only run the scrape when executed as a script, not when imported.
if __name__ == '__main__':
    start()
使用 md-http 启动一个支持 Markdown 的 Web 服务，页面预览如下：