Usage Guide
- This script scrapes the news articles on the homepage of the FT News site (cn.ft.com), collecting three fields per article: the headline, the article link, and the publication date.
- Run it under Python 3, with the requests, bs4 (BeautifulSoup), lxml, and pandas libraries installed.
- Results are saved as "FT新闻.csv" in the same directory as the script (change the value passed as filename in the code to set a different file name or path); see the loading sketch after this list.
- Before using this crawler, make sure your network can reach the FT News site; otherwise the requests will fail and the crawler will exit with an error.
- This crawler is for learning and exchange only; do not use it for commercial purposes.
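Since saveData writes rows without a header line, here is a minimal sketch of loading the result back for analysis (the column names are illustrative labels of my own, not stored in the file):

import pandas as pd

# The scraper appends rows as [date, title, link] with no header line,
# so read the file with header=None and supply labels ourselves.
df = pd.read_csv("FT新闻.csv", header=None, names=["date", "title", "link"])
print(df.head())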
Source Code
import requests
from bs4 import BeautifulSoup
import pandas as pd
import re
def fetchUrl(url):
    '''
    Fetch the page at url and return its HTML text.
    '''
    header = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.102 Safari/537.36',
    }
    r = requests.get(url, headers=header)
    r.raise_for_status()  # fail loudly on HTTP errors instead of parsing an error page
    r.encoding = r.apparent_encoding  # let requests guess the real encoding
    return r.text
def parseHtml(html):
    '''
    Parse the homepage HTML and return a list of [title, link] pairs.
    '''
    bsObj = BeautifulSoup(html, "lxml")
    # Every headline on the homepage sits in an <h2 class="item-headline">;
    # the slice below deliberately skips the last four headline blocks.
    temp = bsObj.find_all("h2", attrs={"class": "item-headline"})
    retData = []
    for item in temp[0:-4]:
        a = item.find_all("a")[-1]
        # Keep only links that point at actual article pages.
        if "premium" not in a['href'] and "story" not in a['href'] and "interactive" not in a['href']:
            continue
        link = "https://cn.ft.com" + a["href"]
        title = a.text
        retData.append([title, link])
    return retData
def getDateTime(url):
    '''
    Fetch an article page and extract its publication date.
    '''
    html = fetchUrl(url)
    bsObj = BeautifulSoup(html, "lxml")
    span = bsObj.find("span", attrs={"class": "story-time"})
    if span:
        # Dates appear on the page in the form "2020年9月16日".
        match = re.search(r"(\d+年\d+月\d+日)", span.text)
        return match.group(1) if match else ""
    else:
        # No date on this page: retry variant URLs. A plain URL first gets
        # "?exclusive" appended; if that also lacks a date, "exclusive" is
        # swapped for "archive"; if the archive variant still has none, give up.
        if "archive" in url:
            print("no date time")
            return ""
        elif "exclusive" in url:
            url = url.replace("exclusive", "archive")
            return getDateTime(url)
        else:
            url = url + "?exclusive"
            return getDateTime(url)
def saveData(data, filename, mode):
    '''
    Write (or append, depending on mode) rows to a CSV file with no header line.
    '''
    dataframe = pd.DataFrame(data)
    dataframe.to_csv(filename, mode=mode, index=False, sep=',', header=False, encoding="utf_8_sig")
if __name__ == "__main__":
    print("Crawler started")
    url = "https://cn.ft.com/"
    html = fetchUrl(url)
    data = parseHtml(html)
    # Stage 1: save the [title, link] list to a temporary file.
    saveData(data, "temp.csv", 'w')
    print("Temporary file saved")
    print("Fetching article details...")
    # temp.csv is written without a header row, so header=None keeps
    # pandas from swallowing the first article as column names.
    df = pd.read_csv('temp.csv', header=None)
    for index, title, link in df.itertuples():
        print("Fetching:", link)
        date = getDateTime(link)
        print(date, title, link)
        saveData([[date, title, link]], "FT新闻.csv", 'a')
        print("----" * 20)  # visual separator between articles