使用指南
- 本代码用于爬取 Now新闻 网站新闻列表页(https://news.now.com/home/local)上的新闻文章,每条记录包含发布日期、新闻标题和新闻链接三项数据。
- 请在 Python3 环境下运行本代码,并确保运行环境中已安装 requests、pandas、beautifulsoup4 和 lxml 库(代码使用 BeautifulSoup 的 "lxml" 解析器)。
- 运行结果保存为 "Now新闻.csv" 文件,默认位于运行脚本时的当前工作目录下(如有需要,可以修改代码中 filename 的值来设置文件名和存储路径)。结果以追加方式写入,重复运行不会覆盖文件中已有的内容。
- 使用此爬虫前,请确保您的网络可以正常访问 Now新闻 网站,否则爬虫运行会报错失败。
- 本爬虫仅供学习交流使用,请勿用于商业用途。
源码
import requests
import json
from bs4 import BeautifulSoup
import pandas as pd
import time
def fetchUrl(url, timeout=10):
    """Download a page and return its body as text.

    Args:
        url: Address of the page to request.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, requests waits indefinitely and a stalled connection
            would freeze the whole crawl.

    Returns:
        The response body, decoded with the encoding that requests detects
        from the content itself.

    Raises:
        requests.exceptions.RequestException: If the connection fails or the
            timeout expires.
    """
    # Browser-like headers sent with every request.
    header = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.102 Safari/537.36',
    }
    r = requests.get(url, headers=header, timeout=timeout)
    # Decode using the encoding guessed from the body rather than the one
    # declared in the HTTP headers.
    r.encoding = r.apparent_encoding
    return r.text
def _extractNewsEntry(item, today):
    """Turn one news anchor tag into a [date, title, link] row, or None if it is not a usable news item."""
    href = item.get("href")
    titleTag = item.find("div", attrs={"class": "newsTitle"})
    if not href or titleTag is None:
        # Not every anchor inside the containers is a news item; skip the
        # ones that lack a link target or a title instead of crashing.
        return None
    timeTag = item.find("div", attrs={"class": "newsTime"})
    dateTemp = timeTag.text if timeTag is not None else ""
    # A timestamp without a year (or no timestamp at all) is treated as
    # "today"; a timestamp that contains a year is kept verbatim.
    date = dateTemp if "年" in dateTemp else today
    return [date, titleTag.text, "https://news.now.com" + href]


def parseHtml(html):
    """Extract the news entries from one Now News list page.

    Entries are read from the "focusNews" container first and then from the
    "newsList" container, both looked up inside the "newsCategoryColLeft"
    div. Each extracted entry is also printed.

    Args:
        html: HTML source of the page.

    Returns:
        A list of ``[date, title, link]`` lists in page order. The list is
        empty when the page has no "newsCategoryColLeft" container or no
        usable news anchors.
    """
    bsObj = BeautifulSoup(html, "lxml")
    news = bsObj.find("div", attrs={"class": "newsCategoryColLeft"})
    retData = []
    if news is None:
        # Page without the news column (e.g. past the last page).
        return retData

    # Computed once so every year-less timestamp on the page maps to the
    # same date string.
    today = time.strftime("%Y{0}%m{1}%d{2}", time.localtime()).format("年", "月", "日")

    focus = news.find('div', attrs={"class": "focusNews"})
    focusItems = focus.find_all("a") if focus is not None else []

    newsList = news.find("div", attrs={"class": "newsList"})
    listItems = (
        newsList.find_all("a", attrs={"class": "newsWrap"})
        if newsList is not None
        else []
    )

    # Both sections share the same per-item logic, so focus items keep the
    # site's own date when it carries a year, just like list items do.
    for item in [*focusItems, *listItems]:
        entry = _extractNewsEntry(item, today)
        if entry is None:
            continue
        print(*entry)
        retData.append(entry)
    return retData
def saveData(data, filename):
    """Append rows to a CSV file.

    Args:
        data: Rows to write; each row is a sequence of cell values.
        filename: Path of the CSV file. It is opened in append mode, so
            earlier content is kept.

    The rows are written comma-separated, without a header line and without
    the DataFrame index column.
    """
    pd.DataFrame(data).to_csv(
        filename,
        sep=',',
        mode='a',
        header=False,
        index=False,
    )
def spiderManager(totalPage, filename, startPage=1):
    """Crawl consecutive list pages and append their entries to a CSV file.

    Args:
        totalPage: Number of the last page to crawl. A value below 1 means
            "no limit": crawling continues until a page yields no entries.
        filename: Path of the CSV file the entries are appended to.
        startPage: Number of the first page to crawl. Defaults to 1; pass a
            higher value to resume an interrupted crawl.
    """
    crawlAll = totalPage < 1
    page = startPage
    while crawlAll or page <= totalPage:
        url = "https://news.now.com/home/local?pageNo={0}".format(page)
        html = fetchUrl(url)
        data = parseHtml(html)
        saveData(data, filename)
        print("第", page, "页爬取完成")
        # Only the unlimited mode stops on an empty page; a bounded crawl
        # always visits every page up to totalPage.
        if crawlAll and not data:
            break
        page += 1
if __name__ == "__main__":
    # totalPage: a value below 1 makes spiderManager keep requesting pages
    # until one comes back without entries.
    # filename: CSV file the results are appended to; a bare file name is
    # resolved against the current working directory.
    totalPage, filename = 0, "Now新闻.csv"
    spiderManager(totalPage=totalPage, filename=filename)