python爬虫实战:爬取Now新闻网站首页新闻

272 阅读 · 约 2 分钟

使用指南

  1. 本代码用于爬取 Now新闻 新闻网站首页的新闻文章(包括新闻标题,新闻链接,和发布日期三个数据)
  2. Python3 环境下运行本代码,同时保证运行环境中安装有 requests、pandas、beautifulsoup4 和 lxml 库。
  3. 运行结果保存为 "Now新闻.csv" 文件,路径位于脚本同路径下(如有需要可以修改代码中 filename 的值,设置文件名和存储路径)
  4. 使用此爬虫前,请确保您的网络可以正常访问 Now新闻 网站,否则爬虫运行会报错失败。
  5. 本爬虫仅供学习交流使用,请勿用于商业用途。

源码

import requests
import json
from bs4 import BeautifulSoup
import pandas as pd
import time

def fetchUrl(url):
    """Fetch *url* and return the response body as text.

    Sends a desktop-browser User-Agent so the site serves the normal
    HTML page, and lets requests guess the correct character encoding
    from the content before decoding.

    :param url: absolute URL of the page to download.
    :return: decoded response body (str).
    :raises requests.RequestException: on network failure, timeout,
        or a non-2xx HTTP status.
    """
    header = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.102 Safari/537.36',
    }

    # timeout prevents the crawler from hanging forever on a dead
    # connection; raise_for_status fails fast on HTTP errors instead of
    # handing an error page to the parser.
    r = requests.get(url, headers=header, timeout=30)
    r.raise_for_status()
    r.encoding = r.apparent_encoding
    return r.text

def _parseNewsItems(anchors, fallback_date):
    """Extract [date, title, link] triples from a list of <a> tags.

    :param anchors: iterable of bs4 Tag objects, each wrapping one news
        entry with .newsTitle and .newsTime child divs.
    :param fallback_date: date string to use when the scraped time has
        no "年" in it (the site shows only a clock time for today's news).
    :return: list of [date, title, link] lists.
    """
    results = []
    for item in anchors:
        link = "https://news.now.com" + item['href']
        title = item.find("div", attrs={"class": "newsTitle"}).text
        dateTemp = item.find("div", attrs={"class": "newsTime"}).text

        # Entries from today carry only a time (no "年"); substitute
        # today's date so every row has a full date.
        date = dateTemp if "年" in dateTemp else fallback_date

        print(date, title, link)
        results.append([date, title, link])
    return results

def parseHtml(html):
    """Parse one listing page and return its news entries.

    Scans both the "focus" (featured) area and the regular news list
    inside the left column.

    :param html: page source returned by fetchUrl.
    :return: list of [date, title, link] lists; empty when the page has
        no news column (lets the auto-paging loop terminate cleanly).
    """
    bsObj = BeautifulSoup(html, "lxml")
    news = bsObj.find("div", attrs={"class": "newsCategoryColLeft"})
    if news is None:
        return []

    # Compute today's date once, formatted like the site ("2020年1月1日").
    today = time.strftime("%Y{0}%m{1}%d{2}", time.localtime()).format("年", "月", "日")

    retData = []

    focus = news.find('div', attrs={"class": "focusNews"})
    if focus:
        retData.extend(_parseNewsItems(focus.find_all("a"), today))

    newsList = news.find("div", attrs={"class": "newsList"})
    if newsList:
        retData.extend(
            _parseNewsItems(newsList.find_all("a", attrs={"class": "newsWrap"}), today)
        )

    return retData

def saveData(data, filename):
    """Append scraped rows to a CSV file.

    Each element of *data* is one row ([date, title, link]). Rows are
    appended without a header so repeated calls accumulate into the
    same file.

    :param data: list of row lists.
    :param filename: path of the target CSV file.
    """
    pd.DataFrame(data).to_csv(filename, mode='a', index=False, sep=',', header=False)

def spiderManager(totalPage, filename):
    """Drive the crawl: fetch, parse, and save each listing page.

    :param totalPage: number of pages to crawl; any value < 1 means
        "auto": keep crawling from page 1 until a page yields no news.
    :param filename: CSV file that results are appended to.
    """
    if totalPage < 1:
        # Auto mode: walk pages from the beginning until an empty page
        # signals we've run past the last one.
        # (Original code started at page 28 — leftover debug value that
        # silently skipped pages 1-27.)
        page = 1
        while True:
            url = "https://news.now.com/home/local?pageNo={0}".format(page)
            html = fetchUrl(url)
            data = parseHtml(html)
            saveData(data, filename)
            print("第", page, "页爬取完成")
            if len(data) == 0:
                break
            page += 1
    else:
        for page in range(1, totalPage + 1):
            url = "https://news.now.com/home/local?pageNo={0}".format(page)
            html = fetchUrl(url)
            data = parseHtml(html)
            saveData(data, filename)
            print("第", page, "页爬取完成")

if __name__ == "__main__":
    # Crawl the "Now新闻" local-news section.
    totalPage = 0              # 0 => auto mode: crawl until an empty page
    filename = "Now新闻.csv"   # output CSV path; edit to change destination
    spiderManager(totalPage, filename)