python爬虫实战:爬取Now新闻网站首页新闻

272 阅读 · 约 2 分钟

使用指南

  1. 本代码用于爬取 Now新闻 新闻网站首页的新闻文章(包括新闻标题,新闻链接,和发布日期三个数据)
  2. Python3 环境下运行本代码,同时保证运行环境中安装有 requests、pandas、beautifulsoup4 和 lxml 库。
  3. 运行结果保存为 "Now新闻.csv" 文件,路径位于脚本同路径下(如有需要可以修改代码中 filename 的值,设置文件名和存储路径)
  4. 使用此爬虫前,请确保您的网络可以正常访问 Now新闻 网站,否则爬虫运行会报错失败。
  5. 本爬虫仅供学习交流使用,请勿用于商业用途。

源码

import requests
import json
from bs4 import BeautifulSoup
import pandas as pd
import time

def fetchUrl(url):
    """Fetch *url* and return the response body as text.

    Sends a desktop-browser User-Agent so the site serves the normal
    HTML page, and lets requests guess the correct character encoding
    from the content before decoding.

    :param url: absolute URL of the page to download.
    :return: decoded response body (str).
    :raises requests.RequestException: on network failure, timeout,
        or a non-2xx HTTP status.
    """
    header = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.102 Safari/537.36',
    }

    # timeout prevents the crawler from hanging forever on a dead
    # connection; raise_for_status fails fast on HTTP errors instead of
    # handing an error page to the parser.
    r = requests.get(url, headers=header, timeout=30)
    r.raise_for_status()
    r.encoding = r.apparent_encoding
    return r.text

def _parseNewsItems(anchors, fallback_date):
    """Extract [date, title, link] triples from a list of <a> tags.

    :param anchors: iterable of bs4 Tag objects, each wrapping one news
        entry with .newsTitle and .newsTime child divs.
    :param fallback_date: date string to use when the scraped time has
        no "年" in it (the site shows only a clock time for today's news).
    :return: list of [date, title, link] lists.
    """
    results = []
    for item in anchors:
        link = "https://news.now.com" + item['href']
        title = item.find("div", attrs={"class": "newsTitle"}).text
        dateTemp = item.find("div", attrs={"class": "newsTime"}).text

        # Entries from today carry only a time (no "年"); substitute
        # today's date so every row has a full date.
        date = dateTemp if "年" in dateTemp else fallback_date

        print(date, title, link)
        results.append([date, title, link])
    return results

def parseHtml(html):
    """Parse one listing page and return its news entries.

    Scans both the "focus" (featured) area and the regular news list
    inside the left column.

    :param html: page source returned by fetchUrl.
    :return: list of [date, title, link] lists; empty when the page has
        no news column (lets the auto-paging loop terminate cleanly).
    """
    bsObj = BeautifulSoup(html, "lxml")
    news = bsObj.find("div", attrs={"class": "newsCategoryColLeft"})
    if news is None:
        return []

    # Compute today's date once, formatted like the site ("2020年1月1日").
    today = time.strftime("%Y{0}%m{1}%d{2}", time.localtime()).format("年", "月", "日")

    retData = []

    focus = news.find('div', attrs={"class": "focusNews"})
    if focus:
        retData.extend(_parseNewsItems(focus.find_all("a"), today))

    newsList = news.find("div", attrs={"class": "newsList"})
    if newsList:
        retData.extend(
            _parseNewsItems(newsList.find_all("a", attrs={"class": "newsWrap"}), today)
        )

    return retData

def saveData(data, filename):
    """Append scraped rows to a CSV file.

    Each element of *data* is one row ([date, title, link]). Rows are
    appended without a header so repeated calls accumulate into the
    same file.

    :param data: list of row lists.
    :param filename: path of the target CSV file.
    """
    pd.DataFrame(data).to_csv(filename, mode='a', index=False, sep=',', header=False)

def spiderManager(totalPage, filename):
    """Drive the crawl: fetch, parse, and save each listing page.

    :param totalPage: number of pages to crawl; any value < 1 means
        "auto": keep crawling from page 1 until a page yields no news.
    :param filename: CSV file that results are appended to.
    """
    if totalPage < 1:
        # Auto mode: walk pages from the beginning until an empty page
        # signals we've run past the last one.
        # (Original code started at page 28 — leftover debug value that
        # silently skipped pages 1-27.)
        page = 1
        while True:
            url = "https://news.now.com/home/local?pageNo={0}".format(page)
            html = fetchUrl(url)
            data = parseHtml(html)
            saveData(data, filename)
            print("第", page, "页爬取完成")
            if len(data) == 0:
                break
            page += 1
    else:
        for page in range(1, totalPage + 1):
            url = "https://news.now.com/home/local?pageNo={0}".format(page)
            html = fetchUrl(url)
            data = parseHtml(html)
            saveData(data, filename)
            print("第", page, "页爬取完成")

if __name__ == "__main__":
    # Crawl the "Now新闻" local-news section.
    totalPage = 0              # 0 => auto mode: crawl until an empty page
    filename = "Now新闻.csv"   # output CSV path; edit to change destination
    spiderManager(totalPage, filename)