Python Web Scraping in Practice: Scraping News from Each Section of the Bloomberg Website

Usage Guide

  1. This code scrapes news articles from the Markets, Technology, and Politics sections of the Bloomberg news site, collecting three fields for each article: headline, link, and publication date.
  2. Run the code under Python 3, and make sure the requests, beautifulsoup4 (with the lxml parser), and pandas libraries are installed in your environment.
  3. The results are saved to files such as "彭博社-市场.csv" in the same directory as the script (if needed, change the value of filename in the code to set a different file name or path); a sketch for reading the output back is shown after this list.
  4. Before using this scraper, make sure your network can reach the Bloomberg website; otherwise the scraper will fail with an error.
  5. This scraper is for learning and exchange only; do not use it for commercial purposes.
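
Each row in the output CSV has the form date, title, link, and the files are written without a header row. A minimal sketch for loading one of the result files back into pandas, assuming the default filename used by the script:

import pandas as pd

# The scraper appends rows without a header, so column names are supplied here.
df = pd.read_csv("彭博社-市场.csv", names=["date", "title", "link"])
print(df.head())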

Source Code

import requests
from bs4 import BeautifulSoup
import pandas as pd

def fetchUrl(url):
    """Fetch a Bloomberg page and return its HTML text, sending browser-like headers."""
    header = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.102 Safari/537.36',
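        # Note: this long cookie string appears to be copied from a real browser session and
        # may expire; if requests start failing, replace it with a cookie from your own browser.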
        'cookie': 'bb_geo_info={"country":"JP","region":"Asia"}|1600828222929; _sp_v1_uid=1:397:c14bd1c3-c218-412b-9ad3-6e152f20f8ab; _sp_krux=false; _sp_v1_ss=1:H4sIAAAAAAAAAItWqo5RKimOUbLKK83J0YlRSkVil4AlqmtrlXSIUxYLAPL1gztPAAAA; _sp_v1_lt=1:; _sp_v1_csv=null; _sp_v1_opt=1:; _gcl_au=1.1.1202405454.1600223424; ccpaUUID=3506961d-7f44-45de-99e3-76229247ab44; dnsDisplayed=true; ccpaApplies=true; signedLspa=false; bbgconsentstring=req1fun1pad1; bdfpc=004.0814204529.1600223424498; _pxvid=935c1a81-f7c4-11ea-9707-0242ac120003; _ga=GA1.2.1568133537.1600223425; agent_id=62ab111a-5ded-4aa4-a35b-fd7129dd5cb5; session_id=a9ba102d-fa7e-4ad1-8139-ac383d58d619; session_key=6b930652995b2412eee5055aeddf8f47b3ba9887; _scid=1f36a350-c59b-4321-864e-ad824ff7b426; _rdt_uuid=1600223426592.3d1f38f3-e364-410f-a765-bf86bed0a76c; _lc2_fpi=b1166d620485--01ejabzeej0e9vzx4bxmvzp6tx; trc_cookie_storage=taboola%2520global%253Auser-id%3Dfc7ce04f-abf8-476d-8f23-83366e85988a-tuct65afe42; __tbc=%7Bjzx%7Dt_3qvTkEkvt3AGEeiiNNgDYe72zPo2kvHOyNauNbF4nyPa1LTgjyYoKx3OGw1vXQ6cR9r5GWrSDHCk1OJGDN6CsMRdey2Wd6ZjWFpI4s8EpgcHEw38aKYcCxC2teAeA2jylU0Z664w9lha1BgkmqDg; __pat=-14400000; _cc_id=4bed7f99ce978bd1b0a3906ddf151d07; _parsely_visitor={%22id%22:%22pid=b68901e16f9442487dace3d7302f15e8%22%2C%22session_count%22:1%2C%22last_session_ts%22:1600223429185}; _fbp=fb.1.1600223429312.932979589; __gads=ID=b572c647eedf5c2a:T=1600223429:S=ALNI_Mba5w7Z59_BDKe_GxOCt4LoykLOiA; ntv_as_us_privacy=1YNN; _pxhd=e72ebc4e4f8b5102fee8e3810d10e168df8b8a820a7412757715636378856b30:935c1a81-f7c4-11ea-9707-0242ac120003; _pxff_cc=U2FtZVNpdGU9TGF4Ow==; _reg-csrf=s%3A8eZRK6hF4wvyUumJoCyX6rm2.apCWeFZDBQeCb5zLaZxqFYZaxMhDy46VJY7uO61xi7A; _reg-csrf-token=YZCvRQ0M-EUnIOme4mOvJXAj4p2DAWSM-pI8; _user-status=anonymous; _px2=eyJ1IjoiYmNhYjFkYzAtZmE1MC0xMWVhLWJlYjUtZWZlZjdjNTNmYjU3IiwidiI6IjkzNWMxYTgxLWY3YzQtMTFlYS05NzA3LTAyNDJhYzEyMDAwMyIsInQiOjE2MDA1MDM4MjkyMjEsImgiOiJhNzgwODcwMDZjMDlmMGQwNjI3MWVjZjZkMDlkMzFiYWVkZjg5ZTdhOTY5YzA0MmEzYzgzOGJjMjYzZTgwNzlkIn0=; _px3=884b03989119a696719937752f63bf6d58015bde6feecdc601e60f138239adf3:YxfPGxgeEz4KjZ73zGab76NNgsuzgkpAqSa6nKe3WMvUh1YXbPmojwYRI7P1svP+xGecY47w2bnbKPya9pg5GA==:1000:7gnIRfPj8aAqVN7FOt0gE0BddMGbs9Kqqcp6j3NyfE94YxDcglwy47M6VcLh32BsqrRWa1IRrzdYwhqcoS3jUS5PFopilDyiD9CpnCAJgru2wZNkmmqHt4S0aV9Ar54DdVdb1GdvChLuUBdU/zjeyEBllScpox1qXS2z+wsfipY=; _uetsid=69efb01f728ca30514d21efc7f9129e7; _uetvid=797700e40a7f5d30dd179e183cfc0e5b; _sp_v1_data=2:197212:1600223423:0:5:0:5:0:0:_:-1; consentUUID=6ba60b60-d99e-4526-ad97-06c57c8d530b; _user_newsletters=[]; _gid=GA1.2.1436585998.1600503529; _dc_gtm_UA-11413116-1=1; _pxde=21381760573e2e2252f13b31d29fd606c5cc221121edef833ba7f973f8ae138b:eyJ0aW1lc3RhbXAiOjE2MDA1MDM1MzI5OTYsImZfa2IiOjAsImlwY19pZCI6W119; _tb_sess_r=https%3A//www.bloomberg.com/markets; _tb_t_ppg=https%3A//www.bloomberg.com/vendors/taboola_loader.html%3Fdisplay%3Dfluid; bbAbVisits=; __pvi=%7B%22id%22%3A%22v-2020-09-19-16-18-50-708-xwp7L5czwmMvEYf0-2d048d09475a9dcc68707e4b4507aa5f%22%2C%22domain%22%3A%22.bloomberg.com%22%2C%22time%22%3A1600503531309%7D; xbc=%7Bjzx%7DMuDMa2ET_r5g5YJ0lQI5QtEh6HAPYkVYEikJhSeIt8bwQyPvA_7eHdBVcKnu23CChubgD23gBW3oJRkIysZB_b0IyvxJVUcfpP_OVG_NEvGyxRDe6cP6PPc9V8loDljJq_5kf0cQsg96yrR3f6NKrhrtLoauHmthgQ6nMmJm2o6m2UL5pL2FmUYYm__-pzhfCpyYbafGiT_BN-sSE9YgoBsmq5r81KfFmlJCDciwUCPrfJzjk8RyPF2uGgbxH89oyy_Cc1dZJtoDZUdcm6ZoV1BpSUFOv-TweTbgWlyLRsqDTsDIvIAIxlvU_8IdvUrLQgrRqs7ClJHp66NnJg9pLZ-vfndEE6_ZpQeA1Kli7FQ; __sppvid=50253da5-0d79-493c-9f8e-1cc7208ab57e; _li_dcdm_c=.bloomberg.com; _gat_UA-11413116-1=1'
    }

    r = requests.get(url, headers=header)
    r.encoding = r.apparent_encoding   # let requests use the detected encoding
    return r.text
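
# Optional: a sketch (not called anywhere in this script) of a more defensive fetch with a
# request timeout and an explicit status check; the name fetchUrlSafe is hypothetical.
def fetchUrlSafe(url, headers, timeout=10):
    r = requests.get(url, headers=headers, timeout=timeout)
    r.raise_for_status()               # fail loudly on 403/404/5xx instead of parsing an error page
    r.encoding = r.apparent_encoding
    return r.text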

def parseHtml(html):
    """Parse a section page and return a list of [date, title, link] rows."""
    bsObj = BeautifulSoup(html, "lxml")

    retData = []

    # Headline links inside the "story package" modules at the top of the page
    temp1 = bsObj.find_all("a", attrs={"class": "story-package-module__story__headline-link"})
    for item in temp1:
        link = "https://www.bloomberg.com" + item["href"]
        title = item.text.replace("\n", "").strip()
        # hrefs typically look like /news/articles/<date>/<slug>, so the 4th path segment is the date
        date = item["href"].split("/")[3]
        print(date, title, link)
        retData.append([date, title, link])

    # Headline links in the regular story list further down the page
    temp2 = bsObj.find_all("a", attrs={"class": "story-list-story__info__headline-link"})
    for item in temp2:
        link = "https://www.bloomberg.com" + item["href"]
        title = item.text.replace("\n", "").strip()
        date = item["href"].split("/")[3]
        print(date, title, link)
        retData.append([date, title, link])

    return retData

def saveData(data, filename):
    """Append the scraped rows to a CSV file (comma-separated, no header row)."""
    dataframe = pd.DataFrame(data)
    dataframe.to_csv(filename, mode='a', index=False, sep=',', header=False)
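
# Optional: a sketch (not used by the script) of a save variant that writes column names on
# the first run and uses a BOM-friendly encoding so Excel opens the file cleanly; the name
# saveDataWithHeader and the utf-8-sig choice are assumptions, not part of the original code.
def saveDataWithHeader(data, filename):
    import os
    dataframe = pd.DataFrame(data, columns=["date", "title", "link"])
    write_header = not os.path.exists(filename)   # only emit the header once
    dataframe.to_csv(filename, mode='a', index=False, header=write_header, encoding='utf-8-sig')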


if __name__ == "__main__":

    # Scrape the Markets, Technology, and Politics sections in turn

    url = "https://www.bloomberg.com/markets"
    html = fetchUrl(url)
    data = parseHtml(html)
    saveData(data, "彭博社-市场.csv")

    url = "https://www.bloomberg.com/technology"
    html = fetchUrl(url)
    data = parseHtml(html)
    saveData(data, "彭博社-技术.csv")

    url = "https://www.bloomberg.com/politics"
    html = fetchUrl(url)
    data = parseHtml(html)
    saveData(data, "彭博社-政治.csv")
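
The three section blocks above differ only in the URL and the output filename, so the entry point can also be written as a loop over the sections. A minimal sketch, assuming the same fetchUrl, parseHtml, and saveData functions; the 3-second pause between requests is an arbitrary politeness choice, not something the original script does:

import time

sections = {
    "https://www.bloomberg.com/markets": "彭博社-市场.csv",
    "https://www.bloomberg.com/technology": "彭博社-技术.csv",
    "https://www.bloomberg.com/politics": "彭博社-政治.csv",
}

for url, filename in sections.items():
    saveData(parseHtml(fetchUrl(url)), filename)
    time.sleep(3)   # small pause between section requests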