Python Programming 27: Example 3, a Targeted Stock Data Crawler
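In short: the script scrapes the list of A-share stock codes from the East Money quote page, then visits each stock's detail page on Baidu Gupiao, parses the stock name and the dt/dd key-value fields, and appends each record as a dict to a local text file.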

Source code:

Before optimization:

```python
# CrawBaiduStocksA.py
import requests
from bs4 import BeautifulSoup
import traceback
import re

def getHTMLText(url):
    """Fetch a page; return its text, or "" on any failure."""
    try:
        r = requests.get(url)
        r.raise_for_status()
        r.encoding = r.apparent_encoding  # guess the charset from the content
        return r.text
    except:
        return ""

def getStockList(lst, stockURL):
    """Extract stock codes from every <a> href on the list page."""
    html = getHTMLText(stockURL)
    soup = BeautifulSoup(html, 'html.parser')
    a = soup.find_all('a')
    for i in a:
        try:
            href = i.attrs['href']
            lst.append(re.findall(r"[s][hz]\d{6}", href)[0])
        except:
            continue  # no href, or no stock code in it: skip this link

def getStockInfo(lst, stockURL, fpath):
    """Fetch each stock's detail page, parse its fields, append to fpath."""
    for stock in lst:
        url = stockURL + stock + ".html"
        html = getHTMLText(url)
        try:
            if html == "":
                continue
            infoDict = {}
            soup = BeautifulSoup(html, 'html.parser')
            stockInfo = soup.find('div', attrs={'class': 'stock-bets'})

            name = stockInfo.find_all(attrs={'class': 'bets-name'})[0]
            infoDict.update({'股票名称': name.text.split()[0]})  # stock name

            # The page lists fields as <dt>key</dt><dd>value</dd> pairs.
            keyList = stockInfo.find_all('dt')
            valueList = stockInfo.find_all('dd')
            for i in range(len(keyList)):
                key = keyList[i].text
                val = valueList[i].text
                infoDict[key] = val

            with open(fpath, 'a', encoding='utf-8') as f:
                f.write(str(infoDict) + '\n')
        except:
            traceback.print_exc()
            continue

def main():
    stock_list_url = 'http://quote.eastmoney.com/stocklist.html'
    stock_info_url = 'https://gupiao.baidu.com/stock/'
    output_file = 'D:/BaiduStockInfo.txt'
    slist = []
    getStockList(slist, stock_list_url)
    getStockInfo(slist, stock_info_url, output_file)

main()
```
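The stock codes are pulled out of the hrefs with the regular expression `r"[s][hz]\d{6}"`: a literal `s`, then `h` or `z` (the `sh`/`sz` prefixes for Shanghai and Shenzhen), then exactly six digits. A minimal standalone check, using made-up hrefs of the kind found on the list page:

```python
import re

pattern = r"[s][hz]\d{6}"  # "s", then "h" or "z", then exactly 6 digits

# Hypothetical hrefs for illustration only
hrefs = [
    "http://quote.eastmoney.com/sh600000.html",  # should match sh600000
    "http://quote.eastmoney.com/sz000001.html",  # should match sz000001
    "http://quote.eastmoney.com/center.html",    # no code: no match
]

for href in hrefs:
    found = re.findall(pattern, href)
    print(href, "->", found[0] if found else "no match")
```

In the crawler, indexing `[0]` on an empty result raises `IndexError`, which the bare `except` turns into "skip this link".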


After optimization:
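Version B differs from version A in two ways. First, `getHTMLText` now takes an explicit encoding parameter: the East Money list page is known to be GB2312-encoded, so it is decoded directly instead of calling `r.apparent_encoding`, which has to scan the response bytes to guess the charset and is the main per-request cost in version A. Second, `getStockInfo` counts processed stocks and prints a carriage-return progress percentage, so long runs give visible feedback on a single console line.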

```python
# CrawBaiduStocksB.py
import requests
from bs4 import BeautifulSoup
import traceback
import re

def getHTMLText(url, code="utf-8"):
    """Fetch a page, decoding with a caller-supplied encoding (default utf-8)."""
    try:
        r = requests.get(url)
        r.raise_for_status()
        r.encoding = code  # set directly; skips the apparent_encoding guess
        return r.text
    except:
        return ""

def getStockList(lst, stockURL):
    """Extract stock codes from the GB2312-encoded list page."""
    html = getHTMLText(stockURL, "GB2312")
    soup = BeautifulSoup(html, 'html.parser')
    a = soup.find_all('a')
    for i in a:
        try:
            href = i.attrs['href']
            lst.append(re.findall(r"[s][hz]\d{6}", href)[0])
        except:
            continue  # no href, or no stock code in it: skip this link

def getStockInfo(lst, stockURL, fpath):
    """Fetch and parse each stock page, printing a progress percentage."""
    count = 0
    for stock in lst:
        url = stockURL + stock + ".html"
        html = getHTMLText(url)
        try:
            if html == "":
                continue
            infoDict = {}
            soup = BeautifulSoup(html, 'html.parser')
            stockInfo = soup.find('div', attrs={'class': 'stock-bets'})

            name = stockInfo.find_all(attrs={'class': 'bets-name'})[0]
            infoDict.update({'股票名称': name.text.split()[0]})  # stock name

            # The page lists fields as <dt>key</dt><dd>value</dd> pairs.
            keyList = stockInfo.find_all('dt')
            valueList = stockInfo.find_all('dd')
            for i in range(len(keyList)):
                key = keyList[i].text
                val = valueList[i].text
                infoDict[key] = val

            with open(fpath, 'a', encoding='utf-8') as f:
                f.write(str(infoDict) + '\n')
                count = count + 1
                # "当前进度" = current progress
                print("\r当前进度: {:.2f}%".format(count * 100 / len(lst)), end="")
        except:
            count = count + 1
            print("\r当前进度: {:.2f}%".format(count * 100 / len(lst)), end="")
            continue

def main():
    stock_list_url = 'http://quote.eastmoney.com/stocklist.html'
    stock_info_url = 'https://gupiao.baidu.com/stock/'
    output_file = 'D:/BaiduStockInfo.txt'
    slist = []
    getStockList(slist, stock_list_url)
    getStockInfo(slist, stock_info_url, output_file)

main()
```
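The `\r` trick in the progress line is worth isolating: `\r` returns the cursor to the start of the line and `end=""` suppresses the newline, so each `print` overwrites the previous percentage in place. A minimal sketch of the same technique:

```python
import time

total = 50
for count in range(1, total + 1):
    # \r moves the cursor back to column 0; end="" keeps us on the same line,
    # so each iteration overwrites the previous percentage.
    print("\r当前进度: {:.2f}%".format(count * 100 / total), end="")
    time.sleep(0.02)  # stand-in for the real per-stock work
print()  # final newline once the loop is done
```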
