Python书籍示例代码(第十章)

52 阅读1分钟
《Python语言程序设计基础(第2版)》嵩天 著
第十章
●10 - 1
获得一个HTML页面的通用代码

import requests


def getHTMLText(url):
    """Download a page and return its body decoded as UTF-8.

    Args:
        url: Address of the page to fetch.

    Returns:
        The response text, or an empty string when the request fails
        (connection problem, timeout, or an HTTP error status).
    """
    try:
        r = requests.get(url, timeout=30)
        r.raise_for_status()  # turn an HTTP error status into an exception
        r.encoding = 'utf-8'  # always decode as UTF-8, whatever the server declared
        return r.text
    except requests.RequestException:
        # Best-effort fetch: report any request failure as "no content"
        # without also swallowing KeyboardInterrupt or programming errors.
        return ""


# Demo: download the Baidu home page and dump its HTML to stdout.
url = "http://www.baidu.com"
page_text = getHTMLText(url)
print(page_text)
●10 - 2
中国大学排名爬虫

# e23.1CrawUnivRanking.py
import requests
from bs4 import BeautifulSoup

# Module-level accumulator of scraped table rows: fillUnivList appends one
# list of cell strings per <tr>, and printUnivList reads from it.
allUniv = []


def getHTMLText(url):
    """Download a page and return its body decoded as UTF-8.

    Args:
        url: Address of the page to fetch.

    Returns:
        The response text, or an empty string when the request fails
        (connection problem, timeout, or an HTTP error status).
    """
    try:
        r = requests.get(url, timeout=30)
        r.raise_for_status()  # turn an HTTP error status into an exception
        r.encoding = 'utf-8'  # always decode as UTF-8, whatever the server declared
        return r.text
    except requests.RequestException:
        # Best-effort fetch: report any request failure as "no content"
        # without also swallowing KeyboardInterrupt or programming errors.
        return ""


def fillUnivList(soup):
    """Append one row of cell strings to allUniv for every data row in soup.

    Args:
        soup: Parsed document exposing find_all(); every <tr> is examined.

    Rows without any <td> cell (e.g. header rows built from <th>) are
    skipped. Each stored row is the list of ``td.string`` values in
    document order.
    """
    for row in soup.find_all('tr'):
        cells = row.find_all('td')
        if cells:
            allUniv.append([cell.string for cell in cells])


def printUnivList(num):
    """Print a header line followed by at most ``num`` rows from allUniv.

    Args:
        num: Maximum number of rows to print. When fewer rows were
            collected (for instance because the download failed), only the
            available rows are printed instead of raising IndexError.
            Zero or a negative value prints the header only.

    Each printed row uses columns 0-3 and column 6 of the stored row.
    """
    row_format = "{:^4}{:^10}{:^5}{:^8}{:^10}"
    print(row_format.format("排名", "学校名称", "省市", "总分", "培养规模"))
    # max() keeps a negative num from turning into an "all but the last n" slice.
    for u in allUniv[:max(num, 0)]:
        print(row_format.format(u[0], u[1], u[2], u[3], u[6]))


def main():
    """Fetch the 2016 ranking page, collect its table rows, print the first 10."""
    ranking_page = getHTMLText('http://www.zuihaodaxue.cn/zuihaodaxuepaiming2016.html')
    fillUnivList(BeautifulSoup(ranking_page, "html.parser"))
    printUnivList(10)


# Entry point: executed at module load, so importing this file also runs the crawl.
main()

●10 - 3
百度关键词自动提交

# e24.1AutoKeywordSearch.py
import requests
from bs4 import BeautifulSoup
import re
import json


def getKeywordResult(keyword):
    """Submit ``keyword`` to Baidu search and return the result page HTML.

    Args:
        keyword: Search phrase. It is sent through ``params`` so that
            characters such as ``&``, ``#``, ``+`` or spaces are
            percent-encoded instead of corrupting the query string.

    Returns:
        The result page decoded as UTF-8, or an empty string when the
        request fails (connection problem, timeout, or an HTTP error status).
    """
    url = 'http://www.baidu.com/s'
    try:
        r = requests.get(url, params={'wd': keyword}, timeout=30)
        r.raise_for_status()  # turn an HTTP error status into an exception
        r.encoding = 'utf-8'
        return r.text
    except requests.RequestException:
        # Best-effort fetch: report any request failure as "no content"
        # without also swallowing KeyboardInterrupt or programming errors.
        return ""


def parserLinks(html):
    """Extract result titles from a search result page.

    Args:
        html: Markup of the result page.

    Returns:
        A list with the ``title`` entry of every <div> whose ``data-tools``
        attribute matches the pattern ``title``; the attribute value is
        decoded as JSON to obtain that entry.
    """
    document = BeautifulSoup(html, "html.parser")
    title_pattern = re.compile('title')
    tagged_divs = document.find_all('div', {'data-tools': title_pattern})
    # The attribute holds a JSON object; decode it and keep only its title.
    return [json.loads(node.attrs['data-tools'])['title'] for node in tagged_divs]


def main():
    """Search for the book title and print the result titles, numbered from 1."""
    page = getKeywordResult('Python语言程序设计基础(第2版)')
    for position, title in enumerate(parserLinks(page), start=1):
        print("[{:^3}]{}".format(position, title))


# Entry point: executed at module load, so importing this file also runs the search.
main()