《Python语言程序设计基础(第2版)》嵩天 著
第十章
●10 - 1
获得一个HTML页面的通用代码
import requests
def getHTMLText(url):
    """Fetch *url* and return its HTML text; return "" on any request failure.

    Bug fix: the original signature was ``def getHTMLText():`` (no parameter),
    yet the body read ``url`` and the caller passed one — calling it raised
    TypeError. The parameter is now declared.

    Args:
        url: address of the page to download.
    Returns:
        The decoded page text, or the empty string if the request fails.
    """
    try:
        r = requests.get(url, timeout=30)  # 30 s timeout so a dead host cannot hang us
        r.raise_for_status()               # turn HTTP error status codes into exceptions
        r.encoding = 'utf-8'
        return r.text
    except requests.RequestException:      # narrow catch: network/HTTP errors only
        return ""
# Demo driver: fetch the Baidu homepage and print its raw HTML.
url = "http://www.baidu.com"
print(getHTMLText(url))
●10 - 2
中国大学排名爬虫
import requests
from bs4 import BeautifulSoup
allUniv = []
def getHTMLText(url):
    """Fetch *url* and return its HTML text; return "" on any request failure.

    Args:
        url: address of the page to download.
    Returns:
        The decoded page text, or the empty string if the request fails.
    """
    try:
        r = requests.get(url, timeout=30)  # 30 s timeout so a dead host cannot hang us
        r.raise_for_status()               # turn HTTP error status codes into exceptions
        r.encoding = 'utf-8'
        return r.text
    except requests.RequestException:      # was a bare except: — now only request errors
        return ""
def fillUnivList(soup):
    """Collect every table row of *soup* into the global ``allUniv`` list.

    Each appended element is the list of cell strings for one university;
    rows without <td> cells (e.g. the header row) are skipped.
    """
    for row in soup.find_all('tr'):
        cells = row.find_all('td')
        if not cells:  # header / decorative rows carry no data cells
            continue
        allUniv.append([cell.string for cell in cells])
def printUnivList(num):
    """Print a header line plus the first *num* universities from ``allUniv``.

    Robustness fix: the original ``for i in range(num)`` raised IndexError
    whenever *num* exceeded the number of rows actually scraped; slicing
    caps the count at ``len(allUniv)`` instead.

    Args:
        num: maximum number of universities to print.
    """
    print("{:^4}{:^10}{:^5}{:^8}{:^10}".format("排名", "学校名称", "省市", "总分", "培养规模"))
    for u in allUniv[:num]:  # slice never over-runs the scraped data
        # columns: rank, name, province, total score, enrollment size (index 6)
        print("{:^4}{:^10}{:^5}{:^8}{:^10}".format(u[0], u[1], u[2], u[3], u[6]))
def main():
    """Download the 2016 ranking page, parse it, and show the top 10 rows."""
    ranking_url = 'http://www.zuihaodaxue.cn/zuihaodaxuepaiming2016.html'
    page = getHTMLText(ranking_url)
    parsed = BeautifulSoup(page, "html.parser")
    fillUnivList(parsed)
    printUnivList(10)
main()
●10 - 3
百度关键词自动提交
import requests
from bs4 import BeautifulSoup
import re
import json
def getKeywordResult(keyword):
    """Submit *keyword* to Baidu search and return the result page's HTML.

    Fixes: the original concatenated the keyword straight into the URL
    with no URL-encoding (spaces / CJK characters were sent raw); passing
    ``params=`` lets requests percent-encode it correctly. The bare
    ``except:`` is also narrowed to request errors.

    Args:
        keyword: the search phrase to submit.
    Returns:
        The decoded result page text, or "" if the request fails.
    """
    try:
        r = requests.get('http://www.baidu.com/s',
                         params={'wd': keyword},  # requests URL-encodes the value
                         timeout=30)
        r.raise_for_status()
        r.encoding = 'utf-8'
        return r.text
    except requests.RequestException:
        return ""
def parserLinks(html):
    """Return the result titles embedded in Baidu's ``data-tools`` attributes.

    Each matching <div> carries a JSON string in its ``data-tools``
    attribute; the 'title' field of that JSON is the search-result title.
    """
    soup = BeautifulSoup(html, "html.parser")
    hits = soup.find_all('div', {'data-tools': re.compile('title')})
    return [json.loads(hit.attrs['data-tools'])['title'] for hit in hits]
def main():
    """Search Baidu for the book title and print each result numbered from 1."""
    page = getKeywordResult('Python语言程序设计基础(第2版)')
    for rank, title in enumerate(parserLinks(page), start=1):
        print("[{:^3}]{}".format(rank, title))
main()