参与拿奖:本文已参与「新人创作礼」活动,一起开启掘金创作之路
ps:代码文末自取
1.运行效果
2.网页读取基础
Python 3.x 主要使用 urllib 库来读取网页内容!
3.网页内容读取方法
①使用简单的 urllib.request 来获取网页内容
import urllib
import urllib.request
def pa():
    """Open http://www.python.org and print its first 100 bytes, raw then decoded.

    Bug fix: the original called ``fp.read(100)`` twice, so the second print
    (labelled "decoded first 100 chars") actually showed bytes 100-199.
    Read once and reuse the buffer; ``with`` guarantees the connection is
    closed even if an exception is raised.
    """
    with urllib.request.urlopen(r'http://www.python.org') as fp:
        head = fp.read(100)
        # Raw (undecoded) first 100 bytes.
        print(head)
        # The same 100 bytes, decoded to text.
        print(head.decode())
②使用get方法来读取指定域名的网页内容
import urllib.parse
def pa1():
    """GET www.python.org with URL-encoded query parameters and print the
    first 100 characters of the decoded response.

    Cleanup: the original also built a second URL (musi-cal.com) into a local
    that was never used; that dead assignment is removed.
    """
    params = urllib.parse.urlencode({'spam': 1, 'eggs': 2, 'bacon': 0})
    url1 = "http://www.python.org?%s" % params
    with urllib.request.urlopen(url1) as f:
        print(f.read(100).decode('utf-8'))
③使用post方法提交参数并读取指定页面内容
def post():
    """POST three URL-encoded parameters to python.org and print the first
    100 characters of the decoded reply.
    """
    payload = urllib.parse.urlencode({'spam': 1, 'eggs': 2, 'bacon': 0})
    # urlopen requires the POST body as bytes, not str.
    encoded = payload.encode('ascii')
    with urllib.request.urlopen("http://www.python.org", encoded) as resp:
        print(resp.read(100).decode('utf-8'))
4.调用浏览器打开网页
import webbrowser
def web():
    """Open the Python homepage in the system's default web browser."""
    homepage = 'http://www.python.org'
    webbrowser.open(homepage)
5.域名解析
from urllib.parse import urlparse
def jiexi(url=r'http://www.CWi.nl:80/%7Eguido/Python.html'):
    """Parse *url*, print its port and hostname, and return them.

    Generalized: the demo address is now a defaulted parameter, so calling
    ``jiexi()`` behaves exactly as before while any URL can be inspected.

    Returns:
        (port, hostname) — port is an int or None when absent; urlparse
        lower-cases the hostname.
    """
    o = urlparse(url)
    # Port number (int, or None if the URL carries no explicit port).
    print(o.port)
    # Hostname, normalized to lower case by urlparse.
    print(o.hostname)
    return o.port, o.hostname
6.代码解读
①首先自定义一个爬取类:
import requests
from bs4 import BeautifulSoup
class spider:
②构造函数为: page:表示抓取页数
self.url = 'https://search.jd.com/Search?keyword=%E8%A3%A4%E5%AD%90&enc=utf-8&qrst=1&rt=1&stop=1&vt=2&offset=5&wq=%E8%A3%A4%E5%AD%90&page=' + str(page)
self.headers = {'User-Agent': 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'}
③初始化抓取页面 通过encoding函数设置解码格式,防止出现乱码
# Fetch the HTML source of one search-results page.
def get_html(self):
    """Download ``self.url`` and return the page's HTML as text."""
    response = requests.get(self.url, headers=self.headers)
    # Force UTF-8 so Chinese product names decode without mojibake.
    response.encoding = 'utf-8'
    page_source = response.text
    return page_source
④编写抓取函数 a.通过BeautifulSoup()函数使得抓取内容标准化; b.通过find_all(标签名,类名)函数抓取指定类的标签中的内容; c.通过select()函数选择指定子标签的内容
def get_information(self):
    """Parse the fetched page and print details of every product list item.

    Cleanup: the original also ran four ``find_all`` queries (p-img, p-price,
    p-name, em) whose results were never used; that dead work is removed.
    Prints (per item): name, pid, description, two image URLs, price.
    """
    html = self.get_html()
    soup = BeautifulSoup(html, 'lxml')
    # Each product sits in an <li class="gl-item"> element.
    for item in soup.find_all("li", class_="gl-item"):
        name = item.find("a", target="_blank").get("title")
        # Index [1]: the second matching <em> holds the description
        # (presumably the first <em> belongs to the price block — TODO confirm
        # against the live page markup).
        description = item.select("em:nth-of-type(1)")[1].text
        img1 = item.find("img").get('source-data-lazy-img')
        img2 = item.select("img:nth-of-type(1)")[1].get("data-lazy-img")
        price = item.find("i").text
        pid = item.get("data-pid")
        print("商品名称:", name)
        print("商品pid:", pid)
        print("商品描述:", description)
        print("图片1:", img1)
        print("图片2:", img2)
        print("价格:", price)
        print("\n")
⑤最后一个内部调用函数:
def main(self):
    """Entry point: fetch this instance's page and print its products."""
    self.get_information()
7.源代码
import requests
from bs4 import BeautifulSoup
class spider:
    """Scrape one page of JD.com search results for trousers (裤子).

    Each instance targets a single results page; call ``main()`` to fetch
    the page and print name/pid/description/image URLs/price per item.
    """

    def __init__(self, page):
        """Store the target search URL and request headers.

        Bug fix: the original listing omitted this ``def __init__`` line,
        leaving bare ``self.url = ...`` statements directly in the class
        body, which raise NameError when the class is defined.

        page: page number appended to the URL's ``page=`` parameter.
        """
        self.url = 'https://search.jd.com/Search?keyword=%E8%A3%A4%E5%AD%90&enc=utf-8&qrst=1&rt=1&stop=1&vt=2&offset=5&wq=%E8%A3%A4%E5%AD%90&page=' + str(page)
        # Deliberately old-style UA string from the original article.
        self.headers = {'User-Agent': 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'}

    def get_html(self):
        """Download the search page and return its HTML as text."""
        res = requests.get(self.url, headers=self.headers)
        # Force UTF-8 so Chinese product names decode without mojibake.
        res.encoding = 'utf-8'
        return res.text

    def main(self):
        """Entry point: fetch this instance's page and print its products."""
        self.get_information()

    def get_information(self):
        """Parse the page and print details of every product list item.

        Cleanup: four ``find_all`` queries (p-img, p-price, p-name, em)
        whose results were never used have been removed.
        """
        html = self.get_html()
        soup = BeautifulSoup(html, 'lxml')
        # Each product sits in an <li class="gl-item"> element.
        for item in soup.find_all("li", class_="gl-item"):
            name = item.find("a", target="_blank").get("title")
            # Index [1]: the second matching <em> holds the description
            # (presumably the first <em> belongs to the price block —
            # TODO confirm against the live page markup).
            description = item.select("em:nth-of-type(1)")[1].text
            img1 = item.find("img").get('source-data-lazy-img')
            img2 = item.select("img:nth-of-type(1)")[1].get("data-lazy-img")
            price = item.find("i").text
            pid = item.get("data-pid")
            print("商品名称:", name)
            print("商品pid:", pid)
            print("商品描述:", description)
            print("图片1:", img1)
            print("图片2:", img2)
            print("价格:", price)
            print("\n")
if __name__ == '__main__':
    # Bug fix: ``threading`` was used but never imported anywhere in the file.
    import threading

    threads = []
    for i in range(1, 2):
        # NOTE(review): i*2-2 yields 0 for i=1. The original comment claimed
        # pages map to odd numbers while AJAX requests use even ones —
        # confirm the intended page mapping against the site.
        page = i * 2 - 2
        # Bug fix: the class is named ``spider``; the original instantiated
        # a non-existent ``spiders``.
        t = threading.Thread(target=spider(page).main, args=[])
        threads.append(t)
    # Bug fix: the original start()ed and join()ed inside one loop, which
    # runs the "threads" strictly one after another. Start all first, then
    # wait for all, so they actually overlap.
    for t in threads:
        t.start()
    for t in threads:
        t.join()