Drawback: high memory usage. With 100 million web pages, the memory required is roughly 100,000,000 URLs × 50 characters × 2 bytes per character / 1024 / 1024 / 1024 ≈ 9 GB.

3. Hash each URL (with MD5 or similar) before saving it to the set. Advantage: cuts memory use several-fold; this is the approach Scrapy takes.

4. Use a bitmap or Bloom filter, mapping each visited URL to a single bit via hash functions. Bitmap advantage: 100 million URLs take only about 12 MB. Bloom filter advantage: improves on the bitmap by using multiple hash functions to reduce collisions.
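Method 4's bitmap idea can be sketched in a few lines of plain Python. This is a minimal illustration rather than part of the spider below: NUM_BITS, bit_for_url, and seen_before are hypothetical names, and only a single hash function is used, so distinct URLs can collide on the same bit (exactly the false-positive problem that the Bloom filter's multiple hash functions reduce).

import hashlib

NUM_BITS = 100_000_000  # 10^8 bits is about 12 MB, matching the figure above
bitmap = bytearray(NUM_BITS // 8)

def bit_for_url(url):
    # map the URL to a bit position with a single hash function (hypothetical helper)
    digest = hashlib.md5(url.encode('utf8')).digest()
    return int.from_bytes(digest[:8], 'big') % NUM_BITS

def seen_before(url):
    """Return True if the URL's bit is already set; otherwise set it and return False."""
    pos = bit_for_url(url)
    byte_index, mask = pos // 8, 1 << (pos % 8)
    if bitmap[byte_index] & mask:
        return True  # already visited, or a hash collision (false positive)
    bitmap[byte_index] |= mask
    return False

The four listings below apply these deduplication strategies to the same kongjie.com album spider: the first checks MongoDB for every photo-page URL, the second keeps an in-memory set of raw URLs, the third keeps a set of MD5 digests, and the fourth uses a scalable Bloom filter.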
import os
import re

import requests
from bs4 import BeautifulSoup
import pymongo

class KongjieSpider:
    def __init__(self):
        server = 'localhost'
        port = '27017'
        dbname = 'admin'
        user = 'admin'
        pwd = '123'
        uri = 'mongodb://' + user + ':' + pwd + '@' + server + ':' + port + '/' + dbname
        client = pymongo.MongoClient(uri)  # connect to MongoDB
        db = client['dbkongjie1805']  # select a database
        self.kongjie_collection = db['kongjie']  # select a collection inside the database

    def getUA(self):
        user_agent = 'Mozilla/5.0 (compatible; WOW64; MSIE 10.0; Windows NT 6.2)'
        headers = {'User-Agent': user_agent}
        return headers

    def parse_album_url(self, url):
        """Parse the album URLs out of a listing page, then enter each album and crawl its images."""
        headers = self.getUA()
        response = requests.get(url, headers=headers)
        soup = BeautifulSoup(response.text, 'lxml')
        people_list = soup.select('div.ptw > ul > li')
        for people in people_list:
            self.save_images_in_album(people.div.a['href'])
        # crawl the next listing page
        next_page = soup.select_one('a.nxt')
        if next_page:
            self.parse_album_url(next_page['href'])
        else:
            print('Download finished!')

    def save_images_in_album(self, album_url):
        """Enter a user's album on kongjie.com and save its images one by one."""
        headers = self.getUA()
        response = requests.get(album_url, headers=headers)
        soup = BeautifulSoup(response.text, 'lxml')
        ls = soup.select('ul.ptw.ml.mlp.cl > li')
        if len(ls) > 0:
            print('len', len(ls))
        for item in ls:
            # extract the URL of the photo page
            url = item.select_one('a')['href']
            # deduplication: skip this photo page if its URL is already in the database
            if self.kongjie_collection.find_one({'img_url': url}):
                print('This page has already been crawled')
            else:
                pat = re.compile(r'uid=(\d+)&.*?picid=(\d+)')
                matchObj = pat.search(url)
                uid = matchObj.group(1)
                picid = matchObj.group(2)
                print('uid:', uid)
                print('picid:', picid)
                # open the photo page and extract the src attribute of the image
                response = requests.get(url, headers=headers)
                soup1 = BeautifulSoup(response.text, 'lxml')
                img_url = soup1.select_one('div#photo_pic > a > img')['src']
                # download the image
                response = requests.get(img_url, headers=headers)
                imgName = './images/' + uid + picid + '.jpg'
                with open(imgName, 'wb') as file:
                    file.write(response.content)
                # record the URL as crawled (insert_one replaces the deprecated save())
                self.kongjie_collection.insert_one({'img_url': url})
        # follow the album's own pagination
        ls = soup.select('a.nxt')
        print('next_page:', len(ls))
        if len(ls) > 0:
            next_page_url = ls[0]['href']
            print('next_page_url:', next_page_url)
            self.save_images_in_album(next_page_url)

if __name__ == '__main__':
    os.makedirs('./images', exist_ok=True)  # make sure the output directory exists
    start_url = 'http://www.kongjie.com/home.php?mod=space&do=album&view=all&page=1'
    spider = KongjieSpider()
    spider.parse_album_url(start_url)
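The second version drops the per-URL MongoDB query and deduplicates with an in-memory Python set instead: membership tests are fast, but the raw URLs are held in memory and the seen-set is lost when the process exits.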
import os
import re

import requests
from bs4 import BeautifulSoup
import pymongo

class KongjieSpider:
    def __init__(self):
        server = 'localhost'
        port = '27017'
        dbname = 'admin'
        user = 'admin'
        pwd = '123'
        uri = 'mongodb://' + user + ':' + pwd + '@' + server + ':' + port + '/' + dbname
        client = pymongo.MongoClient(uri)  # connect to MongoDB (here a local instance)
        db = client['dbkongjie']  # select a database
        self.kongjie_collection = db['kongjie']  # kept from the first version; no longer used for dedup
        self.img_urls = set()  # initialize a set to record the photo-page URLs already crawled

    def getUA(self):
        user_agent = 'Mozilla/5.0 (compatible; WOW64; MSIE 10.0; Windows NT 6.2)'
        headers = {'User-Agent': user_agent}
        return headers

    def parse_album_url(self, url):
        """Parse the album URLs out of a listing page, then enter each album and crawl its images."""
        headers = self.getUA()
        response = requests.get(url, headers=headers)
        soup = BeautifulSoup(response.text, 'lxml')
        people_list = soup.select('div.ptw > ul > li')
        for people in people_list:
            self.save_images_in_album(people.div.a['href'])
        # crawl the next listing page
        next_page = soup.select_one('a.nxt')
        if next_page:
            self.parse_album_url(next_page['href'])
        else:
            print('Download finished!')

    def save_images_in_album(self, album_url):
        """Enter a user's album on kongjie.com and save its images one by one."""
        headers = self.getUA()
        response = requests.get(album_url, headers=headers)
        soup = BeautifulSoup(response.text, 'lxml')
        ls = soup.select('ul.ptw.ml.mlp.cl > li')
        if len(ls) > 0:
            print('len', len(ls))
        for item in ls:
            # extract the URL of the photo page
            url = item.select_one('a')['href']
            # deduplication: skip this photo page if its URL is already in the set
            if url in self.img_urls:
                print('This page has already been crawled')
            else:
                pat = re.compile(r'uid=(\d+)&.*?picid=(\d+)')
                matchObj = pat.search(url)
                uid = matchObj.group(1)
                picid = matchObj.group(2)
                print('uid:', uid)
                print('picid:', picid)
                # open the photo page and extract the src attribute of the image
                response = requests.get(url, headers=headers)
                soup1 = BeautifulSoup(response.text, 'lxml')
                img_url = soup1.select_one('div#photo_pic > a > img')['src']
                # download the image
                response = requests.get(img_url, headers=headers)
                imgName = './images/' + uid + picid + '.jpg'
                with open(imgName, 'wb') as file:
                    file.write(response.content)
                self.img_urls.add(url)  # add the URL to the seen-set
                print('count:', len(self.img_urls))
        # follow the album's own pagination
        ls = soup.select('a.nxt')
        print('next_page:', len(ls))
        if len(ls) > 0:
            next_page_url = ls[0]['href']
            print('next_page_url:', next_page_url)
            self.save_images_in_album(next_page_url)

if __name__ == '__main__':
    os.makedirs('./images', exist_ok=True)  # make sure the output directory exists
    start_url = 'http://www.kongjie.com/home.php?mod=space&do=album&view=all&page=1'
    spider = KongjieSpider()
    spider.parse_album_url(start_url)
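The third version stores 32-character MD5 hex digests in the set rather than the raw URLs, so the memory cost per entry is fixed regardless of URL length; this is the hash-then-set approach described in method 3 above.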
import os
import re
from hashlib import md5

import requests
from bs4 import BeautifulSoup
import pymongo

class KongjieSpider:
    def __init__(self):
        server = 'localhost'
        port = '27017'
        dbname = 'admin'
        user = 'admin'
        pwd = '123'
        uri = 'mongodb://' + user + ':' + pwd + '@' + server + ':' + port + '/' + dbname
        client = pymongo.MongoClient(uri)  # connect to MongoDB (here a local instance)
        db = client['dbkongjie']  # select a database
        self.kongjie_collection = db['kongjie']  # kept from the first version; no longer used for dedup
        self.img_urls = set()  # initialize a set to record MD5 digests of the URLs already crawled

    def getUA(self):
        user_agent = 'Mozilla/5.0 (compatible; WOW64; MSIE 10.0; Windows NT 6.2)'
        headers = {'User-Agent': user_agent}
        return headers

    def parse_album_url(self, url):
        """Parse the album URLs out of a listing page, then enter each album and crawl its images."""
        headers = self.getUA()
        response = requests.get(url, headers=headers)
        soup = BeautifulSoup(response.text, 'lxml')
        people_list = soup.select('div.ptw > ul > li')
        for people in people_list:
            self.save_images_in_album(people.div.a['href'])
        # crawl the next listing page
        next_page = soup.select_one('a.nxt')
        if next_page:
            self.parse_album_url(next_page['href'])
        else:
            print('Download finished!')

    def save_images_in_album(self, album_url):
        """Enter a user's album on kongjie.com and save its images one by one."""
        headers = self.getUA()
        response = requests.get(album_url, headers=headers)
        soup = BeautifulSoup(response.text, 'lxml')
        ls = soup.select('ul.ptw.ml.mlp.cl > li')
        if len(ls) > 0:
            print('len', len(ls))
        for item in ls:
            # extract the URL of the photo page
            url = item.select_one('a')['href']
            # deduplication: hash the URL and skip it if the digest is already in the set
            hash_md5 = md5(url.encode('utf8'))
            hash_str = hash_md5.hexdigest()
            if hash_str in self.img_urls:
                print('This page has already been crawled')
            else:
                pat = re.compile(r'uid=(\d+)&.*?picid=(\d+)')
                matchObj = pat.search(url)
                uid = matchObj.group(1)
                picid = matchObj.group(2)
                print('uid:', uid)
                print('picid:', picid)
                # open the photo page and extract the src attribute of the image
                response = requests.get(url, headers=headers)
                soup1 = BeautifulSoup(response.text, 'lxml')
                img_url = soup1.select_one('div#photo_pic > a > img')['src']
                # download the image
                response = requests.get(img_url, headers=headers)
                imgName = './images/' + uid + picid + '.jpg'
                with open(imgName, 'wb') as file:
                    file.write(response.content)
                self.img_urls.add(hash_str)  # add the digest to the seen-set
                print('count:', len(self.img_urls), hash_str)
        # follow the album's own pagination
        ls = soup.select('a.nxt')
        print('next_page:', len(ls))
        if len(ls) > 0:
            next_page_url = ls[0]['href']
            print('next_page_url:', next_page_url)
            self.save_images_in_album(next_page_url)

if __name__ == '__main__':
    os.makedirs('./images', exist_ok=True)  # make sure the output directory exists
    start_url = 'http://www.kongjie.com/home.php?mod=space&do=album&view=all&page=1'
    spider = KongjieSpider()
    spider.parse_album_url(start_url)
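The fourth version replaces the set with a ScalableBloomFilter from the pybloom_live package, trading a small configurable false-positive rate (error_rate=0.001 here) for a further drop in memory use.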
import os
import re

import requests
from bs4 import BeautifulSoup
import pymongo
from pybloom_live import ScalableBloomFilter

class KongjieSpider:
    def __init__(self):
        server = 'localhost'
        port = '27017'
        dbname = 'admin'
        user = 'admin'
        pwd = '123'
        uri = 'mongodb://' + user + ':' + pwd + '@' + server + ':' + port + '/' + dbname
        client = pymongo.MongoClient(uri)  # connect to MongoDB (here a local instance)
        db = client['dbkongjie']  # select a database
        self.kongjie_collection = db['kongjie']  # kept from the first version; no longer used for dedup
        # a Bloom filter that grows as items are added, with a 0.1% false-positive rate
        self.sbf = ScalableBloomFilter(initial_capacity=100, error_rate=0.001,
                                       mode=ScalableBloomFilter.LARGE_SET_GROWTH)

    def getUA(self):
        user_agent = 'Mozilla/5.0 (compatible; WOW64; MSIE 10.0; Windows NT 6.2)'
        headers = {'User-Agent': user_agent}
        return headers

    def parse_album_url(self, url):
        """Parse the album URLs out of a listing page, then enter each album and crawl its images."""
        headers = self.getUA()
        response = requests.get(url, headers=headers)
        soup = BeautifulSoup(response.text, 'lxml')
        people_list = soup.select('div.ptw > ul > li')
        for people in people_list:
            self.save_images_in_album(people.div.a['href'])
        # crawl the next listing page
        next_page = soup.select_one('a.nxt')
        if next_page:
            self.parse_album_url(next_page['href'])
        else:
            print('Download finished!')

    def save_images_in_album(self, album_url):
        """Enter a user's album on kongjie.com and save its images one by one."""
        headers = self.getUA()
        response = requests.get(album_url, headers=headers)
        soup = BeautifulSoup(response.text, 'lxml')
        ls = soup.select('ul.ptw.ml.mlp.cl > li')
        if len(ls) > 0:
            print('len', len(ls))
        for item in ls:
            # extract the URL of the photo page
            url = item.select_one('a')['href']
            # deduplication: skip this photo page if the Bloom filter says it was seen
            if url in self.sbf:
                print('This page has already been crawled')
            else:
                pat = re.compile(r'uid=(\d+)&.*?picid=(\d+)')
                matchObj = pat.search(url)
                uid = matchObj.group(1)
                picid = matchObj.group(2)
                print('uid:', uid)
                print('picid:', picid)
                # open the photo page and extract the src attribute of the image
                response = requests.get(url, headers=headers)
                soup1 = BeautifulSoup(response.text, 'lxml')
                img_url = soup1.select_one('div#photo_pic > a > img')['src']
                # download the image
                response = requests.get(img_url, headers=headers)
                imgName = './images/' + uid + picid + '.jpg'
                with open(imgName, 'wb') as file:
                    file.write(response.content)
                self.sbf.add(url)  # record the URL in the Bloom filter
                print('count:', len(self.sbf))
        # follow the album's own pagination
        ls = soup.select('a.nxt')
        print('next_page:', len(ls))
        if len(ls) > 0:
            next_page_url = ls[0]['href']
            print('next_page_url:', next_page_url)
            self.save_images_in_album(next_page_url)

if __name__ == '__main__':
    os.makedirs('./images', exist_ok=True)  # make sure the output directory exists
    start_url = 'http://www.kongjie.com/home.php?mod=space&do=album&view=all&page=1'
    spider = KongjieSpider()
    spider.parse_album_url(start_url)
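One design consequence worth noting: a Bloom filter can report a URL as seen when it was not (a false positive), so at the 0.001 error rate roughly one photo page in a thousand may be skipped without ever being crawled; it never produces false negatives, so no page is downloaded twice.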