Crawlers 10: Crawler Deduplication


Deduplication strategies for Python crawlers

1. Save visited URLs in a database.

2. Save visited URLs in an in-memory set.

Pros: a URL lookup costs only O(1).

Cons: memory-hungry. With 100 million pages and URLs averaging 50 characters at 2 bytes per character, the set occupies 100,000,000 × 50 × 2 bytes / 1024 / 1024 / 1024 ≈ 9 GB (a quick check of this arithmetic follows the list).

3. Hash each URL (e.g. with MD5) before adding it to the set.

Pros: cuts memory usage several-fold; this is the method Scrapy uses (it stores fixed-length request fingerprints).

4. Use a bitmap or a Bloom filter, mapping each visited URL to bit position(s) via hash function(s).

Bitmap pros: 100 million URLs take only about 12 MB (one bit per URL).

Bitmap cons: deduplication is not exact; hash collisions make different URLs map to the same bit (see the sketch after this list).

Bloom filter pros: improves on the bitmap by using multiple hash functions to reduce collisions.
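To make these numbers concrete, here is a quick sketch (mine, not from the original article) that checks the arithmetic above and implements a toy single-hash bitmap; the `BitmapDedup` name, the `num_bits` value, and the choice of MD5 as the hash are all illustrative:

```python
import hashlib

# Quick check of the memory estimates above.
urls = 100_000_000                          # 1e8 pages
set_bytes = urls * 50 * 2                   # 50 chars/URL, 2 bytes/char
print(set_bytes / 1024 ** 3)                # ~9.3 GB for a raw URL set
print(urls / 8 / 1024 / 1024)               # ~11.9 MB for a 1e8-bit bitmap


class BitmapDedup:
    """Toy bitmap deduplicator with a single hash function, so
    collisions (false positives) are possible -- the weakness noted above."""

    def __init__(self, num_bits=100_000_000):
        self.num_bits = num_bits
        self.bits = bytearray((num_bits + 7) // 8)   # ~12 MB for 1e8 bits

    def _offset(self, url):
        # Map the URL to a bit index via MD5 (any uniform hash would do).
        h = int(hashlib.md5(url.encode('utf8')).hexdigest(), 16)
        return h % self.num_bits

    def seen(self, url):
        i = self._offset(url)
        return bool(self.bits[i // 8] & (1 << (i % 8)))

    def add(self, url):
        i = self._offset(url)
        self.bits[i // 8] |= 1 << (i % 8)


bm = BitmapDedup()
print(bm.seen('http://example.com/a'))      # False
bm.add('http://example.com/a')
print(bm.seen('http://example.com/a'))      # True
```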

Example for approach 1: database-based dedup



```python
import os
import re

import pymongo
import requests
from bs4 import BeautifulSoup


class KongjieSpider:
    def __init__(self):
        server = 'localhost'
        port = '27017'
        dbname = 'admin'
        user = 'admin'
        pwd = '123'
        uri = 'mongodb://' + user + ':' + pwd + '@' + server + ':' + port + '/' + dbname
        client = pymongo.MongoClient(uri)        # connect to MongoDB
        db = client['dbkongjie1805']             # select a database
        self.kongjie_collection = db['kongjie']  # select a collection in that database
        os.makedirs('./images', exist_ok=True)   # make sure the download directory exists

    def getUA(self):
        user_agent = 'Mozilla/5.0 (compatible; WOW64; MSIE 10.0; Windows NT 6.2)'
        headers = {'User-Agent': user_agent}
        return headers

    def parse_album_url(self, url):
        """Parse album URLs from the listing page, then crawl each album's images."""
        headers = self.getUA()
        response = requests.get(url, headers=headers)
        soup = BeautifulSoup(response.text, 'lxml')
        people_list = soup.select('div.ptw > ul > li')
        for people in people_list:
            self.save_images_in_album(people.div.a['href'])

        # crawl the next listing page
        next_page = soup.select_one('a.nxt')
        if next_page:
            self.parse_album_url(next_page['href'])
        else:
            print('Download finished!')

    def save_images_in_album(self, album_url):
        """Enter a user's album on kongjie.com and save its images one by one."""
        headers = self.getUA()
        response = requests.get(album_url, headers=headers)
        soup = BeautifulSoup(response.text, 'lxml')
        ls = soup.select('ul.ptw.ml.mlp.cl > li')
        if len(ls) > 0:
            print('len ', len(ls))
        for item in ls:
            # extract the URL of the photo page
            url = item.select_one('a')['href']
            # dedup step: skip this photo page if it is already in the database
            if self.kongjie_collection.find_one({'img_url': url}):
                print('This page has already been crawled')
            else:
                pat = re.compile(r'uid=(\d+)&.*?picid=(\d+)')
                matchObj = pat.search(url)
                uid = matchObj.group(1)
                picid = matchObj.group(2)
                print('uid:', uid)
                print('picid:', picid)
                # open the photo page and extract the src attribute of the image
                response = requests.get(url, headers=headers)
                soup1 = BeautifulSoup(response.text, 'lxml')
                img_url = soup1.select_one('div#photo_pic > a > img')['src']
                # download the image
                response = requests.get(img_url, headers=headers)
                imgName = './images/' + uid + picid + '.jpg'
                with open(imgName, 'wb') as file:
                    file.write(response.content)
                # record the URL in the dedup collection
                self.kongjie_collection.insert_one({'img_url': url})

        # crawl the next page of the album
        ls = soup.select('a.nxt')
        print('next_page: ', len(ls))
        if len(ls) > 0:
            next_page_url = ls[0]['href']
            print('next_page_url:', next_page_url)
            self.save_images_in_album(next_page_url)


if __name__ == '__main__':
    start_url = 'http://www.kongjie.com/home.php?mod=space&do=album&view=all&page=1'
    spider = KongjieSpider()
    spider.parse_album_url(start_url)
```
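One cost of this approach is a database round trip (`find_one`) for every URL, plus a separate write. A common refinement, sketched below with the same connection settings, is to let MongoDB enforce uniqueness with a unique index, so the check and the insert become one atomic operation; `is_new_url` is a hypothetical helper name, not part of the original code:

```python
import pymongo
from pymongo.errors import DuplicateKeyError

client = pymongo.MongoClient('mongodb://admin:123@localhost:27017/admin')
collection = client['dbkongjie1805']['kongjie']
collection.create_index('img_url', unique=True)   # one-time setup

def is_new_url(url):
    """Record the URL and return True if unseen; return False on a duplicate."""
    try:
        collection.insert_one({'img_url': url})
        return True
    except DuplicateKeyError:
        return False
```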


Example for approach 2: in-memory set







```python
import os
import re

import pymongo
import requests
from bs4 import BeautifulSoup


class KongjieSpider:
    def __init__(self):
        server = 'localhost'
        port = '27017'
        dbname = 'admin'
        user = 'admin'
        pwd = '123'
        uri = 'mongodb://' + user + ':' + pwd + '@' + server + ':' + port + '/' + dbname
        client = pymongo.MongoClient(uri)        # connect to MongoDB (a local instance here)
        db = client['dbkongjie']                 # select a database
        self.kongjie_collection = db['kongjie']  # select a collection (not used for dedup in this variant)
        self.img_urls = set()                    # set holding the photo-page URLs seen so far
        os.makedirs('./images', exist_ok=True)   # make sure the download directory exists

    def getUA(self):
        user_agent = 'Mozilla/5.0 (compatible; WOW64; MSIE 10.0; Windows NT 6.2)'
        headers = {'User-Agent': user_agent}
        return headers

    def parse_album_url(self, url):
        """Parse album URLs from the listing page, then crawl each album's images."""
        headers = self.getUA()
        response = requests.get(url, headers=headers)
        soup = BeautifulSoup(response.text, 'lxml')
        people_list = soup.select('div.ptw > ul > li')
        for people in people_list:
            self.save_images_in_album(people.div.a['href'])

        # crawl the next listing page
        next_page = soup.select_one('a.nxt')
        if next_page:
            self.parse_album_url(next_page['href'])
        else:
            print('Download finished!')

    def save_images_in_album(self, album_url):
        """Enter a user's album on kongjie.com and save its images one by one."""
        headers = self.getUA()
        response = requests.get(album_url, headers=headers)
        soup = BeautifulSoup(response.text, 'lxml')
        ls = soup.select('ul.ptw.ml.mlp.cl > li')
        if len(ls) > 0:
            print('len ', len(ls))
        for item in ls:
            # extract the URL of the photo page
            url = item.select_one('a')['href']
            # dedup step: skip this photo page if its URL is already in the set
            if url in self.img_urls:
                print('This page has already been crawled')
            else:
                pat = re.compile(r'uid=(\d+)&.*?picid=(\d+)')
                matchObj = pat.search(url)
                uid = matchObj.group(1)
                picid = matchObj.group(2)
                print('uid:', uid)
                print('picid:', picid)
                # open the photo page and extract the src attribute of the image
                response = requests.get(url, headers=headers)
                soup1 = BeautifulSoup(response.text, 'lxml')
                img_url = soup1.select_one('div#photo_pic > a > img')['src']
                # download the image
                response = requests.get(img_url, headers=headers)
                imgName = './images/' + uid + picid + '.jpg'
                with open(imgName, 'wb') as file:
                    file.write(response.content)
                self.img_urls.add(url)           # add the URL to the dedup set
                print('count:', len(self.img_urls))

        # crawl the next page of the album
        ls = soup.select('a.nxt')
        print('next_page: ', len(ls))
        if len(ls) > 0:
            next_page_url = ls[0]['href']
            print('next_page_url:', next_page_url)
            self.save_images_in_album(next_page_url)


if __name__ == '__main__':
    start_url = 'http://www.kongjie.com/home.php?mod=space&do=album&view=all&page=1'
    spider = KongjieSpider()
    spider.parse_album_url(start_url)
```
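Note that `self.img_urls` lives only in process memory, so the dedup state is lost when the crawler exits. A minimal sketch for persisting it between runs, assuming a simple pickle file (the filename and helper names are illustrative, not from the original):

```python
import os
import pickle

SEEN_FILE = 'seen_urls.pkl'   # hypothetical path

def load_seen():
    """Restore the dedup set from disk so a restart does not re-crawl everything."""
    if os.path.exists(SEEN_FILE):
        with open(SEEN_FILE, 'rb') as f:
            return pickle.load(f)
    return set()

def save_seen(seen):
    """Write the dedup set back to disk."""
    with open(SEEN_FILE, 'wb') as f:
        pickle.dump(seen, f)

# Usage sketch: in __init__, set self.img_urls = load_seen();
# call save_seen(self.img_urls) periodically or on shutdown.
```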










Example for approach 3: MD5-hashed set







```python
import os
import re
from hashlib import md5

import pymongo
import requests
from bs4 import BeautifulSoup


class KongjieSpider:
    def __init__(self):
        server = 'localhost'
        port = '27017'
        dbname = 'admin'
        user = 'admin'
        pwd = '123'
        uri = 'mongodb://' + user + ':' + pwd + '@' + server + ':' + port + '/' + dbname
        client = pymongo.MongoClient(uri)        # connect to MongoDB (a local instance here)
        db = client['dbkongjie']                 # select a database
        self.kongjie_collection = db['kongjie']  # select a collection (not used for dedup in this variant)
        self.img_urls = set()                    # set holding MD5 hashes of the URLs seen so far
        os.makedirs('./images', exist_ok=True)   # make sure the download directory exists

    def getUA(self):
        user_agent = 'Mozilla/5.0 (compatible; WOW64; MSIE 10.0; Windows NT 6.2)'
        headers = {'User-Agent': user_agent}
        return headers

    def parse_album_url(self, url):
        """Parse album URLs from the listing page, then crawl each album's images."""
        headers = self.getUA()
        response = requests.get(url, headers=headers)
        soup = BeautifulSoup(response.text, 'lxml')
        people_list = soup.select('div.ptw > ul > li')
        for people in people_list:
            self.save_images_in_album(people.div.a['href'])

        # crawl the next listing page
        next_page = soup.select_one('a.nxt')
        if next_page:
            self.parse_album_url(next_page['href'])
        else:
            print('Download finished!')

    def save_images_in_album(self, album_url):
        """Enter a user's album on kongjie.com and save its images one by one."""
        headers = self.getUA()
        response = requests.get(album_url, headers=headers)
        soup = BeautifulSoup(response.text, 'lxml')
        ls = soup.select('ul.ptw.ml.mlp.cl > li')
        if len(ls) > 0:
            print('len ', len(ls))
        for item in ls:
            # extract the URL of the photo page
            url = item.select_one('a')['href']
            # dedup step: hash the URL and skip the page if the hash is already in the set
            hash_md5 = md5(url.encode('utf8'))
            hash_str = hash_md5.hexdigest()
            if hash_str in self.img_urls:
                print('This page has already been crawled')
            else:
                pat = re.compile(r'uid=(\d+)&.*?picid=(\d+)')
                matchObj = pat.search(url)
                uid = matchObj.group(1)
                picid = matchObj.group(2)
                print('uid:', uid)
                print('picid:', picid)
                # open the photo page and extract the src attribute of the image
                response = requests.get(url, headers=headers)
                soup1 = BeautifulSoup(response.text, 'lxml')
                img_url = soup1.select_one('div#photo_pic > a > img')['src']
                # download the image
                response = requests.get(img_url, headers=headers)
                imgName = './images/' + uid + picid + '.jpg'
                with open(imgName, 'wb') as file:
                    file.write(response.content)
                self.img_urls.add(hash_str)      # add the hash to the dedup set
                print('count:', len(self.img_urls), hash_str)

        # crawl the next page of the album
        ls = soup.select('a.nxt')
        print('next_page: ', len(ls))
        if len(ls) > 0:
            next_page_url = ls[0]['href']
            print('next_page_url:', next_page_url)
            self.save_images_in_album(next_page_url)


if __name__ == '__main__':
    start_url = 'http://www.kongjie.com/home.php?mod=space&do=album&view=all&page=1'
    spider = KongjieSpider()
    spider.parse_album_url(start_url)
```
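This variant stores `hexdigest()`, a 32-character string per URL. If memory is the whole point of hashing, storing the raw 16-byte `digest()` instead roughly halves the per-key size while keeping O(1) lookups. A quick comparison (my sketch, not from the original):

```python
import sys
from hashlib import md5

url = 'http://www.kongjie.com/home.php?mod=space&uid=1&picid=2'

hex_key = md5(url.encode('utf8')).hexdigest()   # 32-character str
raw_key = md5(url.encode('utf8')).digest()      # 16-byte bytes object

print(len(hex_key), len(raw_key))               # 32 16
print(sys.getsizeof(hex_key), sys.getsizeof(raw_key))  # raw digest is smaller

# Storing digest() instead of hexdigest() in self.img_urls keeps the
# same set-based dedup while cutting per-key memory roughly in half.
```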










Example for approach 4: Bloom filter



```python
import os
import re

import pymongo
import requests
from bs4 import BeautifulSoup
from pybloom_live import ScalableBloomFilter


class KongjieSpider:
    def __init__(self):
        server = 'localhost'
        port = '27017'
        dbname = 'admin'
        user = 'admin'
        pwd = '123'
        uri = 'mongodb://' + user + ':' + pwd + '@' + server + ':' + port + '/' + dbname
        client = pymongo.MongoClient(uri)        # connect to MongoDB (a local instance here)
        db = client['dbkongjie']                 # select a database
        self.kongjie_collection = db['kongjie']  # select a collection (not used for dedup in this variant)
        # scalable Bloom filter used as the dedup store
        self.sbf = ScalableBloomFilter(initial_capacity=100, error_rate=0.001,
                                       mode=ScalableBloomFilter.LARGE_SET_GROWTH)
        os.makedirs('./images', exist_ok=True)   # make sure the download directory exists

    def getUA(self):
        user_agent = 'Mozilla/5.0 (compatible; WOW64; MSIE 10.0; Windows NT 6.2)'
        headers = {'User-Agent': user_agent}
        return headers

    def parse_album_url(self, url):
        """Parse album URLs from the listing page, then crawl each album's images."""
        headers = self.getUA()
        response = requests.get(url, headers=headers)
        soup = BeautifulSoup(response.text, 'lxml')
        people_list = soup.select('div.ptw > ul > li')
        for people in people_list:
            self.save_images_in_album(people.div.a['href'])

        # crawl the next listing page
        next_page = soup.select_one('a.nxt')
        if next_page:
            self.parse_album_url(next_page['href'])
        else:
            print('Download finished!')

    def save_images_in_album(self, album_url):
        """Enter a user's album on kongjie.com and save its images one by one."""
        headers = self.getUA()
        response = requests.get(album_url, headers=headers)
        soup = BeautifulSoup(response.text, 'lxml')
        ls = soup.select('ul.ptw.ml.mlp.cl > li')
        if len(ls) > 0:
            print('len ', len(ls))
        for item in ls:
            # extract the URL of the photo page
            url = item.select_one('a')['href']
            # dedup step: skip this photo page if the Bloom filter says it was seen
            if url in self.sbf:
                print('This page has already been crawled')
            else:
                pat = re.compile(r'uid=(\d+)&.*?picid=(\d+)')
                matchObj = pat.search(url)
                uid = matchObj.group(1)
                picid = matchObj.group(2)
                print('uid:', uid)
                print('picid:', picid)
                # open the photo page and extract the src attribute of the image
                response = requests.get(url, headers=headers)
                soup1 = BeautifulSoup(response.text, 'lxml')
                img_url = soup1.select_one('div#photo_pic > a > img')['src']
                # download the image
                response = requests.get(img_url, headers=headers)
                imgName = './images/' + uid + picid + '.jpg'
                with open(imgName, 'wb') as file:
                    file.write(response.content)
                self.sbf.add(url)                # add the URL to the Bloom filter
                print('count:', len(self.sbf))

        # crawl the next page of the album
        ls = soup.select('a.nxt')
        print('next_page: ', len(ls))
        if len(ls) > 0:
            next_page_url = ls[0]['href']
            print('next_page_url:', next_page_url)
            self.save_images_in_album(next_page_url)


if __name__ == '__main__':
    start_url = 'http://www.kongjie.com/home.php?mod=space&do=album&view=all&page=1'
    spider = KongjieSpider()
    spider.parse_album_url(start_url)
```
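A Bloom filter never yields false negatives, but a membership test can return a false positive with probability bounded by `error_rate`; a scalable filter also grows past `initial_capacity` as elements arrive, which suits an open-ended crawl. A standalone demo of the filter used above, as I understand `pybloom_live`'s API:

```python
from pybloom_live import ScalableBloomFilter

sbf = ScalableBloomFilter(initial_capacity=100,
                          error_rate=0.001,
                          mode=ScalableBloomFilter.LARGE_SET_GROWTH)

for i in range(1000):                      # deliberately exceed initial_capacity
    sbf.add('http://example.com/%d' % i)

print('http://example.com/1' in sbf)       # True (no false negatives)
print('http://example.com/xyz' in sbf)     # almost certainly False
print(len(sbf))                            # number of elements added
```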
