爬虫实例十 爬取百度贴吧小姐姐照片

514 阅读4分钟

上上上上代码!!!

import posixpath
import re
from urllib.parse import urljoin, urlsplit

import requests
# Board (thread list) page that the crawl starts from; the `kw` query
# parameter carries the percent-encoded forum name.
url = 'https://tieba.baidu.com/f?ie=utf-8&kw=%E7%BA%A6%E4%BC%9A%E5%90%A7&fr=search'

# Request headers sent with every request: a desktop-browser User-Agent plus a
# browser session cookie.
# The cookie is assembled from adjacent string literals (one cookie per line,
# each keeping its trailing "; " separator) so that the value is a single
# header line; a raw line break inside the literal is a SyntaxError.
# NOTE(review): this is a captured browser session cookie and will expire;
# replace it with your own and avoid publishing real session cookies.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36",
    "Cookie": (
        "BAIDUID=6E1E9D510CA3FBB723B4EE4B2846E36D:FG=1; "
        "BIDUPSID=6E1E9D510CA3FBB723B4EE4B2846E36D; "
        "PSTM=1604713571; "
        "__yjs_duid=1_71a81f938ba8293e48830352dd0ec1c91616484293272; "
        "delPer=0; "
        "BAIDUID_BFESS=6E1E9D510CA3FBB723B4EE4B2846E36D:FG=1; "
        "ZD_ENTRY=empty; "
        "BDRCVFR[VBH4JnM-Vd0]=OjjlczwSj8nXy4Grjf8mvqV; "
        "BDRCVFR[-Cxg3mV_4Yc]=OjjlczwSj8nXy4Grjf8mvqV; "
        "BDRCVFR[S4-dAuiWMmn]=oPlYXH5bwdffjfsnjcsPWnLg1NxUvNV; "
        "H_PS_PSSID=; "
        "PSINO=6; "
        "BDORZ=FFFB88E999055A3F8A630C64834BD6D0; "
        "BCLID=7766680461769038651; "
        "BDSFRCVID=mFkOJexroG38-O6eDzJq8nxEf2KK0gOTDYLEOwXPsp3LGJLVN4vPEG0Pt_U-mEt-J8jwogKK0gOTH6KF_2uxOjjg8UtVJeC6EG0Ptf8g0M5; "
        "H_BDCLCKID_SF=tbkD_C-MfIvDqTrP-trf5DCShUFsK4PjB2Q-XPoO3K8WKqACbfO05hcbhl74KjjiWbRM2MbgylRp8P3y0bb2DUA1y4vpWj3qLgTxoUJ2XMKVDq5mqfCWMR-ebPRiJPb9Qg-qahQ7tt5W8ncFbT7l5hKpbt-q0x-jLTnhVn0MBCK0hI0ljj82e5PVKgTa54cbb4o2WbCQJUod8pcN2b5oQTJbqtPqKx3EWDuOWtnN5DovOIJTXpOUWfAkXpJvQnJjt2JxaqRCBDb-Vh5jDh3MBpQDhtoJexIO2jvy0hvctn3cShPCyUjrDRLbXU6BK5vPbNcZ0l8K3l02V-bIe-t2XjQhDNtDt60jfn3aQ5rtKRTffjrnhPF335LFXP6-hnjy3bAOslAK5lb0ORD9hR7pb6DUyN3MWh3RymJ42-39LPO2hpRjyxv4X60B0-oxJpOJXaILWl52HlFWj43vbURvD--g3-AqBM5dtjTO2bc_5KnlfMQ_bf--QfbQ0hOhqP-jBRIEoK0hJC-2bKvPKITD-tFO5eT22-us2a5i2hcHMPoosI89qfP-bf0hyG-O2Jv45JriaKJjBMbUoqRHXnJi0btQDPvxBf7pBJnqbp5TtUJM_UKzhfoMqfTbMlJyKMnitIv9-pPKWhQrh459XP68bTkA5bjZKxtq3mkjbPbDfn028DKuDTtajj3QeaRabK6aKC5bL6rJabC3DqQcXU6q2bDeQN-Oex4q5mnEatDyatnpeCooyT3JXp0vWtv4WbbvLT7johRTWqR48CbC0MonDh83Bn_L2xQJHmLOBt3O5hvvhb3O3MA-yUKmDloOW-TB5bbPLUQF5l8-sq0x0bOte-bQXH_E5bj2qRFtoCP53D; "
        "BCLID_BFESS=7766680461769038651; "
        "BDSFRCVID_BFESS=mFkOJexroG38-O6eDzJq8nxEf2KK0gOTDYLEOwXPsp3LGJLVN4vPEG0Pt_U-mEt-J8jwogKK0gOTH6KF_2uxOjjg8UtVJeC6EG0Ptf8g0M5; "
        "H_BDCLCKID_SF_BFESS=tbkD_C-MfIvDqTrP-trf5DCShUFsK4PjB2Q-XPoO3K8WKqACbfO05hcbhl74KjjiWbRM2MbgylRp8P3y0bb2DUA1y4vpWj3qLgTxoUJ2XMKVDq5mqfCWMR-ebPRiJPb9Qg-qahQ7tt5W8ncFbT7l5hKpbt-q0x-jLTnhVn0MBCK0hI0ljj82e5PVKgTa54cbb4o2WbCQJUod8pcN2b5oQTJbqtPqKx3EWDuOWtnN5DovOIJTXpOUWfAkXpJvQnJjt2JxaqRCBDb-Vh5jDh3MBpQDhtoJexIO2jvy0hvctn3cShPCyUjrDRLbXU6BK5vPbNcZ0l8K3l02V-bIe-t2XjQhDNtDt60jfn3aQ5rtKRTffjrnhPF335LFXP6-hnjy3bAOslAK5lb0ORD9hR7pb6DUyN3MWh3RymJ42-39LPO2hpRjyxv4X60B0-oxJpOJXaILWl52HlFWj43vbURvD--g3-AqBM5dtjTO2bc_5KnlfMQ_bf--QfbQ0hOhqP-jBRIEoK0hJC-2bKvPKITD-tFO5eT22-us2a5i2hcHMPoosI89qfP-bf0hyG-O2Jv45JriaKJjBMbUoqRHXnJi0btQDPvxBf7pBJnqbp5TtUJM_UKzhfoMqfTbMlJyKMnitIv9-pPKWhQrh459XP68bTkA5bjZKxtq3mkjbPbDfn028DKuDTtajj3QeaRabK6aKC5bL6rJabC3DqQcXU6q2bDeQN-Oex4q5mnEatDyatnpeCooyT3JXp0vWtv4WbbvLT7johRTWqR48CbC0MonDh83Bn_L2xQJHmLOBt3O5hvvhb3O3MA-yUKmDloOW-TB5bbPLUQF5l8-sq0x0bOte-bQXH_E5bj2qRFtoCP53D; "
        "BA_HECTOR=2h80ag2k8g0ka105i81g68aah0q; "
        "Hm_lvt_98b9d8c2fd6608d564bf2ac2ae642948=1617176934; "
        "video_bubble0=1; "
        "st_key_id=17; "
        "wise_device=0; "
        "bdshare_firstime=1617176967061; "
        "ab_sr=1.0.0_YmFkN2M5YWU0OTEyNjFjY2M3YjM0M2MzYzAzOTU0ZjIxYjFmOGE3ZTZiYmY3YzU1MTgxNjVmNDliYzJkNWEyNWJlZDRlNDUzY2E1NDRiZTExNGJhMjJkMmZjN2NlNWFm; "
        "st_data=949b8c92cf211096345a9675813f6de106935a586d973c2ca12ea6c84fed19e4a43772e96ef1f44a72a7c937eeb652f42cf3a354f68d0c5a5d8c767e40914e94c4b04cd821c9f740b4dd68b25670a2c2a6a1a463f1fe160bbb9cbc207bc525a2c5ebbec87d53a0dbac77da4a29dd264e0c180d682a8db9634479cb68bea7e52c; "
        "st_sign=962299d6; "
        "tb_as_data=24619e9904415ee30a754e4988fcc09074228a15c3493fdd21f4798c2bf0bed9a309bab873e058eb59b6180818a86dd373e3147f35ec975f89dfbf713673443c97ff444a44534fb22ad287357bb3bfbc137972cd62192a2d6800c63e2e669bd0eadd94eeb6586165777101ec3cafad1a; "
        "Hm_lpvt_98b9d8c2fd6608d564bf2ac2ae642948=1617178212"
    ),
}
# Base address that the relative thread links found on the board page are
# resolved against.
TIEBA_BASE_URL = "https://tieba.baidu.com/"

# Seconds to wait for a single HTTP response; without a timeout a stalled
# connection would block the script forever.
REQUEST_TIMEOUT = 10

# Thread links on the board page. Attribute values are matched with [^"]*
# instead of a greedy .* so that one match can never run across several tags
# on the same line (a greedy pattern yields at most one link per line).
THREAD_LINK_RE = re.compile(
    r'<a rel="noreferrer" href="([^"]*)" title="[^"]*" target="[^"]*" class="[^"]*">.*?</a>'
)

# Images embedded in a thread page; same reasoning as above, so every <img>
# on a line is captured rather than only the last one.
IMAGE_RE = re.compile(
    r'<img class="[^"]*" src="([^"]*)" size="[^"]*" changedsize="[^"]*" width="[^"]*" height="[^"]*">'
)


def find_thread_links(board_html: str) -> list:
    """Return the href of every thread link found in the board page HTML."""
    return THREAD_LINK_RE.findall(board_html)


def find_image_urls(thread_html: str) -> list:
    """Return the src of every matching <img> tag found in a thread page."""
    return IMAGE_RE.findall(thread_html)


def image_file_name(image_url: str) -> str:
    """Return the local file name used for an image URL.

    The name is the last 12 characters of the final path segment, so the
    query string and any "/" can never end up in the file name. Returns an
    empty string when the URL path has no final segment.
    """
    return posixpath.basename(urlsplit(image_url).path)[-12:]


def main() -> None:
    """Download every image of every thread listed on the board page.

    Files are written to the current working directory. A thread page or an
    image that cannot be fetched is reported and skipped; a failure to fetch
    the board page itself is not caught.
    """
    board_html = requests.get(url=url, headers=headers, timeout=REQUEST_TIMEOUT).text
    for link in find_thread_links(board_html):
        # urljoin avoids the "//" that plain concatenation produces when the
        # link already starts with "/".
        thread_url = urljoin(TIEBA_BASE_URL, link)
        try:
            thread_html = requests.get(
                url=thread_url, headers=headers, timeout=REQUEST_TIMEOUT
            ).text
        except requests.RequestException as exc:
            print("%s下载失败: %s" % (thread_url, exc))
            continue

        for src in find_image_urls(thread_html):
            # Resolve against the thread URL so relative and protocol-relative
            # sources become requestable absolute URLs.
            image_url = urljoin(thread_url, src)
            file_name = image_file_name(image_url)
            if not file_name:
                continue
            try:
                response = requests.get(
                    image_url, headers=headers, timeout=REQUEST_TIMEOUT
                )
                # Do not save an HTTP error page as if it were an image.
                response.raise_for_status()
            except requests.RequestException as exc:
                print("%s下载失败: %s" % (file_name, exc))
                continue
            # Images are binary data: take the raw bytes and write in 'wb' mode.
            with open(file_name, 'wb') as f:
                f.write(response.content)
            print("%s下载成功" % file_name)


if __name__ == "__main__":
    main()

爬虫代码虽短,坑却很多。实现步骤就不写了,主要把坑说一下
第一:百度贴吧的反爬机制主要是校验请求头里的 User-Agent 和 Cookie,所以要做 UA 伪装并带上 cookie。
没有cookie一定不会返回网页源代码(本人亲测,以后管他需不需要cookie,先给她安排上)
第二:正则解析
最开始我用的xpath,虽然源代码可以在控制台打印出来,但是无论如何都解析不出来想要的内容,于是用正则一下就解析出来了。
第三:网址的拼接。列表页里解析出来的帖子链接只是相对路径,需要在前面拼上贴吧的域名才能继续请求;这个需要一丢丢基础,一般就是有手就行。
第四:图片,视频,音频是以二进制形式储存的。所以这里不能按文本处理,要取响应的二进制内容(.content),再用 'wb' 模式写入本地文件。
![在这里插入图片描述](img-blog.csdnimg.cn/20210331234…在这里插入图片描述