Disguise failure in a crawler:
A common set of disguise headers:
headers = {user-agent, cookie, referer, accept, accept-language}
When I was setting up the disguise I only added the first two and still got detected as a crawler; the requests only went through after I added accept-language as well.
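For reference, a minimal sketch of that full header set; every value below is a placeholder, so copy the real ones from your own browser's DevTools:

headers = {
    'user-agent': 'Mozilla/5.0 (...)',              # pretend to be a browser
    'cookie': '<your session cookies>',             # carry your session along
    'referer': 'https://image.baidu.com/',          # look like you came from the site itself
    'accept': 'application/json, text/plain, */*',
    'accept-language': 'zh-CN,zh;q=0.9,en;q=0.8',   # the one that finally got me past the check
}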
This happened while scraping images. To scrape multiple pages, find what differs between the page requests and loop over that difference. Baidu image search loads results waterfall-style (infinite scroll), so open DevTools, search for your keyword, copy a short fragment of one image's URL, then search the captured network responses for that fragment to locate the real data request (see the sketch below for how to spot the paging parameter).
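A quick way to spot which query parameter changes between waterfall loads is to diff two captured request URLs. The URLs and the pn name in this sketch are made-up examples, not Baidu's real ones:

from urllib.parse import urlsplit, parse_qs

# diff two waterfall requests captured in DevTools to find the paging parameter
url_page1 = 'https://example.com/search/acjson?word=cat&pn=30'
url_page2 = 'https://example.com/search/acjson?word=cat&pn=60'
q1 = parse_qs(urlsplit(url_page1).query)
q2 = parse_qs(urlsplit(url_page2).query)
changed = {k for k in q1 if q1[k] != q2.get(k)}
print(changed)   # {'pn'}: this is the offset to vary in the page loop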
Extract the values with the re module and write them to a file. Note: you still have to convert the JSON string into a dict, because .get() only works on a dict. The preview in DevTools may already look like a dict, but you have to convert it into something Python can actually understand.
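To see why the conversion matters, here is a tiny self-contained demo; the JSON string is a made-up stand-in for the real response body:

import json

raw = '{"data": {"images": [{"thumburl": "http://example.com/1.jpg"}]}}'
# raw.get('data')   # AttributeError: 'str' object has no attribute 'get'
parsed = json.loads(raw)   # convert the JSON string into a real dict
print(parsed.get('data').get('images'))
# requests does this for you: res.json() is shorthand for json.loads(res.text)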
import requests
import re   # mentioned above for regex extraction; not actually needed below, since res.json() parses the response

url = 'image.baidu.com/search/inde…'   # the search results page (URL truncated); overwritten by the paged URL in the loop below
headers = {
'cookie':'PSTM=1777793956; BAIDUID=63C6DAEABA88F24A2D27B0827C151E41:FG=1; BIDUPSID=462A267734B46A50E3B86CAB6BF3828B; BA_HECTOR=0ga50h0004ah0125al042h812g8g821kvdutn27; BAIDUID_BFESS=63C6DAEABA88F24A2D27B0827C151E41:FG=1; ZFY=RO2EJ8O7ErqwSpamJhUjdLfYeOXn9EyhGYBSUnw:B4qw:C; H_PS_PSSID=63141_65590_67862_68165_68453_68741_68982_69002_69157_69191_69201_69245_69228_69238_69232_69237_69294_68779_69251_69356_69399_69400_69403_69397_69419_69415_69414_69423_69444_69437_69451_69342_69338_69347_69345_69348_69350_69340_69496_69502_69515_69559_69554_68487_69571_69590_69614_69650_69664_69667_69660_69672; PSINO=1; delPer=0; BCLID=2559185424141520038; BCLID_BFESS=2559185424141520038; BDSFRCVID=YqIOJexroGW9i9OEnI87EHtYoIC6w-vTDYLEOwXPsp3LGJLV1XLfEG0PtqGrylPbLrA9ogKKBeOTHgt5o2t2K2vmthejnG5V6U8Ptf8g0M5; BDSFRCVID_BFESS=YqIOJexroGW9i9OEnI87EHtYoIC6w-vTDYLEOwXPsp3LGJLV1XLfEG0PtqGrylPbLrA9ogKKBeOTHgt5o2t2K2vmthejnG5V6U8Ptf8g0M5; H_BDCLCKID_SF=tJAj_D-btK03H48k-4QEbbQH-UnLq-TB0mOZ04n-ah3RbJIwXUo_yfb0Qt7JX4RA0bcfs4Qm3UTdsq76Wh35K5tTQP6rLjovBIc4KKJxbPjnq4T10-5dKxo-hUJiBMj-Ban70--aaKoMhlk454cMM-Lqh4ttLj5PfNRJ0DnjtpChbC_RD5DMDTJWepJf-K6E2I5jBRu8Kb7VbIn6LUnkbJkXhPJUq4F8B2TXalLb3lnMoJ7VLTjbqJK7Qbrr04JPfR7rbRvXK4JrSCjyX-jpQT8r5hQxbJcQLIrrQqDXab3vOpoNXpO1bn30hN5JJfPHbDjJBn5_aq0hSCQaeh3Mb6ksD-FtqjDetnFJoK85fbo5KRopMtOhq4tehHRb3fn9WDTOQJ7TQKTEbq7HhxrPXULn5bryBxtqb-jq-pbwBp5cfUnMKn05XM-pXbDDXU7h3mkjbPbh3-oBDMJPXb5Net4syPRGKxRnWIvJ_RA-bIovh4Qz5-7qM4C1jJDqL6vTbIFO0KJzJCFKhItCj5-5jTPVKgTa54cbb4o2WbCQbJTr8pcN2b5oQpb-MHr9046A5nv4aPbwahTTM40z5lOUWfAkXpJvQnJjt2JxaqRCtxndel5jDh3MKf-X2fvT5f7H0R5y0hvc0J5cShnT55ooDR3XyN5q2R335CQXKK5CJfKWV-bIe-t2XjQhDG-jtj0etbks3t88KJjEe-Kk-PnVepFLytnZKRvHa2vUVRIXBh5_ODP6Qf7MWt4PLNjfeR5n3N5rKl75yUJ5qKOsQU6d36_RXMj405OTbT6xQtTsLnodqlDwhPJvyUD8XnO7Kb0tabrJoqk-bJ6Mjb5Fhxth36KAeb3JKjvMtgDtVJO-KKCKMKIRjUK; H_BDCLCKID_SF_BFESS=tJAj_D-btK03H48k-4QEbbQH-UnLq-TB0mOZ04n-ah3RbJIwXUo_yfb0Qt7JX4RA0bcfs4Qm3UTdsq76Wh35K5tTQP6rLjovBIc4KKJxbPjnq4T10-5dKxo-hUJiBMj-Ban70--aaKoMhlk454cMM-Lqh4ttLj5PfNRJ0DnjtpChbC_RD5DMDTJWepJf-K6E2I5jBRu8Kb7VbIn6LUnkbJkXhPJUq4F8B2TXalLb3lnMoJ7VLTjbqJK7Qbrr04JPfR7rbRvXK4JrSCjyX-jpQT8r5hQxbJcQLIrrQqDXab3vOpoNXpO1bn30hN5JJfPHbDjJBn5_aq0hSCQaeh3Mb6ksD-FtqjDetnFJoK85fbo5KRopMtOhq4tehHRb3fn9WDTOQJ7TQKTEbq7HhxrPXULn5bryBxtqb-jq-pbwBp5cfUnMKn05XM-pXbDDXU7h3mkjbPbh3-oBDMJPXb5Net4syPRGKxRnWIvJ_RA-bIovh4Qz5-7qM4C1jJDqL6vTbIFO0KJzJCFKhItCj5-5jTPVKgTa54cbb4o2WbCQbJTr8pcN2b5oQpb-MHr9046A5nv4aPbwahTTM40z5lOUWfAkXpJvQnJjt2JxaqRCtxndel5jDh3MKf-X2fvT5f7H0R5y0hvc0J5cShnT55ooDR3XyN5q2R335CQXKK5CJfKWV-bIe-t2XjQhDG-jtj0etbks3t88KJjEe-Kk-PnVepFLytnZKRvHa2vUVRIXBh5_ODP6Qf7MWt4PLNjfeR5n3N5rKl75yUJ5qKOsQU6d36_RXMj405OTbT6xQtTsLnodqlDwhPJvyUD8XnO7Kb0tabrJoqk-bJ6Mjb5Fhxth36KAeb3JKjvMtgDtVJO-KKCKMKIRjUK; BDRCVFR[BIVAaPonX6T]=-_EV5wtlMr0mh-8uz4WUvY; BDORZ=FFFB88E999055A3F8A630C64834BD6D0; H_WISE_SIDS=65590_68165_68453_68982_69002_69157_69191_69201_69245_69228_69238_69232_69237_69294_68779_69251_69419_69415_69414_69423_69444_69437_69451_69496_69515_69559_69554_69571_69590_69650_69664_69667_69660_69672; ab_sr=1.0.1_MjdkYjgyNGQyMTNmMDVlODU5MThlN2E3OWU3MTlkMDRiMDEyOGJhMzFmMmI1NzQyNTlhNjIyMDlmMTEwZGQxZjQ2ODQyYzlkYmU5YWViZTVkOWZlMzc5MGFmNDQ4MzQ3MDU2ZjAwYzcxYmI4MzgxODc4ZjRlNmE4ZWRlZDFlMWIxNWQ0NTdjNWE2ZDc2YTdhMjRhODY3ODFlMTEwNDkzOGExZTY4Y2QyNjA1ZjMxNTEyYWE0ZmE1OWVhNmY1NTUwOWU3MTZlNTQ0YzJkMjQxZjllNzM2OGI5ZWY1ZjJhOTM=',
'user-agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/147.0.0.0 Safari/537.36 Edg/147.0.0.0',
'referer':'image.baidu.com/search/inde…',
'accept-language':'zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6'
}
count = 1   # running index for the saved image files
for i in range(2, 4):   # loop over the waterfall pages found in DevTools
    url = f'image.baidu.com/search/acjs…'   # the paged data request (URL truncated)
    res = requests.get(url, headers=headers)
    li_data = res.json()   # parse the JSON response into a dict so .get() works
    data = li_data.get('data').get('images')
    for item in data:   # renamed from i so it no longer shadows the outer loop variable
        i_url = item.get('thumburl')
        m_data = requests.get(i_url)   # download the thumbnail itself
        with open(f'蜡笔k{count}.png', 'wb') as f:
            f.write(m_data.content)   # write the raw image bytes
        count += 1
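One caveat with the chained .get() above: if the response is missing 'data' or 'images', the first .get() returns None and the second one raises AttributeError. A defensive variant, assuming the same JSON shape:

# fall back to an empty list instead of crashing when a key is missing
data = (li_data.get('data') or {}).get('images') or []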