线程安全队列
Python内置了一个线程安全的模块,叫做queue模块。queue模块提供了同步的、线程安全的队列类,包括FIFO(先进先出)队列Queue、LIFO(后进先出)队列LifoQueue等。这些队列内部都通过锁实现了同步(每次put/get都可以理解为原子操作,即要么不做,要么做完),能够在多线程中直接使用,因此可以用队列来实现线程间的同步。
队列的常用方法
| 函数 | 描述 |
|---|---|
| qsize() | 返回队列中当前的数据个数(近似值) |
| empty() | 判断队列是否为空 |
| full() | 判断队列是否满了 |
| get() | 从队列中取出并返回一个数据(队列为空时默认阻塞等待) |
| put() | 将一个数据放到队列中 |
from queue import Queue
# Cap the queue at 5 items; leaving the argument out makes it unbounded.
q = Queue(5)

# Fill the queue to capacity. range(1, 7) would block here: it yields 6
# values, and the sixth put() on a full 5-slot queue waits for a free slot.
for value in range(1, 6):
    q.put(value)

print(q.get())    # get() removes and returns one item from the queue
print(q.qsize())  # qsize() reports how many items are currently queued
print(q.full())   # full() tells whether the queue has reached its capacity

# Drain whatever is left, giving up after at most 10 reads.
attempts_left = 10
while attempts_left and not q.empty():
    print(q.get())
    attempts_left -= 1
在Python中,队列主要用于线程间通信。多个线程之间的数据资源是共享的,如果直接读写共享数据,就无法保证数据的安全性和一致性。因此,当多个线程需要交换数据时,就可以使用队列。队列内部已经处理好了加锁,可以很好地解决线程间的数据交换问题,保证数据的安全性和一致性。
from queue import Queue
import threading
q1 = Queue()

# Marker the producer enqueues after its last item. The consumer stops when
# it sees this marker instead of guessing from a momentarily empty queue,
# which would be a race between the two threads.
SENTINEL = object()


def put_data(i, count=5):
    """Producer: enqueue ``count`` consecutive integers starting at ``i``.

    After the last integer, ``SENTINEL`` is enqueued so that ``get_data``
    knows no more items will follow.

    Args:
        i: First integer to put on ``q1``.
        count: How many consecutive integers to enqueue (default 5).
    """
    for value in range(i, i + count):
        q1.put(value)
        print(f'放入的是{value}')
    q1.put(SENTINEL)


def get_data():
    """Consumer: print items taken from ``q1`` until ``SENTINEL`` arrives.

    ``q1.get()`` blocks while the queue is empty, so the consumer simply
    waits whenever it is faster than the producer.
    """
    while True:
        num = q1.get()
        if num is SENTINEL:
            break
        print('取出的数据是:', num)


if __name__ == '__main__':
    # args passes positional arguments to the target and must be a tuple.
    t1 = threading.Thread(target=put_data, args=(5,))
    t2 = threading.Thread(target=get_data)
    t1.start()
    t2.start()
    # Wait for both workers so the script ends only after all data is handled.
    t1.join()
    t2.join()
生产者和消费者模式
产生数据的模块,就形象地称为生产者;
而处理数据的模块,就称为消费者。
单单抽象出生产者和消费者,还够不上是生产者/消费者模式。该模式还需要有一个缓冲区处于生产者和消费者之间,作为一个中介。生产者把数据放入缓冲区,而消费者从缓冲区取出数据。
缓冲区的作用:如果生产者制造数据的速度时快时慢,缓冲区的好处就体现出来了。当数据制造得快、消费者来不及处理时,未处理的数据可以暂时存放在缓冲区中;等生产者的制造速度慢下来,消费者再慢慢把它们处理掉。
单线程模式
import requests
from urllib.request import urlretrieve
# HTTP request headers sent with every search request (presumably copied
# from a browser session — confirm before reuse).
# Values that themselves contain double quotes are written as single-quoted
# literals; with bare nested double quotes the dict does not parse.
headers = {
    "Accept": "application/json",
    "Accept-Language": "zh-CN,zh;q=0.9",
    "Connection": "keep-alive",
    "Content-Type": "application/json",
    "Referer": "https://www.vcg.com/creative-image/beijing/?page=4",
    "Sec-Fetch-Dest": "empty",
    "Sec-Fetch-Mode": "cors",
    "Sec-Fetch-Site": "same-origin",
    "User-Agent": "Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Mobile Safari/537.36",
    "sec": "$2a$08$o0b4q1YNVxqUSAp6NNwzy.lzDSXoDZUS.2vfXq9SzEkB/Cn47rqda",
    "sec-ch-ua": '"Not_A Brand";v="8", "Chromium";v="120", "Google Chrome";v="120"',
    "sec-ch-ua-mobile": "?1",
    "sec-ch-ua-platform": '"Android"',
}
# Cookies sent along with every search request; passed to requests.get()
# through its `cookies` argument further down.
# NOTE(review): entries such as "api_token", "fingerprint" and "uuid" look
# like values captured from a real browser session — presumably expired;
# confirm, and prefer loading them from local configuration instead of
# publishing them in source.
cookies = {
"uuid": "9952bdf7-b943-4e82-8216-28c5c4a8b24b",
"clientIp": "219.152.36.193",
"sajssdk_2015_cross_new_user": "1",
"Hm_lvt_5fd2e010217c332a79f6f3c527df12e9": "1704355817",
"abBoss3": "1.0",
"FZ_STROAGE.vcg.com": "eyJTRUVTSU9OSUQiOiIxODk0ZTI5M2UwYWZkMzljIiwiU0VFU0lPTkRBVEUiOjE3MDQzNTcwNzEwMzl9",
"ARK_ID": "undefined",
"api_token": "ST-714-4dbed77ac46d5929674c3f570a81a8134",
"name": "",
"sensorsdata2015jssdkcross": "%7B%22distinct_id%22%3A%223b79827d64ddeb5723d3bea956f5b7348%22%2C%22first_id%22%3A%2218cd385e3801b1-0659b0131a7a58c-26001951-2073600-18cd385e381142c%22%2C%22props%22%3A%7B%22%24latest_traffic_source_type%22%3A%22%E8%87%AA%E7%84%B6%E6%90%9C%E7%B4%A2%E6%B5%81%E9%87%8F%22%2C%22%24latest_search_keyword%22%3A%22%E6%9C%AA%E5%8F%96%E5%88%B0%E5%80%BC%22%2C%22%24latest_referrer%22%3A%22https%3A%2F%2Fwww.baidu.com%2Flink%22%7D%2C%22%24device_id%22%3A%2218cd385e3801b1-0659b0131a7a58c-26001951-2073600-18cd385e381142c%22%7D",
"acw_sc__v2": "659675f0c82afcb5c3cf4af1b87762a7d65d401f",
"Hm_lpvt_5fd2e010217c332a79f6f3c527df12e9": "1704359493",
"_fp_": "eyJpcCI6IjIxOS4xNTIuMzYuMTkzIiwiZnAiOiIxMGViZGJlNTJkZjdjNWUzYjI2ZGFjNGM4MTM3NjFkYSIsImhzIjoiJDJhJDA4JDIyL3d5dnNIR0pvMFh5SHQ5VlZZdnVIS0k4ZDhnbXljdUx1ZVpLZVFnWk11ZGkxa2x6RVd1In0%3D",
"fingerprint": "10ebdbe52df7c5e3b26dac4c813761da",
"acw_tc": "276077dd17043596608001226e204e24a7e4cbd2cf5c47da69c784b34ddb2a",
"ssxmod_itna": "YqGxBD2iD=qWqBIx0LP8+GQ0QtitdzOFTkei7DlhBQxA5D8D6DQeGTTRdKeTPN5GmRxh=7lW7cx4LiRWmvWQM1x0aDbqGkkAGG4GGjxBYDQxAYDGDDPDopPD1D3qDkD7x6kguaqi3DEGKDaxDbDie8+xGCDeKD0xuaDQKDuhKD97R8eDDzeBoxzDGin7DGPza+2PgkkWY4YGKD9=oDsgD6eGFwczdxya7LXSH3+x0kPq0OuP5zOPoDUjFzngvNdmRDdQrNq70etjRKY7DYPOE9YGe4IQ0S8y2Kz7DIFFDKT7gZeDiTCwhDD=",
"ssxmod_itna2": "YqGxBD2iD=qWqBIx0LP8+GQ0QtitdzOFTkeiDnxnKxDsADwaAQjcbDjRG5et4G=VFnkKh86iAwvXGq2ox08DewGD"
}
import os

url = "https://www.vcg.com/api/common/searchAllImage"

# urlretrieve() does not create missing directories, so make sure the
# download folder exists before the first image is saved.
save_dir = '视觉中国-单线程'
os.makedirs(save_dir, exist_ok=True)

# Fetch result pages 1 and 2 one after another and download every image.
for page in range(1, 3):
    params = {
        "page": f"{page}",
        "phrase": "背景",
        "transform": "beijing",
        "uuid": "F4N5S94_68e6027de5004818237a2c20592e941b",
        "productId": "400197"
    }
    # The endpoint answers with JSON; the image records are under 'list'.
    response = requests.get(url, headers=headers, cookies=cookies, params=params).json()
    data = response['list']
    # Use a separate index variable (the page loop variable must not be
    # reused) and put the page number into the file name, so images with the
    # same title and position on different pages do not overwrite each other.
    for index, item in enumerate(data):
        title = item['title']
        img = "https:" + item['url800']
        urlretrieve(img, f'{save_dir}/{title}{page}_{index}.jpg')
以上就是利用单线程模式对【视觉中国】进行的爬取。因为该网站是一个动态网页,所以可以利用spidertools工具模拟网页的接口请求来获取数据,只需注意接口返回的数据是json格式即可。利用单线程模式同样可以完成内容的爬取和下载,但缺点是各个请求和下载只能依次进行,大量时间耗费在等待网络响应上,整体耗时较长。
多线程模式(生产者消费者模式)
import requests
from urllib.request import urlretrieve
import threading
from queue import Queue
#生产者
class Product_img(threading.Thread):
    """Producer thread: turn queued page parameters into image download jobs.

    Each thread repeatedly takes one parameter dict from the module-level
    ``url_q``, requests the module-level ``url`` with it, and puts one
    ``(title, image_url)`` tuple per result on the module-level ``img_q``.
    The thread ends once ``url_q`` has no more entries.
    """

    # Request headers (presumably copied from a browser session — confirm
    # before reuse). Built once instead of on every loop iteration.
    # Values containing double quotes use single-quoted literals; with bare
    # nested double quotes the dict does not parse.
    HEADERS = {
        "authority": "sensorsdata.vcg.com",
        "accept": "image/avif,image/webp,image/apng,image/svg+xml,image/*,*/*;q=0.8",
        "accept-language": "zh-CN,zh;q=0.9",
        "if-modified-since": "Fri, 21 Jul 2023 08:25:07 GMT",
        "if-none-match": '"0BAC8A231F2D76EC36891D213F543188"',
        "referer": "https://www.vcg.com/",
        "sec-ch-ua": '"Not_A Brand";v="8", "Chromium";v="120", "Google Chrome";v="120"',
        "sec-ch-ua-mobile": "?0",
        "sec-ch-ua-platform": '"Windows"',
        "sec-fetch-dest": "image",
        "sec-fetch-mode": "no-cors",
        "sec-fetch-site": "same-site",
        "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
        "Referer": "https://www.vcg.com/",
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
        "Accept": "application/json",
        "Accept-Language": "zh-CN,zh;q=0.9",
        "Connection": "keep-alive",
        "Sec-Fetch-Dest": "empty",
        "Sec-Fetch-Mode": "cors",
        "Sec-Fetch-Site": "same-origin",
        # One cookie pair per literal; adjacent literals are concatenated
        # into a single header value.
        "cookie": (
            "uuid=8f9ad52f-28e3-4101-a663-ad7ed2f73e1e; "
            "clientIp=183.226.116.146; "
            "sajssdk_2015_cross_new_user=1; "
            "fingerprint=6fdf72f57da351f4b3c9a5aa4e634a92; "
            "Hm_lvt_5fd2e010217c332a79f6f3c527df12e9=1704371387; "
            "acw_sc__v2=6596a4a93931aa06acb6661ce35d3d0bfed61a4b; "
            "api_token=ST-348-61f5b4991429d93b76a326d2033831146; "
            "abBoss3=1.0; "
            "name=; "
            "sensorsdata2015jssdkcross=%7B%22distinct_id%22%3A%223b79827d64ddeb5723d3bea956f5b7348%22%2C%22first_id%22%3A%2218cd4735e03f1b-0263075fde49bea-26001951-1327104-18cd4735e041178%22%2C%22props%22%3A%7B%22%24latest_traffic_source_type%22%3A%22%E8%87%AA%E7%84%B6%E6%90%9C%E7%B4%A2%E6%B5%81%E9%87%8F%22%2C%22%24latest_search_keyword%22%3A%22%E6%9C%AA%E5%8F%96%E5%88%B0%E5%80%BC%22%2C%22%24latest_referrer%22%3A%22https%3A%2F%2Fwww.baidu.com%2Flink%22%7D%2C%22%24device_id%22%3A%2218cd4735e03f1b-0263075fde49bea-26001951-1327104-18cd4735e041178%22%7D; "
            "Hm_lpvt_5fd2e010217c332a79f6f3c527df12e9=1704371423; "
            "_fp_=eyJpcCI6IjE4My4yMjYuMTE2LjE0NiIsImZwIjoiNmZkZjcyZjU3ZGEzNTFmNGIzYzlhNWFhNGU2MzRhOTIiLCJocyI6IiQyYSQwOCR2UVk1MVlUMWJuLkFzbm5PMHFrVmdlYi5JdDFEeVk1T2JxT3N4Ukh5RUJqNEwuMFRwOXZJMiJ9; "
            "ssxmod_itna=eqGxnDgDcDu73D5e0du7tDyC7ghxiKreDkI+Q5DBkiO4iNDnD8x7YDvII=GYIDoDbxxYOW=mxIKmWxhmxaOWDbPEPfYG1bbDHxY=DU2DneYTDen=D5xGoDPxDeDADYo0DAqiOD7qDd06TXZmqDEDYPBDA3Di4D+GT=DmqG0DDU7l4G2D7UgYDDlYiW4iDDhjD2KZ3RlRltK8CGxGqDM9eGXWBF4GLbCcpGV8eFO=WPcDB=mxBjZRqi0ReDHIdi/Ql5AD2GolxeuWGPKKi4qYGhv0YweKiePRGQK8Dxvi5YQSKs4DDf4U0jxYD===; "
            "ssxmod_itna2=eqGxnDgDcDu73D5e0du7tDyC7ghxiKreDkI+Yx8d1GrxGN0hxGaDor9A5W0uzxn4q8DuiM5iiehiqb08Do1QARA=EbnAumOyOBorEYDHmKojDaikQHLvx4aC0hpskAdkSbOj1iaieiqQ0PIGm15Qx3mHXAB1VABiwPEO=bnikZyWe4n1Obm8Yr5IAPSCGzakqheWz+9wmaTsneYNFRpT3fmYmEemeaumHzaGxzI+c7dRn0DxGak+egM6o5=htxpb3qlWSzPWclq1yexT1DuUTO/y3=nOqYpLqGD=ygHQy6ydQAm8tgRtI4DqnV9uaBTDdA=eYZIKqmbB+hrG9dI56I4NUY9WbG/xZ0aPEmT7RczhrSnWmR+KorCp8ig5QOdaHh/bx3eQIrhKu5hYYmEaCRdZydWEEzEW/vn7WaFaQF3qUpMAFp8St/O992DfBbuPWs3H/+a6PBFu/FSOf+jz2dT9FV7=Qi3D07iGeeDL1QeQTo9PXaQCOikYAe74x9jFiYOYBq9TxODkoT2MG2KDrwgIO0R9GoGDRXdCK0YzIeqGoOGeY9d1IeOYqfh5oD0L9K6/c5hxD7=DYKxeD==="
        ),
        "Content-Type": "application/json",
        "If-None-Match": 'W/"27-0FSKyI2Ex7tiw1CTIT4da7WD/x4"',
        "sec": "$2a$08$FTRVClzf6hh2yu4n/.g3yeo6jVWIeilfayXUp5uyYBCSm4MA9Eh4G",
        "Content-Length": "0",
        "Origin": "https://www.vcg.com"
    }

    # Cookies passed through the `cookies` argument of requests.get().
    # NOTE(review): these look like captured session values — presumably
    # expired; confirm, and avoid publishing live tokens in source.
    COOKIES = {
        "acw_tc": "276077e317043713558224202ed96b634384be82b94e872cec74da2dcfb4a7",
        "uuid": "8f9ad52f-28e3-4101-a663-ad7ed2f73e1e",
        "clientIp": "183.226.116.146",
        "sajssdk_2015_cross_new_user": "1",
        "b-user-id": "e4103044-b312-4c54-fd42-ff5f6edb6fec",
        "fingerprint": "6fdf72f57da351f4b3c9a5aa4e634a92",
        "Hm_lvt_5fd2e010217c332a79f6f3c527df12e9": "1704371387",
        "acw_sc__v2": "6596a4a93931aa06acb6661ce35d3d0bfed61a4b",
        "api_token": "ST-348-61f5b4991429d93b76a326d2033831146",
        "abBoss3": "1.0",
        "name": "",
        "sensorsdata2015jssdkcross": "%7B%22distinct_id%22%3A%223b79827d64ddeb5723d3bea956f5b7348%22%2C%22first_id%22%3A%2218cd4735e03f1b-0263075fde49bea-26001951-1327104-18cd4735e041178%22%2C%22props%22%3A%7B%22%24latest_traffic_source_type%22%3A%22%E8%87%AA%E7%84%B6%E6%90%9C%E7%B4%A2%E6%B5%81%E9%87%8F%22%2C%22%24latest_search_keyword%22%3A%22%E6%9C%AA%E5%8F%96%E5%88%B0%E5%80%BC%22%2C%22%24latest_referrer%22%3A%22https%3A%2F%2Fwww.baidu.com%2Flink%22%7D%2C%22%24device_id%22%3A%2218cd4735e03f1b-0263075fde49bea-26001951-1327104-18cd4735e041178%22%7D",
        "Hm_lpvt_5fd2e010217c332a79f6f3c527df12e9": "1704371423",
        "_fp_": "eyJpcCI6IjE4My4yMjYuMTE2LjE0NiIsImZwIjoiNmZkZjcyZjU3ZGEzNTFmNGIzYzlhNWFhNGU2MzRhOTIiLCJocyI6IiQyYSQwOCQ1aE9EMVYvRk5Ub1EzSmYyM1J3MVl1eGJ1bmFrZjB6Q05HamREU0o4TzRLSGltWW1obk9PUyJ9",
        "ssxmod_itna": "eqGxnDgDcDu73D5e0du7tDyC7ghxiKreDkI8W0DBkYPiNDnD8x7YDvIm=GYIDoDEGDmQfeRDTqbfGiRGfQ3DE+n4aeUtxoGLDmKDy7D84moD4RKGwD0eG+DD4DWDmnHDnxAQDjxGpycuTXBDi3Dbg=Df4DmDGAybqDgDYQDGqIUD7QDITt4DDN44pDeDiYdtm2UGeNQ6wxK57GyD0t=xBdPjpGyAl9Co0NKs=94ao=DzM7DtTNl/MdQx0PgDCX4ZgoiQ7+zDEQP7hhxm0D470oQRYhxerhQKwhKiAEuVxqUXvDDGfGd3kDxD",
        "ssxmod_itna2": "eqGxnDgDcDu73D5e0du7tDyC7ghxiKreDkI8WD8d1GrxGXQqqGa7YAkoZz2x8OD6QZAxQ5EH/CiqWKPNuribRFPhiTxhmGYen54inziPRX4Y0b=T2x3bqn=0cPA8tB0XPrwnHCnqKYE8SwKm/oQi4QYCKAjH7Yvn7Yjx4kANeTAHUR=mKQYqzhb=hnvLGBPXfmo67Q=XbmKeY4Yb4fDWVEQOk6A0iAykh2alUpICweNY6m380xap7357Q45l4wr6lxMyBK8Y6tEbq48U2kiY5GNb3fKl7v0+Vh1AxjYSESTYdtUf3tQBCjmCPGet4gwcl6+jILOPx8es0UKK2540eUpPIwixr9=dYUK/DiyBrDRt/Bpt3UQRvx6KNF392GSxiUlKW8u9pIsmUiZqyrdY=L1FYGFrsEoYFwRaeyAj+DEpKNabAAUXIgqp+rj+WaZ0P==8PKm6=ID+LRQuYL++QC7Y9D9nzwI2Ah1nhm99idRpfwIynzyw+fUIhWDG2GDoYGOdGi4h=ek+Hl1RDpiioG344Pwkn6i05frkRGGXPeSlv2Rj7jQfxYUbTiCfUwxFTTjcaRDonQwQGLyhTIF4+6KM1K/1K7DDFqD+rDxD"
    }

    def run(self) -> None:
        """Consume ``url_q`` until it is empty, feeding results into ``img_q``."""
        from queue import Empty

        while True:
            # Checking empty() and then calling a blocking get() is a race:
            # another producer can take the last entry in between and leave
            # this thread blocked forever. get_nowait() takes an entry or
            # reports emptiness in one step.
            try:
                params = url_q.get_nowait()
            except Empty:
                break
            print('-------取到一个参数')
            response = requests.get(
                url, headers=self.HEADERS, cookies=self.COOKIES, params=params
            ).json()
            # The image records are under the 'list' key of the JSON reply.
            for item in response['list']:
                title = item['title']
                img = "https:" + item['url800']
                # Each put() hands over one job; a tuple carries both fields.
                img_q.put((title, img))
                print('解析到的结果', img)
#消费者
class Custmor_img(threading.Thread):
    """Consumer thread: download the images queued on the module-level ``img_q``.

    Every job is a ``(title, image_url)`` tuple. Files are numbered with the
    module-level counter ``num``, which all consumer threads share.

    An alternative design would override ``__init__(self, num)``, call
    ``super().__init__()`` and keep the counter on the instance; the shared
    module-level counter is used here instead.
    """

    # Seconds a consumer waits for a new job before concluding that the
    # producers are finished. Exiting as soon as the queue merely looks empty
    # would stop consumers while producers are still fetching pages, and a
    # plain blocking get() could leave a consumer waiting forever.
    IDLE_TIMEOUT = 10

    # Guards the read-and-increment of ``num`` so two threads can never be
    # handed the same number (and therefore the same file name).
    _num_lock = threading.Lock()

    def run(self) -> None:
        """Download queued images until no job arrives within IDLE_TIMEOUT."""
        global num
        from queue import Empty

        while True:
            try:
                title, img = img_q.get(timeout=self.IDLE_TIMEOUT)
            except Empty:
                break
            print('获取到数据,开始下载')
            with self._num_lock:
                index = num
                num += 1
            urlretrieve(img, f'视觉中国-多线程/{title}{index}.jpg')
if __name__ == '__main__':
    import os

    # Number of result pages to crawl; the request queue is sized to match.
    page_count = 5
    url_q = Queue(page_count)  # request queue: one parameter dict per page
    img_q = Queue()            # job queue: (title, image_url) tuples
    url = "https://www.vcg.com/api/common/searchAllImage"

    # Put the parameters for pages 1..page_count on the request queue.
    # (range(1, 5) would enqueue only four pages.)
    for i in range(1, page_count + 1):
        params = {
            "page": f"{i}",
            "phrase": "背景",
            "transform": "beijing",
            "uuid": "F4N5S94_68e6027de5004818237a2c20592e941b",
            "productId": "400197"
        }
        url_q.put(params)
    print('网站队列的长度:', url_q.qsize())

    # urlretrieve() does not create missing directories.
    os.makedirs('视觉中国-多线程', exist_ok=True)

    # Shared counter the consumers use to number the downloaded files.
    num = 1

    workers = []
    # Start three producers; start() runs Product_img.run in a new thread.
    for _ in range(3):
        p1 = Product_img()
        p1.start()
        workers.append(p1)
    # Start three consumers; start() runs Custmor_img.run in a new thread.
    for _ in range(3):
        c1 = Custmor_img()
        c1.start()
        workers.append(c1)

    # Wait for every worker so the script ends only when all work is done.
    for worker in workers:
        worker.join()