线程与进程(二)

216 阅读7分钟

线程安全队列

Python内置了一个线程安全的模块叫做queue模块。Python中的queue模块中提供了同步的、线程安全的队列类,包括FIFO(先进先出)队列Queue,LIFO(后入先出)队列LifoQueue。这些队列都实现了锁原语(可以理解为原子操作,即要么不做,要么都做完),能够在多线程中直接使用。可以使用队列来实现线程间的同步。

队列的常用方法

| 函数 | 描述 |
| --- | --- |
| qsize() | 返回队列大小 |
| empty() | 判断队列是否为空 |
| full() | 判断队列是否满了 |
| get() | 从队列中取出一个先前放入的数据 |
| put() | 将一个数据放到队列中 |
from queue import Queue

# Bounded FIFO queue holding at most 5 items; without an argument the queue is unbounded.
q = Queue(maxsize=5)
# Fill the queue exactly to capacity. range(1, 7) would block here, because it
# yields 6 values and the sixth put() has to wait for a free slot.
for i in range(1, 6):
    q.put(i)

# Arguments are evaluated left to right, so this prints, one per line:
# the oldest item (removed by get()), the number of items left, and whether
# the queue is at capacity.
print(q.get(), q.qsize(), q.full(), sep='\n')

# Drain the remaining items, making at most 10 attempts.
for i in range(10):
    if q.empty():
        break
    print(q.get())

在Python中,队列主要用于线程间通信。多个线程之间的数据资源是共享的,如果各线程直接读写这些共享数据,就无法保证数据的安全性和一致性。因此,当多个线程需要进行数据交换时,可以使用队列:队列内部已经实现了加锁,能够很好地解决线程间的数据交换问题,保证线程间数据的安全性和一致性。


from queue import Queue
import threading

# Queue shared by the producer (put_data) and the consumer (get_data).
q1 = Queue()


def put_data(i, count=5):
    """Put ``count`` consecutive integers starting at ``i`` on ``q1``, then signal the end.

    Args:
        i: First integer to produce.
        count: Number of integers to produce (defaults to 5).

    After the last integer a single ``None`` is queued as an end-of-stream
    marker. The amount of data produced therefore no longer depends on how
    quickly a concurrent consumer drains the queue.
    """
    for value in range(i, i + count):
        q1.put(value)
        print(f'放入的是{value}')
    # Tell the consumer that nothing else will arrive.
    q1.put(None)


def get_data():
    """Take items from ``q1`` and print them until the end-of-stream marker arrives.

    Stopping on the ``None`` marker, rather than on ``q1.empty()``, means the
    consumer cannot quit merely because it is momentarily ahead of the producer.
    """
    while True:
        num = q1.get()  # blocks until an item is available
        if num is None:
            break
        print('取出的数据是:', num)

if __name__ == '__main__':
    # ``args`` must be a tuple; its elements are passed positionally to ``target``.
    workers = [
        threading.Thread(target=put_data, args=(5,)),
        threading.Thread(target=get_data),
    ]
    # Start the producer first, then the consumer.
    for worker in workers:
        worker.start()

生产者和消费者模式

产生数据的模块,就形象地称为生产者

而处理数据的模块,就称为消费者

单单抽象出生产者和消费者,还够不上是生产者/消费者模式。该模式还需要有一个缓冲区处于生产者和消费者之间,作为一个中介。生产者把数据放入缓冲区,而消费者从缓冲区取出数据。

缓冲区:如果生产数据的速度时快时慢,缓冲区的好处就体现出来了。当生产者生产数据较快、消费者来不及处理时,未处理的数据可以暂时存放在缓冲区中;等生产者的生产速度慢下来,消费者再慢慢处理掉。

单线程模式

import requests
from urllib.request import urlretrieve

# HTTP request headers sent with every search request.
# NOTE(review): presumably captured from a browser session - confirm they are still valid.
# Values that themselves contain double quotes are written as single-quoted
# literals so the embedded quotes do not terminate the string.
headers = {
    "Accept": "application/json",
    "Accept-Language": "zh-CN,zh;q=0.9",
    "Connection": "keep-alive",
    "Content-Type": "application/json",
    "Referer": "https://www.vcg.com/creative-image/beijing/?page=4",
    "Sec-Fetch-Dest": "empty",
    "Sec-Fetch-Mode": "cors",
    "Sec-Fetch-Site": "same-origin",
    "User-Agent": "Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Mobile Safari/537.36",
    "sec": "$2a$08$o0b4q1YNVxqUSAp6NNwzy.lzDSXoDZUS.2vfXq9SzEkB/Cn47rqda",
    "sec-ch-ua": '"Not_A Brand";v="8", "Chromium";v="120", "Google Chrome";v="120"',
    "sec-ch-ua-mobile": "?1",
    "sec-ch-ua-platform": '"Android"'
}
# Cookies sent with every search request (passed as ``cookies=`` to requests.get).
# NOTE(review): these look like values captured from a single browser session
# (timestamps, tokens, fingerprints) and will presumably expire - confirm and
# refresh them before running the script.
cookies = {
    "uuid": "9952bdf7-b943-4e82-8216-28c5c4a8b24b",
    "clientIp": "219.152.36.193",
    "sajssdk_2015_cross_new_user": "1",
    "Hm_lvt_5fd2e010217c332a79f6f3c527df12e9": "1704355817",
    "abBoss3": "1.0",
    "FZ_STROAGE.vcg.com": "eyJTRUVTSU9OSUQiOiIxODk0ZTI5M2UwYWZkMzljIiwiU0VFU0lPTkRBVEUiOjE3MDQzNTcwNzEwMzl9",
    "ARK_ID": "undefined",
    "api_token": "ST-714-4dbed77ac46d5929674c3f570a81a8134",
    "name": "",
    "sensorsdata2015jssdkcross": "%7B%22distinct_id%22%3A%223b79827d64ddeb5723d3bea956f5b7348%22%2C%22first_id%22%3A%2218cd385e3801b1-0659b0131a7a58c-26001951-2073600-18cd385e381142c%22%2C%22props%22%3A%7B%22%24latest_traffic_source_type%22%3A%22%E8%87%AA%E7%84%B6%E6%90%9C%E7%B4%A2%E6%B5%81%E9%87%8F%22%2C%22%24latest_search_keyword%22%3A%22%E6%9C%AA%E5%8F%96%E5%88%B0%E5%80%BC%22%2C%22%24latest_referrer%22%3A%22https%3A%2F%2Fwww.baidu.com%2Flink%22%7D%2C%22%24device_id%22%3A%2218cd385e3801b1-0659b0131a7a58c-26001951-2073600-18cd385e381142c%22%7D",
    "acw_sc__v2": "659675f0c82afcb5c3cf4af1b87762a7d65d401f",
    "Hm_lpvt_5fd2e010217c332a79f6f3c527df12e9": "1704359493",
    "_fp_": "eyJpcCI6IjIxOS4xNTIuMzYuMTkzIiwiZnAiOiIxMGViZGJlNTJkZjdjNWUzYjI2ZGFjNGM4MTM3NjFkYSIsImhzIjoiJDJhJDA4JDIyL3d5dnNIR0pvMFh5SHQ5VlZZdnVIS0k4ZDhnbXljdUx1ZVpLZVFnWk11ZGkxa2x6RVd1In0%3D",
    "fingerprint": "10ebdbe52df7c5e3b26dac4c813761da",
    "acw_tc": "276077dd17043596608001226e204e24a7e4cbd2cf5c47da69c784b34ddb2a",
    "ssxmod_itna": "YqGxBD2iD=qWqBIx0LP8+GQ0QtitdzOFTkei7DlhBQxA5D8D6DQeGTTRdKeTPN5GmRxh=7lW7cx4LiRWmvWQM1x0aDbqGkkAGG4GGjxBYDQxAYDGDDPDopPD1D3qDkD7x6kguaqi3DEGKDaxDbDie8+xGCDeKD0xuaDQKDuhKD97R8eDDzeBoxzDGin7DGPza+2PgkkWY4YGKD9=oDsgD6eGFwczdxya7LXSH3+x0kPq0OuP5zOPoDUjFzngvNdmRDdQrNq70etjRKY7DYPOE9YGe4IQ0S8y2Kz7DIFFDKT7gZeDiTCwhDD=",
    "ssxmod_itna2": "YqGxBD2iD=qWqBIx0LP8+GQ0QtitdzOFTkeiDnxnKxDsADwaAQjcbDjRG5et4G=VFnkKh86iAwvXGq2ox08DewGD"
}
import os  # only needed here, to create the download directory

url = "https://www.vcg.com/api/common/searchAllImage"
out_dir = '视觉中国-单线程'
# urlretrieve does not create missing directories, so make sure the target exists.
os.makedirs(out_dir, exist_ok=True)

# Fetch result pages 1 and 2 and download every image they list.
for page in range(1, 3):
    params = {
        "page": f"{page}",
        "phrase": "背景",
        "transform": "beijing",
        "uuid": "F4N5S94_68e6027de5004818237a2c20592e941b",
        "productId": "400197"
    }
    # The timeout keeps an unresponsive server from stalling the script forever.
    response = requests.get(url, headers=headers, cookies=cookies, params=params, timeout=10).json()

    # ``i`` is the image's index within the current page; the page counter has
    # its own name so the two loops no longer share a variable.
    for i, item in enumerate(response['list']):
        # Path separators inside a title would otherwise be read as directories.
        title = str(item['title']).replace('/', '_').replace('\\', '_')
        img = "https:" + item['url800']

        urlretrieve(img, f'{out_dir}/{title}{i}.jpg')

以上就是利用单线程模式对【视觉中国】进行的爬取。因为该网站是一个动态网页,所以可以利用spidertools工具根据网页的响应生成请求代码来获取数据,只需注意接口返回的数据是json格式即可。利用单线程模式也可以实现内容的爬取和下载,但缺点是耗时太多:所有请求和下载只能一个接一个地串行执行,大部分时间都浪费在等待网络响应上。

多线程模式(生产者消费者模式)

import requests
from urllib.request import urlretrieve
import threading
from queue import Queue


# Producer
class Product_img(threading.Thread):
    """Producer thread: fetches search-result pages and queues image jobs.

    Each thread repeatedly takes one query-parameter dict from the global
    ``url_q``, requests the global ``url`` with it, and puts one
    ``(title, image_url)`` tuple on the global ``img_q`` for every entry of the
    response's ``list``. The thread ends once ``url_q`` has no more work.
    """

    # Built once when the class is created instead of on every loop iteration.
    # Values that contain double quotes use single-quoted literals so the
    # embedded quotes do not terminate the string.
    # NOTE(review): the conditional headers (if-modified-since / if-none-match)
    # may make the server answer without a body, which would break .json() -
    # confirm they are really wanted.
    HEADERS = {
        "authority": "sensorsdata.vcg.com",
        "accept": "image/avif,image/webp,image/apng,image/svg+xml,image/*,*/*;q=0.8",
        "accept-language": "zh-CN,zh;q=0.9",
        "if-modified-since": "Fri, 21 Jul 2023 08:25:07 GMT",
        "if-none-match": '"0BAC8A231F2D76EC36891D213F543188"',
        "referer": "https://www.vcg.com/",
        "sec-ch-ua": '"Not_A Brand";v="8", "Chromium";v="120", "Google Chrome";v="120"',
        "sec-ch-ua-mobile": "?0",
        "sec-ch-ua-platform": '"Windows"',
        "sec-fetch-dest": "image",
        "sec-fetch-mode": "no-cors",
        "sec-fetch-site": "same-site",
        "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
        "Referer": "https://www.vcg.com/",
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
        "Accept": "application/json",
        "Accept-Language": "zh-CN,zh;q=0.9",
        "Connection": "keep-alive",
        "Sec-Fetch-Dest": "empty",
        "Sec-Fetch-Mode": "cors",
        "Sec-Fetch-Site": "same-origin",
        # One header value; split into two adjacent literals only for line length.
        "cookie": (
            "uuid=8f9ad52f-28e3-4101-a663-ad7ed2f73e1e; clientIp=183.226.116.146; sajssdk_2015_cross_new_user=1; fingerprint=6fdf72f57da351f4b3c9a5aa4e634a92; Hm_lvt_5fd2e010217c332a79f6f3c527df12e9=1704371387; acw_sc__v2=6596a4a93931aa06acb6661ce35d3d0bfed61a4b; api_token=ST-348-61f5b4991429d93b76a326d2033831146; abBoss3=1.0; name=; sensorsdata2015jssdkcross=%7B%22distinct_id%22%3A%223b79827d64ddeb5723d3bea956f5b7348%22%2C%22first_id%22%3A%2218cd4735e03f1b-0263075fde49bea-26001951-1327104-18cd4735e041178%22%2C%22props%22%3A%7B%22%24latest_traffic_source_type%22%3A%22%E8%87%AA%E7%84%B6%E6%90%9C%E7%B4%A2%E6%B5%81%E9%87%8F%22%2C%22%24latest_search_keyword%22%3A%22%E6%9C%AA%E5%8F%96%E5%88%B0%E5%80%BC%22%2C%22%24latest_referrer%22%3A%22https%3A%2F%2Fwww.baidu.com%2Flink%22%7D%2C%22%24device_id%22%3A%2218cd4735e03f1b-0263075fde49bea-26001951-1327104-18cd4735e041178%22%7D; Hm_lpvt_5fd2e010217c332a79f6f3c527df12e9=1704371423; _fp_=eyJpcCI6IjE4My4yMjYuMTE2LjE0NiIsImZwIjoiNmZkZjcyZjU3ZGEzNTFmNGIzYzlhNWFhNGU2MzRhOTIiLCJocyI6IiQyYSQwOCR2UVk1MVlUMWJuLkFzbm5PMHFrVmdlYi5JdDFEeVk1T2JxT3N4Ukh5RUJqNEwuMFRwOXZJMiJ9; ssxmod_itna=eqGxnDgDcDu73D5e0du7tDyC7ghxiKreDkI+Q5DBkiO4iNDnD8x7YDvII=GYIDoDbxxYOW=mxIKmWxhmxaOWDbPEPfYG1bbDHxY=DU2DneYTDen=D5xGoDPxDeDADYo0DAqiOD7qDd06TXZmqDEDYPBDA3Di4D+GT=DmqG0DDU7l4G2D7UgYDDlYiW4iDDhjD2KZ3RlRltK8CGxGqDM9eGXWBF4GLbCcpGV8eFO=WPcDB=mxBjZRqi0ReDHIdi/Ql5AD2GolxeuWGPKKi4qYGhv0YweKiePRGQK8Dxvi5YQSKs4DDf4U0jxYD===; "
            "ssxmod_itna2=eqGxnDgDcDu73D5e0du7tDyC7ghxiKreDkI+Yx8d1GrxGN0hxGaDor9A5W0uzxn4q8DuiM5iiehiqb08Do1QARA=EbnAumOyOBorEYDHmKojDaikQHLvx4aC0hpskAdkSbOj1iaieiqQ0PIGm15Qx3mHXAB1VABiwPEO=bnikZyWe4n1Obm8Yr5IAPSCGzakqheWz+9wmaTsneYNFRpT3fmYmEemeaumHzaGxzI+c7dRn0DxGak+egM6o5=htxpb3qlWSzPWclq1yexT1DuUTO/y3=nOqYpLqGD=ygHQy6ydQAm8tgRtI4DqnV9uaBTDdA=eYZIKqmbB+hrG9dI56I4NUY9WbG/xZ0aPEmT7RczhrSnWmR+KorCp8ig5QOdaHh/bx3eQIrhKu5hYYmEaCRdZydWEEzEW/vn7WaFaQF3qUpMAFp8St/O992DfBbuPWs3H/+a6PBFu/FSOf+jz2dT9FV7=Qi3D07iGeeDL1QeQTo9PXaQCOikYAe74x9jFiYOYBq9TxODkoT2MG2KDrwgIO0R9GoGDRXdCK0YzIeqGoOGeY9d1IeOYqfh5oD0L9K6/c5hxD7=DYKxeD==="
        ),
        "Content-Type": "application/json",
        "If-None-Match": 'W/"27-0FSKyI2Ex7tiw1CTIT4da7WD/x4"',
        "sec": "$2a$08$FTRVClzf6hh2yu4n/.g3yeo6jVWIeilfayXUp5uyYBCSm4MA9Eh4G",
        "Content-Length": "0",
        "Origin": "https://www.vcg.com"
    }
    COOKIES = {
        "acw_tc": "276077e317043713558224202ed96b634384be82b94e872cec74da2dcfb4a7",
        "uuid": "8f9ad52f-28e3-4101-a663-ad7ed2f73e1e",
        "clientIp": "183.226.116.146",
        "sajssdk_2015_cross_new_user": "1",
        "b-user-id": "e4103044-b312-4c54-fd42-ff5f6edb6fec",
        "fingerprint": "6fdf72f57da351f4b3c9a5aa4e634a92",
        "Hm_lvt_5fd2e010217c332a79f6f3c527df12e9": "1704371387",
        "acw_sc__v2": "6596a4a93931aa06acb6661ce35d3d0bfed61a4b",
        "api_token": "ST-348-61f5b4991429d93b76a326d2033831146",
        "abBoss3": "1.0",
        "name": "",
        "sensorsdata2015jssdkcross": "%7B%22distinct_id%22%3A%223b79827d64ddeb5723d3bea956f5b7348%22%2C%22first_id%22%3A%2218cd4735e03f1b-0263075fde49bea-26001951-1327104-18cd4735e041178%22%2C%22props%22%3A%7B%22%24latest_traffic_source_type%22%3A%22%E8%87%AA%E7%84%B6%E6%90%9C%E7%B4%A2%E6%B5%81%E9%87%8F%22%2C%22%24latest_search_keyword%22%3A%22%E6%9C%AA%E5%8F%96%E5%88%B0%E5%80%BC%22%2C%22%24latest_referrer%22%3A%22https%3A%2F%2Fwww.baidu.com%2Flink%22%7D%2C%22%24device_id%22%3A%2218cd4735e03f1b-0263075fde49bea-26001951-1327104-18cd4735e041178%22%7D",
        "Hm_lpvt_5fd2e010217c332a79f6f3c527df12e9": "1704371423",
        "_fp_": "eyJpcCI6IjE4My4yMjYuMTE2LjE0NiIsImZwIjoiNmZkZjcyZjU3ZGEzNTFmNGIzYzlhNWFhNGU2MzRhOTIiLCJocyI6IiQyYSQwOCQ1aE9EMVYvRk5Ub1EzSmYyM1J3MVl1eGJ1bmFrZjB6Q05HamREU0o4TzRLSGltWW1obk9PUyJ9",
        "ssxmod_itna": "eqGxnDgDcDu73D5e0du7tDyC7ghxiKreDkI8W0DBkYPiNDnD8x7YDvIm=GYIDoDEGDmQfeRDTqbfGiRGfQ3DE+n4aeUtxoGLDmKDy7D84moD4RKGwD0eG+DD4DWDmnHDnxAQDjxGpycuTXBDi3Dbg=Df4DmDGAybqDgDYQDGqIUD7QDITt4DDN44pDeDiYdtm2UGeNQ6wxK57GyD0t=xBdPjpGyAl9Co0NKs=94ao=DzM7DtTNl/MdQx0PgDCX4ZgoiQ7+zDEQP7hhxm0D470oQRYhxerhQKwhKiAEuVxqUXvDDGfGd3kDxD",
        "ssxmod_itna2": "eqGxnDgDcDu73D5e0du7tDyC7ghxiKreDkI8WD8d1GrxGXQqqGa7YAkoZz2x8OD6QZAxQ5EH/CiqWKPNuribRFPhiTxhmGYen54inziPRX4Y0b=T2x3bqn=0cPA8tB0XPrwnHCnqKYE8SwKm/oQi4QYCKAjH7Yvn7Yjx4kANeTAHUR=mKQYqzhb=hnvLGBPXfmo67Q=XbmKeY4Yb4fDWVEQOk6A0iAykh2alUpICweNY6m380xap7357Q45l4wr6lxMyBK8Y6tEbq48U2kiY5GNb3fKl7v0+Vh1AxjYSESTYdtUf3tQBCjmCPGet4gwcl6+jILOPx8es0UKK2540eUpPIwixr9=dYUK/DiyBrDRt/Bpt3UQRvx6KNF392GSxiUlKW8u9pIsmUiZqyrdY=L1FYGFrsEoYFwRaeyAj+DEpKNabAAUXIgqp+rj+WaZ0P==8PKm6=ID+LRQuYL++QC7Y9D9nzwI2Ah1nhm99idRpfwIynzyw+fUIhWDG2GDoYGOdGi4h=ek+Hl1RDpiioG344Pwkn6i05frkRGGXPeSlv2Rj7jQfxYUbTiCfUwxFTTjcaRDonQwQGLyhTIF4+6KM1K/1K7DDFqD+rDxD"
    }

    def run(self) -> None:
        """Process query-parameter dicts from ``url_q`` until none are left."""
        from queue import Empty  # local import: the file only imports Queue

        while True:
            # get_nowait() takes an item or raises Empty in one step. Checking
            # empty() first and calling get() afterwards lets another producer
            # grab the last item in between, leaving this thread blocked forever.
            try:
                params = url_q.get_nowait()
            except Empty:
                break
            print('-------取到一个参数')
            # The timeout keeps an unresponsive server from stalling the thread.
            response = requests.get(
                url, headers=self.HEADERS, cookies=self.COOKIES, params=params, timeout=10
            ).json()

            for item in response['list']:
                title = item['title']
                img = "https:" + item['url800']
                # One put() per image; a tuple keeps the title and URL together.
                img_q.put((title, img))
                print('解析到的结果', img)

# Consumer
class Custmor_img(threading.Thread):
    """Consumer thread: downloads the images queued on the global ``img_q``.

    Every item taken from ``img_q`` is a ``(title, image_url)`` tuple. The image
    is saved as ``<OUT_DIR>/<title><num>.jpg``, where ``num`` is a module-level
    counter shared by all consumer threads (read and advanced via ``global num``).
    """

    # Seconds to wait for new work before concluding that nothing more will come.
    # A consumer may still stop early if producers need longer than this between
    # two images; raise the value for slow connections.
    IDLE_TIMEOUT = 10
    # Directory the downloaded images are written to.
    OUT_DIR = '视觉中国-多线程'
    # Serialises access to the shared ``num`` counter so that two threads can
    # never be handed the same file index.
    _num_lock = threading.Lock()

    @staticmethod
    def _safe_name(title):
        """Return ``title`` as text with path separators replaced by underscores."""
        return str(title).replace('/', '_').replace('\\', '_')

    def run(self) -> None:
        """Download queued images until the queue stays empty for ``IDLE_TIMEOUT`` seconds."""
        global num
        import os  # local imports: neither name is imported at the top of the file
        from queue import Empty

        # urlretrieve does not create missing directories.
        os.makedirs(self.OUT_DIR, exist_ok=True)
        while True:
            # A timed get() replaces the blocking get() + empty() check: it
            # neither hangs forever on an empty queue nor quits just because the
            # queue happens to be empty at one instant.
            try:
                title, img = img_q.get(timeout=self.IDLE_TIMEOUT)
            except Empty:
                break
            print('获取到数据,开始下载')
            # Reserve a unique index before downloading; ``num += 1`` alone is
            # not atomic across threads.
            with self._num_lock:
                index = num
                num += 1
            urlretrieve(img, f'{self.OUT_DIR}/{self._safe_name(title)}{index}.jpg')




if __name__ == '__main__':
    url_q = Queue(5)  # request queue: one query-parameter dict per result page (5 pages)
    img_q = Queue()   # image jobs handed from the producers to the consumers

    url = "https://www.vcg.com/api/common/searchAllImage"
    # Queue the parameters for pages 1-5. range() excludes its upper bound, so
    # the previous range(1, 5) only queued four pages.
    for i in range(1, 6):
        params = {
            "page": f"{i}",
            "phrase": "背景",
            "transform": "beijing",
            "uuid": "F4N5S94_68e6027de5004818237a2c20592e941b",
            "productId": "400197"
        }
        url_q.put(params)
    print('网站队列的长度:', url_q.qsize())

    # Shared download counter that the consumers read through ``global num``;
    # initialised before any thread is started.
    num = 1

    # Start three producers; start() runs Product_img.run() in a new thread.
    producers = [Product_img() for _ in range(3)]
    for producer in producers:
        producer.start()

    # Start three consumers; start() runs Custmor_img.run() in a new thread.
    consumers = [Custmor_img() for _ in range(3)]
    for consumer in consumers:
        consumer.start()

    # Wait for every worker so the script ends only after all work is done.
    for worker in producers + consumers:
        worker.join()