虾皮获取商品信息

22 阅读1分钟

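`get_goods_info`(虾皮获取商品信息)用于从 Shopee 网站获取商品信息。它接受一个 header 参数(通常包含 User-Agent 和 Cookie 等信息)和一个 Url 参数(默认值为 "shopee.co.id/";若保持默认值,函数会直接返回错误码 0001 "not url",因此实际使用时必须传入商品链接)。如果提供了有效的商品 URL,函数会抓取商品页面,解析标题、运费、规格属性和图片,再请求 /api/v4/pdp/get_pc 接口获取商品属性、型号和描述。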

import requests from bs4 import BeautifulSoup import re import json from urllib.parse import urlparse import uuid

def get_goods_info(header, Url="shopee.co.id/", *args, **kwargs): """ :获取shopee商品信息 """ # 判断URL是否存在 if Url == "shopee.co.id/": return {"code": "0001", "error": "not url"}

response = requests.get(Url, headers=header)
soup = BeautifulSoup(response.text, 'html.parser')
if not response.headers.get("X-Request-ID"):
    return {"code": "0002", "error": "not login"}

data_box = {
    "title": soup.find("div", attrs={"class": "WBVL_7"}),
    "freight": soup.find("div", attrs={"class": "flex items-center PZGOkt"}),
    "attrs": soup.find("div", attrs={"class": "flex KIoPj6 W5LiQM"}),
}
for box in data_box.values():
    if box:
        continue
    return {}

att = {}
for i in data_box["attrs"].findAll("section"):
    if not i.find("div", attrs={"class": "flex items-center j7HL5Q"}):
        continue
    key = i.find("h3").text
    value = []
    for x in i.find("div", attrs={"class": "flex items-center j7HL5Q"}).findAll("button"):
        value.append({"src": x.find("img").get("src") if x.find("img") else None, "attr_name": x.text})
    att[key] = value

img_block = soup.find("div", attrs={"class": "airUhU"}).findAll("div", attrs={"class": "UBG7wZ"})
if not img_block:
    return {}
images = [img.find("img").attrs.get("src") for img in img_block]

# id参数筛选
result = re.search(r'-i.\d+\.\d+|/\d+/\d+', urlparse(Url).path)
if not result:
    return {}
at = re.search(r'\d+\.\d+|\d+/\d+', result.group()).group()
if not at:
    return {}

at = at.split(".") if "." in at else at.split("/")
get_info_response = requests.get("https://shopee.co.id/api/v4/pdp/get_pc", headers=header, params={
    "shop_id": at[0], "item_id": at[1], "detail_level": 0
})
get_info_text = BeautifulSoup(get_info_response.text, 'html.parser')
p_data = json.loads(get_info_text.text)
if not p_data and not len(p_data) >= 400:
    return {}
p_datad = p_data.get("data").get("product_attributes").get("attrs")
models = p_data.get("data").get("item").get("models")
p_datad = [{"name": i.get("name"), "value": i.get("value")} for i in p_datad]
description = p_data.get("data").get("item").get("description")

return {
    "url":Url,
    "title": data_box["title"].text,
    "freight": data_box["freight"].text,
    "attrs_info": att,
    "images": images,
    "info": p_datad,
    "models": models,
    "description": description,