虾皮获取商品信息:函数 get_goods_info 用于从 Shopee 网站获取商品信息。它接受一个 header 参数(通常包含 User-Agent 和 Cookie 等信息)和一个可选的 Url 参数(默认为 "shopee.co.id/")。如果提供了有效的…
import requests from bs4 import BeautifulSoup import re import json from urllib.parse import urlparse import uuid
def get_goods_info(header, Url="shopee.co.id/", *args, **kwargs): """ :获取shopee商品信息 """ # 判断URL是否存在 if Url == "shopee.co.id/": return {"code": "0001", "error": "not url"}
response = requests.get(Url, headers=header)
soup = BeautifulSoup(response.text, 'html.parser')
if not response.headers.get("X-Request-ID"):
return {"code": "0002", "error": "not login"}
data_box = {
"title": soup.find("div", attrs={"class": "WBVL_7"}),
"freight": soup.find("div", attrs={"class": "flex items-center PZGOkt"}),
"attrs": soup.find("div", attrs={"class": "flex KIoPj6 W5LiQM"}),
}
for box in data_box.values():
if box:
continue
return {}
att = {}
for i in data_box["attrs"].findAll("section"):
if not i.find("div", attrs={"class": "flex items-center j7HL5Q"}):
continue
key = i.find("h3").text
value = []
for x in i.find("div", attrs={"class": "flex items-center j7HL5Q"}).findAll("button"):
value.append({"src": x.find("img").get("src") if x.find("img") else None, "attr_name": x.text})
att[key] = value
img_block = soup.find("div", attrs={"class": "airUhU"}).findAll("div", attrs={"class": "UBG7wZ"})
if not img_block:
return {}
images = [img.find("img").attrs.get("src") for img in img_block]
# id参数筛选
result = re.search(r'-i.\d+\.\d+|/\d+/\d+', urlparse(Url).path)
if not result:
return {}
at = re.search(r'\d+\.\d+|\d+/\d+', result.group()).group()
if not at:
return {}
at = at.split(".") if "." in at else at.split("/")
get_info_response = requests.get("https://shopee.co.id/api/v4/pdp/get_pc", headers=header, params={
"shop_id": at[0], "item_id": at[1], "detail_level": 0
})
get_info_text = BeautifulSoup(get_info_response.text, 'html.parser')
p_data = json.loads(get_info_text.text)
if not p_data and not len(p_data) >= 400:
return {}
p_datad = p_data.get("data").get("product_attributes").get("attrs")
models = p_data.get("data").get("item").get("models")
p_datad = [{"name": i.get("name"), "value": i.get("value")} for i in p_datad]
description = p_data.get("data").get("item").get("description")
return {
"url":Url,
"title": data_box["title"].text,
"freight": data_box["freight"].text,
"attrs_info": att,
"images": images,
"info": p_datad,
"models": models,
"description": description,