Download: www.pan38.com/share.php?c… Extraction code: 7785 [For learning and reference only]
This spider system consists of four main modules: the main spider program, a configuration file, utility functions, and the program entry point. The main program uses Kuaishou's GraphQL API to fetch data, supports multi-threaded collection, and automatically extracts UIDs and saves them to a file. A valid Cookie and proxy must be configured before use.
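As noted above, nothing will return data without a valid Cookie. A minimal setup sketch (the cookie string is a placeholder, not a working credential; the proxy side is covered in the config section below):

from ks_spider import KuaishouCommentSpider

spider = KuaishouCommentSpider()
spider.headers['Cookie'] = 'your_real_cookie'  # placeholder: paste the cookie copied from your browser
spider.run('美食')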
# ks_spider.py -- main spider module
import requests
import json
import re
import time
from fake_useragent import UserAgent
from concurrent.futures import ThreadPoolExecutor


class KuaishouCommentSpider:
    def __init__(self):
        self.headers = {
            'User-Agent': UserAgent().random,
            'Cookie': 'your_cookie_here'  # replace with a valid Kuaishou cookie
        }
        self.comment_api = "https://www.kuaishou.com/graphql"
        self.uid_pattern = re.compile(r'user_id=(\d+)')  # fallback UID pattern
        self.session = requests.Session()
    def get_video_list(self, keyword, pages=3):
        """Search videos by keyword and collect their photo IDs."""
        video_ids = []
        variables = {'keyword': keyword, 'page': 1, 'size': 20}
        for page in range(1, pages + 1):
            # variables must be re-serialized on every request; the original
            # code mutated the already-dumped JSON string, which raises TypeError
            variables['page'] = page
            params = {
                'operationName': 'visionSearchPhoto',
                'variables': json.dumps(variables)
            }
            response = self.session.get(
                self.comment_api,
                headers=self.headers,
                params=params
            )
            data = response.json()
            video_ids.extend(
                item['photo']['id']
                for item in data['data']['visionSearchPhoto']['feeds']
            )
        return video_ids
    def get_comments(self, video_id, max_comments=1000):
        """Page through the comment list of one video."""
        all_comments = []
        variables = {'photoId': video_id, 'page': 1, 'size': 20}
        while len(all_comments) < max_comments:
            params = {
                'operationName': 'commentListQuery',
                'variables': json.dumps(variables)  # re-serialize each page
            }
            response = self.session.get(
                self.comment_api,
                headers=self.headers,
                params=params
            )
            data = response.json()
            comments = data['data']['commentList']['comments']
            if not comments:
                break
            all_comments.extend(comments)
            variables['page'] += 1  # advance in the dict, then re-dump next loop
            time.sleep(1)  # throttle between pages
        return all_comments
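    # A hedged sketch, not in the original code: neither method above retries a
    # failed request, even though MAX_RETRY / REQUEST_INTERVAL exist in config.py.
    # _get_with_retry is a hypothetical helper name.
    def _get_with_retry(self, params, max_retry=3, interval=1.5):
        for attempt in range(max_retry):
            try:
                response = self.session.get(
                    self.comment_api, headers=self.headers, params=params
                )
                response.raise_for_status()
                return response
            except requests.RequestException:
                if attempt == max_retry - 1:
                    raise
                time.sleep(interval)  # back off before the next attempt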
    def extract_uids(self, comments):
        """Collect unique user IDs from a batch of comments."""
        uids = set()
        for comment in comments:
            if 'user' in comment:
                uids.add(comment['user']['id'])
            elif 'authorId' in comment:
                uids.add(comment['authorId'])
        return list(uids)
    def save_to_file(self, data, filename):
        """Append data to a file: dicts as one JSON object, lists one item per line."""
        with open(filename, 'a', encoding='utf-8') as f:
            if isinstance(data, dict):
                json.dump(data, f, ensure_ascii=False)
                f.write('\n')
            elif isinstance(data, list):
                for item in data:
                    # dump dict items as JSON; the original f-string wrote
                    # Python reprs into the .json file
                    line = json.dumps(item, ensure_ascii=False) if isinstance(item, dict) else str(item)
                    f.write(line + '\n')
    def run(self, keyword):
        print(f"Collecting keyword: {keyword}")
        video_ids = self.get_video_list(keyword)
        print(f"Found {len(video_ids)} videos")

        def process(vid):
            comments = self.get_comments(vid)
            uids = self.extract_uids(comments)
            self.save_to_file(comments, f"{keyword}_comments.json")
            self.save_to_file(uids, f"{keyword}_uids.txt")
            print(f"Video {vid} done: {len(comments)} comments, {len(uids)} UIDs")

        # actually submit the per-video work to the pool; the original loop
        # ran everything sequentially inside the with-block
        with ThreadPoolExecutor(max_workers=5) as executor:
            list(executor.map(process, video_ids))
# config.py -- shared configuration

# Proxy settings
PROXY = {
    'http': 'http://127.0.0.1:1080',
    'https': 'http://127.0.0.1:1080'
}
# Request interval (seconds)
REQUEST_INTERVAL = 1.5
# Maximum retry count
MAX_RETRY = 3
# Output directory
OUTPUT_DIR = './data/'
# Keyword list (Kuaishou search terms)
KEYWORDS = ['美食', '旅游', '科技', '搞笑']
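PROXY and OUTPUT_DIR are declared here but never consumed by the spider itself. A sketch of one way to wire them in, assuming the module layout used in this post:

import os
from config import PROXY, OUTPUT_DIR
from ks_spider import KuaishouCommentSpider

spider = KuaishouCommentSpider()
spider.session.proxies.update(PROXY)    # route every request through the proxy
os.makedirs(OUTPUT_DIR, exist_ok=True)  # make sure ./data/ exists before writing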
# utils.py -- helper functions
import os
import re  # needed by clean_text; missing from the original imports
import hashlib
from datetime import datetime

def create_dir(path):
    """Create a directory if it does not already exist."""
    if not os.path.exists(path):
        os.makedirs(path)

def get_md5(text):
    return hashlib.md5(text.encode('utf-8')).hexdigest()

def format_time(timestamp):
    """Convert a Unix timestamp to a human-readable string."""
    return datetime.fromtimestamp(timestamp).strftime('%Y-%m-%d %H:%M:%S')

def clean_text(text):
    """Strip newlines, tabs and surrounding whitespace."""
    return re.sub(r'[\n\t\r]', '', text).strip()

def check_duplicate(item, cache_set):
    """Return True if item was seen before; otherwise record it and return False."""
    item_id = str(item.get('id', '')) or get_md5(str(item))
    if item_id in cache_set:
        return True
    cache_set.add(item_id)
    return False
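check_duplicate keeps its cache in a caller-supplied set, so deduplicating comments across pages is just a filter. A short usage sketch with made-up comment dicts:

from utils import check_duplicate

seen = set()
comments = [{'id': 1, 'content': 'a'}, {'id': 1, 'content': 'a'}, {'id': 2, 'content': 'b'}]
unique = [c for c in comments if not check_duplicate(c, seen)]
print(unique)  # the second copy of id 1 is filtered out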
# main.py -- program entry point
from ks_spider import KuaishouCommentSpider
from config import KEYWORDS
import time

if __name__ == '__main__':
    spider = KuaishouCommentSpider()
    for keyword in KEYWORDS:
        start_time = time.time()
        spider.run(keyword)
        print(f"Keyword {keyword} finished in {time.time() - start_time:.2f}s")