A Tool for Collecting the Latest Kuaishou Comment-Section Content and Extracting Commenter UIDs: A Python Framework

Download link: www.pan38.com/share.php?c…   Extraction code: 7785  [For study and reference only]

This crawler system consists of four main modules: the main crawler program, a configuration file, utility functions, and the program entry point. The main program fetches data through Kuaishou's GraphQL API, supports multithreaded collection, and automatically extracts UIDs and saves them to files. A valid Cookie and proxy must be configured before use.
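The four modules are presented below in order. The entry script imports from ks_spider and config, so those filenames come from the source itself; the names used here for the utility and entry files (utils.py and main.py) are assumptions for illustration:

ks_spider.py   # KuaishouCommentSpider: video search, comment paging, UID extraction
config.py      # proxy, request interval, retries, output dir, keywords
utils.py       # helpers: dirs, md5, time formatting, de-duplication (assumed name)
main.py        # program entry point (assumed name)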

import requests
import json
import re
import time
from fake_useragent import UserAgent
from concurrent.futures import ThreadPoolExecutor

class KuaishouCommentSpider:
    def __init__(self):
        self.headers = {
            'User-Agent': UserAgent().random,
            'Cookie': 'your_cookie_here'  # replace with a valid logged-in Kuaishou Cookie
        }
        self.comment_api = "https://www.kuaishou.com/graphql"
        # fallback regex for UIDs embedded in profile URLs; the dict-based
        # extractor below does not use it, kept here for reference
        self.uid_pattern = re.compile(r'user_id=(\d+)')
        self.session = requests.Session()
        
    def get_video_list(self, keyword, pages=3):
        """Search videos by keyword and collect their photo IDs."""
        video_ids = []
        for page in range(1, pages + 1):
            # 'variables' must be re-encoded as JSON on every request; mutating
            # the already-dumped string (as the original code did) raises TypeError
            params = {
                'operationName': 'visionSearchPhoto',
                'variables': json.dumps({
                    'keyword': keyword,
                    'page': page,
                    'size': 20
                })
            }
            response = self.session.get(
                self.comment_api,
                headers=self.headers,
                params=params
            )
            data = response.json()
            feeds = data['data']['visionSearchPhoto']['feeds']
            video_ids.extend(item['photo']['id'] for item in feeds)
        return video_ids
    
    def get_comments(self, video_id, max_comments=1000):
        """Page through the comment list of a single video."""
        all_comments = []
        page = 1
        while len(all_comments) < max_comments:
            # rebuild the JSON-encoded variables for each page
            params = {
                'operationName': 'commentListQuery',
                'variables': json.dumps({
                    'photoId': video_id,
                    'page': page,
                    'size': 20
                })
            }
            response = self.session.get(
                self.comment_api,
                headers=self.headers,
                params=params
            )
            data = response.json()
            comments = data['data']['commentList']['comments']
            if not comments:
                break
            all_comments.extend(comments)
            page += 1
            time.sleep(1)  # throttle to reduce the chance of being blocked
        return all_comments
    
    def extract_uids(self, comments):
        uids = set()
        for comment in comments:
            if 'user' in comment:
                uids.add(comment['user']['id'])
            elif 'authorId' in comment:
                uids.add(comment['authorId'])
        return list(uids)
    
    def save_to_file(self, data, filename):
        # append mode so multiple videos accumulate in one file per keyword
        with open(filename, 'a', encoding='utf-8') as f:
            if isinstance(data, dict):
                json.dump(data, f, ensure_ascii=False)
                f.write('\n')
            elif isinstance(data, list):
                for item in data:
                    if isinstance(item, dict):
                        # write comment dicts as JSON Lines, not Python reprs
                        f.write(json.dumps(item, ensure_ascii=False) + '\n')
                    else:
                        f.write(f"{item}\n")
    
    def run(self, keyword):
        print(f"Collecting keyword: {keyword}")
        video_ids = self.get_video_list(keyword)
        print(f"Found {len(video_ids)} videos")

        def process(vid):
            comments = self.get_comments(vid)
            uids = self.extract_uids(comments)
            self.save_to_file(comments, f"{keyword}_comments.json")
            self.save_to_file(uids, f"{keyword}_uids.txt")
            print(f"Video {vid} done: {len(comments)} comments, {len(uids)} UIDs")

        # the original loop called get_comments synchronously inside the
        # with-block, so the pool did nothing; submit one task per video instead
        with ThreadPoolExecutor(max_workers=5) as executor:
            list(executor.map(process, video_ids))  # drain to surface worker errors


# Proxy settings (never attached to the session by the class above; see the sketch below)
PROXY = {
    'http': 'http://127.0.0.1:1080',
    'https': 'http://127.0.0.1:1080'
}

# Request interval (seconds)
REQUEST_INTERVAL = 1.5

# Maximum retry count
MAX_RETRY = 3

# Output directory
OUTPUT_DIR = './data/'

# Keyword list (Kuaishou search terms, kept in Chinese)
KEYWORDS = ['美食', '旅游', '科技', '搞笑']
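The spider class never reads PROXY, REQUEST_INTERVAL, or MAX_RETRY. Below is a minimal sketch of wiring them in, assuming ks_spider.py and config.py sit in the same directory; the fetch_with_retry helper is hypothetical and not part of the original code:

import time
import requests
from config import PROXY, REQUEST_INTERVAL, MAX_RETRY
from ks_spider import KuaishouCommentSpider

spider = KuaishouCommentSpider()
spider.session.proxies.update(PROXY)  # route all session requests through the proxy

def fetch_with_retry(session, url, **kwargs):
    """Hypothetical helper: retry a GET up to MAX_RETRY times, pausing between tries."""
    for attempt in range(MAX_RETRY):
        try:
            response = session.get(url, timeout=10, **kwargs)
            response.raise_for_status()
            return response
        except requests.RequestException:
            if attempt == MAX_RETRY - 1:
                raise
            time.sleep(REQUEST_INTERVAL)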


import os
import re  # needed by clean_text below, missing from the original imports
import hashlib
from datetime import datetime

def create_dir(path):
    os.makedirs(path, exist_ok=True)

def get_md5(text):
    return hashlib.md5(text.encode('utf-8')).hexdigest()

def format_time(timestamp):
    return datetime.fromtimestamp(timestamp).strftime('%Y-%m-%d %H:%M:%S')

def clean_text(text):
    return re.sub(r'[\n\t\r]', '', text).strip()

def check_duplicate(item, cache_set):
    item_id = str(item.get('id', '')) or get_md5(str(item))
    if item_id in cache_set:
        return True
    cache_set.add(item_id)
    return False
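A short usage sketch for check_duplicate: keep one shared set across pages so repeated comments are skipped. The sample data here is made up for illustration:

seen = set()
comments = [
    {'id': 1, 'content': 'first'},
    {'id': 1, 'content': 'first'},   # duplicate: same id, filtered out
    {'id': 2, 'content': 'second'},
]
unique = [c for c in comments if not check_duplicate(c, seen)]
print(len(unique))  # 2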


from ks_spider import KuaishouCommentSpider
from config import KEYWORDS
import time

if __name__ == '__main__':
    spider = KuaishouCommentSpider()
    for keyword in KEYWORDS:
        start_time = time.time()
        spider.run(keyword)
        print(f"关键词 {keyword} 处理完成,耗时 {time.time()-start_time:.2f}秒")