快手直播间采集工具:提取快手直播间弹幕评论、匿名在线用户与红心点赞【Python】

400 阅读2分钟

文章附件下载:www.pan38.com/dow/share.p… 提取密码:8868

- 功能完整:实现了快手直播间弹幕、用户进入和红心点赞的数据采集
- WebSocket连接:使用原生socket建立WebSocket连接,稳定接收实时数据
- 多线程处理:使用独立的后台线程接收并处理消息,采集过程不阻塞主线程
- 数据持久化:自动将采集到的数据以每行一条JSON记录的形式追加保存到文件
- 心跳机制:维持WebSocket连接稳定性
- 异常处理:完善的错误捕获和处理机制
- 用户友好:提供简单的命令行交互界面

import base64
import hashlib
import json
import os
import random
import re
import socket
import ssl
import struct
import threading
import time
from concurrent.futures import ThreadPoolExecutor
from urllib.parse import urlparse

import requests

class KuaishouLiveSpider:
    """Collect comment, like and user-enter events from a live room.

    The spider downloads the room page, extracts the live id and WebSocket
    URL from it, opens the WebSocket over a plain (or TLS-wrapped) socket and
    reads messages on a background thread. Recognised messages are buffered
    in memory and appended to ``output_file`` as one JSON object per line.

    NOTE(review): message payloads are assumed to be UTF-8 JSON objects with
    a ``type`` field ("comment", "like" or "user_enter"). Confirm this against
    the real server protocol; payloads that do not parse are reported and
    skipped.
    """

    # GUID that RFC 6455 appends to the client key to derive Sec-WebSocket-Accept.
    _WS_GUID = '258EAFA5-E914-47DA-95CA-C5AB0DC85B11'

    # WebSocket opcodes (RFC 6455, section 5.2).
    _OP_CONTINUATION = 0x0
    _OP_TEXT = 0x1
    _OP_BINARY = 0x2
    _OP_CLOSE = 0x8
    _OP_PING = 0x9
    _OP_PONG = 0xA

    CONNECT_TIMEOUT = 10    # seconds allowed for TCP/TLS connect and handshake
    RECV_TIMEOUT = 1.0      # recv() wake-up period so heartbeats are sent when idle
    FLUSH_THRESHOLD = 100   # buffered records that trigger a write to disk
    _MAX_HANDSHAKE_BYTES = 65536

    def __init__(self, room_url, output_file='live_data.json'):
        """Store the configuration; no network traffic happens here.

        Args:
            room_url: URL of the live room page to scrape.
            output_file: Path of the JSON-lines file that records are
                appended to.
        """
        self.room_url = room_url
        self.output_file = output_file
        self.session = requests.Session()
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
            # Referer and Origin must carry a scheme to be well-formed values.
            'Referer': 'https://live.kuaishou.com/',
            'Origin': 'https://live.kuaishou.com'
        }
        self.ws_url = None
        self.ws_conn = None
        self.is_running = False
        self.data_buffer = []
        # Re-entrant so save_data() can call flush_data() while holding it.
        self.lock = threading.RLock()
        self.heartbeat_interval = 30
        self.last_heartbeat = 0
        self._rx_buffer = bytearray()   # received bytes not yet parsed into frames
        self._receiver = None           # background thread running receive_messages

    def get_live_info(self):
        """Fetch the room page and extract the live id and WebSocket URL.

        Returns:
            ``(live_id, ws_url)`` on success, ``(None, None)`` when the
            request fails or either field is missing from the page.
        """
        try:
            response = self.session.get(self.room_url, headers=self.headers, timeout=10)
        except requests.RequestException as e:
            print(f"获取直播信息失败: {e}")
            return None, None

        html = response.text
        live_match = re.search(r'liveId":"(.*?)"', html)
        ws_match = re.search(r'websocketUrl":"(wss://.*?)"', html)
        if live_match is None or ws_match is None:
            print("获取直播信息失败: 页面中未找到 liveId 或 websocketUrl")
            return None, None
        return live_match.group(1), ws_match.group(1)

    def connect_websocket(self):
        """Open the socket to ``self.ws_url`` and perform the WebSocket handshake.

        Returns:
            True when the connection is established, False otherwise. On
            failure the socket is closed and ``self.ws_conn`` is reset.
        """
        sock = None
        try:
            parsed = urlparse(self.ws_url)
            host = parsed.hostname
            port = parsed.port or (443 if parsed.scheme == 'wss' else 80)

            sock = socket.create_connection((host, port), timeout=self.CONNECT_TIMEOUT)
            if parsed.scheme == 'wss':
                context = ssl.create_default_context()
                sock = context.wrap_socket(sock, server_hostname=host)

            self.ws_conn = sock
            self.send_handshake()
            # A short timeout lets the receive loop wake up to send heartbeats
            # and to notice stop() even when the server is silent.
            sock.settimeout(self.RECV_TIMEOUT)
            return True
        except Exception as e:
            print(f"WebSocket连接失败: {e}")
            if sock is not None:
                sock.close()
            self.ws_conn = None
            return False

    def send_handshake(self):
        """Send the HTTP upgrade request and validate the server's response.

        Bytes that arrive after the response headers are kept for the frame
        parser.

        Raises:
            ConnectionError: if the server closes the connection, does not
                answer with status 101, or returns a wrong
                Sec-WebSocket-Accept value.
            OSError: on socket errors.
        """
        parsed = urlparse(self.ws_url)
        # The request line takes the path and query, not the absolute URL.
        target = parsed.path or '/'
        if parsed.query:
            target = f"{target}?{parsed.query}"
        host = parsed.hostname
        if parsed.port:
            host = f"{host}:{parsed.port}"
        key = base64.b64encode(os.urandom(16)).decode('ascii')

        handshake = (
            f"GET {target} HTTP/1.1\r\n"
            f"Host: {host}\r\n"
            "Upgrade: websocket\r\n"
            "Connection: Upgrade\r\n"
            f"Sec-WebSocket-Key: {key}\r\n"
            "Sec-WebSocket-Version: 13\r\n\r\n"
        )
        self.ws_conn.sendall(handshake.encode())

        response = bytearray()
        while b'\r\n\r\n' not in response:
            chunk = self.ws_conn.recv(4096)
            if not chunk:
                raise ConnectionError("connection closed during WebSocket handshake")
            response += chunk
            if len(response) > self._MAX_HANDSHAKE_BYTES:
                raise ConnectionError("WebSocket handshake response too large")

        head, _, leftover = bytes(response).partition(b'\r\n\r\n')
        header_lines = head.split(b'\r\n')
        status_parts = header_lines[0].split()
        if len(status_parts) < 2 or status_parts[1] != b'101':
            status = header_lines[0].decode('latin-1')
            raise ConnectionError(f"unexpected WebSocket handshake response: {status}")

        response_headers = {}
        for line in header_lines[1:]:
            name, sep, value = line.partition(b':')
            if sep:
                response_headers[name.strip().lower()] = value.strip()
        expected = base64.b64encode(
            hashlib.sha1((key + self._WS_GUID).encode('ascii')).digest()
        )
        if response_headers.get(b'sec-websocket-accept') != expected:
            raise ConnectionError("invalid Sec-WebSocket-Accept in handshake response")

        self._rx_buffer = bytearray(leftover)

    @staticmethod
    def _encode_frame(opcode, payload=b''):
        """Build one final, masked client frame (clients must mask every frame)."""
        header = bytearray([0x80 | opcode])
        length = len(payload)
        if length < 126:
            header.append(0x80 | length)
        elif length < 65536:
            header.append(0x80 | 126)
            header += struct.pack('!H', length)
        else:
            header.append(0x80 | 127)
            header += struct.pack('!Q', length)
        mask = os.urandom(4)
        header += mask
        masked = bytes(byte ^ mask[i % 4] for i, byte in enumerate(payload))
        return bytes(header) + masked

    @staticmethod
    def _pop_frame(buffer):
        """Remove one complete frame from the front of ``buffer``.

        Returns:
            ``(fin, opcode, payload)``, or None when ``buffer`` does not yet
            hold a whole frame (in which case it is left untouched).
        """
        if len(buffer) < 2:
            return None
        fin = bool(buffer[0] & 0x80)
        opcode = buffer[0] & 0x0F
        masked = bool(buffer[1] & 0x80)
        length = buffer[1] & 0x7F
        offset = 2
        if length == 126:
            if len(buffer) < 4:
                return None
            length = struct.unpack_from('!H', buffer, 2)[0]
            offset = 4
        elif length == 127:
            if len(buffer) < 10:
                return None
            length = struct.unpack_from('!Q', buffer, 2)[0]
            offset = 10

        mask = b''
        if masked:
            if len(buffer) < offset + 4:
                return None
            mask = bytes(buffer[offset:offset + 4])
            offset += 4

        end = offset + length
        if len(buffer) < end:
            return None
        payload = bytes(buffer[offset:end])
        del buffer[:end]
        if masked:
            payload = bytes(byte ^ mask[i % 4] for i, byte in enumerate(payload))
        return fin, opcode, payload

    def _send_frame(self, opcode, payload=b''):
        """Send one masked frame on the open connection."""
        self.ws_conn.sendall(self._encode_frame(opcode, payload))

    def process_message(self, data):
        """Parse one JSON message and buffer it if its type is recognised.

        Args:
            data: JSON text of a single message.

        Messages that are not JSON objects or have another ``type`` are
        ignored; text that is not valid JSON is reported and skipped.
        """
        try:
            msg = json.loads(data)
        except (TypeError, ValueError) as e:
            print(f"消息处理失败: {e}")
            return
        if not isinstance(msg, dict):
            return

        user = msg.get('user')
        if not isinstance(user, dict):
            user = {}
        msg_type = msg.get('type')
        now = int(time.time())

        if msg_type == 'comment':
            self.save_data({
                'type': 'comment',
                'user': user.get('name', '匿名用户'),
                'content': msg.get('content'),
                'timestamp': now
            })
        elif msg_type == 'like':
            self.save_data({
                'type': 'like',
                'count': msg.get('count', 1),
                'timestamp': now
            })
        elif msg_type == 'user_enter':
            self.save_data({
                'type': 'user_enter',
                'user_id': user.get('id'),
                'timestamp': now
            })

    def save_data(self, data):
        """Buffer one record and flush once FLUSH_THRESHOLD records are held."""
        with self.lock:
            self.data_buffer.append(data)
            if len(self.data_buffer) >= self.FLUSH_THRESHOLD:
                self.flush_data()

    def flush_data(self):
        """Append the buffered records to ``output_file``, one JSON per line.

        The buffer is cleared only after a successful write, so a failed
        flush is retried on the next call.
        """
        with self.lock:
            if not self.data_buffer:
                return
            try:
                # Serialise first and write once so a failure cannot leave a
                # partial batch on disk that the retry would then duplicate.
                lines = ''.join(
                    json.dumps(item, ensure_ascii=False) + '\n'
                    for item in self.data_buffer
                )
                with open(self.output_file, 'a', encoding='utf-8') as f:
                    f.write(lines)
            except (OSError, TypeError, ValueError) as e:
                print(f"数据保存失败: {e}")
                return
            self.data_buffer = []

    def send_heartbeat(self):
        """Send the heartbeat message if ``heartbeat_interval`` has elapsed."""
        if time.time() - self.last_heartbeat <= self.heartbeat_interval:
            return
        try:
            heartbeat_msg = json.dumps({'type': 'heartbeat'})
            self._send_frame(self._OP_TEXT, heartbeat_msg.encode())
            self.last_heartbeat = time.time()
        except OSError as e:
            print(f"心跳发送失败: {e}")

    def _dispatch_payload(self, payload):
        """Decode a message payload and process each non-empty line of it."""
        for raw_line in payload.split(b'\n'):
            raw_line = raw_line.strip()
            if not raw_line:
                continue
            try:
                text = raw_line.decode('utf-8')
            except UnicodeDecodeError as e:
                # One undecodable message must not end the whole session.
                print(f"消息处理失败: {e}")
                continue
            self.process_message(text)

    def _drain_frames(self, fragments):
        """Process every complete frame currently held in the receive buffer.

        Args:
            fragments: Payloads of the message being reassembled; kept by the
                caller so a fragmented message can span several recv() calls.

        Returns:
            False when the server sent a close frame, True otherwise.
        """
        while True:
            frame = self._pop_frame(self._rx_buffer)
            if frame is None:
                return True
            fin, opcode, payload = frame
            if opcode == self._OP_CLOSE:
                return False
            if opcode == self._OP_PING:
                self._send_frame(self._OP_PONG, payload)
                continue
            if opcode not in (self._OP_CONTINUATION, self._OP_TEXT, self._OP_BINARY):
                continue  # pong or unknown opcode: nothing to do
            fragments.append(payload)
            if fin:
                message = b''.join(fragments)
                fragments.clear()
                self._dispatch_payload(message)

    def receive_messages(self):
        """Read frames until stopped, the peer closes, or an error occurs.

        Runs on the background thread started by start(). ``is_running`` is
        cleared on exit so callers can detect that collection has ended.
        """
        fragments = []
        try:
            while self.is_running:
                try:
                    chunk = self.ws_conn.recv(4096)
                except socket.timeout:
                    # Idle connection: still keep the heartbeat going.
                    self.send_heartbeat()
                    continue
                if not chunk:
                    break

                self._rx_buffer += chunk
                if not self._drain_frames(fragments):
                    break
                self.send_heartbeat()
        except Exception as e:
            # stop() closes the socket under us; that is not worth reporting.
            if self.is_running:
                print(f"消息接收错误: {e}")
        finally:
            self.is_running = False

    def start(self):
        """Resolve the room, connect, and start the background receiver.

        Returns:
            True when collection has started, False otherwise.
        """
        live_id, self.ws_url = self.get_live_info()
        if not all([live_id, self.ws_url]):
            print("无法获取直播信息或WebSocket地址")
            return False

        if not self.connect_websocket():
            return False

        self.is_running = True
        self._receiver = threading.Thread(target=self.receive_messages, daemon=True)
        self._receiver.start()

        print(f"开始采集直播间 {live_id} 的数据...")
        return True

    def stop(self):
        """Stop receiving, close the connection and flush buffered records."""
        self.is_running = False
        if self.ws_conn:
            try:
                # shutdown() wakes a recv() blocked on the other thread.
                self.ws_conn.shutdown(socket.SHUT_RDWR)
            except OSError:
                pass  # best effort: the socket may already be disconnected
            self.ws_conn.close()

        receiver = self._receiver
        if receiver is not None and receiver is not threading.current_thread():
            # Let the receiver finish so the final flush sees every record.
            receiver.join(timeout=self.RECV_TIMEOUT + 1)

        self.flush_data()
        print("采集已停止,数据已保存")

def main():
    """Prompt for a live room URL and collect until interrupted.

    Collection also ends when the spider stops running on its own. In every
    case the spider is stopped on the way out so buffered records are flushed.
    """
    room_url = input("请输入快手直播间URL: ").strip()
    spider = KuaishouLiveSpider(room_url)

    if not spider.start():
        print("采集启动失败")
        return

    try:
        # Poll instead of looping forever so a dead receiver ends the program.
        while spider.is_running:
            time.sleep(1)
    except KeyboardInterrupt:
        pass  # Ctrl+C is the normal way to end collection
    finally:
        spider.stop()

# Run the command-line entry point only when executed as a script.
if __name__ == '__main__':
    main()