文章附件下载:www.pan38.com/dow/share.p… 提取密码:8868
功能说明:
- 功能完整:实现了快手直播间弹幕、用户进入和红心点赞的数据采集。
- WebSocket 连接:使用原生 socket 建立 WebSocket 连接,稳定接收实时数据。
- 多线程处理:采用后台线程接收并处理消息,避免阻塞主线程,提高采集效率。
- 数据持久化:自动将采集到的数据保存到 JSON 文件(每行一条记录)。
- 心跳机制:定时发送心跳,维持 WebSocket 连接稳定性。
- 异常处理:完善的错误捕获和处理机制。
- 用户友好:提供简单的命令行交互界面。
import base64
import hashlib
import json
import os
import random
import re
import socket
import ssl
import struct
import threading
import time
from concurrent.futures import ThreadPoolExecutor
from urllib.parse import urlparse

import requests
class KuaishouLiveSpider:
    """Collect comment, like and user-enter events from a Kuaishou live room.

    The spider downloads the room page, extracts the live id and the
    WebSocket URL from it, opens the WebSocket over a plain socket and
    appends every recognised event to ``output_file`` as one JSON object
    per line.

    NOTE(review): messages are assumed to be UTF-8 JSON objects carrying a
    ``type`` field, as the original parser expected. Payloads that are not
    JSON are reported by ``process_message`` and skipped -- confirm the wire
    format against the real service.
    """

    # Fixed GUID from RFC 6455 used to derive Sec-WebSocket-Accept.
    _WS_GUID = '258EAFA5-E914-47DA-95CA-C5AB0DC85B11'

    # WebSocket opcodes (RFC 6455, section 5.2).
    _OP_CONTINUATION = 0x0
    _OP_TEXT = 0x1
    _OP_BINARY = 0x2
    _OP_CLOSE = 0x8
    _OP_PING = 0x9
    _OP_PONG = 0xA

    def __init__(self, room_url, output_file='live_data.json'):
        """Create a spider for one live room.

        Args:
            room_url: URL of the live-room page to scrape.
            output_file: Path of the JSON-lines file that records are
                appended to.
        """
        self.room_url = room_url
        self.output_file = output_file
        self.session = requests.Session()
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
            # Referer and Origin must be absolute: the scheme had been lost.
            'Referer': 'https://live.kuaishou.com/',
            'Origin': 'https://live.kuaishou.com',
        }
        self.ws_url = None
        self.ws_conn = None
        self.is_running = False
        self.data_buffer = []
        self.lock = threading.Lock()  # guards data_buffer and the output file
        self.heartbeat_interval = 30  # seconds between heartbeats
        self.last_heartbeat = 0
        self._pending = b''  # bytes received after the handshake headers
        self._receiver = None  # background thread running receive_messages

    def get_live_info(self):
        """Fetch the room page and extract the live id and WebSocket URL.

        Returns:
            ``(live_id, ws_url)`` on success, ``(None, None)`` when the page
            cannot be fetched or does not contain both values.
        """
        try:
            response = self.session.get(self.room_url, headers=self.headers, timeout=10)
            response.raise_for_status()
        except requests.RequestException as e:
            print(f"获取直播信息失败: {e}")
            return None, None

        html = response.text
        live_match = re.search(r'liveId":"(.*?)"', html)
        ws_match = re.search(r'websocketUrl":"(wss://.*?)"', html)
        if live_match is None or ws_match is None:
            print("获取直播信息失败: 页面中未找到 liveId 或 websocketUrl")
            return None, None
        return live_match.group(1), ws_match.group(1)

    def connect_websocket(self):
        """Open the TCP/TLS connection to ``self.ws_url`` and upgrade it.

        Returns:
            True when the WebSocket handshake succeeded, False otherwise.
            On failure the socket is closed and ``self.ws_conn`` is None.
        """
        sock = None
        try:
            parsed = urlparse(self.ws_url)
            host = parsed.hostname
            port = parsed.port or (443 if parsed.scheme == 'wss' else 80)
            # A timeout keeps a dead server from hanging connect/handshake.
            sock = socket.create_connection((host, port), timeout=10)
            if parsed.scheme == 'wss':
                context = ssl.create_default_context()
                sock = context.wrap_socket(sock, server_hostname=host)
            self.ws_conn = sock
            self.send_handshake()
            # Short read timeout so the receive loop wakes up regularly to
            # send heartbeats and to notice stop().
            sock.settimeout(1.0)
            return True
        except Exception as e:
            print(f"WebSocket连接失败: {e}")
            if sock is not None:
                sock.close()
            self.ws_conn = None
            return False

    def _build_handshake_request(self, key):
        """Return the HTTP upgrade request for ``self.ws_url`` as bytes."""
        parsed = urlparse(self.ws_url)
        # The request line carries only path and query, never the full URL.
        target = parsed.path or '/'
        if parsed.query:
            target = f"{target}?{parsed.query}"
        host = parsed.hostname
        if parsed.port:
            host = f"{host}:{parsed.port}"
        request = (
            f"GET {target} HTTP/1.1\r\n"
            f"Host: {host}\r\n"
            "Upgrade: websocket\r\n"
            "Connection: Upgrade\r\n"
            f"Sec-WebSocket-Key: {key}\r\n"
            "Sec-WebSocket-Version: 13\r\n\r\n"
        )
        return request.encode('ascii')

    def send_handshake(self):
        """Perform the WebSocket opening handshake on ``self.ws_conn``.

        Sends the upgrade request with a fresh random key, then reads and
        validates the server response. Bytes that arrive after the response
        headers are kept for ``receive_messages``.

        Raises:
            ConnectionError: the server closed the connection, did not
                answer ``101``, or returned a wrong Sec-WebSocket-Accept.
            OSError: the underlying socket failed.
        """
        key = base64.b64encode(os.urandom(16)).decode('ascii')
        self.ws_conn.sendall(self._build_handshake_request(key))

        response = b''
        while b'\r\n\r\n' not in response:
            chunk = self.ws_conn.recv(4096)
            if not chunk:
                raise ConnectionError("握手期间服务器关闭了连接")
            response += chunk
            if len(response) > 65536:
                raise ConnectionError("握手响应头过长")

        head, _, rest = response.partition(b'\r\n\r\n')
        self._pending = rest  # may already contain the first frames

        lines = head.decode('latin-1').split('\r\n')
        status = lines[0].split(' ', 2)
        if len(status) < 2 or status[1] != '101':
            raise ConnectionError(f"握手被拒绝: {lines[0]}")

        headers = {}
        for line in lines[1:]:
            name, sep, value = line.partition(':')
            if sep:
                headers[name.strip().lower()] = value.strip()

        digest = hashlib.sha1((key + self._WS_GUID).encode('ascii')).digest()
        expected = base64.b64encode(digest).decode('ascii')
        if headers.get('sec-websocket-accept') != expected:
            raise ConnectionError("握手校验失败: Sec-WebSocket-Accept 不匹配")

    @staticmethod
    def _encode_frame(payload, opcode=0x1, mask_key=None):
        """Build one final, masked client frame (RFC 6455, section 5.2).

        Args:
            payload: Frame payload as bytes.
            opcode: WebSocket opcode; defaults to a text frame.
            mask_key: Four masking bytes; random when omitted.

        Returns:
            The encoded frame as bytes.
        """
        if mask_key is None:
            mask_key = os.urandom(4)
        length = len(payload)
        header = bytearray([0x80 | opcode])  # FIN bit set
        if length < 126:
            header.append(0x80 | length)
        elif length < 1 << 16:
            header.append(0x80 | 126)
            header += struct.pack('!H', length)
        else:
            header.append(0x80 | 127)
            header += struct.pack('!Q', length)
        masked = bytes(b ^ mask_key[i % 4] for i, b in enumerate(payload))
        return bytes(header) + bytes(mask_key) + masked

    @staticmethod
    def _decode_frames(buffer):
        """Split ``buffer`` into complete WebSocket frames.

        Returns:
            ``(frames, remainder)`` where ``frames`` is a list of
            ``(fin, opcode, payload)`` tuples and ``remainder`` holds the
            trailing bytes of a frame that is not complete yet.
        """
        frames = []
        offset = 0
        total = len(buffer)
        while total - offset >= 2:
            first = buffer[offset]
            second = buffer[offset + 1]
            masked = bool(second & 0x80)
            length = second & 0x7F
            pos = offset + 2
            if length == 126:
                if total - pos < 2:
                    break
                (length,) = struct.unpack_from('!H', buffer, pos)
                pos += 2
            elif length == 127:
                if total - pos < 8:
                    break
                (length,) = struct.unpack_from('!Q', buffer, pos)
                pos += 8
            mask_key = b''
            if masked:
                if total - pos < 4:
                    break
                mask_key = buffer[pos:pos + 4]
                pos += 4
            if total - pos < length:
                break
            payload = buffer[pos:pos + length]
            if masked:
                payload = bytes(b ^ mask_key[i % 4] for i, b in enumerate(payload))
            frames.append((bool(first & 0x80), first & 0x0F, payload))
            offset = pos + length
        return frames, buffer[offset:]

    def process_message(self, data):
        """Parse one JSON message and buffer a record for known event types.

        ``comment``, ``like`` and ``user_enter`` messages are stored; any
        other type is ignored. Invalid JSON or a non-object payload is
        reported and dropped instead of raising.
        """
        try:
            msg = json.loads(data)
        except (TypeError, ValueError) as e:
            print(f"消息处理失败: {e}")
            return
        if not isinstance(msg, dict):
            print(f"消息处理失败: 期望 JSON 对象, 实际为 {type(msg).__name__}")
            return

        user = msg.get('user', {})
        if not isinstance(user, dict):
            user = {}  # tolerate "user": null or other unexpected shapes

        msg_type = msg.get('type')
        if msg_type == 'comment':
            record = {
                'type': 'comment',
                'user': user.get('name', '匿名用户'),
                'content': msg.get('content'),
            }
        elif msg_type == 'like':
            record = {'type': 'like', 'count': msg.get('count', 1)}
        elif msg_type == 'user_enter':
            record = {'type': 'user_enter', 'user_id': user.get('id')}
        else:
            return
        record['timestamp'] = int(time.time())
        self.save_data(record)

    def save_data(self, data):
        """Buffer one record and flush to disk once 100 are pending."""
        with self.lock:
            self.data_buffer.append(data)
            if len(self.data_buffer) >= 100:
                self.flush_data()

    def flush_data(self):
        """Append all buffered records to ``output_file`` as JSON lines.

        The caller must hold ``self.lock``. The buffer is cleared only after
        a successful write, so records are retried on the next flush.
        """
        try:
            # Serialise first so a bad record cannot leave a partial write.
            lines = [json.dumps(item, ensure_ascii=False) + '\n' for item in self.data_buffer]
            with open(self.output_file, 'a', encoding='utf-8') as f:
                f.writelines(lines)
            self.data_buffer = []
        except (OSError, TypeError, ValueError) as e:
            print(f"数据保存失败: {e}")

    def send_heartbeat(self):
        """Send a heartbeat text frame if ``heartbeat_interval`` has elapsed."""
        if time.time() - self.last_heartbeat > self.heartbeat_interval:
            try:
                heartbeat_msg = json.dumps({'type': 'heartbeat'})
                # Client-to-server data must be a masked WebSocket frame.
                self.ws_conn.sendall(self._encode_frame(heartbeat_msg.encode()))
                self.last_heartbeat = time.time()
            except Exception as e:
                print(f"心跳发送失败: {e}")

    def receive_messages(self):
        """Read frames until stopped or the connection ends.

        Reassembles fragmented messages, answers pings, stops on a close
        frame and passes every complete data message to ``process_message``.
        Clears ``is_running`` on exit so callers can see that the connection
        is gone.
        """
        conn = self.ws_conn
        buffer, self._pending = self._pending, b''
        fragments = []  # payload pieces of the message being assembled
        try:
            while self.is_running:
                frames, buffer = self._decode_frames(buffer)
                for fin, opcode, payload in frames:
                    if opcode == self._OP_CLOSE:
                        return
                    if opcode == self._OP_PING:
                        conn.sendall(self._encode_frame(payload, opcode=self._OP_PONG))
                        continue
                    if opcode in (self._OP_TEXT, self._OP_BINARY):
                        fragments = [payload]
                    elif opcode == self._OP_CONTINUATION and fragments:
                        fragments.append(payload)
                    else:
                        continue  # pong or stray continuation frame
                    if fin:
                        message = b''.join(fragments)
                        fragments = []
                        text = message.decode('utf-8', errors='replace').strip()
                        if text:
                            self.process_message(text)
                self.send_heartbeat()
                try:
                    data = conn.recv(4096)
                except socket.timeout:
                    continue  # idle: loop again to re-check heartbeat/stop
                if not data:
                    break
                buffer += data
        except Exception as e:
            # Errors caused by stop() closing the socket are expected.
            if self.is_running:
                print(f"消息接收错误: {e}")
        finally:
            self.is_running = False

    def start(self):
        """Resolve the room, connect and start the background receiver.

        Returns:
            True when collection started, False when the room information
            or the WebSocket connection could not be obtained.
        """
        live_id, self.ws_url = self.get_live_info()
        if not all([live_id, self.ws_url]):
            print("无法获取直播信息或WebSocket地址")
            return False
        if not self.connect_websocket():
            return False
        self.is_running = True
        self._receiver = threading.Thread(target=self.receive_messages, daemon=True)
        self._receiver.start()
        print(f"开始采集直播间 {live_id} 的数据...")
        return True

    def stop(self):
        """Stop receiving, close the connection and flush buffered records."""
        self.is_running = False
        receiver = self._receiver
        # Let the receiver finish its current message so no record is
        # appended after the final flush.
        if receiver is not None and receiver is not threading.current_thread():
            receiver.join(timeout=3)
        if self.ws_conn:
            try:
                self.ws_conn.close()
            except OSError as e:
                print(f"WebSocket关闭失败: {e}")
        with self.lock:
            if self.data_buffer:
                self.flush_data()
        print("采集已停止,数据已保存")
def main():
    """Prompt for a live-room URL and collect its events until interrupted.

    Collection runs until the user presses Ctrl+C or the spider reports
    that it is no longer running. ``stop()`` is always called afterwards so
    buffered records are written out.
    """
    # Pasted URLs often carry stray whitespace; strip it before use.
    room_url = input("请输入快手直播间URL: ").strip()
    spider = KuaishouLiveSpider(room_url)
    if not spider.start():
        print("采集启动失败")
        return
    try:
        while spider.is_running:
            time.sleep(1)
    except KeyboardInterrupt:
        pass  # Ctrl+C is the normal way to end a collection run
    finally:
        spider.stop()
# Run the command-line entry point only when executed as a script.
if __name__ == '__main__':
    main()