Download link: www.pan38.com/dow/share.p… Extraction code: 2812
This scraping framework consists of three main modules: the main spider, a proxy-management module, and a data-analysis module. Before use, you need to obtain a Douyin Cookie and configure a proxy IP pool. The main program supports resumable crawling, automatic proxy rotation, and several output formats. The data-analysis module provides word-frequency statistics, word-cloud generation, and time-distribution analysis (see the sketch after the ProxyManager code below).
import requests
import json
import time
import re
import random
import pandas as pd
from bs4 import BeautifulSoup
from urllib.parse import urlencode
class DouyinCommentSpider:
    def __init__(self):
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
            'Referer': 'https://www.douyin.com/',
            'Cookie': 'your Douyin cookie'
        }
        self.session = requests.Session()
        self.comment_data = []
        self.max_retries = 3
        self.proxy_pool = [
            'proxy1.example.com:8080',
            'proxy2.example.com:8080'
        ]
    def get_video_id(self, url):
        """Extract the video ID from a video URL."""
        pattern = r'/video/(\d+)'
        match = re.search(pattern, url)
        return match.group(1) if match else None
    def get_comments(self, video_id, max_count=1000):
        """Fetch comments for a video, paging until max_count or no more data."""
        base_url = 'https://www.douyin.com/aweme/v1/web/comment/list/'
        params = {
            'aweme_id': video_id,
            'cursor': 0,
            'count': 20,
            'item_type': 0
        }
        collected = 0
        retries = 0
        while collected < max_count:
            try:
                proxy = random.choice(self.proxy_pool)
                response = self.session.get(
                    base_url + '?' + urlencode(params),
                    headers=self.headers,
                    # Route both schemes through the proxy; the API itself is HTTPS.
                    proxies={'http': f'http://{proxy}', 'https': f'http://{proxy}'},
                    timeout=10
                )
                if response.status_code != 200:
                    raise requests.HTTPError(f'HTTP {response.status_code}')
                data = response.json()
                comments = data.get('comments') or []
                for comment in comments:
                    self.process_comment(comment)
                    collected += 1
                    if collected >= max_count:
                        break
                if not data.get('has_more', False):
                    break
                # Advance the cursor; fall back to a fixed step if the API omits it.
                params['cursor'] = data.get('cursor', params['cursor'] + 20)
                retries = 0
                # Randomized delay to reduce the chance of rate limiting.
                time.sleep(random.uniform(1, 3))
            except Exception as e:
                print(f"Error fetching comments: {e}")
                retries += 1
                if retries >= self.max_retries:
                    break
                time.sleep(5)
    def process_comment(self, comment):
        """Flatten one raw comment dict into the fields we keep."""
        comment_info = {
            'comment_id': comment.get('cid', ''),
            'user_id': comment.get('user', {}).get('uid', ''),
            'nickname': comment.get('user', {}).get('nickname', ''),
            'content': comment.get('text', ''),
            'likes': comment.get('digg_count', 0),
            'reply_count': comment.get('reply_comment_total', 0),
            'create_time': comment.get('create_time', 0),
            'ip_location': comment.get('ip_label', ''),
            # is_author_digg flags whether the video author liked this comment.
            'author_digged': comment.get('is_author_digg', False)
        }
        self.comment_data.append(comment_info)
    def save_to_excel(self, filename):
        """Save collected comments to an Excel file."""
        df = pd.DataFrame(self.comment_data)
        # to_excel takes no encoding parameter; xlsx handles Unicode natively.
        df.to_excel(filename, index=False)
    def save_to_json(self, filename):
        """Save collected comments to a JSON file."""
        with open(filename, 'w', encoding='utf-8') as f:
            json.dump(self.comment_data, f, ensure_ascii=False, indent=2)
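Putting the spider together, a minimal driver might look like the sketch below; the video URL is a placeholder, and a valid Cookie must be set in the headers first:

if __name__ == '__main__':
    spider = DouyinCommentSpider()
    # Placeholder share link; replace with a real video URL.
    video_id = spider.get_video_id('https://www.douyin.com/video/7000000000000000000')
    if video_id:
        spider.get_comments(video_id, max_count=500)
        spider.save_to_excel('comments.xlsx')
        spider.save_to_json('comments.json')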
import requests
import random
from threading import Lock
class ProxyManager:
    def __init__(self):
        self.proxies = []
        self.lock = Lock()
        self.proxy_api = "your proxy API endpoint"
    def fetch_proxies(self):
        """Refresh the pool from the proxy API."""
        try:
            response = requests.get(self.proxy_api, timeout=10)
            if response.status_code == 200:
                new_proxies = response.json().get('data', [])
                with self.lock:
                    self.proxies = new_proxies
        except Exception as e:
            print(f"Error fetching proxies: {e}")
    def get_random_proxy(self):
        """Return a random proxy, refreshing the pool if it is empty."""
        with self.lock:
            proxies = list(self.proxies)
        if not proxies:
            # Refresh outside the lock: fetch_proxies acquires it itself,
            # and threading.Lock is not reentrant.
            self.fetch_proxies()
            with self.lock:
                proxies = list(self.proxies)
        return random.choice(proxies) if proxies else None
    def validate_proxy(self, proxy):
        """Check that a proxy answers a simple request within 5 seconds."""
        try:
            test_url = "http://www.baidu.com"
            response = requests.get(
                test_url,
                proxies={'http': f'http://{proxy}', 'https': f'http://{proxy}'},
                timeout=5
            )
            return response.status_code == 200
        except requests.RequestException:
            return False
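The description above mentions a data-analysis module (word-frequency statistics, word-cloud generation, time-distribution analysis), but its code is not included in the post. Below is a minimal sketch of what such a module could look like, assuming the jieba and wordcloud packages are installed; the class name CommentAnalyzer and the font path are illustrative, not from the original:

import json
from collections import Counter

import jieba
import pandas as pd
from wordcloud import WordCloud

class CommentAnalyzer:
    def __init__(self, json_file):
        with open(json_file, encoding='utf-8') as f:
            self.comments = json.load(f)

    def word_frequencies(self, top_n=50):
        """Tokenize comment text with jieba and count word frequencies."""
        counter = Counter()
        for c in self.comments:
            counter.update(w for w in jieba.lcut(c.get('content', '')) if len(w) > 1)
        return counter.most_common(top_n)

    def make_wordcloud(self, out_file, font_path='msyh.ttc'):
        """Render the frequency table as a word-cloud image."""
        freqs = dict(self.word_frequencies(top_n=200))
        wc = WordCloud(font_path=font_path, width=800, height=600,
                       background_color='white')
        wc.generate_from_frequencies(freqs)
        wc.to_file(out_file)

    def time_distribution(self):
        """Count comments per hour of day from the create_time timestamps."""
        times = pd.to_datetime([c.get('create_time', 0) for c in self.comments],
                               unit='s')
        return pd.Series(times.hour).value_counts().sort_index()

Using generate_from_frequencies sidesteps WordCloud's built-in tokenizer, which does not segment Chinese text; a CJK-capable font file must be supplied via font_path for the characters to render.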