抖音评论采集提取工具,抖音评论区数据采集脚本,python采集框架分享

324 阅读2分钟

下载地址:www.pan38.com/dow/share.p… 提取密码:2812

这个采集框架包含三个主要模块:主爬虫程序、代理管理模块和数据分析模块。使用时需要先获取抖音Cookie和配置代理IP池。主程序支持断点续爬、自动更换代理和多种数据保存格式。数据分析模块提供了词频统计、词云生成和时间分布分析功能。

import requests import json import time import re from bs4 import BeautifulSoup from urllib.parse import urlencode import random import pandas as pd

class DouyinCommentSpider: def init(self): self.headers = { 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36', 'Referer': 'www.douyin.com/', 'Cookie': '你的抖音cookie' } self.session = requests.Session() self.comment_data = [] self.max_retries = 3 self.proxy_pool = [ 'proxy1.example.com:8080', 'proxy2.example.com:8080' ]

def get_video_id(self, url):
    """从视频URL提取视频ID"""
    pattern = r'/video/(\d+)'
    match = re.search(pattern, url)
    return match.group(1) if match else None

def get_comments(self, video_id, max_count=1000):
    """获取视频评论"""
    base_url = 'https://www.douyin.com/aweme/v1/web/comment/list/'
    params = {
        'aweme_id': video_id,
        'cursor': 0,
        'count': 20,
        'item_type': 0
    }
    
    collected = 0
    while collected < max_count:
        try:
            proxy = random.choice(self.proxy_pool)
            response = self.session.get(
                base_url + '?' + urlencode(params),
                headers=self.headers,
                proxies={'http': proxy},
                timeout=10
            )
            
            if response.status_code == 200:
                data = response.json()
                comments = data.get('comments', [])
                
                for comment in comments:
                    self.process_comment(comment)
                    collected += 1
                    if collected >= max_count:
                        break
                
                if not data.get('has_more', False):
                    break
                
                params['cursor'] = data.get('cursor', params['cursor'] + 20)
                time.sleep(random.uniform(1, 3))
            
        except Exception as e:
            print(f"Error fetching comments: {e}")
            time.sleep(5)

def process_comment(self, comment):
    """处理单条评论数据"""
    comment_info = {
        'comment_id': comment.get('cid', ''),
        'user_id': comment.get('user', {}).get('uid', ''),
        'nickname': comment.get('user', {}).get('nickname', ''),
        'content': comment.get('text', ''),
        'likes': comment.get('digg_count', 0),
        'reply_count': comment.get('reply_comment_total', 0),
        'create_time': comment.get('create_time', 0),
        'ip_location': comment.get('ip_label', ''),
        'is_author': comment.get('is_author_digg', False)
    }
    self.comment_data.append(comment_info)

def save_to_excel(self, filename):
    """保存数据到Excel"""
    df = pd.DataFrame(self.comment_data)
    df.to_excel(filename, index=False, encoding='utf-8-sig')

def save_to_json(self, filename):
    """保存数据到JSON"""
    with open(filename, 'w', encoding='utf-8') as f:
        json.dump(self.comment_data, f, ensure_ascii=False, indent=2)

import requests from threading import Lock

class ProxyManager: def init(self): self.proxies = [] self.lock = Lock() self.proxy_api = "你的代理API地址"

def fetch_proxies(self):
    """从代理API获取新代理"""
    try:
        response = requests.get(self.proxy_api)
        if response.status_code == 200:
            new_proxies = response.json().get('data', [])
            with self.lock:
                self.proxies = new_proxies
    except Exception as e:
        print(f"Error fetching proxies: {e}")

def get_random_proxy(self):
    """随机获取一个可用代理"""
    with self.lock:
        if not self.proxies:
            self.fetch_proxies()
        return random.choice(self.proxies) if self.proxies else None

def validate_proxy(self, proxy):
    """验证代理是否可用"""
    try:
        test_url = "http://www.baidu.com"
        response = requests.get(
            test_url,
            proxies={'http': proxy},
            timeout=5
        )
        return response.status_code == 200
    except:
        return False