Douyin comment-section data collection tool: a comment scraper for Douyin, Kuaishou, and Xiaohongshu, with collection script/plugin [Python]


Download link: www.pan38.com/dow/share.p… (extraction code: 2812)

```python
# Entry script: dispatches a URL to the right platform crawler and saves results.
import os
import time
import json

from platforms.douyin import DouyinCrawler
from platforms.kuaishou import KuaishouCrawler
from platforms.xiaohongshu import XiaohongshuCrawler
from utils.logger import setup_logger
from config import CONFIG

logger = setup_logger(__name__)


class CommentCrawler:
    def __init__(self):
        self.platforms = {
            'douyin': DouyinCrawler(),
            'kuaishou': KuaishouCrawler(),
            'xiaohongshu': XiaohongshuCrawler()
        }

    def crawl(self, platform, url, max_comments=1000):
        if platform not in self.platforms:
            raise ValueError(f"Unsupported platform: {platform}")

        crawler = self.platforms[platform]
        logger.info(f"Starting to crawl comments from {url}")

        try:
            comments = crawler.get_comments(url, max_comments)
            self._save_comments(platform, url, comments)
            return True
        except Exception as e:
            logger.error(f"Error crawling {url}: {str(e)}")
            return False

    def _save_comments(self, platform, url, comments):
        timestamp = int(time.time())
        filename = f"{platform}_comments_{timestamp}.json"
        os.makedirs('data', exist_ok=True)
        filepath = os.path.join('data', filename)

        with open(filepath, 'w', encoding='utf-8') as f:
            json.dump({
                'platform': platform,
                'url': url,
                'comments': comments,
                'timestamp': timestamp
            }, f, ensure_ascii=False, indent=2)

        logger.info(f"Saved {len(comments)} comments to {filepath}")


if __name__ == '__main__':
    crawler = CommentCrawler()
    # Example URLs, truncated in the original post
    crawler.crawl('douyin', 'https://www.douyin.com/video/12345…', 500)
    crawler.crawl('kuaishou', 'https://www.kuaishou.com/short-video…', 500)
    crawler.crawl('xiaohongshu', 'https://www.xiaohongshu.com/explore/123…', 500)
```
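Neither `utils/logger.py` nor `config.py` is shown in the post, although every script imports them. Below is a minimal sketch of what they might contain, assuming `CONFIG` only needs the two keys the crawlers actually read; the values are placeholders, not the author's configuration:

```python
# utils/logger.py: a minimal setup_logger compatible with the calls above.
import logging

def setup_logger(name):
    logger = logging.getLogger(name)
    if not logger.handlers:  # avoid attaching duplicate handlers on repeated calls
        handler = logging.StreamHandler()
        handler.setFormatter(logging.Formatter(
            '%(asctime)s %(name)s %(levelname)s: %(message)s'))
        logger.addHandler(handler)
    logger.setLevel(logging.INFO)
    return logger


# config.py: only the keys the crawlers read (placeholder values).
CONFIG = {
    'USER_AGENT': ('Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                   'AppleWebKit/537.36 (KHTML, like Gecko) '
                   'Chrome/120.0.0.0 Safari/537.36'),
    'CHROME_DRIVER_PATH': '/usr/local/bin/chromedriver',  # adjust to your install
}
```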

```python
# platforms/douyin.py (path implied by the import in the entry script).
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
import time
import random
from urllib.parse import urlparse

from utils.logger import setup_logger
from config import CONFIG

logger = setup_logger(__name__)


class DouyinCrawler:
    def __init__(self):
        self.driver = None
        self.setup_driver()

    def setup_driver(self):
        options = webdriver.ChromeOptions()
        options.add_argument('--headless')
        options.add_argument('--disable-gpu')
        options.add_argument('--no-sandbox')
        options.add_argument('--disable-dev-shm-usage')
        options.add_argument('user-agent=' + CONFIG['USER_AGENT'])

        # Selenium 4 passes the driver path via a Service object;
        # the executable_path keyword was removed.
        self.driver = webdriver.Chrome(
            service=Service(CONFIG['CHROME_DRIVER_PATH']),
            options=options
        )
        self.driver.set_page_load_timeout(30)

    def get_comments(self, url, max_comments=1000):
        if not self.driver:
            self.setup_driver()

        try:
            self.driver.get(url)
            time.sleep(5)  # wait for the page to load

            # Extract the video ID
            video_id = self._extract_video_id(url)
            if not video_id:
                raise ValueError("Invalid Douyin URL")

            # Open the comment panel
            self._click_comment_button()

            # Scroll to load more comments
            comments = []
            last_height = self.driver.execute_script("return document.body.scrollHeight")

            while len(comments) < max_comments:
                self._scroll_down()
                time.sleep(random.uniform(1, 3))

                new_comments = self._parse_comments()
                for comment in new_comments:
                    if comment not in comments:
                        comments.append(comment)

                if len(comments) >= max_comments:
                    break

                new_height = self.driver.execute_script("return document.body.scrollHeight")
                if new_height == last_height:
                    break  # no new content loaded; stop scrolling
                last_height = new_height

            return comments[:max_comments]

        except Exception as e:
            logger.error(f"Error getting comments: {str(e)}")
            raise
        finally:
            self.driver.quit()
            self.driver = None

    def _extract_video_id(self, url):
        parsed = urlparse(url)
        if 'douyin.com' not in parsed.netloc:
            return None

        path_parts = parsed.path.split('/')
        if len(path_parts) >= 3 and path_parts[1] == 'video':
            return path_parts[2]
        return None

    def _click_comment_button(self):
        try:
            comment_button = WebDriverWait(self.driver, 10).until(
                EC.presence_of_element_located((By.XPATH, '//div[contains(@class, "comment-btn")]'))
            )
            comment_button.click()
            time.sleep(2)
        except TimeoutException:
            raise Exception("Could not find comment button")

    def _scroll_down(self):
        self.driver.execute_script("window.scrollBy(0, 500);")

    def _parse_comments(self):
        # Note: the class names below are illustrative; Douyin's production
        # markup uses obfuscated class names that change over time.
        comments = []
        try:
            comment_elements = self.driver.find_elements(By.XPATH, '//div[contains(@class, "comment-item")]')

            for element in comment_elements:
                try:
                    user = element.find_element(By.XPATH, './/span[contains(@class, "username")]').text
                    content = element.find_element(By.XPATH, './/div[contains(@class, "comment-content")]').text
                    likes = element.find_element(By.XPATH, './/span[contains(@class, "like-count")]').text
                    time_text = element.find_element(By.XPATH, './/span[contains(@class, "time")]').text

                    comments.append({
                        'user': user,
                        'content': content,
                        'likes': likes,
                        'time': time_text
                    })
                except Exception as e:
                    logger.warning(f"Error parsing comment element: {str(e)}")
                    continue

        except Exception as e:
            logger.warning(f"Error finding comment elements: {str(e)}")

        return comments
```
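To exercise `DouyinCrawler` on its own, a hypothetical smoke test might look like the following; this snippet is not from the original post, and the URL is a placeholder:

```python
# Hypothetical smoke test; substitute a real video URL before running.
from platforms.douyin import DouyinCrawler

if __name__ == '__main__':
    crawler = DouyinCrawler()
    comments = crawler.get_comments('https://www.douyin.com/video/<video-id>', max_comments=50)
    for c in comments[:5]:
        print(c['user'], c['content'])
```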

```python
# platforms/kuaishou.py (path implied by the import in the entry script).
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
import time
import random
from urllib.parse import urlparse

from utils.logger import setup_logger
from config import CONFIG

logger = setup_logger(__name__)


class KuaishouCrawler:
    def __init__(self):
        self.driver = None
        self.setup_driver()

    def setup_driver(self):
        options = webdriver.ChromeOptions()
        options.add_argument('--headless')
        options.add_argument('--disable-gpu')
        options.add_argument('--no-sandbox')
        options.add_argument('--disable-dev-shm-usage')
        options.add_argument('user-agent=' + CONFIG['USER_AGENT'])

        # Selenium 4 passes the driver path via a Service object;
        # the executable_path keyword was removed.
        self.driver = webdriver.Chrome(
            service=Service(CONFIG['CHROME_DRIVER_PATH']),
            options=options
        )
        self.driver.set_page_load_timeout(30)

    def get_comments(self, url, max_comments=1000):
        if not self.driver:
            self.setup_driver()

        try:
            self.driver.get(url)
            time.sleep(5)  # wait for the page to load

            # Extract the video ID
            video_id = self._extract_video_id(url)
            if not video_id:
                raise ValueError("Invalid Kuaishou URL")

            # Open the comment panel
            self._click_comment_button()

            # Scroll to load more comments
            comments = []
            last_height = self.driver.execute_script("return document.body.scrollHeight")

            while len(comments) < max_comments:
                self._scroll_down()
                time.sleep(random.uniform(1, 3))

                new_comments = self._parse_comments()
                for comment in new_comments:
                    if comment not in comments:
                        comments.append(comment)

                if len(comments) >= max_comments:
                    break

                new_height = self.driver.execute_script("return document.body.scrollHeight")
                if new_height == last_height:
                    break  # no new content loaded; stop scrolling
                last_height = new_height

            return comments[:max_comments]

        except Exception as e:
            logger.error(f"Error getting comments: {str(e)}")
            raise
        finally:
            self.driver.quit()
            self.driver = None
```
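The excerpt ends before `KuaishouCrawler`'s helper methods. Judging from the `/short-video/…` example URL in the entry script, its video-ID extraction plausibly mirrors the Douyin version; here is a standalone sketch under that assumption:

```python
from urllib.parse import urlparse

def extract_kuaishou_video_id(url):
    """Assumed helper mirroring DouyinCrawler._extract_video_id,
    adapted to Kuaishou's /short-video/<id> URL shape."""
    parsed = urlparse(url)
    if 'kuaishou.com' not in parsed.netloc:
        return None
    path_parts = parsed.path.split('/')
    if len(path_parts) >= 3 and path_parts[1] == 'short-video':
        return path_parts[2]
    return None

# extract_kuaishou_video_id('https://www.kuaishou.com/short-video/abc123') -> 'abc123'
```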