# Download link: www.pan38.com/dow/share.p… (extraction code: 2812)
import json
import os
import time

from platforms.douyin import DouyinCrawler
from platforms.kuaishou import KuaishouCrawler
from platforms.xiaohongshu import XiaohongshuCrawler
from utils.logger import setup_logger
from config import CONFIG

# Module-level logger named after this module.
logger = setup_logger(__name__)
class CommentCrawler:
    """Dispatches comment crawling to per-platform crawler implementations
    and persists the results as JSON files under ./data."""

    def __init__(self):
        # Registry mapping platform key -> crawler instance.
        self.platforms = {
            'douyin': DouyinCrawler(),
            'kuaishou': KuaishouCrawler(),
            'xiaohongshu': XiaohongshuCrawler(),
        }

    def crawl(self, platform, url, max_comments=1000):
        """Crawl up to ``max_comments`` comments from ``url``.

        Args:
            platform: One of the keys in ``self.platforms``.
            url: Video/post URL to crawl.
            max_comments: Upper bound on collected comments.

        Returns:
            True on success, False if crawling failed.

        Raises:
            ValueError: If ``platform`` is not supported.
        """
        if platform not in self.platforms:
            raise ValueError(f"Unsupported platform: {platform}")
        crawler = self.platforms[platform]
        logger.info(f"Starting to crawl comments from {url}")
        try:
            comments = crawler.get_comments(url, max_comments)
            self._save_comments(platform, url, comments)
            return True
        except Exception as e:
            # Best-effort: log and report failure instead of propagating.
            logger.error(f"Error crawling {url}: {str(e)}")
            return False

    def _save_comments(self, platform, url, comments):
        """Write crawled comments to data/<platform>_comments_<timestamp>.json."""
        timestamp = int(time.time())
        filename = f"{platform}_comments_{timestamp}.json"
        os.makedirs('data', exist_ok=True)
        filepath = os.path.join('data', filename)
        with open(filepath, 'w', encoding='utf-8') as f:
            json.dump({
                'platform': platform,
                'url': url,
                'comments': comments,
                'timestamp': timestamp,
            }, f, ensure_ascii=False, indent=2)
        logger.info(f"Saved {len(comments)} comments to {filepath}")
if __name__ == '__main__':
    crawler = CommentCrawler()
    # Example URLs (truncated placeholders from the original paste; replace
    # with full, real video URLs before running).
    crawler.crawl('douyin', 'www.douyin.com/video/12345…', 500)
    crawler.crawl('kuaishou', 'www.kuaishou.com/short-video…', 500)
    crawler.crawl('xiaohongshu', 'www.xiaohongshu.com/explore/123…', 500)
# NOTE(review): a fresh import block starts here — this file appears to
# concatenate several modules; this section looks like platforms/douyin.py.
import random
import time
from urllib.parse import urlparse, parse_qs

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException

from utils.logger import setup_logger
from config import CONFIG

# Module-level logger named after this module.
logger = setup_logger(__name__)
class DouyinCrawler:
    """Scrapes comments from a Douyin video page using headless Chrome."""

    def __init__(self):
        self.driver = None
        self.setup_driver()

    def setup_driver(self):
        """Create a headless Chrome WebDriver configured from CONFIG."""
        options = webdriver.ChromeOptions()
        options.add_argument('--headless')
        options.add_argument('--disable-gpu')
        options.add_argument('--no-sandbox')
        options.add_argument('--disable-dev-shm-usage')
        options.add_argument('user-agent=' + CONFIG['USER_AGENT'])
        # NOTE(review): executable_path was removed in Selenium 4; if the
        # project uses Selenium 4+, switch to
        # webdriver.Chrome(service=Service(CONFIG['CHROME_DRIVER_PATH']), ...).
        self.driver = webdriver.Chrome(
            executable_path=CONFIG['CHROME_DRIVER_PATH'],
            options=options
        )
        self.driver.set_page_load_timeout(30)

    def get_comments(self, url, max_comments=1000):
        """Return up to ``max_comments`` comment dicts scraped from ``url``.

        Raises ValueError for a non-Douyin URL and re-raises scraping
        errors after logging. The driver is always quit afterwards and
        recreated lazily on the next call.
        """
        if not self.driver:
            self.setup_driver()
        try:
            self.driver.get(url)
            time.sleep(5)  # wait for the page to load
            # Validate the URL before doing any scraping work.
            video_id = self._extract_video_id(url)
            if not video_id:
                raise ValueError("Invalid Douyin URL")
            # Open the comment panel.
            self._click_comment_button()
            # Scroll to trigger lazy-loading of more comments.
            comments = []
            last_height = self.driver.execute_script("return document.body.scrollHeight")
            while len(comments) < max_comments:
                self._scroll_down()
                time.sleep(random.uniform(1, 3))  # randomized delay between scrolls
                new_comments = self._parse_comments()
                for comment in new_comments:
                    if comment not in comments:  # de-duplicate re-parsed items
                        comments.append(comment)
                        if len(comments) >= max_comments:
                            break
                new_height = self.driver.execute_script("return document.body.scrollHeight")
                if new_height == last_height:
                    # Page height did not change: no more comments loaded.
                    break
                last_height = new_height
            return comments[:max_comments]
        except Exception as e:
            logger.error(f"Error getting comments: {str(e)}")
            raise
        finally:
            self.driver.quit()
            self.driver = None

    def _extract_video_id(self, url):
        """Return the id from a douyin.com ``/video/<id>`` URL, else None."""
        parsed = urlparse(url)
        if 'douyin.com' not in parsed.netloc:
            return None
        path_parts = parsed.path.split('/')
        if len(path_parts) >= 3 and path_parts[1] == 'video':
            return path_parts[2]
        return None

    def _click_comment_button(self):
        """Click the comment button; raise if it cannot be located in 10s."""
        try:
            comment_button = WebDriverWait(self.driver, 10).until(
                EC.presence_of_element_located((By.XPATH, '//div[contains(@class, "comment-btn")]'))
            )
            comment_button.click()
            time.sleep(2)
        except TimeoutException:
            raise Exception("Could not find comment button")

    def _scroll_down(self):
        """Scroll the page down by 500px to trigger lazy comment loading."""
        self.driver.execute_script("window.scrollBy(0, 500);")

    def _parse_comments(self):
        """Parse currently-visible comment elements into dicts.

        Malformed elements are logged and skipped; returns a (possibly
        empty) list of {user, content, likes, time} dicts.
        """
        comments = []
        try:
            comment_elements = self.driver.find_elements(By.XPATH, '//div[contains(@class, "comment-item")]')
            for element in comment_elements:
                try:
                    user = element.find_element(By.XPATH, './/span[contains(@class, "username")]').text
                    content = element.find_element(By.XPATH, './/div[contains(@class, "comment-content")]').text
                    likes = element.find_element(By.XPATH, './/span[contains(@class, "like-count")]').text
                    time_text = element.find_element(By.XPATH, './/span[contains(@class, "time")]').text
                    comments.append({
                        'user': user,
                        'content': content,
                        'likes': likes,
                        'time': time_text
                    })
                except Exception as e:
                    logger.warning(f"Error parsing comment element: {str(e)}")
                    continue
        except Exception as e:
            logger.warning(f"Error finding comment elements: {str(e)}")
        return comments
# NOTE(review): another fresh import block — this section looks like the
# start of platforms/kuaishou.py in the original project layout.
import random
import time
from urllib.parse import urlparse

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException

from utils.logger import setup_logger
from config import CONFIG

# Module-level logger named after this module.
logger = setup_logger(__name__)
class KuaishouCrawler:
    """Scrapes comments from a Kuaishou video page using headless Chrome."""

    def __init__(self):
        self.driver = None
        self.setup_driver()

    def setup_driver(self):
        """Create a headless Chrome WebDriver configured from CONFIG."""
        options = webdriver.ChromeOptions()
        options.add_argument('--headless')
        options.add_argument('--disable-gpu')
        options.add_argument('--no-sandbox')
        options.add_argument('--disable-dev-shm-usage')
        options.add_argument('user-agent=' + CONFIG['USER_AGENT'])
        # NOTE(review): executable_path was removed in Selenium 4 — confirm
        # the pinned selenium version or migrate to Service(...) here too.
        self.driver = webdriver.Chrome(
            executable_path=CONFIG['CHROME_DRIVER_PATH'],
            options=options
        )
        self.driver.set_page_load_timeout(30)
def get_comments(self, url, max_comments=1000):
if not self.driver:
self.setup_driver()
try:
self.driver.get(url)
time.sleep(5) # 等待页面加载
# 获取视频ID
video_id = self._extract_video_id(url)
if not video_id:
raise ValueError("Invalid Kuaishou URL")
# 点击评论按钮
self._click_comment_button()
# 滚动加载评论
comments = []
last_height = self.driver.execute_script("return document.body.scrollHeight")
while len(comments) < max_comments:
self._scroll_down()
time.sleep(random.uniform(1, 3))
new_comments = self._parse_comments()
for comment in new_comments:
if comment not in comments:
comments.append(comment)
if len(comments) >= max_comments:
break
new_height = self.driver.execute_script("return document.body.scrollHeight")
if new_height == last_height:
break
last_height = new_height
return comments[:max_comments]
except Exception as e:
logger.error