Meituan Zhongbao order-grabbing assistant; order-grabbing script for Dada, JD, Huolala, and Fengniao; automatic order-grabbing plugin [Python source code]

Download link: www.pan38.com/dow/share.p… Extraction code: 2918

This code implements a basic web crawler with URL validation, page fetching, link extraction, and saving of results to disk. The requests and beautifulsoup4 libraries must be installed before use.
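The two third-party dependencies can be installed with pip before running the script, for example:

```bash
pip install requests beautifulsoup4
```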

```python
import os
import random
import time
from urllib.parse import urljoin, urlparse

import requests
from bs4 import BeautifulSoup


class WebCrawler:
    def __init__(self, base_url, max_pages=10, delay=1.0):
        self.base_url = base_url
        self.max_pages = max_pages   # stop after this many pages
        self.delay = delay           # base delay (seconds) between requests
        self.visited_urls = set()
        self.to_visit = set([base_url])
        self.domain = urlparse(base_url).netloc
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
        }

    def is_valid_url(self, url):
        # Only follow http/https links that stay on the starting domain.
        parsed = urlparse(url)
        return parsed.netloc == self.domain and parsed.scheme in ['http', 'https']

    def get_page_content(self, url):
        # Fetch one page with a randomized delay; return None on any error.
        try:
            time.sleep(self.delay * (0.5 + random.random()))
            response = requests.get(url, headers=self.headers, timeout=10)
            response.raise_for_status()
            return response.text
        except Exception as e:
            print(f"Error fetching {url}: {e}")
            return None

    def extract_links(self, html, base_url):
        # Collect all in-domain links from the page's <a href> attributes.
        soup = BeautifulSoup(html, 'html.parser')
        links = set()

        for link in soup.find_all('a', href=True):
            url = urljoin(base_url, link['href'])
            if self.is_valid_url(url):
                links.add(url)

        return links

    def save_content(self, url, content):
        # Save the raw HTML under data/, using the URL path as the file name
        # (the root path maps to "index" so it never produces an empty name).
        parsed = urlparse(url)
        path = parsed.path.strip('/') or 'index'
        filename = f"data/{path.replace('/', '_')}.html"

        os.makedirs(os.path.dirname(filename), exist_ok=True)
        with open(filename, 'w', encoding='utf-8') as f:
            f.write(content)

    def crawl(self):
        # Main loop: pop an unvisited URL from the frontier, fetch it,
        # save it, and queue any new in-domain links it contains.
        while self.to_visit and len(self.visited_urls) < self.max_pages:
            current_url = self.to_visit.pop()

            if current_url in self.visited_urls:
                continue

            print(f"Crawling: {current_url}")
            content = self.get_page_content(current_url)

            if content:
                self.save_content(current_url, content)
                links = self.extract_links(content, current_url)
                self.to_visit.update(links - self.visited_urls)

            self.visited_urls.add(current_url)

        print(f"Crawling completed. Visited {len(self.visited_urls)} pages.")


if __name__ == "__main__":
    # The base URL must include a scheme, otherwise urlparse/requests cannot handle it.
    crawler = WebCrawler("https://example.com", max_pages=20, delay=2.0)
    crawler.crawl()
```
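After a run, each fetched page is written to the data/ directory as a flat .html file (see save_content above). A minimal sketch for inspecting that output afterwards, assuming the crawler has already been run from the same working directory:

```python
import os
from bs4 import BeautifulSoup

# Walk the data/ directory produced by save_content() and print each page's <title>.
for name in sorted(os.listdir("data")):
    if not name.endswith(".html"):
        continue
    with open(os.path.join("data", name), encoding="utf-8") as f:
        soup = BeautifulSoup(f.read(), "html.parser")
    title = soup.title.string.strip() if soup.title and soup.title.string else "(no title)"
    print(f"{name}: {title}")
```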