Download link: www.pan38.com/dow/share.p… Extraction code: 2918
This code implements a basic web crawler with URL validation, page fetching, link extraction, and result saving. Before using it, install the requests and beautifulsoup4 libraries.
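The dependencies can be installed with `pip install requests beautifulsoup4`. As a quick sanity check, a minimal sketch like the one below exercises the link-extraction step against an inline HTML snippet; it assumes the WebCrawler class defined further down is available in the same file (the URLs are placeholders):

# Minimal sketch: exercise extract_links() on an inline HTML snippet.
# Assumes the WebCrawler class defined below is in the same module;
# the example.com URLs are placeholders, not a real crawl target.
crawler = WebCrawler("https://example.com", max_pages=1)

html = '''
<html><body>
  <a href="/about">About</a>
  <a href="https://example.com/contact">Contact</a>
  <a href="https://other-site.com/page">External</a>
</body></html>
'''

links = crawler.extract_links(html, "https://example.com")
print(links)
# Only same-domain links pass is_valid_url(), so the external URL is dropped:
# {'https://example.com/about', 'https://example.com/contact'}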
import requests
from bs4 import BeautifulSoup
import time
import random
from urllib.parse import urljoin, urlparse
import os


class WebCrawler:
    def __init__(self, base_url, max_pages=10, delay=1.0):
        self.base_url = base_url
        self.max_pages = max_pages
        self.delay = delay                       # base politeness delay between requests (seconds)
        self.visited_urls = set()
        self.to_visit = set([base_url])          # crawl frontier
        self.domain = urlparse(base_url).netloc  # crawl is restricted to this domain
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
        }

    def is_valid_url(self, url):
        # Keep only http(s) URLs on the same domain as the start URL.
        parsed = urlparse(url)
        return parsed.netloc == self.domain and parsed.scheme in ['http', 'https']

    def get_page_content(self, url):
        # Fetch a page, sleeping a randomized delay first to avoid hammering the server.
        try:
            time.sleep(self.delay * (0.5 + random.random()))
            response = requests.get(url, headers=self.headers, timeout=10)
            response.raise_for_status()
            return response.text
        except Exception as e:
            print(f"Error fetching {url}: {e}")
            return None

    def extract_links(self, html, base_url):
        # Collect all same-domain links from the page's <a href> attributes.
        soup = BeautifulSoup(html, 'html.parser')
        links = set()
        for link in soup.find_all('a', href=True):
            url = urljoin(base_url, link['href'])
            if self.is_valid_url(url):
                links.add(url)
        return links

    def save_content(self, url, content):
        # Save the raw HTML under data/, deriving the file name from the URL path.
        parsed = urlparse(url)
        path = parsed.path.strip('/') or 'index'
        filename = f"data/{path.replace('/', '_')}.html"
        os.makedirs(os.path.dirname(filename), exist_ok=True)
        with open(filename, 'w', encoding='utf-8') as f:
            f.write(content)

    def crawl(self):
        # Keep fetching until the frontier is empty or the page limit is reached.
        # The frontier is an unordered set, so visit order is arbitrary.
        while self.to_visit and len(self.visited_urls) < self.max_pages:
            current_url = self.to_visit.pop()
            if current_url in self.visited_urls:
                continue
            print(f"Crawling: {current_url}")
            content = self.get_page_content(current_url)
            if content:
                self.save_content(current_url, content)
                links = self.extract_links(content, current_url)
                self.to_visit.update(links - self.visited_urls)
            self.visited_urls.add(current_url)
        print(f"Crawling completed. Visited {len(self.visited_urls)} pages.")


if __name__ == "__main__":
    # The base URL must include the scheme; otherwise urlparse() yields an
    # empty netloc and requests rejects the URL.
    crawler = WebCrawler("https://example.com", max_pages=20, delay=2.0)
    crawler.crawl()