[python]爬取一个网站上的图片,写入csv文件中

37 阅读1分钟

爬取列表+详情

```
import csv, time
import requests
from bs4 import BeautifulSoup


def get_list(url, file):
    """Fetch the listing page at *url* and append every
    <a class="main-content__card-link"> href to the CSV file *file*,
    one link per row.

    On a non-200 response nothing is written and the status code is printed.
    """
    # Timeout so a dead/slow host cannot hang the scraper forever.
    response = requests.get(url, timeout=30)
    # Only parse on success
    if response.status_code == 200:
        # Parse the HTML content
        soup = BeautifulSoup(response.content, 'html.parser')
        # Detail-page links carry the "main-content__card-link" class
        a_tags = soup.find_all('a', class_='main-content__card-link')
        # Append mode so repeated calls (e.g. successive pages) accumulate
        with open(file, 'a', newline='') as csvfile:
            writer = csv.writer(csvfile)
            for a_tag in a_tags:
                href = a_tag.get('href')
                writer.writerow([href])
        # Fixed typo: message previously read 'get ist done.'
        print('get list done.')
    else:
        print('请求失败:', response.status_code)


def get_detail(file1, file2):
    """Read detail-page URLs from CSV *file1* (first column of each row),
    fetch each page, and append every <a class="box-massage__card-link">
    href found to CSV *file2*.

    Failed requests print the status code and are skipped; blank input
    rows are tolerated.
    """
    # Open the output once instead of reopening it for every input row;
    # append mode preserves the original accumulate-across-runs behavior.
    with open(file1, 'r') as csvfile, \
            open(file2, 'a', newline='') as outfile:
        reader = csv.reader(csvfile)
        writer = csv.writer(outfile)
        for row in reader:
            if not row:
                # Skip blank lines rather than crash on row[0]
                continue
            url = row[0]
            print(url)
            # Timeout so one dead URL cannot stall the whole run
            response = requests.get(url, timeout=30)
            # Be polite: throttle to roughly one request per second
            time.sleep(1)
            if response.status_code == 200:
                # Parse the HTML content
                soup = BeautifulSoup(response.content, 'html.parser')
                # Image/detail links carry the "box-massage__card-link" class
                a_tags = soup.find_all('a', class_='box-massage__card-link')
                for a_tag in a_tags:
                    writer.writerow([a_tag.get('href')])
                print('get detail done.')
            else:
                print('请求失败:', response.status_code)


if __name__ == "__main__":
    # Entry point: scrape the first listing page into a CSV of detail links.
    start_url = 'https://babesource.com/paysites/318/Lets+Doe+It/page1.html'
    list_csv = 'lubed_list.csv'
    get_list(start_url, list_csv)
    # get_detail('film_list.csv','film.csv')
```