爬虫学习案例-bs4(爬取豆瓣音乐排行榜)

572 阅读2分钟

本文介绍如何使用 bs4(BeautifulSoup)爬取豆瓣音乐排行榜。

步骤:

解析数据:

1、把页面源代码交给BeautifulSoup进行处理,生成bs对象

2、从bs对象中查找数据

import requests
from bs4 import BeautifulSoup

# Fetch the Douban music chart page and print the first <li> of the chart list.
url = "https://music.douban.com/chart"
# Send a browser-style User-Agent header with the request.
headers = {
    'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/95.0.4638.54 Safari/537.36 Edg/95.0.1020.30'
}
# Without a timeout, requests waits indefinitely if the server never answers.
resp = requests.get(url, headers=headers, timeout=10)
# Fail right away on an HTTP error status instead of parsing an error page.
resp.raise_for_status()

# Parse the data
# 1. Hand the page source to BeautifulSoup to build a bs object
page = BeautifulSoup(resp.text, 'html.parser')  # use the html.parser backend

# 2. Look up data in the bs object
# find(tag, attribute=value)
# find_all(tag, attribute=value)

# ul1 = page.find("ul",class="col5")  # invalid: class is a Python keyword
# ul1 = page.find("ul",class_="col5")
ul2 = page.find("ul", attrs={"class": "col5"})
# The class_= form and the attrs= form above give the same result.
if ul2 is None:
    # find() returns None when nothing matches; report that clearly instead
    # of failing later with an AttributeError on None.
    raise RuntimeError('no <ul class="col5"> element found in the response')

# Keep only the first <li> (the slice yields a list with at most one element).
li1 = ul2.find_all("li")[0:1]

print(li1)

image.png

import requests
from bs4 import BeautifulSoup

# Fetch the Douban music chart page and print the title and the <p> text of
# every entry in the chart list.
url = "https://music.douban.com/chart"
# Send a browser-style User-Agent header with the request.
headers = {
    'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/95.0.4638.54 Safari/537.36 Edg/95.0.1020.30'
}
# Without a timeout, requests waits indefinitely if the server never answers.
resp = requests.get(url, headers=headers, timeout=10)
# Fail right away on an HTTP error status instead of parsing an error page.
resp.raise_for_status()

# Parse the data
# 1. Hand the page source to BeautifulSoup to build a bs object
page = BeautifulSoup(resp.text, 'html.parser')  # use the html.parser backend

# 2. Look up data in the bs object
# find(tag, attribute=value)
# find_all(tag, attribute=value)

# ul1 = page.find("ul",class="col5")  # invalid: class is a Python keyword
# ul1 = page.find("ul",class_="col5")
ul2 = page.find("ul", attrs={"class": "col5"})
# The class_= form and the attrs= form above give the same result.
if ul2 is None:
    # find() returns None when nothing matches; report that clearly instead
    # of failing later with an AttributeError on None.
    raise RuntimeError('no <ul class="col5"> element found in the response')

li1 = ul2.find_all("li")
for li in li1:
    # The song title is the text of the anchor whose href is "javascript:;".
    a1 = li.find("a", href="javascript:;")
    p1 = li.find("p")
    if a1 is None or p1 is None:
        # Skip an entry that lacks either tag rather than crashing on None.
        continue
    print("")
    print("歌曲名:" + a1.text)  # .text drops the markup and keeps only the text
    print(p1.text)
    print('_' * 100)  # separator line between entries

image.png

保存文件

import requests
from bs4 import BeautifulSoup
import re
import csv

# Fetch the Douban music chart page and save one CSV row per chart entry.
# No header row is written here; only the data rows go into the file.
url = "https://music.douban.com/chart"
# Send a browser-style User-Agent header with the request.
headers = {
    'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/95.0.4638.54 Safari/537.36 Edg/95.0.1020.30'
}
# Without a timeout, requests waits indefinitely if the server never answers.
resp = requests.get(url, headers=headers, timeout=10)
# Fail right away on an HTTP error status instead of parsing an error page.
resp.raise_for_status()

# Parse the data
# 1. Hand the page source to BeautifulSoup to build a bs object
page = BeautifulSoup(resp.text, 'html.parser')  # use the html.parser backend

# 2. Look up data in the bs object
# find(tag, attribute=value)
# find_all(tag, attribute=value)

# ul1 = page.find("ul",class="col5")  # invalid: class is a Python keyword
# ul1 = page.find("ul",class_="col5")
ul2 = page.find("ul", attrs={"class": "col5"})
# The class_= form and the attrs= form above give the same result.
if ul2 is None:
    # find() returns None when nothing matches; report that clearly instead
    # of failing later with an AttributeError on None.
    raise RuntimeError('no <ul class="col5"> element found in the response')

li1 = ul2.find_all("li")

# newline="" is what the csv module requires for files it writes to; without
# it an extra blank line appears after every row on Windows. The with-block
# closes the file even if a row fails to be written.
with open("豆瓣歌曲排名.csv", mode="w", encoding='utf-8', newline="") as f:
    csvwriter = csv.writer(f)
    for li in li1:
        # The anchor whose href is "javascript:;" is taken as the first column
        # and the first <p> as the second (presumably song title and
        # artist / play count -- confirm against the page markup).
        a1 = li.find("a", href="javascript:;")
        p1 = li.find("p")
        if a1 is None or p1 is None:
            # Skip an entry that lacks either tag rather than crashing on None.
            continue

        a = a1.text
        p = p1.text

        csvwriter.writerow([a, p])
    

检查保存结果

import pandas as pd 

# Read the saved CSV back into a DataFrame to check what was written.
df = pd.read_csv("豆瓣歌曲排名.csv")
# Preview the first rows. NOTE(review): presumably run as a notebook cell,
# where the value of the last expression is displayed; in a plain script this
# line prints nothing -- confirm.
df.head()

image.png

发现第一行数据被当成了表头(pandas 默认把 CSV 的第一行作为列名),因此修改代码,在写入数据之前先写入一行表头。

import requests
from bs4 import BeautifulSoup
import re
import csv

# Fetch the Douban music chart page and save it as a CSV file: one header row
# followed by one row per chart entry.
url = "https://music.douban.com/chart"
# Send a browser-style User-Agent header with the request.
headers = {
    'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/95.0.4638.54 Safari/537.36 Edg/95.0.1020.30'
}
# Without a timeout, requests waits indefinitely if the server never answers.
resp = requests.get(url, headers=headers, timeout=10)
# Fail right away on an HTTP error status instead of parsing an error page.
resp.raise_for_status()

# Parse the data
# 1. Hand the page source to BeautifulSoup to build a bs object
page = BeautifulSoup(resp.text, 'html.parser')  # use the html.parser backend

# 2. Look up data in the bs object
# find(tag, attribute=value)
# find_all(tag, attribute=value)

# ul1 = page.find("ul",class="col5")  # invalid: class is a Python keyword
# ul1 = page.find("ul",class_="col5")
ul2 = page.find("ul", attrs={"class": "col5"})
# The class_= form and the attrs= form above give the same result.
if ul2 is None:
    # find() returns None when nothing matches; report that clearly instead
    # of failing later with an AttributeError on None.
    raise RuntimeError('no <ul class="col5"> element found in the response')

li1 = ul2.find_all("li")

# newline="" is what the csv module requires for files it writes to; without
# it an extra blank line appears after every row on Windows. The with-block
# closes the file even if a row fails to be written.
with open("豆瓣歌曲排名.csv", mode="w", encoding='utf-8', newline="") as f:
    csvwriter = csv.writer(f)
    # Header row: song title, then artist and play count.
    csvwriter.writerow(["歌曲名","作者与播放次数"])
    for li in li1:
        # Column 1 comes from the anchor whose href is "javascript:;",
        # column 2 from the first <p> of the entry.
        a1 = li.find("a", href="javascript:;")
        p1 = li.find("p")
        if a1 is None or p1 is None:
            # Skip an entry that lacks either tag rather than crashing on None.
            continue

        a = a1.text
        p = p1.text

        csvwriter.writerow([a, p])
    

再次检查

import pandas as pd 

# Read the saved CSV back into a DataFrame to check what was written.
df = pd.read_csv("豆瓣歌曲排名.csv")
# Preview the first rows. NOTE(review): presumably run as a notebook cell,
# where the value of the last expression is displayed; in a plain script this
# line prints nothing -- confirm.
df.head()

image.png