本次内容:使用 requests 获取页面,再用 bs4(BeautifulSoup)解析,爬取豆瓣音乐排行榜
步骤:
解析数据:
1、把页面源代码交给BeautifulSoup进行处理,生成bs对象
2、从bs对象中查找数据
import requests
from bs4 import BeautifulSoup
# Fetch the Douban music chart page and print the first <li> of the chart list.
url = "https://music.douban.com/chart"
# Send a desktop-browser User-Agent along with the request.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/95.0.4638.54 Safari/537.36 Edg/95.0.1020.30'
}
# A timeout keeps the script from hanging forever if the server never answers.
resp = requests.get(url, headers=headers, timeout=10)
# Fail loudly on an HTTP error instead of silently parsing an error page.
resp.raise_for_status()

# Parse the data
# 1. Hand the page source to BeautifulSoup to build a bs object.
page = BeautifulSoup(resp.text, 'html.parser')  # explicitly pick the HTML parser
# 2. Look up data in the bs object:
#      find(tag, attribute=value)      -> first match
#      find_all(tag, attribute=value)  -> every match
# `class` is a Python keyword, so find("ul", class="col5") is a syntax error.
# Use find("ul", class_="col5") or the attrs dict below; both give the same result.
ul2 = page.find("ul", attrs={"class": "col5"})
if ul2 is None:
    # find() returns None when nothing matches; report that clearly rather
    # than crashing with an AttributeError on the next line.
    raise RuntimeError('chart list <ul class="col5"> not found in the page')

# Slice instead of indexing so an empty list prints [] rather than raising.
li1 = ul2.find_all("li")[0:1]
print(li1)
import requests
from bs4 import BeautifulSoup
# Fetch the Douban music chart page and print the title and info line of every song.
url = "https://music.douban.com/chart"
# Send a desktop-browser User-Agent along with the request.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/95.0.4638.54 Safari/537.36 Edg/95.0.1020.30'
}
# A timeout keeps the script from hanging forever if the server never answers.
resp = requests.get(url, headers=headers, timeout=10)
# Fail loudly on an HTTP error instead of silently parsing an error page.
resp.raise_for_status()

# Parse the data
# 1. Hand the page source to BeautifulSoup to build a bs object.
page = BeautifulSoup(resp.text, 'html.parser')  # explicitly pick the HTML parser
# 2. Look up data in the bs object:
#      find(tag, attribute=value)      -> first match
#      find_all(tag, attribute=value)  -> every match
# `class` is a Python keyword, so find("ul", class="col5") is a syntax error.
# Use find("ul", class_="col5") or the attrs dict below; both give the same result.
ul2 = page.find("ul", attrs={"class": "col5"})
if ul2 is None:
    # find() returns None when nothing matches; report that clearly rather
    # than crashing with an AttributeError on the next line.
    raise RuntimeError('chart list <ul class="col5"> not found in the page')

li1 = ul2.find_all("li")
for li in li1:
    a1 = li.find("a", href="javascript:;")  # link holding the song title
    p1 = li.find("p")                       # paragraph printed below the title
    if a1 is None or p1 is None:
        # Skip list items that do not have the expected title/paragraph pair.
        continue
    print("")
    print("歌曲名:" + a1.text)  # .text drops the markup and keeps only the text
    print(p1.text)
    print('_' * 100)  # separator line between songs
保存文件
import requests
from bs4 import BeautifulSoup
import re
import csv
# Fetch the Douban music chart page and save each song's title and info line to a CSV file.
url = "https://music.douban.com/chart"
# Send a desktop-browser User-Agent along with the request.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/95.0.4638.54 Safari/537.36 Edg/95.0.1020.30'
}
# A timeout keeps the script from hanging forever if the server never answers.
resp = requests.get(url, headers=headers, timeout=10)
# Fail loudly on an HTTP error instead of silently parsing an error page.
resp.raise_for_status()

# Parse the data
# 1. Hand the page source to BeautifulSoup to build a bs object.
page = BeautifulSoup(resp.text, 'html.parser')  # explicitly pick the HTML parser
# 2. Look up data in the bs object:
#      find(tag, attribute=value)      -> first match
#      find_all(tag, attribute=value)  -> every match
# `class` is a Python keyword, so find("ul", class="col5") is a syntax error.
# Use find("ul", class_="col5") or the attrs dict below; both give the same result.
ul2 = page.find("ul", attrs={"class": "col5"})
if ul2 is None:
    # find() returns None when nothing matches; report that clearly rather
    # than crashing with an AttributeError on the next line.
    raise RuntimeError('chart list <ul class="col5"> not found in the page')

li1 = ul2.find_all("li")

# The output file is opened only after the page parsed successfully, so a
# failed request does not leave an empty file behind.
# `with` closes the file even if a row fails to write; newline='' is what the
# csv module expects and prevents blank lines between rows on Windows.
with open("豆瓣歌曲排名.csv", mode="w", encoding='utf-8', newline='') as f:
    csvwriter = csv.writer(f)
    # NOTE: no header row is written in this version; the data starts on line 1.
    for li in li1:
        a1 = li.find("a", href="javascript:;")  # link holding the song title
        p1 = li.find("p")                       # paragraph saved as the second column
        if a1 is None or p1 is None:
            # Skip list items that do not have the expected title/paragraph pair.
            continue
        a = a1.text
        p = p1.text
        csvwriter.writerow([a, p])
检查保存结果
import pandas as pd
# Read the saved CSV back in to inspect what was written.
df = pd.read_csv("豆瓣歌曲排名.csv")
# Preview the first rows; in a notebook the value of this trailing expression
# is displayed as the cell output (as a plain script it would be discarded).
df.head()
发现第一行数据被当成了表头(列名),修改下代码:在写入数据之前先写入一行表头
import requests
from bs4 import BeautifulSoup
import re
import csv
# Fetch the Douban music chart page and save each song's title and info line
# to a CSV file, preceded by a header row.
url = "https://music.douban.com/chart"
# Send a desktop-browser User-Agent along with the request.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/95.0.4638.54 Safari/537.36 Edg/95.0.1020.30'
}
# A timeout keeps the script from hanging forever if the server never answers.
resp = requests.get(url, headers=headers, timeout=10)
# Fail loudly on an HTTP error instead of silently parsing an error page.
resp.raise_for_status()

# Parse the data
# 1. Hand the page source to BeautifulSoup to build a bs object.
page = BeautifulSoup(resp.text, 'html.parser')  # explicitly pick the HTML parser
# 2. Look up data in the bs object:
#      find(tag, attribute=value)      -> first match
#      find_all(tag, attribute=value)  -> every match
# `class` is a Python keyword, so find("ul", class="col5") is a syntax error.
# Use find("ul", class_="col5") or the attrs dict below; both give the same result.
ul2 = page.find("ul", attrs={"class": "col5"})
if ul2 is None:
    # find() returns None when nothing matches; report that clearly rather
    # than crashing with an AttributeError on the next line.
    raise RuntimeError('chart list <ul class="col5"> not found in the page')

li1 = ul2.find_all("li")

# The output file is opened only after the page parsed successfully, so a
# failed request does not leave an empty file behind.
# `with` closes the file even if a row fails to write; newline='' is what the
# csv module expects and prevents blank lines between rows on Windows.
with open("豆瓣歌曲排名.csv", mode="w", encoding='utf-8', newline='') as f:
    csvwriter = csv.writer(f)
    # Write the column names first so the first song is not read back as the header.
    csvwriter.writerow(["歌曲名", "作者与播放次数"])
    for li in li1:
        a1 = li.find("a", href="javascript:;")  # link holding the song title
        p1 = li.find("p")                       # paragraph saved as the second column
        if a1 is None or p1 is None:
            # Skip list items that do not have the expected title/paragraph pair.
            continue
        a = a1.text
        p = p1.text
        csvwriter.writerow([a, p])
再次检查
import pandas as pd
# Read the regenerated CSV back in to confirm the header row is now in place.
df = pd.read_csv("豆瓣歌曲排名.csv")
# Preview the first rows; in a notebook the value of this trailing expression
# is displayed as the cell output (as a plain script it would be discarded).
df.head()