import html

from bs4 import BeautifulSoup
# Clean up a saved bulletin page: pull the heading fields and header-table
# values out of the document, strip the page chrome, insert a compact summary
# after #tBody, and write the result to a new HTML file.

# Saved bulletin page to read, and the cleaned page to write.
INPUT_PATH = "showDoc.do_docSyskey=4144607.html"
OUTPUT_PATH = "1.html"

# Decode the file explicitly as UTF-8 instead of relying on the locale's
# default encoding. The markup handed to BeautifulSoup is already a str, so no
# from_encoding hint is passed (that hint only applies to byte input).
with open(INPUT_PATH, "r", encoding="utf-8") as f:
    html_doc = f.read()
soup = BeautifulSoup(html_doc, "html.parser")

# The first <span> inside #tBody carries the bulletin number, title and date
# in a single string; slice the three fields out of it.
span_text = soup.select("#tBody")[0].select("span")[0].text
heading_parts = span_text.split(":")
b_number = heading_parts[0][1:]  # text before the first ":", minus its first character
title = heading_parts[1].split("-")[0].strip()  # between the first ":" and the next "-"
date = span_text.split("(")[1][:-2]  # after the first "(", minus the last two characters

# Drop the .bLarger elements before reading the header table.
for node in soup.select(".bLarger"):
    node.extract()

# The first .blltnHdr element holds .definition cells: everything between the
# first and the last one is joined into the "Models" text, and the last one is
# the "Attention" text.
header = soup.select(".blltnHdr")[0]
definitions = header.select(".definition")
info_ = " ".join(cell.text for cell in definitions[1:-1])
a_info = definitions[-1].text

# The header values are captured, so the header elements can go.
for node in soup.select(".blltnHdr"):
    node.extract()

# NOTE(review): this extracts children of the first table under #dBody while
# iterating that table's live child list, so some children may be skipped.
# Kept as-is to avoid changing the generated page; direct-child tables of
# #dBody are removed wholesale by the next loop anyway - confirm the intent.
for child in soup.select("#dBody")[0].select("table")[0]:
    child.extract()

# Remove the remaining page chrome, in the same order as before.
for selector in ("#dBody > table", "#copyright", "#banner", "#breadCrumbs", "#c2cRd"):
    for node in soup.select(selector):
        node.extract()

tag = soup.find(id="tBody")

# The values below are plain text taken from the page, so escape them before
# interpolating into markup; otherwise a "<" or "&" in a title would be parsed
# as HTML. The fields fill {0}..{4} in this order.
# NOTE(review): style="inline" is not a valid CSS declaration; presumably
# "display:inline" was intended - left unchanged so rendering stays the same.
summary_fields = (title, b_number, date, info_, a_info)
extra_text = """
<h2 class="hha">{0}</h2>
<p>Bulletin Number(s):{1}</p> <p style="inline">Date of Issue:{2}</p>
<p>Models:{3}</p>
<p>Attention:{4}</p>
""".format(*(html.escape(value, quote=False) for value in summary_fields))
extraSoup = BeautifulSoup(extra_text, "html.parser")
# Place the summary immediately after the #tBody element.
tag.insert_after(extraSoup)

# Serialize the tree, then patch the output text: add an inline overflow style
# to the <body> tag and rename every occurrence of "dBody" to "ddd"
# (presumably so the original page's rules for #dBody no longer apply).
page = soup.prettify()
page = page.replace('body lang="en_US"', 'body lang="en_US" style="overflow:auto"')
page = page.replace("dBody", "ddd")
with open(OUTPUT_PATH, "w", encoding="utf8") as f:
    f.write(page)
我正在尝试将HTML字符串插入BeautifulSoup对象。
清除元素
- 将 `select` 选中的元素直接清除
# Detach every element matched by the selector from the tree. A plain loop is
# used because the extracted elements are not needed afterwards.
for s in soup.select(".bLarger"):
    s.extract()
新增元素
- 详见:juejin.cn/post/698763…
- 使用 `insert_after` 将新的 soup 对象插入到找到的指定元素之后
# Locate the anchor element; the new markup is inserted right after it.
tag = soup.find(id="tBody")
# The values are plain text, so escape them before interpolating into markup;
# otherwise a "<" or "&" in a value would be parsed as HTML (needs `import html`).
# NOTE(review): style="inline" is not a valid CSS declaration; presumably
# "display:inline" was intended - left unchanged so rendering stays the same.
summary_fields = (title, b_number, date, info_, a_info)
extra_text = """
<h2 class="hha">{0}</h2>
<p>Bulletin Number(s):{1}</p> <p style="inline">Date of Issue:{2}</p>
<p>Models:{3}</p>
<p>Attention:{4}</p>
""".format(*(html.escape(value, quote=False) for value in summary_fields))
# Parse the HTML string into its own soup, then insert it after the anchor.
extraSoup = BeautifulSoup(extra_text, "html.parser")
tag.insert_after(extraSoup)
将soup对象写成一个html网页
- 使用prettify()将soup对象变成str字符串
# prettify() serializes the soup to a str; write it out as UTF-8.
with open('1.html', "w", encoding="utf8") as f:
    f.write(soup.prettify())