# 将浏览器书签转换为 JSON 格式,按域名分组后保存为 JSON 文件
'''
import json
from bs4 import BeautifulSoup
from urllib.parse import urlparse
from collections import defaultdict
def parse_firefox_bookmarks(bookmarks_file):
    """Parse a bookmarks HTML file and group the bookmarks by domain.

    Every ``<a>`` element that has both a non-empty ``href`` and non-empty
    link text is treated as one bookmark. Duplicate URLs are kept.

    Args:
        bookmarks_file: Path of the bookmarks HTML file; read as UTF-8.

    Returns:
        A ``defaultdict(list)`` mapping each domain (``urlparse(url).netloc``)
        to a list of ``{"name": ..., "url": ...}`` dicts in document order.
    """
    # Read the whole bookmarks file
    with open(bookmarks_file, 'r', encoding='utf-8') as file:
        content = file.read()

    # Parse the HTML with BeautifulSoup (lxml backend)
    soup = BeautifulSoup(content, 'lxml')

    # Group bookmarks by domain
    bookmarks_by_domain = defaultdict(list)

    # Each <a> tag represents one bookmark
    for a in soup.find_all('a'):
        url = a.get('href')
        name = a.text.strip()
        if url and name:
            # The network location of the URL is the grouping key
            domain = urlparse(url).netloc
            bookmarks_by_domain[domain].append({"name": name, "url": url})

    return bookmarks_by_domain
def save_bookmarks_to_json(bookmarks_by_domain, output_file):
    """Write the grouped bookmarks to ``output_file`` as JSON.

    Args:
        bookmarks_by_domain: Mapping of domain to a list of bookmark dicts.
        output_file: Destination path; written as UTF-8 and overwritten if
            it already exists.
    """
    with open(output_file, 'w', encoding='utf-8') as file:
        # ensure_ascii=False keeps non-ASCII bookmark names human-readable
        json.dump(bookmarks_by_domain, file, ensure_ascii=False, indent=4)
if __name__ == "__main__":
    # Path of the bookmarks HTML file to read
    bookmarks_file = 'bookmarks.html'
    # Path of the JSON file to write
    output_json_file = 'bookmarks.json'
    # Parse the bookmarks and store them as JSON
    bookmarks_by_domain = parse_firefox_bookmarks(bookmarks_file)
    save_bookmarks_to_json(bookmarks_by_domain, output_json_file)
    print(f"书签已保存到 {output_json_file}")
'''
import json
from bs4 import BeautifulSoup
from urllib.parse import urlparse
from collections import defaultdict
def parse_firefox_bookmarks(bookmarks_file):
    """Parse a bookmarks HTML file into de-duplicated bookmarks grouped by domain.

    Every ``<a>`` element that has both a non-empty ``href`` and non-empty
    link text is treated as one bookmark. A URL that has already been seen is
    skipped, so the first occurrence (and its name) wins.

    Args:
        bookmarks_file: Path of the bookmarks HTML file; read as UTF-8.

    Returns:
        A plain ``dict`` mapping each domain (``urlparse(url).netloc``) to a
        list of ``{"name": ..., "url": ...}`` dicts in document order. URLs
        for which ``netloc`` is empty are grouped under the key ``""``.
    """
    # Read the whole bookmarks file
    with open(bookmarks_file, 'r', encoding='utf-8') as file:
        content = file.read()

    # Parse the HTML with BeautifulSoup (lxml backend)
    soup = BeautifulSoup(content, 'lxml')

    # One global set is enough for de-duplication: identical URLs always
    # share the same netloc, so they would land in the same group anyway.
    seen_urls = set()
    bookmarks_by_domain = defaultdict(list)

    # Each <a> tag represents one bookmark
    for a in soup.find_all('a'):
        url = a.get('href')
        name = a.text.strip()
        if not (url and name) or url in seen_urls:
            continue
        seen_urls.add(url)
        domain = urlparse(url).netloc
        bookmarks_by_domain[domain].append({"name": name, "url": url})

    # Hand back a plain dict so callers do not get defaultdict semantics
    return dict(bookmarks_by_domain)
def save_bookmarks_to_json(bookmarks_by_domain, output_file):
    """Write the grouped bookmarks to ``output_file`` as JSON.

    Args:
        bookmarks_by_domain: Mapping of domain to a list of bookmark dicts.
        output_file: Destination path; written as UTF-8 and overwritten if
            it already exists.
    """
    with open(output_file, 'w', encoding='utf-8') as file:
        # ensure_ascii=False keeps non-ASCII bookmark names human-readable
        json.dump(bookmarks_by_domain, file, ensure_ascii=False, indent=4)
if __name__ == "__main__":
    # Path of the bookmarks HTML file to read
    bookmarks_file = 'bookmarks.html'
    # Path of the JSON file to write
    output_json_file = 'bookmarkBak.json'
    # Parse the bookmarks and store them as JSON
    bookmarks_by_domain = parse_firefox_bookmarks(bookmarks_file)
    save_bookmarks_to_json(bookmarks_by_domain, output_json_file)
    print(f"书签已保存到 {output_json_file}")
# 读取 JSON 文件并过滤书签(去重已在生成 JSON 时完成),生成浏览器可以读取的 HTML,以便重新导入浏览器
import json
from bs4 import BeautifulSoup
def load_and_filter_bookmarks(json_file, exclude_keyword="zte"):
    """Load grouped bookmarks from JSON and return them as one filtered list.

    The per-domain grouping is flattened. A bookmark is dropped when
    ``exclude_keyword`` occurs (case-insensitively) in its name or its URL.

    Args:
        json_file: Path of a UTF-8 JSON file mapping a domain to a list of
            ``{"name": ..., "url": ...}`` dicts.
        exclude_keyword: Substring that disqualifies a bookmark. Defaults to
            ``"zte"``. An empty string or ``None`` disables filtering.

    Returns:
        A list of ``{"name": ..., "url": ...}`` dicts, in file order.

    Raises:
        KeyError: If a bookmark entry lacks ``"name"`` or ``"url"``.
    """
    # Read the JSON file
    with open(json_file, 'r', encoding='utf-8') as file:
        bookmarks_by_domain = json.load(file)

    # Lower-case once; an empty needle must not match (``"" in s`` is always
    # true and would otherwise discard every bookmark).
    needle = exclude_keyword.lower() if exclude_keyword else ""

    all_bookmarks = []
    for bookmarks in bookmarks_by_domain.values():
        for bookmark in bookmarks:
            name = bookmark["name"]
            url = bookmark["url"]
            if needle and (needle in name.lower() or needle in url.lower()):
                continue  # bookmark mentions the excluded keyword
            all_bookmarks.append({"name": name, "url": url})
    return all_bookmarks
def bookmarks_to_firefox_html(bookmarks, output_html):
    """Write ``bookmarks`` as a flat Netscape-format bookmarks HTML file.

    All bookmarks are emitted at the top level (no folders). Names and URLs
    are HTML-escaped so that characters such as ``&``, ``<`` or ``"`` cannot
    break the generated markup.

    Args:
        bookmarks: Iterable of ``{"name": ..., "url": ...}`` dicts.
        output_html: Destination path; written as UTF-8 and overwritten if
            it already exists.

    Raises:
        KeyError: If a bookmark entry lacks ``"name"`` or ``"url"``.
    """
    # Imported locally so this function does not depend on the file's
    # top-level import block.
    from html import escape

    # Header of the bookmarks file, followed by the opening list tag
    html_content = [
        '<!DOCTYPE NETSCAPE-Bookmark-file-1>',
        '<META HTTP-EQUIV="Content-Type" CONTENT="text/html; charset=UTF-8">',
        '<TITLE>Bookmarks</TITLE>',
        '<H1>Bookmarks</H1>',
        '<DL><p>',
    ]

    # Emit every bookmark as a flat entry
    for bookmark in bookmarks:
        name = escape(bookmark["name"], quote=False)
        # quote=True: the URL sits inside a double-quoted attribute value
        url = escape(bookmark["url"], quote=True)
        html_content.append(f' <DT><A HREF="{url}">{name}</A>')

    html_content.append('</DL><p>')  # close the bookmark list

    with open(output_html, 'w', encoding='utf-8') as file:
        file.write("\n".join(html_content))
if __name__ == "__main__":
    # JSON file produced earlier.
    # NOTE(review): this is an absolute, machine-specific path, whereas the
    # script that generates the JSON writes 'bookmarkBak.json' relative to
    # its working directory - confirm the location before running.
    input_json_file = '/home/10325461@zte.intra/PyFoo/bookmarks/bookmarkBak.json'
    # Bookmarks HTML file to generate
    output_html_file = 'filtered_bookmarks.html'
    # Filter the JSON bookmarks and generate the bookmarks HTML file
    filtered_bookmarks = load_and_filter_bookmarks(input_json_file)
    bookmarks_to_firefox_html(filtered_bookmarks, output_html_file)
    print(f"已成功去除 'zte' 相关书签,并生成 {output_html_file}")