Python 解析浏览器书签

192 阅读3分钟

将浏览器导出的书签(HTML)转换为 JSON 格式,按域名分组,并保存为 JSON 文件。

'''
import json
from bs4 import BeautifulSoup
from urllib.parse import urlparse
from collections import defaultdict

def parse_firefox_bookmarks(bookmarks_file):
    """Group the bookmarks found in an exported bookmarks HTML file by domain.

    Args:
        bookmarks_file: Path to the bookmarks HTML file, read as UTF-8.

    Returns:
        A defaultdict mapping each URL's network location (netloc) to a list
        of {"name": ..., "url": ...} dicts, in document order.
    """
    with open(bookmarks_file, 'r', encoding='utf-8') as fh:
        markup = fh.read()

    # Every <a> element in the parsed document is treated as one bookmark.
    soup = BeautifulSoup(markup, 'lxml')

    grouped = defaultdict(list)
    for anchor in soup.find_all('a'):
        href = anchor.get('href')
        title = anchor.text.strip()
        # Skip anchors that lack either a target or a visible title.
        if not (href and title):
            continue
        netloc = urlparse(href).netloc
        grouped[netloc].append({"name": title, "url": href})

    return grouped

def save_bookmarks_to_json(bookmarks_by_domain, output_file):
    """Write the grouped bookmarks to output_file as indented UTF-8 JSON.

    Non-ASCII characters are written as-is rather than as \\u escapes.
    """
    with open(output_file, 'w', encoding='utf-8') as sink:
        serialized = json.dumps(bookmarks_by_domain, ensure_ascii=False, indent=4)
        sink.write(serialized)

if __name__ == "__main__":
    # Source: the bookmarks HTML export; destination: JSON grouped by domain.
    bookmarks_file, output_json_file = 'bookmarks.html', 'bookmarks.json'

    # Parse the export, then persist the grouped result.
    bookmarks_by_domain = parse_firefox_bookmarks(bookmarks_file)
    save_bookmarks_to_json(bookmarks_by_domain, output_json_file)

    print(f"书签已保存到 {output_json_file}")

'''


import json
from bs4 import BeautifulSoup
from urllib.parse import urlparse
from collections import defaultdict

def parse_firefox_bookmarks(bookmarks_file):
    """Group the bookmarks of an exported bookmarks HTML file by domain, without duplicate URLs.

    Args:
        bookmarks_file: Path to the bookmarks HTML file, read as UTF-8.

    Returns:
        A plain dict mapping each URL's network location (netloc) to a list
        of {"name": ..., "url": ...} dicts. Only the first occurrence of a
        given URL is kept; document order is preserved.
    """
    with open(bookmarks_file, 'r', encoding='utf-8') as fh:
        markup = fh.read()

    soup = BeautifulSoup(markup, 'lxml')

    # A URL always maps to the same netloc, so one global set of seen URLs
    # gives the same result as keeping a separate set per domain.
    seen_urls = set()
    grouped = {}

    # Every <a> element in the parsed document is treated as one bookmark.
    for anchor in soup.find_all('a'):
        href = anchor.get('href')
        title = anchor.text.strip()
        if not (href and title):
            continue
        netloc = urlparse(href).netloc
        if href in seen_urls:
            continue
        seen_urls.add(href)
        grouped.setdefault(netloc, []).append({"name": title, "url": href})

    return grouped

def save_bookmarks_to_json(bookmarks_by_domain, output_file):
    """Write the grouped bookmarks to output_file as indented UTF-8 JSON.

    Non-ASCII characters are written as-is rather than as \\u escapes.
    """
    with open(output_file, 'w', encoding='utf-8') as sink:
        serialized = json.dumps(bookmarks_by_domain, ensure_ascii=False, indent=4)
        sink.write(serialized)

if __name__ == "__main__":
    # Source: the bookmarks HTML export; destination: de-duplicated JSON backup.
    bookmarks_file, output_json_file = 'bookmarks.html', 'bookmarkBak.json'

    # Parse the export, then persist the grouped result.
    bookmarks_by_domain = parse_firefox_bookmarks(bookmarks_file)
    save_bookmarks_to_json(bookmarks_by_domain, output_json_file)

    print(f"书签已保存到 {output_json_file}")


读取上一步生成的 JSON 文件(URL 去重已在上一步完成),过滤掉名称或 URL 中包含 "zte" 的书签,生成浏览器可以读取的 HTML 书签文件,再重新导入浏览器中。

import json
from bs4 import BeautifulSoup

def load_and_filter_bookmarks(json_file, exclude_keywords=("zte",)):
    """Load grouped bookmarks from a JSON file and flatten them, dropping unwanted ones.

    Args:
        json_file: Path to a UTF-8 JSON file holding an object whose values
            are lists of objects that each have "name" and "url" entries.
        exclude_keywords: Substrings to filter on. A bookmark is dropped when
            any keyword occurs, case-insensitively, in its name or its URL.
            A single string is accepted as one keyword. Defaults to
            ("zte",), which is the filter that used to be hard-coded here.
            Empty keywords are ignored.

    Returns:
        A flat list of {"name": ..., "url": ...} dicts, in file order.

    Raises:
        KeyError: If a bookmark entry lacks "name" or "url".
    """
    with open(json_file, 'r', encoding='utf-8') as file:
        bookmarks_by_domain = json.load(file)

    # A bare string would otherwise be iterated character by character.
    if isinstance(exclude_keywords, str):
        exclude_keywords = (exclude_keywords,)
    # Lower-case once, outside the loops. An empty keyword is a substring of
    # every string and would silently drop everything, so it is skipped.
    keywords = [keyword.lower() for keyword in exclude_keywords if keyword]

    all_bookmarks = []
    for bookmarks in bookmarks_by_domain.values():
        for bookmark in bookmarks:
            name = bookmark["name"]
            url = bookmark["url"]
            haystacks = (name.lower(), url.lower())
            if any(keyword in text for keyword in keywords for text in haystacks):
                continue
            # Rebuild the entry so only the two known fields are carried over.
            all_bookmarks.append({"name": name, "url": url})

    return all_bookmarks

def bookmarks_to_firefox_html(bookmarks, output_html):
    """Write bookmarks to output_html as a flat Netscape-format bookmarks file.

    All bookmarks are emitted at the top level of a single <DL> list; no
    folders are created. Names and URLs are HTML-escaped so that characters
    such as "&", "<" or a double quote cannot corrupt the generated markup.

    Args:
        bookmarks: Iterable of dicts, each with "name" and "url" entries.
        output_html: Path of the HTML file to write (UTF-8, overwritten).

    Raises:
        KeyError: If a bookmark entry lacks "name" or "url".
    """
    # Local import keeps this function self-contained without touching the
    # file-level import block.
    from html import escape

    # Fixed header of the bookmarks file, followed by the opening list tag.
    html_content = [
        '<!DOCTYPE NETSCAPE-Bookmark-file-1>',
        '<META HTTP-EQUIV="Content-Type" CONTENT="text/html; charset=UTF-8">',
        '<TITLE>Bookmarks</TITLE>',
        '<H1>Bookmarks</H1>',
        '<DL><p>'
    ]

    for bookmark in bookmarks:
        # The URL sits inside a double-quoted attribute, so quotes must be
        # escaped there; in element text only &, < and > need escaping.
        url = escape(bookmark["url"], quote=True)
        name = escape(bookmark["name"], quote=False)
        html_content.append(f'    <DT><A HREF="{url}">{name}</A>')

    html_content.append('</DL><p>')  # close the bookmark list

    with open(output_html, 'w', encoding='utf-8') as file:
        file.write("\n".join(html_content))

if __name__ == "__main__":
    import sys  # local import: only the script entry point needs it

    # The defaults reproduce the previously hard-coded paths. Either can be
    # overridden positionally:  script.py [input_json [output_html]]
    cli_args = sys.argv[1:]
    input_json_file = (
        cli_args[0] if len(cli_args) > 0
        else '/home/10325461@zte.intra/PyFoo/bookmarks/bookmarkBak.json'  # JSON produced by the previous step
    )
    output_html_file = cli_args[1] if len(cli_args) > 1 else 'filtered_bookmarks.html'

    # Filter the JSON bookmarks and write them out as a bookmarks HTML file.
    filtered_bookmarks = load_and_filter_bookmarks(input_json_file)
    bookmarks_to_firefox_html(filtered_bookmarks, output_html_file)

    print(f"已成功去除 'zte' 相关书签,并生成 {output_html_file}")