# ---- browser.py ----
def open_in_browser(html):
    """
    Open the HTML document in a web browser, saving it to a temporary
    file to open it. Note that this does not delete the file after
    use. This is mainly meant for debugging.

    :param html: the HTML document as a str.
    :returns: the ``file://`` URL that was opened.
    """
    import os
    import webbrowser
    import tempfile
    from pathlib import Path

    handle, fn = tempfile.mkstemp(suffix=".html")
    # fdopen adopts the OS-level handle so it is closed exactly once.
    with os.fdopen(handle, "wb") as f:
        # Declare UTF-8 so the browser decodes the bytes we write below.
        f.write(b"<meta charset='UTF-8' />")
        f.write(html.encode("utf-8"))
    # Path.as_uri() builds a correct file:// URL on every platform
    # (file:///C:/... drive letters on Windows, percent-encoding of
    # spaces and special characters) unlike naive separator replacement.
    url = Path(fn).as_uri()
    webbrowser.open(url)
    return url
# ---- cleaner.py ----
import re
from lxml.html.clean import Cleaner
bad_attrs = ["width", "height", "style", "[-a-z]*color", "background[-a-z]*", "on*"]
single_quoted = "'[^']+'"
double_quoted = '"[^"]+"'
non_space = "[^ \"'>]+"
htmlstrip = re.compile(
"<"
"([^>]+) "
"(?:%s) *" % ("|".join(bad_attrs),)
+ "= *(?:%s|%s|%s)"
% (non_space, single_quoted, double_quoted)
+ "([^>]*)"
">",
re.I,
)
def clean_attributes(html):
    """Strip banned presentational/scripting attributes from *html* markup."""
    # Iterate to a fixpoint: each pass removes one banned attribute per
    # matching tag, so tags with several banned attributes need extra passes.
    while True:
        stripped = htmlstrip.sub(r"<\1\2>", html)
        if stripped == html:
            return html
        html = stripped
def normalize_spaces(s):
    """Collapse all runs of whitespace in *s* to single spaces.

    Falsy input (``None``, ``""``) yields an empty string; leading and
    trailing whitespace is removed.
    """
    return " ".join(s.split()) if s else ""
# Shared lxml Cleaner configured for content extraction: a True flag means
# the corresponding content is removed (per lxml.html.clean semantics).
html_cleaner = Cleaner(
    scripts=True,  # drop <script> elements
    javascript=True,  # drop javascript: links and script content
    comments=True,  # drop HTML comments
    style=True,  # drop <style> elements and style attributes
    links=True,  # drop <link> (stylesheet) elements
    meta=False,  # keep <meta> tags
    add_nofollow=False,  # do not rewrite anchors with rel="nofollow"
    page_structure=False,  # keep <html>/<head>/<title> structure
    processing_instructions=True,  # drop processing instructions
    embedded=False,  # keep <embed>/<object> content
    frames=False,  # keep frame/iframe elements
    forms=False,  # keep form elements
    annoying_tags=False,  # keep <blink>, <marquee>, etc.
    remove_tags=None,  # no additional tags removed
    remove_unknown_tags=False,  # keep tags lxml does not recognize
    safe_attrs_only=False,  # keep all attributes
)
# ---- debug.py ----
import re
# Map of element -> small integer uid, used by describe_node() to
# disambiguate otherwise-anonymous tr/td/div/p elements in debug output.
uids = {}
# Root element of the document the uids map was built for; describe()
# resets the map when called on a node from a different document.
uids_document = None
def describe_node(node):
    """Return a compact CSS-selector-like description of *node*.

    The tag name is suffixed with "#id" and ".class" parts when present;
    the redundant "div" prefix is dropped when an id/class follows, and
    bare tr/td/div/p elements get a per-document uid suffix like "{03}".
    """
    global uids
    if node is None:
        return ""
    if not hasattr(node, "tag"):
        # Not an element (e.g. a comment or text node): show its type.
        return "[%s]" % type(node)
    descr = node.tag
    elem_id = node.get("id", "")
    if elem_id:
        descr += "#" + elem_id
    classes = node.get("class", "").strip()
    if classes:
        descr += "." + ".".join(classes.split())
    if descr[:4] in ("div#", "div."):
        # div is the default container; its name adds no information here.
        descr = descr[3:]
    if descr in ("tr", "td", "div", "p"):
        # Anonymous structural tags are disambiguated with a numeric uid.
        uid = uids.get(node)
        if uid is None:
            uid = uids[node] = len(uids) + 1
        descr += "{%02d}" % uid
    return descr
def describe(node, depth=1):
    """Describe *node* as "ancestor>...>node" selector text.

    Walks up to *depth* ancestors via getparent(), joining each level's
    describe_node() output with ">".
    """
    global uids, uids_document
    root = node.getroottree().getroot()
    # A different document: discard uids accumulated for the previous one.
    if root != uids_document:
        uids = {}
        uids_document = root
    prefix = ""
    parent = node.getparent()
    if depth and parent is not None:
        prefix = describe(parent, depth=depth - 1) + ">"
    return prefix + describe_node(node)
# Any run of whitespace (Unicode-aware), collapsed to a single space below.
RE_COLLAPSE_WHITESPACES = re.compile(r"\s+", re.U)


def text_content(elem, length=40):
    """Whitespace-collapsed text of *elem*, truncated to *length* chars.

    Text longer than *length* is cut and suffixed with "...".
    """
    raw = elem.text_content().replace("\r", "")
    collapsed = RE_COLLAPSE_WHITESPACES.sub(" ", raw)
    if len(collapsed) >= length:
        return collapsed[:length] + "..."
    return collapsed
# ---- encoding.py ----
import re
try:
    # cchardet is a faster drop-in replacement for chardet.  It must be
    # bound to the name "chardet": the rest of this module calls
    # chardet.detect(), so a bare "import cchardet" left that name
    # undefined (NameError) whenever only cchardet was installed.
    import cchardet as chardet
except ImportError:
    import chardet
import sys
# Encoding declared via <meta charset="..."> (HTML5 style); str pattern.
RE_CHARSET = re.compile(r'<meta.*?charset=["\']*(.+?)["\'>]', flags=re.I)
# Encoding declared via the http-equiv pragma: content="...;charset=...".
RE_PRAGMA = re.compile(r'<meta.*?content=["\']*;?charset=(.+?)["\'>]', flags=re.I)
# Encoding declared in an XML prolog: <?xml ... encoding="...">; only
# matches at the very start of the text (anchored with ^).
RE_XML = re.compile(r'^<\?xml.*?encoding=["\']*(.+?)["\'>]')
# Declared charset -> superset codec to decode with instead.
CHARSETS = {
    "big5": "big5hkscs",
    "gb2312": "gb18030",
    "ascii": "utf-8",
    "maccyrillic": "cp1251",
    "win1251": "cp1251",
    "win-1251": "cp1251",
    "windows-1251": "cp1251",
}


def fix_charset(encoding):
    """Map *encoding* (case-insensitive) onto a superset codec.

    Some declared charsets are subsets of larger ones (e.g. gb2312 of
    gb18030); decoding with the superset avoids spurious failures,
    notably on Chinese websites.  Unknown names pass through lowercased.
    """
    lowered = encoding.lower()
    return CHARSETS.get(lowered, lowered)
def get_encoding(page):
    """Detect the character encoding of an HTML *page*.

    *page* may be raw ``bytes`` (the normal case) or an already-decoded
    ``str``.  Declared encodings (meta charset, http-equiv pragma, XML
    prolog) are tried first, each validated by actually decoding the
    bytes; otherwise chardet's statistical detection is used, defaulting
    to UTF-8.

    :returns: a codec name usable with ``bytes.decode``.

    Fixes over the previous version: the regexes are str patterns, so
    bytes input must be decoded before matching (it raised TypeError);
    the py3 branch called ``str.decode`` (AttributeError); and an unknown
    declared codec raised an uncaught LookupError from ``page.decode``.
    """
    if isinstance(page, bytes):
        raw = page
        # latin-1 maps every byte to a code point, so this never fails
        # and leaves the ASCII markup we scan for intact.
        scan = page.decode("iso-8859-1")
    else:
        raw = None
        scan = page
    declared_encodings = (
        RE_CHARSET.findall(scan) + RE_PRAGMA.findall(scan) + RE_XML.findall(scan)
    )
    for declared_encoding in declared_encodings:
        encoding = fix_charset(declared_encoding)
        try:
            if raw is not None:
                # Validate the declaration against the actual bytes.
                raw.decode(encoding)
            return encoding
        except (UnicodeDecodeError, LookupError):
            # Wrong or unknown declared encoding; try the next candidate.
            pass
    if raw is None:
        # Already-decoded text with no usable declaration.
        return "utf-8"
    # Strip markup so chardet sees mostly natural-language bytes.
    text = re.sub(rb"(\s*</?[^>]*>)+\s*", b" ", raw).strip()
    if len(text) < 10:
        return "utf-8"  # too little data for reliable detection
    res = chardet.detect(text)
    return fix_charset(res["encoding"] or "utf-8")