From 8cd64d1c6d8e704802e89eb209884c9675914c07 Mon Sep 17 00:00:00 2001 From: Oleg Broytman Date: Mon, 20 Nov 2023 03:50:26 +0300 Subject: [PATCH] Feat: Remove some HTML parsers EtreeTidy is outdated and buggy. html5 is outdated. --- parse_html/bkmk_parse_html.py | 29 ++------- parse_html/bkmk_ph_etreetidy.py | 72 --------------------- parse_html/bkmk_ph_html5.py | 109 -------------------------------- 3 files changed, 7 insertions(+), 203 deletions(-) delete mode 100644 parse_html/bkmk_ph_etreetidy.py delete mode 100644 parse_html/bkmk_ph_html5.py diff --git a/parse_html/bkmk_parse_html.py b/parse_html/bkmk_parse_html.py index 7764303..4d5f9e6 100644 --- a/parse_html/bkmk_parse_html.py +++ b/parse_html/bkmk_parse_html.py @@ -24,14 +24,6 @@ from compat import unicode, unichr DEFAULT_CHARSET = "cp1251" # Stupid default for Russian Cyrillic parsers = [] -try: - from . import bkmk_ph_beautifulsoup4 -except ImportError: - pass -else: - bkmk_ph_beautifulsoup4.DEFAULT_CHARSET = DEFAULT_CHARSET - parsers.append(bkmk_ph_beautifulsoup4.parse_html) - try: from . import bkmk_ph_beautifulsoup except ImportError: @@ -41,33 +33,26 @@ else: parsers.append(bkmk_ph_beautifulsoup.parse_html) try: - from . import bkmk_ph_html5 + from . import bkmk_ph_beautifulsoup4 except ImportError: pass else: - parsers.append(bkmk_ph_html5.parse_html) + bkmk_ph_beautifulsoup4.DEFAULT_CHARSET = DEFAULT_CHARSET + parsers.append(bkmk_ph_beautifulsoup4.parse_html) try: - from . import bkmk_ph_lxml + from . import bkmk_ph_htmlparser except ImportError: pass else: - parsers.append(bkmk_ph_lxml.parse_html) + parsers.append(bkmk_ph_htmlparser.parse_html) try: - from . import bkmk_ph_htmlparser + from . import bkmk_ph_lxml except ImportError: pass else: - parsers.append(bkmk_ph_htmlparser.parse_html) - -# ElementTidy often segfaults -# try: -# from . import bkmk_ph_etreetidy -# except ImportError: -# pass -# else: -# parsers.append(bkmk_ph_etreetidy.parse_html) + parsers.append(bkmk_ph_lxml.parse_html) universal_charset = "utf-8" entity_re = re.compile("(&\\w+;)") diff --git a/parse_html/bkmk_ph_etreetidy.py b/parse_html/bkmk_ph_etreetidy.py deleted file mode 100644 index eadcca3..0000000 --- a/parse_html/bkmk_ph_etreetidy.py +++ /dev/null @@ -1,72 +0,0 @@ -"""HTML Parser using ElementTree+TidyLib. - -This file is a part of Bookmarks database and Internet robot. - -""" - -__author__ = "Oleg Broytman " -__copyright__ = "Copyright (C) 2010-2023 PhiloSoft Design" -__license__ = "GNU GPL" - -__all__ = ['parse_html'] - - -from elementtidy import TidyHTMLTreeBuilder -from .bkmk_ph_util import HTMLParser - - -def parse_html(html_text, charset=None, log=None): - if not html_text: - return None - try: - html_tree = TidyHTMLTreeBuilder.parseString(html_text) - except: - return None - - XHTML = "{http://www.w3.org/1999/xhtml}" - - for elem in html_tree.getiterator(): - if elem.tag.startswith(XHTML): - elem.tag = elem.tag[len(XHTML):] - - title = html_tree.findtext('head/title') - if title is None: - title = html_tree.findtext('title') - - meta = html_tree.findall('head/meta') - for m in meta: - if m.get('http-equiv', '').lower() == 'content-type': - meta_content = m.get("content") - if meta_content: - try: - meta_charset = \ - meta_content.lower().split('charset=')[1].split(';')[0] - break - except IndexError: - meta_charset = False - elif m.get('charset', ''): - meta_charset = m.get('charset').lower() - break - else: - meta_charset = False - - #if title and (charset or meta_charset): - # title = title.encode(charset or meta_charset) - - for m in meta: - if m.get('http-equiv', '').lower() == 'refresh': - refresh = m.get("content") - break - else: - refresh = None - - for link in html_tree.findall('head/link'): - if link.get('rel', '').lower() in ('icon', 'shortcut icon'): - icon = link.get("href") - break - else: - icon = None - - if (title is None) and (refresh is None) and (icon is None): - return None - return HTMLParser(charset, meta_charset, title, refresh, icon) diff --git a/parse_html/bkmk_ph_html5.py b/parse_html/bkmk_ph_html5.py deleted file mode 100644 index d973b72..0000000 --- a/parse_html/bkmk_ph_html5.py +++ /dev/null @@ -1,109 +0,0 @@ -"""HTML Parser using html5 - -This file is a part of Bookmarks database and Internet robot. - -""" - -__author__ = "Oleg Broytman " -__copyright__ = "Copyright (C) 2010-2023 PhiloSoft Design" -__license__ = "GNU GPL" - -__all__ = ['parse_html'] - - -from html5lib import HTMLParser as HTML5Parser -from .bkmk_ph_util import HTMLParser - - -def parse_html(html_text, charset=None, log=None): - if not html_text: - return None - parser = HTML5Parser() - if isinstance(html_text, bytes): - html_tree = parser.parse( - html_text, encoding=charset, parseMeta=bool(charset)) - else: - html_tree = parser.parse(html_text) - - html = None - if hasattr(html_tree, 'childNodes'): - for node in html_tree.childNodes: - # Skip DocType element - if (node.name == 'html') and (node.type != 3): - html = node - break - - if not html: - return None - - for node in html.childNodes: - if node.name == 'head': - head = node - break - else: - head = None - - meta_charset = False - title = None - refresh = None - icon = None - - if head: - for node in head.childNodes: - if node.name == 'title': - if node.childNodes: - title = node.childNodes[0].value - break - else: - title = '' - - for node in head.childNodes: - if (node.name == 'meta') and \ - ('http-equiv' in node.attributes) and \ - (node.attributes['http-equiv'] == 'content-type'): - meta_content = node.attributes['content'] - if meta_content: - try: - meta_charset = \ - meta_content.lower().split('charset=')[1].\ - split(';')[0] - except IndexError: - meta_charset = False - else: - break - elif (node.name == 'meta') and ('charset' in node.attributes): - meta_charset = node.attributes['charset'].lower() - break - - if not charset: - charset = parser.tokenizer.stream.charEncoding[0] - - #if title and (charset or meta_charset): - # title = title.encode(charset or meta_charset) - - for node in head.childNodes: - if node.name == 'meta' and \ - ('http-equiv' in node.attributes) and \ - (node.attributes['http-equiv'] == 'refresh'): - refresh = node.attributes['content'] - break - - for node in head.childNodes: - if node.name == 'link' and \ - ('rel' in node.attributes) and \ - (node.attributes['rel'] in ('icon', 'shortcut icon')): - icon = node.attributes['href'] - break - - else: - for node in html.childNodes: - if node.name == 'title': - if node.childNodes: - title = node.childNodes[0].value - break - else: - title = '' - - if (title is None) and (refresh is None) and (icon is None): - return None - return HTMLParser(charset, meta_charset, title, refresh, icon) -- 2.39.2