# NOTE(review): this module was recovered from a whitespace-collapsed git
# diff that deletes parse_html/bkmk_ph_html5.py.  Block nesting (in
# particular the ``for ... else`` in the title lookup) was reconstructed
# from syntax -- confirm against the repository history.
"""HTML Parser using html5

This file is a part of Bookmarks database and Internet robot.

"""

__author__ = "Oleg Broytman "
__copyright__ = "Copyright (C) 2010-2014 PhiloSoft Design"
__license__ = "GNU GPL"

__all__ = ['parse_html']


from html5lib import HTMLParser as HTML5Parser
from .bkmk_ph_util import HTMLParser


def _attr(node, name, default=None):
    """Return attribute *name* of *node*, or *default* when it is absent.

    Only ``in`` and indexing are used on ``node.attributes`` because those
    are the only operations the original code relied on.
    """
    attrs = node.attributes
    if name in attrs:
        return attrs[name]
    return default


def _keyword(node, name):
    """Return attribute *name* lower-cased and whitespace-normalised.

    Returns '' when the attribute is missing, so callers can compare the
    result against lower-case keywords without a separate presence check.
    """
    return ' '.join(_attr(node, name, '').lower().split())


def _find_title(parent):
    """Return the text of the first non-empty <title> child of *parent*.

    Returns '' when no <title> child with content exists.
    """
    for node in parent.childNodes:
        if node.name == 'title' and node.childNodes:
            return node.childNodes[0].value
    return ''


def _find_meta_charset(head):
    """Return the lower-cased charset declared by a <meta> in *head*.

    Both ``<meta http-equiv="content-type" content="...charset=X">`` and
    ``<meta charset="X">`` are recognised; the first usable declaration
    wins.  Returns False when none is found.
    """
    for node in head.childNodes:
        if node.name != 'meta':
            continue
        if _keyword(node, 'http-equiv') == 'content-type':
            # A missing or empty ``content`` is skipped instead of raising.
            content = _attr(node, 'content', '')
            if content:
                try:
                    declared = content.lower().split('charset=')[1]
                except IndexError:
                    # content-type without a charset part: keep looking.
                    continue
                return declared.split(';')[0].strip()
        elif 'charset' in node.attributes:
            return node.attributes['charset'].lower()
    return False


def _find_refresh(head):
    """Return the ``content`` of the first <meta http-equiv="refresh">.

    Elements without a ``content`` attribute are ignored.  Returns None
    when there is no usable refresh directive.
    """
    for node in head.childNodes:
        if node.name == 'meta' and \
           _keyword(node, 'http-equiv') == 'refresh' and \
           ('content' in node.attributes):
            return node.attributes['content']
    return None


def _find_icon(head):
    """Return the ``href`` of the first icon <link> in *head*, or None.

    ``rel`` is matched case-insensitively against 'icon' and
    'shortcut icon'; links without ``href`` are ignored.
    """
    for node in head.childNodes:
        if node.name == 'link' and \
           _keyword(node, 'rel') in ('icon', 'shortcut icon') and \
           ('href' in node.attributes):
            return node.attributes['href']
    return None


def parse_html(html_text, charset=None, log=None):
    """Parse *html_text* with html5lib and extract bookmark metadata.

    :param html_text: the document to parse; passed to the html5lib parser
        unchanged.
    :param charset: encoding to hand to the parser.  When falsy, the
        encoding reported by the parser's input stream is used instead
        (only if the document has a <head>).
    :param log: unused here; kept so the signature stays compatible with
        callers.
    :returns: ``HTMLParser(charset, meta_charset, title, refresh, icon)``,
        or None when the tree has no <html> element or when title, refresh
        and icon are all None.

    NOTE(review): relies on ``parse(..., encoding=, parseMeta=)``,
    ``parser.tokenizer.stream.charEncoding`` and nodes exposing
    ``name``/``type``/``childNodes``/``attributes``/``value`` -- presumably
    a legacy html5lib release; confirm the pinned version.
    """
    parser = HTML5Parser()
    html_tree = parser.parse(
        html_text, encoding=charset, parseMeta=bool(charset))

    html = None
    if hasattr(html_tree, 'childNodes'):
        for node in html_tree.childNodes:
            # Skip DocType element
            if (node.name == 'html') and (node.type != 3):
                html = node
                break

    if not html:
        return None

    head = None
    for node in html.childNodes:
        if node.name == 'head':
            head = node
            break

    meta_charset = False
    refresh = None
    icon = None

    if head:
        title = _find_title(head)
        meta_charset = _find_meta_charset(head)

        if not charset:
            charset = parser.tokenizer.stream.charEncoding[0]

        # The explicit/detected charset takes precedence over the <meta>
        # declaration when encoding the title.
        if title and (charset or meta_charset):
            title = title.encode(charset or meta_charset)

        refresh = _find_refresh(head)
        icon = _find_icon(head)
    else:
        # No <head>: a <title> may still sit directly under <html>.
        title = _find_title(html)

    if (title is None) and (refresh is None) and (icon is None):
        return None
    return HTMLParser(charset, meta_charset, title, refresh, icon)