X-Git-Url: https://git.phdru.name/?a=blobdiff_plain;f=Robots%2Fparse_html_etreetidy.py;h=7149c2286332a55c3655bddda4632e26865279d7;hb=4fd6d1f55e5e5dbcc0bdc1b933d6e14076316437;hp=65d42aeca65dfa932c5c9019347ca7d01150ecb7;hpb=349d6e0241d43e42a257d6019972402116236ee7;p=bookmarks_db.git diff --git a/Robots/parse_html_etreetidy.py b/Robots/parse_html_etreetidy.py index 65d42ae..7149c22 100644 --- a/Robots/parse_html_etreetidy.py +++ b/Robots/parse_html_etreetidy.py @@ -20,19 +20,27 @@ def parse_html(filename, charset=None, log=None): if elem.tag.startswith(XHTML): elem.tag = elem.tag[len(XHTML):] + title = html_tree.findtext('head/title') + if title is None: + title = html_tree.findtext('title') + if title is None: + return None + meta = html_tree.findall('head/meta') for m in meta: if m.get('http-equiv', '').lower() == 'content-type': meta_content = m.get("content") if meta_content: - meta_charset = \ - meta_content.lower().split('charset=')[1].split(';')[0] - break + try: + meta_charset = \ + meta_content.lower().split('charset=')[1].split(';')[0] + break + except IndexError: + meta_charset = False else: meta_charset = False - title = html_tree.findtext('head/title') - if title and (charset or meta_charset): + if charset or meta_charset: title = title.encode(charset or meta_charset) for m in meta: