From 5f637b5a5fe27098985975928632b9fea5ea3c62 Mon Sep 17 00:00:00 2001 From: Oleg Broytman Date: Mon, 3 Mar 2008 21:13:38 +0000 Subject: [PATCH] Log more parsers errors. git-svn-id: file:///home/phd/archive/SVN/bookmarks_db/trunk@191 fdd5c36f-1aea-0310-aeeb-c58d7e2b6c23 --- Robots/parse_html.py | 2 +- Robots/parse_html_beautifulsoup.py | 4 +++- Robots/parse_html_htmlparser.py | 4 ++-- 3 files changed, 6 insertions(+), 4 deletions(-) diff --git a/Robots/parse_html.py b/Robots/parse_html.py index 3e49491..80d4a92 100755 --- a/Robots/parse_html.py +++ b/Robots/parse_html.py @@ -67,7 +67,7 @@ def parse_html(filename, charset=None, log=None): parser = None for c in charsets: try: - parser = p(filename, c) + parser = p(filename, c, log) break except UnicodeEncodeError: pass diff --git a/Robots/parse_html_beautifulsoup.py b/Robots/parse_html_beautifulsoup.py index 11db563..25719ca 100644 --- a/Robots/parse_html_beautifulsoup.py +++ b/Robots/parse_html_beautifulsoup.py @@ -49,11 +49,12 @@ class BadDeclParser(BeautifulSoup): return j -def parse_html(filename, charset=None): +def parse_html(filename, charset=None, log=None): infile = open(filename, 'r') try: root = BadDeclParser(infile, fromEncoding=charset) except TypeError: + if log: log("TypeError") return None finally: infile.close() @@ -61,6 +62,7 @@ def parse_html(filename, charset=None): try: head = root.html.head except AttributeError: + if log: log("No HTML in root or no HEAD in HTML") return None if head is None: diff --git a/Robots/parse_html_htmlparser.py b/Robots/parse_html_htmlparser.py index e1a35f1..30911dd 100644 --- a/Robots/parse_html_htmlparser.py +++ b/Robots/parse_html_htmlparser.py @@ -1,7 +1,7 @@ """ HTML Parser - Written by BroytMann. Copyright (C) 1997-2007 PhiloSoft Design + Written by BroytMann. Copyright (C) 1997-2008 PhiloSoft Design """ from HTMLParser import HTMLParseError @@ -73,7 +73,7 @@ class HTMLParser(_HTMLParser): self.icon = href -def parse_html(filename, charset=None): +def parse_html(filename, charset=None, log=None): infile = open(filename, 'r') parser = HTMLParser(charset) -- 2.39.2