From 00549b1c0622ee6ed0ac12249097cf4562bc486e Mon Sep 17 00:00:00 2001 From: Oleg Broytman Date: Wed, 30 Apr 2014 22:05:09 +0400 Subject: [PATCH] Change parse_html to parse strings, not files --- Robots/bkmk_rsimple.py | 6 +++--- doc/ANNOUNCE | 2 ++ doc/TODO | 2 -- parse_html/__init__.py | 9 +++++---- parse_html/bkmk_parse_html.py | 17 +++++++++++++---- parse_html/bkmk_ph_beautifulsoup.py | 16 +++++++--------- parse_html/bkmk_ph_etreetidy.py | 7 ++++--- parse_html/bkmk_ph_html5.py | 10 ++++------ parse_html/bkmk_ph_htmlparser.py | 17 +++++++---------- parse_html/bkmk_ph_lxml.py | 9 +++++---- 10 files changed, 50 insertions(+), 45 deletions(-) diff --git a/Robots/bkmk_rsimple.py b/Robots/bkmk_rsimple.py index 9ab40a3..2c4df9e 100644 --- a/Robots/bkmk_rsimple.py +++ b/Robots/bkmk_rsimple.py @@ -5,7 +5,7 @@ This file is a part of Bookmarks database and Internet robot. """ __author__ = "Oleg Broytman " -__copyright__ = "Copyright (C) 2000-2012 PhiloSoft Design" +__copyright__ = "Copyright (C) 2000-2014 PhiloSoft Design" __license__ = "GNU GPL" __all__ = ['robot_simple', 'get_error'] @@ -20,7 +20,7 @@ from m_lib.net.www.util import parse_time from m_lib.md5wrapper import md5wrapper from bkmk_objects import Robot -from parse_html import parse_html +from parse_html import parse_filename class RedirectException(Exception): @@ -182,7 +182,7 @@ class robot_simple(Robot): else: html = False if html: - parser = parse_html(fname, charset, self.log) + parser = parse_filename(fname, charset, self.log) if parser: bookmark.real_title = parser.title icon = parser.icon diff --git a/doc/ANNOUNCE b/doc/ANNOUNCE index 4e7cd77..55e7351 100644 --- a/doc/ANNOUNCE +++ b/doc/ANNOUNCE @@ -8,6 +8,8 @@ bookmarks.html. WHAT'S NEW in version 4.5.7 + Change parse_html to parse strings, not files. + Add ChangeLog. diff --git a/doc/TODO b/doc/TODO index 6ec0d5a..64ef4ca 100644 --- a/doc/TODO +++ b/doc/TODO @@ -1,5 +1,3 @@ -Change parse_html to parse strings, not files. 
- Split simple robot: separate network operations and URL handling. Allow parameters in BKMK_ROBOT; for example, 'forking:urllib'. diff --git a/parse_html/__init__.py b/parse_html/__init__.py index 1e9393c..d9dfffc 100644 --- a/parse_html/__init__.py +++ b/parse_html/__init__.py @@ -1,16 +1,17 @@ """HTML Parsers This file is a part of Bookmarks database and Internet robot. + """ __author__ = "Oleg Broytman " -__copyright__ = "Copyright (C) 1997-2012 PhiloSoft Design" +__copyright__ = "Copyright (C) 1997-2014 PhiloSoft Design" __license__ = "GNU GPL" -__all__ = ['parse_html', 'main'] +__all__ = ['parse_html', 'parse_filename', 'main'] -from .bkmk_parse_html import parse_html +from .bkmk_parse_html import parse_html, parse_filename def main(): @@ -27,6 +28,6 @@ def main(): else: sys.exit("Usage: main filename [charset]") - parser = parse_html(filename, charset, log=lambda s: sys.stdout.write(s + '\n')) + parser = parse_filename(filename, charset, log=lambda s: sys.stdout.write(s + '\n')) print " refresh:", parser.refresh print " icon :", parser.icon diff --git a/parse_html/bkmk_parse_html.py b/parse_html/bkmk_parse_html.py index 862fa2b..e951cdc 100644 --- a/parse_html/bkmk_parse_html.py +++ b/parse_html/bkmk_parse_html.py @@ -1,13 +1,14 @@ """HTML Parsers This file is a part of Bookmarks database and Internet robot. 
+ """ __author__ = "Oleg Broytman " -__copyright__ = "Copyright (C) 1997-2012 PhiloSoft Design" +__copyright__ = "Copyright (C) 1997-2014 PhiloSoft Design" __license__ = "GNU GPL" -__all__ = ['parse_html', 'universal_charset'] +__all__ = ['parse_html', 'parse_filename', 'universal_charset'] import codecs @@ -86,7 +87,7 @@ def recode_entities(title, charset): import os BKMK_DEBUG_HTML_PARSERS = os.environ.get("BKMK_DEBUG_HTML_PARSERS") -def parse_html(filename, charset=None, log=None): +def parse_html(html_text, charset=None, log=None): if not parsers: return None @@ -109,7 +110,7 @@ def parse_html(filename, charset=None, log=None): parser = None for c in charsets: try: - parser = p(filename, c, log) + parser = p(html_text, c, log) except UnicodeError: pass else: @@ -183,3 +184,11 @@ def parse_html(filename, charset=None, log=None): if parser.charset: parser.icon = icon.encode(parser.charset) return parser + +def parse_filename(filename, charset=None, log=None): + fp = open(filename, 'r') + try: + parser = parse_html(fp.read(), charset=charset, log=log) + finally: + fp.close() + return parser diff --git a/parse_html/bkmk_ph_beautifulsoup.py b/parse_html/bkmk_ph_beautifulsoup.py index 225cb27..a2f5715 100644 --- a/parse_html/bkmk_ph_beautifulsoup.py +++ b/parse_html/bkmk_ph_beautifulsoup.py @@ -1,10 +1,11 @@ """HTML Parser using BeautifulSoup This file is a part of Bookmarks database and Internet robot. 
+ """ __author__ = "Oleg Broytman " -__copyright__ = "Copyright (C) 2007-2013 PhiloSoft Design" +__copyright__ = "Copyright (C) 2007-2014 PhiloSoft Design" __license__ = "GNU GPL" __all__ = ['parse_html'] @@ -46,24 +47,21 @@ class BadDeclParser(BeautifulSoup): return j -def _parse_html(filename, charset): - infile = open(filename, 'r') +def _parse_html(html_text, charset): try: - return BadDeclParser(infile, fromEncoding=charset) + return BadDeclParser(html_text, fromEncoding=charset) except TypeError: return None - finally: - infile.close() -def parse_html(filename, charset=None, log=None): - root = _parse_html(filename, charset) +def parse_html(html_text, charset=None, log=None): + root = _parse_html(html_text, charset) if root is None: return None _charset = root.originalEncoding if _charset in ("ISO-8859-2", "windows-1252", "MacCyrillic"): # Replace default _charset = DEFAULT_CHARSET - root = _parse_html(filename, _charset) + root = _parse_html(html_text, _charset) if root is None: return None diff --git a/parse_html/bkmk_ph_etreetidy.py b/parse_html/bkmk_ph_etreetidy.py index b85ae2a..fc596b1 100644 --- a/parse_html/bkmk_ph_etreetidy.py +++ b/parse_html/bkmk_ph_etreetidy.py @@ -1,10 +1,11 @@ """HTML Parser using ElementTree+TidyLib. This file is a part of Bookmarks database and Internet robot. 
+ """ __author__ = "Oleg Broytman " -__copyright__ = "Copyright (C) 2010-2013 PhiloSoft Design" +__copyright__ = "Copyright (C) 2010-2014 PhiloSoft Design" __license__ = "GNU GPL" __all__ = ['parse_html'] @@ -14,9 +15,9 @@ from elementtidy import TidyHTMLTreeBuilder from .bkmk_ph_util import HTMLParser -def parse_html(filename, charset=None, log=None): +def parse_html(html_text, charset=None, log=None): try: - html_tree = TidyHTMLTreeBuilder.parse(filename) + html_tree = TidyHTMLTreeBuilder.parseString(html_text) except: return None diff --git a/parse_html/bkmk_ph_html5.py b/parse_html/bkmk_ph_html5.py index 53109be..6400d02 100644 --- a/parse_html/bkmk_ph_html5.py +++ b/parse_html/bkmk_ph_html5.py @@ -1,10 +1,11 @@ """HTML Parser using html5 This file is a part of Bookmarks database and Internet robot. + """ __author__ = "Oleg Broytman " -__copyright__ = "Copyright (C) 2010-2013 PhiloSoft Design" +__copyright__ = "Copyright (C) 2010-2014 PhiloSoft Design" __license__ = "GNU GPL" __all__ = ['parse_html'] @@ -14,12 +15,9 @@ from html5lib import HTMLParser as HTML5Parser from .bkmk_ph_util import HTMLParser -def parse_html(filename, charset=None, log=None): +def parse_html(html_text, charset=None, log=None): parser = HTML5Parser() - fp = open(filename) - parser._parse(fp, encoding=charset, parseMeta=bool(charset)) - fp.close() - html_tree = parser.tree.getDocument() + html_tree = parser.parse(html_text, encoding=charset, parseMeta=bool(charset)) for node in html_tree.childNodes: if (node.name == 'html') and (node.type != 3): # Skip DocType element diff --git a/parse_html/bkmk_ph_htmlparser.py b/parse_html/bkmk_ph_htmlparser.py index d7020b0..0798467 100644 --- a/parse_html/bkmk_ph_htmlparser.py +++ b/parse_html/bkmk_ph_htmlparser.py @@ -1,10 +1,11 @@ """HTML Parser using Pythons' HTMLParser This file is a part of Bookmarks database and Internet robot. 
+ """ __author__ = "Oleg Broytman " -__copyright__ = "Copyright (C) 1997-2013 PhiloSoft Design" +__copyright__ = "Copyright (C) 1997-2014 PhiloSoft Design" __license__ = "GNU GPL" __all__ = ['parse_html'] @@ -79,17 +80,13 @@ class HTMLParser(_HTMLParser): self.icon = href -def parse_html(filename, charset=None, log=None): - infile = open(filename, 'r') +def parse_html(html_text, charset=None, log=None): parser = HTMLParser(charset) - for line in infile: - try: - parser.feed(line) - except (HTMLParseError, HTMLHeadDone): - break - - infile.close() + try: + parser.feed(html_text) + except (HTMLParseError, HTMLHeadDone): + pass try: parser.close() diff --git a/parse_html/bkmk_ph_lxml.py b/parse_html/bkmk_ph_lxml.py index 222f116..9cd29e4 100644 --- a/parse_html/bkmk_ph_lxml.py +++ b/parse_html/bkmk_ph_lxml.py @@ -1,21 +1,22 @@ """HTML Parser using lxml.html This file is a part of Bookmarks database and Internet robot. + """ __author__ = "Oleg Broytman " -__copyright__ = "Copyright (C) 2010-2013 PhiloSoft Design" +__copyright__ = "Copyright (C) 2010-2014 PhiloSoft Design" __license__ = "GNU GPL" __all__ = ['parse_html'] -from lxml.html import parse +from lxml.html import fromstring from .bkmk_ph_util import HTMLParser -def parse_html(filename, charset=None, log=None): - html_tree = parse(filename) +def parse_html(html_text, charset=None, log=None): + html_tree = fromstring(html_text).getroottree() if html_tree.getroot() is None: return None -- 2.39.2