X-Git-Url: https://git.phdru.name/?p=bookmarks_db.git;a=blobdiff_plain;f=bkmk_parser.py;h=02116ccf025c9edacd119362331b64174ce89767;hp=b2a279ab4547d076e402974db5a8ccc9d3d24442;hb=96e39d55c791326368ff14d538850e14ba5a2c97;hpb=0f9fc870c001deaa001c726e8001fb239d07db02 diff --git a/bkmk_parser.py b/bkmk_parser.py index b2a279a..02116cc 100644 --- a/bkmk_parser.py +++ b/bkmk_parser.py @@ -1,11 +1,17 @@ -""" - Parser for Netscape Navigator's and Mozilla's bookmarks.html +"""Parser for Netscape Navigator's and Mozilla's bookmarks.html - Written by BroytMann. Copyright (C) 1997-2008 PhiloSoft Design +This file is a part of Bookmarks database and Internet robot. """ +__author__ = "Oleg Broytman " +__copyright__ = "Copyright (C) 1997-2012 PhiloSoft Design" +__license__ = "GNU GPL" + +__all__ = ['BkmkParser'] + -import sys, os +import os +from m_lib.defenc import default_encoding from m_lib.net.www.html import HTMLParser from bkmk_objects import Folder, Bookmark, Ruler @@ -29,8 +35,6 @@ else: dump_names = debug -DEFAULT_CHARSET = None - class BkmkParser(HTMLParser): def __init__(self): HTMLParser.__init__(self) @@ -38,17 +42,15 @@ class BkmkParser(HTMLParser): self.urls = 0 self.objects = 0 - self.charset = "" + self.charset = None self.recode = None - def handle_data(self, data): if data: - if DEFAULT_CHARSET: - data = unicode(data, self.charset, "replace").encode(DEFAULT_CHARSET, "xmlcharrefreplace") + if self.charset and default_encoding: + data = unicode(data, self.charset, "replace").encode(default_encoding, "xmlcharrefreplace") self.accumulator += data - # Mozilla - get charset def do_meta(self, attrs): http_equiv = "" @@ -67,27 +69,15 @@ class BkmkParser(HTMLParser): self.charset = content.split('=')[1] except IndexError: pass - else: - global DEFAULT_CHARSET - DEFAULT_CHARSET = sys.getdefaultencoding() - if DEFAULT_CHARSET == "ascii": - try: - import locale - except ImportError: - pass - else: - DEFAULT_CHARSET = locale.getpreferredencoding() - def start_title(self, attrs): - if DEFAULT_CHARSET: - self.accumulator += '\n' % DEFAULT_CHARSET + if default_encoding: + self.accumulator += '\n' % default_encoding self.accumulator += "" def end_title(self): self.accumulator += "" - # Start root folder def start_h1(self, attrs): root_folder = Folder() @@ -106,7 +96,6 @@ class BkmkParser(HTMLParser): debug("Root folder name: `%s'" % accumulator) self.root_folder.name = accumulator - # Start a folder def start_h3(self, attrs): last_modified = None @@ -132,13 +121,12 @@ class BkmkParser(HTMLParser): debug("Folder name: `%s'" % accumulator) self.current_folder.name = accumulator - # Start a bookmark def start_a(self, attrs): add_date = None last_visit = None last_modified = None - keyword = None + keyword = '' icon = None charset = None @@ -161,7 +149,8 @@ class BkmkParser(HTMLParser): debug("Bookmark points to: `%s'" % href) bookmark = Bookmark(href, add_date, last_visit, last_modified, - keyword or '', '', icon, charset) + keyword=keyword, icon=icon, + charset=charset, parser_charset=self.charset or default_encoding) self.current_object = bookmark self.current_folder.append(bookmark) self.urls += 1 @@ -175,7 +164,6 @@ class BkmkParser(HTMLParser): bookmark = self.current_folder[-1] bookmark.name = accumulator - def flush(self): accumulator = self.accumulator @@ -187,13 +175,11 @@ class BkmkParser(HTMLParser): current_object.comment += accumulator.strip() debug("Comment: `%s'" % current_object.comment) - def start_dl(self, attrs): self.flush() do_dt = start_dl - # End of folder def end_dl(self): self.flush() @@ -209,19 +195,16 @@ class BkmkParser(HTMLParser): debug("FOLDER STACK is EMPTY!!! (2)") self.current_object = None - def close(self): HTMLParser.close(self) if self.folder_stack: raise ValueError, "wrong folder stack: %s" % self.folder_stack - def do_dd(self, attrs): pass do_p = do_dd - # Start ruler def do_hr(self, attrs): self.flush() @@ -230,17 +213,14 @@ class BkmkParser(HTMLParser): self.current_object = None self.objects += 1 - # BR in comment def do_br(self, attrs): self.accumulator += "
" - # Allow < in the text def unknown_starttag(self, tag, attrs): self.accumulator += "<%s>" % tag - # Do not allow unknow end tags def unknown_endtag(self, tag): raise NotImplementedError("Unknow end tag `%s'" % tag)