X-Git-Url: https://git.phdru.name/?a=blobdiff_plain;f=bkmk_parser.py;h=e3b3db02532567aef8060c4546f1f3a6af6039e2;hb=338c964afba3651bd8fe6318644c0fcabb66cc3b;hp=7ce99e8412447cbba334291783149f80cc2c3e46;hpb=2626a655eb1ad0631280674e9564dc6802b73e3b;p=bookmarks_db.git diff --git a/bkmk_parser.py b/bkmk_parser.py old mode 100755 new mode 100644 index 7ce99e8..e3b3db0 --- a/bkmk_parser.py +++ b/bkmk_parser.py @@ -1,9 +1,17 @@ -""" - Parser for Netscape Navigator's and Mozilla's bookmarks.html +"""Parser for Netscape Navigator's and Mozilla's bookmarks.html - Written by BroytMann. Copyright (C) 1997-2005 PhiloSoft Design +This file is a part of Bookmarks database and Internet robot. """ +__version__ = "$Revision$"[11:-2] +__revision__ = "$Id$"[5:-2] +__date__ = "$Date$"[7:-2] +__author__ = "Oleg Broytman " +__copyright__ = "Copyright (C) 1997-2011 PhiloSoft Design" +__license__ = "GNU GPL" + +__all__ = ['BkmkParser'] + import sys, os from m_lib.net.www.html import HTMLParser @@ -41,14 +49,12 @@ class BkmkParser(HTMLParser): self.charset = "" self.recode = None - def handle_data(self, data): if data: if DEFAULT_CHARSET: - data = unicode(data, self.charset, "replace").encode(DEFAULT_CHARSET, "replace") + data = unicode(data, self.charset, "replace").encode(DEFAULT_CHARSET, "xmlcharrefreplace") self.accumulator += data - # Mozilla - get charset def do_meta(self, attrs): http_equiv = "" @@ -78,7 +84,6 @@ class BkmkParser(HTMLParser): else: DEFAULT_CHARSET = locale.getpreferredencoding() - def start_title(self, attrs): if DEFAULT_CHARSET: self.accumulator += '\n' % DEFAULT_CHARSET @@ -87,7 +92,6 @@ class BkmkParser(HTMLParser): def end_title(self): self.accumulator += "" - # Start root folder def start_h1(self, attrs): root_folder = Folder() @@ -106,16 +110,18 @@ class BkmkParser(HTMLParser): debug("Root folder name: `%s'" % accumulator) self.root_folder.name = accumulator - # Start a folder def start_h3(self, attrs): + last_modified = None for attrname, value in attrs: value = value.strip() if attrname == 'add_date': add_date = value + elif attrname == 'last_modified': + last_modified = value debug("New folder...") - folder = Folder(add_date) + folder = Folder(add_date, last_modified=last_modified) self.current_object = folder self.current_folder.append(folder) self.folder_stack.append(folder) # push new folder @@ -129,12 +135,14 @@ class BkmkParser(HTMLParser): debug("Folder name: `%s'" % accumulator) self.current_folder.name = accumulator - # Start a bookmark def start_a(self, attrs): + add_date = None last_visit = None last_modified = None keyword = None + icon = None + charset = None for attrname, value in attrs: value = value.strip() @@ -148,9 +156,14 @@ class BkmkParser(HTMLParser): last_modified = value elif attrname == "shortcuturl": keyword = value + elif attrname == "icon": + icon = value + elif attrname == "last_charset": + charset = value debug("Bookmark points to: `%s'" % href) - bookmark = Bookmark(href, add_date, last_visit, last_modified, keyword or '') + bookmark = Bookmark(href, add_date, last_visit, last_modified, + keyword or '', '', icon, charset) self.current_object = bookmark self.current_folder.append(bookmark) self.urls += 1 @@ -164,7 +177,6 @@ class BkmkParser(HTMLParser): bookmark = self.current_folder[-1] bookmark.name = accumulator - def flush(self): accumulator = self.accumulator @@ -176,13 +188,11 @@ class BkmkParser(HTMLParser): current_object.comment += accumulator.strip() debug("Comment: `%s'" % current_object.comment) - def start_dl(self, attrs): self.flush() do_dt = start_dl - # End of folder def end_dl(self): self.flush() @@ -198,19 +208,16 @@ class BkmkParser(HTMLParser): debug("FOLDER STACK is EMPTY!!! (2)") self.current_object = None - def close(self): HTMLParser.close(self) if self.folder_stack: raise ValueError, "wrong folder stack: %s" % self.folder_stack - def do_dd(self, attrs): pass do_p = do_dd - # Start ruler def do_hr(self, attrs): self.flush() @@ -219,17 +226,14 @@ class BkmkParser(HTMLParser): self.current_object = None self.objects += 1 - # BR in comment def do_br(self, attrs): self.accumulator += "
" - # Allow < in the text def unknown_starttag(self, tag, attrs): self.accumulator += "<%s>" % tag - # Do not allow unknow end tags def unknown_endtag(self, tag): raise NotImplementedError("Unknow end tag `%s'" % tag)