-"""
- Parser for Netscape Navigator's bookmarks.html
+"""Parser for Netscape Navigator's and Mozilla's bookmarks.html
- Written by BroytMann, Jun 1997 - Jul 2003. Copyright (C) 1997-2003 PhiloSoft Design
+This file is a part of Bookmarks database and Internet robot.
"""
+__version__ = "$Revision$"[11:-2]
+__revision__ = "$Id$"[5:-2]
+__date__ = "$Date$"[7:-2]
+__author__ = "Oleg Broytman <phd@phdru.name>"
+__copyright__ = "Copyright (C) 1997-2011 PhiloSoft Design"
+__license__ = "GNU GPL"
+
+__all__ = ['BkmkParser']
+
-import string
+import os
+from m_lib.defenc import default_encoding
from m_lib.net.www.html import HTMLParser
from bkmk_objects import Folder, Bookmark, Ruler
-if __debug__:
+DEBUG = os.environ.has_key("BKMK_DEBUG")
+
+if DEBUG:
def debug(note):
print note
for object in folder_stack:
if object.isFolder:
l.append(object.name)
- return "'" + string.join(l, "' '") + "'"
+ return "'%s'" % "' '".join(l)
else:
def debug(note):
self.urls = 0
self.objects = 0
- self.charset = ""
+ self.charset = None
self.recode = None
-
def handle_data(self, data):
if data:
- if self.charset:
- data = unicode(data, self.charset).encode()
- self.accumulator = "%s%s" % (self.accumulator, data)
-
+ if self.charset and default_encoding:
+ data = unicode(data, self.charset, "replace").encode(default_encoding, "xmlcharrefreplace")
+ self.accumulator += data
# Mozilla - get charset
def do_meta(self, attrs):
content = ""
for attrname, value in attrs:
- value = string.strip(value)
+ value = value.strip()
if attrname == 'http-equiv':
http_equiv = value.lower()
elif attrname == 'content':
except IndexError:
pass
-
def start_title(self, attrs):
- self.accumulator = "%s<TITLE>" % self.accumulator
+ if default_encoding:
+ self.accumulator += '<META HTTP-EQUIV="Content-Type" CONTENT="text/html; charset=%s">\n' % default_encoding
+ self.accumulator += "<TITLE>"
def end_title(self):
- self.accumulator = "%s</TITLE>" % self.accumulator
-
+ self.accumulator += "</TITLE>"
# Start root folder
def start_h1(self, attrs):
self.current_folder = root_folder
self.folder_stack = [root_folder]
- self.root_folder.header = self.accumulator
+ self.root_folder.header = self.accumulator.strip()
self.accumulator = ''
def end_h1(self):
debug("Root folder name: `%s'" % accumulator)
self.root_folder.name = accumulator
-
- # Start next folder
+ # Start a folder
def start_h3(self, attrs):
+ last_modified = None
for attrname, value in attrs:
- value = string.strip(value)
+ value = value.strip()
if attrname == 'add_date':
add_date = value
+ elif attrname == 'last_modified':
+ last_modified = value
debug("New folder...")
- folder = Folder(add_date)
+ folder = Folder(add_date, last_modified=last_modified)
self.current_object = folder
self.current_folder.append(folder)
self.folder_stack.append(folder) # push new folder
self.current_folder = folder
- self.objects = self.objects + 1
+ self.objects += 1
def end_h3(self):
accumulator = self.accumulator
debug("Folder name: `%s'" % accumulator)
self.current_folder.name = accumulator
-
- # Start bookmark
+ # Start a bookmark
def start_a(self, attrs):
+ add_date = None
last_visit = None
last_modified = None
+ keyword = ''
+ icon = None
+ charset = None
for attrname, value in attrs:
- value = string.strip(value)
- if attrname == 'href':
+ value = value.strip()
+ if attrname == "href":
href = value
- if attrname == 'add_date':
+ elif attrname == "add_date":
add_date = value
- if attrname == 'last_visit':
+ elif attrname == "last_visit":
last_visit = value
- if attrname == 'last_modified':
+ elif attrname == "last_modified":
last_modified = value
+ elif attrname == "shortcuturl":
+ keyword = value
+ elif attrname == "icon":
+ icon = value
+ elif attrname == "last_charset":
+ charset = value
debug("Bookmark points to: `%s'" % href)
- bookmark = Bookmark(href, add_date, last_visit, last_modified)
+ bookmark = Bookmark(href, add_date, last_visit, last_modified,
+ keyword=keyword, icon=icon,
+ charset=charset, parser_charset=self.charset or default_encoding)
self.current_object = bookmark
self.current_folder.append(bookmark)
- self.urls = self.urls + 1
- self.objects = self.objects + 1
+ self.urls += 1
+ self.objects += 1
def end_a(self):
accumulator = self.accumulator
bookmark = self.current_folder[-1]
bookmark.name = accumulator
-
def flush(self):
accumulator = self.accumulator
self.accumulator = ''
current_object = self.current_object
- current_object.comment = current_object.comment + accumulator
- debug("Comment: `%s'" % current_object.comment)
-
+ if current_object:
+ current_object.comment += accumulator.strip()
+ debug("Comment: `%s'" % current_object.comment)
def start_dl(self, attrs):
self.flush()
do_dt = start_dl
-
# End of folder
def end_dl(self):
self.flush()
debug("FOLDER STACK is EMPTY!!! (2)")
self.current_object = None
-
def close(self):
HTMLParser.close(self)
if self.folder_stack:
raise ValueError, "wrong folder stack: %s" % self.folder_stack
-
def do_dd(self, attrs):
pass
do_p = do_dd
-
# Start ruler
def do_hr(self, attrs):
self.flush()
debug("Ruler")
self.current_folder.append(Ruler())
self.current_object = None
- self.objects = self.objects + 1
-
+ self.objects += 1
# BR in comment
def do_br(self, attrs):
- self.accumulator = "%s<BR>" % self.accumulator
-
+ self.accumulator += "<BR>"
# Allow < in the text
def unknown_starttag(self, tag, attrs):
- self.accumulator = "%s<%s>" % (self.accumulator, tag)
-
+ self.accumulator += "<%s>" % tag
# Do not allow unknown end tags
def unknown_endtag(self, tag):