Minor refactoring

[bookmarks_db.git] / bkmk_parser.py
diff --git a/bkmk_parser.py b/bkmk_parser.py

old mode 100755 (executable)

new mode 100644 (file)

index f396e2e..02116cc
--- a/bkmk_parser.py
+++ b/bkmk_parser.py
@@ -1,16 +1,24 @@
-"""
-   Parser for Netscape Navigator's bookmarks.html
+"""Parser for Netscape Navigator's and Mozilla's bookmarks.html
  
-   Written by BroytMann, Jun 1997 - Jun 2002. Copyright (C) 1997-2002 PhiloSoft Design
+This file is a part of Bookmarks database and Internet robot.
  """
  
+__author__ = "Oleg Broytman <phd@phdru.name>"
+__copyright__ = "Copyright (C) 1997-2012 PhiloSoft Design"
+__license__ = "GNU GPL"
+
+__all__ = ['BkmkParser']
+
  
-import string
-from m_lib.www.html import HTMLParser
+import os
+from m_lib.defenc import default_encoding
+from m_lib.net.www.html import HTMLParser
  from bkmk_objects import Folder, Bookmark, Ruler
  
  
-if __debug__:
+DEBUG = os.environ.has_key("BKMK_DEBUG")
+
+if DEBUG:
     def debug(note):
        print note
  
@@ -19,7 +27,7 @@ if __debug__:
        for object in folder_stack:
           if object.isFolder:
              l.append(object.name)
-      return "'" + string.join(l, "' '") + "'"
+      return "'%s'" % "' '".join(l)
  
  else:
     def debug(note):
@@ -34,16 +42,14 @@ class BkmkParser(HTMLParser):
        self.urls = 0
        self.objects = 0
  
-      self.charset = ""
+      self.charset = None
        self.recode = None
  
-
     def handle_data(self, data):
        if data:
-         if self.charset:
-            data = unicode(data, self.charset).encode()
-         self.accumulator = "%s%s" % (self.accumulator, data)
-
+         if self.charset and default_encoding:
+            data = unicode(data, self.charset, "replace").encode(default_encoding, "xmlcharrefreplace")
+         self.accumulator += data
  
     # Mozilla - get charset
     def do_meta(self, attrs):
@@ -51,7 +57,7 @@ class BkmkParser(HTMLParser):
        content = ""
  
        for attrname, value in attrs:
-         value = string.strip(value)
+         value = value.strip()
           if attrname == 'http-equiv':
              http_equiv = value.lower()
           elif attrname == 'content':
@@ -64,13 +70,13 @@ class BkmkParser(HTMLParser):
           except IndexError:
              pass
  
-
     def start_title(self, attrs):
-      self.accumulator = "%s<TITLE>" % self.accumulator
+      if default_encoding:
+         self.accumulator += '<META HTTP-EQUIV="Content-Type" CONTENT="text/html; charset=%s">\n' % default_encoding
+      self.accumulator += "<TITLE>"
  
     def end_title(self):
-      self.accumulator = "%s</TITLE>" % self.accumulator
-
+      self.accumulator += "</TITLE>"
  
     # Start root folder
     def start_h1(self, attrs):
@@ -80,7 +86,7 @@ class BkmkParser(HTMLParser):
        self.current_folder = root_folder
        self.folder_stack = [root_folder]
  
-      self.root_folder.header = self.accumulator
+      self.root_folder.header = self.accumulator.strip()
        self.accumulator = ''
  
     def end_h1(self):
@@ -90,21 +96,23 @@ class BkmkParser(HTMLParser):
        debug("Root folder name: `%s'" % accumulator)
        self.root_folder.name = accumulator
  
-
-   # Start next folder
+   # Start a folder
     def start_h3(self, attrs):
+      last_modified = None
        for attrname, value in attrs:
-         value = string.strip(value)
+         value = value.strip()
           if attrname == 'add_date':
              add_date = value
+         elif attrname == 'last_modified':
+            last_modified = value
  
        debug("New folder...")
-      folder = Folder(add_date)
+      folder = Folder(add_date, last_modified=last_modified)
        self.current_object = folder
        self.current_folder.append(folder)
        self.folder_stack.append(folder) # push new folder
        self.current_folder = folder
-      self.objects = self.objects + 1
+      self.objects += 1
  
     def end_h3(self):
        accumulator = self.accumulator
@@ -113,26 +121,40 @@ class BkmkParser(HTMLParser):
        debug("Folder name: `%s'" % accumulator)
        self.current_folder.name = accumulator
  
-
-   # Start bookmark
+   # Start a bookmark
     def start_a(self, attrs):
+      add_date = None
+      last_visit = None
+      last_modified = None
+      keyword = ''
+      icon = None
+      charset = None
+
        for attrname, value in attrs:
-         value = string.strip(value)
-         if attrname == 'href':
+         value = value.strip()
+         if attrname == "href":
              href = value
-         if attrname == 'add_date':
+         elif attrname == "add_date":
              add_date = value
-         if attrname == 'last_visit':
+         elif attrname == "last_visit":
              last_visit = value
-         if attrname == 'last_modified':
+         elif attrname == "last_modified":
              last_modified = value
+         elif attrname == "shortcuturl":
+            keyword = value
+         elif attrname == "icon":
+            icon = value
+         elif attrname == "last_charset":
+            charset = value
  
        debug("Bookmark points to: `%s'" % href)
-      bookmark = Bookmark(href, add_date, last_visit, last_modified)
+      bookmark = Bookmark(href, add_date, last_visit, last_modified,
+         keyword=keyword, icon=icon,
+         charset=charset, parser_charset=self.charset or default_encoding)
        self.current_object = bookmark
        self.current_folder.append(bookmark)
-      self.urls = self.urls + 1
-      self.objects = self.objects + 1
+      self.urls += 1
+      self.objects += 1
  
     def end_a(self):
        accumulator = self.accumulator
@@ -142,7 +164,6 @@ class BkmkParser(HTMLParser):
        bookmark = self.current_folder[-1]
        bookmark.name = accumulator
  
-
     def flush(self):
        accumulator = self.accumulator
  
@@ -150,16 +171,15 @@ class BkmkParser(HTMLParser):
           self.accumulator = ''
  
           current_object = self.current_object
-         current_object.comment = current_object.comment + accumulator
-         debug("Comment: `%s'" % current_object.comment)
-
+         if current_object:
+            current_object.comment += accumulator.strip()
+            debug("Comment: `%s'" % current_object.comment)
  
     def start_dl(self, attrs):
        self.flush()
  
     do_dt = start_dl
  
-
     # End of folder
     def end_dl(self):
        self.flush()
@@ -175,37 +195,31 @@ class BkmkParser(HTMLParser):
           debug("FOLDER STACK is EMPTY!!! (2)")
        self.current_object = None
  
-
     def close(self):
        HTMLParser.close(self)
        if self.folder_stack:
           raise ValueError, "wrong folder stack: %s" % self.folder_stack
  
-
     def do_dd(self, attrs):
        pass
  
     do_p = do_dd
  
-
     # Start ruler
     def do_hr(self, attrs):
        self.flush()
        debug("Ruler")
        self.current_folder.append(Ruler())
        self.current_object = None
-      self.objects = self.objects + 1
-
+      self.objects += 1
  
     # BR in comment
     def do_br(self, attrs):
-      self.accumulator = "%s<BR>" % self.accumulator
-
+      self.accumulator += "<BR>"
  
     # Allow < in the text
     def unknown_starttag(self, tag, attrs):
-      self.accumulator = "%s<%s>" % (self.accumulator, tag)
-
+      self.accumulator += "<%s>" % tag
  
     # Do not allow unknown end tags
     def unknown_endtag(self, tag):