X-Git-Url: https://git.phdru.name/?a=blobdiff_plain;ds=sidebyside;f=bkmk_parser.py;h=7ce99e8412447cbba334291783149f80cc2c3e46;hb=fe82d8c9b7432c2e1f40a8d74113b71979b2c8b3;hp=c156cb76cc1b8fab12dcd48d012817a62ac73cbe;hpb=387f77d110986aa12967c9cd788ab0e4f41f2be2;p=bookmarks_db.git

diff --git a/bkmk_parser.py b/bkmk_parser.py
index c156cb7..7ce99e8 100755
--- a/bkmk_parser.py
+++ b/bkmk_parser.py
@@ -1,16 +1,18 @@
 """
-   Parser for Netscape Navigator's bookmarks.html
+   Parser for Netscape Navigator's and Mozilla's bookmarks.html
 
-   Written by BroytMann, Jun 1997 - Jul 2003. Copyright (C) 1997-2003 PhiloSoft Design
+   Written by BroytMann. Copyright (C) 1997-2005 PhiloSoft Design
 """
 
 
-import string
+import sys, os
 from m_lib.net.www.html import HTMLParser
 from bkmk_objects import Folder, Bookmark, Ruler
 
 
-if __debug__:
+DEBUG = os.environ.has_key("BKMK_DEBUG")
+
+if DEBUG:
    def debug(note):
       print note
 
@@ -19,7 +21,7 @@ if __debug__:
       for object in folder_stack:
          if object.isFolder:
             l.append(object.name)
-      return "'" + string.join(l, "' '") + "'"
+      return "'%s'" % "' '".join(l)
 
 else:
    def debug(note):
@@ -27,6 +29,8 @@ else:
    dump_names = debug
 
 
+DEFAULT_CHARSET = None
+
 class BkmkParser(HTMLParser):
    def __init__(self):
       HTMLParser.__init__(self)
@@ -40,9 +44,9 @@ class BkmkParser(HTMLParser):
 
    def handle_data(self, data):
       if data:
-         if self.charset:
-            data = unicode(data, self.charset).encode()
-         self.accumulator = "%s%s" % (self.accumulator, data)
+         if DEFAULT_CHARSET:
+            data = unicode(data, self.charset, "replace").encode(DEFAULT_CHARSET, "replace")
+         self.accumulator += data
 
 
    # Mozilla - get charset
@@ -51,7 +55,7 @@ class BkmkParser(HTMLParser):
       content = ""
 
       for attrname, value in attrs:
-         value = string.strip(value)
+         value = value.strip()
          if attrname == 'http-equiv':
             http_equiv = value.lower()
          elif attrname == 'content':
@@ -63,13 +67,25 @@ class BkmkParser(HTMLParser):
             self.charset = content.split('=')[1]
          except IndexError:
             pass
+         else:
+            global DEFAULT_CHARSET
+            DEFAULT_CHARSET = sys.getdefaultencoding()
+            if DEFAULT_CHARSET == "ascii":
+               try:
+                  import locale
+               except ImportError:
+                  pass
+               else:
+                  DEFAULT_CHARSET = locale.getpreferredencoding()
 
 
    def start_title(self, attrs):
-      self.accumulator = "%s<TITLE>" % self.accumulator
+      if DEFAULT_CHARSET:
+         self.accumulator += '<META HTTP-EQUIV="Content-Type" CONTENT="text/html; charset=%s">\n' % DEFAULT_CHARSET
+      self.accumulator += "<TITLE>"
 
    def end_title(self):
-      self.accumulator = "%s</TITLE>" % self.accumulator
+      self.accumulator += "</TITLE>"
 
 
    # Start root folder
@@ -80,7 +96,7 @@ class BkmkParser(HTMLParser):
       self.current_folder = root_folder
       self.folder_stack = [root_folder]
 
-      self.root_folder.header = self.accumulator
+      self.root_folder.header = self.accumulator.strip()
       self.accumulator = ''
 
    def end_h1(self):
@@ -91,10 +107,10 @@ class BkmkParser(HTMLParser):
       self.root_folder.name = accumulator
 
 
-   # Start next folder
+   # Start a folder
    def start_h3(self, attrs):
       for attrname, value in attrs:
-         value = string.strip(value)
+         value = value.strip()
          if attrname == 'add_date':
             add_date = value
 
@@ -104,7 +120,7 @@ class BkmkParser(HTMLParser):
       self.current_folder.append(folder)
       self.folder_stack.append(folder) # push new folder
       self.current_folder = folder
-      self.objects = self.objects + 1
+      self.objects += 1
 
    def end_h3(self):
       accumulator = self.accumulator
@@ -114,28 +130,31 @@ class BkmkParser(HTMLParser):
       self.current_folder.name = accumulator
 
 
-   # Start bookmark
+   # Start a bookmark
    def start_a(self, attrs):
       last_visit = None
       last_modified = None
+      keyword = None
 
       for attrname, value in attrs:
-         value = string.strip(value)
-         if attrname == 'href':
+         value = value.strip()
+         if attrname == "href":
             href = value
-         if attrname == 'add_date':
+         elif attrname == "add_date":
             add_date = value
-         if attrname == 'last_visit':
+         elif attrname == "last_visit":
             last_visit = value
-         if attrname == 'last_modified':
+         elif attrname == "last_modified":
             last_modified = value
+         elif attrname == "shortcuturl":
+            keyword = value
 
       debug("Bookmark points to: `%s'" % href)
-      bookmark = Bookmark(href, add_date, last_visit, last_modified)
+      bookmark = Bookmark(href, add_date, last_visit, last_modified, keyword or '')
       self.current_object = bookmark
       self.current_folder.append(bookmark)
-      self.urls = self.urls + 1
-      self.objects = self.objects + 1
+      self.urls += 1
+      self.objects += 1
 
    def end_a(self):
       accumulator = self.accumulator
@@ -153,8 +172,9 @@ class BkmkParser(HTMLParser):
          self.accumulator = ''
 
          current_object = self.current_object
-         current_object.comment = current_object.comment + accumulator
-         debug("Comment: `%s'" % current_object.comment)
+         if current_object:
+            current_object.comment += accumulator.strip()
+            debug("Comment: `%s'" % current_object.comment)
 
 
    def start_dl(self, attrs):
@@ -197,17 +217,17 @@ class BkmkParser(HTMLParser):
       debug("Ruler")
       self.current_folder.append(Ruler())
       self.current_object = None
-      self.objects = self.objects + 1
+      self.objects += 1
 
 
    # BR in comment
    def do_br(self, attrs):
-      self.accumulator = "%s<BR>" % self.accumulator
+      self.accumulator += "<BR>"
 
 
    # Allow < in the text
    def unknown_starttag(self, tag, attrs):
-      self.accumulator = "%s<%s>" % (self.accumulator, tag)
+      self.accumulator += "<%s>" % tag
 
 
    # Do not allow unknow end tags