-"""
- Parser for Netscape Navigator's and Mozilla's bookmarks.html
+"""Parser for Netscape Navigator's and Mozilla's bookmarks.html
- Written by BroytMann. Copyright (C) 1997-2004 PhiloSoft Design
+This file is a part of Bookmarks database and Internet robot.
"""
+__author__ = "Oleg Broytman <phd@phdru.name>"
+__copyright__ = "Copyright (C) 1997-2023 PhiloSoft Design"
+__license__ = "GNU GPL"
+
+__all__ = ['BkmkParser']
+
-import sys, os
+import os
+from m_lib.defenc import default_encoding
from m_lib.net.www.html import HTMLParser
from bkmk_objects import Folder, Bookmark, Ruler
# Debug tracing is switched on by the mere presence of BKMK_DEBUG in the
# environment; the value of the variable is not inspected.
# `in` replaces dict.has_key(), which exists only on Python 2.
DEBUG = "BKMK_DEBUG" in os.environ

if DEBUG:
    def debug(note):
        """Print the trace message *note* to standard output."""
        print(note)

    def dump_names(folder_stack):
        """Return the names of the folders on *folder_stack*, each quoted.

        Items whose ``isFolder`` attribute is false are skipped.  The names
        are joined as ``'a' 'b' 'c'``; an empty stack yields ``''``.
        """
        names = [item.name for item in folder_stack if item.isFolder]
        return "'%s'" % "' '".join(names)
else:
    def debug(note):
        """Discard *note*; debugging is disabled."""
        pass

    # With debugging off nothing is ever printed, so the no-op is reused
    # instead of rendering the folder names.
    dump_names = debug
-DEFAULT_CHARSET = None
-
class BkmkParser(HTMLParser):
    """Build a tree of bookmark objects from a bookmarks.html document.

    The tag handlers below create ``Folder``, ``Bookmark`` and ``Ruler``
    objects.  The tree is rooted at ``self.root_folder``, which is created
    when ``<H1>`` is seen.  ``self.urls`` counts bookmarks; ``self.objects``
    counts bookmarks, non-root folders and rulers.

    NOTE(review): ``self.accumulator`` is never initialised in this class;
    presumably the ``HTMLParser`` base class provides it -- confirm.
    """

    def __init__(self):
        """Initialise the counters and the parsing state."""
        HTMLParser.__init__(self)

        self.urls = 0
        self.objects = 0

        # Charset announced by the document's <META> tag, if any.
        self.charset = None
        self.recode = None

        # Defined up front so that flush() and close() do not fail with
        # AttributeError when the input never contains <H1>.
        self.current_object = None
        self.folder_stack = []

    def handle_data(self, data):
        """Append character data to the accumulator, recoding it if needed.

        When both the document charset and ``default_encoding`` are known,
        byte strings are decoded from the former and encoded into the
        latter; characters the target encoding cannot represent become XML
        character references.
        """
        if not data:
            return

        if self.charset and default_encoding and isinstance(data, bytes):
            # bytes.decode() is the portable spelling of unicode(data, ...);
            # the ``unicode`` builtin exists only on Python 2.  Text that is
            # already decoded is accumulated unchanged.
            text = data.decode(self.charset, "replace")
            data = text.encode(default_encoding, "xmlcharrefreplace")

        self.accumulator += data

    # Mozilla - get charset
    def do_meta(self, attrs):
        """Remember the charset declared by a Content-Type ``<META>`` tag."""
        http_equiv = ""
        content = ""

        for attrname, value in attrs:
            value = value.strip()
            if attrname == 'http-equiv':
                http_equiv = value.lower()
            elif attrname == 'content':
                content = value

        if http_equiv == "content-type":
            try:
                # extract charset from "text/html; charset=UTF-8"
                self.charset = content.split('=')[1]
            except IndexError:
                # No "=" in CONTENT: keep the previously known charset.
                pass

    def start_title(self, attrs):
        """Open the title, preceded by a META tag naming the output charset."""
        if default_encoding:
            # The parentheses join both halves of the tag into one string;
            # without them only the first half was appended.
            self.accumulator += (
                '<META HTTP-EQUIV="Content-Type" '
                'CONTENT="text/html; charset=%s">\n' % default_encoding
            )
        self.accumulator += "<TITLE>"

    def end_title(self):
        """Close the title inside the accumulated header text."""
        self.accumulator += "</TITLE>"

    # Start root folder
    def start_h1(self, attrs):
        """Create the root folder and reset the folder stack to it.

        Everything accumulated so far becomes the root folder's header.
        """
        root_folder = Folder()
        self.current_object = root_folder
        self.root_folder = root_folder
        self.current_folder = root_folder
        self.folder_stack = [root_folder]

        self.root_folder.header = self.accumulator.strip()
        self.accumulator = ''

    def end_h1(self):
        """Use the text accumulated inside ``<H1>`` as the root folder name."""
        accumulator = self.accumulator
        self.accumulator = ''

        debug("Root folder name: `%s'" % accumulator)
        self.root_folder.name = accumulator

    # Start a folder
    def start_h3(self, attrs):
        """Create a sub-folder of the current folder and descend into it."""
        # Both attributes are optional.  add_date used to be left unbound
        # when missing, which raised UnboundLocalError below.
        # NOTE(review): assumes Folder accepts None for add_date -- confirm.
        add_date = None
        last_modified = None

        for attrname, value in attrs:
            value = value.strip()
            if attrname == 'add_date':
                add_date = value
            elif attrname == 'last_modified':
                last_modified = value

        debug("New folder...")
        folder = Folder(add_date, last_modified=last_modified)
        self.current_object = folder
        self.current_folder.append(folder)
        self.folder_stack.append(folder)  # push new folder
        self.current_folder = folder
        self.objects += 1

    def end_h3(self):
        """Use the text accumulated inside ``<H3>`` as the folder name."""
        accumulator = self.accumulator
        self.accumulator = ''

        debug("Folder name: `%s'" % accumulator)
        self.current_folder.name = accumulator

    # Start a bookmark
    def start_a(self, attrs):
        """Create a bookmark from the ``<A>`` attributes in the current folder."""
        add_date = None
        last_visit = None
        last_modified = None
        keyword = ''
        icon = None
        charset = None

        for attrname, value in attrs:
            value = value.strip()
            if attrname == "href":
                href = value
            elif attrname == "add_date":
                add_date = value
            elif attrname == "last_visit":
                last_visit = value
            elif attrname == "last_modified":
                last_modified = value
            elif attrname == "shortcuturl":
                keyword = value
            elif attrname == "icon":
                icon = value
            elif attrname == "last_charset":
                charset = value

        # NOTE(review): href has no default, so an <A> without HREF raises
        # UnboundLocalError on the next line -- decide whether that is wanted.
        debug("Bookmark points to: `%s'" % href)
        bookmark = Bookmark(
            href, add_date, last_visit, last_modified,
            keyword=keyword, icon=icon, charset=charset,
            parser_charset=self.charset or default_encoding)
        self.current_object = bookmark
        self.current_folder.append(bookmark)
        self.urls += 1
        self.objects += 1

    def end_a(self):
        """Use the text accumulated inside ``<A>`` as the bookmark name."""
        accumulator = self.accumulator
        self.accumulator = ''

        debug("Bookmark name: `%s'" % accumulator)
        # The bookmark is the item start_a() appended most recently.
        bookmark = self.current_folder[-1]
        bookmark.name = accumulator

    def flush(self):
        """Move pending accumulated text into the current object's comment.

        The accumulator is emptied even when there is no current object, in
        which case the text is dropped.
        """
        accumulator = self.accumulator

        if accumulator:
            self.accumulator = ''

            current_object = self.current_object
            if current_object:
                current_object.comment += accumulator.strip()
                debug("Comment: `%s'" % current_object.comment)

    def start_dl(self, attrs):
        """Flush pending comment text when a list starts."""
        self.flush()

    # <DT> is handled exactly like <DL>: it only flushes pending text.
    do_dt = start_dl

    # End of folder
    def end_dl(self):
        """Leave the current folder and make its parent current again."""
        self.flush()
        debug("End folder")
        debug("Folder stack: %s" % dump_names(self.folder_stack))
        if self.folder_stack:
            del self.folder_stack[-1]  # pop last folder
            if self.folder_stack:
                self.current_folder = self.folder_stack[-1]
            else:
                debug("FOLDER STACK is EMPTY!!! (1)")
        else:
            debug("FOLDER STACK is EMPTY!!! (2)")
        self.current_object = None

    def close(self):
        """Finish parsing.

        Raises:
            ValueError: if folders are still left on the folder stack.
        """
        HTMLParser.close(self)
        if self.folder_stack:
            raise ValueError("wrong folder stack: %s" % self.folder_stack)

    def do_dd(self, attrs):
        """Ignore ``<DD>``; the text after it is collected by handle_data()."""
        pass

    # <P> is ignored in the same way as <DD>.
    do_p = do_dd

    # Start ruler
    def do_hr(self, attrs):
        """Append a ruler to the current folder."""
        self.flush()
        debug("Ruler")
        self.current_folder.append(Ruler())
        self.current_object = None
        self.objects += 1

    # BR in comment
    def do_br(self, attrs):
        """Keep a line break as literal markup in the accumulated text."""
        self.accumulator += "<BR>"

    # Allow < in the text
    def unknown_starttag(self, tag, attrs):
        """Put an unrecognised start tag back into the text.

        Only the tag name is written back; its attributes are dropped.
        """
        self.accumulator += "<%s>" % tag

    # Do not allow unknown end tags
    def unknown_endtag(self, tag):
        """Reject any end tag that has no dedicated handler.

        Raises:
            NotImplementedError: always.
        """
        raise NotImplementedError("Unknow end tag `%s'" % tag)