1 """Parser for Netscape Navigator's and Mozilla's bookmarks.html
3 This file is a part of Bookmarks database and Internet robot.
6 __version__ = "$Revision$"[11:-2]
7 __revision__ = "$Id$"[5:-2]
8 __date__ = "$Date$"[7:-2]
9 __author__ = "Oleg Broytman <phd@phdru.name>"
10 __copyright__ = "Copyright (C) 1997-2011 PhiloSoft Design"
11 __license__ = "GNU GPL"
13 __all__ = ['BkmkParser']
17 from m_lib.net.www.html import HTMLParser
18 from bkmk_objects import Folder, Bookmark, Ruler
# Debug output is enabled by the mere presence of the BKMK_DEBUG environment
# variable; its value is never inspected.  The `in` test replaces the
# deprecated dict.has_key(), which no longer exists in Python 3.
DEBUG = "BKMK_DEBUG" in os.environ
27 def dump_names(folder_stack):
29 for object in folder_stack:
32 return "'%s'" % "' '".join(l)
40 DEFAULT_CHARSET = None
42 class BkmkParser(HTMLParser):
44 HTMLParser.__init__(self)
52 def handle_data(self, data):
55 data = unicode(data, self.charset, "replace").encode(DEFAULT_CHARSET, "xmlcharrefreplace")
56 self.accumulator += data
58 # Mozilla - get charset
59 def do_meta(self, attrs):
63 for attrname, value in attrs:
65 if attrname == 'http-equiv':
66 http_equiv = value.lower()
67 elif attrname == 'content':
70 if http_equiv == "content-type":
72 # extract charset from "text/html; charset=UTF-8"
73 self.charset = content.split('=')[1]
77 global DEFAULT_CHARSET
78 DEFAULT_CHARSET = sys.getdefaultencoding()
79 if DEFAULT_CHARSET == "ascii":
85 DEFAULT_CHARSET = locale.getpreferredencoding()
87 def start_title(self, attrs):
89 self.accumulator += '<META HTTP-EQUIV="Content-Type" CONTENT="text/html; charset=%s">\n' % DEFAULT_CHARSET
90 self.accumulator += "<TITLE>"
93 self.accumulator += "</TITLE>"
def start_h1(self, attrs):
    """Handle an opening <H1>: create the root folder and reset parser state.

    A new ``Folder`` becomes the root folder, the current folder, the current
    object and the only entry on the folder stack.  The text accumulated so
    far is stripped and stored as the root folder's ``header``, after which
    the accumulator is emptied.  ``attrs`` is not used.
    """
    root = Folder()
    # Chained assignment binds the targets left to right, i.e. in the same
    # order as three separate assignment statements would.
    self.current_object = self.root_folder = self.current_folder = root
    self.folder_stack = [root]

    # Whatever text preceded <H1> is preserved as the root folder's header.
    root.header = self.accumulator.strip()
    self.accumulator = ''
107 accumulator = self.accumulator
108 self.accumulator = ''
110 debug("Root folder name: `%s'" % accumulator)
111 self.root_folder.name = accumulator
114 def start_h3(self, attrs):
116 for attrname, value in attrs:
117 value = value.strip()
118 if attrname == 'add_date':
120 elif attrname == 'last_modified':
121 last_modified = value
123 debug("New folder...")
124 folder = Folder(add_date, last_modified=last_modified)
125 self.current_object = folder
126 self.current_folder.append(folder)
127 self.folder_stack.append(folder) # push new folder
128 self.current_folder = folder
132 accumulator = self.accumulator
133 self.accumulator = ''
135 debug("Folder name: `%s'" % accumulator)
136 self.current_folder.name = accumulator
139 def start_a(self, attrs):
147 for attrname, value in attrs:
148 value = value.strip()
149 if attrname == "href":
151 elif attrname == "add_date":
153 elif attrname == "last_visit":
155 elif attrname == "last_modified":
156 last_modified = value
157 elif attrname == "shortcuturl":
159 elif attrname == "icon":
161 elif attrname == "last_charset":
164 debug("Bookmark points to: `%s'" % href)
165 bookmark = Bookmark(href, add_date, last_visit, last_modified,
166 keyword or '', '', icon, charset)
167 self.current_object = bookmark
168 self.current_folder.append(bookmark)
173 accumulator = self.accumulator
174 self.accumulator = ''
176 debug("Bookmark name: `%s'" % accumulator)
177 bookmark = self.current_folder[-1]
178 bookmark.name = accumulator
181 accumulator = self.accumulator
184 self.accumulator = ''
186 current_object = self.current_object
188 current_object.comment += accumulator.strip()
189 debug("Comment: `%s'" % current_object.comment)
191 def start_dl(self, attrs):
200 debug("Folder stack: %s" % dump_names(self.folder_stack))
201 if self.folder_stack:
202 del self.folder_stack[-1] # pop last folder
203 if self.folder_stack:
204 self.current_folder = self.folder_stack[-1]
206 debug("FOLDER STACK is EMPTY!!! (1)")
208 debug("FOLDER STACK is EMPTY!!! (2)")
209 self.current_object = None
212 HTMLParser.close(self)
213 if self.folder_stack:
214 raise ValueError, "wrong folder stack: %s" % self.folder_stack
216 def do_dd(self, attrs):
222 def do_hr(self, attrs):
225 self.current_folder.append(Ruler())
226 self.current_object = None
def do_br(self, attrs):
    """Handle a <BR> tag by appending the literal text ``<BR>`` to the accumulator.

    ``attrs`` is accepted to satisfy the handler signature but is not used.
    """
    line_break = "<BR>"
    self.accumulator += line_break
# Allow < in the text
def unknown_starttag(self, tag, attrs):
    """Write a start tag back into the accumulated text as ``<tag>``.

    Only the tag name is re-emitted; ``attrs`` is ignored, so any attributes
    the tag carried are not reproduced.
    NOTE(review): presumably invoked by the HTMLParser base class for tags
    that have no dedicated handler -- the dispatch is not visible here.
    """
    markup = "<%s>" % tag
    self.accumulator += markup
# Do not allow unknown end tags
def unknown_endtag(self, tag):
    """Reject an end tag by raising an error that names it.

    Unlike unknown start tags, which are written back into the text, an
    end tag reaching this method is always treated as an error.

    Raises:
        NotImplementedError: always; the message contains ``tag``.
    """
    # (tag,) keeps the formatting correct even if `tag` is itself a tuple.
    raise NotImplementedError("Unknown end tag `%s'" % (tag,))