1 """Parser for Netscape Navigator's and Mozilla's bookmarks.html
3 This file is a part of Bookmarks database and Internet robot.
# Version-control metadata.  Each slice is sized to drop the wrapper of an
# expanded keyword string: "$Revision: " is 11 characters, "$Id: " is 5 and
# "$Date: " is 7, while [:-2] removes the trailing " $".
# NOTE(review): with the keywords unexpanded, as written here, all three
# slices evaluate to empty strings.
6 __version__ = "$Revision$"[11:-2]
7 __revision__ = "$Id$"[5:-2]
8 __date__ = "$Date$"[7:-2]
9 __author__ = "Oleg Broytman <phd@phdru.name>"
10 __copyright__ = "Copyright (C) 1997-2011 PhiloSoft Design"
11 __license__ = "GNU GPL"
# Public API: only the parser class is exported by a star-import.
13 __all__ = ['BkmkParser']
17 from m_lib.defenc import default_encoding
18 from m_lib.net.www.html import HTMLParser
19 from bkmk_objects import Folder, Bookmark, Ruler
# Debug output is switched on by the mere presence of BKMK_DEBUG in the
# environment; its value is ignored.  The `in` operator is used instead of
# dict.has_key(), which is deprecated in Python 2 and removed in Python 3.
DEBUG = "BKMK_DEBUG" in os.environ
# Format entries of the folder stack as one string for debug output.
# The result joins the items of `l` with "' '" and wraps the whole string in
# single quotes, e.g. 'a' 'b'.
# NOTE(review): the lines that create and fill `l` inside the loop are not
# visible in this listing (the embedded line numbers skip) - presumably it
# collects names of the stack entries; confirm against the full file.
28 def dump_names(folder_stack):
30 for object in folder_stack:
33 return "'%s'" % "' '".join(l)
# HTML parser for bookmarks.html that builds a tree of Folder, Bookmark and
# Ruler objects.  Text between tags is collected in self.accumulator; the
# tag handlers consume it as names, headers and comments.  Nesting is
# tracked with self.folder_stack / self.current_folder, and
# self.current_object is the most recently created item.
# NOTE(review): this listing is sparse - the embedded line numbers skip, so
# several method headers and statements are not visible.  The comments below
# describe only the visible lines; anything about missing lines is marked as
# an assumption.
41 class BkmkParser(HTMLParser):
# Initialise the base parser.  The enclosing method header is not visible
# here (presumably __init__).
43 HTMLParser.__init__(self)
# Append character data to the accumulator.  When both self.charset and
# default_encoding are set, the data is first decoded from self.charset
# (undecodable bytes replaced) and re-encoded to default_encoding, with
# unencodable characters written as XML character references.
51 def handle_data(self, data):
53 if self.charset and default_encoding:
54 data = unicode(data, self.charset, "replace").encode(default_encoding, "xmlcharrefreplace")
55 self.accumulator += data
57 # Mozilla - get charset
# Handle <META>: scan the attributes for http-equiv (lower-cased) and
# content; for a Content-Type header, take the charset from the content
# value.
58 def do_meta(self, attrs):
62 for attrname, value in attrs:
64 if attrname == 'http-equiv':
65 http_equiv = value.lower()
66 elif attrname == 'content':
69 if http_equiv == "content-type":
71 # extract charset from "text/html; charset=UTF-8"
# Takes the piece after the first '='.
# NOTE(review): as visible, this raises IndexError when the content value
# has no '=' - confirm whether a guard exists on a line not shown.
72 self.charset = content.split('=')[1]
# Handle <TITLE>: write a META Content-Type line naming default_encoding
# into the accumulator, then the opening <TITLE> tag.
# NOTE(review): listing line 77 is not visible - presumably a condition
# guarding the META line.
76 def start_title(self, attrs):
78 self.accumulator += '<META HTTP-EQUIV="Content-Type" CONTENT="text/html; charset=%s">\n' % default_encoding
79 self.accumulator += "<TITLE>"
# Closing tag is appended to the accumulator; the method header for this
# line is not visible (presumably end_title).
82 self.accumulator += "</TITLE>"
# Handle <H1>: create the root Folder and make it the current object, the
# current folder and the only entry on the folder stack.
85 def start_h1(self, attrs):
86 root_folder = Folder()
87 self.current_object = root_folder
88 self.root_folder = root_folder
89 self.current_folder = root_folder
90 self.folder_stack = [root_folder]
# Everything accumulated before <H1>, stripped, becomes the root folder's
# header.
92 self.root_folder.header = self.accumulator.strip()
# Header not visible (presumably end_h1): the accumulated text becomes the
# root folder's name.
96 accumulator = self.accumulator
99 debug("Root folder name: `%s'" % accumulator)
100 self.root_folder.name = accumulator
# Handle <H3>: start a new sub-folder.  Attribute values are stripped;
# add_date and last_modified are picked out and passed to Folder.
# NOTE(review): the initialisation of add_date / last_modified and the body
# of the add_date branch are on lines not shown.
103 def start_h3(self, attrs):
105 for attrname, value in attrs:
106 value = value.strip()
107 if attrname == 'add_date':
109 elif attrname == 'last_modified':
110 last_modified = value
112 debug("New folder...")
113 folder = Folder(add_date, last_modified=last_modified)
114 self.current_object = folder
# Attach to the parent first, then descend into the new folder.
115 self.current_folder.append(folder)
116 self.folder_stack.append(folder) # push new folder
117 self.current_folder = folder
# Header not visible (presumably end_h3): the accumulated text becomes the
# current folder's name and the accumulator is reset.
121 accumulator = self.accumulator
122 self.accumulator = ''
124 debug("Folder name: `%s'" % accumulator)
125 self.current_folder.name = accumulator
# Handle <A>: start a new bookmark.  Attribute values are stripped and
# matched by name: href, add_date, last_visit, last_modified, shortcuturl,
# icon, last_charset.
# NOTE(review): most branch bodies and the initial values are on lines not
# shown; presumably shortcuturl feeds `keyword` and last_charset feeds
# `charset`, which are the names passed to Bookmark below - confirm.
128 def start_a(self, attrs):
136 for attrname, value in attrs:
137 value = value.strip()
138 if attrname == "href":
140 elif attrname == "add_date":
142 elif attrname == "last_visit":
144 elif attrname == "last_modified":
145 last_modified = value
146 elif attrname == "shortcuturl":
148 elif attrname == "icon":
150 elif attrname == "last_charset":
153 debug("Bookmark points to: `%s'" % href)
# parser_charset falls back to default_encoding when no charset was set.
154 bookmark = Bookmark(href, add_date, last_visit, last_modified,
155 keyword=keyword, icon=icon,
156 charset=charset, parser_charset=self.charset or default_encoding)
157 self.current_object = bookmark
158 self.current_folder.append(bookmark)
# Header not visible (presumably end_a): the accumulated text becomes the
# name of the last item in the current folder - expected to be the bookmark
# appended by start_a - and the accumulator is reset.
163 accumulator = self.accumulator
164 self.accumulator = ''
166 debug("Bookmark name: `%s'" % accumulator)
167 bookmark = self.current_folder[-1]
168 bookmark.name = accumulator
# Header not visible (presumably a flush helper): the stripped accumulated
# text is appended to the current object's comment and the accumulator is
# reset.
# NOTE(review): listing lines 172-173, 175 and 177 are not visible - there
# may be guards (e.g. for an empty accumulator or a None current_object).
171 accumulator = self.accumulator
174 self.accumulator = ''
176 current_object = self.current_object
178 current_object.comment += accumulator.strip()
179 debug("Comment: `%s'" % current_object.comment)
# Handle <DL>; the body of this method is not visible in the listing.
181 def start_dl(self, attrs):
# Header not visible (presumably end_dl): pop the last folder and make the
# new top of the stack the current folder, then clear current_object.
# NOTE(review): the two "EMPTY" debug messages are presumably the else
# branches of the two stack checks (the else lines are not shown).
190 debug("Folder stack: %s" % dump_names(self.folder_stack))
191 if self.folder_stack:
192 del self.folder_stack[-1] # pop last folder
193 if self.folder_stack:
194 self.current_folder = self.folder_stack[-1]
196 debug("FOLDER STACK is EMPTY!!! (1)")
198 debug("FOLDER STACK is EMPTY!!! (2)")
199 self.current_object = None
# Header not visible (presumably close): finish base-class parsing, then
# raise ValueError if folders are still left on the stack.
# Uses the Python 2 "raise Class, value" statement form.
202 HTMLParser.close(self)
203 if self.folder_stack:
204 raise ValueError, "wrong folder stack: %s" % self.folder_stack
# Handle <DD>; the body of this method is not visible in the listing.
206 def do_dd(self, attrs):
# Handle <HR>: append a Ruler to the current folder and clear
# current_object.  Listing lines 213-214 are not visible.
212 def do_hr(self, attrs):
215 self.current_folder.append(Ruler())
216 self.current_object = None
# Handle <BR>: keep the line break as literal "<BR>" text in the
# accumulator.
220 def do_br(self, attrs):
221 self.accumulator += "<BR>"
223 # Allow < in the text
# Any other start tag is written back into the accumulator as "<tag>";
# its attributes are not reproduced.
224 def unknown_starttag(self, tag, attrs):
225 self.accumulator += "<%s>" % tag
227 # Do not allow unknown end tags
# NOTE(review): the message below misspells "Unknown"; it is left unchanged
# here because it is runtime text.
228 def unknown_endtag(self, tag):
229 raise NotImplementedError("Unknow end tag `%s'" % tag)