git.phdru.name Git - bookmarks_db.git/blob - bkmk_parser.py
Updated to m_lib version 1.2. Extended support for Mozilla.
[bookmarks_db.git] / bkmk_parser.py
1 """
2    Parser for Netscape Navigator's bookmarks.html
3
4    Written by BroytMann, Jun 1997 - Jul 2003. Copyright (C) 1997-2003 PhiloSoft Design
5 """
6
7
8 import string
9 from m_lib.net.www.html import HTMLParser
10 from bkmk_objects import Folder, Bookmark, Ruler
11
12
if __debug__:
   def debug(note):
      """Print a trace message (this variant is used when Python runs without -O)."""
      # Parenthesised form prints the same text under Python 2 and 3.
      print(note)

   def dump_names(folder_stack):
      """Return the names of the folders in folder_stack as one string.

      Every item whose isFolder attribute is true contributes its name;
      the names are wrapped in single quotes and separated by spaces,
      e.g. "'Root' 'Sub'".  An empty stack gives "''".
      """
      names = [item.name for item in folder_stack if item.isFolder]
      return "'" + "' '".join(names) + "'"

else:
   def debug(note):
      """Do nothing; tracing is disabled when __debug__ is false."""
      pass
   # Only ever used to build an argument for debug(), which ignores it here.
   dump_names = debug
28
29
class BkmkParser(HTMLParser):
   """Build a tree of Folder, Bookmark and Ruler objects from bookmarks.html.

   Text between tags is collected in self.accumulator; the tag handlers
   below turn it into the name or the comment of the object being built.

   Attributes maintained by this class:
      urls           -- number of bookmarks (<A>) seen
      objects        -- number of folders (<H3>), bookmarks and rulers seen
      charset        -- charset from <META HTTP-EQUIV="Content-Type">, or ""
      recode         -- set to None here; not used by this class
      root_folder    -- Folder created by <H1> (exists only after <H1>)
      current_folder -- folder that receives new objects (set by <H1>)
      current_object -- object that receives comment text, or None
      folder_stack   -- list of open folders, innermost last

   NOTE(review): self.accumulator is read by the handlers before this
   class ever assigns it, so it is presumably initialised by the base
   HTMLParser -- confirm against m_lib.
   """

   def __init__(self):
      """Initialise the base parser, the counters and the folder state."""
      HTMLParser.__init__(self)

      self.urls = 0
      self.objects = 0

      self.charset = ""
      self.recode = None

      # Defined up front so that flush() and close() do not fail with
      # AttributeError when the input contains no <H1>.
      self.current_object = None
      self.folder_stack = []


   def handle_data(self, data):
      """Append a chunk of text to self.accumulator.

      When a charset is known (see do_meta) the chunk is decoded from
      that charset and re-encoded before it is appended.
      """
      if data:
         if self.charset:
            # Python 2 idiom: unicode() decodes the byte string and
            # .encode() without arguments uses the default encoding.
            data = unicode(data, self.charset).encode()
         self.accumulator = "%s%s" % (self.accumulator, data)


   # Mozilla - get charset
   def do_meta(self, attrs):
      """Remember the charset announced by a Content-Type <META> tag.

      attrs is a sequence of (name, value) pairs.  Only a tag whose
      http-equiv is "content-type" (case-insensitive) is considered;
      self.charset is left untouched when no charset parameter is found.
      """
      http_equiv = ""
      content = ""

      for attrname, value in attrs:
         value = value.strip()
         if attrname == 'http-equiv':
            http_equiv = value.lower()
         elif attrname == 'content':
            content = value

      if http_equiv != "content-type":
         return

      # extract charset from "text/html; charset=UTF-8"; look the
      # parameter up by name so that other "="-bearing parameters
      # cannot be mistaken for it
      for param in content.split(';'):
         pieces = param.split('=', 1)
         if len(pieces) == 2 and pieces[0].strip().lower() == 'charset':
            charset = pieces[1].strip().strip('"\'')
            if charset:
               self.charset = charset
            break


   def start_title(self, attrs):
      """Keep the <TITLE> tag as text; it ends up in the root folder header."""
      self.accumulator = "%s<TITLE>" % self.accumulator

   def end_title(self):
      """Keep the </TITLE> tag as text, see start_title."""
      self.accumulator = "%s</TITLE>" % self.accumulator


   # Start root folder
   def start_h1(self, attrs):
      """Create the root folder and reset the folder stack to hold only it.

      Everything accumulated so far is stored as the root folder's header.
      """
      root_folder = Folder()
      self.current_object = root_folder
      self.root_folder = root_folder
      self.current_folder = root_folder
      self.folder_stack = [root_folder]

      self.root_folder.header = self.accumulator
      self.accumulator = ''

   def end_h1(self):
      """Use the text accumulated inside <H1> as the root folder's name."""
      accumulator = self.accumulator
      self.accumulator = ''

      debug("Root folder name: `%s'" % accumulator)
      self.root_folder.name = accumulator


   # Start next folder
   def start_h3(self, attrs):
      """Create a subfolder of the current folder and make it current.

      The stripped ADD_DATE attribute, when present, is passed to Folder.
      """
      add_date = None
      for attrname, value in attrs:
         if attrname == 'add_date':
            add_date = value.strip()

      debug("New folder...")
      if add_date is None:
         # No ADD_DATE attribute: build the folder the same way the root
         # folder is built instead of failing on an unbound local.
         folder = Folder()
      else:
         folder = Folder(add_date)
      self.current_object = folder
      self.current_folder.append(folder)
      self.folder_stack.append(folder) # push new folder
      self.current_folder = folder
      self.objects = self.objects + 1

   def end_h3(self):
      """Use the text accumulated inside <H3> as the current folder's name."""
      accumulator = self.accumulator
      self.accumulator = ''

      debug("Folder name: `%s'" % accumulator)
      self.current_folder.name = accumulator


   # Start bookmark
   def start_a(self, attrs):
      """Create a Bookmark from the <A> attributes and add it to the current folder.

      HREF, ADD_DATE, LAST_VISIT and LAST_MODIFIED are stripped and passed
      to Bookmark.  A missing HREF is passed as "" and the other missing
      attributes as None.
      NOTE(review): assumes Bookmark accepts None for add_date, as it
      already has to for last_visit and last_modified -- confirm.
      """
      href = ""
      add_date = None
      last_visit = None
      last_modified = None

      for attrname, value in attrs:
         value = value.strip()
         if attrname == 'href':
            href = value
         elif attrname == 'add_date':
            add_date = value
         elif attrname == 'last_visit':
            last_visit = value
         elif attrname == 'last_modified':
            last_modified = value

      debug("Bookmark points to: `%s'" % href)
      bookmark = Bookmark(href, add_date, last_visit, last_modified)
      self.current_object = bookmark
      self.current_folder.append(bookmark)
      self.urls = self.urls + 1
      self.objects = self.objects + 1

   def end_a(self):
      """Use the text accumulated inside <A> as the name of the newest bookmark."""
      accumulator = self.accumulator
      self.accumulator = ''

      debug("Bookmark name: `%s'" % accumulator)
      # The bookmark was appended to the current folder by start_a.
      bookmark = self.current_folder[-1]
      bookmark.name = accumulator


   def flush(self):
      """Move the accumulated text into the current object's comment.

      The accumulator is always emptied.  When there is no current object
      (end_dl and do_hr reset it to None) the text is discarded.
      """
      accumulator = self.accumulator
      if not accumulator:
         return
      self.accumulator = ''

      current_object = self.current_object
      if current_object is None:
         # Text met between a closed folder or a ruler and the next item
         # has no object to belong to.  Compare with None explicitly:
         # folders support append() and [-1], so an empty one may well
         # be false in a boolean context.
         return

      current_object.comment = current_object.comment + accumulator
      debug("Comment: `%s'" % current_object.comment)


   def start_dl(self, attrs):
      """<DL> ends the preceding text; store that text as a comment."""
      self.flush()

   # <DT> ends the preceding text in the same way
   do_dt = start_dl


   # End of folder
   def end_dl(self):
      """Close the innermost folder and return to its parent.

      Pending text is flushed first.  An unexpectedly empty stack is only
      reported through debug().  Afterwards there is no current object.
      """
      self.flush()
      debug("End folder")
      debug("Folder stack: %s" % dump_names(self.folder_stack))
      if self.folder_stack:
         del self.folder_stack[-1] # pop last folder
         if self.folder_stack:
            self.current_folder = self.folder_stack[-1]
         else:
            debug("FOLDER STACK is EMPTY!!! (1)")
      else:
         debug("FOLDER STACK is EMPTY!!! (2)")
      self.current_object = None


   def close(self):
      """Finish parsing.

      Raises ValueError when some folders are still open, i.e. the
      folder stack is not empty.
      """
      HTMLParser.close(self)
      if self.folder_stack:
         raise ValueError("wrong folder stack: %s" % self.folder_stack)


   def do_dd(self, attrs):
      """Ignore <DD>; the text that follows it is collected by handle_data."""
      pass

   # <P> is ignored as well
   do_p = do_dd


   # Start ruler
   def do_hr(self, attrs):
      """Add a Ruler to the current folder; afterwards there is no current object."""
      self.flush()
      debug("Ruler")
      self.current_folder.append(Ruler())
      self.current_object = None
      self.objects = self.objects + 1


   # BR in comment
   def do_br(self, attrs):
      """Keep <BR> as literal text in the accumulator."""
      self.accumulator = "%s<BR>" % self.accumulator


   # Allow < in the text
   def unknown_starttag(self, tag, attrs):
      """Put an unrecognised start tag back into the text, without its attributes."""
      self.accumulator = "%s<%s>" % (self.accumulator, tag)


   # Do not allow unknown end tags
   def unknown_endtag(self, tag):
      """Reject an unrecognised end tag by raising NotImplementedError."""
      raise NotImplementedError("Unknow end tag `%s'" % tag)