]> git.phdru.name Git - bookmarks_db.git/blob - bkmk_parser.py
Fix(Robot): Stop splitting and un-splitting URLs
[bookmarks_db.git] / bkmk_parser.py
1 """Parser for Netscape Navigator's and Mozilla's bookmarks.html
2
3 This file is a part of Bookmarks database and Internet robot.
4 """
5
6 __author__ = "Oleg Broytman <phd@phdru.name>"
7 __copyright__ = "Copyright (C) 1997-2023 PhiloSoft Design"
8 __license__ = "GNU GPL"
9
10 __all__ = ['BkmkParser']
11
12
13 import os
14 from m_lib.defenc import default_encoding
15 from m_lib.net.www.html import HTMLParser
16 from bkmk_objects import Folder, Bookmark, Ruler
17
18
# Debugging output is switched on by the mere presence of BKMK_DEBUG
# in the environment; its value is not inspected.
DEBUG = "BKMK_DEBUG" in os.environ

if not DEBUG:
    def debug(note):
        """Discard *note*: debugging output is disabled."""
        return None

    # With debugging off the folder names are never shown,
    # so the same no-op stands in for dump_names.
    dump_names = debug

else:
    def debug(note):
        """Print the debugging *note* to standard output."""
        print(note)

    def dump_names(folder_stack):
        """Return the quoted, space-separated names of the folders
        found in *folder_stack* (items with a false ``isFolder``
        are skipped).
        """
        names = [item.name for item in folder_stack if item.isFolder]
        return "'" + "' '".join(names) + "'"
36
37
class BkmkParser(HTMLParser):
    """Build a tree of Folder/Bookmark/Ruler objects from bookmarks.html.

    The handlers follow the ``start_<tag>`` / ``end_<tag>`` / ``do_<tag>``
    naming scheme and are presumably dispatched by the base HTMLParser.
    Text found between tags is collected in ``self.accumulator`` and
    consumed by whichever handler finishes the current element.

    NOTE(review): ``self.accumulator`` is not created in this class;
    it is presumably initialised by the base class -- confirm.

    Attributes set by this class:
        urls -- number of bookmarks (<A> tags) seen so far.
        objects -- number of folders, bookmarks and rulers seen so far
            (the root folder is not counted).
        charset -- charset taken from a <META HTTP-EQUIV="Content-Type">
            tag, or None if no such tag was seen.
        root_folder, current_folder, current_object, folder_stack --
            parsing state, created when <H1> is encountered.
    """

    def __init__(self):
        HTMLParser.__init__(self)

        self.urls = 0
        self.objects = 0

        self.charset = None
        self.recode = None

    def handle_data(self, data):
        """Append non-empty text *data* to the accumulator unchanged."""
        if data:
            self.accumulator += data

    # Mozilla - get charset
    def do_meta(self, attrs):
        """Remember the charset announced by a Content-Type META tag.

        META tags with any other ``http-equiv`` are ignored, as is
        a Content-Type whose ``content`` has no ``=`` in it.
        """
        http_equiv = ""
        content = ""

        for attrname, value in attrs:
            value = value.strip()
            if attrname == 'http-equiv':
                http_equiv = value.lower()
            elif attrname == 'content':
                content = value

        if http_equiv == "content-type":
            try:
                # extract charset from "text/html; charset=UTF-8"
                self.charset = content.split('=')[1]
            except IndexError:
                pass

    def start_title(self, attrs):
        """Open the title, preceded by a META tag naming default_encoding.

        The META tag is added only when ``default_encoding`` is set.
        """
        if default_encoding:
            # The parentheses are essential: without them the second literal
            # is a separate (discarded) expression statement and the META
            # tag is cut off after HTTP-EQUIV.
            self.accumulator += (
                '<META HTTP-EQUIV="Content-Type" '
                'CONTENT="text/html; charset=%s">\n' % default_encoding
            )
        self.accumulator += "<TITLE>"

    def end_title(self):
        """Close the title in the accumulated header text."""
        self.accumulator += "</TITLE>"

    # Start root folder
    def start_h1(self, attrs):
        """Create the root folder and reset the parsing state.

        Everything accumulated before <H1> (stripped) becomes
        the root folder's ``header``.
        """
        root_folder = Folder()
        self.current_object = root_folder
        self.root_folder = root_folder
        self.current_folder = root_folder
        self.folder_stack = [root_folder]

        self.root_folder.header = self.accumulator.strip()
        self.accumulator = ''

    def end_h1(self):
        """Use the text inside <H1> as the root folder's name."""
        accumulator = self.accumulator
        self.accumulator = ''

        debug("Root folder name: `%s'" % accumulator)
        self.root_folder.name = accumulator

    # Start a folder
    def start_h3(self, attrs):
        """Create a sub-folder of the current folder and descend into it.

        ``add_date`` and ``last_modified`` attributes are passed to Folder;
        either is None when the tag does not carry it.
        """
        # Both default to None so that a tag without ADD_DATE
        # does not leave add_date unbound.
        add_date = None
        last_modified = None
        for attrname, value in attrs:
            value = value.strip()
            if attrname == 'add_date':
                add_date = value
            elif attrname == 'last_modified':
                last_modified = value

        debug("New folder...")
        folder = Folder(add_date, last_modified=last_modified)
        self.current_object = folder
        self.current_folder.append(folder)
        self.folder_stack.append(folder)  # push new folder
        self.current_folder = folder
        self.objects += 1

    def end_h3(self):
        """Use the text inside <H3> as the current folder's name."""
        accumulator = self.accumulator
        self.accumulator = ''

        debug("Folder name: `%s'" % accumulator)
        self.current_folder.name = accumulator

    # Start a bookmark
    def start_a(self, attrs):
        """Create a bookmark in the current folder from the <A> attributes.

        Recognised attributes: href, add_date, last_visit, last_modified,
        shortcuturl (stored as the keyword), icon and last_charset.
        Unknown attributes are ignored.
        """
        # An <A> without HREF yields an empty href instead of
        # an unbound local.
        href = ''
        add_date = None
        last_visit = None
        last_modified = None
        keyword = ''
        icon = None
        charset = None

        for attrname, value in attrs:
            value = value.strip()
            if attrname == "href":
                href = value
            elif attrname == "add_date":
                add_date = value
            elif attrname == "last_visit":
                last_visit = value
            elif attrname == "last_modified":
                last_modified = value
            elif attrname == "shortcuturl":
                keyword = value
            elif attrname == "icon":
                icon = value
            elif attrname == "last_charset":
                charset = value

        debug("Bookmark points to: `%s'" % href)
        bookmark = Bookmark(href, add_date, last_visit, last_modified,
                            keyword=keyword, icon=icon, charset=charset,
                            parser_charset=self.charset or default_encoding)
        self.current_object = bookmark
        self.current_folder.append(bookmark)
        self.urls += 1
        self.objects += 1

    def end_a(self):
        """Use the text inside <A> as the name of the newest bookmark."""
        accumulator = self.accumulator
        self.accumulator = ''

        debug("Bookmark name: `%s'" % accumulator)
        # The bookmark was appended to the current folder by start_a.
        bookmark = self.current_folder[-1]
        bookmark.name = accumulator

    def flush(self):
        """Move accumulated text into the current object's comment.

        The accumulator is emptied even when there is no current object,
        in which case the text is dropped.
        """
        accumulator = self.accumulator

        if accumulator:
            self.accumulator = ''

            current_object = self.current_object
            if current_object:
                current_object.comment += accumulator.strip()
                debug("Comment: `%s'" % current_object.comment)

    def start_dl(self, attrs):
        """Flush any pending comment text when a list (or item) begins."""
        self.flush()

    do_dt = start_dl

    # End of folder
    def end_dl(self):
        """Leave the current folder and return to its parent."""
        self.flush()
        debug("End folder")
        debug("Folder stack: %s" % dump_names(self.folder_stack))
        if self.folder_stack:
            del self.folder_stack[-1]  # pop last folder
            if self.folder_stack:
                self.current_folder = self.folder_stack[-1]
            else:
                debug("FOLDER STACK is EMPTY!!! (1)")
        else:
            debug("FOLDER STACK is EMPTY!!! (2)")
        self.current_object = None

    def close(self):
        """Finish parsing.

        Raises ValueError if some folders were left open, i.e. the folder
        stack is not empty after the base class has finished.
        """
        HTMLParser.close(self)
        if self.folder_stack:
            raise ValueError("wrong folder stack: %s" % self.folder_stack)

    def do_dd(self, attrs):
        """Ignore the tag; the comment text that follows is accumulated."""
        pass

    do_p = do_dd

    # Start ruler
    def do_hr(self, attrs):
        """Append a Ruler to the current folder."""
        self.flush()
        debug("Ruler")
        self.current_folder.append(Ruler())
        self.current_object = None
        self.objects += 1

    # BR in comment
    def do_br(self, attrs):
        """Keep a line break inside the accumulated text."""
        self.accumulator += "<BR>"

    # Allow < in the text
    def unknown_starttag(self, tag, attrs):
        """Put an unrecognised start tag back into the text.

        Only the tag name is kept; the attributes are dropped.
        """
        self.accumulator += "<%s>" % tag

    # Do not allow unknown end tags
    def unknown_endtag(self, tag):
        """Reject an unrecognised end tag by raising NotImplementedError."""
        raise NotImplementedError("Unknow end tag `%s'" % tag)