]> git.phdru.name Git - bookmarks_db.git/blob - Robots/bkmk_robot_base.py
Allow the script to be run only in the origin directory
[bookmarks_db.git] / Robots / bkmk_robot_base.py
1 """Base class for robots
2
3 This file is a part of Bookmarks database and Internet robot.
4
5 """
6
7 __author__ = "Oleg Broytman <phd@phdru.name>"
8 __copyright__ = "Copyright (C) 2000-2014 PhiloSoft Design"
9 __license__ = "GNU GPL"
10
11 __all__ = ['robot_base', 'get_error']
12
13
14 import sys
15 import time, urllib
16 from base64 import b64encode
17 from urlparse import urljoin
18
19 from m_lib.net.www.util import parse_time
20 from m_lib.md5wrapper import md5wrapper
21
22 from bkmk_objects import Robot
23 from parse_html import parse_html
24
25
class RedirectException(Exception):
   """Raised when a URL turns out to redirect somewhere else.

   The exception message has the form "(<kind>) to <newurl>", where <kind>
   is a short label looked up in reloc_dict.  The new location is also
   available as the ``url`` attribute.
   """

   # Maps a redirect code to the label shown in the message.  Integer keys
   # are HTTP status codes; "html" is used for redirects found in the page
   # itself rather than in the HTTP response.
   reloc_dict = {
      301: "perm.",
      302: "temp2.",
      303: "temp3.",
      307: "temp7.",
      "html": "html"
   }

   def __init__(self, errcode, newurl):
      """Build the exception.

      errcode -- a key of reloc_dict; any other value is accepted and is
                 shown verbatim in the message
      newurl  -- the redirect target (stored as self.url)
      """
      # An unlisted code (e.g. 308) must not turn a redirect report into
      # an unrelated KeyError, so fall back to the code itself.
      kind = self.reloc_dict.get(errcode, errcode)
      Exception.__init__(self, "(%s) to %s" % (kind, newurl))
      self.url = newurl
37
38
def get_error(msg):
   """Format an error description as a single-line string.

   A str is returned unchanged.  Anything else is iterated: each item is
   converted with str(), its newlines are replaced by a literal "\\n",
   it is wrapped in single quotes, and the items are joined with spaces
   inside parentheses.
   """
   if not isinstance(msg, str):
      quoted = ["'%s'" % str(item).replace('\n', "\\n") for item in msg]
      return "(%s)" % ' '.join(quoted)

   return msg
48
49
# Icon cache, module-level and therefore shared by all robot instances.
# Maps an icon URL to a tuple (content type, icon encoded as a "data:" URI),
# or to None if there is no usable icon at that URL.
icons = {}
52
53 class robot_base(Robot):
54    def check_url(self, bookmark):
55       try:
56          self.start = int(time.time())
57          bookmark.icon = None
58
59          url_type, url_rest = urllib.splittype(bookmark.href)
60          url_host, url_path = urllib.splithost(url_rest)
61          url_path, url_tag  = urllib.splittag(url_path)
62
63          url = "%s://%s%s" % (url_type, url_host, url_path)
64          headers, content = self.urlretrieve(bookmark, url, True)
65
66          if content is None:
67              return 1
68
69          size = 0
70          last_modified = None
71
72          if headers:
73             try:
74                size = headers["Content-Length"]
75             except KeyError:
76                size = len(content)
77
78             try:
79                last_modified = headers["Last-Modified"]
80             except KeyError:
81                pass
82
83             if last_modified:
84                last_modified = parse_time(last_modified)
85          else:
86             size = len(content)
87
88          if last_modified:
89             last_modified = str(int(last_modified))
90          else:
91             last_modified = bookmark.last_visit
92
93          bookmark.size = size
94          bookmark.last_modified = last_modified
95
96          md5 = md5wrapper()
97          if urllib._urlopener.type == "ftp": # Pass welcome message through MD5
98             md5.update(self.get_ftp_welcome())
99
100          md5.update(content)
101          bookmark.md5 = str(md5)
102
103          if headers:
104             try:
105                content_type = headers["Content-Type"]
106                self.log("   Content-Type: %s" % content_type)
107                try:
108                   # extract charset from "text/html; foo; charset=UTF-8, bar; baz;"
109                   content_type, charset = content_type.split(';', 1)
110                   content_type = content_type.strip()
111                   charset = charset.split('=')[1].strip().split(',')[0]
112                   self.log("   HTTP charset   : %s" % charset)
113                except (ValueError, IndexError):
114                   charset = None
115                   self.log("   no charset in Content-Type header")
116                for ctype in ("text/html", "application/xhtml+xml"):
117                   if content_type.startswith(ctype):
118                       html = True
119                       break
120                else:
121                   html = False
122                if html:
123                   parser = parse_html(content, charset, self.log)
124                   if parser:
125                       bookmark.real_title = parser.title
126                       icon = parser.icon
127                   else:
128                      icon = None
129                   if not icon:
130                      icon = "/favicon.ico"
131                   icon_url = urljoin("%s://%s%s" % (url_type, url_host, url_path), icon)
132                   self.log("   looking for icon at: %s" % icon_url)
133                   if icon_url in icons:
134                      if icons[icon_url]:
135                         bookmark.icon_href = icon_url
136                         content_type, bookmark.icon = icons[icon_url]
137                         self.log("   cached icon: %s" % content_type)
138                      else:
139                         self.log("   cached icon: no icon")
140                   else:
141                      try:
142                         _icon_url = icon_url
143                         for i in range(8):
144                            try:
145                               icon_headers, icon_data = self.urlretrieve(bookmark, _icon_url)
146                            except RedirectException, e:
147                               _icon_url = e.url
148                               self.log("   redirect to : %s" % _icon_url)
149                            else:
150                               if icon_data is None:
151                                    raise IOError("No icon")
152                               break
153                         else:
154                            raise IOError("Too many redirects")
155                      except:
156                         etype, emsg, tb = sys.exc_info()
157                         self.log("   no icon        : %s %s" % (etype, emsg))
158                         etype = emsg = tb = None
159                         icons[icon_url] = None
160                      else:
161                         content_type = icon_headers["Content-Type"]
162                         if content_type.startswith("application/") \
163                               or content_type.startswith("image/") \
164                               or content_type.startswith("text/plain"):
165                            bookmark.icon_href = icon_url
166                            self.log("   got icon       : %s" % content_type)
167                            if content_type.startswith("application/") \
168                                  or content_type.startswith("text/plain"):
169                               self.log("   non-image content type, assume x-icon")
170                               content_type = 'image/x-icon'
171                            bookmark.icon = "data:%s;base64,%s" % (content_type, b64encode(icon_data))
172                            icons[icon_url] = (content_type, bookmark.icon)
173                         else:
174                            self.log("   no icon        : bad content type '%s'" % content_type)
175                            icons[icon_url] = None
176                   if parser and parser.refresh:
177                      refresh = parser.refresh
178                      try:
179                         url = refresh.split('=', 1)[1]
180                      except IndexError:
181                         url = "self"
182                      try:
183                         timeout = float(refresh.split(';')[0])
184                      except (IndexError, ValueError):
185                         raise RedirectException("html", "Bad redirect to %s (%s)" % (url, refresh))
186                      else:
187                         try:
188                            timeout = int(refresh.split(';')[0])
189                         except ValueError:
190                            pass # float timeout
191                         raise RedirectException("html", "%s (%s sec)" % (url, timeout))
192
193             except KeyError, key:
194                self.log("   no header: %s" % key)
195
196       except EOFError:
197          bookmark.error = "Unexpected EOF (FTP server closed connection)"
198          self.log('   EOF: %s' % bookmark.error)
199
200       except RedirectException, msg:
201          bookmark.moved = str(msg)
202          self.log('   Moved: %s' % bookmark.moved)
203
204       except KeyboardInterrupt:
205          self.log("Keyboard interrupt (^C)")
206          return 0
207
208       except:
209          import traceback
210          traceback.print_exc()
211          bookmark.error = "Exception!"
212          self.log('   Exception: %s' % bookmark.error)
213
214       finally:
215          self.finish_check_url(bookmark)
216
217       # Tested
218       return 1
219
220    def finish_check_url(self, bookmark):
221       start = self.start
222       bookmark.last_tested = str(start)
223
224       now = int(time.time())
225       bookmark.test_time = str(now - start)
226
227       self.cleanup()