]> git.phdru.name Git - bookmarks_db.git/blob - Robots/bkmk_robot_base.py
dce593310c1a1461749f189ca49dbd3b269f06cb
[bookmarks_db.git] / Robots / bkmk_robot_base.py
1 """Base class for robots
2
3 This file is a part of Bookmarks database and Internet robot.
4
5 """
6
7 __author__ = "Oleg Broytman <phd@phdru.name>"
8 __copyright__ = "Copyright (C) 2000-2014 PhiloSoft Design"
9 __license__ = "GNU GPL"
10
11 __all__ = ['robot_base', 'get_error']
12
13
14 import sys
15 import time, urllib
16 from base64 import b64encode
17 from urlparse import urljoin
18
19 from m_lib.net.www.util import parse_time
20 from m_lib.md5wrapper import md5wrapper
21
22 from bkmk_objects import Robot
23 from parse_html import parse_html
24
25
class RedirectException(Exception):
   """Signal that a URL has been redirected elsewhere.

   The exception message has the form "(<kind>) to <newurl>", where <kind>
   is a short label for the redirect code; the redirect target is also
   stored in the ``url`` attribute.
   """

   # Short labels for the known redirect kinds: HTTP status codes plus the
   # "html" pseudo-code.
   reloc_dict = {
      301: "perm.",
      302: "temp2.",
      303: "temp3.",
      307: "temp7.",
      "html": "html"
   }

   def __init__(self, errcode, newurl):
      """Build the message from the redirect code and remember the new URL.

      errcode -- a key of reloc_dict; any other value is shown as is
      newurl  -- redirect target, stored in self.url
      """
      # Fall back to the raw code so that an unlisted status (e.g. 308)
      # still yields a usable redirect report instead of a KeyError raised
      # while the exception itself is being constructed.
      label = self.reloc_dict.get(errcode, errcode)
      Exception.__init__(self, "(%s) to %s" % (label, newurl))
      self.url = newurl
37
38
def get_error(msg):
   """Format an error value as a one-line string.

   A str is returned unchanged.  Anything else is iterated: every item is
   converted with str(), its newlines are replaced by a literal backslash-n,
   the result is wrapped in single quotes, and the quoted items are joined
   with spaces inside parentheses.
   """
   if not isinstance(msg, str):
      quoted = ["'%s'" % str(item).replace('\n', "\\n") for item in msg]
      return "(%s)" % ' '.join(quoted)

   return msg
48
49
# Module-level icon cache, filled in by robot_base.check_url.
# Maps an icon URL to a tuple (content type, "data:" URI with the
# base64-encoded icon), or to None if no usable icon was found at that URL.
icons = {}
52
class robot_base(Robot):
   """Base robot: checks one bookmark's URL and records what it finds.

   The methods here call self.get(), self.get_ftp_welcome(), self.cleanup()
   and self.log(); none of them is defined in this class, so presumably
   Robot or the concrete subclasses provide them.
   """

   def check_url(self, bookmark):
      """Fetch bookmark.href and store the results on the bookmark.

      On a fetch error only bookmark.error is set.  Otherwise size,
      last_modified and md5 are filled in; for HTML pages real_title and
      the icon (icon_href, icon) are filled in as well.  A redirect is
      reported in bookmark.moved.  finish_check_url() is always called at
      the end.

      Returns 0 if the check was interrupted from the keyboard, 1 otherwise.
      """
      try:
         self.start = int(time.time())
         bookmark.icon = None

         # Rebuild the URL without its #fragment part.
         url_type, url_rest = urllib.splittype(bookmark.href)
         url_host, url_path = urllib.splithost(url_rest)
         url_path = urllib.splittag(url_path)[0]

         url = "%s://%s%s" % (url_type, url_host, url_path)
         error, headers, content = self.get(bookmark, url, True)

         if error:
            bookmark.error = error
            return 1

         last_modified = None

         if headers:
            try:
               size = headers["Content-Length"]
            except KeyError:
               size = len(content)

            try:
               last_modified = headers["Last-Modified"]
            except KeyError:
               pass

            if last_modified:
               last_modified = parse_time(last_modified)
         else:
            size = len(content)

         if last_modified:
            last_modified = str(int(last_modified))
         else:
            # No usable Last-Modified: fall back to the last visit time.
            last_modified = bookmark.last_visit

         bookmark.size = size
         bookmark.last_modified = last_modified

         md5 = md5wrapper()
         if urllib._urlopener.type == "ftp": # Pass welcome message through MD5
            md5.update(self.get_ftp_welcome())

         md5.update(content)
         bookmark.md5 = str(md5)

         if headers:
            try:
               self._check_content(bookmark, url, headers, content)
            except KeyError as key:
               self.log("   no header: %s" % key)

      except EOFError:
         bookmark.error = "Unexpected EOF (FTP server closed connection)"
         self.log('   EOF: %s' % bookmark.error)

      except RedirectException as msg:
         bookmark.moved = str(msg)
         self.log('   Moved: %s' % bookmark.moved)

      except KeyboardInterrupt:
         self.log("Keyboard interrupt (^C)")
         return 0

      except:
         # Last-resort boundary: one broken URL must not stop the whole run.
         import traceback
         traceback.print_exc()
         bookmark.error = "Exception!"
         self.log('   Exception: %s' % bookmark.error)

      finally:
         self.finish_check_url(bookmark)

      # Tested
      return 1

   def _parse_content_type(self, header):
      """Split a Content-Type header into (media type, charset or None).

      Handles e.g. "text/html; foo; charset=UTF-8, bar; baz;".  Only the
      parameter actually named "charset" is used, so other parameters
      before or after it do not leak into the result; surrounding quotes
      are removed from the value.
      """
      parts = header.split(';')
      media_type = parts[0].strip()
      charset = None
      for param in parts[1:]:
         name, sep, value = param.partition('=')
         if sep and name.strip().lower() == "charset":
            # "charset=UTF-8, bar": keep only what precedes the comma.
            charset = value.split(',')[0].strip().strip('"\'') or None
            break
      return media_type, charset

   def _check_content(self, bookmark, page_url, headers, content):
      """Inspect the content type; for HTML get title, icon and refresh.

      Raises KeyError if there is no Content-Type header and
      RedirectException if the page declares a refresh.
      """
      content_type = headers["Content-Type"]
      self.log("   Content-Type: %s" % content_type)
      content_type, charset = self._parse_content_type(content_type)
      if charset:
         self.log("   HTTP charset   : %s" % charset)
      else:
         self.log("   no charset in Content-Type header")

      if not content_type.startswith(("text/html", "application/xhtml+xml")):
         return

      parser = parse_html(content, charset, self.log)
      icon = None
      if parser:
         bookmark.real_title = parser.title
         icon = parser.icon
      if not icon:
         icon = "/favicon.ico"
      icon_url = urljoin(page_url, icon)
      self.log("   looking for icon at: %s" % icon_url)
      self._check_icon(bookmark, icon_url)

      if parser and parser.refresh:
         self._raise_html_redirect(parser.refresh)

   def _fetch_icon(self, bookmark, icon_url):
      """Download the icon, following up to 8 redirects.

      Returns (headers, data); raises IOError if there is no data or if
      there are too many redirects.
      """
      for _ in range(8):
         try:
            _error, icon_headers, icon_data = self.get(bookmark, icon_url)
         except RedirectException as e:
            icon_url = e.url
            self.log("   redirect to : %s" % icon_url)
         else:
            if icon_data is None:
               raise IOError("No icon")
            return icon_headers, icon_data
      raise IOError("Too many redirects")

   def _check_icon(self, bookmark, icon_url):
      """Set bookmark.icon_href and bookmark.icon from the cache or the net.

      The outcome, including "no icon", is remembered in the module-level
      icons cache under icon_url.
      """
      if icon_url in icons:
         cached = icons[icon_url]
         if cached:
            bookmark.icon_href = icon_url
            icon_type, bookmark.icon = cached
            self.log("   cached icon: %s" % icon_type)
         else:
            self.log("   cached icon: no icon")
         return

      try:
         icon_headers, icon_data = self._fetch_icon(bookmark, icon_url)
      except Exception:
         # Deliberately not a bare except: KeyboardInterrupt must reach
         # check_url so that ^C stops the check instead of being logged
         # as a missing icon.
         etype, emsg = sys.exc_info()[:2]
         self.log("   no icon        : %s %s" % (etype, emsg))
         icons[icon_url] = None
         return

      try:
         icon_type = icon_headers["Content-Type"]
      except (KeyError, TypeError):
         # The header is missing or there are no headers at all: treat it
         # as "no icon" rather than failing the check of the page itself.
         self.log("   no icon        : no Content-Type header")
         icons[icon_url] = None
         return

      if icon_type.startswith(("application/", "image/", "text/plain")):
         bookmark.icon_href = icon_url
         self.log("   got icon       : %s" % icon_type)
         if icon_type.startswith(("application/", "text/plain")):
            self.log("   non-image content type, assume x-icon")
            icon_type = 'image/x-icon'
         bookmark.icon = "data:%s;base64,%s" % (icon_type, b64encode(icon_data))
         icons[icon_url] = (icon_type, bookmark.icon)
      else:
         self.log("   no icon        : bad content type '%s'" % icon_type)
         icons[icon_url] = None

   def _raise_html_redirect(self, refresh):
      """Raise RedirectException describing the page's refresh value.

      refresh is expected to look like "<seconds>; url=<target>".
      """
      try:
         url = refresh.split('=', 1)[1]
      except IndexError:
         url = "self"

      delay = refresh.split(';')[0]
      try:
         timeout = float(delay)
      except ValueError:
         raise RedirectException("html", "Bad redirect to %s (%s)" % (url, refresh))

      try:
         timeout = int(delay)
      except ValueError:
         pass # float timeout
      raise RedirectException("html", "%s (%s sec)" % (url, timeout))

   def finish_check_url(self, bookmark):
      """Record when the check started and how long it took, then clean up.

      Sets bookmark.last_tested to the start time saved by check_url and
      bookmark.test_time to the elapsed whole seconds, both as strings.
      """
      start = self.start
      bookmark.last_tested = str(start)

      now = int(time.time())
      bookmark.test_time = str(now - start)

      self.cleanup()