X-Git-Url: https://git.phdru.name/?p=bookmarks_db.git;a=blobdiff_plain;f=Robots%2Fbkmk_robot_base.py;h=0600e7338d2b71c41492d73858acf72d0818ae84;hp=8dd032b714e076a75f4bd38ab8db10d65ed45830;hb=c88cb7a75e7caf1d67466cfa107981d95115fa0c;hpb=a04eaa0346e8aa5ad86a195f8f4d36487ebfe09c diff --git a/Robots/bkmk_robot_base.py b/Robots/bkmk_robot_base.py index 8dd032b..0600e73 100644 --- a/Robots/bkmk_robot_base.py +++ b/Robots/bkmk_robot_base.py @@ -35,204 +35,204 @@ reloc_dict = { def get_error(e): - if isinstance(e, str): - return e + if isinstance(e, str): + return e - else: - s = [] - for i in e: - s.append("'%s'" % str(i).replace('\n', "\\n")) - return "(%s)" % ' '.join(s) + else: + s = [] + for i in e: + s.append("'%s'" % str(i).replace('\n', "\\n")) + return "(%s)" % ' '.join(s) icons = {} # Icon cache; maps URL to a tuple (content type, data) # or None if there is no icon. class robot_base(Robot): - timeout = 60 + timeout = 60 - def __init__(self, *args, **kw): + def __init__(self, *args, **kw): Robot.__init__(self, *args, **kw) socket.setdefaulttimeout(int(self.timeout)) - def check_url(self, bookmark): - try: - self.start = int(time.time()) - bookmark.icon = None + def check_url(self, bookmark): + try: + self.start = int(time.time()) + bookmark.icon = None - url_type, url_rest = urllib.splittype(bookmark.href) - url_host, url_path = urllib.splithost(url_rest) - url_path, url_tag = urllib.splittag(url_path) + url_type, url_rest = urllib.splittype(bookmark.href) + url_host, url_path = urllib.splithost(url_rest) + url_path, url_tag = urllib.splittag(url_path) - url = "%s://%s%s" % (url_type, url_host, url_path) - error, redirect_code, redirect_to, headers, content = self.get(bookmark, url, True) + url = "%s://%s%s" % (url_type, url_host, url_path) + error, redirect_code, redirect_to, headers, content = self.get(bookmark, url, True) - if error: - bookmark.error = error - return 1 + if error: + bookmark.error = error + return 1 - if redirect_code: - self.set_redirect(bookmark, redirect_code, redirect_to) - return 1 + if redirect_code: + self.set_redirect(bookmark, redirect_code, redirect_to) + return 1 - size = 0 - last_modified = None + size = 0 + last_modified = None - if headers: - try: - size = headers["Content-Length"] - except KeyError: - size = len(content) + if headers: + try: + size = headers["Content-Length"] + except KeyError: + size = len(content) - try: - last_modified = headers["Last-Modified"] - except KeyError: - pass + try: + last_modified = headers["Last-Modified"] + except KeyError: + pass + + if last_modified: + last_modified = parse_time(last_modified) + else: + size = len(content) if last_modified: - last_modified = parse_time(last_modified) - else: - size = len(content) - - if last_modified: - last_modified = str(int(last_modified)) - else: - last_modified = bookmark.last_visit - - bookmark.size = size - bookmark.last_modified = last_modified - - md5 = md5wrapper() - if url_type == "ftp": # Pass welcome message through MD5 - md5.update(self.get_ftp_welcome()) - - md5.update(content) - bookmark.md5 = str(md5) - - if headers: - try: - content_type = headers["Content-Type"] - self.log(" Content-Type: %s" % content_type) - try: - # extract charset from "text/html; foo; charset=UTF-8, bar; baz;" - content_type, charset = content_type.split(';', 1) - content_type = content_type.strip() - charset = charset.split('=')[1].strip().split(',')[0] - self.log(" HTTP charset : %s" % charset) - except (ValueError, IndexError): - charset = None - self.log(" no charset in Content-Type header") - for ctype in ("text/html", "application/xhtml+xml"): - if content_type.startswith(ctype): - html = True - break - else: - html = False - if html: - parser = parse_html(content, charset, self.log) - if parser: - bookmark.real_title = parser.title - icon = parser.icon - else: - icon = None - if not icon: - icon = "/favicon.ico" - icon_url = urljoin("%s://%s%s" % (url_type, url_host, url_path), icon) - self.log(" looking for icon at: %s" % icon_url) - if icon_url in icons: - if icons[icon_url]: - bookmark.icon_href = icon_url - content_type, bookmark.icon = icons[icon_url] - self.log(" cached icon: %s" % content_type) - else: - self.log(" cached icon: no icon") - else: - try: - _icon_url = icon_url - for i in range(8): - error, icon_redirect_code, icon_redirect_to, \ - icon_headers, icon_data = \ - self.get(bookmark, _icon_url) - if icon_redirect_code: - _icon_url = icon_redirect_to - self.log(" redirect to : %s" % _icon_url) - else: - if icon_data is None: - raise IOError("No icon") - break + last_modified = str(int(last_modified)) + else: + last_modified = bookmark.last_visit + + bookmark.size = size + bookmark.last_modified = last_modified + + md5 = md5wrapper() + if url_type == "ftp": # Pass welcome message through MD5 + md5.update(self.get_ftp_welcome()) + + md5.update(content) + bookmark.md5 = str(md5) + + if headers: + try: + content_type = headers["Content-Type"] + self.log(" Content-Type: %s" % content_type) + try: + # extract charset from "text/html; foo; charset=UTF-8, bar; baz;" + content_type, charset = content_type.split(';', 1) + content_type = content_type.strip() + charset = charset.split('=')[1].strip().split(',')[0] + self.log(" HTTP charset : %s" % charset) + except (ValueError, IndexError): + charset = None + self.log(" no charset in Content-Type header") + for ctype in ("text/html", "application/xhtml+xml"): + if content_type.startswith(ctype): + html = True + break + else: + html = False + if html: + parser = parse_html(content, charset, self.log) + if parser: + bookmark.real_title = parser.title + icon = parser.icon else: - raise IOError("Too many redirects") - except: - etype, emsg, tb = sys.exc_info() - self.log(" no icon : %s %s" % (etype, emsg)) - etype = emsg = tb = None - icons[icon_url] = None - else: - content_type = icon_headers["Content-Type"] - if content_type.startswith("application/") \ - or content_type.startswith("image/") \ - or content_type.startswith("text/plain"): - bookmark.icon_href = icon_url - self.log(" got icon : %s" % content_type) - if content_type.startswith("application/") \ - or content_type.startswith("text/plain"): - self.log(" non-image content type, assume x-icon") - content_type = 'image/x-icon' - bookmark.icon = "data:%s;base64,%s" % (content_type, b64encode(icon_data)) - icons[icon_url] = (content_type, bookmark.icon) + icon = None + if not icon: + icon = "/favicon.ico" + icon_url = urljoin("%s://%s%s" % (url_type, url_host, url_path), icon) + self.log(" looking for icon at: %s" % icon_url) + if icon_url in icons: + if icons[icon_url]: + bookmark.icon_href = icon_url + content_type, bookmark.icon = icons[icon_url] + self.log(" cached icon: %s" % content_type) + else: + self.log(" cached icon: no icon") else: - self.log(" no icon : bad content type '%s'" % content_type) - icons[icon_url] = None - if parser and parser.refresh: - refresh = parser.refresh - try: - url = refresh.split('=', 1)[1] - except IndexError: - url = "self" - try: - timeout = float(refresh.split(';')[0]) - except (IndexError, ValueError): - self.set_redirect(bookmark, "html", "Bad redirect to %s (%s)" % (url, refresh)) - else: - try: - timeout = int(refresh.split(';')[0]) - except ValueError: - pass # float timeout - self.set_redirect(bookmark, "html", "%s (%s sec)" % (url, timeout)) - - except KeyError as key: - self.log(" no header: %s" % key) - - except EOFError: - bookmark.error = "Unexpected EOF (FTP server closed connection)" - self.log(' EOF: %s' % bookmark.error) - - except KeyboardInterrupt: - self.log("Keyboard interrupt (^C)") - return 0 - - except socket.error as e: - bookmark.error = get_error(e) - self.log(bookmark.error) - - except: - import traceback - traceback.print_exc() - bookmark.error = "Exception!" - self.log(' Exception: %s' % bookmark.error) - - finally: - self.finish_check_url(bookmark) - - # Tested - return 1 - - def set_redirect(self, bookmark, errcode, newurl): + try: + _icon_url = icon_url + for i in range(8): + error, icon_redirect_code, icon_redirect_to, \ + icon_headers, icon_data = \ + self.get(bookmark, _icon_url) + if icon_redirect_code: + _icon_url = icon_redirect_to + self.log(" redirect to : %s" % _icon_url) + else: + if icon_data is None: + raise IOError("No icon") + break + else: + raise IOError("Too many redirects") + except: + etype, emsg, tb = sys.exc_info() + self.log(" no icon : %s %s" % (etype, emsg)) + etype = emsg = tb = None + icons[icon_url] = None + else: + content_type = icon_headers["Content-Type"] + if content_type.startswith("application/") \ + or content_type.startswith("image/") \ + or content_type.startswith("text/plain"): + bookmark.icon_href = icon_url + self.log(" got icon : %s" % content_type) + if content_type.startswith("application/") \ + or content_type.startswith("text/plain"): + self.log(" non-image content type, assume x-icon") + content_type = 'image/x-icon' + bookmark.icon = "data:%s;base64,%s" % (content_type, b64encode(icon_data)) + icons[icon_url] = (content_type, bookmark.icon) + else: + self.log(" no icon : bad content type '%s'" % content_type) + icons[icon_url] = None + if parser and parser.refresh: + refresh = parser.refresh + try: + url = refresh.split('=', 1)[1] + except IndexError: + url = "self" + try: + timeout = float(refresh.split(';')[0]) + except (IndexError, ValueError): + self.set_redirect(bookmark, "html", "Bad redirect to %s (%s)" % (url, refresh)) + else: + try: + timeout = int(refresh.split(';')[0]) + except ValueError: + pass # float timeout + self.set_redirect(bookmark, "html", "%s (%s sec)" % (url, timeout)) + + except KeyError as key: + self.log(" no header: %s" % key) + + except EOFError: + bookmark.error = "Unexpected EOF (FTP server closed connection)" + self.log(' EOF: %s' % bookmark.error) + + except KeyboardInterrupt: + self.log("Keyboard interrupt (^C)") + return 0 + + except socket.error as e: + bookmark.error = get_error(e) + self.log(bookmark.error) + + except: + import traceback + traceback.print_exc() + bookmark.error = "Exception!" + self.log(' Exception: %s' % bookmark.error) + + finally: + self.finish_check_url(bookmark) + + # Tested + return 1 + + def set_redirect(self, bookmark, errcode, newurl): bookmark.moved = "(%s) to %s" % (reloc_dict[errcode], newurl) self.log(' Moved: %s' % bookmark.moved) - def finish_check_url(self, bookmark): - start = self.start - bookmark.last_tested = str(start) - now = int(time.time()) - bookmark.test_time = str(now - start) + def finish_check_url(self, bookmark): + start = self.start + bookmark.last_tested = str(start) + now = int(time.time()) + bookmark.test_time = str(now - start)