import sys
import socket
import time
-import urllib
-from urlparse import urljoin
+try:
+ from urllib.parse import splittype, splithost, splittag, urljoin
+except ImportError:
+ from urllib import splittype, splithost, splittag
+ from urlparse import urljoin
from m_lib.md5wrapper import md5wrapper
from m_lib.net.www.util import parse_time
# Short labels describing how a bookmark was relocated, keyed by HTTP
# redirect status code, plus "html" for a refresh found in the HTML page.
# set_redirect() interpolates the label into bookmark.moved.
reloc_dict = {
    301: "perm1.",  # Moved Permanently
    302: "temp2.",  # Found
    303: "temp3.",  # See Other
    307: "temp7.",  # Temporary Redirect
    308: "perm8.",  # Permanent Redirect: permanent like 301, not temporary
    "html": "html"
}
self.start = int(time.time())
bookmark.icon = None
- url_type, url_rest = urllib.splittype(bookmark.href)
- url_host, url_path = urllib.splithost(url_rest)
- url_path, url_tag = urllib.splittag(url_path) # noqa: E221
- # multiple spaces before operator
+ url_type, url_rest = splittype(bookmark.href)
+ url_host, url_path = splithost(url_rest)
+ url_path, url_tag = splittag(url_path) # noqa: E221
+ # multiple spaces before operator
url = "%s://%s%s" % (url_type, url_host, url_path)
- error, redirect_code, redirect_to, headers, content = self.get(bookmark, url, True)
+ error, redirect_code, redirect_to, headers, content = \
+ self.get(bookmark, url, True)
if error:
bookmark.error = error
bookmark.last_modified = last_modified
md5 = md5wrapper()
- if url_type == "ftp": # Pass welcome message through MD5
- md5.update(self.get_ftp_welcome())
-
- md5.update(content)
+ if url_type == "ftp": # Pass welcome message through MD5
+ ftp_welcome = self.get_ftp_welcome()
+ if not isinstance(ftp_welcome, bytes):
+ ftp_welcome = ftp_welcome.encode('utf-8')
+ md5.update(ftp_welcome)
+
+ if isinstance(content, bytes):
+ md5.update(content)
+ else:
+ md5.update(content.encode('utf-8'))
bookmark.md5 = str(md5)
if headers:
try:
content_type = headers["Content-Type"]
- self.log(" Content-Type: %s" % content_type)
+ self.log(" Content-Type : %s" % content_type)
+ if content_type is None:
+ if 'html' in content.lower():
+ content_type = 'text/html'
+ else:
+ content_type = 'text/plain'
+ self.log(" Set Content-Type to: %s"
+ % content_type)
try:
- # extract charset from "text/html; foo; charset=UTF-8, bar; baz;"
+ # extract charset from
+ # "text/html; foo; charset=UTF-8, bar; baz;"
content_type, charset = content_type.split(';', 1)
content_type = content_type.strip()
charset = charset.split('=')[1].strip().split(',')[0]
except (ValueError, IndexError):
charset = None
self.log(" no charset in Content-Type header")
+ is_html = False
for ctype in ("text/html", "application/xhtml+xml"):
if content_type.startswith(ctype):
- html = True
+ is_html = True
break
- else:
- html = False
- if html:
+ if content and is_html:
parser = parse_html(content, charset, self.log)
if parser:
bookmark.real_title = parser.title
icon = None
if not icon:
icon = "/favicon.ico"
- icon_url = urljoin("%s://%s%s" % (url_type, url_host, url_path), icon)
+ icon_url = urljoin(
+ "%s://%s%s" % (url_type, url_host, url_path), icon)
self.log(" looking for icon at: %s" % icon_url)
if icon_url in icons:
if icons[icon_url]:
try:
_icon_url = icon_url
for i in range(8):
- error, icon_redirect_code, icon_redirect_to, \
- icon_headers, icon_data = \
+ error, icon_redirect_code, \
+ icon_redirect_to, icon_headers, \
+ icon_data = \
self.get(bookmark, _icon_url)
if icon_redirect_code:
_icon_url = icon_redirect_to
- self.log(" redirect to : %s" % _icon_url)
+ self.log(" redirect to : %s"
+ % _icon_url)
else:
if icon_data is None:
raise IOError("No icon")
else:
raise IOError("Too many redirects")
except:
- etype, emsg, tb = sys.exc_info()
- self.log(" no icon : %s %s" % (etype, emsg))
- etype = emsg = tb = None
+ etype, emsg, _ = sys.exc_info()
+ self.log(" no icon : %s %s"
+ % (etype, emsg))
+ etype = emsg = _ = None
icons[icon_url] = None
else:
content_type = icon_headers["Content-Type"]
- if content_type.startswith("application/") \
- or content_type.startswith("image/") \
- or content_type.startswith("text/plain"):
+ if content_type and (
+ content_type.startswith("application/")
+ or content_type.startswith("image/")
+ or content_type.startswith("text/plain")
+ ):
bookmark.icon_href = icon_url
- self.log(" got icon : %s" % content_type)
- if content_type.startswith("application/") \
- or content_type.startswith("text/plain"):
- self.log(" non-image content type, assume x-icon")
+ self.log(" got icon : %s"
+ % content_type)
+ if (
+ content_type.startswith("application/")
+ or content_type.startswith(
+ "text/plain")
+ ):
+ self.log(" non-image content type,"
+ " assume x-icon")
content_type = 'image/x-icon'
- bookmark.icon = "data:%s;base64,%s" % (content_type, b64encode(icon_data))
- icons[icon_url] = (content_type, bookmark.icon)
+ if not isinstance(icon_data, bytes):
+ icon_data = icon_data.encode('utf-8')
+ bookmark.icon = "data:%s;base64,%s" \
+ % (content_type, b64encode(icon_data))
+ icons[icon_url] = (content_type,
+ bookmark.icon
+ )
else:
- self.log(" no icon : bad content type '%s'" % content_type)
+ self.log(" no icon :"
+ "bad content type '%s'"
+ % content_type
+ )
icons[icon_url] = None
if parser and parser.refresh:
refresh = parser.refresh
try:
timeout = float(refresh.split(';')[0])
except (IndexError, ValueError):
- self.set_redirect(bookmark, "html", "Bad redirect to %s (%s)" % (url, refresh))
+ self.set_redirect(bookmark, "html",
+ "Bad redirect to %s (%s)"
+ % (url, refresh)
+ )
else:
try:
timeout = int(refresh.split(';')[0])
except ValueError:
- pass # float timeout
- self.set_redirect(bookmark, "html", "%s (%s sec)" % (url, timeout))
-
+ pass # float timeout
+ self.set_redirect(bookmark, "html",
+ "%s (%s sec)"
+ % (url, timeout)
+ )
+
+ if not content:
+ self.log(" empty response, no content")
+ if not is_html:
+ self.log(" not html")
except KeyError as key:
self.log(" no header: %s" % key)
return 1
def set_redirect(self, bookmark, errcode, newurl):
    """Record on *bookmark* that it has moved and log the new location.

    errcode is looked up in reloc_dict (an HTTP redirect status code or
    "html") to obtain a short label; newurl is interpolated as given.
    The text "(<label>) to <newurl>" is stored in bookmark.moved and a
    loggable form of it is passed to self.log().

    Raises KeyError if errcode is not a key of reloc_dict.
    """
    bookmark.moved = moved = "(%s) to %s" % (reloc_dict[errcode], newurl)
    # Only a Python 2 ``unicode`` value needs encoding for the log.
    # A native ``str`` is logged untouched: on Python 3 encoding it would
    # turn it into bytes and "%s" would then print the b'...' repr, and on
    # Python 2 calling .encode() on a non-ASCII byte string raises
    # UnicodeDecodeError from the implicit ASCII decode.
    if not isinstance(moved, str):
        try:
            moved.encode('ascii')
        except UnicodeEncodeError:
            try:
                # bookmark.charset may be None or an unknown codec name,
                # hence TypeError/LookupError; fall back to UTF-8.
                moved = moved.encode(bookmark.charset)
            except (LookupError, TypeError, UnicodeEncodeError):
                moved = moved.encode('utf-8')
    self.log(' Moved: %s' % moved)
def finish_check_url(self, bookmark):
start = self.start