Split hrefs into domain and path components; recode only domain.

author Oleg Broytman <phd@phdru.name>

Tue, 12 Jul 2011 14:37:25 +0000 (14:37 +0000)

committer Oleg Broytman <phd@phdru.name>

Tue, 12 Jul 2011 14:37:25 +0000 (14:37 +0000)
author Oleg Broytman <phd@phdru.name>
Tue, 12 Jul 2011 14:37:25 +0000 (14:37 +0000)
committer Oleg Broytman <phd@phdru.name>
Tue, 12 Jul 2011 14:37:25 +0000 (14:37 +0000)
diff --git a/Writers/bkmk_whtml.py b/Writers/bkmk_whtml.py

index 309fdcfdd6114aeed19509c1efcd90cc1ed792aa..ad109f82686ac70c16e514212cbc5f278659efa8 100644 (file)
--- a/Writers/bkmk_whtml.py
+++ b/Writers/bkmk_whtml.py
@@ -49,7 +49,7 @@ class writer_html(Writer):
        self.outfile.write(ind_s*level + "</DL><p>\n")
  
     def bookmark(self, b, level):
-      self.outfile.write(ind_s*(level+1) + '<DT><A HREF="%s" ADD_DATE="%s"' % (b.href.encode(default_encoding), b.add_date))
+      self.outfile.write(ind_s*(level+1) + '<DT><A HREF="%s" ADD_DATE="%s"' % (b.href, b.add_date))
        if b.last_visit: self.outfile.write(' LAST_VISIT="%s"' % b.last_visit)
        self.outfile.write(' LAST_MODIFIED="%s"' % b.last_modified)
        if BKMK_FORMAT == "MOZILLA":
diff --git a/bkmk_objects.py b/bkmk_objects.py

index 00e186c794b8f1a32220c94b69fd7424bc25aeb4..be139edb3e205379733c8e7d38f8db42f2ce4c77 100644 (file)
--- a/bkmk_objects.py
+++ b/bkmk_objects.py
@@ -16,7 +16,7 @@ __all__ = ['Folder', 'Bookmark', 'Ruler', 'Walker', 'Writer', 'Robot',
  ]
  
  
-import os
+import os, urllib
  
  BKMK_FORMAT = os.environ.get("BKMK_FORMAT", "MOZILLA")
  
@@ -56,14 +56,45 @@ class Bookmark:
     isBookmark = 1
  
     def __init__(self, href, add_date, last_visit=None, last_modified=None,
-         keyword=None, comment='', icon_href=None, icon=None, charset=None):
-      if isinstance(href, str):
-         try:
-            href = href.decode('idna')
-         except UnicodeDecodeError: # Non-ascii href
-            href = href.decode('utf-8')
-      elif not isinstance(href, unicode):
-          raise TypeError("Bookmark's href must be str or unicode, not %r" % type(href))
+         keyword=None, comment='', icon_href=None, icon=None,
+         charset=None, parser_charset=None):
+      protocol, request = urllib.splittype(href)
+      user, password, port = None, None, None
+      host, path = urllib.splithost(request)
+      if host:
+         user, host = urllib.splituser(host)
+         if user:
+            user, password = urllib.splitpasswd(user)
+         host, port = urllib.splitport(host)
+         if port: port = int(port)
+      path, tag = urllib.splittag(path)
+      path, query = urllib.splitquery(path)
+      path = urllib.unquote(path)
+      if tag: tag = urllib.unquote_plus(tag)
+
+      if host: # host can be None for Mozilla's place: URLs
+          host = host.decode(parser_charset or 'utf-8').encode('idna')
+
+      href = protocol + "://"
+      if user:
+         href += urllib.quote(user)
+         if password:
+            href += ':' + urllib.quote(password)
+         href += '@'
+      if host:
+         href += host
+         if port:
+            href += ':%d' % port
+      if path:
+         if protocol == "file":
+            href += urllib.quote(path)
+         else:
+            href += urllib.quote(path)
+      if query:
+         href += '?' + query
+      if tag:
+         href += '#' + urllib.quote_plus(tag)
+
        self.href = href
        self.add_date = add_date
        self.last_visit = last_visit
diff --git a/bkmk_parser.py b/bkmk_parser.py

index 9d5bc0feeeeb465c01a85e997638d6d4a5547342..d4b6a2f32e05d6a8c3676f1415afa7ef1f441dfd 100644 (file)
--- a/bkmk_parser.py
+++ b/bkmk_parser.py
@@ -152,7 +152,8 @@ class BkmkParser(HTMLParser):
  
        debug("Bookmark points to: `%s'" % href)
        bookmark = Bookmark(href, add_date, last_visit, last_modified,
-         keyword=keyword, icon=icon, charset=charset)
+         keyword=keyword, icon=icon,
+         charset=charset, parser_charset=self.charset or default_encoding)
        self.current_object = bookmark
        self.current_folder.append(bookmark)
        self.urls += 1
diff --git a/doc/ANNOUNCE b/doc/ANNOUNCE

index 3a63157ec356d2d1c426ae4246865925093e6ab2..2c5ddca30ffa5fe9ff6f7bce4eefc9cc46ba4a1d 100644 (file)
--- a/doc/ANNOUNCE
+++ b/doc/ANNOUNCE
@@ -5,32 +5,10 @@ WHAT IS IT
     A set of classes, libraries, programs and plugins I use to manipulate my
  bookmarks.html.
  
-WHAT'S NEW in version 4.4.0 (2011-01-07).
+WHAT'S NEW in version 4.5.0 (2011-??-??).
  
-Moved BeautifulSoup.py and subproc.py from Robots/ to the top-level directory.
+Recode international domain names.
  
-Moved parse_html.py and its submodules to a separate parse_html package.
-
-Added statistics code to parse_html, got a statistics on parser
-success/failrure rate, reordered parsers.
-
-Removed old cruft.
-
-
-WHAT'S NEW in version 4.3.1 (2011-01-03).
-
-Get favicon before HTML redirect (refresh).
-
-Get favicon even if it's of a wrong type; many sites return favicon as
-text/plain or application/*; the only exception is text/html which is usually
-an error page instead of error 404.
-
-
-WHAT'S NEW in version 4.3.0 (2011-01-01).
-
-Robots no longer have one global temporary file - there are at least two
-(html and favicon), and in the future there will be more for
-asynchronous robot(s) that will test many URLs in parallel.
  
  
  WHERE TO GET
author	Oleg Broytman <phd@phdru.name>
	Tue, 12 Jul 2011 14:37:25 +0000 (14:37 +0000)
committer	Oleg Broytman <phd@phdru.name>
	Tue, 12 Jul 2011 14:37:25 +0000 (14:37 +0000)
Writers/bkmk_whtml.py		patch | blob | history
bkmk_objects.py		patch | blob | history
bkmk_parser.py		patch | blob | history
doc/ANNOUNCE		patch | blob | history