From d68f2dd8fdab1ec22049d14f274ec34a2ed8431f Mon Sep 17 00:00:00 2001
From: Oleg Broytman
Date: Tue, 24 Aug 2010 18:45:34 +0000
Subject: [PATCH] Put the first paragraph or the full body to xml feeds.

git-svn-id: file:///home/phd/archive/SVN/phdru.name/scripts@110 7bb0bf08-9e0d-0410-b083-99cee3bf18b8
---
 phd_pp.py       | 45 ++++++++++++++++++++++++++++++++++++++++-----
 reindex_blog.py | 45 +++++++++++++++++++++++++++++++++++++++++----
 2 files changed, 81 insertions(+), 9 deletions(-)

diff --git a/phd_pp.py b/phd_pp.py
index d731d38..95e90db 100644
--- a/phd_pp.py
+++ b/phd_pp.py
@@ -1,5 +1,8 @@
 import os, re, time, urllib
+from HTMLParser import HTMLParseError
 from Cheetah.Template import Template
+from m_lib.net.www.html import HTMLParser as _HTMLParser
+
 
 url_re = r"(((https?|ftp|gopher|telnet)://|(mailto|file|news|about|ed2k|irc|sip|magnet):)[^' \t<>\"]+|(www|web|w3)[A-Za-z0-9_-]*\.[A-Za-z0-9._-]+\.[^' \t<>\"]+)[A-Za-z0-9/]"
 
@@ -7,6 +10,7 @@ def _url2href(match):
     url = match.group(0)
     return '<a href="%s">%s</a>' % (url, url)
 
+
 full_dirs = len(os.getcwd().split('/')) + 1
 
 class phd_pp(Template):
@@ -36,11 +40,13 @@ class phd_pp(Template):
 
     def body(self):
         if hasattr(self, "body_html"):
-            return self.body_html()
+            body = self.body_html()
         if hasattr(self, "body_text"):
-            return self.text2html()
+            body = self.text2html()
         if hasattr(self, "body_rst"):
-            return self.rst2html()
+            body = self.rst2html()
+        self.Body = body
+        return body
 
     def text2html(self):
         body = re.sub(url_re, _url2href, self.body_text())
@@ -63,8 +69,7 @@ class phd_pp(Template):
 
     def rst2html(self):
         from docutils.core import publish_parts
-        from locale import getpreferredencoding
-        encoding = getpreferredencoding()
+        from m_lib.defenc import default_encoding as encoding
 
         parts = publish_parts(self.body_rst(), writer_name="html")
 
@@ -80,6 +85,21 @@ class phd_pp(Template):
         parts = [part for part in (title, subtitle, body) if part]
         return "\n\n".join(parts)
 
+    def get_first_p(self):
+        parser = HTMLParser()
+
+        try:
+            parser.feed(self.body())
+        except (HTMLParseError, HTMLHeadDone):
+            pass
+
+        try:
+            parser.close()
+        except (HTMLParseError, HTMLHeadDone):
+            pass
+
+        return parser.first_p
+
     def img_thumbnail_800_1024(self, img_name):
         return """\
 %(img_name)s-thumbnail.jpg
@@ -107,3 +127,18 @@ class phd_pp(Template):
 
 def quote_string(s, to_encoding="utf-8", ext_safe=''):
     return urllib.quote(unicode(s, "koi8-r").encode(to_encoding), '/' + ext_safe)
+
+
+class HTMLHeadDone(Exception): pass
+
+class HTMLParser(_HTMLParser):
+    def __init__(self, charset=None):
+        _HTMLParser.__init__(self)
+        self.first_p = None
+
+    def start_p(self, attrs):
+        self.accumulator = '<p>'
+
+    def end_p(self):
+        self.first_p = self.accumulator + '</p>'
+        raise HTMLHeadDone()
diff --git a/reindex_blog.py b/reindex_blog.py
index 0ba0804..fd82c18 100755
--- a/reindex_blog.py
+++ b/reindex_blog.py
@@ -39,6 +39,11 @@ else:
 blog = {}
 years = {}
 
+# excerpts and bodies are dictionaries mapping file => excerpt/body
+
+excerpts = {}
+bodies = {}
+
 # Walk the directory recursively
 for dirpath, dirs, files in os.walk(blog_root):
     d = os.path.basename(dirpath)
@@ -77,6 +82,10 @@ for dirpath, dirs, files in os.walk(blog_root):
         if day not in days:
             days.append(day)
+        file = file[:-len("tmpl")] + "html"
+        key = (year, month, day, file)
+        excerpts[key] = template.get_first_p()
+        bodies[key] = template.Body
 
 
 # Need to save the blog?
 if blog <> old_blog:
@@ -130,8 +139,12 @@ def write_template(level, year, month, day, titles, tags=None):
 #attr $Title = "Oleg Broytman's blog"
 #attr $Description = "Broytman Russian Blog Index Document"
 #attr $Copyright = %(cyear)s
-#attr $alternates = (("News [Atom 1.0]", "application/atom+xml", "atom_10.xml"),
-                     ("News [RSS 2.0]", "application/rss+xml", "rss_20.xml")
+#attr $alternates = (("Новости [Atom 1.0] только заголовки", "application/atom+xml", "atom_10_titles.xml"),
+                     ("Новости [Atom 1.0]", "application/atom+xml", "atom_10.xml"),
+                     ("Новости [Atom 1.0] полные тексты", "application/atom+xml", "atom_10_full.xml"),
+                     ("Новости [RSS 2.0] только заголовки", "application/rss+xml", "rss_20_titles.xml"),
+                     ("Новости [RSS 2.0]", "application/rss+xml", "rss_20.xml"),
+                     ("Новости [RSS 2.0] полные тексты", "application/rss+xml", "rss_20_full.xml"),
 )
 ##
 #def body_html
@@ -222,8 +235,14 @@ def write_template(level, year, month, day, titles, tags=None):

 Новостевая лента в форматах
-<a href="atom_10.xml">Atom 1.0</a> <a href="rss_20.xml">RSS 2.0</a>.
+
+<a href="atom_10_titles.xml">Atom 1.0 только заголовки</a> /
+<a href="atom_10.xml">Atom 1.0</a> /
+<a href="atom_10_full.xml">Atom 1.0 полные тексты</a>
+<a href="rss_20_titles.xml">RSS 2.0 только заголовки</a> /
+<a href="rss_20.xml">RSS 2.0</a> /
+<a href="rss_20_full.xml">RSS 2.0 полные тексты</a>.
 
 """)
@@ -442,6 +461,8 @@ for item in tuple(reversed(all_titles_tags))[:10]:
     items.append(item)
     item.baseURL = baseURL
     item.categoryList = tags
+    item.excerpt = excerpts[(year, month, day, file)]
+    item.body = bodies[(year, month, day, file)]
 
 namespace = {
     "title": "Oleg Broytman's blog",
@@ -462,3 +483,19 @@ atom_tmpl = str(atom_10(searchList=[namespace]))
 write_if_changed(os.path.join(blog_root, "atom_10.xml"), atom_tmpl)
 rss_tmpl = str(rss_20(searchList=[namespace]))
 write_if_changed(os.path.join(blog_root, "rss_20.xml"), rss_tmpl)
+
+
+for item in items:
+    item.excerpt = None
+
+atom_tmpl = str(atom_10(searchList=[namespace]))
+write_if_changed(os.path.join(blog_root, "atom_10_titles.xml"), atom_tmpl)
+rss_tmpl = str(rss_20(searchList=[namespace]))
+write_if_changed(os.path.join(blog_root, "rss_20_titles.xml"), rss_tmpl)
+
+for item in items:
+    item.content = item.body
+
+atom_tmpl = str(atom_10(searchList=[namespace]))
+write_if_changed(os.path.join(blog_root, "atom_10_full.xml"), atom_tmpl)
+rss_tmpl = str(rss_20(searchList=[namespace]))
+write_if_changed(os.path.join(blog_root, "rss_20_full.xml"), rss_tmpl)
-- 
2.39.2