From d68f2dd8fdab1ec22049d14f274ec34a2ed8431f Mon Sep 17 00:00:00 2001
From: Oleg Broytman <phd@phdru.name>
Date: Tue, 24 Aug 2010 18:45:34 +0000
Subject: [PATCH] Put the first paragraph or the full body to xml feeds.

git-svn-id: file:///home/phd/archive/SVN/phdru.name/scripts@110 7bb0bf08-9e0d-0410-b083-99cee3bf18b8
---
 phd_pp.py       | 45 ++++++++++++++++++++++++++++++++++++++++-----
 reindex_blog.py | 45 +++++++++++++++++++++++++++++++++++++++++----
 2 files changed, 81 insertions(+), 9 deletions(-)

diff --git a/phd_pp.py b/phd_pp.py
index d731d38..95e90db 100644
--- a/phd_pp.py
+++ b/phd_pp.py
@@ -1,5 +1,8 @@
 import os, re, time, urllib
+from HTMLParser import HTMLParseError
 from Cheetah.Template import Template
+from m_lib.net.www.html import HTMLParser as _HTMLParser
+
 
 url_re = r"(((https?|ftp|gopher|telnet)://|(mailto|file|news|about|ed2k|irc|sip|magnet):)[^' \t<>\"]+|(www|web|w3)[A-Za-z0-9_-]*\.[A-Za-z0-9._-]+\.[^' \t<>\"]+)[A-Za-z0-9/]"
 
@@ -7,6 +10,7 @@ def _url2href(match):
     url = match.group(0)
     return '<a href="%s">%s</a>' % (url, url)
 
+
 full_dirs = len(os.getcwd().split('/')) + 1
 
 class phd_pp(Template):
@@ -36,11 +40,13 @@ class phd_pp(Template):
 
     def body(self):
         if hasattr(self, "body_html"):
-            return self.body_html()
+            body = self.body_html()
         if hasattr(self, "body_text"):
-            return self.text2html()
+            body = self.text2html()
         if hasattr(self, "body_rst"):
-            return self.rst2html()
+            body = self.rst2html()
+        self.Body = body
+        return body
 
     def text2html(self):
         body = re.sub(url_re, _url2href, self.body_text())
@@ -63,8 +69,7 @@ class phd_pp(Template):
 
     def rst2html(self):
         from docutils.core import publish_parts
-        from locale import getpreferredencoding
-        encoding = getpreferredencoding()
+        from m_lib.defenc import default_encoding as encoding
 
         parts = publish_parts(self.body_rst(), writer_name="html")
 
@@ -80,6 +85,21 @@ class phd_pp(Template):
         parts = [part for part in (title, subtitle, body) if part]
         return "\n\n".join(parts)
 
+    def get_first_p(self):
+        parser = HTMLParser()
+
+        try:
+            parser.feed(self.body())
+        except (HTMLParseError, HTMLHeadDone):
+            pass
+
+        try:
+            parser.close()
+        except (HTMLParseError, HTMLHeadDone):
+            pass
+
+        return parser.first_p
+
     def img_thumbnail_800_1024(self, img_name):
         return """\
 %(img_name)s-thumbnail.jpg
@@ -107,3 +127,18 @@ class phd_pp(Template):
 
 def quote_string(s, to_encoding="utf-8", ext_safe=''):
     return urllib.quote(unicode(s, "koi8-r").encode(to_encoding), '/' + ext_safe)
+
+
+class HTMLHeadDone(Exception): pass
+
+class HTMLParser(_HTMLParser):
+    def __init__(self, charset=None):
+        _HTMLParser.__init__(self)
+        self.first_p = None
+
+    def start_p(self, attrs):
+        self.accumulator = "<p>"
+
+    def end_p(self):
+        self.first_p = self.accumulator + "</p>"
+        raise HTMLHeadDone()
diff --git a/reindex_blog.py b/reindex_blog.py
index 0ba0804..fd82c18 100755
--- a/reindex_blog.py
+++ b/reindex_blog.py
@@ -39,6 +39,11 @@ else:
 blog = {}
 years = {}
 
+# excerpts and bodies are dictionaries mapping file => excerpt/body
+
+excerpts = {}
+bodies = {}
+
 # Walk the directory recursively
 for dirpath, dirs, files in os.walk(blog_root):
     d = os.path.basename(dirpath)
@@ -77,6 +82,10 @@ for dirpath, dirs, files in os.walk(blog_root):
         if day not in days:
             days.append(day)
 
+        file = file[:-len("tmpl")] + "html"
+        key = (year, month, day, file)
+        excerpts[key] = template.get_first_p()
+        bodies[key] = template.Body
 
 # Need to save the blog?
 if blog <> old_blog:
@@ -130,8 +139,12 @@ def write_template(level, year, month, day, titles, tags=None):
 #attr $Title = "Oleg Broytman's blog"
 #attr $Description = "Broytman Russian Blog Index Document"
 #attr $Copyright = %(cyear)s
-#attr $alternates = (("News [Atom 1.0]", "application/atom+xml", "atom_10.xml"),
-                     ("News [RSS 2.0]", "application/rss+xml", "rss_20.xml")
+#attr $alternates = (("News [Atom 1.0] titles only", "application/atom+xml", "atom_10_titles.xml"),
+                     ("News [Atom 1.0]", "application/atom+xml", "atom_10.xml"),
+                     ("News [Atom 1.0] full texts", "application/atom+xml", "atom_10_full.xml"),
+                     ("News [RSS 2.0] titles only", "application/rss+xml", "rss_20_titles.xml"),
+                     ("News [RSS 2.0]", "application/rss+xml", "rss_20.xml"),
+                     ("News [RSS 2.0] full texts", "application/rss+xml", "rss_20_full.xml"),
 )
 ##
 #def body_html
@@ -222,8 +235,14 @@ def write_template(level, year, month, day, titles, tags=None):
 
 News feed in the formats
-Atom 1.0 RSS 2.0 .
+
+Atom 1.0 titles only /
+Atom 1.0 /
+Atom 1.0 full texts
+RSS 2.0 titles only /
+RSS 2.0 /
+RSS 2.0 full texts.
 
 """)
@@ -442,6 +461,8 @@ for item in tuple(reversed(all_titles_tags))[:10]:
     items.append(item)
     item.baseURL = baseURL
     item.categoryList = tags
+    item.excerpt = excerpts[(year, month, day, file)]
+    item.body = bodies[(year, month, day, file)]
 
 namespace = {
     "title": "Oleg Broytman's blog",
@@ -462,3 +483,19 @@ atom_tmpl = str(atom_10(searchList=[namespace]))
 write_if_changed(os.path.join(blog_root, "atom_10.xml"), atom_tmpl)
 rss_tmpl = str(rss_20(searchList=[namespace]))
 write_if_changed(os.path.join(blog_root, "rss_20.xml"), rss_tmpl)
+
+for item in items:
+    item.excerpt = None
+
+atom_tmpl = str(atom_10(searchList=[namespace]))
+write_if_changed(os.path.join(blog_root, "atom_10_titles.xml"), atom_tmpl)
+rss_tmpl = str(rss_20(searchList=[namespace]))
+write_if_changed(os.path.join(blog_root, "rss_20_titles.xml"), rss_tmpl)
+
+for item in items:
+    item.content = item.body
+
+atom_tmpl = str(atom_10(searchList=[namespace]))
+write_if_changed(os.path.join(blog_root, "atom_10_full.xml"), atom_tmpl)
+rss_tmpl = str(rss_20(searchList=[namespace]))
+write_if_changed(os.path.join(blog_root, "rss_20_full.xml"), rss_tmpl)
-- 
2.39.5