From: Oleg Broytman Date: Wed, 25 Aug 2010 12:58:25 +0000 (+0000) Subject: Moved html parsing from phd_pp.py to reindex_blog.py. X-Git-Url: https://git.phdru.name/?p=phdru.name%2Fphdru.name.git;a=commitdiff_plain;h=6922322a54539e39cbba59653868fa9d749fa274 Moved html parsing from phd_pp.py to reindex_blog.py. Make all URLs absolute for xml feeds. git-svn-id: file:///home/phd/archive/SVN/phdru.name/scripts@112 7bb0bf08-9e0d-0410-b083-99cee3bf18b8 --- diff --git a/phd_pp.py b/phd_pp.py index 95e90db..f9c9a67 100644 --- a/phd_pp.py +++ b/phd_pp.py @@ -1,7 +1,5 @@ import os, re, time, urllib -from HTMLParser import HTMLParseError from Cheetah.Template import Template -from m_lib.net.www.html import HTMLParser as _HTMLParser url_re = r"(((https?|ftp|gopher|telnet)://|(mailto|file|news|about|ed2k|irc|sip|magnet):)[^' \t<>\"]+|(www|web|w3)[A-Za-z0-9_-]*\.[A-Za-z0-9._-]+\.[^' \t<>\"]+)[A-Za-z0-9/]" @@ -38,6 +36,7 @@ class phd_pp(Template): return "%s, %s" % (start_year, this_year) return "%s-%s" % (start_year, this_year) + def body(self): if hasattr(self, "body_html"): body = self.body_html() @@ -45,7 +44,6 @@ class phd_pp(Template): body = self.text2html() if hasattr(self, "body_rst"): body = self.rst2html() - self.Body = body return body def text2html(self): @@ -85,20 +83,6 @@ class phd_pp(Template): parts = [part for part in (title, subtitle, body) if part] return "\n\n".join(parts) - def get_first_p(self): - parser = HTMLParser() - - try: - parser.feed(self.body()) - except (HTMLParseError, HTMLHeadDone): - pass - - try: - parser.close() - except (HTMLParseError, HTMLHeadDone): - pass - - return parser.first_p def img_thumbnail_800_1024(self, img_name): return """\ @@ -127,18 +111,3 @@ class phd_pp(Template): def quote_string(s, to_encoding="utf-8", ext_safe=''): return urllib.quote(unicode(s, "koi8-r").encode(to_encoding), '/' + ext_safe) - - -class HTMLHeadDone(Exception): pass - -class HTMLParser(_HTMLParser): - def __init__(self, charset=None): - 
_HTMLParser.__init__(self) - self.first_p = None - - def start_p(self, attrs): - self.accumulator = '<p>' - - def end_p(self): - self.first_p = self.accumulator + '</p>' - raise HTMLHeadDone() diff --git a/reindex_blog.py b/reindex_blog.py index fd82c18..beba7c1 100755 --- a/reindex_blog.py +++ b/reindex_blog.py @@ -39,9 +39,8 @@ else: blog = {} years = {} -# excerpts nd bodies are dictionaries mapping file => excerpt/body +# bodies is a dictionary mapping file => body -excerpts = {} bodies = {} # Walk the directory recursively @@ -56,7 +55,7 @@ for dirpath, dirs, files in os.walk(blog_root): template = Template(file=fullpath) title_parts = template.Title.split() title = ' '.join(title_parts[6:]) - lead = getattr(template, "Lead", None) + lead = template.Lead tags = template.Tag if isinstance(tags, basestring): @@ -84,8 +83,7 @@ for dirpath, dirs, files in os.walk(blog_root): file = file[:-len("tmpl")] + "html" key = (year, month, day, file) - excerpts[key] = template.get_first_p() - bodies[key] = template.Body + bodies[key] = template.body() # Need to save the blog? if blog <> old_blog: @@ -220,15 +218,11 @@ def write_template(level, year, month, day, titles, tags=None): else: new_text.append('\n

%s %s

' % (day, months_names_ru[int(month)])) save_date = year, month, day - if lead: - lead = lead + ' ' - else: - lead = '' new_text.append('''

%s%s.

-''' % (lead, href, title)) +''' % (lead+' ' if lead else '', href, title)) if level == 0: new_text.append(""" @@ -407,12 +401,8 @@ for i, (count, tag, links) in enumerate(all_tags): count = 0 for year, month, day, filename, title, lead in reversed(links): - if lead: - lead = lead + ' ' - else: - lead = '' link = "../%s/%s/%s/%s" % (year, month, day, filename) - item_text = """
  • %s/%s/%s: %s%s
  • """ % (link, year, month, day, lead, title) + item_text = """
  • %s/%s/%s: %s%s
  • """ % (link, year, month, day, lead+' ' if lead else '', title) count += 1 if count <= 5: @@ -437,6 +427,76 @@ $phd_pp_ru.respond(self) write_if_changed(os.path.join(blog_root, "tags", "index.tmpl"), ''.join(new_text)) +from HTMLParser import HTMLParseError +import cgi +from urlparse import urljoin +from m_lib.net.www.html import HTMLParser as _HTMLParser + +class HTMLDone(Exception): pass + + +class FirstPHTMLParser(_HTMLParser): + def __init__(self): + _HTMLParser.__init__(self) + self.first_p = None + + def start_p(self, attrs): + self.accumulator = '<p>' + + def end_p(self): + self.first_p = self.accumulator + '</p>' + raise HTMLDone() + +def get_first_p(body): + parser = FirstPHTMLParser() + + try: + parser.feed(body) + except (HTMLParseError, HTMLDone): + pass + + try: + parser.close() + except (HTMLParseError, HTMLDone): + pass + + return parser.first_p + + +class AbsURLHTMLParser(_HTMLParser): + def __init__(self, base): + _HTMLParser.__init__(self) + self.base = base + + def start_a(self, attrs): + self.accumulator += '