X-Git-Url: https://git.phdru.name/?p=phdru.name%2Fphdru.name.git;a=blobdiff_plain;f=reindex_blog.py;h=291e182eee8ddffff655114f347e50b9da54b97a;hp=f554bc32a85323a737036999eb563007ef9f688f;hb=HEAD;hpb=338269ecdb539e349db2e0237dbefc70906320e2

diff --git a/reindex_blog.py b/reindex_blog.py
index f554bc3..0b7c2a6 100755
--- a/reindex_blog.py
+++ b/reindex_blog.py
@@ -1,34 +1,21 @@
-#! /usr/local/bin/python -O
+#! /usr/bin/env python
 # -*- coding: koi8-r -*-
-__version__ = "$Revision$"[11:-2]
-__revision__ = "$Id$"[5:-2]
-__date__ = "$Date$"[7:-2]
-__author__ = "Oleg BroytMann "
-__copyright__ = "Copyright (C) 2006 PhiloSoft Design"
-
+__author__ = "Oleg Broytman "
+__copyright__ = "Copyright (C) 2006-2024 PhiloSoft Design"

 import sys, os
-
 try:
-    import cPickle as pickle
+    from urllib.parse import quote
 except ImportError:
-    import pickle
+    from urllib import quote

 from Cheetah.Template import Template
+from Cheetah.compat import string_type
+from blog_db import blog_root, load_blog, save_blog
-# Load old blog
-
-blog_filename = sys.argv[1]
-try:
-    blog_file = open(blog_filename, "rb")
-except IOError:
-    old_blog = {}
-else:
-    old_blog = pickle.load(blog_file)
-    blog_file.close()
-
+old_blog = load_blog()

 # blog is a dictionary mapping
 # (year, month, day) => [list of (file, title, lead, tags)]
@@ -36,27 +23,31 @@ else:
 blog = {}
 years = {}
+# bodies is a dictionary mapping file => body
+
+bodies = {}
+
 # Walk the directory recursively
-for dirpath, dirs, files in os.walk(os.curdir):
+for dirpath, dirs, files in os.walk(blog_root):
     d = os.path.basename(dirpath)
     if not d.startswith("20") and not d.isdigit():
         continue
     for file in files:
-        # Ignore index.tmpl and *.html files; supose all other files are *.tmpl
-        if file == "index.tmpl" or file.endswith(".html"):
+        if not file.endswith(".tmpl"):
             continue
         fullpath = os.path.join(dirpath, file)
         template = Template(file=fullpath)
-        title_parts = template.Title.split()
+        title_parts = template.Title.decode('utf-8').encode('koi8-r').split()
         title = ' '.join(title_parts[6:])
-        lead = getattr(template, "Lead", None)
+        lead = template.Lead.decode('utf-8').encode('koi8-r')
-        tags = getattr(template, "Tag", None)
-        if isinstance(tags, basestring):
+        tags = template.Tag
+        if isinstance(tags, string_type):
             tags = (tags,)
+        tags = [tag.decode('utf-8').encode('koi8-r') for tag in tags]
         if title:
-            key = year, month, day = tuple(dirpath.split(os.sep)[1:])
+            key = year, month, day = tuple(dirpath[len(blog_root):].split(os.sep)[1:])
             if key in blog:
                 days = blog[key]
             else:
@@ -75,49 +66,45 @@ for dirpath, dirs, files in os.walk(os.curdir):
             if day not in days:
                 days.append(day)
+            file = file[:-len("tmpl")] + "html"
+            key = (year, month, day, file)
+            body = template.body()
+            if isinstance(body, unicode):
+                body = body.encode('koi8-r')
+            bodies[key] = body

 # Need to save the blog?
-if blog <> old_blog:
-    blog_file = open(blog_filename, "wb")
-    pickle.dump(blog, blog_file, pickle.HIGHEST_PROTOCOL)
-    blog_file.close()
-
+if blog != old_blog:
+    save_blog(blog)

 # Localized month names

 import locale
-locale.setlocale(locale.LC_ALL, '')
-from calendar import _localized_day, _localized_month
+locale.setlocale(locale.LC_ALL, "ru_RU.KOI8-R")
+from calendar import _localized_month

 locale.setlocale(locale.LC_TIME, 'C')
 months_names_en = list(_localized_month('%B'))
 months_abbrs_en = list(_localized_month('%b'))

-locale.setlocale(locale.LC_TIME, '')
-months_names_ru = [month.lower() for month in _localized_month('%B')]
+locale.setlocale(locale.LC_TIME, "ru_RU.KOI8-R")
+# months_names_ru = list(_localized_month('%B'))
+
+months_names_ru = ['', "января", "февраля", "марта", "апреля", "мая", "июня",
+    "июля", "августа", "сентября", "октября", "ноября", "декабря"
+]

 months_names_ru0 = ['', "январь", "февраль", "март", "апрель", "май", "июнь",
     "июль", "август", "сентябрь", "октябрь", "ноябрь", "декабрь"
 ]

+from news import write_if_changed
-def write_if_changed(filename, new_text):
-    try:
-        infile = open(filename, 'r')
-        old_text = infile.read()
-        infile.close()
-    except IOError:
-        old_text = None
-
-    if old_text <> new_text:
-        print "Writing", filename
-        outfile = open(filename, 'w')
-        outfile.write(new_text)
-        outfile.close()
-
+def encode_tag(tag):
+    return quote(tag.replace(' ', '_'))
-def write_template(level, year, month, day, titles):
-    path = []
+def write_template(level, year, month, day, titles, tags=None):
+    path = [blog_root]
     if level >= 1:
         path.append(year)
     if level >= 2:
@@ -129,39 +116,47 @@ def write_template(level, year, month, day, titles):
     new_text = ["""\
## THIS FILE IS AUTOMATICALLY GENERATED. DO NOT EDIT.
-#extends phd_pp_ru
+#encoding koi8-r
+#extends phd_site
 #implements respond
"""]

     if level == 0:
         new_text.append("""\
-#attr $Title = "Oleg BroytMann's blog"
-#attr $Description = "BroytMann Russian Blog Index Document"
 #attr $Copyright = %(cyear)s
+#attr $alternates = (("Новости [Atom 1.0] только заголовки", "application/atom+xml", "atom_10_titles.xml"),
+                     ("Новости [Atom 1.0]", "application/atom+xml", "atom_10.xml"),
+                     ("Новости [Atom 1.0] полные тексты", "application/atom+xml", "atom_10_full.xml"),
+                     ("Новости [RSS 2.0] только заголовки", "application/rss+xml", "rss_20_titles.xml"),
+                     ("Новости [RSS 2.0]", "application/rss+xml", "rss_20.xml"),
+                     ("Новости [RSS 2.0] полные тексты", "application/rss+xml", "rss_20_full.xml"),
+)
 ##
 #def body_html
-
Журнал
+
Журнал
""" % {"cyear": year or 2005}) elif level == 1: new_text.append("""\ -#attr $Title = "Oleg BroytMann's blog: %(year)s" -#attr $Description = "BroytMann Russian Blog %(year)s Index Document" +#attr $Title = "Oleg Broytman's blog: %(year)s" +#attr $Description = "Broytman Russian Blog %(year)s Index Document" #attr $Copyright = %(cyear)s ## #def body_html -

öÕÒÎÁÌ: %(year)s

+

öÕÒÎÁÌ: %(year)s

""" % {"year": year, "cyear": year or 2005}) elif level == 2: imonth = int(month) new_text.append("""\ -#attr $Title = "Oleg BroytMann's blog: %(month_abbr_en)s %(year)s" -#attr $Description = "BroytMann Russian Blog %(month_name_en)s %(year)s Index Document" +#attr $Title = "Oleg Broytman's blog: %(month_abbr_en)s %(year)s" +#attr $Description = "Broytman Russian Blog %(month_name_en)s %(year)s Index Document" #attr $Copyright = %(cyear)s ## #def body_html -

öÕÒÎÁÌ: %(month_name_ru0)s %(year)s

+

öÕÒÎÁÌ: %(month_name_ru0)s %(year)s

""" % { "year": year, "cyear": year or 2005, "month_abbr_en": months_abbrs_en[imonth], "month_name_en": months_names_en[imonth], @@ -172,34 +167,29 @@ def write_template(level, year, month, day, titles): iday = int(day) imonth = int(month) - new_text.append("""\ -#attr $Next = "%s" -""" % titles[0][3]) - - if len(titles) == 1: new_text.append("""\ -#attr $refresh = "0; URL=%s" +#attr $Refresh = "0; URL=%s" """ % titles[0][3]) new_text.append("""\ -#attr $Title = "Oleg BroytMann's blog: %(day)d %(month_abbr_en)s %(year)s" -#attr $Description = "BroytMann Russian Blog %(day)d %(month_name_en)s %(year)s Index Document" +#attr $Title = "Oleg Broytman's blog: %(day)d %(month_abbr_en)s %(year)s" +#attr $Description = "Broytman Russian Blog %(day)d %(month_name_en)s %(year)s Index Document" #attr $Copyright = %(cyear)s ## #def body_html -

öÕÒÎÁÌ: %(day)d %(month_name_ru0)s %(year)s

+

öÕÒÎÁÌ: %(day)d %(month_name_ru)s %(year)s

""" % { "year": year, "cyear": year or 2005, "month_abbr_en": months_abbrs_en[imonth], "month_name_en": months_names_en[imonth], - "month_name_ru0": months_names_ru0[imonth], + "month_name_ru": months_names_ru[imonth], "day": iday }) save_titles = titles[:] titles.reverse() - save_day = None + save_date = None for year, month, day, file, title, lead in titles: href = [] if level == 0: @@ -211,52 +201,92 @@ def write_template(level, year, month, day, titles): href.append(file) href = '/'.join(href) if day[0] == '0': day = day[1:] - if save_day <> day: + if save_date != (year, month, day): if level == 0: new_text.append('\n

%s %s %s

' % (day, months_names_ru[int(month)], year)) else: new_text.append('\n

%s %s

' % (day, months_names_ru[int(month)])) - save_day = day - if lead: - lead = lead + ' ' - else: - lead = '' + save_date = year, month, day new_text.append('''

%s%s.

-''' % (lead, href, title))
+''' % (lead+' ' if lead else '', href, title))

     if level == 0:
+        new_text.append("""
+
+
+

Новостевая лента в форматах
+
+Atom 1.0 только заголовки /
+Atom 1.0 /
+Atom 1.0 полные тексты
+RSS 2.0 только заголовки /
+RSS 2.0 /
+RSS 2.0 полные тексты.
+

+""")
+
     years = {}
     for year, month, day, file, title, lead in save_titles:
         years[year] = True

-    first_year = True
     new_text.append('''
-
+

Теги:
+''')
+    first_tag = True
+    for count, tag, links in all_tags:
+        if first_tag:
+            first_tag = False
+        else:
+            new_text.append(' - ')
+        new_text.append("""%s (%d)""" % (
+            encode_tag(tag), tag, count))
+    new_text.append('''
+

+''')
-

+    max_year = int(sorted(years.keys())[-1])
+    years = range(max_year, 2005, -1)
+
+    new_text.append('''
+

По годам:
''')
-    for year in sorted(years.keys()):
+
+    year_counts = {}
+    for year, month, day, file, title, lead in all_titles:
+        year_counts[year] = 0
+    for year, month, day, file, title, lead in all_titles:
+        year_counts[year] += 1
+
+    first_year = True
+    for year in years:
         if first_year:
             first_year = False
         else:
             new_text.append(' - ')
-        new_text.append('%s' % (year, year))
+        new_text.append('%s (%d)' % (year, year, year_counts[str(year)]))
     new_text.append('''

''')
+    new_text.append("""
+
+

ЖЖ
+""")
+
     new_text.append("""\
#end def
-$phd_pp_ru.respond(self)
+$phd_site.respond(self)
""")
     write_if_changed(index_name, ''.join(new_text))

-all_titles = []
 all_tags = {}
+all_titles = []
+all_titles_tags = []

 for year in sorted(years.keys()):
     year_titles = []
@@ -270,6 +300,7 @@ for year in sorted(years.keys()):
             for file, title, lead, tags in blog[key]:
                 if file.endswith(".tmpl"):
                     file = file[:-len("tmpl")] + "html"
                 value = (year, month, day, file, title, lead)
+                all_titles_tags.append((year, month, day, file, title, lead, tags))
                 all_titles.append(value)
                 year_titles.append(value)
                 month_titles.append(value)
@@ -279,58 +310,298 @@ for year in sorted(years.keys()):
                         tag_links = all_tags[tag]
                     else:
                         tag_links = all_tags[tag] = []
-                    tag_links.append('/'.join(("..", year, month, day, file)))
+                    tag_links.append(value)

             write_template(3, year, month, day, day_titles)
         write_template(2, year, month, day, month_titles)
     write_template(1, year, month, day, year_titles)
-write_template(0, year, month, day, all_titles[-20:])
+
+def by_count_rev_tag_link(t1, t2):
+    """Sort all_tags by count in descending order,
+    and by tags and links in ascending order
+    """
+    r = cmp(t1[0], t2[0])
+    if r:
+        return -r
+    return cmp((t1[1], t1[2]), (t2[1], t2[2]))

 all_tags = [(len(links), tag, links) for (tag, links) in all_tags.items()]
-all_tags.sort()
+all_tags.sort(by_count_rev_tag_link)
+
+write_template(0, year, month, day, all_titles[-20:], all_tags)

 new_text = ["""\
## THIS FILE IS AUTOMATICALLY GENERATED. DO NOT EDIT.
-#extends phd_pp_ru
+#encoding koi8-r
+#extends phd_site
 #implements respond
-#attr $Title = "Oleg BroytMann's blog: tags"
-#attr $Description = "BroytMann Russian Blog Tags Index Document"
+#attr $Title = "Oleg Broytman's blog: tags"
+#attr $Description = "Broytman Russian Blog Tags Index Document"
 #attr $Copyright = 2006
 ##
 #def body_html
-

Теги
+
+Теги
+
+
+Форма поиска позволяет искать сообщения в блоге, соответствующие выражению.
+Синтаксис выражения:
+
+
+Примеры выражений: linux - произойдёт перенаправление
+на страницу linux.html; linux&!audio - искать записи в которых есть тег
+linux и нет тега audio; linux and not audio - то же самое.
+
+
+
+
+
+
+
+
+
"""] -for count, tag, links in all_tags: - new_text.append(""" -

%s (%d)

-""" % (tag, tag, count)) +for i, (count, tag, links) in enumerate(all_tags): + new_text.append("""\ +
%s (%d)
+""" % (encode_tag(tag), tag, count)) + + first = all_tags[0][1] + if i == 0: + prev = None + else: + prev = all_tags[i-1][1] + if i >= len(all_tags)-1: + next = None + else: + next = all_tags[i+1][1] + last = all_tags[-1][1] tag_text = ["""\ ## THIS FILE IS AUTOMATICALLY GENERATED. DO NOT EDIT. -#extends phd_pp_ru +#encoding koi8-r +#extends phd_site #implements respond -#attr $Title = "Oleg BroytMann's blog: tag %s" -#attr $Description = "BroytMann Russian Blog Tag %s Index Document" +#attr $Title = "Oleg Broytman's blog: tag %s" +#attr $Description = "Broytman Russian Blog Tag %s Index Document" +""" % (tag, tag)] + + tag_text.append("""\ +#attr $First = "%s" +""" % first) + + if prev: + tag_text.append("""\ +#attr $Prev = "%s" +""" % prev) + + if next: + tag_text.append("""\ +#attr $Next = "%s" +""" % next) + + tag_text.append("""\ +#attr $Last = "%s" +""" % last) + + tag_text.append("""\ #attr $Copyright = 2006 ## #def body_html -

%s

-""" % (tag, tag, tag)] +

%s

+ +
    +""" % tag) - for link in links: - junk, year, month, day, filename = link.split('/') - tag_text.append(""" -

    %s/%s/%s: %s

    -""" % (link, year, month, day, filename)) + count = 0 + for year, month, day, filename, title, lead in reversed(links): + link = "../%s/%s/%s/%s" % (year, month, day, filename) + item_text = """
  • %s/%s/%s: %s%s
  • """ % (link, year, month, day, lead+' ' if lead else '', title) + + count += 1 + if count <= 5: + new_text.append("
    %s
    \n" % item_text) + + tag_text.append(" %s\n" % item_text) tag_text.append("""\ +
#end def
-$phd_pp_ru.respond(self)
+$phd_site.respond(self)
""")
-    write_if_changed(os.path.join("tags", tag+".tmpl"), ''.join(tag_text))
+    write_if_changed(os.path.join(blog_root, "tags",
+                                  tag.replace(' ', '_') + ".tmpl"),
+                     ''.join(tag_text))

 new_text.append("""\
+
#end def
-$phd_pp_ru.respond(self)
+$phd_site.respond(self)
""")
-write_if_changed(os.path.join("tags", "index.tmpl"), ''.join(new_text))
+write_if_changed(os.path.join(blog_root, "tags", "index.tmpl"), ''.join(new_text))
+
+
+from HTMLParser import HTMLParseError
+import cgi
+from urlparse import urljoin
+from m_lib.net.www.html import HTMLParser as _HTMLParser
+
+class HTMLDone(Exception): pass
+
+
+class FirstPHTMLParser(_HTMLParser):
+    def __init__(self):
+        _HTMLParser.__init__(self)
+        self.first_p = None
+
+    def start_p(self, attrs):
+        self.accumulator = '
'
+
+    def end_p(self):
+        self.first_p = self.accumulator + '
'
+        raise HTMLDone()
+
+
+def get_first_p(body):
+    parser = FirstPHTMLParser()
+
+    try:
+        parser.feed(body)
+    except (HTMLParseError, HTMLDone):
+        pass
+
+    try:
+        parser.close()
+    except (HTMLParseError, HTMLDone):
+        pass
+
+    return parser.first_p
+
+
+class AbsURLHTMLParser(_HTMLParser):
+    def __init__(self, base):
+        _HTMLParser.__init__(self)
+        self.base = base
+
+    def start_a(self, attrs):
+        self.accumulator += '