From 4e86db886a2c446928438a038002b3084e7c0977 Mon Sep 17 00:00:00 2001 From: Oleg Broytman Date: Mon, 11 Feb 2008 19:58:39 +0000 Subject: [PATCH] Recode entities before num. entities. git-svn-id: file:///home/phd/archive/SVN/bookmarks_db/trunk@171 fdd5c36f-1aea-0310-aeeb-c58d7e2b6c23 --- Robots/parse_html.py | 19 +++++++------------ 1 file changed, 7 insertions(+), 12 deletions(-) diff --git a/Robots/parse_html.py b/Robots/parse_html.py index dbca770..888e27c 100755 --- a/Robots/parse_html.py +++ b/Robots/parse_html.py @@ -33,23 +33,18 @@ num_entity_re = re.compile("(&#[0-9]+;)") def recode_entities(title, charset): output = [] - for part in num_entity_re.split(title): + for part in entity_re.split(title): + if entity_re.match(part): + part = entitydefs.get(part[1:-1], part) + output.append(part) + + output2 = [] + for part in num_entity_re.split(''.join(output)): if num_entity_re.match(part): try: part = unichr(int(part[2:-1])).encode(charset) except UnicodeEncodeError: pass # Leave the entity as is - output.append(part) - - output2 = [] - for part in entity_re.split(''.join(output)): - if entity_re.match(part): - part = entitydefs.get(part[1:-1], part) - if num_entity_re.match(part): - try: - part = unichr(int(part[2:-1])).encode(charset) - except UnicodeEncodeError: - pass # Leave the entity as is output2.append(part) return ''.join(output2) -- 2.39.5