output = []
for part in entity_re.split(title):
if entity_re.match(part):
- part = unichr(int(part[2:-1])).encode(charset, "replace")
+ try:
+ part = unichr(int(part[2:-1])).encode(charset)
+ except UnicodeEncodeError:
+ pass # Leave the entity as is
output.append(part)
return ''.join(output)
if log: log(" unknown charset: `%s' or `%s'" % (parser.charset, current_charset))
title = recode_entities(title, current_charset)
- title = title.replace('\r', '').replace('\n', ' ').strip()
+ parts = [s.strip() for s in title.replace('\r', '').split('\n')]
+ title = ' '.join([s for s in parts if s])
if log: log(" final title : %s" % title)
parser.title = title
return parser