From bf97248fba49413a16c41152bdb0722ce157164c Mon Sep 17 00:00:00 2001
From: Oleg Broytman
Date: Tue, 25 Sep 2007 17:19:36 +0000
Subject: [PATCH] Find an icon's URL in the HTML.

git-svn-id: file:///home/phd/archive/SVN/bookmarks_db/trunk@72 fdd5c36f-1aea-0310-aeeb-c58d7e2b6c23
---
 Robots/parse_html.py | 34 +++++++++++++++++++++++++++++++++-
 1 file changed, 33 insertions(+), 1 deletion(-)

diff --git a/Robots/parse_html.py b/Robots/parse_html.py
index 2207ea2..57ad6a0 100755
--- a/Robots/parse_html.py
+++ b/Robots/parse_html.py
@@ -2,7 +2,7 @@
 """
     HTML Parser
 
-    Written by BroytMann. Copyright (C) 1997-2005 PhiloSoft Design
+    Written by BroytMann. Copyright (C) 1997-2007 PhiloSoft Design
 """
 
 
@@ -71,6 +71,37 @@ class HTMLParser(_HTMLParser):
         self.title = self.accumulator
 
 
+    def do_link(self, attrs):
+        """Remember the URL of the page icon declared by a <link> tag.
+
+        attrs is a sequence of (name, value) pairs for one <link> tag.
+        When the tag's rel attribute contains the "icon" keyword (for
+        example rel="icon" or rel="shortcut icon") and the tag has a
+        non-empty href, that href is stored in self.icon.  Any other
+        <link> tag leaves an already found icon untouched.
+        """
+        is_icon = False
+        href = None
+
+        for attrname, value in attrs:
+            if not value:
+                continue
+            value = value.strip()
+            if attrname == 'rel':
+                # rel is a case-insensitive, space-separated keyword list.
+                if 'icon' in value.lower().split():
+                    is_icon = True
+            elif attrname == 'href':
+                # Keep the original case: URL paths are case-sensitive.
+                href = value
+
+        if is_icon and href:
+            self.icon = href
+        elif not hasattr(self, 'icon'):
+            # Guarantee the attribute exists even if no icon was found.
+            self.icon = None
+
+
 import re
 entity_re = re.compile("(&#[0-9]+;)")
 
@@ -135,3 +166,4 @@ if __name__ == '__main__':
     print parser.charset
     print parser.title
     print parser.refresh
+    print parser.icon
-- 
2.39.5