git.phdru.name Git - bookmarks_db.git/commitdiff
Added parser for html based on BeautifulSoup.
author Oleg Broytman <phd@phdru.name>
Sun, 16 Dec 2007 19:45:36 +0000 (19:45 +0000)
committer Oleg Broytman <phd@phdru.name>
Sun, 16 Dec 2007 19:45:36 +0000 (19:45 +0000)
git-svn-id: file:///home/phd/archive/SVN/bookmarks_db/trunk@106 fdd5c36f-1aea-0310-aeeb-c58d7e2b6c23

Robots/parse_html.py
Robots/parse_html_beautifulsoup.py [new file with mode: 0644]

index 8e5ca2b826a1359b37b13af1b336627f3a59609b..3e20a5516a775eb8e425bdf932831bbcb559641f 100755 (executable)
@@ -11,17 +11,10 @@ from m_lib.defenc import default_encoding
 current_charset = default_encoding.replace("windows-", "cp")
 DEFAULT_CHARSET = "cp1251" # Stupid default for Russian Cyrillic
 
-from parse_html_htmlparser import parse_html as _parse_html
-
-
-class HTMLParser(object):
-   def __init__(self, charset=None):
-      _HTMLParser.__init__(self)
-      self.charset = charset
-      self.meta_charset = 0
-      self.title = ''
-      self.refresh = ''
-      self.icon = None
+try:
+   from parse_html_beautifulsoup import parse_html as _parse_html
+except ImportError:
+   from parse_html_htmlparser import parse_html as _parse_html
 
 
 import re
@@ -75,7 +68,7 @@ def parse_html(filename, charset=None, log=None):
 
 if __name__ == '__main__':
    import sys
-   parser = parse_html(sys.argv[1])
+   parser = parse_html(sys.argv[1], current_charset)
    print parser.charset
    print parser.title
    print parser.refresh
diff --git a/Robots/parse_html_beautifulsoup.py b/Robots/parse_html_beautifulsoup.py
new file mode 100644 (file)
index 0000000..4f395a1
--- /dev/null
@@ -0,0 +1,59 @@
+"""
+   HTML Parser using BeautifulSoup
+
+   Written by BroytMann. Copyright (C) 2007 PhiloSoft Design
+"""
+
+from BeautifulSoup import BeautifulSoup
+
+
+class DummyParser(object):  # Plain result holder built by parse_html(); it performs no parsing itself.
+   def __init__(self, charset, meta, title, refresh, icon):  # Stores each argument on the instance.
+      object.__init__(self)
+      self.charset = charset
+      self.meta_charset = meta  # The `meta` argument is exposed under the attribute name `meta_charset`.
+      self.title = title
+      self.refresh = refresh
+      self.icon = icon
+
+def parse_html(filename, charset=None):  # Parse the file with BeautifulSoup; return a DummyParser holding charset, title, refresh and icon.
+   infile = open(filename, 'r')  # NOTE(review): the handle is not closed if BeautifulSoup() raises; there is no try/finally.
+   root = BeautifulSoup(infile, fromEncoding=charset)  # The caller's `charset` is passed on as the fromEncoding argument.
+   infile.close()
+
+   charset = root.originalEncoding  # From here on `charset` is the value reported by BeautifulSoup, not the argument.
+   try:
+      title = root.html.head.title.string.encode(charset)  # NOTE(review): only AttributeError is handled; presumably a None charset would raise TypeError here -- confirm.
+   except AttributeError:
+      title = ''  # Fall back to an empty title when the attribute chain above raises AttributeError.
+
+   try:
+      meta = root.html.head.find(_find_refresh, recursive=False)  # Search head using the _find_refresh predicate; presumably recursive=False limits it to direct children -- confirm.
+   except AttributeError:
+      refresh = None
+   else:
+      if meta:
+         refresh = meta.get("content")  # The refresh value is the matched tag's "content" attribute.
+      else:
+         refresh = None
+
+   try:
+      meta = root.html.head.find(_find_icon, recursive=False)  # Same lookup pattern as for refresh, using the _find_icon predicate.
+   except AttributeError:
+      icon = None
+   else:
+      if meta:
+         icon = meta.get("href")  # The icon value is the matched tag's "href" attribute.
+      else:
+         icon = None
+
+   parser = DummyParser(charset, False, title, refresh, icon)  # meta_charset is always passed as False here.
+   return parser
+
+def _find_refresh(Tag):  # Predicate handed to find() by parse_html(): matches a "meta" tag carrying a refresh directive.
+   return (Tag.name == "meta") and \
+      (Tag.get("http-equiv", '').lower() == "refresh")  # Missing http-equiv defaults to ''; the comparison ignores case.
+
+def _find_icon(Tag):  # Predicate handed to find() by parse_html(): matches a "link" tag that declares an icon.
+   return (Tag.name == "link") and \
+      (Tag.get("rel", '').lower() in ('icon', 'shortcut icon'))  # Missing rel defaults to ''; the value is lower-cased before the membership test.