Added an HTML parser based on BeautifulSoup.

author Oleg Broytman <phd@phdru.name>

Sun, 16 Dec 2007 19:45:36 +0000 (19:45 +0000)

committer Oleg Broytman <phd@phdru.name>

Sun, 16 Dec 2007 19:45:36 +0000 (19:45 +0000)
author Oleg Broytman <phd@phdru.name>
Sun, 16 Dec 2007 19:45:36 +0000 (19:45 +0000)
committer Oleg Broytman <phd@phdru.name>
Sun, 16 Dec 2007 19:45:36 +0000 (19:45 +0000)
diff --git a/Robots/parse_html.py b/Robots/parse_html.py

index 8e5ca2b826a1359b37b13af1b336627f3a59609b..3e20a5516a775eb8e425bdf932831bbcb559641f 100755 (executable)
--- a/Robots/parse_html.py
+++ b/Robots/parse_html.py
@@ -11,17 +11,10 @@ from m_lib.defenc import default_encoding
  current_charset = default_encoding.replace("windows-", "cp")
  DEFAULT_CHARSET = "cp1251" # Stupid default for Russian Cyrillic
  
-from parse_html_htmlparser import parse_html as _parse_html
-
-
-class HTMLParser(object):
-   def __init__(self, charset=None):
-      _HTMLParser.__init__(self)
-      self.charset = charset
-      self.meta_charset = 0
-      self.title = ''
-      self.refresh = ''
-      self.icon = None
+try:
+   from parse_html_beautifulsoup import parse_html as _parse_html
+except ImportError:
+   from parse_html_htmlparser import parse_html as _parse_html
  
  
  import re
@@ -75,7 +68,7 @@ def parse_html(filename, charset=None, log=None):
  
  if __name__ == '__main__':
     import sys
-   parser = parse_html(sys.argv[1])
+   parser = parse_html(sys.argv[1], current_charset)
     print parser.charset
     print parser.title
     print parser.refresh
diff --git a/Robots/parse_html_beautifulsoup.py b/Robots/parse_html_beautifulsoup.py

new file mode 100644 (file)

index 0000000..4f395a1
--- /dev/null
+++ b/Robots/parse_html_beautifulsoup.py
@@ -0,0 +1,59 @@
+"""
+   HTML Parser using BeautifulSoup
+
+   Written by BroytMann. Copyright (C) 2007 PhiloSoft Design
+"""
+
+from BeautifulSoup import BeautifulSoup
+
+
+class DummyParser(object):
+   def __init__(self, charset, meta, title, refresh, icon):
+      object.__init__(self)
+      self.charset = charset
+      self.meta_charset = meta
+      self.title = title
+      self.refresh = refresh
+      self.icon = icon
+
+def parse_html(filename, charset=None):
+   infile = open(filename, 'r')
+   root = BeautifulSoup(infile, fromEncoding=charset)
+   infile.close()
+
+   charset = root.originalEncoding
+   try:
+      title = root.html.head.title.string.encode(charset)
+   except AttributeError:
+      title = ''
+
+   try:
+      meta = root.html.head.find(_find_refresh, recursive=False)
+   except AttributeError:
+      refresh = None
+   else:
+      if meta:
+         refresh = meta.get("content")
+      else:
+         refresh = None
+
+   try:
+      meta = root.html.head.find(_find_icon, recursive=False)
+   except AttributeError:
+      icon = None
+   else:
+      if meta:
+         icon = meta.get("href")
+      else:
+         icon = None
+
+   parser = DummyParser(charset, False, title, refresh, icon)
+   return parser
+
+def _find_refresh(Tag):
+   return (Tag.name == "meta") and \
+      (Tag.get("http-equiv", '').lower() == "refresh")
+
+def _find_icon(Tag):
+   return (Tag.name == "link") and \
+      (Tag.get("rel", '').lower() in ('icon', 'shortcut icon'))
author	Oleg Broytman <phd@phdru.name>
	Sun, 16 Dec 2007 19:45:36 +0000 (19:45 +0000)
committer	Oleg Broytman <phd@phdru.name>
	Sun, 16 Dec 2007 19:45:36 +0000 (19:45 +0000)
Robots/parse_html.py		patch | blob | history
Robots/parse_html_beautifulsoup.py	[new file with mode: 0644]	patch | blob