J
Jeff Bowden
I've written a simple class derived from sgmllib.SGMLParser to extract
text from html pages. So far it's worked pretty well except for a few
cases where I get exceptions. I've managed to work around these
problems by overriding parse_declaration.
Since parse_declaration is preceded by the comment
# Internal -- parse declaration (for use by subclasses).
I'm concerned that my workaround might stop working with future
versions of sgmllib, so I'm looking for a more correct alternative.
Any suggestions?
Here's my code:
# Matches a '>' character.  SGML2TextParser.parse_declaration searches for it
# to find the end of a declaration that sgmllib itself failed to parse.
_endTag = re.compile(r'>')
class SGML2TextParser(sgmllib.SGMLParser):
    """Write the character data of an SGML/HTML document to a file-like object.

    Markup is discarded.  Character data is passed to ``f.write`` unless the
    element currently being read is listed in ``ignoretags``.
    """

    def __init__(self, f, ignoretags=None):
        """Set up the parser.

        f          -- object with a write() method that receives the text.
        ignoretags -- tag names whose character data is dropped.  Defaults to
                      ['script']; a fresh list is built for every instance so
                      the default is never shared between parsers.
        """
        sgmllib.SGMLParser.__init__(self)
        self.f = f
        if ignoretags is None:
            ignoretags = ['script']
        self.ignoretags = ignoretags
        # Most recently opened tag that has not yet been followed by an end
        # tag; '' otherwise.
        self.tag = ''

    def handle_starttag(self, tag, attrs):
        """Remember ``tag`` as the element currently being read."""
        self.tag = tag

    def unknown_starttag(self, tag, attrs):
        """Track start tags that have no start_<tag>/do_<tag> handler.

        sgmllib reports such tags through unknown_starttag(tag, attrs).  It
        calls handle_starttag only for tags that do have a handler (and then
        with an extra ``method`` argument), so without this hook self.tag
        would never change and ``ignoretags`` would have no effect.
        """
        self.handle_starttag(tag, attrs)

    def unknown_endtag(self, tag):
        """Forget the current element when a closing tag is seen.

        Without this, text following e.g. </script> would keep being dropped
        until the next start tag happened to replace self.tag.
        """
        self.tag = ''

    def handle_data(self, data):
        """Write ``data`` to the output unless the current tag is ignored."""
        if self.tag not in self.ignoretags:
            self.f.write(data)

    def handle_charref(self, name):
        """Emit the character for a numeric reference such as &#65;.

        References that are not decimal numbers, or whose value unichr()
        rejects, are silently dropped.
        """
        try:
            char = unichr(int(name))
        except (ValueError, OverflowError):
            # ValueError: not a number, or outside the range unichr() accepts.
            # OverflowError: number too large to be passed to unichr() at all.
            return
        # Kept outside the try block so errors raised while writing are not
        # mistaken for a bad character reference.
        self.handle_data(char)

    # DANGER: overriding internal function
    def parse_declaration(self, i):
        """Parse the declaration starting at index ``i`` of self.rawdata.

        Delegates to sgmllib; if that raises, the declaration is skipped by
        returning the index just past the next '>'.  Returns -1 when no '>'
        is found at or after ``i`` (presumably sgmllib's "need more data"
        signal -- confirm against the installed version).
        """
        try:
            return sgmllib.SGMLParser.parse_declaration(self, i)
        except Exception:
            # Deliberately broad best-effort recovery, but unlike a bare
            # ``except:`` it does not trap KeyboardInterrupt or SystemExit.
            match = _endTag.search(self.rawdata, i)
            if match is None:
                return -1
            return match.end(0)
def extractText(html_text):
    """Return the text of ``html_text`` with the markup removed.

    The document is run through SGML2TextParser with its default settings
    and everything the parser writes is returned as one string.
    """
    s = StringIO.StringIO()
    x = SGML2TextParser(s)
    x.feed(html_text)
    # feed() may hold back trailing input it cannot classify yet; close()
    # forces the parser to process whatever is still buffered.
    x.close()
    return s.getvalue()
I've written a simple class derived from sgmllib.SGMLParser to extract
text from html pages. So far it's worked pretty well except for a few
cases where I get exceptions. I've managed to work around these
problems by overriding parse_declaration.
Since parse_declaration is preceded by the comment
# Internal -- parse declaration (for use by subclasses).
I'm concerned that my workaround might stop working with future
versions of sgmllib, so I'm looking for a more correct alternative.
Any suggestions?
Here's my code:
# Matches a '>' character.  SGML2TextParser.parse_declaration searches for it
# to find the end of a declaration that sgmllib itself failed to parse.
_endTag = re.compile(r'>')
class SGML2TextParser(sgmllib.SGMLParser):
    """Write the character data of an SGML/HTML document to a file-like object.

    Markup is discarded.  Character data is passed to ``f.write`` unless the
    element currently being read is listed in ``ignoretags``.
    """

    def __init__(self, f, ignoretags=None):
        """Set up the parser.

        f          -- object with a write() method that receives the text.
        ignoretags -- tag names whose character data is dropped.  Defaults to
                      ['script']; a fresh list is built for every instance so
                      the default is never shared between parsers.
        """
        sgmllib.SGMLParser.__init__(self)
        self.f = f
        if ignoretags is None:
            ignoretags = ['script']
        self.ignoretags = ignoretags
        # Most recently opened tag that has not yet been followed by an end
        # tag; '' otherwise.
        self.tag = ''

    def handle_starttag(self, tag, attrs):
        """Remember ``tag`` as the element currently being read."""
        self.tag = tag

    def unknown_starttag(self, tag, attrs):
        """Track start tags that have no start_<tag>/do_<tag> handler.

        sgmllib reports such tags through unknown_starttag(tag, attrs).  It
        calls handle_starttag only for tags that do have a handler (and then
        with an extra ``method`` argument), so without this hook self.tag
        would never change and ``ignoretags`` would have no effect.
        """
        self.handle_starttag(tag, attrs)

    def unknown_endtag(self, tag):
        """Forget the current element when a closing tag is seen.

        Without this, text following e.g. </script> would keep being dropped
        until the next start tag happened to replace self.tag.
        """
        self.tag = ''

    def handle_data(self, data):
        """Write ``data`` to the output unless the current tag is ignored."""
        if self.tag not in self.ignoretags:
            self.f.write(data)

    def handle_charref(self, name):
        """Emit the character for a numeric reference such as &#65;.

        References that are not decimal numbers, or whose value unichr()
        rejects, are silently dropped.
        """
        try:
            char = unichr(int(name))
        except (ValueError, OverflowError):
            # ValueError: not a number, or outside the range unichr() accepts.
            # OverflowError: number too large to be passed to unichr() at all.
            return
        # Kept outside the try block so errors raised while writing are not
        # mistaken for a bad character reference.
        self.handle_data(char)

    # DANGER: overriding internal function
    def parse_declaration(self, i):
        """Parse the declaration starting at index ``i`` of self.rawdata.

        Delegates to sgmllib; if that raises, the declaration is skipped by
        returning the index just past the next '>'.  Returns -1 when no '>'
        is found at or after ``i`` (presumably sgmllib's "need more data"
        signal -- confirm against the installed version).
        """
        try:
            return sgmllib.SGMLParser.parse_declaration(self, i)
        except Exception:
            # Deliberately broad best-effort recovery, but unlike a bare
            # ``except:`` it does not trap KeyboardInterrupt or SystemExit.
            match = _endTag.search(self.rawdata, i)
            if match is None:
                return -1
            return match.end(0)
def extractText(html_text):
    """Return the text of ``html_text`` with the markup removed.

    The document is run through SGML2TextParser with its default settings
    and everything the parser writes is returned as one string.
    """
    s = StringIO.StringIO()
    x = SGML2TextParser(s)
    x.feed(html_text)
    # feed() may hold back trailing input it cannot classify yet; close()
    # forces the parser to process whatever is still buffered.
    x.close()
    return s.getvalue()