HTMLParser and write

Kai I Hendry · Mar 5, 2004

I am finding the example at:
http://www.python.org/doc/current/lib/htmlparser-example.html
a little lacking.

I want an example that parses and then writes the same HTML file (a fine test
case!). Does anyone know where I can find such an example, as my initial attempt
is proving tricky. For example, do I really need to do things like: ' %s="%s" '
% (name, value) with the attributes? What happens if a tag need not be closed
by handle_endtag? Why does my __init__ def not work? And what about the rest,
from declarations to parsing entities...

#!/usr/bin/python2.3
import sys
from HTMLParser import HTMLParser

class MyHTMLParser(HTMLParser):
    """Echo parsed HTML to standard output while tracking open tags.

    ``tagsoup`` is a list of the tag names that have been opened by
    ``handle_starttag`` and not yet closed by ``handle_endtag``.
    """

    def __init__(self, *args, **kwargs):
        """Initialise the base parser, then create the open-tag list.

        The base initialiser must run: it sets up the internal ``rawdata``
        buffer that ``feed()`` uses.  Overriding ``__init__`` without calling
        it is what produces the "rawdata" error.
        """
        HTMLParser.__init__(self, *args, **kwargs)
        self.tagsoup = []

    def _write_attrs(self, attrs):
        """Write each (name, value) pair from ``attrs`` to standard output."""
        for name, value in attrs:
            if value is None:
                # A valueless attribute such as <input disabled>.
                sys.stdout.write(' %s' % name)
            else:
                # The parser hands over unescaped values, so re-escape the
                # characters that would break a double-quoted attribute.
                value = value.replace('&', '&amp;').replace('"', '&quot;')
                sys.stdout.write(' %s="%s"' % (name, value))

    def handle_starttag(self, tag, attrs):
        """Record ``tag`` as open and write its start tag."""
        self.tagsoup.append(tag)
        sys.stdout.write('<%s' % tag)
        self._write_attrs(attrs)
        sys.stdout.write('>')

    def handle_startendtag(self, tag, attrs):
        """Write a self-closed tag such as ``<br/>``.

        Such a tag never gets a matching ``handle_endtag`` call, so it is
        not added to ``tagsoup``.
        """
        sys.stdout.write('<%s' % tag)
        self._write_attrs(attrs)
        sys.stdout.write('/>')

    def handle_data(self, data):
        """Write text content unchanged."""
        sys.stdout.write(data)

    def handle_endtag(self, tag):
        """Write the end tag and drop the innermost matching open tag.

        An end tag with no matching open tag is still written, but leaves
        ``tagsoup`` untouched instead of raising ``ValueError``.
        """
        # Search from the end so nested tags of the same name unwind
        # innermost-first.
        for index in range(len(self.tagsoup) - 1, -1, -1):
            if self.tagsoup[index] == tag:
                del self.tagsoup[index]
                break
        sys.stdout.write('</%s>' % tag)

if __name__ == "__main__":
    import urllib2

    h = MyHTMLParser()

    # Give the parser its open-tag list here, so the script works whether or
    # not MyHTMLParser creates one itself.
    h.tagsoup = []

    # To parse standard input instead of a URL:
    # h.feed(sys.stdin.read())

    html = urllib2.urlopen('http://www.cs.helsinki.fi/u/hendry/')
    try:
        h.feed(html.read())
        # close() makes the parser process any input it is still buffering.
        h.close()
    finally:
        # Release the connection even if parsing fails.
        html.close()

Stephen Ferg · Mar 5, 2004

You're right. The example is REALLY feeble. Maybe this will help:

"""HTMLParserDemoProgram
Use HTMLParser to read in an HTML file and write it out again.
This will put all tag and attribute names into lowercase.
"""

"""
REVISION HISTORY
2 2004-01-05 added handle_pi and improved attribute processing
"""

from HTMLParser import HTMLParser

class CustomizedParser(HTMLParser):
    """Parse HTML and write an equivalent document to an output file.

    Call ``setOutfileName`` before ``feed`` so the handlers have a file to
    write to, and ``closeOutfile`` when parsing is finished.
    """

    def setOutfileName(self, argOutfileName):
        """Remember the output file name and open it for writing."""
        self.OutfileName = argOutfileName
        self.Outfile = open(self.OutfileName, "w")

    def closeOutfile(self):
        """Close the output file opened by ``setOutfileName``."""
        self.Outfile.close()

    def write(self, argString):
        """Write ``argString`` to the output file."""
        self.Outfile.write(argString)

    def _formatAttributes(self, argAttrs):
        """Return the attribute text for a tag.

        ``argAttrs`` is a list of (attribute_name, attribute_value) tuples.
        A value of ``None`` marks a valueless attribute and is written as
        the bare name.
        """
        parts = []
        for key, value in argAttrs:
            if value is None:
                parts.append(" %s" % key)
            else:
                # The parser hands over unescaped values, so re-escape the
                # characters that would break a double-quoted attribute.
                value = value.replace("&", "&amp;").replace('"', "&quot;")
                parts.append(' %s="%s"' % (key, value))
        return "".join(parts)

    def handle_starttag(self, argTag, argAttrs):
        """Write a start tag.

        ``argAttrs`` is a list of (attribute_name, attribute_value) tuples.
        """
        self.write("<%s%s>" % (argTag, self._formatAttributes(argAttrs)))

    def handle_startendtag(self, argTag, argAttrs):
        """Write a self-closed tag such as ``<br/>``.

        ``argAttrs`` is a list of (attribute_name, attribute_value) tuples.
        """
        self.write("<%s%s/>" % (argTag, self._formatAttributes(argAttrs)))

    def handle_endtag(self, argTag):
        """Write an end tag."""
        self.write("</%s>" % argTag)

    def handle_data(self, argString):
        """Write text content unchanged."""
        self.write(argString)

    def handle_charref(self, argString):
        """Write a numeric character reference back as ``&#...;``."""
        self.write("&#%s;" % argString)

    def handle_entityref(self, argString):
        """Write a named entity reference back as ``&...;``."""
        self.write("&%s;" % argString)

    def handle_comment(self, argString):
        """Write a comment back inside its ``<!--`` and ``-->`` delimiters."""
        self.write("<!--%s-->" % argString)

    def handle_decl(self, argString):
        """Write a declaration back as ``<!...>``."""
        self.write("<!%s>" % argString)

    def handle_pi(self, argString):
        """Write a processing instruction back as ``<?...>``."""
        self.write("<?%s>" % argString)

def main(myInfileName, myOutfileName):
    """Parse the HTML file ``myInfileName`` and write it to ``myOutfileName``.

    Both files are closed even if parsing raises an exception.
    """
    myInfile = open(myInfileName, "r")
    try:
        myParser = CustomizedParser()
        myParser.setOutfileName(myOutfileName)
        try:
            myParser.feed(myInfile.read())
            # close() makes the parser process any input it is still
            # buffering, so the end of the document is not dropped.
            myParser.close()
        finally:
            myParser.closeOutfile()
    finally:
        myInfile.close()

def dq(s):
    """Return the string ``s`` enclosed in double quotes."""
    return '"' + s + '"'

if __name__ == "__main__":
    # A single parenthesised string prints the same whether print is a
    # statement or a function.
    print("Starting HTMLParserDemoProgram")
    # Raw strings keep the backslashes in the Windows paths literal.
    main(r"c:\junk\slide01.html", r"c:\junk\slide01a.html")
    print("Ending HTMLParserDemoProgram")

HTMLParser skipping HTML? [newbie]	6	Sep 5, 2012
confused by HTMLParser class	3	May 28, 2008
HTMLParser not parsing whole html file	4	Oct 24, 2010
HTMLParser can't read japanese	3	Apr 13, 2010
HTMLParser and non-ascii html pages	0	Sep 20, 2011
HTMLParser question	1	Aug 19, 2004
HTMLParser problems.	11	Oct 30, 2003
make a simple search function for homepage	1	Oct 31, 2006

HTMLParser and write

Kai I Hendry

Stephen Ferg

Ask a Question

Similar Threads

Members online

Forum statistics

Latest Threads