HTMLParser and write

K

Kai I Hendry

I am finding the example at:
http://www.python.org/doc/current/lib/htmlparser-example.html
a little lacking.

I want an example which parses and then writes the same html file (a fine test
case!). Does anyone know where I can find such an example, as my initial attempt
is proving tricky? For example, do I really need to do things like: ' %s="%s" '
% (name, value) with the attributes? What happens if a tag need not be closed
by handle_endtag? Why does my __init__ def not work? And what about the rest?
From decl to parsing entities...


#!/usr/bin/python2.3
import sys
from HTMLParser import HTMLParser

class MyHTMLParser(HTMLParser):
    """Echo parsed HTML back to standard output.

    Start tags, text and end tags are written to ``sys.stdout`` as the parser
    reports them.  ``tagsoup`` is a list of the tag names that have been
    opened and not yet closed.
    """

    def __init__(self, *args, **kwargs):
        """Initialise the base parser and start with no open tags.

        The base initialiser has to be called explicitly: overriding
        ``__init__`` without doing so leaves the parser without its internal
        state (the "rawdata" error the earlier attempt ran into).
        """
        HTMLParser.__init__(self, *args, **kwargs)
        self.tagsoup = []

    def _format_attrs(self, attrs):
        """Return the attributes of a tag as text, each with a leading space.

        ``attrs`` is a list of ``(name, value)`` pairs.  A value of ``None``
        means the attribute was written without a value and is emitted as the
        bare name.
        """
        parts = []
        for name, value in attrs:
            if value is None:
                parts.append(' %s' % name)
            else:
                # A literal double quote would end the attribute early, so it
                # is written as an entity.
                parts.append(' %s="%s"' % (name, value.replace('"', '&quot;')))
        return ''.join(parts)

    def handle_starttag(self, tag, attrs):
        """Record ``tag`` as open and write the start tag."""
        self.tagsoup.append(tag)
        sys.stdout.write('<%s%s>' % (tag, self._format_attrs(attrs)))

    def handle_startendtag(self, tag, attrs):
        """Write a self-closing tag such as ``<br/>``.

        Such a tag never gets a separate end tag, so it is not added to
        ``tagsoup``.
        """
        sys.stdout.write('<%s%s/>' % (tag, self._format_attrs(attrs)))

    def handle_data(self, data):
        """Write text content unchanged."""
        sys.stdout.write(data)

    def handle_endtag(self, tag):
        """Write the end tag and drop ``tag`` from the open-tag list.

        An end tag with no matching start tag is still echoed; it is simply
        not removed from ``tagsoup`` (removing a missing item would raise
        ``ValueError``).
        """
        if tag in self.tagsoup:
            self.tagsoup.remove(tag)
        sys.stdout.write('</%s>' % tag)

if __name__ == "__main__":
    import urllib2

    h = MyHTMLParser()

    # Set here so the script works even when the class defines no __init__
    # of its own.
    h.tagsoup = []

    # To read from standard input instead:
    # h.feed(sys.stdin.read())

    html = urllib2.urlopen('http://www.cs.helsinki.fi/u/hendry/')
    try:
        h.feed(html.read())
        # close() makes the parser process any data it is still buffering.
        h.close()
    finally:
        # Release the connection even if parsing fails.
        html.close()
 
S

Stephen Ferg

You're right. The example is REALLY feeble. Maybe this will help:

"""HTMLParserDemoProgram
Use HTMLParser to read in an HTML file and write it out again.
This will put all tag and attribute names into lowercase.
"""

"""
REVISION HISTORY
2 2004-01-05 added handle_pi and improved attribute processing
"""

from HTMLParser import HTMLParser

class CustomizedParser(HTMLParser):
    """HTMLParser subclass that writes every parsed construct to a file.

    Call ``setOutfileName`` before ``feed`` and ``closeOutfile`` when done.
    Each handler re-serialises what the parser reports and sends it to the
    output file through ``write``.
    """

    def setOutfileName(self, argOutfileName):
        """Remember the output file name and open that file for writing."""
        self.OutfileName = argOutfileName
        self.Outfile = open(self.OutfileName, "w")

    def closeOutfile(self):
        """Close the output file opened by ``setOutfileName``."""
        self.Outfile.close()

    def write(self, argString):
        """Write ``argString`` to the output file."""
        self.Outfile.write(argString)

    def _formatAttributes(self, argAttrs):
        """Return the attributes as text, each preceded by one space.

        ``argAttrs`` is a list of ``(attribute_name, attribute_value)``
        tuples.  A value of ``None`` means the attribute had no value and is
        written as the bare name.
        """
        parts = []
        for key, value in argAttrs:
            if value is None:
                parts.append(' %s' % key)
            else:
                # A literal double quote would end the attribute early, so it
                # is written as an entity.
                parts.append(' %s="%s"' % (key, value.replace('"', '&quot;')))
        return "".join(parts)

    def handle_starttag(self, argTag, argAttrs):
        """Write a start tag.

        ``argAttrs`` is a list of ``(attribute_name, attribute_value)``
        tuples.
        """
        self.write("<%s%s>" % (argTag, self._formatAttributes(argAttrs)))

    def handle_startendtag(self, argTag, argAttrs):
        """Write a self-closing tag such as ``<br/>``.

        ``argAttrs`` is a list of ``(attribute_name, attribute_value)``
        tuples.
        """
        self.write("<%s%s/>" % (argTag, self._formatAttributes(argAttrs)))

    def handle_endtag(self, argTag):
        """Write an end tag."""
        self.write("</%s>" % argTag)

    def handle_data(self, argString):
        """Write text content unchanged."""
        self.write(argString)

    def handle_charref(self, argString):
        """Write a numeric character reference, e.g. ``&#160;``."""
        self.write("&#%s;" % argString)

    def handle_entityref(self, argString):
        """Write a named entity reference, e.g. ``&amp;``."""
        self.write("&%s;" % argString)

    def handle_comment(self, argString):
        """Write a comment."""
        self.write("<!--%s-->" % argString)

    def handle_decl(self, argString):
        """Write a declaration such as a doctype."""
        self.write("<!%s>" % argString)

    def handle_pi(self, argString):
        """Write a processing instruction."""
        self.write("<?%s>" % argString)

def main(myInfileName, myOutfileName):
    """Parse the HTML file ``myInfileName`` and write it to ``myOutfileName``.

    Both files are closed even if reading or parsing raises.
    """
    myInfile = open(myInfileName, "r")
    try:
        myParser = CustomizedParser()
        myParser.setOutfileName(myOutfileName)
        try:
            myParser.feed(myInfile.read())
            # close() makes the parser process any data it is still
            # buffering, so the end of the document is not lost.
            myParser.close()
        finally:
            myParser.closeOutfile()
    finally:
        myInfile.close()


def dq(s):
    """Return the string ``s`` wrapped in a pair of double-quote characters."""
    quote = '"'
    return "".join((quote, s, quote))

if __name__ == "__main__":
    # Single-argument print(...) behaves the same as the print statement on
    # Python 2 and is also valid on Python 3.
    print("Starting HTMLParserDemoProgram")
    # Raw strings keep the Windows backslashes literal instead of relying on
    # "\j" and "\s" not being recognised escape sequences.
    main(r"c:\junk\slide01.html", r"c:\junk\slide01a.html")
    print("Ending HTMLParserDemoProgram")
 

Ask a Question

Want to reply to this thread or ask your own question?

You'll need to choose a username for the site, which only takes a couple of moments. After that, you can post your question and our members will help you out.

Ask a Question

Members online

No members online now.

Forum statistics

Threads
473,982
Messages
2,570,186
Members
46,740
Latest member
JudsonFrie

Latest Threads

Top