K
Kai I Hendry
I am finding the :
http://www.python.org/doc/current/lib/htmlparser-example.html
A little lacking.
I want an example with parses and then writes the same html file (a fine test
case!). Does anyone know where I can find such an example, as my initial attempt
is proving tricky. For example do I really need to do things like: ' %s="%s" '
% (name, value) with the attributes? What happens if a tag needs not be closed
by handle_endtag? Why does my __init__ def not work? And what about the rest?
From decl to parsing entities...
#!/usr/bin/python2.3
import sys
from HTMLParser import HTMLParser
class MyHTMLParser(HTMLParser):
#def __init__(self):
#self.tagsoup = []
def handle_starttag(self, tag, attrs):
self.tagsoup.append(tag)
sys.stdout.write('<%s' % tag)
for attr in attrs:
name, value = attr
sys.stdout.write(' %s="%s" ' % (name, value))
sys.stdout.write('>')
#This is the whole tag
#But, how do know if it needs to be closed?
#print self.get_starttag_text()
def handle_data(self, data):
sys.stdout.write(data)
def handle_endtag(self, tag):
self.tagsoup.remove(tag)
sys.stdout.write('</%s>' % tag)
#Something like this?
#Or is there a better way?
#print self.check_for_whole_start_tag
if __name__ == "__main__":
h = MyHTMLParser()
# __init__ def results in some sort of rawdata error, hence:
h.tagsoup = []
#h.feed(sys.stdin.read())
import urllib2
html = urllib2.urlopen('http://www.cs.helsinki.fi/u/hendry/')
h.feed(html.read())
http://www.python.org/doc/current/lib/htmlparser-example.html
A little lacking.
I want an example with parses and then writes the same html file (a fine test
case!). Does anyone know where I can find such an example, as my initial attempt
is proving tricky. For example do I really need to do things like: ' %s="%s" '
% (name, value) with the attributes? What happens if a tag needs not be closed
by handle_endtag? Why does my __init__ def not work? And what about the rest?
From decl to parsing entities...
#!/usr/bin/python2.3
import sys
from HTMLParser import HTMLParser
class MyHTMLParser(HTMLParser):
#def __init__(self):
#self.tagsoup = []
def handle_starttag(self, tag, attrs):
self.tagsoup.append(tag)
sys.stdout.write('<%s' % tag)
for attr in attrs:
name, value = attr
sys.stdout.write(' %s="%s" ' % (name, value))
sys.stdout.write('>')
#This is the whole tag
#But, how do know if it needs to be closed?
#print self.get_starttag_text()
def handle_data(self, data):
sys.stdout.write(data)
def handle_endtag(self, tag):
self.tagsoup.remove(tag)
sys.stdout.write('</%s>' % tag)
#Something like this?
#Or is there a better way?
#print self.check_for_whole_start_tag
if __name__ == "__main__":
h = MyHTMLParser()
# __init__ def results in some sort of rawdata error, hence:
h.tagsoup = []
#h.feed(sys.stdin.read())
import urllib2
html = urllib2.urlopen('http://www.cs.helsinki.fi/u/hendry/')
h.feed(html.read())