C
cesar.ortiz
Hi all,
I have created an example using libxml2, based on the code that appears
at http://xmlsoft.org/python.html.
My example processes enough HTML files to show that the memory
consumption keeps rising until the process ends (I check it with the
'top' command).
I don't know whether I am forgetting something in the code, as I have not
been able to find any example on the web.
Thanks in advance, Cesar
Note: I have also tried to put the cleanup functions inside the 'for'
loop.
****************************************] The Code
[****************************************
#!/usr/bin/python -u
import libxml2
#------------------------------------------------------------------------------
# Memory debug specific: switch on libxml2 memory debugging up front; the
# check near the end of the script calls debugMemory(1) again and treats a
# non-zero result as leaked bytes.
libxml2.debugMemory(1)
#------------------------------------------------------------------------------
class callback:
    """SAX-style handler handed to the libxml2 HTML push parser.

    Every callback is a deliberate no-op except startDocument, which prints
    a dot so progress is visible while many documents are parsed.
    """

    def startDocument(self):
        # One dot per document; the call form of print behaves the same on
        # Python 2 and Python 3 for a single string argument.
        print(".")

    def endDocument(self):
        pass

    def startElement(self, tag, attrs):
        pass

    def endElement(self, tag):
        pass

    def characters(self, data):
        pass

    def warning(self, msg):
        # Parser diagnostics are intentionally ignored in this example.
        pass

    def error(self, msg):
        pass

    def fatalError(self, msg):
        pass
#------------------------------------------------------------------------------
#------------------------------------------------------------------------------
import os
import sys
# Number of times the whole set of files is parsed, so that memory growth
# becomes visible in tools such as 'top'.
NUM_ITERS = 5


def isHtmlFile(fileName):
    """Return True when fileName ends exactly in '.htm' or '.html'.

    The comparison is case-sensitive. Names that merely contain such a
    suffix followed by more characters (e.g. 'page.htmlx') are rejected.
    """
    return fileName.endswith(".htm") or fileName.endswith(".html")


def listHtmlFiles(dirPath):
    """Return the entries of dirPath whose names pass isHtmlFile.

    The order is whatever os.listdir yields; names are returned without
    the directory prefix.
    """
    return [name for name in os.listdir(dirPath) if isHtmlFile(name)]


def main(argv):
    """Parse every HTML file of a directory NUM_ITERS times, then report leaks.

    argv -- command-line vector: program name followed by exactly one
            argument, the directory holding the HTML files.

    Exits with status 1 (via sys.exit) on wrong usage, when the argument is
    not a directory, or when the directory holds no HTML files.
    """
    programName = os.path.basename(argv[0])
    if len(argv) != 2:
        print("Use: %s <dir html files>" % programName)
        sys.exit(1)

    inputPath = argv[1]
    # isdir rather than exists: a regular file would otherwise reach
    # os.listdir and fail with an exception instead of this message.
    if not os.path.isdir(inputPath):
        print("Error: directory does not exist")
        sys.exit(1)

    inputFileNames = listHtmlFiles(inputPath)
    if not inputFileNames:
        print("Error: no input files")
        sys.exit(1)

    handler = callback()
    for i in range(NUM_ITERS):
        for inputFileName in inputFileNames:
            print(inputFileName)
            # os.path.join supplies the separator when the directory
            # argument has no trailing slash.
            inputFilePath = os.path.join(inputPath, inputFileName)
            f = open(inputFilePath)
            try:
                data = f.read()
            finally:
                # Close the file even if reading fails.
                f.close()
            # Start the push parser with an empty initial chunk, then feed
            # the whole file at once; the trailing 1 passed to
            # htmlParseChunk is the "terminate" flag for the last chunk.
            ctxt = libxml2.htmlCreatePushParser(handler, "", 0, inputFileName)
            ctxt.htmlParseChunk(data, len(data), 1)
            # Drop the only reference to the parser context.
            ctxt = None

    # Memory debug specific
    libxml2.cleanupParser()
    # Query once so the test and the printed figure use the same value.
    leakedBytes = libxml2.debugMemory(1)
    if leakedBytes == 0:
        print("OK")
    else:
        print("Memory leak %d bytes" % leakedBytes)
        libxml2.dumpMemory()


if __name__ == "__main__":
    main(sys.argv)
# Other cleanup functions
#libxml2.cleanupCharEncodingHandlers()
#libxml2.cleanupEncodingAliases()
#libxml2.cleanupGlobals()
#libxml2.cleanupInputCallbacks()
#libxml2.cleanupOutputCallbacks()
#libxml2.cleanupPredefinedEntities()
I have created an example using libxml2, based on the code that appears
at http://xmlsoft.org/python.html.
My example processes enough HTML files to show that the memory
consumption keeps rising until the process ends (I check it with the
'top' command).
I don't know whether I am forgetting something in the code, as I have not
been able to find any example on the web.
Thanks in advance, Cesar
Note: I have also tried to put the cleanup functions inside the 'for'
loop.
****************************************] The Code
[****************************************
#!/usr/bin/python -u
import libxml2
#------------------------------------------------------------------------------
# Memory debug specific: switch on libxml2 memory debugging up front; the
# check near the end of the script calls debugMemory(1) again and treats a
# non-zero result as leaked bytes.
libxml2.debugMemory(1)
#------------------------------------------------------------------------------
class callback:
    """SAX-style handler handed to the libxml2 HTML push parser.

    Every callback is a deliberate no-op except startDocument, which prints
    a dot so progress is visible while many documents are parsed.
    """

    def startDocument(self):
        # One dot per document; the call form of print behaves the same on
        # Python 2 and Python 3 for a single string argument.
        print(".")

    def endDocument(self):
        pass

    def startElement(self, tag, attrs):
        pass

    def endElement(self, tag):
        pass

    def characters(self, data):
        pass

    def warning(self, msg):
        # Parser diagnostics are intentionally ignored in this example.
        pass

    def error(self, msg):
        pass

    def fatalError(self, msg):
        pass
#------------------------------------------------------------------------------
#------------------------------------------------------------------------------
import os
import sys
# Number of times the whole set of files is parsed, so that memory growth
# becomes visible in tools such as 'top'.
NUM_ITERS = 5


def isHtmlFile(fileName):
    """Return True when fileName ends exactly in '.htm' or '.html'.

    The comparison is case-sensitive. Names that merely contain such a
    suffix followed by more characters (e.g. 'page.htmlx') are rejected.
    """
    return fileName.endswith(".htm") or fileName.endswith(".html")


def listHtmlFiles(dirPath):
    """Return the entries of dirPath whose names pass isHtmlFile.

    The order is whatever os.listdir yields; names are returned without
    the directory prefix.
    """
    return [name for name in os.listdir(dirPath) if isHtmlFile(name)]


def main(argv):
    """Parse every HTML file of a directory NUM_ITERS times, then report leaks.

    argv -- command-line vector: program name followed by exactly one
            argument, the directory holding the HTML files.

    Exits with status 1 (via sys.exit) on wrong usage, when the argument is
    not a directory, or when the directory holds no HTML files.
    """
    programName = os.path.basename(argv[0])
    if len(argv) != 2:
        print("Use: %s <dir html files>" % programName)
        sys.exit(1)

    inputPath = argv[1]
    # isdir rather than exists: a regular file would otherwise reach
    # os.listdir and fail with an exception instead of this message.
    if not os.path.isdir(inputPath):
        print("Error: directory does not exist")
        sys.exit(1)

    inputFileNames = listHtmlFiles(inputPath)
    if not inputFileNames:
        print("Error: no input files")
        sys.exit(1)

    handler = callback()
    for i in range(NUM_ITERS):
        for inputFileName in inputFileNames:
            print(inputFileName)
            # os.path.join supplies the separator when the directory
            # argument has no trailing slash.
            inputFilePath = os.path.join(inputPath, inputFileName)
            f = open(inputFilePath)
            try:
                data = f.read()
            finally:
                # Close the file even if reading fails.
                f.close()
            # Start the push parser with an empty initial chunk, then feed
            # the whole file at once; the trailing 1 passed to
            # htmlParseChunk is the "terminate" flag for the last chunk.
            ctxt = libxml2.htmlCreatePushParser(handler, "", 0, inputFileName)
            ctxt.htmlParseChunk(data, len(data), 1)
            # Drop the only reference to the parser context.
            ctxt = None

    # Memory debug specific
    libxml2.cleanupParser()
    # Query once so the test and the printed figure use the same value.
    leakedBytes = libxml2.debugMemory(1)
    if leakedBytes == 0:
        print("OK")
    else:
        print("Memory leak %d bytes" % leakedBytes)
        libxml2.dumpMemory()


if __name__ == "__main__":
    main(sys.argv)
# Other cleanup functions
#libxml2.cleanupCharEncodingHandlers()
#libxml2.cleanupEncodingAliases()
#libxml2.cleanupGlobals()
#libxml2.cleanupInputCallbacks()
#libxml2.cleanupOutputCallbacks()
#libxml2.cleanupPredefinedEntities()