B
Barak, Ron
Hi,
I'm parsing XML files using ElementTree from xml.etree (see the code below, also attached as xml_parse_example.py).
However, I'm coming across input XML files (an example is attached: tmp.xml) that include invalid characters, which produce the following traceback:
$ python xml_parse_example.py
Traceback (most recent call last):
File "xml_parse_example.py", line 63, in <module>
tree = xml2dict.open_and_parse_xml_file()
File "xml_parse_example.py", line 14, in open_and_parse_xml_file
tree = ElementTree.parse(f)
File "c:\Python26\lib\xml\etree\ElementTree.py", line 862, in parse
tree.parse(source, parser)
File "c:\Python26\lib\xml\etree\ElementTree.py", line 586, in parse
parser.feed(data)
File "c:\Python26\lib\xml\etree\ElementTree.py", line 1245, in feed
self._parser.Parse(data, 0)
xml.parsers.expat.ExpatError: not well-formed (invalid token): line 6, column 34
I read the documentation for xml.etree.ElementTree and see that it may take an optional parser parameter, but I don't know what this parser should be in order to ignore the invalid characters.
Could you suggest a way to call ElementTree so that it won't bomb on these invalid characters?
Thanks,
Ron.
________________________________
#!/usr/bin/env python
from xml.etree import ElementTree
import pprint
# When true, every generated entry also carries the element's ``tail`` text.
compute_tail = False


class XmlFileToDict(object):
    """Convert an XML file into nested dicts and lists.

    Typical use: build the object with a file path, call
    ``open_and_parse_xml_file()`` to get an ElementTree, then pass the
    tree's root element to ``dict_list()``.
    """

    def __init__(self, xml_file_path):
        """Remember the path of the XML file to convert."""
        self.xml_file_path = xml_file_path

    def open_and_parse_xml_file(self):
        """Parse ``self.xml_file_path`` and return the ElementTree.

        Errors from ``open`` (missing file) and from the XML parser
        (not-well-formed input) propagate to the caller unchanged.
        """
        # Binary mode hands the raw bytes to the parser, so the encoding
        # named in the XML declaration is honoured.  Text mode would decode
        # with the platform default first (and, on Windows, rewrite line
        # endings), which can corrupt or reject otherwise valid documents.
        with open(self.xml_file_path, 'rb') as f:
            return ElementTree.parse(f)

    def dict_list(self, node):
        """Return ``{node.tag: summary}`` describing *node* and its subtree.

        ``summary`` holds ``'value'`` (the list filled in by
        ``xml_to_dict``), ``'attribs'`` (``node.attrib``) and, when
        ``compute_tail`` is true, ``'tail'`` (``node.tail``).
        """
        entries = []
        self.xml_to_dict(node, entries)
        # NOTE: this top-level key is 'attribs' while nested entries use
        # 'attributes'; kept as is because consumers may rely on it.
        summary = {'value': entries, 'attribs': node.attrib}
        if compute_tail:
            summary['tail'] = node.tail
        return {node.tag: summary}

    def xml_to_dict(self, node, res):
        """Append a description of *node*'s content to the list *res*.

        A node without children contributes one entry
        ``{node.tag: {'value': node.text, 'attributes': node.attrib}}``.
        A node with children contributes one entry per child, keyed by the
        child's tag; a child that has children of its own gets the list of
        its converted children as ``'value'``.  ``'tail'`` is added to each
        entry when ``compute_tail`` is true.

        Returns None; the result is delivered by mutating *res*.
        """
        # len() is used on purpose: it counts child elements, whereas the
        # truth value of an Element is not a reliable "has children" test.
        if not len(node):
            res.append({node.tag: self._describe(node, node.text)})
            return
        for child in node:
            child_items = []
            self.xml_to_dict(child, child_items)
            if len(child):
                res.append({child.tag: self._describe(child, child_items)})
            else:
                # A leaf child produced exactly one ready-made entry.
                res.append(child_items[0])

    def _describe(self, element, value):
        """Build the per-element dict with *value*, attributes and optional tail."""
        described = {'value': value, 'attributes': element.attrib}
        if compute_tail:
            described['tail'] = element.tail
        return described
if __name__ == '__main__':
    # Demo run: convert the sample file next to the script and pretty-print
    # the resulting nested structure.
    converter = XmlFileToDict('tmp.xml')
    document_root = converter.open_and_parse_xml_file().getroot()
    pprint.pprint(converter.dict_list(document_root))
________________________________
I'm parsing XML files using ElementTree from xml.etree (see the code below, also attached as xml_parse_example.py).
However, I'm coming across input XML files (an example is attached: tmp.xml) that include invalid characters, which produce the following traceback:
$ python xml_parse_example.py
Traceback (most recent call last):
File "xml_parse_example.py", line 63, in <module>
tree = xml2dict.open_and_parse_xml_file()
File "xml_parse_example.py", line 14, in open_and_parse_xml_file
tree = ElementTree.parse(f)
File "c:\Python26\lib\xml\etree\ElementTree.py", line 862, in parse
tree.parse(source, parser)
File "c:\Python26\lib\xml\etree\ElementTree.py", line 586, in parse
parser.feed(data)
File "c:\Python26\lib\xml\etree\ElementTree.py", line 1245, in feed
self._parser.Parse(data, 0)
xml.parsers.expat.ExpatError: not well-formed (invalid token): line 6, column 34
I read the documentation for xml.etree.ElementTree and see that it may take an optional parser parameter, but I don't know what this parser should be in order to ignore the invalid characters.
Could you suggest a way to call ElementTree so that it won't bomb on these invalid characters?
Thanks,
Ron.
________________________________
#!/usr/bin/env python
from xml.etree import ElementTree
import pprint
# When true, every generated entry also carries the element's ``tail`` text.
compute_tail = False


class XmlFileToDict(object):
    """Convert an XML file into nested dicts and lists.

    Typical use: build the object with a file path, call
    ``open_and_parse_xml_file()`` to get an ElementTree, then pass the
    tree's root element to ``dict_list()``.
    """

    def __init__(self, xml_file_path):
        """Remember the path of the XML file to convert."""
        self.xml_file_path = xml_file_path

    def open_and_parse_xml_file(self):
        """Parse ``self.xml_file_path`` and return the ElementTree.

        Errors from ``open`` (missing file) and from the XML parser
        (not-well-formed input) propagate to the caller unchanged.
        """
        # Binary mode hands the raw bytes to the parser, so the encoding
        # named in the XML declaration is honoured.  Text mode would decode
        # with the platform default first (and, on Windows, rewrite line
        # endings), which can corrupt or reject otherwise valid documents.
        with open(self.xml_file_path, 'rb') as f:
            return ElementTree.parse(f)

    def dict_list(self, node):
        """Return ``{node.tag: summary}`` describing *node* and its subtree.

        ``summary`` holds ``'value'`` (the list filled in by
        ``xml_to_dict``), ``'attribs'`` (``node.attrib``) and, when
        ``compute_tail`` is true, ``'tail'`` (``node.tail``).
        """
        entries = []
        self.xml_to_dict(node, entries)
        # NOTE: this top-level key is 'attribs' while nested entries use
        # 'attributes'; kept as is because consumers may rely on it.
        summary = {'value': entries, 'attribs': node.attrib}
        if compute_tail:
            summary['tail'] = node.tail
        return {node.tag: summary}

    def xml_to_dict(self, node, res):
        """Append a description of *node*'s content to the list *res*.

        A node without children contributes one entry
        ``{node.tag: {'value': node.text, 'attributes': node.attrib}}``.
        A node with children contributes one entry per child, keyed by the
        child's tag; a child that has children of its own gets the list of
        its converted children as ``'value'``.  ``'tail'`` is added to each
        entry when ``compute_tail`` is true.

        Returns None; the result is delivered by mutating *res*.
        """
        # len() is used on purpose: it counts child elements, whereas the
        # truth value of an Element is not a reliable "has children" test.
        if not len(node):
            res.append({node.tag: self._describe(node, node.text)})
            return
        for child in node:
            child_items = []
            self.xml_to_dict(child, child_items)
            if len(child):
                res.append({child.tag: self._describe(child, child_items)})
            else:
                # A leaf child produced exactly one ready-made entry.
                res.append(child_items[0])

    def _describe(self, element, value):
        """Build the per-element dict with *value*, attributes and optional tail."""
        described = {'value': value, 'attributes': element.attrib}
        if compute_tail:
            described['tail'] = element.tail
        return described
if __name__ == '__main__':
    # Demo run: convert the sample file next to the script and pretty-print
    # the resulting nested structure.
    converter = XmlFileToDict('tmp.xml')
    document_root = converter.open_and_parse_xml_file().getroot()
    pprint.pprint(converter.dict_list(document_root))
________________________________