M
Martin Bless
Hi friends, I've been off Python for quite a while now and am glad
to be back, at least as far as work permits.
Q:
What's a good way to encode and decode those entities like &euro; or
&#8364; ?
I need isolated functions to process lines. Looking at the xml and
sgmllib stuff, I didn't really get a clue as to what the most pythonic
way is. Are there library functions I didn't see?
FYI, here is what I hacked down and what will probably (hopefully...)
do the job.
Feel free to comment.
# -*- coding: iso-8859-1 -*-
"""\
entity_stuff.py, mb, 2008-03-14, 2008-03-18
"""
import htmlentitydefs
import re
# Matches one entity candidate: '&', then as few characters as possible up to
# and including the first ';'.  The whole candidate sits in a capturing group
# so that RE_OBJ_entity.split() keeps the entities in its result list, in
# between the surrounding pieces of plain text.
RE_OBJ_entity = re.compile('(&.+?;)')
def entity2uc(entity):
    """Convert a single entity to the unicode character it denotes.

    Accepted forms are named entities listed in
    ``htmlentitydefs.name2codepoint`` (``&euro;``), decimal character
    references (``&#8364;``) and hexadecimal character references
    (``&#x20AC;`` or ``&#X20AC;``).

    Return ``(character, True)`` on success or ``(input string, False)``
    otherwise.  Examples:

      entity2uc('&euro;')   -> (u'\\u20ac', True)
      entity2uc('&#8364;')  -> (u'\\u20ac', True)
      entity2uc('&#x20AC;') -> (u'\\u20ac', True)
      entity2uc('&foobar;') -> ('&foobar;', False)
    """
    # A well-formed entity is '&' + at least one character + ';'.  Checking
    # this first means the slicing below can never throw away a last
    # character that is not the terminator (e.g. a newline), and that very
    # short inputs such as '&#' are rejected instead of raising IndexError.
    if len(entity) < 3 or not entity.startswith('&') or not entity.endswith(';'):
        return entity, False

    body = entity[1:-1]
    if body.startswith('#'):
        # Numeric character reference; an 'x' or 'X' marker selects hex.
        if body[1:2] in ('x', 'X'):
            digits, base = body[2:], 16
        else:
            digits, base = body[1:], 10
        try:
            codepoint = int(digits, base)
        except ValueError:
            # Empty or non-numeric digits.
            return entity, False
    else:
        codepoint = htmlentitydefs.name2codepoint.get(body)
        if codepoint is None:
            return entity, False

    try:
        return unichr(codepoint), True
    except (ValueError, OverflowError):
        # Negative or out-of-range code point: leave the entity untouched.
        return entity, False
def line_entities_to_uc(line):
    """Replace the entities found in *line* by their unicode characters.

    The line is cut into pieces with ``RE_OBJ_entity.split()`` and every
    matched entity is handed to ``entity2uc()``.  Entities that cannot be
    converted are copied through unchanged and counted.

    Return ``(converted_line, cntProblems)``: the re-joined unicode string
    and the number of entities that were left unconverted.
    """
    pieces = RE_OBJ_entity.split(line)
    cntProblems = 0
    # RE_OBJ_entity is expected to wrap the entity in a single capturing
    # group, so split() alternates plain text (even indexes) and matched
    # entities (odd indexes).  Converting by position instead of testing
    # startswith('&') keeps plain text that merely begins with an
    # unterminated '&' (e.g. a trailing '&amp\n') away from entity2uc().
    for idx in range(1, len(pieces), 2):
        pieces[idx], success = entity2uc(pieces[idx])
        if not success:
            cntProblems += 1
    return u''.join(pieces), cntProblems
def uc2entity(uc):
    """Return the ASCII-only representation of the single character *uc*.

    Code points up to 127 are returned as the plain character.  Anything
    above becomes a named entity when ``htmlentitydefs.codepoint2name``
    knows a name for it, and a hexadecimal character reference otherwise.
    """
    codepoint = ord(uc)
    if codepoint <= 127:
        return chr(codepoint)
    entity_name = htmlentitydefs.codepoint2name.get(codepoint, None)
    if entity_name:
        return '&%s;' % entity_name
    return '&#x%x;' % codepoint
def encode_line(line):
    """Return *line* with each character passed through ``uc2entity()``.

    The per-character results are concatenated in order.
    """
    encoded_chars = map(uc2entity, line)
    return ''.join(encoded_chars)
if __name__ == "__main__":
    # Script mode: read 'temp.ascii.xml' line by line, replace the entities
    # in each line via line_entities_to_uc() and write the result UTF-8
    # encoded to 'temp.utf8.xml', then print a short summary.
    import codecs

    infile = 'temp.ascii.xml'
    outfile = 'temp.utf8.xml'
    totalProblems = 0
    totalLines = 0

    # The input is opened first so that a missing input file does not
    # truncate an existing output file.  try/finally guarantees that both
    # files are closed even if the conversion or a write fails.
    # NOTE(review): lines are read as raw bytes; the file name suggests the
    # input is pure ASCII -- confirm, since other bytes cannot be joined
    # with the unicode results.
    inf = open(infile, 'rb')
    try:
        of = codecs.open(outfile, 'wb', 'utf-8')
        try:
            for line in inf:
                line2, cntProblems = line_entities_to_uc(line)
                of.write(line2)
                totalLines += 1
                totalProblems += cntProblems
        finally:
            of.close()
    finally:
        inf.close()

    # Each print is given exactly one parenthesised expression, which
    # produces the same output as the plain print statement.
    print('')
    print("Summary:")
    print(" Infile : %s" % (infile,))
    print(" Outfile: %s" % (outfile,))
    # Indexing with the boolean picks the singular form when the count is 1.
    print(' %8d %s %s' % (totalLines,
        ['lines','line'][totalLines==1], 'written.'))
    print(' %8d %s %s' % (totalProblems,
        ['entities','entity'][totalProblems==1], 'left unconverted.'))
    print('%s' % ('Done.',))
Have a nice day and
ru, Martin
(read you, ;-)
being back. At least to some part as work permits.
Q:
What's a good way to encode and decode those entities like &euro; or
&#8364; ?
I need isolated functions to process lines. Looking at the xml and
sgmllib stuff, I didn't really get a clue as to what the most pythonic
way is. Are there library functions I didn't see?
FYI, here is what I hacked down and what will probably (hopefully...)
do the job.
Feel free to comment.
# -*- coding: iso-8859-1 -*-
"""\
entity_stuff.py, mb, 2008-03-14, 2008-03-18
"""
import htmlentitydefs
import re
# Matches one entity candidate: '&', then as few characters as possible up to
# and including the first ';'.  The whole candidate sits in a capturing group
# so that RE_OBJ_entity.split() keeps the entities in its result list, in
# between the surrounding pieces of plain text.
RE_OBJ_entity = re.compile('(&.+?;)')
def entity2uc(entity):
    """Convert a single entity to the unicode character it denotes.

    Accepted forms are named entities listed in
    ``htmlentitydefs.name2codepoint`` (``&euro;``), decimal character
    references (``&#8364;``) and hexadecimal character references
    (``&#x20AC;`` or ``&#X20AC;``).

    Return ``(character, True)`` on success or ``(input string, False)``
    otherwise.  Examples:

      entity2uc('&euro;')   -> (u'\\u20ac', True)
      entity2uc('&#8364;')  -> (u'\\u20ac', True)
      entity2uc('&#x20AC;') -> (u'\\u20ac', True)
      entity2uc('&foobar;') -> ('&foobar;', False)
    """
    # A well-formed entity is '&' + at least one character + ';'.  Checking
    # this first means the slicing below can never throw away a last
    # character that is not the terminator (e.g. a newline), and that very
    # short inputs such as '&#' are rejected instead of raising IndexError.
    if len(entity) < 3 or not entity.startswith('&') or not entity.endswith(';'):
        return entity, False

    body = entity[1:-1]
    if body.startswith('#'):
        # Numeric character reference; an 'x' or 'X' marker selects hex.
        if body[1:2] in ('x', 'X'):
            digits, base = body[2:], 16
        else:
            digits, base = body[1:], 10
        try:
            codepoint = int(digits, base)
        except ValueError:
            # Empty or non-numeric digits.
            return entity, False
    else:
        codepoint = htmlentitydefs.name2codepoint.get(body)
        if codepoint is None:
            return entity, False

    try:
        return unichr(codepoint), True
    except (ValueError, OverflowError):
        # Negative or out-of-range code point: leave the entity untouched.
        return entity, False
def line_entities_to_uc(line):
    """Replace the entities found in *line* by their unicode characters.

    The line is cut into pieces with ``RE_OBJ_entity.split()`` and every
    matched entity is handed to ``entity2uc()``.  Entities that cannot be
    converted are copied through unchanged and counted.

    Return ``(converted_line, cntProblems)``: the re-joined unicode string
    and the number of entities that were left unconverted.
    """
    pieces = RE_OBJ_entity.split(line)
    cntProblems = 0
    # RE_OBJ_entity is expected to wrap the entity in a single capturing
    # group, so split() alternates plain text (even indexes) and matched
    # entities (odd indexes).  Converting by position instead of testing
    # startswith('&') keeps plain text that merely begins with an
    # unterminated '&' (e.g. a trailing '&amp\n') away from entity2uc().
    for idx in range(1, len(pieces), 2):
        pieces[idx], success = entity2uc(pieces[idx])
        if not success:
            cntProblems += 1
    return u''.join(pieces), cntProblems
def uc2entity(uc):
    """Return the ASCII-only representation of the single character *uc*.

    Code points up to 127 are returned as the plain character.  Anything
    above becomes a named entity when ``htmlentitydefs.codepoint2name``
    knows a name for it, and a hexadecimal character reference otherwise.
    """
    codepoint = ord(uc)
    if codepoint <= 127:
        return chr(codepoint)
    entity_name = htmlentitydefs.codepoint2name.get(codepoint, None)
    if entity_name:
        return '&%s;' % entity_name
    return '&#x%x;' % codepoint
def encode_line(line):
    """Return *line* with each character passed through ``uc2entity()``.

    The per-character results are concatenated in order.
    """
    encoded_chars = map(uc2entity, line)
    return ''.join(encoded_chars)
if __name__ == "__main__":
    # Script mode: read 'temp.ascii.xml' line by line, replace the entities
    # in each line via line_entities_to_uc() and write the result UTF-8
    # encoded to 'temp.utf8.xml', then print a short summary.
    import codecs

    infile = 'temp.ascii.xml'
    outfile = 'temp.utf8.xml'
    totalProblems = 0
    totalLines = 0

    # The input is opened first so that a missing input file does not
    # truncate an existing output file.  try/finally guarantees that both
    # files are closed even if the conversion or a write fails.
    # NOTE(review): lines are read as raw bytes; the file name suggests the
    # input is pure ASCII -- confirm, since other bytes cannot be joined
    # with the unicode results.
    inf = open(infile, 'rb')
    try:
        of = codecs.open(outfile, 'wb', 'utf-8')
        try:
            for line in inf:
                line2, cntProblems = line_entities_to_uc(line)
                of.write(line2)
                totalLines += 1
                totalProblems += cntProblems
        finally:
            of.close()
    finally:
        inf.close()

    # Each print is given exactly one parenthesised expression, which
    # produces the same output as the plain print statement.
    print("Summary:")
    print(" Infile : %s" % (infile,))
    print(" Outfile: %s" % (outfile,))
    # Indexing with the boolean picks the singular form when the count is 1.
    print(' %8d %s %s' % (totalLines,
        ['lines','line'][totalLines==1], 'written.'))
    print(' %8d %s %s' % (totalProblems,
        ['entities','entity'][totalProblems==1], 'left unconverted.'))
    print('%s' % ('Done.',))
Have a nice day and
ru, Martin
(read you, ;-)