D
Dale Gerdemann
I've written simple code in 2.6 and 3.0 to read every character of a
set of files and print out some information for each of these
characters. I tested each program on a large Cyrillic/Latin text. The
result was that the 2.6 version was about 5x faster. Here are the two
programs:
#!/usr/bin/env python
"""Print one line of information per character of UTF-8 text files.

Python 2.6 version.  Every path given on the command line is decoded as
UTF-8 (undecodable bytes are replaced) and, for each character, a line is
printed with the character, its UTF-8 bytes, its code point and its
Unicode name.
"""
import sys
import codecs
import unicodedata

# Display names for the ASCII control characters this script labels itself;
# unicodedata.name() falls back to 'unknown' for them.
CONTROL_NAMES = {9: 'tab', 10: 'LF', 13: 'CR'}

for path in sys.argv[1:]:
    # The with-block closes the file once it has been processed, and
    # iterating the stream avoids holding every line in memory at once.
    with codecs.open(path, encoding='UTF-8', errors='replace') as stream:
        for line in stream:
            for c in line:
                code_point = ord(c)
                name = unicodedata.name(c, 'unknown')
                prnt = prnt_rep = c.encode('utf8')
                if name == 'unknown':
                    # Unnamed characters are shown as a blank.
                    prnt = ' '
                if code_point > 127:
                    print('%s %-14r U+%04x %s' % (prnt, prnt_rep,
                                                  code_point, name))
                else:
                    if code_point in CONTROL_NAMES:
                        name = CONTROL_NAMES[code_point]
                        prnt = ' '
                    # ASCII characters get a hand-built two-digit \xNN
                    # escape in place of the repr of their encoded byte.
                    print("{0:s} '\\x{1:02x}' U+{2:04x} {3:s}".format(
                        prnt, code_point, code_point, name))
#!/usr/bin/env python3
"""Print one line of information per character of UTF-8 text files.

Python 3 version.  Every path given on the command line is decoded as
UTF-8 (undecodable bytes are replaced) and, for each character, a line is
printed with the character, its UTF-8 bytes, its code point and its
Unicode name.
"""
import sys
import unicodedata

# Display names for the ASCII control characters this script labels itself;
# unicodedata.name() falls back to 'unknown' for them.
CONTROL_NAMES = {9: 'tab', 10: 'LF', 13: 'CR'}


def format_char(c: str) -> str:
    """Return the report line for the single character *c*.

    The line holds four space-separated fields: the character itself (a
    blank if it has no Unicode name), the repr of its UTF-8 encoding
    left-justified to 15 columns, the code point as ``U+xxxx`` and the
    Unicode name (``tab``/``LF``/``CR`` for those control characters,
    ``unknown`` for any other unnamed character).
    """
    code_point = ord(c)
    if code_point <= 127:
        # Always two hex digits, so that tab is shown as \x09, not \x9.
        utf8 = "b'\\x{0:02x}'".format(code_point)
    else:
        # bytes objects reject the 's' format spec on current Python 3,
        # so the repr text is what gets padded.
        utf8 = repr(c.encode('utf-8'))
    name = unicodedata.name(c, 'unknown')
    shown = ' ' if name == 'unknown' else c
    name = CONTROL_NAMES.get(code_point, name)
    return "{0:s} {1:15s} U+{2:04x} {3:s}".format(
        shown, utf8, code_point, name)


def main(paths) -> None:
    """Print the report line of every character of every file in *paths*."""
    for path in paths:
        # newline='' keeps line endings untranslated so CR characters are
        # reported; the explicit encoding avoids depending on the locale.
        with open(path, encoding='utf-8', errors='replace',
                  newline='') as handle:
            for line in handle:
                for c in line:
                    print(format_char(c))


if __name__ == '__main__':
    main(sys.argv[1:])
set of files and print out some information for each of these
characters. I tested each program on a large Cyrillic/Latin text. The
result was that the 2.6 version was about 5x faster. Here are the two
programs:
#!/usr/bin/env python
"""Print one line of information per character of UTF-8 text files.

Python 2.6 version.  Every path given on the command line is decoded as
UTF-8 (undecodable bytes are replaced) and, for each character, a line is
printed with the character, its UTF-8 bytes, its code point and its
Unicode name.
"""
import sys
import codecs
import unicodedata

# Display names for the ASCII control characters this script labels itself;
# unicodedata.name() falls back to 'unknown' for them.
CONTROL_NAMES = {9: 'tab', 10: 'LF', 13: 'CR'}

for path in sys.argv[1:]:
    # The with-block closes the file once it has been processed, and
    # iterating the stream avoids holding every line in memory at once.
    with codecs.open(path, encoding='UTF-8', errors='replace') as stream:
        for line in stream:
            for c in line:
                code_point = ord(c)
                name = unicodedata.name(c, 'unknown')
                prnt = prnt_rep = c.encode('utf8')
                if name == 'unknown':
                    # Unnamed characters are shown as a blank.
                    prnt = ' '
                if code_point > 127:
                    print('%s %-14r U+%04x %s' % (prnt, prnt_rep,
                                                  code_point, name))
                else:
                    if code_point in CONTROL_NAMES:
                        name = CONTROL_NAMES[code_point]
                        prnt = ' '
                    # ASCII characters get a hand-built two-digit \xNN
                    # escape in place of the repr of their encoded byte.
                    print("{0:s} '\\x{1:02x}' U+{2:04x} {3:s}".format(
                        prnt, code_point, code_point, name))
#!/usr/bin/env python3
"""Print one line of information per character of UTF-8 text files.

Python 3 version.  Every path given on the command line is decoded as
UTF-8 (undecodable bytes are replaced) and, for each character, a line is
printed with the character, its UTF-8 bytes, its code point and its
Unicode name.
"""
import sys
import unicodedata

# Display names for the ASCII control characters this script labels itself;
# unicodedata.name() falls back to 'unknown' for them.
CONTROL_NAMES = {9: 'tab', 10: 'LF', 13: 'CR'}


def format_char(c: str) -> str:
    """Return the report line for the single character *c*.

    The line holds four space-separated fields: the character itself (a
    blank if it has no Unicode name), the repr of its UTF-8 encoding
    left-justified to 15 columns, the code point as ``U+xxxx`` and the
    Unicode name (``tab``/``LF``/``CR`` for those control characters,
    ``unknown`` for any other unnamed character).
    """
    code_point = ord(c)
    if code_point <= 127:
        # Always two hex digits, so that tab is shown as \x09, not \x9.
        utf8 = "b'\\x{0:02x}'".format(code_point)
    else:
        # bytes objects reject the 's' format spec on current Python 3,
        # so the repr text is what gets padded.
        utf8 = repr(c.encode('utf-8'))
    name = unicodedata.name(c, 'unknown')
    shown = ' ' if name == 'unknown' else c
    name = CONTROL_NAMES.get(code_point, name)
    return "{0:s} {1:15s} U+{2:04x} {3:s}".format(
        shown, utf8, code_point, name)


def main(paths) -> None:
    """Print the report line of every character of every file in *paths*."""
    for path in paths:
        # newline='' keeps line endings untranslated so CR characters are
        # reported; the explicit encoding avoids depending on the locale.
        with open(path, encoding='utf-8', errors='replace',
                  newline='') as handle:
            for line in handle:
                for c in line:
                    print(format_char(c))


if __name__ == '__main__':
    main(sys.argv[1:])