J
Johannes Bauer
Hello group,
I'm having trouble reading a utf-16 encoded file with Python3.0. This is
my (complete) code:
#!/usr/bin/python3.0
class AddressBook():
    """Dump a UTF-16 encoded text file to stdout, one line at a time.

    Constructing an instance opens the file, reads it line by line and
    prints every line as a list of its individual characters, which makes
    otherwise invisible or mis-decoded characters easy to spot.
    """

    def __init__(self, filename):
        """Read ``filename`` as UTF-16 and print each line as a char list.

        Args:
            filename: Path of the UTF-16 encoded text file to read.

        Raises:
            OSError: If the file cannot be opened.
            UnicodeDecodeError: If the content is not valid UTF-16.
        """
        # ``with`` guarantees the handle is closed even when decoding fails
        # half-way through the file.
        with open(filename, "r", encoding="utf16") as f:
            while True:
                line = f.readline()
                # readline() returns the empty string only at end of file;
                # a blank line still carries its newline character.
                if line == "":
                    break
                print(list(line))
# Run the reader on the backup file; the constructor prints every decoded line.
a = AddressBook("2008_11_05_Handy_Backup.txt")
This is the file (only 1 kB, if hosting doesn't work please tell me and
I'll see if I can put it someplace else):
http://www.file-upload.net/download-1297291/2008_11_05_Handy_Backup.txt.gz.html
What I get: The file reads fine for the first few lines. Then, in the last
line, I get lots of garbage (looking like uninitialized memory):
['E', 'n', 't', 'r', 'y', '0', '0', 'T', 'e', 'x', 't', ' ', '=', ' ',
'"', 'A', 'D', 'A', 'C', ' ', 'V', 'e', 'r', 'k', 'e', 'h', 'r', 's',
'i', 'n', 'f', 'o', '"', '\u0d00', '\u0a00', '䔀', '渀', '琀', '爀', '礀',
'\u3000', '\u3100', '吀', '礀', '瀀', '攀', '\u2000', '㴀', '\u2000',
'一', '甀', '洀', '戀', '攀', '爀', '䴀', '漀', '戀', '椀', '氀', '攀',
'\u0d00', '\u0a00', '䔀', '渀', '琀', '爀', '礀', '\u3000', '\u3100',
'吀', '攀', '砀', '琀', '\u2000', '㴀', '\u2000', '∀', '⬀', '㐀', '㤀',
'\u3100', '㜀', '㤀', '㈀', '㈀', '㐀', '㤀', '㤀', '∀', '\u0d00',
'\u0a00', '\u0d00', '\u0a00', '嬀', '倀', '栀', '漀', '渀', '攀', '倀',
'䈀', '䬀', '\u3000', '\u3000', '㐀', '崀', '\u0d00', '\u0a00']
Where the line
Entry00Text = "ADAC Verkehrsinfo"\r\n
is actually the only thing the line contains, Python makes the rest up.
The actual file is much longer and contains private numbers, so I
truncated them away. When I let python process the original file, it
dies with another error:
Traceback (most recent call last):
File "./modify.py", line 12, in <module>
a = AddressBook("2008_11_05_Handy_Backup.txt")
File "./modify.py", line 7, in __init__
line = f.readline()
File "/usr/local/lib/python3.0/io.py", line 1807, in readline
while self._read_chunk():
File "/usr/local/lib/python3.0/io.py", line 1556, in _read_chunk
self._set_decoded_chars(self._decoder.decode(input_chunk, eof))
File "/usr/local/lib/python3.0/io.py", line 1293, in decode
output = self.decoder.decode(input, final=final)
File "/usr/local/lib/python3.0/codecs.py", line 300, in decode
(result, consumed) = self._buffer_decode(data, self.errors, final)
File "/usr/local/lib/python3.0/encodings/utf_16.py", line 69, in
_buffer_decode
return self.decoder(input, self.errors, final)
UnicodeDecodeError: 'utf16' codec can't decode bytes in position 74-75:
illegal encoding
With the place where it dies being exactly the place where it outputs
the weird garbage in the shortened file. I guess it runs over some page
boundary here or something?
Kind regards,
Johannes
I'm having trouble reading a utf-16 encoded file with Python3.0. This is
my (complete) code:
#!/usr/bin/python3.0
class AddressBook():
    """Dump a UTF-16 encoded text file to stdout, one line at a time.

    Constructing an instance opens the file, reads it line by line and
    prints every line as a list of its individual characters, which makes
    otherwise invisible or mis-decoded characters easy to spot.
    """

    def __init__(self, filename):
        """Read ``filename`` as UTF-16 and print each line as a char list.

        Args:
            filename: Path of the UTF-16 encoded text file to read.

        Raises:
            OSError: If the file cannot be opened.
            UnicodeDecodeError: If the content is not valid UTF-16.
        """
        # ``with`` guarantees the handle is closed even when decoding fails
        # half-way through the file.
        with open(filename, "r", encoding="utf16") as f:
            while True:
                line = f.readline()
                # readline() returns the empty string only at end of file;
                # a blank line still carries its newline character.
                if line == "":
                    break
                print(list(line))
# Run the reader on the backup file; the constructor prints every decoded line.
a = AddressBook("2008_11_05_Handy_Backup.txt")
This is the file (only 1 kB, if hosting doesn't work please tell me and
I'll see if I can put it someplace else):
http://www.file-upload.net/download-1297291/2008_11_05_Handy_Backup.txt.gz.html
What I get: The file reads fine for the first few lines. Then, in the last
line, I get lots of garbage (looking like uninitialized memory):
['E', 'n', 't', 'r', 'y', '0', '0', 'T', 'e', 'x', 't', ' ', '=', ' ',
'"', 'A', 'D', 'A', 'C', ' ', 'V', 'e', 'r', 'k', 'e', 'h', 'r', 's',
'i', 'n', 'f', 'o', '"', '\u0d00', '\u0a00', '䔀', '渀', '琀', '爀', '礀',
'\u3000', '\u3100', '吀', '礀', '瀀', '攀', '\u2000', '㴀', '\u2000',
'一', '甀', '洀', '戀', '攀', '爀', '䴀', '漀', '戀', '椀', '氀', '攀',
'\u0d00', '\u0a00', '䔀', '渀', '琀', '爀', '礀', '\u3000', '\u3100',
'吀', '攀', '砀', '琀', '\u2000', '㴀', '\u2000', '∀', '⬀', '㐀', '㤀',
'\u3100', '㜀', '㤀', '㈀', '㈀', '㐀', '㤀', '㤀', '∀', '\u0d00',
'\u0a00', '\u0d00', '\u0a00', '嬀', '倀', '栀', '漀', '渀', '攀', '倀',
'䈀', '䬀', '\u3000', '\u3000', '㐀', '崀', '\u0d00', '\u0a00']
Where the line
Entry00Text = "ADAC Verkehrsinfo"\r\n
is actually the only thing the line contains, Python makes the rest up.
The actual file is much longer and contains private numbers, so I
truncated them away. When I let python process the original file, it
dies with another error:
Traceback (most recent call last):
File "./modify.py", line 12, in <module>
a = AddressBook("2008_11_05_Handy_Backup.txt")
File "./modify.py", line 7, in __init__
line = f.readline()
File "/usr/local/lib/python3.0/io.py", line 1807, in readline
while self._read_chunk():
File "/usr/local/lib/python3.0/io.py", line 1556, in _read_chunk
self._set_decoded_chars(self._decoder.decode(input_chunk, eof))
File "/usr/local/lib/python3.0/io.py", line 1293, in decode
output = self.decoder.decode(input, final=final)
File "/usr/local/lib/python3.0/codecs.py", line 300, in decode
(result, consumed) = self._buffer_decode(data, self.errors, final)
File "/usr/local/lib/python3.0/encodings/utf_16.py", line 69, in
_buffer_decode
return self.decoder(input, self.errors, final)
UnicodeDecodeError: 'utf16' codec can't decode bytes in position 74-75:
illegal encoding
With the place where it dies being exactly the place where it outputs
the weird garbage in the shortened file. I guess it runs over some page
boundary here or something?
Kind regards,
Johannes