S
sean.swolfe
I was running into difficulties with the CSV library in Ruby. I had
some files that were exports from a Filemaker database, and they had
newline and vtab characters within strings. This seemed to cause
problems for the library. I ended up making my own method that would
parse a file character by character (not using readline). I know that
it might be better to use a regular expression, or specify the row
delimiter character in the readline method. But the method I made seems
a bit more flexible for different types of characters. Please feel free to
use it or rip it apart. Any suggestions are welcome as well.
# linesafe_parse_csv by Sean Wolfe ( sean at i heart squares dot com)
# (c) 2005 Sean Wolfe (GPL license applies)
# Implementation of a CSV parser that is safe to use with
# strings that may contain newline or other special characters.
# Accepts arguments to specify the Field, string and row delimiter,
# along with an escape character, and a character stripper.
# A block can be passed to the method and will be passed an array
# of strings for each row.
#
# Example:
# Column_Names = [ :id, :first_name, :last_name, :email ]
# table = {}
# file = File.open("mycsv_file.csv", "r")
# linesafe_parse_csv(file, ",", '"', "\r", "\\", "\v") do |csv_row|
# table_row = {}
# for index in 0...csv_row.length
# table_row[Column_Names[index]] = csv_row[index]
# end
# table[table_row[:id]] = table_row
# end
# file.close
# table
# Parses delimited text one character at a time and yields each row as an
# Array whose elements are Strings, or nil for a cell with no content.
#
# Only the first character of each delimiter argument is used.
#
#   file          - IO-like object responding to eof? and getc
#   cell_delim    - separates cells; kept as data inside a quoted string or
#                   when escaped
#   string_delim  - opens and closes a quoted string
#   row_delim     - ends the row when seen outside a quoted string; kept as
#                   data inside one
#   esc_delim     - escape character; it is dropped and only affects the
#                   single character that follows it. An escaped string
#                   delimiter (inside a string), cell delimiter or escape
#                   character is kept as data.
#   chars_to_elim - characters removed from every non-empty cell (passed to
#                   String#tr with an empty replacement); the cell is then
#                   stripped of surrounding whitespace
#
# Returns nil when a block is given, otherwise an Enumerator over the rows.
def linesafe_parse_csv(file, cell_delim, string_delim, row_delim,
                       esc_delim, chars_to_elim)
  unless block_given?
    return enum_for(:linesafe_parse_csv, file, cell_delim, string_delim,
                    row_delim, esc_delim, chars_to_elim)
  end

  # String#[] with an integer index and IO#getc return the same kind of
  # object (an Integer on Ruby 1.8, a one-character String on 1.9+), so the
  # comparisons below work on either.
  str_dim_i = string_delim[0]
  cell_dim_i = cell_delim[0]
  row_dim_i = row_delim[0]
  esc_dim_i = esc_delim[0]

  # Turns the raw accumulated text of a cell into the value stored in the row.
  finish_cell = lambda do |raw|
    raw == '' ? nil : raw.tr(chars_to_elim, '').strip
  end

  # loop until the end of file
  until file.eof?
    row = []
    in_str = false
    in_esc = false
    newrow = false
    value = ''.dup

    # loop through and parse a row
    until newrow || file.eof?
      char = file.getc

      if char == str_dim_i
        if !in_str
          in_str = true
        elsif in_esc
          value << char
        else
          in_str = false
        end
        in_esc = false
      elsif char == esc_dim_i
        # a doubled escape character is a literal escape character
        value << char if in_esc
        in_esc = !in_esc
      elsif char == row_dim_i
        if in_str
          value << char
        else
          row << finish_cell.call(value)
          newrow = true
        end
        in_esc = false
      elsif char == cell_dim_i
        if in_str || in_esc
          value << char
        else
          row << finish_cell.call(value)
          value = ''.dup
        end
        in_esc = false
      else
        value << char
        # an escape only applies to the character right after it; without
        # this reset a later closing quote would be treated as escaped
        in_esc = false
      end
    end

    unless newrow
      # The input ended without a final row delimiter: keep the cell that was
      # still being read instead of dropping it. Leftover text that cleans to
      # nothing on an otherwise empty row still yields an empty row, as the
      # parser always has.
      cell = finish_cell.call(value)
      row << cell unless row.empty? && cell.to_s.empty?
    end

    # hand the row to the caller's block
    yield row
  end
end
I had some files that were exports from a Filemaker database, and they had
newline and vtab characters within strings. This seemed to cause
problems for the library. I ended up making my own method that would
parse a file character by character (not using readline). I know that
it might be better to use a regular expression, or specify the row
delimiter character in the readline method. But the method I made seems
a bit more flexible for different types of characters. Please feel free to
use it or rip it apart. Any suggestions are welcome as well.
# linesafe_parse_csv by Sean Wolfe ( sean at i heart squares dot com)
# (c) 2005 Sean Wolfe (GPL license applies)
# Implementation of a CSV parser that is safe to use with
# strings that may contain newline or other special characters.
# Accepts arguments to specify the Field, string and row delimiter,
# along with an escape character, and a character stripper.
# A block can be passed to the method and will be passed an array
# of strings for each row.
#
# Example:
# Column_Names = [ :id, :first_name, :last_name, :email ]
# table = {}
# file = File.open("mycsv_file.csv", "r")
# linesafe_parse_csv(file, ",", '"', "\r", "\\", "\v") do |csv_row|
# table_row = {}
# for index in 0...csv_row.length
# table_row[Column_Names[index]] = csv_row[index]
# end
# table[table_row[:id]] = table_row
# end
# file.close
# table
# Parses delimited text one character at a time and yields each row as an
# Array whose elements are Strings, or nil for a cell with no content.
#
# Only the first character of each delimiter argument is used.
#
#   file          - IO-like object responding to eof? and getc
#   cell_delim    - separates cells; kept as data inside a quoted string or
#                   when escaped
#   string_delim  - opens and closes a quoted string
#   row_delim     - ends the row when seen outside a quoted string; kept as
#                   data inside one
#   esc_delim     - escape character; it is dropped and only affects the
#                   single character that follows it. An escaped string
#                   delimiter (inside a string), cell delimiter or escape
#                   character is kept as data.
#   chars_to_elim - characters removed from every non-empty cell (passed to
#                   String#tr with an empty replacement); the cell is then
#                   stripped of surrounding whitespace
#
# Returns nil when a block is given, otherwise an Enumerator over the rows.
def linesafe_parse_csv(file, cell_delim, string_delim, row_delim,
                       esc_delim, chars_to_elim)
  unless block_given?
    return enum_for(:linesafe_parse_csv, file, cell_delim, string_delim,
                    row_delim, esc_delim, chars_to_elim)
  end

  # String#[] with an integer index and IO#getc return the same kind of
  # object (an Integer on Ruby 1.8, a one-character String on 1.9+), so the
  # comparisons below work on either.
  str_dim_i = string_delim[0]
  cell_dim_i = cell_delim[0]
  row_dim_i = row_delim[0]
  esc_dim_i = esc_delim[0]

  # Turns the raw accumulated text of a cell into the value stored in the row.
  finish_cell = lambda do |raw|
    raw == '' ? nil : raw.tr(chars_to_elim, '').strip
  end

  # loop until the end of file
  until file.eof?
    row = []
    in_str = false
    in_esc = false
    newrow = false
    value = ''.dup

    # loop through and parse a row
    until newrow || file.eof?
      char = file.getc

      if char == str_dim_i
        if !in_str
          in_str = true
        elsif in_esc
          value << char
        else
          in_str = false
        end
        in_esc = false
      elsif char == esc_dim_i
        # a doubled escape character is a literal escape character
        value << char if in_esc
        in_esc = !in_esc
      elsif char == row_dim_i
        if in_str
          value << char
        else
          row << finish_cell.call(value)
          newrow = true
        end
        in_esc = false
      elsif char == cell_dim_i
        if in_str || in_esc
          value << char
        else
          row << finish_cell.call(value)
          value = ''.dup
        end
        in_esc = false
      else
        value << char
        # an escape only applies to the character right after it; without
        # this reset a later closing quote would be treated as escaped
        in_esc = false
      end
    end

    unless newrow
      # The input ended without a final row delimiter: keep the cell that was
      # still being read instead of dropping it. Leftover text that cleans to
      # nothing on an otherwise empty row still yields an empty row, as the
      # parser always has.
      cell = finish_cell.call(value)
      row << cell unless row.empty? && cell.to_s.empty?
    end

    # hand the row to the caller's block
    yield row
  end
end