# = DelimParser
#
# A DelimParser parses a line of delimited data and returns an array of column
# values. It handles delimiters, quotes, and doubled quotes within the data.
# DelimParsers are useful for reading .csv (comma-delimited) and
# tab-delimited files.
#
# The delimiter character is set upon construction but may be modified
# between calls to #parse by assigning to #delim=
# (<tt>parser.delim = new_delim_char</tt>).
#
# == Example
#
# In this rather silly example, we read a comma-delimited file and print
# the data in each line separated by asterisks.
#
#   parser = DelimParser.new(',')
#   File.foreach('foo.csv') { |line|
#     column_data = parser.parse(line)
#     puts column_data.join('*')
#   }
class DelimParser
  # The delimiter: a string containing a single character.
  attr_accessor :delim

  # The quote character: a string containing a single character (either a
  # single or a double quote).
  attr_accessor :quote

  # The number of the line most recently handed to #parse. It starts at 0 and
  # is incremented at the beginning of every #parse call, so the first line is
  # line 1. It is never reset automatically; assign 0 to start counting again.
  attr_accessor :line_number

  # Sets delimiter and quote characters. Default delimiter is the TAB
  # character. Default quote is the double-quote.
  def initialize(delim = "\t", quote = '"')
    @delim = delim
    @quote = quote
    @line_number = 0
  end

  # Given a line of delimiter-separated data, return an array of column
  # data. Handles delimiters and quotes within the data just as they are
  # generated by Excel comma- and tab-delimited files.
  #
  # Empty columns are returned as empty strings. That is, two delimiter
  # characters in a row will cause an empty string (not nil or
  # 'NULL') to appear in the returned array. An empty line yields an
  # empty array.
  #
  # The line ending characters "\n" and "\r" are removed before parsing.
  # This takes care of Unix, Mac, and DOS line endings. The +line+ argument
  # itself is never modified, so frozen strings may be passed.
  #
  # A backslash escapes the following character (it is copied verbatim).
  # A backslash that is the last character of the line cannot be processed;
  # a warning is written to $stderr and the backslash is dropped. If the
  # line ends inside a quoted column, an error message is written to
  # $stderr and whatever was collected so far is returned.
  #
  # When Ruby's global $DEBUG flag is set, a trace of the state machine is
  # written to standard output.
  #
  # In this rather silly example, we read a comma-delimited file and print
  # the data in each line separated by asterisks.
  #
  #   parser = DelimParser.new(',')
  #   File.foreach('foo.csv') { |line|
  #     column_data = parser.parse(line)
  #     puts column_data.join('*')
  #   }
  def parse(line)
    @line_number += 1

    # Strip line ending characters from a copy so the caller's string is
    # left untouched. We will want to change this code if we ever handle
    # backslash-escaped newlines.
    line = line.delete("\r\n")

    col_index = 0
    columns = []
    start_of_column = true
    whole_col_is_quoted = false

    # We march through each character because we need a state machine
    # to parse quoted quotes and delim chars within quotes correctly.
    # An explicit index is used (rather than an iterator) because we need
    # to peek ahead, and sometimes skip a character, when dealing with
    # quote and backslash chars.
    prev_char = nil
    chars = line.chars
    last_index = chars.length - 1
    buf_ptr = 0
    while buf_ptr < chars.length
      c = chars[buf_ptr]
      puts "c = #{c}" if $DEBUG

      if start_of_column
        # Fresh mutable buffer for the column we are about to fill.
        columns[col_index] = ''.dup
        puts "start of column" if $DEBUG
      end

      case c
      when '\\' # Backslash escapes next character
        puts "backslash" if $DEBUG
        if buf_ptr == last_index # last thing on line
          puts "backslash is at end of line" if $DEBUG
          # can't handle this yet because we don't have access
          # to the next line of data
          $stderr.puts "warning on line #{@line_number}: " \
                       "can not yet process backslashes at end of line;" \
                       " backslash ignored"
          # the loop ends after this iteration
        else
          buf_ptr += 1
          columns[col_index] << chars[buf_ptr]
          puts "backslash: out next char '#{chars[buf_ptr]}'" if $DEBUG
        end
        start_of_column = false
      when @delim # Delimiter character
        puts "delim seen" if $DEBUG
        if whole_col_is_quoted # Inside quoted column: delimiter is data
          puts "delim is inside quotes" if $DEBUG
          columns[col_index] << @delim
          start_of_column = false
        else # Normal delimiter: move on to the next column
          puts "delim is not inside quotes" if $DEBUG
          col_index += 1
          start_of_column = true
        end
      when @quote # Quote character
        puts "quote seen" if $DEBUG
        if start_of_column # quote is first char of column
          puts "quote is at start of column" if $DEBUG
          whole_col_is_quoted = true
        elsif buf_ptr == last_index # quote is last char in line
          puts "quote is last char in line" if $DEBUG
          unless whole_col_is_quoted
            columns[col_index] << @quote
            puts "not true whole col quoted; copying" \
                 " quote char to output (this is ok)" if $DEBUG
          end
        else # quote is somewhere in the middle
          # peek ahead to next char
          next_char_peek = chars[buf_ptr + 1]
          puts "quote not at start and not at line end:" \
               " peek ahead char is '#{next_char_peek}'" if $DEBUG
          case next_char_peek
          when @quote # char after quote is another quote
            puts "quote peek char is also a quote" if $DEBUG
            columns[col_index] << @quote
            # copy second if whole column is not quoted
            columns[col_index] << @quote unless whole_col_is_quoted
            buf_ptr += 1 # the peeked quote has been consumed
          when @delim # char after quote is delim; we are done
            puts "quote peek char is delim; string done" if $DEBUG
            # only copy quote if whole col is not quoted
            unless whole_col_is_quoted
              columns[col_index] << @quote
              puts "copying quote to output" if $DEBUG
            end
            whole_col_is_quoted = false
          else # everything but quote or delim
            columns[col_index] << @quote
            puts "quote peek char is something else;" \
                 " copying quote to output" if $DEBUG
          end
        end
        start_of_column = false
      else # Any other old char
        columns[col_index] << c
        start_of_column = false
      end

      prev_char = c
      buf_ptr += 1
    end

    # Still inside a quoted column and the line did not end on a quote:
    # the column data was cut off by the end of the line.
    if whole_col_is_quoted && prev_char != @quote
      $stderr.puts "error on line #{@line_number}: " \
                   "end of line seen when inside column data"
      $stderr.puts "data line = #{line}"
    end

    # A trailing delimiter opens one last column that never received a
    # character; report it as an empty string.
    columns[col_index] = ''.dup if prev_char == @delim && columns[col_index].nil?

    columns
  end
end