Commit 29ea82d0 authored by Joe Tsai's avatar Joe Tsai Committed by Joe Tsai

encoding/csv: add ParseError.RecordLine

CL 72150 fixes #22352 by reverting the problematic parts of that CL
where the line number and column number were inconsistent with each other.
This CL adds back functionality to address the issue that CL 72150
was trying to solve in the first place. That is, it reports the starting
line of the record, so that users have a frame of reference to start with
when debugging what went wrong.

In the event of gnarly CSV files with multiline quoted strings, a parse
failure likely occurs somewhere between the start of the record and
the point where the parser finally detected an error.
Since ParserError.{Line,Column} reports where the *error* occurs, we
add a RecordLine field to report where the record starts.

Also take this time to cleanup and modernize TestRead.

Fixes #19019
Fixes #22352

Change-Id: I16cebf0b81922c35f75804c7073e9cddbfd11a04
Reviewed-on: https://go-review.googlesource.com/72310Reviewed-by: default avatarIan Lance Taylor <iant@golang.org>
parent 38cfeb9c
......@@ -62,23 +62,30 @@ import (
)
// A ParseError is returned for parsing errors.
// The first line is 1. The first column is 0.
// Line numbers are 1-indexed and columns are 0-indexed.
type ParseError struct {
RecordLine int // Line where the record starts
Line int // Line where the error occurred
Column int // Column (rune index) where the error occurred
Err error // The actual error
}
func (e *ParseError) Error() string {
return fmt.Sprintf("line %d, column %d: %s", e.Line, e.Column, e.Err)
if e.Err == ErrFieldCount {
return fmt.Sprintf("record on line %d: %v", e.Line, e.Err)
}
if e.RecordLine != e.Line {
return fmt.Sprintf("record on line %d; parse error on line %d, column %d: %v", e.RecordLine, e.Line, e.Column, e.Err)
}
return fmt.Sprintf("parse error on line %d, column %d: %v", e.Line, e.Column, e.Err)
}
// These are the errors that can be returned in ParseError.Error
var (
ErrTrailingComma = errors.New("extra delimiter at end of line") // no longer used
ErrTrailingComma = errors.New("extra delimiter at end of line") // Deprecated: No longer used.
ErrBareQuote = errors.New("bare \" in non-quoted-field")
ErrQuote = errors.New("extraneous \" in field")
ErrFieldCount = errors.New("wrong number of fields in line")
ErrQuote = errors.New("extraneous or missing \" in field")
ErrFieldCount = errors.New("wrong number of fields")
)
// A Reader reads records from a CSV-encoded file.
......@@ -86,17 +93,17 @@ var (
// As returned by NewReader, a Reader expects input conforming to RFC 4180.
// The exported fields can be changed to customize the details before the
// first call to Read or ReadAll.
//
//
type Reader struct {
// Comma is the field delimiter.
// It is set to comma (',') by NewReader.
Comma rune
// Comment, if not 0, is the comment character. Lines beginning with the
// Comment character without preceding whitespace are ignored.
// With leading whitespace the Comment character becomes part of the
// field, even if TrimLeadingSpace is true.
Comment rune
// FieldsPerRecord is the number of expected fields per record.
// If FieldsPerRecord is positive, Read requires each record to
// have the given number of fields. If FieldsPerRecord is 0, Read sets it to
......@@ -104,18 +111,22 @@ type Reader struct {
// have the same field count. If FieldsPerRecord is negative, no check is
// made and records may have a variable number of fields.
FieldsPerRecord int
// If LazyQuotes is true, a quote may appear in an unquoted field and a
// non-doubled quote may appear in a quoted field.
LazyQuotes bool
TrailingComma bool // ignored; here for backwards compatibility
// If TrimLeadingSpace is true, leading white space in a field is ignored.
// This is done even if the field delimiter, Comma, is white space.
TrimLeadingSpace bool
// ReuseRecord controls whether calls to Read may return a slice sharing
// the backing array of the previous call's returned slice for performance.
// By default, each call to Read returns newly allocated memory owned by the caller.
ReuseRecord bool
TrailingComma bool // Deprecated: No longer used.
r *bufio.Reader
// numLine is the current line being read in the CSV file.
......@@ -266,7 +277,7 @@ parseField:
if !r.LazyQuotes {
if j := bytes.IndexByte(field, '"'); j >= 0 {
col := utf8.RuneCount(fullLine[:len(fullLine)-len(line[j:])])
err = &ParseError{Line: r.numLine, Column: col, Err: ErrBareQuote}
err = &ParseError{RecordLine: recLine, Line: r.numLine, Column: col, Err: ErrBareQuote}
break parseField
}
}
......@@ -306,7 +317,7 @@ parseField:
default:
// `"*` squence (invalid non-escaped quote).
col := utf8.RuneCount(fullLine[:len(fullLine)-len(line)-quoteLen])
err = &ParseError{Line: r.numLine, Column: col, Err: ErrQuote}
err = &ParseError{RecordLine: recLine, Line: r.numLine, Column: col, Err: ErrQuote}
break parseField
}
} else if len(line) > 0 {
......@@ -324,7 +335,7 @@ parseField:
// Abrupt end of file (EOF or error).
if !r.LazyQuotes && errRead == nil {
col := utf8.RuneCount(fullLine)
err = &ParseError{Line: r.numLine, Column: col, Err: ErrQuote}
err = &ParseError{RecordLine: recLine, Line: r.numLine, Column: col, Err: ErrQuote}
break parseField
}
r.fieldIndexes = append(r.fieldIndexes, len(r.recordBuffer))
......@@ -354,7 +365,7 @@ parseField:
// Check or update the expected fields per record.
if r.FieldsPerRecord > 0 {
if len(dst) != r.FieldsPerRecord && err == nil {
err = &ParseError{Line: recLine, Err: ErrFieldCount}
err = &ParseError{RecordLine: recLine, Line: recLine, Err: ErrFieldCount}
}
} else if r.FieldsPerRecord == 0 {
r.FieldsPerRecord = len(dst)
......
This diff is collapsed.
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment