Commit 1d47a145 authored by Russ Cox

encoding/csv: restore Go 1.9 quoted \r\n handling in Reader

CL 52810 changed Reader to interpret a quoted \r\n as a raw \r\n
when reading fields. This seems likely to break existing users, and
discussion on both #21201 (the original issue that triggered the change)
and #22746 (discussing whether to revert the change) failed to identify
a single motivating example for this change. To avoid breaking existing
users for no clear reason, revert the change.

The Reader has been rewritten in the interim so this is not a git revert
but instead an adjustment (and slight simplification) of the new Reader.

Fixes #22746.

Change-Id: Ie857b2f4b1359a207d085b6d3c3a6d440a997d12
Reviewed-on: https://go-review.googlesource.com/78295
Reviewed-by: Joe Tsai <thebrokentoaster@gmail.com>
parent 918b98ca
...@@ -99,15 +99,24 @@ func validDelim(r rune) bool { ...@@ -99,15 +99,24 @@ func validDelim(r rune) bool {
// As returned by NewReader, a Reader expects input conforming to RFC 4180. // As returned by NewReader, a Reader expects input conforming to RFC 4180.
// The exported fields can be changed to customize the details before the // The exported fields can be changed to customize the details before the
// first call to Read or ReadAll. // first call to Read or ReadAll.
//
// The Reader converts all \r\n sequences in its input to plain \n,
// including in multiline field values, so that the returned data does
// not depend on which line-ending convention an input file uses.
type Reader struct { type Reader struct {
// Comma is the field delimiter. // Comma is the field delimiter.
// It is set to comma (',') by NewReader. // It is set to comma (',') by NewReader.
// Comma must be a valid rune and must not be \r, \n,
// or the Unicode replacement character (0xFFFD).
Comma rune Comma rune
// Comment, if not 0, is the comment character. Lines beginning with the // Comment, if not 0, is the comment character. Lines beginning with the
// Comment character without preceding whitespace are ignored. // Comment character without preceding whitespace are ignored.
// With leading whitespace the Comment character becomes part of the // With leading whitespace the Comment character becomes part of the
// field, even if TrimLeadingSpace is true. // field, even if TrimLeadingSpace is true.
// Comment must be a valid rune and must not be \r, \n,
// or the Unicode replacement character (0xFFFD).
// It must also not be equal to Comma.
Comment rune Comment rune
// FieldsPerRecord is the number of expected fields per record. // FieldsPerRecord is the number of expected fields per record.
...@@ -217,15 +226,17 @@ func (r *Reader) readLine() ([]byte, error) { ...@@ -217,15 +226,17 @@ func (r *Reader) readLine() ([]byte, error) {
err = nil err = nil
} }
r.numLine++ r.numLine++
// Normalize \r\n to \n on all input lines.
if n := len(line); n >= 2 && line[n-2] == '\r' && line[n-1] == '\n' {
line[n-2] = '\n'
line = line[:n-1]
}
return line, err return line, err
} }
// lengthCRLF reports the number of bytes for a trailing "\r\n". // lengthNL reports the number of bytes for the trailing \n.
func lengthCRLF(b []byte) int { func lengthNL(b []byte) int {
if j := len(b) - 1; j >= 0 && b[j] == '\n' { if len(b) > 0 && b[len(b)-1] == '\n' {
if j := len(b) - 2; j >= 0 && b[j] == '\r' {
return 2
}
return 1 return 1
} }
return 0 return 0
...@@ -251,7 +262,7 @@ func (r *Reader) readRecord(dst []string) ([]string, error) { ...@@ -251,7 +262,7 @@ func (r *Reader) readRecord(dst []string) ([]string, error) {
line = nil line = nil
continue // Skip comment lines continue // Skip comment lines
} }
if errRead == nil && len(line) == lengthCRLF(line) { if errRead == nil && len(line) == lengthNL(line) {
line = nil line = nil
continue // Skip empty lines continue // Skip empty lines
} }
...@@ -281,7 +292,7 @@ parseField: ...@@ -281,7 +292,7 @@ parseField:
if i >= 0 { if i >= 0 {
field = field[:i] field = field[:i]
} else { } else {
field = field[:len(field)-lengthCRLF(field)] field = field[:len(field)-lengthNL(field)]
} }
// Check to make sure a quote does not appear in field. // Check to make sure a quote does not appear in field.
if !r.LazyQuotes { if !r.LazyQuotes {
...@@ -317,7 +328,7 @@ parseField: ...@@ -317,7 +328,7 @@ parseField:
line = line[commaLen:] line = line[commaLen:]
r.fieldIndexes = append(r.fieldIndexes, len(r.recordBuffer)) r.fieldIndexes = append(r.fieldIndexes, len(r.recordBuffer))
continue parseField continue parseField
case lengthCRLF(line) == len(line): case lengthNL(line) == len(line):
// `"\n` sequence (end of line). // `"\n` sequence (end of line).
r.fieldIndexes = append(r.fieldIndexes, len(r.recordBuffer)) r.fieldIndexes = append(r.fieldIndexes, len(r.recordBuffer))
break parseField break parseField
......
...@@ -235,9 +235,9 @@ x,,, ...@@ -235,9 +235,9 @@ x,,,
Error: &ParseError{StartLine: 2, Line: 5, Column: 0, Err: ErrQuote}, Error: &ParseError{StartLine: 2, Line: 5, Column: 0, Err: ErrQuote},
}, { }, {
Name: "CRLFInQuotedField", // Issue 21201 Name: "CRLFInQuotedField", // Issue 21201
Input: "\"Hello\r\nHi\"", Input: "A,\"Hello\r\nHi\",B\r\n",
Output: [][]string{ Output: [][]string{
{"Hello\r\nHi"}, {"A", "Hello\nHi", "B"},
}, },
}, { }, {
Name: "BinaryBlobField", // Issue 19410 Name: "BinaryBlobField", // Issue 19410
......
...@@ -20,7 +20,7 @@ import ( ...@@ -20,7 +20,7 @@ import (
// //
// Comma is the field delimiter. // Comma is the field delimiter.
// //
// If UseCRLF is true, the Writer ends each record with \r\n instead of \n. // If UseCRLF is true, the Writer ends each output line with \r\n instead of \n.
type Writer struct { type Writer struct {
Comma rune // Field delimiter (set to ',' by NewWriter) Comma rune // Field delimiter (set to ',' by NewWriter)
UseCRLF bool // True to use \r\n as the line terminator UseCRLF bool // True to use \r\n as the line terminator
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment