Commit 89ccfe49 authored by Joe Tsai's avatar Joe Tsai Committed by Joe Tsai

encoding/csv: simplify and optimize Reader

The Reader implementation is slow because it operates on a rune-by-rune
basis via bufio.Reader.ReadRune. We speed this up by operating on entire
lines that we read from bufio.Reader.ReadSlice.

In order to ensure that we read the full line, we augment ReadSlice
in our Reader.readLine method to automatically expand the slice if
bufio.ErrBufferFull is every hit.

This change happens to fix #19410 because it no longer relies on
rune-by-rune parsing and only searches for the relevant delimiter rune.

In order to keep column accounting simple and consistent, this change
reverts parts of CL 52830.

This CL is an alternative to CL 36270 and builds on some of the ideas
from that change by Diogo Pinela.

name                                     old time/op    new time/op    delta
Read-8                                   3.12µs ± 1%    2.54µs ± 2%  -18.76%   (p=0.000 n=10+9)
ReadWithFieldsPerRecord-8                3.12µs ± 1%    2.53µs ± 1%  -18.91%    (p=0.000 n=9+9)
ReadWithoutFieldsPerRecord-8             3.13µs ± 0%    2.57µs ± 3%  -18.07%  (p=0.000 n=10+10)
ReadLargeFields-8                        52.3µs ± 1%     5.3µs ± 2%  -89.93%   (p=0.000 n=10+9)
ReadReuseRecord-8                        2.05µs ± 1%    1.40µs ± 1%  -31.48%   (p=0.000 n=10+9)
ReadReuseRecordWithFieldsPerRecord-8     2.05µs ± 1%    1.41µs ± 0%  -31.03%   (p=0.000 n=10+9)
ReadReuseRecordWithoutFieldsPerRecord-8  2.06µs ± 1%    1.40µs ± 1%  -31.70%   (p=0.000 n=9+10)
ReadReuseRecordLargeFields-8             50.9µs ± 0%     4.1µs ± 3%  -92.01%  (p=0.000 n=10+10)

name                                     old alloc/op   new alloc/op
Read-8                                       664B ± 0%      664B ± 0%
ReadWithFieldsPerRecord-8                    664B ± 0%      664B ± 0%
ReadWithoutFieldsPerRecord-8                 664B ± 0%      664B ± 0%
ReadLargeFields-8                          3.94kB ± 0%    3.94kB ± 0%
ReadReuseRecord-8                           24.0B ± 0%     24.0B ± 0%
ReadReuseRecordWithFieldsPerRecord-8        24.0B ± 0%     24.0B ± 0%
ReadReuseRecordWithoutFieldsPerRecord-8     24.0B ± 0%     24.0B ± 0%
ReadReuseRecordLargeFields-8               2.98kB ± 0%    2.98kB ± 0%

name                                     old allocs/op  new allocs/op
Read-8                                       18.0 ± 0%      18.0 ± 0%
ReadWithFieldsPerRecord-8                    18.0 ± 0%      18.0 ± 0%
ReadWithoutFieldsPerRecord-8                 18.0 ± 0%      18.0 ± 0%
ReadLargeFields-8                            24.0 ± 0%      24.0 ± 0%
ReadReuseRecord-8                            8.00 ± 0%      8.00 ± 0%
ReadReuseRecordWithFieldsPerRecord-8         8.00 ± 0%      8.00 ± 0%
ReadReuseRecordWithoutFieldsPerRecord-8      8.00 ± 0%      8.00 ± 0%
ReadReuseRecordLargeFields-8                 12.0 ± 0%      12.0 ± 0%

Updates #22352
Updates #19019
Fixes #16791
Fixes #19410

Change-Id: I31c27cfcc56880e6abac262f36c947179b550bbf
Reviewed-on: https://go-review.googlesource.com/72150Reviewed-by: default avatarIan Lance Taylor <iant@golang.org>
Run-TryBot: Joe Tsai <thebrokentoaster@gmail.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
parent 23aad448
......@@ -58,6 +58,7 @@ import (
"fmt"
"io"
"unicode"
"unicode/utf8"
)
// A ParseError is returned for parsing errors.
......@@ -115,20 +116,25 @@ type Reader struct {
// By default, each call to Read returns newly allocated memory owned by the caller.
ReuseRecord bool
line int
recordLine int // line where the current record started
column int
r *bufio.Reader
// lineBuffer holds the unescaped fields read by readField, one after another.
r *bufio.Reader
// numLine is the current line being read in the CSV file.
numLine int
// rawBuffer is a line buffer only used by the readLine method.
rawBuffer []byte
// recordBuffer holds the unescaped fields, one after another.
// The fields can be accessed by using the indexes in fieldIndexes.
// Example: for the row `a,"b","c""d",e` lineBuffer will contain `abc"de` and
// fieldIndexes will contain the indexes 0, 1, 2, 5.
lineBuffer bytes.Buffer
// Indexes of fields inside lineBuffer
// The i'th field starts at offset fieldIndexes[i] in lineBuffer.
// E.g., For the row `a,"b","c""d",e`, recordBuffer will contain `abc"de`
// and fieldIndexes will contain the indexes [1, 2, 5, 6].
recordBuffer []byte
// fieldIndexes is an index of fields inside recordBuffer.
// The i'th field ends at offset fieldIndexes[i] in recordBuffer.
fieldIndexes []int
// only used when ReuseRecord == true
// lastRecord is a record cache and only used when ReuseRecord == true.
lastRecord []string
}
......@@ -140,15 +146,6 @@ func NewReader(r io.Reader) *Reader {
}
}
// error creates a new ParseError based on err.
func (r *Reader) error(err error) error {
return &ParseError{
Line: r.recordLine,
Column: r.column,
Err: err,
}
}
// Read reads one record (a slice of fields) from r.
// If the record has an unexpected number of fields,
// Read returns the record along with the error ErrFieldCount.
......@@ -164,7 +161,6 @@ func (r *Reader) Read() (record []string, err error) {
} else {
record, err = r.readRecord(nil)
}
return record, err
}
......@@ -186,237 +182,182 @@ func (r *Reader) ReadAll() (records [][]string, err error) {
}
}
// readRecord reads and parses a single csv record from r.
// Unlike parseRecord, readRecord handles FieldsPerRecord.
// If dst has enough capacity it will be used for the returned record.
func (r *Reader) readRecord(dst []string) (record []string, err error) {
for {
record, err = r.parseRecord(dst)
if record != nil {
break
}
if err != nil {
return nil, err
// readLine reads the next line (with the trailing endline).
// If EOF is hit without a trailing endline, it will be omitted.
// If some bytes were read, then the error is never io.EOF.
// The result is only valid until the next call to readLine.
func (r *Reader) readLine() ([]byte, error) {
line, err := r.r.ReadSlice('\n')
if err == bufio.ErrBufferFull {
r.rawBuffer = append(r.rawBuffer[:0], line...)
for err == bufio.ErrBufferFull {
line, err = r.r.ReadSlice('\n')
r.rawBuffer = append(r.rawBuffer, line...)
}
line = r.rawBuffer
}
if r.FieldsPerRecord > 0 {
if len(record) != r.FieldsPerRecord {
r.column = 0 // report at start of record
return record, r.error(ErrFieldCount)
}
} else if r.FieldsPerRecord == 0 {
r.FieldsPerRecord = len(record)
if len(line) > 0 && err == io.EOF {
err = nil
}
return record, nil
r.numLine++
return line, err
}
// readRune reads one rune from r, folding \r\n to \n and keeping track
// of how far into the line we have read. r.column will point to the start
// of this rune, not the end of this rune.
func (r *Reader) readRune() (rune, error) {
r1, _, err := r.r.ReadRune()
// Handle \r\n here. We make the simplifying assumption that
// anytime \r is followed by \n that it can be folded to \n.
// We will not detect files which contain both \r\n and bare \n.
if r1 == '\r' {
r1, _, err = r.r.ReadRune()
if err == nil {
if r1 != '\n' {
r.r.UnreadRune()
r1 = '\r'
}
// lengthCRLF reports the number of bytes for a trailing "\r\n".
func lengthCRLF(b []byte) int {
if j := len(b) - 1; j >= 0 && b[j] == '\n' {
if j := len(b) - 2; j >= 0 && b[j] == '\r' {
return 2
}
return 1
}
r.column++
return r1, err
return 0
}
// readRawRune works the same way as readRune, but does not fold \r\n to \n.
func (r *Reader) readRawRune() (rune, error) {
r1, _, err := r.r.ReadRune()
r.column++
return r1, err
// nextRune returns the next rune in b or utf8.RuneError.
func nextRune(b []byte) rune {
r, _ := utf8.DecodeRune(b)
return r
}
// skip reads runes up to and including the rune delim or until error.
func (r *Reader) skip(delim rune) error {
for {
r1, err := r.readRune()
if err != nil {
return err
func (r *Reader) readRecord(dst []string) ([]string, error) {
// Read line (automatically skipping past empty lines and any comments).
var line, fullLine []byte
var errRead error
for errRead == nil {
line, errRead = r.readLine()
if r.Comment != 0 && nextRune(line) == r.Comment {
line = nil
continue // Skip comment lines
}
if r1 == delim {
return nil
if errRead == nil && len(line) == lengthCRLF(line) {
line = nil
continue // Skip empty lines
}
fullLine = line
break
}
}
// parseRecord reads and parses a single csv record from r.
// If dst has enough capacity it will be used for the returned fields.
func (r *Reader) parseRecord(dst []string) (fields []string, err error) {
// Each record starts on a new line. We increment our line
// number (lines start at 1, not 0) and set column to -1
// so as we increment in readRune it points to the character we read.
// We track the line where the record starts in recordLine for use in errors.
r.line++
r.recordLine = r.line
r.column = -1
// Peek at the first rune. If it is an error we are done.
// If we support comments and it is the comment character
// then skip to the end of line.
r1, _, err := r.r.ReadRune()
if err != nil {
return nil, err
}
if r.Comment != 0 && r1 == r.Comment {
return nil, r.skip('\n')
if errRead == io.EOF {
return nil, errRead
}
r.r.UnreadRune()
r.lineBuffer.Reset()
// Parse each field in the record.
var err error
const quoteLen = len(`"`)
commaLen := utf8.RuneLen(r.Comma)
recLine := r.numLine // Starting line for record
r.recordBuffer = r.recordBuffer[:0]
r.fieldIndexes = r.fieldIndexes[:0]
// At this point we have at least one field.
parseField:
for {
idx := r.lineBuffer.Len()
haveField, delim, err := r.parseField()
if haveField {
r.fieldIndexes = append(r.fieldIndexes, idx)
if r.TrimLeadingSpace {
line = bytes.TrimLeftFunc(line, unicode.IsSpace)
}
if delim == '\n' || err == io.EOF {
if len(r.fieldIndexes) == 0 {
return nil, err
if len(line) == 0 || line[0] != '"' {
// Non-quoted string field
i := bytes.IndexRune(line, r.Comma)
field := line
if i >= 0 {
field = field[:i]
} else {
field = field[:len(field)-lengthCRLF(field)]
}
break
}
if err != nil {
return nil, err
}
}
fieldCount := len(r.fieldIndexes)
// Using this approach (creating a single string and taking slices of it)
// means that a single reference to any of the fields will retain the whole
// string. The risk of a nontrivial space leak caused by this is considered
// minimal and a tradeoff for better performance through the combined
// allocations.
line := r.lineBuffer.String()
if cap(dst) >= fieldCount {
fields = dst[:fieldCount]
} else {
fields = make([]string, fieldCount)
}
for i, idx := range r.fieldIndexes {
if i == fieldCount-1 {
fields[i] = line[idx:]
} else {
fields[i] = line[idx:r.fieldIndexes[i+1]]
}
}
return fields, nil
}
// parseField parses the next field in the record. The read field is
// appended to r.lineBuffer. Delim is the first character not part of the field
// (r.Comma or '\n').
func (r *Reader) parseField() (haveField bool, delim rune, err error) {
r1, err := r.readRune()
for err == nil && r.TrimLeadingSpace && r1 != '\n' && unicode.IsSpace(r1) {
r1, err = r.readRune()
}
if err == io.EOF && r.column != 0 {
return true, 0, err
}
if err != nil {
return false, 0, err
}
switch r1 {
case r.Comma:
// will check below
case '\n':
// We are a trailing empty field or a blank line
if r.column == 0 {
return false, r1, nil
}
return true, r1, nil
case '"':
// quoted field
Quoted:
for {
// use readRawRune instead of readRune to preserve \r\n
// in quotes fields.
r1, err = r.readRawRune()
if err != nil {
if err == io.EOF {
if r.LazyQuotes {
return true, 0, err
}
return false, 0, r.error(ErrQuote)
// Check to make sure a quote does not appear in field.
if !r.LazyQuotes {
if j := bytes.IndexByte(field, '"'); j >= 0 {
col := utf8.RuneCount(fullLine[:len(fullLine)-len(line[j:])])
err = &ParseError{Line: r.numLine, Column: col, Err: ErrBareQuote}
break parseField
}
return false, 0, err
}
switch r1 {
case '"':
r1, err = r.readRune()
if err != nil || r1 == r.Comma {
break Quoted
}
if r1 == '\n' {
return true, r1, nil
}
if r1 != '"' {
if !r.LazyQuotes {
r.column--
return false, 0, r.error(ErrQuote)
r.recordBuffer = append(r.recordBuffer, field...)
r.fieldIndexes = append(r.fieldIndexes, len(r.recordBuffer))
if i >= 0 {
line = line[i+commaLen:]
continue parseField
}
break parseField
} else {
// Quoted string field
line = line[quoteLen:]
for {
i := bytes.IndexByte(line, '"')
if i >= 0 {
// Hit next quote.
r.recordBuffer = append(r.recordBuffer, line[:i]...)
line = line[i+quoteLen:]
switch rn := nextRune(line); {
case rn == '"':
// `""` sequence (append quote).
r.recordBuffer = append(r.recordBuffer, '"')
line = line[quoteLen:]
case rn == r.Comma:
// `",` sequence (end of field).
line = line[commaLen:]
r.fieldIndexes = append(r.fieldIndexes, len(r.recordBuffer))
continue parseField
case lengthCRLF(line) == len(line):
// `"\n` sequence (end of line).
r.fieldIndexes = append(r.fieldIndexes, len(r.recordBuffer))
break parseField
case r.LazyQuotes:
// `"` sequence (bare quote).
r.recordBuffer = append(r.recordBuffer, '"')
default:
// `"*` squence (invalid non-escaped quote).
col := utf8.RuneCount(fullLine[:len(fullLine)-len(line)-quoteLen])
err = &ParseError{Line: r.numLine, Column: col, Err: ErrQuote}
break parseField
}
} else if len(line) > 0 {
// Hit end of line (copy all data so far).
r.recordBuffer = append(r.recordBuffer, line...)
if errRead != nil {
break parseField
}
line, errRead = r.readLine()
if errRead == io.EOF {
errRead = nil
}
// accept the bare quote
r.lineBuffer.WriteRune('"')
fullLine = line
} else {
// Abrupt end of file (EOF or error).
if !r.LazyQuotes && errRead == nil {
col := utf8.RuneCount(fullLine)
err = &ParseError{Line: r.numLine, Column: col, Err: ErrQuote}
break parseField
}
r.fieldIndexes = append(r.fieldIndexes, len(r.recordBuffer))
break parseField
}
case '\n':
r.line++
r.column = -1
}
r.lineBuffer.WriteRune(r1)
}
}
if err == nil {
err = errRead
}
default:
// unquoted field
for {
r.lineBuffer.WriteRune(r1)
r1, err = r.readRune()
if err != nil || r1 == r.Comma {
break
}
if r1 == '\n' {
return true, r1, nil
}
if !r.LazyQuotes && r1 == '"' {
return false, 0, r.error(ErrBareQuote)
}
}
// Create a single string and create slices out of it.
// This pins the memory of the fields together, but allocates once.
str := string(r.recordBuffer) // Convert to string once to batch allocations
dst = dst[:0]
if cap(dst) < len(r.fieldIndexes) {
dst = make([]string, len(r.fieldIndexes))
}
dst = dst[:len(r.fieldIndexes)]
var preIdx int
for i, idx := range r.fieldIndexes {
dst[i] = str[preIdx:idx]
preIdx = idx
}
if err != nil {
if err == io.EOF {
return true, 0, err
// Check or update the expected fields per record.
if r.FieldsPerRecord > 0 {
if len(dst) != r.FieldsPerRecord && err == nil {
err = &ParseError{Line: recLine, Err: ErrFieldCount}
}
return false, 0, err
} else if r.FieldsPerRecord == 0 {
r.FieldsPerRecord = len(dst)
}
return true, r1, nil
return dst, err
}
......@@ -26,9 +26,8 @@ var readTests = []struct {
TrimLeadingSpace bool
ReuseRecord bool
Error string
Line int // Expected error line if != 0
Column int // Expected error column if line != 0
Error error
Line int // Expected error line if != 0
}{
{
Name: "Simple",
......@@ -140,7 +139,7 @@ field"`,
{
Name: "BadDoubleQuotes",
Input: `a""b,c`,
Error: `bare " in non-quoted-field`, Line: 1, Column: 1,
Error: &ParseError{Line: 1, Column: 1, Err: ErrBareQuote},
},
{
Name: "TrimQuote",
......@@ -151,30 +150,30 @@ field"`,
{
Name: "BadBareQuote",
Input: `a "word","b"`,
Error: `bare " in non-quoted-field`, Line: 1, Column: 2,
Error: &ParseError{Line: 1, Column: 2, Err: ErrBareQuote},
},
{
Name: "BadTrailingQuote",
Input: `"a word",b"`,
Error: `bare " in non-quoted-field`, Line: 1, Column: 10,
Error: &ParseError{Line: 1, Column: 10, Err: ErrBareQuote},
},
{
Name: "ExtraneousQuote",
Input: `"a "word","b"`,
Error: `extraneous " in field`, Line: 1, Column: 3,
Error: &ParseError{Line: 1, Column: 3, Err: ErrQuote},
},
{
Name: "BadFieldCount",
UseFieldsPerRecord: true,
Input: "a,b,c\nd,e",
Error: "wrong number of fields", Line: 2,
Error: &ParseError{Line: 2, Err: ErrFieldCount},
},
{
Name: "BadFieldCount1",
UseFieldsPerRecord: true,
FieldsPerRecord: 2,
Input: `a,b,c`,
Error: "wrong number of fields", Line: 1,
Error: &ParseError{Line: 1, Err: ErrFieldCount},
},
{
Name: "FieldCount",
......@@ -271,18 +270,14 @@ x,,,
},
},
{ // issue 19019
Name: "RecordLine1",
Input: "a,\"b\nc\"d,e",
Error: `extraneous " in field`,
Line: 1,
Column: 1,
Name: "RecordLine1",
Input: "a,\"b\nc\"d,e",
Error: &ParseError{Line: 2, Column: 1, Err: ErrQuote},
},
{
Name: "RecordLine2",
Input: "a,b\n\"d\n\n,e",
Error: `extraneous " in field`,
Line: 2,
Column: 2,
Name: "RecordLine2",
Input: "a,b\n\"d\n\n,e",
Error: &ParseError{Line: 5, Column: 0, Err: ErrQuote},
},
{ // issue 21201
Name: "CRLFInQuotedField",
......@@ -291,6 +286,95 @@ x,,,
{"Hello\r\nHi"},
},
},
{ // issue 19410
Name: "BinaryBlobField",
Input: "x09\x41\xb4\x1c,aktau",
Output: [][]string{{"x09A\xb4\x1c", "aktau"}},
},
{
Name: "TrailingCR",
Input: "field1,field2\r",
Output: [][]string{{"field1", "field2\r"}},
},
{
Name: "NonASCIICommaAndComment",
TrimLeadingSpace: true,
Comma: '£',
Comment: '€',
Input: "a£b,c£ \td,e\n€ comment\n",
Output: [][]string{{"a", "b,c", "d,e"}},
},
{
Name: "NonASCIICommaAndCommentWithQuotes",
Comma: '€',
Comment: 'λ',
Input: "a€\" b,\"€ c\nλ comment\n",
Output: [][]string{{"a", " b,", " c"}},
},
{
Name: "NonASCIICommaConfusion",
Comma: 'λ',
Comment: '€',
// λ and θ start with the same byte. This test is intended to ensure the parser doesn't
// confuse such characters.
Input: "\"abθcd\"λefθgh",
Output: [][]string{{"abθcd", "efθgh"}},
},
{
Name: "NonASCIICommentConfusion",
Comment: 'θ',
Input: \nλ\nθ\nλ\n",
Output: [][]string{{"λ"}, {"λ"}, {"λ"}},
},
{
Name: "QuotedFieldMultipleLF",
Input: "\"\n\n\n\n\"",
Output: [][]string{{"\n\n\n\n"}},
},
{
Name: "MultipleCRLF",
Input: "\r\n\r\n\r\n\r\n",
},
{
// The implementation may read each line in several chunks if it doesn't fit entirely
// in the read buffer, so we should test the code to handle that condition.
Name: "HugeLines",
Comment: '#',
Input: strings.Repeat("#ignore\n", 10000) + strings.Repeat("@", 5000) + "," + strings.Repeat("*", 5000),
Output: [][]string{{strings.Repeat("@", 5000), strings.Repeat("*", 5000)}},
},
{
Name: "QuoteWithTrailingCRLF",
Input: "\"foo\"bar\"\r\n",
Error: &ParseError{Line: 1, Column: 4, Err: ErrQuote},
},
{
Name: "LazyQuoteWithTrailingCRLF",
Input: "\"foo\"bar\"\r\n",
LazyQuotes: true,
Output: [][]string{{`foo"bar`}},
},
{
Name: "DoubleQuoteWithTrailingCRLF",
Input: "\"foo\"\"bar\"\r\n",
Output: [][]string{{`foo"bar`}},
},
{
Name: "EvenQuotes",
Input: `""""""""`,
Output: [][]string{{`"""`}},
},
{
Name: "OddQuotes",
Input: `"""""""`,
Error: &ParseError{Line: 1, Column: 7, Err: ErrQuote},
},
{
Name: "LazyOddQuotes",
Input: `"""""""`,
LazyQuotes: true,
Output: [][]string{{`"""`}},
},
}
func TestRead(t *testing.T) {
......@@ -310,17 +394,10 @@ func TestRead(t *testing.T) {
r.Comma = tt.Comma
}
out, err := r.ReadAll()
perr, _ := err.(*ParseError)
if tt.Error != "" {
if err == nil || !strings.Contains(err.Error(), tt.Error) {
t.Errorf("%s: error %v, want error %q", tt.Name, err, tt.Error)
} else if tt.Line != 0 && (tt.Line != perr.Line || tt.Column != perr.Column) {
t.Errorf("%s: error at %d:%d expected %d:%d", tt.Name, perr.Line, perr.Column, tt.Line, tt.Column)
}
} else if err != nil {
t.Errorf("%s: unexpected error %v", tt.Name, err)
if !reflect.DeepEqual(err, tt.Error) {
t.Errorf("%s: ReadAll() error:\ngot %v\nwant %v", tt.Name, err, tt.Error)
} else if !reflect.DeepEqual(out, tt.Output) {
t.Errorf("%s: out=%q want %q", tt.Name, out, tt.Output)
t.Errorf("%s: ReadAll() output:\ngot %q\nwant %q", tt.Name, out, tt.Output)
}
}
}
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment