sql, csv: Cache some values between Read() calls to gain performance (#9645)

Below is the benchmark enhancement after this commit:

benchmark                                            old ns/op     new ns/op     delta
BenchmarkRead-8                                      2807          2189          -22.02%
BenchmarkReadWithFieldsPerRecord-8                   2802          2179          -22.23%
BenchmarkReadWithoutFieldsPerRecord-8                2824          2181          -22.77%
BenchmarkReadLargeFields-8                           3584          3371          -5.94%
BenchmarkReadReuseRecord-8                           2044          1480          -27.59%
BenchmarkReadReuseRecordWithFieldsPerRecord-8        2056          1483          -27.87%
BenchmarkReadReuseRecordWithoutFieldsPerRecord-8     2047          1482          -27.60%
BenchmarkReadReuseRecordLargeFields-8                2777          2594          -6.59%

benchmark                                            old allocs     new allocs     delta
BenchmarkRead-8                                      26             16             -38.46%
BenchmarkReadWithFieldsPerRecord-8                   26             16             -38.46%
BenchmarkReadWithoutFieldsPerRecord-8                26             16             -38.46%
BenchmarkReadLargeFields-8                           36             24             -33.33%
BenchmarkReadReuseRecord-8                           16             6              -62.50%
BenchmarkReadReuseRecordWithFieldsPerRecord-8        16             6              -62.50%
BenchmarkReadReuseRecordWithoutFieldsPerRecord-8     16             6              -62.50%
BenchmarkReadReuseRecordLargeFields-8                24             12             -50.00%

benchmark                                            old bytes     new bytes     delta
BenchmarkRead-8                                      672           664           -1.19%
BenchmarkReadWithFieldsPerRecord-8                   672           664           -1.19%
BenchmarkReadWithoutFieldsPerRecord-8                672           664           -1.19%
BenchmarkReadLargeFields-8                           3948          3936          -0.30%
BenchmarkReadReuseRecord-8                           32            24            -25.00%
BenchmarkReadReuseRecordWithFieldsPerRecord-8        32            24            -25.00%
BenchmarkReadReuseRecordWithoutFieldsPerRecord-8     32            24            -25.00%
BenchmarkReadReuseRecordLargeFields-8                2988          2976          -0.40%
master
Anis Elleuch 4 years ago committed by GitHub
parent bede525dc9
commit 6542bc4a03
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
  1. 59
      pkg/csvparser/reader.go

@ -171,6 +171,14 @@ type Reader struct {
// lastRecord is a record cache and only used when ReuseRecord == true.
lastRecord []string
// Caching some values between Read() calls for performance gain
cached bool
cachedQuoteEscapeLen int
cachedQuoteLen int
cachedEncodedQuote []byte
cachedCommaLen int
cachedQuotes string
}
// NewReader returns a new Reader that reads from r.
@ -295,21 +303,20 @@ func (r *Reader) readRecord(dst []string) ([]string, error) {
return nil, errRead
}
var quoteEscape = r.QuoteEscape
var quoteEscapeLen = utf8.RuneLen(quoteEscape)
var quote rune
var quoteLen int
if len(r.Quote) > 0 {
quote = r.Quote[0]
quoteLen = utf8.RuneLen(quote)
if !r.cached {
r.cachedQuoteEscapeLen = utf8.RuneLen(r.QuoteEscape)
if len(r.Quote) > 0 {
r.cachedQuoteLen = utf8.RuneLen(r.Quote[0])
r.cachedEncodedQuote = encodeRune(r.Quote[0])
r.cachedQuotes += string(r.Quote[0])
}
r.cachedCommaLen = utf8.RuneLen(r.Comma)
r.cachedQuotes += string(r.QuoteEscape)
r.cached = true
}
encodedQuote := encodeRune(quote)
// Parse each field in the record.
var err error
commaLen := utf8.RuneLen(r.Comma)
recLine := r.numLine // Starting line for record
r.recordBuffer = r.recordBuffer[:0]
r.fieldIndexes = r.fieldIndexes[:0]
@ -318,7 +325,7 @@ parseField:
if r.TrimLeadingSpace {
line = bytes.TrimLeftFunc(line, unicode.IsSpace)
}
if len(line) == 0 || quoteLen == 0 || nextRune(line) != quote {
if len(line) == 0 || r.cachedQuoteLen == 0 || nextRune(line) != r.Quote[0] {
// Non-quoted string field
i := bytes.IndexRune(line, r.Comma)
field := line
@ -329,7 +336,7 @@ parseField:
}
// Check to make sure a quote does not appear in field.
if !r.LazyQuotes {
if j := bytes.IndexRune(field, quote); j >= 0 {
if j := bytes.IndexRune(field, r.Quote[0]); j >= 0 {
col := utf8.RuneCount(fullLine[:len(fullLine)-len(line[j:])])
err = &ParseError{StartLine: recLine, Line: r.numLine, Column: col, Err: ErrBareQuote}
break parseField
@ -338,37 +345,37 @@ parseField:
r.recordBuffer = append(r.recordBuffer, field...)
r.fieldIndexes = append(r.fieldIndexes, len(r.recordBuffer))
if i >= 0 {
line = line[i+commaLen:]
line = line[i+r.cachedCommaLen:]
continue parseField
}
break parseField
} else {
// Quoted string field
line = line[quoteLen:]
line = line[r.cachedQuoteLen:]
for {
i := bytes.IndexAny(line, string(quote)+string(quoteEscape))
i := bytes.IndexAny(line, r.cachedQuotes)
if i >= 0 {
// Hit next quote or escape quote
r.recordBuffer = append(r.recordBuffer, line[:i]...)
escape := nextRune(line[i:]) == quoteEscape
escape := nextRune(line[i:]) == r.QuoteEscape
if escape {
line = line[i+quoteEscapeLen:]
line = line[i+r.cachedQuoteEscapeLen:]
} else {
line = line[i+quoteLen:]
line = line[i+r.cachedQuoteLen:]
}
switch rn := nextRune(line); {
case escape && quoteEscape != quote:
case escape && r.QuoteEscape != r.Quote[0]:
r.recordBuffer = append(r.recordBuffer, encodeRune(rn)...)
line = line[utf8.RuneLen(rn):]
case rn == quote:
case rn == r.Quote[0]:
// `""` sequence (append quote).
r.recordBuffer = append(r.recordBuffer, encodedQuote...)
line = line[quoteLen:]
r.recordBuffer = append(r.recordBuffer, r.cachedEncodedQuote...)
line = line[r.cachedQuoteLen:]
case rn == r.Comma:
// `",` sequence (end of field).
line = line[commaLen:]
line = line[r.cachedCommaLen:]
r.fieldIndexes = append(r.fieldIndexes, len(r.recordBuffer))
continue parseField
case lengthNL(line) == len(line):
@ -377,10 +384,10 @@ parseField:
break parseField
case r.LazyQuotes:
// `"` sequence (bare quote).
r.recordBuffer = append(r.recordBuffer, encodedQuote...)
r.recordBuffer = append(r.recordBuffer, r.cachedEncodedQuote...)
default:
// `"*` sequence (invalid non-escaped quote).
col := utf8.RuneCount(fullLine[:len(fullLine)-len(line)-quoteLen])
col := utf8.RuneCount(fullLine[:len(fullLine)-len(line)-r.cachedQuoteLen])
err = &ParseError{StartLine: recLine, Line: r.numLine, Column: col, Err: ErrQuote}
break parseField
}

Loading…
Cancel
Save