sql, csv: Cache some values between Read() calls to gain performance (#9645)

Benchmark improvements from this commit:

benchmark                                         old ns/op     new ns/op     delta
BenchmarkRead-8                                   2807          2189          -22.02%
BenchmarkReadWithFieldsPerRecord-8                2802          2179          -22.23%
BenchmarkReadWithoutFieldsPerRecord-8             2824          2181          -22.77%
BenchmarkReadLargeFields-8                        3584          3371          -5.94%
BenchmarkReadReuseRecord-8                        2044          1480          -27.59%
BenchmarkReadReuseRecordWithFieldsPerRecord-8     2056          1483          -27.87%
BenchmarkReadReuseRecordWithoutFieldsPerRecord-8  2047          1482          -27.60%
BenchmarkReadReuseRecordLargeFields-8             2777          2594          -6.59%

benchmark                                         old allocs    new allocs    delta
BenchmarkRead-8                                   26            16            -38.46%
BenchmarkReadWithFieldsPerRecord-8                26            16            -38.46%
BenchmarkReadWithoutFieldsPerRecord-8             26            16            -38.46%
BenchmarkReadLargeFields-8                        36            24            -33.33%
BenchmarkReadReuseRecord-8                        16            6             -62.50%
BenchmarkReadReuseRecordWithFieldsPerRecord-8     16            6             -62.50%
BenchmarkReadReuseRecordWithoutFieldsPerRecord-8  16            6             -62.50%
BenchmarkReadReuseRecordLargeFields-8             24            12            -50.00%

benchmark                                         old bytes     new bytes     delta
BenchmarkRead-8                                   672           664           -1.19%
BenchmarkReadWithFieldsPerRecord-8                672           664           -1.19%
BenchmarkReadWithoutFieldsPerRecord-8             672           664           -1.19%
BenchmarkReadLargeFields-8                        3948          3936          -0.30%
BenchmarkReadReuseRecord-8                        32            24            -25.00%
BenchmarkReadReuseRecordWithFieldsPerRecord-8     32            24            -25.00%
BenchmarkReadReuseRecordWithoutFieldsPerRecord-8  32            24            -25.00%
BenchmarkReadReuseRecordLargeFields-8             2988          2976          -0.40%
5 years ago · 6542bc4a03
parent bede525dc9
commit 6542bc4a03
1 changed files with 33 additions and 26 deletions
--- a/pkg/csvparser/reader.go
+++ b/pkg/csvparser/reader.go
@@ -171,6 +171,14 @@ type Reader struct {

 	// lastRecord is a record cache and only used when ReuseRecord == true.
 	lastRecord []string
+
+	// Caching some values between Read() calls for performance gain
+	cached               bool
+	cachedQuoteEscapeLen int
+	cachedQuoteLen       int
+	cachedEncodedQuote   []byte
+	cachedCommaLen       int
+	cachedQuotes         string
 }

 // NewReader returns a new Reader that reads from r.
@@ -295,21 +303,20 @@ func (r *Reader) readRecord(dst []string) ([]string, error) {
 		return nil, errRead
 	}

-	var quoteEscape = r.QuoteEscape
-	var quoteEscapeLen = utf8.RuneLen(quoteEscape)
-
-	var quote rune
-	var quoteLen int
-	if len(r.Quote) > 0 {
-		quote = r.Quote[0]
-		quoteLen = utf8.RuneLen(quote)
+	if !r.cached {
+		r.cachedQuoteEscapeLen = utf8.RuneLen(r.QuoteEscape)
+		if len(r.Quote) > 0 {
+			r.cachedQuoteLen = utf8.RuneLen(r.Quote[0])
+			r.cachedEncodedQuote = encodeRune(r.Quote[0])
+			r.cachedQuotes += string(r.Quote[0])
+		}
+		r.cachedCommaLen = utf8.RuneLen(r.Comma)
+		r.cachedQuotes += string(r.QuoteEscape)
+		r.cached = true
 	}

-	encodedQuote := encodeRune(quote)
-
 	// Parse each field in the record.
 	var err error
-	commaLen := utf8.RuneLen(r.Comma)
 	recLine := r.numLine // Starting line for record
 	r.recordBuffer = r.recordBuffer[:0]
 	r.fieldIndexes = r.fieldIndexes[:0]
@@ -318,7 +325,7 @@ parseField:
 		if r.TrimLeadingSpace {
 			line = bytes.TrimLeftFunc(line, unicode.IsSpace)
 		}
-		if len(line) == 0 || quoteLen == 0 || nextRune(line) != quote {
+		if len(line) == 0 || r.cachedQuoteLen == 0 || nextRune(line) != r.Quote[0] {
 			// Non-quoted string field
 			i := bytes.IndexRune(line, r.Comma)
 			field := line
@@ -329,7 +336,7 @@ parseField:
 			}
 			// Check to make sure a quote does not appear in field.
 			if !r.LazyQuotes {
-				if j := bytes.IndexRune(field, quote); j >= 0 {
+				if j := bytes.IndexRune(field, r.Quote[0]); j >= 0 {
 					col := utf8.RuneCount(fullLine[:len(fullLine)-len(line[j:])])
 					err = &ParseError{StartLine: recLine, Line: r.numLine, Column: col, Err: ErrBareQuote}
 					break parseField
@@ -338,37 +345,37 @@ parseField:
 			r.recordBuffer = append(r.recordBuffer, field...)
 			r.fieldIndexes = append(r.fieldIndexes, len(r.recordBuffer))
 			if i >= 0 {
-				line = line[i+commaLen:]
+				line = line[i+r.cachedCommaLen:]
 				continue parseField
 			}
 			break parseField
 		} else {
 			// Quoted string field
-			line = line[quoteLen:]
+			line = line[r.cachedQuoteLen:]
 			for {
-				i := bytes.IndexAny(line, string(quote)+string(quoteEscape))
+				i := bytes.IndexAny(line, r.cachedQuotes)
 				if i >= 0 {
 					// Hit next quote or escape quote
 					r.recordBuffer = append(r.recordBuffer, line[:i]...)

-					escape := nextRune(line[i:]) == quoteEscape
+					escape := nextRune(line[i:]) == r.QuoteEscape
 					if escape {
-						line = line[i+quoteEscapeLen:]
+						line = line[i+r.cachedQuoteEscapeLen:]
 					} else {
-						line = line[i+quoteLen:]
+						line = line[i+r.cachedQuoteLen:]
 					}

 					switch rn := nextRune(line); {
-					case escape && quoteEscape != quote:
+					case escape && r.QuoteEscape != r.Quote[0]:
 						r.recordBuffer = append(r.recordBuffer, encodeRune(rn)...)
 						line = line[utf8.RuneLen(rn):]
-					case rn == quote:
+					case rn == r.Quote[0]:
 						// `""` sequence (append quote).
-						r.recordBuffer = append(r.recordBuffer, encodedQuote...)
-						line = line[quoteLen:]
+						r.recordBuffer = append(r.recordBuffer, r.cachedEncodedQuote...)
+						line = line[r.cachedQuoteLen:]
 					case rn == r.Comma:
 						// `",` sequence (end of field).
-						line = line[commaLen:]
+						line = line[r.cachedCommaLen:]
 						r.fieldIndexes = append(r.fieldIndexes, len(r.recordBuffer))
 						continue parseField
 					case lengthNL(line) == len(line):
@@ -377,10 +384,10 @@ parseField:
 						break parseField
 					case r.LazyQuotes:
 						// `"` sequence (bare quote).
-						r.recordBuffer = append(r.recordBuffer, encodedQuote...)
+						r.recordBuffer = append(r.recordBuffer, r.cachedEncodedQuote...)
 					default:
 						// `"*` sequence (invalid non-escaped quote).
-						col := utf8.RuneCount(fullLine[:len(fullLine)-len(line)-quoteLen])
+						col := utf8.RuneCount(fullLine[:len(fullLine)-len(line)-r.cachedQuoteLen])
 						err = &ParseError{StartLine: recLine, Line: r.numLine, Column: col, Err: ErrQuote}
 						break parseField
 					}