minio/pkg/s3select/csv/record.go

/*
 * MinIO Cloud Storage, (C) 2019 MinIO, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package csv

import (
	"encoding/json"
	"errors"
	"fmt"
	"io"

	"github.com/bcicen/jstream"
	csv "github.com/minio/minio/pkg/csvparser"
	"github.com/minio/minio/pkg/s3select/sql"
)

// Record - is a CSV record.
//
// columnNames holds the column names (Set appends to it and WriteJSON
// uses its entries as keys), csvRecord holds the field values as
// strings (the slice WriteCSV writes out), and nameIndexMap maps a
// column name to its index into csvRecord (consulted by Get).
type Record struct {
	columnNames  []string
	csvRecord    []string
	nameIndexMap map[string]int64
}

// Get - looks up the value stored under the given column name. It
// returns an error when the name is not a known column, and a null
// value when the record has no field at that column's index.
// Otherwise the raw field is handed to sql.FromBytes, since CSV
// fields carry no type of their own and the query decides how the
// text is interpreted.
func (r *Record) Get(name string) (*sql.Value, error) {
	pos, known := r.nameIndexMap[name]
	switch {
	case !known:
		return nil, fmt.Errorf("column %v not found", name)
	case pos >= int64(len(r.csvRecord)):
		// The record is shorter than the column list, so this
		// column has no value: report null.
		return sql.FromNull(), nil
	default:
		return sql.FromBytes([]byte(r.csvRecord[pos])), nil
	}
}

// Set - appends a column called name to the end of the record, with
// value rendered through its CSVString method. nameIndexMap is left
// untouched. The receiver itself is returned as the resulting
// sql.Record and the error is always nil.
func (r *Record) Set(name string, value *sql.Value) (sql.Record, error) {
	r.columnNames = append(r.columnNames, name)

	field := value.CSVString()
	r.csvRecord = append(r.csvRecord, field)

	return r, nil
}

// Reset - empties the record. Both slices are truncated to zero
// length (their capacity is kept) and every entry is removed from
// nameIndexMap.
func (r *Record) Reset() {
	// Re-slicing to [:0] is a no-op for nil and empty slices, so no
	// length check is needed.
	r.columnNames = r.columnNames[:0]
	r.csvRecord = r.csvRecord[:0]

	for name := range r.nameIndexMap {
		delete(r.nameIndexMap, name)
	}
}

// Clone - copies the column names and field values of r into dst and
// returns it. When dst is not a *Record, a fresh Record is allocated
// and used instead. Whatever dst's slices held before is discarded;
// nameIndexMap is not copied.
func (r *Record) Clone(dst sql.Record) sql.Record {
	clone, isRecord := dst.(*Record)
	if !isRecord {
		clone = &Record{}
	}

	// Truncate first, then append, as two separate steps so the
	// destination's backing arrays are reused when large enough.
	clone.columnNames = clone.columnNames[:0]
	clone.csvRecord = clone.csvRecord[:0]

	clone.columnNames = append(clone.columnNames, r.columnNames...)
	clone.csvRecord = append(clone.csvRecord, r.csvRecord...)
	return clone
}

// WriteCSV - writes the record's fields to writer as one CSV row.
// The delimiter, quote character, quote-escape character and
// always-quote flag are taken from opts. The row is flushed before
// returning, and any write or flush error is returned.
func (r *Record) WriteCSV(writer io.Writer, opts sql.WriteCSVOpts) error {
	enc := csv.NewWriter(writer)
	enc.Comma = opts.FieldDelimiter
	enc.AlwaysQuote = opts.AlwaysQuote
	enc.Quote = opts.Quote
	enc.QuoteEscape = opts.QuoteEscape

	err := enc.Write(r.csvRecord)
	if err == nil {
		// Only flush when the row was accepted; Error then reports
		// any failure seen while writing or flushing.
		enc.Flush()
		err = enc.Error()
	}
	return err
}

// WriteJSON - encodes the record to JSON data. It builds a
// jstream.KVS with one entry per column, in columnNames order, using
// the column name as key and the CSV field (a string) as value, and
// encodes it to writer with encoding/json.
//
// A record may carry fewer fields than column names (Get reports such
// columns as null). Those trailing columns are skipped here rather
// than indexing past the end of csvRecord, which would panic.
func (r *Record) WriteJSON(writer io.Writer) error {
	var kvs jstream.KVS = make([]jstream.KV, 0, len(r.columnNames))
	for i, name := range r.columnNames {
		if i >= len(r.csvRecord) {
			// The remaining columns have no value in this record.
			break
		}
		kvs = append(kvs, jstream.KV{Key: name, Value: r.csvRecord[i]})
	}
	return json.NewEncoder(writer).Encode(kvs)
}

// Raw - returns the record itself as the underlying data, tagged
// with the CSV format marker sql.SelectFmtCSV.
func (r *Record) Raw() (sql.SelectObjectFormat, interface{}) {
	var data interface{} = r
	return sql.SelectFmtCSV, data
}

// Replace - always fails: CSV records do not support replacing the
// underlying data. The argument is ignored.
func (r *Record) Replace(_ interface{}) error {
	unsupported := errors.New("Replace is not supported for CSV")
	return unsupported
}

// NewRecord - allocates and returns an empty CSV record; all of its
// fields start at their zero values.
func NewRecord() *Record {
	return new(Record)
}
Refactor s3select to support parquet. (#7023) Also handle pretty formatted JSON documents. 6 years ago			`/*`
Replace Minio refs in docs with MinIO and links (#7494) 6 years ago			`* MinIO Cloud Storage, (C) 2019 MinIO, Inc.`
Refactor s3select to support parquet. (#7023) Also handle pretty formatted JSON documents. 6 years ago			`*`
			`* Licensed under the Apache License, Version 2.0 (the "License");`
			`* you may not use this file except in compliance with the License.`
			`* You may obtain a copy of the License at`
			`*`
			`* http://www.apache.org/licenses/LICENSE-2.0`
			`*`
			`* Unless required by applicable law or agreed to in writing, software`
			`* distributed under the License is distributed on an "AS IS" BASIS,`
			`* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.`
			`* See the License for the specific language governing permissions and`
			`* limitations under the License.`
			`*/`

			`package csv`

			`import (`
Use jstream to serialize records to JSON format in S3Select (#7318) - Also, switch to jstream to generate internal record representation from CSV/JSON readers - This fixes a bug in which JSON output objects have their keys reversed from the order they are specified in the Select columns. - Also includes a fix for tests. 6 years ago			`"encoding/json"`
Add JSON Path expression evaluation support (#7315) - Includes support for FROM clause JSON path 6 years ago			`"errors"`
Refactor s3select to support parquet. (#7023) Also handle pretty formatted JSON documents. 6 years ago			`"fmt"`
speed up the performance of s3select on csv (#7945) 5 years ago			`"io"`
Refactor s3select to support parquet. (#7023) Also handle pretty formatted JSON documents. 6 years ago
Use jstream to serialize records to JSON format in S3Select (#7318) - Also, switch to jstream to generate internal record representation from CSV/JSON readers - This fixes a bug in which JSON output objects have their keys reversed from the order they are specified in the Select columns. - Also includes a fix for tests. 6 years ago			`"github.com/bcicen/jstream"`
Import CSV parser library (#8927) The CSV library code is imported from Go 1.13.6 5 years ago			`csv "github.com/minio/minio/pkg/csvparser"`
Refactor s3select to support parquet. (#7023) Also handle pretty formatted JSON documents. 6 years ago			`"github.com/minio/minio/pkg/s3select/sql"`
			`)`

Concurrent CSV parsing and reduce S3 select allocations (#8200) ``` CSV parsing, BEFORE: BenchmarkReaderBasic-12 2842 407533 ns/op 397860 B/op 957 allocs/op BenchmarkReaderReplace-12 2718 429914 ns/op 397844 B/op 957 allocs/op BenchmarkReaderReplaceTwo-12 2718 435556 ns/op 397855 B/op 957 allocs/op BenchmarkAggregateCount_100K-12 171 6798974 ns/op 16667102 B/op 308077 allocs/op BenchmarkAggregateCount_1M-12 19 65657411 ns/op 168057743 B/op 3146610 allocs/op BenchmarkSelectAll_10M-12 1 20882119900 ns/op 2758799896 B/op 41978762 allocs/op CSV parsing, AFTER: BenchmarkReaderBasic-12 3721 312549 ns/op 101920 B/op 338 allocs/op BenchmarkReaderReplace-12 3776 318810 ns/op 101993 B/op 340 allocs/op BenchmarkReaderReplaceTwo-12 3610 330967 ns/op 102012 B/op 341 allocs/op BenchmarkAggregateCount_100K-12 295 4149588 ns/op 3553623 B/op 103261 allocs/op BenchmarkAggregateCount_1M-12 30 37746503 ns/op 33827931 B/op 1049435 allocs/op BenchmarkSelectAll_10M-12 1 17608495800 ns/op 1416504040 B/op 21007082 allocs/op ~ benchcmp old.txt new.txt benchmark old ns/op new ns/op delta BenchmarkReaderBasic-12 407533 312549 -23.31% BenchmarkReaderReplace-12 429914 318810 -25.84% BenchmarkReaderReplaceTwo-12 435556 330967 -24.01% BenchmarkAggregateCount_100K-12 6798974 4149588 -38.97% BenchmarkAggregateCount_1M-12 65657411 37746503 -42.51% BenchmarkSelectAll_10M-12 20882119900 17608495800 -15.68% benchmark old allocs new allocs delta BenchmarkReaderBasic-12 957 338 -64.68% BenchmarkReaderReplace-12 957 340 -64.47% BenchmarkReaderReplaceTwo-12 957 341 -64.37% BenchmarkAggregateCount_100K-12 308077 103261 -66.48% BenchmarkAggregateCount_1M-12 3146610 1049435 -66.65% BenchmarkSelectAll_10M-12 41978762 21007082 -49.96% benchmark old bytes new bytes delta BenchmarkReaderBasic-12 397860 101920 -74.38% BenchmarkReaderReplace-12 397844 101993 -74.36% BenchmarkReaderReplaceTwo-12 397855 102012 -74.36% BenchmarkAggregateCount_100K-12 16667102 3553623 -78.68% BenchmarkAggregateCount_1M-12 168057743 
33827931 -79.87% BenchmarkSelectAll_10M-12 2758799896 1416504040 -48.66% ``` ``` BenchmarkReaderHuge/97K-12 2200 540840 ns/op 184.32 MB/s 1604450 B/op 687 allocs/op BenchmarkReaderHuge/194K-12 1522 752257 ns/op 265.04 MB/s 2143135 B/op 1335 allocs/op BenchmarkReaderHuge/389K-12 1190 947858 ns/op 420.69 MB/s 3221831 B/op 2630 allocs/op BenchmarkReaderHuge/778K-12 806 1472486 ns/op 541.61 MB/s 5201856 B/op 5187 allocs/op BenchmarkReaderHuge/1557K-12 426 2575269 ns/op 619.36 MB/s 9101330 B/op 10233 allocs/op BenchmarkReaderHuge/3115K-12 286 4034656 ns/op 790.66 MB/s 12397968 B/op 16099 allocs/op BenchmarkReaderHuge/6230K-12 172 6830563 ns/op 934.05 MB/s 16008416 B/op 26844 allocs/op BenchmarkReaderHuge/12461K-12 100 11409467 ns/op 1118.39 MB/s 22655163 B/op 48107 allocs/op BenchmarkReaderHuge/24922K-12 66 19780395 ns/op 1290.19 MB/s 35158559 B/op 90216 allocs/op BenchmarkReaderHuge/49844K-12 34 37282559 ns/op 1369.03 MB/s 60528624 B/op 174497 allocs/op ``` 5 years ago			`// Record - is a CSV record.`
Refactor s3select to support parquet. (#7023) Also handle pretty formatted JSON documents. 6 years ago			`type Record struct {`
			`columnNames []string`
			`csvRecord []string`
			`nameIndexMap map[string]int64`
			`}`

Add new SQL parser to support S3 Select syntax (#7102) - New parser written from scratch, allows easier and complete parsing of the full S3 Select SQL syntax. Parser definition is directly provided by the AST defined for the SQL grammar. - Bring support to parse and interpret SQL involving JSON path expressions; evaluation of JSON path expressions will be subsequently added. - Bring automatic type inference and conversion for untyped values (e.g. CSV data). 6 years ago			`// Get - gets the value for a column name. CSV fields do not have any`
			`// defined type (other than the default string). So this function`
			`// always returns fields using sql.FromBytes so that the type`
			`// specified/implied by the query can be used, or can be automatically`
			`// converted based on the query.`
Refactor s3select to support parquet. (#7023) Also handle pretty formatted JSON documents. 6 years ago			`func (r Record) Get(name string) (sql.Value, error) {`
			`index, found := r.nameIndexMap[name]`
			`if !found {`
			`return nil, fmt.Errorf("column %v not found", name)`
			`}`

			`if index >= int64(len(r.csvRecord)) {`
Add new SQL parser to support S3 Select syntax (#7102) - New parser written from scratch, allows easier and complete parsing of the full S3 Select SQL syntax. Parser definition is directly provided by the AST defined for the SQL grammar. - Bring support to parse and interpret SQL involving JSON path expressions; evaluation of JSON path expressions will be subsequently added. - Bring automatic type inference and conversion for untyped values (e.g. CSV data). 6 years ago			`// No value found for column 'name', hence return null`
			`// value`
			`return sql.FromNull(), nil`
Refactor s3select to support parquet. (#7023) Also handle pretty formatted JSON documents. 6 years ago			`}`

Add new SQL parser to support S3 Select syntax (#7102) - New parser written from scratch, allows easier and complete parsing of the full S3 Select SQL syntax. Parser definition is directly provided by the AST defined for the SQL grammar. - Bring support to parse and interpret SQL involving JSON path expressions; evaluation of JSON path expressions will be subsequently added. - Bring automatic type inference and conversion for untyped values (e.g. CSV data). 6 years ago			`return sql.FromBytes([]byte(r.csvRecord[index])), nil`
Refactor s3select to support parquet. (#7023) Also handle pretty formatted JSON documents. 6 years ago			`}`

			`// Set - sets the value for a column name.`
SIMDJSON S3 select input (#8401) 5 years ago			`func (r Record) Set(name string, value sql.Value) (sql.Record, error) {`
Refactor s3select to support parquet. (#7023) Also handle pretty formatted JSON documents. 6 years ago			`r.columnNames = append(r.columnNames, name)`
			`r.csvRecord = append(r.csvRecord, value.CSVString())`
SIMDJSON S3 select input (#8401) 5 years ago			`return r, nil`
Refactor s3select to support parquet. (#7023) Also handle pretty formatted JSON documents. 6 years ago			`}`

S3 Select: optimize output (#8238) Queue output items and reuse them. Remove the unneeded type system in sql and just use the Go type system. In best case this is more than an order of magnitude speedup: ``` BenchmarkSelectAll_1M-12 1 1841049400 ns/op 274299728 B/op 4198522 allocs/op BenchmarkSelectAll_1M-12 14 84833400 ns/op 169228346 B/op 3146541 allocs/op ``` 5 years ago			`// Reset data in record.`
			`func (r *Record) Reset() {`
			`if len(r.columnNames) > 0 {`
			`r.columnNames = r.columnNames[:0]`
			`}`
			`if len(r.csvRecord) > 0 {`
			`r.csvRecord = r.csvRecord[:0]`
			`}`
			`for k := range r.nameIndexMap {`
			`delete(r.nameIndexMap, k)`
			`}`
			`}`

S3 select: Fix output conversion on select * (#8303) Fixes #8268 5 years ago			`// Clone the record.`
			`func (r *Record) Clone(dst sql.Record) sql.Record {`
			`other, ok := dst.(*Record)`
S3 Select: optimize output (#8238) Queue output items and reuse them. Remove the unneeded type system in sql and just use the Go type system. In best case this is more than an order of magnitude speedup: ``` BenchmarkSelectAll_1M-12 1 1841049400 ns/op 274299728 B/op 4198522 allocs/op BenchmarkSelectAll_1M-12 14 84833400 ns/op 169228346 B/op 3146541 allocs/op ``` 5 years ago			`if !ok {`
S3 select: Fix output conversion on select * (#8303) Fixes #8268 5 years ago			`other = &Record{}`
S3 Select: optimize output (#8238) Queue output items and reuse them. Remove the unneeded type system in sql and just use the Go type system. In best case this is more than an order of magnitude speedup: ``` BenchmarkSelectAll_1M-12 1 1841049400 ns/op 274299728 B/op 4198522 allocs/op BenchmarkSelectAll_1M-12 14 84833400 ns/op 169228346 B/op 3146541 allocs/op ``` 5 years ago			`}`
S3 select: Fix output conversion on select * (#8303) Fixes #8268 5 years ago			`if len(other.columnNames) > 0 {`
			`other.columnNames = other.columnNames[:0]`
			`}`
			`if len(other.csvRecord) > 0 {`
			`other.csvRecord = other.csvRecord[:0]`
			`}`
			`other.columnNames = append(other.columnNames, r.columnNames...)`
			`other.csvRecord = append(other.csvRecord, r.csvRecord...)`
			`return other`
S3 Select: optimize output (#8238) Queue output items and reuse them. Remove the unneeded type system in sql and just use the Go type system. In best case this is more than an order of magnitude speedup: ``` BenchmarkSelectAll_1M-12 1 1841049400 ns/op 274299728 B/op 4198522 allocs/op BenchmarkSelectAll_1M-12 14 84833400 ns/op 169228346 B/op 3146541 allocs/op ``` 5 years ago			`}`

speed up the performance of s3select on csv (#7945) 5 years ago			`// WriteCSV - encodes to CSV data.`
sql: Add support of escape quote in CSV (#9231) This commit modifies csv parser, a fork of golang csv parser to support a custom quote escape character. The quote escape character is used to escape the quote character when a csv field contains a quote character as part of data. 5 years ago			`func (r *Record) WriteCSV(writer io.Writer, opts sql.WriteCSVOpts) error {`
speed up the performance of s3select on csv (#7945) 5 years ago			`w := csv.NewWriter(writer)`
sql: Add support of escape quote in CSV (#9231) This commit modifies csv parser, a fork of golang csv parser to support a custom quote escape character. The quote escape character is used to escape the quote character when a csv field contains a quote character as part of data. 5 years ago			`w.Comma = opts.FieldDelimiter`
			`w.AlwaysQuote = opts.AlwaysQuote`
			`w.Quote = opts.Quote`
			`w.QuoteEscape = opts.QuoteEscape`
Refactor s3select to support parquet. (#7023) Also handle pretty formatted JSON documents. 6 years ago			`if err := w.Write(r.csvRecord); err != nil {`
speed up the performance of s3select on csv (#7945) 5 years ago			`return err`
Refactor s3select to support parquet. (#7023) Also handle pretty formatted JSON documents. 6 years ago			`}`
			`w.Flush()`
			`if err := w.Error(); err != nil {`
speed up the performance of s3select on csv (#7945) 5 years ago			`return err`
Refactor s3select to support parquet. (#7023) Also handle pretty formatted JSON documents. 6 years ago			`}`

speed up the performance of s3select on csv (#7945) 5 years ago			`return nil`
Refactor s3select to support parquet. (#7023) Also handle pretty formatted JSON documents. 6 years ago			`}`

speed up the performance of s3select on csv (#7945) 5 years ago			`// WriteJSON - encodes to JSON data.`
			`func (r *Record) WriteJSON(writer io.Writer) error {`
Use jstream to serialize records to JSON format in S3Select (#7318) - Also, switch to jstream to generate internal record representation from CSV/JSON readers - This fixes a bug in which JSON output objects have their keys reversed from the order they are specified in the Select columns. - Also includes a fix for tests. 6 years ago			`var kvs jstream.KVS = make([]jstream.KV, len(r.columnNames))`
			`for i := 0; i < len(r.columnNames); i++ {`
			`kvs[i] = jstream.KV{Key: r.columnNames[i], Value: r.csvRecord[i]}`
Refactor s3select to support parquet. (#7023) Also handle pretty formatted JSON documents. 6 years ago			`}`
speed up the performance of s3select on csv (#7945) 5 years ago			`return json.NewEncoder(writer).Encode(kvs)`
Refactor s3select to support parquet. (#7023) Also handle pretty formatted JSON documents. 6 years ago			`}`

Add JSON Path expression evaluation support (#7315) - Includes support for FROM clause JSON path 6 years ago			`// Raw - returns the underlying data with format info.`
			`func (r *Record) Raw() (sql.SelectObjectFormat, interface{}) {`
			`return sql.SelectFmtCSV, r`
			`}`

			`// Replace - is not supported for CSV`
SIMDJSON S3 select input (#8401) 5 years ago			`func (r *Record) Replace(_ interface{}) error {`
Add JSON Path expression evaluation support (#7315) - Includes support for FROM clause JSON path 6 years ago			`return errors.New("Replace is not supported for CSV")`
			`}`

Refactor s3select to support parquet. (#7023) Also handle pretty formatted JSON documents. 6 years ago			`// NewRecord - creates new CSV record.`
			`func NewRecord() *Record {`
			`return &Record{}`
			`}`