s3select should honour custom record delimiter (#6419)

Allow custom record delimiters such as `\r\n`, `a`, `\r`, etc. in the input CSV
and replace them with `\n`.

Fixes #6403
master
Praveen raj Mani 6 years ago committed by Nitish Tiwari
parent 92bc7caf7a
commit 30d4a2cf53
  1. 8
      cmd/object-handlers.go
  2. 87
      pkg/ioutil/delimited-reader.go
  3. 83
      pkg/ioutil/delimited-reader_test.go
  4. 9
      pkg/s3select/input.go
  5. 12
      pkg/s3select/select_test.go

@ -178,6 +178,10 @@ func (api objectAPIHandlers) SelectObjectContentHandler(w http.ResponseWriter, r
writeErrorResponse(w, ErrInvalidQuoteFields, r.URL)
return
}
if len(selectReq.InputSerialization.CSV.RecordDelimiter) > 2 {
writeErrorResponse(w, ErrInvalidRequestParameter, r.URL)
return
}
getObject := objectAPI.GetObject
if api.CacheAPI() != nil && !crypto.SSEC.IsRequested(r.Header) {
@ -222,9 +226,13 @@ func (api objectAPIHandlers) SelectObjectContentHandler(w http.ResponseWriter, r
if selectReq.InputSerialization.CSV.FileHeaderInfo == "" {
selectReq.InputSerialization.CSV.FileHeaderInfo = CSVFileHeaderInfoNone
}
if selectReq.InputSerialization.CSV.RecordDelimiter == "" {
selectReq.InputSerialization.CSV.RecordDelimiter = "\n"
}
if selectReq.InputSerialization.CSV != nil {
options := &s3select.Options{
HasHeader: selectReq.InputSerialization.CSV.FileHeaderInfo != CSVFileHeaderInfoNone,
RecordDelimiter: selectReq.InputSerialization.CSV.RecordDelimiter,
FieldDelimiter: selectReq.InputSerialization.CSV.FieldDelimiter,
Comments: selectReq.InputSerialization.CSV.Comments,
Name: "S3Object", // Default table name for all objects

@ -0,0 +1,87 @@
/*
* Minio Cloud Storage, (C) 2018 Minio, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package ioutil
import (
"bufio"
"io"
)
var (
nByte byte = 10 // the byte that corresponds to the '\n' rune.
rByte byte = 13 // the byte that corresponds to the '\r' rune.
)
// DelimitedReader reduces the custom delimiter to `\n`.
type DelimitedReader struct {
r *bufio.Reader
delimiter []rune // Select can have upto 2 characters as delimiter.
assignEmpty bool // Decides whether the next read byte should be discarded.
}
// NewDelimitedReader detects the custom delimiter and replaces with `\n`.
func NewDelimitedReader(r io.Reader, delimiter []rune) *DelimitedReader {
return &DelimitedReader{r: bufio.NewReader(r), delimiter: delimiter, assignEmpty: false}
}
// Reads and replaces the custom delimiter with `\n`.
func (r *DelimitedReader) Read(p []byte) (n int, err error) {
n, err = r.r.Read(p)
if err != nil {
return
}
for i, b := range p {
if r.assignEmpty {
swapAndNullify(p, i)
r.assignEmpty = false
continue
}
if b == rByte && rune(b) != r.delimiter[0] {
// Replace the carriage returns with `\n`.
// Mac styled csv will have `\r` as their record delimiter.
p[i] = nByte
} else if rune(b) == r.delimiter[0] { // Eg, `\r\n`,`ab`,`a` are valid delimiters
if i+1 == len(p) && len(r.delimiter) > 1 {
// If the first delimiter match falls on the boundary,
// Peek the next byte and if it matches, discard it in the next byte read.
if nextByte, nerr := r.r.Peek(1); nerr == nil {
if rune(nextByte[0]) == r.delimiter[1] {
p[i] = nByte
// To Discard in the next read.
r.assignEmpty = true
}
}
} else if len(r.delimiter) > 1 && rune(p[i+1]) == r.delimiter[1] {
// The second delimiter falls in the same chunk.
p[i] = nByte
r.assignEmpty = true
} else if len(r.delimiter) == 1 {
// Replace with `\n` incase of single charecter delimiter match.
p[i] = nByte
}
}
}
return
}
// Occupy the first byte space and nullify the last byte.
func swapAndNullify(p []byte, n int) {
for i := n; i < len(p)-1; i++ {
p[i] = p[i+1]
}
p[len(p)-1] = 0
}

@ -0,0 +1,83 @@
/*
* Minio Cloud Storage, (C) 2016, 2017, 2018 Minio, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package ioutil
import (
"bytes"
"io"
"strings"
"testing"
)
// Test for DelimitedCSVReader.
func TestDelimitedReader(t *testing.T) {
expected := "username,age\nbanana,12\ncarrot,23\napple,34\nbrinjal,90\nraddish,45"
inputs := []struct {
inputcsv string
delimiter string
chunkSize int
}{
// case 1 - with default `\n` delimiter.
{"username,age\nbanana,12\ncarrot,23\napple,34\nbrinjal,90\nraddish,45", "\n", 10},
// case 2 - with carriage return `\r` which should be replaced with `\n` by default.
{"username,age\rbanana,12\rcarrot,23\rapple,34\rbrinjal,90\rraddish,45", "\n", 10},
// case 3 - with a double character delimiter (octals).
{"username,age\r\nbanana,12\r\ncarrot,23\r\napple,34\r\nbrinjal,90\r\nraddish,45", "\r\n", 10},
// case 4 - with a double character delimiter.
{"username,agexvbanana,12xvcarrot,23xvapple,34xvbrinjal,90xvraddish,45", "xv", 10},
// case 5 - with a double character delimiter `\t `
{"username,age\t banana,12\t carrot,23\t apple,34\t brinjal,90\t raddish,45", "\t ", 10},
// case 6 - This is a special case where the first delimiter match falls in the 13'th byte space
// ie, the last byte space of the read chunk, In this case the reader should peek in the next byte
// and replace with `\n`.
{"username,agexxbanana,12xxcarrot,23xxapple,34xxbrinjal,90xxraddish,45", "xx", 13},
}
for c, input := range inputs {
var readcsv []byte
var err error
delimitedReader := NewDelimitedReader(strings.NewReader(input.inputcsv), []rune(input.delimiter))
for err == nil {
chunk := make([]byte, input.chunkSize)
_, err = delimitedReader.Read(chunk)
readcsv = append(readcsv, chunk...)
}
if err != io.EOF {
t.Fatalf("Case %d: Error in delimited read", c+1)
}
expected := []byte(expected)
cleanCsv := removeNulls(readcsv)
if !bytes.Equal(cleanCsv, expected) {
t.Fatalf("Case %d: Expected the delimited csv to be `%s`, but instead found `%s`", c+1, string(expected), string(cleanCsv))
}
}
}
// Removes all the tailing nulls in chunks.
// Null chunks will be assigned if there is a reduction
// Eg, When `xv` is reduced to `\n`, the last byte is nullified.
func removeNulls(csv []byte) []byte {
cleanCsv := []byte{}
for _, p := range csv {
if p != 0 {
cleanCsv = append(cleanCsv, p)
}
}
return cleanCsv
}

@ -29,6 +29,7 @@ import (
"net/http"
gzip "github.com/klauspost/pgzip"
"github.com/minio/minio/pkg/ioutil"
)
const (
@ -79,6 +80,9 @@ type Options struct {
// HasHeader when true, will treat the first row as a header row.
HasHeader bool
// RecordDelimiter is the string that records are delimited by.
RecordDelimiter string
// FieldDelimiter is the string that fields are delimited by.
FieldDelimiter string
@ -127,7 +131,8 @@ func NewInput(opts *Options) (*Input, error) {
tempBytesScanned = opts.StreamSize
myReader = bzip2.NewReader(opts.ReadFrom)
}
// DelimitedReader treats custom record delimiter like `\r\n`,`\r`,`ab` etc and replaces it with `\n`.
normalizedReader := ioutil.NewDelimitedReader(myReader, []rune(opts.RecordDelimiter))
progress := &statInfo{
BytesScanned: tempBytesScanned,
BytesProcessed: 0,
@ -135,7 +140,7 @@ func NewInput(opts *Options) (*Input, error) {
}
reader := &Input{
options: opts,
reader: csv.NewReader(myReader),
reader: csv.NewReader(normalizedReader),
stats: progress,
}
reader.firstRow = nil

@ -48,6 +48,7 @@ func TestCheckForDuplicates(t *testing.T) {
func TestMyProcessing(t *testing.T) {
options := &Options{
HasHeader: false,
RecordDelimiter: "\n",
FieldDelimiter: ",",
Comments: "",
Name: "S3Object", // Default table name for all objects
@ -96,6 +97,7 @@ func TestMyProcessing(t *testing.T) {
func TestMyRowIndexResults(t *testing.T) {
options := &Options{
HasHeader: false,
RecordDelimiter: "\n",
FieldDelimiter: ",",
Comments: "",
Name: "S3Object", // Default table name for all objects
@ -271,6 +273,7 @@ func TestMyParser(t *testing.T) {
for _, table := range tables {
options := &Options{
HasHeader: false,
RecordDelimiter: "\n",
FieldDelimiter: ",",
Comments: "",
Name: "S3Object", // Default table name for all objects
@ -346,6 +349,7 @@ func TestMyAggregationFunc(t *testing.T) {
func TestToStringAgg(t *testing.T) {
options := &Options{
HasHeader: false,
RecordDelimiter: "\n",
FieldDelimiter: ",",
Comments: "",
Name: "S3Object", // Default table name for all objects
@ -380,6 +384,7 @@ func TestToStringAgg(t *testing.T) {
func TestMyRowColLiteralResults(t *testing.T) {
options := &Options{
HasHeader: false,
RecordDelimiter: "\n",
FieldDelimiter: ",",
Comments: "",
Name: "S3Object", // Default table name for all objects
@ -459,6 +464,7 @@ func TestMyWhereEval(t *testing.T) {
for _, table := range tables {
options := &Options{
HasHeader: false,
RecordDelimiter: "\n",
FieldDelimiter: ",",
Comments: "",
Name: "S3Object", // Default table name for all objects
@ -610,6 +616,7 @@ func TestInterpreter(t *testing.T) {
for _, table := range tables {
options := &Options{
HasHeader: false,
RecordDelimiter: "\n",
FieldDelimiter: ",",
Comments: "",
Name: "S3Object", // Default table name for all objects
@ -651,6 +658,7 @@ func TestInterpreter(t *testing.T) {
func TestMyXMLFunction(t *testing.T) {
options := &Options{
HasHeader: false,
RecordDelimiter: "\n",
FieldDelimiter: ",",
Comments: "",
Name: "S3Object", // Default table name for all objects
@ -689,6 +697,7 @@ func TestMyXMLFunction(t *testing.T) {
func TestMyProtocolFunction(t *testing.T) {
options := &Options{
HasHeader: false,
RecordDelimiter: "\n",
FieldDelimiter: ",",
Comments: "",
Name: "S3Object", // Default table name for all objects
@ -732,6 +741,7 @@ func TestMyProtocolFunction(t *testing.T) {
func TestMyInfoProtocolFunctions(t *testing.T) {
options := &Options{
HasHeader: true,
RecordDelimiter: "\n",
FieldDelimiter: ",",
Comments: "",
Name: "S3Object", // Default table name for all objects
@ -773,6 +783,7 @@ func TestMyInfoProtocolFunctions(t *testing.T) {
func TestMyErrorProtocolFunctions(t *testing.T) {
options := &Options{
HasHeader: false,
RecordDelimiter: "\n",
FieldDelimiter: ",",
Comments: "",
Name: "S3Object", // Default table name for all objects
@ -1007,6 +1018,7 @@ func TestMyValids(t *testing.T) {
for _, table := range tables {
options := &Options{
HasHeader: false,
RecordDelimiter: "\n",
FieldDelimiter: ",",
Comments: "",
Name: "S3Object", // Default table name for all objects

Loading…
Cancel
Save