From 30d4a2cf535846b9b9e09b7e914867ec12bd10b0 Mon Sep 17 00:00:00 2001 From: Praveen raj Mani Date: Mon, 10 Sep 2018 21:50:28 +0530 Subject: [PATCH] s3select should honour custom record delimiter (#6419) Allow custom delimiters like `\r\n`, `a`, `\r` etc in input csv and replace with `\n`. Fixes #6403 --- cmd/object-handlers.go | 8 +++ pkg/ioutil/delimited-reader.go | 87 +++++++++++++++++++++++++++++ pkg/ioutil/delimited-reader_test.go | 83 +++++++++++++++++++++++++++ pkg/s3select/input.go | 9 ++- pkg/s3select/select_test.go | 12 ++++ 5 files changed, 197 insertions(+), 2 deletions(-) create mode 100644 pkg/ioutil/delimited-reader.go create mode 100644 pkg/ioutil/delimited-reader_test.go diff --git a/cmd/object-handlers.go b/cmd/object-handlers.go index 56497713b..f9ff7afc0 100644 --- a/cmd/object-handlers.go +++ b/cmd/object-handlers.go @@ -178,6 +178,10 @@ func (api objectAPIHandlers) SelectObjectContentHandler(w http.ResponseWriter, r writeErrorResponse(w, ErrInvalidQuoteFields, r.URL) return } + if len(selectReq.InputSerialization.CSV.RecordDelimiter) > 2 { + writeErrorResponse(w, ErrInvalidRequestParameter, r.URL) + return + } getObject := objectAPI.GetObject if api.CacheAPI() != nil && !crypto.SSEC.IsRequested(r.Header) { @@ -222,9 +226,13 @@ func (api objectAPIHandlers) SelectObjectContentHandler(w http.ResponseWriter, r if selectReq.InputSerialization.CSV.FileHeaderInfo == "" { selectReq.InputSerialization.CSV.FileHeaderInfo = CSVFileHeaderInfoNone } + if selectReq.InputSerialization.CSV.RecordDelimiter == "" { + selectReq.InputSerialization.CSV.RecordDelimiter = "\n" + } if selectReq.InputSerialization.CSV != nil { options := &s3select.Options{ HasHeader: selectReq.InputSerialization.CSV.FileHeaderInfo != CSVFileHeaderInfoNone, + RecordDelimiter: selectReq.InputSerialization.CSV.RecordDelimiter, FieldDelimiter: selectReq.InputSerialization.CSV.FieldDelimiter, Comments: selectReq.InputSerialization.CSV.Comments, Name: "S3Object", // Default table name for 
all objects diff --git a/pkg/ioutil/delimited-reader.go b/pkg/ioutil/delimited-reader.go new file mode 100644 index 000000000..71b02d7be --- /dev/null +++ b/pkg/ioutil/delimited-reader.go @@ -0,0 +1,87 @@ +/* + * Minio Cloud Storage, (C) 2018 Minio, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package ioutil + +import ( + "bufio" + "io" +) + +var ( + nByte byte = 10 // the byte that corresponds to the '\n' rune. + rByte byte = 13 // the byte that corresponds to the '\r' rune. +) + +// DelimitedReader reduces the custom delimiter to `\n`. +type DelimitedReader struct { + r *bufio.Reader + delimiter []rune // Select can have upto 2 characters as delimiter. + assignEmpty bool // Decides whether the next read byte should be discarded. +} + +// NewDelimitedReader detects the custom delimiter and replaces with `\n`. +func NewDelimitedReader(r io.Reader, delimiter []rune) *DelimitedReader { + return &DelimitedReader{r: bufio.NewReader(r), delimiter: delimiter, assignEmpty: false} +} + +// Reads and replaces the custom delimiter with `\n`. +func (r *DelimitedReader) Read(p []byte) (n int, err error) { + n, err = r.r.Read(p) + if err != nil { + return + } + for i, b := range p { + if r.assignEmpty { + swapAndNullify(p, i) + r.assignEmpty = false + continue + } + if b == rByte && rune(b) != r.delimiter[0] { + // Replace the carriage returns with `\n`. + // Mac styled csv will have `\r` as their record delimiter. 
+ p[i] = nByte + } else if rune(b) == r.delimiter[0] { // E.g., `\r\n`,`ab`,`a` are valid delimiters + if i+1 == len(p) && len(r.delimiter) > 1 { + // If the first delimiter match falls on the boundary, + // peek the next byte and if it matches, discard it in the next byte read. + if nextByte, nerr := r.r.Peek(1); nerr == nil { + if rune(nextByte[0]) == r.delimiter[1] { + p[i] = nByte + // To discard in the next read. + r.assignEmpty = true + } + } + } else if len(r.delimiter) > 1 && rune(p[i+1]) == r.delimiter[1] { + // The second delimiter falls in the same chunk. + p[i] = nByte + r.assignEmpty = true + } else if len(r.delimiter) == 1 { + // Replace with `\n` in case of a single-character delimiter match. + p[i] = nByte + } + } + } + return +} + +// Occupy the first byte space and nullify the last byte. +func swapAndNullify(p []byte, n int) { + for i := n; i < len(p)-1; i++ { + p[i] = p[i+1] + } + p[len(p)-1] = 0 +} diff --git a/pkg/ioutil/delimited-reader_test.go b/pkg/ioutil/delimited-reader_test.go new file mode 100644 index 000000000..452fc5dfd --- /dev/null +++ b/pkg/ioutil/delimited-reader_test.go @@ -0,0 +1,83 @@ +/* + * Minio Cloud Storage, (C) 2016, 2017, 2018 Minio, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package ioutil + +import ( + "bytes" + "io" + "strings" + "testing" +) + +// Test for DelimitedCSVReader. 
+func TestDelimitedReader(t *testing.T) { + expected := "username,age\nbanana,12\ncarrot,23\napple,34\nbrinjal,90\nraddish,45" + + inputs := []struct { + inputcsv string + delimiter string + chunkSize int + }{ + // case 1 - with default `\n` delimiter. + {"username,age\nbanana,12\ncarrot,23\napple,34\nbrinjal,90\nraddish,45", "\n", 10}, + // case 2 - with carriage return `\r` which should be replaced with `\n` by default. + {"username,age\rbanana,12\rcarrot,23\rapple,34\rbrinjal,90\rraddish,45", "\n", 10}, + // case 3 - with a double character delimiter (octals). + {"username,age\r\nbanana,12\r\ncarrot,23\r\napple,34\r\nbrinjal,90\r\nraddish,45", "\r\n", 10}, + // case 4 - with a double character delimiter. + {"username,agexvbanana,12xvcarrot,23xvapple,34xvbrinjal,90xvraddish,45", "xv", 10}, + // case 5 - with a double character delimiter `\t ` + {"username,age\t banana,12\t carrot,23\t apple,34\t brinjal,90\t raddish,45", "\t ", 10}, + // case 6 - This is a special case where the first delimiter match falls in the 13'th byte space + // ie, the last byte space of the read chunk, In this case the reader should peek in the next byte + // and replace with `\n`. + {"username,agexxbanana,12xxcarrot,23xxapple,34xxbrinjal,90xxraddish,45", "xx", 13}, + } + + for c, input := range inputs { + var readcsv []byte + var err error + delimitedReader := NewDelimitedReader(strings.NewReader(input.inputcsv), []rune(input.delimiter)) + for err == nil { + chunk := make([]byte, input.chunkSize) + _, err = delimitedReader.Read(chunk) + readcsv = append(readcsv, chunk...) + } + if err != io.EOF { + t.Fatalf("Case %d: Error in delimited read", c+1) + } + expected := []byte(expected) + cleanCsv := removeNulls(readcsv) + if !bytes.Equal(cleanCsv, expected) { + t.Fatalf("Case %d: Expected the delimited csv to be `%s`, but instead found `%s`", c+1, string(expected), string(cleanCsv)) + } + } + +} + +// Removes all the tailing nulls in chunks. 
+// Null chunks will be assigned if there is a reduction. +// E.g., when `xv` is reduced to `\n`, the last byte is nullified. +func removeNulls(csv []byte) []byte { + cleanCsv := []byte{} + for _, p := range csv { + if p != 0 { + cleanCsv = append(cleanCsv, p) + } + } + return cleanCsv +} diff --git a/pkg/s3select/input.go b/pkg/s3select/input.go index 8f37aed12..eb47e9669 100644 --- a/pkg/s3select/input.go +++ b/pkg/s3select/input.go @@ -29,6 +29,7 @@ import ( "net/http" gzip "github.com/klauspost/pgzip" + "github.com/minio/minio/pkg/ioutil" ) const ( @@ -79,6 +80,9 @@ type Options struct { // HasHeader when true, will treat the first row as a header row. HasHeader bool + // RecordDelimiter is the string that records are delimited by. + RecordDelimiter string + // FieldDelimiter is the string that fields are delimited by. FieldDelimiter string @@ -127,7 +131,8 @@ func NewInput(opts *Options) (*Input, error) { tempBytesScanned = opts.StreamSize myReader = bzip2.NewReader(opts.ReadFrom) } - + // DelimitedReader treats custom record delimiters like `\r\n`,`\r`,`ab` etc. and replaces them with `\n`. 
+ normalizedReader := ioutil.NewDelimitedReader(myReader, []rune(opts.RecordDelimiter)) progress := &statInfo{ BytesScanned: tempBytesScanned, BytesProcessed: 0, @@ -135,7 +140,7 @@ func NewInput(opts *Options) (*Input, error) { } reader := &Input{ options: opts, - reader: csv.NewReader(myReader), + reader: csv.NewReader(normalizedReader), stats: progress, } reader.firstRow = nil diff --git a/pkg/s3select/select_test.go b/pkg/s3select/select_test.go index ff48539ba..8f03fb26e 100644 --- a/pkg/s3select/select_test.go +++ b/pkg/s3select/select_test.go @@ -48,6 +48,7 @@ func TestCheckForDuplicates(t *testing.T) { func TestMyProcessing(t *testing.T) { options := &Options{ HasHeader: false, + RecordDelimiter: "\n", FieldDelimiter: ",", Comments: "", Name: "S3Object", // Default table name for all objects @@ -96,6 +97,7 @@ func TestMyProcessing(t *testing.T) { func TestMyRowIndexResults(t *testing.T) { options := &Options{ HasHeader: false, + RecordDelimiter: "\n", FieldDelimiter: ",", Comments: "", Name: "S3Object", // Default table name for all objects @@ -271,6 +273,7 @@ func TestMyParser(t *testing.T) { for _, table := range tables { options := &Options{ HasHeader: false, + RecordDelimiter: "\n", FieldDelimiter: ",", Comments: "", Name: "S3Object", // Default table name for all objects @@ -346,6 +349,7 @@ func TestMyAggregationFunc(t *testing.T) { func TestToStringAgg(t *testing.T) { options := &Options{ HasHeader: false, + RecordDelimiter: "\n", FieldDelimiter: ",", Comments: "", Name: "S3Object", // Default table name for all objects @@ -380,6 +384,7 @@ func TestToStringAgg(t *testing.T) { func TestMyRowColLiteralResults(t *testing.T) { options := &Options{ HasHeader: false, + RecordDelimiter: "\n", FieldDelimiter: ",", Comments: "", Name: "S3Object", // Default table name for all objects @@ -459,6 +464,7 @@ func TestMyWhereEval(t *testing.T) { for _, table := range tables { options := &Options{ HasHeader: false, + RecordDelimiter: "\n", FieldDelimiter: ",", 
Comments: "", Name: "S3Object", // Default table name for all objects @@ -610,6 +616,7 @@ func TestInterpreter(t *testing.T) { for _, table := range tables { options := &Options{ HasHeader: false, + RecordDelimiter: "\n", FieldDelimiter: ",", Comments: "", Name: "S3Object", // Default table name for all objects @@ -651,6 +658,7 @@ func TestInterpreter(t *testing.T) { func TestMyXMLFunction(t *testing.T) { options := &Options{ HasHeader: false, + RecordDelimiter: "\n", FieldDelimiter: ",", Comments: "", Name: "S3Object", // Default table name for all objects @@ -689,6 +697,7 @@ func TestMyXMLFunction(t *testing.T) { func TestMyProtocolFunction(t *testing.T) { options := &Options{ HasHeader: false, + RecordDelimiter: "\n", FieldDelimiter: ",", Comments: "", Name: "S3Object", // Default table name for all objects @@ -732,6 +741,7 @@ func TestMyProtocolFunction(t *testing.T) { func TestMyInfoProtocolFunctions(t *testing.T) { options := &Options{ HasHeader: true, + RecordDelimiter: "\n", FieldDelimiter: ",", Comments: "", Name: "S3Object", // Default table name for all objects @@ -773,6 +783,7 @@ func TestMyInfoProtocolFunctions(t *testing.T) { func TestMyErrorProtocolFunctions(t *testing.T) { options := &Options{ HasHeader: false, + RecordDelimiter: "\n", FieldDelimiter: ",", Comments: "", Name: "S3Object", // Default table name for all objects @@ -1007,6 +1018,7 @@ func TestMyValids(t *testing.T) { for _, table := range tables { options := &Options{ HasHeader: false, + RecordDelimiter: "\n", FieldDelimiter: ",", Comments: "", Name: "S3Object", // Default table name for all objects