s3select should honour custom record delimiter (#6419)
Allow custom record delimiters such as `\r\n`, `a`, or `\r` in the input CSV and replace them with `\n`. Fixes #6403
master
parent
92bc7caf7a
commit
30d4a2cf53
@ -0,0 +1,87 @@ |
||||
/* |
||||
* Minio Cloud Storage, (C) 2018 Minio, Inc. |
||||
* |
||||
* Licensed under the Apache License, Version 2.0 (the "License"); |
||||
* you may not use this file except in compliance with the License. |
||||
* You may obtain a copy of the License at |
||||
* |
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
* |
||||
* Unless required by applicable law or agreed to in writing, software |
||||
* distributed under the License is distributed on an "AS IS" BASIS, |
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
||||
* See the License for the specific language governing permissions and |
||||
* limitations under the License. |
||||
*/ |
||||
|
||||
package ioutil |
||||
|
||||
import ( |
||||
"bufio" |
||||
"io" |
||||
) |
||||
|
||||
// Byte values used when normalising record delimiters. They never change, so
// they are declared as typed constants rather than mutable package variables.
const (
	nByte byte = '\n' // line feed (decimal 10), the normalised record delimiter.
	rByte byte = '\r' // carriage return (decimal 13).
)
||||
|
||||
// DelimitedReader wraps an io.Reader and rewrites a custom record delimiter
// of one or two characters to a single `\n`.
//
// Delimiter runes are compared against individual input bytes, so only code
// points that fit in one byte can ever match. Only the first two runes of the
// delimiter are consulted.
type DelimitedReader struct {
	r           *bufio.Reader
	delimiter   []rune // Select can have up to 2 characters as delimiter.
	assignEmpty bool   // True when the first byte of the next read must be discarded.
}

// NewDelimitedReader returns a DelimitedReader that reads from r and replaces
// every occurrence of delimiter with `\n`.
func NewDelimitedReader(r io.Reader, delimiter []rune) *DelimitedReader {
	return &DelimitedReader{r: bufio.NewReader(r), delimiter: delimiter, assignEmpty: false}
}

// Read fills p with data from the underlying reader, replacing the custom
// delimiter with `\n`. A `\r` that is not the first delimiter character is
// also replaced with `\n`, because Mac styled csv uses `\r` as its record
// delimiter.
//
// The returned n counts only the translated bytes: when a two-character
// delimiter collapses into one `\n`, n shrinks accordingly. The bytes of p
// that the collapse frees up (between the new n and the raw read length) are
// zeroed, so callers that look past n see NUL padding rather than stale data.
// Bytes that arrive together with a non-nil error are translated as well.
func (r *DelimitedReader) Read(p []byte) (n int, err error) {
	if len(p) == 0 {
		// Nothing to translate; let the buffered reader decide what to report.
		return r.r.Read(p)
	}
	for {
		raw, rerr := r.r.Read(p)
		n = r.translate(p[:raw])
		// Read again only when bytes were consumed but every one of them was
		// the discarded second half of a delimiter; otherwise a (0, nil)
		// result would be returned for no reason.
		if n > 0 || rerr != nil || raw == 0 {
			return n, rerr
		}
	}
}

// translate rewrites buf in place, compacting it as delimiters collapse, and
// returns the number of bytes that remain valid at the front of buf.
func (r *DelimitedReader) translate(buf []byte) int {
	w := 0 // next write position; never ahead of the read position i.
	for i := 0; i < len(buf); i++ {
		if r.assignEmpty {
			// Second delimiter byte left over from the previous chunk.
			r.assignEmpty = false
			continue
		}
		b := buf[i]
		out := b
		switch {
		case len(r.delimiter) > 0 && rune(b) == r.delimiter[0]: // Eg, `\r\n`,`ab`,`a` are valid delimiters
			switch {
			case len(r.delimiter) == 1:
				// Single character delimiter match.
				out = '\n'
			case i+1 < len(buf):
				// The second delimiter character falls in the same chunk.
				if rune(buf[i+1]) == r.delimiter[1] {
					out = '\n'
					i++ // consume the second delimiter byte as well.
				}
			default:
				// The first delimiter character is the last byte of this
				// chunk: peek at the next byte and, if it completes the
				// delimiter, discard it at the start of the next read.
				if next, perr := r.r.Peek(1); perr == nil && rune(next[0]) == r.delimiter[1] {
					out = '\n'
					r.assignEmpty = true
				}
			}
		case b == '\r':
			// Mac styled csv will have `\r` as their record delimiter.
			out = '\n'
		}
		buf[w] = out
		w++
	}
	// Clear the slots vacated by compaction.
	for j := w; j < len(buf); j++ {
		buf[j] = 0
	}
	return w
}
||||
|
||||
// swapAndNullify removes the byte at index n from p by shifting every later
// byte one position to the left, then sets the last byte of p to zero so the
// slice keeps its length. When n is at or past the last index nothing is
// shifted and only the last byte is cleared. An empty p is left untouched.
// n must not be negative.
func swapAndNullify(p []byte, n int) {
	if len(p) == 0 {
		// There is no last byte to clear.
		return
	}
	last := len(p) - 1
	if n < last {
		copy(p[n:], p[n+1:])
	}
	p[last] = 0
}
@ -0,0 +1,83 @@ |
||||
/* |
||||
* Minio Cloud Storage, (C) 2016, 2017, 2018 Minio, Inc. |
||||
* |
||||
* Licensed under the Apache License, Version 2.0 (the "License"); |
||||
* you may not use this file except in compliance with the License. |
||||
* You may obtain a copy of the License at |
||||
* |
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
* |
||||
* Unless required by applicable law or agreed to in writing, software |
||||
* distributed under the License is distributed on an "AS IS" BASIS, |
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
||||
* See the License for the specific language governing permissions and |
||||
* limitations under the License. |
||||
*/ |
||||
|
||||
package ioutil |
||||
|
||||
import ( |
||||
"bytes" |
||||
"io" |
||||
"strings" |
||||
"testing" |
||||
) |
||||
|
||||
// Test for DelimitedCSVReader.
|
||||
func TestDelimitedReader(t *testing.T) { |
||||
expected := "username,age\nbanana,12\ncarrot,23\napple,34\nbrinjal,90\nraddish,45" |
||||
|
||||
inputs := []struct { |
||||
inputcsv string |
||||
delimiter string |
||||
chunkSize int |
||||
}{ |
||||
// case 1 - with default `\n` delimiter.
|
||||
{"username,age\nbanana,12\ncarrot,23\napple,34\nbrinjal,90\nraddish,45", "\n", 10}, |
||||
// case 2 - with carriage return `\r` which should be replaced with `\n` by default.
|
||||
{"username,age\rbanana,12\rcarrot,23\rapple,34\rbrinjal,90\rraddish,45", "\n", 10}, |
||||
// case 3 - with a double character delimiter (octals).
|
||||
{"username,age\r\nbanana,12\r\ncarrot,23\r\napple,34\r\nbrinjal,90\r\nraddish,45", "\r\n", 10}, |
||||
// case 4 - with a double character delimiter.
|
||||
{"username,agexvbanana,12xvcarrot,23xvapple,34xvbrinjal,90xvraddish,45", "xv", 10}, |
||||
// case 5 - with a double character delimiter `\t `
|
||||
{"username,age\t banana,12\t carrot,23\t apple,34\t brinjal,90\t raddish,45", "\t ", 10}, |
||||
// case 6 - This is a special case where the first delimiter match falls in the 13'th byte space
|
||||
// ie, the last byte space of the read chunk, In this case the reader should peek in the next byte
|
||||
// and replace with `\n`.
|
||||
{"username,agexxbanana,12xxcarrot,23xxapple,34xxbrinjal,90xxraddish,45", "xx", 13}, |
||||
} |
||||
|
||||
for c, input := range inputs { |
||||
var readcsv []byte |
||||
var err error |
||||
delimitedReader := NewDelimitedReader(strings.NewReader(input.inputcsv), []rune(input.delimiter)) |
||||
for err == nil { |
||||
chunk := make([]byte, input.chunkSize) |
||||
_, err = delimitedReader.Read(chunk) |
||||
readcsv = append(readcsv, chunk...) |
||||
} |
||||
if err != io.EOF { |
||||
t.Fatalf("Case %d: Error in delimited read", c+1) |
||||
} |
||||
expected := []byte(expected) |
||||
cleanCsv := removeNulls(readcsv) |
||||
if !bytes.Equal(cleanCsv, expected) { |
||||
t.Fatalf("Case %d: Expected the delimited csv to be `%s`, but instead found `%s`", c+1, string(expected), string(cleanCsv)) |
||||
} |
||||
} |
||||
|
||||
} |
||||
|
||||
// removeNulls returns a copy of csv with every zero byte removed, wherever it
// occurs in the slice, not only at the end. The input slice is not modified.
// Null bytes appear in the chunks when there is a reduction,
// Eg, When `xv` is reduced to `\n`, the last byte is nullified.
func removeNulls(csv []byte) []byte {
	// The result can never be longer than the input, so size it once.
	cleanCsv := make([]byte, 0, len(csv))
	for _, p := range csv {
		if p != 0 {
			cleanCsv = append(cleanCsv, p)
		}
	}
	return cleanCsv
}
Loading…
Reference in new issue