From 037319066f3ebf77433788314056f57a90622dd9 Mon Sep 17 00:00:00 2001 From: Yao Zongyou Date: Sat, 6 Jul 2019 00:43:10 +0800 Subject: [PATCH] fix unicode support related bugs in s3select (#7877) --- pkg/s3select/csv/reader.go | 5 ++--- pkg/s3select/sql/funceval.go | 2 +- pkg/s3select/sql/stringfuncs.go | 21 ++++++++++++++++----- pkg/s3select/sql/stringfuncs_test.go | 24 ++++++++++++++++++++++++ 4 files changed, 43 insertions(+), 9 deletions(-) diff --git a/pkg/s3select/csv/reader.go b/pkg/s3select/csv/reader.go index 8114b78d0..b917a4139 100644 --- a/pkg/s3select/csv/reader.go +++ b/pkg/s3select/csv/reader.go @@ -141,9 +141,8 @@ func NewReader(readCloser io.ReadCloser, args *ReaderArgs) (*Reader, error) { // If LazyQuotes is true, a quote may appear in an unquoted field and a // non-doubled quote may appear in a quoted field. csvReader.LazyQuotes = true - // If TrimLeadingSpace is true, leading white space in a field is ignored. - // This is done even if the field delimiter, Comma, is white space. - csvReader.TrimLeadingSpace = true + // We do not trim leading space to keep consistent with s3. + csvReader.TrimLeadingSpace = false r := &Reader{ args: args, diff --git a/pkg/s3select/sql/funceval.go b/pkg/s3select/sql/funceval.go index 51dcbdbd9..574db2f52 100644 --- a/pkg/s3select/sql/funceval.go +++ b/pkg/s3select/sql/funceval.go @@ -196,7 +196,7 @@ func charlen(v *Value) (*Value, error) { err := fmt.Errorf("%s/%s expects a string argument", sqlFnCharLength, sqlFnCharacterLength) return nil, errIncorrectSQLFunctionArgumentType(err) } - return FromInt(int64(len(s))), nil + return FromInt(int64(len([]rune(s)))), nil } func lowerCase(v *Value) (*Value, error) { diff --git a/pkg/s3select/sql/stringfuncs.go b/pkg/s3select/sql/stringfuncs.go index 4f0dc8537..b1a23956b 100644 --- a/pkg/s3select/sql/stringfuncs.go +++ b/pkg/s3select/sql/stringfuncs.go @@ -142,18 +142,29 @@ func dropRune(text string) (res string, ok bool) { } func evalSQLSubstring(s string, startIdx, length int) (res string, err error) { - if startIdx <= 0 || startIdx > len(s) { - return "", errInvalidSubstringIndexLen + rs := []rune(s) + + // According to s3 document, if startIdx < 1, it is set to 1. + if startIdx < 1 { + startIdx = 1 } + + if startIdx > len(rs) { + startIdx = len(rs) + 1 + } + // StartIdx is 1-based in the input startIdx-- - - rs := []rune(s) endIdx := len(rs) if length != -1 { - if length < 0 || startIdx+length > len(s) { + if length < 0 { return "", errInvalidSubstringIndexLen } + + if length > (endIdx - startIdx) { + length = endIdx - startIdx + } + endIdx = startIdx + length } diff --git a/pkg/s3select/sql/stringfuncs_test.go b/pkg/s3select/sql/stringfuncs_test.go index b97f66aa1..242c2283d 100644 --- a/pkg/s3select/sql/stringfuncs_test.go +++ b/pkg/s3select/sql/stringfuncs_test.go @@ -105,3 +105,27 @@ func TestEvalSQLLike(t *testing.T) { } } } + +func TestEvalSQLSubstring(t *testing.T) { + evalCases := []struct { + s string + startIdx int + length int + resExpected string + errExpected error + }{ + {"abcd", 1, 1, "a", nil}, + {"abcd", -1, 1, "a", nil}, + {"abcd", 999, 999, "", nil}, + {"", 999, 999, "", nil}, + {"测试abc", 1, 1, "测", nil}, + {"测试abc", 5, 5, "c", nil}, + } + + for i, tc := range evalCases { + res, err := evalSQLSubstring(tc.s, tc.startIdx, tc.length) + if res != tc.resExpected || err != tc.errExpected { + t.Errorf("Eval Case %d failed: %v %v", i, res, err) + } + } +}