fix unicode support related bugs in s3select (#7877)

master
Yao Zongyou 5 years ago committed by Harshavardhana
parent bb871a7c31
commit 037319066f
  1. 5
      pkg/s3select/csv/reader.go
  2. 2
      pkg/s3select/sql/funceval.go
  3. 21
      pkg/s3select/sql/stringfuncs.go
  4. 24
      pkg/s3select/sql/stringfuncs_test.go

@ -141,9 +141,8 @@ func NewReader(readCloser io.ReadCloser, args *ReaderArgs) (*Reader, error) {
// If LazyQuotes is true, a quote may appear in an unquoted field and a
// non-doubled quote may appear in a quoted field.
csvReader.LazyQuotes = true
// If TrimLeadingSpace is true, leading white space in a field is ignored.
// This is done even if the field delimiter, Comma, is white space.
csvReader.TrimLeadingSpace = true
// We do not trim leading space to keep consistent with s3.
csvReader.TrimLeadingSpace = false
r := &Reader{
args: args,

@ -196,7 +196,7 @@ func charlen(v *Value) (*Value, error) {
err := fmt.Errorf("%s/%s expects a string argument", sqlFnCharLength, sqlFnCharacterLength)
return nil, errIncorrectSQLFunctionArgumentType(err)
}
return FromInt(int64(len(s))), nil
return FromInt(int64(len([]rune(s)))), nil
}
func lowerCase(v *Value) (*Value, error) {

@ -142,18 +142,29 @@ func dropRune(text string) (res string, ok bool) {
}
func evalSQLSubstring(s string, startIdx, length int) (res string, err error) {
if startIdx <= 0 || startIdx > len(s) {
return "", errInvalidSubstringIndexLen
rs := []rune(s)
// According to s3 document, if startIdx < 1, it is set to 1.
if startIdx < 1 {
startIdx = 1
}
if startIdx > len(rs) {
startIdx = len(rs) + 1
}
// StartIdx is 1-based in the input
startIdx--
rs := []rune(s)
endIdx := len(rs)
if length != -1 {
if length < 0 || startIdx+length > len(s) {
if length < 0 {
return "", errInvalidSubstringIndexLen
}
if length > (endIdx - startIdx) {
length = endIdx - startIdx
}
endIdx = startIdx + length
}

@ -105,3 +105,27 @@ func TestEvalSQLLike(t *testing.T) {
}
}
}
func TestEvalSQLSubstring(t *testing.T) {
evalCases := []struct {
s string
startIdx int
length int
resExpected string
errExpected error
}{
{"abcd", 1, 1, "a", nil},
{"abcd", -1, 1, "a", nil},
{"abcd", 999, 999, "", nil},
{"", 999, 999, "", nil},
{"测试abc", 1, 1, "测", nil},
{"测试abc", 5, 5, "c", nil},
}
for i, tc := range evalCases {
res, err := evalSQLSubstring(tc.s, tc.startIdx, tc.length)
if res != tc.resExpected || err != tc.errExpected {
t.Errorf("Eval Case %d failed: %v %v", i, res, err)
}
}
}

Loading…
Cancel
Save