diff --git a/mint/run/core/s3select/csv.py b/mint/run/core/s3select/csv.py new file mode 100644 index 000000000..faee96518 --- /dev/null +++ b/mint/run/core/s3select/csv.py @@ -0,0 +1,172 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +# MinIO Python Library for Amazon S3 Compatible Cloud Storage, +# (C) 2015-2020 MinIO, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import io + +from minio import Minio +from minio.select.options import (SelectObjectOptions, CSVInput, + RequestProgress, InputSerialization, + OutputSerialization, CSVOutput, JsonOutput) + +from utils import * + +def test_sql_api(test_name, client, bucket_name, input_data, sql_opts, expected_output): + """ Test if the passed SQL request has the output equal to the passed execpted one""" + object_name = generate_object_name() + got_output = b'' + try: + bytes_content = io.BytesIO(input_data) + client.put_object(bucket_name, object_name, io.BytesIO(input_data), len(input_data)) + data = client.select_object_content(bucket_name, object_name, sql_opts) + # Get the records + records = io.BytesIO() + for d in data.stream(10*1024): + records.write(d.encode('utf-8')) + got_output = records.getvalue() + except Exception as select_err: + if not isinstance(expected_output, Exception): + raise ValueError('Test {} unexpectedly failed with: {}'.format(test_name, select_err)) + else: + if isinstance(expected_output, Exception): + raise ValueError('Test {}: expected an exception, got {}'.format(test_name, got_output)) + if got_output != expected_output: + raise ValueError('Test {}: data mismatch. Expected : {}, Received {}'.format(test_name, expected_output, got_output)) + finally: + client.remove_object(bucket_name, object_name) + + +def test_csv_input_custom_quote_char(client, log_output): + # Get a unique bucket_name and object_name + log_output.args['bucket_name'] = bucket_name = generate_bucket_name() + + tests = [ + # Invalid quote character, should fail + ('""', '"', b'col1,col2,col3\n', Exception()), + # UTF-8 quote character + ('ع', '"', 'عcol1ع,عcol2ع,عcol3ع\n'.encode(), b'{"_1":"col1","_2":"col2","_3":"col3"}\n'), + # Only one field is quoted + ('"', '"', b'"col1",col2,col3\n', b'{"_1":"col1","_2":"col2","_3":"col3"}\n'), + ('"', '"', b'"col1,col2,col3"\n', b'{"_1":"col1,col2,col3"}\n'), + ('\'', '"', b'"col1",col2,col3\n', b'{"_1":"\\"col1\\"","_2":"col2","_3":"col3"}\n'), + ('', '"', b'"col1",col2,col3\n', b'{"_1":"\\"col1\\"","_2":"col2","_3":"col3"}\n'), + ('', '"', b'"col1",col2,col3\n', b'{"_1":"\\"col1\\"","_2":"col2","_3":"col3"}\n'), + ('', '"', b'"col1","col2","col3"\n', b'{"_1":"\\"col1\\"","_2":"\\"col2\\"","_3":"\\"col3\\""}\n'), + ('"', '"', b'""""""\n', b'{"_1":"\\"\\""}\n'), + ('"', '"', b'A",B\n', b'{"_1":"A\\"","_2":"B"}\n'), + ('"', '"', b'A"",B\n', b'{"_1":"A\\"\\"","_2":"B"}\n'), + ('"', '\\', b'A\\B,C\n', b'{"_1":"A\\\\B","_2":"C"}\n'), + ('"', '"', b'"A""B","CD"\n', b'{"_1":"A\\"B","_2":"CD"}\n'), + ('"', '\\', b'"A\\B","CD"\n', b'{"_1":"AB","_2":"CD"}\n'), + ('"', '\\', b'"A\\,","CD"\n', b'{"_1":"A,","_2":"CD"}\n'), + ('"', '\\', b'"A\\"B","CD"\n', b'{"_1":"A\\"B","_2":"CD"}\n'), + ('"', '\\', b'"A\\""\n', b'{"_1":"A\\""}\n'), + ('"', '\\', b'"A\\"\\"B"\n', b'{"_1":"A\\"\\"B"}\n'), + ('"', '\\', b'"A\\"","\\"B"\n', b'{"_1":"A\\"","_2":"\\"B"}\n'), + ] + + client.make_bucket(bucket_name) + + try: + for idx, (quote_char, escape_char, data, expected_output) in enumerate(tests): + sql_opts = SelectObjectOptions( + expression="select * from s3object", + input_serialization=InputSerialization( + compression_type="NONE", + csv=CSVInput(FileHeaderInfo="NONE", + RecordDelimiter="\n", + FieldDelimiter=",", + QuoteCharacter=quote_char, + QuoteEscapeCharacter=escape_char, + Comments="#", + AllowQuotedRecordDelimiter="FALSE",), + ), + output_serialization=OutputSerialization( + json = JsonOutput( + RecordDelimiter="\n", + ) + ), + request_progress=RequestProgress( + enabled="False" + ) + ) + + test_sql_api(f'test_{idx}', client, bucket_name, data, sql_opts, expected_output) + finally: + client.remove_bucket(bucket_name) + + # Test passes + print(log_output.json_report()) + +def test_csv_output_custom_quote_char(client, log_output): + # Get a unique bucket_name and object_name + log_output.args['bucket_name'] = bucket_name = generate_bucket_name() + + tests = [ + # UTF-8 quote character + ("''", "''", b'col1,col2,col3\n', Exception()), + ("'", "'", b'col1,col2,col3\n', b"'col1','col2','col3'\n"), + ("", '"', b'col1,col2,col3\n', b'\x00col1\x00,\x00col2\x00,\x00col3\x00\n'), + ('"', '"', b'col1,col2,col3\n', b'"col1","col2","col3"\n'), + ('"', '"', b'col"1,col2,col3\n', b'"col""1","col2","col3"\n'), + ('"', '"', b'""""\n', b'""""\n'), + ('"', '"', b'\n', b''), + ("'", "\\", b'col1,col2,col3\n', b"'col1','col2','col3'\n"), + ("'", "\\", b'col""1,col2,col3\n', b"'col\"\"1','col2','col3'\n"), + ("'", "\\", b'col\'1,col2,col3\n', b"'col\\'1','col2','col3'\n"), + ("'", "\\", b'"col\'1","col2","col3"\n', b"'col\\'1','col2','col3'\n"), + ("'", "\\", b'col\'\n', b"'col\\''\n"), + # Two consecutive escaped quotes + ("'", "\\", b'"a"""""\n', b"'a\"\"'\n"), + ] + + client.make_bucket(bucket_name) + + try: + for idx, (quote_char, escape_char, input_data, expected_output) in enumerate(tests): + sql_opts = SelectObjectOptions( + expression="select * from s3object", + input_serialization=InputSerialization( + compression_type="NONE", + csv=CSVInput(FileHeaderInfo="NONE", + RecordDelimiter="\n", + FieldDelimiter=",", + QuoteCharacter='"', + QuoteEscapeCharacter='"', + Comments="#", + AllowQuotedRecordDelimiter="FALSE",), + ), + output_serialization=OutputSerialization( + csv=CSVOutput(QuoteFields="ALWAYS", + RecordDelimiter="\n", + FieldDelimiter=",", + QuoteCharacter=quote_char, + QuoteEscapeCharacter=escape_char,) + ), + request_progress=RequestProgress( + enabled="False" + ) + ) + + test_sql_api(f'test_{idx}', client, bucket_name, input_data, sql_opts, expected_output) + finally: + client.remove_bucket(bucket_name) + + # Test passes + print(log_output.json_report()) + + diff --git a/mint/run/core/s3select/sql_ops.py b/mint/run/core/s3select/sql_ops.py new file mode 100644 index 000000000..83931ad21 --- /dev/null +++ b/mint/run/core/s3select/sql_ops.py @@ -0,0 +1,399 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +# MinIO Python Library for Amazon S3 Compatible Cloud Storage, +# (C) 2020 MinIO, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import io +from datetime import datetime + +from minio import Minio +from minio.select.options import (SelectObjectOptions, CSVInput, JSONInput, + RequestProgress, InputSerialization, + OutputSerialization, CSVOutput, JsonOutput) + +from utils import * + +def test_sql_expressions_custom_input_output(client, input_bytes, sql_input, sql_output, tests, log_output): + bucket_name = generate_bucket_name() + object_name = generate_object_name() + + log_output.args['total_tests'] = 0 + log_output.args['total_success'] = 0 + + client.make_bucket(bucket_name) + try: + content = io.BytesIO(bytes(input_bytes, 'utf-8')) + client.put_object(bucket_name, object_name, content, len(input_bytes)) + + for idx, (test_name, select_expression, expected_output) in enumerate(tests): + if select_expression == '': + continue + try: + log_output.args['total_tests'] += 1 + options = SelectObjectOptions( + expression=select_expression, + input_serialization=sql_input, + output_serialization=sql_output, + request_progress=RequestProgress( + enabled="False" + ) + ) + + data = client.select_object_content(bucket_name, object_name, options) + + # Get the records + records = io.BytesIO() + for d in data.stream(10*1024): + records.write(d.encode('utf-8')) + got_output = records.getvalue() + + if got_output != expected_output: + if type(expected_output) == datetime: + # Attempt to parse the date which will throw an exception for any issue + datetime.strptime(got_output.decode("utf-8").strip(), '%Y-%m-%dT%H:%M:%S.%f%z') + else: + raise ValueError('Test {}: data mismatch. Expected : {}. Received: {}.'.format(idx+1, expected_output, got_output)) + + log_output.args['total_success'] += 1 + except Exception as err: + continue ## TODO, raise instead + # raise Exception(err) + finally: + client.remove_object(bucket_name, object_name) + client.remove_bucket(bucket_name) + + +def test_sql_expressions(client, input_json_bytes, tests, log_output): + input_serialization = InputSerialization( + compression_type="NONE", + json=JSONInput(Type="DOCUMENT"), + ) + + output_serialization=OutputSerialization( + csv=CSVOutput(QuoteFields="ASNEEDED") + ) + + test_sql_expressions_custom_input_output(client, input_json_bytes, + input_serialization, output_serialization, tests, log_output) + + +def test_sql_operators(client, log_output): + + json_testfile = """{"id": 1, "name": "John", "age": 3} +{"id": 2, "name": "Elliot", "age": 4} +{"id": 3, "name": "Yves", "age": 5} +{"id": 4, "name": null, "age": 0} +""" + + tests = [ + # Logical operators + ("AND", "select * from S3Object s where s.id = 1 AND s.name = 'John'", b'1,John,3\n'), + ("NOT", "select * from S3Object s where NOT s.id = 1", b'2,Elliot,4\n3,Yves,5\n4,,0\n'), + ("OR", "select * from S3Object s where s.id = 1 OR s.id = 3", b'1,John,3\n3,Yves,5\n'), + # Comparison Operators + ("<", "select * from S3Object s where s.age < 4", b'1,John,3\n4,,0\n'), + (">", "select * from S3Object s where s.age > 4", b'3,Yves,5\n'), + ("<=", "select * from S3Object s where s.age <= 4", b'1,John,3\n2,Elliot,4\n4,,0\n'), + (">=", "select * from S3Object s where s.age >= 4", b'2,Elliot,4\n3,Yves,5\n'), + ("=", "select * from S3Object s where s.age = 4", b'2,Elliot,4\n'), + ("<>", "select * from S3Object s where s.age <> 4", b'1,John,3\n3,Yves,5\n4,,0\n'), + ("!=", "select * from S3Object s where s.age != 4", b'1,John,3\n3,Yves,5\n4,,0\n'), + ("BETWEEN", "select * from S3Object s where s.age BETWEEN 4 AND 5", b'2,Elliot,4\n3,Yves,5\n'), + ("IN", "select * from S3Object s where s.age IN (3,5)", b'1,John,3\n3,Yves,5\n'), + # Pattern Matching Operators + ("LIKE_", "select * from S3Object s where s.name LIKE '_ves'", b'3,Yves,5\n'), + ("LIKE%", "select * from S3Object s where s.name LIKE 'Ell%t'", b'2,Elliot,4\n'), + # Unitary Operators + ("NULL", "select * from S3Object s where s.name IS NULL", b'4,,0\n'), + ("NOT_NULL", "select * from S3Object s where s.age IS NOT NULL", b'1,John,3\n2,Elliot,4\n3,Yves,5\n4,,0\n'), + # Math Operators + ("+", "select * from S3Object s where s.age = 1+3 ", b'2,Elliot,4\n'), + ("-", "select * from S3Object s where s.age = 5-1 ", b'2,Elliot,4\n'), + ("*", "select * from S3Object s where s.age = 2*2 ", b'2,Elliot,4\n'), + ("%", "select * from S3Object s where s.age = 10%6 ", b'2,Elliot,4\n'), + ] + + try: + test_sql_expressions(client, json_testfile, tests, log_output) + except Exception as select_err: + raise select_err + # raise ValueError('Test {} unexpectedly failed with: {}'.format(test_name, select_err)) + # pass + + # Test passes + print(log_output.json_report()) + + +def test_sql_operators_precedence(client, log_output): + + json_testfile = """{"id": 1, "name": "Eric"}""" + + tests = [ + ("-_1", "select -3*3 from S3Object", b'-9\n'), + ("*", "select 10-3*2 from S3Object", b'4\n'), + ("/", "select 13-10/5 from S3Object", b'11\n'), + ("%", "select 13-10%5 from S3Object", b'13\n'), + ("+", "select 1+1*3 from S3Object", b'4\n'), + ("-_2", "select 1-1*3 from S3Object", b'-2\n'), + ("=", "select * from S3Object as s where s.id = 13-12", b'1,Eric\n'), + ("<>", "select * from S3Object as s where s.id <> 1-1", b'1,Eric\n'), + ("NOT", "select * from S3Object where false OR NOT false", b'1,Eric\n'), + ("AND", "select * from S3Object where true AND true OR false ", b'1,Eric\n'), + ("OR", "select * from S3Object where false OR NOT false", b'1,Eric\n'), + ("IN", "select * from S3Object as s where s.id <> -1 AND s.id IN (1,2,3)", b'1,Eric\n'), + ("BETWEEN", "select * from S3Object as s where s.id <> -1 AND s.id BETWEEN -1 AND 3", b'1,Eric\n'), + ("LIKE", "select * from S3Object as s where s.id <> -1 AND s.name LIKE 'E%'", b'1,Eric\n'), + ] + + try: + test_sql_expressions(client, json_testfile, tests, log_output) + except Exception as select_err: + raise select_err + # raise ValueError('Test {} unexpectedly failed with: {}'.format(test_name, select_err)) + # pass + + # Test passes + print(log_output.json_report()) + + + +def test_sql_functions_agg_cond_conv(client, log_output): + + json_testfile = """{"id": 1, "name": "John", "age": 3} +{"id": 2, "name": "Elliot", "age": 4} +{"id": 3, "name": "Yves", "age": 5} +{"id": 4, "name": "Christine", "age": null} +{"id": 5, "name": "Eric", "age": 0} +""" + tests = [ + # Aggregate functions + ("COUNT", "select count(*) from S3Object s", b'5\n'), + ("AVG", "select avg(s.age) from S3Object s", b'3\n'), + ("MAX", "select max(s.age) from S3Object s", b'5\n'), + ("MIN", "select min(s.age) from S3Object s", b'0\n'), + ("SUM", "select sum(s.age) from S3Object s", b'12\n'), + # Conditional functions + ("COALESCE", "SELECT COALESCE(s.age, 99) FROM S3Object s", b'3\n4\n5\n99\n0\n'), + ("NULLIF", "SELECT NULLIF(s.age, 0) FROM S3Object s", b'3\n4\n5\n\n\n'), + ## Conversion functions + ("CAST", "SELECT CAST(s.age AS FLOAT) FROM S3Object s", b'3.0\n4.0\n5.0\n\n0.0\n'), + + ] + + try: + test_sql_expressions(client, json_testfile, tests, log_output) + except Exception as select_err: + raise select_err + # raise ValueError('Test {} unexpectedly failed with: {}'.format(test_name, select_err)) + # pass + + # Test passes + print(log_output.json_report()) + + +def test_sql_functions_date(client, log_output): + + json_testfile = """ +{"id": 1, "name": "John", "datez": "2017-01-02T03:04:05.006+07:30"} +""" + + tests = [ + # DATE_ADD + ("DATE_ADD_1", "select DATE_ADD(year, 5, TO_TIMESTAMP(s.datez)) from S3Object as s", b'2022-01-02T03:04:05.006+07:30\n'), + ("DATE_ADD_2", "select DATE_ADD(month, 1, TO_TIMESTAMP(s.datez)) from S3Object as s", b'2017-02-02T03:04:05.006+07:30\n'), + ("DATE_ADD_3", "select DATE_ADD(day, -1, TO_TIMESTAMP(s.datez)) from S3Object as s", b'2017-01-01T03:04:05.006+07:30\n'), + ("DATE_ADD_4", "select DATE_ADD(hour, 1, TO_TIMESTAMP(s.datez)) from S3Object as s", b'2017-01-02T04:04:05.006+07:30\n'), + ("DATE_ADD_5", "select DATE_ADD(minute, 5, TO_TIMESTAMP(s.datez)) from S3Object as s", b'2017-01-02T03:09:05.006+07:30\n'), + ("DATE_ADD_6", "select DATE_ADD(second, 5, TO_TIMESTAMP(s.datez)) from S3Object as s", b'2017-01-02T03:04:10.006+07:30\n'), + # DATE_DIFF + ("DATE_DIFF_1", "select DATE_DIFF(year, TO_TIMESTAMP(s.datez), TO_TIMESTAMP('2011-01-01T')) from S3Object as s", b'-6\n'), + ("DATE_DIFF_2", "select DATE_DIFF(month, TO_TIMESTAMP(s.datez), TO_TIMESTAMP('2011T')) from S3Object as s", b'-72\n'), + ("DATE_DIFF_3", "select DATE_DIFF(day, TO_TIMESTAMP(s.datez), TO_TIMESTAMP('2010-01-02T')) from S3Object as s", b'-2556\n'), + # EXTRACT + ("EXTRACT_1", "select EXTRACT(year FROM TO_TIMESTAMP(s.datez)) from S3Object as s", b'2017\n'), + ("EXTRACT_2", "select EXTRACT(month FROM TO_TIMESTAMP(s.datez)) from S3Object as s", b'1\n'), + ("EXTRACT_3", "select EXTRACT(hour FROM TO_TIMESTAMP(s.datez)) from S3Object as s", b'3\n'), + ("EXTRACT_4", "select EXTRACT(minute FROM TO_TIMESTAMP(s.datez)) from S3Object as s", b'4\n'), + ("EXTRACT_5", "select EXTRACT(timezone_hour FROM TO_TIMESTAMP(s.datez)) from S3Object as s", b'7\n'), + ("EXTRACT_6", "select EXTRACT(timezone_minute FROM TO_TIMESTAMP(s.datez)) from S3Object as s", b'30\n'), + # TO_STRING + ("TO_STRING_1", "select TO_STRING(TO_TIMESTAMP(s.datez), 'MMMM d, y') from S3Object as s", b'"January 2, 2017"\n'), + ("TO_STRING_2", "select TO_STRING(TO_TIMESTAMP(s.datez), 'MMM d, yyyy') from S3Object as s", b'"Jan 2, 2017"\n'), + ("TO_STRING_3", "select TO_STRING(TO_TIMESTAMP(s.datez), 'M-d-yy') from S3Object as s", b'1-2-17\n'), + ("TO_STRING_4", "select TO_STRING(TO_TIMESTAMP(s.datez), 'MM-d-y') from S3Object as s", b'01-2-2017\n'), + ("TO_STRING_5", "select TO_STRING(TO_TIMESTAMP(s.datez), 'MMMM d, y h:m a') from S3Object as s", b'"January 2, 2017 3:4 AM"\n'), + ("TO_STRING_6", "select TO_STRING(TO_TIMESTAMP(s.datez), 'y-MM-dd''T''H:m:ssX') from S3Object as s", b'2017-01-02T3:4:05+0730\n'), + ("TO_STRING_7", "select TO_STRING(TO_TIMESTAMP(s.datez), 'y-MM-dd''T''H:m:ssX') from S3Object as s", b'2017-01-02T3:4:05+0730\n'), + ("TO_STRING_8", "select TO_STRING(TO_TIMESTAMP(s.datez), 'y-MM-dd''T''H:m:ssXXXX') from S3Object as s", b'2017-01-02T3:4:05+0730\n'), + ("TO_STRING_9", "select TO_STRING(TO_TIMESTAMP(s.datez), 'y-MM-dd''T''H:m:ssXXXXX') from S3Object as s", b'2017-01-02T3:4:05+07:30\n'), + ("TO_TIMESTAMP", "select TO_TIMESTAMP(s.datez) from S3Object as s", b'2017-01-02T03:04:05.006+07:30\n'), + ("UTCNOW", "select UTCNOW() from S3Object", datetime(1,1,1)), + + ] + + try: + test_sql_expressions(client, json_testfile, tests, log_output) + except Exception as select_err: + raise select_err + # raise ValueError('Test {} unexpectedly failed with: {}'.format(test_name, select_err)) + # pass + + # Test passes + print(log_output.json_report()) + +def test_sql_functions_string(client, log_output): + + json_testfile = """ +{"id": 1, "name": "John"} +{"id": 2, "name": " \tfoobar\t "} +{"id": 3, "name": "1112211foobar22211122"} +""" + + tests = [ + # CHAR_LENGTH + ("CHAR_LENGTH", "select CHAR_LENGTH(s.name) from S3Object as s", b'4\n24\n21\n'), + ("CHARACTER_LENGTH", "select CHARACTER_LENGTH(s.name) from S3Object as s", b'4\n24\n21\n'), + # LOWER + ("LOWER", "select LOWER(s.name) from S3Object as s where s.id= 1", b'john\n'), + # SUBSTRING + ("SUBSTRING_1", "select SUBSTRING(s.name FROM 2) from S3Object as s where s.id = 1", b'ohn\n'), + ("SUBSTRING_2", "select SUBSTRING(s.name FROM 2 FOR 2) from S3Object as s where s.id = 1", b'oh\n'), + ("SUBSTRING_3", "select SUBSTRING(s.name FROM -1 FOR 2) from S3Object as s where s.id = 1", b'\n'), + # TRIM + ("TRIM_1", "select TRIM(s.name) from S3Object as s where s.id = 2", b'\tfoobar\t\n'), + ("TRIM_2", "select TRIM(LEADING FROM s.name) from S3Object as s where s.id = 2", b'\tfoobar\t \n'), + ("TRIM_3", "select TRIM(TRAILING FROM s.name) from S3Object as s where s.id = 2", b' \tfoobar\t\n'), + ("TRIM_4", "select TRIM(BOTH FROM s.name) from S3Object as s where s.id = 2", b'\tfoobar\t\n'), + ("TRIM_5", "select TRIM(BOTH '12' FROM s.name) from S3Object as s where s.id = 3", b'foobar\n'), + # UPPER + ("UPPER", "select UPPER(s.name) from S3Object as s where s.id= 1", b'JOHN\n'), + ] + + try: + test_sql_expressions(client, json_testfile, tests, log_output) + except Exception as select_err: + raise select_err + # raise ValueError('Test {} unexpectedly failed with: {}'.format(test_name, select_err)) + # pass + + # Test passes + print(log_output.json_report()) + + +def test_sql_datatypes(client, log_output): + json_testfile = """ +{"name": "John"} +""" + tests = [ + ("bool", "select CAST('true' AS BOOL) from S3Object", b'true\n'), + ("int", "select CAST('13' AS INT) from S3Object", b'13\n'), + ("integer", "select CAST('13' AS INTEGER) from S3Object", b'13\n'), + ("string", "select CAST(true AS STRING) from S3Object", b'true\n'), + ("float", "select CAST('13.3' AS FLOAT) from S3Object", b'13.3\n'), + ("decimal", "select CAST('14.3' AS FLOAT) from S3Object", b'14.3\n'), + ("numeric", "select CAST('14.3' AS FLOAT) from S3Object", b'14.3\n'), + ("timestamp", "select CAST('2007-04-05T14:30Z' AS TIMESTAMP) from S3Object", b'2007-04-05T14:30Z\n'), + ] + + try: + test_sql_expressions(client, json_testfile, tests, log_output) + except Exception as select_err: + raise select_err + # raise ValueError('Test {} unexpectedly failed with: {}'.format(test_name, select_err)) + # pass + + # Test passes + print(log_output.json_report()) + + +def test_sql_select(client, log_output): + + json_testfile = """{"id": 1, "created": "June 27", "modified": "July 6" } +{"id": 2, "Created": "June 28", "Modified": "July 7", "Cast": "Random Date" }""" + tests = [ + ("select_1", "select * from S3Object", b'1,June 27,July 6\n2,June 28,July 7,Random Date\n'), + ("select_2", "select * from S3Object s", b'1,June 27,July 6\n2,June 28,July 7,Random Date\n'), + ("select_3", "select * from S3Object as s", b'1,June 27,July 6\n2,June 28,July 7,Random Date\n'), + ("select_4", "select s.line from S3Object as s", b'\n\n'), + ("select_5", 'select s."Created" from S3Object as s', b'\nJune 28\n'), + ("select_5", 'select s."Cast" from S3Object as s', b'\nRandom Date\n'), + ("where", 'select s.created from S3Object as s', b'June 27\nJune 28\n'), + ("limit", 'select * from S3Object as s LIMIT 1', b'1,June 27,July 6\n'), + ] + + try: + test_sql_expressions(client, json_testfile, tests, log_output) + except Exception as select_err: + raise select_err + # raise ValueError('Test {} unexpectedly failed with: {}'.format(test_name, select_err)) + # pass + + # Test passes + print(log_output.json_report()) + +def test_sql_select_json(client, log_output): + json_testcontent = """{ "Rules": [ {"id": "1"}, {"expr": "y > x"}, {"id": "2", "expr": "z = DEBUG"} ]} +{ "created": "June 27", "modified": "July 6" } +""" + tests = [ + ("select_1", "SELECT id FROM S3Object[*].Rules[*].id", b'{"id":"1"}\n{}\n{"id":"2"}\n{}\n'), + ("select_2", "SELECT id FROM S3Object[*].Rules[*].id WHERE id IS NOT MISSING", b'{"id":"1"}\n{"id":"2"}\n'), + ("select_3", "SELECT d.created, d.modified FROM S3Object[*] d", b'{}\n{"created":"June 27","modified":"July 6"}\n'), + ("select_4", "SELECT _1.created, _1.modified FROM S3Object[*]", b'{}\n{"created":"June 27","modified":"July 6"}\n'), + ("select_5", "Select s.rules[1].expr from S3Object s", b'{"expr":"y > x"}\n{}\n'), + ] + + input_serialization = InputSerialization(json=JSONInput(Type="DOCUMENT")) + output_serialization = OutputSerialization(json=JsonOutput()) + try: + test_sql_expressions_custom_input_output(client, json_testcontent, + input_serialization, output_serialization, tests, log_output) + except Exception as select_err: + raise select_err + # raise ValueError('Test {} unexpectedly failed with: {}'.format(test_name, select_err)) + # pass + + # Test passes + print(log_output.json_report()) + + +def test_sql_select_csv_no_header(client, log_output): + json_testcontent = """val1,val2,val3 +val4,val5,val6 +""" + tests = [ + ("select_1", "SELECT s._2 FROM S3Object as s", b'val2\nval5\n'), + ] + + input_serialization=InputSerialization( + csv=CSVInput( + FileHeaderInfo="NONE", + AllowQuotedRecordDelimiter="FALSE", + ), + ) + + output_serialization=OutputSerialization(csv=CSVOutput()) + try: + test_sql_expressions_custom_input_output(client, json_testcontent, + input_serialization, output_serialization, tests, log_output) + except Exception as select_err: + raise select_err + # raise ValueError('Test {} unexpectedly failed with: {}'.format(test_name, select_err)) + # pass + + # Test passes + print(log_output.json_report()) + + diff --git a/mint/run/core/s3select/tests.py b/mint/run/core/s3select/tests.py index 5d39cadc0..3e8c26dde 100644 --- a/mint/run/core/s3select/tests.py +++ b/mint/run/core/s3select/tests.py @@ -15,278 +15,13 @@ # See the License for the specific language governing permissions and # limitations under the License. -# from __future__ import division -# from __future__ import absolute_import - import os -import io from sys import exit -import uuid -import inspect -import json -import time -import traceback - from minio import Minio -from minio.select.options import (SelectObjectOptions, CSVInput, - RequestProgress, InputSerialization, - OutputSerialization, CSVOutput, JsonOutput) - -class LogOutput(object): - """ - LogOutput is the class for log output. It is required standard for all - SDK tests controlled by mint. - Here are its attributes: - 'name': name of the SDK under test, e.g. 's3select' - 'function': name of the method/api under test with its signature - The following python code can be used to - pull args information of a and to - put together with the method name: - .__name__+'('+', '.join(args_list)+')' - e.g. 'remove_object(bucket_name, object_name)' - 'args': method/api arguments with their values, in - dictionary form: {'arg1': val1, 'arg2': val2, ...} - 'duration': duration of the whole test in milliseconds, - defaults to 0 - 'alert': any extra information user is needed to be alerted about, - like whether this is a Blocker/Gateway/Server related - issue, etc., defaults to None - 'message': descriptive error message, defaults to None - 'error': stack-trace/exception message(only in case of failure), - actual low level exception/error thrown by the program, - defaults to None - 'status': exit status, possible values are 'PASS', 'FAIL', 'NA', - defaults to 'PASS' - """ - - PASS = 'PASS' - FAIL = 'FAIL' - NA = 'NA' - - def __init__(self, meth, test_name): - self.__args_list = inspect.getargspec(meth).args[1:] - self.__name = 'minio-py:'+test_name - self.__function = meth.__name__+'('+', '.join(self.__args_list)+')' - self.__args = {} - self.__duration = 0 - self.__alert = '' - self.__message = None - self.__error = None - self.__status = self.PASS - self.__start_time = time.time() - - @property - def name(self): return self.__name - - @property - def function(self): return self.__function - - @property - def args(self): return self.__args - - @name.setter - def name(self, val): self.__name = val - - @function.setter - def function(self, val): self.__function = val - - @args.setter - def args(self, val): self.__args = val - - def json_report(self, err_msg='', alert='', status=''): - self.__args = {k: v for k, v in self.__args.items() if v and v != ''} - entry = {'name': self.__name, - 'function': self.__function, - 'args': self.__args, - 'duration': int(round((time.time() - self.__start_time)*1000)), - 'alert': str(alert), - 'message': str(err_msg), - 'error': traceback.format_exc() if err_msg and err_msg != '' else '', - 'status': status if status and status != '' else - self.FAIL if err_msg and err_msg != '' else self.PASS - } - return json.dumps({k: v for k, v in entry.items() if v and v != ''}) - -def generate_bucket_name(): - return "s3select-test-" + uuid.uuid4().__str__() - - -def test_csv_input_custom_quote_char(client, log_output): - # Get a unique bucket_name and object_name - log_output.args['bucket_name'] = bucket_name = generate_bucket_name() - - tests = [ - # Invalid quote character, should fail - ('""', '"', b'col1,col2,col3\n', Exception()), - # UTF-8 quote character - ('ع', '"', b'\xd8\xb9col1\xd8\xb9,\xd8\xb9col2\xd8\xb9,\xd8\xb9col3\xd8\xb9\n', b'{"_1":"col1","_2":"col2","_3":"col3"}\n'), - # Only one field is quoted - ('"', '"', b'"col1",col2,col3\n', b'{"_1":"col1","_2":"col2","_3":"col3"}\n'), - ('"', '"', b'"col1,col2,col3"\n', b'{"_1":"col1,col2,col3"}\n'), - ('\'', '"', b'"col1",col2,col3\n', b'{"_1":"\\"col1\\"","_2":"col2","_3":"col3"}\n'), - ('', '"', b'"col1",col2,col3\n', b'{"_1":"\\"col1\\"","_2":"col2","_3":"col3"}\n'), - ('', '"', b'"col1",col2,col3\n', b'{"_1":"\\"col1\\"","_2":"col2","_3":"col3"}\n'), - ('', '"', b'"col1","col2","col3"\n', b'{"_1":"\\"col1\\"","_2":"\\"col2\\"","_3":"\\"col3\\""}\n'), - ('"', '"', b'""""""\n', b'{"_1":"\\"\\""}\n'), - ('"', '"', b'A",B\n', b'{"_1":"A\\"","_2":"B"}\n'), - ('"', '"', b'A"",B\n', b'{"_1":"A\\"\\"","_2":"B"}\n'), - ('"', '\\', b'A\\B,C\n', b'{"_1":"A\\\\B","_2":"C"}\n'), - ('"', '"', b'"A""B","CD"\n', b'{"_1":"A\\"B","_2":"CD"}\n'), - ('"', '\\', b'"A\\B","CD"\n', b'{"_1":"AB","_2":"CD"}\n'), - ('"', '\\', b'"A\\,","CD"\n', b'{"_1":"A,","_2":"CD"}\n'), - ('"', '\\', b'"A\\"B","CD"\n', b'{"_1":"A\\"B","_2":"CD"}\n'), - ('"', '\\', b'"A\\""\n', b'{"_1":"A\\""}\n'), - ('"', '\\', b'"A\\"\\"B"\n', b'{"_1":"A\\"\\"B"}\n'), - ('"', '\\', b'"A\\"","\\"B"\n', b'{"_1":"A\\"","_2":"\\"B"}\n'), - ] - - try: - client.make_bucket(bucket_name) - - for idx, (quote_char, escape_char, object_content, expected_output) in enumerate(tests): - options = SelectObjectOptions( - expression="select * from s3object", - input_serialization=InputSerialization( - compression_type="NONE", - csv=CSVInput(FileHeaderInfo="NONE", - RecordDelimiter="\n", - FieldDelimiter=",", - QuoteCharacter=quote_char, - QuoteEscapeCharacter=escape_char, - Comments="#", - AllowQuotedRecordDelimiter="FALSE",), - ), - output_serialization=OutputSerialization( - json = JsonOutput( - RecordDelimiter="\n", - ) - ), - request_progress=RequestProgress( - enabled="False" - ) - ) - - got_output = b'' - - try: - got_output = exec_select(client, bucket_name, object_content, options, log_output) - except Exception as select_err: - if not isinstance(expected_output, Exception): - raise ValueError('Test {} unexpectedly failed with: {}'.format(idx+1, select_err)) - else: - if isinstance(expected_output, Exception): - raise ValueError('Test {}: expected an exception, got {}'.format(idx+1, got_output)) - if got_output != expected_output: - raise ValueError('Test {}: data mismatch. Expected : {}, Received {}'.format(idx+1, expected_output, got_output)) - - except Exception as err: - raise Exception(err) - finally: - try: - client.remove_bucket(bucket_name) - except Exception as err: - raise Exception(err) - - # Test passes - print(log_output.json_report()) - -def test_csv_output_custom_quote_char(client, log_output): - # Get a unique bucket_name and object_name - log_output.args['bucket_name'] = bucket_name = generate_bucket_name() - - tests = [ - # UTF-8 quote character - ("''", "''", b'col1,col2,col3\n', Exception()), - ("'", "'", b'col1,col2,col3\n', b"'col1','col2','col3'\n"), - ("", '"', b'col1,col2,col3\n', b'\x00col1\x00,\x00col2\x00,\x00col3\x00\n'), - ('"', '"', b'col1,col2,col3\n', b'"col1","col2","col3"\n'), - ('"', '"', b'col"1,col2,col3\n', b'"col""1","col2","col3"\n'), - ('"', '"', b'""""\n', b'""""\n'), - ('"', '"', b'\n', b''), - ("'", "\\", b'col1,col2,col3\n', b"'col1','col2','col3'\n"), - ("'", "\\", b'col""1,col2,col3\n', b"'col\"\"1','col2','col3'\n"), - ("'", "\\", b'col\'1,col2,col3\n', b"'col\\'1','col2','col3'\n"), - ("'", "\\", b'"col\'1","col2","col3"\n', b"'col\\'1','col2','col3'\n"), - ("'", "\\", b'col\'\n', b"'col\\''\n"), - # Two consecutive escaped quotes - ("'", "\\", b'"a"""""\n', b"'a\"\"'\n"), - ] - - try: - client.make_bucket(bucket_name) - - for idx, (quote_char, escape_char, object_content, expected_output) in enumerate(tests): - options = SelectObjectOptions( - expression="select * from s3object", - input_serialization=InputSerialization( - compression_type="NONE", - csv=CSVInput(FileHeaderInfo="NONE", - RecordDelimiter="\n", - FieldDelimiter=",", - QuoteCharacter='"', - QuoteEscapeCharacter='"', - Comments="#", - AllowQuotedRecordDelimiter="FALSE",), - ), - output_serialization=OutputSerialization( - csv=CSVOutput(QuoteFields="ALWAYS", - RecordDelimiter="\n", - FieldDelimiter=",", - QuoteCharacter=quote_char, - QuoteEscapeCharacter=escape_char,) - ), - request_progress=RequestProgress( - enabled="False" - ) - ) - - got_output = b'' - - try: - got_output = exec_select(client, bucket_name, object_content, options, log_output) - except Exception as select_err: - if not isinstance(expected_output, Exception): - raise ValueError('Test {} unexpectedly failed with: {}'.format(idx+1, select_err)) - else: - if isinstance(expected_output, Exception): - raise ValueError('Test {}: expected an exception, got {}'.format(idx+1, got_output)) - if got_output != expected_output: - raise ValueError('Test {}: data mismatch. Expected : {}. Received: {}.'.format(idx+1, expected_output, got_output)) - - except Exception as err: - raise Exception(err) - finally: - try: - client.remove_bucket(bucket_name) - except Exception as err: - raise Exception(err) - - # Test passes - print(log_output.json_report()) - - -def exec_select(client, bucket_name, object_content, options, log_output): - log_output.args['object_name'] = object_name = uuid.uuid4().__str__() - try: - bytes_content = io.BytesIO(object_content) - client.put_object(bucket_name, object_name, io.BytesIO(object_content), len(object_content)) - - data = client.select_object_content(bucket_name, object_name, options) - # Get the records - records = io.BytesIO() - for d in data.stream(10*1024): - records.write(d.encode('utf-8')) - - return records.getvalue() - - except Exception as err: - raise Exception(err) - finally: - try: - client.remove_object(bucket_name, object_name) - except Exception as err: - raise Exception(err) +from utils import LogOutput +from sql_ops import * +from csv import * def main(): """ @@ -312,6 +47,33 @@ def main(): log_output = LogOutput(client.select_object_content, 'test_csv_output_quote_char') test_csv_output_custom_quote_char(client, log_output) + log_output = LogOutput(client.select_object_content, 'test_sql_operators') + test_sql_operators(client, log_output) + + log_output = LogOutput(client.select_object_content, 'test_sql_operators_precedence') + test_sql_operators_precedence(client, log_output) + + log_output = LogOutput(client.select_object_content, 'test_sql_functions_agg_cond_conv') + test_sql_functions_agg_cond_conv(client, log_output) + + log_output = LogOutput(client.select_object_content, 'test_sql_functions_date') + test_sql_functions_date(client, log_output) + + log_output = LogOutput(client.select_object_content, 'test_sql_functions_string') + test_sql_functions_string(client, log_output) + + log_output = LogOutput(client.select_object_content, 'test_sql_datatypes') + test_sql_datatypes(client, log_output) + + log_output = LogOutput(client.select_object_content, 'test_sql_select') + test_sql_select(client, log_output) + + log_output = LogOutput(client.select_object_content, 'test_sql_select_json') + test_sql_select_json(client, log_output) + + log_output = LogOutput(client.select_object_content, 'test_sql_select_csv') + test_sql_select_csv_no_header(client, log_output) + except Exception as err: print(log_output.json_report(err)) exit(1) @@ -319,3 +81,6 @@ def main(): if __name__ == "__main__": # Execute only if run as a script main() + + + diff --git a/mint/run/core/s3select/utils.py b/mint/run/core/s3select/utils.py new file mode 100644 index 000000000..1baabf983 --- /dev/null +++ b/mint/run/core/s3select/utils.py @@ -0,0 +1,106 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +# MinIO Python Library for Amazon S3 Compatible Cloud Storage, +# (C) 2015-2020 MinIO, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import uuid +import inspect +import json +import time +import traceback + +class LogOutput(object): + """ + LogOutput is the class for log output. It is required standard for all + SDK tests controlled by mint. + Here are its attributes: + 'name': name of the SDK under test, e.g. 's3select' + 'function': name of the method/api under test with its signature + The following python code can be used to + pull args information of a and to + put together with the method name: + .__name__+'('+', '.join(args_list)+')' + e.g. 'remove_object(bucket_name, object_name)' + 'args': method/api arguments with their values, in + dictionary form: {'arg1': val1, 'arg2': val2, ...} + 'duration': duration of the whole test in milliseconds, + defaults to 0 + 'alert': any extra information user is needed to be alerted about, + like whether this is a Blocker/Gateway/Server related + issue, etc., defaults to None + 'message': descriptive error message, defaults to None + 'error': stack-trace/exception message(only in case of failure), + actual low level exception/error thrown by the program, + defaults to None + 'status': exit status, possible values are 'PASS', 'FAIL', 'NA', + defaults to 'PASS' + """ + + PASS = 'PASS' + FAIL = 'FAIL' + NA = 'NA' + + def __init__(self, meth, test_name): + self.__args_list = inspect.getargspec(meth).args[1:] + self.__name = 's3select:'+test_name + self.__function = meth.__name__+'('+', '.join(self.__args_list)+')' + self.__args = {} + self.__duration = 0 + self.__alert = '' + self.__message = None + self.__error = None + self.__status = self.PASS + self.__start_time = time.time() + + @property + def name(self): return self.__name + + @property + def function(self): return self.__function + + @property + def args(self): return self.__args + + @name.setter + def name(self, val): self.__name = val + + @function.setter + def function(self, val): self.__function = val + + @args.setter + def args(self, val): self.__args = val + + def json_report(self, err_msg='', alert='', status=''): + self.__args = {k: v for k, v in self.__args.items() if v and v != ''} + entry = {'name': self.__name, + 'function': self.__function, + 'args': self.__args, + 'duration': int(round((time.time() - self.__start_time)*1000)), + 'alert': str(alert), + 'message': str(err_msg), + 'error': traceback.format_exc() if err_msg and err_msg != '' else '', + 'status': status if status and status != '' else + self.FAIL if err_msg and err_msg != '' else self.PASS + } + return json.dumps({k: v for k, v in entry.items() if v and v != ''}) + + +def generate_bucket_name(): + return "s3select-test-" + str(uuid.uuid4()) + +def generate_object_name(): + return str(uuid.uuid4()) + +