From 5a4a57700b8789ce396cc136e2a0150b66169a53 Mon Sep 17 00:00:00 2001 From: Harshavardhana Date: Fri, 17 Aug 2018 17:11:39 -0700 Subject: [PATCH] Add select docs and fix return values for Select API (#6300) --- cmd/object-handlers.go | 11 ++--- docs/select/README.md | 89 +++++++++++++++++++++++++++++++++++++++++ docs/select/select.py | 33 +++++++++++++++ pkg/s3select/helpers.go | 3 ++ 4 files changed, 131 insertions(+), 5 deletions(-) create mode 100644 docs/select/README.md create mode 100644 docs/select/select.py diff --git a/cmd/object-handlers.go b/cmd/object-handlers.go index 97e2f782e..7e4bd8b5a 100644 --- a/cmd/object-handlers.go +++ b/cmd/object-handlers.go @@ -161,19 +161,22 @@ func (api objectAPIHandlers) SelectObjectContentHandler(w http.ResponseWriter, r writeErrorResponse(w, ErrInvalidExpressionType, r.URL) return } - if len(selectReq.Expression) >= (256 * 1000) { + if len(selectReq.Expression) >= s3select.MaxExpressionLength { writeErrorResponse(w, ErrExpressionTooLong, r.URL) + return } if selectReq.InputSerialization.CSV.FileHeaderInfo != CSVFileHeaderInfoUse && selectReq.InputSerialization.CSV.FileHeaderInfo != CSVFileHeaderInfoNone && selectReq.InputSerialization.CSV.FileHeaderInfo != CSVFileHeaderInfoIgnore && selectReq.InputSerialization.CSV.FileHeaderInfo != "" { writeErrorResponse(w, ErrInvalidFileHeaderInfo, r.URL) + return } if selectReq.OutputSerialization.CSV.QuoteFields != CSVQuoteFieldsAlways && selectReq.OutputSerialization.CSV.QuoteFields != CSVQuoteFieldsAsNeeded && selectReq.OutputSerialization.CSV.QuoteFields != "" { writeErrorResponse(w, ErrInvalidQuoteFields, r.URL) + return } getObject := objectAPI.GetObject @@ -205,8 +208,7 @@ func (api objectAPIHandlers) SelectObjectContentHandler(w http.ResponseWriter, r } go func() { defer reader.Close() - if gerr := getObject(ctx, bucket, object, 0, objInfo.Size, writer, - objInfo.ETag); gerr != nil { + if gerr := getObject(ctx, bucket, object, 0, objInfo.Size, writer, objInfo.ETag); gerr 
!= nil { pipewriter.CloseWithError(gerr) return } @@ -243,10 +245,9 @@ func (api objectAPIHandlers) SelectObjectContentHandler(w http.ResponseWriter, r writeErrorResponse(w, toAPIErrorCode(err), r.URL) return } - if err := s3s.Execute(w); err != nil { + if err = s3s.Execute(w); err != nil { logger.LogIf(ctx, err) } - return } } diff --git a/docs/select/README.md b/docs/select/README.md new file mode 100644 index 000000000..1b8946dde --- /dev/null +++ b/docs/select/README.md @@ -0,0 +1,89 @@ +# Select API Quickstart Guide [![Slack](https://slack.minio.io/slack?type=svg)](https://slack.minio.io) +Traditional retrieval of objects is always as whole entities, i.e. a GetObject for a 5 GiB object will always return 5 GiB of data. S3 Select API allows us to retrieve a subset of data by using simple SQL expressions. By using Select API to retrieve only the data needed by the application, drastic performance improvements can be achieved. + +> This implementation is compatible with AWS S3 Select API + +## 1. Prerequisites +- Install Minio Server from [here](http://docs.minio.io/docs/minio-quickstart-guide). +- Familiarity with AWS S3 API +- Familiarity with Python and installing dependencies. + +## 2. Install boto3 +Install `aws-sdk-python` from AWS SDK for Python official docs [here](https://aws.amazon.com/sdk-for-python/) + +## 3. Example +As an example, let us take a gzip compressed CSV file. Without S3 Select, we would need to download, decompress and process the entire CSV to get the data we need. With Select API, we can use a simple SQL expression to return only the data from the CSV we are interested in, instead of retrieving the entire object. The following Python example shows how to retrieve the rows whose `Location` column matches `United States` from an object containing data in CSV format. + +Please replace ``endpoint_url``, ``aws_access_key_id``, ``aws_secret_access_key``, ``Bucket`` and ``Key`` with your local setup in this ``select.py`` file.
+ +```py +#!/usr/bin/env/env python3 +import boto3 + +s3 = boto3.client('s3', + endpoint_url='http://localhost:9000', + aws_access_key_id='minio', + aws_secret_access_key='minio123', + region_name='us-east-1') + +r = s3.select_object_content( + Bucket='mycsvbucket', + Key='sampledata/TotalPopulation.csv.gz', + ExpressionType='SQL', + Expression="select * from s3object s where s.Location like '%United States%'", + InputSerialization={ + 'CSV': { + "FileHeaderInfo": "USE", + }, + 'CompressionType': 'GZIP', + }, + OutputSerialization={'CSV': {}}, +) + +for event in r['Payload']: + if 'Records' in event: + records = event['Records']['Payload'].decode('utf-8') + print(records) + elif 'Stats' in event: + statsDetails = event['Stats']['Details'] + print("Stats details bytesScanned: ") + print(statsDetails['BytesScanned']) + print("Stats details bytesProcessed: ") + print(statsDetails['BytesProcessed']) +``` + +## 4. Run the Program +First, download a sample dataset from [TotalPopulation.csv](https://esa.un.org/unpd/wpp/DVD/Files/1_Indicators%20(Standard)/CSV_FILES/WPP2017_TotalPopulationBySex.csv) and upload it using the following commands. +```sh +$ curl "https://esa.un.org/unpd/wpp/DVD/Files/1_Indicators%20(Standard)/CSV_FILES/WPP2017_TotalPopulationBySex.csv" > TotalPopulation.csv +$ mc mb myminio/mycsvbucket +$ gzip TotalPopulation.csv +$ mc cp TotalPopulation.csv.gz myminio/mycsvbucket/sampledata/ +``` + +Now let us proceed to run our select example to query for rows whose `Location` matches `United States`. +```sh +$ python3 select.py +840,United States of America,2,Medium,1950,1950.5,79233.218,79571.179,158804.395 + +840,United States of America,2,Medium,1951,1951.5,80178.933,80726.116,160905.035 + +840,United States of America,2,Medium,1952,1952.5,81305.206,82019.632,163324.851 + +840,United States of America,2,Medium,1953,1953.5,82565.875,83422.307,165988.190 +.... +.... +.... + +Stats details bytesScanned: +6758866 +Stats details bytesProcessed: +25786743 +``` + +## 5. 
Explore Further +- [Use `mc` with Minio Server](https://docs.minio.io/docs/minio-client-quickstart-guide) +- [Use `minio-go` SDK with Minio Server](https://docs.minio.io/docs/golang-client-quickstart-guide) +- [Use `aws-cli` with Minio Server](https://docs.minio.io/docs/aws-cli-with-minio) +- [Use `s3cmd` with Minio Server](https://docs.minio.io/docs/s3cmd-with-minio) +- [The Minio documentation website](https://docs.minio.io) diff --git a/docs/select/select.py b/docs/select/select.py new file mode 100644 index 000000000..3c9a5e856 --- /dev/null +++ b/docs/select/select.py @@ -0,0 +1,33 @@ +#!/usr/bin/env/env python3 +import boto3 + +s3 = boto3.client('s3', + endpoint_url='http://localhost:9000', + aws_access_key_id='minio', + aws_secret_access_key='minio123', + region_name='us-east-1') + +r = s3.select_object_content( + Bucket='mycsvbucket', + Key='sampledata/TotalPopulation.csv.gz', + ExpressionType='SQL', + Expression="select * from s3object s where s.Location like '%United States%'", + InputSerialization={ + 'CSV': { + "FileHeaderInfo": "USE", + }, + 'CompressionType': 'GZIP', + }, + OutputSerialization={'CSV': {}}, +) + +for event in r['Payload']: + if 'Records' in event: + records = event['Records']['Payload'].decode('utf-8') + print(records) + elif 'Stats' in event: + statsDetails = event['Stats']['Details'] + print("Stats details bytesScanned: ") + print(statsDetails['BytesScanned']) + print("Stats details bytesProcessed: ") + print(statsDetails['BytesProcessed']) diff --git a/pkg/s3select/helpers.go b/pkg/s3select/helpers.go index 6a58be133..f832349f9 100644 --- a/pkg/s3select/helpers.go +++ b/pkg/s3select/helpers.go @@ -25,6 +25,9 @@ import ( "github.com/xwb1989/sqlparser" ) +// MaxExpressionLength - 256KiB +const MaxExpressionLength = 256 * 1024 + // This function processes size so that we can calculate bytes BytesProcessed. func processSize(myrecord []string) int64 { if len(myrecord) > 0 {