|
|
@ -19,7 +19,9 @@ package parquet |
|
|
|
import ( |
|
|
|
import ( |
|
|
|
"bytes" |
|
|
|
"bytes" |
|
|
|
"context" |
|
|
|
"context" |
|
|
|
|
|
|
|
"errors" |
|
|
|
"fmt" |
|
|
|
"fmt" |
|
|
|
|
|
|
|
"math" |
|
|
|
"strings" |
|
|
|
"strings" |
|
|
|
|
|
|
|
|
|
|
|
"git.apache.org/thrift.git/lib/go/thrift" |
|
|
|
"git.apache.org/thrift.git/lib/go/thrift" |
|
|
@ -101,6 +103,9 @@ func readPage( |
|
|
|
var repLevelsBuf, defLevelsBuf []byte |
|
|
|
var repLevelsBuf, defLevelsBuf []byte |
|
|
|
|
|
|
|
|
|
|
|
if pageHeader.GetType() == parquet.PageType_DATA_PAGE_V2 { |
|
|
|
if pageHeader.GetType() == parquet.PageType_DATA_PAGE_V2 { |
|
|
|
|
|
|
|
if pageHeader.DataPageHeaderV2 == nil { |
|
|
|
|
|
|
|
return nil, errors.New("parquet: Header not set") |
|
|
|
|
|
|
|
} |
|
|
|
repLevelsLen = pageHeader.DataPageHeaderV2.GetRepetitionLevelsByteLength() |
|
|
|
repLevelsLen = pageHeader.DataPageHeaderV2.GetRepetitionLevelsByteLength() |
|
|
|
repLevelsBuf = make([]byte, repLevelsLen) |
|
|
|
repLevelsBuf = make([]byte, repLevelsLen) |
|
|
|
if _, err = thriftReader.Read(repLevelsBuf); err != nil { |
|
|
|
if _, err = thriftReader.Read(repLevelsBuf); err != nil { |
|
|
@ -113,8 +118,11 @@ func readPage( |
|
|
|
return nil, err |
|
|
|
return nil, err |
|
|
|
} |
|
|
|
} |
|
|
|
} |
|
|
|
} |
|
|
|
|
|
|
|
dbLen := pageHeader.GetCompressedPageSize() - repLevelsLen - defLevelsLen |
|
|
|
dataBuf := make([]byte, pageHeader.GetCompressedPageSize()-repLevelsLen-defLevelsLen) |
|
|
|
if dbLen < 0 { |
|
|
|
|
|
|
|
return nil, errors.New("parquet: negative data length") |
|
|
|
|
|
|
|
} |
|
|
|
|
|
|
|
dataBuf := make([]byte, dbLen) |
|
|
|
if _, err = thriftReader.Read(dataBuf); err != nil { |
|
|
|
if _, err = thriftReader.Read(dataBuf); err != nil { |
|
|
|
return nil, err |
|
|
|
return nil, err |
|
|
|
} |
|
|
|
} |
|
|
@ -146,7 +154,9 @@ func readPage( |
|
|
|
if err != nil { |
|
|
|
if err != nil { |
|
|
|
return nil, 0, 0, err |
|
|
|
return nil, 0, 0, err |
|
|
|
} |
|
|
|
} |
|
|
|
|
|
|
|
if metadata == nil { |
|
|
|
|
|
|
|
return nil, 0, 0, errors.New("parquet: metadata not set") |
|
|
|
|
|
|
|
} |
|
|
|
path := append([]string{}, metadata.GetPathInSchema()...) |
|
|
|
path := append([]string{}, metadata.GetPathInSchema()...) |
|
|
|
|
|
|
|
|
|
|
|
bytesReader := bytes.NewReader(buf) |
|
|
|
bytesReader := bytes.NewReader(buf) |
|
|
@ -160,6 +170,9 @@ func readPage( |
|
|
|
page.Header = pageHeader |
|
|
|
page.Header = pageHeader |
|
|
|
table := new(table) |
|
|
|
table := new(table) |
|
|
|
table.Path = path |
|
|
|
table.Path = path |
|
|
|
|
|
|
|
if pageHeader.DictionaryPageHeader == nil { |
|
|
|
|
|
|
|
return nil, 0, 0, errors.New("parquet: dictionary not set") |
|
|
|
|
|
|
|
} |
|
|
|
values, err := readValues(bytesReader, metadata.GetType(), |
|
|
|
values, err := readValues(bytesReader, metadata.GetType(), |
|
|
|
uint64(pageHeader.DictionaryPageHeader.GetNumValues()), 0) |
|
|
|
uint64(pageHeader.DictionaryPageHeader.GetNumValues()), 0) |
|
|
|
if err != nil { |
|
|
|
if err != nil { |
|
|
@ -183,9 +196,15 @@ func readPage( |
|
|
|
var encodingType parquet.Encoding |
|
|
|
var encodingType parquet.Encoding |
|
|
|
|
|
|
|
|
|
|
|
if pageHeader.GetType() == parquet.PageType_DATA_PAGE { |
|
|
|
if pageHeader.GetType() == parquet.PageType_DATA_PAGE { |
|
|
|
|
|
|
|
if pageHeader.DataPageHeader == nil { |
|
|
|
|
|
|
|
return nil, 0, 0, errors.New("parquet: Header not set") |
|
|
|
|
|
|
|
} |
|
|
|
numValues = uint64(pageHeader.DataPageHeader.GetNumValues()) |
|
|
|
numValues = uint64(pageHeader.DataPageHeader.GetNumValues()) |
|
|
|
encodingType = pageHeader.DataPageHeader.GetEncoding() |
|
|
|
encodingType = pageHeader.DataPageHeader.GetEncoding() |
|
|
|
} else { |
|
|
|
} else { |
|
|
|
|
|
|
|
if pageHeader.DataPageHeaderV2 == nil { |
|
|
|
|
|
|
|
return nil, 0, 0, errors.New("parquet: Header not set") |
|
|
|
|
|
|
|
} |
|
|
|
numValues = uint64(pageHeader.DataPageHeaderV2.GetNumValues()) |
|
|
|
numValues = uint64(pageHeader.DataPageHeaderV2.GetNumValues()) |
|
|
|
encodingType = pageHeader.DataPageHeaderV2.GetEncoding() |
|
|
|
encodingType = pageHeader.DataPageHeaderV2.GetEncoding() |
|
|
|
} |
|
|
|
} |
|
|
@ -198,10 +217,13 @@ func readPage( |
|
|
|
return nil, 0, 0, err |
|
|
|
return nil, 0, 0, err |
|
|
|
} |
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
if repetitionLevels = values.([]int64); uint64(len(repetitionLevels)) > numValues { |
|
|
|
if repetitionLevels = values.([]int64); len(repetitionLevels) > int(numValues) && int(numValues) >= 0 { |
|
|
|
repetitionLevels = repetitionLevels[:numValues] |
|
|
|
repetitionLevels = repetitionLevels[:numValues] |
|
|
|
} |
|
|
|
} |
|
|
|
} else { |
|
|
|
} else { |
|
|
|
|
|
|
|
if numValues > math.MaxInt64/8 { |
|
|
|
|
|
|
|
return nil, 0, 0, errors.New("parquet: numvalues too large") |
|
|
|
|
|
|
|
} |
|
|
|
repetitionLevels = make([]int64, numValues) |
|
|
|
repetitionLevels = make([]int64, numValues) |
|
|
|
} |
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
@ -212,10 +234,16 @@ func readPage( |
|
|
|
if err != nil { |
|
|
|
if err != nil { |
|
|
|
return nil, 0, 0, err |
|
|
|
return nil, 0, 0, err |
|
|
|
} |
|
|
|
} |
|
|
|
if definitionLevels = values.([]int64); uint64(len(definitionLevels)) > numValues { |
|
|
|
if numValues > math.MaxInt64/8 { |
|
|
|
|
|
|
|
return nil, 0, 0, errors.New("parquet: numvalues too large") |
|
|
|
|
|
|
|
} |
|
|
|
|
|
|
|
if definitionLevels = values.([]int64); len(definitionLevels) > int(numValues) { |
|
|
|
definitionLevels = definitionLevels[:numValues] |
|
|
|
definitionLevels = definitionLevels[:numValues] |
|
|
|
} |
|
|
|
} |
|
|
|
} else { |
|
|
|
} else { |
|
|
|
|
|
|
|
if numValues > math.MaxInt64/8 { |
|
|
|
|
|
|
|
return nil, 0, 0, errors.New("parquet: numvalues too large") |
|
|
|
|
|
|
|
} |
|
|
|
definitionLevels = make([]int64, numValues) |
|
|
|
definitionLevels = make([]int64, numValues) |
|
|
|
} |
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
@ -308,7 +336,10 @@ func (page *page) decode(dictPage *page) { |
|
|
|
|
|
|
|
|
|
|
|
for i := 0; i < len(page.DataTable.Values); i++ { |
|
|
|
for i := 0; i < len(page.DataTable.Values); i++ { |
|
|
|
if page.DataTable.Values[i] != nil { |
|
|
|
if page.DataTable.Values[i] != nil { |
|
|
|
index := page.DataTable.Values[i].(int64) |
|
|
|
index, ok := page.DataTable.Values[i].(int64) |
|
|
|
|
|
|
|
if !ok || int(index) >= len(dictPage.DataTable.Values) { |
|
|
|
|
|
|
|
return |
|
|
|
|
|
|
|
} |
|
|
|
page.DataTable.Values[i] = dictPage.DataTable.Values[index] |
|
|
|
page.DataTable.Values[i] = dictPage.DataTable.Values[index] |
|
|
|
} |
|
|
|
} |
|
|
|
} |
|
|
|
} |
|
|
@ -324,7 +355,9 @@ func (page *page) getRLDLFromRawData(columnNameIndexMap map[string]int, schemaEl |
|
|
|
if pageType == parquet.PageType_DATA_PAGE_V2 { |
|
|
|
if pageType == parquet.PageType_DATA_PAGE_V2 { |
|
|
|
var repLevelsLen, defLevelsLen int32 |
|
|
|
var repLevelsLen, defLevelsLen int32 |
|
|
|
var repLevelsBuf, defLevelsBuf []byte |
|
|
|
var repLevelsBuf, defLevelsBuf []byte |
|
|
|
|
|
|
|
if page.Header.DataPageHeaderV2 == nil { |
|
|
|
|
|
|
|
return 0, 0, errors.New("parquet: Header not set") |
|
|
|
|
|
|
|
} |
|
|
|
repLevelsLen = page.Header.DataPageHeaderV2.GetRepetitionLevelsByteLength() |
|
|
|
repLevelsLen = page.Header.DataPageHeaderV2.GetRepetitionLevelsByteLength() |
|
|
|
repLevelsBuf = make([]byte, repLevelsLen) |
|
|
|
repLevelsBuf = make([]byte, repLevelsLen) |
|
|
|
if _, err = bytesReader.Read(repLevelsBuf); err != nil { |
|
|
|
if _, err = bytesReader.Read(repLevelsBuf); err != nil { |
|
|
@ -375,8 +408,14 @@ func (page *page) getRLDLFromRawData(columnNameIndexMap map[string]int, schemaEl |
|
|
|
case parquet.PageType_DATA_PAGE, parquet.PageType_DATA_PAGE_V2: |
|
|
|
case parquet.PageType_DATA_PAGE, parquet.PageType_DATA_PAGE_V2: |
|
|
|
var numValues uint64 |
|
|
|
var numValues uint64 |
|
|
|
if pageType == parquet.PageType_DATA_PAGE { |
|
|
|
if pageType == parquet.PageType_DATA_PAGE { |
|
|
|
|
|
|
|
if page.Header.DataPageHeader == nil { |
|
|
|
|
|
|
|
return 0, 0, errors.New("parquet: Header not set") |
|
|
|
|
|
|
|
} |
|
|
|
numValues = uint64(page.Header.DataPageHeader.GetNumValues()) |
|
|
|
numValues = uint64(page.Header.DataPageHeader.GetNumValues()) |
|
|
|
} else { |
|
|
|
} else { |
|
|
|
|
|
|
|
if page.Header.DataPageHeaderV2 == nil { |
|
|
|
|
|
|
|
return 0, 0, errors.New("parquet: Header not set") |
|
|
|
|
|
|
|
} |
|
|
|
numValues = uint64(page.Header.DataPageHeaderV2.GetNumValues()) |
|
|
|
numValues = uint64(page.Header.DataPageHeaderV2.GetNumValues()) |
|
|
|
} |
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
@ -445,6 +484,9 @@ func (page *page) getValueFromRawData(columnNameIndexMap map[string]int, schemaE |
|
|
|
case parquet.PageType_DICTIONARY_PAGE: |
|
|
|
case parquet.PageType_DICTIONARY_PAGE: |
|
|
|
bytesReader := bytes.NewReader(page.RawData) |
|
|
|
bytesReader := bytes.NewReader(page.RawData) |
|
|
|
var values interface{} |
|
|
|
var values interface{} |
|
|
|
|
|
|
|
if page.Header.DictionaryPageHeader == nil { |
|
|
|
|
|
|
|
return errors.New("parquet: dictionary not set") |
|
|
|
|
|
|
|
} |
|
|
|
values, err = readValues(bytesReader, page.DataType, |
|
|
|
values, err = readValues(bytesReader, page.DataType, |
|
|
|
uint64(page.Header.DictionaryPageHeader.GetNumValues()), 0) |
|
|
|
uint64(page.Header.DictionaryPageHeader.GetNumValues()), 0) |
|
|
|
if err != nil { |
|
|
|
if err != nil { |
|
|
|