
Commit

Merge branch 'main' of https://github.com/acvictor/velox into acvictor/updateFileHandleGen
acvictor committed Apr 30, 2024
2 parents b092615 + 85fd5c6 commit 027baf6
Showing 24 changed files with 268 additions and 72 deletions.
3 changes: 3 additions & 0 deletions CMake/resolve_dependency_modules/gtest.cmake
@@ -29,3 +29,6 @@ FetchContent_Declare(
URL_HASH ${VELOX_GTEST_BUILD_SHA256_CHECKSUM})

FetchContent_MakeAvailable(gtest)

# Mask compilation warning in clang 16.
target_compile_options(gtest PRIVATE -Wno-implicit-int-float-conversion)
2 changes: 1 addition & 1 deletion scripts/setup-adapters.sh
@@ -144,7 +144,7 @@ if [[ "$OSTYPE" == "linux-gnu"* ]]; then
# /etc/os-release is a standard way to query various distribution
# information and is available everywhere
LINUX_DISTRIBUTION=$(. /etc/os-release && echo ${ID})
if [[ "$LINUX_DISTRIBUTION" == "ubuntu" ]]; then
if [[ "$LINUX_DISTRIBUTION" == "ubuntu" || "$LINUX_DISTRIBUTION" == "debian" ]]; then
apt install -y --no-install-recommends libxml2-dev libgsasl7-dev uuid-dev
# Dependencies of GCS, probably a workaround until the docker image is rebuilt
apt install -y --no-install-recommends libc-ares-dev libcurl4-openssl-dev
14 changes: 14 additions & 0 deletions scripts/setup-ubuntu.sh
@@ -40,6 +40,7 @@ FMT_VERSION=10.1.1
BOOST_VERSION=boost-1.84.0
NPROC=$(getconf _NPROCESSORS_ONLN)
DEPENDENCY_DIR=${DEPENDENCY_DIR:-$(pwd)}
BUILD_DUCKDB="${BUILD_DUCKDB:-true}"
export CMAKE_BUILD_TYPE=Release
SUDO="${SUDO:-"sudo --preserve-env"}"

@@ -53,6 +54,7 @@ function install_build_prerequisites {
build-essential \
cmake \
ccache \
curl \
ninja-build \
checkinstall \
git \
@@ -141,6 +143,17 @@ function install_conda {
bash Miniconda3-latest-Linux-$ARCH.sh -b -p $MINICONDA_PATH
}

function install_duckdb {
if $BUILD_DUCKDB ; then
echo 'Building DuckDB'
wget_and_untar https://github.com/duckdb/duckdb/archive/refs/tags/v0.8.1.tar.gz duckdb
(
cd duckdb
cmake_install -DBUILD_UNITTESTS=OFF -DENABLE_SANITIZER=OFF -DENABLE_UBSAN=OFF -DBUILD_SHELL=OFF -DEXPORT_DLL_SYMBOLS=OFF -DCMAKE_BUILD_TYPE=Release
)
fi
}

function install_cuda {
# See https://developer.nvidia.com/cuda-downloads
if ! dpkg -l cuda-keyring 1>/dev/null; then
@@ -162,6 +175,7 @@ function install_velox_deps {
run_and_time install_mvfst
run_and_time install_fbthrift
run_and_time install_conda
run_and_time install_duckdb
}

function install_apt_deps {
44 changes: 37 additions & 7 deletions velox/dwio/parquet/reader/ParquetReader.cpp
@@ -235,6 +235,8 @@ std::unique_ptr<ParquetTypeWithId> ReaderBase::getParquetColumnInfo(
auto& schema = fileMetaData_->schema;
uint32_t curSchemaIdx = schemaIdx;
auto& schemaElement = schema[curSchemaIdx];
bool isRepeated = false;
bool isOptional = false;

if (schemaElement.__isset.repetition_type) {
if (schemaElement.repetition_type !=
@@ -244,6 +246,11 @@ std::unique_ptr<ParquetTypeWithId> ReaderBase::getParquetColumnInfo(
if (schemaElement.repetition_type ==
thrift::FieldRepetitionType::REPEATED) {
maxRepeat++;
isRepeated = true;
}
if (schemaElement.repetition_type ==
thrift::FieldRepetitionType::OPTIONAL) {
isOptional = true;
}
}

@@ -300,7 +307,9 @@ std::unique_ptr<ParquetTypeWithId> ReaderBase::getParquetColumnInfo(
std::nullopt,
std::nullopt,
maxRepeat,
maxDefine);
maxDefine,
isOptional,
isRepeated);
}

// For backward-compatibility, a group annotated with MAP_KEY_VALUE
@@ -313,6 +322,12 @@ std::unique_ptr<ParquetTypeWithId> ReaderBase::getParquetColumnInfo(
VELOX_CHECK_EQ(children.size(), 1);
const auto& child = children[0];
auto type = child->type();
isRepeated = true;
// The Parquet schema does not mark this level as repeated: for LIST
// types the "repeated" flag sits on the intermediate child group that
// we skip over here, so set isRepeated explicitly. See
// https://github.com/apache/parquet-format/blob/master/LogicalTypes.md#lists
return std::make_unique<ParquetTypeWithId>(
std::move(type),
std::move(*(ParquetTypeWithId*)child.get()).moveChildren(),
@@ -323,7 +338,9 @@ std::unique_ptr<ParquetTypeWithId> ReaderBase::getParquetColumnInfo(
std::nullopt,
std::nullopt,
maxRepeat + 1,
maxDefine);
maxDefine,
isOptional,
isRepeated);
}

default:
@@ -354,7 +371,9 @@ std::unique_ptr<ParquetTypeWithId> ReaderBase::getParquetColumnInfo(
std::nullopt,
std::nullopt,
maxRepeat,
maxDefine);
maxDefine,
isOptional,
isRepeated);
} else if (
schema[parentSchemaIdx].converted_type ==
thrift::ConvertedType::MAP ||
@@ -374,7 +393,9 @@ std::unique_ptr<ParquetTypeWithId> ReaderBase::getParquetColumnInfo(
std::nullopt,
std::nullopt,
maxRepeat,
maxDefine);
maxDefine,
isOptional,
isRepeated);
}
} else {
// Row type
@@ -389,7 +410,9 @@ std::unique_ptr<ParquetTypeWithId> ReaderBase::getParquetColumnInfo(
std::nullopt,
std::nullopt,
maxRepeat,
maxDefine);
maxDefine,
isOptional,
isRepeated);
}
}
} else { // leaf node
@@ -415,6 +438,8 @@ std::unique_ptr<ParquetTypeWithId> ReaderBase::getParquetColumnInfo(
logicalType_,
maxRepeat,
maxDefine,
isOptional,
isRepeated,
precision,
scale,
type_length);
@@ -430,12 +455,14 @@ std::unique_ptr<ParquetTypeWithId> ReaderBase::getParquetColumnInfo(
std::move(children),
curSchemaIdx,
maxSchemaElementIdx,
columnIdx++,
columnIdx - 1, // was already incremented for leafTypePtr
std::move(name),
std::nullopt,
std::nullopt,
maxRepeat,
maxDefine - 1);
maxDefine - 1,
isOptional,
isRepeated);
}
return leafTypePtr;
}
@@ -631,6 +658,9 @@ int64_t ReaderBase::rowGroupUncompressedSize(
int32_t rowGroupIndex,
const dwio::common::TypeWithId& type) const {
if (type.column() != ParquetTypeWithId::kNonLeaf) {
VELOX_CHECK_LT(rowGroupIndex, fileMetaData_->row_groups.size());
VELOX_CHECK_LT(
type.column(), fileMetaData_->row_groups[rowGroupIndex].columns.size());
return fileMetaData_->row_groups[rowGroupIndex]
.columns[type.column()]
.meta_data.total_uncompressed_size;
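
The getParquetColumnInfo changes above record two new flags, isOptional and isRepeated, on every ParquetTypeWithId node as the Thrift schema is walked, alongside the existing maxRepeat/maxDefine counters. As a rough orientation only, here is a self-contained sketch, with hypothetical types rather than the Velox reader's, of how a schema element's repetition type relates to those flags and counters for the standard three-level LIST layout:

#include <cassert>
#include <cstdint>

// Hypothetical stand-in for thrift::FieldRepetitionType.
enum class FieldRepetitionType { REQUIRED, OPTIONAL, REPEATED };

struct LevelState {
  uint32_t maxRepeat = 0;
  uint32_t maxDefine = 0;
  bool isOptional = false;
  bool isRepeated = false;
};

// Dremel-style bookkeeping: REPEATED raises both the repetition and the
// definition level, OPTIONAL raises only the definition level, REQUIRED
// raises neither. The two flags record the node's own repetition type.
LevelState visit(const LevelState& parent, FieldRepetitionType repetition) {
  LevelState node = parent;
  node.isRepeated = (repetition == FieldRepetitionType::REPEATED);
  node.isOptional = (repetition == FieldRepetitionType::OPTIONAL);
  if (node.isRepeated) {
    ++node.maxRepeat;
    ++node.maxDefine;
  } else if (node.isOptional) {
    ++node.maxDefine;
  }
  return node;
}

int main() {
  // optional group my_list (LIST) { repeated group list { optional int32 element; } }
  LevelState root;
  LevelState listGroup = visit(root, FieldRepetitionType::OPTIONAL);
  LevelState repeatedGroup = visit(listGroup, FieldRepetitionType::REPEATED);
  LevelState element = visit(repeatedGroup, FieldRepetitionType::OPTIONAL);
  assert(element.maxRepeat == 1);
  assert(element.maxDefine == 3);
  return 0;
}

The counts for the leaf (maxRepeat = 1, maxDefine = 3) match what the Parquet LIST specification prescribes for an optional list with optional elements.
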
19 changes: 11 additions & 8 deletions velox/dwio/parquet/reader/ParquetTypeWithId.cpp
@@ -48,6 +48,8 @@ ParquetTypeWithId::moveChildren() && {
auto logicalType = parquetChild->logicalType_;
auto maxRepeat = parquetChild->maxRepeat_;
auto maxDefine = parquetChild->maxDefine_;
auto isOptional = parquetChild->isOptional_;
auto isRepeated = parquetChild->isRepeated_;
auto precision = parquetChild->precision_;
auto scale = parquetChild->scale_;
auto typeLength = parquetChild->typeLength_;
@@ -62,6 +64,8 @@ ParquetTypeWithId::moveChildren() && {
std::move(logicalType),
maxRepeat,
maxDefine,
isOptional,
isRepeated,
precision,
scale,
typeLength));
@@ -86,15 +90,14 @@ bool ParquetTypeWithId::hasNonRepeatedLeaf() const {
}

LevelMode ParquetTypeWithId::makeLevelInfo(LevelInfo& info) const {
int16_t repeatedAncestor = 0;
for (auto parent = parquetParent(); parent;
parent = parent->parquetParent()) {
if (parent->type()->kind() == TypeKind::ARRAY ||
parent->type()->kind() == TypeKind::MAP) {
repeatedAncestor = parent->maxDefine_;
break;
int repeatedAncestor = maxDefine_;
auto node = this;
do {
if (node->isOptional_) {
repeatedAncestor--;
}
}
node = node->parquetParent();
} while (node && !node->isRepeated_);
bool isList = type()->kind() == TypeKind::ARRAY;
bool isStruct = type()->kind() == TypeKind::ROW;
bool isMap = type()->kind() == TypeKind::MAP;
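
The reworked makeLevelInfo above no longer searches for an ARRAY or MAP parent; it starts from the node's own maxDefine_ and subtracts one for each OPTIONAL node while walking up, stopping at the nearest REPEATED ancestor. A minimal standalone sketch of that walk, using a hypothetical Node struct rather than ParquetTypeWithId, checked against the same three-level LIST layout as the previous example:

#include <cassert>
#include <cstdint>

// Hypothetical node; in Velox these fields live on ParquetTypeWithId.
struct Node {
  const Node* parent = nullptr;
  bool isOptional = false;
  bool isRepeated = false;
  int32_t maxDefine = 0;
};

// Start at the node's own max definition level and subtract one for every
// OPTIONAL node encountered on the way up; the walk ends once the nearest
// REPEATED ancestor is reached (that ancestor itself is not processed).
int32_t repeatedAncestorDefine(const Node& start) {
  int32_t level = start.maxDefine;
  const Node* node = &start;
  do {
    if (node->isOptional) {
      --level;
    }
    node = node->parent;
  } while (node != nullptr && !node->isRepeated);
  return level;
}

int main() {
  // optional group my_list (LIST) > repeated group list > optional int32 element
  Node listGroup{nullptr, /*isOptional=*/true, /*isRepeated=*/false, /*maxDefine=*/1};
  Node repeatedGroup{&listGroup, /*isOptional=*/false, /*isRepeated=*/true, /*maxDefine=*/2};
  Node element{&repeatedGroup, /*isOptional=*/true, /*isRepeated=*/false, /*maxDefine=*/3};
  // Values whose definition level falls below this belong to a null or empty
  // ancestor list (Arrow-style repeated_ancestor_def_level semantics).
  assert(repeatedAncestorDefine(element) == 2);
  return 0;
}
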
6 changes: 6 additions & 0 deletions velox/dwio/parquet/reader/ParquetTypeWithId.h
@@ -45,6 +45,8 @@ class ParquetTypeWithId : public dwio::common::TypeWithId {
std::optional<thrift::LogicalType> logicalType,
uint32_t maxRepeat,
uint32_t maxDefine,
bool isOptional,
bool isRepeated,
int32_t precision = 0,
int32_t scale = 0,
int32_t typeLength = 0)
@@ -54,6 +56,8 @@ class ParquetTypeWithId : public dwio::common::TypeWithId {
logicalType_(std::move(logicalType)),
maxRepeat_(maxRepeat),
maxDefine_(maxDefine),
isOptional_(isOptional),
isRepeated_(isRepeated),
precision_(precision),
scale_(scale),
typeLength_(typeLength) {}
@@ -81,6 +85,8 @@ class ParquetTypeWithId : public dwio::common::TypeWithId {
const std::optional<thrift::LogicalType> logicalType_;
const uint32_t maxRepeat_;
const uint32_t maxDefine_;
const bool isOptional_;
const bool isRepeated_;
const int32_t precision_;
const int32_t scale_;
const int32_t typeLength_;
Binary file not shown.
56 changes: 41 additions & 15 deletions velox/dwio/parquet/tests/reader/ParquetTableScanTest.cpp
@@ -111,6 +111,19 @@ class ParquetTableScanTest : public HiveConnectorTestBase {
createDuckDbTable({data});
}

void loadDataWithRowType(const std::string& filePath, RowVectorPtr data) {
splits_ = {makeSplit(filePath)};
auto pool = facebook::velox::memory::memoryManager()->addLeafPool();
dwio::common::ReaderOptions readerOpts{pool.get()};
auto reader = std::make_unique<ParquetReader>(
std::make_unique<facebook::velox::dwio::common::BufferedInput>(
std::make_shared<LocalReadFile>(filePath),
readerOpts.getMemoryPool()),
readerOpts);
rowType_ = reader->rowType();
createDuckDbTable({data});
}

std::string getExampleFilePath(const std::string& fileName) {
return facebook::velox::test::getDataFilePath(
"velox/dwio/parquet/tests/reader", "../examples/" + fileName);
@@ -303,9 +316,8 @@ TEST_F(ParquetTableScanTest, singleRowStruct) {
}

// Core dump and incorrect result are fixed.
TEST_F(ParquetTableScanTest, DISABLED_array) {
auto vector = makeArrayVector<int32_t>({{1, 2, 3}});

TEST_F(ParquetTableScanTest, array) {
auto vector = makeArrayVector<int32_t>({});
loadData(
getExampleFilePath("old_repeated_int.parquet"),
ROW({"repeatedInt"}, {ARRAY(INTEGER())}),
@@ -316,12 +328,11 @@ TEST_F(ParquetTableScanTest, DISABLED_array) {
}));

assertSelectWithFilter(
{"repeatedInt"}, {}, "", "SELECT repeatedInt FROM tmp");
{"repeatedInt"}, {}, "", "SELECT UNNEST(array[array[1,2,3]])");
}

// Optional array with required elements.
// Incorrect result.
TEST_F(ParquetTableScanTest, DISABLED_optArrayReqEle) {
TEST_F(ParquetTableScanTest, optArrayReqEle) {
auto vector = makeArrayVector<StringView>({});

loadData(
@@ -341,8 +352,7 @@ TEST_F(ParquetTableScanTest, DISABLED_optArrayReqEle) {
}

// Required array with required elements.
// Core dump is fixed, but the result is incorrect.
TEST_F(ParquetTableScanTest, DISABLED_reqArrayReqEle) {
TEST_F(ParquetTableScanTest, reqArrayReqEle) {
auto vector = makeArrayVector<StringView>({});

loadData(
@@ -362,8 +372,7 @@ TEST_F(ParquetTableScanTest, DISABLED_reqArrayReqEle) {
}

// Required array with optional elements.
// Incorrect result.
TEST_F(ParquetTableScanTest, DISABLED_reqArrayOptEle) {
TEST_F(ParquetTableScanTest, reqArrayOptEle) {
auto vector = makeArrayVector<StringView>({});

loadData(
@@ -382,22 +391,39 @@ TEST_F(ParquetTableScanTest, DISABLED_reqArrayOptEle) {
"SELECT UNNEST(array[array['a', null], array[], array[null, 'b']])");
}

TEST_F(ParquetTableScanTest, arrayOfArrayTest) {
auto vector = makeArrayVector<StringView>({});

loadDataWithRowType(
getExampleFilePath("array_of_array1.parquet"),
makeRowVector(
{"_1"},
{
vector,
}));

assertSelectWithFilter(
{"_1"},
{},
"",
"SELECT UNNEST(array[null, array[array['g', 'h'], null]])");
}

// Required array with legacy format.
// Incorrect result.
TEST_F(ParquetTableScanTest, DISABLED_reqArrayLegacy) {
TEST_F(ParquetTableScanTest, reqArrayLegacy) {
auto vector = makeArrayVector<StringView>({});

loadData(
getExampleFilePath("array_3.parquet"),
ROW({"_1"}, {ARRAY(VARCHAR())}),
ROW({"element"}, {ARRAY(VARCHAR())}),
makeRowVector(
{"_1"},
{"element"},
{
vector,
}));

assertSelectWithFilter(
{"_1"},
{"element"},
{},
"",
"SELECT UNNEST(array[array['a', 'b'], array[], array['c', 'd']])");
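
The re-enabled list tests above ultimately exercise how repetition levels delimit list boundaries during decoding. As a toy illustration only, and not the Velox decode path, the following sketch reassembles the single row [1, 2, 3] that old_repeated_int.parquet's repeatedInt column is assumed to contain:

#include <cassert>
#include <cstddef>
#include <cstdint>
#include <vector>

// Toy decoder for a single, non-nested repeated column: a repetition level of
// 0 starts a new top-level row, any higher level appends to the current row.
std::vector<std::vector<int32_t>> assembleLists(
    const std::vector<int16_t>& repLevels,
    const std::vector<int32_t>& values) {
  std::vector<std::vector<int32_t>> rows;
  for (std::size_t i = 0; i < values.size(); ++i) {
    if (repLevels[i] == 0) {
      rows.emplace_back();
    }
    rows.back().push_back(values[i]);
  }
  return rows;
}

int main() {
  // Assumed Dremel encoding of one row holding [1, 2, 3].
  std::vector<int16_t> repLevels = {0, 1, 1};
  std::vector<int32_t> values = {1, 2, 3};
  auto rows = assembleLists(repLevels, values);
  assert(rows.size() == 1);
  assert((rows[0] == std::vector<int32_t>{1, 2, 3}));
  return 0;
}
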
4 changes: 2 additions & 2 deletions velox/dwio/parquet/writer/arrow/tests/BloomFilterTest.cpp
@@ -29,8 +29,8 @@
#include "arrow/buffer.h"
#include "arrow/io/file.h"
#include "arrow/status.h"
#include "arrow/testing/gtest_util.h" // @manual=fbsource//third-party/apache-arrow:arrow_testing
#include "arrow/testing/random.h" // @manual=fbsource//third-party/apache-arrow:arrow_testing
#include "arrow/testing/gtest_util.h"
#include "arrow/testing/random.h"

#include "velox/dwio/parquet/writer/arrow/Exception.h"
#include "velox/dwio/parquet/writer/arrow/Platform.h"
2 changes: 1 addition & 1 deletion velox/dwio/parquet/writer/arrow/tests/ColumnWriterTest.cpp
@@ -22,7 +22,7 @@
#include <gtest/gtest.h>

#include "arrow/io/buffered.h"
#include "arrow/testing/gtest_util.h" // @manual=fbsource//third-party/apache-arrow:arrow_testing
#include "arrow/testing/gtest_util.h"
#include "arrow/util/bit_util.h"
#include "arrow/util/bitmap_builders.h"

(file name not shown)
@@ -36,7 +36,7 @@
#include "velox/dwio/parquet/writer/arrow/tests/TestUtil.h"

#include "arrow/io/memory.h"
#include "arrow/testing/gtest_util.h" // @manual=fbsource//third-party/apache-arrow:arrow_testing
#include "arrow/testing/gtest_util.h"
#include "arrow/util/compression.h"
#include "arrow/util/crc32.h"
