Skip to content

Commit

Permalink
Enable kLeftSemiFilter and kRightSemiFilter join type in SMJ (faceboo…
Browse files Browse the repository at this point in the history
…kincubator#9877)

Summary:
The kLeftSemiFilter and kRightSemiFilter  Join can leverage the logic of the Inner Join with two key modifications:

1. Even if there are duplicate records in the right/left table, the corresponding records from the left/right table should only appear once in the final result.
2. The final output should exclude all records from the right/left table. Since the rightProjections/leftProjections of the leftSemiFilter/rightSemiFilter are empty [here](https://github.com/facebookincubator/velox/blob/main/velox/exec/MergeJoin.cpp#L69), it will not copy the results from the right/left side into the final output [here](https://github.com/facebookincubator/velox/blob/main/velox/exec/MergeJoin.cpp#L270).

Pull Request resolved: facebookincubator#9877

Reviewed By: bikramSingh91

Differential Revision: D57917728

Pulled By: pedroerp

fbshipit-source-id: 6b0a6f72b52d7ad1b7e13c9ec80bd8c87293945a
  • Loading branch information
JkSelf authored and facebook-github-bot committed May 29, 2024
1 parent 5b52dca commit 4937148
Show file tree
Hide file tree
Showing 3 changed files with 72 additions and 3 deletions.
29 changes: 27 additions & 2 deletions velox/exec/MergeJoin.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -35,8 +35,10 @@ MergeJoin::MergeJoin(
numKeys_{joinNode->leftKeys().size()},
joinNode_(joinNode) {
VELOX_USER_CHECK(
joinNode_->isInnerJoin() || joinNode_->isLeftJoin(),
"Merge join supports only inner and left joins. Other join types are not supported yet.");
joinNode_->isInnerJoin() || joinNode_->isLeftJoin() ||
joinNode_->isLeftSemiFilterJoin() ||
joinNode_->isRightSemiFilterJoin(),
"Merge join supports only inner, left and left semi joins. Other join types are not supported yet.");
}

void MergeJoin::initialize() {
Expand Down Expand Up @@ -64,6 +66,12 @@ void MergeJoin::initialize() {
}
}

if (joinNode_->isRightSemiFilterJoin()) {
VELOX_USER_CHECK(
leftProjections_.empty(),
"The left side projections should be empty for right semi join");
}

for (auto i = 0; i < rightType->size(); ++i) {
auto name = rightType->nameOf(i);
auto outIndex = outputType_->getChildIdxIfExists(name);
Expand All @@ -72,6 +80,12 @@ void MergeJoin::initialize() {
}
}

if (joinNode_->isLeftSemiFilterJoin()) {
VELOX_USER_CHECK(
rightProjections_.empty(),
"The right side projections should be empty for left semi join");
}

if (joinNode_->filter()) {
initializeFilter(joinNode_->filter(), leftType, rightType);

Expand Down Expand Up @@ -383,6 +397,17 @@ bool MergeJoin::addToOutput() {
auto rightEnd =
r == numRights - 1 ? rightMatch_->endIndex : right->size();

// TODO: Since semi joins only require determining if there is at least
// one match on the other side, we could explore specialized algorithms
// or data structures that short-circuit the join process once a match
// is found.
if (isLeftSemiFilterJoin(joinType_) ||
isRightSemiFilterJoin(joinType_)) {
// LeftSemiFilter produce each row from the left at most once.
// RightSemiFilter produce each row from the right at most once.
rightEnd = rightStart + 1;
}

for (auto j = rightStart; j < rightEnd; ++j) {
if (outputSize_ == outputBatchSize_) {
leftMatch_->setCursor(l, i);
Expand Down
3 changes: 2 additions & 1 deletion velox/exec/fuzzer/JoinFuzzer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -963,7 +963,8 @@ void JoinFuzzer::makeAlternativePlans(
.planNode()});

// Use OrderBy + MergeJoin (if join type is inner or left).
if (joinNode->isInnerJoin() || joinNode->isLeftJoin()) {
if (joinNode->isInnerJoin() || joinNode->isLeftJoin() ||
joinNode->isLeftSemiFilterJoin() || joinNode->isRightSemiFilterJoin()) {
auto planWithSplits = makeMergeJoinPlan(
joinType, probeKeys, buildKeys, probeInput, buildInput, outputColumns);
plans.push_back(planWithSplits);
Expand Down
43 changes: 43 additions & 0 deletions velox/exec/tests/MergeJoinTest.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -498,6 +498,49 @@ TEST_F(MergeJoinTest, lazyVectors) {
"SELECT c0, rc0, c1, rc1, c2, c3 FROM t, u WHERE t.c0 = u.rc0 and c1 + rc1 < 30");
}

TEST_F(MergeJoinTest, semiJoin) {
auto left = makeRowVector(
{"t0"}, {makeNullableFlatVector<int64_t>({1, 2, 2, 6, std::nullopt})});

auto right = makeRowVector(
{"u0"},
{makeNullableFlatVector<int64_t>(
{1, 2, 2, 7, std::nullopt, std::nullopt})});

createDuckDbTable("t", {left});
createDuckDbTable("u", {right});

auto testSemiJoin = [&](const std::string& filter,
const std::string& sql,
const std::vector<std::string>& outputLayout,
core::JoinType joinType) {
auto planNodeIdGenerator = std::make_shared<core::PlanNodeIdGenerator>();
auto plan =
PlanBuilder(planNodeIdGenerator)
.values({left})
.mergeJoin(
{"t0"},
{"u0"},
PlanBuilder(planNodeIdGenerator).values({right}).planNode(),
filter,
outputLayout,
joinType)
.planNode();
AssertQueryBuilder(plan, duckDbQueryRunner_).assertResults(sql);
};

testSemiJoin(
"t0 >1",
"SELECT t0 FROM t where t0 IN (SELECT u0 from u) and t0 > 1",
{"t0"},
core::JoinType::kLeftSemiFilter);
testSemiJoin(
"u0 > 1",
"SELECT u0 FROM u where u0 IN (SELECT t0 from t) and u0 > 1",
{"u0"},
core::JoinType::kRightSemiFilter);
}

TEST_F(MergeJoinTest, nullKeys) {
auto left = makeRowVector(
{"t0"}, {makeNullableFlatVector<int64_t>({1, 2, 5, std::nullopt})});
Expand Down

0 comments on commit 4937148

Please sign in to comment.