Skip to content

Commit

Permalink
Merge pull request #2137 from mabel-dev/#2129
Browse files (browse the repository at this point in the history)
  • Loading branch information
joocer authored Dec 21, 2024
2 parents 77c7955 + 1fa522e commit 7c9df82
Show file tree
Hide file tree
Showing 5 changed files with 19 additions and 13 deletions.
2 changes: 1 addition & 1 deletion opteryx/__version__.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
__build__ = 905
__build__ = 906

# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
Expand Down
5 changes: 3 additions & 2 deletions opteryx/exceptions.py
Original file line number Diff line number Diff line change
Expand Up @@ -248,9 +248,10 @@ def __init__(self, dataset: str):
class UnexpectedDatasetReferenceError(SqlError):
"""Exception raised for unexpected dataset references."""

def __init__(self, dataset: str):
def __init__(self, dataset: str, message: Optional[str] = None):
self.dataset = dataset
message = f"Dataset '{dataset}' is referenced in query but it doesn't appear in a FROM or JOIN clause."
if not message:
message = f"Dataset '{dataset}' is referenced in query but it doesn't appear in a FROM or JOIN clause."
super().__init__(message)


Expand Down
13 changes: 6 additions & 7 deletions opteryx/operators/outer_join_node.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,11 +23,13 @@
popular SEMI and ANTI joins we leave to PyArrow for now.
"""

from collections import deque
from typing import List

import pyarrow

from opteryx import EOS
from opteryx.compiled.structures import HashSet
from opteryx.compiled.structures import HashTable
from opteryx.models import QueryProperties
from opteryx.utils.arrow import align_tables
Expand All @@ -52,8 +54,6 @@ def left_join(left_relation, right_relation, left_columns: List[str], right_colu
Returns:
A pyarrow.Table containing the result of the LEFT JOIN operation.
"""
from collections import deque

from opteryx.compiled.structures.hash_table import hash_join_map

left_indexes: deque = deque()
Expand Down Expand Up @@ -190,16 +190,15 @@ def left_anti_join(
Returns:
A pyarrow.Table containing the result of the LEFT ANTI JOIN operation.
"""
hash_table = HashTable()
non_null_right_values = right_relation.select(right_columns).itercolumns()
for i, value_tuple in enumerate(zip(*non_null_right_values)):
hash_table.insert(hash(value_tuple), i)
right_hash_set = set(zip(*non_null_right_values))

left_indexes = []
left_values = left_relation.select(left_columns).itercolumns()
for i, value_tuple in enumerate(zip(*left_values)):
rows = hash_table.get(hash(value_tuple))
if not rows: # Only include left rows that have no match in the right table
if (
value_tuple not in right_hash_set
): # Only include left rows that have no match in the right table
left_indexes.append(i)

# Filter the left_chunk based on the anti join condition
Expand Down
8 changes: 7 additions & 1 deletion opteryx/planner/binder/binder.py
Original file line number Diff line number Diff line change
Expand Up @@ -138,7 +138,13 @@ def create_variable_node(node: Node, context: BindingContext) -> Node:

# if there are no candidates, we probably don't know the relation
if not candidate_schemas:
raise UnexpectedDatasetReferenceError(dataset=node.source)
if node.source in context.relations:
raise UnexpectedDatasetReferenceError(
dataset=node.source,
message=f"Dataset `{node.source}` is not available after being used on the right side of a ANTI or SEMI JOIN",
)
else:
raise UnexpectedDatasetReferenceError(dataset=node.source)

# look up the column in the candidate schemas
column, found_source_relation = locate_identifier_in_loaded_schemas(
Expand Down
4 changes: 2 additions & 2 deletions tests/sql_battery/test_shapes_and_errors_battery.py
Original file line number Diff line number Diff line change
Expand Up @@ -1535,8 +1535,8 @@
("SELECT * FROM $planets AS P LEFT ANTI JOIN $satellites AS S ON S.id = P.id WHERE P.id > 5;", 0, 20, None),
("SELECT * FROM $planets AS P LEFT ANTI JOIN (SELECT id FROM $satellites WHERE name LIKE 'Moon%') AS S ON S.id = P.id;", 8, 20, None),
("SELECT * FROM GENERATE_SERIES(1, 10) AS C LEFT ANTI JOIN $satellites AS S ON S.id = C;", 0, 1, None),
# ("SELECT * FROM $planets AS P LEFT ANTI JOIN $satellites AS S ON S.id = P.id WHERE S.size > 1000;", 0, 20, None),
# ("SELECT * FROM $planets AS P LEFT SEMI JOIN $satellites AS S ON S.id = P.id WHERE S.name LIKE 'Moon%';", 0, 20, None),
("SELECT * FROM $planets AS P LEFT SEMI JOIN $satellites AS S ON S.id = P.id WHERE P.name LIKE 'E%';", 1, 20, None),
("SELECT * FROM $planets AS P LEFT SEMI JOIN $satellites AS S ON S.id = P.id WHERE S.name LIKE 'E%';", 1, 20, UnexpectedDatasetReferenceError),
("SELECT * FROM $planets AS P LEFT SEMI JOIN (SELECT id FROM $satellites WHERE name != 'Moon') AS S ON S.id = P.id;", 8, 20, None),
("SELECT * FROM $planets AS P LEFT SEMI JOIN $satellites AS S ON S.id = P.id WHERE P.name != 'Earth';", 8, 20, None),
("SELECT * FROM GENERATE_SERIES(1, 10) AS G LEFT SEMI JOIN $satellites AS S ON S.id = G;", 10, 1, None),
Expand Down

0 comments on commit 7c9df82

Please sign in to comment.