From d0bc16d464c0ee93a31d3427fa80c3af6f6afa4e Mon Sep 17 00:00:00 2001 From: joocer Date: Sat, 21 Dec 2024 21:47:50 +0000 Subject: [PATCH 1/2] #2129 --- opteryx/exceptions.py | 5 +++-- opteryx/operators/outer_join_node.py | 13 ++++++------- opteryx/planner/binder/binder.py | 8 +++++++- tests/sql_battery/test_shapes_and_errors_battery.py | 4 ++-- 4 files changed, 18 insertions(+), 12 deletions(-) diff --git a/opteryx/exceptions.py b/opteryx/exceptions.py index ca4d9db3..19d51834 100644 --- a/opteryx/exceptions.py +++ b/opteryx/exceptions.py @@ -248,9 +248,10 @@ def __init__(self, dataset: str): class UnexpectedDatasetReferenceError(SqlError): """Exception raised for unexpected dataset references.""" - def __init__(self, dataset: str): + def __init__(self, dataset: str, message: Optional[str] = None): self.dataset = dataset - message = f"Dataset '{dataset}' is referenced in query but it doesn't appear in a FROM or JOIN clause." + if not message: + message = f"Dataset '{dataset}' is referenced in query but it doesn't appear in a FROM or JOIN clause." super().__init__(message) diff --git a/opteryx/operators/outer_join_node.py b/opteryx/operators/outer_join_node.py index dd95bfe6..78c1665a 100644 --- a/opteryx/operators/outer_join_node.py +++ b/opteryx/operators/outer_join_node.py @@ -23,11 +23,13 @@ popular SEMI and ANTI joins we leave to PyArrow for now. """ +from collections import deque from typing import List import pyarrow from opteryx import EOS +from opteryx.compiled.structures import HashSet from opteryx.compiled.structures import HashTable from opteryx.models import QueryProperties from opteryx.utils.arrow import align_tables @@ -52,8 +54,6 @@ def left_join(left_relation, right_relation, left_columns: List[str], right_colu Returns: A pyarrow.Table containing the result of the LEFT JOIN operation. """ - from collections import deque - from opteryx.compiled.structures.hash_table import hash_join_map left_indexes: deque = deque() @@ -190,16 +190,15 @@ def left_anti_join( Returns: A pyarrow.Table containing the result of the LEFT ANTI JOIN operation. """ - hash_table = HashTable() non_null_right_values = right_relation.select(right_columns).itercolumns() - for i, value_tuple in enumerate(zip(*non_null_right_values)): - hash_table.insert(hash(value_tuple), i) + right_hash_set = set(zip(*non_null_right_values)) left_indexes = [] left_values = left_relation.select(left_columns).itercolumns() for i, value_tuple in enumerate(zip(*left_values)): - rows = hash_table.get(hash(value_tuple)) - if not rows: # Only include left rows that have no match in the right table + if ( + value_tuple not in right_hash_set + ): # Only include left rows that have no match in the right table left_indexes.append(i) # Filter the left_chunk based on the anti join condition diff --git a/opteryx/planner/binder/binder.py b/opteryx/planner/binder/binder.py index fa079815..d403f13a 100644 --- a/opteryx/planner/binder/binder.py +++ b/opteryx/planner/binder/binder.py @@ -138,7 +138,13 @@ def create_variable_node(node: Node, context: BindingContext) -> Node: # if there are no candidates, we probably don't know the relation if not candidate_schemas: - raise UnexpectedDatasetReferenceError(dataset=node.source) + if node.source in context.relations: + raise UnexpectedDatasetReferenceError( + dataset=node.source, + message=f"Dataset `{node.source}` is not available after being used on the right side of a ANTI or SEMI JOIN", + ) + else: + raise UnexpectedDatasetReferenceError(dataset=node.source) # look up the column in the candidate schemas column, found_source_relation = locate_identifier_in_loaded_schemas( diff --git a/tests/sql_battery/test_shapes_and_errors_battery.py b/tests/sql_battery/test_shapes_and_errors_battery.py index a20bb3c1..2edfbf0d 100644 --- a/tests/sql_battery/test_shapes_and_errors_battery.py +++ b/tests/sql_battery/test_shapes_and_errors_battery.py @@ -1535,8 +1535,8 @@ ("SELECT * FROM $planets AS P LEFT ANTI JOIN $satellites AS S ON S.id = P.id WHERE P.id > 5;", 0, 20, None), ("SELECT * FROM $planets AS P LEFT ANTI JOIN (SELECT id FROM $satellites WHERE name LIKE 'Moon%') AS S ON S.id = P.id;", 8, 20, None), ("SELECT * FROM GENERATE_SERIES(1, 10) AS C LEFT ANTI JOIN $satellites AS S ON S.id = C;", 0, 1, None), -# ("SELECT * FROM $planets AS P LEFT ANTI JOIN $satellites AS S ON S.id = P.id WHERE S.size > 1000;", 0, 20, None), -# ("SELECT * FROM $planets AS P LEFT SEMI JOIN $satellites AS S ON S.id = P.id WHERE S.name LIKE 'Moon%';", 0, 20, None), + ("SELECT * FROM $planets AS P LEFT SEMI JOIN $satellites AS S ON S.id = P.id WHERE P.name LIKE 'E%';", 1, 20, None), + ("SELECT * FROM $planets AS P LEFT SEMI JOIN $satellites AS S ON S.id = P.id WHERE S.name LIKE 'E%';", 1, 20, UnexpectedDatasetReferenceError), ("SELECT * FROM $planets AS P LEFT SEMI JOIN (SELECT id FROM $satellites WHERE name != 'Moon') AS S ON S.id = P.id;", 8, 20, None), ("SELECT * FROM $planets AS P LEFT SEMI JOIN $satellites AS S ON S.id = P.id WHERE P.name != 'Earth';", 8, 20, None), ("SELECT * FROM GENERATE_SERIES(1, 10) AS G LEFT SEMI JOIN $satellites AS S ON S.id = G;", 10, 1, None), From 1fa522e2ef23e15c15e6f8fa4ab18e07be6903d9 Mon Sep 17 00:00:00 2001 From: XB500 Date: Sat, 21 Dec 2024 21:48:13 +0000 Subject: [PATCH 2/2] Opteryx Version 0.19.0-alpha.906 --- opteryx/__version__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/opteryx/__version__.py b/opteryx/__version__.py index 48d2443d..ea2130aa 100644 --- a/opteryx/__version__.py +++ b/opteryx/__version__.py @@ -1,4 +1,4 @@ -__build__ = 905 +__build__ = 906 # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License.