diff --git a/opteryx/compiled/structures/node.pyx b/opteryx/compiled/structures/node.pyx index 962835e8..80963275 100644 --- a/opteryx/compiled/structures/node.pyx +++ b/opteryx/compiled/structures/node.pyx @@ -1,4 +1,10 @@ # cython: language_level=3 +# cython: nonecheck=False +# cython: cdivision=True +# cython: initializedcheck=False +# cython: infer_types=True +# cython: wraparound=True +# cython: boundscheck=False """ Node Module diff --git a/opteryx/functions/other_functions.py b/opteryx/functions/other_functions.py index 27b5720d..b8145b13 100644 --- a/opteryx/functions/other_functions.py +++ b/opteryx/functions/other_functions.py @@ -15,6 +15,7 @@ import numpy import pyarrow +import simdjson from pyarrow import compute from opteryx.exceptions import SqlError @@ -196,17 +197,46 @@ def cosine_similarity( return similarities -def jsonb_object_keys(arr): +def jsonb_object_keys(arr: numpy.ndarray): + """ + Extract the keys from a NumPy array of JSON objects or JSON strings/bytes. + + Parameters: + arr: numpy.ndarray + A NumPy array of dictionaries or JSON-encoded strings/bytes. + + Returns: + pyarrow.Array + A PyArrow Array containing lists of keys for each input element. + """ + # Early exit for empty input if len(arr) == 0: - return [] - result = [] - if isinstance(arr[0], dict): - result = [[str(key) for key in row] for row in arr] - if isinstance(arr[0], (str, bytes)): - import simdjson + return numpy.array([]) + + # we may get pyarrow arrays here - usually not though + if isinstance(arr, pyarrow.Array): + arr = arr.to_numpy(zero_copy_only=False) + + # Determine type based on dtype of the array + if not numpy.issubdtype(arr.dtype, numpy.object_): + raise ValueError( + "Unsupported array dtype. Expected object dtype for dicts or strings/bytes." + ) - def keys(doc): - return simdjson.Parser().parse(doc).keys() # type:ignore + # Pre-create the result array as a NumPy boolean array set to False + result = numpy.empty(arr.shape, dtype=list) + + if isinstance(arr[0], dict): + # Process dictionaries + for i, row in enumerate(arr): + result[i] = [str(key) for key in row.keys()] + elif isinstance(arr[0], (str, bytes)): + # SIMD-JSON parser instance for JSON string/bytes + parser = simdjson.Parser() + for i, row in enumerate(arr): + result[i] = [str(key) for key in parser.parse(row).keys()] + else: + raise ValueError("Unsupported dtype for array elements. 
Expected dict, str, or bytes.") - result = [[str(key) for key in keys(row)] for row in arr] - return pyarrow.array(result) + # Return the result as a PyArrow array + return result diff --git a/opteryx/managers/expression/binary_operators.py b/opteryx/managers/expression/binary_operators.py index 1e97ad66..95098e03 100644 --- a/opteryx/managers/expression/binary_operators.py +++ b/opteryx/managers/expression/binary_operators.py @@ -18,33 +18,41 @@ import numpy import pyarrow +import simdjson from orso.types import OrsoTypes from pyarrow import compute from opteryx.compiled import list_ops +# Initialize simdjson parser once +parser = simdjson.Parser() + def ArrowOp(documents, elements) -> pyarrow.Array: """JSON Selector""" element = elements[0] - # if it's dicts, extract the value from the dict + # Fast path: if the documents are dicts, delegate to the cython optimized op if len(documents) > 0 and isinstance(documents[0], dict): return list_ops.cython_arrow_op(documents, element) - # if it's a string, parse and extract, we don't need a dict (dicts are s_l_o_w) - # so we can use a library which allows us to access the values directly - import simdjson + if hasattr(documents, "to_numpy"): + documents = documents.to_numpy(zero_copy_only=False) + # Function to extract value from a document def extract(doc: bytes, elem: Union[bytes, str]) -> Any: - value = simdjson.Parser().parse(doc).get(elem) # type:ignore + value = parser.parse(doc).get(elem) # type:ignore if hasattr(value, "as_list"): return value.as_list() if hasattr(value, "as_dict"): - return value.as_dict() + return value.mini return value - return pyarrow.array([None if d is None else extract(d, element) for d in documents]) + # Use a generator expression to lazily evaluate the extraction + extracted_values = (None if d is None else extract(d, element) for d in documents) + + # Return the result as a PyArrow array + return pyarrow.array(extracted_values) def LongArrowOp(documents, elements) -> pyarrow.Array: @@ -54,6 +62,9 @@ def LongArrowOp(documents, elements) -> pyarrow.Array: if len(documents) > 0 and isinstance(documents[0], dict): return list_ops.cython_long_arrow_op(documents, element) + if hasattr(documents, "to_numpy"): + documents = documents.to_numpy(zero_copy_only=False) + import simdjson def extract(doc: bytes, elem: Union[bytes, str]) -> bytes: diff --git a/opteryx/managers/expression/formatter.py b/opteryx/managers/expression/formatter.py index af7b3f1b..fa1e644d 100644 --- a/opteryx/managers/expression/formatter.py +++ b/opteryx/managers/expression/formatter.py @@ -101,8 +101,6 @@ def format_expression(root, qualify: bool = False): "ShiftRight": ">>", "Arrow": "->", "LongArrow": "->>", - "AtQuestion": "@?", - "AtArrow": "@>", } return f"{format_expression(root.left, qualify)} {_map.get(root.value, root.value).upper()} {format_expression(root.right, qualify)}" if node_type == NodeType.EXPRESSION_LIST: @@ -116,6 +114,8 @@ def format_expression(root, qualify: bool = False): "BitwiseOr": "|", "LtEq": "<=", "GtEq": ">=", + "AtQuestion": "@?", + "AtArrow": "@>", } return f"{format_expression(root.left, qualify)} {_map.get(root.value, root.value).upper()} {format_expression(root.right, qualify)}" if node_type == NodeType.UNARY_OPERATOR: diff --git a/opteryx/managers/expression/ops.py b/opteryx/managers/expression/ops.py index e6f59e75..474a961f 100644 --- a/opteryx/managers/expression/ops.py +++ b/opteryx/managers/expression/ops.py @@ -178,9 +178,28 @@ def _inner_filter_operations(arr, operator, value): import simdjson - # Don't 
warn on rule SIM118, the object isn't actually a dictionary + parser = simdjson.Parser() + + if not element.startswith("$."): + # Don't warn on rule SIM118, the object isn't actually a dictionary + return pyarrow.array( + [element in parser.parse(doc).keys() for doc in arr], + type=pyarrow.bool_(), # type:ignore + ) + + _keys = element[2:].split(".") + + def json_path_extract(current_value, keys): + for key in keys: + if key not in current_value: + return False # Key doesn't exist + + # Proceed to the next level of the JSON object + current_value = current_value[key] + return True # Key exists if traversal succeeds + return pyarrow.array( - [element in simdjson.Parser().parse(doc).keys() for doc in arr], + [json_path_extract(parser.parse(doc), _keys) for doc in arr], type=pyarrow.bool_(), # type:ignore ) diff --git a/opteryx/planner/cost_based_optimizer/__init__.py b/opteryx/planner/cost_based_optimizer/__init__.py index 2cdf7fb2..8d24c37c 100644 --- a/opteryx/planner/cost_based_optimizer/__init__.py +++ b/opteryx/planner/cost_based_optimizer/__init__.py @@ -88,6 +88,7 @@ def __init__(self, statistics: QueryStatistics): ProjectionPushdownStrategy(statistics), DistinctPushdownStrategy(statistics), OperatorFusionStrategy(statistics), + LimitPushdownStrategy(statistics), RedundantOperationsStrategy(statistics), ConstantFoldingStrategy(statistics), ] diff --git a/opteryx/planner/cost_based_optimizer/strategies/__init__.py b/opteryx/planner/cost_based_optimizer/strategies/__init__.py index 6820472f..6fc7973a 100644 --- a/opteryx/planner/cost_based_optimizer/strategies/__init__.py +++ b/opteryx/planner/cost_based_optimizer/strategies/__init__.py @@ -1,6 +1,7 @@ from .boolean_simplication import BooleanSimplificationStrategy from .constant_folding import ConstantFoldingStrategy from .distinct_pushdown import DistinctPushdownStrategy +from .limit_pushdown import LimitPushdownStrategy from .operator_fusion import OperatorFusionStrategy from .predicate_pushdown import PredicatePushdownStrategy from .predicate_rewriter import PredicateRewriteStrategy @@ -12,6 +13,7 @@ "BooleanSimplificationStrategy", "ConstantFoldingStrategy", "DistinctPushdownStrategy", + "LimitPushdownStrategy", "OperatorFusionStrategy", "PredicatePushdownStrategy", "PredicateRewriteStrategy", diff --git a/opteryx/planner/cost_based_optimizer/strategies/limit_pushdown.py b/opteryx/planner/cost_based_optimizer/strategies/limit_pushdown.py new file mode 100644 index 00000000..86105d75 --- /dev/null +++ b/opteryx/planner/cost_based_optimizer/strategies/limit_pushdown.py @@ -0,0 +1,63 @@ +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+
+"""
+Optimization Rule - Limit Pushdown
+
+Type: Heuristic
+Goal: Reduce Rows
+
+We try to push the limit to the other side of PROJECTS
+"""
+
+from opteryx.planner.logical_planner import LogicalPlan
+from opteryx.planner.logical_planner import LogicalPlanNode
+from opteryx.planner.logical_planner import LogicalPlanStepType
+
+from .optimization_strategy import OptimizationStrategy
+from .optimization_strategy import OptimizerContext
+
+
+class LimitPushdownStrategy(OptimizationStrategy):
+    def visit(self, node: LogicalPlanNode, context: OptimizerContext) -> OptimizerContext:
+        if not context.optimized_plan:
+            context.optimized_plan = context.pre_optimized_tree.copy()  # type: ignore
+
+        if node.node_type == LogicalPlanStepType.Limit:
+            node.nid = context.node_id
+            context.collected_limits.append(node)
+            return context
+
+        if node.node_type in (
+            LogicalPlanStepType.Join,
+            LogicalPlanStepType.Scan,
+            LogicalPlanStepType.AggregateAndGroup,
+            LogicalPlanStepType.Aggregate,
+            LogicalPlanStepType.Subquery,
+            LogicalPlanStepType.Union,
+            LogicalPlanStepType.Filter,
+        ):
+            # we don't push past here
+            for limit_node in context.collected_limits:
+                self.statistics.optimization_limit_pushdown += 1
+                context.optimized_plan.remove_node(limit_node.nid, heal=True)
+                context.optimized_plan.insert_node_after(
+                    limit_node.nid, limit_node, context.node_id
+                )
+                limit_node.columns = []
+            context.collected_limits.clear()
+
+        return context
+
+    def complete(self, plan: LogicalPlan, context: OptimizerContext) -> LogicalPlan:
+        # No finalization needed for this strategy
+        return plan
diff --git a/opteryx/planner/cost_based_optimizer/strategies/operator_fusion.py b/opteryx/planner/cost_based_optimizer/strategies/operator_fusion.py
index 1c336954..77d8fba5 100644
--- a/opteryx/planner/cost_based_optimizer/strategies/operator_fusion.py
+++ b/opteryx/planner/cost_based_optimizer/strategies/operator_fusion.py
@@ -11,6 +11,11 @@
 # limitations under the License.
 
 """
+Optimization Rule - Operator Fusion
+
+Type: Heuristic
+Goal: Choose more efficient physical implementations.
+
 Some operators can be fused to be faster. 'Fused' opertors are when physical
 operations perform multiple logical operations.
 
diff --git a/opteryx/planner/cost_based_optimizer/strategies/optimization_strategy.py b/opteryx/planner/cost_based_optimizer/strategies/optimization_strategy.py
index 9dd6daae..ec4e3cbe 100644
--- a/opteryx/planner/cost_based_optimizer/strategies/optimization_strategy.py
+++ b/opteryx/planner/cost_based_optimizer/strategies/optimization_strategy.py
@@ -34,7 +34,10 @@ def __init__(self, tree: LogicalPlan):
         """We collect column identities so we can push column selection as close to the
         read as possible, including off to remote systems"""
         self.collected_distincts: list = []
-        """We collect distincts to try to eliminate records earlier"""
+        """We collect distincts to try to eliminate rows earlier"""
+
+        self.collected_limits: list = []
+        """We collect limits to try to eliminate rows earlier"""
 
 
 class OptimizationStrategy:
diff --git a/opteryx/planner/cost_based_optimizer/strategies/predicate_pushdown.py b/opteryx/planner/cost_based_optimizer/strategies/predicate_pushdown.py
index f832c99b..ddcaa188 100644
--- a/opteryx/planner/cost_based_optimizer/strategies/predicate_pushdown.py
+++ b/opteryx/planner/cost_based_optimizer/strategies/predicate_pushdown.py
@@ -11,7 +11,10 @@
 # limitations under the License.
 
 """
-PUSH DOWN
+Optimization Rule - Predicate Pushdown
+
+Type: Heuristic
+Goal: Filter rows as early as possible
 
 One main heuristic strategy is it eliminate rows to be processed as early
 as possible, to do that we try to push filter conditions to as close to the
@@ -62,7 +65,6 @@ def visit(self, node: LogicalPlanNode, context: OptimizerContext) -> OptimizerCo
         ):
             # Handle predicates specific to node types
             context = self._handle_predicates(node, context)
-            self.statistics.optimization_predicate_pushdown += 1
         context.optimized_plan.add_node(context.node_id, LogicalPlanNode(**node.properties))
         if context.last_nid:
             context.optimized_plan.add_edge(context.node_id, context.last_nid)
diff --git a/opteryx/planner/cost_based_optimizer/strategies/predicate_rewriter.py b/opteryx/planner/cost_based_optimizer/strategies/predicate_rewriter.py
index 1f0d5b6e..6d64cfbf 100644
--- a/opteryx/planner/cost_based_optimizer/strategies/predicate_rewriter.py
+++ b/opteryx/planner/cost_based_optimizer/strategies/predicate_rewriter.py
@@ -11,7 +11,10 @@
 # limitations under the License.
 
 """
-PREDICATE REWRITER
+Optimization Rule - Predicate Rewriter
+
+Type: Heuristic
+Goal: Choose more efficient predicate evaluations
 
 We rewrite some conditions to a more optimal form; for example if doing a
 LIKE comparison and the pattern contains no wildcards, we rewrite to be an
diff --git a/opteryx/planner/cost_based_optimizer/strategies/projection_pushdown.py b/opteryx/planner/cost_based_optimizer/strategies/projection_pushdown.py
index 87fabeb7..5f936cbc 100644
--- a/opteryx/planner/cost_based_optimizer/strategies/projection_pushdown.py
+++ b/opteryx/planner/cost_based_optimizer/strategies/projection_pushdown.py
@@ -10,6 +10,21 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+"""
+Optimization Rule - Projection Pushdown
+
+Type: Heuristic
+Goal: Limit columns which need to be moved around
+
+We bind from the scans, exposing the available columns to each operator
+as we make our way to the top of the plan (usually the SELECT). The projection
+pushdown is done as part of the optimizers, but isn't quite like the other
+optimizations; this is collecting used column information as it goes from the
+top of the plan down to the scans. The other optimizations tend to move or
+remove operations, or update what a step does; this is just collecting and
+updating the used columns.
+"""
+
 from typing import Set
 
 from opteryx.managers.expression import NodeType
@@ -81,7 +96,6 @@ def visit(self, node: LogicalPlanNode, context: OptimizerContext) -> OptimizerCo
             node.columns = node_columns
 
         context.optimized_plan.add_node(context.node_id, LogicalPlanNode(**node.properties))
-        self.statistics.optimization_projection_pushdown += 1
         if context.parent_nid:
             context.optimized_plan.add_edge(context.node_id, context.parent_nid)
 
diff --git a/opteryx/planner/cost_based_optimizer/strategies/redundant_operators.py b/opteryx/planner/cost_based_optimizer/strategies/redundant_operators.py
index 8a3df5c5..2c2bd205 100644
--- a/opteryx/planner/cost_based_optimizer/strategies/redundant_operators.py
+++ b/opteryx/planner/cost_based_optimizer/strategies/redundant_operators.py
@@ -11,6 +11,11 @@
 # limitations under the License.
 
 """
+Optimization Rule - Remove Redundant Operators
+
+Type: Heuristic
+Goal: Remove steps which don't affect the result
+
 This optimization runs toward the end of the set, it removes operators which
 were useful during planning and optimization.
diff --git a/opteryx/planner/cost_based_optimizer/strategies/split_conjunctive_predicates.py b/opteryx/planner/cost_based_optimizer/strategies/split_conjunctive_predicates.py
index f670cdb0..777865bb 100644
--- a/opteryx/planner/cost_based_optimizer/strategies/split_conjunctive_predicates.py
+++ b/opteryx/planner/cost_based_optimizer/strategies/split_conjunctive_predicates.py
@@ -10,6 +10,13 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+"""
+Optimization Rule - Split Conjunctions
+
+Type: Heuristic
+Goal: Break filters into units which are easier to handle
+"""
+
 from orso.tools import random_string
 
 from opteryx.managers.expression import NodeType
diff --git a/opteryx/planner/sql_rewriter.py b/opteryx/planner/sql_rewriter.py
index 9a173f5c..f67371c4 100644
--- a/opteryx/planner/sql_rewriter.py
+++ b/opteryx/planner/sql_rewriter.py
@@ -70,7 +70,7 @@
 from opteryx.exceptions import InvalidTemporalRangeFilterError
 from opteryx.utils import dates
 
-COLLECT_RELATION = [
+COLLECT_RELATION = {
     r"FROM",
     r"INNER\sJOIN",
     r"CROSS\sJOIN",
@@ -87,11 +87,11 @@
     r"JOIN",
     r"CREATE\sTABLE",
     r"ANALYZE\sTABLE",
-]
+}
 
-COLLECT_TEMPORAL = [r"FOR"]
+COLLECT_TEMPORAL = {r"FOR"}
 
-STOP_COLLECTING = [
+STOP_COLLECTING = {
     r"GROUP\sBY",
     r"HAVING",
     r"LIKE",
@@ -107,32 +107,35 @@
     r";",
     r",",
     r"UNION",
-]
+}
 
-COLLECT_ALIAS = [r"AS"]
+COLLECT_ALIAS = {r"AS"}
 
-BOUNDARIES = [r"(", r")"]
+BOUNDARIES = {r"(", r")"}
+
+FOR_DATE_CLAUSES = {
+    r"DATES\sIN\s\w+",
+    r"DATES\sBETWEEN\s[^\r\n\t\f\v]AND\s[^\r\n\t\f\v]",
+    r"DATES\sSINCE\s\w+",
+}
+
+FUNCTIONS_WITH_FROM_SYNTAX = {"EXTRACT", "SUBSTRING", "TRIM"}
 
 SQL_PARTS = (
-    COLLECT_RELATION
-    + COLLECT_TEMPORAL
-    + STOP_COLLECTING
-    + COLLECT_ALIAS
-    + [
-        r"DATES\sIN\s\w+",
-        r"DATES\sBETWEEN\s[^\r\n\t\f\v]AND\s[^\r\n\t\f\v]",
-        r"DATES\sSINCE\s\w+",
-    ]
+    COLLECT_RELATION.union(COLLECT_TEMPORAL)
+    .union(STOP_COLLECTING)
+    .union(COLLECT_ALIAS)
+    .union(FOR_DATE_CLAUSES)
 )
 
 COMBINE_WHITESPACE_REGEX = re.compile(r"\r\n\t\f\v+")
 
 # states for the collection algorithm
 WAITING: int = 1
-RELATION: int = 4
-TEMPORAL: int = 16
-ALIAS: int = 64
-FUNCTION_RELATION: int = 128
+RELATION: int = 2
+TEMPORAL: int = 4
+ALIAS: int = 8
+FUNCTION_RELATION: int = 16
 
 
 def sql_parts(string):
@@ -259,6 +262,9 @@ def _temporal_extration_state_machine(
     #
     # We're essentially using a bit mask to record state and transitions.
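+    # Track whether we are inside a function that uses FROM/FOR as part of its
+    # own syntax (EXTRACT, SUBSTRING, TRIM - see FUNCTIONS_WITH_FROM_SYNTAX above),
+    # so those keywords are not misread as relation or temporal clauses.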
+ in_special_function = False + special_function_brackets = 0 + state = WAITING relation = "" temporal = "" @@ -270,29 +276,37 @@ def _temporal_extration_state_machine( transition = [state] comparable_part = part.upper().replace(" ", r"\s") + if comparable_part in FUNCTIONS_WITH_FROM_SYNTAX: + in_special_function = True + special_function_brackets = open_count + # work out what our current state is - if comparable_part in BOUNDARIES: + elif comparable_part in BOUNDARIES: + if comparable_part == "(": + open_count += 1 + if comparable_part == ")": + open_count -= 1 if relation == "": state = WAITING else: # function relations, like FAKE(234,234) need the items between the # brackets be be consumed state = FUNCTION_RELATION - if comparable_part == "(": - open_count += 1 - if comparable_part == ")": - open_count -= 1 - if comparable_part in STOP_COLLECTING: - if state == FUNCTION_RELATION and open_count > 0: - pass - else: - state = WAITING - if comparable_part in COLLECT_RELATION: - state = RELATION - if comparable_part in COLLECT_TEMPORAL: - state = TEMPORAL - if comparable_part in COLLECT_ALIAS: - state = ALIAS + elif in_special_function and open_count == special_function_brackets: + in_special_function = False + + if not in_special_function: + if comparable_part in STOP_COLLECTING: + if state == FUNCTION_RELATION and open_count > 0: + pass + else: + state = WAITING + if comparable_part in COLLECT_RELATION: + state = RELATION + if comparable_part in COLLECT_TEMPORAL: + state = TEMPORAL + if comparable_part in COLLECT_ALIAS: + state = ALIAS transition.append(state) # based on what the state was and what it is now, do something diff --git a/testdata/flat/nvd/nvd.parquet b/testdata/flat/nvd/nvd.parquet new file mode 100644 index 00000000..0429614e Binary files /dev/null and b/testdata/flat/nvd/nvd.parquet differ diff --git a/tests/plan_optimization/test_optimizations_invoked.py b/tests/plan_optimization/test_optimizations_invoked.py index 78c0dfec..cda3d4c8 100644 --- a/tests/plan_optimization/test_optimizations_invoked.py +++ b/tests/plan_optimization/test_optimizations_invoked.py @@ -18,7 +18,8 @@ ("SELECT * FROM $planets WHERE NOT id != 4", "optimization_boolean_rewrite_inversion"), ("SELECT * FROM $planets WHERE id = 4 + 4", "optimization_constant_fold_expression"), ("SELECT * FROM $planets WHERE id * 0 = 1", "optimization_constant_fold_reduce"), - + ("SELECT id ^ 1 = 1 FROM $planets LIMIT 10", "optimization_limit_pushdown"), + ("SELECT name FROM $astronauts WHERE name = 'Neil A. 
Armstrong'", "optimization_predicate_pushdown") ] # fmt:on diff --git a/tests/plan_optimization/test_temporal_extraction.py b/tests/plan_optimization/test_temporal_extraction.py index d466109e..477d3360 100644 --- a/tests/plan_optimization/test_temporal_extraction.py +++ b/tests/plan_optimization/test_temporal_extraction.py @@ -101,6 +101,16 @@ ("SELECT * FROM $planets FOR YESTERDAY WHERE id IN (SELECT * FROM $planets);", [('$planets', YESTERDAY, YESTERDAY.replace(hour=23, minute=59)), ('$planets', None, None)]), ("SELECT * FROM $planets WHERE id IN (SELECT * FROM $planets FOR YESTERDAY);", [('$planets', None, None), ('$planets', YESTERDAY, YESTERDAY.replace(hour=23, minute=59))]), ("SELECT * FROM $planets WHERE id IN (SELECT * FROM $planets);", [('$planets', None, None), ('$planets', None, None)]), + # FROM in functions + ("SELECT EXTRACT(YEAR FROM birth_date) FROM $astronauts", [("$astronauts", None, None)]), + ("SELECT SUBSTRING(name FROM 1 FOR 1) FROM $astronauts", [("$astronauts", None, None)]), + ("SELECT EXTRACT(YEAR FROM CURRENT_TIME)", []), + ("SELECT SUBSTRING('name' FROM 1 FOR 1)", []), + ("SELECT EXTRACT(YEAR FROM birth_date) FROM $astronauts FOR TODAY", [("$astronauts", THIS_MORNING, TONIGHT)]), + ("SELECT SUBSTRING(name FROM 1 FOR 1) FROM $astronauts FOR TODAY", [("$astronauts", THIS_MORNING, TONIGHT)]), + ("SELECT TRIM ( 'foo' FROM 'foo' )", []), + ("SELECT TRIM ( 'MVEJSONP' FROM name ) FROM $planets", [("$planets", None, None)]), + ("SELECT TRIM ( 'MVEJSONP' FROM name ) FROM $planets FOR TODAY", [("$planets", THIS_MORNING, TONIGHT)]), ] # fmt:on diff --git a/tests/sql_battery/test_battery_sql92.py b/tests/sql_battery/test_battery_sql92.py index c075f8bf..121e1ee9 100644 --- a/tests/sql_battery/test_battery_sql92.py +++ b/tests/sql_battery/test_battery_sql92.py @@ -119,8 +119,8 @@ # ("SELECT CHAR_LENGTH ( 'foo' USING CHARACTERS )", "E021-04"), # ("SELECT CHAR_LENGTH ( 'foo' USING OCTETS )", "E021-04"), # ("SELECT OCTET_LENGTH ( 'foo' )", "E021-05"), -# ("SELECT SUBSTRING ( 'foo' FROM 1 )", "E021-06"), -# ("SELECT SUBSTRING ( 'foo' FROM 1 FOR 2 )", "E021-06"), + ("SELECT SUBSTRING ( 'foo' FROM 1 )", "E021-06"), + ("SELECT SUBSTRING ( 'foo' FROM 1 FOR 2 )", "E021-06"), # ("SELECT SUBSTRING ( 'foo' FROM 1 FOR 2 USING CHARACTERS )", "E021-06"), # ("SELECT SUBSTRING ( 'foo' FROM 1 FOR 2 USING OCTETS )", "E021-06"), # ("SELECT SUBSTRING ( 'foo' FROM 1 USING CHARACTERS )", "E021-06"), @@ -132,7 +132,7 @@ ("SELECT TRIM ( 'foo' FROM 'foo' )", "E021-09"), ("SELECT TRIM ( BOTH 'foo' FROM 'foo' )", "E021-09"), # ("SELECT TRIM ( BOTH FROM 'foo' )", "E021-09"), - # ("SELECT TRIM ( FROM 'foo' )", "E021-09"), +# ("SELECT TRIM ( FROM 'foo' )", "E021-09"), ("SELECT TRIM ( LEADING 'foo' FROM 'foo' )", "E021-09"), # ("SELECT TRIM ( LEADING FROM 'foo' )", "E021-09"), ("SELECT TRIM ( TRAILING 'foo' FROM 'foo' )", "E021-09"), diff --git a/tests/sql_battery/test_shapes_and_errors_battery.py b/tests/sql_battery/test_shapes_and_errors_battery.py index 13dcc3bf..d43408e2 100644 --- a/tests/sql_battery/test_shapes_and_errors_battery.py +++ b/tests/sql_battery/test_shapes_and_errors_battery.py @@ -672,9 +672,20 @@ ("SELECT BP->>'state' FROM (SELECT VARCHAR(birth_place) AS BP FROM $astronauts) AS I", 357, 1, None), ("SELECT BP->>'address' FROM (SELECT VARCHAR(birth_place) AS BP FROM $astronauts) AS I", 357, 1, None), ("SELECT dict->>'list', dict->'list' AS thisisalongercolumnname, STRUCT(dict)->'list', dict->>'once', dict->'once' FROM testdata.flat.struct", 6, 5, None), + ("SELECT cve -> 'CVE_data_meta' 
->> 'ASSIGNER' FROM testdata.flat.nvd limit 10", 10, 1, None), + ("SELECT cve ->> 'CVE_data_meta' ->> 'ASSIGNER' FROM testdata.flat.nvd limit 10", 10, 1, None), + ("SELECT cve -> 'CVE_data_meta' -> 'ASSIGNER' FROM testdata.flat.nvd limit 10", 10, 1, None), ("SELECT dict @? 'list' FROM testdata.flat.struct", 6, 1, None), ("SELECT struct(dict) @? 'list' FROM testdata.flat.struct", 6, 1, None), ("SELECT birth_place @? 'town' FROM $astronauts", 357, 1, None), + ("SELECT dict @? '$.list' FROM testdata.flat.struct", 6, 1, None), + ("SELECT cve @? '$.CVE_data_meta.ASSIGNER' FROM testdata.flat.nvd LIMIT 10", 10, 1, None), + ("SELECT cve @? '$.data_meta.ASSIGNER' FROM testdata.flat.nvd LIMIT 10", 10, 1, None), + ("SELECT cve @? '$.CVE_data_meta' FROM testdata.flat.nvd LIMIT 10", 10, 1, None), + ("SELECT cve @? 'CVE_data_meta' FROM testdata.flat.nvd LIMIT 10", 10, 1, None), + ("SELECT cve @? '$.CVE_data_meta.REASSIGNER' FROM testdata.flat.nvd LIMIT 10", 10, 1, None), + ("SELECT struct(dict) @? '$.list' FROM testdata.flat.struct", 6, 1, None), + ("SELECT birth_place @? '$.town' FROM $astronauts", 357, 1, None), ("SELECT birth_place['town'] FROM $astronauts", 357, 1, None), ("SELECT missions[0] FROM $astronauts", 357, 1, None), ("SELECT birth_place['town'] FROM $astronauts WHERE birth_place['town'] = 'Warsaw'", 1, 1, None), @@ -1287,6 +1298,20 @@ ("SELECT name FROM $planets WHERE SUBSTRING ( name, 2, 1 ) = 'a'", 3, 1, None), ("SELECT name FROM $planets WHERE SUBSTRING ( name, 3 ) = 'rth'", 1, 1, None), ("SELECT name FROM $planets WHERE SUBSTRING ( name, -1 ) = 's'", 3, 1, None), + ("SELECT name FROM $planets WHERE SUBSTRING ( name FROM 1 FOR 1 ) = 'M'", 2, 1, None), + ("SELECT name FROM $planets WHERE SUBSTRING ( name FROM 2 FOR 1 ) = 'a'", 3, 1, None), + ("SELECT name FROM $planets WHERE SUBSTRING ( name FROM 3 ) = 'rth'", 1, 1, None), + ("SELECT name FROM $planets WHERE SUBSTRING ( name FROM -1 ) = 's'", 3, 1, None), + ("SELECT SUBSTRING ( name FROM 5 FOR 2 ) FROM $planets", 9, 1, None), + ("SELECT name FROM $planets FOR TODAY WHERE SUBSTRING ( name, 1, 1 ) = 'M'", 2, 1, None), + ("SELECT name FROM $planets FOR TODAY WHERE SUBSTRING ( name, 2, 1 ) = 'a'", 3, 1, None), + ("SELECT name FROM $planets FOR TODAY WHERE SUBSTRING ( name, 3 ) = 'rth'", 1, 1, None), + ("SELECT name FROM $planets FOR TODAY WHERE SUBSTRING ( name, -1 ) = 's'", 3, 1, None), + ("SELECT name FROM $planets FOR TODAY WHERE SUBSTRING ( name FROM 1 FOR 1 ) = 'M'", 2, 1, None), + ("SELECT name FROM $planets FOR TODAY WHERE SUBSTRING ( name FROM 2 FOR 1 ) = 'a'", 3, 1, None), + ("SELECT name FROM $planets FOR TODAY WHERE SUBSTRING ( name FROM 3 ) = 'rth'", 1, 1, None), + ("SELECT name FROM $planets FOR TODAY WHERE SUBSTRING ( name FROM -1 ) = 's'", 3, 1, None), + ("SELECT SUBSTRING ( name FROM 5 FOR 2 ) FROM $planets FOR TODAY ", 9, 1, None), ("SELECT TIMESTAMP '2022-01-02', DATEDIFF('days', TIMESTAMP '2022-01-02', TIMESTAMP '2022-10-01') FROM $astronauts;", 357, 2, None), ("SELECT * FROM $satellites WHERE NULLIF(planetId, 5) IS NULL", 67, 8, None), ("SELECT * FROM $satellites WHERE NULLIF(planetId, 5) IS NOT NULL", 110, 8, None), @@ -1350,6 +1375,8 @@ ("SELECT TRIM(LEADING 'E' FROM name) FROM $planets;", 9, 1, None), ("SELECT * FROM $planets WHERE TRIM(TRAILING 'arth' FROM name) = 'E'", 1, 20, None), ("SELECT * FROM $planets WHERE TRIM(TRAILING 'ahrt' FROM name) = 'E'", 1, 20, None), + ("SELECT TRIM ( 'MVEJSONP' FROM name ) FROM $planets", 9, 1, None), + ("SELECT TRIM ( 'MVEJSONP' FROM name ) FROM $planets FOR TODAY", 9, 1, None), 
("SELECT user_name, user_verified FROM testdata.flat.formats.parquet WITH(NO_PARTITION) WHERE user_verified IS TRUE", 711, 2, None), ("SELECT user_name, user_verified FROM testdata.flat.formats.parquet WITH(NO_PARTITION) WHERE user_verified = TRUE", 711, 2, None), @@ -1724,6 +1751,13 @@ ("SELECT name FROM (SELECT MD5(name) AS hash, name FROM $planets) AS S", 9, 1, None), + ("SELECT jsonb_object_keys(birth_place) FROM $astronauts", 357, 1, None), + ("SELECT jsonb_object_keys(VARCHAR(birth_place)) FROM $astronauts", 357, 1, None), + ("SELECT jsonb_object_keys(BLOB(birth_place)) FROM $astronauts", 357, 1, None), + ("SELECT jsonb_object_keys(birth_place) FROM testdata.astronauts", 357, 1, None), + ("SELECT jsonb_object_keys(VARCHAR(birth_place)) FROM testdata.astronauts", 357, 1, None), + ("SELECT jsonb_object_keys(BLOB(birth_place)) FROM testdata.astronauts", 357, 1, None), + # Edge Case with Empty Joins ("SELECT * FROM $planets LEFT JOIN (SELECT id FROM $satellites WHERE planetId < 0) AS S ON $planets.id = S.id", 9, 21, None), # Handling NULL Comparisons in WHERE Clause