Skip to content

Commit

Permalink
Merge remote-tracking branch 'origin/main'
Browse files Browse the repository at this point in the history
  • Loading branch information
syou6162 committed Dec 20, 2024
2 parents 468c808 + a18288f commit 0f07280
Show file tree
Hide file tree
Showing 21 changed files with 164 additions and 117 deletions.
9 changes: 8 additions & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,12 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

## [Unreleased]

### Added

### Changed

## [0.10.16] - 2024-12-19

### Added
- Support for exporting a Data Contract to an Iceberg schema definition.
- When importing in dbt format, add the dbt `not_null` information as a datacontract `required` field (#547)
Expand All @@ -22,8 +28,9 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
- SodaCL: Prevent `KeyError: 'fail'` from happening when testing with SodaCL
- fix: populate database and schema values for bigquery in exported dbt sources (#543)
- Fixing the options for importing and exporting to standard output (#544)
- Fixing the data quality name for model-level and field-level quality tests

## [0.10.15] - 2024-10-26
## [0.10.15] - 2024-12-02

### Added
- Support for model import from parquet file metadata.
Expand Down
6 changes: 3 additions & 3 deletions datacontract/breaking/breaking.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
from datacontract.breaking.breaking_rules import BreakingRules
from datacontract.model.breaking_change import BreakingChange, Location, Severity
from datacontract.model.data_contract_specification import Contact, Field, Info, Model, Quality, Terms
from datacontract.model.data_contract_specification import Contact, DeprecatedQuality, Field, Info, Model, Terms


def info_breaking_changes(
Expand Down Expand Up @@ -216,8 +216,8 @@ def terms_breaking_changes(


def quality_breaking_changes(
old_quality: Quality,
new_quality: Quality,
old_quality: DeprecatedQuality,
new_quality: DeprecatedQuality,
new_path: str,
include_severities: [Severity],
) -> list[BreakingChange]:
Expand Down
8 changes: 4 additions & 4 deletions datacontract/engines/soda/check_soda_execute.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
from datacontract.engines.soda.connections.trino import to_trino_soda_configuration
from datacontract.export.sodacl_converter import to_sodacl_yaml
from datacontract.model.data_contract_specification import DataContractSpecification, Server
from datacontract.model.run import Check, Log, Run
from datacontract.model.run import Check, Log, ResultEnum, Run


def check_soda_execute(run: Run, data_contract: DataContractSpecification, server: Server, spark, tmp_dir):
Expand All @@ -33,7 +33,7 @@ def check_soda_execute(run: Run, data_contract: DataContractSpecification, serve
Check(
type="general",
name="Check that format is supported",
result="warning",
result=ResultEnum.warning,
reason=f"Format {server.format} not yet supported by datacontract CLI",
engine="datacontract",
)
Expand Down Expand Up @@ -93,7 +93,7 @@ def check_soda_execute(run: Run, data_contract: DataContractSpecification, serve
Check(
type="general",
name="Check that server type is supported",
result="warning",
result=ResultEnum.warning,
reason=f"Server type {server.type} not yet supported by datacontract CLI",
engine="datacontract-cli",
)
Expand Down Expand Up @@ -183,4 +183,4 @@ def update_reason(check, c):
# print(check.reason)
break # Exit the loop once the desired block is found
if "fail" in c["diagnostics"]:
check.reason = f"Got: {c['diagnostics']['value']} Expected: {c['diagnostics']['fail']}"
check.reason = f"Value: {c['diagnostics']['value']} Fails`: {c['diagnostics']['fail']}"
8 changes: 2 additions & 6 deletions datacontract/export/dbt_converter.py
Original file line number Diff line number Diff line change
Expand Up @@ -177,9 +177,7 @@ def _to_column(field_name: str, field: Field, supports_constraints: bool, adapte
length_test["min_value"] = field.minLength
if field.maxLength is not None:
length_test["max_value"] = field.maxLength
column["data_tests"].append(
{"dbt_expectations.expect_column_value_lengths_to_be_between": length_test}
)
column["data_tests"].append({"dbt_expectations.expect_column_value_lengths_to_be_between": length_test})
if field.pii is not None:
column.setdefault("meta", {})["pii"] = field.pii
if field.classification is not None:
Expand All @@ -188,9 +186,7 @@ def _to_column(field_name: str, field: Field, supports_constraints: bool, adapte
column.setdefault("tags", []).extend(field.tags)
if field.pattern is not None:
# Beware, the data contract pattern is a regex, not a like pattern
column["data_tests"].append(
{"dbt_expectations.expect_column_values_to_match_regex": {"regex": field.pattern}}
)
column["data_tests"].append({"dbt_expectations.expect_column_values_to_match_regex": {"regex": field.pattern}})
if (
field.minimum is not None
or field.maximum is not None
Expand Down
1 change: 0 additions & 1 deletion datacontract/export/exporter.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,6 @@ class ExportFormat(str, Enum):
dcs = "dcs"
iceberg = "iceberg"


@classmethod
def get_supported_formats(cls):
return list(map(lambda c: c.value, cls))
Expand Down
72 changes: 43 additions & 29 deletions datacontract/export/odcs_v3_exporter.py
Original file line number Diff line number Diff line change
Expand Up @@ -148,6 +148,10 @@ def to_odcs_schema(model_key, model_value: Model) -> dict:
if properties:
odcs_table["properties"] = properties

model_quality = to_odcs_quality_list(model_value.quality)
if len(model_quality) > 0:
odcs_table["quality"] = model_quality

odcs_table["customProperties"] = []
if model_value.model_extra is not None:
for key, value in model_value.model_extra.items():
Expand Down Expand Up @@ -257,38 +261,48 @@ def to_property(field_name: str, field: Field) -> dict:
del property["logicalTypeOptions"]

if field.quality is not None:
quality_property = []
for quality in field.quality:
quality_dict = {"type": quality.type}
if quality.description is not None:
quality_dict["description"] = quality.description
if quality.query is not None:
quality_dict["query"] = quality.query
# dialect is not supported in v3.0.0
if quality.mustBe is not None:
quality_dict["mustBe"] = quality.mustBe
if quality.mustNotBe is not None:
quality_dict["mustNotBe"] = quality.mustNotBe
if quality.mustBeGreaterThan is not None:
quality_dict["mustBeGreaterThan"] = quality.mustBeGreaterThan
if quality.mustBeGreaterThanOrEqualTo is not None:
quality_dict["mustBeGreaterThanOrEqualTo"] = quality.mustBeGreaterThanOrEqualTo
if quality.mustBeLessThan is not None:
quality_dict["mustBeLessThan"] = quality.mustBeLessThan
if quality.mustBeLessThanOrEqualTo is not None:
quality_dict["mustBeLessThanOrEqualTo"] = quality.mustBeLessThanOrEqualTo
if quality.mustBeBetween is not None:
quality_dict["mustBeBetween"] = quality.mustBeBetween
if quality.mustNotBeBetween is not None:
quality_dict["mustNotBeBetween"] = quality.mustNotBeBetween
if quality.engine is not None:
quality_dict["engine"] = quality.engine
if quality.implementation is not None:
quality_dict["implementation"] = quality.implementation
quality_property.append(quality_dict)
quality_list = field.quality
quality_property = to_odcs_quality_list(quality_list)
if len(quality_property) > 0:
property["quality"] = quality_property

# todo enum

return property


def to_odcs_quality_list(quality_list):
quality_property = []
for quality in quality_list:
quality_property.append(to_odcs_quality(quality))
return quality_property


def to_odcs_quality(quality):
quality_dict = {"type": quality.type}
if quality.description is not None:
quality_dict["description"] = quality.description
if quality.query is not None:
quality_dict["query"] = quality.query
# dialect is not supported in v3.0.0
if quality.mustBe is not None:
quality_dict["mustBe"] = quality.mustBe
if quality.mustNotBe is not None:
quality_dict["mustNotBe"] = quality.mustNotBe
if quality.mustBeGreaterThan is not None:
quality_dict["mustBeGreaterThan"] = quality.mustBeGreaterThan
if quality.mustBeGreaterThanOrEqualTo is not None:
quality_dict["mustBeGreaterThanOrEqualTo"] = quality.mustBeGreaterThanOrEqualTo
if quality.mustBeLessThan is not None:
quality_dict["mustBeLessThan"] = quality.mustBeLessThan
if quality.mustBeLessThanOrEqualTo is not None:
quality_dict["mustBeLessThanOrEqualTo"] = quality.mustBeLessThanOrEqualTo
if quality.mustBeBetween is not None:
quality_dict["mustBeBetween"] = quality.mustBeBetween
if quality.mustNotBeBetween is not None:
quality_dict["mustNotBeBetween"] = quality.mustNotBeBetween
if quality.engine is not None:
quality_dict["engine"] = quality.engine
if quality.implementation is not None:
quality_dict["implementation"] = quality.implementation
return quality_dict
5 changes: 3 additions & 2 deletions datacontract/export/sodacl_converter.py
Original file line number Diff line number Diff line change
Expand Up @@ -200,9 +200,9 @@ def check_quality_list(model_name, field_name, quality_list: List[Quality]):
for quality in quality_list:
if quality.type == "sql":
if field_name is None:
metric_name = f"{model_name}_{field_name}_quality_sql_{count}"
else:
metric_name = f"{model_name}_quality_sql_{count}"
else:
metric_name = f"{model_name}_{field_name}_quality_sql_{count}"
threshold = to_sodacl_threshold(quality)
query = prepare_query(quality, model_name, field_name)
if query is None:
Expand Down Expand Up @@ -265,6 +265,7 @@ def to_sodacl_threshold(quality: Quality) -> str | None:
return None


# These are deprecated root-level quality specifications, use the model-level and field-level quality fields instead
def add_quality_checks(sodacl, data_contract_spec):
if data_contract_spec.quality is None:
return
Expand Down
5 changes: 5 additions & 0 deletions datacontract/imports/odcs_v3_importer.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
Field,
Info,
Model,
Quality,
Retention,
Server,
ServiceLevel,
Expand Down Expand Up @@ -193,6 +194,10 @@ def import_models(odcs_contract: Dict[str, Any]) -> Dict[str, Model]:
model.fields = import_fields(
odcs_schema.get("properties"), custom_type_mappings, server_type=get_server_type(odcs_contract)
)
if odcs_schema.get("quality") is not None:
# convert dict to pydantic model

model.quality = [Quality.model_validate(q) for q in odcs_schema.get("quality")]
model.title = schema_name
if odcs_schema.get("dataGranularityDescription") is not None:
model.config = {"dataGranularityDescription": odcs_schema.get("dataGranularityDescription")}
Expand Down
8 changes: 6 additions & 2 deletions datacontract/lint/resolve.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,11 @@
from datacontract.lint.resources import read_resource
from datacontract.lint.schema import fetch_schema
from datacontract.lint.urls import fetch_resource
from datacontract.model.data_contract_specification import DataContractSpecification, Definition, Quality
from datacontract.model.data_contract_specification import (
DataContractSpecification,
Definition,
DeprecatedQuality,
)
from datacontract.model.exceptions import DataContractException
from datacontract.model.odcs import is_open_data_contract_standard

Expand Down Expand Up @@ -156,7 +160,7 @@ def _fetch_file(path) -> str:
return file.read()


def _resolve_quality_ref(quality: Quality):
def _resolve_quality_ref(quality: DeprecatedQuality):
"""
Return the content of a ref file path
@param quality data contract quality specification
Expand Down
5 changes: 3 additions & 2 deletions datacontract/model/data_contract_specification.py
Original file line number Diff line number Diff line change
Expand Up @@ -214,7 +214,8 @@ class Example(pyd.BaseModel):
data: str | object = None


class Quality(pyd.BaseModel):
# Deprecated Quality class
class DeprecatedQuality(pyd.BaseModel):
type: str = None
specification: str | object = None

Expand Down Expand Up @@ -287,7 +288,7 @@ class DataContractSpecification(pyd.BaseModel):
default_factory=list,
deprecated="Removed in Data Contract Specification " "v1.1.0. Use models.examples instead.",
)
quality: Quality = pyd.Field(
quality: DeprecatedQuality = pyd.Field(
default=None,
deprecated="Removed in Data Contract Specification v1.1.0. Use " "model-level and field-level quality instead.",
)
Expand Down
6 changes: 4 additions & 2 deletions pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[project]
name = "datacontract-cli"
version = "0.10.15"
version = "0.10.16"
description = "The datacontract CLI is an open source command-line tool for working with Data Contracts. It uses data contract YAML files to lint the data contract, connect to data sources and execute schema and quality tests, detect breaking changes, and export to different formats. The tool is written in Python. It can be used as a standalone CLI tool, in a CI/CD pipeline, or directly as a Python library."
readme = "README.md"
authors = [
Expand Down Expand Up @@ -46,7 +46,9 @@ avro = [
]

bigquery = [
"soda-core-bigquery>=3.3.1,<3.5.0"
"soda-core-bigquery>=3.3.1,<3.5.0",
"google-cloud-bigquery-storage>=2.27.0",
"grpcio==1.62.3" # avoid strange log output in newer versions
]

databricks = [
Expand Down
9 changes: 8 additions & 1 deletion tests/fixtures/export/datacontract.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -65,4 +65,11 @@ models:
enum:
- pending
- shipped
- delivered
- delivered
quality:
- type: sql
description: Row Count
query: |
SELECT COUNT(*) AS row_count
FROM orders
mustBeGreaterThan: 1000
10 changes: 10 additions & 0 deletions tests/fixtures/odcs_v3/full-example.datacontract.yml
Original file line number Diff line number Diff line change
Expand Up @@ -81,6 +81,16 @@ models:
partitionKeyPosition: -1
criticalDataElement: false
encryptedName: rcvr_cntry_code_encrypted
quality:
- type: library
description: Ensure row count is within expected volume range
rule: countCheck
dimension: completeness
method: reconciliation
severity: error
businessImpact: operational
schedule: 0 20 * * *
scheduler: cron
config:
dataGranularityDescription: Aggregation on columns txn_ref_dt, pmt_txn_id
servicelevels:
Expand Down
2 changes: 1 addition & 1 deletion tests/fixtures/postgres/datacontract.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ servers:
my-dataproduct/postgres:
type: postgres
host: localhost
port: __PORT__
port: 5432
database: test
schema: public
models:
Expand Down
2 changes: 1 addition & 1 deletion tests/fixtures/postgres/datacontract_case_sensitive.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ servers:
my-dataproduct/postgres:
type: postgres
host: localhost
port: __PORT__
port: 5432
database: test
schema: public
models:
Expand Down
2 changes: 1 addition & 1 deletion tests/fixtures/postgres/odcs.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -48,4 +48,4 @@ servers:
database: test
schema: public
host: localhost
port: __PORT__
port: 5432
5 changes: 2 additions & 3 deletions tests/fixtures/quality/datacontract.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ servers:
my-dataproduct/postgres:
type: postgres
host: localhost
port: __PORT__
port: 5432
database: test
schema: public
models:
Expand All @@ -28,8 +28,7 @@ models:
- type: sql
description: 95% of all order total values are expected to be between 10 and 499 EUR.
dialect: postgres
query: |
SELECT percentile_cont(0.95) WITHIN GROUP (ORDER BY field_two) AS percentile_95 FROM my_table
query: SELECT percentile_cont(0.95) WITHIN GROUP (ORDER BY field_two) AS percentile_95 FROM my_table
mustBeBetween: [ 1000, 49900 ]
field_three:
type: timestamp
Expand Down
7 changes: 7 additions & 0 deletions tests/test_export_odcs_v3.py
Original file line number Diff line number Diff line change
Expand Up @@ -77,6 +77,13 @@ def test_to_odcs():
logicalType: string
physicalType: text
isNullable: false
quality:
- type: sql
description: Row Count
query: |
SELECT COUNT(*) AS row_count
FROM orders
mustBeGreaterThan: 1000
servers:
- server: production
Expand Down
Loading

0 comments on commit 0f07280

Please sign in to comment.