Skip to content

Commit

Permalink
Improving datamap performance (#5601)
Browse files Browse the repository at this point in the history
  • Loading branch information
galvana authored Dec 16, 2024
1 parent 0030db7 commit fa1c416
Show file tree
Hide file tree
Showing 7 changed files with 317 additions and 71 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ The types of changes are:

### Fixed
- Fixing quickstart.py script [#5585](https://github.com/ethyca/fides/pull/5585)
- Fixed miscellaneous performance issues with Systems and PrivacyDeclarations [#5601](https://github.com/ethyca/fides/pull/5601)

### Changed
- Adjusted Ant's Select component colors and icon [#5594](https://github.com/ethyca/fides/pull/5594)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -426,49 +426,65 @@ export const getDatamapReportColumns = ({
columnHelper.accessor((row) => row.system_undeclared_data_categories, {
id: COLUMN_IDS.SYSTEM_UNDECLARED_DATA_CATEGORIES,
cell: (props) => {
const value = props.getValue();

const cellValues = props.getValue();
if (!cellValues || cellValues.length === 0) {
return null;
}
const values = isArray(cellValues)
? cellValues.map((value) => {
return { label: getDataCategoryDisplayName(value), key: value };
})
: [
{
label: getDataCategoryDisplayName(cellValues),
key: cellValues,
},
];
return (
<GroupCountBadgeCell
ignoreZero
suffix="system undeclared data categories"
value={
isArray(value)
? map(value, getDataCategoryDisplayName)
: getDataCategoryDisplayName(value || "")
}
badgeProps={{ variant: "outline" }}
{...props}
<BadgeCellExpandable
values={values}
cellProps={props as any}
variant="outline"
/>
);
},
meta: {
showHeaderMenu: !isRenaming,
showHeaderMenuWrapOption: true,
width: "auto",
overflow: "hidden",
},
}),
columnHelper.accessor((row) => row.data_use_undeclared_data_categories, {
id: COLUMN_IDS.DATA_USE_UNDECLARED_DATA_CATEGORIES,
cell: (props) => {
const value = props.getValue();

const cellValues = props.getValue();
if (!cellValues || cellValues.length === 0) {
return null;
}
const values = isArray(cellValues)
? cellValues.map((value) => {
return { label: getDataCategoryDisplayName(value), key: value };
})
: [
{
label: getDataCategoryDisplayName(cellValues),
key: cellValues,
},
];
return (
<GroupCountBadgeCell
ignoreZero
suffix="data use undeclared data categories"
value={
isArray(value)
? map(value, getDataCategoryDisplayName)
: getDataCategoryDisplayName(value || "")
}
badgeProps={{ variant: "outline" }}
{...props}
<BadgeCellExpandable
values={values}
cellProps={props as any}
variant="outline"
/>
);
},
meta: {
showHeaderMenu: !isRenaming,
showHeaderMenuWrapOption: true,
width: "auto",
overflow: "hidden",
},
}),
columnHelper.accessor((row) => row.cookies, {
Expand Down
57 changes: 30 additions & 27 deletions src/fides/api/models/sql_models.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@
UniqueConstraint,
case,
cast,
func,
select,
text,
type_coerce,
Expand All @@ -36,7 +37,7 @@
from sqlalchemy.ext.asyncio import AsyncSession, async_object_session
from sqlalchemy.ext.hybrid import hybrid_property
from sqlalchemy.orm import Session, relationship
from sqlalchemy.sql import Select, func
from sqlalchemy.sql import Select
from sqlalchemy.sql.elements import Case
from sqlalchemy.sql.sqltypes import DateTime
from typing_extensions import Protocol, runtime_checkable
Expand Down Expand Up @@ -404,15 +405,6 @@ class System(Base, FidesBase):
"Cookies", back_populates="system", lazy="selectin", uselist=True, viewonly=True
)

# index scan using ix_ctl_datasets_fides_key on ctl_datasets
datasets = relationship(
"Dataset",
primaryjoin="foreign(Dataset.fides_key)==any_(System.dataset_references)",
lazy="selectin",
uselist=True,
viewonly=True,
)

@classmethod
def get_data_uses(
cls: Type[System], systems: List[System], include_parents: bool = True
Expand All @@ -430,11 +422,21 @@ def get_data_uses(
data_uses.add(data_use)
return data_uses

@property
def undeclared_data_categories(self) -> Set[str]:
def dataset_data_categories(self, data_categories: Dict[str, Set[str]]) -> Set[str]:
    """
    Returns the union of the data categories of every dataset referenced by this system.

    `data_categories` is a pre-computed map of dataset fides key to that dataset's
    set of data categories. Referenced keys that are missing from the map
    contribute nothing, and a missing/empty `dataset_references` yields an empty set.
    """
    referenced_keys = self.dataset_references or []
    return set().union(
        *(data_categories.get(key, set()) for key in referenced_keys)
    )

def undeclared_data_categories(
self, data_categories: Dict[str, Set[str]]
) -> Set[str]:
"""
Returns a set of data categories defined on the system's datasets
that are not associated with any data use (privacy declaration).
Looks up the unique set of data categories for a given dataset from the pre-computed data_categories map.
This is done to improve performance.
"""

privacy_declaration_data_categories = set()
Expand All @@ -443,9 +445,9 @@ def undeclared_data_categories(self) -> Set[str]:
privacy_declaration.data_categories
)

system_dataset_data_categories = set()
for dataset in self.datasets:
system_dataset_data_categories.update(dataset.field_data_categories)
system_dataset_data_categories = set(
self.dataset_data_categories(data_categories)
)

return find_undeclared_categories(
system_dataset_data_categories, privacy_declaration_data_categories
Expand Down Expand Up @@ -501,13 +503,6 @@ class PrivacyDeclaration(Base):
cookies = relationship(
"Cookies", back_populates="privacy_declaration", lazy="joined", uselist=True
)
datasets = relationship(
"Dataset",
primaryjoin="foreign(Dataset.fides_key)==any_(PrivacyDeclaration.dataset_references)",
lazy="selectin",
uselist=True,
viewonly=True,
)

@classmethod
def create(
Expand Down Expand Up @@ -547,21 +542,29 @@ def purpose(cls) -> Case:
else_=None,
)

@property
def undeclared_data_categories(self) -> Set[str]:
def dataset_data_categories(self, data_categories: Dict[str, Set[str]]) -> Set[str]:
    """
    Collects the unique data categories of all datasets referenced by this privacy declaration.

    Each key in `dataset_references` is looked up in the pre-computed
    `data_categories` map (dataset fides key -> set of data categories);
    keys absent from the map are ignored.
    """
    return {
        category
        for dataset_key in (self.dataset_references or [])
        for category in data_categories.get(dataset_key, set())
    }

def undeclared_data_categories(
self, data_categories: Dict[str, Set[str]]
) -> Set[str]:
"""
Aggregates a unique set of data categories across the collections in the associated datasets and
returns the data categories that are not defined directly on this or any sibling privacy declarations.
Looks up the unique set of data categories for a given dataset from the pre-computed data_categories map.
This is done to improve performance.
"""

# Note: This method evaluates the data categories attached to the datasets associated with this specific
# privacy declaration. However, the search space for identifying undeclared data categories includes all
# data categories across this privacy declaration and its sibling privacy declarations.

# all data categories from the datasets
dataset_data_categories = set()
for dataset in self.datasets:
dataset_data_categories.update(dataset.field_data_categories)
dataset_data_categories = set(self.dataset_data_categories(data_categories))

# all data categories specified directly on this and sibling privacy declarations
declared_data_categories = set()
Expand Down
32 changes: 31 additions & 1 deletion src/fides/api/util/data_category.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,16 @@
from enum import Enum as EnumType
from typing import List, Type
from typing import Dict, List, Set, Type

from fideslang.default_taxonomy import DEFAULT_TAXONOMY
from fideslang.validation import FidesKey
from sqlalchemy import func, select, text
from sqlalchemy.orm import Session

from fides.api import common_exceptions

from fides.api.models.sql_models import ( # type: ignore[attr-defined] # isort: skip
DataCategory as DataCategoryDbModel,
Dataset,
)


Expand Down Expand Up @@ -89,3 +91,31 @@ def filter_data_categories(
}
return sorted(list(default_categories))
return sorted(user_categories)


def get_data_categories_map(db: Session) -> Dict[str, Set[str]]:
    """
    Returns a map of datasets to their data categories, where each key is the
    fides key of a dataset and each value is the set of unique data categories
    found anywhere within that dataset's collections.

    The aggregation is performed in a single database query so callers can look
    up a dataset's data categories without loading the datasets themselves.

    NOTE(review): datasets with no data categories presumably produce no rows
    and are therefore absent from the map; callers should use
    `.get(fides_key, set())` rather than indexing directly.
    """

    # One (fides_key, category) row per element of every `data_categories`
    # array found at any depth of the dataset's `collections` JSON.
    categories_cte = (
        select(
            Dataset.fides_key,
            func.jsonb_array_elements_text(
                text(
                    "jsonb_path_query(collections::jsonb, '$.** ? (@.data_categories != null).data_categories')"
                )
            ).label("category"),
        ).select_from(Dataset)
    ).cte()

    # Positional select() arguments, matching the CTE above; the legacy
    # list form `select([...])` is not accepted by SQLAlchemy 2.0.
    query = (
        select(
            categories_cte.c.fides_key,
            func.array_agg(func.distinct(categories_cte.c.category)),
        )
        .select_from(categories_cte)
        .group_by(categories_cte.c.fides_key)
    )
    result = db.execute(query)
    return {key: set(value) if value else set() for key, value in result.all()}
Loading

0 comments on commit fa1c416

Please sign in to comment.