Improving datamap performance (#5601)

ethyca · Dec 16, 2024 · fa1c416 · fa1c416
1 parent 0030db7
commit fa1c416
Show file tree

Hide file tree

Showing 7 changed files with 317 additions and 71 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -24,6 +24,7 @@ The types of changes are:
 
 ### Fixed
 - Fixing quickstart.py script [#5585](https://github.com/ethyca/fides/pull/5585)
+- Fixed miscellaneous performance issues with Systems and PrivacyDeclarations [#5601](https://github.com/ethyca/fides/pull/5601)
 
 ### Changed
 - Adjusted Ant's Select component colors and icon [#5594](https://github.com/ethyca/fides/pull/5594)

diff --git a/clients/admin-ui/src/features/datamap/reporting/DatamapReportTableColumns.tsx b/clients/admin-ui/src/features/datamap/reporting/DatamapReportTableColumns.tsx
@@ -426,49 +426,65 @@ export const getDatamapReportColumns = ({
     columnHelper.accessor((row) => row.system_undeclared_data_categories, {
       id: COLUMN_IDS.SYSTEM_UNDECLARED_DATA_CATEGORIES,
       cell: (props) => {
-        const value = props.getValue();
-
+        const cellValues = props.getValue();
+        if (!cellValues || cellValues.length === 0) {
+          return null;
+        }
+        const values = isArray(cellValues)
+          ? cellValues.map((value) => {
+              return { label: getDataCategoryDisplayName(value), key: value };
+            })
+          : [
+              {
+                label: getDataCategoryDisplayName(cellValues),
+                key: cellValues,
+              },
+            ];
         return (
-          <GroupCountBadgeCell
-            ignoreZero
-            suffix="system undeclared data categories"
-            value={
-              isArray(value)
-                ? map(value, getDataCategoryDisplayName)
-                : getDataCategoryDisplayName(value || "")
-            }
-            badgeProps={{ variant: "outline" }}
-            {...props}
+          <BadgeCellExpandable
+            values={values}
+            cellProps={props as any}
+            variant="outline"
           />
         );
       },
       meta: {
         showHeaderMenu: !isRenaming,
+        showHeaderMenuWrapOption: true,
         width: "auto",
+        overflow: "hidden",
       },
     }),
     columnHelper.accessor((row) => row.data_use_undeclared_data_categories, {
       id: COLUMN_IDS.DATA_USE_UNDECLARED_DATA_CATEGORIES,
       cell: (props) => {
-        const value = props.getValue();
-
+        const cellValues = props.getValue();
+        if (!cellValues || cellValues.length === 0) {
+          return null;
+        }
+        const values = isArray(cellValues)
+          ? cellValues.map((value) => {
+              return { label: getDataCategoryDisplayName(value), key: value };
+            })
+          : [
+              {
+                label: getDataCategoryDisplayName(cellValues),
+                key: cellValues,
+              },
+            ];
         return (
-          <GroupCountBadgeCell
-            ignoreZero
-            suffix="data use undeclared data categories"
-            value={
-              isArray(value)
-                ? map(value, getDataCategoryDisplayName)
-                : getDataCategoryDisplayName(value || "")
-            }
-            badgeProps={{ variant: "outline" }}
-            {...props}
+          <BadgeCellExpandable
+            values={values}
+            cellProps={props as any}
+            variant="outline"
           />
         );
       },
       meta: {
         showHeaderMenu: !isRenaming,
+        showHeaderMenuWrapOption: true,
         width: "auto",
+        overflow: "hidden",
       },
     }),
     columnHelper.accessor((row) => row.cookies, {

diff --git a/src/fides/api/models/sql_models.py b/src/fides/api/models/sql_models.py
@@ -27,6 +27,7 @@
     UniqueConstraint,
     case,
     cast,
+    func,
     select,
     text,
     type_coerce,
@@ -36,7 +37,7 @@
 from sqlalchemy.ext.asyncio import AsyncSession, async_object_session
 from sqlalchemy.ext.hybrid import hybrid_property
 from sqlalchemy.orm import Session, relationship
-from sqlalchemy.sql import Select, func
+from sqlalchemy.sql import Select
 from sqlalchemy.sql.elements import Case
 from sqlalchemy.sql.sqltypes import DateTime
 from typing_extensions import Protocol, runtime_checkable
@@ -404,15 +405,6 @@ class System(Base, FidesBase):
         "Cookies", back_populates="system", lazy="selectin", uselist=True, viewonly=True
     )
 
-    # index scan using ix_ctl_datasets_fides_key on ctl_datasets
-    datasets = relationship(
-        "Dataset",
-        primaryjoin="foreign(Dataset.fides_key)==any_(System.dataset_references)",
-        lazy="selectin",
-        uselist=True,
-        viewonly=True,
-    )
-
     @classmethod
     def get_data_uses(
         cls: Type[System], systems: List[System], include_parents: bool = True
@@ -430,11 +422,21 @@ def get_data_uses(
                         data_uses.add(data_use)
         return data_uses
 
-    @property
-    def undeclared_data_categories(self) -> Set[str]:
+    def dataset_data_categories(self, data_categories: Dict[str, Set[str]]) -> Set[str]:
+        """
+        Returns the union of the data categories of every dataset referenced by this system.
+
+        `data_categories` maps a dataset key to the set of data categories for that dataset.
+        Referenced dataset keys that are missing from the map contribute nothing, and a
+        system whose `dataset_references` is empty or None yields an empty set.
+        """
+        aggregate = set()
+        for dataset_key in self.dataset_references or []:
+            aggregate.update(data_categories.get(dataset_key, set()))
+        return aggregate
+
+    def undeclared_data_categories(
+        self, data_categories: Dict[str, Set[str]]
+    ) -> Set[str]:
         """
         Returns a set of data categories defined on the system's datasets
         that are not associated with any data use (privacy declaration).
+
+        Looks up the unique set of data categories for a given dataset from the pre-computed data_categories map.
+        This is done to improve performance.
         """
 
         privacy_declaration_data_categories = set()
@@ -443,9 +445,9 @@ def undeclared_data_categories(self) -> Set[str]:
                 privacy_declaration.data_categories
             )
 
-        system_dataset_data_categories = set()
-        for dataset in self.datasets:
-            system_dataset_data_categories.update(dataset.field_data_categories)
+        system_dataset_data_categories = set(
+            self.dataset_data_categories(data_categories)
+        )
 
         return find_undeclared_categories(
             system_dataset_data_categories, privacy_declaration_data_categories
@@ -501,13 +503,6 @@ class PrivacyDeclaration(Base):
     cookies = relationship(
         "Cookies", back_populates="privacy_declaration", lazy="joined", uselist=True
     )
-    datasets = relationship(
-        "Dataset",
-        primaryjoin="foreign(Dataset.fides_key)==any_(PrivacyDeclaration.dataset_references)",
-        lazy="selectin",
-        uselist=True,
-        viewonly=True,
-    )
 
     @classmethod
     def create(
@@ -547,21 +542,29 @@ def purpose(cls) -> Case:
             else_=None,
         )
 
-    @property
-    def undeclared_data_categories(self) -> Set[str]:
+    def dataset_data_categories(self, data_categories: Dict[str, Set[str]]) -> Set[str]:
+        """
+        Returns the union of the data categories of every dataset referenced by this privacy declaration.
+
+        `data_categories` maps a dataset key to the set of data categories for that dataset.
+        Referenced dataset keys that are missing from the map contribute nothing, and a
+        privacy declaration whose `dataset_references` is empty or None yields an empty set.
+        """
+        aggregate = set()
+        for dataset_key in self.dataset_references or []:
+            aggregate.update(data_categories.get(dataset_key, set()))
+        return aggregate
+
+    def undeclared_data_categories(
+        self, data_categories: Dict[str, Set[str]]
+    ) -> Set[str]:
         """
         Aggregates a unique set of data categories across the collections in the associated datasets and
         returns the data categories that are not defined directly on this or any sibling privacy declarations.
+
+        Looks up the unique set of data categories for a given dataset from the pre-computed data_categories map.
+        This is done to improve performance.
         """
 
         # Note: This method evaluates the data categories attached to the datasets associated with this specific
         # privacy declaration. However, the search space for identifying undeclared data categories includes all
         # data categories across this privacy declaration and its sibling privacy declarations.
 
         # all data categories from the datasets
-        dataset_data_categories = set()
-        for dataset in self.datasets:
-            dataset_data_categories.update(dataset.field_data_categories)
+        dataset_data_categories = set(self.dataset_data_categories(data_categories))
 
         # all data categories specified directly on this and sibling privacy declarations
         declared_data_categories = set()

diff --git a/src/fides/api/util/data_category.py b/src/fides/api/util/data_category.py
@@ -1,14 +1,16 @@
 from enum import Enum as EnumType
-from typing import List, Type
+from typing import Dict, List, Set, Type
 
 from fideslang.default_taxonomy import DEFAULT_TAXONOMY
 from fideslang.validation import FidesKey
+from sqlalchemy import func, select, text
 from sqlalchemy.orm import Session
 
 from fides.api import common_exceptions
 
 from fides.api.models.sql_models import (  # type: ignore[attr-defined] # isort: skip
     DataCategory as DataCategoryDbModel,
+    Dataset,
 )
 
 
@@ -89,3 +91,31 @@ def filter_data_categories(
         }
         return sorted(list(default_categories))
     return sorted(user_categories)
+
+
+def get_data_categories_map(db: Session) -> Dict[str, Set[str]]:
+    """
+    Returns a map of all datasets, where the keys are the fides keys
+    of each dataset and the value is a set of data categories associated with each dataset
+    """
+
+    subquery = (
+        select(
+            Dataset.fides_key,
+            func.jsonb_array_elements_text(
+                text(
+                    "jsonb_path_query(collections::jsonb, '$.** ? (@.data_categories != null).data_categories')"
+                )
+            ).label("category"),
+        ).select_from(Dataset)
+    ).cte()
+
+    query = (
+        select(
+            [subquery.c.fides_key, func.array_agg(func.distinct(subquery.c.category))]
+        )
+        .select_from(subquery)
+        .group_by(subquery.c.fides_key)
+    )
+    result = db.execute(query)
+    return {key: set(value) if value else set() for key, value in result.all()}