Changes for manuscript update. #1

Merged · 11 commits · Aug 15, 2024
2 changes: 1 addition & 1 deletion .github/workflows/Wheels.yml
@@ -47,7 +47,7 @@ jobs:
run: |
python -m pip install --upgrade pip
pip install -r requirements.txt
-pip install twine flake8
+pip install twine flake8 Cython

- name: Lint with flake8
run: |
16 changes: 13 additions & 3 deletions CHANGELOG.txt
@@ -1,6 +1,16 @@
+v0.1.3
+This version matches the first major revision of our paper:
+- Notable changes to eccentricity definition and branch hierarchy
+  simplification. Eccentricity is now directly defined by distance to
+  cluster centroids.
+- Fixes to branch-membership centrality measure and the update labelling
+  function. It no longer changes labels of non-noise points.
+- New and updated figures included in the paper.
+- Re-evaluated all benchmark notebooks.
+- Removed Cython as install dependency.
 v0.1.2
-Update Github Actions
+Update Github Actions.
 v0.1.1
-Fix typo in readme
+Fix typo in readme.
 v0.1.0
-Initial version
+Initial version.
2 changes: 2 additions & 0 deletions README.md
@@ -73,6 +73,7 @@ pip install -t .

A scientific publication of this algorithm is available on Arxiv:

+```bibtex
@misc{bot2023flasc,
title={FLASC: A Flare-Sensitive Clustering Algorithm: Extending HDBSCAN* for Detecting Branches in Clusters},
author={D. M. Bot and J. Peeters and J. Liesenborgs and J. Aerts},
@@ -81,6 +82,7 @@ A scientific publication of this algorithm is available on Arxiv:
archivePrefix={arXiv},
primaryClass={cs.LG}
}
+```

This FLASC algorithm and software package is very closely related to McInnes et
al.'s HDBSCAN\* software package. If you wish to cite the HDBSCAN\* package in a
27 changes: 13 additions & 14 deletions flasc/_flasc.py
@@ -65,13 +65,13 @@ def flasc(
"""Performs FLASC clustering with flare detection post-processing step.

FLASC - Flare-Sensitive Clustering.
-Performs :py:mod:`hdbscan` clustering [1]_ with a post-processing step to
+Performs :py:mod:`hdbscan` clustering [1]_ with a post-processing step to
 detect branches within individual clusters. For each cluster, a graph is
 constructed connecting the data points based on their mutual reachability
-distances. Each edge is given a centrality value based on how many edges
-need to be traversed to reach the cluster's root point from the edge. Then,
-the edges are clustered as if that centrality was a density, progressively
-removing the 'centre' of each cluster and seeing how many branches remain.
+distances. Each edge is given a centrality value based on how far it lies
+from the cluster's center. Then, the edges are clustered as if that
+centrality was a distance, progressively removing the 'center' of each
+cluster and seeing how many branches remain.

Parameters
----------
@@ -167,7 +167,7 @@ def flasc(

allow_single_branch : bool, optional (default=False)
Analogous to ``allow_single_cluster``. Note that depending on
-``label_sides_as_branches`` FFLASC requires at least 3 branches to
+``label_sides_as_branches`` FLASC requires at least 3 branches to
exist in a cluster before they are incorporated in the final labelling.

branch_detection_method : str, optional (default=``full``)
@@ -186,18 +186,18 @@

branch_selection_method : str, optional (default='eom')
The method used to select branches from the cluster's condensed tree.
-The standard approach for FFLASC is to use the ``eom`` approach.
+The standard approach for FLASC is to use the ``eom`` approach.
Options are:
* ``eom``
* ``leaf``

branch_selection_persistence: float, optional (default=0.0)
-A centrality persistence threshold. Branches with a persistence below
+An eccentricity persistence threshold. Branches with a persistence below
 this value will be merged. See [3]_ for more information. Note that this
-should not be used if we want to predict the cluster labels for new
-points in future (e.g. using approximate_predict), as the
-:func:`~flasc.prediction.approximate_predict` function
-is not aware of this argument.
+should not be used if we want to predict the cluster labels for new
+points in future (e.g. using approximate_predict), as the
+:func:`~flasc.prediction.approximate_predict` function is not aware of
+this argument.

max_branch_size : int, optional (default=0)
A limit to the size of clusters returned by the ``eom`` algorithm.
@@ -287,8 +287,7 @@ def flasc(
assigned 0.

branch_persistences : tuple (n_clusters)
-A branch persistence for each cluster produced during the branch
-detection step.
+A branch persistence (eccentricity range) for each detected branch.

condensed_tree : record array
The condensed cluster hierarchy used to generate clusters.
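The rewritten docstring above defines in-cluster eccentricity directly as distance to the cluster centroid, and the `_flasc_branches.pyx` diff below computes a probability-weighted centroid with `np.average`. A minimal NumPy sketch of that definition (the helper name is hypothetical, for illustration only):

```python
import numpy as np

def centroid_eccentricity(points, probabilities):
    # Hypothetical helper: eccentricity of each point is its distance
    # to the probability-weighted cluster centroid.
    centroid = np.average(points, weights=probabilities, axis=0)
    return np.linalg.norm(points - centroid, axis=1)

# Three collinear points: the middle point is least eccentric.
pts = np.array([[0.0], [1.0], [2.0]])
ecc = centroid_eccentricity(pts, np.ones(3))
# ecc == [1.0, 0.0, 1.0]
```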
11 changes: 6 additions & 5 deletions flasc/_flasc_branches.pyx
@@ -13,7 +13,7 @@ from hdbscan._hdbscan_tree import compute_stability, condense_tree
from ._hdbscan_dist_metrics import DistanceMetric
from ._hdbscan_dist_metrics cimport DistanceMetric
from ._flasc_linkage import label
-from ._flasc_tree import get_clusters
+from ._flasc_tree import get_clusters, simplify_hierarchy
from ._flasc_edges import (
_fill_edge_centrality,
_relabel_edges_with_data_ids,
@@ -85,7 +85,7 @@ def _compute_branch_linkage_of_cluster(
points = space_tree.data.base[cluster_points]
centroid = np.average(points, weights=cluster_probabilities, axis=0)
centralities = metric_fun.pairwise(centroid[None], points)[0, :]
-centralities = centralities.max() - centralities
+centralities = 1 / centralities

Collaborator Author: Simplify algorithm by making in-cluster eccentricity be the distance to centroid.

# within cluster ids
cdef np.ndarray[np.double_t, ndim=1] cluster_ids = np.full(num_points, -1, dtype=np.double)
@@ -136,15 +136,16 @@ def _compute_branch_segmentation_of_cluster(
cdef np.ndarray condensed_tree = condense_tree(
single_linkage_tree.base, min_branch_size
)
+if branch_selection_persistence > 0.0:
+    condensed_tree = simplify_hierarchy(condensed_tree, branch_selection_persistence)
cdef dict stability = compute_stability(condensed_tree)
(labels, probabilities, persistences) = get_clusters(
condensed_tree, stability,
allow_single_branch=allow_single_branch,
branch_selection_method=branch_selection_method,
-branch_selection_persistence=branch_selection_persistence,
max_branch_size=max_branch_size
)
# Reset noise labels to 0-cluster
# Reset noise labels to k-cluster
labels[labels < 0] = len(persistences)
return (labels, probabilities, persistences, condensed_tree)

@@ -169,7 +170,7 @@ def _update_labelling(

# Compute the labels and probabilities
cdef Py_ssize_t num_branches = 0
-cdef np.intp_t running_id = 0, cid = 0
+cdef np.intp_t running_id = 0
cdef np.ndarray[np.intp_t, ndim=1] _points, _labels
cdef np.ndarray[np.double_t, ndim=1] _probs, _pers, _depths
for _points, _depths, _labels, _probs, _pers in zip(
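The `# Reset noise labels to k-cluster` change above folds branch-noise points into one extra trailing label, `len(persistences)` (the number of detected branches), rather than leaving them at -1. A small sketch of that relabelling step:

```python
import numpy as np

# Labels as produced by branch detection: -1 marks noise points.
labels = np.array([0, 1, -1, 1, -1])
num_branches = 2  # stands in for len(persistences) in the diff above

# Noise points all receive the single trailing "k" label.
labels[labels < 0] = num_branches
# labels == [0, 1, 2, 1, 2]
```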
10 changes: 4 additions & 6 deletions flasc/_flasc_depths.pyx
@@ -38,8 +38,7 @@ cpdef np.ndarray[np.double_t, ndim=1] _compute_cluster_centralities(

# Traversal variables
cdef np.double_t grand_child = 0
-cdef np.double_t depth=0
-cdef np.double_t max_depth = 0
+cdef np.double_t depth = 0
cdef pair[np.double_t, np.double_t] edge
cdef pair[pair[np.double_t, np.double_t], np.double_t] item
cdef deque[pair[pair[np.double_t, np.double_t], np.double_t]] queue
@@ -49,12 +48,12 @@
# with nogil:
# Queue the root's children
edge.first = <np.double_t> cluster_root
-depths_view[cluster_root] = 0.0
+depths_view[cluster_root] = 1.0
flags[cluster_root] = True
for child in network[cluster_root]:
edge.second = child
item.first = edge
-item.second = 1.0
+item.second = 2.0
queue.push_back(item)
flags[<np.intp_t> child] = True

@@ -69,7 +68,6 @@

# Fill in the depth value, keep track of max
depths_view[<np.intp_t> child] = depth
-max_depth = max(depth, max_depth)

# Enqueue grand-children
item.second += 1.0
@@ -81,4 +79,4 @@
item.first = edge
queue.push_back(item)
flags[<np.intp_t> grand_child] = True
-return max_depth - depths
+return 1 / depths
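The `_compute_cluster_centralities` changes above start the root at depth 1 and return `1 / depths`, so the most central points get values near 1 instead of requiring a `max_depth` pass. A pure-Python sketch of that breadth-first traversal (the adjacency-list input and function name are assumptions for illustration):

```python
from collections import deque

def bfs_centralities(network, root):
    # Breadth-first depths from the cluster root (root at depth 1),
    # inverted so central points receive high centrality values.
    depths = {root: 1.0}
    queue = deque((child, 2.0) for child in network[root])
    while queue:
        node, depth = queue.popleft()
        if node in depths:
            continue  # already visited
        depths[node] = depth
        queue.extend((child, depth + 1.0) for child in network[node])
    return {node: 1.0 / depth for node, depth in depths.items()}

# Chain 0-1-2 rooted at 0: centralities 1, 1/2, 1/3.
cent = bfs_centralities({0: [1], 1: [0, 2], 2: [1]}, 0)
```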
6 changes: 3 additions & 3 deletions flasc/_flasc_edges.pyx
@@ -70,9 +70,6 @@ cpdef _extract_core_approximation_of_cluster(
np.minimum(core_parent, core_children, edges[count:, 0])
np.maximum(core_parent, core_children, edges[count:, 1])

-# Extract unique edges that stay within the cluster
-edges = np.unique(edges[edges[:, 0] > -1.0, :], axis=0)

# Fill mutual reachabilities
edges[:count, 3] = cluster_spanning_tree[:, 2]
# (astype copy more effiecient than manual iteration)
@@ -81,6 +78,9 @@
core_distances[edges[count:, 1].astype(np.intp)],
edges[count:, 3]
)

+# Extract unique edges that stay within the cluster
+edges = np.unique(edges[edges[:, 0] > -1.0, :], axis=0)

Collaborator Author, on lines +81 to +83: Fixes mutual reachability values in the ApproximationGraph.

# Return output
return edges
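The reordering above matters because `np.unique(..., axis=0)` both filters and re-sorts rows: the mutual-reachability column has to be filled while the row order still matches `cluster_spanning_tree` and `core_distances`, and only afterwards can out-of-cluster rows be dropped and duplicates collapsed. A small sketch of the corrected order (the toy edge array is an assumption):

```python
import numpy as np

# Columns: parent, child, unused, mutual reachability.
edges = np.array([
    [0.0, 1.0, 0.0, 0.0],
    [-1.0, 2.0, 0.0, 0.0],  # edge leaving the cluster, to be dropped
    [0.0, 1.0, 0.0, 0.0],   # duplicate of the first edge
])

# Fill reachability values first, while row order is still known...
edges[:, 3] = [0.5, 0.9, 0.5]

# ...then drop out-of-cluster rows and deduplicate.
edges = np.unique(edges[edges[:, 0] > -1.0, :], axis=0)
# one unique in-cluster edge remains, with its reachability intact
```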
2 changes: 1 addition & 1 deletion flasc/_flasc_linkage.pyx
@@ -15,7 +15,7 @@ cpdef np.ndarray[np.double_t, ndim=2] label(
np.intp_t num_points
):
"""Convert an edge list into single linkage hierarchy."""
-# ALlocate output and working structure
+# Allocate output and working structure
cdef np.intp_t N = edges.shape[0]
cdef np.ndarray[np.double_t, ndim=2] result = np.zeros((N, 4))
cdef np.double_t[:, ::1] result_view = result