feat(py): expose python caching utilities

sdsc-ordes · Oct 21, 2024 · f3cd9b1 · f3cd9b1
1 parent 6131ad5
commit f3cd9b1
Show file tree

Hide file tree

Showing 3 changed files with 32 additions and 3 deletions.
diff --git a/pyfuzon/python/pyfuzon/__init__.py b/pyfuzon/python/pyfuzon/__init__.py
@@ -5,3 +5,4 @@
     __all__ = pyfuzon.__all__
 
 from .matcher import TermMatcher
+from .cache import get_cache_key, get_cache_path
diff --git a/pyfuzon/python/pyfuzon/cache.py b/pyfuzon/python/pyfuzon/cache.py
@@ -0,0 +1,28 @@
+"""Caching utilities for pyfuzon.
+
+This module provides functions to help manage the cache of TermMatchers.
+Cache keys are built by fuzon using the sorted source paths. For each path,
+a stamp is computed as follows (missing values are replaced by empty strings):
+    + file path: {path}-{size}-{last-modified-datetime}
+    + url: {url}-{etag-checksum}-{last-modified-datetime}
+All stamps are then concatenated, and a hash of the result is used as the cache key.
+
+Cache paths adhere to the following specifications:
+    + Linux: XDG base / user directory
+    + Windows: Known folder API
+    + macOS: Standard Directories guidelines
+For more information, see: https://github.com/dirs-dev/dirs-rs
+"""
+
+from pathlib import Path
+
+from .pyfuzon import get_cache_key as _get_cache_key
+from .pyfuzon import get_cache_path as _get_cache_path
+
+def get_cache_key(sources: list[str]) -> str:
+    """Return the deterministic cache key that fuzon derives from the given source file paths or URLs."""
+    return _get_cache_key(sources)
+
+def get_cache_path(sources: list[str]) -> Path:
+    """Return the full platform-specific cache path for the given sources, wrapped in a pathlib.Path."""
+    return Path(_get_cache_path(sources))
diff --git a/pyfuzon/python/pyfuzon/matcher.py b/pyfuzon/python/pyfuzon/matcher.py
@@ -1,5 +1,6 @@
 from dataclasses import dataclass
 from typing import Self
+
 from dataclasses import dataclass
 
 from pyfuzon import Term, score_terms, parse_files, load_terms, dump_terms
@@ -10,7 +11,7 @@ class TermMatcher:
     """Fuzzy matches terms from RDF terminologies to input queries."""
 
     terms: list[Term]
-    
+
     def top(self, query: str, n: int=5) -> list[Term]:
         """Return the n terms most similar to input query."""
         return self.rank(query)[:n]
@@ -19,7 +20,7 @@ def rank(self, query: str) -> list[Term]:
         """Return all terms, ranked by query similarity."""
         scores = self.score(query)
         ranks = [
-            i[0] for i in 
+            i[0] for i in
             sorted(enumerate(scores), key=lambda x:x[1], reverse=True)
         ]
         return [self.terms[rank] for rank in ranks]
@@ -45,4 +46,3 @@ def load(cls, path):
     def dump(self, path):
         """Serialize to disk."""
         dump_terms(self.terms, path)
-
Original file line number	Diff line number	Diff line change
Expand Up		@@ -5,3 +5,4 @@
		__all__ = pyfuzon.__all__

		from .matcher import TermMatcher
		from .cache import get_cache_key, get_cache_path