diff --git a/pyfuzon/python/pyfuzon/__init__.py b/pyfuzon/python/pyfuzon/__init__.py index fda4e6b..69d6b01 100644 --- a/pyfuzon/python/pyfuzon/__init__.py +++ b/pyfuzon/python/pyfuzon/__init__.py @@ -5,3 +5,4 @@ __all__ = pyfuzon.__all__ from .matcher import TermMatcher +from .cache import get_cache_key, get_cache_path diff --git a/pyfuzon/python/pyfuzon/cache.py b/pyfuzon/python/pyfuzon/cache.py new file mode 100644 index 0000000..5521f62 --- /dev/null +++ b/pyfuzon/python/pyfuzon/cache.py @@ -0,0 +1,28 @@ +"""Caching utilities for pyfuzon. + +This module provides functions to help manage the cache of TermMatchers. +Cache keys are built by fuzon using the sorted source paths. For each path, +a stamp is computed as follows (missing values are replaced by empty strings): + + file path: {path}-{size}-{last-modified-datetime} + + url: {url}-{etag-checksum}-{last-modified-datetime} +All stamps are then concatenated and a hash of the result is used as the cache key. + +Cache paths adhere to the following specifications: + + Linux: XDG base / user directory + + Windows: Known folder API + + macOS: Standard Directories guidelines +For more information, see: https://github.com/dirs-dev/dirs-rs +""" + +from pathlib import Path + +from .pyfuzon import get_cache_key as _get_cache_key +from .pyfuzon import get_cache_path as _get_cache_path + +def get_cache_key(sources: list[str]) -> str: + """Return a deterministic cache key based on a collection of source paths.""" + return _get_cache_key(sources) + +def get_cache_path(sources: list[str]) -> Path: + """Return a full platform-specific cache path based on a collection of source paths.""" + return Path(_get_cache_path(sources)) diff --git a/pyfuzon/python/pyfuzon/matcher.py b/pyfuzon/python/pyfuzon/matcher.py index 9b36e0e..466e9fb 100644 --- a/pyfuzon/python/pyfuzon/matcher.py +++ b/pyfuzon/python/pyfuzon/matcher.py @@ -1,5 +1,6 @@ from dataclasses import dataclass from typing import Self + from dataclasses import dataclass from 
pyfuzon import Term, score_terms, parse_files, load_terms, dump_terms @@ -10,7 +11,7 @@ class TermMatcher: """Fuzzy matches terms from RDF terminologies to input queries.""" terms: list[Term] - + def top(self, query: str, n: int=5) -> list[Term]: """Return the n terms most similar to input query.""" return self.rank(query)[:n] @@ -19,7 +20,7 @@ def rank(self, query: str) -> list[Term]: """Return all terms, ranked by query similarity.""" scores = self.score(query) ranks = [ - i[0] for i in + i[0] for i in sorted(enumerate(scores), key=lambda x:x[1], reverse=True) ] return [self.terms[rank] for rank in ranks] @@ -45,4 +46,3 @@ def load(cls, path): def dump(self, path): """Serialize to disk.""" dump_terms(self.terms, path) -