diff --git a/client-python/elastiknn/api.py b/client-python/elastiknn/api.py index 4a5c3671b..d933779b3 100644 --- a/client-python/elastiknn/api.py +++ b/client-python/elastiknn/api.py @@ -17,6 +17,7 @@ class Similarity(Enum): L1 = 3 L2 = 4 Cosine = 5 + Dot = 6 class Vec: @@ -144,7 +145,24 @@ def to_dict(self): "k": self.k } } + + @dataclass(frozen=True) + class DotLsh(Base): + dims: int + L: int + k: int + def to_dict(self): + return { + "type": "elastiknn_dense_float_vector", + "elastiknn": { + "model": "lsh", + "similarity": "dot", + "dims": self.dims, + "L": self.L, + "k": self.k + } + } @dataclass(frozen=True) class L2Lsh(Base): dims: int @@ -271,6 +289,27 @@ def with_vec(self, vec: Vec.Base): return NearestNeighborsQuery.CosineLsh(field=self.field, vec=vec, similarity=self.similarity, candidates=self.candidates) + @dataclass(frozen=True) + class DotLsh(Base): + field: str + vec: Vec.Base + similarity: Similarity = Similarity.Dot + candidates: int = 1000 + + def to_dict(self): + return { + "field": self.field, + "model": "lsh", + "similarity": self.similarity.name.lower(), + "candidates": self.candidates, + "vec": self.vec.to_dict() + } + + def with_vec(self, vec: Vec.Base): + return NearestNeighborsQuery.DotLsh(field=self.field, vec=vec, similarity=self.similarity, + candidates=self.candidates) + + @dataclass(frozen=True) class L2Lsh(Base): field: str diff --git a/client-python/elastiknn/models.py b/client-python/elastiknn/models.py index 03291076e..410b582c3 100644 --- a/client-python/elastiknn/models.py +++ b/client-python/elastiknn/models.py @@ -91,6 +91,8 @@ def _mk_mapping_query(self, query_params: dict()) -> (Mapping.Base, NearestNeigh return Mapping.DenseFloat(self._dims), NearestNeighborsQuery.Exact(field, dummy, Similarity.L2) elif self._metric == 'cosine': return Mapping.DenseFloat(self._dims), NearestNeighborsQuery.Exact(field, dummy, Similarity.Cosine) + elif self._metric == 'dot': + return Mapping.DenseFloat(self._dims), NearestNeighborsQuery.Exact(field, dummy, Similarity.Dot) elif self._metric == 'jaccard': return Mapping.SparseBool(self._dims), NearestNeighborsQuery.Exact(field, dummy, Similarity.Jaccard) elif self._metric == 'hamming': @@ -103,6 +105,9 @@ def _mk_mapping_query(self, query_params: dict()) -> (Mapping.Base, NearestNeigh elif self._metric == 'cosine': return Mapping.CosineLsh(self._dims, **self._mapping_params), \ NearestNeighborsQuery.CosineLsh(field, dummy, **query_params) + elif self._metric == 'dot': + return Mapping.DotLsh(self._dims, **self._mapping_params), \ + NearestNeighborsQuery.DotLsh(field, dummy, **query_params) elif self._metric == 'hamming': return Mapping.CosineLsh(self._dims, **self._mapping_params), \ NearestNeighborsQuery.HammingLsh(field, dummy, **query_params) diff --git a/client-python/elastiknn/utils.py b/client-python/elastiknn/utils.py index dbba8b32d..897569536 100644 --- a/client-python/elastiknn/utils.py +++ b/client-python/elastiknn/utils.py @@ -13,10 +13,12 @@ ('exact', 'l1'), ('exact', 'l2'), ('exact', 'cosine'), + ('exact', 'dot'), ('exact', 'hamming'), ('exact', 'jaccard'), ('lsh', 'l2'), ('lsh', 'cosine'), + ('lsh', 'dot'), ('lsh', 'jaccard'), ('lsh', 'hamming'), ('permutation_lsh', 'cosine'), diff --git a/docs/_posts/2021-07-30-how-does-elastiknn-work.md b/docs/_posts/2021-07-30-how-does-elastiknn-work.md index 1192aa7f2..8ffe37d74 100644 --- a/docs/_posts/2021-07-30-how-does-elastiknn-work.md +++ b/docs/_posts/2021-07-30-how-does-elastiknn-work.md @@ -43,8 +43,8 @@ The name is a combination of _Elastic_ and _KNN_ 
(K-Nearest Neighbors). The full list of features (copied from the home page) is as follows:
 
 - Datatypes to efficiently store dense and sparse numerical vectors in Elasticsearch documents, including multiple vectors per document.
-- Exact nearest neighbor queries for five similarity functions: [L1](https://en.wikipedia.org/wiki/Taxicab_geometry), [L2](https://en.wikipedia.org/wiki/Euclidean_distance), [Cosine](https://en.wikipedia.org/wiki/Cosine_similarity), [Jaccard](https://en.wikipedia.org/wiki/Jaccard_index), and [Hamming](https://en.wikipedia.org/wiki/Hamming_distance).
-- Approximate queries using [Locality Sensitive Hashing](https://en.wikipedia.org/wiki/Locality-sensitive_hashing) for L2, Cosine, Jaccard, and Hamming similarity.
+- Exact nearest neighbor queries for six similarity functions: [L1](https://en.wikipedia.org/wiki/Taxicab_geometry), [L2](https://en.wikipedia.org/wiki/Euclidean_distance), [Cosine](https://en.wikipedia.org/wiki/Cosine_similarity), [Dot](https://en.wikipedia.org/wiki/Dot_product), [Jaccard](https://en.wikipedia.org/wiki/Jaccard_index), and [Hamming](https://en.wikipedia.org/wiki/Hamming_distance).
+- Approximate queries using [Locality Sensitive Hashing](https://en.wikipedia.org/wiki/Locality-sensitive_hashing) for L2, Cosine, Dot, Jaccard, and Hamming similarity.
 - Integration of nearest neighbor queries with standard Elasticsearch queries.
 - Incremental index updates: start with any number of vectors and incrementally create/update/delete more without ever re-building the entire index.
 - Implementation based on standard Elasticsearch and Lucene primitives, entirely in the JVM. Indexing and querying scale horizontally with Elasticsearch.
@@ -88,13 +88,13 @@ So Java is used for all the CPU-bound LSH models and Lucene abstractions, and Sc
 
 Elasticsearch requires non-negative scores, with higher scores indicating higher relevance.
 
-Elastiknn supports five vector similarity functions (L1, L2, Cosine, Jaccard, and Hamming).
-Three of these are problematic with respect to this scoring requirement.
+Elastiknn supports six vector similarity functions (L1, L2, Cosine, Dot, Jaccard, and Hamming).
+Four of these are problematic with respect to this scoring requirement.
 Specifically, L1 and L2 are generally defined as _distance_ functions, rather than similarity functions, which means that higher relevance (i.e., lower distance) yields _lower_ scores.
 Cosine similarity is defined over $$[-1, 1]$$, and we can't have negative scores.
-
-To work around this, Elastiknn applies simple transformations to produce L1, L2, and Cosine _similarity_ in accordance with the Elasticsearch requirements.
+Dot similarity is only bounded to $$[-1, 1]$$ when the vectors are normalized to unit magnitude, in which case it is equivalent to Cosine similarity; unnormalized vectors can likewise produce negative values.
+To work around this, Elastiknn applies simple transformations to produce L1, L2, Cosine, and Dot _similarity_ in accordance with the Elasticsearch requirements.
 The exact transformations are documented [on the API page](/api/#similarity-scoring).
diff --git a/docs/pages/api.md b/docs/pages/api.md
index c102c7fd2..3c7e1369b 100644
--- a/docs/pages/api.md
+++ b/docs/pages/api.md
@@ -292,6 +292,30 @@ PUT /my-index/_mapping
   }
 }
 ```
+### Dot LSH Mapping
+
+Uses the [Random Projection algorithm](https://en.wikipedia.org/wiki/Locality-sensitive_hashing#Random_projection)
+to hash and store dense float vectors such that they support approximate Dot similarity queries.
+This is equivalent to Cosine similarity when the vectors are normalized to unit length.
+
+The implementation is influenced by Chapter 3 of [Mining Massive Datasets](http://www.mmds.org/).
+
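For intuition, here is a minimal NumPy sketch of the random-projection hashing scheme described above, before the mapping example below. It is illustrative only — the plugin's actual implementation is the new `DotLshModel.java` later in this diff, and names like `dot_lsh_hashes` are hypothetical:

```python
import numpy as np

def dot_lsh_hashes(v: np.ndarray, planes: np.ndarray, L: int, k: int) -> list:
    """Hash a vector into L signatures of k sign bits each (one signature per hash table)."""
    # planes has shape (L * k, dims) and is sampled once per model from a standard Gaussian.
    signs = planes @ v > 0  # keep only the sign of the dot product with each hyperplane
    return [tuple(signs[i * k:(i + 1) * k]) for i in range(L)]

rng = np.random.default_rng(0)
dims, L, k = 100, 99, 1
planes = rng.standard_normal((L * k, dims))

v = rng.standard_normal(dims)
# Scaling a vector by a positive constant never flips the signs, so the hashes
# (and therefore the candidate sets) are invariant to the vector's magnitude.
assert dot_lsh_hashes(v, planes, L, k) == dot_lsh_hashes(3.0 * v, planes, L, k)
```

Because only the sign of each dot product is kept, the hashing step behaves like Cosine LSH; the exact Dot score is then computed on the retrieved candidates.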
+```json
+PUT /my-index/_mapping
+{
+  "properties": {
+    "my_vec": {
+      "type": "elastiknn_dense_float_vector", # 1
+      "elastiknn": {
+        "dims": 100,                          # 2
+        "model": "lsh",                       # 3
+        "similarity": "dot",                  # 4
+        "L": 99,                              # 5
+        "k": 1                                # 6
+      }
+    }
+  }
+}
+```
 
 |#|Description|
 |:--|:--|
@@ -425,7 +449,7 @@ GET /my-index/_search
 ### Compatibility of Vector Types and Similarities
 
 Jaccard and Hamming similarity only work with sparse bool vectors.
-Cosine,[^note-angular-cosine] L1, and L2 similarity only work with dense float vectors.
+Cosine,[^note-angular-cosine] Dot,[^note-dot-product] L1, and L2 similarity only work with dense float vectors.
 The following documentation assume this restriction is known.
 
 These restrictions aren't inherent to the types and algorithms, i.e., you could in theory run cosine similarity on sparse vectors.
@@ -446,9 +470,12 @@ The exact transformations are described below.
 |Jaccard|N/A|0|1.0|
 |Hamming|N/A|0|1.0|
 |Cosine[^note-angular-cosine]|`cosine similarity + 1`|0|2|
+|Dot[^note-dot-product]|`dot similarity + 1`|0|2|
 |L1|`1 / (1 + l1 distance)`|0|1|
 |L2|`1 / (1 + l2 distance)`|0|1|
 
+Dot similarity will produce negative scores if the vectors are not normalized, so normalize your vectors before indexing and querying with Dot similarity.
+
 If you're using the `elastiknn_nearest_neighbors` query with other queries, and the score values are inconvenient (e.g. huge values like 1e6), consider wrapping the query in a [Script Score Query](https://www.elastic.co/guide/en/elasticsearch/reference/current/query-dsl-script-score-query.html), where you can access and transform the `_score` value.
 
 ### Query Vector
@@ -621,6 +648,36 @@ GET /my-index/_search
 |5|Number of candidates per segment. See the section on LSH Search Strategy.|
 |6|Set to true to use the more-like-this heuristic to pick a subset of hashes. Generally faster but still experimental.|
 
+### Dot LSH Query
+
+Retrieve dense float vectors based on approximate Dot similarity.[^note-dot-product]
+
+```json
+GET /my-index/_search
+{
+  "query": {
+    "elastiknn_nearest_neighbors": {
+      "field": "my_vec",               # 1
+      "vec": {                         # 2
+        "values": [0.1, 0.2, 0.3, ...]
+      },
+      "model": "lsh",                  # 3
+      "similarity": "dot",             # 4
+      "candidates": 50                 # 5
+    }
+  }
+}
+```
+
+|#|Description|
+|:--|:--|
+|1|Indexed field. Must use `lsh` mapping model with `dot`[^note-dot-product] similarity.|
+|2|Query vector. Must be literal dense float or a pointer to an indexed dense float vector.|
+|3|Model name.|
+|4|Similarity function.|
+|5|Number of candidates per segment. See the section on LSH Search Strategy.|
+
 ### L1 LSH Query
 
 Not yet implemented.
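The two JSON examples above correspond to the Python client classes added in this diff (`Mapping.DotLsh` and `NearestNeighborsQuery.DotLsh` in `client-python/elastiknn/api.py`). A hedged usage sketch follows; `Vec.DenseFloat` is assumed to be the client's existing literal dense-float wrapper, which is not shown in this diff:

```python
from elastiknn.api import Mapping, NearestNeighborsQuery, Vec

# Mapping for a 100-dimensional dense float field using the new Dot LSH model.
mapping = Mapping.DotLsh(dims=100, L=99, k=1)
print(mapping.to_dict())
# -> {'type': 'elastiknn_dense_float_vector',
#     'elastiknn': {'model': 'lsh', 'similarity': 'dot', 'dims': 100, 'L': 99, 'k': 1}}

# Approximate Dot query against that field; Vec.DenseFloat is assumed to be the
# client's literal dense float vector type (not part of this diff).
query = NearestNeighborsQuery.DotLsh(
    field="my_vec",
    vec=Vec.DenseFloat(values=[0.1, 0.2, 0.3]),
    candidates=50,
)
print(query.to_dict())
# -> {'field': 'my_vec', 'model': 'lsh', 'similarity': 'dot', 'candidates': 50, 'vec': ...}
```

The printed dictionaries mirror the `elastiknn` mapping and `elastiknn_nearest_neighbors` query bodies documented above.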
@@ -707,12 +764,13 @@ The similarity functions are abbreviated (J: Jaccard, H: Hamming, C: Cosine,[^no
 
 #### elastiknn_dense_float_vector
 
-|Model / Query                   |Exact         |Cosine LSH |L2 LSH |Permutation LSH|
-|:--                             |:--           |:--        |:--    |:--            |
-|Exact (i.e. no model specified) |✔ (C, L1, L2) |x          |x      |x              |
-|Cosine LSH                      |✔ (C, L1, L2) |✔          |x      |x              |
-|L2 LSH                          |✔ (C, L1, L2) |x          |✔      |x              |
-|Permutation LSH                 |✔ (C, L1, L2) |x          |x      |✔              |
+|Model / Query                   |Exact            |Cosine LSH |Dot LSH |L2 LSH |Permutation LSH|
+|:--                             |:--              |:--        |:--     |:--    |:--            |
+|Exact (i.e. no model specified) |✔ (C, D, L1, L2) |x          |x       |x      |x              |
+|Cosine LSH                      |✔ (C, D, L1, L2) |✔          |x       |x      |x              |
+|Dot LSH                         |✔ (C, D, L1, L2) |x          |✔       |x      |x              |
+|L2 LSH                          |✔ (C, D, L1, L2) |x          |x       |✔      |x              |
+|Permutation LSH                 |✔ (C, D, L1, L2) |x          |x       |x      |✔              |
 
 ### Running Nearest Neighbors Query on a Filtered Subset of Documents
@@ -860,4 +918,5 @@ PUT /my-index
 
 See the [create index documentation](https://www.elastic.co/guide/en/elasticsearch/reference/current/indices-create-index.html) for more details.
 
-[^note-angular-cosine]: Cosine similarity used to be (incorrectly) called "angular" similarity. All references to "angular" were renamed to "Cosine" in 7.13.3.2. You can still use "angular" in the JSON/HTTP API; it will convert to "cosine" internally.
\ No newline at end of file
+[^note-angular-cosine]: Cosine similarity used to be (incorrectly) called "angular" similarity. All references to "angular" were renamed to "Cosine" in 7.13.3.2. You can still use "angular" in the JSON/HTTP API; it will convert to "cosine" internally.
+[^note-dot-product]: Dot similarity is intended for use with normalized vectors v, i.e., ||v|| == 1. With unnormalized vectors the dot product is unbounded and can yield negative scores.
\ No newline at end of file
diff --git a/docs/pages/index.md b/docs/pages/index.md
index 44a049161..dd755fd63 100644
--- a/docs/pages/index.md
+++ b/docs/pages/index.md
@@ -15,8 +15,8 @@ This enables users to combine traditional queries (e.g., "some product") with ve
 
 ## Features
 
 - Datatypes to efficiently store dense and sparse numerical vectors in Elasticsearch documents, including multiple vectors per document.
-- Exact nearest neighbor queries for five similarity functions: [L1](https://en.wikipedia.org/wiki/Taxicab_geometry), [L2](https://en.wikipedia.org/wiki/Euclidean_distance), [Cosine](https://en.wikipedia.org/wiki/Cosine_similarity), [Jaccard](https://en.wikipedia.org/wiki/Jaccard_index), and [Hamming](https://en.wikipedia.org/wiki/Hamming_distance).
-- Approximate queries using [Locality Sensitive Hashing](https://en.wikipedia.org/wiki/Locality-sensitive_hashing) for L2, Cosine, Jaccard, and Hamming similarity.
+- Exact nearest neighbor queries for six similarity functions: [L1](https://en.wikipedia.org/wiki/Taxicab_geometry), [L2](https://en.wikipedia.org/wiki/Euclidean_distance), [Cosine](https://en.wikipedia.org/wiki/Cosine_similarity), [Dot](https://en.wikipedia.org/wiki/Dot_product) (for normalized vectors), [Jaccard](https://en.wikipedia.org/wiki/Jaccard_index), and [Hamming](https://en.wikipedia.org/wiki/Hamming_distance).
+- Approximate queries using [Locality Sensitive Hashing](https://en.wikipedia.org/wiki/Locality-sensitive_hashing) for L2, Cosine, Dot, Jaccard, and Hamming similarity.
 - Integration of nearest neighbor queries with standard Elasticsearch queries.
 - Incremental index updates. Start with 1 vector or 1 million vectors and then create/update/delete documents and vectors without ever re-building the entire index.
 - Implementation based on standard Elasticsearch and Lucene primitives, entirely in the JVM. Indexing and querying scale horizontally with Elasticsearch.
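The "(for normalized vectors)" caveat in the feature list above and the `[^note-dot-product]` footnote both come down to the documented `dot similarity + 1` score transformation. A small illustrative sketch of that transformation (the helper name is hypothetical, not part of the plugin):

```python
import numpy as np

def elastiknn_dot_score(u: np.ndarray, v: np.ndarray) -> float:
    # Documented transformation for Dot similarity: score = dot(u, v) + 1.
    return float(np.dot(u, v)) + 1.0

rng = np.random.default_rng(0)
u = rng.standard_normal(128)
v = rng.standard_normal(128)

# With unit-length vectors the dot product stays in [-1, 1], so scores stay in [0, 2].
u_hat, v_hat = u / np.linalg.norm(u), v / np.linalg.norm(v)
assert 0.0 <= elastiknn_dot_score(u_hat, v_hat) <= 2.0
assert abs(elastiknn_dot_score(u_hat, u_hat) - 2.0) < 1e-6

# With unnormalized vectors the dot product is unbounded, so the transformed
# "score" can be negative, which Elasticsearch does not accept.
print(elastiknn_dot_score(10.0 * u, -10.0 * u))  # large negative number
```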
diff --git a/elastiknn-api4s/src/main/scala/com/klibisz/elastiknn/api/Mapping.scala b/elastiknn-api4s/src/main/scala/com/klibisz/elastiknn/api/Mapping.scala index 32752bfde..695253371 100644 --- a/elastiknn-api4s/src/main/scala/com/klibisz/elastiknn/api/Mapping.scala +++ b/elastiknn-api4s/src/main/scala/com/klibisz/elastiknn/api/Mapping.scala @@ -15,6 +15,8 @@ object Mapping { final case class CosineLsh(dims: Int, L: Int, k: Int) extends Mapping + final case class DotLsh(dims: Int, L: Int, k: Int) extends Mapping + final case class L2Lsh(dims: Int, L: Int, k: Int, w: Int) extends Mapping final case class PermutationLsh(dims: Int, k: Int, repeating: Boolean) extends Mapping diff --git a/elastiknn-api4s/src/main/scala/com/klibisz/elastiknn/api/NearestNeighborsQuery.scala b/elastiknn-api4s/src/main/scala/com/klibisz/elastiknn/api/NearestNeighborsQuery.scala index 0157be03c..f6a76bbbb 100644 --- a/elastiknn-api4s/src/main/scala/com/klibisz/elastiknn/api/NearestNeighborsQuery.scala +++ b/elastiknn-api4s/src/main/scala/com/klibisz/elastiknn/api/NearestNeighborsQuery.scala @@ -29,6 +29,14 @@ object NearestNeighborsQuery { override def similarity: Similarity = Similarity.Cosine } + final case class DotLsh(field: String, candidates: Int, vec: Vec = Vec.Empty()) extends ApproximateQuery { + override def withVec(v: Vec): NearestNeighborsQuery = copy(vec = v) + + override def withCandidates(candidates: Int): ApproximateQuery = copy(candidates = candidates) + + override def similarity: Similarity = Similarity.Dot + } + final case class HammingLsh(field: String, candidates: Int, vec: Vec = Vec.Empty()) extends ApproximateQuery { override def withVec(v: Vec): NearestNeighborsQuery = copy(vec = v) diff --git a/elastiknn-api4s/src/main/scala/com/klibisz/elastiknn/api/Similarity.scala b/elastiknn-api4s/src/main/scala/com/klibisz/elastiknn/api/Similarity.scala index 405a1af3f..1232cd925 100644 --- a/elastiknn-api4s/src/main/scala/com/klibisz/elastiknn/api/Similarity.scala +++ b/elastiknn-api4s/src/main/scala/com/klibisz/elastiknn/api/Similarity.scala @@ -5,6 +5,8 @@ sealed trait Similarity object Similarity { case object Cosine extends Similarity + case object Dot extends Similarity + case object Hamming extends Similarity case object Jaccard extends Similarity @@ -13,5 +15,5 @@ object Similarity { case object L2 extends Similarity - val values: Seq[Similarity] = Vector(Cosine, Jaccard, Hamming, L1, L2) + val values: Seq[Similarity] = Vector(Cosine, Dot, Jaccard, Hamming, L1, L2) } diff --git a/elastiknn-api4s/src/main/scala/com/klibisz/elastiknn/api/XContentCodec.scala b/elastiknn-api4s/src/main/scala/com/klibisz/elastiknn/api/XContentCodec.scala index 3b2626f7c..27a8fb3b7 100644 --- a/elastiknn-api4s/src/main/scala/com/klibisz/elastiknn/api/XContentCodec.scala +++ b/elastiknn-api4s/src/main/scala/com/klibisz/elastiknn/api/XContentCodec.scala @@ -89,6 +89,7 @@ object XContentCodec { case Similarity.L1 => b.value(Names.L1) case Similarity.L2 => b.value(Names.L2) case Similarity.Cosine => b.value(Names.COSINE) + case Similarity.Dot => b.value(Names.DOT) } () } @@ -206,6 +207,20 @@ object XContentCodec { } } + implicit val dotLshMapping: MappingEncoder[Mapping.DotLsh] = new MappingEncoder[Mapping.DotLsh] { + override protected def vectorType: String = Names.EKNN_DENSE_FLOAT_VECTOR + override def encodeElastiknnObject(t: Mapping.DotLsh, b: XContentBuilder): Unit = { + b.startObject(Names.ELASTIKNN) + b.field(Names.LSH_L, t.L) + b.field(Names.DIMS, t.dims) + b.field(Names.LSH_K, t.k) + b.field(Names.MODEL, 
Names.LSH) + b.field(Names.SIMILARITY, Names.DOT) + b.endObject() + () + } + } + implicit val l2LshMapping: MappingEncoder[Mapping.L2Lsh] = new MappingEncoder[Mapping.L2Lsh] { override protected def vectorType: String = Names.EKNN_DENSE_FLOAT_VECTOR override def encodeElastiknnObject(t: Mapping.L2Lsh, b: XContentBuilder): Unit = { @@ -242,6 +257,7 @@ object XContentCodec { case m: Mapping.HammingLsh => hammingLshMapping.encodeElastiknnObject(m, b) case m: Mapping.DenseFloat => denseFloatMapping.encodeElastiknnObject(m, b) case m: Mapping.CosineLsh => cosineLshMapping.encodeElastiknnObject(m, b) + case m: Mapping.DotLsh => dotLshMapping.encodeElastiknnObject(m, b) case m: Mapping.L2Lsh => l2LshMapping.encodeElastiknnObject(m, b) case m: Mapping.PermutationLsh => permutationLshMapping.encodeElastiknnObject(m, b) } @@ -252,6 +268,7 @@ object XContentCodec { case m: Mapping.HammingLsh => hammingLshMapping.encodeUnsafe(m, b) case m: Mapping.DenseFloat => denseFloatMapping.encodeUnsafe(m, b) case m: Mapping.CosineLsh => cosineLshMapping.encodeUnsafe(m, b) + case m: Mapping.DotLsh => dotLshMapping.encodeUnsafe(m, b) case m: Mapping.L2Lsh => l2LshMapping.encodeUnsafe(m, b) case m: Mapping.PermutationLsh => permutationLshMapping.encodeUnsafe(m, b) } @@ -316,6 +333,21 @@ object XContentCodec { } } + implicit val dotLshQuery: Encoder[NearestNeighborsQuery.DotLsh] = new Encoder[NearestNeighborsQuery.DotLsh] { + override def encodeUnsafe(t: NearestNeighborsQuery.DotLsh, b: XContentBuilder): Unit = { + b.startObject() + b.field(Names.CANDIDATES, t.candidates) + b.field(Names.FIELD, t.field) + b.field(Names.MODEL, Names.LSH) + b.field(Names.SIMILARITY) + similarity.encodeUnsafe(t.similarity, b) + b.field(Names.VEC) + vec.encodeUnsafe(t.vec, b) + b.endObject() + () + } + } + implicit val l2LshQuery: Encoder[NearestNeighborsQuery.L2Lsh] = new Encoder[NearestNeighborsQuery.L2Lsh] { override def encodeUnsafe(t: NearestNeighborsQuery.L2Lsh, b: XContentBuilder): Unit = { b.startObject() @@ -354,6 +386,7 @@ object XContentCodec { case q: NearestNeighborsQuery.JaccardLsh => jaccardLshQuery.encodeUnsafe(q, b) case q: NearestNeighborsQuery.HammingLsh => hammingLshQuery.encodeUnsafe(q, b) case q: NearestNeighborsQuery.CosineLsh => cosineLshQuery.encodeUnsafe(q, b) + case q: NearestNeighborsQuery.DotLsh => dotLshQuery.encodeUnsafe(q, b) case q: NearestNeighborsQuery.L2Lsh => l2LshQuery.encodeUnsafe(q, b) case q: NearestNeighborsQuery.PermutationLsh => permutationLshQuery.encodeUnsafe(q, b) } @@ -441,6 +474,7 @@ object XContentCodec { case Names.L1 => Similarity.L1 case Names.L2 => Similarity.L2 case Names.COSINE => Similarity.Cosine + case Names.DOT => Similarity.Dot case Names.ANGULAR => Similarity.Cosine case _ => throw new XContentParseException(unexpectedValue(s1, Names.SIMILARITIES)) } @@ -603,6 +637,8 @@ object XContentCodec { Mapping.L2Lsh(dims, l, k, w) case (Some(Names.EKNN_DENSE_FLOAT_VECTOR), Some(Names.LSH), Some(dims), Some(Similarity.Cosine), Some(l), Some(k), _, _) => Mapping.CosineLsh(dims, l, k) + case (Some(Names.EKNN_DENSE_FLOAT_VECTOR), Some(Names.LSH), Some(dims), Some(Similarity.Dot), Some(l), Some(k), _, _) => + Mapping.DotLsh(dims, l, k) case (Some(Names.EKNN_DENSE_FLOAT_VECTOR), Some(Names.PERMUTATION_LSH), Some(dims), _, _, Some(k), _, Some(repeating)) => Mapping.PermutationLsh(dims, k, repeating) case _ => throw new XContentParseException(unableToConstruct("mapping")) @@ -645,6 +681,8 @@ object XContentCodec { NearestNeighborsQuery.Exact(field, similarity, v) case (Some(candidates), 
Some(field), Some(Names.LSH), _, Some(Similarity.Cosine), Some(v)) =>
           NearestNeighborsQuery.CosineLsh(field, candidates, v)
+        case (Some(candidates), Some(field), Some(Names.LSH), _, Some(Similarity.Dot), Some(v)) =>
+          NearestNeighborsQuery.DotLsh(field, candidates, v)
         case (Some(candidates), Some(field), Some(Names.LSH), _, Some(Similarity.Hamming), Some(v)) =>
           NearestNeighborsQuery.HammingLsh(field, candidates, v)
         case (Some(candidates), Some(field), Some(Names.LSH), _, Some(Similarity.Jaccard), Some(v)) =>
@@ -662,6 +700,7 @@
     val ANGULAR = "angular"
     val CANDIDATES = "candidates"
     val COSINE = "cosine"
+    val DOT = "dot"
     val DIMS = "dims"
     val ELASTIKNN = "elastiknn"
     val EKNN_DENSE_FLOAT_VECTOR = s"${ELASTIKNN_NAME}_dense_float_vector"
diff --git a/elastiknn-api4s/src/test/scala/com/klibisz/elastiknn/api/XContentCodecSuite.scala b/elastiknn-api4s/src/test/scala/com/klibisz/elastiknn/api/XContentCodecSuite.scala
index 4dfa801cb..8d91be94d 100644
--- a/elastiknn-api4s/src/test/scala/com/klibisz/elastiknn/api/XContentCodecSuite.scala
+++ b/elastiknn-api4s/src/test/scala/com/klibisz/elastiknn/api/XContentCodecSuite.scala
@@ -110,13 +110,16 @@ class XContentCodecSuite extends AnyFreeSpec with Matchers {
           ("L2", Similarity.L2),
           ("cosine", Similarity.Cosine),
           ("Cosine", Similarity.Cosine),
-          ("COSINE", Similarity.Cosine)
+          ("COSINE", Similarity.Cosine),
+          ("dot", Similarity.Dot),
+          ("Dot", Similarity.Dot),
+          ("DOT", Similarity.Dot)
         )
       } roundtrip[Similarity](Json.fromString(str.toLowerCase), sim)
     }
     "errors" in {
       val ex1 = intercept[XContentParseException](decodeUnsafeFromString[Similarity]("\"wrong\""))
-      ex1.getMessage shouldBe "Expected token to be one of [cosine,hamming,jaccard,l1,l2] but found [wrong]"
+      ex1.getMessage shouldBe "Expected token to be one of [cosine,dot,hamming,jaccard,l1,l2] but found [wrong]"
       val ex2 = intercept[XContentParseException](decodeUnsafeFromString[Similarity]("99"))
       ex2.getMessage shouldBe "Expected token to be one of [VALUE_STRING] but found [VALUE_NUMBER]"
     }
@@ -326,7 +329,7 @@ class XContentCodecSuite extends AnyFreeSpec with Matchers {
             | }
             |}
            |""".stripMargin))
-      ex2.getMessage shouldBe "Expected token to be one of [cosine,hamming,jaccard,l1,l2] but found [jacard]"
+      ex2.getMessage shouldBe "Expected token to be one of [cosine,dot,hamming,jaccard,l1,l2] but found [jacard]"
     }
   }
   "HammingLsh" - {
diff --git a/elastiknn-models/src/main/java/com/klibisz/elastiknn/models/DotLshModel.java b/elastiknn-models/src/main/java/com/klibisz/elastiknn/models/DotLshModel.java
new file mode 100644
index 000000000..b30898d1d
--- /dev/null
+++ b/elastiknn-models/src/main/java/com/klibisz/elastiknn/models/DotLshModel.java
@@ -0,0 +1,53 @@
+package com.klibisz.elastiknn.models;
+
+import com.klibisz.elastiknn.storage.BitBuffer;
+import com.klibisz.elastiknn.vectors.FloatVectorOps;
+
+import static com.klibisz.elastiknn.storage.ByteBufferSerialization.writeInt;
+
+import java.util.Random;
+
+public class DotLshModel implements HashingModel.DenseFloat {
+
+    private final int L;
+    private final int k;
+    private final float[][] planes;
+
+    private final FloatVectorOps vectorOps;
+
+    /**
+     * Locality sensitive hashing model for Dot similarity.
+     * Uses the random hyperplanes method described in Mining Massive Datasets chapter 3.
+ * @param dims length of the vectors hashed by this model + * @param L number of hash tables + * @param k number of hash functions concatenated to form a hash for each table + * @param rng random number generator used to instantiate model parameters + */ + public DotLshModel(int dims, int L, int k, Random rng, FloatVectorOps vectorOps) { + this.L = L; + this.k = k; + this.planes = new float[L * k][dims]; + this.vectorOps = vectorOps; + for (int i = 0; i < this.planes.length; i++) { + for (int j = 0; j < dims; j++) { + this.planes[i][j] = (float) rng.nextGaussian(); + } + } + } + + @Override + public HashAndFreq[] hash(float[] values) { + HashAndFreq[] hashes = new HashAndFreq[L]; + for (int ixL = 0; ixL < L; ixL++) { + BitBuffer.IntBuffer buf = new BitBuffer.IntBuffer(writeInt(ixL)); + for (int ixk = 0; ixk < k; ixk++) { + double dot = vectorOps.dotProduct(planes[ixL * k + ixk], values); + if (dot > 0) buf.putOne(); + else buf.putZero(); + } + hashes[ixL] = HashAndFreq.once(buf.toByteArray()); + } + return hashes; + } + +} diff --git a/elastiknn-models/src/main/java/com/klibisz/elastiknn/models/ExactModel.java b/elastiknn-models/src/main/java/com/klibisz/elastiknn/models/ExactModel.java index fb23f7d2d..39cee7d9a 100644 --- a/elastiknn-models/src/main/java/com/klibisz/elastiknn/models/ExactModel.java +++ b/elastiknn-models/src/main/java/com/klibisz/elastiknn/models/ExactModel.java @@ -38,4 +38,9 @@ public static double l1Similarity(FloatVectorOps floatVectorOps, float[] v1, flo public static double cosineSimilarity(FloatVectorOps floatVectorOps, float[] v1, float[] v2) { return 1 + floatVectorOps.cosineSimilarity(v1, v2); } + + @ForceInline + public static double dotSimilarity(FloatVectorOps floatVectorOps, float[] v1, float[] v2) { + return 1 + floatVectorOps.dotSimilarity(v1, v2); + } } diff --git a/elastiknn-models/src/main/java/com/klibisz/elastiknn/vectors/DefaultFloatVectorOps.java b/elastiknn-models/src/main/java/com/klibisz/elastiknn/vectors/DefaultFloatVectorOps.java index 95809a482..93e5362e5 100644 --- a/elastiknn-models/src/main/java/com/klibisz/elastiknn/vectors/DefaultFloatVectorOps.java +++ b/elastiknn-models/src/main/java/com/klibisz/elastiknn/vectors/DefaultFloatVectorOps.java @@ -19,6 +19,10 @@ public double cosineSimilarity(float[] v1, float[] v2) { else return -1; } + public double dotSimilarity(float[] v1, float[] v2) { + return dotProduct(v1, v2); + } + public double dotProduct(float[] v1, float[] v2) { float dotProd = 0f; for (int i = 0; i < v1.length; i++) dotProd += v1[i] * v2[i]; diff --git a/elastiknn-models/src/main/java/com/klibisz/elastiknn/vectors/FloatVectorOps.java b/elastiknn-models/src/main/java/com/klibisz/elastiknn/vectors/FloatVectorOps.java index 485bca4f6..ccda505c8 100644 --- a/elastiknn-models/src/main/java/com/klibisz/elastiknn/vectors/FloatVectorOps.java +++ b/elastiknn-models/src/main/java/com/klibisz/elastiknn/vectors/FloatVectorOps.java @@ -9,4 +9,6 @@ public interface FloatVectorOps { double l1Distance(float[] v1, float[] v2); double cosineSimilarity(float[] v1, float[] v2); + + double dotSimilarity(float[] v1, float[] v2); } diff --git a/elastiknn-models/src/main/java/com/klibisz/elastiknn/vectors/PanamaFloatVectorOps.java b/elastiknn-models/src/main/java/com/klibisz/elastiknn/vectors/PanamaFloatVectorOps.java index da065283b..98c36362f 100644 --- a/elastiknn-models/src/main/java/com/klibisz/elastiknn/vectors/PanamaFloatVectorOps.java +++ b/elastiknn-models/src/main/java/com/klibisz/elastiknn/vectors/PanamaFloatVectorOps.java @@ -49,6 
+49,10 @@ public double cosineSimilarity(float[] v1, float[] v2) {
         else return -1;
     }
 
+    public double dotSimilarity(float[] v1, float[] v2) {
+        return dotProduct(v1, v2);
+    }
+
     public double dotProduct(float[] v1, float[] v2) {
         int i = 0;
         double dotProd = 0d;
diff --git a/elastiknn-models/src/test/scala/com/klibisz/elastiknn/models/DotLshModelSuite.scala b/elastiknn-models/src/test/scala/com/klibisz/elastiknn/models/DotLshModelSuite.scala
new file mode 100644
index 000000000..2bb4cae59
--- /dev/null
+++ b/elastiknn-models/src/test/scala/com/klibisz/elastiknn/models/DotLshModelSuite.scala
@@ -0,0 +1,29 @@
+package com.klibisz.elastiknn.models
+
+import com.klibisz.elastiknn.api.Vec
+import com.klibisz.elastiknn.vectors.PanamaFloatVectorOps
+import org.scalatest.funsuite.AnyFunSuite
+import org.scalatest.matchers.should.Matchers
+
+import scala.util.Random
+
+class DotLshModelSuite extends AnyFunSuite with Matchers {
+
+  test("hashes are invariant to vector magnitude") {
+    implicit val rng: Random = new Random(0)
+    val dims = 10
+    for {
+      l <- 1 to 100 by 10
+      k <- 1 to 5
+      isUnit <- Seq(true, false)
+    } {
+      val mlsh = new DotLshModel(dims, l, k, new java.util.Random(0), new PanamaFloatVectorOps)
+      val vec = Vec.DenseFloat.random(dims, unit = isUnit)
+      val scaled = (1 to 10).map(m => vec.copy(vec.values.map(_ * m)))
+      val hashed = scaled.map(v => mlsh.hash(v.values).toList)
+      scaled.distinct.length shouldBe 10
+      hashed.distinct.length shouldBe 1
+    }
+  }
+
+}
diff --git a/elastiknn-plugin/src/main/scala/com/klibisz/elastiknn/mapper/VectorMapper.scala b/elastiknn-plugin/src/main/scala/com/klibisz/elastiknn/mapper/VectorMapper.scala
index 29d0b35fc..445b5698f 100644
--- a/elastiknn-plugin/src/main/scala/com/klibisz/elastiknn/mapper/VectorMapper.scala
+++ b/elastiknn-plugin/src/main/scala/com/klibisz/elastiknn/mapper/VectorMapper.scala
@@ -53,6 +53,7 @@ object VectorMapper {
       mapping match {
         case Mapping.DenseFloat(_)     => Try(Seq(ExactQuery.index(field, vec)))
         case m: Mapping.CosineLsh      => Try(HashingQuery.index(field, luceneFieldType, vec, modelCache(m).hash(vec.values)))
+        case m: Mapping.DotLsh         => Try(HashingQuery.index(field, luceneFieldType, vec, modelCache(m).hash(vec.values)))
         case m: Mapping.L2Lsh          => Try(HashingQuery.index(field, luceneFieldType, vec, modelCache(m).hash(vec.values)))
         case m: Mapping.PermutationLsh => Try(HashingQuery.index(field, luceneFieldType, vec, modelCache(m).hash(vec.values)))
         case _                         => Failure(incompatible(mapping, vec))
diff --git a/elastiknn-plugin/src/main/scala/com/klibisz/elastiknn/models/ExactSimilarityFunction.scala b/elastiknn-plugin/src/main/scala/com/klibisz/elastiknn/models/ExactSimilarityFunction.scala
index 269c8012b..fe14af868 100644
--- a/elastiknn-plugin/src/main/scala/com/klibisz/elastiknn/models/ExactSimilarityFunction.scala
+++ b/elastiknn-plugin/src/main/scala/com/klibisz/elastiknn/models/ExactSimilarityFunction.scala
@@ -34,4 +34,9 @@ object ExactSimilarityFunction {
     override def apply(v1: Vec.DenseFloat, v2: StoredVec.DenseFloat): Double =
       ExactModel.cosineSimilarity(floatVectorOps, v1.values, v2.values)
   }
+  final class Dot(floatVectorOps: FloatVectorOps) extends ExactSimilarityFunction[Vec.DenseFloat, StoredVec.DenseFloat] {
+    override def maxScore: Float = 2f
+    override def apply(v1: Vec.DenseFloat, v2: StoredVec.DenseFloat): Double =
+      ExactModel.dotSimilarity(floatVectorOps, v1.values, v2.values)
+  }
 }
diff --git a/elastiknn-plugin/src/main/scala/com/klibisz/elastiknn/models/ModelCache.scala
b/elastiknn-plugin/src/main/scala/com/klibisz/elastiknn/models/ModelCache.scala index 7d8882c54..41c7b7146 100644 --- a/elastiknn-plugin/src/main/scala/com/klibisz/elastiknn/models/ModelCache.scala +++ b/elastiknn-plugin/src/main/scala/com/klibisz/elastiknn/models/ModelCache.scala @@ -15,12 +15,14 @@ final class ModelCache(floatVectorOps: FloatVectorOps) { }) private val cosine = cache((m: Mapping.CosineLsh) => new CosineLshModel(m.dims, m.L, m.k, new Random(0), floatVectorOps)) + private val dot = cache((m: Mapping.DotLsh) => new DotLshModel(m.dims, m.L, m.k, new Random(0), floatVectorOps)) private val jaccard = cache((m: Mapping.JaccardLsh) => new JaccardLshModel(m.L, m.k, new Random(0))) private val hamming = cache((m: Mapping.HammingLsh) => new HammingLshModel(m.dims, m.L, m.k, new Random(0))) private val l2 = cache((m: Mapping.L2Lsh) => new L2LshModel(m.dims, m.L, m.k, m.w, new Random(0), floatVectorOps)) private val permutation = cache((m: Mapping.PermutationLsh) => new PermutationLshModel(m.k, m.repeating)) def apply(m: Mapping.CosineLsh): CosineLshModel = cosine.get(m) + def apply(m: Mapping.DotLsh): DotLshModel = dot.get(m) def apply(m: Mapping.JaccardLsh): JaccardLshModel = jaccard.get(m) def apply(m: Mapping.HammingLsh): HammingLshModel = hamming.get(m) def apply(m: Mapping.L2Lsh): L2LshModel = l2.get(m) diff --git a/elastiknn-plugin/src/main/scala/com/klibisz/elastiknn/query/ElastiknnQueryBuilder.scala b/elastiknn-plugin/src/main/scala/com/klibisz/elastiknn/query/ElastiknnQueryBuilder.scala index c612f097b..4fdcc1582 100644 --- a/elastiknn-plugin/src/main/scala/com/klibisz/elastiknn/query/ElastiknnQueryBuilder.scala +++ b/elastiknn-plugin/src/main/scala/com/klibisz/elastiknn/query/ElastiknnQueryBuilder.scala @@ -55,21 +55,26 @@ final class ElastiknnQueryBuilder(floatVectorOps: FloatVectorOps, modelCache: Mo case ( Exact(f, Similarity.L1, v: Vec.DenseFloat), - _: Mapping.DenseFloat | _: Mapping.CosineLsh | _: Mapping.L2Lsh | _: Mapping.PermutationLsh + _: Mapping.DenseFloat | _: Mapping.CosineLsh | _: Mapping.DotLsh |_: Mapping.L2Lsh | _: Mapping.PermutationLsh ) => new ExactQuery(f, v, l1) case ( Exact(f, Similarity.L2, v: Vec.DenseFloat), - _: Mapping.DenseFloat | _: Mapping.CosineLsh | _: Mapping.L2Lsh | _: Mapping.PermutationLsh + _: Mapping.DenseFloat | _: Mapping.CosineLsh | _: Mapping.DotLsh | _: Mapping.L2Lsh | _: Mapping.PermutationLsh ) => new ExactQuery(f, v, l2) case ( Exact(f, Similarity.Cosine, v: Vec.DenseFloat), - _: Mapping.DenseFloat | _: Mapping.CosineLsh | _: Mapping.L2Lsh | _: Mapping.PermutationLsh + _: Mapping.DenseFloat | _: Mapping.CosineLsh | _: Mapping.DotLsh | _: Mapping.L2Lsh | _: Mapping.PermutationLsh ) => new ExactQuery(f, v, cosine) + case ( + Exact(f, Similarity.Dot, v: Vec.DenseFloat), + _: Mapping.DenseFloat | _: Mapping.CosineLsh | _: Mapping.DotLsh | _: Mapping.L2Lsh | _: Mapping.PermutationLsh + ) => + new ExactQuery(f, v, dot) case (JaccardLsh(f, candidates, v: Vec.SparseBool), m: Mapping.JaccardLsh) => new HashingQuery(f, v, candidates, modelCache(m).hash(v.trueIndices, v.totalIndices), ESF.Jaccard) @@ -79,7 +84,10 @@ final class ElastiknnQueryBuilder(floatVectorOps: FloatVectorOps, modelCache: Mo case (CosineLsh(f, candidates, v: Vec.DenseFloat), m: Mapping.CosineLsh) => new HashingQuery(f, v, candidates, modelCache(m).hash(v.values), cosine) - + + case (DotLsh(f, candidates, v: Vec.DenseFloat), m: Mapping.DotLsh) => + new HashingQuery(f, v, candidates, modelCache(m).hash(v.values), dot) + case (L2Lsh(f, candidates, probes, v: 
Vec.DenseFloat), m: Mapping.L2Lsh) =>
         new HashingQuery(f, v, candidates, modelCache(m).hash(v.values, probes), l2)
diff --git a/elastiknn-plugin/src/test/scala/com/klibisz/elastiknn/models/ExactSimilarityFunctionSuite.scala b/elastiknn-plugin/src/test/scala/com/klibisz/elastiknn/models/ExactSimilarityFunctionSuite.scala
index a7f3e5856..cc8761292 100644
--- a/elastiknn-plugin/src/test/scala/com/klibisz/elastiknn/models/ExactSimilarityFunctionSuite.scala
+++ b/elastiknn-plugin/src/test/scala/com/klibisz/elastiknn/models/ExactSimilarityFunctionSuite.scala
@@ -93,7 +93,31 @@ class ExactSimilarityFunctionSuite extends AnyFunSpec with Matchers {
       cosine(v2, v2) shouldBe 2d
     }
   }
+
+  describe("Dot Similarity") {
+    val dot = new ExactSimilarityFunction.Dot(new PanamaFloatVectorOps)
+
+    it("matches reference") {
+      for (_ <- 0 until reps) {
+        val len = rng.nextInt(4096) + 10
+        val v1 = Vec.DenseFloat.random(len)
+        val v2 = Vec.DenseFloat.random(len)
+        dot(v1, v2) shouldBe (ExactSimilarityReference.Dot(v1, v2) +- tol)
+      }
+    }
+
+    it("handles identity") {
+      val v1 = Vec.DenseFloat.random(199, unit = true)
+      dot(v1, v1) shouldBe (2d +- tol)
+    }
+
+    it("handles all zeros") {
+      val v1 = Vec.DenseFloat.random(199)
+      val v2 = Vec.DenseFloat(v1.values.map(_ * 0))
+      dot(v2, v2) shouldBe 1d
+    }
+  }
 
   describe("Jaccard Similarity") {

    it("matches reference") {
diff --git a/elastiknn-plugin/src/test/scala/com/klibisz/elastiknn/models/ExactSimilarityReference.scala b/elastiknn-plugin/src/test/scala/com/klibisz/elastiknn/models/ExactSimilarityReference.scala
index 1c8724de7..2b302b894 100644
--- a/elastiknn-plugin/src/test/scala/com/klibisz/elastiknn/models/ExactSimilarityReference.scala
+++ b/elastiknn-plugin/src/test/scala/com/klibisz/elastiknn/models/ExactSimilarityReference.scala
@@ -19,7 +19,9 @@ object ExactSimilarityReference {
   val Cosine: (Vec.DenseFloat, Vec.DenseFloat) => Double = (v1: Vec.DenseFloat, v2: Vec.DenseFloat) => {
     1 + (1 - cosineDistance(new DenseVector(v1.values.map(_.toDouble)), new DenseVector(v2.values.map(_.toDouble))))
   }
-
+  val Dot: (Vec.DenseFloat, Vec.DenseFloat) => Double = (v1: Vec.DenseFloat, v2: Vec.DenseFloat) => {
+    1 + (new DenseVector(v1.values.map(_.toDouble)) dot new DenseVector(v2.values.map(_.toDouble)))
+  }
   val Hamming: (Vec.SparseBool, Vec.SparseBool) => Double = (v1: Vec.SparseBool, v2: Vec.SparseBool) => {
     val d1 = new Array[Boolean](v1.totalIndices)
     val d2 = new Array[Boolean](v2.totalIndices)