From bf381040ca02d88c2b0195ddbb6715448e68ab91 Mon Sep 17 00:00:00 2001 From: Alex Klibisz Date: Mon, 20 Nov 2023 11:50:32 -0500 Subject: [PATCH] Build: add sbt-scalafmt and linting check in CI workflow (#594) --- .github/workflows/ci.yaml | 3 + .scalafmt.conf | 1 + Taskfile.yaml | 5 ++ .../com/klibisz/elastiknn/api/Mapping.scala | 2 +- .../elastiknn/api/NearestNeighborsQuery.scala | 5 +- .../scala/com/klibisz/elastiknn/api/Vec.scala | 4 +- .../elastiknn/client/ElastiknnClient.scala | 66 ++++++++++++------- .../elastiknn/client/ElastiknnRequests.scala | 62 ++++++++++------- .../com/klibisz/elastiknn/RecallSuite.scala | 19 +++--- .../klibisz/elastiknn/VectorMapperSuite.scala | 2 +- .../query/ElasticsearchQueryBuilder.scala | 3 +- .../elastiknn/query/HashingQuery.scala | 6 +- .../klibisz/elastiknn/storage/StoredVec.scala | 31 ++++----- project/plugins.sbt | 2 + 14 files changed, 125 insertions(+), 86 deletions(-) diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index 810e4d822..e128b7876 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -40,6 +40,9 @@ jobs: - name: Increase MMAP Limits timeout-minutes: 1 run: sudo sysctl -w vm.max_map_count=262144 + - name: Lint + timeout-minutes: 5 + run: task jvmLint - name: Compile timeout-minutes: 5 run: task jvmCompile diff --git a/.scalafmt.conf b/.scalafmt.conf index 7df1445e8..104edf4c7 100644 --- a/.scalafmt.conf +++ b/.scalafmt.conf @@ -1,2 +1,3 @@ version = "3.7.15" maxColumn = 140 +runner.dialect = scala213 diff --git a/Taskfile.yaml b/Taskfile.yaml index 8d281fcbd..80eb4b017 100644 --- a/Taskfile.yaml +++ b/Taskfile.yaml @@ -160,6 +160,11 @@ tasks: cmds: - sbt -client "compile; Test/compile" + jvmLint: + desc: Lint the JVM code + cmds: + - sbt -client scalafmtCheckAll + jvmAssemble: desc: Build the plugin bundle using SBT cmds: diff --git a/elastiknn-api4s/src/main/scala/com/klibisz/elastiknn/api/Mapping.scala b/elastiknn-api4s/src/main/scala/com/klibisz/elastiknn/api/Mapping.scala index 7fba01bae..32752bfde 100644 --- a/elastiknn-api4s/src/main/scala/com/klibisz/elastiknn/api/Mapping.scala +++ b/elastiknn-api4s/src/main/scala/com/klibisz/elastiknn/api/Mapping.scala @@ -18,4 +18,4 @@ object Mapping { final case class L2Lsh(dims: Int, L: Int, k: Int, w: Int) extends Mapping final case class PermutationLsh(dims: Int, k: Int, repeating: Boolean) extends Mapping -} \ No newline at end of file +} diff --git a/elastiknn-api4s/src/main/scala/com/klibisz/elastiknn/api/NearestNeighborsQuery.scala b/elastiknn-api4s/src/main/scala/com/klibisz/elastiknn/api/NearestNeighborsQuery.scala index 3e0051e03..0157be03c 100644 --- a/elastiknn-api4s/src/main/scala/com/klibisz/elastiknn/api/NearestNeighborsQuery.scala +++ b/elastiknn-api4s/src/main/scala/com/klibisz/elastiknn/api/NearestNeighborsQuery.scala @@ -53,10 +53,9 @@ object NearestNeighborsQuery { override def similarity: Similarity = Similarity.L2 } - final case class PermutationLsh(field: String, similarity: Similarity, candidates: Int, vec: Vec = Vec.Empty()) - extends ApproximateQuery { + final case class PermutationLsh(field: String, similarity: Similarity, candidates: Int, vec: Vec = Vec.Empty()) extends ApproximateQuery { override def withVec(v: Vec): NearestNeighborsQuery = copy(vec = v) override def withCandidates(candidates: Int): ApproximateQuery = copy(candidates = candidates) } -} \ No newline at end of file +} diff --git a/elastiknn-api4s/src/main/scala/com/klibisz/elastiknn/api/Vec.scala b/elastiknn-api4s/src/main/scala/com/klibisz/elastiknn/api/Vec.scala index 539cb7e5e..9e34ae556 100644 --- a/elastiknn-api4s/src/main/scala/com/klibisz/elastiknn/api/Vec.scala +++ b/elastiknn-api4s/src/main/scala/com/klibisz/elastiknn/api/Vec.scala @@ -27,7 +27,7 @@ object Vec { override def equals(other: Any): Boolean = other match { case other: SparseBool => (trueIndices sameElements other.trueIndices) && totalIndices == other.totalIndices - case _ => false + case _ => false } override def toString: String = s"SparseBool(${trueIndices.take(3).mkString(",")},...,${trueIndices.length}/$totalIndices)" @@ -50,7 +50,7 @@ object Vec { final case class DenseFloat(values: Array[Float]) extends Vec with KnownDims { override def equals(other: Any): Boolean = other match { case other: DenseFloat => other.values sameElements values - case _ => false + case _ => false } override def toString: String = s"DenseFloat(${values.take(3).map(n => f"$n%.2f").mkString(",")},...,${values.length})" diff --git a/elastiknn-client-elastic4s/src/main/scala/com/klibisz/elastiknn/client/ElastiknnClient.scala b/elastiknn-client-elastic4s/src/main/scala/com/klibisz/elastiknn/client/ElastiknnClient.scala index 2a128a960..ce6447344 100644 --- a/elastiknn-client-elastic4s/src/main/scala/com/klibisz/elastiknn/client/ElastiknnClient.scala +++ b/elastiknn-client-elastic4s/src/main/scala/com/klibisz/elastiknn/client/ElastiknnClient.scala @@ -34,24 +34,34 @@ trait ElastiknnClient[F[_]] extends AutoCloseable { execute(ElastiknnRequests.putMapping(index, vecField, storedIdField, vecMapping)) /** Create an index with recommended defaults. - * @param index The index name. - * @param shards How many shards, 1 by default. - * @param replicas How many replicas, 1 by default. - * @param elastiknn Value for `index.elastiknn` setting, true by default. - * @return CreateIndexResponse + * @param index + * The index name. + * @param shards + * How many shards, 1 by default. + * @param replicas + * How many replicas, 1 by default. + * @param elastiknn + * Value for `index.elastiknn` setting, true by default. + * @return + * CreateIndexResponse */ def createIndex(index: String, shards: Int = 1, replicas: Int = 0): F[Response[CreateIndexResponse]] = execute(ElasticDsl.createIndex(index).shards(shards).replicas(replicas)) - /** Index a batch of vectors as new Elasticsearch docs, one doc per vector. - * Also see ElastiknnRequests.index(). + /** Index a batch of vectors as new Elasticsearch docs, one doc per vector. Also see ElastiknnRequests.index(). * - * @param index Index where vectors are stored. - * @param vecField Field in each doc where vector is stored. - * @param vecs Sequence of vectors to store. - * @param storedIdField Field in each doc where ID is stored as a doc value. - * @param ids Sequence of ids. Assumed one-to-one correspondence to given vectors. - * @return Response containing BulkResponse containing indexing responses. + * @param index + * Index where vectors are stored. + * @param vecField + * Field in each doc where vector is stored. + * @param vecs + * Sequence of vectors to store. + * @param storedIdField + * Field in each doc where ID is stored as a doc value. + * @param ids + * Sequence of ids. Assumed one-to-one correspondence to given vectors. + * @return + * Response containing BulkResponse containing indexing responses. */ def index(index: String, vecField: String, vecs: Seq[Vec], storedIdField: String, ids: Seq[String]): F[Response[BulkResponse]] = { val reqs = vecs.zip(ids).map { case (vec, id) => @@ -93,10 +103,14 @@ object ElastiknnClient { final case class StrictFailureException(message: String, cause: Throwable = None.orNull) extends RuntimeException(message, cause) /** Build an [[ElastiknnFutureClient]] from an elasticsearch RestClient. - * @param restClient The Elasticsearch RestClient, configured how you like, e.g. connected to multiple nodes. - * @param strictFailure If true, convert non-200 responses and bulk responses containing any failure to a failed Future. - * @param ec ExecutionContext where requests are executed and responses processed. - * @return [[ElastiknnFutureClient]] + * @param restClient + * The Elasticsearch RestClient, configured how you like, e.g. connected to multiple nodes. + * @param strictFailure + * If true, convert non-200 responses and bulk responses containing any failure to a failed Future. + * @param ec + * ExecutionContext where requests are executed and responses processed. + * @return + * [[ElastiknnFutureClient]] */ def futureClient(restClient: RestClient, strictFailure: Boolean)(implicit ec: ExecutionContext): ElastiknnFutureClient = { val jc: JavaClient = new JavaClient(restClient) @@ -121,12 +135,18 @@ object ElastiknnClient { } /** Build an [[ElastiknnFutureClient]] that connects to a single host - * @param host Elasticsearch host. - * @param port Elasticsearch port. - * @param timeoutMillis Amount of time to wait for a response. - * @param strictFailure If true, convert non-2xx responses and bulk responses containing any failure to a failed Future. - * @param ec ExecutionContext where requests are executed and responses processed. - * @return [[ElastiknnFutureClient]] + * @param host + * Elasticsearch host. + * @param port + * Elasticsearch port. + * @param timeoutMillis + * Amount of time to wait for a response. + * @param strictFailure + * If true, convert non-2xx responses and bulk responses containing any failure to a failed Future. + * @param ec + * ExecutionContext where requests are executed and responses processed. + * @return + * [[ElastiknnFutureClient]] */ def futureClient(host: String = "localhost", port: Int = 9200, strictFailure: Boolean = true, timeoutMillis: Int = 30000)(implicit ec: ExecutionContext diff --git a/elastiknn-client-elastic4s/src/main/scala/com/klibisz/elastiknn/client/ElastiknnRequests.scala b/elastiknn-client-elastic4s/src/main/scala/com/klibisz/elastiknn/client/ElastiknnRequests.scala index 3705ec280..3b3894b30 100644 --- a/elastiknn-client-elastic4s/src/main/scala/com/klibisz/elastiknn/client/ElastiknnRequests.scala +++ b/elastiknn-client-elastic4s/src/main/scala/com/klibisz/elastiknn/client/ElastiknnRequests.scala @@ -8,37 +8,46 @@ import com.sksamuel.elastic4s.requests.mappings.PutMappingRequest import com.sksamuel.elastic4s.requests.searches.SearchRequest import com.sksamuel.elastic4s.{ElasticDsl, Indexes} -/** Methods for creating Elastic4s requests for common elastiknn tasks. - * Methods are optimized for documents containing only a vector and running searches as quickly as possible. - * For example, we store the ID as a doc value instead of using the default stored ID which is slower to access. - * I am open to creating less-optimized, more-convenient methods in the future. +/** Methods for creating Elastic4s requests for common elastiknn tasks. Methods are optimized for documents containing only a vector and + * running searches as quickly as possible. For example, we store the ID as a doc value instead of using the default stored ID which is + * slower to access. I am open to creating less-optimized, more-convenient methods in the future. */ trait ElastiknnRequests { /** Create a request for indexing a vector. * - * @param index Name of the index. - * @param vecField Field where vector is stored. - * @param vec Vector to index. - * @param storedIdField Field where document ID is stored. - * @param id Document ID. Stored as the ID known by Elasticsearch, and in the document for faster retrieval. - * @return Instance of a com.sksamuel.elastic4s.requests.indexes.IndexRequest. + * @param index + * Name of the index. + * @param vecField + * Field where vector is stored. + * @param vec + * Vector to index. + * @param storedIdField + * Field where document ID is stored. + * @param id + * Document ID. Stored as the ID known by Elasticsearch, and in the document for faster retrieval. + * @return + * Instance of a com.sksamuel.elastic4s.requests.indexes.IndexRequest. */ def index(index: String, vecField: String, vec: Vec, storedIdField: String, id: String): IndexRequest = { val xcb = XContentFactory.jsonBuilder().rawField(vecField, XContentCodec.encodeUnsafeToString(vec)).field(storedIdField, id) IndexRequest(index, source = Some(JacksonBuilder.writeAsString(xcb.value)), id = Some(id)) } - /** Create a request for running a nearest neighbors query. - * Optimized for high performance, so it returns the document ID in the body. - * Sets the preference parameter (see: https://www.elastic.co/guide/en/elasticsearch/reference/master/consistent-scoring.html) - * as the hash of the query for more deterministic results. + /** Create a request for running a nearest neighbors query. Optimized for high performance, so it returns the document ID in the body. + * Sets the preference parameter (see: https://www.elastic.co/guide/en/elasticsearch/reference/master/consistent-scoring.html) as the + * hash of the query for more deterministic results. * - * @param index Index being searched against. - * @param query Constructed query, containing the vector, field, etc. - * @param k Number of results to return. - * @param storedIdField Field containing the document ID. See ElastiknnRequests.index() method. - * @return Instance of com.sksamuel.elastic4s.requests.searches.SearchRequest. + * @param index + * Index being searched against. + * @param query + * Constructed query, containing the vector, field, etc. + * @param k + * Number of results to return. + * @param storedIdField + * Field containing the document ID. See ElastiknnRequests.index() method. + * @return + * Instance of com.sksamuel.elastic4s.requests.searches.SearchRequest. */ def nearestNeighbors(index: String, query: NearestNeighborsQuery, k: Int, storedIdField: String): SearchRequest = ElasticDsl @@ -51,11 +60,16 @@ trait ElastiknnRequests { /** Create a mapping containing a vector field and a stored ID field. * - * @param index Index to which this mapping is applied. - * @param vecField Field where vector is stored. - * @param storedIdField Field where ID is stored. - * @param vecMapping Mapping for the stored vector. - * @return Instance of com.sksamuel.elastic4s.requests.mappings.PutMappingRequest. + * @param index + * Index to which this mapping is applied. + * @param vecField + * Field where vector is stored. + * @param storedIdField + * Field where ID is stored. + * @param vecMapping + * Mapping for the stored vector. + * @return + * Instance of com.sksamuel.elastic4s.requests.mappings.PutMappingRequest. */ def putMapping(index: String, vecField: String, storedIdField: String, vecMapping: Mapping): PutMappingRequest = { val mappingJsonString = diff --git a/elastiknn-plugin-integration-tests/src/test/scala/com/klibisz/elastiknn/RecallSuite.scala b/elastiknn-plugin-integration-tests/src/test/scala/com/klibisz/elastiknn/RecallSuite.scala index 789cabac0..a5c29311c 100644 --- a/elastiknn-plugin-integration-tests/src/test/scala/com/klibisz/elastiknn/RecallSuite.scala +++ b/elastiknn-plugin-integration-tests/src/test/scala/com/klibisz/elastiknn/RecallSuite.scala @@ -13,15 +13,14 @@ import java.util.UUID import scala.concurrent.Future import scala.util.hashing.MurmurHash3.orderedHash -/** Tests for recall regressions for all of the mappings and their queries using random vectors. - * There are some subtleties: - * - Recall is evaluated based on the scores returned, not the ids, to account for cases where multiple vectors could - * have the same score relative a query vector. - * - Using more shards will generally increase recall for LSH queries because candidates are evaluated per _segment_. - * Each shard can have a non-specific number of segments but we merge each shard to a specific number. - * - Repeated query results against the same index should be deterministic. However if you re-index the data and run - * the same query, I have seen different results at times. This seems to be an effect at the Elasticsearch level. - * I've tested at the Lucene (sans ES) level and that seems to be reliably deterministic. +/** Tests for recall regressions for all of the mappings and their queries using random vectors. There are some subtleties: + * - Recall is evaluated based on the scores returned, not the ids, to account for cases where multiple vectors could have the same score + * relative a query vector. + * - Using more shards will generally increase recall for LSH queries because candidates are evaluated per _segment_. Each shard can have + * a non-specific number of segments but we merge each shard to a specific number. + * - Repeated query results against the same index should be deterministic. However if you re-index the data and run the same query, I + * have seen different results at times. This seems to be an effect at the Elasticsearch level. I've tested at the Lucene (sans ES) + * level and that seems to be reliably deterministic. */ class RecallSuite extends AsyncFunSuite with Matchers with ElasticAsyncClient with AsyncCancelAfterFailure { @@ -139,7 +138,7 @@ class RecallSuite extends AsyncFunSuite with Matchers with ElasticAsyncClient wi NearestNeighborsQuery.Exact(vecField, Similarity.L2) -> 1d, NearestNeighborsQuery.Exact(vecField, Similarity.Cosine) -> 1d, NearestNeighborsQuery.PermutationLsh(vecField, Similarity.Cosine, 200) -> 0.31, - NearestNeighborsQuery.PermutationLsh(vecField, Similarity.L2, 200) -> 0.3, + NearestNeighborsQuery.PermutationLsh(vecField, Similarity.L2, 200) -> 0.3 ), // TODO: This one seems to be more sensitive for some unknown reason. recallTolerance = 5e-2 diff --git a/elastiknn-plugin-integration-tests/src/test/scala/com/klibisz/elastiknn/VectorMapperSuite.scala b/elastiknn-plugin-integration-tests/src/test/scala/com/klibisz/elastiknn/VectorMapperSuite.scala index ae30e35aa..26b6054c7 100644 --- a/elastiknn-plugin-integration-tests/src/test/scala/com/klibisz/elastiknn/VectorMapperSuite.scala +++ b/elastiknn-plugin-integration-tests/src/test/scala/com/klibisz/elastiknn/VectorMapperSuite.scala @@ -58,7 +58,7 @@ class VectorMapperSuite extends AsyncFreeSpec with Matchers with Elastic4sMatche // } // } // } - //} + // } res.body shouldBe defined val json = parse(res.body.get) diff --git a/elastiknn-plugin/src/main/scala/com/klibisz/elastiknn/query/ElasticsearchQueryBuilder.scala b/elastiknn-plugin/src/main/scala/com/klibisz/elastiknn/query/ElasticsearchQueryBuilder.scala index cb2e2ab64..5dde38c2b 100644 --- a/elastiknn-plugin/src/main/scala/com/klibisz/elastiknn/query/ElasticsearchQueryBuilder.scala +++ b/elastiknn-plugin/src/main/scala/com/klibisz/elastiknn/query/ElasticsearchQueryBuilder.scala @@ -47,8 +47,7 @@ object ElasticsearchQueryBuilder { // Used by rewriteGetVector to asynchronously provide an ElasticsearchQueryBuilder for an indexed vector. // The doRewrite method will delay indefinitely until until the query builder is provided. - private final class IndexedVectorQueryBuilder(indexedVec: Vec.Indexed) - extends AbstractQueryBuilder[IndexedVectorQueryBuilder] { + private final class IndexedVectorQueryBuilder(indexedVec: Vec.Indexed) extends AbstractQueryBuilder[IndexedVectorQueryBuilder] { private val ref = new AtomicReference[ElasticsearchQueryBuilder](null) diff --git a/elastiknn-plugin/src/main/scala/com/klibisz/elastiknn/query/HashingQuery.scala b/elastiknn-plugin/src/main/scala/com/klibisz/elastiknn/query/HashingQuery.scala index 5ec8e45f4..be189ab1b 100644 --- a/elastiknn-plugin/src/main/scala/com/klibisz/elastiknn/query/HashingQuery.scala +++ b/elastiknn-plugin/src/main/scala/com/klibisz/elastiknn/query/HashingQuery.scala @@ -36,9 +36,9 @@ final class HashingQuery[V <: Vec, S <: StoredVec: Decoder]( ) } - /** Note that this score function does not re-score the top candidates. - * The final score produced is `(max possible score for this similarity * (number of matching hashes / total number of hashes)`. - * This is necessary because a ScoreFunction can only evaluate one doc at a time and must immediately score it. + /** Note that this score function does not re-score the top candidates. The final score produced is `(max possible score for this + * similarity * (number of matching hashes / total number of hashes)`. This is necessary because a ScoreFunction can only evaluate one + * doc at a time and must immediately score it. */ override def toScoreFunction(indexReader: IndexReader): ScoreFunction = { diff --git a/elastiknn-plugin/src/main/scala/com/klibisz/elastiknn/storage/StoredVec.scala b/elastiknn-plugin/src/main/scala/com/klibisz/elastiknn/storage/StoredVec.scala index c7065c772..9c64c3427 100644 --- a/elastiknn-plugin/src/main/scala/com/klibisz/elastiknn/storage/StoredVec.scala +++ b/elastiknn-plugin/src/main/scala/com/klibisz/elastiknn/storage/StoredVec.scala @@ -4,25 +4,22 @@ import java.util import com.klibisz.elastiknn.api.Vec - -/** Abstraction for different vector storage layouts and typeclasses for encoding/decoding them. - * This is decoupled from the api Vec case classes so we can support various optimizations that might change the - * interface, e.g. streaming the vectors in a read-once fashion. Currently the fastest storage methods support roughly - * the same interface. +/** Abstraction for different vector storage layouts and typeclasses for encoding/decoding them. This is decoupled from the api Vec case + * classes so we can support various optimizations that might change the interface, e.g. streaming the vectors in a read-once fashion. + * Currently the fastest storage methods support roughly the same interface. * - * The current default serialization method is using sun.misc.Unsafe to eek out the best possible performance. - * The implementation is based mostly on the Kryo library's use of sun.misc.Unsafe. - * Many other options were considered: - * - fast-serialization library with unsafe configuration - roughly same as using Unsafe. - * - kryo library with unsafe input/output - a bit slower than fast-serialization and bare Unsafe. - * - java.io.ObjectOutput/InputStreams - 30-40% slower than Unsafe, but by far the best vanilla JVM solution. - * - protocol buffers - roughly same as ObjectOutput/InputStreams, but with smaller byte array sizes; the size - * doesn't seem to matter as it's compressed by ES anyway. - * - java.io.DataOutput/InputStreams - far slower. - * - scodec - far slower. + * The current default serialization method is using sun.misc.Unsafe to eek out the best possible performance. The implementation is based + * mostly on the Kryo library's use of sun.misc.Unsafe. Many other options were considered: + * - fast-serialization library with unsafe configuration - roughly same as using Unsafe. + * - kryo library with unsafe input/output - a bit slower than fast-serialization and bare Unsafe. + * - java.io.ObjectOutput/InputStreams - 30-40% slower than Unsafe, but by far the best vanilla JVM solution. + * - protocol buffers - roughly same as ObjectOutput/InputStreams, but with smaller byte array sizes; the size doesn't seem to matter as + * it's compressed by ES anyway. + * - java.io.DataOutput/InputStreams - far slower. + * - scodec - far slower. * - * Anything using Unsafe comes with the tradeoff that it requires extra JVM security permissions. - * If this becomes a problem we should likely switch to ObjectOutput/InputStreams. + * Anything using Unsafe comes with the tradeoff that it requires extra JVM security permissions. If this becomes a problem we should + * likely switch to ObjectOutput/InputStreams. */ sealed trait StoredVec diff --git a/project/plugins.sbt b/project/plugins.sbt index 06185fbe1..583d2d525 100644 --- a/project/plugins.sbt +++ b/project/plugins.sbt @@ -1,3 +1,5 @@ addSbtPlugin("pl.project13.scala" % "sbt-jmh" % "0.4.6") addSbtPlugin("org.typelevel" % "sbt-tpolecat" % "0.5.0") +addSbtPlugin("org.scalameta" % "sbt-scalafmt" % "2.5.2") +