Build: add sbt-scalafmt and linting check in CI workflow (#594)
alexklibisz authored Nov 20, 2023
1 parent 1a1a561 commit bf38104
Showing 14 changed files with 125 additions and 86 deletions.
3 changes: 3 additions & 0 deletions .github/workflows/ci.yaml
@@ -40,6 +40,9 @@ jobs:
- name: Increase MMAP Limits
timeout-minutes: 1
run: sudo sysctl -w vm.max_map_count=262144
- name: Lint
timeout-minutes: 5
run: task jvmLint
- name: Compile
timeout-minutes: 5
run: task jvmCompile
1 change: 1 addition & 0 deletions .scalafmt.conf
@@ -1,2 +1,3 @@
version = "3.7.15"
maxColumn = 140
runner.dialect = scala213
5 changes: 5 additions & 0 deletions Taskfile.yaml
@@ -160,6 +160,11 @@ tasks:
cmds:
- sbt -client "compile; Test/compile"

jvmLint:
desc: Lint the JVM code
cmds:
- sbt -client scalafmtCheckAll

jvmAssemble:
desc: Build the plugin bundle using SBT
cmds:
@@ -18,4 +18,4 @@ object Mapping {
final case class L2Lsh(dims: Int, L: Int, k: Int, w: Int) extends Mapping

final case class PermutationLsh(dims: Int, k: Int, repeating: Boolean) extends Mapping
}
}
@@ -53,10 +53,9 @@ object NearestNeighborsQuery {
override def similarity: Similarity = Similarity.L2
}

final case class PermutationLsh(field: String, similarity: Similarity, candidates: Int, vec: Vec = Vec.Empty())
extends ApproximateQuery {
final case class PermutationLsh(field: String, similarity: Similarity, candidates: Int, vec: Vec = Vec.Empty()) extends ApproximateQuery {
override def withVec(v: Vec): NearestNeighborsQuery = copy(vec = v)

override def withCandidates(candidates: Int): ApproximateQuery = copy(candidates = candidates)
}
}
}
@@ -27,7 +27,7 @@ object Vec {

override def equals(other: Any): Boolean = other match {
case other: SparseBool => (trueIndices sameElements other.trueIndices) && totalIndices == other.totalIndices
case _ => false
case _ => false
}

override def toString: String = s"SparseBool(${trueIndices.take(3).mkString(",")},...,${trueIndices.length}/$totalIndices)"
@@ -50,7 +50,7 @@
final case class DenseFloat(values: Array[Float]) extends Vec with KnownDims {
override def equals(other: Any): Boolean = other match {
case other: DenseFloat => other.values sameElements values
case _ => false
case _ => false
}

override def toString: String = s"DenseFloat(${values.take(3).map(n => f"$n%.2f").mkString(",")},...,${values.length})"
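The equals overrides above use sameElements because JVM arrays compare by reference, not by content. A minimal standalone sketch (plain Scala, independent of the elastiknn Vec types) of why the override is needed:

object ArrayEqualityDemo extends App {
  val a = Array(1.0f, 2.0f, 3.0f)
  val b = Array(1.0f, 2.0f, 3.0f)

  // On the JVM, == on arrays is reference equality, so two arrays with
  // identical contents are not equal to each other.
  println(a == b)           // false
  println(a sameElements b) // true

  // A case class wrapping an array therefore needs a custom equals,
  // just like DenseFloat and SparseBool above.
  final case class Wrapped(values: Array[Float]) {
    override def equals(other: Any): Boolean = other match {
      case Wrapped(vs) => vs sameElements values
      case _           => false
    }
  }
  println(Wrapped(a) == Wrapped(b)) // true
}
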
@@ -34,24 +34,34 @@ trait ElastiknnClient[F[_]] extends AutoCloseable {
execute(ElastiknnRequests.putMapping(index, vecField, storedIdField, vecMapping))

/** Create an index with recommended defaults.
* @param index The index name.
* @param shards How many shards, 1 by default.
* @param replicas How many replicas, 1 by default.
* @param elastiknn Value for `index.elastiknn` setting, true by default.
* @return CreateIndexResponse
* @param index
* The index name.
* @param shards
* How many shards, 1 by default.
* @param replicas
* How many replicas, 1 by default.
* @param elastiknn
* Value for `index.elastiknn` setting, true by default.
* @return
* CreateIndexResponse
*/
def createIndex(index: String, shards: Int = 1, replicas: Int = 0): F[Response[CreateIndexResponse]] =
execute(ElasticDsl.createIndex(index).shards(shards).replicas(replicas))

/** Index a batch of vectors as new Elasticsearch docs, one doc per vector.
* Also see ElastiknnRequests.index().
/** Index a batch of vectors as new Elasticsearch docs, one doc per vector. Also see ElastiknnRequests.index().
*
* @param index Index where vectors are stored.
* @param vecField Field in each doc where vector is stored.
* @param vecs Sequence of vectors to store.
* @param storedIdField Field in each doc where ID is stored as a doc value.
* @param ids Sequence of ids. Assumed one-to-one correspondence to given vectors.
* @return Response containing BulkResponse containing indexing responses.
* @param index
* Index where vectors are stored.
* @param vecField
* Field in each doc where vector is stored.
* @param vecs
* Sequence of vectors to store.
* @param storedIdField
* Field in each doc where ID is stored as a doc value.
* @param ids
* Sequence of ids. Assumed one-to-one correspondence to given vectors.
* @return
* Response containing BulkResponse containing indexing responses.
*/
def index(index: String, vecField: String, vecs: Seq[Vec], storedIdField: String, ids: Seq[String]): F[Response[BulkResponse]] = {
val reqs = vecs.zip(ids).map { case (vec, id) =>
@@ -93,10 +103,14 @@ object ElastiknnClient {
final case class StrictFailureException(message: String, cause: Throwable = None.orNull) extends RuntimeException(message, cause)

/** Build an [[ElastiknnFutureClient]] from an elasticsearch RestClient.
* @param restClient The Elasticsearch RestClient, configured how you like, e.g. connected to multiple nodes.
* @param strictFailure If true, convert non-200 responses and bulk responses containing any failure to a failed Future.
* @param ec ExecutionContext where requests are executed and responses processed.
* @return [[ElastiknnFutureClient]]
* @param restClient
* The Elasticsearch RestClient, configured how you like, e.g. connected to multiple nodes.
* @param strictFailure
* If true, convert non-200 responses and bulk responses containing any failure to a failed Future.
* @param ec
* ExecutionContext where requests are executed and responses processed.
* @return
* [[ElastiknnFutureClient]]
*/
def futureClient(restClient: RestClient, strictFailure: Boolean)(implicit ec: ExecutionContext): ElastiknnFutureClient = {
val jc: JavaClient = new JavaClient(restClient)
@@ -121,12 +135,18 @@
}

/** Build an [[ElastiknnFutureClient]] that connects to a single host
* @param host Elasticsearch host.
* @param port Elasticsearch port.
* @param timeoutMillis Amount of time to wait for a response.
* @param strictFailure If true, convert non-2xx responses and bulk responses containing any failure to a failed Future.
* @param ec ExecutionContext where requests are executed and responses processed.
* @return [[ElastiknnFutureClient]]
* @param host
* Elasticsearch host.
* @param port
* Elasticsearch port.
* @param timeoutMillis
* Amount of time to wait for a response.
* @param strictFailure
* If true, convert non-2xx responses and bulk responses containing any failure to a failed Future.
* @param ec
* ExecutionContext where requests are executed and responses processed.
* @return
* [[ElastiknnFutureClient]]
*/
def futureClient(host: String = "localhost", port: Int = 9200, strictFailure: Boolean = true, timeoutMillis: Int = 30000)(implicit
ec: ExecutionContext
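To tie these client methods together, here is a hedged usage sketch: build a future-based client, create an index, apply a vector mapping, and bulk-index random vectors. The package names (com.klibisz.elastiknn.api, com.klibisz.elastiknn.client), the client-level putMapping method implied by the snippet above, and all parameter values are illustrative assumptions rather than details confirmed by this diff.

import com.klibisz.elastiknn.api.{Mapping, Vec}
import com.klibisz.elastiknn.client.ElastiknnClient

import scala.concurrent.ExecutionContext.Implicits.global
import scala.concurrent.duration._
import scala.concurrent.{Await, Future}
import scala.util.Random

object ClientSketch extends App {
  // Single-node future-based client; strictFailure converts non-2xx responses
  // and partial bulk failures into failed Futures.
  val client = ElastiknnClient.futureClient(host = "localhost", port = 9200, strictFailure = true)

  val index = "example-vectors"
  val vecField = "vec"
  val storedIdField = "id"
  val dims = 128

  // Random dense vectors and their ids (one-to-one correspondence).
  val vecs = (1 to 100).map(_ => Vec.DenseFloat(Array.fill(dims)(Random.nextFloat())))
  val ids = (1 to 100).map(i => s"v$i")

  val workflow: Future[Unit] = for {
    _ <- client.createIndex(index, shards = 1, replicas = 0)
    // putMapping is assumed to wrap ElastiknnRequests.putMapping, as suggested above.
    // L2Lsh parameters are placeholders; tune them for real data.
    _ <- client.putMapping(index, vecField, storedIdField, Mapping.L2Lsh(dims = dims, L = 99, k = 1, w = 3))
    _ <- client.index(index, vecField, vecs, storedIdField, ids)
  } yield ()

  Await.result(workflow, 1.minute)
  client.close()
}
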
@@ -8,37 +8,46 @@ import com.sksamuel.elastic4s.requests.mappings.PutMappingRequest
import com.sksamuel.elastic4s.requests.searches.SearchRequest
import com.sksamuel.elastic4s.{ElasticDsl, Indexes}

/** Methods for creating Elastic4s requests for common elastiknn tasks.
* Methods are optimized for documents containing only a vector and running searches as quickly as possible.
* For example, we store the ID as a doc value instead of using the default stored ID which is slower to access.
* I am open to creating less-optimized, more-convenient methods in the future.
/** Methods for creating Elastic4s requests for common elastiknn tasks. Methods are optimized for documents containing only a vector and
* running searches as quickly as possible. For example, we store the ID as a doc value instead of using the default stored ID which is
* slower to access. I am open to creating less-optimized, more-convenient methods in the future.
*/
trait ElastiknnRequests {

/** Create a request for indexing a vector.
*
* @param index Name of the index.
* @param vecField Field where vector is stored.
* @param vec Vector to index.
* @param storedIdField Field where document ID is stored.
* @param id Document ID. Stored as the ID known by Elasticsearch, and in the document for faster retrieval.
* @return Instance of a com.sksamuel.elastic4s.requests.indexes.IndexRequest.
* @param index
* Name of the index.
* @param vecField
* Field where vector is stored.
* @param vec
* Vector to index.
* @param storedIdField
* Field where document ID is stored.
* @param id
* Document ID. Stored as the ID known by Elasticsearch, and in the document for faster retrieval.
* @return
* Instance of a com.sksamuel.elastic4s.requests.indexes.IndexRequest.
*/
def index(index: String, vecField: String, vec: Vec, storedIdField: String, id: String): IndexRequest = {
val xcb = XContentFactory.jsonBuilder().rawField(vecField, XContentCodec.encodeUnsafeToString(vec)).field(storedIdField, id)
IndexRequest(index, source = Some(JacksonBuilder.writeAsString(xcb.value)), id = Some(id))
}

/** Create a request for running a nearest neighbors query.
* Optimized for high performance, so it returns the document ID in the body.
* Sets the preference parameter (see: https://www.elastic.co/guide/en/elasticsearch/reference/master/consistent-scoring.html)
* as the hash of the query for more deterministic results.
/** Create a request for running a nearest neighbors query. Optimized for high performance, so it returns the document ID in the body.
* Sets the preference parameter (see: https://www.elastic.co/guide/en/elasticsearch/reference/master/consistent-scoring.html) as the
* hash of the query for more deterministic results.
*
* @param index Index being searched against.
* @param query Constructed query, containing the vector, field, etc.
* @param k Number of results to return.
* @param storedIdField Field containing the document ID. See ElastiknnRequests.index() method.
* @return Instance of com.sksamuel.elastic4s.requests.searches.SearchRequest.
* @param index
* Index being searched against.
* @param query
* Constructed query, containing the vector, field, etc.
* @param k
* Number of results to return.
* @param storedIdField
* Field containing the document ID. See ElastiknnRequests.index() method.
* @return
* Instance of com.sksamuel.elastic4s.requests.searches.SearchRequest.
*/
def nearestNeighbors(index: String, query: NearestNeighborsQuery, k: Int, storedIdField: String): SearchRequest =
ElasticDsl
@@ -51,11 +60,16 @@

/** Create a mapping containing a vector field and a stored ID field.
*
* @param index Index to which this mapping is applied.
* @param vecField Field where vector is stored.
* @param storedIdField Field where ID is stored.
* @param vecMapping Mapping for the stored vector.
* @return Instance of com.sksamuel.elastic4s.requests.mappings.PutMappingRequest.
* @param index
* Index to which this mapping is applied.
* @param vecField
* Field where vector is stored.
* @param storedIdField
* Field where ID is stored.
* @param vecMapping
* Mapping for the stored vector.
* @return
* Instance of com.sksamuel.elastic4s.requests.mappings.PutMappingRequest.
*/
def putMapping(index: String, vecField: String, storedIdField: String, vecMapping: Mapping): PutMappingRequest = {
val mappingJsonString =
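For context on how these builders compose, here is a hedged sketch that constructs the three request types documented above. Package names and parameter values are illustrative assumptions, and the resulting requests would normally be passed to an elastic4s client's execute method.

import com.klibisz.elastiknn.api.{Mapping, NearestNeighborsQuery, Similarity, Vec}
import com.klibisz.elastiknn.client.ElastiknnRequests

import scala.util.Random

object RequestsSketch extends App {
  val index = "example-vectors"
  val vecField = "vec"
  val storedIdField = "id"
  val dims = 128

  // Mapping for the vector field; the k and repeating values are arbitrary placeholders.
  val mapping = Mapping.PermutationLsh(dims = dims, k = 64, repeating = true)
  val putMappingReq = ElastiknnRequests.putMapping(index, vecField, storedIdField, mapping)

  // One IndexRequest per vector; the id is stored both as the ES doc ID and
  // as a doc value for faster retrieval, as described above.
  val vec = Vec.DenseFloat(Array.fill(dims)(Random.nextFloat()))
  val indexReq = ElastiknnRequests.index(index, vecField, vec, storedIdField, id = "v1")

  // A nearest-neighbors search returning the top 10 hits by Cosine similarity.
  val queryVec = Vec.DenseFloat(Array.fill(dims)(Random.nextFloat()))
  val query = NearestNeighborsQuery.PermutationLsh(vecField, Similarity.Cosine, candidates = 200, vec = queryVec)
  val searchReq = ElastiknnRequests.nearestNeighbors(index, query, k = 10, storedIdField)

  println(Seq(putMappingReq, indexReq, searchReq).mkString("\n"))
}
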
@@ -13,15 +13,14 @@ import java.util.UUID
import scala.concurrent.Future
import scala.util.hashing.MurmurHash3.orderedHash

/** Tests for recall regressions for all of the mappings and their queries using random vectors.
* There are some subtleties:
* - Recall is evaluated based on the scores returned, not the ids, to account for cases where multiple vectors could
* have the same score relative to a query vector.
* - Using more shards will generally increase recall for LSH queries because candidates are evaluated per _segment_.
* Each shard can have a non-specific number of segments but we merge each shard to a specific number.
* - Repeated query results against the same index should be deterministic. However if you re-index the data and run
* the same query, I have seen different results at times. This seems to be an effect at the Elasticsearch level.
* I've tested at the Lucene (sans ES) level and that seems to be reliably deterministic.
/** Tests for recall regressions for all of the mappings and their queries using random vectors. There are some subtleties:
* - Recall is evaluated based on the scores returned, not the ids, to account for cases where multiple vectors could have the same score
* relative to a query vector.
* - Using more shards will generally increase recall for LSH queries because candidates are evaluated per _segment_. Each shard can have
* a non-specific number of segments but we merge each shard to a specific number.
* - Repeated query results against the same index should be deterministic. However if you re-index the data and run the same query, I
* have seen different results at times. This seems to be an effect at the Elasticsearch level. I've tested at the Lucene (sans ES)
* level and that seems to be reliably deterministic.
*/
class RecallSuite extends AsyncFunSuite with Matchers with ElasticAsyncClient with AsyncCancelAfterFailure {

@@ -139,7 +138,7 @@ class RecallSuite extends AsyncFunSuite with Matchers with ElasticAsyncClient wi
NearestNeighborsQuery.Exact(vecField, Similarity.L2) -> 1d,
NearestNeighborsQuery.Exact(vecField, Similarity.Cosine) -> 1d,
NearestNeighborsQuery.PermutationLsh(vecField, Similarity.Cosine, 200) -> 0.31,
NearestNeighborsQuery.PermutationLsh(vecField, Similarity.L2, 200) -> 0.3,
NearestNeighborsQuery.PermutationLsh(vecField, Similarity.L2, 200) -> 0.3
),
// TODO: This one seems to be more sensitive for some unknown reason.
recallTolerance = 5e-2
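As a small illustration of the first bullet in the docstring above — evaluating recall on scores rather than ids so that tied scores are not penalized — here is a self-contained sketch in plain Scala. It is only a sketch of the idea, not the suite's actual implementation, and it compares scores exactly where a real test would use a tolerance.

object RecallFromScores {
  // Recall of approxScores against exactScores, comparing scores rather than ids
  // so that several vectors at the same distance from the query are interchangeable.
  def recall(exactScores: Seq[Double], approxScores: Seq[Double]): Double = {
    // Count how many approximate scores are covered by the exact scores,
    // consuming each exact score at most once to respect multiplicities.
    val exactCounts = scala.collection.mutable.Map.empty[Double, Int].withDefaultValue(0)
    exactScores.foreach(s => exactCounts(s) += 1)
    val hits = approxScores.count { s =>
      if (exactCounts(s) > 0) { exactCounts(s) -= 1; true } else false
    }
    hits.toDouble / exactScores.length
  }

  def main(args: Array[String]): Unit = {
    val exact = Seq(1.0, 0.9, 0.9, 0.8)  // two vectors tie at 0.9
    val approx = Seq(1.0, 0.9, 0.9, 0.7) // the 0.7 result is a miss
    println(recall(exact, approx))       // 0.75
  }
}
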
@@ -58,7 +58,7 @@ class VectorMapperSuite extends AsyncFreeSpec with Matchers with Elastic4sMatche
// }
// }
// }
//}
// }

res.body shouldBe defined
val json = parse(res.body.get)
@@ -47,8 +47,7 @@ object ElasticsearchQueryBuilder {

// Used by rewriteGetVector to asynchronously provide an ElasticsearchQueryBuilder for an indexed vector.
// The doRewrite method will delay indefinitely until the query builder is provided.
private final class IndexedVectorQueryBuilder(indexedVec: Vec.Indexed)
extends AbstractQueryBuilder[IndexedVectorQueryBuilder] {
private final class IndexedVectorQueryBuilder(indexedVec: Vec.Indexed) extends AbstractQueryBuilder[IndexedVectorQueryBuilder] {

private val ref = new AtomicReference[ElasticsearchQueryBuilder](null)

@@ -36,9 +36,9 @@ final class HashingQuery[V <: Vec, S <: StoredVec: Decoder](
)
}

/** Note that this score function does not re-score the top candidates.
* The final score produced is `(max possible score for this similarity) * (number of matching hashes / total number of hashes)`.
* This is necessary because a ScoreFunction can only evaluate one doc at a time and must immediately score it.
/** Note that this score function does not re-score the top candidates. The final score produced is `(max possible score for this
* similarity) * (number of matching hashes / total number of hashes)`. This is necessary because a ScoreFunction can only evaluate one
* doc at a time and must immediately score it.
*/
override def toScoreFunction(indexReader: IndexReader): ScoreFunction = {

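The score described in that comment is just the similarity's maximum possible score scaled by the fraction of matching hashes. A minimal sketch of the arithmetic, with illustrative names rather than the plugin's internals:

object HashingScoreSketch extends App {
  // Approximate score: the similarity's maximum possible score scaled by the
  // fraction of the query's hashes that the document matches.
  def approximateScore(maxPossibleScore: Double, matchingHashes: Int, totalHashes: Int): Double =
    maxPossibleScore * (matchingHashes.toDouble / totalHashes)

  // E.g. for a similarity whose best score is 1.0, matching 30 of 40 hashes:
  println(approximateScore(1.0, 30, 40)) // 0.75
}
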
@@ -4,25 +4,22 @@ import java.util

import com.klibisz.elastiknn.api.Vec


/** Abstraction for different vector storage layouts and typeclasses for encoding/decoding them.
* This is decoupled from the api Vec case classes so we can support various optimizations that might change the
* interface, e.g. streaming the vectors in a read-once fashion. Currently the fastest storage methods support roughly
* the same interface.
/** Abstraction for different vector storage layouts and typeclasses for encoding/decoding them. This is decoupled from the api Vec case
* classes so we can support various optimizations that might change the interface, e.g. streaming the vectors in a read-once fashion.
* Currently the fastest storage methods support roughly the same interface.
*
* The current default serialization method is using sun.misc.Unsafe to eke out the best possible performance.
* The implementation is based mostly on the Kryo library's use of sun.misc.Unsafe.
* Many other options were considered:
* - fast-serialization library with unsafe configuration - roughly same as using Unsafe.
* - kryo library with unsafe input/output - a bit slower than fast-serialization and bare Unsafe.
* - java.io.ObjectOutput/InputStreams - 30-40% slower than Unsafe, but by far the best vanilla JVM solution.
* - protocol buffers - roughly same as ObjectOutput/InputStreams, but with smaller byte array sizes; the size
* doesn't seem to matter as it's compressed by ES anyway.
* - java.io.DataOutput/InputStreams - far slower.
* - scodec - far slower.
* The current default serialization method is using sun.misc.Unsafe to eke out the best possible performance. The implementation is based
* mostly on the Kryo library's use of sun.misc.Unsafe. Many other options were considered:
* - fast-serialization library with unsafe configuration - roughly same as using Unsafe.
* - kryo library with unsafe input/output - a bit slower than fast-serialization and bare Unsafe.
* - java.io.ObjectOutput/InputStreams - 30-40% slower than Unsafe, but by far the best vanilla JVM solution.
* - protocol buffers - roughly same as ObjectOutput/InputStreams, but with smaller byte array sizes; the size doesn't seem to matter as
* it's compressed by ES anyway.
* - java.io.DataOutput/InputStreams - far slower.
* - scodec - far slower.
*
* Anything using Unsafe comes with the tradeoff that it requires extra JVM security permissions.
* If this becomes a problem we should likely switch to ObjectOutput/InputStreams.
* Anything using Unsafe comes with the tradeoff that it requires extra JVM security permissions. If this becomes a problem we should
* likely switch to ObjectOutput/InputStreams.
*/
sealed trait StoredVec

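For reference on the tradeoffs listed above, here is a minimal sketch of the java.io.ObjectOutputStream/ObjectInputStream round trip mentioned as the best vanilla-JVM option. It is not the plugin's Unsafe-based implementation, just an illustration of the simpler alternative.

import java.io.{ByteArrayInputStream, ByteArrayOutputStream, ObjectInputStream, ObjectOutputStream}

object VanillaSerializationSketch extends App {
  // Serialize a float array to bytes using plain Java object serialization.
  def encode(values: Array[Float]): Array[Byte] = {
    val baos = new ByteArrayOutputStream()
    val oos = new ObjectOutputStream(baos)
    try oos.writeObject(values)
    finally oos.close()
    baos.toByteArray
  }

  // Deserialize the bytes back into a float array.
  def decode(bytes: Array[Byte]): Array[Float] = {
    val ois = new ObjectInputStream(new ByteArrayInputStream(bytes))
    try ois.readObject().asInstanceOf[Array[Float]]
    finally ois.close()
  }

  val original = Array.tabulate(8)(_.toFloat)
  val roundTripped = decode(encode(original))
  println(original.sameElements(roundTripped)) // true
}
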
2 changes: 2 additions & 0 deletions project/plugins.sbt
@@ -1,3 +1,5 @@
addSbtPlugin("pl.project13.scala" % "sbt-jmh" % "0.4.6")
addSbtPlugin("org.typelevel" % "sbt-tpolecat" % "0.5.0")
addSbtPlugin("org.scalameta" % "sbt-scalafmt" % "2.5.2")

