Build: add sbt-scalafmt and linting check in CI workflow (#594)
alexklibisz authored Nov 20, 2023
1 parent 1a1a561 commit bf38104
Showing 14 changed files with 125 additions and 86 deletions.
3 changes: 3 additions & 0 deletions .github/workflows/ci.yaml
@@ -40,6 +40,9 @@ jobs:
- name: Increase MMAP Limits
timeout-minutes: 1
run: sudo sysctl -w vm.max_map_count=262144
- name: Lint
timeout-minutes: 5
run: task jvmLint
- name: Compile
timeout-minutes: 5
run: task jvmCompile
1 change: 1 addition & 0 deletions .scalafmt.conf
@@ -1,2 +1,3 @@
version = "3.7.15"
maxColumn = 140
runner.dialect = scala213
5 changes: 5 additions & 0 deletions Taskfile.yaml
@@ -160,6 +160,11 @@ tasks:
cmds:
- sbt -client "compile; Test/compile"

jvmLint:
desc: Lint the JVM code
cmds:
- sbt -client scalafmtCheckAll

jvmAssemble:
desc: Build the plugin bundle using SBT
cmds:
@@ -18,4 +18,4 @@ object Mapping {
final case class L2Lsh(dims: Int, L: Int, k: Int, w: Int) extends Mapping

final case class PermutationLsh(dims: Int, k: Int, repeating: Boolean) extends Mapping
}
}
@@ -53,10 +53,9 @@ object NearestNeighborsQuery {
override def similarity: Similarity = Similarity.L2
}

final case class PermutationLsh(field: String, similarity: Similarity, candidates: Int, vec: Vec = Vec.Empty())
extends ApproximateQuery {
final case class PermutationLsh(field: String, similarity: Similarity, candidates: Int, vec: Vec = Vec.Empty()) extends ApproximateQuery {
override def withVec(v: Vec): NearestNeighborsQuery = copy(vec = v)

override def withCandidates(candidates: Int): ApproximateQuery = copy(candidates = candidates)
}
}
}
@@ -27,7 +27,7 @@ object Vec {

override def equals(other: Any): Boolean = other match {
case other: SparseBool => (trueIndices sameElements other.trueIndices) && totalIndices == other.totalIndices
case _ => false
case _ => false
}

override def toString: String = s"SparseBool(${trueIndices.take(3).mkString(",")},...,${trueIndices.length}/$totalIndices)"
@@ -50,7 +50,7 @@
final case class DenseFloat(values: Array[Float]) extends Vec with KnownDims {
override def equals(other: Any): Boolean = other match {
case other: DenseFloat => other.values sameElements values
case _ => false
case _ => false
}

override def toString: String = s"DenseFloat(${values.take(3).map(n => f"$n%.2f").mkString(",")},...,${values.length})"
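The equals overrides above use sameElements because JVM arrays compare by reference, not by content. A minimal standalone sketch (plain Scala, independent of the elastiknn Vec types) of why the override is needed:

object ArrayEqualityDemo extends App {
  val a = Array(1.0f, 2.0f, 3.0f)
  val b = Array(1.0f, 2.0f, 3.0f)

  // On the JVM, == on arrays is reference equality, so two arrays with
  // identical contents are not equal to each other.
  println(a == b)           // false
  println(a sameElements b) // true

  // A case class wrapping an array therefore needs a custom equals,
  // just like DenseFloat and SparseBool above.
  final case class Wrapped(values: Array[Float]) {
    override def equals(other: Any): Boolean = other match {
      case Wrapped(vs) => vs sameElements values
      case _           => false
    }
  }
  println(Wrapped(a) == Wrapped(b)) // true
}
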
@@ -34,24 +34,34 @@ trait ElastiknnClient[F[_]] extends AutoCloseable {
execute(ElastiknnRequests.putMapping(index, vecField, storedIdField, vecMapping))

/** Create an index with recommended defaults.
* @param index The index name.
* @param shards How many shards, 1 by default.
* @param replicas How many replicas, 1 by default.
* @param elastiknn Value for `index.elastiknn` setting, true by default.
* @return CreateIndexResponse
* @param index
* The index name.
* @param shards
* How many shards, 1 by default.
* @param replicas
* How many replicas, 1 by default.
* @param elastiknn
* Value for `index.elastiknn` setting, true by default.
* @return
* CreateIndexResponse
*/
def createIndex(index: String, shards: Int = 1, replicas: Int = 0): F[Response[CreateIndexResponse]] =
execute(ElasticDsl.createIndex(index).shards(shards).replicas(replicas))

/** Index a batch of vectors as new Elasticsearch docs, one doc per vector.
* Also see ElastiknnRequests.index().
/** Index a batch of vectors as new Elasticsearch docs, one doc per vector. Also see ElastiknnRequests.index().
*
* @param index Index where vectors are stored.
* @param vecField Field in each doc where vector is stored.
* @param vecs Sequence of vectors to store.
* @param storedIdField Field in each doc where ID is stored as a doc value.
* @param ids Sequence of ids. Assumed one-to-one correspondence to given vectors.
* @return Response containing BulkResponse containing indexing responses.
* @param index
* Index where vectors are stored.
* @param vecField
* Field in each doc where vector is stored.
* @param vecs
* Sequence of vectors to store.
* @param storedIdField
* Field in each doc where ID is stored as a doc value.
* @param ids
* Sequence of ids. Assumed one-to-one correspondence to given vectors.
* @return
* Response containing BulkResponse containing indexing responses.
*/
def index(index: String, vecField: String, vecs: Seq[Vec], storedIdField: String, ids: Seq[String]): F[Response[BulkResponse]] = {
val reqs = vecs.zip(ids).map { case (vec, id) =>
@@ -93,10 +103,14 @@ object ElastiknnClient {
final case class StrictFailureException(message: String, cause: Throwable = None.orNull) extends RuntimeException(message, cause)

/** Build an [[ElastiknnFutureClient]] from an elasticsearch RestClient.
* @param restClient The Elasticsearch RestClient, configured how you like, e.g. connected to multiple nodes.
* @param strictFailure If true, convert non-200 responses and bulk responses containing any failure to a failed Future.
* @param ec ExecutionContext where requests are executed and responses processed.
* @return [[ElastiknnFutureClient]]
* @param restClient
* The Elasticsearch RestClient, configured how you like, e.g. connected to multiple nodes.
* @param strictFailure
* If true, convert non-200 responses and bulk responses containing any failure to a failed Future.
* @param ec
* ExecutionContext where requests are executed and responses processed.
* @return
* [[ElastiknnFutureClient]]
*/
def futureClient(restClient: RestClient, strictFailure: Boolean)(implicit ec: ExecutionContext): ElastiknnFutureClient = {
val jc: JavaClient = new JavaClient(restClient)
@@ -121,12 +135,18 @@
}

/** Build an [[ElastiknnFutureClient]] that connects to a single host
* @param host Elasticsearch host.
* @param port Elasticsearch port.
* @param timeoutMillis Amount of time to wait for a response.
* @param strictFailure If true, convert non-2xx responses and bulk responses containing any failure to a failed Future.
* @param ec ExecutionContext where requests are executed and responses processed.
* @return [[ElastiknnFutureClient]]
* @param host
* Elasticsearch host.
* @param port
* Elasticsearch port.
* @param timeoutMillis
* Amount of time to wait for a response.
* @param strictFailure
* If true, convert non-2xx responses and bulk responses containing any failure to a failed Future.
* @param ec
* ExecutionContext where requests are executed and responses processed.
* @return
* [[ElastiknnFutureClient]]
*/
def futureClient(host: String = "localhost", port: Int = 9200, strictFailure: Boolean = true, timeoutMillis: Int = 30000)(implicit
ec: ExecutionContext
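To tie these client methods together, here is a hedged usage sketch: build a future-based client, create an index, apply a vector mapping, and bulk-index random vectors. The package names (com.klibisz.elastiknn.api, com.klibisz.elastiknn.client), the client-level putMapping method implied by the snippet above, and all parameter values are illustrative assumptions rather than details confirmed by this diff.

import com.klibisz.elastiknn.api.{Mapping, Vec}
import com.klibisz.elastiknn.client.ElastiknnClient

import scala.concurrent.ExecutionContext.Implicits.global
import scala.concurrent.duration._
import scala.concurrent.{Await, Future}
import scala.util.Random

object ClientSketch extends App {
  // Single-node future-based client; strictFailure converts non-2xx responses
  // and partial bulk failures into failed Futures.
  val client = ElastiknnClient.futureClient(host = "localhost", port = 9200, strictFailure = true)

  val index = "example-vectors"
  val vecField = "vec"
  val storedIdField = "id"
  val dims = 128

  // Random dense vectors and their ids (one-to-one correspondence).
  val vecs = (1 to 100).map(_ => Vec.DenseFloat(Array.fill(dims)(Random.nextFloat())))
  val ids = (1 to 100).map(i => s"v$i")

  val workflow: Future[Unit] = for {
    _ <- client.createIndex(index, shards = 1, replicas = 0)
    // putMapping is assumed to wrap ElastiknnRequests.putMapping, as suggested above.
    // L2Lsh parameters are placeholders; tune them for real data.
    _ <- client.putMapping(index, vecField, storedIdField, Mapping.L2Lsh(dims = dims, L = 99, k = 1, w = 3))
    _ <- client.index(index, vecField, vecs, storedIdField, ids)
  } yield ()

  Await.result(workflow, 1.minute)
  client.close()
}
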
@@ -8,37 +8,46 @@ import com.sksamuel.elastic4s.requests.mappings.PutMappingRequest
import com.sksamuel.elastic4s.requests.searches.SearchRequest
import com.sksamuel.elastic4s.{ElasticDsl, Indexes}

/** Methods for creating Elastic4s requests for common elastiknn tasks.
* Methods are optimized for documents containing only a vector and running searches as quickly as possible.
* For example, we store the ID as a doc value instead of using the default stored ID which is slower to access.
* I am open to creating less-optimized, more-convenient methods in the future.
/** Methods for creating Elastic4s requests for common elastiknn tasks. Methods are optimized for documents containing only a vector and
* running searches as quickly as possible. For example, we store the ID as a doc value instead of using the default stored ID which is
* slower to access. I am open to creating less-optimized, more-convenient methods in the future.
*/
trait ElastiknnRequests {

/** Create a request for indexing a vector.
*
* @param index Name of the index.
* @param vecField Field where vector is stored.
* @param vec Vector to index.
* @param storedIdField Field where document ID is stored.
* @param id Document ID. Stored as the ID known by Elasticsearch, and in the document for faster retrieval.
* @return Instance of a com.sksamuel.elastic4s.requests.indexes.IndexRequest.
* @param index
* Name of the index.
* @param vecField
* Field where vector is stored.
* @param vec
* Vector to index.
* @param storedIdField
* Field where document ID is stored.
* @param id
* Document ID. Stored as the ID known by Elasticsearch, and in the document for faster retrieval.
* @return
* Instance of a com.sksamuel.elastic4s.requests.indexes.IndexRequest.
*/
def index(index: String, vecField: String, vec: Vec, storedIdField: String, id: String): IndexRequest = {
val xcb = XContentFactory.jsonBuilder().rawField(vecField, XContentCodec.encodeUnsafeToString(vec)).field(storedIdField, id)
IndexRequest(index, source = Some(JacksonBuilder.writeAsString(xcb.value)), id = Some(id))
}

/** Create a request for running a nearest neighbors query.
* Optimized for high performance, so it returns the document ID in the body.
* Sets the preference parameter (see: https://www.elastic.co/guide/en/elasticsearch/reference/master/consistent-scoring.html)
* as the hash of the query for more deterministic results.
/** Create a request for running a nearest neighbors query. Optimized for high performance, so it returns the document ID in the body.
* Sets the preference parameter (see: https://www.elastic.co/guide/en/elasticsearch/reference/master/consistent-scoring.html) as the
* hash of the query for more deterministic results.
*
* @param index Index being searched against.
* @param query Constructed query, containing the vector, field, etc.
* @param k Number of results to return.
* @param storedIdField Field containing the document ID. See ElastiknnRequests.index() method.
* @return Instance of com.sksamuel.elastic4s.requests.searches.SearchRequest.
* @param index
* Index being searched against.
* @param query
* Constructed query, containing the vector, field, etc.
* @param k
* Number of results to return.
* @param storedIdField
* Field containing the document ID. See ElastiknnRequests.index() method.
* @return
* Instance of com.sksamuel.elastic4s.requests.searches.SearchRequest.
*/
def nearestNeighbors(index: String, query: NearestNeighborsQuery, k: Int, storedIdField: String): SearchRequest =
ElasticDsl
@@ -51,11 +60,16 @@

/** Create a mapping containing a vector field and a stored ID field.
*
* @param index Index to which this mapping is applied.
* @param vecField Field where vector is stored.
* @param storedIdField Field where ID is stored.
* @param vecMapping Mapping for the stored vector.
* @return Instance of com.sksamuel.elastic4s.requests.mappings.PutMappingRequest.
* @param index
* Index to which this mapping is applied.
* @param vecField
* Field where vector is stored.
* @param storedIdField
* Field where ID is stored.
* @param vecMapping
* Mapping for the stored vector.
* @return
* Instance of com.sksamuel.elastic4s.requests.mappings.PutMappingRequest.
*/
def putMapping(index: String, vecField: String, storedIdField: String, vecMapping: Mapping): PutMappingRequest = {
val mappingJsonString =
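For context on how these builders compose, here is a hedged sketch that constructs the three request types documented above. Package names and parameter values are illustrative assumptions, and the resulting requests would normally be passed to an elastic4s client's execute method.

import com.klibisz.elastiknn.api.{Mapping, NearestNeighborsQuery, Similarity, Vec}
import com.klibisz.elastiknn.client.ElastiknnRequests

import scala.util.Random

object RequestsSketch extends App {
  val index = "example-vectors"
  val vecField = "vec"
  val storedIdField = "id"
  val dims = 128

  // Mapping for the vector field; the k and repeating values are arbitrary placeholders.
  val mapping = Mapping.PermutationLsh(dims = dims, k = 64, repeating = true)
  val putMappingReq = ElastiknnRequests.putMapping(index, vecField, storedIdField, mapping)

  // One IndexRequest per vector; the id is stored both as the ES doc ID and
  // as a doc value for faster retrieval, as described above.
  val vec = Vec.DenseFloat(Array.fill(dims)(Random.nextFloat()))
  val indexReq = ElastiknnRequests.index(index, vecField, vec, storedIdField, id = "v1")

  // A nearest-neighbors search returning the top 10 hits by Cosine similarity.
  val queryVec = Vec.DenseFloat(Array.fill(dims)(Random.nextFloat()))
  val query = NearestNeighborsQuery.PermutationLsh(vecField, Similarity.Cosine, candidates = 200, vec = queryVec)
  val searchReq = ElastiknnRequests.nearestNeighbors(index, query, k = 10, storedIdField)

  println(Seq(putMappingReq, indexReq, searchReq).mkString("\n"))
}
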
@@ -13,15 +13,14 @@ import java.util.UUID
import scala.concurrent.Future
import scala.util.hashing.MurmurHash3.orderedHash

/** Tests for recall regressions for all of the mappings and their queries using random vectors.
* There are some subtleties:
* - Recall is evaluated based on the scores returned, not the ids, to account for cases where multiple vectors could
* have the same score relative to a query vector.
* - Using more shards will generally increase recall for LSH queries because candidates are evaluated per _segment_.
* Each shard can have a non-specific number of segments but we merge each shard to a specific number.
* - Repeated query results against the same index should be deterministic. However if you re-index the data and run
* the same query, I have seen different results at times. This seems to be an effect at the Elasticsearch level.
* I've tested at the Lucene (sans ES) level and that seems to be reliably deterministic.
/** Tests for recall regressions for all of the mappings and their queries using random vectors. There are some subtleties:
* - Recall is evaluated based on the scores returned, not the ids, to account for cases where multiple vectors could have the same score
* relative to a query vector.
* - Using more shards will generally increase recall for LSH queries because candidates are evaluated per _segment_. Each shard can have
* a non-specific number of segments but we merge each shard to a specific number.
* - Repeated query results against the same index should be deterministic. However if you re-index the data and run the same query, I
* have seen different results at times. This seems to be an effect at the Elasticsearch level. I've tested at the Lucene (sans ES)
* level and that seems to be reliably deterministic.
*/
class RecallSuite extends AsyncFunSuite with Matchers with ElasticAsyncClient with AsyncCancelAfterFailure {

@@ -139,7 +138,7 @@ class RecallSuite extends AsyncFunSuite with Matchers with ElasticAsyncClient wi
NearestNeighborsQuery.Exact(vecField, Similarity.L2) -> 1d,
NearestNeighborsQuery.Exact(vecField, Similarity.Cosine) -> 1d,
NearestNeighborsQuery.PermutationLsh(vecField, Similarity.Cosine, 200) -> 0.31,
NearestNeighborsQuery.PermutationLsh(vecField, Similarity.L2, 200) -> 0.3,
NearestNeighborsQuery.PermutationLsh(vecField, Similarity.L2, 200) -> 0.3
),
// TODO: This one seems to be more sensitive for some unknown reason.
recallTolerance = 5e-2
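As a small illustration of the first bullet in the docstring above — evaluating recall on scores rather than ids so that tied scores are not penalized — here is a self-contained sketch in plain Scala. It is only a sketch of the idea, not the suite's actual implementation, and it compares scores exactly where a real test would use a tolerance.

object RecallFromScores {
  // Recall of approxScores against exactScores, comparing scores rather than ids
  // so that several vectors at the same distance from the query are interchangeable.
  def recall(exactScores: Seq[Double], approxScores: Seq[Double]): Double = {
    // Count how many approximate scores are covered by the exact scores,
    // consuming each exact score at most once to respect multiplicities.
    val exactCounts = scala.collection.mutable.Map.empty[Double, Int].withDefaultValue(0)
    exactScores.foreach(s => exactCounts(s) += 1)
    val hits = approxScores.count { s =>
      if (exactCounts(s) > 0) { exactCounts(s) -= 1; true } else false
    }
    hits.toDouble / exactScores.length
  }

  def main(args: Array[String]): Unit = {
    val exact = Seq(1.0, 0.9, 0.9, 0.8)  // two vectors tie at 0.9
    val approx = Seq(1.0, 0.9, 0.9, 0.7) // the 0.7 result is a miss
    println(recall(exact, approx))       // 0.75
  }
}
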
@@ -58,7 +58,7 @@ class VectorMapperSuite extends AsyncFreeSpec with Matchers with Elastic4sMatche
// }
// }
// }
//}
// }

res.body shouldBe defined
val json = parse(res.body.get)
@@ -47,8 +47,7 @@ object ElasticsearchQueryBuilder {

// Used by rewriteGetVector to asynchronously provide an ElasticsearchQueryBuilder for an indexed vector.
// The doRewrite method will delay indefinitely until the query builder is provided.
private final class IndexedVectorQueryBuilder(indexedVec: Vec.Indexed)
extends AbstractQueryBuilder[IndexedVectorQueryBuilder] {
private final class IndexedVectorQueryBuilder(indexedVec: Vec.Indexed) extends AbstractQueryBuilder[IndexedVectorQueryBuilder] {

private val ref = new AtomicReference[ElasticsearchQueryBuilder](null)

@@ -36,9 +36,9 @@ final class HashingQuery[V <: Vec, S <: StoredVec: Decoder](
)
}

/** Note that this score function does not re-score the top candidates.
* The final score produced is `(max possible score for this similarity) * (number of matching hashes / total number of hashes)`.
* This is necessary because a ScoreFunction can only evaluate one doc at a time and must immediately score it.
/** Note that this score function does not re-score the top candidates. The final score produced is `(max possible score for this
* similarity) * (number of matching hashes / total number of hashes)`. This is necessary because a ScoreFunction can only evaluate one
* doc at a time and must immediately score it.
*/
override def toScoreFunction(indexReader: IndexReader): ScoreFunction = {

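The score described in that comment is just the similarity's maximum possible score scaled by the fraction of matching hashes. A minimal sketch of the arithmetic, with illustrative names rather than the plugin's internals:

object HashingScoreSketch extends App {
  // Approximate score: the similarity's maximum possible score scaled by the
  // fraction of the query's hashes that the document matches.
  def approximateScore(maxPossibleScore: Double, matchingHashes: Int, totalHashes: Int): Double =
    maxPossibleScore * (matchingHashes.toDouble / totalHashes)

  // E.g. for a similarity whose best score is 1.0, matching 30 of 40 hashes:
  println(approximateScore(1.0, 30, 40)) // 0.75
}
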
@@ -4,25 +4,22 @@ import java.util

import com.klibisz.elastiknn.api.Vec


/** Abstraction for different vector storage layouts and typeclasses for encoding/decoding them.
* This is decoupled from the api Vec case classes so we can support various optimizations that might change the
* interface, e.g. streaming the vectors in a read-once fashion. Currently the fastest storage methods support roughly
* the same interface.
/** Abstraction for different vector storage layouts and typeclasses for encoding/decoding them. This is decoupled from the api Vec case
* classes so we can support various optimizations that might change the interface, e.g. streaming the vectors in a read-once fashion.
* Currently the fastest storage methods support roughly the same interface.
*
* The current default serialization method is using sun.misc.Unsafe to eke out the best possible performance.
* The implementation is based mostly on the Kryo library's use of sun.misc.Unsafe.
* Many other options were considered:
* - fast-serialization library with unsafe configuration - roughly same as using Unsafe.
* - kryo library with unsafe input/output - a bit slower than fast-serialization and bare Unsafe.
* - java.io.ObjectOutput/InputStreams - 30-40% slower than Unsafe, but by far the best vanilla JVM solution.
* - protocol buffers - roughly same as ObjectOutput/InputStreams, but with smaller byte array sizes; the size
* doesn't seem to matter as it's compressed by ES anyway.
* - java.io.DataOutput/InputStreams - far slower.
* - scodec - far slower.
* The current default serialization method is using sun.misc.Unsafe to eke out the best possible performance. The implementation is based
* mostly on the Kryo library's use of sun.misc.Unsafe. Many other options were considered:
* - fast-serialization library with unsafe configuration - roughly same as using Unsafe.
* - kryo library with unsafe input/output - a bit slower than fast-serialization and bare Unsafe.
* - java.io.ObjectOutput/InputStreams - 30-40% slower than Unsafe, but by far the best vanilla JVM solution.
* - protocol buffers - roughly same as ObjectOutput/InputStreams, but with smaller byte array sizes; the size doesn't seem to matter as
* it's compressed by ES anyway.
* - java.io.DataOutput/InputStreams - far slower.
* - scodec - far slower.
*
* Anything using Unsafe comes with the tradeoff that it requires extra JVM security permissions.
* If this becomes a problem we should likely switch to ObjectOutput/InputStreams.
* Anything using Unsafe comes with the tradeoff that it requires extra JVM security permissions. If this becomes a problem we should
* likely switch to ObjectOutput/InputStreams.
*/
sealed trait StoredVec

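For reference on the tradeoffs listed above, here is a minimal sketch of the java.io.ObjectOutputStream/ObjectInputStream round trip mentioned as the best vanilla-JVM option. It is not the plugin's Unsafe-based implementation, just an illustration of the simpler alternative.

import java.io.{ByteArrayInputStream, ByteArrayOutputStream, ObjectInputStream, ObjectOutputStream}

object VanillaSerializationSketch extends App {
  // Serialize a float array to bytes using plain Java object serialization.
  def encode(values: Array[Float]): Array[Byte] = {
    val baos = new ByteArrayOutputStream()
    val oos = new ObjectOutputStream(baos)
    try oos.writeObject(values)
    finally oos.close()
    baos.toByteArray
  }

  // Deserialize the bytes back into a float array.
  def decode(bytes: Array[Byte]): Array[Float] = {
    val ois = new ObjectInputStream(new ByteArrayInputStream(bytes))
    try ois.readObject().asInstanceOf[Array[Float]]
    finally ois.close()
  }

  val original = Array.tabulate(8)(_.toFloat)
  val roundTripped = decode(encode(original))
  println(original.sameElements(roundTripped)) // true
}
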
2 changes: 2 additions & 0 deletions project/plugins.sbt
@@ -1,3 +1,5 @@
addSbtPlugin("pl.project13.scala" % "sbt-jmh" % "0.4.6")
addSbtPlugin("org.typelevel" % "sbt-tpolecat" % "0.5.0")
addSbtPlugin("org.scalameta" % "sbt-scalafmt" % "2.5.2")

